2 * Copyright (c) 2009, 2010, 2011, 2012, 2013 Nicira, Inc.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
19 #include "netdev-linux.h"
23 #include <arpa/inet.h>
25 #include <linux/filter.h>
26 #include <linux/gen_stats.h>
27 #include <linux/if_ether.h>
28 #include <linux/if_tun.h>
29 #include <linux/types.h>
30 #include <linux/ethtool.h>
31 #include <linux/mii.h>
32 #include <linux/pkt_cls.h>
33 #include <linux/pkt_sched.h>
34 #include <linux/rtnetlink.h>
35 #include <linux/sockios.h>
36 #include <linux/version.h>
37 #include <sys/types.h>
38 #include <sys/ioctl.h>
39 #include <sys/socket.h>
40 #include <netpacket/packet.h>
42 #include <net/if_arp.h>
43 #include <net/if_packet.h>
44 #include <net/route.h>
45 #include <netinet/in.h>
51 #include "connectivity.h"
53 #include "dpif-linux.h"
54 #include "dynamic-string.h"
55 #include "fatal-signal.h"
58 #include "netdev-provider.h"
59 #include "netdev-vport.h"
60 #include "netlink-notifier.h"
61 #include "netlink-socket.h"
64 #include "openflow/openflow.h"
65 #include "ovs-atomic.h"
67 #include "poll-loop.h"
68 #include "rtnetlink-link.h"
71 #include "socket-util.h"
74 #include "unaligned.h"
77 VLOG_DEFINE_THIS_MODULE(netdev_linux);
79 COVERAGE_DEFINE(netdev_set_policing);
80 COVERAGE_DEFINE(netdev_arp_lookup);
81 COVERAGE_DEFINE(netdev_get_ifindex);
82 COVERAGE_DEFINE(netdev_get_hwaddr);
83 COVERAGE_DEFINE(netdev_set_hwaddr);
84 COVERAGE_DEFINE(netdev_get_ethtool);
85 COVERAGE_DEFINE(netdev_set_ethtool);
88 /* These were introduced in Linux 2.6.14, so they might be missing if we have
90 #ifndef ADVERTISED_Pause
91 #define ADVERTISED_Pause (1 << 13)
93 #ifndef ADVERTISED_Asym_Pause
94 #define ADVERTISED_Asym_Pause (1 << 14)
97 /* These were introduced in Linux 2.6.24, so they might be missing if we
98 * have old headers. */
99 #ifndef ETHTOOL_GFLAGS
100 #define ETHTOOL_GFLAGS 0x00000025 /* Get flags bitmap(ethtool_value) */
102 #ifndef ETHTOOL_SFLAGS
103 #define ETHTOOL_SFLAGS 0x00000026 /* Set flags bitmap(ethtool_value) */
106 /* This was introduced in Linux 2.6.25, so it might be missing if we have old
109 #define TC_RTAB_SIZE 1024
113 VALID_IFINDEX = 1 << 0,
114 VALID_ETHERADDR = 1 << 1,
118 VALID_POLICING = 1 << 5,
119 VALID_VPORT_STAT_ERROR = 1 << 6,
120 VALID_DRVINFO = 1 << 7,
121 VALID_FEATURES = 1 << 8,
124 /* Traffic control. */
126 /* An instance of a traffic control class. Always associated with a particular
129 * Each TC implementation subclasses this with whatever additional data it
132 const struct tc_ops *ops;
133 struct hmap queues; /* Contains "struct tc_queue"s.
134 * Read by generic TC layer.
135 * Written only by TC implementation. */
138 #define TC_INITIALIZER(TC, OPS) { OPS, HMAP_INITIALIZER(&(TC)->queues) }
140 /* One traffic control queue.
142 * Each TC implementation subclasses this with whatever additional data it
145 struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
146 unsigned int queue_id; /* OpenFlow queue ID. */
147 long long int created; /* Time queue was created, in msecs. */
150 /* A particular kind of traffic control. Each implementation generally maps to
151 * one particular Linux qdisc class.
153 * The functions below return 0 if successful or a positive errno value on
154 * failure, except where otherwise noted. All of them must be provided, except
155 * where otherwise noted. */
157 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
158 * This is null for tc_ops_default and tc_ops_other, for which there are no
159 * appropriate values. */
160 const char *linux_name;
162 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
163 const char *ovs_name;
165 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
166 * queues. The queues are numbered 0 through n_queues - 1. */
167 unsigned int n_queues;
169 /* Called to install this TC class on 'netdev'. The implementation should
170 * make the Netlink calls required to set up 'netdev' with the right qdisc
171 * and configure it according to 'details'. The implementation may assume
172 * that the current qdisc is the default; that is, there is no need for it
173 * to delete the current qdisc before installing itself.
175 * The contents of 'details' should be documented as valid for 'ovs_name'
176 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
177 * (which is built as ovs-vswitchd.conf.db(8)).
179 * This function must return 0 if and only if it sets 'netdev->tc' to an
180 * initialized 'struct tc'.
182 * (This function is null for tc_ops_other, which cannot be installed. For
183 * other TC classes it should always be nonnull.) */
184 int (*tc_install)(struct netdev *netdev, const struct smap *details);
186 /* Called when the netdev code determines (through a Netlink query) that
187 * this TC class's qdisc is installed on 'netdev', but we didn't install
188 * it ourselves and so don't know any of the details.
190 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
191 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
192 * implementation should parse the other attributes of 'nlmsg' as
193 * necessary to determine its configuration. If necessary it should also
194 * use Netlink queries to determine the configuration of queues on
197 * This function must return 0 if and only if it sets 'netdev->tc' to an
198 * initialized 'struct tc'. */
199 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
201 /* Destroys the data structures allocated by the implementation as part of
202 * 'tc'. (This includes destroying 'tc->queues' by calling
205 * The implementation should not need to perform any Netlink calls. If
206 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
207 * (But it may not be desirable.)
209 * This function may be null if 'tc' is trivial. */
210 void (*tc_destroy)(struct tc *tc);
212 /* Retrieves details of 'netdev->tc' configuration into 'details'.
214 * The implementation should not need to perform any Netlink calls, because
215 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
216 * cached the configuration.
218 * The contents of 'details' should be documented as valid for 'ovs_name'
219 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
220 * (which is built as ovs-vswitchd.conf.db(8)).
222 * This function may be null if 'tc' is not configurable.
224 int (*qdisc_get)(const struct netdev *netdev, struct smap *details);
226 /* Reconfigures 'netdev->tc' according to 'details', performing any
227 * required Netlink calls to complete the reconfiguration.
229 * The contents of 'details' should be documented as valid for 'ovs_name'
230 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
231 * (which is built as ovs-vswitchd.conf.db(8)).
233 * This function may be null if 'tc' is not configurable.
235 int (*qdisc_set)(struct netdev *, const struct smap *details);
237 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
238 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
240 * The contents of 'details' should be documented as valid for 'ovs_name'
241 * in the "other_config" column in the "Queue" table in
242 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
244 * The implementation should not need to perform any Netlink calls, because
245 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
246 * cached the queue configuration.
248 * This function may be null if 'tc' does not have queues ('n_queues' is
250 int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
251 struct smap *details);
253 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
254 * 'details', perfoming any required Netlink calls to complete the
255 * reconfiguration. The caller ensures that 'queue_id' is less than
258 * The contents of 'details' should be documented as valid for 'ovs_name'
259 * in the "other_config" column in the "Queue" table in
260 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
262 * This function may be null if 'tc' does not have queues or its queues are
263 * not configurable. */
264 int (*class_set)(struct netdev *, unsigned int queue_id,
265 const struct smap *details);
267 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
268 * tc_queue's within 'netdev->tc->queues'.
270 * This function may be null if 'tc' does not have queues or its queues
271 * cannot be deleted. */
272 int (*class_delete)(struct netdev *, struct tc_queue *queue);
274 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
275 * 'struct tc_queue's within 'netdev->tc->queues'.
277 * On success, initializes '*stats'.
279 * This function may be null if 'tc' does not have queues or if it cannot
280 * report queue statistics. */
281 int (*class_get_stats)(const struct netdev *netdev,
282 const struct tc_queue *queue,
283 struct netdev_queue_stats *stats);
285 /* Extracts queue stats from 'nlmsg', which is a response to a
286 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
288 * This function may be null if 'tc' does not have queues or if it cannot
289 * report queue statistics. */
290 int (*class_dump_stats)(const struct netdev *netdev,
291 const struct ofpbuf *nlmsg,
292 netdev_dump_queue_stats_cb *cb, void *aux);
296 tc_init(struct tc *tc, const struct tc_ops *ops)
299 hmap_init(&tc->queues);
303 tc_destroy(struct tc *tc)
305 hmap_destroy(&tc->queues);
308 static const struct tc_ops tc_ops_htb;
309 static const struct tc_ops tc_ops_hfsc;
310 static const struct tc_ops tc_ops_default;
311 static const struct tc_ops tc_ops_other;
313 static const struct tc_ops *const tcs[] = {
314 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
315 &tc_ops_hfsc, /* Hierarchical fair service curve. */
316 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
317 &tc_ops_other, /* Some other qdisc. */
321 static unsigned int tc_make_handle(unsigned int major, unsigned int minor);
322 static unsigned int tc_get_major(unsigned int handle);
323 static unsigned int tc_get_minor(unsigned int handle);
325 static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
326 static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
327 static unsigned int tc_buffer_per_jiffy(unsigned int rate);
329 static struct tcmsg *tc_make_request(const struct netdev *, int type,
330 unsigned int flags, struct ofpbuf *);
331 static int tc_transact(struct ofpbuf *request, struct ofpbuf **replyp);
332 static int tc_add_del_ingress_qdisc(struct netdev *netdev, bool add);
333 static int tc_add_policer(struct netdev *netdev, int kbits_rate,
336 static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
337 struct nlattr **options);
338 static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
339 struct nlattr **options,
340 struct netdev_queue_stats *);
341 static int tc_query_class(const struct netdev *,
342 unsigned int handle, unsigned int parent,
343 struct ofpbuf **replyp);
344 static int tc_delete_class(const struct netdev *, unsigned int handle);
346 static int tc_del_qdisc(struct netdev *netdev);
347 static int tc_query_qdisc(const struct netdev *netdev);
349 static int tc_calc_cell_log(unsigned int mtu);
350 static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
351 static void tc_put_rtab(struct ofpbuf *, uint16_t type,
352 const struct tc_ratespec *rate);
353 static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
355 struct netdev_linux {
358 /* Protects all members below. */
359 struct ovs_mutex mutex;
361 unsigned int cache_valid;
363 bool miimon; /* Link status of last poll. */
364 long long int miimon_interval; /* Miimon Poll rate. Disabled if <= 0. */
365 struct timer miimon_timer;
367 /* The following are figured out "on demand" only. They are only valid
368 * when the corresponding VALID_* bit in 'cache_valid' is set. */
370 uint8_t etheraddr[ETH_ADDR_LEN];
371 struct in_addr address, netmask;
374 unsigned int ifi_flags;
375 long long int carrier_resets;
376 uint32_t kbits_rate; /* Policing data. */
377 uint32_t kbits_burst;
378 int vport_stats_error; /* Cached error code from vport_get_stats().
379 0 or an errno value. */
380 int netdev_mtu_error; /* Cached error code from SIOCGIFMTU or SIOCSIFMTU. */
381 int ether_addr_error; /* Cached error code from set/get etheraddr. */
382 int netdev_policing_error; /* Cached error code from set policing. */
383 int get_features_error; /* Cached error code from ETHTOOL_GSET. */
384 int get_ifindex_error; /* Cached error code from SIOCGIFINDEX. */
386 enum netdev_features current; /* Cached from ETHTOOL_GSET. */
387 enum netdev_features advertised; /* Cached from ETHTOOL_GSET. */
388 enum netdev_features supported; /* Cached from ETHTOOL_GSET. */
390 struct ethtool_drvinfo drvinfo; /* Cached from ETHTOOL_GDRVINFO. */
393 /* For devices of class netdev_tap_class only. */
397 struct netdev_rx_linux {
403 /* This is set pretty low because we probably won't learn anything from the
404 * additional log messages. */
405 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
407 /* Polling miimon status for all ports causes performance degradation when
408 * handling a large number of ports. If there are no devices using miimon, then
409 * we skip netdev_linux_miimon_run() and netdev_linux_miimon_wait(). */
410 static atomic_int miimon_cnt = ATOMIC_VAR_INIT(0);
412 static void netdev_linux_run(void);
414 static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
415 int cmd, const char *cmd_name);
416 static int netdev_linux_get_ipv4(const struct netdev *, struct in_addr *,
417 int cmd, const char *cmd_name);
418 static int get_flags(const struct netdev *, unsigned int *flags);
419 static int set_flags(const char *, unsigned int flags);
420 static int update_flags(struct netdev_linux *netdev, enum netdev_flags off,
421 enum netdev_flags on, enum netdev_flags *old_flagsp)
422 OVS_REQUIRES(netdev->mutex);
423 static int do_get_ifindex(const char *netdev_name);
424 static int get_ifindex(const struct netdev *, int *ifindexp);
425 static int do_set_addr(struct netdev *netdev,
426 int ioctl_nr, const char *ioctl_name,
427 struct in_addr addr);
428 static int get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN]);
429 static int set_etheraddr(const char *netdev_name, const uint8_t[ETH_ADDR_LEN]);
430 static int get_stats_via_netlink(const struct netdev *, struct netdev_stats *);
431 static int af_packet_sock(void);
432 static bool netdev_linux_miimon_enabled(void);
433 static void netdev_linux_miimon_run(void);
434 static void netdev_linux_miimon_wait(void);
437 is_netdev_linux_class(const struct netdev_class *netdev_class)
439 return netdev_class->run == netdev_linux_run;
443 is_tap_netdev(const struct netdev *netdev)
445 return netdev_get_class(netdev) == &netdev_tap_class;
448 static struct netdev_linux *
449 netdev_linux_cast(const struct netdev *netdev)
451 ovs_assert(is_netdev_linux_class(netdev_get_class(netdev)));
453 return CONTAINER_OF(netdev, struct netdev_linux, up);
456 static struct netdev_rx_linux *
457 netdev_rx_linux_cast(const struct netdev_rx *rx)
459 ovs_assert(is_netdev_linux_class(netdev_get_class(rx->netdev)));
460 return CONTAINER_OF(rx, struct netdev_rx_linux, up);
463 static void netdev_linux_update(struct netdev_linux *netdev,
464 const struct rtnetlink_link_change *)
465 OVS_REQUIRES(netdev->mutex);
466 static void netdev_linux_changed(struct netdev_linux *netdev,
467 unsigned int ifi_flags, unsigned int mask)
468 OVS_REQUIRES(netdev->mutex);
470 /* Returns a NETLINK_ROUTE socket listening for RTNLGRP_LINK changes, or NULL
471 * if no such socket could be created. */
472 static struct nl_sock *
473 netdev_linux_notify_sock(void)
475 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
476 static struct nl_sock *sock;
478 if (ovsthread_once_start(&once)) {
481 error = nl_sock_create(NETLINK_ROUTE, &sock);
483 error = nl_sock_join_mcgroup(sock, RTNLGRP_LINK);
485 nl_sock_destroy(sock);
489 ovsthread_once_done(&once);
496 netdev_linux_miimon_enabled(void)
500 atomic_read(&miimon_cnt, &miimon);
505 netdev_linux_run(void)
507 struct nl_sock *sock;
510 if (netdev_linux_miimon_enabled()) {
511 netdev_linux_miimon_run();
514 sock = netdev_linux_notify_sock();
520 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
521 uint64_t buf_stub[4096 / 8];
524 ofpbuf_use_stub(&buf, buf_stub, sizeof buf_stub);
525 error = nl_sock_recv(sock, &buf, false);
527 struct rtnetlink_link_change change;
529 if (rtnetlink_link_parse(&buf, &change)) {
530 struct netdev *netdev_ = netdev_from_name(change.ifname);
531 if (netdev_ && is_netdev_linux_class(netdev_->netdev_class)) {
532 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
534 ovs_mutex_lock(&netdev->mutex);
535 netdev_linux_update(netdev, &change);
536 ovs_mutex_unlock(&netdev->mutex);
538 netdev_close(netdev_);
540 } else if (error == ENOBUFS) {
541 struct shash device_shash;
542 struct shash_node *node;
546 shash_init(&device_shash);
547 netdev_get_devices(&netdev_linux_class, &device_shash);
548 SHASH_FOR_EACH (node, &device_shash) {
549 struct netdev *netdev_ = node->data;
550 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
553 ovs_mutex_lock(&netdev->mutex);
554 get_flags(netdev_, &flags);
555 netdev_linux_changed(netdev, flags, 0);
556 ovs_mutex_unlock(&netdev->mutex);
558 netdev_close(netdev_);
560 shash_destroy(&device_shash);
561 } else if (error != EAGAIN) {
562 VLOG_WARN_RL(&rl, "error reading or parsing netlink (%s)",
563 ovs_strerror(error));
/* 'wait' callback paired with netdev_linux_run(): arranges for the poll loop
 * to wake when miimon polling is due or when the shared rtnetlink socket
 * becomes readable. */
static void
netdev_linux_wait(void)
{
    struct nl_sock *sock;

    if (netdev_linux_miimon_enabled()) {
        netdev_linux_miimon_wait();
    }
    sock = netdev_linux_notify_sock();
    if (sock) {
        nl_sock_wait(sock, POLLIN);
    }
}
584 netdev_linux_changed(struct netdev_linux *dev,
585 unsigned int ifi_flags, unsigned int mask)
586 OVS_REQUIRES(dev->mutex)
588 seq_change(connectivity_seq_get());
590 if ((dev->ifi_flags ^ ifi_flags) & IFF_RUNNING) {
591 dev->carrier_resets++;
593 dev->ifi_flags = ifi_flags;
595 dev->cache_valid &= mask;
599 netdev_linux_update(struct netdev_linux *dev,
600 const struct rtnetlink_link_change *change)
601 OVS_REQUIRES(dev->mutex)
603 if (change->nlmsg_type == RTM_NEWLINK) {
605 netdev_linux_changed(dev, change->ifi_flags, VALID_DRVINFO);
607 /* Update netdev from rtnl-change msg. */
609 dev->mtu = change->mtu;
610 dev->cache_valid |= VALID_MTU;
611 dev->netdev_mtu_error = 0;
614 if (!eth_addr_is_zero(change->addr)) {
615 memcpy(dev->etheraddr, change->addr, ETH_ADDR_LEN);
616 dev->cache_valid |= VALID_ETHERADDR;
617 dev->ether_addr_error = 0;
620 dev->ifindex = change->ifi_index;
621 dev->cache_valid |= VALID_IFINDEX;
622 dev->get_ifindex_error = 0;
625 netdev_linux_changed(dev, change->ifi_flags, 0);
629 static struct netdev *
630 netdev_linux_alloc(void)
632 struct netdev_linux *netdev = xzalloc(sizeof *netdev);
637 netdev_linux_common_construct(struct netdev_linux *netdev)
639 ovs_mutex_init(&netdev->mutex);
642 /* Creates system and internal devices. */
644 netdev_linux_construct(struct netdev *netdev_)
646 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
649 netdev_linux_common_construct(netdev);
651 error = get_flags(&netdev->up, &netdev->ifi_flags);
652 if (error == ENODEV) {
653 if (netdev->up.netdev_class != &netdev_internal_class) {
654 /* The device does not exist, so don't allow it to be opened. */
657 /* "Internal" netdevs have to be created as netdev objects before
658 * they exist in the kernel, because creating them in the kernel
659 * happens by passing a netdev object to dpif_port_add().
660 * Therefore, ignore the error. */
667 /* For most types of netdevs we open the device for each call of
668 * netdev_open(). However, this is not the case with tap devices,
669 * since it is only possible to open the device once. In this
670 * situation we share a single file descriptor, and consequently
671 * buffers, across all readers. Therefore once data is read it will
672 * be unavailable to other reads for tap devices. */
674 netdev_linux_construct_tap(struct netdev *netdev_)
676 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
677 static const char tap_dev[] = "/dev/net/tun";
678 const char *name = netdev_->name;
682 netdev_linux_common_construct(netdev);
684 /* Open tap device. */
685 netdev->tap_fd = open(tap_dev, O_RDWR);
686 if (netdev->tap_fd < 0) {
688 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, ovs_strerror(error));
692 /* Create tap device. */
693 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
694 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
695 if (ioctl(netdev->tap_fd, TUNSETIFF, &ifr) == -1) {
696 VLOG_WARN("%s: creating tap device failed: %s", name,
697 ovs_strerror(errno));
702 /* Make non-blocking. */
703 error = set_nonblocking(netdev->tap_fd);
711 close(netdev->tap_fd);
716 netdev_linux_destruct(struct netdev *netdev_)
718 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
720 if (netdev->tc && netdev->tc->ops->tc_destroy) {
721 netdev->tc->ops->tc_destroy(netdev->tc);
724 if (netdev_get_class(netdev_) == &netdev_tap_class
725 && netdev->tap_fd >= 0)
727 close(netdev->tap_fd);
730 if (netdev->miimon_interval > 0) {
732 atomic_sub(&miimon_cnt, 1, &junk);
735 ovs_mutex_destroy(&netdev->mutex);
/* 'dealloc' callback: frees the container allocated by netdev_linux_alloc. */
static void
netdev_linux_dealloc(struct netdev *netdev_)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    free(netdev);
}
745 static struct netdev_rx *
746 netdev_linux_rx_alloc(void)
748 struct netdev_rx_linux *rx = xzalloc(sizeof *rx);
753 netdev_linux_rx_construct(struct netdev_rx *rx_)
755 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
756 struct netdev *netdev_ = rx->up.netdev;
757 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
760 ovs_mutex_lock(&netdev->mutex);
761 rx->is_tap = is_tap_netdev(netdev_);
763 rx->fd = netdev->tap_fd;
765 struct sockaddr_ll sll;
767 /* Result of tcpdump -dd inbound */
768 static const struct sock_filter filt[] = {
769 { 0x28, 0, 0, 0xfffff004 }, /* ldh [0] */
770 { 0x15, 0, 1, 0x00000004 }, /* jeq #4 jt 2 jf 3 */
771 { 0x6, 0, 0, 0x00000000 }, /* ret #0 */
772 { 0x6, 0, 0, 0x0000ffff } /* ret #65535 */
774 static const struct sock_fprog fprog = {
775 ARRAY_SIZE(filt), (struct sock_filter *) filt
778 /* Create file descriptor. */
779 rx->fd = socket(PF_PACKET, SOCK_RAW, 0);
782 VLOG_ERR("failed to create raw socket (%s)", ovs_strerror(error));
786 /* Set non-blocking mode. */
787 error = set_nonblocking(rx->fd);
792 /* Get ethernet device index. */
793 error = get_ifindex(&netdev->up, &ifindex);
798 /* Bind to specific ethernet device. */
799 memset(&sll, 0, sizeof sll);
800 sll.sll_family = AF_PACKET;
801 sll.sll_ifindex = ifindex;
802 sll.sll_protocol = (OVS_FORCE unsigned short int) htons(ETH_P_ALL);
803 if (bind(rx->fd, (struct sockaddr *) &sll, sizeof sll) < 0) {
805 VLOG_ERR("%s: failed to bind raw socket (%s)",
806 netdev_get_name(netdev_), ovs_strerror(error));
810 /* Filter for only inbound packets. */
811 error = setsockopt(rx->fd, SOL_SOCKET, SO_ATTACH_FILTER, &fprog,
815 VLOG_ERR("%s: failed to attach filter (%s)",
816 netdev_get_name(netdev_), ovs_strerror(error));
820 ovs_mutex_unlock(&netdev->mutex);
828 ovs_mutex_unlock(&netdev->mutex);
833 netdev_linux_rx_destruct(struct netdev_rx *rx_)
835 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
/* 'rx_dealloc' callback: frees the container allocated by rx_alloc. */
static void
netdev_linux_rx_dealloc(struct netdev_rx *rx_)
{
    struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);

    free(rx);
}
851 netdev_linux_rx_recv(struct netdev_rx *rx_, void *data, size_t size)
853 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
858 ? read(rx->fd, data, size)
859 : recv(rx->fd, data, size, MSG_TRUNC));
860 } while (retval < 0 && errno == EINTR);
863 return retval > size ? -EMSGSIZE : retval;
865 if (errno != EAGAIN) {
866 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
867 ovs_strerror(errno), netdev_rx_get_name(rx_));
874 netdev_linux_rx_wait(struct netdev_rx *rx_)
876 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
877 poll_fd_wait(rx->fd, POLLIN);
881 netdev_linux_rx_drain(struct netdev_rx *rx_)
883 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
886 int error = af_inet_ifreq_ioctl(netdev_rx_get_name(rx_), &ifr,
887 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
891 drain_fd(rx->fd, ifr.ifr_qlen);
894 return drain_rcvbuf(rx->fd);
898 /* Sends 'buffer' on 'netdev'. Returns 0 if successful, otherwise a positive
899 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
900 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
901 * the packet is too big or too small to transmit on the device.
903 * The caller retains ownership of 'buffer' in all cases.
905 * The kernel maintains a packet transmission queue, so the caller is not
906 * expected to do additional queuing of packets. */
908 netdev_linux_send(struct netdev *netdev_, const void *data, size_t size)
913 if (!is_tap_netdev(netdev_)) {
914 /* Use our AF_PACKET socket to send to this device. */
915 struct sockaddr_ll sll;
921 sock = af_packet_sock();
926 ifindex = netdev_get_ifindex(netdev_);
931 /* We don't bother setting most fields in sockaddr_ll because the
932 * kernel ignores them for SOCK_RAW. */
933 memset(&sll, 0, sizeof sll);
934 sll.sll_family = AF_PACKET;
935 sll.sll_ifindex = ifindex;
937 iov.iov_base = CONST_CAST(void *, data);
941 msg.msg_namelen = sizeof sll;
944 msg.msg_control = NULL;
945 msg.msg_controllen = 0;
948 retval = sendmsg(sock, &msg, 0);
950 /* Use the tap fd to send to this device. This is essential for
951 * tap devices, because packets sent to a tap device with an
952 * AF_PACKET socket will loop back to be *received* again on the
953 * tap device. This doesn't occur on other interface types
954 * because we attach a socket filter to the rx socket. */
955 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
957 retval = write(netdev->tap_fd, data, size);
961 /* The Linux AF_PACKET implementation never blocks waiting for room
962 * for packets, instead returning ENOBUFS. Translate this into
963 * EAGAIN for the caller. */
964 if (errno == ENOBUFS) {
966 } else if (errno == EINTR) {
968 } else if (errno != EAGAIN) {
969 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
970 netdev_get_name(netdev_), ovs_strerror(errno));
973 } else if (retval != size) {
974 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%"PRIuSIZE"d bytes of "
975 "%"PRIuSIZE") on %s", retval, size, netdev_get_name(netdev_));
/* Registers with the poll loop to wake up from the next call to poll_block()
 * when the packet transmission queue has sufficient room to transmit a packet
 * with netdev_send().
 *
 * The kernel maintains a packet transmission queue, so the client is not
 * expected to do additional queuing of packets.  Thus, this function is
 * unlikely to ever be used.  It is included for completeness. */
static void
netdev_linux_send_wait(struct netdev *netdev)
{
    if (is_tap_netdev(netdev)) {
        /* TAP device always accepts packets. */
        poll_immediate_wake();
    }
}
999 /* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
1000 * otherwise a positive errno value. */
1002 netdev_linux_set_etheraddr(struct netdev *netdev_,
1003 const uint8_t mac[ETH_ADDR_LEN])
1005 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1006 enum netdev_flags old_flags = 0;
1009 ovs_mutex_lock(&netdev->mutex);
1011 if (netdev->cache_valid & VALID_ETHERADDR) {
1012 error = netdev->ether_addr_error;
1013 if (error || eth_addr_equals(netdev->etheraddr, mac)) {
1016 netdev->cache_valid &= ~VALID_ETHERADDR;
1019 /* Tap devices must be brought down before setting the address. */
1020 if (is_tap_netdev(netdev_)) {
1021 update_flags(netdev, NETDEV_UP, 0, &old_flags);
1023 error = set_etheraddr(netdev_get_name(netdev_), mac);
1024 if (!error || error == ENODEV) {
1025 netdev->ether_addr_error = error;
1026 netdev->cache_valid |= VALID_ETHERADDR;
1028 memcpy(netdev->etheraddr, mac, ETH_ADDR_LEN);
1032 if (is_tap_netdev(netdev_) && old_flags & NETDEV_UP) {
1033 update_flags(netdev, 0, NETDEV_UP, &old_flags);
1037 ovs_mutex_unlock(&netdev->mutex);
1041 /* Copies 'netdev''s MAC address to 'mac' which is passed as param. */
1043 netdev_linux_get_etheraddr(const struct netdev *netdev_,
1044 uint8_t mac[ETH_ADDR_LEN])
1046 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1049 ovs_mutex_lock(&netdev->mutex);
1050 if (!(netdev->cache_valid & VALID_ETHERADDR)) {
1051 netdev->ether_addr_error = get_etheraddr(netdev_get_name(netdev_),
1053 netdev->cache_valid |= VALID_ETHERADDR;
1056 error = netdev->ether_addr_error;
1058 memcpy(mac, netdev->etheraddr, ETH_ADDR_LEN);
1060 ovs_mutex_unlock(&netdev->mutex);
1066 netdev_linux_get_mtu__(struct netdev_linux *netdev, int *mtup)
1070 if (!(netdev->cache_valid & VALID_MTU)) {
1073 netdev->netdev_mtu_error = af_inet_ifreq_ioctl(
1074 netdev_get_name(&netdev->up), &ifr, SIOCGIFMTU, "SIOCGIFMTU");
1075 netdev->mtu = ifr.ifr_mtu;
1076 netdev->cache_valid |= VALID_MTU;
1079 error = netdev->netdev_mtu_error;
1081 *mtup = netdev->mtu;
1087 /* Returns the maximum size of transmitted (and received) packets on 'netdev',
1088 * in bytes, not including the hardware header; thus, this is typically 1500
1089 * bytes for Ethernet devices. */
1091 netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
1093 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1096 ovs_mutex_lock(&netdev->mutex);
1097 error = netdev_linux_get_mtu__(netdev, mtup);
1098 ovs_mutex_unlock(&netdev->mutex);
1103 /* Sets the maximum size of transmitted (MTU) for given device using linux
1104 * networking ioctl interface.
1107 netdev_linux_set_mtu(const struct netdev *netdev_, int mtu)
1109 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1113 ovs_mutex_lock(&netdev->mutex);
1114 if (netdev->cache_valid & VALID_MTU) {
1115 error = netdev->netdev_mtu_error;
1116 if (error || netdev->mtu == mtu) {
1119 netdev->cache_valid &= ~VALID_MTU;
1122 error = af_inet_ifreq_ioctl(netdev_get_name(netdev_), &ifr,
1123 SIOCSIFMTU, "SIOCSIFMTU");
1124 if (!error || error == ENODEV) {
1125 netdev->netdev_mtu_error = error;
1126 netdev->mtu = ifr.ifr_mtu;
1127 netdev->cache_valid |= VALID_MTU;
1130 ovs_mutex_unlock(&netdev->mutex);
1134 /* Returns the ifindex of 'netdev', if successful, as a positive number.
1135 * On failure, returns a negative errno value. */
1137 netdev_linux_get_ifindex(const struct netdev *netdev_)
1139 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1142 ovs_mutex_lock(&netdev->mutex);
1143 error = get_ifindex(netdev_, &ifindex);
1144 ovs_mutex_unlock(&netdev->mutex);
1146 return error ? -error : ifindex;
1150 netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
1152 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1154 ovs_mutex_lock(&netdev->mutex);
1155 if (netdev->miimon_interval > 0) {
1156 *carrier = netdev->miimon;
1158 *carrier = (netdev->ifi_flags & IFF_RUNNING) != 0;
1160 ovs_mutex_unlock(&netdev->mutex);
1165 static long long int
1166 netdev_linux_get_carrier_resets(const struct netdev *netdev_)
1168 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1169 long long int carrier_resets;
1171 ovs_mutex_lock(&netdev->mutex);
1172 carrier_resets = netdev->carrier_resets;
1173 ovs_mutex_unlock(&netdev->mutex);
1175 return carrier_resets;
/* Issues MII ioctl 'cmd' ("SIOCGMIIPHY"/"SIOCGMIIREG") for device 'name',
 * marshalling 'data' through ifr.ifr_data in both directions.
 * NOTE(review): the memcpy into &ifr.ifr_data copies the struct over the
 * ifr_data storage rather than passing a pointer — presumably relying on
 * mii_ioctl_data fitting in the ifreq union; confirm against callers. */
1179 netdev_linux_do_miimon(const char *name, int cmd, const char *cmd_name,
1180 struct mii_ioctl_data *data)
1185 memset(&ifr, 0, sizeof ifr);
1186 memcpy(&ifr.ifr_data, data, sizeof *data);
1187 error = af_inet_ifreq_ioctl(name, &ifr, cmd, cmd_name);
1188 memcpy(data, &ifr.ifr_data, sizeof *data);
/* Queries link status for 'name' into '*miimon': first via MII
 * (SIOCGMIIPHY then SIOCGMIIREG reading MII_BMSR's BMSR_LSTATUS bit),
 * and on MII failure falls back to ethtool ETHTOOL_GLINK. */
1194 netdev_linux_get_miimon(const char *name, bool *miimon)
1196 struct mii_ioctl_data data;
1201 memset(&data, 0, sizeof data);
1202 error = netdev_linux_do_miimon(name, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
1204 /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
1205 data.reg_num = MII_BMSR;
1206 error = netdev_linux_do_miimon(name, SIOCGMIIREG, "SIOCGMIIREG",
1210 *miimon = !!(data.val_out & BMSR_LSTATUS);
1212 VLOG_WARN_RL(&rl, "%s: failed to query MII", name);
/* Fallback path: ask ethtool for link status instead of MII. */
1215 struct ethtool_cmd ecmd;
1217 VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
1220 COVERAGE_INC(netdev_get_ethtool);
1221 memset(&ecmd, 0, sizeof ecmd);
1222 error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
/* ETHTOOL_GLINK replies with a struct ethtool_value overlaid on ecmd. */
1225 struct ethtool_value eval;
1227 memcpy(&eval, &ecmd, sizeof eval);
1228 *miimon = !!eval.data;
1230 VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
/* Sets the MII monitoring poll interval for 'netdev_' in milliseconds.
 * A positive interval is clamped to at least 100 ms; zero (or negative)
 * disables monitoring.  The global miimon_cnt atomic tracks how many
 * devices have monitoring enabled, and the miimon timer is force-expired
 * so the new interval takes effect immediately. */
1238 netdev_linux_set_miimon_interval(struct netdev *netdev_,
1239 long long int interval)
1241 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1243 ovs_mutex_lock(&netdev->mutex);
1244 interval = interval > 0 ? MAX(interval, 100) : 0;
1245 if (netdev->miimon_interval != interval) {
/* Keep the enabled-device count in sync on enable/disable transitions. */
1248 if (interval && !netdev->miimon_interval) {
1249 atomic_add(&miimon_cnt, 1, &junk);
1250 } else if (!interval && netdev->miimon_interval) {
1251 atomic_sub(&miimon_cnt, 1, &junk);
1254 netdev->miimon_interval = interval;
1255 timer_set_expired(&netdev->miimon_timer);
1257 ovs_mutex_unlock(&netdev->mutex);
/* Periodic worker: walks every open netdev of the Linux class and, for
 * devices with MII monitoring enabled whose timer has expired, re-polls
 * link state.  A state change updates the cached miimon value and fires
 * netdev_linux_changed() so watchers are notified. */
1263 netdev_linux_miimon_run(void)
1265 struct shash device_shash;
1266 struct shash_node *node;
1268 shash_init(&device_shash);
1269 netdev_get_devices(&netdev_linux_class, &device_shash);
1270 SHASH_FOR_EACH (node, &device_shash) {
1271 struct netdev *netdev = node->data;
1272 struct netdev_linux *dev = netdev_linux_cast(netdev);
1275 ovs_mutex_lock(&dev->mutex);
1276 if (dev->miimon_interval > 0 && timer_expired(&dev->miimon_timer)) {
1277 netdev_linux_get_miimon(dev->up.name, &miimon);
1278 if (miimon != dev->miimon) {
1279 dev->miimon = miimon;
1280 netdev_linux_changed(dev, dev->ifi_flags, 0);
/* Re-arm the timer for the next poll of this device. */
1283 timer_set_duration(&dev->miimon_timer, dev->miimon_interval);
1285 ovs_mutex_unlock(&dev->mutex);
/* netdev_get_devices() returned referenced netdevs; drop each ref. */
1286 netdev_close(netdev);
1289 shash_destroy(&device_shash);
/* poll-loop companion to netdev_linux_miimon_run(): registers a wakeup
 * for each monitored device's miimon timer. */
1293 netdev_linux_miimon_wait(void)
1295 struct shash device_shash;
1296 struct shash_node *node;
1298 shash_init(&device_shash);
1299 netdev_get_devices(&netdev_linux_class, &device_shash);
1300 SHASH_FOR_EACH (node, &device_shash) {
1301 struct netdev *netdev = node->data;
1302 struct netdev_linux *dev = netdev_linux_cast(netdev);
1304 ovs_mutex_lock(&dev->mutex);
1305 if (dev->miimon_interval > 0) {
1306 timer_wait(&dev->miimon_timer);
1308 ovs_mutex_unlock(&dev->mutex);
1309 netdev_close(netdev);
1311 shash_destroy(&device_shash);
/* Exchanges the values pointed to by 'a' and 'b' (body elided in this
 * extract — presumably a plain three-assignment swap; used by
 * netdev_tap_get_stats() to swap rx/tx counters). */
1315 swap_uint64(uint64_t *a, uint64_t *b)
1322 /* Copies 'src' into 'dst', performing format conversion in the process.
1324 * 'src' is allowed to be misaligned. */
/* The eight counters present in ovs_vport_stats are read with unaligned
 * 64-bit loads; every field netdev_stats has but ovs_vport_stats lacks
 * is explicitly zeroed. */
1326 netdev_stats_from_ovs_vport_stats(struct netdev_stats *dst,
1327 const struct ovs_vport_stats *src)
1329 dst->rx_packets = get_unaligned_u64(&src->rx_packets);
1330 dst->tx_packets = get_unaligned_u64(&src->tx_packets);
1331 dst->rx_bytes = get_unaligned_u64(&src->rx_bytes);
1332 dst->tx_bytes = get_unaligned_u64(&src->tx_bytes);
1333 dst->rx_errors = get_unaligned_u64(&src->rx_errors);
1334 dst->tx_errors = get_unaligned_u64(&src->tx_errors);
1335 dst->rx_dropped = get_unaligned_u64(&src->rx_dropped);
1336 dst->tx_dropped = get_unaligned_u64(&src->tx_dropped);
/* Counters with no vport-layer equivalent. */
1338 dst->collisions = 0;
1339 dst->rx_length_errors = 0;
1340 dst->rx_over_errors = 0;
1341 dst->rx_crc_errors = 0;
1342 dst->rx_frame_errors = 0;
1343 dst->rx_fifo_errors = 0;
1344 dst->rx_missed_errors = 0;
1345 dst->tx_aborted_errors = 0;
1346 dst->tx_carrier_errors = 0;
1347 dst->tx_fifo_errors = 0;
1348 dst->tx_heartbeat_errors = 0;
1349 dst->tx_window_errors = 0;
/* Fetches stats for 'netdev' from the datapath vport layer via a
 * dpif_linux_vport_get() transaction; converts the reply's stats into
 * 'stats'.  A reply without stats is treated as an error (elided branch). */
1353 get_stats_via_vport__(const struct netdev *netdev, struct netdev_stats *stats)
1355 struct dpif_linux_vport reply;
1359 error = dpif_linux_vport_get(netdev_get_name(netdev), &reply, &buf);
1362 } else if (!reply.stats) {
1367 netdev_stats_from_ovs_vport_stats(stats, reply.stats);
/* Caching wrapper around get_stats_via_vport__(): retries only while the
 * cached result is success or the error cache bit is unset, warns
 * (rate-limited) on errors other than ENOENT, and records the outcome in
 * netdev->vport_stats_error under VALID_VPORT_STAT_ERROR. */
1375 get_stats_via_vport(const struct netdev *netdev_,
1376 struct netdev_stats *stats)
1378 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1380 if (!netdev->vport_stats_error ||
1381 !(netdev->cache_valid & VALID_VPORT_STAT_ERROR)) {
1384 error = get_stats_via_vport__(netdev_, stats);
1385 if (error && error != ENOENT) {
1386 VLOG_WARN_RL(&rl, "%s: obtaining netdev stats via vport failed "
1388 netdev_get_name(netdev_), ovs_strerror(error));
1390 netdev->vport_stats_error = error;
1391 netdev->cache_valid |= VALID_VPORT_STAT_ERROR;
1395 /* Retrieves current device stats for 'netdev-linux'. */
/* Combines vport-layer stats (preferred) with kernel netlink stats:
 * when vport stats are available, the netlink error counters are added
 * on top; when they are not, the netlink stats are used outright
 * (assignment branch elided in this extract). */
1397 netdev_linux_get_stats(const struct netdev *netdev_,
1398 struct netdev_stats *stats)
1400 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1401 struct netdev_stats dev_stats;
1404 ovs_mutex_lock(&netdev->mutex);
1405 get_stats_via_vport(netdev_, stats);
1406 error = get_stats_via_netlink(netdev_, &dev_stats);
1408 if (!netdev->vport_stats_error) {
1411 } else if (netdev->vport_stats_error) {
1412 /* stats not available from OVS then use ioctl stats. */
/* Error counters come only from the kernel, so accumulate them here. */
1415 stats->rx_errors += dev_stats.rx_errors;
1416 stats->tx_errors += dev_stats.tx_errors;
1417 stats->rx_dropped += dev_stats.rx_dropped;
1418 stats->tx_dropped += dev_stats.tx_dropped;
1419 stats->multicast += dev_stats.multicast;
1420 stats->collisions += dev_stats.collisions;
1421 stats->rx_length_errors += dev_stats.rx_length_errors;
1422 stats->rx_over_errors += dev_stats.rx_over_errors;
1423 stats->rx_crc_errors += dev_stats.rx_crc_errors;
1424 stats->rx_frame_errors += dev_stats.rx_frame_errors;
1425 stats->rx_fifo_errors += dev_stats.rx_fifo_errors;
1426 stats->rx_missed_errors += dev_stats.rx_missed_errors;
1427 stats->tx_aborted_errors += dev_stats.tx_aborted_errors;
1428 stats->tx_carrier_errors += dev_stats.tx_carrier_errors;
1429 stats->tx_fifo_errors += dev_stats.tx_fifo_errors;
1430 stats->tx_heartbeat_errors += dev_stats.tx_heartbeat_errors;
1431 stats->tx_window_errors += dev_stats.tx_window_errors;
1433 ovs_mutex_unlock(&netdev->mutex);
1438 /* Retrieves current device stats for 'netdev-tap' netdev or
1439 * netdev-internal. */
1441 netdev_tap_get_stats(const struct netdev *netdev_, struct netdev_stats *stats)
1443 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1444 struct netdev_stats dev_stats;
1447 ovs_mutex_lock(&netdev->mutex);
1448 get_stats_via_vport(netdev_, stats);
1449 error = get_stats_via_netlink(netdev_, &dev_stats);
1451 if (!netdev->vport_stats_error) {
1454 } else if (netdev->vport_stats_error) {
1455 /* Transmit and receive stats will appear to be swapped relative to the
1456 * other ports since we are the one sending the data, not a remote
1457 * computer. For consistency, we swap them back here. This does not
1458 * apply if we are getting stats from the vport layer because it always
1459 * tracks stats from the perspective of the switch. */
1462 swap_uint64(&stats->rx_packets, &stats->tx_packets);
1463 swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
1464 swap_uint64(&stats->rx_errors, &stats->tx_errors);
1465 swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
/* Counters that do not apply to tap/internal devices are zeroed. */
1466 stats->rx_length_errors = 0;
1467 stats->rx_over_errors = 0;
1468 stats->rx_crc_errors = 0;
1469 stats->rx_frame_errors = 0;
1470 stats->rx_fifo_errors = 0;
1471 stats->rx_missed_errors = 0;
1472 stats->tx_aborted_errors = 0;
1473 stats->tx_carrier_errors = 0;
1474 stats->tx_fifo_errors = 0;
1475 stats->tx_heartbeat_errors = 0;
1476 stats->tx_window_errors = 0;
/* Note rx/tx crossed deliberately: kernel counters are from the host's
 * perspective, the result must be from the switch's perspective. */
1478 stats->rx_dropped += dev_stats.tx_dropped;
1479 stats->tx_dropped += dev_stats.rx_dropped;
1481 stats->rx_errors += dev_stats.tx_errors;
1482 stats->tx_errors += dev_stats.rx_errors;
1484 stats->multicast += dev_stats.multicast;
1485 stats->collisions += dev_stats.collisions;
1487 ovs_mutex_unlock(&netdev->mutex);
/* Stats for internal devices come solely from the vport layer; the cached
 * vport_stats_error is returned as the result. */
1493 netdev_internal_get_stats(const struct netdev *netdev_,
1494 struct netdev_stats *stats)
1496 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1499 ovs_mutex_lock(&netdev->mutex);
1500 get_stats_via_vport(netdev_, stats);
1501 error = netdev->vport_stats_error;
1502 ovs_mutex_unlock(&netdev->mutex);
/* Pushes 'stats' down to the datapath vport layer with an
 * OVS_VPORT_CMD_SET transaction.  ENODEV is tolerated (see comment
 * below); only the eight counters the vport layer understands are sent. */
1508 netdev_internal_set_stats(struct netdev *netdev,
1509 const struct netdev_stats *stats)
1511 struct ovs_vport_stats vport_stats;
1512 struct dpif_linux_vport vport;
1515 vport_stats.rx_packets = stats->rx_packets;
1516 vport_stats.tx_packets = stats->tx_packets;
1517 vport_stats.rx_bytes = stats->rx_bytes;
1518 vport_stats.tx_bytes = stats->tx_bytes;
1519 vport_stats.rx_errors = stats->rx_errors;
1520 vport_stats.tx_errors = stats->tx_errors;
1521 vport_stats.rx_dropped = stats->rx_dropped;
1522 vport_stats.tx_dropped = stats->tx_dropped;
1524 dpif_linux_vport_init(&vport);
1525 vport.cmd = OVS_VPORT_CMD_SET;
1526 vport.name = netdev_get_name(netdev);
1527 vport.stats = &vport_stats;
1529 err = dpif_linux_vport_transact(&vport, NULL, NULL);
1531 /* If the vport layer doesn't know about the device, that doesn't mean it
1532 * doesn't exist (after all were able to open it when netdev_open() was
1533 * called), it just means that it isn't attached and we'll be getting
1534 * stats a different way. */
1535 if (err == ENODEV) {
/* Populates netdev->supported, ->advertised and ->current NETDEV_F_*
 * bitmaps from an ETHTOOL_GSET query, caching the result under
 * VALID_FEATURES along with get_features_error.  Returns early if the
 * cache is already valid.  NOTE(review): several lines (speed read,
 * autoneg test, exit label) are elided in this extract. */
1543 netdev_linux_read_features(struct netdev_linux *netdev)
1545 struct ethtool_cmd ecmd;
1549 if (netdev->cache_valid & VALID_FEATURES) {
1553 COVERAGE_INC(netdev_get_ethtool);
1554 memset(&ecmd, 0, sizeof ecmd);
1555 error = netdev_linux_do_ethtool(netdev->up.name, &ecmd,
1556 ETHTOOL_GSET, "ETHTOOL_GSET");
1561 /* Supported features. */
1562 netdev->supported = 0;
1563 if (ecmd.supported & SUPPORTED_10baseT_Half) {
1564 netdev->supported |= NETDEV_F_10MB_HD;
1566 if (ecmd.supported & SUPPORTED_10baseT_Full) {
1567 netdev->supported |= NETDEV_F_10MB_FD;
1569 if (ecmd.supported & SUPPORTED_100baseT_Half) {
1570 netdev->supported |= NETDEV_F_100MB_HD;
1572 if (ecmd.supported & SUPPORTED_100baseT_Full) {
1573 netdev->supported |= NETDEV_F_100MB_FD;
1575 if (ecmd.supported & SUPPORTED_1000baseT_Half) {
1576 netdev->supported |= NETDEV_F_1GB_HD;
1578 if (ecmd.supported & SUPPORTED_1000baseT_Full) {
1579 netdev->supported |= NETDEV_F_1GB_FD;
1581 if (ecmd.supported & SUPPORTED_10000baseT_Full) {
1582 netdev->supported |= NETDEV_F_10GB_FD;
1584 if (ecmd.supported & SUPPORTED_TP) {
1585 netdev->supported |= NETDEV_F_COPPER;
1587 if (ecmd.supported & SUPPORTED_FIBRE) {
1588 netdev->supported |= NETDEV_F_FIBER;
1590 if (ecmd.supported & SUPPORTED_Autoneg) {
1591 netdev->supported |= NETDEV_F_AUTONEG;
1593 if (ecmd.supported & SUPPORTED_Pause) {
1594 netdev->supported |= NETDEV_F_PAUSE;
1596 if (ecmd.supported & SUPPORTED_Asym_Pause) {
1597 netdev->supported |= NETDEV_F_PAUSE_ASYM;
1600 /* Advertised features. */
1601 netdev->advertised = 0;
1602 if (ecmd.advertising & ADVERTISED_10baseT_Half) {
1603 netdev->advertised |= NETDEV_F_10MB_HD;
1605 if (ecmd.advertising & ADVERTISED_10baseT_Full) {
1606 netdev->advertised |= NETDEV_F_10MB_FD;
1608 if (ecmd.advertising & ADVERTISED_100baseT_Half) {
1609 netdev->advertised |= NETDEV_F_100MB_HD;
1611 if (ecmd.advertising & ADVERTISED_100baseT_Full) {
1612 netdev->advertised |= NETDEV_F_100MB_FD;
1614 if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
1615 netdev->advertised |= NETDEV_F_1GB_HD;
1617 if (ecmd.advertising & ADVERTISED_1000baseT_Full) {
1618 netdev->advertised |= NETDEV_F_1GB_FD;
1620 if (ecmd.advertising & ADVERTISED_10000baseT_Full) {
1621 netdev->advertised |= NETDEV_F_10GB_FD;
1623 if (ecmd.advertising & ADVERTISED_TP) {
1624 netdev->advertised |= NETDEV_F_COPPER;
1626 if (ecmd.advertising & ADVERTISED_FIBRE) {
1627 netdev->advertised |= NETDEV_F_FIBER;
1629 if (ecmd.advertising & ADVERTISED_Autoneg) {
1630 netdev->advertised |= NETDEV_F_AUTONEG;
1632 if (ecmd.advertising & ADVERTISED_Pause) {
1633 netdev->advertised |= NETDEV_F_PAUSE;
1635 if (ecmd.advertising & ADVERTISED_Asym_Pause) {
1636 netdev->advertised |= NETDEV_F_PAUSE_ASYM;
1639 /* Current settings. */
/* 40G/100G/1T have no SPEED_* macro here, hence the raw numbers. */
1641 if (speed == SPEED_10) {
1642 netdev->current = ecmd.duplex ? NETDEV_F_10MB_FD : NETDEV_F_10MB_HD;
1643 } else if (speed == SPEED_100) {
1644 netdev->current = ecmd.duplex ? NETDEV_F_100MB_FD : NETDEV_F_100MB_HD;
1645 } else if (speed == SPEED_1000) {
1646 netdev->current = ecmd.duplex ? NETDEV_F_1GB_FD : NETDEV_F_1GB_HD;
1647 } else if (speed == SPEED_10000) {
1648 netdev->current = NETDEV_F_10GB_FD;
1649 } else if (speed == 40000) {
1650 netdev->current = NETDEV_F_40GB_FD;
1651 } else if (speed == 100000) {
1652 netdev->current = NETDEV_F_100GB_FD;
1653 } else if (speed == 1000000) {
1654 netdev->current = NETDEV_F_1TB_FD;
1656 netdev->current = 0;
1659 if (ecmd.port == PORT_TP) {
1660 netdev->current |= NETDEV_F_COPPER;
1661 } else if (ecmd.port == PORT_FIBRE) {
1662 netdev->current |= NETDEV_F_FIBER;
1666 netdev->current |= NETDEV_F_AUTONEG;
1670 netdev->cache_valid |= VALID_FEATURES;
1671 netdev->get_features_error = error;
1674 /* Stores the features supported by 'netdev' into of '*current', '*advertised',
1675 * '*supported', and '*peer'. Each value is a bitmap of NETDEV_* bits.
1676 * Returns 0 if successful, otherwise a positive errno value. */
/* Refreshes the feature cache via netdev_linux_read_features() and copies
 * out the cached bitmaps; peer features are not implemented (always 0). */
1678 netdev_linux_get_features(const struct netdev *netdev_,
1679 enum netdev_features *current,
1680 enum netdev_features *advertised,
1681 enum netdev_features *supported,
1682 enum netdev_features *peer)
1684 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1687 ovs_mutex_lock(&netdev->mutex);
1688 netdev_linux_read_features(netdev);
1689 if (!netdev->get_features_error) {
1690 *current = netdev->current;
1691 *advertised = netdev->advertised;
1692 *supported = netdev->supported;
1693 *peer = 0; /* XXX */
1695 error = netdev->get_features_error;
1696 ovs_mutex_unlock(&netdev->mutex);
1701 /* Set the features advertised by 'netdev' to 'advertise'. */
/* Read-modify-write over ethtool: fetch current settings with
 * ETHTOOL_GSET, rebuild the advertising mask from NETDEV_F_* bits, then
 * write it back with ETHTOOL_SSET. */
1703 netdev_linux_set_advertisements(struct netdev *netdev_,
1704 enum netdev_features advertise)
1706 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1707 struct ethtool_cmd ecmd;
1710 ovs_mutex_lock(&netdev->mutex);
1712 COVERAGE_INC(netdev_get_ethtool);
1713 memset(&ecmd, 0, sizeof ecmd);
1714 error = netdev_linux_do_ethtool(netdev_get_name(netdev_), &ecmd,
1715 ETHTOOL_GSET, "ETHTOOL_GSET");
1720 ecmd.advertising = 0;
1721 if (advertise & NETDEV_F_10MB_HD) {
1722 ecmd.advertising |= ADVERTISED_10baseT_Half;
1724 if (advertise & NETDEV_F_10MB_FD) {
1725 ecmd.advertising |= ADVERTISED_10baseT_Full;
1727 if (advertise & NETDEV_F_100MB_HD) {
1728 ecmd.advertising |= ADVERTISED_100baseT_Half;
1730 if (advertise & NETDEV_F_100MB_FD) {
1731 ecmd.advertising |= ADVERTISED_100baseT_Full;
1733 if (advertise & NETDEV_F_1GB_HD) {
1734 ecmd.advertising |= ADVERTISED_1000baseT_Half;
1736 if (advertise & NETDEV_F_1GB_FD) {
1737 ecmd.advertising |= ADVERTISED_1000baseT_Full;
1739 if (advertise & NETDEV_F_10GB_FD) {
1740 ecmd.advertising |= ADVERTISED_10000baseT_Full;
1742 if (advertise & NETDEV_F_COPPER) {
1743 ecmd.advertising |= ADVERTISED_TP;
1745 if (advertise & NETDEV_F_FIBER) {
1746 ecmd.advertising |= ADVERTISED_FIBRE;
1748 if (advertise & NETDEV_F_AUTONEG) {
1749 ecmd.advertising |= ADVERTISED_Autoneg;
1751 if (advertise & NETDEV_F_PAUSE) {
1752 ecmd.advertising |= ADVERTISED_Pause;
1754 if (advertise & NETDEV_F_PAUSE_ASYM) {
1755 ecmd.advertising |= ADVERTISED_Asym_Pause;
1757 COVERAGE_INC(netdev_set_ethtool);
1758 error = netdev_linux_do_ethtool(netdev_get_name(netdev_), &ecmd,
1759 ETHTOOL_SSET, "ETHTOOL_SSET");
1762 ovs_mutex_unlock(&netdev->mutex);
1766 /* Attempts to set input rate limiting (policing) policy. Returns 0 if
1767 * successful, otherwise a positive errno value. */
/* Implementation: always removes any existing ingress qdisc first; then,
 * if a rate is requested (branch condition elided in this extract), adds
 * an ingress qdisc and attaches a tc policer.  Results are cached under
 * VALID_POLICING so unchanged settings skip the tc round-trips. */
1769 netdev_linux_set_policing(struct netdev *netdev_,
1770 uint32_t kbits_rate, uint32_t kbits_burst)
1772 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1773 const char *netdev_name = netdev_get_name(netdev_);
1776 kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
1777 : !kbits_burst ? 1000 /* Default to 1000 kbits if 0. */
1778 : kbits_burst); /* Stick with user-specified value. */
1780 ovs_mutex_lock(&netdev->mutex);
1781 if (netdev->cache_valid & VALID_POLICING) {
1782 error = netdev->netdev_policing_error;
1783 if (error || (netdev->kbits_rate == kbits_rate &&
1784 netdev->kbits_burst == kbits_burst)) {
1785 /* Assume that settings haven't changed since we last set them. */
1788 netdev->cache_valid &= ~VALID_POLICING;
1791 COVERAGE_INC(netdev_set_policing);
1792 /* Remove any existing ingress qdisc. */
1793 error = tc_add_del_ingress_qdisc(netdev_, false);
1795 VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
1796 netdev_name, ovs_strerror(error));
1801 error = tc_add_del_ingress_qdisc(netdev_, true);
1803 VLOG_WARN_RL(&rl, "%s: adding policing qdisc failed: %s",
1804 netdev_name, ovs_strerror(error));
1808 error = tc_add_policer(netdev_, kbits_rate, kbits_burst);
1810 VLOG_WARN_RL(&rl, "%s: adding policing action failed: %s",
1811 netdev_name, ovs_strerror(error));
1816 netdev->kbits_rate = kbits_rate;
1817 netdev->kbits_burst = kbits_burst;
/* Cache the outcome; ENODEV is cached like success to avoid retry storms. */
1820 if (!error || error == ENODEV) {
1821 netdev->netdev_policing_error = error;
1822 netdev->cache_valid |= VALID_POLICING;
1824 ovs_mutex_unlock(&netdev->mutex);
/* Adds to 'types' the OVS name of every tc implementation in 'tcs' that
 * is installable and externally visible (non-empty ovs_name). */
1829 netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
1832 const struct tc_ops *const *opsp;
1834 for (opsp = tcs; *opsp != NULL; opsp++) {
1835 const struct tc_ops *ops = *opsp;
1836 if (ops->tc_install && ops->ovs_name[0] != '\0') {
1837 sset_add(types, ops->ovs_name);
/* Returns the tc_ops whose OVS-facing name matches 'name', or (elided
 * fallthrough) NULL when no entry in 'tcs' matches. */
1843 static const struct tc_ops *
1844 tc_lookup_ovs_name(const char *name)
1846 const struct tc_ops *const *opsp;
1848 for (opsp = tcs; *opsp != NULL; opsp++) {
1849 const struct tc_ops *ops = *opsp;
1850 if (!strcmp(name, ops->ovs_name)) {
/* Returns the tc_ops whose Linux qdisc name matches 'name'; entries with
 * a NULL linux_name are skipped.  NULL on no match (elided). */
1857 static const struct tc_ops *
1858 tc_lookup_linux_name(const char *name)
1860 const struct tc_ops *const *opsp;
1862 for (opsp = tcs; *opsp != NULL; opsp++) {
1863 const struct tc_ops *ops = *opsp;
1864 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
/* Looks up 'queue_id' in the device's tc queue hmap using a precomputed
 * 'hash'; returns the queue, or NULL (elided) when absent. */
1871 static struct tc_queue *
1872 tc_find_queue__(const struct netdev *netdev_, unsigned int queue_id,
1875 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1876 struct tc_queue *queue;
1878 HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev->tc->queues) {
1879 if (queue->queue_id == queue_id) {
/* Convenience wrapper: hashes 'queue_id' and delegates to tc_find_queue__. */
1886 static struct tc_queue *
1887 tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
1889 return tc_find_queue__(netdev, queue_id, hash_int(queue_id, 0));
/* Fills 'caps' for QoS 'type' from the matching tc_ops (error path for an
 * unknown type elided in this extract). */
1893 netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
1895 struct netdev_qos_capabilities *caps)
1897 const struct tc_ops *ops = tc_lookup_ovs_name(type);
1901 caps->n_queues = ops->n_queues;
/* Reports the device's current QoS: queries the installed qdisc, returns
 * its OVS type name in '*typep' and, when the tc implementation provides
 * qdisc_get, its configuration in 'details'. */
1906 netdev_linux_get_qos(const struct netdev *netdev_,
1907 const char **typep, struct smap *details)
1909 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1912 ovs_mutex_lock(&netdev->mutex);
1913 error = tc_query_qdisc(netdev_);
1915 *typep = netdev->tc->ops->ovs_name;
1916 error = (netdev->tc->ops->qdisc_get
1917 ? netdev->tc->ops->qdisc_get(netdev_, details)
1920 ovs_mutex_unlock(&netdev->mutex);
/* Installs QoS discipline 'type' with 'details' on 'netdev_'.  Rejects
 * unknown or non-installable types (elided EOPNOTSUPP-style return);
 * reconfigures in place when the type is unchanged, otherwise deletes
 * the existing qdisc and installs the new one. */
1926 netdev_linux_set_qos(struct netdev *netdev_,
1927 const char *type, const struct smap *details)
1929 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1930 const struct tc_ops *new_ops;
1933 new_ops = tc_lookup_ovs_name(type);
1934 if (!new_ops || !new_ops->tc_install) {
1938 ovs_mutex_lock(&netdev->mutex);
1939 error = tc_query_qdisc(netdev_);
1944 if (new_ops == netdev->tc->ops) {
1945 error = new_ops->qdisc_set ? new_ops->qdisc_set(netdev_, details) : 0;
1947 /* Delete existing qdisc. */
1948 error = tc_del_qdisc(netdev_);
/* tc_del_qdisc() must leave no tc state behind before reinstalling. */
1952 ovs_assert(netdev->tc == NULL);
1954 /* Install new qdisc. */
1955 error = new_ops->tc_install(netdev_, details);
1956 ovs_assert((error == 0) == (netdev->tc != NULL));
1960 ovs_mutex_unlock(&netdev->mutex);
/* Retrieves configuration of queue 'queue_id' into 'details' via the tc
 * implementation's class_get hook (missing-queue branch elided). */
1965 netdev_linux_get_queue(const struct netdev *netdev_,
1966 unsigned int queue_id, struct smap *details)
1968 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1971 ovs_mutex_lock(&netdev->mutex);
1972 error = tc_query_qdisc(netdev_);
1974 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
1976 ? netdev->tc->ops->class_get(netdev_, queue, details)
1979 ovs_mutex_unlock(&netdev->mutex);
/* Configures queue 'queue_id' with 'details': valid only when the id is
 * within the qdisc's queue range and the tc implementation supports
 * class_set (else the elided alternative result applies). */
1985 netdev_linux_set_queue(struct netdev *netdev_,
1986 unsigned int queue_id, const struct smap *details)
1988 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1991 ovs_mutex_lock(&netdev->mutex);
1992 error = tc_query_qdisc(netdev_);
1994 error = (queue_id < netdev->tc->ops->n_queues
1995 && netdev->tc->ops->class_set
1996 ? netdev->tc->ops->class_set(netdev_, queue_id, details)
1999 ovs_mutex_unlock(&netdev->mutex);
/* Deletes queue 'queue_id' through the tc implementation's class_delete
 * hook; unsupported-operation and missing-queue branches are elided. */
2005 netdev_linux_delete_queue(struct netdev *netdev_, unsigned int queue_id)
2007 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2010 ovs_mutex_lock(&netdev->mutex);
2011 error = tc_query_qdisc(netdev_);
2013 if (netdev->tc->ops->class_delete) {
2014 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2016 ? netdev->tc->ops->class_delete(netdev_, queue)
2022 ovs_mutex_unlock(&netdev->mutex);
/* Fetches statistics for queue 'queue_id': copies the queue's creation
 * time then delegates to class_get_stats (unsupported / missing-queue
 * branches elided in this extract). */
2028 netdev_linux_get_queue_stats(const struct netdev *netdev_,
2029 unsigned int queue_id,
2030 struct netdev_queue_stats *stats)
2032 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2035 ovs_mutex_lock(&netdev->mutex);
2036 error = tc_query_qdisc(netdev_);
2038 if (netdev->tc->ops->class_get_stats) {
2039 const struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2041 stats->created = queue->created;
2042 error = netdev->tc->ops->class_get_stats(netdev_, queue,
2051 ovs_mutex_unlock(&netdev->mutex);
/* Begins an RTM_GETTCLASS netlink dump of the device's traffic classes;
 * the request buffer is released once the dump is started. */
2057 start_queue_dump(const struct netdev *netdev, struct nl_dump *dump)
2059 struct ofpbuf request;
2060 struct tcmsg *tcmsg;
2062 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
2066 tcmsg->tcm_parent = 0;
2067 nl_dump_start(dump, NETLINK_ROUTE, &request);
2068 ofpbuf_uninit(&request);
/* Iterator state for queue dumps: a snapshot array of queue ids plus
 * cursor/count fields (remaining members elided in this extract). */
2072 struct netdev_linux_queue_state {
2073 unsigned int *queues;
/* Starts a queue dump by snapshotting every queue id into a freshly
 * allocated state object stored in '*statep'.  Snapshotting avoids
 * holding the mutex across the caller's iteration. */
2079 netdev_linux_queue_dump_start(const struct netdev *netdev_, void **statep)
2081 const struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2084 ovs_mutex_lock(&netdev->mutex);
2085 error = tc_query_qdisc(netdev_);
2087 if (netdev->tc->ops->class_get) {
2088 struct netdev_linux_queue_state *state;
2089 struct tc_queue *queue;
2092 *statep = state = xmalloc(sizeof *state);
2093 state->n_queues = hmap_count(&netdev->tc->queues);
2094 state->cur_queue = 0;
2095 state->queues = xmalloc(state->n_queues * sizeof *state->queues);
2098 HMAP_FOR_EACH (queue, hmap_node, &netdev->tc->queues) {
2099 state->queues[i++] = queue->queue_id;
2105 ovs_mutex_unlock(&netdev->mutex);
/* Advances the dump: skips snapshotted queue ids that have since been
 * deleted (tc_find_queue returning NULL) and returns the next live
 * queue's id and configuration. */
2111 netdev_linux_queue_dump_next(const struct netdev *netdev_, void *state_,
2112 unsigned int *queue_idp, struct smap *details)
2114 const struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2115 struct netdev_linux_queue_state *state = state_;
2118 ovs_mutex_lock(&netdev->mutex);
2119 while (state->cur_queue < state->n_queues) {
2120 unsigned int queue_id = state->queues[state->cur_queue++];
2121 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2124 *queue_idp = queue_id;
2125 error = netdev->tc->ops->class_get(netdev_, queue, details);
2129 ovs_mutex_unlock(&netdev->mutex);
/* Releases dump state allocated by netdev_linux_queue_dump_start()
 * (the free of 'state' itself is elided in this extract). */
2135 netdev_linux_queue_dump_done(const struct netdev *netdev OVS_UNUSED,
2138 struct netdev_linux_queue_state *state = state_;
2140 free(state->queues);
/* Streams per-queue statistics: runs an RTM_GETTCLASS netlink dump and
 * feeds each message to the tc implementation's class_dump_stats hook,
 * which invokes 'cb'/'aux' per queue.  Error-combination lines are
 * elided in this extract. */
2146 netdev_linux_dump_queue_stats(const struct netdev *netdev_,
2147 netdev_dump_queue_stats_cb *cb, void *aux)
2149 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2152 ovs_mutex_lock(&netdev->mutex);
2153 error = tc_query_qdisc(netdev_);
2155 struct nl_dump dump;
2157 if (!netdev->tc->ops->class_dump_stats) {
2159 } else if (!start_queue_dump(netdev_, &dump)) {
2165 while (nl_dump_next(&dump, &msg)) {
2166 retval = netdev->tc->ops->class_dump_stats(netdev_, &msg,
2173 retval = nl_dump_done(&dump);
2179 ovs_mutex_unlock(&netdev->mutex);
/* Returns the device's IPv4 address and netmask, filling the VALID_IN4
 * cache from SIOCGIFADDR/SIOCGIFNETMASK on a miss.  An all-zero address
 * is reported as EADDRNOTAVAIL. */
2185 netdev_linux_get_in4(const struct netdev *netdev_,
2186 struct in_addr *address, struct in_addr *netmask)
2188 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2191 ovs_mutex_lock(&netdev->mutex);
2192 if (!(netdev->cache_valid & VALID_IN4)) {
2193 error = netdev_linux_get_ipv4(netdev_, &netdev->address,
2194 SIOCGIFADDR, "SIOCGIFADDR");
2196 error = netdev_linux_get_ipv4(netdev_, &netdev->netmask,
2197 SIOCGIFNETMASK, "SIOCGIFNETMASK");
2199 netdev->cache_valid |= VALID_IN4;
2207 if (netdev->address.s_addr != INADDR_ANY) {
2208 *address = netdev->address;
2209 *netmask = netdev->netmask;
2211 error = EADDRNOTAVAIL;
2214 ovs_mutex_unlock(&netdev->mutex);
/* Assigns IPv4 'address'/'netmask' to the device via SIOCSIFADDR and
 * (for a non-zero address) SIOCSIFNETMASK, updating the IN4 cache on
 * success. */
2220 netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
2221 struct in_addr netmask)
2223 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2226 ovs_mutex_lock(&netdev->mutex);
2227 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
2229 netdev->cache_valid |= VALID_IN4;
2230 netdev->address = address;
2231 netdev->netmask = netmask;
/* Setting a netmask only makes sense alongside a real address. */
2232 if (address.s_addr != INADDR_ANY) {
2233 error = do_set_addr(netdev_, SIOCSIFNETMASK,
2234 "SIOCSIFNETMASK", netmask);
2237 ovs_mutex_unlock(&netdev->mutex);
/* Parses one /proc/net/if_inet6 line: sixteen 2-hex-digit address bytes,
 * four skipped fields, then the interface name.  Returns the ovs_scan()
 * result (true on full match). */
2243 parse_if_inet6_line(const char *line,
2244 struct in6_addr *in6, char ifname[16 + 1])
2246 uint8_t *s6 = in6->s6_addr;
2247 #define X8 "%2"SCNx8
2248 return ovs_scan(line,
2249 " "X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8
2250 "%*x %*x %*x %*x %16s\n",
2251 &s6[0], &s6[1], &s6[2], &s6[3],
2252 &s6[4], &s6[5], &s6[6], &s6[7],
2253 &s6[8], &s6[9], &s6[10], &s6[11],
2254 &s6[12], &s6[13], &s6[14], &s6[15],
2258 /* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address (if
2259 * 'in6' is non-null) and returns true. Otherwise, returns false. */
/* On a cache miss, scans /proc/net/if_inet6 for a line whose interface
 * name matches, caching in6addr_any when none is found.  NOTE(review):
 * the fclose() and the copy-out/return lines are elided in this extract. */
2261 netdev_linux_get_in6(const struct netdev *netdev_, struct in6_addr *in6)
2263 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2265 ovs_mutex_lock(&netdev->mutex);
2266 if (!(netdev->cache_valid & VALID_IN6)) {
2270 netdev->in6 = in6addr_any;
2272 file = fopen("/proc/net/if_inet6", "r");
2274 const char *name = netdev_get_name(netdev_);
2275 while (fgets(line, sizeof line, file)) {
2276 struct in6_addr in6_tmp;
2277 char ifname[16 + 1];
2278 if (parse_if_inet6_line(line, &in6_tmp, ifname)
2279 && !strcmp(name, ifname))
2281 netdev->in6 = in6_tmp;
2287 netdev->cache_valid |= VALID_IN6;
2290 ovs_mutex_unlock(&netdev->mutex);
/* Writes an AF_INET sockaddr for 'addr' into the generic 'sa'; the target
 * is zeroed first so trailing bytes are deterministic. */
2296 make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
2298 struct sockaddr_in sin;
2299 memset(&sin, 0, sizeof sin);
2300 sin.sin_family = AF_INET;
2301 sin.sin_addr = addr;
2304 memset(sa, 0, sizeof *sa);
2305 memcpy(sa, &sin, sizeof sin);
/* Helper for the SIOCSIF* address ioctls: packs 'addr' into ifr_addr and
 * issues 'ioctl_nr' for the device. */
2309 do_set_addr(struct netdev *netdev,
2310 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
2314 make_in4_sockaddr(&ifr.ifr_addr, addr);
2315 return af_inet_ifreq_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
2319 /* Adds 'router' as a default IP gateway. */
/* Builds a 0.0.0.0/0 rtentry with 'router' as gateway and installs it
 * via SIOCADDRT; failures are logged but the error is still reported. */
2321 netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
2323 struct in_addr any = { INADDR_ANY };
2327 memset(&rt, 0, sizeof rt);
2328 make_in4_sockaddr(&rt.rt_dst, any);
2329 make_in4_sockaddr(&rt.rt_gateway, router);
2330 make_in4_sockaddr(&rt.rt_genmask, any);
2331 rt.rt_flags = RTF_UP | RTF_GATEWAY;
2332 error = af_inet_ioctl(SIOCADDRT, &rt);
2334 VLOG_WARN("ioctl(SIOCADDRT): %s", ovs_strerror(error));
/* Resolves the route to 'host' by scanning /proc/net/route: finds an
 * up route whose masked destination matches, reporting the gateway (or
 * 0 for a directly reachable host) in '*next_hop' and a malloc'd
 * interface name in '*netdev_name'.  NOTE(review): fclose() and the
 * no-route/return lines are elided in this extract. */
2340 netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
2343 static const char fn[] = "/proc/net/route";
2348 *netdev_name = NULL;
2349 stream = fopen(fn, "r");
2350 if (stream == NULL) {
2351 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, ovs_strerror(errno));
2356 while (fgets(line, sizeof line, stream)) {
2359 ovs_be32 dest, gateway, mask;
2360 int refcnt, metric, mtu;
2361 unsigned int flags, use, window, irtt;
2364 "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
2366 iface, &dest, &gateway, &flags, &refcnt,
2367 &use, &metric, &mask, &mtu, &window, &irtt)) {
2368 VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
2372 if (!(flags & RTF_UP)) {
2373 /* Skip routes that aren't up. */
2377 /* The output of 'dest', 'mask', and 'gateway' were given in
2378 * network byte order, so we don't need need any endian
2379 * conversions here. */
2380 if ((dest & mask) == (host->s_addr & mask)) {
2382 /* The host is directly reachable. */
2383 next_hop->s_addr = 0;
2385 /* To reach the host, we must go through a gateway. */
2386 next_hop->s_addr = gateway;
2388 *netdev_name = xstrdup(iface);
/* Populates 'smap' with driver name/version/firmware from an
 * ETHTOOL_GDRVINFO query, cached under VALID_DRVINFO.  The drvinfo
 * buffer is cast to ethtool_cmd for the generic ethtool helper. */
2400 netdev_linux_get_status(const struct netdev *netdev_, struct smap *smap)
2402 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2405 ovs_mutex_lock(&netdev->mutex);
2406 if (!(netdev->cache_valid & VALID_DRVINFO)) {
2407 struct ethtool_cmd *cmd = (struct ethtool_cmd *) &netdev->drvinfo;
2409 COVERAGE_INC(netdev_get_ethtool);
2410 memset(&netdev->drvinfo, 0, sizeof netdev->drvinfo);
2411 error = netdev_linux_do_ethtool(netdev->up.name,
2414 "ETHTOOL_GDRVINFO");
2416 netdev->cache_valid |= VALID_DRVINFO;
2421 smap_add(smap, "driver_name", netdev->drvinfo.driver);
2422 smap_add(smap, "driver_version", netdev->drvinfo.version);
2423 smap_add(smap, "firmware_version", netdev->drvinfo.fw_version);
2425 ovs_mutex_unlock(&netdev->mutex);
/* Internal devices have no real driver; report a fixed identity. */
2431 netdev_internal_get_status(const struct netdev *netdev OVS_UNUSED,
2434 smap_add(smap, "driver_name", "openvswitch");
2438 /* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
2439 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
2440 * returns 0. Otherwise, it returns a positive errno value; in particular,
2441 * ENXIO indicates that there is not ARP table entry for 'ip' on 'netdev'. */
/* Implemented with the SIOCGARP ioctl on an arpreq keyed by IPv4 address
 * and device name; ENXIO (no entry) is not logged, other errors are. */
2443 netdev_linux_arp_lookup(const struct netdev *netdev,
2444 ovs_be32 ip, uint8_t mac[ETH_ADDR_LEN])
2447 struct sockaddr_in sin;
2450 memset(&r, 0, sizeof r);
2451 memset(&sin, 0, sizeof sin);
2452 sin.sin_family = AF_INET;
2453 sin.sin_addr.s_addr = ip;
2455 memcpy(&r.arp_pa, &sin, sizeof sin);
2456 r.arp_ha.sa_family = ARPHRD_ETHER;
2458 ovs_strzcpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
2459 COVERAGE_INC(netdev_arp_lookup);
2460 retval = af_inet_ioctl(SIOCGARP, &r);
2462 memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
2463 } else if (retval != ENXIO) {
2464 VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
2465 netdev_get_name(netdev), IP_ARGS(ip),
2466 ovs_strerror(retval));
/* Translates NETDEV_* flag bits to the corresponding IFF_* kernel bits
 * (the IFF_UP/IFF_PROMISC assignments are elided in this extract). */
2472 nd_to_iff_flags(enum netdev_flags nd)
2475 if (nd & NETDEV_UP) {
2478 if (nd & NETDEV_PROMISC) {
2481 if (nd & NETDEV_LOOPBACK) {
2482 iff |= IFF_LOOPBACK;
/* Inverse of nd_to_iff_flags(): maps IFF_* kernel bits back to NETDEV_*
 * bits (the IFF_UP branch is elided in this extract). */
2488 iff_to_nd_flags(int iff)
2490 enum netdev_flags nd = 0;
2494 if (iff & IFF_PROMISC) {
2495 nd |= NETDEV_PROMISC;
2497 if (iff & IFF_LOOPBACK) {
2498 nd |= NETDEV_LOOPBACK;
/* Core flag updater (mutex must already be held, per OVS_REQUIRES):
 * computes new IFF flags from 'off'/'on', reports the previous flags in
 * '*old_flagsp', and only touches the kernel when something changed —
 * re-reading ifi_flags afterward to keep the cache accurate. */
2504 update_flags(struct netdev_linux *netdev, enum netdev_flags off,
2505 enum netdev_flags on, enum netdev_flags *old_flagsp)
2506 OVS_REQUIRES(netdev->mutex)
2508 int old_flags, new_flags;
2511 old_flags = netdev->ifi_flags;
2512 *old_flagsp = iff_to_nd_flags(old_flags);
2513 new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
2514 if (new_flags != old_flags) {
2515 error = set_flags(netdev_get_name(&netdev->up), new_flags);
2516 get_flags(&netdev->up, &netdev->ifi_flags);
/* Public netdev-provider entry point: takes the device mutex and
 * delegates to update_flags(). */
2523 netdev_linux_update_flags(struct netdev *netdev_, enum netdev_flags off,
2524 enum netdev_flags on, enum netdev_flags *old_flagsp)
2526 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2529 ovs_mutex_lock(&netdev->mutex);
2530 error = update_flags(netdev, off, on, old_flagsp);
2531 ovs_mutex_unlock(&netdev->mutex);
/* NETDEV_LINUX_CLASS: macro that expands to a 'struct netdev_class'
 * initializer.  The per-variant slots (name, constructor, stats getters and
 * setters, feature probing, status) come in as macro arguments; everything
 * else is the shared netdev_linux_* implementation.
 * NOTE(review): several initializer lines (and the trailing backslashes'
 * continuation structure) are missing from this extraction -- the visible
 * entries are a subset of the full function table. */
2536 #define NETDEV_LINUX_CLASS(NAME, CONSTRUCT, GET_STATS, SET_STATS,  \
2537 GET_FEATURES, GET_STATUS)                       \
2543 netdev_linux_wait,                                              \
2545 netdev_linux_alloc,                                             \
2547 netdev_linux_destruct,                                          \
2548 netdev_linux_dealloc,                                           \
2549 NULL,                       /* get_config */                    \
2550 NULL,                       /* set_config */                    \
2551 NULL,                       /* get_tunnel_config */             \
2553 netdev_linux_send,                                              \
2554 netdev_linux_send_wait,                                         \
2556 netdev_linux_set_etheraddr,                                     \
2557 netdev_linux_get_etheraddr,                                     \
2558 netdev_linux_get_mtu,                                           \
2559 netdev_linux_set_mtu,                                           \
2560 netdev_linux_get_ifindex,                                       \
2561 netdev_linux_get_carrier,                                       \
2562 netdev_linux_get_carrier_resets,                                \
2563 netdev_linux_set_miimon_interval,                               \
2568 netdev_linux_set_advertisements,                                \
2570 netdev_linux_set_policing,                                      \
2571 netdev_linux_get_qos_types,                                     \
2572 netdev_linux_get_qos_capabilities,                              \
2573 netdev_linux_get_qos,                                           \
2574 netdev_linux_set_qos,                                           \
2575 netdev_linux_get_queue,                                         \
2576 netdev_linux_set_queue,                                         \
2577 netdev_linux_delete_queue,                                      \
2578 netdev_linux_get_queue_stats,                                   \
2579 netdev_linux_queue_dump_start,                                  \
2580 netdev_linux_queue_dump_next,                                   \
2581 netdev_linux_queue_dump_done,                                   \
2582 netdev_linux_dump_queue_stats,                                  \
2584 netdev_linux_get_in4,                                           \
2585 netdev_linux_set_in4,                                           \
2586 netdev_linux_get_in6,                                           \
2587 netdev_linux_add_router,                                        \
2588 netdev_linux_get_next_hop,                                      \
2590 netdev_linux_arp_lookup,                                        \
2592 netdev_linux_update_flags,                                      \
2594 netdev_linux_rx_alloc,                                          \
2595 netdev_linux_rx_construct,                                      \
2596 netdev_linux_rx_destruct,                                       \
2597 netdev_linux_rx_dealloc,                                        \
2598 netdev_linux_rx_recv,                                           \
2599 netdev_linux_rx_wait,                                           \
2600 netdev_linux_rx_drain,                                          \
/* Three concrete netdev classes instantiated from NETDEV_LINUX_CLASS:
 *   - "system"-style physical/virtual Linux devices,
 *   - TAP devices (own construct + stats variant),
 *   - OVS internal ports (internal stats get/set, no feature probing).
 * NOTE(review): the NAME argument lines (e.g. "system", "tap", "internal")
 * were dropped by the extraction; only the remaining macro arguments are
 * visible. */
2603 const struct netdev_class netdev_linux_class =
2606 netdev_linux_construct,
2607 netdev_linux_get_stats,
2608 NULL,                    /* set_stats */
2609 netdev_linux_get_features,
2610 netdev_linux_get_status);
2612 const struct netdev_class netdev_tap_class =
2615 netdev_linux_construct_tap,
2616 netdev_tap_get_stats,
2617 NULL,                   /* set_stats */
2618 netdev_linux_get_features,
2619 netdev_linux_get_status);
2621 const struct netdev_class netdev_internal_class =
2624 netdev_linux_construct,
2625 netdev_internal_get_stats,
2626 netdev_internal_set_stats,
2627 NULL,                  /* get_features */
2628 netdev_internal_get_status);
2630 /* HTB traffic control class. */
2632 #define HTB_N_QUEUES 0xf000
/* struct htb: per-netdev HTB qdisc state (embeds a 'struct tc'; the struct
 * opener lines are missing here).  struct htb_class: one HTB leaf class,
 * embedding a 'struct tc_queue' so it can live in the tc's queue hmap.
 * Rates are bytes/s, burst is bytes, lower priority value = higher
 * priority. */
2636 unsigned int max_rate;      /* In bytes/s. */
2640 struct tc_queue tc_queue;
2641 unsigned int min_rate;      /* In bytes/s. */
2642 unsigned int max_rate;      /* In bytes/s. */
2643 unsigned int burst;         /* In bytes. */
2644 unsigned int priority;      /* Lower values are higher priorities. */
/* htb_get__(): downcasts 'netdev_->tc' to the containing 'struct htb'.
 * Only valid when the installed tc is actually HTB. */
2648 htb_get__(const struct netdev *netdev_)
2650 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2651 return CONTAINER_OF(netdev->tc, struct htb, tc);
/* htb_install__(): allocates a 'struct htb', records 'max_rate', and makes
 * it the netdev's active tc implementation. */
2655 htb_install__(struct netdev *netdev_, uint64_t max_rate)
2657 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2660 htb = xmalloc(sizeof *htb);
2661 tc_init(&htb->tc, &tc_ops_htb);
2662 htb->max_rate = max_rate;
2664 netdev->tc = &htb->tc;
2667 /* Create an HTB qdisc.
2669 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
/* Deletes any existing qdisc first, then sends an RTM_NEWQDISC netlink
 * request installing HTB as the root qdisc (handle 1:0) with
 * rate2quantum=10.  Returns tc_transact()'s errno-style result. */
2671 htb_setup_qdisc__(struct netdev *netdev)
2674 struct tc_htb_glob opt;
2675 struct ofpbuf request;
2676 struct tcmsg *tcmsg;
2678 tc_del_qdisc(netdev);
2680 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2681 NLM_F_EXCL | NLM_F_CREATE, &request);
/* NOTE(review): the NULL-check on 'tcmsg' (tc_make_request can fail) was
 * dropped by the extraction. */
2685 tcmsg->tcm_handle = tc_make_handle(1, 0);
2686 tcmsg->tcm_parent = TC_H_ROOT;
2688 nl_msg_put_string(&request, TCA_KIND, "htb");
2690 memset(&opt, 0, sizeof opt);
2691 opt.rate2quantum = 10;
2695 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2696 nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
2697 nl_msg_end_nested(&request, opt_offset);
2699 return tc_transact(&request, NULL);
2702 /* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
2703 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
/* Creates/replaces one HTB class via RTM_NEWTCLASS.  Requires the device
 * MTU (needed by the kernel rate tables); fails with a warning if the MTU
 * is unavailable.  On tc_transact() failure logs the full class parameters
 * at warning level. */
2705 htb_setup_class__(struct netdev *netdev, unsigned int handle,
2706 unsigned int parent, struct htb_class *class)
2709 struct tc_htb_opt opt;
2710 struct ofpbuf request;
2711 struct tcmsg *tcmsg;
2715 error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
2717 VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU",
2718 netdev_get_name(netdev));
/* Rate/ceil tables and buffer sizes are derived from the class parameters
 * and the MTU. */
2722 memset(&opt, 0, sizeof opt);
2723 tc_fill_rate(&opt.rate, class->min_rate, mtu);
2724 tc_fill_rate(&opt.ceil, class->max_rate, mtu);
2725 opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
2726 opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
2727 opt.prio = class->priority;
2729 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
2733 tcmsg->tcm_handle = handle;
2734 tcmsg->tcm_parent = parent;
2736 nl_msg_put_string(&request, TCA_KIND, "htb");
2737 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2738 nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
2739 tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
2740 tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
2741 nl_msg_end_nested(&request, opt_offset);
2743 error = tc_transact(&request, NULL);
2745 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
2746 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
2747 netdev_get_name(netdev),
2748 tc_get_major(handle), tc_get_minor(handle),
2749 tc_get_major(parent), tc_get_minor(parent),
2750 class->min_rate, class->max_rate,
2751 class->burst, class->priority, ovs_strerror(error));
2756 /* Parses Netlink attributes in 'options' for HTB parameters and stores a
2757 * description of them into 'details'.  The description complies with the
2758 * specification given in the vswitch database documentation for linux-htb
/* Requires TCA_HTB_PARMS (a struct tc_htb_opt) in the nested attributes;
 * fills 'class' from its rate/ceil/buffer/prio fields.  Returns nonzero
 * (presumably EPROTO, on a dropped line) if parsing fails. */
2761 htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
2763 static const struct nl_policy tca_htb_policy[] = {
2764 [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
2765 .min_len = sizeof(struct tc_htb_opt) },
2768 struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
2769 const struct tc_htb_opt *htb;
2771 if (!nl_parse_nested(nl_options, tca_htb_policy,
2772 attrs, ARRAY_SIZE(tca_htb_policy))) {
2773 VLOG_WARN_RL(&rl, "failed to parse HTB class options");
2777 htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
2778 class->min_rate = htb->rate.rate;
2779 class->max_rate = htb->ceil.rate;
2780 class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
2781 class->priority = htb->prio;
/* htb_parse_tcmsg__(): parses a tc class message.  Class handles 1:1
 * through 1:HTB_N_QUEUES map to OVS queue ids 0..HTB_N_QUEUES-1; other
 * handles are rejected (the rejection branch is on dropped lines). */
2786 htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2787 struct htb_class *options,
2788 struct netdev_queue_stats *stats)
2790 struct nlattr *nl_options;
2791 unsigned int handle;
2794 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2795 if (!error && queue_id) {
2796 unsigned int major = tc_get_major(handle);
2797 unsigned int minor = tc_get_minor(handle);
2798 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2799 *queue_id = minor - 1;
2804 if (!error && options) {
2805 error = htb_parse_tca_options__(nl_options, options);
/* htb_parse_qdisc_details__(): reads "max-rate" (bits/s in the database,
 * converted to bytes/s here) from 'details'.  If absent or zero, falls back
 * to the link speed from ethtool feature probing, defaulting to 100 Mbps.
 * min_rate is pinned to max_rate for the root class. */
2811 htb_parse_qdisc_details__(struct netdev *netdev_,
2812 const struct smap *details, struct htb_class *hc)
2814 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2815 const char *max_rate_s;
2817 max_rate_s = smap_get(details, "max-rate");
2818 hc->max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
2819 if (!hc->max_rate) {
2820 enum netdev_features current;
2822 netdev_linux_read_features(netdev);
2823 current = !netdev->get_features_error ? netdev->current : 0;
2824 hc->max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
2826 hc->min_rate = hc->max_rate;
/* htb_parse_class_details__(): converts the per-queue smap ("min-rate",
 * "max-rate", "burst", "priority", all bits except priority) into a
 * struct htb_class, clamping rates into [mtu, qdisc max_rate] and burst to
 * at least mtu + 64 (headroom for L2 headers; see comment below). */
2832 htb_parse_class_details__(struct netdev *netdev,
2833 const struct smap *details, struct htb_class *hc)
2835 const struct htb *htb = htb_get__(netdev);
2836 const char *min_rate_s = smap_get(details, "min-rate");
2837 const char *max_rate_s = smap_get(details, "max-rate");
2838 const char *burst_s = smap_get(details, "burst");
2839 const char *priority_s = smap_get(details, "priority");
2842 error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
2844 VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU",
2845 netdev_get_name(netdev));
2849 /* HTB requires at least an mtu sized min-rate to send any traffic even
2850 * on uncongested links. */
2851 hc->min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
2852 hc->min_rate = MAX(hc->min_rate, mtu);
2853 hc->min_rate = MIN(hc->min_rate, htb->max_rate);
2856 hc->max_rate = (max_rate_s
2857 ? strtoull(max_rate_s, NULL, 10) / 8
2859 hc->max_rate = MAX(hc->max_rate, hc->min_rate);
2860 hc->max_rate = MIN(hc->max_rate, htb->max_rate);
2864 * According to hints in the documentation that I've read, it is important
2865 * that 'burst' be at least as big as the largest frame that might be
2866 * transmitted.  Also, making 'burst' a bit bigger than necessary is OK,
2867 * but having it a bit too small is a problem.  Since netdev_get_mtu()
2868 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
2869 * the MTU.  We actually add 64, instead of 14, as a guard against
2870 * additional headers get tacked on somewhere that we're not aware of. */
2871 hc->burst = burst_s ? strtoull(burst_s, NULL, 10) / 8 : 0;
2872 hc->burst = MAX(hc->burst, mtu + 64);
2875 hc->priority = priority_s ? strtoul(priority_s, NULL, 10) : 0;
/* htb_query_class__(): asks the kernel for one tc class and parses its
 * options/stats into 'options'/'stats' (either may be NULL, presumably --
 * see the NULL arguments at call sites). */
2881 htb_query_class__(const struct netdev *netdev, unsigned int handle,
2882 unsigned int parent, struct htb_class *options,
2883 struct netdev_queue_stats *stats)
2885 struct ofpbuf *reply;
2888 error = tc_query_class(netdev, handle, parent, &reply);
2890 error = htb_parse_tcmsg__(reply, NULL, options, stats);
2891 ofpbuf_delete(reply);
/* htb_tc_install(): tc_ops 'tc_install' hook -- creates the root HTB qdisc,
 * its default class 1:0xfffe sized from 'details', then registers the
 * in-memory htb state. */
2897 htb_tc_install(struct netdev *netdev, const struct smap *details)
2901 error = htb_setup_qdisc__(netdev);
2903 struct htb_class hc;
2905 htb_parse_qdisc_details__(netdev, details, &hc);
2906 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2907 tc_make_handle(1, 0), &hc);
2909 htb_install__(netdev, hc.max_rate);
/* htb_class_cast__(): tc_queue -> containing htb_class. */
2915 static struct htb_class *
2916 htb_class_cast__(const struct tc_queue *queue)
2918 return CONTAINER_OF(queue, struct htb_class, tc_queue);
/* htb_update_queue__(): inserts or refreshes the in-memory htb_class for
 * 'queue_id' in the tc queue hmap, copying the parameters from 'hc'.
 * Reuses an existing entry when one is found. */
2922 htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
2923 const struct htb_class *hc)
2925 struct htb *htb = htb_get__(netdev);
2926 size_t hash = hash_int(queue_id, 0);
2927 struct tc_queue *queue;
2928 struct htb_class *hcp;
2930 queue = tc_find_queue__(netdev, queue_id, hash);
2932 hcp = htb_class_cast__(queue);
2934 hcp = xmalloc(sizeof *hcp);
2935 queue = &hcp->tc_queue;
2936 queue->queue_id = queue_id;
2937 queue->created = time_msec();
2938 hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
2941 hcp->min_rate = hc->min_rate;
2942 hcp->max_rate = hc->max_rate;
2943 hcp->burst = hc->burst;
2944 hcp->priority = hc->priority;
/* htb_tc_load(): tc_ops 'tc_load' hook -- reconstructs htb state from a
 * kernel already configured with HTB: queries the default class for the
 * qdisc max-rate, then dumps all classes to repopulate the queue map. */
2948 htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
2951 struct nl_dump dump;
2952 struct htb_class hc;
2954 /* Get qdisc options. */
2956 htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
2957 htb_install__(netdev, hc.max_rate);
2960 if (!start_queue_dump(netdev, &dump)) {
2963 while (nl_dump_next(&dump, &msg)) {
2964 unsigned int queue_id;
2966 if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
2967 htb_update_queue__(netdev, queue_id, &hc);
2970 nl_dump_done(&dump);
/* htb_tc_destroy(): frees every htb_class in the queue map, then
 * (presumably, on dropped lines) tc_destroy + free of the htb itself. */
2976 htb_tc_destroy(struct tc *tc)
2978 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
2979 struct htb_class *hc, *next;
2981 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &htb->tc.queues) {
2982 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
/* htb_qdisc_get(): reports "max-rate" in bits/s (hence the 8ULL *). */
2990 htb_qdisc_get(const struct netdev *netdev, struct smap *details)
2992 const struct htb *htb = htb_get__(netdev);
2993 smap_add_format(details, "max-rate", "%llu", 8ULL * htb->max_rate);
/* htb_qdisc_set(): re-sizes the default class 1:0xfffe from 'details' and,
 * on success, updates the cached max_rate. */
2998 htb_qdisc_set(struct netdev *netdev, const struct smap *details)
3000 struct htb_class hc;
3003 htb_parse_qdisc_details__(netdev, details, &hc);
3004 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3005 tc_make_handle(1, 0), &hc);
3007 htb_get__(netdev)->max_rate = hc.max_rate;
/* htb_class_get(): emits queue details in bits/s; "max-rate" only when it
 * differs from "min-rate". */
3013 htb_class_get(const struct netdev *netdev OVS_UNUSED,
3014 const struct tc_queue *queue, struct smap *details)
3016 const struct htb_class *hc = htb_class_cast__(queue);
3018 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
3019 if (hc->min_rate != hc->max_rate) {
3020 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
3022 smap_add_format(details, "burst", "%llu", 8ULL * hc->burst);
3024 smap_add_format(details, "priority", "%u", hc->priority);
/* htb_class_set(): validates details, pushes class 1:(queue_id+1) under the
 * default class, then mirrors the change into the in-memory queue map. */
3030 htb_class_set(struct netdev *netdev, unsigned int queue_id,
3031 const struct smap *details)
3033 struct htb_class hc;
3036 error = htb_parse_class_details__(netdev, details, &hc);
3041 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3042 tc_make_handle(1, 0xfffe), &hc);
3047 htb_update_queue__(netdev, queue_id, &hc);
/* htb_class_delete(): deletes kernel class 1:(queue_id+1); on success
 * removes and (presumably, dropped line) frees the in-memory entry. */
3052 htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
3054 struct htb_class *hc = htb_class_cast__(queue);
3055 struct htb *htb = htb_get__(netdev);
3058 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3060 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
/* htb_class_get_stats(): live kernel stats for one queue. */
3067 htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3068 struct netdev_queue_stats *stats)
3070 return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3071 tc_make_handle(1, 0xfffe), NULL, stats);
/* htb_class_dump_stats(): callback-per-queue stats from a dumped class
 * message; only handles in the 1:1..1:HTB_N_QUEUES range are reported. */
3075 htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3076 const struct ofpbuf *nlmsg,
3077 netdev_dump_queue_stats_cb *cb, void *aux)
3079 struct netdev_queue_stats stats;
3080 unsigned int handle, major, minor;
3083 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3088 major = tc_get_major(handle);
3089 minor = tc_get_minor(handle);
3090 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
3091 (*cb)(minor - 1, &stats, aux);
/* Virtual-function table wiring HTB into the generic tc framework.
 * NOTE(review): several middle entries (tc_install .. class_delete) were
 * dropped by the extraction. */
3096 static const struct tc_ops tc_ops_htb = {
3097 "htb",                      /* linux_name */
3098 "linux-htb",                /* ovs_name */
3099 HTB_N_QUEUES,               /* n_queues */
3108 htb_class_get_stats,
3109 htb_class_dump_stats
3112 /* "linux-hfsc" traffic control class. */
3114 #define HFSC_N_QUEUES 0xf000
/* struct hfsc / struct hfsc_class: parallel the HTB structures above; the
 * struct openers and min_rate/max_rate member lines are missing from this
 * extraction. */
3122 struct tc_queue tc_queue;
3127 static struct hfsc *
3128 hfsc_get__(const struct netdev *netdev_)
3130 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3131 return CONTAINER_OF(netdev->tc, struct hfsc, tc);
3134 static struct hfsc_class *
3135 hfsc_class_cast__(const struct tc_queue *queue)
3137 return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
/* hfsc_install__(): allocates the per-netdev hfsc state, records the qdisc
 * max_rate, and makes it the active tc implementation. */
3141 hfsc_install__(struct netdev *netdev_, uint32_t max_rate)
3143 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3146 hfsc = xmalloc(sizeof *hfsc);
3147 tc_init(&hfsc->tc, &tc_ops_hfsc);
3148 hfsc->max_rate = max_rate;
3149 netdev->tc = &hfsc->tc;
/* hfsc_update_queue__(): insert-or-refresh of the in-memory class for
 * 'queue_id', mirroring htb_update_queue__() above. */
3153 hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
3154 const struct hfsc_class *hc)
3158 struct hfsc_class *hcp;
3159 struct tc_queue *queue;
3161 hfsc = hfsc_get__(netdev);
3162 hash = hash_int(queue_id, 0);
3164 queue = tc_find_queue__(netdev, queue_id, hash);
3166 hcp = hfsc_class_cast__(queue);
3168 hcp = xmalloc(sizeof *hcp);
3169 queue = &hcp->tc_queue;
3170 queue->queue_id = queue_id;
3171 queue->created = time_msec();
3172 hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
3175 hcp->min_rate = hc->min_rate;
3176 hcp->max_rate = hc->max_rate;
/* hfsc_parse_tca_options__(): parses the RSC/FSC/USC service curves from a
 * kernel HFSC class.  OVS only ever configures linear curves with RSC==FSC,
 * so anything else (non-zero m1/d, RSC.m2 != FSC.m2, or RSC.m2 > USC.m2) is
 * rejected with a warning.  min_rate comes from the fair-share curve,
 * max_rate from the upper-limit curve. */
3180 hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
3182 const struct tc_service_curve *rsc, *fsc, *usc;
3183 static const struct nl_policy tca_hfsc_policy[] = {
/* NOTE(review): the [TCA_HFSC_RSC]/[TCA_HFSC_FSC]/[TCA_HFSC_USC] designator
 * lines were dropped by the extraction; three policy entries remain. */
3185 .type = NL_A_UNSPEC,
3187 .min_len = sizeof(struct tc_service_curve),
3190 .type = NL_A_UNSPEC,
3192 .min_len = sizeof(struct tc_service_curve),
3195 .type = NL_A_UNSPEC,
3197 .min_len = sizeof(struct tc_service_curve),
3200 struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];
3202 if (!nl_parse_nested(nl_options, tca_hfsc_policy,
3203 attrs, ARRAY_SIZE(tca_hfsc_policy))) {
3204 VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
3208 rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
3209 fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
3210 usc = nl_attr_get(attrs[TCA_HFSC_USC]);
3212 if (rsc->m1 != 0 || rsc->d != 0 ||
3213 fsc->m1 != 0 || fsc->d != 0 ||
3214 usc->m1 != 0 || usc->d != 0) {
3215 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3216 "Non-linear service curves are not supported.");
3220 if (rsc->m2 != fsc->m2) {
3221 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3222 "Real-time service curves are not supported ");
3226 if (rsc->m2 > usc->m2) {
3227 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3228 "Min-rate service curve is greater than "
3229 "the max-rate service curve.");
3233 class->min_rate = fsc->m2;
3234 class->max_rate = usc->m2;
/* hfsc_parse_tcmsg__(): HFSC counterpart of htb_parse_tcmsg__() -- maps
 * handle 1:N to queue id N-1 and optionally parses class options. */
3239 hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
3240 struct hfsc_class *options,
3241 struct netdev_queue_stats *stats)
3244 unsigned int handle;
3245 struct nlattr *nl_options;
3247 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
3253 unsigned int major, minor;
3255 major = tc_get_major(handle);
3256 minor = tc_get_minor(handle);
3257 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3258 *queue_id = minor - 1;
3265 error = hfsc_parse_tca_options__(nl_options, options);
/* hfsc_query_class__(): query one kernel class, parse options/stats. */
3272 hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
3273 unsigned int parent, struct hfsc_class *options,
3274 struct netdev_queue_stats *stats)
3277 struct ofpbuf *reply;
3279 error = tc_query_class(netdev, handle, parent, &reply);
3284 error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
3285 ofpbuf_delete(reply);
/* hfsc_parse_qdisc_details__(): "max-rate" (bits/s) from 'details',
 * defaulting to the probed link speed (100 Mbps fallback) when absent or
 * zero; both curves of the default class get the same rate. */
3290 hfsc_parse_qdisc_details__(struct netdev *netdev_, const struct smap *details,
3291 struct hfsc_class *class)
3293 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3295 const char *max_rate_s;
3297 max_rate_s = smap_get(details, "max-rate");
3298 max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
3301 enum netdev_features current;
3303 netdev_linux_read_features(netdev);
3304 current = !netdev->get_features_error ? netdev->current : 0;
3305 max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
3308 class->min_rate = max_rate;
3309 class->max_rate = max_rate;
/* hfsc_parse_class_details__(): per-queue "min-rate"/"max-rate" (bits/s),
 * clamped to [1, qdisc max_rate] and max >= min. */
3313 hfsc_parse_class_details__(struct netdev *netdev,
3314 const struct smap *details,
3315 struct hfsc_class * class)
3317 const struct hfsc *hfsc;
3318 uint32_t min_rate, max_rate;
3319 const char *min_rate_s, *max_rate_s;
3321 hfsc = hfsc_get__(netdev);
3322 min_rate_s = smap_get(details, "min-rate");
3323 max_rate_s = smap_get(details, "max-rate");
3325 min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
3326 min_rate = MAX(min_rate, 1);
3327 min_rate = MIN(min_rate, hfsc->max_rate);
3329 max_rate = (max_rate_s
3330 ? strtoull(max_rate_s, NULL, 10) / 8
3332 max_rate = MAX(max_rate, min_rate);
3333 max_rate = MIN(max_rate, hfsc->max_rate);
3335 class->min_rate = min_rate;
3336 class->max_rate = max_rate;
3341 /* Create an HFSC qdisc.
3343 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
/* Deletes any existing qdisc, then installs HFSC as root qdisc (handle 1:0)
 * with zeroed options (the defcls assignment appears to be on a dropped
 * line). */
3345 hfsc_setup_qdisc__(struct netdev * netdev)
3347 struct tcmsg *tcmsg;
3348 struct ofpbuf request;
3349 struct tc_hfsc_qopt opt;
3351 tc_del_qdisc(netdev);
3353 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3354 NLM_F_EXCL | NLM_F_CREATE, &request);
3360 tcmsg->tcm_handle = tc_make_handle(1, 0);
3361 tcmsg->tcm_parent = TC_H_ROOT;
3363 memset(&opt, 0, sizeof opt);
3366 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3367 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
3369 return tc_transact(&request, NULL);
3372 /* Create an HFSC class.
3374 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
3375 * sc rate <min_rate> ul rate <max_rate>" */
/* Builds two linear service curves: 'min' (used for both RSC and FSC, i.e.
 * the "sc" in the tc command) and 'max' (the upper limit, USC).  The
 * memset/m1/d zeroing lines for the curves were dropped by the
 * extraction. */
3377 hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
3378 unsigned int parent, struct hfsc_class *class)
3382 struct tcmsg *tcmsg;
3383 struct ofpbuf request;
3384 struct tc_service_curve min, max;
3386 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
3392 tcmsg->tcm_handle = handle;
3393 tcmsg->tcm_parent = parent;
3397 min.m2 = class->min_rate;
3401 max.m2 = class->max_rate;
3403 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3404 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3405 nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
3406 nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
3407 nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
3408 nl_msg_end_nested(&request, opt_offset);
3410 error = tc_transact(&request, NULL);
3412 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
3413 "min-rate %ubps, max-rate %ubps (%s)",
3414 netdev_get_name(netdev),
3415 tc_get_major(handle), tc_get_minor(handle),
3416 tc_get_major(parent), tc_get_minor(parent),
3417 class->min_rate, class->max_rate, ovs_strerror(error));
/* hfsc_tc_install(): tc_ops 'tc_install' hook -- root HFSC qdisc, default
 * class 1:0xfffe sized from 'details', then register in-memory state. */
3424 hfsc_tc_install(struct netdev *netdev, const struct smap *details)
3427 struct hfsc_class class;
3429 error = hfsc_setup_qdisc__(netdev);
3435 hfsc_parse_qdisc_details__(netdev, details, &class);
3436 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3437 tc_make_handle(1, 0), &class);
3443 hfsc_install__(netdev, class.max_rate);
/* hfsc_tc_load(): reconstructs state from an already-configured kernel,
 * mirroring htb_tc_load() above. */
3448 hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3451 struct nl_dump dump;
3452 struct hfsc_class hc;
3455 hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
3456 hfsc_install__(netdev, hc.max_rate);
3458 if (!start_queue_dump(netdev, &dump)) {
3462 while (nl_dump_next(&dump, &msg)) {
3463 unsigned int queue_id;
3465 if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3466 hfsc_update_queue__(netdev, queue_id, &hc);
3470 nl_dump_done(&dump);
/* hfsc_tc_destroy(): frees all in-memory classes (and presumably, on
 * dropped lines, the hfsc itself). */
3475 hfsc_tc_destroy(struct tc *tc)
3478 struct hfsc_class *hc, *next;
3480 hfsc = CONTAINER_OF(tc, struct hfsc, tc);
3482 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
3483 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
/* hfsc_qdisc_get(): reports the qdisc "max-rate" in bits/s. */
3492 hfsc_qdisc_get(const struct netdev *netdev, struct smap *details)
3494 const struct hfsc *hfsc;
3495 hfsc = hfsc_get__(netdev);
3496 smap_add_format(details, "max-rate", "%llu", 8ULL * hfsc->max_rate);
/* hfsc_qdisc_set(): resizes the default class from 'details'; on success
 * updates the cached max_rate. */
3501 hfsc_qdisc_set(struct netdev *netdev, const struct smap *details)
3504 struct hfsc_class class;
3506 hfsc_parse_qdisc_details__(netdev, details, &class);
3507 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3508 tc_make_handle(1, 0), &class);
3511 hfsc_get__(netdev)->max_rate = class.max_rate;
/* hfsc_class_get(): queue details in bits/s; "max-rate" only when it
 * differs from "min-rate". */
3518 hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
3519 const struct tc_queue *queue, struct smap *details)
3521 const struct hfsc_class *hc;
3523 hc = hfsc_class_cast__(queue);
3524 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
3525 if (hc->min_rate != hc->max_rate) {
3526 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
/* hfsc_class_set(): validates details, pushes class 1:(queue_id+1), then
 * mirrors the change into the in-memory queue map. */
3532 hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
3533 const struct smap *details)
3536 struct hfsc_class class;
3538 error = hfsc_parse_class_details__(netdev, details, &class);
3543 error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3544 tc_make_handle(1, 0xfffe), &class);
3549 hfsc_update_queue__(netdev, queue_id, &class);
/* hfsc_class_delete(): deletes the kernel class; on success removes (and
 * presumably frees, dropped line) the in-memory entry. */
3554 hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
3558 struct hfsc_class *hc;
3560 hc = hfsc_class_cast__(queue);
3561 hfsc = hfsc_get__(netdev);
3563 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3565 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
/* hfsc_class_get_stats(): live kernel stats for one queue. */
3572 hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3573 struct netdev_queue_stats *stats)
3575 return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3576 tc_make_handle(1, 0xfffe), NULL, stats);
/* hfsc_class_dump_stats(): callback-per-queue stats from a dumped class
 * message; only handles 1:1..1:HFSC_N_QUEUES are reported. */
3580 hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3581 const struct ofpbuf *nlmsg,
3582 netdev_dump_queue_stats_cb *cb, void *aux)
3584 struct netdev_queue_stats stats;
3585 unsigned int handle, major, minor;
3588 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3593 major = tc_get_major(handle);
3594 minor = tc_get_minor(handle);
3595 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3596 (*cb)(minor - 1, &stats, aux);
/* Virtual-function table wiring HFSC into the generic tc framework. */
3601 static const struct tc_ops tc_ops_hfsc = {
3602 "hfsc",                     /* linux_name */
3603 "linux-hfsc",               /* ovs_name */
3604 HFSC_N_QUEUES,              /* n_queues */
3605 hfsc_tc_install,            /* tc_install */
3606 hfsc_tc_load,               /* tc_load */
3607 hfsc_tc_destroy,            /* tc_destroy */
3608 hfsc_qdisc_get,             /* qdisc_get */
3609 hfsc_qdisc_set,             /* qdisc_set */
3610 hfsc_class_get,             /* class_get */
3611 hfsc_class_set,             /* class_set */
3612 hfsc_class_delete,          /* class_delete */
3613 hfsc_class_get_stats,       /* class_get_stats */
3614 hfsc_class_dump_stats       /* class_dump_stats */
3617 /* "linux-default" traffic control class.
3619 * This class represents the default, unnamed Linux qdisc.  It corresponds to
3620 * the "" (empty string) QoS type in the OVS database. */
/* default_install__(): points the netdev at a shared, immutable tc object;
 * no kernel state is touched.  The CONST_CAST is safe because no tc ops of
 * this class ever write through the pointer (per the comment below). */
3623 default_install__(struct netdev *netdev_)
3625 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3626 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_default);
3628 /* Nothing but a tc class implementation is allowed to write to a tc.  This
3629 * class never does that, so we can legitimately use a const tc object. */
3630 netdev->tc = CONST_CAST(struct tc *, &tc);
3634 default_tc_install(struct netdev *netdev,
3635 const struct smap *details OVS_UNUSED)
3637 default_install__(netdev);
3642 default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3644 default_install__(netdev);
/* tc_ops table for the default class: everything that doesn't apply is
 * NULL.  NOTE(review): the ovs_name/n_queues/tc_install/tc_load entry lines
 * were dropped by the extraction. */
3648 static const struct tc_ops tc_ops_default = {
3649 NULL,                       /* linux_name */
3654 NULL,                       /* tc_destroy */
3655 NULL,                       /* qdisc_get */
3656 NULL,                       /* qdisc_set */
3657 NULL,                       /* class_get */
3658 NULL,                       /* class_set */
3659 NULL,                       /* class_delete */
3660 NULL,                       /* class_get_stats */
3661 NULL                        /* class_dump_stats */
3664 /* "linux-other" traffic control class.
/* other_tc_load(): used when the kernel has a qdisc OVS doesn't understand;
 * installs a shared read-only placeholder tc, same pattern as
 * default_install__() above. */
3669 other_tc_load(struct netdev *netdev_, struct ofpbuf *nlmsg OVS_UNUSED)
3671 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3672 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_other);
3674 /* Nothing but a tc class implementation is allowed to write to a tc.  This
3675 * class never does that, so we can legitimately use a const tc object. */
3676 netdev->tc = CONST_CAST(struct tc *, &tc);
/* tc_ops table: read-only placeholder, loadable but not installable. */
3680 static const struct tc_ops tc_ops_other = {
3681 NULL,                       /* linux_name */
3682 "linux-other",              /* ovs_name */
3684 NULL,                       /* tc_install */
3686 NULL,                       /* tc_destroy */
3687 NULL,                       /* qdisc_get */
3688 NULL,                       /* qdisc_set */
3689 NULL,                       /* class_get */
3690 NULL,                       /* class_set */
3691 NULL,                       /* class_delete */
3692 NULL,                       /* class_get_stats */
3693 NULL                        /* class_dump_stats */
3696 /* Traffic control. */
3698 /* Number of kernel "tc" ticks per second. */
3699 static double ticks_per_s;
3701 /* Number of kernel "jiffies" per second.  This is used for the purpose of
3702 * computing buffer sizes.  Generally kernel qdiscs need to be able to buffer
3703 * one jiffy's worth of data.
3705 * There are two possibilities here:
3707 *    - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
3708 *      approximate range of 100 to 1024.  That means that we really need to
3709 *      make sure that the qdisc can buffer that much data.
3711 *    - 'buffer_hz' is an absurdly large number.  That means that the kernel
3712 *      has finely granular timers and there's no need to fudge additional room
3713 *      for buffers.  (There's no extra effort needed to implement that: the
3714 *      large 'buffer_hz' is used as a divisor, so practically any number will
3715 *      come out as 0 in the division.  Small integer results in the case of
3716 *      really high dividends won't have any real effect anyhow.)
3718 static unsigned int buffer_hz;
3720 /* Returns tc handle 'major':'minor'. */
/* The tc handle helpers below wrap the kernel TC_H_* macros; the "static
 * unsigned int" declarator lines were dropped by the extraction. */
3722 tc_make_handle(unsigned int major, unsigned int minor)
3724 return TC_H_MAKE(major << 16, minor);
3727 /* Returns the major number from 'handle'. */
3729 tc_get_major(unsigned int handle)
3731 return TC_H_MAJ(handle) >> 16;
3734 /* Returns the minor number from 'handle'. */
3736 tc_get_minor(unsigned int handle)
3738 return TC_H_MIN(handle);
/* tc_make_request(): initializes an rtnetlink request in '*request' with an
 * nlmsghdr of the given 'type'/'flags' plus a zeroed tcmsg bound to the
 * device's ifindex.  Returns the tcmsg for the caller to fill in handle and
 * parent, or (per the dropped get_ifindex error branch) presumably NULL on
 * failure -- note the NULL checks at call sites were also dropped. */
3741 static struct tcmsg *
3742 tc_make_request(const struct netdev *netdev, int type, unsigned int flags,
3743 struct ofpbuf *request)
3745 struct tcmsg *tcmsg;
3749 error = get_ifindex(netdev, &ifindex);
3754 ofpbuf_init(request, 512);
3755 nl_msg_put_nlmsghdr(request, sizeof *tcmsg, type, NLM_F_REQUEST | flags);
3756 tcmsg = ofpbuf_put_zeros(request, sizeof *tcmsg);
3757 tcmsg->tcm_family = AF_UNSPEC;
3758 tcmsg->tcm_ifindex = ifindex;
3759 /* Caller should fill in tcmsg->tcm_handle. */
3760 /* Caller should fill in tcmsg->tcm_parent. */
/* tc_transact(): sends 'request' on NETLINK_ROUTE (optionally capturing the
 * reply in '*replyp') and always releases the request buffer. */
3766 tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
3768 int error = nl_transact(NETLINK_ROUTE, request, replyp);
3769 ofpbuf_uninit(request);
3773 /* Adds or deletes a root ingress qdisc on 'netdev'.  We use this for
3774 * policing configuration.
3776 * This function is equivalent to running the following when 'add' is true:
3777 *     /sbin/tc qdisc add dev <devname> handle ffff: ingress
3779 * This function is equivalent to running the following when 'add' is false:
3780 *     /sbin/tc qdisc del dev <devname> handle ffff: ingress
3782 * The configuration and stats may be seen with the following command:
3783 *     /sbin/tc -s qdisc show dev <devname>
3785 * Returns 0 if successful, otherwise a positive errno value.
3788 tc_add_del_ingress_qdisc(struct netdev *netdev, bool add)
3790 struct ofpbuf request;
3791 struct tcmsg *tcmsg;
3793 int type = add ? RTM_NEWQDISC : RTM_DELQDISC;
3794 int flags = add ? NLM_F_EXCL | NLM_F_CREATE : 0;
3796 tcmsg = tc_make_request(netdev, type, flags, &request);
/* Ingress qdiscs live at the fixed handle ffff:0 under TC_H_INGRESS. */
3800 tcmsg->tcm_handle = tc_make_handle(0xffff, 0);
3801 tcmsg->tcm_parent = TC_H_INGRESS;
3802 nl_msg_put_string(&request, TCA_KIND, "ingress");
3803 nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0);
3805 error = tc_transact(&request, NULL);
3807 /* If we're deleting the qdisc, don't worry about some of the
3808 * error conditions. */
3809 if (!add && (error == ENOENT || error == EINVAL)) {
3818 /* Adds a policer to 'netdev' with a rate of 'kbits_rate' and a burst size
3821 * This function is equivalent to running:
3822 *     /sbin/tc filter add dev <devname> parent ffff: protocol all prio 49
3823 *              basic police rate <kbits_rate>kbit burst <kbits_burst>k
3826 * The configuration and stats may be seen with the following command:
3827 *     /sbin/tc -s filter show <devname> eth0 parent ffff:
3829 * Returns 0 if successful, otherwise a positive errno value.
/* Builds a TC_POLICE_SHOT (drop-on-exceed) policer on the ingress qdisc.
 * Rate is converted from kbit/s to bytes/s; burst from kbits to bytes via
 * tc_bytes_to_ticks.  NOTE(review): the 'mtu' assignment feeding
 * tc_police.mtu is on a dropped line. */
3832 tc_add_policer(struct netdev *netdev, int kbits_rate, int kbits_burst)
3834 struct tc_police tc_police;
3835 struct ofpbuf request;
3836 struct tcmsg *tcmsg;
3837 size_t basic_offset;
3838 size_t police_offset;
3842 memset(&tc_police, 0, sizeof tc_police);
3843 tc_police.action = TC_POLICE_SHOT;
3844 tc_police.mtu = mtu;
3845 tc_fill_rate(&tc_police.rate, (kbits_rate * 1000)/8, mtu);
3846 tc_police.burst = tc_bytes_to_ticks(tc_police.rate.rate,
3847 kbits_burst * 1024);
3849 tcmsg = tc_make_request(netdev, RTM_NEWTFILTER,
3850 NLM_F_EXCL | NLM_F_CREATE, &request);
/* Filter is attached under the ingress qdisc (parent ffff:0) at priority
 * 49, matching all protocols. */
3854 tcmsg->tcm_parent = tc_make_handle(0xffff, 0);
3855 tcmsg->tcm_info = tc_make_handle(49,
3856 (OVS_FORCE uint16_t) htons(ETH_P_ALL));
3858 nl_msg_put_string(&request, TCA_KIND, "basic");
3859 basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3860 police_offset = nl_msg_start_nested(&request, TCA_BASIC_POLICE);
3861 nl_msg_put_unspec(&request, TCA_POLICE_TBF, &tc_police, sizeof tc_police);
3862 tc_put_rtab(&request, TCA_POLICE_RATE, &tc_police.rate);
3863 nl_msg_end_nested(&request, police_offset);
3864 nl_msg_end_nested(&request, basic_offset);
3866 error = tc_transact(&request, NULL);
3877 /* The values in psched are not individually very meaningful, but they are
3878 * important. The tables below show some values seen in the wild.
3882 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
3883 * (Before that, there are hints that it was 1000000000.)
3885 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
3889 * -----------------------------------
3890 * [1] 000c8000 000f4240 000f4240 00000064
3891 * [2] 000003e8 00000400 000f4240 3b9aca00
3892 * [3] 000003e8 00000400 000f4240 3b9aca00
3893 * [4] 000003e8 00000400 000f4240 00000064
3894 * [5] 000003e8 00000040 000f4240 3b9aca00
3895 * [6] 000003e8 00000040 000f4240 000000f9
3897 * a b c d ticks_per_s buffer_hz
3898 * ------- --------- ---------- ------------- ----------- -------------
3899 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
3900 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3901 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3902 * [4] 1,000 1,024 1,000,000 100 976,562 100
3903 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
3904 * [6] 1,000 64 1,000,000 249 15,625,000 249
3906 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
3907 * [2] 2.6.26-1-686-bigmem from Debian lenny
3908 * [3] 2.6.26-2-sparc64 from Debian lenny
3909 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
3910 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
3911 * [6] 2.6.34 from kernel.org on KVM
    /* Parse /proc/net/psched at most once per process; later callers return
     * with the previously computed values. */
    static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
    static const char fn[] = "/proc/net/psched";
    unsigned int a, b, c, d;    /* The four hex words of /proc/net/psched. */

    if (!ovsthread_once_start(&once)) {
    stream = fopen(fn, "r");
        VLOG_WARN("%s: open failed: %s", fn, ovs_strerror(errno));
    /* The file holds exactly four hexadecimal words (see the table above). */
    if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
        VLOG_WARN("%s: read failed", fn);
    VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
        VLOG_WARN("%s: invalid scheduler parameters", fn);
    /* ticks_per_s = a * c / b matches every kernel observed in the table
     * above. */
    ticks_per_s = (double) a * c / b;
        VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
    VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);
    ovsthread_once_done(&once);
3957 /* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
3958 * rate of 'rate' bytes per second. */
3960 tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
3963 return (rate * ticks) / ticks_per_s;
3966 /* Returns the number of ticks that it would take to transmit 'size' bytes at a
3967 * rate of 'rate' bytes per second. */
3969 tc_bytes_to_ticks(unsigned int rate, unsigned int size)
3972 return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
/* Returns the number of bytes that need to be reserved for qdisc buffering at
 * a transmission rate of 'rate' bytes per second.
 *
 * That is, one jiffy's worth of traffic: 'buffer_hz' comes from
 * /proc/net/psched (see read_psched() above). */
tc_buffer_per_jiffy(unsigned int rate)
    return rate / buffer_hz;
/* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
 * e.g. "htb", into '*kind' (if it is nonnull).  If 'options' is nonnull,
 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
 * stores NULL into it if it is absent.
 *
 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
 * 'msg' and are only valid for its lifetime.
 *
 * Returns 0 if successful, otherwise a positive errno value. */
tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
               struct nlattr **options)
    /* TCA_KIND is mandatory; TCA_OPTIONS may legitimately be absent. */
    static const struct nl_policy tca_policy[] = {
        [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
        [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
    struct nlattr *ta[ARRAY_SIZE(tca_policy)];

    /* Attributes begin just past the netlink header and the tcmsg. */
    if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
                         tca_policy, ta, ARRAY_SIZE(ta))) {
        VLOG_WARN_RL(&rl, "failed to parse qdisc message");
    /* Both outputs alias 'msg'; nothing is copied. */
    *kind = nl_attr_get_string(ta[TCA_KIND]);
    *options = ta[TCA_OPTIONS];
/* Given Netlink 'msg' that describes a class, extracts the queue ID (e.g. the
 * minor number of its class ID) into '*queue_id', its TCA_OPTIONS attribute
 * into '*options', and its queue statistics into '*stats'.  Any of the output
 * arguments may be null.
 *
 * Returns 0 if successful, otherwise a positive errno value. */
tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
               struct nlattr **options, struct netdev_queue_stats *stats)
    /* Unlike qdiscs, both TCA_OPTIONS and TCA_STATS2 are required here. */
    static const struct nl_policy tca_policy[] = {
        [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
        [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
    struct nlattr *ta[ARRAY_SIZE(tca_policy)];

    if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
                         tca_policy, ta, ARRAY_SIZE(ta))) {
        VLOG_WARN_RL(&rl, "failed to parse class message");
        /* The class ID lives in the tcmsg header, not in an attribute. */
        struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
        *handlep = tc->tcm_handle;
        *options = ta[TCA_OPTIONS];
        const struct gnet_stats_queue *gsq;
        struct gnet_stats_basic gsb;

        static const struct nl_policy stats_policy[] = {
            [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
                                  .min_len = sizeof gsb },
            [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
                                  .min_len = sizeof *gsq },
        struct nlattr *sa[ARRAY_SIZE(stats_policy)];

        if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
                             sa, ARRAY_SIZE(sa))) {
            VLOG_WARN_RL(&rl, "failed to parse class stats");
        /* Alignment issues screw up the length of struct gnet_stats_basic on
         * some arch/bitsize combinations.  Newer versions of Linux have a
         * struct gnet_stats_basic_packed, but we can't depend on that.  The
         * easiest thing to do is just to make a copy. */
        memset(&gsb, 0, sizeof gsb);
        memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
               MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
        stats->tx_bytes = gsb.bytes;
        stats->tx_packets = gsb.packets;

        /* Queue stats supply the drop counter, reported as tx_errors. */
        gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
        stats->tx_errors = gsq->drops;
    memset(stats, 0, sizeof *stats);
/* Queries the kernel for class with identifier 'handle' and parent 'parent'
 * on 'netdev'.  On success, stores the kernel's reply in '*replyp' (the
 * caller eventually frees it -- see tc_query_qdisc() for the pattern).
 * Returns 0 if successful, otherwise a positive errno value. */
tc_query_class(const struct netdev *netdev,
               unsigned int handle, unsigned int parent,
               struct ofpbuf **replyp)
    struct ofpbuf request;
    struct tcmsg *tcmsg;

    /* NLM_F_ECHO asks the kernel to echo the class back in its reply. */
    tcmsg = tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO, &request);
    tcmsg->tcm_handle = handle;
    tcmsg->tcm_parent = parent;

    error = tc_transact(&request, replyp);
        VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
                     netdev_get_name(netdev),
                     tc_get_major(handle), tc_get_minor(handle),
                     tc_get_major(parent), tc_get_minor(parent),
                     ovs_strerror(error));
/* Equivalent to "tc class del dev <name> handle <handle>".
 *
 * Deletes traffic control class 'handle' from 'netdev'.  Failures are
 * logged (rate-limited). */
tc_delete_class(const struct netdev *netdev, unsigned int handle)
    struct ofpbuf request;
    struct tcmsg *tcmsg;

    tcmsg = tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
    tcmsg->tcm_handle = handle;
    tcmsg->tcm_parent = 0;

    error = tc_transact(&request, NULL);
        VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
                     netdev_get_name(netdev),
                     tc_get_major(handle), tc_get_minor(handle),
                     ovs_strerror(error));
/* Equivalent to "tc qdisc del dev <name> root".
 *
 * Removes the root qdisc from 'netdev_' and, on success, discards any cached
 * tc state for the device. */
tc_del_qdisc(struct netdev *netdev_)
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    struct ofpbuf request;
    struct tcmsg *tcmsg;

    /* Handle 1:0 / TC_H_ROOT names the root qdisc that we installed. */
    tcmsg = tc_make_request(netdev_, RTM_DELQDISC, 0, &request);
    tcmsg->tcm_handle = tc_make_handle(1, 0);
    tcmsg->tcm_parent = TC_H_ROOT;

    error = tc_transact(&request, NULL);
    if (error == EINVAL) {
        /* EINVAL probably means that the default qdisc was in use, in which
         * case we've accomplished our purpose. */
    /* Drop our cached view of the qdisc now that the kernel's is gone. */
    if (!error && netdev->tc) {
        if (netdev->tc->ops->tc_destroy) {
            netdev->tc->ops->tc_destroy(netdev->tc);
/* If 'netdev''s qdisc type and parameters are not yet known, queries the
 * kernel to determine what they are.  Returns 0 if successful, otherwise a
 * positive errno value. */
tc_query_qdisc(const struct netdev *netdev_)
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    struct ofpbuf request, *qdisc;
    const struct tc_ops *ops;
    struct tcmsg *tcmsg;

    /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
     * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
     * 2.6.35 without that fix backported to it.
     *
     * To avoid the OOPS, we must not make a request that would attempt to dump
     * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
     * few others.  There are a few ways that I can see to do this, but most of
     * them seem to be racy (and if you lose the race the kernel OOPSes).  The
     * technique chosen here is to assume that any non-default qdisc that we
     * create will have a class with handle 1:0.  The built-in qdiscs only have
     * a class with handle 0:0.
     *
     * We could check for Linux 2.6.35+ and use a more straightforward method
     * there. */
    tcmsg = tc_make_request(netdev_, RTM_GETQDISC, NLM_F_ECHO, &request);
    tcmsg->tcm_handle = tc_make_handle(1, 0);
    tcmsg->tcm_parent = 0;

    /* Figure out what tc class to instantiate. */
    error = tc_transact(&request, &qdisc);
        /* Reply received: map the qdisc kind onto a tc_ops, falling back to
         * "other" for kinds this module does not manage. */
        error = tc_parse_qdisc(qdisc, &kind, NULL);
            ops = &tc_ops_other;
            ops = tc_lookup_linux_name(kind);
                static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
                VLOG_INFO_RL(&rl2, "unknown qdisc \"%s\"", kind);
                ops = &tc_ops_other;
    } else if (error == ENOENT) {
        /* Either it's a built-in qdisc, or it's a qdisc set up by some
         * other entity that doesn't have a handle 1:0.  We will assume
         * that it's the system default qdisc. */
        ops = &tc_ops_default;
        /* Who knows?  Maybe the device got deleted. */
        VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
                     netdev_get_name(netdev_), ovs_strerror(error));
        ops = &tc_ops_other;

    /* Instantiate it.  tc_load must set netdev->tc exactly when it
     * succeeds; the assertion checks that invariant. */
    load_error = ops->tc_load(CONST_CAST(struct netdev *, netdev_), qdisc);
    ovs_assert((load_error == 0) == (netdev->tc != NULL));
    ofpbuf_delete(qdisc);

    return error ? error : load_error;
/* Linux traffic control uses tables with 256 entries ("rtab" tables) to
   approximate the time to transmit packets of various lengths.  For an MTU of
   256 or less, each entry is exact; for an MTU of 257 through 512, each entry
   represents two possible packet lengths; for a MTU of 513 through 1024, four
   possible lengths; and so on.

   Returns, for the specified 'mtu', the number of bits that packet lengths
   need to be shifted right to fit within such a 256-entry table. */
tc_calc_cell_log(unsigned int mtu)
        /* No MTU supplied: assume the Ethernet payload maximum. */
        mtu = ETH_PAYLOAD_MAX;
    /* Account for L2 framing: Ethernet header plus a VLAN tag. */
    mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;

    /* Smallest right-shift that brings 'mtu' below the 256-entry table. */
    for (cell_log = 0; mtu >= 256; cell_log++) {
/* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an
 * MTU of 'mtu' bytes.  'Bps' is 64-bit so callers can pass multi-gigabit
 * rates without overflow. */
tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
    memset(rate, 0, sizeof *rate);
    rate->cell_log = tc_calc_cell_log(mtu);
    /* rate->overhead = 0; */           /* New in 2.6.24, not yet in some */
    /* rate->cell_align = 0; */         /* distro headers. */
    /* Packets smaller than 'mpu' are billed as 'mpu' bytes when the rate
     * table is built (see tc_put_rtab() below). */
    rate->mpu = ETH_TOTAL_MIN;
/* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
 * attribute of the specified "type".
 *
 * See tc_calc_cell_log() above for a description of "rtab"s. */
tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
    rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
    for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
        /* Entry i covers packets up to (i + 1) << cell_log bytes... */
        unsigned packet_size = (i + 1) << rate->cell_log;
        /* ...but never bills fewer bytes than the minimum packet unit. */
        if (packet_size < rate->mpu) {
            packet_size = rate->mpu;
        /* Each entry is the transmit time, in ticks, for that size. */
        rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
/* Calculates the proper value of 'buffer' or 'cbuffer' in HTB options given a
 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
 * burst size of 'burst_bytes'.  (If no burst was requested, pass 0.) */
static unsigned int
tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
{
    /* Lower bound: one jiffy's worth of traffic plus one full MTU. */
    unsigned int low_water = tc_buffer_per_jiffy(Bps) + mtu;

    return tc_bytes_to_ticks(Bps, MAX(burst_bytes, low_water));
}
4334 /* Linux-only functions declared in netdev-linux.h */
/* Modifies the 'flag' bit in ethtool's flags field for 'netdev'.  If
 * 'enable' is true, the bit is set.  Otherwise, it is cleared.
 * 'flag_name' is used only for log messages. */
netdev_linux_ethtool_set_flag(struct netdev *netdev, uint32_t flag,
                              const char *flag_name, bool enable)
    const char *netdev_name = netdev_get_name(netdev);
    struct ethtool_value evalue;

    /* Read the device's current ethtool flags word. */
    COVERAGE_INC(netdev_get_ethtool);
    memset(&evalue, 0, sizeof evalue);
    error = netdev_linux_do_ethtool(netdev_name,
                                    (struct ethtool_cmd *)&evalue,
                                    ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");

    /* Set or clear 'flag' and write the word back. */
    COVERAGE_INC(netdev_set_ethtool);
    evalue.data = new_flags = (evalue.data & ~flag) | (enable ? flag : 0);
    error = netdev_linux_do_ethtool(netdev_name,
                                    (struct ethtool_cmd *)&evalue,
                                    ETHTOOL_SFLAGS, "ETHTOOL_SFLAGS");

    /* Re-read the flags to verify that the kernel actually accepted the
     * change. */
    COVERAGE_INC(netdev_get_ethtool);
    memset(&evalue, 0, sizeof evalue);
    error = netdev_linux_do_ethtool(netdev_name,
                                    (struct ethtool_cmd *)&evalue,
                                    ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");

    if (new_flags != evalue.data) {
        VLOG_WARN_RL(&rl, "attempt to %s ethtool %s flag on network "
                     "device %s failed", enable ? "enable" : "disable",
                     flag_name, netdev_name);
4384 /* Utility functions. */
/* Copies 'src' into 'dst', performing format conversion in the process.
 * This is a straight field-for-field copy from the kernel's struct
 * rtnl_link_stats into OVS's struct netdev_stats. */
netdev_stats_from_rtnl_link_stats(struct netdev_stats *dst,
                                  const struct rtnl_link_stats *src)
    /* Basic packet and byte counters. */
    dst->rx_packets = src->rx_packets;
    dst->tx_packets = src->tx_packets;
    dst->rx_bytes = src->rx_bytes;
    dst->tx_bytes = src->tx_bytes;
    /* Aggregate error and drop counters. */
    dst->rx_errors = src->rx_errors;
    dst->tx_errors = src->tx_errors;
    dst->rx_dropped = src->rx_dropped;
    dst->tx_dropped = src->tx_dropped;
    dst->multicast = src->multicast;
    dst->collisions = src->collisions;
    /* Detailed receive-side error breakdown. */
    dst->rx_length_errors = src->rx_length_errors;
    dst->rx_over_errors = src->rx_over_errors;
    dst->rx_crc_errors = src->rx_crc_errors;
    dst->rx_frame_errors = src->rx_frame_errors;
    dst->rx_fifo_errors = src->rx_fifo_errors;
    dst->rx_missed_errors = src->rx_missed_errors;
    /* Detailed transmit-side error breakdown. */
    dst->tx_aborted_errors = src->tx_aborted_errors;
    dst->tx_carrier_errors = src->tx_carrier_errors;
    dst->tx_fifo_errors = src->tx_fifo_errors;
    dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
    dst->tx_window_errors = src->tx_window_errors;
/* Retrieves 'netdev_''s statistics by sending an RTM_GETLINK request over
 * rtnetlink and converting the IFLA_STATS attribute of the reply into
 * '*stats'. */
get_stats_via_netlink(const struct netdev *netdev_, struct netdev_stats *stats)
    struct ofpbuf request;
    struct ofpbuf *reply;

    /* Build an RTM_GETLINK request that names the device by IFLA_IFNAME. */
    ofpbuf_init(&request, 0);
    nl_msg_put_nlmsghdr(&request,
                        sizeof(struct ifinfomsg) + NL_ATTR_SIZE(IFNAMSIZ),
                        RTM_GETLINK, NLM_F_REQUEST);
    ofpbuf_put_zeros(&request, sizeof(struct ifinfomsg));
    nl_msg_put_string(&request, IFLA_IFNAME, netdev_get_name(netdev_));
    error = nl_transact(NETLINK_ROUTE, &request, &reply);
    ofpbuf_uninit(&request);

    /* Skip the netlink and ifinfomsg headers, then find IFLA_STATS. */
    if (ofpbuf_try_pull(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg))) {
        const struct nlattr *a = nl_attr_find(reply, 0, IFLA_STATS);
        if (a && nl_attr_get_size(a) >= sizeof(struct rtnl_link_stats)) {
            netdev_stats_from_rtnl_link_stats(stats, nl_attr_get(a));
            VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
        VLOG_WARN_RL(&rl, "short RTM_GETLINK reply");

    ofpbuf_delete(reply);
/* Retrieves the current interface flags (IFF_*) of 'dev' into '*flags'
 * using the SIOCGIFFLAGS ioctl. */
get_flags(const struct netdev *dev, unsigned int *flags)
    error = af_inet_ifreq_ioctl(dev->name, &ifr, SIOCGIFFLAGS, "SIOCGIFFLAGS");
    *flags = ifr.ifr_flags;
/* Sets the interface flags of the device named 'name' to 'flags' using the
 * SIOCSIFFLAGS ioctl, returning af_inet_ifreq_ioctl()'s result directly. */
set_flags(const char *name, unsigned int flags)
    ifr.ifr_flags = flags;
    return af_inet_ifreq_ioctl(name, &ifr, SIOCSIFFLAGS, "SIOCSIFFLAGS");
/* Returns the interface index of 'netdev_name' on success.  On failure the
 * value is a negative errno (note the sign: get_ifindex() below negates it
 * back to a positive errno). */
do_get_ifindex(const char *netdev_name)
    ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
    COVERAGE_INC(netdev_get_ifindex);

    error = af_inet_ioctl(SIOCGIFINDEX, &ifr);
        VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
                     netdev_name, ovs_strerror(error));
    return ifr.ifr_ifindex;
/* Looks up (and caches) the ifindex of 'netdev_', storing it in '*ifindexp'.
 * Returns 0 on success, otherwise the positive errno recorded when the
 * lookup first failed.  Both the index and the error are cached, so the
 * kernel is queried at most once per device. */
get_ifindex(const struct netdev *netdev_, int *ifindexp)
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);

    if (!(netdev->cache_valid & VALID_IFINDEX)) {
        /* First lookup: do_get_ifindex() returns a negative errno on
         * failure, which we negate back to a positive one. */
        int ifindex = do_get_ifindex(netdev_get_name(netdev_));
            netdev->get_ifindex_error = -ifindex;
            netdev->ifindex = 0;
            netdev->get_ifindex_error = 0;
            netdev->ifindex = ifindex;
        netdev->cache_valid |= VALID_IFINDEX;

    *ifindexp = netdev->ifindex;
    return netdev->get_ifindex_error;
/* Retrieves the Ethernet (MAC) address of the device named 'netdev_name'
 * into 'ea' using the SIOCGIFHWADDR ioctl. */
get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN])
    memset(&ifr, 0, sizeof ifr);
    ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
    COVERAGE_INC(netdev_get_hwaddr);
    error = af_inet_ioctl(SIOCGIFHWADDR, &ifr);
        /* ENODEV probably means that a vif disappeared asynchronously and
         * hasn't been removed from the database yet, so reduce the log level
         * to INFO for that case. */
        VLOG(error == ENODEV ? VLL_INFO : VLL_ERR,
             "ioctl(SIOCGIFHWADDR) on %s device failed: %s",
             netdev_name, ovs_strerror(error));
    /* Only Ethernet (or unspecified) address families make sense to copy
     * out as a 6-byte MAC; warn about anything else. */
    hwaddr_family = ifr.ifr_hwaddr.sa_family;
    if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER) {
        VLOG_WARN("%s device has unknown hardware address family %d",
                  netdev_name, hwaddr_family);
    memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
/* Sets the Ethernet (MAC) address of the device named 'netdev_name' to
 * 'mac' using the SIOCSIFHWADDR ioctl. */
set_etheraddr(const char *netdev_name,
              const uint8_t mac[ETH_ADDR_LEN])
    memset(&ifr, 0, sizeof ifr);
    ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
    /* The kernel requires the address family to be spelled out. */
    ifr.ifr_hwaddr.sa_family = ARPHRD_ETHER;
    memcpy(ifr.ifr_hwaddr.sa_data, mac, ETH_ADDR_LEN);
    COVERAGE_INC(netdev_set_hwaddr);
    error = af_inet_ioctl(SIOCSIFHWADDR, &ifr);
        VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
                 netdev_name, ovs_strerror(error));
/* Issues ethtool command 'cmd' (named 'cmd_name' for log messages) on the
 * device named 'name', using 'ecmd' as the command's in/out argument
 * block.  EOPNOTSUPP is common (driver lacks the operation) and therefore
 * not logged. */
netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
                        int cmd, const char *cmd_name)
    memset(&ifr, 0, sizeof ifr);
    ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
    /* SIOCETHTOOL passes the ethtool command block via ifr_data. */
    ifr.ifr_data = (caddr_t) ecmd;

    error = af_inet_ioctl(SIOCETHTOOL, &ifr);
        if (error != EOPNOTSUPP) {
            VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
                         "failed: %s", cmd_name, name, ovs_strerror(error));
        /* The device doesn't support this operation.  That's pretty
         * common, so there's no point in logging anything. */
/* Reads an IPv4 address from 'netdev' into '*ip' using ioctl 'cmd' (named
 * 'cmd_name' for log messages). */
netdev_linux_get_ipv4(const struct netdev *netdev, struct in_addr *ip,
                      int cmd, const char *cmd_name)
    ifr.ifr_addr.sa_family = AF_INET;
    error = af_inet_ifreq_ioctl(netdev_get_name(netdev), &ifr, cmd, cmd_name);
        /* ifr_addr is a generic sockaddr; reinterpret it as sockaddr_in to
         * extract the IPv4 address (ALIGNED_CAST avoids alignment UB). */
        const struct sockaddr_in *sin = ALIGNED_CAST(struct sockaddr_in *,
        *ip = sin->sin_addr;
4606 /* Returns an AF_PACKET raw socket or a negative errno value. */
4608 af_packet_sock(void)
4610 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
4613 if (ovsthread_once_start(&once)) {
4614 sock = socket(AF_PACKET, SOCK_RAW, 0);
4616 int error = set_nonblocking(sock);
4623 VLOG_ERR("failed to create packet socket: %s",
4624 ovs_strerror(errno));
4626 ovsthread_once_done(&once);