2 * Copyright (c) 2009, 2010, 2011, 2012, 2013 Nicira, Inc.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
19 #include "netdev-linux.h"
23 #include <arpa/inet.h>
25 #include <linux/filter.h>
26 #include <linux/gen_stats.h>
27 #include <linux/if_ether.h>
28 #include <linux/if_tun.h>
29 #include <linux/types.h>
30 #include <linux/ethtool.h>
31 #include <linux/mii.h>
32 #include <linux/pkt_cls.h>
33 #include <linux/pkt_sched.h>
34 #include <linux/rtnetlink.h>
35 #include <linux/sockios.h>
36 #include <linux/version.h>
37 #include <sys/types.h>
38 #include <sys/ioctl.h>
39 #include <sys/socket.h>
40 #include <netpacket/packet.h>
42 #include <net/if_arp.h>
43 #include <net/if_packet.h>
44 #include <net/route.h>
45 #include <netinet/in.h>
51 #include "connectivity.h"
53 #include "dpif-linux.h"
54 #include "dynamic-string.h"
55 #include "fatal-signal.h"
58 #include "netdev-provider.h"
59 #include "netdev-vport.h"
60 #include "netlink-notifier.h"
61 #include "netlink-socket.h"
64 #include "openflow/openflow.h"
65 #include "ovs-atomic.h"
67 #include "poll-loop.h"
68 #include "rtnetlink-link.h"
71 #include "socket-util.h"
74 #include "unaligned.h"
77 VLOG_DEFINE_THIS_MODULE(netdev_linux);
79 COVERAGE_DEFINE(netdev_set_policing);
80 COVERAGE_DEFINE(netdev_arp_lookup);
81 COVERAGE_DEFINE(netdev_get_ifindex);
82 COVERAGE_DEFINE(netdev_get_hwaddr);
83 COVERAGE_DEFINE(netdev_set_hwaddr);
84 COVERAGE_DEFINE(netdev_get_ethtool);
85 COVERAGE_DEFINE(netdev_set_ethtool);
88 /* These were introduced in Linux 2.6.14, so they might be missing if we have
90 #ifndef ADVERTISED_Pause
91 #define ADVERTISED_Pause (1 << 13)
93 #ifndef ADVERTISED_Asym_Pause
94 #define ADVERTISED_Asym_Pause (1 << 14)
97 /* These were introduced in Linux 2.6.24, so they might be missing if we
98 * have old headers. */
99 #ifndef ETHTOOL_GFLAGS
100 #define ETHTOOL_GFLAGS 0x00000025 /* Get flags bitmap(ethtool_value) */
102 #ifndef ETHTOOL_SFLAGS
103 #define ETHTOOL_SFLAGS 0x00000026 /* Set flags bitmap(ethtool_value) */
106 /* This was introduced in Linux 2.6.25, so it might be missing if we have old
109 #define TC_RTAB_SIZE 1024
113 VALID_IFINDEX = 1 << 0,
114 VALID_ETHERADDR = 1 << 1,
118 VALID_POLICING = 1 << 5,
119 VALID_VPORT_STAT_ERROR = 1 << 6,
120 VALID_DRVINFO = 1 << 7,
121 VALID_FEATURES = 1 << 8,
124 /* Traffic control. */
126 /* An instance of a traffic control class. Always associated with a particular
129 * Each TC implementation subclasses this with whatever additional data it
132 const struct tc_ops *ops;
133 struct hmap queues; /* Contains "struct tc_queue"s.
134 * Read by generic TC layer.
135 * Written only by TC implementation. */
138 #define TC_INITIALIZER(TC, OPS) { OPS, HMAP_INITIALIZER(&(TC)->queues) }
140 /* One traffic control queue.
142 * Each TC implementation subclasses this with whatever additional data it
145 struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
146 unsigned int queue_id; /* OpenFlow queue ID. */
147 long long int created; /* Time queue was created, in msecs. */
150 /* A particular kind of traffic control. Each implementation generally maps to
151 * one particular Linux qdisc class.
153 * The functions below return 0 if successful or a positive errno value on
154 * failure, except where otherwise noted. All of them must be provided, except
155 * where otherwise noted. */
157 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
158 * This is null for tc_ops_default and tc_ops_other, for which there are no
159 * appropriate values. */
160 const char *linux_name;
162 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
163 const char *ovs_name;
165 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
166 * queues. The queues are numbered 0 through n_queues - 1. */
167 unsigned int n_queues;
169 /* Called to install this TC class on 'netdev'. The implementation should
170 * make the Netlink calls required to set up 'netdev' with the right qdisc
171 * and configure it according to 'details'. The implementation may assume
172 * that the current qdisc is the default; that is, there is no need for it
173 * to delete the current qdisc before installing itself.
175 * The contents of 'details' should be documented as valid for 'ovs_name'
176 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
177 * (which is built as ovs-vswitchd.conf.db(8)).
179 * This function must return 0 if and only if it sets 'netdev->tc' to an
180 * initialized 'struct tc'.
182 * (This function is null for tc_ops_other, which cannot be installed. For
183 * other TC classes it should always be nonnull.) */
184 int (*tc_install)(struct netdev *netdev, const struct smap *details);
186 /* Called when the netdev code determines (through a Netlink query) that
187 * this TC class's qdisc is installed on 'netdev', but we didn't install
188 * it ourselves and so don't know any of the details.
190 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
191 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
192 * implementation should parse the other attributes of 'nlmsg' as
193 * necessary to determine its configuration. If necessary it should also
194 * use Netlink queries to determine the configuration of queues on
197 * This function must return 0 if and only if it sets 'netdev->tc' to an
198 * initialized 'struct tc'. */
199 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
201 /* Destroys the data structures allocated by the implementation as part of
202 * 'tc'. (This includes destroying 'tc->queues' by calling
205 * The implementation should not need to perform any Netlink calls. If
206 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
207 * (But it may not be desirable.)
209 * This function may be null if 'tc' is trivial. */
210 void (*tc_destroy)(struct tc *tc);
212 /* Retrieves details of 'netdev->tc' configuration into 'details'.
214 * The implementation should not need to perform any Netlink calls, because
215 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
216 * cached the configuration.
218 * The contents of 'details' should be documented as valid for 'ovs_name'
219 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
220 * (which is built as ovs-vswitchd.conf.db(8)).
222 * This function may be null if 'tc' is not configurable.
224 int (*qdisc_get)(const struct netdev *netdev, struct smap *details);
226 /* Reconfigures 'netdev->tc' according to 'details', performing any
227 * required Netlink calls to complete the reconfiguration.
229 * The contents of 'details' should be documented as valid for 'ovs_name'
230 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
231 * (which is built as ovs-vswitchd.conf.db(8)).
233 * This function may be null if 'tc' is not configurable.
235 int (*qdisc_set)(struct netdev *, const struct smap *details);
237 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
238 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
240 * The contents of 'details' should be documented as valid for 'ovs_name'
241 * in the "other_config" column in the "Queue" table in
242 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
244 * The implementation should not need to perform any Netlink calls, because
245 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
246 * cached the queue configuration.
248 * This function may be null if 'tc' does not have queues ('n_queues' is
250 int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
251 struct smap *details);
253 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
254 * 'details', perfoming any required Netlink calls to complete the
255 * reconfiguration. The caller ensures that 'queue_id' is less than
258 * The contents of 'details' should be documented as valid for 'ovs_name'
259 * in the "other_config" column in the "Queue" table in
260 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
262 * This function may be null if 'tc' does not have queues or its queues are
263 * not configurable. */
264 int (*class_set)(struct netdev *, unsigned int queue_id,
265 const struct smap *details);
267 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
268 * tc_queue's within 'netdev->tc->queues'.
270 * This function may be null if 'tc' does not have queues or its queues
271 * cannot be deleted. */
272 int (*class_delete)(struct netdev *, struct tc_queue *queue);
274 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
275 * 'struct tc_queue's within 'netdev->tc->queues'.
277 * On success, initializes '*stats'.
279 * This function may be null if 'tc' does not have queues or if it cannot
280 * report queue statistics. */
281 int (*class_get_stats)(const struct netdev *netdev,
282 const struct tc_queue *queue,
283 struct netdev_queue_stats *stats);
285 /* Extracts queue stats from 'nlmsg', which is a response to a
286 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
288 * This function may be null if 'tc' does not have queues or if it cannot
289 * report queue statistics. */
290 int (*class_dump_stats)(const struct netdev *netdev,
291 const struct ofpbuf *nlmsg,
292 netdev_dump_queue_stats_cb *cb, void *aux);
296 tc_init(struct tc *tc, const struct tc_ops *ops)
299 hmap_init(&tc->queues);
303 tc_destroy(struct tc *tc)
305 hmap_destroy(&tc->queues);
308 static const struct tc_ops tc_ops_htb;
309 static const struct tc_ops tc_ops_hfsc;
310 static const struct tc_ops tc_ops_default;
311 static const struct tc_ops tc_ops_other;
313 static const struct tc_ops *const tcs[] = {
314 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
315 &tc_ops_hfsc, /* Hierarchical fair service curve. */
316 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
317 &tc_ops_other, /* Some other qdisc. */
321 static unsigned int tc_make_handle(unsigned int major, unsigned int minor);
322 static unsigned int tc_get_major(unsigned int handle);
323 static unsigned int tc_get_minor(unsigned int handle);
325 static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
326 static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
327 static unsigned int tc_buffer_per_jiffy(unsigned int rate);
329 static struct tcmsg *tc_make_request(const struct netdev *, int type,
330 unsigned int flags, struct ofpbuf *);
331 static int tc_transact(struct ofpbuf *request, struct ofpbuf **replyp);
332 static int tc_add_del_ingress_qdisc(struct netdev *netdev, bool add);
333 static int tc_add_policer(struct netdev *netdev, int kbits_rate,
336 static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
337 struct nlattr **options);
338 static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
339 struct nlattr **options,
340 struct netdev_queue_stats *);
341 static int tc_query_class(const struct netdev *,
342 unsigned int handle, unsigned int parent,
343 struct ofpbuf **replyp);
344 static int tc_delete_class(const struct netdev *, unsigned int handle);
346 static int tc_del_qdisc(struct netdev *netdev);
347 static int tc_query_qdisc(const struct netdev *netdev);
349 static int tc_calc_cell_log(unsigned int mtu);
350 static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
351 static void tc_put_rtab(struct ofpbuf *, uint16_t type,
352 const struct tc_ratespec *rate);
353 static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
355 struct netdev_linux {
358 /* Protects all members below. */
359 struct ovs_mutex mutex;
361 unsigned int cache_valid;
363 bool miimon; /* Link status of last poll. */
364 long long int miimon_interval; /* Miimon Poll rate. Disabled if <= 0. */
365 struct timer miimon_timer;
367 /* The following are figured out "on demand" only. They are only valid
368 * when the corresponding VALID_* bit in 'cache_valid' is set. */
370 uint8_t etheraddr[ETH_ADDR_LEN];
371 struct in_addr address, netmask;
374 unsigned int ifi_flags;
375 long long int carrier_resets;
376 uint32_t kbits_rate; /* Policing data. */
377 uint32_t kbits_burst;
378 int vport_stats_error; /* Cached error code from vport_get_stats().
379 0 or an errno value. */
380 int netdev_mtu_error; /* Cached error code from SIOCGIFMTU or SIOCSIFMTU. */
381 int ether_addr_error; /* Cached error code from set/get etheraddr. */
382 int netdev_policing_error; /* Cached error code from set policing. */
383 int get_features_error; /* Cached error code from ETHTOOL_GSET. */
384 int get_ifindex_error; /* Cached error code from SIOCGIFINDEX. */
386 enum netdev_features current; /* Cached from ETHTOOL_GSET. */
387 enum netdev_features advertised; /* Cached from ETHTOOL_GSET. */
388 enum netdev_features supported; /* Cached from ETHTOOL_GSET. */
390 struct ethtool_drvinfo drvinfo; /* Cached from ETHTOOL_GDRVINFO. */
393 /* For devices of class netdev_tap_class only. */
397 struct netdev_rx_linux {
403 /* This is set pretty low because we probably won't learn anything from the
404 * additional log messages. */
405 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
407 /* Polling miimon status for all ports causes performance degradation when
408 * handling a large number of ports. If there are no devices using miimon, then
409 * we skip netdev_linux_miimon_run() and netdev_linux_miimon_wait(). */
410 static atomic_int miimon_cnt = ATOMIC_VAR_INIT(0);
412 static void netdev_linux_run(void);
414 static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
415 int cmd, const char *cmd_name);
416 static int netdev_linux_get_ipv4(const struct netdev *, struct in_addr *,
417 int cmd, const char *cmd_name);
418 static int get_flags(const struct netdev *, unsigned int *flags);
419 static int set_flags(const char *, unsigned int flags);
420 static int update_flags(struct netdev_linux *netdev, enum netdev_flags off,
421 enum netdev_flags on, enum netdev_flags *old_flagsp)
422 OVS_REQUIRES(netdev->mutex);
423 static int do_get_ifindex(const char *netdev_name);
424 static int get_ifindex(const struct netdev *, int *ifindexp);
425 static int do_set_addr(struct netdev *netdev,
426 int ioctl_nr, const char *ioctl_name,
427 struct in_addr addr);
428 static int get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN]);
429 static int set_etheraddr(const char *netdev_name, const uint8_t[ETH_ADDR_LEN]);
430 static int get_stats_via_netlink(const struct netdev *, struct netdev_stats *);
431 static int af_packet_sock(void);
432 static bool netdev_linux_miimon_enabled(void);
433 static void netdev_linux_miimon_run(void);
434 static void netdev_linux_miimon_wait(void);
437 is_netdev_linux_class(const struct netdev_class *netdev_class)
439 return netdev_class->run == netdev_linux_run;
443 is_tap_netdev(const struct netdev *netdev)
445 return netdev_get_class(netdev) == &netdev_tap_class;
448 static struct netdev_linux *
449 netdev_linux_cast(const struct netdev *netdev)
451 ovs_assert(is_netdev_linux_class(netdev_get_class(netdev)));
453 return CONTAINER_OF(netdev, struct netdev_linux, up);
456 static struct netdev_rx_linux *
457 netdev_rx_linux_cast(const struct netdev_rx *rx)
459 ovs_assert(is_netdev_linux_class(netdev_get_class(rx->netdev)));
460 return CONTAINER_OF(rx, struct netdev_rx_linux, up);
463 static void netdev_linux_update(struct netdev_linux *netdev,
464 const struct rtnetlink_link_change *)
465 OVS_REQUIRES(netdev->mutex);
466 static void netdev_linux_changed(struct netdev_linux *netdev,
467 unsigned int ifi_flags, unsigned int mask)
468 OVS_REQUIRES(netdev->mutex);
470 /* Returns a NETLINK_ROUTE socket listening for RTNLGRP_LINK changes, or NULL
471 * if no such socket could be created. */
472 static struct nl_sock *
473 netdev_linux_notify_sock(void)
/* The socket is created at most once per process; 'once' guards the lazy
 * initialization across threads. NOTE(review): interior lines are elided in
 * this excerpt (e.g. the declaration of 'error' and the error-path control
 * flow) — confirm against the full source. */
475 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
476 static struct nl_sock *sock;
478 if (ovsthread_once_start(&once)) {
481 error = nl_sock_create(NETLINK_ROUTE, &sock);
/* Subscribe to kernel link-state change notifications; if joining the
 * multicast group fails, the socket is torn down (presumably 'sock' is
 * reset to NULL on an elided line so callers see failure — verify). */
483 error = nl_sock_join_mcgroup(sock, RTNLGRP_LINK);
485 nl_sock_destroy(sock);
489 ovsthread_once_done(&once);
496 netdev_linux_miimon_enabled(void)
500 atomic_read(&miimon_cnt, &miimon);
505 netdev_linux_run(void)
507 struct nl_sock *sock;
510 if (netdev_linux_miimon_enabled()) {
511 netdev_linux_miimon_run();
514 sock = netdev_linux_notify_sock();
520 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
521 uint64_t buf_stub[4096 / 8];
524 ofpbuf_use_stub(&buf, buf_stub, sizeof buf_stub);
525 error = nl_sock_recv(sock, &buf, false);
527 struct rtnetlink_link_change change;
529 if (rtnetlink_link_parse(&buf, &change)) {
530 struct netdev *netdev_ = netdev_from_name(change.ifname);
531 if (netdev_ && is_netdev_linux_class(netdev_->netdev_class)) {
532 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
534 ovs_mutex_lock(&netdev->mutex);
535 netdev_linux_update(netdev, &change);
536 ovs_mutex_unlock(&netdev->mutex);
538 netdev_close(netdev_);
540 } else if (error == ENOBUFS) {
541 struct shash device_shash;
542 struct shash_node *node;
546 shash_init(&device_shash);
547 netdev_get_devices(&netdev_linux_class, &device_shash);
548 SHASH_FOR_EACH (node, &device_shash) {
549 struct netdev *netdev_ = node->data;
550 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
553 ovs_mutex_lock(&netdev->mutex);
554 get_flags(netdev_, &flags);
555 netdev_linux_changed(netdev, flags, 0);
556 ovs_mutex_unlock(&netdev->mutex);
558 netdev_close(netdev_);
560 shash_destroy(&device_shash);
561 } else if (error != EAGAIN) {
562 VLOG_WARN_RL(&rl, "error reading or parsing netlink (%s)",
563 ovs_strerror(error));
570 netdev_linux_wait(void)
572 struct nl_sock *sock;
574 if (netdev_linux_miimon_enabled()) {
575 netdev_linux_miimon_wait();
577 sock = netdev_linux_notify_sock();
579 nl_sock_wait(sock, POLLIN);
/* Records a state change on 'dev': updates its cached interface flags to
 * 'ifi_flags', invalidates cached fields according to 'mask', and wakes
 * anyone waiting on the global connectivity sequence.
 *
 * Caller must hold dev->mutex (enforced by OVS_REQUIRES below). */
584 netdev_linux_changed(struct netdev_linux *dev,
585 unsigned int ifi_flags, unsigned int mask)
586 OVS_REQUIRES(dev->mutex)
588 seq_change(connectivity_seq_get());
/* A toggle of IFF_RUNNING between the old and new flags means the carrier
 * transitioned, so count a carrier reset. */
590 if ((dev->ifi_flags ^ ifi_flags) & IFF_RUNNING) {
591 dev->carrier_resets++;
593 dev->ifi_flags = ifi_flags;
/* Keep only the VALID_* bits permitted by 'mask'; cleared fields will be
 * re-queried on demand. */
595 dev->cache_valid &= mask;
/* Applies an rtnetlink link-change message 'change' to 'dev', refreshing the
 * cached MTU, Ethernet address, and ifindex directly from the message so they
 * need not be re-queried from the kernel.
 *
 * Caller must hold dev->mutex (enforced by OVS_REQUIRES below). */
599 netdev_linux_update(struct netdev_linux *dev,
600 const struct rtnetlink_link_change *change)
601 OVS_REQUIRES(dev->mutex)
603 if (change->nlmsg_type == RTM_NEWLINK) {
/* Keep VALID_DRVINFO (driver info does not change on RTM_NEWLINK); every
 * other cached field is invalidated, then selectively re-validated below. */
605 netdev_linux_changed(dev, change->ifi_flags, VALID_DRVINFO);
607 /* Update netdev from rtnl-change msg. */
609 dev->mtu = change->mtu;
610 dev->cache_valid |= VALID_MTU;
611 dev->netdev_mtu_error = 0;
/* An all-zeros address in the message means "no address reported"; only a
 * nonzero address replaces the cached one. */
614 if (!eth_addr_is_zero(change->addr)) {
615 memcpy(dev->etheraddr, change->addr, ETH_ADDR_LEN);
616 dev->cache_valid |= VALID_ETHERADDR;
617 dev->ether_addr_error = 0;
620 dev->ifindex = change->ifi_index;
621 dev->cache_valid |= VALID_IFINDEX;
622 dev->get_ifindex_error = 0;
/* Not RTM_NEWLINK (presumably RTM_DELLINK — confirm against the full
 * source): invalidate the entire cache. */
625 netdev_linux_changed(dev, change->ifi_flags, 0);
629 static struct netdev *
630 netdev_linux_alloc(void)
632 struct netdev_linux *netdev = xzalloc(sizeof *netdev);
637 netdev_linux_common_construct(struct netdev_linux *netdev)
639 ovs_mutex_init(&netdev->mutex);
642 /* Creates system and internal devices. */
644 netdev_linux_construct(struct netdev *netdev_)
646 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
649 netdev_linux_common_construct(netdev);
651 error = get_flags(&netdev->up, &netdev->ifi_flags);
652 if (error == ENODEV) {
653 if (netdev->up.netdev_class != &netdev_internal_class) {
654 /* The device does not exist, so don't allow it to be opened. */
657 /* "Internal" netdevs have to be created as netdev objects before
658 * they exist in the kernel, because creating them in the kernel
659 * happens by passing a netdev object to dpif_port_add().
660 * Therefore, ignore the error. */
667 /* For most types of netdevs we open the device for each call of
668 * netdev_open(). However, this is not the case with tap devices,
669 * since it is only possible to open the device once. In this
670 * situation we share a single file descriptor, and consequently
671 * buffers, across all readers. Therefore once data is read it will
672 * be unavailable to other reads for tap devices. */
674 netdev_linux_construct_tap(struct netdev *netdev_)
676 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
677 static const char tap_dev[] = "/dev/net/tun";
678 const char *name = netdev_->name;
682 netdev_linux_common_construct(netdev);
684 /* Open tap device. */
685 netdev->tap_fd = open(tap_dev, O_RDWR);
686 if (netdev->tap_fd < 0) {
688 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, ovs_strerror(error));
692 /* Create tap device. */
693 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
694 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
695 if (ioctl(netdev->tap_fd, TUNSETIFF, &ifr) == -1) {
696 VLOG_WARN("%s: creating tap device failed: %s", name,
697 ovs_strerror(errno));
702 /* Make non-blocking. */
703 error = set_nonblocking(netdev->tap_fd);
711 close(netdev->tap_fd);
716 netdev_linux_destruct(struct netdev *netdev_)
718 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
720 if (netdev->tc && netdev->tc->ops->tc_destroy) {
721 netdev->tc->ops->tc_destroy(netdev->tc);
724 if (netdev_get_class(netdev_) == &netdev_tap_class
725 && netdev->tap_fd >= 0)
727 close(netdev->tap_fd);
730 if (netdev->miimon_interval > 0) {
732 atomic_sub(&miimon_cnt, 1, &junk);
735 ovs_mutex_destroy(&netdev->mutex);
739 netdev_linux_dealloc(struct netdev *netdev_)
741 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
745 static struct netdev_rx *
746 netdev_linux_rx_alloc(void)
748 struct netdev_rx_linux *rx = xzalloc(sizeof *rx);
753 netdev_linux_rx_construct(struct netdev_rx *rx_)
755 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
756 struct netdev *netdev_ = rx->up.netdev;
757 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
760 ovs_mutex_lock(&netdev->mutex);
761 rx->is_tap = is_tap_netdev(netdev_);
763 rx->fd = netdev->tap_fd;
765 struct sockaddr_ll sll;
767 /* Result of tcpdump -dd inbound */
768 static const struct sock_filter filt[] = {
769 { 0x28, 0, 0, 0xfffff004 }, /* ldh [0] */
770 { 0x15, 0, 1, 0x00000004 }, /* jeq #4 jt 2 jf 3 */
771 { 0x6, 0, 0, 0x00000000 }, /* ret #0 */
772 { 0x6, 0, 0, 0x0000ffff } /* ret #65535 */
774 static const struct sock_fprog fprog = {
775 ARRAY_SIZE(filt), (struct sock_filter *) filt
778 /* Create file descriptor. */
779 rx->fd = socket(PF_PACKET, SOCK_RAW, 0);
782 VLOG_ERR("failed to create raw socket (%s)", ovs_strerror(error));
786 /* Set non-blocking mode. */
787 error = set_nonblocking(rx->fd);
792 /* Get ethernet device index. */
793 error = get_ifindex(&netdev->up, &ifindex);
798 /* Bind to specific ethernet device. */
799 memset(&sll, 0, sizeof sll);
800 sll.sll_family = AF_PACKET;
801 sll.sll_ifindex = ifindex;
802 sll.sll_protocol = (OVS_FORCE unsigned short int) htons(ETH_P_ALL);
803 if (bind(rx->fd, (struct sockaddr *) &sll, sizeof sll) < 0) {
805 VLOG_ERR("%s: failed to bind raw socket (%s)",
806 netdev_get_name(netdev_), ovs_strerror(error));
810 /* Filter for only inbound packets. */
811 error = setsockopt(rx->fd, SOL_SOCKET, SO_ATTACH_FILTER, &fprog,
815 VLOG_ERR("%s: failed to attach filter (%s)",
816 netdev_get_name(netdev_), ovs_strerror(error));
820 ovs_mutex_unlock(&netdev->mutex);
828 ovs_mutex_unlock(&netdev->mutex);
833 netdev_linux_rx_destruct(struct netdev_rx *rx_)
835 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
843 netdev_linux_rx_dealloc(struct netdev_rx *rx_)
845 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
851 netdev_linux_rx_recv(struct netdev_rx *rx_, struct ofpbuf *buffer)
853 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
855 size_t size = ofpbuf_tailroom(buffer);
859 ? read(rx->fd, buffer->data, size)
860 : recv(rx->fd, buffer->data, size, MSG_TRUNC));
861 } while (retval < 0 && errno == EINTR);
864 if (errno != EAGAIN) {
865 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
866 ovs_strerror(errno), netdev_rx_get_name(rx_));
869 } else if (retval > size) {
872 buffer->size += retval;
878 netdev_linux_rx_wait(struct netdev_rx *rx_)
880 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
881 poll_fd_wait(rx->fd, POLLIN);
885 netdev_linux_rx_drain(struct netdev_rx *rx_)
887 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
890 int error = af_inet_ifreq_ioctl(netdev_rx_get_name(rx_), &ifr,
891 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
895 drain_fd(rx->fd, ifr.ifr_qlen);
898 return drain_rcvbuf(rx->fd);
902 /* Sends 'buffer' on 'netdev'. Returns 0 if successful, otherwise a positive
903 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
904 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
905 * the packet is too big or too small to transmit on the device.
907 * The caller retains ownership of 'buffer' in all cases.
909 * The kernel maintains a packet transmission queue, so the caller is not
910 * expected to do additional queuing of packets. */
912 netdev_linux_send(struct netdev *netdev_, const void *data, size_t size)
917 if (!is_tap_netdev(netdev_)) {
918 /* Use our AF_PACKET socket to send to this device. */
919 struct sockaddr_ll sll;
925 sock = af_packet_sock();
930 ifindex = netdev_get_ifindex(netdev_);
935 /* We don't bother setting most fields in sockaddr_ll because the
936 * kernel ignores them for SOCK_RAW. */
937 memset(&sll, 0, sizeof sll);
938 sll.sll_family = AF_PACKET;
939 sll.sll_ifindex = ifindex;
941 iov.iov_base = CONST_CAST(void *, data);
945 msg.msg_namelen = sizeof sll;
948 msg.msg_control = NULL;
949 msg.msg_controllen = 0;
952 retval = sendmsg(sock, &msg, 0);
954 /* Use the tap fd to send to this device. This is essential for
955 * tap devices, because packets sent to a tap device with an
956 * AF_PACKET socket will loop back to be *received* again on the
957 * tap device. This doesn't occur on other interface types
958 * because we attach a socket filter to the rx socket. */
959 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
961 retval = write(netdev->tap_fd, data, size);
965 /* The Linux AF_PACKET implementation never blocks waiting for room
966 * for packets, instead returning ENOBUFS. Translate this into
967 * EAGAIN for the caller. */
968 if (errno == ENOBUFS) {
970 } else if (errno == EINTR) {
972 } else if (errno != EAGAIN) {
973 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
974 netdev_get_name(netdev_), ovs_strerror(errno));
977 } else if (retval != size) {
978 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%"PRIuSIZE"d bytes of "
979 "%"PRIuSIZE") on %s", retval, size, netdev_get_name(netdev_));
987 /* Registers with the poll loop to wake up from the next call to poll_block()
988 * when the packet transmission queue has sufficient room to transmit a packet
989 * with netdev_send().
991 * The kernel maintains a packet transmission queue, so the client is not
992 * expected to do additional queuing of packets. Thus, this function is
993 * unlikely to ever be used. It is included for completeness. */
995 netdev_linux_send_wait(struct netdev *netdev)
997 if (is_tap_netdev(netdev)) {
998 /* TAP device always accepts packets.*/
999 poll_immediate_wake();
1003 /* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
1004 * otherwise a positive errno value. */
1006 netdev_linux_set_etheraddr(struct netdev *netdev_,
1007 const uint8_t mac[ETH_ADDR_LEN])
1009 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1010 enum netdev_flags old_flags = 0;
1013 ovs_mutex_lock(&netdev->mutex);
1015 if (netdev->cache_valid & VALID_ETHERADDR) {
1016 error = netdev->ether_addr_error;
1017 if (error || eth_addr_equals(netdev->etheraddr, mac)) {
1020 netdev->cache_valid &= ~VALID_ETHERADDR;
1023 /* Tap devices must be brought down before setting the address. */
1024 if (is_tap_netdev(netdev_)) {
1025 update_flags(netdev, NETDEV_UP, 0, &old_flags);
1027 error = set_etheraddr(netdev_get_name(netdev_), mac);
1028 if (!error || error == ENODEV) {
1029 netdev->ether_addr_error = error;
1030 netdev->cache_valid |= VALID_ETHERADDR;
1032 memcpy(netdev->etheraddr, mac, ETH_ADDR_LEN);
1036 if (is_tap_netdev(netdev_) && old_flags & NETDEV_UP) {
1037 update_flags(netdev, 0, NETDEV_UP, &old_flags);
1041 ovs_mutex_unlock(&netdev->mutex);
1045 /* Copies 'netdev''s MAC address to 'mac' which is passed as param. */
1047 netdev_linux_get_etheraddr(const struct netdev *netdev_,
1048 uint8_t mac[ETH_ADDR_LEN])
1050 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1053 ovs_mutex_lock(&netdev->mutex);
1054 if (!(netdev->cache_valid & VALID_ETHERADDR)) {
1055 netdev->ether_addr_error = get_etheraddr(netdev_get_name(netdev_),
1057 netdev->cache_valid |= VALID_ETHERADDR;
1060 error = netdev->ether_addr_error;
1062 memcpy(mac, netdev->etheraddr, ETH_ADDR_LEN);
1064 ovs_mutex_unlock(&netdev->mutex);
/* Retrieves 'netdev''s MTU into '*mtup', using the cached value when
 * VALID_MTU is set and otherwise querying the kernel via SIOCGIFMTU and
 * caching both the result and any error code.  Returns 0 on success or the
 * cached/new errno value on failure.
 *
 * Caller must hold netdev->mutex (locking is done by the public wrapper,
 * netdev_linux_get_mtu()). */
1070 netdev_linux_get_mtu__(struct netdev_linux *netdev, int *mtup)
1074 if (!(netdev->cache_valid & VALID_MTU)) {
/* Cache the ioctl's error code too, so repeated failures don't re-issue
 * the ioctl on every call. */
1077 netdev->netdev_mtu_error = af_inet_ifreq_ioctl(
1078 netdev_get_name(&netdev->up), &ifr, SIOCGIFMTU, "SIOCGIFMTU");
1079 netdev->mtu = ifr.ifr_mtu;
1080 netdev->cache_valid |= VALID_MTU;
1083 error = netdev->netdev_mtu_error;
1085 *mtup = netdev->mtu;
1091 /* Returns the maximum size of transmitted (and received) packets on 'netdev',
1092 * in bytes, not including the hardware header; thus, this is typically 1500
1093 * bytes for Ethernet devices. */
1095 netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
1097 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
/* Thread-safe wrapper: take the device mutex and delegate to the internal
 * helper, which consults the cache or the SIOCGIFMTU ioctl. */
1100 ovs_mutex_lock(&netdev->mutex);
1101 error = netdev_linux_get_mtu__(netdev, mtup);
1102 ovs_mutex_unlock(&netdev->mutex);
1107 /* Sets the maximum size of transmitted (MTU) for given device using linux
1108 * networking ioctl interface.
1111 netdev_linux_set_mtu(const struct netdev *netdev_, int mtu)
1113 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1117 ovs_mutex_lock(&netdev->mutex);
1118 if (netdev->cache_valid & VALID_MTU) {
1119 error = netdev->netdev_mtu_error;
1120 if (error || netdev->mtu == mtu) {
1123 netdev->cache_valid &= ~VALID_MTU;
1126 error = af_inet_ifreq_ioctl(netdev_get_name(netdev_), &ifr,
1127 SIOCSIFMTU, "SIOCSIFMTU");
1128 if (!error || error == ENODEV) {
1129 netdev->netdev_mtu_error = error;
1130 netdev->mtu = ifr.ifr_mtu;
1131 netdev->cache_valid |= VALID_MTU;
1134 ovs_mutex_unlock(&netdev->mutex);
1138 /* Returns the ifindex of 'netdev', if successful, as a positive number.
1139 * On failure, returns a negative errno value. */
1141 netdev_linux_get_ifindex(const struct netdev *netdev_)
1143 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1146 ovs_mutex_lock(&netdev->mutex);
1147 error = get_ifindex(netdev_, &ifindex);
1148 ovs_mutex_unlock(&netdev->mutex);
1150 return error ? -error : ifindex;
/* Sets '*carrier' to 'netdev''s link status: the last miimon poll result when
 * miimon polling is enabled (miimon_interval > 0), otherwise the IFF_RUNNING
 * bit of the cached interface flags. */
1154 netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
1156 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1158 ovs_mutex_lock(&netdev->mutex);
1159 if (netdev->miimon_interval > 0) {
1160 *carrier = netdev->miimon;
1162 *carrier = (netdev->ifi_flags & IFF_RUNNING) != 0;
1164 ovs_mutex_unlock(&netdev->mutex);
/* Returns the number of carrier resets observed on 'netdev_', i.e. how many
 * times IFF_RUNNING has toggled (counted in netdev_linux_changed()). */
1169 static long long int
1170 netdev_linux_get_carrier_resets(const struct netdev *netdev_)
1172 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1173 long long int carrier_resets;
/* Copy the counter under the mutex so the read is consistent. */
1175 ovs_mutex_lock(&netdev->mutex);
1176 carrier_resets = netdev->carrier_resets;
1177 ovs_mutex_unlock(&netdev->mutex);
1179 return carrier_resets;
1183 netdev_linux_do_miimon(const char *name, int cmd, const char *cmd_name,
1184 struct mii_ioctl_data *data)
1189 memset(&ifr, 0, sizeof ifr);
1190 memcpy(&ifr.ifr_data, data, sizeof *data);
1191 error = af_inet_ifreq_ioctl(name, &ifr, cmd, cmd_name);
1192 memcpy(data, &ifr.ifr_data, sizeof *data);
1198 netdev_linux_get_miimon(const char *name, bool *miimon)
1200 struct mii_ioctl_data data;
1205 memset(&data, 0, sizeof data);
1206 error = netdev_linux_do_miimon(name, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
1208 /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
1209 data.reg_num = MII_BMSR;
1210 error = netdev_linux_do_miimon(name, SIOCGMIIREG, "SIOCGMIIREG",
1214 *miimon = !!(data.val_out & BMSR_LSTATUS);
1216 VLOG_WARN_RL(&rl, "%s: failed to query MII", name);
1219 struct ethtool_cmd ecmd;
1221 VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
1224 COVERAGE_INC(netdev_get_ethtool);
1225 memset(&ecmd, 0, sizeof ecmd);
1226 error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
1229 struct ethtool_value eval;
1231 memcpy(&eval, &ecmd, sizeof eval);
1232 *miimon = !!eval.data;
1234 VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
1242 netdev_linux_set_miimon_interval(struct netdev *netdev_,
1243 long long int interval)
1245 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1247 ovs_mutex_lock(&netdev->mutex);
1248 interval = interval > 0 ? MAX(interval, 100) : 0;
1249 if (netdev->miimon_interval != interval) {
1252 if (interval && !netdev->miimon_interval) {
1253 atomic_add(&miimon_cnt, 1, &junk);
1254 } else if (!interval && netdev->miimon_interval) {
1255 atomic_sub(&miimon_cnt, 1, &junk);
1258 netdev->miimon_interval = interval;
1259 timer_set_expired(&netdev->miimon_timer);
1261 ovs_mutex_unlock(&netdev->mutex);
1267 netdev_linux_miimon_run(void)
1269 struct shash device_shash;
1270 struct shash_node *node;
1272 shash_init(&device_shash);
1273 netdev_get_devices(&netdev_linux_class, &device_shash);
1274 SHASH_FOR_EACH (node, &device_shash) {
1275 struct netdev *netdev = node->data;
1276 struct netdev_linux *dev = netdev_linux_cast(netdev);
1279 ovs_mutex_lock(&dev->mutex);
1280 if (dev->miimon_interval > 0 && timer_expired(&dev->miimon_timer)) {
1281 netdev_linux_get_miimon(dev->up.name, &miimon);
1282 if (miimon != dev->miimon) {
1283 dev->miimon = miimon;
1284 netdev_linux_changed(dev, dev->ifi_flags, 0);
1287 timer_set_duration(&dev->miimon_timer, dev->miimon_interval);
1289 ovs_mutex_unlock(&dev->mutex);
1290 netdev_close(netdev);
1293 shash_destroy(&device_shash);
1297 netdev_linux_miimon_wait(void)
1299 struct shash device_shash;
1300 struct shash_node *node;
1302 shash_init(&device_shash);
1303 netdev_get_devices(&netdev_linux_class, &device_shash);
1304 SHASH_FOR_EACH (node, &device_shash) {
1305 struct netdev *netdev = node->data;
1306 struct netdev_linux *dev = netdev_linux_cast(netdev);
1308 ovs_mutex_lock(&dev->mutex);
1309 if (dev->miimon_interval > 0) {
1310 timer_wait(&dev->miimon_timer);
1312 ovs_mutex_unlock(&dev->mutex);
1313 netdev_close(netdev);
1315 shash_destroy(&device_shash);
1319 swap_uint64(uint64_t *a, uint64_t *b)
1326 /* Copies 'src' into 'dst', performing format conversion in the process.
1328 * 'src' is allowed to be misaligned. */
1330 netdev_stats_from_ovs_vport_stats(struct netdev_stats *dst,
1331 const struct ovs_vport_stats *src)
1333 dst->rx_packets = get_unaligned_u64(&src->rx_packets);
1334 dst->tx_packets = get_unaligned_u64(&src->tx_packets);
1335 dst->rx_bytes = get_unaligned_u64(&src->rx_bytes);
1336 dst->tx_bytes = get_unaligned_u64(&src->tx_bytes);
1337 dst->rx_errors = get_unaligned_u64(&src->rx_errors);
1338 dst->tx_errors = get_unaligned_u64(&src->tx_errors);
1339 dst->rx_dropped = get_unaligned_u64(&src->rx_dropped);
1340 dst->tx_dropped = get_unaligned_u64(&src->tx_dropped);
1342 dst->collisions = 0;
1343 dst->rx_length_errors = 0;
1344 dst->rx_over_errors = 0;
1345 dst->rx_crc_errors = 0;
1346 dst->rx_frame_errors = 0;
1347 dst->rx_fifo_errors = 0;
1348 dst->rx_missed_errors = 0;
1349 dst->tx_aborted_errors = 0;
1350 dst->tx_carrier_errors = 0;
1351 dst->tx_fifo_errors = 0;
1352 dst->tx_heartbeat_errors = 0;
1353 dst->tx_window_errors = 0;
1357 get_stats_via_vport__(const struct netdev *netdev, struct netdev_stats *stats)
1359 struct dpif_linux_vport reply;
1363 error = dpif_linux_vport_get(netdev_get_name(netdev), &reply, &buf);
1366 } else if (!reply.stats) {
1371 netdev_stats_from_ovs_vport_stats(stats, reply.stats);
1379 get_stats_via_vport(const struct netdev *netdev_,
1380 struct netdev_stats *stats)
1382 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1384 if (!netdev->vport_stats_error ||
1385 !(netdev->cache_valid & VALID_VPORT_STAT_ERROR)) {
1388 error = get_stats_via_vport__(netdev_, stats);
1389 if (error && error != ENOENT) {
1390 VLOG_WARN_RL(&rl, "%s: obtaining netdev stats via vport failed "
1392 netdev_get_name(netdev_), ovs_strerror(error));
1394 netdev->vport_stats_error = error;
1395 netdev->cache_valid |= VALID_VPORT_STAT_ERROR;
1399 /* Retrieves current device stats for 'netdev-linux'. */
1401 netdev_linux_get_stats(const struct netdev *netdev_,
1402 struct netdev_stats *stats)
1404 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1405 struct netdev_stats dev_stats;
1408 ovs_mutex_lock(&netdev->mutex);
1409 get_stats_via_vport(netdev_, stats);
1410 error = get_stats_via_netlink(netdev_, &dev_stats);
1412 if (!netdev->vport_stats_error) {
1415 } else if (netdev->vport_stats_error) {
1416 /* stats not available from OVS then use ioctl stats. */
1419 stats->rx_errors += dev_stats.rx_errors;
1420 stats->tx_errors += dev_stats.tx_errors;
1421 stats->rx_dropped += dev_stats.rx_dropped;
1422 stats->tx_dropped += dev_stats.tx_dropped;
1423 stats->multicast += dev_stats.multicast;
1424 stats->collisions += dev_stats.collisions;
1425 stats->rx_length_errors += dev_stats.rx_length_errors;
1426 stats->rx_over_errors += dev_stats.rx_over_errors;
1427 stats->rx_crc_errors += dev_stats.rx_crc_errors;
1428 stats->rx_frame_errors += dev_stats.rx_frame_errors;
1429 stats->rx_fifo_errors += dev_stats.rx_fifo_errors;
1430 stats->rx_missed_errors += dev_stats.rx_missed_errors;
1431 stats->tx_aborted_errors += dev_stats.tx_aborted_errors;
1432 stats->tx_carrier_errors += dev_stats.tx_carrier_errors;
1433 stats->tx_fifo_errors += dev_stats.tx_fifo_errors;
1434 stats->tx_heartbeat_errors += dev_stats.tx_heartbeat_errors;
1435 stats->tx_window_errors += dev_stats.tx_window_errors;
1437 ovs_mutex_unlock(&netdev->mutex);
1442 /* Retrieves current device stats for 'netdev-tap' netdev or
1443 * netdev-internal. */
1445 netdev_tap_get_stats(const struct netdev *netdev_, struct netdev_stats *stats)
1447 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1448 struct netdev_stats dev_stats;
1451 ovs_mutex_lock(&netdev->mutex);
1452 get_stats_via_vport(netdev_, stats);
1453 error = get_stats_via_netlink(netdev_, &dev_stats);
1455 if (!netdev->vport_stats_error) {
1458 } else if (netdev->vport_stats_error) {
1459 /* Transmit and receive stats will appear to be swapped relative to the
1460 * other ports since we are the one sending the data, not a remote
1461 * computer. For consistency, we swap them back here. This does not
1462 * apply if we are getting stats from the vport layer because it always
1463 * tracks stats from the perspective of the switch. */
1466 swap_uint64(&stats->rx_packets, &stats->tx_packets);
1467 swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
1468 swap_uint64(&stats->rx_errors, &stats->tx_errors);
1469 swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
1470 stats->rx_length_errors = 0;
1471 stats->rx_over_errors = 0;
1472 stats->rx_crc_errors = 0;
1473 stats->rx_frame_errors = 0;
1474 stats->rx_fifo_errors = 0;
1475 stats->rx_missed_errors = 0;
1476 stats->tx_aborted_errors = 0;
1477 stats->tx_carrier_errors = 0;
1478 stats->tx_fifo_errors = 0;
1479 stats->tx_heartbeat_errors = 0;
1480 stats->tx_window_errors = 0;
1482 stats->rx_dropped += dev_stats.tx_dropped;
1483 stats->tx_dropped += dev_stats.rx_dropped;
1485 stats->rx_errors += dev_stats.tx_errors;
1486 stats->tx_errors += dev_stats.rx_errors;
1488 stats->multicast += dev_stats.multicast;
1489 stats->collisions += dev_stats.collisions;
1491 ovs_mutex_unlock(&netdev->mutex);
1497 netdev_internal_get_stats(const struct netdev *netdev_,
1498 struct netdev_stats *stats)
1500 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1503 ovs_mutex_lock(&netdev->mutex);
1504 get_stats_via_vport(netdev_, stats);
1505 error = netdev->vport_stats_error;
1506 ovs_mutex_unlock(&netdev->mutex);
1512 netdev_internal_set_stats(struct netdev *netdev,
1513 const struct netdev_stats *stats)
1515 struct ovs_vport_stats vport_stats;
1516 struct dpif_linux_vport vport;
1519 vport_stats.rx_packets = stats->rx_packets;
1520 vport_stats.tx_packets = stats->tx_packets;
1521 vport_stats.rx_bytes = stats->rx_bytes;
1522 vport_stats.tx_bytes = stats->tx_bytes;
1523 vport_stats.rx_errors = stats->rx_errors;
1524 vport_stats.tx_errors = stats->tx_errors;
1525 vport_stats.rx_dropped = stats->rx_dropped;
1526 vport_stats.tx_dropped = stats->tx_dropped;
1528 dpif_linux_vport_init(&vport);
1529 vport.cmd = OVS_VPORT_CMD_SET;
1530 vport.name = netdev_get_name(netdev);
1531 vport.stats = &vport_stats;
1533 err = dpif_linux_vport_transact(&vport, NULL, NULL);
1535 /* If the vport layer doesn't know about the device, that doesn't mean it
1536 * doesn't exist (after all were able to open it when netdev_open() was
1537 * called), it just means that it isn't attached and we'll be getting
1538 * stats a different way. */
1539 if (err == ENODEV) {
1547 netdev_linux_read_features(struct netdev_linux *netdev)
1549 struct ethtool_cmd ecmd;
1553 if (netdev->cache_valid & VALID_FEATURES) {
1557 COVERAGE_INC(netdev_get_ethtool);
1558 memset(&ecmd, 0, sizeof ecmd);
1559 error = netdev_linux_do_ethtool(netdev->up.name, &ecmd,
1560 ETHTOOL_GSET, "ETHTOOL_GSET");
1565 /* Supported features. */
1566 netdev->supported = 0;
1567 if (ecmd.supported & SUPPORTED_10baseT_Half) {
1568 netdev->supported |= NETDEV_F_10MB_HD;
1570 if (ecmd.supported & SUPPORTED_10baseT_Full) {
1571 netdev->supported |= NETDEV_F_10MB_FD;
1573 if (ecmd.supported & SUPPORTED_100baseT_Half) {
1574 netdev->supported |= NETDEV_F_100MB_HD;
1576 if (ecmd.supported & SUPPORTED_100baseT_Full) {
1577 netdev->supported |= NETDEV_F_100MB_FD;
1579 if (ecmd.supported & SUPPORTED_1000baseT_Half) {
1580 netdev->supported |= NETDEV_F_1GB_HD;
1582 if (ecmd.supported & SUPPORTED_1000baseT_Full) {
1583 netdev->supported |= NETDEV_F_1GB_FD;
1585 if (ecmd.supported & SUPPORTED_10000baseT_Full) {
1586 netdev->supported |= NETDEV_F_10GB_FD;
1588 if (ecmd.supported & SUPPORTED_TP) {
1589 netdev->supported |= NETDEV_F_COPPER;
1591 if (ecmd.supported & SUPPORTED_FIBRE) {
1592 netdev->supported |= NETDEV_F_FIBER;
1594 if (ecmd.supported & SUPPORTED_Autoneg) {
1595 netdev->supported |= NETDEV_F_AUTONEG;
1597 if (ecmd.supported & SUPPORTED_Pause) {
1598 netdev->supported |= NETDEV_F_PAUSE;
1600 if (ecmd.supported & SUPPORTED_Asym_Pause) {
1601 netdev->supported |= NETDEV_F_PAUSE_ASYM;
1604 /* Advertised features. */
1605 netdev->advertised = 0;
1606 if (ecmd.advertising & ADVERTISED_10baseT_Half) {
1607 netdev->advertised |= NETDEV_F_10MB_HD;
1609 if (ecmd.advertising & ADVERTISED_10baseT_Full) {
1610 netdev->advertised |= NETDEV_F_10MB_FD;
1612 if (ecmd.advertising & ADVERTISED_100baseT_Half) {
1613 netdev->advertised |= NETDEV_F_100MB_HD;
1615 if (ecmd.advertising & ADVERTISED_100baseT_Full) {
1616 netdev->advertised |= NETDEV_F_100MB_FD;
1618 if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
1619 netdev->advertised |= NETDEV_F_1GB_HD;
1621 if (ecmd.advertising & ADVERTISED_1000baseT_Full) {
1622 netdev->advertised |= NETDEV_F_1GB_FD;
1624 if (ecmd.advertising & ADVERTISED_10000baseT_Full) {
1625 netdev->advertised |= NETDEV_F_10GB_FD;
1627 if (ecmd.advertising & ADVERTISED_TP) {
1628 netdev->advertised |= NETDEV_F_COPPER;
1630 if (ecmd.advertising & ADVERTISED_FIBRE) {
1631 netdev->advertised |= NETDEV_F_FIBER;
1633 if (ecmd.advertising & ADVERTISED_Autoneg) {
1634 netdev->advertised |= NETDEV_F_AUTONEG;
1636 if (ecmd.advertising & ADVERTISED_Pause) {
1637 netdev->advertised |= NETDEV_F_PAUSE;
1639 if (ecmd.advertising & ADVERTISED_Asym_Pause) {
1640 netdev->advertised |= NETDEV_F_PAUSE_ASYM;
1643 /* Current settings. */
1645 if (speed == SPEED_10) {
1646 netdev->current = ecmd.duplex ? NETDEV_F_10MB_FD : NETDEV_F_10MB_HD;
1647 } else if (speed == SPEED_100) {
1648 netdev->current = ecmd.duplex ? NETDEV_F_100MB_FD : NETDEV_F_100MB_HD;
1649 } else if (speed == SPEED_1000) {
1650 netdev->current = ecmd.duplex ? NETDEV_F_1GB_FD : NETDEV_F_1GB_HD;
1651 } else if (speed == SPEED_10000) {
1652 netdev->current = NETDEV_F_10GB_FD;
1653 } else if (speed == 40000) {
1654 netdev->current = NETDEV_F_40GB_FD;
1655 } else if (speed == 100000) {
1656 netdev->current = NETDEV_F_100GB_FD;
1657 } else if (speed == 1000000) {
1658 netdev->current = NETDEV_F_1TB_FD;
1660 netdev->current = 0;
1663 if (ecmd.port == PORT_TP) {
1664 netdev->current |= NETDEV_F_COPPER;
1665 } else if (ecmd.port == PORT_FIBRE) {
1666 netdev->current |= NETDEV_F_FIBER;
1670 netdev->current |= NETDEV_F_AUTONEG;
1674 netdev->cache_valid |= VALID_FEATURES;
1675 netdev->get_features_error = error;
1678 /* Stores the features supported by 'netdev' into of '*current', '*advertised',
1679 * '*supported', and '*peer'. Each value is a bitmap of NETDEV_* bits.
1680 * Returns 0 if successful, otherwise a positive errno value. */
1682 netdev_linux_get_features(const struct netdev *netdev_,
1683 enum netdev_features *current,
1684 enum netdev_features *advertised,
1685 enum netdev_features *supported,
1686 enum netdev_features *peer)
1688 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1691 ovs_mutex_lock(&netdev->mutex);
1692 netdev_linux_read_features(netdev);
1693 if (!netdev->get_features_error) {
1694 *current = netdev->current;
1695 *advertised = netdev->advertised;
1696 *supported = netdev->supported;
1697 *peer = 0; /* XXX */
1699 error = netdev->get_features_error;
1700 ovs_mutex_unlock(&netdev->mutex);
1705 /* Set the features advertised by 'netdev' to 'advertise'. */
1707 netdev_linux_set_advertisements(struct netdev *netdev_,
1708 enum netdev_features advertise)
1710 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1711 struct ethtool_cmd ecmd;
1714 ovs_mutex_lock(&netdev->mutex);
1716 COVERAGE_INC(netdev_get_ethtool);
1717 memset(&ecmd, 0, sizeof ecmd);
1718 error = netdev_linux_do_ethtool(netdev_get_name(netdev_), &ecmd,
1719 ETHTOOL_GSET, "ETHTOOL_GSET");
1724 ecmd.advertising = 0;
1725 if (advertise & NETDEV_F_10MB_HD) {
1726 ecmd.advertising |= ADVERTISED_10baseT_Half;
1728 if (advertise & NETDEV_F_10MB_FD) {
1729 ecmd.advertising |= ADVERTISED_10baseT_Full;
1731 if (advertise & NETDEV_F_100MB_HD) {
1732 ecmd.advertising |= ADVERTISED_100baseT_Half;
1734 if (advertise & NETDEV_F_100MB_FD) {
1735 ecmd.advertising |= ADVERTISED_100baseT_Full;
1737 if (advertise & NETDEV_F_1GB_HD) {
1738 ecmd.advertising |= ADVERTISED_1000baseT_Half;
1740 if (advertise & NETDEV_F_1GB_FD) {
1741 ecmd.advertising |= ADVERTISED_1000baseT_Full;
1743 if (advertise & NETDEV_F_10GB_FD) {
1744 ecmd.advertising |= ADVERTISED_10000baseT_Full;
1746 if (advertise & NETDEV_F_COPPER) {
1747 ecmd.advertising |= ADVERTISED_TP;
1749 if (advertise & NETDEV_F_FIBER) {
1750 ecmd.advertising |= ADVERTISED_FIBRE;
1752 if (advertise & NETDEV_F_AUTONEG) {
1753 ecmd.advertising |= ADVERTISED_Autoneg;
1755 if (advertise & NETDEV_F_PAUSE) {
1756 ecmd.advertising |= ADVERTISED_Pause;
1758 if (advertise & NETDEV_F_PAUSE_ASYM) {
1759 ecmd.advertising |= ADVERTISED_Asym_Pause;
1761 COVERAGE_INC(netdev_set_ethtool);
1762 error = netdev_linux_do_ethtool(netdev_get_name(netdev_), &ecmd,
1763 ETHTOOL_SSET, "ETHTOOL_SSET");
1766 ovs_mutex_unlock(&netdev->mutex);
1770 /* Attempts to set input rate limiting (policing) policy. Returns 0 if
1771 * successful, otherwise a positive errno value. */
1773 netdev_linux_set_policing(struct netdev *netdev_,
1774 uint32_t kbits_rate, uint32_t kbits_burst)
1776 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1777 const char *netdev_name = netdev_get_name(netdev_);
1780 kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
1781 : !kbits_burst ? 1000 /* Default to 1000 kbits if 0. */
1782 : kbits_burst); /* Stick with user-specified value. */
1784 ovs_mutex_lock(&netdev->mutex);
1785 if (netdev->cache_valid & VALID_POLICING) {
1786 error = netdev->netdev_policing_error;
1787 if (error || (netdev->kbits_rate == kbits_rate &&
1788 netdev->kbits_burst == kbits_burst)) {
1789 /* Assume that settings haven't changed since we last set them. */
1792 netdev->cache_valid &= ~VALID_POLICING;
1795 COVERAGE_INC(netdev_set_policing);
1796 /* Remove any existing ingress qdisc. */
1797 error = tc_add_del_ingress_qdisc(netdev_, false);
1799 VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
1800 netdev_name, ovs_strerror(error));
1805 error = tc_add_del_ingress_qdisc(netdev_, true);
1807 VLOG_WARN_RL(&rl, "%s: adding policing qdisc failed: %s",
1808 netdev_name, ovs_strerror(error));
1812 error = tc_add_policer(netdev_, kbits_rate, kbits_burst);
1814 VLOG_WARN_RL(&rl, "%s: adding policing action failed: %s",
1815 netdev_name, ovs_strerror(error));
1820 netdev->kbits_rate = kbits_rate;
1821 netdev->kbits_burst = kbits_burst;
1824 if (!error || error == ENODEV) {
1825 netdev->netdev_policing_error = error;
1826 netdev->cache_valid |= VALID_POLICING;
1828 ovs_mutex_unlock(&netdev->mutex);
1833 netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
1836 const struct tc_ops *const *opsp;
1838 for (opsp = tcs; *opsp != NULL; opsp++) {
1839 const struct tc_ops *ops = *opsp;
1840 if (ops->tc_install && ops->ovs_name[0] != '\0') {
1841 sset_add(types, ops->ovs_name);
1847 static const struct tc_ops *
1848 tc_lookup_ovs_name(const char *name)
1850 const struct tc_ops *const *opsp;
1852 for (opsp = tcs; *opsp != NULL; opsp++) {
1853 const struct tc_ops *ops = *opsp;
1854 if (!strcmp(name, ops->ovs_name)) {
1861 static const struct tc_ops *
1862 tc_lookup_linux_name(const char *name)
1864 const struct tc_ops *const *opsp;
1866 for (opsp = tcs; *opsp != NULL; opsp++) {
1867 const struct tc_ops *ops = *opsp;
1868 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
1875 static struct tc_queue *
1876 tc_find_queue__(const struct netdev *netdev_, unsigned int queue_id,
1879 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1880 struct tc_queue *queue;
1882 HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev->tc->queues) {
1883 if (queue->queue_id == queue_id) {
1890 static struct tc_queue *
1891 tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
1893 return tc_find_queue__(netdev, queue_id, hash_int(queue_id, 0));
1897 netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
1899 struct netdev_qos_capabilities *caps)
1901 const struct tc_ops *ops = tc_lookup_ovs_name(type);
1905 caps->n_queues = ops->n_queues;
1910 netdev_linux_get_qos(const struct netdev *netdev_,
1911 const char **typep, struct smap *details)
1913 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1916 ovs_mutex_lock(&netdev->mutex);
1917 error = tc_query_qdisc(netdev_);
1919 *typep = netdev->tc->ops->ovs_name;
1920 error = (netdev->tc->ops->qdisc_get
1921 ? netdev->tc->ops->qdisc_get(netdev_, details)
1924 ovs_mutex_unlock(&netdev->mutex);
1930 netdev_linux_set_qos(struct netdev *netdev_,
1931 const char *type, const struct smap *details)
1933 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1934 const struct tc_ops *new_ops;
1937 new_ops = tc_lookup_ovs_name(type);
1938 if (!new_ops || !new_ops->tc_install) {
1942 ovs_mutex_lock(&netdev->mutex);
1943 error = tc_query_qdisc(netdev_);
1948 if (new_ops == netdev->tc->ops) {
1949 error = new_ops->qdisc_set ? new_ops->qdisc_set(netdev_, details) : 0;
1951 /* Delete existing qdisc. */
1952 error = tc_del_qdisc(netdev_);
1956 ovs_assert(netdev->tc == NULL);
1958 /* Install new qdisc. */
1959 error = new_ops->tc_install(netdev_, details);
1960 ovs_assert((error == 0) == (netdev->tc != NULL));
1964 ovs_mutex_unlock(&netdev->mutex);
1969 netdev_linux_get_queue(const struct netdev *netdev_,
1970 unsigned int queue_id, struct smap *details)
1972 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1975 ovs_mutex_lock(&netdev->mutex);
1976 error = tc_query_qdisc(netdev_);
1978 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
1980 ? netdev->tc->ops->class_get(netdev_, queue, details)
1983 ovs_mutex_unlock(&netdev->mutex);
1989 netdev_linux_set_queue(struct netdev *netdev_,
1990 unsigned int queue_id, const struct smap *details)
1992 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1995 ovs_mutex_lock(&netdev->mutex);
1996 error = tc_query_qdisc(netdev_);
1998 error = (queue_id < netdev->tc->ops->n_queues
1999 && netdev->tc->ops->class_set
2000 ? netdev->tc->ops->class_set(netdev_, queue_id, details)
2003 ovs_mutex_unlock(&netdev->mutex);
2009 netdev_linux_delete_queue(struct netdev *netdev_, unsigned int queue_id)
2011 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2014 ovs_mutex_lock(&netdev->mutex);
2015 error = tc_query_qdisc(netdev_);
2017 if (netdev->tc->ops->class_delete) {
2018 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2020 ? netdev->tc->ops->class_delete(netdev_, queue)
2026 ovs_mutex_unlock(&netdev->mutex);
2032 netdev_linux_get_queue_stats(const struct netdev *netdev_,
2033 unsigned int queue_id,
2034 struct netdev_queue_stats *stats)
2036 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2039 ovs_mutex_lock(&netdev->mutex);
2040 error = tc_query_qdisc(netdev_);
2042 if (netdev->tc->ops->class_get_stats) {
2043 const struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2045 stats->created = queue->created;
2046 error = netdev->tc->ops->class_get_stats(netdev_, queue,
2055 ovs_mutex_unlock(&netdev->mutex);
2061 start_queue_dump(const struct netdev *netdev, struct nl_dump *dump)
2063 struct ofpbuf request;
2064 struct tcmsg *tcmsg;
2066 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
2070 tcmsg->tcm_parent = 0;
2071 nl_dump_start(dump, NETLINK_ROUTE, &request);
2072 ofpbuf_uninit(&request);
/* Iteration state for netdev_linux_queue_dump_start()/_next()/_done(). */
struct netdev_linux_queue_state {
    unsigned int *queues;       /* Snapshot of queue ids, owned by state. */
    size_t cur_queue;           /* Index of next queue id to visit. */
    size_t n_queues;            /* Number of elements in 'queues'. */
};
2083 netdev_linux_queue_dump_start(const struct netdev *netdev_, void **statep)
2085 const struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2088 ovs_mutex_lock(&netdev->mutex);
2089 error = tc_query_qdisc(netdev_);
2091 if (netdev->tc->ops->class_get) {
2092 struct netdev_linux_queue_state *state;
2093 struct tc_queue *queue;
2096 *statep = state = xmalloc(sizeof *state);
2097 state->n_queues = hmap_count(&netdev->tc->queues);
2098 state->cur_queue = 0;
2099 state->queues = xmalloc(state->n_queues * sizeof *state->queues);
2102 HMAP_FOR_EACH (queue, hmap_node, &netdev->tc->queues) {
2103 state->queues[i++] = queue->queue_id;
2109 ovs_mutex_unlock(&netdev->mutex);
2115 netdev_linux_queue_dump_next(const struct netdev *netdev_, void *state_,
2116 unsigned int *queue_idp, struct smap *details)
2118 const struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2119 struct netdev_linux_queue_state *state = state_;
2122 ovs_mutex_lock(&netdev->mutex);
2123 while (state->cur_queue < state->n_queues) {
2124 unsigned int queue_id = state->queues[state->cur_queue++];
2125 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2128 *queue_idp = queue_id;
2129 error = netdev->tc->ops->class_get(netdev_, queue, details);
2133 ovs_mutex_unlock(&netdev->mutex);
2139 netdev_linux_queue_dump_done(const struct netdev *netdev OVS_UNUSED,
2142 struct netdev_linux_queue_state *state = state_;
2144 free(state->queues);
2150 netdev_linux_dump_queue_stats(const struct netdev *netdev_,
2151 netdev_dump_queue_stats_cb *cb, void *aux)
2153 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2156 ovs_mutex_lock(&netdev->mutex);
2157 error = tc_query_qdisc(netdev_);
2159 struct nl_dump dump;
2161 if (!netdev->tc->ops->class_dump_stats) {
2163 } else if (!start_queue_dump(netdev_, &dump)) {
2169 while (nl_dump_next(&dump, &msg)) {
2170 retval = netdev->tc->ops->class_dump_stats(netdev_, &msg,
2177 retval = nl_dump_done(&dump);
2183 ovs_mutex_unlock(&netdev->mutex);
2189 netdev_linux_get_in4(const struct netdev *netdev_,
2190 struct in_addr *address, struct in_addr *netmask)
2192 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2195 ovs_mutex_lock(&netdev->mutex);
2196 if (!(netdev->cache_valid & VALID_IN4)) {
2197 error = netdev_linux_get_ipv4(netdev_, &netdev->address,
2198 SIOCGIFADDR, "SIOCGIFADDR");
2200 error = netdev_linux_get_ipv4(netdev_, &netdev->netmask,
2201 SIOCGIFNETMASK, "SIOCGIFNETMASK");
2203 netdev->cache_valid |= VALID_IN4;
2211 if (netdev->address.s_addr != INADDR_ANY) {
2212 *address = netdev->address;
2213 *netmask = netdev->netmask;
2215 error = EADDRNOTAVAIL;
2218 ovs_mutex_unlock(&netdev->mutex);
2224 netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
2225 struct in_addr netmask)
2227 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2230 ovs_mutex_lock(&netdev->mutex);
2231 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
2233 netdev->cache_valid |= VALID_IN4;
2234 netdev->address = address;
2235 netdev->netmask = netmask;
2236 if (address.s_addr != INADDR_ANY) {
2237 error = do_set_addr(netdev_, SIOCSIFNETMASK,
2238 "SIOCSIFNETMASK", netmask);
2241 ovs_mutex_unlock(&netdev->mutex);
/* Parses one line of /proc/net/if_inet6 into '*in6' (the 16 address bytes)
 * and 'ifname' (the interface name, at most 16 characters plus NUL).
 * Returns true on a successful parse, false otherwise. */
static bool
parse_if_inet6_line(const char *line,
                    struct in6_addr *in6, char ifname[16 + 1])
{
    uint8_t *s6 = in6->s6_addr;
#define X8 "%2"SCNx8
    return ovs_scan(line,
                    " "X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8
                    "%*x %*x %*x %*x %16s\n",
                    &s6[0], &s6[1], &s6[2], &s6[3],
                    &s6[4], &s6[5], &s6[6], &s6[7],
                    &s6[8], &s6[9], &s6[10], &s6[11],
                    &s6[12], &s6[13], &s6[14], &s6[15],
                    ifname);
#undef X8
}
2262 /* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address (if
2263 * 'in6' is non-null) and returns true. Otherwise, returns false. */
2265 netdev_linux_get_in6(const struct netdev *netdev_, struct in6_addr *in6)
2267 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2269 ovs_mutex_lock(&netdev->mutex);
2270 if (!(netdev->cache_valid & VALID_IN6)) {
2274 netdev->in6 = in6addr_any;
2276 file = fopen("/proc/net/if_inet6", "r");
2278 const char *name = netdev_get_name(netdev_);
2279 while (fgets(line, sizeof line, file)) {
2280 struct in6_addr in6_tmp;
2281 char ifname[16 + 1];
2282 if (parse_if_inet6_line(line, &in6_tmp, ifname)
2283 && !strcmp(name, ifname))
2285 netdev->in6 = in6_tmp;
2291 netdev->cache_valid |= VALID_IN6;
2294 ovs_mutex_unlock(&netdev->mutex);
/* Fills '*sa' with an AF_INET sockaddr for 'addr' (port 0), zeroing any
 * trailing bytes of the generic sockaddr. */
static void
make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
{
    struct sockaddr_in sin;
    memset(&sin, 0, sizeof sin);
    sin.sin_family = AF_INET;
    sin.sin_addr = addr;
    sin.sin_port = 0;

    memset(sa, 0, sizeof *sa);
    memcpy(sa, &sin, sizeof sin);
}
2313 do_set_addr(struct netdev *netdev,
2314 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
2318 make_in4_sockaddr(&ifr.ifr_addr, addr);
2319 return af_inet_ifreq_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
2323 /* Adds 'router' as a default IP gateway. */
2325 netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
2327 struct in_addr any = { INADDR_ANY };
2331 memset(&rt, 0, sizeof rt);
2332 make_in4_sockaddr(&rt.rt_dst, any);
2333 make_in4_sockaddr(&rt.rt_gateway, router);
2334 make_in4_sockaddr(&rt.rt_genmask, any);
2335 rt.rt_flags = RTF_UP | RTF_GATEWAY;
2336 error = af_inet_ioctl(SIOCADDRT, &rt);
2338 VLOG_WARN("ioctl(SIOCADDRT): %s", ovs_strerror(error));
2344 netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
2347 static const char fn[] = "/proc/net/route";
2352 *netdev_name = NULL;
2353 stream = fopen(fn, "r");
2354 if (stream == NULL) {
2355 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, ovs_strerror(errno));
2360 while (fgets(line, sizeof line, stream)) {
2363 ovs_be32 dest, gateway, mask;
2364 int refcnt, metric, mtu;
2365 unsigned int flags, use, window, irtt;
2368 "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
2370 iface, &dest, &gateway, &flags, &refcnt,
2371 &use, &metric, &mask, &mtu, &window, &irtt)) {
2372 VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
2376 if (!(flags & RTF_UP)) {
2377 /* Skip routes that aren't up. */
2381 /* The output of 'dest', 'mask', and 'gateway' were given in
2382 * network byte order, so we don't need need any endian
2383 * conversions here. */
2384 if ((dest & mask) == (host->s_addr & mask)) {
2386 /* The host is directly reachable. */
2387 next_hop->s_addr = 0;
2389 /* To reach the host, we must go through a gateway. */
2390 next_hop->s_addr = gateway;
2392 *netdev_name = xstrdup(iface);
2404 netdev_linux_get_status(const struct netdev *netdev_, struct smap *smap)
2406 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2409 ovs_mutex_lock(&netdev->mutex);
2410 if (!(netdev->cache_valid & VALID_DRVINFO)) {
2411 struct ethtool_cmd *cmd = (struct ethtool_cmd *) &netdev->drvinfo;
2413 COVERAGE_INC(netdev_get_ethtool);
2414 memset(&netdev->drvinfo, 0, sizeof netdev->drvinfo);
2415 error = netdev_linux_do_ethtool(netdev->up.name,
2418 "ETHTOOL_GDRVINFO");
2420 netdev->cache_valid |= VALID_DRVINFO;
2425 smap_add(smap, "driver_name", netdev->drvinfo.driver);
2426 smap_add(smap, "driver_version", netdev->drvinfo.version);
2427 smap_add(smap, "firmware_version", netdev->drvinfo.fw_version);
2429 ovs_mutex_unlock(&netdev->mutex);
2435 netdev_internal_get_status(const struct netdev *netdev OVS_UNUSED,
2438 smap_add(smap, "driver_name", "openvswitch");
2442 /* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
2443 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
2444 * returns 0. Otherwise, it returns a positive errno value; in particular,
2445 * ENXIO indicates that there is not ARP table entry for 'ip' on 'netdev'. */
2447 netdev_linux_arp_lookup(const struct netdev *netdev,
2448 ovs_be32 ip, uint8_t mac[ETH_ADDR_LEN])
2451 struct sockaddr_in sin;
2454 memset(&r, 0, sizeof r);
2455 memset(&sin, 0, sizeof sin);
2456 sin.sin_family = AF_INET;
2457 sin.sin_addr.s_addr = ip;
2459 memcpy(&r.arp_pa, &sin, sizeof sin);
2460 r.arp_ha.sa_family = ARPHRD_ETHER;
2462 ovs_strzcpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
2463 COVERAGE_INC(netdev_arp_lookup);
2464 retval = af_inet_ioctl(SIOCGARP, &r);
2466 memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
2467 } else if (retval != ENXIO) {
2468 VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
2469 netdev_get_name(netdev), IP_ARGS(ip),
2470 ovs_strerror(retval));
2476 nd_to_iff_flags(enum netdev_flags nd)
2479 if (nd & NETDEV_UP) {
2482 if (nd & NETDEV_PROMISC) {
2485 if (nd & NETDEV_LOOPBACK) {
2486 iff |= IFF_LOOPBACK;
2492 iff_to_nd_flags(int iff)
2494 enum netdev_flags nd = 0;
2498 if (iff & IFF_PROMISC) {
2499 nd |= NETDEV_PROMISC;
2501 if (iff & IFF_LOOPBACK) {
2502 nd |= NETDEV_LOOPBACK;
2508 update_flags(struct netdev_linux *netdev, enum netdev_flags off,
2509 enum netdev_flags on, enum netdev_flags *old_flagsp)
2510 OVS_REQUIRES(netdev->mutex)
2512 int old_flags, new_flags;
2515 old_flags = netdev->ifi_flags;
2516 *old_flagsp = iff_to_nd_flags(old_flags);
2517 new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
2518 if (new_flags != old_flags) {
2519 error = set_flags(netdev_get_name(&netdev->up), new_flags);
2520 get_flags(&netdev->up, &netdev->ifi_flags);
/* netdev_class 'update_flags' entry point: serializes access to the device's
 * flag state by taking netdev->mutex around update_flags(). */
2527 netdev_linux_update_flags(struct netdev *netdev_, enum netdev_flags off,
2528 enum netdev_flags on, enum netdev_flags *old_flagsp)
2530 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2533 ovs_mutex_lock(&netdev->mutex);
2534 error = update_flags(netdev, off, on, old_flagsp);
2535 ovs_mutex_unlock(&netdev->mutex);
/* Template for the struct netdev_class initializers shared by the "system",
 * "tap" and "internal" netdev classes below. Callers supply the per-class
 * hooks (constructor, stats get/set, features, status); everything else is
 * the common netdev_linux_* implementation. */
2540 #define NETDEV_LINUX_CLASS(NAME, CONSTRUCT, GET_STATS, SET_STATS, \
2541 GET_FEATURES, GET_STATUS) \
2547 netdev_linux_wait, \
2549 netdev_linux_alloc, \
2551 netdev_linux_destruct, \
2552 netdev_linux_dealloc, \
2553 NULL, /* get_config */ \
2554 NULL, /* set_config */ \
2555 NULL, /* get_tunnel_config */ \
2557 netdev_linux_send, \
2558 netdev_linux_send_wait, \
2560 netdev_linux_set_etheraddr, \
2561 netdev_linux_get_etheraddr, \
2562 netdev_linux_get_mtu, \
2563 netdev_linux_set_mtu, \
2564 netdev_linux_get_ifindex, \
2565 netdev_linux_get_carrier, \
2566 netdev_linux_get_carrier_resets, \
2567 netdev_linux_set_miimon_interval, \
2572 netdev_linux_set_advertisements, \
2574 netdev_linux_set_policing, \
2575 netdev_linux_get_qos_types, \
2576 netdev_linux_get_qos_capabilities, \
2577 netdev_linux_get_qos, \
2578 netdev_linux_set_qos, \
2579 netdev_linux_get_queue, \
2580 netdev_linux_set_queue, \
2581 netdev_linux_delete_queue, \
2582 netdev_linux_get_queue_stats, \
2583 netdev_linux_queue_dump_start, \
2584 netdev_linux_queue_dump_next, \
2585 netdev_linux_queue_dump_done, \
2586 netdev_linux_dump_queue_stats, \
2588 netdev_linux_get_in4, \
2589 netdev_linux_set_in4, \
2590 netdev_linux_get_in6, \
2591 netdev_linux_add_router, \
2592 netdev_linux_get_next_hop, \
2594 netdev_linux_arp_lookup, \
2596 netdev_linux_update_flags, \
2598 netdev_linux_rx_alloc, \
2599 netdev_linux_rx_construct, \
2600 netdev_linux_rx_destruct, \
2601 netdev_linux_rx_dealloc, \
2602 netdev_linux_rx_recv, \
2603 netdev_linux_rx_wait, \
2604 netdev_linux_rx_drain, \
/* Ordinary Linux network device ("system"). */
2607 const struct netdev_class netdev_linux_class =
2610 netdev_linux_construct,
2611 netdev_linux_get_stats,
2612 NULL, /* set_stats */
2613 netdev_linux_get_features,
2614 netdev_linux_get_status);
/* TAP device: same template but with a tap-specific constructor and stats. */
2616 const struct netdev_class netdev_tap_class =
2619 netdev_linux_construct_tap,
2620 netdev_tap_get_stats,
2621 NULL, /* set_stats */
2622 netdev_linux_get_features,
2623 netdev_linux_get_status);
/* OVS "internal" device: internal stats hooks, no get_features. */
2625 const struct netdev_class netdev_internal_class =
2628 netdev_linux_construct,
2629 netdev_internal_get_stats,
2630 netdev_internal_set_stats,
2631 NULL, /* get_features */
2632 netdev_internal_get_status);
2634 /* HTB traffic control class. */
/* Maximum number of queues the linux-htb QoS type exposes; queue ids map to
 * tc class minor numbers 1..HTB_N_QUEUES under major 1. */
2636 #define HTB_N_QUEUES 0xf000
/* NOTE(review): the fields below belong to the qdisc-wide state and to the
 * per-queue class state respectively — presumably 'struct htb' and
 * 'struct htb_class'; the struct headers are outside this excerpt. */
2640 unsigned int max_rate; /* In bytes/s. */
2644 struct tc_queue tc_queue;
2645 unsigned int min_rate; /* In bytes/s. */
2646 unsigned int max_rate; /* In bytes/s. */
2647 unsigned int burst; /* In bytes. */
2648 unsigned int priority; /* Lower values are higher priorities. */
/* Returns the struct htb embedded in 'netdev_''s installed tc state.
 * Only valid when the device's qdisc is HTB. */
2652 htb_get__(const struct netdev *netdev_)
2654 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2655 return CONTAINER_OF(netdev->tc, struct htb, tc);
/* Allocates HTB bookkeeping state with the given 'max_rate' (bytes/s) and
 * installs it as 'netdev_''s current tc implementation. */
2659 htb_install__(struct netdev *netdev_, uint64_t max_rate)
2661 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2664 htb = xmalloc(sizeof *htb)
2665 tc_init(&htb->tc, &tc_ops_htb);
2666 htb->max_rate = max_rate;
2668 netdev->tc = &htb->tc;
2671 /* Create an HTB qdisc.
2673 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
2675 htb_setup_qdisc__(struct netdev *netdev)
2678 struct tc_htb_glob opt;
2679 struct ofpbuf request;
2680 struct tcmsg *tcmsg;
/* Remove any existing root qdisc before installing the new one. */
2682 tc_del_qdisc(netdev);
2684 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2685 NLM_F_EXCL | NLM_F_CREATE, &request);
/* Root qdisc handle 1:0. */
2689 tcmsg->tcm_handle = tc_make_handle(1, 0);
2690 tcmsg->tcm_parent = TC_H_ROOT;
2692 nl_msg_put_string(&request, TCA_KIND, "htb");
2694 memset(&opt, 0, sizeof opt);
/* DRR quantum = rate / rate2quantum; 10 is the conventional tc default. */
2695 opt.rate2quantum = 10;
2699 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2700 nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
2701 nl_msg_end_nested(&request, opt_offset);
2703 return tc_transact(&request, NULL);
2706 /* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
2707 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
2709 htb_setup_class__(struct netdev *netdev, unsigned int handle,
2710 unsigned int parent, struct htb_class *class)
2713 struct tc_htb_opt opt;
2714 struct ofpbuf request;
2715 struct tcmsg *tcmsg;
/* HTB rate tables depend on the device MTU; bail out if it is unknown. */
2719 error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
2721 VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU",
2722 netdev_get_name(netdev));
/* Translate the class parameters into kernel tc_htb_opt form: guaranteed
 * rate, ceiling, and token-bucket buffers sized from rate and burst. */
2726 memset(&opt, 0, sizeof opt);
2727 tc_fill_rate(&opt.rate, class->min_rate, mtu);
2728 tc_fill_rate(&opt.ceil, class->max_rate, mtu);
2729 opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
2730 opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
2731 opt.prio = class->priority;
2733 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
2737 tcmsg->tcm_handle = handle;
2738 tcmsg->tcm_parent = parent;
2740 nl_msg_put_string(&request, TCA_KIND, "htb");
2741 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2742 nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
2743 tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
2744 tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
2745 nl_msg_end_nested(&request, opt_offset);
2747 error = tc_transact(&request, NULL);
2749 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
2750 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
2751 netdev_get_name(netdev),
2752 tc_get_major(handle), tc_get_minor(handle),
2753 tc_get_major(parent), tc_get_minor(parent),
2754 class->min_rate, class->max_rate,
2755 class->burst, class->priority, ovs_strerror(error));
2760 /* Parses Netlink attributes in 'options' for HTB parameters and stores a
2761 * description of them into 'details'. The description complies with the
2762 * specification given in the vswitch database documentation for linux-htb
2765 htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
/* TCA_HTB_PARMS is mandatory and must be at least a full tc_htb_opt. */
2767 static const struct nl_policy tca_htb_policy[] = {
2768 [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
2769 .min_len = sizeof(struct tc_htb_opt) },
2772 struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
2773 const struct tc_htb_opt *htb;
2775 if (!nl_parse_nested(nl_options, tca_htb_policy,
2776 attrs, ARRAY_SIZE(tca_htb_policy))) {
2777 VLOG_WARN_RL(&rl, "failed to parse HTB class options");
/* Translate kernel units back to our class representation; burst comes
 * back as a tick-based buffer and must be converted to bytes. */
2781 htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
2782 class->min_rate = htb->rate.rate;
2783 class->max_rate = htb->ceil.rate;
2784 class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
2785 class->priority = htb->prio;
/* Parses a tc class message in 'tcmsg'. On success fills in '*queue_id'
 * (when non-null) from the class handle, and 'options' (when non-null) from
 * the HTB attributes. Any output pointer may be NULL to skip it. */
2790 htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2791 struct htb_class *options,
2792 struct netdev_queue_stats *stats)
2794 struct nlattr *nl_options;
2795 unsigned int handle;
2798 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2799 if (!error && queue_id) {
2800 unsigned int major = tc_get_major(handle);
2801 unsigned int minor = tc_get_minor(handle);
/* OVS queue N is tc class 1:(N+1); only accept handles in that range. */
2802 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2803 *queue_id = minor - 1;
2808 if (!error && options) {
2809 error = htb_parse_tca_options__(nl_options, options);
/* Extracts qdisc-level HTB settings from 'details'. "max-rate" is given in
 * bits/s and converted to bytes/s; when absent or zero, defaults to the
 * link speed reported by the kernel (100 Mbps if that is unavailable).
 * The root class's min_rate is pinned to max_rate. */
2815 htb_parse_qdisc_details__(struct netdev *netdev_,
2816 const struct smap *details, struct htb_class *hc)
2818 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2819 const char *max_rate_s;
2821 max_rate_s = smap_get(details, "max-rate");
2822 hc->max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
2823 if (!hc->max_rate) {
2824 enum netdev_features current;
2826 netdev_linux_read_features(netdev);
2827 current = !netdev->get_features_error ? netdev->current : 0;
2828 hc->max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
2830 hc->min_rate = hc->max_rate;
/* Extracts per-queue HTB settings from 'details' into 'hc'. All rates in
 * 'details' are bits/s and are converted to bytes/s; each value is clamped
 * into a sane range relative to the MTU and the qdisc's max_rate. */
2836 htb_parse_class_details__(struct netdev *netdev,
2837 const struct smap *details, struct htb_class *hc)
2839 const struct htb *htb = htb_get__(netdev);
2840 const char *min_rate_s = smap_get(details, "min-rate");
2841 const char *max_rate_s = smap_get(details, "max-rate");
2842 const char *burst_s = smap_get(details, "burst");
2843 const char *priority_s = smap_get(details, "priority");
2846 error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
2848 VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU",
2849 netdev_get_name(netdev));
2853 /* HTB requires at least an mtu sized min-rate to send any traffic even
2854 * on uncongested links. */
2855 hc->min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
2856 hc->min_rate = MAX(hc->min_rate, mtu);
2857 hc->min_rate = MIN(hc->min_rate, htb->max_rate);
/* max_rate: at least min_rate, at most the qdisc ceiling. */
2860 hc->max_rate = (max_rate_s
2861 ? strtoull(max_rate_s, NULL, 10) / 8
2863 hc->max_rate = MAX(hc->max_rate, hc->min_rate);
2864 hc->max_rate = MIN(hc->max_rate, htb->max_rate);
2868 * According to hints in the documentation that I've read, it is important
2869 * that 'burst' be at least as big as the largest frame that might be
2870 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
2871 * but having it a bit too small is a problem. Since netdev_get_mtu()
2872 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
2873 * the MTU. We actually add 64, instead of 14, as a guard against
2874 * additional headers getting tacked on somewhere that we're not aware of. */
2875 hc->burst = burst_s ? strtoull(burst_s, NULL, 10) / 8 : 0;
2876 hc->burst = MAX(hc->burst, mtu + 64);
/* priority defaults to 0 (highest) when unspecified. */
2879 hc->priority = priority_s ? strtoul(priority_s, NULL, 10) : 0;
/* Queries the kernel for tc class 'handle' under 'parent' on 'netdev' and
 * parses the reply into 'options' and/or 'stats' (either may be NULL). */
2885 htb_query_class__(const struct netdev *netdev, unsigned int handle,
2886 unsigned int parent, struct htb_class *options,
2887 struct netdev_queue_stats *stats)
2889 struct ofpbuf *reply;
2892 error = tc_query_class(netdev, handle, parent, &reply);
2894 error = htb_parse_tcmsg__(reply, NULL, options, stats);
2895 ofpbuf_delete(reply);
/* tc_ops 'tc_install' hook: creates the root HTB qdisc, sets up the default
 * class 1:fffe from 'details', then records the HTB state on 'netdev'. */
2901 htb_tc_install(struct netdev *netdev, const struct smap *details)
2905 error = htb_setup_qdisc__(netdev);
2907 struct htb_class hc;
2909 htb_parse_qdisc_details__(netdev, details, &hc);
2910 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2911 tc_make_handle(1, 0), &hc);
2913 htb_install__(netdev, hc.max_rate);
/* Downcasts a generic tc_queue to its enclosing htb_class. */
2919 static struct htb_class *
2920 htb_class_cast__(const struct tc_queue *queue)
2922 return CONTAINER_OF(queue, struct htb_class, tc_queue);
/* Updates the cached state for 'queue_id' from 'hc', creating and inserting
 * a new htb_class into the queue hmap if the queue was not known yet. */
2926 htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
2927 const struct htb_class *hc)
2929 struct htb *htb = htb_get__(netdev);
2930 size_t hash = hash_int(queue_id, 0);
2931 struct tc_queue *queue;
2932 struct htb_class *hcp;
2934 queue = tc_find_queue__(netdev, queue_id, hash);
2936 hcp = htb_class_cast__(queue);
/* Not found: allocate and register a fresh queue record. */
2938 hcp = xmalloc(sizeof *hcp);
2939 queue = &hcp->tc_queue;
2940 queue->queue_id = queue_id;
2941 queue->created = time_msec();
2942 hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
/* Copy in the latest parameters in either case. */
2945 hcp->min_rate = hc->min_rate;
2946 hcp->max_rate = hc->max_rate;
2947 hcp->burst = hc->burst;
2948 hcp->priority = hc->priority;
/* tc_ops 'tc_load' hook: adopts an HTB qdisc that already exists in the
 * kernel. Reads the default class (1:fffe) to learn max_rate, then dumps
 * all classes to repopulate the queue table. */
2952 htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
2955 struct nl_dump dump;
2956 struct htb_class hc;
2958 /* Get qdisc options. */
2960 htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
2961 htb_install__(netdev, hc.max_rate);
/* Walk every kernel class and mirror it as a queue. */
2964 if (!start_queue_dump(netdev, &dump)) {
2967 while (nl_dump_next(&dump, &msg)) {
2968 unsigned int queue_id;
2970 if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
2971 htb_update_queue__(netdev, queue_id, &hc);
2974 nl_dump_done(&dump);
/* tc_ops 'tc_destroy' hook: frees every cached htb_class and the htb itself. */
2980 htb_tc_destroy(struct tc *tc)
2982 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
2983 struct htb_class *hc, *next;
2985 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &htb->tc.queues) {
2986 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
/* tc_ops 'qdisc_get' hook: reports "max-rate" in bits/s (stored bytes/s). */
2994 htb_qdisc_get(const struct netdev *netdev, struct smap *details)
2996 const struct htb *htb = htb_get__(netdev);
2997 smap_add_format(details, "max-rate", "%llu", 8ULL * htb->max_rate);
/* tc_ops 'qdisc_set' hook: reconfigures the default class 1:fffe from
 * 'details' and, on success, updates the cached max_rate. */
3002 htb_qdisc_set(struct netdev *netdev, const struct smap *details)
3004 struct htb_class hc;
3007 htb_parse_qdisc_details__(netdev, details, &hc);
3008 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3009 tc_make_handle(1, 0), &hc);
3011 htb_get__(netdev)->max_rate = hc.max_rate;
/* tc_ops 'class_get' hook: reports queue settings in bits/s (stored
 * bytes/s). "max-rate" is emitted only when it differs from "min-rate". */
3017 htb_class_get(const struct netdev *netdev OVS_UNUSED,
3018 const struct tc_queue *queue, struct smap *details)
3020 const struct htb_class *hc = htb_class_cast__(queue);
3022 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
3023 if (hc->min_rate != hc->max_rate) {
3024 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
3026 smap_add_format(details, "burst", "%llu", 8ULL * hc->burst);
3028 smap_add_format(details, "priority", "%u", hc->priority);
/* tc_ops 'class_set' hook: parses 'details', programs the kernel class
 * 1:(queue_id+1) under parent 1:fffe, and updates the local cache. */
3034 htb_class_set(struct netdev *netdev, unsigned int queue_id,
3035 const struct smap *details)
3037 struct htb_class hc;
3040 error = htb_parse_class_details__(netdev, details, &hc);
3045 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3046 tc_make_handle(1, 0xfffe), &hc);
3051 htb_update_queue__(netdev, queue_id, &hc);
/* tc_ops 'class_delete' hook: deletes the kernel class for 'queue' and, on
 * success, removes it from the cached queue table. */
3056 htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
3058 struct htb_class *hc = htb_class_cast__(queue);
3059 struct htb *htb = htb_get__(netdev);
3062 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3064 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
/* tc_ops 'class_get_stats' hook: fetches kernel stats for a single queue. */
3071 htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3072 struct netdev_queue_stats *stats)
3074 return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3075 tc_make_handle(1, 0xfffe), NULL, stats);
/* tc_ops 'class_dump_stats' hook: parses one dumped class message and, if
 * its handle maps to a valid queue id, invokes 'cb' with its stats. */
3079 htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3080 const struct ofpbuf *nlmsg,
3081 netdev_dump_queue_stats_cb *cb, void *aux)
3083 struct netdev_queue_stats stats;
3084 unsigned int handle, major, minor;
3087 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3092 major = tc_get_major(handle);
3093 minor = tc_get_minor(handle);
3094 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
3095 (*cb)(minor - 1, &stats, aux);
/* tc_ops vtable for the "linux-htb" QoS type. */
3100 static const struct tc_ops tc_ops_htb = {
3101 "htb", /* linux_name */
3102 "linux-htb", /* ovs_name */
3103 HTB_N_QUEUES, /* n_queues */
3112 htb_class_get_stats,
3113 htb_class_dump_stats
3116 /* "linux-hfsc" traffic control class. */
/* Maximum number of queues exposed by linux-hfsc (same scheme as HTB). */
3118 #define HFSC_N_QUEUES 0xf000
/* NOTE(review): field of the per-queue state — presumably 'struct
 * hfsc_class'; the struct header is outside this excerpt. */
3126 struct tc_queue tc_queue;
/* Returns the struct hfsc embedded in 'netdev_''s installed tc state.
 * Only valid when the device's qdisc is HFSC. */
3131 static struct hfsc *
3132 hfsc_get__(const struct netdev *netdev_)
3134 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3135 return CONTAINER_OF(netdev->tc, struct hfsc, tc);
/* Downcasts a generic tc_queue to its enclosing hfsc_class. */
3138 static struct hfsc_class *
3139 hfsc_class_cast__(const struct tc_queue *queue)
3141 return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
/* Allocates HFSC bookkeeping state with the given 'max_rate' (bytes/s) and
 * installs it as 'netdev_''s current tc implementation. */
3145 hfsc_install__(struct netdev *netdev_, uint32_t max_rate)
3147 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3150 hfsc = xmalloc(sizeof *hfsc);
3151 tc_init(&hfsc->tc, &tc_ops_hfsc);
3152 hfsc->max_rate = max_rate;
3153 netdev->tc = &hfsc->tc;
/* Updates the cached state for 'queue_id' from 'hc', creating and inserting
 * a new hfsc_class into the queue hmap if the queue was not known yet. */
3157 hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
3158 const struct hfsc_class *hc)
3162 struct hfsc_class *hcp;
3163 struct tc_queue *queue;
3165 hfsc = hfsc_get__(netdev);
3166 hash = hash_int(queue_id, 0);
3168 queue = tc_find_queue__(netdev, queue_id, hash);
3170 hcp = hfsc_class_cast__(queue);
/* Not found: allocate and register a fresh queue record. */
3172 hcp = xmalloc(sizeof *hcp);
3173 queue = &hcp->tc_queue;
3174 queue->queue_id = queue_id;
3175 queue->created = time_msec();
3176 hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
/* Copy in the latest parameters in either case. */
3179 hcp->min_rate = hc->min_rate;
3180 hcp->max_rate = hc->max_rate;
/* Parses HFSC service-curve attributes from 'nl_options' into 'class'. Only
 * linear service curves (m1 == 0, d == 0) with matching real-time and
 * link-share curves are supported; anything else is rejected with a warning. */
3184 hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
3186 const struct tc_service_curve *rsc, *fsc, *usc;
3187 static const struct nl_policy tca_hfsc_policy[] = {
3189 .type = NL_A_UNSPEC,
3191 .min_len = sizeof(struct tc_service_curve),
3194 .type = NL_A_UNSPEC,
3196 .min_len = sizeof(struct tc_service_curve),
3199 .type = NL_A_UNSPEC,
3201 .min_len = sizeof(struct tc_service_curve),
3204 struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];
3206 if (!nl_parse_nested(nl_options, tca_hfsc_policy,
3207 attrs, ARRAY_SIZE(tca_hfsc_policy))) {
3208 VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
/* rsc = real-time, fsc = link-share, usc = upper-limit service curve. */
3212 rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
3213 fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
3214 usc = nl_attr_get(attrs[TCA_HFSC_USC]);
3216 if (rsc->m1 != 0 || rsc->d != 0 ||
3217 fsc->m1 != 0 || fsc->d != 0 ||
3218 usc->m1 != 0 || usc->d != 0) {
3219 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3220 "Non-linear service curves are not supported.")
3224 if (rsc->m2 != fsc->m2) {
3225 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3226 "Real-time service curves are not supported ");
3230 if (rsc->m2 > usc->m2) {
3231 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3232 "Min-rate service curve is greater than "
3233 "the max-rate service curve.");
/* Linear curves: slope m2 of the link-share / upper-limit curve gives the
 * min / max rate respectively. */
3237 class->min_rate = fsc->m2;
3238 class->max_rate = usc->m2;
/* Parses a tc class message in 'tcmsg'. On success fills in '*queue_id'
 * (when non-null) from the class handle, and 'options' (when non-null) from
 * the HFSC attributes. Any output pointer may be NULL to skip it. */
3243 hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
3244 struct hfsc_class *options,
3245 struct netdev_queue_stats *stats)
3248 unsigned int handle;
3249 struct nlattr *nl_options;
3251 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
3257 unsigned int major, minor;
3259 major = tc_get_major(handle);
3260 minor = tc_get_minor(handle);
/* OVS queue N is tc class 1:(N+1); only accept handles in that range. */
3261 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3262 *queue_id = minor - 1;
3269 error = hfsc_parse_tca_options__(nl_options, options);
/* Queries the kernel for tc class 'handle' under 'parent' on 'netdev' and
 * parses the reply into 'options' and/or 'stats' (either may be NULL). */
3276 hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
3277 unsigned int parent, struct hfsc_class *options,
3278 struct netdev_queue_stats *stats)
3281 struct ofpbuf *reply;
3283 error = tc_query_class(netdev, handle, parent, &reply);
3288 error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
3289 ofpbuf_delete(reply);
/* Extracts qdisc-level HFSC settings from 'details'. "max-rate" is given in
 * bits/s and converted to bytes/s; when absent or zero, defaults to the
 * link speed (100 Mbps fallback). min_rate is pinned to max_rate for the
 * root class. */
3294 hfsc_parse_qdisc_details__(struct netdev *netdev_, const struct smap *details,
3295 struct hfsc_class *class)
3297 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3299 const char *max_rate_s;
3301 max_rate_s = smap_get(details, "max-rate");
3302 max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
3305 enum netdev_features current;
3307 netdev_linux_read_features(netdev);
3308 current = !netdev->get_features_error ? netdev->current : 0;
3309 max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
3312 class->min_rate = max_rate;
3313 class->max_rate = max_rate;
/* Extracts per-queue HFSC settings from 'details' into 'class'. Rates are
 * given in bits/s and converted to bytes/s; min_rate is clamped to
 * [1, qdisc max_rate] and max_rate to [min_rate, qdisc max_rate]. */
3317 hfsc_parse_class_details__(struct netdev *netdev,
3318 const struct smap *details,
3319 struct hfsc_class * class)
3321 const struct hfsc *hfsc;
3322 uint32_t min_rate, max_rate;
3323 const char *min_rate_s, *max_rate_s;
3325 hfsc = hfsc_get__(netdev);
3326 min_rate_s = smap_get(details, "min-rate");
3327 max_rate_s = smap_get(details, "max-rate");
3329 min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
3330 min_rate = MAX(min_rate, 1);
3331 min_rate = MIN(min_rate, hfsc->max_rate);
3333 max_rate = (max_rate_s
3334 ? strtoull(max_rate_s, NULL, 10) / 8
3336 max_rate = MAX(max_rate, min_rate);
3337 max_rate = MIN(max_rate, hfsc->max_rate);
3339 class->min_rate = min_rate;
3340 class->max_rate = max_rate;
3345 /* Create an HFSC qdisc.
3347 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
3349 hfsc_setup_qdisc__(struct netdev * netdev)
3351 struct tcmsg *tcmsg;
3352 struct ofpbuf request;
3353 struct tc_hfsc_qopt opt;
/* Remove any existing root qdisc before installing the new one. */
3355 tc_del_qdisc(netdev);
3357 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3358 NLM_F_EXCL | NLM_F_CREATE, &request);
/* Root qdisc handle 1:0. */
3364 tcmsg->tcm_handle = tc_make_handle(1, 0);
3365 tcmsg->tcm_parent = TC_H_ROOT;
3367 memset(&opt, 0, sizeof opt);
3370 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3371 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
3373 return tc_transact(&request, NULL);
3376 /* Create an HFSC class.
3378 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
3379 * sc rate <min_rate> ul rate <max_rate>" */
3381 hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
3382 unsigned int parent, struct hfsc_class *class)
3386 struct tcmsg *tcmsg;
3387 struct ofpbuf request;
3388 struct tc_service_curve min, max;
3390 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
3396 tcmsg->tcm_handle = handle;
3397 tcmsg->tcm_parent = parent;
/* Linear service curves: slope m2 carries the rate. */
3401 min.m2 = class->min_rate;
3405 max.m2 = class->max_rate;
3407 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3408 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
/* 'min' is used for both the real-time (RSC) and link-share (FSC) curves;
 * 'max' is the upper-limit (USC) curve. */
3409 nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
3410 nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
3411 nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
3412 nl_msg_end_nested(&request, opt_offset);
3414 error = tc_transact(&request, NULL);
3416 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
3417 "min-rate %ubps, max-rate %ubps (%s)",
3418 netdev_get_name(netdev),
3419 tc_get_major(handle), tc_get_minor(handle),
3420 tc_get_major(parent), tc_get_minor(parent),
3421 class->min_rate, class->max_rate, ovs_strerror(error));
/* tc_ops 'tc_install' hook: creates the root HFSC qdisc, sets up the
 * default class 1:fffe from 'details', then records the HFSC state. */
3428 hfsc_tc_install(struct netdev *netdev, const struct smap *details)
3431 struct hfsc_class class;
3433 error = hfsc_setup_qdisc__(netdev);
3439 hfsc_parse_qdisc_details__(netdev, details, &class);
3440 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3441 tc_make_handle(1, 0), &class);
3447 hfsc_install__(netdev, class.max_rate);
/* tc_ops 'tc_load' hook: adopts an existing kernel HFSC qdisc by reading the
 * default class (1:fffe) for max_rate, then dumping all classes into the
 * queue table. */
3452 hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3455 struct nl_dump dump;
3456 struct hfsc_class hc;
3459 hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
3460 hfsc_install__(netdev, hc.max_rate);
3462 if (!start_queue_dump(netdev, &dump)) {
3466 while (nl_dump_next(&dump, &msg)) {
3467 unsigned int queue_id;
3469 if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3470 hfsc_update_queue__(netdev, queue_id, &hc);
3474 nl_dump_done(&dump);
/* tc_ops 'tc_destroy' hook: frees every cached hfsc_class and the hfsc. */
3479 hfsc_tc_destroy(struct tc *tc)
3482 struct hfsc_class *hc, *next;
3484 hfsc = CONTAINER_OF(tc, struct hfsc, tc);
3486 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
3487 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
/* tc_ops 'qdisc_get' hook: reports "max-rate" in bits/s (stored bytes/s). */
3496 hfsc_qdisc_get(const struct netdev *netdev, struct smap *details)
3498 const struct hfsc *hfsc;
3499 hfsc = hfsc_get__(netdev);
3500 smap_add_format(details, "max-rate", "%llu", 8ULL * hfsc->max_rate);
/* tc_ops 'qdisc_set' hook: reconfigures the default class 1:fffe from
 * 'details' and, on success, updates the cached max_rate. */
3505 hfsc_qdisc_set(struct netdev *netdev, const struct smap *details)
3508 struct hfsc_class class;
3510 hfsc_parse_qdisc_details__(netdev, details, &class);
3511 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3512 tc_make_handle(1, 0), &class);
3515 hfsc_get__(netdev)->max_rate = class.max_rate;
/* tc_ops 'class_get' hook: reports queue rates in bits/s (stored bytes/s);
 * "max-rate" is emitted only when it differs from "min-rate". */
3522 hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
3523 const struct tc_queue *queue, struct smap *details)
3525 const struct hfsc_class *hc;
3527 hc = hfsc_class_cast__(queue);
3528 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
3529 if (hc->min_rate != hc->max_rate) {
3530 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
/* tc_ops 'class_set' hook: parses 'details', programs kernel class
 * 1:(queue_id+1) under parent 1:fffe, and updates the local cache. */
3536 hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
3537 const struct smap *details)
3540 struct hfsc_class class;
3542 error = hfsc_parse_class_details__(netdev, details, &class);
3547 error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3548 tc_make_handle(1, 0xfffe), &class);
3553 hfsc_update_queue__(netdev, queue_id, &class);
/* tc_ops 'class_delete' hook: deletes the kernel class for 'queue' and, on
 * success, removes it from the cached queue table. */
3558 hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
3562 struct hfsc_class *hc;
3564 hc = hfsc_class_cast__(queue);
3565 hfsc = hfsc_get__(netdev);
3567 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3569 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
/* tc_ops 'class_get_stats' hook: fetches kernel stats for a single queue. */
3576 hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3577 struct netdev_queue_stats *stats)
3579 return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3580 tc_make_handle(1, 0xfffe), NULL, stats);
/* tc_ops 'class_dump_stats' hook: parses one dumped class message and, if
 * its handle maps to a valid queue id, invokes 'cb' with its stats. */
3584 hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3585 const struct ofpbuf *nlmsg,
3586 netdev_dump_queue_stats_cb *cb, void *aux)
3588 struct netdev_queue_stats stats;
3589 unsigned int handle, major, minor;
3592 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3597 major = tc_get_major(handle);
3598 minor = tc_get_minor(handle);
3599 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3600 (*cb)(minor - 1, &stats, aux);
/* tc_ops vtable for the "linux-hfsc" QoS type. */
3605 static const struct tc_ops tc_ops_hfsc = {
3606 "hfsc", /* linux_name */
3607 "linux-hfsc", /* ovs_name */
3608 HFSC_N_QUEUES, /* n_queues */
3609 hfsc_tc_install, /* tc_install */
3610 hfsc_tc_load, /* tc_load */
3611 hfsc_tc_destroy, /* tc_destroy */
3612 hfsc_qdisc_get, /* qdisc_get */
3613 hfsc_qdisc_set, /* qdisc_set */
3614 hfsc_class_get, /* class_get */
3615 hfsc_class_set, /* class_set */
3616 hfsc_class_delete, /* class_delete */
3617 hfsc_class_get_stats, /* class_get_stats */
3618 hfsc_class_dump_stats /* class_dump_stats */
3621 /* "linux-default" traffic control class.
3623 * This class represents the default, unnamed Linux qdisc. It corresponds to
3624 * the "" (empty string) QoS type in the OVS database. */
/* Points 'netdev_' at the shared, immutable default tc singleton. */
3627 default_install__(struct netdev *netdev_)
3629 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3630 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_default);
3632 /* Nothing but a tc class implementation is allowed to write to a tc. This
3633 * class never does that, so we can legitimately use a const tc object. */
3634 netdev->tc = CONST_CAST(struct tc *, &tc);
/* tc_ops 'tc_install' hook for "": just records the default singleton. */
3638 default_tc_install(struct netdev *netdev,
3639 const struct smap *details OVS_UNUSED)
3641 default_install__(netdev);
/* tc_ops 'tc_load' hook for "": same — no kernel state to read. */
3646 default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3648 default_install__(netdev);
/* tc_ops vtable for the "" (default) QoS type; all optional hooks unset. */
3652 static const struct tc_ops tc_ops_default = {
3653 NULL, /* linux_name */
3658 NULL, /* tc_destroy */
3659 NULL, /* qdisc_get */
3660 NULL, /* qdisc_set */
3661 NULL, /* class_get */
3662 NULL, /* class_set */
3663 NULL, /* class_delete */
3664 NULL, /* class_get_stats */
3665 NULL /* class_dump_stats */
3668 /* "linux-other" traffic control class.
/* tc_ops 'tc_load' hook for "linux-other": records a shared, immutable
 * singleton marking a qdisc that OVS recognizes but does not manage. */
3673 other_tc_load(struct netdev *netdev_, struct ofpbuf *nlmsg OVS_UNUSED)
3675 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3676 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_other);
3678 /* Nothing but a tc class implementation is allowed to write to a tc. This
3679 * class never does that, so we can legitimately use a const tc object. */
3680 netdev->tc = CONST_CAST(struct tc *, &tc);
/* tc_ops vtable for "linux-other"; read-only, all optional hooks unset. */
3684 static const struct tc_ops tc_ops_other = {
3685 NULL, /* linux_name */
3686 "linux-other", /* ovs_name */
3688 NULL, /* tc_install */
3690 NULL, /* tc_destroy */
3691 NULL, /* qdisc_get */
3692 NULL, /* qdisc_set */
3693 NULL, /* class_get */
3694 NULL, /* class_set */
3695 NULL, /* class_delete */
3696 NULL, /* class_get_stats */
3697 NULL /* class_dump_stats */
3700 /* Traffic control. */
3702 /* Number of kernel "tc" ticks per second. */
/* Initialized from /proc/net/psched (see the psched reader further below). */
3703 static double ticks_per_s;
3705 /* Number of kernel "jiffies" per second. This is used for the purpose of
3706 * computing buffer sizes. Generally kernel qdiscs need to be able to buffer
3707 * one jiffy's worth of data.
3709 * There are two possibilities here:
3711 * - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
3712 * approximate range of 100 to 1024. That means that we really need to
3713 * make sure that the qdisc can buffer that much data.
3715 * - 'buffer_hz' is an absurdly large number. That means that the kernel
3716 * has finely granular timers and there's no need to fudge additional room
3717 * for buffers. (There's no extra effort needed to implement that: the
3718 * large 'buffer_hz' is used as a divisor, so practically any number will
3719 * come out as 0 in the division. Small integer results in the case of
3720 * really high dividends won't have any real effect anyhow.)
3722 static unsigned int buffer_hz;
3724 /* Returns tc handle 'major':'minor'. */
3726 tc_make_handle(unsigned int major, unsigned int minor)
3728 return TC_H_MAKE(major << 16, minor);
3731 /* Returns the major number from 'handle'. */
3733 tc_get_major(unsigned int handle)
3735 return TC_H_MAJ(handle) >> 16;
3738 /* Returns the minor number from 'handle'. */
3740 tc_get_minor(unsigned int handle)
3742 return TC_H_MIN(handle);
/* Initializes 'request' as an rtnetlink tc message of the given 'type'
 * (e.g. RTM_NEWQDISC) with NLM_F_REQUEST plus 'flags', targeting 'netdev'
 * by ifindex, and returns the embedded tcmsg for the caller to finish
 * filling in (tcm_handle and tcm_parent in particular). */
3745 static struct tcmsg *
3746 tc_make_request(const struct netdev *netdev, int type, unsigned int flags,
3747 struct ofpbuf *request)
3749 struct tcmsg *tcmsg;
3753 error = get_ifindex(netdev, &ifindex);
3758 ofpbuf_init(request, 512);
3759 nl_msg_put_nlmsghdr(request, sizeof *tcmsg, type, NLM_F_REQUEST | flags);
3760 tcmsg = ofpbuf_put_zeros(request, sizeof *tcmsg);
3761 tcmsg->tcm_family = AF_UNSPEC;
3762 tcmsg->tcm_ifindex = ifindex;
3763 /* Caller should fill in tcmsg->tcm_handle. */
3764 /* Caller should fill in tcmsg->tcm_parent. */
/* Sends 'request' on the NETLINK_ROUTE socket (optionally capturing the
 * reply in '*replyp'), frees the request buffer, and returns the
 * transaction's errno-style result. */
3770 tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
3772 int error = nl_transact(NETLINK_ROUTE, request, replyp);
3773 ofpbuf_uninit(request);
3777 /* Adds or deletes a root ingress qdisc on 'netdev'. We use this for
3778 * policing configuration.
3780 * This function is equivalent to running the following when 'add' is true:
3781 * /sbin/tc qdisc add dev <devname> handle ffff: ingress
3783 * This function is equivalent to running the following when 'add' is false:
3784 * /sbin/tc qdisc del dev <devname> handle ffff: ingress
3786 * The configuration and stats may be seen with the following command:
3787 * /sbin/tc -s qdisc show dev <devname>
3789 * Returns 0 if successful, otherwise a positive errno value.
3792 tc_add_del_ingress_qdisc(struct netdev *netdev, bool add)
3794 struct ofpbuf request;
3795 struct tcmsg *tcmsg;
3797 int type = add ? RTM_NEWQDISC : RTM_DELQDISC;
3798 int flags = add ? NLM_F_EXCL | NLM_F_CREATE : 0;
3800 tcmsg = tc_make_request(netdev, type, flags, &request);
/* Ingress qdiscs always live at handle ffff: under TC_H_INGRESS. */
3804 tcmsg->tcm_handle = tc_make_handle(0xffff, 0);
3805 tcmsg->tcm_parent = TC_H_INGRESS;
3806 nl_msg_put_string(&request, TCA_KIND, "ingress");
3807 nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0);
3809 error = tc_transact(&request, NULL);
3811 /* If we're deleting the qdisc, don't worry about some of the
3812 * error conditions. */
3813 if (!add && (error == ENOENT || error == EINVAL)) {
3822 /* Adds a policer to 'netdev' with a rate of 'kbits_rate' and a burst size
3825 * This function is equivalent to running:
3826 * /sbin/tc filter add dev <devname> parent ffff: protocol all prio 49
3827 * basic police rate <kbits_rate>kbit burst <kbits_burst>k
3830 * The configuration and stats may be seen with the following command:
3831 * /sbin/tc -s filter show <devname> eth0 parent ffff:
3833 * Returns 0 if successful, otherwise a positive errno value.
3836 tc_add_policer(struct netdev *netdev, int kbits_rate, int kbits_burst)
3838 struct tc_police tc_police;
3839 struct ofpbuf request;
3840 struct tcmsg *tcmsg;
3841 size_t basic_offset;
3842 size_t police_offset;
/* Drop packets exceeding the rate; convert kbits/s to bytes/s and the
 * kbit burst to a tick-based bucket size. */
3846 memset(&tc_police, 0, sizeof tc_police);
3847 tc_police.action = TC_POLICE_SHOT;
3848 tc_police.mtu = mtu;
3849 tc_fill_rate(&tc_police.rate, (kbits_rate * 1000)/8, mtu);
3850 tc_police.burst = tc_bytes_to_ticks(tc_police.rate.rate,
3851 kbits_burst * 1024);
3853 tcmsg = tc_make_request(netdev, RTM_NEWTFILTER,
3854 NLM_F_EXCL | NLM_F_CREATE, &request);
/* Attach the filter to the ingress qdisc (ffff:), priority 49, all
 * protocols. */
3858 tcmsg->tcm_parent = tc_make_handle(0xffff, 0);
3859 tcmsg->tcm_info = tc_make_handle(49,
3860 (OVS_FORCE uint16_t) htons(ETH_P_ALL));
3862 nl_msg_put_string(&request, TCA_KIND, "basic");
3863 basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3864 police_offset = nl_msg_start_nested(&request, TCA_BASIC_POLICE);
3865 nl_msg_put_unspec(&request, TCA_POLICE_TBF, &tc_police, sizeof tc_police);
3866 tc_put_rtab(&request, TCA_POLICE_RATE, &tc_police.rate);
3867 nl_msg_end_nested(&request, police_offset);
3868 nl_msg_end_nested(&request, basic_offset);
3870 error = tc_transact(&request, NULL);
3881 /* The values in psched are not individually very meaningful, but they are
3882 * important. The tables below show some values seen in the wild.
3886 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
3887 * (Before that, there are hints that it was 1000000000.)
3889 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
3893 * -----------------------------------
3894 * [1] 000c8000 000f4240 000f4240 00000064
3895 * [2] 000003e8 00000400 000f4240 3b9aca00
3896 * [3] 000003e8 00000400 000f4240 3b9aca00
3897 * [4] 000003e8 00000400 000f4240 00000064
3898 * [5] 000003e8 00000040 000f4240 3b9aca00
3899 * [6] 000003e8 00000040 000f4240 000000f9
3901 * a b c d ticks_per_s buffer_hz
3902 * ------- --------- ---------- ------------- ----------- -------------
3903 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
3904 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3905 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3906 * [4] 1,000 1,024 1,000,000 100 976,562 100
3907 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
3908 * [6] 1,000 64 1,000,000 249 15,625,000 249
3910 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
3911 * [2] 2.6.26-1-686-bigmem from Debian lenny
3912 * [3] 2.6.26-2-sparc64 from Debian lenny
3913 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
3914 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
3915 * [6] 2.6.34 from kernel.org on KVM
/* Lazily reads the four hex parameters from /proc/net/psched (once per
 * process) and derives 'ticks_per_s' from them as a * c / b.  On any
 * failure the warnings below are logged and presumably defaults remain in
 * effect -- the fallback path is elided from this excerpt; confirm against
 * the full file. */
3917 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
3918 static const char fn[] = "/proc/net/psched";
3919 unsigned int a, b, c, d;
/* Only the first caller does the work; later callers return immediately. */
3922 if (!ovsthread_once_start(&once)) {
3929 stream = fopen(fn, "r");
3931 VLOG_WARN("%s: open failed: %s", fn, ovs_strerror(errno));
/* The file contains exactly four hex words; anything else is a format
 * we do not understand. */
3935 if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
3936 VLOG_WARN("%s: read failed", fn);
3940 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
3944 VLOG_WARN("%s: invalid scheduler parameters", fn);
/* See the tables above for the values of a, b, c observed in the wild. */
3948 ticks_per_s = (double) a * c / b;
3952 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
3955 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);
3958 ovsthread_once_done(&once);
3961 /* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
3962 * rate of 'rate' bytes per second. */
3964 tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
/* 'ticks_per_s' is derived from /proc/net/psched; see read_psched(). */
3967 return (rate * ticks) / ticks_per_s;
3970 /* Returns the number of ticks that it would take to transmit 'size' bytes at a
3971 * rate of 'rate' bytes per second. */
3973 tc_bytes_to_ticks(unsigned int rate, unsigned int size)
/* Guard against division by zero when 'rate' is 0: report 0 ticks. */
3976 return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
3979 /* Returns the number of bytes that need to be reserved for qdisc buffering at
3980 * a transmission rate of 'rate' bytes per second. */
3982 tc_buffer_per_jiffy(unsigned int rate)
/* 'buffer_hz' comes from /proc/net/psched; see read_psched() above. */
3985 return rate / buffer_hz;
3988 /* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
3989 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
3990 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
3991 * stores NULL into it if it is absent.
3993 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
3996 * Returns 0 if successful, otherwise a positive errno value. */
3998 tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
3999 struct nlattr **options)
/* TCA_KIND is mandatory; TCA_OPTIONS may legitimately be absent. */
4001 static const struct nl_policy tca_policy[] = {
4002 [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
4003 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
4005 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
/* Attributes follow the netlink header plus the fixed-size tcmsg. */
4007 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
4008 tca_policy, ta, ARRAY_SIZE(ta))) {
4009 VLOG_WARN_RL(&rl, "failed to parse qdisc message");
4014 *kind = nl_attr_get_string(ta[TCA_KIND]);
/* ta[TCA_OPTIONS] is NULL when the attribute was absent, which matches
 * the contract above. */
4018 *options = ta[TCA_OPTIONS];
4033 /* Given Netlink 'msg' that describes a class, extracts the queue ID (e.g. the
4034 * minor number of its class ID) into '*queue_id', its TCA_OPTIONS attribute
4035 * into '*options', and its queue statistics into '*stats'. Any of the output
4036 * arguments may be null.
4038 * Returns 0 if successful, otherwise a positive errno value. */
4040 tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
4041 struct nlattr **options, struct netdev_queue_stats *stats)
4043 static const struct nl_policy tca_policy[] = {
4044 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
4045 [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
4047 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
4049 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
4050 tca_policy, ta, ARRAY_SIZE(ta))) {
4051 VLOG_WARN_RL(&rl, "failed to parse class message");
/* The class handle lives in the fixed tcmsg header, not an attribute. */
4056 struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
4057 *handlep = tc->tcm_handle;
4061 *options = ta[TCA_OPTIONS];
4065 const struct gnet_stats_queue *gsq;
4066 struct gnet_stats_basic gsb;
4068 static const struct nl_policy stats_policy[] = {
4069 [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
4070 .min_len = sizeof gsb },
4071 [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
4072 .min_len = sizeof *gsq },
4074 struct nlattr *sa[ARRAY_SIZE(stats_policy)];
/* TCA_STATS2 itself nests the basic and queue statistics. */
4076 if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
4077 sa, ARRAY_SIZE(sa))) {
4078 VLOG_WARN_RL(&rl, "failed to parse class stats");
4082 /* Alignment issues screw up the length of struct gnet_stats_basic on
4083 * some arch/bitsize combinations. Newer versions of Linux have a
4084 * struct gnet_stats_basic_packed, but we can't depend on that. The
4085 * easiest thing to do is just to make a copy. */
4086 memset(&gsb, 0, sizeof gsb);
4087 memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
4088 MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
4089 stats->tx_bytes = gsb.bytes;
4090 stats->tx_packets = gsb.packets;
/* The queue's drop counter is the closest thing to a tx error count
 * available here. */
4092 gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
4093 stats->tx_errors = gsq->drops;
/* Error path (context elided in this excerpt): zero out the caller's
 * stats so it never reads stale data. */
4103 memset(stats, 0, sizeof *stats);
4108 /* Queries the kernel for class with identifier 'handle' and parent 'parent'
4111 tc_query_class(const struct netdev *netdev,
4112 unsigned int handle, unsigned int parent,
4113 struct ofpbuf **replyp)
4115 struct ofpbuf request;
4116 struct tcmsg *tcmsg;
/* NLM_F_ECHO asks the kernel to send the class back in the reply, which
 * tc_transact() stores in '*replyp'. */
4119 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO, &request);
4123 tcmsg->tcm_handle = handle;
4124 tcmsg->tcm_parent = parent;
4126 error = tc_transact(&request, replyp);
/* Log failures with the human-readable major:minor form of both handles. */
4128 VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
4129 netdev_get_name(netdev),
4130 tc_get_major(handle), tc_get_minor(handle),
4131 tc_get_major(parent), tc_get_minor(parent),
4132 ovs_strerror(error));
4137 /* Equivalent to "tc class del dev <name> handle <handle>". */
4139 tc_delete_class(const struct netdev *netdev, unsigned int handle)
4141 struct ofpbuf request;
4142 struct tcmsg *tcmsg;
4145 tcmsg = tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
4149 tcmsg->tcm_handle = handle;
/* Parent 0: let the kernel locate the class by handle alone. */
4150 tcmsg->tcm_parent = 0;
4152 error = tc_transact(&request, NULL);
4154 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
4155 netdev_get_name(netdev),
4156 tc_get_major(handle), tc_get_minor(handle),
4157 ovs_strerror(error));
4162 /* Equivalent to "tc qdisc del dev <name> root". */
4164 tc_del_qdisc(struct netdev *netdev_)
4166 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4167 struct ofpbuf request;
4168 struct tcmsg *tcmsg;
4171 tcmsg = tc_make_request(netdev_, RTM_DELQDISC, 0, &request);
/* Handle 1:0 is the handle this module gives the qdiscs it creates; see
 * the comment in tc_query_qdisc() below. */
4175 tcmsg->tcm_handle = tc_make_handle(1, 0);
4176 tcmsg->tcm_parent = TC_H_ROOT;
4178 error = tc_transact(&request, NULL);
4179 if (error == EINVAL) {
4180 /* EINVAL probably means that the default qdisc was in use, in which
4181 * case we've accomplished our purpose. */
/* On success, drop the cached tc state so the next query re-reads it. */
4184 if (!error && netdev->tc) {
4185 if (netdev->tc->ops->tc_destroy) {
4186 netdev->tc->ops->tc_destroy(netdev->tc);
4193 /* If 'netdev''s qdisc type and parameters are not yet known, queries the
4194 * kernel to determine what they are. Returns 0 if successful, otherwise a
4195 * positive errno value. */
4197 tc_query_qdisc(const struct netdev *netdev_)
4199 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4200 struct ofpbuf request, *qdisc;
4201 const struct tc_ops *ops;
4202 struct tcmsg *tcmsg;
4210 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
4211 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
4212 * 2.6.35 without that fix backported to it.
4214 * To avoid the OOPS, we must not make a request that would attempt to dump
4215 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
4216 * few others. There are a few ways that I can see to do this, but most of
4217 * them seem to be racy (and if you lose the race the kernel OOPSes). The
4218 * technique chosen here is to assume that any non-default qdisc that we
4219 * create will have a class with handle 1:0. The built-in qdiscs only have
4220 * a class with handle 0:0.
4222 * We could check for Linux 2.6.35+ and use a more straightforward method
4224 tcmsg = tc_make_request(netdev_, RTM_GETQDISC, NLM_F_ECHO, &request);
4228 tcmsg->tcm_handle = tc_make_handle(1, 0);
4229 tcmsg->tcm_parent = 0;
4231 /* Figure out what tc class to instantiate. */
4232 error = tc_transact(&request, &qdisc);
4236 error = tc_parse_qdisc(qdisc, &kind, NULL);
/* If the qdisc reply cannot be parsed, fall back to the opaque "other"
 * handler rather than failing outright. */
4238 ops = &tc_ops_other;
4240 ops = tc_lookup_linux_name(kind);
4242 static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
4243 VLOG_INFO_RL(&rl2, "unknown qdisc \"%s\"", kind);
/* A qdisc kind this module has no ops for is also treated as "other". */
4245 ops = &tc_ops_other;
4248 } else if (error == ENOENT) {
4249 /* Either it's a built-in qdisc, or it's a qdisc set up by some
4250 * other entity that doesn't have a handle 1:0. We will assume
4251 * that it's the system default qdisc. */
4252 ops = &tc_ops_default;
4255 /* Who knows? Maybe the device got deleted. */
4256 VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
4257 netdev_get_name(netdev_), ovs_strerror(error));
4258 ops = &tc_ops_other;
4261 /* Instantiate it. */
4262 load_error = ops->tc_load(CONST_CAST(struct netdev *, netdev_), qdisc);
/* tc_load must set netdev->tc exactly when it succeeds. */
4263 ovs_assert((load_error == 0) == (netdev->tc != NULL));
4264 ofpbuf_delete(qdisc);
/* Prefer reporting the query error; otherwise report the load error. */
4266 return error ? error : load_error;
4269 /* Linux traffic control uses tables with 256 entries ("rtab" tables) to
4270 approximate the time to transmit packets of various lengths. For an MTU of
4271 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
4272 represents two possible packet lengths; for a MTU of 513 through 1024, four
4273 possible lengths; and so on.
4275 Returns, for the specified 'mtu', the number of bits that packet lengths
4276 need to be shifted right to fit within such a 256-entry table. */
4278 tc_calc_cell_log(unsigned int mtu)
/* An MTU of 0 means "unknown"; assume a standard Ethernet payload. */
4283 mtu = ETH_PAYLOAD_MAX;
/* Account for the Ethernet header plus a VLAN tag on top of the MTU. */
4285 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
/* Smallest shift that brings 'mtu' below 256 table entries. */
4287 for (cell_log = 0; mtu >= 256; cell_log++) {
4294 /* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
4297 tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
4299 memset(rate, 0, sizeof *rate);
4300 rate->cell_log = tc_calc_cell_log(mtu);
4301 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
4302 /* rate->cell_align = 0; */ /* distro headers. */
/* mpu = minimum packet unit: no frame is billed below the Ethernet
 * minimum frame size. */
4303 rate->mpu = ETH_TOTAL_MIN;
4307 /* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
4308 * attribute of the specified "type".
4310 * See tc_calc_cell_log() above for a description of "rtab"s. */
4312 tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
4317 rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
4318 for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
/* Entry i covers packet sizes up to (i + 1) << cell_log bytes. */
4319 unsigned packet_size = (i + 1) << rate->cell_log;
/* Never bill below the minimum packet unit. */
4320 if (packet_size < rate->mpu) {
4321 packet_size = rate->mpu;
/* Each entry is the transmission time, in ticks, for that size. */
4323 rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
4327 /* Calculates the proper value of 'buffer' or 'cbuffer' in HTB options given a
4328 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
4329 * burst size of 'burst_bytes'. (If no value was requested, a 'burst_bytes' of
4332 tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
/* The burst must cover at least one jiffy of traffic plus one MTU,
 * otherwise HTB cannot sustain the configured rate. */
4334 unsigned int min_burst = tc_buffer_per_jiffy(Bps) + mtu;
4335 return tc_bytes_to_ticks(Bps, MAX(burst_bytes, min_burst));
4338 /* Linux-only functions declared in netdev-linux.h */
4340 /* Modifies the 'flag' bit in ethtool's flags field for 'netdev'. If
4341 * 'enable' is true, the bit is set. Otherwise, it is cleared. */
4343 netdev_linux_ethtool_set_flag(struct netdev *netdev, uint32_t flag,
4344 const char *flag_name, bool enable)
4346 const char *netdev_name = netdev_get_name(netdev);
4347 struct ethtool_value evalue;
/* Step 1: read the current ethtool flags. */
4351 COVERAGE_INC(netdev_get_ethtool);
4352 memset(&evalue, 0, sizeof evalue);
4353 error = netdev_linux_do_ethtool(netdev_name,
4354 (struct ethtool_cmd *)&evalue,
4355 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
/* Step 2: write the flags back with 'flag' set or cleared. */
4360 COVERAGE_INC(netdev_set_ethtool);
4361 evalue.data = new_flags = (evalue.data & ~flag) | (enable ? flag : 0);
4362 error = netdev_linux_do_ethtool(netdev_name,
4363 (struct ethtool_cmd *)&evalue,
4364 ETHTOOL_SFLAGS, "ETHTOOL_SFLAGS");
/* Step 3: read the flags again to verify the change took effect -- some
 * drivers silently ignore unsupported flag changes. */
4369 COVERAGE_INC(netdev_get_ethtool);
4370 memset(&evalue, 0, sizeof evalue);
4371 error = netdev_linux_do_ethtool(netdev_name,
4372 (struct ethtool_cmd *)&evalue,
4373 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
4378 if (new_flags != evalue.data) {
4379 VLOG_WARN_RL(&rl, "attempt to %s ethtool %s flag on network "
4380 "device %s failed", enable ? "enable" : "disable",
4381 flag_name, netdev_name);
4388 /* Utility functions. */
4390 /* Copies 'src' into 'dst', performing format conversion in the process. */
/* Field-by-field widening copy from the kernel's 32-bit rtnl_link_stats
 * counters into OVS's netdev_stats.  Fields of netdev_stats that have no
 * rtnl counterpart are not written here. */
4392 netdev_stats_from_rtnl_link_stats(struct netdev_stats *dst,
4393 const struct rtnl_link_stats *src)
4395 dst->rx_packets = src->rx_packets;
4396 dst->tx_packets = src->tx_packets;
4397 dst->rx_bytes = src->rx_bytes;
4398 dst->tx_bytes = src->tx_bytes;
4399 dst->rx_errors = src->rx_errors;
4400 dst->tx_errors = src->tx_errors;
4401 dst->rx_dropped = src->rx_dropped;
4402 dst->tx_dropped = src->tx_dropped;
4403 dst->multicast = src->multicast;
4404 dst->collisions = src->collisions;
4405 dst->rx_length_errors = src->rx_length_errors;
4406 dst->rx_over_errors = src->rx_over_errors;
4407 dst->rx_crc_errors = src->rx_crc_errors;
4408 dst->rx_frame_errors = src->rx_frame_errors;
4409 dst->rx_fifo_errors = src->rx_fifo_errors;
4410 dst->rx_missed_errors = src->rx_missed_errors;
4411 dst->tx_aborted_errors = src->tx_aborted_errors;
4412 dst->tx_carrier_errors = src->tx_carrier_errors;
4413 dst->tx_fifo_errors = src->tx_fifo_errors;
4414 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
4415 dst->tx_window_errors = src->tx_window_errors;
/* Retrieves interface statistics for 'netdev_' into 'stats' by sending an
 * RTM_GETLINK request over an rtnetlink socket and extracting the
 * IFLA_STATS attribute from the reply.  Returns 0 on success, otherwise a
 * positive errno value (error paths partly elided in this excerpt). */
4419 get_stats_via_netlink(const struct netdev *netdev_, struct netdev_stats *stats)
4421 struct ofpbuf request;
4422 struct ofpbuf *reply;
4425 ofpbuf_init(&request, 0);
4426 nl_msg_put_nlmsghdr(&request,
4427 sizeof(struct ifinfomsg) + NL_ATTR_SIZE(IFNAMSIZ),
4428 RTM_GETLINK, NLM_F_REQUEST);
4429 ofpbuf_put_zeros(&request, sizeof(struct ifinfomsg));
/* Identify the interface by name rather than ifindex. */
4430 nl_msg_put_string(&request, IFLA_IFNAME, netdev_get_name(netdev_));
4431 error = nl_transact(NETLINK_ROUTE, &request, &reply);
4432 ofpbuf_uninit(&request);
/* The reply must contain at least the netlink header and an ifinfomsg
 * before the attributes can be searched. */
4437 if (ofpbuf_try_pull(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg))) {
4438 const struct nlattr *a = nl_attr_find(reply, 0, IFLA_STATS);
4439 if (a && nl_attr_get_size(a) >= sizeof(struct rtnl_link_stats)) {
4440 netdev_stats_from_rtnl_link_stats(stats, nl_attr_get(a));
4443 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
4447 VLOG_WARN_RL(&rl, "short RTM_GETLINK reply");
4452 ofpbuf_delete(reply);
/* Reads the interface flags (IFF_UP etc.) of 'dev' into '*flags' via
 * SIOCGIFFLAGS. */
4457 get_flags(const struct netdev *dev, unsigned int *flags)
4463 error = af_inet_ifreq_ioctl(dev->name, &ifr, SIOCGIFFLAGS, "SIOCGIFFLAGS");
4465 *flags = ifr.ifr_flags;
/* Sets the interface flags of device 'name' to 'flags' via SIOCSIFFLAGS.
 * Returns 0 on success, otherwise a positive errno value. */
4471 set_flags(const char *name, unsigned int flags)
4475 ifr.ifr_flags = flags;
4476 return af_inet_ifreq_ioctl(name, &ifr, SIOCSIFFLAGS, "SIOCSIFFLAGS");
/* Looks up the kernel ifindex for 'netdev_name' via SIOCGIFINDEX.
 * Returns the (positive) ifindex on success; on failure, presumably
 * returns a negated errno value (the error-return line is elided from
 * this excerpt -- see get_ifindex(), which negates a negative result). */
4480 do_get_ifindex(const char *netdev_name)
4485 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4486 COVERAGE_INC(netdev_get_ifindex);
4488 error = af_inet_ioctl(SIOCGIFINDEX, &ifr);
4490 VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
4491 netdev_name, ovs_strerror(error));
4494 return ifr.ifr_ifindex;
/* Returns, via '*ifindexp', the cached ifindex of 'netdev_', querying the
 * kernel on the first call.  Returns 0 on success, otherwise the positive
 * errno recorded at query time. */
4498 get_ifindex(const struct netdev *netdev_, int *ifindexp)
4500 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
/* Populate the cache on first use; both success and failure are cached
 * so the ioctl is not retried on every call. */
4502 if (!(netdev->cache_valid & VALID_IFINDEX)) {
4503 int ifindex = do_get_ifindex(netdev_get_name(netdev_));
/* A negative result from do_get_ifindex() is a negated errno value. */
4506 netdev->get_ifindex_error = -ifindex;
4507 netdev->ifindex = 0;
4509 netdev->get_ifindex_error = 0;
4510 netdev->ifindex = ifindex;
4512 netdev->cache_valid |= VALID_IFINDEX;
4515 *ifindexp = netdev->ifindex;
4516 return netdev->get_ifindex_error;
/* Reads the hardware (Ethernet) address of 'netdev_name' into 'ea' via
 * SIOCGIFHWADDR.  Returns 0 on success, otherwise a positive errno value
 * (some error-return lines elided in this excerpt). */
4520 get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN])
4526 memset(&ifr, 0, sizeof ifr);
4527 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4528 COVERAGE_INC(netdev_get_hwaddr);
4529 error = af_inet_ioctl(SIOCGIFHWADDR, &ifr);
4531 /* ENODEV probably means that a vif disappeared asynchronously and
4532 * hasn't been removed from the database yet, so reduce the log level
4533 * to INFO for that case. */
4534 VLOG(error == ENODEV ? VLL_INFO : VLL_ERR,
4535 "ioctl(SIOCGIFHWADDR) on %s device failed: %s",
4536 netdev_name, ovs_strerror(error));
/* Only Ethernet-style addresses are meaningful to copy out. */
4539 hwaddr_family = ifr.ifr_hwaddr.sa_family;
4540 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER) {
4541 VLOG_WARN("%s device has unknown hardware address family %d",
4542 netdev_name, hwaddr_family);
4544 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
/* Sets the hardware (Ethernet) address of 'netdev_name' to 'mac' via
 * SIOCSIFHWADDR.  Returns 0 on success, otherwise a positive errno value. */
4549 set_etheraddr(const char *netdev_name,
4550 const uint8_t mac[ETH_ADDR_LEN])
4555 memset(&ifr, 0, sizeof ifr);
4556 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
/* The kernel requires the address family to be declared as Ethernet. */
4557 ifr.ifr_hwaddr.sa_family = ARPHRD_ETHER;
4558 memcpy(ifr.ifr_hwaddr.sa_data, mac, ETH_ADDR_LEN);
4559 COVERAGE_INC(netdev_set_hwaddr);
4560 error = af_inet_ioctl(SIOCSIFHWADDR, &ifr);
4562 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
4563 netdev_name, ovs_strerror(error));
/* Issues ethtool command 'cmd' (e.g. ETHTOOL_GFLAGS) for device 'name',
 * using 'ecmd' as the in/out command buffer.  'cmd_name' is used only for
 * logging.  Returns 0 on success, otherwise a positive errno value. */
4569 netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
4570 int cmd, const char *cmd_name)
4575 memset(&ifr, 0, sizeof ifr);
4576 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
/* SIOCETHTOOL passes the ethtool command structure via ifr_data. */
4577 ifr.ifr_data = (caddr_t) ecmd;
4580 error = af_inet_ioctl(SIOCETHTOOL, &ifr);
4582 if (error != EOPNOTSUPP) {
4583 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
4584 "failed: %s", cmd_name, name, ovs_strerror(error));
4586 /* The device doesn't support this operation. That's pretty
4587 * common, so there's no point in logging anything. */
/* Retrieves an IPv4 address of 'netdev' into '*ip' using ioctl 'cmd'
 * (e.g. SIOCGIFADDR); 'cmd_name' is used for logging.  Returns 0 on
 * success, otherwise a positive errno value. */
4594 netdev_linux_get_ipv4(const struct netdev *netdev, struct in_addr *ip,
4595 int cmd, const char *cmd_name)
4600 ifr.ifr_addr.sa_family = AF_INET;
4601 error = af_inet_ifreq_ioctl(netdev_get_name(netdev), &ifr, cmd, cmd_name);
/* ifr_addr is a generic sockaddr; reinterpret it as sockaddr_in.
 * ALIGNED_CAST documents that the alignment is known to be adequate. */
4603 const struct sockaddr_in *sin = ALIGNED_CAST(struct sockaddr_in *,
4605 *ip = sin->sin_addr;
4610 /* Returns an AF_PACKET raw socket or a negative errno value. */
4612 af_packet_sock(void)
4614 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
4617 if (ovsthread_once_start(&once)) {
4618 sock = socket(AF_PACKET, SOCK_RAW, 0);
4620 int error = set_nonblocking(sock);
4627 VLOG_ERR("failed to create packet socket: %s",
4628 ovs_strerror(errno));
4630 ovsthread_once_done(&once);