2 * Copyright (c) 2009, 2010, 2011, 2012, 2013, 2014 Nicira, Inc.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
19 #include "netdev-linux.h"
23 #include <arpa/inet.h>
25 #include <linux/filter.h>
26 #include <linux/gen_stats.h>
27 #include <linux/if_ether.h>
28 #include <linux/if_tun.h>
29 #include <linux/types.h>
30 #include <linux/ethtool.h>
31 #include <linux/mii.h>
32 #include <linux/pkt_cls.h>
33 #include <linux/pkt_sched.h>
34 #include <linux/rtnetlink.h>
35 #include <linux/sockios.h>
36 #include <linux/version.h>
37 #include <sys/types.h>
38 #include <sys/ioctl.h>
39 #include <sys/socket.h>
40 #include <netpacket/packet.h>
42 #include <net/if_arp.h>
43 #include <net/if_packet.h>
44 #include <net/route.h>
45 #include <netinet/in.h>
51 #include "connectivity.h"
53 #include "dpif-linux.h"
54 #include "dynamic-string.h"
55 #include "fatal-signal.h"
58 #include "netdev-provider.h"
59 #include "netdev-vport.h"
60 #include "netlink-notifier.h"
61 #include "netlink-socket.h"
64 #include "openflow/openflow.h"
65 #include "ovs-atomic.h"
67 #include "poll-loop.h"
68 #include "rtnetlink-link.h"
71 #include "socket-util.h"
74 #include "unaligned.h"
77 VLOG_DEFINE_THIS_MODULE(netdev_linux);
79 COVERAGE_DEFINE(netdev_set_policing);
80 COVERAGE_DEFINE(netdev_arp_lookup);
81 COVERAGE_DEFINE(netdev_get_ifindex);
82 COVERAGE_DEFINE(netdev_get_hwaddr);
83 COVERAGE_DEFINE(netdev_set_hwaddr);
84 COVERAGE_DEFINE(netdev_get_ethtool);
85 COVERAGE_DEFINE(netdev_set_ethtool);
88 /* These were introduced in Linux 2.6.14, so they might be missing if we have
90 #ifndef ADVERTISED_Pause
91 #define ADVERTISED_Pause (1 << 13)
93 #ifndef ADVERTISED_Asym_Pause
94 #define ADVERTISED_Asym_Pause (1 << 14)
97 /* These were introduced in Linux 2.6.24, so they might be missing if we
98 * have old headers. */
99 #ifndef ETHTOOL_GFLAGS
100 #define ETHTOOL_GFLAGS 0x00000025 /* Get flags bitmap(ethtool_value) */
102 #ifndef ETHTOOL_SFLAGS
103 #define ETHTOOL_SFLAGS 0x00000026 /* Set flags bitmap(ethtool_value) */
106 /* This was introduced in Linux 2.6.25, so it might be missing if we have old
109 #define TC_RTAB_SIZE 1024
112 /* Linux 2.6.21 introduced struct tpacket_auxdata.
113 * Linux 2.6.27 added the tp_vlan_tci member.
114 * Linux 3.0 defined TP_STATUS_VLAN_VALID.
115 * Linux 3.13 repurposed a padding member for tp_vlan_tpid and defined
116 * TP_STATUS_VLAN_TPID_VALID.
118 * With all this churn it's easiest to unconditionally define a replacement
119 * structure that has everything we want.
121 #ifndef PACKET_AUXDATA
122 #define PACKET_AUXDATA 8
124 #ifndef TP_STATUS_VLAN_VALID
125 #define TP_STATUS_VLAN_VALID (1 << 4)
127 #ifndef TP_STATUS_VLAN_TPID_VALID
128 #define TP_STATUS_VLAN_TPID_VALID (1 << 6)
130 #undef tpacket_auxdata
131 #define tpacket_auxdata rpl_tpacket_auxdata
132 struct tpacket_auxdata {
138 uint16_t tp_vlan_tci;
139 uint16_t tp_vlan_tpid;
143 VALID_IFINDEX = 1 << 0,
144 VALID_ETHERADDR = 1 << 1,
148 VALID_POLICING = 1 << 5,
149 VALID_VPORT_STAT_ERROR = 1 << 6,
150 VALID_DRVINFO = 1 << 7,
151 VALID_FEATURES = 1 << 8,
154 /* Traffic control. */
156 /* An instance of a traffic control class. Always associated with a particular
159 * Each TC implementation subclasses this with whatever additional data it
162 const struct tc_ops *ops;
163 struct hmap queues; /* Contains "struct tc_queue"s.
164 * Read by generic TC layer.
165 * Written only by TC implementation. */
168 #define TC_INITIALIZER(TC, OPS) { OPS, HMAP_INITIALIZER(&(TC)->queues) }
170 /* One traffic control queue.
172 * Each TC implementation subclasses this with whatever additional data it
175 struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
176 unsigned int queue_id; /* OpenFlow queue ID. */
177 long long int created; /* Time queue was created, in msecs. */
180 /* A particular kind of traffic control. Each implementation generally maps to
181 * one particular Linux qdisc class.
183 * The functions below return 0 if successful or a positive errno value on
184 * failure, except where otherwise noted. All of them must be provided, except
185 * where otherwise noted. */
187 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
188 * This is null for tc_ops_default and tc_ops_other, for which there are no
189 * appropriate values. */
190 const char *linux_name;
192 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
193 const char *ovs_name;
195 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
196 * queues. The queues are numbered 0 through n_queues - 1. */
197 unsigned int n_queues;
199 /* Called to install this TC class on 'netdev'. The implementation should
200 * make the Netlink calls required to set up 'netdev' with the right qdisc
201 * and configure it according to 'details'. The implementation may assume
202 * that the current qdisc is the default; that is, there is no need for it
203 * to delete the current qdisc before installing itself.
205 * The contents of 'details' should be documented as valid for 'ovs_name'
206 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
207 * (which is built as ovs-vswitchd.conf.db(8)).
209 * This function must return 0 if and only if it sets 'netdev->tc' to an
210 * initialized 'struct tc'.
212 * (This function is null for tc_ops_other, which cannot be installed. For
213 * other TC classes it should always be nonnull.) */
214 int (*tc_install)(struct netdev *netdev, const struct smap *details);
216 /* Called when the netdev code determines (through a Netlink query) that
217 * this TC class's qdisc is installed on 'netdev', but we didn't install
218 * it ourselves and so don't know any of the details.
220 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
221 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
222 * implementation should parse the other attributes of 'nlmsg' as
223 * necessary to determine its configuration. If necessary it should also
224 * use Netlink queries to determine the configuration of queues on
227 * This function must return 0 if and only if it sets 'netdev->tc' to an
228 * initialized 'struct tc'. */
229 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
231 /* Destroys the data structures allocated by the implementation as part of
232 * 'tc'. (This includes destroying 'tc->queues' by calling
235 * The implementation should not need to perform any Netlink calls. If
236 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
237 * (But it may not be desirable.)
239 * This function may be null if 'tc' is trivial. */
240 void (*tc_destroy)(struct tc *tc);
242 /* Retrieves details of 'netdev->tc' configuration into 'details'.
244 * The implementation should not need to perform any Netlink calls, because
245 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
246 * cached the configuration.
248 * The contents of 'details' should be documented as valid for 'ovs_name'
249 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
250 * (which is built as ovs-vswitchd.conf.db(8)).
252 * This function may be null if 'tc' is not configurable.
254 int (*qdisc_get)(const struct netdev *netdev, struct smap *details);
256 /* Reconfigures 'netdev->tc' according to 'details', performing any
257 * required Netlink calls to complete the reconfiguration.
259 * The contents of 'details' should be documented as valid for 'ovs_name'
260 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
261 * (which is built as ovs-vswitchd.conf.db(8)).
263 * This function may be null if 'tc' is not configurable.
265 int (*qdisc_set)(struct netdev *, const struct smap *details);
267 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
268 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
270 * The contents of 'details' should be documented as valid for 'ovs_name'
271 * in the "other_config" column in the "Queue" table in
272 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
274 * The implementation should not need to perform any Netlink calls, because
275 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
276 * cached the queue configuration.
278 * This function may be null if 'tc' does not have queues ('n_queues' is
280 int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
281 struct smap *details);
283 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
284 * 'details', perfoming any required Netlink calls to complete the
285 * reconfiguration. The caller ensures that 'queue_id' is less than
288 * The contents of 'details' should be documented as valid for 'ovs_name'
289 * in the "other_config" column in the "Queue" table in
290 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
292 * This function may be null if 'tc' does not have queues or its queues are
293 * not configurable. */
294 int (*class_set)(struct netdev *, unsigned int queue_id,
295 const struct smap *details);
297 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
298 * tc_queue's within 'netdev->tc->queues'.
300 * This function may be null if 'tc' does not have queues or its queues
301 * cannot be deleted. */
302 int (*class_delete)(struct netdev *, struct tc_queue *queue);
304 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
305 * 'struct tc_queue's within 'netdev->tc->queues'.
307 * On success, initializes '*stats'.
309 * This function may be null if 'tc' does not have queues or if it cannot
310 * report queue statistics. */
311 int (*class_get_stats)(const struct netdev *netdev,
312 const struct tc_queue *queue,
313 struct netdev_queue_stats *stats);
315 /* Extracts queue stats from 'nlmsg', which is a response to a
316 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
318 * This function may be null if 'tc' does not have queues or if it cannot
319 * report queue statistics. */
320 int (*class_dump_stats)(const struct netdev *netdev,
321 const struct ofpbuf *nlmsg,
322 netdev_dump_queue_stats_cb *cb, void *aux);
/* Initializes 'tc' as the generic part of a traffic-control instance that
 * uses 'ops', with an empty 'queues' hmap.
 * (NOTE(review): this excerpt drops lines; the original presumably also
 * stores 'ops' into 'tc->ops' — confirm against the full file.) */
326 tc_init(struct tc *tc, const struct tc_ops *ops)
329 hmap_init(&tc->queues);
/* Destroys the generic part of 'tc': releases the 'queues' hmap.  The
 * TC-implementation-specific teardown happens elsewhere (via ops->tc_destroy). */
333 tc_destroy(struct tc *tc)
335 hmap_destroy(&tc->queues);
338 static const struct tc_ops tc_ops_htb;
339 static const struct tc_ops tc_ops_hfsc;
340 static const struct tc_ops tc_ops_default;
341 static const struct tc_ops tc_ops_other;
343 static const struct tc_ops *const tcs[] = {
344 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
345 &tc_ops_hfsc, /* Hierarchical fair service curve. */
346 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
347 &tc_ops_other, /* Some other qdisc. */
351 static unsigned int tc_make_handle(unsigned int major, unsigned int minor);
352 static unsigned int tc_get_major(unsigned int handle);
353 static unsigned int tc_get_minor(unsigned int handle);
355 static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
356 static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
357 static unsigned int tc_buffer_per_jiffy(unsigned int rate);
359 static struct tcmsg *tc_make_request(const struct netdev *, int type,
360 unsigned int flags, struct ofpbuf *);
361 static int tc_transact(struct ofpbuf *request, struct ofpbuf **replyp);
362 static int tc_add_del_ingress_qdisc(struct netdev *netdev, bool add);
363 static int tc_add_policer(struct netdev *netdev, int kbits_rate,
366 static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
367 struct nlattr **options);
368 static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
369 struct nlattr **options,
370 struct netdev_queue_stats *);
371 static int tc_query_class(const struct netdev *,
372 unsigned int handle, unsigned int parent,
373 struct ofpbuf **replyp);
374 static int tc_delete_class(const struct netdev *, unsigned int handle);
376 static int tc_del_qdisc(struct netdev *netdev);
377 static int tc_query_qdisc(const struct netdev *netdev);
379 static int tc_calc_cell_log(unsigned int mtu);
380 static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
381 static void tc_put_rtab(struct ofpbuf *, uint16_t type,
382 const struct tc_ratespec *rate);
383 static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
385 struct netdev_linux {
388 /* Protects all members below. */
389 struct ovs_mutex mutex;
391 unsigned int cache_valid;
393 bool miimon; /* Link status of last poll. */
394 long long int miimon_interval; /* Miimon Poll rate. Disabled if <= 0. */
395 struct timer miimon_timer;
397 /* The following are figured out "on demand" only. They are only valid
398 * when the corresponding VALID_* bit in 'cache_valid' is set. */
400 uint8_t etheraddr[ETH_ADDR_LEN];
401 struct in_addr address, netmask;
404 unsigned int ifi_flags;
405 long long int carrier_resets;
406 uint32_t kbits_rate; /* Policing data. */
407 uint32_t kbits_burst;
408 int vport_stats_error; /* Cached error code from vport_get_stats().
409 0 or an errno value. */
410 int netdev_mtu_error; /* Cached error code from SIOCGIFMTU or SIOCSIFMTU. */
411 int ether_addr_error; /* Cached error code from set/get etheraddr. */
412 int netdev_policing_error; /* Cached error code from set policing. */
413 int get_features_error; /* Cached error code from ETHTOOL_GSET. */
414 int get_ifindex_error; /* Cached error code from SIOCGIFINDEX. */
416 enum netdev_features current; /* Cached from ETHTOOL_GSET. */
417 enum netdev_features advertised; /* Cached from ETHTOOL_GSET. */
418 enum netdev_features supported; /* Cached from ETHTOOL_GSET. */
420 struct ethtool_drvinfo drvinfo; /* Cached from ETHTOOL_GDRVINFO. */
423 /* For devices of class netdev_tap_class only. */
427 struct netdev_rx_linux {
433 /* This is set pretty low because we probably won't learn anything from the
434 * additional log messages. */
435 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
437 /* Polling miimon status for all ports causes performance degradation when
438 * handling a large number of ports. If there are no devices using miimon, then
439 * we skip netdev_linux_miimon_run() and netdev_linux_miimon_wait(). */
440 static atomic_int miimon_cnt = ATOMIC_VAR_INIT(0);
442 static void netdev_linux_run(void);
444 static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
445 int cmd, const char *cmd_name);
446 static int netdev_linux_get_ipv4(const struct netdev *, struct in_addr *,
447 int cmd, const char *cmd_name);
448 static int get_flags(const struct netdev *, unsigned int *flags);
449 static int set_flags(const char *, unsigned int flags);
450 static int update_flags(struct netdev_linux *netdev, enum netdev_flags off,
451 enum netdev_flags on, enum netdev_flags *old_flagsp)
452 OVS_REQUIRES(netdev->mutex);
453 static int do_get_ifindex(const char *netdev_name);
454 static int get_ifindex(const struct netdev *, int *ifindexp);
455 static int do_set_addr(struct netdev *netdev,
456 int ioctl_nr, const char *ioctl_name,
457 struct in_addr addr);
458 static int get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN]);
459 static int set_etheraddr(const char *netdev_name, const uint8_t[ETH_ADDR_LEN]);
460 static int get_stats_via_netlink(const struct netdev *, struct netdev_stats *);
461 static int af_packet_sock(void);
462 static bool netdev_linux_miimon_enabled(void);
463 static void netdev_linux_miimon_run(void);
464 static void netdev_linux_miimon_wait(void);
/* Returns true if 'netdev_class' is one of the Linux netdev classes in this
 * file, identified by its 'run' callback being netdev_linux_run. */
467 is_netdev_linux_class(const struct netdev_class *netdev_class)
469 return netdev_class->run == netdev_linux_run;
/* Returns true if 'netdev' is a tap device (netdev_tap_class), false for
 * system/internal Linux netdevs. */
473 is_tap_netdev(const struct netdev *netdev)
475 return netdev_get_class(netdev) == &netdev_tap_class;
/* Downcasts 'netdev' to its containing struct netdev_linux.  Asserts that
 * 'netdev' really belongs to a Linux netdev class first. */
478 static struct netdev_linux *
479 netdev_linux_cast(const struct netdev *netdev)
481 ovs_assert(is_netdev_linux_class(netdev_get_class(netdev)));
483 return CONTAINER_OF(netdev, struct netdev_linux, up);
/* Downcasts 'rx' to its containing struct netdev_rx_linux, asserting that
 * the underlying netdev is a Linux netdev. */
486 static struct netdev_rx_linux *
487 netdev_rx_linux_cast(const struct netdev_rx *rx)
489 ovs_assert(is_netdev_linux_class(netdev_get_class(rx->netdev)));
490 return CONTAINER_OF(rx, struct netdev_rx_linux, up);
493 static void netdev_linux_update(struct netdev_linux *netdev,
494 const struct rtnetlink_link_change *)
495 OVS_REQUIRES(netdev->mutex);
496 static void netdev_linux_changed(struct netdev_linux *netdev,
497 unsigned int ifi_flags, unsigned int mask)
498 OVS_REQUIRES(netdev->mutex);
500 /* Returns a NETLINK_ROUTE socket listening for RTNLGRP_LINK changes, or NULL
501 * if no such socket could be created.  The socket is created once, lazily,
501b * and shared by all callers (guarded by ovsthread_once). */
502 static struct nl_sock *
503 netdev_linux_notify_sock(void)
505 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
506 static struct nl_sock *sock;
508 if (ovsthread_once_start(&once)) {
511 error = nl_sock_create(NETLINK_ROUTE, &sock);
/* Join the link-change multicast group; on failure the socket is destroyed
 * (NOTE(review): the excerpt drops the line that presumably resets 'sock'
 * to NULL after nl_sock_destroy() — confirm against the full file). */
513 error = nl_sock_join_mcgroup(sock, RTNLGRP_LINK);
515 nl_sock_destroy(sock);
519 ovsthread_once_done(&once);
/* Returns true if at least one netdev currently uses miimon polling, so the
 * periodic miimon run/wait work can be skipped entirely otherwise.  Reads the
 * global atomic 'miimon_cnt' counter. */
526 netdev_linux_miimon_enabled(void)
530 atomic_read(&miimon_cnt, &miimon);
/* Per-class 'run' callback: polls miimon (if enabled) and drains the shared
 * rtnetlink notification socket, pushing each link-change message into the
 * matching netdev's cached state. */
535 netdev_linux_run(void)
537 struct nl_sock *sock;
540 if (netdev_linux_miimon_enabled()) {
541 netdev_linux_miimon_run();
544 sock = netdev_linux_notify_sock();
550 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
551 uint64_t buf_stub[4096 / 8];
554 ofpbuf_use_stub(&buf, buf_stub, sizeof buf_stub);
555 error = nl_sock_recv(sock, &buf, false);
557 struct rtnetlink_link_change change;
559 if (rtnetlink_link_parse(&buf, &change)) {
/* Look up the affected device by name; only Linux-class netdevs carry the
 * cached state that netdev_linux_update() maintains. */
560 struct netdev *netdev_ = netdev_from_name(change.ifname);
561 if (netdev_ && is_netdev_linux_class(netdev_->netdev_class)) {
562 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
564 ovs_mutex_lock(&netdev->mutex);
565 netdev_linux_update(netdev, &change);
566 ovs_mutex_unlock(&netdev->mutex);
568 netdev_close(netdev_);
570 } else if (error == ENOBUFS) {
/* The kernel dropped notifications (receive buffer overrun), so we may have
 * missed link changes: refresh the flags of every Linux netdev. */
571 struct shash device_shash;
572 struct shash_node *node;
576 shash_init(&device_shash);
577 netdev_get_devices(&netdev_linux_class, &device_shash);
578 SHASH_FOR_EACH (node, &device_shash) {
579 struct netdev *netdev_ = node->data;
580 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
583 ovs_mutex_lock(&netdev->mutex);
584 get_flags(netdev_, &flags);
585 netdev_linux_changed(netdev, flags, 0);
586 ovs_mutex_unlock(&netdev->mutex);
588 netdev_close(netdev_);
590 shash_destroy(&device_shash);
591 } else if (error != EAGAIN) {
592 VLOG_WARN_RL(&rl, "error reading or parsing netlink (%s)",
593 ovs_strerror(error));
/* Per-class 'wait' callback: arranges for poll_block() to wake when the
 * rtnetlink notification socket is readable or the miimon timer fires. */
600 netdev_linux_wait(void)
602 struct nl_sock *sock;
604 if (netdev_linux_miimon_enabled()) {
605 netdev_linux_miimon_wait();
607 sock = netdev_linux_notify_sock();
609 nl_sock_wait(sock, POLLIN);
/* Records that 'dev' changed: bumps the global connectivity sequence, counts
 * a carrier reset if IFF_RUNNING toggled, stores the new interface flags, and
 * invalidates every cached field whose VALID_* bit is clear in 'mask'
 * (callers pass 0 to invalidate everything). */
614 netdev_linux_changed(struct netdev_linux *dev,
615 unsigned int ifi_flags, unsigned int mask)
616 OVS_REQUIRES(dev->mutex)
618 seq_change(connectivity_seq_get());
620 if ((dev->ifi_flags ^ ifi_flags) & IFF_RUNNING) {
621 dev->carrier_resets++;
623 dev->ifi_flags = ifi_flags;
625 dev->cache_valid &= mask;
/* Applies a parsed rtnetlink link-change message to 'dev'.  For RTM_NEWLINK
 * the MTU, Ethernet address, and ifindex carried by the message are copied
 * into the cache and marked valid (keeping only VALID_DRVINFO from before);
 * for other message types (e.g. RTM_DELLINK) the whole cache is invalidated. */
629 netdev_linux_update(struct netdev_linux *dev,
630 const struct rtnetlink_link_change *change)
631 OVS_REQUIRES(dev->mutex)
633 if (change->nlmsg_type == RTM_NEWLINK) {
635 netdev_linux_changed(dev, change->ifi_flags, VALID_DRVINFO);
637 /* Update netdev from rtnl-change msg. */
639 dev->mtu = change->mtu;
640 dev->cache_valid |= VALID_MTU;
641 dev->netdev_mtu_error = 0;
/* An all-zero address in the message means "no address reported". */
644 if (!eth_addr_is_zero(change->addr)) {
645 memcpy(dev->etheraddr, change->addr, ETH_ADDR_LEN);
646 dev->cache_valid |= VALID_ETHERADDR;
647 dev->ether_addr_error = 0;
650 dev->ifindex = change->ifi_index;
651 dev->cache_valid |= VALID_IFINDEX;
652 dev->get_ifindex_error = 0;
655 netdev_linux_changed(dev, change->ifi_flags, 0);
/* netdev_class 'alloc' callback: returns a zero-initialized netdev_linux
 * (presumably returning '&netdev->up' — the return line is not visible in
 * this excerpt). */
659 static struct netdev *
660 netdev_linux_alloc(void)
662 struct netdev_linux *netdev = xzalloc(sizeof *netdev);
/* Construction steps shared by system, internal, and tap devices: currently
 * just initializes the per-device mutex. */
667 netdev_linux_common_construct(struct netdev_linux *netdev)
669 ovs_mutex_init(&netdev->mutex);
672 /* Creates system and internal devices.  Fetches the interface flags as an
672b * existence probe: ENODEV is fatal for system devices but tolerated for
672c * "internal" ones, which are created in the kernel only later. */
674 netdev_linux_construct(struct netdev *netdev_)
676 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
679 netdev_linux_common_construct(netdev);
681 error = get_flags(&netdev->up, &netdev->ifi_flags);
682 if (error == ENODEV) {
683 if (netdev->up.netdev_class != &netdev_internal_class) {
684 /* The device does not exist, so don't allow it to be opened. */
687 /* "Internal" netdevs have to be created as netdev objects before
688 * they exist in the kernel, because creating them in the kernel
689 * happens by passing a netdev object to dpif_port_add().
690 * Therefore, ignore the error. */
697 /* For most types of netdevs we open the device for each call of
698 * netdev_open(). However, this is not the case with tap devices,
699 * since it is only possible to open the device once. In this
700 * situation we share a single file descriptor, and consequently
701 * buffers, across all readers. Therefore once data is read it will
702 * be unavailable to other reads for tap devices. */
704 netdev_linux_construct_tap(struct netdev *netdev_)
706 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
707 static const char tap_dev[] = "/dev/net/tun";
708 const char *name = netdev_->name;
712 netdev_linux_common_construct(netdev);
714 /* Open tap device. */
715 netdev->tap_fd = open(tap_dev, O_RDWR);
716 if (netdev->tap_fd < 0) {
718 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, ovs_strerror(error));
722 /* Create tap device.  IFF_NO_PI suppresses the packet-information
722b * header so reads/writes carry raw Ethernet frames. */
723 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
724 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
725 if (ioctl(netdev->tap_fd, TUNSETIFF, &ifr) == -1) {
726 VLOG_WARN("%s: creating tap device failed: %s", name,
727 ovs_strerror(errno));
732 /* Make non-blocking. */
733 error = set_nonblocking(netdev->tap_fd);
/* Error path: close the fd so construction failure doesn't leak it. */
741 close(netdev->tap_fd);
/* netdev_class 'destruct' callback: tears down any installed TC state,
 * closes the shared tap fd for tap devices, decrements the global miimon
 * user count if this device used miimon, and destroys the mutex. */
746 netdev_linux_destruct(struct netdev *netdev_)
748 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
750 if (netdev->tc && netdev->tc->ops->tc_destroy) {
751 netdev->tc->ops->tc_destroy(netdev->tc);
754 if (netdev_get_class(netdev_) == &netdev_tap_class
755 && netdev->tap_fd >= 0)
757 close(netdev->tap_fd);
760 if (netdev->miimon_interval > 0) {
762 atomic_sub(&miimon_cnt, 1, &junk);
765 ovs_mutex_destroy(&netdev->mutex);
/* netdev_class 'dealloc' callback: frees the storage allocated by
 * netdev_linux_alloc() (the free() line is not visible in this excerpt). */
769 netdev_linux_dealloc(struct netdev *netdev_)
771 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
/* 'rx_alloc' callback: returns a zero-initialized netdev_rx_linux
 * (presumably returning '&rx->up' — the return line is not visible here). */
775 static struct netdev_rx *
776 netdev_linux_rx_alloc(void)
778 struct netdev_rx_linux *rx = xzalloc(sizeof *rx);
/* 'rx_construct' callback.  For tap devices, reuses the netdev's shared tap
 * fd.  For all other devices, creates a raw AF_PACKET socket bound to the
 * interface, enables PACKET_AUXDATA (so VLAN tags arrive as ancillary data),
 * makes it non-blocking, and attaches a BPF filter that accepts only
 * inbound packets (so locally-transmitted frames are not looped back). */
783 netdev_linux_rx_construct(struct netdev_rx *rx_)
785 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
786 struct netdev *netdev_ = rx->up.netdev;
787 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
790 ovs_mutex_lock(&netdev->mutex);
791 rx->is_tap = is_tap_netdev(netdev_);
793 rx->fd = netdev->tap_fd;
795 struct sockaddr_ll sll;
797 /* Result of tcpdump -dd inbound */
798 static const struct sock_filter filt[] = {
799 { 0x28, 0, 0, 0xfffff004 }, /* ldh [0] */
800 { 0x15, 0, 1, 0x00000004 }, /* jeq #4 jt 2 jf 3 */
801 { 0x6, 0, 0, 0x00000000 }, /* ret #0 */
802 { 0x6, 0, 0, 0x0000ffff } /* ret #65535 */
804 static const struct sock_fprog fprog = {
805 ARRAY_SIZE(filt), (struct sock_filter *) filt
808 /* Create file descriptor. */
809 rx->fd = socket(PF_PACKET, SOCK_RAW, 0);
812 VLOG_ERR("failed to create raw socket (%s)", ovs_strerror(error));
/* Request per-packet auxiliary data (VLAN TCI/TPID) via cmsgs. */
817 if (setsockopt(rx->fd, SOL_PACKET, PACKET_AUXDATA, &val, sizeof val)) {
819 VLOG_ERR("%s: failed to mark socket for auxdata (%s)",
820 netdev_get_name(netdev_), ovs_strerror(error));
824 /* Set non-blocking mode. */
825 error = set_nonblocking(rx->fd);
830 /* Get ethernet device index. */
831 error = get_ifindex(&netdev->up, &ifindex);
836 /* Bind to specific ethernet device. */
837 memset(&sll, 0, sizeof sll);
838 sll.sll_family = AF_PACKET;
839 sll.sll_ifindex = ifindex;
840 sll.sll_protocol = htons(ETH_P_ALL);
841 if (bind(rx->fd, (struct sockaddr *) &sll, sizeof sll) < 0) {
843 VLOG_ERR("%s: failed to bind raw socket (%s)",
844 netdev_get_name(netdev_), ovs_strerror(error));
848 /* Filter for only inbound packets. */
849 error = setsockopt(rx->fd, SOL_SOCKET, SO_ATTACH_FILTER, &fprog,
853 VLOG_ERR("%s: failed to attach filter (%s)",
854 netdev_get_name(netdev_), ovs_strerror(error));
/* Success and error exits both release the mutex (cleanup-label pattern;
 * the labels themselves are not visible in this excerpt). */
858 ovs_mutex_unlock(&netdev->mutex);
866 ovs_mutex_unlock(&netdev->mutex);
/* 'rx_destruct' callback: releases the receive fd (for non-tap devices; the
 * close logic is not visible in this excerpt). */
871 netdev_linux_rx_destruct(struct netdev_rx *rx_)
873 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
/* 'rx_dealloc' callback: frees the netdev_rx_linux allocated by
 * netdev_linux_rx_alloc() (the free() line is not visible here). */
881 netdev_linux_rx_dealloc(struct netdev_rx *rx_)
883 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
/* Returns the network-byte-order TPID for the VLAN tag described by 'aux':
 * the kernel-reported tp_vlan_tpid when the kernel marked it valid
 * (Linux >= 3.13), otherwise the classic 0x8100 Ethertype. */
889 auxdata_to_vlan_tpid(const struct tpacket_auxdata *aux)
891 if (aux->tp_status & TP_STATUS_VLAN_TPID_VALID) {
892 return htons(aux->tp_vlan_tpid);
894 return htons(ETH_TYPE_VLAN);
/* Returns true if 'aux' carries a VLAN TCI: either a nonzero TCI (pre-3.0
 * kernels give no validity flag) or the TP_STATUS_VLAN_VALID status bit.
 * Note '||' binds looser than '&', so this parses as
 * tci || (status & VALID) — intentional. */
899 auxdata_has_vlan_tci(const struct tpacket_auxdata *aux)
901 return aux->tp_vlan_tci || aux->tp_status & TP_STATUS_VLAN_VALID;
/* Receives one frame from AF_PACKET socket 'fd' into 'buffer', reinserting
 * any VLAN tag the kernel stripped (delivered via PACKET_AUXDATA cmsg).
 * Returns 0 on success, otherwise a positive errno value (EMSGSIZE when the
 * frame was truncated because 'buffer' was too small). */
905 netdev_linux_rx_recv_sock(int fd, struct ofpbuf *buffer)
910 struct cmsghdr *cmsg;
/* 'buffer' here is a member of a cmsg storage aggregate sized for one
 * tpacket_auxdata (the enclosing declaration is not visible in this
 * excerpt); it does not shadow the 'buffer' parameter at use sites. */
913 char buffer[CMSG_SPACE(sizeof(struct tpacket_auxdata))];
917 /* Reserve headroom for a single VLAN tag */
918 ofpbuf_reserve(buffer, VLAN_HEADER_LEN);
919 size = ofpbuf_tailroom(buffer);
921 iov.iov_base = buffer->data;
923 msgh.msg_name = NULL;
924 msgh.msg_namelen = 0;
927 msgh.msg_control = &cmsg_buffer;
928 msgh.msg_controllen = sizeof cmsg_buffer;
/* MSG_TRUNC makes recvmsg() report the full frame length even when it did
 * not fit, so truncation is detectable below; retry on EINTR. */
932 retval = recvmsg(fd, &msgh, MSG_TRUNC);
933 } while (retval < 0 && errno == EINTR);
937 } else if (retval > size) {
941 buffer->size += retval;
/* Walk the control messages looking for the packet auxdata. */
943 for (cmsg = CMSG_FIRSTHDR(&msgh); cmsg; cmsg = CMSG_NXTHDR(&msgh, cmsg)) {
944 const struct tpacket_auxdata *aux;
946 if (cmsg->cmsg_level != SOL_PACKET
947 || cmsg->cmsg_type != PACKET_AUXDATA
948 || cmsg->cmsg_len < CMSG_LEN(sizeof(struct tpacket_auxdata))) {
952 aux = ALIGNED_CAST(struct tpacket_auxdata *, CMSG_DATA(cmsg));
953 if (auxdata_has_vlan_tci(aux)) {
/* A frame shorter than an Ethernet header cannot take a VLAN tag. */
954 if (retval < ETH_HEADER_LEN) {
/* Re-insert the stripped 802.1Q header into the reserved headroom. */
958 eth_push_vlan(buffer, auxdata_to_vlan_tpid(aux),
959 htons(aux->tp_vlan_tci));
/* Receives one frame from tap fd 'fd' into 'buffer' with a plain read(),
 * retrying on EINTR.  Returns 0 on success, otherwise a positive errno
 * value (EMSGSIZE when the frame exceeded the tailroom — the return lines
 * themselves are not visible in this excerpt). */
968 netdev_linux_rx_recv_tap(int fd, struct ofpbuf *buffer)
971 size_t size = ofpbuf_tailroom(buffer);
974 retval = read(fd, buffer->data, size);
975 } while (retval < 0 && errno == EINTR);
979 } else if (retval > size) {
983 buffer->size += retval;
/* 'rx_recv' callback: receives one packet into 'buffer', dispatching to the
 * tap or AF_PACKET helper.  Returns 0 on success, otherwise a positive errno
 * value; EAGAIN (nothing to read) and EMSGSIZE (truncated frame) are expected
 * and not logged. */
988 netdev_linux_rx_recv(struct netdev_rx *rx_, struct ofpbuf *buffer)
990 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
994 ? netdev_linux_rx_recv_tap(rx->fd, buffer)
995 : netdev_linux_rx_recv_sock(rx->fd, buffer));
996 if (retval && retval != EAGAIN && retval != EMSGSIZE) {
/* Fixed: the format is "on %s: %s" with the device name first, but the
 * arguments were transposed (error string printed as the device name).
 * Also use 'retval', the errno value the helpers actually returned,
 * instead of raw 'errno', which may be stale by this point. */
997 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
998 netdev_rx_get_name(rx_), ovs_strerror(retval));
/* 'rx_wait' callback: wakes the poll loop when the receive fd is readable. */
1005 netdev_linux_rx_wait(struct netdev_rx *rx_)
1007 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
1008 poll_fd_wait(rx->fd, POLLIN);
/* 'rx_drain' callback: discards all queued received packets.  For tap
 * devices, reads the interface's tx queue length via SIOCGIFTXQLEN and
 * drains up to that many packets from the fd; for AF_PACKET sockets, empties
 * the socket receive buffer. */
1012 netdev_linux_rx_drain(struct netdev_rx *rx_)
1014 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
1017 int error = af_inet_ifreq_ioctl(netdev_rx_get_name(rx_), &ifr,
1018 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
1022 drain_fd(rx->fd, ifr.ifr_qlen);
1025 return drain_rcvbuf(rx->fd);
1029 /* Sends 'buffer' on 'netdev'. Returns 0 if successful, otherwise a positive
1030 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
1031 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
1032 * the packet is too big or too small to transmit on the device.
1034 * The caller retains ownership of 'buffer' in all cases.
1036 * The kernel maintains a packet transmission queue, so the caller is not
1037 * expected to do additional queuing of packets. */
1039 netdev_linux_send(struct netdev *netdev_, const void *data, size_t size)
1044 if (!is_tap_netdev(netdev_)) {
1045 /* Use our AF_PACKET socket to send to this device. */
1046 struct sockaddr_ll sll;
1052 sock = af_packet_sock();
1057 ifindex = netdev_get_ifindex(netdev_);
1062 /* We don't bother setting most fields in sockaddr_ll because the
1063 * kernel ignores them for SOCK_RAW. */
1064 memset(&sll, 0, sizeof sll);
1065 sll.sll_family = AF_PACKET;
1066 sll.sll_ifindex = ifindex;
1068 iov.iov_base = CONST_CAST(void *, data);
1071 msg.msg_name = &sll;
1072 msg.msg_namelen = sizeof sll;
1075 msg.msg_control = NULL;
1076 msg.msg_controllen = 0;
1079 retval = sendmsg(sock, &msg, 0);
1081 /* Use the tap fd to send to this device. This is essential for
1082 * tap devices, because packets sent to a tap device with an
1083 * AF_PACKET socket will loop back to be *received* again on the
1084 * tap device. This doesn't occur on other interface types
1085 * because we attach a socket filter to the rx socket. */
1086 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1088 retval = write(netdev->tap_fd, data, size);
1092 /* The Linux AF_PACKET implementation never blocks waiting for room
1093 * for packets, instead returning ENOBUFS. Translate this into
1094 * EAGAIN for the caller. */
1095 if (errno == ENOBUFS) {
1097 } else if (errno == EINTR) {
1099 } else if (errno != EAGAIN) {
1100 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
1101 netdev_get_name(netdev_), ovs_strerror(errno));
1104 } else if (retval != size) {
/* Fixed: the format previously read "%"PRIuSIZE"d", whose stray 'd'
 * printed a literal 'd' after the byte count; also 'retval' is a
 * signed ssize_t, so cast it for the size_t ("%zu") conversion —
 * safe because this branch is only reached when retval >= 0. */
1105 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%"PRIuSIZE" bytes of "
1106 "%"PRIuSIZE") on %s", (size_t) retval, size, netdev_get_name(netdev_));
1114 /* Registers with the poll loop to wake up from the next call to poll_block()
1115 * when the packet transmission queue has sufficient room to transmit a packet
1116 * with netdev_send().
1118 * The kernel maintains a packet transmission queue, so the client is not
1119 * expected to do additional queuing of packets. Thus, this function is
1120 * unlikely to ever be used. It is included for completeness. */
1122 netdev_linux_send_wait(struct netdev *netdev)
1124 if (is_tap_netdev(netdev)) {
1125 /* TAP device always accepts packets. */
1126 poll_immediate_wake();
1130 /* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
1131 * otherwise a positive errno value.  Short-circuits when the cached address
1131b * already equals 'mac' (or a cached error is pending); tap devices are
1131c * brought down around the change and restored afterwards. */
1133 netdev_linux_set_etheraddr(struct netdev *netdev_,
1134 const uint8_t mac[ETH_ADDR_LEN])
1136 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1137 enum netdev_flags old_flags = 0;
1140 ovs_mutex_lock(&netdev->mutex);
1142 if (netdev->cache_valid & VALID_ETHERADDR) {
1143 error = netdev->ether_addr_error;
1144 if (error || eth_addr_equals(netdev->etheraddr, mac)) {
/* The cache will be refreshed below, so drop the valid bit first. */
1147 netdev->cache_valid &= ~VALID_ETHERADDR;
1150 /* Tap devices must be brought down before setting the address. */
1151 if (is_tap_netdev(netdev_)) {
1152 update_flags(netdev, NETDEV_UP, 0, &old_flags);
1154 error = set_etheraddr(netdev_get_name(netdev_), mac);
/* Cache the result even for ENODEV so repeated attempts on a vanished
 * device don't keep issuing ioctls. */
1155 if (!error || error == ENODEV) {
1156 netdev->ether_addr_error = error;
1157 netdev->cache_valid |= VALID_ETHERADDR;
1159 memcpy(netdev->etheraddr, mac, ETH_ADDR_LEN);
/* Bring the tap device back up if we downed it above. */
1163 if (is_tap_netdev(netdev_) && old_flags & NETDEV_UP) {
1164 update_flags(netdev, 0, NETDEV_UP, &old_flags);
1168 ovs_mutex_unlock(&netdev->mutex);
1172 /* Copies 'netdev''s MAC address to 'mac'.  Returns 0 on success or the
1172b * (possibly cached) positive errno value from the last hardware query. */
1174 netdev_linux_get_etheraddr(const struct netdev *netdev_,
1175 uint8_t mac[ETH_ADDR_LEN])
1177 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1180 ovs_mutex_lock(&netdev->mutex);
/* Populate the cache on first use; afterwards serve from it. */
1181 if (!(netdev->cache_valid & VALID_ETHERADDR)) {
1182 netdev->ether_addr_error = get_etheraddr(netdev_get_name(netdev_),
1184 netdev->cache_valid |= VALID_ETHERADDR;
1187 error = netdev->ether_addr_error;
1189 memcpy(mac, netdev->etheraddr, ETH_ADDR_LEN);
1191 ovs_mutex_unlock(&netdev->mutex);
/* Internal MTU getter; caller must hold netdev->mutex.  Queries SIOCGIFMTU
 * on first use and caches both the MTU and the error code; returns the
 * cached errno value (0 on success) and stores the MTU in '*mtup'. */
1197 netdev_linux_get_mtu__(struct netdev_linux *netdev, int *mtup)
1201 if (!(netdev->cache_valid & VALID_MTU)) {
1204 netdev->netdev_mtu_error = af_inet_ifreq_ioctl(
1205 netdev_get_name(&netdev->up), &ifr, SIOCGIFMTU, "SIOCGIFMTU");
1206 netdev->mtu = ifr.ifr_mtu;
1207 netdev->cache_valid |= VALID_MTU;
1210 error = netdev->netdev_mtu_error;
1212 *mtup = netdev->mtu;
1218 /* Returns the maximum size of transmitted (and received) packets on 'netdev',
1219 * in bytes, not including the hardware header; thus, this is typically 1500
1220 * bytes for Ethernet devices. */
1222 netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
1224 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1227 ovs_mutex_lock(&netdev->mutex);
1228 error = netdev_linux_get_mtu__(netdev, mtup);
1229 ovs_mutex_unlock(&netdev->mutex);
1234 /* Sets the maximum size of transmitted (MTU) for given device using linux
1235 * networking ioctl interface.
1238 netdev_linux_set_mtu(const struct netdev *netdev_, int mtu)
1240 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1244 ovs_mutex_lock(&netdev->mutex);
1245 if (netdev->cache_valid & VALID_MTU) {
1246 error = netdev->netdev_mtu_error;
1247 if (error || netdev->mtu == mtu) {
1250 netdev->cache_valid &= ~VALID_MTU;
1253 error = af_inet_ifreq_ioctl(netdev_get_name(netdev_), &ifr,
1254 SIOCSIFMTU, "SIOCSIFMTU");
1255 if (!error || error == ENODEV) {
1256 netdev->netdev_mtu_error = error;
1257 netdev->mtu = ifr.ifr_mtu;
1258 netdev->cache_valid |= VALID_MTU;
1261 ovs_mutex_unlock(&netdev->mutex);
1265 /* Returns the ifindex of 'netdev', if successful, as a positive number.
1266 * On failure, returns a negative errno value. */
1268 netdev_linux_get_ifindex(const struct netdev *netdev_)
1270 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1273 ovs_mutex_lock(&netdev->mutex);
1274 error = get_ifindex(netdev_, &ifindex);
1275 ovs_mutex_unlock(&netdev->mutex);
1277 return error ? -error : ifindex;
1281 netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
1283 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1285 ovs_mutex_lock(&netdev->mutex);
1286 if (netdev->miimon_interval > 0) {
1287 *carrier = netdev->miimon;
1289 *carrier = (netdev->ifi_flags & IFF_RUNNING) != 0;
1291 ovs_mutex_unlock(&netdev->mutex);
1296 static long long int
1297 netdev_linux_get_carrier_resets(const struct netdev *netdev_)
1299 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1300 long long int carrier_resets;
1302 ovs_mutex_lock(&netdev->mutex);
1303 carrier_resets = netdev->carrier_resets;
1304 ovs_mutex_unlock(&netdev->mutex);
1306 return carrier_resets;
1310 netdev_linux_do_miimon(const char *name, int cmd, const char *cmd_name,
1311 struct mii_ioctl_data *data)
1316 memset(&ifr, 0, sizeof ifr);
1317 memcpy(&ifr.ifr_data, data, sizeof *data);
1318 error = af_inet_ifreq_ioctl(name, &ifr, cmd, cmd_name);
1319 memcpy(data, &ifr.ifr_data, sizeof *data);
1325 netdev_linux_get_miimon(const char *name, bool *miimon)
1327 struct mii_ioctl_data data;
1332 memset(&data, 0, sizeof data);
1333 error = netdev_linux_do_miimon(name, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
1335 /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
1336 data.reg_num = MII_BMSR;
1337 error = netdev_linux_do_miimon(name, SIOCGMIIREG, "SIOCGMIIREG",
1341 *miimon = !!(data.val_out & BMSR_LSTATUS);
1343 VLOG_WARN_RL(&rl, "%s: failed to query MII", name);
1346 struct ethtool_cmd ecmd;
1348 VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
1351 COVERAGE_INC(netdev_get_ethtool);
1352 memset(&ecmd, 0, sizeof ecmd);
1353 error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
1356 struct ethtool_value eval;
1358 memcpy(&eval, &ecmd, sizeof eval);
1359 *miimon = !!eval.data;
1361 VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
1369 netdev_linux_set_miimon_interval(struct netdev *netdev_,
1370 long long int interval)
1372 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1374 ovs_mutex_lock(&netdev->mutex);
1375 interval = interval > 0 ? MAX(interval, 100) : 0;
1376 if (netdev->miimon_interval != interval) {
1379 if (interval && !netdev->miimon_interval) {
1380 atomic_add(&miimon_cnt, 1, &junk);
1381 } else if (!interval && netdev->miimon_interval) {
1382 atomic_sub(&miimon_cnt, 1, &junk);
1385 netdev->miimon_interval = interval;
1386 timer_set_expired(&netdev->miimon_timer);
1388 ovs_mutex_unlock(&netdev->mutex);
1394 netdev_linux_miimon_run(void)
1396 struct shash device_shash;
1397 struct shash_node *node;
1399 shash_init(&device_shash);
1400 netdev_get_devices(&netdev_linux_class, &device_shash);
1401 SHASH_FOR_EACH (node, &device_shash) {
1402 struct netdev *netdev = node->data;
1403 struct netdev_linux *dev = netdev_linux_cast(netdev);
1406 ovs_mutex_lock(&dev->mutex);
1407 if (dev->miimon_interval > 0 && timer_expired(&dev->miimon_timer)) {
1408 netdev_linux_get_miimon(dev->up.name, &miimon);
1409 if (miimon != dev->miimon) {
1410 dev->miimon = miimon;
1411 netdev_linux_changed(dev, dev->ifi_flags, 0);
1414 timer_set_duration(&dev->miimon_timer, dev->miimon_interval);
1416 ovs_mutex_unlock(&dev->mutex);
1417 netdev_close(netdev);
1420 shash_destroy(&device_shash);
1424 netdev_linux_miimon_wait(void)
1426 struct shash device_shash;
1427 struct shash_node *node;
1429 shash_init(&device_shash);
1430 netdev_get_devices(&netdev_linux_class, &device_shash);
1431 SHASH_FOR_EACH (node, &device_shash) {
1432 struct netdev *netdev = node->data;
1433 struct netdev_linux *dev = netdev_linux_cast(netdev);
1435 ovs_mutex_lock(&dev->mutex);
1436 if (dev->miimon_interval > 0) {
1437 timer_wait(&dev->miimon_timer);
1439 ovs_mutex_unlock(&dev->mutex);
1440 netdev_close(netdev);
1442 shash_destroy(&device_shash);
1446 swap_uint64(uint64_t *a, uint64_t *b)
1453 /* Copies 'src' into 'dst', performing format conversion in the process.
1455 * 'src' is allowed to be misaligned. */
1457 netdev_stats_from_ovs_vport_stats(struct netdev_stats *dst,
1458 const struct ovs_vport_stats *src)
1460 dst->rx_packets = get_unaligned_u64(&src->rx_packets);
1461 dst->tx_packets = get_unaligned_u64(&src->tx_packets);
1462 dst->rx_bytes = get_unaligned_u64(&src->rx_bytes);
1463 dst->tx_bytes = get_unaligned_u64(&src->tx_bytes);
1464 dst->rx_errors = get_unaligned_u64(&src->rx_errors);
1465 dst->tx_errors = get_unaligned_u64(&src->tx_errors);
1466 dst->rx_dropped = get_unaligned_u64(&src->rx_dropped);
1467 dst->tx_dropped = get_unaligned_u64(&src->tx_dropped);
1469 dst->collisions = 0;
1470 dst->rx_length_errors = 0;
1471 dst->rx_over_errors = 0;
1472 dst->rx_crc_errors = 0;
1473 dst->rx_frame_errors = 0;
1474 dst->rx_fifo_errors = 0;
1475 dst->rx_missed_errors = 0;
1476 dst->tx_aborted_errors = 0;
1477 dst->tx_carrier_errors = 0;
1478 dst->tx_fifo_errors = 0;
1479 dst->tx_heartbeat_errors = 0;
1480 dst->tx_window_errors = 0;
1484 get_stats_via_vport__(const struct netdev *netdev, struct netdev_stats *stats)
1486 struct dpif_linux_vport reply;
1490 error = dpif_linux_vport_get(netdev_get_name(netdev), &reply, &buf);
1493 } else if (!reply.stats) {
1498 netdev_stats_from_ovs_vport_stats(stats, reply.stats);
1506 get_stats_via_vport(const struct netdev *netdev_,
1507 struct netdev_stats *stats)
1509 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1511 if (!netdev->vport_stats_error ||
1512 !(netdev->cache_valid & VALID_VPORT_STAT_ERROR)) {
1515 error = get_stats_via_vport__(netdev_, stats);
1516 if (error && error != ENOENT) {
1517 VLOG_WARN_RL(&rl, "%s: obtaining netdev stats via vport failed "
1519 netdev_get_name(netdev_), ovs_strerror(error));
1521 netdev->vport_stats_error = error;
1522 netdev->cache_valid |= VALID_VPORT_STAT_ERROR;
1526 /* Retrieves current device stats for 'netdev-linux'. */
1528 netdev_linux_get_stats(const struct netdev *netdev_,
1529 struct netdev_stats *stats)
1531 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1532 struct netdev_stats dev_stats;
1535 ovs_mutex_lock(&netdev->mutex);
1536 get_stats_via_vport(netdev_, stats);
1537 error = get_stats_via_netlink(netdev_, &dev_stats);
1539 if (!netdev->vport_stats_error) {
1542 } else if (netdev->vport_stats_error) {
1543 /* stats not available from OVS then use ioctl stats. */
1546 stats->rx_errors += dev_stats.rx_errors;
1547 stats->tx_errors += dev_stats.tx_errors;
1548 stats->rx_dropped += dev_stats.rx_dropped;
1549 stats->tx_dropped += dev_stats.tx_dropped;
1550 stats->multicast += dev_stats.multicast;
1551 stats->collisions += dev_stats.collisions;
1552 stats->rx_length_errors += dev_stats.rx_length_errors;
1553 stats->rx_over_errors += dev_stats.rx_over_errors;
1554 stats->rx_crc_errors += dev_stats.rx_crc_errors;
1555 stats->rx_frame_errors += dev_stats.rx_frame_errors;
1556 stats->rx_fifo_errors += dev_stats.rx_fifo_errors;
1557 stats->rx_missed_errors += dev_stats.rx_missed_errors;
1558 stats->tx_aborted_errors += dev_stats.tx_aborted_errors;
1559 stats->tx_carrier_errors += dev_stats.tx_carrier_errors;
1560 stats->tx_fifo_errors += dev_stats.tx_fifo_errors;
1561 stats->tx_heartbeat_errors += dev_stats.tx_heartbeat_errors;
1562 stats->tx_window_errors += dev_stats.tx_window_errors;
1564 ovs_mutex_unlock(&netdev->mutex);
1569 /* Retrieves current device stats for 'netdev-tap' netdev or
1570 * netdev-internal. */
1572 netdev_tap_get_stats(const struct netdev *netdev_, struct netdev_stats *stats)
1574 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1575 struct netdev_stats dev_stats;
1578 ovs_mutex_lock(&netdev->mutex);
1579 get_stats_via_vport(netdev_, stats);
1580 error = get_stats_via_netlink(netdev_, &dev_stats);
1582 if (!netdev->vport_stats_error) {
1585 } else if (netdev->vport_stats_error) {
1586 /* Transmit and receive stats will appear to be swapped relative to the
1587 * other ports since we are the one sending the data, not a remote
1588 * computer. For consistency, we swap them back here. This does not
1589 * apply if we are getting stats from the vport layer because it always
1590 * tracks stats from the perspective of the switch. */
1593 swap_uint64(&stats->rx_packets, &stats->tx_packets);
1594 swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
1595 swap_uint64(&stats->rx_errors, &stats->tx_errors);
1596 swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
1597 stats->rx_length_errors = 0;
1598 stats->rx_over_errors = 0;
1599 stats->rx_crc_errors = 0;
1600 stats->rx_frame_errors = 0;
1601 stats->rx_fifo_errors = 0;
1602 stats->rx_missed_errors = 0;
1603 stats->tx_aborted_errors = 0;
1604 stats->tx_carrier_errors = 0;
1605 stats->tx_fifo_errors = 0;
1606 stats->tx_heartbeat_errors = 0;
1607 stats->tx_window_errors = 0;
1609 stats->rx_dropped += dev_stats.tx_dropped;
1610 stats->tx_dropped += dev_stats.rx_dropped;
1612 stats->rx_errors += dev_stats.tx_errors;
1613 stats->tx_errors += dev_stats.rx_errors;
1615 stats->multicast += dev_stats.multicast;
1616 stats->collisions += dev_stats.collisions;
1618 ovs_mutex_unlock(&netdev->mutex);
1624 netdev_internal_get_stats(const struct netdev *netdev_,
1625 struct netdev_stats *stats)
1627 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1630 ovs_mutex_lock(&netdev->mutex);
1631 get_stats_via_vport(netdev_, stats);
1632 error = netdev->vport_stats_error;
1633 ovs_mutex_unlock(&netdev->mutex);
1639 netdev_internal_set_stats(struct netdev *netdev,
1640 const struct netdev_stats *stats)
1642 struct ovs_vport_stats vport_stats;
1643 struct dpif_linux_vport vport;
1646 vport_stats.rx_packets = stats->rx_packets;
1647 vport_stats.tx_packets = stats->tx_packets;
1648 vport_stats.rx_bytes = stats->rx_bytes;
1649 vport_stats.tx_bytes = stats->tx_bytes;
1650 vport_stats.rx_errors = stats->rx_errors;
1651 vport_stats.tx_errors = stats->tx_errors;
1652 vport_stats.rx_dropped = stats->rx_dropped;
1653 vport_stats.tx_dropped = stats->tx_dropped;
1655 dpif_linux_vport_init(&vport);
1656 vport.cmd = OVS_VPORT_CMD_SET;
1657 vport.name = netdev_get_name(netdev);
1658 vport.stats = &vport_stats;
1660 err = dpif_linux_vport_transact(&vport, NULL, NULL);
1662 /* If the vport layer doesn't know about the device, that doesn't mean it
1663 * doesn't exist (after all were able to open it when netdev_open() was
1664 * called), it just means that it isn't attached and we'll be getting
1665 * stats a different way. */
1666 if (err == ENODEV) {
1674 netdev_linux_read_features(struct netdev_linux *netdev)
1676 struct ethtool_cmd ecmd;
1680 if (netdev->cache_valid & VALID_FEATURES) {
1684 COVERAGE_INC(netdev_get_ethtool);
1685 memset(&ecmd, 0, sizeof ecmd);
1686 error = netdev_linux_do_ethtool(netdev->up.name, &ecmd,
1687 ETHTOOL_GSET, "ETHTOOL_GSET");
1692 /* Supported features. */
1693 netdev->supported = 0;
1694 if (ecmd.supported & SUPPORTED_10baseT_Half) {
1695 netdev->supported |= NETDEV_F_10MB_HD;
1697 if (ecmd.supported & SUPPORTED_10baseT_Full) {
1698 netdev->supported |= NETDEV_F_10MB_FD;
1700 if (ecmd.supported & SUPPORTED_100baseT_Half) {
1701 netdev->supported |= NETDEV_F_100MB_HD;
1703 if (ecmd.supported & SUPPORTED_100baseT_Full) {
1704 netdev->supported |= NETDEV_F_100MB_FD;
1706 if (ecmd.supported & SUPPORTED_1000baseT_Half) {
1707 netdev->supported |= NETDEV_F_1GB_HD;
1709 if (ecmd.supported & SUPPORTED_1000baseT_Full) {
1710 netdev->supported |= NETDEV_F_1GB_FD;
1712 if (ecmd.supported & SUPPORTED_10000baseT_Full) {
1713 netdev->supported |= NETDEV_F_10GB_FD;
1715 if (ecmd.supported & SUPPORTED_TP) {
1716 netdev->supported |= NETDEV_F_COPPER;
1718 if (ecmd.supported & SUPPORTED_FIBRE) {
1719 netdev->supported |= NETDEV_F_FIBER;
1721 if (ecmd.supported & SUPPORTED_Autoneg) {
1722 netdev->supported |= NETDEV_F_AUTONEG;
1724 if (ecmd.supported & SUPPORTED_Pause) {
1725 netdev->supported |= NETDEV_F_PAUSE;
1727 if (ecmd.supported & SUPPORTED_Asym_Pause) {
1728 netdev->supported |= NETDEV_F_PAUSE_ASYM;
1731 /* Advertised features. */
1732 netdev->advertised = 0;
1733 if (ecmd.advertising & ADVERTISED_10baseT_Half) {
1734 netdev->advertised |= NETDEV_F_10MB_HD;
1736 if (ecmd.advertising & ADVERTISED_10baseT_Full) {
1737 netdev->advertised |= NETDEV_F_10MB_FD;
1739 if (ecmd.advertising & ADVERTISED_100baseT_Half) {
1740 netdev->advertised |= NETDEV_F_100MB_HD;
1742 if (ecmd.advertising & ADVERTISED_100baseT_Full) {
1743 netdev->advertised |= NETDEV_F_100MB_FD;
1745 if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
1746 netdev->advertised |= NETDEV_F_1GB_HD;
1748 if (ecmd.advertising & ADVERTISED_1000baseT_Full) {
1749 netdev->advertised |= NETDEV_F_1GB_FD;
1751 if (ecmd.advertising & ADVERTISED_10000baseT_Full) {
1752 netdev->advertised |= NETDEV_F_10GB_FD;
1754 if (ecmd.advertising & ADVERTISED_TP) {
1755 netdev->advertised |= NETDEV_F_COPPER;
1757 if (ecmd.advertising & ADVERTISED_FIBRE) {
1758 netdev->advertised |= NETDEV_F_FIBER;
1760 if (ecmd.advertising & ADVERTISED_Autoneg) {
1761 netdev->advertised |= NETDEV_F_AUTONEG;
1763 if (ecmd.advertising & ADVERTISED_Pause) {
1764 netdev->advertised |= NETDEV_F_PAUSE;
1766 if (ecmd.advertising & ADVERTISED_Asym_Pause) {
1767 netdev->advertised |= NETDEV_F_PAUSE_ASYM;
1770 /* Current settings. */
1772 if (speed == SPEED_10) {
1773 netdev->current = ecmd.duplex ? NETDEV_F_10MB_FD : NETDEV_F_10MB_HD;
1774 } else if (speed == SPEED_100) {
1775 netdev->current = ecmd.duplex ? NETDEV_F_100MB_FD : NETDEV_F_100MB_HD;
1776 } else if (speed == SPEED_1000) {
1777 netdev->current = ecmd.duplex ? NETDEV_F_1GB_FD : NETDEV_F_1GB_HD;
1778 } else if (speed == SPEED_10000) {
1779 netdev->current = NETDEV_F_10GB_FD;
1780 } else if (speed == 40000) {
1781 netdev->current = NETDEV_F_40GB_FD;
1782 } else if (speed == 100000) {
1783 netdev->current = NETDEV_F_100GB_FD;
1784 } else if (speed == 1000000) {
1785 netdev->current = NETDEV_F_1TB_FD;
1787 netdev->current = 0;
1790 if (ecmd.port == PORT_TP) {
1791 netdev->current |= NETDEV_F_COPPER;
1792 } else if (ecmd.port == PORT_FIBRE) {
1793 netdev->current |= NETDEV_F_FIBER;
1797 netdev->current |= NETDEV_F_AUTONEG;
1801 netdev->cache_valid |= VALID_FEATURES;
1802 netdev->get_features_error = error;
1805 /* Stores the features supported by 'netdev' into of '*current', '*advertised',
1806 * '*supported', and '*peer'. Each value is a bitmap of NETDEV_* bits.
1807 * Returns 0 if successful, otherwise a positive errno value. */
1809 netdev_linux_get_features(const struct netdev *netdev_,
1810 enum netdev_features *current,
1811 enum netdev_features *advertised,
1812 enum netdev_features *supported,
1813 enum netdev_features *peer)
1815 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1818 ovs_mutex_lock(&netdev->mutex);
1819 netdev_linux_read_features(netdev);
1820 if (!netdev->get_features_error) {
1821 *current = netdev->current;
1822 *advertised = netdev->advertised;
1823 *supported = netdev->supported;
1824 *peer = 0; /* XXX */
1826 error = netdev->get_features_error;
1827 ovs_mutex_unlock(&netdev->mutex);
1832 /* Set the features advertised by 'netdev' to 'advertise'. */
1834 netdev_linux_set_advertisements(struct netdev *netdev_,
1835 enum netdev_features advertise)
1837 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1838 struct ethtool_cmd ecmd;
1841 ovs_mutex_lock(&netdev->mutex);
1843 COVERAGE_INC(netdev_get_ethtool);
1844 memset(&ecmd, 0, sizeof ecmd);
1845 error = netdev_linux_do_ethtool(netdev_get_name(netdev_), &ecmd,
1846 ETHTOOL_GSET, "ETHTOOL_GSET");
1851 ecmd.advertising = 0;
1852 if (advertise & NETDEV_F_10MB_HD) {
1853 ecmd.advertising |= ADVERTISED_10baseT_Half;
1855 if (advertise & NETDEV_F_10MB_FD) {
1856 ecmd.advertising |= ADVERTISED_10baseT_Full;
1858 if (advertise & NETDEV_F_100MB_HD) {
1859 ecmd.advertising |= ADVERTISED_100baseT_Half;
1861 if (advertise & NETDEV_F_100MB_FD) {
1862 ecmd.advertising |= ADVERTISED_100baseT_Full;
1864 if (advertise & NETDEV_F_1GB_HD) {
1865 ecmd.advertising |= ADVERTISED_1000baseT_Half;
1867 if (advertise & NETDEV_F_1GB_FD) {
1868 ecmd.advertising |= ADVERTISED_1000baseT_Full;
1870 if (advertise & NETDEV_F_10GB_FD) {
1871 ecmd.advertising |= ADVERTISED_10000baseT_Full;
1873 if (advertise & NETDEV_F_COPPER) {
1874 ecmd.advertising |= ADVERTISED_TP;
1876 if (advertise & NETDEV_F_FIBER) {
1877 ecmd.advertising |= ADVERTISED_FIBRE;
1879 if (advertise & NETDEV_F_AUTONEG) {
1880 ecmd.advertising |= ADVERTISED_Autoneg;
1882 if (advertise & NETDEV_F_PAUSE) {
1883 ecmd.advertising |= ADVERTISED_Pause;
1885 if (advertise & NETDEV_F_PAUSE_ASYM) {
1886 ecmd.advertising |= ADVERTISED_Asym_Pause;
1888 COVERAGE_INC(netdev_set_ethtool);
1889 error = netdev_linux_do_ethtool(netdev_get_name(netdev_), &ecmd,
1890 ETHTOOL_SSET, "ETHTOOL_SSET");
1893 ovs_mutex_unlock(&netdev->mutex);
1897 /* Attempts to set input rate limiting (policing) policy. Returns 0 if
1898 * successful, otherwise a positive errno value. */
1900 netdev_linux_set_policing(struct netdev *netdev_,
1901 uint32_t kbits_rate, uint32_t kbits_burst)
1903 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1904 const char *netdev_name = netdev_get_name(netdev_);
1907 kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
1908 : !kbits_burst ? 1000 /* Default to 1000 kbits if 0. */
1909 : kbits_burst); /* Stick with user-specified value. */
1911 ovs_mutex_lock(&netdev->mutex);
1912 if (netdev->cache_valid & VALID_POLICING) {
1913 error = netdev->netdev_policing_error;
1914 if (error || (netdev->kbits_rate == kbits_rate &&
1915 netdev->kbits_burst == kbits_burst)) {
1916 /* Assume that settings haven't changed since we last set them. */
1919 netdev->cache_valid &= ~VALID_POLICING;
1922 COVERAGE_INC(netdev_set_policing);
1923 /* Remove any existing ingress qdisc. */
1924 error = tc_add_del_ingress_qdisc(netdev_, false);
1926 VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
1927 netdev_name, ovs_strerror(error));
1932 error = tc_add_del_ingress_qdisc(netdev_, true);
1934 VLOG_WARN_RL(&rl, "%s: adding policing qdisc failed: %s",
1935 netdev_name, ovs_strerror(error));
1939 error = tc_add_policer(netdev_, kbits_rate, kbits_burst);
1941 VLOG_WARN_RL(&rl, "%s: adding policing action failed: %s",
1942 netdev_name, ovs_strerror(error));
1947 netdev->kbits_rate = kbits_rate;
1948 netdev->kbits_burst = kbits_burst;
1951 if (!error || error == ENODEV) {
1952 netdev->netdev_policing_error = error;
1953 netdev->cache_valid |= VALID_POLICING;
1955 ovs_mutex_unlock(&netdev->mutex);
1960 netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
1963 const struct tc_ops *const *opsp;
1965 for (opsp = tcs; *opsp != NULL; opsp++) {
1966 const struct tc_ops *ops = *opsp;
1967 if (ops->tc_install && ops->ovs_name[0] != '\0') {
1968 sset_add(types, ops->ovs_name);
1974 static const struct tc_ops *
1975 tc_lookup_ovs_name(const char *name)
1977 const struct tc_ops *const *opsp;
1979 for (opsp = tcs; *opsp != NULL; opsp++) {
1980 const struct tc_ops *ops = *opsp;
1981 if (!strcmp(name, ops->ovs_name)) {
1988 static const struct tc_ops *
1989 tc_lookup_linux_name(const char *name)
1991 const struct tc_ops *const *opsp;
1993 for (opsp = tcs; *opsp != NULL; opsp++) {
1994 const struct tc_ops *ops = *opsp;
1995 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
2002 static struct tc_queue *
2003 tc_find_queue__(const struct netdev *netdev_, unsigned int queue_id,
2006 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2007 struct tc_queue *queue;
2009 HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev->tc->queues) {
2010 if (queue->queue_id == queue_id) {
/* Convenience wrapper around tc_find_queue__() that computes the hash. */
static struct tc_queue *
tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
{
    return tc_find_queue__(netdev, queue_id, hash_int(queue_id, 0));
}
2024 netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
2026 struct netdev_qos_capabilities *caps)
2028 const struct tc_ops *ops = tc_lookup_ovs_name(type);
2032 caps->n_queues = ops->n_queues;
2037 netdev_linux_get_qos(const struct netdev *netdev_,
2038 const char **typep, struct smap *details)
2040 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2043 ovs_mutex_lock(&netdev->mutex);
2044 error = tc_query_qdisc(netdev_);
2046 *typep = netdev->tc->ops->ovs_name;
2047 error = (netdev->tc->ops->qdisc_get
2048 ? netdev->tc->ops->qdisc_get(netdev_, details)
2051 ovs_mutex_unlock(&netdev->mutex);
2057 netdev_linux_set_qos(struct netdev *netdev_,
2058 const char *type, const struct smap *details)
2060 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2061 const struct tc_ops *new_ops;
2064 new_ops = tc_lookup_ovs_name(type);
2065 if (!new_ops || !new_ops->tc_install) {
2069 ovs_mutex_lock(&netdev->mutex);
2070 error = tc_query_qdisc(netdev_);
2075 if (new_ops == netdev->tc->ops) {
2076 error = new_ops->qdisc_set ? new_ops->qdisc_set(netdev_, details) : 0;
2078 /* Delete existing qdisc. */
2079 error = tc_del_qdisc(netdev_);
2083 ovs_assert(netdev->tc == NULL);
2085 /* Install new qdisc. */
2086 error = new_ops->tc_install(netdev_, details);
2087 ovs_assert((error == 0) == (netdev->tc != NULL));
2091 ovs_mutex_unlock(&netdev->mutex);
2096 netdev_linux_get_queue(const struct netdev *netdev_,
2097 unsigned int queue_id, struct smap *details)
2099 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2102 ovs_mutex_lock(&netdev->mutex);
2103 error = tc_query_qdisc(netdev_);
2105 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2107 ? netdev->tc->ops->class_get(netdev_, queue, details)
2110 ovs_mutex_unlock(&netdev->mutex);
2116 netdev_linux_set_queue(struct netdev *netdev_,
2117 unsigned int queue_id, const struct smap *details)
2119 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2122 ovs_mutex_lock(&netdev->mutex);
2123 error = tc_query_qdisc(netdev_);
2125 error = (queue_id < netdev->tc->ops->n_queues
2126 && netdev->tc->ops->class_set
2127 ? netdev->tc->ops->class_set(netdev_, queue_id, details)
2130 ovs_mutex_unlock(&netdev->mutex);
2136 netdev_linux_delete_queue(struct netdev *netdev_, unsigned int queue_id)
2138 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2141 ovs_mutex_lock(&netdev->mutex);
2142 error = tc_query_qdisc(netdev_);
2144 if (netdev->tc->ops->class_delete) {
2145 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2147 ? netdev->tc->ops->class_delete(netdev_, queue)
2153 ovs_mutex_unlock(&netdev->mutex);
2159 netdev_linux_get_queue_stats(const struct netdev *netdev_,
2160 unsigned int queue_id,
2161 struct netdev_queue_stats *stats)
2163 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2166 ovs_mutex_lock(&netdev->mutex);
2167 error = tc_query_qdisc(netdev_);
2169 if (netdev->tc->ops->class_get_stats) {
2170 const struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2172 stats->created = queue->created;
2173 error = netdev->tc->ops->class_get_stats(netdev_, queue,
2182 ovs_mutex_unlock(&netdev->mutex);
2188 start_queue_dump(const struct netdev *netdev, struct nl_dump *dump)
2190 struct ofpbuf request;
2191 struct tcmsg *tcmsg;
2193 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
2197 tcmsg->tcm_parent = 0;
2198 nl_dump_start(dump, NETLINK_ROUTE, &request);
2199 ofpbuf_uninit(&request);
/* Iterator state shared by netdev_linux_queue_dump_{start,next,done}(). */
struct netdev_linux_queue_state {
    unsigned int *queues;       /* Queue ids snapshotted at dump start. */
    size_t cur_queue;           /* Next index into 'queues' to visit. */
    size_t n_queues;            /* Number of elements in 'queues'. */
};
2210 netdev_linux_queue_dump_start(const struct netdev *netdev_, void **statep)
2212 const struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2215 ovs_mutex_lock(&netdev->mutex);
2216 error = tc_query_qdisc(netdev_);
2218 if (netdev->tc->ops->class_get) {
2219 struct netdev_linux_queue_state *state;
2220 struct tc_queue *queue;
2223 *statep = state = xmalloc(sizeof *state);
2224 state->n_queues = hmap_count(&netdev->tc->queues);
2225 state->cur_queue = 0;
2226 state->queues = xmalloc(state->n_queues * sizeof *state->queues);
2229 HMAP_FOR_EACH (queue, hmap_node, &netdev->tc->queues) {
2230 state->queues[i++] = queue->queue_id;
2236 ovs_mutex_unlock(&netdev->mutex);
2242 netdev_linux_queue_dump_next(const struct netdev *netdev_, void *state_,
2243 unsigned int *queue_idp, struct smap *details)
2245 const struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2246 struct netdev_linux_queue_state *state = state_;
2249 ovs_mutex_lock(&netdev->mutex);
2250 while (state->cur_queue < state->n_queues) {
2251 unsigned int queue_id = state->queues[state->cur_queue++];
2252 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2255 *queue_idp = queue_id;
2256 error = netdev->tc->ops->class_get(netdev_, queue, details);
2260 ovs_mutex_unlock(&netdev->mutex);
2266 netdev_linux_queue_dump_done(const struct netdev *netdev OVS_UNUSED,
2269 struct netdev_linux_queue_state *state = state_;
2271 free(state->queues);
2277 netdev_linux_dump_queue_stats(const struct netdev *netdev_,
2278 netdev_dump_queue_stats_cb *cb, void *aux)
2280 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2283 ovs_mutex_lock(&netdev->mutex);
2284 error = tc_query_qdisc(netdev_);
2286 struct nl_dump dump;
2288 if (!netdev->tc->ops->class_dump_stats) {
2290 } else if (!start_queue_dump(netdev_, &dump)) {
2296 while (nl_dump_next(&dump, &msg)) {
2297 retval = netdev->tc->ops->class_dump_stats(netdev_, &msg,
2304 retval = nl_dump_done(&dump);
2310 ovs_mutex_unlock(&netdev->mutex);
2316 netdev_linux_get_in4(const struct netdev *netdev_,
2317 struct in_addr *address, struct in_addr *netmask)
2319 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2322 ovs_mutex_lock(&netdev->mutex);
2323 if (!(netdev->cache_valid & VALID_IN4)) {
2324 error = netdev_linux_get_ipv4(netdev_, &netdev->address,
2325 SIOCGIFADDR, "SIOCGIFADDR");
2327 error = netdev_linux_get_ipv4(netdev_, &netdev->netmask,
2328 SIOCGIFNETMASK, "SIOCGIFNETMASK");
2330 netdev->cache_valid |= VALID_IN4;
2338 if (netdev->address.s_addr != INADDR_ANY) {
2339 *address = netdev->address;
2340 *netmask = netdev->netmask;
2342 error = EADDRNOTAVAIL;
2345 ovs_mutex_unlock(&netdev->mutex);
2351 netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
2352 struct in_addr netmask)
2354 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2357 ovs_mutex_lock(&netdev->mutex);
2358 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
2360 netdev->cache_valid |= VALID_IN4;
2361 netdev->address = address;
2362 netdev->netmask = netmask;
2363 if (address.s_addr != INADDR_ANY) {
2364 error = do_set_addr(netdev_, SIOCSIFNETMASK,
2365 "SIOCSIFNETMASK", netmask);
2368 ovs_mutex_unlock(&netdev->mutex);
/* Parses one line of /proc/net/if_inet6: 32 hex digits of address followed by
 * four numeric fields and the interface name.  On success stores the address
 * in '*in6' and the name in 'ifname' and returns true; otherwise returns
 * false. */
static bool
parse_if_inet6_line(const char *line,
                    struct in6_addr *in6, char ifname[16 + 1])
{
    uint8_t *s6 = in6->s6_addr;
#define X8 "%2"SCNx8
    return ovs_scan(line,
                    " "X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8
                    "%*x %*x %*x %*x %16s\n",
                    &s6[0], &s6[1], &s6[2], &s6[3],
                    &s6[4], &s6[5], &s6[6], &s6[7],
                    &s6[8], &s6[9], &s6[10], &s6[11],
                    &s6[12], &s6[13], &s6[14], &s6[15],
                    ifname);
#undef X8
}
2389 /* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address (if
2390 * 'in6' is non-null) and returns true. Otherwise, returns false. */
2392 netdev_linux_get_in6(const struct netdev *netdev_, struct in6_addr *in6)
2394 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2396 ovs_mutex_lock(&netdev->mutex);
2397 if (!(netdev->cache_valid & VALID_IN6)) {
2401 netdev->in6 = in6addr_any;
2403 file = fopen("/proc/net/if_inet6", "r");
2405 const char *name = netdev_get_name(netdev_);
2406 while (fgets(line, sizeof line, file)) {
2407 struct in6_addr in6_tmp;
2408 char ifname[16 + 1];
2409 if (parse_if_inet6_line(line, &in6_tmp, ifname)
2410 && !strcmp(name, ifname))
2412 netdev->in6 = in6_tmp;
2418 netdev->cache_valid |= VALID_IN6;
2421 ovs_mutex_unlock(&netdev->mutex);
/* Fills '*sa' with an AF_INET sockaddr holding 'addr' and port 0. */
static void
make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
{
    struct sockaddr_in sin;
    memset(&sin, 0, sizeof sin);
    sin.sin_family = AF_INET;
    sin.sin_addr = addr;
    sin.sin_port = 0;

    /* Clear the whole sockaddr first since it is larger than sockaddr_in. */
    memset(sa, 0, sizeof *sa);
    memcpy(sa, &sin, sizeof sin);
}
2440 do_set_addr(struct netdev *netdev,
2441 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
2445 make_in4_sockaddr(&ifr.ifr_addr, addr);
2446 return af_inet_ifreq_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
2450 /* Adds 'router' as a default IP gateway. */
2452 netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
2454 struct in_addr any = { INADDR_ANY };
2458 memset(&rt, 0, sizeof rt);
2459 make_in4_sockaddr(&rt.rt_dst, any);
2460 make_in4_sockaddr(&rt.rt_gateway, router);
2461 make_in4_sockaddr(&rt.rt_genmask, any);
2462 rt.rt_flags = RTF_UP | RTF_GATEWAY;
2463 error = af_inet_ioctl(SIOCADDRT, &rt);
2465 VLOG_WARN("ioctl(SIOCADDRT): %s", ovs_strerror(error));
2471 netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
2474 static const char fn[] = "/proc/net/route";
2479 *netdev_name = NULL;
2480 stream = fopen(fn, "r");
2481 if (stream == NULL) {
2482 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, ovs_strerror(errno));
2487 while (fgets(line, sizeof line, stream)) {
2490 ovs_be32 dest, gateway, mask;
2491 int refcnt, metric, mtu;
2492 unsigned int flags, use, window, irtt;
2495 "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
2497 iface, &dest, &gateway, &flags, &refcnt,
2498 &use, &metric, &mask, &mtu, &window, &irtt)) {
2499 VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
2503 if (!(flags & RTF_UP)) {
2504 /* Skip routes that aren't up. */
2508 /* The output of 'dest', 'mask', and 'gateway' were given in
2509 * network byte order, so we don't need need any endian
2510 * conversions here. */
2511 if ((dest & mask) == (host->s_addr & mask)) {
2513 /* The host is directly reachable. */
2514 next_hop->s_addr = 0;
2516 /* To reach the host, we must go through a gateway. */
2517 next_hop->s_addr = gateway;
2519 *netdev_name = xstrdup(iface);
2531 netdev_linux_get_status(const struct netdev *netdev_, struct smap *smap)
/* Fills 'smap' with driver name/version and firmware version obtained via
 * ETHTOOL_GDRVINFO.  The result is cached in netdev->drvinfo under the
 * VALID_DRVINFO bit so the ioctl runs at most once per device. */
2533 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2536 ovs_mutex_lock(&netdev->mutex);
2537 if (!(netdev->cache_valid & VALID_DRVINFO)) {
/* netdev_linux_do_ethtool() expects a struct ethtool_cmd, so alias the
 * drvinfo buffer onto one. */
2538 struct ethtool_cmd *cmd = (struct ethtool_cmd *) &netdev->drvinfo;
2540 COVERAGE_INC(netdev_get_ethtool);
2541 memset(&netdev->drvinfo, 0, sizeof netdev->drvinfo);
2542 error = netdev_linux_do_ethtool(netdev->up.name,
2545 "ETHTOOL_GDRVINFO");
2547 netdev->cache_valid |= VALID_DRVINFO;
2552 smap_add(smap, "driver_name", netdev->drvinfo.driver);
2553 smap_add(smap, "driver_version", netdev->drvinfo.version);
2554 smap_add(smap, "firmware_version", netdev->drvinfo.fw_version);
2556 ovs_mutex_unlock(&netdev->mutex);
/* Status for "internal" devices: no ethtool support, fixed driver name. */
2562 netdev_internal_get_status(const struct netdev *netdev OVS_UNUSED,
2565 smap_add(smap, "driver_name", "openvswitch");
2569 /* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
2570 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
2571 * returns 0. Otherwise, it returns a positive errno value; in particular,
2572 * ENXIO indicates that there is no ARP table entry for 'ip' on 'netdev'. */
2574 netdev_linux_arp_lookup(const struct netdev *netdev,
2575 ovs_be32 ip, uint8_t mac[ETH_ADDR_LEN])
2578 struct sockaddr_in sin;
/* Build the SIOCGARP request: protocol address = 'ip', queried on the
 * device named after this netdev. */
2581 memset(&r, 0, sizeof r);
2582 memset(&sin, 0, sizeof sin);
2583 sin.sin_family = AF_INET;
2584 sin.sin_addr.s_addr = ip;
2586 memcpy(&r.arp_pa, &sin, sizeof sin);
2587 r.arp_ha.sa_family = ARPHRD_ETHER;
2589 ovs_strzcpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
2590 COVERAGE_INC(netdev_arp_lookup);
2591 retval = af_inet_ioctl(SIOCGARP, &r);
2593 memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
/* ENXIO ("no such entry") is an expected outcome, so only log other errors. */
2594 } else if (retval != ENXIO) {
2595 VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
2596 netdev_get_name(netdev), IP_ARGS(ip),
2597 ovs_strerror(retval));
/* Converts netdev flag bits (NETDEV_*) to Linux interface flags (IFF_*). */
2603 nd_to_iff_flags(enum netdev_flags nd)
2606 if (nd & NETDEV_UP) {
2609 if (nd & NETDEV_PROMISC) {
2612 if (nd & NETDEV_LOOPBACK) {
2613 iff |= IFF_LOOPBACK;
/* Inverse of nd_to_iff_flags(): Linux IFF_* bits back to NETDEV_* bits. */
2619 iff_to_nd_flags(int iff)
2621 enum netdev_flags nd = 0;
2625 if (iff & IFF_PROMISC) {
2626 nd |= NETDEV_PROMISC;
2628 if (iff & IFF_LOOPBACK) {
2629 nd |= NETDEV_LOOPBACK;
/* Turns 'off' flags off and 'on' flags on for 'netdev', reporting the flags
 * that were previously set in '*old_flagsp'.  Caller must hold
 * netdev->mutex (enforced by the OVS_REQUIRES annotation). */
2635 update_flags(struct netdev_linux *netdev, enum netdev_flags off,
2636 enum netdev_flags on, enum netdev_flags *old_flagsp)
2637 OVS_REQUIRES(netdev->mutex)
2639 int old_flags, new_flags;
2642 old_flags = netdev->ifi_flags;
2643 *old_flagsp = iff_to_nd_flags(old_flags);
2644 new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
/* Only touch the kernel when something actually changes, then re-read the
 * flags so cached ifi_flags reflects what the kernel accepted. */
2645 if (new_flags != old_flags) {
2646 error = set_flags(netdev_get_name(&netdev->up), new_flags);
2647 get_flags(&netdev->up, &netdev->ifi_flags);
/* Public netdev_class entry point: wraps update_flags() with the mutex. */
2654 netdev_linux_update_flags(struct netdev *netdev_, enum netdev_flags off,
2655 enum netdev_flags on, enum netdev_flags *old_flagsp)
2657 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2660 ovs_mutex_lock(&netdev->mutex);
2661 error = update_flags(netdev, off, on, old_flagsp);
2662 ovs_mutex_unlock(&netdev->mutex);
/* Expands to a struct netdev_class initializer shared by the "system", "tap",
 * and "internal" netdev classes below.  The macro parameters supply the few
 * vtable slots that differ between the three classes (constructor, stats,
 * features, status); all other slots are common netdev_linux_*
 * implementations or NULL for unsupported operations.
 * (No comments may be inserted inside the macro body: every line ends in a
 * '\' continuation.) */
2667 #define NETDEV_LINUX_CLASS(NAME, CONSTRUCT, GET_STATS, SET_STATS, \
2668 GET_FEATURES, GET_STATUS) \
2674 netdev_linux_wait, \
2676 netdev_linux_alloc, \
2678 netdev_linux_destruct, \
2679 netdev_linux_dealloc, \
2680 NULL, /* get_config */ \
2681 NULL, /* set_config */ \
2682 NULL, /* get_tunnel_config */ \
2684 netdev_linux_send, \
2685 netdev_linux_send_wait, \
2687 netdev_linux_set_etheraddr, \
2688 netdev_linux_get_etheraddr, \
2689 netdev_linux_get_mtu, \
2690 netdev_linux_set_mtu, \
2691 netdev_linux_get_ifindex, \
2692 netdev_linux_get_carrier, \
2693 netdev_linux_get_carrier_resets, \
2694 netdev_linux_set_miimon_interval, \
2699 netdev_linux_set_advertisements, \
2701 netdev_linux_set_policing, \
2702 netdev_linux_get_qos_types, \
2703 netdev_linux_get_qos_capabilities, \
2704 netdev_linux_get_qos, \
2705 netdev_linux_set_qos, \
2706 netdev_linux_get_queue, \
2707 netdev_linux_set_queue, \
2708 netdev_linux_delete_queue, \
2709 netdev_linux_get_queue_stats, \
2710 netdev_linux_queue_dump_start, \
2711 netdev_linux_queue_dump_next, \
2712 netdev_linux_queue_dump_done, \
2713 netdev_linux_dump_queue_stats, \
2715 netdev_linux_get_in4, \
2716 netdev_linux_set_in4, \
2717 netdev_linux_get_in6, \
2718 netdev_linux_add_router, \
2719 netdev_linux_get_next_hop, \
2721 netdev_linux_arp_lookup, \
2723 netdev_linux_update_flags, \
2725 netdev_linux_rx_alloc, \
2726 netdev_linux_rx_construct, \
2727 netdev_linux_rx_destruct, \
2728 netdev_linux_rx_dealloc, \
2729 netdev_linux_rx_recv, \
2730 netdev_linux_rx_wait, \
2731 netdev_linux_rx_drain, \
/* "system" devices: ordinary kernel network devices. */
2734 const struct netdev_class netdev_linux_class =
2737 netdev_linux_construct,
2738 netdev_linux_get_stats,
2739 NULL, /* set_stats */
2740 netdev_linux_get_features,
2741 netdev_linux_get_status);
/* "tap" devices: TUN/TAP devices created via /dev/net/tun; they use a
 * dedicated constructor and stats reader. */
2743 const struct netdev_class netdev_tap_class =
2746 netdev_linux_construct_tap,
2747 netdev_tap_get_stats,
2748 NULL, /* set_stats */
2749 netdev_linux_get_features,
2750 netdev_linux_get_status);
/* "internal" devices: created by the openvswitch kernel module; stats are
 * settable and there is no meaningful link-features query. */
2752 const struct netdev_class netdev_internal_class =
2755 netdev_linux_construct,
2756 netdev_internal_get_stats,
2757 netdev_internal_set_stats,
2758 NULL, /* get_features */
2759 netdev_internal_get_status);
2761 /* HTB traffic control class. */
/* OVS queue ids 0..HTB_N_QUEUES-1 map onto HTB class minor numbers
 * 1..HTB_N_QUEUES under major 1. */
2763 #define HTB_N_QUEUES 0xf000
/* Qdisc-level state: the configured link ceiling. */
2767 unsigned int max_rate; /* In bytes/s. */
/* Per-queue (HTB class) parameters; hashed into tc.queues via tc_queue. */
2771 struct tc_queue tc_queue;
2772 unsigned int min_rate; /* In bytes/s. */
2773 unsigned int max_rate; /* In bytes/s. */
2774 unsigned int burst; /* In bytes. */
2775 unsigned int priority; /* Lower values are higher priorities. */
/* Downcasts the netdev's generic 'tc' pointer to its containing struct htb. */
2779 htb_get__(const struct netdev *netdev_)
2781 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2782 return CONTAINER_OF(netdev->tc, struct htb, tc);
/* Records in 'netdev' that HTB (with ceiling 'max_rate' bytes/s) is the
 * installed qdisc. */
2786 htb_install__(struct netdev *netdev_, uint64_t max_rate)
2788 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2791 htb = xmalloc(sizeof *htb);
2792 tc_init(&htb->tc, &tc_ops_htb);
2793 htb->max_rate = max_rate;
2795 netdev->tc = &htb->tc;
2798 /* Create an HTB qdisc.
2800 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
2802 htb_setup_qdisc__(struct netdev *netdev)
2805 struct tc_htb_glob opt;
2806 struct ofpbuf request;
2807 struct tcmsg *tcmsg;
/* Remove any existing root qdisc first; the NLM_F_EXCL add below would
 * otherwise fail with EEXIST. */
2809 tc_del_qdisc(netdev);
2811 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2812 NLM_F_EXCL | NLM_F_CREATE, &request);
2816 tcmsg->tcm_handle = tc_make_handle(1, 0);
2817 tcmsg->tcm_parent = TC_H_ROOT;
2819 nl_msg_put_string(&request, TCA_KIND, "htb");
2821 memset(&opt, 0, sizeof opt);
/* rate2quantum = 10 matches the tc(8) default (quantum = rate / 10). */
2822 opt.rate2quantum = 10;
2826 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2827 nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
2828 nl_msg_end_nested(&request, opt_offset);
2830 return tc_transact(&request, NULL);
2833 /* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
2834 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
2836 htb_setup_class__(struct netdev *netdev, unsigned int handle,
2837 unsigned int parent, struct htb_class *class)
2840 struct tc_htb_opt opt;
2841 struct ofpbuf request;
2842 struct tcmsg *tcmsg;
/* The MTU is needed to size the rate tables and token buffers below. */
2846 error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
2848 VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU",
2849 netdev_get_name(netdev));
2853 memset(&opt, 0, sizeof opt);
2854 tc_fill_rate(&opt.rate, class->min_rate, mtu);
2855 tc_fill_rate(&opt.ceil, class->max_rate, mtu);
/* 'buffer'/'cbuffer' are burst allowances expressed in tc time ticks. */
2856 opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
2857 opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
2858 opt.prio = class->priority;
2860 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
2864 tcmsg->tcm_handle = handle;
2865 tcmsg->tcm_parent = parent;
2867 nl_msg_put_string(&request, TCA_KIND, "htb");
2868 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2869 nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
/* The kernel expects explicit rate tables for both 'rate' and 'ceil'. */
2870 tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
2871 tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
2872 nl_msg_end_nested(&request, opt_offset);
2874 error = tc_transact(&request, NULL);
2876 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
2877 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
2878 netdev_get_name(netdev),
2879 tc_get_major(handle), tc_get_minor(handle),
2880 tc_get_major(parent), tc_get_minor(parent),
2881 class->min_rate, class->max_rate,
2882 class->burst, class->priority, ovs_strerror(error));
2887 /* Parses Netlink attributes in 'options' for HTB parameters and stores a
2888 * description of them into 'details'. The description complies with the
2889 * specification given in the vswitch database documentation for linux-htb
2892 htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
2894 static const struct nl_policy tca_htb_policy[] = {
2895 [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
2896 .min_len = sizeof(struct tc_htb_opt) },
2899 struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
2900 const struct tc_htb_opt *htb;
2902 if (!nl_parse_nested(nl_options, tca_htb_policy,
2903 attrs, ARRAY_SIZE(tca_htb_policy))) {
2904 VLOG_WARN_RL(&rl, "failed to parse HTB class options");
2908 htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
2909 class->min_rate = htb->rate.rate;
2910 class->max_rate = htb->ceil.rate;
/* The kernel reports burst as a time ('buffer' ticks); convert to bytes. */
2911 class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
2912 class->priority = htb->prio;
/* Parses one kernel tclass message for an HTB class, extracting whichever of
 * queue id, class options, and stats the caller asked for (each out-pointer
 * may be NULL). */
2917 htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2918 struct htb_class *options,
2919 struct netdev_queue_stats *stats)
2921 struct nlattr *nl_options;
2922 unsigned int handle;
2925 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2926 if (!error && queue_id) {
2927 unsigned int major = tc_get_major(handle);
2928 unsigned int minor = tc_get_minor(handle);
/* OVS queue N corresponds to HTB class handle 1:(N+1). */
2929 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2930 *queue_id = minor - 1;
2935 if (!error && options) {
2936 error = htb_parse_tca_options__(nl_options, options);
/* Extracts qdisc-level HTB configuration from 'details'.  Rates in the
 * database are in bits/s; internally bytes/s are used, hence the "/ 8".
 * When "max-rate" is absent or zero, fall back to the link's advertised
 * speed (or 100 Mbps if the features query failed). */
2942 htb_parse_qdisc_details__(struct netdev *netdev_,
2943 const struct smap *details, struct htb_class *hc)
2945 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2946 const char *max_rate_s;
2948 max_rate_s = smap_get(details, "max-rate");
2949 hc->max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
2950 if (!hc->max_rate) {
2951 enum netdev_features current;
2953 netdev_linux_read_features(netdev);
2954 current = !netdev->get_features_error ? netdev->current : 0;
2955 hc->max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
/* The default class gets the whole link: min-rate == max-rate. */
2957 hc->min_rate = hc->max_rate;
/* Extracts per-queue HTB parameters from 'details' (rates are bits/s in the
 * database, bytes/s internally) and clamps them to workable values. */
2963 htb_parse_class_details__(struct netdev *netdev,
2964 const struct smap *details, struct htb_class *hc)
2966 const struct htb *htb = htb_get__(netdev);
2967 const char *min_rate_s = smap_get(details, "min-rate");
2968 const char *max_rate_s = smap_get(details, "max-rate");
2969 const char *burst_s = smap_get(details, "burst");
2970 const char *priority_s = smap_get(details, "priority");
2973 error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
2975 VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU",
2976 netdev_get_name(netdev));
2980 /* HTB requires at least an mtu sized min-rate to send any traffic even
2981 * on uncongested links. */
2982 hc->min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
2983 hc->min_rate = MAX(hc->min_rate, mtu);
2984 hc->min_rate = MIN(hc->min_rate, htb->max_rate);
/* max-rate is kept within [min_rate, qdisc ceiling]. */
2987 hc->max_rate = (max_rate_s
2988 ? strtoull(max_rate_s, NULL, 10) / 8
2990 hc->max_rate = MAX(hc->max_rate, hc->min_rate);
2991 hc->max_rate = MIN(hc->max_rate, htb->max_rate);
2995 * According to hints in the documentation that I've read, it is important
2996 * that 'burst' be at least as big as the largest frame that might be
2997 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
2998 * but having it a bit too small is a problem. Since netdev_get_mtu()
2999 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
3000 * the MTU. We actually add 64, instead of 14, as a guard against
3001 * additional headers getting tacked on somewhere that we're not aware of. */
3002 hc->burst = burst_s ? strtoull(burst_s, NULL, 10) / 8 : 0;
3003 hc->burst = MAX(hc->burst, mtu + 64);
/* 'priority' defaults to 0, the highest priority. */
3006 hc->priority = priority_s ? strtoul(priority_s, NULL, 10) : 0;
/* Queries the kernel for HTB class 'handle' (child of 'parent') and parses
 * the reply into 'options' and/or 'stats' (either may be NULL). */
3012 htb_query_class__(const struct netdev *netdev, unsigned int handle,
3013 unsigned int parent, struct htb_class *options,
3014 struct netdev_queue_stats *stats)
3016 struct ofpbuf *reply;
3019 error = tc_query_class(netdev, handle, parent, &reply);
3021 error = htb_parse_tcmsg__(reply, NULL, options, stats);
3022 ofpbuf_delete(reply);
/* tc_ops 'tc_install' for linux-htb: creates the root HTB qdisc plus the
 * default class 1:fffe that all queues attach under, then records the
 * installation in the netdev. */
3028 htb_tc_install(struct netdev *netdev, const struct smap *details)
3032 error = htb_setup_qdisc__(netdev);
3034 struct htb_class hc;
3036 htb_parse_qdisc_details__(netdev, details, &hc);
3037 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3038 tc_make_handle(1, 0), &hc);
3040 htb_install__(netdev, hc.max_rate);
/* Downcasts a generic tc_queue to its containing htb_class. */
3046 static struct htb_class *
3047 htb_class_cast__(const struct tc_queue *queue)
3049 return CONTAINER_OF(queue, struct htb_class, tc_queue);
/* Creates or updates the cached htb_class for 'queue_id' with parameters
 * copied from 'hc'.  A queue not seen before is inserted into the qdisc's
 * hash of queues. */
3053 htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
3054 const struct htb_class *hc)
3056 struct htb *htb = htb_get__(netdev);
3057 size_t hash = hash_int(queue_id, 0);
3058 struct tc_queue *queue;
3059 struct htb_class *hcp;
3061 queue = tc_find_queue__(netdev, queue_id, hash);
3063 hcp = htb_class_cast__(queue);
3065 hcp = xmalloc(sizeof *hcp);
3066 queue = &hcp->tc_queue;
3067 queue->queue_id = queue_id;
3068 queue->created = time_msec();
3069 hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
3072 hcp->min_rate = hc->min_rate;
3073 hcp->max_rate = hc->max_rate;
3074 hcp->burst = hc->burst;
3075 hcp->priority = hc->priority;
/* tc_ops 'tc_load' for linux-htb: reconstructs in-memory state from the
 * kernel by querying the default class and then dumping every class. */
3079 htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3082 struct nl_dump dump;
3083 struct htb_class hc;
3085 /* Get qdisc options. */
3087 htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
3088 htb_install__(netdev, hc.max_rate);
/* Walk the kernel's class dump; each parsed class becomes a cached queue. */
3091 if (!start_queue_dump(netdev, &dump)) {
3094 while (nl_dump_next(&dump, &msg)) {
3095 unsigned int queue_id;
3097 if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3098 htb_update_queue__(netdev, queue_id, &hc);
3101 nl_dump_done(&dump);
/* tc_ops 'tc_destroy': frees every cached queue of the HTB qdisc. */
3107 htb_tc_destroy(struct tc *tc)
3109 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
3110 struct htb_class *hc, *next;
3112 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &htb->tc.queues) {
3113 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
/* tc_ops 'qdisc_get': reports the ceiling in bits/s (hence "* 8"). */
3121 htb_qdisc_get(const struct netdev *netdev, struct smap *details)
3123 const struct htb *htb = htb_get__(netdev);
3124 smap_add_format(details, "max-rate", "%llu", 8ULL * htb->max_rate);
/* tc_ops 'qdisc_set': re-parses 'details' and replaces the default class
 * 1:fffe, updating the cached ceiling on success. */
3129 htb_qdisc_set(struct netdev *netdev, const struct smap *details)
3131 struct htb_class hc;
3134 htb_parse_qdisc_details__(netdev, details, &hc);
3135 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3136 tc_make_handle(1, 0), &hc);
3138 htb_get__(netdev)->max_rate = hc.max_rate;
/* tc_ops 'class_get': reports one queue's parameters, converted to bits. */
3144 htb_class_get(const struct netdev *netdev OVS_UNUSED,
3145 const struct tc_queue *queue, struct smap *details)
3147 const struct htb_class *hc = htb_class_cast__(queue);
3149 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
/* Omit max-rate when it merely mirrors min-rate (the default). */
3150 if (hc->min_rate != hc->max_rate) {
3151 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
3153 smap_add_format(details, "burst", "%llu", 8ULL * hc->burst);
3155 smap_add_format(details, "priority", "%u", hc->priority);
/* tc_ops 'class_set': validates 'details', installs kernel class
 * 1:(queue_id+1) under the default class 1:fffe, and refreshes the cache. */
3161 htb_class_set(struct netdev *netdev, unsigned int queue_id,
3162 const struct smap *details)
3164 struct htb_class hc;
3167 error = htb_parse_class_details__(netdev, details, &hc);
3172 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3173 tc_make_handle(1, 0xfffe), &hc);
3178 htb_update_queue__(netdev, queue_id, &hc);
/* tc_ops 'class_delete': removes the kernel class and the cached entry. */
3183 htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
3185 struct htb_class *hc = htb_class_cast__(queue);
3186 struct htb *htb = htb_get__(netdev);
3189 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3191 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
/* tc_ops 'class_get_stats': fetches one queue's stats via a kernel query. */
3198 htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3199 struct netdev_queue_stats *stats)
3201 return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3202 tc_make_handle(1, 0xfffe), NULL, stats);
/* tc_ops 'class_dump_stats': called once per dumped netlink message;
 * translates class handle 1:(N+1) back to queue id N and invokes 'cb'. */
3206 htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3207 const struct ofpbuf *nlmsg,
3208 netdev_dump_queue_stats_cb *cb, void *aux)
3210 struct netdev_queue_stats stats;
3211 unsigned int handle, major, minor;
3214 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3219 major = tc_get_major(handle);
3220 minor = tc_get_minor(handle);
3221 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
3222 (*cb)(minor - 1, &stats, aux);
/* vtable binding the htb_* functions into the generic tc framework. */
3227 static const struct tc_ops tc_ops_htb = {
3228 "htb", /* linux_name */
3229 "linux-htb", /* ovs_name */
3230 HTB_N_QUEUES, /* n_queues */
3239 htb_class_get_stats,
3240 htb_class_dump_stats
3243 /* "linux-hfsc" traffic control class. */
3245 #define HFSC_N_QUEUES 0xf000
/* Per-queue (HFSC class) state; the min_rate/max_rate members follow in
 * lines elided from this listing. */
3253 struct tc_queue tc_queue;
/* Downcasts the netdev's generic 'tc' pointer to its containing struct hfsc. */
3258 static struct hfsc *
3259 hfsc_get__(const struct netdev *netdev_)
3261 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3262 return CONTAINER_OF(netdev->tc, struct hfsc, tc);
/* Downcasts a generic tc_queue to its containing hfsc_class. */
3265 static struct hfsc_class *
3266 hfsc_class_cast__(const struct tc_queue *queue)
3268 return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
/* Records in 'netdev' that HFSC (ceiling 'max_rate' bytes/s) is installed. */
3272 hfsc_install__(struct netdev *netdev_, uint32_t max_rate)
3274 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3277 hfsc = xmalloc(sizeof *hfsc);
3278 tc_init(&hfsc->tc, &tc_ops_hfsc);
3279 hfsc->max_rate = max_rate;
3280 netdev->tc = &hfsc->tc;
/* Creates or updates the cached hfsc_class for 'queue_id'; mirrors
 * htb_update_queue__(). */
3284 hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
3285 const struct hfsc_class *hc)
3289 struct hfsc_class *hcp;
3290 struct tc_queue *queue;
3292 hfsc = hfsc_get__(netdev);
3293 hash = hash_int(queue_id, 0);
3295 queue = tc_find_queue__(netdev, queue_id, hash);
3297 hcp = hfsc_class_cast__(queue);
3299 hcp = xmalloc(sizeof *hcp);
3300 queue = &hcp->tc_queue;
3301 queue->queue_id = queue_id;
3302 queue->created = time_msec();
3303 hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
3306 hcp->min_rate = hc->min_rate;
3307 hcp->max_rate = hc->max_rate;
/* Parses TCA_OPTIONS for an HFSC class.  OVS only ever configures linear
 * service curves (m1 == 0, d == 0) with identical real-time and link-share
 * curves, so anything else is rejected as unsupported. */
3311 hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
3313 const struct tc_service_curve *rsc, *fsc, *usc;
3314 static const struct nl_policy tca_hfsc_policy[] = {
3316 .type = NL_A_UNSPEC,
3318 .min_len = sizeof(struct tc_service_curve),
3321 .type = NL_A_UNSPEC,
3323 .min_len = sizeof(struct tc_service_curve),
3326 .type = NL_A_UNSPEC,
3328 .min_len = sizeof(struct tc_service_curve),
3331 struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];
3333 if (!nl_parse_nested(nl_options, tca_hfsc_policy,
3334 attrs, ARRAY_SIZE(tca_hfsc_policy))) {
3335 VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
3339 rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
3340 fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
3341 usc = nl_attr_get(attrs[TCA_HFSC_USC]);
/* Non-zero m1 or d means a two-slope (non-linear) curve: unsupported. */
3343 if (rsc->m1 != 0 || rsc->d != 0 ||
3344 fsc->m1 != 0 || fsc->d != 0 ||
3345 usc->m1 != 0 || usc->d != 0) {
3346 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3347 "Non-linear service curves are not supported.");
3351 if (rsc->m2 != fsc->m2) {
3352 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3353 "Real-time service curves are not supported ");
3357 if (rsc->m2 > usc->m2) {
3358 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3359 "Min-rate service curve is greater than "
3360 "the max-rate service curve.");
/* min-rate comes from the link-share curve, max-rate from the upper limit. */
3364 class->min_rate = fsc->m2;
3365 class->max_rate = usc->m2;
/* Parses one kernel tclass message for an HFSC class; each out-pointer
 * (queue_id, options, stats) may be NULL.  Mirrors htb_parse_tcmsg__(). */
3370 hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
3371 struct hfsc_class *options,
3372 struct netdev_queue_stats *stats)
3375 unsigned int handle;
3376 struct nlattr *nl_options;
3378 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
3384 unsigned int major, minor;
3386 major = tc_get_major(handle);
3387 minor = tc_get_minor(handle);
/* OVS queue N corresponds to HFSC class handle 1:(N+1). */
3388 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3389 *queue_id = minor - 1;
3396 error = hfsc_parse_tca_options__(nl_options, options);
/* Queries the kernel for HFSC class 'handle' (child of 'parent') and parses
 * the reply into 'options' and/or 'stats' (either may be NULL). */
3403 hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
3404 unsigned int parent, struct hfsc_class *options,
3405 struct netdev_queue_stats *stats)
3408 struct ofpbuf *reply;
3410 error = tc_query_class(netdev, handle, parent, &reply);
3415 error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
3416 ofpbuf_delete(reply);
/* Extracts the qdisc-level "max-rate" from 'details' (bits/s in the
 * database, bytes/s internally); falls back to link speed or 100 Mbps when
 * absent or zero. */
3421 hfsc_parse_qdisc_details__(struct netdev *netdev_, const struct smap *details,
3422 struct hfsc_class *class)
3424 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3426 const char *max_rate_s;
3428 max_rate_s = smap_get(details, "max-rate");
3429 max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
3432 enum netdev_features current;
3434 netdev_linux_read_features(netdev);
3435 current = !netdev->get_features_error ? netdev->current : 0;
3436 max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
/* The default class gets the whole link: min-rate == max-rate. */
3439 class->min_rate = max_rate;
3440 class->max_rate = max_rate;
/* Extracts per-queue min-rate/max-rate from 'details', clamped to
 * [1, qdisc ceiling] and keeping max-rate >= min-rate. */
3444 hfsc_parse_class_details__(struct netdev *netdev,
3445 const struct smap *details,
3446 struct hfsc_class * class)
3448 const struct hfsc *hfsc;
3449 uint32_t min_rate, max_rate;
3450 const char *min_rate_s, *max_rate_s;
3452 hfsc = hfsc_get__(netdev);
3453 min_rate_s = smap_get(details, "min-rate");
3454 max_rate_s = smap_get(details, "max-rate");
3456 min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
3457 min_rate = MAX(min_rate, 1);
3458 min_rate = MIN(min_rate, hfsc->max_rate);
3460 max_rate = (max_rate_s
3461 ? strtoull(max_rate_s, NULL, 10) / 8
3463 max_rate = MAX(max_rate, min_rate);
3464 max_rate = MIN(max_rate, hfsc->max_rate);
3466 class->min_rate = min_rate;
3467 class->max_rate = max_rate;
3472 /* Create an HFSC qdisc.
3474 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
3476 hfsc_setup_qdisc__(struct netdev * netdev)
3478 struct tcmsg *tcmsg;
3479 struct ofpbuf request;
3480 struct tc_hfsc_qopt opt;
/* Delete any existing root qdisc first; NLM_F_EXCL below would otherwise
 * make the add fail with EEXIST. */
3482 tc_del_qdisc(netdev);
3484 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3485 NLM_F_EXCL | NLM_F_CREATE, &request);
3491 tcmsg->tcm_handle = tc_make_handle(1, 0);
3492 tcmsg->tcm_parent = TC_H_ROOT;
3494 memset(&opt, 0, sizeof opt);
3497 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3498 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
3500 return tc_transact(&request, NULL);
3503 /* Create an HFSC class.
3505 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
3506 * sc rate <min_rate> ul rate <max_rate>" */
3508 hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
3509 unsigned int parent, struct hfsc_class *class)
3513 struct tcmsg *tcmsg;
3514 struct ofpbuf request;
3515 struct tc_service_curve min, max;
3517 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
3523 tcmsg->tcm_handle = handle;
3524 tcmsg->tcm_parent = parent;
/* Linear service curves: only the long-term rate m2 is set here; the m1/d
 * initialization is in lines elided from this listing. */
3528 min.m2 = class->min_rate;
3532 max.m2 = class->max_rate;
3534 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3535 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
/* 'min' serves as both the real-time (RSC) and link-share (FSC) curve;
 * 'max' is the upper-limit (USC) curve. */
3536 nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
3537 nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
3538 nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
3539 nl_msg_end_nested(&request, opt_offset);
3541 error = tc_transact(&request, NULL);
3543 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
3544 "min-rate %ubps, max-rate %ubps (%s)",
3545 netdev_get_name(netdev),
3546 tc_get_major(handle), tc_get_minor(handle),
3547 tc_get_major(parent), tc_get_minor(parent),
3548 class->min_rate, class->max_rate, ovs_strerror(error));
/* tc_ops 'tc_install' for linux-hfsc: creates the root HFSC qdisc and the
 * default class 1:fffe, then records the installation. */
3555 hfsc_tc_install(struct netdev *netdev, const struct smap *details)
3558 struct hfsc_class class;
3560 error = hfsc_setup_qdisc__(netdev);
3566 hfsc_parse_qdisc_details__(netdev, details, &class);
3567 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3568 tc_make_handle(1, 0), &class);
3574 hfsc_install__(netdev, class.max_rate);
/* tc_ops 'tc_load': rebuilds in-memory HFSC state from the kernel; mirrors
 * htb_tc_load(). */
3579 hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3582 struct nl_dump dump;
3583 struct hfsc_class hc;
3586 hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
3587 hfsc_install__(netdev, hc.max_rate);
3589 if (!start_queue_dump(netdev, &dump)) {
3593 while (nl_dump_next(&dump, &msg)) {
3594 unsigned int queue_id;
3596 if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3597 hfsc_update_queue__(netdev, queue_id, &hc);
3601 nl_dump_done(&dump);
/* tc_ops 'tc_destroy': frees every cached queue of the HFSC qdisc. */
3606 hfsc_tc_destroy(struct tc *tc)
3609 struct hfsc_class *hc, *next;
3611 hfsc = CONTAINER_OF(tc, struct hfsc, tc);
3613 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
3614 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
/* tc_ops 'qdisc_get': reports the HFSC ceiling in bits/s (hence "* 8"). */
3623 hfsc_qdisc_get(const struct netdev *netdev, struct smap *details)
3625 const struct hfsc *hfsc;
3626 hfsc = hfsc_get__(netdev);
3627 smap_add_format(details, "max-rate", "%llu", 8ULL * hfsc->max_rate);
/* tc_ops 'qdisc_set': replaces the default class 1:fffe with newly parsed
 * parameters and updates the cached ceiling on success. */
3632 hfsc_qdisc_set(struct netdev *netdev, const struct smap *details)
3635 struct hfsc_class class;
3637 hfsc_parse_qdisc_details__(netdev, details, &class);
3638 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3639 tc_make_handle(1, 0), &class);
3642 hfsc_get__(netdev)->max_rate = class.max_rate;
/* tc_ops 'class_get': reports one queue's rates in bits/s; max-rate only
 * when it differs from min-rate. */
3649 hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
3650 const struct tc_queue *queue, struct smap *details)
3652 const struct hfsc_class *hc;
3654 hc = hfsc_class_cast__(queue);
3655 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
3656 if (hc->min_rate != hc->max_rate) {
3657 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
/* tc_ops 'class_set': installs kernel class 1:(queue_id+1) under the
 * default class 1:fffe and refreshes the cache. */
3663 hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
3664 const struct smap *details)
3667 struct hfsc_class class;
3669 error = hfsc_parse_class_details__(netdev, details, &class);
3674 error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3675 tc_make_handle(1, 0xfffe), &class);
3680 hfsc_update_queue__(netdev, queue_id, &class);
/* tc_ops 'class_delete': removes the kernel class and the cached entry. */
3685 hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
3689 struct hfsc_class *hc;
3691 hc = hfsc_class_cast__(queue);
3692 hfsc = hfsc_get__(netdev);
3694 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3696 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
/* tc_ops 'class_get_stats': fetches one queue's stats via a kernel query. */
3703 hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3704 struct netdev_queue_stats *stats)
3706 return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3707 tc_make_handle(1, 0xfffe), NULL, stats);
/* tc_ops 'class_dump_stats': translates class handle 1:(N+1) back to queue
 * id N and hands the parsed stats to 'cb'. */
3711 hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3712 const struct ofpbuf *nlmsg,
3713 netdev_dump_queue_stats_cb *cb, void *aux)
3715 struct netdev_queue_stats stats;
3716 unsigned int handle, major, minor;
3719 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3724 major = tc_get_major(handle);
3725 minor = tc_get_minor(handle);
3726 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3727 (*cb)(minor - 1, &stats, aux);
/* vtable binding the hfsc_* functions into the generic tc framework. */
3732 static const struct tc_ops tc_ops_hfsc = {
3733 "hfsc", /* linux_name */
3734 "linux-hfsc", /* ovs_name */
3735 HFSC_N_QUEUES, /* n_queues */
3736 hfsc_tc_install, /* tc_install */
3737 hfsc_tc_load, /* tc_load */
3738 hfsc_tc_destroy, /* tc_destroy */
3739 hfsc_qdisc_get, /* qdisc_get */
3740 hfsc_qdisc_set, /* qdisc_set */
3741 hfsc_class_get, /* class_get */
3742 hfsc_class_set, /* class_set */
3743 hfsc_class_delete, /* class_delete */
3744 hfsc_class_get_stats, /* class_get_stats */
3745 hfsc_class_dump_stats /* class_dump_stats */
3748 /* "linux-default" traffic control class.
3750 * This class represents the default, unnamed Linux qdisc. It corresponds to
3751 * the "" (empty string) QoS type in the OVS database. */
3754 default_install__(struct netdev_ *netdev_)
3756 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3757 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_default);
3759 /* Nothing but a tc class implementation is allowed to write to a tc. This
3760 * class never does that, so we can legitimately use a const tc object. */
3761 netdev->tc = CONST_CAST(struct tc *, &tc);
/* tc_ops 'tc_install' for the default qdisc: nothing to configure. */
3765 default_tc_install(struct netdev *netdev,
3766 const struct smap *details OVS_UNUSED)
3768 default_install__(netdev);
/* tc_ops 'tc_load' for the default qdisc: likewise just marks it installed. */
3773 default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3775 default_install__(netdev);
/* Mostly-NULL vtable: the default qdisc exposes no queues or parameters. */
3779 static const struct tc_ops tc_ops_default = {
3780 NULL, /* linux_name */
3785 NULL, /* tc_destroy */
3786 NULL, /* qdisc_get */
3787 NULL, /* qdisc_set */
3788 NULL, /* class_get */
3789 NULL, /* class_set */
3790 NULL, /* class_delete */
3791 NULL, /* class_get_stats */
3792 NULL /* class_dump_stats */
3795 /* "linux-other" traffic control class.
/* Placeholder used for qdiscs that OVS does not recognize: presence can be
 * recorded but nothing can be configured through it. */
3800 other_tc_load(struct netdev *netdev_, struct ofpbuf *nlmsg OVS_UNUSED)
3802 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3803 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_other);
3805 /* Nothing but a tc class implementation is allowed to write to a tc. This
3806 * class never does that, so we can legitimately use a const tc object. */
3807 netdev->tc = CONST_CAST(struct tc *, &tc);
/* Mostly-NULL vtable: "linux-other" is read-only and unconfigurable. */
3811 static const struct tc_ops tc_ops_other = {
3812 NULL, /* linux_name */
3813 "linux-other", /* ovs_name */
3815 NULL, /* tc_install */
3817 NULL, /* tc_destroy */
3818 NULL, /* qdisc_get */
3819 NULL, /* qdisc_set */
3820 NULL, /* class_get */
3821 NULL, /* class_set */
3822 NULL, /* class_delete */
3823 NULL, /* class_get_stats */
3824 NULL /* class_dump_stats */
3827 /* Traffic control. */
3829 /* Number of kernel "tc" ticks per second. */
3830 static double ticks_per_s;
3832 /* Number of kernel "jiffies" per second. This is used for the purpose of
3833 * computing buffer sizes. Generally kernel qdiscs need to be able to buffer
3834 * one jiffy's worth of data.
3836 * There are two possibilities here:
3838 * - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
3839 * approximate range of 100 to 1024. That means that we really need to
3840 * make sure that the qdisc can buffer that much data.
3842 * - 'buffer_hz' is an absurdly large number. That means that the kernel
3843 * has finely granular timers and there's no need to fudge additional room
3844 * for buffers. (There's no extra effort needed to implement that: the
3845 * large 'buffer_hz' is used as a divisor, so practically any number will
3846 * come out as 0 in the division. Small integer results in the case of
3847 * really high dividends won't have any real effect anyhow.)
3849 static unsigned int buffer_hz;
3851 /* Returns tc handle 'major':'minor'. */
/* TC handles pack the major number in the upper 16 bits and the minor
 * number in the lower 16 bits (see TC_H_MAKE in linux/pkt_sched.h). */
3853 tc_make_handle(unsigned int major, unsigned int minor)
3855 return TC_H_MAKE(major << 16, minor);
3858 /* Returns the major number from 'handle'. */
3860 tc_get_major(unsigned int handle)
3862 return TC_H_MAJ(handle) >> 16;
3865 /* Returns the minor number from 'handle'. */
3867 tc_get_minor(unsigned int handle)
3869 return TC_H_MIN(handle);
/* Initializes 'request' as an rtnetlink message of 'type' (RTM_NEWQDISC,
 * RTM_NEWTCLASS, ...) for 'netdev', with a zeroed struct tcmsg whose
 * ifindex is filled in.  Returns the embedded tcmsg so the caller can set
 * tcm_handle/tcm_parent and append TCA_* attributes. */
3872 static struct tcmsg *
3873 tc_make_request(const struct netdev *netdev, int type, unsigned int flags,
3874 struct ofpbuf *request)
3876 struct tcmsg *tcmsg;
3880 error = get_ifindex(netdev, &ifindex);
3885 ofpbuf_init(request, 512);
3886 nl_msg_put_nlmsghdr(request, sizeof *tcmsg, type, NLM_F_REQUEST | flags);
3887 tcmsg = ofpbuf_put_zeros(request, sizeof *tcmsg);
3888 tcmsg->tcm_family = AF_UNSPEC;
3889 tcmsg->tcm_ifindex = ifindex;
3890 /* Caller should fill in tcmsg->tcm_handle. */
3891 /* Caller should fill in tcmsg->tcm_parent. */
/* Sends 'request' on a NETLINK_ROUTE socket and releases its buffer; any
 * reply obtained via 'replyp' becomes the caller's to free. */
3897 tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
3899 int error = nl_transact(NETLINK_ROUTE, request, replyp);
3900 ofpbuf_uninit(request);
3904 /* Adds or deletes a root ingress qdisc on 'netdev'. We use this for
3905 * policing configuration.
3907 * This function is equivalent to running the following when 'add' is true:
3908 * /sbin/tc qdisc add dev <devname> handle ffff: ingress
3910 * This function is equivalent to running the following when 'add' is false:
3911 * /sbin/tc qdisc del dev <devname> handle ffff: ingress
3913 * The configuration and stats may be seen with the following command:
3914 * /sbin/tc -s qdisc show dev <devname>
3916 * Returns 0 if successful, otherwise a positive errno value.
3919 tc_add_del_ingress_qdisc(struct netdev *netdev, bool add)
3921 struct ofpbuf request;
3922 struct tcmsg *tcmsg;
3924 int type = add ? RTM_NEWQDISC : RTM_DELQDISC;
3925 int flags = add ? NLM_F_EXCL | NLM_F_CREATE : 0;
3927 tcmsg = tc_make_request(netdev, type, flags, &request);
3931 tcmsg->tcm_handle = tc_make_handle(0xffff, 0);
3932 tcmsg->tcm_parent = TC_H_INGRESS;
3933 nl_msg_put_string(&request, TCA_KIND, "ingress");
3934 nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0);
3936 error = tc_transact(&request, NULL);
3938 /* If we're deleting the qdisc, don't worry about some of the
3939 * error conditions. */
/* ENOENT/EINVAL on delete just mean no ingress qdisc was installed. */
3940 if (!add && (error == ENOENT || error == EINVAL)) {
3949 /* Adds a policer to 'netdev' with a rate of 'kbits_rate' and a burst size
3952 * This function is equivalent to running:
3953 * /sbin/tc filter add dev <devname> parent ffff: protocol all prio 49
3954 * basic police rate <kbits_rate>kbit burst <kbits_burst>k
3957 * The configuration and stats may be seen with the following command:
3958 * /sbin/tc -s filter show <devname> eth0 parent ffff:
3960 * Returns 0 if successful, otherwise a positive errno value.
3963 tc_add_policer(struct netdev *netdev, int kbits_rate, int kbits_burst)
3965 struct tc_police tc_police;
3966 struct ofpbuf request;
3967 struct tcmsg *tcmsg;
3968 size_t basic_offset;
3969 size_t police_offset;
/* TC_POLICE_SHOT means packets exceeding the rate are dropped.  Rate is
 * converted from kbits/s to bytes/s ((kbits_rate * 1000) / 8); burst from
 * kbits (1024 multiplier) to a tick count via tc_bytes_to_ticks(). */
3973 memset(&tc_police, 0, sizeof tc_police);
3974 tc_police.action = TC_POLICE_SHOT;
3975 tc_police.mtu = mtu;
3976 tc_fill_rate(&tc_police.rate, (kbits_rate * 1000)/8, mtu);
3977 tc_police.burst = tc_bytes_to_ticks(tc_police.rate.rate,
3978 kbits_burst * 1024);
3980 tcmsg = tc_make_request(netdev, RTM_NEWTFILTER,
3981 NLM_F_EXCL | NLM_F_CREATE, &request);
/* Attach under the ffff: ingress qdisc; tcm_info encodes prio 49 and
 * protocol ETH_P_ALL in the major/minor halves of a handle. */
3985 tcmsg->tcm_parent = tc_make_handle(0xffff, 0);
3986 tcmsg->tcm_info = tc_make_handle(49,
3987 (OVS_FORCE uint16_t) htons(ETH_P_ALL));
3989 nl_msg_put_string(&request, TCA_KIND, "basic");
3990 basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3991 police_offset = nl_msg_start_nested(&request, TCA_BASIC_POLICE);
3992 nl_msg_put_unspec(&request, TCA_POLICE_TBF, &tc_police, sizeof tc_police);
3993 tc_put_rtab(&request, TCA_POLICE_RATE, &tc_police.rate);
3994 nl_msg_end_nested(&request, police_offset);
3995 nl_msg_end_nested(&request, basic_offset);
3997 error = tc_transact(&request, NULL);
/* Reads /proc/net/psched once (guarded by 'once') and derives 'ticks_per_s'
 * and 'buffer_hz' from its four hex fields a b c d.
 * NOTE(review): the enclosing function's signature is not visible in this
 * extract — presumably a static void helper; confirm against the full file. */
4008 /* The values in psched are not individually very meaningful, but they are
4009 * important.  The tables below show some values seen in the wild.
4013 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
4014 * (Before that, there are hints that it was 1000000000.)
4016 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
4020 * -----------------------------------
4021 * [1] 000c8000 000f4240 000f4240 00000064
4022 * [2] 000003e8 00000400 000f4240 3b9aca00
4023 * [3] 000003e8 00000400 000f4240 3b9aca00
4024 * [4] 000003e8 00000400 000f4240 00000064
4025 * [5] 000003e8 00000040 000f4240 3b9aca00
4026 * [6] 000003e8 00000040 000f4240 000000f9
4028 * a b c d ticks_per_s buffer_hz
4029 * ------- --------- ---------- ------------- ----------- -------------
4030 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
4031 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
4032 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
4033 * [4] 1,000 1,024 1,000,000 100 976,562 100
4034 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
4035 * [6] 1,000 64 1,000,000 249 15,625,000 249
4037 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
4038 * [2] 2.6.26-1-686-bigmem from Debian lenny
4039 * [3] 2.6.26-2-sparc64 from Debian lenny
4040 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
4041 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
4042 * [6] 2.6.34 from kernel.org on KVM
4044 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
4045 static const char fn[] = "/proc/net/psched";
4046 unsigned int a, b, c, d;
/* Only the first caller does the work; later callers return immediately. */
4049 if (!ovsthread_once_start(&once)) {
4056 stream = fopen(fn, "r");
4058 VLOG_WARN("%s: open failed: %s", fn, ovs_strerror(errno));
4062 if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
4063 VLOG_WARN("%s: read failed", fn);
4067 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
4071 VLOG_WARN("%s: invalid scheduler parameters", fn);
/* See the table above: ticks_per_s = a * c / b (floating point to avoid
 * losing precision on, e.g., a=1000, b=1024). */
4075 ticks_per_s = (double) a * c / b;
4079 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
4082 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);
4085 ovsthread_once_done(&once);
4088 /* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
4089 * rate of 'rate' bytes per second. */
/* Relies on 'ticks_per_s' having been initialized from /proc/net/psched. */
4091 tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
4094 return (rate * ticks) / ticks_per_s;
4097 /* Returns the number of ticks that it would take to transmit 'size' bytes at a
4098 * rate of 'rate' bytes per second. */
/* Guards against division by zero: a zero 'rate' yields 0 ticks.  The 64-bit
 * cast avoids overflow in ticks_per_s * size. */
4100 tc_bytes_to_ticks(unsigned int rate, unsigned int size)
4103 return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
4106 /* Returns the number of bytes that need to be reserved for qdisc buffering at
4107 * a transmission rate of 'rate' bytes per second. */
/* With an "absurdly large" buffer_hz (see its comment) this division is ~0,
 * i.e. no extra buffering is reserved. */
4109 tc_buffer_per_jiffy(unsigned int rate)
4112 return rate / buffer_hz;
4115 /* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
4116 * e.g. "htb", into '*kind' (if it is nonnull).  If 'options' is nonnull,
4117 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
4118 * stores NULL into it if it is absent.
4120 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
4123 * Returns 0 if successful, otherwise a positive errno value. */
4125 tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
4126 struct nlattr **options)
4128 static const struct nl_policy tca_policy[] = {
4129 [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
4130 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
4132 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
/* Attributes start after the netlink header plus the fixed tcmsg. */
4134 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
4135 tca_policy, ta, ARRAY_SIZE(ta))) {
4136 VLOG_WARN_RL(&rl, "failed to parse qdisc message");
4141 *kind = nl_attr_get_string(ta[TCA_KIND]);
4145 *options = ta[TCA_OPTIONS];
4160 /* Given Netlink 'msg' that describes a class, extracts the queue ID (e.g. the
4161 * minor number of its class ID) into '*queue_id', its TCA_OPTIONS attribute
4162 * into '*options', and its queue statistics into '*stats'.  Any of the output
4163 * arguments may be null.
4165 * Returns 0 if successful, otherwise a positive errno value. */
4167 tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
4168 struct nlattr **options, struct netdev_queue_stats *stats)
4170 static const struct nl_policy tca_policy[] = {
4171 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
4172 [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
4174 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
4176 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
4177 tca_policy, ta, ARRAY_SIZE(ta))) {
4178 VLOG_WARN_RL(&rl, "failed to parse class message");
/* The class handle comes from the fixed tcmsg header, not an attribute. */
4183 struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
4184 *handlep = tc->tcm_handle;
4188 *options = ta[TCA_OPTIONS];
4192 const struct gnet_stats_queue *gsq;
4193 struct gnet_stats_basic gsb;
4195 static const struct nl_policy stats_policy[] = {
4196 [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
4197 .min_len = sizeof gsb },
4198 [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
4199 .min_len = sizeof *gsq },
4201 struct nlattr *sa[ARRAY_SIZE(stats_policy)];
4203 if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
4204 sa, ARRAY_SIZE(sa))) {
4205 VLOG_WARN_RL(&rl, "failed to parse class stats");
4209 /* Alignment issues screw up the length of struct gnet_stats_basic on
4210 * some arch/bitsize combinations.  Newer versions of Linux have a
4211 * struct gnet_stats_basic_packed, but we can't depend on that.  The
4212 * easiest thing to do is just to make a copy. */
4213 memset(&gsb, 0, sizeof gsb);
4214 memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
4215 MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
4216 stats->tx_bytes = gsb.bytes;
4217 stats->tx_packets = gsb.packets;
/* Only queue drops are reported; they are surfaced as tx_errors. */
4219 gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
4220 stats->tx_errors = gsq->drops;
4230 memset(stats, 0, sizeof *stats);
4235 /* Queries the kernel for class with identifier 'handle' and parent 'parent'
/* On success the reply is stored in '*replyp' (owned by the caller); on
 * failure a rate-limited warning is logged with the class and parent in
 * major:minor form. */
4238 tc_query_class(const struct netdev *netdev,
4239 unsigned int handle, unsigned int parent,
4240 struct ofpbuf **replyp)
4242 struct ofpbuf request;
4243 struct tcmsg *tcmsg;
/* NLM_F_ECHO asks the kernel to echo the class back in the reply. */
4246 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO, &request);
4250 tcmsg->tcm_handle = handle;
4251 tcmsg->tcm_parent = parent;
4253 error = tc_transact(&request, replyp);
4255 VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
4256 netdev_get_name(netdev),
4257 tc_get_major(handle), tc_get_minor(handle),
4258 tc_get_major(parent), tc_get_minor(parent),
4259 ovs_strerror(error));
4264 /* Equivalent to "tc class del dev <name> handle <handle>". */
/* Logs a rate-limited warning on failure; the error is reported to the
 * caller via the function's return value. */
4266 tc_delete_class(const struct netdev *netdev, unsigned int handle)
4268 struct ofpbuf request;
4269 struct tcmsg *tcmsg;
4272 tcmsg = tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
4276 tcmsg->tcm_handle = handle;
4277 tcmsg->tcm_parent = 0;
4279 error = tc_transact(&request, NULL);
4281 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
4282 netdev_get_name(netdev),
4283 tc_get_major(handle), tc_get_minor(handle),
4284 ovs_strerror(error));
4289 /* Equivalent to "tc qdisc del dev <name> root". */
/* Also tears down the cached userspace tc state ('netdev->tc') so that a
 * subsequent tc_query_qdisc() re-discovers the kernel configuration. */
4291 tc_del_qdisc(struct netdev *netdev_)
4293 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4294 struct ofpbuf request;
4295 struct tcmsg *tcmsg;
/* Handle 1:0 matches the convention used when OVS creates a root qdisc
 * (see tc_query_qdisc() below). */
4298 tcmsg = tc_make_request(netdev_, RTM_DELQDISC, 0, &request);
4302 tcmsg->tcm_handle = tc_make_handle(1, 0);
4303 tcmsg->tcm_parent = TC_H_ROOT;
4305 error = tc_transact(&request, NULL);
4306 if (error == EINVAL) {
4307 /* EINVAL probably means that the default qdisc was in use, in which
4308 * case we've accomplished our purpose. */
4311 if (!error && netdev->tc) {
4312 if (netdev->tc->ops->tc_destroy) {
4313 netdev->tc->ops->tc_destroy(netdev->tc);
4320 /* If 'netdev''s qdisc type and parameters are not yet known, queries the
4321 * kernel to determine what they are.  Returns 0 if successful, otherwise a
4322 * positive errno value. */
4324 tc_query_qdisc(const struct netdev *netdev_)
4326 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4327 struct ofpbuf request, *qdisc;
4328 const struct tc_ops *ops;
4329 struct tcmsg *tcmsg;
4337 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
4338 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
4339 * 2.6.35 without that fix backported to it.
4341 * To avoid the OOPS, we must not make a request that would attempt to dump
4342 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
4343 * few others.  There are a few ways that I can see to do this, but most of
4344 * them seem to be racy (and if you lose the race the kernel OOPSes).  The
4345 * technique chosen here is to assume that any non-default qdisc that we
4346 * create will have a class with handle 1:0.  The built-in qdiscs only have
4347 * a class with handle 0:0.
4349 * We could check for Linux 2.6.35+ and use a more straightforward method
4351 tcmsg = tc_make_request(netdev_, RTM_GETQDISC, NLM_F_ECHO, &request);
4355 tcmsg->tcm_handle = tc_make_handle(1, 0);
4356 tcmsg->tcm_parent = 0;
4358 /* Figure out what tc class to instantiate. */
4359 error = tc_transact(&request, &qdisc);
4363 error = tc_parse_qdisc(qdisc, &kind, NULL);
4365 ops = &tc_ops_other;
/* A known kind maps to its tc_ops table; an unknown one is handled
 * generically as tc_ops_other. */
4367 ops = tc_lookup_linux_name(kind);
4369 static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
4370 VLOG_INFO_RL(&rl2, "unknown qdisc \"%s\"", kind);
4372 ops = &tc_ops_other;
4375 } else if (error == ENOENT) {
4376 /* Either it's a built-in qdisc, or it's a qdisc set up by some
4377 * other entity that doesn't have a handle 1:0.  We will assume
4378 * that it's the system default qdisc. */
4379 ops = &tc_ops_default;
4382 /* Who knows?  Maybe the device got deleted. */
4383 VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
4384 netdev_get_name(netdev_), ovs_strerror(error));
4385 ops = &tc_ops_other;
4388 /* Instantiate it. */
/* tc_load must leave netdev->tc nonnull exactly when it succeeds. */
4389 load_error = ops->tc_load(CONST_CAST(struct netdev *, netdev_), qdisc);
4390 ovs_assert((load_error == 0) == (netdev->tc != NULL));
4391 ofpbuf_delete(qdisc);
4393 return error ? error : load_error;
4396 /* Linux traffic control uses tables with 256 entries ("rtab" tables) to
4397 approximate the time to transmit packets of various lengths.  For an MTU of
4398 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
4399 represents two possible packet lengths; for a MTU of 513 through 1024, four
4400 possible lengths; and so on.
4402 Returns, for the specified 'mtu', the number of bits that packet lengths
4403 need to be shifted right to fit within such a 256-entry table. */
4405 tc_calc_cell_log(unsigned int mtu)
/* A zero/unknown MTU defaults to the Ethernet payload max; headers are
 * added on so the table covers whole frames including a VLAN tag. */
4410 mtu = ETH_PAYLOAD_MAX;
4412 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
/* Find the smallest cell_log such that mtu >> cell_log < 256. */
4414 for (cell_log = 0; mtu >= 256; cell_log++) {
4421 /* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
/* Zeroes the whole structure first so fields not set here (including the
 * commented-out 2.6.24-era ones) stay 0. */
4424 tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
4426 memset(rate, 0, sizeof *rate);
4427 rate->cell_log = tc_calc_cell_log(mtu);
4428 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
4429 /* rate->cell_align = 0; */ /* distro headers. */
4430 rate->mpu = ETH_TOTAL_MIN;
4434 /* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
4435 * attribute of the specified "type".
4437 * See tc_calc_cell_log() above for a description of "rtab"s. */
4439 tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
4444 rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
/* Entry i covers packet sizes up to (i+1) << cell_log, clamped below by
 * the minimum packet unit, and stores the transmit time in ticks. */
4445 for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
4446 unsigned packet_size = (i + 1) << rate->cell_log;
4447 if (packet_size < rate->mpu) {
4448 packet_size = rate->mpu;
4450 rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
4454 /* Calculates the proper value of 'buffer' or 'cbuffer' in HTB options given a
4455 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
4456 * burst size of 'burst_bytes'.  (If no value was requested, a 'burst_bytes' of
/* The burst is never allowed below one jiffy's worth of traffic plus one
 * MTU, then converted to ticks for the kernel. */
4459 tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
4461 unsigned int min_burst = tc_buffer_per_jiffy(Bps) + mtu;
4462 return tc_bytes_to_ticks(Bps, MAX(burst_bytes, min_burst));
4465 /* Linux-only functions declared in netdev-linux.h */
4467 /* Modifies the 'flag' bit in ethtool's flags field for 'netdev'.  If
4468 * 'enable' is true, the bit is set.  Otherwise, it is cleared. */
4470 netdev_linux_ethtool_set_flag(struct netdev *netdev, uint32_t flag,
4471 const char *flag_name, bool enable)
4473 const char *netdev_name = netdev_get_name(netdev);
4474 struct ethtool_value evalue;
/* Read-modify-write of the ethtool flags word: GFLAGS, adjust 'flag',
 * SFLAGS, then GFLAGS again to verify the driver actually applied it. */
4478 COVERAGE_INC(netdev_get_ethtool);
4479 memset(&evalue, 0, sizeof evalue);
4480 error = netdev_linux_do_ethtool(netdev_name,
4481 (struct ethtool_cmd *)&evalue,
4482 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
4487 COVERAGE_INC(netdev_set_ethtool);
4488 evalue.data = new_flags = (evalue.data & ~flag) | (enable ? flag : 0);
4489 error = netdev_linux_do_ethtool(netdev_name,
4490 (struct ethtool_cmd *)&evalue,
4491 ETHTOOL_SFLAGS, "ETHTOOL_SFLAGS");
4496 COVERAGE_INC(netdev_get_ethtool);
4497 memset(&evalue, 0, sizeof evalue);
4498 error = netdev_linux_do_ethtool(netdev_name,
4499 (struct ethtool_cmd *)&evalue,
4500 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
/* Some drivers accept SFLAGS but silently ignore the bit; detect that. */
4505 if (new_flags != evalue.data) {
4506 VLOG_WARN_RL(&rl, "attempt to %s ethtool %s flag on network "
4507 "device %s failed", enable ? "enable" : "disable",
4508 flag_name, netdev_name);
4515 /* Utility functions. */
4517 /* Copies 'src' into 'dst', performing format conversion in the process. */
/* Field-by-field widening copy from the kernel's 32-bit rtnl_link_stats
 * into OVS's netdev_stats; fields map 1:1 by name. */
4519 netdev_stats_from_rtnl_link_stats(struct netdev_stats *dst,
4520 const struct rtnl_link_stats *src)
4522 dst->rx_packets = src->rx_packets;
4523 dst->tx_packets = src->tx_packets;
4524 dst->rx_bytes = src->rx_bytes;
4525 dst->tx_bytes = src->tx_bytes;
4526 dst->rx_errors = src->rx_errors;
4527 dst->tx_errors = src->tx_errors;
4528 dst->rx_dropped = src->rx_dropped;
4529 dst->tx_dropped = src->tx_dropped;
4530 dst->multicast = src->multicast;
4531 dst->collisions = src->collisions;
4532 dst->rx_length_errors = src->rx_length_errors;
4533 dst->rx_over_errors = src->rx_over_errors;
4534 dst->rx_crc_errors = src->rx_crc_errors;
4535 dst->rx_frame_errors = src->rx_frame_errors;
4536 dst->rx_fifo_errors = src->rx_fifo_errors;
4537 dst->rx_missed_errors = src->rx_missed_errors;
4538 dst->tx_aborted_errors = src->tx_aborted_errors;
4539 dst->tx_carrier_errors = src->tx_carrier_errors;
4540 dst->tx_fifo_errors = src->tx_fifo_errors;
4541 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
4542 dst->tx_window_errors = src->tx_window_errors;
/* Fetches interface statistics for 'netdev_' via an RTM_GETLINK request on
 * NETLINK_ROUTE and converts the IFLA_STATS attribute into '*stats'.
 * Logs rate-limited warnings if the reply is short or lacks stats. */
4546 get_stats_via_netlink(const struct netdev *netdev_, struct netdev_stats *stats)
4548 struct ofpbuf request;
4549 struct ofpbuf *reply;
4552 ofpbuf_init(&request, 0);
4553 nl_msg_put_nlmsghdr(&request,
4554 sizeof(struct ifinfomsg) + NL_ATTR_SIZE(IFNAMSIZ),
4555 RTM_GETLINK, NLM_F_REQUEST);
4556 ofpbuf_put_zeros(&request, sizeof(struct ifinfomsg));
4557 nl_msg_put_string(&request, IFLA_IFNAME, netdev_get_name(netdev_));
4558 error = nl_transact(NETLINK_ROUTE, &request, &reply);
4559 ofpbuf_uninit(&request);
/* Skip past the netlink and ifinfomsg headers before scanning attributes. */
4564 if (ofpbuf_try_pull(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg))) {
4565 const struct nlattr *a = nl_attr_find(reply, 0, IFLA_STATS);
4566 if (a && nl_attr_get_size(a) >= sizeof(struct rtnl_link_stats)) {
4567 netdev_stats_from_rtnl_link_stats(stats, nl_attr_get(a));
4570 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
4574 VLOG_WARN_RL(&rl, "short RTM_GETLINK reply");
4579 ofpbuf_delete(reply);
/* Reads 'dev''s interface flags (IFF_*) via SIOCGIFFLAGS into '*flags'. */
4584 get_flags(const struct netdev *dev, unsigned int *flags)
4590 error = af_inet_ifreq_ioctl(dev->name, &ifr, SIOCGIFFLAGS, "SIOCGIFFLAGS");
4592 *flags = ifr.ifr_flags;
/* Sets the interface flags of device 'name' to 'flags' via SIOCSIFFLAGS;
 * returns the ioctl's result. */
4598 set_flags(const char *name, unsigned int flags)
4602 ifr.ifr_flags = flags;
4603 return af_inet_ifreq_ioctl(name, &ifr, SIOCSIFFLAGS, "SIOCSIFFLAGS");
/* Looks up the ifindex of 'netdev_name' via SIOCGIFINDEX.  Returns the
 * ifindex on success; on failure logs a rate-limited warning.
 * NOTE(review): the failure return (presumably -error, given how
 * get_ifindex() below negates it) is elided in this extract. */
4607 do_get_ifindex(const char *netdev_name)
4612 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4613 COVERAGE_INC(netdev_get_ifindex);
4615 error = af_inet_ioctl(SIOCGIFINDEX, &ifr);
4617 VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
4618 netdev_name, ovs_strerror(error));
4621 return ifr.ifr_ifindex;
/* Returns 'netdev_''s ifindex in '*ifindexp', using the value cached on the
 * netdev_linux when VALID_IFINDEX is set and performing (and caching) a
 * fresh lookup otherwise.  Both the ifindex and any lookup error are cached,
 * so repeated failures do not re-issue the ioctl. */
4625 get_ifindex(const struct netdev *netdev_, int *ifindexp)
4627 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4629 if (!(netdev->cache_valid & VALID_IFINDEX)) {
4630 int ifindex = do_get_ifindex(netdev_get_name(netdev_));
/* do_get_ifindex() signals failure with a negative value; store it back
 * as a positive errno. */
4633 netdev->get_ifindex_error = -ifindex;
4634 netdev->ifindex = 0;
4636 netdev->get_ifindex_error = 0;
4637 netdev->ifindex = ifindex;
4639 netdev->cache_valid |= VALID_IFINDEX;
4642 *ifindexp = netdev->ifindex;
4643 return netdev->get_ifindex_error;
/* Reads the hardware (Ethernet) address of 'netdev_name' into 'ea' via
 * SIOCGIFHWADDR.  Warns if the reported address family is neither AF_UNSPEC
 * nor ARPHRD_ETHER but still copies the bytes. */
4647 get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN])
4653 memset(&ifr, 0, sizeof ifr);
4654 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4655 COVERAGE_INC(netdev_get_hwaddr);
4656 error = af_inet_ioctl(SIOCGIFHWADDR, &ifr);
4658 /* ENODEV probably means that a vif disappeared asynchronously and
4659 * hasn't been removed from the database yet, so reduce the log level
4660 * to INFO for that case. */
4661 VLOG(error == ENODEV ? VLL_INFO : VLL_ERR,
4662 "ioctl(SIOCGIFHWADDR) on %s device failed: %s",
4663 netdev_name, ovs_strerror(error));
4666 hwaddr_family = ifr.ifr_hwaddr.sa_family;
4667 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER) {
4668 VLOG_WARN("%s device has unknown hardware address family %d",
4669 netdev_name, hwaddr_family);
4671 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
/* Sets the hardware (Ethernet) address of 'netdev_name' to 'mac' via
 * SIOCSIFHWADDR; logs an error on failure. */
4676 set_etheraddr(const char *netdev_name,
4677 const uint8_t mac[ETH_ADDR_LEN])
4682 memset(&ifr, 0, sizeof ifr);
4683 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4684 ifr.ifr_hwaddr.sa_family = ARPHRD_ETHER;
4685 memcpy(ifr.ifr_hwaddr.sa_data, mac, ETH_ADDR_LEN);
4686 COVERAGE_INC(netdev_set_hwaddr);
4687 error = af_inet_ioctl(SIOCSIFHWADDR, &ifr);
4689 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
4690 netdev_name, ovs_strerror(error));
/* Issues ethtool command 'cmd' (e.g. ETHTOOL_GFLAGS) for device 'name',
 * reading/writing through 'ecmd'.  'cmd_name' is used only for logging.
 * EOPNOTSUPP is deliberately not logged because many devices lack ethtool
 * support; other errors get a rate-limited warning. */
4696 netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
4697 int cmd, const char *cmd_name)
4702 memset(&ifr, 0, sizeof ifr);
4703 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
4704 ifr.ifr_data = (caddr_t) ecmd;
4707 error = af_inet_ioctl(SIOCETHTOOL, &ifr);
4709 if (error != EOPNOTSUPP) {
4710 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
4711 "failed: %s", cmd_name, name, ovs_strerror(error));
4713 /* The device doesn't support this operation.  That's pretty
4714 * common, so there's no point in logging anything. */
/* Retrieves an IPv4 address of 'netdev' into '*ip' using the AF_INET ioctl
 * 'cmd' (e.g. SIOCGIFADDR); 'cmd_name' is for logging by the ioctl helper.
 * ALIGNED_CAST is needed because ifr_addr is a generic sockaddr that may not
 * satisfy sockaddr_in alignment on all platforms. */
4721 netdev_linux_get_ipv4(const struct netdev *netdev, struct in_addr *ip,
4722 int cmd, const char *cmd_name)
4727 ifr.ifr_addr.sa_family = AF_INET;
4728 error = af_inet_ifreq_ioctl(netdev_get_name(netdev), &ifr, cmd, cmd_name);
4730 const struct sockaddr_in *sin = ALIGNED_CAST(struct sockaddr_in *,
4732 *ip = sin->sin_addr;
4737 /* Returns an AF_PACKET raw socket or a negative errno value. */
4739 af_packet_sock(void)
4741 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
4744 if (ovsthread_once_start(&once)) {
4745 sock = socket(AF_PACKET, SOCK_RAW, 0);
4747 int error = set_nonblocking(sock);
4754 VLOG_ERR("failed to create packet socket: %s",
4755 ovs_strerror(errno));
4757 ovsthread_once_done(&once);