2 * Copyright (c) 2009, 2010, 2011, 2012, 2013 Nicira, Inc.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
19 #include "netdev-linux.h"
23 #include <arpa/inet.h>
25 #include <linux/filter.h>
26 #include <linux/gen_stats.h>
27 #include <linux/if_ether.h>
28 #include <linux/if_tun.h>
29 #include <linux/types.h>
30 #include <linux/ethtool.h>
31 #include <linux/mii.h>
32 #include <linux/pkt_cls.h>
33 #include <linux/pkt_sched.h>
34 #include <linux/rtnetlink.h>
35 #include <linux/sockios.h>
36 #include <linux/version.h>
37 #include <sys/types.h>
38 #include <sys/ioctl.h>
39 #include <sys/socket.h>
40 #include <netpacket/packet.h>
42 #include <net/if_arp.h>
43 #include <net/if_packet.h>
44 #include <net/route.h>
45 #include <netinet/in.h>
51 #include "connectivity.h"
53 #include "dpif-linux.h"
54 #include "dynamic-string.h"
55 #include "fatal-signal.h"
58 #include "netdev-provider.h"
59 #include "netdev-vport.h"
60 #include "netlink-notifier.h"
61 #include "netlink-socket.h"
64 #include "openflow/openflow.h"
65 #include "ovs-atomic.h"
67 #include "poll-loop.h"
68 #include "rtnetlink-link.h"
71 #include "socket-util.h"
74 #include "unaligned.h"
/* Logging module name for VLOG_* macros in this file, plus named coverage
 * counters bumped (via COVERAGE_INC) at the corresponding operations below,
 * e.g. COVERAGE_INC(netdev_get_ethtool) in netdev_linux_get_miimon(). */
77 VLOG_DEFINE_THIS_MODULE(netdev_linux);
79 COVERAGE_DEFINE(netdev_set_policing);
80 COVERAGE_DEFINE(netdev_arp_lookup);
81 COVERAGE_DEFINE(netdev_get_ifindex);
82 COVERAGE_DEFINE(netdev_get_hwaddr);
83 COVERAGE_DEFINE(netdev_set_hwaddr);
84 COVERAGE_DEFINE(netdev_get_ethtool);
85 COVERAGE_DEFINE(netdev_set_ethtool);
/* Fallback definitions for ethtool/TC constants that old kernel headers may
 * lack; values mirror the kernel's own definitions.
 * NOTE(review): the matching "#endif" lines (and part of the surrounding
 * comments) are missing from this extracted chunk — confirm against the
 * full file. */
88 /* These were introduced in Linux 2.6.14, so they might be missing if we have
90 #ifndef ADVERTISED_Pause
91 #define ADVERTISED_Pause (1 << 13)
93 #ifndef ADVERTISED_Asym_Pause
94 #define ADVERTISED_Asym_Pause (1 << 14)
97 /* These were introduced in Linux 2.6.24, so they might be missing if we
98 * have old headers. */
99 #ifndef ETHTOOL_GFLAGS
100 #define ETHTOOL_GFLAGS 0x00000025 /* Get flags bitmap(ethtool_value) */
102 #ifndef ETHTOOL_SFLAGS
103 #define ETHTOOL_SFLAGS 0x00000026 /* Set flags bitmap(ethtool_value) */
106 /* This was introduced in Linux 2.6.25, so it might be missing if we have old
109 #define TC_RTAB_SIZE 1024
/* Bits for the 'cache_valid' bitmask in struct netdev_linux below: each bit
 * says which lazily-fetched field(s) are currently valid.  Cleared (masked
 * off) by netdev_linux_changed() when the device changes.
 * NOTE(review): the enum opener and the bits for values 1<<2..1<<4 (MTU,
 * IPv4 address, ...) are missing from this extracted chunk; VALID_MTU is
 * referenced by the MTU functions below — confirm against the full file. */
113 VALID_IFINDEX = 1 << 0,
114 VALID_ETHERADDR = 1 << 1,
118 VALID_POLICING = 1 << 5,
119 VALID_VPORT_STAT_ERROR = 1 << 6,
120 VALID_DRVINFO = 1 << 7,
121 VALID_FEATURES = 1 << 8,
/* Generic traffic-control state shared by all TC implementations.  A
 * 'struct tc' is embedded (subclassed) by each qdisc implementation; the
 * 'queues' hmap holds 'struct tc_queue' entries keyed by OpenFlow queue ID.
 * NOTE(review): the "struct tc {" / "struct tc_queue {" openers and closing
 * braces are missing from this extracted chunk. */
124 /* Traffic control. */
126 /* An instance of a traffic control class. Always associated with a particular
129 * Each TC implementation subclasses this with whatever additional data it
132 const struct tc_ops *ops;
133 struct hmap queues; /* Contains "struct tc_queue"s.
134 * Read by generic TC layer.
135 * Written only by TC implementation. */
138 #define TC_INITIALIZER(TC, OPS) { OPS, HMAP_INITIALIZER(&(TC)->queues) }
140 /* One traffic control queue.
142 * Each TC implementation subclasses this with whatever additional data it
145 struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
146 unsigned int queue_id; /* OpenFlow queue ID. */
147 long long int created; /* Time queue was created, in msecs. */
/* Virtual function table ("ops") describing one kind of traffic control,
 * typically mapping to one Linux qdisc (htb, hfsc, the default pfifo_fast,
 * or "other" for unrecognized qdiscs — see the tcs[] table below).  The
 * per-member contracts are documented inline.
 * NOTE(review): the "struct tc_ops {" opener and closing brace are missing
 * from this extracted chunk. */
150 /* A particular kind of traffic control. Each implementation generally maps to
151 * one particular Linux qdisc class.
153 * The functions below return 0 if successful or a positive errno value on
154 * failure, except where otherwise noted. All of them must be provided, except
155 * where otherwise noted. */
157 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
158 * This is null for tc_ops_default and tc_ops_other, for which there are no
159 * appropriate values. */
160 const char *linux_name;
162 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
163 const char *ovs_name;
165 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
166 * queues. The queues are numbered 0 through n_queues - 1. */
167 unsigned int n_queues;
169 /* Called to install this TC class on 'netdev'. The implementation should
170 * make the Netlink calls required to set up 'netdev' with the right qdisc
171 * and configure it according to 'details'. The implementation may assume
172 * that the current qdisc is the default; that is, there is no need for it
173 * to delete the current qdisc before installing itself.
175 * The contents of 'details' should be documented as valid for 'ovs_name'
176 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
177 * (which is built as ovs-vswitchd.conf.db(8)).
179 * This function must return 0 if and only if it sets 'netdev->tc' to an
180 * initialized 'struct tc'.
182 * (This function is null for tc_ops_other, which cannot be installed. For
183 * other TC classes it should always be nonnull.) */
184 int (*tc_install)(struct netdev *netdev, const struct smap *details);
186 /* Called when the netdev code determines (through a Netlink query) that
187 * this TC class's qdisc is installed on 'netdev', but we didn't install
188 * it ourselves and so don't know any of the details.
190 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
191 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
192 * implementation should parse the other attributes of 'nlmsg' as
193 * necessary to determine its configuration. If necessary it should also
194 * use Netlink queries to determine the configuration of queues on
197 * This function must return 0 if and only if it sets 'netdev->tc' to an
198 * initialized 'struct tc'. */
199 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
201 /* Destroys the data structures allocated by the implementation as part of
202 * 'tc'. (This includes destroying 'tc->queues' by calling
205 * The implementation should not need to perform any Netlink calls. If
206 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
207 * (But it may not be desirable.)
209 * This function may be null if 'tc' is trivial. */
210 void (*tc_destroy)(struct tc *tc);
212 /* Retrieves details of 'netdev->tc' configuration into 'details'.
214 * The implementation should not need to perform any Netlink calls, because
215 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
216 * cached the configuration.
218 * The contents of 'details' should be documented as valid for 'ovs_name'
219 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
220 * (which is built as ovs-vswitchd.conf.db(8)).
222 * This function may be null if 'tc' is not configurable.
224 int (*qdisc_get)(const struct netdev *netdev, struct smap *details);
226 /* Reconfigures 'netdev->tc' according to 'details', performing any
227 * required Netlink calls to complete the reconfiguration.
229 * The contents of 'details' should be documented as valid for 'ovs_name'
230 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
231 * (which is built as ovs-vswitchd.conf.db(8)).
233 * This function may be null if 'tc' is not configurable.
235 int (*qdisc_set)(struct netdev *, const struct smap *details);
237 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
238 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
240 * The contents of 'details' should be documented as valid for 'ovs_name'
241 * in the "other_config" column in the "Queue" table in
242 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
244 * The implementation should not need to perform any Netlink calls, because
245 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
246 * cached the queue configuration.
248 * This function may be null if 'tc' does not have queues ('n_queues' is
250 int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
251 struct smap *details);
253 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
254 * 'details', perfoming any required Netlink calls to complete the
255 * reconfiguration. The caller ensures that 'queue_id' is less than
258 * The contents of 'details' should be documented as valid for 'ovs_name'
259 * in the "other_config" column in the "Queue" table in
260 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
262 * This function may be null if 'tc' does not have queues or its queues are
263 * not configurable. */
264 int (*class_set)(struct netdev *, unsigned int queue_id,
265 const struct smap *details);
267 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
268 * tc_queue's within 'netdev->tc->queues'.
270 * This function may be null if 'tc' does not have queues or its queues
271 * cannot be deleted. */
272 int (*class_delete)(struct netdev *, struct tc_queue *queue);
274 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
275 * 'struct tc_queue's within 'netdev->tc->queues'.
277 * On success, initializes '*stats'.
279 * This function may be null if 'tc' does not have queues or if it cannot
280 * report queue statistics. */
281 int (*class_get_stats)(const struct netdev *netdev,
282 const struct tc_queue *queue,
283 struct netdev_queue_stats *stats);
285 /* Extracts queue stats from 'nlmsg', which is a response to a
286 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
288 * This function may be null if 'tc' does not have queues or if it cannot
289 * report queue statistics. */
290 int (*class_dump_stats)(const struct netdev *netdev,
291 const struct ofpbuf *nlmsg,
292 netdev_dump_queue_stats_cb *cb, void *aux);
/* tc_init(): initializes generic 'tc' state — records 'ops' and creates the
 * empty 'queues' hmap.  tc_destroy(): releases the hmap (queue entries are
 * presumably freed by the implementation's own tc_destroy op — confirm).
 * NOTE(review): braces and the 'tc->ops = ops;' style assignment are missing
 * from this extracted chunk. */
296 tc_init(struct tc *tc, const struct tc_ops *ops)
299 hmap_init(&tc->queues);
303 tc_destroy(struct tc *tc)
305 hmap_destroy(&tc->queues);
/* Registry of known TC implementations plus forward declarations for the
 * TC helper functions (handle packing, rate/ticks conversion, Netlink
 * request plumbing, qdisc/class query and delete).  The tcs[] table is
 * searched in order; tc_ops_other must come last since it matches any
 * unrecognized qdisc. */
308 static const struct tc_ops tc_ops_htb;
309 static const struct tc_ops tc_ops_hfsc;
310 static const struct tc_ops tc_ops_default;
311 static const struct tc_ops tc_ops_other;
313 static const struct tc_ops *const tcs[] = {
314 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
315 &tc_ops_hfsc, /* Hierarchical fair service curve. */
316 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
317 &tc_ops_other, /* Some other qdisc. */
321 static unsigned int tc_make_handle(unsigned int major, unsigned int minor);
322 static unsigned int tc_get_major(unsigned int handle);
323 static unsigned int tc_get_minor(unsigned int handle);
325 static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
326 static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
327 static unsigned int tc_buffer_per_jiffy(unsigned int rate);
329 static struct tcmsg *tc_make_request(const struct netdev *, int type,
330 unsigned int flags, struct ofpbuf *);
331 static int tc_transact(struct ofpbuf *request, struct ofpbuf **replyp);
332 static int tc_add_del_ingress_qdisc(struct netdev *netdev, bool add);
333 static int tc_add_policer(struct netdev *netdev, int kbits_rate,
336 static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
337 struct nlattr **options);
338 static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
339 struct nlattr **options,
340 struct netdev_queue_stats *);
341 static int tc_query_class(const struct netdev *,
342 unsigned int handle, unsigned int parent,
343 struct ofpbuf **replyp);
344 static int tc_delete_class(const struct netdev *, unsigned int handle);
346 static int tc_del_qdisc(struct netdev *netdev);
347 static int tc_query_qdisc(const struct netdev *netdev);
349 static int tc_calc_cell_log(unsigned int mtu);
350 static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
351 static void tc_put_rtab(struct ofpbuf *, uint16_t type,
352 const struct tc_ratespec *rate);
353 static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
/* Per-device state.  'mutex' protects every member below it; most fields
 * are fetched lazily and are valid only when the matching VALID_* bit is
 * set in 'cache_valid' (cached errno values accompany each cached datum so
 * failures are cached too).
 * NOTE(review): several members referenced by functions below (up, mtu,
 * ifindex, tc, tap_fd) fall in gaps of this extracted chunk, as do the
 * closing braces and the body of struct netdev_rx_linux (up, fd, is_tap). */
355 struct netdev_linux {
358 /* Protects all members below. */
359 struct ovs_mutex mutex;
361 unsigned int cache_valid;
363 bool miimon; /* Link status of last poll. */
364 long long int miimon_interval; /* Miimon Poll rate. Disabled if <= 0. */
365 struct timer miimon_timer;
367 /* The following are figured out "on demand" only. They are only valid
368 * when the corresponding VALID_* bit in 'cache_valid' is set. */
370 uint8_t etheraddr[ETH_ADDR_LEN];
371 struct in_addr address, netmask;
374 unsigned int ifi_flags;
375 long long int carrier_resets;
376 uint32_t kbits_rate; /* Policing data. */
377 uint32_t kbits_burst;
378 int vport_stats_error; /* Cached error code from vport_get_stats().
379 0 or an errno value. */
380 int netdev_mtu_error; /* Cached error code from SIOCGIFMTU or SIOCSIFMTU. */
381 int ether_addr_error; /* Cached error code from set/get etheraddr. */
382 int netdev_policing_error; /* Cached error code from set policing. */
383 int get_features_error; /* Cached error code from ETHTOOL_GSET. */
384 int get_ifindex_error; /* Cached error code from SIOCGIFINDEX. */
386 enum netdev_features current; /* Cached from ETHTOOL_GSET. */
387 enum netdev_features advertised; /* Cached from ETHTOOL_GSET. */
388 enum netdev_features supported; /* Cached from ETHTOOL_GSET. */
390 struct ethtool_drvinfo drvinfo; /* Cached from ETHTOOL_GDRVINFO. */
393 /* For devices of class netdev_tap_class only. */
397 struct netdev_rx_linux {
403 /* This is set pretty low because we probably won't learn anything from the
404 * additional log messages. */
405 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
407 /* Polling miimon status for all ports causes performance degradation when
408 * handling a large number of ports. If there are no devices using miimon, then
409 * we skip netdev_linux_miimon_run() and netdev_linux_miimon_wait(). */
410 static atomic_int miimon_cnt = ATOMIC_VAR_INIT(0);
/* Forward declarations for the ioctl/netlink helpers used throughout this
 * file, followed by small inline predicates and downcast helpers.  The cast
 * helpers assert class membership (run == netdev_linux_run identifies any
 * netdev-linux subclass) before CONTAINER_OF-ing to the derived struct.
 * NOTE(review): function bodies here are missing their braces/return lines
 * in this extracted chunk. */
412 static void netdev_linux_run(void);
414 static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
415 int cmd, const char *cmd_name);
416 static int netdev_linux_get_ipv4(const struct netdev *, struct in_addr *,
417 int cmd, const char *cmd_name);
418 static int get_flags(const struct netdev *, unsigned int *flags);
419 static int set_flags(const char *, unsigned int flags);
420 static int update_flags(struct netdev_linux *netdev, enum netdev_flags off,
421 enum netdev_flags on, enum netdev_flags *old_flagsp)
422 OVS_REQUIRES(netdev->mutex);
423 static int do_get_ifindex(const char *netdev_name);
424 static int get_ifindex(const struct netdev *, int *ifindexp);
425 static int do_set_addr(struct netdev *netdev,
426 int ioctl_nr, const char *ioctl_name,
427 struct in_addr addr);
428 static int get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN]);
429 static int set_etheraddr(const char *netdev_name, const uint8_t[ETH_ADDR_LEN]);
430 static int get_stats_via_netlink(int ifindex, struct netdev_stats *stats);
431 static int get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats);
432 static int af_packet_sock(void);
433 static bool netdev_linux_miimon_enabled(void);
434 static void netdev_linux_miimon_run(void);
435 static void netdev_linux_miimon_wait(void);
438 is_netdev_linux_class(const struct netdev_class *netdev_class)
440 return netdev_class->run == netdev_linux_run;
444 is_tap_netdev(const struct netdev *netdev)
446 return netdev_get_class(netdev) == &netdev_tap_class;
449 static struct netdev_linux *
450 netdev_linux_cast(const struct netdev *netdev)
452 ovs_assert(is_netdev_linux_class(netdev_get_class(netdev)));
454 return CONTAINER_OF(netdev, struct netdev_linux, up);
457 static struct netdev_rx_linux *
458 netdev_rx_linux_cast(const struct netdev_rx *rx)
460 ovs_assert(is_netdev_linux_class(netdev_get_class(rx->netdev)));
461 return CONTAINER_OF(rx, struct netdev_rx_linux, up);
464 static void netdev_linux_update(struct netdev_linux *netdev,
465 const struct rtnetlink_link_change *)
466 OVS_REQUIRES(netdev->mutex);
467 static void netdev_linux_changed(struct netdev_linux *netdev,
468 unsigned int ifi_flags, unsigned int mask)
469 OVS_REQUIRES(netdev->mutex);
/* Lazily creates (once per process, guarded by ovsthread_once) a
 * NETLINK_ROUTE socket joined to the RTNLGRP_LINK multicast group; on
 * join failure the socket is destroyed (and presumably 'sock' reset to
 * NULL in a missing line — confirm).  Subsequent calls return the same
 * cached pointer. */
471 /* Returns a NETLINK_ROUTE socket listening for RTNLGRP_LINK changes, or NULL
472 * if no such socket could be created. */
473 static struct nl_sock *
474 netdev_linux_notify_sock(void)
476 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
477 static struct nl_sock *sock;
479 if (ovsthread_once_start(&once)) {
482 error = nl_sock_create(NETLINK_ROUTE, &sock);
484 error = nl_sock_join_mcgroup(sock, RTNLGRP_LINK);
486 nl_sock_destroy(sock);
490 ovsthread_once_done(&once);
/* Returns true iff at least one device currently has miimon polling
 * enabled, by atomically reading the global 'miimon_cnt' counter (the
 * 'return miimon > 0;' style line is missing from this extracted chunk —
 * confirm). */
497 netdev_linux_miimon_enabled(void)
501 atomic_read(&miimon_cnt, &miimon);
/* Per-iteration "run" callback for the netdev-linux classes: polls miimon
 * if enabled, then drains rtnetlink link-change messages from the shared
 * notify socket.  Each parsed change updates the matching netdev under its
 * mutex; on ENOBUFS (netlink overrun) every known device is refreshed from
 * scratch via get_flags(); other errors except EAGAIN are rate-limit
 * logged.  NOTE(review): the surrounding loop construct and several brace
 * lines are missing from this extracted chunk. */
506 netdev_linux_run(void)
508 struct nl_sock *sock;
511 if (netdev_linux_miimon_enabled()) {
512 netdev_linux_miimon_run();
515 sock = netdev_linux_notify_sock();
521 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
522 uint64_t buf_stub[4096 / 8];
525 ofpbuf_use_stub(&buf, buf_stub, sizeof buf_stub);
526 error = nl_sock_recv(sock, &buf, false);
528 struct rtnetlink_link_change change;
530 if (rtnetlink_link_parse(&buf, &change)) {
531 struct netdev *netdev_ = netdev_from_name(change.ifname);
532 if (netdev_ && is_netdev_linux_class(netdev_->netdev_class)) {
533 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
535 ovs_mutex_lock(&netdev->mutex);
536 netdev_linux_update(netdev, &change);
537 ovs_mutex_unlock(&netdev->mutex);
539 netdev_close(netdev_);
541 } else if (error == ENOBUFS) {
542 struct shash device_shash;
543 struct shash_node *node;
547 shash_init(&device_shash);
548 netdev_get_devices(&netdev_linux_class, &device_shash);
549 SHASH_FOR_EACH (node, &device_shash) {
550 struct netdev *netdev_ = node->data;
551 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
554 ovs_mutex_lock(&netdev->mutex);
555 get_flags(netdev_, &flags);
556 netdev_linux_changed(netdev, flags, 0);
557 ovs_mutex_unlock(&netdev->mutex);
559 netdev_close(netdev_);
561 shash_destroy(&device_shash);
562 } else if (error != EAGAIN) {
563 VLOG_WARN_RL(&rl, "error reading or parsing netlink (%s)",
564 ovs_strerror(error));
/* "wait" callback paired with netdev_linux_run(): registers wakeups with
 * the poll loop — the miimon timer (if any device uses miimon) and POLLIN
 * on the shared rtnetlink notify socket. */
571 netdev_linux_wait(void)
573 struct nl_sock *sock;
575 if (netdev_linux_miimon_enabled()) {
576 netdev_linux_miimon_wait();
578 sock = netdev_linux_notify_sock();
580 nl_sock_wait(sock, POLLIN);
/* Records a device state change: bumps the global connectivity sequence,
 * counts a carrier reset if IFF_RUNNING toggled, stores the new interface
 * flags, and invalidates cached fields by AND-ing 'cache_valid' with
 * 'mask' (so 'mask' lists the VALID_* bits that *survive*; pass 0 to
 * invalidate everything).  Caller must hold dev->mutex (OVS_REQUIRES). */
585 netdev_linux_changed(struct netdev_linux *dev,
586 unsigned int ifi_flags, unsigned int mask)
587 OVS_REQUIRES(dev->mutex)
589 seq_change(connectivity_seq_get());
591 if ((dev->ifi_flags ^ ifi_flags) & IFF_RUNNING) {
592 dev->carrier_resets++;
594 dev->ifi_flags = ifi_flags;
596 dev->cache_valid &= mask;
/* Applies one parsed rtnetlink link-change message to 'dev'.  For
 * RTM_NEWLINK: invalidates everything except VALID_DRVINFO, then refreshes
 * MTU, Ethernet address (only if the message carried a nonzero address),
 * and ifindex directly from the message, marking each cache bit valid and
 * clearing its cached error.  Otherwise (presumably RTM_DELLINK — the else
 * line is missing from this extracted chunk) all cached state is
 * invalidated.  Caller must hold dev->mutex (OVS_REQUIRES). */
600 netdev_linux_update(struct netdev_linux *dev,
601 const struct rtnetlink_link_change *change)
602 OVS_REQUIRES(dev->mutex)
604 if (change->nlmsg_type == RTM_NEWLINK) {
606 netdev_linux_changed(dev, change->ifi_flags, VALID_DRVINFO);
608 /* Update netdev from rtnl-change msg. */
610 dev->mtu = change->mtu;
611 dev->cache_valid |= VALID_MTU;
612 dev->netdev_mtu_error = 0;
615 if (!eth_addr_is_zero(change->addr)) {
616 memcpy(dev->etheraddr, change->addr, ETH_ADDR_LEN);
617 dev->cache_valid |= VALID_ETHERADDR;
618 dev->ether_addr_error = 0;
621 dev->ifindex = change->ifi_index;
622 dev->cache_valid |= VALID_IFINDEX;
623 dev->get_ifindex_error = 0;
626 netdev_linux_changed(dev, change->ifi_flags, 0);
/* Class "alloc" callback: zero-allocates a struct netdev_linux and
 * (in a missing line) presumably returns &netdev->up.  The common
 * constructor initializes the per-device mutex; further shared init lines
 * are missing from this extracted chunk. */
630 static struct netdev *
631 netdev_linux_alloc(void)
633 struct netdev_linux *netdev = xzalloc(sizeof *netdev);
638 netdev_linux_common_construct(struct netdev_linux *netdev)
640 ovs_mutex_init(&netdev->mutex);
/* "construct" callback for system and internal devices: runs common init,
 * then probes the kernel device with get_flags().  ENODEV is fatal for
 * system devices (they must already exist) but deliberately ignored for
 * internal devices, which are created in the kernel later via
 * dpif_port_add().  NOTE(review): the error-return lines are missing from
 * this extracted chunk. */
643 /* Creates system and internal devices. */
645 netdev_linux_construct(struct netdev *netdev_)
647 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
650 netdev_linux_common_construct(netdev);
652 error = get_flags(&netdev->up, &netdev->ifi_flags);
653 if (error == ENODEV) {
654 if (netdev->up.netdev_class != &netdev_internal_class) {
655 /* The device does not exist, so don't allow it to be opened. */
658 /* "Internal" netdevs have to be created as netdev objects before
659 * they exist in the kernel, because creating them in the kernel
660 * happens by passing a netdev object to dpif_port_add().
661 * Therefore, ignore the error. */
/* "construct" callback for tap devices.  Opens /dev/net/tun, creates the
 * tap interface via TUNSETIFF (IFF_TAP | IFF_NO_PI, i.e. Ethernet frames
 * with no packet-information header), and makes the fd non-blocking.  The
 * single fd is shared by all readers (see comment below), so each frame is
 * delivered to at most one reader.  On any failure after open(), the fd is
 * closed (error-path labels/returns are missing from this extracted
 * chunk). */
668 /* For most types of netdevs we open the device for each call of
669 * netdev_open(). However, this is not the case with tap devices,
670 * since it is only possible to open the device once. In this
671 * situation we share a single file descriptor, and consequently
672 * buffers, across all readers. Therefore once data is read it will
673 * be unavailable to other reads for tap devices. */
675 netdev_linux_construct_tap(struct netdev *netdev_)
677 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
678 static const char tap_dev[] = "/dev/net/tun";
679 const char *name = netdev_->name;
683 netdev_linux_common_construct(netdev);
685 /* Open tap device. */
686 netdev->tap_fd = open(tap_dev, O_RDWR);
687 if (netdev->tap_fd < 0) {
689 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, ovs_strerror(error));
693 /* Create tap device. */
694 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
695 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
696 if (ioctl(netdev->tap_fd, TUNSETIFF, &ifr) == -1) {
697 VLOG_WARN("%s: creating tap device failed: %s", name,
698 ovs_strerror(errno));
703 /* Make non-blocking. */
704 error = set_nonblocking(netdev->tap_fd);
712 close(netdev->tap_fd);
/* "destruct" callback: tears down the installed TC implementation (if it
 * has a tc_destroy op), closes the tap fd for tap-class devices, decrements
 * the global miimon counter if this device had miimon enabled, and destroys
 * the mutex.  "dealloc" then frees the structure (the free() line falls in
 * a gap of this extracted chunk — confirm). */
717 netdev_linux_destruct(struct netdev *netdev_)
719 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
721 if (netdev->tc && netdev->tc->ops->tc_destroy) {
722 netdev->tc->ops->tc_destroy(netdev->tc);
725 if (netdev_get_class(netdev_) == &netdev_tap_class
726 && netdev->tap_fd >= 0)
728 close(netdev->tap_fd);
731 if (netdev->miimon_interval > 0) {
733 atomic_sub(&miimon_cnt, 1, &junk);
736 ovs_mutex_destroy(&netdev->mutex);
740 netdev_linux_dealloc(struct netdev *netdev_)
742 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
/* RX-handle alloc + construct.  For tap devices the rx handle simply reuses
 * the shared tap fd.  For other devices it creates a non-blocking AF_PACKET
 * SOCK_RAW socket, binds it to the device's ifindex with protocol
 * ETH_P_ALL, and attaches a BPF classic filter (literal output of
 * "tcpdump -dd inbound": loads the packet-type word at SKF_AD offset
 * 0xfffff004 and drops PACKET_OUTGOING) so locally-sent packets are not
 * received back.  Error paths unlock the mutex and (in lines missing from
 * this extracted chunk) presumably close rx->fd. */
746 static struct netdev_rx *
747 netdev_linux_rx_alloc(void)
749 struct netdev_rx_linux *rx = xzalloc(sizeof *rx);
754 netdev_linux_rx_construct(struct netdev_rx *rx_)
756 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
757 struct netdev *netdev_ = rx->up.netdev;
758 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
761 ovs_mutex_lock(&netdev->mutex);
762 rx->is_tap = is_tap_netdev(netdev_);
764 rx->fd = netdev->tap_fd;
766 struct sockaddr_ll sll;
768 /* Result of tcpdump -dd inbound */
769 static const struct sock_filter filt[] = {
770 { 0x28, 0, 0, 0xfffff004 }, /* ldh [0] */
771 { 0x15, 0, 1, 0x00000004 }, /* jeq #4 jt 2 jf 3 */
772 { 0x6, 0, 0, 0x00000000 }, /* ret #0 */
773 { 0x6, 0, 0, 0x0000ffff } /* ret #65535 */
775 static const struct sock_fprog fprog = {
776 ARRAY_SIZE(filt), (struct sock_filter *) filt
779 /* Create file descriptor. */
780 rx->fd = socket(PF_PACKET, SOCK_RAW, 0);
783 VLOG_ERR("failed to create raw socket (%s)", ovs_strerror(error));
787 /* Set non-blocking mode. */
788 error = set_nonblocking(rx->fd);
793 /* Get ethernet device index. */
794 error = get_ifindex(&netdev->up, &ifindex);
799 /* Bind to specific ethernet device. */
800 memset(&sll, 0, sizeof sll);
801 sll.sll_family = AF_PACKET;
802 sll.sll_ifindex = ifindex;
803 sll.sll_protocol = (OVS_FORCE unsigned short int) htons(ETH_P_ALL);
804 if (bind(rx->fd, (struct sockaddr *) &sll, sizeof sll) < 0) {
806 VLOG_ERR("%s: failed to bind raw socket (%s)",
807 netdev_get_name(netdev_), ovs_strerror(error));
811 /* Filter for only inbound packets. */
812 error = setsockopt(rx->fd, SOL_SOCKET, SO_ATTACH_FILTER, &fprog,
816 VLOG_ERR("%s: failed to attach filter (%s)",
817 netdev_get_name(netdev_), ovs_strerror(error));
821 ovs_mutex_unlock(&netdev->mutex);
829 ovs_mutex_unlock(&netdev->mutex);
/* RX teardown and data path.  destruct/dealloc release the handle (the
 * close(rx->fd) for the non-tap case falls in a gap of this extracted
 * chunk — confirm).  recv: read() for tap, recv(..., MSG_TRUNC) for the
 * AF_PACKET socket (MSG_TRUNC makes the return value the full frame
 * length, so an oversized frame is detected and mapped to -EMSGSIZE);
 * EINTR retries; other errors except EAGAIN are rate-limit logged —
 * NOTE(review): the log call's name/strerror argument order looks swapped
 * relative to its format string; confirm against the full file.  wait
 * registers POLLIN.  drain: for tap, drains as many packets as the
 * device's tx queue length (SIOCGIFTXQLEN); otherwise drains the socket
 * receive buffer. */
834 netdev_linux_rx_destruct(struct netdev_rx *rx_)
836 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
844 netdev_linux_rx_dealloc(struct netdev_rx *rx_)
846 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
852 netdev_linux_rx_recv(struct netdev_rx *rx_, void *data, size_t size)
854 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
859 ? read(rx->fd, data, size)
860 : recv(rx->fd, data, size, MSG_TRUNC));
861 } while (retval < 0 && errno == EINTR);
864 return retval > size ? -EMSGSIZE : retval;
866 if (errno != EAGAIN) {
867 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
868 ovs_strerror(errno), netdev_rx_get_name(rx_));
875 netdev_linux_rx_wait(struct netdev_rx *rx_)
877 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
878 poll_fd_wait(rx->fd, POLLIN);
882 netdev_linux_rx_drain(struct netdev_rx *rx_)
884 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
887 int error = af_inet_ifreq_ioctl(netdev_rx_get_name(rx_), &ifr,
888 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
892 drain_fd(rx->fd, ifr.ifr_qlen);
895 return drain_rcvbuf(rx->fd);
/* TX path.  Non-tap devices send via the shared AF_PACKET socket with
 * sendmsg(), addressing only sll_family/sll_ifindex (the kernel ignores
 * the rest for SOCK_RAW).  Tap devices write() to the tap fd instead, so
 * the frame is not looped back as an incoming packet (the rx BPF filter
 * handles that for other device types).  Error mapping: ENOBUFS -> EAGAIN
 * (AF_PACKET's "queue full"), EINTR -> retry (presumably — the action
 * lines after the errno tests fall in gaps of this extracted chunk), a
 * short write -> EMSGSIZE with a rate-limited warning. */
899 /* Sends 'buffer' on 'netdev'. Returns 0 if successful, otherwise a positive
900 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
901 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
902 * the packet is too big or too small to transmit on the device.
904 * The caller retains ownership of 'buffer' in all cases.
906 * The kernel maintains a packet transmission queue, so the caller is not
907 * expected to do additional queuing of packets. */
909 netdev_linux_send(struct netdev *netdev_, const void *data, size_t size)
914 if (!is_tap_netdev(netdev_)) {
915 /* Use our AF_PACKET socket to send to this device. */
916 struct sockaddr_ll sll;
922 sock = af_packet_sock();
927 ifindex = netdev_get_ifindex(netdev_);
932 /* We don't bother setting most fields in sockaddr_ll because the
933 * kernel ignores them for SOCK_RAW. */
934 memset(&sll, 0, sizeof sll);
935 sll.sll_family = AF_PACKET;
936 sll.sll_ifindex = ifindex;
938 iov.iov_base = CONST_CAST(void *, data);
942 msg.msg_namelen = sizeof sll;
945 msg.msg_control = NULL;
946 msg.msg_controllen = 0;
949 retval = sendmsg(sock, &msg, 0);
951 /* Use the tap fd to send to this device. This is essential for
952 * tap devices, because packets sent to a tap device with an
953 * AF_PACKET socket will loop back to be *received* again on the
954 * tap device. This doesn't occur on other interface types
955 * because we attach a socket filter to the rx socket. */
956 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
958 retval = write(netdev->tap_fd, data, size);
962 /* The Linux AF_PACKET implementation never blocks waiting for room
963 * for packets, instead returning ENOBUFS. Translate this into
964 * EAGAIN for the caller. */
965 if (errno == ENOBUFS) {
967 } else if (errno == EINTR) {
969 } else if (errno != EAGAIN) {
970 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
971 netdev_get_name(netdev_), ovs_strerror(errno));
974 } else if (retval != size) {
975 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%"PRIuSIZE"d bytes of "
976 "%"PRIuSIZE") on %s", retval, size, netdev_get_name(netdev_));
/* "send_wait" callback: tap devices never exert backpressure, so wake the
 * poll loop immediately; for other devices the wakeup registration (if
 * any) falls in a gap of this extracted chunk — confirm. */
984 /* Registers with the poll loop to wake up from the next call to poll_block()
985 * when the packet transmission queue has sufficient room to transmit a packet
986 * with netdev_send().
988 * The kernel maintains a packet transmission queue, so the client is not
989 * expected to do additional queuing of packets. Thus, this function is
990 * unlikely to ever be used. It is included for completeness. */
992 netdev_linux_send_wait(struct netdev *netdev)
994 if (is_tap_netdev(netdev)) {
995 /* TAP device always accepts packets.*/
996 poll_immediate_wake();
/* MAC get/set with caching.  set: short-circuits if the cached address (or
 * cached error) already matches; tap devices are brought NETDEV_UP-down
 * before the SIOCSIFHWADDR-style set and restored afterwards (the kernel
 * refuses address changes on an up tap device — NOTE(review): inferred,
 * confirm).  Result (including ENODEV) is cached along with the address.
 * get: fetches via get_etheraddr() on first use, then serves from cache;
 * returns the cached errno, copying the address out only on success. */
1000 /* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
1001 * otherwise a positive errno value. */
1003 netdev_linux_set_etheraddr(struct netdev *netdev_,
1004 const uint8_t mac[ETH_ADDR_LEN])
1006 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1007 enum netdev_flags old_flags = 0;
1010 ovs_mutex_lock(&netdev->mutex);
1012 if (netdev->cache_valid & VALID_ETHERADDR) {
1013 error = netdev->ether_addr_error;
1014 if (error || eth_addr_equals(netdev->etheraddr, mac)) {
1017 netdev->cache_valid &= ~VALID_ETHERADDR;
1020 /* Tap devices must be brought down before setting the address. */
1021 if (is_tap_netdev(netdev_)) {
1022 update_flags(netdev, NETDEV_UP, 0, &old_flags);
1024 error = set_etheraddr(netdev_get_name(netdev_), mac);
1025 if (!error || error == ENODEV) {
1026 netdev->ether_addr_error = error;
1027 netdev->cache_valid |= VALID_ETHERADDR;
1029 memcpy(netdev->etheraddr, mac, ETH_ADDR_LEN);
1033 if (is_tap_netdev(netdev_) && old_flags & NETDEV_UP) {
1034 update_flags(netdev, 0, NETDEV_UP, &old_flags);
1038 ovs_mutex_unlock(&netdev->mutex);
1042 /* Copies 'netdev''s MAC address to 'mac' which is passed as param. */
1044 netdev_linux_get_etheraddr(const struct netdev *netdev_,
1045 uint8_t mac[ETH_ADDR_LEN])
1047 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1050 ovs_mutex_lock(&netdev->mutex);
1051 if (!(netdev->cache_valid & VALID_ETHERADDR)) {
1052 netdev->ether_addr_error = get_etheraddr(netdev_get_name(netdev_),
1054 netdev->cache_valid |= VALID_ETHERADDR;
1057 error = netdev->ether_addr_error;
1059 memcpy(mac, netdev->etheraddr, ETH_ADDR_LEN);
1061 ovs_mutex_unlock(&netdev->mutex);
/* MTU get/set with the same cache-plus-cached-errno pattern as the
 * etheraddr functions.  get__ (mutex already held) fills the cache from
 * SIOCGIFMTU on first use; get is the locking public wrapper.  set
 * short-circuits when the cached MTU (or cached error) already matches,
 * otherwise issues SIOCSIFMTU and re-caches the result (including ENODEV).
 * NOTE(review): the line assigning 'mtu' into ifr.ifr_mtu before the
 * SIOCSIFMTU call falls in a gap of this extracted chunk — confirm. */
1067 netdev_linux_get_mtu__(struct netdev_linux *netdev, int *mtup)
1071 if (!(netdev->cache_valid & VALID_MTU)) {
1074 netdev->netdev_mtu_error = af_inet_ifreq_ioctl(
1075 netdev_get_name(&netdev->up), &ifr, SIOCGIFMTU, "SIOCGIFMTU");
1076 netdev->mtu = ifr.ifr_mtu;
1077 netdev->cache_valid |= VALID_MTU;
1080 error = netdev->netdev_mtu_error;
1082 *mtup = netdev->mtu;
1088 /* Returns the maximum size of transmitted (and received) packets on 'netdev',
1089 * in bytes, not including the hardware header; thus, this is typically 1500
1090 * bytes for Ethernet devices. */
1092 netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
1094 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1097 ovs_mutex_lock(&netdev->mutex);
1098 error = netdev_linux_get_mtu__(netdev, mtup);
1099 ovs_mutex_unlock(&netdev->mutex);
1104 /* Sets the maximum size of transmitted (MTU) for given device using linux
1105 * networking ioctl interface.
1108 netdev_linux_set_mtu(const struct netdev *netdev_, int mtu)
1110 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1114 ovs_mutex_lock(&netdev->mutex);
1115 if (netdev->cache_valid & VALID_MTU) {
1116 error = netdev->netdev_mtu_error;
1117 if (error || netdev->mtu == mtu) {
1120 netdev->cache_valid &= ~VALID_MTU;
1123 error = af_inet_ifreq_ioctl(netdev_get_name(netdev_), &ifr,
1124 SIOCSIFMTU, "SIOCSIFMTU");
1125 if (!error || error == ENODEV) {
1126 netdev->netdev_mtu_error = error;
1127 netdev->mtu = ifr.ifr_mtu;
1128 netdev->cache_valid |= VALID_MTU;
1131 ovs_mutex_unlock(&netdev->mutex);
/* Simple locked getters.  get_ifindex encodes errors as negative errno in
 * the return value (positive = ifindex).  get_carrier reports the miimon
 * poll result when miimon is active, else the IFF_RUNNING flag.
 * get_carrier_resets returns the running count maintained by
 * netdev_linux_changed(). */
1135 /* Returns the ifindex of 'netdev', if successful, as a positive number.
1136 * On failure, returns a negative errno value. */
1138 netdev_linux_get_ifindex(const struct netdev *netdev_)
1140 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1143 ovs_mutex_lock(&netdev->mutex);
1144 error = get_ifindex(netdev_, &ifindex);
1145 ovs_mutex_unlock(&netdev->mutex);
1147 return error ? -error : ifindex;
1151 netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
1153 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1155 ovs_mutex_lock(&netdev->mutex);
1156 if (netdev->miimon_interval > 0) {
1157 *carrier = netdev->miimon;
1159 *carrier = (netdev->ifi_flags & IFF_RUNNING) != 0;
1161 ovs_mutex_unlock(&netdev->mutex);
1166 static long long int
1167 netdev_linux_get_carrier_resets(const struct netdev *netdev_)
1169 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1170 long long int carrier_resets;
1172 ovs_mutex_lock(&netdev->mutex);
1173 carrier_resets = netdev->carrier_resets;
1174 ovs_mutex_unlock(&netdev->mutex);
1176 return carrier_resets;
     /* Issues the MII ioctl 'cmd' for device 'name', copying 'data' in and
      * out of ifr.ifr_data around the call. */
1180 netdev_linux_do_miimon(const char *name, int cmd, const char *cmd_name,
1181 struct mii_ioctl_data *data)
1186 memset(&ifr, 0, sizeof ifr);
1187 memcpy(&ifr.ifr_data, data, sizeof *data);
1188 error = af_inet_ifreq_ioctl(name, &ifr, cmd, cmd_name);
1189 memcpy(data, &ifr.ifr_data, sizeof *data);
     /* Determines link status for 'name' via MII, storing it in '*miimon'.
      * Falls back to ETHTOOL_GLINK if the MII ioctls are unsupported. */
1195 netdev_linux_get_miimon(const char *name, bool *miimon)
1197 struct mii_ioctl_data data;
1202 memset(&data, 0, sizeof data);
1203 error = netdev_linux_do_miimon(name, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
1205 /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
1206 data.reg_num = MII_BMSR;
1207 error = netdev_linux_do_miimon(name, SIOCGMIIREG, "SIOCGMIIREG",
     /* BMSR_LSTATUS is the link-up bit in the MII basic status register. */
1211 *miimon = !!(data.val_out & BMSR_LSTATUS);
1213 VLOG_WARN_RL(&rl, "%s: failed to query MII", name);
1216 struct ethtool_cmd ecmd;
1218 VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
1221 COVERAGE_INC(netdev_get_ethtool);
1222 memset(&ecmd, 0, sizeof ecmd);
1223 error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
1226 struct ethtool_value eval;
     /* ETHTOOL_GLINK returns a struct ethtool_value laid out at the start of
      * the ethtool_cmd buffer; extract it by copy. */
1228 memcpy(&eval, &ecmd, sizeof eval);
1229 *miimon = !!eval.data;
1231 VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
     /* Enables (interval > 0) or disables (interval <= 0) MII link polling
      * for 'netdev_'.  Intervals are clamped to a minimum of 100 ms. */
1239 netdev_linux_set_miimon_interval(struct netdev *netdev_,
1240 long long int interval)
1242 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1244 ovs_mutex_lock(&netdev->mutex);
1245 interval = interval > 0 ? MAX(interval, 100) : 0;
1246 if (netdev->miimon_interval != interval) {
     /* miimon_cnt is a global count of devices with miimon enabled; adjust it
      * only on enable/disable transitions, not on interval changes. */
1249 if (interval && !netdev->miimon_interval) {
1250 atomic_add(&miimon_cnt, 1, &junk);
1251 } else if (!interval && netdev->miimon_interval) {
1252 atomic_sub(&miimon_cnt, 1, &junk);
1255 netdev->miimon_interval = interval;
     /* Force an immediate poll on the next netdev_linux_miimon_run(). */
1256 timer_set_expired(&netdev->miimon_timer);
1258 ovs_mutex_unlock(&netdev->mutex);
     /* Polls MII link status for every netdev-linux device whose miimon timer
      * has expired, recording changes via netdev_linux_changed(). */
1264 netdev_linux_miimon_run(void)
1266 struct shash device_shash;
1267 struct shash_node *node;
1269 shash_init(&device_shash);
1270 netdev_get_devices(&netdev_linux_class, &device_shash);
1271 SHASH_FOR_EACH (node, &device_shash) {
1272 struct netdev *netdev = node->data;
1273 struct netdev_linux *dev = netdev_linux_cast(netdev);
1276 ovs_mutex_lock(&dev->mutex);
1277 if (dev->miimon_interval > 0 && timer_expired(&dev->miimon_timer)) {
1278 netdev_linux_get_miimon(dev->up.name, &miimon);
1279 if (miimon != dev->miimon) {
1280 dev->miimon = miimon;
1281 netdev_linux_changed(dev, dev->ifi_flags, 0);
1284 timer_set_duration(&dev->miimon_timer, dev->miimon_interval);
1286 ovs_mutex_unlock(&dev->mutex);
     /* netdev_get_devices() took a reference on each device; drop it. */
1287 netdev_close(netdev);
1290 shash_destroy(&device_shash);
     /* Registers a wakeup with the poll loop for each device whose miimon
      * timer will expire, so miimon_run() is called in time. */
1294 netdev_linux_miimon_wait(void)
1296 struct shash device_shash;
1297 struct shash_node *node;
1299 shash_init(&device_shash);
1300 netdev_get_devices(&netdev_linux_class, &device_shash);
1301 SHASH_FOR_EACH (node, &device_shash) {
1302 struct netdev *netdev = node->data;
1303 struct netdev_linux *dev = netdev_linux_cast(netdev);
1305 ovs_mutex_lock(&dev->mutex);
1306 if (dev->miimon_interval > 0) {
1307 timer_wait(&dev->miimon_timer);
1309 ovs_mutex_unlock(&dev->mutex);
1310 netdev_close(netdev);
1312 shash_destroy(&device_shash);
1315 /* Check whether we can we use RTM_GETLINK to get network device statistics.
1316 * In pre-2.6.19 kernels, this was only available if wireless extensions were
1319 check_for_working_netlink_stats(void)
1321 /* Decide on the netdev_get_stats() implementation to use. Netlink is
1322 * preferable, so if that works, we'll use it. */
     /* Probe using the loopback device, which should always exist. */
1323 int ifindex = do_get_ifindex("lo");
1325 VLOG_WARN("failed to get ifindex for lo, "
1326 "obtaining netdev stats from proc");
1329 struct netdev_stats stats;
1330 int error = get_stats_via_netlink(ifindex, &stats);
1332 VLOG_DBG("obtaining netdev stats via rtnetlink");
1335 VLOG_INFO("RTM_GETLINK failed (%s), obtaining netdev stats "
1336 "via proc (you are probably running a pre-2.6.19 "
1337 "kernel)", ovs_strerror(error));
1344 swap_uint64(uint64_t *a, uint64_t *b)
1351 /* Copies 'src' into 'dst', performing format conversion in the process.
1353 * 'src' is allowed to be misaligned. */
1355 netdev_stats_from_ovs_vport_stats(struct netdev_stats *dst,
1356 const struct ovs_vport_stats *src)
     /* get_unaligned_u64() performs a safe read even when 'src' is not
      * 8-byte aligned (it may come straight out of a Netlink buffer). */
1358 dst->rx_packets = get_unaligned_u64(&src->rx_packets);
1359 dst->tx_packets = get_unaligned_u64(&src->tx_packets);
1360 dst->rx_bytes = get_unaligned_u64(&src->rx_bytes);
1361 dst->tx_bytes = get_unaligned_u64(&src->tx_bytes);
1362 dst->rx_errors = get_unaligned_u64(&src->rx_errors);
1363 dst->tx_errors = get_unaligned_u64(&src->tx_errors);
1364 dst->rx_dropped = get_unaligned_u64(&src->rx_dropped);
1365 dst->tx_dropped = get_unaligned_u64(&src->tx_dropped);
     /* The vport layer does not track the detailed error counters below, so
      * zero them rather than leaving garbage. */
1367 dst->collisions = 0;
1368 dst->rx_length_errors = 0;
1369 dst->rx_over_errors = 0;
1370 dst->rx_crc_errors = 0;
1371 dst->rx_frame_errors = 0;
1372 dst->rx_fifo_errors = 0;
1373 dst->rx_missed_errors = 0;
1374 dst->tx_aborted_errors = 0;
1375 dst->tx_carrier_errors = 0;
1376 dst->tx_fifo_errors = 0;
1377 dst->tx_heartbeat_errors = 0;
1378 dst->tx_window_errors = 0;
     /* Queries the OVS datapath vport layer for stats on 'netdev'.  Fails
      * when the device is not attached to a datapath or reports no stats. */
1382 get_stats_via_vport__(const struct netdev *netdev, struct netdev_stats *stats)
1384 struct dpif_linux_vport reply;
1388 error = dpif_linux_vport_get(netdev_get_name(netdev), &reply, &buf);
1391 } else if (!reply.stats) {
1396 netdev_stats_from_ovs_vport_stats(stats, reply.stats);
     /* Wrapper that caches the vport-stats error status in
      * netdev->vport_stats_error (guarded by VALID_VPORT_STAT_ERROR) so a
      * device that is not a vport is not re-queried on every stats call. */
1404 get_stats_via_vport(const struct netdev *netdev_,
1405 struct netdev_stats *stats)
1407 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1409 if (!netdev->vport_stats_error ||
1410 !(netdev->cache_valid & VALID_VPORT_STAT_ERROR)) {
1413 error = get_stats_via_vport__(netdev_, stats);
     /* ENOENT just means the device is not attached to a datapath; that is
      * normal and not worth logging. */
1414 if (error && error != ENOENT) {
1415 VLOG_WARN_RL(&rl, "%s: obtaining netdev stats via vport failed "
1417 netdev_get_name(netdev_), ovs_strerror(error));
1419 netdev->vport_stats_error = error;
1420 netdev->cache_valid |= VALID_VPORT_STAT_ERROR;
     /* Fetches kernel (non-vport) stats for 'netdev_', choosing once per
      * process between rtnetlink and /proc based on a runtime probe. */
1425 netdev_linux_sys_get_stats(const struct netdev *netdev_,
1426 struct netdev_stats *stats)
1428 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
1429 static int use_netlink_stats;
1432 if (ovsthread_once_start(&once)) {
1433 use_netlink_stats = check_for_working_netlink_stats();
1434 ovsthread_once_done(&once);
1437 if (use_netlink_stats) {
1440 error = get_ifindex(netdev_, &ifindex);
1442 error = get_stats_via_netlink(ifindex, stats);
1445 error = get_stats_via_proc(netdev_get_name(netdev_), stats);
1449 VLOG_WARN_RL(&rl, "%s: linux-sys get stats failed %d",
1450 netdev_get_name(netdev_), error);
1456 /* Retrieves current device stats for 'netdev-linux'. */
1458 netdev_linux_get_stats(const struct netdev *netdev_,
1459 struct netdev_stats *stats)
1461 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1462 struct netdev_stats dev_stats;
1465 ovs_mutex_lock(&netdev->mutex);
     /* Prefer vport-layer stats; fall back to kernel stats when the vport
      * layer has no record of this device. */
1466 get_stats_via_vport(netdev_, stats);
1467 error = netdev_linux_sys_get_stats(netdev_, &dev_stats);
     /* NOTE(review): both branches test the same flag; the elided lines
      * between them presumably handle the sys-stats 'error' case — confirm
      * against the full source. */
1469 if (!netdev->vport_stats_error) {
1472 } else if (netdev->vport_stats_error) {
1473 /* stats not available from OVS then use ioctl stats. */
     /* Vport stats succeeded: fold the kernel's detailed error counters
      * (which the vport layer does not track) into the vport totals. */
1476 stats->rx_errors += dev_stats.rx_errors;
1477 stats->tx_errors += dev_stats.tx_errors;
1478 stats->rx_dropped += dev_stats.rx_dropped;
1479 stats->tx_dropped += dev_stats.tx_dropped;
1480 stats->multicast += dev_stats.multicast;
1481 stats->collisions += dev_stats.collisions;
1482 stats->rx_length_errors += dev_stats.rx_length_errors;
1483 stats->rx_over_errors += dev_stats.rx_over_errors;
1484 stats->rx_crc_errors += dev_stats.rx_crc_errors;
1485 stats->rx_frame_errors += dev_stats.rx_frame_errors;
1486 stats->rx_fifo_errors += dev_stats.rx_fifo_errors;
1487 stats->rx_missed_errors += dev_stats.rx_missed_errors;
1488 stats->tx_aborted_errors += dev_stats.tx_aborted_errors;
1489 stats->tx_carrier_errors += dev_stats.tx_carrier_errors;
1490 stats->tx_fifo_errors += dev_stats.tx_fifo_errors;
1491 stats->tx_heartbeat_errors += dev_stats.tx_heartbeat_errors;
1492 stats->tx_window_errors += dev_stats.tx_window_errors;
1494 ovs_mutex_unlock(&netdev->mutex);
1499 /* Retrieves current device stats for 'netdev-tap' netdev or
1500 * netdev-internal. */
1502 netdev_tap_get_stats(const struct netdev *netdev_, struct netdev_stats *stats)
1504 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1505 struct netdev_stats dev_stats;
1508 ovs_mutex_lock(&netdev->mutex);
1509 get_stats_via_vport(netdev_, stats);
1510 error = netdev_linux_sys_get_stats(netdev_, &dev_stats);
1512 if (!netdev->vport_stats_error) {
1515 } else if (netdev->vport_stats_error) {
1516 /* Transmit and receive stats will appear to be swapped relative to the
1517 * other ports since we are the one sending the data, not a remote
1518 * computer. For consistency, we swap them back here. This does not
1519 * apply if we are getting stats from the vport layer because it always
1520 * tracks stats from the perspective of the switch. */
1523 swap_uint64(&stats->rx_packets, &stats->tx_packets);
1524 swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
1525 swap_uint64(&stats->rx_errors, &stats->tx_errors);
1526 swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
     /* Detailed error counters are meaningless from the switch's viewpoint
      * for a tap device; zero them. */
1527 stats->rx_length_errors = 0;
1528 stats->rx_over_errors = 0;
1529 stats->rx_crc_errors = 0;
1530 stats->rx_frame_errors = 0;
1531 stats->rx_fifo_errors = 0;
1532 stats->rx_missed_errors = 0;
1533 stats->tx_aborted_errors = 0;
1534 stats->tx_carrier_errors = 0;
1535 stats->tx_fifo_errors = 0;
1536 stats->tx_heartbeat_errors = 0;
1537 stats->tx_window_errors = 0;
     /* Merge kernel counters with rx/tx swapped (see comment above). */
1539 stats->rx_dropped += dev_stats.tx_dropped;
1540 stats->tx_dropped += dev_stats.rx_dropped;
1542 stats->rx_errors += dev_stats.tx_errors;
1543 stats->tx_errors += dev_stats.rx_errors;
1545 stats->multicast += dev_stats.multicast;
1546 stats->collisions += dev_stats.collisions;
1548 ovs_mutex_unlock(&netdev->mutex);
     /* Retrieves stats for an internal device straight from the vport layer;
      * internal devices have no other meaningful stats source here. */
1554 netdev_internal_get_stats(const struct netdev *netdev_,
1555 struct netdev_stats *stats)
1557 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1560 ovs_mutex_lock(&netdev->mutex);
1561 get_stats_via_vport(netdev_, stats);
1562 error = netdev->vport_stats_error;
1563 ovs_mutex_unlock(&netdev->mutex);
     /* Pushes 'stats' down into the vport layer for 'netdev' via an
      * OVS_VPORT_CMD_SET Netlink transaction. */
1569 netdev_internal_set_stats(struct netdev *netdev,
1570 const struct netdev_stats *stats)
1572 struct ovs_vport_stats vport_stats;
1573 struct dpif_linux_vport vport;
1576 vport_stats.rx_packets = stats->rx_packets;
1577 vport_stats.tx_packets = stats->tx_packets;
1578 vport_stats.rx_bytes = stats->rx_bytes;
1579 vport_stats.tx_bytes = stats->tx_bytes;
1580 vport_stats.rx_errors = stats->rx_errors;
1581 vport_stats.tx_errors = stats->tx_errors;
1582 vport_stats.rx_dropped = stats->rx_dropped;
1583 vport_stats.tx_dropped = stats->tx_dropped;
1585 dpif_linux_vport_init(&vport);
1586 vport.cmd = OVS_VPORT_CMD_SET;
1587 vport.name = netdev_get_name(netdev);
1588 vport.stats = &vport_stats;
1590 err = dpif_linux_vport_transact(&vport, NULL, NULL);
1592 /* If the vport layer doesn't know about the device, that doesn't mean it
1593 * doesn't exist (after all were able to open it when netdev_open() was
1594 * called), it just means that it isn't attached and we'll be getting
1595 * stats a different way. */
1596 if (err == ENODEV) {
     /* Populates netdev->supported/advertised/current feature bitmaps from an
      * ETHTOOL_GSET query, caching the result under VALID_FEATURES. */
1604 netdev_linux_read_features(struct netdev_linux *netdev)
1606 struct ethtool_cmd ecmd;
     /* Cached features (or a cached failure) short-circuit the ethtool call. */
1610 if (netdev->cache_valid & VALID_FEATURES) {
1614 COVERAGE_INC(netdev_get_ethtool);
1615 memset(&ecmd, 0, sizeof ecmd);
1616 error = netdev_linux_do_ethtool(netdev->up.name, &ecmd,
1617 ETHTOOL_GSET, "ETHTOOL_GSET");
1622 /* Supported features. */
1623 netdev->supported = 0;
1624 if (ecmd.supported & SUPPORTED_10baseT_Half) {
1625 netdev->supported |= NETDEV_F_10MB_HD;
1627 if (ecmd.supported & SUPPORTED_10baseT_Full) {
1628 netdev->supported |= NETDEV_F_10MB_FD;
1630 if (ecmd.supported & SUPPORTED_100baseT_Half) {
1631 netdev->supported |= NETDEV_F_100MB_HD;
1633 if (ecmd.supported & SUPPORTED_100baseT_Full) {
1634 netdev->supported |= NETDEV_F_100MB_FD;
1636 if (ecmd.supported & SUPPORTED_1000baseT_Half) {
1637 netdev->supported |= NETDEV_F_1GB_HD;
1639 if (ecmd.supported & SUPPORTED_1000baseT_Full) {
1640 netdev->supported |= NETDEV_F_1GB_FD;
1642 if (ecmd.supported & SUPPORTED_10000baseT_Full) {
1643 netdev->supported |= NETDEV_F_10GB_FD;
1645 if (ecmd.supported & SUPPORTED_TP) {
1646 netdev->supported |= NETDEV_F_COPPER;
1648 if (ecmd.supported & SUPPORTED_FIBRE) {
1649 netdev->supported |= NETDEV_F_FIBER;
1651 if (ecmd.supported & SUPPORTED_Autoneg) {
1652 netdev->supported |= NETDEV_F_AUTONEG;
1654 if (ecmd.supported & SUPPORTED_Pause) {
1655 netdev->supported |= NETDEV_F_PAUSE;
1657 if (ecmd.supported & SUPPORTED_Asym_Pause) {
1658 netdev->supported |= NETDEV_F_PAUSE_ASYM;
1661 /* Advertised features. */
1662 netdev->advertised = 0;
1663 if (ecmd.advertising & ADVERTISED_10baseT_Half) {
1664 netdev->advertised |= NETDEV_F_10MB_HD;
1666 if (ecmd.advertising & ADVERTISED_10baseT_Full) {
1667 netdev->advertised |= NETDEV_F_10MB_FD;
1669 if (ecmd.advertising & ADVERTISED_100baseT_Half) {
1670 netdev->advertised |= NETDEV_F_100MB_HD;
1672 if (ecmd.advertising & ADVERTISED_100baseT_Full) {
1673 netdev->advertised |= NETDEV_F_100MB_FD;
1675 if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
1676 netdev->advertised |= NETDEV_F_1GB_HD;
1678 if (ecmd.advertising & ADVERTISED_1000baseT_Full) {
1679 netdev->advertised |= NETDEV_F_1GB_FD;
1681 if (ecmd.advertising & ADVERTISED_10000baseT_Full) {
1682 netdev->advertised |= NETDEV_F_10GB_FD;
1684 if (ecmd.advertising & ADVERTISED_TP) {
1685 netdev->advertised |= NETDEV_F_COPPER;
1687 if (ecmd.advertising & ADVERTISED_FIBRE) {
1688 netdev->advertised |= NETDEV_F_FIBER;
1690 if (ecmd.advertising & ADVERTISED_Autoneg) {
1691 netdev->advertised |= NETDEV_F_AUTONEG;
1693 if (ecmd.advertising & ADVERTISED_Pause) {
1694 netdev->advertised |= NETDEV_F_PAUSE;
1696 if (ecmd.advertising & ADVERTISED_Asym_Pause) {
1697 netdev->advertised |= NETDEV_F_PAUSE_ASYM;
1700 /* Current settings. */
1702 if (speed == SPEED_10) {
1703 netdev->current = ecmd.duplex ? NETDEV_F_10MB_FD : NETDEV_F_10MB_HD;
1704 } else if (speed == SPEED_100) {
1705 netdev->current = ecmd.duplex ? NETDEV_F_100MB_FD : NETDEV_F_100MB_HD;
1706 } else if (speed == SPEED_1000) {
1707 netdev->current = ecmd.duplex ? NETDEV_F_1GB_FD : NETDEV_F_1GB_HD;
1708 } else if (speed == SPEED_10000) {
1709 netdev->current = NETDEV_F_10GB_FD;
     /* NOTE(review): numeric literals presumably because SPEED_40000 and
      * higher constants are missing from older kernel headers — confirm. */
1710 } else if (speed == 40000) {
1711 netdev->current = NETDEV_F_40GB_FD;
1712 } else if (speed == 100000) {
1713 netdev->current = NETDEV_F_100GB_FD;
1714 } else if (speed == 1000000) {
1715 netdev->current = NETDEV_F_1TB_FD;
1717 netdev->current = 0;
1720 if (ecmd.port == PORT_TP) {
1721 netdev->current |= NETDEV_F_COPPER;
1722 } else if (ecmd.port == PORT_FIBRE) {
1723 netdev->current |= NETDEV_F_FIBER;
1727 netdev->current |= NETDEV_F_AUTONEG;
     /* Cache even a failed result so callers see a stable error. */
1731 netdev->cache_valid |= VALID_FEATURES;
1732 netdev->get_features_error = error;
1735 /* Stores the features supported by 'netdev' into of '*current', '*advertised',
1736 * '*supported', and '*peer'. Each value is a bitmap of NETDEV_* bits.
1737 * Returns 0 if successful, otherwise a positive errno value. */
1739 netdev_linux_get_features(const struct netdev *netdev_,
1740 enum netdev_features *current,
1741 enum netdev_features *advertised,
1742 enum netdev_features *supported,
1743 enum netdev_features *peer)
1745 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1748 ovs_mutex_lock(&netdev->mutex);
     /* Fills (or refreshes) the cached feature bitmaps via ethtool. */
1749 netdev_linux_read_features(netdev);
1750 if (!netdev->get_features_error) {
1751 *current = netdev->current;
1752 *advertised = netdev->advertised;
1753 *supported = netdev->supported;
     /* Peer features are not obtainable through this interface. */
1754 *peer = 0; /* XXX */
1756 error = netdev->get_features_error;
1757 ovs_mutex_unlock(&netdev->mutex);
1762 /* Set the features advertised by 'netdev' to 'advertise'. */
1764 netdev_linux_set_advertisements(struct netdev *netdev_,
1765 enum netdev_features advertise)
1767 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1768 struct ethtool_cmd ecmd;
1771 ovs_mutex_lock(&netdev->mutex);
     /* Read-modify-write: fetch the current ethtool settings first so the
      * subsequent ETHTOOL_SSET only changes the advertising mask. */
1773 COVERAGE_INC(netdev_get_ethtool);
1774 memset(&ecmd, 0, sizeof ecmd);
1775 error = netdev_linux_do_ethtool(netdev_get_name(netdev_), &ecmd,
1776 ETHTOOL_GSET, "ETHTOOL_GSET");
     /* Translate NETDEV_F_* bits into the kernel's ADVERTISED_* bits. */
1781 ecmd.advertising = 0;
1782 if (advertise & NETDEV_F_10MB_HD) {
1783 ecmd.advertising |= ADVERTISED_10baseT_Half;
1785 if (advertise & NETDEV_F_10MB_FD) {
1786 ecmd.advertising |= ADVERTISED_10baseT_Full;
1788 if (advertise & NETDEV_F_100MB_HD) {
1789 ecmd.advertising |= ADVERTISED_100baseT_Half;
1791 if (advertise & NETDEV_F_100MB_FD) {
1792 ecmd.advertising |= ADVERTISED_100baseT_Full;
1794 if (advertise & NETDEV_F_1GB_HD) {
1795 ecmd.advertising |= ADVERTISED_1000baseT_Half;
1797 if (advertise & NETDEV_F_1GB_FD) {
1798 ecmd.advertising |= ADVERTISED_1000baseT_Full;
1800 if (advertise & NETDEV_F_10GB_FD) {
1801 ecmd.advertising |= ADVERTISED_10000baseT_Full;
1803 if (advertise & NETDEV_F_COPPER) {
1804 ecmd.advertising |= ADVERTISED_TP;
1806 if (advertise & NETDEV_F_FIBER) {
1807 ecmd.advertising |= ADVERTISED_FIBRE;
1809 if (advertise & NETDEV_F_AUTONEG) {
1810 ecmd.advertising |= ADVERTISED_Autoneg;
1812 if (advertise & NETDEV_F_PAUSE) {
1813 ecmd.advertising |= ADVERTISED_Pause;
1815 if (advertise & NETDEV_F_PAUSE_ASYM) {
1816 ecmd.advertising |= ADVERTISED_Asym_Pause;
1818 COVERAGE_INC(netdev_set_ethtool);
1819 error = netdev_linux_do_ethtool(netdev_get_name(netdev_), &ecmd,
1820 ETHTOOL_SSET, "ETHTOOL_SSET");
1823 ovs_mutex_unlock(&netdev->mutex);
1827 /* Attempts to set input rate limiting (policing) policy. Returns 0 if
1828 * successful, otherwise a positive errno value. */
1830 netdev_linux_set_policing(struct netdev *netdev_,
1831 uint32_t kbits_rate, uint32_t kbits_burst)
1833 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1834 const char *netdev_name = netdev_get_name(netdev_);
1837 kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
1838 : !kbits_burst ? 1000 /* Default to 1000 kbits if 0. */
1839 : kbits_burst); /* Stick with user-specified value. */
1841 ovs_mutex_lock(&netdev->mutex);
     /* Skip the tc operations entirely if the requested rate/burst already
      * match the cached settings (or a cached failure). */
1842 if (netdev->cache_valid & VALID_POLICING) {
1843 error = netdev->netdev_policing_error;
1844 if (error || (netdev->kbits_rate == kbits_rate &&
1845 netdev->kbits_burst == kbits_burst)) {
1846 /* Assume that settings haven't changed since we last set them. */
1849 netdev->cache_valid &= ~VALID_POLICING;
1852 COVERAGE_INC(netdev_set_policing);
1853 /* Remove any existing ingress qdisc. */
1854 error = tc_add_del_ingress_qdisc(netdev_, false);
1856 VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
1857 netdev_name, ovs_strerror(error));
     /* Install a fresh ingress qdisc, then attach the policer to it. */
1862 error = tc_add_del_ingress_qdisc(netdev_, true);
1864 VLOG_WARN_RL(&rl, "%s: adding policing qdisc failed: %s",
1865 netdev_name, ovs_strerror(error));
1869 error = tc_add_policer(netdev_, kbits_rate, kbits_burst);
1871 VLOG_WARN_RL(&rl, "%s: adding policing action failed: %s",
1872 netdev_name, ovs_strerror(error));
1877 netdev->kbits_rate = kbits_rate;
1878 netdev->kbits_burst = kbits_burst;
     /* Cache success or ENODEV so a removed device is not retried forever. */
1881 if (!error || error == ENODEV) {
1882 netdev->netdev_policing_error = error;
1883 netdev->cache_valid |= VALID_POLICING;
1885 ovs_mutex_unlock(&netdev->mutex);
     /* Adds the OVS name of every installable tc implementation to 'types'. */
1890 netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
1893 const struct tc_ops *const *opsp;
1895 for (opsp = tcs; *opsp != NULL; opsp++) {
1896 const struct tc_ops *ops = *opsp;
     /* Internal-only tc types have an empty ovs_name; hide them. */
1897 if (ops->tc_install && ops->ovs_name[0] != '\0') {
1898 sset_add(types, ops->ovs_name);
     /* Finds the tc_ops with the given OVS-visible name, if any. */
1904 static const struct tc_ops *
1905 tc_lookup_ovs_name(const char *name)
1907 const struct tc_ops *const *opsp;
1909 for (opsp = tcs; *opsp != NULL; opsp++) {
1910 const struct tc_ops *ops = *opsp;
1911 if (!strcmp(name, ops->ovs_name)) {
     /* Finds the tc_ops with the given Linux qdisc name, if any.  linux_name
      * may be NULL for types with no kernel counterpart, hence the check. */
1918 static const struct tc_ops *
1919 tc_lookup_linux_name(const char *name)
1921 const struct tc_ops *const *opsp;
1923 for (opsp = tcs; *opsp != NULL; opsp++) {
1924 const struct tc_ops *ops = *opsp;
1925 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
     /* Looks up 'queue_id' in netdev->tc->queues using precomputed 'hash'. */
1932 static struct tc_queue *
1933 tc_find_queue__(const struct netdev *netdev_, unsigned int queue_id,
1936 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1937 struct tc_queue *queue;
1939 HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev->tc->queues) {
1940 if (queue->queue_id == queue_id) {
     /* Convenience wrapper that hashes 'queue_id' itself. */
1947 static struct tc_queue *
1948 tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
1950 return tc_find_queue__(netdev, queue_id, hash_int(queue_id, 0));
     /* Reports capabilities (queue count) of QoS type 'type', looked up by its
      * OVS name. */
1954 netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
1956 struct netdev_qos_capabilities *caps)
1958 const struct tc_ops *ops = tc_lookup_ovs_name(type);
1962 caps->n_queues = ops->n_queues;
     /* Reports the current QoS type ('*typep') and its configuration
      * ('details') after syncing our view of the kernel qdisc. */
1967 netdev_linux_get_qos(const struct netdev *netdev_,
1968 const char **typep, struct smap *details)
1970 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1973 ovs_mutex_lock(&netdev->mutex);
1974 error = tc_query_qdisc(netdev_);
1976 *typep = netdev->tc->ops->ovs_name;
     /* Types without per-qdisc configuration have no qdisc_get hook. */
1977 error = (netdev->tc->ops->qdisc_get
1978 ? netdev->tc->ops->qdisc_get(netdev_, details)
1981 ovs_mutex_unlock(&netdev->mutex);
     /* Replaces the QoS configuration: reconfigure in place when the type is
      * unchanged, otherwise delete the old qdisc and install the new one. */
1987 netdev_linux_set_qos(struct netdev *netdev_,
1988 const char *type, const struct smap *details)
1990 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1991 const struct tc_ops *new_ops;
1994 new_ops = tc_lookup_ovs_name(type);
1995 if (!new_ops || !new_ops->tc_install) {
1999 ovs_mutex_lock(&netdev->mutex);
2000 error = tc_query_qdisc(netdev_);
2005 if (new_ops == netdev->tc->ops) {
2006 error = new_ops->qdisc_set ? new_ops->qdisc_set(netdev_, details) : 0;
2008 /* Delete existing qdisc. */
2009 error = tc_del_qdisc(netdev_);
2013 ovs_assert(netdev->tc == NULL);
2015 /* Install new qdisc. */
2016 error = new_ops->tc_install(netdev_, details);
     /* tc_install must set netdev->tc exactly on success. */
2017 ovs_assert((error == 0) == (netdev->tc != NULL));
2021 ovs_mutex_unlock(&netdev->mutex);
     /* Fetches the configuration of queue 'queue_id' into 'details'. */
2026 netdev_linux_get_queue(const struct netdev *netdev_,
2027 unsigned int queue_id, struct smap *details)
2029 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2032 ovs_mutex_lock(&netdev->mutex);
2033 error = tc_query_qdisc(netdev_);
2035 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2037 ? netdev->tc->ops->class_get(netdev_, queue, details)
2040 ovs_mutex_unlock(&netdev->mutex);
     /* Creates or reconfigures queue 'queue_id' with 'details'.  The current
      * tc type must support classes and the id must be within its range. */
2046 netdev_linux_set_queue(struct netdev *netdev_,
2047 unsigned int queue_id, const struct smap *details)
2049 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2052 ovs_mutex_lock(&netdev->mutex);
2053 error = tc_query_qdisc(netdev_);
2055 error = (queue_id < netdev->tc->ops->n_queues
2056 && netdev->tc->ops->class_set
2057 ? netdev->tc->ops->class_set(netdev_, queue_id, details)
2060 ovs_mutex_unlock(&netdev->mutex);
     /* Deletes queue 'queue_id' if the tc type supports deletion and the
      * queue exists. */
2066 netdev_linux_delete_queue(struct netdev *netdev_, unsigned int queue_id)
2068 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2071 ovs_mutex_lock(&netdev->mutex);
2072 error = tc_query_qdisc(netdev_);
2074 if (netdev->tc->ops->class_delete) {
2075 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2077 ? netdev->tc->ops->class_delete(netdev_, queue)
2083 ovs_mutex_unlock(&netdev->mutex);
     /* Retrieves statistics for queue 'queue_id' into '*stats'. */
2089 netdev_linux_get_queue_stats(const struct netdev *netdev_,
2090 unsigned int queue_id,
2091 struct netdev_queue_stats *stats)
2093 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2096 ovs_mutex_lock(&netdev->mutex);
2097 error = tc_query_qdisc(netdev_);
2099 if (netdev->tc->ops->class_get_stats) {
2100 const struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
     /* Creation time is tracked locally, not by the kernel. */
2102 stats->created = queue->created;
2103 error = netdev->tc->ops->class_get_stats(netdev_, queue,
2112 ovs_mutex_unlock(&netdev->mutex);
     /* Begins an RTM_GETTCLASS Netlink dump of all traffic classes on
      * 'netdev', initializing '*dump' for iteration with nl_dump_next(). */
2118 start_queue_dump(const struct netdev *netdev, struct nl_dump *dump)
2120 struct ofpbuf request;
2121 struct tcmsg *tcmsg;
2123 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
     /* tcm_parent == 0 requests classes under every parent. */
2127 tcmsg->tcm_parent = 0;
2128 nl_dump_start(dump, NETLINK_ROUTE, &request);
2129 ofpbuf_uninit(&request);
     /* Iterator state for queue dumps: a snapshot of queue ids taken under
      * the device mutex, walked incrementally by queue_dump_next(). */
2133 struct netdev_linux_queue_state {
2134 unsigned int *queues;
     /* Starts a queue dump: snapshots all queue ids into a heap-allocated
      * state object stored in '*statep' (freed by queue_dump_done()). */
2140 netdev_linux_queue_dump_start(const struct netdev *netdev_, void **statep)
2142 const struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2145 ovs_mutex_lock(&netdev->mutex);
2146 error = tc_query_qdisc(netdev_);
2148 if (netdev->tc->ops->class_get) {
2149 struct netdev_linux_queue_state *state;
2150 struct tc_queue *queue;
2153 *statep = state = xmalloc(sizeof *state);
2154 state->n_queues = hmap_count(&netdev->tc->queues);
2155 state->cur_queue = 0;
2156 state->queues = xmalloc(state->n_queues * sizeof *state->queues);
2159 HMAP_FOR_EACH (queue, hmap_node, &netdev->tc->queues) {
2160 state->queues[i++] = queue->queue_id;
2166 ovs_mutex_unlock(&netdev->mutex);
     /* Advances the dump: returns the next queue id and its details.  Queues
      * deleted since dump_start() are silently skipped (tc_find_queue() will
      * not find them). */
2172 netdev_linux_queue_dump_next(const struct netdev *netdev_, void *state_,
2173 unsigned int *queue_idp, struct smap *details)
2175 const struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2176 struct netdev_linux_queue_state *state = state_;
2179 ovs_mutex_lock(&netdev->mutex);
2180 while (state->cur_queue < state->n_queues) {
2181 unsigned int queue_id = state->queues[state->cur_queue++];
2182 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2185 *queue_idp = queue_id;
2186 error = netdev->tc->ops->class_get(netdev_, queue, details);
2190 ovs_mutex_unlock(&netdev->mutex);
     /* Releases the state allocated by queue_dump_start(). */
2196 netdev_linux_queue_dump_done(const struct netdev *netdev OVS_UNUSED,
2199 struct netdev_linux_queue_state *state = state_;
2201 free(state->queues);
     /* Invokes 'cb' with statistics for each queue, driven by a Netlink
      * RTM_GETTCLASS dump parsed by the tc type's class_dump_stats hook. */
2207 netdev_linux_dump_queue_stats(const struct netdev *netdev_,
2208 netdev_dump_queue_stats_cb *cb, void *aux)
2210 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2213 ovs_mutex_lock(&netdev->mutex);
2214 error = tc_query_qdisc(netdev_);
2216 struct nl_dump dump;
2218 if (!netdev->tc->ops->class_dump_stats) {
2220 } else if (!start_queue_dump(netdev_, &dump)) {
2226 while (nl_dump_next(&dump, &msg)) {
2227 retval = netdev->tc->ops->class_dump_stats(netdev_, &msg,
     /* nl_dump_done() reports any error from the dump as a whole. */
2234 retval = nl_dump_done(&dump);
2240 ovs_mutex_unlock(&netdev->mutex);
     /* Retrieves the assigned IPv4 address and netmask, caching them under
      * VALID_IN4.  Returns EADDRNOTAVAIL when no address is configured. */
2246 netdev_linux_get_in4(const struct netdev *netdev_,
2247 struct in_addr *address, struct in_addr *netmask)
2249 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2252 ovs_mutex_lock(&netdev->mutex);
2253 if (!(netdev->cache_valid & VALID_IN4)) {
2254 error = netdev_linux_get_ipv4(netdev_, &netdev->address,
2255 SIOCGIFADDR, "SIOCGIFADDR");
2257 error = netdev_linux_get_ipv4(netdev_, &netdev->netmask,
2258 SIOCGIFNETMASK, "SIOCGIFNETMASK");
2260 netdev->cache_valid |= VALID_IN4;
     /* INADDR_ANY in the cache means "no address assigned". */
2268 if (netdev->address.s_addr != INADDR_ANY) {
2269 *address = netdev->address;
2270 *netmask = netdev->netmask;
2272 error = EADDRNOTAVAIL;
2275 ovs_mutex_unlock(&netdev->mutex);
     /* Assigns 'address'/'netmask' to the device via SIOCSIFADDR and
      * SIOCSIFNETMASK, updating the in-memory cache on success. */
2281 netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
2282 struct in_addr netmask)
2284 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2287 ovs_mutex_lock(&netdev->mutex);
2288 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
2290 netdev->cache_valid |= VALID_IN4;
2291 netdev->address = address;
2292 netdev->netmask = netmask;
     /* Setting a netmask is meaningless when clearing the address. */
2293 if (address.s_addr != INADDR_ANY) {
2294 error = do_set_addr(netdev_, SIOCSIFNETMASK,
2295 "SIOCSIFNETMASK", netmask);
2298 ovs_mutex_unlock(&netdev->mutex);
     /* Parses one line of /proc/net/if_inet6: 32 hex digits of address (read
      * as 16 byte-sized fields), four ignored hex fields, then the interface
      * name.  Returns true on a full match. */
2304 parse_if_inet6_line(const char *line,
2305 struct in6_addr *in6, char ifname[16 + 1])
2307 uint8_t *s6 = in6->s6_addr;
2308 #define X8 "%2"SCNx8
2309 return ovs_scan(line,
2310 " "X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8
2311 "%*x %*x %*x %*x %16s\n",
2312 &s6[0], &s6[1], &s6[2], &s6[3],
2313 &s6[4], &s6[5], &s6[6], &s6[7],
2314 &s6[8], &s6[9], &s6[10], &s6[11],
2315 &s6[12], &s6[13], &s6[14], &s6[15],
2319 /* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address (if
2320 * 'in6' is non-null) and returns true. Otherwise, returns false. */
2322 netdev_linux_get_in6(const struct netdev *netdev_, struct in6_addr *in6)
2324 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2326 ovs_mutex_lock(&netdev->mutex);
2327 if (!(netdev->cache_valid & VALID_IN6)) {
     /* Default to :: ("no address") and scan /proc/net/if_inet6 for a line
      * matching this device's name. */
2331 netdev->in6 = in6addr_any;
2333 file = fopen("/proc/net/if_inet6", "r");
2335 const char *name = netdev_get_name(netdev_);
2336 while (fgets(line, sizeof line, file)) {
2337 struct in6_addr in6_tmp;
2338 char ifname[16 + 1];
2339 if (parse_if_inet6_line(line, &in6_tmp, ifname)
2340 && !strcmp(name, ifname))
2342 netdev->in6 = in6_tmp;
2348 netdev->cache_valid |= VALID_IN6;
2351 ovs_mutex_unlock(&netdev->mutex);
/* Writes an AF_INET sockaddr for 'addr' into '*sa'.  The whole of '*sa' is
 * zeroed first, so the port and padding come out as zero. */
static void
make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
{
    struct sockaddr_in sin = {
        .sin_family = AF_INET,
        .sin_addr = addr,
    };

    memset(sa, 0, sizeof *sa);
    memcpy(sa, &sin, sizeof sin);
}
     /* Issues an address-setting ioctl ('ioctl_nr') with 'addr' packed into
      * ifr.ifr_addr as an AF_INET sockaddr. */
2370 do_set_addr(struct netdev *netdev,
2371 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
2375 make_in4_sockaddr(&ifr.ifr_addr, addr);
2376 return af_inet_ifreq_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
2380 /* Adds 'router' as a default IP gateway. */
2382 netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
2384 struct in_addr any = { INADDR_ANY };
2388 memset(&rt, 0, sizeof rt);
     /* Destination and mask of 0.0.0.0 make this the default route. */
2389 make_in4_sockaddr(&rt.rt_dst, any);
2390 make_in4_sockaddr(&rt.rt_gateway, router);
2391 make_in4_sockaddr(&rt.rt_genmask, any);
2392 rt.rt_flags = RTF_UP | RTF_GATEWAY;
2393 error = af_inet_ioctl(SIOCADDRT, &rt);
2395 VLOG_WARN("ioctl(SIOCADDRT): %s", ovs_strerror(error));
     /* Determines the next hop toward 'host' by scanning /proc/net/route.
      * On success fills '*next_hop' (0.0.0.0 when directly reachable) and
      * '*netdev_name' (heap-allocated; caller frees). */
2401 netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
2404 static const char fn[] = "/proc/net/route";
2409 *netdev_name = NULL;
2410 stream = fopen(fn, "r");
2411 if (stream == NULL) {
2412 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, ovs_strerror(errno));
2417 while (fgets(line, sizeof line, stream)) {
2420 ovs_be32 dest, gateway, mask;
2421 int refcnt, metric, mtu;
2422 unsigned int flags, use, window, irtt;
2425 "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
2427 iface, &dest, &gateway, &flags, &refcnt,
2428 &use, &metric, &mask, &mtu, &window, &irtt)) {
2429 VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
2433 if (!(flags & RTF_UP)) {
2434 /* Skip routes that aren't up. */
2438 /* The output of 'dest', 'mask', and 'gateway' were given in
2439 * network byte order, so we don't need need any endian
2440 * conversions here. */
2441 if ((dest & mask) == (host->s_addr & mask)) {
2443 /* The host is directly reachable. */
2444 next_hop->s_addr = 0;
2446 /* To reach the host, we must go through a gateway. */
2447 next_hop->s_addr = gateway;
2449 *netdev_name = xstrdup(iface);
     /* Adds driver name/version and firmware version (from a cached
      * ETHTOOL_GDRVINFO query) to 'smap'. */
2461 netdev_linux_get_status(const struct netdev *netdev_, struct smap *smap)
2463 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2466 ovs_mutex_lock(&netdev->mutex);
2467 if (!(netdev->cache_valid & VALID_DRVINFO)) {
     /* netdev_linux_do_ethtool() takes a struct ethtool_cmd *, so view the
      * drvinfo buffer through that type for the call. */
2468 struct ethtool_cmd *cmd = (struct ethtool_cmd *) &netdev->drvinfo;
2470 COVERAGE_INC(netdev_get_ethtool);
2471 memset(&netdev->drvinfo, 0, sizeof netdev->drvinfo);
2472 error = netdev_linux_do_ethtool(netdev->up.name,
2475 "ETHTOOL_GDRVINFO");
2477 netdev->cache_valid |= VALID_DRVINFO;
2482 smap_add(smap, "driver_name", netdev->drvinfo.driver);
2483 smap_add(smap, "driver_version", netdev->drvinfo.version);
2484 smap_add(smap, "firmware_version", netdev->drvinfo.fw_version);
2486 ovs_mutex_unlock(&netdev->mutex);
     /* Internal devices have no hardware driver; report a fixed identity. */
2492 netdev_internal_get_status(const struct netdev *netdev OVS_UNUSED,
2495 smap_add(smap, "driver_name", "openvswitch");
2499 /* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
2500 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
2501 * returns 0. Otherwise, it returns a positive errno value; in particular,
2502 * ENXIO indicates that there is not ARP table entry for 'ip' on 'netdev'. */
2504 netdev_linux_arp_lookup(const struct netdev *netdev,
2505 ovs_be32 ip, uint8_t mac[ETH_ADDR_LEN])
2508 struct sockaddr_in sin;
2511 memset(&r, 0, sizeof r);
2512 memset(&sin, 0, sizeof sin);
2513 sin.sin_family = AF_INET;
2514 sin.sin_addr.s_addr = ip;
     /* arp_pa is a generic struct sockaddr; copy the sockaddr_in into it. */
2516 memcpy(&r.arp_pa, &sin, sizeof sin);
2517 r.arp_ha.sa_family = ARPHRD_ETHER;
2519 ovs_strzcpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
2520 COVERAGE_INC(netdev_arp_lookup);
2521 retval = af_inet_ioctl(SIOCGARP, &r);
2523 memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
     /* ENXIO = no entry, which is routine and not worth a warning. */
2524 } else if (retval != ENXIO) {
2525 VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
2526 netdev_get_name(netdev), IP_ARGS(ip),
2527 ovs_strerror(retval));
/* Converts netdev_flags 'nd' into the corresponding Linux IFF_* bit mask
 * (accumulated in 'iff', declared on a missing line). */
2533 nd_to_iff_flags(enum netdev_flags nd)
2536 if (nd & NETDEV_UP) {
2539 if (nd & NETDEV_PROMISC) {
2542 if (nd & NETDEV_LOOPBACK) {
2543 iff |= IFF_LOOPBACK;
/* Inverse of nd_to_iff_flags(): converts Linux IFF_* bits into
 * enum netdev_flags. */
2549 iff_to_nd_flags(int iff)
2551 enum netdev_flags nd = 0;
2555 if (iff & IFF_PROMISC) {
2556 nd |= NETDEV_PROMISC;
2558 if (iff & IFF_LOOPBACK) {
2559 nd |= NETDEV_LOOPBACK;
/* Clears the IFF bits in 'off' and sets those in 'on' on 'netdev', storing
 * the previous flags (as netdev_flags) in '*old_flagsp'.  Caller must hold
 * netdev->mutex (OVS_REQUIRES).  Skips the kernel call entirely when the
 * requested flags equal the cached ones. */
2565 update_flags(struct netdev_linux *netdev, enum netdev_flags off,
2566 enum netdev_flags on, enum netdev_flags *old_flagsp)
2567 OVS_REQUIRES(netdev->mutex)
2569 int old_flags, new_flags;
2572 old_flags = netdev->ifi_flags;
2573 *old_flagsp = iff_to_nd_flags(old_flags);
2574 new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
2575 if (new_flags != old_flags) {
2576 error = set_flags(netdev_get_name(&netdev->up), new_flags);
/* Re-read the flags so the cache reflects what the kernel actually set. */
2577 get_flags(&netdev->up, &netdev->ifi_flags);
/* netdev_class entry point: locking wrapper around update_flags(). */
2584 netdev_linux_update_flags(struct netdev *netdev_, enum netdev_flags off,
2585 enum netdev_flags on, enum netdev_flags *old_flagsp)
2587 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2590 ovs_mutex_lock(&netdev->mutex);
2591 error = update_flags(netdev, off, on, old_flagsp);
2592 ovs_mutex_unlock(&netdev->mutex);
/* Template that stamps out a 'struct netdev_class' initializer.  The macro
 * parameters fill in the operations that differ between the "system", "tap"
 * and "internal" device classes below; everything else is shared.
 * NOTE(review): fragmentary extract -- some initializer lines are missing;
 * code kept byte-identical (no comments may be inserted inside the macro's
 * backslash continuations). */
2597 #define NETDEV_LINUX_CLASS(NAME, CONSTRUCT, GET_STATS, SET_STATS, \
2598 GET_FEATURES, GET_STATUS) \
2604 netdev_linux_wait, \
2606 netdev_linux_alloc, \
2608 netdev_linux_destruct, \
2609 netdev_linux_dealloc, \
2610 NULL, /* get_config */ \
2611 NULL, /* set_config */ \
2612 NULL, /* get_tunnel_config */ \
2614 netdev_linux_send, \
2615 netdev_linux_send_wait, \
2617 netdev_linux_set_etheraddr, \
2618 netdev_linux_get_etheraddr, \
2619 netdev_linux_get_mtu, \
2620 netdev_linux_set_mtu, \
2621 netdev_linux_get_ifindex, \
2622 netdev_linux_get_carrier, \
2623 netdev_linux_get_carrier_resets, \
2624 netdev_linux_set_miimon_interval, \
2629 netdev_linux_set_advertisements, \
2631 netdev_linux_set_policing, \
2632 netdev_linux_get_qos_types, \
2633 netdev_linux_get_qos_capabilities, \
2634 netdev_linux_get_qos, \
2635 netdev_linux_set_qos, \
2636 netdev_linux_get_queue, \
2637 netdev_linux_set_queue, \
2638 netdev_linux_delete_queue, \
2639 netdev_linux_get_queue_stats, \
2640 netdev_linux_queue_dump_start, \
2641 netdev_linux_queue_dump_next, \
2642 netdev_linux_queue_dump_done, \
2643 netdev_linux_dump_queue_stats, \
2645 netdev_linux_get_in4, \
2646 netdev_linux_set_in4, \
2647 netdev_linux_get_in6, \
2648 netdev_linux_add_router, \
2649 netdev_linux_get_next_hop, \
2651 netdev_linux_arp_lookup, \
2653 netdev_linux_update_flags, \
2655 netdev_linux_rx_alloc, \
2656 netdev_linux_rx_construct, \
2657 netdev_linux_rx_destruct, \
2658 netdev_linux_rx_dealloc, \
2659 netdev_linux_rx_recv, \
2660 netdev_linux_rx_wait, \
2661 netdev_linux_rx_drain, \
/* The "system" netdev class: ordinary Linux network devices. */
2664 const struct netdev_class netdev_linux_class =
2667 netdev_linux_construct,
2668 netdev_linux_get_stats,
2669 NULL, /* set_stats */
2670 netdev_linux_get_features,
2671 netdev_linux_get_status);
/* The "tap" netdev class: same template, but with tap-specific construction
 * and stats retrieval. */
2673 const struct netdev_class netdev_tap_class =
2676 netdev_linux_construct_tap,
2677 netdev_tap_get_stats,
2678 NULL, /* set_stats */
2679 netdev_linux_get_features,
2680 netdev_linux_get_status);
/* The "internal" netdev class: OVS-internal ports; supports set_stats and
 * a fixed status, no ethtool features. */
2682 const struct netdev_class netdev_internal_class =
2685 netdev_linux_construct,
2686 netdev_internal_get_stats,
2687 netdev_internal_set_stats,
2688 NULL, /* get_features */
2689 netdev_internal_get_status);
2691 /* HTB traffic control class. */
2693 #define HTB_N_QUEUES 0xf000
/* NOTE(review): the 'struct htb {' and 'struct htb_class {' header lines are
 * missing from this extract.  'max_rate' below belongs to struct htb; the
 * remaining members belong to struct htb_class, which embeds a tc_queue so
 * it can live in the generic tc queue hmap. */
2697 unsigned int max_rate; /* In bytes/s. */
2701 struct tc_queue tc_queue;
2702 unsigned int min_rate; /* In bytes/s. */
2703 unsigned int max_rate; /* In bytes/s. */
2704 unsigned int burst; /* In bytes. */
2705 unsigned int priority; /* Lower values are higher priorities. */
/* Returns the struct htb embedded in 'netdev_''s current tc.  Only valid
 * while the device's qdisc is linux-htb. */
2709 htb_get__(const struct netdev *netdev_)
2711 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2712 return CONTAINER_OF(netdev->tc, struct htb, tc);
/* Allocates a struct htb with qdisc-level 'max_rate' and installs it as
 * 'netdev_''s tc state.  Ownership of the allocation passes to the netdev
 * (freed later by htb_tc_destroy()). */
2716 htb_install__(struct netdev *netdev_, uint64_t max_rate)
2718 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2721 htb = xmalloc(sizeof *htb);
2722 tc_init(&htb->tc, &tc_ops_htb);
2723 htb->max_rate = max_rate;
2725 netdev->tc = &htb->tc;
2728 /* Create an HTB qdisc.
2730 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
2732 htb_setup_qdisc__(struct netdev *netdev)
2735 struct tc_htb_glob opt;
2736 struct ofpbuf request;
2737 struct tcmsg *tcmsg;
/* Remove any existing root qdisc first so the add cannot conflict. */
2739 tc_del_qdisc(netdev);
2741 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2742 NLM_F_EXCL | NLM_F_CREATE, &request);
2746 tcmsg->tcm_handle = tc_make_handle(1, 0);
2747 tcmsg->tcm_parent = TC_H_ROOT;
2749 nl_msg_put_string(&request, TCA_KIND, "htb");
2751 memset(&opt, 0, sizeof opt);
2752 opt.rate2quantum = 10;
/* The HTB global options travel in a nested TCA_OPTIONS attribute. */
2756 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2757 nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
2758 nl_msg_end_nested(&request, opt_offset);
2760 return tc_transact(&request, NULL);
2763 /* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
2764 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
2766 htb_setup_class__(struct netdev *netdev, unsigned int handle,
2767 unsigned int parent, struct htb_class *class)
2770 struct tc_htb_opt opt;
2771 struct ofpbuf request;
2772 struct tcmsg *tcmsg;
/* The MTU is needed to size the HTB rate tables and buffers. */
2776 error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
2778 VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU",
2779 netdev_get_name(netdev));
2783 memset(&opt, 0, sizeof opt);
2784 tc_fill_rate(&opt.rate, class->min_rate, mtu);
2785 tc_fill_rate(&opt.ceil, class->max_rate, mtu);
2786 opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
2787 opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
2788 opt.prio = class->priority;
2790 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
2794 tcmsg->tcm_handle = handle;
2795 tcmsg->tcm_parent = parent;
/* HTB parameters plus the rate/ceil lookup tables go in nested
 * TCA_OPTIONS. */
2797 nl_msg_put_string(&request, TCA_KIND, "htb");
2798 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2799 nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
2800 tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
2801 tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
2802 nl_msg_end_nested(&request, opt_offset);
2804 error = tc_transact(&request, NULL);
2806 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
2807 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
2808 netdev_get_name(netdev),
2809 tc_get_major(handle), tc_get_minor(handle),
2810 tc_get_major(parent), tc_get_minor(parent),
2811 class->min_rate, class->max_rate,
2812 class->burst, class->priority, ovs_strerror(error));
2817 /* Parses Netlink attributes in 'options' for HTB parameters and stores a
2818 * description of them into 'details'. The description complies with the
2819 * specification given in the vswitch database documentation for linux-htb
/* (continuation of the comment above is missing from this extract). */
2822 htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
2824 static const struct nl_policy tca_htb_policy[] = {
2825 [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
2826 .min_len = sizeof(struct tc_htb_opt) },
2829 struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
2830 const struct tc_htb_opt *htb;
2832 if (!nl_parse_nested(nl_options, tca_htb_policy,
2833 attrs, ARRAY_SIZE(tca_htb_policy))) {
2834 VLOG_WARN_RL(&rl, "failed to parse HTB class options");
2838 htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
2839 class->min_rate = htb->rate.rate;
2840 class->max_rate = htb->ceil.rate;
/* Kernel reports the buffer in ticks; convert back to bytes. */
2841 class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
2842 class->priority = htb->prio;
/* Parses a tc class message in 'tcmsg'.  On success optionally stores the
 * OVS queue id (handle minor - 1) in '*queue_id', the HTB parameters in
 * '*options', and statistics in '*stats'. */
2847 htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2848 struct htb_class *options,
2849 struct netdev_queue_stats *stats)
2851 struct nlattr *nl_options;
2852 unsigned int handle;
2855 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2856 if (!error && queue_id) {
2857 unsigned int major = tc_get_major(handle);
2858 unsigned int minor = tc_get_minor(handle);
/* Only classes 1:1 .. 1:HTB_N_QUEUES map to OVS queues. */
2859 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2860 *queue_id = minor - 1;
2865 if (!error && options) {
2866 error = htb_parse_tca_options__(nl_options, options);
/* Extracts the qdisc-level "max-rate" (bits/s in the database, stored here
 * as bytes/s) from 'details' into 'hc'.  Falls back to the link speed, or
 * 100 Mbps if that is unavailable, when no max-rate is configured. */
2872 htb_parse_qdisc_details__(struct netdev *netdev_,
2873 const struct smap *details, struct htb_class *hc)
2875 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2876 const char *max_rate_s;
2878 max_rate_s = smap_get(details, "max-rate");
2879 hc->max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
2880 if (!hc->max_rate) {
2881 enum netdev_features current;
2883 netdev_linux_read_features(netdev);
2884 current = !netdev->get_features_error ? netdev->current : 0;
2885 hc->max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
2887 hc->min_rate = hc->max_rate;
/* Parses per-queue HTB settings ("min-rate", "max-rate", "burst",
 * "priority") from 'details' into 'hc', clamping them into the ranges the
 * qdisc can honor.  Database rates are bits/s; 'hc' stores bytes/s. */
2893 htb_parse_class_details__(struct netdev *netdev,
2894 const struct smap *details, struct htb_class *hc)
2896 const struct htb *htb = htb_get__(netdev);
2897 const char *min_rate_s = smap_get(details, "min-rate");
2898 const char *max_rate_s = smap_get(details, "max-rate");
2899 const char *burst_s = smap_get(details, "burst");
2900 const char *priority_s = smap_get(details, "priority");
2903 error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
2905 VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU",
2906 netdev_get_name(netdev));
2910 /* HTB requires at least an mtu sized min-rate to send any traffic even
2911 * on uncongested links. */
2912 hc->min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
2913 hc->min_rate = MAX(hc->min_rate, mtu);
2914 hc->min_rate = MIN(hc->min_rate, htb->max_rate);
2917 hc->max_rate = (max_rate_s
2918 ? strtoull(max_rate_s, NULL, 10) / 8
2920 hc->max_rate = MAX(hc->max_rate, hc->min_rate);
2921 hc->max_rate = MIN(hc->max_rate, htb->max_rate);
2925 * According to hints in the documentation that I've read, it is important
2926 * that 'burst' be at least as big as the largest frame that might be
2927 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
2928 * but having it a bit too small is a problem. Since netdev_get_mtu()
2929 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
2930 * the MTU. We actually add 64, instead of 14, as a guard against
2931 * additional headers getting tacked on somewhere that we're not aware of. */
2932 hc->burst = burst_s ? strtoull(burst_s, NULL, 10) / 8 : 0;
2933 hc->burst = MAX(hc->burst, mtu + 64);
2936 hc->priority = priority_s ? strtoul(priority_s, NULL, 10) : 0;
/* Queries the kernel for the HTB class 'handle' under 'parent' and parses
 * the reply into '*options' and/or '*stats' (either may be NULL). */
2942 htb_query_class__(const struct netdev *netdev, unsigned int handle,
2943 unsigned int parent, struct htb_class *options,
2944 struct netdev_queue_stats *stats)
2946 struct ofpbuf *reply;
2949 error = tc_query_class(netdev, handle, parent, &reply);
2951 error = htb_parse_tcmsg__(reply, NULL, options, stats);
2952 ofpbuf_delete(reply);
/* tc_ops "tc_install" hook: creates the root HTB qdisc, the default class
 * 1:fffe sized from 'details', and records local state via htb_install__. */
2958 htb_tc_install(struct netdev *netdev, const struct smap *details)
2962 error = htb_setup_qdisc__(netdev);
2964 struct htb_class hc;
2966 htb_parse_qdisc_details__(netdev, details, &hc);
2967 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2968 tc_make_handle(1, 0), &hc);
2970 htb_install__(netdev, hc.max_rate);
/* Downcasts a generic tc_queue back to its containing htb_class. */
2976 static struct htb_class *
2977 htb_class_cast__(const struct tc_queue *queue)
2979 return CONTAINER_OF(queue, struct htb_class, tc_queue);
/* Records (or refreshes) local state for queue 'queue_id' with parameters
 * 'hc'.  Allocates and inserts a new htb_class into the tc queue hmap if
 * the queue was not already known. */
2983 htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
2984 const struct htb_class *hc)
2986 struct htb *htb = htb_get__(netdev);
2987 size_t hash = hash_int(queue_id, 0);
2988 struct tc_queue *queue;
2989 struct htb_class *hcp;
2991 queue = tc_find_queue__(netdev, queue_id, hash);
2993 hcp = htb_class_cast__(queue);
2995 hcp = xmalloc(sizeof *hcp);
2996 queue = &hcp->tc_queue;
2997 queue->queue_id = queue_id;
2998 queue->created = time_msec();
2999 hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
3002 hcp->min_rate = hc->min_rate;
3003 hcp->max_rate = hc->max_rate;
3004 hcp->burst = hc->burst;
3005 hcp->priority = hc->priority;
/* tc_ops "tc_load" hook: reconstructs local HTB state from an existing
 * kernel qdisc by querying the default class and dumping all classes. */
3009 htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3012 struct nl_dump dump;
3013 struct htb_class hc;
3015 /* Get qdisc options. */
3017 htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
3018 htb_install__(netdev, hc.max_rate);
/* Get queues (one kernel class per OVS queue). */
3021 if (!start_queue_dump(netdev, &dump)) {
3024 while (nl_dump_next(&dump, &msg)) {
3025 unsigned int queue_id;
3027 if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3028 htb_update_queue__(netdev, queue_id, &hc);
3031 nl_dump_done(&dump);
3037 htb_tc_destroy(struct tc *tc)
3039 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
3040 struct htb_class *hc, *next;
3042 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &htb->tc.queues) {
3043 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
/* tc_ops "qdisc_get" hook: reports max-rate in bits/s (stored bytes/s). */
3051 htb_qdisc_get(const struct netdev *netdev, struct smap *details)
3053 const struct htb *htb = htb_get__(netdev);
3054 smap_add_format(details, "max-rate", "%llu", 8ULL * htb->max_rate);
/* tc_ops "qdisc_set" hook: re-parameterizes the default class 1:fffe and,
 * on success, updates the cached qdisc max_rate. */
3059 htb_qdisc_set(struct netdev *netdev, const struct smap *details)
3061 struct htb_class hc;
3064 htb_parse_qdisc_details__(netdev, details, &hc);
3065 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3066 tc_make_handle(1, 0), &hc);
3068 htb_get__(netdev)->max_rate = hc.max_rate;
/* tc_ops "class_get" hook: reports a queue's settings in bits (rates and
 * burst multiplied by 8); max-rate is omitted when equal to min-rate. */
3074 htb_class_get(const struct netdev *netdev OVS_UNUSED,
3075 const struct tc_queue *queue, struct smap *details)
3077 const struct htb_class *hc = htb_class_cast__(queue);
3079 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
3080 if (hc->min_rate != hc->max_rate) {
3081 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
3083 smap_add_format(details, "burst", "%llu", 8ULL * hc->burst);
3085 smap_add_format(details, "priority", "%u", hc->priority);
/* tc_ops "class_set" hook: validates 'details', programs kernel class
 * 1:(queue_id+1) under the default class, then mirrors it locally. */
3091 htb_class_set(struct netdev *netdev, unsigned int queue_id,
3092 const struct smap *details)
3094 struct htb_class hc;
3097 error = htb_parse_class_details__(netdev, details, &hc);
3102 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3103 tc_make_handle(1, 0xfffe), &hc);
3108 htb_update_queue__(netdev, queue_id, &hc);
/* tc_ops "class_delete" hook: removes the kernel class and, on success,
 * drops the local htb_class from the queue hmap. */
3113 htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
3115 struct htb_class *hc = htb_class_cast__(queue);
3116 struct htb *htb = htb_get__(netdev);
3119 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3121 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
/* tc_ops "class_get_stats" hook: queries kernel stats for one queue. */
3128 htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3129 struct netdev_queue_stats *stats)
3131 return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3132 tc_make_handle(1, 0xfffe), NULL, stats);
/* tc_ops "class_dump_stats" hook: parses one dumped class message and, if
 * its handle maps to an OVS queue, invokes 'cb' with the stats. */
3136 htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3137 const struct ofpbuf *nlmsg,
3138 netdev_dump_queue_stats_cb *cb, void *aux)
3140 struct netdev_queue_stats stats;
3141 unsigned int handle, major, minor;
3144 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3149 major = tc_get_major(handle);
3150 minor = tc_get_minor(handle);
3151 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
3152 (*cb)(minor - 1, &stats, aux);
/* Operations table binding the kernel "htb" qdisc to the OVS "linux-htb"
 * QoS type.  NOTE(review): several initializer lines are missing from this
 * extract. */
3157 static const struct tc_ops tc_ops_htb = {
3158 "htb", /* linux_name */
3159 "linux-htb", /* ovs_name */
3160 HTB_N_QUEUES, /* n_queues */
3169 htb_class_get_stats,
3170 htb_class_dump_stats
3173 /* "linux-hfsc" traffic control class. */
3175 #define HFSC_N_QUEUES 0xf000
/* NOTE(review): the 'struct hfsc' and 'struct hfsc_class' declarations are
 * mostly missing from this extract; only the embedded tc_queue member of
 * hfsc_class is visible. */
3183 struct tc_queue tc_queue;
/* Returns the struct hfsc embedded in 'netdev_''s current tc.  Only valid
 * while the device's qdisc is linux-hfsc. */
3188 static struct hfsc *
3189 hfsc_get__(const struct netdev *netdev_)
3191 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3192 return CONTAINER_OF(netdev->tc, struct hfsc, tc);
/* Downcasts a generic tc_queue back to its containing hfsc_class. */
3195 static struct hfsc_class *
3196 hfsc_class_cast__(const struct tc_queue *queue)
3198 return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
/* Allocates a struct hfsc with qdisc-level 'max_rate' and installs it as
 * 'netdev_''s tc state (freed later by hfsc_tc_destroy()). */
3202 hfsc_install__(struct netdev *netdev_, uint32_t max_rate)
3204 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3207 hfsc = xmalloc(sizeof *hfsc);
3208 tc_init(&hfsc->tc, &tc_ops_hfsc);
3209 hfsc->max_rate = max_rate;
3210 netdev->tc = &hfsc->tc;
/* Records (or refreshes) local state for queue 'queue_id' with parameters
 * 'hc'; inserts a new hfsc_class into the tc queue hmap if needed. */
3214 hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
3215 const struct hfsc_class *hc)
3219 struct hfsc_class *hcp;
3220 struct tc_queue *queue;
3222 hfsc = hfsc_get__(netdev);
3223 hash = hash_int(queue_id, 0);
3225 queue = tc_find_queue__(netdev, queue_id, hash);
3227 hcp = hfsc_class_cast__(queue);
3229 hcp = xmalloc(sizeof *hcp);
3230 queue = &hcp->tc_queue;
3231 queue->queue_id = queue_id;
3232 queue->created = time_msec();
3233 hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
3236 hcp->min_rate = hc->min_rate;
3237 hcp->max_rate = hc->max_rate;
/* Parses nested HFSC service-curve attributes in 'nl_options' into
 * 'class'.  Only linear curves with matching real-time/fair-share rates
 * and min <= max are accepted; anything else is rejected with a warning. */
3241 hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
3243 const struct tc_service_curve *rsc, *fsc, *usc;
3244 static const struct nl_policy tca_hfsc_policy[] = {
3246 .type = NL_A_UNSPEC,
3248 .min_len = sizeof(struct tc_service_curve),
3251 .type = NL_A_UNSPEC,
3253 .min_len = sizeof(struct tc_service_curve),
3256 .type = NL_A_UNSPEC,
3258 .min_len = sizeof(struct tc_service_curve),
3261 struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];
3263 if (!nl_parse_nested(nl_options, tca_hfsc_policy,
3264 attrs, ARRAY_SIZE(tca_hfsc_policy))) {
3265 VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
3269 rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
3270 fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
3271 usc = nl_attr_get(attrs[TCA_HFSC_USC]);
/* m1/d nonzero would mean a two-slope (non-linear) curve. */
3273 if (rsc->m1 != 0 || rsc->d != 0 ||
3274 fsc->m1 != 0 || fsc->d != 0 ||
3275 usc->m1 != 0 || usc->d != 0) {
3276 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3277 "Non-linear service curves are not supported.");
3281 if (rsc->m2 != fsc->m2) {
3282 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3283 "Real-time service curves are not supported ");
3287 if (rsc->m2 > usc->m2) {
3288 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3289 "Min-rate service curve is greater than "
3290 "the max-rate service curve.");
3294 class->min_rate = fsc->m2;
3295 class->max_rate = usc->m2;
/* Parses a tc class message in 'tcmsg'.  On success optionally stores the
 * OVS queue id in '*queue_id', HFSC parameters in '*options', and stats in
 * '*stats'. */
3300 hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
3301 struct hfsc_class *options,
3302 struct netdev_queue_stats *stats)
3305 unsigned int handle;
3306 struct nlattr *nl_options;
3308 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
3314 unsigned int major, minor;
3316 major = tc_get_major(handle);
3317 minor = tc_get_minor(handle);
/* Only classes 1:1 .. 1:HFSC_N_QUEUES map to OVS queues. */
3318 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3319 *queue_id = minor - 1;
3326 error = hfsc_parse_tca_options__(nl_options, options);
/* Queries the kernel for HFSC class 'handle' under 'parent' and parses the
 * reply into '*options' and/or '*stats' (either may be NULL). */
3333 hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
3334 unsigned int parent, struct hfsc_class *options,
3335 struct netdev_queue_stats *stats)
3338 struct ofpbuf *reply;
3340 error = tc_query_class(netdev, handle, parent, &reply);
3345 error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
3346 ofpbuf_delete(reply);
/* Extracts the qdisc-level "max-rate" from 'details' (bits/s -> bytes/s),
 * defaulting to link speed or 100 Mbps, and uses it for both curves. */
3351 hfsc_parse_qdisc_details__(struct netdev *netdev_, const struct smap *details,
3352 struct hfsc_class *class)
3354 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3356 const char *max_rate_s;
3358 max_rate_s = smap_get(details, "max-rate");
3359 max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
3362 enum netdev_features current;
3364 netdev_linux_read_features(netdev);
3365 current = !netdev->get_features_error ? netdev->current : 0;
3366 max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
3369 class->min_rate = max_rate;
3370 class->max_rate = max_rate;
/* Parses per-queue "min-rate"/"max-rate" from 'details' into 'class',
 * clamping min to [1, qdisc max] and max to [min, qdisc max]. */
3374 hfsc_parse_class_details__(struct netdev *netdev,
3375 const struct smap *details,
3376 struct hfsc_class * class)
3378 const struct hfsc *hfsc;
3379 uint32_t min_rate, max_rate;
3380 const char *min_rate_s, *max_rate_s;
3382 hfsc = hfsc_get__(netdev);
3383 min_rate_s = smap_get(details, "min-rate");
3384 max_rate_s = smap_get(details, "max-rate");
3386 min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
3387 min_rate = MAX(min_rate, 1);
3388 min_rate = MIN(min_rate, hfsc->max_rate);
3390 max_rate = (max_rate_s
3391 ? strtoull(max_rate_s, NULL, 10) / 8
3393 max_rate = MAX(max_rate, min_rate);
3394 max_rate = MIN(max_rate, hfsc->max_rate);
3396 class->min_rate = min_rate;
3397 class->max_rate = max_rate;
3402 /* Create an HFSC qdisc.
3404 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
3406 hfsc_setup_qdisc__(struct netdev * netdev)
3408 struct tcmsg *tcmsg;
3409 struct ofpbuf request;
3410 struct tc_hfsc_qopt opt;
/* Remove any existing root qdisc first so the add cannot conflict. */
3412 tc_del_qdisc(netdev);
3414 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3415 NLM_F_EXCL | NLM_F_CREATE, &request);
3421 tcmsg->tcm_handle = tc_make_handle(1, 0);
3422 tcmsg->tcm_parent = TC_H_ROOT;
3424 memset(&opt, 0, sizeof opt);
3427 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3428 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
3430 return tc_transact(&request, NULL);
3433 /* Create an HFSC class.
3435 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
3436 * sc rate <min_rate> ul rate <max_rate>" */
3438 hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
3439 unsigned int parent, struct hfsc_class *class)
3443 struct tcmsg *tcmsg;
3444 struct ofpbuf request;
3445 struct tc_service_curve min, max;
3447 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
3453 tcmsg->tcm_handle = handle;
3454 tcmsg->tcm_parent = parent;
/* Linear service curves: only the steady-state slope 'm2' is set (the
 * zeroing of m1/d is on missing lines -- TODO confirm). */
3458 min.m2 = class->min_rate;
3462 max.m2 = class->max_rate;
/* 'min' serves as both the real-time and fair-share curve; 'max' is the
 * upper-limit curve. */
3464 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3465 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3466 nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
3467 nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
3468 nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
3469 nl_msg_end_nested(&request, opt_offset);
3471 error = tc_transact(&request, NULL);
3473 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
3474 "min-rate %ubps, max-rate %ubps (%s)",
3475 netdev_get_name(netdev),
3476 tc_get_major(handle), tc_get_minor(handle),
3477 tc_get_major(parent), tc_get_minor(parent),
3478 class->min_rate, class->max_rate, ovs_strerror(error));
/* tc_ops "tc_install" hook: creates the root HFSC qdisc, the default class
 * 1:fffe, and records local state via hfsc_install__. */
3485 hfsc_tc_install(struct netdev *netdev, const struct smap *details)
3488 struct hfsc_class class;
3490 error = hfsc_setup_qdisc__(netdev);
3496 hfsc_parse_qdisc_details__(netdev, details, &class);
3497 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3498 tc_make_handle(1, 0), &class);
3504 hfsc_install__(netdev, class.max_rate);
/* tc_ops "tc_load" hook: reconstructs local HFSC state from an existing
 * kernel qdisc by querying the default class and dumping all classes. */
3509 hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3512 struct nl_dump dump;
3513 struct hfsc_class hc;
3516 hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
3517 hfsc_install__(netdev, hc.max_rate);
3519 if (!start_queue_dump(netdev, &dump)) {
3523 while (nl_dump_next(&dump, &msg)) {
3524 unsigned int queue_id;
3526 if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3527 hfsc_update_queue__(netdev, queue_id, &hc);
3531 nl_dump_done(&dump);
/* tc_ops "tc_destroy" hook: frees every queued hfsc_class and the struct
 * hfsc itself (the free of 'hc' is on a missing line -- TODO confirm). */
3536 hfsc_tc_destroy(struct tc *tc)
3539 struct hfsc_class *hc, *next;
3541 hfsc = CONTAINER_OF(tc, struct hfsc, tc);
3543 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
3544 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
/* tc_ops "qdisc_get" hook: reports max-rate in bits/s (stored bytes/s). */
3553 hfsc_qdisc_get(const struct netdev *netdev, struct smap *details)
3555 const struct hfsc *hfsc;
3556 hfsc = hfsc_get__(netdev);
3557 smap_add_format(details, "max-rate", "%llu", 8ULL * hfsc->max_rate);
/* tc_ops "qdisc_set" hook: re-parameterizes default class 1:fffe and, on
 * success, updates the cached qdisc max_rate. */
3562 hfsc_qdisc_set(struct netdev *netdev, const struct smap *details)
3565 struct hfsc_class class;
3567 hfsc_parse_qdisc_details__(netdev, details, &class);
3568 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3569 tc_make_handle(1, 0), &class);
3572 hfsc_get__(netdev)->max_rate = class.max_rate;
/* tc_ops "class_get" hook: reports a queue's rates in bits/s; max-rate is
 * omitted when equal to min-rate. */
3579 hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
3580 const struct tc_queue *queue, struct smap *details)
3582 const struct hfsc_class *hc;
3584 hc = hfsc_class_cast__(queue);
3585 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
3586 if (hc->min_rate != hc->max_rate) {
3587 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
/* tc_ops "class_set" hook: validates 'details', programs kernel class
 * 1:(queue_id+1) under the default class, then mirrors it locally. */
3593 hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
3594 const struct smap *details)
3597 struct hfsc_class class;
3599 error = hfsc_parse_class_details__(netdev, details, &class);
3604 error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3605 tc_make_handle(1, 0xfffe), &class);
3610 hfsc_update_queue__(netdev, queue_id, &class);
/* tc_ops "class_delete" hook: removes the kernel class and, on success,
 * drops the local hfsc_class from the queue hmap. */
3615 hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
3619 struct hfsc_class *hc;
3621 hc = hfsc_class_cast__(queue);
3622 hfsc = hfsc_get__(netdev);
3624 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3626 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
/* tc_ops "class_get_stats" hook: queries kernel stats for one queue. */
3633 hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3634 struct netdev_queue_stats *stats)
3636 return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3637 tc_make_handle(1, 0xfffe), NULL, stats);
/* tc_ops "class_dump_stats" hook: parses one dumped class message and, if
 * its handle maps to an OVS queue, invokes 'cb' with the stats. */
3641 hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3642 const struct ofpbuf *nlmsg,
3643 netdev_dump_queue_stats_cb *cb, void *aux)
3645 struct netdev_queue_stats stats;
3646 unsigned int handle, major, minor;
3649 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3654 major = tc_get_major(handle);
3655 minor = tc_get_minor(handle);
3656 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3657 (*cb)(minor - 1, &stats, aux);
/* Operations table binding the kernel "hfsc" qdisc to the OVS "linux-hfsc"
 * QoS type. */
3662 static const struct tc_ops tc_ops_hfsc = {
3663 "hfsc", /* linux_name */
3664 "linux-hfsc", /* ovs_name */
3665 HFSC_N_QUEUES, /* n_queues */
3666 hfsc_tc_install, /* tc_install */
3667 hfsc_tc_load, /* tc_load */
3668 hfsc_tc_destroy, /* tc_destroy */
3669 hfsc_qdisc_get, /* qdisc_get */
3670 hfsc_qdisc_set, /* qdisc_set */
3671 hfsc_class_get, /* class_get */
3672 hfsc_class_set, /* class_set */
3673 hfsc_class_delete, /* class_delete */
3674 hfsc_class_get_stats, /* class_get_stats */
3675 hfsc_class_dump_stats /* class_dump_stats */
3678 /* "linux-default" traffic control class.
3680 * This class represents the default, unnamed Linux qdisc. It corresponds to
3681 * the "" (empty string) QoS type in the OVS database. */
/* Points 'netdev_''s tc at a shared, immutable singleton for the default
 * qdisc; no per-device allocation is needed. */
3684 default_install__(struct netdev *netdev_)
3686 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3687 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_default);
3689 /* Nothing but a tc class implementation is allowed to write to a tc. This
3690 * class never does that, so we can legitimately use a const tc object. */
3691 netdev->tc = CONST_CAST(struct tc *, &tc);
/* tc_ops "tc_install" hook for the default qdisc: nothing to program. */
3695 default_tc_install(struct netdev *netdev,
3696 const struct smap *details OVS_UNUSED)
3698 default_install__(netdev);
/* tc_ops "tc_load" hook for the default qdisc: just adopt the singleton. */
3703 default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3705 default_install__(netdev);
/* Operations table for the unnamed default qdisc ("" QoS type); it has no
 * queues, so nearly every hook is NULL.  NOTE(review): some initializer
 * lines are missing from this extract. */
3709 static const struct tc_ops tc_ops_default = {
3710 NULL, /* linux_name */
3715 NULL, /* tc_destroy */
3716 NULL, /* qdisc_get */
3717 NULL, /* qdisc_set */
3718 NULL, /* class_get */
3719 NULL, /* class_set */
3720 NULL, /* class_delete */
3721 NULL, /* class_get_stats */
3722 NULL /* class_dump_stats */
3725 /* "linux-other" traffic control class.
/* tc_ops "tc_load" hook for qdiscs OVS does not understand: adopt a shared
 * read-only singleton, same pattern as default_install__(). */
3730 other_tc_load(struct netdev *netdev_, struct ofpbuf *nlmsg OVS_UNUSED)
3732 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3733 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_other);
3735 /* Nothing but a tc class implementation is allowed to write to a tc. This
3736 * class never does that, so we can legitimately use a const tc object. */
3737 netdev->tc = CONST_CAST(struct tc *, &tc);
/* Operations table for unrecognized qdiscs ("linux-other"): read-only, so
 * nearly every hook is NULL. */
3741 static const struct tc_ops tc_ops_other = {
3742 NULL, /* linux_name */
3743 "linux-other", /* ovs_name */
3745 NULL, /* tc_install */
3747 NULL, /* tc_destroy */
3748 NULL, /* qdisc_get */
3749 NULL, /* qdisc_set */
3750 NULL, /* class_get */
3751 NULL, /* class_set */
3752 NULL, /* class_delete */
3753 NULL, /* class_get_stats */
3754 NULL /* class_dump_stats */
3757 /* Traffic control. */
3759 /* Number of kernel "tc" ticks per second. */
3760 static double ticks_per_s;
3762 /* Number of kernel "jiffies" per second. This is used for the purpose of
3763 * computing buffer sizes. Generally kernel qdiscs need to be able to buffer
3764 * one jiffy's worth of data.
3766 * There are two possibilities here:
3768 * - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
3769 * approximate range of 100 to 1024. That means that we really need to
3770 * make sure that the qdisc can buffer that much data.
3772 * - 'buffer_hz' is an absurdly large number. That means that the kernel
3773 * has finely granular timers and there's no need to fudge additional room
3774 * for buffers. (There's no extra effort needed to implement that: the
3775 * large 'buffer_hz' is used as a divisor, so practically any number will
3776 * come out as 0 in the division. Small integer results in the case of
3777 * really high dividends won't have any real effect anyhow.)
3779 static unsigned int buffer_hz;
3781 /* Returns tc handle 'major':'minor'. */
/* TC_H_MAKE packs the 16-bit major (shifted into the high half) with the
 * 16-bit minor. */
3783 tc_make_handle(unsigned int major, unsigned int minor)
3785 return TC_H_MAKE(major << 16, minor);
3788 /* Returns the major number from 'handle'. */
3790 tc_get_major(unsigned int handle)
3792 return TC_H_MAJ(handle) >> 16;
3795 /* Returns the minor number from 'handle'. */
3797 tc_get_minor(unsigned int handle)
3799 return TC_H_MIN(handle);
/* Initializes 'request' as an RTM tc netlink request of 'type' (with
 * NLM_F_REQUEST plus 'flags') for 'netdev', and returns its tcmsg header
 * for the caller to complete.  Presumably returns NULL when the ifindex
 * lookup fails -- the error path is on missing lines; TODO confirm. */
3802 static struct tcmsg *
3803 tc_make_request(const struct netdev *netdev, int type, unsigned int flags,
3804 struct ofpbuf *request)
3806 struct tcmsg *tcmsg;
3810 error = get_ifindex(netdev, &ifindex);
3815 ofpbuf_init(request, 512);
3816 nl_msg_put_nlmsghdr(request, sizeof *tcmsg, type, NLM_F_REQUEST | flags);
3817 tcmsg = ofpbuf_put_zeros(request, sizeof *tcmsg);
3818 tcmsg->tcm_family = AF_UNSPEC;
3819 tcmsg->tcm_ifindex = ifindex;
3820 /* Caller should fill in tcmsg->tcm_handle. */
3821 /* Caller should fill in tcmsg->tcm_parent. */
/* Sends 'request' on the NETLINK_ROUTE socket and, if 'replyp' is nonnull,
 * stores the reply there.  Always uninitializes 'request'. */
3827 tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
3829 int error = nl_transact(NETLINK_ROUTE, request, replyp);
3830 ofpbuf_uninit(request);
3834 /* Adds or deletes a root ingress qdisc on 'netdev'. We use this for
3835 * policing configuration.
3837 * This function is equivalent to running the following when 'add' is true:
3838 * /sbin/tc qdisc add dev <devname> handle ffff: ingress
3840 * This function is equivalent to running the following when 'add' is false:
3841 * /sbin/tc qdisc del dev <devname> handle ffff: ingress
3843 * The configuration and stats may be seen with the following command:
3844 * /sbin/tc -s qdisc show dev <devname>
3846 * Returns 0 if successful, otherwise a positive errno value.
3849 tc_add_del_ingress_qdisc(struct netdev *netdev, bool add)
3851 struct ofpbuf request;
3852 struct tcmsg *tcmsg;
3854 int type = add ? RTM_NEWQDISC : RTM_DELQDISC;
3855 int flags = add ? NLM_F_EXCL | NLM_F_CREATE : 0;
3857 tcmsg = tc_make_request(netdev, type, flags, &request);
3861 tcmsg->tcm_handle = tc_make_handle(0xffff, 0);
3862 tcmsg->tcm_parent = TC_H_INGRESS;
3863 nl_msg_put_string(&request, TCA_KIND, "ingress");
3864 nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0);
3866 error = tc_transact(&request, NULL);
3868 /* If we're deleting the qdisc, don't worry about some of the
3869 * error conditions. */
/* ENOENT/EINVAL on delete just mean there was no ingress qdisc. */
3870 if (!add && (error == ENOENT || error == EINVAL)) {
3879 /* Adds a policer to 'netdev' with a rate of 'kbits_rate' and a burst size
3882 * This function is equivalent to running:
3883 * /sbin/tc filter add dev <devname> parent ffff: protocol all prio 49
3884 * basic police rate <kbits_rate>kbit burst <kbits_burst>k
3887 * The configuration and stats may be seen with the following command:
3888 * /sbin/tc -s filter show <devname> eth0 parent ffff:
3890 * Returns 0 if successful, otherwise a positive errno value.
3893 tc_add_policer(struct netdev *netdev, int kbits_rate, int kbits_burst)
3895 struct tc_police tc_police;
3896 struct ofpbuf request;
3897 struct tcmsg *tcmsg;
3898 size_t basic_offset;
3899 size_t police_offset;
/* TC_POLICE_SHOT drops packets that exceed the configured rate. */
3903 memset(&tc_police, 0, sizeof tc_police);
3904 tc_police.action = TC_POLICE_SHOT;
3905 tc_police.mtu = mtu;
/* kbits -> bytes/s for the rate; kbits -> bytes (then ticks) for burst. */
3906 tc_fill_rate(&tc_police.rate, (kbits_rate * 1000)/8, mtu);
3907 tc_police.burst = tc_bytes_to_ticks(tc_police.rate.rate,
3908 kbits_burst * 1024);
3910 tcmsg = tc_make_request(netdev, RTM_NEWTFILTER,
3911 NLM_F_EXCL | NLM_F_CREATE, &request);
3915 tcmsg->tcm_parent = tc_make_handle(0xffff, 0);
3916 tcmsg->tcm_info = tc_make_handle(49,
3917 (OVS_FORCE uint16_t) htons(ETH_P_ALL));
3919 nl_msg_put_string(&request, TCA_KIND, "basic");
3920 basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3921 police_offset = nl_msg_start_nested(&request, TCA_BASIC_POLICE);
3922 nl_msg_put_unspec(&request, TCA_POLICE_TBF, &tc_police, sizeof tc_police);
3923 tc_put_rtab(&request, TCA_POLICE_RATE, &tc_police.rate);
3924 nl_msg_end_nested(&request, police_offset);
3925 nl_msg_end_nested(&request, basic_offset);
3927 error = tc_transact(&request, NULL);
3938 /* The values in psched are not individually very meaningful, but they are
3939 * important. The tables below show some values seen in the wild.
3943 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
3944 * (Before that, there are hints that it was 1000000000.)
3946 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
3950 * -----------------------------------
3951 * [1] 000c8000 000f4240 000f4240 00000064
3952 * [2] 000003e8 00000400 000f4240 3b9aca00
3953 * [3] 000003e8 00000400 000f4240 3b9aca00
3954 * [4] 000003e8 00000400 000f4240 00000064
3955 * [5] 000003e8 00000040 000f4240 3b9aca00
3956 * [6] 000003e8 00000040 000f4240 000000f9
3958 * a b c d ticks_per_s buffer_hz
3959 * ------- --------- ---------- ------------- ----------- -------------
3960 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
3961 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3962 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3963 * [4] 1,000 1,024 1,000,000 100 976,562 100
3964 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
3965 * [6] 1,000 64 1,000,000 249 15,625,000 249
3967 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
3968 * [2] 2.6.26-1-686-bigmem from Debian lenny
3969 * [3] 2.6.26-2-sparc64 from Debian lenny
3970 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
3971 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
3972 * [6] 2.6.34 from kernel.org on KVM
/* NOTE(review): the function signature line is elided in this view; this body
 * reads /proc/net/psched exactly once (guarded by 'once') to derive
 * 'ticks_per_s' and, presumably, 'buffer_hz' — confirm against full source. */
3974 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
3975 static const char fn[] = "/proc/net/psched";
3976 unsigned int a, b, c, d;
/* Already initialized by an earlier caller; nothing to do. */
3979 if (!ovsthread_once_start(&once)) {
3986 stream = fopen(fn, "r");
3988 VLOG_WARN("%s: open failed: %s", fn, ovs_strerror(errno));
/* The file holds four hex words, described in the tables above. */
3992 if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
3993 VLOG_WARN("%s: read failed", fn);
3997 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
4001 VLOG_WARN("%s: invalid scheduler parameters", fn);
/* ticks_per_s = a * c / b, done in double to avoid integer overflow. */
4005 ticks_per_s = (double) a * c / b;
4009 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
4012 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);
4015 ovsthread_once_done(&once);
4018 /* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
4019 * rate of 'rate' bytes per second. */
4021 tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
/* 'ticks_per_s' is the file-scope value derived from /proc/net/psched. */
4024 return (rate * ticks) / ticks_per_s;
4027 /* Returns the number of ticks that it would take to transmit 'size' bytes at a
4028 * rate of 'rate' bytes per second. */
4030 tc_bytes_to_ticks(unsigned int rate, unsigned int size)
/* Guard against division by zero when 'rate' is 0; widen to unsigned long
 * long so ticks_per_s * size cannot overflow 32 bits. */
4033 return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
4036 /* Returns the number of bytes that need to be reserved for qdisc buffering at
4037 * a transmission rate of 'rate' bytes per second. */
4039 tc_buffer_per_jiffy(unsigned int rate)
/* 'buffer_hz' is the file-scope timer frequency derived from psched data. */
4042 return rate / buffer_hz;
4045 /* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
4046 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
4047 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
4048 * stores NULL into it if it is absent.
4050 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
4053 * Returns 0 if successful, otherwise a positive errno value. */
4055 tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
4056 struct nlattr **options)
/* TCA_KIND is mandatory; TCA_OPTIONS may be absent (then NULL is stored). */
4058 static const struct nl_policy tca_policy[] = {
4059 [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
4060 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
4062 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
/* Attributes start after the netlink header plus the fixed tcmsg. */
4064 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
4065 tca_policy, ta, ARRAY_SIZE(ta))) {
4066 VLOG_WARN_RL(&rl, "failed to parse qdisc message");
4071 *kind = nl_attr_get_string(ta[TCA_KIND]);
4075 *options = ta[TCA_OPTIONS];
4090 /* Given Netlink 'msg' that describes a class, extracts the queue ID (e.g. the
4091 * minor number of its class ID) into '*queue_id', its TCA_OPTIONS attribute
4092 * into '*options', and its queue statistics into '*stats'. Any of the output
4093 * arguments may be null.
4095 * Returns 0 if successful, otherwise a positive errno value. */
4097 tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
4098 struct nlattr **options, struct netdev_queue_stats *stats)
4100 static const struct nl_policy tca_policy[] = {
4101 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
4102 [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
4104 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
4106 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
4107 tca_policy, ta, ARRAY_SIZE(ta))) {
4108 VLOG_WARN_RL(&rl, "failed to parse class message");
/* The class handle comes from the fixed tcmsg header, not an attribute. */
4113 struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
4114 *handlep = tc->tcm_handle;
4118 *options = ta[TCA_OPTIONS];
4122 const struct gnet_stats_queue *gsq;
4123 struct gnet_stats_basic gsb;
/* Stats live in a second level of nesting under TCA_STATS2. */
4125 static const struct nl_policy stats_policy[] = {
4126 [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
4127 .min_len = sizeof gsb },
4128 [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
4129 .min_len = sizeof *gsq },
4131 struct nlattr *sa[ARRAY_SIZE(stats_policy)];
4133 if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
4134 sa, ARRAY_SIZE(sa))) {
4135 VLOG_WARN_RL(&rl, "failed to parse class stats");
4139 /* Alignment issues screw up the length of struct gnet_stats_basic on
4140 * some arch/bitsize combinations. Newer versions of Linux have a
4141 * struct gnet_stats_basic_packed, but we can't depend on that. The
4142 * easiest thing to do is just to make a copy. */
4143 memset(&gsb, 0, sizeof gsb);
4144 memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
4145 MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
4146 stats->tx_bytes = gsb.bytes;
4147 stats->tx_packets = gsb.packets;
/* Queue drops are reported to callers as transmit errors. */
4149 gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
4150 stats->tx_errors = gsq->drops;
/* On a stats-parse failure path the stats are zeroed rather than left
 * uninitialized. */
4160 memset(stats, 0, sizeof *stats);
4165 /* Queries the kernel for class with identifier 'handle' and parent 'parent'
4168 tc_query_class(const struct netdev *netdev,
4169 unsigned int handle, unsigned int parent,
4170 struct ofpbuf **replyp)
4172 struct ofpbuf request;
4173 struct tcmsg *tcmsg;
/* NLM_F_ECHO asks the kernel to send back the class description. */
4176 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO, &request);
4180 tcmsg->tcm_handle = handle;
4181 tcmsg->tcm_parent = parent;
4183 error = tc_transact(&request, replyp);
/* Log handles in the usual "major:minor" tc notation. */
4185 VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
4186 netdev_get_name(netdev),
4187 tc_get_major(handle), tc_get_minor(handle),
4188 tc_get_major(parent), tc_get_minor(parent),
4189 ovs_strerror(error));
4194 /* Equivalent to "tc class del dev <name> handle <handle>". */
4196 tc_delete_class(const struct netdev *netdev, unsigned int handle)
4198 struct ofpbuf request;
4199 struct tcmsg *tcmsg;
4202 tcmsg = tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
/* Parent 0 lets the kernel locate the class by handle alone. */
4206 tcmsg->tcm_handle = handle;
4207 tcmsg->tcm_parent = 0;
4209 error = tc_transact(&request, NULL);
4211 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
4212 netdev_get_name(netdev),
4213 tc_get_major(handle), tc_get_minor(handle),
4214 ovs_strerror(error));
4219 /* Equivalent to "tc qdisc del dev <name> root". */
4221 tc_del_qdisc(struct netdev *netdev_)
4223 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4224 struct ofpbuf request;
4225 struct tcmsg *tcmsg;
4228 tcmsg = tc_make_request(netdev_, RTM_DELQDISC, 0, &request);
/* Handle 1:0 under TC_H_ROOT: OVS-created qdiscs use handle 1:0 (see the
 * comment in tc_query_qdisc below). */
4232 tcmsg->tcm_handle = tc_make_handle(1, 0);
4233 tcmsg->tcm_parent = TC_H_ROOT;
4235 error = tc_transact(&request, NULL);
4236 if (error == EINVAL) {
4237 /* EINVAL probably means that the default qdisc was in use, in which
4238 * case we've accomplished our purpose. */
/* Also tear down the cached userspace tc state, if its ops define a
 * destructor. */
4241 if (!error && netdev->tc) {
4242 if (netdev->tc->ops->tc_destroy) {
4243 netdev->tc->ops->tc_destroy(netdev->tc);
4250 /* If 'netdev''s qdisc type and parameters are not yet known, queries the
4251 * kernel to determine what they are. Returns 0 if successful, otherwise a
4252 * positive errno value. */
4254 tc_query_qdisc(const struct netdev *netdev_)
4256 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4257 struct ofpbuf request, *qdisc;
4258 const struct tc_ops *ops;
4259 struct tcmsg *tcmsg;
4267 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
4268 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
4269 * 2.6.35 without that fix backported to it.
4271 * To avoid the OOPS, we must not make a request that would attempt to dump
4272 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
4273 * few others. There are a few ways that I can see to do this, but most of
4274 * them seem to be racy (and if you lose the race the kernel OOPSes). The
4275 * technique chosen here is to assume that any non-default qdisc that we
4276 * create will have a class with handle 1:0. The built-in qdiscs only have
4277 * a class with handle 0:0.
4279 * We could check for Linux 2.6.35+ and use a more straightforward method
4281 tcmsg = tc_make_request(netdev_, RTM_GETQDISC, NLM_F_ECHO, &request);
4285 tcmsg->tcm_handle = tc_make_handle(1, 0);
4286 tcmsg->tcm_parent = 0;
4288 /* Figure out what tc class to instantiate. */
4289 error = tc_transact(&request, &qdisc);
4293 error = tc_parse_qdisc(qdisc, &kind, NULL);
/* Unparseable reply: treat the qdisc as one we don't manage. */
4295 ops = &tc_ops_other;
4297 ops = tc_lookup_linux_name(kind);
/* Unknown qdisc kind gets its own rate limiter so the message isn't
 * drowned out by other warnings using the shared 'rl'. */
4299 static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
4300 VLOG_INFO_RL(&rl2, "unknown qdisc \"%s\"", kind);
4302 ops = &tc_ops_other;
4305 } else if (error == ENOENT) {
4306 /* Either it's a built-in qdisc, or it's a qdisc set up by some
4307 * other entity that doesn't have a handle 1:0. We will assume
4308 * that it's the system default qdisc. */
4309 ops = &tc_ops_default;
4312 /* Who knows? Maybe the device got deleted. */
4313 VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
4314 netdev_get_name(netdev_), ovs_strerror(error));
4315 ops = &tc_ops_other;
4318 /* Instantiate it. */
4319 load_error = ops->tc_load(CONST_CAST(struct netdev *, netdev_), qdisc);
/* tc_load() succeeding must leave netdev->tc populated, and vice versa. */
4320 ovs_assert((load_error == 0) == (netdev->tc != NULL));
4321 ofpbuf_delete(qdisc);
/* The query error takes precedence over any load error. */
4323 return error ? error : load_error;
4326 /* Linux traffic control uses tables with 256 entries ("rtab" tables) to
4327 approximate the time to transmit packets of various lengths. For an MTU of
4328 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
4329 represents two possible packet lengths; for a MTU of 513 through 1024, four
4330 possible lengths; and so on.
4332 Returns, for the specified 'mtu', the number of bits that packet lengths
4333 need to be shifted right to fit within such a 256-entry table. */
4335 tc_calc_cell_log(unsigned int mtu)
/* A zero/unset MTU falls back to the Ethernet payload maximum. */
4340 mtu = ETH_PAYLOAD_MAX;
/* Account for L2 framing on top of the payload MTU. */
4342 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
/* cell_log = ceil(log2(mtu / 256)): halve until the frame fits in 256
 * table cells. */
4344 for (cell_log = 0; mtu >= 256; cell_log++) {
4351 /* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
4354 tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
4356 memset(rate, 0, sizeof *rate);
4357 rate->cell_log = tc_calc_cell_log(mtu);
4358 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
4359 /* rate->cell_align = 0; */ /* distro headers. */
/* Minimum policed unit: no packet is billed below a minimum Ethernet frame. */
4360 rate->mpu = ETH_TOTAL_MIN;
4364 /* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
4365 * attribute of the specified "type".
4367 * See tc_calc_cell_log() above for a description of "rtab"s. */
4369 tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
4374 rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
4375 for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
/* Entry i covers packet sizes up to (i + 1) << cell_log bytes. */
4376 unsigned packet_size = (i + 1) << rate->cell_log;
/* Clamp to the minimum policed unit, matching rate->mpu in tc_fill_rate(). */
4377 if (packet_size < rate->mpu) {
4378 packet_size = rate->mpu;
/* Each entry is the transmit time, in ticks, for that packet size. */
4380 rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
4384 /* Calculates the proper value of 'buffer' or 'cbuffer' in HTB options given a
4385 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
4386 * burst size of 'burst_bytes'. (If no value was requested, a 'burst_bytes' of
4389 tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
/* Never let the burst fall below one jiffy's worth of traffic plus one MTU,
 * or the qdisc could stall between timer ticks. */
4391 unsigned int min_burst = tc_buffer_per_jiffy(Bps) + mtu;
4392 return tc_bytes_to_ticks(Bps, MAX(burst_bytes, min_burst));
4395 /* Linux-only functions declared in netdev-linux.h */
4397 /* Modifies the 'flag' bit in ethtool's flags field for 'netdev'. If
4398 * 'enable' is true, the bit is set. Otherwise, it is cleared. */
4400 netdev_linux_ethtool_set_flag(struct netdev *netdev, uint32_t flag,
4401 const char *flag_name, bool enable)
4403 const char *netdev_name = netdev_get_name(netdev);
4404 struct ethtool_value evalue;
/* Step 1: read the current flags (ETHTOOL_GFLAGS). */
4408 COVERAGE_INC(netdev_get_ethtool);
4409 memset(&evalue, 0, sizeof evalue);
4410 error = netdev_linux_do_ethtool(netdev_name,
4411 (struct ethtool_cmd *)&evalue,
4412 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
/* Step 2: write back the flags with 'flag' set or cleared (ETHTOOL_SFLAGS). */
4417 COVERAGE_INC(netdev_set_ethtool);
4418 evalue.data = new_flags = (evalue.data & ~flag) | (enable ? flag : 0);
4419 error = netdev_linux_do_ethtool(netdev_name,
4420 (struct ethtool_cmd *)&evalue,
4421 ETHTOOL_SFLAGS, "ETHTOOL_SFLAGS");
/* Step 3: re-read and verify the driver actually applied the change, since
 * ETHTOOL_SFLAGS can silently ignore unsupported bits. */
4426 COVERAGE_INC(netdev_get_ethtool);
4427 memset(&evalue, 0, sizeof evalue);
4428 error = netdev_linux_do_ethtool(netdev_name,
4429 (struct ethtool_cmd *)&evalue,
4430 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
4435 if (new_flags != evalue.data) {
4436 VLOG_WARN_RL(&rl, "attempt to %s ethtool %s flag on network "
4437 "device %s failed", enable ? "enable" : "disable",
4438 flag_name, netdev_name);
4445 /* Utility functions. */
4447 /* Copies 'src' into 'dst', performing format conversion in the process. */
/* Field-for-field widening copy from the kernel's 32-bit rtnl_link_stats
 * into OVS's netdev_stats; no field is transformed, only converted. */
4449 netdev_stats_from_rtnl_link_stats(struct netdev_stats *dst,
4450 const struct rtnl_link_stats *src)
4452 dst->rx_packets = src->rx_packets;
4453 dst->tx_packets = src->tx_packets;
4454 dst->rx_bytes = src->rx_bytes;
4455 dst->tx_bytes = src->tx_bytes;
4456 dst->rx_errors = src->rx_errors;
4457 dst->tx_errors = src->tx_errors;
4458 dst->rx_dropped = src->rx_dropped;
4459 dst->tx_dropped = src->tx_dropped;
4460 dst->multicast = src->multicast;
4461 dst->collisions = src->collisions;
4462 dst->rx_length_errors = src->rx_length_errors;
4463 dst->rx_over_errors = src->rx_over_errors;
4464 dst->rx_crc_errors = src->rx_crc_errors;
4465 dst->rx_frame_errors = src->rx_frame_errors;
4466 dst->rx_fifo_errors = src->rx_fifo_errors;
4467 dst->rx_missed_errors = src->rx_missed_errors;
4468 dst->tx_aborted_errors = src->tx_aborted_errors;
4469 dst->tx_carrier_errors = src->tx_carrier_errors;
4470 dst->tx_fifo_errors = src->tx_fifo_errors;
4471 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
4472 dst->tx_window_errors = src->tx_window_errors;
/* Retrieves interface statistics for the device with the given 'ifindex' via
 * an RTM_GETLINK netlink request, converting them into '*stats'.
 * NOTE(review): return-type line and error-return lines are elided here. */
4476 get_stats_via_netlink(int ifindex, struct netdev_stats *stats)
4478 /* Policy for RTNLGRP_LINK messages.
4480 * There are *many* more fields in these messages, but currently we only
4481 * care about these fields. */
4482 static const struct nl_policy rtnlgrp_link_policy[] = {
4483 [IFLA_IFNAME] = { .type = NL_A_STRING, .optional = false },
4484 [IFLA_STATS] = { .type = NL_A_UNSPEC, .optional = true,
4485 .min_len = sizeof(struct rtnl_link_stats) },
4488 struct ofpbuf request;
4489 struct ofpbuf *reply;
4490 struct ifinfomsg *ifi;
4491 struct nlattr *attrs[ARRAY_SIZE(rtnlgrp_link_policy)];
/* Build RTM_GETLINK keyed on ifindex and transact it synchronously. */
4494 ofpbuf_init(&request, 0);
4495 nl_msg_put_nlmsghdr(&request, sizeof *ifi, RTM_GETLINK, NLM_F_REQUEST);
4496 ifi = ofpbuf_put_zeros(&request, sizeof *ifi);
4497 ifi->ifi_family = PF_UNSPEC;
4498 ifi->ifi_index = ifindex;
4499 error = nl_transact(NETLINK_ROUTE, &request, &reply);
4500 ofpbuf_uninit(&request);
4505 if (!nl_policy_parse(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg),
4506 rtnlgrp_link_policy,
4507 attrs, ARRAY_SIZE(rtnlgrp_link_policy))) {
4508 ofpbuf_delete(reply);
/* IFLA_STATS is optional in the policy, so its absence must be checked. */
4512 if (!attrs[IFLA_STATS]) {
4513 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
4514 ofpbuf_delete(reply);
4518 netdev_stats_from_rtnl_link_stats(stats, nl_attr_get(attrs[IFLA_STATS]));
4520 ofpbuf_delete(reply);
/* Fallback stats path: scans /proc/net/dev for the line matching
 * 'netdev_name' and parses its counters into '*stats'.
 * NOTE(review): several lines (fgets setup, sscanf prefix, returns) are
 * elided in this view. */
4526 get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats)
4528 static const char fn[] = "/proc/net/dev";
4533 stream = fopen(fn, "r");
4535 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, ovs_strerror(errno));
4540 while (fgets(line, sizeof line, stream)) {
4543 #define X64 "%"SCNu64
/* Two groups of 8 counters per line (rx then tx); one field in each
 * group is skipped with "%*u". */
4546 X64 X64 X64 X64 X64 X64 X64 "%*u"
4547 X64 X64 X64 X64 X64 X64 X64 "%*u",
4553 &stats->rx_fifo_errors,
4554 &stats->rx_frame_errors,
4560 &stats->tx_fifo_errors,
4562 &stats->tx_carrier_errors)) {
4563 VLOG_WARN_RL(&rl, "%s:%d: parse error", fn, ln);
4564 } else if (!strcmp(devname, netdev_name)) {
/* /proc/net/dev does not report these counters; UINT64_MAX marks
 * them as unavailable to callers. */
4565 stats->rx_length_errors = UINT64_MAX;
4566 stats->rx_over_errors = UINT64_MAX;
4567 stats->rx_crc_errors = UINT64_MAX;
4568 stats->rx_missed_errors = UINT64_MAX;
4569 stats->tx_aborted_errors = UINT64_MAX;
4570 stats->tx_heartbeat_errors = UINT64_MAX;
4571 stats->tx_window_errors = UINT64_MAX;
4577 VLOG_WARN_RL(&rl, "%s: no stats for %s", fn, netdev_name);
/* Reads 'dev''s interface flags via SIOCGIFFLAGS into '*flags'.
 * Returns the ioctl's error code (0 on success). */
4583 get_flags(const struct netdev *dev, unsigned int *flags)
4589 error = af_inet_ifreq_ioctl(dev->name, &ifr, SIOCGIFFLAGS, "SIOCGIFFLAGS");
4591 *flags = ifr.ifr_flags;
/* Sets the interface flags of device 'name' to 'flags' via SIOCSIFFLAGS.
 * Returns 0 on success, otherwise a positive errno value. */
4597 set_flags(const char *name, unsigned int flags)
4601 ifr.ifr_flags = flags;
4602 return af_inet_ifreq_ioctl(name, &ifr, SIOCSIFFLAGS, "SIOCSIFFLAGS");
/* Looks up the kernel ifindex for device 'netdev_name' via SIOCGIFINDEX.
 * Returns the (non-negative) ifindex on success; on failure, presumably a
 * negated errno value — the error-return line is elided in this view. */
4606 do_get_ifindex(const char *netdev_name)
4611 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4612 COVERAGE_INC(netdev_get_ifindex);
4614 error = af_inet_ioctl(SIOCGIFINDEX, &ifr);
4616 VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
4617 netdev_name, ovs_strerror(error));
4620 return ifr.ifr_ifindex;
/* Cached wrapper around do_get_ifindex(): stores the ifindex (or the error)
 * in the netdev and only queries the kernel when VALID_IFINDEX is unset.
 * Returns 0 and sets '*ifindexp' on success, otherwise the cached errno. */
4624 get_ifindex(const struct netdev *netdev_, int *ifindexp)
4626 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4628 if (!(netdev->cache_valid & VALID_IFINDEX)) {
4629 int ifindex = do_get_ifindex(netdev_get_name(netdev_));
/* do_get_ifindex() signals failure with a negative value; negate it back
 * into a positive errno for the cache. */
4632 netdev->get_ifindex_error = -ifindex;
4633 netdev->ifindex = 0;
4635 netdev->get_ifindex_error = 0;
4636 netdev->ifindex = ifindex;
/* Both the success and failure outcomes are cached. */
4638 netdev->cache_valid |= VALID_IFINDEX;
4641 *ifindexp = netdev->ifindex;
4642 return netdev->get_ifindex_error;
/* Reads the Ethernet hardware address of device 'netdev_name' into 'ea'
 * via SIOCGIFHWADDR.
 * NOTE(review): the return statements are elided in this view. */
4646 get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN])
4652 memset(&ifr, 0, sizeof ifr);
4653 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4654 COVERAGE_INC(netdev_get_hwaddr);
4655 error = af_inet_ioctl(SIOCGIFHWADDR, &ifr);
4657 /* ENODEV probably means that a vif disappeared asynchronously and
4658 * hasn't been removed from the database yet, so reduce the log level
4659 * to INFO for that case. */
4660 VLOG(error == ENODEV ? VLL_INFO : VLL_ERR,
4661 "ioctl(SIOCGIFHWADDR) on %s device failed: %s",
4662 netdev_name, ovs_strerror(error));
/* Only AF_UNSPEC or ARPHRD_ETHER address families carry a usable MAC. */
4665 hwaddr_family = ifr.ifr_hwaddr.sa_family;
4666 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER) {
4667 VLOG_WARN("%s device has unknown hardware address family %d",
4668 netdev_name, hwaddr_family);
4670 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
/* Sets the Ethernet hardware address of device 'netdev_name' to 'mac'
 * via SIOCSIFHWADDR.  Logs at ERR level on failure. */
4675 set_etheraddr(const char *netdev_name,
4676 const uint8_t mac[ETH_ADDR_LEN])
4681 memset(&ifr, 0, sizeof ifr);
4682 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
/* The kernel requires the address family to be declared as Ethernet. */
4683 ifr.ifr_hwaddr.sa_family = ARPHRD_ETHER;
4684 memcpy(ifr.ifr_hwaddr.sa_data, mac, ETH_ADDR_LEN);
4685 COVERAGE_INC(netdev_set_hwaddr);
4686 error = af_inet_ioctl(SIOCSIFHWADDR, &ifr);
4688 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
4689 netdev_name, ovs_strerror(error));
/* Issues ethtool ioctl 'cmd' (e.g. ETHTOOL_GFLAGS) for device 'name', using
 * 'ecmd' as the in/out command buffer; 'cmd_name' is used only for logging.
 * EOPNOTSUPP is deliberately not logged (common and expected). */
4695 netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
4696 int cmd, const char *cmd_name)
4701 memset(&ifr, 0, sizeof ifr);
4702 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
/* SIOCETHTOOL passes the ethtool command struct through ifr_data. */
4703 ifr.ifr_data = (caddr_t) ecmd;
4706 error = af_inet_ioctl(SIOCETHTOOL, &ifr);
4708 if (error != EOPNOTSUPP) {
4709 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
4710 "failed: %s", cmd_name, name, ovs_strerror(error));
4712 /* The device doesn't support this operation. That's pretty
4713 * common, so there's no point in logging anything. */
/* Fetches an IPv4 address of 'netdev' with ioctl 'cmd' (e.g. SIOCGIFADDR),
 * storing it in '*ip' on success; 'cmd_name' is used for logging. */
4720 netdev_linux_get_ipv4(const struct netdev *netdev, struct in_addr *ip,
4721 int cmd, const char *cmd_name)
4726 ifr.ifr_addr.sa_family = AF_INET;
4727 error = af_inet_ifreq_ioctl(netdev_get_name(netdev), &ifr, cmd, cmd_name);
/* ALIGNED_CAST avoids a strict-aliasing/alignment warning when viewing
 * the generic sockaddr as sockaddr_in. */
4729 const struct sockaddr_in *sin = ALIGNED_CAST(struct sockaddr_in *,
4731 *ip = sin->sin_addr;
4736 /* Returns an AF_PACKET raw socket or a negative errno value. */
4738 af_packet_sock(void)
4740 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
4743 if (ovsthread_once_start(&once)) {
4744 sock = socket(AF_PACKET, SOCK_RAW, 0);
4746 int error = set_nonblocking(sock);
4753 VLOG_ERR("failed to create packet socket: %s",
4754 ovs_strerror(errno));
4756 ovsthread_once_done(&once);