2 * Copyright (c) 2009, 2010, 2011, 2012, 2013 Nicira, Inc.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
19 #include "netdev-linux.h"
24 #include <linux/filter.h>
25 #include <linux/gen_stats.h>
26 #include <linux/if_ether.h>
27 #include <linux/if_packet.h>
28 #include <linux/if_tun.h>
29 #include <linux/types.h>
30 #include <linux/ethtool.h>
31 #include <linux/mii.h>
32 #include <linux/pkt_cls.h>
33 #include <linux/pkt_sched.h>
34 #include <linux/rtnetlink.h>
35 #include <linux/sockios.h>
36 #include <linux/version.h>
37 #include <sys/types.h>
38 #include <sys/ioctl.h>
39 #include <sys/socket.h>
41 #include <net/if_arp.h>
42 #include <net/route.h>
43 #include <netinet/in.h>
49 #include "connectivity.h"
51 #include "dpif-linux.h"
52 #include "dynamic-string.h"
53 #include "fatal-signal.h"
56 #include "netdev-provider.h"
57 #include "netdev-vport.h"
58 #include "netlink-notifier.h"
59 #include "netlink-socket.h"
62 #include "openflow/openflow.h"
63 #include "ovs-atomic.h"
65 #include "poll-loop.h"
66 #include "rtnetlink-link.h"
69 #include "socket-util.h"
72 #include "unaligned.h"
75 VLOG_DEFINE_THIS_MODULE(netdev_linux);
77 COVERAGE_DEFINE(netdev_set_policing);
78 COVERAGE_DEFINE(netdev_arp_lookup);
79 COVERAGE_DEFINE(netdev_get_ifindex);
80 COVERAGE_DEFINE(netdev_get_hwaddr);
81 COVERAGE_DEFINE(netdev_set_hwaddr);
82 COVERAGE_DEFINE(netdev_get_ethtool);
83 COVERAGE_DEFINE(netdev_set_ethtool);
86 /* These were introduced in Linux 2.6.14, so they might be missing if we have
88 #ifndef ADVERTISED_Pause
89 #define ADVERTISED_Pause (1 << 13)
91 #ifndef ADVERTISED_Asym_Pause
92 #define ADVERTISED_Asym_Pause (1 << 14)
95 /* These were introduced in Linux 2.6.24, so they might be missing if we
96 * have old headers. */
97 #ifndef ETHTOOL_GFLAGS
98 #define ETHTOOL_GFLAGS 0x00000025 /* Get flags bitmap(ethtool_value) */
100 #ifndef ETHTOOL_SFLAGS
101 #define ETHTOOL_SFLAGS 0x00000026 /* Set flags bitmap(ethtool_value) */
104 /* This was introduced in Linux 2.6.25, so it might be missing if we have old
107 #define TC_RTAB_SIZE 1024
110 /* Linux 2.6.21 introduced struct tpacket_auxdata.
111 * Linux 2.6.27 added the tp_vlan_tci member.
112 * Linux 3.0 defined TP_STATUS_VLAN_VALID.
113 * Linux 3.13 repurposed a padding member for tp_vlan_tpid and defined
114 * TP_STATUS_VLAN_TPID_VALID.
116 * With all this churn it's easiest to unconditionally define a replacement
117 * structure that has everything we want.
119 #ifndef TP_STATUS_VLAN_VALID
120 #define TP_STATUS_VLAN_VALID (1 << 4)
122 #ifndef TP_STATUS_VLAN_TPID_VALID
123 #define TP_STATUS_VLAN_TPID_VALID (1 << 6)
125 #undef tpacket_auxdata
126 #define tpacket_auxdata rpl_tpacket_auxdata
127 struct tpacket_auxdata {
133 uint16_t tp_vlan_tci;
134 uint16_t tp_vlan_tpid;
138 VALID_IFINDEX = 1 << 0,
139 VALID_ETHERADDR = 1 << 1,
143 VALID_POLICING = 1 << 5,
144 VALID_VPORT_STAT_ERROR = 1 << 6,
145 VALID_DRVINFO = 1 << 7,
146 VALID_FEATURES = 1 << 8,
149 /* Traffic control. */
151 /* An instance of a traffic control class. Always associated with a particular
154 * Each TC implementation subclasses this with whatever additional data it
157 const struct tc_ops *ops;
158 struct hmap queues; /* Contains "struct tc_queue"s.
159 * Read by generic TC layer.
160 * Written only by TC implementation. */
163 #define TC_INITIALIZER(TC, OPS) { OPS, HMAP_INITIALIZER(&(TC)->queues) }
165 /* One traffic control queue.
167 * Each TC implementation subclasses this with whatever additional data it
170 struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
171 unsigned int queue_id; /* OpenFlow queue ID. */
172 long long int created; /* Time queue was created, in msecs. */
175 /* A particular kind of traffic control. Each implementation generally maps to
176 * one particular Linux qdisc class.
178 * The functions below return 0 if successful or a positive errno value on
179 * failure, except where otherwise noted. All of them must be provided, except
180 * where otherwise noted. */
182 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
183 * This is null for tc_ops_default and tc_ops_other, for which there are no
184 * appropriate values. */
185 const char *linux_name;
187 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
188 const char *ovs_name;
190 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
191 * queues. The queues are numbered 0 through n_queues - 1. */
192 unsigned int n_queues;
194 /* Called to install this TC class on 'netdev'. The implementation should
195 * make the Netlink calls required to set up 'netdev' with the right qdisc
196 * and configure it according to 'details'. The implementation may assume
197 * that the current qdisc is the default; that is, there is no need for it
198 * to delete the current qdisc before installing itself.
200 * The contents of 'details' should be documented as valid for 'ovs_name'
201 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
202 * (which is built as ovs-vswitchd.conf.db(8)).
204 * This function must return 0 if and only if it sets 'netdev->tc' to an
205 * initialized 'struct tc'.
207 * (This function is null for tc_ops_other, which cannot be installed. For
208 * other TC classes it should always be nonnull.) */
209 int (*tc_install)(struct netdev *netdev, const struct smap *details);
211 /* Called when the netdev code determines (through a Netlink query) that
212 * this TC class's qdisc is installed on 'netdev', but we didn't install
213 * it ourselves and so don't know any of the details.
215 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
216 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
217 * implementation should parse the other attributes of 'nlmsg' as
218 * necessary to determine its configuration. If necessary it should also
219 * use Netlink queries to determine the configuration of queues on
222 * This function must return 0 if and only if it sets 'netdev->tc' to an
223 * initialized 'struct tc'. */
224 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
226 /* Destroys the data structures allocated by the implementation as part of
227 * 'tc'. (This includes destroying 'tc->queues' by calling
230 * The implementation should not need to perform any Netlink calls. If
231 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
232 * (But it may not be desirable.)
234 * This function may be null if 'tc' is trivial. */
235 void (*tc_destroy)(struct tc *tc);
237 /* Retrieves details of 'netdev->tc' configuration into 'details'.
239 * The implementation should not need to perform any Netlink calls, because
240 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
241 * cached the configuration.
243 * The contents of 'details' should be documented as valid for 'ovs_name'
244 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
245 * (which is built as ovs-vswitchd.conf.db(8)).
247 * This function may be null if 'tc' is not configurable.
249 int (*qdisc_get)(const struct netdev *netdev, struct smap *details);
251 /* Reconfigures 'netdev->tc' according to 'details', performing any
252 * required Netlink calls to complete the reconfiguration.
254 * The contents of 'details' should be documented as valid for 'ovs_name'
255 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
256 * (which is built as ovs-vswitchd.conf.db(8)).
258 * This function may be null if 'tc' is not configurable.
260 int (*qdisc_set)(struct netdev *, const struct smap *details);
262 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
263 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
265 * The contents of 'details' should be documented as valid for 'ovs_name'
266 * in the "other_config" column in the "Queue" table in
267 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
269 * The implementation should not need to perform any Netlink calls, because
270 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
271 * cached the queue configuration.
273 * This function may be null if 'tc' does not have queues ('n_queues' is
275 int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
276 struct smap *details);
278 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
279 * 'details', perfoming any required Netlink calls to complete the
280 * reconfiguration. The caller ensures that 'queue_id' is less than
283 * The contents of 'details' should be documented as valid for 'ovs_name'
284 * in the "other_config" column in the "Queue" table in
285 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
287 * This function may be null if 'tc' does not have queues or its queues are
288 * not configurable. */
289 int (*class_set)(struct netdev *, unsigned int queue_id,
290 const struct smap *details);
292 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
293 * tc_queue's within 'netdev->tc->queues'.
295 * This function may be null if 'tc' does not have queues or its queues
296 * cannot be deleted. */
297 int (*class_delete)(struct netdev *, struct tc_queue *queue);
299 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
300 * 'struct tc_queue's within 'netdev->tc->queues'.
302 * On success, initializes '*stats'.
304 * This function may be null if 'tc' does not have queues or if it cannot
305 * report queue statistics. */
306 int (*class_get_stats)(const struct netdev *netdev,
307 const struct tc_queue *queue,
308 struct netdev_queue_stats *stats);
310 /* Extracts queue stats from 'nlmsg', which is a response to a
311 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
313 * This function may be null if 'tc' does not have queues or if it cannot
314 * report queue statistics. */
315 int (*class_dump_stats)(const struct netdev *netdev,
316 const struct ofpbuf *nlmsg,
317 netdev_dump_queue_stats_cb *cb, void *aux);
321 tc_init(struct tc *tc, const struct tc_ops *ops)
324 hmap_init(&tc->queues);
328 tc_destroy(struct tc *tc)
330 hmap_destroy(&tc->queues);
333 static const struct tc_ops tc_ops_htb;
334 static const struct tc_ops tc_ops_hfsc;
335 static const struct tc_ops tc_ops_default;
336 static const struct tc_ops tc_ops_other;
338 static const struct tc_ops *const tcs[] = {
339 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
340 &tc_ops_hfsc, /* Hierarchical fair service curve. */
341 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
342 &tc_ops_other, /* Some other qdisc. */
346 static unsigned int tc_make_handle(unsigned int major, unsigned int minor);
347 static unsigned int tc_get_major(unsigned int handle);
348 static unsigned int tc_get_minor(unsigned int handle);
350 static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
351 static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
352 static unsigned int tc_buffer_per_jiffy(unsigned int rate);
354 static struct tcmsg *tc_make_request(const struct netdev *, int type,
355 unsigned int flags, struct ofpbuf *);
356 static int tc_transact(struct ofpbuf *request, struct ofpbuf **replyp);
357 static int tc_add_del_ingress_qdisc(struct netdev *netdev, bool add);
358 static int tc_add_policer(struct netdev *netdev, int kbits_rate,
361 static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
362 struct nlattr **options);
363 static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
364 struct nlattr **options,
365 struct netdev_queue_stats *);
366 static int tc_query_class(const struct netdev *,
367 unsigned int handle, unsigned int parent,
368 struct ofpbuf **replyp);
369 static int tc_delete_class(const struct netdev *, unsigned int handle);
371 static int tc_del_qdisc(struct netdev *netdev);
372 static int tc_query_qdisc(const struct netdev *netdev);
374 static int tc_calc_cell_log(unsigned int mtu);
375 static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
376 static void tc_put_rtab(struct ofpbuf *, uint16_t type,
377 const struct tc_ratespec *rate);
378 static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
380 struct netdev_linux {
383 /* Protects all members below. */
384 struct ovs_mutex mutex;
386 unsigned int cache_valid;
388 bool miimon; /* Link status of last poll. */
389 long long int miimon_interval; /* Miimon Poll rate. Disabled if <= 0. */
390 struct timer miimon_timer;
392 /* The following are figured out "on demand" only. They are only valid
393 * when the corresponding VALID_* bit in 'cache_valid' is set. */
395 uint8_t etheraddr[ETH_ADDR_LEN];
396 struct in_addr address, netmask;
399 unsigned int ifi_flags;
400 long long int carrier_resets;
401 uint32_t kbits_rate; /* Policing data. */
402 uint32_t kbits_burst;
403 int vport_stats_error; /* Cached error code from vport_get_stats().
404 0 or an errno value. */
405 int netdev_mtu_error; /* Cached error code from SIOCGIFMTU or SIOCSIFMTU. */
406 int ether_addr_error; /* Cached error code from set/get etheraddr. */
407 int netdev_policing_error; /* Cached error code from set policing. */
408 int get_features_error; /* Cached error code from ETHTOOL_GSET. */
409 int get_ifindex_error; /* Cached error code from SIOCGIFINDEX. */
411 enum netdev_features current; /* Cached from ETHTOOL_GSET. */
412 enum netdev_features advertised; /* Cached from ETHTOOL_GSET. */
413 enum netdev_features supported; /* Cached from ETHTOOL_GSET. */
415 struct ethtool_drvinfo drvinfo; /* Cached from ETHTOOL_GDRVINFO. */
418 /* For devices of class netdev_tap_class only. */
422 struct netdev_rx_linux {
428 /* This is set pretty low because we probably won't learn anything from the
429 * additional log messages. */
430 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
432 /* Polling miimon status for all ports causes performance degradation when
433 * handling a large number of ports. If there are no devices using miimon, then
434 * we skip netdev_linux_miimon_run() and netdev_linux_miimon_wait(). */
435 static atomic_int miimon_cnt = ATOMIC_VAR_INIT(0);
437 static void netdev_linux_run(void);
439 static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
440 int cmd, const char *cmd_name);
441 static int netdev_linux_get_ipv4(const struct netdev *, struct in_addr *,
442 int cmd, const char *cmd_name);
443 static int get_flags(const struct netdev *, unsigned int *flags);
444 static int set_flags(const char *, unsigned int flags);
445 static int update_flags(struct netdev_linux *netdev, enum netdev_flags off,
446 enum netdev_flags on, enum netdev_flags *old_flagsp)
447 OVS_REQUIRES(netdev->mutex);
448 static int do_get_ifindex(const char *netdev_name);
449 static int get_ifindex(const struct netdev *, int *ifindexp);
450 static int do_set_addr(struct netdev *netdev,
451 int ioctl_nr, const char *ioctl_name,
452 struct in_addr addr);
453 static int get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN]);
454 static int set_etheraddr(const char *netdev_name, const uint8_t[ETH_ADDR_LEN]);
455 static int get_stats_via_netlink(const struct netdev *, struct netdev_stats *);
456 static int af_packet_sock(void);
457 static bool netdev_linux_miimon_enabled(void);
458 static void netdev_linux_miimon_run(void);
459 static void netdev_linux_miimon_wait(void);
462 is_netdev_linux_class(const struct netdev_class *netdev_class)
464 return netdev_class->run == netdev_linux_run;
468 is_tap_netdev(const struct netdev *netdev)
470 return netdev_get_class(netdev) == &netdev_tap_class;
473 static struct netdev_linux *
474 netdev_linux_cast(const struct netdev *netdev)
476 ovs_assert(is_netdev_linux_class(netdev_get_class(netdev)));
478 return CONTAINER_OF(netdev, struct netdev_linux, up);
481 static struct netdev_rx_linux *
482 netdev_rx_linux_cast(const struct netdev_rx *rx)
484 ovs_assert(is_netdev_linux_class(netdev_get_class(rx->netdev)));
485 return CONTAINER_OF(rx, struct netdev_rx_linux, up);
488 static void netdev_linux_update(struct netdev_linux *netdev,
489 const struct rtnetlink_link_change *)
490 OVS_REQUIRES(netdev->mutex);
491 static void netdev_linux_changed(struct netdev_linux *netdev,
492 unsigned int ifi_flags, unsigned int mask)
493 OVS_REQUIRES(netdev->mutex);
495 /* Returns a NETLINK_ROUTE socket listening for RTNLGRP_LINK changes, or NULL
496 * if no such socket could be created. */
497 static struct nl_sock *
498 netdev_linux_notify_sock(void)
500 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
501 static struct nl_sock *sock;
503 if (ovsthread_once_start(&once)) {
506 error = nl_sock_create(NETLINK_ROUTE, &sock);
508 error = nl_sock_join_mcgroup(sock, RTNLGRP_LINK);
510 nl_sock_destroy(sock);
514 ovsthread_once_done(&once);
521 netdev_linux_miimon_enabled(void)
525 atomic_read(&miimon_cnt, &miimon);
530 netdev_linux_run(void)
532 struct nl_sock *sock;
535 if (netdev_linux_miimon_enabled()) {
536 netdev_linux_miimon_run();
539 sock = netdev_linux_notify_sock();
545 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
546 uint64_t buf_stub[4096 / 8];
549 ofpbuf_use_stub(&buf, buf_stub, sizeof buf_stub);
550 error = nl_sock_recv(sock, &buf, false);
552 struct rtnetlink_link_change change;
554 if (rtnetlink_link_parse(&buf, &change)) {
555 struct netdev *netdev_ = netdev_from_name(change.ifname);
556 if (netdev_ && is_netdev_linux_class(netdev_->netdev_class)) {
557 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
559 ovs_mutex_lock(&netdev->mutex);
560 netdev_linux_update(netdev, &change);
561 ovs_mutex_unlock(&netdev->mutex);
563 netdev_close(netdev_);
565 } else if (error == ENOBUFS) {
566 struct shash device_shash;
567 struct shash_node *node;
571 shash_init(&device_shash);
572 netdev_get_devices(&netdev_linux_class, &device_shash);
573 SHASH_FOR_EACH (node, &device_shash) {
574 struct netdev *netdev_ = node->data;
575 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
578 ovs_mutex_lock(&netdev->mutex);
579 get_flags(netdev_, &flags);
580 netdev_linux_changed(netdev, flags, 0);
581 ovs_mutex_unlock(&netdev->mutex);
583 netdev_close(netdev_);
585 shash_destroy(&device_shash);
586 } else if (error != EAGAIN) {
587 VLOG_WARN_RL(&rl, "error reading or parsing netlink (%s)",
588 ovs_strerror(error));
595 netdev_linux_wait(void)
597 struct nl_sock *sock;
599 if (netdev_linux_miimon_enabled()) {
600 netdev_linux_miimon_wait();
602 sock = netdev_linux_notify_sock();
604 nl_sock_wait(sock, POLLIN);
609 netdev_linux_changed(struct netdev_linux *dev,
610 unsigned int ifi_flags, unsigned int mask)
611 OVS_REQUIRES(dev->mutex)
613 seq_change(connectivity_seq_get());
615 if ((dev->ifi_flags ^ ifi_flags) & IFF_RUNNING) {
616 dev->carrier_resets++;
618 dev->ifi_flags = ifi_flags;
620 dev->cache_valid &= mask;
624 netdev_linux_update(struct netdev_linux *dev,
625 const struct rtnetlink_link_change *change)
626 OVS_REQUIRES(dev->mutex)
628 if (change->nlmsg_type == RTM_NEWLINK) {
630 netdev_linux_changed(dev, change->ifi_flags, VALID_DRVINFO);
632 /* Update netdev from rtnl-change msg. */
634 dev->mtu = change->mtu;
635 dev->cache_valid |= VALID_MTU;
636 dev->netdev_mtu_error = 0;
639 if (!eth_addr_is_zero(change->addr)) {
640 memcpy(dev->etheraddr, change->addr, ETH_ADDR_LEN);
641 dev->cache_valid |= VALID_ETHERADDR;
642 dev->ether_addr_error = 0;
645 dev->ifindex = change->ifi_index;
646 dev->cache_valid |= VALID_IFINDEX;
647 dev->get_ifindex_error = 0;
650 netdev_linux_changed(dev, change->ifi_flags, 0);
654 static struct netdev *
655 netdev_linux_alloc(void)
657 struct netdev_linux *netdev = xzalloc(sizeof *netdev);
662 netdev_linux_common_construct(struct netdev_linux *netdev)
664 ovs_mutex_init(&netdev->mutex);
667 /* Creates system and internal devices. */
669 netdev_linux_construct(struct netdev *netdev_)
671 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
674 netdev_linux_common_construct(netdev);
676 error = get_flags(&netdev->up, &netdev->ifi_flags);
677 if (error == ENODEV) {
678 if (netdev->up.netdev_class != &netdev_internal_class) {
679 /* The device does not exist, so don't allow it to be opened. */
682 /* "Internal" netdevs have to be created as netdev objects before
683 * they exist in the kernel, because creating them in the kernel
684 * happens by passing a netdev object to dpif_port_add().
685 * Therefore, ignore the error. */
692 /* For most types of netdevs we open the device for each call of
693 * netdev_open(). However, this is not the case with tap devices,
694 * since it is only possible to open the device once. In this
695 * situation we share a single file descriptor, and consequently
696 * buffers, across all readers. Therefore once data is read it will
697 * be unavailable to other reads for tap devices. */
699 netdev_linux_construct_tap(struct netdev *netdev_)
701 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
702 static const char tap_dev[] = "/dev/net/tun";
703 const char *name = netdev_->name;
707 netdev_linux_common_construct(netdev);
709 /* Open tap device. */
710 netdev->tap_fd = open(tap_dev, O_RDWR);
711 if (netdev->tap_fd < 0) {
713 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, ovs_strerror(error));
717 /* Create tap device. */
718 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
719 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
720 if (ioctl(netdev->tap_fd, TUNSETIFF, &ifr) == -1) {
721 VLOG_WARN("%s: creating tap device failed: %s", name,
722 ovs_strerror(errno));
727 /* Make non-blocking. */
728 error = set_nonblocking(netdev->tap_fd);
736 close(netdev->tap_fd);
741 netdev_linux_destruct(struct netdev *netdev_)
743 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
745 if (netdev->tc && netdev->tc->ops->tc_destroy) {
746 netdev->tc->ops->tc_destroy(netdev->tc);
749 if (netdev_get_class(netdev_) == &netdev_tap_class
750 && netdev->tap_fd >= 0)
752 close(netdev->tap_fd);
755 if (netdev->miimon_interval > 0) {
757 atomic_sub(&miimon_cnt, 1, &junk);
760 ovs_mutex_destroy(&netdev->mutex);
/* netdev-provider 'dealloc' callback: frees the container allocated by
 * netdev_linux_alloc(). */
static void
netdev_linux_dealloc(struct netdev *netdev_)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);

    free(netdev);
}
770 static struct netdev_rx *
771 netdev_linux_rx_alloc(void)
773 struct netdev_rx_linux *rx = xzalloc(sizeof *rx);
778 netdev_linux_rx_construct(struct netdev_rx *rx_)
780 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
781 struct netdev *netdev_ = rx->up.netdev;
782 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
785 ovs_mutex_lock(&netdev->mutex);
786 rx->is_tap = is_tap_netdev(netdev_);
788 rx->fd = netdev->tap_fd;
790 struct sockaddr_ll sll;
792 /* Result of tcpdump -dd inbound */
793 static const struct sock_filter filt[] = {
794 { 0x28, 0, 0, 0xfffff004 }, /* ldh [0] */
795 { 0x15, 0, 1, 0x00000004 }, /* jeq #4 jt 2 jf 3 */
796 { 0x6, 0, 0, 0x00000000 }, /* ret #0 */
797 { 0x6, 0, 0, 0x0000ffff } /* ret #65535 */
799 static const struct sock_fprog fprog = {
800 ARRAY_SIZE(filt), (struct sock_filter *) filt
803 /* Create file descriptor. */
804 rx->fd = socket(PF_PACKET, SOCK_RAW, 0);
807 VLOG_ERR("failed to create raw socket (%s)", ovs_strerror(error));
812 if (setsockopt(rx->fd, SOL_PACKET, PACKET_AUXDATA, &val, sizeof val)) {
814 VLOG_ERR("%s: failed to mark socket for auxdata (%s)",
815 netdev_get_name(netdev_), ovs_strerror(error));
819 /* Set non-blocking mode. */
820 error = set_nonblocking(rx->fd);
825 /* Get ethernet device index. */
826 error = get_ifindex(&netdev->up, &ifindex);
831 /* Bind to specific ethernet device. */
832 memset(&sll, 0, sizeof sll);
833 sll.sll_family = AF_PACKET;
834 sll.sll_ifindex = ifindex;
835 sll.sll_protocol = htons(ETH_P_ALL);
836 if (bind(rx->fd, (struct sockaddr *) &sll, sizeof sll) < 0) {
838 VLOG_ERR("%s: failed to bind raw socket (%s)",
839 netdev_get_name(netdev_), ovs_strerror(error));
843 /* Filter for only inbound packets. */
844 error = setsockopt(rx->fd, SOL_SOCKET, SO_ATTACH_FILTER, &fprog,
848 VLOG_ERR("%s: failed to attach filter (%s)",
849 netdev_get_name(netdev_), ovs_strerror(error));
853 ovs_mutex_unlock(&netdev->mutex);
861 ovs_mutex_unlock(&netdev->mutex);
866 netdev_linux_rx_destruct(struct netdev_rx *rx_)
868 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
/* rx 'dealloc' callback: frees the container allocated by
 * netdev_linux_rx_alloc(). */
static void
netdev_linux_rx_dealloc(struct netdev_rx *rx_)
{
    struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);

    free(rx);
}
884 auxdata_to_vlan_tpid(const struct tpacket_auxdata *aux)
886 if (aux->tp_status & TP_STATUS_VLAN_TPID_VALID) {
887 return htons(aux->tp_vlan_tpid);
889 return htons(ETH_TYPE_VLAN);
/* Returns true if 'aux' carries a VLAN TCI worth reinserting: either a
 * nonzero TCI, or TP_STATUS_VLAN_VALID set (which covers a legitimate
 * all-zero TCI on kernels that report validity explicitly). */
static bool
auxdata_has_vlan_tci(const struct tpacket_auxdata *aux)
{
    return aux->tp_vlan_tci || aux->tp_status & TP_STATUS_VLAN_VALID;
}
900 netdev_linux_rx_recv_sock(int fd, struct ofpbuf *buffer)
905 struct cmsghdr *cmsg;
908 char buffer[CMSG_SPACE(sizeof(struct tpacket_auxdata))];
912 /* Reserve headroom for a single VLAN tag */
913 ofpbuf_reserve(buffer, VLAN_HEADER_LEN);
914 size = ofpbuf_tailroom(buffer);
916 iov.iov_base = buffer->data;
918 msgh.msg_name = NULL;
919 msgh.msg_namelen = 0;
922 msgh.msg_control = &cmsg_buffer;
923 msgh.msg_controllen = sizeof cmsg_buffer;
927 retval = recvmsg(fd, &msgh, MSG_TRUNC);
928 } while (retval < 0 && errno == EINTR);
932 } else if (retval > size) {
936 buffer->size += retval;
938 for (cmsg = CMSG_FIRSTHDR(&msgh); cmsg; cmsg = CMSG_NXTHDR(&msgh, cmsg)) {
939 const struct tpacket_auxdata *aux;
941 if (cmsg->cmsg_level != SOL_PACKET
942 || cmsg->cmsg_type != PACKET_AUXDATA
943 || cmsg->cmsg_len < CMSG_LEN(sizeof(struct tpacket_auxdata))) {
947 aux = ALIGNED_CAST(struct tpacket_auxdata *, CMSG_DATA(cmsg));
948 if (auxdata_has_vlan_tci(aux)) {
949 if (retval < ETH_HEADER_LEN) {
953 eth_push_vlan(buffer, auxdata_to_vlan_tpid(aux),
954 htons(aux->tp_vlan_tci));
963 netdev_linux_rx_recv_tap(int fd, struct ofpbuf *buffer)
966 size_t size = ofpbuf_tailroom(buffer);
969 retval = read(fd, buffer->data, size);
970 } while (retval < 0 && errno == EINTR);
974 } else if (retval > size) {
978 buffer->size += retval;
983 netdev_linux_rx_recv(struct netdev_rx *rx_, struct ofpbuf *buffer)
985 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
989 ? netdev_linux_rx_recv_tap(rx->fd, buffer)
990 : netdev_linux_rx_recv_sock(rx->fd, buffer));
991 if (retval && retval != EAGAIN && retval != EMSGSIZE) {
992 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
993 ovs_strerror(errno), netdev_rx_get_name(rx_));
1000 netdev_linux_rx_wait(struct netdev_rx *rx_)
1002 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
1003 poll_fd_wait(rx->fd, POLLIN);
1007 netdev_linux_rx_drain(struct netdev_rx *rx_)
1009 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
1012 int error = af_inet_ifreq_ioctl(netdev_rx_get_name(rx_), &ifr,
1013 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
1017 drain_fd(rx->fd, ifr.ifr_qlen);
1020 return drain_rcvbuf(rx->fd);
1024 /* Sends 'buffer' on 'netdev'. Returns 0 if successful, otherwise a positive
1025 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
1026 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
1027 * the packet is too big or too small to transmit on the device.
1029 * The caller retains ownership of 'buffer' in all cases.
1031 * The kernel maintains a packet transmission queue, so the caller is not
1032 * expected to do additional queuing of packets. */
1034 netdev_linux_send(struct netdev *netdev_, const void *data, size_t size)
1039 if (!is_tap_netdev(netdev_)) {
1040 /* Use our AF_PACKET socket to send to this device. */
1041 struct sockaddr_ll sll;
1047 sock = af_packet_sock();
1052 ifindex = netdev_get_ifindex(netdev_);
1057 /* We don't bother setting most fields in sockaddr_ll because the
1058 * kernel ignores them for SOCK_RAW. */
1059 memset(&sll, 0, sizeof sll);
1060 sll.sll_family = AF_PACKET;
1061 sll.sll_ifindex = ifindex;
1063 iov.iov_base = CONST_CAST(void *, data);
1066 msg.msg_name = &sll;
1067 msg.msg_namelen = sizeof sll;
1070 msg.msg_control = NULL;
1071 msg.msg_controllen = 0;
1074 retval = sendmsg(sock, &msg, 0);
1076 /* Use the tap fd to send to this device. This is essential for
1077 * tap devices, because packets sent to a tap device with an
1078 * AF_PACKET socket will loop back to be *received* again on the
1079 * tap device. This doesn't occur on other interface types
1080 * because we attach a socket filter to the rx socket. */
1081 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1083 retval = write(netdev->tap_fd, data, size);
1087 /* The Linux AF_PACKET implementation never blocks waiting for room
1088 * for packets, instead returning ENOBUFS. Translate this into
1089 * EAGAIN for the caller. */
1090 if (errno == ENOBUFS) {
1092 } else if (errno == EINTR) {
1094 } else if (errno != EAGAIN) {
1095 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
1096 netdev_get_name(netdev_), ovs_strerror(errno));
1099 } else if (retval != size) {
1100 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%"PRIuSIZE"d bytes of "
1101 "%"PRIuSIZE") on %s", retval, size, netdev_get_name(netdev_));
/* Registers with the poll loop to wake up from the next call to poll_block()
 * when the packet transmission queue has sufficient room to transmit a packet
 * with netdev_send().
 *
 * The kernel maintains a packet transmission queue, so the client is not
 * expected to do additional queuing of packets.  Thus, this function is
 * unlikely to ever be used.  It is included for completeness. */
static void
netdev_linux_send_wait(struct netdev *netdev)
{
    if (is_tap_netdev(netdev)) {
        /* TAP device always accepts packets.*/
        poll_immediate_wake();
    }
}
1125 /* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
1126 * otherwise a positive errno value. */
1128 netdev_linux_set_etheraddr(struct netdev *netdev_,
1129 const uint8_t mac[ETH_ADDR_LEN])
1131 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1132 enum netdev_flags old_flags = 0;
1135 ovs_mutex_lock(&netdev->mutex);
1137 if (netdev->cache_valid & VALID_ETHERADDR) {
1138 error = netdev->ether_addr_error;
1139 if (error || eth_addr_equals(netdev->etheraddr, mac)) {
1142 netdev->cache_valid &= ~VALID_ETHERADDR;
1145 /* Tap devices must be brought down before setting the address. */
1146 if (is_tap_netdev(netdev_)) {
1147 update_flags(netdev, NETDEV_UP, 0, &old_flags);
1149 error = set_etheraddr(netdev_get_name(netdev_), mac);
1150 if (!error || error == ENODEV) {
1151 netdev->ether_addr_error = error;
1152 netdev->cache_valid |= VALID_ETHERADDR;
1154 memcpy(netdev->etheraddr, mac, ETH_ADDR_LEN);
1158 if (is_tap_netdev(netdev_) && old_flags & NETDEV_UP) {
1159 update_flags(netdev, 0, NETDEV_UP, &old_flags);
1163 ovs_mutex_unlock(&netdev->mutex);
1167 /* Copies 'netdev''s MAC address to 'mac' which is passed as param. */
1169 netdev_linux_get_etheraddr(const struct netdev *netdev_,
1170 uint8_t mac[ETH_ADDR_LEN])
1172 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1175 ovs_mutex_lock(&netdev->mutex);
1176 if (!(netdev->cache_valid & VALID_ETHERADDR)) {
1177 netdev->ether_addr_error = get_etheraddr(netdev_get_name(netdev_),
1179 netdev->cache_valid |= VALID_ETHERADDR;
1182 error = netdev->ether_addr_error;
1184 memcpy(mac, netdev->etheraddr, ETH_ADDR_LEN);
1186 ovs_mutex_unlock(&netdev->mutex);
/* Reads the device MTU via the SIOCGIFMTU ioctl, caching both the value and
 * any resulting errno; caller must hold 'netdev->mutex'. */
netdev_linux_get_mtu__(struct netdev_linux *netdev, int *mtup)
if (!(netdev->cache_valid & VALID_MTU)) {
netdev->netdev_mtu_error = af_inet_ifreq_ioctl(
netdev_get_name(&netdev->up), &ifr, SIOCGIFMTU, "SIOCGIFMTU");
netdev->mtu = ifr.ifr_mtu;
netdev->cache_valid |= VALID_MTU;
error = netdev->netdev_mtu_error;
*mtup = netdev->mtu;
1213 /* Returns the maximum size of transmitted (and received) packets on 'netdev',
1214 * in bytes, not including the hardware header; thus, this is typically 1500
1215 * bytes for Ethernet devices. */
1217 netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
1219 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1222 ovs_mutex_lock(&netdev->mutex);
1223 error = netdev_linux_get_mtu__(netdev, mtup);
1224 ovs_mutex_unlock(&netdev->mutex);
/* Sets the maximum size of transmitted (MTU) for given device using linux
 * networking ioctl interface.
netdev_linux_set_mtu(const struct netdev *netdev_, int mtu)
struct netdev_linux *netdev = netdev_linux_cast(netdev_);
ovs_mutex_lock(&netdev->mutex);
/* Skip the ioctl when the cached MTU already matches, or a previous
 * attempt recorded an error. */
if (netdev->cache_valid & VALID_MTU) {
error = netdev->netdev_mtu_error;
if (error || netdev->mtu == mtu) {
netdev->cache_valid &= ~VALID_MTU;
error = af_inet_ifreq_ioctl(netdev_get_name(netdev_), &ifr,
SIOCSIFMTU, "SIOCSIFMTU");
/* Cache the outcome; ENODEV is cached so a vanished device is not
 * re-probed on every call. */
if (!error || error == ENODEV) {
netdev->netdev_mtu_error = error;
netdev->mtu = ifr.ifr_mtu;
netdev->cache_valid |= VALID_MTU;
ovs_mutex_unlock(&netdev->mutex);
1260 /* Returns the ifindex of 'netdev', if successful, as a positive number.
1261 * On failure, returns a negative errno value. */
1263 netdev_linux_get_ifindex(const struct netdev *netdev_)
1265 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1268 ovs_mutex_lock(&netdev->mutex);
1269 error = get_ifindex(netdev_, &ifindex);
1270 ovs_mutex_unlock(&netdev->mutex);
1272 return error ? -error : ifindex;
1276 netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
1278 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1280 ovs_mutex_lock(&netdev->mutex);
1281 if (netdev->miimon_interval > 0) {
1282 *carrier = netdev->miimon;
1284 *carrier = (netdev->ifi_flags & IFF_RUNNING) != 0;
1286 ovs_mutex_unlock(&netdev->mutex);
1291 static long long int
1292 netdev_linux_get_carrier_resets(const struct netdev *netdev_)
1294 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1295 long long int carrier_resets;
1297 ovs_mutex_lock(&netdev->mutex);
1298 carrier_resets = netdev->carrier_resets;
1299 ovs_mutex_unlock(&netdev->mutex);
1301 return carrier_resets;
1305 netdev_linux_do_miimon(const char *name, int cmd, const char *cmd_name,
1306 struct mii_ioctl_data *data)
1311 memset(&ifr, 0, sizeof ifr);
1312 memcpy(&ifr.ifr_data, data, sizeof *data);
1313 error = af_inet_ifreq_ioctl(name, &ifr, cmd, cmd_name);
1314 memcpy(data, &ifr.ifr_data, sizeof *data);
/* Determines link status for 'name', preferring the MII registers and falling
 * back to ETHTOOL_GLINK when MII is unavailable.  Stores the result in
 * '*miimon'. */
netdev_linux_get_miimon(const char *name, bool *miimon)
struct mii_ioctl_data data;
memset(&data, 0, sizeof data);
/* First ask which PHY to talk to... */
error = netdev_linux_do_miimon(name, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
/* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
data.reg_num = MII_BMSR;
error = netdev_linux_do_miimon(name, SIOCGMIIREG, "SIOCGMIIREG",
/* BMSR link-status bit reflects carrier. */
*miimon = !!(data.val_out & BMSR_LSTATUS);
VLOG_WARN_RL(&rl, "%s: failed to query MII", name);
struct ethtool_cmd ecmd;
VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
COVERAGE_INC(netdev_get_ethtool);
memset(&ecmd, 0, sizeof ecmd);
error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
struct ethtool_value eval;
/* ETHTOOL_GLINK replies with a struct ethtool_value laid out in the
 * same buffer, so reinterpret the bytes. */
memcpy(&eval, &ecmd, sizeof eval);
*miimon = !!eval.data;
VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
/* Enables (interval > 0) or disables (interval == 0) MII monitoring for
 * 'netdev_', keeping the global count of monitored devices up to date. */
netdev_linux_set_miimon_interval(struct netdev *netdev_,
long long int interval)
struct netdev_linux *netdev = netdev_linux_cast(netdev_);
ovs_mutex_lock(&netdev->mutex);
/* Clamp any positive interval to at least 100 ms; 0 disables polling. */
interval = interval > 0 ? MAX(interval, 100) : 0;
if (netdev->miimon_interval != interval) {
/* Maintain the refcount of devices with miimon enabled. */
if (interval && !netdev->miimon_interval) {
atomic_add(&miimon_cnt, 1, &junk);
} else if (!interval && netdev->miimon_interval) {
atomic_sub(&miimon_cnt, 1, &junk);
netdev->miimon_interval = interval;
/* Force an immediate poll on the next run. */
timer_set_expired(&netdev->miimon_timer);
ovs_mutex_unlock(&netdev->mutex);
/* Polls MII status for every netdev-linux device whose miimon timer has
 * expired, recording carrier changes. */
netdev_linux_miimon_run(void)
struct shash device_shash;
struct shash_node *node;
shash_init(&device_shash);
netdev_get_devices(&netdev_linux_class, &device_shash);
SHASH_FOR_EACH (node, &device_shash) {
struct netdev *netdev = node->data;
struct netdev_linux *dev = netdev_linux_cast(netdev);
ovs_mutex_lock(&dev->mutex);
if (dev->miimon_interval > 0 && timer_expired(&dev->miimon_timer)) {
netdev_linux_get_miimon(dev->up.name, &miimon);
if (miimon != dev->miimon) {
dev->miimon = miimon;
/* Notify listeners that carrier state changed. */
netdev_linux_changed(dev, dev->ifi_flags, 0);
/* Re-arm the timer for the next poll. */
timer_set_duration(&dev->miimon_timer, dev->miimon_interval);
ovs_mutex_unlock(&dev->mutex);
/* netdev_get_devices() took a reference on each device; drop it. */
netdev_close(netdev);
shash_destroy(&device_shash);
1419 netdev_linux_miimon_wait(void)
1421 struct shash device_shash;
1422 struct shash_node *node;
1424 shash_init(&device_shash);
1425 netdev_get_devices(&netdev_linux_class, &device_shash);
1426 SHASH_FOR_EACH (node, &device_shash) {
1427 struct netdev *netdev = node->data;
1428 struct netdev_linux *dev = netdev_linux_cast(netdev);
1430 ovs_mutex_lock(&dev->mutex);
1431 if (dev->miimon_interval > 0) {
1432 timer_wait(&dev->miimon_timer);
1434 ovs_mutex_unlock(&dev->mutex);
1435 netdev_close(netdev);
1437 shash_destroy(&device_shash);
1441 swap_uint64(uint64_t *a, uint64_t *b)
/* Copies 'src' into 'dst', performing format conversion in the process.
 * 'src' is allowed to be misaligned. */
netdev_stats_from_ovs_vport_stats(struct netdev_stats *dst,
const struct ovs_vport_stats *src)
/* The vport stats come from a netlink message and may not be 8-byte
 * aligned, hence the unaligned reads. */
dst->rx_packets = get_unaligned_u64(&src->rx_packets);
dst->tx_packets = get_unaligned_u64(&src->tx_packets);
dst->rx_bytes = get_unaligned_u64(&src->rx_bytes);
dst->tx_bytes = get_unaligned_u64(&src->tx_bytes);
dst->rx_errors = get_unaligned_u64(&src->rx_errors);
dst->tx_errors = get_unaligned_u64(&src->tx_errors);
dst->rx_dropped = get_unaligned_u64(&src->rx_dropped);
dst->tx_dropped = get_unaligned_u64(&src->tx_dropped);
/* The vport layer does not track the finer-grained error counters, so
 * report them as zero rather than leaving them uninitialized. */
dst->collisions = 0;
dst->rx_length_errors = 0;
dst->rx_over_errors = 0;
dst->rx_crc_errors = 0;
dst->rx_frame_errors = 0;
dst->rx_fifo_errors = 0;
dst->rx_missed_errors = 0;
dst->tx_aborted_errors = 0;
dst->tx_carrier_errors = 0;
dst->tx_fifo_errors = 0;
dst->tx_heartbeat_errors = 0;
dst->tx_window_errors = 0;
1479 get_stats_via_vport__(const struct netdev *netdev, struct netdev_stats *stats)
1481 struct dpif_linux_vport reply;
1485 error = dpif_linux_vport_get(netdev_get_name(netdev), &reply, &buf);
1488 } else if (!reply.stats) {
1493 netdev_stats_from_ovs_vport_stats(stats, reply.stats);
/* Attempts to refresh 'stats' from the vport layer, caching the resulting
 * errno in 'netdev->vport_stats_error' so callers can fall back. */
get_stats_via_vport(const struct netdev *netdev_,
struct netdev_stats *stats)
struct netdev_linux *netdev = netdev_linux_cast(netdev_);
/* Retry while no error has been cached yet, or the cache is invalid. */
if (!netdev->vport_stats_error ||
!(netdev->cache_valid & VALID_VPORT_STAT_ERROR)) {
error = get_stats_via_vport__(netdev_, stats);
/* ENOENT simply means the device is not an OVS vport; that is an
 * expected condition, so do not warn about it. */
if (error && error != ENOENT) {
VLOG_WARN_RL(&rl, "%s: obtaining netdev stats via vport failed "
netdev_get_name(netdev_), ovs_strerror(error));
netdev->vport_stats_error = error;
netdev->cache_valid |= VALID_VPORT_STAT_ERROR;
/* Retrieves current device stats for 'netdev-linux'. */
netdev_linux_get_stats(const struct netdev *netdev_,
struct netdev_stats *stats)
struct netdev_linux *netdev = netdev_linux_cast(netdev_);
struct netdev_stats dev_stats;
ovs_mutex_lock(&netdev->mutex);
/* Prefer vport-layer stats; kernel (netlink) stats supplement or replace
 * them depending on whether the vport lookup succeeded. */
get_stats_via_vport(netdev_, stats);
error = get_stats_via_netlink(netdev_, &dev_stats);
if (!netdev->vport_stats_error) {
} else if (netdev->vport_stats_error) {
/* stats not available from OVS then use ioctl stats. */
/* Vport stats succeeded: fold in only the error counters that the
 * vport layer does not track itself. */
stats->rx_errors += dev_stats.rx_errors;
stats->tx_errors += dev_stats.tx_errors;
stats->rx_dropped += dev_stats.rx_dropped;
stats->tx_dropped += dev_stats.tx_dropped;
stats->multicast += dev_stats.multicast;
stats->collisions += dev_stats.collisions;
stats->rx_length_errors += dev_stats.rx_length_errors;
stats->rx_over_errors += dev_stats.rx_over_errors;
stats->rx_crc_errors += dev_stats.rx_crc_errors;
stats->rx_frame_errors += dev_stats.rx_frame_errors;
stats->rx_fifo_errors += dev_stats.rx_fifo_errors;
stats->rx_missed_errors += dev_stats.rx_missed_errors;
stats->tx_aborted_errors += dev_stats.tx_aborted_errors;
stats->tx_carrier_errors += dev_stats.tx_carrier_errors;
stats->tx_fifo_errors += dev_stats.tx_fifo_errors;
stats->tx_heartbeat_errors += dev_stats.tx_heartbeat_errors;
stats->tx_window_errors += dev_stats.tx_window_errors;
ovs_mutex_unlock(&netdev->mutex);
/* Retrieves current device stats for 'netdev-tap' netdev or
 * netdev-internal. */
netdev_tap_get_stats(const struct netdev *netdev_, struct netdev_stats *stats)
struct netdev_linux *netdev = netdev_linux_cast(netdev_);
struct netdev_stats dev_stats;
ovs_mutex_lock(&netdev->mutex);
get_stats_via_vport(netdev_, stats);
error = get_stats_via_netlink(netdev_, &dev_stats);
if (!netdev->vport_stats_error) {
} else if (netdev->vport_stats_error) {
/* Transmit and receive stats will appear to be swapped relative to the
 * other ports since we are the one sending the data, not a remote
 * computer. For consistency, we swap them back here. This does not
 * apply if we are getting stats from the vport layer because it always
 * tracks stats from the perspective of the switch. */
swap_uint64(&stats->rx_packets, &stats->tx_packets);
swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
swap_uint64(&stats->rx_errors, &stats->tx_errors);
swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
/* Fine-grained error counters are meaningless after the rx/tx swap, so
 * zero them rather than reporting them on the wrong side. */
stats->rx_length_errors = 0;
stats->rx_over_errors = 0;
stats->rx_crc_errors = 0;
stats->rx_frame_errors = 0;
stats->rx_fifo_errors = 0;
stats->rx_missed_errors = 0;
stats->tx_aborted_errors = 0;
stats->tx_carrier_errors = 0;
stats->tx_fifo_errors = 0;
stats->tx_heartbeat_errors = 0;
stats->tx_window_errors = 0;
/* Vport stats succeeded: add kernel counters with rx/tx crossed for the
 * same perspective reason described above. */
stats->rx_dropped += dev_stats.tx_dropped;
stats->tx_dropped += dev_stats.rx_dropped;
stats->rx_errors += dev_stats.tx_errors;
stats->tx_errors += dev_stats.rx_errors;
stats->multicast += dev_stats.multicast;
stats->collisions += dev_stats.collisions;
ovs_mutex_unlock(&netdev->mutex);
1619 netdev_internal_get_stats(const struct netdev *netdev_,
1620 struct netdev_stats *stats)
1622 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1625 ovs_mutex_lock(&netdev->mutex);
1626 get_stats_via_vport(netdev_, stats);
1627 error = netdev->vport_stats_error;
1628 ovs_mutex_unlock(&netdev->mutex);
/* Pushes 'stats' down to the vport layer for an internal device via an
 * OVS_VPORT_CMD_SET netlink transaction. */
netdev_internal_set_stats(struct netdev *netdev,
const struct netdev_stats *stats)
struct ovs_vport_stats vport_stats;
struct dpif_linux_vport vport;
/* Convert the netdev counters to the vport wire format. */
vport_stats.rx_packets = stats->rx_packets;
vport_stats.tx_packets = stats->tx_packets;
vport_stats.rx_bytes = stats->rx_bytes;
vport_stats.tx_bytes = stats->tx_bytes;
vport_stats.rx_errors = stats->rx_errors;
vport_stats.tx_errors = stats->tx_errors;
vport_stats.rx_dropped = stats->rx_dropped;
vport_stats.tx_dropped = stats->tx_dropped;
dpif_linux_vport_init(&vport);
vport.cmd = OVS_VPORT_CMD_SET;
vport.name = netdev_get_name(netdev);
vport.stats = &vport_stats;
err = dpif_linux_vport_transact(&vport, NULL, NULL);
/* If the vport layer doesn't know about the device, that doesn't mean it
 * doesn't exist (after all were able to open it when netdev_open() was
 * called), it just means that it isn't attached and we'll be getting
 * stats a different way. */
if (err == ENODEV) {
/* Queries link features via ETHTOOL_GSET and caches the supported,
 * advertised, and current NETDEV_F_* bitmaps plus any errno in 'netdev'.
 * Caller must hold 'netdev->mutex'. */
netdev_linux_read_features(struct netdev_linux *netdev)
struct ethtool_cmd ecmd;
/* Results are cached; only the first call issues the ioctl. */
if (netdev->cache_valid & VALID_FEATURES) {
COVERAGE_INC(netdev_get_ethtool);
memset(&ecmd, 0, sizeof ecmd);
error = netdev_linux_do_ethtool(netdev->up.name, &ecmd,
ETHTOOL_GSET, "ETHTOOL_GSET");
/* Supported features. */
netdev->supported = 0;
if (ecmd.supported & SUPPORTED_10baseT_Half) {
netdev->supported |= NETDEV_F_10MB_HD;
if (ecmd.supported & SUPPORTED_10baseT_Full) {
netdev->supported |= NETDEV_F_10MB_FD;
if (ecmd.supported & SUPPORTED_100baseT_Half) {
netdev->supported |= NETDEV_F_100MB_HD;
if (ecmd.supported & SUPPORTED_100baseT_Full) {
netdev->supported |= NETDEV_F_100MB_FD;
if (ecmd.supported & SUPPORTED_1000baseT_Half) {
netdev->supported |= NETDEV_F_1GB_HD;
if (ecmd.supported & SUPPORTED_1000baseT_Full) {
netdev->supported |= NETDEV_F_1GB_FD;
if (ecmd.supported & SUPPORTED_10000baseT_Full) {
netdev->supported |= NETDEV_F_10GB_FD;
if (ecmd.supported & SUPPORTED_TP) {
netdev->supported |= NETDEV_F_COPPER;
if (ecmd.supported & SUPPORTED_FIBRE) {
netdev->supported |= NETDEV_F_FIBER;
if (ecmd.supported & SUPPORTED_Autoneg) {
netdev->supported |= NETDEV_F_AUTONEG;
if (ecmd.supported & SUPPORTED_Pause) {
netdev->supported |= NETDEV_F_PAUSE;
if (ecmd.supported & SUPPORTED_Asym_Pause) {
netdev->supported |= NETDEV_F_PAUSE_ASYM;
/* Advertised features. */
netdev->advertised = 0;
if (ecmd.advertising & ADVERTISED_10baseT_Half) {
netdev->advertised |= NETDEV_F_10MB_HD;
if (ecmd.advertising & ADVERTISED_10baseT_Full) {
netdev->advertised |= NETDEV_F_10MB_FD;
if (ecmd.advertising & ADVERTISED_100baseT_Half) {
netdev->advertised |= NETDEV_F_100MB_HD;
if (ecmd.advertising & ADVERTISED_100baseT_Full) {
netdev->advertised |= NETDEV_F_100MB_FD;
if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
netdev->advertised |= NETDEV_F_1GB_HD;
if (ecmd.advertising & ADVERTISED_1000baseT_Full) {
netdev->advertised |= NETDEV_F_1GB_FD;
if (ecmd.advertising & ADVERTISED_10000baseT_Full) {
netdev->advertised |= NETDEV_F_10GB_FD;
if (ecmd.advertising & ADVERTISED_TP) {
netdev->advertised |= NETDEV_F_COPPER;
if (ecmd.advertising & ADVERTISED_FIBRE) {
netdev->advertised |= NETDEV_F_FIBER;
if (ecmd.advertising & ADVERTISED_Autoneg) {
netdev->advertised |= NETDEV_F_AUTONEG;
if (ecmd.advertising & ADVERTISED_Pause) {
netdev->advertised |= NETDEV_F_PAUSE;
if (ecmd.advertising & ADVERTISED_Asym_Pause) {
netdev->advertised |= NETDEV_F_PAUSE_ASYM;
/* Current settings. */
if (speed == SPEED_10) {
netdev->current = ecmd.duplex ? NETDEV_F_10MB_FD : NETDEV_F_10MB_HD;
} else if (speed == SPEED_100) {
netdev->current = ecmd.duplex ? NETDEV_F_100MB_FD : NETDEV_F_100MB_HD;
} else if (speed == SPEED_1000) {
netdev->current = ecmd.duplex ? NETDEV_F_1GB_FD : NETDEV_F_1GB_HD;
} else if (speed == SPEED_10000) {
netdev->current = NETDEV_F_10GB_FD;
/* Raw literals below: presumably SPEED_40000 etc. are not defined in
 * the oldest supported kernel headers -- TODO confirm before replacing
 * them with the named macros. */
} else if (speed == 40000) {
netdev->current = NETDEV_F_40GB_FD;
} else if (speed == 100000) {
netdev->current = NETDEV_F_100GB_FD;
} else if (speed == 1000000) {
netdev->current = NETDEV_F_1TB_FD;
netdev->current = 0;
if (ecmd.port == PORT_TP) {
netdev->current |= NETDEV_F_COPPER;
} else if (ecmd.port == PORT_FIBRE) {
netdev->current |= NETDEV_F_FIBER;
netdev->current |= NETDEV_F_AUTONEG;
/* Cache the outcome, including any ioctl error. */
netdev->cache_valid |= VALID_FEATURES;
netdev->get_features_error = error;
/* Stores the features supported by 'netdev' into of '*current', '*advertised',
 * '*supported', and '*peer'. Each value is a bitmap of NETDEV_* bits.
 * Returns 0 if successful, otherwise a positive errno value. */
netdev_linux_get_features(const struct netdev *netdev_,
enum netdev_features *current,
enum netdev_features *advertised,
enum netdev_features *supported,
enum netdev_features *peer)
struct netdev_linux *netdev = netdev_linux_cast(netdev_);
ovs_mutex_lock(&netdev->mutex);
/* Populates the cached feature bitmaps (no-op if already cached). */
netdev_linux_read_features(netdev);
if (!netdev->get_features_error) {
*current = netdev->current;
*advertised = netdev->advertised;
*supported = netdev->supported;
*peer = 0; /* XXX */
error = netdev->get_features_error;
ovs_mutex_unlock(&netdev->mutex);
/* Set the features advertised by 'netdev' to 'advertise'. */
netdev_linux_set_advertisements(struct netdev *netdev_,
enum netdev_features advertise)
struct netdev_linux *netdev = netdev_linux_cast(netdev_);
struct ethtool_cmd ecmd;
ovs_mutex_lock(&netdev->mutex);
/* Read-modify-write: fetch the current ethtool settings so that only the
 * advertising mask changes in the subsequent ETHTOOL_SSET. */
COVERAGE_INC(netdev_get_ethtool);
memset(&ecmd, 0, sizeof ecmd);
error = netdev_linux_do_ethtool(netdev_get_name(netdev_), &ecmd,
ETHTOOL_GSET, "ETHTOOL_GSET");
/* Translate NETDEV_F_* bits to the kernel's ADVERTISED_* bits. */
ecmd.advertising = 0;
if (advertise & NETDEV_F_10MB_HD) {
ecmd.advertising |= ADVERTISED_10baseT_Half;
if (advertise & NETDEV_F_10MB_FD) {
ecmd.advertising |= ADVERTISED_10baseT_Full;
if (advertise & NETDEV_F_100MB_HD) {
ecmd.advertising |= ADVERTISED_100baseT_Half;
if (advertise & NETDEV_F_100MB_FD) {
ecmd.advertising |= ADVERTISED_100baseT_Full;
if (advertise & NETDEV_F_1GB_HD) {
ecmd.advertising |= ADVERTISED_1000baseT_Half;
if (advertise & NETDEV_F_1GB_FD) {
ecmd.advertising |= ADVERTISED_1000baseT_Full;
if (advertise & NETDEV_F_10GB_FD) {
ecmd.advertising |= ADVERTISED_10000baseT_Full;
if (advertise & NETDEV_F_COPPER) {
ecmd.advertising |= ADVERTISED_TP;
if (advertise & NETDEV_F_FIBER) {
ecmd.advertising |= ADVERTISED_FIBRE;
if (advertise & NETDEV_F_AUTONEG) {
ecmd.advertising |= ADVERTISED_Autoneg;
if (advertise & NETDEV_F_PAUSE) {
ecmd.advertising |= ADVERTISED_Pause;
if (advertise & NETDEV_F_PAUSE_ASYM) {
ecmd.advertising |= ADVERTISED_Asym_Pause;
COVERAGE_INC(netdev_set_ethtool);
error = netdev_linux_do_ethtool(netdev_get_name(netdev_), &ecmd,
ETHTOOL_SSET, "ETHTOOL_SSET");
ovs_mutex_unlock(&netdev->mutex);
/* Attempts to set input rate limiting (policing) policy. Returns 0 if
 * successful, otherwise a positive errno value. */
netdev_linux_set_policing(struct netdev *netdev_,
uint32_t kbits_rate, uint32_t kbits_burst)
struct netdev_linux *netdev = netdev_linux_cast(netdev_);
const char *netdev_name = netdev_get_name(netdev_);
kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
: !kbits_burst ? 1000 /* Default to 1000 kbits if 0. */
: kbits_burst); /* Stick with user-specified value. */
ovs_mutex_lock(&netdev->mutex);
/* Skip the tc round-trip if the requested policy matches the cached
 * one (or a previous attempt already failed). */
if (netdev->cache_valid & VALID_POLICING) {
error = netdev->netdev_policing_error;
if (error || (netdev->kbits_rate == kbits_rate &&
netdev->kbits_burst == kbits_burst)) {
/* Assume that settings haven't changed since we last set them. */
netdev->cache_valid &= ~VALID_POLICING;
COVERAGE_INC(netdev_set_policing);
/* Remove any existing ingress qdisc. */
error = tc_add_del_ingress_qdisc(netdev_, false);
VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
netdev_name, ovs_strerror(error));
/* Re-add the ingress qdisc, then attach the policer to it. */
error = tc_add_del_ingress_qdisc(netdev_, true);
VLOG_WARN_RL(&rl, "%s: adding policing qdisc failed: %s",
netdev_name, ovs_strerror(error));
error = tc_add_policer(netdev_, kbits_rate, kbits_burst);
VLOG_WARN_RL(&rl, "%s: adding policing action failed: %s",
netdev_name, ovs_strerror(error));
netdev->kbits_rate = kbits_rate;
netdev->kbits_burst = kbits_burst;
/* Cache the outcome; ENODEV is cached like success so a vanished device
 * does not get retried on every call. */
if (!error || error == ENODEV) {
netdev->netdev_policing_error = error;
netdev->cache_valid |= VALID_POLICING;
ovs_mutex_unlock(&netdev->mutex);
1955 netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
1958 const struct tc_ops *const *opsp;
1960 for (opsp = tcs; *opsp != NULL; opsp++) {
1961 const struct tc_ops *ops = *opsp;
1962 if (ops->tc_install && ops->ovs_name[0] != '\0') {
1963 sset_add(types, ops->ovs_name);
1969 static const struct tc_ops *
1970 tc_lookup_ovs_name(const char *name)
1972 const struct tc_ops *const *opsp;
1974 for (opsp = tcs; *opsp != NULL; opsp++) {
1975 const struct tc_ops *ops = *opsp;
1976 if (!strcmp(name, ops->ovs_name)) {
1983 static const struct tc_ops *
1984 tc_lookup_linux_name(const char *name)
1986 const struct tc_ops *const *opsp;
1988 for (opsp = tcs; *opsp != NULL; opsp++) {
1989 const struct tc_ops *ops = *opsp;
1990 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
1997 static struct tc_queue *
1998 tc_find_queue__(const struct netdev *netdev_, unsigned int queue_id,
2001 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2002 struct tc_queue *queue;
2004 HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev->tc->queues) {
2005 if (queue->queue_id == queue_id) {
2012 static struct tc_queue *
2013 tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
2015 return tc_find_queue__(netdev, queue_id, hash_int(queue_id, 0));
2019 netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
2021 struct netdev_qos_capabilities *caps)
2023 const struct tc_ops *ops = tc_lookup_ovs_name(type);
2027 caps->n_queues = ops->n_queues;
/* Reports the currently installed QoS type and its configuration details. */
netdev_linux_get_qos(const struct netdev *netdev_,
const char **typep, struct smap *details)
struct netdev_linux *netdev = netdev_linux_cast(netdev_);
ovs_mutex_lock(&netdev->mutex);
/* Make sure 'netdev->tc' reflects the kernel's installed qdisc. */
error = tc_query_qdisc(netdev_);
*typep = netdev->tc->ops->ovs_name;
/* qdisc_get is optional; types without parameters simply report none. */
error = (netdev->tc->ops->qdisc_get
? netdev->tc->ops->qdisc_get(netdev_, details)
ovs_mutex_unlock(&netdev->mutex);
/* Installs QoS type 'type' with configuration 'details' on 'netdev_',
 * replacing any existing qdisc of a different type. */
netdev_linux_set_qos(struct netdev *netdev_,
const char *type, const struct smap *details)
struct netdev_linux *netdev = netdev_linux_cast(netdev_);
const struct tc_ops *new_ops;
/* Reject unknown or non-installable QoS types up front. */
new_ops = tc_lookup_ovs_name(type);
if (!new_ops || !new_ops->tc_install) {
ovs_mutex_lock(&netdev->mutex);
error = tc_query_qdisc(netdev_);
if (new_ops == netdev->tc->ops) {
/* Same type already installed: just update its parameters. */
error = new_ops->qdisc_set ? new_ops->qdisc_set(netdev_, details) : 0;
/* Delete existing qdisc. */
error = tc_del_qdisc(netdev_);
ovs_assert(netdev->tc == NULL);
/* Install new qdisc. */
error = new_ops->tc_install(netdev_, details);
ovs_assert((error == 0) == (netdev->tc != NULL));
ovs_mutex_unlock(&netdev->mutex);
/* Retrieves the configuration of queue 'queue_id' into 'details'. */
netdev_linux_get_queue(const struct netdev *netdev_,
unsigned int queue_id, struct smap *details)
struct netdev_linux *netdev = netdev_linux_cast(netdev_);
ovs_mutex_lock(&netdev->mutex);
error = tc_query_qdisc(netdev_);
/* Delegate to the installed QoS implementation's class_get, if the
 * queue exists. */
struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
? netdev->tc->ops->class_get(netdev_, queue, details)
ovs_mutex_unlock(&netdev->mutex);
2111 netdev_linux_set_queue(struct netdev *netdev_,
2112 unsigned int queue_id, const struct smap *details)
2114 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2117 ovs_mutex_lock(&netdev->mutex);
2118 error = tc_query_qdisc(netdev_);
2120 error = (queue_id < netdev->tc->ops->n_queues
2121 && netdev->tc->ops->class_set
2122 ? netdev->tc->ops->class_set(netdev_, queue_id, details)
2125 ovs_mutex_unlock(&netdev->mutex);
/* Deletes queue 'queue_id' from the installed qdisc, if the QoS type
 * supports deletion. */
netdev_linux_delete_queue(struct netdev *netdev_, unsigned int queue_id)
struct netdev_linux *netdev = netdev_linux_cast(netdev_);
ovs_mutex_lock(&netdev->mutex);
error = tc_query_qdisc(netdev_);
if (netdev->tc->ops->class_delete) {
struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
? netdev->tc->ops->class_delete(netdev_, queue)
ovs_mutex_unlock(&netdev->mutex);
/* Retrieves statistics for queue 'queue_id' into 'stats'. */
netdev_linux_get_queue_stats(const struct netdev *netdev_,
unsigned int queue_id,
struct netdev_queue_stats *stats)
struct netdev_linux *netdev = netdev_linux_cast(netdev_);
ovs_mutex_lock(&netdev->mutex);
error = tc_query_qdisc(netdev_);
if (netdev->tc->ops->class_get_stats) {
const struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
/* The creation time comes from our bookkeeping, not the kernel. */
stats->created = queue->created;
error = netdev->tc->ops->class_get_stats(netdev_, queue,
ovs_mutex_unlock(&netdev->mutex);
2183 start_queue_dump(const struct netdev *netdev, struct nl_dump *dump)
2185 struct ofpbuf request;
2186 struct tcmsg *tcmsg;
2188 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
2192 tcmsg->tcm_parent = 0;
2193 nl_dump_start(dump, NETLINK_ROUTE, &request);
2194 ofpbuf_uninit(&request);
2198 struct netdev_linux_queue_state {
2199 unsigned int *queues;
/* Starts a queue dump: snapshots all queue ids into a freshly allocated
 * netdev_linux_queue_state stored in '*statep'. */
netdev_linux_queue_dump_start(const struct netdev *netdev_, void **statep)
const struct netdev_linux *netdev = netdev_linux_cast(netdev_);
ovs_mutex_lock(&netdev->mutex);
error = tc_query_qdisc(netdev_);
if (netdev->tc->ops->class_get) {
struct netdev_linux_queue_state *state;
struct tc_queue *queue;
*statep = state = xmalloc(sizeof *state);
state->n_queues = hmap_count(&netdev->tc->queues);
state->cur_queue = 0;
state->queues = xmalloc(state->n_queues * sizeof *state->queues);
/* Copy the ids under the mutex; dump_next re-validates each one. */
HMAP_FOR_EACH (queue, hmap_node, &netdev->tc->queues) {
state->queues[i++] = queue->queue_id;
ovs_mutex_unlock(&netdev->mutex);
/* Advances a queue dump: yields the next still-existing queue id and its
 * details.  Queues deleted since dump_start are silently skipped. */
netdev_linux_queue_dump_next(const struct netdev *netdev_, void *state_,
unsigned int *queue_idp, struct smap *details)
const struct netdev_linux *netdev = netdev_linux_cast(netdev_);
struct netdev_linux_queue_state *state = state_;
ovs_mutex_lock(&netdev->mutex);
while (state->cur_queue < state->n_queues) {
unsigned int queue_id = state->queues[state->cur_queue++];
/* Re-validate: the queue may have been deleted since the snapshot. */
struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
*queue_idp = queue_id;
error = netdev->tc->ops->class_get(netdev_, queue, details);
ovs_mutex_unlock(&netdev->mutex);
2261 netdev_linux_queue_dump_done(const struct netdev *netdev OVS_UNUSED,
2264 struct netdev_linux_queue_state *state = state_;
2266 free(state->queues);
/* Invokes 'cb' with statistics for each queue, obtained via a netlink
 * RTM_GETTCLASS dump. */
netdev_linux_dump_queue_stats(const struct netdev *netdev_,
netdev_dump_queue_stats_cb *cb, void *aux)
struct netdev_linux *netdev = netdev_linux_cast(netdev_);
ovs_mutex_lock(&netdev->mutex);
error = tc_query_qdisc(netdev_);
struct nl_dump dump;
if (!netdev->tc->ops->class_dump_stats) {
} else if (!start_queue_dump(netdev_, &dump)) {
/* Parse each class message, feeding stats to the callback. */
while (nl_dump_next(&dump, &msg)) {
retval = netdev->tc->ops->class_dump_stats(netdev_, &msg,
retval = nl_dump_done(&dump);
ovs_mutex_unlock(&netdev->mutex);
/* Retrieves the IPv4 address and netmask assigned to 'netdev_'.  Returns
 * EADDRNOTAVAIL when no address is assigned. */
netdev_linux_get_in4(const struct netdev *netdev_,
struct in_addr *address, struct in_addr *netmask)
struct netdev_linux *netdev = netdev_linux_cast(netdev_);
ovs_mutex_lock(&netdev->mutex);
/* Fill the cache with both the address and the netmask on first use. */
if (!(netdev->cache_valid & VALID_IN4)) {
error = netdev_linux_get_ipv4(netdev_, &netdev->address,
SIOCGIFADDR, "SIOCGIFADDR");
error = netdev_linux_get_ipv4(netdev_, &netdev->netmask,
SIOCGIFNETMASK, "SIOCGIFNETMASK");
netdev->cache_valid |= VALID_IN4;
/* INADDR_ANY in the cache means "no address assigned". */
if (netdev->address.s_addr != INADDR_ANY) {
*address = netdev->address;
*netmask = netdev->netmask;
error = EADDRNOTAVAIL;
ovs_mutex_unlock(&netdev->mutex);
/* Assigns 'address'/'netmask' to 'netdev_' via SIOCSIFADDR/SIOCSIFNETMASK
 * and updates the in-memory cache. */
netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
struct in_addr netmask)
struct netdev_linux *netdev = netdev_linux_cast(netdev_);
ovs_mutex_lock(&netdev->mutex);
error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
netdev->cache_valid |= VALID_IN4;
netdev->address = address;
netdev->netmask = netmask;
/* Setting the netmask only makes sense with a real address; INADDR_ANY
 * effectively clears the interface address. */
if (address.s_addr != INADDR_ANY) {
error = do_set_addr(netdev_, SIOCSIFNETMASK,
"SIOCSIFNETMASK", netmask);
ovs_mutex_unlock(&netdev->mutex);
/* Parses one line of /proc/net/if_inet6: 32 hex digits of address, four
 * fields we ignore, then the interface name.  Stores the address in '*in6'
 * and the name in 'ifname'; returns whether parsing succeeded. */
parse_if_inet6_line(const char *line,
struct in6_addr *in6, char ifname[16 + 1])
uint8_t *s6 = in6->s6_addr;
#define X8 "%2"SCNx8
return ovs_scan(line,
" "X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8
"%*x %*x %*x %*x %16s\n",
&s6[0], &s6[1], &s6[2], &s6[3],
&s6[4], &s6[5], &s6[6], &s6[7],
&s6[8], &s6[9], &s6[10], &s6[11],
&s6[12], &s6[13], &s6[14], &s6[15],
/* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address (if
 * 'in6' is non-null) and returns true. Otherwise, returns false. */
netdev_linux_get_in6(const struct netdev *netdev_, struct in6_addr *in6)
struct netdev_linux *netdev = netdev_linux_cast(netdev_);
ovs_mutex_lock(&netdev->mutex);
if (!(netdev->cache_valid & VALID_IN6)) {
/* Default to "no address" before scanning the proc file. */
netdev->in6 = in6addr_any;
file = fopen("/proc/net/if_inet6", "r");
const char *name = netdev_get_name(netdev_);
while (fgets(line, sizeof line, file)) {
struct in6_addr in6_tmp;
char ifname[16 + 1];
/* Only keep addresses whose line names this interface. */
if (parse_if_inet6_line(line, &in6_tmp, ifname)
&& !strcmp(name, ifname))
netdev->in6 = in6_tmp;
netdev->cache_valid |= VALID_IN6;
ovs_mutex_unlock(&netdev->mutex);
/* Writes an AF_INET sockaddr for 'addr' (port 0) into '*sa', zeroing any
 * trailing bytes of the generic sockaddr. */
static void
make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
{
    struct sockaddr_in sin;

    memset(&sin, 0, sizeof sin);
    sin.sin_family = AF_INET;
    sin.sin_addr = addr;
    sin.sin_port = 0;

    /* Zero the whole generic sockaddr first, then overlay the IPv4 form. */
    memset(sa, 0, sizeof *sa);
    memcpy(sa, &sin, sizeof sin);
}
2435 do_set_addr(struct netdev *netdev,
2436 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
2440 make_in4_sockaddr(&ifr.ifr_addr, addr);
2441 return af_inet_ifreq_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
/* Adds 'router' as a default IP gateway. */
netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
struct in_addr any = { INADDR_ANY };
/* Build a default route (dst 0.0.0.0/0) through 'router' and install it
 * with the legacy SIOCADDRT routing ioctl. */
memset(&rt, 0, sizeof rt);
make_in4_sockaddr(&rt.rt_dst, any);
make_in4_sockaddr(&rt.rt_gateway, router);
make_in4_sockaddr(&rt.rt_genmask, any);
rt.rt_flags = RTF_UP | RTF_GATEWAY;
error = af_inet_ioctl(SIOCADDRT, &rt);
VLOG_WARN("ioctl(SIOCADDRT): %s", ovs_strerror(error));
/* Determines the next hop toward 'host' by scanning /proc/net/route: stores
 * the gateway (or 0 if directly reachable) in '*next_hop' and the xstrdup'd
 * output interface name in '*netdev_name'. */
netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
static const char fn[] = "/proc/net/route";
*netdev_name = NULL;
stream = fopen(fn, "r");
if (stream == NULL) {
VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, ovs_strerror(errno));
/* One route per line; the first line is a header (skipped by the
 * surrounding loop bookkeeping). */
while (fgets(line, sizeof line, stream)) {
ovs_be32 dest, gateway, mask;
int refcnt, metric, mtu;
unsigned int flags, use, window, irtt;
"%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
iface, &dest, &gateway, &flags, &refcnt,
&use, &metric, &mask, &mtu, &window, &irtt)) {
VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
if (!(flags & RTF_UP)) {
/* Skip routes that aren't up. */
/* The output of 'dest', 'mask', and 'gateway' were given in
 * network byte order, so we don't need need any endian
 * conversions here. */
if ((dest & mask) == (host->s_addr & mask)) {
/* The host is directly reachable. */
next_hop->s_addr = 0;
/* To reach the host, we must go through a gateway. */
next_hop->s_addr = gateway;
*netdev_name = xstrdup(iface);
/* Fills 'smap' with driver name/version/firmware information obtained from
 * ETHTOOL_GDRVINFO (cached after the first successful query). */
netdev_linux_get_status(const struct netdev *netdev_, struct smap *smap)
struct netdev_linux *netdev = netdev_linux_cast(netdev_);
ovs_mutex_lock(&netdev->mutex);
if (!(netdev->cache_valid & VALID_DRVINFO)) {
/* netdev_linux_do_ethtool() takes an ethtool_cmd, so alias the
 * drvinfo buffer accordingly for the call. */
struct ethtool_cmd *cmd = (struct ethtool_cmd *) &netdev->drvinfo;
COVERAGE_INC(netdev_get_ethtool);
memset(&netdev->drvinfo, 0, sizeof netdev->drvinfo);
error = netdev_linux_do_ethtool(netdev->up.name,
"ETHTOOL_GDRVINFO");
netdev->cache_valid |= VALID_DRVINFO;
smap_add(smap, "driver_name", netdev->drvinfo.driver);
smap_add(smap, "driver_version", netdev->drvinfo.version);
smap_add(smap, "firmware_version", netdev->drvinfo.fw_version);
ovs_mutex_unlock(&netdev->mutex);
2557 netdev_internal_get_status(const struct netdev *netdev OVS_UNUSED,
2560 smap_add(smap, "driver_name", "openvswitch");
2564 /* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
2565 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
2566 * returns 0. Otherwise, it returns a positive errno value; in particular,
2567 * ENXIO indicates that there is no ARP table entry for 'ip' on 'netdev'. */
2569 netdev_linux_arp_lookup(const struct netdev *netdev,
2570 ovs_be32 ip, uint8_t mac[ETH_ADDR_LEN])
2573 struct sockaddr_in sin;
2576 memset(&r, 0, sizeof r);
2577 memset(&sin, 0, sizeof sin);
2578 sin.sin_family = AF_INET;
2579 sin.sin_addr.s_addr = ip;
/* arp_pa is a generic struct sockaddr; copy the AF_INET form into it. */
2581 memcpy(&r.arp_pa, &sin, sizeof sin);
2582 r.arp_ha.sa_family = ARPHRD_ETHER;
2584 ovs_strzcpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
2585 COVERAGE_INC(netdev_arp_lookup);
2586 retval = af_inet_ioctl(SIOCGARP, &r);
2588 memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
/* ENXIO ("no such entry") is an expected outcome, so don't log it. */
2589 } else if (retval != ENXIO) {
2590 VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
2591 netdev_get_name(netdev), IP_ARGS(ip),
2592 ovs_strerror(retval));
/* Converts netdev flag bits (NETDEV_*) to the kernel's IFF_* bits. */
2598 nd_to_iff_flags(enum netdev_flags nd)
2601 if (nd & NETDEV_UP) {
2604 if (nd & NETDEV_PROMISC) {
2607 if (nd & NETDEV_LOOPBACK) {
2608 iff |= IFF_LOOPBACK;
/* Converts kernel IFF_* bits to netdev flag bits.  Inverse of
 * nd_to_iff_flags() for the flags that OVS models. */
2614 iff_to_nd_flags(int iff)
2616 enum netdev_flags nd = 0;
2620 if (iff & IFF_PROMISC) {
2621 nd |= NETDEV_PROMISC;
2623 if (iff & IFF_LOOPBACK) {
2624 nd |= NETDEV_LOOPBACK;
/* Turns 'off' flags off and 'on' flags on, reporting the previous flags in
 * '*old_flagsp'.  Works on the cached ifi_flags and only calls into the
 * kernel when the computed flag word actually changes; the cache is
 * re-read afterward so it tracks what the kernel accepted. */
2630 update_flags(struct netdev_linux *netdev, enum netdev_flags off,
2631 enum netdev_flags on, enum netdev_flags *old_flagsp)
2632 OVS_REQUIRES(netdev->mutex)
2634 int old_flags, new_flags;
2637 old_flags = netdev->ifi_flags;
2638 *old_flagsp = iff_to_nd_flags(old_flags);
2639 new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
2640 if (new_flags != old_flags) {
2641 error = set_flags(netdev_get_name(&netdev->up), new_flags);
2642 get_flags(&netdev->up, &netdev->ifi_flags);
/* netdev-provider entry point: mutex-taking wrapper around update_flags(). */
2649 netdev_linux_update_flags(struct netdev *netdev_, enum netdev_flags off,
2650 enum netdev_flags on, enum netdev_flags *old_flagsp)
2652 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2655 ovs_mutex_lock(&netdev->mutex);
2656 error = update_flags(netdev, off, on, old_flagsp);
2657 ovs_mutex_unlock(&netdev->mutex);
/* Expands to a struct netdev_class initializer.  The parameters supply the
 * per-class name, constructor, stats hooks, feature getter, and status
 * getter; every other member is the shared netdev_linux_* implementation
 * (or NULL where the operation does not apply to Linux system devices).
 * NOTE(review): comments must not be inserted between the backslash-continued
 * lines below, so the macro body is left exactly as-is. */
2662 #define NETDEV_LINUX_CLASS(NAME, CONSTRUCT, GET_STATS, SET_STATS, \
2663 GET_FEATURES, GET_STATUS) \
2669 netdev_linux_wait, \
2671 netdev_linux_alloc, \
2673 netdev_linux_destruct, \
2674 netdev_linux_dealloc, \
2675 NULL, /* get_config */ \
2676 NULL, /* set_config */ \
2677 NULL, /* get_tunnel_config */ \
2679 netdev_linux_send, \
2680 netdev_linux_send_wait, \
2682 netdev_linux_set_etheraddr, \
2683 netdev_linux_get_etheraddr, \
2684 netdev_linux_get_mtu, \
2685 netdev_linux_set_mtu, \
2686 netdev_linux_get_ifindex, \
2687 netdev_linux_get_carrier, \
2688 netdev_linux_get_carrier_resets, \
2689 netdev_linux_set_miimon_interval, \
2694 netdev_linux_set_advertisements, \
2696 netdev_linux_set_policing, \
2697 netdev_linux_get_qos_types, \
2698 netdev_linux_get_qos_capabilities, \
2699 netdev_linux_get_qos, \
2700 netdev_linux_set_qos, \
2701 netdev_linux_get_queue, \
2702 netdev_linux_set_queue, \
2703 netdev_linux_delete_queue, \
2704 netdev_linux_get_queue_stats, \
2705 netdev_linux_queue_dump_start, \
2706 netdev_linux_queue_dump_next, \
2707 netdev_linux_queue_dump_done, \
2708 netdev_linux_dump_queue_stats, \
2710 netdev_linux_get_in4, \
2711 netdev_linux_set_in4, \
2712 netdev_linux_get_in6, \
2713 netdev_linux_add_router, \
2714 netdev_linux_get_next_hop, \
2716 netdev_linux_arp_lookup, \
2718 netdev_linux_update_flags, \
2720 netdev_linux_rx_alloc, \
2721 netdev_linux_rx_construct, \
2722 netdev_linux_rx_destruct, \
2723 netdev_linux_rx_dealloc, \
2724 netdev_linux_rx_recv, \
2725 netdev_linux_rx_wait, \
2726 netdev_linux_rx_drain, \
/* "system" devices: real kernel network devices. */
2729 const struct netdev_class netdev_linux_class =
2732 netdev_linux_construct,
2733 netdev_linux_get_stats,
2734 NULL, /* set_stats */
2735 netdev_linux_get_features,
2736 netdev_linux_get_status);
/* "tap" devices: tun/tap devices with a dedicated stats implementation. */
2738 const struct netdev_class netdev_tap_class =
2741 netdev_linux_construct_tap,
2742 netdev_tap_get_stats,
2743 NULL, /* set_stats */
2744 netdev_linux_get_features,
2745 netdev_linux_get_status);
/* "internal" devices: OVS-created ports; stats are settable and there is no
 * meaningful link-feature set to report. */
2747 const struct netdev_class netdev_internal_class =
2750 netdev_linux_construct,
2751 netdev_internal_get_stats,
2752 netdev_internal_set_stats,
2753 NULL, /* get_features */
2754 netdev_internal_get_status);
2756 /* HTB traffic control class. */
/* Queue ids 0..HTB_N_QUEUES-1 map to tc minor numbers 1..HTB_N_QUEUES. */
2758 #define HTB_N_QUEUES 0xf000
/* Qdisc-level state: the link's configured ceiling. */
2762 unsigned int max_rate; /* In bytes/s. */
/* Per-queue (tc class) state. */
2766 struct tc_queue tc_queue;
2767 unsigned int min_rate; /* In bytes/s. */
2768 unsigned int max_rate; /* In bytes/s. */
2769 unsigned int burst; /* In bytes. */
2770 unsigned int priority; /* Lower values are higher priorities. */
/* Returns the struct htb embedded in 'netdev_''s tc pointer.  Only valid
 * when the device's qdisc is known to be HTB. */
2774 htb_get__(const struct netdev *netdev_)
2776 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2777 return CONTAINER_OF(netdev->tc, struct htb, tc);
/* Allocates a struct htb with ceiling 'max_rate' and installs it as
 * 'netdev_''s tc state. */
2781 htb_install__(struct netdev *netdev_, uint64_t max_rate)
2783 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2786 htb = xmalloc(sizeof *htb);
2787 tc_init(&htb->tc, &tc_ops_htb);
2788 htb->max_rate = max_rate;
2790 netdev->tc = &htb->tc;
2793 /* Create an HTB qdisc.
2795 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
2797 htb_setup_qdisc__(struct netdev *netdev)
2800 struct tc_htb_glob opt;
2801 struct ofpbuf request;
2802 struct tcmsg *tcmsg;
/* Remove whatever root qdisc is currently installed first. */
2804 tc_del_qdisc(netdev);
2806 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2807 NLM_F_EXCL | NLM_F_CREATE, &request);
2811 tcmsg->tcm_handle = tc_make_handle(1, 0);
2812 tcmsg->tcm_parent = TC_H_ROOT;
2814 nl_msg_put_string(&request, TCA_KIND, "htb");
2816 memset(&opt, 0, sizeof opt);
/* rate2quantum: DRR quantum = rate / 10 (same default the tc tool uses). */
2817 opt.rate2quantum = 10;
2821 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2822 nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
2823 nl_msg_end_nested(&request, opt_offset);
2825 return tc_transact(&request, NULL);
2828 /* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
2829 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
2831 htb_setup_class__(struct netdev *netdev, unsigned int handle,
2832 unsigned int parent, struct htb_class *class)
2835 struct tc_htb_opt opt;
2836 struct ofpbuf request;
2837 struct tcmsg *tcmsg;
/* The MTU is needed to build the kernel rate tables; bail out if it is
 * unavailable. */
2841 error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
2843 VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU",
2844 netdev_get_name(netdev));
2848 memset(&opt, 0, sizeof opt);
2849 tc_fill_rate(&opt.rate, class->min_rate, mtu);
2850 tc_fill_rate(&opt.ceil, class->max_rate, mtu);
/* buffer/cbuffer are token-bucket depths in kernel ticks, derived from the
 * byte burst at the respective rate. */
2851 opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
2852 opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
2853 opt.prio = class->priority;
2855 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
2859 tcmsg->tcm_handle = handle;
2860 tcmsg->tcm_parent = parent;
2862 nl_msg_put_string(&request, TCA_KIND, "htb");
2863 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2864 nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
/* The kernel requires explicit rate tables for both rate and ceiling. */
2865 tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
2866 tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
2867 nl_msg_end_nested(&request, opt_offset);
2869 error = tc_transact(&request, NULL);
2871 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
2872 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
2873 netdev_get_name(netdev),
2874 tc_get_major(handle), tc_get_minor(handle),
2875 tc_get_major(parent), tc_get_minor(parent),
2876 class->min_rate, class->max_rate,
2877 class->burst, class->priority, ovs_strerror(error));
2882 /* Parses Netlink attributes in 'options' for HTB parameters and stores a
2883 * description of them into 'details'. The description complies with the
2884 * specification given in the vswitch database documentation for linux-htb
2887 htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
2889 static const struct nl_policy tca_htb_policy[] = {
2890 [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
2891 .min_len = sizeof(struct tc_htb_opt) },
2894 struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
2895 const struct tc_htb_opt *htb;
2897 if (!nl_parse_nested(nl_options, tca_htb_policy,
2898 attrs, ARRAY_SIZE(tca_htb_policy))) {
2899 VLOG_WARN_RL(&rl, "failed to parse HTB class options");
2903 htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
2904 class->min_rate = htb->rate.rate;
2905 class->max_rate = htb->ceil.rate;
/* Convert the kernel's tick-denominated buffer back into a byte burst. */
2906 class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
2907 class->priority = htb->prio;
/* Parses a netlink tc-class reply: extracts the OVS queue id (tc minor
 * number minus 1, when the handle is under major 1), the HTB options, and
 * queue stats.  Each of 'queue_id', 'options', and 'stats' may be NULL to
 * skip that output. */
2912 htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2913 struct htb_class *options,
2914 struct netdev_queue_stats *stats)
2916 struct nlattr *nl_options;
2917 unsigned int handle;
2920 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2921 if (!error && queue_id) {
2922 unsigned int major = tc_get_major(handle);
2923 unsigned int minor = tc_get_minor(handle);
2924 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2925 *queue_id = minor - 1;
2930 if (!error && options) {
2931 error = htb_parse_tca_options__(nl_options, options);
/* Derives the qdisc-level class ('hc') from the "max-rate" key in 'details'.
 * Rates in the database are bits/s; internally bytes/s, hence the /8.
 * Falls back to the link speed (or 100 Mbps when unknown) if unset. */
2937 htb_parse_qdisc_details__(struct netdev *netdev_,
2938 const struct smap *details, struct htb_class *hc)
2940 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2941 const char *max_rate_s;
2943 max_rate_s = smap_get(details, "max-rate");
2944 hc->max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
2945 if (!hc->max_rate) {
2946 enum netdev_features current;
2948 netdev_linux_read_features(netdev);
2949 current = !netdev->get_features_error ? netdev->current : 0;
2950 hc->max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
/* The root class reserves the whole link: min == max. */
2952 hc->min_rate = hc->max_rate;
/* Parses a per-queue class description from 'details' ("min-rate",
 * "max-rate", "burst", "priority"), clamping each value into a range the
 * kernel will accept. */
2958 htb_parse_class_details__(struct netdev *netdev,
2959 const struct smap *details, struct htb_class *hc)
2961 const struct htb *htb = htb_get__(netdev);
2962 const char *min_rate_s = smap_get(details, "min-rate");
2963 const char *max_rate_s = smap_get(details, "max-rate");
2964 const char *burst_s = smap_get(details, "burst");
2965 const char *priority_s = smap_get(details, "priority");
2968 error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
2970 VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU",
2971 netdev_get_name(netdev));
2975 /* HTB requires at least an mtu sized min-rate to send any traffic even
2976 * on uncongested links. */
2977 hc->min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
2978 hc->min_rate = MAX(hc->min_rate, mtu);
2979 hc->min_rate = MIN(hc->min_rate, htb->max_rate);
2982 hc->max_rate = (max_rate_s
2983 ? strtoull(max_rate_s, NULL, 10) / 8
2985 hc->max_rate = MAX(hc->max_rate, hc->min_rate);
2986 hc->max_rate = MIN(hc->max_rate, htb->max_rate);
2990 * According to hints in the documentation that I've read, it is important
2991 * that 'burst' be at least as big as the largest frame that might be
2992 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
2993 * but having it a bit too small is a problem. Since netdev_get_mtu()
2994 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
2995 * the MTU. We actually add 64, instead of 14, as a guard against
2996 * additional headers getting tacked on somewhere that we're not aware of. */
2997 hc->burst = burst_s ? strtoull(burst_s, NULL, 10) / 8 : 0;
2998 hc->burst = MAX(hc->burst, mtu + 64);
3001 hc->priority = priority_s ? strtoul(priority_s, NULL, 10) : 0;
/* Queries the kernel for class 'handle' under 'parent' and parses the reply
 * into 'options' and/or 'stats' (either may be NULL). */
3007 htb_query_class__(const struct netdev *netdev, unsigned int handle,
3008 unsigned int parent, struct htb_class *options,
3009 struct netdev_queue_stats *stats)
3011 struct ofpbuf *reply;
3014 error = tc_query_class(netdev, handle, parent, &reply);
3016 error = htb_parse_tcmsg__(reply, NULL, options, stats);
3017 ofpbuf_delete(reply);
/* tc_ops "tc_install" hook: replaces the root qdisc with HTB, creates the
 * 1:fffe default class from 'details', and records the new tc state. */
3023 htb_tc_install(struct netdev *netdev, const struct smap *details)
3027 error = htb_setup_qdisc__(netdev);
3029 struct htb_class hc;
3031 htb_parse_qdisc_details__(netdev, details, &hc);
3032 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3033 tc_make_handle(1, 0), &hc);
3035 htb_install__(netdev, hc.max_rate);
/* Maps a generic tc_queue back to its enclosing htb_class. */
3041 static struct htb_class *
3042 htb_class_cast__(const struct tc_queue *queue)
3044 return CONTAINER_OF(queue, struct htb_class, tc_queue);
/* Creates or updates the in-memory record for 'queue_id' to match 'hc'.
 * (Does not talk to the kernel.) */
3048 htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
3049 const struct htb_class *hc)
3051 struct htb *htb = htb_get__(netdev);
3052 size_t hash = hash_int(queue_id, 0);
3053 struct tc_queue *queue;
3054 struct htb_class *hcp;
3056 queue = tc_find_queue__(netdev, queue_id, hash);
3058 hcp = htb_class_cast__(queue);
3060 hcp = xmalloc(sizeof *hcp);
3061 queue = &hcp->tc_queue;
3062 queue->queue_id = queue_id;
3063 queue->created = time_msec();
3064 hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
3067 hcp->min_rate = hc->min_rate;
3068 hcp->max_rate = hc->max_rate;
3069 hcp->burst = hc->burst;
3070 hcp->priority = hc->priority;
/* tc_ops "tc_load" hook: reconstructs tc state from an HTB qdisc that is
 * already installed in the kernel, by querying the default class and then
 * dumping all classes. */
3074 htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3077 struct nl_dump dump;
3078 struct htb_class hc;
3080 /* Get qdisc options. */
3082 htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
3083 htb_install__(netdev, hc.max_rate);
3086 if (!start_queue_dump(netdev, &dump)) {
3089 while (nl_dump_next(&dump, &msg)) {
3090 unsigned int queue_id;
3092 if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3093 htb_update_queue__(netdev, queue_id, &hc);
3096 nl_dump_done(&dump);
/* tc_ops "tc_destroy" hook: frees every queued htb_class and the htb itself
 * (freeing elided below). */
3102 htb_tc_destroy(struct tc *tc)
3104 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
3105 struct htb_class *hc, *next;
3107 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &htb->tc.queues) {
3108 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
/* Reports the qdisc ceiling.  Internal rates are bytes/s; the database
 * convention is bits/s, hence the *8. */
3116 htb_qdisc_get(const struct netdev *netdev, struct smap *details)
3118 const struct htb *htb = htb_get__(netdev);
3119 smap_add_format(details, "max-rate", "%llu", 8ULL * htb->max_rate);
/* Re-creates the 1:fffe default class from 'details' and, on success,
 * updates the cached ceiling. */
3124 htb_qdisc_set(struct netdev *netdev, const struct smap *details)
3126 struct htb_class hc;
3129 htb_parse_qdisc_details__(netdev, details, &hc);
3130 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3131 tc_make_handle(1, 0), &hc);
3133 htb_get__(netdev)->max_rate = hc.max_rate;
/* Reports one queue's settings; "max-rate" is omitted when it equals
 * "min-rate" (the value min-rate implies by default). */
3139 htb_class_get(const struct netdev *netdev OVS_UNUSED,
3140 const struct tc_queue *queue, struct smap *details)
3142 const struct htb_class *hc = htb_class_cast__(queue);
3144 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
3145 if (hc->min_rate != hc->max_rate) {
3146 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
3148 smap_add_format(details, "burst", "%llu", 8ULL * hc->burst);
3150 smap_add_format(details, "priority", "%u", hc->priority);
/* Creates/updates queue 'queue_id' in the kernel (class 1:queue_id+1 under
 * the default class 1:fffe) and mirrors the result in memory. */
3156 htb_class_set(struct netdev *netdev, unsigned int queue_id,
3157 const struct smap *details)
3159 struct htb_class hc;
3162 error = htb_parse_class_details__(netdev, details, &hc);
3167 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3168 tc_make_handle(1, 0xfffe), &hc);
3173 htb_update_queue__(netdev, queue_id, &hc);
/* Deletes the kernel class for 'queue' and drops the in-memory record. */
3178 htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
3180 struct htb_class *hc = htb_class_cast__(queue);
3181 struct htb *htb = htb_get__(netdev);
3184 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3186 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
/* Fetches kernel stats for a single queue. */
3193 htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3194 struct netdev_queue_stats *stats)
3196 return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3197 tc_make_handle(1, 0xfffe), NULL, stats);
/* Dump-stats callback: parses one class message and invokes 'cb' when the
 * handle maps to an OVS queue id. */
3201 htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3202 const struct ofpbuf *nlmsg,
3203 netdev_dump_queue_stats_cb *cb, void *aux)
3205 struct netdev_queue_stats stats;
3206 unsigned int handle, major, minor;
3209 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3214 major = tc_get_major(handle);
3215 minor = tc_get_minor(handle);
3216 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
3217 (*cb)(minor - 1, &stats, aux);
/* vtable binding the hooks above to the "linux-htb" OVSDB QoS type. */
3222 static const struct tc_ops tc_ops_htb = {
3223 "htb", /* linux_name */
3224 "linux-htb", /* ovs_name */
3225 HTB_N_QUEUES, /* n_queues */
3234 htb_class_get_stats,
3235 htb_class_dump_stats
3238 /* "linux-hfsc" traffic control class. */
/* Queue ids 0..HFSC_N_QUEUES-1 map to tc minor numbers 1..HFSC_N_QUEUES. */
3240 #define HFSC_N_QUEUES 0xf000
/* Per-queue (tc class) state; only linear min/max rates are modeled. */
3248 struct tc_queue tc_queue;
/* Returns the struct hfsc embedded in 'netdev_''s tc pointer.  Only valid
 * when the device's qdisc is known to be HFSC. */
3253 static struct hfsc *
3254 hfsc_get__(const struct netdev *netdev_)
3256 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3257 return CONTAINER_OF(netdev->tc, struct hfsc, tc);
/* Maps a generic tc_queue back to its enclosing hfsc_class. */
3260 static struct hfsc_class *
3261 hfsc_class_cast__(const struct tc_queue *queue)
3263 return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
/* Allocates a struct hfsc with ceiling 'max_rate' and installs it as
 * 'netdev_''s tc state. */
3267 hfsc_install__(struct netdev *netdev_, uint32_t max_rate)
3269 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3272 hfsc = xmalloc(sizeof *hfsc);
3273 tc_init(&hfsc->tc, &tc_ops_hfsc);
3274 hfsc->max_rate = max_rate;
3275 netdev->tc = &hfsc->tc;
/* Creates or updates the in-memory record for 'queue_id' to match 'hc'.
 * (Does not talk to the kernel.) */
3279 hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
3280 const struct hfsc_class *hc)
3284 struct hfsc_class *hcp;
3285 struct tc_queue *queue;
3287 hfsc = hfsc_get__(netdev);
3288 hash = hash_int(queue_id, 0);
3290 queue = tc_find_queue__(netdev, queue_id, hash);
3292 hcp = hfsc_class_cast__(queue);
3294 hcp = xmalloc(sizeof *hcp);
3295 queue = &hcp->tc_queue;
3296 queue->queue_id = queue_id;
3297 queue->created = time_msec();
3298 hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
3301 hcp->min_rate = hc->min_rate;
3302 hcp->max_rate = hc->max_rate;
/* Parses HFSC class options from netlink attributes.  OVS only writes
 * linear service curves (m1 == 0, d == 0) where the real-time and link-share
 * curves are equal, so anything else is rejected as unsupported. */
3306 hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
3308 const struct tc_service_curve *rsc, *fsc, *usc;
3309 static const struct nl_policy tca_hfsc_policy[] = {
3311 .type = NL_A_UNSPEC,
3313 .min_len = sizeof(struct tc_service_curve),
3316 .type = NL_A_UNSPEC,
3318 .min_len = sizeof(struct tc_service_curve),
3321 .type = NL_A_UNSPEC,
3323 .min_len = sizeof(struct tc_service_curve),
3326 struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];
3328 if (!nl_parse_nested(nl_options, tca_hfsc_policy,
3329 attrs, ARRAY_SIZE(tca_hfsc_policy))) {
3330 VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
3334 rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
3335 fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
3336 usc = nl_attr_get(attrs[TCA_HFSC_USC]);
3338 if (rsc->m1 != 0 || rsc->d != 0 ||
3339 fsc->m1 != 0 || fsc->d != 0 ||
3340 usc->m1 != 0 || usc->d != 0) {
3341 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3342 "Non-linear service curves are not supported.");
3346 if (rsc->m2 != fsc->m2) {
/* NOTE(review): this log string ends with a stray space and no period;
 * left unchanged because it is runtime output. */
3347 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3348 "Real-time service curves are not supported ");
3352 if (rsc->m2 > usc->m2) {
3353 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3354 "Min-rate service curve is greater than "
3355 "the max-rate service curve.");
/* min-rate comes from the link-share curve, max-rate from the upper-limit
 * curve. */
3359 class->min_rate = fsc->m2;
3360 class->max_rate = usc->m2;
/* Parses a netlink tc-class reply: extracts the OVS queue id (tc minor
 * number minus 1 under major 1), the HFSC options, and queue stats.  Each
 * of 'queue_id', 'options', and 'stats' may be NULL to skip that output. */
3365 hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
3366 struct hfsc_class *options,
3367 struct netdev_queue_stats *stats)
3370 unsigned int handle;
3371 struct nlattr *nl_options;
3373 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
3379 unsigned int major, minor;
3381 major = tc_get_major(handle);
3382 minor = tc_get_minor(handle);
3383 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3384 *queue_id = minor - 1;
3391 error = hfsc_parse_tca_options__(nl_options, options);
/* Queries the kernel for class 'handle' under 'parent' and parses the reply
 * into 'options' and/or 'stats' (either may be NULL). */
3398 hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
3399 unsigned int parent, struct hfsc_class *options,
3400 struct netdev_queue_stats *stats)
3403 struct ofpbuf *reply;
3405 error = tc_query_class(netdev, handle, parent, &reply);
3410 error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
3411 ofpbuf_delete(reply);
/* Derives the qdisc-level class from the "max-rate" key in 'details'.
 * Rates in the database are bits/s; internally bytes/s, hence the /8.
 * Falls back to the link speed (or 100 Mbps when unknown) if unset. */
3416 hfsc_parse_qdisc_details__(struct netdev *netdev_, const struct smap *details,
3417 struct hfsc_class *class)
3419 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3421 const char *max_rate_s;
3423 max_rate_s = smap_get(details, "max-rate");
3424 max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
3427 enum netdev_features current;
3429 netdev_linux_read_features(netdev);
3430 current = !netdev->get_features_error ? netdev->current : 0;
3431 max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
/* The root class reserves the whole link: min == max. */
3434 class->min_rate = max_rate;
3435 class->max_rate = max_rate;
/* Parses per-queue "min-rate"/"max-rate" from 'details', clamping both to
 * [1, qdisc max-rate] with max >= min. */
3439 hfsc_parse_class_details__(struct netdev *netdev,
3440 const struct smap *details,
3441 struct hfsc_class * class)
3443 const struct hfsc *hfsc;
3444 uint32_t min_rate, max_rate;
3445 const char *min_rate_s, *max_rate_s;
3447 hfsc = hfsc_get__(netdev);
3448 min_rate_s = smap_get(details, "min-rate");
3449 max_rate_s = smap_get(details, "max-rate");
3451 min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
3452 min_rate = MAX(min_rate, 1);
3453 min_rate = MIN(min_rate, hfsc->max_rate);
3455 max_rate = (max_rate_s
3456 ? strtoull(max_rate_s, NULL, 10) / 8
3458 max_rate = MAX(max_rate, min_rate);
3459 max_rate = MIN(max_rate, hfsc->max_rate);
3461 class->min_rate = min_rate;
3462 class->max_rate = max_rate;
3467 /* Create an HFSC qdisc.
3469 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
3471 hfsc_setup_qdisc__(struct netdev * netdev)
3473 struct tcmsg *tcmsg;
3474 struct ofpbuf request;
3475 struct tc_hfsc_qopt opt;
/* Remove whatever root qdisc is currently installed first. */
3477 tc_del_qdisc(netdev);
3479 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3480 NLM_F_EXCL | NLM_F_CREATE, &request);
3486 tcmsg->tcm_handle = tc_make_handle(1, 0);
3487 tcmsg->tcm_parent = TC_H_ROOT;
3489 memset(&opt, 0, sizeof opt);
3492 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3493 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
3495 return tc_transact(&request, NULL);
3498 /* Create an HFSC class.
3500 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
3501 * sc rate <min_rate> ul rate <max_rate>" */
3503 hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
3504 unsigned int parent, struct hfsc_class *class)
3508 struct tcmsg *tcmsg;
3509 struct ofpbuf request;
3510 struct tc_service_curve min, max;
3512 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
3518 tcmsg->tcm_handle = handle;
3519 tcmsg->tcm_parent = parent;
/* Linear curves: slope m2 only (m1 and d zeroed by elided memsets). */
3523 min.m2 = class->min_rate;
3527 max.m2 = class->max_rate;
3529 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3530 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
/* Real-time and link-share curves are both 'min'; upper limit is 'max'. */
3531 nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
3532 nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
3533 nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
3534 nl_msg_end_nested(&request, opt_offset);
3536 error = tc_transact(&request, NULL);
3538 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
3539 "min-rate %ubps, max-rate %ubps (%s)",
3540 netdev_get_name(netdev),
3541 tc_get_major(handle), tc_get_minor(handle),
3542 tc_get_major(parent), tc_get_minor(parent),
3543 class->min_rate, class->max_rate, ovs_strerror(error));
/* tc_ops "tc_install" hook: replaces the root qdisc with HFSC, creates the
 * 1:fffe default class from 'details', and records the new tc state. */
3550 hfsc_tc_install(struct netdev *netdev, const struct smap *details)
3553 struct hfsc_class class;
3555 error = hfsc_setup_qdisc__(netdev);
3561 hfsc_parse_qdisc_details__(netdev, details, &class);
3562 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3563 tc_make_handle(1, 0), &class);
3569 hfsc_install__(netdev, class.max_rate);
/* tc_ops "tc_load" hook: reconstructs tc state from an HFSC qdisc already
 * installed in the kernel, by querying the default class and then dumping
 * all classes. */
3574 hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3577 struct nl_dump dump;
3578 struct hfsc_class hc;
3581 hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
3582 hfsc_install__(netdev, hc.max_rate);
3584 if (!start_queue_dump(netdev, &dump)) {
3588 while (nl_dump_next(&dump, &msg)) {
3589 unsigned int queue_id;
3591 if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3592 hfsc_update_queue__(netdev, queue_id, &hc);
3596 nl_dump_done(&dump);
/* tc_ops "tc_destroy" hook: frees every queued hfsc_class and the hfsc
 * itself (freeing elided below). */
3601 hfsc_tc_destroy(struct tc *tc)
3604 struct hfsc_class *hc, *next;
3606 hfsc = CONTAINER_OF(tc, struct hfsc, tc);
3608 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
3609 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
/* Reports the qdisc ceiling in bits/s (internal rates are bytes/s). */
3618 hfsc_qdisc_get(const struct netdev *netdev, struct smap *details)
3620 const struct hfsc *hfsc;
3621 hfsc = hfsc_get__(netdev);
3622 smap_add_format(details, "max-rate", "%llu", 8ULL * hfsc->max_rate);
/* Re-creates the 1:fffe default class from 'details' and, on success,
 * updates the cached ceiling. */
3627 hfsc_qdisc_set(struct netdev *netdev, const struct smap *details)
3630 struct hfsc_class class;
3632 hfsc_parse_qdisc_details__(netdev, details, &class);
3633 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3634 tc_make_handle(1, 0), &class);
3637 hfsc_get__(netdev)->max_rate = class.max_rate;
/* Reports one queue's settings; "max-rate" is omitted when it equals
 * "min-rate". */
3644 hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
3645 const struct tc_queue *queue, struct smap *details)
3647 const struct hfsc_class *hc;
3649 hc = hfsc_class_cast__(queue);
3650 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
3651 if (hc->min_rate != hc->max_rate) {
3652 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
/* Creates/updates queue 'queue_id' in the kernel (class 1:queue_id+1 under
 * the default class 1:fffe) and mirrors the result in memory. */
3658 hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
3659 const struct smap *details)
3662 struct hfsc_class class;
3664 error = hfsc_parse_class_details__(netdev, details, &class);
3669 error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3670 tc_make_handle(1, 0xfffe), &class);
3675 hfsc_update_queue__(netdev, queue_id, &class);
/* Deletes the kernel class for 'queue' and drops the in-memory record. */
3680 hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
3684 struct hfsc_class *hc;
3686 hc = hfsc_class_cast__(queue);
3687 hfsc = hfsc_get__(netdev);
3689 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3691 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
/* Fetches kernel stats for a single queue. */
3698 hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3699 struct netdev_queue_stats *stats)
3701 return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3702 tc_make_handle(1, 0xfffe), NULL, stats);
/* Dump-stats callback: parses one class message and invokes 'cb' when the
 * handle maps to an OVS queue id. */
3706 hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3707 const struct ofpbuf *nlmsg,
3708 netdev_dump_queue_stats_cb *cb, void *aux)
3710 struct netdev_queue_stats stats;
3711 unsigned int handle, major, minor;
3714 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3719 major = tc_get_major(handle);
3720 minor = tc_get_minor(handle);
3721 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3722 (*cb)(minor - 1, &stats, aux);
/* vtable binding the hooks above to the "linux-hfsc" OVSDB QoS type. */
3727 static const struct tc_ops tc_ops_hfsc = {
3728 "hfsc", /* linux_name */
3729 "linux-hfsc", /* ovs_name */
3730 HFSC_N_QUEUES, /* n_queues */
3731 hfsc_tc_install, /* tc_install */
3732 hfsc_tc_load, /* tc_load */
3733 hfsc_tc_destroy, /* tc_destroy */
3734 hfsc_qdisc_get, /* qdisc_get */
3735 hfsc_qdisc_set, /* qdisc_set */
3736 hfsc_class_get, /* class_get */
3737 hfsc_class_set, /* class_set */
3738 hfsc_class_delete, /* class_delete */
3739 hfsc_class_get_stats, /* class_get_stats */
3740 hfsc_class_dump_stats /* class_dump_stats */
3743 /* "linux-default" traffic control class.
3745 * This class represents the default, unnamed Linux qdisc. It corresponds to
3746 * the "" (empty string) QoS type in the OVS database. */
/* Points 'netdev_''s tc state at a shared, immutable singleton; this class
 * never modifies tc state, so one const object serves every device. */
3749 default_install__(struct netdev *netdev_)
3751 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3752 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_default);
3754 /* Nothing but a tc class implementation is allowed to write to a tc. This
3755 * class never does that, so we can legitimately use a const tc object. */
3756 netdev->tc = CONST_CAST(struct tc *, &tc);
/* "tc_install" hook: nothing to configure in the kernel; just adopt the
 * singleton. */
3760 default_tc_install(struct netdev *netdev,
3761 const struct smap *details OVS_UNUSED)
3763 default_install__(netdev);
/* "tc_load" hook: same as install; there is no kernel state to read back. */
3768 default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3770 default_install__(netdev);
/* vtable for the "" QoS type; all optional hooks are absent. */
3774 static const struct tc_ops tc_ops_default = {
3775 NULL, /* linux_name */
3780 NULL, /* tc_destroy */
3781 NULL, /* qdisc_get */
3782 NULL, /* qdisc_set */
3783 NULL, /* class_get */
3784 NULL, /* class_set */
3785 NULL, /* class_delete */
3786 NULL, /* class_get_stats */
3787 NULL /* class_dump_stats */
3790 /* "linux-other" traffic control class.
/* "tc_load" hook for qdiscs that OVS does not understand: adopt a shared,
 * immutable singleton and leave the kernel configuration alone. */
3795 other_tc_load(struct netdev *netdev_, struct ofpbuf *nlmsg OVS_UNUSED)
3797 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3798 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_other);
3800 /* Nothing but a tc class implementation is allowed to write to a tc. This
3801 * class never does that, so we can legitimately use a const tc object. */
3802 netdev->tc = CONST_CAST(struct tc *, &tc);
/* vtable for unrecognized qdiscs; no install hook because OVS never creates
 * one of these itself. */
3806 static const struct tc_ops tc_ops_other = {
3807 NULL, /* linux_name */
3808 "linux-other", /* ovs_name */
3810 NULL, /* tc_install */
3812 NULL, /* tc_destroy */
3813 NULL, /* qdisc_get */
3814 NULL, /* qdisc_set */
3815 NULL, /* class_get */
3816 NULL, /* class_set */
3817 NULL, /* class_delete */
3818 NULL, /* class_get_stats */
3819 NULL /* class_dump_stats */
3822 /* Traffic control. */
3824 /* Number of kernel "tc" ticks per second. */
3825 static double ticks_per_s;
3827 /* Number of kernel "jiffies" per second. This is used for the purpose of
3828 * computing buffer sizes. Generally kernel qdiscs need to be able to buffer
3829 * one jiffy's worth of data.
3831 * There are two possibilities here:
3833 * - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
3834 * approximate range of 100 to 1024. That means that we really need to
3835 * make sure that the qdisc can buffer that much data.
3837 * - 'buffer_hz' is an absurdly large number. That means that the kernel
3838 * has finely granular timers and there's no need to fudge additional room
3839 * for buffers. (There's no extra effort needed to implement that: the
3840 * large 'buffer_hz' is used as a divisor, so practically any number will
3841 * come out as 0 in the division. Small integer results in the case of
3842 * really high dividends won't have any real effect anyhow.)
3844 static unsigned int buffer_hz;
3846 /* Returns tc handle 'major':'minor'. */
3848 tc_make_handle(unsigned int major, unsigned int minor)
3850 return TC_H_MAKE(major << 16, minor);
3853 /* Returns the major number from 'handle'. */
3855 tc_get_major(unsigned int handle)
3857 return TC_H_MAJ(handle) >> 16;
3860 /* Returns the minor number from 'handle'. */
3862 tc_get_minor(unsigned int handle)
3864 return TC_H_MIN(handle);
/* Initializes 'request' as an rtnetlink tc message of the given 'type'
 * (RTM_NEW*/RTM_DEL*) targeting 'netdev', and returns a pointer to the
 * embedded tcmsg for the caller to finish (handle/parent).  Presumably
 * returns NULL when the ifindex lookup fails (error path elided). */
3867 static struct tcmsg *
3868 tc_make_request(const struct netdev *netdev, int type, unsigned int flags,
3869 struct ofpbuf *request)
3871 struct tcmsg *tcmsg;
3875 error = get_ifindex(netdev, &ifindex);
3880 ofpbuf_init(request, 512);
3881 nl_msg_put_nlmsghdr(request, sizeof *tcmsg, type, NLM_F_REQUEST | flags);
3882 tcmsg = ofpbuf_put_zeros(request, sizeof *tcmsg);
3883 tcmsg->tcm_family = AF_UNSPEC;
3884 tcmsg->tcm_ifindex = ifindex;
3885 /* Caller should fill in tcmsg->tcm_handle. */
3886 /* Caller should fill in tcmsg->tcm_parent. */
/* Sends 'request' on the NETLINK_ROUTE socket and uninitializes it; the
 * optional reply is returned through '*replyp'. */
3892 tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
3894 int error = nl_transact(NETLINK_ROUTE, request, replyp);
3895 ofpbuf_uninit(request);
3899 /* Adds or deletes a root ingress qdisc on 'netdev'. We use this for
3900 * policing configuration.
3902 * This function is equivalent to running the following when 'add' is true:
3903 * /sbin/tc qdisc add dev <devname> handle ffff: ingress
3905 * This function is equivalent to running the following when 'add' is false:
3906 * /sbin/tc qdisc del dev <devname> handle ffff: ingress
3908 * The configuration and stats may be seen with the following command:
3909 * /sbin/tc -s qdisc show dev <devname>
3911 * Returns 0 if successful, otherwise a positive errno value.
3914 tc_add_del_ingress_qdisc(struct netdev *netdev, bool add)
3916 struct ofpbuf request;
3917 struct tcmsg *tcmsg;
3919 int type = add ? RTM_NEWQDISC : RTM_DELQDISC;
3920 int flags = add ? NLM_F_EXCL | NLM_F_CREATE : 0;
3922 tcmsg = tc_make_request(netdev, type, flags, &request);
3926 tcmsg->tcm_handle = tc_make_handle(0xffff, 0);
3927 tcmsg->tcm_parent = TC_H_INGRESS;
3928 nl_msg_put_string(&request, TCA_KIND, "ingress");
3929 nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0);
3931 error = tc_transact(&request, NULL);
3933 /* If we're deleting the qdisc, don't worry about some of the
3934 * error conditions. */
3935 if (!add && (error == ENOENT || error == EINVAL)) {
3944 /* Adds a policer to 'netdev' with a rate of 'kbits_rate' and a burst size
3947 * This function is equivalent to running:
3948 * /sbin/tc filter add dev <devname> parent ffff: protocol all prio 49
3949 * basic police rate <kbits_rate>kbit burst <kbits_burst>k
3952 * The configuration and stats may be seen with the following command:
3953 * /sbin/tc -s filter show <devname> eth0 parent ffff:
3955 * Returns 0 if successful, otherwise a positive errno value.
3958 tc_add_policer(struct netdev *netdev, int kbits_rate, int kbits_burst)
3960 struct tc_police tc_police;
3961 struct ofpbuf request;
3962 struct tcmsg *tcmsg;
3963 size_t basic_offset;
3964 size_t police_offset;
3968 memset(&tc_police, 0, sizeof tc_police);
3969 tc_police.action = TC_POLICE_SHOT;
3970 tc_police.mtu = mtu;
3971 tc_fill_rate(&tc_police.rate, (kbits_rate * 1000)/8, mtu);
3972 tc_police.burst = tc_bytes_to_ticks(tc_police.rate.rate,
3973 kbits_burst * 1024);
3975 tcmsg = tc_make_request(netdev, RTM_NEWTFILTER,
3976 NLM_F_EXCL | NLM_F_CREATE, &request);
3980 tcmsg->tcm_parent = tc_make_handle(0xffff, 0);
3981 tcmsg->tcm_info = tc_make_handle(49,
3982 (OVS_FORCE uint16_t) htons(ETH_P_ALL));
3984 nl_msg_put_string(&request, TCA_KIND, "basic");
3985 basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3986 police_offset = nl_msg_start_nested(&request, TCA_BASIC_POLICE);
3987 nl_msg_put_unspec(&request, TCA_POLICE_TBF, &tc_police, sizeof tc_police);
3988 tc_put_rtab(&request, TCA_POLICE_RATE, &tc_police.rate);
3989 nl_msg_end_nested(&request, police_offset);
3990 nl_msg_end_nested(&request, basic_offset);
3992 error = tc_transact(&request, NULL);
4003 /* The values in psched are not individually very meaningful, but they are
4004 * important. The tables below show some values seen in the wild.
4008 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
4009 * (Before that, there are hints that it was 1000000000.)
4011 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
4015 * -----------------------------------
4016 * [1] 000c8000 000f4240 000f4240 00000064
4017 * [2] 000003e8 00000400 000f4240 3b9aca00
4018 * [3] 000003e8 00000400 000f4240 3b9aca00
4019 * [4] 000003e8 00000400 000f4240 00000064
4020 * [5] 000003e8 00000040 000f4240 3b9aca00
4021 * [6] 000003e8 00000040 000f4240 000000f9
4023 * a b c d ticks_per_s buffer_hz
4024 * ------- --------- ---------- ------------- ----------- -------------
4025 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
4026 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
4027 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
4028 * [4] 1,000 1,024 1,000,000 100 976,562 100
4029 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
4030 * [6] 1,000 64 1,000,000 249 15,625,000 249
4032 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
4033 * [2] 2.6.26-1-686-bigmem from Debian lenny
4034 * [3] 2.6.26-2-sparc64 from Debian lenny
4035 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
4036 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
4037 * [6] 2.6.34 from kernel.org on KVM
4039 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
4040 static const char fn[] = "/proc/net/psched";
4041 unsigned int a, b, c, d;
4044 if (!ovsthread_once_start(&once)) {
4051 stream = fopen(fn, "r");
4053 VLOG_WARN("%s: open failed: %s", fn, ovs_strerror(errno));
4057 if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
4058 VLOG_WARN("%s: read failed", fn);
4062 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
4066 VLOG_WARN("%s: invalid scheduler parameters", fn);
4070 ticks_per_s = (double) a * c / b;
4074 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
4077 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);
4080 ovsthread_once_done(&once);
4083 /* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
4084 * rate of 'rate' bytes per second. */
4086 tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
4089 return (rate * ticks) / ticks_per_s;
4092 /* Returns the number of ticks that it would take to transmit 'size' bytes at a
4093 * rate of 'rate' bytes per second. */
4095 tc_bytes_to_ticks(unsigned int rate, unsigned int size)
4098 return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
4101 /* Returns the number of bytes that need to be reserved for qdisc buffering at
4102 * a transmission rate of 'rate' bytes per second. */
4104 tc_buffer_per_jiffy(unsigned int rate)
4107 return rate / buffer_hz;
4110 /* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
4111 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
4112 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
4113 * stores NULL into it if it is absent.
4115 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
4118 * Returns 0 if successful, otherwise a positive errno value. */
4120 tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
4121 struct nlattr **options)
4123 static const struct nl_policy tca_policy[] = {
4124 [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
4125 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
4127 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
4129 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
4130 tca_policy, ta, ARRAY_SIZE(ta))) {
4131 VLOG_WARN_RL(&rl, "failed to parse qdisc message");
4136 *kind = nl_attr_get_string(ta[TCA_KIND]);
4140 *options = ta[TCA_OPTIONS];
4155 /* Given Netlink 'msg' that describes a class, extracts the queue ID (e.g. the
4156 * minor number of its class ID) into '*queue_id', its TCA_OPTIONS attribute
4157 * into '*options', and its queue statistics into '*stats'. Any of the output
4158 * arguments may be null.
4160 * Returns 0 if successful, otherwise a positive errno value. */
4162 tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
4163 struct nlattr **options, struct netdev_queue_stats *stats)
4165 static const struct nl_policy tca_policy[] = {
4166 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
4167 [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
4169 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
4171 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
4172 tca_policy, ta, ARRAY_SIZE(ta))) {
4173 VLOG_WARN_RL(&rl, "failed to parse class message");
4178 struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
4179 *handlep = tc->tcm_handle;
4183 *options = ta[TCA_OPTIONS];
4187 const struct gnet_stats_queue *gsq;
4188 struct gnet_stats_basic gsb;
4190 static const struct nl_policy stats_policy[] = {
4191 [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
4192 .min_len = sizeof gsb },
4193 [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
4194 .min_len = sizeof *gsq },
4196 struct nlattr *sa[ARRAY_SIZE(stats_policy)];
4198 if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
4199 sa, ARRAY_SIZE(sa))) {
4200 VLOG_WARN_RL(&rl, "failed to parse class stats");
4204 /* Alignment issues screw up the length of struct gnet_stats_basic on
4205 * some arch/bitsize combinations. Newer versions of Linux have a
4206 * struct gnet_stats_basic_packed, but we can't depend on that. The
4207 * easiest thing to do is just to make a copy. */
4208 memset(&gsb, 0, sizeof gsb);
4209 memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
4210 MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
4211 stats->tx_bytes = gsb.bytes;
4212 stats->tx_packets = gsb.packets;
4214 gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
4215 stats->tx_errors = gsq->drops;
4225 memset(stats, 0, sizeof *stats);
4230 /* Queries the kernel for class with identifier 'handle' and parent 'parent'
4233 tc_query_class(const struct netdev *netdev,
4234 unsigned int handle, unsigned int parent,
4235 struct ofpbuf **replyp)
4237 struct ofpbuf request;
4238 struct tcmsg *tcmsg;
4241 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO, &request);
4245 tcmsg->tcm_handle = handle;
4246 tcmsg->tcm_parent = parent;
4248 error = tc_transact(&request, replyp);
4250 VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
4251 netdev_get_name(netdev),
4252 tc_get_major(handle), tc_get_minor(handle),
4253 tc_get_major(parent), tc_get_minor(parent),
4254 ovs_strerror(error));
4259 /* Equivalent to "tc class del dev <name> handle <handle>". */
4261 tc_delete_class(const struct netdev *netdev, unsigned int handle)
4263 struct ofpbuf request;
4264 struct tcmsg *tcmsg;
4267 tcmsg = tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
4271 tcmsg->tcm_handle = handle;
4272 tcmsg->tcm_parent = 0;
4274 error = tc_transact(&request, NULL);
4276 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
4277 netdev_get_name(netdev),
4278 tc_get_major(handle), tc_get_minor(handle),
4279 ovs_strerror(error));
4284 /* Equivalent to "tc qdisc del dev <name> root". */
4286 tc_del_qdisc(struct netdev *netdev_)
4288 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4289 struct ofpbuf request;
4290 struct tcmsg *tcmsg;
4293 tcmsg = tc_make_request(netdev_, RTM_DELQDISC, 0, &request);
4297 tcmsg->tcm_handle = tc_make_handle(1, 0);
4298 tcmsg->tcm_parent = TC_H_ROOT;
4300 error = tc_transact(&request, NULL);
4301 if (error == EINVAL) {
4302 /* EINVAL probably means that the default qdisc was in use, in which
4303 * case we've accomplished our purpose. */
4306 if (!error && netdev->tc) {
4307 if (netdev->tc->ops->tc_destroy) {
4308 netdev->tc->ops->tc_destroy(netdev->tc);
4315 /* If 'netdev''s qdisc type and parameters are not yet known, queries the
4316 * kernel to determine what they are. Returns 0 if successful, otherwise a
4317 * positive errno value. */
4319 tc_query_qdisc(const struct netdev *netdev_)
4321 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4322 struct ofpbuf request, *qdisc;
4323 const struct tc_ops *ops;
4324 struct tcmsg *tcmsg;
4332 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
4333 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
4334 * 2.6.35 without that fix backported to it.
4336 * To avoid the OOPS, we must not make a request that would attempt to dump
4337 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
4338 * few others. There are a few ways that I can see to do this, but most of
4339 * them seem to be racy (and if you lose the race the kernel OOPSes). The
4340 * technique chosen here is to assume that any non-default qdisc that we
4341 * create will have a class with handle 1:0. The built-in qdiscs only have
4342 * a class with handle 0:0.
4344 * We could check for Linux 2.6.35+ and use a more straightforward method
4346 tcmsg = tc_make_request(netdev_, RTM_GETQDISC, NLM_F_ECHO, &request);
4350 tcmsg->tcm_handle = tc_make_handle(1, 0);
4351 tcmsg->tcm_parent = 0;
4353 /* Figure out what tc class to instantiate. */
4354 error = tc_transact(&request, &qdisc);
4358 error = tc_parse_qdisc(qdisc, &kind, NULL);
4360 ops = &tc_ops_other;
4362 ops = tc_lookup_linux_name(kind);
4364 static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
4365 VLOG_INFO_RL(&rl2, "unknown qdisc \"%s\"", kind);
4367 ops = &tc_ops_other;
4370 } else if (error == ENOENT) {
4371 /* Either it's a built-in qdisc, or it's a qdisc set up by some
4372 * other entity that doesn't have a handle 1:0. We will assume
4373 * that it's the system default qdisc. */
4374 ops = &tc_ops_default;
4377 /* Who knows? Maybe the device got deleted. */
4378 VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
4379 netdev_get_name(netdev_), ovs_strerror(error));
4380 ops = &tc_ops_other;
4383 /* Instantiate it. */
4384 load_error = ops->tc_load(CONST_CAST(struct netdev *, netdev_), qdisc);
4385 ovs_assert((load_error == 0) == (netdev->tc != NULL));
4386 ofpbuf_delete(qdisc);
4388 return error ? error : load_error;
4391 /* Linux traffic control uses tables with 256 entries ("rtab" tables) to
4392 approximate the time to transmit packets of various lengths. For an MTU of
4393 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
4394 represents two possible packet lengths; for a MTU of 513 through 1024, four
4395 possible lengths; and so on.
4397 Returns, for the specified 'mtu', the number of bits that packet lengths
4398 need to be shifted right to fit within such a 256-entry table. */
4400 tc_calc_cell_log(unsigned int mtu)
4405 mtu = ETH_PAYLOAD_MAX;
4407 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
4409 for (cell_log = 0; mtu >= 256; cell_log++) {
4416 /* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
4419 tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
4421 memset(rate, 0, sizeof *rate);
4422 rate->cell_log = tc_calc_cell_log(mtu);
4423 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
4424 /* rate->cell_align = 0; */ /* distro headers. */
4425 rate->mpu = ETH_TOTAL_MIN;
/* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
 * attribute of the specified "type".
 *
 * See tc_calc_cell_log() above for a description of "rtab"s. */
static void
tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
{
    uint32_t *rtab;
    unsigned int i;

    rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
    for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
        /* Entry i covers packets of size (i + 1) << cell_log, but never less
         * than the minimum packet unit. */
        unsigned packet_size = (i + 1) << rate->cell_log;
        if (packet_size < rate->mpu) {
            packet_size = rate->mpu;
        }
        rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
    }
}
/* Calculates the proper value of 'buffer' or 'cbuffer' in HTB options given a
 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
 * burst size of 'burst_bytes'.  (If no value was requested, a 'burst_bytes'
 * of zero is fine.) */
static unsigned int
tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
{
    /* Never buffer less than one jiffy's worth of traffic plus one MTU. */
    unsigned int min_burst = tc_buffer_per_jiffy(Bps) + mtu;
    return tc_bytes_to_ticks(Bps, MAX(burst_bytes, min_burst));
}
4460 /* Linux-only functions declared in netdev-linux.h */
4462 /* Modifies the 'flag' bit in ethtool's flags field for 'netdev'. If
4463 * 'enable' is true, the bit is set. Otherwise, it is cleared. */
4465 netdev_linux_ethtool_set_flag(struct netdev *netdev, uint32_t flag,
4466 const char *flag_name, bool enable)
4468 const char *netdev_name = netdev_get_name(netdev);
4469 struct ethtool_value evalue;
4473 COVERAGE_INC(netdev_get_ethtool);
4474 memset(&evalue, 0, sizeof evalue);
4475 error = netdev_linux_do_ethtool(netdev_name,
4476 (struct ethtool_cmd *)&evalue,
4477 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
4482 COVERAGE_INC(netdev_set_ethtool);
4483 evalue.data = new_flags = (evalue.data & ~flag) | (enable ? flag : 0);
4484 error = netdev_linux_do_ethtool(netdev_name,
4485 (struct ethtool_cmd *)&evalue,
4486 ETHTOOL_SFLAGS, "ETHTOOL_SFLAGS");
4491 COVERAGE_INC(netdev_get_ethtool);
4492 memset(&evalue, 0, sizeof evalue);
4493 error = netdev_linux_do_ethtool(netdev_name,
4494 (struct ethtool_cmd *)&evalue,
4495 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
4500 if (new_flags != evalue.data) {
4501 VLOG_WARN_RL(&rl, "attempt to %s ethtool %s flag on network "
4502 "device %s failed", enable ? "enable" : "disable",
4503 flag_name, netdev_name);
4510 /* Utility functions. */
4512 /* Copies 'src' into 'dst', performing format conversion in the process. */
4514 netdev_stats_from_rtnl_link_stats(struct netdev_stats *dst,
4515 const struct rtnl_link_stats *src)
4517 dst->rx_packets = src->rx_packets;
4518 dst->tx_packets = src->tx_packets;
4519 dst->rx_bytes = src->rx_bytes;
4520 dst->tx_bytes = src->tx_bytes;
4521 dst->rx_errors = src->rx_errors;
4522 dst->tx_errors = src->tx_errors;
4523 dst->rx_dropped = src->rx_dropped;
4524 dst->tx_dropped = src->tx_dropped;
4525 dst->multicast = src->multicast;
4526 dst->collisions = src->collisions;
4527 dst->rx_length_errors = src->rx_length_errors;
4528 dst->rx_over_errors = src->rx_over_errors;
4529 dst->rx_crc_errors = src->rx_crc_errors;
4530 dst->rx_frame_errors = src->rx_frame_errors;
4531 dst->rx_fifo_errors = src->rx_fifo_errors;
4532 dst->rx_missed_errors = src->rx_missed_errors;
4533 dst->tx_aborted_errors = src->tx_aborted_errors;
4534 dst->tx_carrier_errors = src->tx_carrier_errors;
4535 dst->tx_fifo_errors = src->tx_fifo_errors;
4536 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
4537 dst->tx_window_errors = src->tx_window_errors;
4541 get_stats_via_netlink(const struct netdev *netdev_, struct netdev_stats *stats)
4543 struct ofpbuf request;
4544 struct ofpbuf *reply;
4547 ofpbuf_init(&request, 0);
4548 nl_msg_put_nlmsghdr(&request,
4549 sizeof(struct ifinfomsg) + NL_ATTR_SIZE(IFNAMSIZ),
4550 RTM_GETLINK, NLM_F_REQUEST);
4551 ofpbuf_put_zeros(&request, sizeof(struct ifinfomsg));
4552 nl_msg_put_string(&request, IFLA_IFNAME, netdev_get_name(netdev_));
4553 error = nl_transact(NETLINK_ROUTE, &request, &reply);
4554 ofpbuf_uninit(&request);
4559 if (ofpbuf_try_pull(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg))) {
4560 const struct nlattr *a = nl_attr_find(reply, 0, IFLA_STATS);
4561 if (a && nl_attr_get_size(a) >= sizeof(struct rtnl_link_stats)) {
4562 netdev_stats_from_rtnl_link_stats(stats, nl_attr_get(a));
4565 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
4569 VLOG_WARN_RL(&rl, "short RTM_GETLINK reply");
4574 ofpbuf_delete(reply);
4579 get_flags(const struct netdev *dev, unsigned int *flags)
4585 error = af_inet_ifreq_ioctl(dev->name, &ifr, SIOCGIFFLAGS, "SIOCGIFFLAGS");
4587 *flags = ifr.ifr_flags;
4593 set_flags(const char *name, unsigned int flags)
4597 ifr.ifr_flags = flags;
4598 return af_inet_ifreq_ioctl(name, &ifr, SIOCSIFFLAGS, "SIOCSIFFLAGS");
4602 do_get_ifindex(const char *netdev_name)
4607 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4608 COVERAGE_INC(netdev_get_ifindex);
4610 error = af_inet_ioctl(SIOCGIFINDEX, &ifr);
4612 VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
4613 netdev_name, ovs_strerror(error));
4616 return ifr.ifr_ifindex;
4620 get_ifindex(const struct netdev *netdev_, int *ifindexp)
4622 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4624 if (!(netdev->cache_valid & VALID_IFINDEX)) {
4625 int ifindex = do_get_ifindex(netdev_get_name(netdev_));
4628 netdev->get_ifindex_error = -ifindex;
4629 netdev->ifindex = 0;
4631 netdev->get_ifindex_error = 0;
4632 netdev->ifindex = ifindex;
4634 netdev->cache_valid |= VALID_IFINDEX;
4637 *ifindexp = netdev->ifindex;
4638 return netdev->get_ifindex_error;
4642 get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN])
4648 memset(&ifr, 0, sizeof ifr);
4649 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4650 COVERAGE_INC(netdev_get_hwaddr);
4651 error = af_inet_ioctl(SIOCGIFHWADDR, &ifr);
4653 /* ENODEV probably means that a vif disappeared asynchronously and
4654 * hasn't been removed from the database yet, so reduce the log level
4655 * to INFO for that case. */
4656 VLOG(error == ENODEV ? VLL_INFO : VLL_ERR,
4657 "ioctl(SIOCGIFHWADDR) on %s device failed: %s",
4658 netdev_name, ovs_strerror(error));
4661 hwaddr_family = ifr.ifr_hwaddr.sa_family;
4662 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER) {
4663 VLOG_WARN("%s device has unknown hardware address family %d",
4664 netdev_name, hwaddr_family);
4666 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
4671 set_etheraddr(const char *netdev_name,
4672 const uint8_t mac[ETH_ADDR_LEN])
4677 memset(&ifr, 0, sizeof ifr);
4678 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4679 ifr.ifr_hwaddr.sa_family = ARPHRD_ETHER;
4680 memcpy(ifr.ifr_hwaddr.sa_data, mac, ETH_ADDR_LEN);
4681 COVERAGE_INC(netdev_set_hwaddr);
4682 error = af_inet_ioctl(SIOCSIFHWADDR, &ifr);
4684 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
4685 netdev_name, ovs_strerror(error));
4691 netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
4692 int cmd, const char *cmd_name)
4697 memset(&ifr, 0, sizeof ifr);
4698 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
4699 ifr.ifr_data = (caddr_t) ecmd;
4702 error = af_inet_ioctl(SIOCETHTOOL, &ifr);
4704 if (error != EOPNOTSUPP) {
4705 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
4706 "failed: %s", cmd_name, name, ovs_strerror(error));
4708 /* The device doesn't support this operation. That's pretty
4709 * common, so there's no point in logging anything. */
4716 netdev_linux_get_ipv4(const struct netdev *netdev, struct in_addr *ip,
4717 int cmd, const char *cmd_name)
4722 ifr.ifr_addr.sa_family = AF_INET;
4723 error = af_inet_ifreq_ioctl(netdev_get_name(netdev), &ifr, cmd, cmd_name);
4725 const struct sockaddr_in *sin = ALIGNED_CAST(struct sockaddr_in *,
4727 *ip = sin->sin_addr;
4732 /* Returns an AF_PACKET raw socket or a negative errno value. */
4734 af_packet_sock(void)
4736 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
4739 if (ovsthread_once_start(&once)) {
4740 sock = socket(AF_PACKET, SOCK_RAW, 0);
4742 int error = set_nonblocking(sock);
4749 VLOG_ERR("failed to create packet socket: %s",
4750 ovs_strerror(errno));
4752 ovsthread_once_done(&once);