2 * Copyright (c) 2009, 2010, 2011, 2012, 2013, 2014 Nicira, Inc.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
19 #include "netdev-linux.h"
23 #include <arpa/inet.h>
25 #include <linux/filter.h>
26 #include <linux/gen_stats.h>
27 #include <linux/if_ether.h>
28 #include <linux/if_tun.h>
29 #include <linux/types.h>
30 #include <linux/ethtool.h>
31 #include <linux/mii.h>
32 #include <linux/pkt_cls.h>
33 #include <linux/pkt_sched.h>
34 #include <linux/rtnetlink.h>
35 #include <linux/sockios.h>
36 #include <sys/types.h>
37 #include <sys/ioctl.h>
38 #include <sys/socket.h>
39 #include <netpacket/packet.h>
41 #include <net/if_arp.h>
42 #include <net/if_packet.h>
43 #include <net/route.h>
44 #include <netinet/in.h>
50 #include "connectivity.h"
52 #include "dpif-linux.h"
53 #include "dynamic-string.h"
54 #include "fatal-signal.h"
57 #include "netdev-provider.h"
58 #include "netdev-vport.h"
59 #include "netlink-notifier.h"
60 #include "netlink-socket.h"
63 #include "openflow/openflow.h"
64 #include "ovs-atomic.h"
66 #include "poll-loop.h"
67 #include "rtnetlink-link.h"
70 #include "socket-util.h"
73 #include "unaligned.h"
76 VLOG_DEFINE_THIS_MODULE(netdev_linux);
78 COVERAGE_DEFINE(netdev_set_policing);
79 COVERAGE_DEFINE(netdev_arp_lookup);
80 COVERAGE_DEFINE(netdev_get_ifindex);
81 COVERAGE_DEFINE(netdev_get_hwaddr);
82 COVERAGE_DEFINE(netdev_set_hwaddr);
83 COVERAGE_DEFINE(netdev_get_ethtool);
84 COVERAGE_DEFINE(netdev_set_ethtool);
87 /* These were introduced in Linux 2.6.14, so they might be missing if we have
89 #ifndef ADVERTISED_Pause
90 #define ADVERTISED_Pause (1 << 13)
92 #ifndef ADVERTISED_Asym_Pause
93 #define ADVERTISED_Asym_Pause (1 << 14)
96 /* These were introduced in Linux 2.6.24, so they might be missing if we
97 * have old headers. */
98 #ifndef ETHTOOL_GFLAGS
99 #define ETHTOOL_GFLAGS 0x00000025 /* Get flags bitmap(ethtool_value) */
101 #ifndef ETHTOOL_SFLAGS
102 #define ETHTOOL_SFLAGS 0x00000026 /* Set flags bitmap(ethtool_value) */
105 /* This was introduced in Linux 2.6.25, so it might be missing if we have old
108 #define TC_RTAB_SIZE 1024
111 /* Linux 2.6.21 introduced struct tpacket_auxdata.
112 * Linux 2.6.27 added the tp_vlan_tci member.
113 * Linux 3.0 defined TP_STATUS_VLAN_VALID.
114 * Linux 3.13 repurposed a padding member for tp_vlan_tpid and defined
115 * TP_STATUS_VLAN_TPID_VALID.
117 * With all this churn it's easiest to unconditionally define a replacement
118 * structure that has everything we want.
120 #ifndef PACKET_AUXDATA
121 #define PACKET_AUXDATA 8
123 #ifndef TP_STATUS_VLAN_VALID
124 #define TP_STATUS_VLAN_VALID (1 << 4)
126 #ifndef TP_STATUS_VLAN_TPID_VALID
127 #define TP_STATUS_VLAN_TPID_VALID (1 << 6)
129 #undef tpacket_auxdata
130 #define tpacket_auxdata rpl_tpacket_auxdata
131 struct tpacket_auxdata {
137 uint16_t tp_vlan_tci;
138 uint16_t tp_vlan_tpid;
142 VALID_IFINDEX = 1 << 0,
143 VALID_ETHERADDR = 1 << 1,
147 VALID_POLICING = 1 << 5,
148 VALID_VPORT_STAT_ERROR = 1 << 6,
149 VALID_DRVINFO = 1 << 7,
150 VALID_FEATURES = 1 << 8,
153 /* Traffic control. */
155 /* An instance of a traffic control class. Always associated with a particular
158 * Each TC implementation subclasses this with whatever additional data it
161 const struct tc_ops *ops;
162 struct hmap queues; /* Contains "struct tc_queue"s.
163 * Read by generic TC layer.
164 * Written only by TC implementation. */
167 #define TC_INITIALIZER(TC, OPS) { OPS, HMAP_INITIALIZER(&(TC)->queues) }
169 /* One traffic control queue.
171 * Each TC implementation subclasses this with whatever additional data it
174 struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
175 unsigned int queue_id; /* OpenFlow queue ID. */
176 long long int created; /* Time queue was created, in msecs. */
179 /* A particular kind of traffic control. Each implementation generally maps to
180 * one particular Linux qdisc class.
182 * The functions below return 0 if successful or a positive errno value on
183 * failure, except where otherwise noted. All of them must be provided, except
184 * where otherwise noted. */
186 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
187 * This is null for tc_ops_default and tc_ops_other, for which there are no
188 * appropriate values. */
189 const char *linux_name;
191 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
192 const char *ovs_name;
194 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
195 * queues. The queues are numbered 0 through n_queues - 1. */
196 unsigned int n_queues;
198 /* Called to install this TC class on 'netdev'. The implementation should
199 * make the Netlink calls required to set up 'netdev' with the right qdisc
200 * and configure it according to 'details'. The implementation may assume
201 * that the current qdisc is the default; that is, there is no need for it
202 * to delete the current qdisc before installing itself.
204 * The contents of 'details' should be documented as valid for 'ovs_name'
205 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
206 * (which is built as ovs-vswitchd.conf.db(8)).
208 * This function must return 0 if and only if it sets 'netdev->tc' to an
209 * initialized 'struct tc'.
211 * (This function is null for tc_ops_other, which cannot be installed. For
212 * other TC classes it should always be nonnull.) */
213 int (*tc_install)(struct netdev *netdev, const struct smap *details);
215 /* Called when the netdev code determines (through a Netlink query) that
216 * this TC class's qdisc is installed on 'netdev', but we didn't install
217 * it ourselves and so don't know any of the details.
219 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
220 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
221 * implementation should parse the other attributes of 'nlmsg' as
222 * necessary to determine its configuration. If necessary it should also
223 * use Netlink queries to determine the configuration of queues on
226 * This function must return 0 if and only if it sets 'netdev->tc' to an
227 * initialized 'struct tc'. */
228 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
230 /* Destroys the data structures allocated by the implementation as part of
231 * 'tc'. (This includes destroying 'tc->queues' by calling
234 * The implementation should not need to perform any Netlink calls. If
235 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
236 * (But it may not be desirable.)
238 * This function may be null if 'tc' is trivial. */
239 void (*tc_destroy)(struct tc *tc);
241 /* Retrieves details of 'netdev->tc' configuration into 'details'.
243 * The implementation should not need to perform any Netlink calls, because
244 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
245 * cached the configuration.
247 * The contents of 'details' should be documented as valid for 'ovs_name'
248 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
249 * (which is built as ovs-vswitchd.conf.db(8)).
251 * This function may be null if 'tc' is not configurable.
253 int (*qdisc_get)(const struct netdev *netdev, struct smap *details);
255 /* Reconfigures 'netdev->tc' according to 'details', performing any
256 * required Netlink calls to complete the reconfiguration.
258 * The contents of 'details' should be documented as valid for 'ovs_name'
259 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
260 * (which is built as ovs-vswitchd.conf.db(8)).
262 * This function may be null if 'tc' is not configurable.
264 int (*qdisc_set)(struct netdev *, const struct smap *details);
266 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
267 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
269 * The contents of 'details' should be documented as valid for 'ovs_name'
270 * in the "other_config" column in the "Queue" table in
271 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
273 * The implementation should not need to perform any Netlink calls, because
274 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
275 * cached the queue configuration.
277 * This function may be null if 'tc' does not have queues ('n_queues' is
279 int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
280 struct smap *details);
282 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
283 * 'details', perfoming any required Netlink calls to complete the
284 * reconfiguration. The caller ensures that 'queue_id' is less than
287 * The contents of 'details' should be documented as valid for 'ovs_name'
288 * in the "other_config" column in the "Queue" table in
289 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
291 * This function may be null if 'tc' does not have queues or its queues are
292 * not configurable. */
293 int (*class_set)(struct netdev *, unsigned int queue_id,
294 const struct smap *details);
296 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
297 * tc_queue's within 'netdev->tc->queues'.
299 * This function may be null if 'tc' does not have queues or its queues
300 * cannot be deleted. */
301 int (*class_delete)(struct netdev *, struct tc_queue *queue);
303 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
304 * 'struct tc_queue's within 'netdev->tc->queues'.
306 * On success, initializes '*stats'.
308 * This function may be null if 'tc' does not have queues or if it cannot
309 * report queue statistics. */
310 int (*class_get_stats)(const struct netdev *netdev,
311 const struct tc_queue *queue,
312 struct netdev_queue_stats *stats);
314 /* Extracts queue stats from 'nlmsg', which is a response to a
315 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
317 * This function may be null if 'tc' does not have queues or if it cannot
318 * report queue statistics. */
319 int (*class_dump_stats)(const struct netdev *netdev,
320 const struct ofpbuf *nlmsg,
321 netdev_dump_queue_stats_cb *cb, void *aux);
/* Initializes 'tc' as an instance of TC implementation 'ops', with an
 * initially empty "queues" hmap (read by the generic TC layer, written only
 * by the TC implementation).  NOTE(review): extracted chunk — the line that
 * stores 'ops' into 'tc' is not visible here. */
325 tc_init(struct tc *tc, const struct tc_ops *ops)
328 hmap_init(&tc->queues);
/* Releases the generic-layer resources of 'tc': destroys the "queues" hmap.
 * Per the tc_ops contract above, callers invoke the implementation's
 * tc_destroy hook, which in turn is responsible for emptying the hmap. */
332 tc_destroy(struct tc *tc)
334 hmap_destroy(&tc->queues);
337 static const struct tc_ops tc_ops_htb;
338 static const struct tc_ops tc_ops_hfsc;
339 static const struct tc_ops tc_ops_default;
340 static const struct tc_ops tc_ops_other;
342 static const struct tc_ops *const tcs[] = {
343 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
344 &tc_ops_hfsc, /* Hierarchical fair service curve. */
345 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
346 &tc_ops_other, /* Some other qdisc. */
350 static unsigned int tc_make_handle(unsigned int major, unsigned int minor);
351 static unsigned int tc_get_major(unsigned int handle);
352 static unsigned int tc_get_minor(unsigned int handle);
354 static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
355 static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
356 static unsigned int tc_buffer_per_jiffy(unsigned int rate);
358 static struct tcmsg *tc_make_request(const struct netdev *, int type,
359 unsigned int flags, struct ofpbuf *);
360 static int tc_transact(struct ofpbuf *request, struct ofpbuf **replyp);
361 static int tc_add_del_ingress_qdisc(struct netdev *netdev, bool add);
362 static int tc_add_policer(struct netdev *netdev, int kbits_rate,
365 static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
366 struct nlattr **options);
367 static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
368 struct nlattr **options,
369 struct netdev_queue_stats *);
370 static int tc_query_class(const struct netdev *,
371 unsigned int handle, unsigned int parent,
372 struct ofpbuf **replyp);
373 static int tc_delete_class(const struct netdev *, unsigned int handle);
375 static int tc_del_qdisc(struct netdev *netdev);
376 static int tc_query_qdisc(const struct netdev *netdev);
378 static int tc_calc_cell_log(unsigned int mtu);
379 static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
380 static void tc_put_rtab(struct ofpbuf *, uint16_t type,
381 const struct tc_ratespec *rate);
382 static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
384 struct netdev_linux {
387 /* Protects all members below. */
388 struct ovs_mutex mutex;
390 unsigned int cache_valid;
392 bool miimon; /* Link status of last poll. */
393 long long int miimon_interval; /* Miimon Poll rate. Disabled if <= 0. */
394 struct timer miimon_timer;
396 /* The following are figured out "on demand" only. They are only valid
397 * when the corresponding VALID_* bit in 'cache_valid' is set. */
399 uint8_t etheraddr[ETH_ADDR_LEN];
400 struct in_addr address, netmask;
403 unsigned int ifi_flags;
404 long long int carrier_resets;
405 uint32_t kbits_rate; /* Policing data. */
406 uint32_t kbits_burst;
407 int vport_stats_error; /* Cached error code from vport_get_stats().
408 0 or an errno value. */
409 int netdev_mtu_error; /* Cached error code from SIOCGIFMTU or SIOCSIFMTU. */
410 int ether_addr_error; /* Cached error code from set/get etheraddr. */
411 int netdev_policing_error; /* Cached error code from set policing. */
412 int get_features_error; /* Cached error code from ETHTOOL_GSET. */
413 int get_ifindex_error; /* Cached error code from SIOCGIFINDEX. */
415 enum netdev_features current; /* Cached from ETHTOOL_GSET. */
416 enum netdev_features advertised; /* Cached from ETHTOOL_GSET. */
417 enum netdev_features supported; /* Cached from ETHTOOL_GSET. */
419 struct ethtool_drvinfo drvinfo; /* Cached from ETHTOOL_GDRVINFO. */
422 /* For devices of class netdev_tap_class only. */
426 struct netdev_rx_linux {
432 /* This is set pretty low because we probably won't learn anything from the
433 * additional log messages. */
434 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
436 /* Polling miimon status for all ports causes performance degradation when
437 * handling a large number of ports. If there are no devices using miimon, then
438 * we skip netdev_linux_miimon_run() and netdev_linux_miimon_wait(). */
439 static atomic_int miimon_cnt = ATOMIC_VAR_INIT(0);
441 static void netdev_linux_run(void);
443 static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
444 int cmd, const char *cmd_name);
445 static int netdev_linux_get_ipv4(const struct netdev *, struct in_addr *,
446 int cmd, const char *cmd_name);
447 static int get_flags(const struct netdev *, unsigned int *flags);
448 static int set_flags(const char *, unsigned int flags);
449 static int update_flags(struct netdev_linux *netdev, enum netdev_flags off,
450 enum netdev_flags on, enum netdev_flags *old_flagsp)
451 OVS_REQUIRES(netdev->mutex);
452 static int do_get_ifindex(const char *netdev_name);
453 static int get_ifindex(const struct netdev *, int *ifindexp);
454 static int do_set_addr(struct netdev *netdev,
455 int ioctl_nr, const char *ioctl_name,
456 struct in_addr addr);
457 static int get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN]);
458 static int set_etheraddr(const char *netdev_name, const uint8_t[ETH_ADDR_LEN]);
459 static int get_stats_via_netlink(const struct netdev *, struct netdev_stats *);
460 static int af_packet_sock(void);
461 static bool netdev_linux_miimon_enabled(void);
462 static void netdev_linux_miimon_run(void);
463 static void netdev_linux_miimon_wait(void);
/* Returns true if 'netdev_class' is one of the netdev-linux classes: they all
 * share netdev_linux_run() as their 'run' callback, so pointer comparison on
 * that member identifies them. */
466 is_netdev_linux_class(const struct netdev_class *netdev_class)
468 return netdev_class->run == netdev_linux_run;
/* Returns true if 'netdev' is a tap device (class netdev_tap_class). */
472 is_tap_netdev(const struct netdev *netdev)
474 return netdev_get_class(netdev) == &netdev_tap_class;
/* Downcasts 'netdev' to its containing struct netdev_linux.  Asserts that the
 * device really belongs to a netdev-linux class before the CONTAINER_OF. */
477 static struct netdev_linux *
478 netdev_linux_cast(const struct netdev *netdev)
480 ovs_assert(is_netdev_linux_class(netdev_get_class(netdev)));
482 return CONTAINER_OF(netdev, struct netdev_linux, up);
/* Downcasts 'rx' to its containing struct netdev_rx_linux, asserting that the
 * underlying netdev belongs to a netdev-linux class. */
485 static struct netdev_rx_linux *
486 netdev_rx_linux_cast(const struct netdev_rx *rx)
488 ovs_assert(is_netdev_linux_class(netdev_get_class(rx->netdev)));
489 return CONTAINER_OF(rx, struct netdev_rx_linux, up);
492 static void netdev_linux_update(struct netdev_linux *netdev,
493 const struct rtnetlink_link_change *)
494 OVS_REQUIRES(netdev->mutex);
495 static void netdev_linux_changed(struct netdev_linux *netdev,
496 unsigned int ifi_flags, unsigned int mask)
497 OVS_REQUIRES(netdev->mutex);
499 /* Returns a NETLINK_ROUTE socket listening for RTNLGRP_LINK changes, or NULL
500 * if no such socket could be created. */
501 static struct nl_sock *
502 netdev_linux_notify_sock(void)
504 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
505 static struct nl_sock *sock;
/* One-time initialization: create the socket and join the link multicast
 * group; on join failure the socket is destroyed (and, presumably, 'sock'
 * reset to NULL — the line is not visible in this chunk; verify). */
507 if (ovsthread_once_start(&once)) {
510 error = nl_sock_create(NETLINK_ROUTE, &sock);
512 error = nl_sock_join_mcgroup(sock, RTNLGRP_LINK);
514 nl_sock_destroy(sock);
518 ovsthread_once_done(&once);
/* Returns true if any device currently uses miimon polling.  Reads the global
 * atomic 'miimon_cnt' so that netdev_linux_run()/wait() can skip the miimon
 * path entirely when no device needs it (see comment at 'miimon_cnt'). */
525 netdev_linux_miimon_enabled(void)
529 atomic_read(&miimon_cnt, &miimon);
/* Periodic 'run' callback shared by all netdev-linux classes.  Runs miimon
 * polling if enabled, then drains the shared rtnetlink notification socket,
 * applying each parsed link change to the matching netdev.  On receive-queue
 * overflow (ENOBUFS) it falls back to refreshing the flags of every
 * netdev-linux device, since notifications may have been lost. */
534 netdev_linux_run(void)
536 struct nl_sock *sock;
539 if (netdev_linux_miimon_enabled()) {
540 netdev_linux_miimon_run();
543 sock = netdev_linux_notify_sock();
549 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
550 uint64_t buf_stub[4096 / 8];
553 ofpbuf_use_stub(&buf, buf_stub, sizeof buf_stub);
554 error = nl_sock_recv(sock, &buf, false);
556 struct rtnetlink_link_change change;
558 if (rtnetlink_link_parse(&buf, &change)) {
559 struct netdev *netdev_ = netdev_from_name(change.ifname);
560 if (netdev_ && is_netdev_linux_class(netdev_->netdev_class)) {
561 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
563 ovs_mutex_lock(&netdev->mutex);
564 netdev_linux_update(netdev, &change);
565 ovs_mutex_unlock(&netdev->mutex);
/* netdev_from_name() took a reference; drop it (handles NULL too). */
567 netdev_close(netdev_);
569 } else if (error == ENOBUFS) {
570 struct shash device_shash;
571 struct shash_node *node;
575 shash_init(&device_shash);
576 netdev_get_devices(&netdev_linux_class, &device_shash);
577 SHASH_FOR_EACH (node, &device_shash) {
578 struct netdev *netdev_ = node->data;
579 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
582 ovs_mutex_lock(&netdev->mutex);
/* Re-read flags from the kernel; mask 0 invalidates all cached state. */
583 get_flags(netdev_, &flags);
584 netdev_linux_changed(netdev, flags, 0);
585 ovs_mutex_unlock(&netdev->mutex);
587 netdev_close(netdev_);
589 shash_destroy(&device_shash);
590 } else if (error != EAGAIN) {
591 VLOG_WARN_RL(&rl, "error reading or parsing netlink (%s)",
592 ovs_strerror(error));
/* 'wait' callback: arranges for poll_block() to wake when the miimon timer
 * expires (if enabled) or when the rtnetlink notification socket is readable,
 * so netdev_linux_run() gets called again. */
599 netdev_linux_wait(void)
601 struct nl_sock *sock;
603 if (netdev_linux_miimon_enabled()) {
604 netdev_linux_miimon_wait();
606 sock = netdev_linux_notify_sock();
608 nl_sock_wait(sock, POLLIN);
/* Records a state change on 'dev': bumps the global connectivity sequence,
 * counts a carrier reset if IFF_RUNNING toggled, stores the new interface
 * flags, and keeps only the cached fields selected by 'mask' (so mask 0
 * invalidates the whole 'cache_valid' bitmap).  Caller holds dev->mutex. */
613 netdev_linux_changed(struct netdev_linux *dev,
614 unsigned int ifi_flags, unsigned int mask)
615 OVS_REQUIRES(dev->mutex)
617 seq_change(connectivity_seq_get());
619 if ((dev->ifi_flags ^ ifi_flags) & IFF_RUNNING) {
620 dev->carrier_resets++;
622 dev->ifi_flags = ifi_flags;
624 dev->cache_valid &= mask;
/* Applies a parsed rtnetlink link-change message to 'dev'.  For RTM_NEWLINK,
 * refreshes MTU, Ethernet address and ifindex directly from the message and
 * marks them valid (keeping only VALID_DRVINFO of the old cache); otherwise
 * (presumably RTM_DELLINK — the branch header is not visible in this chunk)
 * it invalidates the entire cache.  Caller holds dev->mutex. */
628 netdev_linux_update(struct netdev_linux *dev,
629 const struct rtnetlink_link_change *change)
630 OVS_REQUIRES(dev->mutex)
632 if (change->nlmsg_type == RTM_NEWLINK) {
634 netdev_linux_changed(dev, change->ifi_flags, VALID_DRVINFO);
636 /* Update netdev from rtnl-change msg. */
638 dev->mtu = change->mtu;
639 dev->cache_valid |= VALID_MTU;
640 dev->netdev_mtu_error = 0;
/* An all-zeros address in the message means "not reported"; keep cache. */
643 if (!eth_addr_is_zero(change->addr)) {
644 memcpy(dev->etheraddr, change->addr, ETH_ADDR_LEN);
645 dev->cache_valid |= VALID_ETHERADDR;
646 dev->ether_addr_error = 0;
649 dev->ifindex = change->ifi_index;
650 dev->cache_valid |= VALID_IFINDEX;
651 dev->get_ifindex_error = 0;
654 netdev_linux_changed(dev, change->ifi_flags, 0);
/* 'alloc' callback: zero-allocates a struct netdev_linux and returns its
 * embedded generic netdev ('up'); construction happens separately. */
658 static struct netdev *
659 netdev_linux_alloc(void)
661 struct netdev_linux *netdev = xzalloc(sizeof *netdev);
/* Construction steps shared by system, internal, and tap devices:
 * initializes the per-device mutex that guards all cached state. */
666 netdev_linux_common_construct(struct netdev_linux *netdev)
668 ovs_mutex_init(&netdev->mutex);
671 /* Creates system and internal devices. */
673 netdev_linux_construct(struct netdev *netdev_)
675 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
678 netdev_linux_common_construct(netdev);
/* Probe the kernel for the device's flags; ENODEV distinguishes a device
 * that does not (yet) exist. */
680 error = get_flags(&netdev->up, &netdev->ifi_flags);
681 if (error == ENODEV) {
682 if (netdev->up.netdev_class != &netdev_internal_class) {
683 /* The device does not exist, so don't allow it to be opened. */
686 /* "Internal" netdevs have to be created as netdev objects before
687 * they exist in the kernel, because creating them in the kernel
688 * happens by passing a netdev object to dpif_port_add().
689 * Therefore, ignore the error. */
696 /* For most types of netdevs we open the device for each call of
697 * netdev_open(). However, this is not the case with tap devices,
698 * since it is only possible to open the device once. In this
699 * situation we share a single file descriptor, and consequently
700 * buffers, across all readers. Therefore once data is read it will
701 * be unavailable to other reads for tap devices. */
703 netdev_linux_construct_tap(struct netdev *netdev_)
705 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
706 static const char tap_dev[] = "/dev/net/tun";
707 const char *name = netdev_->name;
711 netdev_linux_common_construct(netdev);
713 /* Open tap device. */
714 netdev->tap_fd = open(tap_dev, O_RDWR);
715 if (netdev->tap_fd < 0) {
717 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, ovs_strerror(error));
721 /* Create tap device. */
/* IFF_NO_PI: raw frames without the kernel's packet-information header. */
722 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
723 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
724 if (ioctl(netdev->tap_fd, TUNSETIFF, &ifr) == -1) {
725 VLOG_WARN("%s: creating tap device failed: %s", name,
726 ovs_strerror(errno));
731 /* Make non-blocking. */
732 error = set_nonblocking(netdev->tap_fd);
/* Error path: release the tap fd before returning (labels not visible in
 * this extracted chunk). */
740 close(netdev->tap_fd);
/* 'destruct' callback: tears down any installed traffic-control state,
 * closes the shared tap fd for tap devices, decrements the global miimon
 * user count if this device used miimon, and destroys the mutex. */
745 netdev_linux_destruct(struct netdev *netdev_)
747 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
749 if (netdev->tc && netdev->tc->ops->tc_destroy) {
750 netdev->tc->ops->tc_destroy(netdev->tc);
753 if (netdev_get_class(netdev_) == &netdev_tap_class
754 && netdev->tap_fd >= 0)
756 close(netdev->tap_fd);
759 if (netdev->miimon_interval > 0) {
761 atomic_sub(&miimon_cnt, 1, &junk);
764 ovs_mutex_destroy(&netdev->mutex);
/* 'dealloc' callback: frees the struct netdev_linux allocated by
 * netdev_linux_alloc() (the free itself is outside this extracted view). */
768 netdev_linux_dealloc(struct netdev *netdev_)
770 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
/* 'rx_alloc' callback: zero-allocates a struct netdev_rx_linux and returns
 * its embedded generic rx handle ('up'). */
774 static struct netdev_rx *
775 netdev_linux_rx_alloc(void)
777 struct netdev_rx_linux *rx = xzalloc(sizeof *rx);
/* 'rx_construct' callback.  For tap devices, reuses the device's shared tap
 * fd.  Otherwise creates a raw AF_PACKET socket: enables PACKET_AUXDATA (to
 * recover stripped VLAN tags), makes it non-blocking, binds it to the
 * device's ifindex, and attaches a BPF filter that accepts only inbound
 * packets (outbound ones would otherwise be looped back to us). */
782 netdev_linux_rx_construct(struct netdev_rx *rx_)
784 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
785 struct netdev *netdev_ = rx->up.netdev;
786 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
789 ovs_mutex_lock(&netdev->mutex);
790 rx->is_tap = is_tap_netdev(netdev_);
792 rx->fd = netdev->tap_fd;
794 struct sockaddr_ll sll;
796 /* Result of tcpdump -dd inbound */
797 static const struct sock_filter filt[] = {
798 { 0x28, 0, 0, 0xfffff004 }, /* ldh [0] */
799 { 0x15, 0, 1, 0x00000004 }, /* jeq #4 jt 2 jf 3 */
800 { 0x6, 0, 0, 0x00000000 }, /* ret #0 */
801 { 0x6, 0, 0, 0x0000ffff } /* ret #65535 */
803 static const struct sock_fprog fprog = {
804 ARRAY_SIZE(filt), (struct sock_filter *) filt
807 /* Create file descriptor. */
808 rx->fd = socket(PF_PACKET, SOCK_RAW, 0);
811 VLOG_ERR("failed to create raw socket (%s)", ovs_strerror(error));
/* Request VLAN TCI delivery via recvmsg() ancillary data. */
816 if (setsockopt(rx->fd, SOL_PACKET, PACKET_AUXDATA, &val, sizeof val)) {
818 VLOG_ERR("%s: failed to mark socket for auxdata (%s)",
819 netdev_get_name(netdev_), ovs_strerror(error));
823 /* Set non-blocking mode. */
824 error = set_nonblocking(rx->fd);
829 /* Get ethernet device index. */
830 error = get_ifindex(&netdev->up, &ifindex);
835 /* Bind to specific ethernet device. */
836 memset(&sll, 0, sizeof sll);
837 sll.sll_family = AF_PACKET;
838 sll.sll_ifindex = ifindex;
839 sll.sll_protocol = htons(ETH_P_ALL);
840 if (bind(rx->fd, (struct sockaddr *) &sll, sizeof sll) < 0) {
842 VLOG_ERR("%s: failed to bind raw socket (%s)",
843 netdev_get_name(netdev_), ovs_strerror(error));
847 /* Filter for only inbound packets. */
848 error = setsockopt(rx->fd, SOL_SOCKET, SO_ATTACH_FILTER, &fprog,
852 VLOG_ERR("%s: failed to attach filter (%s)",
853 netdev_get_name(netdev_), ovs_strerror(error));
/* Success path unlock; the second unlock below is the error path (the
 * labels between them are not visible in this extracted chunk). */
857 ovs_mutex_unlock(&netdev->mutex);
865 ovs_mutex_unlock(&netdev->mutex);
/* 'rx_destruct' callback: releases the rx fd for non-tap devices (the tap fd
 * is shared with the netdev and closed in netdev_linux_destruct()). */
870 netdev_linux_rx_destruct(struct netdev_rx *rx_)
872 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
/* 'rx_dealloc' callback: frees the struct netdev_rx_linux allocated by
 * netdev_linux_rx_alloc(). */
880 netdev_linux_rx_dealloc(struct netdev_rx *rx_)
882 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
/* Returns the VLAN TPID (network byte order) to use when re-inserting the
 * tag the kernel stripped: the kernel-reported TPID when the auxdata flags
 * say it is valid (Linux >= 3.13), otherwise the default 802.1Q ethertype. */
888 auxdata_to_vlan_tpid(const struct tpacket_auxdata *aux)
890 if (aux->tp_status & TP_STATUS_VLAN_TPID_VALID) {
891 return htons(aux->tp_vlan_tpid);
893 return htons(ETH_TYPE_VLAN);
/* Returns true if 'aux' carries a VLAN TCI.  A nonzero TCI alone suffices on
 * pre-3.0 kernels that lack TP_STATUS_VLAN_VALID; on newer kernels the status
 * bit also covers the legitimate all-zero TCI case. */
898 auxdata_has_vlan_tci(const struct tpacket_auxdata *aux)
900 return aux->tp_vlan_tci || aux->tp_status & TP_STATUS_VLAN_VALID;
/* Receives one packet from AF_PACKET socket 'fd' into 'buffer' via
 * recvmsg(), then scans the PACKET_AUXDATA control messages and, if the
 * kernel stripped a VLAN tag, pushes it back onto the frame (headroom for
 * one tag is reserved up front).  Returns 0 or a positive errno value
 * (EMSGSIZE when the packet was truncated — see the MSG_TRUNC check). */
904 netdev_linux_rx_recv_sock(int fd, struct ofpbuf *buffer)
909 struct cmsghdr *cmsg;
912 char buffer[CMSG_SPACE(sizeof(struct tpacket_auxdata))];
916 /* Reserve headroom for a single VLAN tag */
917 ofpbuf_reserve(buffer, VLAN_HEADER_LEN);
918 size = ofpbuf_tailroom(buffer);
920 iov.iov_base = buffer->data;
922 msgh.msg_name = NULL;
923 msgh.msg_namelen = 0;
926 msgh.msg_control = &cmsg_buffer;
927 msgh.msg_controllen = sizeof cmsg_buffer;
/* Retry recvmsg() on EINTR; MSG_TRUNC makes retval the full packet length
 * so truncation (retval > size) is detectable. */
931 retval = recvmsg(fd, &msgh, MSG_TRUNC);
932 } while (retval < 0 && errno == EINTR);
936 } else if (retval > size) {
940 buffer->size += retval;
942 for (cmsg = CMSG_FIRSTHDR(&msgh); cmsg; cmsg = CMSG_NXTHDR(&msgh, cmsg)) {
943 const struct tpacket_auxdata *aux;
945 if (cmsg->cmsg_level != SOL_PACKET
946 || cmsg->cmsg_type != PACKET_AUXDATA
947 || cmsg->cmsg_len < CMSG_LEN(sizeof(struct tpacket_auxdata))) {
951 aux = ALIGNED_CAST(struct tpacket_auxdata *, CMSG_DATA(cmsg));
952 if (auxdata_has_vlan_tci(aux)) {
/* Too short to even hold an Ethernet header: don't push a tag. */
953 if (retval < ETH_HEADER_LEN) {
957 eth_push_vlan(buffer, auxdata_to_vlan_tpid(aux),
958 htons(aux->tp_vlan_tci));
/* Receives one packet from tap fd 'fd' into 'buffer' with read(), retrying
 * on EINTR.  Returns 0 or a positive errno value; a read longer than the
 * available tailroom is reported as an error (retval > size branch). */
967 netdev_linux_rx_recv_tap(int fd, struct ofpbuf *buffer)
970 size_t size = ofpbuf_tailroom(buffer);
973 retval = read(fd, buffer->data, size);
974 } while (retval < 0 && errno == EINTR);
978 } else if (retval > size) {
982 buffer->size += retval;
/* Receives one packet on 'rx_' into 'buffer', dispatching to the tap or
 * AF_PACKET helper.  Both helpers return 0 or a positive errno value, which
 * is what the EAGAIN/EMSGSIZE tests below rely on.  Logs unexpected errors
 * (rate-limited). */
987 netdev_linux_rx_recv(struct netdev_rx *rx_, struct ofpbuf *buffer)
989 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
993 ? netdev_linux_rx_recv_tap(rx->fd, buffer)
994 : netdev_linux_rx_recv_sock(rx->fd, buffer));
995 if (retval && retval != EAGAIN && retval != EMSGSIZE) {
/* Fix: the format expects (device name, error string) but the arguments
 * were swapped; also use 'retval' — the errno value actually returned by
 * the helper — rather than possibly-stale 'errno'. */
996 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
997 netdev_rx_get_name(rx_), ovs_strerror(retval));
/* 'rx_wait' callback: wakes the poll loop when the rx fd becomes readable. */
1004 netdev_linux_rx_wait(struct netdev_rx *rx_)
1006 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
1007 poll_fd_wait(rx->fd, POLLIN);
/* 'rx_drain' callback: discards queued packets.  For tap fds, queries the
 * device's tx queue length via SIOCGIFTXQLEN and reads that many packets;
 * for AF_PACKET sockets, empties the socket receive buffer. */
1011 netdev_linux_rx_drain(struct netdev_rx *rx_)
1013 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
1016 int error = af_inet_ifreq_ioctl(netdev_rx_get_name(rx_), &ifr,
1017 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
1021 drain_fd(rx->fd, ifr.ifr_qlen);
1024 return drain_rcvbuf(rx->fd);
1028 /* Sends 'buffer' on 'netdev'. Returns 0 if successful, otherwise a positive
1029 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
1030 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
1031 * the packet is too big or too small to transmit on the device.
1033 * The caller retains ownership of 'buffer' in all cases.
1035 * The kernel maintains a packet transmission queue, so the caller is not
1036 * expected to do additional queuing of packets. */
1038 netdev_linux_send(struct netdev *netdev_, const void *data, size_t size)
1043 if (!is_tap_netdev(netdev_)) {
1044 /* Use our AF_PACKET socket to send to this device. */
1045 struct sockaddr_ll sll;
1051 sock = af_packet_sock();
1056 ifindex = netdev_get_ifindex(netdev_);
1061 /* We don't bother setting most fields in sockaddr_ll because the
1062 * kernel ignores them for SOCK_RAW. */
1063 memset(&sll, 0, sizeof sll);
1064 sll.sll_family = AF_PACKET;
1065 sll.sll_ifindex = ifindex;
1067 iov.iov_base = CONST_CAST(void *, data);
1070 msg.msg_name = &sll;
1071 msg.msg_namelen = sizeof sll;
1074 msg.msg_control = NULL;
1075 msg.msg_controllen = 0;
1078 retval = sendmsg(sock, &msg, 0);
1080 /* Use the tap fd to send to this device. This is essential for
1081 * tap devices, because packets sent to a tap device with an
1082 * AF_PACKET socket will loop back to be *received* again on the
1083 * tap device. This doesn't occur on other interface types
1084 * because we attach a socket filter to the rx socket. */
1085 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1087 retval = write(netdev->tap_fd, data, size);
1091 /* The Linux AF_PACKET implementation never blocks waiting for room
1092 * for packets, instead returning ENOBUFS. Translate this into
1093 * EAGAIN for the caller. */
1094 if (errno == ENOBUFS) {
1096 } else if (errno == EINTR) {
1098 } else if (errno != EAGAIN) {
1099 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
1100 netdev_get_name(netdev_), ovs_strerror(errno));
1103 } else if (retval != size) {
/* Fix: the format had a stray 'd' after the PRIuSIZE macro expansion
 * ("%"PRIuSIZE"d"), making the log read e.g. "42d bytes".  NOTE(review):
 * 'retval' is presumably ssize_t (sendmsg/write) but non-negative here;
 * PRIuSIZE matches the existing style — confirm retval's declared type. */
1104 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%"PRIuSIZE" bytes of "
1105 "%"PRIuSIZE") on %s", retval, size, netdev_get_name(netdev_));
1113 /* Registers with the poll loop to wake up from the next call to poll_block()
1114 * when the packet transmission queue has sufficient room to transmit a packet
1115 * with netdev_send().
1117 * The kernel maintains a packet transmission queue, so the client is not
1118 * expected to do additional queuing of packets. Thus, this function is
1119 * unlikely to ever be used. It is included for completeness. */
1121 netdev_linux_send_wait(struct netdev *netdev)
1123 if (is_tap_netdev(netdev)) {
1124 /* TAP device always accepts packets.*/
1125 poll_immediate_wake();
1129 /* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
1130 * otherwise a positive errno value. */
1132 netdev_linux_set_etheraddr(struct netdev *netdev_,
1133 const uint8_t mac[ETH_ADDR_LEN])
1135 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1136 enum netdev_flags old_flags = 0;
1139 ovs_mutex_lock(&netdev->mutex);
/* Short-circuit on a cached error or when the address already matches;
 * otherwise drop the stale cache entry before re-setting. */
1141 if (netdev->cache_valid & VALID_ETHERADDR) {
1142 error = netdev->ether_addr_error;
1143 if (error || eth_addr_equals(netdev->etheraddr, mac)) {
1146 netdev->cache_valid &= ~VALID_ETHERADDR;
1149 /* Tap devices must be brought down before setting the address. */
1150 if (is_tap_netdev(netdev_)) {
1151 update_flags(netdev, NETDEV_UP, 0, &old_flags);
1153 error = set_etheraddr(netdev_get_name(netdev_), mac);
/* Cache the result even for ENODEV so later calls fail fast. */
1154 if (!error || error == ENODEV) {
1155 netdev->ether_addr_error = error;
1156 netdev->cache_valid |= VALID_ETHERADDR;
1158 memcpy(netdev->etheraddr, mac, ETH_ADDR_LEN);
/* Restore the tap device's UP flag if we lowered it above. */
1162 if (is_tap_netdev(netdev_) && old_flags & NETDEV_UP) {
1163 update_flags(netdev, 0, NETDEV_UP, &old_flags);
1167 ovs_mutex_unlock(&netdev->mutex);
1171 /* Copies 'netdev''s MAC address to 'mac' which is passed as param. */
1173 netdev_linux_get_etheraddr(const struct netdev *netdev_,
1174 uint8_t mac[ETH_ADDR_LEN])
1176 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1179 ovs_mutex_lock(&netdev->mutex);
/* Populate the cache on first use; both the address and the error code are
 * cached, so repeated failures don't re-query the kernel. */
1180 if (!(netdev->cache_valid & VALID_ETHERADDR)) {
1181 netdev->ether_addr_error = get_etheraddr(netdev_get_name(netdev_),
1183 netdev->cache_valid |= VALID_ETHERADDR;
1186 error = netdev->ether_addr_error;
1188 memcpy(mac, netdev->etheraddr, ETH_ADDR_LEN);
1190 ovs_mutex_unlock(&netdev->mutex);
/* Helper: stores 'netdev''s MTU in '*mtup', querying the kernel with
 * SIOCGIFMTU on a cache miss and caching both value and error code.
 * Caller holds netdev->mutex. */
1196 netdev_linux_get_mtu__(struct netdev_linux *netdev, int *mtup)
1200 if (!(netdev->cache_valid & VALID_MTU)) {
1203 netdev->netdev_mtu_error = af_inet_ifreq_ioctl(
1204 netdev_get_name(&netdev->up), &ifr, SIOCGIFMTU, "SIOCGIFMTU");
1205 netdev->mtu = ifr.ifr_mtu;
1206 netdev->cache_valid |= VALID_MTU;
1209 error = netdev->netdev_mtu_error;
1211 *mtup = netdev->mtu;
1217 /* Returns the maximum size of transmitted (and received) packets on 'netdev',
1218 * in bytes, not including the hardware header; thus, this is typically 1500
1219 * bytes for Ethernet devices.  Thread-safe wrapper around
 * netdev_linux_get_mtu__(). */
1221 netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
1223 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1226 ovs_mutex_lock(&netdev->mutex);
1227 error = netdev_linux_get_mtu__(netdev, mtup);
1228 ovs_mutex_unlock(&netdev->mutex);
1233 /* Sets the maximum size of transmitted (MTU) for given device using linux
1234 * networking ioctl interface (SIOCSIFMTU).  Returns 0 on success or a
 * positive errno value. */
1237 netdev_linux_set_mtu(const struct netdev *netdev_, int mtu)
1239 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1243 ovs_mutex_lock(&netdev->mutex);
/* Skip the ioctl when the cached MTU already matches, or a prior MTU
 * operation failed (the cached errno is returned instead). */
1244 if (netdev->cache_valid & VALID_MTU) {
1245 error = netdev->netdev_mtu_error;
1246 if (error || netdev->mtu == mtu) {
1249 netdev->cache_valid &= ~VALID_MTU;
1252 error = af_inet_ifreq_ioctl(netdev_get_name(netdev_), &ifr,
1253 SIOCSIFMTU, "SIOCSIFMTU");
/* Cache the result; ENODEV is cached too so a missing device does not
 * trigger repeated ioctls. */
1254 if (!error || error == ENODEV) {
1255 netdev->netdev_mtu_error = error;
1256 netdev->mtu = ifr.ifr_mtu;
1257 netdev->cache_valid |= VALID_MTU;
1260 ovs_mutex_unlock(&netdev->mutex);
1264 /* Returns the ifindex of 'netdev', if successful, as a positive number.
1265 * On failure, returns a negative errno value. */
1267 netdev_linux_get_ifindex(const struct netdev *netdev_)
1269 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1272 ovs_mutex_lock(&netdev->mutex);
1273 error = get_ifindex(netdev_, &ifindex);
1274 ovs_mutex_unlock(&netdev->mutex);
/* Encode errors as negative values so a single int carries both cases. */
1276 return error ? -error : ifindex;
/* Stores 'netdev's link state into '*carrier'.  When MII monitoring is
 * enabled (miimon_interval > 0) the periodically-polled MII result is
 * authoritative; otherwise the kernel's IFF_RUNNING flag is used. */
1280 netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
1282 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1284 ovs_mutex_lock(&netdev->mutex);
1285 if (netdev->miimon_interval > 0) {
1286 *carrier = netdev->miimon;
1288 *carrier = (netdev->ifi_flags & IFF_RUNNING) != 0;
1290 ovs_mutex_unlock(&netdev->mutex);
/* Returns the number of carrier (link up/down) transitions recorded for
 * 'netdev_' since it was opened. */
1295 static long long int
1296 netdev_linux_get_carrier_resets(const struct netdev *netdev_)
1298 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1299 long long int carrier_resets;
1301 ovs_mutex_lock(&netdev->mutex);
1302 carrier_resets = netdev->carrier_resets;
1303 ovs_mutex_unlock(&netdev->mutex);
1305 return carrier_resets;
/* Issues MII ioctl 'cmd' (e.g. SIOCGMIIPHY/SIOCGMIIREG) on device 'name',
 * marshalling '*data' in and out through ifr.ifr_data.  'cmd_name' is used
 * for error reporting by af_inet_ifreq_ioctl(). */
1309 netdev_linux_do_miimon(const char *name, int cmd, const char *cmd_name,
1310 struct mii_ioctl_data *data)
1315 memset(&ifr, 0, sizeof ifr);
/* The MII data is copied into the ifreq's data area rather than passed
 * by pointer — NOTE(review): this relies on sizeof *data fitting in
 * ifr.ifr_data's storage; confirm against full source. */
1316 memcpy(&ifr.ifr_data, data, sizeof *data);
1317 error = af_inet_ifreq_ioctl(name, &ifr, cmd, cmd_name);
1318 memcpy(data, &ifr.ifr_data, sizeof *data);
/* Queries link status for device 'name' into '*miimon'.  Tries MII
 * registers first (SIOCGMIIPHY + SIOCGMIIREG, reading BMSR_LSTATUS);
 * if MII is unsupported, falls back to ethtool's ETHTOOL_GLINK. */
1324 netdev_linux_get_miimon(const char *name, bool *miimon)
1326 struct mii_ioctl_data data;
1331 memset(&data, 0, sizeof data);
1332 error = netdev_linux_do_miimon(name, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
1334 /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
1335 data.reg_num = MII_BMSR;
1336 error = netdev_linux_do_miimon(name, SIOCGMIIREG, "SIOCGMIIREG",
1340 *miimon = !!(data.val_out & BMSR_LSTATUS);
1342 VLOG_WARN_RL(&rl, "%s: failed to query MII", name);
/* MII unavailable: fall back to ethtool link status. */
1345 struct ethtool_cmd ecmd;
1347 VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
1350 COVERAGE_INC(netdev_get_ethtool);
1351 memset(&ecmd, 0, sizeof ecmd);
1352 error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
1355 struct ethtool_value eval;
/* ETHTOOL_GLINK returns a struct ethtool_value in the ecmd buffer. */
1357 memcpy(&eval, &ecmd, sizeof eval);
1358 *miimon = !!eval.data;
1360 VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
/* Sets the MII monitoring interval for 'netdev_' to 'interval' ms.
 * A positive interval is clamped to at least 100 ms; zero or negative
 * disables monitoring.  Maintains the global 'miimon_cnt' count of
 * devices with monitoring enabled and forces an immediate poll by
 * expiring the miimon timer. */
1368 netdev_linux_set_miimon_interval(struct netdev *netdev_,
1369 long long int interval)
1371 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1373 ovs_mutex_lock(&netdev->mutex);
1374 interval = interval > 0 ? MAX(interval, 100) : 0;
1375 if (netdev->miimon_interval != interval) {
/* Track enable/disable transitions in the global counter. */
1378 if (interval && !netdev->miimon_interval) {
1379 atomic_add(&miimon_cnt, 1, &junk);
1380 } else if (!interval && netdev->miimon_interval) {
1381 atomic_sub(&miimon_cnt, 1, &junk);
1384 netdev->miimon_interval = interval;
1385 timer_set_expired(&netdev->miimon_timer);
1387 ovs_mutex_unlock(&netdev->mutex);
/* Periodic poll: for every open netdev-linux device with MII monitoring
 * enabled whose timer has expired, re-reads link status and, on a change,
 * records it and notifies via netdev_linux_changed(). */
1393 netdev_linux_miimon_run(void)
1395 struct shash device_shash;
1396 struct shash_node *node;
1398 shash_init(&device_shash);
1399 netdev_get_devices(&netdev_linux_class, &device_shash);
1400 SHASH_FOR_EACH (node, &device_shash) {
1401 struct netdev *netdev = node->data;
1402 struct netdev_linux *dev = netdev_linux_cast(netdev);
1405 ovs_mutex_lock(&dev->mutex);
1406 if (dev->miimon_interval > 0 && timer_expired(&dev->miimon_timer)) {
1407 netdev_linux_get_miimon(dev->up.name, &miimon);
1408 if (miimon != dev->miimon) {
1409 dev->miimon = miimon;
1410 netdev_linux_changed(dev, dev->ifi_flags, 0);
/* Re-arm the timer for the next poll. */
1413 timer_set_duration(&dev->miimon_timer, dev->miimon_interval);
1415 ovs_mutex_unlock(&dev->mutex);
/* netdev_get_devices() takes a reference on each device; drop it. */
1416 netdev_close(netdev);
1419 shash_destroy(&device_shash);
/* Arranges for poll_block() to wake when the next MII monitoring timer,
 * across all open netdev-linux devices, expires. */
1423 netdev_linux_miimon_wait(void)
1425 struct shash device_shash;
1426 struct shash_node *node;
1428 shash_init(&device_shash);
1429 netdev_get_devices(&netdev_linux_class, &device_shash);
1430 SHASH_FOR_EACH (node, &device_shash) {
1431 struct netdev *netdev = node->data;
1432 struct netdev_linux *dev = netdev_linux_cast(netdev);
1434 ovs_mutex_lock(&dev->mutex);
1435 if (dev->miimon_interval > 0) {
1436 timer_wait(&dev->miimon_timer);
1438 ovs_mutex_unlock(&dev->mutex);
/* Drop the reference taken by netdev_get_devices(). */
1439 netdev_close(netdev);
1441 shash_destroy(&device_shash);
/* Exchanges the values of '*a' and '*b'.  (Body elided from this view.) */
1445 swap_uint64(uint64_t *a, uint64_t *b)
1452 /* Copies 'src' into 'dst', performing format conversion in the process.
1454 * 'src' is allowed to be misaligned (hence get_unaligned_u64 accessors).
 * Fields that ovs_vport_stats does not track are zeroed. */
1456 netdev_stats_from_ovs_vport_stats(struct netdev_stats *dst,
1457 const struct ovs_vport_stats *src)
1459 dst->rx_packets = get_unaligned_u64(&src->rx_packets);
1460 dst->tx_packets = get_unaligned_u64(&src->tx_packets);
1461 dst->rx_bytes = get_unaligned_u64(&src->rx_bytes);
1462 dst->tx_bytes = get_unaligned_u64(&src->tx_bytes);
1463 dst->rx_errors = get_unaligned_u64(&src->rx_errors);
1464 dst->tx_errors = get_unaligned_u64(&src->tx_errors);
1465 dst->rx_dropped = get_unaligned_u64(&src->rx_dropped);
1466 dst->tx_dropped = get_unaligned_u64(&src->tx_dropped);
/* The vport layer does not report these detailed error counters. */
1468 dst->collisions = 0;
1469 dst->rx_length_errors = 0;
1470 dst->rx_over_errors = 0;
1471 dst->rx_crc_errors = 0;
1472 dst->rx_frame_errors = 0;
1473 dst->rx_fifo_errors = 0;
1474 dst->rx_missed_errors = 0;
1475 dst->tx_aborted_errors = 0;
1476 dst->tx_carrier_errors = 0;
1477 dst->tx_fifo_errors = 0;
1478 dst->tx_heartbeat_errors = 0;
1479 dst->tx_window_errors = 0;
/* Fetches stats for 'netdev' from the OVS datapath vport layer into
 * '*stats'.  Returns a positive errno if the vport lookup fails or the
 * reply carries no stats. */
1483 get_stats_via_vport__(const struct netdev *netdev, struct netdev_stats *stats)
1485 struct dpif_linux_vport reply;
1489 error = dpif_linux_vport_get(netdev_get_name(netdev), &reply, &buf);
1492 } else if (!reply.stats) {
/* Convert from the wire format (possibly unaligned) into netdev_stats. */
1497 netdev_stats_from_ovs_vport_stats(stats, reply.stats);
/* Caching wrapper around get_stats_via_vport__(): retries only while the
 * previous attempt succeeded or no attempt has been recorded yet, and
 * stores the outcome in netdev->vport_stats_error. */
1505 get_stats_via_vport(const struct netdev *netdev_,
1506 struct netdev_stats *stats)
1508 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1510 if (!netdev->vport_stats_error ||
1511 !(netdev->cache_valid & VALID_VPORT_STAT_ERROR)) {
1514 error = get_stats_via_vport__(netdev_, stats);
/* ENOENT just means the device is not attached to the datapath —
 * not worth a warning. */
1515 if (error && error != ENOENT) {
1516 VLOG_WARN_RL(&rl, "%s: obtaining netdev stats via vport failed "
1518 netdev_get_name(netdev_), ovs_strerror(error));
1520 netdev->vport_stats_error = error;
1521 netdev->cache_valid |= VALID_VPORT_STAT_ERROR;
1525 /* Retrieves current device stats for 'netdev-linux'.  Combines vport-layer
 * stats (when available) with kernel netlink stats: the netlink error
 * counters are added on top of the vport counters, or the netlink stats
 * are used alone when the vport layer has no data. */
1527 netdev_linux_get_stats(const struct netdev *netdev_,
1528 struct netdev_stats *stats)
1530 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1531 struct netdev_stats dev_stats;
1534 ovs_mutex_lock(&netdev->mutex);
1535 get_stats_via_vport(netdev_, stats);
1536 error = get_stats_via_netlink(netdev_, &dev_stats);
1538 if (!netdev->vport_stats_error) {
1541 } else if (netdev->vport_stats_error) {
1542 /* stats not available from OVS then use ioctl stats. */
/* Vport stats succeeded: fold the kernel's detailed error counters in. */
1545 stats->rx_errors += dev_stats.rx_errors;
1546 stats->tx_errors += dev_stats.tx_errors;
1547 stats->rx_dropped += dev_stats.rx_dropped;
1548 stats->tx_dropped += dev_stats.tx_dropped;
1549 stats->multicast += dev_stats.multicast;
1550 stats->collisions += dev_stats.collisions;
1551 stats->rx_length_errors += dev_stats.rx_length_errors;
1552 stats->rx_over_errors += dev_stats.rx_over_errors;
1553 stats->rx_crc_errors += dev_stats.rx_crc_errors;
1554 stats->rx_frame_errors += dev_stats.rx_frame_errors;
1555 stats->rx_fifo_errors += dev_stats.rx_fifo_errors;
1556 stats->rx_missed_errors += dev_stats.rx_missed_errors;
1557 stats->tx_aborted_errors += dev_stats.tx_aborted_errors;
1558 stats->tx_carrier_errors += dev_stats.tx_carrier_errors;
1559 stats->tx_fifo_errors += dev_stats.tx_fifo_errors;
1560 stats->tx_heartbeat_errors += dev_stats.tx_heartbeat_errors;
1561 stats->tx_window_errors += dev_stats.tx_window_errors;
1563 ovs_mutex_unlock(&netdev->mutex);
1568 /* Retrieves current device stats for 'netdev-tap' netdev or
1569 * netdev-internal.  Like netdev_linux_get_stats() but swaps rx/tx when
 * falling back to kernel stats, since a tap device's counters are from
 * the host's perspective rather than the switch's. */
1571 netdev_tap_get_stats(const struct netdev *netdev_, struct netdev_stats *stats)
1573 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1574 struct netdev_stats dev_stats;
1577 ovs_mutex_lock(&netdev->mutex);
1578 get_stats_via_vport(netdev_, stats);
1579 error = get_stats_via_netlink(netdev_, &dev_stats);
1581 if (!netdev->vport_stats_error) {
1584 } else if (netdev->vport_stats_error) {
1585 /* Transmit and receive stats will appear to be swapped relative to the
1586 * other ports since we are the one sending the data, not a remote
1587 * computer. For consistency, we swap them back here. This does not
1588 * apply if we are getting stats from the vport layer because it always
1589 * tracks stats from the perspective of the switch. */
1592 swap_uint64(&stats->rx_packets, &stats->tx_packets);
1593 swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
1594 swap_uint64(&stats->rx_errors, &stats->tx_errors);
1595 swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
/* Detailed per-direction error counters are meaningless once swapped. */
1596 stats->rx_length_errors = 0;
1597 stats->rx_over_errors = 0;
1598 stats->rx_crc_errors = 0;
1599 stats->rx_frame_errors = 0;
1600 stats->rx_fifo_errors = 0;
1601 stats->rx_missed_errors = 0;
1602 stats->tx_aborted_errors = 0;
1603 stats->tx_carrier_errors = 0;
1604 stats->tx_fifo_errors = 0;
1605 stats->tx_heartbeat_errors = 0;
1606 stats->tx_window_errors = 0;
/* Vport stats available: add kernel counters with rx/tx crossed over. */
1608 stats->rx_dropped += dev_stats.tx_dropped;
1609 stats->tx_dropped += dev_stats.rx_dropped;
1611 stats->rx_errors += dev_stats.tx_errors;
1612 stats->tx_errors += dev_stats.rx_errors;
1614 stats->multicast += dev_stats.multicast;
1615 stats->collisions += dev_stats.collisions;
1617 ovs_mutex_unlock(&netdev->mutex);
/* Retrieves stats for an internal device from the vport layer only;
 * returns the cached vport stats errno. */
1623 netdev_internal_get_stats(const struct netdev *netdev_,
1624 struct netdev_stats *stats)
1626 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1629 ovs_mutex_lock(&netdev->mutex);
1630 get_stats_via_vport(netdev_, stats);
1631 error = netdev->vport_stats_error;
1632 ovs_mutex_unlock(&netdev->mutex);
/* Pushes '*stats' down to the datapath vport layer for 'netdev' via an
 * OVS_VPORT_CMD_SET transaction. */
1638 netdev_internal_set_stats(struct netdev *netdev,
1639 const struct netdev_stats *stats)
1641 struct ovs_vport_stats vport_stats;
1642 struct dpif_linux_vport vport;
/* Translate netdev_stats into the subset ovs_vport_stats carries. */
1645 vport_stats.rx_packets = stats->rx_packets;
1646 vport_stats.tx_packets = stats->tx_packets;
1647 vport_stats.rx_bytes = stats->rx_bytes;
1648 vport_stats.tx_bytes = stats->tx_bytes;
1649 vport_stats.rx_errors = stats->rx_errors;
1650 vport_stats.tx_errors = stats->tx_errors;
1651 vport_stats.rx_dropped = stats->rx_dropped;
1652 vport_stats.tx_dropped = stats->tx_dropped;
1654 dpif_linux_vport_init(&vport);
1655 vport.cmd = OVS_VPORT_CMD_SET;
1656 vport.name = netdev_get_name(netdev);
1657 vport.stats = &vport_stats;
1659 err = dpif_linux_vport_transact(&vport, NULL, NULL);
1661 /* If the vport layer doesn't know about the device, that doesn't mean it
1662 * doesn't exist (after all were able to open it when netdev_open() was
1663 * called), it just means that it isn't attached and we'll be getting
1664 * stats a different way. */
1665 if (err == ENODEV) {
/* Reads link-mode features for 'netdev' via ETHTOOL_GSET and translates
 * them into NETDEV_F_* bitmaps cached in netdev->supported, ->advertised
 * and ->current.  Caches the result (including any error) under
 * VALID_FEATURES so the ethtool call happens at most once until the
 * cache is invalidated.  Caller must hold netdev->mutex. */
1673 netdev_linux_read_features(struct netdev_linux *netdev)
1675 struct ethtool_cmd ecmd;
1679 if (netdev->cache_valid & VALID_FEATURES) {
1683 COVERAGE_INC(netdev_get_ethtool);
1684 memset(&ecmd, 0, sizeof ecmd);
1685 error = netdev_linux_do_ethtool(netdev->up.name, &ecmd,
1686 ETHTOOL_GSET, "ETHTOOL_GSET");
1691 /* Supported features. */
1692 netdev->supported = 0;
1693 if (ecmd.supported & SUPPORTED_10baseT_Half) {
1694 netdev->supported |= NETDEV_F_10MB_HD;
1696 if (ecmd.supported & SUPPORTED_10baseT_Full) {
1697 netdev->supported |= NETDEV_F_10MB_FD;
1699 if (ecmd.supported & SUPPORTED_100baseT_Half) {
1700 netdev->supported |= NETDEV_F_100MB_HD;
1702 if (ecmd.supported & SUPPORTED_100baseT_Full) {
1703 netdev->supported |= NETDEV_F_100MB_FD;
1705 if (ecmd.supported & SUPPORTED_1000baseT_Half) {
1706 netdev->supported |= NETDEV_F_1GB_HD;
1708 if (ecmd.supported & SUPPORTED_1000baseT_Full) {
1709 netdev->supported |= NETDEV_F_1GB_FD;
1711 if (ecmd.supported & SUPPORTED_10000baseT_Full) {
1712 netdev->supported |= NETDEV_F_10GB_FD;
1714 if (ecmd.supported & SUPPORTED_TP) {
1715 netdev->supported |= NETDEV_F_COPPER;
1717 if (ecmd.supported & SUPPORTED_FIBRE) {
1718 netdev->supported |= NETDEV_F_FIBER;
1720 if (ecmd.supported & SUPPORTED_Autoneg) {
1721 netdev->supported |= NETDEV_F_AUTONEG;
1723 if (ecmd.supported & SUPPORTED_Pause) {
1724 netdev->supported |= NETDEV_F_PAUSE;
1726 if (ecmd.supported & SUPPORTED_Asym_Pause) {
1727 netdev->supported |= NETDEV_F_PAUSE_ASYM;
1730 /* Advertised features. */
1731 netdev->advertised = 0;
1732 if (ecmd.advertising & ADVERTISED_10baseT_Half) {
1733 netdev->advertised |= NETDEV_F_10MB_HD;
1735 if (ecmd.advertising & ADVERTISED_10baseT_Full) {
1736 netdev->advertised |= NETDEV_F_10MB_FD;
1738 if (ecmd.advertising & ADVERTISED_100baseT_Half) {
1739 netdev->advertised |= NETDEV_F_100MB_HD;
1741 if (ecmd.advertising & ADVERTISED_100baseT_Full) {
1742 netdev->advertised |= NETDEV_F_100MB_FD;
1744 if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
1745 netdev->advertised |= NETDEV_F_1GB_HD;
1747 if (ecmd.advertising & ADVERTISED_1000baseT_Full) {
1748 netdev->advertised |= NETDEV_F_1GB_FD;
1750 if (ecmd.advertising & ADVERTISED_10000baseT_Full) {
1751 netdev->advertised |= NETDEV_F_10GB_FD;
1753 if (ecmd.advertising & ADVERTISED_TP) {
1754 netdev->advertised |= NETDEV_F_COPPER;
1756 if (ecmd.advertising & ADVERTISED_FIBRE) {
1757 netdev->advertised |= NETDEV_F_FIBER;
1759 if (ecmd.advertising & ADVERTISED_Autoneg) {
1760 netdev->advertised |= NETDEV_F_AUTONEG;
1762 if (ecmd.advertising & ADVERTISED_Pause) {
1763 netdev->advertised |= NETDEV_F_PAUSE;
1765 if (ecmd.advertising & ADVERTISED_Asym_Pause) {
1766 netdev->advertised |= NETDEV_F_PAUSE_ASYM;
1769 /* Current settings: map the reported link speed/duplex to a single
 * NETDEV_F_* rate bit.  Speeds above 10G have no SPEED_* constant in
 * the headers this file targets, hence the raw numeric comparisons. */
1771 if (speed == SPEED_10) {
1772 netdev->current = ecmd.duplex ? NETDEV_F_10MB_FD : NETDEV_F_10MB_HD;
1773 } else if (speed == SPEED_100) {
1774 netdev->current = ecmd.duplex ? NETDEV_F_100MB_FD : NETDEV_F_100MB_HD;
1775 } else if (speed == SPEED_1000) {
1776 netdev->current = ecmd.duplex ? NETDEV_F_1GB_FD : NETDEV_F_1GB_HD;
1777 } else if (speed == SPEED_10000) {
1778 netdev->current = NETDEV_F_10GB_FD;
1779 } else if (speed == 40000) {
1780 netdev->current = NETDEV_F_40GB_FD;
1781 } else if (speed == 100000) {
1782 netdev->current = NETDEV_F_100GB_FD;
1783 } else if (speed == 1000000) {
1784 netdev->current = NETDEV_F_1TB_FD;
1786 netdev->current = 0;
1789 if (ecmd.port == PORT_TP) {
1790 netdev->current |= NETDEV_F_COPPER;
1791 } else if (ecmd.port == PORT_FIBRE) {
1792 netdev->current |= NETDEV_F_FIBER;
1796 netdev->current |= NETDEV_F_AUTONEG;
/* Cache the outcome, success or failure, so callers see a stable result. */
1800 netdev->cache_valid |= VALID_FEATURES;
1801 netdev->get_features_error = error;
1804 /* Stores the features supported by 'netdev' into of '*current', '*advertised',
1805 * '*supported', and '*peer'. Each value is a bitmap of NETDEV_* bits.
1806 * Returns 0 if successful, otherwise a positive errno value. */
1808 netdev_linux_get_features(const struct netdev *netdev_,
1809 enum netdev_features *current,
1810 enum netdev_features *advertised,
1811 enum netdev_features *supported,
1812 enum netdev_features *peer)
1814 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1817 ovs_mutex_lock(&netdev->mutex);
/* Refresh (or reuse) the cached ethtool feature bitmaps. */
1818 netdev_linux_read_features(netdev);
1819 if (!netdev->get_features_error) {
1820 *current = netdev->current;
1821 *advertised = netdev->advertised;
1822 *supported = netdev->supported;
1823 *peer = 0; /* XXX: peer features are not obtainable via ethtool. */
1825 error = netdev->get_features_error;
1826 ovs_mutex_unlock(&netdev->mutex);
1831 /* Set the features advertised by 'netdev' to 'advertise'.  Performs a
 * read-modify-write on the ethtool settings: ETHTOOL_GSET to fetch the
 * current state, then ETHTOOL_SSET with the translated advertising mask. */
1833 netdev_linux_set_advertisements(struct netdev *netdev_,
1834 enum netdev_features advertise)
1836 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1837 struct ethtool_cmd ecmd;
1840 ovs_mutex_lock(&netdev->mutex);
1842 COVERAGE_INC(netdev_get_ethtool);
1843 memset(&ecmd, 0, sizeof ecmd);
1844 error = netdev_linux_do_ethtool(netdev_get_name(netdev_), &ecmd,
1845 ETHTOOL_GSET, "ETHTOOL_GSET");
/* Translate NETDEV_F_* bits into the kernel's ADVERTISED_* mask. */
1850 ecmd.advertising = 0;
1851 if (advertise & NETDEV_F_10MB_HD) {
1852 ecmd.advertising |= ADVERTISED_10baseT_Half;
1854 if (advertise & NETDEV_F_10MB_FD) {
1855 ecmd.advertising |= ADVERTISED_10baseT_Full;
1857 if (advertise & NETDEV_F_100MB_HD) {
1858 ecmd.advertising |= ADVERTISED_100baseT_Half;
1860 if (advertise & NETDEV_F_100MB_FD) {
1861 ecmd.advertising |= ADVERTISED_100baseT_Full;
1863 if (advertise & NETDEV_F_1GB_HD) {
1864 ecmd.advertising |= ADVERTISED_1000baseT_Half;
1866 if (advertise & NETDEV_F_1GB_FD) {
1867 ecmd.advertising |= ADVERTISED_1000baseT_Full;
1869 if (advertise & NETDEV_F_10GB_FD) {
1870 ecmd.advertising |= ADVERTISED_10000baseT_Full;
1872 if (advertise & NETDEV_F_COPPER) {
1873 ecmd.advertising |= ADVERTISED_TP;
1875 if (advertise & NETDEV_F_FIBER) {
1876 ecmd.advertising |= ADVERTISED_FIBRE;
1878 if (advertise & NETDEV_F_AUTONEG) {
1879 ecmd.advertising |= ADVERTISED_Autoneg;
1881 if (advertise & NETDEV_F_PAUSE) {
1882 ecmd.advertising |= ADVERTISED_Pause;
1884 if (advertise & NETDEV_F_PAUSE_ASYM) {
1885 ecmd.advertising |= ADVERTISED_Asym_Pause;
1887 COVERAGE_INC(netdev_set_ethtool);
1888 error = netdev_linux_do_ethtool(netdev_get_name(netdev_), &ecmd,
1889 ETHTOOL_SSET, "ETHTOOL_SSET");
1892 ovs_mutex_unlock(&netdev->mutex);
1896 /* Attempts to set input rate limiting (policing) policy. Returns 0 if
1897 * successful, otherwise a positive errno value.  'kbits_rate' of 0
 * disables policing; 'kbits_burst' of 0 defaults to 1000 kbits when a
 * rate is set.  Implemented with a tc ingress qdisc plus a policer. */
1899 netdev_linux_set_policing(struct netdev *netdev_,
1900 uint32_t kbits_rate, uint32_t kbits_burst)
1902 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1903 const char *netdev_name = netdev_get_name(netdev_);
1906 kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
1907 : !kbits_burst ? 1000 /* Default to 1000 kbits if 0. */
1908 : kbits_burst); /* Stick with user-specified value. */
1910 ovs_mutex_lock(&netdev->mutex);
1911 if (netdev->cache_valid & VALID_POLICING) {
1912 error = netdev->netdev_policing_error;
1913 if (error || (netdev->kbits_rate == kbits_rate &&
1914 netdev->kbits_burst == kbits_burst)) {
1915 /* Assume that settings haven't changed since we last set them. */
1918 netdev->cache_valid &= ~VALID_POLICING;
1921 COVERAGE_INC(netdev_set_policing);
1922 /* Remove any existing ingress qdisc. */
1923 error = tc_add_del_ingress_qdisc(netdev_, false);
1925 VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
1926 netdev_name, ovs_strerror(error));
/* Install a fresh ingress qdisc, then attach the rate policer. */
1931 error = tc_add_del_ingress_qdisc(netdev_, true);
1933 VLOG_WARN_RL(&rl, "%s: adding policing qdisc failed: %s",
1934 netdev_name, ovs_strerror(error));
1938 error = tc_add_policer(netdev_, kbits_rate, kbits_burst);
1940 VLOG_WARN_RL(&rl, "%s: adding policing action failed: %s",
1941 netdev_name, ovs_strerror(error));
1946 netdev->kbits_rate = kbits_rate;
1947 netdev->kbits_burst = kbits_burst;
/* Cache the outcome (ENODEV included) to avoid re-issuing tc commands. */
1950 if (!error || error == ENODEV) {
1951 netdev->netdev_policing_error = error;
1952 netdev->cache_valid |= VALID_POLICING;
1954 ovs_mutex_unlock(&netdev->mutex);
/* Adds to 'types' the OVS name of every installable QoS discipline in the
 * 'tcs' table (skipping internal entries with empty ovs_name). */
1959 netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
1962 const struct tc_ops *const *opsp;
1964 for (opsp = tcs; *opsp != NULL; opsp++) {
1965 const struct tc_ops *ops = *opsp;
1966 if (ops->tc_install && ops->ovs_name[0] != '\0') {
1967 sset_add(types, ops->ovs_name);
/* Returns the tc_ops whose OVS-facing name equals 'name', scanning the
 * NULL-terminated 'tcs' table.  (Fallthrough return elided from view.) */
1973 static const struct tc_ops *
1974 tc_lookup_ovs_name(const char *name)
1976 const struct tc_ops *const *opsp;
1978 for (opsp = tcs; *opsp != NULL; opsp++) {
1979 const struct tc_ops *ops = *opsp;
1980 if (!strcmp(name, ops->ovs_name)) {
/* Returns the tc_ops whose Linux qdisc name equals 'name'.  Entries with a
 * NULL linux_name (OVS-internal disciplines) are skipped. */
1987 static const struct tc_ops *
1988 tc_lookup_linux_name(const char *name)
1990 const struct tc_ops *const *opsp;
1992 for (opsp = tcs; *opsp != NULL; opsp++) {
1993 const struct tc_ops *ops = *opsp;
1994 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
/* Looks up the queue with 'queue_id' in 'netdev_'s tc queue hmap, given
 * the precomputed 'hash' for that id.  Caller must hold netdev->mutex. */
2001 static struct tc_queue *
2002 tc_find_queue__(const struct netdev *netdev_, unsigned int queue_id,
2005 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2006 struct tc_queue *queue;
2008 HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev->tc->queues) {
2009 if (queue->queue_id == queue_id) {
/* Convenience wrapper: hashes 'queue_id' and delegates to tc_find_queue__(). */
2016 static struct tc_queue *
2017 tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
2019 return tc_find_queue__(netdev, queue_id, hash_int(queue_id, 0));
/* Fills '*caps' with the capabilities (number of queues) of QoS type
 * 'type', looked up by its OVS name. */
2023 netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
2025 struct netdev_qos_capabilities *caps)
2027 const struct tc_ops *ops = tc_lookup_ovs_name(type);
2031 caps->n_queues = ops->n_queues;
/* Reports the QoS discipline configured on 'netdev_': stores its OVS name
 * in '*typep' and its configuration (if the discipline exposes one) in
 * 'details'. */
2036 netdev_linux_get_qos(const struct netdev *netdev_,
2037 const char **typep, struct smap *details)
2039 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2042 ovs_mutex_lock(&netdev->mutex);
/* Ensure netdev->tc reflects the kernel's current qdisc. */
2043 error = tc_query_qdisc(netdev_);
2045 *typep = netdev->tc->ops->ovs_name;
2046 error = (netdev->tc->ops->qdisc_get
2047 ? netdev->tc->ops->qdisc_get(netdev_, details)
2050 ovs_mutex_unlock(&netdev->mutex);
/* Configures QoS discipline 'type' (by OVS name) with 'details' on
 * 'netdev_'.  If the requested type is already installed, just updates its
 * configuration; otherwise deletes the current qdisc and installs the new
 * one. */
2056 netdev_linux_set_qos(struct netdev *netdev_,
2057 const char *type, const struct smap *details)
2059 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2060 const struct tc_ops *new_ops;
2063 new_ops = tc_lookup_ovs_name(type);
2064 if (!new_ops || !new_ops->tc_install) {
2068 ovs_mutex_lock(&netdev->mutex);
2069 error = tc_query_qdisc(netdev_);
/* Same discipline already active: reconfigure in place if supported. */
2074 if (new_ops == netdev->tc->ops) {
2075 error = new_ops->qdisc_set ? new_ops->qdisc_set(netdev_, details) : 0;
2077 /* Delete existing qdisc. */
2078 error = tc_del_qdisc(netdev_);
2082 ovs_assert(netdev->tc == NULL);
2084 /* Install new qdisc. */
2085 error = new_ops->tc_install(netdev_, details);
2086 ovs_assert((error == 0) == (netdev->tc != NULL));
2090 ovs_mutex_unlock(&netdev->mutex);
/* Retrieves the configuration of queue 'queue_id' on 'netdev_' into
 * 'details' via the active discipline's class_get hook. */
2095 netdev_linux_get_queue(const struct netdev *netdev_,
2096 unsigned int queue_id, struct smap *details)
2098 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2101 ovs_mutex_lock(&netdev->mutex);
2102 error = tc_query_qdisc(netdev_);
2104 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2106 ? netdev->tc->ops->class_get(netdev_, queue, details)
2109 ovs_mutex_unlock(&netdev->mutex);
/* Configures queue 'queue_id' on 'netdev_' from 'details'.  Fails when the
 * id exceeds the discipline's queue count or class_set is unsupported. */
2115 netdev_linux_set_queue(struct netdev *netdev_,
2116 unsigned int queue_id, const struct smap *details)
2118 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2121 ovs_mutex_lock(&netdev->mutex);
2122 error = tc_query_qdisc(netdev_);
2124 error = (queue_id < netdev->tc->ops->n_queues
2125 && netdev->tc->ops->class_set
2126 ? netdev->tc->ops->class_set(netdev_, queue_id, details)
2129 ovs_mutex_unlock(&netdev->mutex);
/* Deletes queue 'queue_id' from 'netdev_' via the active discipline's
 * class_delete hook, if the discipline supports deletion. */
2135 netdev_linux_delete_queue(struct netdev *netdev_, unsigned int queue_id)
2137 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2140 ovs_mutex_lock(&netdev->mutex);
2141 error = tc_query_qdisc(netdev_);
2143 if (netdev->tc->ops->class_delete) {
2144 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2146 ? netdev->tc->ops->class_delete(netdev_, queue)
2152 ovs_mutex_unlock(&netdev->mutex);
/* Retrieves statistics for queue 'queue_id' on 'netdev_' into '*stats',
 * including the queue's creation time, via class_get_stats. */
2158 netdev_linux_get_queue_stats(const struct netdev *netdev_,
2159 unsigned int queue_id,
2160 struct netdev_queue_stats *stats)
2162 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2165 ovs_mutex_lock(&netdev->mutex);
2166 error = tc_query_qdisc(netdev_);
2168 if (netdev->tc->ops->class_get_stats) {
2169 const struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2171 stats->created = queue->created;
2172 error = netdev->tc->ops->class_get_stats(netdev_, queue,
2181 ovs_mutex_unlock(&netdev->mutex);
/* Iteration state for dumping tc classes over a netlink dump: the ongoing
 * dump handle plus a reusable reply buffer (fields partly elided here). */
2186 struct queue_dump_state {
2187 struct nl_dump dump;
/* Begins an RTM_GETTCLASS netlink dump of 'netdev's tc classes,
 * initializing '*state' for use with nl_dump_next(). */
2192 start_queue_dump(const struct netdev *netdev, struct queue_dump_state *state)
2194 struct ofpbuf request;
2195 struct tcmsg *tcmsg;
2197 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
2201 tcmsg->tcm_parent = 0;
2202 nl_dump_start(&state->dump, NETLINK_ROUTE, &request);
2203 ofpbuf_uninit(&request);
2205 ofpbuf_init(&state->buf, NL_DUMP_BUFSIZE);
/* Releases the dump buffer and completes the netlink dump, returning its
 * final status. */
2210 finish_queue_dump(struct queue_dump_state *state)
2212 ofpbuf_uninit(&state->buf);
2213 return nl_dump_done(&state->dump);
/* Cursor for the queue-dump API: a snapshot array of queue ids taken at
 * dump start (remaining fields elided from this view). */
2216 struct netdev_linux_queue_state {
2217 unsigned int *queues;
/* Begins a queue dump for 'netdev_': snapshots every queue id currently
 * in the tc queue hmap into a freshly allocated state object stored in
 * '*statep'.  Snapshotting avoids holding the lock across the caller's
 * iteration. */
2223 netdev_linux_queue_dump_start(const struct netdev *netdev_, void **statep)
2225 const struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2228 ovs_mutex_lock(&netdev->mutex);
2229 error = tc_query_qdisc(netdev_);
2231 if (netdev->tc->ops->class_get) {
2232 struct netdev_linux_queue_state *state;
2233 struct tc_queue *queue;
2236 *statep = state = xmalloc(sizeof *state);
2237 state->n_queues = hmap_count(&netdev->tc->queues);
2238 state->cur_queue = 0;
2239 state->queues = xmalloc(state->n_queues * sizeof *state->queues);
2242 HMAP_FOR_EACH (queue, hmap_node, &netdev->tc->queues) {
2243 state->queues[i++] = queue->queue_id;
2249 ovs_mutex_unlock(&netdev->mutex);
/* Advances the queue dump: returns the id and configuration of the next
 * snapshotted queue that still exists.  Queues deleted since dump start
 * are silently skipped. */
2255 netdev_linux_queue_dump_next(const struct netdev *netdev_, void *state_,
2256 unsigned int *queue_idp, struct smap *details)
2258 const struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2259 struct netdev_linux_queue_state *state = state_;
2262 ovs_mutex_lock(&netdev->mutex);
2263 while (state->cur_queue < state->n_queues) {
2264 unsigned int queue_id = state->queues[state->cur_queue++];
2265 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2268 *queue_idp = queue_id;
2269 error = netdev->tc->ops->class_get(netdev_, queue, details);
2273 ovs_mutex_unlock(&netdev->mutex);
/* Frees the queue-dump state allocated by netdev_linux_queue_dump_start(). */
2279 netdev_linux_queue_dump_done(const struct netdev *netdev OVS_UNUSED,
2282 struct netdev_linux_queue_state *state = state_;
2284 free(state->queues);
/* Invokes 'cb' (with 'aux') once per queue with that queue's statistics,
 * using a netlink RTM_GETTCLASS dump parsed by the active discipline's
 * class_dump_stats hook. */
2290 netdev_linux_dump_queue_stats(const struct netdev *netdev_,
2291 netdev_dump_queue_stats_cb *cb, void *aux)
2293 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2296 ovs_mutex_lock(&netdev->mutex);
2297 error = tc_query_qdisc(netdev_);
2299 struct queue_dump_state state;
2301 if (!netdev->tc->ops->class_dump_stats) {
2303 } else if (!start_queue_dump(netdev_, &state)) {
2309 while (nl_dump_next(&state.dump, &msg, &state.buf)) {
2310 retval = netdev->tc->ops->class_dump_stats(netdev_, &msg,
2317 retval = finish_queue_dump(&state);
2323 ovs_mutex_unlock(&netdev->mutex);
/* Stores 'netdev_'s assigned IPv4 address and netmask in '*address' and
 * '*netmask'.  Returns EADDRNOTAVAIL when no address is assigned.  Results
 * are cached under VALID_IN4. */
2329 netdev_linux_get_in4(const struct netdev *netdev_,
2330 struct in_addr *address, struct in_addr *netmask)
2332 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2335 ovs_mutex_lock(&netdev->mutex);
2336 if (!(netdev->cache_valid & VALID_IN4)) {
2337 error = netdev_linux_get_ipv4(netdev_, &netdev->address,
2338 SIOCGIFADDR, "SIOCGIFADDR");
2340 error = netdev_linux_get_ipv4(netdev_, &netdev->netmask,
2341 SIOCGIFNETMASK, "SIOCGIFNETMASK");
2343 netdev->cache_valid |= VALID_IN4;
/* INADDR_ANY in the cache means "no address assigned". */
2351 if (netdev->address.s_addr != INADDR_ANY) {
2352 *address = netdev->address;
2353 *netmask = netdev->netmask;
2355 error = EADDRNOTAVAIL;
2358 ovs_mutex_unlock(&netdev->mutex);
/* Assigns IPv4 'address'/'netmask' to 'netdev_' via SIOCSIFADDR and
 * SIOCSIFNETMASK, updating the in-memory cache on success.  The netmask
 * is only set when the address is not INADDR_ANY. */
2364 netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
2365 struct in_addr netmask)
2367 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2370 ovs_mutex_lock(&netdev->mutex);
2371 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
2373 netdev->cache_valid |= VALID_IN4;
2374 netdev->address = address;
2375 netdev->netmask = netmask;
2376 if (address.s_addr != INADDR_ANY) {
2377 error = do_set_addr(netdev_, SIOCSIFNETMASK,
2378 "SIOCSIFNETMASK", netmask);
2381 ovs_mutex_unlock(&netdev->mutex);
/* Parses one line of /proc/net/if_inet6 into the 16 address bytes of
 * '*in6' and the interface name 'ifname' (up to IFNAMSIZ).  Returns
 * nonzero on a full match. */
2387 parse_if_inet6_line(const char *line,
2388 struct in6_addr *in6, char ifname[16 + 1])
2390 uint8_t *s6 = in6->s6_addr;
2391 #define X8 "%2"SCNx8
2392 return ovs_scan(line,
2393 " "X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8
2394 "%*x %*x %*x %*x %16s\n",
2395 &s6[0], &s6[1], &s6[2], &s6[3],
2396 &s6[4], &s6[5], &s6[6], &s6[7],
2397 &s6[8], &s6[9], &s6[10], &s6[11],
2398 &s6[12], &s6[13], &s6[14], &s6[15],
2402 /* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address (if
2403 * 'in6' is non-null) and returns true. Otherwise, returns false.  The
 * address is found by scanning /proc/net/if_inet6 for this interface's
 * name and cached under VALID_IN6. */
2405 netdev_linux_get_in6(const struct netdev *netdev_, struct in6_addr *in6)
2407 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2409 ovs_mutex_lock(&netdev->mutex);
2410 if (!(netdev->cache_valid & VALID_IN6)) {
/* Default to "no address" in case the proc scan finds nothing. */
2414 netdev->in6 = in6addr_any;
2416 file = fopen("/proc/net/if_inet6", "r");
2418 const char *name = netdev_get_name(netdev_);
2419 while (fgets(line, sizeof line, file)) {
2420 struct in6_addr in6_tmp;
2421 char ifname[16 + 1];
2422 if (parse_if_inet6_line(line, &in6_tmp, ifname)
2423 && !strcmp(name, ifname))
2425 netdev->in6 = in6_tmp;
2431 netdev->cache_valid |= VALID_IN6;
2434 ovs_mutex_unlock(&netdev->mutex);
/* Writes an AF_INET sockaddr containing 'addr' into '*sa', zeroing the
 * remainder of the (larger) generic sockaddr. */
2440 make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
2442 struct sockaddr_in sin;
2443 memset(&sin, 0, sizeof sin);
2444 sin.sin_family = AF_INET;
2445 sin.sin_addr = addr;
2448 memset(sa, 0, sizeof *sa);
2449 memcpy(sa, &sin, sizeof sin);
/* Issues address-setting ioctl 'ioctl_nr' (e.g. SIOCSIFADDR) on 'netdev'
 * with 'addr' packed into an ifreq; 'ioctl_name' is for error reporting. */
2453 do_set_addr(struct netdev *netdev,
2454 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
2458 make_in4_sockaddr(&ifr.ifr_addr, addr);
2459 return af_inet_ifreq_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
2463 /* Adds 'router' as a default IP gateway via the SIOCADDRT routing ioctl
 * (destination and genmask of INADDR_ANY). */
2465 netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
2467 struct in_addr any = { INADDR_ANY };
2471 memset(&rt, 0, sizeof rt);
2472 make_in4_sockaddr(&rt.rt_dst, any);
2473 make_in4_sockaddr(&rt.rt_gateway, router);
2474 make_in4_sockaddr(&rt.rt_genmask, any);
2475 rt.rt_flags = RTF_UP | RTF_GATEWAY;
2476 error = af_inet_ioctl(SIOCADDRT, &rt);
2478 VLOG_WARN("ioctl(SIOCADDRT): %s", ovs_strerror(error));
/* Determines the next hop toward '*host' by scanning /proc/net/route:
 * stores the gateway (or 0 for a directly-reachable host) in '*next_hop'
 * and the egress interface name, xstrdup'd, in '*netdev_name' (caller
 * frees). */
2484 netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
2487 static const char fn[] = "/proc/net/route";
2492 *netdev_name = NULL;
2493 stream = fopen(fn, "r");
2494 if (stream == NULL) {
2495 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, ovs_strerror(errno));
2500 while (fgets(line, sizeof line, stream)) {
2503 ovs_be32 dest, gateway, mask;
2504 int refcnt, metric, mtu;
2505 unsigned int flags, use, window, irtt;
2508 "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
2510 iface, &dest, &gateway, &flags, &refcnt,
2511 &use, &metric, &mask, &mtu, &window, &irtt)) {
2512 VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
2516 if (!(flags & RTF_UP)) {
2517 /* Skip routes that aren't up. */
2521 /* The output of 'dest', 'mask', and 'gateway' were given in
2522 * network byte order, so we don't need need any endian
2523 * conversions here. */
2524 if ((dest & mask) == (host->s_addr & mask)) {
2526 /* The host is directly reachable. */
2527 next_hop->s_addr = 0;
2529 /* To reach the host, we must go through a gateway. */
2530 next_hop->s_addr = gateway;
2532 *netdev_name = xstrdup(iface);
/* Adds driver name, driver version, and firmware version (from
 * ETHTOOL_GDRVINFO, cached under VALID_DRVINFO) to 'smap'. */
2544 netdev_linux_get_status(const struct netdev *netdev_, struct smap *smap)
2546 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2549 ovs_mutex_lock(&netdev->mutex);
2550 if (!(netdev->cache_valid & VALID_DRVINFO)) {
/* netdev_linux_do_ethtool() takes an ethtool_cmd pointer, so alias the
 * drvinfo buffer accordingly. */
2551 struct ethtool_cmd *cmd = (struct ethtool_cmd *) &netdev->drvinfo;
2553 COVERAGE_INC(netdev_get_ethtool);
2554 memset(&netdev->drvinfo, 0, sizeof netdev->drvinfo);
2555 error = netdev_linux_do_ethtool(netdev->up.name,
2558 "ETHTOOL_GDRVINFO");
2560 netdev->cache_valid |= VALID_DRVINFO;
2565 smap_add(smap, "driver_name", netdev->drvinfo.driver);
2566 smap_add(smap, "driver_version", netdev->drvinfo.version);
2567 smap_add(smap, "firmware_version", netdev->drvinfo.fw_version);
2569 ovs_mutex_unlock(&netdev->mutex);
2575 netdev_internal_get_status(const struct netdev *netdev OVS_UNUSED,
2578 smap_add(smap, "driver_name", "openvswitch");
2582 /* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
2583 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
2584 * returns 0. Otherwise, it returns a positive errno value; in particular,
2585 * ENXIO indicates that there is no ARP table entry for 'ip' on 'netdev'. */
2587 netdev_linux_arp_lookup(const struct netdev *netdev,
2588 ovs_be32 ip, uint8_t mac[ETH_ADDR_LEN])
2591 struct sockaddr_in sin;
/* Build the SIOCGARP request: protocol address is 'ip', hardware family
 * Ethernet, scoped to this device by name. */
2594 memset(&r, 0, sizeof r);
2595 memset(&sin, 0, sizeof sin);
2596 sin.sin_family = AF_INET;
2597 sin.sin_addr.s_addr = ip;
2599 memcpy(&r.arp_pa, &sin, sizeof sin);
2600 r.arp_ha.sa_family = ARPHRD_ETHER;
2602 ovs_strzcpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
2603 COVERAGE_INC(netdev_arp_lookup);
2604 retval = af_inet_ioctl(SIOCGARP, &r);
2606 memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
/* ENXIO just means "no entry", which the caller handles; only log other
 * failures. */
2607 } else if (retval != ENXIO) {
2608 VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
2609 netdev_get_name(netdev), IP_ARGS(ip),
2610 ovs_strerror(retval));
2616 nd_to_iff_flags(enum netdev_flags nd)
2619 if (nd & NETDEV_UP) {
2622 if (nd & NETDEV_PROMISC) {
2625 if (nd & NETDEV_LOOPBACK) {
2626 iff |= IFF_LOOPBACK;
2632 iff_to_nd_flags(int iff)
2634 enum netdev_flags nd = 0;
2638 if (iff & IFF_PROMISC) {
2639 nd |= NETDEV_PROMISC;
2641 if (iff & IFF_LOOPBACK) {
2642 nd |= NETDEV_LOOPBACK;
/* Turns off the netdev flags in 'off' and turns on those in 'on', storing
 * the previous flags (as netdev flags) in '*old_flagsp'.  Re-reads the
 * kernel flags afterwards so the cached ifi_flags stay accurate even if
 * set_flags() partially failed.  Caller holds netdev->mutex. */
2648 update_flags(struct netdev_linux *netdev, enum netdev_flags off,
2649 enum netdev_flags on, enum netdev_flags *old_flagsp)
2650 OVS_REQUIRES(netdev->mutex)
2652 int old_flags, new_flags;
2655 old_flags = netdev->ifi_flags;
2656 *old_flagsp = iff_to_nd_flags(old_flags);
2657 new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
2658 if (new_flags != old_flags) {
2659 error = set_flags(netdev_get_name(&netdev->up), new_flags);
2660 get_flags(&netdev->up, &netdev->ifi_flags);
/* update_flags entry point for the netdev interface: locks the device and
 * delegates to update_flags(). */
2667 netdev_linux_update_flags(struct netdev *netdev_, enum netdev_flags off,
2668 enum netdev_flags on, enum netdev_flags *old_flagsp)
2670 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2673 ovs_mutex_lock(&netdev->mutex);
2674 error = update_flags(netdev, off, on, old_flagsp);
2675 ovs_mutex_unlock(&netdev->mutex);
/* Boilerplate generator for the three Linux netdev classes ("system",
 * "tap", "internal").  The macro arguments supply the per-class construct,
 * stats, features and status hooks; everything else is shared.  Comments
 * are added only outside the macro body because '//' inside a
 * backslash-continued macro would swallow the continuation. */
2680 #define NETDEV_LINUX_CLASS(NAME, CONSTRUCT, GET_STATS, SET_STATS, \
2681 GET_FEATURES, GET_STATUS) \
2687 netdev_linux_wait, \
2689 netdev_linux_alloc, \
2691 netdev_linux_destruct, \
2692 netdev_linux_dealloc, \
2693 NULL, /* get_config */ \
2694 NULL, /* set_config */ \
2695 NULL, /* get_tunnel_config */ \
2697 netdev_linux_send, \
2698 netdev_linux_send_wait, \
2700 netdev_linux_set_etheraddr, \
2701 netdev_linux_get_etheraddr, \
2702 netdev_linux_get_mtu, \
2703 netdev_linux_set_mtu, \
2704 netdev_linux_get_ifindex, \
2705 netdev_linux_get_carrier, \
2706 netdev_linux_get_carrier_resets, \
2707 netdev_linux_set_miimon_interval, \
2712 netdev_linux_set_advertisements, \
2714 netdev_linux_set_policing, \
2715 netdev_linux_get_qos_types, \
2716 netdev_linux_get_qos_capabilities, \
2717 netdev_linux_get_qos, \
2718 netdev_linux_set_qos, \
2719 netdev_linux_get_queue, \
2720 netdev_linux_set_queue, \
2721 netdev_linux_delete_queue, \
2722 netdev_linux_get_queue_stats, \
2723 netdev_linux_queue_dump_start, \
2724 netdev_linux_queue_dump_next, \
2725 netdev_linux_queue_dump_done, \
2726 netdev_linux_dump_queue_stats, \
2728 netdev_linux_get_in4, \
2729 netdev_linux_set_in4, \
2730 netdev_linux_get_in6, \
2731 netdev_linux_add_router, \
2732 netdev_linux_get_next_hop, \
2734 netdev_linux_arp_lookup, \
2736 netdev_linux_update_flags, \
2738 netdev_linux_rx_alloc, \
2739 netdev_linux_rx_construct, \
2740 netdev_linux_rx_destruct, \
2741 netdev_linux_rx_dealloc, \
2742 netdev_linux_rx_recv, \
2743 netdev_linux_rx_wait, \
2744 netdev_linux_rx_drain, \
/* Ordinary Linux network devices ("system" ports). */
2747 const struct netdev_class netdev_linux_class =
2750 netdev_linux_construct,
2751 netdev_linux_get_stats,
2752 NULL, /* set_stats */
2753 netdev_linux_get_features,
2754 netdev_linux_get_status);
/* Linux TAP devices: same plumbing, but constructed via the tap-specific
 * constructor and with tap-oriented stats. */
2756 const struct netdev_class netdev_tap_class =
2759 netdev_linux_construct_tap,
2760 netdev_tap_get_stats,
2761 NULL, /* set_stats */
2762 netdev_linux_get_features,
2763 netdev_linux_get_status);
/* OVS "internal" devices: stats can be both read and written, no link
 * features to report. */
2765 const struct netdev_class netdev_internal_class =
2768 netdev_linux_construct,
2769 netdev_internal_get_stats,
2770 netdev_internal_set_stats,
2771 NULL, /* get_features */
2772 netdev_internal_get_status);
2774 /* HTB traffic control class. */
2776 #define HTB_N_QUEUES 0xf000
/* NOTE(review): the struct headers are not visible in this excerpt; the
 * first field presumably belongs to 'struct htb' (qdisc-wide state) and
 * the remaining fields to 'struct htb_class' (per-queue state). */
2780 unsigned int max_rate; /* In bytes/s. */
2784 struct tc_queue tc_queue;
2785 unsigned int min_rate; /* In bytes/s. */
2786 unsigned int max_rate; /* In bytes/s. */
2787 unsigned int burst; /* In bytes. */
2788 unsigned int priority; /* Lower values are higher priorities. */
2792 htb_get__(const struct netdev *netdev_)
2794 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2795 return CONTAINER_OF(netdev->tc, struct htb, tc);
/* Allocates a struct htb with qdisc-wide 'max_rate' and installs it as
 * 'netdev_''s tc object. */
2799 htb_install__(struct netdev *netdev_, uint64_t max_rate)
2801 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2804 htb = xmalloc(sizeof *htb);
2805 tc_init(&htb->tc, &tc_ops_htb);
2806 htb->max_rate = max_rate;
2808 netdev->tc = &htb->tc;
2811 /* Create an HTB qdisc.
2813 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
2815 htb_setup_qdisc__(struct netdev *netdev)
2818 struct tc_htb_glob opt;
2819 struct ofpbuf request;
2820 struct tcmsg *tcmsg;
/* Remove any existing root qdisc first so the NLM_F_EXCL add succeeds. */
2822 tc_del_qdisc(netdev);
2824 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2825 NLM_F_EXCL | NLM_F_CREATE, &request);
2829 tcmsg->tcm_handle = tc_make_handle(1, 0);
2830 tcmsg->tcm_parent = TC_H_ROOT;
2832 nl_msg_put_string(&request, TCA_KIND, "htb");
2834 memset(&opt, 0, sizeof opt);
2835 opt.rate2quantum = 10;
/* HTB global options go in a nested TCA_OPTIONS attribute. */
2839 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2840 nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
2841 nl_msg_end_nested(&request, opt_offset);
2843 return tc_transact(&request, NULL);
2846 /* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
2847 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
2849 htb_setup_class__(struct netdev *netdev, unsigned int handle,
2850 unsigned int parent, struct htb_class *class)
2853 struct tc_htb_opt opt;
2854 struct ofpbuf request;
2855 struct tcmsg *tcmsg;
/* The MTU is needed to size HTB's rate tables; bail out if unknown. */
2859 error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
2861 VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU",
2862 netdev_get_name(netdev));
2866 memset(&opt, 0, sizeof opt);
2867 tc_fill_rate(&opt.rate, class->min_rate, mtu);
2868 tc_fill_rate(&opt.ceil, class->max_rate, mtu);
2869 opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
2870 opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
2871 opt.prio = class->priority;
2873 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
2877 tcmsg->tcm_handle = handle;
2878 tcmsg->tcm_parent = parent;
/* HTB class parameters plus rate tables for 'rate' and 'ceil'. */
2880 nl_msg_put_string(&request, TCA_KIND, "htb");
2881 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2882 nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
2883 tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
2884 tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
2885 nl_msg_end_nested(&request, opt_offset);
2887 error = tc_transact(&request, NULL);
2889 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
2890 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
2891 netdev_get_name(netdev),
2892 tc_get_major(handle), tc_get_minor(handle),
2893 tc_get_major(parent), tc_get_minor(parent),
2894 class->min_rate, class->max_rate,
2895 class->burst, class->priority, ovs_strerror(error));
2900 /* Parses Netlink attributes in 'options' for HTB parameters and stores a
2901 * description of them into 'details'. The description complies with the
2902 * specification given in the vswitch database documentation for linux-htb
2905 htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
2907 static const struct nl_policy tca_htb_policy[] = {
2908 [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
2909 .min_len = sizeof(struct tc_htb_opt) },
2912 struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
2913 const struct tc_htb_opt *htb;
2915 if (!nl_parse_nested(nl_options, tca_htb_policy,
2916 attrs, ARRAY_SIZE(tca_htb_policy))) {
2917 VLOG_WARN_RL(&rl, "failed to parse HTB class options");
/* Translate kernel tc_htb_opt fields back into OVS's htb_class view;
 * 'burst' is stored by the kernel in ticks and converted to bytes. */
2921 htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
2922 class->min_rate = htb->rate.rate;
2923 class->max_rate = htb->ceil.rate;
2924 class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
2925 class->priority = htb->prio;
/* Parses a tc class message: extracts the OVS queue id (handle minor - 1)
 * into '*queue_id', HTB parameters into '*options' and statistics into
 * '*stats'; each output pointer may be NULL if unwanted. */
2930 htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2931 struct htb_class *options,
2932 struct netdev_queue_stats *stats)
2934 struct nlattr *nl_options;
2935 unsigned int handle;
2938 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2939 if (!error && queue_id) {
2940 unsigned int major = tc_get_major(handle);
2941 unsigned int minor = tc_get_minor(handle);
/* Only handles 1:1 through 1:HTB_N_QUEUES map to OVS queues. */
2942 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2943 *queue_id = minor - 1;
2948 if (!error && options) {
2949 error = htb_parse_tca_options__(nl_options, options);
/* Fills 'hc' from the qdisc-level "max-rate" key in 'details' (bits/s in
 * the database, bytes/s internally).  If unset or zero, falls back to the
 * link speed, defaulting to 100 Mbps when features are unavailable. */
2955 htb_parse_qdisc_details__(struct netdev *netdev_,
2956 const struct smap *details, struct htb_class *hc)
2958 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2959 const char *max_rate_s;
2961 max_rate_s = smap_get(details, "max-rate");
2962 hc->max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
2963 if (!hc->max_rate) {
2964 enum netdev_features current;
2966 netdev_linux_read_features(netdev);
2967 current = !netdev->get_features_error ? netdev->current : 0;
2968 hc->max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
2970 hc->min_rate = hc->max_rate;
/* Fills 'hc' from per-queue keys in 'details' ("min-rate", "max-rate",
 * "burst", "priority"), clamping each value into a range that HTB can
 * actually honor. */
2976 htb_parse_class_details__(struct netdev *netdev,
2977 const struct smap *details, struct htb_class *hc)
2979 const struct htb *htb = htb_get__(netdev);
2980 const char *min_rate_s = smap_get(details, "min-rate");
2981 const char *max_rate_s = smap_get(details, "max-rate");
2982 const char *burst_s = smap_get(details, "burst");
2983 const char *priority_s = smap_get(details, "priority");
2986 error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
2988 VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU",
2989 netdev_get_name(netdev));
2993 /* HTB requires at least an mtu sized min-rate to send any traffic even
2994 * on uncongested links. */
2995 hc->min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
2996 hc->min_rate = MAX(hc->min_rate, mtu);
2997 hc->min_rate = MIN(hc->min_rate, htb->max_rate);
/* max-rate defaults to the qdisc ceiling and is kept within
 * [min_rate, qdisc max_rate]. */
3000 hc->max_rate = (max_rate_s
3001 ? strtoull(max_rate_s, NULL, 10) / 8
3003 hc->max_rate = MAX(hc->max_rate, hc->min_rate);
3004 hc->max_rate = MIN(hc->max_rate, htb->max_rate);
3008 * According to hints in the documentation that I've read, it is important
3009 * that 'burst' be at least as big as the largest frame that might be
3010 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
3011 * but having it a bit too small is a problem. Since netdev_get_mtu()
3012 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
3013 * the MTU. We actually add 64, instead of 14, as a guard against
3014 * additional headers get tacked on somewhere that we're not aware of. */
3015 hc->burst = burst_s ? strtoull(burst_s, NULL, 10) / 8 : 0;
3016 hc->burst = MAX(hc->burst, mtu + 64);
3019 hc->priority = priority_s ? strtoul(priority_s, NULL, 10) : 0;
/* Queries the kernel for HTB class 'handle' under 'parent' and parses the
 * reply into '*options' and '*stats' (either may be NULL). */
3025 htb_query_class__(const struct netdev *netdev, unsigned int handle,
3026 unsigned int parent, struct htb_class *options,
3027 struct netdev_queue_stats *stats)
3029 struct ofpbuf *reply;
3032 error = tc_query_class(netdev, handle, parent, &reply);
3034 error = htb_parse_tcmsg__(reply, NULL, options, stats);
3035 ofpbuf_delete(reply);
/* tc_install hook: creates the root HTB qdisc, the default class 1:fffe
 * sized from 'details', and records the htb state on the netdev. */
3041 htb_tc_install(struct netdev *netdev, const struct smap *details)
3045 error = htb_setup_qdisc__(netdev);
3047 struct htb_class hc;
3049 htb_parse_qdisc_details__(netdev, details, &hc);
3050 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3051 tc_make_handle(1, 0), &hc);
3053 htb_install__(netdev, hc.max_rate);
3059 static struct htb_class *
3060 htb_class_cast__(const struct tc_queue *queue)
3062 return CONTAINER_OF(queue, struct htb_class, tc_queue);
/* Records 'hc' as the local state for queue 'queue_id', creating the queue
 * in the hash map if it does not yet exist. */
3066 htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
3067 const struct htb_class *hc)
3069 struct htb *htb = htb_get__(netdev);
3070 size_t hash = hash_int(queue_id, 0);
3071 struct tc_queue *queue;
3072 struct htb_class *hcp;
3074 queue = tc_find_queue__(netdev, queue_id, hash);
3076 hcp = htb_class_cast__(queue);
/* Not found: allocate a fresh class and link it into the qdisc's map. */
3078 hcp = xmalloc(sizeof *hcp);
3079 queue = &hcp->tc_queue;
3080 queue->queue_id = queue_id;
3081 queue->created = time_msec();
3082 hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
3085 hcp->min_rate = hc->min_rate;
3086 hcp->max_rate = hc->max_rate;
3087 hcp->burst = hc->burst;
3088 hcp->priority = hc->priority;
/* tc_load hook: reconstructs OVS's view of an existing kernel HTB qdisc by
 * querying the default class and then dumping every class. */
3092 htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3095 struct queue_dump_state state;
3096 struct htb_class hc;
3098 /* Get qdisc options. */
3100 htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
3101 htb_install__(netdev, hc.max_rate);
3104 if (!start_queue_dump(netdev, &state)) {
3107 while (nl_dump_next(&state.dump, &msg, &state.buf)) {
3108 unsigned int queue_id;
3110 if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3111 htb_update_queue__(netdev, queue_id, &hc);
3114 finish_queue_dump(&state);
/* tc_destroy hook: frees every queued htb_class and the htb itself. */
3120 htb_tc_destroy(struct tc *tc)
3122 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
3123 struct htb_class *hc, *next;
3125 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &htb->tc.queues) {
3126 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
/* qdisc_get hook: reports the qdisc ceiling as "max-rate" in bits/s. */
3134 htb_qdisc_get(const struct netdev *netdev, struct smap *details)
3136 const struct htb *htb = htb_get__(netdev);
3137 smap_add_format(details, "max-rate", "%llu", 8ULL * htb->max_rate);
/* qdisc_set hook: re-creates the default class 1:fffe with the new
 * qdisc-level parameters and caches the new ceiling on success. */
3142 htb_qdisc_set(struct netdev *netdev, const struct smap *details)
3144 struct htb_class hc;
3147 htb_parse_qdisc_details__(netdev, details, &hc);
3148 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3149 tc_make_handle(1, 0), &hc);
3151 htb_get__(netdev)->max_rate = hc.max_rate;
/* class_get hook: converts a cached htb_class back into database keys
 * (bits/s); "max-rate" is omitted when equal to "min-rate". */
3157 htb_class_get(const struct netdev *netdev OVS_UNUSED,
3158 const struct tc_queue *queue, struct smap *details)
3160 const struct htb_class *hc = htb_class_cast__(queue);
3162 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
3163 if (hc->min_rate != hc->max_rate) {
3164 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
3166 smap_add_format(details, "burst", "%llu", 8ULL * hc->burst);
3168 smap_add_format(details, "priority", "%u", hc->priority);
/* class_set hook: parses 'details', programs kernel class 1:(queue_id+1)
 * under the default class, and mirrors the result locally. */
3174 htb_class_set(struct netdev *netdev, unsigned int queue_id,
3175 const struct smap *details)
3177 struct htb_class hc;
3180 error = htb_parse_class_details__(netdev, details, &hc);
3185 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3186 tc_make_handle(1, 0xfffe), &hc);
3191 htb_update_queue__(netdev, queue_id, &hc);
/* class_delete hook: removes the kernel class and, on success, the cached
 * queue entry. */
3196 htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
3198 struct htb_class *hc = htb_class_cast__(queue);
3199 struct htb *htb = htb_get__(netdev);
3202 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3204 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
/* class_get_stats hook: asks the kernel only for the queue's statistics. */
3211 htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3212 struct netdev_queue_stats *stats)
3214 return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3215 tc_make_handle(1, 0xfffe), NULL, stats);
/* class_dump_stats hook: parses one dumped class message and invokes 'cb'
 * with the queue id and its statistics if the handle maps to a queue. */
3219 htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3220 const struct ofpbuf *nlmsg,
3221 netdev_dump_queue_stats_cb *cb, void *aux)
3223 struct netdev_queue_stats stats;
3224 unsigned int handle, major, minor;
3227 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3232 major = tc_get_major(handle);
3233 minor = tc_get_minor(handle);
3234 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
3235 (*cb)(minor - 1, &stats, aux);
/* vtable binding the HTB implementation to the generic tc framework. */
3240 static const struct tc_ops tc_ops_htb = {
3241 "htb", /* linux_name */
3242 "linux-htb", /* ovs_name */
3243 HTB_N_QUEUES, /* n_queues */
3252 htb_class_get_stats,
3253 htb_class_dump_stats
3256 /* "linux-hfsc" traffic control class. */
3258 #define HFSC_N_QUEUES 0xf000
/* NOTE(review): struct headers not visible; this field presumably belongs
 * to 'struct hfsc_class' — confirm in the full file. */
3266 struct tc_queue tc_queue;
3271 static struct hfsc *
3272 hfsc_get__(const struct netdev *netdev_)
3274 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3275 return CONTAINER_OF(netdev->tc, struct hfsc, tc);
3278 static struct hfsc_class *
3279 hfsc_class_cast__(const struct tc_queue *queue)
3281 return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
/* Allocates a struct hfsc with qdisc-wide 'max_rate' and installs it as
 * 'netdev_''s tc object. */
3285 hfsc_install__(struct netdev *netdev_, uint32_t max_rate)
3287 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3290 hfsc = xmalloc(sizeof *hfsc);
3291 tc_init(&hfsc->tc, &tc_ops_hfsc);
3292 hfsc->max_rate = max_rate;
3293 netdev->tc = &hfsc->tc;
/* Records 'hc' as the local state for queue 'queue_id', creating the queue
 * in the hash map if it does not yet exist (parallels htb_update_queue__). */
3297 hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
3298 const struct hfsc_class *hc)
3302 struct hfsc_class *hcp;
3303 struct tc_queue *queue;
3305 hfsc = hfsc_get__(netdev);
3306 hash = hash_int(queue_id, 0);
3308 queue = tc_find_queue__(netdev, queue_id, hash);
3310 hcp = hfsc_class_cast__(queue);
/* Not found: allocate a fresh class and link it into the qdisc's map. */
3312 hcp = xmalloc(sizeof *hcp);
3313 queue = &hcp->tc_queue;
3314 queue->queue_id = queue_id;
3315 queue->created = time_msec();
3316 hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
3319 hcp->min_rate = hc->min_rate;
3320 hcp->max_rate = hc->max_rate;
/* Parses the kernel's HFSC service curves out of 'nl_options'.  OVS only
 * supports the linear subset it programs itself (m1 == 0, d == 0, rt == ls),
 * so anything else is rejected with a warning. */
3324 hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
3326 const struct tc_service_curve *rsc, *fsc, *usc;
3327 static const struct nl_policy tca_hfsc_policy[] = {
3329 .type = NL_A_UNSPEC,
3331 .min_len = sizeof(struct tc_service_curve),
3334 .type = NL_A_UNSPEC,
3336 .min_len = sizeof(struct tc_service_curve),
3339 .type = NL_A_UNSPEC,
3341 .min_len = sizeof(struct tc_service_curve),
3344 struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];
3346 if (!nl_parse_nested(nl_options, tca_hfsc_policy,
3347 attrs, ARRAY_SIZE(tca_hfsc_policy))) {
3348 VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
3352 rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
3353 fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
3354 usc = nl_attr_get(attrs[TCA_HFSC_USC]);
3356 if (rsc->m1 != 0 || rsc->d != 0 ||
3357 fsc->m1 != 0 || fsc->d != 0 ||
3358 usc->m1 != 0 || usc->d != 0) {
3359 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3360 "Non-linear service curves are not supported.");
3364 if (rsc->m2 != fsc->m2) {
3365 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3366 "Real-time service curves are not supported ");
3370 if (rsc->m2 > usc->m2) {
3371 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3372 "Min-rate service curve is greater than "
3373 "the max-rate service curve.");
/* m2 of the link-share curve is the min rate; m2 of the upper-limit
 * curve is the max rate. */
3377 class->min_rate = fsc->m2;
3378 class->max_rate = usc->m2;
/* Parses a tc class message for HFSC: queue id from the handle minor,
 * parameters via hfsc_parse_tca_options__(), statistics via
 * tc_parse_class(); each output pointer may be NULL if unwanted. */
3383 hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
3384 struct hfsc_class *options,
3385 struct netdev_queue_stats *stats)
3388 unsigned int handle;
3389 struct nlattr *nl_options;
3391 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
3397 unsigned int major, minor;
3399 major = tc_get_major(handle);
3400 minor = tc_get_minor(handle);
/* Only handles 1:1 through 1:HFSC_N_QUEUES map to OVS queues. */
3401 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3402 *queue_id = minor - 1;
3409 error = hfsc_parse_tca_options__(nl_options, options);
/* Queries the kernel for HFSC class 'handle' under 'parent' and parses the
 * reply into '*options' and '*stats' (either may be NULL). */
3416 hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
3417 unsigned int parent, struct hfsc_class *options,
3418 struct netdev_queue_stats *stats)
3421 struct ofpbuf *reply;
3423 error = tc_query_class(netdev, handle, parent, &reply);
3428 error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
3429 ofpbuf_delete(reply);
/* Fills 'class' from the qdisc-level "max-rate" key in 'details' (bits/s
 * in the database, bytes/s internally), falling back to the link speed and
 * ultimately to 100 Mbps; min-rate is set equal to max-rate. */
3434 hfsc_parse_qdisc_details__(struct netdev *netdev_, const struct smap *details,
3435 struct hfsc_class *class)
3437 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3439 const char *max_rate_s;
3441 max_rate_s = smap_get(details, "max-rate");
3442 max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
3445 enum netdev_features current;
3447 netdev_linux_read_features(netdev);
3448 current = !netdev->get_features_error ? netdev->current : 0;
3449 max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
3452 class->min_rate = max_rate;
3453 class->max_rate = max_rate;
/* Fills 'class' from per-queue "min-rate"/"max-rate" keys in 'details',
 * clamping min-rate to [1, qdisc max] and max-rate to
 * [min_rate, qdisc max]. */
3457 hfsc_parse_class_details__(struct netdev *netdev,
3458 const struct smap *details,
3459 struct hfsc_class * class)
3461 const struct hfsc *hfsc;
3462 uint32_t min_rate, max_rate;
3463 const char *min_rate_s, *max_rate_s;
3465 hfsc = hfsc_get__(netdev);
3466 min_rate_s = smap_get(details, "min-rate");
3467 max_rate_s = smap_get(details, "max-rate");
3469 min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
3470 min_rate = MAX(min_rate, 1);
3471 min_rate = MIN(min_rate, hfsc->max_rate);
3473 max_rate = (max_rate_s
3474 ? strtoull(max_rate_s, NULL, 10) / 8
3476 max_rate = MAX(max_rate, min_rate);
3477 max_rate = MIN(max_rate, hfsc->max_rate);
3479 class->min_rate = min_rate;
3480 class->max_rate = max_rate;
3485 /* Create an HFSC qdisc.
3487 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
3489 hfsc_setup_qdisc__(struct netdev * netdev)
3491 struct tcmsg *tcmsg;
3492 struct ofpbuf request;
3493 struct tc_hfsc_qopt opt;
/* Remove any existing root qdisc first so the NLM_F_EXCL add succeeds. */
3495 tc_del_qdisc(netdev);
3497 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3498 NLM_F_EXCL | NLM_F_CREATE, &request);
3504 tcmsg->tcm_handle = tc_make_handle(1, 0);
3505 tcmsg->tcm_parent = TC_H_ROOT;
3507 memset(&opt, 0, sizeof opt);
3510 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3511 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
3513 return tc_transact(&request, NULL);
3516 /* Create an HFSC class.
3518 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
3519 * sc rate <min_rate> ul rate <max_rate>" */
3521 hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
3522 unsigned int parent, struct hfsc_class *class)
3526 struct tcmsg *tcmsg;
3527 struct ofpbuf request;
3528 struct tc_service_curve min, max;
3530 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
3536 tcmsg->tcm_handle = handle;
3537 tcmsg->tcm_parent = parent;
/* Linear service curves: only the m2 slope is used. */
3541 min.m2 = class->min_rate;
3545 max.m2 = class->max_rate;
/* Same curve for real-time (RSC) and link-share (FSC); the upper limit
 * (USC) carries the max rate. */
3547 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3548 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3549 nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
3550 nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
3551 nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
3552 nl_msg_end_nested(&request, opt_offset);
3554 error = tc_transact(&request, NULL);
3556 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
3557 "min-rate %ubps, max-rate %ubps (%s)",
3558 netdev_get_name(netdev),
3559 tc_get_major(handle), tc_get_minor(handle),
3560 tc_get_major(parent), tc_get_minor(parent),
3561 class->min_rate, class->max_rate, ovs_strerror(error));
/* tc_install hook: creates the root HFSC qdisc, the default class 1:fffe
 * sized from 'details', and records the hfsc state on the netdev. */
3568 hfsc_tc_install(struct netdev *netdev, const struct smap *details)
3571 struct hfsc_class class;
3573 error = hfsc_setup_qdisc__(netdev);
3579 hfsc_parse_qdisc_details__(netdev, details, &class);
3580 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3581 tc_make_handle(1, 0), &class);
3587 hfsc_install__(netdev, class.max_rate);
/* tc_load hook: reconstructs OVS's view of an existing kernel HFSC qdisc
 * by querying the default class and then dumping every class. */
3592 hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3595 struct queue_dump_state state;
3596 struct hfsc_class hc;
3599 hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
3600 hfsc_install__(netdev, hc.max_rate);
3602 if (!start_queue_dump(netdev, &state)) {
3606 while (nl_dump_next(&state.dump, &msg, &state.buf)) {
3607 unsigned int queue_id;
3609 if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3610 hfsc_update_queue__(netdev, queue_id, &hc);
3614 finish_queue_dump(&state);
/* tc_destroy hook: frees every queued hfsc_class and the hfsc itself. */
3619 hfsc_tc_destroy(struct tc *tc)
3622 struct hfsc_class *hc, *next;
3624 hfsc = CONTAINER_OF(tc, struct hfsc, tc);
3626 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
3627 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
/* qdisc_get hook: reports the qdisc ceiling as "max-rate" in bits/s. */
3636 hfsc_qdisc_get(const struct netdev *netdev, struct smap *details)
3638 const struct hfsc *hfsc;
3639 hfsc = hfsc_get__(netdev);
3640 smap_add_format(details, "max-rate", "%llu", 8ULL * hfsc->max_rate);
/* qdisc_set hook: re-creates the default class 1:fffe with the new
 * qdisc-level parameters and caches the new ceiling on success. */
3645 hfsc_qdisc_set(struct netdev *netdev, const struct smap *details)
3648 struct hfsc_class class;
3650 hfsc_parse_qdisc_details__(netdev, details, &class);
3651 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3652 tc_make_handle(1, 0), &class);
3655 hfsc_get__(netdev)->max_rate = class.max_rate;
/* class_get hook: converts a cached hfsc_class back into database keys
 * (bits/s); "max-rate" is omitted when equal to "min-rate". */
3662 hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
3663 const struct tc_queue *queue, struct smap *details)
3665 const struct hfsc_class *hc;
3667 hc = hfsc_class_cast__(queue);
3668 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
3669 if (hc->min_rate != hc->max_rate) {
3670 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
/* class_set hook: parses 'details', programs kernel class 1:(queue_id+1)
 * under the default class, and mirrors the result locally. */
3676 hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
3677 const struct smap *details)
3680 struct hfsc_class class;
3682 error = hfsc_parse_class_details__(netdev, details, &class);
3687 error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3688 tc_make_handle(1, 0xfffe), &class);
3693 hfsc_update_queue__(netdev, queue_id, &class);
/* class_delete hook: removes the kernel class and, on success, the cached
 * queue entry. */
3698 hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
3702 struct hfsc_class *hc;
3704 hc = hfsc_class_cast__(queue);
3705 hfsc = hfsc_get__(netdev);
3707 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3709 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
/* class_get_stats hook: asks the kernel only for the queue's statistics. */
3716 hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3717 struct netdev_queue_stats *stats)
3719 return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3720 tc_make_handle(1, 0xfffe), NULL, stats);
/* class_dump_stats hook: parses one dumped class message and invokes 'cb'
 * with the queue id and its statistics if the handle maps to a queue. */
3724 hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3725 const struct ofpbuf *nlmsg,
3726 netdev_dump_queue_stats_cb *cb, void *aux)
3728 struct netdev_queue_stats stats;
3729 unsigned int handle, major, minor;
3732 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3737 major = tc_get_major(handle);
3738 minor = tc_get_minor(handle);
3739 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3740 (*cb)(minor - 1, &stats, aux);
/* vtable binding the HFSC implementation to the generic tc framework. */
3745 static const struct tc_ops tc_ops_hfsc = {
3746 "hfsc", /* linux_name */
3747 "linux-hfsc", /* ovs_name */
3748 HFSC_N_QUEUES, /* n_queues */
3749 hfsc_tc_install, /* tc_install */
3750 hfsc_tc_load, /* tc_load */
3751 hfsc_tc_destroy, /* tc_destroy */
3752 hfsc_qdisc_get, /* qdisc_get */
3753 hfsc_qdisc_set, /* qdisc_set */
3754 hfsc_class_get, /* class_get */
3755 hfsc_class_set, /* class_set */
3756 hfsc_class_delete, /* class_delete */
3757 hfsc_class_get_stats, /* class_get_stats */
3758 hfsc_class_dump_stats /* class_dump_stats */
3761 /* "linux-default" traffic control class.
3763 * This class represents the default, unnamed Linux qdisc. It corresponds to
3764 * the "" (empty string) QoS type in the OVS database. */
/* Marks 'netdev_' as using the default qdisc by pointing its tc at a
 * shared, immutable singleton. */
3767 default_install__(struct netdev *netdev_)
3769 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3770 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_default);
3772 /* Nothing but a tc class implementation is allowed to write to a tc. This
3773 * class never does that, so we can legitimately use a const tc object. */
3774 netdev->tc = CONST_CAST(struct tc *, &tc);
3778 default_tc_install(struct netdev *netdev,
3779 const struct smap *details OVS_UNUSED)
3781 default_install__(netdev);
3786 default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3788 default_install__(netdev);
/* vtable for the default qdisc: install/load only, no queues. */
3792 static const struct tc_ops tc_ops_default = {
3793 NULL, /* linux_name */
3798 NULL, /* tc_destroy */
3799 NULL, /* qdisc_get */
3800 NULL, /* qdisc_set */
3801 NULL, /* class_get */
3802 NULL, /* class_set */
3803 NULL, /* class_delete */
3804 NULL, /* class_get_stats */
3805 NULL /* class_dump_stats */
3808 /* "linux-other" traffic control class.
/* tc_load hook for qdiscs that OVS does not understand: records an opaque
 * read-only singleton so the netdev is at least tracked. */
3813 other_tc_load(struct netdev *netdev_, struct ofpbuf *nlmsg OVS_UNUSED)
3815 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3816 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_other);
3818 /* Nothing but a tc class implementation is allowed to write to a tc. This
3819 * class never does that, so we can legitimately use a const tc object. */
3820 netdev->tc = CONST_CAST(struct tc *, &tc);
/* vtable for unrecognized qdiscs: load-only, cannot be installed by OVS. */
3824 static const struct tc_ops tc_ops_other = {
3825 NULL, /* linux_name */
3826 "linux-other", /* ovs_name */
3828 NULL, /* tc_install */
3830 NULL, /* tc_destroy */
3831 NULL, /* qdisc_get */
3832 NULL, /* qdisc_set */
3833 NULL, /* class_get */
3834 NULL, /* class_set */
3835 NULL, /* class_delete */
3836 NULL, /* class_get_stats */
3837 NULL /* class_dump_stats */
3840 /* Traffic control. */
3842 /* Number of kernel "tc" ticks per second. */
/* Filled in lazily elsewhere in this file; 0 until initialized. */
3843 static double ticks_per_s;
3845 /* Number of kernel "jiffies" per second. This is used for the purpose of
3846 * computing buffer sizes. Generally kernel qdiscs need to be able to buffer
3847 * one jiffy's worth of data.
3849 * There are two possibilities here:
3851 * - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
3852 * approximate range of 100 to 1024. That means that we really need to
3853 * make sure that the qdisc can buffer that much data.
3855 * - 'buffer_hz' is an absurdly large number. That means that the kernel
3856 * has finely granular timers and there's no need to fudge additional room
3857 * for buffers. (There's no extra effort needed to implement that: the
3858 * large 'buffer_hz' is used as a divisor, so practically any number will
3859 * come out as 0 in the division. Small integer results in the case of
3860 * really high dividends won't have any real effect anyhow.)
3862 static unsigned int buffer_hz;
/* Composes a tc handle from 'major' and 'minor': major occupies the upper
 * 16 bits, minor the lower 16. */
static unsigned int
tc_make_handle(unsigned int major, unsigned int minor)
{
    unsigned int maj_bits = major << 16;

    return TC_H_MAKE(maj_bits, minor);
}
/* Extracts the major number (upper 16 bits) from tc handle 'handle'. */
static unsigned int
tc_get_major(unsigned int handle)
{
    return (handle & TC_H_MAJ_MASK) >> 16;
}
/* Extracts the minor number (lower 16 bits) from tc handle 'handle'. */
static unsigned int
tc_get_minor(unsigned int handle)
{
    return handle & TC_H_MIN_MASK;
}
/* Starts an rtnetlink tc request of the given 'type' and 'flags' for
 * 'netdev', initializing '*request' and returning a pointer to the tcmsg
 * header whose handle/parent the caller must still fill in.  Returns NULL
 * (from the elided error path) if the ifindex cannot be determined. */
3885 static struct tcmsg *
3886 tc_make_request(const struct netdev *netdev, int type, unsigned int flags,
3887 struct ofpbuf *request)
3889 struct tcmsg *tcmsg;
3893 error = get_ifindex(netdev, &ifindex);
3898 ofpbuf_init(request, 512);
3899 nl_msg_put_nlmsghdr(request, sizeof *tcmsg, type, NLM_F_REQUEST | flags);
3900 tcmsg = ofpbuf_put_zeros(request, sizeof *tcmsg);
3901 tcmsg->tcm_family = AF_UNSPEC;
3902 tcmsg->tcm_ifindex = ifindex;
3903 /* Caller should fill in tcmsg->tcm_handle. */
3904 /* Caller should fill in tcmsg->tcm_parent. */
/* Executes 'request' on the NETLINK_ROUTE socket, optionally returning the
 * reply in '*replyp', and frees the request buffer in all cases. */
3910 tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
3912 int error = nl_transact(NETLINK_ROUTE, request, replyp);
3913 ofpbuf_uninit(request);
3917 /* Adds or deletes a root ingress qdisc on 'netdev'. We use this for
3918 * policing configuration.
3920 * This function is equivalent to running the following when 'add' is true:
3921 * /sbin/tc qdisc add dev <devname> handle ffff: ingress
3923 * This function is equivalent to running the following when 'add' is false:
3924 * /sbin/tc qdisc del dev <devname> handle ffff: ingress
3926 * The configuration and stats may be seen with the following command:
3927 * /sbin/tc -s qdisc show dev <devname>
3929 * Returns 0 if successful, otherwise a positive errno value.
3932 tc_add_del_ingress_qdisc(struct netdev *netdev, bool add)
3934 struct ofpbuf request;
3935 struct tcmsg *tcmsg;
3937 int type = add ? RTM_NEWQDISC : RTM_DELQDISC;
3938 int flags = add ? NLM_F_EXCL | NLM_F_CREATE : 0;
3940 tcmsg = tc_make_request(netdev, type, flags, &request);
3944 tcmsg->tcm_handle = tc_make_handle(0xffff, 0);
3945 tcmsg->tcm_parent = TC_H_INGRESS;
3946 nl_msg_put_string(&request, TCA_KIND, "ingress");
3947 nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0);
3949 error = tc_transact(&request, NULL);
3951 /* If we're deleting the qdisc, don't worry about some of the
3952 * error conditions. */
3953 if (!add && (error == ENOENT || error == EINVAL)) {
3962 /* Adds a policer to 'netdev' with a rate of 'kbits_rate' and a burst size
3965 * This function is equivalent to running:
3966 * /sbin/tc filter add dev <devname> parent ffff: protocol all prio 49
3967 * basic police rate <kbits_rate>kbit burst <kbits_burst>k
3970 * The configuration and stats may be seen with the following command:
3971 * /sbin/tc -s filter show <devname> eth0 parent ffff:
3973 * Returns 0 if successful, otherwise a positive errno value.
3976 tc_add_policer(struct netdev *netdev, int kbits_rate, int kbits_burst)
3978 struct tc_police tc_police;
3979 struct ofpbuf request;
3980 struct tcmsg *tcmsg;
3981 size_t basic_offset;
3982 size_t police_offset;
3986 memset(&tc_police, 0, sizeof tc_police);
3987 tc_police.action = TC_POLICE_SHOT;
3988 tc_police.mtu = mtu;
3989 tc_fill_rate(&tc_police.rate, (kbits_rate * 1000)/8, mtu);
3990 tc_police.burst = tc_bytes_to_ticks(tc_police.rate.rate,
3991 kbits_burst * 1024);
3993 tcmsg = tc_make_request(netdev, RTM_NEWTFILTER,
3994 NLM_F_EXCL | NLM_F_CREATE, &request);
3998 tcmsg->tcm_parent = tc_make_handle(0xffff, 0);
3999 tcmsg->tcm_info = tc_make_handle(49,
4000 (OVS_FORCE uint16_t) htons(ETH_P_ALL));
4002 nl_msg_put_string(&request, TCA_KIND, "basic");
4003 basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
4004 police_offset = nl_msg_start_nested(&request, TCA_BASIC_POLICE);
4005 nl_msg_put_unspec(&request, TCA_POLICE_TBF, &tc_police, sizeof tc_police);
4006 tc_put_rtab(&request, TCA_POLICE_RATE, &tc_police.rate);
4007 nl_msg_end_nested(&request, police_offset);
4008 nl_msg_end_nested(&request, basic_offset);
4010 error = tc_transact(&request, NULL);
4021 /* The values in psched are not individually very meaningful, but they are
4022 * important. The tables below show some values seen in the wild.
4026 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
4027 * (Before that, there are hints that it was 1000000000.)
4029 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
4033 * -----------------------------------
4034 * [1] 000c8000 000f4240 000f4240 00000064
4035 * [2] 000003e8 00000400 000f4240 3b9aca00
4036 * [3] 000003e8 00000400 000f4240 3b9aca00
4037 * [4] 000003e8 00000400 000f4240 00000064
4038 * [5] 000003e8 00000040 000f4240 3b9aca00
4039 * [6] 000003e8 00000040 000f4240 000000f9
4041 * a b c d ticks_per_s buffer_hz
4042 * ------- --------- ---------- ------------- ----------- -------------
4043 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
4044 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
4045 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
4046 * [4] 1,000 1,024 1,000,000 100 976,562 100
4047 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
4048 * [6] 1,000 64 1,000,000 249 15,625,000 249
4050 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
4051 * [2] 2.6.26-1-686-bigmem from Debian lenny
4052 * [3] 2.6.26-2-sparc64 from Debian lenny
4053 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
4054 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
4055 * [6] 2.6.34 from kernel.org on KVM
4057 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
4058 static const char fn[] = "/proc/net/psched";
4059 unsigned int a, b, c, d;
4062 if (!ovsthread_once_start(&once)) {
4069 stream = fopen(fn, "r");
4071 VLOG_WARN("%s: open failed: %s", fn, ovs_strerror(errno));
4075 if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
4076 VLOG_WARN("%s: read failed", fn);
4080 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
4084 VLOG_WARN("%s: invalid scheduler parameters", fn);
4088 ticks_per_s = (double) a * c / b;
4092 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
4095 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);
4098 ovsthread_once_done(&once);
4101 /* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
4102 * rate of 'rate' bytes per second. */
4104 tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
4107 return (rate * ticks) / ticks_per_s;
4110 /* Returns the number of ticks that it would take to transmit 'size' bytes at a
4111 * rate of 'rate' bytes per second. */
4113 tc_bytes_to_ticks(unsigned int rate, unsigned int size)
4116 return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
4119 /* Returns the number of bytes that need to be reserved for qdisc buffering at
4120 * a transmission rate of 'rate' bytes per second. */
4122 tc_buffer_per_jiffy(unsigned int rate)
4125 return rate / buffer_hz;
4128 /* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
4129 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
4130 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
4131 * stores NULL into it if it is absent.
4133 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
4136 * Returns 0 if successful, otherwise a positive errno value. */
4138 tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
4139 struct nlattr **options)
4141 static const struct nl_policy tca_policy[] = {
4142 [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
4143 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
4145 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
4147 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
4148 tca_policy, ta, ARRAY_SIZE(ta))) {
4149 VLOG_WARN_RL(&rl, "failed to parse qdisc message");
4154 *kind = nl_attr_get_string(ta[TCA_KIND]);
4158 *options = ta[TCA_OPTIONS];
4173 /* Given Netlink 'msg' that describes a class, extracts the queue ID (e.g. the
4174 * minor number of its class ID) into '*queue_id', its TCA_OPTIONS attribute
4175 * into '*options', and its queue statistics into '*stats'. Any of the output
4176 * arguments may be null.
4178 * Returns 0 if successful, otherwise a positive errno value. */
4180 tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
4181 struct nlattr **options, struct netdev_queue_stats *stats)
4183 static const struct nl_policy tca_policy[] = {
4184 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
4185 [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
4187 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
4189 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
4190 tca_policy, ta, ARRAY_SIZE(ta))) {
4191 VLOG_WARN_RL(&rl, "failed to parse class message");
4196 struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
4197 *handlep = tc->tcm_handle;
4201 *options = ta[TCA_OPTIONS];
4205 const struct gnet_stats_queue *gsq;
4206 struct gnet_stats_basic gsb;
4208 static const struct nl_policy stats_policy[] = {
4209 [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
4210 .min_len = sizeof gsb },
4211 [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
4212 .min_len = sizeof *gsq },
4214 struct nlattr *sa[ARRAY_SIZE(stats_policy)];
4216 if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
4217 sa, ARRAY_SIZE(sa))) {
4218 VLOG_WARN_RL(&rl, "failed to parse class stats");
4222 /* Alignment issues screw up the length of struct gnet_stats_basic on
4223 * some arch/bitsize combinations. Newer versions of Linux have a
4224 * struct gnet_stats_basic_packed, but we can't depend on that. The
4225 * easiest thing to do is just to make a copy. */
4226 memset(&gsb, 0, sizeof gsb);
4227 memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
4228 MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
4229 stats->tx_bytes = gsb.bytes;
4230 stats->tx_packets = gsb.packets;
4232 gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
4233 stats->tx_errors = gsq->drops;
4243 memset(stats, 0, sizeof *stats);
4248 /* Queries the kernel for class with identifier 'handle' and parent 'parent'
4251 tc_query_class(const struct netdev *netdev,
4252 unsigned int handle, unsigned int parent,
4253 struct ofpbuf **replyp)
4255 struct ofpbuf request;
4256 struct tcmsg *tcmsg;
4259 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO, &request);
4263 tcmsg->tcm_handle = handle;
4264 tcmsg->tcm_parent = parent;
4266 error = tc_transact(&request, replyp);
4268 VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
4269 netdev_get_name(netdev),
4270 tc_get_major(handle), tc_get_minor(handle),
4271 tc_get_major(parent), tc_get_minor(parent),
4272 ovs_strerror(error));
4277 /* Equivalent to "tc class del dev <name> handle <handle>". */
4279 tc_delete_class(const struct netdev *netdev, unsigned int handle)
4281 struct ofpbuf request;
4282 struct tcmsg *tcmsg;
4285 tcmsg = tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
4289 tcmsg->tcm_handle = handle;
4290 tcmsg->tcm_parent = 0;
4292 error = tc_transact(&request, NULL);
4294 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
4295 netdev_get_name(netdev),
4296 tc_get_major(handle), tc_get_minor(handle),
4297 ovs_strerror(error));
4302 /* Equivalent to "tc qdisc del dev <name> root". */
4304 tc_del_qdisc(struct netdev *netdev_)
4306 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4307 struct ofpbuf request;
4308 struct tcmsg *tcmsg;
4311 tcmsg = tc_make_request(netdev_, RTM_DELQDISC, 0, &request);
4315 tcmsg->tcm_handle = tc_make_handle(1, 0);
4316 tcmsg->tcm_parent = TC_H_ROOT;
4318 error = tc_transact(&request, NULL);
4319 if (error == EINVAL) {
4320 /* EINVAL probably means that the default qdisc was in use, in which
4321 * case we've accomplished our purpose. */
4324 if (!error && netdev->tc) {
4325 if (netdev->tc->ops->tc_destroy) {
4326 netdev->tc->ops->tc_destroy(netdev->tc);
4333 /* If 'netdev''s qdisc type and parameters are not yet known, queries the
4334 * kernel to determine what they are. Returns 0 if successful, otherwise a
4335 * positive errno value. */
4337 tc_query_qdisc(const struct netdev *netdev_)
4339 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4340 struct ofpbuf request, *qdisc;
4341 const struct tc_ops *ops;
4342 struct tcmsg *tcmsg;
4350 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
4351 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
4352 * 2.6.35 without that fix backported to it.
4354 * To avoid the OOPS, we must not make a request that would attempt to dump
4355 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
4356 * few others. There are a few ways that I can see to do this, but most of
4357 * them seem to be racy (and if you lose the race the kernel OOPSes). The
4358 * technique chosen here is to assume that any non-default qdisc that we
4359 * create will have a class with handle 1:0. The built-in qdiscs only have
4360 * a class with handle 0:0.
4362 * We could check for Linux 2.6.35+ and use a more straightforward method
4364 tcmsg = tc_make_request(netdev_, RTM_GETQDISC, NLM_F_ECHO, &request);
4368 tcmsg->tcm_handle = tc_make_handle(1, 0);
4369 tcmsg->tcm_parent = 0;
4371 /* Figure out what tc class to instantiate. */
4372 error = tc_transact(&request, &qdisc);
4376 error = tc_parse_qdisc(qdisc, &kind, NULL);
4378 ops = &tc_ops_other;
4380 ops = tc_lookup_linux_name(kind);
4382 static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
4383 VLOG_INFO_RL(&rl2, "unknown qdisc \"%s\"", kind);
4385 ops = &tc_ops_other;
4388 } else if (error == ENOENT) {
4389 /* Either it's a built-in qdisc, or it's a qdisc set up by some
4390 * other entity that doesn't have a handle 1:0. We will assume
4391 * that it's the system default qdisc. */
4392 ops = &tc_ops_default;
4395 /* Who knows? Maybe the device got deleted. */
4396 VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
4397 netdev_get_name(netdev_), ovs_strerror(error));
4398 ops = &tc_ops_other;
4401 /* Instantiate it. */
4402 load_error = ops->tc_load(CONST_CAST(struct netdev *, netdev_), qdisc);
4403 ovs_assert((load_error == 0) == (netdev->tc != NULL));
4404 ofpbuf_delete(qdisc);
4406 return error ? error : load_error;
4409 /* Linux traffic control uses tables with 256 entries ("rtab" tables) to
4410 approximate the time to transmit packets of various lengths. For an MTU of
4411 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
4412 represents two possible packet lengths; for a MTU of 513 through 1024, four
4413 possible lengths; and so on.
4415 Returns, for the specified 'mtu', the number of bits that packet lengths
4416 need to be shifted right to fit within such a 256-entry table. */
4418 tc_calc_cell_log(unsigned int mtu)
4423 mtu = ETH_PAYLOAD_MAX;
4425 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
4427 for (cell_log = 0; mtu >= 256; cell_log++) {
4434 /* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
4437 tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
4439 memset(rate, 0, sizeof *rate);
4440 rate->cell_log = tc_calc_cell_log(mtu);
4441 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
4442 /* rate->cell_align = 0; */ /* distro headers. */
4443 rate->mpu = ETH_TOTAL_MIN;
4447 /* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
4448 * attribute of the specified "type".
4450 * See tc_calc_cell_log() above for a description of "rtab"s. */
4452 tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
4457 rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
4458 for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
4459 unsigned packet_size = (i + 1) << rate->cell_log;
4460 if (packet_size < rate->mpu) {
4461 packet_size = rate->mpu;
4463 rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
4467 /* Calculates the proper value of 'buffer' or 'cbuffer' in HTB options given a
4468 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
4469 * burst size of 'burst_bytes'. (If no value was requested, a 'burst_bytes' of
4472 tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
4474 unsigned int min_burst = tc_buffer_per_jiffy(Bps) + mtu;
4475 return tc_bytes_to_ticks(Bps, MAX(burst_bytes, min_burst));
4478 /* Linux-only functions declared in netdev-linux.h */
4480 /* Modifies the 'flag' bit in ethtool's flags field for 'netdev'. If
4481 * 'enable' is true, the bit is set. Otherwise, it is cleared. */
4483 netdev_linux_ethtool_set_flag(struct netdev *netdev, uint32_t flag,
4484 const char *flag_name, bool enable)
4486 const char *netdev_name = netdev_get_name(netdev);
4487 struct ethtool_value evalue;
4491 COVERAGE_INC(netdev_get_ethtool);
4492 memset(&evalue, 0, sizeof evalue);
4493 error = netdev_linux_do_ethtool(netdev_name,
4494 (struct ethtool_cmd *)&evalue,
4495 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
4500 COVERAGE_INC(netdev_set_ethtool);
4501 evalue.data = new_flags = (evalue.data & ~flag) | (enable ? flag : 0);
4502 error = netdev_linux_do_ethtool(netdev_name,
4503 (struct ethtool_cmd *)&evalue,
4504 ETHTOOL_SFLAGS, "ETHTOOL_SFLAGS");
4509 COVERAGE_INC(netdev_get_ethtool);
4510 memset(&evalue, 0, sizeof evalue);
4511 error = netdev_linux_do_ethtool(netdev_name,
4512 (struct ethtool_cmd *)&evalue,
4513 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
4518 if (new_flags != evalue.data) {
4519 VLOG_WARN_RL(&rl, "attempt to %s ethtool %s flag on network "
4520 "device %s failed", enable ? "enable" : "disable",
4521 flag_name, netdev_name);
4528 /* Utility functions. */
4530 /* Copies 'src' into 'dst', performing format conversion in the process. */
4532 netdev_stats_from_rtnl_link_stats(struct netdev_stats *dst,
4533 const struct rtnl_link_stats *src)
4535 dst->rx_packets = src->rx_packets;
4536 dst->tx_packets = src->tx_packets;
4537 dst->rx_bytes = src->rx_bytes;
4538 dst->tx_bytes = src->tx_bytes;
4539 dst->rx_errors = src->rx_errors;
4540 dst->tx_errors = src->tx_errors;
4541 dst->rx_dropped = src->rx_dropped;
4542 dst->tx_dropped = src->tx_dropped;
4543 dst->multicast = src->multicast;
4544 dst->collisions = src->collisions;
4545 dst->rx_length_errors = src->rx_length_errors;
4546 dst->rx_over_errors = src->rx_over_errors;
4547 dst->rx_crc_errors = src->rx_crc_errors;
4548 dst->rx_frame_errors = src->rx_frame_errors;
4549 dst->rx_fifo_errors = src->rx_fifo_errors;
4550 dst->rx_missed_errors = src->rx_missed_errors;
4551 dst->tx_aborted_errors = src->tx_aborted_errors;
4552 dst->tx_carrier_errors = src->tx_carrier_errors;
4553 dst->tx_fifo_errors = src->tx_fifo_errors;
4554 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
4555 dst->tx_window_errors = src->tx_window_errors;
4559 get_stats_via_netlink(const struct netdev *netdev_, struct netdev_stats *stats)
4561 struct ofpbuf request;
4562 struct ofpbuf *reply;
4565 ofpbuf_init(&request, 0);
4566 nl_msg_put_nlmsghdr(&request,
4567 sizeof(struct ifinfomsg) + NL_ATTR_SIZE(IFNAMSIZ),
4568 RTM_GETLINK, NLM_F_REQUEST);
4569 ofpbuf_put_zeros(&request, sizeof(struct ifinfomsg));
4570 nl_msg_put_string(&request, IFLA_IFNAME, netdev_get_name(netdev_));
4571 error = nl_transact(NETLINK_ROUTE, &request, &reply);
4572 ofpbuf_uninit(&request);
4577 if (ofpbuf_try_pull(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg))) {
4578 const struct nlattr *a = nl_attr_find(reply, 0, IFLA_STATS);
4579 if (a && nl_attr_get_size(a) >= sizeof(struct rtnl_link_stats)) {
4580 netdev_stats_from_rtnl_link_stats(stats, nl_attr_get(a));
4583 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
4587 VLOG_WARN_RL(&rl, "short RTM_GETLINK reply");
4592 ofpbuf_delete(reply);
4597 get_flags(const struct netdev *dev, unsigned int *flags)
4603 error = af_inet_ifreq_ioctl(dev->name, &ifr, SIOCGIFFLAGS, "SIOCGIFFLAGS");
4605 *flags = ifr.ifr_flags;
4611 set_flags(const char *name, unsigned int flags)
4615 ifr.ifr_flags = flags;
4616 return af_inet_ifreq_ioctl(name, &ifr, SIOCSIFFLAGS, "SIOCSIFFLAGS");
4620 do_get_ifindex(const char *netdev_name)
4625 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4626 COVERAGE_INC(netdev_get_ifindex);
4628 error = af_inet_ioctl(SIOCGIFINDEX, &ifr);
4630 VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
4631 netdev_name, ovs_strerror(error));
4634 return ifr.ifr_ifindex;
4638 get_ifindex(const struct netdev *netdev_, int *ifindexp)
4640 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4642 if (!(netdev->cache_valid & VALID_IFINDEX)) {
4643 int ifindex = do_get_ifindex(netdev_get_name(netdev_));
4646 netdev->get_ifindex_error = -ifindex;
4647 netdev->ifindex = 0;
4649 netdev->get_ifindex_error = 0;
4650 netdev->ifindex = ifindex;
4652 netdev->cache_valid |= VALID_IFINDEX;
4655 *ifindexp = netdev->ifindex;
4656 return netdev->get_ifindex_error;
4660 get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN])
4666 memset(&ifr, 0, sizeof ifr);
4667 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4668 COVERAGE_INC(netdev_get_hwaddr);
4669 error = af_inet_ioctl(SIOCGIFHWADDR, &ifr);
4671 /* ENODEV probably means that a vif disappeared asynchronously and
4672 * hasn't been removed from the database yet, so reduce the log level
4673 * to INFO for that case. */
4674 VLOG(error == ENODEV ? VLL_INFO : VLL_ERR,
4675 "ioctl(SIOCGIFHWADDR) on %s device failed: %s",
4676 netdev_name, ovs_strerror(error));
4679 hwaddr_family = ifr.ifr_hwaddr.sa_family;
4680 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER) {
4681 VLOG_WARN("%s device has unknown hardware address family %d",
4682 netdev_name, hwaddr_family);
4684 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
4689 set_etheraddr(const char *netdev_name,
4690 const uint8_t mac[ETH_ADDR_LEN])
4695 memset(&ifr, 0, sizeof ifr);
4696 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4697 ifr.ifr_hwaddr.sa_family = ARPHRD_ETHER;
4698 memcpy(ifr.ifr_hwaddr.sa_data, mac, ETH_ADDR_LEN);
4699 COVERAGE_INC(netdev_set_hwaddr);
4700 error = af_inet_ioctl(SIOCSIFHWADDR, &ifr);
4702 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
4703 netdev_name, ovs_strerror(error));
4709 netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
4710 int cmd, const char *cmd_name)
4715 memset(&ifr, 0, sizeof ifr);
4716 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
4717 ifr.ifr_data = (caddr_t) ecmd;
4720 error = af_inet_ioctl(SIOCETHTOOL, &ifr);
4722 if (error != EOPNOTSUPP) {
4723 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
4724 "failed: %s", cmd_name, name, ovs_strerror(error));
4726 /* The device doesn't support this operation. That's pretty
4727 * common, so there's no point in logging anything. */
4734 netdev_linux_get_ipv4(const struct netdev *netdev, struct in_addr *ip,
4735 int cmd, const char *cmd_name)
4740 ifr.ifr_addr.sa_family = AF_INET;
4741 error = af_inet_ifreq_ioctl(netdev_get_name(netdev), &ifr, cmd, cmd_name);
4743 const struct sockaddr_in *sin = ALIGNED_CAST(struct sockaddr_in *,
4745 *ip = sin->sin_addr;
4750 /* Returns an AF_PACKET raw socket or a negative errno value. */
4752 af_packet_sock(void)
4754 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
4757 if (ovsthread_once_start(&once)) {
4758 sock = socket(AF_PACKET, SOCK_RAW, 0);
4760 int error = set_nonblocking(sock);
4767 VLOG_ERR("failed to create packet socket: %s",
4768 ovs_strerror(errno));
4770 ovsthread_once_done(&once);