2 * Copyright (c) 2009, 2010, 2011, 2012, 2013, 2014 Nicira, Inc.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
19 #include "netdev-linux.h"
23 #include <arpa/inet.h>
25 #include <linux/filter.h>
26 #include <linux/gen_stats.h>
27 #include <linux/if_ether.h>
28 #include <linux/if_tun.h>
29 #include <linux/types.h>
30 #include <linux/ethtool.h>
31 #include <linux/mii.h>
32 #include <linux/pkt_cls.h>
33 #include <linux/pkt_sched.h>
34 #include <linux/rtnetlink.h>
35 #include <linux/sockios.h>
36 #include <sys/types.h>
37 #include <sys/ioctl.h>
38 #include <sys/socket.h>
39 #include <netpacket/packet.h>
41 #include <net/if_arp.h>
42 #include <net/if_packet.h>
43 #include <net/route.h>
44 #include <netinet/in.h>
50 #include "connectivity.h"
52 #include "dpif-linux.h"
53 #include "dpif-netdev.h"
54 #include "dynamic-string.h"
55 #include "fatal-signal.h"
58 #include "netdev-provider.h"
59 #include "netdev-vport.h"
60 #include "netlink-notifier.h"
61 #include "netlink-socket.h"
64 #include "openflow/openflow.h"
65 #include "ovs-atomic.h"
67 #include "poll-loop.h"
68 #include "rtnetlink-link.h"
71 #include "socket-util.h"
74 #include "unaligned.h"
77 VLOG_DEFINE_THIS_MODULE(netdev_linux);
79 COVERAGE_DEFINE(netdev_set_policing);
80 COVERAGE_DEFINE(netdev_arp_lookup);
81 COVERAGE_DEFINE(netdev_get_ifindex);
82 COVERAGE_DEFINE(netdev_get_hwaddr);
83 COVERAGE_DEFINE(netdev_set_hwaddr);
84 COVERAGE_DEFINE(netdev_get_ethtool);
85 COVERAGE_DEFINE(netdev_set_ethtool);
88 /* These were introduced in Linux 2.6.14, so they might be missing if we have
90 #ifndef ADVERTISED_Pause
91 #define ADVERTISED_Pause (1 << 13)
93 #ifndef ADVERTISED_Asym_Pause
94 #define ADVERTISED_Asym_Pause (1 << 14)
97 /* These were introduced in Linux 2.6.24, so they might be missing if we
98 * have old headers. */
99 #ifndef ETHTOOL_GFLAGS
100 #define ETHTOOL_GFLAGS 0x00000025 /* Get flags bitmap(ethtool_value) */
102 #ifndef ETHTOOL_SFLAGS
103 #define ETHTOOL_SFLAGS 0x00000026 /* Set flags bitmap(ethtool_value) */
106 /* This was introduced in Linux 2.6.25, so it might be missing if we have old
109 #define TC_RTAB_SIZE 1024
112 /* Linux 2.6.21 introduced struct tpacket_auxdata.
113 * Linux 2.6.27 added the tp_vlan_tci member.
114 * Linux 3.0 defined TP_STATUS_VLAN_VALID.
115 * Linux 3.13 repurposed a padding member for tp_vlan_tpid and defined
116 * TP_STATUS_VLAN_TPID_VALID.
118 * With all this churn it's easiest to unconditionally define a replacement
119 * structure that has everything we want.
121 #ifndef PACKET_AUXDATA
122 #define PACKET_AUXDATA 8
124 #ifndef TP_STATUS_VLAN_VALID
125 #define TP_STATUS_VLAN_VALID (1 << 4)
127 #ifndef TP_STATUS_VLAN_TPID_VALID
128 #define TP_STATUS_VLAN_TPID_VALID (1 << 6)
130 #undef tpacket_auxdata
131 #define tpacket_auxdata rpl_tpacket_auxdata
132 struct tpacket_auxdata {
138 uint16_t tp_vlan_tci;
139 uint16_t tp_vlan_tpid;
143 VALID_IFINDEX = 1 << 0,
144 VALID_ETHERADDR = 1 << 1,
148 VALID_POLICING = 1 << 5,
149 VALID_VPORT_STAT_ERROR = 1 << 6,
150 VALID_DRVINFO = 1 << 7,
151 VALID_FEATURES = 1 << 8,
154 /* Traffic control. */
156 /* An instance of a traffic control class. Always associated with a particular
159 * Each TC implementation subclasses this with whatever additional data it
162 const struct tc_ops *ops;
163 struct hmap queues; /* Contains "struct tc_queue"s.
164 * Read by generic TC layer.
165 * Written only by TC implementation. */
168 #define TC_INITIALIZER(TC, OPS) { OPS, HMAP_INITIALIZER(&(TC)->queues) }
170 /* One traffic control queue.
172 * Each TC implementation subclasses this with whatever additional data it
175 struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
176 unsigned int queue_id; /* OpenFlow queue ID. */
177 long long int created; /* Time queue was created, in msecs. */
180 /* A particular kind of traffic control. Each implementation generally maps to
181 * one particular Linux qdisc class.
183 * The functions below return 0 if successful or a positive errno value on
184 * failure, except where otherwise noted. All of them must be provided, except
185 * where otherwise noted. */
187 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
188 * This is null for tc_ops_default and tc_ops_other, for which there are no
189 * appropriate values. */
190 const char *linux_name;
192 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
193 const char *ovs_name;
195 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
196 * queues. The queues are numbered 0 through n_queues - 1. */
197 unsigned int n_queues;
199 /* Called to install this TC class on 'netdev'. The implementation should
200 * make the Netlink calls required to set up 'netdev' with the right qdisc
201 * and configure it according to 'details'. The implementation may assume
202 * that the current qdisc is the default; that is, there is no need for it
203 * to delete the current qdisc before installing itself.
205 * The contents of 'details' should be documented as valid for 'ovs_name'
206 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
207 * (which is built as ovs-vswitchd.conf.db(8)).
209 * This function must return 0 if and only if it sets 'netdev->tc' to an
210 * initialized 'struct tc'.
212 * (This function is null for tc_ops_other, which cannot be installed. For
213 * other TC classes it should always be nonnull.) */
214 int (*tc_install)(struct netdev *netdev, const struct smap *details);
216 /* Called when the netdev code determines (through a Netlink query) that
217 * this TC class's qdisc is installed on 'netdev', but we didn't install
218 * it ourselves and so don't know any of the details.
220 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
221 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
222 * implementation should parse the other attributes of 'nlmsg' as
223 * necessary to determine its configuration. If necessary it should also
224 * use Netlink queries to determine the configuration of queues on
227 * This function must return 0 if and only if it sets 'netdev->tc' to an
228 * initialized 'struct tc'. */
229 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
231 /* Destroys the data structures allocated by the implementation as part of
232 * 'tc'. (This includes destroying 'tc->queues' by calling
235 * The implementation should not need to perform any Netlink calls. If
236 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
237 * (But it may not be desirable.)
239 * This function may be null if 'tc' is trivial. */
240 void (*tc_destroy)(struct tc *tc);
242 /* Retrieves details of 'netdev->tc' configuration into 'details'.
244 * The implementation should not need to perform any Netlink calls, because
245 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
246 * cached the configuration.
248 * The contents of 'details' should be documented as valid for 'ovs_name'
249 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
250 * (which is built as ovs-vswitchd.conf.db(8)).
252 * This function may be null if 'tc' is not configurable.
254 int (*qdisc_get)(const struct netdev *netdev, struct smap *details);
256 /* Reconfigures 'netdev->tc' according to 'details', performing any
257 * required Netlink calls to complete the reconfiguration.
259 * The contents of 'details' should be documented as valid for 'ovs_name'
260 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
261 * (which is built as ovs-vswitchd.conf.db(8)).
263 * This function may be null if 'tc' is not configurable.
265 int (*qdisc_set)(struct netdev *, const struct smap *details);
267 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
268 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
270 * The contents of 'details' should be documented as valid for 'ovs_name'
271 * in the "other_config" column in the "Queue" table in
272 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
274 * The implementation should not need to perform any Netlink calls, because
275 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
276 * cached the queue configuration.
278 * This function may be null if 'tc' does not have queues ('n_queues' is
280 int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
281 struct smap *details);
283 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
284 * 'details', perfoming any required Netlink calls to complete the
285 * reconfiguration. The caller ensures that 'queue_id' is less than
288 * The contents of 'details' should be documented as valid for 'ovs_name'
289 * in the "other_config" column in the "Queue" table in
290 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
292 * This function may be null if 'tc' does not have queues or its queues are
293 * not configurable. */
294 int (*class_set)(struct netdev *, unsigned int queue_id,
295 const struct smap *details);
297 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
298 * tc_queue's within 'netdev->tc->queues'.
300 * This function may be null if 'tc' does not have queues or its queues
301 * cannot be deleted. */
302 int (*class_delete)(struct netdev *, struct tc_queue *queue);
304 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
305 * 'struct tc_queue's within 'netdev->tc->queues'.
307 * On success, initializes '*stats'.
309 * This function may be null if 'tc' does not have queues or if it cannot
310 * report queue statistics. */
311 int (*class_get_stats)(const struct netdev *netdev,
312 const struct tc_queue *queue,
313 struct netdev_queue_stats *stats);
315 /* Extracts queue stats from 'nlmsg', which is a response to a
316 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
318 * This function may be null if 'tc' does not have queues or if it cannot
319 * report queue statistics. */
320 int (*class_dump_stats)(const struct netdev *netdev,
321 const struct ofpbuf *nlmsg,
322 netdev_dump_queue_stats_cb *cb, void *aux);
/* Initializes 'tc' as a new instance of TC class 'ops'; creates the empty
 * 'queues' hmap.  (The assignment of 'ops' into 'tc' falls on a line missing
 * from this listing -- confirm against upstream.) */
326 tc_init(struct tc *tc, const struct tc_ops *ops)
329 hmap_init(&tc->queues);
/* Destroys the generic part of 'tc' (the 'queues' hmap).  TC implementations
 * call this after freeing their own per-queue data. */
333 tc_destroy(struct tc *tc)
335 hmap_destroy(&tc->queues);
338 static const struct tc_ops tc_ops_htb;
339 static const struct tc_ops tc_ops_hfsc;
340 static const struct tc_ops tc_ops_default;
341 static const struct tc_ops tc_ops_other;
343 static const struct tc_ops *const tcs[] = {
344 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
345 &tc_ops_hfsc, /* Hierarchical fair service curve. */
346 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
347 &tc_ops_other, /* Some other qdisc. */
351 static unsigned int tc_make_handle(unsigned int major, unsigned int minor);
352 static unsigned int tc_get_major(unsigned int handle);
353 static unsigned int tc_get_minor(unsigned int handle);
355 static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
356 static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
357 static unsigned int tc_buffer_per_jiffy(unsigned int rate);
359 static struct tcmsg *tc_make_request(const struct netdev *, int type,
360 unsigned int flags, struct ofpbuf *);
361 static int tc_transact(struct ofpbuf *request, struct ofpbuf **replyp);
362 static int tc_add_del_ingress_qdisc(struct netdev *netdev, bool add);
363 static int tc_add_policer(struct netdev *netdev, int kbits_rate,
366 static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
367 struct nlattr **options);
368 static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
369 struct nlattr **options,
370 struct netdev_queue_stats *);
371 static int tc_query_class(const struct netdev *,
372 unsigned int handle, unsigned int parent,
373 struct ofpbuf **replyp);
374 static int tc_delete_class(const struct netdev *, unsigned int handle);
376 static int tc_del_qdisc(struct netdev *netdev);
377 static int tc_query_qdisc(const struct netdev *netdev);
379 static int tc_calc_cell_log(unsigned int mtu);
380 static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
381 static void tc_put_rtab(struct ofpbuf *, uint16_t type,
382 const struct tc_ratespec *rate);
383 static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
385 struct netdev_linux {
388 /* Protects all members below. */
389 struct ovs_mutex mutex;
391 unsigned int cache_valid;
393 bool miimon; /* Link status of last poll. */
394 long long int miimon_interval; /* Miimon Poll rate. Disabled if <= 0. */
395 struct timer miimon_timer;
397 /* The following are figured out "on demand" only. They are only valid
398 * when the corresponding VALID_* bit in 'cache_valid' is set. */
400 uint8_t etheraddr[ETH_ADDR_LEN];
401 struct in_addr address, netmask;
404 unsigned int ifi_flags;
405 long long int carrier_resets;
406 uint32_t kbits_rate; /* Policing data. */
407 uint32_t kbits_burst;
408 int vport_stats_error; /* Cached error code from vport_get_stats().
409 0 or an errno value. */
410 int netdev_mtu_error; /* Cached error code from SIOCGIFMTU or SIOCSIFMTU. */
411 int ether_addr_error; /* Cached error code from set/get etheraddr. */
412 int netdev_policing_error; /* Cached error code from set policing. */
413 int get_features_error; /* Cached error code from ETHTOOL_GSET. */
414 int get_ifindex_error; /* Cached error code from SIOCGIFINDEX. */
416 enum netdev_features current; /* Cached from ETHTOOL_GSET. */
417 enum netdev_features advertised; /* Cached from ETHTOOL_GSET. */
418 enum netdev_features supported; /* Cached from ETHTOOL_GSET. */
420 struct ethtool_drvinfo drvinfo; /* Cached from ETHTOOL_GDRVINFO. */
423 /* For devices of class netdev_tap_class only. */
427 struct netdev_rx_linux {
433 /* This is set pretty low because we probably won't learn anything from the
434 * additional log messages. */
435 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
437 /* Polling miimon status for all ports causes performance degradation when
438 * handling a large number of ports. If there are no devices using miimon, then
439 * we skip netdev_linux_miimon_run() and netdev_linux_miimon_wait(). */
440 static atomic_int miimon_cnt = ATOMIC_VAR_INIT(0);
442 static void netdev_linux_run(void);
444 static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
445 int cmd, const char *cmd_name);
446 static int netdev_linux_get_ipv4(const struct netdev *, struct in_addr *,
447 int cmd, const char *cmd_name);
448 static int get_flags(const struct netdev *, unsigned int *flags);
449 static int set_flags(const char *, unsigned int flags);
450 static int update_flags(struct netdev_linux *netdev, enum netdev_flags off,
451 enum netdev_flags on, enum netdev_flags *old_flagsp)
452 OVS_REQUIRES(netdev->mutex);
453 static int do_get_ifindex(const char *netdev_name);
454 static int get_ifindex(const struct netdev *, int *ifindexp);
455 static int do_set_addr(struct netdev *netdev,
456 int ioctl_nr, const char *ioctl_name,
457 struct in_addr addr);
458 static int get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN]);
459 static int set_etheraddr(const char *netdev_name, const uint8_t[ETH_ADDR_LEN]);
460 static int get_stats_via_netlink(const struct netdev *, struct netdev_stats *);
461 static int af_packet_sock(void);
462 static bool netdev_linux_miimon_enabled(void);
463 static void netdev_linux_miimon_run(void);
464 static void netdev_linux_miimon_wait(void);
465 static int netdev_linux_get_mtu__(struct netdev_linux *netdev, int *mtup);
/* Returns true if 'netdev_class' is one of the Linux-backed classes defined
 * in this file, identified by its 'run' callback being netdev_linux_run. */
468 is_netdev_linux_class(const struct netdev_class *netdev_class)
470 return netdev_class->run == netdev_linux_run;
/* Returns true if 'netdev' is a tap device (class netdev_tap_class). */
474 is_tap_netdev(const struct netdev *netdev)
476 return netdev_get_class(netdev) == &netdev_tap_class;
/* Downcasts generic 'netdev' to its netdev_linux container; asserts that the
 * class really is a Linux class first. */
479 static struct netdev_linux *
480 netdev_linux_cast(const struct netdev *netdev)
482 ovs_assert(is_netdev_linux_class(netdev_get_class(netdev)));
484 return CONTAINER_OF(netdev, struct netdev_linux, up);
/* Downcasts generic 'rx' to its netdev_rx_linux container, with the same
 * class assertion on the owning netdev. */
487 static struct netdev_rx_linux *
488 netdev_rx_linux_cast(const struct netdev_rx *rx)
490 ovs_assert(is_netdev_linux_class(netdev_get_class(rx->netdev)));
491 return CONTAINER_OF(rx, struct netdev_rx_linux, up);
494 static void netdev_linux_update(struct netdev_linux *netdev,
495 const struct rtnetlink_link_change *)
496 OVS_REQUIRES(netdev->mutex);
497 static void netdev_linux_changed(struct netdev_linux *netdev,
498 unsigned int ifi_flags, unsigned int mask)
499 OVS_REQUIRES(netdev->mutex);
501 /* Returns a NETLINK_ROUTE socket listening for RTNLGRP_LINK changes, or NULL
502 * if no such socket could be created. */
503 static struct nl_sock *
504 netdev_linux_notify_sock(void)
/* One-time, thread-safe lazy initialization: the socket is created and joined
 * to the RTNLGRP_LINK multicast group only on the first call; later calls
 * return the same socket. */
506 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
507 static struct nl_sock *sock;
509 if (ovsthread_once_start(&once)) {
512 error = nl_sock_create(NETLINK_ROUTE, &sock);
514 error = nl_sock_join_mcgroup(sock, RTNLGRP_LINK);
/* On multicast-join failure the socket is destroyed; presumably 'sock' is
 * reset to NULL on a line missing from this listing -- confirm upstream. */
516 nl_sock_destroy(sock);
520 ovsthread_once_done(&once);
/* Returns true if any device currently has miimon polling enabled, i.e. the
 * global 'miimon_cnt' counter is nonzero. */
527 netdev_linux_miimon_enabled(void)
531 atomic_read(&miimon_cnt, &miimon);
/* Global 'run' callback for the Linux netdev classes: polls miimon status if
 * enabled and drains the rtnetlink notification socket, pushing link-change
 * messages into the affected netdev_linux objects. */
536 netdev_linux_run(void)
538 struct nl_sock *sock;
541 if (netdev_linux_miimon_enabled()) {
542 netdev_linux_miimon_run();
545 sock = netdev_linux_notify_sock();
551 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
552 uint64_t buf_stub[4096 / 8];
/* Receive one netlink message into a stack-backed ofpbuf (nonblocking). */
555 ofpbuf_use_stub(&buf, buf_stub, sizeof buf_stub);
556 error = nl_sock_recv(sock, &buf, false);
558 struct rtnetlink_link_change change;
560 if (rtnetlink_link_parse(&buf, &change)) {
561 struct netdev *netdev_ = netdev_from_name(change.ifname);
562 if (netdev_ && is_netdev_linux_class(netdev_->netdev_class)) {
563 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
565 ovs_mutex_lock(&netdev->mutex);
566 netdev_linux_update(netdev, &change);
567 ovs_mutex_unlock(&netdev->mutex);
/* netdev_from_name() took a reference; release it. */
569 netdev_close(netdev_);
/* ENOBUFS: the kernel dropped notifications (socket buffer overflow), so we
 * do not know which devices changed.  Refresh the kernel flags of every
 * device of the system class and invalidate their cached state. */
571 } else if (error == ENOBUFS) {
572 struct shash device_shash;
573 struct shash_node *node;
577 shash_init(&device_shash);
578 netdev_get_devices(&netdev_linux_class, &device_shash);
579 SHASH_FOR_EACH (node, &device_shash) {
580 struct netdev *netdev_ = node->data;
581 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
584 ovs_mutex_lock(&netdev->mutex);
585 get_flags(netdev_, &flags);
586 netdev_linux_changed(netdev, flags, 0);
587 ovs_mutex_unlock(&netdev->mutex);
589 netdev_close(netdev_);
591 shash_destroy(&device_shash);
592 } else if (error != EAGAIN) {
593 VLOG_WARN_RL(&rl, "error reading or parsing netlink (%s)",
594 ovs_strerror(error));
/* Global 'wait' callback: arranges for poll_block() to wake when miimon needs
 * another poll or when the rtnetlink notification socket becomes readable. */
601 netdev_linux_wait(void)
603 struct nl_sock *sock;
605 if (netdev_linux_miimon_enabled()) {
606 netdev_linux_miimon_wait();
608 sock = netdev_linux_notify_sock();
610 nl_sock_wait(sock, POLLIN);
/* Records that 'dev''s kernel interface flags are now 'ifi_flags' and keeps
 * only the cache_valid bits listed in 'mask' (everything else is
 * invalidated).  Bumps the global connectivity seq so waiters notice, and
 * counts a carrier reset whenever IFF_RUNNING toggles.
 * Caller must hold dev->mutex. */
615 netdev_linux_changed(struct netdev_linux *dev,
616 unsigned int ifi_flags, unsigned int mask)
617 OVS_REQUIRES(dev->mutex)
619 seq_change(connectivity_seq_get());
621 if ((dev->ifi_flags ^ ifi_flags) & IFF_RUNNING) {
622 dev->carrier_resets++;
624 dev->ifi_flags = ifi_flags;
626 dev->cache_valid &= mask;
/* Applies an rtnetlink link-change message to 'dev'.  RTM_NEWLINK messages
 * refresh the cached MTU, MAC address and ifindex while preserving
 * VALID_DRVINFO; any other message type (presumably RTM_DELLINK -- confirm,
 * the branch line is missing from this listing) invalidates everything.
 * Caller must hold dev->mutex. */
630 netdev_linux_update(struct netdev_linux *dev,
631 const struct rtnetlink_link_change *change)
632 OVS_REQUIRES(dev->mutex)
634 if (change->nlmsg_type == RTM_NEWLINK) {
636 netdev_linux_changed(dev, change->ifi_flags, VALID_DRVINFO);
638 /* Update netdev from rtnl-change msg. */
640 dev->mtu = change->mtu;
641 dev->cache_valid |= VALID_MTU;
642 dev->netdev_mtu_error = 0;
/* An all-zero address in the message means "no address reported". */
645 if (!eth_addr_is_zero(change->addr)) {
646 memcpy(dev->etheraddr, change->addr, ETH_ADDR_LEN);
647 dev->cache_valid |= VALID_ETHERADDR;
648 dev->ether_addr_error = 0;
651 dev->ifindex = change->ifi_index;
652 dev->cache_valid |= VALID_IFINDEX;
653 dev->get_ifindex_error = 0;
656 netdev_linux_changed(dev, change->ifi_flags, 0);
/* 'alloc' callback: allocates a zeroed netdev_linux container.  (The return
 * of its embedded generic netdev falls on a line missing from this
 * listing.) */
660 static struct netdev *
661 netdev_linux_alloc(void)
663 struct netdev_linux *netdev = xzalloc(sizeof *netdev);
/* Construction steps shared by system, internal and tap devices: initializes
 * the per-device mutex. */
668 netdev_linux_common_construct(struct netdev_linux *netdev)
670 ovs_mutex_init(&netdev->mutex);
673 /* Creates system and internal devices. */
675 netdev_linux_construct(struct netdev *netdev_)
677 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
680 netdev_linux_common_construct(netdev);
/* Probe the kernel for the device's current flags.  ENODEV means the device
 * does not exist in the kernel, which is fatal except for "internal"
 * devices (see comment below). */
682 error = get_flags(&netdev->up, &netdev->ifi_flags);
683 if (error == ENODEV) {
684 if (netdev->up.netdev_class != &netdev_internal_class) {
685 /* The device does not exist, so don't allow it to be opened. */
688 /* "Internal" netdevs have to be created as netdev objects before
689 * they exist in the kernel, because creating them in the kernel
690 * happens by passing a netdev object to dpif_port_add().
691 * Therefore, ignore the error. */
698 /* For most types of netdevs we open the device for each call of
699 * netdev_open(). However, this is not the case with tap devices,
700 * since it is only possible to open the device once. In this
701 * situation we share a single file descriptor, and consequently
702 * buffers, across all readers. Therefore once data is read it will
703 * be unavailable to other reads for tap devices. */
705 netdev_linux_construct_tap(struct netdev *netdev_)
707 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
708 static const char tap_dev[] = "/dev/net/tun";
709 const char *name = netdev_->name;
713 netdev_linux_common_construct(netdev);
715 /* Open tap device. */
716 netdev->tap_fd = open(tap_dev, O_RDWR);
717 if (netdev->tap_fd < 0) {
719 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, ovs_strerror(error));
723 /* Create tap device. */
/* IFF_NO_PI: no packet-information prefix, so reads and writes on the fd
 * carry raw Ethernet frames only. */
724 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
725 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
726 if (ioctl(netdev->tap_fd, TUNSETIFF, &ifr) == -1) {
727 VLOG_WARN("%s: creating tap device failed: %s", name,
728 ovs_strerror(errno));
733 /* Make non-blocking. */
734 error = set_nonblocking(netdev->tap_fd);
/* Error path: close the tap fd before returning (the surrounding label and
 * return lines are missing from this listing). */
742 close(netdev->tap_fd);
/* 'destruct' callback: tears down any installed TC state, closes the tap fd
 * for tap devices, rebalances the global miimon counter if this device had
 * miimon enabled, and destroys the per-device mutex. */
747 netdev_linux_destruct(struct netdev *netdev_)
749 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
751 if (netdev->tc && netdev->tc->ops->tc_destroy) {
752 netdev->tc->ops->tc_destroy(netdev->tc);
755 if (netdev_get_class(netdev_) == &netdev_tap_class
756 && netdev->tap_fd >= 0)
758 close(netdev->tap_fd);
761 if (netdev->miimon_interval > 0) {
763 atomic_sub(&miimon_cnt, 1, &junk);
766 ovs_mutex_destroy(&netdev->mutex);
/* 'dealloc' callback: frees the container allocated by netdev_linux_alloc().
 * (The free() call falls on a line missing from this listing.) */
770 netdev_linux_dealloc(struct netdev *netdev_)
772 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
/* 'rx_alloc' callback: allocates a zeroed receiver object. */
776 static struct netdev_rx *
777 netdev_linux_rx_alloc(void)
779 struct netdev_rx_linux *rx = xzalloc(sizeof *rx);
/* 'rx_construct' callback.  For tap devices, reuses the shared tap fd.  For
 * other devices, opens a raw AF_PACKET socket bound to the device, enables
 * PACKET_AUXDATA (for VLAN-tag recovery on receive) and attaches a BPF
 * filter that accepts inbound packets only. */
784 netdev_linux_rx_construct(struct netdev_rx *rx_)
786 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
787 struct netdev *netdev_ = rx->up.netdev;
788 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
791 ovs_mutex_lock(&netdev->mutex);
792 rx->is_tap = is_tap_netdev(netdev_);
794 rx->fd = netdev->tap_fd;
796 struct sockaddr_ll sll;
798 /* Result of tcpdump -dd inbound */
799 static const struct sock_filter filt[] = {
800 { 0x28, 0, 0, 0xfffff004 }, /* ldh [0] */
801 { 0x15, 0, 1, 0x00000004 }, /* jeq #4 jt 2 jf 3 */
802 { 0x6, 0, 0, 0x00000000 }, /* ret #0 */
803 { 0x6, 0, 0, 0x0000ffff } /* ret #65535 */
805 static const struct sock_fprog fprog = {
806 ARRAY_SIZE(filt), (struct sock_filter *) filt
809 /* Create file descriptor. */
810 rx->fd = socket(PF_PACKET, SOCK_RAW, 0);
813 VLOG_ERR("failed to create raw socket (%s)", ovs_strerror(error));
/* Request tpacket_auxdata control messages so recvmsg() can recover any
 * VLAN tag the kernel stripped (see netdev_linux_rx_recv_sock()). */
818 if (setsockopt(rx->fd, SOL_PACKET, PACKET_AUXDATA, &val, sizeof val)) {
820 VLOG_ERR("%s: failed to mark socket for auxdata (%s)",
821 netdev_get_name(netdev_), ovs_strerror(error));
825 /* Set non-blocking mode. */
826 error = set_nonblocking(rx->fd);
831 /* Get ethernet device index. */
832 error = get_ifindex(&netdev->up, &ifindex);
837 /* Bind to specific ethernet device. */
838 memset(&sll, 0, sizeof sll);
839 sll.sll_family = AF_PACKET;
840 sll.sll_ifindex = ifindex;
841 sll.sll_protocol = htons(ETH_P_ALL);
842 if (bind(rx->fd, (struct sockaddr *) &sll, sizeof sll) < 0) {
844 VLOG_ERR("%s: failed to bind raw socket (%s)",
845 netdev_get_name(netdev_), ovs_strerror(error));
849 /* Filter for only inbound packets. */
850 error = setsockopt(rx->fd, SOL_SOCKET, SO_ATTACH_FILTER, &fprog,
854 VLOG_ERR("%s: failed to attach filter (%s)",
855 netdev_get_name(netdev_), ovs_strerror(error));
/* Success exit: release the mutex. */
859 ovs_mutex_unlock(&netdev->mutex);
/* Error exit: also releases the mutex.  The cleanup between the two unlocks
 * is missing from this listing; presumably the fd is closed there --
 * confirm against upstream. */
867 ovs_mutex_unlock(&netdev->mutex);
/* 'rx_destruct' callback.  The fd close (skipped for shared tap fds) falls
 * on lines missing from this listing. */
872 netdev_linux_rx_destruct(struct netdev_rx *rx_)
874 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
/* 'rx_dealloc' callback: frees the receiver container. */
882 netdev_linux_rx_dealloc(struct netdev_rx *rx_)
884 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
/* Returns the TPID (network byte order) to use when reinserting the VLAN tag
 * described by 'aux': the kernel-reported tp_vlan_tpid when marked valid,
 * otherwise the default 802.1Q Ethertype. */
890 auxdata_to_vlan_tpid(const struct tpacket_auxdata *aux)
892 if (aux->tp_status & TP_STATUS_VLAN_TPID_VALID) {
893 return htons(aux->tp_vlan_tpid);
895 return htons(ETH_TYPE_VLAN);
/* Returns true if 'aux' carries a VLAN TCI: either a nonzero TCI (covers
 * pre-3.0 kernels that lack TP_STATUS_VLAN_VALID) or the explicit validity
 * flag. */
900 auxdata_has_vlan_tci(const struct tpacket_auxdata *aux)
902 return aux->tp_vlan_tci || aux->tp_status & TP_STATUS_VLAN_VALID;
/* Receives one packet from AF_PACKET socket 'fd' into 'buffer', reinserting
 * any VLAN tag the kernel stripped (delivered via a PACKET_AUXDATA control
 * message). */
906 netdev_linux_rx_recv_sock(int fd, struct ofpbuf *buffer)
911 struct cmsghdr *cmsg;
914 char buffer[CMSG_SPACE(sizeof(struct tpacket_auxdata))];
918 /* Reserve headroom for a single VLAN tag */
919 ofpbuf_reserve(buffer, VLAN_HEADER_LEN);
920 size = ofpbuf_tailroom(buffer);
922 iov.iov_base = buffer->data;
924 msgh.msg_name = NULL;
925 msgh.msg_namelen = 0;
928 msgh.msg_control = &cmsg_buffer;
929 msgh.msg_controllen = sizeof cmsg_buffer;
/* Retry on EINTR.  MSG_TRUNC makes 'retval' the full on-wire length even if
 * the packet was cut short, enabling the oversize check below. */
933 retval = recvmsg(fd, &msgh, MSG_TRUNC);
934 } while (retval < 0 && errno == EINTR);
938 } else if (retval > size) {
942 buffer->size += retval;
/* Walk the control messages looking for the kernel's tpacket_auxdata. */
944 for (cmsg = CMSG_FIRSTHDR(&msgh); cmsg; cmsg = CMSG_NXTHDR(&msgh, cmsg)) {
945 const struct tpacket_auxdata *aux;
947 if (cmsg->cmsg_level != SOL_PACKET
948 || cmsg->cmsg_type != PACKET_AUXDATA
949 || cmsg->cmsg_len < CMSG_LEN(sizeof(struct tpacket_auxdata))) {
953 aux = ALIGNED_CAST(struct tpacket_auxdata *, CMSG_DATA(cmsg));
954 if (auxdata_has_vlan_tci(aux)) {
955 if (retval < ETH_HEADER_LEN) {
/* Push the 802.1Q header back into the headroom reserved above. */
959 eth_push_vlan(buffer, auxdata_to_vlan_tpid(aux),
960 htons(aux->tp_vlan_tci));
/* Receives one frame from tap fd 'fd' into 'buffer' with a plain read()
 * (IFF_NO_PI means no prefix header), retrying on EINTR. */
969 netdev_linux_rx_recv_tap(int fd, struct ofpbuf *buffer)
972 size_t size = ofpbuf_tailroom(buffer);
975 retval = read(fd, buffer->data, size);
976 } while (retval < 0 && errno == EINTR);
980 } else if (retval > size) {
984 buffer->size += retval;
/* 'rx_recv' callback: allocates a buffer sized for the device MTU plus a
 * VLAN-tagged Ethernet header and reads one packet from the tap fd or the
 * raw socket, as appropriate. */
989 netdev_linux_rx_recv(struct netdev_rx *rx_, struct ofpbuf **packet, int *c)
991 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
992 struct netdev *netdev = rx->up.netdev;
993 struct ofpbuf *buffer;
/* If the MTU is unknown, fall back to the standard Ethernet payload max. */
997 if (netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu)) {
998 mtu = ETH_PAYLOAD_MAX;
1001 buffer = ofpbuf_new_with_headroom(VLAN_ETH_HEADER_LEN + mtu, DP_NETDEV_HEADROOM);
1003 retval = (rx->is_tap
1004 ? netdev_linux_rx_recv_tap(rx->fd, buffer)
1005 : netdev_linux_rx_recv_sock(rx->fd, buffer));
1008 if (retval != EAGAIN && retval != EMSGSIZE) {
/* NOTE(review): the arguments below appear swapped relative to the format
 * string -- "on %s: %s" comes out as "on <error>: <name>".  The device
 * name should be first and the error string second.  Also 'errno' may be
 * stale at this point while 'retval' holds the error code -- confirm and
 * fix upstream. */
1009 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
1010 ovs_strerror(errno), netdev_rx_get_name(rx_));
/* Error path: the caller never sees the buffer, so free it here. */
1012 ofpbuf_delete(buffer);
/* Success: pad runt frames to the Ethernet minimum before handing off. */
1014 dp_packet_pad(buffer);
/* 'rx_wait' callback: wakes poll_block() when the rx fd becomes readable. */
1023 netdev_linux_rx_wait(struct netdev_rx *rx_)
1025 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
1026 poll_fd_wait(rx->fd, POLLIN);
/* 'rx_drain' callback: discards queued packets.  For tap devices, reads and
 * drops up to the device's tx queue length worth of packets; for raw
 * sockets, drains the socket receive buffer. */
1030 netdev_linux_rx_drain(struct netdev_rx *rx_)
1032 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
1035 int error = af_inet_ifreq_ioctl(netdev_rx_get_name(rx_), &ifr,
1036 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
1040 drain_fd(rx->fd, ifr.ifr_qlen);
1043 return drain_rcvbuf(rx->fd);
1047 /* Sends 'buffer' on 'netdev'. Returns 0 if successful, otherwise a positive
1048 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
1049 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
1050 * the packet is too big or too small to transmit on the device.
1052 * The caller retains ownership of 'buffer' in all cases.
1054 * The kernel maintains a packet transmission queue, so the caller is not
1055 * expected to do additional queuing of packets. */
1057 netdev_linux_send(struct netdev *netdev_, const void *data, size_t size)
1062 if (!is_tap_netdev(netdev_)) {
1063 /* Use our AF_PACKET socket to send to this device. */
1064 struct sockaddr_ll sll;
1070 sock = af_packet_sock();
1075 ifindex = netdev_get_ifindex(netdev_);
1080 /* We don't bother setting most fields in sockaddr_ll because the
1081 * kernel ignores them for SOCK_RAW. */
1082 memset(&sll, 0, sizeof sll);
1083 sll.sll_family = AF_PACKET;
1084 sll.sll_ifindex = ifindex;
1086 iov.iov_base = CONST_CAST(void *, data);
1089 msg.msg_name = &sll;
1090 msg.msg_namelen = sizeof sll;
1093 msg.msg_control = NULL;
1094 msg.msg_controllen = 0;
1097 retval = sendmsg(sock, &msg, 0);
1099 /* Use the tap fd to send to this device. This is essential for
1100 * tap devices, because packets sent to a tap device with an
1101 * AF_PACKET socket will loop back to be *received* again on the
1102 * tap device. This doesn't occur on other interface types
1103 * because we attach a socket filter to the rx socket. */
1104 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1106 retval = write(netdev->tap_fd, data, size);
1110 /* The Linux AF_PACKET implementation never blocks waiting for room
1111 * for packets, instead returning ENOBUFS. Translate this into
1112 * EAGAIN for the caller. */
1113 if (errno == ENOBUFS) {
1115 } else if (errno == EINTR) {
1117 } else if (errno != EAGAIN) {
1118 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
1119 netdev_get_name(netdev_), ovs_strerror(errno));
/* NOTE(review): 'retval' here is a signed byte count, yet the format below
 * uses the unsigned conversion "%"PRIuSIZE followed by a stray literal 'd'
 * (expanding to e.g. "%zud").  A signed conversion (e.g. %zd or an
 * OVS-equivalent) would be correct -- confirm against upstream.  The
 * comparison 'retval != size' also mixes signed and unsigned. */
1122 } else if (retval != size) {
1123 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%"PRIuSIZE"d bytes of "
1124 "%"PRIuSIZE") on %s", retval, size, netdev_get_name(netdev_));
1132 /* Registers with the poll loop to wake up from the next call to poll_block()
1133 * when the packet transmission queue has sufficient room to transmit a packet
1134 * with netdev_send().
1136 * The kernel maintains a packet transmission queue, so the client is not
1137 * expected to do additional queuing of packets. Thus, this function is
1138 * unlikely to ever be used. It is included for completeness. */
1140 netdev_linux_send_wait(struct netdev *netdev)
1142 if (is_tap_netdev(netdev)) {
1143 /* TAP device always accepts packets.*/
1144 poll_immediate_wake();
1148 /* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
1149 * otherwise a positive errno value. */
1151 netdev_linux_set_etheraddr(struct netdev *netdev_,
1152 const uint8_t mac[ETH_ADDR_LEN])
1154 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1155 enum netdev_flags old_flags = 0;
1158 ovs_mutex_lock(&netdev->mutex);
/* Fast path: with a valid cache entry, a cached error or an already-matching
 * address means no ioctl is needed; otherwise invalidate and fall through. */
1160 if (netdev->cache_valid & VALID_ETHERADDR) {
1161 error = netdev->ether_addr_error;
1162 if (error || eth_addr_equals(netdev->etheraddr, mac)) {
1165 netdev->cache_valid &= ~VALID_ETHERADDR;
1168 /* Tap devices must be brought down before setting the address. */
1169 if (is_tap_netdev(netdev_)) {
1170 update_flags(netdev, NETDEV_UP, 0, &old_flags);
1172 error = set_etheraddr(netdev_get_name(netdev_), mac);
/* Cache the outcome even on ENODEV so repeated calls against a vanished
 * device do not keep issuing ioctls. */
1173 if (!error || error == ENODEV) {
1174 netdev->ether_addr_error = error;
1175 netdev->cache_valid |= VALID_ETHERADDR;
1177 memcpy(netdev->etheraddr, mac, ETH_ADDR_LEN);
/* Bring the tap interface back up if we downed it above. */
1181 if (is_tap_netdev(netdev_) && old_flags & NETDEV_UP) {
1182 update_flags(netdev, 0, NETDEV_UP, &old_flags);
1186 ovs_mutex_unlock(&netdev->mutex);
1190 /* Copies 'netdev''s MAC address to 'mac' which is passed as param. */
1192 netdev_linux_get_etheraddr(const struct netdev *netdev_,
1193 uint8_t mac[ETH_ADDR_LEN])
1195 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1198 ovs_mutex_lock(&netdev->mutex);
/* Populate the cache on first use; both the address and any error code from
 * the lookup are cached. */
1199 if (!(netdev->cache_valid & VALID_ETHERADDR)) {
1200 netdev->ether_addr_error = get_etheraddr(netdev_get_name(netdev_),
1202 netdev->cache_valid |= VALID_ETHERADDR;
1205 error = netdev->ether_addr_error;
1207 memcpy(mac, netdev->etheraddr, ETH_ADDR_LEN);
1209 ovs_mutex_unlock(&netdev->mutex);
1215 netdev_linux_get_mtu__(struct netdev_linux *netdev, int *mtup)
1219 if (!(netdev->cache_valid & VALID_MTU)) {
1222 netdev->netdev_mtu_error = af_inet_ifreq_ioctl(
1223 netdev_get_name(&netdev->up), &ifr, SIOCGIFMTU, "SIOCGIFMTU");
1224 netdev->mtu = ifr.ifr_mtu;
1225 netdev->cache_valid |= VALID_MTU;
1228 error = netdev->netdev_mtu_error;
1230 *mtup = netdev->mtu;
1236 /* Returns the maximum size of transmitted (and received) packets on 'netdev',
1237 * in bytes, not including the hardware header; thus, this is typically 1500
1238 * bytes for Ethernet devices. */
1240 netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
1242 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1245 ovs_mutex_lock(&netdev->mutex);
1246 error = netdev_linux_get_mtu__(netdev, mtup);
1247 ovs_mutex_unlock(&netdev->mutex);
1252 /* Sets the maximum size of transmitted (MTU) for given device using linux
1253 * networking ioctl interface.
1256 netdev_linux_set_mtu(const struct netdev *netdev_, int mtu)
1258 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1262 ovs_mutex_lock(&netdev->mutex);
1263 if (netdev->cache_valid & VALID_MTU) {
1264 error = netdev->netdev_mtu_error;
1265 if (error || netdev->mtu == mtu) {
1268 netdev->cache_valid &= ~VALID_MTU;
1271 error = af_inet_ifreq_ioctl(netdev_get_name(netdev_), &ifr,
1272 SIOCSIFMTU, "SIOCSIFMTU");
1273 if (!error || error == ENODEV) {
1274 netdev->netdev_mtu_error = error;
1275 netdev->mtu = ifr.ifr_mtu;
1276 netdev->cache_valid |= VALID_MTU;
1279 ovs_mutex_unlock(&netdev->mutex);
1283 /* Returns the ifindex of 'netdev', if successful, as a positive number.
1284 * On failure, returns a negative errno value. */
1286 netdev_linux_get_ifindex(const struct netdev *netdev_)
1288 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1291 ovs_mutex_lock(&netdev->mutex);
1292 error = get_ifindex(netdev_, &ifindex);
1293 ovs_mutex_unlock(&netdev->mutex);
1295 return error ? -error : ifindex;
1299 netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
1301 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1303 ovs_mutex_lock(&netdev->mutex);
1304 if (netdev->miimon_interval > 0) {
1305 *carrier = netdev->miimon;
1307 *carrier = (netdev->ifi_flags & IFF_RUNNING) != 0;
1309 ovs_mutex_unlock(&netdev->mutex);
1314 static long long int
1315 netdev_linux_get_carrier_resets(const struct netdev *netdev_)
1317 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1318 long long int carrier_resets;
1320 ovs_mutex_lock(&netdev->mutex);
1321 carrier_resets = netdev->carrier_resets;
1322 ovs_mutex_unlock(&netdev->mutex);
1324 return carrier_resets;
1328 netdev_linux_do_miimon(const char *name, int cmd, const char *cmd_name,
1329 struct mii_ioctl_data *data)
1334 memset(&ifr, 0, sizeof ifr);
1335 memcpy(&ifr.ifr_data, data, sizeof *data);
1336 error = af_inet_ifreq_ioctl(name, &ifr, cmd, cmd_name);
1337 memcpy(data, &ifr.ifr_data, sizeof *data);
1343 netdev_linux_get_miimon(const char *name, bool *miimon)
1345 struct mii_ioctl_data data;
1350 memset(&data, 0, sizeof data);
1351 error = netdev_linux_do_miimon(name, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
1353 /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
1354 data.reg_num = MII_BMSR;
1355 error = netdev_linux_do_miimon(name, SIOCGMIIREG, "SIOCGMIIREG",
1359 *miimon = !!(data.val_out & BMSR_LSTATUS);
1361 VLOG_WARN_RL(&rl, "%s: failed to query MII", name);
1364 struct ethtool_cmd ecmd;
1366 VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
1369 COVERAGE_INC(netdev_get_ethtool);
1370 memset(&ecmd, 0, sizeof ecmd);
1371 error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
1374 struct ethtool_value eval;
1376 memcpy(&eval, &ecmd, sizeof eval);
1377 *miimon = !!eval.data;
1379 VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
1387 netdev_linux_set_miimon_interval(struct netdev *netdev_,
1388 long long int interval)
1390 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1392 ovs_mutex_lock(&netdev->mutex);
1393 interval = interval > 0 ? MAX(interval, 100) : 0;
1394 if (netdev->miimon_interval != interval) {
1397 if (interval && !netdev->miimon_interval) {
1398 atomic_add(&miimon_cnt, 1, &junk);
1399 } else if (!interval && netdev->miimon_interval) {
1400 atomic_sub(&miimon_cnt, 1, &junk);
1403 netdev->miimon_interval = interval;
1404 timer_set_expired(&netdev->miimon_timer);
1406 ovs_mutex_unlock(&netdev->mutex);
1412 netdev_linux_miimon_run(void)
1414 struct shash device_shash;
1415 struct shash_node *node;
1417 shash_init(&device_shash);
1418 netdev_get_devices(&netdev_linux_class, &device_shash);
1419 SHASH_FOR_EACH (node, &device_shash) {
1420 struct netdev *netdev = node->data;
1421 struct netdev_linux *dev = netdev_linux_cast(netdev);
1424 ovs_mutex_lock(&dev->mutex);
1425 if (dev->miimon_interval > 0 && timer_expired(&dev->miimon_timer)) {
1426 netdev_linux_get_miimon(dev->up.name, &miimon);
1427 if (miimon != dev->miimon) {
1428 dev->miimon = miimon;
1429 netdev_linux_changed(dev, dev->ifi_flags, 0);
1432 timer_set_duration(&dev->miimon_timer, dev->miimon_interval);
1434 ovs_mutex_unlock(&dev->mutex);
1435 netdev_close(netdev);
1438 shash_destroy(&device_shash);
1442 netdev_linux_miimon_wait(void)
1444 struct shash device_shash;
1445 struct shash_node *node;
1447 shash_init(&device_shash);
1448 netdev_get_devices(&netdev_linux_class, &device_shash);
1449 SHASH_FOR_EACH (node, &device_shash) {
1450 struct netdev *netdev = node->data;
1451 struct netdev_linux *dev = netdev_linux_cast(netdev);
1453 ovs_mutex_lock(&dev->mutex);
1454 if (dev->miimon_interval > 0) {
1455 timer_wait(&dev->miimon_timer);
1457 ovs_mutex_unlock(&dev->mutex);
1458 netdev_close(netdev);
1460 shash_destroy(&device_shash);
1464 swap_uint64(uint64_t *a, uint64_t *b)
1471 /* Copies 'src' into 'dst', performing format conversion in the process.
1473 * 'src' is allowed to be misaligned. */
1475 netdev_stats_from_ovs_vport_stats(struct netdev_stats *dst,
1476 const struct ovs_vport_stats *src)
1478 dst->rx_packets = get_unaligned_u64(&src->rx_packets);
1479 dst->tx_packets = get_unaligned_u64(&src->tx_packets);
1480 dst->rx_bytes = get_unaligned_u64(&src->rx_bytes);
1481 dst->tx_bytes = get_unaligned_u64(&src->tx_bytes);
1482 dst->rx_errors = get_unaligned_u64(&src->rx_errors);
1483 dst->tx_errors = get_unaligned_u64(&src->tx_errors);
1484 dst->rx_dropped = get_unaligned_u64(&src->rx_dropped);
1485 dst->tx_dropped = get_unaligned_u64(&src->tx_dropped);
1487 dst->collisions = 0;
1488 dst->rx_length_errors = 0;
1489 dst->rx_over_errors = 0;
1490 dst->rx_crc_errors = 0;
1491 dst->rx_frame_errors = 0;
1492 dst->rx_fifo_errors = 0;
1493 dst->rx_missed_errors = 0;
1494 dst->tx_aborted_errors = 0;
1495 dst->tx_carrier_errors = 0;
1496 dst->tx_fifo_errors = 0;
1497 dst->tx_heartbeat_errors = 0;
1498 dst->tx_window_errors = 0;
1502 get_stats_via_vport__(const struct netdev *netdev, struct netdev_stats *stats)
1504 struct dpif_linux_vport reply;
1508 error = dpif_linux_vport_get(netdev_get_name(netdev), &reply, &buf);
1511 } else if (!reply.stats) {
1516 netdev_stats_from_ovs_vport_stats(stats, reply.stats);
1524 get_stats_via_vport(const struct netdev *netdev_,
1525 struct netdev_stats *stats)
1527 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1529 if (!netdev->vport_stats_error ||
1530 !(netdev->cache_valid & VALID_VPORT_STAT_ERROR)) {
1533 error = get_stats_via_vport__(netdev_, stats);
1534 if (error && error != ENOENT) {
1535 VLOG_WARN_RL(&rl, "%s: obtaining netdev stats via vport failed "
1537 netdev_get_name(netdev_), ovs_strerror(error));
1539 netdev->vport_stats_error = error;
1540 netdev->cache_valid |= VALID_VPORT_STAT_ERROR;
1544 /* Retrieves current device stats for 'netdev-linux'. */
1546 netdev_linux_get_stats(const struct netdev *netdev_,
1547 struct netdev_stats *stats)
1549 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1550 struct netdev_stats dev_stats;
1553 ovs_mutex_lock(&netdev->mutex);
1554 get_stats_via_vport(netdev_, stats);
1555 error = get_stats_via_netlink(netdev_, &dev_stats);
1557 if (!netdev->vport_stats_error) {
1560 } else if (netdev->vport_stats_error) {
1561 /* stats not available from OVS then use ioctl stats. */
1564 stats->rx_errors += dev_stats.rx_errors;
1565 stats->tx_errors += dev_stats.tx_errors;
1566 stats->rx_dropped += dev_stats.rx_dropped;
1567 stats->tx_dropped += dev_stats.tx_dropped;
1568 stats->multicast += dev_stats.multicast;
1569 stats->collisions += dev_stats.collisions;
1570 stats->rx_length_errors += dev_stats.rx_length_errors;
1571 stats->rx_over_errors += dev_stats.rx_over_errors;
1572 stats->rx_crc_errors += dev_stats.rx_crc_errors;
1573 stats->rx_frame_errors += dev_stats.rx_frame_errors;
1574 stats->rx_fifo_errors += dev_stats.rx_fifo_errors;
1575 stats->rx_missed_errors += dev_stats.rx_missed_errors;
1576 stats->tx_aborted_errors += dev_stats.tx_aborted_errors;
1577 stats->tx_carrier_errors += dev_stats.tx_carrier_errors;
1578 stats->tx_fifo_errors += dev_stats.tx_fifo_errors;
1579 stats->tx_heartbeat_errors += dev_stats.tx_heartbeat_errors;
1580 stats->tx_window_errors += dev_stats.tx_window_errors;
1582 ovs_mutex_unlock(&netdev->mutex);
1587 /* Retrieves current device stats for 'netdev-tap' netdev or
1588 * netdev-internal. */
1590 netdev_tap_get_stats(const struct netdev *netdev_, struct netdev_stats *stats)
1592 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1593 struct netdev_stats dev_stats;
1596 ovs_mutex_lock(&netdev->mutex);
1597 get_stats_via_vport(netdev_, stats);
1598 error = get_stats_via_netlink(netdev_, &dev_stats);
1600 if (!netdev->vport_stats_error) {
1603 } else if (netdev->vport_stats_error) {
1604 /* Transmit and receive stats will appear to be swapped relative to the
1605 * other ports since we are the one sending the data, not a remote
1606 * computer. For consistency, we swap them back here. This does not
1607 * apply if we are getting stats from the vport layer because it always
1608 * tracks stats from the perspective of the switch. */
1611 swap_uint64(&stats->rx_packets, &stats->tx_packets);
1612 swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
1613 swap_uint64(&stats->rx_errors, &stats->tx_errors);
1614 swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
1615 stats->rx_length_errors = 0;
1616 stats->rx_over_errors = 0;
1617 stats->rx_crc_errors = 0;
1618 stats->rx_frame_errors = 0;
1619 stats->rx_fifo_errors = 0;
1620 stats->rx_missed_errors = 0;
1621 stats->tx_aborted_errors = 0;
1622 stats->tx_carrier_errors = 0;
1623 stats->tx_fifo_errors = 0;
1624 stats->tx_heartbeat_errors = 0;
1625 stats->tx_window_errors = 0;
1627 stats->rx_dropped += dev_stats.tx_dropped;
1628 stats->tx_dropped += dev_stats.rx_dropped;
1630 stats->rx_errors += dev_stats.tx_errors;
1631 stats->tx_errors += dev_stats.rx_errors;
1633 stats->multicast += dev_stats.multicast;
1634 stats->collisions += dev_stats.collisions;
1636 ovs_mutex_unlock(&netdev->mutex);
1642 netdev_internal_get_stats(const struct netdev *netdev_,
1643 struct netdev_stats *stats)
1645 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1648 ovs_mutex_lock(&netdev->mutex);
1649 get_stats_via_vport(netdev_, stats);
1650 error = netdev->vport_stats_error;
1651 ovs_mutex_unlock(&netdev->mutex);
1657 netdev_internal_set_stats(struct netdev *netdev,
1658 const struct netdev_stats *stats)
1660 struct ovs_vport_stats vport_stats;
1661 struct dpif_linux_vport vport;
1664 vport_stats.rx_packets = stats->rx_packets;
1665 vport_stats.tx_packets = stats->tx_packets;
1666 vport_stats.rx_bytes = stats->rx_bytes;
1667 vport_stats.tx_bytes = stats->tx_bytes;
1668 vport_stats.rx_errors = stats->rx_errors;
1669 vport_stats.tx_errors = stats->tx_errors;
1670 vport_stats.rx_dropped = stats->rx_dropped;
1671 vport_stats.tx_dropped = stats->tx_dropped;
1673 dpif_linux_vport_init(&vport);
1674 vport.cmd = OVS_VPORT_CMD_SET;
1675 vport.name = netdev_get_name(netdev);
1676 vport.stats = &vport_stats;
1678 err = dpif_linux_vport_transact(&vport, NULL, NULL);
1680 /* If the vport layer doesn't know about the device, that doesn't mean it
1681 * doesn't exist (after all were able to open it when netdev_open() was
1682 * called), it just means that it isn't attached and we'll be getting
1683 * stats a different way. */
1684 if (err == ENODEV) {
1692 netdev_linux_read_features(struct netdev_linux *netdev)
1694 struct ethtool_cmd ecmd;
1698 if (netdev->cache_valid & VALID_FEATURES) {
1702 COVERAGE_INC(netdev_get_ethtool);
1703 memset(&ecmd, 0, sizeof ecmd);
1704 error = netdev_linux_do_ethtool(netdev->up.name, &ecmd,
1705 ETHTOOL_GSET, "ETHTOOL_GSET");
1710 /* Supported features. */
1711 netdev->supported = 0;
1712 if (ecmd.supported & SUPPORTED_10baseT_Half) {
1713 netdev->supported |= NETDEV_F_10MB_HD;
1715 if (ecmd.supported & SUPPORTED_10baseT_Full) {
1716 netdev->supported |= NETDEV_F_10MB_FD;
1718 if (ecmd.supported & SUPPORTED_100baseT_Half) {
1719 netdev->supported |= NETDEV_F_100MB_HD;
1721 if (ecmd.supported & SUPPORTED_100baseT_Full) {
1722 netdev->supported |= NETDEV_F_100MB_FD;
1724 if (ecmd.supported & SUPPORTED_1000baseT_Half) {
1725 netdev->supported |= NETDEV_F_1GB_HD;
1727 if (ecmd.supported & SUPPORTED_1000baseT_Full) {
1728 netdev->supported |= NETDEV_F_1GB_FD;
1730 if (ecmd.supported & SUPPORTED_10000baseT_Full) {
1731 netdev->supported |= NETDEV_F_10GB_FD;
1733 if (ecmd.supported & SUPPORTED_TP) {
1734 netdev->supported |= NETDEV_F_COPPER;
1736 if (ecmd.supported & SUPPORTED_FIBRE) {
1737 netdev->supported |= NETDEV_F_FIBER;
1739 if (ecmd.supported & SUPPORTED_Autoneg) {
1740 netdev->supported |= NETDEV_F_AUTONEG;
1742 if (ecmd.supported & SUPPORTED_Pause) {
1743 netdev->supported |= NETDEV_F_PAUSE;
1745 if (ecmd.supported & SUPPORTED_Asym_Pause) {
1746 netdev->supported |= NETDEV_F_PAUSE_ASYM;
1749 /* Advertised features. */
1750 netdev->advertised = 0;
1751 if (ecmd.advertising & ADVERTISED_10baseT_Half) {
1752 netdev->advertised |= NETDEV_F_10MB_HD;
1754 if (ecmd.advertising & ADVERTISED_10baseT_Full) {
1755 netdev->advertised |= NETDEV_F_10MB_FD;
1757 if (ecmd.advertising & ADVERTISED_100baseT_Half) {
1758 netdev->advertised |= NETDEV_F_100MB_HD;
1760 if (ecmd.advertising & ADVERTISED_100baseT_Full) {
1761 netdev->advertised |= NETDEV_F_100MB_FD;
1763 if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
1764 netdev->advertised |= NETDEV_F_1GB_HD;
1766 if (ecmd.advertising & ADVERTISED_1000baseT_Full) {
1767 netdev->advertised |= NETDEV_F_1GB_FD;
1769 if (ecmd.advertising & ADVERTISED_10000baseT_Full) {
1770 netdev->advertised |= NETDEV_F_10GB_FD;
1772 if (ecmd.advertising & ADVERTISED_TP) {
1773 netdev->advertised |= NETDEV_F_COPPER;
1775 if (ecmd.advertising & ADVERTISED_FIBRE) {
1776 netdev->advertised |= NETDEV_F_FIBER;
1778 if (ecmd.advertising & ADVERTISED_Autoneg) {
1779 netdev->advertised |= NETDEV_F_AUTONEG;
1781 if (ecmd.advertising & ADVERTISED_Pause) {
1782 netdev->advertised |= NETDEV_F_PAUSE;
1784 if (ecmd.advertising & ADVERTISED_Asym_Pause) {
1785 netdev->advertised |= NETDEV_F_PAUSE_ASYM;
1788 /* Current settings. */
1790 if (speed == SPEED_10) {
1791 netdev->current = ecmd.duplex ? NETDEV_F_10MB_FD : NETDEV_F_10MB_HD;
1792 } else if (speed == SPEED_100) {
1793 netdev->current = ecmd.duplex ? NETDEV_F_100MB_FD : NETDEV_F_100MB_HD;
1794 } else if (speed == SPEED_1000) {
1795 netdev->current = ecmd.duplex ? NETDEV_F_1GB_FD : NETDEV_F_1GB_HD;
1796 } else if (speed == SPEED_10000) {
1797 netdev->current = NETDEV_F_10GB_FD;
1798 } else if (speed == 40000) {
1799 netdev->current = NETDEV_F_40GB_FD;
1800 } else if (speed == 100000) {
1801 netdev->current = NETDEV_F_100GB_FD;
1802 } else if (speed == 1000000) {
1803 netdev->current = NETDEV_F_1TB_FD;
1805 netdev->current = 0;
1808 if (ecmd.port == PORT_TP) {
1809 netdev->current |= NETDEV_F_COPPER;
1810 } else if (ecmd.port == PORT_FIBRE) {
1811 netdev->current |= NETDEV_F_FIBER;
1815 netdev->current |= NETDEV_F_AUTONEG;
1819 netdev->cache_valid |= VALID_FEATURES;
1820 netdev->get_features_error = error;
1823 /* Stores the features supported by 'netdev' into of '*current', '*advertised',
1824 * '*supported', and '*peer'. Each value is a bitmap of NETDEV_* bits.
1825 * Returns 0 if successful, otherwise a positive errno value. */
1827 netdev_linux_get_features(const struct netdev *netdev_,
1828 enum netdev_features *current,
1829 enum netdev_features *advertised,
1830 enum netdev_features *supported,
1831 enum netdev_features *peer)
1833 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1836 ovs_mutex_lock(&netdev->mutex);
1837 netdev_linux_read_features(netdev);
1838 if (!netdev->get_features_error) {
1839 *current = netdev->current;
1840 *advertised = netdev->advertised;
1841 *supported = netdev->supported;
1842 *peer = 0; /* XXX */
1844 error = netdev->get_features_error;
1845 ovs_mutex_unlock(&netdev->mutex);
1850 /* Set the features advertised by 'netdev' to 'advertise'. */
1852 netdev_linux_set_advertisements(struct netdev *netdev_,
1853 enum netdev_features advertise)
1855 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1856 struct ethtool_cmd ecmd;
1859 ovs_mutex_lock(&netdev->mutex);
1861 COVERAGE_INC(netdev_get_ethtool);
1862 memset(&ecmd, 0, sizeof ecmd);
1863 error = netdev_linux_do_ethtool(netdev_get_name(netdev_), &ecmd,
1864 ETHTOOL_GSET, "ETHTOOL_GSET");
1869 ecmd.advertising = 0;
1870 if (advertise & NETDEV_F_10MB_HD) {
1871 ecmd.advertising |= ADVERTISED_10baseT_Half;
1873 if (advertise & NETDEV_F_10MB_FD) {
1874 ecmd.advertising |= ADVERTISED_10baseT_Full;
1876 if (advertise & NETDEV_F_100MB_HD) {
1877 ecmd.advertising |= ADVERTISED_100baseT_Half;
1879 if (advertise & NETDEV_F_100MB_FD) {
1880 ecmd.advertising |= ADVERTISED_100baseT_Full;
1882 if (advertise & NETDEV_F_1GB_HD) {
1883 ecmd.advertising |= ADVERTISED_1000baseT_Half;
1885 if (advertise & NETDEV_F_1GB_FD) {
1886 ecmd.advertising |= ADVERTISED_1000baseT_Full;
1888 if (advertise & NETDEV_F_10GB_FD) {
1889 ecmd.advertising |= ADVERTISED_10000baseT_Full;
1891 if (advertise & NETDEV_F_COPPER) {
1892 ecmd.advertising |= ADVERTISED_TP;
1894 if (advertise & NETDEV_F_FIBER) {
1895 ecmd.advertising |= ADVERTISED_FIBRE;
1897 if (advertise & NETDEV_F_AUTONEG) {
1898 ecmd.advertising |= ADVERTISED_Autoneg;
1900 if (advertise & NETDEV_F_PAUSE) {
1901 ecmd.advertising |= ADVERTISED_Pause;
1903 if (advertise & NETDEV_F_PAUSE_ASYM) {
1904 ecmd.advertising |= ADVERTISED_Asym_Pause;
1906 COVERAGE_INC(netdev_set_ethtool);
1907 error = netdev_linux_do_ethtool(netdev_get_name(netdev_), &ecmd,
1908 ETHTOOL_SSET, "ETHTOOL_SSET");
1911 ovs_mutex_unlock(&netdev->mutex);
1915 /* Attempts to set input rate limiting (policing) policy. Returns 0 if
1916 * successful, otherwise a positive errno value. */
1918 netdev_linux_set_policing(struct netdev *netdev_,
1919 uint32_t kbits_rate, uint32_t kbits_burst)
1921 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1922 const char *netdev_name = netdev_get_name(netdev_);
1925 kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
1926 : !kbits_burst ? 1000 /* Default to 1000 kbits if 0. */
1927 : kbits_burst); /* Stick with user-specified value. */
1929 ovs_mutex_lock(&netdev->mutex);
1930 if (netdev->cache_valid & VALID_POLICING) {
1931 error = netdev->netdev_policing_error;
1932 if (error || (netdev->kbits_rate == kbits_rate &&
1933 netdev->kbits_burst == kbits_burst)) {
1934 /* Assume that settings haven't changed since we last set them. */
1937 netdev->cache_valid &= ~VALID_POLICING;
1940 COVERAGE_INC(netdev_set_policing);
1941 /* Remove any existing ingress qdisc. */
1942 error = tc_add_del_ingress_qdisc(netdev_, false);
1944 VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
1945 netdev_name, ovs_strerror(error));
1950 error = tc_add_del_ingress_qdisc(netdev_, true);
1952 VLOG_WARN_RL(&rl, "%s: adding policing qdisc failed: %s",
1953 netdev_name, ovs_strerror(error));
1957 error = tc_add_policer(netdev_, kbits_rate, kbits_burst);
1959 VLOG_WARN_RL(&rl, "%s: adding policing action failed: %s",
1960 netdev_name, ovs_strerror(error));
1965 netdev->kbits_rate = kbits_rate;
1966 netdev->kbits_burst = kbits_burst;
1969 if (!error || error == ENODEV) {
1970 netdev->netdev_policing_error = error;
1971 netdev->cache_valid |= VALID_POLICING;
1973 ovs_mutex_unlock(&netdev->mutex);
1978 netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
1981 const struct tc_ops *const *opsp;
1983 for (opsp = tcs; *opsp != NULL; opsp++) {
1984 const struct tc_ops *ops = *opsp;
1985 if (ops->tc_install && ops->ovs_name[0] != '\0') {
1986 sset_add(types, ops->ovs_name);
1992 static const struct tc_ops *
1993 tc_lookup_ovs_name(const char *name)
1995 const struct tc_ops *const *opsp;
1997 for (opsp = tcs; *opsp != NULL; opsp++) {
1998 const struct tc_ops *ops = *opsp;
1999 if (!strcmp(name, ops->ovs_name)) {
2006 static const struct tc_ops *
2007 tc_lookup_linux_name(const char *name)
2009 const struct tc_ops *const *opsp;
2011 for (opsp = tcs; *opsp != NULL; opsp++) {
2012 const struct tc_ops *ops = *opsp;
2013 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
2020 static struct tc_queue *
2021 tc_find_queue__(const struct netdev *netdev_, unsigned int queue_id,
2024 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2025 struct tc_queue *queue;
2027 HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev->tc->queues) {
2028 if (queue->queue_id == queue_id) {
/* Convenience wrapper for tc_find_queue__() that computes the hash. */
static struct tc_queue *
tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
{
    return tc_find_queue__(netdev, queue_id, hash_int(queue_id, 0));
}
2042 netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
2044 struct netdev_qos_capabilities *caps)
2046 const struct tc_ops *ops = tc_lookup_ovs_name(type);
2050 caps->n_queues = ops->n_queues;
2055 netdev_linux_get_qos(const struct netdev *netdev_,
2056 const char **typep, struct smap *details)
2058 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2061 ovs_mutex_lock(&netdev->mutex);
2062 error = tc_query_qdisc(netdev_);
2064 *typep = netdev->tc->ops->ovs_name;
2065 error = (netdev->tc->ops->qdisc_get
2066 ? netdev->tc->ops->qdisc_get(netdev_, details)
2069 ovs_mutex_unlock(&netdev->mutex);
2075 netdev_linux_set_qos(struct netdev *netdev_,
2076 const char *type, const struct smap *details)
2078 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2079 const struct tc_ops *new_ops;
2082 new_ops = tc_lookup_ovs_name(type);
2083 if (!new_ops || !new_ops->tc_install) {
2087 ovs_mutex_lock(&netdev->mutex);
2088 error = tc_query_qdisc(netdev_);
2093 if (new_ops == netdev->tc->ops) {
2094 error = new_ops->qdisc_set ? new_ops->qdisc_set(netdev_, details) : 0;
2096 /* Delete existing qdisc. */
2097 error = tc_del_qdisc(netdev_);
2101 ovs_assert(netdev->tc == NULL);
2103 /* Install new qdisc. */
2104 error = new_ops->tc_install(netdev_, details);
2105 ovs_assert((error == 0) == (netdev->tc != NULL));
2109 ovs_mutex_unlock(&netdev->mutex);
2114 netdev_linux_get_queue(const struct netdev *netdev_,
2115 unsigned int queue_id, struct smap *details)
2117 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2120 ovs_mutex_lock(&netdev->mutex);
2121 error = tc_query_qdisc(netdev_);
2123 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2125 ? netdev->tc->ops->class_get(netdev_, queue, details)
2128 ovs_mutex_unlock(&netdev->mutex);
2134 netdev_linux_set_queue(struct netdev *netdev_,
2135 unsigned int queue_id, const struct smap *details)
2137 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2140 ovs_mutex_lock(&netdev->mutex);
2141 error = tc_query_qdisc(netdev_);
2143 error = (queue_id < netdev->tc->ops->n_queues
2144 && netdev->tc->ops->class_set
2145 ? netdev->tc->ops->class_set(netdev_, queue_id, details)
2148 ovs_mutex_unlock(&netdev->mutex);
2154 netdev_linux_delete_queue(struct netdev *netdev_, unsigned int queue_id)
2156 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2159 ovs_mutex_lock(&netdev->mutex);
2160 error = tc_query_qdisc(netdev_);
2162 if (netdev->tc->ops->class_delete) {
2163 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2165 ? netdev->tc->ops->class_delete(netdev_, queue)
2171 ovs_mutex_unlock(&netdev->mutex);
2177 netdev_linux_get_queue_stats(const struct netdev *netdev_,
2178 unsigned int queue_id,
2179 struct netdev_queue_stats *stats)
2181 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2184 ovs_mutex_lock(&netdev->mutex);
2185 error = tc_query_qdisc(netdev_);
2187 if (netdev->tc->ops->class_get_stats) {
2188 const struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2190 stats->created = queue->created;
2191 error = netdev->tc->ops->class_get_stats(netdev_, queue,
2200 ovs_mutex_unlock(&netdev->mutex);
2205 struct queue_dump_state {
2206 struct nl_dump dump;
2211 start_queue_dump(const struct netdev *netdev, struct queue_dump_state *state)
2213 struct ofpbuf request;
2214 struct tcmsg *tcmsg;
2216 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
2220 tcmsg->tcm_parent = 0;
2221 nl_dump_start(&state->dump, NETLINK_ROUTE, &request);
2222 ofpbuf_uninit(&request);
2224 ofpbuf_init(&state->buf, NL_DUMP_BUFSIZE);
2229 finish_queue_dump(struct queue_dump_state *state)
2231 ofpbuf_uninit(&state->buf);
2232 return nl_dump_done(&state->dump);
2235 struct netdev_linux_queue_state {
2236 unsigned int *queues;
2242 netdev_linux_queue_dump_start(const struct netdev *netdev_, void **statep)
2244 const struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2247 ovs_mutex_lock(&netdev->mutex);
2248 error = tc_query_qdisc(netdev_);
2250 if (netdev->tc->ops->class_get) {
2251 struct netdev_linux_queue_state *state;
2252 struct tc_queue *queue;
2255 *statep = state = xmalloc(sizeof *state);
2256 state->n_queues = hmap_count(&netdev->tc->queues);
2257 state->cur_queue = 0;
2258 state->queues = xmalloc(state->n_queues * sizeof *state->queues);
2261 HMAP_FOR_EACH (queue, hmap_node, &netdev->tc->queues) {
2262 state->queues[i++] = queue->queue_id;
2268 ovs_mutex_unlock(&netdev->mutex);
2274 netdev_linux_queue_dump_next(const struct netdev *netdev_, void *state_,
2275 unsigned int *queue_idp, struct smap *details)
2277 const struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2278 struct netdev_linux_queue_state *state = state_;
2281 ovs_mutex_lock(&netdev->mutex);
2282 while (state->cur_queue < state->n_queues) {
2283 unsigned int queue_id = state->queues[state->cur_queue++];
2284 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2287 *queue_idp = queue_id;
2288 error = netdev->tc->ops->class_get(netdev_, queue, details);
2292 ovs_mutex_unlock(&netdev->mutex);
2298 netdev_linux_queue_dump_done(const struct netdev *netdev OVS_UNUSED,
2301 struct netdev_linux_queue_state *state = state_;
2303 free(state->queues);
2309 netdev_linux_dump_queue_stats(const struct netdev *netdev_,
2310 netdev_dump_queue_stats_cb *cb, void *aux)
2312 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2315 ovs_mutex_lock(&netdev->mutex);
2316 error = tc_query_qdisc(netdev_);
2318 struct queue_dump_state state;
2320 if (!netdev->tc->ops->class_dump_stats) {
2322 } else if (!start_queue_dump(netdev_, &state)) {
2328 while (nl_dump_next(&state.dump, &msg, &state.buf)) {
2329 retval = netdev->tc->ops->class_dump_stats(netdev_, &msg,
2336 retval = finish_queue_dump(&state);
2342 ovs_mutex_unlock(&netdev->mutex);
2348 netdev_linux_get_in4(const struct netdev *netdev_,
2349 struct in_addr *address, struct in_addr *netmask)
2351 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2354 ovs_mutex_lock(&netdev->mutex);
2355 if (!(netdev->cache_valid & VALID_IN4)) {
2356 error = netdev_linux_get_ipv4(netdev_, &netdev->address,
2357 SIOCGIFADDR, "SIOCGIFADDR");
2359 error = netdev_linux_get_ipv4(netdev_, &netdev->netmask,
2360 SIOCGIFNETMASK, "SIOCGIFNETMASK");
2362 netdev->cache_valid |= VALID_IN4;
2370 if (netdev->address.s_addr != INADDR_ANY) {
2371 *address = netdev->address;
2372 *netmask = netdev->netmask;
2374 error = EADDRNOTAVAIL;
2377 ovs_mutex_unlock(&netdev->mutex);
2383 netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
2384 struct in_addr netmask)
2386 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2389 ovs_mutex_lock(&netdev->mutex);
2390 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
2392 netdev->cache_valid |= VALID_IN4;
2393 netdev->address = address;
2394 netdev->netmask = netmask;
2395 if (address.s_addr != INADDR_ANY) {
2396 error = do_set_addr(netdev_, SIOCSIFNETMASK,
2397 "SIOCSIFNETMASK", netmask);
2400 ovs_mutex_unlock(&netdev->mutex);
/* Parses one line of /proc/net/if_inet6, storing the address in '*in6' and
 * the interface name in 'ifname'.  Returns true on success. */
static bool
parse_if_inet6_line(const char *line,
                    struct in6_addr *in6, char ifname[16 + 1])
{
    uint8_t *s6 = in6->s6_addr;
#define X8 "%2"SCNx8
    return ovs_scan(line,
                    " "X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8
                    "%*x %*x %*x %*x %16s\n",
                    &s6[0], &s6[1], &s6[2], &s6[3],
                    &s6[4], &s6[5], &s6[6], &s6[7],
                    &s6[8], &s6[9], &s6[10], &s6[11],
                    &s6[12], &s6[13], &s6[14], &s6[15],
                    ifname);
}
2421 /* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address (if
2422 * 'in6' is non-null) and returns true. Otherwise, returns false. */
2424 netdev_linux_get_in6(const struct netdev *netdev_, struct in6_addr *in6)
2426 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2428 ovs_mutex_lock(&netdev->mutex);
2429 if (!(netdev->cache_valid & VALID_IN6)) {
2433 netdev->in6 = in6addr_any;
2435 file = fopen("/proc/net/if_inet6", "r");
2437 const char *name = netdev_get_name(netdev_);
2438 while (fgets(line, sizeof line, file)) {
2439 struct in6_addr in6_tmp;
2440 char ifname[16 + 1];
2441 if (parse_if_inet6_line(line, &in6_tmp, ifname)
2442 && !strcmp(name, ifname))
2444 netdev->in6 = in6_tmp;
2450 netdev->cache_valid |= VALID_IN6;
2453 ovs_mutex_unlock(&netdev->mutex);
/* Initializes '*sa' as an AF_INET sockaddr holding 'addr' with port 0. */
static void
make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
{
    struct sockaddr_in sin;
    memset(&sin, 0, sizeof sin);
    sin.sin_family = AF_INET;
    sin.sin_addr = addr;
    sin.sin_port = 0;

    memset(sa, 0, sizeof *sa);
    memcpy(sa, &sin, sizeof sin);
}
2472 do_set_addr(struct netdev *netdev,
2473 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
2477 make_in4_sockaddr(&ifr.ifr_addr, addr);
2478 return af_inet_ifreq_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
2482 /* Adds 'router' as a default IP gateway. */
2484 netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
2486 struct in_addr any = { INADDR_ANY };
2490 memset(&rt, 0, sizeof rt);
2491 make_in4_sockaddr(&rt.rt_dst, any);
2492 make_in4_sockaddr(&rt.rt_gateway, router);
2493 make_in4_sockaddr(&rt.rt_genmask, any);
2494 rt.rt_flags = RTF_UP | RTF_GATEWAY;
2495 error = af_inet_ioctl(SIOCADDRT, &rt);
2497 VLOG_WARN("ioctl(SIOCADDRT): %s", ovs_strerror(error));
/* Looks up the next hop toward '*host' by scanning the kernel routing table
 * exposed at /proc/net/route.  On a match, stores the gateway (or 0 if the
 * host is directly reachable) in '*next_hop' and the outgoing interface name
 * in '*netdev_name' (malloc'd; caller frees). */
2503 netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
2506 static const char fn[] = "/proc/net/route";
2511 *netdev_name = NULL;
2512 stream = fopen(fn, "r");
2513 if (stream == NULL) {
2514 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, ovs_strerror(errno));
/* One route entry per line; parse each field of the /proc/net/route format. */
2519 while (fgets(line, sizeof line, stream)) {
2522 ovs_be32 dest, gateway, mask;
2523 int refcnt, metric, mtu;
2524 unsigned int flags, use, window, irtt;
2527 "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
2529 iface, &dest, &gateway, &flags, &refcnt,
2530 &use, &metric, &mask, &mtu, &window, &irtt)) {
2531 VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
2535 if (!(flags & RTF_UP)) {
2536 /* Skip routes that aren't up. */
2540 /* The output of 'dest', 'mask', and 'gateway' were given in
2541 * network byte order, so we don't need need any endian
2542 * conversions here. */
2543 if ((dest & mask) == (host->s_addr & mask)) {
2545 /* The host is directly reachable. */
2546 next_hop->s_addr = 0;
2548 /* To reach the host, we must go through a gateway. */
2549 next_hop->s_addr = gateway;
2551 *netdev_name = xstrdup(iface);
/* Fills 'smap' with driver name/version and firmware version obtained via
 * the ETHTOOL_GDRVINFO ioctl.  The result is cached in 'netdev->drvinfo'
 * under VALID_DRVINFO so the ioctl is issued at most once per device. */
2563 netdev_linux_get_status(const struct netdev *netdev_, struct smap *smap)
2565 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2568 ovs_mutex_lock(&netdev->mutex);
2569 if (!(netdev->cache_valid & VALID_DRVINFO)) {
/* netdev_linux_do_ethtool() takes an ethtool_cmd; drvinfo is passed
 * through that interface via this cast. */
2570 struct ethtool_cmd *cmd = (struct ethtool_cmd *) &netdev->drvinfo;
2572 COVERAGE_INC(netdev_get_ethtool);
2573 memset(&netdev->drvinfo, 0, sizeof netdev->drvinfo);
2574 error = netdev_linux_do_ethtool(netdev->up.name,
2577 "ETHTOOL_GDRVINFO");
2579 netdev->cache_valid |= VALID_DRVINFO;
2584 smap_add(smap, "driver_name", netdev->drvinfo.driver);
2585 smap_add(smap, "driver_version", netdev->drvinfo.version);
2586 smap_add(smap, "firmware_version", netdev->drvinfo.fw_version);
2588 ovs_mutex_unlock(&netdev->mutex);
/* Status for internal devices: these have no hardware driver, so report a
 * fixed "openvswitch" driver name. */
2594 netdev_internal_get_status(const struct netdev *netdev OVS_UNUSED,
2597 smap_add(smap, "driver_name", "openvswitch");
2601 /* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
2602 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
2603 * returns 0. Otherwise, it returns a positive errno value; in particular,
2604 * ENXIO indicates that there is not ARP table entry for 'ip' on 'netdev'. */
2606 netdev_linux_arp_lookup(const struct netdev *netdev,
2607 ovs_be32 ip, uint8_t mac[ETH_ADDR_LEN])
2610 struct sockaddr_in sin;
/* Build the SIOCGARP request: protocol address is the IPv4 'ip', hardware
 * type Ethernet, device name limited to the arpreq's arp_dev field. */
2613 memset(&r, 0, sizeof r);
2614 memset(&sin, 0, sizeof sin);
2615 sin.sin_family = AF_INET;
2616 sin.sin_addr.s_addr = ip;
2618 memcpy(&r.arp_pa, &sin, sizeof sin);
2619 r.arp_ha.sa_family = ARPHRD_ETHER;
2621 ovs_strzcpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
2622 COVERAGE_INC(netdev_arp_lookup);
2623 retval = af_inet_ioctl(SIOCGARP, &r);
2625 memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
/* ENXIO simply means "no entry", which callers handle; only log other
 * errors, rate-limited. */
2626 } else if (retval != ENXIO) {
2627 VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
2628 netdev_get_name(netdev), IP_ARGS(ip),
2629 ovs_strerror(retval));
/* Translates netdev_flags 'nd' (NETDEV_UP, NETDEV_PROMISC, NETDEV_LOOPBACK)
 * into the corresponding Linux IFF_* interface-flag bits. */
2635 nd_to_iff_flags(enum netdev_flags nd)
2638 if (nd & NETDEV_UP) {
2641 if (nd & NETDEV_PROMISC) {
2644 if (nd & NETDEV_LOOPBACK) {
2645 iff |= IFF_LOOPBACK;
/* Inverse of nd_to_iff_flags(): translates Linux IFF_* bits in 'iff' into
 * the corresponding netdev_flags. */
2651 iff_to_nd_flags(int iff)
2653 enum netdev_flags nd = 0;
2657 if (iff & IFF_PROMISC) {
2658 nd |= NETDEV_PROMISC;
2660 if (iff & IFF_LOOPBACK) {
2661 nd |= NETDEV_LOOPBACK;
/* Turns off the netdev_flags in 'off' and turns on those in 'on' for
 * 'netdev', storing the previous flags in '*old_flagsp'.  Works from the
 * cached ifi_flags, issues SIOCSIFFLAGS-style set only when the flag word
 * actually changes, then re-reads the kernel's view into the cache.
 * Caller must hold netdev->mutex (OVS_REQUIRES). */
2667 update_flags(struct netdev_linux *netdev, enum netdev_flags off,
2668 enum netdev_flags on, enum netdev_flags *old_flagsp)
2669 OVS_REQUIRES(netdev->mutex)
2671 int old_flags, new_flags;
2674 old_flags = netdev->ifi_flags;
2675 *old_flagsp = iff_to_nd_flags(old_flags);
2676 new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
2677 if (new_flags != old_flags) {
2678 error = set_flags(netdev_get_name(&netdev->up), new_flags);
/* Refresh the cache from the kernel even after set_flags(), since the
 * kernel may not accept every requested bit. */
2679 get_flags(&netdev->up, &netdev->ifi_flags);
/* netdev_class 'update_flags' callback: locks the device mutex and
 * delegates to update_flags(). */
2686 netdev_linux_update_flags(struct netdev *netdev_, enum netdev_flags off,
2687 enum netdev_flags on, enum netdev_flags *old_flagsp)
2689 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2692 ovs_mutex_lock(&netdev->mutex);
2693 error = update_flags(netdev, off, on, old_flagsp);
2694 ovs_mutex_unlock(&netdev->mutex);
/* Template for a struct netdev_class initializer.  The five parameters are
 * the points where the "system" (plain Linux), "tap", and "internal" device
 * classes differ: class NAME, CONSTRUCT function, stats get/set, features,
 * and status callbacks.  All other callbacks are shared netdev_linux_*
 * implementations (NULL where the operation is unsupported). */
2699 #define NETDEV_LINUX_CLASS(NAME, CONSTRUCT, GET_STATS, SET_STATS, \
2700 GET_FEATURES, GET_STATUS) \
2706 netdev_linux_wait, \
2708 netdev_linux_alloc, \
2710 netdev_linux_destruct, \
2711 netdev_linux_dealloc, \
2712 NULL, /* get_config */ \
2713 NULL, /* set_config */ \
2714 NULL, /* get_tunnel_config */ \
2716 netdev_linux_send, \
2717 netdev_linux_send_wait, \
2719 netdev_linux_set_etheraddr, \
2720 netdev_linux_get_etheraddr, \
2721 netdev_linux_get_mtu, \
2722 netdev_linux_set_mtu, \
2723 netdev_linux_get_ifindex, \
2724 netdev_linux_get_carrier, \
2725 netdev_linux_get_carrier_resets, \
2726 netdev_linux_set_miimon_interval, \
2731 netdev_linux_set_advertisements, \
2733 netdev_linux_set_policing, \
2734 netdev_linux_get_qos_types, \
2735 netdev_linux_get_qos_capabilities, \
2736 netdev_linux_get_qos, \
2737 netdev_linux_set_qos, \
2738 netdev_linux_get_queue, \
2739 netdev_linux_set_queue, \
2740 netdev_linux_delete_queue, \
2741 netdev_linux_get_queue_stats, \
2742 netdev_linux_queue_dump_start, \
2743 netdev_linux_queue_dump_next, \
2744 netdev_linux_queue_dump_done, \
2745 netdev_linux_dump_queue_stats, \
2747 netdev_linux_get_in4, \
2748 netdev_linux_set_in4, \
2749 netdev_linux_get_in6, \
2750 netdev_linux_add_router, \
2751 netdev_linux_get_next_hop, \
2753 netdev_linux_arp_lookup, \
2755 netdev_linux_update_flags, \
2757 netdev_linux_rx_alloc, \
2758 netdev_linux_rx_construct, \
2759 netdev_linux_rx_destruct, \
2760 netdev_linux_rx_dealloc, \
2761 netdev_linux_rx_recv, \
2762 netdev_linux_rx_wait, \
2763 netdev_linux_rx_drain, \
/* "system" devices: ordinary kernel network devices. */
2766 const struct netdev_class netdev_linux_class =
2769 netdev_linux_construct,
2770 netdev_linux_get_stats,
2771 NULL, /* set_stats */
2772 netdev_linux_get_features,
2773 netdev_linux_get_status);
/* "tap" devices: userspace TAP devices; stats come from the tap side. */
2775 const struct netdev_class netdev_tap_class =
2778 netdev_linux_construct_tap,
2779 netdev_tap_get_stats,
2780 NULL, /* set_stats */
2781 netdev_linux_get_features,
2782 netdev_linux_get_status);
/* "internal" devices: OVS internal ports; stats are settable and there is
 * no meaningful link-features query. */
2784 const struct netdev_class netdev_internal_class =
2787 netdev_linux_construct,
2788 netdev_internal_get_stats,
2789 netdev_internal_set_stats,
2790 NULL, /* get_features */
2791 netdev_internal_get_status);
2793 /* HTB traffic control class. */
2795 #define HTB_N_QUEUES 0xf000
/* Qdisc-level state: overall link cap. */
2799 unsigned int max_rate; /* In bytes/s. */
/* Per-queue (per-HTB-class) state; embeds the generic tc_queue node. */
2803 struct tc_queue tc_queue;
2804 unsigned int min_rate; /* In bytes/s. */
2805 unsigned int max_rate; /* In bytes/s. */
2806 unsigned int burst; /* In bytes. */
2807 unsigned int priority; /* Lower values are higher priorities. */
/* Returns the struct htb embedded in 'netdev_''s current tc object.  Only
 * valid while the device's qdisc is HTB. */
2811 htb_get__(const struct netdev *netdev_)
2813 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2814 return CONTAINER_OF(netdev->tc, struct htb, tc);
/* Allocates a new struct htb with the given 'max_rate' and installs it as
 * 'netdev_''s tc state. */
2818 htb_install__(struct netdev *netdev_, uint64_t max_rate)
2820 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2823 htb = xmalloc(sizeof *htb);
2824 tc_init(&htb->tc, &tc_ops_htb);
2825 htb->max_rate = max_rate;
2827 netdev->tc = &htb->tc;
2830 /* Create an HTB qdisc.
2832 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
2834 htb_setup_qdisc__(struct netdev *netdev)
2837 struct tc_htb_glob opt;
2838 struct ofpbuf request;
2839 struct tcmsg *tcmsg;
/* Remove any existing root qdisc first so the add cannot conflict. */
2841 tc_del_qdisc(netdev);
2843 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2844 NLM_F_EXCL | NLM_F_CREATE, &request);
2848 tcmsg->tcm_handle = tc_make_handle(1, 0);
2849 tcmsg->tcm_parent = TC_H_ROOT;
2851 nl_msg_put_string(&request, TCA_KIND, "htb");
2853 memset(&opt, 0, sizeof opt);
2854 opt.rate2quantum = 10;
/* HTB global options go inside a nested TCA_OPTIONS attribute. */
2858 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2859 nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
2860 nl_msg_end_nested(&request, opt_offset);
2862 return tc_transact(&request, NULL);
2865 /* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
2866 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
2868 htb_setup_class__(struct netdev *netdev, unsigned int handle,
2869 unsigned int parent, struct htb_class *class)
2872 struct tc_htb_opt opt;
2873 struct ofpbuf request;
2874 struct tcmsg *tcmsg;
/* HTB rate tables are MTU-dependent, so the device MTU is required. */
2878 error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
2880 VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU",
2881 netdev_get_name(netdev));
/* Translate min/max rate and burst into kernel tc_htb_opt form. */
2885 memset(&opt, 0, sizeof opt);
2886 tc_fill_rate(&opt.rate, class->min_rate, mtu);
2887 tc_fill_rate(&opt.ceil, class->max_rate, mtu);
2888 opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
2889 opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
2890 opt.prio = class->priority;
2892 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
2896 tcmsg->tcm_handle = handle;
2897 tcmsg->tcm_parent = parent;
/* TCA_OPTIONS carries the HTB parms plus rate tables for rate and ceil. */
2899 nl_msg_put_string(&request, TCA_KIND, "htb");
2900 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2901 nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
2902 tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
2903 tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
2904 nl_msg_end_nested(&request, opt_offset);
2906 error = tc_transact(&request, NULL);
2908 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
2909 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
2910 netdev_get_name(netdev),
2911 tc_get_major(handle), tc_get_minor(handle),
2912 tc_get_major(parent), tc_get_minor(parent),
2913 class->min_rate, class->max_rate,
2914 class->burst, class->priority, ovs_strerror(error));
2919 /* Parses Netlink attributes in 'options' for HTB parameters and stores a
2920 * description of them into 'details'. The description complies with the
2921 * specification given in the vswitch database documentation for linux-htb
2924 htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
2926 static const struct nl_policy tca_htb_policy[] = {
2927 [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
2928 .min_len = sizeof(struct tc_htb_opt) },
2931 struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
2932 const struct tc_htb_opt *htb;
2934 if (!nl_parse_nested(nl_options, tca_htb_policy,
2935 attrs, ARRAY_SIZE(tca_htb_policy))) {
2936 VLOG_WARN_RL(&rl, "failed to parse HTB class options");
/* Convert kernel tc_htb_opt fields back into OVS htb_class units
 * (rates in bytes/s, burst converted from tc ticks to bytes). */
2940 htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
2941 class->min_rate = htb->rate.rate;
2942 class->max_rate = htb->ceil.rate;
2943 class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
2944 class->priority = htb->prio;
/* Parses a netlink tc class message: extracts the queue id from the class
 * handle (minor - 1 within major 1), the HTB options if requested, and the
 * queue statistics if requested. */
2949 htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2950 struct htb_class *options,
2951 struct netdev_queue_stats *stats)
2953 struct nlattr *nl_options;
2954 unsigned int handle;
2957 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2958 if (!error && queue_id) {
2959 unsigned int major = tc_get_major(handle);
2960 unsigned int minor = tc_get_minor(handle);
/* Queue ids map to handles 1:1 .. 1:HTB_N_QUEUES; 1:0 is the qdisc. */
2961 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2962 *queue_id = minor - 1;
2967 if (!error && options) {
2968 error = htb_parse_tca_options__(nl_options, options);
/* Fills 'hc' from the "max-rate" key in 'details' (bits/s in the database,
 * converted to bytes/s here).  If max-rate is absent or zero, falls back to
 * the link's current feature-derived speed (default 100 Mbps). */
2974 htb_parse_qdisc_details__(struct netdev *netdev_,
2975 const struct smap *details, struct htb_class *hc)
2977 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2978 const char *max_rate_s;
2980 max_rate_s = smap_get(details, "max-rate");
2981 hc->max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
2982 if (!hc->max_rate) {
2983 enum netdev_features current;
2985 netdev_linux_read_features(netdev);
2986 current = !netdev->get_features_error ? netdev->current : 0;
2987 hc->max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
/* The default/root class gets the full link rate as its guarantee. */
2989 hc->min_rate = hc->max_rate;
/* Fills 'hc' from per-queue keys in 'details' ("min-rate", "max-rate",
 * "burst", "priority"), clamping each value to a sane range against the
 * device MTU and the qdisc-level max_rate. */
2995 htb_parse_class_details__(struct netdev *netdev,
2996 const struct smap *details, struct htb_class *hc)
2998 const struct htb *htb = htb_get__(netdev);
2999 const char *min_rate_s = smap_get(details, "min-rate");
3000 const char *max_rate_s = smap_get(details, "max-rate");
3001 const char *burst_s = smap_get(details, "burst");
3002 const char *priority_s = smap_get(details, "priority");
3005 error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
3007 VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU",
3008 netdev_get_name(netdev));
3012 /* HTB requires at least an mtu sized min-rate to send any traffic even
3013 * on uncongested links. */
3014 hc->min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
3015 hc->min_rate = MAX(hc->min_rate, mtu);
3016 hc->min_rate = MIN(hc->min_rate, htb->max_rate);
3019 hc->max_rate = (max_rate_s
3020 ? strtoull(max_rate_s, NULL, 10) / 8
3022 hc->max_rate = MAX(hc->max_rate, hc->min_rate);
3023 hc->max_rate = MIN(hc->max_rate, htb->max_rate);
3027 * According to hints in the documentation that I've read, it is important
3028 * that 'burst' be at least as big as the largest frame that might be
3029 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
3030 * but having it a bit too small is a problem. Since netdev_get_mtu()
3031 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
3032 * the MTU. We actually add 64, instead of 14, as a guard against
3033 * additional headers get tacked on somewhere that we're not aware of. */
3034 hc->burst = burst_s ? strtoull(burst_s, NULL, 10) / 8 : 0;
3035 hc->burst = MAX(hc->burst, mtu + 64);
3038 hc->priority = priority_s ? strtoul(priority_s, NULL, 10) : 0;
/* Queries the kernel for HTB class 'handle' under 'parent' on 'netdev' and
 * parses the reply into 'options' and/or 'stats' (either may be NULL). */
3044 htb_query_class__(const struct netdev *netdev, unsigned int handle,
3045 unsigned int parent, struct htb_class *options,
3046 struct netdev_queue_stats *stats)
3048 struct ofpbuf *reply;
3051 error = tc_query_class(netdev, handle, parent, &reply);
3053 error = htb_parse_tcmsg__(reply, NULL, options, stats);
3054 ofpbuf_delete(reply);
/* tc_ops 'tc_install' callback: creates the HTB qdisc, sets up the default
 * class 1:fffe from 'details', then records the htb state on the netdev. */
3060 htb_tc_install(struct netdev *netdev, const struct smap *details)
3064 error = htb_setup_qdisc__(netdev);
3066 struct htb_class hc;
3068 htb_parse_qdisc_details__(netdev, details, &hc);
3069 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3070 tc_make_handle(1, 0), &hc);
3072 htb_install__(netdev, hc.max_rate);
/* Returns the htb_class containing generic 'queue'. */
3078 static struct htb_class *
3079 htb_class_cast__(const struct tc_queue *queue)
3081 return CONTAINER_OF(queue, struct htb_class, tc_queue);
/* Inserts or updates the in-memory record for 'queue_id' with the values in
 * 'hc'.  Allocates a new htb_class and adds it to the tc queue hmap when the
 * queue is not yet known. */
3085 htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
3086 const struct htb_class *hc)
3088 struct htb *htb = htb_get__(netdev);
3089 size_t hash = hash_int(queue_id, 0);
3090 struct tc_queue *queue;
3091 struct htb_class *hcp;
3093 queue = tc_find_queue__(netdev, queue_id, hash);
3095 hcp = htb_class_cast__(queue);
3097 hcp = xmalloc(sizeof *hcp);
3098 queue = &hcp->tc_queue;
3099 queue->queue_id = queue_id;
3100 queue->created = time_msec();
3101 hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
3104 hcp->min_rate = hc->min_rate;
3105 hcp->max_rate = hc->max_rate;
3106 hcp->burst = hc->burst;
3107 hcp->priority = hc->priority;
/* tc_ops 'tc_load' callback: reconstructs OVS's view of an existing kernel
 * HTB qdisc by querying the default class 1:fffe for the max rate and then
 * dumping every class to populate the queue map. */
3111 htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3114 struct queue_dump_state state;
3115 struct htb_class hc;
3117 /* Get qdisc options. */
3119 htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
3120 htb_install__(netdev, hc.max_rate);
3123 if (!start_queue_dump(netdev, &state)) {
3126 while (nl_dump_next(&state.dump, &msg, &state.buf)) {
3127 unsigned int queue_id;
3129 if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3130 htb_update_queue__(netdev, queue_id, &hc);
3133 finish_queue_dump(&state);
/* tc_ops 'tc_destroy' callback: frees all queue records and the htb. */
3139 htb_tc_destroy(struct tc *tc)
3141 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
3142 struct htb_class *hc, *next;
3144 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &htb->tc.queues) {
3145 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
/* qdisc_get: report max-rate in bits/s (stored internally in bytes/s). */
3153 htb_qdisc_get(const struct netdev *netdev, struct smap *details)
3155 const struct htb *htb = htb_get__(netdev);
3156 smap_add_format(details, "max-rate", "%llu", 8ULL * htb->max_rate);
/* qdisc_set: reconfigure the default class 1:fffe from 'details' and record
 * the new qdisc max rate on success. */
3161 htb_qdisc_set(struct netdev *netdev, const struct smap *details)
3163 struct htb_class hc;
3166 htb_parse_qdisc_details__(netdev, details, &hc);
3167 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3168 tc_make_handle(1, 0), &hc);
3170 htb_get__(netdev)->max_rate = hc.max_rate;
/* class_get: report queue parameters; max-rate is omitted when it equals
 * min-rate (the database convention for "unset"). */
3176 htb_class_get(const struct netdev *netdev OVS_UNUSED,
3177 const struct tc_queue *queue, struct smap *details)
3179 const struct htb_class *hc = htb_class_cast__(queue);
3181 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
3182 if (hc->min_rate != hc->max_rate) {
3183 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
3185 smap_add_format(details, "burst", "%llu", 8ULL * hc->burst);
3187 smap_add_format(details, "priority", "%u", hc->priority);
/* class_set: parse 'details', program kernel class 1:(queue_id+1) under the
 * default class, and mirror the result into the in-memory queue map. */
3193 htb_class_set(struct netdev *netdev, unsigned int queue_id,
3194 const struct smap *details)
3196 struct htb_class hc;
3199 error = htb_parse_class_details__(netdev, details, &hc);
3204 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3205 tc_make_handle(1, 0xfffe), &hc);
3210 htb_update_queue__(netdev, queue_id, &hc);
/* class_delete: remove the kernel class, then drop the in-memory record. */
3215 htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
3217 struct htb_class *hc = htb_class_cast__(queue);
3218 struct htb *htb = htb_get__(netdev);
3221 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3223 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
/* class_get_stats: query the kernel for this queue's class statistics. */
3230 htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3231 struct netdev_queue_stats *stats)
3233 return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3234 tc_make_handle(1, 0xfffe), NULL, stats);
/* class_dump_stats: invoked per dumped class message; invokes 'cb' for each
 * handle that maps to a valid queue id. */
3238 htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3239 const struct ofpbuf *nlmsg,
3240 netdev_dump_queue_stats_cb *cb, void *aux)
3242 struct netdev_queue_stats stats;
3243 unsigned int handle, major, minor;
3246 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3251 major = tc_get_major(handle);
3252 minor = tc_get_minor(handle);
3253 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
3254 (*cb)(minor - 1, &stats, aux);
/* vtable binding the HTB implementation to the "linux-htb" OVS QoS type. */
3259 static const struct tc_ops tc_ops_htb = {
3260 "htb", /* linux_name */
3261 "linux-htb", /* ovs_name */
3262 HTB_N_QUEUES, /* n_queues */
3271 htb_class_get_stats,
3272 htb_class_dump_stats
3275 /* "linux-hfsc" traffic control class. */
3277 #define HFSC_N_QUEUES 0xf000
/* Per-queue (per-HFSC-class) state; embeds the generic tc_queue node. */
3285 struct tc_queue tc_queue;
/* Returns the struct hfsc embedded in 'netdev_''s current tc object. */
3290 static struct hfsc *
3291 hfsc_get__(const struct netdev *netdev_)
3293 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3294 return CONTAINER_OF(netdev->tc, struct hfsc, tc);
/* Returns the hfsc_class containing generic 'queue'. */
3297 static struct hfsc_class *
3298 hfsc_class_cast__(const struct tc_queue *queue)
3300 return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
/* Allocates a new struct hfsc with the given 'max_rate' and installs it as
 * 'netdev_''s tc state. */
3304 hfsc_install__(struct netdev *netdev_, uint32_t max_rate)
3306 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3309 hfsc = xmalloc(sizeof *hfsc);
3310 tc_init(&hfsc->tc, &tc_ops_hfsc);
3311 hfsc->max_rate = max_rate;
3312 netdev->tc = &hfsc->tc;
/* Inserts or updates the in-memory record for 'queue_id' with the rates in
 * 'hc', allocating a new hfsc_class if the queue is not yet known. */
3316 hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
3317 const struct hfsc_class *hc)
3321 struct hfsc_class *hcp;
3322 struct tc_queue *queue;
3324 hfsc = hfsc_get__(netdev);
3325 hash = hash_int(queue_id, 0);
3327 queue = tc_find_queue__(netdev, queue_id, hash);
3329 hcp = hfsc_class_cast__(queue);
3331 hcp = xmalloc(sizeof *hcp);
3332 queue = &hcp->tc_queue;
3333 queue->queue_id = queue_id;
3334 queue->created = time_msec();
3335 hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
3338 hcp->min_rate = hc->min_rate;
3339 hcp->max_rate = hc->max_rate;
/* Parses nested HFSC netlink options into 'class'.  OVS only configures
 * linear service curves (m1 == d == 0) with equal real-time and link-share
 * curves, so anything else from the kernel is rejected as unsupported. */
3343 hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
3345 const struct tc_service_curve *rsc, *fsc, *usc;
3346 static const struct nl_policy tca_hfsc_policy[] = {
3348 .type = NL_A_UNSPEC,
3350 .min_len = sizeof(struct tc_service_curve),
3353 .type = NL_A_UNSPEC,
3355 .min_len = sizeof(struct tc_service_curve),
3358 .type = NL_A_UNSPEC,
3360 .min_len = sizeof(struct tc_service_curve),
3363 struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];
3365 if (!nl_parse_nested(nl_options, tca_hfsc_policy,
3366 attrs, ARRAY_SIZE(tca_hfsc_policy))) {
3367 VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
3371 rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
3372 fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
3373 usc = nl_attr_get(attrs[TCA_HFSC_USC]);
3375 if (rsc->m1 != 0 || rsc->d != 0 ||
3376 fsc->m1 != 0 || fsc->d != 0 ||
3377 usc->m1 != 0 || usc->d != 0) {
3378 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3379 "Non-linear service curves are not supported.");
3383 if (rsc->m2 != fsc->m2) {
3384 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3385 "Real-time service curves are not supported ");
3389 if (rsc->m2 > usc->m2) {
3390 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3391 "Min-rate service curve is greater than "
3392 "the max-rate service curve.");
/* min-rate comes from the link-share curve, max-rate from the upper-limit
 * curve. */
3396 class->min_rate = fsc->m2;
3397 class->max_rate = usc->m2;
/* Parses a netlink tc class message for HFSC: queue id from the handle
 * (minor - 1 within major 1), options and stats if requested. */
3402 hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
3403 struct hfsc_class *options,
3404 struct netdev_queue_stats *stats)
3407 unsigned int handle;
3408 struct nlattr *nl_options;
3410 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
3416 unsigned int major, minor;
3418 major = tc_get_major(handle);
3419 minor = tc_get_minor(handle);
3420 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3421 *queue_id = minor - 1;
3428 error = hfsc_parse_tca_options__(nl_options, options);
/* Queries the kernel for HFSC class 'handle' under 'parent' and parses the
 * reply into 'options' and/or 'stats' (either may be NULL). */
3435 hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
3436 unsigned int parent, struct hfsc_class *options,
3437 struct netdev_queue_stats *stats)
3440 struct ofpbuf *reply;
3442 error = tc_query_class(netdev, handle, parent, &reply);
3447 error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
3448 ofpbuf_delete(reply);
/* Fills 'class' from the "max-rate" key in 'details' (bits/s in the
 * database, bytes/s internally); falls back to the link's feature-derived
 * speed (default 100 Mbps) when absent or zero.  min_rate is set equal to
 * max_rate for the qdisc-level default class. */
3453 hfsc_parse_qdisc_details__(struct netdev *netdev_, const struct smap *details,
3454 struct hfsc_class *class)
3456 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3458 const char *max_rate_s;
3460 max_rate_s = smap_get(details, "max-rate");
3461 max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
3464 enum netdev_features current;
3466 netdev_linux_read_features(netdev);
3467 current = !netdev->get_features_error ? netdev->current : 0;
3468 max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
3471 class->min_rate = max_rate;
3472 class->max_rate = max_rate;
/* Fills 'class' from per-queue "min-rate"/"max-rate" keys in 'details',
 * clamping: min-rate at least 1 and at most the qdisc max rate; max-rate at
 * least min-rate and at most the qdisc max rate. */
3476 hfsc_parse_class_details__(struct netdev *netdev,
3477 const struct smap *details,
3478 struct hfsc_class * class)
3480 const struct hfsc *hfsc;
3481 uint32_t min_rate, max_rate;
3482 const char *min_rate_s, *max_rate_s;
3484 hfsc = hfsc_get__(netdev);
3485 min_rate_s = smap_get(details, "min-rate");
3486 max_rate_s = smap_get(details, "max-rate");
3488 min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
3489 min_rate = MAX(min_rate, 1);
3490 min_rate = MIN(min_rate, hfsc->max_rate);
3492 max_rate = (max_rate_s
3493 ? strtoull(max_rate_s, NULL, 10) / 8
3495 max_rate = MAX(max_rate, min_rate);
3496 max_rate = MIN(max_rate, hfsc->max_rate);
3498 class->min_rate = min_rate;
3499 class->max_rate = max_rate;
3504 /* Create an HFSC qdisc.
3506 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
3508 hfsc_setup_qdisc__(struct netdev * netdev)
3510 struct tcmsg *tcmsg;
3511 struct ofpbuf request;
3512 struct tc_hfsc_qopt opt;
/* Remove any existing root qdisc first so the add cannot conflict. */
3514 tc_del_qdisc(netdev);
3516 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3517 NLM_F_EXCL | NLM_F_CREATE, &request);
3523 tcmsg->tcm_handle = tc_make_handle(1, 0);
3524 tcmsg->tcm_parent = TC_H_ROOT;
3526 memset(&opt, 0, sizeof opt);
3529 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3530 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
3532 return tc_transact(&request, NULL);
3535 /* Create an HFSC class.
3537 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
3538 * sc rate <min_rate> ul rate <max_rate>" */
3540 hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
3541 unsigned int parent, struct hfsc_class *class)
3545 struct tcmsg *tcmsg;
3546 struct ofpbuf request;
3547 struct tc_service_curve min, max;
3549 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
3555 tcmsg->tcm_handle = handle;
3556 tcmsg->tcm_parent = parent;
/* Linear service curves: only the m2 slope is set (m1/d elided lines are
 * presumably zeroed — matches the checks in hfsc_parse_tca_options__). */
3560 min.m2 = class->min_rate;
3564 max.m2 = class->max_rate;
/* The min curve is used for both RSC and FSC; the max curve is the USC
 * (upper limit). */
3566 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3567 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3568 nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
3569 nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
3570 nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
3571 nl_msg_end_nested(&request, opt_offset);
3573 error = tc_transact(&request, NULL);
3575 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
3576 "min-rate %ubps, max-rate %ubps (%s)",
3577 netdev_get_name(netdev),
3578 tc_get_major(handle), tc_get_minor(handle),
3579 tc_get_major(parent), tc_get_minor(parent),
3580 class->min_rate, class->max_rate, ovs_strerror(error));
/* tc_ops 'tc_install' callback: creates the HFSC qdisc, sets up the default
 * class 1:fffe from 'details', then records the hfsc state on the netdev. */
3587 hfsc_tc_install(struct netdev *netdev, const struct smap *details)
3590 struct hfsc_class class;
3592 error = hfsc_setup_qdisc__(netdev);
3598 hfsc_parse_qdisc_details__(netdev, details, &class);
3599 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3600 tc_make_handle(1, 0), &class);
3606 hfsc_install__(netdev, class.max_rate);
/* tc_ops 'tc_load' callback: rebuilds state from an existing kernel HFSC
 * qdisc — default class 1:fffe supplies the max rate, then a class dump
 * populates the queue map. */
3611 hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3614 struct queue_dump_state state;
3615 struct hfsc_class hc;
3618 hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
3619 hfsc_install__(netdev, hc.max_rate);
3621 if (!start_queue_dump(netdev, &state)) {
3625 while (nl_dump_next(&state.dump, &msg, &state.buf)) {
3626 unsigned int queue_id;
3628 if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3629 hfsc_update_queue__(netdev, queue_id, &hc);
3633 finish_queue_dump(&state);
/* tc_ops 'tc_destroy' callback: frees all queue records and the hfsc. */
3638 hfsc_tc_destroy(struct tc *tc)
3641 struct hfsc_class *hc, *next;
3643 hfsc = CONTAINER_OF(tc, struct hfsc, tc);
3645 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
3646 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
/* qdisc_get: report max-rate in bits/s (stored internally in bytes/s). */
3655 hfsc_qdisc_get(const struct netdev *netdev, struct smap *details)
3657 const struct hfsc *hfsc;
3658 hfsc = hfsc_get__(netdev);
3659 smap_add_format(details, "max-rate", "%llu", 8ULL * hfsc->max_rate);
/* qdisc_set: reconfigure the default class 1:fffe from 'details' and record
 * the new qdisc max rate on success. */
3664 hfsc_qdisc_set(struct netdev *netdev, const struct smap *details)
3667 struct hfsc_class class;
3669 hfsc_parse_qdisc_details__(netdev, details, &class);
3670 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3671 tc_make_handle(1, 0), &class);
3674 hfsc_get__(netdev)->max_rate = class.max_rate;
/* class_get: report queue parameters; max-rate is omitted when it equals
 * min-rate (the database convention for "unset"). */
3681 hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
3682 const struct tc_queue *queue, struct smap *details)
3684 const struct hfsc_class *hc;
3686 hc = hfsc_class_cast__(queue);
3687 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
3688 if (hc->min_rate != hc->max_rate) {
3689 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
/* class_set: parse 'details', program kernel class 1:(queue_id+1) under the
 * default class, and mirror the result into the in-memory queue map. */
3695 hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
3696 const struct smap *details)
3699 struct hfsc_class class;
3701 error = hfsc_parse_class_details__(netdev, details, &class);
3706 error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3707 tc_make_handle(1, 0xfffe), &class);
3712 hfsc_update_queue__(netdev, queue_id, &class);
/* class_delete: remove the kernel class, then drop the in-memory record. */
3717 hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
3721 struct hfsc_class *hc;
3723 hc = hfsc_class_cast__(queue);
3724 hfsc = hfsc_get__(netdev);
3726 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3728 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
/* class_get_stats: query the kernel for this queue's class statistics. */
3735 hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3736 struct netdev_queue_stats *stats)
3738 return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3739 tc_make_handle(1, 0xfffe), NULL, stats);
/* class_dump_stats: invoked per dumped class message; invokes 'cb' for each
 * handle that maps to a valid queue id. */
3743 hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3744 const struct ofpbuf *nlmsg,
3745 netdev_dump_queue_stats_cb *cb, void *aux)
3747 struct netdev_queue_stats stats;
3748 unsigned int handle, major, minor;
3751 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3756 major = tc_get_major(handle);
3757 minor = tc_get_minor(handle);
3758 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3759 (*cb)(minor - 1, &stats, aux);
/* vtable binding the HFSC implementation to the "linux-hfsc" OVS QoS type. */
3764 static const struct tc_ops tc_ops_hfsc = {
3765 "hfsc", /* linux_name */
3766 "linux-hfsc", /* ovs_name */
3767 HFSC_N_QUEUES, /* n_queues */
3768 hfsc_tc_install, /* tc_install */
3769 hfsc_tc_load, /* tc_load */
3770 hfsc_tc_destroy, /* tc_destroy */
3771 hfsc_qdisc_get, /* qdisc_get */
3772 hfsc_qdisc_set, /* qdisc_set */
3773 hfsc_class_get, /* class_get */
3774 hfsc_class_set, /* class_set */
3775 hfsc_class_delete, /* class_delete */
3776 hfsc_class_get_stats, /* class_get_stats */
3777 hfsc_class_dump_stats /* class_dump_stats */
3780 /* "linux-default" traffic control class.
3782 * This class represents the default, unnamed Linux qdisc. It corresponds to
3783 * the "" (empty string) QoS type in the OVS database. */
3786 default_install__(struct netdev *netdev_)
3788 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3789 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_default);
3791 /* Nothing but a tc class implementation is allowed to write to a tc. This
3792 * class never does that, so we can legitimately use a const tc object. */
3793 netdev->tc = CONST_CAST(struct tc *, &tc);
/* tc_install for "": nothing to program in the kernel, just record state. */
3797 default_tc_install(struct netdev *netdev,
3798 const struct smap *details OVS_UNUSED)
3800 default_install__(netdev);
/* tc_load for "": likewise a no-op apart from recording state. */
3805 default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3807 default_install__(netdev);
/* vtable for the default qdisc: no queues, no per-class operations. */
3811 static const struct tc_ops tc_ops_default = {
3812 NULL, /* linux_name */
3817 NULL, /* tc_destroy */
3818 NULL, /* qdisc_get */
3819 NULL, /* qdisc_set */
3820 NULL, /* class_get */
3821 NULL, /* class_set */
3822 NULL, /* class_delete */
3823 NULL, /* class_get_stats */
3824 NULL /* class_dump_stats */
3827 /* "linux-other" traffic control class.
/* Records a shared, read-only tc marking the qdisc as "something OVS does
 * not manage". */
3832 other_tc_load(struct netdev *netdev_, struct ofpbuf *nlmsg OVS_UNUSED)
3834 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3835 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_other);
3837 /* Nothing but a tc class implementation is allowed to write to a tc. This
3838 * class never does that, so we can legitimately use a const tc object. */
3839 netdev->tc = CONST_CAST(struct tc *, &tc);
/* vtable for unrecognized qdiscs: read-only, no operations supported. */
3843 static const struct tc_ops tc_ops_other = {
3844 NULL, /* linux_name */
3845 "linux-other", /* ovs_name */
3847 NULL, /* tc_install */
3849 NULL, /* tc_destroy */
3850 NULL, /* qdisc_get */
3851 NULL, /* qdisc_set */
3852 NULL, /* class_get */
3853 NULL, /* class_set */
3854 NULL, /* class_delete */
3855 NULL, /* class_get_stats */
3856 NULL /* class_dump_stats */
3859 /* Traffic control. */
3861 /* Number of kernel "tc" ticks per second. */
3862 static double ticks_per_s;
3864 /* Number of kernel "jiffies" per second. This is used for the purpose of
3865 * computing buffer sizes. Generally kernel qdiscs need to be able to buffer
3866 * one jiffy's worth of data.
3868 * There are two possibilities here:
3870 * - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
3871 * approximate range of 100 to 1024. That means that we really need to
3872 * make sure that the qdisc can buffer that much data.
3874 * - 'buffer_hz' is an absurdly large number. That means that the kernel
3875 * has finely granular timers and there's no need to fudge additional room
3876 * for buffers. (There's no extra effort needed to implement that: the
3877 * large 'buffer_hz' is used as a divisor, so practically any number will
3878 * come out as 0 in the division. Small integer results in the case of
3879 * really high dividends won't have any real effect anyhow.)
3881 static unsigned int buffer_hz;
3883 /* Returns tc handle 'major':'minor'. */
3885 tc_make_handle(unsigned int major, unsigned int minor)
3887 return TC_H_MAKE(major << 16, minor);
3890 /* Returns the major number from 'handle'. */
3892 tc_get_major(unsigned int handle)
3894 return TC_H_MAJ(handle) >> 16;
3897 /* Returns the minor number from 'handle'. */
3899 tc_get_minor(unsigned int handle)
3901 return TC_H_MIN(handle);
3904 static struct tcmsg *
3905 tc_make_request(const struct netdev *netdev, int type, unsigned int flags,
3906 struct ofpbuf *request)
3908 struct tcmsg *tcmsg;
3912 error = get_ifindex(netdev, &ifindex);
3917 ofpbuf_init(request, 512);
3918 nl_msg_put_nlmsghdr(request, sizeof *tcmsg, type, NLM_F_REQUEST | flags);
3919 tcmsg = ofpbuf_put_zeros(request, sizeof *tcmsg);
3920 tcmsg->tcm_family = AF_UNSPEC;
3921 tcmsg->tcm_ifindex = ifindex;
3922 /* Caller should fill in tcmsg->tcm_handle. */
3923 /* Caller should fill in tcmsg->tcm_parent. */
/* Sends 'request' on the NETLINK_ROUTE channel and, if 'replyp' is nonnull,
 * stores the kernel's reply into '*replyp' (which the caller must free with
 * ofpbuf_delete()).  Uninitializes 'request' in either case.
 *
 * Returns 0 if successful, otherwise a positive errno value. */
static int
tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
{
    int error = nl_transact(NETLINK_ROUTE, request, replyp);
    ofpbuf_uninit(request);
    return error;
}
3936 /* Adds or deletes a root ingress qdisc on 'netdev'. We use this for
3937 * policing configuration.
3939 * This function is equivalent to running the following when 'add' is true:
3940 * /sbin/tc qdisc add dev <devname> handle ffff: ingress
3942 * This function is equivalent to running the following when 'add' is false:
3943 * /sbin/tc qdisc del dev <devname> handle ffff: ingress
3945 * The configuration and stats may be seen with the following command:
3946 * /sbin/tc -s qdisc show dev <devname>
3948 * Returns 0 if successful, otherwise a positive errno value.
3951 tc_add_del_ingress_qdisc(struct netdev *netdev, bool add)
3953 struct ofpbuf request;
3954 struct tcmsg *tcmsg;
3956 int type = add ? RTM_NEWQDISC : RTM_DELQDISC;
3957 int flags = add ? NLM_F_EXCL | NLM_F_CREATE : 0;
3959 tcmsg = tc_make_request(netdev, type, flags, &request);
3963 tcmsg->tcm_handle = tc_make_handle(0xffff, 0);
3964 tcmsg->tcm_parent = TC_H_INGRESS;
3965 nl_msg_put_string(&request, TCA_KIND, "ingress");
3966 nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0);
3968 error = tc_transact(&request, NULL);
3970 /* If we're deleting the qdisc, don't worry about some of the
3971 * error conditions. */
3972 if (!add && (error == ENOENT || error == EINVAL)) {
3981 /* Adds a policer to 'netdev' with a rate of 'kbits_rate' and a burst size
3984 * This function is equivalent to running:
3985 * /sbin/tc filter add dev <devname> parent ffff: protocol all prio 49
3986 * basic police rate <kbits_rate>kbit burst <kbits_burst>k
3989 * The configuration and stats may be seen with the following command:
3990 * /sbin/tc -s filter show <devname> eth0 parent ffff:
3992 * Returns 0 if successful, otherwise a positive errno value.
3995 tc_add_policer(struct netdev *netdev, int kbits_rate, int kbits_burst)
3997 struct tc_police tc_police;
3998 struct ofpbuf request;
3999 struct tcmsg *tcmsg;
4000 size_t basic_offset;
4001 size_t police_offset;
4005 memset(&tc_police, 0, sizeof tc_police);
4006 tc_police.action = TC_POLICE_SHOT;
4007 tc_police.mtu = mtu;
4008 tc_fill_rate(&tc_police.rate, (kbits_rate * 1000)/8, mtu);
4009 tc_police.burst = tc_bytes_to_ticks(tc_police.rate.rate,
4010 kbits_burst * 1024);
4012 tcmsg = tc_make_request(netdev, RTM_NEWTFILTER,
4013 NLM_F_EXCL | NLM_F_CREATE, &request);
4017 tcmsg->tcm_parent = tc_make_handle(0xffff, 0);
4018 tcmsg->tcm_info = tc_make_handle(49,
4019 (OVS_FORCE uint16_t) htons(ETH_P_ALL));
4021 nl_msg_put_string(&request, TCA_KIND, "basic");
4022 basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
4023 police_offset = nl_msg_start_nested(&request, TCA_BASIC_POLICE);
4024 nl_msg_put_unspec(&request, TCA_POLICE_TBF, &tc_police, sizeof tc_police);
4025 tc_put_rtab(&request, TCA_POLICE_RATE, &tc_police.rate);
4026 nl_msg_end_nested(&request, police_offset);
4027 nl_msg_end_nested(&request, basic_offset);
4029 error = tc_transact(&request, NULL);
4040 /* The values in psched are not individually very meaningful, but they are
4041 * important. The tables below show some values seen in the wild.
4045 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
4046 * (Before that, there are hints that it was 1000000000.)
4048 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
4052 * -----------------------------------
4053 * [1] 000c8000 000f4240 000f4240 00000064
4054 * [2] 000003e8 00000400 000f4240 3b9aca00
4055 * [3] 000003e8 00000400 000f4240 3b9aca00
4056 * [4] 000003e8 00000400 000f4240 00000064
4057 * [5] 000003e8 00000040 000f4240 3b9aca00
4058 * [6] 000003e8 00000040 000f4240 000000f9
4060 * a b c d ticks_per_s buffer_hz
4061 * ------- --------- ---------- ------------- ----------- -------------
4062 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
4063 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
4064 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
4065 * [4] 1,000 1,024 1,000,000 100 976,562 100
4066 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
4067 * [6] 1,000 64 1,000,000 249 15,625,000 249
4069 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
4070 * [2] 2.6.26-1-686-bigmem from Debian lenny
4071 * [3] 2.6.26-2-sparc64 from Debian lenny
4072 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
4073 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
4074 * [6] 2.6.34 from kernel.org on KVM
4076 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
4077 static const char fn[] = "/proc/net/psched";
4078 unsigned int a, b, c, d;
4081 if (!ovsthread_once_start(&once)) {
4088 stream = fopen(fn, "r");
4090 VLOG_WARN("%s: open failed: %s", fn, ovs_strerror(errno));
4094 if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
4095 VLOG_WARN("%s: read failed", fn);
4099 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
4103 VLOG_WARN("%s: invalid scheduler parameters", fn);
4107 ticks_per_s = (double) a * c / b;
4111 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
4114 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);
4117 ovsthread_once_done(&once);
4120 /* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
4121 * rate of 'rate' bytes per second. */
4123 tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
4126 return (rate * ticks) / ticks_per_s;
4129 /* Returns the number of ticks that it would take to transmit 'size' bytes at a
4130 * rate of 'rate' bytes per second. */
4132 tc_bytes_to_ticks(unsigned int rate, unsigned int size)
4135 return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
4138 /* Returns the number of bytes that need to be reserved for qdisc buffering at
4139 * a transmission rate of 'rate' bytes per second. */
4141 tc_buffer_per_jiffy(unsigned int rate)
4144 return rate / buffer_hz;
4147 /* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
4148 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
4149 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
4150 * stores NULL into it if it is absent.
4152 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
4155 * Returns 0 if successful, otherwise a positive errno value. */
4157 tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
4158 struct nlattr **options)
4160 static const struct nl_policy tca_policy[] = {
4161 [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
4162 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
4164 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
4166 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
4167 tca_policy, ta, ARRAY_SIZE(ta))) {
4168 VLOG_WARN_RL(&rl, "failed to parse qdisc message");
4173 *kind = nl_attr_get_string(ta[TCA_KIND]);
4177 *options = ta[TCA_OPTIONS];
4192 /* Given Netlink 'msg' that describes a class, extracts the queue ID (e.g. the
4193 * minor number of its class ID) into '*queue_id', its TCA_OPTIONS attribute
4194 * into '*options', and its queue statistics into '*stats'. Any of the output
4195 * arguments may be null.
4197 * Returns 0 if successful, otherwise a positive errno value. */
4199 tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
4200 struct nlattr **options, struct netdev_queue_stats *stats)
4202 static const struct nl_policy tca_policy[] = {
4203 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
4204 [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
4206 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
4208 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
4209 tca_policy, ta, ARRAY_SIZE(ta))) {
4210 VLOG_WARN_RL(&rl, "failed to parse class message");
4215 struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
4216 *handlep = tc->tcm_handle;
4220 *options = ta[TCA_OPTIONS];
4224 const struct gnet_stats_queue *gsq;
4225 struct gnet_stats_basic gsb;
4227 static const struct nl_policy stats_policy[] = {
4228 [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
4229 .min_len = sizeof gsb },
4230 [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
4231 .min_len = sizeof *gsq },
4233 struct nlattr *sa[ARRAY_SIZE(stats_policy)];
4235 if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
4236 sa, ARRAY_SIZE(sa))) {
4237 VLOG_WARN_RL(&rl, "failed to parse class stats");
4241 /* Alignment issues screw up the length of struct gnet_stats_basic on
4242 * some arch/bitsize combinations. Newer versions of Linux have a
4243 * struct gnet_stats_basic_packed, but we can't depend on that. The
4244 * easiest thing to do is just to make a copy. */
4245 memset(&gsb, 0, sizeof gsb);
4246 memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
4247 MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
4248 stats->tx_bytes = gsb.bytes;
4249 stats->tx_packets = gsb.packets;
4251 gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
4252 stats->tx_errors = gsq->drops;
4262 memset(stats, 0, sizeof *stats);
4267 /* Queries the kernel for class with identifier 'handle' and parent 'parent'
4270 tc_query_class(const struct netdev *netdev,
4271 unsigned int handle, unsigned int parent,
4272 struct ofpbuf **replyp)
4274 struct ofpbuf request;
4275 struct tcmsg *tcmsg;
4278 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO, &request);
4282 tcmsg->tcm_handle = handle;
4283 tcmsg->tcm_parent = parent;
4285 error = tc_transact(&request, replyp);
4287 VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
4288 netdev_get_name(netdev),
4289 tc_get_major(handle), tc_get_minor(handle),
4290 tc_get_major(parent), tc_get_minor(parent),
4291 ovs_strerror(error));
4296 /* Equivalent to "tc class del dev <name> handle <handle>". */
4298 tc_delete_class(const struct netdev *netdev, unsigned int handle)
4300 struct ofpbuf request;
4301 struct tcmsg *tcmsg;
4304 tcmsg = tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
4308 tcmsg->tcm_handle = handle;
4309 tcmsg->tcm_parent = 0;
4311 error = tc_transact(&request, NULL);
4313 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
4314 netdev_get_name(netdev),
4315 tc_get_major(handle), tc_get_minor(handle),
4316 ovs_strerror(error));
4321 /* Equivalent to "tc qdisc del dev <name> root". */
4323 tc_del_qdisc(struct netdev *netdev_)
4325 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4326 struct ofpbuf request;
4327 struct tcmsg *tcmsg;
4330 tcmsg = tc_make_request(netdev_, RTM_DELQDISC, 0, &request);
4334 tcmsg->tcm_handle = tc_make_handle(1, 0);
4335 tcmsg->tcm_parent = TC_H_ROOT;
4337 error = tc_transact(&request, NULL);
4338 if (error == EINVAL) {
4339 /* EINVAL probably means that the default qdisc was in use, in which
4340 * case we've accomplished our purpose. */
4343 if (!error && netdev->tc) {
4344 if (netdev->tc->ops->tc_destroy) {
4345 netdev->tc->ops->tc_destroy(netdev->tc);
4352 /* If 'netdev''s qdisc type and parameters are not yet known, queries the
4353 * kernel to determine what they are. Returns 0 if successful, otherwise a
4354 * positive errno value. */
4356 tc_query_qdisc(const struct netdev *netdev_)
4358 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4359 struct ofpbuf request, *qdisc;
4360 const struct tc_ops *ops;
4361 struct tcmsg *tcmsg;
4369 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
4370 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
4371 * 2.6.35 without that fix backported to it.
4373 * To avoid the OOPS, we must not make a request that would attempt to dump
4374 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
4375 * few others. There are a few ways that I can see to do this, but most of
4376 * them seem to be racy (and if you lose the race the kernel OOPSes). The
4377 * technique chosen here is to assume that any non-default qdisc that we
4378 * create will have a class with handle 1:0. The built-in qdiscs only have
4379 * a class with handle 0:0.
4381 * We could check for Linux 2.6.35+ and use a more straightforward method
4383 tcmsg = tc_make_request(netdev_, RTM_GETQDISC, NLM_F_ECHO, &request);
4387 tcmsg->tcm_handle = tc_make_handle(1, 0);
4388 tcmsg->tcm_parent = 0;
4390 /* Figure out what tc class to instantiate. */
4391 error = tc_transact(&request, &qdisc);
4395 error = tc_parse_qdisc(qdisc, &kind, NULL);
4397 ops = &tc_ops_other;
4399 ops = tc_lookup_linux_name(kind);
4401 static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
4402 VLOG_INFO_RL(&rl2, "unknown qdisc \"%s\"", kind);
4404 ops = &tc_ops_other;
4407 } else if (error == ENOENT) {
4408 /* Either it's a built-in qdisc, or it's a qdisc set up by some
4409 * other entity that doesn't have a handle 1:0. We will assume
4410 * that it's the system default qdisc. */
4411 ops = &tc_ops_default;
4414 /* Who knows? Maybe the device got deleted. */
4415 VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
4416 netdev_get_name(netdev_), ovs_strerror(error));
4417 ops = &tc_ops_other;
4420 /* Instantiate it. */
4421 load_error = ops->tc_load(CONST_CAST(struct netdev *, netdev_), qdisc);
4422 ovs_assert((load_error == 0) == (netdev->tc != NULL));
4423 ofpbuf_delete(qdisc);
4425 return error ? error : load_error;
4428 /* Linux traffic control uses tables with 256 entries ("rtab" tables) to
4429 approximate the time to transmit packets of various lengths. For an MTU of
4430 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
4431 represents two possible packet lengths; for a MTU of 513 through 1024, four
4432 possible lengths; and so on.
4434 Returns, for the specified 'mtu', the number of bits that packet lengths
4435 need to be shifted right to fit within such a 256-entry table. */
4437 tc_calc_cell_log(unsigned int mtu)
4442 mtu = ETH_PAYLOAD_MAX;
4444 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
4446 for (cell_log = 0; mtu >= 256; cell_log++) {
4453 /* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
4456 tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
4458 memset(rate, 0, sizeof *rate);
4459 rate->cell_log = tc_calc_cell_log(mtu);
4460 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
4461 /* rate->cell_align = 0; */ /* distro headers. */
4462 rate->mpu = ETH_TOTAL_MIN;
/* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
 * attribute of the specified "type".
 *
 * A 256-entry "rtab" table maps each (shifted) packet length to the number of
 * tc ticks needed to transmit it at 'rate'.  See tc_calc_cell_log() above for
 * a description of "rtab"s. */
static void
tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
{
    uint32_t *rtab;
    unsigned int i;

    rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
    for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
        unsigned packet_size = (i + 1) << rate->cell_log;
        if (packet_size < rate->mpu) {
            packet_size = rate->mpu;    /* Never bill below the minimum. */
        }
        rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
    }
}
/* Calculates the proper value of 'buffer' or 'cbuffer' in HTB options given a
 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
 * burst size of 'burst_bytes'.  (If no value was requested, a 'burst_bytes' of
 * 0 is fine.) */
static unsigned int
tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
{
    unsigned int min_burst = tc_buffer_per_jiffy(Bps) + mtu;
    return tc_bytes_to_ticks(Bps, MAX(burst_bytes, min_burst));
}
4497 /* Linux-only functions declared in netdev-linux.h */
4499 /* Modifies the 'flag' bit in ethtool's flags field for 'netdev'. If
4500 * 'enable' is true, the bit is set. Otherwise, it is cleared. */
4502 netdev_linux_ethtool_set_flag(struct netdev *netdev, uint32_t flag,
4503 const char *flag_name, bool enable)
4505 const char *netdev_name = netdev_get_name(netdev);
4506 struct ethtool_value evalue;
4510 COVERAGE_INC(netdev_get_ethtool);
4511 memset(&evalue, 0, sizeof evalue);
4512 error = netdev_linux_do_ethtool(netdev_name,
4513 (struct ethtool_cmd *)&evalue,
4514 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
4519 COVERAGE_INC(netdev_set_ethtool);
4520 evalue.data = new_flags = (evalue.data & ~flag) | (enable ? flag : 0);
4521 error = netdev_linux_do_ethtool(netdev_name,
4522 (struct ethtool_cmd *)&evalue,
4523 ETHTOOL_SFLAGS, "ETHTOOL_SFLAGS");
4528 COVERAGE_INC(netdev_get_ethtool);
4529 memset(&evalue, 0, sizeof evalue);
4530 error = netdev_linux_do_ethtool(netdev_name,
4531 (struct ethtool_cmd *)&evalue,
4532 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
4537 if (new_flags != evalue.data) {
4538 VLOG_WARN_RL(&rl, "attempt to %s ethtool %s flag on network "
4539 "device %s failed", enable ? "enable" : "disable",
4540 flag_name, netdev_name);
4547 /* Utility functions. */
4549 /* Copies 'src' into 'dst', performing format conversion in the process. */
4551 netdev_stats_from_rtnl_link_stats(struct netdev_stats *dst,
4552 const struct rtnl_link_stats *src)
4554 dst->rx_packets = src->rx_packets;
4555 dst->tx_packets = src->tx_packets;
4556 dst->rx_bytes = src->rx_bytes;
4557 dst->tx_bytes = src->tx_bytes;
4558 dst->rx_errors = src->rx_errors;
4559 dst->tx_errors = src->tx_errors;
4560 dst->rx_dropped = src->rx_dropped;
4561 dst->tx_dropped = src->tx_dropped;
4562 dst->multicast = src->multicast;
4563 dst->collisions = src->collisions;
4564 dst->rx_length_errors = src->rx_length_errors;
4565 dst->rx_over_errors = src->rx_over_errors;
4566 dst->rx_crc_errors = src->rx_crc_errors;
4567 dst->rx_frame_errors = src->rx_frame_errors;
4568 dst->rx_fifo_errors = src->rx_fifo_errors;
4569 dst->rx_missed_errors = src->rx_missed_errors;
4570 dst->tx_aborted_errors = src->tx_aborted_errors;
4571 dst->tx_carrier_errors = src->tx_carrier_errors;
4572 dst->tx_fifo_errors = src->tx_fifo_errors;
4573 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
4574 dst->tx_window_errors = src->tx_window_errors;
4578 get_stats_via_netlink(const struct netdev *netdev_, struct netdev_stats *stats)
4580 struct ofpbuf request;
4581 struct ofpbuf *reply;
4584 ofpbuf_init(&request, 0);
4585 nl_msg_put_nlmsghdr(&request,
4586 sizeof(struct ifinfomsg) + NL_ATTR_SIZE(IFNAMSIZ),
4587 RTM_GETLINK, NLM_F_REQUEST);
4588 ofpbuf_put_zeros(&request, sizeof(struct ifinfomsg));
4589 nl_msg_put_string(&request, IFLA_IFNAME, netdev_get_name(netdev_));
4590 error = nl_transact(NETLINK_ROUTE, &request, &reply);
4591 ofpbuf_uninit(&request);
4596 if (ofpbuf_try_pull(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg))) {
4597 const struct nlattr *a = nl_attr_find(reply, 0, IFLA_STATS);
4598 if (a && nl_attr_get_size(a) >= sizeof(struct rtnl_link_stats)) {
4599 netdev_stats_from_rtnl_link_stats(stats, nl_attr_get(a));
4602 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
4606 VLOG_WARN_RL(&rl, "short RTM_GETLINK reply");
4611 ofpbuf_delete(reply);
4616 get_flags(const struct netdev *dev, unsigned int *flags)
4622 error = af_inet_ifreq_ioctl(dev->name, &ifr, SIOCGIFFLAGS, "SIOCGIFFLAGS");
4624 *flags = ifr.ifr_flags;
4630 set_flags(const char *name, unsigned int flags)
4634 ifr.ifr_flags = flags;
4635 return af_inet_ifreq_ioctl(name, &ifr, SIOCSIFFLAGS, "SIOCSIFFLAGS");
4639 do_get_ifindex(const char *netdev_name)
4644 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4645 COVERAGE_INC(netdev_get_ifindex);
4647 error = af_inet_ioctl(SIOCGIFINDEX, &ifr);
4649 VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
4650 netdev_name, ovs_strerror(error));
4653 return ifr.ifr_ifindex;
4657 get_ifindex(const struct netdev *netdev_, int *ifindexp)
4659 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4661 if (!(netdev->cache_valid & VALID_IFINDEX)) {
4662 int ifindex = do_get_ifindex(netdev_get_name(netdev_));
4665 netdev->get_ifindex_error = -ifindex;
4666 netdev->ifindex = 0;
4668 netdev->get_ifindex_error = 0;
4669 netdev->ifindex = ifindex;
4671 netdev->cache_valid |= VALID_IFINDEX;
4674 *ifindexp = netdev->ifindex;
4675 return netdev->get_ifindex_error;
4679 get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN])
4685 memset(&ifr, 0, sizeof ifr);
4686 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4687 COVERAGE_INC(netdev_get_hwaddr);
4688 error = af_inet_ioctl(SIOCGIFHWADDR, &ifr);
4690 /* ENODEV probably means that a vif disappeared asynchronously and
4691 * hasn't been removed from the database yet, so reduce the log level
4692 * to INFO for that case. */
4693 VLOG(error == ENODEV ? VLL_INFO : VLL_ERR,
4694 "ioctl(SIOCGIFHWADDR) on %s device failed: %s",
4695 netdev_name, ovs_strerror(error));
4698 hwaddr_family = ifr.ifr_hwaddr.sa_family;
4699 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER) {
4700 VLOG_WARN("%s device has unknown hardware address family %d",
4701 netdev_name, hwaddr_family);
4703 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
4708 set_etheraddr(const char *netdev_name,
4709 const uint8_t mac[ETH_ADDR_LEN])
4714 memset(&ifr, 0, sizeof ifr);
4715 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4716 ifr.ifr_hwaddr.sa_family = ARPHRD_ETHER;
4717 memcpy(ifr.ifr_hwaddr.sa_data, mac, ETH_ADDR_LEN);
4718 COVERAGE_INC(netdev_set_hwaddr);
4719 error = af_inet_ioctl(SIOCSIFHWADDR, &ifr);
4721 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
4722 netdev_name, ovs_strerror(error));
4728 netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
4729 int cmd, const char *cmd_name)
4734 memset(&ifr, 0, sizeof ifr);
4735 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
4736 ifr.ifr_data = (caddr_t) ecmd;
4739 error = af_inet_ioctl(SIOCETHTOOL, &ifr);
4741 if (error != EOPNOTSUPP) {
4742 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
4743 "failed: %s", cmd_name, name, ovs_strerror(error));
4745 /* The device doesn't support this operation. That's pretty
4746 * common, so there's no point in logging anything. */
4753 netdev_linux_get_ipv4(const struct netdev *netdev, struct in_addr *ip,
4754 int cmd, const char *cmd_name)
4759 ifr.ifr_addr.sa_family = AF_INET;
4760 error = af_inet_ifreq_ioctl(netdev_get_name(netdev), &ifr, cmd, cmd_name);
4762 const struct sockaddr_in *sin = ALIGNED_CAST(struct sockaddr_in *,
4764 *ip = sin->sin_addr;
4769 /* Returns an AF_PACKET raw socket or a negative errno value. */
4771 af_packet_sock(void)
4773 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
4776 if (ovsthread_once_start(&once)) {
4777 sock = socket(AF_PACKET, SOCK_RAW, 0);
4779 int error = set_nonblocking(sock);
4786 VLOG_ERR("failed to create packet socket: %s",
4787 ovs_strerror(errno));
4789 ovsthread_once_done(&once);