2 * Copyright (c) 2009, 2010, 2011, 2012, 2013, 2014 Nicira, Inc.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
19 #include "netdev-linux.h"
23 #include <arpa/inet.h>
25 #include <linux/filter.h>
26 #include <linux/gen_stats.h>
27 #include <linux/if_ether.h>
28 #include <linux/if_tun.h>
29 #include <linux/types.h>
30 #include <linux/ethtool.h>
31 #include <linux/mii.h>
32 #include <linux/pkt_cls.h>
33 #include <linux/pkt_sched.h>
34 #include <linux/rtnetlink.h>
35 #include <linux/sockios.h>
36 #include <sys/types.h>
37 #include <sys/ioctl.h>
38 #include <sys/socket.h>
39 #include <netpacket/packet.h>
41 #include <net/if_arp.h>
42 #include <net/if_packet.h>
43 #include <net/route.h>
44 #include <netinet/in.h>
50 #include "connectivity.h"
52 #include "dpif-linux.h"
53 #include "dpif-netdev.h"
54 #include "dynamic-string.h"
55 #include "fatal-signal.h"
58 #include "netdev-provider.h"
59 #include "netdev-vport.h"
60 #include "netlink-notifier.h"
61 #include "netlink-socket.h"
64 #include "openflow/openflow.h"
65 #include "ovs-atomic.h"
67 #include "poll-loop.h"
68 #include "rtnetlink-link.h"
71 #include "socket-util.h"
74 #include "unaligned.h"
77 VLOG_DEFINE_THIS_MODULE(netdev_linux);
79 COVERAGE_DEFINE(netdev_set_policing);
80 COVERAGE_DEFINE(netdev_arp_lookup);
81 COVERAGE_DEFINE(netdev_get_ifindex);
82 COVERAGE_DEFINE(netdev_get_hwaddr);
83 COVERAGE_DEFINE(netdev_set_hwaddr);
84 COVERAGE_DEFINE(netdev_get_ethtool);
85 COVERAGE_DEFINE(netdev_set_ethtool);
88 /* These were introduced in Linux 2.6.14, so they might be missing if we have
90 #ifndef ADVERTISED_Pause
91 #define ADVERTISED_Pause (1 << 13)
93 #ifndef ADVERTISED_Asym_Pause
94 #define ADVERTISED_Asym_Pause (1 << 14)
97 /* These were introduced in Linux 2.6.24, so they might be missing if we
98 * have old headers. */
99 #ifndef ETHTOOL_GFLAGS
100 #define ETHTOOL_GFLAGS 0x00000025 /* Get flags bitmap(ethtool_value) */
102 #ifndef ETHTOOL_SFLAGS
103 #define ETHTOOL_SFLAGS 0x00000026 /* Set flags bitmap(ethtool_value) */
106 /* This was introduced in Linux 2.6.25, so it might be missing if we have old
109 #define TC_RTAB_SIZE 1024
112 /* Linux 2.6.21 introduced struct tpacket_auxdata.
113 * Linux 2.6.27 added the tp_vlan_tci member.
114 * Linux 3.0 defined TP_STATUS_VLAN_VALID.
115 * Linux 3.13 repurposed a padding member for tp_vlan_tpid and defined
116 * TP_STATUS_VLAN_TPID_VALID.
118 * With all this churn it's easiest to unconditionally define a replacement
119 * structure that has everything we want.
121 #ifndef PACKET_AUXDATA
122 #define PACKET_AUXDATA 8
124 #ifndef TP_STATUS_VLAN_VALID
125 #define TP_STATUS_VLAN_VALID (1 << 4)
127 #ifndef TP_STATUS_VLAN_TPID_VALID
128 #define TP_STATUS_VLAN_TPID_VALID (1 << 6)
130 #undef tpacket_auxdata
131 #define tpacket_auxdata rpl_tpacket_auxdata
132 struct tpacket_auxdata {
138 uint16_t tp_vlan_tci;
139 uint16_t tp_vlan_tpid;
143 VALID_IFINDEX = 1 << 0,
144 VALID_ETHERADDR = 1 << 1,
148 VALID_POLICING = 1 << 5,
149 VALID_VPORT_STAT_ERROR = 1 << 6,
150 VALID_DRVINFO = 1 << 7,
151 VALID_FEATURES = 1 << 8,
154 /* Traffic control. */
156 /* An instance of a traffic control class. Always associated with a particular
159 * Each TC implementation subclasses this with whatever additional data it
162 const struct tc_ops *ops;
163 struct hmap queues; /* Contains "struct tc_queue"s.
164 * Read by generic TC layer.
165 * Written only by TC implementation. */
168 #define TC_INITIALIZER(TC, OPS) { OPS, HMAP_INITIALIZER(&(TC)->queues) }
170 /* One traffic control queue.
172 * Each TC implementation subclasses this with whatever additional data it
175 struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
176 unsigned int queue_id; /* OpenFlow queue ID. */
177 long long int created; /* Time queue was created, in msecs. */
180 /* A particular kind of traffic control. Each implementation generally maps to
181 * one particular Linux qdisc class.
183 * The functions below return 0 if successful or a positive errno value on
184 * failure, except where otherwise noted. All of them must be provided, except
185 * where otherwise noted. */
187 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
188 * This is null for tc_ops_default and tc_ops_other, for which there are no
189 * appropriate values. */
190 const char *linux_name;
192 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
193 const char *ovs_name;
195 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
196 * queues. The queues are numbered 0 through n_queues - 1. */
197 unsigned int n_queues;
199 /* Called to install this TC class on 'netdev'. The implementation should
200 * make the Netlink calls required to set up 'netdev' with the right qdisc
201 * and configure it according to 'details'. The implementation may assume
202 * that the current qdisc is the default; that is, there is no need for it
203 * to delete the current qdisc before installing itself.
205 * The contents of 'details' should be documented as valid for 'ovs_name'
206 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
207 * (which is built as ovs-vswitchd.conf.db(8)).
209 * This function must return 0 if and only if it sets 'netdev->tc' to an
210 * initialized 'struct tc'.
212 * (This function is null for tc_ops_other, which cannot be installed. For
213 * other TC classes it should always be nonnull.) */
214 int (*tc_install)(struct netdev *netdev, const struct smap *details);
216 /* Called when the netdev code determines (through a Netlink query) that
217 * this TC class's qdisc is installed on 'netdev', but we didn't install
218 * it ourselves and so don't know any of the details.
220 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
221 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
222 * implementation should parse the other attributes of 'nlmsg' as
223 * necessary to determine its configuration. If necessary it should also
224 * use Netlink queries to determine the configuration of queues on
227 * This function must return 0 if and only if it sets 'netdev->tc' to an
228 * initialized 'struct tc'. */
229 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
231 /* Destroys the data structures allocated by the implementation as part of
232 * 'tc'. (This includes destroying 'tc->queues' by calling
235 * The implementation should not need to perform any Netlink calls. If
236 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
237 * (But it may not be desirable.)
239 * This function may be null if 'tc' is trivial. */
240 void (*tc_destroy)(struct tc *tc);
242 /* Retrieves details of 'netdev->tc' configuration into 'details'.
244 * The implementation should not need to perform any Netlink calls, because
245 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
246 * cached the configuration.
248 * The contents of 'details' should be documented as valid for 'ovs_name'
249 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
250 * (which is built as ovs-vswitchd.conf.db(8)).
252 * This function may be null if 'tc' is not configurable.
254 int (*qdisc_get)(const struct netdev *netdev, struct smap *details);
256 /* Reconfigures 'netdev->tc' according to 'details', performing any
257 * required Netlink calls to complete the reconfiguration.
259 * The contents of 'details' should be documented as valid for 'ovs_name'
260 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
261 * (which is built as ovs-vswitchd.conf.db(8)).
263 * This function may be null if 'tc' is not configurable.
265 int (*qdisc_set)(struct netdev *, const struct smap *details);
267 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
268 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
270 * The contents of 'details' should be documented as valid for 'ovs_name'
271 * in the "other_config" column in the "Queue" table in
272 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
274 * The implementation should not need to perform any Netlink calls, because
275 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
276 * cached the queue configuration.
278 * This function may be null if 'tc' does not have queues ('n_queues' is
280 int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
281 struct smap *details);
283 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
284 * 'details', perfoming any required Netlink calls to complete the
285 * reconfiguration. The caller ensures that 'queue_id' is less than
288 * The contents of 'details' should be documented as valid for 'ovs_name'
289 * in the "other_config" column in the "Queue" table in
290 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
292 * This function may be null if 'tc' does not have queues or its queues are
293 * not configurable. */
294 int (*class_set)(struct netdev *, unsigned int queue_id,
295 const struct smap *details);
297 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
298 * tc_queue's within 'netdev->tc->queues'.
300 * This function may be null if 'tc' does not have queues or its queues
301 * cannot be deleted. */
302 int (*class_delete)(struct netdev *, struct tc_queue *queue);
304 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
305 * 'struct tc_queue's within 'netdev->tc->queues'.
307 * On success, initializes '*stats'.
309 * This function may be null if 'tc' does not have queues or if it cannot
310 * report queue statistics. */
311 int (*class_get_stats)(const struct netdev *netdev,
312 const struct tc_queue *queue,
313 struct netdev_queue_stats *stats);
315 /* Extracts queue stats from 'nlmsg', which is a response to a
316 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
318 * This function may be null if 'tc' does not have queues or if it cannot
319 * report queue statistics. */
320 int (*class_dump_stats)(const struct netdev *netdev,
321 const struct ofpbuf *nlmsg,
322 netdev_dump_queue_stats_cb *cb, void *aux);
/* Initializes the generic part of 'tc': records 'ops' and sets up the
 * (initially empty) map of queues.  (Some lines of this definition are
 * elided in this extract.) */
326 tc_init(struct tc *tc, const struct tc_ops *ops)
329 hmap_init(&tc->queues);
/* Releases the generic part of 'tc'.  Presumably the TC implementation has
 * already emptied 'tc->queues' — TODO confirm against callers. */
333 tc_destroy(struct tc *tc)
335 hmap_destroy(&tc->queues);
338 static const struct tc_ops tc_ops_htb;
339 static const struct tc_ops tc_ops_hfsc;
340 static const struct tc_ops tc_ops_default;
341 static const struct tc_ops tc_ops_other;
343 static const struct tc_ops *const tcs[] = {
344 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
345 &tc_ops_hfsc, /* Hierarchical fair service curve. */
346 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
347 &tc_ops_other, /* Some other qdisc. */
351 static unsigned int tc_make_handle(unsigned int major, unsigned int minor);
352 static unsigned int tc_get_major(unsigned int handle);
353 static unsigned int tc_get_minor(unsigned int handle);
355 static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
356 static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
357 static unsigned int tc_buffer_per_jiffy(unsigned int rate);
359 static struct tcmsg *tc_make_request(const struct netdev *, int type,
360 unsigned int flags, struct ofpbuf *);
361 static int tc_transact(struct ofpbuf *request, struct ofpbuf **replyp);
362 static int tc_add_del_ingress_qdisc(struct netdev *netdev, bool add);
363 static int tc_add_policer(struct netdev *netdev, int kbits_rate,
366 static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
367 struct nlattr **options);
368 static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
369 struct nlattr **options,
370 struct netdev_queue_stats *);
371 static int tc_query_class(const struct netdev *,
372 unsigned int handle, unsigned int parent,
373 struct ofpbuf **replyp);
374 static int tc_delete_class(const struct netdev *, unsigned int handle);
376 static int tc_del_qdisc(struct netdev *netdev);
377 static int tc_query_qdisc(const struct netdev *netdev);
379 static int tc_calc_cell_log(unsigned int mtu);
380 static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
381 static void tc_put_rtab(struct ofpbuf *, uint16_t type,
382 const struct tc_ratespec *rate);
383 static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
385 struct netdev_linux {
388 /* Protects all members below. */
389 struct ovs_mutex mutex;
391 unsigned int cache_valid;
393 bool miimon; /* Link status of last poll. */
394 long long int miimon_interval; /* Miimon Poll rate. Disabled if <= 0. */
395 struct timer miimon_timer;
397 /* The following are figured out "on demand" only. They are only valid
398 * when the corresponding VALID_* bit in 'cache_valid' is set. */
400 uint8_t etheraddr[ETH_ADDR_LEN];
401 struct in_addr address, netmask;
404 unsigned int ifi_flags;
405 long long int carrier_resets;
406 uint32_t kbits_rate; /* Policing data. */
407 uint32_t kbits_burst;
408 int vport_stats_error; /* Cached error code from vport_get_stats().
409 0 or an errno value. */
410 int netdev_mtu_error; /* Cached error code from SIOCGIFMTU or SIOCSIFMTU. */
411 int ether_addr_error; /* Cached error code from set/get etheraddr. */
412 int netdev_policing_error; /* Cached error code from set policing. */
413 int get_features_error; /* Cached error code from ETHTOOL_GSET. */
414 int get_ifindex_error; /* Cached error code from SIOCGIFINDEX. */
416 enum netdev_features current; /* Cached from ETHTOOL_GSET. */
417 enum netdev_features advertised; /* Cached from ETHTOOL_GSET. */
418 enum netdev_features supported; /* Cached from ETHTOOL_GSET. */
420 struct ethtool_drvinfo drvinfo; /* Cached from ETHTOOL_GDRVINFO. */
423 /* For devices of class netdev_tap_class only. */
427 struct netdev_rxq_linux {
428 struct netdev_rxq up;
433 /* This is set pretty low because we probably won't learn anything from the
434 * additional log messages. */
435 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
437 /* Polling miimon status for all ports causes performance degradation when
438 * handling a large number of ports. If there are no devices using miimon, then
439 * we skip netdev_linux_miimon_run() and netdev_linux_miimon_wait(). */
440 static atomic_int miimon_cnt = ATOMIC_VAR_INIT(0);
442 static void netdev_linux_run(void);
444 static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
445 int cmd, const char *cmd_name);
446 static int netdev_linux_get_ipv4(const struct netdev *, struct in_addr *,
447 int cmd, const char *cmd_name);
448 static int get_flags(const struct netdev *, unsigned int *flags);
449 static int set_flags(const char *, unsigned int flags);
450 static int update_flags(struct netdev_linux *netdev, enum netdev_flags off,
451 enum netdev_flags on, enum netdev_flags *old_flagsp)
452 OVS_REQUIRES(netdev->mutex);
453 static int do_get_ifindex(const char *netdev_name);
454 static int get_ifindex(const struct netdev *, int *ifindexp);
455 static int do_set_addr(struct netdev *netdev,
456 int ioctl_nr, const char *ioctl_name,
457 struct in_addr addr);
458 static int get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN]);
459 static int set_etheraddr(const char *netdev_name, const uint8_t[ETH_ADDR_LEN]);
460 static int get_stats_via_netlink(const struct netdev *, struct netdev_stats *);
461 static int af_packet_sock(void);
462 static bool netdev_linux_miimon_enabled(void);
463 static void netdev_linux_miimon_run(void);
464 static void netdev_linux_miimon_wait(void);
465 static int netdev_linux_get_mtu__(struct netdev_linux *netdev, int *mtup);
/* Returns true if 'netdev_class' is one of the Linux-backed classes in this
 * file, identified by sharing this file's 'run' callback. */
468 is_netdev_linux_class(const struct netdev_class *netdev_class)
470 return netdev_class->run == netdev_linux_class;
/* Returns true if 'netdev' is specifically a tap device (as opposed to a
 * system or internal device). */
474 is_tap_netdev(const struct netdev *netdev)
476 return netdev_get_class(netdev) == &netdev_tap_class;
/* Downcasts generic 'netdev' to its netdev_linux container.  Asserts that
 * the class really is one of ours before trusting the layout. */
479 static struct netdev_linux *
480 netdev_linux_cast(const struct netdev *netdev)
482 ovs_assert(is_netdev_linux_class(netdev_get_class(netdev)));
484 return CONTAINER_OF(netdev, struct netdev_linux, up);
/* Downcasts generic 'rx' queue to its netdev_rxq_linux container, with the
 * same class sanity check as netdev_linux_cast(). */
487 static struct netdev_rxq_linux *
488 netdev_rxq_linux_cast(const struct netdev_rxq *rx)
490 ovs_assert(is_netdev_linux_class(netdev_get_class(rx->netdev)));
491 return CONTAINER_OF(rx, struct netdev_rxq_linux, up);
494 static void netdev_linux_update(struct netdev_linux *netdev,
495 const struct rtnetlink_link_change *)
496 OVS_REQUIRES(netdev->mutex);
497 static void netdev_linux_changed(struct netdev_linux *netdev,
498 unsigned int ifi_flags, unsigned int mask)
499 OVS_REQUIRES(netdev->mutex);
501 /* Returns a NETLINK_ROUTE socket listening for RTNLGRP_LINK changes, or NULL
502 * if no such socket could be created. */
503 static struct nl_sock *
504 netdev_linux_notify_sock(void)
506 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
507 static struct nl_sock *sock;
/* Create the socket exactly once, on first call, guarded by 'once';
 * subsequent callers just get the cached pointer. */
509 if (ovsthread_once_start(&once)) {
512 error = nl_sock_create(NETLINK_ROUTE, &sock);
514 error = nl_sock_join_mcgroup(sock, RTNLGRP_LINK);
/* On a join failure the socket is destroyed (presumably 'sock' is also
 * reset to NULL in an elided line — TODO confirm). */
516 nl_sock_destroy(sock);
520 ovsthread_once_done(&once);
/* Returns true if any netdev currently has miimon polling enabled, read from
 * the global atomic counter 'miimon_cnt'. */
527 netdev_linux_miimon_enabled(void)
531 atomic_read(&miimon_cnt, &miimon);
/* Per-class 'run' callback: services miimon timers (if any device uses
 * miimon) and drains the shared RTNLGRP_LINK netlink socket, pushing each
 * link-change message into the corresponding netdev's cached state. */
536 netdev_linux_run(void)
538 struct nl_sock *sock;
541 if (netdev_linux_miimon_enabled()) {
542 netdev_linux_miimon_run();
545 sock = netdev_linux_notify_sock();
551 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
552 uint64_t buf_stub[4096 / 8];
/* Stack-allocated stub avoids a heap allocation for typical messages. */
555 ofpbuf_use_stub(&buf, buf_stub, sizeof buf_stub);
556 error = nl_sock_recv(sock, &buf, false);
558 struct rtnetlink_link_change change;
560 if (rtnetlink_link_parse(&buf, &change)) {
561 struct netdev *netdev_ = netdev_from_name(change.ifname);
562 if (netdev_ && is_netdev_linux_class(netdev_->netdev_class)) {
563 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
/* Apply the change under the per-device mutex that protects the
 * cached fields it updates. */
565 ovs_mutex_lock(&netdev->mutex);
566 netdev_linux_update(netdev, &change);
567 ovs_mutex_unlock(&netdev->mutex);
569 netdev_close(netdev_);
/* ENOBUFS means the kernel dropped notifications; we may have missed
 * changes, so re-poll flags on every known device to resync. */
571 } else if (error == ENOBUFS) {
572 struct shash device_shash;
573 struct shash_node *node;
577 shash_init(&device_shash);
578 netdev_get_devices(&netdev_linux_class, &device_shash);
579 SHASH_FOR_EACH (node, &device_shash) {
580 struct netdev *netdev_ = node->data;
581 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
584 ovs_mutex_lock(&netdev->mutex);
/* Return value of get_flags() is deliberately ignored here; on
 * failure 'flags' is presumably left zeroed — TODO confirm the
 * elided declaration of 'flags'. */
585 get_flags(netdev_, &flags);
586 netdev_linux_changed(netdev, flags, 0);
587 ovs_mutex_unlock(&netdev->mutex);
589 netdev_close(netdev_);
591 shash_destroy(&device_shash);
592 } else if (error != EAGAIN) {
593 VLOG_WARN_RL(&rl, "error reading or parsing netlink (%s)",
594 ovs_strerror(error));
/* Per-class 'wait' callback: arranges for poll_block() to wake when the
 * miimon timer expires (if miimon is in use) or when the shared netlink
 * notification socket becomes readable. */
601 netdev_linux_wait(void)
603 struct nl_sock *sock;
605 if (netdev_linux_miimon_enabled()) {
606 netdev_linux_miimon_wait();
608 sock = netdev_linux_notify_sock();
610 nl_sock_wait(sock, POLLIN);
/* Records a change to 'dev': bumps the global connectivity sequence, counts
 * a carrier reset if IFF_RUNNING toggled, stores the new flags, and
 * invalidates every cached field whose VALID_* bit is not set in 'mask'. */
615 netdev_linux_changed(struct netdev_linux *dev,
616 unsigned int ifi_flags, unsigned int mask)
617 OVS_REQUIRES(dev->mutex)
619 seq_change(connectivity_seq_get());
621 if ((dev->ifi_flags ^ ifi_flags) & IFF_RUNNING) {
622 dev->carrier_resets++;
624 dev->ifi_flags = ifi_flags;
/* 'mask' names the cache bits to KEEP; everything else is re-fetched on
 * demand. */
626 dev->cache_valid &= mask;
/* Applies a parsed rtnetlink link-change message to 'dev's cached state.
 * RTM_NEWLINK refreshes MTU, MAC address, and ifindex directly from the
 * message; any other message type just invalidates the cache. */
630 netdev_linux_update(struct netdev_linux *dev,
631 const struct rtnetlink_link_change *change)
632 OVS_REQUIRES(dev->mutex)
634 if (change->nlmsg_type == RTM_NEWLINK) {
/* Keep VALID_DRVINFO: driver info does not change on link events. */
636 netdev_linux_changed(dev, change->ifi_flags, VALID_DRVINFO);
638 /* Update netdev from rtnl-change msg. */
640 dev->mtu = change->mtu;
641 dev->cache_valid |= VALID_MTU;
642 dev->netdev_mtu_error = 0;
/* An all-zero address in the message means "not reported"; don't let it
 * clobber the cached MAC. */
645 if (!eth_addr_is_zero(change->addr)) {
646 memcpy(dev->etheraddr, change->addr, ETH_ADDR_LEN);
647 dev->cache_valid |= VALID_ETHERADDR;
648 dev->ether_addr_error = 0;
651 dev->ifindex = change->ifi_index;
652 dev->cache_valid |= VALID_IFINDEX;
653 dev->get_ifindex_error = 0;
/* Non-NEWLINK (e.g. RTM_DELLINK): invalidate the entire cache. */
656 netdev_linux_changed(dev, change->ifi_flags, 0);
/* Class 'alloc' callback: zero-allocates a netdev_linux and returns its
 * embedded generic netdev (return statement elided in this extract). */
660 static struct netdev *
661 netdev_linux_alloc(void)
663 struct netdev_linux *netdev = xzalloc(sizeof *netdev);
/* Shared construction for all netdev_linux flavors; currently just the
 * per-device mutex. */
668 netdev_linux_common_construct(struct netdev_linux *netdev)
670 ovs_mutex_init(&netdev->mutex);
673 /* Creates system and internal devices. */
675 netdev_linux_construct(struct netdev *netdev_)
677 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
680 netdev_linux_common_construct(netdev);
/* Probing flags doubles as an existence check for the kernel device. */
682 error = get_flags(&netdev->up, &netdev->ifi_flags);
683 if (error == ENODEV) {
684 if (netdev->up.netdev_class != &netdev_internal_class) {
685 /* The device does not exist, so don't allow it to be opened. */
688 /* "Internal" netdevs have to be created as netdev objects before
689 * they exist in the kernel, because creating them in the kernel
690 * happens by passing a netdev object to dpif_port_add().
691 * Therefore, ignore the error. */
698 /* For most types of netdevs we open the device for each call of
699 * netdev_open(). However, this is not the case with tap devices,
700 * since it is only possible to open the device once. In this
701 * situation we share a single file descriptor, and consequently
702 * buffers, across all readers. Therefore once data is read it will
703 * be unavailable to other reads for tap devices. */
705 netdev_linux_construct_tap(struct netdev *netdev_)
707 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
708 static const char tap_dev[] = "/dev/net/tun";
709 const char *name = netdev_->name;
713 netdev_linux_common_construct(netdev);
715 /* Open tap device. */
716 netdev->tap_fd = open(tap_dev, O_RDWR);
717 if (netdev->tap_fd < 0) {
/* NOTE(review): 'error' is presumably assigned from errno in an elided
 * line before this log call — confirm against the full source. */
719 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, ovs_strerror(error));
723 /* Create tap device. */
/* IFF_NO_PI: no packet-information header; reads/writes are raw frames. */
724 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
725 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
726 if (ioctl(netdev->tap_fd, TUNSETIFF, &ifr) == -1) {
727 VLOG_WARN("%s: creating tap device failed: %s", name,
728 ovs_strerror(errno));
733 /* Make non-blocking. */
734 error = set_nonblocking(netdev->tap_fd);
/* Error path: close the fd so a failed construct leaks nothing. */
742 close(netdev->tap_fd);
/* Class 'destruct' callback: tears down QoS state, the tap fd (for tap
 * devices), the miimon refcount, and the per-device mutex. */
747 netdev_linux_destruct(struct netdev *netdev_)
749 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
751 if (netdev->tc && netdev->tc->ops->tc_destroy) {
752 netdev->tc->ops->tc_destroy(netdev->tc);
755 if (netdev_get_class(netdev_) == &netdev_tap_class
756 && netdev->tap_fd >= 0)
758 close(netdev->tap_fd);
/* This device was counted in 'miimon_cnt'; drop our reference. */
761 if (netdev->miimon_interval > 0) {
763 atomic_sub(&miimon_cnt, 1, &junk);
766 ovs_mutex_destroy(&netdev->mutex);
/* Class 'dealloc' callback: frees the container allocated by
 * netdev_linux_alloc() (free() call elided in this extract). */
770 netdev_linux_dealloc(struct netdev *netdev_)
772 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
/* Rxq 'alloc' callback: zero-allocates the rx-queue container. */
776 static struct netdev_rxq *
777 netdev_linux_rxq_alloc(void)
779 struct netdev_rxq_linux *rx = xzalloc(sizeof *rx);
/* Rxq 'construct' callback.  For tap devices, reuses the shared tap fd; for
 * everything else, opens an AF_PACKET raw socket bound to the device, with
 * PACKET_AUXDATA enabled (for VLAN reconstruction) and a BPF filter that
 * accepts only inbound packets. */
784 netdev_linux_rxq_construct(struct netdev_rxq *rxq_)
786 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
787 struct netdev *netdev_ = rx->up.netdev;
788 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
791 ovs_mutex_lock(&netdev->mutex);
792 rx->is_tap = is_tap_netdev(netdev_);
794 rx->fd = netdev->tap_fd;
796 struct sockaddr_ll sll;
798 /* Result of tcpdump -dd inbound */
799 static const struct sock_filter filt[] = {
800 { 0x28, 0, 0, 0xfffff004 }, /* ldh [0] */
801 { 0x15, 0, 1, 0x00000004 }, /* jeq #4 jt 2 jf 3 */
802 { 0x6, 0, 0, 0x00000000 }, /* ret #0 */
803 { 0x6, 0, 0, 0x0000ffff } /* ret #65535 */
805 static const struct sock_fprog fprog = {
806 ARRAY_SIZE(filt), (struct sock_filter *) filt
809 /* Create file descriptor. */
810 rx->fd = socket(PF_PACKET, SOCK_RAW, 0);
813 VLOG_ERR("failed to create raw socket (%s)", ovs_strerror(error));
/* Request tpacket_auxdata cmsgs so recv can recover stripped VLAN tags. */
818 if (setsockopt(rx->fd, SOL_PACKET, PACKET_AUXDATA, &val, sizeof val)) {
820 VLOG_ERR("%s: failed to mark socket for auxdata (%s)",
821 netdev_get_name(netdev_), ovs_strerror(error));
825 /* Set non-blocking mode. */
826 error = set_nonblocking(rx->fd);
831 /* Get ethernet device index. */
832 error = get_ifindex(&netdev->up, &ifindex);
837 /* Bind to specific ethernet device. */
838 memset(&sll, 0, sizeof sll);
839 sll.sll_family = AF_PACKET;
840 sll.sll_ifindex = ifindex;
/* ETH_P_ALL: receive every protocol, not just IP. */
841 sll.sll_protocol = htons(ETH_P_ALL);
842 if (bind(rx->fd, (struct sockaddr *) &sll, sizeof sll) < 0) {
844 VLOG_ERR("%s: failed to bind raw socket (%s)",
845 netdev_get_name(netdev_), ovs_strerror(error));
849 /* Filter for only inbound packets. */
850 error = setsockopt(rx->fd, SOL_SOCKET, SO_ATTACH_FILTER, &fprog,
854 VLOG_ERR("%s: failed to attach filter (%s)",
855 netdev_get_name(netdev_), ovs_strerror(error));
859 ovs_mutex_unlock(&netdev->mutex);
/* Shared error exit: unlock and return the saved errno (cleanup of rx->fd
 * on this path is elided in this extract — TODO confirm). */
867 ovs_mutex_unlock(&netdev->mutex);
/* Rxq 'destruct' callback (fd close elided in this extract). */
872 netdev_linux_rxq_destruct(struct netdev_rxq *rxq_)
874 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
/* Rxq 'dealloc' callback: frees the container (free() call elided). */
882 netdev_linux_rxq_dealloc(struct netdev_rxq *rxq_)
884 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
/* Returns the VLAN TPID to reinsert for 'aux', in network byte order:
 * the kernel-reported TPID when TP_STATUS_VLAN_TPID_VALID is set (Linux
 * 3.13+), otherwise the classic 0x8100. */
890 auxdata_to_vlan_tpid(const struct tpacket_auxdata *aux)
892 if (aux->tp_status & TP_STATUS_VLAN_TPID_VALID) {
893 return htons(aux->tp_vlan_tpid);
895 return htons(ETH_TYPE_VLAN);
/* Returns true if 'aux' carries a VLAN TCI.  A nonzero TCI alone suffices
 * on pre-3.0 kernels that lack TP_STATUS_VLAN_VALID. */
900 auxdata_has_vlan_tci(const struct tpacket_auxdata *aux)
902 return aux->tp_vlan_tci || aux->tp_status & TP_STATUS_VLAN_VALID;
/* Receives one packet from AF_PACKET socket 'fd' into 'buffer', then scans
 * the PACKET_AUXDATA control messages and, if the kernel stripped a VLAN
 * tag, pushes it back into the frame so callers see the wire format.
 * Returns 0 on success or a positive errno value (return paths elided in
 * this extract). */
906 netdev_linux_rxq_recv_sock(int fd, struct ofpbuf *buffer)
911 struct cmsghdr *cmsg;
/* Space for exactly one tpacket_auxdata cmsg. */
914 char buffer[CMSG_SPACE(sizeof(struct tpacket_auxdata))];
918 /* Reserve headroom for a single VLAN tag */
919 ofpbuf_reserve(buffer, VLAN_HEADER_LEN);
920 size = ofpbuf_tailroom(buffer);
922 iov.iov_base = ofpbuf_data(buffer);
924 msgh.msg_name = NULL;
925 msgh.msg_namelen = 0;
928 msgh.msg_control = &cmsg_buffer;
929 msgh.msg_controllen = sizeof cmsg_buffer;
/* MSG_TRUNC makes recvmsg return the full packet length even if it did
 * not fit, so oversize frames can be detected; retry on EINTR. */
933 retval = recvmsg(fd, &msgh, MSG_TRUNC);
934 } while (retval < 0 && errno == EINTR);
938 } else if (retval > size) {
942 ofpbuf_set_size(buffer, ofpbuf_size(buffer) + retval);
944 for (cmsg = CMSG_FIRSTHDR(&msgh); cmsg; cmsg = CMSG_NXTHDR(&msgh, cmsg)) {
945 const struct tpacket_auxdata *aux;
947 if (cmsg->cmsg_level != SOL_PACKET
948 || cmsg->cmsg_type != PACKET_AUXDATA
949 || cmsg->cmsg_len < CMSG_LEN(sizeof(struct tpacket_auxdata))) {
953 aux = ALIGNED_CAST(struct tpacket_auxdata *, CMSG_DATA(cmsg));
954 if (auxdata_has_vlan_tci(aux)) {
/* Too short to even hold an Ethernet header: nothing to retag. */
955 if (retval < ETH_HEADER_LEN) {
/* Reinsert the stripped tag using the headroom reserved above. */
959 eth_push_vlan(buffer, auxdata_to_vlan_tpid(aux),
960 htons(aux->tp_vlan_tci));
/* Receives one packet from tap fd 'fd' into 'buffer' with a plain read(),
 * retrying on EINTR.  Returns 0 on success or a positive errno value
 * (error-return lines elided in this extract). */
969 netdev_linux_rxq_recv_tap(int fd, struct ofpbuf *buffer)
972 size_t size = ofpbuf_tailroom(buffer);
975 retval = read(fd, ofpbuf_data(buffer), size);
976 } while (retval < 0 && errno == EINTR);
/* A read that filled the whole tailroom may have been truncated. */
980 } else if (retval > size) {
984 ofpbuf_set_size(buffer, ofpbuf_size(buffer) + retval);
989 netdev_linux_rxq_recv(struct netdev_rxq *rxq_, struct ofpbuf **packet, int *c)
991 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
992 struct netdev *netdev = rx->up.netdev;
993 struct ofpbuf *buffer;
997 if (netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu)) {
998 mtu = ETH_PAYLOAD_MAX;
1001 buffer = ofpbuf_new_with_headroom(VLAN_ETH_HEADER_LEN + mtu, DP_NETDEV_HEADROOM);
1003 retval = (rx->is_tap
1004 ? netdev_linux_rxq_recv_tap(rx->fd, buffer)
1005 : netdev_linux_rxq_recv_sock(rx->fd, buffer));
1008 if (retval != EAGAIN && retval != EMSGSIZE) {
1009 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
1010 ovs_strerror(errno), netdev_rxq_get_name(rxq_));
1012 ofpbuf_delete(buffer);
1014 dp_packet_pad(buffer);
/* Rxq 'wait' callback: wakes poll_block() when the rx fd is readable. */
1023 netdev_linux_rxq_wait(struct netdev_rxq *rxq_)
1025 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
1026 poll_fd_wait(rx->fd, POLLIN);
/* Rxq 'drain' callback: discards any queued packets.  Tap devices are
 * drained by reading up to the queue length (SIOCGIFTXQLEN); raw sockets by
 * draining the socket receive buffer. */
1030 netdev_linux_rxq_drain(struct netdev_rxq *rxq_)
1032 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
1035 int error = af_inet_ifreq_ioctl(netdev_rxq_get_name(rxq_), &ifr,
1036 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
1040 drain_fd(rx->fd, ifr.ifr_qlen);
1043 return drain_rcvbuf(rx->fd);
1047 /* Sends 'buffer' on 'netdev'. Returns 0 if successful, otherwise a positive
1048 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
1049 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
1050 * the packet is too big or too small to transmit on the device.
1052 * The caller retains ownership of 'buffer' in all cases.
1054 * The kernel maintains a packet transmission queue, so the caller is not
1055 * expected to do additional queuing of packets. */
/* NOTE(review): the comment above says the caller retains ownership "in all
 * cases", yet the function takes a 'may_steal' parameter; the handling of
 * 'may_steal' is not visible in this extract — confirm whether the comment
 * is stale. */
1057 netdev_linux_send(struct netdev *netdev_, struct ofpbuf *pkt, bool may_steal)
1059 const void *data = ofpbuf_data(pkt);
1060 size_t size = ofpbuf_size(pkt);
1065 if (!is_tap_netdev(netdev_)) {
1066 /* Use our AF_PACKET socket to send to this device. */
1067 struct sockaddr_ll sll;
1073 sock = af_packet_sock();
1078 ifindex = netdev_get_ifindex(netdev_);
1083 /* We don't bother setting most fields in sockaddr_ll because the
1084 * kernel ignores them for SOCK_RAW. */
1085 memset(&sll, 0, sizeof sll);
1086 sll.sll_family = AF_PACKET;
1087 sll.sll_ifindex = ifindex;
1089 iov.iov_base = CONST_CAST(void *, data);
1092 msg.msg_name = &sll;
1093 msg.msg_namelen = sizeof sll;
1096 msg.msg_control = NULL;
1097 msg.msg_controllen = 0;
1100 retval = sendmsg(sock, &msg, 0);
1102 /* Use the tap fd to send to this device. This is essential for
1103 * tap devices, because packets sent to a tap device with an
1104 * AF_PACKET socket will loop back to be *received* again on the
1105 * tap device. This doesn't occur on other interface types
1106 * because we attach a socket filter to the rx socket. */
1107 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1109 retval = write(netdev->tap_fd, data, size);
1117 /* The Linux AF_PACKET implementation never blocks waiting for room
1118 * for packets, instead returning ENOBUFS. Translate this into
1119 * EAGAIN for the caller. */
1120 if (errno == ENOBUFS) {
1122 } else if (errno == EINTR) {
1124 } else if (errno != EAGAIN) {
1125 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
1126 netdev_get_name(netdev_), ovs_strerror(errno));
1129 } else if (retval != size) {
/* BUG(review): '%"PRIuSIZE"d' expands to "%zud", so a literal 'd' is
 * printed after the byte count, and 'retval' (signed ssize_t) is passed
 * where %zu expects size_t.  Should be a single %"PRIuSIZE" with
 * 'retval' cast to size_t (it is non-negative here). */
1130 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%"PRIuSIZE"d bytes of "
1131 "%"PRIuSIZE") on %s", retval, size, netdev_get_name(netdev_));
1139 /* Registers with the poll loop to wake up from the next call to poll_block()
1140 * when the packet transmission queue has sufficient room to transmit a packet
1141 * with netdev_send().
1143 * The kernel maintains a packet transmission queue, so the client is not
1144 * expected to do additional queuing of packets. Thus, this function is
1145 * unlikely to ever be used. It is included for completeness. */
1147 netdev_linux_send_wait(struct netdev *netdev)
1149 if (is_tap_netdev(netdev)) {
1150 /* TAP device always accepts packets.*/
1151 poll_immediate_wake();
1155 /* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
1156 * otherwise a positive errno value. */
1158 netdev_linux_set_etheraddr(struct netdev *netdev_,
1159 const uint8_t mac[ETH_ADDR_LEN])
1161 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1162 enum netdev_flags old_flags = 0;
1165 ovs_mutex_lock(&netdev->mutex);
/* Fast path: if the cached address already matches (or a cached error is
 * pending), skip the ioctl entirely. */
1167 if (netdev->cache_valid & VALID_ETHERADDR) {
1168 error = netdev->ether_addr_error;
1169 if (error || eth_addr_equals(netdev->etheraddr, mac)) {
1172 netdev->cache_valid &= ~VALID_ETHERADDR;
1175 /* Tap devices must be brought down before setting the address. */
1176 if (is_tap_netdev(netdev_)) {
1177 update_flags(netdev, NETDEV_UP, 0, &old_flags);
1179 error = set_etheraddr(netdev_get_name(netdev_), mac);
/* Cache the result even for ENODEV so later calls don't re-probe a
 * missing device. */
1180 if (!error || error == ENODEV) {
1181 netdev->ether_addr_error = error;
1182 netdev->cache_valid |= VALID_ETHERADDR;
1184 memcpy(netdev->etheraddr, mac, ETH_ADDR_LEN);
/* Restore the tap device's UP flag if we cleared it above. */
1188 if (is_tap_netdev(netdev_) && old_flags & NETDEV_UP) {
1189 update_flags(netdev, 0, NETDEV_UP, &old_flags);
1193 ovs_mutex_unlock(&netdev->mutex);
1197 /* Copies 'netdev''s MAC address to 'mac' which is passed as param. */
1199 netdev_linux_get_etheraddr(const struct netdev *netdev_,
1200 uint8_t mac[ETH_ADDR_LEN])
1202 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1205 ovs_mutex_lock(&netdev->mutex);
/* Populate the cache on first use; afterwards serve from cache. */
1206 if (!(netdev->cache_valid & VALID_ETHERADDR)) {
1207 netdev->ether_addr_error = get_etheraddr(netdev_get_name(netdev_),
1209 netdev->cache_valid |= VALID_ETHERADDR;
1212 error = netdev->ether_addr_error;
1214 memcpy(mac, netdev->etheraddr, ETH_ADDR_LEN);
1216 ovs_mutex_unlock(&netdev->mutex);
/* Internal helper: fills '*mtup' with 'netdev''s MTU, querying the kernel
 * via SIOCGIFMTU on first use and caching both the value and any error.
 * Caller must hold netdev->mutex (all visible callers lock around it). */
1222 netdev_linux_get_mtu__(struct netdev_linux *netdev, int *mtup)
1226 if (!(netdev->cache_valid & VALID_MTU)) {
1229 netdev->netdev_mtu_error = af_inet_ifreq_ioctl(
1230 netdev_get_name(&netdev->up), &ifr, SIOCGIFMTU, "SIOCGIFMTU");
1231 netdev->mtu = ifr.ifr_mtu;
1232 netdev->cache_valid |= VALID_MTU;
1235 error = netdev->netdev_mtu_error;
1237 *mtup = netdev->mtu;
1243 /* Returns the maximum size of transmitted (and received) packets on 'netdev',
1244 * in bytes, not including the hardware header; thus, this is typically 1500
1245 * bytes for Ethernet devices. */
1247 netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
1249 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
/* Thin locked wrapper around the caching helper above. */
1252 ovs_mutex_lock(&netdev->mutex);
1253 error = netdev_linux_get_mtu__(netdev, mtup);
1254 ovs_mutex_unlock(&netdev->mutex);
1259 /* Sets the maximum size of transmitted (MTU) for given device using linux
1260 * networking ioctl interface.
1263 netdev_linux_set_mtu(const struct netdev *netdev_, int mtu)
1265 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1269 ovs_mutex_lock(&netdev->mutex);
/* If the cache already holds this MTU (or a sticky error), avoid the ioctl;
 * otherwise drop the stale cache entry before issuing SIOCSIFMTU. */
1270 if (netdev->cache_valid & VALID_MTU) {
1271 error = netdev->netdev_mtu_error;
1272 if (error || netdev->mtu == mtu) {
1275 netdev->cache_valid &= ~VALID_MTU;
1278 error = af_inet_ifreq_ioctl(netdev_get_name(netdev_), &ifr,
1279 SIOCSIFMTU, "SIOCSIFMTU");
/* Cache success and ENODEV alike, mirroring the set_etheraddr pattern. */
1280 if (!error || error == ENODEV) {
1281 netdev->netdev_mtu_error = error;
1282 netdev->mtu = ifr.ifr_mtu;
1283 netdev->cache_valid |= VALID_MTU;
1286 ovs_mutex_unlock(&netdev->mutex);
1290 /* Returns the ifindex of 'netdev', if successful, as a positive number.
1291 * On failure, returns a negative errno value. */
1293 netdev_linux_get_ifindex(const struct netdev *netdev_)
1295 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1298 ovs_mutex_lock(&netdev->mutex);
1299 error = get_ifindex(netdev_, &ifindex);
1300 ovs_mutex_unlock(&netdev->mutex);
/* Errno is folded into the sign: negative means failure, positive is the
 * ifindex itself. */
1302 return error ? -error : ifindex;
/* Reports link state.  With miimon polling enabled the cached miimon result
 * is authoritative; otherwise IFF_RUNNING from the kernel flags is used. */
1306 netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
1308 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1310 ovs_mutex_lock(&netdev->mutex);
1311 if (netdev->miimon_interval > 0) {
1312 *carrier = netdev->miimon;
1314 *carrier = (netdev->ifi_flags & IFF_RUNNING) != 0;
1316 ovs_mutex_unlock(&netdev->mutex);
/* Returns the number of carrier transitions observed for 'netdev_'. */
1321 static long long int
1322 netdev_linux_get_carrier_resets(const struct netdev *netdev_)
1324 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1325 long long int carrier_resets;
1327 ovs_mutex_lock(&netdev->mutex);
1328 carrier_resets = netdev->carrier_resets;
1329 ovs_mutex_unlock(&netdev->mutex);
1331 return carrier_resets;
/* Issues MII ioctl 'cmd' ('cmd_name' is for logging) on device 'name',
 * copying 'data' in and out through ifr.ifr_data.  Returns the ioctl's
 * errno-style result. */
1335 netdev_linux_do_miimon(const char *name, int cmd, const char *cmd_name,
1336 struct mii_ioctl_data *data)
1341 memset(&ifr, 0, sizeof ifr);
1342 memcpy(&ifr.ifr_data, data, sizeof *data);
1343 error = af_inet_ifreq_ioctl(name, &ifr, cmd, cmd_name);
1344 memcpy(data, &ifr.ifr_data, sizeof *data)
/* Determines link status for 'name', preferring MII (SIOCGMIIPHY +
 * SIOCGMIIREG reading BMSR) and falling back to ETHTOOL_GLINK when the
 * MII query fails. */
1350 netdev_linux_get_miimon(const char *name, bool *miimon)
1352 struct mii_ioctl_data data;
1357 memset(&data, 0, sizeof data);
1358 error = netdev_linux_do_miimon(name, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
1360 /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
1361 data.reg_num = MII_BMSR;
1362 error = netdev_linux_do_miimon(name, SIOCGMIIREG, "SIOCGMIIREG",
1366 *miimon = !!(data.val_out & BMSR_LSTATUS);
1368 VLOG_WARN_RL(&rl, "%s: failed to query MII", name);
1371 struct ethtool_cmd ecmd;
1373 VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
1376 COVERAGE_INC(netdev_get_ethtool);
1377 memset(&ecmd, 0, sizeof ecmd);
1378 error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
1381 struct ethtool_value eval;
/* ETHTOOL_GLINK returns a struct ethtool_value; it was issued through an
 * ethtool_cmd buffer, so copy the prefix out before reading .data. */
1383 memcpy(&eval, &ecmd, sizeof eval);
1384 *miimon = !!eval.data;
1386 VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
/* Sets the miimon polling interval (ms) for 'netdev_'.  Nonzero intervals
 * are clamped to at least 100 ms; the global miimon device count is kept in
 * step so the polling loop knows whether any device needs service. */
1394 netdev_linux_set_miimon_interval(struct netdev *netdev_,
1395 long long int interval)
1397 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1399 ovs_mutex_lock(&netdev->mutex);
1400 interval = interval > 0 ? MAX(interval, 100) : 0;
1401 if (netdev->miimon_interval != interval) {
1404 if (interval && !netdev->miimon_interval) {
1405 atomic_add(&miimon_cnt, 1, &junk);
1406 } else if (!interval && netdev->miimon_interval) {
1407 atomic_sub(&miimon_cnt, 1, &junk);
1410 netdev->miimon_interval = interval;
/* Force an immediate poll on the next miimon_run() pass. */
1411 timer_set_expired(&netdev->miimon_timer);
1413 ovs_mutex_unlock(&netdev->mutex);
/* Polls MII link status for every netdev-linux device whose miimon timer has
 * expired, recording transitions via netdev_linux_changed() and re-arming
 * each device's timer. */
1419 netdev_linux_miimon_run(void)
1421 struct shash device_shash;
1422 struct shash_node *node;
1424 shash_init(&device_shash);
1425 netdev_get_devices(&netdev_linux_class, &device_shash);
1426 SHASH_FOR_EACH (node, &device_shash) {
1427 struct netdev *netdev = node->data;
1428 struct netdev_linux *dev = netdev_linux_cast(netdev);
1431 ovs_mutex_lock(&dev->mutex);
1432 if (dev->miimon_interval > 0 && timer_expired(&dev->miimon_timer)) {
1433 netdev_linux_get_miimon(dev->up.name, &miimon);
1434 if (miimon != dev->miimon) {
1435 dev->miimon = miimon;
/* Link state changed: bump change_seq / carrier counters. */
1436 netdev_linux_changed(dev, dev->ifi_flags, 0);
1439 timer_set_duration(&dev->miimon_timer, dev->miimon_interval);
1441 ovs_mutex_unlock(&dev->mutex);
/* netdev_get_devices() took a reference on each device; drop it. */
1442 netdev_close(netdev);
1445 shash_destroy(&device_shash);
/* Registers wakeups with the poll loop for every device with miimon polling
 * enabled, so miimon_run() is called again when a timer fires. */
1449 netdev_linux_miimon_wait(void)
1451 struct shash device_shash;
1452 struct shash_node *node;
1454 shash_init(&device_shash);
1455 netdev_get_devices(&netdev_linux_class, &device_shash);
1456 SHASH_FOR_EACH (node, &device_shash) {
1457 struct netdev *netdev = node->data;
1458 struct netdev_linux *dev = netdev_linux_cast(netdev);
1460 ovs_mutex_lock(&dev->mutex);
1461 if (dev->miimon_interval > 0) {
1462 timer_wait(&dev->miimon_timer);
1464 ovs_mutex_unlock(&dev->mutex);
1465 netdev_close(netdev);
1467 shash_destroy(&device_shash);
/* Exchanges the values of '*a' and '*b' (body elided in this listing). */
1471 swap_uint64(uint64_t *a, uint64_t *b)
1478 /* Copies 'src' into 'dst', performing format conversion in the process.
1480 * 'src' is allowed to be misaligned. */
1482 netdev_stats_from_ovs_vport_stats(struct netdev_stats *dst,
1483 const struct ovs_vport_stats *src)
/* The eight counters the vport layer tracks are copied with unaligned-safe
 * reads; every other netdev_stats field has no vport equivalent and is
 * zeroed below. */
1485 dst->rx_packets = get_unaligned_u64(&src->rx_packets);
1486 dst->tx_packets = get_unaligned_u64(&src->tx_packets);
1487 dst->rx_bytes = get_unaligned_u64(&src->rx_bytes);
1488 dst->tx_bytes = get_unaligned_u64(&src->tx_bytes);
1489 dst->rx_errors = get_unaligned_u64(&src->rx_errors);
1490 dst->tx_errors = get_unaligned_u64(&src->tx_errors);
1491 dst->rx_dropped = get_unaligned_u64(&src->rx_dropped);
1492 dst->tx_dropped = get_unaligned_u64(&src->tx_dropped);
1494 dst->collisions = 0;
1495 dst->rx_length_errors = 0;
1496 dst->rx_over_errors = 0;
1497 dst->rx_crc_errors = 0;
1498 dst->rx_frame_errors = 0;
1499 dst->rx_fifo_errors = 0;
1500 dst->rx_missed_errors = 0;
1501 dst->tx_aborted_errors = 0;
1502 dst->tx_carrier_errors = 0;
1503 dst->tx_fifo_errors = 0;
1504 dst->tx_heartbeat_errors = 0;
1505 dst->tx_window_errors = 0;
/* Queries the OVS kernel vport layer for 'netdev''s stats.  Returns 0 and
 * fills '*stats' on success; a nonzero value otherwise (e.g. when the reply
 * carries no stats attribute). */
1509 get_stats_via_vport__(const struct netdev *netdev, struct netdev_stats *stats)
1511 struct dpif_linux_vport reply;
1515 error = dpif_linux_vport_get(netdev_get_name(netdev), &reply, &buf);
1518 } else if (!reply.stats) {
1523 netdev_stats_from_ovs_vport_stats(stats, reply.stats);
/* Wrapper that remembers whether the vport query works for this device, so
 * repeated failures (device not attached to a datapath) are not re-queried
 * and not re-logged.  ENOENT is expected for unattached devices and is not
 * warned about. */
1531 get_stats_via_vport(const struct netdev *netdev_,
1532 struct netdev_stats *stats)
1534 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1536 if (!netdev->vport_stats_error ||
1537 !(netdev->cache_valid & VALID_VPORT_STAT_ERROR)) {
1540 error = get_stats_via_vport__(netdev_, stats);
1541 if (error && error != ENOENT) {
1542 VLOG_WARN_RL(&rl, "%s: obtaining netdev stats via vport failed "
1544 netdev_get_name(netdev_), ovs_strerror(error));
1546 netdev->vport_stats_error = error;
1547 netdev->cache_valid |= VALID_VPORT_STAT_ERROR;
1551 /* Retrieves current device stats for 'netdev-linux'. */
1553 netdev_linux_get_stats(const struct netdev *netdev_,
1554 struct netdev_stats *stats)
1556 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1557 struct netdev_stats dev_stats;
1560 ovs_mutex_lock(&netdev->mutex);
/* Try the OVS vport layer first, then the kernel (netlink) stats; which set
 * is used depends on vport_stats_error, with kernel error counters merged
 * into the vport numbers when both are available. */
1561 get_stats_via_vport(netdev_, stats);
1562 error = get_stats_via_netlink(netdev_, &dev_stats);
1564 if (!netdev->vport_stats_error) {
1567 } else if (netdev->vport_stats_error) {
1568 /* stats not available from OVS then use ioctl stats. */
/* Error/exception counters come only from the kernel, so add them onto the
 * vport byte/packet counts. */
1571 stats->rx_errors += dev_stats.rx_errors;
1572 stats->tx_errors += dev_stats.tx_errors;
1573 stats->rx_dropped += dev_stats.rx_dropped;
1574 stats->tx_dropped += dev_stats.tx_dropped;
1575 stats->multicast += dev_stats.multicast;
1576 stats->collisions += dev_stats.collisions;
1577 stats->rx_length_errors += dev_stats.rx_length_errors;
1578 stats->rx_over_errors += dev_stats.rx_over_errors;
1579 stats->rx_crc_errors += dev_stats.rx_crc_errors;
1580 stats->rx_frame_errors += dev_stats.rx_frame_errors;
1581 stats->rx_fifo_errors += dev_stats.rx_fifo_errors;
1582 stats->rx_missed_errors += dev_stats.rx_missed_errors;
1583 stats->tx_aborted_errors += dev_stats.tx_aborted_errors;
1584 stats->tx_carrier_errors += dev_stats.tx_carrier_errors;
1585 stats->tx_fifo_errors += dev_stats.tx_fifo_errors;
1586 stats->tx_heartbeat_errors += dev_stats.tx_heartbeat_errors;
1587 stats->tx_window_errors += dev_stats.tx_window_errors;
1589 ovs_mutex_unlock(&netdev->mutex);
1594 /* Retrieves current device stats for 'netdev-tap' netdev or
1595 * netdev-internal. */
1597 netdev_tap_get_stats(const struct netdev *netdev_, struct netdev_stats *stats)
1599 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1600 struct netdev_stats dev_stats;
1603 ovs_mutex_lock(&netdev->mutex);
1604 get_stats_via_vport(netdev_, stats);
1605 error = get_stats_via_netlink(netdev_, &dev_stats);
1607 if (!netdev->vport_stats_error) {
1610 } else if (netdev->vport_stats_error) {
1611 /* Transmit and receive stats will appear to be swapped relative to the
1612 * other ports since we are the one sending the data, not a remote
1613 * computer. For consistency, we swap them back here. This does not
1614 * apply if we are getting stats from the vport layer because it always
1615 * tracks stats from the perspective of the switch. */
1618 swap_uint64(&stats->rx_packets, &stats->tx_packets);
1619 swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
1620 swap_uint64(&stats->rx_errors, &stats->tx_errors);
1621 swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
/* Detailed error breakdowns have no meaning from the switch's perspective
 * for a tap/internal port, so they are zeroed rather than swapped. */
1622 stats->rx_length_errors = 0;
1623 stats->rx_over_errors = 0;
1624 stats->rx_crc_errors = 0;
1625 stats->rx_frame_errors = 0;
1626 stats->rx_fifo_errors = 0;
1627 stats->rx_missed_errors = 0;
1628 stats->tx_aborted_errors = 0;
1629 stats->tx_carrier_errors = 0;
1630 stats->tx_fifo_errors = 0;
1631 stats->tx_heartbeat_errors = 0;
1632 stats->tx_window_errors = 0;
/* Kernel counters are merged with rx/tx crossed, matching the swap above. */
1634 stats->rx_dropped += dev_stats.tx_dropped;
1635 stats->tx_dropped += dev_stats.rx_dropped;
1637 stats->rx_errors += dev_stats.tx_errors;
1638 stats->tx_errors += dev_stats.rx_errors;
1640 stats->multicast += dev_stats.multicast;
1641 stats->collisions += dev_stats.collisions;
1643 ovs_mutex_unlock(&netdev->mutex);
/* Retrieves stats for an internal device straight from the vport layer. */
1649 netdev_internal_get_stats(const struct netdev *netdev_,
1650 struct netdev_stats *stats)
1652 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1655 ovs_mutex_lock(&netdev->mutex);
1656 get_stats_via_vport(netdev_, stats);
1657 error = netdev->vport_stats_error;
1658 ovs_mutex_unlock(&netdev->mutex);
/* Pushes 'stats' down to the OVS kernel vport for 'netdev' via an
 * OVS_VPORT_CMD_SET transaction.  Only the eight counters the vport layer
 * understands are transferred. */
1664 netdev_internal_set_stats(struct netdev *netdev,
1665 const struct netdev_stats *stats)
1667 struct ovs_vport_stats vport_stats;
1668 struct dpif_linux_vport vport;
1671 vport_stats.rx_packets = stats->rx_packets;
1672 vport_stats.tx_packets = stats->tx_packets;
1673 vport_stats.rx_bytes = stats->rx_bytes;
1674 vport_stats.tx_bytes = stats->tx_bytes;
1675 vport_stats.rx_errors = stats->rx_errors;
1676 vport_stats.tx_errors = stats->tx_errors;
1677 vport_stats.rx_dropped = stats->rx_dropped;
1678 vport_stats.tx_dropped = stats->tx_dropped;
1680 dpif_linux_vport_init(&vport);
1681 vport.cmd = OVS_VPORT_CMD_SET;
1682 vport.name = netdev_get_name(netdev);
1683 vport.stats = &vport_stats;
1685 err = dpif_linux_vport_transact(&vport, NULL, NULL);
1687 /* If the vport layer doesn't know about the device, that doesn't mean it
1688 * doesn't exist (after all were able to open it when netdev_open() was
1689 * called), it just means that it isn't attached and we'll be getting
1690 * stats a different way. */
1691 if (err == ENODEV) {
/* Queries link features for 'netdev' via ETHTOOL_GSET and translates the
 * ethtool bitmaps into NETDEV_F_* sets (supported, advertised, current).
 * Results and any error are cached under VALID_FEATURES; subsequent calls
 * return immediately. */
1699 netdev_linux_read_features(struct netdev_linux *netdev)
1701 struct ethtool_cmd ecmd;
1705 if (netdev->cache_valid & VALID_FEATURES) {
1709 COVERAGE_INC(netdev_get_ethtool);
1710 memset(&ecmd, 0, sizeof ecmd);
1711 error = netdev_linux_do_ethtool(netdev->up.name, &ecmd,
1712 ETHTOOL_GSET, "ETHTOOL_GSET");
1717 /* Supported features. */
1718 netdev->supported = 0;
1719 if (ecmd.supported & SUPPORTED_10baseT_Half) {
1720 netdev->supported |= NETDEV_F_10MB_HD;
1722 if (ecmd.supported & SUPPORTED_10baseT_Full) {
1723 netdev->supported |= NETDEV_F_10MB_FD;
1725 if (ecmd.supported & SUPPORTED_100baseT_Half) {
1726 netdev->supported |= NETDEV_F_100MB_HD;
1728 if (ecmd.supported & SUPPORTED_100baseT_Full) {
1729 netdev->supported |= NETDEV_F_100MB_FD;
1731 if (ecmd.supported & SUPPORTED_1000baseT_Half) {
1732 netdev->supported |= NETDEV_F_1GB_HD;
1734 if (ecmd.supported & SUPPORTED_1000baseT_Full) {
1735 netdev->supported |= NETDEV_F_1GB_FD;
1737 if (ecmd.supported & SUPPORTED_10000baseT_Full) {
1738 netdev->supported |= NETDEV_F_10GB_FD;
1740 if (ecmd.supported & SUPPORTED_TP) {
1741 netdev->supported |= NETDEV_F_COPPER;
1743 if (ecmd.supported & SUPPORTED_FIBRE) {
1744 netdev->supported |= NETDEV_F_FIBER;
1746 if (ecmd.supported & SUPPORTED_Autoneg) {
1747 netdev->supported |= NETDEV_F_AUTONEG;
1749 if (ecmd.supported & SUPPORTED_Pause) {
1750 netdev->supported |= NETDEV_F_PAUSE;
1752 if (ecmd.supported & SUPPORTED_Asym_Pause) {
1753 netdev->supported |= NETDEV_F_PAUSE_ASYM;
1756 /* Advertised features. */
1757 netdev->advertised = 0;
1758 if (ecmd.advertising & ADVERTISED_10baseT_Half) {
1759 netdev->advertised |= NETDEV_F_10MB_HD;
1761 if (ecmd.advertising & ADVERTISED_10baseT_Full) {
1762 netdev->advertised |= NETDEV_F_10MB_FD;
1764 if (ecmd.advertising & ADVERTISED_100baseT_Half) {
1765 netdev->advertised |= NETDEV_F_100MB_HD;
1767 if (ecmd.advertising & ADVERTISED_100baseT_Full) {
1768 netdev->advertised |= NETDEV_F_100MB_FD;
1770 if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
1771 netdev->advertised |= NETDEV_F_1GB_HD;
1773 if (ecmd.advertising & ADVERTISED_1000baseT_Full) {
1774 netdev->advertised |= NETDEV_F_1GB_FD;
1776 if (ecmd.advertising & ADVERTISED_10000baseT_Full) {
1777 netdev->advertised |= NETDEV_F_10GB_FD;
1779 if (ecmd.advertising & ADVERTISED_TP) {
1780 netdev->advertised |= NETDEV_F_COPPER;
1782 if (ecmd.advertising & ADVERTISED_FIBRE) {
1783 netdev->advertised |= NETDEV_F_FIBER;
1785 if (ecmd.advertising & ADVERTISED_Autoneg) {
1786 netdev->advertised |= NETDEV_F_AUTONEG;
1788 if (ecmd.advertising & ADVERTISED_Pause) {
1789 netdev->advertised |= NETDEV_F_PAUSE;
1791 if (ecmd.advertising & ADVERTISED_Asym_Pause) {
1792 netdev->advertised |= NETDEV_F_PAUSE_ASYM;
1795 /* Current settings. */
/* NOTE(review): 40G/100G/1T speeds predate SPEED_* constants in older
 * kernel headers, hence the raw numeric comparisons below. */
1797 if (speed == SPEED_10) {
1798 netdev->current = ecmd.duplex ? NETDEV_F_10MB_FD : NETDEV_F_10MB_HD;
1799 } else if (speed == SPEED_100) {
1800 netdev->current = ecmd.duplex ? NETDEV_F_100MB_FD : NETDEV_F_100MB_HD;
1801 } else if (speed == SPEED_1000) {
1802 netdev->current = ecmd.duplex ? NETDEV_F_1GB_FD : NETDEV_F_1GB_HD;
1803 } else if (speed == SPEED_10000) {
1804 netdev->current = NETDEV_F_10GB_FD;
1805 } else if (speed == 40000) {
1806 netdev->current = NETDEV_F_40GB_FD;
1807 } else if (speed == 100000) {
1808 netdev->current = NETDEV_F_100GB_FD;
1809 } else if (speed == 1000000) {
1810 netdev->current = NETDEV_F_1TB_FD;
1812 netdev->current = 0;
1815 if (ecmd.port == PORT_TP) {
1816 netdev->current |= NETDEV_F_COPPER;
1817 } else if (ecmd.port == PORT_FIBRE) {
1818 netdev->current |= NETDEV_F_FIBER;
1822 netdev->current |= NETDEV_F_AUTONEG;
1826 netdev->cache_valid |= VALID_FEATURES;
1827 netdev->get_features_error = error;
1830 /* Stores the features supported by 'netdev' into of '*current', '*advertised',
1831 * '*supported', and '*peer'. Each value is a bitmap of NETDEV_* bits.
1832 * Returns 0 if successful, otherwise a positive errno value. */
1834 netdev_linux_get_features(const struct netdev *netdev_,
1835 enum netdev_features *current,
1836 enum netdev_features *advertised,
1837 enum netdev_features *supported,
1838 enum netdev_features *peer)
1840 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1843 ovs_mutex_lock(&netdev->mutex);
/* Delegates the ethtool work (and caching) to netdev_linux_read_features(). */
1844 netdev_linux_read_features(netdev);
1845 if (!netdev->get_features_error) {
1846 *current = netdev->current;
1847 *advertised = netdev->advertised;
1848 *supported = netdev->supported;
1849 *peer = 0; /* XXX */
1851 error = netdev->get_features_error;
1852 ovs_mutex_unlock(&netdev->mutex);
1857 /* Set the features advertised by 'netdev' to 'advertise'. */
1859 netdev_linux_set_advertisements(struct netdev *netdev_,
1860 enum netdev_features advertise)
1862 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1863 struct ethtool_cmd ecmd;
1866 ovs_mutex_lock(&netdev->mutex);
/* Read-modify-write: fetch the current ethtool settings, replace only the
 * advertising bitmap, then write everything back with ETHTOOL_SSET. */
1868 COVERAGE_INC(netdev_get_ethtool);
1869 memset(&ecmd, 0, sizeof ecmd);
1870 error = netdev_linux_do_ethtool(netdev_get_name(netdev_), &ecmd,
1871 ETHTOOL_GSET, "ETHTOOL_GSET");
1876 ecmd.advertising = 0;
1877 if (advertise & NETDEV_F_10MB_HD) {
1878 ecmd.advertising |= ADVERTISED_10baseT_Half;
1880 if (advertise & NETDEV_F_10MB_FD) {
1881 ecmd.advertising |= ADVERTISED_10baseT_Full;
1883 if (advertise & NETDEV_F_100MB_HD) {
1884 ecmd.advertising |= ADVERTISED_100baseT_Half;
1886 if (advertise & NETDEV_F_100MB_FD) {
1887 ecmd.advertising |= ADVERTISED_100baseT_Full;
1889 if (advertise & NETDEV_F_1GB_HD) {
1890 ecmd.advertising |= ADVERTISED_1000baseT_Half;
1892 if (advertise & NETDEV_F_1GB_FD) {
1893 ecmd.advertising |= ADVERTISED_1000baseT_Full;
1895 if (advertise & NETDEV_F_10GB_FD) {
1896 ecmd.advertising |= ADVERTISED_10000baseT_Full;
1898 if (advertise & NETDEV_F_COPPER) {
1899 ecmd.advertising |= ADVERTISED_TP;
1901 if (advertise & NETDEV_F_FIBER) {
1902 ecmd.advertising |= ADVERTISED_FIBRE;
1904 if (advertise & NETDEV_F_AUTONEG) {
1905 ecmd.advertising |= ADVERTISED_Autoneg;
1907 if (advertise & NETDEV_F_PAUSE) {
1908 ecmd.advertising |= ADVERTISED_Pause;
1910 if (advertise & NETDEV_F_PAUSE_ASYM) {
1911 ecmd.advertising |= ADVERTISED_Asym_Pause;
1913 COVERAGE_INC(netdev_set_ethtool);
1914 error = netdev_linux_do_ethtool(netdev_get_name(netdev_), &ecmd,
1915 ETHTOOL_SSET, "ETHTOOL_SSET");
1918 ovs_mutex_unlock(&netdev->mutex);
1922 /* Attempts to set input rate limiting (policing) policy. Returns 0 if
1923 * successful, otherwise a positive errno value. */
1925 netdev_linux_set_policing(struct netdev *netdev_,
1926 uint32_t kbits_rate, uint32_t kbits_burst)
1928 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1929 const char *netdev_name = netdev_get_name(netdev_);
1932 kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
1933 : !kbits_burst ? 1000 /* Default to 1000 kbits if 0. */
1934 : kbits_burst); /* Stick with user-specified value. */
1936 ovs_mutex_lock(&netdev->mutex);
/* Skip the tc round-trips if the cached policing settings already match. */
1937 if (netdev->cache_valid & VALID_POLICING) {
1938 error = netdev->netdev_policing_error;
1939 if (error || (netdev->kbits_rate == kbits_rate &&
1940 netdev->kbits_burst == kbits_burst)) {
1941 /* Assume that settings haven't changed since we last set them. */
1944 netdev->cache_valid &= ~VALID_POLICING;
1947 COVERAGE_INC(netdev_set_policing);
1948 /* Remove any existing ingress qdisc. */
1949 error = tc_add_del_ingress_qdisc(netdev_, false);
1951 VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
1952 netdev_name, ovs_strerror(error));
/* With a nonzero rate, install a fresh ingress qdisc and attach a policer
 * action to it. */
1957 error = tc_add_del_ingress_qdisc(netdev_, true);
1959 VLOG_WARN_RL(&rl, "%s: adding policing qdisc failed: %s",
1960 netdev_name, ovs_strerror(error));
1964 error = tc_add_policer(netdev_, kbits_rate, kbits_burst);
1966 VLOG_WARN_RL(&rl, "%s: adding policing action failed: %s",
1967 netdev_name, ovs_strerror(error));
1972 netdev->kbits_rate = kbits_rate;
1973 netdev->kbits_burst = kbits_burst;
1976 if (!error || error == ENODEV) {
1977 netdev->netdev_policing_error = error;
1978 netdev->cache_valid |= VALID_POLICING;
1980 ovs_mutex_unlock(&netdev->mutex);
/* Adds to 'types' the OVS name of every installable tc implementation
 * registered in the 'tcs' table. */
1985 netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
1988 const struct tc_ops *const *opsp;
1990 for (opsp = tcs; *opsp != NULL; opsp++) {
1991 const struct tc_ops *ops = *opsp;
/* Entries without tc_install or with an empty OVS name are internal-only
 * (e.g. the "other"/default qdisc) and are not offered to users. */
1992 if (ops->tc_install && ops->ovs_name[0] != '\0') {
1993 sset_add(types, ops->ovs_name);
/* Finds the tc_ops whose OVS-facing name is 'name' (body's return paths are
 * elided in this listing). */
1999 static const struct tc_ops *
2000 tc_lookup_ovs_name(const char *name)
2002 const struct tc_ops *const *opsp;
2004 for (opsp = tcs; *opsp != NULL; opsp++) {
2005 const struct tc_ops *ops = *opsp;
2006 if (!strcmp(name, ops->ovs_name)) {
/* Finds the tc_ops whose Linux qdisc name is 'name'; entries may have a
 * NULL linux_name, which is skipped. */
2013 static const struct tc_ops *
2014 tc_lookup_linux_name(const char *name)
2016 const struct tc_ops *const *opsp;
2018 for (opsp = tcs; *opsp != NULL; opsp++) {
2019 const struct tc_ops *ops = *opsp;
2020 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
/* Looks up queue 'queue_id' in 'netdev_''s tc queue hash using a
 * caller-supplied hash value. */
2027 static struct tc_queue *
2028 tc_find_queue__(const struct netdev *netdev_, unsigned int queue_id,
2031 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2032 struct tc_queue *queue;
2034 HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev->tc->queues) {
2035 if (queue->queue_id == queue_id) {
/* Convenience wrapper that hashes 'queue_id' itself. */
2042 static struct tc_queue *
2043 tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
2045 return tc_find_queue__(netdev, queue_id, hash_int(queue_id, 0));
/* Reports capabilities (queue count) for QoS type 'type', looked up in the
 * tc_ops table by its OVS name. */
2049 netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
2051 struct netdev_qos_capabilities *caps)
2053 const struct tc_ops *ops = tc_lookup_ovs_name(type);
2057 caps->n_queues = ops->n_queues;
/* Returns the currently installed QoS type for 'netdev_' in '*typep' and its
 * configuration in 'details', querying the kernel qdisc first. */
2062 netdev_linux_get_qos(const struct netdev *netdev_,
2063 const char **typep, struct smap *details)
2065 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2068 ovs_mutex_lock(&netdev->mutex);
2069 error = tc_query_qdisc(netdev_);
2071 *typep = netdev->tc->ops->ovs_name;
/* qdisc_get is optional; a type with no parameters reports success with
 * empty details. */
2072 error = (netdev->tc->ops->qdisc_get
2073 ? netdev->tc->ops->qdisc_get(netdev_, details)
2076 ovs_mutex_unlock(&netdev->mutex);
/* Installs or reconfigures QoS of type 'type' on 'netdev_'.  When the type
 * is unchanged the existing qdisc is reconfigured in place; otherwise the
 * old qdisc is deleted and the new one installed. */
2082 netdev_linux_set_qos(struct netdev *netdev_,
2083 const char *type, const struct smap *details)
2085 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2086 const struct tc_ops *new_ops;
2089 new_ops = tc_lookup_ovs_name(type);
2090 if (!new_ops || !new_ops->tc_install) {
2094 ovs_mutex_lock(&netdev->mutex);
2095 error = tc_query_qdisc(netdev_);
2100 if (new_ops == netdev->tc->ops) {
2101 error = new_ops->qdisc_set ? new_ops->qdisc_set(netdev_, details) : 0;
2103 /* Delete existing qdisc. */
2104 error = tc_del_qdisc(netdev_);
2108 ovs_assert(netdev->tc == NULL);
2110 /* Install new qdisc. */
2111 error = new_ops->tc_install(netdev_, details);
2112 ovs_assert((error == 0) == (netdev->tc != NULL));
2116 ovs_mutex_unlock(&netdev->mutex);
/* Retrieves configuration of queue 'queue_id' on 'netdev_' into 'details'
 * via the installed tc implementation's class_get hook. */
2121 netdev_linux_get_queue(const struct netdev *netdev_,
2122 unsigned int queue_id, struct smap *details)
2124 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2127 ovs_mutex_lock(&netdev->mutex);
2128 error = tc_query_qdisc(netdev_);
2130 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2132 ? netdev->tc->ops->class_get(netdev_, queue, details)
2135 ovs_mutex_unlock(&netdev->mutex);
/* Configures queue 'queue_id' with 'details'.  Fails when the id is out of
 * range for the installed QoS type or when that type has no class_set. */
2141 netdev_linux_set_queue(struct netdev *netdev_,
2142 unsigned int queue_id, const struct smap *details)
2144 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2147 ovs_mutex_lock(&netdev->mutex);
2148 error = tc_query_qdisc(netdev_);
2150 error = (queue_id < netdev->tc->ops->n_queues
2151 && netdev->tc->ops->class_set
2152 ? netdev->tc->ops->class_set(netdev_, queue_id, details)
2155 ovs_mutex_unlock(&netdev->mutex);
/* Deletes queue 'queue_id' through the QoS type's class_delete hook, if it
 * exists and the queue is present. */
2161 netdev_linux_delete_queue(struct netdev *netdev_, unsigned int queue_id)
2163 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2166 ovs_mutex_lock(&netdev->mutex);
2167 error = tc_query_qdisc(netdev_);
2169 if (netdev->tc->ops->class_delete) {
2170 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2172 ? netdev->tc->ops->class_delete(netdev_, queue)
2178 ovs_mutex_unlock(&netdev->mutex);
/* Retrieves statistics for queue 'queue_id'.  The queue's creation time is
 * copied from the cached tc_queue before the type-specific hook fills the
 * remaining counters. */
2184 netdev_linux_get_queue_stats(const struct netdev *netdev_,
2185 unsigned int queue_id,
2186 struct netdev_queue_stats *stats)
2188 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2191 ovs_mutex_lock(&netdev->mutex);
2192 error = tc_query_qdisc(netdev_);
2194 if (netdev->tc->ops->class_get_stats) {
2195 const struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2197 stats->created = queue->created;
2198 error = netdev->tc->ops->class_get_stats(netdev_, queue,
2207 ovs_mutex_unlock(&netdev->mutex);
/* State for a netlink dump of traffic classes (RTM_GETTCLASS). */
2212 struct queue_dump_state {
2213 struct nl_dump dump;
/* Begins a netlink class dump for 'netdev' into 'state'.  tcm_parent = 0
 * requests classes of every qdisc on the device. */
2218 start_queue_dump(const struct netdev *netdev, struct queue_dump_state *state)
2220 struct ofpbuf request;
2221 struct tcmsg *tcmsg;
2223 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
2227 tcmsg->tcm_parent = 0;
2228 nl_dump_start(&state->dump, NETLINK_ROUTE, &request);
2229 ofpbuf_uninit(&request);
2231 ofpbuf_init(&state->buf, NL_DUMP_BUFSIZE);
/* Releases dump resources and returns the dump's final status. */
2236 finish_queue_dump(struct queue_dump_state *state)
2238 ofpbuf_uninit(&state->buf);
2239 return nl_dump_done(&state->dump);
/* Iterator state for netdev_linux_queue_dump_{start,next,done}: a snapshot
 * of queue ids taken under the device mutex. */
2242 struct netdev_linux_queue_state {
2243 unsigned int *queues;
/* Starts a queue dump: snapshots every queue id from the cached tc queue
 * hash into a freshly allocated state object stored in '*statep'. */
2249 netdev_linux_queue_dump_start(const struct netdev *netdev_, void **statep)
2251 const struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2254 ovs_mutex_lock(&netdev->mutex);
2255 error = tc_query_qdisc(netdev_);
2257 if (netdev->tc->ops->class_get) {
2258 struct netdev_linux_queue_state *state;
2259 struct tc_queue *queue;
2262 *statep = state = xmalloc(sizeof *state);
2263 state->n_queues = hmap_count(&netdev->tc->queues);
2264 state->cur_queue = 0;
2265 state->queues = xmalloc(state->n_queues * sizeof *state->queues);
2268 HMAP_FOR_EACH (queue, hmap_node, &netdev->tc->queues) {
2269 state->queues[i++] = queue->queue_id;
2275 ovs_mutex_unlock(&netdev->mutex);
/* Returns the next queue in the snapshot.  Ids whose queue has disappeared
 * since the snapshot are skipped (tc_find_queue returning NULL). */
2281 netdev_linux_queue_dump_next(const struct netdev *netdev_, void *state_,
2282 unsigned int *queue_idp, struct smap *details)
2284 const struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2285 struct netdev_linux_queue_state *state = state_;
2288 ovs_mutex_lock(&netdev->mutex);
2289 while (state->cur_queue < state->n_queues) {
2290 unsigned int queue_id = state->queues[state->cur_queue++];
2291 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2294 *queue_idp = queue_id;
2295 error = netdev->tc->ops->class_get(netdev_, queue, details);
2299 ovs_mutex_unlock(&netdev->mutex);
/* Frees the dump state allocated by queue_dump_start. */
2305 netdev_linux_queue_dump_done(const struct netdev *netdev OVS_UNUSED,
2308 struct netdev_linux_queue_state *state = state_;
2310 free(state->queues);
/* Invokes 'cb' with stats for each of 'netdev_''s queues by walking a live
 * RTM_GETTCLASS netlink dump and handing each message to the QoS type's
 * class_dump_stats hook. */
2316 netdev_linux_dump_queue_stats(const struct netdev *netdev_,
2317 netdev_dump_queue_stats_cb *cb, void *aux)
2319 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2322 ovs_mutex_lock(&netdev->mutex);
2323 error = tc_query_qdisc(netdev_);
2325 struct queue_dump_state state;
2327 if (!netdev->tc->ops->class_dump_stats) {
2329 } else if (!start_queue_dump(netdev_, &state)) {
2335 while (nl_dump_next(&state.dump, &msg, &state.buf)) {
2336 retval = netdev->tc->ops->class_dump_stats(netdev_, &msg,
/* The dump's own completion status is folded in after the loop. */
2343 retval = finish_queue_dump(&state);
2349 ovs_mutex_unlock(&netdev->mutex);
/* Fetches 'netdev_''s IPv4 address and netmask via SIOCGIFADDR /
 * SIOCGIFNETMASK, caching the result under VALID_IN4.  Reports
 * EADDRNOTAVAIL when no address is assigned. */
2355 netdev_linux_get_in4(const struct netdev *netdev_,
2356 struct in_addr *address, struct in_addr *netmask)
2358 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2361 ovs_mutex_lock(&netdev->mutex);
2362 if (!(netdev->cache_valid & VALID_IN4)) {
2363 error = netdev_linux_get_ipv4(netdev_, &netdev->address,
2364 SIOCGIFADDR, "SIOCGIFADDR");
2366 error = netdev_linux_get_ipv4(netdev_, &netdev->netmask,
2367 SIOCGIFNETMASK, "SIOCGIFNETMASK");
2369 netdev->cache_valid |= VALID_IN4;
2377 if (netdev->address.s_addr != INADDR_ANY) {
2378 *address = netdev->address;
2379 *netmask = netdev->netmask;
2381 error = EADDRNOTAVAIL;
2384 ovs_mutex_unlock(&netdev->mutex);
/* Assigns IPv4 'address'/'netmask' to 'netdev_' and updates the in4 cache.
 * The netmask is only programmed when the address is non-zero (setting
 * INADDR_ANY clears the address). */
2390 netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
2391 struct in_addr netmask)
2393 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2396 ovs_mutex_lock(&netdev->mutex);
2397 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
2399 netdev->cache_valid |= VALID_IN4;
2400 netdev->address = address;
2401 netdev->netmask = netmask;
2402 if (address.s_addr != INADDR_ANY) {
2403 error = do_set_addr(netdev_, SIOCSIFNETMASK,
2404 "SIOCSIFNETMASK", netmask);
2407 ovs_mutex_unlock(&netdev->mutex);
/* Parses one line of /proc/net/if_inet6 into the 16 address bytes of '*in6'
 * and the interface name; returns the ovs_scan() success flag.  The four
 * skipped hex fields are index, prefix length, scope, and flags. */
2413 parse_if_inet6_line(const char *line,
2414 struct in6_addr *in6, char ifname[16 + 1])
2416 uint8_t *s6 = in6->s6_addr;
2417 #define X8 "%2"SCNx8
2418 return ovs_scan(line,
2419 " "X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8
2420 "%*x %*x %*x %*x %16s\n",
2421 &s6[0], &s6[1], &s6[2], &s6[3],
2422 &s6[4], &s6[5], &s6[6], &s6[7],
2423 &s6[8], &s6[9], &s6[10], &s6[11],
2424 &s6[12], &s6[13], &s6[14], &s6[15],
2428 /* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address (if
2429 * 'in6' is non-null) and returns true. Otherwise, returns false. */
2431 netdev_linux_get_in6(const struct netdev *netdev_, struct in6_addr *in6)
2433 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2435 ovs_mutex_lock(&netdev->mutex);
2436 if (!(netdev->cache_valid & VALID_IN6)) {
/* Default to :: then scan /proc/net/if_inet6 for a line whose interface
 * name matches this device. */
2440 netdev->in6 = in6addr_any;
2442 file = fopen("/proc/net/if_inet6", "r");
2444 const char *name = netdev_get_name(netdev_);
2445 while (fgets(line, sizeof line, file)) {
2446 struct in6_addr in6_tmp;
2447 char ifname[16 + 1];
2448 if (parse_if_inet6_line(line, &in6_tmp, ifname)
2449 && !strcmp(name, ifname))
2451 netdev->in6 = in6_tmp;
2457 netdev->cache_valid |= VALID_IN6;
2460 ovs_mutex_unlock(&netdev->mutex);
/* Fills generic '*sa' with an AF_INET sockaddr for 'addr' (the whole
 * sockaddr is zeroed first so trailing bytes are deterministic). */
2466 make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
2468 struct sockaddr_in sin;
2469 memset(&sin, 0, sizeof sin);
2470 sin.sin_family = AF_INET;
2471 sin.sin_addr = addr;
2474 memset(sa, 0, sizeof *sa);
2475 memcpy(sa, &sin, sizeof sin);
/* Issues address-setting ioctl 'ioctl_nr' on 'netdev' with 'addr' packed
 * into ifr.ifr_addr; 'ioctl_name' is used for logging. */
2479 do_set_addr(struct netdev *netdev,
2480 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
2484 make_in4_sockaddr(&ifr.ifr_addr, addr);
2485 return af_inet_ifreq_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
2489 /* Adds 'router' as a default IP gateway. */
2491 netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
2493 struct in_addr any = { INADDR_ANY };
2497 memset(&rt, 0, sizeof rt);
/* Destination 0.0.0.0/0 via 'router': the classic default route, installed
 * with SIOCADDRT. */
2498 make_in4_sockaddr(&rt.rt_dst, any);
2499 make_in4_sockaddr(&rt.rt_gateway, router);
2500 make_in4_sockaddr(&rt.rt_genmask, any);
2501 rt.rt_flags = RTF_UP | RTF_GATEWAY;
2502 error = af_inet_ioctl(SIOCADDRT, &rt);
2504 VLOG_WARN("ioctl(SIOCADDRT): %s", ovs_strerror(error));
/* Resolves the next hop toward '*host' by scanning /proc/net/route.  On a
 * match, sets '*next_hop' (0 when the host is directly reachable, otherwise
 * the gateway) and stores the outgoing interface name in '*netdev_name'
 * (heap-allocated; caller frees). */
2510 netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
2513 static const char fn[] = "/proc/net/route";
2518 *netdev_name = NULL;
2519 stream = fopen(fn, "r");
2520 if (stream == NULL) {
2521 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, ovs_strerror(errno));
2526 while (fgets(line, sizeof line, stream)) {
2529 ovs_be32 dest, gateway, mask;
2530 int refcnt, metric, mtu;
2531 unsigned int flags, use, window, irtt;
2534 "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
2536 iface, &dest, &gateway, &flags, &refcnt,
2537 &use, &metric, &mask, &mtu, &window, &irtt)) {
2538 VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
2542 if (!(flags & RTF_UP)) {
2543 /* Skip routes that aren't up. */
2547 /* The output of 'dest', 'mask', and 'gateway' were given in
2548 * network byte order, so we don't need need any endian
2549 * conversions here. */
2550 if ((dest & mask) == (host->s_addr & mask)) {
2552 /* The host is directly reachable. */
2553 next_hop->s_addr = 0;
2555 /* To reach the host, we must go through a gateway. */
2556 next_hop->s_addr = gateway;
2558 *netdev_name = xstrdup(iface);
/* Fills 'smap' with driver name/version and firmware version for 'netdev_',
 * fetching them once via ETHTOOL_GDRVINFO and caching under VALID_DRVINFO. */
2570 netdev_linux_get_status(const struct netdev *netdev_, struct smap *smap)
2572 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2575 ovs_mutex_lock(&netdev->mutex);
2576 if (!(netdev->cache_valid & VALID_DRVINFO)) {
/* The drvinfo buffer is passed to ethtool through an ethtool_cmd view. */
2577 struct ethtool_cmd *cmd = (struct ethtool_cmd *) &netdev->drvinfo;
2579 COVERAGE_INC(netdev_get_ethtool);
2580 memset(&netdev->drvinfo, 0, sizeof netdev->drvinfo);
2581 error = netdev_linux_do_ethtool(netdev->up.name,
2584 "ETHTOOL_GDRVINFO");
2586 netdev->cache_valid |= VALID_DRVINFO;
2591 smap_add(smap, "driver_name", netdev->drvinfo.driver);
2592 smap_add(smap, "driver_version", netdev->drvinfo.version);
2593 smap_add(smap, "firmware_version", netdev->drvinfo.fw_version);
2595 ovs_mutex_unlock(&netdev->mutex);
/* Status for internal devices: report a fixed "openvswitch" driver name. */
2601 netdev_internal_get_status(const struct netdev *netdev OVS_UNUSED,
2604 smap_add(smap, "driver_name", "openvswitch");
2608 /* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
2609 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
2610 * returns 0. Otherwise, it returns a positive errno value; in particular,
2611 * ENXIO indicates that there is no ARP table entry for 'ip' on 'netdev'. */
2613 netdev_linux_arp_lookup(const struct netdev *netdev,
2614 ovs_be32 ip, uint8_t mac[ETH_ADDR_LEN])
2617 struct sockaddr_in sin;
2620 memset(&r, 0, sizeof r);
2621 memset(&sin, 0, sizeof sin);
2622 sin.sin_family = AF_INET;
2623 sin.sin_addr.s_addr = ip;
/* arp_pa holds the protocol (IPv4) address being queried. */
2625 memcpy(&r.arp_pa, &sin, sizeof sin);
2626 r.arp_ha.sa_family = ARPHRD_ETHER;
2628 ovs_strzcpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
2629 COVERAGE_INC(netdev_arp_lookup);
2630 retval = af_inet_ioctl(SIOCGARP, &r);
2632 memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
/* ENXIO just means "no entry", which the function contract treats quietly. */
2633 } else if (retval != ENXIO) {
2634 VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
2635 netdev_get_name(netdev), IP_ARGS(ip),
2636 ovs_strerror(retval));
/* Converts netdev flag bits (NETDEV_*) to Linux interface flags (IFF_*). */
2642 nd_to_iff_flags(enum netdev_flags nd)
2645 if (nd & NETDEV_UP) {
2648 if (nd & NETDEV_PROMISC) {
2651 if (nd & NETDEV_LOOPBACK) {
2652 iff |= IFF_LOOPBACK;
/* Converts Linux interface flags (IFF_*) back to netdev flags (NETDEV_*). */
2658 iff_to_nd_flags(int iff)
2660 enum netdev_flags nd = 0;
2664 if (iff & IFF_PROMISC) {
2665 nd |= NETDEV_PROMISC;
2667 if (iff & IFF_LOOPBACK) {
2668 nd |= NETDEV_LOOPBACK;
/* Clears the flags in 'off' and sets those in 'on' for 'netdev', storing the
 * previous flags (as NETDEV_* bits) in '*old_flagsp'.  Only issues the
 * kernel set_flags call when the computed flags actually change, and
 * re-reads the kernel's view afterwards.  Caller holds netdev->mutex. */
2674 update_flags(struct netdev_linux *netdev, enum netdev_flags off,
2675 enum netdev_flags on, enum netdev_flags *old_flagsp)
2676 OVS_REQUIRES(netdev->mutex)
2678 int old_flags, new_flags;
2681 old_flags = netdev->ifi_flags;
2682 *old_flagsp = iff_to_nd_flags(old_flags);
2683 new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
2684 if (new_flags != old_flags) {
2685 error = set_flags(netdev_get_name(&netdev->up), new_flags);
2686 get_flags(&netdev->up, &netdev->ifi_flags);
/* Public entry point: takes the device mutex around update_flags(). */
2693 netdev_linux_update_flags(struct netdev *netdev_, enum netdev_flags off,
2694 enum netdev_flags on, enum netdev_flags *old_flagsp)
2696 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2699 ovs_mutex_lock(&netdev->mutex);
2700 error = update_flags(netdev, off, on, old_flagsp);
2701 ovs_mutex_unlock(&netdev->mutex);
/* Builds a struct netdev_class initializer for a Linux-backed netdev type.
 * The parameters supply the per-type hooks (constructor, stats accessors,
 * feature and status callbacks); everything else is shared netdev_linux_*
 * machinery.  No comment lines may be added inside the macro body itself,
 * since each would need a continuation backslash. */
2706 #define NETDEV_LINUX_CLASS(NAME, CONSTRUCT, GET_STATS, SET_STATS, \
2707 GET_FEATURES, GET_STATUS) \
2713 netdev_linux_wait, \
2715 netdev_linux_alloc, \
2717 netdev_linux_destruct, \
2718 netdev_linux_dealloc, \
2719 NULL, /* get_config */ \
2720 NULL, /* set_config */ \
2721 NULL, /* get_tunnel_config */ \
2723 netdev_linux_send, \
2724 netdev_linux_send_wait, \
2726 netdev_linux_set_etheraddr, \
2727 netdev_linux_get_etheraddr, \
2728 netdev_linux_get_mtu, \
2729 netdev_linux_set_mtu, \
2730 netdev_linux_get_ifindex, \
2731 netdev_linux_get_carrier, \
2732 netdev_linux_get_carrier_resets, \
2733 netdev_linux_set_miimon_interval, \
2738 netdev_linux_set_advertisements, \
2740 netdev_linux_set_policing, \
2741 netdev_linux_get_qos_types, \
2742 netdev_linux_get_qos_capabilities, \
2743 netdev_linux_get_qos, \
2744 netdev_linux_set_qos, \
2745 netdev_linux_get_queue, \
2746 netdev_linux_set_queue, \
2747 netdev_linux_delete_queue, \
2748 netdev_linux_get_queue_stats, \
2749 netdev_linux_queue_dump_start, \
2750 netdev_linux_queue_dump_next, \
2751 netdev_linux_queue_dump_done, \
2752 netdev_linux_dump_queue_stats, \
2754 netdev_linux_get_in4, \
2755 netdev_linux_set_in4, \
2756 netdev_linux_get_in6, \
2757 netdev_linux_add_router, \
2758 netdev_linux_get_next_hop, \
2760 netdev_linux_arp_lookup, \
2762 netdev_linux_update_flags, \
2764 netdev_linux_rxq_alloc, \
2765 netdev_linux_rxq_construct, \
2766 netdev_linux_rxq_destruct, \
2767 netdev_linux_rxq_dealloc, \
2768 netdev_linux_rxq_recv, \
2769 netdev_linux_rxq_wait, \
2770 netdev_linux_rxq_drain, \
/* The three concrete netdev classes built from NETDEV_LINUX_CLASS:
 * "system" (ordinary Linux devices), "tap", and "internal" devices.
 * They differ only in constructor, stats, features, and status hooks. */
2773 const struct netdev_class netdev_linux_class =
2776 netdev_linux_construct,
2777 netdev_linux_get_stats,
2778 NULL, /* set_stats */
2779 netdev_linux_get_features,
2780 netdev_linux_get_status);
2782 const struct netdev_class netdev_tap_class =
2785 netdev_linux_construct_tap,
2786 netdev_tap_get_stats,
2787 NULL, /* set_stats */
2788 netdev_linux_get_features,
2789 netdev_linux_get_status);
2791 const struct netdev_class netdev_internal_class =
2794 netdev_linux_construct,
2795 netdev_internal_get_stats,
2796 netdev_internal_set_stats,
2797 NULL, /* get_features */
2798 netdev_internal_get_status);
2800 /* HTB traffic control class. */
2802 #define HTB_N_QUEUES 0xf000
2806 unsigned int max_rate; /* In bytes/s. */
/* Per-queue HTB class: embedded in the generic tc_queue plus HTB knobs. */
2810 struct tc_queue tc_queue;
2811 unsigned int min_rate; /* In bytes/s. */
2812 unsigned int max_rate; /* In bytes/s. */
2813 unsigned int burst; /* In bytes. */
2814 unsigned int priority; /* Lower values are higher priorities. */
/* Returns the struct htb embedded in 'netdev_''s tc pointer. */
2818 htb_get__(const struct netdev *netdev_)
2820 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2821 return CONTAINER_OF(netdev->tc, struct htb, tc);
/* Allocates an htb with 'max_rate' and installs it as 'netdev_''s tc. */
2825 htb_install__(struct netdev *netdev_, uint64_t max_rate)
2827 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2830 htb = xmalloc(sizeof *htb);
2831 tc_init(&htb->tc, &tc_ops_htb);
2832 htb->max_rate = max_rate;
2834 netdev->tc = &htb->tc;
2837 /* Create an HTB qdisc.
2839 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
2841 htb_setup_qdisc__(struct netdev *netdev)
2844 struct tc_htb_glob opt;
2845 struct ofpbuf request;
2846 struct tcmsg *tcmsg;
/* Remove any existing root qdisc first so the add cannot conflict. */
2848 tc_del_qdisc(netdev);
2850 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2851 NLM_F_EXCL | NLM_F_CREATE, &request);
2855 tcmsg->tcm_handle = tc_make_handle(1, 0);
2856 tcmsg->tcm_parent = TC_H_ROOT;
2858 nl_msg_put_string(&request, TCA_KIND, "htb");
2860 memset(&opt, 0, sizeof opt);
2861 opt.rate2quantum = 10;
/* HTB options are nested under TCA_OPTIONS as a TCA_HTB_INIT attribute. */
2865 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2866 nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
2867 nl_msg_end_nested(&request, opt_offset);
2869 return tc_transact(&request, NULL);
2872 /* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
2873 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
2875 htb_setup_class__(struct netdev *netdev, unsigned int handle,
2876 unsigned int parent, struct htb_class *class)
2879 struct tc_htb_opt opt;
2880 struct ofpbuf request;
2881 struct tcmsg *tcmsg;
/* HTB buffer sizing depends on the device MTU, so it must be known. */
2885 error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
2887 VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU",
2888 netdev_get_name(netdev));
2892 memset(&opt, 0, sizeof opt);
2893 tc_fill_rate(&opt.rate, class->min_rate, mtu);
2894 tc_fill_rate(&opt.ceil, class->max_rate, mtu);
2895 opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
2896 opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
2897 opt.prio = class->priority;
2899 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
2903 tcmsg->tcm_handle = handle;
2904 tcmsg->tcm_parent = parent;
/* Parameters plus rate tables go in the nested TCA_OPTIONS attribute. */
2906 nl_msg_put_string(&request, TCA_KIND, "htb");
2907 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2908 nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
2909 tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
2910 tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
2911 nl_msg_end_nested(&request, opt_offset);
2913 error = tc_transact(&request, NULL);
2915 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
2916 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
2917 netdev_get_name(netdev),
2918 tc_get_major(handle), tc_get_minor(handle),
2919 tc_get_major(parent), tc_get_minor(parent),
2920 class->min_rate, class->max_rate,
2921 class->burst, class->priority, ovs_strerror(error));
2926 /* Parses Netlink attributes in 'options' for HTB parameters and stores a
2927 * description of them into 'details'. The description complies with the
2928 * specification given in the vswitch database documentation for linux-htb
2931 htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
2933 static const struct nl_policy tca_htb_policy[] = {
2934 [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
2935 .min_len = sizeof(struct tc_htb_opt) },
2938 struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
2939 const struct tc_htb_opt *htb;
2941 if (!nl_parse_nested(nl_options, tca_htb_policy,
2942 attrs, ARRAY_SIZE(tca_htb_policy))) {
2943 VLOG_WARN_RL(&rl, "failed to parse HTB class options");
/* Translate the kernel's tc_htb_opt into our htb_class representation. */
2947 htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
2948 class->min_rate = htb->rate.rate;
2949 class->max_rate = htb->ceil.rate;
2950 class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
2951 class->priority = htb->prio;
/* Parses one RTM_NEWTCLASS-style message: extracts the queue id (from the
 * tc minor handle, 1-based in the kernel, 0-based for callers), plus
 * optionally the HTB options and queue stats. */
2956 htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2957 struct htb_class *options,
2958 struct netdev_queue_stats *stats)
2960 struct nlattr *nl_options;
2961 unsigned int handle;
2964 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2965 if (!error && queue_id) {
2966 unsigned int major = tc_get_major(handle);
2967 unsigned int minor = tc_get_minor(handle);
2968 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2969 *queue_id = minor - 1;
2974 if (!error && options) {
2975 error = htb_parse_tca_options__(nl_options, options);
/* Parses qdisc-level details: "max-rate" in bits/s from 'details' (stored in
 * bytes/s), defaulting to the link speed (or 100 Mbps if unknown). */
2981 htb_parse_qdisc_details__(struct netdev *netdev_,
2982 const struct smap *details, struct htb_class *hc)
2984 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2985 const char *max_rate_s;
2987 max_rate_s = smap_get(details, "max-rate");
2988 hc->max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
2989 if (!hc->max_rate) {
2990 enum netdev_features current;
2992 netdev_linux_read_features(netdev);
2993 current = !netdev->get_features_error ? netdev->current : 0;
2994 hc->max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
2996 hc->min_rate = hc->max_rate;
/* Parses per-class details ("min-rate", "max-rate", "burst", "priority")
 * from 'details' into 'hc', clamping to sane values. */
3002 htb_parse_class_details__(struct netdev *netdev,
3003 const struct smap *details, struct htb_class *hc)
3005 const struct htb *htb = htb_get__(netdev);
3006 const char *min_rate_s = smap_get(details, "min-rate");
3007 const char *max_rate_s = smap_get(details, "max-rate");
3008 const char *burst_s = smap_get(details, "burst");
3009 const char *priority_s = smap_get(details, "priority");
3012 error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
3014 VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU",
3015 netdev_get_name(netdev));
3019 /* HTB requires at least an mtu sized min-rate to send any traffic even
3020 * on uncongested links. */
3021 hc->min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
3022 hc->min_rate = MAX(hc->min_rate, mtu);
3023 hc->min_rate = MIN(hc->min_rate, htb->max_rate);
3026 hc->max_rate = (max_rate_s
3027 ? strtoull(max_rate_s, NULL, 10) / 8
3029 hc->max_rate = MAX(hc->max_rate, hc->min_rate);
3030 hc->max_rate = MIN(hc->max_rate, htb->max_rate);
3034 * According to hints in the documentation that I've read, it is important
3035 * that 'burst' be at least as big as the largest frame that might be
3036 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
3037 * but having it a bit too small is a problem. Since netdev_get_mtu()
3038 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
3039 * the MTU. We actually add 64, instead of 14, as a guard against
3040 * additional headers get tacked on somewhere that we're not aware of. */
3041 hc->burst = burst_s ? strtoull(burst_s, NULL, 10) / 8 : 0;
3042 hc->burst = MAX(hc->burst, mtu + 64);
3045 hc->priority = priority_s ? strtoul(priority_s, NULL, 10) : 0;
/* Queries the kernel for the class 'handle' under 'parent' on 'netdev' and
 * parses the reply into 'options' and/or 'stats' (either may be NULL). */
3051 htb_query_class__(const struct netdev *netdev, unsigned int handle,
3052 unsigned int parent, struct htb_class *options,
3053 struct netdev_queue_stats *stats)
3055 struct ofpbuf *reply;
3058 error = tc_query_class(netdev, handle, parent, &reply);
3060 error = htb_parse_tcmsg__(reply, NULL, options, stats);
3061 ofpbuf_delete(reply);
/* tc_install hook for linux-htb: creates the root HTB qdisc plus the
 * default class 1:fffe sized from 'details', then records state locally. */
3067 htb_tc_install(struct netdev *netdev, const struct smap *details)
3071 error = htb_setup_qdisc__(netdev);
3073 struct htb_class hc;
3075 htb_parse_qdisc_details__(netdev, details, &hc);
3076 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3077 tc_make_handle(1, 0), &hc);
3079 htb_install__(netdev, hc.max_rate);
/* Downcasts a generic tc_queue to its containing htb_class. */
3085 static struct htb_class *
3086 htb_class_cast__(const struct tc_queue *queue)
3088 return CONTAINER_OF(queue, struct htb_class, tc_queue);
/* Creates or updates the in-memory record of queue 'queue_id', copying the
 * rate/burst/priority settings from 'hc'. */
3092 htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
3093 const struct htb_class *hc)
3095 struct htb *htb = htb_get__(netdev);
3096 size_t hash = hash_int(queue_id, 0);
3097 struct tc_queue *queue;
3098 struct htb_class *hcp;
3100 queue = tc_find_queue__(netdev, queue_id, hash);
3102 hcp = htb_class_cast__(queue);
/* Not found: allocate a fresh entry and insert it into the queue map. */
3104 hcp = xmalloc(sizeof *hcp);
3105 queue = &hcp->tc_queue;
3106 queue->queue_id = queue_id;
3107 queue->created = time_msec();
3108 hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
3111 hcp->min_rate = hc->min_rate;
3112 hcp->max_rate = hc->max_rate;
3113 hcp->burst = hc->burst;
3114 hcp->priority = hc->priority;
/* tc_load hook: reconstructs local HTB state from the kernel by querying
 * the default class for the qdisc max-rate, then dumping every class. */
3118 htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3121 struct queue_dump_state state;
3122 struct htb_class hc;
3124 /* Get qdisc options. */
3126 htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
3127 htb_install__(netdev, hc.max_rate);
3130 if (!start_queue_dump(netdev, &state)) {
3133 while (nl_dump_next(&state.dump, &msg, &state.buf)) {
3134 unsigned int queue_id;
3136 if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3137 htb_update_queue__(netdev, queue_id, &hc);
3140 finish_queue_dump(&state);
/* tc_destroy hook: frees every queue record owned by this HTB instance. */
3146 htb_tc_destroy(struct tc *tc)
3148 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
3149 struct htb_class *hc, *next;
3151 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &htb->tc.queues) {
3152 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
/* qdisc_get hook: reports max-rate in bits/s (stored internally in bytes/s). */
3160 htb_qdisc_get(const struct netdev *netdev, struct smap *details)
3162 const struct htb *htb = htb_get__(netdev);
3163 smap_add_format(details, "max-rate", "%llu", 8ULL * htb->max_rate);
/* qdisc_set hook: reconfigures the default class 1:fffe from 'details'. */
3168 htb_qdisc_set(struct netdev *netdev, const struct smap *details)
3170 struct htb_class hc;
3173 htb_parse_qdisc_details__(netdev, details, &hc);
3174 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3175 tc_make_handle(1, 0), &hc);
3177 htb_get__(netdev)->max_rate = hc.max_rate;
/* class_get hook: exports one queue's settings, bytes/s -> bits/s. */
3183 htb_class_get(const struct netdev *netdev OVS_UNUSED,
3184 const struct tc_queue *queue, struct smap *details)
3186 const struct htb_class *hc = htb_class_cast__(queue);
3188 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
/* max-rate equal to min-rate is the default, so it is only reported when
 * it differs. */
3189 if (hc->min_rate != hc->max_rate) {
3190 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
3192 smap_add_format(details, "burst", "%llu", 8ULL * hc->burst);
3194 smap_add_format(details, "priority", "%u", hc->priority);
/* class_set hook: parses 'details', programs kernel class 1:(queue_id+1)
 * under parent 1:fffe, then mirrors the result in local state. */
3200 htb_class_set(struct netdev *netdev, unsigned int queue_id,
3201 const struct smap *details)
3203 struct htb_class hc;
3206 error = htb_parse_class_details__(netdev, details, &hc);
3211 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3212 tc_make_handle(1, 0xfffe), &hc);
3217 htb_update_queue__(netdev, queue_id, &hc);
/* class_delete hook: removes kernel class 1:(queue_id+1) and, on success,
 * drops the corresponding local queue record. */
3222 htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
3224 struct htb_class *hc = htb_class_cast__(queue);
3225 struct htb *htb = htb_get__(netdev);
3228 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3230 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
/* class_get_stats hook: fetches stats for one queue from the kernel. */
3237 htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3238 struct netdev_queue_stats *stats)
3240 return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3241 tc_make_handle(1, 0xfffe), NULL, stats);
/* class_dump_stats hook: parses one dumped class message and invokes 'cb'
 * with the 0-based queue id when the handle falls in HTB's 1:minor range. */
3245 htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3246 const struct ofpbuf *nlmsg,
3247 netdev_dump_queue_stats_cb *cb, void *aux)
3249 struct netdev_queue_stats stats;
3250 unsigned int handle, major, minor;
3253 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3258 major = tc_get_major(handle);
3259 minor = tc_get_minor(handle);
3260 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
3261 (*cb)(minor - 1, &stats, aux);
/* Operations vector for the "linux-htb" QoS type. */
3266 static const struct tc_ops tc_ops_htb = {
3267 "htb", /* linux_name */
3268 "linux-htb", /* ovs_name */
3269 HTB_N_QUEUES, /* n_queues */
3278 htb_class_get_stats,
3279 htb_class_dump_stats
3282 /* "linux-hfsc" traffic control class. */
3284 #define HFSC_N_QUEUES 0xf000
/* Per-queue HFSC class record, embedded in the generic tc_queue. */
3292 struct tc_queue tc_queue;
/* Returns the struct hfsc embedded in 'netdev_''s tc pointer. */
3297 static struct hfsc *
3298 hfsc_get__(const struct netdev *netdev_)
3300 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3301 return CONTAINER_OF(netdev->tc, struct hfsc, tc);
/* Downcasts a generic tc_queue to its containing hfsc_class. */
3304 static struct hfsc_class *
3305 hfsc_class_cast__(const struct tc_queue *queue)
3307 return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
/* Allocates an hfsc with 'max_rate' and installs it as 'netdev_''s tc. */
3311 hfsc_install__(struct netdev *netdev_, uint32_t max_rate)
3313 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3316 hfsc = xmalloc(sizeof *hfsc);
3317 tc_init(&hfsc->tc, &tc_ops_hfsc);
3318 hfsc->max_rate = max_rate;
3319 netdev->tc = &hfsc->tc;
/* Creates or updates the in-memory record of queue 'queue_id', copying the
 * min/max rates from 'hc'. */
3323 hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
3324 const struct hfsc_class *hc)
3328 struct hfsc_class *hcp;
3329 struct tc_queue *queue;
3331 hfsc = hfsc_get__(netdev);
3332 hash = hash_int(queue_id, 0);
3334 queue = tc_find_queue__(netdev, queue_id, hash);
3336 hcp = hfsc_class_cast__(queue);
/* Not found: allocate a new entry and add it to the queue map. */
3338 hcp = xmalloc(sizeof *hcp);
3339 queue = &hcp->tc_queue;
3340 queue->queue_id = queue_id;
3341 queue->created = time_msec();
3342 hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
3345 hcp->min_rate = hc->min_rate;
3346 hcp->max_rate = hc->max_rate;
/* Parses nested HFSC netlink options into 'class'.  Only linear service
 * curves (m1 == 0, d == 0) with matching real-time/fair-share rates are
 * accepted; anything else is rejected with a rate-limited warning. */
3350 hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
3352 const struct tc_service_curve *rsc, *fsc, *usc;
3353 static const struct nl_policy tca_hfsc_policy[] = {
3355 .type = NL_A_UNSPEC,
3357 .min_len = sizeof(struct tc_service_curve),
3360 .type = NL_A_UNSPEC,
3362 .min_len = sizeof(struct tc_service_curve),
3365 .type = NL_A_UNSPEC,
3367 .min_len = sizeof(struct tc_service_curve),
3370 struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];
3372 if (!nl_parse_nested(nl_options, tca_hfsc_policy,
3373 attrs, ARRAY_SIZE(tca_hfsc_policy))) {
3374 VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
3378 rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
3379 fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
3380 usc = nl_attr_get(attrs[TCA_HFSC_USC]);
3382 if (rsc->m1 != 0 || rsc->d != 0 ||
3383 fsc->m1 != 0 || fsc->d != 0 ||
3384 usc->m1 != 0 || usc->d != 0) {
3385 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3386 "Non-linear service curves are not supported.");
3390 if (rsc->m2 != fsc->m2) {
3391 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3392 "Real-time service curves are not supported ");
3396 if (rsc->m2 > usc->m2) {
3397 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3398 "Min-rate service curve is greater than "
3399 "the max-rate service curve.");
/* min-rate comes from the fair-share curve, max-rate from the upper-limit
 * curve. */
3403 class->min_rate = fsc->m2;
3404 class->max_rate = usc->m2;
/* Parses one class message: extracts 0-based queue id from the 1:minor
 * handle, plus optionally the HFSC options and queue stats. */
3409 hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
3410 struct hfsc_class *options,
3411 struct netdev_queue_stats *stats)
3414 unsigned int handle;
3415 struct nlattr *nl_options;
3417 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
3423 unsigned int major, minor;
3425 major = tc_get_major(handle);
3426 minor = tc_get_minor(handle);
3427 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3428 *queue_id = minor - 1;
3435 error = hfsc_parse_tca_options__(nl_options, options);
/* Queries the kernel for class 'handle' under 'parent' and parses the reply
 * into 'options' and/or 'stats' (either may be NULL). */
3442 hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
3443 unsigned int parent, struct hfsc_class *options,
3444 struct netdev_queue_stats *stats)
3447 struct ofpbuf *reply;
3449 error = tc_query_class(netdev, handle, parent, &reply);
3454 error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
3455 ofpbuf_delete(reply);
/* Parses qdisc-level "max-rate" (bits/s in 'details', bytes/s internally),
 * defaulting to the link speed (or 100 Mbps when unknown); min-rate is set
 * equal to max-rate for the default class. */
3460 hfsc_parse_qdisc_details__(struct netdev *netdev_, const struct smap *details,
3461 struct hfsc_class *class)
3463 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3465 const char *max_rate_s;
3467 max_rate_s = smap_get(details, "max-rate");
3468 max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
3471 enum netdev_features current;
3473 netdev_linux_read_features(netdev);
3474 current = !netdev->get_features_error ? netdev->current : 0;
3475 max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
3478 class->min_rate = max_rate;
3479 class->max_rate = max_rate;
/* Parses per-class "min-rate" and "max-rate" from 'details' into 'class',
 * clamping min-rate to at least 1 and both to the qdisc's max_rate, and
 * ensuring max-rate >= min-rate. */
3483 hfsc_parse_class_details__(struct netdev *netdev,
3484 const struct smap *details,
3485 struct hfsc_class * class)
3487 const struct hfsc *hfsc;
3488 uint32_t min_rate, max_rate;
3489 const char *min_rate_s, *max_rate_s;
3491 hfsc = hfsc_get__(netdev);
3492 min_rate_s = smap_get(details, "min-rate");
3493 max_rate_s = smap_get(details, "max-rate");
/* Rates are configured in bits/s but stored internally in bytes/s. */
3495 min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
3496 min_rate = MAX(min_rate, 1);
3497 min_rate = MIN(min_rate, hfsc->max_rate);
3499 max_rate = (max_rate_s
3500 ? strtoull(max_rate_s, NULL, 10) / 8
3502 max_rate = MAX(max_rate, min_rate);
3503 max_rate = MIN(max_rate, hfsc->max_rate);
3505 class->min_rate = min_rate;
3506 class->max_rate = max_rate;
3511 /* Create an HFSC qdisc.
3513 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
3515 hfsc_setup_qdisc__(struct netdev * netdev)
3517 struct tcmsg *tcmsg;
3518 struct ofpbuf request;
3519 struct tc_hfsc_qopt opt;
/* Remove any existing root qdisc first so the add cannot conflict. */
3521 tc_del_qdisc(netdev);
3523 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3524 NLM_F_EXCL | NLM_F_CREATE, &request);
3530 tcmsg->tcm_handle = tc_make_handle(1, 0);
3531 tcmsg->tcm_parent = TC_H_ROOT;
3533 memset(&opt, 0, sizeof opt);
3536 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3537 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
3539 return tc_transact(&request, NULL);
3542 /* Create an HFSC class.
3544 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
3545 * sc rate <min_rate> ul rate <max_rate>" */
3547 hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
3548 unsigned int parent, struct hfsc_class *class)
3552 struct tcmsg *tcmsg;
3553 struct ofpbuf request;
3554 struct tc_service_curve min, max;
3556 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
3562 tcmsg->tcm_handle = handle;
3563 tcmsg->tcm_parent = parent;
/* Linear service curves: the m2 slope carries the configured rate. */
3567 min.m2 = class->min_rate;
3571 max.m2 = class->max_rate;
/* The min curve doubles as both real-time (RSC) and fair-share (FSC);
 * the max curve becomes the upper limit (USC). */
3573 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3574 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3575 nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
3576 nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
3577 nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
3578 nl_msg_end_nested(&request, opt_offset);
3580 error = tc_transact(&request, NULL);
3582 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
3583 "min-rate %ubps, max-rate %ubps (%s)",
3584 netdev_get_name(netdev),
3585 tc_get_major(handle), tc_get_minor(handle),
3586 tc_get_major(parent), tc_get_minor(parent),
3587 class->min_rate, class->max_rate, ovs_strerror(error));
/* tc_install hook for linux-hfsc: creates the root HFSC qdisc plus the
 * default class 1:fffe from 'details', then records state locally. */
3594 hfsc_tc_install(struct netdev *netdev, const struct smap *details)
3597 struct hfsc_class class;
3599 error = hfsc_setup_qdisc__(netdev);
3605 hfsc_parse_qdisc_details__(netdev, details, &class);
3606 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3607 tc_make_handle(1, 0), &class);
3613 hfsc_install__(netdev, class.max_rate);
/* tc_load hook: rebuilds local HFSC state from the kernel by querying the
 * default class, then dumping and parsing every class message. */
3618 hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3621 struct queue_dump_state state;
3622 struct hfsc_class hc;
3625 hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
3626 hfsc_install__(netdev, hc.max_rate);
3628 if (!start_queue_dump(netdev, &state)) {
3632 while (nl_dump_next(&state.dump, &msg, &state.buf)) {
3633 unsigned int queue_id;
3635 if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3636 hfsc_update_queue__(netdev, queue_id, &hc);
3640 finish_queue_dump(&state);
/* tc_destroy hook: frees every queue record owned by this HFSC instance. */
3645 hfsc_tc_destroy(struct tc *tc)
3648 struct hfsc_class *hc, *next;
3650 hfsc = CONTAINER_OF(tc, struct hfsc, tc);
3652 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
3653 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
/* qdisc_get hook: reports max-rate in bits/s (stored in bytes/s). */
3662 hfsc_qdisc_get(const struct netdev *netdev, struct smap *details)
3664 const struct hfsc *hfsc;
3665 hfsc = hfsc_get__(netdev);
3666 smap_add_format(details, "max-rate", "%llu", 8ULL * hfsc->max_rate);
/* qdisc_set hook: reconfigures the default class 1:fffe from 'details'. */
3671 hfsc_qdisc_set(struct netdev *netdev, const struct smap *details)
3674 struct hfsc_class class;
3676 hfsc_parse_qdisc_details__(netdev, details, &class);
3677 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3678 tc_make_handle(1, 0), &class);
3681 hfsc_get__(netdev)->max_rate = class.max_rate;
/* class_get hook: exports one queue's settings, bytes/s -> bits/s;
 * max-rate is only reported when it differs from min-rate. */
3688 hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
3689 const struct tc_queue *queue, struct smap *details)
3691 const struct hfsc_class *hc;
3693 hc = hfsc_class_cast__(queue);
3694 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
3695 if (hc->min_rate != hc->max_rate) {
3696 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
/* class_set hook: parses 'details', programs kernel class 1:(queue_id+1)
 * under parent 1:fffe, then mirrors the result in local state. */
3702 hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
3703 const struct smap *details)
3706 struct hfsc_class class;
3708 error = hfsc_parse_class_details__(netdev, details, &class);
3713 error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3714 tc_make_handle(1, 0xfffe), &class);
3719 hfsc_update_queue__(netdev, queue_id, &class);
/* class_delete hook: removes kernel class 1:(queue_id+1) and, on success,
 * drops the matching local record. */
3724 hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
3728 struct hfsc_class *hc;
3730 hc = hfsc_class_cast__(queue);
3731 hfsc = hfsc_get__(netdev);
3733 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3735 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
/* class_get_stats hook: fetches stats for one queue from the kernel. */
3742 hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3743 struct netdev_queue_stats *stats)
3745 return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3746 tc_make_handle(1, 0xfffe), NULL, stats);
/* class_dump_stats hook: parses one dumped class message and invokes 'cb'
 * with the 0-based queue id when the handle falls in HFSC's 1:minor range. */
3750 hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3751 const struct ofpbuf *nlmsg,
3752 netdev_dump_queue_stats_cb *cb, void *aux)
3754 struct netdev_queue_stats stats;
3755 unsigned int handle, major, minor;
3758 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3763 major = tc_get_major(handle);
3764 minor = tc_get_minor(handle);
3765 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3766 (*cb)(minor - 1, &stats, aux);
/* Operations vector for the "linux-hfsc" QoS type. */
3771 static const struct tc_ops tc_ops_hfsc = {
3772 "hfsc", /* linux_name */
3773 "linux-hfsc", /* ovs_name */
3774 HFSC_N_QUEUES, /* n_queues */
3775 hfsc_tc_install, /* tc_install */
3776 hfsc_tc_load, /* tc_load */
3777 hfsc_tc_destroy, /* tc_destroy */
3778 hfsc_qdisc_get, /* qdisc_get */
3779 hfsc_qdisc_set, /* qdisc_set */
3780 hfsc_class_get, /* class_get */
3781 hfsc_class_set, /* class_set */
3782 hfsc_class_delete, /* class_delete */
3783 hfsc_class_get_stats, /* class_get_stats */
3784 hfsc_class_dump_stats /* class_dump_stats */
3787 /* "linux-default" traffic control class.
3789 * This class represents the default, unnamed Linux qdisc. It corresponds to
3790 * the "" (empty string) QoS type in the OVS database. */
/* Points 'netdev_''s tc at a shared, immutable singleton tc object. */
3793 default_install__(struct netdev *netdev_)
3795 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3796 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_default);
3798 /* Nothing but a tc class implementation is allowed to write to a tc. This
3799 * class never does that, so we can legitimately use a const tc object. */
3800 netdev->tc = CONST_CAST(struct tc *, &tc);
/* tc_install hook: 'details' is ignored; just installs the singleton. */
3804 default_tc_install(struct netdev *netdev,
3805 const struct smap *details OVS_UNUSED)
3807 default_install__(netdev);
/* tc_load hook: same as install; no kernel state to read back. */
3812 default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3814 default_install__(netdev);
/* Operations vector for the default qdisc; most hooks are unneeded. */
3818 static const struct tc_ops tc_ops_default = {
3819 NULL, /* linux_name */
3824 NULL, /* tc_destroy */
3825 NULL, /* qdisc_get */
3826 NULL, /* qdisc_set */
3827 NULL, /* class_get */
3828 NULL, /* class_set */
3829 NULL, /* class_delete */
3830 NULL, /* class_get_stats */
3831 NULL /* class_dump_stats */
3834 /* "linux-other" traffic control class.
/* tc_load hook for qdiscs OVS does not understand: installs a shared,
 * immutable singleton tc object in place of real state. */
3839 other_tc_load(struct netdev *netdev_, struct ofpbuf *nlmsg OVS_UNUSED)
3841 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3842 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_other);
3844 /* Nothing but a tc class implementation is allowed to write to a tc. This
3845 * class never does that, so we can legitimately use a const tc object. */
3846 netdev->tc = CONST_CAST(struct tc *, &tc);
/* Operations vector for "linux-other"; read-only placeholder, no hooks. */
3850 static const struct tc_ops tc_ops_other = {
3851 NULL, /* linux_name */
3852 "linux-other", /* ovs_name */
3854 NULL, /* tc_install */
3856 NULL, /* tc_destroy */
3857 NULL, /* qdisc_get */
3858 NULL, /* qdisc_set */
3859 NULL, /* class_get */
3860 NULL, /* class_set */
3861 NULL, /* class_delete */
3862 NULL, /* class_get_stats */
3863 NULL /* class_dump_stats */
3866 /* Traffic control. */
3868 /* Number of kernel "tc" ticks per second. */
3869 static double ticks_per_s;
3871 /* Number of kernel "jiffies" per second. This is used for the purpose of
3872 * computing buffer sizes. Generally kernel qdiscs need to be able to buffer
3873 * one jiffy's worth of data.
3875 * There are two possibilities here:
3877 * - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
3878 * approximate range of 100 to 1024. That means that we really need to
3879 * make sure that the qdisc can buffer that much data.
3881 * - 'buffer_hz' is an absurdly large number. That means that the kernel
3882 * has finely granular timers and there's no need to fudge additional room
3883 * for buffers. (There's no extra effort needed to implement that: the
3884 * large 'buffer_hz' is used as a divisor, so practically any number will
3885 * come out as 0 in the division. Small integer results in the case of
3886 * really high dividends won't have any real effect anyhow.)
3888 static unsigned int buffer_hz;
3890 /* Returns tc handle 'major':'minor'. */
3892 tc_make_handle(unsigned int major, unsigned int minor)
3894 return TC_H_MAKE(major << 16, minor);
3897 /* Returns the major number from 'handle'. */
3899 tc_get_major(unsigned int handle)
3901 return TC_H_MAJ(handle) >> 16;
3904 /* Returns the minor number from 'handle'. */
3906 tc_get_minor(unsigned int handle)
3908 return TC_H_MIN(handle);
3911 static struct tcmsg *
3912 tc_make_request(const struct netdev *netdev, int type, unsigned int flags,
3913 struct ofpbuf *request)
3915 struct tcmsg *tcmsg;
3919 error = get_ifindex(netdev, &ifindex);
3924 ofpbuf_init(request, 512);
3925 nl_msg_put_nlmsghdr(request, sizeof *tcmsg, type, NLM_F_REQUEST | flags);
3926 tcmsg = ofpbuf_put_zeros(request, sizeof *tcmsg);
3927 tcmsg->tcm_family = AF_UNSPEC;
3928 tcmsg->tcm_ifindex = ifindex;
3929 /* Caller should fill in tcmsg->tcm_handle. */
3930 /* Caller should fill in tcmsg->tcm_parent. */
/* Sends 'request' on the NETLINK_ROUTE socket, storing the reply (if
 * 'replyp' is nonnull) in '*replyp', and uninitializes 'request' in either
 * case.  Returns 0 on success, otherwise a positive errno value. */
static int
tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
{
    int error = nl_transact(NETLINK_ROUTE, request, replyp);
    ofpbuf_uninit(request);
    return error;
}
3943 /* Adds or deletes a root ingress qdisc on 'netdev'. We use this for
3944 * policing configuration.
3946 * This function is equivalent to running the following when 'add' is true:
3947 * /sbin/tc qdisc add dev <devname> handle ffff: ingress
3949 * This function is equivalent to running the following when 'add' is false:
3950 * /sbin/tc qdisc del dev <devname> handle ffff: ingress
3952 * The configuration and stats may be seen with the following command:
3953 * /sbin/tc -s qdisc show dev <devname>
3955 * Returns 0 if successful, otherwise a positive errno value.
3958 tc_add_del_ingress_qdisc(struct netdev *netdev, bool add)
3960 struct ofpbuf request;
3961 struct tcmsg *tcmsg;
3963 int type = add ? RTM_NEWQDISC : RTM_DELQDISC;
3964 int flags = add ? NLM_F_EXCL | NLM_F_CREATE : 0;
3966 tcmsg = tc_make_request(netdev, type, flags, &request);
3970 tcmsg->tcm_handle = tc_make_handle(0xffff, 0);
3971 tcmsg->tcm_parent = TC_H_INGRESS;
3972 nl_msg_put_string(&request, TCA_KIND, "ingress");
3973 nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0);
3975 error = tc_transact(&request, NULL);
3977 /* If we're deleting the qdisc, don't worry about some of the
3978 * error conditions. */
3979 if (!add && (error == ENOENT || error == EINVAL)) {
3988 /* Adds a policer to 'netdev' with a rate of 'kbits_rate' and a burst size
3991 * This function is equivalent to running:
3992 * /sbin/tc filter add dev <devname> parent ffff: protocol all prio 49
3993 * basic police rate <kbits_rate>kbit burst <kbits_burst>k
3996 * The configuration and stats may be seen with the following command:
3997 * /sbin/tc -s filter show <devname> eth0 parent ffff:
3999 * Returns 0 if successful, otherwise a positive errno value.
4002 tc_add_policer(struct netdev *netdev, int kbits_rate, int kbits_burst)
4004 struct tc_police tc_police;
4005 struct ofpbuf request;
4006 struct tcmsg *tcmsg;
4007 size_t basic_offset;
4008 size_t police_offset;
4012 memset(&tc_police, 0, sizeof tc_police);
4013 tc_police.action = TC_POLICE_SHOT;
4014 tc_police.mtu = mtu;
4015 tc_fill_rate(&tc_police.rate, (kbits_rate * 1000)/8, mtu);
4016 tc_police.burst = tc_bytes_to_ticks(tc_police.rate.rate,
4017 kbits_burst * 1024);
4019 tcmsg = tc_make_request(netdev, RTM_NEWTFILTER,
4020 NLM_F_EXCL | NLM_F_CREATE, &request);
4024 tcmsg->tcm_parent = tc_make_handle(0xffff, 0);
4025 tcmsg->tcm_info = tc_make_handle(49,
4026 (OVS_FORCE uint16_t) htons(ETH_P_ALL));
4028 nl_msg_put_string(&request, TCA_KIND, "basic");
4029 basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
4030 police_offset = nl_msg_start_nested(&request, TCA_BASIC_POLICE);
4031 nl_msg_put_unspec(&request, TCA_POLICE_TBF, &tc_police, sizeof tc_police);
4032 tc_put_rtab(&request, TCA_POLICE_RATE, &tc_police.rate);
4033 nl_msg_end_nested(&request, police_offset);
4034 nl_msg_end_nested(&request, basic_offset);
4036 error = tc_transact(&request, NULL);
4047 /* The values in psched are not individually very meaningful, but they are
4048 * important. The tables below show some values seen in the wild.
4052 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
4053 * (Before that, there are hints that it was 1000000000.)
4055 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
4059 * -----------------------------------
4060 * [1] 000c8000 000f4240 000f4240 00000064
4061 * [2] 000003e8 00000400 000f4240 3b9aca00
4062 * [3] 000003e8 00000400 000f4240 3b9aca00
4063 * [4] 000003e8 00000400 000f4240 00000064
4064 * [5] 000003e8 00000040 000f4240 3b9aca00
4065 * [6] 000003e8 00000040 000f4240 000000f9
4067 * a b c d ticks_per_s buffer_hz
4068 * ------- --------- ---------- ------------- ----------- -------------
4069 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
4070 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
4071 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
4072 * [4] 1,000 1,024 1,000,000 100 976,562 100
4073 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
4074 * [6] 1,000 64 1,000,000 249 15,625,000 249
4076 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
4077 * [2] 2.6.26-1-686-bigmem from Debian lenny
4078 * [3] 2.6.26-2-sparc64 from Debian lenny
4079 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
4080 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
4081 * [6] 2.6.34 from kernel.org on KVM
4083 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
4084 static const char fn[] = "/proc/net/psched";
4085 unsigned int a, b, c, d;
4088 if (!ovsthread_once_start(&once)) {
4095 stream = fopen(fn, "r");
4097 VLOG_WARN("%s: open failed: %s", fn, ovs_strerror(errno));
4101 if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
4102 VLOG_WARN("%s: read failed", fn);
4106 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
4110 VLOG_WARN("%s: invalid scheduler parameters", fn);
4114 ticks_per_s = (double) a * c / b;
4118 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
4121 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);
4124 ovsthread_once_done(&once);
4127 /* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
4128 * rate of 'rate' bytes per second. */
4130 tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
4133 return (rate * ticks) / ticks_per_s;
4136 /* Returns the number of ticks that it would take to transmit 'size' bytes at a
4137 * rate of 'rate' bytes per second. */
4139 tc_bytes_to_ticks(unsigned int rate, unsigned int size)
4142 return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
4145 /* Returns the number of bytes that need to be reserved for qdisc buffering at
4146 * a transmission rate of 'rate' bytes per second. */
4148 tc_buffer_per_jiffy(unsigned int rate)
4151 return rate / buffer_hz;
4154 /* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
4155 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
4156 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
4157 * stores NULL into it if it is absent.
4159 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
4162 * Returns 0 if successful, otherwise a positive errno value. */
4164 tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
4165 struct nlattr **options)
4167 static const struct nl_policy tca_policy[] = {
4168 [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
4169 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
4171 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
4173 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
4174 tca_policy, ta, ARRAY_SIZE(ta))) {
4175 VLOG_WARN_RL(&rl, "failed to parse qdisc message");
4180 *kind = nl_attr_get_string(ta[TCA_KIND]);
4184 *options = ta[TCA_OPTIONS];
4199 /* Given Netlink 'msg' that describes a class, extracts the queue ID (e.g. the
4200 * minor number of its class ID) into '*queue_id', its TCA_OPTIONS attribute
4201 * into '*options', and its queue statistics into '*stats'. Any of the output
4202 * arguments may be null.
4204 * Returns 0 if successful, otherwise a positive errno value. */
4206 tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
4207 struct nlattr **options, struct netdev_queue_stats *stats)
4209 static const struct nl_policy tca_policy[] = {
4210 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
4211 [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
4213 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
4215 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
4216 tca_policy, ta, ARRAY_SIZE(ta))) {
4217 VLOG_WARN_RL(&rl, "failed to parse class message");
4222 struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
4223 *handlep = tc->tcm_handle;
4227 *options = ta[TCA_OPTIONS];
4231 const struct gnet_stats_queue *gsq;
4232 struct gnet_stats_basic gsb;
4234 static const struct nl_policy stats_policy[] = {
4235 [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
4236 .min_len = sizeof gsb },
4237 [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
4238 .min_len = sizeof *gsq },
4240 struct nlattr *sa[ARRAY_SIZE(stats_policy)];
4242 if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
4243 sa, ARRAY_SIZE(sa))) {
4244 VLOG_WARN_RL(&rl, "failed to parse class stats");
4248 /* Alignment issues screw up the length of struct gnet_stats_basic on
4249 * some arch/bitsize combinations. Newer versions of Linux have a
4250 * struct gnet_stats_basic_packed, but we can't depend on that. The
4251 * easiest thing to do is just to make a copy. */
4252 memset(&gsb, 0, sizeof gsb);
4253 memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
4254 MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
4255 stats->tx_bytes = gsb.bytes;
4256 stats->tx_packets = gsb.packets;
4258 gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
4259 stats->tx_errors = gsq->drops;
4269 memset(stats, 0, sizeof *stats);
4274 /* Queries the kernel for class with identifier 'handle' and parent 'parent'
4277 tc_query_class(const struct netdev *netdev,
4278 unsigned int handle, unsigned int parent,
4279 struct ofpbuf **replyp)
4281 struct ofpbuf request;
4282 struct tcmsg *tcmsg;
4285 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO, &request);
4289 tcmsg->tcm_handle = handle;
4290 tcmsg->tcm_parent = parent;
4292 error = tc_transact(&request, replyp);
4294 VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
4295 netdev_get_name(netdev),
4296 tc_get_major(handle), tc_get_minor(handle),
4297 tc_get_major(parent), tc_get_minor(parent),
4298 ovs_strerror(error));
4303 /* Equivalent to "tc class del dev <name> handle <handle>". */
4305 tc_delete_class(const struct netdev *netdev, unsigned int handle)
4307 struct ofpbuf request;
4308 struct tcmsg *tcmsg;
4311 tcmsg = tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
4315 tcmsg->tcm_handle = handle;
4316 tcmsg->tcm_parent = 0;
4318 error = tc_transact(&request, NULL);
4320 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
4321 netdev_get_name(netdev),
4322 tc_get_major(handle), tc_get_minor(handle),
4323 ovs_strerror(error));
4328 /* Equivalent to "tc qdisc del dev <name> root". */
4330 tc_del_qdisc(struct netdev *netdev_)
4332 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4333 struct ofpbuf request;
4334 struct tcmsg *tcmsg;
4337 tcmsg = tc_make_request(netdev_, RTM_DELQDISC, 0, &request);
4341 tcmsg->tcm_handle = tc_make_handle(1, 0);
4342 tcmsg->tcm_parent = TC_H_ROOT;
4344 error = tc_transact(&request, NULL);
4345 if (error == EINVAL) {
4346 /* EINVAL probably means that the default qdisc was in use, in which
4347 * case we've accomplished our purpose. */
4350 if (!error && netdev->tc) {
4351 if (netdev->tc->ops->tc_destroy) {
4352 netdev->tc->ops->tc_destroy(netdev->tc);
4359 /* If 'netdev''s qdisc type and parameters are not yet known, queries the
4360 * kernel to determine what they are. Returns 0 if successful, otherwise a
4361 * positive errno value. */
4363 tc_query_qdisc(const struct netdev *netdev_)
4365 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4366 struct ofpbuf request, *qdisc;
4367 const struct tc_ops *ops;
4368 struct tcmsg *tcmsg;
4376 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
4377 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
4378 * 2.6.35 without that fix backported to it.
4380 * To avoid the OOPS, we must not make a request that would attempt to dump
4381 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
4382 * few others. There are a few ways that I can see to do this, but most of
4383 * them seem to be racy (and if you lose the race the kernel OOPSes). The
4384 * technique chosen here is to assume that any non-default qdisc that we
4385 * create will have a class with handle 1:0. The built-in qdiscs only have
4386 * a class with handle 0:0.
4388 * We could check for Linux 2.6.35+ and use a more straightforward method
4390 tcmsg = tc_make_request(netdev_, RTM_GETQDISC, NLM_F_ECHO, &request);
4394 tcmsg->tcm_handle = tc_make_handle(1, 0);
4395 tcmsg->tcm_parent = 0;
4397 /* Figure out what tc class to instantiate. */
4398 error = tc_transact(&request, &qdisc);
4402 error = tc_parse_qdisc(qdisc, &kind, NULL);
4404 ops = &tc_ops_other;
4406 ops = tc_lookup_linux_name(kind);
4408 static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
4409 VLOG_INFO_RL(&rl2, "unknown qdisc \"%s\"", kind);
4411 ops = &tc_ops_other;
4414 } else if (error == ENOENT) {
4415 /* Either it's a built-in qdisc, or it's a qdisc set up by some
4416 * other entity that doesn't have a handle 1:0. We will assume
4417 * that it's the system default qdisc. */
4418 ops = &tc_ops_default;
4421 /* Who knows? Maybe the device got deleted. */
4422 VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
4423 netdev_get_name(netdev_), ovs_strerror(error));
4424 ops = &tc_ops_other;
4427 /* Instantiate it. */
4428 load_error = ops->tc_load(CONST_CAST(struct netdev *, netdev_), qdisc);
4429 ovs_assert((load_error == 0) == (netdev->tc != NULL));
4430 ofpbuf_delete(qdisc);
4432 return error ? error : load_error;
4435 /* Linux traffic control uses tables with 256 entries ("rtab" tables) to
4436 approximate the time to transmit packets of various lengths. For an MTU of
4437 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
4438 represents two possible packet lengths; for a MTU of 513 through 1024, four
4439 possible lengths; and so on.
4441 Returns, for the specified 'mtu', the number of bits that packet lengths
4442 need to be shifted right to fit within such a 256-entry table. */
4444 tc_calc_cell_log(unsigned int mtu)
4449 mtu = ETH_PAYLOAD_MAX;
4451 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
4453 for (cell_log = 0; mtu >= 256; cell_log++) {
4460 /* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
4463 tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
4465 memset(rate, 0, sizeof *rate);
4466 rate->cell_log = tc_calc_cell_log(mtu);
4467 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
4468 /* rate->cell_align = 0; */ /* distro headers. */
4469 rate->mpu = ETH_TOTAL_MIN;
/* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
 * attribute of the specified "type".
 *
 * See tc_calc_cell_log() above for a description of "rtab"s. */
static void
tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
{
    uint32_t *rtab;
    unsigned int i;

    rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
    for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
        /* Entry i covers packets up to (i + 1) << cell_log bytes, but never
         * less than the minimum packet unit. */
        unsigned packet_size = (i + 1) << rate->cell_log;
        if (packet_size < rate->mpu) {
            packet_size = rate->mpu;
        }
        rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
    }
}
/* Calculates the proper value of 'buffer' or 'cbuffer' in HTB options given a
 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
 * burst size of 'burst_bytes'.  (If no value was requested, a 'burst_bytes'
 * of 0 is fine.) */
static unsigned int
tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
{
    unsigned int min_burst = tc_buffer_per_jiffy(Bps) + mtu;
    return tc_bytes_to_ticks(Bps, MAX(burst_bytes, min_burst));
}
4504 /* Linux-only functions declared in netdev-linux.h */
4506 /* Modifies the 'flag' bit in ethtool's flags field for 'netdev'. If
4507 * 'enable' is true, the bit is set. Otherwise, it is cleared. */
4509 netdev_linux_ethtool_set_flag(struct netdev *netdev, uint32_t flag,
4510 const char *flag_name, bool enable)
4512 const char *netdev_name = netdev_get_name(netdev);
4513 struct ethtool_value evalue;
4517 COVERAGE_INC(netdev_get_ethtool);
4518 memset(&evalue, 0, sizeof evalue);
4519 error = netdev_linux_do_ethtool(netdev_name,
4520 (struct ethtool_cmd *)&evalue,
4521 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
4526 COVERAGE_INC(netdev_set_ethtool);
4527 evalue.data = new_flags = (evalue.data & ~flag) | (enable ? flag : 0);
4528 error = netdev_linux_do_ethtool(netdev_name,
4529 (struct ethtool_cmd *)&evalue,
4530 ETHTOOL_SFLAGS, "ETHTOOL_SFLAGS");
4535 COVERAGE_INC(netdev_get_ethtool);
4536 memset(&evalue, 0, sizeof evalue);
4537 error = netdev_linux_do_ethtool(netdev_name,
4538 (struct ethtool_cmd *)&evalue,
4539 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
4544 if (new_flags != evalue.data) {
4545 VLOG_WARN_RL(&rl, "attempt to %s ethtool %s flag on network "
4546 "device %s failed", enable ? "enable" : "disable",
4547 flag_name, netdev_name);
4554 /* Utility functions. */
4556 /* Copies 'src' into 'dst', performing format conversion in the process. */
4558 netdev_stats_from_rtnl_link_stats(struct netdev_stats *dst,
4559 const struct rtnl_link_stats *src)
4561 dst->rx_packets = src->rx_packets;
4562 dst->tx_packets = src->tx_packets;
4563 dst->rx_bytes = src->rx_bytes;
4564 dst->tx_bytes = src->tx_bytes;
4565 dst->rx_errors = src->rx_errors;
4566 dst->tx_errors = src->tx_errors;
4567 dst->rx_dropped = src->rx_dropped;
4568 dst->tx_dropped = src->tx_dropped;
4569 dst->multicast = src->multicast;
4570 dst->collisions = src->collisions;
4571 dst->rx_length_errors = src->rx_length_errors;
4572 dst->rx_over_errors = src->rx_over_errors;
4573 dst->rx_crc_errors = src->rx_crc_errors;
4574 dst->rx_frame_errors = src->rx_frame_errors;
4575 dst->rx_fifo_errors = src->rx_fifo_errors;
4576 dst->rx_missed_errors = src->rx_missed_errors;
4577 dst->tx_aborted_errors = src->tx_aborted_errors;
4578 dst->tx_carrier_errors = src->tx_carrier_errors;
4579 dst->tx_fifo_errors = src->tx_fifo_errors;
4580 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
4581 dst->tx_window_errors = src->tx_window_errors;
4585 get_stats_via_netlink(const struct netdev *netdev_, struct netdev_stats *stats)
4587 struct ofpbuf request;
4588 struct ofpbuf *reply;
4591 ofpbuf_init(&request, 0);
4592 nl_msg_put_nlmsghdr(&request,
4593 sizeof(struct ifinfomsg) + NL_ATTR_SIZE(IFNAMSIZ),
4594 RTM_GETLINK, NLM_F_REQUEST);
4595 ofpbuf_put_zeros(&request, sizeof(struct ifinfomsg));
4596 nl_msg_put_string(&request, IFLA_IFNAME, netdev_get_name(netdev_));
4597 error = nl_transact(NETLINK_ROUTE, &request, &reply);
4598 ofpbuf_uninit(&request);
4603 if (ofpbuf_try_pull(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg))) {
4604 const struct nlattr *a = nl_attr_find(reply, 0, IFLA_STATS);
4605 if (a && nl_attr_get_size(a) >= sizeof(struct rtnl_link_stats)) {
4606 netdev_stats_from_rtnl_link_stats(stats, nl_attr_get(a));
4609 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
4613 VLOG_WARN_RL(&rl, "short RTM_GETLINK reply");
4618 ofpbuf_delete(reply);
4623 get_flags(const struct netdev *dev, unsigned int *flags)
4629 error = af_inet_ifreq_ioctl(dev->name, &ifr, SIOCGIFFLAGS, "SIOCGIFFLAGS");
4631 *flags = ifr.ifr_flags;
4637 set_flags(const char *name, unsigned int flags)
4641 ifr.ifr_flags = flags;
4642 return af_inet_ifreq_ioctl(name, &ifr, SIOCSIFFLAGS, "SIOCSIFFLAGS");
4646 do_get_ifindex(const char *netdev_name)
4651 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4652 COVERAGE_INC(netdev_get_ifindex);
4654 error = af_inet_ioctl(SIOCGIFINDEX, &ifr);
4656 VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
4657 netdev_name, ovs_strerror(error));
4660 return ifr.ifr_ifindex;
4664 get_ifindex(const struct netdev *netdev_, int *ifindexp)
4666 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4668 if (!(netdev->cache_valid & VALID_IFINDEX)) {
4669 int ifindex = do_get_ifindex(netdev_get_name(netdev_));
4672 netdev->get_ifindex_error = -ifindex;
4673 netdev->ifindex = 0;
4675 netdev->get_ifindex_error = 0;
4676 netdev->ifindex = ifindex;
4678 netdev->cache_valid |= VALID_IFINDEX;
4681 *ifindexp = netdev->ifindex;
4682 return netdev->get_ifindex_error;
4686 get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN])
4692 memset(&ifr, 0, sizeof ifr);
4693 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4694 COVERAGE_INC(netdev_get_hwaddr);
4695 error = af_inet_ioctl(SIOCGIFHWADDR, &ifr);
4697 /* ENODEV probably means that a vif disappeared asynchronously and
4698 * hasn't been removed from the database yet, so reduce the log level
4699 * to INFO for that case. */
4700 VLOG(error == ENODEV ? VLL_INFO : VLL_ERR,
4701 "ioctl(SIOCGIFHWADDR) on %s device failed: %s",
4702 netdev_name, ovs_strerror(error));
4705 hwaddr_family = ifr.ifr_hwaddr.sa_family;
4706 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER) {
4707 VLOG_WARN("%s device has unknown hardware address family %d",
4708 netdev_name, hwaddr_family);
4710 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
4715 set_etheraddr(const char *netdev_name,
4716 const uint8_t mac[ETH_ADDR_LEN])
4721 memset(&ifr, 0, sizeof ifr);
4722 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4723 ifr.ifr_hwaddr.sa_family = ARPHRD_ETHER;
4724 memcpy(ifr.ifr_hwaddr.sa_data, mac, ETH_ADDR_LEN);
4725 COVERAGE_INC(netdev_set_hwaddr);
4726 error = af_inet_ioctl(SIOCSIFHWADDR, &ifr);
4728 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
4729 netdev_name, ovs_strerror(error));
4735 netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
4736 int cmd, const char *cmd_name)
4741 memset(&ifr, 0, sizeof ifr);
4742 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
4743 ifr.ifr_data = (caddr_t) ecmd;
4746 error = af_inet_ioctl(SIOCETHTOOL, &ifr);
4748 if (error != EOPNOTSUPP) {
4749 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
4750 "failed: %s", cmd_name, name, ovs_strerror(error));
4752 /* The device doesn't support this operation. That's pretty
4753 * common, so there's no point in logging anything. */
4760 netdev_linux_get_ipv4(const struct netdev *netdev, struct in_addr *ip,
4761 int cmd, const char *cmd_name)
4766 ifr.ifr_addr.sa_family = AF_INET;
4767 error = af_inet_ifreq_ioctl(netdev_get_name(netdev), &ifr, cmd, cmd_name);
4769 const struct sockaddr_in *sin = ALIGNED_CAST(struct sockaddr_in *,
4771 *ip = sin->sin_addr;
4776 /* Returns an AF_PACKET raw socket or a negative errno value. */
4778 af_packet_sock(void)
4780 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
4783 if (ovsthread_once_start(&once)) {
4784 sock = socket(AF_PACKET, SOCK_RAW, 0);
4786 int error = set_nonblocking(sock);
4793 VLOG_ERR("failed to create packet socket: %s",
4794 ovs_strerror(errno));
4796 ovsthread_once_done(&once);