2 * Copyright (c) 2009, 2010, 2011, 2012, 2013, 2014 Nicira, Inc.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
19 #include "netdev-linux.h"
23 #include <arpa/inet.h>
25 #include <linux/filter.h>
26 #include <linux/gen_stats.h>
27 #include <linux/if_ether.h>
28 #include <linux/if_tun.h>
29 #include <linux/types.h>
30 #include <linux/ethtool.h>
31 #include <linux/mii.h>
32 #include <linux/pkt_cls.h>
33 #include <linux/pkt_sched.h>
34 #include <linux/rtnetlink.h>
35 #include <linux/sockios.h>
36 #include <linux/version.h>
37 #include <sys/types.h>
38 #include <sys/ioctl.h>
39 #include <sys/socket.h>
40 #include <netpacket/packet.h>
42 #include <net/if_arp.h>
43 #include <net/if_packet.h>
44 #include <net/route.h>
45 #include <netinet/in.h>
51 #include "connectivity.h"
53 #include "dpif-linux.h"
54 #include "dynamic-string.h"
55 #include "fatal-signal.h"
58 #include "netdev-provider.h"
59 #include "netdev-vport.h"
60 #include "netlink-notifier.h"
61 #include "netlink-socket.h"
64 #include "openflow/openflow.h"
65 #include "ovs-atomic.h"
67 #include "poll-loop.h"
68 #include "rtnetlink-link.h"
71 #include "socket-util.h"
74 #include "unaligned.h"
77 VLOG_DEFINE_THIS_MODULE(netdev_linux);
79 COVERAGE_DEFINE(netdev_set_policing);
80 COVERAGE_DEFINE(netdev_arp_lookup);
81 COVERAGE_DEFINE(netdev_get_ifindex);
82 COVERAGE_DEFINE(netdev_get_hwaddr);
83 COVERAGE_DEFINE(netdev_set_hwaddr);
84 COVERAGE_DEFINE(netdev_get_ethtool);
85 COVERAGE_DEFINE(netdev_set_ethtool);
88 /* These were introduced in Linux 2.6.14, so they might be missing if we have
90 #ifndef ADVERTISED_Pause
91 #define ADVERTISED_Pause (1 << 13)
93 #ifndef ADVERTISED_Asym_Pause
94 #define ADVERTISED_Asym_Pause (1 << 14)
97 /* These were introduced in Linux 2.6.24, so they might be missing if we
98 * have old headers. */
99 #ifndef ETHTOOL_GFLAGS
100 #define ETHTOOL_GFLAGS 0x00000025 /* Get flags bitmap(ethtool_value) */
102 #ifndef ETHTOOL_SFLAGS
103 #define ETHTOOL_SFLAGS 0x00000026 /* Set flags bitmap(ethtool_value) */
106 /* This was introduced in Linux 2.6.25, so it might be missing if we have old
109 #define TC_RTAB_SIZE 1024
112 /* Linux 2.6.21 introduced struct tpacket_auxdata.
113 * Linux 2.6.27 added the tp_vlan_tci member.
114 * Linux 3.0 defined TP_STATUS_VLAN_VALID.
115 * Linux 3.13 repurposed a padding member for tp_vlan_tpid and defined
116 * TP_STATUS_VLAN_TPID_VALID.
118 * With all this churn it's easiest to unconditionally define a replacement
119 * structure that has everything we want.
121 #ifndef PACKET_AUXDATA
122 #define PACKET_AUXDATA 8
124 #ifndef TP_STATUS_VLAN_VALID
125 #define TP_STATUS_VLAN_VALID (1 << 4)
127 #ifndef TP_STATUS_VLAN_TPID_VALID
128 #define TP_STATUS_VLAN_TPID_VALID (1 << 6)
130 #undef tpacket_auxdata
131 #define tpacket_auxdata rpl_tpacket_auxdata
132 struct tpacket_auxdata {
138 uint16_t tp_vlan_tci;
139 uint16_t tp_vlan_tpid;
143 VALID_IFINDEX = 1 << 0,
144 VALID_ETHERADDR = 1 << 1,
148 VALID_POLICING = 1 << 5,
149 VALID_VPORT_STAT_ERROR = 1 << 6,
150 VALID_DRVINFO = 1 << 7,
151 VALID_FEATURES = 1 << 8,
154 /* Traffic control. */
156 /* An instance of a traffic control class. Always associated with a particular
159 * Each TC implementation subclasses this with whatever additional data it
162 const struct tc_ops *ops;
163 struct hmap queues; /* Contains "struct tc_queue"s.
164 * Read by generic TC layer.
165 * Written only by TC implementation. */
168 #define TC_INITIALIZER(TC, OPS) { OPS, HMAP_INITIALIZER(&(TC)->queues) }
170 /* One traffic control queue.
172 * Each TC implementation subclasses this with whatever additional data it
175 struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
176 unsigned int queue_id; /* OpenFlow queue ID. */
177 long long int created; /* Time queue was created, in msecs. */
180 /* A particular kind of traffic control. Each implementation generally maps to
181 * one particular Linux qdisc class.
183 * The functions below return 0 if successful or a positive errno value on
184 * failure, except where otherwise noted. All of them must be provided, except
185 * where otherwise noted. */
187 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
188 * This is null for tc_ops_default and tc_ops_other, for which there are no
189 * appropriate values. */
190 const char *linux_name;
192 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
193 const char *ovs_name;
195 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
196 * queues. The queues are numbered 0 through n_queues - 1. */
197 unsigned int n_queues;
199 /* Called to install this TC class on 'netdev'. The implementation should
200 * make the Netlink calls required to set up 'netdev' with the right qdisc
201 * and configure it according to 'details'. The implementation may assume
202 * that the current qdisc is the default; that is, there is no need for it
203 * to delete the current qdisc before installing itself.
205 * The contents of 'details' should be documented as valid for 'ovs_name'
206 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
207 * (which is built as ovs-vswitchd.conf.db(8)).
209 * This function must return 0 if and only if it sets 'netdev->tc' to an
210 * initialized 'struct tc'.
212 * (This function is null for tc_ops_other, which cannot be installed. For
213 * other TC classes it should always be nonnull.) */
214 int (*tc_install)(struct netdev *netdev, const struct smap *details);
216 /* Called when the netdev code determines (through a Netlink query) that
217 * this TC class's qdisc is installed on 'netdev', but we didn't install
218 * it ourselves and so don't know any of the details.
220 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
221 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
222 * implementation should parse the other attributes of 'nlmsg' as
223 * necessary to determine its configuration. If necessary it should also
224 * use Netlink queries to determine the configuration of queues on
227 * This function must return 0 if and only if it sets 'netdev->tc' to an
228 * initialized 'struct tc'. */
229 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
231 /* Destroys the data structures allocated by the implementation as part of
232 * 'tc'. (This includes destroying 'tc->queues' by calling
235 * The implementation should not need to perform any Netlink calls. If
236 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
237 * (But it may not be desirable.)
239 * This function may be null if 'tc' is trivial. */
240 void (*tc_destroy)(struct tc *tc);
242 /* Retrieves details of 'netdev->tc' configuration into 'details'.
244 * The implementation should not need to perform any Netlink calls, because
245 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
246 * cached the configuration.
248 * The contents of 'details' should be documented as valid for 'ovs_name'
249 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
250 * (which is built as ovs-vswitchd.conf.db(8)).
252 * This function may be null if 'tc' is not configurable.
254 int (*qdisc_get)(const struct netdev *netdev, struct smap *details);
256 /* Reconfigures 'netdev->tc' according to 'details', performing any
257 * required Netlink calls to complete the reconfiguration.
259 * The contents of 'details' should be documented as valid for 'ovs_name'
260 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
261 * (which is built as ovs-vswitchd.conf.db(8)).
263 * This function may be null if 'tc' is not configurable.
265 int (*qdisc_set)(struct netdev *, const struct smap *details);
267 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
268 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
270 * The contents of 'details' should be documented as valid for 'ovs_name'
271 * in the "other_config" column in the "Queue" table in
272 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
274 * The implementation should not need to perform any Netlink calls, because
275 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
276 * cached the queue configuration.
278 * This function may be null if 'tc' does not have queues ('n_queues' is
280 int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
281 struct smap *details);
283 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
284 * 'details', perfoming any required Netlink calls to complete the
285 * reconfiguration. The caller ensures that 'queue_id' is less than
288 * The contents of 'details' should be documented as valid for 'ovs_name'
289 * in the "other_config" column in the "Queue" table in
290 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
292 * This function may be null if 'tc' does not have queues or its queues are
293 * not configurable. */
294 int (*class_set)(struct netdev *, unsigned int queue_id,
295 const struct smap *details);
297 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
298 * tc_queue's within 'netdev->tc->queues'.
300 * This function may be null if 'tc' does not have queues or its queues
301 * cannot be deleted. */
302 int (*class_delete)(struct netdev *, struct tc_queue *queue);
304 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
305 * 'struct tc_queue's within 'netdev->tc->queues'.
307 * On success, initializes '*stats'.
309 * This function may be null if 'tc' does not have queues or if it cannot
310 * report queue statistics. */
311 int (*class_get_stats)(const struct netdev *netdev,
312 const struct tc_queue *queue,
313 struct netdev_queue_stats *stats);
315 /* Extracts queue stats from 'nlmsg', which is a response to a
316 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
318 * This function may be null if 'tc' does not have queues or if it cannot
319 * report queue statistics. */
320 int (*class_dump_stats)(const struct netdev *netdev,
321 const struct ofpbuf *nlmsg,
322 netdev_dump_queue_stats_cb *cb, void *aux);
/* Initializes 'tc' as a generic traffic-control instance backed by 'ops',
 * with an (initially empty) map of queues. */
326 tc_init(struct tc *tc, const struct tc_ops *ops)
329 hmap_init(&tc->queues);
/* Releases the generic traffic-control state in 'tc' (the queues hmap).
 * TC-implementation-specific state is freed by the implementation's own
 * tc_destroy callback, not here. */
333 tc_destroy(struct tc *tc)
335 hmap_destroy(&tc->queues);
338 static const struct tc_ops tc_ops_htb;
339 static const struct tc_ops tc_ops_hfsc;
340 static const struct tc_ops tc_ops_default;
341 static const struct tc_ops tc_ops_other;
343 static const struct tc_ops *const tcs[] = {
344 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
345 &tc_ops_hfsc, /* Hierarchical fair service curve. */
346 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
347 &tc_ops_other, /* Some other qdisc. */
351 static unsigned int tc_make_handle(unsigned int major, unsigned int minor);
352 static unsigned int tc_get_major(unsigned int handle);
353 static unsigned int tc_get_minor(unsigned int handle);
355 static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
356 static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
357 static unsigned int tc_buffer_per_jiffy(unsigned int rate);
359 static struct tcmsg *tc_make_request(const struct netdev *, int type,
360 unsigned int flags, struct ofpbuf *);
361 static int tc_transact(struct ofpbuf *request, struct ofpbuf **replyp);
362 static int tc_add_del_ingress_qdisc(struct netdev *netdev, bool add);
363 static int tc_add_policer(struct netdev *netdev, int kbits_rate,
366 static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
367 struct nlattr **options);
368 static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
369 struct nlattr **options,
370 struct netdev_queue_stats *);
371 static int tc_query_class(const struct netdev *,
372 unsigned int handle, unsigned int parent,
373 struct ofpbuf **replyp);
374 static int tc_delete_class(const struct netdev *, unsigned int handle);
376 static int tc_del_qdisc(struct netdev *netdev);
377 static int tc_query_qdisc(const struct netdev *netdev);
379 static int tc_calc_cell_log(unsigned int mtu);
380 static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
381 static void tc_put_rtab(struct ofpbuf *, uint16_t type,
382 const struct tc_ratespec *rate);
383 static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
385 struct netdev_linux {
388 /* Protects all members below. */
389 struct ovs_mutex mutex;
391 unsigned int cache_valid;
393 bool miimon; /* Link status of last poll. */
394 long long int miimon_interval; /* Miimon Poll rate. Disabled if <= 0. */
395 struct timer miimon_timer;
397 /* The following are figured out "on demand" only. They are only valid
398 * when the corresponding VALID_* bit in 'cache_valid' is set. */
400 uint8_t etheraddr[ETH_ADDR_LEN];
401 struct in_addr address, netmask;
404 unsigned int ifi_flags;
405 long long int carrier_resets;
406 uint32_t kbits_rate; /* Policing data. */
407 uint32_t kbits_burst;
408 int vport_stats_error; /* Cached error code from vport_get_stats().
409 0 or an errno value. */
410 int netdev_mtu_error; /* Cached error code from SIOCGIFMTU or SIOCSIFMTU. */
411 int ether_addr_error; /* Cached error code from set/get etheraddr. */
412 int netdev_policing_error; /* Cached error code from set policing. */
413 int get_features_error; /* Cached error code from ETHTOOL_GSET. */
414 int get_ifindex_error; /* Cached error code from SIOCGIFINDEX. */
416 enum netdev_features current; /* Cached from ETHTOOL_GSET. */
417 enum netdev_features advertised; /* Cached from ETHTOOL_GSET. */
418 enum netdev_features supported; /* Cached from ETHTOOL_GSET. */
420 struct ethtool_drvinfo drvinfo; /* Cached from ETHTOOL_GDRVINFO. */
423 /* For devices of class netdev_tap_class only. */
427 struct netdev_rx_linux {
433 /* This is set pretty low because we probably won't learn anything from the
434 * additional log messages. */
435 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
437 /* Polling miimon status for all ports causes performance degradation when
438 * handling a large number of ports. If there are no devices using miimon, then
439 * we skip netdev_linux_miimon_run() and netdev_linux_miimon_wait(). */
440 static atomic_int miimon_cnt = ATOMIC_VAR_INIT(0);
442 static void netdev_linux_run(void);
444 static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
445 int cmd, const char *cmd_name);
446 static int netdev_linux_get_ipv4(const struct netdev *, struct in_addr *,
447 int cmd, const char *cmd_name);
448 static int get_flags(const struct netdev *, unsigned int *flags);
449 static int set_flags(const char *, unsigned int flags);
450 static int update_flags(struct netdev_linux *netdev, enum netdev_flags off,
451 enum netdev_flags on, enum netdev_flags *old_flagsp)
452 OVS_REQUIRES(netdev->mutex);
453 static int do_get_ifindex(const char *netdev_name);
454 static int get_ifindex(const struct netdev *, int *ifindexp);
455 static int do_set_addr(struct netdev *netdev,
456 int ioctl_nr, const char *ioctl_name,
457 struct in_addr addr);
458 static int get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN]);
459 static int set_etheraddr(const char *netdev_name, const uint8_t[ETH_ADDR_LEN]);
460 static int get_stats_via_netlink(const struct netdev *, struct netdev_stats *);
461 static int af_packet_sock(void);
462 static bool netdev_linux_miimon_enabled(void);
463 static void netdev_linux_miimon_run(void);
464 static void netdev_linux_miimon_wait(void);
/* Returns true if 'netdev_class' is one of the Linux-based classes in this
 * file.  All such classes share netdev_linux_run() as their 'run' callback,
 * so identity of that pointer identifies the family. */
467 is_netdev_linux_class(const struct netdev_class *netdev_class)
469 return netdev_class->run == netdev_linux_run;
/* Returns true if 'netdev' is a tap device (netdev_tap_class). */
473 is_tap_netdev(const struct netdev *netdev)
475 return netdev_get_class(netdev) == &netdev_tap_class;
/* Downcasts generic 'netdev' to its enclosing struct netdev_linux.
 * Asserts that 'netdev' really belongs to a Linux netdev class first, so a
 * bad cast fails loudly rather than corrupting memory. */
478 static struct netdev_linux *
479 netdev_linux_cast(const struct netdev *netdev)
481 ovs_assert(is_netdev_linux_class(netdev_get_class(netdev)));
483 return CONTAINER_OF(netdev, struct netdev_linux, up);
/* Downcasts generic 'rx' to its enclosing struct netdev_rx_linux, asserting
 * that the owning netdev is of a Linux netdev class. */
486 static struct netdev_rx_linux *
487 netdev_rx_linux_cast(const struct netdev_rx *rx)
489 ovs_assert(is_netdev_linux_class(netdev_get_class(rx->netdev)));
490 return CONTAINER_OF(rx, struct netdev_rx_linux, up);
493 static void netdev_linux_update(struct netdev_linux *netdev,
494 const struct rtnetlink_link_change *)
495 OVS_REQUIRES(netdev->mutex);
496 static void netdev_linux_changed(struct netdev_linux *netdev,
497 unsigned int ifi_flags, unsigned int mask)
498 OVS_REQUIRES(netdev->mutex);
500 /* Returns a NETLINK_ROUTE socket listening for RTNLGRP_LINK changes, or NULL
501 * if no such socket could be created. */
502 static struct nl_sock *
503 netdev_linux_notify_sock(void)
/* The socket is created once, lazily, and shared by all callers; 'once'
 * guards the one-time initialization so this is safe from multiple threads. */
505 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
506 static struct nl_sock *sock;
508 if (ovsthread_once_start(&once)) {
511 error = nl_sock_create(NETLINK_ROUTE, &sock);
513 error = nl_sock_join_mcgroup(sock, RTNLGRP_LINK);
/* Joining the multicast group failed: tear the socket down so future
 * callers consistently see NULL instead of a half-configured socket. */
515 nl_sock_destroy(sock);
519 ovsthread_once_done(&once);
/* Returns true if any netdev currently has miimon polling enabled.
 * Reads the global atomic counter 'miimon_cnt' so callers can cheaply skip
 * miimon work when no device uses it. */
526 netdev_linux_miimon_enabled(void)
530 atomic_read(&miimon_cnt, &miimon);
/* Periodic 'run' callback shared by all Linux netdev classes.
 *
 * Polls miimon (if any device enabled it) and then drains the shared
 * rtnetlink notification socket, pushing each RTM link change into the
 * matching netdev's cached state.  On ENOBUFS (kernel dropped
 * notifications) it falls back to refreshing the flags of every device. */
535 netdev_linux_run(void)
537 struct nl_sock *sock;
540 if (netdev_linux_miimon_enabled()) {
541 netdev_linux_miimon_run();
/* May be NULL if the notification socket could not be created. */
544 sock = netdev_linux_notify_sock();
550 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
551 uint64_t buf_stub[4096 / 8];
/* Stack-backed buffer avoids a heap allocation per received message. */
554 ofpbuf_use_stub(&buf, buf_stub, sizeof buf_stub);
555 error = nl_sock_recv(sock, &buf, false);
557 struct rtnetlink_link_change change;
559 if (rtnetlink_link_parse(&buf, &change)) {
560 struct netdev *netdev_ = netdev_from_name(change.ifname);
561 if (netdev_ && is_netdev_linux_class(netdev_->netdev_class)) {
562 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
564 ovs_mutex_lock(&netdev->mutex);
565 netdev_linux_update(netdev, &change);
566 ovs_mutex_unlock(&netdev->mutex);
/* netdev_from_name() took a reference; drop it (no-op if NULL). */
568 netdev_close(netdev_);
570 } else if (error == ENOBUFS) {
571 struct shash device_shash;
572 struct shash_node *node;
/* Kernel socket buffer overflowed, so some notifications were lost.
 * Conservatively re-query flags for every known device. */
576 shash_init(&device_shash);
577 netdev_get_devices(&netdev_linux_class, &device_shash);
578 SHASH_FOR_EACH (node, &device_shash) {
579 struct netdev *netdev_ = node->data;
580 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
583 ovs_mutex_lock(&netdev->mutex);
584 get_flags(netdev_, &flags);
/* mask of 0 invalidates all cached state for the device. */
585 netdev_linux_changed(netdev, flags, 0);
586 ovs_mutex_unlock(&netdev->mutex);
588 netdev_close(netdev_);
590 shash_destroy(&device_shash);
591 } else if (error != EAGAIN) {
592 VLOG_WARN_RL(&rl, "error reading or parsing netlink (%s)",
593 ovs_strerror(error));
/* 'wait' callback: arranges for poll_block() to wake when miimon is due to
 * be polled or when the rtnetlink notification socket becomes readable. */
600 netdev_linux_wait(void)
602 struct nl_sock *sock;
604 if (netdev_linux_miimon_enabled()) {
605 netdev_linux_miimon_wait();
607 sock = netdev_linux_notify_sock();
609 nl_sock_wait(sock, POLLIN);
/* Records that 'dev' changed: bumps the global connectivity sequence,
 * counts a carrier reset if IFF_RUNNING toggled, stores the new interface
 * flags, and invalidates every cached-field bit not present in 'mask'.
 * Caller must hold dev->mutex (enforced by OVS_REQUIRES). */
614 netdev_linux_changed(struct netdev_linux *dev,
615 unsigned int ifi_flags, unsigned int mask)
616 OVS_REQUIRES(dev->mutex)
618 seq_change(connectivity_seq_get());
620 if ((dev->ifi_flags ^ ifi_flags) & IFF_RUNNING) {
621 dev->carrier_resets++;
623 dev->ifi_flags = ifi_flags;
/* Keep only the VALID_* bits the caller says are still trustworthy. */
625 dev->cache_valid &= mask;
/* Applies an rtnetlink link-change message to 'dev''s cached state.
 *
 * For RTM_NEWLINK, refreshes MTU, Ethernet address, and ifindex directly
 * from the message (marking those caches valid), keeping VALID_DRVINFO since
 * driver info does not change with link state.  For other message types
 * (e.g. RTM_DELLINK) it invalidates everything.  Caller holds dev->mutex. */
629 netdev_linux_update(struct netdev_linux *dev,
630 const struct rtnetlink_link_change *change)
631 OVS_REQUIRES(dev->mutex)
633 if (change->nlmsg_type == RTM_NEWLINK) {
635 netdev_linux_changed(dev, change->ifi_flags, VALID_DRVINFO);
637 /* Update netdev from rtnl-change msg. */
639 dev->mtu = change->mtu;
640 dev->cache_valid |= VALID_MTU;
641 dev->netdev_mtu_error = 0;
/* An all-zero address in the message means "not reported", not a real
 * hardware address, so only cache a nonzero one. */
644 if (!eth_addr_is_zero(change->addr)) {
645 memcpy(dev->etheraddr, change->addr, ETH_ADDR_LEN);
646 dev->cache_valid |= VALID_ETHERADDR;
647 dev->ether_addr_error = 0;
650 dev->ifindex = change->ifi_index;
651 dev->cache_valid |= VALID_IFINDEX;
652 dev->get_ifindex_error = 0;
/* Non-NEWLINK change: flags update with mask 0 invalidates all caches. */
655 netdev_linux_changed(dev, change->ifi_flags, 0);
/* 'alloc' callback: allocates a zeroed netdev_linux and returns the
 * embedded generic netdev ('up'). */
659 static struct netdev *
660 netdev_linux_alloc(void)
662 struct netdev_linux *netdev = xzalloc(sizeof *netdev);
/* Construction steps shared by system, internal, and tap devices:
 * initializes the per-device mutex. */
667 netdev_linux_common_construct(struct netdev_linux *netdev)
669 ovs_mutex_init(&netdev->mutex);
672 /* Creates system and internal devices. */
674 netdev_linux_construct(struct netdev *netdev_)
676 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
679 netdev_linux_common_construct(netdev);
/* Probe the device by fetching its flags; this also seeds ifi_flags. */
681 error = get_flags(&netdev->up, &netdev->ifi_flags);
682 if (error == ENODEV) {
683 if (netdev->up.netdev_class != &netdev_internal_class) {
684 /* The device does not exist, so don't allow it to be opened. */
687 /* "Internal" netdevs have to be created as netdev objects before
688 * they exist in the kernel, because creating them in the kernel
689 * happens by passing a netdev object to dpif_port_add().
690 * Therefore, ignore the error. */
697 /* For most types of netdevs we open the device for each call of
698 * netdev_open(). However, this is not the case with tap devices,
699 * since it is only possible to open the device once. In this
700 * situation we share a single file descriptor, and consequently
701 * buffers, across all readers. Therefore once data is read it will
702 * be unavailable to other reads for tap devices. */
704 netdev_linux_construct_tap(struct netdev *netdev_)
706 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
707 static const char tap_dev[] = "/dev/net/tun";
708 const char *name = netdev_->name;
712 netdev_linux_common_construct(netdev);
714 /* Open tap device. */
715 netdev->tap_fd = open(tap_dev, O_RDWR);
716 if (netdev->tap_fd < 0) {
718 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, ovs_strerror(error));
722 /* Create tap device. */
/* IFF_NO_PI suppresses the kernel's packet-information header so reads
 * and writes carry raw Ethernet frames. */
723 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
724 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
725 if (ioctl(netdev->tap_fd, TUNSETIFF, &ifr) == -1) {
726 VLOG_WARN("%s: creating tap device failed: %s", name,
727 ovs_strerror(errno));
732 /* Make non-blocking. */
733 error = set_nonblocking(netdev->tap_fd);
/* Error path: close the tap fd opened above before returning. */
741 close(netdev->tap_fd);
/* 'destruct' callback: tears down per-device state — the TC
 * implementation's private data, the shared tap fd (tap devices only),
 * the miimon accounting, and finally the mutex. */
746 netdev_linux_destruct(struct netdev *netdev_)
748 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
750 if (netdev->tc && netdev->tc->ops->tc_destroy) {
751 netdev->tc->ops->tc_destroy(netdev->tc);
754 if (netdev_get_class(netdev_) == &netdev_tap_class
755 && netdev->tap_fd >= 0)
757 close(netdev->tap_fd);
/* This device was contributing to the global miimon count; remove it. */
760 if (netdev->miimon_interval > 0) {
762 atomic_sub(&miimon_cnt, 1, &junk);
765 ovs_mutex_destroy(&netdev->mutex);
/* 'dealloc' callback: frees the storage obtained in netdev_linux_alloc(). */
769 netdev_linux_dealloc(struct netdev *netdev_)
771 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
/* rx 'alloc' callback: allocates a zeroed netdev_rx_linux and returns the
 * embedded generic rx handle ('up'). */
775 static struct netdev_rx *
776 netdev_linux_rx_alloc(void)
778 struct netdev_rx_linux *rx = xzalloc(sizeof *rx);
/* rx 'construct' callback: sets up the receive path for one netdev.
 *
 * Tap devices reuse the shared tap fd.  Other devices get a dedicated
 * AF_PACKET raw socket that is marked for auxdata (VLAN recovery), made
 * non-blocking, bound to the device's ifindex, and given a BPF filter that
 * accepts only inbound packets (so our own transmissions are not looped
 * back to us). */
783 netdev_linux_rx_construct(struct netdev_rx *rx_)
785 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
786 struct netdev *netdev_ = rx->up.netdev;
787 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
790 ovs_mutex_lock(&netdev->mutex);
791 rx->is_tap = is_tap_netdev(netdev_);
793 rx->fd = netdev->tap_fd;
795 struct sockaddr_ll sll;
797 /* Result of tcpdump -dd inbound */
798 static const struct sock_filter filt[] = {
799 { 0x28, 0, 0, 0xfffff004 }, /* ldh [0] */
800 { 0x15, 0, 1, 0x00000004 }, /* jeq #4 jt 2 jf 3 */
801 { 0x6, 0, 0, 0x00000000 }, /* ret #0 */
802 { 0x6, 0, 0, 0x0000ffff } /* ret #65535 */
804 static const struct sock_fprog fprog = {
805 ARRAY_SIZE(filt), (struct sock_filter *) filt
808 /* Create file descriptor. */
/* Protocol 0 here: we bind with ETH_P_ALL below to start receiving. */
809 rx->fd = socket(PF_PACKET, SOCK_RAW, 0);
812 VLOG_ERR("failed to create raw socket (%s)", ovs_strerror(error));
/* Request tpacket_auxdata control messages so received VLAN tags
 * (stripped by the kernel) can be reinserted in rx_recv_sock(). */
817 if (setsockopt(rx->fd, SOL_PACKET, PACKET_AUXDATA, &val, sizeof val)) {
819 VLOG_ERR("%s: failed to mark socket for auxdata (%s)",
820 netdev_get_name(netdev_), ovs_strerror(error));
824 /* Set non-blocking mode. */
825 error = set_nonblocking(rx->fd);
830 /* Get ethernet device index. */
831 error = get_ifindex(&netdev->up, &ifindex);
836 /* Bind to specific ethernet device. */
837 memset(&sll, 0, sizeof sll);
838 sll.sll_family = AF_PACKET;
839 sll.sll_ifindex = ifindex;
840 sll.sll_protocol = htons(ETH_P_ALL);
841 if (bind(rx->fd, (struct sockaddr *) &sll, sizeof sll) < 0) {
843 VLOG_ERR("%s: failed to bind raw socket (%s)",
844 netdev_get_name(netdev_), ovs_strerror(error));
848 /* Filter for only inbound packets. */
849 error = setsockopt(rx->fd, SOL_SOCKET, SO_ATTACH_FILTER, &fprog,
853 VLOG_ERR("%s: failed to attach filter (%s)",
854 netdev_get_name(netdev_), ovs_strerror(error));
858 ovs_mutex_unlock(&netdev->mutex);
/* Error exit: unlock before returning the errno-style code. */
866 ovs_mutex_unlock(&netdev->mutex);
/* rx 'destruct' callback: releases the receive-side resources (for
 * non-tap devices this is the dedicated AF_PACKET socket). */
871 netdev_linux_rx_destruct(struct netdev_rx *rx_)
873 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
/* rx 'dealloc' callback: frees the storage from netdev_linux_rx_alloc(). */
881 netdev_linux_rx_dealloc(struct netdev_rx *rx_)
883 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
/* Returns the VLAN TPID for a received packet, in network byte order.
 * Newer kernels (>= 3.13) report the actual TPID via tp_vlan_tpid with
 * TP_STATUS_VLAN_TPID_VALID set; otherwise fall back to the conventional
 * 802.1Q TPID. */
889 auxdata_to_vlan_tpid(const struct tpacket_auxdata *aux)
891 if (aux->tp_status & TP_STATUS_VLAN_VALID) {
892 return htons(aux->tp_vlan_tpid);
894 return htons(ETH_TYPE_VLAN);
/* Returns true if 'aux' carries a VLAN TCI to reinsert.  A nonzero TCI is
 * accepted even without TP_STATUS_VLAN_VALID because pre-3.0 kernels did
 * not set that bit (and could not report a valid all-zero TCI). */
899 auxdata_has_vlan_tci(const struct tpacket_auxdata *aux)
901 return aux->tp_vlan_tci || aux->tp_status & TP_STATUS_VLAN_VALID;
/* Receives one packet from AF_PACKET socket 'fd' into 'buffer'.
 *
 * Reads via recvmsg() so the kernel's tpacket_auxdata control message can
 * be examined; if the kernel stripped a VLAN tag, it is pushed back into
 * the frame.  Headroom for one VLAN tag is reserved up front for that.
 * MSG_TRUNC makes recvmsg return the full packet length even when it did
 * not fit, which is how truncation is detected. */
905 netdev_linux_rx_recv_sock(int fd, struct ofpbuf *buffer)
910 struct cmsghdr *cmsg;
/* Note: this 'buffer' member shadows the 'buffer' parameter only inside
 * the cmsg storage declaration; it is raw space for control messages. */
913 char buffer[CMSG_SPACE(sizeof(struct tpacket_auxdata))];
917 /* Reserve headroom for a single VLAN tag */
918 ofpbuf_reserve(buffer, VLAN_HEADER_LEN);
919 size = ofpbuf_tailroom(buffer);
921 iov.iov_base = buffer->data;
923 msgh.msg_name = NULL;
924 msgh.msg_namelen = 0;
927 msgh.msg_control = &cmsg_buffer;
928 msgh.msg_controllen = sizeof cmsg_buffer;
/* Retry if interrupted by a signal. */
932 retval = recvmsg(fd, &msgh, MSG_TRUNC);
933 } while (retval < 0 && errno == EINTR);
/* retval > size means the packet was truncated (thanks to MSG_TRUNC). */
937 } else if (retval > size) {
941 buffer->size += retval;
943 for (cmsg = CMSG_FIRSTHDR(&msgh); cmsg; cmsg = CMSG_NXTHDR(&msgh, cmsg)) {
944 const struct tpacket_auxdata *aux;
946 if (cmsg->cmsg_level != SOL_PACKET
947 || cmsg->cmsg_type != PACKET_AUXDATA
948 || cmsg->cmsg_len < CMSG_LEN(sizeof(struct tpacket_auxdata))) {
952 aux = ALIGNED_CAST(struct tpacket_auxdata *, CMSG_DATA(cmsg));
953 if (auxdata_has_vlan_tci(aux)) {
/* A frame shorter than an Ethernet header cannot take a VLAN tag. */
954 if (retval < ETH_HEADER_LEN) {
958 eth_push_vlan(buffer, auxdata_to_vlan_tpid(aux),
959 htons(aux->tp_vlan_tci));
/* Receives one packet from tap fd 'fd' into 'buffer' with a plain read(),
 * retrying on EINTR.  A read longer than the tailroom indicates
 * truncation. */
968 netdev_linux_rx_recv_tap(int fd, struct ofpbuf *buffer)
971 size_t size = ofpbuf_tailroom(buffer);
974 retval = read(fd, buffer->data, size);
975 } while (retval < 0 && errno == EINTR);
979 } else if (retval > size) {
983 buffer->size += retval;
988 netdev_linux_rx_recv(struct netdev_rx *rx_, struct ofpbuf *buffer)
990 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
994 ? netdev_linux_rx_recv_tap(rx->fd, buffer)
995 : netdev_linux_rx_recv_sock(rx->fd, buffer));
996 if (retval && retval != EAGAIN && retval != EMSGSIZE) {
997 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
998 ovs_strerror(errno), netdev_rx_get_name(rx_));
/* rx 'wait' callback: wakes the poll loop when the receive fd is
 * readable. */
1005 netdev_linux_rx_wait(struct netdev_rx *rx_)
1007 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
1008 poll_fd_wait(rx->fd, POLLIN);
/* rx 'drain' callback: discards all packets waiting on the receive fd.
 * For tap devices, drains up to the interface's queue length (queried via
 * SIOCGIFTXQLEN); for raw sockets, drains the socket receive buffer. */
1012 netdev_linux_rx_drain(struct netdev_rx *rx_)
1014 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
1017 int error = af_inet_ifreq_ioctl(netdev_rx_get_name(rx_), &ifr,
1018 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
1022 drain_fd(rx->fd, ifr.ifr_qlen);
1025 return drain_rcvbuf(rx->fd);
1029 /* Sends 'buffer' on 'netdev'. Returns 0 if successful, otherwise a positive
1030 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
1031 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
1032 * the packet is too big or too small to transmit on the device.
1034 * The caller retains ownership of 'buffer' in all cases.
1036 * The kernel maintains a packet transmission queue, so the caller is not
1037 * expected to do additional queuing of packets. */
1039 netdev_linux_send(struct netdev *netdev_, const void *data, size_t size)
1044 if (!is_tap_netdev(netdev_)) {
1045 /* Use our AF_PACKET socket to send to this device. */
1046 struct sockaddr_ll sll;
1052 sock = af_packet_sock();
1057 ifindex = netdev_get_ifindex(netdev_);
1062 /* We don't bother setting most fields in sockaddr_ll because the
1063 * kernel ignores them for SOCK_RAW. */
1064 memset(&sll, 0, sizeof sll);
1065 sll.sll_family = AF_PACKET;
1066 sll.sll_ifindex = ifindex;
1068 iov.iov_base = CONST_CAST(void *, data);
1071 msg.msg_name = &sll;
1072 msg.msg_namelen = sizeof sll;
1075 msg.msg_control = NULL;
1076 msg.msg_controllen = 0;
1079 retval = sendmsg(sock, &msg, 0);
1081 /* Use the tap fd to send to this device. This is essential for
1082 * tap devices, because packets sent to a tap device with an
1083 * AF_PACKET socket will loop back to be *received* again on the
1084 * tap device. This doesn't occur on other interface types
1085 * because we attach a socket filter to the rx socket. */
1086 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1088 retval = write(netdev->tap_fd, data, size);
1092 /* The Linux AF_PACKET implementation never blocks waiting for room
1093 * for packets, instead returning ENOBUFS. Translate this into
1094 * EAGAIN for the caller. */
1095 if (errno == ENOBUFS) {
1097 } else if (errno == EINTR) {
1099 } else if (errno != EAGAIN) {
1100 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
1101 netdev_get_name(netdev_), ovs_strerror(errno));
1104 } else if (retval != size) {
/* NOTE(review): the format specifier %"PRIuSIZE"d expands to "%zud",
 * printing a spurious 'd'; it also applies an unsigned conversion to
 * 'retval', which is signed (ssize_t) here.  Likely intended
 * %"PRIdSIZE" for 'retval' — confirm against the PRI*SIZE macros in
 * this tree before changing. */
1105 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%"PRIuSIZE"d bytes of "
1106 "%"PRIuSIZE") on %s", retval, size, netdev_get_name(netdev_));
1114 /* Registers with the poll loop to wake up from the next call to poll_block()
1115 * when the packet transmission queue has sufficient room to transmit a packet
1116 * with netdev_send().
1118 * The kernel maintains a packet transmission queue, so the client is not
1119 * expected to do additional queuing of packets. Thus, this function is
1120 * unlikely to ever be used. It is included for completeness. */
1122 netdev_linux_send_wait(struct netdev *netdev)
1124 if (is_tap_netdev(netdev)) {
1125 /* TAP device always accepts packets.*/
1126 poll_immediate_wake();
1130 /* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
1131 * otherwise a positive errno value. */
1133 netdev_linux_set_etheraddr(struct netdev *netdev_,
1134 const uint8_t mac[ETH_ADDR_LEN])
1136 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1137 enum netdev_flags old_flags = 0;
1140 ovs_mutex_lock(&netdev->mutex);
/* If the cached address is already 'mac' (or a cached error is pending),
 * skip the ioctl entirely; otherwise invalidate the cache and proceed. */
1142 if (netdev->cache_valid & VALID_ETHERADDR) {
1143 error = netdev->ether_addr_error;
1144 if (error || eth_addr_equals(netdev->etheraddr, mac)) {
1147 netdev->cache_valid &= ~VALID_ETHERADDR;
1150 /* Tap devices must be brought down before setting the address. */
1151 if (is_tap_netdev(netdev_)) {
1152 update_flags(netdev, NETDEV_UP, 0, &old_flags);
1154 error = set_etheraddr(netdev_get_name(netdev_), mac);
/* ENODEV is cached like success so repeated calls on a vanished device
 * do not keep issuing ioctls. */
1155 if (!error || error == ENODEV) {
1156 netdev->ether_addr_error = error;
1157 netdev->cache_valid |= VALID_ETHERADDR;
1159 memcpy(netdev->etheraddr, mac, ETH_ADDR_LEN);
/* Restore the tap device's previous up state if we downed it above. */
1163 if (is_tap_netdev(netdev_) && old_flags & NETDEV_UP) {
1164 update_flags(netdev, 0, NETDEV_UP, &old_flags);
1168 ovs_mutex_unlock(&netdev->mutex);
1172 /* Copies 'netdev''s MAC address to 'mac' which is passed as param. */
1174 netdev_linux_get_etheraddr(const struct netdev *netdev_,
1175 uint8_t mac[ETH_ADDR_LEN])
1177 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1180 ovs_mutex_lock(&netdev->mutex);
/* Populate the cache on first use; both the address and any error code
 * are cached so the ioctl is not repeated. */
1181 if (!(netdev->cache_valid & VALID_ETHERADDR)) {
1182 netdev->ether_addr_error = get_etheraddr(netdev_get_name(netdev_),
1184 netdev->cache_valid |= VALID_ETHERADDR;
1187 error = netdev->ether_addr_error;
1189 memcpy(mac, netdev->etheraddr, ETH_ADDR_LEN);
1191 ovs_mutex_unlock(&netdev->mutex);
/* Internal helper: fetches 'netdev''s MTU into '*mtup', consulting the
 * cache first and otherwise querying via SIOCGIFMTU.  Caller must hold
 * netdev->mutex.  Returns 0 or the cached/fresh errno value. */
1197 netdev_linux_get_mtu__(struct netdev_linux *netdev, int *mtup)
1201 if (!(netdev->cache_valid & VALID_MTU)) {
1204 netdev->netdev_mtu_error = af_inet_ifreq_ioctl(
1205 netdev_get_name(&netdev->up), &ifr, SIOCGIFMTU, "SIOCGIFMTU");
1206 netdev->mtu = ifr.ifr_mtu;
1207 netdev->cache_valid |= VALID_MTU;
1210 error = netdev->netdev_mtu_error;
1212 *mtup = netdev->mtu;
1218 /* Returns the maximum size of transmitted (and received) packets on 'netdev',
1219 * in bytes, not including the hardware header; thus, this is typically 1500
1220 * bytes for Ethernet devices. */
1222 netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
1224 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1227 ovs_mutex_lock(&netdev->mutex);
1228 error = netdev_linux_get_mtu__(netdev, mtup);
1229 ovs_mutex_unlock(&netdev->mutex);
1234 /* Sets the maximum size of transmitted (MTU) for given device using linux
1235 * networking ioctl interface.
1238 netdev_linux_set_mtu(const struct netdev *netdev_, int mtu)
1240 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1244 ovs_mutex_lock(&netdev->mutex);
1245 if (netdev->cache_valid & VALID_MTU) {
1246 error = netdev->netdev_mtu_error;
1247 if (error || netdev->mtu == mtu) {
1250 netdev->cache_valid &= ~VALID_MTU;
1253 error = af_inet_ifreq_ioctl(netdev_get_name(netdev_), &ifr,
1254 SIOCSIFMTU, "SIOCSIFMTU");
1255 if (!error || error == ENODEV) {
1256 netdev->netdev_mtu_error = error;
1257 netdev->mtu = ifr.ifr_mtu;
1258 netdev->cache_valid |= VALID_MTU;
1261 ovs_mutex_unlock(&netdev->mutex);
1265 /* Returns the ifindex of 'netdev', if successful, as a positive number.
1266 * On failure, returns a negative errno value. */
1268 netdev_linux_get_ifindex(const struct netdev *netdev_)
1270 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1273 ovs_mutex_lock(&netdev->mutex);
1274 error = get_ifindex(netdev_, &ifindex);
1275 ovs_mutex_unlock(&netdev->mutex);
1277 return error ? -error : ifindex;
1281 netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
1283 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1285 ovs_mutex_lock(&netdev->mutex);
1286 if (netdev->miimon_interval > 0) {
1287 *carrier = netdev->miimon;
1289 *carrier = (netdev->ifi_flags & IFF_RUNNING) != 0;
1291 ovs_mutex_unlock(&netdev->mutex);
1296 static long long int
1297 netdev_linux_get_carrier_resets(const struct netdev *netdev_)
1299 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1300 long long int carrier_resets;
1302 ovs_mutex_lock(&netdev->mutex);
1303 carrier_resets = netdev->carrier_resets;
1304 ovs_mutex_unlock(&netdev->mutex);
1306 return carrier_resets;
1310 netdev_linux_do_miimon(const char *name, int cmd, const char *cmd_name,
1311 struct mii_ioctl_data *data)
1316 memset(&ifr, 0, sizeof ifr);
1317 memcpy(&ifr.ifr_data, data, sizeof *data);
1318 error = af_inet_ifreq_ioctl(name, &ifr, cmd, cmd_name);
1319 memcpy(data, &ifr.ifr_data, sizeof *data);
1325 netdev_linux_get_miimon(const char *name, bool *miimon)
1327 struct mii_ioctl_data data;
1332 memset(&data, 0, sizeof data);
1333 error = netdev_linux_do_miimon(name, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
1335 /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
1336 data.reg_num = MII_BMSR;
1337 error = netdev_linux_do_miimon(name, SIOCGMIIREG, "SIOCGMIIREG",
1341 *miimon = !!(data.val_out & BMSR_LSTATUS);
1343 VLOG_WARN_RL(&rl, "%s: failed to query MII", name);
1346 struct ethtool_cmd ecmd;
1348 VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
1351 COVERAGE_INC(netdev_get_ethtool);
1352 memset(&ecmd, 0, sizeof ecmd);
1353 error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
1356 struct ethtool_value eval;
1358 memcpy(&eval, &ecmd, sizeof eval);
1359 *miimon = !!eval.data;
1361 VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
1369 netdev_linux_set_miimon_interval(struct netdev *netdev_,
1370 long long int interval)
1372 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1374 ovs_mutex_lock(&netdev->mutex);
1375 interval = interval > 0 ? MAX(interval, 100) : 0;
1376 if (netdev->miimon_interval != interval) {
1379 if (interval && !netdev->miimon_interval) {
1380 atomic_add(&miimon_cnt, 1, &junk);
1381 } else if (!interval && netdev->miimon_interval) {
1382 atomic_sub(&miimon_cnt, 1, &junk);
1385 netdev->miimon_interval = interval;
1386 timer_set_expired(&netdev->miimon_timer);
1388 ovs_mutex_unlock(&netdev->mutex);
1394 netdev_linux_miimon_run(void)
1396 struct shash device_shash;
1397 struct shash_node *node;
1399 shash_init(&device_shash);
1400 netdev_get_devices(&netdev_linux_class, &device_shash);
1401 SHASH_FOR_EACH (node, &device_shash) {
1402 struct netdev *netdev = node->data;
1403 struct netdev_linux *dev = netdev_linux_cast(netdev);
1406 ovs_mutex_lock(&dev->mutex);
1407 if (dev->miimon_interval > 0 && timer_expired(&dev->miimon_timer)) {
1408 netdev_linux_get_miimon(dev->up.name, &miimon);
1409 if (miimon != dev->miimon) {
1410 dev->miimon = miimon;
1411 netdev_linux_changed(dev, dev->ifi_flags, 0);
1414 timer_set_duration(&dev->miimon_timer, dev->miimon_interval);
1416 ovs_mutex_unlock(&dev->mutex);
1417 netdev_close(netdev);
1420 shash_destroy(&device_shash);
1424 netdev_linux_miimon_wait(void)
1426 struct shash device_shash;
1427 struct shash_node *node;
1429 shash_init(&device_shash);
1430 netdev_get_devices(&netdev_linux_class, &device_shash);
1431 SHASH_FOR_EACH (node, &device_shash) {
1432 struct netdev *netdev = node->data;
1433 struct netdev_linux *dev = netdev_linux_cast(netdev);
1435 ovs_mutex_lock(&dev->mutex);
1436 if (dev->miimon_interval > 0) {
1437 timer_wait(&dev->miimon_timer);
1439 ovs_mutex_unlock(&dev->mutex);
1440 netdev_close(netdev);
1442 shash_destroy(&device_shash);
/* Exchanges the values pointed to by 'a' and 'b'. */
static void
swap_uint64(uint64_t *a, uint64_t *b)
{
    uint64_t tmp = *a;
    *a = *b;
    *b = tmp;
}
1453 /* Copies 'src' into 'dst', performing format conversion in the process.
1455 * 'src' is allowed to be misaligned. */
1457 netdev_stats_from_ovs_vport_stats(struct netdev_stats *dst,
1458 const struct ovs_vport_stats *src)
1460 dst->rx_packets = get_unaligned_u64(&src->rx_packets);
1461 dst->tx_packets = get_unaligned_u64(&src->tx_packets);
1462 dst->rx_bytes = get_unaligned_u64(&src->rx_bytes);
1463 dst->tx_bytes = get_unaligned_u64(&src->tx_bytes);
1464 dst->rx_errors = get_unaligned_u64(&src->rx_errors);
1465 dst->tx_errors = get_unaligned_u64(&src->tx_errors);
1466 dst->rx_dropped = get_unaligned_u64(&src->rx_dropped);
1467 dst->tx_dropped = get_unaligned_u64(&src->tx_dropped);
1469 dst->collisions = 0;
1470 dst->rx_length_errors = 0;
1471 dst->rx_over_errors = 0;
1472 dst->rx_crc_errors = 0;
1473 dst->rx_frame_errors = 0;
1474 dst->rx_fifo_errors = 0;
1475 dst->rx_missed_errors = 0;
1476 dst->tx_aborted_errors = 0;
1477 dst->tx_carrier_errors = 0;
1478 dst->tx_fifo_errors = 0;
1479 dst->tx_heartbeat_errors = 0;
1480 dst->tx_window_errors = 0;
1484 get_stats_via_vport__(const struct netdev *netdev, struct netdev_stats *stats)
1486 struct dpif_linux_vport reply;
1490 error = dpif_linux_vport_get(netdev_get_name(netdev), &reply, &buf);
1493 } else if (!reply.stats) {
1498 netdev_stats_from_ovs_vport_stats(stats, reply.stats);
1506 get_stats_via_vport(const struct netdev *netdev_,
1507 struct netdev_stats *stats)
1509 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1511 if (!netdev->vport_stats_error ||
1512 !(netdev->cache_valid & VALID_VPORT_STAT_ERROR)) {
1515 error = get_stats_via_vport__(netdev_, stats);
1516 if (error && error != ENOENT) {
1517 VLOG_WARN_RL(&rl, "%s: obtaining netdev stats via vport failed "
1519 netdev_get_name(netdev_), ovs_strerror(error));
1521 netdev->vport_stats_error = error;
1522 netdev->cache_valid |= VALID_VPORT_STAT_ERROR;
1526 /* Retrieves current device stats for 'netdev-linux'. */
1528 netdev_linux_get_stats(const struct netdev *netdev_,
1529 struct netdev_stats *stats)
1531 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1532 struct netdev_stats dev_stats;
1535 ovs_mutex_lock(&netdev->mutex);
1536 get_stats_via_vport(netdev_, stats);
1537 error = get_stats_via_netlink(netdev_, &dev_stats);
1539 if (!netdev->vport_stats_error) {
1542 } else if (netdev->vport_stats_error) {
1543 /* stats not available from OVS then use ioctl stats. */
1546 stats->rx_errors += dev_stats.rx_errors;
1547 stats->tx_errors += dev_stats.tx_errors;
1548 stats->rx_dropped += dev_stats.rx_dropped;
1549 stats->tx_dropped += dev_stats.tx_dropped;
1550 stats->multicast += dev_stats.multicast;
1551 stats->collisions += dev_stats.collisions;
1552 stats->rx_length_errors += dev_stats.rx_length_errors;
1553 stats->rx_over_errors += dev_stats.rx_over_errors;
1554 stats->rx_crc_errors += dev_stats.rx_crc_errors;
1555 stats->rx_frame_errors += dev_stats.rx_frame_errors;
1556 stats->rx_fifo_errors += dev_stats.rx_fifo_errors;
1557 stats->rx_missed_errors += dev_stats.rx_missed_errors;
1558 stats->tx_aborted_errors += dev_stats.tx_aborted_errors;
1559 stats->tx_carrier_errors += dev_stats.tx_carrier_errors;
1560 stats->tx_fifo_errors += dev_stats.tx_fifo_errors;
1561 stats->tx_heartbeat_errors += dev_stats.tx_heartbeat_errors;
1562 stats->tx_window_errors += dev_stats.tx_window_errors;
1564 ovs_mutex_unlock(&netdev->mutex);
1569 /* Retrieves current device stats for 'netdev-tap' netdev or
1570 * netdev-internal. */
1572 netdev_tap_get_stats(const struct netdev *netdev_, struct netdev_stats *stats)
1574 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1575 struct netdev_stats dev_stats;
1578 ovs_mutex_lock(&netdev->mutex);
1579 get_stats_via_vport(netdev_, stats);
1580 error = get_stats_via_netlink(netdev_, &dev_stats);
1582 if (!netdev->vport_stats_error) {
1585 } else if (netdev->vport_stats_error) {
1586 /* Transmit and receive stats will appear to be swapped relative to the
1587 * other ports since we are the one sending the data, not a remote
1588 * computer. For consistency, we swap them back here. This does not
1589 * apply if we are getting stats from the vport layer because it always
1590 * tracks stats from the perspective of the switch. */
1593 swap_uint64(&stats->rx_packets, &stats->tx_packets);
1594 swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
1595 swap_uint64(&stats->rx_errors, &stats->tx_errors);
1596 swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
1597 stats->rx_length_errors = 0;
1598 stats->rx_over_errors = 0;
1599 stats->rx_crc_errors = 0;
1600 stats->rx_frame_errors = 0;
1601 stats->rx_fifo_errors = 0;
1602 stats->rx_missed_errors = 0;
1603 stats->tx_aborted_errors = 0;
1604 stats->tx_carrier_errors = 0;
1605 stats->tx_fifo_errors = 0;
1606 stats->tx_heartbeat_errors = 0;
1607 stats->tx_window_errors = 0;
1609 stats->rx_dropped += dev_stats.tx_dropped;
1610 stats->tx_dropped += dev_stats.rx_dropped;
1612 stats->rx_errors += dev_stats.tx_errors;
1613 stats->tx_errors += dev_stats.rx_errors;
1615 stats->multicast += dev_stats.multicast;
1616 stats->collisions += dev_stats.collisions;
1618 ovs_mutex_unlock(&netdev->mutex);
1624 netdev_internal_get_stats(const struct netdev *netdev_,
1625 struct netdev_stats *stats)
1627 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1630 ovs_mutex_lock(&netdev->mutex);
1631 get_stats_via_vport(netdev_, stats);
1632 error = netdev->vport_stats_error;
1633 ovs_mutex_unlock(&netdev->mutex);
1639 netdev_internal_set_stats(struct netdev *netdev,
1640 const struct netdev_stats *stats)
1642 struct ovs_vport_stats vport_stats;
1643 struct dpif_linux_vport vport;
1646 vport_stats.rx_packets = stats->rx_packets;
1647 vport_stats.tx_packets = stats->tx_packets;
1648 vport_stats.rx_bytes = stats->rx_bytes;
1649 vport_stats.tx_bytes = stats->tx_bytes;
1650 vport_stats.rx_errors = stats->rx_errors;
1651 vport_stats.tx_errors = stats->tx_errors;
1652 vport_stats.rx_dropped = stats->rx_dropped;
1653 vport_stats.tx_dropped = stats->tx_dropped;
1655 dpif_linux_vport_init(&vport);
1656 vport.cmd = OVS_VPORT_CMD_SET;
1657 vport.name = netdev_get_name(netdev);
1658 vport.stats = &vport_stats;
1660 err = dpif_linux_vport_transact(&vport, NULL, NULL);
1662 /* If the vport layer doesn't know about the device, that doesn't mean it
1663 * doesn't exist (after all were able to open it when netdev_open() was
1664 * called), it just means that it isn't attached and we'll be getting
1665 * stats a different way. */
1666 if (err == ENODEV) {
1674 netdev_linux_read_features(struct netdev_linux *netdev)
1676 struct ethtool_cmd ecmd;
1680 if (netdev->cache_valid & VALID_FEATURES) {
1684 COVERAGE_INC(netdev_get_ethtool);
1685 memset(&ecmd, 0, sizeof ecmd);
1686 error = netdev_linux_do_ethtool(netdev->up.name, &ecmd,
1687 ETHTOOL_GSET, "ETHTOOL_GSET");
1692 /* Supported features. */
1693 netdev->supported = 0;
1694 if (ecmd.supported & SUPPORTED_10baseT_Half) {
1695 netdev->supported |= NETDEV_F_10MB_HD;
1697 if (ecmd.supported & SUPPORTED_10baseT_Full) {
1698 netdev->supported |= NETDEV_F_10MB_FD;
1700 if (ecmd.supported & SUPPORTED_100baseT_Half) {
1701 netdev->supported |= NETDEV_F_100MB_HD;
1703 if (ecmd.supported & SUPPORTED_100baseT_Full) {
1704 netdev->supported |= NETDEV_F_100MB_FD;
1706 if (ecmd.supported & SUPPORTED_1000baseT_Half) {
1707 netdev->supported |= NETDEV_F_1GB_HD;
1709 if (ecmd.supported & SUPPORTED_1000baseT_Full) {
1710 netdev->supported |= NETDEV_F_1GB_FD;
1712 if (ecmd.supported & SUPPORTED_10000baseT_Full) {
1713 netdev->supported |= NETDEV_F_10GB_FD;
1715 if (ecmd.supported & SUPPORTED_TP) {
1716 netdev->supported |= NETDEV_F_COPPER;
1718 if (ecmd.supported & SUPPORTED_FIBRE) {
1719 netdev->supported |= NETDEV_F_FIBER;
1721 if (ecmd.supported & SUPPORTED_Autoneg) {
1722 netdev->supported |= NETDEV_F_AUTONEG;
1724 if (ecmd.supported & SUPPORTED_Pause) {
1725 netdev->supported |= NETDEV_F_PAUSE;
1727 if (ecmd.supported & SUPPORTED_Asym_Pause) {
1728 netdev->supported |= NETDEV_F_PAUSE_ASYM;
1731 /* Advertised features. */
1732 netdev->advertised = 0;
1733 if (ecmd.advertising & ADVERTISED_10baseT_Half) {
1734 netdev->advertised |= NETDEV_F_10MB_HD;
1736 if (ecmd.advertising & ADVERTISED_10baseT_Full) {
1737 netdev->advertised |= NETDEV_F_10MB_FD;
1739 if (ecmd.advertising & ADVERTISED_100baseT_Half) {
1740 netdev->advertised |= NETDEV_F_100MB_HD;
1742 if (ecmd.advertising & ADVERTISED_100baseT_Full) {
1743 netdev->advertised |= NETDEV_F_100MB_FD;
1745 if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
1746 netdev->advertised |= NETDEV_F_1GB_HD;
1748 if (ecmd.advertising & ADVERTISED_1000baseT_Full) {
1749 netdev->advertised |= NETDEV_F_1GB_FD;
1751 if (ecmd.advertising & ADVERTISED_10000baseT_Full) {
1752 netdev->advertised |= NETDEV_F_10GB_FD;
1754 if (ecmd.advertising & ADVERTISED_TP) {
1755 netdev->advertised |= NETDEV_F_COPPER;
1757 if (ecmd.advertising & ADVERTISED_FIBRE) {
1758 netdev->advertised |= NETDEV_F_FIBER;
1760 if (ecmd.advertising & ADVERTISED_Autoneg) {
1761 netdev->advertised |= NETDEV_F_AUTONEG;
1763 if (ecmd.advertising & ADVERTISED_Pause) {
1764 netdev->advertised |= NETDEV_F_PAUSE;
1766 if (ecmd.advertising & ADVERTISED_Asym_Pause) {
1767 netdev->advertised |= NETDEV_F_PAUSE_ASYM;
1770 /* Current settings. */
1772 if (speed == SPEED_10) {
1773 netdev->current = ecmd.duplex ? NETDEV_F_10MB_FD : NETDEV_F_10MB_HD;
1774 } else if (speed == SPEED_100) {
1775 netdev->current = ecmd.duplex ? NETDEV_F_100MB_FD : NETDEV_F_100MB_HD;
1776 } else if (speed == SPEED_1000) {
1777 netdev->current = ecmd.duplex ? NETDEV_F_1GB_FD : NETDEV_F_1GB_HD;
1778 } else if (speed == SPEED_10000) {
1779 netdev->current = NETDEV_F_10GB_FD;
1780 } else if (speed == 40000) {
1781 netdev->current = NETDEV_F_40GB_FD;
1782 } else if (speed == 100000) {
1783 netdev->current = NETDEV_F_100GB_FD;
1784 } else if (speed == 1000000) {
1785 netdev->current = NETDEV_F_1TB_FD;
1787 netdev->current = 0;
1790 if (ecmd.port == PORT_TP) {
1791 netdev->current |= NETDEV_F_COPPER;
1792 } else if (ecmd.port == PORT_FIBRE) {
1793 netdev->current |= NETDEV_F_FIBER;
1797 netdev->current |= NETDEV_F_AUTONEG;
1801 netdev->cache_valid |= VALID_FEATURES;
1802 netdev->get_features_error = error;
1805 /* Stores the features supported by 'netdev' into of '*current', '*advertised',
1806 * '*supported', and '*peer'. Each value is a bitmap of NETDEV_* bits.
1807 * Returns 0 if successful, otherwise a positive errno value. */
1809 netdev_linux_get_features(const struct netdev *netdev_,
1810 enum netdev_features *current,
1811 enum netdev_features *advertised,
1812 enum netdev_features *supported,
1813 enum netdev_features *peer)
1815 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1818 ovs_mutex_lock(&netdev->mutex);
1819 netdev_linux_read_features(netdev);
1820 if (!netdev->get_features_error) {
1821 *current = netdev->current;
1822 *advertised = netdev->advertised;
1823 *supported = netdev->supported;
1824 *peer = 0; /* XXX */
1826 error = netdev->get_features_error;
1827 ovs_mutex_unlock(&netdev->mutex);
1832 /* Set the features advertised by 'netdev' to 'advertise'. */
1834 netdev_linux_set_advertisements(struct netdev *netdev_,
1835 enum netdev_features advertise)
1837 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1838 struct ethtool_cmd ecmd;
1841 ovs_mutex_lock(&netdev->mutex);
1843 COVERAGE_INC(netdev_get_ethtool);
1844 memset(&ecmd, 0, sizeof ecmd);
1845 error = netdev_linux_do_ethtool(netdev_get_name(netdev_), &ecmd,
1846 ETHTOOL_GSET, "ETHTOOL_GSET");
1851 ecmd.advertising = 0;
1852 if (advertise & NETDEV_F_10MB_HD) {
1853 ecmd.advertising |= ADVERTISED_10baseT_Half;
1855 if (advertise & NETDEV_F_10MB_FD) {
1856 ecmd.advertising |= ADVERTISED_10baseT_Full;
1858 if (advertise & NETDEV_F_100MB_HD) {
1859 ecmd.advertising |= ADVERTISED_100baseT_Half;
1861 if (advertise & NETDEV_F_100MB_FD) {
1862 ecmd.advertising |= ADVERTISED_100baseT_Full;
1864 if (advertise & NETDEV_F_1GB_HD) {
1865 ecmd.advertising |= ADVERTISED_1000baseT_Half;
1867 if (advertise & NETDEV_F_1GB_FD) {
1868 ecmd.advertising |= ADVERTISED_1000baseT_Full;
1870 if (advertise & NETDEV_F_10GB_FD) {
1871 ecmd.advertising |= ADVERTISED_10000baseT_Full;
1873 if (advertise & NETDEV_F_COPPER) {
1874 ecmd.advertising |= ADVERTISED_TP;
1876 if (advertise & NETDEV_F_FIBER) {
1877 ecmd.advertising |= ADVERTISED_FIBRE;
1879 if (advertise & NETDEV_F_AUTONEG) {
1880 ecmd.advertising |= ADVERTISED_Autoneg;
1882 if (advertise & NETDEV_F_PAUSE) {
1883 ecmd.advertising |= ADVERTISED_Pause;
1885 if (advertise & NETDEV_F_PAUSE_ASYM) {
1886 ecmd.advertising |= ADVERTISED_Asym_Pause;
1888 COVERAGE_INC(netdev_set_ethtool);
1889 error = netdev_linux_do_ethtool(netdev_get_name(netdev_), &ecmd,
1890 ETHTOOL_SSET, "ETHTOOL_SSET");
1893 ovs_mutex_unlock(&netdev->mutex);
1897 /* Attempts to set input rate limiting (policing) policy. Returns 0 if
1898 * successful, otherwise a positive errno value. */
1900 netdev_linux_set_policing(struct netdev *netdev_,
1901 uint32_t kbits_rate, uint32_t kbits_burst)
1903 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1904 const char *netdev_name = netdev_get_name(netdev_);
1907 kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
1908 : !kbits_burst ? 1000 /* Default to 1000 kbits if 0. */
1909 : kbits_burst); /* Stick with user-specified value. */
1911 ovs_mutex_lock(&netdev->mutex);
1912 if (netdev->cache_valid & VALID_POLICING) {
1913 error = netdev->netdev_policing_error;
1914 if (error || (netdev->kbits_rate == kbits_rate &&
1915 netdev->kbits_burst == kbits_burst)) {
1916 /* Assume that settings haven't changed since we last set them. */
1919 netdev->cache_valid &= ~VALID_POLICING;
1922 COVERAGE_INC(netdev_set_policing);
1923 /* Remove any existing ingress qdisc. */
1924 error = tc_add_del_ingress_qdisc(netdev_, false);
1926 VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
1927 netdev_name, ovs_strerror(error));
1932 error = tc_add_del_ingress_qdisc(netdev_, true);
1934 VLOG_WARN_RL(&rl, "%s: adding policing qdisc failed: %s",
1935 netdev_name, ovs_strerror(error));
1939 error = tc_add_policer(netdev_, kbits_rate, kbits_burst);
1941 VLOG_WARN_RL(&rl, "%s: adding policing action failed: %s",
1942 netdev_name, ovs_strerror(error));
1947 netdev->kbits_rate = kbits_rate;
1948 netdev->kbits_burst = kbits_burst;
1951 if (!error || error == ENODEV) {
1952 netdev->netdev_policing_error = error;
1953 netdev->cache_valid |= VALID_POLICING;
1955 ovs_mutex_unlock(&netdev->mutex);
1960 netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
1963 const struct tc_ops *const *opsp;
1965 for (opsp = tcs; *opsp != NULL; opsp++) {
1966 const struct tc_ops *ops = *opsp;
1967 if (ops->tc_install && ops->ovs_name[0] != '\0') {
1968 sset_add(types, ops->ovs_name);
1974 static const struct tc_ops *
1975 tc_lookup_ovs_name(const char *name)
1977 const struct tc_ops *const *opsp;
1979 for (opsp = tcs; *opsp != NULL; opsp++) {
1980 const struct tc_ops *ops = *opsp;
1981 if (!strcmp(name, ops->ovs_name)) {
1988 static const struct tc_ops *
1989 tc_lookup_linux_name(const char *name)
1991 const struct tc_ops *const *opsp;
1993 for (opsp = tcs; *opsp != NULL; opsp++) {
1994 const struct tc_ops *ops = *opsp;
1995 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
2002 static struct tc_queue *
2003 tc_find_queue__(const struct netdev *netdev_, unsigned int queue_id,
2006 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2007 struct tc_queue *queue;
2009 HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev->tc->queues) {
2010 if (queue->queue_id == queue_id) {
/* Convenience wrapper around tc_find_queue__() that hashes 'queue_id'. */
static struct tc_queue *
tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
{
    return tc_find_queue__(netdev, queue_id, hash_int(queue_id, 0));
}
2024 netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
2026 struct netdev_qos_capabilities *caps)
2028 const struct tc_ops *ops = tc_lookup_ovs_name(type);
2032 caps->n_queues = ops->n_queues;
2037 netdev_linux_get_qos(const struct netdev *netdev_,
2038 const char **typep, struct smap *details)
2040 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2043 ovs_mutex_lock(&netdev->mutex);
2044 error = tc_query_qdisc(netdev_);
2046 *typep = netdev->tc->ops->ovs_name;
2047 error = (netdev->tc->ops->qdisc_get
2048 ? netdev->tc->ops->qdisc_get(netdev_, details)
2051 ovs_mutex_unlock(&netdev->mutex);
2057 netdev_linux_set_qos(struct netdev *netdev_,
2058 const char *type, const struct smap *details)
2060 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2061 const struct tc_ops *new_ops;
2064 new_ops = tc_lookup_ovs_name(type);
2065 if (!new_ops || !new_ops->tc_install) {
2069 ovs_mutex_lock(&netdev->mutex);
2070 error = tc_query_qdisc(netdev_);
2075 if (new_ops == netdev->tc->ops) {
2076 error = new_ops->qdisc_set ? new_ops->qdisc_set(netdev_, details) : 0;
2078 /* Delete existing qdisc. */
2079 error = tc_del_qdisc(netdev_);
2083 ovs_assert(netdev->tc == NULL);
2085 /* Install new qdisc. */
2086 error = new_ops->tc_install(netdev_, details);
2087 ovs_assert((error == 0) == (netdev->tc != NULL));
2091 ovs_mutex_unlock(&netdev->mutex);
2096 netdev_linux_get_queue(const struct netdev *netdev_,
2097 unsigned int queue_id, struct smap *details)
2099 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2102 ovs_mutex_lock(&netdev->mutex);
2103 error = tc_query_qdisc(netdev_);
2105 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2107 ? netdev->tc->ops->class_get(netdev_, queue, details)
2110 ovs_mutex_unlock(&netdev->mutex);
2116 netdev_linux_set_queue(struct netdev *netdev_,
2117 unsigned int queue_id, const struct smap *details)
2119 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2122 ovs_mutex_lock(&netdev->mutex);
2123 error = tc_query_qdisc(netdev_);
2125 error = (queue_id < netdev->tc->ops->n_queues
2126 && netdev->tc->ops->class_set
2127 ? netdev->tc->ops->class_set(netdev_, queue_id, details)
2130 ovs_mutex_unlock(&netdev->mutex);
2136 netdev_linux_delete_queue(struct netdev *netdev_, unsigned int queue_id)
2138 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2141 ovs_mutex_lock(&netdev->mutex);
2142 error = tc_query_qdisc(netdev_);
2144 if (netdev->tc->ops->class_delete) {
2145 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2147 ? netdev->tc->ops->class_delete(netdev_, queue)
2153 ovs_mutex_unlock(&netdev->mutex);
2159 netdev_linux_get_queue_stats(const struct netdev *netdev_,
2160 unsigned int queue_id,
2161 struct netdev_queue_stats *stats)
2163 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2166 ovs_mutex_lock(&netdev->mutex);
2167 error = tc_query_qdisc(netdev_);
2169 if (netdev->tc->ops->class_get_stats) {
2170 const struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2172 stats->created = queue->created;
2173 error = netdev->tc->ops->class_get_stats(netdev_, queue,
2182 ovs_mutex_unlock(&netdev->mutex);
2187 struct queue_dump_state {
2188 struct nl_dump dump;
2193 start_queue_dump(const struct netdev *netdev, struct queue_dump_state *state)
2195 struct ofpbuf request;
2196 struct tcmsg *tcmsg;
2198 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
2202 tcmsg->tcm_parent = 0;
2203 nl_dump_start(&state->dump, NETLINK_ROUTE, &request);
2204 ofpbuf_uninit(&request);
2206 ofpbuf_init(&state->buf, NL_DUMP_BUFSIZE);
2211 finish_queue_dump(struct queue_dump_state *state)
2213 ofpbuf_uninit(&state->buf);
2214 return nl_dump_done(&state->dump);
/* Iteration state for netdev_linux_queue_dump_{start,next,done}().
 * Field set inferred from use ('queues', 'cur_queue', 'n_queues'). */
struct netdev_linux_queue_state {
    unsigned int *queues;       /* Snapshot of queue ids to visit. */
    size_t cur_queue;           /* Index of next queue id to return. */
    size_t n_queues;            /* Number of entries in 'queues'. */
};
2224 netdev_linux_queue_dump_start(const struct netdev *netdev_, void **statep)
2226 const struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2229 ovs_mutex_lock(&netdev->mutex);
2230 error = tc_query_qdisc(netdev_);
2232 if (netdev->tc->ops->class_get) {
2233 struct netdev_linux_queue_state *state;
2234 struct tc_queue *queue;
2237 *statep = state = xmalloc(sizeof *state);
2238 state->n_queues = hmap_count(&netdev->tc->queues);
2239 state->cur_queue = 0;
2240 state->queues = xmalloc(state->n_queues * sizeof *state->queues);
2243 HMAP_FOR_EACH (queue, hmap_node, &netdev->tc->queues) {
2244 state->queues[i++] = queue->queue_id;
2250 ovs_mutex_unlock(&netdev->mutex);
2256 netdev_linux_queue_dump_next(const struct netdev *netdev_, void *state_,
2257 unsigned int *queue_idp, struct smap *details)
2259 const struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2260 struct netdev_linux_queue_state *state = state_;
2263 ovs_mutex_lock(&netdev->mutex);
2264 while (state->cur_queue < state->n_queues) {
2265 unsigned int queue_id = state->queues[state->cur_queue++];
2266 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2269 *queue_idp = queue_id;
2270 error = netdev->tc->ops->class_get(netdev_, queue, details);
2274 ovs_mutex_unlock(&netdev->mutex);
2280 netdev_linux_queue_dump_done(const struct netdev *netdev OVS_UNUSED,
2283 struct netdev_linux_queue_state *state = state_;
2285 free(state->queues);
2291 netdev_linux_dump_queue_stats(const struct netdev *netdev_,
2292 netdev_dump_queue_stats_cb *cb, void *aux)
2294 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2297 ovs_mutex_lock(&netdev->mutex);
2298 error = tc_query_qdisc(netdev_);
2300 struct queue_dump_state state;
2302 if (!netdev->tc->ops->class_dump_stats) {
2304 } else if (!start_queue_dump(netdev_, &state)) {
2310 while (nl_dump_next(&state.dump, &msg, &state.buf)) {
2311 retval = netdev->tc->ops->class_dump_stats(netdev_, &msg,
2318 retval = finish_queue_dump(&state);
2324 ovs_mutex_unlock(&netdev->mutex);
2330 netdev_linux_get_in4(const struct netdev *netdev_,
2331 struct in_addr *address, struct in_addr *netmask)
2333 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2336 ovs_mutex_lock(&netdev->mutex);
2337 if (!(netdev->cache_valid & VALID_IN4)) {
2338 error = netdev_linux_get_ipv4(netdev_, &netdev->address,
2339 SIOCGIFADDR, "SIOCGIFADDR");
2341 error = netdev_linux_get_ipv4(netdev_, &netdev->netmask,
2342 SIOCGIFNETMASK, "SIOCGIFNETMASK");
2344 netdev->cache_valid |= VALID_IN4;
2352 if (netdev->address.s_addr != INADDR_ANY) {
2353 *address = netdev->address;
2354 *netmask = netdev->netmask;
2356 error = EADDRNOTAVAIL;
2359 ovs_mutex_unlock(&netdev->mutex);
2365 netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
2366 struct in_addr netmask)
2368 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2371 ovs_mutex_lock(&netdev->mutex);
2372 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
2374 netdev->cache_valid |= VALID_IN4;
2375 netdev->address = address;
2376 netdev->netmask = netmask;
2377 if (address.s_addr != INADDR_ANY) {
2378 error = do_set_addr(netdev_, SIOCSIFNETMASK,
2379 "SIOCSIFNETMASK", netmask);
2382 ovs_mutex_unlock(&netdev->mutex);
/* Parses one line of /proc/net/if_inet6, storing the IPv6 address in '*in6'
 * and the interface name in 'ifname'.  Returns true if the line was parsed
 * successfully, false otherwise. */
static bool
parse_if_inet6_line(const char *line,
                    struct in6_addr *in6, char ifname[16 + 1])
{
    uint8_t *s6 = in6->s6_addr;
#define X8 "%2"SCNx8
    return ovs_scan(line,
                    " "X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8
                    "%*x %*x %*x %*x %16s\n",
                    &s6[0], &s6[1], &s6[2], &s6[3],
                    &s6[4], &s6[5], &s6[6], &s6[7],
                    &s6[8], &s6[9], &s6[10], &s6[11],
                    &s6[12], &s6[13], &s6[14], &s6[15],
                    ifname);
}
2403 /* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address (if
2404 * 'in6' is non-null) and returns true. Otherwise, returns false. */
2406 netdev_linux_get_in6(const struct netdev *netdev_, struct in6_addr *in6)
2408 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2410 ovs_mutex_lock(&netdev->mutex);
2411 if (!(netdev->cache_valid & VALID_IN6)) {
2415 netdev->in6 = in6addr_any;
2417 file = fopen("/proc/net/if_inet6", "r");
2419 const char *name = netdev_get_name(netdev_);
2420 while (fgets(line, sizeof line, file)) {
2421 struct in6_addr in6_tmp;
2422 char ifname[16 + 1];
2423 if (parse_if_inet6_line(line, &in6_tmp, ifname)
2424 && !strcmp(name, ifname))
2426 netdev->in6 = in6_tmp;
2432 netdev->cache_valid |= VALID_IN6;
2435 ovs_mutex_unlock(&netdev->mutex);
/* Fills '*sa' with an AF_INET sockaddr carrying 'addr' (port 0). */
static void
make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
{
    struct sockaddr_in sin;
    memset(&sin, 0, sizeof sin);
    sin.sin_family = AF_INET;
    sin.sin_addr = addr;
    sin.sin_port = 0;

    memset(sa, 0, sizeof *sa);
    memcpy(sa, &sin, sizeof sin);
}
2454 do_set_addr(struct netdev *netdev,
2455 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
2459 make_in4_sockaddr(&ifr.ifr_addr, addr);
2460 return af_inet_ifreq_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
2464 /* Adds 'router' as a default IP gateway. */
2466 netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
2468 struct in_addr any = { INADDR_ANY };
2472 memset(&rt, 0, sizeof rt);
2473 make_in4_sockaddr(&rt.rt_dst, any);
2474 make_in4_sockaddr(&rt.rt_gateway, router);
2475 make_in4_sockaddr(&rt.rt_genmask, any);
2476 rt.rt_flags = RTF_UP | RTF_GATEWAY;
2477 error = af_inet_ioctl(SIOCADDRT, &rt);
2479 VLOG_WARN("ioctl(SIOCADDRT): %s", ovs_strerror(error));
2485 netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
2488 static const char fn[] = "/proc/net/route";
2493 *netdev_name = NULL;
2494 stream = fopen(fn, "r");
2495 if (stream == NULL) {
2496 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, ovs_strerror(errno));
2501 while (fgets(line, sizeof line, stream)) {
2504 ovs_be32 dest, gateway, mask;
2505 int refcnt, metric, mtu;
2506 unsigned int flags, use, window, irtt;
2509 "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
2511 iface, &dest, &gateway, &flags, &refcnt,
2512 &use, &metric, &mask, &mtu, &window, &irtt)) {
2513 VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
2517 if (!(flags & RTF_UP)) {
2518 /* Skip routes that aren't up. */
2522 /* The output of 'dest', 'mask', and 'gateway' were given in
2523 * network byte order, so we don't need need any endian
2524 * conversions here. */
2525 if ((dest & mask) == (host->s_addr & mask)) {
2527 /* The host is directly reachable. */
2528 next_hop->s_addr = 0;
2530 /* To reach the host, we must go through a gateway. */
2531 next_hop->s_addr = gateway;
2533 *netdev_name = xstrdup(iface);
/* Fills 'smap' with driver information for 'netdev_': driver_name,
 * driver_version and firmware_version, taken from an ETHTOOL_GDRVINFO
 * query that is cached in netdev->drvinfo (guarded by VALID_DRVINFO and
 * netdev->mutex).
 * NOTE(review): line-sampled extract; the error-propagation rows between
 * the ethtool call and the smap_add()s are elided. */
2545 netdev_linux_get_status(const struct netdev *netdev_, struct smap *smap)
2547 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2550 ovs_mutex_lock(&netdev->mutex);
2551 if (!(netdev->cache_valid & VALID_DRVINFO)) {
2552 struct ethtool_cmd *cmd = (struct ethtool_cmd *) &netdev->drvinfo;
2554 COVERAGE_INC(netdev_get_ethtool);
2555 memset(&netdev->drvinfo, 0, sizeof netdev->drvinfo);
2556 error = netdev_linux_do_ethtool(netdev->up.name,
2559 "ETHTOOL_GDRVINFO");
2561 netdev->cache_valid |= VALID_DRVINFO;
2566 smap_add(smap, "driver_name", netdev->drvinfo.driver);
2567 smap_add(smap, "driver_version", netdev->drvinfo.version);
2568 smap_add(smap, "firmware_version", netdev->drvinfo.fw_version);
2570 ovs_mutex_unlock(&netdev->mutex);
/* Status for internal devices is constant: only a driver name. */
2576 netdev_internal_get_status(const struct netdev *netdev OVS_UNUSED,
2579 smap_add(smap, "driver_name", "openvswitch");
2583 /* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
2584 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
2585 * returns 0. Otherwise, it returns a positive errno value; in particular,
2586 * ENXIO indicates that there is not ARP table entry for 'ip' on 'netdev'. */
/* Implementation: fills a 'struct arpreq' (sockaddr_in protocol address,
 * ARPHRD_ETHER hardware family, device name) and issues SIOCGARP on an
 * AF_INET socket.  ENXIO is treated as "no entry" and not logged. */
2588 netdev_linux_arp_lookup(const struct netdev *netdev,
2589 ovs_be32 ip, uint8_t mac[ETH_ADDR_LEN])
2592 struct sockaddr_in sin;
2595 memset(&r, 0, sizeof r);
2596 memset(&sin, 0, sizeof sin);
2597 sin.sin_family = AF_INET;
2598 sin.sin_addr.s_addr = ip;
2600 memcpy(&r.arp_pa, &sin, sizeof sin);
2601 r.arp_ha.sa_family = ARPHRD_ETHER;
2603 ovs_strzcpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
2604 COVERAGE_INC(netdev_arp_lookup);
2605 retval = af_inet_ioctl(SIOCGARP, &r);
2607 memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
2608 } else if (retval != ENXIO) {
2609 VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
2610 netdev_get_name(netdev), IP_ARGS(ip),
2611 ovs_strerror(retval));
2617 nd_to_iff_flags(enum netdev_flags nd)
2620 if (nd & NETDEV_UP) {
2623 if (nd & NETDEV_PROMISC) {
2626 if (nd & NETDEV_LOOPBACK) {
2627 iff |= IFF_LOOPBACK;
2633 iff_to_nd_flags(int iff)
2635 enum netdev_flags nd = 0;
2639 if (iff & IFF_PROMISC) {
2640 nd |= NETDEV_PROMISC;
2642 if (iff & IFF_LOOPBACK) {
2643 nd |= NETDEV_LOOPBACK;
/* Clears the IFF_* equivalents of 'off' and sets those of 'on' in the
 * device's interface flags, storing the previous NETDEV_* view in
 * '*old_flagsp'.  Only calls set_flags() when the computed flag word
 * actually changes, then re-reads the kernel state with get_flags().
 * Caller must hold netdev->mutex (OVS_REQUIRES below). */
2649 update_flags(struct netdev_linux *netdev, enum netdev_flags off,
2650 enum netdev_flags on, enum netdev_flags *old_flagsp)
2651 OVS_REQUIRES(netdev->mutex)
2653 int old_flags, new_flags;
2656 old_flags = netdev->ifi_flags;
2657 *old_flagsp = iff_to_nd_flags(old_flags);
2658 new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
2659 if (new_flags != old_flags) {
2660 error = set_flags(netdev_get_name(&netdev->up), new_flags);
2661 get_flags(&netdev->up, &netdev->ifi_flags);
/* Public netdev_class entry point: takes the mutex and delegates to
 * update_flags() above. */
2668 netdev_linux_update_flags(struct netdev *netdev_, enum netdev_flags off,
2669 enum netdev_flags on, enum netdev_flags *old_flagsp)
2671 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2674 ovs_mutex_lock(&netdev->mutex);
2675 error = update_flags(netdev, off, on, old_flagsp);
2676 ovs_mutex_unlock(&netdev->mutex);
/* Builds a 'struct netdev_class' initializer.  The macro parameters supply
 * the per-variant hooks (constructor, stats get/set, features, status);
 * everything else is shared between the linux, tap, and internal classes.
 * NOTE(review): line-sampled extract — several initializer rows (e.g. the
 * run/construct entries and various NULL slots) are elided here. */
2681 #define NETDEV_LINUX_CLASS(NAME, CONSTRUCT, GET_STATS, SET_STATS, \
2682 GET_FEATURES, GET_STATUS) \
2688 netdev_linux_wait, \
2690 netdev_linux_alloc, \
2692 netdev_linux_destruct, \
2693 netdev_linux_dealloc, \
2694 NULL, /* get_config */ \
2695 NULL, /* set_config */ \
2696 NULL, /* get_tunnel_config */ \
2698 netdev_linux_send, \
2699 netdev_linux_send_wait, \
2701 netdev_linux_set_etheraddr, \
2702 netdev_linux_get_etheraddr, \
2703 netdev_linux_get_mtu, \
2704 netdev_linux_set_mtu, \
2705 netdev_linux_get_ifindex, \
2706 netdev_linux_get_carrier, \
2707 netdev_linux_get_carrier_resets, \
2708 netdev_linux_set_miimon_interval, \
2713 netdev_linux_set_advertisements, \
2715 netdev_linux_set_policing, \
2716 netdev_linux_get_qos_types, \
2717 netdev_linux_get_qos_capabilities, \
2718 netdev_linux_get_qos, \
2719 netdev_linux_set_qos, \
2720 netdev_linux_get_queue, \
2721 netdev_linux_set_queue, \
2722 netdev_linux_delete_queue, \
2723 netdev_linux_get_queue_stats, \
2724 netdev_linux_queue_dump_start, \
2725 netdev_linux_queue_dump_next, \
2726 netdev_linux_queue_dump_done, \
2727 netdev_linux_dump_queue_stats, \
2729 netdev_linux_get_in4, \
2730 netdev_linux_set_in4, \
2731 netdev_linux_get_in6, \
2732 netdev_linux_add_router, \
2733 netdev_linux_get_next_hop, \
2735 netdev_linux_arp_lookup, \
2737 netdev_linux_update_flags, \
2739 netdev_linux_rx_alloc, \
2740 netdev_linux_rx_construct, \
2741 netdev_linux_rx_destruct, \
2742 netdev_linux_rx_dealloc, \
2743 netdev_linux_rx_recv, \
2744 netdev_linux_rx_wait, \
2745 netdev_linux_rx_drain, \
/* "system" devices: real kernel network devices. */
2748 const struct netdev_class netdev_linux_class =
2751 netdev_linux_construct,
2752 netdev_linux_get_stats,
2753 NULL, /* set_stats */
2754 netdev_linux_get_features,
2755 netdev_linux_get_status);
/* "tap" devices: userspace TUN/TAP, with tap-specific stats. */
2757 const struct netdev_class netdev_tap_class =
2760 netdev_linux_construct_tap,
2761 netdev_tap_get_stats,
2762 NULL, /* set_stats */
2763 netdev_linux_get_features,
2764 netdev_linux_get_status);
/* "internal" devices: OVS bridge ports; stats are settable and there is no
 * meaningful get_features. */
2766 const struct netdev_class netdev_internal_class =
2769 netdev_linux_construct,
2770 netdev_internal_get_stats,
2771 netdev_internal_set_stats,
2772 NULL, /* get_features */
2773 netdev_internal_get_status);
2775 /* HTB traffic control class. */
2777 #define HTB_N_QUEUES 0xf000
/* Qdisc-level state: only the configured link max rate. */
2781 unsigned int max_rate; /* In bytes/s. */
/* Per-queue (HTB class) state; embeds the generic tc_queue. */
2785 struct tc_queue tc_queue;
2786 unsigned int min_rate; /* In bytes/s. */
2787 unsigned int max_rate; /* In bytes/s. */
2788 unsigned int burst; /* In bytes. */
2789 unsigned int priority; /* Lower values are higher priorities. */
/* Downcasts netdev->tc to the containing 'struct htb'. */
2793 htb_get__(const struct netdev *netdev_)
2795 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2796 return CONTAINER_OF(netdev->tc, struct htb, tc);
/* Allocates a fresh 'struct htb' with the given root 'max_rate' and makes
 * it the netdev's active tc implementation. */
2800 htb_install__(struct netdev *netdev_, uint64_t max_rate)
2802 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2805 htb = xmalloc(sizeof *htb);
2806 tc_init(&htb->tc, &tc_ops_htb);
2807 htb->max_rate = max_rate;
2809 netdev->tc = &htb->tc;
2812 /* Create an HTB qdisc.
2814 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
/* Deletes any existing qdisc first, then sends an RTM_NEWQDISC netlink
 * request with handle 1:0 at TC_H_ROOT, kind "htb", and a nested
 * TCA_OPTIONS carrying tc_htb_glob (rate2quantum = 10). */
2816 htb_setup_qdisc__(struct netdev *netdev)
2819 struct tc_htb_glob opt;
2820 struct ofpbuf request;
2821 struct tcmsg *tcmsg;
2823 tc_del_qdisc(netdev);
2825 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2826 NLM_F_EXCL | NLM_F_CREATE, &request);
2830 tcmsg->tcm_handle = tc_make_handle(1, 0);
2831 tcmsg->tcm_parent = TC_H_ROOT;
2833 nl_msg_put_string(&request, TCA_KIND, "htb");
2835 memset(&opt, 0, sizeof opt);
2836 opt.rate2quantum = 10;
2840 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2841 nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
2842 nl_msg_end_nested(&request, opt_offset);
2844 return tc_transact(&request, NULL);
2847 /* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
2848 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
/* Requires the device MTU (used for rate-table and buffer calculations);
 * fails early with a warning if the MTU can't be read.  Sends an
 * RTM_NEWTCLASS request carrying TCA_HTB_PARMS plus rate/ceil tables. */
2850 htb_setup_class__(struct netdev *netdev, unsigned int handle,
2851 unsigned int parent, struct htb_class *class)
2854 struct tc_htb_opt opt;
2855 struct ofpbuf request;
2856 struct tcmsg *tcmsg;
2860 error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
2862 VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU",
2863 netdev_get_name(netdev));
2867 memset(&opt, 0, sizeof opt);
2868 tc_fill_rate(&opt.rate, class->min_rate, mtu);
2869 tc_fill_rate(&opt.ceil, class->max_rate, mtu);
2870 opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
2871 opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
2872 opt.prio = class->priority;
2874 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
2878 tcmsg->tcm_handle = handle;
2879 tcmsg->tcm_parent = parent;
2881 nl_msg_put_string(&request, TCA_KIND, "htb");
2882 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2883 nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
2884 tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
2885 tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
2886 nl_msg_end_nested(&request, opt_offset);
2888 error = tc_transact(&request, NULL);
2890 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
2891 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
2892 netdev_get_name(netdev),
2893 tc_get_major(handle), tc_get_minor(handle),
2894 tc_get_major(parent), tc_get_minor(parent),
2895 class->min_rate, class->max_rate,
2896 class->burst, class->priority, ovs_strerror(error));
2901 /* Parses Netlink attributes in 'options' for HTB parameters and stores a
2902 * description of them into 'details'. The description complies with the
2903 * specification given in the vswitch database documentation for linux-htb
/* Policy requires a TCA_HTB_PARMS attribute at least tc_htb_opt-sized;
 * rates come straight from the kernel struct, burst is converted from
 * kernel ticks to bytes. */
2906 htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
2908 static const struct nl_policy tca_htb_policy[] = {
2909 [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
2910 .min_len = sizeof(struct tc_htb_opt) },
2913 struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
2914 const struct tc_htb_opt *htb;
2916 if (!nl_parse_nested(nl_options, tca_htb_policy,
2917 attrs, ARRAY_SIZE(tca_htb_policy))) {
2918 VLOG_WARN_RL(&rl, "failed to parse HTB class options");
2922 htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
2923 class->min_rate = htb->rate.rate;
2924 class->max_rate = htb->ceil.rate;
2925 class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
2926 class->priority = htb->prio;
/* Parses one RTM_NEWTCLASS-style message: extracts the class handle
 * (queue id = minor - 1 when major is 1), optional stats, and, when
 * requested, the HTB options via htb_parse_tca_options__(). */
2931 htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2932 struct htb_class *options,
2933 struct netdev_queue_stats *stats)
2935 struct nlattr *nl_options;
2936 unsigned int handle;
2939 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2940 if (!error && queue_id) {
2941 unsigned int major = tc_get_major(handle);
2942 unsigned int minor = tc_get_minor(handle);
2943 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2944 *queue_id = minor - 1;
2949 if (!error && options) {
2950 error = htb_parse_tca_options__(nl_options, options);
/* Reads qdisc-level "max-rate" (bits/s in the database, stored here as
 * bytes/s) from 'details'.  When absent or zero, falls back to the link
 * speed reported by the ethtool features cache, defaulting to 100 Mbps.
 * min_rate is pinned to max_rate for the root class. */
2956 htb_parse_qdisc_details__(struct netdev *netdev_,
2957 const struct smap *details, struct htb_class *hc)
2959 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2960 const char *max_rate_s;
2962 max_rate_s = smap_get(details, "max-rate");
2963 hc->max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
2964 if (!hc->max_rate) {
2965 enum netdev_features current;
2967 netdev_linux_read_features(netdev);
2968 current = !netdev->get_features_error ? netdev->current : 0;
2969 hc->max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
2971 hc->min_rate = hc->max_rate;
/* Reads per-class min-rate/max-rate/burst/priority from 'details',
 * clamping each to sane bounds (min-rate at least one MTU, everything
 * capped at the qdisc max_rate).  Fails if the MTU is unavailable. */
2977 htb_parse_class_details__(struct netdev *netdev,
2978 const struct smap *details, struct htb_class *hc)
2980 const struct htb *htb = htb_get__(netdev);
2981 const char *min_rate_s = smap_get(details, "min-rate");
2982 const char *max_rate_s = smap_get(details, "max-rate");
2983 const char *burst_s = smap_get(details, "burst");
2984 const char *priority_s = smap_get(details, "priority");
2987 error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
2989 VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU",
2990 netdev_get_name(netdev));
2994 /* HTB requires at least an mtu sized min-rate to send any traffic even
2995 * on uncongested links. */
2996 hc->min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
2997 hc->min_rate = MAX(hc->min_rate, mtu);
2998 hc->min_rate = MIN(hc->min_rate, htb->max_rate);
3001 hc->max_rate = (max_rate_s
3002 ? strtoull(max_rate_s, NULL, 10) / 8
3004 hc->max_rate = MAX(hc->max_rate, hc->min_rate);
3005 hc->max_rate = MIN(hc->max_rate, htb->max_rate);
3009 * According to hints in the documentation that I've read, it is important
3010 * that 'burst' be at least as big as the largest frame that might be
3011 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
3012 * but having it a bit too small is a problem. Since netdev_get_mtu()
3013 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
3014 * the MTU. We actually add 64, instead of 14, as a guard against
3015 * additional headers get tacked on somewhere that we're not aware of. */
3016 hc->burst = burst_s ? strtoull(burst_s, NULL, 10) / 8 : 0;
3017 hc->burst = MAX(hc->burst, mtu + 64);
3020 hc->priority = priority_s ? strtoul(priority_s, NULL, 10) : 0;
/* Queries the kernel for one HTB class ('handle' under 'parent') and
 * decodes it into 'options' and/or 'stats' via htb_parse_tcmsg__(). */
3026 htb_query_class__(const struct netdev *netdev, unsigned int handle,
3027 unsigned int parent, struct htb_class *options,
3028 struct netdev_queue_stats *stats)
3030 struct ofpbuf *reply;
3033 error = tc_query_class(netdev, handle, parent, &reply);
3035 error = htb_parse_tcmsg__(reply, NULL, options, stats);
3036 ofpbuf_delete(reply);
/* tc_install hook: creates the HTB qdisc, the default class 1:0xfffe
 * sized from 'details', then records the new tc state in the netdev. */
3042 htb_tc_install(struct netdev *netdev, const struct smap *details)
3046 error = htb_setup_qdisc__(netdev);
3048 struct htb_class hc;
3050 htb_parse_qdisc_details__(netdev, details, &hc);
3051 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3052 tc_make_handle(1, 0), &hc);
3054 htb_install__(netdev, hc.max_rate);
3060 static struct htb_class *
3061 htb_class_cast__(const struct tc_queue *queue)
3063 return CONTAINER_OF(queue, struct htb_class, tc_queue);
/* Inserts or refreshes the in-memory htb_class for 'queue_id', keyed by
 * hash_int(queue_id) in the tc queue hmap.  Allocates on first sight. */
3067 htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
3068 const struct htb_class *hc)
3070 struct htb *htb = htb_get__(netdev);
3071 size_t hash = hash_int(queue_id, 0);
3072 struct tc_queue *queue;
3073 struct htb_class *hcp;
3075 queue = tc_find_queue__(netdev, queue_id, hash);
3077 hcp = htb_class_cast__(queue);
3079 hcp = xmalloc(sizeof *hcp);
3080 queue = &hcp->tc_queue;
3081 queue->queue_id = queue_id;
3082 queue->created = time_msec();
3083 hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
3086 hcp->min_rate = hc->min_rate;
3087 hcp->max_rate = hc->max_rate;
3088 hcp->burst = hc->burst;
3089 hcp->priority = hc->priority;
/* tc_load hook: reconstructs in-memory HTB state from the kernel — reads
 * the default class 1:0xfffe for the root max_rate, then dumps all classes
 * and mirrors each into the queue hmap. */
3093 htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3096 struct queue_dump_state state;
3097 struct htb_class hc;
3099 /* Get qdisc options. */
3101 htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
3102 htb_install__(netdev, hc.max_rate);
3105 if (!start_queue_dump(netdev, &state)) {
3108 while (nl_dump_next(&state.dump, &msg, &state.buf)) {
3109 unsigned int queue_id;
3111 if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3112 htb_update_queue__(netdev, queue_id, &hc);
3115 finish_queue_dump(&state);
/* tc_destroy hook: frees every htb_class and the htb itself. */
3121 htb_tc_destroy(struct tc *tc)
3123 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
3124 struct htb_class *hc, *next;
3126 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &htb->tc.queues) {
3127 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
/* qdisc_get hook: exposes the root max-rate back in bits/s. */
3135 htb_qdisc_get(const struct netdev *netdev, struct smap *details)
3137 const struct htb *htb = htb_get__(netdev);
3138 smap_add_format(details, "max-rate", "%llu", 8ULL * htb->max_rate);
/* qdisc_set hook: re-provisions the default class from 'details' and, on
 * success, updates the cached root max_rate. */
3143 htb_qdisc_set(struct netdev *netdev, const struct smap *details)
3145 struct htb_class hc;
3148 htb_parse_qdisc_details__(netdev, details, &hc);
3149 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3150 tc_make_handle(1, 0), &hc);
3152 htb_get__(netdev)->max_rate = hc.max_rate;
/* class_get hook: reports rates in bits/s; max-rate only when it differs
 * from min-rate (matching how the database omits defaults). */
3158 htb_class_get(const struct netdev *netdev OVS_UNUSED,
3159 const struct tc_queue *queue, struct smap *details)
3161 const struct htb_class *hc = htb_class_cast__(queue);
3163 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
3164 if (hc->min_rate != hc->max_rate) {
3165 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
3167 smap_add_format(details, "burst", "%llu", 8ULL * hc->burst);
3169 smap_add_format(details, "priority", "%u", hc->priority);
/* class_set hook: parses 'details', programs kernel class 1:(queue_id+1)
 * under parent 1:0xfffe, and mirrors the result in memory. */
3175 htb_class_set(struct netdev *netdev, unsigned int queue_id,
3176 const struct smap *details)
3178 struct htb_class hc;
3181 error = htb_parse_class_details__(netdev, details, &hc);
3186 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3187 tc_make_handle(1, 0xfffe), &hc);
3192 htb_update_queue__(netdev, queue_id, &hc);
/* class_delete hook: removes the kernel class, then the hmap entry. */
3197 htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
3199 struct htb_class *hc = htb_class_cast__(queue);
3200 struct htb *htb = htb_get__(netdev);
3203 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3205 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
/* class_get_stats hook: kernel query for a single class's counters. */
3212 htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3213 struct netdev_queue_stats *stats)
3215 return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3216 tc_make_handle(1, 0xfffe), NULL, stats);
/* class_dump_stats hook: decodes one dumped class message and invokes the
 * callback with queue id (minor - 1) and its stats. */
3220 htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3221 const struct ofpbuf *nlmsg,
3222 netdev_dump_queue_stats_cb *cb, void *aux)
3224 struct netdev_queue_stats stats;
3225 unsigned int handle, major, minor;
3228 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3233 major = tc_get_major(handle);
3234 minor = tc_get_minor(handle);
3235 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
3236 (*cb)(minor - 1, &stats, aux);
/* Ops vtable binding the linux-htb QoS type to the functions above.
 * NOTE(review): several initializer rows are elided in this extract. */
3241 static const struct tc_ops tc_ops_htb = {
3242 "htb", /* linux_name */
3243 "linux-htb", /* ovs_name */
3244 HTB_N_QUEUES, /* n_queues */
3253 htb_class_get_stats,
3254 htb_class_dump_stats
3257 /* "linux-hfsc" traffic control class. */
3259 #define HFSC_N_QUEUES 0xf000
/* Per-queue HFSC state; embeds the generic tc_queue.  (The qdisc-level
 * struct with 'max_rate' is elided from this extract.) */
3267 struct tc_queue tc_queue;
/* Downcasts netdev->tc to the containing 'struct hfsc'. */
3272 static struct hfsc *
3273 hfsc_get__(const struct netdev *netdev_)
3275 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3276 return CONTAINER_OF(netdev->tc, struct hfsc, tc);
3279 static struct hfsc_class *
3280 hfsc_class_cast__(const struct tc_queue *queue)
3282 return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
/* Allocates and installs a fresh 'struct hfsc' as the netdev's tc. */
3286 hfsc_install__(struct netdev *netdev_, uint32_t max_rate)
3288 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3291 hfsc = xmalloc(sizeof *hfsc);
3292 tc_init(&hfsc->tc, &tc_ops_hfsc);
3293 hfsc->max_rate = max_rate;
3294 netdev->tc = &hfsc->tc;
/* Inserts or refreshes the in-memory hfsc_class for 'queue_id'; mirrors
 * htb_update_queue__() for HFSC. */
3298 hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
3299 const struct hfsc_class *hc)
3303 struct hfsc_class *hcp;
3304 struct tc_queue *queue;
3306 hfsc = hfsc_get__(netdev);
3307 hash = hash_int(queue_id, 0);
3309 queue = tc_find_queue__(netdev, queue_id, hash);
3311 hcp = hfsc_class_cast__(queue);
3313 hcp = xmalloc(sizeof *hcp);
3314 queue = &hcp->tc_queue;
3315 queue->queue_id = queue_id;
3316 queue->created = time_msec();
3317 hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
3320 hcp->min_rate = hc->min_rate;
3321 hcp->max_rate = hc->max_rate;
/* Decodes the RSC/FSC/USC service curves from a class's TCA_OPTIONS.
 * Only linear curves (m1 == 0, d == 0) with matching real-time and
 * link-share slopes are accepted; min_rate comes from the FSC slope and
 * max_rate from the USC slope. */
3325 hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
3327 const struct tc_service_curve *rsc, *fsc, *usc;
3328 static const struct nl_policy tca_hfsc_policy[] = {
3330 .type = NL_A_UNSPEC,
3332 .min_len = sizeof(struct tc_service_curve),
3335 .type = NL_A_UNSPEC,
3337 .min_len = sizeof(struct tc_service_curve),
3340 .type = NL_A_UNSPEC,
3342 .min_len = sizeof(struct tc_service_curve),
3345 struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];
3347 if (!nl_parse_nested(nl_options, tca_hfsc_policy,
3348 attrs, ARRAY_SIZE(tca_hfsc_policy))) {
3349 VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
3353 rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
3354 fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
3355 usc = nl_attr_get(attrs[TCA_HFSC_USC]);
3357 if (rsc->m1 != 0 || rsc->d != 0 ||
3358 fsc->m1 != 0 || fsc->d != 0 ||
3359 usc->m1 != 0 || usc->d != 0) {
3360 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3361 "Non-linear service curves are not supported.");
3365 if (rsc->m2 != fsc->m2) {
3366 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3367 "Real-time service curves are not supported ");
3371 if (rsc->m2 > usc->m2) {
3372 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3373 "Min-rate service curve is greater than "
3374 "the max-rate service curve.");
3378 class->min_rate = fsc->m2;
3379 class->max_rate = usc->m2;
/* Parses one dumped class message: handle -> queue id (minor - 1 when
 * major is 1), plus optional stats and decoded service-curve options. */
3384 hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
3385 struct hfsc_class *options,
3386 struct netdev_queue_stats *stats)
3389 unsigned int handle;
3390 struct nlattr *nl_options;
3392 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
3398 unsigned int major, minor;
3400 major = tc_get_major(handle);
3401 minor = tc_get_minor(handle);
3402 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3403 *queue_id = minor - 1;
3410 error = hfsc_parse_tca_options__(nl_options, options);
/* Queries the kernel for one HFSC class and decodes it. */
3417 hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
3418 unsigned int parent, struct hfsc_class *options,
3419 struct netdev_queue_stats *stats)
3422 struct ofpbuf *reply;
3424 error = tc_query_class(netdev, handle, parent, &reply);
3429 error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
3430 ofpbuf_delete(reply);
/* Reads the qdisc-level "max-rate" (bits/s) from 'details', falling back
 * to the ethtool-reported link speed (default 100 Mbps) when absent or
 * zero.  For the root class min_rate == max_rate. */
3435 hfsc_parse_qdisc_details__(struct netdev *netdev_, const struct smap *details,
3436 struct hfsc_class *class)
3438 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3440 const char *max_rate_s;
3442 max_rate_s = smap_get(details, "max-rate");
3443 max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
3446 enum netdev_features current;
3448 netdev_linux_read_features(netdev);
3449 current = !netdev->get_features_error ? netdev->current : 0;
3450 max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
3453 class->min_rate = max_rate;
3454 class->max_rate = max_rate;
/* Reads per-class min-rate/max-rate from 'details' with clamping:
 * min-rate at least 1 and at most the qdisc max_rate; max-rate between
 * min-rate and the qdisc max_rate. */
3458 hfsc_parse_class_details__(struct netdev *netdev,
3459 const struct smap *details,
3460 struct hfsc_class * class)
3462 const struct hfsc *hfsc;
3463 uint32_t min_rate, max_rate;
3464 const char *min_rate_s, *max_rate_s;
3466 hfsc = hfsc_get__(netdev);
3467 min_rate_s = smap_get(details, "min-rate");
3468 max_rate_s = smap_get(details, "max-rate");
3470 min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
3471 min_rate = MAX(min_rate, 1);
3472 min_rate = MIN(min_rate, hfsc->max_rate);
3474 max_rate = (max_rate_s
3475 ? strtoull(max_rate_s, NULL, 10) / 8
3477 max_rate = MAX(max_rate, min_rate);
3478 max_rate = MIN(max_rate, hfsc->max_rate);
3480 class->min_rate = min_rate;
3481 class->max_rate = max_rate;
3486 /* Create an HFSC qdisc.
3488 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
/* Deletes any existing qdisc, then issues RTM_NEWQDISC for handle 1:0 at
 * TC_H_ROOT with kind "hfsc" and a zeroed tc_hfsc_qopt. */
3490 hfsc_setup_qdisc__(struct netdev * netdev)
3492 struct tcmsg *tcmsg;
3493 struct ofpbuf request;
3494 struct tc_hfsc_qopt opt;
3496 tc_del_qdisc(netdev);
3498 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3499 NLM_F_EXCL | NLM_F_CREATE, &request);
3505 tcmsg->tcm_handle = tc_make_handle(1, 0);
3506 tcmsg->tcm_parent = TC_H_ROOT;
3508 memset(&opt, 0, sizeof opt);
3511 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3512 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
3514 return tc_transact(&request, NULL);
3517 /* Create an HFSC class.
3519 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
3520 * sc rate <min_rate> ul rate <max_rate>" */
/* Both RSC and FSC get the 'min' linear curve; USC gets the 'max' curve.
 * NOTE(review): the rows zeroing min.m1/min.d/max.m1/max.d are among the
 * elided lines in this extract — confirm against the full file. */
3522 hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
3523 unsigned int parent, struct hfsc_class *class)
3527 struct tcmsg *tcmsg;
3528 struct ofpbuf request;
3529 struct tc_service_curve min, max;
3531 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
3537 tcmsg->tcm_handle = handle;
3538 tcmsg->tcm_parent = parent;
3542 min.m2 = class->min_rate;
3546 max.m2 = class->max_rate;
3548 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3549 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3550 nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
3551 nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
3552 nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
3553 nl_msg_end_nested(&request, opt_offset);
3555 error = tc_transact(&request, NULL);
3557 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
3558 "min-rate %ubps, max-rate %ubps (%s)",
3559 netdev_get_name(netdev),
3560 tc_get_major(handle), tc_get_minor(handle),
3561 tc_get_major(parent), tc_get_minor(parent),
3562 class->min_rate, class->max_rate, ovs_strerror(error));
/* tc_install hook: creates the HFSC qdisc and default class 1:0xfffe from
 * 'details', then installs the in-memory hfsc state. */
3569 hfsc_tc_install(struct netdev *netdev, const struct smap *details)
3572 struct hfsc_class class;
3574 error = hfsc_setup_qdisc__(netdev);
3580 hfsc_parse_qdisc_details__(netdev, details, &class);
3581 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3582 tc_make_handle(1, 0), &class);
3588 hfsc_install__(netdev, class.max_rate);
/* tc_load hook: rebuilds in-memory HFSC state from a kernel class dump,
 * seeding the root max_rate from the default class 1:0xfffe. */
3593 hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3596 struct queue_dump_state state;
3597 struct hfsc_class hc;
3600 hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
3601 hfsc_install__(netdev, hc.max_rate);
3603 if (!start_queue_dump(netdev, &state)) {
3607 while (nl_dump_next(&state.dump, &msg, &state.buf)) {
3608 unsigned int queue_id;
3610 if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3611 hfsc_update_queue__(netdev, queue_id, &hc);
3615 finish_queue_dump(&state);
/* tc_destroy hook: frees every hfsc_class and the hfsc itself. */
3620 hfsc_tc_destroy(struct tc *tc)
3623 struct hfsc_class *hc, *next;
3625 hfsc = CONTAINER_OF(tc, struct hfsc, tc);
3627 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
3628 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
/* qdisc_get hook: reports the root max-rate in bits/s. */
3637 hfsc_qdisc_get(const struct netdev *netdev, struct smap *details)
3639 const struct hfsc *hfsc;
3640 hfsc = hfsc_get__(netdev);
3641 smap_add_format(details, "max-rate", "%llu", 8ULL * hfsc->max_rate);
/* qdisc_set hook: re-provisions the default class and, on success,
 * updates the cached root max_rate. */
3646 hfsc_qdisc_set(struct netdev *netdev, const struct smap *details)
3649 struct hfsc_class class;
3651 hfsc_parse_qdisc_details__(netdev, details, &class);
3652 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3653 tc_make_handle(1, 0), &class);
3656 hfsc_get__(netdev)->max_rate = class.max_rate;
/* class_get hook: rates in bits/s; max-rate omitted when equal to
 * min-rate. */
3663 hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
3664 const struct tc_queue *queue, struct smap *details)
3666 const struct hfsc_class *hc;
3668 hc = hfsc_class_cast__(queue);
3669 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
3670 if (hc->min_rate != hc->max_rate) {
3671 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
/* class_set hook: parses details, programs class 1:(queue_id+1) under
 * 1:0xfffe, then mirrors the result in memory. */
3677 hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
3678 const struct smap *details)
3681 struct hfsc_class class;
3683 error = hfsc_parse_class_details__(netdev, details, &class);
3688 error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3689 tc_make_handle(1, 0xfffe), &class);
3694 hfsc_update_queue__(netdev, queue_id, &class);
/* class_delete hook: removes the kernel class, then the hmap entry. */
3699 hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
3703 struct hfsc_class *hc;
3705 hc = hfsc_class_cast__(queue);
3706 hfsc = hfsc_get__(netdev);
3708 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3710 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
/* class_get_stats hook: kernel query for one class's counters. */
3717 hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3718 struct netdev_queue_stats *stats)
3720 return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3721 tc_make_handle(1, 0xfffe), NULL, stats);
/* class_dump_stats hook: decodes one dumped class message and calls the
 * callback with queue id (minor - 1) and its stats. */
3725 hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3726 const struct ofpbuf *nlmsg,
3727 netdev_dump_queue_stats_cb *cb, void *aux)
3729 struct netdev_queue_stats stats;
3730 unsigned int handle, major, minor;
3733 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3738 major = tc_get_major(handle);
3739 minor = tc_get_minor(handle);
3740 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3741 (*cb)(minor - 1, &stats, aux);
/* Ops vtable binding the linux-hfsc QoS type to the functions above. */
3746 static const struct tc_ops tc_ops_hfsc = {
3747 "hfsc", /* linux_name */
3748 "linux-hfsc", /* ovs_name */
3749 HFSC_N_QUEUES, /* n_queues */
3750 hfsc_tc_install, /* tc_install */
3751 hfsc_tc_load, /* tc_load */
3752 hfsc_tc_destroy, /* tc_destroy */
3753 hfsc_qdisc_get, /* qdisc_get */
3754 hfsc_qdisc_set, /* qdisc_set */
3755 hfsc_class_get, /* class_get */
3756 hfsc_class_set, /* class_set */
3757 hfsc_class_delete, /* class_delete */
3758 hfsc_class_get_stats, /* class_get_stats */
3759 hfsc_class_dump_stats /* class_dump_stats */
3762 /* "linux-default" traffic control class.
3764 * This class represents the default, unnamed Linux qdisc. It corresponds to
3765 * the "" (empty string) QoS type in the OVS database. */
/* Installs a shared, immutable 'struct tc' — no per-device state exists
 * for the default qdisc, so one static const instance serves all. */
3768 default_install__(struct netdev *netdev_)
3770 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3771 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_default);
3773 /* Nothing but a tc class implementation is allowed to write to a tc. This
3774 * class never does that, so we can legitimately use a const tc object. */
3775 netdev->tc = CONST_CAST(struct tc *, &tc);
3779 default_tc_install(struct netdev *netdev,
3780 const struct smap *details OVS_UNUSED)
3782 default_install__(netdev);
3787 default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3789 default_install__(netdev);
/* Ops vtable: only install/load are meaningful; the default qdisc has no
 * queues or configurable parameters, so the rest are NULL. */
3793 static const struct tc_ops tc_ops_default = {
3794 NULL, /* linux_name */
3799 NULL, /* tc_destroy */
3800 NULL, /* qdisc_get */
3801 NULL, /* qdisc_set */
3802 NULL, /* class_get */
3803 NULL, /* class_set */
3804 NULL, /* class_delete */
3805 NULL, /* class_get_stats */
3806 NULL /* class_dump_stats */
3809 /* "linux-other" traffic control class.
/* Fallback for qdiscs OVS does not model: like linux-default, a single
 * shared const tc object is installed; nothing is configurable. */
3814 other_tc_load(struct netdev *netdev_, struct ofpbuf *nlmsg OVS_UNUSED)
3816 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3817 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_other);
3819 /* Nothing but a tc class implementation is allowed to write to a tc. This
3820 * class never does that, so we can legitimately use a const tc object. */
3821 netdev->tc = CONST_CAST(struct tc *, &tc);
/* Ops vtable: load-only; OVS never installs or configures this type. */
3825 static const struct tc_ops tc_ops_other = {
3826 NULL, /* linux_name */
3827 "linux-other", /* ovs_name */
3829 NULL, /* tc_install */
3831 NULL, /* tc_destroy */
3832 NULL, /* qdisc_get */
3833 NULL, /* qdisc_set */
3834 NULL, /* class_get */
3835 NULL, /* class_set */
3836 NULL, /* class_delete */
3837 NULL, /* class_get_stats */
3838 NULL /* class_dump_stats */
3841 /* Traffic control. */
3843 /* Number of kernel "tc" ticks per second. */
/* Both globals are lazily initialized elsewhere in the file (not visible
 * in this extract — presumably from /proc/net/psched; TODO confirm). */
3844 static double ticks_per_s;
3846 /* Number of kernel "jiffies" per second. This is used for the purpose of
3847 * computing buffer sizes. Generally kernel qdiscs need to be able to buffer
3848 * one jiffy's worth of data.
3850 * There are two possibilities here:
3852 * - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
3853 * approximate range of 100 to 1024. That means that we really need to
3854 * make sure that the qdisc can buffer that much data.
3856 * - 'buffer_hz' is an absurdly large number. That means that the kernel
3857 * has finely granular timers and there's no need to fudge additional room
3858 * for buffers. (There's no extra effort needed to implement that: the
3859 * large 'buffer_hz' is used as a divisor, so practically any number will
3860 * come out as 0 in the division. Small integer results in the case of
3861 * really high dividends won't have any real effect anyhow.)
3863 static unsigned int buffer_hz;
/* Returns tc handle 'major':'minor' (major in the high 16 bits, minor in
 * the low 16, per the kernel's TC_H_* encoding). */
static unsigned int
tc_make_handle(unsigned int major, unsigned int minor)
{
    return TC_H_MAKE(major << 16, minor);
}

/* Returns the major number from 'handle'. */
static unsigned int
tc_get_major(unsigned int handle)
{
    return TC_H_MAJ(handle) >> 16;
}

/* Returns the minor number from 'handle'. */
static unsigned int
tc_get_minor(unsigned int handle)
{
    return TC_H_MIN(handle);
}
/* Initializes 'request' as an RTM message of 'type' for 'netdev': puts an
 * nlmsghdr (NLM_F_REQUEST | flags) followed by a zeroed tcmsg with
 * AF_UNSPEC family and the device's ifindex.  Returns the tcmsg for the
 * caller to fill in tcm_handle/tcm_parent, or (per the elided error path)
 * presumably NULL when the ifindex lookup fails — TODO confirm. */
3886 static struct tcmsg *
3887 tc_make_request(const struct netdev *netdev, int type, unsigned int flags,
3888 struct ofpbuf *request)
3890 struct tcmsg *tcmsg;
3894 error = get_ifindex(netdev, &ifindex);
3899 ofpbuf_init(request, 512);
3900 nl_msg_put_nlmsghdr(request, sizeof *tcmsg, type, NLM_F_REQUEST | flags);
3901 tcmsg = ofpbuf_put_zeros(request, sizeof *tcmsg);
3902 tcmsg->tcm_family = AF_UNSPEC;
3903 tcmsg->tcm_ifindex = ifindex;
3904 /* Caller should fill in tcmsg->tcm_handle. */
3905 /* Caller should fill in tcmsg->tcm_parent. */
/* Sends 'request' on NETLINK_ROUTE (optionally collecting a reply in
 * '*replyp') and releases the request buffer. */
3911 tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
3913 int error = nl_transact(NETLINK_ROUTE, request, replyp);
3914 ofpbuf_uninit(request);
3918 /* Adds or deletes a root ingress qdisc on 'netdev'. We use this for
3919 * policing configuration.
3921 * This function is equivalent to running the following when 'add' is true:
3922 * /sbin/tc qdisc add dev <devname> handle ffff: ingress
3924 * This function is equivalent to running the following when 'add' is false:
3925 * /sbin/tc qdisc del dev <devname> handle ffff: ingress
3927 * The configuration and stats may be seen with the following command:
3928 * /sbin/tc -s qdisc show dev <devname>
3930 * Returns 0 if successful, otherwise a positive errno value.
/* Handle ffff:0 under the special TC_H_INGRESS parent; for delete,
 * ENOENT ("no such qdisc") and EINVAL are deliberately ignored. */
3933 tc_add_del_ingress_qdisc(struct netdev *netdev, bool add)
3935 struct ofpbuf request;
3936 struct tcmsg *tcmsg;
3938 int type = add ? RTM_NEWQDISC : RTM_DELQDISC;
3939 int flags = add ? NLM_F_EXCL | NLM_F_CREATE : 0;
3941 tcmsg = tc_make_request(netdev, type, flags, &request);
3945 tcmsg->tcm_handle = tc_make_handle(0xffff, 0);
3946 tcmsg->tcm_parent = TC_H_INGRESS;
3947 nl_msg_put_string(&request, TCA_KIND, "ingress");
3948 nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0);
3950 error = tc_transact(&request, NULL);
3952 /* If we're deleting the qdisc, don't worry about some of the
3953 * error conditions. */
3954 if (!add && (error == ENOENT || error == EINVAL)) {
3963 /* Adds a policer to 'netdev' with a rate of 'kbits_rate' and a burst size
3966 * This function is equivalent to running:
3967 * /sbin/tc filter add dev <devname> parent ffff: protocol all prio 49
3968 * basic police rate <kbits_rate>kbit burst <kbits_burst>k
3971 * The configuration and stats may be seen with the following command:
3972 * /sbin/tc -s filter show <devname> eth0 parent ffff:
3974 * Returns 0 if successful, otherwise a positive errno value.
3977 tc_add_policer(struct netdev *netdev, int kbits_rate, int kbits_burst)
3979 struct tc_police tc_police;
3980 struct ofpbuf request;
3981 struct tcmsg *tcmsg;
3982 size_t basic_offset;
3983 size_t police_offset;
3987 memset(&tc_police, 0, sizeof tc_police);
3988 tc_police.action = TC_POLICE_SHOT;
3989 tc_police.mtu = mtu;
3990 tc_fill_rate(&tc_police.rate, (kbits_rate * 1000)/8, mtu);
3991 tc_police.burst = tc_bytes_to_ticks(tc_police.rate.rate,
3992 kbits_burst * 1024);
3994 tcmsg = tc_make_request(netdev, RTM_NEWTFILTER,
3995 NLM_F_EXCL | NLM_F_CREATE, &request);
3999 tcmsg->tcm_parent = tc_make_handle(0xffff, 0);
4000 tcmsg->tcm_info = tc_make_handle(49,
4001 (OVS_FORCE uint16_t) htons(ETH_P_ALL));
4003 nl_msg_put_string(&request, TCA_KIND, "basic");
4004 basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
4005 police_offset = nl_msg_start_nested(&request, TCA_BASIC_POLICE);
4006 nl_msg_put_unspec(&request, TCA_POLICE_TBF, &tc_police, sizeof tc_police);
4007 tc_put_rtab(&request, TCA_POLICE_RATE, &tc_police.rate);
4008 nl_msg_end_nested(&request, police_offset);
4009 nl_msg_end_nested(&request, basic_offset);
4011 error = tc_transact(&request, NULL);
4022 /* The values in psched are not individually very meaningful, but they are
4023 * important. The tables below show some values seen in the wild.
4027 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
4028 * (Before that, there are hints that it was 1000000000.)
4030 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
4034 * -----------------------------------
4035 * [1] 000c8000 000f4240 000f4240 00000064
4036 * [2] 000003e8 00000400 000f4240 3b9aca00
4037 * [3] 000003e8 00000400 000f4240 3b9aca00
4038 * [4] 000003e8 00000400 000f4240 00000064
4039 * [5] 000003e8 00000040 000f4240 3b9aca00
4040 * [6] 000003e8 00000040 000f4240 000000f9
4042 * a b c d ticks_per_s buffer_hz
4043 * ------- --------- ---------- ------------- ----------- -------------
4044 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
4045 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
4046 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
4047 * [4] 1,000 1,024 1,000,000 100 976,562 100
4048 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
4049 * [6] 1,000 64 1,000,000 249 15,625,000 249
4051 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
4052 * [2] 2.6.26-1-686-bigmem from Debian lenny
4053 * [3] 2.6.26-2-sparc64 from Debian lenny
4054 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
4055 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
4056 * [6] 2.6.34 from kernel.org on KVM
4058 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
4059 static const char fn[] = "/proc/net/psched";
4060 unsigned int a, b, c, d;
4063 if (!ovsthread_once_start(&once)) {
4070 stream = fopen(fn, "r");
4072 VLOG_WARN("%s: open failed: %s", fn, ovs_strerror(errno));
4076 if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
4077 VLOG_WARN("%s: read failed", fn);
4081 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
4085 VLOG_WARN("%s: invalid scheduler parameters", fn);
4089 ticks_per_s = (double) a * c / b;
4093 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
4096 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);
4099 ovsthread_once_done(&once);
4102 /* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
4103 * rate of 'rate' bytes per second. */
4105 tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
4108 return (rate * ticks) / ticks_per_s;
4111 /* Returns the number of ticks that it would take to transmit 'size' bytes at a
4112 * rate of 'rate' bytes per second. */
4114 tc_bytes_to_ticks(unsigned int rate, unsigned int size)
4117 return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
4120 /* Returns the number of bytes that need to be reserved for qdisc buffering at
4121 * a transmission rate of 'rate' bytes per second. */
4123 tc_buffer_per_jiffy(unsigned int rate)
4126 return rate / buffer_hz;
4129 /* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
4130 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
4131 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
4132 * stores NULL into it if it is absent.
4134 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
4137 * Returns 0 if successful, otherwise a positive errno value. */
4139 tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
4140 struct nlattr **options)
4142 static const struct nl_policy tca_policy[] = {
4143 [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
4144 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
4146 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
4148 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
4149 tca_policy, ta, ARRAY_SIZE(ta))) {
4150 VLOG_WARN_RL(&rl, "failed to parse qdisc message");
4155 *kind = nl_attr_get_string(ta[TCA_KIND]);
4159 *options = ta[TCA_OPTIONS];
4174 /* Given Netlink 'msg' that describes a class, extracts the queue ID (e.g. the
4175 * minor number of its class ID) into '*queue_id', its TCA_OPTIONS attribute
4176 * into '*options', and its queue statistics into '*stats'. Any of the output
4177 * arguments may be null.
4179 * Returns 0 if successful, otherwise a positive errno value. */
4181 tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
4182 struct nlattr **options, struct netdev_queue_stats *stats)
4184 static const struct nl_policy tca_policy[] = {
4185 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
4186 [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
4188 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
4190 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
4191 tca_policy, ta, ARRAY_SIZE(ta))) {
4192 VLOG_WARN_RL(&rl, "failed to parse class message");
4197 struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
4198 *handlep = tc->tcm_handle;
4202 *options = ta[TCA_OPTIONS];
4206 const struct gnet_stats_queue *gsq;
4207 struct gnet_stats_basic gsb;
4209 static const struct nl_policy stats_policy[] = {
4210 [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
4211 .min_len = sizeof gsb },
4212 [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
4213 .min_len = sizeof *gsq },
4215 struct nlattr *sa[ARRAY_SIZE(stats_policy)];
4217 if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
4218 sa, ARRAY_SIZE(sa))) {
4219 VLOG_WARN_RL(&rl, "failed to parse class stats");
4223 /* Alignment issues screw up the length of struct gnet_stats_basic on
4224 * some arch/bitsize combinations. Newer versions of Linux have a
4225 * struct gnet_stats_basic_packed, but we can't depend on that. The
4226 * easiest thing to do is just to make a copy. */
4227 memset(&gsb, 0, sizeof gsb);
4228 memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
4229 MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
4230 stats->tx_bytes = gsb.bytes;
4231 stats->tx_packets = gsb.packets;
4233 gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
4234 stats->tx_errors = gsq->drops;
4244 memset(stats, 0, sizeof *stats);
4249 /* Queries the kernel for class with identifier 'handle' and parent 'parent'
4252 tc_query_class(const struct netdev *netdev,
4253 unsigned int handle, unsigned int parent,
4254 struct ofpbuf **replyp)
4256 struct ofpbuf request;
4257 struct tcmsg *tcmsg;
4260 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO, &request);
4264 tcmsg->tcm_handle = handle;
4265 tcmsg->tcm_parent = parent;
4267 error = tc_transact(&request, replyp);
4269 VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
4270 netdev_get_name(netdev),
4271 tc_get_major(handle), tc_get_minor(handle),
4272 tc_get_major(parent), tc_get_minor(parent),
4273 ovs_strerror(error));
4278 /* Equivalent to "tc class del dev <name> handle <handle>". */
4280 tc_delete_class(const struct netdev *netdev, unsigned int handle)
4282 struct ofpbuf request;
4283 struct tcmsg *tcmsg;
4286 tcmsg = tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
4290 tcmsg->tcm_handle = handle;
4291 tcmsg->tcm_parent = 0;
4293 error = tc_transact(&request, NULL);
4295 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
4296 netdev_get_name(netdev),
4297 tc_get_major(handle), tc_get_minor(handle),
4298 ovs_strerror(error));
4303 /* Equivalent to "tc qdisc del dev <name> root". */
4305 tc_del_qdisc(struct netdev *netdev_)
4307 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4308 struct ofpbuf request;
4309 struct tcmsg *tcmsg;
4312 tcmsg = tc_make_request(netdev_, RTM_DELQDISC, 0, &request);
4316 tcmsg->tcm_handle = tc_make_handle(1, 0);
4317 tcmsg->tcm_parent = TC_H_ROOT;
4319 error = tc_transact(&request, NULL);
4320 if (error == EINVAL) {
4321 /* EINVAL probably means that the default qdisc was in use, in which
4322 * case we've accomplished our purpose. */
4325 if (!error && netdev->tc) {
4326 if (netdev->tc->ops->tc_destroy) {
4327 netdev->tc->ops->tc_destroy(netdev->tc);
4334 /* If 'netdev''s qdisc type and parameters are not yet known, queries the
4335 * kernel to determine what they are. Returns 0 if successful, otherwise a
4336 * positive errno value. */
4338 tc_query_qdisc(const struct netdev *netdev_)
4340 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4341 struct ofpbuf request, *qdisc;
4342 const struct tc_ops *ops;
4343 struct tcmsg *tcmsg;
4351 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
4352 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
4353 * 2.6.35 without that fix backported to it.
4355 * To avoid the OOPS, we must not make a request that would attempt to dump
4356 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
4357 * few others. There are a few ways that I can see to do this, but most of
4358 * them seem to be racy (and if you lose the race the kernel OOPSes). The
4359 * technique chosen here is to assume that any non-default qdisc that we
4360 * create will have a class with handle 1:0. The built-in qdiscs only have
4361 * a class with handle 0:0.
4363 * We could check for Linux 2.6.35+ and use a more straightforward method
4365 tcmsg = tc_make_request(netdev_, RTM_GETQDISC, NLM_F_ECHO, &request);
4369 tcmsg->tcm_handle = tc_make_handle(1, 0);
4370 tcmsg->tcm_parent = 0;
4372 /* Figure out what tc class to instantiate. */
4373 error = tc_transact(&request, &qdisc);
4377 error = tc_parse_qdisc(qdisc, &kind, NULL);
4379 ops = &tc_ops_other;
4381 ops = tc_lookup_linux_name(kind);
4383 static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
4384 VLOG_INFO_RL(&rl2, "unknown qdisc \"%s\"", kind);
4386 ops = &tc_ops_other;
4389 } else if (error == ENOENT) {
4390 /* Either it's a built-in qdisc, or it's a qdisc set up by some
4391 * other entity that doesn't have a handle 1:0. We will assume
4392 * that it's the system default qdisc. */
4393 ops = &tc_ops_default;
4396 /* Who knows? Maybe the device got deleted. */
4397 VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
4398 netdev_get_name(netdev_), ovs_strerror(error));
4399 ops = &tc_ops_other;
4402 /* Instantiate it. */
4403 load_error = ops->tc_load(CONST_CAST(struct netdev *, netdev_), qdisc);
4404 ovs_assert((load_error == 0) == (netdev->tc != NULL));
4405 ofpbuf_delete(qdisc);
4407 return error ? error : load_error;
4410 /* Linux traffic control uses tables with 256 entries ("rtab" tables) to
4411 approximate the time to transmit packets of various lengths. For an MTU of
4412 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
4413 represents two possible packet lengths; for a MTU of 513 through 1024, four
4414 possible lengths; and so on.
4416 Returns, for the specified 'mtu', the number of bits that packet lengths
4417 need to be shifted right to fit within such a 256-entry table. */
4419 tc_calc_cell_log(unsigned int mtu)
4424 mtu = ETH_PAYLOAD_MAX;
4426 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
4428 for (cell_log = 0; mtu >= 256; cell_log++) {
4435 /* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
4438 tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
4440 memset(rate, 0, sizeof *rate);
4441 rate->cell_log = tc_calc_cell_log(mtu);
4442 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
4443 /* rate->cell_align = 0; */ /* distro headers. */
4444 rate->mpu = ETH_TOTAL_MIN;
/* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
 * attribute of the specified "type".
 *
 * See tc_calc_cell_log() above for a description of "rtab"s. */
static void
tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
{
    uint32_t *rtab;
    unsigned int i;

    rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
    for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
        unsigned packet_size = (i + 1) << rate->cell_log;
        /* Entries below the minimum packet unit use its cost instead. */
        if (packet_size < rate->mpu) {
            packet_size = rate->mpu;
        }
        rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
    }
}
/* Calculates the proper value of 'buffer' or 'cbuffer' in HTB options given a
 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
 * burst size of 'burst_bytes'.  (If no value was requested, a 'burst_bytes' of
 * 0 is fine.) */
static unsigned int
tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
{
    /* Never buffer less than one jiffy's worth of data plus one packet. */
    unsigned int min_burst = tc_buffer_per_jiffy(Bps) + mtu;
    return tc_bytes_to_ticks(Bps, MAX(burst_bytes, min_burst));
}
4479 /* Linux-only functions declared in netdev-linux.h */
4481 /* Modifies the 'flag' bit in ethtool's flags field for 'netdev'. If
4482 * 'enable' is true, the bit is set. Otherwise, it is cleared. */
4484 netdev_linux_ethtool_set_flag(struct netdev *netdev, uint32_t flag,
4485 const char *flag_name, bool enable)
4487 const char *netdev_name = netdev_get_name(netdev);
4488 struct ethtool_value evalue;
4492 COVERAGE_INC(netdev_get_ethtool);
4493 memset(&evalue, 0, sizeof evalue);
4494 error = netdev_linux_do_ethtool(netdev_name,
4495 (struct ethtool_cmd *)&evalue,
4496 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
4501 COVERAGE_INC(netdev_set_ethtool);
4502 evalue.data = new_flags = (evalue.data & ~flag) | (enable ? flag : 0);
4503 error = netdev_linux_do_ethtool(netdev_name,
4504 (struct ethtool_cmd *)&evalue,
4505 ETHTOOL_SFLAGS, "ETHTOOL_SFLAGS");
4510 COVERAGE_INC(netdev_get_ethtool);
4511 memset(&evalue, 0, sizeof evalue);
4512 error = netdev_linux_do_ethtool(netdev_name,
4513 (struct ethtool_cmd *)&evalue,
4514 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
4519 if (new_flags != evalue.data) {
4520 VLOG_WARN_RL(&rl, "attempt to %s ethtool %s flag on network "
4521 "device %s failed", enable ? "enable" : "disable",
4522 flag_name, netdev_name);
4529 /* Utility functions. */
4531 /* Copies 'src' into 'dst', performing format conversion in the process. */
4533 netdev_stats_from_rtnl_link_stats(struct netdev_stats *dst,
4534 const struct rtnl_link_stats *src)
4536 dst->rx_packets = src->rx_packets;
4537 dst->tx_packets = src->tx_packets;
4538 dst->rx_bytes = src->rx_bytes;
4539 dst->tx_bytes = src->tx_bytes;
4540 dst->rx_errors = src->rx_errors;
4541 dst->tx_errors = src->tx_errors;
4542 dst->rx_dropped = src->rx_dropped;
4543 dst->tx_dropped = src->tx_dropped;
4544 dst->multicast = src->multicast;
4545 dst->collisions = src->collisions;
4546 dst->rx_length_errors = src->rx_length_errors;
4547 dst->rx_over_errors = src->rx_over_errors;
4548 dst->rx_crc_errors = src->rx_crc_errors;
4549 dst->rx_frame_errors = src->rx_frame_errors;
4550 dst->rx_fifo_errors = src->rx_fifo_errors;
4551 dst->rx_missed_errors = src->rx_missed_errors;
4552 dst->tx_aborted_errors = src->tx_aborted_errors;
4553 dst->tx_carrier_errors = src->tx_carrier_errors;
4554 dst->tx_fifo_errors = src->tx_fifo_errors;
4555 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
4556 dst->tx_window_errors = src->tx_window_errors;
4560 get_stats_via_netlink(const struct netdev *netdev_, struct netdev_stats *stats)
4562 struct ofpbuf request;
4563 struct ofpbuf *reply;
4566 ofpbuf_init(&request, 0);
4567 nl_msg_put_nlmsghdr(&request,
4568 sizeof(struct ifinfomsg) + NL_ATTR_SIZE(IFNAMSIZ),
4569 RTM_GETLINK, NLM_F_REQUEST);
4570 ofpbuf_put_zeros(&request, sizeof(struct ifinfomsg));
4571 nl_msg_put_string(&request, IFLA_IFNAME, netdev_get_name(netdev_));
4572 error = nl_transact(NETLINK_ROUTE, &request, &reply);
4573 ofpbuf_uninit(&request);
4578 if (ofpbuf_try_pull(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg))) {
4579 const struct nlattr *a = nl_attr_find(reply, 0, IFLA_STATS);
4580 if (a && nl_attr_get_size(a) >= sizeof(struct rtnl_link_stats)) {
4581 netdev_stats_from_rtnl_link_stats(stats, nl_attr_get(a));
4584 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
4588 VLOG_WARN_RL(&rl, "short RTM_GETLINK reply");
4593 ofpbuf_delete(reply);
4598 get_flags(const struct netdev *dev, unsigned int *flags)
4604 error = af_inet_ifreq_ioctl(dev->name, &ifr, SIOCGIFFLAGS, "SIOCGIFFLAGS");
4606 *flags = ifr.ifr_flags;
4612 set_flags(const char *name, unsigned int flags)
4616 ifr.ifr_flags = flags;
4617 return af_inet_ifreq_ioctl(name, &ifr, SIOCSIFFLAGS, "SIOCSIFFLAGS");
4621 do_get_ifindex(const char *netdev_name)
4626 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4627 COVERAGE_INC(netdev_get_ifindex);
4629 error = af_inet_ioctl(SIOCGIFINDEX, &ifr);
4631 VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
4632 netdev_name, ovs_strerror(error));
4635 return ifr.ifr_ifindex;
4639 get_ifindex(const struct netdev *netdev_, int *ifindexp)
4641 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4643 if (!(netdev->cache_valid & VALID_IFINDEX)) {
4644 int ifindex = do_get_ifindex(netdev_get_name(netdev_));
4647 netdev->get_ifindex_error = -ifindex;
4648 netdev->ifindex = 0;
4650 netdev->get_ifindex_error = 0;
4651 netdev->ifindex = ifindex;
4653 netdev->cache_valid |= VALID_IFINDEX;
4656 *ifindexp = netdev->ifindex;
4657 return netdev->get_ifindex_error;
4661 get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN])
4667 memset(&ifr, 0, sizeof ifr);
4668 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4669 COVERAGE_INC(netdev_get_hwaddr);
4670 error = af_inet_ioctl(SIOCGIFHWADDR, &ifr);
4672 /* ENODEV probably means that a vif disappeared asynchronously and
4673 * hasn't been removed from the database yet, so reduce the log level
4674 * to INFO for that case. */
4675 VLOG(error == ENODEV ? VLL_INFO : VLL_ERR,
4676 "ioctl(SIOCGIFHWADDR) on %s device failed: %s",
4677 netdev_name, ovs_strerror(error));
4680 hwaddr_family = ifr.ifr_hwaddr.sa_family;
4681 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER) {
4682 VLOG_WARN("%s device has unknown hardware address family %d",
4683 netdev_name, hwaddr_family);
4685 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
4690 set_etheraddr(const char *netdev_name,
4691 const uint8_t mac[ETH_ADDR_LEN])
4696 memset(&ifr, 0, sizeof ifr);
4697 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4698 ifr.ifr_hwaddr.sa_family = ARPHRD_ETHER;
4699 memcpy(ifr.ifr_hwaddr.sa_data, mac, ETH_ADDR_LEN);
4700 COVERAGE_INC(netdev_set_hwaddr);
4701 error = af_inet_ioctl(SIOCSIFHWADDR, &ifr);
4703 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
4704 netdev_name, ovs_strerror(error));
4710 netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
4711 int cmd, const char *cmd_name)
4716 memset(&ifr, 0, sizeof ifr);
4717 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
4718 ifr.ifr_data = (caddr_t) ecmd;
4721 error = af_inet_ioctl(SIOCETHTOOL, &ifr);
4723 if (error != EOPNOTSUPP) {
4724 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
4725 "failed: %s", cmd_name, name, ovs_strerror(error));
4727 /* The device doesn't support this operation. That's pretty
4728 * common, so there's no point in logging anything. */
4735 netdev_linux_get_ipv4(const struct netdev *netdev, struct in_addr *ip,
4736 int cmd, const char *cmd_name)
4741 ifr.ifr_addr.sa_family = AF_INET;
4742 error = af_inet_ifreq_ioctl(netdev_get_name(netdev), &ifr, cmd, cmd_name);
4744 const struct sockaddr_in *sin = ALIGNED_CAST(struct sockaddr_in *,
4746 *ip = sin->sin_addr;
4751 /* Returns an AF_PACKET raw socket or a negative errno value. */
4753 af_packet_sock(void)
4755 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
4758 if (ovsthread_once_start(&once)) {
4759 sock = socket(AF_PACKET, SOCK_RAW, 0);
4761 int error = set_nonblocking(sock);
4768 VLOG_ERR("failed to create packet socket: %s",
4769 ovs_strerror(errno));
4771 ovsthread_once_done(&once);