2 * Copyright (c) 2009, 2010, 2011, 2012, 2013 Nicira, Inc.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
19 #include "netdev-linux.h"
23 #include <arpa/inet.h>
25 #include <linux/filter.h>
26 #include <linux/gen_stats.h>
27 #include <linux/if_ether.h>
28 #include <linux/if_tun.h>
29 #include <linux/types.h>
30 #include <linux/ethtool.h>
31 #include <linux/mii.h>
32 #include <linux/pkt_cls.h>
33 #include <linux/pkt_sched.h>
34 #include <linux/rtnetlink.h>
35 #include <linux/sockios.h>
36 #include <linux/version.h>
37 #include <sys/types.h>
38 #include <sys/ioctl.h>
39 #include <sys/socket.h>
40 #include <netpacket/packet.h>
42 #include <net/if_arp.h>
43 #include <net/if_packet.h>
44 #include <net/route.h>
45 #include <netinet/in.h>
52 #include "dpif-linux.h"
53 #include "dynamic-string.h"
54 #include "fatal-signal.h"
57 #include "netdev-provider.h"
58 #include "netdev-vport.h"
59 #include "netlink-notifier.h"
60 #include "netlink-socket.h"
63 #include "openflow/openflow.h"
64 #include "ovs-atomic.h"
66 #include "poll-loop.h"
67 #include "rtnetlink-link.h"
69 #include "socket-util.h"
72 #include "unaligned.h"
75 VLOG_DEFINE_THIS_MODULE(netdev_linux);
77 COVERAGE_DEFINE(netdev_set_policing);
78 COVERAGE_DEFINE(netdev_arp_lookup);
79 COVERAGE_DEFINE(netdev_get_ifindex);
80 COVERAGE_DEFINE(netdev_get_hwaddr);
81 COVERAGE_DEFINE(netdev_set_hwaddr);
82 COVERAGE_DEFINE(netdev_get_ethtool);
83 COVERAGE_DEFINE(netdev_set_ethtool);
86 /* These were introduced in Linux 2.6.14, so they might be missing if we have
88 #ifndef ADVERTISED_Pause
89 #define ADVERTISED_Pause (1 << 13)
91 #ifndef ADVERTISED_Asym_Pause
92 #define ADVERTISED_Asym_Pause (1 << 14)
95 /* These were introduced in Linux 2.6.24, so they might be missing if we
96 * have old headers. */
97 #ifndef ETHTOOL_GFLAGS
98 #define ETHTOOL_GFLAGS 0x00000025 /* Get flags bitmap(ethtool_value) */
100 #ifndef ETHTOOL_SFLAGS
101 #define ETHTOOL_SFLAGS 0x00000026 /* Set flags bitmap(ethtool_value) */
104 /* This was introduced in Linux 2.6.25, so it might be missing if we have old
107 #define TC_RTAB_SIZE 1024
111 VALID_IFINDEX = 1 << 0,
112 VALID_ETHERADDR = 1 << 1,
116 VALID_POLICING = 1 << 5,
117 VALID_VPORT_STAT_ERROR = 1 << 6,
118 VALID_DRVINFO = 1 << 7,
119 VALID_FEATURES = 1 << 8,
122 /* Traffic control. */
124 /* An instance of a traffic control class. Always associated with a particular
127 * Each TC implementation subclasses this with whatever additional data it
130 const struct tc_ops *ops;
131 struct hmap queues; /* Contains "struct tc_queue"s.
132 * Read by generic TC layer.
133 * Written only by TC implementation. */
136 #define TC_INITIALIZER(TC, OPS) { OPS, HMAP_INITIALIZER(&(TC)->queues) }
138 /* One traffic control queue.
140 * Each TC implementation subclasses this with whatever additional data it
143 struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
144 unsigned int queue_id; /* OpenFlow queue ID. */
145 long long int created; /* Time queue was created, in msecs. */
148 /* A particular kind of traffic control. Each implementation generally maps to
149 * one particular Linux qdisc class.
151 * The functions below return 0 if successful or a positive errno value on
152 * failure, except where otherwise noted. All of them must be provided, except
153 * where otherwise noted. */
155 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
156 * This is null for tc_ops_default and tc_ops_other, for which there are no
157 * appropriate values. */
158 const char *linux_name;
160 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
161 const char *ovs_name;
163 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
164 * queues. The queues are numbered 0 through n_queues - 1. */
165 unsigned int n_queues;
167 /* Called to install this TC class on 'netdev'. The implementation should
168 * make the Netlink calls required to set up 'netdev' with the right qdisc
169 * and configure it according to 'details'. The implementation may assume
170 * that the current qdisc is the default; that is, there is no need for it
171 * to delete the current qdisc before installing itself.
173 * The contents of 'details' should be documented as valid for 'ovs_name'
174 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
175 * (which is built as ovs-vswitchd.conf.db(8)).
177 * This function must return 0 if and only if it sets 'netdev->tc' to an
178 * initialized 'struct tc'.
180 * (This function is null for tc_ops_other, which cannot be installed. For
181 * other TC classes it should always be nonnull.) */
182 int (*tc_install)(struct netdev *netdev, const struct smap *details);
184 /* Called when the netdev code determines (through a Netlink query) that
185 * this TC class's qdisc is installed on 'netdev', but we didn't install
186 * it ourselves and so don't know any of the details.
188 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
189 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
190 * implementation should parse the other attributes of 'nlmsg' as
191 * necessary to determine its configuration. If necessary it should also
192 * use Netlink queries to determine the configuration of queues on
195 * This function must return 0 if and only if it sets 'netdev->tc' to an
196 * initialized 'struct tc'. */
197 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
199 /* Destroys the data structures allocated by the implementation as part of
200 * 'tc'. (This includes destroying 'tc->queues' by calling
203 * The implementation should not need to perform any Netlink calls. If
204 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
205 * (But it may not be desirable.)
207 * This function may be null if 'tc' is trivial. */
208 void (*tc_destroy)(struct tc *tc);
210 /* Retrieves details of 'netdev->tc' configuration into 'details'.
212 * The implementation should not need to perform any Netlink calls, because
213 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
214 * cached the configuration.
216 * The contents of 'details' should be documented as valid for 'ovs_name'
217 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
218 * (which is built as ovs-vswitchd.conf.db(8)).
220 * This function may be null if 'tc' is not configurable.
222 int (*qdisc_get)(const struct netdev *netdev, struct smap *details);
224 /* Reconfigures 'netdev->tc' according to 'details', performing any
225 * required Netlink calls to complete the reconfiguration.
227 * The contents of 'details' should be documented as valid for 'ovs_name'
228 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
229 * (which is built as ovs-vswitchd.conf.db(8)).
231 * This function may be null if 'tc' is not configurable.
233 int (*qdisc_set)(struct netdev *, const struct smap *details);
235 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
236 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
238 * The contents of 'details' should be documented as valid for 'ovs_name'
239 * in the "other_config" column in the "Queue" table in
240 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
242 * The implementation should not need to perform any Netlink calls, because
243 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
244 * cached the queue configuration.
246 * This function may be null if 'tc' does not have queues ('n_queues' is
248 int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
249 struct smap *details);
251 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
252 * 'details', perfoming any required Netlink calls to complete the
253 * reconfiguration. The caller ensures that 'queue_id' is less than
256 * The contents of 'details' should be documented as valid for 'ovs_name'
257 * in the "other_config" column in the "Queue" table in
258 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
260 * This function may be null if 'tc' does not have queues or its queues are
261 * not configurable. */
262 int (*class_set)(struct netdev *, unsigned int queue_id,
263 const struct smap *details);
265 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
266 * tc_queue's within 'netdev->tc->queues'.
268 * This function may be null if 'tc' does not have queues or its queues
269 * cannot be deleted. */
270 int (*class_delete)(struct netdev *, struct tc_queue *queue);
272 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
273 * 'struct tc_queue's within 'netdev->tc->queues'.
275 * On success, initializes '*stats'.
277 * This function may be null if 'tc' does not have queues or if it cannot
278 * report queue statistics. */
279 int (*class_get_stats)(const struct netdev *netdev,
280 const struct tc_queue *queue,
281 struct netdev_queue_stats *stats);
283 /* Extracts queue stats from 'nlmsg', which is a response to a
284 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
286 * This function may be null if 'tc' does not have queues or if it cannot
287 * report queue statistics. */
288 int (*class_dump_stats)(const struct netdev *netdev,
289 const struct ofpbuf *nlmsg,
290 netdev_dump_queue_stats_cb *cb, void *aux);
294 tc_init(struct tc *tc, const struct tc_ops *ops)
297 hmap_init(&tc->queues);
301 tc_destroy(struct tc *tc)
303 hmap_destroy(&tc->queues);
306 static const struct tc_ops tc_ops_htb;
307 static const struct tc_ops tc_ops_hfsc;
308 static const struct tc_ops tc_ops_default;
309 static const struct tc_ops tc_ops_other;
311 static const struct tc_ops *const tcs[] = {
312 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
313 &tc_ops_hfsc, /* Hierarchical fair service curve. */
314 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
315 &tc_ops_other, /* Some other qdisc. */
319 static unsigned int tc_make_handle(unsigned int major, unsigned int minor);
320 static unsigned int tc_get_major(unsigned int handle);
321 static unsigned int tc_get_minor(unsigned int handle);
323 static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
324 static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
325 static unsigned int tc_buffer_per_jiffy(unsigned int rate);
327 static struct tcmsg *tc_make_request(const struct netdev *, int type,
328 unsigned int flags, struct ofpbuf *);
329 static int tc_transact(struct ofpbuf *request, struct ofpbuf **replyp);
330 static int tc_add_del_ingress_qdisc(struct netdev *netdev, bool add);
331 static int tc_add_policer(struct netdev *netdev, int kbits_rate,
334 static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
335 struct nlattr **options);
336 static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
337 struct nlattr **options,
338 struct netdev_queue_stats *);
339 static int tc_query_class(const struct netdev *,
340 unsigned int handle, unsigned int parent,
341 struct ofpbuf **replyp);
342 static int tc_delete_class(const struct netdev *, unsigned int handle);
344 static int tc_del_qdisc(struct netdev *netdev);
345 static int tc_query_qdisc(const struct netdev *netdev);
347 static int tc_calc_cell_log(unsigned int mtu);
348 static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
349 static void tc_put_rtab(struct ofpbuf *, uint16_t type,
350 const struct tc_ratespec *rate);
351 static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
353 struct netdev_linux {
356 /* Protects all members below. */
357 struct ovs_mutex mutex;
359 unsigned int cache_valid;
360 unsigned int change_seq;
362 bool miimon; /* Link status of last poll. */
363 long long int miimon_interval; /* Miimon Poll rate. Disabled if <= 0. */
364 struct timer miimon_timer;
366 /* The following are figured out "on demand" only. They are only valid
367 * when the corresponding VALID_* bit in 'cache_valid' is set. */
369 uint8_t etheraddr[ETH_ADDR_LEN];
370 struct in_addr address, netmask;
373 unsigned int ifi_flags;
374 long long int carrier_resets;
375 uint32_t kbits_rate; /* Policing data. */
376 uint32_t kbits_burst;
377 int vport_stats_error; /* Cached error code from vport_get_stats().
378 0 or an errno value. */
379 int netdev_mtu_error; /* Cached error code from SIOCGIFMTU or SIOCSIFMTU. */
380 int ether_addr_error; /* Cached error code from set/get etheraddr. */
381 int netdev_policing_error; /* Cached error code from set policing. */
382 int get_features_error; /* Cached error code from ETHTOOL_GSET. */
383 int get_ifindex_error; /* Cached error code from SIOCGIFINDEX. */
385 enum netdev_features current; /* Cached from ETHTOOL_GSET. */
386 enum netdev_features advertised; /* Cached from ETHTOOL_GSET. */
387 enum netdev_features supported; /* Cached from ETHTOOL_GSET. */
389 struct ethtool_drvinfo drvinfo; /* Cached from ETHTOOL_GDRVINFO. */
392 /* For devices of class netdev_tap_class only. */
396 struct netdev_rx_linux {
402 /* This is set pretty low because we probably won't learn anything from the
403 * additional log messages. */
404 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
406 /* Polling miimon status for all ports causes performance degradation when
407 * handling a large number of ports. If there are no devices using miimon, then
408 * we skip netdev_linux_miimon_run() and netdev_linux_miimon_wait(). */
409 static atomic_int miimon_cnt = ATOMIC_VAR_INIT(0);
411 static void netdev_linux_run(void);
413 static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
414 int cmd, const char *cmd_name);
415 static int netdev_linux_get_ipv4(const struct netdev *, struct in_addr *,
416 int cmd, const char *cmd_name);
417 static int get_flags(const struct netdev *, unsigned int *flags);
418 static int set_flags(const char *, unsigned int flags);
419 static int update_flags(struct netdev_linux *netdev, enum netdev_flags off,
420 enum netdev_flags on, enum netdev_flags *old_flagsp)
421 OVS_REQUIRES(netdev->mutex);
422 static int do_get_ifindex(const char *netdev_name);
423 static int get_ifindex(const struct netdev *, int *ifindexp);
424 static int do_set_addr(struct netdev *netdev,
425 int ioctl_nr, const char *ioctl_name,
426 struct in_addr addr);
427 static int get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN]);
428 static int set_etheraddr(const char *netdev_name, const uint8_t[ETH_ADDR_LEN]);
429 static int get_stats_via_netlink(int ifindex, struct netdev_stats *stats);
430 static int get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats);
431 static int af_packet_sock(void);
432 static bool netdev_linux_miimon_enabled(void);
433 static void netdev_linux_miimon_run(void);
434 static void netdev_linux_miimon_wait(void);
437 is_netdev_linux_class(const struct netdev_class *netdev_class)
439 return netdev_class->run == netdev_linux_run;
443 is_tap_netdev(const struct netdev *netdev)
445 return netdev_get_class(netdev) == &netdev_tap_class;
448 static struct netdev_linux *
449 netdev_linux_cast(const struct netdev *netdev)
451 ovs_assert(is_netdev_linux_class(netdev_get_class(netdev)));
453 return CONTAINER_OF(netdev, struct netdev_linux, up);
456 static struct netdev_rx_linux *
457 netdev_rx_linux_cast(const struct netdev_rx *rx)
459 ovs_assert(is_netdev_linux_class(netdev_get_class(rx->netdev)));
460 return CONTAINER_OF(rx, struct netdev_rx_linux, up);
463 static void netdev_linux_update(struct netdev_linux *netdev,
464 const struct rtnetlink_link_change *)
465 OVS_REQUIRES(netdev->mutex);
466 static void netdev_linux_changed(struct netdev_linux *netdev,
467 unsigned int ifi_flags, unsigned int mask)
468 OVS_REQUIRES(netdev->mutex);
470 /* Returns a NETLINK_ROUTE socket listening for RTNLGRP_LINK changes, or NULL
471 * if no such socket could be created. */
472 static struct nl_sock *
473 netdev_linux_notify_sock(void)
475 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
476 static struct nl_sock *sock;
478 if (ovsthread_once_start(&once)) {
481 error = nl_sock_create(NETLINK_ROUTE, &sock);
483 error = nl_sock_join_mcgroup(sock, RTNLGRP_LINK);
485 nl_sock_destroy(sock);
489 ovsthread_once_done(&once);
496 netdev_linux_miimon_enabled(void)
500 atomic_read(&miimon_cnt, &miimon);
505 netdev_linux_run(void)
507 struct nl_sock *sock;
510 if (netdev_linux_miimon_enabled()) {
511 netdev_linux_miimon_run();
514 sock = netdev_linux_notify_sock();
520 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
521 uint64_t buf_stub[4096 / 8];
524 ofpbuf_use_stub(&buf, buf_stub, sizeof buf_stub);
525 error = nl_sock_recv(sock, &buf, false);
527 struct rtnetlink_link_change change;
529 if (rtnetlink_link_parse(&buf, &change)) {
530 struct netdev *netdev_ = netdev_from_name(change.ifname);
531 if (netdev_ && is_netdev_linux_class(netdev_->netdev_class)) {
532 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
534 ovs_mutex_lock(&netdev->mutex);
535 netdev_linux_update(netdev, &change);
536 ovs_mutex_unlock(&netdev->mutex);
538 netdev_close(netdev_);
540 } else if (error == ENOBUFS) {
541 struct shash device_shash;
542 struct shash_node *node;
546 shash_init(&device_shash);
547 netdev_get_devices(&netdev_linux_class, &device_shash);
548 SHASH_FOR_EACH (node, &device_shash) {
549 struct netdev *netdev_ = node->data;
550 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
553 ovs_mutex_lock(&netdev->mutex);
554 get_flags(netdev_, &flags);
555 netdev_linux_changed(netdev, flags, 0);
556 ovs_mutex_unlock(&netdev->mutex);
558 netdev_close(netdev_);
560 shash_destroy(&device_shash);
561 } else if (error != EAGAIN) {
562 VLOG_WARN_RL(&rl, "error reading or parsing netlink (%s)",
563 ovs_strerror(error));
570 netdev_linux_wait(void)
572 struct nl_sock *sock;
574 if (netdev_linux_miimon_enabled()) {
575 netdev_linux_miimon_wait();
577 sock = netdev_linux_notify_sock();
579 nl_sock_wait(sock, POLLIN);
584 netdev_linux_changed(struct netdev_linux *dev,
585 unsigned int ifi_flags, unsigned int mask)
586 OVS_REQUIRES(dev->mutex)
589 if (!dev->change_seq) {
593 if ((dev->ifi_flags ^ ifi_flags) & IFF_RUNNING) {
594 dev->carrier_resets++;
596 dev->ifi_flags = ifi_flags;
598 dev->cache_valid &= mask;
602 netdev_linux_update(struct netdev_linux *dev,
603 const struct rtnetlink_link_change *change)
604 OVS_REQUIRES(dev->mutex)
606 if (change->nlmsg_type == RTM_NEWLINK) {
608 netdev_linux_changed(dev, change->ifi_flags, VALID_DRVINFO);
610 /* Update netdev from rtnl-change msg. */
612 dev->mtu = change->mtu;
613 dev->cache_valid |= VALID_MTU;
614 dev->netdev_mtu_error = 0;
617 if (!eth_addr_is_zero(change->addr)) {
618 memcpy(dev->etheraddr, change->addr, ETH_ADDR_LEN);
619 dev->cache_valid |= VALID_ETHERADDR;
620 dev->ether_addr_error = 0;
623 dev->ifindex = change->ifi_index;
624 dev->cache_valid |= VALID_IFINDEX;
625 dev->get_ifindex_error = 0;
628 netdev_linux_changed(dev, change->ifi_flags, 0);
632 static struct netdev *
633 netdev_linux_alloc(void)
635 struct netdev_linux *netdev = xzalloc(sizeof *netdev);
640 netdev_linux_common_construct(struct netdev_linux *netdev)
642 ovs_mutex_init(&netdev->mutex);
643 netdev->change_seq = 1;
646 /* Creates system and internal devices. */
648 netdev_linux_construct(struct netdev *netdev_)
650 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
653 netdev_linux_common_construct(netdev);
655 error = get_flags(&netdev->up, &netdev->ifi_flags);
656 if (error == ENODEV) {
657 if (netdev->up.netdev_class != &netdev_internal_class) {
658 /* The device does not exist, so don't allow it to be opened. */
661 /* "Internal" netdevs have to be created as netdev objects before
662 * they exist in the kernel, because creating them in the kernel
663 * happens by passing a netdev object to dpif_port_add().
664 * Therefore, ignore the error. */
671 /* For most types of netdevs we open the device for each call of
672 * netdev_open(). However, this is not the case with tap devices,
673 * since it is only possible to open the device once. In this
674 * situation we share a single file descriptor, and consequently
675 * buffers, across all readers. Therefore once data is read it will
676 * be unavailable to other reads for tap devices. */
678 netdev_linux_construct_tap(struct netdev *netdev_)
680 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
681 static const char tap_dev[] = "/dev/net/tun";
682 const char *name = netdev_->name;
686 netdev_linux_common_construct(netdev);
688 /* Open tap device. */
689 netdev->tap_fd = open(tap_dev, O_RDWR);
690 if (netdev->tap_fd < 0) {
692 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, ovs_strerror(error));
696 /* Create tap device. */
697 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
698 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
699 if (ioctl(netdev->tap_fd, TUNSETIFF, &ifr) == -1) {
700 VLOG_WARN("%s: creating tap device failed: %s", name,
701 ovs_strerror(errno));
706 /* Make non-blocking. */
707 error = set_nonblocking(netdev->tap_fd);
715 close(netdev->tap_fd);
720 netdev_linux_destruct(struct netdev *netdev_)
722 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
724 if (netdev->tc && netdev->tc->ops->tc_destroy) {
725 netdev->tc->ops->tc_destroy(netdev->tc);
728 if (netdev_get_class(netdev_) == &netdev_tap_class
729 && netdev->tap_fd >= 0)
731 close(netdev->tap_fd);
734 if (netdev->miimon_interval > 0) {
736 atomic_sub(&miimon_cnt, 1, &junk);
739 ovs_mutex_destroy(&netdev->mutex);
/* Frees the memory allocated by netdev_linux_alloc(). */
static void
netdev_linux_dealloc(struct netdev *netdev_)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    free(netdev);
}
749 static struct netdev_rx *
750 netdev_linux_rx_alloc(void)
752 struct netdev_rx_linux *rx = xzalloc(sizeof *rx);
757 netdev_linux_rx_construct(struct netdev_rx *rx_)
759 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
760 struct netdev *netdev_ = rx->up.netdev;
761 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
764 ovs_mutex_lock(&netdev->mutex);
765 rx->is_tap = is_tap_netdev(netdev_);
767 rx->fd = netdev->tap_fd;
769 struct sockaddr_ll sll;
771 /* Result of tcpdump -dd inbound */
772 static const struct sock_filter filt[] = {
773 { 0x28, 0, 0, 0xfffff004 }, /* ldh [0] */
774 { 0x15, 0, 1, 0x00000004 }, /* jeq #4 jt 2 jf 3 */
775 { 0x6, 0, 0, 0x00000000 }, /* ret #0 */
776 { 0x6, 0, 0, 0x0000ffff } /* ret #65535 */
778 static const struct sock_fprog fprog = {
779 ARRAY_SIZE(filt), (struct sock_filter *) filt
782 /* Create file descriptor. */
783 rx->fd = socket(PF_PACKET, SOCK_RAW, 0);
786 VLOG_ERR("failed to create raw socket (%s)", ovs_strerror(error));
790 /* Set non-blocking mode. */
791 error = set_nonblocking(rx->fd);
796 /* Get ethernet device index. */
797 error = get_ifindex(&netdev->up, &ifindex);
802 /* Bind to specific ethernet device. */
803 memset(&sll, 0, sizeof sll);
804 sll.sll_family = AF_PACKET;
805 sll.sll_ifindex = ifindex;
806 sll.sll_protocol = (OVS_FORCE unsigned short int) htons(ETH_P_ALL);
807 if (bind(rx->fd, (struct sockaddr *) &sll, sizeof sll) < 0) {
809 VLOG_ERR("%s: failed to bind raw socket (%s)",
810 netdev_get_name(netdev_), ovs_strerror(error));
814 /* Filter for only inbound packets. */
815 error = setsockopt(rx->fd, SOL_SOCKET, SO_ATTACH_FILTER, &fprog,
819 VLOG_ERR("%s: failed to attach filter (%s)",
820 netdev_get_name(netdev_), ovs_strerror(error));
824 ovs_mutex_unlock(&netdev->mutex);
832 ovs_mutex_unlock(&netdev->mutex);
837 netdev_linux_rx_destruct(struct netdev_rx *rx_)
839 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
/* Frees the memory allocated by netdev_linux_rx_alloc(). */
static void
netdev_linux_rx_dealloc(struct netdev_rx *rx_)
{
    struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
    free(rx);
}
855 netdev_linux_rx_recv(struct netdev_rx *rx_, void *data, size_t size)
857 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
862 ? read(rx->fd, data, size)
863 : recv(rx->fd, data, size, MSG_TRUNC));
864 } while (retval < 0 && errno == EINTR);
867 return retval > size ? -EMSGSIZE : retval;
869 if (errno != EAGAIN) {
870 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
871 ovs_strerror(errno), netdev_rx_get_name(rx_));
878 netdev_linux_rx_wait(struct netdev_rx *rx_)
880 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
881 poll_fd_wait(rx->fd, POLLIN);
885 netdev_linux_rx_drain(struct netdev_rx *rx_)
887 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
890 int error = af_inet_ifreq_ioctl(netdev_rx_get_name(rx_), &ifr,
891 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
895 drain_fd(rx->fd, ifr.ifr_qlen);
898 return drain_rcvbuf(rx->fd);
902 /* Sends 'buffer' on 'netdev'. Returns 0 if successful, otherwise a positive
903 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
904 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
905 * the packet is too big or too small to transmit on the device.
907 * The caller retains ownership of 'buffer' in all cases.
909 * The kernel maintains a packet transmission queue, so the caller is not
910 * expected to do additional queuing of packets. */
912 netdev_linux_send(struct netdev *netdev_, const void *data, size_t size)
917 if (!is_tap_netdev(netdev_)) {
918 /* Use our AF_PACKET socket to send to this device. */
919 struct sockaddr_ll sll;
925 sock = af_packet_sock();
930 ifindex = netdev_get_ifindex(netdev_);
935 /* We don't bother setting most fields in sockaddr_ll because the
936 * kernel ignores them for SOCK_RAW. */
937 memset(&sll, 0, sizeof sll);
938 sll.sll_family = AF_PACKET;
939 sll.sll_ifindex = ifindex;
941 iov.iov_base = CONST_CAST(void *, data);
945 msg.msg_namelen = sizeof sll;
948 msg.msg_control = NULL;
949 msg.msg_controllen = 0;
952 retval = sendmsg(sock, &msg, 0);
954 /* Use the tap fd to send to this device. This is essential for
955 * tap devices, because packets sent to a tap device with an
956 * AF_PACKET socket will loop back to be *received* again on the
957 * tap device. This doesn't occur on other interface types
958 * because we attach a socket filter to the rx socket. */
959 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
961 retval = write(netdev->tap_fd, data, size);
965 /* The Linux AF_PACKET implementation never blocks waiting for room
966 * for packets, instead returning ENOBUFS. Translate this into
967 * EAGAIN for the caller. */
968 if (errno == ENOBUFS) {
970 } else if (errno == EINTR) {
972 } else if (errno != EAGAIN) {
973 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
974 netdev_get_name(netdev_), ovs_strerror(errno));
977 } else if (retval != size) {
978 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%"PRIuSIZE"d bytes of "
979 "%"PRIuSIZE") on %s", retval, size, netdev_get_name(netdev_));
/* Registers with the poll loop to wake up from the next call to poll_block()
 * when the packet transmission queue has sufficient room to transmit a packet
 * with netdev_send().
 *
 * The kernel maintains a packet transmission queue, so the client is not
 * expected to do additional queuing of packets.  Thus, this function is
 * unlikely to ever be used.  It is included for completeness. */
static void
netdev_linux_send_wait(struct netdev *netdev)
{
    if (is_tap_netdev(netdev)) {
        /* TAP device always accepts packets. */
        poll_immediate_wake();
    }
}
1003 /* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
1004 * otherwise a positive errno value. */
1006 netdev_linux_set_etheraddr(struct netdev *netdev_,
1007 const uint8_t mac[ETH_ADDR_LEN])
1009 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1010 enum netdev_flags old_flags = 0;
1013 ovs_mutex_lock(&netdev->mutex);
1015 if (netdev->cache_valid & VALID_ETHERADDR) {
1016 error = netdev->ether_addr_error;
1017 if (error || eth_addr_equals(netdev->etheraddr, mac)) {
1020 netdev->cache_valid &= ~VALID_ETHERADDR;
1023 /* Tap devices must be brought down before setting the address. */
1024 if (is_tap_netdev(netdev_)) {
1025 update_flags(netdev, NETDEV_UP, 0, &old_flags);
1027 error = set_etheraddr(netdev_get_name(netdev_), mac);
1028 if (!error || error == ENODEV) {
1029 netdev->ether_addr_error = error;
1030 netdev->cache_valid |= VALID_ETHERADDR;
1032 memcpy(netdev->etheraddr, mac, ETH_ADDR_LEN);
1036 if (is_tap_netdev(netdev_) && old_flags & NETDEV_UP) {
1037 update_flags(netdev, 0, NETDEV_UP, &old_flags);
1041 ovs_mutex_unlock(&netdev->mutex);
1045 /* Copies 'netdev''s MAC address to 'mac' which is passed as param. */
1047 netdev_linux_get_etheraddr(const struct netdev *netdev_,
1048 uint8_t mac[ETH_ADDR_LEN])
1050 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1053 ovs_mutex_lock(&netdev->mutex);
1054 if (!(netdev->cache_valid & VALID_ETHERADDR)) {
1055 netdev->ether_addr_error = get_etheraddr(netdev_get_name(netdev_),
1057 netdev->cache_valid |= VALID_ETHERADDR;
1060 error = netdev->ether_addr_error;
1062 memcpy(mac, netdev->etheraddr, ETH_ADDR_LEN);
1064 ovs_mutex_unlock(&netdev->mutex);
1070 netdev_linux_get_mtu__(struct netdev_linux *netdev, int *mtup)
1074 if (!(netdev->cache_valid & VALID_MTU)) {
1077 netdev->netdev_mtu_error = af_inet_ifreq_ioctl(
1078 netdev_get_name(&netdev->up), &ifr, SIOCGIFMTU, "SIOCGIFMTU");
1079 netdev->mtu = ifr.ifr_mtu;
1080 netdev->cache_valid |= VALID_MTU;
1083 error = netdev->netdev_mtu_error;
1085 *mtup = netdev->mtu;
1091 /* Returns the maximum size of transmitted (and received) packets on 'netdev',
1092 * in bytes, not including the hardware header; thus, this is typically 1500
1093 * bytes for Ethernet devices. */
1095 netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
1097 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1100 ovs_mutex_lock(&netdev->mutex);
1101 error = netdev_linux_get_mtu__(netdev, mtup);
1102 ovs_mutex_unlock(&netdev->mutex);
1107 /* Sets the maximum size of transmitted (MTU) for given device using linux
1108 * networking ioctl interface.
1111 netdev_linux_set_mtu(const struct netdev *netdev_, int mtu)
1113 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1117 ovs_mutex_lock(&netdev->mutex);
1118 if (netdev->cache_valid & VALID_MTU) {
1119 error = netdev->netdev_mtu_error;
1120 if (error || netdev->mtu == mtu) {
1123 netdev->cache_valid &= ~VALID_MTU;
1126 error = af_inet_ifreq_ioctl(netdev_get_name(netdev_), &ifr,
1127 SIOCSIFMTU, "SIOCSIFMTU");
1128 if (!error || error == ENODEV) {
1129 netdev->netdev_mtu_error = error;
1130 netdev->mtu = ifr.ifr_mtu;
1131 netdev->cache_valid |= VALID_MTU;
1134 ovs_mutex_unlock(&netdev->mutex);
1138 /* Returns the ifindex of 'netdev', if successful, as a positive number.
1139 * On failure, returns a negative errno value. */
1141 netdev_linux_get_ifindex(const struct netdev *netdev_)
1143 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1146 ovs_mutex_lock(&netdev->mutex);
1147 error = get_ifindex(netdev_, &ifindex);
1148 ovs_mutex_unlock(&netdev->mutex);
1150 return error ? -error : ifindex;
1154 netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
1156 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1158 ovs_mutex_lock(&netdev->mutex);
1159 if (netdev->miimon_interval > 0) {
1160 *carrier = netdev->miimon;
1162 *carrier = (netdev->ifi_flags & IFF_RUNNING) != 0;
1164 ovs_mutex_unlock(&netdev->mutex);
/* Returns the number of carrier resets recorded for 'netdev_'. */
1169 static long long int
1170 netdev_linux_get_carrier_resets(const struct netdev *netdev_)
1172 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1173 long long int carrier_resets;
1175 ovs_mutex_lock(&netdev->mutex);
1176 carrier_resets = netdev->carrier_resets;
1177 ovs_mutex_unlock(&netdev->mutex);
1179 return carrier_resets;
/* Issues MII ioctl 'cmd' (e.g. SIOCGMIIPHY, SIOCGMIIREG) on device 'name',
 * marshalling 'data' through the ifreq in both directions.  'cmd_name' is
 * used for error reporting by the ioctl helper. */
1183 netdev_linux_do_miimon(const char *name, int cmd, const char *cmd_name,
1184 struct mii_ioctl_data *data)
1189 memset(&ifr, 0, sizeof ifr);
/* The MII data is stored inline in the ifreq (overlaying ifr_data), not
 * pointed to by it, hence the memcpy in and back out. */
1190 memcpy(&ifr.ifr_data, data, sizeof *data);
1191 error = af_inet_ifreq_ioctl(name, &ifr, cmd, cmd_name);
1192 memcpy(data, &ifr.ifr_data, sizeof *data);
/* Determines link status for device 'name' into '*miimon'.  First tries
 * MII (SIOCGMIIPHY then a BMSR register read); if MII fails, falls back to
 * the ETHTOOL_GLINK ethtool command. */
1198 netdev_linux_get_miimon(const char *name, bool *miimon)
1200 struct mii_ioctl_data data;
1205 memset(&data, 0, sizeof data);
1206 error = netdev_linux_do_miimon(name, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
1208 /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
1209 data.reg_num = MII_BMSR;
1210 error = netdev_linux_do_miimon(name, SIOCGMIIREG, "SIOCGMIIREG",
/* BMSR_LSTATUS is the basic-mode status "link up" bit. */
1214 *miimon = !!(data.val_out & BMSR_LSTATUS);
1216 VLOG_WARN_RL(&rl, "%s: failed to query MII", name);
1219 struct ethtool_cmd ecmd;
1221 VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
1224 COVERAGE_INC(netdev_get_ethtool);
1225 memset(&ecmd, 0, sizeof ecmd);
1226 error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
1229 struct ethtool_value eval;
/* ETHTOOL_GLINK replies with a struct ethtool_value in the same buffer. */
1231 memcpy(&eval, &ecmd, sizeof eval);
1232 *miimon = !!eval.data;
1234 VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
/* Sets the MII monitoring interval for 'netdev_' to 'interval' ms.  A
 * positive interval is clamped to at least 100 ms; zero or negative
 * disables monitoring.  The global 'miimon_cnt' is adjusted when
 * monitoring is turned on or off for this device. */
1242 netdev_linux_set_miimon_interval(struct netdev *netdev_,
1243 long long int interval)
1245 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1247 ovs_mutex_lock(&netdev->mutex);
1248 interval = interval > 0 ? MAX(interval, 100) : 0;
1249 if (netdev->miimon_interval != interval) {
1252 if (interval && !netdev->miimon_interval) {
1253 atomic_add(&miimon_cnt, 1, &junk);
1254 } else if (!interval && netdev->miimon_interval) {
1255 atomic_sub(&miimon_cnt, 1, &junk);
1258 netdev->miimon_interval = interval;
/* Expire the timer so the next miimon_run() polls immediately. */
1259 timer_set_expired(&netdev->miimon_timer);
1261 ovs_mutex_unlock(&netdev->mutex);
/* Polls MII link status for every netdev-linux device whose miimon timer
 * has expired, recording the new state and signalling a device change when
 * the link flips. */
1267 netdev_linux_miimon_run(void)
1269 struct shash device_shash;
1270 struct shash_node *node;
1272 shash_init(&device_shash);
1273 netdev_get_devices(&netdev_linux_class, &device_shash);
1274 SHASH_FOR_EACH (node, &device_shash) {
1275 struct netdev *netdev = node->data;
1276 struct netdev_linux *dev = netdev_linux_cast(netdev);
1279 ovs_mutex_lock(&dev->mutex);
1280 if (dev->miimon_interval > 0 && timer_expired(&dev->miimon_timer)) {
1281 netdev_linux_get_miimon(dev->up.name, &miimon);
1282 if (miimon != dev->miimon) {
1283 dev->miimon = miimon;
1284 netdev_linux_changed(dev, dev->ifi_flags, 0);
/* Re-arm the timer for the next poll. */
1287 timer_set_duration(&dev->miimon_timer, dev->miimon_interval);
1289 ovs_mutex_unlock(&dev->mutex);
/* netdev_get_devices() took a reference on each device; drop it. */
1290 netdev_close(netdev);
1293 shash_destroy(&device_shash);
/* Arranges for the poll loop to wake up when any device's miimon timer
 * expires, so miimon_run() is called in time. */
1297 netdev_linux_miimon_wait(void)
1299 struct shash device_shash;
1300 struct shash_node *node;
1302 shash_init(&device_shash);
1303 netdev_get_devices(&netdev_linux_class, &device_shash);
1304 SHASH_FOR_EACH (node, &device_shash) {
1305 struct netdev *netdev = node->data;
1306 struct netdev_linux *dev = netdev_linux_cast(netdev);
1308 ovs_mutex_lock(&dev->mutex);
1309 if (dev->miimon_interval > 0) {
1310 timer_wait(&dev->miimon_timer);
1312 ovs_mutex_unlock(&dev->mutex);
/* Drop the reference taken by netdev_get_devices(). */
1313 netdev_close(netdev);
1315 shash_destroy(&device_shash);
1318 /* Check whether we can use RTM_GETLINK to get network device statistics.
1319 * In pre-2.6.19 kernels, this was only available if wireless extensions were
1322 check_for_working_netlink_stats(void)
1324 /* Decide on the netdev_get_stats() implementation to use. Netlink is
1325 * preferable, so if that works, we'll use it. */
/* Probe with the loopback device, which should always exist. */
1326 int ifindex = do_get_ifindex("lo");
1328 VLOG_WARN("failed to get ifindex for lo, "
1329 "obtaining netdev stats from proc");
1332 struct netdev_stats stats;
1333 int error = get_stats_via_netlink(ifindex, &stats);
1335 VLOG_DBG("obtaining netdev stats via rtnetlink");
1338 VLOG_INFO("RTM_GETLINK failed (%s), obtaining netdev stats "
1339 "via proc (you are probably running a pre-2.6.19 "
1340 "kernel)", ovs_strerror(error));
/* Exchanges the two uint64_t values; used below (netdev_tap_get_stats) to
 * swap rx/tx counters. */
1347 swap_uint64(uint64_t *a, uint64_t *b)
1354 /* Copies 'src' into 'dst', performing format conversion in the process.
1356 * 'src' is allowed to be misaligned. */
1358 netdev_stats_from_ovs_vport_stats(struct netdev_stats *dst,
1359 const struct ovs_vport_stats *src)
/* get_unaligned_u64() safely reads the possibly misaligned source. */
1361 dst->rx_packets = get_unaligned_u64(&src->rx_packets);
1362 dst->tx_packets = get_unaligned_u64(&src->tx_packets);
1363 dst->rx_bytes = get_unaligned_u64(&src->rx_bytes);
1364 dst->tx_bytes = get_unaligned_u64(&src->tx_bytes);
1365 dst->rx_errors = get_unaligned_u64(&src->rx_errors);
1366 dst->tx_errors = get_unaligned_u64(&src->tx_errors);
1367 dst->rx_dropped = get_unaligned_u64(&src->rx_dropped);
1368 dst->tx_dropped = get_unaligned_u64(&src->tx_dropped);
/* ovs_vport_stats has no equivalents for the remaining counters, so zero
 * them rather than leave them uninitialized. */
1370 dst->collisions = 0;
1371 dst->rx_length_errors = 0;
1372 dst->rx_over_errors = 0;
1373 dst->rx_crc_errors = 0;
1374 dst->rx_frame_errors = 0;
1375 dst->rx_fifo_errors = 0;
1376 dst->rx_missed_errors = 0;
1377 dst->tx_aborted_errors = 0;
1378 dst->tx_carrier_errors = 0;
1379 dst->tx_fifo_errors = 0;
1380 dst->tx_heartbeat_errors = 0;
1381 dst->tx_window_errors = 0;
/* Retrieves datapath (vport) statistics for 'netdev' into 'stats' by
 * querying the dpif-linux vport layer.  A reply that carries no stats is
 * handled separately from a lookup failure. */
1385 get_stats_via_vport__(const struct netdev *netdev, struct netdev_stats *stats)
1387 struct dpif_linux_vport reply;
1391 error = dpif_linux_vport_get(netdev_get_name(netdev), &reply, &buf);
1394 } else if (!reply.stats) {
1399 netdev_stats_from_ovs_vport_stats(stats, reply.stats);
/* Caching wrapper around get_stats_via_vport__(): only re-queries while no
 * error is cached (VALID_VPORT_STAT_ERROR), and records the result in
 * netdev->vport_stats_error.  ENOENT simply means the device is not an
 * OVS vport, so it is not logged as a warning. */
1407 get_stats_via_vport(const struct netdev *netdev_,
1408 struct netdev_stats *stats)
1410 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1412 if (!netdev->vport_stats_error ||
1413 !(netdev->cache_valid & VALID_VPORT_STAT_ERROR)) {
1416 error = get_stats_via_vport__(netdev_, stats);
1417 if (error && error != ENOENT) {
1418 VLOG_WARN_RL(&rl, "%s: obtaining netdev stats via vport failed "
1420 netdev_get_name(netdev_), ovs_strerror(error));
1422 netdev->vport_stats_error = error;
1423 netdev->cache_valid |= VALID_VPORT_STAT_ERROR;
/* Reads kernel interface statistics for 'netdev_' into 'stats', using
 * rtnetlink when available and /proc otherwise.  The choice is probed once
 * per process via check_for_working_netlink_stats(). */
1428 netdev_linux_sys_get_stats(const struct netdev *netdev_,
1429 struct netdev_stats *stats)
1431 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
1432 static int use_netlink_stats;
1435 if (ovsthread_once_start(&once)) {
1436 use_netlink_stats = check_for_working_netlink_stats();
1437 ovsthread_once_done(&once);
1440 if (use_netlink_stats) {
1443 error = get_ifindex(netdev_, &ifindex);
1445 error = get_stats_via_netlink(ifindex, stats);
1448 error = get_stats_via_proc(netdev_get_name(netdev_), stats);
1452 VLOG_WARN_RL(&rl, "%s: linux-sys get stats failed %d",
1453 netdev_get_name(netdev_), error);
1459 /* Retrieves current device stats for 'netdev-linux'. */
1461 netdev_linux_get_stats(const struct netdev *netdev_,
1462 struct netdev_stats *stats)
1464 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1465 struct netdev_stats dev_stats;
1468 ovs_mutex_lock(&netdev->mutex);
/* Prefer vport-layer stats; kernel device stats are fetched as well so
 * the error counters below (which the vport layer does not track) can be
 * folded in. */
1469 get_stats_via_vport(netdev_, stats);
1470 error = netdev_linux_sys_get_stats(netdev_, &dev_stats);
1472 if (!netdev->vport_stats_error) {
1475 } else if (netdev->vport_stats_error) {
1476 /* Stats not available from OVS, so fall back to the ioctl stats. */
/* Merge the kernel's error counters into the vport-based totals. */
1479 stats->rx_errors += dev_stats.rx_errors;
1480 stats->tx_errors += dev_stats.tx_errors;
1481 stats->rx_dropped += dev_stats.rx_dropped;
1482 stats->tx_dropped += dev_stats.tx_dropped;
1483 stats->multicast += dev_stats.multicast;
1484 stats->collisions += dev_stats.collisions;
1485 stats->rx_length_errors += dev_stats.rx_length_errors;
1486 stats->rx_over_errors += dev_stats.rx_over_errors;
1487 stats->rx_crc_errors += dev_stats.rx_crc_errors;
1488 stats->rx_frame_errors += dev_stats.rx_frame_errors;
1489 stats->rx_fifo_errors += dev_stats.rx_fifo_errors;
1490 stats->rx_missed_errors += dev_stats.rx_missed_errors;
1491 stats->tx_aborted_errors += dev_stats.tx_aborted_errors;
1492 stats->tx_carrier_errors += dev_stats.tx_carrier_errors;
1493 stats->tx_fifo_errors += dev_stats.tx_fifo_errors;
1494 stats->tx_heartbeat_errors += dev_stats.tx_heartbeat_errors;
1495 stats->tx_window_errors += dev_stats.tx_window_errors;
1497 ovs_mutex_unlock(&netdev->mutex);
1502 /* Retrieves current device stats for 'netdev-tap' netdev or
1503 * netdev-internal. */
1505 netdev_tap_get_stats(const struct netdev *netdev_, struct netdev_stats *stats)
1507 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1508 struct netdev_stats dev_stats;
1511 ovs_mutex_lock(&netdev->mutex);
1512 get_stats_via_vport(netdev_, stats);
1513 error = netdev_linux_sys_get_stats(netdev_, &dev_stats);
1515 if (!netdev->vport_stats_error) {
1518 } else if (netdev->vport_stats_error) {
1519 /* Transmit and receive stats will appear to be swapped relative to the
1520 * other ports since we are the one sending the data, not a remote
1521 * computer. For consistency, we swap them back here. This does not
1522 * apply if we are getting stats from the vport layer because it always
1523 * tracks stats from the perspective of the switch. */
1526 swap_uint64(&stats->rx_packets, &stats->tx_packets);
1527 swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
1528 swap_uint64(&stats->rx_errors, &stats->tx_errors);
1529 swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
/* Error counters that have no meaning for a tap device are cleared. */
1530 stats->rx_length_errors = 0;
1531 stats->rx_over_errors = 0;
1532 stats->rx_crc_errors = 0;
1533 stats->rx_frame_errors = 0;
1534 stats->rx_fifo_errors = 0;
1535 stats->rx_missed_errors = 0;
1536 stats->tx_aborted_errors = 0;
1537 stats->tx_carrier_errors = 0;
1538 stats->tx_fifo_errors = 0;
1539 stats->tx_heartbeat_errors = 0;
1540 stats->tx_window_errors = 0;
/* Note rx/tx crossed on purpose: the kernel's view of a tap device is the
 * mirror image of the switch's view (see comment above). */
1542 stats->rx_dropped += dev_stats.tx_dropped;
1543 stats->tx_dropped += dev_stats.rx_dropped;
1545 stats->rx_errors += dev_stats.tx_errors;
1546 stats->tx_errors += dev_stats.rx_errors;
1548 stats->multicast += dev_stats.multicast;
1549 stats->collisions += dev_stats.collisions;
1551 ovs_mutex_unlock(&netdev->mutex);
/* Retrieves stats for a 'netdev-internal' device.  Internal devices get
 * their stats solely from the vport layer; the cached vport error is
 * returned as the result. */
1557 netdev_internal_get_stats(const struct netdev *netdev_,
1558 struct netdev_stats *stats)
1560 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1563 ovs_mutex_lock(&netdev->mutex);
1564 get_stats_via_vport(netdev_, stats);
1565 error = netdev->vport_stats_error;
1566 ovs_mutex_unlock(&netdev->mutex);
/* Pushes 'stats' down to the vport layer for an internal device via an
 * OVS_VPORT_CMD_SET transaction. */
1572 netdev_internal_set_stats(struct netdev *netdev,
1573 const struct netdev_stats *stats)
1575 struct ovs_vport_stats vport_stats;
1576 struct dpif_linux_vport vport;
/* Convert the generic netdev_stats into the vport wire format. */
1579 vport_stats.rx_packets = stats->rx_packets;
1580 vport_stats.tx_packets = stats->tx_packets;
1581 vport_stats.rx_bytes = stats->rx_bytes;
1582 vport_stats.tx_bytes = stats->tx_bytes;
1583 vport_stats.rx_errors = stats->rx_errors;
1584 vport_stats.tx_errors = stats->tx_errors;
1585 vport_stats.rx_dropped = stats->rx_dropped;
1586 vport_stats.tx_dropped = stats->tx_dropped;
1588 dpif_linux_vport_init(&vport);
1589 vport.cmd = OVS_VPORT_CMD_SET;
1590 vport.name = netdev_get_name(netdev);
1591 vport.stats = &vport_stats;
1593 err = dpif_linux_vport_transact(&vport, NULL, NULL);
1595 /* If the vport layer doesn't know about the device, that doesn't mean it
1596 * doesn't exist (after all we're able to open it when netdev_open() was
1597 * called), it just means that it isn't attached and we'll be getting
1598 * stats a different way. */
1599 if (err == ENODEV) {
/* Reads link features for 'netdev' via ETHTOOL_GSET and caches the
 * supported/advertised/current NETDEV_F_* bitmaps on the device (marked
 * with VALID_FEATURES).  Any ethtool error is stored in
 * netdev->get_features_error for later retrieval. */
1607 netdev_linux_read_features(struct netdev_linux *netdev)
1609 struct ethtool_cmd ecmd;
/* Nothing to do if the cached feature bitmaps are still valid. */
1613 if (netdev->cache_valid & VALID_FEATURES) {
1617 COVERAGE_INC(netdev_get_ethtool);
1618 memset(&ecmd, 0, sizeof ecmd);
1619 error = netdev_linux_do_ethtool(netdev->up.name, &ecmd,
1620 ETHTOOL_GSET, "ETHTOOL_GSET");
1625 /* Supported features. */
1626 netdev->supported = 0;
1627 if (ecmd.supported & SUPPORTED_10baseT_Half) {
1628 netdev->supported |= NETDEV_F_10MB_HD;
1630 if (ecmd.supported & SUPPORTED_10baseT_Full) {
1631 netdev->supported |= NETDEV_F_10MB_FD;
1633 if (ecmd.supported & SUPPORTED_100baseT_Half) {
1634 netdev->supported |= NETDEV_F_100MB_HD;
1636 if (ecmd.supported & SUPPORTED_100baseT_Full) {
1637 netdev->supported |= NETDEV_F_100MB_FD;
1639 if (ecmd.supported & SUPPORTED_1000baseT_Half) {
1640 netdev->supported |= NETDEV_F_1GB_HD;
1642 if (ecmd.supported & SUPPORTED_1000baseT_Full) {
1643 netdev->supported |= NETDEV_F_1GB_FD;
1645 if (ecmd.supported & SUPPORTED_10000baseT_Full) {
1646 netdev->supported |= NETDEV_F_10GB_FD;
1648 if (ecmd.supported & SUPPORTED_TP) {
1649 netdev->supported |= NETDEV_F_COPPER;
1651 if (ecmd.supported & SUPPORTED_FIBRE) {
1652 netdev->supported |= NETDEV_F_FIBER;
1654 if (ecmd.supported & SUPPORTED_Autoneg) {
1655 netdev->supported |= NETDEV_F_AUTONEG;
1657 if (ecmd.supported & SUPPORTED_Pause) {
1658 netdev->supported |= NETDEV_F_PAUSE;
1660 if (ecmd.supported & SUPPORTED_Asym_Pause) {
1661 netdev->supported |= NETDEV_F_PAUSE_ASYM;
1664 /* Advertised features. */
1665 netdev->advertised = 0;
1666 if (ecmd.advertising & ADVERTISED_10baseT_Half) {
1667 netdev->advertised |= NETDEV_F_10MB_HD;
1669 if (ecmd.advertising & ADVERTISED_10baseT_Full) {
1670 netdev->advertised |= NETDEV_F_10MB_FD;
1672 if (ecmd.advertising & ADVERTISED_100baseT_Half) {
1673 netdev->advertised |= NETDEV_F_100MB_HD;
1675 if (ecmd.advertising & ADVERTISED_100baseT_Full) {
1676 netdev->advertised |= NETDEV_F_100MB_FD;
1678 if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
1679 netdev->advertised |= NETDEV_F_1GB_HD;
1681 if (ecmd.advertising & ADVERTISED_1000baseT_Full) {
1682 netdev->advertised |= NETDEV_F_1GB_FD;
1684 if (ecmd.advertising & ADVERTISED_10000baseT_Full) {
1685 netdev->advertised |= NETDEV_F_10GB_FD;
1687 if (ecmd.advertising & ADVERTISED_TP) {
1688 netdev->advertised |= NETDEV_F_COPPER;
1690 if (ecmd.advertising & ADVERTISED_FIBRE) {
1691 netdev->advertised |= NETDEV_F_FIBER;
1693 if (ecmd.advertising & ADVERTISED_Autoneg) {
1694 netdev->advertised |= NETDEV_F_AUTONEG;
1696 if (ecmd.advertising & ADVERTISED_Pause) {
1697 netdev->advertised |= NETDEV_F_PAUSE;
1699 if (ecmd.advertising & ADVERTISED_Asym_Pause) {
1700 netdev->advertised |= NETDEV_F_PAUSE_ASYM;
1703 /* Current settings. */
1705 if (speed == SPEED_10) {
1706 netdev->current = ecmd.duplex ? NETDEV_F_10MB_FD : NETDEV_F_10MB_HD;
1707 } else if (speed == SPEED_100) {
1708 netdev->current = ecmd.duplex ? NETDEV_F_100MB_FD : NETDEV_F_100MB_HD;
1709 } else if (speed == SPEED_1000) {
1710 netdev->current = ecmd.duplex ? NETDEV_F_1GB_FD : NETDEV_F_1GB_HD;
1711 } else if (speed == SPEED_10000) {
1712 netdev->current = NETDEV_F_10GB_FD;
/* NOTE(review): raw literals instead of SPEED_40000/SPEED_100000 —
 * presumably because those macros are missing from older kernel headers;
 * confirm before replacing with the named constants. */
1713 } else if (speed == 40000) {
1714 netdev->current = NETDEV_F_40GB_FD;
1715 } else if (speed == 100000) {
1716 netdev->current = NETDEV_F_100GB_FD;
1717 } else if (speed == 1000000) {
1718 netdev->current = NETDEV_F_1TB_FD;
1720 netdev->current = 0;
1723 if (ecmd.port == PORT_TP) {
1724 netdev->current |= NETDEV_F_COPPER;
1725 } else if (ecmd.port == PORT_FIBRE) {
1726 netdev->current |= NETDEV_F_FIBER;
1730 netdev->current |= NETDEV_F_AUTONEG;
1734 netdev->cache_valid |= VALID_FEATURES;
1735 netdev->get_features_error = error;
1738 /* Stores the features supported by 'netdev' into of '*current', '*advertised',
1739 * '*supported', and '*peer'. Each value is a bitmap of NETDEV_* bits.
1740 * Returns 0 if successful, otherwise a positive errno value. */
1742 netdev_linux_get_features(const struct netdev *netdev_,
1743 enum netdev_features *current,
1744 enum netdev_features *advertised,
1745 enum netdev_features *supported,
1746 enum netdev_features *peer)
1748 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1751 ovs_mutex_lock(&netdev->mutex);
/* Populates (or reuses) the cached feature bitmaps. */
1752 netdev_linux_read_features(netdev);
1753 if (!netdev->get_features_error) {
1754 *current = netdev->current;
1755 *advertised = netdev->advertised;
1756 *supported = netdev->supported;
/* Peer features are not obtainable from ethtool. */
1757 *peer = 0; /* XXX */
1759 error = netdev->get_features_error;
1760 ovs_mutex_unlock(&netdev->mutex);
1765 /* Set the features advertised by 'netdev' to 'advertise'. */
1767 netdev_linux_set_advertisements(struct netdev *netdev_,
1768 enum netdev_features advertise)
1770 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1771 struct ethtool_cmd ecmd;
1774 ovs_mutex_lock(&netdev->mutex);
/* Read the current ethtool settings first so that only the advertising
 * mask is modified by the subsequent ETHTOOL_SSET. */
1776 COVERAGE_INC(netdev_get_ethtool);
1777 memset(&ecmd, 0, sizeof ecmd);
1778 error = netdev_linux_do_ethtool(netdev_get_name(netdev_), &ecmd,
1779 ETHTOOL_GSET, "ETHTOOL_GSET");
/* Translate NETDEV_F_* bits into the kernel's ADVERTISED_* bits. */
1784 ecmd.advertising = 0;
1785 if (advertise & NETDEV_F_10MB_HD) {
1786 ecmd.advertising |= ADVERTISED_10baseT_Half;
1788 if (advertise & NETDEV_F_10MB_FD) {
1789 ecmd.advertising |= ADVERTISED_10baseT_Full;
1791 if (advertise & NETDEV_F_100MB_HD) {
1792 ecmd.advertising |= ADVERTISED_100baseT_Half;
1794 if (advertise & NETDEV_F_100MB_FD) {
1795 ecmd.advertising |= ADVERTISED_100baseT_Full;
1797 if (advertise & NETDEV_F_1GB_HD) {
1798 ecmd.advertising |= ADVERTISED_1000baseT_Half;
1800 if (advertise & NETDEV_F_1GB_FD) {
1801 ecmd.advertising |= ADVERTISED_1000baseT_Full;
1803 if (advertise & NETDEV_F_10GB_FD) {
1804 ecmd.advertising |= ADVERTISED_10000baseT_Full;
1806 if (advertise & NETDEV_F_COPPER) {
1807 ecmd.advertising |= ADVERTISED_TP;
1809 if (advertise & NETDEV_F_FIBER) {
1810 ecmd.advertising |= ADVERTISED_FIBRE;
1812 if (advertise & NETDEV_F_AUTONEG) {
1813 ecmd.advertising |= ADVERTISED_Autoneg;
1815 if (advertise & NETDEV_F_PAUSE) {
1816 ecmd.advertising |= ADVERTISED_Pause;
1818 if (advertise & NETDEV_F_PAUSE_ASYM) {
1819 ecmd.advertising |= ADVERTISED_Asym_Pause;
1821 COVERAGE_INC(netdev_set_ethtool);
1822 error = netdev_linux_do_ethtool(netdev_get_name(netdev_), &ecmd,
1823 ETHTOOL_SSET, "ETHTOOL_SSET");
1826 ovs_mutex_unlock(&netdev->mutex);
1830 /* Attempts to set input rate limiting (policing) policy. Returns 0 if
1831 * successful, otherwise a positive errno value. */
1833 netdev_linux_set_policing(struct netdev *netdev_,
1834 uint32_t kbits_rate, uint32_t kbits_burst)
1836 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1837 const char *netdev_name = netdev_get_name(netdev_);
1840 kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
1841 : !kbits_burst ? 1000 /* Default to 1000 kbits if 0. */
1842 : kbits_burst); /* Stick with user-specified value. */
1844 ovs_mutex_lock(&netdev->mutex);
1845 if (netdev->cache_valid & VALID_POLICING) {
1846 error = netdev->netdev_policing_error;
1847 if (error || (netdev->kbits_rate == kbits_rate &&
1848 netdev->kbits_burst == kbits_burst)) {
1849 /* Assume that settings haven't changed since we last set them. */
1852 netdev->cache_valid &= ~VALID_POLICING;
1855 COVERAGE_INC(netdev_set_policing);
1856 /* Remove any existing ingress qdisc. */
1857 error = tc_add_del_ingress_qdisc(netdev_, false);
1859 VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
1860 netdev_name, ovs_strerror(error));
/* Install a fresh ingress qdisc, then attach the policer to it. */
1865 error = tc_add_del_ingress_qdisc(netdev_, true);
1867 VLOG_WARN_RL(&rl, "%s: adding policing qdisc failed: %s",
1868 netdev_name, ovs_strerror(error));
1872 error = tc_add_policer(netdev_, kbits_rate, kbits_burst);
1874 VLOG_WARN_RL(&rl, "%s: adding policing action failed: %s",
1875 netdev_name, ovs_strerror(error));
1880 netdev->kbits_rate = kbits_rate;
1881 netdev->kbits_burst = kbits_burst;
/* As elsewhere, ENODEV (device gone) is cached like success. */
1884 if (!error || error == ENODEV) {
1885 netdev->netdev_policing_error = error;
1886 netdev->cache_valid |= VALID_POLICING;
1888 ovs_mutex_unlock(&netdev->mutex);
/* Adds to 'types' the OVS name of every tc qdisc implementation that can
 * be installed (has tc_install and a nonempty OVS name). */
1893 netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
1896 const struct tc_ops *const *opsp;
1898 for (opsp = tcs; *opsp != NULL; opsp++) {
1899 const struct tc_ops *ops = *opsp;
1900 if (ops->tc_install && ops->ovs_name[0] != '\0') {
1901 sset_add(types, ops->ovs_name);
/* Looks up a tc_ops implementation by its OVS-facing name. */
1907 static const struct tc_ops *
1908 tc_lookup_ovs_name(const char *name)
1910 const struct tc_ops *const *opsp;
1912 for (opsp = tcs; *opsp != NULL; opsp++) {
1913 const struct tc_ops *ops = *opsp;
1914 if (!strcmp(name, ops->ovs_name)) {
/* Looks up a tc_ops implementation by its Linux qdisc name.  The null
 * check matters: some tc_ops have no Linux name. */
1921 static const struct tc_ops *
1922 tc_lookup_linux_name(const char *name)
1924 const struct tc_ops *const *opsp;
1926 for (opsp = tcs; *opsp != NULL; opsp++) {
1927 const struct tc_ops *ops = *opsp;
1928 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
/* Searches netdev->tc->queues for the queue with 'queue_id', using the
 * caller-supplied precomputed 'hash' to pick the bucket. */
1935 static struct tc_queue *
1936 tc_find_queue__(const struct netdev *netdev_, unsigned int queue_id,
1939 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1940 struct tc_queue *queue;
1942 HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev->tc->queues) {
1943 if (queue->queue_id == queue_id) {
/* Convenience wrapper that hashes 'queue_id' before the bucket search. */
1950 static struct tc_queue *
1951 tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
1953 return tc_find_queue__(netdev, queue_id, hash_int(queue_id, 0));
/* Reports the capabilities (queue count) of the QoS type named 'type'. */
1957 netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
1959 struct netdev_qos_capabilities *caps)
1961 const struct tc_ops *ops = tc_lookup_ovs_name(type);
1965 caps->n_queues = ops->n_queues;
/* Queries the installed qdisc, reporting its OVS type name in '*typep' and
 * its configuration (via the ops' qdisc_get hook, if any) in 'details'. */
1970 netdev_linux_get_qos(const struct netdev *netdev_,
1971 const char **typep, struct smap *details)
1973 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1976 ovs_mutex_lock(&netdev->mutex);
1977 error = tc_query_qdisc(netdev_);
1979 *typep = netdev->tc->ops->ovs_name;
1980 error = (netdev->tc->ops->qdisc_get
1981 ? netdev->tc->ops->qdisc_get(netdev_, details)
1984 ovs_mutex_unlock(&netdev->mutex);
/* Replaces the QoS configuration of 'netdev_' with type 'type' configured
 * from 'details'.  If the same implementation is already installed, its
 * qdisc_set hook updates it in place; otherwise the old qdisc is deleted
 * and the new one installed from scratch. */
1990 netdev_linux_set_qos(struct netdev *netdev_,
1991 const char *type, const struct smap *details)
1993 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1994 const struct tc_ops *new_ops;
1997 new_ops = tc_lookup_ovs_name(type);
1998 if (!new_ops || !new_ops->tc_install) {
2002 ovs_mutex_lock(&netdev->mutex);
2003 error = tc_query_qdisc(netdev_);
2008 if (new_ops == netdev->tc->ops) {
2009 error = new_ops->qdisc_set ? new_ops->qdisc_set(netdev_, details) : 0;
2011 /* Delete existing qdisc. */
2012 error = tc_del_qdisc(netdev_);
2016 ovs_assert(netdev->tc == NULL);
2018 /* Install new qdisc. */
2019 error = new_ops->tc_install(netdev_, details);
/* tc_install must set netdev->tc exactly when it succeeds. */
2020 ovs_assert((error == 0) == (netdev->tc != NULL));
2024 ovs_mutex_unlock(&netdev->mutex);
/* Retrieves the configuration of queue 'queue_id' into 'details' via the
 * installed qdisc's class_get hook. */
2029 netdev_linux_get_queue(const struct netdev *netdev_,
2030 unsigned int queue_id, struct smap *details)
2032 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2035 ovs_mutex_lock(&netdev->mutex);
2036 error = tc_query_qdisc(netdev_);
2038 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2040 ? netdev->tc->ops->class_get(netdev_, queue, details)
2043 ovs_mutex_unlock(&netdev->mutex);
/* Configures queue 'queue_id' from 'details' via the installed qdisc's
 * class_set hook, rejecting IDs beyond the qdisc's queue range. */
2049 netdev_linux_set_queue(struct netdev *netdev_,
2050 unsigned int queue_id, const struct smap *details)
2052 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2055 ovs_mutex_lock(&netdev->mutex);
2056 error = tc_query_qdisc(netdev_);
2058 error = (queue_id < netdev->tc->ops->n_queues
2059 && netdev->tc->ops->class_set
2060 ? netdev->tc->ops->class_set(netdev_, queue_id, details)
2063 ovs_mutex_unlock(&netdev->mutex);
/* Deletes queue 'queue_id' via the installed qdisc's class_delete hook,
 * when the qdisc supports deletion and the queue exists. */
2069 netdev_linux_delete_queue(struct netdev *netdev_, unsigned int queue_id)
2071 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2074 ovs_mutex_lock(&netdev->mutex);
2075 error = tc_query_qdisc(netdev_);
2077 if (netdev->tc->ops->class_delete) {
2078 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2080 ? netdev->tc->ops->class_delete(netdev_, queue)
2086 ovs_mutex_unlock(&netdev->mutex);
/* Retrieves statistics for queue 'queue_id' into 'stats' via the qdisc's
 * class_get_stats hook; the queue's creation time is filled in here. */
2092 netdev_linux_get_queue_stats(const struct netdev *netdev_,
2093 unsigned int queue_id,
2094 struct netdev_queue_stats *stats)
2096 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2099 ovs_mutex_lock(&netdev->mutex);
2100 error = tc_query_qdisc(netdev_);
2102 if (netdev->tc->ops->class_get_stats) {
2103 const struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2105 stats->created = queue->created;
2106 error = netdev->tc->ops->class_get_stats(netdev_, queue,
2115 ovs_mutex_unlock(&netdev->mutex);
/* Starts a rtnetlink RTM_GETTCLASS dump of 'netdev's traffic classes into
 * 'dump'; the caller iterates it with nl_dump_next(). */
2121 start_queue_dump(const struct netdev *netdev, struct nl_dump *dump)
2123 struct ofpbuf request;
2124 struct tcmsg *tcmsg;
2126 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
2130 tcmsg->tcm_parent = 0;
2131 nl_dump_start(dump, NETLINK_ROUTE, &request);
2132 ofpbuf_uninit(&request);
/* Iteration state for the queue dump API below.  'queues' is a malloc'd
 * snapshot of queue IDs taken in queue_dump_start() and freed in
 * queue_dump_done(). */
2136 struct netdev_linux_queue_state {
2137 unsigned int *queues;
/* Begins a queue dump: snapshots the IDs of all queues of 'netdev_' into a
 * freshly allocated netdev_linux_queue_state returned through '*statep'. */
2143 netdev_linux_queue_dump_start(const struct netdev *netdev_, void **statep)
2145 const struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2148 ovs_mutex_lock(&netdev->mutex);
2149 error = tc_query_qdisc(netdev_);
2151 if (netdev->tc->ops->class_get) {
2152 struct netdev_linux_queue_state *state;
2153 struct tc_queue *queue;
2156 *statep = state = xmalloc(sizeof *state);
2157 state->n_queues = hmap_count(&netdev->tc->queues);
2158 state->cur_queue = 0;
2159 state->queues = xmalloc(state->n_queues * sizeof *state->queues);
/* Record the IDs, not the tc_queue pointers, so the snapshot stays safe
 * if queues are modified between dump calls. */
2162 HMAP_FOR_EACH (queue, hmap_node, &netdev->tc->queues) {
2163 state->queues[i++] = queue->queue_id;
2169 ovs_mutex_unlock(&netdev->mutex);
/* Advances a queue dump: yields the next snapshot queue that still exists,
 * returning its ID in '*queue_idp' and its configuration in 'details'. */
2175 netdev_linux_queue_dump_next(const struct netdev *netdev_, void *state_,
2176 unsigned int *queue_idp, struct smap *details)
2178 const struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2179 struct netdev_linux_queue_state *state = state_;
2182 ovs_mutex_lock(&netdev->mutex);
/* Skip over queues that were deleted since the snapshot was taken. */
2183 while (state->cur_queue < state->n_queues) {
2184 unsigned int queue_id = state->queues[state->cur_queue++];
2185 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2188 *queue_idp = queue_id;
2189 error = netdev->tc->ops->class_get(netdev_, queue, details);
2193 ovs_mutex_unlock(&netdev->mutex);
/* Finishes a queue dump, releasing the state allocated by dump_start(). */
2199 netdev_linux_queue_dump_done(const struct netdev *netdev OVS_UNUSED,
2202 struct netdev_linux_queue_state *state = state_;
2204 free(state->queues);
/* Dumps statistics for all of 'netdev_'s queues, invoking 'cb' with 'aux'
 * for each class returned by an RTM_GETTCLASS netlink dump. */
2210 netdev_linux_dump_queue_stats(const struct netdev *netdev_,
2211 netdev_dump_queue_stats_cb *cb, void *aux)
2213 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2216 ovs_mutex_lock(&netdev->mutex);
2217 error = tc_query_qdisc(netdev_);
2219 struct nl_dump dump;
2221 if (!netdev->tc->ops->class_dump_stats) {
2223 } else if (!start_queue_dump(netdev_, &dump)) {
2229 while (nl_dump_next(&dump, &msg)) {
2230 retval = netdev->tc->ops->class_dump_stats(netdev_, &msg,
/* nl_dump_done() reports any error that ended the dump early. */
2237 retval = nl_dump_done(&dump);
2243 ovs_mutex_unlock(&netdev->mutex);
/* Retrieves the IPv4 address and netmask of 'netdev_', caching them via
 * SIOCGIFADDR/SIOCGIFNETMASK on first use (VALID_IN4).  Reports
 * EADDRNOTAVAIL when no address is assigned. */
2249 netdev_linux_get_in4(const struct netdev *netdev_,
2250 struct in_addr *address, struct in_addr *netmask)
2252 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2255 ovs_mutex_lock(&netdev->mutex);
2256 if (!(netdev->cache_valid & VALID_IN4)) {
2257 error = netdev_linux_get_ipv4(netdev_, &netdev->address,
2258 SIOCGIFADDR, "SIOCGIFADDR");
2260 error = netdev_linux_get_ipv4(netdev_, &netdev->netmask,
2261 SIOCGIFNETMASK, "SIOCGIFNETMASK");
2263 netdev->cache_valid |= VALID_IN4;
2271 if (netdev->address.s_addr != INADDR_ANY) {
2272 *address = netdev->address;
2273 *netmask = netdev->netmask;
2275 error = EADDRNOTAVAIL;
2278 ovs_mutex_unlock(&netdev->mutex);
/* Assigns IPv4 'address'/'netmask' to 'netdev_' via SIOCSIFADDR and, when
 * the address is not INADDR_ANY, SIOCSIFNETMASK.  Updates the IN4 cache. */
2284 netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
2285 struct in_addr netmask)
2287 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2290 ovs_mutex_lock(&netdev->mutex);
2291 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
2293 netdev->cache_valid |= VALID_IN4;
2294 netdev->address = address;
2295 netdev->netmask = netmask;
2296 if (address.s_addr != INADDR_ANY) {
2297 error = do_set_addr(netdev_, SIOCSIFNETMASK,
2298 "SIOCSIFNETMASK", netmask);
2301 ovs_mutex_unlock(&netdev->mutex);
/* Parses one line of /proc/net/if_inet6 into the 16-byte IPv6 address
 * '*in6' and the interface name 'ifname' (up to 16 chars + NUL). */
2307 parse_if_inet6_line(const char *line,
2308 struct in6_addr *in6, char ifname[16 + 1])
2310 uint8_t *s6 = in6->s6_addr;
2311 #define X8 "%2"SCNx8
/* Format: 32 hex digits (address), four hex fields we ignore, ifname. */
2312 return ovs_scan(line,
2313 " "X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8
2314 "%*x %*x %*x %*x %16s\n",
2315 &s6[0], &s6[1], &s6[2], &s6[3],
2316 &s6[4], &s6[5], &s6[6], &s6[7],
2317 &s6[8], &s6[9], &s6[10], &s6[11],
2318 &s6[12], &s6[13], &s6[14], &s6[15],
2322 /* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address (if
2323 * 'in6' is non-null) and returns true. Otherwise, returns false. */
2325 netdev_linux_get_in6(const struct netdev *netdev_, struct in6_addr *in6)
2327 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2329 ovs_mutex_lock(&netdev->mutex);
2330 if (!(netdev->cache_valid & VALID_IN6)) {
/* Default to the unspecified address if no entry is found. */
2334 netdev->in6 = in6addr_any;
2336 file = fopen("/proc/net/if_inet6", "r");
2338 const char *name = netdev_get_name(netdev_);
2339 while (fgets(line, sizeof line, file)) {
2340 struct in6_addr in6_tmp;
2341 char ifname[16 + 1];
2342 if (parse_if_inet6_line(line, &in6_tmp, ifname)
2343 && !strcmp(name, ifname))
2345 netdev->in6 = in6_tmp;
2351 netdev->cache_valid |= VALID_IN6;
2354 ovs_mutex_unlock(&netdev->mutex);
/* Fills generic sockaddr 'sa' with an AF_INET sockaddr_in for 'addr'. */
2360 make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
2362 struct sockaddr_in sin;
2363 memset(&sin, 0, sizeof sin);
2364 sin.sin_family = AF_INET;
2365 sin.sin_addr = addr;
/* Zero the whole sockaddr first; sockaddr_in may be smaller. */
2368 memset(sa, 0, sizeof *sa);
2369 memcpy(sa, &sin, sizeof sin);
/* Issues address-setting ioctl 'ioctl_nr' (e.g. SIOCSIFADDR) for 'netdev',
 * placing 'addr' into the ifreq's address field first. */
2373 do_set_addr(struct netdev *netdev,
2374 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
2378 make_in4_sockaddr(&ifr.ifr_addr, addr);
2379 return af_inet_ifreq_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
2383 /* Adds 'router' as a default IP gateway. */
2385 netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
2387 struct in_addr any = { INADDR_ANY };
2391 memset(&rt, 0, sizeof rt);
/* Destination 0.0.0.0/0 through 'router' == the default route. */
2392 make_in4_sockaddr(&rt.rt_dst, any);
2393 make_in4_sockaddr(&rt.rt_gateway, router);
2394 make_in4_sockaddr(&rt.rt_genmask, any);
2395 rt.rt_flags = RTF_UP | RTF_GATEWAY;
2396 error = af_inet_ioctl(SIOCADDRT, &rt);
2398 VLOG_WARN("ioctl(SIOCADDRT): %s", ovs_strerror(error));
/* Determines the next hop toward 'host' by scanning /proc/net/route:
 * stores the gateway (or 0 if directly reachable) in '*next_hop' and the
 * malloc'd outgoing device name in '*netdev_name'. */
2404 netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
2407 static const char fn[] = "/proc/net/route";
2412 *netdev_name = NULL;
2413 stream = fopen(fn, "r");
2414 if (stream == NULL) {
2415 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, ovs_strerror(errno));
2420 while (fgets(line, sizeof line, stream)) {
2423 ovs_be32 dest, gateway, mask;
2424 int refcnt, metric, mtu;
2425 unsigned int flags, use, window, irtt;
2428 "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
2430 iface, &dest, &gateway, &flags, &refcnt,
2431 &use, &metric, &mask, &mtu, &window, &irtt),
2432 VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
2436 if (!(flags & RTF_UP)) {
2437 /* Skip routes that aren't up. */
2441 /* The output of 'dest', 'mask', and 'gateway' were given in
2442 * network byte order, so we don't need any endian
2443 * conversions here. */
2444 if ((dest & mask) == (host->s_addr & mask)) {
2446 /* The host is directly reachable. */
2447 next_hop->s_addr = 0;
2449 /* To reach the host, we must go through a gateway. */
2450 next_hop->s_addr = gateway;
2452 *netdev_name = xstrdup(iface);
/* Populates 'smap' with driver name/version and firmware version obtained
 * via ETHTOOL_GDRVINFO; the result is cached (VALID_DRVINFO). */
2464 netdev_linux_get_status(const struct netdev *netdev_, struct smap *smap)
2466 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2469 ovs_mutex_lock(&netdev->mutex);
2470 if (!(netdev->cache_valid & VALID_DRVINFO)) {
/* The ethtool helper takes a struct ethtool_cmd; drvinfo is overlaid. */
2471 struct ethtool_cmd *cmd = (struct ethtool_cmd *) &netdev->drvinfo;
2473 COVERAGE_INC(netdev_get_ethtool);
2474 memset(&netdev->drvinfo, 0, sizeof netdev->drvinfo);
2475 error = netdev_linux_do_ethtool(netdev->up.name,
2478 "ETHTOOL_GDRVINFO");
2480 netdev->cache_valid |= VALID_DRVINFO;
2485 smap_add(smap, "driver_name", netdev->drvinfo.driver);
2486 smap_add(smap, "driver_version", netdev->drvinfo.version);
2487 smap_add(smap, "firmware_version", netdev->drvinfo.fw_version);
2489 ovs_mutex_unlock(&netdev->mutex);
/* Internal devices have no hardware driver; report a fixed driver name. */
2495 netdev_internal_get_status(const struct netdev *netdev OVS_UNUSED,
2498 smap_add(smap, "driver_name", "openvswitch");
2502 /* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
2503 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
2504 * returns 0. Otherwise, it returns a positive errno value; in particular,
2505 * ENXIO indicates that there is no ARP table entry for 'ip' on 'netdev'. */
2507 netdev_linux_arp_lookup(const struct netdev *netdev,
2508 ovs_be32 ip, uint8_t mac[ETH_ADDR_LEN])
2511 struct sockaddr_in sin;
2514 memset(&r, 0, sizeof r);
2515 memset(&sin, 0, sizeof sin);
2516 sin.sin_family = AF_INET;
2517 sin.sin_addr.s_addr = ip;
2519 memcpy(&r.arp_pa, &sin, sizeof sin);
2520 r.arp_ha.sa_family = ARPHRD_ETHER;
2522 ovs_strzcpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
2523 COVERAGE_INC(netdev_arp_lookup);
2524 retval = af_inet_ioctl(SIOCGARP, &r);
2526 memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
/* ENXIO just means "no entry"; anything else is worth a warning. */
2527 } else if (retval != ENXIO) {
2528 VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
2529 netdev_get_name(netdev), IP_ARGS(ip),
2530 ovs_strerror(retval));
2536 nd_to_iff_flags(enum netdev_flags nd)
2539 if (nd & NETDEV_UP) {
2542 if (nd & NETDEV_PROMISC) {
2545 if (nd & NETDEV_LOOPBACK) {
2546 iff |= IFF_LOOPBACK;
2552 iff_to_nd_flags(int iff)
2554 enum netdev_flags nd = 0;
2558 if (iff & IFF_PROMISC) {
2559 nd |= NETDEV_PROMISC;
2561 if (iff & IFF_LOOPBACK) {
2562 nd |= NETDEV_LOOPBACK;
/* Core flag-update helper: computes the device's new IFF_* flag word by
 * clearing the bits for 'off' and setting the bits for 'on', stores the
 * previous flags (translated to netdev flags) in '*old_flagsp', and pushes the
 * change to the kernel only when the flag word actually changed.  Requires
 * netdev->mutex (see OVS_REQUIRES annotation). */
2568 update_flags(struct netdev_linux *netdev, enum netdev_flags off,
2569 enum netdev_flags on, enum netdev_flags *old_flagsp)
2570 OVS_REQUIRES(netdev->mutex)
2572 int old_flags, new_flags;
2575 old_flags = netdev->ifi_flags;
2576 *old_flagsp = iff_to_nd_flags(old_flags);
2577 new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
2578 if (new_flags != old_flags) {
2579 error = set_flags(netdev_get_name(&netdev->up), new_flags);
/* Re-read the kernel's view of the flags so the cache reflects what the
 * kernel actually accepted, regardless of whether set_flags() succeeded. */
2580 get_flags(&netdev->up, &netdev->ifi_flags);
/* Public netdev "update_flags" entry point: just takes the device mutex and
 * delegates to update_flags() above. */
2587 netdev_linux_update_flags(struct netdev *netdev_, enum netdev_flags off,
2588 enum netdev_flags on, enum netdev_flags *old_flagsp)
2590 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2593 ovs_mutex_lock(&netdev->mutex);
2594 error = update_flags(netdev, off, on, old_flagsp);
2595 ovs_mutex_unlock(&netdev->mutex);
/* Returns the device's change sequence number (read under the mutex), used by
 * callers to detect configuration changes. */
2601 netdev_linux_change_seq(const struct netdev *netdev_)
2603 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2604 unsigned int change_seq;
2606 ovs_mutex_lock(&netdev->mutex);
2607 change_seq = netdev->change_seq;
2608 ovs_mutex_unlock(&netdev->mutex);
/* Template for a struct netdev_class initializer.  The Linux, TAP, and
 * internal device classes share almost every method; the parameters supply
 * the per-class name, constructor, stats, features, and status callbacks.
 * NULL slots are operations this implementation does not support. */
2613 #define NETDEV_LINUX_CLASS(NAME, CONSTRUCT, GET_STATS, SET_STATS, \
2614 GET_FEATURES, GET_STATUS) \
2620 netdev_linux_wait, \
2622 netdev_linux_alloc, \
2624 netdev_linux_destruct, \
2625 netdev_linux_dealloc, \
2626 NULL, /* get_config */ \
2627 NULL, /* set_config */ \
2628 NULL, /* get_tunnel_config */ \
2630 netdev_linux_send, \
2631 netdev_linux_send_wait, \
2633 netdev_linux_set_etheraddr, \
2634 netdev_linux_get_etheraddr, \
2635 netdev_linux_get_mtu, \
2636 netdev_linux_set_mtu, \
2637 netdev_linux_get_ifindex, \
2638 netdev_linux_get_carrier, \
2639 netdev_linux_get_carrier_resets, \
2640 netdev_linux_set_miimon_interval, \
2645 netdev_linux_set_advertisements, \
2647 netdev_linux_set_policing, \
2648 netdev_linux_get_qos_types, \
2649 netdev_linux_get_qos_capabilities, \
2650 netdev_linux_get_qos, \
2651 netdev_linux_set_qos, \
2652 netdev_linux_get_queue, \
2653 netdev_linux_set_queue, \
2654 netdev_linux_delete_queue, \
2655 netdev_linux_get_queue_stats, \
2656 netdev_linux_queue_dump_start, \
2657 netdev_linux_queue_dump_next, \
2658 netdev_linux_queue_dump_done, \
2659 netdev_linux_dump_queue_stats, \
2661 netdev_linux_get_in4, \
2662 netdev_linux_set_in4, \
2663 netdev_linux_get_in6, \
2664 netdev_linux_add_router, \
2665 netdev_linux_get_next_hop, \
2667 netdev_linux_arp_lookup, \
2669 netdev_linux_update_flags, \
2671 netdev_linux_change_seq, \
2673 netdev_linux_rx_alloc, \
2674 netdev_linux_rx_construct, \
2675 netdev_linux_rx_destruct, \
2676 netdev_linux_rx_dealloc, \
2677 netdev_linux_rx_recv, \
2678 netdev_linux_rx_wait, \
2679 netdev_linux_rx_drain, \
/* The three concrete Linux device classes, instantiated from the
 * NETDEV_LINUX_CLASS template above:
 *   - "system" devices (real kernel interfaces),
 *   - "tap" devices (TUN/TAP, with their own stats reader), and
 *   - "internal" devices (datapath-internal ports, with datapath-provided
 *     stats and no features callback). */
2682 const struct netdev_class netdev_linux_class =
2685 netdev_linux_construct,
2686 netdev_linux_get_stats,
2687 NULL, /* set_stats */
2688 netdev_linux_get_features,
2689 netdev_linux_get_status);
2691 const struct netdev_class netdev_tap_class =
2694 netdev_linux_construct_tap,
2695 netdev_tap_get_stats,
2696 NULL, /* set_stats */
2697 netdev_linux_get_features,
2698 netdev_linux_get_status);
2700 const struct netdev_class netdev_internal_class =
2703 netdev_linux_construct,
2704 netdev_internal_get_stats,
2705 netdev_internal_set_stats,
2706 NULL, /* get_features */
2707 netdev_internal_get_status);
2709 /* HTB traffic control class. */
2711 #define HTB_N_QUEUES 0xf000
/* Qdisc-level state: the root rate cap. */
2715 unsigned int max_rate; /* In bytes/s. */
/* Per-queue (tc class) state; 'tc_queue' embeds this in the generic queue
 * map. */
2719 struct tc_queue tc_queue;
2720 unsigned int min_rate; /* In bytes/s. */
2721 unsigned int max_rate; /* In bytes/s. */
2722 unsigned int burst; /* In bytes. */
2723 unsigned int priority; /* Lower values are higher priorities. */
/* Returns the struct htb embedded in 'netdev_'s generic tc state.  Only valid
 * when the device's qdisc is HTB. */
2727 htb_get__(const struct netdev *netdev_)
2729 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2730 return CONTAINER_OF(netdev->tc, struct htb, tc);
/* Allocates fresh HTB state with root cap 'max_rate' and installs it as
 * 'netdev_'s current tc implementation. */
2734 htb_install__(struct netdev *netdev_, uint64_t max_rate)
2736 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2739 htb = xmalloc(sizeof *htb);
2740 tc_init(&htb->tc, &tc_ops_htb);
2741 htb->max_rate = max_rate;
2743 netdev->tc = &htb->tc;
2746 /* Create an HTB qdisc.
2748 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
2750 htb_setup_qdisc__(struct netdev *netdev)
2753 struct tc_htb_glob opt;
2754 struct ofpbuf request;
2755 struct tcmsg *tcmsg;
/* Remove any existing root qdisc first so the NLM_F_EXCL add cannot fail with
 * EEXIST. */
2757 tc_del_qdisc(netdev);
2759 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2760 NLM_F_EXCL | NLM_F_CREATE, &request);
2764 tcmsg->tcm_handle = tc_make_handle(1, 0);
2765 tcmsg->tcm_parent = TC_H_ROOT;
2767 nl_msg_put_string(&request, TCA_KIND, "htb");
2769 memset(&opt, 0, sizeof opt);
/* rate2quantum = 10 mirrors the tc utility's default DRR quantum divisor. */
2770 opt.rate2quantum = 10;
2774 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2775 nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
2776 nl_msg_end_nested(&request, opt_offset);
2778 return tc_transact(&request, NULL);
2781 /* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
2782 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
2784 htb_setup_class__(struct netdev *netdev, unsigned int handle,
2785 unsigned int parent, struct htb_class *class)
2788 struct tc_htb_opt opt;
2789 struct ofpbuf request;
2790 struct tcmsg *tcmsg;
/* The MTU is needed to compute token bucket parameters; bail out if the
 * device cannot report one. */
2794 error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
2796 VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU",
2797 netdev_get_name(netdev));
/* Translate the class's rates and burst into kernel tc_htb_opt form:
 * 'rate'/'buffer' implement min_rate, 'ceil'/'cbuffer' implement max_rate. */
2801 memset(&opt, 0, sizeof opt);
2802 tc_fill_rate(&opt.rate, class->min_rate, mtu);
2803 tc_fill_rate(&opt.ceil, class->max_rate, mtu);
2804 opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
2805 opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
2806 opt.prio = class->priority;
2808 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
2812 tcmsg->tcm_handle = handle;
2813 tcmsg->tcm_parent = parent;
2815 nl_msg_put_string(&request, TCA_KIND, "htb");
2816 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2817 nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
/* The kernel also needs precomputed rate tables for both curves. */
2818 tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
2819 tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
2820 nl_msg_end_nested(&request, opt_offset);
2822 error = tc_transact(&request, NULL);
2824 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
2825 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
2826 netdev_get_name(netdev),
2827 tc_get_major(handle), tc_get_minor(handle),
2828 tc_get_major(parent), tc_get_minor(parent),
2829 class->min_rate, class->max_rate,
2830 class->burst, class->priority, ovs_strerror(error));
2835 /* Parses Netlink attributes in 'options' for HTB parameters and stores a
2836 * description of them into 'details'. The description complies with the
2837 * specification given in the vswitch database documentation for linux-htb
2840 htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
2842 static const struct nl_policy tca_htb_policy[] = {
2843 [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
2844 .min_len = sizeof(struct tc_htb_opt) },
2847 struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
2848 const struct tc_htb_opt *htb;
2850 if (!nl_parse_nested(nl_options, tca_htb_policy,
2851 attrs, ARRAY_SIZE(tca_htb_policy))) {
2852 VLOG_WARN_RL(&rl, "failed to parse HTB class options");
/* Map the kernel's tc_htb_opt back into our htb_class representation;
 * 'buffer' is in tc ticks and must be converted back to bytes. */
2856 htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
2857 class->min_rate = htb->rate.rate;
2858 class->max_rate = htb->ceil.rate;
2859 class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
2860 class->priority = htb->prio;
/* Parses a netlink tc class message for an HTB class.  On success, stores the
 * OVS queue id (derived from the tc minor number) in '*queue_id' if nonnull,
 * HTB parameters in '*options' if nonnull, and statistics in '*stats' if
 * nonnull.  Returns 0 on success, a positive errno otherwise. */
2865 htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2866 struct htb_class *options,
2867 struct netdev_queue_stats *stats)
2869 struct nlattr *nl_options;
2870 unsigned int handle;
2873 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2874 if (!error && queue_id) {
2875 unsigned int major = tc_get_major(handle);
2876 unsigned int minor = tc_get_minor(handle);
/* OVS queue N is tc class 1:(N+1); reject handles outside that space. */
2877 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2878 *queue_id = minor - 1;
2883 if (!error && options) {
2884 error = htb_parse_tca_options__(nl_options, options);
/* Extracts qdisc-level HTB configuration from 'details' (the OVSDB QoS
 * settings).  "max-rate" is given in bits/s and converted to bytes/s; if
 * absent or zero, it defaults to the link speed (or 100 Mbps if the link
 * speed is unknown). */
2890 htb_parse_qdisc_details__(struct netdev *netdev_,
2891 const struct smap *details, struct htb_class *hc)
2893 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2894 const char *max_rate_s;
2896 max_rate_s = smap_get(details, "max-rate");
2897 hc->max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
2898 if (!hc->max_rate) {
2899 enum netdev_features current;
2901 netdev_linux_read_features(netdev);
2902 current = !netdev->get_features_error ? netdev->current : 0;
2903 hc->max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
/* The root class is not rate-limited below the cap. */
2905 hc->min_rate = hc->max_rate;
/* Extracts per-queue HTB configuration ("min-rate", "max-rate", "burst",
 * "priority") from 'details' into '*hc', clamping each value to a sane range.
 * Rates and burst are given in bits and converted to bytes.  Returns 0 on
 * success, a positive errno if the device's MTU cannot be determined. */
2911 htb_parse_class_details__(struct netdev *netdev,
2912 const struct smap *details, struct htb_class *hc)
2914 const struct htb *htb = htb_get__(netdev);
2915 const char *min_rate_s = smap_get(details, "min-rate");
2916 const char *max_rate_s = smap_get(details, "max-rate");
2917 const char *burst_s = smap_get(details, "burst");
2918 const char *priority_s = smap_get(details, "priority");
2921 error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
2923 VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU",
2924 netdev_get_name(netdev));
2928 /* HTB requires at least an mtu sized min-rate to send any traffic even
2929 * on uncongested links. */
2930 hc->min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
2931 hc->min_rate = MAX(hc->min_rate, mtu);
2932 hc->min_rate = MIN(hc->min_rate, htb->max_rate);
/* max-rate defaults to the qdisc cap and is clamped to
 * [min_rate, qdisc max_rate]. */
2935 hc->max_rate = (max_rate_s
2936 ? strtoull(max_rate_s, NULL, 10) / 8
2938 hc->max_rate = MAX(hc->max_rate, hc->min_rate);
2939 hc->max_rate = MIN(hc->max_rate, htb->max_rate);
2943 * According to hints in the documentation that I've read, it is important
2944 * that 'burst' be at least as big as the largest frame that might be
2945 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
2946 * but having it a bit too small is a problem. Since netdev_get_mtu()
2947 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
2948 * the MTU. We actually add 64, instead of 14, as a guard against
2949 * additional headers get tacked on somewhere that we're not aware of. */
2950 hc->burst = burst_s ? strtoull(burst_s, NULL, 10) / 8 : 0;
2951 hc->burst = MAX(hc->burst, mtu + 64);
2954 hc->priority = priority_s ? strtoul(priority_s, NULL, 10) : 0;
/* Queries the kernel for the HTB class 'handle' under 'parent' on 'netdev'
 * and parses the reply into '*options'/'*stats' (either may be null). */
2960 htb_query_class__(const struct netdev *netdev, unsigned int handle,
2961 unsigned int parent, struct htb_class *options,
2962 struct netdev_queue_stats *stats)
2964 struct ofpbuf *reply;
2967 error = tc_query_class(netdev, handle, parent, &reply);
2969 error = htb_parse_tcmsg__(reply, NULL, options, stats);
2970 ofpbuf_delete(reply);
/* tc "install" hook for linux-htb: creates the root HTB qdisc, a default
 * class 1:fffe sized from 'details', and records the new state via
 * htb_install__(). */
2976 htb_tc_install(struct netdev *netdev, const struct smap *details)
2980 error = htb_setup_qdisc__(netdev);
2982 struct htb_class hc;
2984 htb_parse_qdisc_details__(netdev, details, &hc);
2985 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2986 tc_make_handle(1, 0), &hc);
2988 htb_install__(netdev, hc.max_rate);
/* Downcasts a generic tc_queue to its containing htb_class. */
2994 static struct htb_class *
2995 htb_class_cast__(const struct tc_queue *queue)
2997 return CONTAINER_OF(queue, struct htb_class, tc_queue);
/* Creates or refreshes the in-memory record for queue 'queue_id', copying the
 * parameters from '*hc'.  New queues are inserted into the qdisc's queue
 * hmap, keyed by hash of the queue id. */
3001 htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
3002 const struct htb_class *hc)
3004 struct htb *htb = htb_get__(netdev);
3005 size_t hash = hash_int(queue_id, 0);
3006 struct tc_queue *queue;
3007 struct htb_class *hcp;
3009 queue = tc_find_queue__(netdev, queue_id, hash);
3011 hcp = htb_class_cast__(queue);
3013 hcp = xmalloc(sizeof *hcp);
3014 queue = &hcp->tc_queue;
3015 queue->queue_id = queue_id;
3016 queue->created = time_msec();
3017 hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
3020 hcp->min_rate = hc->min_rate;
3021 hcp->max_rate = hc->max_rate;
3022 hcp->burst = hc->burst;
3023 hcp->priority = hc->priority;
/* tc "load" hook for linux-htb: reconstructs in-memory HTB state from the
 * kernel by querying the default class (1:fffe) for the qdisc max-rate and
 * then dumping all classes to populate the queue map. */
3027 htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3030 struct nl_dump dump;
3031 struct htb_class hc;
3033 /* Get qdisc options. */
3035 htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
3036 htb_install__(netdev, hc.max_rate);
3039 if (!start_queue_dump(netdev, &dump)) {
3042 while (nl_dump_next(&dump, &msg)) {
3043 unsigned int queue_id;
/* Unparseable classes are silently skipped: they may belong to another
 * qdisc generation or not map to an OVS queue. */
3045 if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3046 htb_update_queue__(netdev, queue_id, &hc);
3049 nl_dump_done(&dump);
/* tc "destroy" hook for linux-htb: frees every queued htb_class and the htb
 * itself. */
3055 htb_tc_destroy(struct tc *tc)
3057 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
3058 struct htb_class *hc, *next;
3060 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &htb->tc.queues) {
3061 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
/* tc "qdisc_get" hook: reports the qdisc max-rate in bits/s (stored
 * internally in bytes/s, hence the * 8). */
3069 htb_qdisc_get(const struct netdev *netdev, struct smap *details)
3071 const struct htb *htb = htb_get__(netdev);
3072 smap_add_format(details, "max-rate", "%llu", 8ULL * htb->max_rate);
/* tc "qdisc_set" hook: re-sizes the default class 1:fffe from 'details' and,
 * on success, records the new cap. */
3077 htb_qdisc_set(struct netdev *netdev, const struct smap *details)
3079 struct htb_class hc;
3082 htb_parse_qdisc_details__(netdev, details, &hc);
3083 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3084 tc_make_handle(1, 0), &hc);
3086 htb_get__(netdev)->max_rate = hc.max_rate;
/* tc "class_get" hook: reports one queue's parameters in bits (rates, burst)
 * and its priority.  max-rate is omitted when it merely equals min-rate. */
3092 htb_class_get(const struct netdev *netdev OVS_UNUSED,
3093 const struct tc_queue *queue, struct smap *details)
3095 const struct htb_class *hc = htb_class_cast__(queue);
3097 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
3098 if (hc->min_rate != hc->max_rate) {
3099 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
3101 smap_add_format(details, "burst", "%llu", 8ULL * hc->burst);
3103 smap_add_format(details, "priority", "%u", hc->priority);
/* tc "class_set" hook: validates 'details', pushes tc class 1:(queue_id+1)
 * under the default class, and mirrors the change into the queue map. */
3109 htb_class_set(struct netdev *netdev, unsigned int queue_id,
3110 const struct smap *details)
3112 struct htb_class hc;
3115 error = htb_parse_class_details__(netdev, details, &hc);
3120 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3121 tc_make_handle(1, 0xfffe), &hc);
3126 htb_update_queue__(netdev, queue_id, &hc);
/* tc "class_delete" hook: removes the kernel class and, on success, the
 * corresponding in-memory record. */
3131 htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
3133 struct htb_class *hc = htb_class_cast__(queue);
3134 struct htb *htb = htb_get__(netdev);
3137 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3139 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
/* tc "class_get_stats" hook: queries the kernel for one queue's stats. */
3146 htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3147 struct netdev_queue_stats *stats)
3149 return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3150 tc_make_handle(1, 0xfffe), NULL, stats);
/* tc "class_dump_stats" hook: parses one dumped class message and invokes
 * 'cb' for it, provided the handle maps to a valid OVS queue id. */
3154 htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3155 const struct ofpbuf *nlmsg,
3156 netdev_dump_queue_stats_cb *cb, void *aux)
3158 struct netdev_queue_stats stats;
3159 unsigned int handle, major, minor;
3162 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3167 major = tc_get_major(handle);
3168 minor = tc_get_minor(handle);
3169 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
3170 (*cb)(minor - 1, &stats, aux);
/* vtable binding the HTB implementation above to the generic tc framework. */
3175 static const struct tc_ops tc_ops_htb = {
3176 "htb", /* linux_name */
3177 "linux-htb", /* ovs_name */
3178 HTB_N_QUEUES, /* n_queues */
3187 htb_class_get_stats,
3188 htb_class_dump_stats
3191 /* "linux-hfsc" traffic control class. */
3193 #define HFSC_N_QUEUES 0xf000
/* Per-queue HFSC state; 'tc_queue' embeds it in the generic queue map. */
3201 struct tc_queue tc_queue;
/* Returns the struct hfsc embedded in 'netdev_'s generic tc state.  Only
 * valid while the device's qdisc is HFSC. */
3206 static struct hfsc *
3207 hfsc_get__(const struct netdev *netdev_)
3209 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3210 return CONTAINER_OF(netdev->tc, struct hfsc, tc);
/* Downcasts a generic tc_queue to its containing hfsc_class. */
3213 static struct hfsc_class *
3214 hfsc_class_cast__(const struct tc_queue *queue)
3216 return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
/* Allocates fresh HFSC state with cap 'max_rate' and installs it as
 * 'netdev_'s current tc implementation. */
3220 hfsc_install__(struct netdev *netdev_, uint32_t max_rate)
3222 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3225 hfsc = xmalloc(sizeof *hfsc);
3226 tc_init(&hfsc->tc, &tc_ops_hfsc);
3227 hfsc->max_rate = max_rate;
3228 netdev->tc = &hfsc->tc;
/* Creates or refreshes the in-memory record for queue 'queue_id', copying
 * rates from '*hc'.  New queues join the qdisc's queue hmap, keyed by hash of
 * the queue id.  (HFSC analogue of htb_update_queue__().) */
3232 hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
3233 const struct hfsc_class *hc)
3237 struct hfsc_class *hcp;
3238 struct tc_queue *queue;
3240 hfsc = hfsc_get__(netdev);
3241 hash = hash_int(queue_id, 0);
3243 queue = tc_find_queue__(netdev, queue_id, hash);
3245 hcp = hfsc_class_cast__(queue);
3247 hcp = xmalloc(sizeof *hcp);
3248 queue = &hcp->tc_queue;
3249 queue->queue_id = queue_id;
3250 queue->created = time_msec();
3251 hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
3254 hcp->min_rate = hc->min_rate;
3255 hcp->max_rate = hc->max_rate;
/* Parses the netlink TCA_OPTIONS of an HFSC class into '*class'.  Only
 * linear service curves (m1 == 0, d == 0) produced by this module are
 * accepted; anything else was configured externally and is rejected. */
3259 hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
3261 const struct tc_service_curve *rsc, *fsc, *usc;
3262 static const struct nl_policy tca_hfsc_policy[] = {
3264 .type = NL_A_UNSPEC,
3266 .min_len = sizeof(struct tc_service_curve),
3269 .type = NL_A_UNSPEC,
3271 .min_len = sizeof(struct tc_service_curve),
3274 .type = NL_A_UNSPEC,
3276 .min_len = sizeof(struct tc_service_curve),
3279 struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];
3281 if (!nl_parse_nested(nl_options, tca_hfsc_policy,
3282 attrs, ARRAY_SIZE(tca_hfsc_policy))) {
3283 VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
3287 rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
3288 fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
3289 usc = nl_attr_get(attrs[TCA_HFSC_USC]);
3291 if (rsc->m1 != 0 || rsc->d != 0 ||
3292 fsc->m1 != 0 || fsc->d != 0 ||
3293 usc->m1 != 0 || usc->d != 0) {
3294 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3295 "Non-linear service curves are not supported.");
/* This module always writes identical real-time and fair-service curves
 * (see hfsc_setup_class__); a mismatch means external configuration. */
3299 if (rsc->m2 != fsc->m2) {
3300 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3301 "Real-time service curves are not supported ");
3305 if (rsc->m2 > usc->m2) {
3306 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3307 "Min-rate service curve is greater than "
3308 "the max-rate service curve.");
/* fair-service m2 == min-rate, upper-limit m2 == max-rate. */
3312 class->min_rate = fsc->m2;
3313 class->max_rate = usc->m2;
/* Parses a netlink tc class message for an HFSC class; stores the OVS queue
 * id, parameters, and stats into the nonnull out-arguments.  HFSC analogue
 * of htb_parse_tcmsg__(). */
3318 hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
3319 struct hfsc_class *options,
3320 struct netdev_queue_stats *stats)
3323 unsigned int handle;
3324 struct nlattr *nl_options;
3326 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
3332 unsigned int major, minor;
3334 major = tc_get_major(handle);
3335 minor = tc_get_minor(handle);
/* OVS queue N is tc class 1:(N+1). */
3336 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3337 *queue_id = minor - 1;
3344 error = hfsc_parse_tca_options__(nl_options, options);
/* Queries the kernel for HFSC class 'handle' under 'parent' and parses the
 * reply into '*options'/'*stats' (either may be null). */
3351 hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
3352 unsigned int parent, struct hfsc_class *options,
3353 struct netdev_queue_stats *stats)
3356 struct ofpbuf *reply;
3358 error = tc_query_class(netdev, handle, parent, &reply);
3363 error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
3364 ofpbuf_delete(reply);
/* Extracts qdisc-level HFSC configuration from 'details'.  "max-rate" is in
 * bits/s and converted to bytes/s; if absent or zero it defaults to the link
 * speed (or 100 Mbps when unknown).  min_rate is pinned to max_rate for the
 * root class. */
3369 hfsc_parse_qdisc_details__(struct netdev *netdev_, const struct smap *details,
3370 struct hfsc_class *class)
3372 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3374 const char *max_rate_s;
3376 max_rate_s = smap_get(details, "max-rate");
3377 max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
3380 enum netdev_features current;
3382 netdev_linux_read_features(netdev);
3383 current = !netdev->get_features_error ? netdev->current : 0;
3384 max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
3387 class->min_rate = max_rate;
3388 class->max_rate = max_rate;
/* Extracts per-queue HFSC configuration ("min-rate", "max-rate") from
 * 'details', clamping min-rate to [1, qdisc cap] and max-rate to
 * [min-rate, qdisc cap]. */
3392 hfsc_parse_class_details__(struct netdev *netdev,
3393 const struct smap *details,
3394 struct hfsc_class * class)
3396 const struct hfsc *hfsc;
3397 uint32_t min_rate, max_rate;
3398 const char *min_rate_s, *max_rate_s;
3400 hfsc = hfsc_get__(netdev);
3401 min_rate_s = smap_get(details, "min-rate");
3402 max_rate_s = smap_get(details, "max-rate");
3404 min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
3405 min_rate = MAX(min_rate, 1);
3406 min_rate = MIN(min_rate, hfsc->max_rate);
3408 max_rate = (max_rate_s
3409 ? strtoull(max_rate_s, NULL, 10) / 8
3411 max_rate = MAX(max_rate, min_rate);
3412 max_rate = MIN(max_rate, hfsc->max_rate);
3414 class->min_rate = min_rate;
3415 class->max_rate = max_rate;
3420 /* Create an HFSC qdisc.
3422 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
3424 hfsc_setup_qdisc__(struct netdev * netdev)
3426 struct tcmsg *tcmsg;
3427 struct ofpbuf request;
3428 struct tc_hfsc_qopt opt;
/* Remove any existing root qdisc so the NLM_F_EXCL add cannot hit EEXIST. */
3430 tc_del_qdisc(netdev);
3432 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3433 NLM_F_EXCL | NLM_F_CREATE, &request);
3439 tcmsg->tcm_handle = tc_make_handle(1, 0);
3440 tcmsg->tcm_parent = TC_H_ROOT;
3442 memset(&opt, 0, sizeof opt);
3445 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3446 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
3448 return tc_transact(&request, NULL);
3451 /* Create an HFSC class.
3453 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
3454 * sc rate <min_rate> ul rate <max_rate>" */
3456 hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
3457 unsigned int parent, struct hfsc_class *class)
3461 struct tcmsg *tcmsg;
3462 struct ofpbuf request;
3463 struct tc_service_curve min, max;
3465 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
3471 tcmsg->tcm_handle = handle;
3472 tcmsg->tcm_parent = parent;
/* Linear service curves: slope m2 carries the rate (m1/d left zero). */
3476 min.m2 = class->min_rate;
3480 max.m2 = class->max_rate;
3482 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3483 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
/* 'min' is deliberately used for both the real-time (RSC) and fair-service
 * (FSC) curves -- this is what "sc rate <min_rate>" expands to -- while
 * 'max' supplies the upper limit (USC). */
3484 nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
3485 nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
3486 nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
3487 nl_msg_end_nested(&request, opt_offset);
3489 error = tc_transact(&request, NULL);
3491 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
3492 "min-rate %ubps, max-rate %ubps (%s)",
3493 netdev_get_name(netdev),
3494 tc_get_major(handle), tc_get_minor(handle),
3495 tc_get_major(parent), tc_get_minor(parent),
3496 class->min_rate, class->max_rate, ovs_strerror(error));
/* tc "install" hook for linux-hfsc: creates the root HFSC qdisc, the default
 * class 1:fffe sized from 'details', then records the state with
 * hfsc_install__(). */
3503 hfsc_tc_install(struct netdev *netdev, const struct smap *details)
3506 struct hfsc_class class;
3508 error = hfsc_setup_qdisc__(netdev);
3514 hfsc_parse_qdisc_details__(netdev, details, &class);
3515 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3516 tc_make_handle(1, 0), &class);
3522 hfsc_install__(netdev, class.max_rate);
/* tc "load" hook for linux-hfsc: rebuilds in-memory state from the kernel by
 * querying the default class for the cap and dumping all classes into the
 * queue map. */
3527 hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3530 struct nl_dump dump;
3531 struct hfsc_class hc;
3534 hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
3535 hfsc_install__(netdev, hc.max_rate);
3537 if (!start_queue_dump(netdev, &dump)) {
3541 while (nl_dump_next(&dump, &msg)) {
3542 unsigned int queue_id;
3544 if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3545 hfsc_update_queue__(netdev, queue_id, &hc);
3549 nl_dump_done(&dump);
/* tc "destroy" hook for linux-hfsc: frees every queue record and the hfsc
 * object itself. */
3554 hfsc_tc_destroy(struct tc *tc)
3557 struct hfsc_class *hc, *next;
3559 hfsc = CONTAINER_OF(tc, struct hfsc, tc);
3561 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
3562 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
/* tc "qdisc_get" hook: reports the cap in bits/s (stored as bytes/s). */
3571 hfsc_qdisc_get(const struct netdev *netdev, struct smap *details)
3573 const struct hfsc *hfsc;
3574 hfsc = hfsc_get__(netdev);
3575 smap_add_format(details, "max-rate", "%llu", 8ULL * hfsc->max_rate);
/* tc "qdisc_set" hook: re-sizes the default class 1:fffe and records the new
 * cap on success. */
3580 hfsc_qdisc_set(struct netdev *netdev, const struct smap *details)
3583 struct hfsc_class class;
3585 hfsc_parse_qdisc_details__(netdev, details, &class);
3586 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3587 tc_make_handle(1, 0), &class);
3590 hfsc_get__(netdev)->max_rate = class.max_rate;
/* tc "class_get" hook: reports a queue's rates in bits/s; max-rate is omitted
 * when equal to min-rate. */
3597 hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
3598 const struct tc_queue *queue, struct smap *details)
3600 const struct hfsc_class *hc;
3602 hc = hfsc_class_cast__(queue);
3603 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
3604 if (hc->min_rate != hc->max_rate) {
3605 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
/* tc "class_set" hook: validates 'details', pushes class 1:(queue_id+1)
 * under the default class, and mirrors the change in memory. */
3611 hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
3612 const struct smap *details)
3615 struct hfsc_class class;
3617 error = hfsc_parse_class_details__(netdev, details, &class);
3622 error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3623 tc_make_handle(1, 0xfffe), &class);
3628 hfsc_update_queue__(netdev, queue_id, &class);
/* tc "class_delete" hook: deletes the kernel class and, on success, the
 * in-memory record. */
3633 hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
3637 struct hfsc_class *hc;
3639 hc = hfsc_class_cast__(queue);
3640 hfsc = hfsc_get__(netdev);
3642 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3644 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
/* tc "class_get_stats" hook: queries the kernel for one queue's stats. */
3651 hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3652 struct netdev_queue_stats *stats)
3654 return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3655 tc_make_handle(1, 0xfffe), NULL, stats);
/* tc "class_dump_stats" hook: parses one dumped class message and invokes
 * 'cb' when the handle maps to a valid OVS queue id. */
3659 hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3660 const struct ofpbuf *nlmsg,
3661 netdev_dump_queue_stats_cb *cb, void *aux)
3663 struct netdev_queue_stats stats;
3664 unsigned int handle, major, minor;
3667 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3672 major = tc_get_major(handle);
3673 minor = tc_get_minor(handle);
3674 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3675 (*cb)(minor - 1, &stats, aux);
/* vtable binding the HFSC implementation above to the generic tc framework. */
3680 static const struct tc_ops tc_ops_hfsc = {
3681 "hfsc", /* linux_name */
3682 "linux-hfsc", /* ovs_name */
3683 HFSC_N_QUEUES, /* n_queues */
3684 hfsc_tc_install, /* tc_install */
3685 hfsc_tc_load, /* tc_load */
3686 hfsc_tc_destroy, /* tc_destroy */
3687 hfsc_qdisc_get, /* qdisc_get */
3688 hfsc_qdisc_set, /* qdisc_set */
3689 hfsc_class_get, /* class_get */
3690 hfsc_class_set, /* class_set */
3691 hfsc_class_delete, /* class_delete */
3692 hfsc_class_get_stats, /* class_get_stats */
3693 hfsc_class_dump_stats /* class_dump_stats */
3696 /* "linux-default" traffic control class.
3698 * This class represents the default, unnamed Linux qdisc. It corresponds to
3699 * the "" (empty string) QoS type in the OVS database. */
/* Points 'netdev_' at a shared, immutable tc object for the default qdisc;
 * no per-device state is needed. */
3702 default_install__(struct netdev *netdev_)
3704 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3705 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_default);
3707 /* Nothing but a tc class implementation is allowed to write to a tc. This
3708 * class never does that, so we can legitimately use a const tc object. */
3709 netdev->tc = CONST_CAST(struct tc *, &tc);
/* tc "install" hook: installing the default qdisc requires no kernel work. */
3713 default_tc_install(struct netdev *netdev,
3714 const struct smap *details OVS_UNUSED)
3716 default_install__(netdev);
/* tc "load" hook: likewise just records the shared default state. */
3721 default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3723 default_install__(netdev);
/* vtable for the default qdisc: no Linux name (it matches when no other class
 * does) and no queue operations, since the default qdisc is not managed. */
3727 static const struct tc_ops tc_ops_default = {
3728 NULL, /* linux_name */
3733 NULL, /* tc_destroy */
3734 NULL, /* qdisc_get */
3735 NULL, /* qdisc_set */
3736 NULL, /* class_get */
3737 NULL, /* class_set */
3738 NULL, /* class_delete */
3739 NULL, /* class_get_stats */
3740 NULL /* class_dump_stats */
3743 /* "linux-other" traffic control class.
/* tc "load" hook for a qdisc OVS does not recognize: like the default class,
 * it just points the device at a shared read-only tc object. */
3748 other_tc_load(struct netdev *netdev_, struct ofpbuf *nlmsg OVS_UNUSED)
3750 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3751 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_other);
3753 /* Nothing but a tc class implementation is allowed to write to a tc. This
3754 * class never does that, so we can legitimately use a const tc object. */
3755 netdev->tc = CONST_CAST(struct tc *, &tc);
/* vtable for unrecognized qdiscs: read-only placeholder, no operations. */
3759 static const struct tc_ops tc_ops_other = {
3760 NULL, /* linux_name */
3761 "linux-other", /* ovs_name */
3763 NULL, /* tc_install */
3765 NULL, /* tc_destroy */
3766 NULL, /* qdisc_get */
3767 NULL, /* qdisc_set */
3768 NULL, /* class_get */
3769 NULL, /* class_set */
3770 NULL, /* class_delete */
3771 NULL, /* class_get_stats */
3772 NULL /* class_dump_stats */
3775 /* Traffic control. */
3777 /* Number of kernel "tc" ticks per second. */
/* Both globals below are populated from /proc/net/psched (see the psched
 * discussion further down in this file). */
3778 static double ticks_per_s;
3780 /* Number of kernel "jiffies" per second. This is used for the purpose of
3781 * computing buffer sizes. Generally kernel qdiscs need to be able to buffer
3782 * one jiffy's worth of data.
3784 * There are two possibilities here:
3786 * - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
3787 * approximate range of 100 to 1024. That means that we really need to
3788 * make sure that the qdisc can buffer that much data.
3790 * - 'buffer_hz' is an absurdly large number. That means that the kernel
3791 * has finely granular timers and there's no need to fudge additional room
3792 * for buffers. (There's no extra effort needed to implement that: the
3793 * large 'buffer_hz' is used as a divisor, so practically any number will
3794 * come out as 0 in the division. Small integer results in the case of
3795 * really high dividends won't have any real effect anyhow.)
3797 static unsigned int buffer_hz;
/* Composes a tc handle from its 'major' and 'minor' halves ('major':'minor'
 * in tc notation). */
static unsigned int
tc_make_handle(unsigned int major, unsigned int minor)
{
    unsigned int shifted_major = major << 16;

    return TC_H_MAKE(shifted_major, minor);
}
/* Extracts the major number from tc handle 'handle'. */
static unsigned int
tc_get_major(unsigned int handle)
{
    unsigned int major_bits = TC_H_MAJ(handle);

    return major_bits >> 16;
}
/* Extracts the minor number from tc handle 'handle'. */
static unsigned int
tc_get_minor(unsigned int handle)
{
    unsigned int minor_bits = TC_H_MIN(handle);

    return minor_bits;
}
/* Initializes 'request' as an rtnetlink tc request of the given 'type'
 * (RTM_NEW*/RTM_DEL* etc.) and 'flags' for 'netdev', and returns a pointer to
 * the embedded tcmsg for the caller to complete (tcm_handle/tcm_parent).
 * NOTE(review): the failure path when get_ifindex() errors is not visible in
 * this view; presumably it returns NULL -- callers should check. */
3820 static struct tcmsg *
3821 tc_make_request(const struct netdev *netdev, int type, unsigned int flags,
3822 struct ofpbuf *request)
3824 struct tcmsg *tcmsg;
3828 error = get_ifindex(netdev, &ifindex);
3833 ofpbuf_init(request, 512);
3834 nl_msg_put_nlmsghdr(request, sizeof *tcmsg, type, NLM_F_REQUEST | flags);
3835 tcmsg = ofpbuf_put_zeros(request, sizeof *tcmsg);
3836 tcmsg->tcm_family = AF_UNSPEC;
3837 tcmsg->tcm_ifindex = ifindex;
3838 /* Caller should fill in tcmsg->tcm_handle. */
3839 /* Caller should fill in tcmsg->tcm_parent. */
/* Sends 'request' on the NETLINK_ROUTE socket, storing the reply in
 * '*replyp' if nonnull, and releases 'request''s buffer in all cases. */
3845 tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
3847 int error = nl_transact(NETLINK_ROUTE, request, replyp);
3848 ofpbuf_uninit(request);
3852 /* Adds or deletes a root ingress qdisc on 'netdev'. We use this for
3853 * policing configuration.
3855 * This function is equivalent to running the following when 'add' is true:
3856 * /sbin/tc qdisc add dev <devname> handle ffff: ingress
3858 * This function is equivalent to running the following when 'add' is false:
3859 * /sbin/tc qdisc del dev <devname> handle ffff: ingress
3861 * The configuration and stats may be seen with the following command:
3862 * /sbin/tc -s qdisc show dev <devname>
3864 * Returns 0 if successful, otherwise a positive errno value.
3867 tc_add_del_ingress_qdisc(struct netdev *netdev, bool add)
3869 struct ofpbuf request;
3870 struct tcmsg *tcmsg;
3872 int type = add ? RTM_NEWQDISC : RTM_DELQDISC;
3873 int flags = add ? NLM_F_EXCL | NLM_F_CREATE : 0;
3875 tcmsg = tc_make_request(netdev, type, flags, &request);
3879 tcmsg->tcm_handle = tc_make_handle(0xffff, 0);
3880 tcmsg->tcm_parent = TC_H_INGRESS;
3881 nl_msg_put_string(&request, TCA_KIND, "ingress");
3882 nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0);
3884 error = tc_transact(&request, NULL);
3886 /* If we're deleting the qdisc, don't worry about some of the
3887 * error conditions. */
/* ENOENT/EINVAL on delete mean the ingress qdisc simply was not there. */
3888 if (!add && (error == ENOENT || error == EINVAL)) {
3897 /* Adds a policer to 'netdev' with a rate of 'kbits_rate' and a burst size
3900 * This function is equivalent to running:
3901 * /sbin/tc filter add dev <devname> parent ffff: protocol all prio 49
3902 * basic police rate <kbits_rate>kbit burst <kbits_burst>k
3905 * The configuration and stats may be seen with the following command:
3906 * /sbin/tc -s filter show <devname> eth0 parent ffff:
3908 * Returns 0 if successful, otherwise a positive errno value.
3911 tc_add_policer(struct netdev *netdev, int kbits_rate, int kbits_burst)
3913 struct tc_police tc_police;
3914 struct ofpbuf request;
3915 struct tcmsg *tcmsg;
3916 size_t basic_offset;
3917 size_t police_offset;
3921 memset(&tc_police, 0, sizeof tc_police);
3922 tc_police.action = TC_POLICE_SHOT;
3923 tc_police.mtu = mtu;
3924 tc_fill_rate(&tc_police.rate, (kbits_rate * 1000)/8, mtu);
3925 tc_police.burst = tc_bytes_to_ticks(tc_police.rate.rate,
3926 kbits_burst * 1024);
3928 tcmsg = tc_make_request(netdev, RTM_NEWTFILTER,
3929 NLM_F_EXCL | NLM_F_CREATE, &request);
3933 tcmsg->tcm_parent = tc_make_handle(0xffff, 0);
3934 tcmsg->tcm_info = tc_make_handle(49,
3935 (OVS_FORCE uint16_t) htons(ETH_P_ALL));
3937 nl_msg_put_string(&request, TCA_KIND, "basic");
3938 basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3939 police_offset = nl_msg_start_nested(&request, TCA_BASIC_POLICE);
3940 nl_msg_put_unspec(&request, TCA_POLICE_TBF, &tc_police, sizeof tc_police);
3941 tc_put_rtab(&request, TCA_POLICE_RATE, &tc_police.rate);
3942 nl_msg_end_nested(&request, police_offset);
3943 nl_msg_end_nested(&request, basic_offset);
3945 error = tc_transact(&request, NULL);
3956 /* The values in psched are not individually very meaningful, but they are
3957 * important. The tables below show some values seen in the wild.
3961 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
3962 * (Before that, there are hints that it was 1000000000.)
3964 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
3968 * -----------------------------------
3969 * [1] 000c8000 000f4240 000f4240 00000064
3970 * [2] 000003e8 00000400 000f4240 3b9aca00
3971 * [3] 000003e8 00000400 000f4240 3b9aca00
3972 * [4] 000003e8 00000400 000f4240 00000064
3973 * [5] 000003e8 00000040 000f4240 3b9aca00
3974 * [6] 000003e8 00000040 000f4240 000000f9
3976 * a b c d ticks_per_s buffer_hz
3977 * ------- --------- ---------- ------------- ----------- -------------
3978 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
3979 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3980 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3981 * [4] 1,000 1,024 1,000,000 100 976,562 100
3982 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
3983 * [6] 1,000 64 1,000,000 249 15,625,000 249
3985 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
3986 * [2] 2.6.26-1-686-bigmem from Debian lenny
3987 * [3] 2.6.26-2-sparc64 from Debian lenny
3988 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
3989 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
3990 * [6] 2.6.34 from kernel.org on KVM
/* Body of read_psched() (the function's header line is not visible in this
 * chunk).  Parses the four hex fields of /proc/net/psched, from which it
 * derives the module-level 'ticks_per_s' and 'buffer_hz' used by the tc
 * tick-conversion helpers below.  Runs at most once per process. */
3992 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
3993 static const char fn[] = "/proc/net/psched";
3994 unsigned int a, b, c, d;
/* Already initialized by another thread; nothing to do. */
3997 if (!ovsthread_once_start(&once)) {
4004 stream = fopen(fn, "r");
4006 VLOG_WARN("%s: open failed: %s", fn, ovs_strerror(errno));
4010 if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
4011 VLOG_WARN("%s: read failed", fn);
4015 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
4019 VLOG_WARN("%s: invalid scheduler parameters", fn);
/* See the table above: a/b is the ticks-per-microsecond ratio, c the
 * microseconds per second, so a*c/b gives ticks per second. */
4023 ticks_per_s = (double) a * c / b;
4027 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
4030 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);
4033 ovsthread_once_done(&once);
4036 /* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
4037 * rate of 'rate' bytes per second. */
4039 tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
/* NOTE(review): 'rate * ticks' is a 32-bit multiply as written here and can
 * wrap for large rates/tick counts -- confirm callers keep it in range. */
4042     return (rate * ticks) / ticks_per_s;
4045 /* Returns the number of ticks that it would take to transmit 'size' bytes at a
4046 * rate of 'rate' bytes per second. */
4048 tc_bytes_to_ticks(unsigned int rate, unsigned int size)
/* Returns 0 for a zero rate (avoids division by zero); the multiply is
 * widened to 64 bits before dividing. */
4051     return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
4054 /* Returns the number of bytes that need to be reserved for qdisc buffering at
4055 * a transmission rate of 'rate' bytes per second. */
4057 tc_buffer_per_jiffy(unsigned int rate)
/* 'buffer_hz' comes from /proc/net/psched via read_psched() above. */
4060     return rate / buffer_hz;
4063 /* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
4064 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
4065 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
4066 * stores NULL into it if it is absent.
4068 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
4071 * Returns 0 if successful, otherwise a positive errno value. */
4073 tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
4074 struct nlattr **options)
4076     static const struct nl_policy tca_policy[] = {
/* TCA_KIND is mandatory; TCA_OPTIONS may legitimately be absent. */
4077         [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
4078         [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
4080     struct nlattr *ta[ARRAY_SIZE(tca_policy)];
/* Attributes start after the netlink header plus the fixed tcmsg. */
4082     if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
4083                          tca_policy, ta, ARRAY_SIZE(ta))) {
4084         VLOG_WARN_RL(&rl, "failed to parse qdisc message");
4089     *kind = nl_attr_get_string(ta[TCA_KIND]);
4093     *options = ta[TCA_OPTIONS];
4108 /* Given Netlink 'msg' that describes a class, extracts the queue ID (e.g. the
4109 * minor number of its class ID) into '*queue_id', its TCA_OPTIONS attribute
4110 * into '*options', and its queue statistics into '*stats'. Any of the output
4111 * arguments may be null.
4113 * Returns 0 if successful, otherwise a positive errno value. */
4115 tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
4116 struct nlattr **options, struct netdev_queue_stats *stats)
4118     static const struct nl_policy tca_policy[] = {
4119         [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
4120         [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
4122     struct nlattr *ta[ARRAY_SIZE(tca_policy)];
4124     if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
4125                          tca_policy, ta, ARRAY_SIZE(ta))) {
4126         VLOG_WARN_RL(&rl, "failed to parse class message");
/* The full class handle lives in the fixed tcmsg part of the message. */
4131     struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
4132     *handlep = tc->tcm_handle;
4136     *options = ta[TCA_OPTIONS];
4140     const struct gnet_stats_queue *gsq;
4141     struct gnet_stats_basic gsb;
4143     static const struct nl_policy stats_policy[] = {
4144         [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
4145                               .min_len = sizeof gsb },
4146         [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
4147                               .min_len = sizeof *gsq },
4149     struct nlattr *sa[ARRAY_SIZE(stats_policy)];
/* TCA_STATS2 is itself a nested attribute set; parse its interior. */
4151     if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
4152                          sa, ARRAY_SIZE(sa))) {
4153         VLOG_WARN_RL(&rl, "failed to parse class stats");
4157     /* Alignment issues screw up the length of struct gnet_stats_basic on
4158     * some arch/bitsize combinations. Newer versions of Linux have a
4159     * struct gnet_stats_basic_packed, but we can't depend on that. The
4160     * easiest thing to do is just to make a copy. */
4161     memset(&gsb, 0, sizeof gsb);
4162     memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
4163            MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
4164     stats->tx_bytes = gsb.bytes;
4165     stats->tx_packets = gsb.packets;
4167     gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
/* Queue drops are the closest available analog of "tx errors". */
4168     stats->tx_errors = gsq->drops;
/* On parse failure the caller still gets defined (zeroed) stats. */
4178     memset(stats, 0, sizeof *stats);
4183 /* Queries the kernel for class with identifier 'handle' and parent 'parent'
4186 tc_query_class(const struct netdev *netdev,
4187 unsigned int handle, unsigned int parent,
4188 struct ofpbuf **replyp)
4190     struct ofpbuf request;
4191     struct tcmsg *tcmsg;
/* NLM_F_ECHO asks the kernel to echo the class back in the reply. */
4194     tcmsg = tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO, &request);
4198     tcmsg->tcm_handle = handle;
4199     tcmsg->tcm_parent = parent;
4201     error = tc_transact(&request, replyp);
/* Rate-limited warning; handles are logged in the usual major:minor form. */
4203         VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
4204                      netdev_get_name(netdev),
4205                      tc_get_major(handle), tc_get_minor(handle),
4206                      tc_get_major(parent), tc_get_minor(parent),
4207                      ovs_strerror(error));
4212 /* Equivalent to "tc class del dev <name> handle <handle>". */
4214 tc_delete_class(const struct netdev *netdev, unsigned int handle)
4216     struct ofpbuf request;
4217     struct tcmsg *tcmsg;
4220     tcmsg = tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
4224     tcmsg->tcm_handle = handle;
/* tcm_parent of 0 lets the kernel locate the class by handle alone. */
4225     tcmsg->tcm_parent = 0;
4227     error = tc_transact(&request, NULL);
4229         VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
4230                      netdev_get_name(netdev),
4231                      tc_get_major(handle), tc_get_minor(handle),
4232                      ovs_strerror(error));
4237 /* Equivalent to "tc qdisc del dev <name> root". */
4239 tc_del_qdisc(struct netdev *netdev_)
4241     struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4242     struct ofpbuf request;
4243     struct tcmsg *tcmsg;
4246     tcmsg = tc_make_request(netdev_, RTM_DELQDISC, 0, &request);
/* OVS-created root qdiscs use handle "1:0" (see tc_query_qdisc below). */
4250     tcmsg->tcm_handle = tc_make_handle(1, 0);
4251     tcmsg->tcm_parent = TC_H_ROOT;
4253     error = tc_transact(&request, NULL);
4254     if (error == EINVAL) {
4255         /* EINVAL probably means that the default qdisc was in use, in which
4256         * case we've accomplished our purpose. */
/* On success also tear down any cached tc state attached to the netdev. */
4259     if (!error && netdev->tc) {
4260         if (netdev->tc->ops->tc_destroy) {
4261             netdev->tc->ops->tc_destroy(netdev->tc);
4268 /* If 'netdev''s qdisc type and parameters are not yet known, queries the
4269 * kernel to determine what they are. Returns 0 if successful, otherwise a
4270 * positive errno value. */
4272 tc_query_qdisc(const struct netdev *netdev_)
4274     struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4275     struct ofpbuf request, *qdisc;
4276     const struct tc_ops *ops;
4277     struct tcmsg *tcmsg;
4285     /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
4286     * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
4287     * 2.6.35 without that fix backported to it.
4289     * To avoid the OOPS, we must not make a request that would attempt to dump
4290     * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
4291     * few others. There are a few ways that I can see to do this, but most of
4292     * them seem to be racy (and if you lose the race the kernel OOPSes). The
4293     * technique chosen here is to assume that any non-default qdisc that we
4294     * create will have a class with handle 1:0. The built-in qdiscs only have
4295     * a class with handle 0:0.
4297     * We could check for Linux 2.6.35+ and use a more straightforward method
4299     tcmsg = tc_make_request(netdev_, RTM_GETQDISC, NLM_F_ECHO, &request);
4303     tcmsg->tcm_handle = tc_make_handle(1, 0);
4304     tcmsg->tcm_parent = 0;
4306     /* Figure out what tc class to instantiate. */
4307     error = tc_transact(&request, &qdisc);
4311         error = tc_parse_qdisc(qdisc, &kind, NULL);
/* Parse failure: treat the qdisc as an unmanageable "other" type. */
4313             ops = &tc_ops_other;
4315             ops = tc_lookup_linux_name(kind);
4317                 static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
4318                 VLOG_INFO_RL(&rl2, "unknown qdisc \"%s\"", kind);
4320                 ops = &tc_ops_other;
4323     } else if (error == ENOENT) {
4324         /* Either it's a built-in qdisc, or it's a qdisc set up by some
4325         * other entity that doesn't have a handle 1:0. We will assume
4326         * that it's the system default qdisc. */
4327         ops = &tc_ops_default;
4330         /* Who knows? Maybe the device got deleted. */
4331         VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
4332                      netdev_get_name(netdev_), ovs_strerror(error));
4333         ops = &tc_ops_other;
4336     /* Instantiate it. */
4337     load_error = ops->tc_load(CONST_CAST(struct netdev *, netdev_), qdisc);
/* tc_load() succeeding must leave netdev->tc set, and vice versa. */
4338     ovs_assert((load_error == 0) == (netdev->tc != NULL));
4339     ofpbuf_delete(qdisc);
/* The netlink error (if any) takes precedence over the load error. */
4341     return error ? error : load_error;
4344 /* Linux traffic control uses tables with 256 entries ("rtab" tables) to
4345 approximate the time to transmit packets of various lengths. For an MTU of
4346 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
4347 represents two possible packet lengths; for a MTU of 513 through 1024, four
4348 possible lengths; and so on.
4350 Returns, for the specified 'mtu', the number of bits that packet lengths
4351 need to be shifted right to fit within such a 256-entry table. */
4353 tc_calc_cell_log(unsigned int mtu)
/* A zero MTU falls back to the standard Ethernet payload size. */
4358         mtu = ETH_PAYLOAD_MAX;
/* Account for L2 framing on top of the payload MTU. */
4360     mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
4362     for (cell_log = 0; mtu >= 256; cell_log++) {
4369 /* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
4372 tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
4374     memset(rate, 0, sizeof *rate);
4375     rate->cell_log = tc_calc_cell_log(mtu);
4376     /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
4377     /* rate->cell_align = 0; */ /* distro headers. */
/* Minimum packet unit: never rate packets as smaller than a minimal frame. */
4378     rate->mpu = ETH_TOTAL_MIN;
4382 /* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
4383 * attribute of the specified "type".
4385 * See tc_calc_cell_log() above for a description of "rtab"s. */
4387 tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
4392     rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
/* Each entry i covers packet sizes up to (i+1) << cell_log bytes. */
4393     for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
4394         unsigned packet_size = (i + 1) << rate->cell_log;
/* Clamp to the minimum packet unit, matching the kernel's behavior. */
4395         if (packet_size < rate->mpu) {
4396             packet_size = rate->mpu;
4398         rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
4402 /* Calculates the proper value of 'buffer' or 'cbuffer' in HTB options given a
4403 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
4404 * burst size of 'burst_bytes'. (If no value was requested, a 'burst_bytes' of
4407 tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
/* The buffer must at least absorb one jiffy's worth of traffic plus an MTU. */
4409     unsigned int min_burst = tc_buffer_per_jiffy(Bps) + mtu;
4410     return tc_bytes_to_ticks(Bps, MAX(burst_bytes, min_burst));
4413 /* Linux-only functions declared in netdev-linux.h */
4415 /* Modifies the 'flag' bit in ethtool's flags field for 'netdev'. If
4416 * 'enable' is true, the bit is set. Otherwise, it is cleared. */
4418 netdev_linux_ethtool_set_flag(struct netdev *netdev, uint32_t flag,
4419 const char *flag_name, bool enable)
4421     const char *netdev_name = netdev_get_name(netdev);
4422     struct ethtool_value evalue;
/* Step 1: read the current ethtool flags. */
4426     COVERAGE_INC(netdev_get_ethtool);
4427     memset(&evalue, 0, sizeof evalue);
4428     error = netdev_linux_do_ethtool(netdev_name,
4429                                     (struct ethtool_cmd *)&evalue,
4430                                     ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
/* Step 2: write the flags back with 'flag' set or cleared as requested. */
4435     COVERAGE_INC(netdev_set_ethtool);
4436     evalue.data = new_flags = (evalue.data & ~flag) | (enable ? flag : 0);
4437     error = netdev_linux_do_ethtool(netdev_name,
4438                                     (struct ethtool_cmd *)&evalue,
4439                                     ETHTOOL_SFLAGS, "ETHTOOL_SFLAGS");
/* Step 3: re-read to verify the driver actually accepted the change. */
4444     COVERAGE_INC(netdev_get_ethtool);
4445     memset(&evalue, 0, sizeof evalue);
4446     error = netdev_linux_do_ethtool(netdev_name,
4447                                     (struct ethtool_cmd *)&evalue,
4448                                     ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
/* Some drivers silently ignore SFLAGS; detect and report that case. */
4453     if (new_flags != evalue.data) {
4454         VLOG_WARN_RL(&rl, "attempt to %s ethtool %s flag on network "
4455                      "device %s failed", enable ? "enable" : "disable",
4456                      flag_name, netdev_name);
4463 /* Utility functions. */
4465 /* Copies 'src' into 'dst', performing format conversion in the process. */
4467 netdev_stats_from_rtnl_link_stats(struct netdev_stats *dst,
4468 const struct rtnl_link_stats *src)
/* Straight field-by-field widening copy from the kernel's 32-bit
 * rtnl_link_stats into OVS's 64-bit netdev_stats. */
4470     dst->rx_packets = src->rx_packets;
4471     dst->tx_packets = src->tx_packets;
4472     dst->rx_bytes = src->rx_bytes;
4473     dst->tx_bytes = src->tx_bytes;
4474     dst->rx_errors = src->rx_errors;
4475     dst->tx_errors = src->tx_errors;
4476     dst->rx_dropped = src->rx_dropped;
4477     dst->tx_dropped = src->tx_dropped;
4478     dst->multicast = src->multicast;
4479     dst->collisions = src->collisions;
4480     dst->rx_length_errors = src->rx_length_errors;
4481     dst->rx_over_errors = src->rx_over_errors;
4482     dst->rx_crc_errors = src->rx_crc_errors;
4483     dst->rx_frame_errors = src->rx_frame_errors;
4484     dst->rx_fifo_errors = src->rx_fifo_errors;
4485     dst->rx_missed_errors = src->rx_missed_errors;
4486     dst->tx_aborted_errors = src->tx_aborted_errors;
4487     dst->tx_carrier_errors = src->tx_carrier_errors;
4488     dst->tx_fifo_errors = src->tx_fifo_errors;
4489     dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
4490     dst->tx_window_errors = src->tx_window_errors;
/* Fetches interface statistics for 'ifindex' via an RTM_GETLINK request and
 * converts them into 'stats'.  Returns 0 on success or a positive errno. */
4494 get_stats_via_netlink(int ifindex, struct netdev_stats *stats)
4496     /* Policy for RTNLGRP_LINK messages.
4498     * There are *many* more fields in these messages, but currently we only
4499     * care about these fields. */
4500     static const struct nl_policy rtnlgrp_link_policy[] = {
4501         [IFLA_IFNAME] = { .type = NL_A_STRING, .optional = false },
4502         [IFLA_STATS] = { .type = NL_A_UNSPEC, .optional = true,
4503                          .min_len = sizeof(struct rtnl_link_stats) },
4506     struct ofpbuf request;
4507     struct ofpbuf *reply;
4508     struct ifinfomsg *ifi;
4509     struct nlattr *attrs[ARRAY_SIZE(rtnlgrp_link_policy)];
4512     ofpbuf_init(&request, 0);
4513     nl_msg_put_nlmsghdr(&request, sizeof *ifi, RTM_GETLINK, NLM_F_REQUEST);
4514     ifi = ofpbuf_put_zeros(&request, sizeof *ifi);
4515     ifi->ifi_family = PF_UNSPEC;
4516     ifi->ifi_index = ifindex;
4517     error = nl_transact(NETLINK_ROUTE, &request, &reply);
4518     ofpbuf_uninit(&request);
4523     if (!nl_policy_parse(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg),
4524                          rtnlgrp_link_policy,
4525                          attrs, ARRAY_SIZE(rtnlgrp_link_policy))) {
4526         ofpbuf_delete(reply);
/* IFLA_STATS is optional in the policy but required by us in practice. */
4530     if (!attrs[IFLA_STATS]) {
4531         VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
4532         ofpbuf_delete(reply);
4536     netdev_stats_from_rtnl_link_stats(stats, nl_attr_get(attrs[IFLA_STATS]));
4538     ofpbuf_delete(reply);
/* Fallback statistics source: parses the per-device line of /proc/net/dev
 * for 'netdev_name' into 'stats'.  /proc/net/dev exposes fewer counters than
 * netlink, so the missing ones are reported as UINT64_MAX ("unavailable"). */
4544 get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats)
4546     static const char fn[] = "/proc/net/dev";
4551     stream = fopen(fn, "r");
4553         VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, ovs_strerror(errno));
4558     while (fgets(line, sizeof line, stream)) {
4561 #define X64 "%"SCNu64
/* Two "%*u" conversions skip fields /proc reports but netdev_stats lacks. */
4564              X64 X64 X64 X64 X64 X64 X64 "%*u"
4565              X64 X64 X64 X64 X64 X64 X64 "%*u",
4571              &stats->rx_fifo_errors,
4572              &stats->rx_frame_errors,
4578              &stats->tx_fifo_errors,
4580              &stats->tx_carrier_errors)) {
4581             VLOG_WARN_RL(&rl, "%s:%d: parse error", fn, ln);
4582         } else if (!strcmp(devname, netdev_name)) {
/* Counters not present in /proc/net/dev are flagged as unavailable. */
4583             stats->rx_length_errors = UINT64_MAX;
4584             stats->rx_over_errors = UINT64_MAX;
4585             stats->rx_crc_errors = UINT64_MAX;
4586             stats->rx_missed_errors = UINT64_MAX;
4587             stats->tx_aborted_errors = UINT64_MAX;
4588             stats->tx_heartbeat_errors = UINT64_MAX;
4589             stats->tx_window_errors = UINT64_MAX;
4595     VLOG_WARN_RL(&rl, "%s: no stats for %s", fn, netdev_name);
/* Reads the IFF_* interface flags of 'dev' into '*flags' via SIOCGIFFLAGS.
 * Returns 0 or a positive errno value. */
4601 get_flags(const struct netdev *dev, unsigned int *flags)
4607     error = af_inet_ifreq_ioctl(dev->name, &ifr, SIOCGIFFLAGS, "SIOCGIFFLAGS");
4609     *flags = ifr.ifr_flags;
/* Sets the IFF_* interface flags of device 'name' to 'flags' via
 * SIOCSIFFLAGS.  Returns 0 or a positive errno value. */
4615 set_flags(const char *name, unsigned int flags)
4619     ifr.ifr_flags = flags;
4620     return af_inet_ifreq_ioctl(name, &ifr, SIOCSIFFLAGS, "SIOCSIFFLAGS");
/* Looks up the kernel ifindex for 'netdev_name' with SIOCGIFINDEX.
 * NOTE(review): on failure the visible code logs a warning; presumably it
 * then returns -error (see get_ifindex() below, which negates a negative
 * return) -- confirm the non-visible return path. */
4624 do_get_ifindex(const char *netdev_name)
4629     ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4630     COVERAGE_INC(netdev_get_ifindex);
4632     error = af_inet_ioctl(SIOCGIFINDEX, &ifr);
4634         VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
4635                      netdev_name, ovs_strerror(error));
4638     return ifr.ifr_ifindex;
/* Returns 'netdev_''s ifindex in '*ifindexp', caching both the value and any
 * error so the ioctl is performed at most once per cache validity period.
 * Returns 0 on success or the cached positive errno value. */
4642 get_ifindex(const struct netdev *netdev_, int *ifindexp)
4644     struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4646     if (!(netdev->cache_valid & VALID_IFINDEX)) {
4647         int ifindex = do_get_ifindex(netdev_get_name(netdev_));
/* A negative return from do_get_ifindex() encodes -errno. */
4650             netdev->get_ifindex_error = -ifindex;
4651             netdev->ifindex = 0;
4653             netdev->get_ifindex_error = 0;
4654             netdev->ifindex = ifindex;
4656         netdev->cache_valid |= VALID_IFINDEX;
4659     *ifindexp = netdev->ifindex;
4660     return netdev->get_ifindex_error;
/* Reads the hardware (Ethernet) address of 'netdev_name' into 'ea' via
 * SIOCGIFHWADDR.  Returns 0 or a positive errno value. */
4664 get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN])
4670     memset(&ifr, 0, sizeof ifr);
4671     ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4672     COVERAGE_INC(netdev_get_hwaddr);
4673     error = af_inet_ioctl(SIOCGIFHWADDR, &ifr);
4675         /* ENODEV probably means that a vif disappeared asynchronously and
4676         * hasn't been removed from the database yet, so reduce the log level
4677         * to INFO for that case. */
4678         VLOG(error == ENODEV ? VLL_INFO : VLL_ERR,
4679              "ioctl(SIOCGIFHWADDR) on %s device failed: %s",
4680              netdev_name, ovs_strerror(error));
/* Only Ethernet-style addresses (or AF_UNSPEC) make sense for 'ea'. */
4683     hwaddr_family = ifr.ifr_hwaddr.sa_family;
4684     if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER) {
4685         VLOG_WARN("%s device has unknown hardware address family %d",
4686                   netdev_name, hwaddr_family);
4688     memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
/* Sets the hardware (Ethernet) address of 'netdev_name' to 'mac' via
 * SIOCSIFHWADDR.  Returns 0 or a positive errno value. */
4693 set_etheraddr(const char *netdev_name,
4694 const uint8_t mac[ETH_ADDR_LEN])
4699     memset(&ifr, 0, sizeof ifr);
4700     ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4701     ifr.ifr_hwaddr.sa_family = ARPHRD_ETHER;
4702     memcpy(ifr.ifr_hwaddr.sa_data, mac, ETH_ADDR_LEN);
4703     COVERAGE_INC(netdev_set_hwaddr);
4704     error = af_inet_ioctl(SIOCSIFHWADDR, &ifr);
4706         VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
4707                  netdev_name, ovs_strerror(error));
/* Issues ethtool command 'cmd' (e.g. ETHTOOL_GFLAGS) for device 'name',
 * using 'ecmd' as the in/out command buffer; 'cmd_name' is used only for
 * logging.  Returns 0 or a positive errno value. */
4713 netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
4714 int cmd, const char *cmd_name)
4719     memset(&ifr, 0, sizeof ifr);
4720     ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
/* ethtool passes its command struct through ifr_data. */
4721     ifr.ifr_data = (caddr_t) ecmd;
4724     error = af_inet_ioctl(SIOCETHTOOL, &ifr);
4726         if (error != EOPNOTSUPP) {
4727             VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
4728                          "failed: %s", cmd_name, name, ovs_strerror(error));
4730             /* The device doesn't support this operation. That's pretty
4731             * common, so there's no point in logging anything. */
/* Fetches an IPv4 address of 'netdev' via the given SIOCGIF* ioctl 'cmd'
 * (named 'cmd_name' for logging) and stores it in '*ip'.  Returns 0 or a
 * positive errno value. */
4738 netdev_linux_get_ipv4(const struct netdev *netdev, struct in_addr *ip,
4739 int cmd, const char *cmd_name)
4744     ifr.ifr_addr.sa_family = AF_INET;
4745     error = af_inet_ifreq_ioctl(netdev_get_name(netdev), &ifr, cmd, cmd_name);
/* ALIGNED_CAST avoids the strict-aliasing/alignment issue of casting
 * &ifr.ifr_addr directly to sockaddr_in. */
4747         const struct sockaddr_in *sin = ALIGNED_CAST(struct sockaddr_in *,
4749         *ip = sin->sin_addr;
4754 /* Returns an AF_PACKET raw socket or a negative errno value. */
4756 af_packet_sock(void)
4758 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
4761 if (ovsthread_once_start(&once)) {
4762 sock = socket(AF_PACKET, SOCK_RAW, 0);
4764 int error = set_nonblocking(sock);
4771 VLOG_ERR("failed to create packet socket: %s",
4772 ovs_strerror(errno));
4774 ovsthread_once_done(&once);