2 * Copyright (c) 2009, 2010, 2011, 2012, 2013 Nicira, Inc.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
19 #include "netdev-linux.h"
23 #include <arpa/inet.h>
25 #include <linux/filter.h>
26 #include <linux/gen_stats.h>
27 #include <linux/if_ether.h>
28 #include <linux/if_tun.h>
29 #include <linux/types.h>
30 #include <linux/ethtool.h>
31 #include <linux/mii.h>
32 #include <linux/pkt_cls.h>
33 #include <linux/pkt_sched.h>
34 #include <linux/rtnetlink.h>
35 #include <linux/sockios.h>
36 #include <linux/version.h>
37 #include <sys/types.h>
38 #include <sys/ioctl.h>
39 #include <sys/socket.h>
40 #include <netpacket/packet.h>
42 #include <net/if_arp.h>
43 #include <net/if_packet.h>
44 #include <net/route.h>
45 #include <netinet/in.h>
52 #include "dpif-linux.h"
53 #include "dynamic-string.h"
54 #include "fatal-signal.h"
57 #include "netdev-provider.h"
58 #include "netdev-vport.h"
59 #include "netlink-notifier.h"
60 #include "netlink-socket.h"
63 #include "openflow/openflow.h"
64 #include "ovs-atomic.h"
66 #include "poll-loop.h"
67 #include "rtnetlink-link.h"
69 #include "socket-util.h"
72 #include "unaligned.h"
75 VLOG_DEFINE_THIS_MODULE(netdev_linux);
77 COVERAGE_DEFINE(netdev_set_policing);
78 COVERAGE_DEFINE(netdev_arp_lookup);
79 COVERAGE_DEFINE(netdev_get_ifindex);
80 COVERAGE_DEFINE(netdev_get_hwaddr);
81 COVERAGE_DEFINE(netdev_set_hwaddr);
82 COVERAGE_DEFINE(netdev_get_ethtool);
83 COVERAGE_DEFINE(netdev_set_ethtool);
86 /* These were introduced in Linux 2.6.14, so they might be missing if we have
88 #ifndef ADVERTISED_Pause
89 #define ADVERTISED_Pause (1 << 13)
91 #ifndef ADVERTISED_Asym_Pause
92 #define ADVERTISED_Asym_Pause (1 << 14)
95 /* These were introduced in Linux 2.6.24, so they might be missing if we
96 * have old headers. */
97 #ifndef ETHTOOL_GFLAGS
98 #define ETHTOOL_GFLAGS 0x00000025 /* Get flags bitmap(ethtool_value) */
100 #ifndef ETHTOOL_SFLAGS
101 #define ETHTOOL_SFLAGS 0x00000026 /* Set flags bitmap(ethtool_value) */
104 /* This was introduced in Linux 2.6.25, so it might be missing if we have old
107 #define TC_RTAB_SIZE 1024
/* Bits for 'cache_valid' in struct netdev_linux.  Each bit records that the
 * correspondingly named cached member has been fetched and is current; the
 * cache is invalidated by netdev_linux_changed().
 * NOTE(review): the enclosing enum declaration and bits 2-4 are elided from
 * this view of the source -- confirm against the full file. */
111 VALID_IFINDEX = 1 << 0,
112 VALID_ETHERADDR = 1 << 1,
116 VALID_POLICING = 1 << 5,
117 VALID_VPORT_STAT_ERROR = 1 << 6,
118 VALID_DRVINFO = 1 << 7,
119 VALID_FEATURES = 1 << 8,
122 /* Traffic control. */
124 /* An instance of a traffic control class. Always associated with a particular
127 * Each TC implementation subclasses this with whatever additional data it
/* NOTE(review): the 'struct tc {' line itself is elided from this view. */
130 const struct tc_ops *ops; /* The implementation, e.g. &tc_ops_htb. */
131 struct hmap queues; /* Contains "struct tc_queue"s.
132 * Read by generic TC layer.
133 * Written only by TC implementation. */
/* Initializer for a struct tc embedded at the start of a subclass object:
 * sets 'ops' and an empty 'queues' hmap. */
136 #define TC_INITIALIZER(TC, OPS) { OPS, HMAP_INITIALIZER(&(TC)->queues) }
138 /* One traffic control queue.
140 * Each TC implementation subclasses this with whatever additional data it
/* NOTE(review): the 'struct tc_queue {' line is elided from this view. */
143 struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
144 unsigned int queue_id; /* OpenFlow queue ID. */
145 long long int created; /* Time queue was created, in msecs. */
148 /* A particular kind of traffic control. Each implementation generally maps to
149 * one particular Linux qdisc class.
151 * The functions below return 0 if successful or a positive errno value on
152 * failure, except where otherwise noted. All of them must be provided, except
153 * where otherwise noted. */
155 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
156 * This is null for tc_ops_default and tc_ops_other, for which there are no
157 * appropriate values. */
158 const char *linux_name;
160 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
161 const char *ovs_name;
163 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
164 * queues. The queues are numbered 0 through n_queues - 1. */
165 unsigned int n_queues;
167 /* Called to install this TC class on 'netdev'. The implementation should
168 * make the Netlink calls required to set up 'netdev' with the right qdisc
169 * and configure it according to 'details'. The implementation may assume
170 * that the current qdisc is the default; that is, there is no need for it
171 * to delete the current qdisc before installing itself.
173 * The contents of 'details' should be documented as valid for 'ovs_name'
174 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
175 * (which is built as ovs-vswitchd.conf.db(8)).
177 * This function must return 0 if and only if it sets 'netdev->tc' to an
178 * initialized 'struct tc'.
180 * (This function is null for tc_ops_other, which cannot be installed. For
181 * other TC classes it should always be nonnull.) */
182 int (*tc_install)(struct netdev *netdev, const struct smap *details);
184 /* Called when the netdev code determines (through a Netlink query) that
185 * this TC class's qdisc is installed on 'netdev', but we didn't install
186 * it ourselves and so don't know any of the details.
188 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
189 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
190 * implementation should parse the other attributes of 'nlmsg' as
191 * necessary to determine its configuration. If necessary it should also
192 * use Netlink queries to determine the configuration of queues on
195 * This function must return 0 if and only if it sets 'netdev->tc' to an
196 * initialized 'struct tc'. */
197 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
199 /* Destroys the data structures allocated by the implementation as part of
200 * 'tc'. (This includes destroying 'tc->queues' by calling
203 * The implementation should not need to perform any Netlink calls. If
204 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
205 * (But it may not be desirable.)
207 * This function may be null if 'tc' is trivial. */
208 void (*tc_destroy)(struct tc *tc);
210 /* Retrieves details of 'netdev->tc' configuration into 'details'.
212 * The implementation should not need to perform any Netlink calls, because
213 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
214 * cached the configuration.
216 * The contents of 'details' should be documented as valid for 'ovs_name'
217 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
218 * (which is built as ovs-vswitchd.conf.db(8)).
220 * This function may be null if 'tc' is not configurable.
222 int (*qdisc_get)(const struct netdev *netdev, struct smap *details);
224 /* Reconfigures 'netdev->tc' according to 'details', performing any
225 * required Netlink calls to complete the reconfiguration.
227 * The contents of 'details' should be documented as valid for 'ovs_name'
228 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
229 * (which is built as ovs-vswitchd.conf.db(8)).
231 * This function may be null if 'tc' is not configurable.
233 int (*qdisc_set)(struct netdev *, const struct smap *details);
235 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
236 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
238 * The contents of 'details' should be documented as valid for 'ovs_name'
239 * in the "other_config" column in the "Queue" table in
240 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
242 * The implementation should not need to perform any Netlink calls, because
243 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
244 * cached the queue configuration.
246 * This function may be null if 'tc' does not have queues ('n_queues' is
248 int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
249 struct smap *details);
251 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
252 * 'details', perfoming any required Netlink calls to complete the
253 * reconfiguration. The caller ensures that 'queue_id' is less than
256 * The contents of 'details' should be documented as valid for 'ovs_name'
257 * in the "other_config" column in the "Queue" table in
258 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
260 * This function may be null if 'tc' does not have queues or its queues are
261 * not configurable. */
262 int (*class_set)(struct netdev *, unsigned int queue_id,
263 const struct smap *details);
265 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
266 * tc_queue's within 'netdev->tc->queues'.
268 * This function may be null if 'tc' does not have queues or its queues
269 * cannot be deleted. */
270 int (*class_delete)(struct netdev *, struct tc_queue *queue);
272 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
273 * 'struct tc_queue's within 'netdev->tc->queues'.
275 * On success, initializes '*stats'.
277 * This function may be null if 'tc' does not have queues or if it cannot
278 * report queue statistics. */
279 int (*class_get_stats)(const struct netdev *netdev,
280 const struct tc_queue *queue,
281 struct netdev_queue_stats *stats);
283 /* Extracts queue stats from 'nlmsg', which is a response to a
284 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
286 * This function may be null if 'tc' does not have queues or if it cannot
287 * report queue statistics. */
288 int (*class_dump_stats)(const struct netdev *netdev,
289 const struct ofpbuf *nlmsg,
290 netdev_dump_queue_stats_cb *cb, void *aux);
/* Initializes 'tc' as a fresh traffic-control instance that will use 'ops'
 * as its implementation, with an empty queue map.
 * NOTE(review): lines elided here -- the function's braces and the
 * 'tc->ops = ops' style assignment are not visible in this view. */
294 tc_init(struct tc *tc, const struct tc_ops *ops)
297 hmap_init(&tc->queues);
/* Releases the generic part of 'tc' (its queues hmap).  Per the tc_ops
 * contract above, implementation-specific data and the queue contents are
 * the responsibility of the implementation's 'tc_destroy' callback. */
301 tc_destroy(struct tc *tc)
303 hmap_destroy(&tc->queues);
306 static const struct tc_ops tc_ops_htb;
307 static const struct tc_ops tc_ops_hfsc;
308 static const struct tc_ops tc_ops_default;
309 static const struct tc_ops tc_ops_other;
311 static const struct tc_ops *const tcs[] = {
312 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
313 &tc_ops_hfsc, /* Hierarchical fair service curve. */
314 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
315 &tc_ops_other, /* Some other qdisc. */
319 static unsigned int tc_make_handle(unsigned int major, unsigned int minor);
320 static unsigned int tc_get_major(unsigned int handle);
321 static unsigned int tc_get_minor(unsigned int handle);
323 static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
324 static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
325 static unsigned int tc_buffer_per_jiffy(unsigned int rate);
327 static struct tcmsg *tc_make_request(const struct netdev *, int type,
328 unsigned int flags, struct ofpbuf *);
329 static int tc_transact(struct ofpbuf *request, struct ofpbuf **replyp);
330 static int tc_add_del_ingress_qdisc(struct netdev *netdev, bool add);
331 static int tc_add_policer(struct netdev *netdev, int kbits_rate,
334 static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
335 struct nlattr **options);
336 static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
337 struct nlattr **options,
338 struct netdev_queue_stats *);
339 static int tc_query_class(const struct netdev *,
340 unsigned int handle, unsigned int parent,
341 struct ofpbuf **replyp);
342 static int tc_delete_class(const struct netdev *, unsigned int handle);
344 static int tc_del_qdisc(struct netdev *netdev);
345 static int tc_query_qdisc(const struct netdev *netdev);
347 static int tc_calc_cell_log(unsigned int mtu);
348 static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
349 static void tc_put_rtab(struct ofpbuf *, uint16_t type,
350 const struct tc_ratespec *rate);
351 static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
/* Per-device state for Linux system and tap netdevs.
 * NOTE(review): several members (mtu, ifindex, tc pointer, tap_fd, ...) are
 * elided from this view of the source; they are referenced elsewhere in the
 * file (e.g. netdev->mtu, netdev->tap_fd, netdev->tc). */
353 struct netdev_linux {
356 /* Protects all members below. */
357 struct ovs_mutex mutex;
359 unsigned int cache_valid;
360 unsigned int change_seq;
362 bool miimon; /* Link status of last poll. */
363 long long int miimon_interval; /* Miimon Poll rate. Disabled if <= 0. */
364 struct timer miimon_timer;
366 /* The following are figured out "on demand" only. They are only valid
367 * when the corresponding VALID_* bit in 'cache_valid' is set. */
369 uint8_t etheraddr[ETH_ADDR_LEN];
370 struct in_addr address, netmask;
373 unsigned int ifi_flags;
374 long long int carrier_resets;
375 uint32_t kbits_rate; /* Policing data. */
376 uint32_t kbits_burst;
377 int vport_stats_error; /* Cached error code from vport_get_stats().
378 0 or an errno value. */
379 int netdev_mtu_error; /* Cached error code from SIOCGIFMTU or SIOCSIFMTU. */
380 int ether_addr_error; /* Cached error code from set/get etheraddr. */
381 int netdev_policing_error; /* Cached error code from set policing. */
382 int get_features_error; /* Cached error code from ETHTOOL_GSET. */
383 int get_ifindex_error; /* Cached error code from SIOCGIFINDEX. */
385 enum netdev_features current; /* Cached from ETHTOOL_GSET. */
386 enum netdev_features advertised; /* Cached from ETHTOOL_GSET. */
387 enum netdev_features supported; /* Cached from ETHTOOL_GSET. */
389 struct ethtool_drvinfo drvinfo; /* Cached from ETHTOOL_GDRVINFO. */
392 /* For devices of class netdev_tap_class only. */
396 struct netdev_rx_linux {
402 /* This is set pretty low because we probably won't learn anything from the
403 * additional log messages. */
404 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
406 /* Polling miimon status for all ports causes performance degradation when
407 * handling a large number of ports. If there are no devices using miimon, then
408 * we skip netdev_linux_miimon_run() and netdev_linux_miimon_wait(). */
409 static atomic_int miimon_cnt = ATOMIC_VAR_INIT(0);
411 static void netdev_linux_run(void);
413 static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
414 int cmd, const char *cmd_name);
415 static int netdev_linux_get_ipv4(const struct netdev *, struct in_addr *,
416 int cmd, const char *cmd_name);
417 static int get_flags(const struct netdev *, unsigned int *flags);
418 static int set_flags(const char *, unsigned int flags);
419 static int update_flags(struct netdev_linux *netdev, enum netdev_flags off,
420 enum netdev_flags on, enum netdev_flags *old_flagsp)
421 OVS_REQUIRES(netdev->mutex);
422 static int do_get_ifindex(const char *netdev_name);
423 static int get_ifindex(const struct netdev *, int *ifindexp);
424 static int do_set_addr(struct netdev *netdev,
425 int ioctl_nr, const char *ioctl_name,
426 struct in_addr addr);
427 static int get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN]);
428 static int set_etheraddr(const char *netdev_name, const uint8_t[ETH_ADDR_LEN]);
429 static int get_stats_via_netlink(int ifindex, struct netdev_stats *stats);
430 static int get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats);
431 static int af_packet_sock(void);
432 static bool netdev_linux_miimon_enabled(void);
433 static void netdev_linux_miimon_run(void);
434 static void netdev_linux_miimon_wait(void);
/* Returns true if 'netdev_class' is one of this file's Linux-backed classes,
 * identified by their shared use of netdev_linux_run as the 'run' callback.
 * NOTE(review): function braces and return types are elided in this view. */
437 is_netdev_linux_class(const struct netdev_class *netdev_class)
439 return netdev_class->run == netdev_linux_run;
/* Returns true if 'netdev' is a tap device (netdev_tap_class). */
443 is_tap_netdev(const struct netdev *netdev)
445 return netdev_get_class(netdev) == &netdev_tap_class;
/* Downcasts 'netdev' to its containing struct netdev_linux, asserting that
 * the class really is one of ours. */
448 static struct netdev_linux *
449 netdev_linux_cast(const struct netdev *netdev)
451 ovs_assert(is_netdev_linux_class(netdev_get_class(netdev)));
453 return CONTAINER_OF(netdev, struct netdev_linux, up);
/* Downcasts 'rx' to its containing struct netdev_rx_linux, asserting that
 * its netdev's class is one of ours. */
456 static struct netdev_rx_linux *
457 netdev_rx_linux_cast(const struct netdev_rx *rx)
459 ovs_assert(is_netdev_linux_class(netdev_get_class(rx->netdev)));
460 return CONTAINER_OF(rx, struct netdev_rx_linux, up);
463 static void netdev_linux_update(struct netdev_linux *netdev,
464 const struct rtnetlink_link_change *)
465 OVS_REQUIRES(netdev->mutex);
466 static void netdev_linux_changed(struct netdev_linux *netdev,
467 unsigned int ifi_flags, unsigned int mask)
468 OVS_REQUIRES(netdev->mutex);
470 /* Returns a NETLINK_ROUTE socket listening for RTNLGRP_LINK changes, or NULL
471 * if no such socket could be created. */
472 static struct nl_sock *
473 netdev_linux_notify_sock(void)
/* One-time lazy initialization; 'sock' is created on the first call and
 * shared by all subsequent callers.  NOTE(review): the error checks between
 * the create/join/destroy calls are elided in this view -- presumably the
 * socket is destroyed and NULLed if joining the mcgroup fails; confirm
 * against the full file. */
475 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
476 static struct nl_sock *sock;
478 if (ovsthread_once_start(&once)) {
481 error = nl_sock_create(NETLINK_ROUTE, &sock);
483 error = nl_sock_join_mcgroup(sock, RTNLGRP_LINK);
485 nl_sock_destroy(sock);
489 ovsthread_once_done(&once);
496 netdev_linux_miimon_enabled(void)
500 atomic_read(&miimon_cnt, &miimon);
505 netdev_linux_run(void)
507 struct nl_sock *sock;
510 if (netdev_linux_miimon_enabled()) {
511 netdev_linux_miimon_run();
514 sock = netdev_linux_notify_sock();
520 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
521 uint64_t buf_stub[4096 / 8];
524 ofpbuf_use_stub(&buf, buf_stub, sizeof buf_stub);
525 error = nl_sock_recv(sock, &buf, false);
527 struct rtnetlink_link_change change;
529 if (rtnetlink_link_parse(&buf, &change)) {
530 struct netdev *netdev_ = netdev_from_name(change.ifname);
531 if (netdev_ && is_netdev_linux_class(netdev_->netdev_class)) {
532 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
534 ovs_mutex_lock(&netdev->mutex);
535 netdev_linux_update(netdev, &change);
536 ovs_mutex_unlock(&netdev->mutex);
538 netdev_close(netdev_);
540 } else if (error == ENOBUFS) {
541 struct shash device_shash;
542 struct shash_node *node;
546 shash_init(&device_shash);
547 netdev_get_devices(&netdev_linux_class, &device_shash);
548 SHASH_FOR_EACH (node, &device_shash) {
549 struct netdev *netdev_ = node->data;
550 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
553 ovs_mutex_lock(&netdev->mutex);
554 get_flags(netdev_, &flags);
555 netdev_linux_changed(netdev, flags, 0);
556 ovs_mutex_unlock(&netdev->mutex);
558 netdev_close(netdev_);
560 shash_destroy(&device_shash);
561 } else if (error != EAGAIN) {
562 VLOG_WARN_RL(&rl, "error reading or parsing netlink (%s)",
563 ovs_strerror(error));
570 netdev_linux_wait(void)
572 struct nl_sock *sock;
574 if (netdev_linux_miimon_enabled()) {
575 netdev_linux_miimon_wait();
577 sock = netdev_linux_notify_sock();
579 nl_sock_wait(sock, POLLIN);
/* Records that 'dev' changed: updates its interface flags, counts carrier
 * resets, and invalidates every cached field whose VALID_* bit is NOT set in
 * 'mask'.  Caller must hold dev->mutex (OVS_REQUIRES).
 * NOTE(review): lines elided in this view -- the body of the
 * '!dev->change_seq' branch (presumably change_seq maintenance) is not
 * visible; confirm against the full file. */
584 netdev_linux_changed(struct netdev_linux *dev,
585 unsigned int ifi_flags, unsigned int mask)
586 OVS_REQUIRES(dev->mutex)
589 if (!dev->change_seq) {
/* A toggle of IFF_RUNNING between the old and new flags means the link
 * bounced; count it as a carrier reset. */
593 if ((dev->ifi_flags ^ ifi_flags) & IFF_RUNNING) {
594 dev->carrier_resets++;
596 dev->ifi_flags = ifi_flags;
598 dev->cache_valid &= mask;
/* Applies a parsed rtnetlink link-change message to 'dev'.  For RTM_NEWLINK,
 * the cached MTU, Ethernet address, and ifindex are refreshed directly from
 * 'change' (marking them valid and clearing their cached errors) so they need
 * not be re-queried.  Any other message type invalidates the whole cache
 * (mask 0).  Caller must hold dev->mutex (OVS_REQUIRES).
 * NOTE(review): some guard lines (e.g. around the mtu update) are elided in
 * this view; confirm conditions against the full file. */
602 netdev_linux_update(struct netdev_linux *dev,
603 const struct rtnetlink_link_change *change)
604 OVS_REQUIRES(dev->mutex)
606 if (change->nlmsg_type == RTM_NEWLINK) {
/* Keep VALID_DRVINFO: driver info does not change on link events. */
608 netdev_linux_changed(dev, change->ifi_flags, VALID_DRVINFO);
610 /* Update netdev from rtnl-change msg. */
612 dev->mtu = change->mtu;
613 dev->cache_valid |= VALID_MTU;
614 dev->netdev_mtu_error = 0;
/* An all-zero address in the message means "not reported"; only cache a
 * real address. */
617 if (!eth_addr_is_zero(change->addr)) {
618 memcpy(dev->etheraddr, change->addr, ETH_ADDR_LEN);
619 dev->cache_valid |= VALID_ETHERADDR;
620 dev->ether_addr_error = 0;
623 dev->ifindex = change->ifi_index;
624 dev->cache_valid |= VALID_IFINDEX;
625 dev->get_ifindex_error = 0;
/* Not RTM_NEWLINK (e.g. link deleted): invalidate all cached state. */
628 netdev_linux_changed(dev, change->ifi_flags, 0);
632 static struct netdev *
633 netdev_linux_alloc(void)
635 struct netdev_linux *netdev = xzalloc(sizeof *netdev);
640 netdev_linux_common_construct(struct netdev_linux *netdev)
642 ovs_mutex_init(&netdev->mutex);
643 netdev->change_seq = 1;
646 /* Creates system and internal devices. */
648 netdev_linux_construct(struct netdev *netdev_)
650 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
653 netdev_linux_common_construct(netdev);
655 error = get_flags(&netdev->up, &netdev->ifi_flags);
656 if (error == ENODEV) {
657 if (netdev->up.netdev_class != &netdev_internal_class) {
658 /* The device does not exist, so don't allow it to be opened. */
661 /* "Internal" netdevs have to be created as netdev objects before
662 * they exist in the kernel, because creating them in the kernel
663 * happens by passing a netdev object to dpif_port_add().
664 * Therefore, ignore the error. */
671 /* For most types of netdevs we open the device for each call of
672 * netdev_open(). However, this is not the case with tap devices,
673 * since it is only possible to open the device once. In this
674 * situation we share a single file descriptor, and consequently
675 * buffers, across all readers. Therefore once data is read it will
676 * be unavailable to other reads for tap devices. */
678 netdev_linux_construct_tap(struct netdev *netdev_)
680 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
681 static const char tap_dev[] = "/dev/net/tun";
682 const char *name = netdev_->name;
686 netdev_linux_common_construct(netdev);
688 /* Open tap device. */
689 netdev->tap_fd = open(tap_dev, O_RDWR);
690 if (netdev->tap_fd < 0) {
692 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, ovs_strerror(error));
696 /* Create tap device. */
697 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
698 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
699 if (ioctl(netdev->tap_fd, TUNSETIFF, &ifr) == -1) {
700 VLOG_WARN("%s: creating tap device failed: %s", name,
701 ovs_strerror(errno));
706 /* Make non-blocking. */
707 error = set_nonblocking(netdev->tap_fd);
715 close(netdev->tap_fd);
720 netdev_linux_destruct(struct netdev *netdev_)
722 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
724 if (netdev->tc && netdev->tc->ops->tc_destroy) {
725 netdev->tc->ops->tc_destroy(netdev->tc);
728 if (netdev_get_class(netdev_) == &netdev_tap_class
729 && netdev->tap_fd >= 0)
731 close(netdev->tap_fd);
734 if (netdev->miimon_interval > 0) {
736 atomic_sub(&miimon_cnt, 1, &junk);
739 ovs_mutex_destroy(&netdev->mutex);
743 netdev_linux_dealloc(struct netdev *netdev_)
745 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
749 static struct netdev_rx *
750 netdev_linux_rx_alloc(void)
752 struct netdev_rx_linux *rx = xzalloc(sizeof *rx);
757 netdev_linux_rx_construct(struct netdev_rx *rx_)
759 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
760 struct netdev *netdev_ = rx->up.netdev;
761 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
764 ovs_mutex_lock(&netdev->mutex);
765 rx->is_tap = is_tap_netdev(netdev_);
767 rx->fd = netdev->tap_fd;
769 struct sockaddr_ll sll;
771 /* Result of tcpdump -dd inbound */
772 static const struct sock_filter filt[] = {
773 { 0x28, 0, 0, 0xfffff004 }, /* ldh [0] */
774 { 0x15, 0, 1, 0x00000004 }, /* jeq #4 jt 2 jf 3 */
775 { 0x6, 0, 0, 0x00000000 }, /* ret #0 */
776 { 0x6, 0, 0, 0x0000ffff } /* ret #65535 */
778 static const struct sock_fprog fprog = {
779 ARRAY_SIZE(filt), (struct sock_filter *) filt
782 /* Create file descriptor. */
783 rx->fd = socket(PF_PACKET, SOCK_RAW, 0);
786 VLOG_ERR("failed to create raw socket (%s)", ovs_strerror(error));
790 /* Set non-blocking mode. */
791 error = set_nonblocking(rx->fd);
796 /* Get ethernet device index. */
797 error = get_ifindex(&netdev->up, &ifindex);
802 /* Bind to specific ethernet device. */
803 memset(&sll, 0, sizeof sll);
804 sll.sll_family = AF_PACKET;
805 sll.sll_ifindex = ifindex;
806 sll.sll_protocol = (OVS_FORCE unsigned short int) htons(ETH_P_ALL);
807 if (bind(rx->fd, (struct sockaddr *) &sll, sizeof sll) < 0) {
809 VLOG_ERR("%s: failed to bind raw socket (%s)",
810 netdev_get_name(netdev_), ovs_strerror(error));
814 /* Filter for only inbound packets. */
815 error = setsockopt(rx->fd, SOL_SOCKET, SO_ATTACH_FILTER, &fprog,
819 VLOG_ERR("%s: failed to attach filter (%s)",
820 netdev_get_name(netdev_), ovs_strerror(error));
824 ovs_mutex_unlock(&netdev->mutex);
832 ovs_mutex_unlock(&netdev->mutex);
837 netdev_linux_rx_destruct(struct netdev_rx *rx_)
839 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
847 netdev_linux_rx_dealloc(struct netdev_rx *rx_)
849 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
855 netdev_linux_rx_recv(struct netdev_rx *rx_, void *data, size_t size)
857 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
862 ? read(rx->fd, data, size)
863 : recv(rx->fd, data, size, MSG_TRUNC));
864 } while (retval < 0 && errno == EINTR);
867 return retval > size ? -EMSGSIZE : retval;
869 if (errno != EAGAIN) {
870 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
871 ovs_strerror(errno), netdev_rx_get_name(rx_));
878 netdev_linux_rx_wait(struct netdev_rx *rx_)
880 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
881 poll_fd_wait(rx->fd, POLLIN);
885 netdev_linux_rx_drain(struct netdev_rx *rx_)
887 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
890 int error = af_inet_ifreq_ioctl(netdev_rx_get_name(rx_), &ifr,
891 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
895 drain_fd(rx->fd, ifr.ifr_qlen);
898 return drain_rcvbuf(rx->fd);
902 /* Sends 'buffer' on 'netdev'. Returns 0 if successful, otherwise a positive
903 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
904 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
905 * the packet is too big or too small to transmit on the device.
907 * The caller retains ownership of 'buffer' in all cases.
909 * The kernel maintains a packet transmission queue, so the caller is not
910 * expected to do additional queuing of packets. */
912 netdev_linux_send(struct netdev *netdev_, const void *data, size_t size)
917 if (!is_tap_netdev(netdev_)) {
918 /* Use our AF_PACKET socket to send to this device. */
919 struct sockaddr_ll sll;
925 sock = af_packet_sock();
930 ifindex = netdev_get_ifindex(netdev_);
935 /* We don't bother setting most fields in sockaddr_ll because the
936 * kernel ignores them for SOCK_RAW. */
937 memset(&sll, 0, sizeof sll);
938 sll.sll_family = AF_PACKET;
939 sll.sll_ifindex = ifindex;
941 iov.iov_base = CONST_CAST(void *, data);
945 msg.msg_namelen = sizeof sll;
948 msg.msg_control = NULL;
949 msg.msg_controllen = 0;
952 retval = sendmsg(sock, &msg, 0);
954 /* Use the tap fd to send to this device. This is essential for
955 * tap devices, because packets sent to a tap device with an
956 * AF_PACKET socket will loop back to be *received* again on the
957 * tap device. This doesn't occur on other interface types
958 * because we attach a socket filter to the rx socket. */
959 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
961 retval = write(netdev->tap_fd, data, size);
965 /* The Linux AF_PACKET implementation never blocks waiting for room
966 * for packets, instead returning ENOBUFS. Translate this into
967 * EAGAIN for the caller. */
968 if (errno == ENOBUFS) {
970 } else if (errno == EINTR) {
972 } else if (errno != EAGAIN) {
973 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
974 netdev_get_name(netdev_), ovs_strerror(errno));
977 } else if (retval != size) {
978 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%zd bytes of "
979 "%zu) on %s", retval, size, netdev_get_name(netdev_));
987 /* Registers with the poll loop to wake up from the next call to poll_block()
988 * when the packet transmission queue has sufficient room to transmit a packet
989 * with netdev_send().
991 * The kernel maintains a packet transmission queue, so the client is not
992 * expected to do additional queuing of packets. Thus, this function is
993 * unlikely to ever be used. It is included for completeness. */
995 netdev_linux_send_wait(struct netdev *netdev)
997 if (is_tap_netdev(netdev)) {
998 /* TAP device always accepts packets.*/
999 poll_immediate_wake();
1003 /* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
1004 * otherwise a positive errno value. */
1006 netdev_linux_set_etheraddr(struct netdev *netdev_,
1007 const uint8_t mac[ETH_ADDR_LEN])
1009 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1010 enum netdev_flags old_flags = 0;
1013 ovs_mutex_lock(&netdev->mutex);
1015 if (netdev->cache_valid & VALID_ETHERADDR) {
1016 error = netdev->ether_addr_error;
1017 if (error || eth_addr_equals(netdev->etheraddr, mac)) {
1020 netdev->cache_valid &= ~VALID_ETHERADDR;
1023 /* Tap devices must be brought down before setting the address. */
1024 if (is_tap_netdev(netdev_)) {
1025 update_flags(netdev, NETDEV_UP, 0, &old_flags);
1027 error = set_etheraddr(netdev_get_name(netdev_), mac);
1028 if (!error || error == ENODEV) {
1029 netdev->ether_addr_error = error;
1030 netdev->cache_valid |= VALID_ETHERADDR;
1032 memcpy(netdev->etheraddr, mac, ETH_ADDR_LEN);
1036 if (is_tap_netdev(netdev_) && old_flags & NETDEV_UP) {
1037 update_flags(netdev, 0, NETDEV_UP, &old_flags);
1041 ovs_mutex_unlock(&netdev->mutex);
1045 /* Copies 'netdev''s MAC address to 'mac' which is passed as param. */
1047 netdev_linux_get_etheraddr(const struct netdev *netdev_,
1048 uint8_t mac[ETH_ADDR_LEN])
1050 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1053 ovs_mutex_lock(&netdev->mutex);
1054 if (!(netdev->cache_valid & VALID_ETHERADDR)) {
1055 netdev->ether_addr_error = get_etheraddr(netdev_get_name(netdev_),
1057 netdev->cache_valid |= VALID_ETHERADDR;
1060 error = netdev->ether_addr_error;
1062 memcpy(mac, netdev->etheraddr, ETH_ADDR_LEN);
1064 ovs_mutex_unlock(&netdev->mutex);
/* Fetches 'netdev''s MTU into '*mtup', using the cached value when
 * VALID_MTU is set and otherwise querying the kernel with SIOCGIFMTU.
 * Both the MTU and the ioctl's error code are cached; the cached error is
 * what this function returns (0 on success).
 * NOTE(review): the 'struct ifreq ifr' declaration, braces, and the final
 * error check before '*mtup' is assigned are elided in this view. */
1070 netdev_linux_get_mtu__(struct netdev_linux *netdev, int *mtup)
1074 if (!(netdev->cache_valid & VALID_MTU)) {
1077 netdev->netdev_mtu_error = af_inet_ifreq_ioctl(
1078 netdev_get_name(&netdev->up), &ifr, SIOCGIFMTU, "SIOCGIFMTU");
1079 netdev->mtu = ifr.ifr_mtu;
1080 netdev->cache_valid |= VALID_MTU;
1083 error = netdev->netdev_mtu_error;
1085 *mtup = netdev->mtu;
1091 /* Returns the maximum size of transmitted (and received) packets on 'netdev',
1092 * in bytes, not including the hardware header; thus, this is typically 1500
1093 * bytes for Ethernet devices. */
1095 netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
1097 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1100 ovs_mutex_lock(&netdev->mutex);
1101 error = netdev_linux_get_mtu__(netdev, mtup);
1102 ovs_mutex_unlock(&netdev->mutex);
1107 /* Sets the maximum size of transmitted (MTU) for given device using linux
1108 * networking ioctl interface.
1111 netdev_linux_set_mtu(const struct netdev *netdev_, int mtu)
1113 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1117 ovs_mutex_lock(&netdev->mutex);
1118 if (netdev->cache_valid & VALID_MTU) {
1119 error = netdev->netdev_mtu_error;
1120 if (error || netdev->mtu == mtu) {
1123 netdev->cache_valid &= ~VALID_MTU;
1126 error = af_inet_ifreq_ioctl(netdev_get_name(netdev_), &ifr,
1127 SIOCSIFMTU, "SIOCSIFMTU");
1128 if (!error || error == ENODEV) {
1129 netdev->netdev_mtu_error = error;
1130 netdev->mtu = ifr.ifr_mtu;
1131 netdev->cache_valid |= VALID_MTU;
1134 ovs_mutex_unlock(&netdev->mutex);
1138 /* Returns the ifindex of 'netdev', if successful, as a positive number.
1139 * On failure, returns a negative errno value. */
/* NOTE(review): the 'int ifindex;' and 'int error;' declarations and the
 * function braces are elided in this view.  get_ifindex() is declared above
 * to return 0 or a positive errno, hence the negation on return. */
1141 netdev_linux_get_ifindex(const struct netdev *netdev_)
1143 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1146 ovs_mutex_lock(&netdev->mutex);
1147 error = get_ifindex(netdev_, &ifindex);
1148 ovs_mutex_unlock(&netdev->mutex);
1150 return error ? -error : ifindex;
1154 netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
1156 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1158 ovs_mutex_lock(&netdev->mutex);
1159 if (netdev->miimon_interval > 0) {
1160 *carrier = netdev->miimon;
1162 *carrier = (netdev->ifi_flags & IFF_RUNNING) != 0;
1164 ovs_mutex_unlock(&netdev->mutex);
/* Returns the number of carrier state changes observed on 'netdev_'. */
1169 static long long int
1170 netdev_linux_get_carrier_resets(const struct netdev *netdev_)
1172 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1173 long long int carrier_resets;
1175 ovs_mutex_lock(&netdev->mutex);
1176 carrier_resets = netdev->carrier_resets;
1177 ovs_mutex_unlock(&netdev->mutex);
1179 return carrier_resets;
/* Issues MII ioctl 'cmd' ('cmd_name' is its name for logging) on device
 * 'name', copying '*data' in and the kernel's reply back out. */
1183 netdev_linux_do_miimon(const char *name, int cmd, const char *cmd_name,
1184 struct mii_ioctl_data *data)
1189 memset(&ifr, 0, sizeof ifr);
/* NOTE(review): the mii_ioctl_data struct is copied over ifr.ifr_data
 * (nominally a pointer) rather than pointed to by it -- this relies on
 * the kernel's in-ifreq MII ABI; looks intentional but confirm the
 * struct fits within the ifreq union. */
1190 memcpy(&ifr.ifr_data, data, sizeof *data);
1191 error = af_inet_ifreq_ioctl(name, &ifr, cmd, cmd_name);
1192 memcpy(data, &ifr.ifr_data, sizeof *data);
/* Queries link status for device 'name' into '*miimon', first via MII
 * registers (SIOCGMIIPHY + SIOCGMIIREG), falling back to ethtool
 * ETHTOOL_GLINK when the MII ioctls are unsupported. */
1198 netdev_linux_get_miimon(const char *name, bool *miimon)
1200 struct mii_ioctl_data data;
1205 memset(&data, 0, sizeof data);
1206 error = netdev_linux_do_miimon(name, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
1208 /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
1209 data.reg_num = MII_BMSR;
1210 error = netdev_linux_do_miimon(name, SIOCGMIIREG, "SIOCGMIIREG",
/* BMSR_LSTATUS is the "link up" bit of the MII basic status register. */
1214 *miimon = !!(data.val_out & BMSR_LSTATUS);
1216 VLOG_WARN_RL(&rl, "%s: failed to query MII", name);
1219 struct ethtool_cmd ecmd;
1221 VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
1224 COVERAGE_INC(netdev_get_ethtool);
1225 memset(&ecmd, 0, sizeof ecmd);
1226 error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
1229 struct ethtool_value eval;
/* ETHTOOL_GLINK answers in an ethtool_value layered over ecmd. */
1231 memcpy(&eval, &ecmd, sizeof eval);
1232 *miimon = !!eval.data;
1234 VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
/* Enables (interval > 0, clamped to >= 100 ms) or disables (interval <= 0)
 * periodic MII link polling for 'netdev_'. */
1242 netdev_linux_set_miimon_interval(struct netdev *netdev_,
1243 long long int interval)
1245 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1247 ovs_mutex_lock(&netdev->mutex);
/* Enforce a 100 ms floor so polling cannot be configured too aggressive. */
1248 interval = interval > 0 ? MAX(interval, 100) : 0;
1249 if (netdev->miimon_interval != interval) {
/* miimon_cnt is a global count of devices with miimon enabled; keep it
 * in sync on enable/disable transitions. */
1252 if (interval && !netdev->miimon_interval) {
1253 atomic_add(&miimon_cnt, 1, &junk);
1254 } else if (!interval && netdev->miimon_interval) {
1255 atomic_sub(&miimon_cnt, 1, &junk);
1258 netdev->miimon_interval = interval;
/* Force an immediate poll on the next netdev_linux_miimon_run(). */
1259 timer_set_expired(&netdev->miimon_timer);
1261 ovs_mutex_unlock(&netdev->mutex);
/* Polls MII link status on every netdev-linux device whose miimon timer
 * has expired, recording changes so waiters are notified. */
1267 netdev_linux_miimon_run(void)
1269 struct shash device_shash;
1270 struct shash_node *node;
1272 shash_init(&device_shash);
1273 netdev_get_devices(&netdev_linux_class, &device_shash);
1274 SHASH_FOR_EACH (node, &device_shash) {
1275 struct netdev *netdev = node->data;
1276 struct netdev_linux *dev = netdev_linux_cast(netdev);
1279 ovs_mutex_lock(&dev->mutex);
1280 if (dev->miimon_interval > 0 && timer_expired(&dev->miimon_timer)) {
1281 netdev_linux_get_miimon(dev->up.name, &miimon);
1282 if (miimon != dev->miimon) {
1283 dev->miimon = miimon;
/* Bumps the device's change sequence so pollers wake up. */
1284 netdev_linux_changed(dev, dev->ifi_flags, 0);
1287 timer_set_duration(&dev->miimon_timer, dev->miimon_interval);
1289 ovs_mutex_unlock(&dev->mutex);
/* netdev_get_devices() took a reference on each device; drop it. */
1290 netdev_close(netdev);
1293 shash_destroy(&device_shash);
/* Arranges for the poll loop to wake when any device's miimon timer is
 * due, pairing with netdev_linux_miimon_run(). */
1297 netdev_linux_miimon_wait(void)
1299 struct shash device_shash;
1300 struct shash_node *node;
1302 shash_init(&device_shash);
1303 netdev_get_devices(&netdev_linux_class, &device_shash);
1304 SHASH_FOR_EACH (node, &device_shash) {
1305 struct netdev *netdev = node->data;
1306 struct netdev_linux *dev = netdev_linux_cast(netdev);
1308 ovs_mutex_lock(&dev->mutex);
1309 if (dev->miimon_interval > 0) {
1310 timer_wait(&dev->miimon_timer);
1312 ovs_mutex_unlock(&dev->mutex);
/* Release the reference taken by netdev_get_devices(). */
1313 netdev_close(netdev);
1315 shash_destroy(&device_shash);
1318 /* Check whether we can use RTM_GETLINK to get network device statistics.
1319 * In pre-2.6.19 kernels, this was only available if wireless extensions were
1322 check_for_working_netlink_stats(void)
1324 /* Decide on the netdev_get_stats() implementation to use. Netlink is
1325 * preferable, so if that works, we'll use it. */
/* The probe is performed once against "lo", which always exists. */
1326 int ifindex = do_get_ifindex("lo");
1328 VLOG_WARN("failed to get ifindex for lo, "
1329 "obtaining netdev stats from proc");
1332 struct netdev_stats stats;
1333 int error = get_stats_via_netlink(ifindex, &stats);
1335 VLOG_DBG("obtaining netdev stats via rtnetlink");
1338 VLOG_INFO("RTM_GETLINK failed (%s), obtaining netdev stats "
1339 "via proc (you are probably running a pre-2.6.19 "
1340 "kernel)", ovs_strerror(error));
/* Exchanges the values stored in '*a' and '*b'. */
static void
swap_uint64(uint64_t *a, uint64_t *b)
{
    uint64_t tmp = *a;

    *a = *b;
    *b = tmp;
}
1354 /* Copies 'src' into 'dst', performing format conversion in the process.
1356 * 'src' is allowed to be misaligned. */
1358 netdev_stats_from_ovs_vport_stats(struct netdev_stats *dst,
1359 const struct ovs_vport_stats *src)
/* get_unaligned_u64() makes the reads safe even when 'src' comes straight
 * out of a Netlink attribute with no alignment guarantee. */
1361 dst->rx_packets = get_unaligned_u64(&src->rx_packets);
1362 dst->tx_packets = get_unaligned_u64(&src->tx_packets);
1363 dst->rx_bytes = get_unaligned_u64(&src->rx_bytes);
1364 dst->tx_bytes = get_unaligned_u64(&src->tx_bytes);
1365 dst->rx_errors = get_unaligned_u64(&src->rx_errors);
1366 dst->tx_errors = get_unaligned_u64(&src->tx_errors);
1367 dst->rx_dropped = get_unaligned_u64(&src->rx_dropped);
1368 dst->tx_dropped = get_unaligned_u64(&src->tx_dropped);
/* The vport layer does not track the finer-grained error counters, so
 * they are reported as zero rather than left uninitialized. */
1370 dst->collisions = 0;
1371 dst->rx_length_errors = 0;
1372 dst->rx_over_errors = 0;
1373 dst->rx_crc_errors = 0;
1374 dst->rx_frame_errors = 0;
1375 dst->rx_fifo_errors = 0;
1376 dst->rx_missed_errors = 0;
1377 dst->tx_aborted_errors = 0;
1378 dst->tx_carrier_errors = 0;
1379 dst->tx_fifo_errors = 0;
1380 dst->tx_heartbeat_errors = 0;
1381 dst->tx_window_errors = 0;
/* Fetches 'netdev''s stats from the kernel vport layer into '*stats'.
 * Returns 0 on success, otherwise a positive errno value. */
1385 get_stats_via_vport__(const struct netdev *netdev, struct netdev_stats *stats)
1387 struct dpif_linux_vport reply;
1391 error = dpif_linux_vport_get(netdev_get_name(netdev), &reply, &buf);
/* A reply without a stats attribute means the vport exists but did not
 * report statistics. */
1394 } else if (!reply.stats) {
1399 netdev_stats_from_ovs_vport_stats(stats, reply.stats);
/* Attempts to refresh '*stats' from the vport layer, caching the outcome in
 * netdev->vport_stats_error so persistent failures are not retried forever. */
1407 get_stats_via_vport(const struct netdev *netdev_,
1408 struct netdev_stats *stats)
1410 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
/* Retry only if the last attempt succeeded or no attempt is cached yet. */
1412 if (!netdev->vport_stats_error ||
1413 !(netdev->cache_valid & VALID_VPORT_STAT_ERROR)) {
1416 error = get_stats_via_vport__(netdev_, stats);
/* ENOENT just means the device is not an OVS vport -- not worth a log. */
1417 if (error && error != ENOENT) {
1418 VLOG_WARN_RL(&rl, "%s: obtaining netdev stats via vport failed "
1420 netdev_get_name(netdev_), ovs_strerror(error));
1422 netdev->vport_stats_error = error;
1423 netdev->cache_valid |= VALID_VPORT_STAT_ERROR;
/* Reads kernel-side interface statistics for 'netdev_' into '*stats',
 * via rtnetlink when available, otherwise /proc. */
1428 netdev_linux_sys_get_stats(const struct netdev *netdev_,
1429 struct netdev_stats *stats)
1431 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
/* Probed once per process; result shared by all devices. */
1432 static int use_netlink_stats;
1435 if (ovsthread_once_start(&once)) {
1436 use_netlink_stats = check_for_working_netlink_stats();
1437 ovsthread_once_done(&once);
1440 if (use_netlink_stats) {
1443 error = get_ifindex(netdev_, &ifindex);
1445 error = get_stats_via_netlink(ifindex, stats);
1448 error = get_stats_via_proc(netdev_get_name(netdev_), stats);
1452 VLOG_WARN_RL(&rl, "%s: linux-sys get stats failed %d",
1453 netdev_get_name(netdev_), error);
1459 /* Retrieves current device stats for 'netdev-linux'. */
1461 netdev_linux_get_stats(const struct netdev *netdev_,
1462 struct netdev_stats *stats)
1464 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1465 struct netdev_stats dev_stats;
1468 ovs_mutex_lock(&netdev->mutex);
/* Primary source: OVS vport stats (fills '*stats' and caches any error). */
1469 get_stats_via_vport(netdev_, stats);
/* Secondary source: kernel interface stats, merged in below. */
1470 error = netdev_linux_sys_get_stats(netdev_, &dev_stats);
/* NOTE(review): branch structure here depends on elided lines; presumably
 * the first test runs when the sys-stats read failed but vport stats are
 * good -- confirm against the unabridged source. */
1472 if (!netdev->vport_stats_error) {
1475 } else if (netdev->vport_stats_error) {
1476 /* stats not available from OVS then use ioctl stats. */
/* Both sources available: fold the kernel error counters (which the
 * vport layer does not track) into the vport numbers. */
1479 stats->rx_errors += dev_stats.rx_errors;
1480 stats->tx_errors += dev_stats.tx_errors;
1481 stats->rx_dropped += dev_stats.rx_dropped;
1482 stats->tx_dropped += dev_stats.tx_dropped;
1483 stats->multicast += dev_stats.multicast;
1484 stats->collisions += dev_stats.collisions;
1485 stats->rx_length_errors += dev_stats.rx_length_errors;
1486 stats->rx_over_errors += dev_stats.rx_over_errors;
1487 stats->rx_crc_errors += dev_stats.rx_crc_errors;
1488 stats->rx_frame_errors += dev_stats.rx_frame_errors;
1489 stats->rx_fifo_errors += dev_stats.rx_fifo_errors;
1490 stats->rx_missed_errors += dev_stats.rx_missed_errors;
1491 stats->tx_aborted_errors += dev_stats.tx_aborted_errors;
1492 stats->tx_carrier_errors += dev_stats.tx_carrier_errors;
1493 stats->tx_fifo_errors += dev_stats.tx_fifo_errors;
1494 stats->tx_heartbeat_errors += dev_stats.tx_heartbeat_errors;
1495 stats->tx_window_errors += dev_stats.tx_window_errors;
1497 ovs_mutex_unlock(&netdev->mutex);
1502 /* Retrieves current device stats for 'netdev-tap' netdev or
1503 * netdev-internal. */
1505 netdev_tap_get_stats(const struct netdev *netdev_, struct netdev_stats *stats)
1507 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1508 struct netdev_stats dev_stats;
1511 ovs_mutex_lock(&netdev->mutex);
/* Same two-source scheme as netdev_linux_get_stats(): vport first, then
 * merge/substitute the kernel interface counters. */
1512 get_stats_via_vport(netdev_, stats);
1513 error = netdev_linux_sys_get_stats(netdev_, &dev_stats);
1515 if (!netdev->vport_stats_error) {
1518 } else if (netdev->vport_stats_error) {
1519 /* Transmit and receive stats will appear to be swapped relative to the
1520 * other ports since we are the one sending the data, not a remote
1521 * computer. For consistency, we swap them back here. This does not
1522 * apply if we are getting stats from the vport layer because it always
1523 * tracks stats from the perspective of the switch. */
1526 swap_uint64(&stats->rx_packets, &stats->tx_packets);
1527 swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
1528 swap_uint64(&stats->rx_errors, &stats->tx_errors);
1529 swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
/* The fine-grained error counters are meaningless for a tap device
 * after the swap, so zero them rather than report swapped values. */
1530 stats->rx_length_errors = 0;
1531 stats->rx_over_errors = 0;
1532 stats->rx_crc_errors = 0;
1533 stats->rx_frame_errors = 0;
1534 stats->rx_fifo_errors = 0;
1535 stats->rx_missed_errors = 0;
1536 stats->tx_aborted_errors = 0;
1537 stats->tx_carrier_errors = 0;
1538 stats->tx_fifo_errors = 0;
1539 stats->tx_heartbeat_errors = 0;
1540 stats->tx_window_errors = 0;
/* When merging with vport stats, add the kernel counters with rx/tx
 * exchanged for the same perspective reason as above. */
1542 stats->rx_dropped += dev_stats.tx_dropped;
1543 stats->tx_dropped += dev_stats.rx_dropped;
1545 stats->rx_errors += dev_stats.tx_errors;
1546 stats->tx_errors += dev_stats.rx_errors;
1548 stats->multicast += dev_stats.multicast;
1549 stats->collisions += dev_stats.collisions;
1551 ovs_mutex_unlock(&netdev->mutex);
/* Retrieves stats for an internal device purely from the vport layer;
 * returns the cached vport error if that source is unavailable. */
1557 netdev_internal_get_stats(const struct netdev *netdev_,
1558 struct netdev_stats *stats)
1560 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1563 ovs_mutex_lock(&netdev->mutex);
1564 get_stats_via_vport(netdev_, stats);
1565 error = netdev->vport_stats_error;
1566 ovs_mutex_unlock(&netdev->mutex);
/* Pushes '*stats' down into the kernel vport layer for 'netdev' via an
 * OVS_VPORT_CMD_SET transaction. */
1572 netdev_internal_set_stats(struct netdev *netdev,
1573 const struct netdev_stats *stats)
1575 struct ovs_vport_stats vport_stats;
1576 struct dpif_linux_vport vport;
/* Narrow the netdev_stats superset down to the vport-stats fields. */
1579 vport_stats.rx_packets = stats->rx_packets;
1580 vport_stats.tx_packets = stats->tx_packets;
1581 vport_stats.rx_bytes = stats->rx_bytes;
1582 vport_stats.tx_bytes = stats->tx_bytes;
1583 vport_stats.rx_errors = stats->rx_errors;
1584 vport_stats.tx_errors = stats->tx_errors;
1585 vport_stats.rx_dropped = stats->rx_dropped;
1586 vport_stats.tx_dropped = stats->tx_dropped;
1588 dpif_linux_vport_init(&vport);
1589 vport.cmd = OVS_VPORT_CMD_SET;
1590 vport.name = netdev_get_name(netdev);
1591 vport.stats = &vport_stats;
1593 err = dpif_linux_vport_transact(&vport, NULL, NULL);
1595 /* If the vport layer doesn't know about the device, that doesn't mean it
1596 * doesn't exist (after all were able to open it when netdev_open() was
1597 * called), it just means that it isn't attached and we'll be getting
1598 * stats a different way. */
1599 if (err == ENODEV) {
/* Populates netdev->supported, ->advertised and ->current from an ethtool
 * ETHTOOL_GSET query, caching the result (and any error) under
 * VALID_FEATURES so the ioctl runs at most once per invalidation. */
1607 netdev_linux_read_features(struct netdev_linux *netdev)
1609 struct ethtool_cmd ecmd;
/* Already cached: nothing to do. */
1613 if (netdev->cache_valid & VALID_FEATURES) {
1617 COVERAGE_INC(netdev_get_ethtool);
1618 memset(&ecmd, 0, sizeof ecmd);
1619 error = netdev_linux_do_ethtool(netdev->up.name, &ecmd,
1620 ETHTOOL_GSET, "ETHTOOL_GSET");
/* Translate each SUPPORTED_* ethtool bit into its NETDEV_F_* twin. */
1625 /* Supported features. */
1626 netdev->supported = 0;
1627 if (ecmd.supported & SUPPORTED_10baseT_Half) {
1628 netdev->supported |= NETDEV_F_10MB_HD;
1630 if (ecmd.supported & SUPPORTED_10baseT_Full) {
1631 netdev->supported |= NETDEV_F_10MB_FD;
1633 if (ecmd.supported & SUPPORTED_100baseT_Half) {
1634 netdev->supported |= NETDEV_F_100MB_HD;
1636 if (ecmd.supported & SUPPORTED_100baseT_Full) {
1637 netdev->supported |= NETDEV_F_100MB_FD;
1639 if (ecmd.supported & SUPPORTED_1000baseT_Half) {
1640 netdev->supported |= NETDEV_F_1GB_HD;
1642 if (ecmd.supported & SUPPORTED_1000baseT_Full) {
1643 netdev->supported |= NETDEV_F_1GB_FD;
1645 if (ecmd.supported & SUPPORTED_10000baseT_Full) {
1646 netdev->supported |= NETDEV_F_10GB_FD;
1648 if (ecmd.supported & SUPPORTED_TP) {
1649 netdev->supported |= NETDEV_F_COPPER;
1651 if (ecmd.supported & SUPPORTED_FIBRE) {
1652 netdev->supported |= NETDEV_F_FIBER;
1654 if (ecmd.supported & SUPPORTED_Autoneg) {
1655 netdev->supported |= NETDEV_F_AUTONEG;
1657 if (ecmd.supported & SUPPORTED_Pause) {
1658 netdev->supported |= NETDEV_F_PAUSE;
1660 if (ecmd.supported & SUPPORTED_Asym_Pause) {
1661 netdev->supported |= NETDEV_F_PAUSE_ASYM;
/* Same mapping again for the ADVERTISED_* bits. */
1664 /* Advertised features. */
1665 netdev->advertised = 0;
1666 if (ecmd.advertising & ADVERTISED_10baseT_Half) {
1667 netdev->advertised |= NETDEV_F_10MB_HD;
1669 if (ecmd.advertising & ADVERTISED_10baseT_Full) {
1670 netdev->advertised |= NETDEV_F_10MB_FD;
1672 if (ecmd.advertising & ADVERTISED_100baseT_Half) {
1673 netdev->advertised |= NETDEV_F_100MB_HD;
1675 if (ecmd.advertising & ADVERTISED_100baseT_Full) {
1676 netdev->advertised |= NETDEV_F_100MB_FD;
1678 if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
1679 netdev->advertised |= NETDEV_F_1GB_HD;
1681 if (ecmd.advertising & ADVERTISED_1000baseT_Full) {
1682 netdev->advertised |= NETDEV_F_1GB_FD;
1684 if (ecmd.advertising & ADVERTISED_10000baseT_Full) {
1685 netdev->advertised |= NETDEV_F_10GB_FD;
1687 if (ecmd.advertising & ADVERTISED_TP) {
1688 netdev->advertised |= NETDEV_F_COPPER;
1690 if (ecmd.advertising & ADVERTISED_FIBRE) {
1691 netdev->advertised |= NETDEV_F_FIBER;
1693 if (ecmd.advertising & ADVERTISED_Autoneg) {
1694 netdev->advertised |= NETDEV_F_AUTONEG;
1696 if (ecmd.advertising & ADVERTISED_Pause) {
1697 netdev->advertised |= NETDEV_F_PAUSE;
1699 if (ecmd.advertising & ADVERTISED_Asym_Pause) {
1700 netdev->advertised |= NETDEV_F_PAUSE_ASYM;
1703 /* Current settings. */
1705 if (speed == SPEED_10) {
1706 netdev->current = ecmd.duplex ? NETDEV_F_10MB_FD : NETDEV_F_10MB_HD;
1707 } else if (speed == SPEED_100) {
1708 netdev->current = ecmd.duplex ? NETDEV_F_100MB_FD : NETDEV_F_100MB_HD;
1709 } else if (speed == SPEED_1000) {
1710 netdev->current = ecmd.duplex ? NETDEV_F_1GB_FD : NETDEV_F_1GB_HD;
1711 } else if (speed == SPEED_10000) {
1712 netdev->current = NETDEV_F_10GB_FD;
/* 40000/100000/1000000 are literal Mb/s values; presumably the SPEED_*
 * macros for these rates were not yet defined in the oldest supported
 * kernel headers -- confirm before replacing with named constants. */
1713 } else if (speed == 40000) {
1714 netdev->current = NETDEV_F_40GB_FD;
1715 } else if (speed == 100000) {
1716 netdev->current = NETDEV_F_100GB_FD;
1717 } else if (speed == 1000000) {
1718 netdev->current = NETDEV_F_1TB_FD;
1720 netdev->current = 0;
1723 if (ecmd.port == PORT_TP) {
1724 netdev->current |= NETDEV_F_COPPER;
1725 } else if (ecmd.port == PORT_FIBRE) {
1726 netdev->current |= NETDEV_F_FIBER;
1730 netdev->current |= NETDEV_F_AUTONEG;
/* Cache the outcome, including the error code, for later readers. */
1734 netdev->cache_valid |= VALID_FEATURES;
1735 netdev->get_features_error = error;
1738 /* Stores the features supported by 'netdev' into '*current', '*advertised',
1739 * '*supported', and '*peer'. Each value is a bitmap of NETDEV_* bits.
1740 * Returns 0 if successful, otherwise a positive errno value. */
1742 netdev_linux_get_features(const struct netdev *netdev_,
1743 enum netdev_features *current,
1744 enum netdev_features *advertised,
1745 enum netdev_features *supported,
1746 enum netdev_features *peer)
1748 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1751 ovs_mutex_lock(&netdev->mutex);
/* Ensures the ethtool-derived feature cache is populated. */
1752 netdev_linux_read_features(netdev);
1753 if (!netdev->get_features_error) {
1754 *current = netdev->current;
1755 *advertised = netdev->advertised;
1756 *supported = netdev->supported;
1757 *peer = 0; /* XXX */
1759 error = netdev->get_features_error;
1760 ovs_mutex_unlock(&netdev->mutex);
1765 /* Set the features advertised by 'netdev' to 'advertise'. */
1767 netdev_linux_set_advertisements(struct netdev *netdev_,
1768 enum netdev_features advertise)
1770 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1771 struct ethtool_cmd ecmd;
1774 ovs_mutex_lock(&netdev->mutex);
/* Read-modify-write: fetch current ethtool settings first so only the
 * advertising mask changes in the subsequent ETHTOOL_SSET. */
1776 COVERAGE_INC(netdev_get_ethtool);
1777 memset(&ecmd, 0, sizeof ecmd);
1778 error = netdev_linux_do_ethtool(netdev_get_name(netdev_), &ecmd,
1779 ETHTOOL_GSET, "ETHTOOL_GSET");
/* Rebuild ecmd.advertising from the NETDEV_F_* request bitmap. */
1784 ecmd.advertising = 0;
1785 if (advertise & NETDEV_F_10MB_HD) {
1786 ecmd.advertising |= ADVERTISED_10baseT_Half;
1788 if (advertise & NETDEV_F_10MB_FD) {
1789 ecmd.advertising |= ADVERTISED_10baseT_Full;
1791 if (advertise & NETDEV_F_100MB_HD) {
1792 ecmd.advertising |= ADVERTISED_100baseT_Half;
1794 if (advertise & NETDEV_F_100MB_FD) {
1795 ecmd.advertising |= ADVERTISED_100baseT_Full;
1797 if (advertise & NETDEV_F_1GB_HD) {
1798 ecmd.advertising |= ADVERTISED_1000baseT_Half;
1800 if (advertise & NETDEV_F_1GB_FD) {
1801 ecmd.advertising |= ADVERTISED_1000baseT_Full;
1803 if (advertise & NETDEV_F_10GB_FD) {
1804 ecmd.advertising |= ADVERTISED_10000baseT_Full;
1806 if (advertise & NETDEV_F_COPPER) {
1807 ecmd.advertising |= ADVERTISED_TP;
1809 if (advertise & NETDEV_F_FIBER) {
1810 ecmd.advertising |= ADVERTISED_FIBRE;
1812 if (advertise & NETDEV_F_AUTONEG) {
1813 ecmd.advertising |= ADVERTISED_Autoneg;
1815 if (advertise & NETDEV_F_PAUSE) {
1816 ecmd.advertising |= ADVERTISED_Pause;
1818 if (advertise & NETDEV_F_PAUSE_ASYM) {
1819 ecmd.advertising |= ADVERTISED_Asym_Pause;
1821 COVERAGE_INC(netdev_set_ethtool);
1822 error = netdev_linux_do_ethtool(netdev_get_name(netdev_), &ecmd,
1823 ETHTOOL_SSET, "ETHTOOL_SSET");
1826 ovs_mutex_unlock(&netdev->mutex);
1830 /* Attempts to set input rate limiting (policing) policy. Returns 0 if
1831 * successful, otherwise a positive errno value. */
1833 netdev_linux_set_policing(struct netdev *netdev_,
1834 uint32_t kbits_rate, uint32_t kbits_burst)
1836 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1837 const char *netdev_name = netdev_get_name(netdev_);
1840 kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
1841 : !kbits_burst ? 1000 /* Default to 1000 kbits if 0. */
1842 : kbits_burst); /* Stick with user-specified value. */
1844 ovs_mutex_lock(&netdev->mutex);
/* Skip the tc round-trips when the cached settings already match. */
1845 if (netdev->cache_valid & VALID_POLICING) {
1846 error = netdev->netdev_policing_error;
1847 if (error || (netdev->kbits_rate == kbits_rate &&
1848 netdev->kbits_burst == kbits_burst)) {
1849 /* Assume that settings haven't changed since we last set them. */
1852 netdev->cache_valid &= ~VALID_POLICING;
1855 COVERAGE_INC(netdev_set_policing);
1856 /* Remove any existing ingress qdisc. */
1857 error = tc_add_del_ingress_qdisc(netdev_, false);
1859 VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
1860 netdev_name, ovs_strerror(error));
/* With a nonzero rate, install a fresh ingress qdisc plus a policer. */
1865 error = tc_add_del_ingress_qdisc(netdev_, true);
1867 VLOG_WARN_RL(&rl, "%s: adding policing qdisc failed: %s",
1868 netdev_name, ovs_strerror(error));
1872 error = tc_add_policer(netdev_, kbits_rate, kbits_burst);
1874 VLOG_WARN_RL(&rl, "%s: adding policing action failed: %s",
1875 netdev_name, ovs_strerror(error));
1880 netdev->kbits_rate = kbits_rate;
1881 netdev->kbits_burst = kbits_burst;
/* Cache success and ENODEV; other errors will be retried next call. */
1884 if (!error || error == ENODEV) {
1885 netdev->netdev_policing_error = error;
1886 netdev->cache_valid |= VALID_POLICING;
1888 ovs_mutex_unlock(&netdev->mutex);
/* Adds the OVS-visible name of every installable QoS implementation in the
 * 'tcs' table to 'types'. */
1893 netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
1896 const struct tc_ops *const *opsp;
1898 for (opsp = tcs; *opsp != NULL; opsp++) {
1899 const struct tc_ops *ops = *opsp;
/* An empty ovs_name marks an internal-only implementation. */
1900 if (ops->tc_install && ops->ovs_name[0] != '\0') {
1901 sset_add(types, ops->ovs_name);
/* Returns the tc_ops whose OVS name is 'name', searching the NULL-terminated
 * 'tcs' table. */
1907 static const struct tc_ops *
1908 tc_lookup_ovs_name(const char *name)
1910 const struct tc_ops *const *opsp;
1912 for (opsp = tcs; *opsp != NULL; opsp++) {
1913 const struct tc_ops *ops = *opsp;
1914 if (!strcmp(name, ops->ovs_name)) {
/* Returns the tc_ops whose Linux qdisc name is 'name'.  Entries with a null
 * linux_name (OVS-internal pseudo-qdiscs) are skipped. */
1921 static const struct tc_ops *
1922 tc_lookup_linux_name(const char *name)
1924 const struct tc_ops *const *opsp;
1926 for (opsp = tcs; *opsp != NULL; opsp++) {
1927 const struct tc_ops *ops = *opsp;
1928 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
/* Looks up queue 'queue_id' in 'netdev_''s tc queue hmap, given the
 * precomputed 'hash' of the id. */
1935 static struct tc_queue *
1936 tc_find_queue__(const struct netdev *netdev_, unsigned int queue_id,
1939 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1940 struct tc_queue *queue;
1942 HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev->tc->queues) {
1943 if (queue->queue_id == queue_id) {
/* Convenience wrapper around tc_find_queue__() that hashes 'queue_id'. */
1950 static struct tc_queue *
1951 tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
1953 return tc_find_queue__(netdev, queue_id, hash_int(queue_id, 0));
/* Reports how many queues QoS implementation 'type' supports in '*caps'. */
1957 netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
1959 struct netdev_qos_capabilities *caps)
1961 const struct tc_ops *ops = tc_lookup_ovs_name(type);
1965 caps->n_queues = ops->n_queues;
/* Queries the qdisc currently installed on 'netdev_', storing its OVS type
 * name in '*typep' and its configuration in 'details'. */
1970 netdev_linux_get_qos(const struct netdev *netdev_,
1971 const char **typep, struct smap *details)
1973 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1976 ovs_mutex_lock(&netdev->mutex);
/* Populates netdev->tc from the kernel if not already cached. */
1977 error = tc_query_qdisc(netdev_);
1979 *typep = netdev->tc->ops->ovs_name;
/* qdisc_get is optional; an implementation without one has no details. */
1980 error = (netdev->tc->ops->qdisc_get
1981 ? netdev->tc->ops->qdisc_get(netdev_, details)
1984 ovs_mutex_unlock(&netdev->mutex);
/* Replaces (or reconfigures in place) the qdisc on 'netdev_' with QoS
 * implementation 'type' configured from 'details'. */
1990 netdev_linux_set_qos(struct netdev *netdev_,
1991 const char *type, const struct smap *details)
1993 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1994 const struct tc_ops *new_ops;
1997 new_ops = tc_lookup_ovs_name(type);
/* Reject unknown types and types that cannot be installed from OVS. */
1998 if (!new_ops || !new_ops->tc_install) {
2002 ovs_mutex_lock(&netdev->mutex);
2003 error = tc_query_qdisc(netdev_);
/* Same implementation already installed: just reconfigure it. */
2008 if (new_ops == netdev->tc->ops) {
2009 error = new_ops->qdisc_set ? new_ops->qdisc_set(netdev_, details) : 0;
2011 /* Delete existing qdisc. */
2012 error = tc_del_qdisc(netdev_);
2016 ovs_assert(netdev->tc == NULL);
2018 /* Install new qdisc. */
2019 error = new_ops->tc_install(netdev_, details);
/* tc_install must set netdev->tc exactly when it succeeds. */
2020 ovs_assert((error == 0) == (netdev->tc != NULL));
2024 ovs_mutex_unlock(&netdev->mutex);
/* Retrieves the configuration of queue 'queue_id' on 'netdev_' into
 * 'details' via the installed QoS implementation's class_get hook. */
2029 netdev_linux_get_queue(const struct netdev *netdev_,
2030 unsigned int queue_id, struct smap *details)
2032 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2035 ovs_mutex_lock(&netdev->mutex);
2036 error = tc_query_qdisc(netdev_);
2038 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2040 ? netdev->tc->ops->class_get(netdev_, queue, details)
2043 ovs_mutex_unlock(&netdev->mutex);
/* Configures queue 'queue_id' on 'netdev_' from 'details' via the QoS
 * implementation's class_set hook, bounds-checking the queue id first. */
2049 netdev_linux_set_queue(struct netdev *netdev_,
2050 unsigned int queue_id, const struct smap *details)
2052 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2055 ovs_mutex_lock(&netdev->mutex);
2056 error = tc_query_qdisc(netdev_);
/* Both an in-range queue id and a class_set hook are required. */
2058 error = (queue_id < netdev->tc->ops->n_queues
2059 && netdev->tc->ops->class_set
2060 ? netdev->tc->ops->class_set(netdev_, queue_id, details)
2063 ovs_mutex_unlock(&netdev->mutex);
/* Deletes queue 'queue_id' from 'netdev_' if the installed QoS
 * implementation supports class deletion and the queue exists. */
2069 netdev_linux_delete_queue(struct netdev *netdev_, unsigned int queue_id)
2071 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2074 ovs_mutex_lock(&netdev->mutex);
2075 error = tc_query_qdisc(netdev_);
2077 if (netdev->tc->ops->class_delete) {
2078 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2080 ? netdev->tc->ops->class_delete(netdev_, queue)
2086 ovs_mutex_unlock(&netdev->mutex);
/* Retrieves statistics for queue 'queue_id' on 'netdev_' into '*stats'. */
2092 netdev_linux_get_queue_stats(const struct netdev *netdev_,
2093 unsigned int queue_id,
2094 struct netdev_queue_stats *stats)
2096 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2099 ovs_mutex_lock(&netdev->mutex);
2100 error = tc_query_qdisc(netdev_);
2102 if (netdev->tc->ops->class_get_stats) {
2103 const struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
/* The creation time is tracked by OVS itself, not by the kernel. */
2105 stats->created = queue->created;
2106 error = netdev->tc->ops->class_get_stats(netdev_, queue,
2115 ovs_mutex_unlock(&netdev->mutex);
/* Starts an RTM_GETTCLASS netlink dump of 'netdev''s traffic classes into
 * '*dump'. */
2121 start_queue_dump(const struct netdev *netdev, struct nl_dump *dump)
2123 struct ofpbuf request;
2124 struct tcmsg *tcmsg;
2126 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
/* tcm_parent == 0 asks the kernel for classes of every qdisc. */
2130 tcmsg->tcm_parent = 0;
2131 nl_dump_start(dump, NETLINK_ROUTE, &request);
2132 ofpbuf_uninit(&request);
/* Iteration state for netdev_linux_queue_dump_{start,next,done}():
 * a snapshot of the queue ids taken under the device mutex. */
2136 struct netdev_linux_queue_state {
2137 unsigned int *queues;
/* Begins a queue dump on 'netdev_' by snapshotting all queue ids into a
 * freshly allocated netdev_linux_queue_state stored in '*statep'. */
2143 netdev_linux_queue_dump_start(const struct netdev *netdev_, void **statep)
2145 const struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2148 ovs_mutex_lock(&netdev->mutex);
2149 error = tc_query_qdisc(netdev_);
2151 if (netdev->tc->ops->class_get) {
2152 struct netdev_linux_queue_state *state;
2153 struct tc_queue *queue;
2156 *statep = state = xmalloc(sizeof *state);
2157 state->n_queues = hmap_count(&netdev->tc->queues);
2158 state->cur_queue = 0;
2159 state->queues = xmalloc(state->n_queues * sizeof *state->queues);
/* Copy only the ids: the tc_queue objects themselves may be freed
 * between dump_start and dump_next. */
2162 HMAP_FOR_EACH (queue, hmap_node, &netdev->tc->queues) {
2163 state->queues[i++] = queue->queue_id;
2169 ovs_mutex_unlock(&netdev->mutex);
/* Advances the queue dump: yields the next still-existing queue's id in
 * '*queue_idp' and its configuration in 'details'. */
2175 netdev_linux_queue_dump_next(const struct netdev *netdev_, void *state_,
2176 unsigned int *queue_idp, struct smap *details)
2178 const struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2179 struct netdev_linux_queue_state *state = state_;
2182 ovs_mutex_lock(&netdev->mutex);
/* Queues deleted since dump_start are silently skipped. */
2183 while (state->cur_queue < state->n_queues) {
2184 unsigned int queue_id = state->queues[state->cur_queue++];
2185 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2188 *queue_idp = queue_id;
2189 error = netdev->tc->ops->class_get(netdev_, queue, details);
2193 ovs_mutex_unlock(&netdev->mutex);
/* Releases the snapshot allocated by netdev_linux_queue_dump_start(). */
2199 netdev_linux_queue_dump_done(const struct netdev *netdev OVS_UNUSED,
2202 struct netdev_linux_queue_state *state = state_;
2204 free(state->queues);
/* Invokes 'cb' with per-queue statistics for every class reported by an
 * RTM_GETTCLASS dump of 'netdev_'. */
2210 netdev_linux_dump_queue_stats(const struct netdev *netdev_,
2211 netdev_dump_queue_stats_cb *cb, void *aux)
2213 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2216 ovs_mutex_lock(&netdev->mutex);
2217 error = tc_query_qdisc(netdev_);
2219 struct nl_dump dump;
2221 if (!netdev->tc->ops->class_dump_stats) {
2223 } else if (!start_queue_dump(netdev_, &dump)) {
/* Record only the first failure; keep consuming the dump so it
 * terminates cleanly. */
2229 while (nl_dump_next(&dump, &msg)) {
2230 retval = netdev->tc->ops->class_dump_stats(netdev_, &msg,
2237 retval = nl_dump_done(&dump);
2243 ovs_mutex_unlock(&netdev->mutex);
/* Stores 'netdev_''s IPv4 address and netmask in '*address'/'*netmask',
 * caching them under VALID_IN4.  Returns EADDRNOTAVAIL when no address is
 * assigned. */
2249 netdev_linux_get_in4(const struct netdev *netdev_,
2250 struct in_addr *address, struct in_addr *netmask)
2252 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2255 ovs_mutex_lock(&netdev->mutex);
2256 if (!(netdev->cache_valid & VALID_IN4)) {
2257 error = netdev_linux_get_ipv4(netdev_, &netdev->address,
2258 SIOCGIFADDR, "SIOCGIFADDR");
2260 error = netdev_linux_get_ipv4(netdev_, &netdev->netmask,
2261 SIOCGIFNETMASK, "SIOCGIFNETMASK");
2263 netdev->cache_valid |= VALID_IN4;
/* INADDR_ANY in the cache means "no address assigned". */
2271 if (netdev->address.s_addr != INADDR_ANY) {
2272 *address = netdev->address;
2273 *netmask = netdev->netmask;
2275 error = EADDRNOTAVAIL;
2278 ovs_mutex_unlock(&netdev->mutex);
/* Assigns IPv4 'address'/'netmask' to 'netdev_' via SIOCSIFADDR and
 * SIOCSIFNETMASK, updating the in-memory cache on success. */
2284 netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
2285 struct in_addr netmask)
2287 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2290 ovs_mutex_lock(&netdev->mutex);
2291 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
2293 netdev->cache_valid |= VALID_IN4;
2294 netdev->address = address;
2295 netdev->netmask = netmask;
/* Setting the netmask only makes sense with a real address; setting
 * INADDR_ANY clears the interface address instead. */
2296 if (address.s_addr != INADDR_ANY) {
2297 error = do_set_addr(netdev_, SIOCSIFNETMASK,
2298 "SIOCSIFNETMASK", netmask);
2301 ovs_mutex_unlock(&netdev->mutex);
/* Parses one line of /proc/net/if_inet6 format: 32 hex digits of IPv6
 * address, four fields we ignore, then the interface name.  On success
 * fills '*in6' and 'ifname' (at most 16 bytes plus a null terminator)
 * and returns true; on a malformed line returns false. */
static bool
parse_if_inet6_line(const char *line,
                    struct in6_addr *in6, char ifname[16 + 1])
{
    uint8_t *bytes = in6->s6_addr;
#define BYTE_FMT "%2"SCNx8
    /* 16 two-digit byte conversions plus the name: 17 assignments. */
    int n_parsed = sscanf(line,
                          " "BYTE_FMT BYTE_FMT BYTE_FMT BYTE_FMT
                          BYTE_FMT BYTE_FMT BYTE_FMT BYTE_FMT
                          BYTE_FMT BYTE_FMT BYTE_FMT BYTE_FMT
                          BYTE_FMT BYTE_FMT BYTE_FMT BYTE_FMT
                          "%*x %*x %*x %*x %16s\n",
                          &bytes[0], &bytes[1], &bytes[2], &bytes[3],
                          &bytes[4], &bytes[5], &bytes[6], &bytes[7],
                          &bytes[8], &bytes[9], &bytes[10], &bytes[11],
                          &bytes[12], &bytes[13], &bytes[14], &bytes[15],
                          ifname);
#undef BYTE_FMT
    return n_parsed == 17;
}
2322 /* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address (if
2323 * 'in6' is non-null) and returns true. Otherwise, returns false. */
2325 netdev_linux_get_in6(const struct netdev *netdev_, struct in6_addr *in6)
2327 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2329 ovs_mutex_lock(&netdev->mutex);
2330 if (!(netdev->cache_valid & VALID_IN6)) {
/* Default to "no address" before scanning the proc file. */
2334 netdev->in6 = in6addr_any;
2336 file = fopen("/proc/net/if_inet6", "r");
2338 const char *name = netdev_get_name(netdev_);
/* Scan every line for an entry matching this device's name. */
2339 while (fgets(line, sizeof line, file)) {
2340 struct in6_addr in6_tmp;
2341 char ifname[16 + 1];
2342 if (parse_if_inet6_line(line, &in6_tmp, ifname)
2343 && !strcmp(name, ifname))
2345 netdev->in6 = in6_tmp;
2351 netdev->cache_valid |= VALID_IN6;
2354 ovs_mutex_unlock(&netdev->mutex);
/* Writes into '*sa' an AF_INET socket address carrying 'addr' with port 0,
 * zero-filling any trailing bytes of the generic sockaddr. */
static void
make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
{
    struct sockaddr_in sin = {
        .sin_family = AF_INET,
        .sin_port = 0,
        .sin_addr = addr,
    };

    memset(sa, 0, sizeof *sa);
    memcpy(sa, &sin, sizeof sin);
}
/* Performs IPv4-address ioctl 'ioctl_nr' ('ioctl_name' for logging) on
 * 'netdev', passing 'addr' as the in4 sockaddr payload. */
2373 do_set_addr(struct netdev *netdev,
2374 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
2378 make_in4_sockaddr(&ifr.ifr_addr, addr);
2379 return af_inet_ifreq_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
2383 /* Adds 'router' as a default IP gateway. */
2385 netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
2387 struct in_addr any = { INADDR_ANY };
2391 memset(&rt, 0, sizeof rt);
/* dst/genmask of INADDR_ANY makes this the default (0.0.0.0/0) route. */
2392 make_in4_sockaddr(&rt.rt_dst, any);
2393 make_in4_sockaddr(&rt.rt_gateway, router);
2394 make_in4_sockaddr(&rt.rt_genmask, any);
2395 rt.rt_flags = RTF_UP | RTF_GATEWAY;
2396 error = af_inet_ioctl(SIOCADDRT, &rt);
2398 VLOG_WARN("ioctl(SIOCADDRT): %s", ovs_strerror(error));
/* Resolves the next hop toward '*host' by scanning /proc/net/route:
 * stores the gateway (or 0 if directly reachable) in '*next_hop' and the
 * malloc'd output interface name in '*netdev_name' (caller frees). */
2404 netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
2407 static const char fn[] = "/proc/net/route";
2412 *netdev_name = NULL;
2413 stream = fopen(fn, "r");
2414 if (stream == NULL) {
2415 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, ovs_strerror(errno));
2420 while (fgets(line, sizeof line, stream)) {
2423 ovs_be32 dest, gateway, mask;
2424 int refcnt, metric, mtu;
2425 unsigned int flags, use, window, irtt;
/* /proc/net/route columns: Iface Destination Gateway Flags RefCnt
 * Use Metric Mask MTU Window IRTT. */
2428 "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
2430 iface, &dest, &gateway, &flags, &refcnt,
2431 &use, &metric, &mask, &mtu, &window, &irtt) != 11) {
2433 VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
2437 if (!(flags & RTF_UP)) {
2438 /* Skip routes that aren't up. */
2442 /* The output of 'dest', 'mask', and 'gateway' were given in
2443 * network byte order, so we don't need need any endian
2444 * conversions here. */
2445 if ((dest & mask) == (host->s_addr & mask)) {
2447 /* The host is directly reachable. */
2448 next_hop->s_addr = 0;
2450 /* To reach the host, we must go through a gateway. */
2451 next_hop->s_addr = gateway;
2453 *netdev_name = xstrdup(iface);
/* Fills 'smap' with driver name/version/firmware strings obtained from a
 * (cached) ETHTOOL_GDRVINFO query. */
2465 netdev_linux_get_status(const struct netdev *netdev_, struct smap *smap)
2467 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2470 ovs_mutex_lock(&netdev->mutex);
2471 if (!(netdev->cache_valid & VALID_DRVINFO)) {
/* netdev_linux_do_ethtool() takes an ethtool_cmd, so the drvinfo
 * buffer is passed through a cast. */
2472 struct ethtool_cmd *cmd = (struct ethtool_cmd *) &netdev->drvinfo;
2474 COVERAGE_INC(netdev_get_ethtool);
2475 memset(&netdev->drvinfo, 0, sizeof netdev->drvinfo);
2476 error = netdev_linux_do_ethtool(netdev->up.name,
2479 "ETHTOOL_GDRVINFO");
2481 netdev->cache_valid |= VALID_DRVINFO;
2486 smap_add(smap, "driver_name", netdev->drvinfo.driver);
2487 smap_add(smap, "driver_version", netdev->drvinfo.version);
2488 smap_add(smap, "firmware_version", netdev->drvinfo.fw_version);
2490 ovs_mutex_unlock(&netdev->mutex);
/* Internal devices have no hardware driver; report a fixed driver name. */
2496 netdev_internal_get_status(const struct netdev *netdev OVS_UNUSED,
2499 smap_add(smap, "driver_name", "openvswitch");
2503 /* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
2504 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
2505 * returns 0. Otherwise, it returns a positive errno value; in particular,
2506 * ENXIO indicates that there is not ARP table entry for 'ip' on 'netdev'. */
2508 netdev_linux_arp_lookup(const struct netdev *netdev,
2509 ovs_be32 ip, uint8_t mac[ETH_ADDR_LEN])
2512 struct sockaddr_in sin;
2515 memset(&r, 0, sizeof r);
2516 memset(&sin, 0, sizeof sin);
2517 sin.sin_family = AF_INET;
2518 sin.sin_addr.s_addr = ip;
/* arp_pa is a generic sockaddr; copy the populated sockaddr_in over it. */
2520 memcpy(&r.arp_pa, &sin, sizeof sin);
2521 r.arp_ha.sa_family = ARPHRD_ETHER;
2523 ovs_strzcpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
2524 COVERAGE_INC(netdev_arp_lookup);
2525 retval = af_inet_ioctl(SIOCGARP, &r);
2527 memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
/* ENXIO (no entry) is an expected outcome; only other errors warrant
 * a log message. */
2528 } else if (retval != ENXIO) {
2529 VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
2530 netdev_get_name(netdev), IP_ARGS(ip),
2531 ovs_strerror(retval));
2537 nd_to_iff_flags(enum netdev_flags nd)
2540 if (nd & NETDEV_UP) {
2543 if (nd & NETDEV_PROMISC) {
2546 if (nd & NETDEV_LOOPBACK) {
2547 iff |= IFF_LOOPBACK;
2553 iff_to_nd_flags(int iff)
2555 enum netdev_flags nd = 0;
2559 if (iff & IFF_PROMISC) {
2560 nd |= NETDEV_PROMISC;
2562 if (iff & IFF_LOOPBACK) {
2563 nd |= NETDEV_LOOPBACK;
2569 update_flags(struct netdev_linux *netdev, enum netdev_flags off,
2570 enum netdev_flags on, enum netdev_flags *old_flagsp)
2571 OVS_REQUIRES(netdev->mutex)
2573 int old_flags, new_flags;
2576 old_flags = netdev->ifi_flags;
2577 *old_flagsp = iff_to_nd_flags(old_flags);
2578 new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
2579 if (new_flags != old_flags) {
2580 error = set_flags(netdev_get_name(&netdev->up), new_flags);
2581 get_flags(&netdev->up, &netdev->ifi_flags);
2588 netdev_linux_update_flags(struct netdev *netdev_, enum netdev_flags off,
2589 enum netdev_flags on, enum netdev_flags *old_flagsp)
2591 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2594 ovs_mutex_lock(&netdev->mutex);
2595 error = update_flags(netdev, off, on, old_flagsp);
2596 ovs_mutex_unlock(&netdev->mutex);
2602 netdev_linux_change_seq(const struct netdev *netdev_)
2604 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2605 unsigned int change_seq;
2607 ovs_mutex_lock(&netdev->mutex);
2608 change_seq = netdev->change_seq;
2609 ovs_mutex_unlock(&netdev->mutex);
2614 #define NETDEV_LINUX_CLASS(NAME, CONSTRUCT, GET_STATS, SET_STATS, \
2615 GET_FEATURES, GET_STATUS) \
2621 netdev_linux_wait, \
2623 netdev_linux_alloc, \
2625 netdev_linux_destruct, \
2626 netdev_linux_dealloc, \
2627 NULL, /* get_config */ \
2628 NULL, /* set_config */ \
2629 NULL, /* get_tunnel_config */ \
2631 netdev_linux_send, \
2632 netdev_linux_send_wait, \
2634 netdev_linux_set_etheraddr, \
2635 netdev_linux_get_etheraddr, \
2636 netdev_linux_get_mtu, \
2637 netdev_linux_set_mtu, \
2638 netdev_linux_get_ifindex, \
2639 netdev_linux_get_carrier, \
2640 netdev_linux_get_carrier_resets, \
2641 netdev_linux_set_miimon_interval, \
2646 netdev_linux_set_advertisements, \
2648 netdev_linux_set_policing, \
2649 netdev_linux_get_qos_types, \
2650 netdev_linux_get_qos_capabilities, \
2651 netdev_linux_get_qos, \
2652 netdev_linux_set_qos, \
2653 netdev_linux_get_queue, \
2654 netdev_linux_set_queue, \
2655 netdev_linux_delete_queue, \
2656 netdev_linux_get_queue_stats, \
2657 netdev_linux_queue_dump_start, \
2658 netdev_linux_queue_dump_next, \
2659 netdev_linux_queue_dump_done, \
2660 netdev_linux_dump_queue_stats, \
2662 netdev_linux_get_in4, \
2663 netdev_linux_set_in4, \
2664 netdev_linux_get_in6, \
2665 netdev_linux_add_router, \
2666 netdev_linux_get_next_hop, \
2668 netdev_linux_arp_lookup, \
2670 netdev_linux_update_flags, \
2672 netdev_linux_change_seq, \
2674 netdev_linux_rx_alloc, \
2675 netdev_linux_rx_construct, \
2676 netdev_linux_rx_destruct, \
2677 netdev_linux_rx_dealloc, \
2678 netdev_linux_rx_recv, \
2679 netdev_linux_rx_wait, \
2680 netdev_linux_rx_drain, \
2683 const struct netdev_class netdev_linux_class =
2686 netdev_linux_construct,
2687 netdev_linux_get_stats,
2688 NULL, /* set_stats */
2689 netdev_linux_get_features,
2690 netdev_linux_get_status);
2692 const struct netdev_class netdev_tap_class =
2695 netdev_linux_construct_tap,
2696 netdev_tap_get_stats,
2697 NULL, /* set_stats */
2698 netdev_linux_get_features,
2699 netdev_linux_get_status);
2701 const struct netdev_class netdev_internal_class =
2704 netdev_linux_construct,
2705 netdev_internal_get_stats,
2706 netdev_internal_set_stats,
2707 NULL, /* get_features */
2708 netdev_internal_get_status);
2710 /* HTB traffic control class. */
2712 #define HTB_N_QUEUES 0xf000
2716 unsigned int max_rate; /* In bytes/s. */
2720 struct tc_queue tc_queue;
2721 unsigned int min_rate; /* In bytes/s. */
2722 unsigned int max_rate; /* In bytes/s. */
2723 unsigned int burst; /* In bytes. */
2724 unsigned int priority; /* Lower values are higher priorities. */
2728 htb_get__(const struct netdev *netdev_)
2730 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2731 return CONTAINER_OF(netdev->tc, struct htb, tc);
2735 htb_install__(struct netdev *netdev_, uint64_t max_rate)
2737 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2740 htb = xmalloc(sizeof *htb);
2741 tc_init(&htb->tc, &tc_ops_htb);
2742 htb->max_rate = max_rate;
2744 netdev->tc = &htb->tc;
2747 /* Create an HTB qdisc.
2749 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
2751 htb_setup_qdisc__(struct netdev *netdev)
2754 struct tc_htb_glob opt;
2755 struct ofpbuf request;
2756 struct tcmsg *tcmsg;
2758 tc_del_qdisc(netdev);
2760 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2761 NLM_F_EXCL | NLM_F_CREATE, &request);
2765 tcmsg->tcm_handle = tc_make_handle(1, 0);
2766 tcmsg->tcm_parent = TC_H_ROOT;
2768 nl_msg_put_string(&request, TCA_KIND, "htb");
2770 memset(&opt, 0, sizeof opt);
2771 opt.rate2quantum = 10;
2775 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2776 nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
2777 nl_msg_end_nested(&request, opt_offset);
2779 return tc_transact(&request, NULL);
2782 /* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
2783 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
2785 htb_setup_class__(struct netdev *netdev, unsigned int handle,
2786 unsigned int parent, struct htb_class *class)
2789 struct tc_htb_opt opt;
2790 struct ofpbuf request;
2791 struct tcmsg *tcmsg;
2795 error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
2797 VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU",
2798 netdev_get_name(netdev));
2802 memset(&opt, 0, sizeof opt);
2803 tc_fill_rate(&opt.rate, class->min_rate, mtu);
2804 tc_fill_rate(&opt.ceil, class->max_rate, mtu);
2805 opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
2806 opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
2807 opt.prio = class->priority;
2809 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
2813 tcmsg->tcm_handle = handle;
2814 tcmsg->tcm_parent = parent;
2816 nl_msg_put_string(&request, TCA_KIND, "htb");
2817 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2818 nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
2819 tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
2820 tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
2821 nl_msg_end_nested(&request, opt_offset);
2823 error = tc_transact(&request, NULL);
2825 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
2826 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
2827 netdev_get_name(netdev),
2828 tc_get_major(handle), tc_get_minor(handle),
2829 tc_get_major(parent), tc_get_minor(parent),
2830 class->min_rate, class->max_rate,
2831 class->burst, class->priority, ovs_strerror(error));
2836 /* Parses Netlink attributes in 'options' for HTB parameters and stores a
2837 * description of them into 'details'. The description complies with the
2838 * specification given in the vswitch database documentation for linux-htb
2841 htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
2843 static const struct nl_policy tca_htb_policy[] = {
2844 [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
2845 .min_len = sizeof(struct tc_htb_opt) },
2848 struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
2849 const struct tc_htb_opt *htb;
2851 if (!nl_parse_nested(nl_options, tca_htb_policy,
2852 attrs, ARRAY_SIZE(tca_htb_policy))) {
2853 VLOG_WARN_RL(&rl, "failed to parse HTB class options");
2857 htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
2858 class->min_rate = htb->rate.rate;
2859 class->max_rate = htb->ceil.rate;
2860 class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
2861 class->priority = htb->prio;
2866 htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2867 struct htb_class *options,
2868 struct netdev_queue_stats *stats)
2870 struct nlattr *nl_options;
2871 unsigned int handle;
2874 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2875 if (!error && queue_id) {
2876 unsigned int major = tc_get_major(handle);
2877 unsigned int minor = tc_get_minor(handle);
2878 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2879 *queue_id = minor - 1;
2884 if (!error && options) {
2885 error = htb_parse_tca_options__(nl_options, options);
2891 htb_parse_qdisc_details__(struct netdev *netdev_,
2892 const struct smap *details, struct htb_class *hc)
2894 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2895 const char *max_rate_s;
2897 max_rate_s = smap_get(details, "max-rate");
2898 hc->max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
2899 if (!hc->max_rate) {
2900 enum netdev_features current;
2902 netdev_linux_read_features(netdev);
2903 current = !netdev->get_features_error ? netdev->current : 0;
2904 hc->max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
2906 hc->min_rate = hc->max_rate;
2912 htb_parse_class_details__(struct netdev *netdev,
2913 const struct smap *details, struct htb_class *hc)
2915 const struct htb *htb = htb_get__(netdev);
2916 const char *min_rate_s = smap_get(details, "min-rate");
2917 const char *max_rate_s = smap_get(details, "max-rate");
2918 const char *burst_s = smap_get(details, "burst");
2919 const char *priority_s = smap_get(details, "priority");
2922 error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
2924 VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU",
2925 netdev_get_name(netdev));
2929 /* HTB requires at least an mtu sized min-rate to send any traffic even
2930 * on uncongested links. */
2931 hc->min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
2932 hc->min_rate = MAX(hc->min_rate, mtu);
2933 hc->min_rate = MIN(hc->min_rate, htb->max_rate);
2936 hc->max_rate = (max_rate_s
2937 ? strtoull(max_rate_s, NULL, 10) / 8
2939 hc->max_rate = MAX(hc->max_rate, hc->min_rate);
2940 hc->max_rate = MIN(hc->max_rate, htb->max_rate);
2944 * According to hints in the documentation that I've read, it is important
2945 * that 'burst' be at least as big as the largest frame that might be
2946 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
2947 * but having it a bit too small is a problem. Since netdev_get_mtu()
2948 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
2949 * the MTU. We actually add 64, instead of 14, as a guard against
2950 * additional headers get tacked on somewhere that we're not aware of. */
2951 hc->burst = burst_s ? strtoull(burst_s, NULL, 10) / 8 : 0;
2952 hc->burst = MAX(hc->burst, mtu + 64);
2955 hc->priority = priority_s ? strtoul(priority_s, NULL, 10) : 0;
2961 htb_query_class__(const struct netdev *netdev, unsigned int handle,
2962 unsigned int parent, struct htb_class *options,
2963 struct netdev_queue_stats *stats)
2965 struct ofpbuf *reply;
2968 error = tc_query_class(netdev, handle, parent, &reply);
2970 error = htb_parse_tcmsg__(reply, NULL, options, stats);
2971 ofpbuf_delete(reply);
2977 htb_tc_install(struct netdev *netdev, const struct smap *details)
2981 error = htb_setup_qdisc__(netdev);
2983 struct htb_class hc;
2985 htb_parse_qdisc_details__(netdev, details, &hc);
2986 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2987 tc_make_handle(1, 0), &hc);
2989 htb_install__(netdev, hc.max_rate);
2995 static struct htb_class *
2996 htb_class_cast__(const struct tc_queue *queue)
2998 return CONTAINER_OF(queue, struct htb_class, tc_queue);
3002 htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
3003 const struct htb_class *hc)
3005 struct htb *htb = htb_get__(netdev);
3006 size_t hash = hash_int(queue_id, 0);
3007 struct tc_queue *queue;
3008 struct htb_class *hcp;
3010 queue = tc_find_queue__(netdev, queue_id, hash);
3012 hcp = htb_class_cast__(queue);
3014 hcp = xmalloc(sizeof *hcp);
3015 queue = &hcp->tc_queue;
3016 queue->queue_id = queue_id;
3017 queue->created = time_msec();
3018 hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
3021 hcp->min_rate = hc->min_rate;
3022 hcp->max_rate = hc->max_rate;
3023 hcp->burst = hc->burst;
3024 hcp->priority = hc->priority;
3028 htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3031 struct nl_dump dump;
3032 struct htb_class hc;
3034 /* Get qdisc options. */
3036 htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
3037 htb_install__(netdev, hc.max_rate);
3040 if (!start_queue_dump(netdev, &dump)) {
3043 while (nl_dump_next(&dump, &msg)) {
3044 unsigned int queue_id;
3046 if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3047 htb_update_queue__(netdev, queue_id, &hc);
3050 nl_dump_done(&dump);
3056 htb_tc_destroy(struct tc *tc)
3058 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
3059 struct htb_class *hc, *next;
3061 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &htb->tc.queues) {
3062 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
3070 htb_qdisc_get(const struct netdev *netdev, struct smap *details)
3072 const struct htb *htb = htb_get__(netdev);
3073 smap_add_format(details, "max-rate", "%llu", 8ULL * htb->max_rate);
3078 htb_qdisc_set(struct netdev *netdev, const struct smap *details)
3080 struct htb_class hc;
3083 htb_parse_qdisc_details__(netdev, details, &hc);
3084 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3085 tc_make_handle(1, 0), &hc);
3087 htb_get__(netdev)->max_rate = hc.max_rate;
3093 htb_class_get(const struct netdev *netdev OVS_UNUSED,
3094 const struct tc_queue *queue, struct smap *details)
3096 const struct htb_class *hc = htb_class_cast__(queue);
3098 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
3099 if (hc->min_rate != hc->max_rate) {
3100 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
3102 smap_add_format(details, "burst", "%llu", 8ULL * hc->burst);
3104 smap_add_format(details, "priority", "%u", hc->priority);
3110 htb_class_set(struct netdev *netdev, unsigned int queue_id,
3111 const struct smap *details)
3113 struct htb_class hc;
3116 error = htb_parse_class_details__(netdev, details, &hc);
3121 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3122 tc_make_handle(1, 0xfffe), &hc);
3127 htb_update_queue__(netdev, queue_id, &hc);
3132 htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
3134 struct htb_class *hc = htb_class_cast__(queue);
3135 struct htb *htb = htb_get__(netdev);
3138 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3140 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
3147 htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3148 struct netdev_queue_stats *stats)
3150 return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3151 tc_make_handle(1, 0xfffe), NULL, stats);
3155 htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3156 const struct ofpbuf *nlmsg,
3157 netdev_dump_queue_stats_cb *cb, void *aux)
3159 struct netdev_queue_stats stats;
3160 unsigned int handle, major, minor;
3163 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3168 major = tc_get_major(handle);
3169 minor = tc_get_minor(handle);
3170 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
3171 (*cb)(minor - 1, &stats, aux);
3176 static const struct tc_ops tc_ops_htb = {
3177 "htb", /* linux_name */
3178 "linux-htb", /* ovs_name */
3179 HTB_N_QUEUES, /* n_queues */
3188 htb_class_get_stats,
3189 htb_class_dump_stats
3192 /* "linux-hfsc" traffic control class. */
3194 #define HFSC_N_QUEUES 0xf000
3202 struct tc_queue tc_queue;
3207 static struct hfsc *
3208 hfsc_get__(const struct netdev *netdev_)
3210 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3211 return CONTAINER_OF(netdev->tc, struct hfsc, tc);
3214 static struct hfsc_class *
3215 hfsc_class_cast__(const struct tc_queue *queue)
3217 return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
3221 hfsc_install__(struct netdev *netdev_, uint32_t max_rate)
3223 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3226 hfsc = xmalloc(sizeof *hfsc);
3227 tc_init(&hfsc->tc, &tc_ops_hfsc);
3228 hfsc->max_rate = max_rate;
3229 netdev->tc = &hfsc->tc;
3233 hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
3234 const struct hfsc_class *hc)
3238 struct hfsc_class *hcp;
3239 struct tc_queue *queue;
3241 hfsc = hfsc_get__(netdev);
3242 hash = hash_int(queue_id, 0);
3244 queue = tc_find_queue__(netdev, queue_id, hash);
3246 hcp = hfsc_class_cast__(queue);
3248 hcp = xmalloc(sizeof *hcp);
3249 queue = &hcp->tc_queue;
3250 queue->queue_id = queue_id;
3251 queue->created = time_msec();
3252 hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
3255 hcp->min_rate = hc->min_rate;
3256 hcp->max_rate = hc->max_rate;
3260 hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
3262 const struct tc_service_curve *rsc, *fsc, *usc;
3263 static const struct nl_policy tca_hfsc_policy[] = {
3265 .type = NL_A_UNSPEC,
3267 .min_len = sizeof(struct tc_service_curve),
3270 .type = NL_A_UNSPEC,
3272 .min_len = sizeof(struct tc_service_curve),
3275 .type = NL_A_UNSPEC,
3277 .min_len = sizeof(struct tc_service_curve),
3280 struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];
3282 if (!nl_parse_nested(nl_options, tca_hfsc_policy,
3283 attrs, ARRAY_SIZE(tca_hfsc_policy))) {
3284 VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
3288 rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
3289 fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
3290 usc = nl_attr_get(attrs[TCA_HFSC_USC]);
3292 if (rsc->m1 != 0 || rsc->d != 0 ||
3293 fsc->m1 != 0 || fsc->d != 0 ||
3294 usc->m1 != 0 || usc->d != 0) {
3295 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3296 "Non-linear service curves are not supported.");
3300 if (rsc->m2 != fsc->m2) {
3301 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3302 "Real-time service curves are not supported ");
3306 if (rsc->m2 > usc->m2) {
3307 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3308 "Min-rate service curve is greater than "
3309 "the max-rate service curve.");
3313 class->min_rate = fsc->m2;
3314 class->max_rate = usc->m2;
3319 hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
3320 struct hfsc_class *options,
3321 struct netdev_queue_stats *stats)
3324 unsigned int handle;
3325 struct nlattr *nl_options;
3327 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
3333 unsigned int major, minor;
3335 major = tc_get_major(handle);
3336 minor = tc_get_minor(handle);
3337 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3338 *queue_id = minor - 1;
3345 error = hfsc_parse_tca_options__(nl_options, options);
3352 hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
3353 unsigned int parent, struct hfsc_class *options,
3354 struct netdev_queue_stats *stats)
3357 struct ofpbuf *reply;
3359 error = tc_query_class(netdev, handle, parent, &reply);
3364 error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
3365 ofpbuf_delete(reply);
3370 hfsc_parse_qdisc_details__(struct netdev *netdev_, const struct smap *details,
3371 struct hfsc_class *class)
3373 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3375 const char *max_rate_s;
3377 max_rate_s = smap_get(details, "max-rate");
3378 max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
3381 enum netdev_features current;
3383 netdev_linux_read_features(netdev);
3384 current = !netdev->get_features_error ? netdev->current : 0;
3385 max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
3388 class->min_rate = max_rate;
3389 class->max_rate = max_rate;
3393 hfsc_parse_class_details__(struct netdev *netdev,
3394 const struct smap *details,
3395 struct hfsc_class * class)
3397 const struct hfsc *hfsc;
3398 uint32_t min_rate, max_rate;
3399 const char *min_rate_s, *max_rate_s;
3401 hfsc = hfsc_get__(netdev);
3402 min_rate_s = smap_get(details, "min-rate");
3403 max_rate_s = smap_get(details, "max-rate");
3405 min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
3406 min_rate = MAX(min_rate, 1);
3407 min_rate = MIN(min_rate, hfsc->max_rate);
3409 max_rate = (max_rate_s
3410 ? strtoull(max_rate_s, NULL, 10) / 8
3412 max_rate = MAX(max_rate, min_rate);
3413 max_rate = MIN(max_rate, hfsc->max_rate);
3415 class->min_rate = min_rate;
3416 class->max_rate = max_rate;
3421 /* Create an HFSC qdisc.
3423 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
3425 hfsc_setup_qdisc__(struct netdev * netdev)
3427 struct tcmsg *tcmsg;
3428 struct ofpbuf request;
3429 struct tc_hfsc_qopt opt;
3431 tc_del_qdisc(netdev);
3433 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3434 NLM_F_EXCL | NLM_F_CREATE, &request);
3440 tcmsg->tcm_handle = tc_make_handle(1, 0);
3441 tcmsg->tcm_parent = TC_H_ROOT;
3443 memset(&opt, 0, sizeof opt);
3446 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3447 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
3449 return tc_transact(&request, NULL);
3452 /* Create an HFSC class.
3454 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
3455 * sc rate <min_rate> ul rate <max_rate>" */
3457 hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
3458 unsigned int parent, struct hfsc_class *class)
3462 struct tcmsg *tcmsg;
3463 struct ofpbuf request;
3464 struct tc_service_curve min, max;
3466 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
3472 tcmsg->tcm_handle = handle;
3473 tcmsg->tcm_parent = parent;
3477 min.m2 = class->min_rate;
3481 max.m2 = class->max_rate;
3483 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3484 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3485 nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
3486 nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
3487 nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
3488 nl_msg_end_nested(&request, opt_offset);
3490 error = tc_transact(&request, NULL);
3492 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
3493 "min-rate %ubps, max-rate %ubps (%s)",
3494 netdev_get_name(netdev),
3495 tc_get_major(handle), tc_get_minor(handle),
3496 tc_get_major(parent), tc_get_minor(parent),
3497 class->min_rate, class->max_rate, ovs_strerror(error));
3504 hfsc_tc_install(struct netdev *netdev, const struct smap *details)
3507 struct hfsc_class class;
3509 error = hfsc_setup_qdisc__(netdev);
3515 hfsc_parse_qdisc_details__(netdev, details, &class);
3516 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3517 tc_make_handle(1, 0), &class);
3523 hfsc_install__(netdev, class.max_rate);
3528 hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3531 struct nl_dump dump;
3532 struct hfsc_class hc;
3535 hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
3536 hfsc_install__(netdev, hc.max_rate);
3538 if (!start_queue_dump(netdev, &dump)) {
3542 while (nl_dump_next(&dump, &msg)) {
3543 unsigned int queue_id;
3545 if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3546 hfsc_update_queue__(netdev, queue_id, &hc);
3550 nl_dump_done(&dump);
3555 hfsc_tc_destroy(struct tc *tc)
3558 struct hfsc_class *hc, *next;
3560 hfsc = CONTAINER_OF(tc, struct hfsc, tc);
3562 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
3563 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
3572 hfsc_qdisc_get(const struct netdev *netdev, struct smap *details)
3574 const struct hfsc *hfsc;
3575 hfsc = hfsc_get__(netdev);
3576 smap_add_format(details, "max-rate", "%llu", 8ULL * hfsc->max_rate);
3581 hfsc_qdisc_set(struct netdev *netdev, const struct smap *details)
3584 struct hfsc_class class;
3586 hfsc_parse_qdisc_details__(netdev, details, &class);
3587 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3588 tc_make_handle(1, 0), &class);
3591 hfsc_get__(netdev)->max_rate = class.max_rate;
3598 hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
3599 const struct tc_queue *queue, struct smap *details)
3601 const struct hfsc_class *hc;
3603 hc = hfsc_class_cast__(queue);
3604 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
3605 if (hc->min_rate != hc->max_rate) {
3606 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
3612 hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
3613 const struct smap *details)
3616 struct hfsc_class class;
3618 error = hfsc_parse_class_details__(netdev, details, &class);
3623 error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3624 tc_make_handle(1, 0xfffe), &class);
3629 hfsc_update_queue__(netdev, queue_id, &class);
3634 hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
3638 struct hfsc_class *hc;
3640 hc = hfsc_class_cast__(queue);
3641 hfsc = hfsc_get__(netdev);
3643 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3645 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
3652 hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3653 struct netdev_queue_stats *stats)
3655 return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3656 tc_make_handle(1, 0xfffe), NULL, stats);
3660 hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3661 const struct ofpbuf *nlmsg,
3662 netdev_dump_queue_stats_cb *cb, void *aux)
3664 struct netdev_queue_stats stats;
3665 unsigned int handle, major, minor;
3668 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3673 major = tc_get_major(handle);
3674 minor = tc_get_minor(handle);
3675 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3676 (*cb)(minor - 1, &stats, aux);
3681 static const struct tc_ops tc_ops_hfsc = {
3682 "hfsc", /* linux_name */
3683 "linux-hfsc", /* ovs_name */
3684 HFSC_N_QUEUES, /* n_queues */
3685 hfsc_tc_install, /* tc_install */
3686 hfsc_tc_load, /* tc_load */
3687 hfsc_tc_destroy, /* tc_destroy */
3688 hfsc_qdisc_get, /* qdisc_get */
3689 hfsc_qdisc_set, /* qdisc_set */
3690 hfsc_class_get, /* class_get */
3691 hfsc_class_set, /* class_set */
3692 hfsc_class_delete, /* class_delete */
3693 hfsc_class_get_stats, /* class_get_stats */
3694 hfsc_class_dump_stats /* class_dump_stats */
3697 /* "linux-default" traffic control class.
3699 * This class represents the default, unnamed Linux qdisc. It corresponds to
3700 * the "" (empty string) QoS type in the OVS database. */
3703 default_install__(struct netdev *netdev_)
3705 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3706 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_default);
3708 /* Nothing but a tc class implementation is allowed to write to a tc. This
3709 * class never does that, so we can legitimately use a const tc object. */
3710 netdev->tc = CONST_CAST(struct tc *, &tc);
3714 default_tc_install(struct netdev *netdev,
3715 const struct smap *details OVS_UNUSED)
3717 default_install__(netdev);
3722 default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3724 default_install__(netdev);
3728 static const struct tc_ops tc_ops_default = {
3729 NULL, /* linux_name */
3734 NULL, /* tc_destroy */
3735 NULL, /* qdisc_get */
3736 NULL, /* qdisc_set */
3737 NULL, /* class_get */
3738 NULL, /* class_set */
3739 NULL, /* class_delete */
3740 NULL, /* class_get_stats */
3741 NULL /* class_dump_stats */
3744 /* "linux-other" traffic control class.
3749 other_tc_load(struct netdev *netdev_, struct ofpbuf *nlmsg OVS_UNUSED)
3751 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3752 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_other);
3754 /* Nothing but a tc class implementation is allowed to write to a tc. This
3755 * class never does that, so we can legitimately use a const tc object. */
3756 netdev->tc = CONST_CAST(struct tc *, &tc);
3760 static const struct tc_ops tc_ops_other = {
3761 NULL, /* linux_name */
3762 "linux-other", /* ovs_name */
3764 NULL, /* tc_install */
3766 NULL, /* tc_destroy */
3767 NULL, /* qdisc_get */
3768 NULL, /* qdisc_set */
3769 NULL, /* class_get */
3770 NULL, /* class_set */
3771 NULL, /* class_delete */
3772 NULL, /* class_get_stats */
3773 NULL /* class_dump_stats */
/* Traffic control. */

/* Number of kernel "tc" ticks per second. */
static double ticks_per_s;

/* Number of kernel "jiffies" per second.  This is used for the purpose of
 * computing buffer sizes.  Generally kernel qdiscs need to be able to buffer
 * one jiffy's worth of data.
 *
 * There are two possibilities here:
 *
 *    - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
 *      approximate range of 100 to 1024.  That means that we really need to
 *      make sure that the qdisc can buffer that much data.
 *
 *    - 'buffer_hz' is an absurdly large number.  That means that the kernel
 *      has finely granular timers and there's no need to fudge additional room
 *      for buffers.  (There's no extra effort needed to implement that: the
 *      large 'buffer_hz' is used as a divisor, so practically any number will
 *      come out as 0 in the division.  Small integer results in the case of
 *      really high dividends won't have any real effect anyhow.) */
static unsigned int buffer_hz;
/* Returns tc handle 'major':'minor'.  'major' occupies the upper 16 bits of
 * the handle, 'minor' the lower 16 bits. */
static unsigned int
tc_make_handle(unsigned int major, unsigned int minor)
{
    return TC_H_MAKE(major << 16, minor);
}

/* Returns the major number from 'handle'. */
static unsigned int
tc_get_major(unsigned int handle)
{
    return TC_H_MAJ(handle) >> 16;
}

/* Returns the minor number from 'handle'. */
static unsigned int
tc_get_minor(unsigned int handle)
{
    return TC_H_MIN(handle);
}
3821 static struct tcmsg *
3822 tc_make_request(const struct netdev *netdev, int type, unsigned int flags,
3823 struct ofpbuf *request)
3825 struct tcmsg *tcmsg;
3829 error = get_ifindex(netdev, &ifindex);
3834 ofpbuf_init(request, 512);
3835 nl_msg_put_nlmsghdr(request, sizeof *tcmsg, type, NLM_F_REQUEST | flags);
3836 tcmsg = ofpbuf_put_zeros(request, sizeof *tcmsg);
3837 tcmsg->tcm_family = AF_UNSPEC;
3838 tcmsg->tcm_ifindex = ifindex;
3839 /* Caller should fill in tcmsg->tcm_handle. */
3840 /* Caller should fill in tcmsg->tcm_parent. */
3846 tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
3848 int error = nl_transact(NETLINK_ROUTE, request, replyp);
3849 ofpbuf_uninit(request);
3853 /* Adds or deletes a root ingress qdisc on 'netdev'. We use this for
3854 * policing configuration.
3856 * This function is equivalent to running the following when 'add' is true:
3857 * /sbin/tc qdisc add dev <devname> handle ffff: ingress
3859 * This function is equivalent to running the following when 'add' is false:
3860 * /sbin/tc qdisc del dev <devname> handle ffff: ingress
3862 * The configuration and stats may be seen with the following command:
3863 * /sbin/tc -s qdisc show dev <devname>
3865 * Returns 0 if successful, otherwise a positive errno value.
3868 tc_add_del_ingress_qdisc(struct netdev *netdev, bool add)
3870 struct ofpbuf request;
3871 struct tcmsg *tcmsg;
3873 int type = add ? RTM_NEWQDISC : RTM_DELQDISC;
3874 int flags = add ? NLM_F_EXCL | NLM_F_CREATE : 0;
3876 tcmsg = tc_make_request(netdev, type, flags, &request);
3880 tcmsg->tcm_handle = tc_make_handle(0xffff, 0);
3881 tcmsg->tcm_parent = TC_H_INGRESS;
3882 nl_msg_put_string(&request, TCA_KIND, "ingress");
3883 nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0);
3885 error = tc_transact(&request, NULL);
3887 /* If we're deleting the qdisc, don't worry about some of the
3888 * error conditions. */
3889 if (!add && (error == ENOENT || error == EINVAL)) {
3898 /* Adds a policer to 'netdev' with a rate of 'kbits_rate' and a burst size
3901  * This function is equivalent to running:
3902  *     /sbin/tc filter add dev <devname> parent ffff: protocol all prio 49
3903  *              basic police rate <kbits_rate>kbit burst <kbits_burst>k
3906  * The configuration and stats may be seen with the following command:
3907  *     /sbin/tc -s filter show <devname> eth0 parent ffff:
3909  * Returns 0 if successful, otherwise a positive errno value.
3912 tc_add_policer(struct netdev *netdev, int kbits_rate, int kbits_burst)
3914     struct tc_police tc_police;
3915     struct ofpbuf request;
3916     struct tcmsg *tcmsg;
3917     size_t basic_offset;
3918     size_t police_offset;
     /* NOTE(review): declarations of 'error' and 'mtu' (and presumably the
      * assignment of 'mtu') are on lines missing from this listing. */
3922     memset(&tc_police, 0, sizeof tc_police);
3923     tc_police.action = TC_POLICE_SHOT;
3924     tc_police.mtu = mtu;
     /* kbits_rate is kilobits/s; the kernel wants bytes/s: * 1000 / 8. */
3925     tc_fill_rate(&tc_police.rate, (kbits_rate * 1000)/8, mtu);
3926     tc_police.burst = tc_bytes_to_ticks(tc_police.rate.rate,
3927                                         kbits_burst * 1024);
3929     tcmsg = tc_make_request(netdev, RTM_NEWTFILTER,
3930                             NLM_F_EXCL | NLM_F_CREATE, &request);
     /* Attach the filter under the ffff: ingress qdisc at priority 49,
      * matching all protocols. */
3934     tcmsg->tcm_parent = tc_make_handle(0xffff, 0);
3935     tcmsg->tcm_info = tc_make_handle(49,
3936                                      (OVS_FORCE uint16_t) htons(ETH_P_ALL));
3938     nl_msg_put_string(&request, TCA_KIND, "basic");
3939     basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3940     police_offset = nl_msg_start_nested(&request, TCA_BASIC_POLICE);
3941     nl_msg_put_unspec(&request, TCA_POLICE_TBF, &tc_police, sizeof tc_police);
3942     tc_put_rtab(&request, TCA_POLICE_RATE, &tc_police.rate);
3943     nl_msg_end_nested(&request, police_offset);
3944     nl_msg_end_nested(&request, basic_offset);
3946     error = tc_transact(&request, NULL);
/* NOTE(review): this is the body of the one-time /proc/net/psched reader
 * (the function signature line is missing from this listing, as are the
 * "goto"/"exit" lines between several of the steps below).  It derives the
 * module-global 'ticks_per_s' and 'buffer_hz' from the four hex psched
 * parameters.  Code kept byte-identical; comments only. */
3957 /* The values in psched are not individually very meaningful, but they are
3958  * important.  The tables below show some values seen in the wild.
3962  *   - "c" has always been a constant 1000000 since at least Linux 2.4.14.
3963  *     (Before that, there are hints that it was 1000000000.)
3965  *   - "d" can be unrealistically large, see the comment on 'buffer_hz'
3969  * -----------------------------------
3970  * [1] 000c8000 000f4240 000f4240 00000064
3971  * [2] 000003e8 00000400 000f4240 3b9aca00
3972  * [3] 000003e8 00000400 000f4240 3b9aca00
3973  * [4] 000003e8 00000400 000f4240 00000064
3974  * [5] 000003e8 00000040 000f4240 3b9aca00
3975  * [6] 000003e8 00000040 000f4240 000000f9
3977  *           a         b          c             d ticks_per_s     buffer_hz
3978  *     ------- --------- ---------- ------------- ----------- -------------
3979  * [1] 819,200 1,000,000  1,000,000           100     819,200           100
3980  * [2]   1,000     1,024  1,000,000 1,000,000,000     976,562 1,000,000,000
3981  * [3]   1,000     1,024  1,000,000 1,000,000,000     976,562 1,000,000,000
3982  * [4]   1,000     1,024  1,000,000           100     976,562           100
3983  * [5]   1,000        64  1,000,000 1,000,000,000  15,625,000 1,000,000,000
3984  * [6]   1,000        64  1,000,000           249  15,625,000           249
3986  * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
3987  * [2] 2.6.26-1-686-bigmem from Debian lenny
3988  * [3] 2.6.26-2-sparc64 from Debian lenny
3989  * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
3990  * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
3991  * [6] 2.6.34 from kernel.org on KVM
     /* Run the whole computation only once per process. */
3993     static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
3994     static const char fn[] = "/proc/net/psched";
3995     unsigned int a, b, c, d;
3998     if (!ovsthread_once_start(&once)) {
4005     stream = fopen(fn, "r");
4007         VLOG_WARN("%s: open failed: %s", fn, ovs_strerror(errno));
4011     if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
4012         VLOG_WARN("%s: read failed", fn);
4016     VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
4020         VLOG_WARN("%s: invalid scheduler parameters", fn);
     /* ticks per second = a/b ticks per microsecond * c microseconds/s. */
4024     ticks_per_s = (double) a * c / b;
4028         VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
4031     VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);
4034     ovsthread_once_done(&once);
4037 /* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
4038  * rate of 'rate' bytes per second. */
     /* NOTE(review): return-type line and braces are missing from this
      * listing; 'ticks_per_s' is the module global set from psched. */
4040 tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
4043     return (rate * ticks) / ticks_per_s;
4046 /* Returns the number of ticks that it would take to transmit 'size' bytes at a
4047  * rate of 'rate' bytes per second. */
     /* Guards against division by zero when 'rate' is 0 by returning 0. */
4049 tc_bytes_to_ticks(unsigned int rate, unsigned int size)
4052     return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
4055 /* Returns the number of bytes that need to be reserved for qdisc buffering at
4056  * a transmission rate of 'rate' bytes per second. */
     /* 'buffer_hz' is the module global derived from /proc/net/psched. */
4058 tc_buffer_per_jiffy(unsigned int rate)
4061     return rate / buffer_hz;
4064 /* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
4065  * e.g. "htb", into '*kind' (if it is nonnull).  If 'options' is nonnull,
4066  * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
4067  * stores NULL into it if it is absent.
4069  * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
4072  * Returns 0 if successful, otherwise a positive errno value. */
4074 tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
4075                struct nlattr **options)
4077     static const struct nl_policy tca_policy[] = {
4078         [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
4079         [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
4081     struct nlattr *ta[ARRAY_SIZE(tca_policy)];
     /* Attributes start after the netlink header plus the tcmsg payload. */
4083     if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
4084                          tca_policy, ta, ARRAY_SIZE(ta))) {
4085         VLOG_WARN_RL(&rl, "failed to parse qdisc message");
     /* NOTE(review): the "if (kind)" / "if (options)" guard lines appear to
      * be missing from this listing. */
4090         *kind = nl_attr_get_string(ta[TCA_KIND]);
4094         *options = ta[TCA_OPTIONS];
4109 /* Given Netlink 'msg' that describes a class, extracts the queue ID (e.g. the
4110  * minor number of its class ID) into '*queue_id', its TCA_OPTIONS attribute
4111  * into '*options', and its queue statistics into '*stats'.  Any of the output
4112  * arguments may be null.
4114  * Returns 0 if successful, otherwise a positive errno value. */
4116 tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
4117                struct nlattr **options, struct netdev_queue_stats *stats)
4119     static const struct nl_policy tca_policy[] = {
4120         [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
4121         [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
4123     struct nlattr *ta[ARRAY_SIZE(tca_policy)];
4125     if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
4126                          tca_policy, ta, ARRAY_SIZE(ta))) {
4127         VLOG_WARN_RL(&rl, "failed to parse class message");
     /* The class handle lives in the tcmsg header, not in an attribute. */
4132         struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
4133         *handlep = tc->tcm_handle;
4137         *options = ta[TCA_OPTIONS];
4141         const struct gnet_stats_queue *gsq;
4142         struct gnet_stats_basic gsb;
4144         static const struct nl_policy stats_policy[] = {
4145             [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
4146                                   .min_len = sizeof gsb },
4147             [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
4148                                   .min_len = sizeof *gsq },
4150         struct nlattr *sa[ARRAY_SIZE(stats_policy)];
4152         if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
4153                              sa, ARRAY_SIZE(sa))) {
4154             VLOG_WARN_RL(&rl, "failed to parse class stats");
4158         /* Alignment issues screw up the length of struct gnet_stats_basic on
4159          * some arch/bitsize combinations.  Newer versions of Linux have a
4160          * struct gnet_stats_basic_packed, but we can't depend on that.  The
4161          * easiest thing to do is just to make a copy. */
4162         memset(&gsb, 0, sizeof gsb);
4163         memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
4164                MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
4165         stats->tx_bytes = gsb.bytes;
4166         stats->tx_packets = gsb.packets;
4168         gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
4169         stats->tx_errors = gsq->drops;
     /* On the error path, zero the caller's stats so they are deterministic.
      * NOTE(review): the surrounding error-label lines are missing from this
      * listing. */
4179         memset(stats, 0, sizeof *stats);
4184 /* Queries the kernel for class with identifier 'handle' and parent 'parent'
4187 tc_query_class(const struct netdev *netdev,
4188                unsigned int handle, unsigned int parent,
4189                struct ofpbuf **replyp)
4191     struct ofpbuf request;
4192     struct tcmsg *tcmsg;
     /* NLM_F_ECHO asks the kernel to reply with the class description. */
4195     tcmsg = tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO, &request);
4199     tcmsg->tcm_handle = handle;
4200     tcmsg->tcm_parent = parent;
4202     error = tc_transact(&request, replyp);
     /* Log failures rate-limited, showing major:minor for both handles. */
4204         VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
4205                      netdev_get_name(netdev),
4206                      tc_get_major(handle), tc_get_minor(handle),
4207                      tc_get_major(parent), tc_get_minor(parent),
4208                      ovs_strerror(error));
4213 /* Equivalent to "tc class del dev <name> handle <handle>". */
4215 tc_delete_class(const struct netdev *netdev, unsigned int handle)
4217     struct ofpbuf request;
4218     struct tcmsg *tcmsg;
     /* NOTE(review): 'int error;' and the tc_make_request() failure check are
      * on lines missing from this listing. */
4221     tcmsg = tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
4225     tcmsg->tcm_handle = handle;
4226     tcmsg->tcm_parent = 0;
4228     error = tc_transact(&request, NULL);
4230         VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
4231                      netdev_get_name(netdev),
4232                      tc_get_major(handle), tc_get_minor(handle),
4233                      ovs_strerror(error));
4238 /* Equivalent to "tc qdisc del dev <name> root". */
4240 tc_del_qdisc(struct netdev *netdev_)
4242     struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4243     struct ofpbuf request;
4244     struct tcmsg *tcmsg;
4247     tcmsg = tc_make_request(netdev_, RTM_DELQDISC, 0, &request);
     /* 1:0 is the handle this module gives the root qdiscs it creates. */
4251     tcmsg->tcm_handle = tc_make_handle(1, 0);
4252     tcmsg->tcm_parent = TC_H_ROOT;
4254     error = tc_transact(&request, NULL);
4255     if (error == EINVAL) {
4256         /* EINVAL probably means that the default qdisc was in use, in which
4257          * case we've accomplished our purpose. */
     /* On success, also tear down the cached in-memory tc state. */
4260     if (!error && netdev->tc) {
4261         if (netdev->tc->ops->tc_destroy) {
4262             netdev->tc->ops->tc_destroy(netdev->tc);
4269 /* If 'netdev''s qdisc type and parameters are not yet known, queries the
4270  * kernel to determine what they are.  Returns 0 if successful, otherwise a
4271  * positive errno value. */
4273 tc_query_qdisc(const struct netdev *netdev_)
4275     struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4276     struct ofpbuf request, *qdisc;
4277     const struct tc_ops *ops;
4278     struct tcmsg *tcmsg;
     /* NOTE(review): declarations of 'kind', 'error', 'load_error' and the
      * early-return when netdev->tc is already populated are on lines missing
      * from this listing. */
4286     /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
4287      * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
4288      * 2.6.35 without that fix backported to it.
4290      * To avoid the OOPS, we must not make a request that would attempt to dump
4291      * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
4292      * few others.  There are a few ways that I can see to do this, but most of
4293      * them seem to be racy (and if you lose the race the kernel OOPSes).  The
4294      * technique chosen here is to assume that any non-default qdisc that we
4295      * create will have a class with handle 1:0.  The built-in qdiscs only have
4296      * a class with handle 0:0.
4298      * We could check for Linux 2.6.35+ and use a more straightforward method
4300     tcmsg = tc_make_request(netdev_, RTM_GETQDISC, NLM_F_ECHO, &request);
4304     tcmsg->tcm_handle = tc_make_handle(1, 0);
4305     tcmsg->tcm_parent = 0;
4307     /* Figure out what tc class to instantiate. */
4308     error = tc_transact(&request, &qdisc);
4312         error = tc_parse_qdisc(qdisc, &kind, NULL);
     /* Parse failure: treat the qdisc as an unknown ("other") type. */
4314             ops = &tc_ops_other;
4316             ops = tc_lookup_linux_name(kind);
4318                 static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
4319                 VLOG_INFO_RL(&rl2, "unknown qdisc \"%s\"", kind);
4321                 ops = &tc_ops_other;
4324     } else if (error == ENOENT) {
4325         /* Either it's a built-in qdisc, or it's a qdisc set up by some
4326          * other entity that doesn't have a handle 1:0.  We will assume
4327          * that it's the system default qdisc. */
4328         ops = &tc_ops_default;
4331         /* Who knows?  Maybe the device got deleted. */
4332         VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
4333                      netdev_get_name(netdev_), ovs_strerror(error));
4334         ops = &tc_ops_other;
4337     /* Instantiate it. */
4338     load_error = ops->tc_load(CONST_CAST(struct netdev *, netdev_), qdisc);
     /* tc_load() must set netdev->tc exactly when it succeeds. */
4339     ovs_assert((load_error == 0) == (netdev->tc != NULL));
4340     ofpbuf_delete(qdisc);
4342     return error ? error : load_error;
4345 /* Linux traffic control uses tables with 256 entries ("rtab" tables) to
4346    approximate the time to transmit packets of various lengths.  For an MTU of
4347    256 or less, each entry is exact; for an MTU of 257 through 512, each entry
4348    represents two possible packet lengths; for a MTU of 513 through 1024, four
4349    possible lengths; and so on.
4351    Returns, for the specified 'mtu', the number of bits that packet lengths
4352    need to be shifted right to fit within such a 256-entry table. */
4354 tc_calc_cell_log(unsigned int mtu)
     /* NOTE(review): the declaration of 'cell_log' and the "if (!mtu)" line
      * guarding this default appear to be missing from this listing. */
4359         mtu = ETH_PAYLOAD_MAX;
     /* Account for the L2 framing that the payload MTU excludes. */
4361     mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
     /* Find the smallest shift that brings 'mtu' under 256 table slots. */
4363     for (cell_log = 0; mtu >= 256; cell_log++) {
4370 /* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
4373 tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
4375     memset(rate, 0, sizeof *rate);
4376     rate->cell_log = tc_calc_cell_log(mtu);
4377     /* rate->overhead = 0; */           /* New in 2.6.24, not yet in some */
4378     /* rate->cell_align = 0; */         /* distro headers. */
     /* mpu = minimum packet unit; no frame is billed below Ethernet minimum. */
4379     rate->mpu = ETH_TOTAL_MIN;
     /* NOTE(review): the assignment of rate->rate from 'Bps' appears to be on
      * a line missing from this listing. */
4383 /* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
4384  * attribute of the specified "type".
4386  * See tc_calc_cell_log() above for a description of "rtab"s. */
4388 tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
     /* NOTE(review): declarations of 'rtab' and 'i' are on lines missing from
      * this listing. */
4393     rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
4394     for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
     /* Slot i covers packet lengths up to (i + 1) << cell_log bytes. */
4395         unsigned packet_size = (i + 1) << rate->cell_log;
4396         if (packet_size < rate->mpu) {
4397             packet_size = rate->mpu;
4399         rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
4403 /* Calculates the proper value of 'buffer' or 'cbuffer' in HTB options given a
4404  * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
4405  * burst size of 'burst_bytes'.  (If no value was requested, a 'burst_bytes' of
4408 tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
     /* Never allow less than one jiffy of buffering plus one full packet. */
4410     unsigned int min_burst = tc_buffer_per_jiffy(Bps) + mtu;
4411     return tc_bytes_to_ticks(Bps, MAX(burst_bytes, min_burst));
4414 /* Linux-only functions declared in netdev-linux.h  */
4416 /* Modifies the 'flag' bit in ethtool's flags field for 'netdev'.  If
4417  * 'enable' is true, the bit is set.  Otherwise, it is cleared. */
4419 netdev_linux_ethtool_set_flag(struct netdev *netdev, uint32_t flag,
4420                               const char *flag_name, bool enable)
4422     const char *netdev_name = netdev_get_name(netdev);
4423     struct ethtool_value evalue;
     /* NOTE(review): declarations of 'new_flags' and 'error', plus the error
      * checks between the three ethtool calls, are on lines missing from this
      * listing. */
4427     COVERAGE_INC(netdev_get_ethtool);
4428     memset(&evalue, 0, sizeof evalue);
     /* Step 1: read the current flags. */
4429     error = netdev_linux_do_ethtool(netdev_name,
4430                                     (struct ethtool_cmd *)&evalue,
4431                                     ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
4436     COVERAGE_INC(netdev_set_ethtool);
     /* Step 2: write the flags back with 'flag' set or cleared. */
4437     evalue.data = new_flags = (evalue.data & ~flag) | (enable ? flag : 0);
4438     error = netdev_linux_do_ethtool(netdev_name,
4439                                     (struct ethtool_cmd *)&evalue,
4440                                     ETHTOOL_SFLAGS, "ETHTOOL_SFLAGS");
4445     COVERAGE_INC(netdev_get_ethtool);
4446     memset(&evalue, 0, sizeof evalue);
     /* Step 3: read the flags again to verify the driver honored the write. */
4447     error = netdev_linux_do_ethtool(netdev_name,
4448                                     (struct ethtool_cmd *)&evalue,
4449                                     ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
4454     if (new_flags != evalue.data) {
4455         VLOG_WARN_RL(&rl, "attempt to %s ethtool %s flag on network "
4456                      "device %s failed", enable ? "enable" : "disable",
4457                      flag_name, netdev_name);
4464 /* Utility functions. */
4466 /* Copies 'src' into 'dst', performing format conversion in the process. */
4468 netdev_stats_from_rtnl_link_stats(struct netdev_stats *dst,
4469                                   const struct rtnl_link_stats *src)
     /* Field-for-field widening copy from the kernel's 32-bit rtnl counters
      * into OVS's netdev_stats. */
4471     dst->rx_packets = src->rx_packets;
4472     dst->tx_packets = src->tx_packets;
4473     dst->rx_bytes = src->rx_bytes;
4474     dst->tx_bytes = src->tx_bytes;
4475     dst->rx_errors = src->rx_errors;
4476     dst->tx_errors = src->tx_errors;
4477     dst->rx_dropped = src->rx_dropped;
4478     dst->tx_dropped = src->tx_dropped;
4479     dst->multicast = src->multicast;
4480     dst->collisions = src->collisions;
4481     dst->rx_length_errors = src->rx_length_errors;
4482     dst->rx_over_errors = src->rx_over_errors;
4483     dst->rx_crc_errors = src->rx_crc_errors;
4484     dst->rx_frame_errors = src->rx_frame_errors;
4485     dst->rx_fifo_errors = src->rx_fifo_errors;
4486     dst->rx_missed_errors = src->rx_missed_errors;
4487     dst->tx_aborted_errors = src->tx_aborted_errors;
4488     dst->tx_carrier_errors = src->tx_carrier_errors;
4489     dst->tx_fifo_errors = src->tx_fifo_errors;
4490     dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
4491     dst->tx_window_errors = src->tx_window_errors;
/* Fetches link statistics for 'ifindex' via an RTM_GETLINK request and
 * converts them into '*stats'.
 * NOTE(review): return-type line, 'int error;', and the error-path returns
 * between steps are missing from this listing; code kept byte-identical. */
4495 get_stats_via_netlink(int ifindex, struct netdev_stats *stats)
4497     /* Policy for RTNLGRP_LINK messages.
4499      * There are *many* more fields in these messages, but currently we only
4500      * care about these fields. */
4501     static const struct nl_policy rtnlgrp_link_policy[] = {
4502         [IFLA_IFNAME] = { .type = NL_A_STRING, .optional = false },
4503         [IFLA_STATS] = { .type = NL_A_UNSPEC, .optional = true,
4504                          .min_len = sizeof(struct rtnl_link_stats) },
4507     struct ofpbuf request;
4508     struct ofpbuf *reply;
4509     struct ifinfomsg *ifi;
4510     struct nlattr *attrs[ARRAY_SIZE(rtnlgrp_link_policy)];
     /* Build and send the RTM_GETLINK request for this ifindex. */
4513     ofpbuf_init(&request, 0);
4514     nl_msg_put_nlmsghdr(&request, sizeof *ifi, RTM_GETLINK, NLM_F_REQUEST);
4515     ifi = ofpbuf_put_zeros(&request, sizeof *ifi);
4516     ifi->ifi_family = PF_UNSPEC;
4517     ifi->ifi_index = ifindex;
4518     error = nl_transact(NETLINK_ROUTE, &request, &reply);
4519     ofpbuf_uninit(&request);
4524     if (!nl_policy_parse(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg),
4525                          rtnlgrp_link_policy,
4526                          attrs, ARRAY_SIZE(rtnlgrp_link_policy))) {
4527         ofpbuf_delete(reply);
4531     if (!attrs[IFLA_STATS]) {
4532         VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
4533         ofpbuf_delete(reply);
4537     netdev_stats_from_rtnl_link_stats(stats, nl_attr_get(attrs[IFLA_STATS]));
4539     ofpbuf_delete(reply);
/* Fallback statistics reader: parses the /proc/net/dev line for
 * 'netdev_name' into '*stats'.
 * NOTE(review): return type, locals ('stream', 'line', 'ln', 'devname'),
 * several sscanf destination lines, and the cleanup/returns are missing from
 * this listing; code kept byte-identical. */
4545 get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats)
4547     static const char fn[] = "/proc/net/dev";
4552     stream = fopen(fn, "r");
4554         VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, ovs_strerror(errno));
4559     while (fgets(line, sizeof line, stream)) {
4562 #define X64 "%"SCNu64
     /* /proc/net/dev rows carry 16 rx + 16 tx columns; we keep 15 of them
      * and skip the rest with "%*u". */
4565                    X64 X64 X64 X64 X64 X64 X64 "%*u"
4566                    X64 X64 X64 X64 X64 X64 X64 "%*u",
4572                    &stats->rx_fifo_errors,
4573                    &stats->rx_frame_errors,
4579                    &stats->tx_fifo_errors,
4581                    &stats->tx_carrier_errors) != 15) {
4582             VLOG_WARN_RL(&rl, "%s:%d: parse error", fn, ln);
4583         } else if (!strcmp(devname, netdev_name)) {
     /* Counters that /proc/net/dev does not break out are marked unknown. */
4584             stats->rx_length_errors = UINT64_MAX;
4585             stats->rx_over_errors = UINT64_MAX;
4586             stats->rx_crc_errors = UINT64_MAX;
4587             stats->rx_missed_errors = UINT64_MAX;
4588             stats->tx_aborted_errors = UINT64_MAX;
4589             stats->tx_heartbeat_errors = UINT64_MAX;
4590             stats->tx_window_errors = UINT64_MAX;
4596     VLOG_WARN_RL(&rl, "%s: no stats for %s", fn, netdev_name);
/* Reads the interface flags (IFF_*) for 'dev' via SIOCGIFFLAGS into
 * '*flags'.  NOTE(review): return type, 'struct ifreq ifr;', and the return
 * statement are on lines missing from this listing. */
4602 get_flags(const struct netdev *dev, unsigned int *flags)
4608     error = af_inet_ifreq_ioctl(dev->name, &ifr, SIOCGIFFLAGS, "SIOCGIFFLAGS");
4610     *flags = ifr.ifr_flags;
/* Writes interface flags 'flags' for device 'name' via SIOCSIFFLAGS.
 * NOTE(review): return type and 'struct ifreq ifr;' are on lines missing
 * from this listing. */
4616 set_flags(const char *name, unsigned int flags)
4620     ifr.ifr_flags = flags;
4621     return af_inet_ifreq_ioctl(name, &ifr, SIOCSIFFLAGS, "SIOCSIFFLAGS");
/* Looks up the kernel ifindex for 'netdev_name' via SIOCGIFINDEX.  Returns
 * the ifindex on success; on failure the (missing) error path presumably
 * returns a negative errno -- see get_ifindex(), which negates it. */
4625 do_get_ifindex(const char *netdev_name)
4630     ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4631     COVERAGE_INC(netdev_get_ifindex);
4633     error = af_inet_ioctl(SIOCGIFINDEX, &ifr);
4635         VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
4636                      netdev_name, ovs_strerror(error));
4639     return ifr.ifr_ifindex;
/* Returns 'netdev_''s ifindex through '*ifindexp', caching both the value
 * and any lookup error so the ioctl runs at most once per cache cycle. */
4643 get_ifindex(const struct netdev *netdev_, int *ifindexp)
4645     struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4647     if (!(netdev->cache_valid & VALID_IFINDEX)) {
4648         int ifindex = do_get_ifindex(netdev_get_name(netdev_));
     /* A negative result from do_get_ifindex() encodes -errno. */
4651             netdev->get_ifindex_error = -ifindex;
4652             netdev->ifindex = 0;
4654             netdev->get_ifindex_error = 0;
4655             netdev->ifindex = ifindex;
4657         netdev->cache_valid |= VALID_IFINDEX;
4660     *ifindexp = netdev->ifindex;
4661     return netdev->get_ifindex_error;
/* Reads the hardware (Ethernet) address of 'netdev_name' into 'ea' via
 * SIOCGIFHWADDR.  NOTE(review): return type, local declarations, and the
 * error/return lines are missing from this listing. */
4665 get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN])
4671     memset(&ifr, 0, sizeof ifr);
4672     ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4673     COVERAGE_INC(netdev_get_hwaddr);
4674     error = af_inet_ioctl(SIOCGIFHWADDR, &ifr);
4676         /* ENODEV probably means that a vif disappeared asynchronously and
4677          * hasn't been removed from the database yet, so reduce the log level
4678          * to INFO for that case. */
4679         VLOG(error == ENODEV ? VLL_INFO : VLL_ERR,
4680              "ioctl(SIOCGIFHWADDR) on %s device failed: %s",
4681              netdev_name, ovs_strerror(error));
     /* Only Ethernet (or unspecified) address families are meaningful here. */
4684     hwaddr_family = ifr.ifr_hwaddr.sa_family;
4685     if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER) {
4686         VLOG_WARN("%s device has unknown hardware address family %d",
4687                   netdev_name, hwaddr_family);
4689     memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
/* Sets the hardware (Ethernet) address of 'netdev_name' to 'mac' via
 * SIOCSIFHWADDR.  NOTE(review): return type, locals, and the return line are
 * missing from this listing. */
4694 set_etheraddr(const char *netdev_name,
4695               const uint8_t mac[ETH_ADDR_LEN])
4700     memset(&ifr, 0, sizeof ifr);
4701     ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4702     ifr.ifr_hwaddr.sa_family = ARPHRD_ETHER;
4703     memcpy(ifr.ifr_hwaddr.sa_data, mac, ETH_ADDR_LEN);
4704     COVERAGE_INC(netdev_set_hwaddr);
4705     error = af_inet_ioctl(SIOCSIFHWADDR, &ifr);
4707         VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
4708                  netdev_name, ovs_strerror(error));
/* Issues ethtool command 'cmd' (named 'cmd_name' for logging) on device
 * 'name', with 'ecmd' as the in/out command buffer.  NOTE(review): return
 * type, the ecmd->cmd assignment, and the return paths are on lines missing
 * from this listing. */
4714 netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
4715                         int cmd, const char *cmd_name)
4720     memset(&ifr, 0, sizeof ifr);
4721     ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
     /* The ethtool ioctl passes its command struct via ifr_data. */
4722     ifr.ifr_data = (caddr_t) ecmd;
4725     error = af_inet_ioctl(SIOCETHTOOL, &ifr);
4727         if (error != EOPNOTSUPP) {
4728             VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
4729                          "failed: %s", cmd_name, name, ovs_strerror(error));
4731             /* The device doesn't support this operation.  That's pretty
4732              * common, so there's no point in logging anything. */
/* Runs ioctl 'cmd' (named 'cmd_name' for logging) on 'netdev' and, on
 * success, extracts the IPv4 address from the returned sockaddr into '*ip'.
 * NOTE(review): return type, 'struct ifreq ifr;', and the return line are
 * missing from this listing. */
4739 netdev_linux_get_ipv4(const struct netdev *netdev, struct in_addr *ip,
4740                       int cmd, const char *cmd_name)
4745     ifr.ifr_addr.sa_family = AF_INET;
4746     error = af_inet_ifreq_ioctl(netdev_get_name(netdev), &ifr, cmd, cmd_name);
     /* ALIGNED_CAST avoids the strict-alignment issue of casting ifr_addr. */
4748         const struct sockaddr_in *sin = ALIGNED_CAST(struct sockaddr_in *,
4750         *ip = sin->sin_addr;
4755 /* Returns an AF_PACKET raw socket or a negative errno value. */
4757 af_packet_sock(void)
4759 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
4762 if (ovsthread_once_start(&once)) {
4763 sock = socket(AF_PACKET, SOCK_RAW, 0);
4765 int error = set_nonblocking(sock);
4772 VLOG_ERR("failed to create packet socket: %s",
4773 ovs_strerror(errno));
4775 ovsthread_once_done(&once);