2 * Copyright (c) 2009, 2010, 2011, 2012, 2013 Nicira, Inc.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
19 #include "netdev-linux.h"
23 #include <arpa/inet.h>
25 #include <linux/filter.h>
26 #include <linux/gen_stats.h>
27 #include <linux/if_ether.h>
28 #include <linux/if_tun.h>
29 #include <linux/types.h>
30 #include <linux/ethtool.h>
31 #include <linux/mii.h>
32 #include <linux/pkt_cls.h>
33 #include <linux/pkt_sched.h>
34 #include <linux/rtnetlink.h>
35 #include <linux/sockios.h>
36 #include <linux/version.h>
37 #include <sys/types.h>
38 #include <sys/ioctl.h>
39 #include <sys/socket.h>
40 #include <netpacket/packet.h>
42 #include <net/if_arp.h>
43 #include <net/if_packet.h>
44 #include <net/route.h>
45 #include <netinet/in.h>
52 #include "dpif-linux.h"
53 #include "dynamic-string.h"
54 #include "fatal-signal.h"
57 #include "netdev-provider.h"
58 #include "netdev-vport.h"
59 #include "netlink-notifier.h"
60 #include "netlink-socket.h"
63 #include "openflow/openflow.h"
65 #include "poll-loop.h"
66 #include "rtnetlink-link.h"
68 #include "socket-util.h"
71 #include "unaligned.h"
/* Logging module name for this file and coverage counters that count how
 * often each of the corresponding netdev operations is invoked. */
74 VLOG_DEFINE_THIS_MODULE(netdev_linux);
76 COVERAGE_DEFINE(netdev_set_policing);
77 COVERAGE_DEFINE(netdev_arp_lookup);
78 COVERAGE_DEFINE(netdev_get_ifindex);
79 COVERAGE_DEFINE(netdev_get_hwaddr);
80 COVERAGE_DEFINE(netdev_set_hwaddr);
81 COVERAGE_DEFINE(netdev_get_ethtool);
82 COVERAGE_DEFINE(netdev_set_ethtool);
85 /* These were introduced in Linux 2.6.14, so they might be missing if we have
87 #ifndef ADVERTISED_Pause
88 #define ADVERTISED_Pause (1 << 13)
90 #ifndef ADVERTISED_Asym_Pause
91 #define ADVERTISED_Asym_Pause (1 << 14)
94 /* These were introduced in Linux 2.6.24, so they might be missing if we
95 * have old headers. */
96 #ifndef ETHTOOL_GFLAGS
97 #define ETHTOOL_GFLAGS 0x00000025 /* Get flags bitmap(ethtool_value) */
99 #ifndef ETHTOOL_SFLAGS
100 #define ETHTOOL_SFLAGS 0x00000026 /* Set flags bitmap(ethtool_value) */
103 /* This was introduced in Linux 2.6.25, so it might be missing if we have old
106 #define TC_RTAB_SIZE 1024
110 VALID_IFINDEX = 1 << 0,
111 VALID_ETHERADDR = 1 << 1,
115 VALID_POLICING = 1 << 5,
116 VALID_VPORT_STAT_ERROR = 1 << 6,
117 VALID_DRVINFO = 1 << 7,
118 VALID_FEATURES = 1 << 8,
121 /* Traffic control. */
123 /* An instance of a traffic control class. Always associated with a particular
126 * Each TC implementation subclasses this with whatever additional data it
129 const struct tc_ops *ops;
130 struct hmap queues; /* Contains "struct tc_queue"s.
131 * Read by generic TC layer.
132 * Written only by TC implementation. */
135 #define TC_INITIALIZER(TC, OPS) { OPS, HMAP_INITIALIZER(&(TC)->queues) }
137 /* One traffic control queue.
139 * Each TC implementation subclasses this with whatever additional data it
142 struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
143 unsigned int queue_id; /* OpenFlow queue ID. */
144 long long int created; /* Time queue was created, in msecs. */
147 /* A particular kind of traffic control. Each implementation generally maps to
148 * one particular Linux qdisc class.
150 * The functions below return 0 if successful or a positive errno value on
151 * failure, except where otherwise noted. All of them must be provided, except
152 * where otherwise noted. */
154 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
155 * This is null for tc_ops_default and tc_ops_other, for which there are no
156 * appropriate values. */
157 const char *linux_name;
159 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
160 const char *ovs_name;
162 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
163 * queues. The queues are numbered 0 through n_queues - 1. */
164 unsigned int n_queues;
166 /* Called to install this TC class on 'netdev'. The implementation should
167 * make the Netlink calls required to set up 'netdev' with the right qdisc
168 * and configure it according to 'details'. The implementation may assume
169 * that the current qdisc is the default; that is, there is no need for it
170 * to delete the current qdisc before installing itself.
172 * The contents of 'details' should be documented as valid for 'ovs_name'
173 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
174 * (which is built as ovs-vswitchd.conf.db(8)).
176 * This function must return 0 if and only if it sets 'netdev->tc' to an
177 * initialized 'struct tc'.
179 * (This function is null for tc_ops_other, which cannot be installed. For
180 * other TC classes it should always be nonnull.) */
181 int (*tc_install)(struct netdev *netdev, const struct smap *details);
183 /* Called when the netdev code determines (through a Netlink query) that
184 * this TC class's qdisc is installed on 'netdev', but we didn't install
185 * it ourselves and so don't know any of the details.
187 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
188 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
189 * implementation should parse the other attributes of 'nlmsg' as
190 * necessary to determine its configuration. If necessary it should also
191 * use Netlink queries to determine the configuration of queues on
194 * This function must return 0 if and only if it sets 'netdev->tc' to an
195 * initialized 'struct tc'. */
196 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
198 /* Destroys the data structures allocated by the implementation as part of
199 * 'tc'. (This includes destroying 'tc->queues' by calling
202 * The implementation should not need to perform any Netlink calls. If
203 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
204 * (But it may not be desirable.)
206 * This function may be null if 'tc' is trivial. */
207 void (*tc_destroy)(struct tc *tc);
209 /* Retrieves details of 'netdev->tc' configuration into 'details'.
211 * The implementation should not need to perform any Netlink calls, because
212 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
213 * cached the configuration.
215 * The contents of 'details' should be documented as valid for 'ovs_name'
216 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
217 * (which is built as ovs-vswitchd.conf.db(8)).
219 * This function may be null if 'tc' is not configurable.
221 int (*qdisc_get)(const struct netdev *netdev, struct smap *details);
223 /* Reconfigures 'netdev->tc' according to 'details', performing any
224 * required Netlink calls to complete the reconfiguration.
226 * The contents of 'details' should be documented as valid for 'ovs_name'
227 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
228 * (which is built as ovs-vswitchd.conf.db(8)).
230 * This function may be null if 'tc' is not configurable.
232 int (*qdisc_set)(struct netdev *, const struct smap *details);
234 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
235 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
237 * The contents of 'details' should be documented as valid for 'ovs_name'
238 * in the "other_config" column in the "Queue" table in
239 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
241 * The implementation should not need to perform any Netlink calls, because
242 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
243 * cached the queue configuration.
245 * This function may be null if 'tc' does not have queues ('n_queues' is
247 int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
248 struct smap *details);
250 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
251 * 'details', performing any required Netlink calls to complete the
252 * reconfiguration. The caller ensures that 'queue_id' is less than
255 * The contents of 'details' should be documented as valid for 'ovs_name'
256 * in the "other_config" column in the "Queue" table in
257 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
259 * This function may be null if 'tc' does not have queues or its queues are
260 * not configurable. */
261 int (*class_set)(struct netdev *, unsigned int queue_id,
262 const struct smap *details);
264 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
265 * tc_queue's within 'netdev->tc->queues'.
267 * This function may be null if 'tc' does not have queues or its queues
268 * cannot be deleted. */
269 int (*class_delete)(struct netdev *, struct tc_queue *queue);
271 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
272 * 'struct tc_queue's within 'netdev->tc->queues'.
274 * On success, initializes '*stats'.
276 * This function may be null if 'tc' does not have queues or if it cannot
277 * report queue statistics. */
278 int (*class_get_stats)(const struct netdev *netdev,
279 const struct tc_queue *queue,
280 struct netdev_queue_stats *stats);
282 /* Extracts queue stats from 'nlmsg', which is a response to a
283 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
285 * This function may be null if 'tc' does not have queues or if it cannot
286 * report queue statistics. */
287 int (*class_dump_stats)(const struct netdev *netdev,
288 const struct ofpbuf *nlmsg,
289 netdev_dump_queue_stats_cb *cb, void *aux);
/* Initializes 'tc' as a generic traffic-control instance backed by 'ops',
 * initializing its (initially empty) queue map.
 * NOTE(review): interior lines are elided in this excerpt; presumably
 * 'tc->ops' is also assigned here — confirm against the full source. */
293 tc_init(struct tc *tc, const struct tc_ops *ops)
296 hmap_init(&tc->queues);
/* Releases the generic parts of 'tc': destroys its queue hmap.  TC
 * implementations call this after freeing their own per-queue data. */
300 tc_destroy(struct tc *tc)
302 hmap_destroy(&tc->queues);
305 static const struct tc_ops tc_ops_htb;
306 static const struct tc_ops tc_ops_hfsc;
307 static const struct tc_ops tc_ops_default;
308 static const struct tc_ops tc_ops_other;
310 static const struct tc_ops *const tcs[] = {
311 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
312 &tc_ops_hfsc, /* Hierarchical fair service curve. */
313 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
314 &tc_ops_other, /* Some other qdisc. */
318 static unsigned int tc_make_handle(unsigned int major, unsigned int minor);
319 static unsigned int tc_get_major(unsigned int handle);
320 static unsigned int tc_get_minor(unsigned int handle);
322 static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
323 static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
324 static unsigned int tc_buffer_per_jiffy(unsigned int rate);
326 static struct tcmsg *tc_make_request(const struct netdev *, int type,
327 unsigned int flags, struct ofpbuf *);
328 static int tc_transact(struct ofpbuf *request, struct ofpbuf **replyp);
329 static int tc_add_del_ingress_qdisc(struct netdev *netdev, bool add);
330 static int tc_add_policer(struct netdev *netdev, int kbits_rate,
333 static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
334 struct nlattr **options);
335 static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
336 struct nlattr **options,
337 struct netdev_queue_stats *);
338 static int tc_query_class(const struct netdev *,
339 unsigned int handle, unsigned int parent,
340 struct ofpbuf **replyp);
341 static int tc_delete_class(const struct netdev *, unsigned int handle);
343 static int tc_del_qdisc(struct netdev *netdev);
344 static int tc_query_qdisc(const struct netdev *netdev);
346 static int tc_calc_cell_log(unsigned int mtu);
347 static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
348 static void tc_put_rtab(struct ofpbuf *, uint16_t type,
349 const struct tc_ratespec *rate);
350 static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
/* State for a Linux network device.  All members below 'mutex' are protected
 * by it.  Fields under the "on demand" banner are lazily populated caches;
 * a field is meaningful only while its VALID_* bit is set in 'cache_valid'
 * (see the VALID_* enum above). */
352 struct netdev_linux {
355 /* Protects all members below. */
356 struct ovs_mutex mutex;
358 unsigned int cache_valid;
359 unsigned int change_seq;
361 bool miimon; /* Link status of last poll. */
362 long long int miimon_interval; /* Miimon Poll rate. Disabled if <= 0. */
363 struct timer miimon_timer;
365 /* The following are figured out "on demand" only. They are only valid
366 * when the corresponding VALID_* bit in 'cache_valid' is set. */
368 uint8_t etheraddr[ETH_ADDR_LEN];
369 struct in_addr address, netmask;
372 unsigned int ifi_flags;
373 long long int carrier_resets;
374 uint32_t kbits_rate; /* Policing data. */
375 uint32_t kbits_burst;
376 int vport_stats_error; /* Cached error code from vport_get_stats().
377 0 or an errno value. */
378 int netdev_mtu_error; /* Cached error code from SIOCGIFMTU or SIOCSIFMTU. */
379 int ether_addr_error; /* Cached error code from set/get etheraddr. */
380 int netdev_policing_error; /* Cached error code from set policing. */
381 int get_features_error; /* Cached error code from ETHTOOL_GSET. */
382 int get_ifindex_error; /* Cached error code from SIOCGIFINDEX. */
384 enum netdev_features current; /* Cached from ETHTOOL_GSET. */
385 enum netdev_features advertised; /* Cached from ETHTOOL_GSET. */
386 enum netdev_features supported; /* Cached from ETHTOOL_GSET. */
388 struct ethtool_drvinfo drvinfo; /* Cached from ETHTOOL_GDRVINFO. */
391 /* For devices of class netdev_tap_class only (tap_fd etc. elided here). */
395 struct netdev_rx_linux {
401 /* This is set pretty low because we probably won't learn anything from the
402 * additional log messages. */
403 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
405 static void netdev_linux_run(void);
407 static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
408 int cmd, const char *cmd_name);
409 static int netdev_linux_get_ipv4(const struct netdev *, struct in_addr *,
410 int cmd, const char *cmd_name);
411 static int get_flags(const struct netdev *, unsigned int *flags);
412 static int set_flags(const char *, unsigned int flags);
413 static int do_get_ifindex(const char *netdev_name);
414 static int get_ifindex(const struct netdev *, int *ifindexp);
415 static int do_set_addr(struct netdev *netdev,
416 int ioctl_nr, const char *ioctl_name,
417 struct in_addr addr);
418 static int get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN]);
419 static int set_etheraddr(const char *netdev_name, const uint8_t[ETH_ADDR_LEN]);
420 static int get_stats_via_netlink(int ifindex, struct netdev_stats *stats);
421 static int get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats);
422 static int af_packet_sock(void);
423 static void netdev_linux_miimon_run(void);
424 static void netdev_linux_miimon_wait(void);
/* Returns true if 'netdev_class' is one of the Linux-backed netdev classes,
 * identified by its 'run' callback being netdev_linux_run (all such classes
 * share it). */
427 is_netdev_linux_class(const struct netdev_class *netdev_class)
429 return netdev_class->run == netdev_linux_run;
/* Returns true if 'netdev' is a tap device (class netdev_tap_class). */
433 is_tap_netdev(const struct netdev *netdev)
435 return netdev_get_class(netdev) == &netdev_tap_class;
/* Downcasts generic 'netdev' to its netdev_linux container.  Asserts that
 * the class is in fact a Linux-backed one, so a miscast fails loudly. */
438 static struct netdev_linux *
439 netdev_linux_cast(const struct netdev *netdev)
441 ovs_assert(is_netdev_linux_class(netdev_get_class(netdev)));
443 return CONTAINER_OF(netdev, struct netdev_linux, up);
/* Downcasts generic 'rx' to its netdev_rx_linux container, asserting that
 * the owning netdev belongs to a Linux-backed class. */
446 static struct netdev_rx_linux *
447 netdev_rx_linux_cast(const struct netdev_rx *rx)
449 ovs_assert(is_netdev_linux_class(netdev_get_class(rx->netdev)));
450 return CONTAINER_OF(rx, struct netdev_rx_linux, up);
453 static void netdev_linux_update(struct netdev_linux *netdev,
454 const struct rtnetlink_link_change *)
455 OVS_REQUIRES(netdev->mutex);
456 static void netdev_linux_changed(struct netdev_linux *netdev,
457 unsigned int ifi_flags, unsigned int mask)
458 OVS_REQUIRES(netdev->mutex);
460 /* Returns a NETLINK_ROUTE socket listening for RTNLGRP_LINK changes, or NULL
461 * if no such socket could be created. */
462 static struct nl_sock *
463 netdev_linux_notify_sock(void)
/* The socket is created exactly once, on first call, guarded by
 * ovsthread_once; on multicast-join failure the socket is destroyed
 * (and presumably 'sock' reset to NULL — that line is elided here). */
465 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
466 static struct nl_sock *sock;
468 if (ovsthread_once_start(&once)) {
471 error = nl_sock_create(NETLINK_ROUTE, &sock);
473 error = nl_sock_join_mcgroup(sock, RTNLGRP_LINK);
475 nl_sock_destroy(sock);
479 ovsthread_once_done(&once);
/* Periodic 'run' callback shared by all Linux netdev classes: polls miimon
 * timers, then drains pending rtnetlink link-change notifications and pushes
 * each change into the matching open netdev.  On ENOBUFS (notifications were
 * dropped by the kernel) it falls back to refreshing the flags of every open
 * device and invalidating all of their caches. */
486 netdev_linux_run(void)
488 struct nl_sock *sock;
491 netdev_linux_miimon_run();
493 sock = netdev_linux_notify_sock();
499 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
500 uint64_t buf_stub[4096 / 8];
503 ofpbuf_use_stub(&buf, buf_stub, sizeof buf_stub);
504 error = nl_sock_recv(sock, &buf, false);
506 struct rtnetlink_link_change change;
508 if (rtnetlink_link_parse(&buf, &change)) {
509 struct netdev *netdev_ = netdev_from_name(change.ifname);
510 if (netdev_ && is_netdev_linux_class(netdev_->netdev_class)) {
511 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
513 ovs_mutex_lock(&netdev->mutex);
514 netdev_linux_update(netdev, &change);
515 ovs_mutex_unlock(&netdev->mutex);
517 netdev_close(netdev_);
520 } else if (error == ENOBUFS) {
521 struct shash device_shash;
522 struct shash_node *node;
526 shash_init(&device_shash);
527 netdev_get_devices(&netdev_linux_class, &device_shash);
528 SHASH_FOR_EACH (node, &device_shash) {
529 struct netdev *netdev_ = node->data;
530 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
/* Mask of 0 invalidates every cached field on this device. */
533 ovs_mutex_lock(&netdev->mutex);
534 get_flags(netdev_, &flags);
535 netdev_linux_changed(netdev, flags, 0);
536 ovs_mutex_unlock(&netdev->mutex);
538 netdev_close(netdev_);
540 shash_destroy(&device_shash);
541 } else if (error != EAGAIN) {
542 VLOG_WARN_RL(&rl, "error reading or parsing netlink (%s)",
543 ovs_strerror(error));
/* 'wait' callback: arranges for the next poll_block() to wake up when a
 * miimon timer expires or the rtnetlink notification socket is readable. */
550 netdev_linux_wait(void)
552 struct nl_sock *sock;
554 netdev_linux_miimon_wait();
555 sock = netdev_linux_notify_sock();
557 nl_sock_wait(sock, POLLIN);
/* Records that 'dev' changed: bumps the carrier-reset counter when
 * IFF_RUNNING toggled, stores the new interface flags, and keeps only the
 * cache_valid bits present in 'mask' (so mask==0 invalidates everything).
 * Caller must hold dev->mutex. */
562 netdev_linux_changed(struct netdev_linux *dev,
563 unsigned int ifi_flags, unsigned int mask)
564 OVS_REQUIRES(dev->mutex)
567 if (!dev->change_seq) {
571 if ((dev->ifi_flags ^ ifi_flags) & IFF_RUNNING) {
572 dev->carrier_resets++;
574 dev->ifi_flags = ifi_flags;
576 dev->cache_valid &= mask;
/* Applies an rtnetlink link-change message to 'dev'.  For RTM_NEWLINK the
 * MTU, Ethernet address, and ifindex carried in 'change' are copied straight
 * into the cache (marking them valid and clearing their cached errors);
 * otherwise only the flag change is recorded.  Caller holds dev->mutex. */
580 netdev_linux_update(struct netdev_linux *dev,
581 const struct rtnetlink_link_change *change)
582 OVS_REQUIRES(dev->mutex)
584 if (change->nlmsg_type == RTM_NEWLINK) {
586 netdev_linux_changed(dev, change->ifi_flags, VALID_DRVINFO);
588 /* Update netdev from rtnl-change msg. */
590 dev->mtu = change->mtu;
591 dev->cache_valid |= VALID_MTU;
592 dev->netdev_mtu_error = 0;
/* An all-zero address in the message means "no address reported". */
595 if (!eth_addr_is_zero(change->addr)) {
596 memcpy(dev->etheraddr, change->addr, ETH_ADDR_LEN);
597 dev->cache_valid |= VALID_ETHERADDR;
598 dev->ether_addr_error = 0;
601 dev->ifindex = change->ifi_index;
602 dev->cache_valid |= VALID_IFINDEX;
603 dev->get_ifindex_error = 0;
606 netdev_linux_changed(dev, change->ifi_flags, 0);
/* 'alloc' callback: allocates a zeroed netdev_linux and returns its embedded
 * generic netdev (the return of '&netdev->up' is elided in this excerpt). */
610 static struct netdev *
611 netdev_linux_alloc(void)
613 struct netdev_linux *netdev = xzalloc(sizeof *netdev);
/* Construction steps shared by system, internal, and tap devices:
 * initializes the per-device mutex and the change sequence number. */
618 netdev_linux_common_construct(struct netdev_linux *netdev)
620 ovs_mutex_init(&netdev->mutex, PTHREAD_MUTEX_NORMAL);
621 netdev->change_seq = 1;
624 /* Creates system and internal devices.  Queries the kernel for the device's
 * flags; ENODEV is fatal for system devices but deliberately tolerated for
 * "internal" devices, which are created in the kernel only later. */
626 netdev_linux_construct(struct netdev *netdev_)
628 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
631 netdev_linux_common_construct(netdev);
633 error = get_flags(&netdev->up, &netdev->ifi_flags);
634 if (error == ENODEV) {
635 if (netdev->up.netdev_class != &netdev_internal_class) {
636 /* The device does not exist, so don't allow it to be opened. */
639 /* "Internal" netdevs have to be created as netdev objects before
640 * they exist in the kernel, because creating them in the kernel
641 * happens by passing a netdev object to dpif_port_add().
642 * Therefore, ignore the error. */
649 /* For most types of netdevs we open the device for each call of
650 * netdev_open(). However, this is not the case with tap devices,
651 * since it is only possible to open the device once. In this
652 * situation we share a single file descriptor, and consequently
653 * buffers, across all readers. Therefore once data is read it will
654 * be unavailable to other reads for tap devices. */
656 netdev_linux_construct_tap(struct netdev *netdev_)
658 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
659 static const char tap_dev[] = "/dev/net/tun";
660 const char *name = netdev_->name;
664 netdev_linux_common_construct(netdev);
666 /* Open tap device. */
667 netdev->tap_fd = open(tap_dev, O_RDWR);
668 if (netdev->tap_fd < 0) {
670 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, ovs_strerror(error));
/* TUNSETIFF with IFF_TAP|IFF_NO_PI: raw Ethernet frames, no packet-info
 * header, interface named after the netdev. */
674 /* Create tap device. */
675 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
676 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
677 if (ioctl(netdev->tap_fd, TUNSETIFF, &ifr) == -1) {
678 VLOG_WARN("%s: creating tap device failed: %s", name,
679 ovs_strerror(errno));
684 /* Make non-blocking. */
685 error = set_nonblocking(netdev->tap_fd);
/* Error path: close the fd so it is not leaked. */
693 close(netdev->tap_fd);
/* 'destruct' callback: tears down any installed traffic-control state via
 * its tc_destroy hook, closes the shared tap fd for tap devices, and
 * destroys the per-device mutex. */
698 netdev_linux_destruct(struct netdev *netdev_)
700 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
702 if (netdev->tc && netdev->tc->ops->tc_destroy) {
703 netdev->tc->ops->tc_destroy(netdev->tc);
706 if (netdev_get_class(netdev_) == &netdev_tap_class
707 && netdev->tap_fd >= 0)
709 close(netdev->tap_fd);
712 ovs_mutex_destroy(&netdev->mutex);
716 netdev_linux_dealloc(struct netdev *netdev_)
718 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
/* rx 'alloc' callback: allocates a zeroed netdev_rx_linux and returns its
 * embedded generic netdev_rx (the return statement is elided here). */
722 static struct netdev_rx *
723 netdev_linux_rx_alloc(void)
725 struct netdev_rx_linux *rx = xzalloc(sizeof *rx);
/* rx 'construct' callback.  For tap devices, reuses the device's shared tap
 * fd.  For all other devices, creates a nonblocking AF_PACKET raw socket,
 * binds it to the device's ifindex, and attaches a BPF filter that accepts
 * only inbound packets (so locally transmitted frames are not looped back
 * to this receiver).  Holds the device mutex for the whole setup. */
730 netdev_linux_rx_construct(struct netdev_rx *rx_)
732 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
733 struct netdev *netdev_ = rx->up.netdev;
734 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
737 ovs_mutex_lock(&netdev->mutex);
738 rx->is_tap = is_tap_netdev(netdev_);
740 rx->fd = netdev->tap_fd;
742 struct sockaddr_ll sll;
744 /* Result of tcpdump -dd inbound */
745 static const struct sock_filter filt[] = {
746 { 0x28, 0, 0, 0xfffff004 }, /* ldh [0] */
747 { 0x15, 0, 1, 0x00000004 }, /* jeq #4 jt 2 jf 3 */
748 { 0x6, 0, 0, 0x00000000 }, /* ret #0 */
749 { 0x6, 0, 0, 0x0000ffff } /* ret #65535 */
751 static const struct sock_fprog fprog = {
752 ARRAY_SIZE(filt), (struct sock_filter *) filt
755 /* Create file descriptor. */
756 rx->fd = socket(PF_PACKET, SOCK_RAW, 0);
759 VLOG_ERR("failed to create raw socket (%s)", ovs_strerror(error));
763 /* Set non-blocking mode. */
764 error = set_nonblocking(rx->fd);
769 /* Get ethernet device index. */
770 error = get_ifindex(&netdev->up, &ifindex);
775 /* Bind to specific ethernet device. */
776 memset(&sll, 0, sizeof sll);
777 sll.sll_family = AF_PACKET;
778 sll.sll_ifindex = ifindex;
779 sll.sll_protocol = (OVS_FORCE unsigned short int) htons(ETH_P_ALL);
780 if (bind(rx->fd, (struct sockaddr *) &sll, sizeof sll) < 0) {
782 VLOG_ERR("%s: failed to bind raw socket (%s)",
783 netdev_get_name(netdev_), ovs_strerror(error));
787 /* Filter for only inbound packets. */
788 error = setsockopt(rx->fd, SOL_SOCKET, SO_ATTACH_FILTER, &fprog,
792 VLOG_ERR("%s: failed to attach filter (%s)",
793 netdev_get_name(netdev_), ovs_strerror(error));
/* Success and error paths both release the mutex (error cleanup lines
 * are elided in this excerpt). */
797 ovs_mutex_unlock(&netdev->mutex);
805 ovs_mutex_unlock(&netdev->mutex);
/* rx 'destruct' callback.  NOTE(review): the body that closes 'rx->fd' (for
 * non-tap receivers) is elided in this excerpt — confirm in full source. */
810 netdev_linux_rx_destruct(struct netdev_rx *rx_)
812 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
/* rx 'dealloc' callback: frees the container allocated by
 * netdev_linux_rx_alloc() (the free() call is elided in this excerpt). */
820 netdev_linux_rx_dealloc(struct netdev_rx *rx_)
822 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
828 netdev_linux_rx_recv(struct netdev_rx *rx_, void *data, size_t size)
830 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
835 ? read(rx->fd, data, size)
836 : recv(rx->fd, data, size, MSG_TRUNC));
837 } while (retval < 0 && errno == EINTR);
840 return retval > size ? -EMSGSIZE : retval;
842 if (errno != EAGAIN) {
843 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
844 ovs_strerror(errno), netdev_rx_get_name(rx_));
/* rx 'wait' callback: wakes the next poll_block() when rx's fd is readable. */
851 netdev_linux_rx_wait(struct netdev_rx *rx_)
853 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
854 poll_fd_wait(rx->fd, POLLIN);
/* rx 'drain' callback: discards all packets waiting on the receiver.  For
 * tap fds it reads and discards up to the device's tx queue length of
 * packets (SIOCGIFTXQLEN); for AF_PACKET sockets it flushes the socket
 * receive buffer instead. */
858 netdev_linux_rx_drain(struct netdev_rx *rx_)
860 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
863 int error = af_inet_ifreq_ioctl(netdev_rx_get_name(rx_), &ifr,
864 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
868 drain_fd(rx->fd, ifr.ifr_qlen);
871 return drain_rcvbuf(rx->fd);
875 /* Sends 'buffer' on 'netdev'. Returns 0 if successful, otherwise a positive
876 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
877 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
878 * the packet is too big or too small to transmit on the device.
880 * The caller retains ownership of 'buffer' in all cases.
882 * The kernel maintains a packet transmission queue, so the caller is not
883 * expected to do additional queuing of packets. */
885 netdev_linux_send(struct netdev *netdev_, const void *data, size_t size)
890 if (!is_tap_netdev(netdev_)) {
891 /* Use our AF_PACKET socket to send to this device. */
892 struct sockaddr_ll sll;
898 sock = af_packet_sock();
903 ifindex = netdev_get_ifindex(netdev_);
908 /* We don't bother setting most fields in sockaddr_ll because the
909 * kernel ignores them for SOCK_RAW. */
910 memset(&sll, 0, sizeof sll);
911 sll.sll_family = AF_PACKET;
912 sll.sll_ifindex = ifindex;
/* Single-iovec scatter/gather message addressed to the interface. */
914 iov.iov_base = CONST_CAST(void *, data);
918 msg.msg_namelen = sizeof sll;
921 msg.msg_control = NULL;
922 msg.msg_controllen = 0;
925 retval = sendmsg(sock, &msg, 0);
927 /* Use the tap fd to send to this device. This is essential for
928 * tap devices, because packets sent to a tap device with an
929 * AF_PACKET socket will loop back to be *received* again on the
930 * tap device. This doesn't occur on other interface types
931 * because we attach a socket filter to the rx socket. */
932 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
934 retval = write(netdev->tap_fd, data, size);
938 /* The Linux AF_PACKET implementation never blocks waiting for room
939 * for packets, instead returning ENOBUFS. Translate this into
940 * EAGAIN for the caller. */
941 if (errno == ENOBUFS) {
943 } else if (errno == EINTR) {
945 } else if (errno != EAGAIN) {
946 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
947 netdev_get_name(netdev_), ovs_strerror(errno));
950 } else if (retval != size) {
951 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%zd bytes of "
952 "%zu) on %s", retval, size, netdev_get_name(netdev_));
960 /* Registers with the poll loop to wake up from the next call to poll_block()
961 * when the packet transmission queue has sufficient room to transmit a packet
962 * with netdev_send().
964 * The kernel maintains a packet transmission queue, so the client is not
965 * expected to do additional queuing of packets. Thus, this function is
966 * unlikely to ever be used. It is included for completeness. */
968 netdev_linux_send_wait(struct netdev *netdev)
970 if (is_tap_netdev(netdev)) {
971 /* TAP devices always accept packets, so wake up immediately. */
972 poll_immediate_wake();
976 /* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
977 * otherwise a positive errno value.
 *
 * Uses the cached address to short-circuit when 'mac' is already current (or
 * a previous attempt cached an error); only 0 and ENODEV results are cached. */
979 netdev_linux_set_etheraddr(struct netdev *netdev_,
980 const uint8_t mac[ETH_ADDR_LEN])
982 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
983 struct netdev_saved_flags *sf = NULL;
986 ovs_mutex_lock(&netdev->mutex);
988 if (netdev->cache_valid & VALID_ETHERADDR) {
989 error = netdev->ether_addr_error;
990 if (error || eth_addr_equals(netdev->etheraddr, mac)) {
993 netdev->cache_valid &= ~VALID_ETHERADDR;
996 /* Tap devices must be brought down before setting the address. */
997 if (is_tap_netdev(netdev_)) {
998 netdev_turn_flags_off(netdev_, NETDEV_UP, &sf);
1000 error = set_etheraddr(netdev_get_name(netdev_), mac);
1001 if (!error || error == ENODEV) {
1002 netdev->ether_addr_error = error;
1003 netdev->cache_valid |= VALID_ETHERADDR;
1005 memcpy(netdev->etheraddr, mac, ETH_ADDR_LEN);
/* Restores the flags saved by netdev_turn_flags_off() above, i.e. brings
 * a tap device back up. */
1009 netdev_restore_flags(sf);
1012 ovs_mutex_unlock(&netdev->mutex);
1016 /* Copies 'netdev''s MAC address to 'mac' which is passed as param.
 * The address is fetched from the kernel once and cached thereafter;
 * a cached errno from the fetch is returned on subsequent calls. */
1018 netdev_linux_get_etheraddr(const struct netdev *netdev_,
1019 uint8_t mac[ETH_ADDR_LEN])
1021 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1024 ovs_mutex_lock(&netdev->mutex);
1025 if (!(netdev->cache_valid & VALID_ETHERADDR)) {
1026 netdev->ether_addr_error = get_etheraddr(netdev_get_name(netdev_),
1028 netdev->cache_valid |= VALID_ETHERADDR;
1031 error = netdev->ether_addr_error;
1033 memcpy(mac, netdev->etheraddr, ETH_ADDR_LEN);
1035 ovs_mutex_unlock(&netdev->mutex);
1040 /* Returns the maximum size of transmitted (and received) packets on 'netdev',
1041 * in bytes, not including the hardware header; thus, this is typically 1500
1042 * bytes for Ethernet devices.
 * The MTU is read via SIOCGIFMTU once and cached (including any errno). */
1044 netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
1046 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1049 ovs_mutex_lock(&netdev->mutex);
1050 if (!(netdev->cache_valid & VALID_MTU)) {
1053 netdev->netdev_mtu_error = af_inet_ifreq_ioctl(
1054 netdev_get_name(netdev_), &ifr, SIOCGIFMTU, "SIOCGIFMTU");
1055 netdev->mtu = ifr.ifr_mtu;
1056 netdev->cache_valid |= VALID_MTU;
1059 error = netdev->netdev_mtu_error;
1061 *mtup = netdev->mtu;
1063 ovs_mutex_unlock(&netdev->mutex);
1068 /* Sets the maximum size of transmitted (MTU) for given device using linux
1069 * networking ioctl interface.
 * Short-circuits if the cached MTU already equals 'mtu' (or a previous
 * attempt cached an error); only 0 and ENODEV results are cached. */
1072 netdev_linux_set_mtu(const struct netdev *netdev_, int mtu)
1074 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1078 ovs_mutex_lock(&netdev->mutex);
1079 if (netdev->cache_valid & VALID_MTU) {
1080 error = netdev->netdev_mtu_error;
1081 if (error || netdev->mtu == mtu) {
1084 netdev->cache_valid &= ~VALID_MTU;
1087 error = af_inet_ifreq_ioctl(netdev_get_name(netdev_), &ifr,
1088 SIOCSIFMTU, "SIOCSIFMTU");
1089 if (!error || error == ENODEV) {
1090 netdev->netdev_mtu_error = error;
1091 netdev->mtu = ifr.ifr_mtu;
1092 netdev->cache_valid |= VALID_MTU;
1095 ovs_mutex_unlock(&netdev->mutex);
1099 /* Returns the ifindex of 'netdev', if successful, as a positive number.
1100 * On failure, returns a negative errno value. */
1102 netdev_linux_get_ifindex(const struct netdev *netdev_)
1104 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1107 ovs_mutex_lock(&netdev->mutex);
1108 error = get_ifindex(netdev_, &ifindex);
1109 ovs_mutex_unlock(&netdev->mutex);
/* Errno values are positive, so negate to distinguish from an ifindex. */
1111 return error ? -error : ifindex;
/* Stores 'netdev''s carrier (link up/down) status in '*carrier'.  When
 * miimon polling is enabled the last polled MII status is used; otherwise
 * the kernel's IFF_RUNNING interface flag is consulted. */
1115 netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
1117 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1119 ovs_mutex_lock(&netdev->mutex);
1120 if (netdev->miimon_interval > 0) {
1121 *carrier = netdev->miimon;
1123 *carrier = (netdev->ifi_flags & IFF_RUNNING) != 0;
1125 ovs_mutex_unlock(&netdev->mutex);
/* Returns the number of times 'netdev''s carrier has toggled, as tracked by
 * netdev_linux_changed(). */
1130 static long long int
1131 netdev_linux_get_carrier_resets(const struct netdev *netdev_)
1133 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1134 long long int carrier_resets;
1136 ovs_mutex_lock(&netdev->mutex);
1137 carrier_resets = netdev->carrier_resets;
1138 ovs_mutex_unlock(&netdev->mutex);
1140 return carrier_resets;
/* Issues MII ioctl 'cmd' (e.g. SIOCGMIIPHY/SIOCGMIIREG) on device 'name',
 * copying 'data' in and back out of the ifreq.  The mii_ioctl_data payload
 * is memcpy'd over the ifr_data region of the ifreq union, which is how the
 * kernel expects MII requests to be passed in-place. */
1144 netdev_linux_do_miimon(const char *name, int cmd, const char *cmd_name,
1145 struct mii_ioctl_data *data)
1150 memset(&ifr, 0, sizeof ifr);
1151 memcpy(&ifr.ifr_data, data, sizeof *data);
1152 error = af_inet_ifreq_ioctl(name, &ifr, cmd, cmd_name);
1153 memcpy(data, &ifr.ifr_data, sizeof *data);
/* Queries link status of device 'name' into '*miimon'.  First tries MII
 * (SIOCGMIIPHY to find the PHY, then SIOCGMIIREG reading the BMSR link-status
 * bit); if MII is unavailable, falls back to the ETHTOOL_GLINK ioctl. */
1159 netdev_linux_get_miimon(const char *name, bool *miimon)
1161 struct mii_ioctl_data data;
1166 memset(&data, 0, sizeof data);
1167 error = netdev_linux_do_miimon(name, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
1169 /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
1170 data.reg_num = MII_BMSR;
1171 error = netdev_linux_do_miimon(name, SIOCGMIIREG, "SIOCGMIIREG",
1175 *miimon = !!(data.val_out & BMSR_LSTATUS);
1177 VLOG_WARN_RL(&rl, "%s: failed to query MII", name);
1180 struct ethtool_cmd ecmd;
1182 VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
1185 COVERAGE_INC(netdev_get_ethtool);
1186 memset(&ecmd, 0, sizeof ecmd);
1187 error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
/* ETHTOOL_GLINK replies with a struct ethtool_value overlaid on ecmd. */
1190 struct ethtool_value eval;
1192 memcpy(&eval, &ecmd, sizeof eval);
1193 *miimon = !!eval.data;
1195 VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
/* Sets the MII polling interval for 'netdev_' in milliseconds.  A
 * nonpositive 'interval' disables polling; positive values are clamped to
 * at least 100 ms.  Expiring the timer forces an immediate re-poll. */
1203 netdev_linux_set_miimon_interval(struct netdev *netdev_,
1204 long long int interval)
1206 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1208 ovs_mutex_lock(&netdev->mutex);
1209 interval = interval > 0 ? MAX(interval, 100) : 0;
1210 if (netdev->miimon_interval != interval) {
1211 netdev->miimon_interval = interval;
1212 timer_set_expired(&netdev->miimon_timer);
1214 ovs_mutex_unlock(&netdev->mutex);
/* Periodic driver: walks every open netdev-linux device and, for those with
 * MII polling enabled whose timer has expired, re-reads link status and
 * signals a device change if it flipped. */
1220 netdev_linux_miimon_run(void)
1222 struct shash device_shash;
1223 struct shash_node *node;
1225 shash_init(&device_shash);
1226 netdev_get_devices(&netdev_linux_class, &device_shash);
1227 SHASH_FOR_EACH (node, &device_shash) {
1228 struct netdev *netdev = node->data;
1229 struct netdev_linux *dev = netdev_linux_cast(netdev);
1232 ovs_mutex_lock(&dev->mutex);
1233 if (dev->miimon_interval > 0 && timer_expired(&dev->miimon_timer)) {
1234 netdev_linux_get_miimon(dev->up.name, &miimon);
1235 if (miimon != dev->miimon) {
1236 dev->miimon = miimon;
/* Notify watchers; keep current ifi_flags unchanged (delta 0). */
1237 netdev_linux_changed(dev, dev->ifi_flags, 0);
1240 timer_set_duration(&dev->miimon_timer, dev->miimon_interval);
1242 ovs_mutex_unlock(&dev->mutex);
/* netdev_get_devices() took a reference on each device; drop it. */
1243 netdev_close(netdev);
1246 shash_destroy(&device_shash);
/* poll-loop companion to netdev_linux_miimon_run(): registers a wakeup for
 * each device whose MII poll timer is running. */
1250 netdev_linux_miimon_wait(void)
1252 struct shash device_shash;
1253 struct shash_node *node;
1255 shash_init(&device_shash);
1256 netdev_get_devices(&netdev_linux_class, &device_shash);
1257 SHASH_FOR_EACH (node, &device_shash) {
1258 struct netdev *netdev = node->data;
1259 struct netdev_linux *dev = netdev_linux_cast(netdev);
1261 ovs_mutex_lock(&dev->mutex);
1262 if (dev->miimon_interval > 0) {
1263 timer_wait(&dev->miimon_timer);
1265 ovs_mutex_unlock(&dev->mutex);
/* Drop the reference taken by netdev_get_devices(). */
1266 netdev_close(netdev);
1268 shash_destroy(&device_shash);
1271 /* Check whether we can we use RTM_GETLINK to get network device statistics.
1272 * In pre-2.6.19 kernels, this was only available if wireless extensions were
/* NOTE(review): probes the loopback device ("lo") once to decide between
 * the rtnetlink and /proc stats backends; result is cached by the caller
 * (see netdev_linux_sys_get_stats' ovsthread_once). */
1275 check_for_working_netlink_stats(void)
1277 /* Decide on the netdev_get_stats() implementation to use. Netlink is
1278 * preferable, so if that works, we'll use it. */
1279 int ifindex = do_get_ifindex("lo");
1281 VLOG_WARN("failed to get ifindex for lo, "
1282 "obtaining netdev stats from proc");
1285 struct netdev_stats stats;
1286 int error = get_stats_via_netlink(ifindex, &stats);
1288 VLOG_DBG("obtaining netdev stats via rtnetlink");
1291 VLOG_INFO("RTM_GETLINK failed (%s), obtaining netdev stats "
1292 "via proc (you are probably running a pre-2.6.19 "
1293 "kernel)", ovs_strerror(error));
/* Exchanges the values of '*a' and '*b'.  (Body elided in this view.) */
1300 swap_uint64(uint64_t *a, uint64_t *b)
1307 /* Copies 'src' into 'dst', performing format conversion in the process.
1309 * 'src' is allowed to be misaligned. */
1311 netdev_stats_from_ovs_vport_stats(struct netdev_stats *dst,
1312 const struct ovs_vport_stats *src)
/* get_unaligned_u64 is used because 'src' may come straight out of a
 * Netlink attribute with no alignment guarantee. */
1314 dst->rx_packets = get_unaligned_u64(&src->rx_packets);
1315 dst->tx_packets = get_unaligned_u64(&src->tx_packets);
1316 dst->rx_bytes = get_unaligned_u64(&src->rx_bytes);
1317 dst->tx_bytes = get_unaligned_u64(&src->tx_bytes);
1318 dst->rx_errors = get_unaligned_u64(&src->rx_errors);
1319 dst->tx_errors = get_unaligned_u64(&src->tx_errors);
1320 dst->rx_dropped = get_unaligned_u64(&src->rx_dropped);
1321 dst->tx_dropped = get_unaligned_u64(&src->tx_dropped);
/* ovs_vport_stats carries no detailed error breakdown; zero the rest so
 * callers see defined values. */
1323 dst->collisions = 0;
1324 dst->rx_length_errors = 0;
1325 dst->rx_over_errors = 0;
1326 dst->rx_crc_errors = 0;
1327 dst->rx_frame_errors = 0;
1328 dst->rx_fifo_errors = 0;
1329 dst->rx_missed_errors = 0;
1330 dst->tx_aborted_errors = 0;
1331 dst->tx_carrier_errors = 0;
1332 dst->tx_fifo_errors = 0;
1333 dst->tx_heartbeat_errors = 0;
1334 dst->tx_window_errors = 0;
/* Fetches vport-layer stats for 'netdev' from the datapath and converts
 * them into '*stats'.  (Error-path lines elided in this view; a reply
 * without stats is presumably treated as an error.) */
1338 get_stats_via_vport__(const struct netdev *netdev, struct netdev_stats *stats)
1340 struct dpif_linux_vport reply;
1344 error = dpif_linux_vport_get(netdev_get_name(netdev), &reply, &buf);
1347 } else if (!reply.stats) {
1352 netdev_stats_from_ovs_vport_stats(stats, reply.stats);
/* Caching wrapper around get_stats_via_vport__(): retries only while the
 * previous attempt succeeded or no result has been cached yet
 * (VALID_VPORT_STAT_ERROR).  ENOENT (vport not attached) is expected and
 * not logged. */
1360 get_stats_via_vport(const struct netdev *netdev_,
1361 struct netdev_stats *stats)
1363 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1365 if (!netdev->vport_stats_error ||
1366 !(netdev->cache_valid & VALID_VPORT_STAT_ERROR)) {
1369 error = get_stats_via_vport__(netdev_, stats);
1370 if (error && error != ENOENT) {
1371 VLOG_WARN_RL(&rl, "%s: obtaining netdev stats via vport failed "
1373 netdev_get_name(netdev_), ovs_strerror(error));
1375 netdev->vport_stats_error = error;
1376 netdev->cache_valid |= VALID_VPORT_STAT_ERROR;
/* Kernel-side stats: chooses rtnetlink or /proc once per process (thread-safe
 * via ovsthread_once) and fetches '*stats' through the chosen backend. */
1381 netdev_linux_sys_get_stats(const struct netdev *netdev_,
1382 struct netdev_stats *stats)
1384 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
1385 static int use_netlink_stats;
1388 if (ovsthread_once_start(&once)) {
1389 use_netlink_stats = check_for_working_netlink_stats();
1390 ovsthread_once_done(&once);
1393 if (use_netlink_stats) {
1396 error = get_ifindex(netdev_, &ifindex);
1398 error = get_stats_via_netlink(ifindex, stats);
/* else branch (elided): fall back to /proc/net/dev parsing. */
1401 error = get_stats_via_proc(netdev_get_name(netdev_), stats);
1405 VLOG_WARN_RL(&rl, "%s: linux-sys get stats failed %d",
1406 netdev_get_name(netdev_), error);
1412 /* Retrieves current device stats for 'netdev-linux'. */
/* NOTE(review): this view is elided.  The surviving lines suggest a 3-way
 * merge: vport stats are fetched into '*stats', kernel stats into
 * 'dev_stats', then — when both succeed — the error counters from the
 * kernel are added on top of the vport byte/packet counters.  Confirm the
 * missing branches against the full file before relying on this. */
1414 netdev_linux_get_stats(const struct netdev *netdev_,
1415 struct netdev_stats *stats)
1417 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1418 struct netdev_stats dev_stats;
1421 ovs_mutex_lock(&netdev->mutex);
1422 get_stats_via_vport(netdev_, stats);
1423 error = netdev_linux_sys_get_stats(netdev_, &dev_stats);
1425 if (!netdev->vport_stats_error) {
1428 } else if (netdev->vport_stats_error) {
1429 /* stats not available from OVS then use ioctl stats. */
1432 stats->rx_errors += dev_stats.rx_errors;
1433 stats->tx_errors += dev_stats.tx_errors;
1434 stats->rx_dropped += dev_stats.rx_dropped;
1435 stats->tx_dropped += dev_stats.tx_dropped;
1436 stats->multicast += dev_stats.multicast;
1437 stats->collisions += dev_stats.collisions;
1438 stats->rx_length_errors += dev_stats.rx_length_errors;
1439 stats->rx_over_errors += dev_stats.rx_over_errors;
1440 stats->rx_crc_errors += dev_stats.rx_crc_errors;
1441 stats->rx_frame_errors += dev_stats.rx_frame_errors;
1442 stats->rx_fifo_errors += dev_stats.rx_fifo_errors;
1443 stats->rx_missed_errors += dev_stats.rx_missed_errors;
1444 stats->tx_aborted_errors += dev_stats.tx_aborted_errors;
1445 stats->tx_carrier_errors += dev_stats.tx_carrier_errors;
1446 stats->tx_fifo_errors += dev_stats.tx_fifo_errors;
1447 stats->tx_heartbeat_errors += dev_stats.tx_heartbeat_errors;
1448 stats->tx_window_errors += dev_stats.tx_window_errors;
1450 ovs_mutex_unlock(&netdev->mutex);
1455 /* Retrieves current device stats for 'netdev-tap' netdev or
1456 * netdev-internal. */
/* NOTE(review): elided view.  Like netdev_linux_get_stats(), but when only
 * kernel stats are available the rx/tx counters are swapped to present the
 * switch's perspective (the host end of a tap is the "remote" side). */
1458 netdev_tap_get_stats(const struct netdev *netdev_, struct netdev_stats *stats)
1460 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1461 struct netdev_stats dev_stats;
1464 ovs_mutex_lock(&netdev->mutex);
1465 get_stats_via_vport(netdev_, stats);
1466 error = netdev_linux_sys_get_stats(netdev_, &dev_stats);
1468 if (!netdev->vport_stats_error) {
1471 } else if (netdev->vport_stats_error) {
1472 /* Transmit and receive stats will appear to be swapped relative to the
1473 * other ports since we are the one sending the data, not a remote
1474 * computer. For consistency, we swap them back here. This does not
1475 * apply if we are getting stats from the vport layer because it always
1476 * tracks stats from the perspective of the switch. */
1479 swap_uint64(&stats->rx_packets, &stats->tx_packets);
1480 swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
1481 swap_uint64(&stats->rx_errors, &stats->tx_errors);
1482 swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
/* Detailed error breakdowns don't translate across the swap; clear them. */
1483 stats->rx_length_errors = 0;
1484 stats->rx_over_errors = 0;
1485 stats->rx_crc_errors = 0;
1486 stats->rx_frame_errors = 0;
1487 stats->rx_fifo_errors = 0;
1488 stats->rx_missed_errors = 0;
1489 stats->tx_aborted_errors = 0;
1490 stats->tx_carrier_errors = 0;
1491 stats->tx_fifo_errors = 0;
1492 stats->tx_heartbeat_errors = 0;
1493 stats->tx_window_errors = 0;
/* Merge kernel counters into vport stats, swapped for perspective. */
1495 stats->rx_dropped += dev_stats.tx_dropped;
1496 stats->tx_dropped += dev_stats.rx_dropped;
1498 stats->rx_errors += dev_stats.tx_errors;
1499 stats->tx_errors += dev_stats.rx_errors;
1501 stats->multicast += dev_stats.multicast;
1502 stats->collisions += dev_stats.collisions;
1504 ovs_mutex_unlock(&netdev->mutex);
/* Internal devices report vport-layer stats only; returns the cached
 * vport stats error as the result code. */
1510 netdev_internal_get_stats(const struct netdev *netdev_,
1511 struct netdev_stats *stats)
1513 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1516 ovs_mutex_lock(&netdev->mutex);
1517 get_stats_via_vport(netdev_, stats);
1518 error = netdev->vport_stats_error;
1519 ovs_mutex_unlock(&netdev->mutex);
/* Pushes '*stats' down into the datapath vport for an internal device via
 * an OVS_VPORT_CMD_SET transaction. */
1525 netdev_internal_set_stats(struct netdev *netdev,
1526 const struct netdev_stats *stats)
1528 struct ovs_vport_stats vport_stats;
1529 struct dpif_linux_vport vport;
/* Only the eight counters ovs_vport_stats carries are propagated. */
1532 vport_stats.rx_packets = stats->rx_packets;
1533 vport_stats.tx_packets = stats->tx_packets;
1534 vport_stats.rx_bytes = stats->rx_bytes;
1535 vport_stats.tx_bytes = stats->tx_bytes;
1536 vport_stats.rx_errors = stats->rx_errors;
1537 vport_stats.tx_errors = stats->tx_errors;
1538 vport_stats.rx_dropped = stats->rx_dropped;
1539 vport_stats.tx_dropped = stats->tx_dropped;
1541 dpif_linux_vport_init(&vport);
1542 vport.cmd = OVS_VPORT_CMD_SET;
1543 vport.name = netdev_get_name(netdev);
1544 vport.stats = &vport_stats;
1546 err = dpif_linux_vport_transact(&vport, NULL, NULL);
1548 /* If the vport layer doesn't know about the device, that doesn't mean it
1549 * doesn't exist (after all were able to open it when netdev_open() was
1550 * called), it just means that it isn't attached and we'll be getting
1551 * stats a different way. */
1552 if (err == ENODEV) {
/* Queries ethtool (ETHTOOL_GSET) once and caches the device's supported,
 * advertised, and current NETDEV_F_* feature sets plus any error in
 * 'netdev->get_features_error'.  Must hold netdev->mutex (OVS_REQUIRES). */
1560 netdev_linux_read_features(struct netdev_linux *netdev)
1561 OVS_REQUIRES(netdev->mutex)
1563 struct ethtool_cmd ecmd;
/* Cache hit: nothing to do. */
1567 if (netdev->cache_valid & VALID_FEATURES) {
1571 COVERAGE_INC(netdev_get_ethtool);
1572 memset(&ecmd, 0, sizeof ecmd);
1573 error = netdev_linux_do_ethtool(netdev->up.name, &ecmd,
1574 ETHTOOL_GSET, "ETHTOOL_GSET");
1579 /* Supported features. */
1580 netdev->supported = 0;
1581 if (ecmd.supported & SUPPORTED_10baseT_Half) {
1582 netdev->supported |= NETDEV_F_10MB_HD;
1584 if (ecmd.supported & SUPPORTED_10baseT_Full) {
1585 netdev->supported |= NETDEV_F_10MB_FD;
1587 if (ecmd.supported & SUPPORTED_100baseT_Half) {
1588 netdev->supported |= NETDEV_F_100MB_HD;
1590 if (ecmd.supported & SUPPORTED_100baseT_Full) {
1591 netdev->supported |= NETDEV_F_100MB_FD;
1593 if (ecmd.supported & SUPPORTED_1000baseT_Half) {
1594 netdev->supported |= NETDEV_F_1GB_HD;
1596 if (ecmd.supported & SUPPORTED_1000baseT_Full) {
1597 netdev->supported |= NETDEV_F_1GB_FD;
1599 if (ecmd.supported & SUPPORTED_10000baseT_Full) {
1600 netdev->supported |= NETDEV_F_10GB_FD;
1602 if (ecmd.supported & SUPPORTED_TP) {
1603 netdev->supported |= NETDEV_F_COPPER;
1605 if (ecmd.supported & SUPPORTED_FIBRE) {
1606 netdev->supported |= NETDEV_F_FIBER;
1608 if (ecmd.supported & SUPPORTED_Autoneg) {
1609 netdev->supported |= NETDEV_F_AUTONEG;
1611 if (ecmd.supported & SUPPORTED_Pause) {
1612 netdev->supported |= NETDEV_F_PAUSE;
1614 if (ecmd.supported & SUPPORTED_Asym_Pause) {
1615 netdev->supported |= NETDEV_F_PAUSE_ASYM;
1618 /* Advertised features. */
1619 netdev->advertised = 0;
1620 if (ecmd.advertising & ADVERTISED_10baseT_Half) {
1621 netdev->advertised |= NETDEV_F_10MB_HD;
1623 if (ecmd.advertising & ADVERTISED_10baseT_Full) {
1624 netdev->advertised |= NETDEV_F_10MB_FD;
1626 if (ecmd.advertising & ADVERTISED_100baseT_Half) {
1627 netdev->advertised |= NETDEV_F_100MB_HD;
1629 if (ecmd.advertising & ADVERTISED_100baseT_Full) {
1630 netdev->advertised |= NETDEV_F_100MB_FD;
1632 if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
1633 netdev->advertised |= NETDEV_F_1GB_HD;
1635 if (ecmd.advertising & ADVERTISED_1000baseT_Full) {
1636 netdev->advertised |= NETDEV_F_1GB_FD;
1638 if (ecmd.advertising & ADVERTISED_10000baseT_Full) {
1639 netdev->advertised |= NETDEV_F_10GB_FD;
1641 if (ecmd.advertising & ADVERTISED_TP) {
1642 netdev->advertised |= NETDEV_F_COPPER;
1644 if (ecmd.advertising & ADVERTISED_FIBRE) {
1645 netdev->advertised |= NETDEV_F_FIBER;
1647 if (ecmd.advertising & ADVERTISED_Autoneg) {
1648 netdev->advertised |= NETDEV_F_AUTONEG;
1650 if (ecmd.advertising & ADVERTISED_Pause) {
1651 netdev->advertised |= NETDEV_F_PAUSE;
1653 if (ecmd.advertising & ADVERTISED_Asym_Pause) {
1654 netdev->advertised |= NETDEV_F_PAUSE_ASYM;
1657 /* Current settings. */
1659 if (speed == SPEED_10) {
1660 netdev->current = ecmd.duplex ? NETDEV_F_10MB_FD : NETDEV_F_10MB_HD;
1661 } else if (speed == SPEED_100) {
1662 netdev->current = ecmd.duplex ? NETDEV_F_100MB_FD : NETDEV_F_100MB_HD;
1663 } else if (speed == SPEED_1000) {
1664 netdev->current = ecmd.duplex ? NETDEV_F_1GB_FD : NETDEV_F_1GB_HD;
1665 } else if (speed == SPEED_10000) {
1666 netdev->current = NETDEV_F_10GB_FD;
/* NOTE(review): raw literals below instead of SPEED_40000/SPEED_100000 —
 * presumably those macros were missing from older kernel headers; verify
 * against the minimum supported linux/ethtool.h before "fixing". */
1667 } else if (speed == 40000) {
1668 netdev->current = NETDEV_F_40GB_FD;
1669 } else if (speed == 100000) {
1670 netdev->current = NETDEV_F_100GB_FD;
1671 } else if (speed == 1000000) {
1672 netdev->current = NETDEV_F_1TB_FD;
1674 netdev->current = 0;
1677 if (ecmd.port == PORT_TP) {
1678 netdev->current |= NETDEV_F_COPPER;
1679 } else if (ecmd.port == PORT_FIBRE) {
1680 netdev->current |= NETDEV_F_FIBER;
1684 netdev->current |= NETDEV_F_AUTONEG;
/* Cache both result and error so repeat callers skip the ioctl. */
1688 netdev->cache_valid |= VALID_FEATURES;
1689 netdev->get_features_error = error;
1692 /* Stores the features supported by 'netdev' into of '*current', '*advertised',
1693 * '*supported', and '*peer'. Each value is a bitmap of NETDEV_* bits.
1694 * Returns 0 if successful, otherwise a positive errno value. */
1696 netdev_linux_get_features(const struct netdev *netdev_,
1697 enum netdev_features *current,
1698 enum netdev_features *advertised,
1699 enum netdev_features *supported,
1700 enum netdev_features *peer)
1702 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1705 ovs_mutex_lock(&netdev->mutex);
/* Populates/refreshes the cached feature sets (no-op on cache hit). */
1706 netdev_linux_read_features(netdev);
1707 if (!netdev->get_features_error) {
1708 *current = netdev->current;
1709 *advertised = netdev->advertised;
1710 *supported = netdev->supported;
1711 *peer = 0; /* XXX */
1713 error = netdev->get_features_error;
1714 ovs_mutex_unlock(&netdev->mutex);
1719 /* Set the features advertised by 'netdev' to 'advertise'. */
/* Read-modify-write on the ethtool settings: ETHTOOL_GSET first, then
 * rebuild only the 'advertising' mask from NETDEV_F_* bits and push it back
 * with ETHTOOL_SSET, all under the device mutex. */
1721 netdev_linux_set_advertisements(struct netdev *netdev_,
1722 enum netdev_features advertise)
1724 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1725 struct ethtool_cmd ecmd;
1728 ovs_mutex_lock(&netdev->mutex);
1730 COVERAGE_INC(netdev_get_ethtool);
1731 memset(&ecmd, 0, sizeof ecmd);
1732 error = netdev_linux_do_ethtool(netdev_get_name(netdev_), &ecmd,
1733 ETHTOOL_GSET, "ETHTOOL_GSET");
1738 ecmd.advertising = 0;
1739 if (advertise & NETDEV_F_10MB_HD) {
1740 ecmd.advertising |= ADVERTISED_10baseT_Half;
1742 if (advertise & NETDEV_F_10MB_FD) {
1743 ecmd.advertising |= ADVERTISED_10baseT_Full;
1745 if (advertise & NETDEV_F_100MB_HD) {
1746 ecmd.advertising |= ADVERTISED_100baseT_Half;
1748 if (advertise & NETDEV_F_100MB_FD) {
1749 ecmd.advertising |= ADVERTISED_100baseT_Full;
1751 if (advertise & NETDEV_F_1GB_HD) {
1752 ecmd.advertising |= ADVERTISED_1000baseT_Half;
1754 if (advertise & NETDEV_F_1GB_FD) {
1755 ecmd.advertising |= ADVERTISED_1000baseT_Full;
1757 if (advertise & NETDEV_F_10GB_FD) {
1758 ecmd.advertising |= ADVERTISED_10000baseT_Full;
1760 if (advertise & NETDEV_F_COPPER) {
1761 ecmd.advertising |= ADVERTISED_TP;
1763 if (advertise & NETDEV_F_FIBER) {
1764 ecmd.advertising |= ADVERTISED_FIBRE;
1766 if (advertise & NETDEV_F_AUTONEG) {
1767 ecmd.advertising |= ADVERTISED_Autoneg;
1769 if (advertise & NETDEV_F_PAUSE) {
1770 ecmd.advertising |= ADVERTISED_Pause;
1772 if (advertise & NETDEV_F_PAUSE_ASYM) {
1773 ecmd.advertising |= ADVERTISED_Asym_Pause;
1775 COVERAGE_INC(netdev_set_ethtool);
1776 error = netdev_linux_do_ethtool(netdev_get_name(netdev_), &ecmd,
1777 ETHTOOL_SSET, "ETHTOOL_SSET");
1780 ovs_mutex_unlock(&netdev->mutex);
1784 /* Attempts to set input rate limiting (policing) policy. Returns 0 if
1785 * successful, otherwise a positive errno value. */
/* Implemented with a tc ingress qdisc plus a policer action; results (or the
 * error) are cached under VALID_POLICING so unchanged settings are a no-op. */
1787 netdev_linux_set_policing(struct netdev *netdev_,
1788 uint32_t kbits_rate, uint32_t kbits_burst)
1790 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1791 const char *netdev_name = netdev_get_name(netdev_);
1794 kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
1795 : !kbits_burst ? 1000 /* Default to 1000 kbits if 0. */
1796 : kbits_burst); /* Stick with user-specified value. */
1798 ovs_mutex_lock(&netdev->mutex);
1799 if (netdev->cache_valid & VALID_POLICING) {
1800 error = netdev->netdev_policing_error;
1801 if (error || (netdev->kbits_rate == kbits_rate &&
1802 netdev->kbits_burst == kbits_burst)) {
1803 /* Assume that settings haven't changed since we last set them. */
/* Settings differ: invalidate and fall through to reprogram tc. */
1806 netdev->cache_valid &= ~VALID_POLICING;
1809 COVERAGE_INC(netdev_set_policing);
1810 /* Remove any existing ingress qdisc. */
1811 error = tc_add_del_ingress_qdisc(netdev_, false);
1813 VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
1814 netdev_name, ovs_strerror(error));
/* Only (re)install the qdisc and policer when a rate was requested
 * (elided conditional in this view). */
1819 error = tc_add_del_ingress_qdisc(netdev_, true);
1821 VLOG_WARN_RL(&rl, "%s: adding policing qdisc failed: %s",
1822 netdev_name, ovs_strerror(error));
1826 error = tc_add_policer(netdev_, kbits_rate, kbits_burst);
1828 VLOG_WARN_RL(&rl, "%s: adding policing action failed: %s",
1829 netdev_name, ovs_strerror(error));
1834 netdev->kbits_rate = kbits_rate;
1835 netdev->kbits_burst = kbits_burst;
/* Cache success or a permanent ENODEV; transient errors are retried. */
1838 if (!error || error == ENODEV) {
1839 netdev->netdev_policing_error = error;
1840 netdev->cache_valid |= VALID_POLICING;
1842 ovs_mutex_unlock(&netdev->mutex);
/* Adds to 'types' the OVS name of every installable QoS implementation in
 * the 'tcs' table (entries with an empty ovs_name are internal-only). */
1847 netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
1850 const struct tc_ops *const *opsp;
1852 for (opsp = tcs; *opsp != NULL; opsp++) {
1853 const struct tc_ops *ops = *opsp;
1854 if (ops->tc_install && ops->ovs_name[0] != '\0') {
1855 sset_add(types, ops->ovs_name);
/* Returns the tc_ops whose OVS-facing name equals 'name', or (elided here)
 * NULL when there is no match. */
1861 static const struct tc_ops *
1862 tc_lookup_ovs_name(const char *name)
1864 const struct tc_ops *const *opsp;
1866 for (opsp = tcs; *opsp != NULL; opsp++) {
1867 const struct tc_ops *ops = *opsp;
1868 if (!strcmp(name, ops->ovs_name)) {
/* Returns the tc_ops whose Linux qdisc kind equals 'name'; entries may have
 * a NULL linux_name, which is skipped.  (NULL-on-miss return elided.) */
1875 static const struct tc_ops *
1876 tc_lookup_linux_name(const char *name)
1878 const struct tc_ops *const *opsp;
1880 for (opsp = tcs; *opsp != NULL; opsp++) {
1881 const struct tc_ops *ops = *opsp;
1882 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
/* Hash-bucket scan for the queue with 'queue_id'; 'hash' must be the
 * caller-computed hash of 'queue_id' (see tc_find_queue()). */
1889 static struct tc_queue *
1890 tc_find_queue__(const struct netdev *netdev_, unsigned int queue_id,
1893 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1894 struct tc_queue *queue;
1896 HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev->tc->queues) {
1897 if (queue->queue_id == queue_id) {
/* Convenience wrapper: looks up 'queue_id' using its standard hash. */
1904 static struct tc_queue *
1905 tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
1907 return tc_find_queue__(netdev, queue_id, hash_int(queue_id, 0));
/* Reports the queue capacity of QoS 'type' in '*caps'.  (Error handling for
 * an unknown type is elided from this view.) */
1911 netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
1913 struct netdev_qos_capabilities *caps)
1915 const struct tc_ops *ops = tc_lookup_ovs_name(type);
1919 caps->n_queues = ops->n_queues;
/* Reports the currently-installed QoS type name and, when the implementation
 * provides qdisc_get, its configuration 'details'. */
1924 netdev_linux_get_qos(const struct netdev *netdev_,
1925 const char **typep, struct smap *details)
1927 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1930 ovs_mutex_lock(&netdev->mutex);
/* Ensures netdev->tc reflects the kernel's actual qdisc. */
1931 error = tc_query_qdisc(netdev_);
1933 *typep = netdev->tc->ops->ovs_name;
1934 error = (netdev->tc->ops->qdisc_get
1935 ? netdev->tc->ops->qdisc_get(netdev_, details)
1938 ovs_mutex_unlock(&netdev->mutex);
/* Installs or reconfigures QoS 'type' with 'details'.  If the requested
 * implementation is already active, reconfigures in place; otherwise the
 * existing qdisc is deleted and the new one installed. */
1944 netdev_linux_set_qos(struct netdev *netdev_,
1945 const char *type, const struct smap *details)
1947 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1948 const struct tc_ops *new_ops;
1951 new_ops = tc_lookup_ovs_name(type);
/* Reject unknown or non-installable types (error return elided). */
1952 if (!new_ops || !new_ops->tc_install) {
1956 ovs_mutex_lock(&netdev->mutex);
1957 error = tc_query_qdisc(netdev_);
1962 if (new_ops == netdev->tc->ops) {
1963 error = new_ops->qdisc_set ? new_ops->qdisc_set(netdev_, details) : 0;
1965 /* Delete existing qdisc. */
1966 error = tc_del_qdisc(netdev_);
/* tc_del_qdisc must have cleared the cached tc state. */
1970 ovs_assert(netdev->tc == NULL);
1972 /* Install new qdisc. */
1973 error = new_ops->tc_install(netdev_, details);
/* tc state is set iff installation succeeded. */
1974 ovs_assert((error == 0) == (netdev->tc != NULL));
1978 ovs_mutex_unlock(&netdev->mutex);
/* Fetches the configuration of queue 'queue_id' into 'details' via the
 * active QoS implementation's class_get hook. */
1983 netdev_linux_get_queue(const struct netdev *netdev_,
1984 unsigned int queue_id, struct smap *details)
1986 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1989 ovs_mutex_lock(&netdev->mutex);
1990 error = tc_query_qdisc(netdev_);
1992 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
/* (Ternary condition line elided; a missing queue presumably yields an
 * error code rather than the class_get call.) */
1994 ? netdev->tc->ops->class_get(netdev_, queue, details)
1997 ovs_mutex_unlock(&netdev->mutex);
/* Creates or modifies queue 'queue_id' with 'details'; rejects ids beyond
 * the implementation's n_queues or implementations without class_set. */
2003 netdev_linux_set_queue(struct netdev *netdev_,
2004 unsigned int queue_id, const struct smap *details)
2006 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2009 ovs_mutex_lock(&netdev->mutex);
2010 error = tc_query_qdisc(netdev_);
2012 error = (queue_id < netdev->tc->ops->n_queues
2013 && netdev->tc->ops->class_set
2014 ? netdev->tc->ops->class_set(netdev_, queue_id, details)
2017 ovs_mutex_unlock(&netdev->mutex);
/* Deletes queue 'queue_id' through the active implementation's class_delete
 * hook (implementations without one presumably report an error — elided). */
2023 netdev_linux_delete_queue(struct netdev *netdev_, unsigned int queue_id)
2025 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2028 ovs_mutex_lock(&netdev->mutex);
2029 error = tc_query_qdisc(netdev_);
2031 if (netdev->tc->ops->class_delete) {
2032 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2034 ? netdev->tc->ops->class_delete(netdev_, queue)
2040 ovs_mutex_unlock(&netdev->mutex);
/* Retrieves statistics for queue 'queue_id'; 'stats->created' is filled from
 * the cached queue record, the rest from class_get_stats. */
2046 netdev_linux_get_queue_stats(const struct netdev *netdev_,
2047 unsigned int queue_id,
2048 struct netdev_queue_stats *stats)
2050 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2053 ovs_mutex_lock(&netdev->mutex);
2054 error = tc_query_qdisc(netdev_);
2056 if (netdev->tc->ops->class_get_stats) {
2057 const struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2059 stats->created = queue->created;
2060 error = netdev->tc->ops->class_get_stats(netdev_, queue,
2069 ovs_mutex_unlock(&netdev->mutex);
/* Begins an RTM_GETTCLASS netlink dump of 'netdev''s traffic classes into
 * '*dump'.  (Return value/failure path for tc_make_request is elided.) */
2075 start_queue_dump(const struct netdev *netdev, struct nl_dump *dump)
2077 struct ofpbuf request;
2078 struct tcmsg *tcmsg;
2080 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
/* Parent 0: dump classes of the root qdisc. */
2084 tcmsg->tcm_parent = 0;
2085 nl_dump_start(dump, NETLINK_ROUTE, &request);
2086 ofpbuf_uninit(&request);
/* Invokes '*cb' once per cached queue with its id and configuration.
 * HMAP_FOR_EACH_SAFE is used because the callback may delete the queue it
 * is given. */
2091 netdev_linux_dump_queues(const struct netdev *netdev_,
2092 netdev_dump_queues_cb *cb, void *aux)
2094 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2097 ovs_mutex_lock(&netdev->mutex);
2098 error = tc_query_qdisc(netdev_);
2100 if (netdev->tc->ops->class_get) {
2101 struct tc_queue *queue, *next_queue;
2102 struct smap details;
2104 smap_init(&details);
2105 HMAP_FOR_EACH_SAFE (queue, next_queue, hmap_node,
2106 &netdev->tc->queues) {
/* Reuse one smap across iterations; clear between queues. */
2109 smap_clear(&details);
2111 retval = netdev->tc->ops->class_get(netdev_, queue, &details);
2113 (*cb)(queue->queue_id, &details, aux);
2118 smap_destroy(&details);
2123 ovs_mutex_unlock(&netdev->mutex);
/* Streams per-queue statistics to '*cb' by running a netlink class dump and
 * feeding each message to the implementation's class_dump_stats hook. */
2129 netdev_linux_dump_queue_stats(const struct netdev *netdev_,
2130 netdev_dump_queue_stats_cb *cb, void *aux)
2132 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2135 ovs_mutex_lock(&netdev->mutex);
2136 error = tc_query_qdisc(netdev_);
2138 struct nl_dump dump;
2140 if (!netdev->tc->ops->class_dump_stats) {
2142 } else if (!start_queue_dump(netdev_, &dump)) {
2148 while (nl_dump_next(&dump, &msg)) {
2149 retval = netdev->tc->ops->class_dump_stats(netdev_, &msg,
/* nl_dump_done() both finishes the dump and reports any dump error. */
2156 retval = nl_dump_done(&dump);
2162 ovs_mutex_unlock(&netdev->mutex);
/* Reports the device's IPv4 address and netmask, caching the SIOCGIFADDR/
 * SIOCGIFNETMASK results under VALID_IN4.  An unassigned address yields
 * EADDRNOTAVAIL. */
2168 netdev_linux_get_in4(const struct netdev *netdev_,
2169 struct in_addr *address, struct in_addr *netmask)
2171 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2174 ovs_mutex_lock(&netdev->mutex);
2175 if (!(netdev->cache_valid & VALID_IN4)) {
2176 error = netdev_linux_get_ipv4(netdev_, &netdev->address,
2177 SIOCGIFADDR, "SIOCGIFADDR");
2179 error = netdev_linux_get_ipv4(netdev_, &netdev->netmask,
2180 SIOCGIFNETMASK, "SIOCGIFNETMASK");
2182 netdev->cache_valid |= VALID_IN4;
2190 if (netdev->address.s_addr != INADDR_ANY) {
2191 *address = netdev->address;
2192 *netmask = netdev->netmask;
2194 error = EADDRNOTAVAIL;
2197 ovs_mutex_unlock(&netdev->mutex);
/* Assigns IPv4 'address'/'netmask' to the device via SIOCSIFADDR and, when
 * the address is non-zero, SIOCSIFNETMASK; updates the IN4 cache on
 * success. */
2203 netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
2204 struct in_addr netmask)
2206 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2209 ovs_mutex_lock(&netdev->mutex);
2210 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
2212 netdev->cache_valid |= VALID_IN4;
2213 netdev->address = address;
2214 netdev->netmask = netmask;
/* Setting the netmask on an INADDR_ANY address would be meaningless. */
2215 if (address.s_addr != INADDR_ANY) {
2216 error = do_set_addr(netdev_, SIOCSIFNETMASK,
2217 "SIOCSIFNETMASK", netmask);
2220 ovs_mutex_unlock(&netdev->mutex);
/* Parses one /proc/net/if_inet6 line: 32 hex digits of address, four
 * ignored hex fields, then the interface name (<= 16 chars).  (The sscanf
 * call and success test are elided in this view.) */
2226 parse_if_inet6_line(const char *line,
2227 struct in6_addr *in6, char ifname[16 + 1])
2229 uint8_t *s6 = in6->s6_addr;
2230 #define X8 "%2"SCNx8
2232 " "X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8
2233 "%*x %*x %*x %*x %16s\n",
2234 &s6[0], &s6[1], &s6[2], &s6[3],
2235 &s6[4], &s6[5], &s6[6], &s6[7],
2236 &s6[8], &s6[9], &s6[10], &s6[11],
2237 &s6[12], &s6[13], &s6[14], &s6[15],
2241 /* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address (if
2242 * 'in6' is non-null) and returns true. Otherwise, returns false. */
/* Scans /proc/net/if_inet6 for a line naming this device; the result
 * (possibly in6addr_any) is cached under VALID_IN6. */
2244 netdev_linux_get_in6(const struct netdev *netdev_, struct in6_addr *in6)
2246 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2248 ovs_mutex_lock(&netdev->mutex);
2249 if (!(netdev->cache_valid & VALID_IN6)) {
2253 netdev->in6 = in6addr_any;
2255 file = fopen("/proc/net/if_inet6", "r");
2257 const char *name = netdev_get_name(netdev_);
2258 while (fgets(line, sizeof line, file)) {
2259 struct in6_addr in6_tmp;
2260 char ifname[16 + 1];
2261 if (parse_if_inet6_line(line, &in6_tmp, ifname)
2262 && !strcmp(name, ifname))
2264 netdev->in6 = in6_tmp;
2270 netdev->cache_valid |= VALID_IN6;
2273 ovs_mutex_unlock(&netdev->mutex);
/* Fills '*sa' as an AF_INET sockaddr holding 'addr' (port zeroed), going
 * through a local sockaddr_in to avoid aliasing a sockaddr directly. */
2279 make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
2281 struct sockaddr_in sin;
2282 memset(&sin, 0, sizeof sin);
2283 sin.sin_family = AF_INET;
2284 sin.sin_addr = addr;
2287 memset(sa, 0, sizeof *sa);
2288 memcpy(sa, &sin, sizeof sin);
/* Issues address-setting ioctl 'ioctl_nr' (e.g. SIOCSIFADDR) with 'addr'
 * packed into an ifreq's sockaddr field.  ('struct ifreq ifr' declaration
 * elided in this view.) */
2292 do_set_addr(struct netdev *netdev,
2293 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
2297 make_in4_sockaddr(&ifr.ifr_addr, addr);
2298 return af_inet_ifreq_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
2302 /* Adds 'router' as a default IP gateway. */
/* Builds an rtentry with zero destination/mask (i.e. the default route) and
 * installs it with SIOCADDRT; the 'netdev' argument is unused. */
2304 netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
2306 struct in_addr any = { INADDR_ANY };
2310 memset(&rt, 0, sizeof rt);
2311 make_in4_sockaddr(&rt.rt_dst, any);
2312 make_in4_sockaddr(&rt.rt_gateway, router);
2313 make_in4_sockaddr(&rt.rt_genmask, any);
2314 rt.rt_flags = RTF_UP | RTF_GATEWAY;
2315 error = af_inet_ioctl(SIOCADDRT, &rt);
2317 VLOG_WARN("ioctl(SIOCADDRT): %s", ovs_strerror(error));
/* Resolves the next hop toward '*host' by parsing /proc/net/route: on a
 * matching UP route, sets '*next_hop' (zero for a directly-reachable host,
 * else the gateway) and '*netdev_name' to a malloc'd interface name the
 * caller must free. */
2323 netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
2326 static const char fn[] = "/proc/net/route";
2331 *netdev_name = NULL;
2332 stream = fopen(fn, "r");
2333 if (stream == NULL) {
2334 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, ovs_strerror(errno));
2339 while (fgets(line, sizeof line, stream)) {
2342 ovs_be32 dest, gateway, mask;
2343 int refcnt, metric, mtu;
2344 unsigned int flags, use, window, irtt;
/* Each data line has 11 whitespace-separated fields after the header. */
2347 "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
2349 iface, &dest, &gateway, &flags, &refcnt,
2350 &use, &metric, &mask, &mtu, &window, &irtt) != 11) {
2352 VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
2356 if (!(flags & RTF_UP)) {
2357 /* Skip routes that aren't up. */
2361 /* The output of 'dest', 'mask', and 'gateway' were given in
2362 * network byte order, so we don't need need any endian
2363 * conversions here. */
2364 if ((dest & mask) == (host->s_addr & mask)) {
2366 /* The host is directly reachable. */
2367 next_hop->s_addr = 0;
2369 /* To reach the host, we must go through a gateway. */
2370 next_hop->s_addr = gateway;
2372 *netdev_name = xstrdup(iface);
/* Populates 'smap' with driver name/version/firmware from a cached
 * ETHTOOL_GDRVINFO query (cache flag VALID_DRVINFO).  The drvinfo buffer is
 * passed through an ethtool_cmd pointer cast for the ioctl helper. */
2384 netdev_linux_get_status(const struct netdev *netdev_, struct smap *smap)
2386 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2389 ovs_mutex_lock(&netdev->mutex);
2390 if (!(netdev->cache_valid & VALID_DRVINFO)) {
2391 struct ethtool_cmd *cmd = (struct ethtool_cmd *) &netdev->drvinfo;
2393 COVERAGE_INC(netdev_get_ethtool);
2394 memset(&netdev->drvinfo, 0, sizeof netdev->drvinfo);
2395 error = netdev_linux_do_ethtool(netdev->up.name,
2398 "ETHTOOL_GDRVINFO");
2400 netdev->cache_valid |= VALID_DRVINFO;
2405 smap_add(smap, "driver_name", netdev->drvinfo.driver);
2406 smap_add(smap, "driver_version", netdev->drvinfo.version);
2407 smap_add(smap, "firmware_version", netdev->drvinfo.fw_version);
2409 ovs_mutex_unlock(&netdev->mutex);
/* Internal devices have no hardware driver; report a fixed driver name. */
2415 netdev_internal_get_status(const struct netdev *netdev OVS_UNUSED,
2418 smap_add(smap, "driver_name", "openvswitch");
2422 /* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
2423 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
2424 * returns 0. Otherwise, it returns a positive errno value; in particular,
2425 * ENXIO indicates that there is not ARP table entry for 'ip' on 'netdev'. */
2427 netdev_linux_arp_lookup(const struct netdev *netdev,
2428 ovs_be32 ip, uint8_t mac[ETH_ADDR_LEN])
2431 struct sockaddr_in sin;
2434 memset(&r, 0, sizeof r);
2435 memset(&sin, 0, sizeof sin);
2436 sin.sin_family = AF_INET;
2437 sin.sin_addr.s_addr = ip;
/* arp_pa is a plain sockaddr; copy the sockaddr_in into it. */
2439 memcpy(&r.arp_pa, &sin, sizeof sin);
2440 r.arp_ha.sa_family = ARPHRD_ETHER;
2442 ovs_strzcpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
2443 COVERAGE_INC(netdev_arp_lookup);
2444 retval = af_inet_ioctl(SIOCGARP, &r);
2446 memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
/* ENXIO just means "no entry"; don't warn about it. */
2447 } else if (retval != ENXIO) {
2448 VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
2449 netdev_get_name(netdev), IP_ARGS(ip),
2450 ovs_strerror(retval));
/* Translates NETDEV_* flag bits to the corresponding IFF_* interface flags.
 * (IFF_* assignment lines are elided in this view.) */
2456 nd_to_iff_flags(enum netdev_flags nd)
2459 if (nd & NETDEV_UP) {
2462 if (nd & NETDEV_PROMISC) {
/* Inverse of nd_to_iff_flags(): maps kernel IFF_* flags to NETDEV_* bits.
 * (IFF_UP handling is elided in this view.) */
2469 iff_to_nd_flags(int iff)
2471 enum netdev_flags nd = 0;
2475 if (iff & IFF_PROMISC) {
2476 nd |= NETDEV_PROMISC;
/* Atomically clears flags in 'off' and sets flags in 'on', reporting the
 * previous flags in '*old_flagsp'.  After a change, the flags are re-read
 * from the kernel so the cache reflects what actually took effect. */
2482 netdev_linux_update_flags(struct netdev *netdev_, enum netdev_flags off,
2483 enum netdev_flags on, enum netdev_flags *old_flagsp)
2485 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2486 int old_flags, new_flags;
2489 ovs_mutex_lock(&netdev->mutex);
2490 old_flags = netdev->ifi_flags;
2491 *old_flagsp = iff_to_nd_flags(old_flags);
2492 new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
2493 if (new_flags != old_flags) {
2494 error = set_flags(netdev_get_name(netdev_), new_flags);
/* Refresh cached flags even if set_flags failed; return value ignored
 * here — NOTE(review): confirm intent in the full source. */
2495 get_flags(netdev_, &netdev->ifi_flags);
2497 ovs_mutex_unlock(&netdev->mutex);
/* Returns the device's change sequence number, read under the mutex;
 * watchers compare successive values to detect configuration changes. */
2503 netdev_linux_change_seq(const struct netdev *netdev_)
2505 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2506 unsigned int change_seq;
2508 ovs_mutex_lock(&netdev->mutex);
2509 change_seq = netdev->change_seq;
2510 ovs_mutex_unlock(&netdev->mutex);
/* Macro building a complete 'struct netdev_class' initializer: the caller
 * supplies the class NAME and the per-class construct/stats/features/status
 * hooks; all other ops are the shared netdev_linux_* implementations.
 * NOTE(review): many continuation lines of this macro are elided from this
 * view; no comments are inserted inside the macro body because every line
 * there must end with a backslash. */
2515 #define NETDEV_LINUX_CLASS(NAME, CONSTRUCT, GET_STATS, SET_STATS, \
2516 GET_FEATURES, GET_STATUS) \
2522 netdev_linux_wait, \
2524 netdev_linux_alloc, \
2526 netdev_linux_destruct, \
2527 netdev_linux_dealloc, \
2528 NULL, /* get_config */ \
2529 NULL, /* set_config */ \
2530 NULL, /* get_tunnel_config */ \
2532 netdev_linux_send, \
2533 netdev_linux_send_wait, \
2535 netdev_linux_set_etheraddr, \
2536 netdev_linux_get_etheraddr, \
2537 netdev_linux_get_mtu, \
2538 netdev_linux_set_mtu, \
2539 netdev_linux_get_ifindex, \
2540 netdev_linux_get_carrier, \
2541 netdev_linux_get_carrier_resets, \
2542 netdev_linux_set_miimon_interval, \
2547 netdev_linux_set_advertisements, \
2549 netdev_linux_set_policing, \
2550 netdev_linux_get_qos_types, \
2551 netdev_linux_get_qos_capabilities, \
2552 netdev_linux_get_qos, \
2553 netdev_linux_set_qos, \
2554 netdev_linux_get_queue, \
2555 netdev_linux_set_queue, \
2556 netdev_linux_delete_queue, \
2557 netdev_linux_get_queue_stats, \
2558 netdev_linux_dump_queues, \
2559 netdev_linux_dump_queue_stats, \
2561 netdev_linux_get_in4, \
2562 netdev_linux_set_in4, \
2563 netdev_linux_get_in6, \
2564 netdev_linux_add_router, \
2565 netdev_linux_get_next_hop, \
2567 netdev_linux_arp_lookup, \
2569 netdev_linux_update_flags, \
2571 netdev_linux_change_seq, \
2573 netdev_linux_rx_alloc, \
2574 netdev_linux_rx_construct, \
2575 netdev_linux_rx_destruct, \
2576 netdev_linux_rx_dealloc, \
2577 netdev_linux_rx_recv, \
2578 netdev_linux_rx_wait, \
2579 netdev_linux_rx_drain, \
/* netdev class for ordinary Linux system devices ("system" type): generic
 * constructor, kernel-provided stats, ethtool features and driver status. */
2582 const struct netdev_class netdev_linux_class =
2585 netdev_linux_construct,
2586 netdev_linux_get_stats,
2587 NULL, /* set_stats */
2588 netdev_linux_get_features,
2589 netdev_linux_get_status);
/* netdev class for TAP devices: dedicated tap constructor and a tap-specific
 * stats getter; otherwise shares the generic Linux implementation. */
2591 const struct netdev_class netdev_tap_class =
2594 netdev_linux_construct_tap,
2595 netdev_tap_get_stats,
2596 NULL, /* set_stats */
2597 netdev_linux_get_features,
2598 netdev_linux_get_status);
/* netdev class for OVS internal ports: internal stats get/set hooks and no
 * get_features (link features are not meaningful for internal devices). */
2600 const struct netdev_class netdev_internal_class =
2603 netdev_linux_construct,
2604 netdev_internal_get_stats,
2605 netdev_internal_set_stats,
2606 NULL, /* get_features */
2607 netdev_internal_get_status);
2609 /* HTB traffic control class. */
2611 #define HTB_N_QUEUES 0xf000
/* struct htb: per-netdev state for the "linux-htb" qdisc; embeds the generic
 * 'struct tc' (declaration lines elided in this view). */
2615 unsigned int max_rate; /* In bytes/s. */
/* struct htb_class: one HTB leaf class (one OVS queue). */
2619 struct tc_queue tc_queue;
2620 unsigned int min_rate; /* In bytes/s. */
2621 unsigned int max_rate; /* In bytes/s. */
2622 unsigned int burst; /* In bytes. */
2623 unsigned int priority; /* Lower values are higher priorities. */
/* Returns the 'struct htb' embedded in 'netdev_''s tc object.  Caller must
 * ensure the device's qdisc is actually linux-htb. */
2627 htb_get__(const struct netdev *netdev_)
2629 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2630 return CONTAINER_OF(netdev->tc, struct htb, tc);
/* Allocates and installs a fresh 'struct htb' with root rate 'max_rate'
 * (bytes/s) as 'netdev_''s tc object.  Ownership of the allocation passes to
 * the netdev; it is reclaimed via htb_tc_destroy(). */
2634 htb_install__(struct netdev *netdev_, uint64_t max_rate)
2636 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2639 htb = xmalloc(sizeof *htb);
2640 tc_init(&htb->tc, &tc_ops_htb);
2641 htb->max_rate = max_rate;
2643 netdev->tc = &htb->tc;
2646 /* Create an HTB qdisc.
2648 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
2650 htb_setup_qdisc__(struct netdev *netdev)
2653 struct tc_htb_glob opt;
2654 struct ofpbuf request;
2655 struct tcmsg *tcmsg;
/* Remove any existing root qdisc first so the add below starts clean. */
2657 tc_del_qdisc(netdev);
2659 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2660 NLM_F_EXCL | NLM_F_CREATE, &request);
2664 tcmsg->tcm_handle = tc_make_handle(1, 0);
2665 tcmsg->tcm_parent = TC_H_ROOT;
2667 nl_msg_put_string(&request, TCA_KIND, "htb");
2669 memset(&opt, 0, sizeof opt);
/* rate2quantum: DRR quantum = rate / 10, the conventional tc default. */
2670 opt.rate2quantum = 10;
2674 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2675 nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
2676 nl_msg_end_nested(&request, opt_offset);
2678 return tc_transact(&request, NULL);
2681 /* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
2682 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>".
 *
 * Returns 0 on success, otherwise a positive errno value (logged,
 * rate-limited). */
2684 htb_setup_class__(struct netdev *netdev, unsigned int handle,
2685 unsigned int parent, struct htb_class *class)
2688 struct tc_htb_opt opt;
2689 struct ofpbuf request;
2690 struct tcmsg *tcmsg;
/* The MTU is required to build the rate tables; fail early without it. */
2694 error = netdev_get_mtu(netdev, &mtu);
2696 VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU",
2697 netdev_get_name(netdev));
2701 memset(&opt, 0, sizeof opt);
2702 tc_fill_rate(&opt.rate, class->min_rate, mtu);
2703 tc_fill_rate(&opt.ceil, class->max_rate, mtu);
2704 opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
2705 opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
2706 opt.prio = class->priority;
2708 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
2712 tcmsg->tcm_handle = handle;
2713 tcmsg->tcm_parent = parent;
2715 nl_msg_put_string(&request, TCA_KIND, "htb");
2716 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2717 nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
/* The kernel needs explicit rate tables for both rate and ceil. */
2718 tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
2719 tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
2720 nl_msg_end_nested(&request, opt_offset);
2722 error = tc_transact(&request, NULL);
2724 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
2725 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
2726 netdev_get_name(netdev),
2727 tc_get_major(handle), tc_get_minor(handle),
2728 tc_get_major(parent), tc_get_minor(parent),
2729 class->min_rate, class->max_rate,
2730 class->burst, class->priority, ovs_strerror(error));
2735 /* Parses Netlink attributes in 'options' for HTB parameters and stores a
2736 * description of them into 'details'. The description complies with the
2737 * specification given in the vswitch database documentation for linux-htb
 * queue details.  Returns 0 on success, nonzero (presumably EPROTO) if the
 * nested attributes do not match the policy. */
2740 htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
2742 static const struct nl_policy tca_htb_policy[] = {
2743 [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
2744 .min_len = sizeof(struct tc_htb_opt) },
2747 struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
2748 const struct tc_htb_opt *htb;
2750 if (!nl_parse_nested(nl_options, tca_htb_policy,
2751 attrs, ARRAY_SIZE(tca_htb_policy))) {
2752 VLOG_WARN_RL(&rl, "failed to parse HTB class options");
2756 htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
2757 class->min_rate = htb->rate.rate;
2758 class->max_rate = htb->ceil.rate;
/* The kernel reports buffer in ticks; convert back to bytes. */
2759 class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
2760 class->priority = htb->prio;
/* Parses a tc class reply message: extracts the OVS queue id from the class
 * handle into '*queue_id' (if nonnull), HTB parameters into '*options' (if
 * nonnull), and statistics into '*stats' (if nonnull).  Queue ids map from
 * tc minor numbers as queue_id = minor - 1, restricted to major 1 and
 * minors 1..HTB_N_QUEUES. */
2765 htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2766 struct htb_class *options,
2767 struct netdev_queue_stats *stats)
2769 struct nlattr *nl_options;
2770 unsigned int handle;
2773 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2774 if (!error && queue_id) {
2775 unsigned int major = tc_get_major(handle);
2776 unsigned int minor = tc_get_minor(handle);
2777 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2778 *queue_id = minor - 1;
2783 if (!error && options) {
2784 error = htb_parse_tca_options__(nl_options, options);
/* Parses "linux-htb" qdisc 'details': reads "max-rate" (bits/s) and stores
 * byte-per-second rates into 'hc'.  When "max-rate" is absent or zero, falls
 * back to the device's advertised link speed (default 100 Mbps).  The root
 * class uses min_rate == max_rate. */
2790 htb_parse_qdisc_details__(struct netdev *netdev,
2791 const struct smap *details, struct htb_class *hc)
2793 const char *max_rate_s;
2795 max_rate_s = smap_get(details, "max-rate");
2796 hc->max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
2797 if (!hc->max_rate) {
2798 enum netdev_features current;
/* Fixed mis-encoded "&current" (was garbled as the HTML entity "¤t"). */
2800 netdev_get_features(netdev, &current, NULL, NULL, NULL);
2801 hc->max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
2803 hc->min_rate = hc->max_rate;
/* Parses per-queue "linux-htb" 'details' (min-rate, max-rate, burst,
 * priority — all rates in bits/s) into 'hc', clamping values so they are
 * mutually consistent and within the qdisc's configured max rate.  Returns
 * nonzero if the device's MTU is unavailable. */
2809 htb_parse_class_details__(struct netdev *netdev,
2810 const struct smap *details, struct htb_class *hc)
2812 const struct htb *htb = htb_get__(netdev);
2813 const char *min_rate_s = smap_get(details, "min-rate");
2814 const char *max_rate_s = smap_get(details, "max-rate");
2815 const char *burst_s = smap_get(details, "burst");
2816 const char *priority_s = smap_get(details, "priority");
2819 error = netdev_get_mtu(netdev, &mtu);
2821 VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU",
2822 netdev_get_name(netdev));
2826 /* HTB requires at least an mtu sized min-rate to send any traffic even
2827 * on uncongested links. */
2828 hc->min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
2829 hc->min_rate = MAX(hc->min_rate, mtu);
2830 hc->min_rate = MIN(hc->min_rate, htb->max_rate);
/* max-rate defaults (elided line) and is clamped to [min_rate, qdisc max]. */
2833 hc->max_rate = (max_rate_s
2834 ? strtoull(max_rate_s, NULL, 10) / 8
2836 hc->max_rate = MAX(hc->max_rate, hc->min_rate);
2837 hc->max_rate = MIN(hc->max_rate, htb->max_rate);
2841 * According to hints in the documentation that I've read, it is important
2842 * that 'burst' be at least as big as the largest frame that might be
2843 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
2844 * but having it a bit too small is a problem. Since netdev_get_mtu()
2845 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
2846 * the MTU. We actually add 64, instead of 14, as a guard against
2847 * additional headers get tacked on somewhere that we're not aware of. */
2848 hc->burst = burst_s ? strtoull(burst_s, NULL, 10) / 8 : 0;
2849 hc->burst = MAX(hc->burst, mtu + 64);
2852 hc->priority = priority_s ? strtoul(priority_s, NULL, 10) : 0;
/* Queries the kernel for the HTB class 'handle' under 'parent' on 'netdev',
 * then parses the reply into '*options' and/or '*stats' (either may be
 * NULL).  Returns 0 on success, else a positive errno value. */
2858 htb_query_class__(const struct netdev *netdev, unsigned int handle,
2859 unsigned int parent, struct htb_class *options,
2860 struct netdev_queue_stats *stats)
2862 struct ofpbuf *reply;
2865 error = tc_query_class(netdev, handle, parent, &reply);
2867 error = htb_parse_tcmsg__(reply, NULL, options, stats);
2868 ofpbuf_delete(reply);
/* tc_ops "tc_install" hook for linux-htb: creates the root HTB qdisc, then
 * the default class 1:fffe sized from 'details', and finally records the
 * new tc state on the netdev via htb_install__(). */
2874 htb_tc_install(struct netdev *netdev, const struct smap *details)
2878 error = htb_setup_qdisc__(netdev);
2880 struct htb_class hc;
2882 htb_parse_qdisc_details__(netdev, details, &hc);
2883 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2884 tc_make_handle(1, 0), &hc);
2886 htb_install__(netdev, hc.max_rate);
/* Downcasts a generic 'tc_queue' to its containing 'struct htb_class'. */
2892 static struct htb_class *
2893 htb_class_cast__(const struct tc_queue *queue)
2895 return CONTAINER_OF(queue, struct htb_class, tc_queue);
/* Records queue 'queue_id' with parameters 'hc' in the in-memory htb state:
 * updates the existing hash-map entry if one exists, otherwise allocates a
 * new htb_class and inserts it into the tc queues hmap. */
2899 htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
2900 const struct htb_class *hc)
2902 struct htb *htb = htb_get__(netdev);
2903 size_t hash = hash_int(queue_id, 0);
2904 struct tc_queue *queue;
2905 struct htb_class *hcp;
2907 queue = tc_find_queue__(netdev, queue_id, hash);
2909 hcp = htb_class_cast__(queue);
2911 hcp = xmalloc(sizeof *hcp);
2912 queue = &hcp->tc_queue;
2913 queue->queue_id = queue_id;
2914 queue->created = time_msec();
2915 hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
2918 hcp->min_rate = hc->min_rate;
2919 hcp->max_rate = hc->max_rate;
2920 hcp->burst = hc->burst;
2921 hcp->priority = hc->priority;
/* tc_ops "tc_load" hook for linux-htb: reconstructs in-memory state from the
 * kernel by querying the default class 1:fffe for the qdisc max-rate and
 * then dumping every class to populate the queue map. */
2925 htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
2928 struct nl_dump dump;
2929 struct htb_class hc;
2931 /* Get qdisc options. */
2933 htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
2934 htb_install__(netdev, hc.max_rate);
/* Walk all kernel classes; non-queue classes are skipped by the parser. */
2937 if (!start_queue_dump(netdev, &dump)) {
2940 while (nl_dump_next(&dump, &msg)) {
2941 unsigned int queue_id;
2943 if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
2944 htb_update_queue__(netdev, queue_id, &hc);
2947 nl_dump_done(&dump);
/* tc_ops "tc_destroy" hook: frees every queued htb_class (removal from the
 * hmap precedes the free, elided here) and then the htb itself. */
2953 htb_tc_destroy(struct tc *tc)
2955 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
2956 struct htb_class *hc, *next;
2958 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &htb->tc.queues) {
2959 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
/* tc_ops "qdisc_get" hook: reports the qdisc "max-rate" in bits/s (stored
 * internally in bytes/s, hence the * 8). */
2967 htb_qdisc_get(const struct netdev *netdev, struct smap *details)
2969 const struct htb *htb = htb_get__(netdev);
2970 smap_add_format(details, "max-rate", "%llu", 8ULL * htb->max_rate);
/* tc_ops "qdisc_set" hook: re-parses 'details' and replaces the default
 * class 1:fffe in the kernel; on success updates the cached max_rate. */
2975 htb_qdisc_set(struct netdev *netdev, const struct smap *details)
2977 struct htb_class hc;
2980 htb_parse_qdisc_details__(netdev, details, &hc);
2981 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2982 tc_make_handle(1, 0), &hc);
2984 htb_get__(netdev)->max_rate = hc.max_rate;
/* tc_ops "class_get" hook: formats a queue's cached parameters into
 * 'details' in bits/s.  "max-rate" is omitted when it equals "min-rate",
 * matching how unspecified max-rate is stored. */
2990 htb_class_get(const struct netdev *netdev OVS_UNUSED,
2991 const struct tc_queue *queue, struct smap *details)
2993 const struct htb_class *hc = htb_class_cast__(queue);
2995 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
2996 if (hc->min_rate != hc->max_rate) {
2997 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
2999 smap_add_format(details, "burst", "%llu", 8ULL * hc->burst);
3001 smap_add_format(details, "priority", "%u", hc->priority);
/* tc_ops "class_set" hook: validates 'details', programs kernel class
 * 1:(queue_id+1) under parent 1:fffe, then mirrors the result into the
 * in-memory queue map. */
3007 htb_class_set(struct netdev *netdev, unsigned int queue_id,
3008 const struct smap *details)
3010 struct htb_class hc;
3013 error = htb_parse_class_details__(netdev, details, &hc);
3018 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3019 tc_make_handle(1, 0xfffe), &hc);
3024 htb_update_queue__(netdev, queue_id, &hc);
/* tc_ops "class_delete" hook: deletes kernel class 1:(queue_id+1) and, on
 * success, drops the queue from the in-memory map (free elided here). */
3029 htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
3031 struct htb_class *hc = htb_class_cast__(queue);
3032 struct htb *htb = htb_get__(netdev);
3035 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3037 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
/* tc_ops "class_get_stats" hook: fetches kernel statistics for the queue's
 * tc class (options pointer NULL — stats only). */
3044 htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3045 struct netdev_queue_stats *stats)
3047 return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3048 tc_make_handle(1, 0xfffe), NULL, stats);
/* tc_ops "class_dump_stats" hook: parses one class message from a dump and,
 * if its handle maps to an OVS queue (major 1, minor in 1..HTB_N_QUEUES),
 * invokes 'cb' with the queue id and its stats. */
3052 htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3053 const struct ofpbuf *nlmsg,
3054 netdev_dump_queue_stats_cb *cb, void *aux)
3056 struct netdev_queue_stats stats;
3057 unsigned int handle, major, minor;
3060 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3065 major = tc_get_major(handle);
3066 minor = tc_get_minor(handle);
3067 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
3068 (*cb)(minor - 1, &stats, aux);
/* tc_ops vtable binding the htb_* hooks above to the "linux-htb" QoS type. */
3073 static const struct tc_ops tc_ops_htb = {
3074 "htb", /* linux_name */
3075 "linux-htb", /* ovs_name */
3076 HTB_N_QUEUES, /* n_queues */
3085 htb_class_get_stats,
3086 htb_class_dump_stats
3089 /* "linux-hfsc" traffic control class. */
3091 #define HFSC_N_QUEUES 0xf000
/* struct hfsc / struct hfsc_class declarations (most lines elided): hfsc
 * embeds 'struct tc' plus max_rate; each class embeds a tc_queue. */
3099 struct tc_queue tc_queue;
/* Returns the 'struct hfsc' embedded in 'netdev_''s tc object. */
3104 static struct hfsc *
3105 hfsc_get__(const struct netdev *netdev_)
3107 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3108 return CONTAINER_OF(netdev->tc, struct hfsc, tc);
/* Downcasts a generic 'tc_queue' to its containing 'struct hfsc_class'. */
3111 static struct hfsc_class *
3112 hfsc_class_cast__(const struct tc_queue *queue)
3114 return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
/* Allocates and installs a fresh 'struct hfsc' with root rate 'max_rate'
 * (bytes/s) as 'netdev_''s tc object. */
3118 hfsc_install__(struct netdev *netdev_, uint32_t max_rate)
3120 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3123 hfsc = xmalloc(sizeof *hfsc);
3124 tc_init(&hfsc->tc, &tc_ops_hfsc);
3125 hfsc->max_rate = max_rate;
3126 netdev->tc = &hfsc->tc;
/* Records queue 'queue_id' with parameters 'hc' in the in-memory hfsc
 * state, inserting a new hfsc_class into the queues hmap if needed
 * (mirror of htb_update_queue__). */
3130 hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
3131 const struct hfsc_class *hc)
3135 struct hfsc_class *hcp;
3136 struct tc_queue *queue;
3138 hfsc = hfsc_get__(netdev);
3139 hash = hash_int(queue_id, 0);
3141 queue = tc_find_queue__(netdev, queue_id, hash);
3143 hcp = hfsc_class_cast__(queue);
3145 hcp = xmalloc(sizeof *hcp);
3146 queue = &hcp->tc_queue;
3147 queue->queue_id = queue_id;
3148 queue->created = time_msec();
3149 hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
3152 hcp->min_rate = hc->min_rate;
3153 hcp->max_rate = hc->max_rate;
/* Parses the nested HFSC class attributes in 'nl_options' into 'class'.
 * Only linear service curves are supported (m1 == 0, d == 0), the
 * real-time and fair-share curves must agree, and min-rate must not
 * exceed max-rate; anything else is rejected with a warning. */
3157 hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
3159 const struct tc_service_curve *rsc, *fsc, *usc;
3160 static const struct nl_policy tca_hfsc_policy[] = {
3162 .type = NL_A_UNSPEC,
3164 .min_len = sizeof(struct tc_service_curve),
3167 .type = NL_A_UNSPEC,
3169 .min_len = sizeof(struct tc_service_curve),
3172 .type = NL_A_UNSPEC,
3174 .min_len = sizeof(struct tc_service_curve),
3177 struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];
3179 if (!nl_parse_nested(nl_options, tca_hfsc_policy,
3180 attrs, ARRAY_SIZE(tca_hfsc_policy))) {
3181 VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
3185 rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
3186 fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
3187 usc = nl_attr_get(attrs[TCA_HFSC_USC]);
3189 if (rsc->m1 != 0 || rsc->d != 0 ||
3190 fsc->m1 != 0 || fsc->d != 0 ||
3191 usc->m1 != 0 || usc->d != 0) {
3192 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3193 "Non-linear service curves are not supported.");
3197 if (rsc->m2 != fsc->m2) {
3198 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3199 "Real-time service curves are not supported ");
3203 if (rsc->m2 > usc->m2) {
3204 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3205 "Min-rate service curve is greater than "
3206 "the max-rate service curve.");
/* min-rate comes from the fair-share curve, max-rate from the upper-limit
 * curve. */
3210 class->min_rate = fsc->m2;
3211 class->max_rate = usc->m2;
/* Parses a tc class reply for HFSC: extracts the queue id from the class
 * handle into '*queue_id' (if nonnull) and class options into '*options'
 * (if nonnull); 'stats' is filled by tc_parse_class().  Mirror of
 * htb_parse_tcmsg__. */
3216 hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
3217 struct hfsc_class *options,
3218 struct netdev_queue_stats *stats)
3221 unsigned int handle;
3222 struct nlattr *nl_options;
3224 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
3230 unsigned int major, minor;
3232 major = tc_get_major(handle);
3233 minor = tc_get_minor(handle);
3234 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3235 *queue_id = minor - 1;
3242 error = hfsc_parse_tca_options__(nl_options, options);
/* Queries the kernel for HFSC class 'handle' under 'parent' on 'netdev' and
 * parses the reply into '*options' and/or '*stats' (either may be NULL). */
3249 hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
3250 unsigned int parent, struct hfsc_class *options,
3251 struct netdev_queue_stats *stats)
3254 struct ofpbuf *reply;
3256 error = tc_query_class(netdev, handle, parent, &reply);
3261 error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
3262 ofpbuf_delete(reply);
/* Parses "linux-hfsc" qdisc 'details': reads "max-rate" (bits/s), falling
 * back to the device's advertised link speed (default 100 Mbps) when absent
 * or zero, and stores the byte-per-second rate as both min and max for the
 * root class. */
3267 hfsc_parse_qdisc_details__(struct netdev *netdev, const struct smap *details,
3268 struct hfsc_class *class)
3271 const char *max_rate_s;
3273 max_rate_s = smap_get(details, "max-rate");
3274 max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
3277 enum netdev_features current;
/* Fixed mis-encoded "&current" (was garbled as the HTML entity "¤t"). */
3279 netdev_get_features(netdev, &current, NULL, NULL, NULL);
3280 max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
3283 class->min_rate = max_rate;
3284 class->max_rate = max_rate;
/* Parses per-queue "linux-hfsc" 'details' (min-rate, max-rate, in bits/s)
 * into 'class', clamping min-rate to at least 1 byte/s and both rates to
 * the qdisc's configured maximum. */
3288 hfsc_parse_class_details__(struct netdev *netdev,
3289 const struct smap *details,
3290 struct hfsc_class * class)
3292 const struct hfsc *hfsc;
3293 uint32_t min_rate, max_rate;
3294 const char *min_rate_s, *max_rate_s;
3296 hfsc = hfsc_get__(netdev);
3297 min_rate_s = smap_get(details, "min-rate");
3298 max_rate_s = smap_get(details, "max-rate");
3300 min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
3301 min_rate = MAX(min_rate, 1);
3302 min_rate = MIN(min_rate, hfsc->max_rate);
3304 max_rate = (max_rate_s
3305 ? strtoull(max_rate_s, NULL, 10) / 8
3307 max_rate = MAX(max_rate, min_rate);
3308 max_rate = MIN(max_rate, hfsc->max_rate);
3310 class->min_rate = min_rate;
3311 class->max_rate = max_rate;
3316 /* Create an HFSC qdisc.
3318 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
3320 hfsc_setup_qdisc__(struct netdev * netdev)
3322 struct tcmsg *tcmsg;
3323 struct ofpbuf request;
3324 struct tc_hfsc_qopt opt;
/* Remove any existing root qdisc first so the add below starts clean. */
3326 tc_del_qdisc(netdev);
3328 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3329 NLM_F_EXCL | NLM_F_CREATE, &request);
3335 tcmsg->tcm_handle = tc_make_handle(1, 0);
3336 tcmsg->tcm_parent = TC_H_ROOT;
3338 memset(&opt, 0, sizeof opt);
3341 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3342 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
3344 return tc_transact(&request, NULL);
3347 /* Create an HFSC class.
3349 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
3350 * sc rate <min_rate> ul rate <max_rate>" */
3352 hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
3353 unsigned int parent, struct hfsc_class *class)
3357 struct tcmsg *tcmsg;
3358 struct ofpbuf request;
3359 struct tc_service_curve min, max;
3361 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
3367 tcmsg->tcm_handle = handle;
3368 tcmsg->tcm_parent = parent;
/* Linear service curves: only m2 (steady-state rate) is nonzero; the
 * memset/m1/d assignments are elided in this view. */
3372 min.m2 = class->min_rate;
3376 max.m2 = class->max_rate;
3378 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3379 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
/* RSC and FSC both carry the min-rate curve; USC carries the max-rate. */
3380 nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
3381 nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
3382 nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
3383 nl_msg_end_nested(&request, opt_offset);
3385 error = tc_transact(&request, NULL);
3387 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
3388 "min-rate %ubps, max-rate %ubps (%s)",
3389 netdev_get_name(netdev),
3390 tc_get_major(handle), tc_get_minor(handle),
3391 tc_get_major(parent), tc_get_minor(parent),
3392 class->min_rate, class->max_rate, ovs_strerror(error));
/* tc_ops "tc_install" hook for linux-hfsc: creates the root HFSC qdisc,
 * then the default class 1:fffe sized from 'details', and records the new
 * tc state via hfsc_install__(). */
3399 hfsc_tc_install(struct netdev *netdev, const struct smap *details)
3402 struct hfsc_class class;
3404 error = hfsc_setup_qdisc__(netdev);
3410 hfsc_parse_qdisc_details__(netdev, details, &class);
3411 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3412 tc_make_handle(1, 0), &class);
3418 hfsc_install__(netdev, class.max_rate);
/* tc_ops "tc_load" hook for linux-hfsc: reconstructs in-memory state from
 * the kernel by querying class 1:fffe for the qdisc rate, then dumping all
 * classes to rebuild the queue map (mirror of htb_tc_load). */
3423 hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3426 struct nl_dump dump;
3427 struct hfsc_class hc;
3430 hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
3431 hfsc_install__(netdev, hc.max_rate);
3433 if (!start_queue_dump(netdev, &dump)) {
3437 while (nl_dump_next(&dump, &msg)) {
3438 unsigned int queue_id;
3440 if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3441 hfsc_update_queue__(netdev, queue_id, &hc);
3445 nl_dump_done(&dump);
/* tc_ops "tc_destroy" hook: frees every queued hfsc_class and then the hfsc
 * itself (frees elided in this view). */
3450 hfsc_tc_destroy(struct tc *tc)
3453 struct hfsc_class *hc, *next;
3455 hfsc = CONTAINER_OF(tc, struct hfsc, tc);
3457 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
3458 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
/* tc_ops "qdisc_get" hook: reports the qdisc "max-rate" in bits/s. */
3467 hfsc_qdisc_get(const struct netdev *netdev, struct smap *details)
3469 const struct hfsc *hfsc;
3470 hfsc = hfsc_get__(netdev);
3471 smap_add_format(details, "max-rate", "%llu", 8ULL * hfsc->max_rate);
/* tc_ops "qdisc_set" hook: re-parses 'details' and replaces the default
 * class 1:fffe in the kernel; on success updates the cached max_rate. */
3476 hfsc_qdisc_set(struct netdev *netdev, const struct smap *details)
3479 struct hfsc_class class;
3481 hfsc_parse_qdisc_details__(netdev, details, &class);
3482 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3483 tc_make_handle(1, 0), &class);
3486 hfsc_get__(netdev)->max_rate = class.max_rate;
/* tc_ops "class_get" hook: formats a queue's cached min/max rates into
 * 'details' in bits/s; "max-rate" is omitted when equal to "min-rate". */
3493 hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
3494 const struct tc_queue *queue, struct smap *details)
3496 const struct hfsc_class *hc;
3498 hc = hfsc_class_cast__(queue);
3499 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
3500 if (hc->min_rate != hc->max_rate) {
3501 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
/* tc_ops "class_set" hook: validates 'details', programs kernel class
 * 1:(queue_id+1) under parent 1:fffe, then mirrors the result into the
 * in-memory queue map. */
3507 hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
3508 const struct smap *details)
3511 struct hfsc_class class;
3513 error = hfsc_parse_class_details__(netdev, details, &class);
3518 error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3519 tc_make_handle(1, 0xfffe), &class);
3524 hfsc_update_queue__(netdev, queue_id, &class);
/* tc_ops "class_delete" hook: deletes kernel class 1:(queue_id+1) and, on
 * success, drops the queue from the in-memory map (free elided here). */
3529 hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
3533 struct hfsc_class *hc;
3535 hc = hfsc_class_cast__(queue);
3536 hfsc = hfsc_get__(netdev);
3538 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3540 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
/* tc_ops "class_get_stats" hook: fetches kernel statistics for the queue's
 * tc class (options pointer NULL — stats only). */
3547 hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3548 struct netdev_queue_stats *stats)
3550 return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3551 tc_make_handle(1, 0xfffe), NULL, stats);
/* tc_ops "class_dump_stats" hook: parses one class message from a dump and,
 * if its handle maps to an OVS queue, invokes 'cb' with queue id + stats. */
3555 hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3556 const struct ofpbuf *nlmsg,
3557 netdev_dump_queue_stats_cb *cb, void *aux)
3559 struct netdev_queue_stats stats;
3560 unsigned int handle, major, minor;
3563 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3568 major = tc_get_major(handle);
3569 minor = tc_get_minor(handle);
3570 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3571 (*cb)(minor - 1, &stats, aux);
/* tc_ops vtable binding the hfsc_* hooks above to the "linux-hfsc" type. */
3576 static const struct tc_ops tc_ops_hfsc = {
3577 "hfsc", /* linux_name */
3578 "linux-hfsc", /* ovs_name */
3579 HFSC_N_QUEUES, /* n_queues */
3580 hfsc_tc_install, /* tc_install */
3581 hfsc_tc_load, /* tc_load */
3582 hfsc_tc_destroy, /* tc_destroy */
3583 hfsc_qdisc_get, /* qdisc_get */
3584 hfsc_qdisc_set, /* qdisc_set */
3585 hfsc_class_get, /* class_get */
3586 hfsc_class_set, /* class_set */
3587 hfsc_class_delete, /* class_delete */
3588 hfsc_class_get_stats, /* class_get_stats */
3589 hfsc_class_dump_stats /* class_dump_stats */
3592 /* "linux-default" traffic control class.
3594 * This class represents the default, unnamed Linux qdisc. It corresponds to
3595 * the "" (empty string) QoS type in the OVS database. */
/* Installs the shared, immutable default-tc singleton on 'netdev_'. */
3598 default_install__(struct netdev *netdev_)
3600 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3601 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_default);
3603 /* Nothing but a tc class implementation is allowed to write to a tc. This
3604 * class never does that, so we can legitimately use a const tc object. */
3605 netdev->tc = CONST_CAST(struct tc *, &tc);
/* tc_ops "tc_install" hook for the default qdisc: nothing to configure in
 * the kernel; just install the singleton tc state. */
3609 default_tc_install(struct netdev *netdev,
3610 const struct smap *details OVS_UNUSED)
3612 default_install__(netdev);
/* tc_ops "tc_load" hook for the default qdisc: install the singleton. */
3617 default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3619 default_install__(netdev);
/* tc_ops vtable for the "" (default) QoS type; all per-queue operations are
 * NULL because the default qdisc exposes no configurable queues. */
3623 static const struct tc_ops tc_ops_default = {
3624 NULL, /* linux_name */
3629 NULL, /* tc_destroy */
3630 NULL, /* qdisc_get */
3631 NULL, /* qdisc_set */
3632 NULL, /* class_get */
3633 NULL, /* class_set */
3634 NULL, /* class_delete */
3635 NULL, /* class_get_stats */
3636 NULL /* class_dump_stats */
3639 /* "linux-other" traffic control class.
 *
 * tc_ops "tc_load" hook used when the kernel qdisc is one OVS does not
 * understand: installs an immutable placeholder tc singleton. */
3644 other_tc_load(struct netdev *netdev_, struct ofpbuf *nlmsg OVS_UNUSED)
3646 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3647 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_other);
3649 /* Nothing but a tc class implementation is allowed to write to a tc. This
3650 * class never does that, so we can legitimately use a const tc object. */
3651 netdev->tc = CONST_CAST(struct tc *, &tc);
/* tc_ops vtable for the "linux-other" type: read-only placeholder, so every
 * mutating or per-queue operation is NULL. */
3655 static const struct tc_ops tc_ops_other = {
3656 NULL, /* linux_name */
3657 "linux-other", /* ovs_name */
3659 NULL, /* tc_install */
3661 NULL, /* tc_destroy */
3662 NULL, /* qdisc_get */
3663 NULL, /* qdisc_set */
3664 NULL, /* class_get */
3665 NULL, /* class_set */
3666 NULL, /* class_delete */
3667 NULL, /* class_get_stats */
3668 NULL /* class_dump_stats */
3671 /* Traffic control. */
3673 /* Number of kernel "tc" ticks per second. */
/* Set from /proc/net/psched by the psched-reading code below; zero until
 * then. */
3674 static double ticks_per_s;
3676 /* Number of kernel "jiffies" per second. This is used for the purpose of
3677 * computing buffer sizes. Generally kernel qdiscs need to be able to buffer
3678 * one jiffy's worth of data.
3680 * There are two possibilities here:
3682 * - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
3683 * approximate range of 100 to 1024. That means that we really need to
3684 * make sure that the qdisc can buffer that much data.
3686 * - 'buffer_hz' is an absurdly large number. That means that the kernel
3687 * has finely granular timers and there's no need to fudge additional room
3688 * for buffers. (There's no extra effort needed to implement that: the
3689 * large 'buffer_hz' is used as a divisor, so practically any number will
3690 * come out as 0 in the division. Small integer results in the case of
3691 * really high dividends won't have any real effect anyhow.)
3693 static unsigned int buffer_hz;
/* Builds the tc handle 'major':'minor' in the packed form the kernel
 * traffic-control API uses: major in the upper 16 bits, minor in the
 * lower 16 bits. */
static unsigned int
tc_make_handle(unsigned int major, unsigned int minor)
{
    return TC_H_MAKE(major << 16, minor);
}
/* Extracts the major number (upper 16 bits) from tc handle 'handle'. */
static unsigned int
tc_get_major(unsigned int handle)
{
    return TC_H_MAJ(handle) >> 16;
}
/* Extracts the minor number (lower 16 bits) from tc handle 'handle'. */
static unsigned int
tc_get_minor(unsigned int handle)
{
    return TC_H_MIN(handle);
}
/* Initializes 'request' as an rtnetlink tc message of type 'type' (e.g.
 * RTM_NEWQDISC) with nlmsg flags NLM_F_REQUEST | 'flags' for 'netdev', and
 * returns the embedded tcmsg for the caller to finish (tcm_handle,
 * tcm_parent).  Presumably returns NULL when the ifindex lookup fails —
 * the early-return lines are elided here. */
3716 static struct tcmsg *
3717 tc_make_request(const struct netdev *netdev, int type, unsigned int flags,
3718 struct ofpbuf *request)
3720 struct tcmsg *tcmsg;
3724 error = get_ifindex(netdev, &ifindex);
3729 ofpbuf_init(request, 512);
3730 nl_msg_put_nlmsghdr(request, sizeof *tcmsg, type, NLM_F_REQUEST | flags);
3731 tcmsg = ofpbuf_put_zeros(request, sizeof *tcmsg);
3732 tcmsg->tcm_family = AF_UNSPEC;
3733 tcmsg->tcm_ifindex = ifindex;
3734 /* Caller should fill in tcmsg->tcm_handle. */
3735 /* Caller should fill in tcmsg->tcm_parent. */
/* Executes 'request' on the NETLINK_ROUTE socket, storing any reply in
 * '*replyp' (if nonnull), and frees the request buffer.  Returns 0 or a
 * positive errno value from nl_transact(). */
3741 tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
3743 int error = nl_transact(NETLINK_ROUTE, request, replyp);
3744 ofpbuf_uninit(request);
3748 /* Adds or deletes a root ingress qdisc on 'netdev'. We use this for
3749 * policing configuration.
3751 * This function is equivalent to running the following when 'add' is true:
3752 * /sbin/tc qdisc add dev <devname> handle ffff: ingress
3754 * This function is equivalent to running the following when 'add' is false:
3755 * /sbin/tc qdisc del dev <devname> handle ffff: ingress
3757 * The configuration and stats may be seen with the following command:
3758 * /sbin/tc -s qdisc show dev <devname>
3760 * Returns 0 if successful, otherwise a positive errno value.
3763 tc_add_del_ingress_qdisc(struct netdev *netdev, bool add)
3765 struct ofpbuf request;
3766 struct tcmsg *tcmsg;
3768 int type = add ? RTM_NEWQDISC : RTM_DELQDISC;
3769 int flags = add ? NLM_F_EXCL | NLM_F_CREATE : 0;
3771 tcmsg = tc_make_request(netdev, type, flags, &request);
3775 tcmsg->tcm_handle = tc_make_handle(0xffff, 0);
3776 tcmsg->tcm_parent = TC_H_INGRESS;
3777 nl_msg_put_string(&request, TCA_KIND, "ingress");
3778 nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0);
3780 error = tc_transact(&request, NULL);
3782 /* If we're deleting the qdisc, don't worry about some of the
3783 * error conditions. */
3784 if (!add && (error == ENOENT || error == EINVAL)) {
3793 /* Adds a policer to 'netdev' with a rate of 'kbits_rate' and a burst size
 * of 'kbits_burst'.
3796 * This function is equivalent to running:
3797 * /sbin/tc filter add dev <devname> parent ffff: protocol all prio 49
3798 * basic police rate <kbits_rate>kbit burst <kbits_burst>k
3801 * The configuration and stats may be seen with the following command:
3802 * /sbin/tc -s filter show <devname> eth0 parent ffff:
3804 * Returns 0 if successful, otherwise a positive errno value.
3807 tc_add_policer(struct netdev *netdev, int kbits_rate, int kbits_burst)
3809 struct tc_police tc_police;
3810 struct ofpbuf request;
3811 struct tcmsg *tcmsg;
3812 size_t basic_offset;
3813 size_t police_offset;
3817 memset(&tc_police, 0, sizeof tc_police);
3818 tc_police.action = TC_POLICE_SHOT;
3819 tc_police.mtu = mtu;
/* Rate is given in kbit/s; the kernel wants bytes/s, hence * 1000 / 8.
 * Burst is converted from kbits to bytes (* 1024 / 8 — the * 1024 here
 * treats "k" as 1024 bits; confirm against current tc semantics). */
3820 tc_fill_rate(&tc_police.rate, (kbits_rate * 1000)/8, mtu);
3821 tc_police.burst = tc_bytes_to_ticks(tc_police.rate.rate,
3822 kbits_burst * 1024);
3824 tcmsg = tc_make_request(netdev, RTM_NEWTFILTER,
3825 NLM_F_EXCL | NLM_F_CREATE, &request);
3829 tcmsg->tcm_parent = tc_make_handle(0xffff, 0);
3830 tcmsg->tcm_info = tc_make_handle(49,
3831 (OVS_FORCE uint16_t) htons(ETH_P_ALL));
3833 nl_msg_put_string(&request, TCA_KIND, "basic");
3834 basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3835 police_offset = nl_msg_start_nested(&request, TCA_BASIC_POLICE);
3836 nl_msg_put_unspec(&request, TCA_POLICE_TBF, &tc_police, sizeof tc_police);
3837 tc_put_rtab(&request, TCA_POLICE_RATE, &tc_police.rate);
3838 nl_msg_end_nested(&request, police_offset);
3839 nl_msg_end_nested(&request, basic_offset);
3841 error = tc_transact(&request, NULL);
3852 /* The values in psched are not individually very meaningful, but they are
3853 * important. The tables below show some values seen in the wild.
3857 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
3858 * (Before that, there are hints that it was 1000000000.)
3860 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
3864 * -----------------------------------
3865 * [1] 000c8000 000f4240 000f4240 00000064
3866 * [2] 000003e8 00000400 000f4240 3b9aca00
3867 * [3] 000003e8 00000400 000f4240 3b9aca00
3868 * [4] 000003e8 00000400 000f4240 00000064
3869 * [5] 000003e8 00000040 000f4240 3b9aca00
3870 * [6] 000003e8 00000040 000f4240 000000f9
3872 * a b c d ticks_per_s buffer_hz
3873 * ------- --------- ---------- ------------- ----------- -------------
3874 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
3875 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3876 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3877 * [4] 1,000 1,024 1,000,000 100 976,562 100
3878 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
3879 * [6] 1,000 64 1,000,000 249 15,625,000 249
3881 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
3882 * [2] 2.6.26-1-686-bigmem from Debian lenny
3883 * [3] 2.6.26-2-sparc64 from Debian lenny
3884 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
3885 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
3886 * [6] 2.6.34 from kernel.org on KVM
/* Reads the kernel's traffic-control clock parameters from /proc/net/psched
 * and derives the file-scope 'ticks_per_s' (and, per the VLOG_DBG below,
 * 'buffer_hz').  Runs at most once per process, guarded by 'once'.
 * NOTE(review): fragment -- the function signature, fclose, and several
 * error-path lines are missing from this extraction (embedded line numbers
 * jump: 3890->3893, 3893->3900, ...); recover them from upstream before use. */
3888 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
3889 static const char fn[] = "/proc/net/psched";
/* The four hexadecimal fields of /proc/net/psched (see the table above). */
3890 unsigned int a, b, c, d;
3893 if (!ovsthread_once_start(&once)) {
3900 stream = fopen(fn, "r");
3902 VLOG_WARN("%s: open failed: %s", fn, ovs_strerror(errno));
3906 if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
3907 VLOG_WARN("%s: read failed", fn);
3911 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
3915 VLOG_WARN("%s: invalid scheduler parameters", fn);
/* ticks_per_s combines the first three psched fields; double arithmetic
 * avoids overflow for the large values seen in the wild (comment above). */
3919 ticks_per_s = (double) a * c / b;
3923 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
3926 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);
3929 ovsthread_once_done(&once);
3932 /* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
3933 * rate of 'rate' bytes per second. */
3935 tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
/* 'ticks_per_s' is the kernel tc clock rate computed from /proc/net/psched. */
3938 return (rate * ticks) / ticks_per_s;
3941 /* Returns the number of ticks that it would take to transmit 'size' bytes at a
3942 * rate of 'rate' bytes per second. */
3944 tc_bytes_to_ticks(unsigned int rate, unsigned int size)
/* Guard against rate == 0 (avoid division by zero); the 64-bit cast prevents
 * overflow in ticks_per_s * size before the division. */
3947 return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
3950 /* Returns the number of bytes that need to be reserved for qdisc buffering at
3951 * a transmission rate of 'rate' bytes per second. */
3953 tc_buffer_per_jiffy(unsigned int rate)
/* 'buffer_hz' comes from /proc/net/psched (see read_psched above). */
3956 return rate / buffer_hz;
3959 /* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
3960 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
3961 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
3962 * stores NULL into it if it is absent.
3964 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
3967 * Returns 0 if successful, otherwise a positive errno value. */
3969 tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
3970 struct nlattr **options)
/* TCA_KIND is mandatory; TCA_OPTIONS may legitimately be absent. */
3972 static const struct nl_policy tca_policy[] = {
3973 [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
3974 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
3976 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
/* Attributes start after the netlink header plus the fixed tcmsg payload. */
3978 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3979 tca_policy, ta, ARRAY_SIZE(ta))) {
3980 VLOG_WARN_RL(&rl, "failed to parse qdisc message");
3985 *kind = nl_attr_get_string(ta[TCA_KIND]);
/* NULL here when TCA_OPTIONS was absent (optional in the policy above). */
3989 *options = ta[TCA_OPTIONS];
4004 /* Given Netlink 'msg' that describes a class, extracts the queue ID (e.g. the
4005 * minor number of its class ID) into '*queue_id', its TCA_OPTIONS attribute
4006 * into '*options', and its queue statistics into '*stats'. Any of the output
4007 * arguments may be null.
4009 * Returns 0 if successful, otherwise a positive errno value. */
4011 tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
4012 struct nlattr **options, struct netdev_queue_stats *stats)
4014 static const struct nl_policy tca_policy[] = {
4015 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
4016 [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
4018 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
4020 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
4021 tca_policy, ta, ARRAY_SIZE(ta))) {
4022 VLOG_WARN_RL(&rl, "failed to parse class message");
/* The class handle lives in the fixed tcmsg header, not in an attribute. */
4027 struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
4028 *handlep = tc->tcm_handle;
4032 *options = ta[TCA_OPTIONS];
4036 const struct gnet_stats_queue *gsq;
4037 struct gnet_stats_basic gsb;
4039 static const struct nl_policy stats_policy[] = {
4040 [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
4041 .min_len = sizeof gsb },
4042 [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
4043 .min_len = sizeof *gsq },
4045 struct nlattr *sa[ARRAY_SIZE(stats_policy)];
4047 if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
4048 sa, ARRAY_SIZE(sa))) {
4049 VLOG_WARN_RL(&rl, "failed to parse class stats");
4053 /* Alignment issues screw up the length of struct gnet_stats_basic on
4054 * some arch/bitsize combinations. Newer versions of Linux have a
4055 * struct gnet_stats_basic_packed, but we can't depend on that. The
4056 * easiest thing to do is just to make a copy. */
4057 memset(&gsb, 0, sizeof gsb);
4058 memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
4059 MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
4060 stats->tx_bytes = gsb.bytes;
4061 stats->tx_packets = gsb.packets;
4063 gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
4064 stats->tx_errors = gsq->drops;
/* Presumably an error path: zero the caller's stats so no garbage leaks out
 * when parsing fails -- TODO confirm against the missing lines (4065-4073). */
4074 memset(stats, 0, sizeof *stats);
4079 /* Queries the kernel for class with identifier 'handle' and parent 'parent'
4082 tc_query_class(const struct netdev *netdev,
4083 unsigned int handle, unsigned int parent,
4084 struct ofpbuf **replyp)
4086 struct ofpbuf request;
4087 struct tcmsg *tcmsg;
/* NLM_F_ECHO asks the kernel to send back the class it matched, so the
 * caller receives a reply in '*replyp'. */
4090 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO, &request);
4094 tcmsg->tcm_handle = handle;
4095 tcmsg->tcm_parent = parent;
4097 error = tc_transact(&request, replyp);
/* Log with major:minor decomposition of both handles for readability. */
4099 VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
4100 netdev_get_name(netdev),
4101 tc_get_major(handle), tc_get_minor(handle),
4102 tc_get_major(parent), tc_get_minor(parent),
4103 ovs_strerror(error));
4108 /* Equivalent to "tc class del dev <name> handle <handle>". */
4110 tc_delete_class(const struct netdev *netdev, unsigned int handle)
4112 struct ofpbuf request;
4113 struct tcmsg *tcmsg;
4116 tcmsg = tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
4120 tcmsg->tcm_handle = handle;
/* Parent 0: handle alone identifies the class to delete. */
4121 tcmsg->tcm_parent = 0;
4123 error = tc_transact(&request, NULL);
4125 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
4126 netdev_get_name(netdev),
4127 tc_get_major(handle), tc_get_minor(handle),
4128 ovs_strerror(error));
4133 /* Equivalent to "tc qdisc del dev <name> root". */
4135 tc_del_qdisc(struct netdev *netdev_)
4137 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4138 struct ofpbuf request;
4139 struct tcmsg *tcmsg;
4142 tcmsg = tc_make_request(netdev_, RTM_DELQDISC, 0, &request);
/* Handle 1:0 with parent TC_H_ROOT targets the root qdisc that OVS itself
 * installs (see the comment in tc_query_qdisc below). */
4146 tcmsg->tcm_handle = tc_make_handle(1, 0);
4147 tcmsg->tcm_parent = TC_H_ROOT;
4149 error = tc_transact(&request, NULL);
4150 if (error == EINVAL) {
4151 /* EINVAL probably means that the default qdisc was in use, in which
4152 * case we've accomplished our purpose. */
/* On success, tear down any cached tc state so it will be re-queried. */
4155 if (!error && netdev->tc) {
4156 if (netdev->tc->ops->tc_destroy) {
4157 netdev->tc->ops->tc_destroy(netdev->tc);
4164 /* If 'netdev''s qdisc type and parameters are not yet known, queries the
4165 * kernel to determine what they are. Returns 0 if successful, otherwise a
4166 * positive errno value. */
4168 tc_query_qdisc(const struct netdev *netdev_)
4170 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4171 struct ofpbuf request, *qdisc;
4172 const struct tc_ops *ops;
4173 struct tcmsg *tcmsg;
4181 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
4182 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
4183 * 2.6.35 without that fix backported to it.
4185 * To avoid the OOPS, we must not make a request that would attempt to dump
4186 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
4187 * few others. There are a few ways that I can see to do this, but most of
4188 * them seem to be racy (and if you lose the race the kernel OOPSes). The
4189 * technique chosen here is to assume that any non-default qdisc that we
4190 * create will have a class with handle 1:0. The built-in qdiscs only have
4191 * a class with handle 0:0.
4193 * We could check for Linux 2.6.35+ and use a more straightforward method
4195 tcmsg = tc_make_request(netdev_, RTM_GETQDISC, NLM_F_ECHO, &request);
4199 tcmsg->tcm_handle = tc_make_handle(1, 0);
4200 tcmsg->tcm_parent = 0;
4202 /* Figure out what tc class to instantiate. */
4203 error = tc_transact(&request, &qdisc);
4207 error = tc_parse_qdisc(qdisc, &kind, NULL);
/* Could not even parse the reply: fall back to the opaque "other" ops. */
4209 ops = &tc_ops_other;
4211 ops = tc_lookup_linux_name(kind);
/* Unrecognized qdisc kind: log once per interval, then treat as "other". */
4213 static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
4214 VLOG_INFO_RL(&rl2, "unknown qdisc \"%s\"", kind);
4216 ops = &tc_ops_other;
4219 } else if (error == ENOENT) {
4220 /* Either it's a built-in qdisc, or it's a qdisc set up by some
4221 * other entity that doesn't have a handle 1:0. We will assume
4222 * that it's the system default qdisc. */
4223 ops = &tc_ops_default;
4226 /* Who knows? Maybe the device got deleted. */
4227 VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
4228 netdev_get_name(netdev_), ovs_strerror(error));
4229 ops = &tc_ops_other;
4232 /* Instantiate it. */
/* tc_load must set netdev->tc exactly when it succeeds; assert the
 * invariant rather than trusting each implementation. */
4233 load_error = ops->tc_load(CONST_CAST(struct netdev *, netdev_), qdisc);
4234 ovs_assert((load_error == 0) == (netdev->tc != NULL));
4235 ofpbuf_delete(qdisc);
/* Report the transaction error in preference to the load error. */
4237 return error ? error : load_error;
4240 /* Linux traffic control uses tables with 256 entries ("rtab" tables) to
4241 approximate the time to transmit packets of various lengths. For an MTU of
4242 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
4243 represents two possible packet lengths; for a MTU of 513 through 1024, four
4244 possible lengths; and so on.
4246 Returns, for the specified 'mtu', the number of bits that packet lengths
4247 need to be shifted right to fit within such a 256-entry table. */
4249 tc_calc_cell_log(unsigned int mtu)
/* Presumably a default when mtu == 0 -- the guarding 'if' is among the
 * missing lines (4250-4253); TODO confirm against upstream. */
4254 mtu = ETH_PAYLOAD_MAX;
/* Account for L2 framing: Ethernet header plus one VLAN tag. */
4256 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
/* Find the smallest shift that maps the largest frame into 256 buckets. */
4258 for (cell_log = 0; mtu >= 256; cell_log++) {
4265 /* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
4268 tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
4270 memset(rate, 0, sizeof *rate);
4271 rate->cell_log = tc_calc_cell_log(mtu);
4272 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
4273 /* rate->cell_align = 0; */ /* distro headers. */
/* Minimum packet unit: no frame is billed below the Ethernet minimum size. */
4274 rate->mpu = ETH_TOTAL_MIN;
4278 /* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
4279 * attribute of the specified "type".
4281 * See tc_calc_cell_log() above for a description of "rtab"s. */
4283 tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
4288 rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
/* One entry per "cell": bucket i covers packets up to (i+1) << cell_log
 * bytes, clamped below by the minimum packet unit. */
4289 for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
4290 unsigned packet_size = (i + 1) << rate->cell_log;
4291 if (packet_size < rate->mpu) {
4292 packet_size = rate->mpu;
/* Each entry is the transmit time, in tc ticks, for that packet size. */
4294 rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
4298 /* Calculates the proper value of 'buffer' or 'cbuffer' in HTB options given a
4299 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
4300 * burst size of 'burst_bytes'. (If no value was requested, a 'burst_bytes' of
4303 tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
/* The burst must at least cover one jiffy's worth of traffic plus one MTU,
 * or the rate cannot be sustained between scheduler ticks. */
4305 unsigned int min_burst = tc_buffer_per_jiffy(Bps) + mtu;
4306 return tc_bytes_to_ticks(Bps, MAX(burst_bytes, min_burst));
4309 /* Linux-only functions declared in netdev-linux.h */
4311 /* Modifies the 'flag' bit in ethtool's flags field for 'netdev'. If
4312 * 'enable' is true, the bit is set. Otherwise, it is cleared. */
4314 netdev_linux_ethtool_set_flag(struct netdev *netdev, uint32_t flag,
4315 const char *flag_name, bool enable)
4317 const char *netdev_name = netdev_get_name(netdev);
4318 struct ethtool_value evalue;
/* Step 1: read the current flags word (ETHTOOL_GFLAGS). */
4322 COVERAGE_INC(netdev_get_ethtool);
4323 memset(&evalue, 0, sizeof evalue);
4324 error = netdev_linux_do_ethtool(netdev_name,
4325 (struct ethtool_cmd *)&evalue,
4326 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
/* Step 2: write the flags back with 'flag' set or cleared as requested. */
4331 COVERAGE_INC(netdev_set_ethtool);
4332 evalue.data = new_flags = (evalue.data & ~flag) | (enable ? flag : 0);
4333 error = netdev_linux_do_ethtool(netdev_name,
4334 (struct ethtool_cmd *)&evalue,
4335 ETHTOOL_SFLAGS, "ETHTOOL_SFLAGS");
/* Step 3: read the flags again, since some drivers accept ETHTOOL_SFLAGS
 * without actually applying the change; verify below. */
4340 COVERAGE_INC(netdev_get_ethtool);
4341 memset(&evalue, 0, sizeof evalue);
4342 error = netdev_linux_do_ethtool(netdev_name,
4343 (struct ethtool_cmd *)&evalue,
4344 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
4349 if (new_flags != evalue.data) {
4350 VLOG_WARN_RL(&rl, "attempt to %s ethtool %s flag on network "
4351 "device %s failed", enable ? "enable" : "disable",
4352 flag_name, netdev_name);
4359 /* Utility functions. */
4361 /* Copies 'src' into 'dst', performing format conversion in the process. */
/* Straight field-for-field widening copy from the kernel's 32-bit
 * struct rtnl_link_stats into OVS's 64-bit struct netdev_stats. */
4363 netdev_stats_from_rtnl_link_stats(struct netdev_stats *dst,
4364 const struct rtnl_link_stats *src)
4366 dst->rx_packets = src->rx_packets;
4367 dst->tx_packets = src->tx_packets;
4368 dst->rx_bytes = src->rx_bytes;
4369 dst->tx_bytes = src->tx_bytes;
4370 dst->rx_errors = src->rx_errors;
4371 dst->tx_errors = src->tx_errors;
4372 dst->rx_dropped = src->rx_dropped;
4373 dst->tx_dropped = src->tx_dropped;
4374 dst->multicast = src->multicast;
4375 dst->collisions = src->collisions;
4376 dst->rx_length_errors = src->rx_length_errors;
4377 dst->rx_over_errors = src->rx_over_errors;
4378 dst->rx_crc_errors = src->rx_crc_errors;
4379 dst->rx_frame_errors = src->rx_frame_errors;
4380 dst->rx_fifo_errors = src->rx_fifo_errors;
4381 dst->rx_missed_errors = src->rx_missed_errors;
4382 dst->tx_aborted_errors = src->tx_aborted_errors;
4383 dst->tx_carrier_errors = src->tx_carrier_errors;
4384 dst->tx_fifo_errors = src->tx_fifo_errors;
4385 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
4386 dst->tx_window_errors = src->tx_window_errors;
/* Fetches interface statistics for 'ifindex' via an RTM_GETLINK netlink
 * request, converting the kernel's rtnl_link_stats into '*stats'. */
4390 get_stats_via_netlink(int ifindex, struct netdev_stats *stats)
4392 /* Policy for RTNLGRP_LINK messages.
4394 * There are *many* more fields in these messages, but currently we only
4395 * care about these fields. */
4396 static const struct nl_policy rtnlgrp_link_policy[] = {
4397 [IFLA_IFNAME] = { .type = NL_A_STRING, .optional = false },
4398 [IFLA_STATS] = { .type = NL_A_UNSPEC, .optional = true,
4399 .min_len = sizeof(struct rtnl_link_stats) },
4402 struct ofpbuf request;
4403 struct ofpbuf *reply;
4404 struct ifinfomsg *ifi;
4405 struct nlattr *attrs[ARRAY_SIZE(rtnlgrp_link_policy)];
/* Build and send the RTM_GETLINK request for just this ifindex. */
4408 ofpbuf_init(&request, 0);
4409 nl_msg_put_nlmsghdr(&request, sizeof *ifi, RTM_GETLINK, NLM_F_REQUEST);
4410 ifi = ofpbuf_put_zeros(&request, sizeof *ifi);
4411 ifi->ifi_family = PF_UNSPEC;
4412 ifi->ifi_index = ifindex;
4413 error = nl_transact(NETLINK_ROUTE, &request, &reply);
4414 ofpbuf_uninit(&request);
4419 if (!nl_policy_parse(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg),
4420 rtnlgrp_link_policy,
4421 attrs, ARRAY_SIZE(rtnlgrp_link_policy))) {
4422 ofpbuf_delete(reply);
/* IFLA_STATS is optional in the policy, so its absence must be handled. */
4426 if (!attrs[IFLA_STATS]) {
4427 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
4428 ofpbuf_delete(reply);
4432 netdev_stats_from_rtnl_link_stats(stats, nl_attr_get(attrs[IFLA_STATS]));
4434 ofpbuf_delete(reply);
/* Fallback statistics source: parses the per-device line of /proc/net/dev
 * for 'netdev_name' into '*stats'.  /proc/net/dev exposes fewer counters
 * than rtnetlink, so the unavailable ones are set to UINT64_MAX below. */
4440 get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats)
4442 static const char fn[] = "/proc/net/dev";
4447 stream = fopen(fn, "r");
4449 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, ovs_strerror(errno));
4454 while (fgets(line, sizeof line, stream)) {
4457 #define X64 "%"SCNu64
/* The %*u fields skip counters /proc/net/dev reports but OVS discards. */
4460 X64 X64 X64 X64 X64 X64 X64 "%*u"
4461 X64 X64 X64 X64 X64 X64 X64 "%*u",
4467 &stats->rx_fifo_errors,
4468 &stats->rx_frame_errors,
4474 &stats->tx_fifo_errors,
4476 &stats->tx_carrier_errors) != 15) {
4477 VLOG_WARN_RL(&rl, "%s:%d: parse error", fn, ln);
4478 } else if (!strcmp(devname, netdev_name)) {
/* UINT64_MAX marks counters that /proc/net/dev does not provide. */
4479 stats->rx_length_errors = UINT64_MAX;
4480 stats->rx_over_errors = UINT64_MAX;
4481 stats->rx_crc_errors = UINT64_MAX;
4482 stats->rx_missed_errors = UINT64_MAX;
4483 stats->tx_aborted_errors = UINT64_MAX;
4484 stats->tx_heartbeat_errors = UINT64_MAX;
4485 stats->tx_window_errors = UINT64_MAX;
4491 VLOG_WARN_RL(&rl, "%s: no stats for %s", fn, netdev_name);
/* Retrieves the IFF_* flags of network device 'dev' into '*flags' via the
 * SIOCGIFFLAGS ioctl.  Returns 0 or a positive errno value. */
4497 get_flags(const struct netdev *dev, unsigned int *flags)
4503 error = af_inet_ifreq_ioctl(dev->name, &ifr, SIOCGIFFLAGS, "SIOCGIFFLAGS");
4505 *flags = ifr.ifr_flags;
/* Sets the IFF_* flags of the network device named 'name' to 'flags' via
 * the SIOCSIFFLAGS ioctl.  Returns 0 or a positive errno value. */
4511 set_flags(const char *name, unsigned int flags)
4515 ifr.ifr_flags = flags;
4516 return af_inet_ifreq_ioctl(name, &ifr, SIOCSIFFLAGS, "SIOCSIFFLAGS");
/* Looks up the kernel interface index for 'netdev_name' with SIOCGIFINDEX.
 * Returns the (positive) ifindex on success; on failure, per get_ifindex()
 * below, the return appears to encode a negated errno -- logged here. */
4520 do_get_ifindex(const char *netdev_name)
4525 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4526 COVERAGE_INC(netdev_get_ifindex);
4528 error = af_inet_ioctl(SIOCGIFINDEX, &ifr);
4530 VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
4531 netdev_name, ovs_strerror(error));
4534 return ifr.ifr_ifindex;
/* Returns via '*ifindexp' the cached ifindex of 'netdev_', querying the
 * kernel on the first call and caching both the value and any error under
 * the VALID_IFINDEX bit so repeated lookups stay cheap. */
4538 get_ifindex(const struct netdev *netdev_, int *ifindexp)
4540 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4542 if (!(netdev->cache_valid & VALID_IFINDEX)) {
4543 int ifindex = do_get_ifindex(netdev_get_name(netdev_));
/* A negative result from do_get_ifindex() is a negated errno value. */
4546 netdev->get_ifindex_error = -ifindex;
4547 netdev->ifindex = 0;
4549 netdev->get_ifindex_error = 0;
4550 netdev->ifindex = ifindex;
/* Cache both success and failure so the ioctl is not retried every call. */
4552 netdev->cache_valid |= VALID_IFINDEX;
4555 *ifindexp = netdev->ifindex;
4556 return netdev->get_ifindex_error;
/* Reads the Ethernet hardware address of 'netdev_name' into 'ea' using the
 * SIOCGIFHWADDR ioctl.  Returns 0 or a positive errno value. */
4560 get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN])
4566 memset(&ifr, 0, sizeof ifr);
4567 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4568 COVERAGE_INC(netdev_get_hwaddr);
4569 error = af_inet_ioctl(SIOCGIFHWADDR, &ifr);
4571 /* ENODEV probably means that a vif disappeared asynchronously and
4572 * hasn't been removed from the database yet, so reduce the log level
4573 * to INFO for that case. */
4574 VLOG(error == ENODEV ? VLL_INFO : VLL_ERR,
4575 "ioctl(SIOCGIFHWADDR) on %s device failed: %s",
4576 netdev_name, ovs_strerror(error));
/* Only AF_UNSPEC and ARPHRD_ETHER families are treated as Ethernet. */
4579 hwaddr_family = ifr.ifr_hwaddr.sa_family;
4580 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER) {
4581 VLOG_WARN("%s device has unknown hardware address family %d",
4582 netdev_name, hwaddr_family);
4584 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
/* Sets the Ethernet hardware address of 'netdev_name' to 'mac' using the
 * SIOCSIFHWADDR ioctl.  Returns 0 or a positive errno value. */
4589 set_etheraddr(const char *netdev_name,
4590 const uint8_t mac[ETH_ADDR_LEN])
4595 memset(&ifr, 0, sizeof ifr);
4596 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4597 ifr.ifr_hwaddr.sa_family = ARPHRD_ETHER;
4598 memcpy(ifr.ifr_hwaddr.sa_data, mac, ETH_ADDR_LEN);
4599 COVERAGE_INC(netdev_set_hwaddr);
4600 error = af_inet_ioctl(SIOCSIFHWADDR, &ifr);
4602 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
4603 netdev_name, ovs_strerror(error));
/* Issues ethtool command 'cmd' (named 'cmd_name' for logging) on device
 * 'name', with 'ecmd' as the command-specific in/out buffer, via the
 * SIOCETHTOOL ioctl.  Returns 0 or a positive errno value. */
4609 netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
4610 int cmd, const char *cmd_name)
4615 memset(&ifr, 0, sizeof ifr);
4616 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
/* The kernel reads/writes the ethtool command struct through ifr_data. */
4617 ifr.ifr_data = (caddr_t) ecmd;
4620 error = af_inet_ioctl(SIOCETHTOOL, &ifr);
4622 if (error != EOPNOTSUPP) {
4623 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
4624 "failed: %s", cmd_name, name, ovs_strerror(error));
4626 /* The device doesn't support this operation. That's pretty
4627 * common, so there's no point in logging anything. */
/* Runs the IPv4-address-fetching ioctl 'cmd' (named 'cmd_name' for logging)
 * on 'netdev' and, on success, stores the resulting address in '*ip'. */
4634 netdev_linux_get_ipv4(const struct netdev *netdev, struct in_addr *ip,
4635 int cmd, const char *cmd_name)
4640 ifr.ifr_addr.sa_family = AF_INET;
4641 error = af_inet_ifreq_ioctl(netdev_get_name(netdev), &ifr, cmd, cmd_name);
/* ALIGNED_CAST avoids undefined behavior from the sockaddr -> sockaddr_in
 * reinterpretation inside struct ifreq. */
4643 const struct sockaddr_in *sin = ALIGNED_CAST(struct sockaddr_in *,
4645 *ip = sin->sin_addr;
4650 /* Returns an AF_PACKET raw socket or a negative errno value. */
4652 af_packet_sock(void)
4654 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
4657 if (ovsthread_once_start(&once)) {
4658 sock = socket(AF_PACKET, SOCK_RAW, 0);
4660 int error = set_nonblocking(sock);
4667 VLOG_ERR("failed to create packet socket: %s",
4668 ovs_strerror(errno));
4670 ovsthread_once_done(&once);