2 * Copyright (c) 2009, 2010, 2011, 2012, 2013 Nicira, Inc.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
19 #include "netdev-linux.h"
23 #include <arpa/inet.h>
25 #include <linux/filter.h>
26 #include <linux/gen_stats.h>
27 #include <linux/if_ether.h>
28 #include <linux/if_tun.h>
29 #include <linux/types.h>
30 #include <linux/ethtool.h>
31 #include <linux/mii.h>
32 #include <linux/pkt_cls.h>
33 #include <linux/pkt_sched.h>
34 #include <linux/rtnetlink.h>
35 #include <linux/sockios.h>
36 #include <linux/version.h>
37 #include <sys/types.h>
38 #include <sys/ioctl.h>
39 #include <sys/socket.h>
40 #include <netpacket/packet.h>
42 #include <net/if_arp.h>
43 #include <net/if_packet.h>
44 #include <net/route.h>
45 #include <netinet/in.h>
52 #include "dpif-linux.h"
53 #include "dynamic-string.h"
54 #include "fatal-signal.h"
57 #include "netdev-provider.h"
58 #include "netdev-vport.h"
59 #include "netlink-notifier.h"
60 #include "netlink-socket.h"
63 #include "openflow/openflow.h"
65 #include "poll-loop.h"
66 #include "rtnetlink-link.h"
68 #include "socket-util.h"
71 #include "unaligned.h"
74 VLOG_DEFINE_THIS_MODULE(netdev_linux);
76 COVERAGE_DEFINE(netdev_set_policing);
77 COVERAGE_DEFINE(netdev_arp_lookup);
78 COVERAGE_DEFINE(netdev_get_ifindex);
79 COVERAGE_DEFINE(netdev_get_hwaddr);
80 COVERAGE_DEFINE(netdev_set_hwaddr);
81 COVERAGE_DEFINE(netdev_get_ethtool);
82 COVERAGE_DEFINE(netdev_set_ethtool);
/* These were introduced in Linux 2.6.14, so they might be missing if we have
 * old headers. */
87 #ifndef ADVERTISED_Pause
88 #define ADVERTISED_Pause (1 << 13)
90 #ifndef ADVERTISED_Asym_Pause
91 #define ADVERTISED_Asym_Pause (1 << 14)
94 /* These were introduced in Linux 2.6.24, so they might be missing if we
95 * have old headers. */
96 #ifndef ETHTOOL_GFLAGS
97 #define ETHTOOL_GFLAGS 0x00000025 /* Get flags bitmap(ethtool_value) */
99 #ifndef ETHTOOL_SFLAGS
100 #define ETHTOOL_SFLAGS 0x00000026 /* Set flags bitmap(ethtool_value) */
/* This was introduced in Linux 2.6.25, so it might be missing if we have old
 * headers. */
106 #define TC_RTAB_SIZE 1024
109 static struct nln_notifier *netdev_linux_cache_notifier = NULL;
110 static int cache_notifier_refcount;
113 VALID_IFINDEX = 1 << 0,
114 VALID_ETHERADDR = 1 << 1,
118 VALID_POLICING = 1 << 5,
119 VALID_VPORT_STAT_ERROR = 1 << 6,
120 VALID_DRVINFO = 1 << 7,
121 VALID_FEATURES = 1 << 8,
124 /* Traffic control. */
/* An instance of a traffic control class.  Always associated with a particular
 * network device.
 *
 * Each TC implementation subclasses this with whatever additional data it
 * needs. */
132 const struct tc_ops *ops;
133 struct hmap queues; /* Contains "struct tc_queue"s.
134 * Read by generic TC layer.
135 * Written only by TC implementation. */
138 #define TC_INITIALIZER(TC, OPS) { OPS, HMAP_INITIALIZER(&(TC)->queues) }
/* One traffic control queue.
 *
 * Each TC implementation subclasses this with whatever additional data it
 * needs. */
145 struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
146 unsigned int queue_id; /* OpenFlow queue ID. */
147 long long int created; /* Time queue was created, in msecs. */
150 /* A particular kind of traffic control. Each implementation generally maps to
151 * one particular Linux qdisc class.
153 * The functions below return 0 if successful or a positive errno value on
154 * failure, except where otherwise noted. All of them must be provided, except
155 * where otherwise noted. */
157 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
158 * This is null for tc_ops_default and tc_ops_other, for which there are no
159 * appropriate values. */
160 const char *linux_name;
162 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
163 const char *ovs_name;
165 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
166 * queues. The queues are numbered 0 through n_queues - 1. */
167 unsigned int n_queues;
169 /* Called to install this TC class on 'netdev'. The implementation should
170 * make the Netlink calls required to set up 'netdev' with the right qdisc
171 * and configure it according to 'details'. The implementation may assume
172 * that the current qdisc is the default; that is, there is no need for it
173 * to delete the current qdisc before installing itself.
175 * The contents of 'details' should be documented as valid for 'ovs_name'
176 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
177 * (which is built as ovs-vswitchd.conf.db(8)).
179 * This function must return 0 if and only if it sets 'netdev->tc' to an
180 * initialized 'struct tc'.
182 * (This function is null for tc_ops_other, which cannot be installed. For
183 * other TC classes it should always be nonnull.) */
184 int (*tc_install)(struct netdev *netdev, const struct smap *details);
186 /* Called when the netdev code determines (through a Netlink query) that
187 * this TC class's qdisc is installed on 'netdev', but we didn't install
188 * it ourselves and so don't know any of the details.
190 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
191 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
192 * implementation should parse the other attributes of 'nlmsg' as
193 * necessary to determine its configuration. If necessary it should also
 * use Netlink queries to determine the configuration of queues on
 * 'netdev'.
 *
197 * This function must return 0 if and only if it sets 'netdev->tc' to an
198 * initialized 'struct tc'. */
199 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
201 /* Destroys the data structures allocated by the implementation as part of
 * 'tc'.  (This includes destroying 'tc->queues' by calling
 * tc_destroy(tc).)
 *
205 * The implementation should not need to perform any Netlink calls. If
206 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
207 * (But it may not be desirable.)
209 * This function may be null if 'tc' is trivial. */
210 void (*tc_destroy)(struct tc *tc);
212 /* Retrieves details of 'netdev->tc' configuration into 'details'.
214 * The implementation should not need to perform any Netlink calls, because
215 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
216 * cached the configuration.
218 * The contents of 'details' should be documented as valid for 'ovs_name'
219 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
220 * (which is built as ovs-vswitchd.conf.db(8)).
 * This function may be null if 'tc' is not configurable. */
224 int (*qdisc_get)(const struct netdev *netdev, struct smap *details);
226 /* Reconfigures 'netdev->tc' according to 'details', performing any
227 * required Netlink calls to complete the reconfiguration.
229 * The contents of 'details' should be documented as valid for 'ovs_name'
230 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
231 * (which is built as ovs-vswitchd.conf.db(8)).
 * This function may be null if 'tc' is not configurable. */
235 int (*qdisc_set)(struct netdev *, const struct smap *details);
237 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
238 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
240 * The contents of 'details' should be documented as valid for 'ovs_name'
241 * in the "other_config" column in the "Queue" table in
242 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
244 * The implementation should not need to perform any Netlink calls, because
245 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
246 * cached the queue configuration.
 * This function may be null if 'tc' does not have queues ('n_queues' is
 * 0). */
250 int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
251 struct smap *details);
/* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
 * 'details', performing any required Netlink calls to complete the
 * reconfiguration.  The caller ensures that 'queue_id' is less than
 * 'n_queues'.
 *
258 * The contents of 'details' should be documented as valid for 'ovs_name'
259 * in the "other_config" column in the "Queue" table in
260 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
262 * This function may be null if 'tc' does not have queues or its queues are
263 * not configurable. */
264 int (*class_set)(struct netdev *, unsigned int queue_id,
265 const struct smap *details);
267 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
268 * tc_queue's within 'netdev->tc->queues'.
270 * This function may be null if 'tc' does not have queues or its queues
271 * cannot be deleted. */
272 int (*class_delete)(struct netdev *, struct tc_queue *queue);
274 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
275 * 'struct tc_queue's within 'netdev->tc->queues'.
277 * On success, initializes '*stats'.
279 * This function may be null if 'tc' does not have queues or if it cannot
280 * report queue statistics. */
281 int (*class_get_stats)(const struct netdev *netdev,
282 const struct tc_queue *queue,
283 struct netdev_queue_stats *stats);
285 /* Extracts queue stats from 'nlmsg', which is a response to a
286 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
288 * This function may be null if 'tc' does not have queues or if it cannot
289 * report queue statistics. */
290 int (*class_dump_stats)(const struct netdev *netdev,
291 const struct ofpbuf *nlmsg,
292 netdev_dump_queue_stats_cb *cb, void *aux);
296 tc_init(struct tc *tc, const struct tc_ops *ops)
299 hmap_init(&tc->queues);
303 tc_destroy(struct tc *tc)
305 hmap_destroy(&tc->queues);
308 static const struct tc_ops tc_ops_htb;
309 static const struct tc_ops tc_ops_hfsc;
310 static const struct tc_ops tc_ops_default;
311 static const struct tc_ops tc_ops_other;
313 static const struct tc_ops *const tcs[] = {
314 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
315 &tc_ops_hfsc, /* Hierarchical fair service curve. */
316 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
317 &tc_ops_other, /* Some other qdisc. */
321 static unsigned int tc_make_handle(unsigned int major, unsigned int minor);
322 static unsigned int tc_get_major(unsigned int handle);
323 static unsigned int tc_get_minor(unsigned int handle);
325 static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
326 static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
327 static unsigned int tc_buffer_per_jiffy(unsigned int rate);
329 static struct tcmsg *tc_make_request(const struct netdev *, int type,
330 unsigned int flags, struct ofpbuf *);
331 static int tc_transact(struct ofpbuf *request, struct ofpbuf **replyp);
332 static int tc_add_del_ingress_qdisc(struct netdev *netdev, bool add);
333 static int tc_add_policer(struct netdev *netdev, int kbits_rate,
336 static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
337 struct nlattr **options);
338 static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
339 struct nlattr **options,
340 struct netdev_queue_stats *);
341 static int tc_query_class(const struct netdev *,
342 unsigned int handle, unsigned int parent,
343 struct ofpbuf **replyp);
344 static int tc_delete_class(const struct netdev *, unsigned int handle);
346 static int tc_del_qdisc(struct netdev *netdev);
347 static int tc_query_qdisc(const struct netdev *netdev);
349 static int tc_calc_cell_log(unsigned int mtu);
350 static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
351 static void tc_put_rtab(struct ofpbuf *, uint16_t type,
352 const struct tc_ratespec *rate);
353 static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
355 struct netdev_linux {
358 struct shash_node *shash_node;
359 unsigned int cache_valid;
360 unsigned int change_seq;
362 bool miimon; /* Link status of last poll. */
363 long long int miimon_interval; /* Miimon Poll rate. Disabled if <= 0. */
364 struct timer miimon_timer;
366 /* The following are figured out "on demand" only. They are only valid
367 * when the corresponding VALID_* bit in 'cache_valid' is set. */
369 uint8_t etheraddr[ETH_ADDR_LEN];
370 struct in_addr address, netmask;
373 unsigned int ifi_flags;
374 long long int carrier_resets;
375 uint32_t kbits_rate; /* Policing data. */
376 uint32_t kbits_burst;
377 int vport_stats_error; /* Cached error code from vport_get_stats().
378 0 or an errno value. */
379 int netdev_mtu_error; /* Cached error code from SIOCGIFMTU or SIOCSIFMTU. */
380 int ether_addr_error; /* Cached error code from set/get etheraddr. */
381 int netdev_policing_error; /* Cached error code from set policing. */
382 int get_features_error; /* Cached error code from ETHTOOL_GSET. */
383 int get_ifindex_error; /* Cached error code from SIOCGIFINDEX. */
385 enum netdev_features current; /* Cached from ETHTOOL_GSET. */
386 enum netdev_features advertised; /* Cached from ETHTOOL_GSET. */
387 enum netdev_features supported; /* Cached from ETHTOOL_GSET. */
388 enum netdev_features peer; /* Cached from ETHTOOL_GSET. */
390 struct ethtool_drvinfo drvinfo; /* Cached from ETHTOOL_GDRVINFO. */
393 /* For devices of class netdev_tap_class only. */
397 struct netdev_rx_linux {
403 static const struct netdev_rx_class netdev_rx_linux_class;
405 /* Sockets used for ioctl operations. */
406 static int af_inet_sock = -1; /* AF_INET, SOCK_DGRAM. */
408 /* This is set pretty low because we probably won't learn anything from the
409 * additional log messages. */
410 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
412 static int netdev_linux_init(void);
414 static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
415 int cmd, const char *cmd_name);
416 static int netdev_linux_do_ioctl(const char *name, struct ifreq *, int cmd,
417 const char *cmd_name);
418 static int netdev_linux_get_ipv4(const struct netdev *, struct in_addr *,
419 int cmd, const char *cmd_name);
420 static int get_flags(const struct netdev *, unsigned int *flags);
421 static int set_flags(const char *, unsigned int flags);
422 static int do_get_ifindex(const char *netdev_name);
423 static int get_ifindex(const struct netdev *, int *ifindexp);
424 static int do_set_addr(struct netdev *netdev,
425 int ioctl_nr, const char *ioctl_name,
426 struct in_addr addr);
427 static int get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN]);
428 static int set_etheraddr(const char *netdev_name, const uint8_t[ETH_ADDR_LEN]);
429 static int get_stats_via_netlink(int ifindex, struct netdev_stats *stats);
430 static int get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats);
431 static int af_packet_sock(void);
432 static void netdev_linux_miimon_run(void);
433 static void netdev_linux_miimon_wait(void);
436 is_netdev_linux_class(const struct netdev_class *netdev_class)
438 return netdev_class->init == netdev_linux_init;
442 is_tap_netdev(const struct netdev *netdev)
444 return netdev_get_class(netdev) == &netdev_tap_class;
447 static struct netdev_linux *
448 netdev_linux_cast(const struct netdev *netdev)
450 ovs_assert(is_netdev_linux_class(netdev_get_class(netdev)));
452 return CONTAINER_OF(netdev, struct netdev_linux, up);
455 static struct netdev_rx_linux *
456 netdev_rx_linux_cast(const struct netdev_rx *rx)
458 netdev_rx_assert_class(rx, &netdev_rx_linux_class);
459 return CONTAINER_OF(rx, struct netdev_rx_linux, up);
463 netdev_linux_init(void)
465 static int status = -1;
467 /* Create AF_INET socket. */
468 af_inet_sock = socket(AF_INET, SOCK_DGRAM, 0);
469 status = af_inet_sock >= 0 ? 0 : errno;
471 VLOG_ERR("failed to create inet socket: %s", ovs_strerror(status));
478 netdev_linux_run(void)
480 rtnetlink_link_run();
481 netdev_linux_miimon_run();
485 netdev_linux_wait(void)
487 rtnetlink_link_wait();
488 netdev_linux_miimon_wait();
492 netdev_linux_changed(struct netdev_linux *dev,
493 unsigned int ifi_flags, unsigned int mask)
496 if (!dev->change_seq) {
500 if ((dev->ifi_flags ^ ifi_flags) & IFF_RUNNING) {
501 dev->carrier_resets++;
503 dev->ifi_flags = ifi_flags;
505 dev->cache_valid &= mask;
509 netdev_linux_update(struct netdev_linux *dev,
510 const struct rtnetlink_link_change *change)
512 if (change->nlmsg_type == RTM_NEWLINK) {
514 netdev_linux_changed(dev, change->ifi_flags, VALID_DRVINFO);
516 /* Update netdev from rtnl-change msg. */
518 dev->mtu = change->mtu;
519 dev->cache_valid |= VALID_MTU;
520 dev->netdev_mtu_error = 0;
523 if (!eth_addr_is_zero(change->addr)) {
524 memcpy(dev->etheraddr, change->addr, ETH_ADDR_LEN);
525 dev->cache_valid |= VALID_ETHERADDR;
526 dev->ether_addr_error = 0;
529 dev->ifindex = change->ifi_index;
530 dev->cache_valid |= VALID_IFINDEX;
531 dev->get_ifindex_error = 0;
534 netdev_linux_changed(dev, change->ifi_flags, 0);
539 netdev_linux_cache_cb(const struct rtnetlink_link_change *change,
540 void *aux OVS_UNUSED)
542 struct netdev_linux *dev;
544 struct netdev *base_dev = netdev_from_name(change->ifname);
545 if (base_dev && is_netdev_linux_class(netdev_get_class(base_dev))) {
546 netdev_linux_update(netdev_linux_cast(base_dev), change);
549 struct shash device_shash;
550 struct shash_node *node;
552 shash_init(&device_shash);
553 netdev_get_devices(&netdev_linux_class, &device_shash);
554 SHASH_FOR_EACH (node, &device_shash) {
555 struct netdev *netdev = node->data;
558 dev = netdev_linux_cast(netdev);
560 get_flags(&dev->up, &flags);
561 netdev_linux_changed(dev, flags, 0);
563 shash_destroy(&device_shash);
568 cache_notifier_ref(void)
570 if (!cache_notifier_refcount) {
571 ovs_assert(!netdev_linux_cache_notifier);
573 netdev_linux_cache_notifier =
574 rtnetlink_link_notifier_create(netdev_linux_cache_cb, NULL);
576 if (!netdev_linux_cache_notifier) {
580 cache_notifier_refcount++;
586 cache_notifier_unref(void)
588 ovs_assert(cache_notifier_refcount > 0);
589 if (!--cache_notifier_refcount) {
590 ovs_assert(netdev_linux_cache_notifier);
591 rtnetlink_link_notifier_destroy(netdev_linux_cache_notifier);
592 netdev_linux_cache_notifier = NULL;
596 /* Creates system and internal devices. */
598 netdev_linux_create(const struct netdev_class *class, const char *name,
599 struct netdev **netdevp)
601 struct netdev_linux *netdev;
604 error = cache_notifier_ref();
609 netdev = xzalloc(sizeof *netdev);
610 netdev->change_seq = 1;
611 netdev_init(&netdev->up, name, class);
612 error = get_flags(&netdev->up, &netdev->ifi_flags);
613 if (error == ENODEV) {
614 if (class != &netdev_internal_class) {
615 /* The device does not exist, so don't allow it to be opened. */
616 netdev_uninit(&netdev->up, false);
617 cache_notifier_unref();
621 /* "Internal" netdevs have to be created as netdev objects before
622 * they exist in the kernel, because creating them in the kernel
623 * happens by passing a netdev object to dpif_port_add().
624 * Therefore, ignore the error. */
628 *netdevp = &netdev->up;
632 /* For most types of netdevs we open the device for each call of
633 * netdev_open(). However, this is not the case with tap devices,
634 * since it is only possible to open the device once. In this
635 * situation we share a single file descriptor, and consequently
636 * buffers, across all readers. Therefore once data is read it will
637 * be unavailable to other reads for tap devices. */
639 netdev_linux_create_tap(const struct netdev_class *class OVS_UNUSED,
640 const char *name, struct netdev **netdevp)
642 struct netdev_linux *netdev;
643 static const char tap_dev[] = "/dev/net/tun";
647 netdev = xzalloc(sizeof *netdev);
648 netdev->change_seq = 1;
650 error = cache_notifier_ref();
655 /* Open tap device. */
656 netdev->tap_fd = open(tap_dev, O_RDWR);
657 if (netdev->tap_fd < 0) {
659 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, ovs_strerror(error));
660 goto error_unref_notifier;
663 /* Create tap device. */
664 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
665 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
666 if (ioctl(netdev->tap_fd, TUNSETIFF, &ifr) == -1) {
667 VLOG_WARN("%s: creating tap device failed: %s", name,
668 ovs_strerror(errno));
673 /* Make non-blocking. */
674 error = set_nonblocking(netdev->tap_fd);
679 netdev_init(&netdev->up, name, &netdev_tap_class);
680 *netdevp = &netdev->up;
684 close(netdev->tap_fd);
685 error_unref_notifier:
686 cache_notifier_unref();
693 netdev_linux_destroy(struct netdev *netdev_)
695 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
697 if (netdev->tc && netdev->tc->ops->tc_destroy) {
698 netdev->tc->ops->tc_destroy(netdev->tc);
701 if (netdev_get_class(netdev_) == &netdev_tap_class
702 && netdev->tap_fd >= 0)
704 close(netdev->tap_fd);
708 cache_notifier_unref();
712 netdev_linux_rx_open(struct netdev *netdev_, struct netdev_rx **rxp)
714 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
715 bool is_tap = is_tap_netdev(netdev_);
716 struct netdev_rx_linux *rx;
723 struct sockaddr_ll sll;
725 /* Result of tcpdump -dd inbound */
726 static struct sock_filter filt[] = {
727 { 0x28, 0, 0, 0xfffff004 }, /* ldh [0] */
728 { 0x15, 0, 1, 0x00000004 }, /* jeq #4 jt 2 jf 3 */
729 { 0x6, 0, 0, 0x00000000 }, /* ret #0 */
730 { 0x6, 0, 0, 0x0000ffff } /* ret #65535 */
732 static struct sock_fprog fprog = { ARRAY_SIZE(filt), filt };
734 /* Create file descriptor. */
735 fd = socket(PF_PACKET, SOCK_RAW, 0);
738 VLOG_ERR("failed to create raw socket (%s)", ovs_strerror(error));
742 /* Set non-blocking mode. */
743 error = set_nonblocking(fd);
748 /* Get ethernet device index. */
749 error = get_ifindex(&netdev->up, &ifindex);
754 /* Bind to specific ethernet device. */
755 memset(&sll, 0, sizeof sll);
756 sll.sll_family = AF_PACKET;
757 sll.sll_ifindex = ifindex;
758 sll.sll_protocol = (OVS_FORCE unsigned short int) htons(ETH_P_ALL);
759 if (bind(fd, (struct sockaddr *) &sll, sizeof sll) < 0) {
761 VLOG_ERR("%s: failed to bind raw socket (%s)",
762 netdev_get_name(netdev_), ovs_strerror(error));
766 /* Filter for only inbound packets. */
767 error = setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER, &fprog,
771 VLOG_ERR("%s: failed attach filter (%s)",
772 netdev_get_name(netdev_), ovs_strerror(error));
777 rx = xmalloc(sizeof *rx);
778 netdev_rx_init(&rx->up, netdev_, &netdev_rx_linux_class);
793 netdev_rx_linux_destroy(struct netdev_rx *rx_)
795 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
804 netdev_rx_linux_recv(struct netdev_rx *rx_, void *data, size_t size)
806 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
811 ? read(rx->fd, data, size)
812 : recv(rx->fd, data, size, MSG_TRUNC));
813 } while (retval < 0 && errno == EINTR);
816 return retval > size ? -EMSGSIZE : retval;
818 if (errno != EAGAIN) {
819 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
820 ovs_strerror(errno), netdev_rx_get_name(rx_));
827 netdev_rx_linux_wait(struct netdev_rx *rx_)
829 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
830 poll_fd_wait(rx->fd, POLLIN);
834 netdev_rx_linux_drain(struct netdev_rx *rx_)
836 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
839 int error = netdev_linux_do_ioctl(netdev_rx_get_name(rx_), &ifr,
840 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
844 drain_fd(rx->fd, ifr.ifr_qlen);
847 return drain_rcvbuf(rx->fd);
851 /* Sends 'buffer' on 'netdev'. Returns 0 if successful, otherwise a positive
852 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
853 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
854 * the packet is too big or too small to transmit on the device.
856 * The caller retains ownership of 'buffer' in all cases.
858 * The kernel maintains a packet transmission queue, so the caller is not
859 * expected to do additional queuing of packets. */
861 netdev_linux_send(struct netdev *netdev_, const void *data, size_t size)
866 if (!is_tap_netdev(netdev_)) {
867 /* Use our AF_PACKET socket to send to this device. */
868 struct sockaddr_ll sll;
875 sock = af_packet_sock();
880 error = get_ifindex(netdev_, &ifindex);
885 /* We don't bother setting most fields in sockaddr_ll because the
886 * kernel ignores them for SOCK_RAW. */
887 memset(&sll, 0, sizeof sll);
888 sll.sll_family = AF_PACKET;
889 sll.sll_ifindex = ifindex;
891 iov.iov_base = CONST_CAST(void *, data);
895 msg.msg_namelen = sizeof sll;
898 msg.msg_control = NULL;
899 msg.msg_controllen = 0;
902 retval = sendmsg(sock, &msg, 0);
904 /* Use the tap fd to send to this device. This is essential for
905 * tap devices, because packets sent to a tap device with an
906 * AF_PACKET socket will loop back to be *received* again on the
907 * tap device. This doesn't occur on other interface types
908 * because we attach a socket filter to the rx socket. */
909 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
911 retval = write(netdev->tap_fd, data, size);
915 /* The Linux AF_PACKET implementation never blocks waiting for room
916 * for packets, instead returning ENOBUFS. Translate this into
917 * EAGAIN for the caller. */
918 if (errno == ENOBUFS) {
920 } else if (errno == EINTR) {
922 } else if (errno != EAGAIN) {
923 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
924 netdev_get_name(netdev_), ovs_strerror(errno));
927 } else if (retval != size) {
928 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%zd bytes of "
929 "%zu) on %s", retval, size, netdev_get_name(netdev_));
937 /* Registers with the poll loop to wake up from the next call to poll_block()
938 * when the packet transmission queue has sufficient room to transmit a packet
939 * with netdev_send().
941 * The kernel maintains a packet transmission queue, so the client is not
942 * expected to do additional queuing of packets. Thus, this function is
943 * unlikely to ever be used. It is included for completeness. */
945 netdev_linux_send_wait(struct netdev *netdev)
947 if (is_tap_netdev(netdev)) {
/* TAP device always accepts packets. */
949 poll_immediate_wake();
953 /* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
954 * otherwise a positive errno value. */
956 netdev_linux_set_etheraddr(struct netdev *netdev_,
957 const uint8_t mac[ETH_ADDR_LEN])
959 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
960 struct netdev_saved_flags *sf = NULL;
963 if (netdev->cache_valid & VALID_ETHERADDR) {
964 if (netdev->ether_addr_error) {
965 return netdev->ether_addr_error;
967 if (eth_addr_equals(netdev->etheraddr, mac)) {
970 netdev->cache_valid &= ~VALID_ETHERADDR;
973 /* Tap devices must be brought down before setting the address. */
974 if (is_tap_netdev(netdev_)) {
975 netdev_turn_flags_off(netdev_, NETDEV_UP, &sf);
977 error = set_etheraddr(netdev_get_name(netdev_), mac);
978 if (!error || error == ENODEV) {
979 netdev->ether_addr_error = error;
980 netdev->cache_valid |= VALID_ETHERADDR;
982 memcpy(netdev->etheraddr, mac, ETH_ADDR_LEN);
986 netdev_restore_flags(sf);
991 /* Copies 'netdev''s MAC address to 'mac' which is passed as param. */
993 netdev_linux_get_etheraddr(const struct netdev *netdev_,
994 uint8_t mac[ETH_ADDR_LEN])
996 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
998 if (!(netdev->cache_valid & VALID_ETHERADDR)) {
999 int error = get_etheraddr(netdev_get_name(netdev_),
1002 netdev->ether_addr_error = error;
1003 netdev->cache_valid |= VALID_ETHERADDR;
1006 if (!netdev->ether_addr_error) {
1007 memcpy(mac, netdev->etheraddr, ETH_ADDR_LEN);
1010 return netdev->ether_addr_error;
1013 /* Returns the maximum size of transmitted (and received) packets on 'netdev',
1014 * in bytes, not including the hardware header; thus, this is typically 1500
1015 * bytes for Ethernet devices. */
1017 netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
1019 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1020 if (!(netdev->cache_valid & VALID_MTU)) {
1024 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
1025 SIOCGIFMTU, "SIOCGIFMTU");
1027 netdev->netdev_mtu_error = error;
1028 netdev->mtu = ifr.ifr_mtu;
1029 netdev->cache_valid |= VALID_MTU;
1032 if (!netdev->netdev_mtu_error) {
1033 *mtup = netdev->mtu;
1035 return netdev->netdev_mtu_error;
/* Sets the maximum transmission unit (MTU) of 'netdev_' to 'mtu' using the
 * Linux networking ioctl interface (SIOCSIFMTU). */
1042 netdev_linux_set_mtu(const struct netdev *netdev_, int mtu)
1044 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1048 if (netdev->cache_valid & VALID_MTU) {
1049 if (netdev->netdev_mtu_error) {
1050 return netdev->netdev_mtu_error;
1052 if (netdev->mtu == mtu) {
1055 netdev->cache_valid &= ~VALID_MTU;
1058 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
1059 SIOCSIFMTU, "SIOCSIFMTU");
1060 if (!error || error == ENODEV) {
1061 netdev->netdev_mtu_error = error;
1062 netdev->mtu = ifr.ifr_mtu;
1063 netdev->cache_valid |= VALID_MTU;
1068 /* Returns the ifindex of 'netdev', if successful, as a positive number.
1069 * On failure, returns a negative errno value. */
1071 netdev_linux_get_ifindex(const struct netdev *netdev)
1075 error = get_ifindex(netdev, &ifindex);
1076 return error ? -error : ifindex;
1080 netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
1082 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1084 if (netdev->miimon_interval > 0) {
1085 *carrier = netdev->miimon;
1087 *carrier = (netdev->ifi_flags & IFF_RUNNING) != 0;
1093 static long long int
1094 netdev_linux_get_carrier_resets(const struct netdev *netdev)
1096 return netdev_linux_cast(netdev)->carrier_resets;
1100 netdev_linux_do_miimon(const char *name, int cmd, const char *cmd_name,
1101 struct mii_ioctl_data *data)
1106 memset(&ifr, 0, sizeof ifr);
1107 memcpy(&ifr.ifr_data, data, sizeof *data);
1108 error = netdev_linux_do_ioctl(name, &ifr, cmd, cmd_name);
1109 memcpy(data, &ifr.ifr_data, sizeof *data);
1115 netdev_linux_get_miimon(const char *name, bool *miimon)
1117 struct mii_ioctl_data data;
1122 memset(&data, 0, sizeof data);
1123 error = netdev_linux_do_miimon(name, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
1125 /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
1126 data.reg_num = MII_BMSR;
1127 error = netdev_linux_do_miimon(name, SIOCGMIIREG, "SIOCGMIIREG",
1131 *miimon = !!(data.val_out & BMSR_LSTATUS);
1133 VLOG_WARN_RL(&rl, "%s: failed to query MII", name);
1136 struct ethtool_cmd ecmd;
1138 VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
1141 COVERAGE_INC(netdev_get_ethtool);
1142 memset(&ecmd, 0, sizeof ecmd);
1143 error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
1146 struct ethtool_value eval;
1148 memcpy(&eval, &ecmd, sizeof eval);
1149 *miimon = !!eval.data;
1151 VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
1159 netdev_linux_set_miimon_interval(struct netdev *netdev_,
1160 long long int interval)
1162 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1164 interval = interval > 0 ? MAX(interval, 100) : 0;
1165 if (netdev->miimon_interval != interval) {
1166 netdev->miimon_interval = interval;
1167 timer_set_expired(&netdev->miimon_timer);
/* Walks every device of 'netdev_linux_class'.  A device is examined only when
 * its miimon interval is positive and its miimon timer has expired: the link
 * state is re-read with netdev_linux_get_miimon(), a change is recorded via
 * netdev_linux_changed(), and the timer is re-armed for another interval. */
1174 netdev_linux_miimon_run(void)
1176 struct shash device_shash;
1177 struct shash_node *node;
1179 shash_init(&device_shash);
1180 netdev_get_devices(&netdev_linux_class, &device_shash);
1181 SHASH_FOR_EACH (node, &device_shash) {
1182 struct netdev *netdev = node->data;
1183 struct netdev_linux *dev = netdev_linux_cast(netdev);
/* Devices with monitoring disabled or a timer still running are not polled. */
1186 if (dev->miimon_interval <= 0 || !timer_expired(&dev->miimon_timer)) {
1190 netdev_linux_get_miimon(dev->up.name, &miimon);
1191 if (miimon != dev->miimon) {
1192 dev->miimon = miimon;
1193 netdev_linux_changed(dev, dev->ifi_flags, 0);
1196 timer_set_duration(&dev->miimon_timer, dev->miimon_interval);
1199 shash_destroy(&device_shash);
/* For every device of 'netdev_linux_class' whose miimon interval is positive,
 * calls timer_wait() on that device's miimon timer. */
1203 netdev_linux_miimon_wait(void)
1205 struct shash device_shash;
1206 struct shash_node *node;
1208 shash_init(&device_shash);
1209 netdev_get_devices(&netdev_linux_class, &device_shash);
1210 SHASH_FOR_EACH (node, &device_shash) {
1211 struct netdev *netdev = node->data;
1212 struct netdev_linux *dev = netdev_linux_cast(netdev);
1214 if (dev->miimon_interval > 0) {
1215 timer_wait(&dev->miimon_timer);
1218 shash_destroy(&device_shash);
1221 /* Checks whether we can use RTM_GETLINK to get network device statistics.
1222 * In pre-2.6.19 kernels, this was only available if wireless extensions were
1225 check_for_working_netlink_stats(void)
1227 /* Decide on the netdev_get_stats() implementation to use. Netlink is
1228 * preferable, so if that works, we'll use it. */
/* The loopback device "lo" is the probe target. */
1229 int ifindex = do_get_ifindex("lo");
1231 VLOG_WARN("failed to get ifindex for lo, "
1232 "obtaining netdev stats from proc");
1235 struct netdev_stats stats;
/* Try an actual netlink stats fetch; its result decides which log message
 * below is emitted. */
1236 int error = get_stats_via_netlink(ifindex, &stats);
1238 VLOG_DBG("obtaining netdev stats via rtnetlink");
1241 VLOG_INFO("RTM_GETLINK failed (%s), obtaining netdev stats "
1242 "via proc (you are probably running a pre-2.6.19 "
1243 "kernel)", ovs_strerror(error));
/* NOTE(review): only the signature is visible here.  Judging by the name and
 * by its use on rx/tx counter pairs in netdev_tap_get_stats(), this
 * presumably exchanges the values '*a' and '*b' -- confirm against the body. */
1250 swap_uint64(uint64_t *a, uint64_t *b)
1257 /* Copies 'src' into 'dst', performing format conversion in the process.
1259 * 'src' is allowed to be misaligned. */
1261 netdev_stats_from_ovs_vport_stats(struct netdev_stats *dst,
1262 const struct ovs_vport_stats *src)
/* Each 64-bit counter is read with get_unaligned_u64(), which is what lets
 * 'src' be misaligned. */
1264 dst->rx_packets = get_unaligned_u64(&src->rx_packets);
1265 dst->tx_packets = get_unaligned_u64(&src->tx_packets);
1266 dst->rx_bytes = get_unaligned_u64(&src->rx_bytes);
1267 dst->tx_bytes = get_unaligned_u64(&src->tx_bytes);
1268 dst->rx_errors = get_unaligned_u64(&src->rx_errors);
1269 dst->tx_errors = get_unaligned_u64(&src->tx_errors);
1270 dst->rx_dropped = get_unaligned_u64(&src->rx_dropped);
1271 dst->tx_dropped = get_unaligned_u64(&src->tx_dropped);
/* The detailed counters below are not taken from 'src'; they are set to 0. */
1273 dst->collisions = 0;
1274 dst->rx_length_errors = 0;
1275 dst->rx_over_errors = 0;
1276 dst->rx_crc_errors = 0;
1277 dst->rx_frame_errors = 0;
1278 dst->rx_fifo_errors = 0;
1279 dst->rx_missed_errors = 0;
1280 dst->tx_aborted_errors = 0;
1281 dst->tx_carrier_errors = 0;
1282 dst->tx_fifo_errors = 0;
1283 dst->tx_heartbeat_errors = 0;
1284 dst->tx_window_errors = 0;
/* Looks up the vport named after 'netdev' with dpif_linux_vport_get() and,
 * when the reply carries stats, converts them into '*stats' with
 * netdev_stats_from_ovs_vport_stats().
 *
 * NOTE(review): the handling of a failed lookup and of a reply without stats
 * is not visible here. */
1288 get_stats_via_vport__(const struct netdev *netdev, struct netdev_stats *stats)
1290 struct dpif_linux_vport reply;
1294 error = dpif_linux_vport_get(netdev_get_name(netdev), &reply, &buf);
1297 } else if (!reply.stats) {
1302 netdev_stats_from_ovs_vport_stats(stats, reply.stats);
/* Refreshes '*stats' from the vport layer, unless an earlier failure is
 * cached (vport_stats_error nonzero while VALID_VPORT_STAT_ERROR is set).
 *
 * A failure other than ENOENT is logged with a rate-limited warning.  The
 * outcome, success or error, is stored in netdev->vport_stats_error and the
 * cache bit is set. */
1310 get_stats_via_vport(const struct netdev *netdev_,
1311 struct netdev_stats *stats)
1313 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1315 if (!netdev->vport_stats_error ||
1316 !(netdev->cache_valid & VALID_VPORT_STAT_ERROR)) {
1319 error = get_stats_via_vport__(netdev_, stats);
1320 if (error && error != ENOENT) {
1321 VLOG_WARN_RL(&rl, "%s: obtaining netdev stats via vport failed "
1323 netdev_get_name(netdev_), ovs_strerror(error));
1325 netdev->vport_stats_error = error;
1326 netdev->cache_valid |= VALID_VPORT_STAT_ERROR;
/* Fetches the kernel's statistics for 'netdev_' into '*stats'.
 *
 * check_for_working_netlink_stats() is evaluated once, guarded by an
 * ovsthread_once.  When it reports success the stats come from
 * get_stats_via_netlink() using the device's ifindex; otherwise they come
 * from get_stats_via_proc() using the device's name.  A failure is logged
 * with the numeric error code. */
1331 netdev_linux_sys_get_stats(const struct netdev *netdev_,
1332 struct netdev_stats *stats)
1334 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
1335 static int use_netlink_stats;
1338 if (ovsthread_once_start(&once)) {
1339 use_netlink_stats = check_for_working_netlink_stats();
1340 ovsthread_once_done(&once);
1343 if (use_netlink_stats) {
1346 error = get_ifindex(netdev_, &ifindex);
1348 error = get_stats_via_netlink(ifindex, stats);
1351 error = get_stats_via_proc(netdev_get_name(netdev_), stats);
1355 VLOG_WARN_RL(&rl, "%s: linux-sys get stats failed %d",
1356 netdev_get_name(netdev_), error);
1362 /* Retrieves current device stats for 'netdev-linux'. */
1364 netdev_linux_get_stats(const struct netdev *netdev_,
1365 struct netdev_stats *stats)
1367 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1368 struct netdev_stats dev_stats;
/* 'stats' receives the vport-layer counters; 'dev_stats' receives the
 * kernel's own view of the device. */
1371 get_stats_via_vport(netdev_, stats);
1373 error = netdev_linux_sys_get_stats(netdev_, &dev_stats);
1376 if (netdev->vport_stats_error) {
1383 if (netdev->vport_stats_error) {
1384 /* stats not available from OVS then use ioctl stats. */
/* NOTE(review): the additions below appear to belong to the branch where
 * vport stats were obtained, folding the kernel's error and detail counters
 * into them -- confirm against the surrounding branch structure. */
1387 stats->rx_errors += dev_stats.rx_errors;
1388 stats->tx_errors += dev_stats.tx_errors;
1389 stats->rx_dropped += dev_stats.rx_dropped;
1390 stats->tx_dropped += dev_stats.tx_dropped;
1391 stats->multicast += dev_stats.multicast;
1392 stats->collisions += dev_stats.collisions;
1393 stats->rx_length_errors += dev_stats.rx_length_errors;
1394 stats->rx_over_errors += dev_stats.rx_over_errors;
1395 stats->rx_crc_errors += dev_stats.rx_crc_errors;
1396 stats->rx_frame_errors += dev_stats.rx_frame_errors;
1397 stats->rx_fifo_errors += dev_stats.rx_fifo_errors;
1398 stats->rx_missed_errors += dev_stats.rx_missed_errors;
1399 stats->tx_aborted_errors += dev_stats.tx_aborted_errors;
1400 stats->tx_carrier_errors += dev_stats.tx_carrier_errors;
1401 stats->tx_fifo_errors += dev_stats.tx_fifo_errors;
1402 stats->tx_heartbeat_errors += dev_stats.tx_heartbeat_errors;
1403 stats->tx_window_errors += dev_stats.tx_window_errors;
1408 /* Retrieves current device stats for 'netdev-tap' netdev or
1409 * netdev-internal. */
1411 netdev_tap_get_stats(const struct netdev *netdev_, struct netdev_stats *stats)
1413 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1414 struct netdev_stats dev_stats;
/* 'stats' receives the vport-layer counters; 'dev_stats' receives the
 * kernel's own view of the device.  Further down, rx/tx pairs are swapped
 * with swap_uint64() and kernel tx counters are added to rx totals (and
 * kernel rx counters to tx totals). */
1417 get_stats_via_vport(netdev_, stats);
1419 error = netdev_linux_sys_get_stats(netdev_, &dev_stats);
1421 if (netdev->vport_stats_error) {
1428 /* If this port is an internal port then the transmit and receive stats
1429 * will appear to be swapped relative to the other ports since we are the
1430 * one sending the data, not a remote computer. For consistency, we swap
1431 * them back here. This does not apply if we are getting stats from the
1432 * vport layer because it always tracks stats from the perspective of the
1434 if (netdev->vport_stats_error) {
1436 swap_uint64(&stats->rx_packets, &stats->tx_packets);
1437 swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
1438 swap_uint64(&stats->rx_errors, &stats->tx_errors);
1439 swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
1440 stats->rx_length_errors = 0;
1441 stats->rx_over_errors = 0;
1442 stats->rx_crc_errors = 0;
1443 stats->rx_frame_errors = 0;
1444 stats->rx_fifo_errors = 0;
1445 stats->rx_missed_errors = 0;
1446 stats->tx_aborted_errors = 0;
1447 stats->tx_carrier_errors = 0;
1448 stats->tx_fifo_errors = 0;
1449 stats->tx_heartbeat_errors = 0;
1450 stats->tx_window_errors = 0;
1452 stats->rx_dropped += dev_stats.tx_dropped;
1453 stats->tx_dropped += dev_stats.rx_dropped;
1455 stats->rx_errors += dev_stats.tx_errors;
1456 stats->tx_errors += dev_stats.rx_errors;
1458 stats->multicast += dev_stats.multicast;
1459 stats->collisions += dev_stats.collisions;
/* Fills '*stats' from the vport layer only and returns the vport stats error
 * recorded by get_stats_via_vport() (0 when the vport query succeeded). */
1465 netdev_internal_get_stats(const struct netdev *netdev_,
1466 struct netdev_stats *stats)
1468 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1470 get_stats_via_vport(netdev_, stats);
1471 return netdev->vport_stats_error;
/* Pushes the packet, byte, error and drop counters from '*stats' to the vport
 * named after 'netdev', using an OVS_VPORT_CMD_SET transaction.  Other fields
 * of '*stats' are not sent. */
1475 netdev_internal_set_stats(struct netdev *netdev,
1476 const struct netdev_stats *stats)
1478 struct ovs_vport_stats vport_stats;
1479 struct dpif_linux_vport vport;
1482 vport_stats.rx_packets = stats->rx_packets;
1483 vport_stats.tx_packets = stats->tx_packets;
1484 vport_stats.rx_bytes = stats->rx_bytes;
1485 vport_stats.tx_bytes = stats->tx_bytes;
1486 vport_stats.rx_errors = stats->rx_errors;
1487 vport_stats.tx_errors = stats->tx_errors;
1488 vport_stats.rx_dropped = stats->rx_dropped;
1489 vport_stats.tx_dropped = stats->tx_dropped;
/* Build the request: the vport is identified by name and carries the stats. */
1491 dpif_linux_vport_init(&vport);
1492 vport.cmd = OVS_VPORT_CMD_SET;
1493 vport.name = netdev_get_name(netdev);
1494 vport.stats = &vport_stats;
1496 err = dpif_linux_vport_transact(&vport, NULL, NULL);
1498 /* If the vport layer doesn't know about the device, that doesn't mean it
1499 * doesn't exist (after all, we were able to open it when netdev_open() was
1500 * called), it just means that it isn't attached and we'll be getting
1501 * stats a different way. */
1502 if (err == ENODEV) {
/* Refreshes the feature bitmaps cached in 'netdev' ('supported',
 * 'advertised', 'current' and 'peer') from an ETHTOOL_GSET query.
 *
 * The VALID_FEATURES cache bit is tested first.  The ethtool result is stored
 * in netdev->get_features_error and VALID_FEATURES is set, so the outcome is
 * cached as well. */
1510 netdev_linux_read_features(struct netdev_linux *netdev)
1512 struct ethtool_cmd ecmd;
1516 if (netdev->cache_valid & VALID_FEATURES) {
1520 COVERAGE_INC(netdev_get_ethtool);
1521 memset(&ecmd, 0, sizeof ecmd);
1522 error = netdev_linux_do_ethtool(netdev->up.name, &ecmd,
1523 ETHTOOL_GSET, "ETHTOOL_GSET");
1528 /* Supported features. */
1529 netdev->supported = 0;
1530 if (ecmd.supported & SUPPORTED_10baseT_Half) {
1531 netdev->supported |= NETDEV_F_10MB_HD;
1533 if (ecmd.supported & SUPPORTED_10baseT_Full) {
1534 netdev->supported |= NETDEV_F_10MB_FD;
1536 if (ecmd.supported & SUPPORTED_100baseT_Half) {
1537 netdev->supported |= NETDEV_F_100MB_HD;
1539 if (ecmd.supported & SUPPORTED_100baseT_Full) {
1540 netdev->supported |= NETDEV_F_100MB_FD;
1542 if (ecmd.supported & SUPPORTED_1000baseT_Half) {
1543 netdev->supported |= NETDEV_F_1GB_HD;
1545 if (ecmd.supported & SUPPORTED_1000baseT_Full) {
1546 netdev->supported |= NETDEV_F_1GB_FD;
1548 if (ecmd.supported & SUPPORTED_10000baseT_Full) {
1549 netdev->supported |= NETDEV_F_10GB_FD;
1551 if (ecmd.supported & SUPPORTED_TP) {
1552 netdev->supported |= NETDEV_F_COPPER;
1554 if (ecmd.supported & SUPPORTED_FIBRE) {
1555 netdev->supported |= NETDEV_F_FIBER;
1557 if (ecmd.supported & SUPPORTED_Autoneg) {
1558 netdev->supported |= NETDEV_F_AUTONEG;
1560 if (ecmd.supported & SUPPORTED_Pause) {
1561 netdev->supported |= NETDEV_F_PAUSE;
1563 if (ecmd.supported & SUPPORTED_Asym_Pause) {
1564 netdev->supported |= NETDEV_F_PAUSE_ASYM;
1567 /* Advertised features. */
1568 netdev->advertised = 0;
1569 if (ecmd.advertising & ADVERTISED_10baseT_Half) {
1570 netdev->advertised |= NETDEV_F_10MB_HD;
1572 if (ecmd.advertising & ADVERTISED_10baseT_Full) {
1573 netdev->advertised |= NETDEV_F_10MB_FD;
1575 if (ecmd.advertising & ADVERTISED_100baseT_Half) {
1576 netdev->advertised |= NETDEV_F_100MB_HD;
1578 if (ecmd.advertising & ADVERTISED_100baseT_Full) {
1579 netdev->advertised |= NETDEV_F_100MB_FD;
1581 if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
1582 netdev->advertised |= NETDEV_F_1GB_HD;
1584 if (ecmd.advertising & ADVERTISED_1000baseT_Full) {
1585 netdev->advertised |= NETDEV_F_1GB_FD;
1587 if (ecmd.advertising & ADVERTISED_10000baseT_Full) {
1588 netdev->advertised |= NETDEV_F_10GB_FD;
1590 if (ecmd.advertising & ADVERTISED_TP) {
1591 netdev->advertised |= NETDEV_F_COPPER;
1593 if (ecmd.advertising & ADVERTISED_FIBRE) {
1594 netdev->advertised |= NETDEV_F_FIBER;
1596 if (ecmd.advertising & ADVERTISED_Autoneg) {
1597 netdev->advertised |= NETDEV_F_AUTONEG;
1599 if (ecmd.advertising & ADVERTISED_Pause) {
1600 netdev->advertised |= NETDEV_F_PAUSE;
1602 if (ecmd.advertising & ADVERTISED_Asym_Pause) {
1603 netdev->advertised |= NETDEV_F_PAUSE_ASYM;
1606 /* Current settings. */
/* For 10, 100 and 1000 the duplex flag selects the _FD or _HD bit; the faster
 * speeds map to the _FD bit only. */
1608 if (speed == SPEED_10) {
1609 netdev->current = ecmd.duplex ? NETDEV_F_10MB_FD : NETDEV_F_10MB_HD;
1610 } else if (speed == SPEED_100) {
1611 netdev->current = ecmd.duplex ? NETDEV_F_100MB_FD : NETDEV_F_100MB_HD;
1612 } else if (speed == SPEED_1000) {
1613 netdev->current = ecmd.duplex ? NETDEV_F_1GB_FD : NETDEV_F_1GB_HD;
1614 } else if (speed == SPEED_10000) {
1615 netdev->current = NETDEV_F_10GB_FD;
/* The three speeds below are compared as numeric literals, not SPEED_* names. */
1616 } else if (speed == 40000) {
1617 netdev->current = NETDEV_F_40GB_FD;
1618 } else if (speed == 100000) {
1619 netdev->current = NETDEV_F_100GB_FD;
1620 } else if (speed == 1000000) {
1621 netdev->current = NETDEV_F_1TB_FD;
1623 netdev->current = 0;
1626 if (ecmd.port == PORT_TP) {
1627 netdev->current |= NETDEV_F_COPPER;
1628 } else if (ecmd.port == PORT_FIBRE) {
1629 netdev->current |= NETDEV_F_FIBER;
1633 netdev->current |= NETDEV_F_AUTONEG;
1636 /* Peer advertisements. */
1637 netdev->peer = 0; /* XXX */
1640 netdev->cache_valid |= VALID_FEATURES;
1641 netdev->get_features_error = error;
1644 /* Stores the features supported by 'netdev' into each of '*current',
1645 * '*advertised', '*supported', and '*peer' that are non-null. Each value is a
1646 * bitmap of NETDEV_* bits. Returns 0 if successful, otherwise a positive
1649 netdev_linux_get_features(const struct netdev *netdev_,
1650 enum netdev_features *current,
1651 enum netdev_features *advertised,
1652 enum netdev_features *supported,
1653 enum netdev_features *peer)
1655 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1657 netdev_linux_read_features(netdev);
1659 if (!netdev->get_features_error) {
1660 *current = netdev->current;
1661 *advertised = netdev->advertised;
1662 *supported = netdev->supported;
1663 *peer = netdev->peer;
1665 return netdev->get_features_error;
1668 /* Set the features advertised by 'netdev' to 'advertise'. */
1670 netdev_linux_set_advertisements(struct netdev *netdev,
1671 enum netdev_features advertise)
1673 struct ethtool_cmd ecmd;
/* Read the current settings first (ETHTOOL_GSET); only the 'advertising'
 * field is changed before they are written back. */
1676 COVERAGE_INC(netdev_get_ethtool);
1677 memset(&ecmd, 0, sizeof ecmd);
1678 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1679 ETHTOOL_GSET, "ETHTOOL_GSET");
/* Rebuild the advertising mask from scratch out of the NETDEV_F_* bits. */
1684 ecmd.advertising = 0;
1685 if (advertise & NETDEV_F_10MB_HD) {
1686 ecmd.advertising |= ADVERTISED_10baseT_Half;
1688 if (advertise & NETDEV_F_10MB_FD) {
1689 ecmd.advertising |= ADVERTISED_10baseT_Full;
1691 if (advertise & NETDEV_F_100MB_HD) {
1692 ecmd.advertising |= ADVERTISED_100baseT_Half;
1694 if (advertise & NETDEV_F_100MB_FD) {
1695 ecmd.advertising |= ADVERTISED_100baseT_Full;
1697 if (advertise & NETDEV_F_1GB_HD) {
1698 ecmd.advertising |= ADVERTISED_1000baseT_Half;
1700 if (advertise & NETDEV_F_1GB_FD) {
1701 ecmd.advertising |= ADVERTISED_1000baseT_Full;
1703 if (advertise & NETDEV_F_10GB_FD) {
1704 ecmd.advertising |= ADVERTISED_10000baseT_Full;
1706 if (advertise & NETDEV_F_COPPER) {
1707 ecmd.advertising |= ADVERTISED_TP;
1709 if (advertise & NETDEV_F_FIBER) {
1710 ecmd.advertising |= ADVERTISED_FIBRE;
1712 if (advertise & NETDEV_F_AUTONEG) {
1713 ecmd.advertising |= ADVERTISED_Autoneg;
1715 if (advertise & NETDEV_F_PAUSE) {
1716 ecmd.advertising |= ADVERTISED_Pause;
1718 if (advertise & NETDEV_F_PAUSE_ASYM) {
1719 ecmd.advertising |= ADVERTISED_Asym_Pause;
/* Write the modified settings back; the ETHTOOL_SSET result is returned. */
1721 COVERAGE_INC(netdev_set_ethtool);
1722 return netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1723 ETHTOOL_SSET, "ETHTOOL_SSET");
1726 /* Attempts to set input rate limiting (policing) policy. Returns 0 if
1727 * successful, otherwise a positive errno value. */
1729 netdev_linux_set_policing(struct netdev *netdev_,
1730 uint32_t kbits_rate, uint32_t kbits_burst)
1732 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1733 const char *netdev_name = netdev_get_name(netdev_);
1737 kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
1738 : !kbits_burst ? 1000 /* Default to 1000 kbits if 0. */
1739 : kbits_burst); /* Stick with user-specified value. */
/* With a valid cache, a cached error is returned as-is and an unchanged
 * rate/burst pair needs no work; otherwise the cache bit is cleared before
 * reprogramming. */
1741 if (netdev->cache_valid & VALID_POLICING) {
1742 if (netdev->netdev_policing_error) {
1743 return netdev->netdev_policing_error;
1746 if (netdev->kbits_rate == kbits_rate &&
1747 netdev->kbits_burst == kbits_burst) {
1748 /* Assume that settings haven't changed since we last set them. */
1751 netdev->cache_valid &= ~VALID_POLICING;
1754 COVERAGE_INC(netdev_set_policing);
1755 /* Remove any existing ingress qdisc. */
1756 error = tc_add_del_ingress_qdisc(netdev_, false);
1758 VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
1759 netdev_name, ovs_strerror(error));
/* Re-create the ingress qdisc and attach the policer to it. */
1764 error = tc_add_del_ingress_qdisc(netdev_, true);
1766 VLOG_WARN_RL(&rl, "%s: adding policing qdisc failed: %s",
1767 netdev_name, ovs_strerror(error));
1771 error = tc_add_policer(netdev_, kbits_rate, kbits_burst);
1773 VLOG_WARN_RL(&rl, "%s: adding policing action failed: %s",
1774 netdev_name, ovs_strerror(error));
/* Remember what was applied so an identical later request can be skipped. */
1779 netdev->kbits_rate = kbits_rate;
1780 netdev->kbits_burst = kbits_burst;
/* Only success and ENODEV are cached as the policing outcome. */
1783 if (!error || error == ENODEV) {
1784 netdev->netdev_policing_error = error;
1785 netdev->cache_valid |= VALID_POLICING;
/* Adds to 'types' the OVS name of every entry in the NULL-terminated 'tcs'
 * array that has a tc_install callback and a non-empty ovs_name.  'netdev' is
 * unused. */
1791 netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
1794 const struct tc_ops *const *opsp;
1796 for (opsp = tcs; *opsp != NULL; opsp++) {
1797 const struct tc_ops *ops = *opsp;
1798 if (ops->tc_install && ops->ovs_name[0] != '\0') {
1799 sset_add(types, ops->ovs_name);
/* Scans the NULL-terminated 'tcs' array for an entry whose ovs_name equals
 * 'name'.  NOTE(review): the return statements are not visible here;
 * presumably the match is returned, or NULL when there is none (callers test
 * the result for NULL). */
1805 static const struct tc_ops *
1806 tc_lookup_ovs_name(const char *name)
1808 const struct tc_ops *const *opsp;
1810 for (opsp = tcs; *opsp != NULL; opsp++) {
1811 const struct tc_ops *ops = *opsp;
1812 if (!strcmp(name, ops->ovs_name)) {
/* Scans the NULL-terminated 'tcs' array for an entry whose linux_name equals
 * 'name'; entries with a null linux_name are never matched.  NOTE(review):
 * the return statements are not visible here; presumably the match is
 * returned, or NULL when there is none. */
1819 static const struct tc_ops *
1820 tc_lookup_linux_name(const char *name)
1822 const struct tc_ops *const *opsp;
1824 for (opsp = tcs; *opsp != NULL; opsp++) {
1825 const struct tc_ops *ops = *opsp;
1826 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
/* Searches the bucket selected by 'hash' in netdev->tc->queues for a queue
 * whose queue_id equals 'queue_id'.  NOTE(review): the return statements are
 * not visible here; presumably the matching queue is returned, or NULL
 * otherwise. */
1833 static struct tc_queue *
1834 tc_find_queue__(const struct netdev *netdev_, unsigned int queue_id,
1837 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1838 struct tc_queue *queue;
1840 HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev->tc->queues) {
1841 if (queue->queue_id == queue_id) {
/* Convenience wrapper around tc_find_queue__() that uses
 * hash_int(queue_id, 0) as the hash for the bucket lookup. */
1848 static struct tc_queue *
1849 tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
1851 return tc_find_queue__(netdev, queue_id, hash_int(queue_id, 0));
/* Looks up the tc_ops registered under the OVS name 'type' and reports its
 * n_queues in 'caps'.  'netdev' is unused.  NOTE(review): the handling of an
 * unknown 'type' is not visible here. */
1855 netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
1857 struct netdev_qos_capabilities *caps)
1859 const struct tc_ops *ops = tc_lookup_ovs_name(type);
1863 caps->n_queues = ops->n_queues;
/* Queries the qdisc currently in effect on 'netdev_' with tc_query_qdisc(),
 * stores its OVS name in '*typep', and fills 'details' through the qdisc's
 * qdisc_get callback when that callback exists. */
1868 netdev_linux_get_qos(const struct netdev *netdev_,
1869 const char **typep, struct smap *details)
1871 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1874 error = tc_query_qdisc(netdev_);
1879 *typep = netdev->tc->ops->ovs_name;
1880 return (netdev->tc->ops->qdisc_get
1881 ? netdev->tc->ops->qdisc_get(netdev_, details)
/* Configures QoS of OVS type 'type' with parameters 'details' on 'netdev_'.
 *
 * 'type' must name a tc_ops that has a tc_install callback.  If the device
 * already runs that qdisc type, only its parameters are updated through
 * qdisc_set (when present).  Otherwise the existing qdisc is deleted and the
 * new one installed. */
1886 netdev_linux_set_qos(struct netdev *netdev_,
1887 const char *type, const struct smap *details)
1889 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1890 const struct tc_ops *new_ops;
1893 new_ops = tc_lookup_ovs_name(type);
1894 if (!new_ops || !new_ops->tc_install) {
1898 error = tc_query_qdisc(netdev_);
1903 if (new_ops == netdev->tc->ops) {
1904 return new_ops->qdisc_set ? new_ops->qdisc_set(netdev_, details) : 0;
1906 /* Delete existing qdisc. */
1907 error = tc_del_qdisc(netdev_);
1911 ovs_assert(netdev->tc == NULL);
1913 /* Install new qdisc. */
1914 error = new_ops->tc_install(netdev_, details);
/* Invariant: netdev->tc is non-null exactly when installation succeeded. */
1915 ovs_assert((error == 0) == (netdev->tc != NULL));
/* After tc_query_qdisc(), looks up queue 'queue_id' with tc_find_queue() and
 * fills 'details' through the qdisc's class_get callback.  NOTE(review): the
 * condition guarding the class_get call and the error results are not
 * visible here. */
1922 netdev_linux_get_queue(const struct netdev *netdev_,
1923 unsigned int queue_id, struct smap *details)
1925 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1928 error = tc_query_qdisc(netdev_);
1932 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
1934 ? netdev->tc->ops->class_get(netdev_, queue, details)
/* After tc_query_qdisc(), configures queue 'queue_id' with 'details' through
 * the qdisc's class_set callback.  A 'queue_id' at or beyond the qdisc's
 * n_queues, or a qdisc without class_set, takes a separate branch that never
 * reaches class_set. */
1940 netdev_linux_set_queue(struct netdev *netdev_,
1941 unsigned int queue_id, const struct smap *details)
1943 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1946 error = tc_query_qdisc(netdev_);
1949 } else if (queue_id >= netdev->tc->ops->n_queues
1950 || !netdev->tc->ops->class_set) {
1954 return netdev->tc->ops->class_set(netdev_, queue_id, details);
/* After tc_query_qdisc(), looks up queue 'queue_id' and removes it through
 * the qdisc's class_delete callback; a qdisc without class_delete takes a
 * separate branch.  NOTE(review): the condition guarding the class_delete
 * call and the error results are not visible here. */
1958 netdev_linux_delete_queue(struct netdev *netdev_, unsigned int queue_id)
1960 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1963 error = tc_query_qdisc(netdev_);
1966 } else if (!netdev->tc->ops->class_delete) {
1969 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
1971 ? netdev->tc->ops->class_delete(netdev_, queue)
/* After tc_query_qdisc(), looks up queue 'queue_id', copies its creation
 * time into stats->created, and lets the qdisc's class_get_stats callback
 * fill in the rest of '*stats'.  A qdisc without class_get_stats takes a
 * separate branch. */
1977 netdev_linux_get_queue_stats(const struct netdev *netdev_,
1978 unsigned int queue_id,
1979 struct netdev_queue_stats *stats)
1981 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1984 error = tc_query_qdisc(netdev_);
1987 } else if (!netdev->tc->ops->class_get_stats) {
1990 const struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
1994 stats->created = queue->created;
1995 return netdev->tc->ops->class_get_stats(netdev_, queue, stats);
/* Builds an RTM_GETTCLASS request for 'netdev' with tcm_parent 0, starts a
 * NETLINK_ROUTE dump of it in 'dump', and releases the request buffer.  The
 * caller tests the result as a boolean (see netdev_linux_dump_queue_stats()). */
2000 start_queue_dump(const struct netdev *netdev, struct nl_dump *dump)
2002 struct ofpbuf request;
2003 struct tcmsg *tcmsg;
2005 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
2009 tcmsg->tcm_parent = 0;
2010 nl_dump_start(dump, NETLINK_ROUTE, &request);
2011 ofpbuf_uninit(&request);
/* After tc_query_qdisc(), iterates over every queue in netdev->tc->queues,
 * fetches its configuration with the qdisc's class_get callback into a
 * scratch smap, and hands it to 'cb' together with the queue id and 'aux'.
 * A qdisc without class_get takes a separate branch. */
2016 netdev_linux_dump_queues(const struct netdev *netdev_,
2017 netdev_dump_queues_cb *cb, void *aux)
2019 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2020 struct tc_queue *queue, *next_queue;
2021 struct smap details;
2025 error = tc_query_qdisc(netdev_);
2028 } else if (!netdev->tc->ops->class_get) {
2033 smap_init(&details);
/* The _SAFE iteration form keeps a pointer to the next queue in 'next_queue'. */
2034 HMAP_FOR_EACH_SAFE (queue, next_queue, hmap_node,
2035 &netdev->tc->queues) {
/* 'details' is reused across queues, so empty it before each fetch. */
2036 smap_clear(&details);
2038 error = netdev->tc->ops->class_get(netdev_, queue, &details);
2040 (*cb)(queue->queue_id, &details, aux);
2045 smap_destroy(&details);
/* After tc_query_qdisc(), starts a kernel dump with start_queue_dump() and
 * passes every reply message to the qdisc's class_dump_stats callback along
 * with 'cb' and 'aux'.  Returns the nl_dump_done() error if there is one,
 * otherwise 'last_error'.  A qdisc without class_dump_stats takes a separate
 * branch. */
2051 netdev_linux_dump_queue_stats(const struct netdev *netdev_,
2052 netdev_dump_queue_stats_cb *cb, void *aux)
2054 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2055 struct nl_dump dump;
2060 error = tc_query_qdisc(netdev_);
2063 } else if (!netdev->tc->ops->class_dump_stats) {
2068 if (!start_queue_dump(netdev_, &dump)) {
2071 while (nl_dump_next(&dump, &msg)) {
2072 error = netdev->tc->ops->class_dump_stats(netdev_, &msg, cb, aux);
2078 error = nl_dump_done(&dump);
2079 return error ? error : last_error;
/* Stores the IPv4 address and netmask of 'netdev_' in '*address' and
 * '*netmask'.
 *
 * Unless VALID_IN4 is set, both values are first read with SIOCGIFADDR and
 * SIOCGIFNETMASK into the netdev's cache.  Returns EADDRNOTAVAIL when the
 * resulting address is INADDR_ANY, otherwise 0 on this path. */
2083 netdev_linux_get_in4(const struct netdev *netdev_,
2084 struct in_addr *address, struct in_addr *netmask)
2086 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2088 if (!(netdev->cache_valid & VALID_IN4)) {
2091 error = netdev_linux_get_ipv4(netdev_, &netdev->address,
2092 SIOCGIFADDR, "SIOCGIFADDR");
2097 error = netdev_linux_get_ipv4(netdev_, &netdev->netmask,
2098 SIOCGIFNETMASK, "SIOCGIFNETMASK");
2103 netdev->cache_valid |= VALID_IN4;
2105 *address = netdev->address;
2106 *netmask = netdev->netmask;
2107 return address->s_addr == INADDR_ANY ? EADDRNOTAVAIL : 0;
/* Assigns IPv4 'address' to 'netdev_' with SIOCSIFADDR, records 'address' and
 * 'netmask' in the netdev's cache (setting VALID_IN4), and, when 'address'
 * is not INADDR_ANY, applies 'netmask' with SIOCSIFNETMASK.
 *
 * NOTE(review): the cache update presumably happens only when SIOCSIFADDR
 * succeeded; the guarding condition is not visible here. */
2111 netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
2112 struct in_addr netmask)
2114 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2117 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
2119 netdev->cache_valid |= VALID_IN4;
2120 netdev->address = address;
2121 netdev->netmask = netmask;
2122 if (address.s_addr != INADDR_ANY) {
2123 error = do_set_addr(netdev_, SIOCSIFNETMASK,
2124 "SIOCSIFNETMASK", netmask);
/* Parses one line in the format read from /proc/net/if_inet6 by
 * netdev_linux_get_in6(): sixteen two-digit hex bytes stored into
 * in6->s6_addr, four hex fields that are skipped, and an interface name of at
 * most 16 characters stored into 'ifname'.
 *
 * NOTE(review): the return expression is not visible here; the caller treats
 * the result as a boolean "parsed successfully". */
2131 parse_if_inet6_line(const char *line,
2132 struct in6_addr *in6, char ifname[16 + 1])
2134 uint8_t *s6 = in6->s6_addr;
2135 #define X8 "%2"SCNx8
2137 " "X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8
2138 "%*x %*x %*x %*x %16s\n",
2139 &s6[0], &s6[1], &s6[2], &s6[3],
2140 &s6[4], &s6[5], &s6[6], &s6[7],
2141 &s6[8], &s6[9], &s6[10], &s6[11],
2142 &s6[12], &s6[13], &s6[14], &s6[15],
2146 /* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address (if
2147 * 'in6' is non-null) and returns true. Otherwise, returns false. */
2149 netdev_linux_get_in6(const struct netdev *netdev_, struct in6_addr *in6)
2151 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
/* The address is looked up only when VALID_IN6 is not yet set; the result,
 * including "no address" (in6addr_any), is cached in netdev->in6. */
2152 if (!(netdev->cache_valid & VALID_IN6)) {
2156 netdev->in6 = in6addr_any;
2158 file = fopen("/proc/net/if_inet6", "r");
2160 const char *name = netdev_get_name(netdev_);
/* Take the address from a parsable line whose interface name matches. */
2161 while (fgets(line, sizeof line, file)) {
2162 struct in6_addr in6_tmp;
2163 char ifname[16 + 1];
2164 if (parse_if_inet6_line(line, &in6_tmp, ifname)
2165 && !strcmp(name, ifname))
2167 netdev->in6 = in6_tmp;
2173 netdev->cache_valid |= VALID_IN6;
/* Fills '*sa' with an AF_INET socket address for 'addr': a zeroed
 * struct sockaddr_in is built locally and then copied over the zeroed '*sa'. */
2180 make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
2182 struct sockaddr_in sin;
2183 memset(&sin, 0, sizeof sin);
2184 sin.sin_family = AF_INET;
2185 sin.sin_addr = addr;
2188 memset(sa, 0, sizeof *sa);
2189 memcpy(sa, &sin, sizeof sin);
/* Issues address-setting ioctl 'ioctl_nr' (called 'ioctl_name' in log
 * messages) on 'netdev' with IPv4 address 'addr': the request carries the
 * device name and an AF_INET sockaddr built by make_in4_sockaddr().  Returns
 * the result of netdev_linux_do_ioctl(). */
2193 do_set_addr(struct netdev *netdev,
2194 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
2197 ovs_strzcpy(ifr.ifr_name, netdev_get_name(netdev), sizeof ifr.ifr_name);
2198 make_in4_sockaddr(&ifr.ifr_addr, addr);
2200 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
2204 /* Adds 'router' as a default IP gateway. */
2206 netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
2208 struct in_addr any = { INADDR_ANY };
/* Route entry: destination and genmask INADDR_ANY, gateway 'router', flagged
 * up and as a gateway route. */
2212 memset(&rt, 0, sizeof rt);
2213 make_in4_sockaddr(&rt.rt_dst, any);
2214 make_in4_sockaddr(&rt.rt_gateway, router);
2215 make_in4_sockaddr(&rt.rt_genmask, any);
2216 rt.rt_flags = RTF_UP | RTF_GATEWAY;
/* The route is installed through the shared AF_INET socket; 'netdev' itself
 * is unused. */
2217 error = ioctl(af_inet_sock, SIOCADDRT, &rt) < 0 ? errno : 0;
2219 VLOG_WARN("ioctl(SIOCADDRT): %s", ovs_strerror(error));
/* Scans /proc/net/route for a route that is up and whose destination matches
 * 'host' under the route's own mask.
 *
 * On a match, '*next_hop' is set (to 0 when the host is directly reachable,
 * otherwise to the route's gateway) and '*netdev_name' is set to an
 * xstrdup()'d copy of the interface name.  '*netdev_name' is set to NULL up
 * front.  NOTE(review): the return values and whether scanning stops at the
 * first match are not visible here. */
2225 netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
2228 static const char fn[] = "/proc/net/route";
2233 *netdev_name = NULL;
2234 stream = fopen(fn, "r");
2235 if (stream == NULL) {
2236 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, ovs_strerror(errno));
2241 while (fgets(line, sizeof line, stream)) {
2244 ovs_be32 dest, gateway, mask;
2245 int refcnt, metric, mtu;
2246 unsigned int flags, use, window, irtt;
/* A route line must yield all 11 fields; otherwise it is reported as
 * unparsable. */
2249 "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
2251 iface, &dest, &gateway, &flags, &refcnt,
2252 &use, &metric, &mask, &mtu, &window, &irtt) != 11) {
2254 VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
2258 if (!(flags & RTF_UP)) {
2259 /* Skip routes that aren't up. */
2263 /* The output of 'dest', 'mask', and 'gateway' were given in
2264 * network byte order, so we don't need any endian
2265 * conversions here. */
2266 if ((dest & mask) == (host->s_addr & mask)) {
2268 /* The host is directly reachable. */
2269 next_hop->s_addr = 0;
2271 /* To reach the host, we must go through a gateway. */
2272 next_hop->s_addr = gateway;
2274 *netdev_name = xstrdup(iface);
/* Adds "driver_name", "driver_version" and "firmware_version" to 'smap' from
 * the ethtool driver info cached in netdev->drvinfo.  Unless VALID_DRVINFO is
 * set, the info is first fetched with an ETHTOOL_GDRVINFO request. */
2286 netdev_linux_get_status(const struct netdev *netdev_, struct smap *smap)
2288 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2291 if (!(netdev->cache_valid & VALID_DRVINFO)) {
/* netdev_linux_do_ethtool() is handed the drvinfo buffer through a
 * struct ethtool_cmd pointer. */
2292 struct ethtool_cmd *cmd = (struct ethtool_cmd *) &netdev->drvinfo;
2294 COVERAGE_INC(netdev_get_ethtool);
2295 memset(&netdev->drvinfo, 0, sizeof netdev->drvinfo);
2296 error = netdev_linux_do_ethtool(netdev->up.name,
2299 "ETHTOOL_GDRVINFO");
2301 netdev->cache_valid |= VALID_DRVINFO;
2306 smap_add(smap, "driver_name", netdev->drvinfo.driver);
2307 smap_add(smap, "driver_version", netdev->drvinfo.version);
2308 smap_add(smap, "firmware_version", netdev->drvinfo.fw_version);
/* Reports a fixed "driver_name" of "openvswitch" in 'smap'; 'netdev' is
 * unused. */
2314 netdev_internal_get_status(const struct netdev *netdev OVS_UNUSED,
2317 smap_add(smap, "driver_name", "openvswitch");
2321 /* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
2322 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
2323 * returns 0. Otherwise, it returns a positive errno value; in particular,
2324 * ENXIO indicates that there is no ARP table entry for 'ip' on 'netdev'. */
2326 netdev_linux_arp_lookup(const struct netdev *netdev,
2327 ovs_be32 ip, uint8_t mac[ETH_ADDR_LEN])
2330 struct sockaddr_in sin;
/* Build the SIOCGARP request: protocol address 'ip', hardware type Ethernet,
 * restricted to this device by name. */
2333 memset(&r, 0, sizeof r);
2334 memset(&sin, 0, sizeof sin);
2335 sin.sin_family = AF_INET;
2336 sin.sin_addr.s_addr = ip;
2338 memcpy(&r.arp_pa, &sin, sizeof sin);
2339 r.arp_ha.sa_family = ARPHRD_ETHER;
2341 ovs_strzcpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
2342 COVERAGE_INC(netdev_arp_lookup);
2343 retval = ioctl(af_inet_sock, SIOCGARP, &r) < 0 ? errno : 0;
2345 memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
/* ENXIO (no entry) is not logged; other failures get a rate-limited warning. */
2346 } else if (retval != ENXIO) {
2347 VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
2348 netdev_get_name(netdev), IP_ARGS(ip),
2349 ovs_strerror(retval));
/* Translates the NETDEV_UP and NETDEV_PROMISC bits of 'nd' into kernel
 * interface flags.  NOTE(review): the assignments are not visible here;
 * presumably they set IFF_UP and IFF_PROMISC, mirroring iff_to_nd_flags(). */
2355 nd_to_iff_flags(enum netdev_flags nd)
2358 if (nd & NETDEV_UP) {
2361 if (nd & NETDEV_PROMISC) {
/* Translates kernel interface flags 'iff' into NETDEV_* flags, starting from
 * 0; IFF_PROMISC maps to NETDEV_PROMISC.  NOTE(review): other mappings, if
 * any, are not visible here. */
2368 iff_to_nd_flags(int iff)
2370 enum netdev_flags nd = 0;
2374 if (iff & IFF_PROMISC) {
2375 nd |= NETDEV_PROMISC;
/* Clears the flags in 'off' and sets the flags in 'on' on 'netdev_', storing
 * the previous flags (as NETDEV_* bits) in '*old_flagsp'.
 *
 * The new value is computed from the cached ifi_flags.  The kernel is touched
 * only when that value differs from the cached one, after which the cached
 * flags are re-read with get_flags(). */
2381 netdev_linux_update_flags(struct netdev *netdev_, enum netdev_flags off,
2382 enum netdev_flags on, enum netdev_flags *old_flagsp)
2384 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2385 int old_flags, new_flags;
2388 old_flags = netdev->ifi_flags;
2389 *old_flagsp = iff_to_nd_flags(old_flags);
2390 new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
2391 if (new_flags != old_flags) {
2392 error = set_flags(netdev_get_name(netdev_), new_flags);
2393 get_flags(netdev_, &netdev->ifi_flags);
/* Returns the change_seq member of the netdev_linux that backs 'netdev'. */
2399 netdev_linux_change_seq(const struct netdev *netdev)
2401 return netdev_linux_cast(netdev)->change_seq;
/* Expands to the member list shared by the netdev classes defined below.
 * Callers supply the per-class pieces (NAME, CREATE, GET_STATS, SET_STATS,
 * GET_FEATURES, GET_STATUS); every other slot uses the common netdev_linux_*
 * implementation, or NULL where noted.  NOTE(review): the order of entries
 * presumably has to match the member order of struct netdev_class -- confirm
 * against netdev-provider.h before adding or moving an entry. */
2404 #define NETDEV_LINUX_CLASS(NAME, CREATE, GET_STATS, SET_STATS, \
2405 GET_FEATURES, GET_STATUS) \
2409 netdev_linux_init, \
2411 netdev_linux_wait, \
2414 netdev_linux_destroy, \
2415 NULL, /* get_config */ \
2416 NULL, /* set_config */ \
2417 NULL, /* get_tunnel_config */ \
2419 netdev_linux_rx_open, \
2421 netdev_linux_send, \
2422 netdev_linux_send_wait, \
2424 netdev_linux_set_etheraddr, \
2425 netdev_linux_get_etheraddr, \
2426 netdev_linux_get_mtu, \
2427 netdev_linux_set_mtu, \
2428 netdev_linux_get_ifindex, \
2429 netdev_linux_get_carrier, \
2430 netdev_linux_get_carrier_resets, \
2431 netdev_linux_set_miimon_interval, \
2436 netdev_linux_set_advertisements, \
2438 netdev_linux_set_policing, \
2439 netdev_linux_get_qos_types, \
2440 netdev_linux_get_qos_capabilities, \
2441 netdev_linux_get_qos, \
2442 netdev_linux_set_qos, \
2443 netdev_linux_get_queue, \
2444 netdev_linux_set_queue, \
2445 netdev_linux_delete_queue, \
2446 netdev_linux_get_queue_stats, \
2447 netdev_linux_dump_queues, \
2448 netdev_linux_dump_queue_stats, \
2450 netdev_linux_get_in4, \
2451 netdev_linux_set_in4, \
2452 netdev_linux_get_in6, \
2453 netdev_linux_add_router, \
2454 netdev_linux_get_next_hop, \
2456 netdev_linux_arp_lookup, \
2458 netdev_linux_update_flags, \
2460 netdev_linux_change_seq \
/* Netdev class created with netdev_linux_create(): statistics come from
 * netdev_linux_get_stats(), setting statistics is not supported, and
 * features and status are provided by the netdev_linux_* implementations. */
2463 const struct netdev_class netdev_linux_class =
2466 netdev_linux_create,
2467 netdev_linux_get_stats,
2468 NULL, /* set_stats */
2469 netdev_linux_get_features,
2470 netdev_linux_get_status);
/* Netdev class created with netdev_linux_create_tap(): statistics come from
 * netdev_tap_get_stats(), setting statistics is not supported, and features
 * and status are provided by the netdev_linux_* implementations. */
2472 const struct netdev_class netdev_tap_class =
2475 netdev_linux_create_tap,
2476 netdev_tap_get_stats,
2477 NULL, /* set_stats */
2478 netdev_linux_get_features,
2479 netdev_linux_get_status);
/* Netdev class created with netdev_linux_create() whose statistics are read
 * and written through the vport layer (netdev_internal_get_stats() and
 * netdev_internal_set_stats()).  It has no get_features implementation and
 * reports its status through netdev_internal_get_status(). */
2481 const struct netdev_class netdev_internal_class =
2484 netdev_linux_create,
2485 netdev_internal_get_stats,
2486 netdev_internal_set_stats,
2487 NULL, /* get_features */
2488 netdev_internal_get_status);
/* Receive-side callbacks for Linux netdevs, in initializer order: destroy,
 * recv, wait, drain. */
2490 static const struct netdev_rx_class netdev_rx_linux_class = {
2491 netdev_rx_linux_destroy,
2492 netdev_rx_linux_recv,
2493 netdev_rx_linux_wait,
2494 netdev_rx_linux_drain,
2497 /* HTB traffic control class. */
/* NOTE(review): presumably the number of queues HTB offers (the n_queues
 * value of its tc_ops) -- confirm where the HTB tc_ops is defined. */
2499 #define HTB_N_QUEUES 0xf000
/* Qdisc-level state: htb_install__() stores its 'max_rate' argument in a
 * field of this name. */
2503 unsigned int max_rate; /* In bytes/s. */
/* Per-class (per-queue) parameters, the ones htb_setup_class__() reads from
 * its 'struct htb_class' argument. */
2507 struct tc_queue tc_queue;
2508 unsigned int min_rate; /* In bytes/s. */
2509 unsigned int max_rate; /* In bytes/s. */
2510 unsigned int burst; /* In bytes. */
2511 unsigned int priority; /* Lower values are higher priorities. */
/* Returns the 'struct htb' that embeds netdev->tc as its 'tc' member, via
 * CONTAINER_OF.  Only meaningful when the netdev's current tc really is an
 * HTB instance; nothing here checks that. */
2515 htb_get__(const struct netdev *netdev_)
2517 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2518 return CONTAINER_OF(netdev->tc, struct htb, tc);
/* Allocates a 'struct htb', initializes its embedded tc with 'tc_ops_htb',
 * records 'max_rate', and makes it the netdev's current tc.
 *
 * NOTE(review): 'max_rate' is uint64_t here, while the 'max_rate' field shown
 * with the HTB definitions is 'unsigned int', so larger values would be
 * narrowed by the assignment -- confirm whether callers clamp it. */
2522 htb_install__(struct netdev *netdev_, uint64_t max_rate)
2524 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2527 htb = xmalloc(sizeof *htb);
2528 tc_init(&htb->tc, &tc_ops_htb);
2529 htb->max_rate = max_rate;
2531 netdev->tc = &htb->tc;
2534 /* Create an HTB qdisc.
2536 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
2538 htb_setup_qdisc__(struct netdev *netdev)
2541 struct tc_htb_glob opt;
2542 struct ofpbuf request;
2543 struct tcmsg *tcmsg;
/* Any existing qdisc is removed first; the result of that removal is not
 * checked. */
2545 tc_del_qdisc(netdev);
2547 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2548 NLM_F_EXCL | NLM_F_CREATE, &request);
/* Handle 1:0 at the root, matching the "root handle 1:" of the tc command. */
2552 tcmsg->tcm_handle = tc_make_handle(1, 0);
2553 tcmsg->tcm_parent = TC_H_ROOT;
2555 nl_msg_put_string(&request, TCA_KIND, "htb");
2557 memset(&opt, 0, sizeof opt);
2558 opt.rate2quantum = 10;
/* The global HTB options travel as TCA_HTB_INIT nested inside TCA_OPTIONS. */
2562 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2563 nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
2564 nl_msg_end_nested(&request, opt_offset);
2566 return tc_transact(&request, NULL);
2569 /* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
2570 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
/* 'handle' and 'parent' are complete tc handles (callers build them with
 * tc_make_handle()); 'class' supplies min_rate, max_rate, burst and priority.
 * The device MTU is read with netdev_get_mtu() because tc_fill_rate() and
 * tc_calc_buffer() take it as an argument; a rate-limited warning is logged
 * when the MTU is unavailable.  The request is RTM_NEWTCLASS with
 * NLM_F_CREATE, and a rate-limited warning with all parameters is logged when
 * the transaction fails.
 * NOTE(review): the error checks and return statements are elided from this
 * listing. */
2572 htb_setup_class__(struct netdev *netdev, unsigned int handle,
2573 unsigned int parent, struct htb_class *class)
2576 struct tc_htb_opt opt;
2577 struct ofpbuf request;
2578 struct tcmsg *tcmsg;
2582 error = netdev_get_mtu(netdev, &mtu);
2584 VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU",
2585 netdev_get_name(netdev));
2589 memset(&opt, 0, sizeof opt);
/* 'rate' carries the guaranteed rate (min_rate), 'ceil' the cap (max_rate). */
2590 tc_fill_rate(&opt.rate, class->min_rate, mtu);
2591 tc_fill_rate(&opt.ceil, class->max_rate, mtu);
/* Both buffers are derived from the same configured burst, one per rate. */
2592 opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
2593 opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
2594 opt.prio = class->priority;
2596 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
2600 tcmsg->tcm_handle = handle;
2601 tcmsg->tcm_parent = parent;
2603 nl_msg_put_string(&request, TCA_KIND, "htb");
2604 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2605 nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
2606 tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
2607 tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
2608 nl_msg_end_nested(&request, opt_offset);
2610 error = tc_transact(&request, NULL);
2612 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
2613 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
2614 netdev_get_name(netdev),
2615 tc_get_major(handle), tc_get_minor(handle),
2616 tc_get_major(parent), tc_get_minor(parent),
2617 class->min_rate, class->max_rate,
2618 class->burst, class->priority, ovs_strerror(error));
2623 /* Parses Netlink attributes in 'nl_options' for HTB parameters and stores a
2624 * description of them into '*class'. The description complies with the
2625 * specification given in the vswitch database documentation for linux-htb
/* Extracts HTB class parameters from the nested attributes in 'nl_options'
 * into '*class'.  TCA_HTB_PARMS is mandatory and must be at least
 * sizeof(struct tc_htb_opt) bytes; a rate-limited warning is logged when the
 * nested parse fails.  On success min_rate and max_rate are taken from the
 * kernel's 'rate' and 'ceil' ratespecs, burst is converted from the kernel's
 * 'buffer' ticks back to bytes with tc_ticks_to_bytes(), and priority is
 * copied from 'prio'.
 * NOTE(review): the return statements are elided from this listing. */
2628 htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
2630 static const struct nl_policy tca_htb_policy[] = {
2631 [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
2632 .min_len = sizeof(struct tc_htb_opt) },
2635 struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
2636 const struct tc_htb_opt *htb;
2638 if (!nl_parse_nested(nl_options, tca_htb_policy,
2639 attrs, ARRAY_SIZE(tca_htb_policy))) {
2640 VLOG_WARN_RL(&rl, "failed to parse HTB class options");
2644 htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
2645 class->min_rate = htb->rate.rate;
2646 class->max_rate = htb->ceil.rate;
2647 class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
2648 class->priority = htb->prio;
/* Parses the class message 'tcmsg' with tc_parse_class(), which also fills
 * '*stats'.  When that succeeds and 'queue_id' is nonnull, a handle of the
 * form 1:N with 1 <= N <= HTB_N_QUEUES yields queue ID N - 1.  When there is
 * still no error and 'options' is nonnull, '*options' is filled by
 * htb_parse_tca_options__().
 * NOTE(review): the branch taken for handles outside that range and the
 * return statement are elided from this listing. */
2653 htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2654 struct htb_class *options,
2655 struct netdev_queue_stats *stats)
2657 struct nlattr *nl_options;
2658 unsigned int handle;
2661 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2662 if (!error && queue_id) {
2663 unsigned int major = tc_get_major(handle);
2664 unsigned int minor = tc_get_minor(handle);
2665 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2666 *queue_id = minor - 1;
2671 if (!error && options) {
2672 error = htb_parse_tca_options__(nl_options, options);
/* Derives the qdisc-level HTB parameters from 'details' into '*hc'.
 *
 * "max-rate" is read as a decimal number of bits per second and stored in
 * bytes per second (divided by 8).  When the key is absent or the result is
 * zero, the rate is taken from the device's current link features instead,
 * with netdev_features_to_bps() given 100 * 1000 * 1000 as its second
 * argument (presumably the fallback speed -- confirm).  min_rate is then set
 * equal to max_rate.
 *
 * NOTE(review): strtoull() yields a 64-bit value but hc->max_rate is declared
 * unsigned int, so rates above UINT_MAX bytes/s are narrowed on assignment.
 * NOTE(review): lines elided from this listing may set further members. */
2678 htb_parse_qdisc_details__(struct netdev *netdev,
2679 const struct smap *details, struct htb_class *hc)
2681 const char *max_rate_s;
2683 max_rate_s = smap_get(details, "max-rate");
2684 hc->max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
2685 if (!hc->max_rate) {
2686 enum netdev_features current;
/* Pass the address of 'current'; only the first output is wanted here. */
2688 netdev_get_features(netdev, &current, NULL, NULL, NULL);
2689 hc->max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
2691 hc->min_rate = hc->max_rate;
/* Fills '*hc' from the "min-rate", "max-rate", "burst" and "priority" keys of
 * 'details'.  Rates and burst are configured in bits and stored in bytes
 * (divided by 8); a missing min-rate, burst or priority key is treated as 0
 * before clamping.
 *
 * Clamping, in the order applied: min_rate is raised to at least the MTU and
 * then capped at the qdisc's max_rate (so the cap wins if it is below the
 * MTU); max_rate is raised to at least min_rate and then capped at the qdisc's
 * max_rate; burst is raised to at least mtu + 64.
 *
 * A rate-limited warning is logged when the MTU cannot be read.
 * NOTE(review): the error returns and the default used for a missing
 * "max-rate" key are on lines elided from this listing. */
2697 htb_parse_class_details__(struct netdev *netdev,
2698 const struct smap *details, struct htb_class *hc)
2700 const struct htb *htb = htb_get__(netdev);
2701 const char *min_rate_s = smap_get(details, "min-rate");
2702 const char *max_rate_s = smap_get(details, "max-rate");
2703 const char *burst_s = smap_get(details, "burst");
2704 const char *priority_s = smap_get(details, "priority");
2707 error = netdev_get_mtu(netdev, &mtu);
2709 VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU",
2710 netdev_get_name(netdev));
2714 /* HTB requires at least an mtu sized min-rate to send any traffic even
2715 * on uncongested links. */
2716 hc->min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
2717 hc->min_rate = MAX(hc->min_rate, mtu);
2718 hc->min_rate = MIN(hc->min_rate, htb->max_rate);
2721 hc->max_rate = (max_rate_s
2722 ? strtoull(max_rate_s, NULL, 10) / 8
2724 hc->max_rate = MAX(hc->max_rate, hc->min_rate);
2725 hc->max_rate = MIN(hc->max_rate, htb->max_rate);
2729 * According to hints in the documentation that I've read, it is important
2730 * that 'burst' be at least as big as the largest frame that might be
2731 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
2732 * but having it a bit too small is a problem. Since netdev_get_mtu()
2733 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
2734 * the MTU. We actually add 64, instead of 14, as a guard against
2735 * additional headers get tacked on somewhere that we're not aware of. */
2736 hc->burst = burst_s ? strtoull(burst_s, NULL, 10) / 8 : 0;
2737 hc->burst = MAX(hc->burst, mtu + 64);
2740 hc->priority = priority_s ? strtoul(priority_s, NULL, 10) : 0;
/* Queries the kernel for the class 'handle' under 'parent' on 'netdev' with
 * tc_query_class(), parses the reply with htb_parse_tcmsg__() (no queue ID is
 * requested) into '*options' and '*stats', and deletes the reply buffer.
 * NOTE(review): the check on tc_query_class()'s result and the return
 * statement are elided from this listing. */
2746 htb_query_class__(const struct netdev *netdev, unsigned int handle,
2747 unsigned int parent, struct htb_class *options,
2748 struct netdev_queue_stats *stats)
2750 struct ofpbuf *reply;
2753 error = tc_query_class(netdev, handle, parent, &reply);
2755 error = htb_parse_tcmsg__(reply, NULL, options, stats);
2756 ofpbuf_delete(reply);
/* Installs HTB on 'netdev': creates the qdisc with htb_setup_qdisc__(),
 * derives the qdisc-level rates from 'details', creates class 1:fffe under
 * 1:0 with those rates via htb_setup_class__(), and finally records the state
 * with htb_install__() using hc.max_rate.
 * NOTE(review): the conditions guarding each step and the return statement
 * are elided from this listing. */
2762 htb_tc_install(struct netdev *netdev, const struct smap *details)
2766 error = htb_setup_qdisc__(netdev);
2768 struct htb_class hc;
2770 htb_parse_qdisc_details__(netdev, details, &hc);
2771 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2772 tc_make_handle(1, 0), &hc);
2774 htb_install__(netdev, hc.max_rate);
/* Returns the struct htb_class that contains 'queue' as its 'tc_queue'
 * member.  Only meaningful for queues that were created as part of a
 * struct htb_class, as htb_update_queue__() does. */
2780 static struct htb_class *
2781 htb_class_cast__(const struct tc_queue *queue)
2783 return CONTAINER_OF(queue, struct htb_class, tc_queue);
/* Records the parameters in '*hc' for queue 'queue_id' in 'netdev''s HTB
 * state.  The queue is looked up with tc_find_queue__(); the visible lines
 * show both reuse of an existing entry (htb_class_cast__) and creation of a
 * new one, which is allocated with xmalloc(), stamped with time_msec() as its
 * creation time, and inserted into htb->tc.queues under hash_int(queue_id, 0).
 * In either case the four parameters are then copied from '*hc'.
 * NOTE(review): the condition that selects between reuse and creation is
 * elided from this listing. */
2787 htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
2788 const struct htb_class *hc)
2790 struct htb *htb = htb_get__(netdev);
2791 size_t hash = hash_int(queue_id, 0);
2792 struct tc_queue *queue;
2793 struct htb_class *hcp;
2795 queue = tc_find_queue__(netdev, queue_id, hash);
2797 hcp = htb_class_cast__(queue);
2799 hcp = xmalloc(sizeof *hcp);
2800 queue = &hcp->tc_queue;
2801 queue->queue_id = queue_id;
2802 queue->created = time_msec();
2803 hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
2806 hcp->min_rate = hc->min_rate;
2807 hcp->max_rate = hc->max_rate;
2808 hcp->burst = hc->burst;
2809 hcp->priority = hc->priority;
/* Rebuilds the in-memory HTB state for 'netdev' from the kernel.  'nlmsg' is
 * unused.  First queries class 1:fffe for the qdisc-level max_rate and
 * installs a struct htb with it, then dumps the device's classes and records
 * every one that htb_parse_tcmsg__() accepts via htb_update_queue__().
 * NOTE(review): the result of htb_query_class__() is not checked on the
 * visible lines; confirm that hc.max_rate is initialized on an elided line
 * before it is passed to htb_install__().
 * NOTE(review): the return statements are elided from this listing. */
2813 htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
2816 struct nl_dump dump;
2817 struct htb_class hc;
2819 /* Get qdisc options. */
2821 htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
2822 htb_install__(netdev, hc.max_rate);
2825 if (!start_queue_dump(netdev, &dump)) {
2828 while (nl_dump_next(&dump, &msg)) {
2829 unsigned int queue_id;
2831 if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
2832 htb_update_queue__(netdev, queue_id, &hc);
2835 nl_dump_done(&dump);
/* Tears down the HTB state that embeds 'tc': walks htb->tc.queues with the
 * _SAFE iterator (entries are removed during the walk) and unlinks each
 * struct htb_class from the map.
 * NOTE(review): whatever releases each 'hc' and the struct htb itself is on
 * lines elided from this listing. */
2841 htb_tc_destroy(struct tc *tc)
2843 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
2844 struct htb_class *hc, *next;
2846 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &htb->tc.queues) {
2847 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
/* Reports the qdisc-level configuration of 'netdev' into 'details': adds
 * "max-rate" as 8 * htb->max_rate, i.e. the stored bytes/s value expressed in
 * bits/s.  The multiplication is done in unsigned long long (8ULL) to match
 * the "%llu" format. */
2855 htb_qdisc_get(const struct netdev *netdev, struct smap *details)
2857 const struct htb *htb = htb_get__(netdev);
2858 smap_add_format(details, "max-rate", "%llu", 8ULL * htb->max_rate);
/* Applies new qdisc-level settings from 'details' to 'netdev': re-derives the
 * rates with htb_parse_qdisc_details__(), reconfigures class 1:fffe under 1:0
 * through htb_setup_class__(), and stores the new max_rate in the struct htb.
 * NOTE(review): the condition guarding the max_rate update and the return
 * statement are elided from this listing. */
2863 htb_qdisc_set(struct netdev *netdev, const struct smap *details)
2865 struct htb_class hc;
2868 htb_parse_qdisc_details__(netdev, details, &hc);
2869 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2870 tc_make_handle(1, 0), &hc);
2872 htb_get__(netdev)->max_rate = hc.max_rate;
/* Reports the settings of 'queue' into 'details'.  'netdev' is unused.
 * The visible calls add "min-rate", "max-rate" (inside the check that it
 * differs from min_rate), "burst" and "priority".  Rates and burst are
 * multiplied by 8 to convert the stored byte values to bits.
 * NOTE(review): closing braces and possibly further conditions are elided
 * from this listing, so which of the later calls are conditional cannot be
 * confirmed from here. */
2878 htb_class_get(const struct netdev *netdev OVS_UNUSED,
2879 const struct tc_queue *queue, struct smap *details)
2881 const struct htb_class *hc = htb_class_cast__(queue);
2883 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
2884 if (hc->min_rate != hc->max_rate) {
2885 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
2887 smap_add_format(details, "burst", "%llu", 8ULL * hc->burst);
2889 smap_add_format(details, "priority", "%u", hc->priority);
/* Creates or reconfigures queue 'queue_id' on 'netdev' from 'details': parses
 * and clamps the settings with htb_parse_class_details__(), programs kernel
 * class 1:(queue_id + 1) under parent 1:fffe with htb_setup_class__(), and
 * mirrors the result into the in-memory table with htb_update_queue__().
 * NOTE(review): the error checks between the steps and the return statement
 * are elided from this listing. */
2895 htb_class_set(struct netdev *netdev, unsigned int queue_id,
2896 const struct smap *details)
2898 struct htb_class hc;
2901 error = htb_parse_class_details__(netdev, details, &hc);
2906 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
2907 tc_make_handle(1, 0xfffe), &hc);
2912 htb_update_queue__(netdev, queue_id, &hc);
/* Deletes 'queue' from 'netdev': asks the kernel to delete class
 * 1:(queue_id + 1) with tc_delete_class() and unlinks the corresponding
 * struct htb_class from htb->tc.queues.
 * NOTE(review): the condition guarding the removal, the release of 'hc' and
 * the return statement are elided from this listing. */
2917 htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
2919 struct htb_class *hc = htb_class_cast__(queue);
2920 struct htb *htb = htb_get__(netdev);
2923 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
2925 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
/* Fetches statistics for 'queue' into '*stats' by querying kernel class
 * 1:(queue_id + 1) under parent 1:fffe.  No class options are requested
 * (NULL).  Returns htb_query_class__()'s result. */
2932 htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
2933 struct netdev_queue_stats *stats)
2935 return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
2936 tc_make_handle(1, 0xfffe), NULL, stats);
/* Handles one class message 'nlmsg' from a dump.  'netdev' is unused.  The
 * handle and statistics are parsed with tc_parse_class() (options are not
 * requested); for a handle 1:N with 1 <= N <= HTB_N_QUEUES the callback 'cb'
 * is invoked with queue ID N - 1, the statistics and 'aux'.  Other handles do
 * not reach the callback.
 * NOTE(review): the error check and return statements are elided from this
 * listing. */
2940 htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
2941 const struct ofpbuf *nlmsg,
2942 netdev_dump_queue_stats_cb *cb, void *aux)
2944 struct netdev_queue_stats stats;
2945 unsigned int handle, major, minor;
2948 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
2953 major = tc_get_major(handle);
2954 minor = tc_get_minor(handle);
2955 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2956 (*cb)(minor - 1, &stats, aux);
/* Operations table for the HTB implementation: kernel qdisc name "htb",
 * database QoS type "linux-htb", HTB_N_QUEUES queues.  The table is positional,
 * so entry order must match the member order of struct tc_ops.
 * NOTE(review): the entries between n_queues and htb_class_get_stats, and the
 * closing of the initializer, are elided from this listing. */
2961 static const struct tc_ops tc_ops_htb = {
2962 "htb", /* linux_name */
2963 "linux-htb", /* ovs_name */
2964 HTB_N_QUEUES, /* n_queues */
2973 htb_class_get_stats,
2974 htb_class_dump_stats
2977 /* "linux-hfsc" traffic control class. */
/* Largest class minor number treated as an HFSC queue: minors
 * 1..HFSC_N_QUEUES map to queue IDs 0..HFSC_N_QUEUES - 1 (see the range checks
 * in hfsc_parse_tcmsg__() and hfsc_class_dump_stats()).  Also reported as
 * n_queues in tc_ops_hfsc. */
2979 #define HFSC_N_QUEUES 0xf000
/* NOTE(review): the enclosing struct declaration is elided from this listing.
 * Presumably this is the 'tc_queue' member of struct hfsc_class, which
 * hfsc_class_cast__() uses to recover the containing struct -- confirm. */
2987 struct tc_queue tc_queue;
/* Returns the struct hfsc whose 'tc' member is 'netdev_''s current tc
 * (netdev->tc), recovered with CONTAINER_OF.
 * NOTE(review): nothing visible here checks that netdev->tc really belongs to
 * a struct hfsc; presumably callers only reach this for HFSC-configured
 * devices -- confirm. */
2992 static struct hfsc *
2993 hfsc_get__(const struct netdev *netdev_)
2995 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2996 return CONTAINER_OF(netdev->tc, struct hfsc, tc);
/* Returns the struct hfsc_class that contains 'queue' as its 'tc_queue'
 * member.  Only meaningful for queues that were created as part of a
 * struct hfsc_class, as hfsc_update_queue__() does. */
2999 static struct hfsc_class *
3000 hfsc_class_cast__(const struct tc_queue *queue)
3002 return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
/* Allocates a struct hfsc with xmalloc(), initializes its embedded tc with
 * tc_init() and tc_ops_hfsc, records 'max_rate', and makes it 'netdev_''s
 * current tc by storing it in netdev->tc.
 * NOTE(review): any previous netdev->tc is overwritten on the visible lines
 * without being released here. */
3006 hfsc_install__(struct netdev *netdev_, uint32_t max_rate)
3008 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3011 hfsc = xmalloc(sizeof *hfsc);
3012 tc_init(&hfsc->tc, &tc_ops_hfsc);
3013 hfsc->max_rate = max_rate;
3014 netdev->tc = &hfsc->tc;
/* Records the rates in '*hc' for queue 'queue_id' in 'netdev''s HFSC state.
 * The queue is looked up with tc_find_queue__(); the visible lines show both
 * reuse of an existing entry (hfsc_class_cast__) and creation of a new one,
 * which is allocated with xmalloc(), stamped with time_msec() as its creation
 * time, and inserted into hfsc->tc.queues under hash_int(queue_id, 0).  In
 * either case min_rate and max_rate are then copied from '*hc'.
 * NOTE(review): the condition that selects between reuse and creation is
 * elided from this listing. */
3018 hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
3019 const struct hfsc_class *hc)
3023 struct hfsc_class *hcp;
3024 struct tc_queue *queue;
3026 hfsc = hfsc_get__(netdev);
3027 hash = hash_int(queue_id, 0);
3029 queue = tc_find_queue__(netdev, queue_id, hash);
3031 hcp = hfsc_class_cast__(queue);
3033 hcp = xmalloc(sizeof *hcp);
3034 queue = &hcp->tc_queue;
3035 queue->queue_id = queue_id;
3036 queue->created = time_msec();
3037 hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
3040 hcp->min_rate = hc->min_rate;
3041 hcp->max_rate = hc->max_rate;
/* Extracts HFSC class rates from the nested attributes in 'nl_options' into
 * '*class'.  The policy has three NL_A_UNSPEC entries, each at least
 * sizeof(struct tc_service_curve) bytes; the curves are then read from
 * TCA_HFSC_RSC, TCA_HFSC_FSC and TCA_HFSC_USC.
 *
 * Only the configuration shape this file itself programs is accepted; a
 * rate-limited warning is logged when
 *   - the nested parse fails,
 *   - any curve has a nonzero m1 or d (non-linear curve),
 *   - the RSC and FSC m2 values differ, or
 *   - the RSC m2 exceeds the USC m2.
 * Otherwise min_rate is taken from the FSC m2 and max_rate from the USC m2.
 * NOTE(review): the policy index lines and the return statements are elided
 * from this listing. */
3045 hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
3047 const struct tc_service_curve *rsc, *fsc, *usc;
3048 static const struct nl_policy tca_hfsc_policy[] = {
3050 .type = NL_A_UNSPEC,
3052 .min_len = sizeof(struct tc_service_curve),
3055 .type = NL_A_UNSPEC,
3057 .min_len = sizeof(struct tc_service_curve),
3060 .type = NL_A_UNSPEC,
3062 .min_len = sizeof(struct tc_service_curve),
3065 struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];
3067 if (!nl_parse_nested(nl_options, tca_hfsc_policy,
3068 attrs, ARRAY_SIZE(tca_hfsc_policy))) {
3069 VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
3073 rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
3074 fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
3075 usc = nl_attr_get(attrs[TCA_HFSC_USC]);
3077 if (rsc->m1 != 0 || rsc->d != 0 ||
3078 fsc->m1 != 0 || fsc->d != 0 ||
3079 usc->m1 != 0 || usc->d != 0) {
3080 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3081 "Non-linear service curves are not supported.");
3085 if (rsc->m2 != fsc->m2) {
3086 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3087 "Real-time service curves are not supported ");
3091 if (rsc->m2 > usc->m2) {
3092 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3093 "Min-rate service curve is greater than "
3094 "the max-rate service curve.");
3098 class->min_rate = fsc->m2;
3099 class->max_rate = usc->m2;
/* Parses the class message 'tcmsg' with tc_parse_class(), which also fills
 * '*stats'.  A handle of the form 1:N with 1 <= N <= HFSC_N_QUEUES yields
 * queue ID N - 1 in '*queue_id', and '*options' is filled by
 * hfsc_parse_tca_options__().
 * NOTE(review): the conditions guarding the queue-ID and options steps, the
 * branch for out-of-range handles and the return statements are elided from
 * this listing; callers in this section pass NULL for 'queue_id' or 'options',
 * so those steps are presumably skipped for null arguments -- confirm. */
3104 hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
3105 struct hfsc_class *options,
3106 struct netdev_queue_stats *stats)
3109 unsigned int handle;
3110 struct nlattr *nl_options;
3112 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
3118 unsigned int major, minor;
3120 major = tc_get_major(handle);
3121 minor = tc_get_minor(handle);
3122 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3123 *queue_id = minor - 1;
3130 error = hfsc_parse_tca_options__(nl_options, options);
/* Queries the kernel for the class 'handle' under 'parent' on 'netdev' with
 * tc_query_class(), parses the reply with hfsc_parse_tcmsg__() (no queue ID
 * is requested) into '*options' and '*stats', and deletes the reply buffer.
 * NOTE(review): the check on tc_query_class()'s result and the return
 * statement are elided from this listing. */
3137 hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
3138 unsigned int parent, struct hfsc_class *options,
3139 struct netdev_queue_stats *stats)
3142 struct ofpbuf *reply;
3144 error = tc_query_class(netdev, handle, parent, &reply);
3149 error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
3150 ofpbuf_delete(reply);
/* Derives the qdisc-level HFSC rates from 'details' into '*class'.
 *
 * "max-rate" is read as a decimal number of bits per second and converted to
 * bytes per second (divided by 8).  The visible fallback takes the rate from
 * the device's current link features instead, with netdev_features_to_bps()
 * given 100 * 1000 * 1000 as its second argument (presumably the fallback
 * speed -- confirm).  Both min_rate and max_rate are set to the result.
 *
 * NOTE(review): the declaration of 'max_rate' and the condition that selects
 * the fallback are elided from this listing. */
3155 hfsc_parse_qdisc_details__(struct netdev *netdev, const struct smap *details,
3156 struct hfsc_class *class)
3159 const char *max_rate_s;
3161 max_rate_s = smap_get(details, "max-rate");
3162 max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
3165 enum netdev_features current;
/* Pass the address of 'current'; only the first output is wanted here. */
3167 netdev_get_features(netdev, &current, NULL, NULL, NULL);
3168 max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
3171 class->min_rate = max_rate;
3172 class->max_rate = max_rate;
/* Fills '*class' from the "min-rate" and "max-rate" keys of 'details'.  Rates
 * are configured in bits/s and stored in bytes/s (divided by 8).
 *
 * Clamping, in the order applied: min_rate (0 when the key is missing) is
 * raised to at least 1 and then capped at the qdisc's max_rate; max_rate is
 * raised to at least min_rate and then capped at the qdisc's max_rate.
 *
 * NOTE(review): both locals are uint32_t, so the 64-bit strtoull() results
 * are narrowed on assignment.
 * NOTE(review): the default used for a missing "max-rate" key and the return
 * statement are on lines elided from this listing. */
3176 hfsc_parse_class_details__(struct netdev *netdev,
3177 const struct smap *details,
3178 struct hfsc_class * class)
3180 const struct hfsc *hfsc;
3181 uint32_t min_rate, max_rate;
3182 const char *min_rate_s, *max_rate_s;
3184 hfsc = hfsc_get__(netdev);
3185 min_rate_s = smap_get(details, "min-rate");
3186 max_rate_s = smap_get(details, "max-rate");
3188 min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
3189 min_rate = MAX(min_rate, 1);
3190 min_rate = MIN(min_rate, hfsc->max_rate);
3192 max_rate = (max_rate_s
3193 ? strtoull(max_rate_s, NULL, 10) / 8
3195 max_rate = MAX(max_rate, min_rate);
3196 max_rate = MIN(max_rate, hfsc->max_rate);
3198 class->min_rate = min_rate;
3199 class->max_rate = max_rate;
3204 /* Create an HFSC qdisc.
3206 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
/* Steps visible here: tc_del_qdisc() is called on 'netdev' first, then an
 * RTM_NEWQDISC request with NLM_F_EXCL | NLM_F_CREATE is built for handle 1:0
 * under TC_H_ROOT with kind "hfsc".  Unlike the HTB variant, the
 * struct tc_hfsc_qopt is sent directly as TCA_OPTIONS rather than nested.
 * Returns whatever tc_transact() returns.
 * NOTE(review): lines elided from this listing sit between the memset() and
 * the attribute writes, so members of 'opt' may be set there. */
3208 hfsc_setup_qdisc__(struct netdev * netdev)
3210 struct tcmsg *tcmsg;
3211 struct ofpbuf request;
3212 struct tc_hfsc_qopt opt;
3214 tc_del_qdisc(netdev);
3216 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3217 NLM_F_EXCL | NLM_F_CREATE, &request);
3223 tcmsg->tcm_handle = tc_make_handle(1, 0);
3224 tcmsg->tcm_parent = TC_H_ROOT;
3226 memset(&opt, 0, sizeof opt);
3229 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3230 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
3232 return tc_transact(&request, NULL);
3235 /* Create an HFSC class.
3237 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
3238 * sc rate <min_rate> ul rate <max_rate>" */
/* 'handle' and 'parent' are complete tc handles (callers build them with
 * tc_make_handle()).  The request is RTM_NEWTCLASS with NLM_F_CREATE.  The
 * same curve 'min' (m2 = class->min_rate) is sent as both TCA_HFSC_RSC and
 * TCA_HFSC_FSC, and 'max' (m2 = class->max_rate) as TCA_HFSC_USC, which is
 * the shape hfsc_parse_tca_options__() accepts when reading classes back.  A
 * rate-limited warning is logged when the transaction fails.
 * NOTE(review): only the m2 assignments are visible; confirm that the m1 and
 * d members of 'min' and 'max' are initialized on the elided lines before the
 * structs are copied into the request. */
3240 hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
3241 unsigned int parent, struct hfsc_class *class)
3245 struct tcmsg *tcmsg;
3246 struct ofpbuf request;
3247 struct tc_service_curve min, max;
3249 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
3255 tcmsg->tcm_handle = handle;
3256 tcmsg->tcm_parent = parent;
3260 min.m2 = class->min_rate;
3264 max.m2 = class->max_rate;
3266 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3267 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3268 nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
3269 nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
3270 nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
3271 nl_msg_end_nested(&request, opt_offset);
3273 error = tc_transact(&request, NULL);
3275 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
3276 "min-rate %ubps, max-rate %ubps (%s)",
3277 netdev_get_name(netdev),
3278 tc_get_major(handle), tc_get_minor(handle),
3279 tc_get_major(parent), tc_get_minor(parent),
3280 class->min_rate, class->max_rate, ovs_strerror(error));
/* Installs HFSC on 'netdev': creates the qdisc with hfsc_setup_qdisc__(),
 * derives the qdisc-level rates from 'details', creates class 1:fffe under
 * 1:0 with those rates via hfsc_setup_class__(), and finally records the
 * state with hfsc_install__() using class.max_rate.
 * NOTE(review): the error checks between the steps and the return statement
 * are elided from this listing. */
3287 hfsc_tc_install(struct netdev *netdev, const struct smap *details)
3290 struct hfsc_class class;
3292 error = hfsc_setup_qdisc__(netdev);
3298 hfsc_parse_qdisc_details__(netdev, details, &class);
3299 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3300 tc_make_handle(1, 0), &class);
3306 hfsc_install__(netdev, class.max_rate);
/* Rebuilds the in-memory HFSC state for 'netdev' from the kernel.  'nlmsg' is
 * unused.  First queries class 1:fffe for the qdisc-level max_rate and
 * installs a struct hfsc with it, then dumps the device's classes and records
 * every one that hfsc_parse_tcmsg__() accepts via hfsc_update_queue__().
 * NOTE(review): the result of hfsc_query_class__() is not checked on the
 * visible lines; confirm that hc.max_rate is initialized on an elided line
 * before it is passed to hfsc_install__().
 * NOTE(review): the return statements are elided from this listing. */
3311 hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3314 struct nl_dump dump;
3315 struct hfsc_class hc;
3318 hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
3319 hfsc_install__(netdev, hc.max_rate);
3321 if (!start_queue_dump(netdev, &dump)) {
3325 while (nl_dump_next(&dump, &msg)) {
3326 unsigned int queue_id;
3328 if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3329 hfsc_update_queue__(netdev, queue_id, &hc);
3333 nl_dump_done(&dump);
/* Tears down the HFSC state that embeds 'tc': walks hfsc->tc.queues with the
 * _SAFE iterator (entries are removed during the walk) and unlinks each
 * struct hfsc_class from the map.
 * NOTE(review): whatever releases each 'hc' and the struct hfsc itself is on
 * lines elided from this listing. */
3338 hfsc_tc_destroy(struct tc *tc)
3341 struct hfsc_class *hc, *next;
3343 hfsc = CONTAINER_OF(tc, struct hfsc, tc);
3345 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
3346 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
/* Reports the qdisc-level configuration of 'netdev' into 'details': adds
 * "max-rate" as 8 * hfsc->max_rate, i.e. the stored bytes/s value expressed
 * in bits/s.  The multiplication is done in unsigned long long (8ULL) to
 * match the "%llu" format. */
3355 hfsc_qdisc_get(const struct netdev *netdev, struct smap *details)
3357 const struct hfsc *hfsc;
3358 hfsc = hfsc_get__(netdev);
3359 smap_add_format(details, "max-rate", "%llu", 8ULL * hfsc->max_rate);
/* Applies new qdisc-level settings from 'details' to 'netdev': re-derives the
 * rates with hfsc_parse_qdisc_details__(), reconfigures class 1:fffe under
 * 1:0 through hfsc_setup_class__(), and stores the new max_rate in the
 * struct hfsc.
 * NOTE(review): the condition guarding the max_rate update and the return
 * statement are elided from this listing. */
3364 hfsc_qdisc_set(struct netdev *netdev, const struct smap *details)
3367 struct hfsc_class class;
3369 hfsc_parse_qdisc_details__(netdev, details, &class);
3370 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3371 tc_make_handle(1, 0), &class);
3374 hfsc_get__(netdev)->max_rate = class.max_rate;
/* Reports the settings of 'queue' into 'details'.  'netdev' is unused.
 * "min-rate" is added first; "max-rate" is added inside the check that it
 * differs from min_rate.  Both are multiplied by 8 to convert the stored
 * bytes/s values to bits/s. */
3381 hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
3382 const struct tc_queue *queue, struct smap *details)
3384 const struct hfsc_class *hc;
3386 hc = hfsc_class_cast__(queue);
3387 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
3388 if (hc->min_rate != hc->max_rate) {
3389 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
/* Creates or reconfigures queue 'queue_id' on 'netdev' from 'details': parses
 * and clamps the rates with hfsc_parse_class_details__(), programs kernel
 * class 1:(queue_id + 1) under parent 1:fffe with hfsc_setup_class__(), and
 * mirrors the result into the in-memory table with hfsc_update_queue__().
 * NOTE(review): the error checks between the steps and the return statement
 * are elided from this listing. */
3395 hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
3396 const struct smap *details)
3399 struct hfsc_class class;
3401 error = hfsc_parse_class_details__(netdev, details, &class);
3406 error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3407 tc_make_handle(1, 0xfffe), &class);
3412 hfsc_update_queue__(netdev, queue_id, &class);
/* Deletes 'queue' from 'netdev': asks the kernel to delete class
 * 1:(queue_id + 1) with tc_delete_class() and unlinks the corresponding
 * struct hfsc_class from hfsc->tc.queues.
 * NOTE(review): the condition guarding the removal, the release of 'hc' and
 * the return statement are elided from this listing. */
3417 hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
3421 struct hfsc_class *hc;
3423 hc = hfsc_class_cast__(queue);
3424 hfsc = hfsc_get__(netdev);
3426 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3428 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
/* Fetches statistics for 'queue' into '*stats' by querying kernel class
 * 1:(queue_id + 1) under parent 1:fffe.  No class options are requested
 * (NULL).  Returns hfsc_query_class__()'s result. */
3435 hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3436 struct netdev_queue_stats *stats)
3438 return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3439 tc_make_handle(1, 0xfffe), NULL, stats);
/* Handles one class message 'nlmsg' from a dump.  'netdev' is unused.  The
 * handle and statistics are parsed with tc_parse_class() (options are not
 * requested); for a handle 1:N with 1 <= N <= HFSC_N_QUEUES the callback 'cb'
 * is invoked with queue ID N - 1, the statistics and 'aux'.  Other handles do
 * not reach the callback.
 * NOTE(review): the error check and return statements are elided from this
 * listing. */
3443 hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3444 const struct ofpbuf *nlmsg,
3445 netdev_dump_queue_stats_cb *cb, void *aux)
3447 struct netdev_queue_stats stats;
3448 unsigned int handle, major, minor;
3451 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3456 major = tc_get_major(handle);
3457 minor = tc_get_minor(handle);
3458 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3459 (*cb)(minor - 1, &stats, aux);
/* Operations table for the HFSC implementation: kernel qdisc name "hfsc",
 * database QoS type "linux-hfsc", HFSC_N_QUEUES queues, followed by one hfsc_*
 * handler for every operation named in the trailing comments.  The table is
 * positional, so entry order must match the member order of struct tc_ops.
 * NOTE(review): the closing of the initializer is elided from this listing. */
3464 static const struct tc_ops tc_ops_hfsc = {
3465 "hfsc", /* linux_name */
3466 "linux-hfsc", /* ovs_name */
3467 HFSC_N_QUEUES, /* n_queues */
3468 hfsc_tc_install, /* tc_install */
3469 hfsc_tc_load, /* tc_load */
3470 hfsc_tc_destroy, /* tc_destroy */
3471 hfsc_qdisc_get, /* qdisc_get */
3472 hfsc_qdisc_set, /* qdisc_set */
3473 hfsc_class_get, /* class_get */
3474 hfsc_class_set, /* class_set */
3475 hfsc_class_delete, /* class_delete */
3476 hfsc_class_get_stats, /* class_get_stats */
3477 hfsc_class_dump_stats /* class_dump_stats */
3480 /* "linux-default" traffic control class.
3482 * This class represents the default, unnamed Linux qdisc. It corresponds to
3483 * the "" (empty string) QoS type in the OVS database. */
/* Points 'netdev_''s tc at a single function-local static const struct tc
 * that is initialized with TC_INITIALIZER and tc_ops_default.  Because the
 * object is static, every device that goes through here ends up referring to
 * the same tc, and nothing is allocated. */
3486 default_install__(struct netdev *netdev_)
3488 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3489 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_default);
3491 /* Nothing but a tc class implementation is allowed to write to a tc. This
3492 * class never does that, so we can legitimately use a const tc object. */
3493 netdev->tc = CONST_CAST(struct tc *, &tc);
/* Installs the default tc on 'netdev' by delegating to default_install__().
 * 'details' is unused.
 * NOTE(review): the return statement is elided from this listing. */
3497 default_tc_install(struct netdev *netdev,
3498 const struct smap *details OVS_UNUSED)
3500 default_install__(netdev);
/* Loads the default tc for 'netdev' by delegating to default_install__().
 * 'nlmsg' is unused; nothing is read from the kernel on the visible lines.
 * NOTE(review): the return statement is elided from this listing. */
3505 default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3507 default_install__(netdev);
/* Operations table for the default qdisc.  It has no kernel qdisc name
 * (linux_name is NULL) and leaves tc_destroy and every qdisc and class
 * operation unset (NULL).
 * NOTE(review): the entries between linux_name and tc_destroy, and the
 * closing of the initializer, are elided from this listing. */
3511 static const struct tc_ops tc_ops_default = {
3512 NULL, /* linux_name */
3517 NULL, /* tc_destroy */
3518 NULL, /* qdisc_get */
3519 NULL, /* qdisc_set */
3520 NULL, /* class_get */
3521 NULL, /* class_set */
3522 NULL, /* class_delete */
3523 NULL, /* class_get_stats */
3524 NULL /* class_dump_stats */
3527 /* "linux-other" traffic control class.
/* Points 'netdev_''s tc at a single function-local static const struct tc
 * that is initialized with TC_INITIALIZER and tc_ops_other.  'nlmsg' is
 * unused.  Because the object is static, every device that goes through here
 * ends up referring to the same tc, and nothing is allocated.
 * NOTE(review): the return statement is elided from this listing. */
3532 other_tc_load(struct netdev *netdev_, struct ofpbuf *nlmsg OVS_UNUSED)
3534 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3535 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_other);
3537 /* Nothing but a tc class implementation is allowed to write to a tc. This
3538 * class never does that, so we can legitimately use a const tc object. */
3539 netdev->tc = CONST_CAST(struct tc *, &tc);
/* Operations table for the "linux-other" QoS type.  It has no kernel qdisc
 * name (linux_name is NULL), cannot be installed (tc_install is NULL), and
 * leaves tc_destroy and every qdisc and class operation unset (NULL).
 * NOTE(review): the n_queues and tc_load entries and the closing of the
 * initializer are elided from this listing. */
3543 static const struct tc_ops tc_ops_other = {
3544 NULL, /* linux_name */
3545 "linux-other", /* ovs_name */
3547 NULL, /* tc_install */
3549 NULL, /* tc_destroy */
3550 NULL, /* qdisc_get */
3551 NULL, /* qdisc_set */
3552 NULL, /* class_get */
3553 NULL, /* class_set */
3554 NULL, /* class_delete */
3555 NULL, /* class_get_stats */
3556 NULL /* class_dump_stats */
3559 /* Traffic control. */
3561 /* Number of kernel "tc" ticks per second. */
/* Computed from the values read out of /proc/net/psched as (double) a * c / b
 * and used by tc_ticks_to_bytes() and tc_bytes_to_ticks(). */
3562 static double ticks_per_s;
3564 /* Number of kernel "jiffies" per second. This is used for the purpose of
3565 * computing buffer sizes. Generally kernel qdiscs need to be able to buffer
3566 * one jiffy's worth of data.
3568 * There are two possibilities here:
3570 * - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
3571 * approximate range of 100 to 1024. That means that we really need to
3572 * make sure that the qdisc can buffer that much data.
3574 * - 'buffer_hz' is an absurdly large number. That means that the kernel
3575 * has finely granular timers and there's no need to fudge additional room
3576 * for buffers. (There's no extra effort needed to implement that: the
3577 * large 'buffer_hz' is used as a divisor, so practically any number will
3578 * come out as 0 in the division. Small integer results in the case of
3579 * really high dividends won't have any real effect anyhow.)
3581 static unsigned int buffer_hz;
3583 /* Returns tc handle 'major':'minor'. */
/* 'major' is shifted into the upper 16 bits and combined with 'minor' by
 * TC_H_MAKE.  Inverse of tc_get_major() and tc_get_minor(). */
3585 tc_make_handle(unsigned int major, unsigned int minor)
3587 return TC_H_MAKE(major << 16, minor);
3590 /* Returns the major number from 'handle'. */
/* TC_H_MAJ selects the major part in place; the shift moves it down so the
 * result is the plain number that was passed to tc_make_handle(). */
3592 tc_get_major(unsigned int handle)
3594 return TC_H_MAJ(handle) >> 16;
3597 /* Returns the minor number from 'handle'. */
/* Queue IDs in this file are the minor number minus 1 (see the HTB and HFSC
 * parse and dump functions). */
3599 tc_get_minor(unsigned int handle)
3601 return TC_H_MIN(handle);
/* Starts a traffic-control Netlink request for 'netdev' in 'request'.
 *
 * Looks up the device's ifindex with get_ifindex(), initializes 'request' with
 * an initial size of 512 bytes, writes a Netlink header of message type
 * 'type' with NLM_F_REQUEST | 'flags', and appends a zeroed struct tcmsg whose
 * family is AF_UNSPEC and whose ifindex is the one just looked up.  Callers
 * are expected to fill in tcm_handle and tcm_parent on the returned tcmsg and
 * to hand 'request' to tc_transact(), which uninitializes it.
 * NOTE(review): the handling of a get_ifindex() failure and the return
 * statements are elided from this listing. */
3604 static struct tcmsg *
3605 tc_make_request(const struct netdev *netdev, int type, unsigned int flags,
3606 struct ofpbuf *request)
3608 struct tcmsg *tcmsg;
3612 error = get_ifindex(netdev, &ifindex);
3617 ofpbuf_init(request, 512);
3618 nl_msg_put_nlmsghdr(request, sizeof *tcmsg, type, NLM_F_REQUEST | flags);
3619 tcmsg = ofpbuf_put_zeros(request, sizeof *tcmsg);
3620 tcmsg->tcm_family = AF_UNSPEC;
3621 tcmsg->tcm_ifindex = ifindex;
3622 /* Caller should fill in tcmsg->tcm_handle. */
3623 /* Caller should fill in tcmsg->tcm_parent. */
/* Sends 'request' over NETLINK_ROUTE with nl_transact(), passing 'replyp'
 * through for the reply (callers in this section pass NULL when they do not
 * want one), and then uninitializes 'request' whether or not the transaction
 * succeeded, so callers must not reuse or uninitialize it afterwards.
 * NOTE(review): the return statement is elided from this listing. */
3629 tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
3631 int error = nl_transact(NETLINK_ROUTE, request, replyp);
3632 ofpbuf_uninit(request);
3636 /* Adds or deletes a root ingress qdisc on 'netdev'. We use this for
3637 * policing configuration.
3639 * This function is equivalent to running the following when 'add' is true:
3640 * /sbin/tc qdisc add dev <devname> handle ffff: ingress
3642 * This function is equivalent to running the following when 'add' is false:
3643 * /sbin/tc qdisc del dev <devname> handle ffff: ingress
3645 * The configuration and stats may be seen with the following command:
3646 * /sbin/tc -s qdisc show dev <devname>
3648 * Returns 0 if successful, otherwise a positive errno value.
/* Request details: when 'add' is true the message is RTM_NEWQDISC with
 * NLM_F_EXCL | NLM_F_CREATE; otherwise it is RTM_DELQDISC with no extra
 * flags.  The qdisc uses handle ffff:0, parent TC_H_INGRESS, kind "ingress"
 * and an empty TCA_OPTIONS attribute.  When deleting, ENOENT and EINVAL from
 * the kernel are singled out for special treatment.
 * NOTE(review): the check on tc_make_request()'s result, the body of the
 * ENOENT/EINVAL branch and the return statements are elided from this
 * listing. */
3651 tc_add_del_ingress_qdisc(struct netdev *netdev, bool add)
3653 struct ofpbuf request;
3654 struct tcmsg *tcmsg;
3656 int type = add ? RTM_NEWQDISC : RTM_DELQDISC;
3657 int flags = add ? NLM_F_EXCL | NLM_F_CREATE : 0;
3659 tcmsg = tc_make_request(netdev, type, flags, &request);
3663 tcmsg->tcm_handle = tc_make_handle(0xffff, 0);
3664 tcmsg->tcm_parent = TC_H_INGRESS;
3665 nl_msg_put_string(&request, TCA_KIND, "ingress");
3666 nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0);
3668 error = tc_transact(&request, NULL);
3670 /* If we're deleting the qdisc, don't worry about some of the
3671 * error conditions. */
3672 if (!add && (error == ENOENT || error == EINVAL)) {
3681 /* Adds a policer to 'netdev' with a rate of 'kbits_rate' and a burst size
3684 * This function is equivalent to running:
3685 * /sbin/tc filter add dev <devname> parent ffff: protocol all prio 49
3686 * basic police rate <kbits_rate>kbit burst <kbits_burst>k
3689 * The configuration and stats may be seen with the following command:
3690 * /sbin/tc -s filter show dev <devname> parent ffff:
3692 * Returns 0 if successful, otherwise a positive errno value.
/* Installs an ingress policer on 'netdev' as a "basic" filter under parent
 * ffff:0 with priority 49 matching ETH_P_ALL.  Packets over the limit are
 * dropped (TC_POLICE_SHOT).  'kbits_rate' is in kilobits per second and is
 * converted to bytes per second for tc_fill_rate(); 'kbits_burst' is
 * converted to ticks with tc_bytes_to_ticks().
 *
 * The rate conversion is done in 64-bit arithmetic: kbits_rate * 1000
 * evaluated in plain int overflows (undefined behavior) for rates above
 * roughly 2.1 Gbit/s.
 *
 * NOTE(review): kbits_burst * 1024 is still an int multiplication and passes
 * a bit count where tc_bytes_to_ticks() documents bytes; left unchanged here
 * because altering it would change the burst the kernel is given.
 * NOTE(review): the declaration of 'mtu', the check on tc_make_request()'s
 * result and the return statements are elided from this listing. */
3695 tc_add_policer(struct netdev *netdev, int kbits_rate, int kbits_burst)
3697 struct tc_police tc_police;
3698 struct ofpbuf request;
3699 struct tcmsg *tcmsg;
3700 size_t basic_offset;
3701 size_t police_offset;
3705 memset(&tc_police, 0, sizeof tc_police);
3706 tc_police.action = TC_POLICE_SHOT;
3707 tc_police.mtu = mtu;
3708 tc_fill_rate(&tc_police.rate, ((uint64_t) kbits_rate * 1000)/8, mtu);
3709 tc_police.burst = tc_bytes_to_ticks(tc_police.rate.rate,
3710 kbits_burst * 1024);
3712 tcmsg = tc_make_request(netdev, RTM_NEWTFILTER,
3713 NLM_F_EXCL | NLM_F_CREATE, &request);
3717 tcmsg->tcm_parent = tc_make_handle(0xffff, 0);
3718 tcmsg->tcm_info = tc_make_handle(49,
3719 (OVS_FORCE uint16_t) htons(ETH_P_ALL));
3721 nl_msg_put_string(&request, TCA_KIND, "basic");
3722 basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3723 police_offset = nl_msg_start_nested(&request, TCA_BASIC_POLICE);
3724 nl_msg_put_unspec(&request, TCA_POLICE_TBF, &tc_police, sizeof tc_police);
3725 tc_put_rtab(&request, TCA_POLICE_RATE, &tc_police.rate);
3726 nl_msg_end_nested(&request, police_offset);
3727 nl_msg_end_nested(&request, basic_offset);
3729 error = tc_transact(&request, NULL);
/* NOTE(review): the header of the function this body belongs to is elided
 * from this listing, as are several statements (including the branches around
 * the warnings below and the assignment of 'buffer_hz').  What the visible
 * lines show: the work is guarded by an ovsthread_once, so it is attempted
 * once; /proc/net/psched is opened and four hexadecimal values a, b, c, d are
 * read with fscanf(); open, read and validation failures are reported with
 * VLOG_WARN; and ticks_per_s is computed as (double) a * c / b. */
3740 /* The values in psched are not individually very meaningful, but they are
3741 * important. The tables below show some values seen in the wild.
3745 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
3746 * (Before that, there are hints that it was 1000000000.)
3748 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
3752 * -----------------------------------
3753 * [1] 000c8000 000f4240 000f4240 00000064
3754 * [2] 000003e8 00000400 000f4240 3b9aca00
3755 * [3] 000003e8 00000400 000f4240 3b9aca00
3756 * [4] 000003e8 00000400 000f4240 00000064
3757 * [5] 000003e8 00000040 000f4240 3b9aca00
3758 * [6] 000003e8 00000040 000f4240 000000f9
3760 * a b c d ticks_per_s buffer_hz
3761 * ------- --------- ---------- ------------- ----------- -------------
3762 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
3763 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3764 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3765 * [4] 1,000 1,024 1,000,000 100 976,562 100
3766 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
3767 * [6] 1,000 64 1,000,000 249 15,625,000 249
3769 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
3770 * [2] 2.6.26-1-686-bigmem from Debian lenny
3771 * [3] 2.6.26-2-sparc64 from Debian lenny
3772 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
3773 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
3774 * [6] 2.6.34 from kernel.org on KVM
3776 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
3777 static const char fn[] = "/proc/net/psched";
3778 unsigned int a, b, c, d;
3781 if (!ovsthread_once_start(&once)) {
3788 stream = fopen(fn, "r");
3790 VLOG_WARN("%s: open failed: %s", fn, ovs_strerror(errno));
3794 if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
3795 VLOG_WARN("%s: read failed", fn);
3799 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
3803 VLOG_WARN("%s: invalid scheduler parameters", fn);
3807 ticks_per_s = (double) a * c / b;
3811 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
3814 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);
3817 ovsthread_once_done(&once);
3820 /* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
3821 * rate of 'rate' bytes per second. */
/* The product is widened to 64 bits before the division.  Multiplying the two
 * unsigned int operands directly wraps modulo 2^32 for ordinary inputs (for
 * example 125,000,000 bytes/s times a few hundred ticks), which would make the
 * burst reported for fast classes far too small.  The widening mirrors the
 * cast already used in tc_bytes_to_ticks().
 * NOTE(review): lines elided from this listing precede the return statement;
 * ticks_per_s must have been set before this runs. */
3823 tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
3826 return ((unsigned long long int) rate * ticks) / ticks_per_s;
3829 /* Returns the number of ticks that it would take to transmit 'size' bytes at a
3830 * rate of 'rate' bytes per second. */
/* Returns 0 when 'rate' is 0 instead of dividing by zero.  ticks_per_s is
 * converted to unsigned long long (dropping any fractional part) so that the
 * multiplication by 'size' is done in 64-bit integer arithmetic.
 * NOTE(review): lines elided from this listing precede the return statement;
 * ticks_per_s must have been set before this runs. */
3832 tc_bytes_to_ticks(unsigned int rate, unsigned int size)
3835 return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
3838 /* Returns the number of bytes that need to be reserved for qdisc buffering at
3839 * a transmission rate of 'rate' bytes per second. */
/* Integer division, so the result is 0 whenever 'buffer_hz' exceeds 'rate'
 * (the "absurdly large" case described on 'buffer_hz').
 * NOTE(review): assumes 'buffer_hz' has been set to a nonzero value on a line
 * elided from this listing before the division -- confirm. */
3841 tc_buffer_per_jiffy(unsigned int rate)
3844 return rate / buffer_hz;
3847 /* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
3848 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
3849 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
3850 * stores NULL into it if it is absent.
3852 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
3855 * Returns 0 if successful, otherwise a positive errno value. */
/* Attributes are parsed starting after the Netlink header and the
 * struct tcmsg.  TCA_KIND is a mandatory string; TCA_OPTIONS is an optional
 * nested attribute.  A rate-limited warning is logged when the message does
 * not match this policy.
 * NOTE(review): the null checks on 'kind' and 'options' and the return
 * statements are elided from this listing. */
3857 tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
3858 struct nlattr **options)
3860 static const struct nl_policy tca_policy[] = {
3861 [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
3862 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
3864 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3866 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3867 tca_policy, ta, ARRAY_SIZE(ta))) {
3868 VLOG_WARN_RL(&rl, "failed to parse qdisc message");
3873 *kind = nl_attr_get_string(ta[TCA_KIND]);
3877 *options = ta[TCA_OPTIONS];
3892 /* Given Netlink 'msg' that describes a class, extracts the queue ID (e.g. the
3893 * minor number of its class ID) into '*queue_id', its TCA_OPTIONS attribute
3894 * into '*options', and its queue statistics into '*stats'. Any of the output
3895 * arguments may be null.
3897 * Returns 0 if successful, otherwise a positive errno value. */
3899 tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
3900 struct nlattr **options, struct netdev_queue_stats *stats)
3902 static const struct nl_policy tca_policy[] = {
3903 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
3904 [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
3906 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3908 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3909 tca_policy, ta, ARRAY_SIZE(ta))) {
3910 VLOG_WARN_RL(&rl, "failed to parse class message");
3915 struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
3916 *handlep = tc->tcm_handle;
3920 *options = ta[TCA_OPTIONS];
3924 const struct gnet_stats_queue *gsq;
3925 struct gnet_stats_basic gsb;
3927 static const struct nl_policy stats_policy[] = {
3928 [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
3929 .min_len = sizeof gsb },
3930 [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
3931 .min_len = sizeof *gsq },
3933 struct nlattr *sa[ARRAY_SIZE(stats_policy)];
3935 if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
3936 sa, ARRAY_SIZE(sa))) {
3937 VLOG_WARN_RL(&rl, "failed to parse class stats");
3941 /* Alignment issues screw up the length of struct gnet_stats_basic on
3942 * some arch/bitsize combinations. Newer versions of Linux have a
3943 * struct gnet_stats_basic_packed, but we can't depend on that. The
3944 * easiest thing to do is just to make a copy. */
3945 memset(&gsb, 0, sizeof gsb);
3946 memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
3947 MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
3948 stats->tx_bytes = gsb.bytes;
3949 stats->tx_packets = gsb.packets;
3951 gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
3952 stats->tx_errors = gsq->drops;
3962 memset(stats, 0, sizeof *stats);
3967 /* Queries the kernel for class with identifier 'handle' and parent 'parent'
3970 tc_query_class(const struct netdev *netdev,
3971 unsigned int handle, unsigned int parent,
3972 struct ofpbuf **replyp)
3974 struct ofpbuf request;
3975 struct tcmsg *tcmsg;
3978 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO, &request);
3982 tcmsg->tcm_handle = handle;
3983 tcmsg->tcm_parent = parent;
3985 error = tc_transact(&request, replyp);
3987 VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
3988 netdev_get_name(netdev),
3989 tc_get_major(handle), tc_get_minor(handle),
3990 tc_get_major(parent), tc_get_minor(parent),
3991 ovs_strerror(error));
3996 /* Equivalent to "tc class del dev <name> handle <handle>". */
3998 tc_delete_class(const struct netdev *netdev, unsigned int handle)
4000 struct ofpbuf request;
4001 struct tcmsg *tcmsg;
4004 tcmsg = tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
4008 tcmsg->tcm_handle = handle;
4009 tcmsg->tcm_parent = 0;
4011 error = tc_transact(&request, NULL);
4013 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
4014 netdev_get_name(netdev),
4015 tc_get_major(handle), tc_get_minor(handle),
4016 ovs_strerror(error));
4021 /* Equivalent to "tc qdisc del dev <name> root". */
4023 tc_del_qdisc(struct netdev *netdev_)
4025 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4026 struct ofpbuf request;
4027 struct tcmsg *tcmsg;
4030 tcmsg = tc_make_request(netdev_, RTM_DELQDISC, 0, &request);
4034 tcmsg->tcm_handle = tc_make_handle(1, 0);
4035 tcmsg->tcm_parent = TC_H_ROOT;
4037 error = tc_transact(&request, NULL);
4038 if (error == EINVAL) {
4039 /* EINVAL probably means that the default qdisc was in use, in which
4040 * case we've accomplished our purpose. */
4043 if (!error && netdev->tc) {
4044 if (netdev->tc->ops->tc_destroy) {
4045 netdev->tc->ops->tc_destroy(netdev->tc);
4052 /* If 'netdev''s qdisc type and parameters are not yet known, queries the
4053 * kernel to determine what they are. Returns 0 if successful, otherwise a
4054 * positive errno value. */
4056 tc_query_qdisc(const struct netdev *netdev_)
4058 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4059 struct ofpbuf request, *qdisc;
4060 const struct tc_ops *ops;
4061 struct tcmsg *tcmsg;
4069 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
4070 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
4071 * 2.6.35 without that fix backported to it.
4073 * To avoid the OOPS, we must not make a request that would attempt to dump
4074 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
4075 * few others. There are a few ways that I can see to do this, but most of
4076 * them seem to be racy (and if you lose the race the kernel OOPSes). The
4077 * technique chosen here is to assume that any non-default qdisc that we
4078 * create will have a class with handle 1:0. The built-in qdiscs only have
4079 * a class with handle 0:0.
4081 * We could check for Linux 2.6.35+ and use a more straightforward method
4083 tcmsg = tc_make_request(netdev_, RTM_GETQDISC, NLM_F_ECHO, &request);
4087 tcmsg->tcm_handle = tc_make_handle(1, 0);
4088 tcmsg->tcm_parent = 0;
4090 /* Figure out what tc class to instantiate. */
4091 error = tc_transact(&request, &qdisc);
4095 error = tc_parse_qdisc(qdisc, &kind, NULL);
4097 ops = &tc_ops_other;
4099 ops = tc_lookup_linux_name(kind);
4101 static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
4102 VLOG_INFO_RL(&rl2, "unknown qdisc \"%s\"", kind);
4104 ops = &tc_ops_other;
4107 } else if (error == ENOENT) {
4108 /* Either it's a built-in qdisc, or it's a qdisc set up by some
4109 * other entity that doesn't have a handle 1:0. We will assume
4110 * that it's the system default qdisc. */
4111 ops = &tc_ops_default;
4114 /* Who knows? Maybe the device got deleted. */
4115 VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
4116 netdev_get_name(netdev_), ovs_strerror(error));
4117 ops = &tc_ops_other;
4120 /* Instantiate it. */
4121 load_error = ops->tc_load(CONST_CAST(struct netdev *, netdev_), qdisc);
4122 ovs_assert((load_error == 0) == (netdev->tc != NULL));
4123 ofpbuf_delete(qdisc);
4125 return error ? error : load_error;
4128 /* Linux traffic control uses tables with 256 entries ("rtab" tables) to
4129 approximate the time to transmit packets of various lengths. For an MTU of
4130 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
4131 represents two possible packet lengths; for a MTU of 513 through 1024, four
4132 possible lengths; and so on.
4134 Returns, for the specified 'mtu', the number of bits that packet lengths
4135 need to be shifted right to fit within such a 256-entry table. */
4137 tc_calc_cell_log(unsigned int mtu)
4142 mtu = ETH_PAYLOAD_MAX;
4144 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
4146 for (cell_log = 0; mtu >= 256; cell_log++) {
4153 /* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
4156 tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
4158 memset(rate, 0, sizeof *rate);
4159 rate->cell_log = tc_calc_cell_log(mtu);
4160 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
4161 /* rate->cell_align = 0; */ /* distro headers. */
4162 rate->mpu = ETH_TOTAL_MIN;
4166 /* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
4167 * attribute of the specified "type".
4169 * See tc_calc_cell_log() above for a description of "rtab"s. */
4171 tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
4176 rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
4177 for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
4178 unsigned packet_size = (i + 1) << rate->cell_log;
4179 if (packet_size < rate->mpu) {
4180 packet_size = rate->mpu;
4182 rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
/* Computes the value to use for 'buffer' or 'cbuffer' in HTB options, given
 * a rate of 'Bps' bytes per second, the link's 'mtu', and the burst size
 * 'burst_bytes' that the user asked for.  (A 'burst_bytes' of 0 is fine when
 * the user did not ask for anything.)
 *
 * The burst is never allowed to drop below one jiffy's worth of data plus one
 * MTU. */
static int
tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
{
    unsigned int min_burst = tc_buffer_per_jiffy(Bps) + mtu;
    uint64_t burst = burst_bytes > min_burst ? burst_bytes : min_burst;

    return tc_bytes_to_ticks(Bps, burst);
}
4197 /* Linux-only functions declared in netdev-linux.h */
4199 /* Returns a fd for an AF_INET socket or a negative errno value. */
4201 netdev_linux_get_af_inet_sock(void)
4203 int error = netdev_linux_init();
4204 return error ? -error : af_inet_sock;
4207 /* Modifies the 'flag' bit in ethtool's flags field for 'netdev'. If
4208 * 'enable' is true, the bit is set. Otherwise, it is cleared. */
4210 netdev_linux_ethtool_set_flag(struct netdev *netdev, uint32_t flag,
4211 const char *flag_name, bool enable)
4213 const char *netdev_name = netdev_get_name(netdev);
4214 struct ethtool_value evalue;
4218 COVERAGE_INC(netdev_get_ethtool);
4219 memset(&evalue, 0, sizeof evalue);
4220 error = netdev_linux_do_ethtool(netdev_name,
4221 (struct ethtool_cmd *)&evalue,
4222 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
4227 COVERAGE_INC(netdev_set_ethtool);
4228 evalue.data = new_flags = (evalue.data & ~flag) | (enable ? flag : 0);
4229 error = netdev_linux_do_ethtool(netdev_name,
4230 (struct ethtool_cmd *)&evalue,
4231 ETHTOOL_SFLAGS, "ETHTOOL_SFLAGS");
4236 COVERAGE_INC(netdev_get_ethtool);
4237 memset(&evalue, 0, sizeof evalue);
4238 error = netdev_linux_do_ethtool(netdev_name,
4239 (struct ethtool_cmd *)&evalue,
4240 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
4245 if (new_flags != evalue.data) {
4246 VLOG_WARN_RL(&rl, "attempt to %s ethtool %s flag on network "
4247 "device %s failed", enable ? "enable" : "disable",
4248 flag_name, netdev_name);
4255 /* Utility functions. */
4257 /* Copies 'src' into 'dst', performing format conversion in the process. */
4259 netdev_stats_from_rtnl_link_stats(struct netdev_stats *dst,
4260 const struct rtnl_link_stats *src)
4262 dst->rx_packets = src->rx_packets;
4263 dst->tx_packets = src->tx_packets;
4264 dst->rx_bytes = src->rx_bytes;
4265 dst->tx_bytes = src->tx_bytes;
4266 dst->rx_errors = src->rx_errors;
4267 dst->tx_errors = src->tx_errors;
4268 dst->rx_dropped = src->rx_dropped;
4269 dst->tx_dropped = src->tx_dropped;
4270 dst->multicast = src->multicast;
4271 dst->collisions = src->collisions;
4272 dst->rx_length_errors = src->rx_length_errors;
4273 dst->rx_over_errors = src->rx_over_errors;
4274 dst->rx_crc_errors = src->rx_crc_errors;
4275 dst->rx_frame_errors = src->rx_frame_errors;
4276 dst->rx_fifo_errors = src->rx_fifo_errors;
4277 dst->rx_missed_errors = src->rx_missed_errors;
4278 dst->tx_aborted_errors = src->tx_aborted_errors;
4279 dst->tx_carrier_errors = src->tx_carrier_errors;
4280 dst->tx_fifo_errors = src->tx_fifo_errors;
4281 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
4282 dst->tx_window_errors = src->tx_window_errors;
4286 get_stats_via_netlink(int ifindex, struct netdev_stats *stats)
4288 /* Policy for RTNLGRP_LINK messages.
4290 * There are *many* more fields in these messages, but currently we only
4291 * care about these fields. */
4292 static const struct nl_policy rtnlgrp_link_policy[] = {
4293 [IFLA_IFNAME] = { .type = NL_A_STRING, .optional = false },
4294 [IFLA_STATS] = { .type = NL_A_UNSPEC, .optional = true,
4295 .min_len = sizeof(struct rtnl_link_stats) },
4298 struct ofpbuf request;
4299 struct ofpbuf *reply;
4300 struct ifinfomsg *ifi;
4301 struct nlattr *attrs[ARRAY_SIZE(rtnlgrp_link_policy)];
4304 ofpbuf_init(&request, 0);
4305 nl_msg_put_nlmsghdr(&request, sizeof *ifi, RTM_GETLINK, NLM_F_REQUEST);
4306 ifi = ofpbuf_put_zeros(&request, sizeof *ifi);
4307 ifi->ifi_family = PF_UNSPEC;
4308 ifi->ifi_index = ifindex;
4309 error = nl_transact(NETLINK_ROUTE, &request, &reply);
4310 ofpbuf_uninit(&request);
4315 if (!nl_policy_parse(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg),
4316 rtnlgrp_link_policy,
4317 attrs, ARRAY_SIZE(rtnlgrp_link_policy))) {
4318 ofpbuf_delete(reply);
4322 if (!attrs[IFLA_STATS]) {
4323 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
4324 ofpbuf_delete(reply);
4328 netdev_stats_from_rtnl_link_stats(stats, nl_attr_get(attrs[IFLA_STATS]));
4330 ofpbuf_delete(reply);
4336 get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats)
4338 static const char fn[] = "/proc/net/dev";
4343 stream = fopen(fn, "r");
4345 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, ovs_strerror(errno));
4350 while (fgets(line, sizeof line, stream)) {
4353 #define X64 "%"SCNu64
4356 X64 X64 X64 X64 X64 X64 X64 "%*u"
4357 X64 X64 X64 X64 X64 X64 X64 "%*u",
4363 &stats->rx_fifo_errors,
4364 &stats->rx_frame_errors,
4370 &stats->tx_fifo_errors,
4372 &stats->tx_carrier_errors) != 15) {
4373 VLOG_WARN_RL(&rl, "%s:%d: parse error", fn, ln);
4374 } else if (!strcmp(devname, netdev_name)) {
4375 stats->rx_length_errors = UINT64_MAX;
4376 stats->rx_over_errors = UINT64_MAX;
4377 stats->rx_crc_errors = UINT64_MAX;
4378 stats->rx_missed_errors = UINT64_MAX;
4379 stats->tx_aborted_errors = UINT64_MAX;
4380 stats->tx_heartbeat_errors = UINT64_MAX;
4381 stats->tx_window_errors = UINT64_MAX;
4387 VLOG_WARN_RL(&rl, "%s: no stats for %s", fn, netdev_name);
4393 get_flags(const struct netdev *dev, unsigned int *flags)
4399 error = netdev_linux_do_ioctl(dev->name, &ifr, SIOCGIFFLAGS,
4402 *flags = ifr.ifr_flags;
4408 set_flags(const char *name, unsigned int flags)
4412 ifr.ifr_flags = flags;
4413 return netdev_linux_do_ioctl(name, &ifr, SIOCSIFFLAGS, "SIOCSIFFLAGS");
4417 do_get_ifindex(const char *netdev_name)
4421 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4422 COVERAGE_INC(netdev_get_ifindex);
4423 if (ioctl(af_inet_sock, SIOCGIFINDEX, &ifr) < 0) {
4424 VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
4425 netdev_name, ovs_strerror(errno));
4428 return ifr.ifr_ifindex;
4432 get_ifindex(const struct netdev *netdev_, int *ifindexp)
4434 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4436 if (!(netdev->cache_valid & VALID_IFINDEX)) {
4437 int ifindex = do_get_ifindex(netdev_get_name(netdev_));
4440 netdev->get_ifindex_error = -ifindex;
4441 netdev->ifindex = 0;
4443 netdev->get_ifindex_error = 0;
4444 netdev->ifindex = ifindex;
4446 netdev->cache_valid |= VALID_IFINDEX;
4449 *ifindexp = netdev->ifindex;
4450 return netdev->get_ifindex_error;
4454 get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN])
4459 memset(&ifr, 0, sizeof ifr);
4460 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4461 COVERAGE_INC(netdev_get_hwaddr);
4462 if (ioctl(af_inet_sock, SIOCGIFHWADDR, &ifr) < 0) {
4463 /* ENODEV probably means that a vif disappeared asynchronously and
4464 * hasn't been removed from the database yet, so reduce the log level
4465 * to INFO for that case. */
4466 VLOG(errno == ENODEV ? VLL_INFO : VLL_ERR,
4467 "ioctl(SIOCGIFHWADDR) on %s device failed: %s",
4468 netdev_name, ovs_strerror(errno));
4471 hwaddr_family = ifr.ifr_hwaddr.sa_family;
4472 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER) {
4473 VLOG_WARN("%s device has unknown hardware address family %d",
4474 netdev_name, hwaddr_family);
4476 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
4481 set_etheraddr(const char *netdev_name,
4482 const uint8_t mac[ETH_ADDR_LEN])
4486 memset(&ifr, 0, sizeof ifr);
4487 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4488 ifr.ifr_hwaddr.sa_family = ARPHRD_ETHER;
4489 memcpy(ifr.ifr_hwaddr.sa_data, mac, ETH_ADDR_LEN);
4490 COVERAGE_INC(netdev_set_hwaddr);
4491 if (ioctl(af_inet_sock, SIOCSIFHWADDR, &ifr) < 0) {
4492 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
4493 netdev_name, ovs_strerror(errno));
4500 netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
4501 int cmd, const char *cmd_name)
4505 memset(&ifr, 0, sizeof ifr);
4506 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
4507 ifr.ifr_data = (caddr_t) ecmd;
4510 if (ioctl(af_inet_sock, SIOCETHTOOL, &ifr) == 0) {
4513 if (errno != EOPNOTSUPP) {
4514 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
4515 "failed: %s", cmd_name, name, ovs_strerror(errno));
4517 /* The device doesn't support this operation. That's pretty
4518 * common, so there's no point in logging anything. */
4525 netdev_linux_do_ioctl(const char *name, struct ifreq *ifr, int cmd,
4526 const char *cmd_name)
4528 ovs_strzcpy(ifr->ifr_name, name, sizeof ifr->ifr_name);
4529 if (ioctl(af_inet_sock, cmd, ifr) == -1) {
4530 VLOG_DBG_RL(&rl, "%s: ioctl(%s) failed: %s", name, cmd_name,
4531 ovs_strerror(errno));
4538 netdev_linux_get_ipv4(const struct netdev *netdev, struct in_addr *ip,
4539 int cmd, const char *cmd_name)
4544 ifr.ifr_addr.sa_family = AF_INET;
4545 error = netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, cmd, cmd_name);
4547 const struct sockaddr_in *sin = ALIGNED_CAST(struct sockaddr_in *,
4549 *ip = sin->sin_addr;
4554 /* Returns an AF_PACKET raw socket or a negative errno value. */
4556 af_packet_sock(void)
4558 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
4561 if (ovsthread_once_start(&once)) {
4562 sock = socket(AF_PACKET, SOCK_RAW, 0);
4564 int error = set_nonblocking(sock);
4571 VLOG_ERR("failed to create packet socket: %s",
4572 ovs_strerror(errno));
4574 ovsthread_once_done(&once);