2 * Copyright (c) 2009, 2010, 2011, 2012, 2013 Nicira, Inc.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
19 #include "netdev-linux.h"
23 #include <arpa/inet.h>
25 #include <linux/filter.h>
26 #include <linux/gen_stats.h>
27 #include <linux/if_ether.h>
28 #include <linux/if_tun.h>
29 #include <linux/types.h>
30 #include <linux/ethtool.h>
31 #include <linux/mii.h>
32 #include <linux/pkt_cls.h>
33 #include <linux/pkt_sched.h>
34 #include <linux/rtnetlink.h>
35 #include <linux/sockios.h>
36 #include <linux/version.h>
37 #include <sys/types.h>
38 #include <sys/ioctl.h>
39 #include <sys/socket.h>
40 #include <netpacket/packet.h>
42 #include <net/if_arp.h>
43 #include <net/if_packet.h>
44 #include <net/route.h>
45 #include <netinet/in.h>
52 #include "dpif-linux.h"
53 #include "dynamic-string.h"
54 #include "fatal-signal.h"
57 #include "netdev-provider.h"
58 #include "netdev-vport.h"
59 #include "netlink-notifier.h"
60 #include "netlink-socket.h"
63 #include "openflow/openflow.h"
65 #include "poll-loop.h"
66 #include "rtnetlink-link.h"
68 #include "socket-util.h"
71 #include "unaligned.h"
74 VLOG_DEFINE_THIS_MODULE(netdev_linux);
76 COVERAGE_DEFINE(netdev_set_policing);
77 COVERAGE_DEFINE(netdev_arp_lookup);
78 COVERAGE_DEFINE(netdev_get_ifindex);
79 COVERAGE_DEFINE(netdev_get_hwaddr);
80 COVERAGE_DEFINE(netdev_set_hwaddr);
81 COVERAGE_DEFINE(netdev_get_ethtool);
82 COVERAGE_DEFINE(netdev_set_ethtool);
85 /* These were introduced in Linux 2.6.14, so they might be missing if we have
87 #ifndef ADVERTISED_Pause
88 #define ADVERTISED_Pause (1 << 13)
90 #ifndef ADVERTISED_Asym_Pause
91 #define ADVERTISED_Asym_Pause (1 << 14)
94 /* These were introduced in Linux 2.6.24, so they might be missing if we
95 * have old headers. */
96 #ifndef ETHTOOL_GFLAGS
97 #define ETHTOOL_GFLAGS 0x00000025 /* Get flags bitmap(ethtool_value) */
99 #ifndef ETHTOOL_SFLAGS
100 #define ETHTOOL_SFLAGS 0x00000026 /* Set flags bitmap(ethtool_value) */
103 /* This was introduced in Linux 2.6.25, so it might be missing if we have old
106 #define TC_RTAB_SIZE 1024
109 static struct nln_notifier *netdev_linux_cache_notifier = NULL;
110 static int cache_notifier_refcount;
113 VALID_IFINDEX = 1 << 0,
114 VALID_ETHERADDR = 1 << 1,
118 VALID_POLICING = 1 << 5,
119 VALID_VPORT_STAT_ERROR = 1 << 6,
120 VALID_DRVINFO = 1 << 7,
121 VALID_FEATURES = 1 << 8,
124 /* Traffic control. */
126 /* An instance of a traffic control class. Always associated with a particular
129 * Each TC implementation subclasses this with whatever additional data it
132 const struct tc_ops *ops;
133 struct hmap queues; /* Contains "struct tc_queue"s.
134 * Read by generic TC layer.
135 * Written only by TC implementation. */
138 #define TC_INITIALIZER(TC, OPS) { OPS, HMAP_INITIALIZER(&(TC)->queues) }
140 /* One traffic control queue.
142 * Each TC implementation subclasses this with whatever additional data it
145 struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
146 unsigned int queue_id; /* OpenFlow queue ID. */
147 long long int created; /* Time queue was created, in msecs. */
150 /* A particular kind of traffic control. Each implementation generally maps to
151 * one particular Linux qdisc class.
153 * The functions below return 0 if successful or a positive errno value on
154 * failure, except where otherwise noted. All of them must be provided, except
155 * where otherwise noted. */
157 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
158 * This is null for tc_ops_default and tc_ops_other, for which there are no
159 * appropriate values. */
160 const char *linux_name;
162 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
163 const char *ovs_name;
165 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
166 * queues. The queues are numbered 0 through n_queues - 1. */
167 unsigned int n_queues;
169 /* Called to install this TC class on 'netdev'. The implementation should
170 * make the Netlink calls required to set up 'netdev' with the right qdisc
171 * and configure it according to 'details'. The implementation may assume
172 * that the current qdisc is the default; that is, there is no need for it
173 * to delete the current qdisc before installing itself.
175 * The contents of 'details' should be documented as valid for 'ovs_name'
176 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
177 * (which is built as ovs-vswitchd.conf.db(8)).
179 * This function must return 0 if and only if it sets 'netdev->tc' to an
180 * initialized 'struct tc'.
182 * (This function is null for tc_ops_other, which cannot be installed. For
183 * other TC classes it should always be nonnull.) */
184 int (*tc_install)(struct netdev *netdev, const struct smap *details);
186 /* Called when the netdev code determines (through a Netlink query) that
187 * this TC class's qdisc is installed on 'netdev', but we didn't install
188 * it ourselves and so don't know any of the details.
190 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
191 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
192 * implementation should parse the other attributes of 'nlmsg' as
193 * necessary to determine its configuration. If necessary it should also
194 * use Netlink queries to determine the configuration of queues on
197 * This function must return 0 if and only if it sets 'netdev->tc' to an
198 * initialized 'struct tc'. */
199 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
201 /* Destroys the data structures allocated by the implementation as part of
202 * 'tc'. (This includes destroying 'tc->queues' by calling
205 * The implementation should not need to perform any Netlink calls. If
206 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
207 * (But it may not be desirable.)
209 * This function may be null if 'tc' is trivial. */
210 void (*tc_destroy)(struct tc *tc);
212 /* Retrieves details of 'netdev->tc' configuration into 'details'.
214 * The implementation should not need to perform any Netlink calls, because
215 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
216 * cached the configuration.
218 * The contents of 'details' should be documented as valid for 'ovs_name'
219 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
220 * (which is built as ovs-vswitchd.conf.db(8)).
222 * This function may be null if 'tc' is not configurable.
224 int (*qdisc_get)(const struct netdev *netdev, struct smap *details);
226 /* Reconfigures 'netdev->tc' according to 'details', performing any
227 * required Netlink calls to complete the reconfiguration.
229 * The contents of 'details' should be documented as valid for 'ovs_name'
230 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
231 * (which is built as ovs-vswitchd.conf.db(8)).
233 * This function may be null if 'tc' is not configurable.
235 int (*qdisc_set)(struct netdev *, const struct smap *details);
237 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
238 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
240 * The contents of 'details' should be documented as valid for 'ovs_name'
241 * in the "other_config" column in the "Queue" table in
242 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
244 * The implementation should not need to perform any Netlink calls, because
245 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
246 * cached the queue configuration.
248 * This function may be null if 'tc' does not have queues ('n_queues' is
250 int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
251 struct smap *details);
253 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
254 * 'details', performing any required Netlink calls to complete the
255 * reconfiguration. The caller ensures that 'queue_id' is less than
258 * The contents of 'details' should be documented as valid for 'ovs_name'
259 * in the "other_config" column in the "Queue" table in
260 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
262 * This function may be null if 'tc' does not have queues or its queues are
263 * not configurable. */
264 int (*class_set)(struct netdev *, unsigned int queue_id,
265 const struct smap *details);
267 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
268 * tc_queue's within 'netdev->tc->queues'.
270 * This function may be null if 'tc' does not have queues or its queues
271 * cannot be deleted. */
272 int (*class_delete)(struct netdev *, struct tc_queue *queue);
274 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
275 * 'struct tc_queue's within 'netdev->tc->queues'.
277 * On success, initializes '*stats'.
279 * This function may be null if 'tc' does not have queues or if it cannot
280 * report queue statistics. */
281 int (*class_get_stats)(const struct netdev *netdev,
282 const struct tc_queue *queue,
283 struct netdev_queue_stats *stats);
285 /* Extracts queue stats from 'nlmsg', which is a response to a
286 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
288 * This function may be null if 'tc' does not have queues or if it cannot
289 * report queue statistics. */
290 int (*class_dump_stats)(const struct netdev *netdev,
291 const struct ofpbuf *nlmsg,
292 netdev_dump_queue_stats_cb *cb, void *aux);
/* Initializes the 'queues' hmap of 'tc'.
 *
 * NOTE(review): 'ops' is not used on any line visible in this extract;
 * presumably it is stored in 'tc' on a line that is missing here -- confirm
 * against the full file. */
296 tc_init(struct tc *tc, const struct tc_ops *ops)
299 hmap_init(&tc->queues);
/* Destroys the 'queues' hmap of 'tc'. */
303 tc_destroy(struct tc *tc)
305 hmap_destroy(&tc->queues);
308 static const struct tc_ops tc_ops_htb;
309 static const struct tc_ops tc_ops_hfsc;
310 static const struct tc_ops tc_ops_default;
311 static const struct tc_ops tc_ops_other;
313 static const struct tc_ops *const tcs[] = {
314 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
315 &tc_ops_hfsc, /* Hierarchical fair service curve. */
316 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
317 &tc_ops_other, /* Some other qdisc. */
321 static unsigned int tc_make_handle(unsigned int major, unsigned int minor);
322 static unsigned int tc_get_major(unsigned int handle);
323 static unsigned int tc_get_minor(unsigned int handle);
325 static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
326 static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
327 static unsigned int tc_buffer_per_jiffy(unsigned int rate);
329 static struct tcmsg *tc_make_request(const struct netdev *, int type,
330 unsigned int flags, struct ofpbuf *);
331 static int tc_transact(struct ofpbuf *request, struct ofpbuf **replyp);
332 static int tc_add_del_ingress_qdisc(struct netdev *netdev, bool add);
333 static int tc_add_policer(struct netdev *netdev, int kbits_rate,
336 static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
337 struct nlattr **options);
338 static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
339 struct nlattr **options,
340 struct netdev_queue_stats *);
341 static int tc_query_class(const struct netdev *,
342 unsigned int handle, unsigned int parent,
343 struct ofpbuf **replyp);
344 static int tc_delete_class(const struct netdev *, unsigned int handle);
346 static int tc_del_qdisc(struct netdev *netdev);
347 static int tc_query_qdisc(const struct netdev *netdev);
349 static int tc_calc_cell_log(unsigned int mtu);
350 static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
351 static void tc_put_rtab(struct ofpbuf *, uint16_t type,
352 const struct tc_ratespec *rate);
353 static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
355 struct netdev_linux {
358 unsigned int cache_valid;
359 unsigned int change_seq;
361 bool miimon; /* Link status of last poll. */
362 long long int miimon_interval; /* Miimon Poll rate. Disabled if <= 0. */
363 struct timer miimon_timer;
365 /* The following are figured out "on demand" only. They are only valid
366 * when the corresponding VALID_* bit in 'cache_valid' is set. */
368 uint8_t etheraddr[ETH_ADDR_LEN];
369 struct in_addr address, netmask;
372 unsigned int ifi_flags;
373 long long int carrier_resets;
374 uint32_t kbits_rate; /* Policing data. */
375 uint32_t kbits_burst;
376 int vport_stats_error; /* Cached error code from vport_get_stats().
377 0 or an errno value. */
378 int netdev_mtu_error; /* Cached error code from SIOCGIFMTU or SIOCSIFMTU. */
379 int ether_addr_error; /* Cached error code from set/get etheraddr. */
380 int netdev_policing_error; /* Cached error code from set policing. */
381 int get_features_error; /* Cached error code from ETHTOOL_GSET. */
382 int get_ifindex_error; /* Cached error code from SIOCGIFINDEX. */
384 enum netdev_features current; /* Cached from ETHTOOL_GSET. */
385 enum netdev_features advertised; /* Cached from ETHTOOL_GSET. */
386 enum netdev_features supported; /* Cached from ETHTOOL_GSET. */
388 struct ethtool_drvinfo drvinfo; /* Cached from ETHTOOL_GDRVINFO. */
391 /* For devices of class netdev_tap_class only. */
395 struct netdev_rx_linux {
401 static const struct netdev_rx_class netdev_rx_linux_class;
403 /* Sockets used for ioctl operations. */
404 static int af_inet_sock = -1; /* AF_INET, SOCK_DGRAM. */
406 /* This is set pretty low because we probably won't learn anything from the
407 * additional log messages. */
408 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
410 static int netdev_linux_init(void);
412 static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
413 int cmd, const char *cmd_name);
414 static int netdev_linux_do_ioctl(const char *name, struct ifreq *, int cmd,
415 const char *cmd_name);
416 static int netdev_linux_get_ipv4(const struct netdev *, struct in_addr *,
417 int cmd, const char *cmd_name);
418 static int get_flags(const struct netdev *, unsigned int *flags);
419 static int set_flags(const char *, unsigned int flags);
420 static int do_get_ifindex(const char *netdev_name);
421 static int get_ifindex(const struct netdev *, int *ifindexp);
422 static int do_set_addr(struct netdev *netdev,
423 int ioctl_nr, const char *ioctl_name,
424 struct in_addr addr);
425 static int get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN]);
426 static int set_etheraddr(const char *netdev_name, const uint8_t[ETH_ADDR_LEN]);
427 static int get_stats_via_netlink(int ifindex, struct netdev_stats *stats);
428 static int get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats);
429 static int af_packet_sock(void);
430 static void netdev_linux_miimon_run(void);
431 static void netdev_linux_miimon_wait(void);
/* Reports whether 'netdev_class' uses netdev_linux_init() as its 'init'
 * callback.  netdev_linux_cast() relies on this test to check that a netdev
 * may be treated as a struct netdev_linux. */
434 is_netdev_linux_class(const struct netdev_class *netdev_class)
436 return netdev_class->init == netdev_linux_init;
/* Reports whether the class of 'netdev' is netdev_tap_class. */
440 is_tap_netdev(const struct netdev *netdev)
442 return netdev_get_class(netdev) == &netdev_tap_class;
/* Converts 'netdev' into the struct netdev_linux that embeds it as member
 * 'up'.  Asserts that the class of 'netdev' passes
 * is_netdev_linux_class(). */
445 static struct netdev_linux *
446 netdev_linux_cast(const struct netdev *netdev)
448 ovs_assert(is_netdev_linux_class(netdev_get_class(netdev)));
450 return CONTAINER_OF(netdev, struct netdev_linux, up);
/* Converts 'rx' into the struct netdev_rx_linux that embeds it as member
 * 'up', after checking its class against netdev_rx_linux_class with
 * netdev_rx_assert_class(). */
453 static struct netdev_rx_linux *
454 netdev_rx_linux_cast(const struct netdev_rx *rx)
456 netdev_rx_assert_class(rx, &netdev_rx_linux_class);
457 return CONTAINER_OF(rx, struct netdev_rx_linux, up);
/* Creates the file-level AF_INET datagram socket 'af_inet_sock'.
 *
 * The outcome is kept in the function-local static 'status': 0 on success,
 * otherwise the errno left by socket().  A failure is logged at error level.
 *
 * NOTE(review): several short lines of this function are missing from this
 * extract; the guard that presumably restricts the work to the first call
 * (while 'status' is still -1) and the return statement are not visible --
 * confirm against the full file. */
461 netdev_linux_init(void)
463 static int status = -1;
465 /* Create AF_INET socket. */
466 af_inet_sock = socket(AF_INET, SOCK_DGRAM, 0);
467 status = af_inet_sock >= 0 ? 0 : errno;
469 VLOG_ERR("failed to create inet socket: %s", ovs_strerror(status));
/* Calls rtnetlink_link_run() and then netdev_linux_miimon_run(). */
476 netdev_linux_run(void)
478 rtnetlink_link_run();
479 netdev_linux_miimon_run();
/* Calls rtnetlink_link_wait() and then netdev_linux_miimon_wait(); the
 * counterpart of netdev_linux_run(). */
483 netdev_linux_wait(void)
485 rtnetlink_link_wait();
486 netdev_linux_miimon_wait();
/* Records a state change on 'dev'.
 *
 * Increments 'dev->carrier_resets' when the IFF_RUNNING bit differs between
 * the cached 'dev->ifi_flags' and the new 'ifi_flags', stores 'ifi_flags' in
 * 'dev', and clears every 'dev->cache_valid' bit that is not set in 'mask'
 * (so a 'mask' of 0 invalidates all cached values).
 *
 * NOTE(review): the statements around the 'change_seq' test are missing from
 * this extract; presumably 'change_seq' is incremented and kept nonzero --
 * confirm against the full file. */
490 netdev_linux_changed(struct netdev_linux *dev,
491 unsigned int ifi_flags, unsigned int mask)
494 if (!dev->change_seq) {
498 if ((dev->ifi_flags ^ ifi_flags) & IFF_RUNNING) {
499 dev->carrier_resets++;
501 dev->ifi_flags = ifi_flags;
503 dev->cache_valid &= mask;
/* Applies the rtnetlink link notification 'change' to 'dev'.
 *
 * For RTM_NEWLINK, every cached value except the driver info is invalidated
 * and the cache is then refilled from 'change' with the MTU, the Ethernet
 * address (only when 'change->addr' is nonzero) and the ifindex, clearing the
 * corresponding cached error codes.  The last netdev_linux_changed() call
 * passes a mask of 0 and so invalidates the whole cache.
 *
 * NOTE(review): the lines that close the RTM_NEWLINK branch are missing from
 * this extract, so the condition under which the last call runs (presumably
 * an 'else' branch for other message types) and any guard on the MTU update
 * are not visible -- confirm against the full file. */
507 netdev_linux_update(struct netdev_linux *dev,
508 const struct rtnetlink_link_change *change)
510 if (change->nlmsg_type == RTM_NEWLINK) {
512 netdev_linux_changed(dev, change->ifi_flags, VALID_DRVINFO);
514 /* Update netdev from rtnl-change msg. */
516 dev->mtu = change->mtu;
517 dev->cache_valid |= VALID_MTU;
518 dev->netdev_mtu_error = 0;
521 if (!eth_addr_is_zero(change->addr)) {
522 memcpy(dev->etheraddr, change->addr, ETH_ADDR_LEN);
523 dev->cache_valid |= VALID_ETHERADDR;
524 dev->ether_addr_error = 0;
527 dev->ifindex = change->ifi_index;
528 dev->cache_valid |= VALID_IFINDEX;
529 dev->get_ifindex_error = 0;
532 netdev_linux_changed(dev, change->ifi_flags, 0);
/* Callback handed to rtnetlink_link_notifier_create() by
 * cache_notifier_ref().
 *
 * The first visible section looks up the netdev named 'change->ifname' and,
 * if it exists and passes is_netdev_linux_class(), applies 'change' to it.
 * The second visible section walks every netdev_linux_class device, re-reads
 * its interface flags with get_flags() and invalidates its whole cache.
 *
 * NOTE(review): the lines that select between the two sections are missing
 * from this extract; presumably the walk runs when 'change' is null --
 * confirm against the full file.  'aux' is unused. */
537 netdev_linux_cache_cb(const struct rtnetlink_link_change *change,
538 void *aux OVS_UNUSED)
541 struct netdev *base_dev = netdev_from_name(change->ifname);
542 if (base_dev && is_netdev_linux_class(netdev_get_class(base_dev))) {
543 netdev_linux_update(netdev_linux_cast(base_dev), change);
546 struct shash device_shash;
547 struct shash_node *node;
549 shash_init(&device_shash);
550 netdev_get_devices(&netdev_linux_class, &device_shash);
551 SHASH_FOR_EACH (node, &device_shash) {
552 struct netdev *netdev = node->data;
553 struct netdev_linux *dev = netdev_linux_cast(netdev);
/* The result of get_flags() is not checked on the visible lines. */
556 get_flags(&dev->up, &flags);
557 netdev_linux_changed(dev, flags, 0);
558 netdev_close(netdev);
560 shash_destroy(&device_shash);
/* Takes a reference on the shared rtnetlink link notifier, creating it (with
 * netdev_linux_cache_cb() as its callback) when the reference count is zero.
 *
 * NOTE(review): the handling of a failed rtnetlink_link_notifier_create()
 * and the return statements are missing from this extract -- confirm against
 * the full file. */
565 cache_notifier_ref(void)
567 if (!cache_notifier_refcount) {
568 ovs_assert(!netdev_linux_cache_notifier);
570 netdev_linux_cache_notifier =
571 rtnetlink_link_notifier_create(netdev_linux_cache_cb, NULL);
573 if (!netdev_linux_cache_notifier) {
577 cache_notifier_refcount++;
/* Drops one reference on the shared rtnetlink link notifier; when the count
 * reaches zero the notifier is destroyed and its pointer reset to NULL.
 * Asserts that at least one reference is outstanding. */
583 cache_notifier_unref(void)
585 ovs_assert(cache_notifier_refcount > 0);
586 if (!--cache_notifier_refcount) {
587 ovs_assert(netdev_linux_cache_notifier);
588 rtnetlink_link_notifier_destroy(netdev_linux_cache_notifier);
589 netdev_linux_cache_notifier = NULL;
593 /* Creates system and internal devices.
 *
 * Takes a reference on the shared link notifier, allocates a zeroed
 * struct netdev_linux with 'change_seq' starting at 1, initializes its base
 * netdev with 'name' and 'class', and caches the interface flags reported by
 * get_flags().  On the visible success path '*netdevp' is set to the new
 * netdev.
 *
 * ENODEV from get_flags() is fatal for every class except
 * netdev_internal_class: the netdev is uninitialized again and the notifier
 * reference dropped.
 *
 * NOTE(review): the error checks after cache_notifier_ref(), the remaining
 * cleanup on the ENODEV path and the return statements are missing from this
 * extract -- confirm against the full file. */
595 netdev_linux_create(const struct netdev_class *class, const char *name,
596 struct netdev **netdevp)
598 struct netdev_linux *netdev;
601 error = cache_notifier_ref();
606 netdev = xzalloc(sizeof *netdev);
607 netdev->change_seq = 1;
608 netdev_init(&netdev->up, name, class);
609 error = get_flags(&netdev->up, &netdev->ifi_flags);
610 if (error == ENODEV) {
611 if (class != &netdev_internal_class) {
612 /* The device does not exist, so don't allow it to be opened. */
613 netdev_uninit(&netdev->up, false);
614 cache_notifier_unref();
618 /* "Internal" netdevs have to be created as netdev objects before
619 * they exist in the kernel, because creating them in the kernel
620 * happens by passing a netdev object to dpif_port_add().
621 * Therefore, ignore the error. */
625 *netdevp = &netdev->up;
629 /* For most types of netdevs we open the device for each call of
630 * netdev_open(). However, this is not the case with tap devices,
631 * since it is only possible to open the device once. In this
632 * situation we share a single file descriptor, and consequently
633 * buffers, across all readers. Therefore once data is read it will
634 * be unavailable to other reads for tap devices. */
/* Creates a tap device named 'name' and stores the new netdev in '*netdevp'.
 *
 * Visible steps, in order: allocate a zeroed struct netdev_linux, take a
 * reference on the shared link notifier, open "/dev/net/tun", issue
 * TUNSETIFF with IFF_TAP | IFF_NO_PI for 'name', make the fd non-blocking,
 * and initialize the base netdev with netdev_tap_class.  The tail of the
 * function is the error cleanup: it closes the tap fd and drops the notifier
 * reference.
 *
 * 'class' is unused; netdev_tap_class is always used.
 *
 * NOTE(review): the individual error checks, the 'error_close' label that
 * presumably precedes the close() call, the release of 'netdev' and the
 * return statements are missing from this extract -- confirm against the
 * full file. */
636 netdev_linux_create_tap(const struct netdev_class *class OVS_UNUSED,
637 const char *name, struct netdev **netdevp)
639 struct netdev_linux *netdev;
640 static const char tap_dev[] = "/dev/net/tun";
644 netdev = xzalloc(sizeof *netdev);
645 netdev->change_seq = 1;
647 error = cache_notifier_ref();
652 /* Open tap device. */
653 netdev->tap_fd = open(tap_dev, O_RDWR);
654 if (netdev->tap_fd < 0) {
656 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, ovs_strerror(error));
657 goto error_unref_notifier;
660 /* Create tap device. */
661 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
662 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
663 if (ioctl(netdev->tap_fd, TUNSETIFF, &ifr) == -1) {
664 VLOG_WARN("%s: creating tap device failed: %s", name,
665 ovs_strerror(errno));
670 /* Make non-blocking. */
671 error = set_nonblocking(netdev->tap_fd);
676 netdev_init(&netdev->up, name, &netdev_tap_class);
677 *netdevp = &netdev->up;
681 close(netdev->tap_fd);
682 error_unref_notifier:
683 cache_notifier_unref();
/* Releases the resources attached to 'netdev_': invokes the traffic-control
 * implementation's 'tc_destroy' hook when a 'tc' is present and the hook is
 * nonnull, closes the tap fd for tap devices that have one, and drops this
 * device's reference on the shared link notifier.
 *
 * NOTE(review): freeing of the structure itself is not visible in this
 * extract -- confirm against the full file. */
690 netdev_linux_destroy(struct netdev *netdev_)
692 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
694 if (netdev->tc && netdev->tc->ops->tc_destroy) {
695 netdev->tc->ops->tc_destroy(netdev->tc);
698 if (netdev_get_class(netdev_) == &netdev_tap_class
699 && netdev->tap_fd >= 0)
701 close(netdev->tap_fd);
705 cache_notifier_unref();
/* Opens a receive handle for 'netdev_'.
 *
 * The visible code builds a PF_PACKET/SOCK_RAW socket: it is made
 * non-blocking, bound to the device's ifindex for ETH_P_ALL, and given a
 * socket filter so that only inbound packets are delivered.  A
 * struct netdev_rx_linux is then allocated and initialized with
 * netdev_rx_linux_class.
 *
 * NOTE(review): 'is_tap' is computed but the branch that uses it is missing
 * from this extract (presumably tap devices reuse 'tap_fd' instead of
 * opening a socket); the error checks, cleanup, the assignments into 'rx'
 * and '*rxp' and the return statements are missing as well -- confirm
 * against the full file. */
709 netdev_linux_rx_open(struct netdev *netdev_, struct netdev_rx **rxp)
711 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
712 bool is_tap = is_tap_netdev(netdev_);
713 struct netdev_rx_linux *rx;
720 struct sockaddr_ll sll;
722 /* Result of tcpdump -dd inbound */
/* NOTE(review): 0xfffff004 looks like SKF_AD_OFF + SKF_AD_PKTTYPE (an
 * ancillary load of the packet type rather than of packet offset 0) and the
 * compared value 4 like PACKET_OUTGOING, i.e. outgoing packets are dropped
 * and everything else is accepted -- confirm against <linux/filter.h> and
 * <linux/if_packet.h>. */
723 static struct sock_filter filt[] = {
724 { 0x28, 0, 0, 0xfffff004 }, /* ldh [0] */
725 { 0x15, 0, 1, 0x00000004 }, /* jeq #4 jt 2 jf 3 */
726 { 0x6, 0, 0, 0x00000000 }, /* ret #0 */
727 { 0x6, 0, 0, 0x0000ffff } /* ret #65535 */
729 static struct sock_fprog fprog = { ARRAY_SIZE(filt), filt };
731 /* Create file descriptor. */
732 fd = socket(PF_PACKET, SOCK_RAW, 0);
735 VLOG_ERR("failed to create raw socket (%s)", ovs_strerror(error));
739 /* Set non-blocking mode. */
740 error = set_nonblocking(fd);
745 /* Get ethernet device index. */
746 error = get_ifindex(&netdev->up, &ifindex);
751 /* Bind to specific ethernet device. */
752 memset(&sll, 0, sizeof sll);
753 sll.sll_family = AF_PACKET;
754 sll.sll_ifindex = ifindex;
755 sll.sll_protocol = (OVS_FORCE unsigned short int) htons(ETH_P_ALL);
756 if (bind(fd, (struct sockaddr *) &sll, sizeof sll) < 0) {
758 VLOG_ERR("%s: failed to bind raw socket (%s)",
759 netdev_get_name(netdev_), ovs_strerror(error));
763 /* Filter for only inbound packets. */
764 error = setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER, &fprog,
768 VLOG_ERR("%s: failed attach filter (%s)",
769 netdev_get_name(netdev_), ovs_strerror(error));
774 rx = xmalloc(sizeof *rx);
775 netdev_rx_init(&rx->up, netdev_, &netdev_rx_linux_class);
790 netdev_rx_linux_destroy(struct netdev_rx *rx_)
792 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
/* Receives one packet from 'rx_' into the 'size' bytes at 'data', retrying
 * while the call fails with EINTR.
 *
 * A nonnegative result is the packet length.  recv() is passed MSG_TRUNC so
 * that it reports the real packet length; a result larger than 'size' is
 * turned into -EMSGSIZE.  Errors other than EAGAIN are logged through the
 * rate-limited logger.
 *
 * NOTE(review): several short lines of this function (the declaration of
 * 'retval', the condition choosing between read() and recv(), and the error
 * return) are missing from this extract -- confirm against the full file. */
801 netdev_rx_linux_recv(struct netdev_rx *rx_, void *data, size_t size)
803 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
808 ? read(rx->fd, data, size)
809 : recv(rx->fd, data, size, MSG_TRUNC));
810 } while (retval < 0 && errno == EINTR);
813 return retval > size ? -EMSGSIZE : retval;
815 if (errno != EAGAIN) {
/* Arguments follow the format string: device name first, then the error
 * text (they were previously passed in the opposite order). */
816 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
817 netdev_rx_get_name(rx_), ovs_strerror(errno));
/* Calls poll_fd_wait() on the receive fd of 'rx_' for POLLIN, i.e. asks to
 * be woken when the fd becomes readable. */
824 netdev_rx_linux_wait(struct netdev_rx *rx_)
826 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
827 poll_fd_wait(rx->fd, POLLIN);
/* Discards packets queued on 'rx_'.
 *
 * Two strategies are visible: query the device's transmit queue length with
 * SIOCGIFTXQLEN and hand 'ifr.ifr_qlen' to drain_fd(), or call
 * drain_rcvbuf() on the fd and return its result.
 *
 * NOTE(review): the condition selecting between the two (presumably tap
 * versus non-tap), the error check after the ioctl and the first return are
 * missing from this extract -- confirm against the full file. */
831 netdev_rx_linux_drain(struct netdev_rx *rx_)
833 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
836 int error = netdev_linux_do_ioctl(netdev_rx_get_name(rx_), &ifr,
837 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
841 drain_fd(rx->fd, ifr.ifr_qlen);
844 return drain_rcvbuf(rx->fd);
848 /* Sends the 'size' bytes at 'data' on 'netdev_'. Returns 0 if successful,
849 * otherwise a positive errno value. Returns EAGAIN without blocking if the
850 * packet cannot be queued immediately. Returns EMSGSIZE if a partial packet
851 * was transmitted or if the packet is too big or too small to transmit on
 * the device.
 *
853 * The caller retains ownership of 'data' in all cases.
 *
855 * The kernel maintains a packet transmission queue, so the caller is not
856 * expected to do additional queuing of packets.
 *
 * NOTE(review): many short lines of this function (the retry loop, several
 * 'msg'/'iov' assignments, the error checks and the return statements) are
 * missing from this extract -- confirm against the full file. */
858 netdev_linux_send(struct netdev *netdev_, const void *data, size_t size)
863 if (!is_tap_netdev(netdev_)) {
864 /* Use our AF_PACKET socket to send to this device. */
865 struct sockaddr_ll sll;
872 sock = af_packet_sock();
877 error = get_ifindex(netdev_, &ifindex);
882 /* We don't bother setting most fields in sockaddr_ll because the
883 * kernel ignores them for SOCK_RAW. */
884 memset(&sll, 0, sizeof sll);
885 sll.sll_family = AF_PACKET;
886 sll.sll_ifindex = ifindex;
888 iov.iov_base = CONST_CAST(void *, data);
892 msg.msg_namelen = sizeof sll;
895 msg.msg_control = NULL;
896 msg.msg_controllen = 0;
899 retval = sendmsg(sock, &msg, 0);
901 /* Use the tap fd to send to this device. This is essential for
902 * tap devices, because packets sent to a tap device with an
903 * AF_PACKET socket will loop back to be *received* again on the
904 * tap device. This doesn't occur on other interface types
905 * because we attach a socket filter to the rx socket. */
906 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
908 retval = write(netdev->tap_fd, data, size);
912 /* The Linux AF_PACKET implementation never blocks waiting for room
913 * for packets, instead returning ENOBUFS. Translate this into
914 * EAGAIN for the caller. */
915 if (errno == ENOBUFS) {
917 } else if (errno == EINTR) {
919 } else if (errno != EAGAIN) {
920 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
921 netdev_get_name(netdev_), ovs_strerror(errno));
924 } else if (retval != size) {
925 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%zd bytes of "
926 "%zu) on %s", retval, size, netdev_get_name(netdev_));
934 /* Registers with the poll loop to wake up from the next call to poll_block()
935 * when the packet transmission queue has sufficient room to transmit a packet
936 * with netdev_send().
 *
938 * The kernel maintains a packet transmission queue, so the client is not
939 * expected to do additional queuing of packets. Thus, this function is
940 * unlikely to ever be used. It is included for completeness.
 *
 * The only action visible in this extract is for tap devices, which request
 * an immediate wakeup.  NOTE(review): whether other device types are handled
 * on lines missing from this extract cannot be told from here -- confirm
 * against the full file. */
942 netdev_linux_send_wait(struct netdev *netdev)
944 if (is_tap_netdev(netdev)) {
945 /* TAP device always accepts packets. */
946 poll_immediate_wake();
950 /* Attempts to set 'netdev_''s MAC address to 'mac'. Returns 0 if successful,
951 * otherwise a positive errno value.
 *
 * When the cached address is valid, a cached error is returned as is and a
 * request for the address that is already cached is recognized before any
 * call to set_etheraddr(); otherwise the cache entry is invalidated first.
 * After the call, the result is cached only when it is 0 or ENODEV.
 *
 * NOTE(review): the early return for an unchanged address, the condition
 * guarding the memcpy() into the cache, and the final return are missing
 * from this extract -- confirm against the full file. */
953 netdev_linux_set_etheraddr(struct netdev *netdev_,
954 const uint8_t mac[ETH_ADDR_LEN])
956 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
957 struct netdev_saved_flags *sf = NULL;
960 if (netdev->cache_valid & VALID_ETHERADDR) {
961 if (netdev->ether_addr_error) {
962 return netdev->ether_addr_error;
964 if (eth_addr_equals(netdev->etheraddr, mac)) {
967 netdev->cache_valid &= ~VALID_ETHERADDR;
970 /* Tap devices must be brought down before setting the address. */
971 if (is_tap_netdev(netdev_)) {
972 netdev_turn_flags_off(netdev_, NETDEV_UP, &sf);
974 error = set_etheraddr(netdev_get_name(netdev_), mac);
975 if (!error || error == ENODEV) {
976 netdev->ether_addr_error = error;
977 netdev->cache_valid |= VALID_ETHERADDR;
979 memcpy(netdev->etheraddr, mac, ETH_ADDR_LEN);
/* Hands back the flags saved in 'sf' by netdev_turn_flags_off(); 'sf' is
 * still NULL for non-tap devices. */
983 netdev_restore_flags(sf);
988 /* Copies the MAC address of 'netdev_' into the caller-provided 'mac'.
 *
 * The address and the error code from get_etheraddr() are cached in the
 * device; get_etheraddr() is only called while VALID_ETHERADDR is unset.
 * 'mac' is written only when the cached error code is 0.  Returns the cached
 * error code. */
990 netdev_linux_get_etheraddr(const struct netdev *netdev_,
991 uint8_t mac[ETH_ADDR_LEN])
993 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
995 if (!(netdev->cache_valid & VALID_ETHERADDR)) {
996 int error = get_etheraddr(netdev_get_name(netdev_),
999 netdev->ether_addr_error = error;
1000 netdev->cache_valid |= VALID_ETHERADDR;
1003 if (!netdev->ether_addr_error) {
1004 memcpy(mac, netdev->etheraddr, ETH_ADDR_LEN);
1007 return netdev->ether_addr_error;
1010 /* Stores in '*mtup' the maximum size of transmitted (and received) packets on
1011 * 'netdev_', in bytes, not including the hardware header; thus, this is
1012 * typically 1500 bytes for Ethernet devices.
 *
 * The MTU and the error code from the SIOCGIFMTU ioctl are cached in the
 * device; the ioctl is only issued while VALID_MTU is unset.  '*mtup' is
 * written only when the cached error code is 0.  Returns the cached error
 * code. */
1014 netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
1016 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1017 if (!(netdev->cache_valid & VALID_MTU)) {
1021 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
1022 SIOCGIFMTU, "SIOCGIFMTU");
1024 netdev->netdev_mtu_error = error;
1025 netdev->mtu = ifr.ifr_mtu;
1026 netdev->cache_valid |= VALID_MTU;
1029 if (!netdev->netdev_mtu_error) {
1030 *mtup = netdev->mtu;
1032 return netdev->netdev_mtu_error;
1035 /* Sets the maximum size of transmitted packets (MTU) for the given device
1036 * using the Linux networking ioctl interface.
/* Requests MTU 'mtu' for 'netdev_' with the SIOCSIFMTU ioctl.
 *
 * When the cached MTU is valid, a cached error is returned as is and a
 * request for the MTU that is already cached is recognized before the ioctl;
 * otherwise the cache entry is invalidated first.  After the ioctl, the
 * result and 'ifr.ifr_mtu' are cached only when the result is 0 or ENODEV.
 *
 * NOTE(review): the setup of 'ifr', the early return for an unchanged MTU
 * and the final return are missing from this extract -- confirm against the
 * full file. */
1039 netdev_linux_set_mtu(const struct netdev *netdev_, int mtu)
1041 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1045 if (netdev->cache_valid & VALID_MTU) {
1046 if (netdev->netdev_mtu_error) {
1047 return netdev->netdev_mtu_error;
1049 if (netdev->mtu == mtu) {
1052 netdev->cache_valid &= ~VALID_MTU;
1055 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
1056 SIOCSIFMTU, "SIOCSIFMTU");
1057 if (!error || error == ENODEV) {
1058 netdev->netdev_mtu_error = error;
1059 netdev->mtu = ifr.ifr_mtu;
1060 netdev->cache_valid |= VALID_MTU;
1065 /* Returns the ifindex of 'netdev', if successful, as a positive number.
1066 * On failure, returns a negative errno value. */
1068 netdev_linux_get_ifindex(const struct netdev *netdev)
1072 error = get_ifindex(netdev, &ifindex);
/* A nonzero 'error' from get_ifindex() is negated to produce the negative
 * value promised above. */
1073 return error ? -error : ifindex;
/* Stores the carrier state of 'netdev_' in '*carrier'.
 *
 * While MII monitoring is enabled ('miimon_interval' > 0) the state is the
 * link status recorded by the last MII poll; the other visible assignment
 * derives it from the IFF_RUNNING bit of the cached interface flags.
 *
 * NOTE(review): the 'else' line between the two assignments and the return
 * statement are missing from this extract -- confirm against the full
 * file. */
1077 netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
1079 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1081 if (netdev->miimon_interval > 0) {
1082 *carrier = netdev->miimon;
1084 *carrier = (netdev->ifi_flags & IFF_RUNNING) != 0;
/* Returns the 'carrier_resets' counter of 'netdev', which
 * netdev_linux_changed() increments whenever the IFF_RUNNING flag flips. */
1090 static long long int
1091 netdev_linux_get_carrier_resets(const struct netdev *netdev)
1093 return netdev_linux_cast(netdev)->carrier_resets;
/* Issues the MII ioctl 'cmd' (named 'cmd_name' for logging) on device
 * 'name', using '*data' as both input and output.
 *
 * '*data' is copied into the zeroed ifreq at the address of 'ifr.ifr_data'
 * before the ioctl and copied back from there afterwards, whatever the
 * ioctl's result.
 *
 * NOTE(review): the return statement is missing from this extract;
 * presumably 'error' is returned -- confirm against the full file. */
1097 netdev_linux_do_miimon(const char *name, int cmd, const char *cmd_name,
1098 struct mii_ioctl_data *data)
1103 memset(&ifr, 0, sizeof ifr);
1104 memcpy(&ifr.ifr_data, data, sizeof *data);
1105 error = netdev_linux_do_ioctl(name, &ifr, cmd, cmd_name);
1106 memcpy(data, &ifr.ifr_data, sizeof *data);
/* Determines the link status of device 'name' and stores it in '*miimon'.
 *
 * First tries MII: SIOCGMIIPHY, then SIOCGMIIREG on register MII_BMSR, whose
 * BMSR_LSTATUS bit gives the link status.  The visible fallback uses the
 * ETHTOOL_GLINK ethtool command; its reply is reinterpreted as a
 * struct ethtool_value by copying the leading bytes of 'ecmd'.
 *
 * NOTE(review): the conditions that connect these steps (which failures
 * trigger the fallback and the warnings) and the return statement are
 * missing from this extract -- confirm against the full file. */
1112 netdev_linux_get_miimon(const char *name, bool *miimon)
1114 struct mii_ioctl_data data;
1119 memset(&data, 0, sizeof data);
1120 error = netdev_linux_do_miimon(name, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
1122 /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
1123 data.reg_num = MII_BMSR;
1124 error = netdev_linux_do_miimon(name, SIOCGMIIREG, "SIOCGMIIREG",
1128 *miimon = !!(data.val_out & BMSR_LSTATUS);
1130 VLOG_WARN_RL(&rl, "%s: failed to query MII", name);
1133 struct ethtool_cmd ecmd;
1135 VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
1138 COVERAGE_INC(netdev_get_ethtool);
1139 memset(&ecmd, 0, sizeof ecmd);
1140 error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
1143 struct ethtool_value eval;
1145 memcpy(&eval, &ecmd, sizeof eval);
1146 *miimon = !!eval.data;
1148 VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
/* Sets the MII polling interval of 'netdev_'.
 *
 * A positive 'interval' is raised to at least 100; any other value becomes
 * 0, which disables polling.  When the stored interval actually changes, the
 * poll timer is marked expired so that the next netdev_linux_miimon_run()
 * polls this device right away.
 *
 * NOTE(review): the unit of 'interval' is not stated in this extract
 * (presumably milliseconds) and the return statement is missing -- confirm
 * against the full file. */
1156 netdev_linux_set_miimon_interval(struct netdev *netdev_,
1157 long long int interval)
1159 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1161 interval = interval > 0 ? MAX(interval, 100) : 0;
1162 if (netdev->miimon_interval != interval) {
1163 netdev->miimon_interval = interval;
1164 timer_set_expired(&netdev->miimon_timer);
/* Polls the MII link status of every netdev_linux_class device whose
 * polling is enabled and whose poll timer has expired.
 *
 * When the polled status differs from the stored one, it is stored and
 * netdev_linux_changed() is called with a mask of 0, which invalidates the
 * device's whole cache.  The timer is then re-armed for 'miimon_interval'.
 * Every device obtained from netdev_get_devices() is passed to
 * netdev_close() before the loop moves on.
 *
 * NOTE(review): the 'continue' after the first netdev_close() and the
 * declaration of 'miimon' are missing from this extract; the result of
 * netdev_linux_get_miimon() is not checked on the visible lines -- confirm
 * against the full file. */
1171 netdev_linux_miimon_run(void)
1173 struct shash device_shash;
1174 struct shash_node *node;
1176 shash_init(&device_shash);
1177 netdev_get_devices(&netdev_linux_class, &device_shash);
1178 SHASH_FOR_EACH (node, &device_shash) {
1179 struct netdev *netdev = node->data;
1180 struct netdev_linux *dev = netdev_linux_cast(netdev);
1183 if (dev->miimon_interval <= 0 || !timer_expired(&dev->miimon_timer)) {
1184 netdev_close(netdev);
1188 netdev_linux_get_miimon(dev->up.name, &miimon);
1189 if (miimon != dev->miimon) {
1190 dev->miimon = miimon;
1191 netdev_linux_changed(dev, dev->ifi_flags, 0);
1194 timer_set_duration(&dev->miimon_timer, dev->miimon_interval);
1195 netdev_close(netdev);
1198 shash_destroy(&device_shash);
/* Calls timer_wait() on the miimon timer of every netdev_linux_class device
 * whose miimon_interval is positive.  Each device obtained from
 * netdev_get_devices() is released with netdev_close() afterwards. */
1202 netdev_linux_miimon_wait(void)
1204 struct shash device_shash;
1205 struct shash_node *node;
1207 shash_init(&device_shash);
1208 netdev_get_devices(&netdev_linux_class, &device_shash);
1209 SHASH_FOR_EACH (node, &device_shash) {
1210 struct netdev *netdev = node->data;
1211 struct netdev_linux *dev = netdev_linux_cast(netdev);
1213 if (dev->miimon_interval > 0) {
1214 timer_wait(&dev->miimon_timer);
1216 netdev_close(netdev);
1218 shash_destroy(&device_shash);
1221 /* Check whether we can use RTM_GETLINK to get network device statistics.
1222 * In pre-2.6.19 kernels, this was only available if wireless extensions were
1225 check_for_working_netlink_stats(void)
1227 /* Decide on the netdev_get_stats() implementation to use. Netlink is
1228 * preferable, so if that works, we'll use it. */
/* The loopback device "lo" is used as the probe target. */
1229 int ifindex = do_get_ifindex("lo");
1231 VLOG_WARN("failed to get ifindex for lo, "
1232 "obtaining netdev stats from proc");
/* Trial query: the result of this one call decides which message below is
 * logged. */
1235 struct netdev_stats stats;
1236 int error = get_stats_via_netlink(ifindex, &stats);
1238 VLOG_DBG("obtaining netdev stats via rtnetlink");
1241 VLOG_INFO("RTM_GETLINK failed (%s), obtaining netdev stats "
1242 "via proc (you are probably running a pre-2.6.19 "
1243 "kernel)", ovs_strerror(error));
/* Helper that netdev_tap_get_stats() applies to matching rx/tx counter pairs;
 * presumably exchanges the values of '*a' and '*b'. */
1250 swap_uint64(uint64_t *a, uint64_t *b)
1257 /* Copies 'src' into 'dst', performing format conversion in the process.
1259 * 'src' is allowed to be misaligned. */
1261 netdev_stats_from_ovs_vport_stats(struct netdev_stats *dst,
1262 const struct ovs_vport_stats *src)
/* Counters taken from 'src'.  They are read with get_unaligned_u64() because
 * 'src' may be misaligned. */
1264 dst->rx_packets = get_unaligned_u64(&src->rx_packets);
1265 dst->tx_packets = get_unaligned_u64(&src->tx_packets);
1266 dst->rx_bytes = get_unaligned_u64(&src->rx_bytes);
1267 dst->tx_bytes = get_unaligned_u64(&src->tx_bytes);
1268 dst->rx_errors = get_unaligned_u64(&src->rx_errors);
1269 dst->tx_errors = get_unaligned_u64(&src->tx_errors);
1270 dst->rx_dropped = get_unaligned_u64(&src->rx_dropped);
1271 dst->tx_dropped = get_unaligned_u64(&src->tx_dropped);
/* The remaining counters are not copied from 'src' and are cleared. */
1273 dst->collisions = 0;
1274 dst->rx_length_errors = 0;
1275 dst->rx_over_errors = 0;
1276 dst->rx_crc_errors = 0;
1277 dst->rx_frame_errors = 0;
1278 dst->rx_fifo_errors = 0;
1279 dst->rx_missed_errors = 0;
1280 dst->tx_aborted_errors = 0;
1281 dst->tx_carrier_errors = 0;
1282 dst->tx_fifo_errors = 0;
1283 dst->tx_heartbeat_errors = 0;
1284 dst->tx_window_errors = 0;
/* Looks up the vport named after 'netdev' with dpif_linux_vport_get() and
 * converts the statistics carried by the reply into '*stats'.  A reply
 * without statistics ('reply.stats' null) is handled as a separate case. */
1288 get_stats_via_vport__(const struct netdev *netdev, struct netdev_stats *stats)
1290 struct dpif_linux_vport reply;
1294 error = dpif_linux_vport_get(netdev_get_name(netdev), &reply, &buf);
1297 } else if (!reply.stats) {
1302 netdev_stats_from_ovs_vport_stats(stats, reply.stats);
/* Tries to fill '*stats' from the vport layer and records the outcome in
 * netdev->vport_stats_error, marking it valid with VALID_VPORT_STAT_ERROR.
 *
 * The query is made when no error is cached, or when the cached value is not
 * marked valid.  Failures other than ENOENT are logged (rate-limited). */
1310 get_stats_via_vport(const struct netdev *netdev_,
1311 struct netdev_stats *stats)
1313 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1315 if (!netdev->vport_stats_error ||
1316 !(netdev->cache_valid & VALID_VPORT_STAT_ERROR)) {
1319 error = get_stats_via_vport__(netdev_, stats);
1320 if (error && error != ENOENT) {
1321 VLOG_WARN_RL(&rl, "%s: obtaining netdev stats via vport failed "
1323 netdev_get_name(netdev_), ovs_strerror(error));
1325 netdev->vport_stats_error = error;
1326 netdev->cache_valid |= VALID_VPORT_STAT_ERROR;
/* Reads statistics for 'netdev_' into '*stats' from the system, either with
 * get_stats_via_netlink() (after resolving the ifindex) or with
 * get_stats_via_proc().
 *
 * Which of the two is used is decided a single time, under an
 * ovsthread_once guard, by check_for_working_netlink_stats(). */
1331 netdev_linux_sys_get_stats(const struct netdev *netdev_,
1332 struct netdev_stats *stats)
1334 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
1335 static int use_netlink_stats;
1338 if (ovsthread_once_start(&once)) {
1339 use_netlink_stats = check_for_working_netlink_stats();
1340 ovsthread_once_done(&once);
1343 if (use_netlink_stats) {
1346 error = get_ifindex(netdev_, &ifindex);
1348 error = get_stats_via_netlink(ifindex, stats);
1351 error = get_stats_via_proc(netdev_get_name(netdev_), stats);
/* The raw error number is logged here, not its string form. */
1355 VLOG_WARN_RL(&rl, "%s: linux-sys get stats failed %d",
1356 netdev_get_name(netdev_), error);
1362 /* Retrieves current device stats for 'netdev-linux'. */
1364 netdev_linux_get_stats(const struct netdev *netdev_,
1365 struct netdev_stats *stats)
1367 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1368 struct netdev_stats dev_stats;
/* Two sources are consulted: the vport layer writes straight into 'stats',
 * while the system counters go into the separate 'dev_stats'. */
1371 get_stats_via_vport(netdev_, stats);
1373 error = netdev_linux_sys_get_stats(netdev_, &dev_stats);
1376 if (netdev->vport_stats_error) {
1383 if (netdev->vport_stats_error) {
1384 /* stats not available from OVS then use ioctl stats. */
/* The additions below fold the system-level error, drop, multicast and
 * collision counters from 'dev_stats' into 'stats'. */
1387 stats->rx_errors += dev_stats.rx_errors;
1388 stats->tx_errors += dev_stats.tx_errors;
1389 stats->rx_dropped += dev_stats.rx_dropped;
1390 stats->tx_dropped += dev_stats.tx_dropped;
1391 stats->multicast += dev_stats.multicast;
1392 stats->collisions += dev_stats.collisions;
1393 stats->rx_length_errors += dev_stats.rx_length_errors;
1394 stats->rx_over_errors += dev_stats.rx_over_errors;
1395 stats->rx_crc_errors += dev_stats.rx_crc_errors;
1396 stats->rx_frame_errors += dev_stats.rx_frame_errors;
1397 stats->rx_fifo_errors += dev_stats.rx_fifo_errors;
1398 stats->rx_missed_errors += dev_stats.rx_missed_errors;
1399 stats->tx_aborted_errors += dev_stats.tx_aborted_errors;
1400 stats->tx_carrier_errors += dev_stats.tx_carrier_errors;
1401 stats->tx_fifo_errors += dev_stats.tx_fifo_errors;
1402 stats->tx_heartbeat_errors += dev_stats.tx_heartbeat_errors;
1403 stats->tx_window_errors += dev_stats.tx_window_errors;
1408 /* Retrieves current device stats for 'netdev-tap' netdev or
1409 * netdev-internal.
 *
 * Vport stats are requested into 'stats' first; the system counters are
 * then read into a separate 'dev_stats'.  When 'dev_stats' is merged in at
 * the end, its rx and tx error/drop counters are added to the opposite
 * direction of 'stats' (tx into rx and rx into tx), while multicast and
 * collisions are added as they are. */
1411 netdev_tap_get_stats(const struct netdev *netdev_, struct netdev_stats *stats)
1413 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1414 struct netdev_stats dev_stats;
1417 get_stats_via_vport(netdev_, stats);
1419 error = netdev_linux_sys_get_stats(netdev_, &dev_stats);
1421 if (netdev->vport_stats_error) {
1428 /* If this port is an internal port then the transmit and receive stats
1429 * will appear to be swapped relative to the other ports since we are the
1430 * one sending the data, not a remote computer. For consistency, we swap
1431 * them back here. This does not apply if we are getting stats from the
1432 * vport layer because it always tracks stats from the perspective of the
1434 if (netdev->vport_stats_error) {
1436 swap_uint64(&stats->rx_packets, &stats->tx_packets);
1437 swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
1438 swap_uint64(&stats->rx_errors, &stats->tx_errors);
1439 swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
1440 stats->rx_length_errors = 0;
1441 stats->rx_over_errors = 0;
1442 stats->rx_crc_errors = 0;
1443 stats->rx_frame_errors = 0;
1444 stats->rx_fifo_errors = 0;
1445 stats->rx_missed_errors = 0;
1446 stats->tx_aborted_errors = 0;
1447 stats->tx_carrier_errors = 0;
1448 stats->tx_fifo_errors = 0;
1449 stats->tx_heartbeat_errors = 0;
1450 stats->tx_window_errors = 0;
1452 stats->rx_dropped += dev_stats.tx_dropped;
1453 stats->tx_dropped += dev_stats.rx_dropped;
1455 stats->rx_errors += dev_stats.tx_errors;
1456 stats->tx_errors += dev_stats.rx_errors;
1458 stats->multicast += dev_stats.multicast;
1459 stats->collisions += dev_stats.collisions;
/* Retrieves stats for an internal device from the vport layer only.  The
 * return value is the vport_stats_error recorded by get_stats_via_vport(). */
1465 netdev_internal_get_stats(const struct netdev *netdev_,
1466 struct netdev_stats *stats)
1468 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1470 get_stats_via_vport(netdev_, stats);
1471 return netdev->vport_stats_error;
/* Pushes the packet, byte, error and drop counters of 'stats' (rx and tx) to
 * the vport named after 'netdev' using an OVS_VPORT_CMD_SET transaction.
 * An ENODEV result from the transaction is special-cased at the end. */
1475 netdev_internal_set_stats(struct netdev *netdev,
1476 const struct netdev_stats *stats)
1478 struct ovs_vport_stats vport_stats;
1479 struct dpif_linux_vport vport;
1482 vport_stats.rx_packets = stats->rx_packets;
1483 vport_stats.tx_packets = stats->tx_packets;
1484 vport_stats.rx_bytes = stats->rx_bytes;
1485 vport_stats.tx_bytes = stats->tx_bytes;
1486 vport_stats.rx_errors = stats->rx_errors;
1487 vport_stats.tx_errors = stats->tx_errors;
1488 vport_stats.rx_dropped = stats->rx_dropped;
1489 vport_stats.tx_dropped = stats->tx_dropped;
1491 dpif_linux_vport_init(&vport);
1492 vport.cmd = OVS_VPORT_CMD_SET;
1493 vport.name = netdev_get_name(netdev);
1494 vport.stats = &vport_stats;
1496 err = dpif_linux_vport_transact(&vport, NULL, NULL);
1498 /* If the vport layer doesn't know about the device, that doesn't mean it
1499 * doesn't exist (after all we were able to open it when netdev_open() was
1500 * called), it just means that it isn't attached and we'll be getting
1501 * stats a different way. */
1502 if (err == ENODEV) {
/* Refreshes the cached 'supported', 'advertised' and 'current' feature
 * bitmaps of 'netdev' from an ETHTOOL_GSET query, translating ethtool
 * SUPPORTED_* and ADVERTISED_* bits into NETDEV_F_* bits.
 *
 * The result of the query is remembered in netdev->get_features_error and
 * VALID_FEATURES is set in cache_valid.  The VALID_FEATURES test at the top
 * presumably skips the query when the cache is already populated. */
1510 netdev_linux_read_features(struct netdev_linux *netdev)
1512 struct ethtool_cmd ecmd;
1516 if (netdev->cache_valid & VALID_FEATURES) {
1520 COVERAGE_INC(netdev_get_ethtool);
1521 memset(&ecmd, 0, sizeof ecmd);
1522 error = netdev_linux_do_ethtool(netdev->up.name, &ecmd,
1523 ETHTOOL_GSET, "ETHTOOL_GSET");
1528 /* Supported features. */
1529 netdev->supported = 0;
1530 if (ecmd.supported & SUPPORTED_10baseT_Half) {
1531 netdev->supported |= NETDEV_F_10MB_HD;
1533 if (ecmd.supported & SUPPORTED_10baseT_Full) {
1534 netdev->supported |= NETDEV_F_10MB_FD;
1536 if (ecmd.supported & SUPPORTED_100baseT_Half) {
1537 netdev->supported |= NETDEV_F_100MB_HD;
1539 if (ecmd.supported & SUPPORTED_100baseT_Full) {
1540 netdev->supported |= NETDEV_F_100MB_FD;
1542 if (ecmd.supported & SUPPORTED_1000baseT_Half) {
1543 netdev->supported |= NETDEV_F_1GB_HD;
1545 if (ecmd.supported & SUPPORTED_1000baseT_Full) {
1546 netdev->supported |= NETDEV_F_1GB_FD;
1548 if (ecmd.supported & SUPPORTED_10000baseT_Full) {
1549 netdev->supported |= NETDEV_F_10GB_FD;
1551 if (ecmd.supported & SUPPORTED_TP) {
1552 netdev->supported |= NETDEV_F_COPPER;
1554 if (ecmd.supported & SUPPORTED_FIBRE) {
1555 netdev->supported |= NETDEV_F_FIBER;
1557 if (ecmd.supported & SUPPORTED_Autoneg) {
1558 netdev->supported |= NETDEV_F_AUTONEG;
1560 if (ecmd.supported & SUPPORTED_Pause) {
1561 netdev->supported |= NETDEV_F_PAUSE;
1563 if (ecmd.supported & SUPPORTED_Asym_Pause) {
1564 netdev->supported |= NETDEV_F_PAUSE_ASYM;
1567 /* Advertised features. */
1568 netdev->advertised = 0;
1569 if (ecmd.advertising & ADVERTISED_10baseT_Half) {
1570 netdev->advertised |= NETDEV_F_10MB_HD;
1572 if (ecmd.advertising & ADVERTISED_10baseT_Full) {
1573 netdev->advertised |= NETDEV_F_10MB_FD;
1575 if (ecmd.advertising & ADVERTISED_100baseT_Half) {
1576 netdev->advertised |= NETDEV_F_100MB_HD;
1578 if (ecmd.advertising & ADVERTISED_100baseT_Full) {
1579 netdev->advertised |= NETDEV_F_100MB_FD;
1581 if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
1582 netdev->advertised |= NETDEV_F_1GB_HD;
1584 if (ecmd.advertising & ADVERTISED_1000baseT_Full) {
1585 netdev->advertised |= NETDEV_F_1GB_FD;
1587 if (ecmd.advertising & ADVERTISED_10000baseT_Full) {
1588 netdev->advertised |= NETDEV_F_10GB_FD;
1590 if (ecmd.advertising & ADVERTISED_TP) {
1591 netdev->advertised |= NETDEV_F_COPPER;
1593 if (ecmd.advertising & ADVERTISED_FIBRE) {
1594 netdev->advertised |= NETDEV_F_FIBER;
1596 if (ecmd.advertising & ADVERTISED_Autoneg) {
1597 netdev->advertised |= NETDEV_F_AUTONEG;
1599 if (ecmd.advertising & ADVERTISED_Pause) {
1600 netdev->advertised |= NETDEV_F_PAUSE;
1602 if (ecmd.advertising & ADVERTISED_Asym_Pause) {
1603 netdev->advertised |= NETDEV_F_PAUSE_ASYM;
1606 /* Current settings. */
/* Up to SPEED_10000 the duplex setting selects the FD or HD bit (10G is
 * always reported as FD).  The faster speeds are matched against literal
 * values instead of SPEED_* macros, presumably because older kernel headers
 * do not define them. */
1608 if (speed == SPEED_10) {
1609 netdev->current = ecmd.duplex ? NETDEV_F_10MB_FD : NETDEV_F_10MB_HD;
1610 } else if (speed == SPEED_100) {
1611 netdev->current = ecmd.duplex ? NETDEV_F_100MB_FD : NETDEV_F_100MB_HD;
1612 } else if (speed == SPEED_1000) {
1613 netdev->current = ecmd.duplex ? NETDEV_F_1GB_FD : NETDEV_F_1GB_HD;
1614 } else if (speed == SPEED_10000) {
1615 netdev->current = NETDEV_F_10GB_FD;
1616 } else if (speed == 40000) {
1617 netdev->current = NETDEV_F_40GB_FD;
1618 } else if (speed == 100000) {
1619 netdev->current = NETDEV_F_100GB_FD;
1620 } else if (speed == 1000000) {
1621 netdev->current = NETDEV_F_1TB_FD;
1623 netdev->current = 0;
/* Medium of the port currently in use. */
1626 if (ecmd.port == PORT_TP) {
1627 netdev->current |= NETDEV_F_COPPER;
1628 } else if (ecmd.port == PORT_FIBRE) {
1629 netdev->current |= NETDEV_F_FIBER;
1633 netdev->current |= NETDEV_F_AUTONEG;
1637 netdev->cache_valid |= VALID_FEATURES;
1638 netdev->get_features_error = error;
1641 /* Stores the features supported by 'netdev' into '*current', '*advertised',
1642 * '*supported', and '*peer'. Each value is a bitmap of NETDEV_* bits.
1643 * Returns 0 if successful, otherwise a positive errno value. */
1645 netdev_linux_get_features(const struct netdev *netdev_,
1646 enum netdev_features *current,
1647 enum netdev_features *advertised,
1648 enum netdev_features *supported,
1649 enum netdev_features *peer)
1651 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
/* Populates (or reuses) the cached feature bitmaps and error code. */
1653 netdev_linux_read_features(netdev);
/* The output parameters are written only when the cached query succeeded;
 * '*peer' is always reported as 0. */
1655 if (!netdev->get_features_error) {
1656 *current = netdev->current;
1657 *advertised = netdev->advertised;
1658 *supported = netdev->supported;
1659 *peer = 0; /* XXX */
1661 return netdev->get_features_error;
1664 /* Set the features advertised by 'netdev' to 'advertise'.
 *
 * The current settings are fetched with ETHTOOL_GSET, only the
 * 'advertising' mask is rebuilt from the NETDEV_F_* bits in 'advertise', and
 * the structure is written back with ETHTOOL_SSET, whose result is
 * returned. */
1666 netdev_linux_set_advertisements(struct netdev *netdev,
1667 enum netdev_features advertise)
1669 struct ethtool_cmd ecmd;
1672 COVERAGE_INC(netdev_get_ethtool);
1673 memset(&ecmd, 0, sizeof ecmd);
1674 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1675 ETHTOOL_GSET, "ETHTOOL_GSET");
/* Start from an empty mask so that bits absent from 'advertise' are
 * cleared. */
1680 ecmd.advertising = 0;
1681 if (advertise & NETDEV_F_10MB_HD) {
1682 ecmd.advertising |= ADVERTISED_10baseT_Half;
1684 if (advertise & NETDEV_F_10MB_FD) {
1685 ecmd.advertising |= ADVERTISED_10baseT_Full;
1687 if (advertise & NETDEV_F_100MB_HD) {
1688 ecmd.advertising |= ADVERTISED_100baseT_Half;
1690 if (advertise & NETDEV_F_100MB_FD) {
1691 ecmd.advertising |= ADVERTISED_100baseT_Full;
1693 if (advertise & NETDEV_F_1GB_HD) {
1694 ecmd.advertising |= ADVERTISED_1000baseT_Half;
1696 if (advertise & NETDEV_F_1GB_FD) {
1697 ecmd.advertising |= ADVERTISED_1000baseT_Full;
1699 if (advertise & NETDEV_F_10GB_FD) {
1700 ecmd.advertising |= ADVERTISED_10000baseT_Full;
1702 if (advertise & NETDEV_F_COPPER) {
1703 ecmd.advertising |= ADVERTISED_TP;
1705 if (advertise & NETDEV_F_FIBER) {
1706 ecmd.advertising |= ADVERTISED_FIBRE;
1708 if (advertise & NETDEV_F_AUTONEG) {
1709 ecmd.advertising |= ADVERTISED_Autoneg;
1711 if (advertise & NETDEV_F_PAUSE) {
1712 ecmd.advertising |= ADVERTISED_Pause;
1714 if (advertise & NETDEV_F_PAUSE_ASYM) {
1715 ecmd.advertising |= ADVERTISED_Asym_Pause;
1717 COVERAGE_INC(netdev_set_ethtool);
1718 return netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1719 ETHTOOL_SSET, "ETHTOOL_SSET");
1722 /* Attempts to set input rate limiting (policing) policy. Returns 0 if
1723 * successful, otherwise a positive errno value.
 *
 * The last applied rate and burst, and a policing error, are cached on the
 * device under VALID_POLICING so that a repeated request can be answered
 * from the cache. */
1725 netdev_linux_set_policing(struct netdev *netdev_,
1726 uint32_t kbits_rate, uint32_t kbits_burst)
1728 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1729 const char *netdev_name = netdev_get_name(netdev_);
1733 kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
1734 : !kbits_burst ? 1000 /* Default to 1000 kbits if 0. */
1735 : kbits_burst); /* Stick with user-specified value. */
1737 if (netdev->cache_valid & VALID_POLICING) {
/* A cached failure is returned again without retrying. */
1738 if (netdev->netdev_policing_error) {
1739 return netdev->netdev_policing_error;
1742 if (netdev->kbits_rate == kbits_rate &&
1743 netdev->kbits_burst == kbits_burst) {
1744 /* Assume that settings haven't changed since we last set them. */
1747 netdev->cache_valid &= ~VALID_POLICING;
1750 COVERAGE_INC(netdev_set_policing);
1751 /* Remove any existing ingress qdisc. */
1752 error = tc_add_del_ingress_qdisc(netdev_, false);
1754 VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
1755 netdev_name, ovs_strerror(error));
/* Install a fresh ingress qdisc, then attach the policer to it. */
1760 error = tc_add_del_ingress_qdisc(netdev_, true);
1762 VLOG_WARN_RL(&rl, "%s: adding policing qdisc failed: %s",
1763 netdev_name, ovs_strerror(error));
1767 error = tc_add_policer(netdev_, kbits_rate, kbits_burst);
1769 VLOG_WARN_RL(&rl, "%s: adding policing action failed: %s",
1770 netdev_name, ovs_strerror(error));
1775 netdev->kbits_rate = kbits_rate;
1776 netdev->kbits_burst = kbits_burst;
/* Only success and ENODEV are cached; other errors leave VALID_POLICING
 * clear. */
1779 if (!error || error == ENODEV) {
1780 netdev->netdev_policing_error = error;
1781 netdev->cache_valid |= VALID_POLICING;
/* Adds to 'types' the OVS name of every entry in the null-terminated 'tcs'
 * array that has a tc_install callback and a non-empty ovs_name. */
1787 netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
1790 const struct tc_ops *const *opsp;
1792 for (opsp = tcs; *opsp != NULL; opsp++) {
1793 const struct tc_ops *ops = *opsp;
1794 if (ops->tc_install && ops->ovs_name[0] != '\0') {
1795 sset_add(types, ops->ovs_name);
/* Searches the null-terminated 'tcs' array for the entry whose ovs_name
 * equals 'name'.  Presumably returns that entry, or NULL when there is none
 * (netdev_linux_set_qos() tests the result for null). */
1801 static const struct tc_ops *
1802 tc_lookup_ovs_name(const char *name)
1804 const struct tc_ops *const *opsp;
1806 for (opsp = tcs; *opsp != NULL; opsp++) {
1807 const struct tc_ops *ops = *opsp;
1808 if (!strcmp(name, ops->ovs_name)) {
/* Searches the null-terminated 'tcs' array for the entry whose linux_name
 * equals 'name'.  Entries with a null linux_name are never matched. */
1815 static const struct tc_ops *
1816 tc_lookup_linux_name(const char *name)
1818 const struct tc_ops *const *opsp;
1820 for (opsp = tcs; *opsp != NULL; opsp++) {
1821 const struct tc_ops *ops = *opsp;
1822 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
/* Scans the bucket selected by 'hash' in netdev->tc->queues for the queue
 * whose queue_id equals 'queue_id'. */
1829 static struct tc_queue *
1830 tc_find_queue__(const struct netdev *netdev_, unsigned int queue_id,
1833 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1834 struct tc_queue *queue;
1836 HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev->tc->queues) {
1837 if (queue->queue_id == queue_id) {
/* Convenience wrapper around tc_find_queue__() that derives the hash from
 * 'queue_id' with hash_int(queue_id, 0). */
1844 static struct tc_queue *
1845 tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
1847 return tc_find_queue__(netdev, queue_id, hash_int(queue_id, 0));
/* Looks up the QoS implementation whose OVS name is 'type' and reports its
 * n_queues in caps->n_queues. */
1851 netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
1853 struct netdev_qos_capabilities *caps)
1855 const struct tc_ops *ops = tc_lookup_ovs_name(type);
1859 caps->n_queues = ops->n_queues;
/* Refreshes the qdisc state with tc_query_qdisc(), stores the OVS name of the
 * active tc implementation in '*typep', and fills 'details' through that
 * implementation's qdisc_get callback when it provides one. */
1864 netdev_linux_get_qos(const struct netdev *netdev_,
1865 const char **typep, struct smap *details)
1867 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1870 error = tc_query_qdisc(netdev_);
1875 *typep = netdev->tc->ops->ovs_name;
1876 return (netdev->tc->ops->qdisc_get
1877 ? netdev->tc->ops->qdisc_get(netdev_, details)
/* Switches 'netdev_' to the QoS type whose OVS name is 'type', configured by
 * 'details'.
 *
 * If that type is already the active one, only its qdisc_set callback (if
 * any) is invoked.  Otherwise the existing qdisc is deleted and the new
 * type's tc_install callback is run. */
1882 netdev_linux_set_qos(struct netdev *netdev_,
1883 const char *type, const struct smap *details)
1885 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1886 const struct tc_ops *new_ops;
1889 new_ops = tc_lookup_ovs_name(type);
/* Unknown type, or one that cannot be installed. */
1890 if (!new_ops || !new_ops->tc_install) {
1894 error = tc_query_qdisc(netdev_);
1899 if (new_ops == netdev->tc->ops) {
1900 return new_ops->qdisc_set ? new_ops->qdisc_set(netdev_, details) : 0;
1902 /* Delete existing qdisc. */
1903 error = tc_del_qdisc(netdev_);
1907 ovs_assert(netdev->tc == NULL);
1909 /* Install new qdisc. */
1910 error = new_ops->tc_install(netdev_, details);
/* tc_install must leave netdev->tc set exactly when it succeeds. */
1911 ovs_assert((error == 0) == (netdev->tc != NULL));
/* Refreshes the qdisc state, looks up queue 'queue_id' with tc_find_queue(),
 * and has the active tc implementation's class_get callback describe it in
 * 'details'. */
1918 netdev_linux_get_queue(const struct netdev *netdev_,
1919 unsigned int queue_id, struct smap *details)
1921 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1924 error = tc_query_qdisc(netdev_);
1928 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
1930 ? netdev->tc->ops->class_get(netdev_, queue, details)
/* Configures queue 'queue_id' from 'details' through the active tc
 * implementation's class_set callback.
 *
 * A 'queue_id' at or above ops->n_queues, or an implementation without a
 * class_set callback, is handled by a separate branch and never reaches
 * class_set. */
1936 netdev_linux_set_queue(struct netdev *netdev_,
1937 unsigned int queue_id, const struct smap *details)
1939 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1942 error = tc_query_qdisc(netdev_);
1945 } else if (queue_id >= netdev->tc->ops->n_queues
1946 || !netdev->tc->ops->class_set) {
1950 return netdev->tc->ops->class_set(netdev_, queue_id, details);
/* Deletes queue 'queue_id' through the active tc implementation's
 * class_delete callback, after locating the queue with tc_find_queue().
 * An implementation without class_delete is handled by a separate branch. */
1954 netdev_linux_delete_queue(struct netdev *netdev_, unsigned int queue_id)
1956 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1959 error = tc_query_qdisc(netdev_);
1962 } else if (!netdev->tc->ops->class_delete) {
1965 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
1967 ? netdev->tc->ops->class_delete(netdev_, queue)
/* Retrieves statistics for queue 'queue_id' into '*stats'.
 *
 * stats->created is copied from the queue record; the active tc
 * implementation's class_get_stats callback is then called with the same
 * 'stats' and its result is returned.  An implementation without
 * class_get_stats is handled by a separate branch. */
1973 netdev_linux_get_queue_stats(const struct netdev *netdev_,
1974 unsigned int queue_id,
1975 struct netdev_queue_stats *stats)
1977 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1980 error = tc_query_qdisc(netdev_);
1983 } else if (!netdev->tc->ops->class_get_stats) {
1986 const struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
1990 stats->created = queue->created;
1991 return netdev->tc->ops->class_get_stats(netdev_, queue, stats);
/* Starts a NETLINK_ROUTE dump of the traffic classes of 'netdev' in '*dump',
 * using an RTM_GETTCLASS request with tcm_parent set to 0.  The request
 * buffer is released once the dump has been started.
 *
 * netdev_linux_dump_queue_stats() treats a false result as failure. */
1996 start_queue_dump(const struct netdev *netdev, struct nl_dump *dump)
1998 struct ofpbuf request;
1999 struct tcmsg *tcmsg;
2001 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
2005 tcmsg->tcm_parent = 0;
2006 nl_dump_start(dump, NETLINK_ROUTE, &request);
2007 ofpbuf_uninit(&request);
/* Walks every queue in netdev->tc->queues, obtains its configuration from the
 * active tc implementation's class_get callback, and hands the queue id and
 * details to 'cb' together with 'aux'.
 *
 * A single 'details' smap is reused: it is cleared before each queue and
 * destroyed at the end. */
2012 netdev_linux_dump_queues(const struct netdev *netdev_,
2013 netdev_dump_queues_cb *cb, void *aux)
2015 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2016 struct tc_queue *queue, *next_queue;
2017 struct smap details;
2021 error = tc_query_qdisc(netdev_);
2024 } else if (!netdev->tc->ops->class_get) {
2029 smap_init(&details);
/* The _SAFE iteration variant is used, presumably so that the current queue
 * may be removed while iterating. */
2030 HMAP_FOR_EACH_SAFE (queue, next_queue, hmap_node,
2031 &netdev->tc->queues) {
2032 smap_clear(&details);
2034 error = netdev->tc->ops->class_get(netdev_, queue, &details);
2036 (*cb)(queue->queue_id, &details, aux);
2041 smap_destroy(&details);
/* Dumps the kernel's traffic classes for 'netdev_' and passes each netlink
 * reply to the active tc implementation's class_dump_stats callback, which
 * receives 'cb' and 'aux'.
 *
 * Returns the error from nl_dump_done() if there is one, otherwise
 * 'last_error'. */
2047 netdev_linux_dump_queue_stats(const struct netdev *netdev_,
2048 netdev_dump_queue_stats_cb *cb, void *aux)
2050 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2051 struct nl_dump dump;
2056 error = tc_query_qdisc(netdev_);
2059 } else if (!netdev->tc->ops->class_dump_stats) {
2064 if (!start_queue_dump(netdev_, &dump)) {
2067 while (nl_dump_next(&dump, &msg)) {
2068 error = netdev->tc->ops->class_dump_stats(netdev_, &msg, cb, aux);
2074 error = nl_dump_done(&dump);
2075 return error ? error : last_error;
/* Stores the IPv4 address and netmask of 'netdev_' in '*address' and
 * '*netmask'.
 *
 * The values are read with SIOCGIFADDR and SIOCGIFNETMASK when VALID_IN4 is
 * not yet set, then served from the cached copy.  Returns EADDRNOTAVAIL when
 * the resulting address is INADDR_ANY, otherwise 0. */
2079 netdev_linux_get_in4(const struct netdev *netdev_,
2080 struct in_addr *address, struct in_addr *netmask)
2082 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2084 if (!(netdev->cache_valid & VALID_IN4)) {
2087 error = netdev_linux_get_ipv4(netdev_, &netdev->address,
2088 SIOCGIFADDR, "SIOCGIFADDR");
2093 error = netdev_linux_get_ipv4(netdev_, &netdev->netmask,
2094 SIOCGIFNETMASK, "SIOCGIFNETMASK");
2099 netdev->cache_valid |= VALID_IN4;
2101 *address = netdev->address;
2102 *netmask = netdev->netmask;
2103 return address->s_addr == INADDR_ANY ? EADDRNOTAVAIL : 0;
/* Assigns IPv4 'address' and 'netmask' to 'netdev_'.
 *
 * The address is set with SIOCSIFADDR and both values are recorded in the
 * device's cache (VALID_IN4).  The netmask is pushed with SIOCSIFNETMASK only
 * when 'address' is not INADDR_ANY. */
2107 netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
2108 struct in_addr netmask)
2110 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2113 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
2115 netdev->cache_valid |= VALID_IN4;
2116 netdev->address = address;
2117 netdev->netmask = netmask;
2118 if (address.s_addr != INADDR_ANY) {
2119 error = do_set_addr(netdev_, SIOCSIFNETMASK,
2120 "SIOCSIFNETMASK", netmask);
/* Parses one text 'line' consisting of 16 two-digit hexadecimal bytes, which
 * are stored in in6->s6_addr in order, followed by four hexadecimal fields
 * that are skipped and an interface name of at most 16 characters, which is
 * stored in 'ifname'.
 *
 * netdev_linux_get_in6() feeds it the lines of /proc/net/if_inet6. */
2127 parse_if_inet6_line(const char *line,
2128 struct in6_addr *in6, char ifname[16 + 1])
2130 uint8_t *s6 = in6->s6_addr;
2131 #define X8 "%2"SCNx8
2133 " "X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8
2134 "%*x %*x %*x %*x %16s\n",
2135 &s6[0], &s6[1], &s6[2], &s6[3],
2136 &s6[4], &s6[5], &s6[6], &s6[7],
2137 &s6[8], &s6[9], &s6[10], &s6[11],
2138 &s6[12], &s6[13], &s6[14], &s6[15],
2142 /* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address (if
2143 * 'in6' is non-null) and returns true. Otherwise, returns false.
 *
 * The address is looked up in /proc/net/if_inet6 by interface name and
 * cached on the device under VALID_IN6. */
2145 netdev_linux_get_in6(const struct netdev *netdev_, struct in6_addr *in6)
2147 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2148 if (!(netdev->cache_valid & VALID_IN6)) {
/* Default when no matching line is found. */
2152 netdev->in6 = in6addr_any;
2154 file = fopen("/proc/net/if_inet6", "r");
2156 const char *name = netdev_get_name(netdev_);
2157 while (fgets(line, sizeof line, file)) {
2158 struct in6_addr in6_tmp;
2159 char ifname[16 + 1];
2160 if (parse_if_inet6_line(line, &in6_tmp, ifname)
2161 && !strcmp(name, ifname))
2163 netdev->in6 = in6_tmp;
2169 netdev->cache_valid |= VALID_IN6;
/* Initializes '*sa' as an AF_INET socket address holding 'addr'.
 *
 * A zeroed struct sockaddr_in is built locally and then copied over a zeroed
 * '*sa' with memcpy(). */
2176 make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
2178 struct sockaddr_in sin;
2179 memset(&sin, 0, sizeof sin);
2180 sin.sin_family = AF_INET;
2181 sin.sin_addr = addr;
2184 memset(sa, 0, sizeof *sa);
2185 memcpy(sa, &sin, sizeof sin);
/* Issues ioctl 'ioctl_nr' (named 'ioctl_name' for logging) through
 * netdev_linux_do_ioctl() with an ifreq that names 'netdev' and carries
 * 'addr' as an AF_INET address in ifr_addr.  Returns that call's result. */
2189 do_set_addr(struct netdev *netdev,
2190 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
2193 ovs_strzcpy(ifr.ifr_name, netdev_get_name(netdev), sizeof ifr.ifr_name);
2194 make_in4_sockaddr(&ifr.ifr_addr, addr);
2196 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
2200 /* Adds 'router' as a default IP gateway.
 *
 * The route has destination and genmask INADDR_ANY, gateway 'router' and
 * flags RTF_UP | RTF_GATEWAY, and is installed with SIOCADDRT on
 * 'af_inet_sock'.  'netdev' itself is not used. */
2202 netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
2204 struct in_addr any = { INADDR_ANY };
2208 memset(&rt, 0, sizeof rt);
2209 make_in4_sockaddr(&rt.rt_dst, any);
2210 make_in4_sockaddr(&rt.rt_gateway, router);
2211 make_in4_sockaddr(&rt.rt_genmask, any);
2212 rt.rt_flags = RTF_UP | RTF_GATEWAY;
2213 error = ioctl(af_inet_sock, SIOCADDRT, &rt) < 0 ? errno : 0;
2215 VLOG_WARN("ioctl(SIOCADDRT): %s", ovs_strerror(error));
/* Looks for a route to 'host' in /proc/net/route.
 *
 * '*netdev_name' starts out NULL.  For a line whose route is up and whose
 * masked destination matches the masked 'host', '*next_hop' is set (0 for a
 * directly reachable host, otherwise the gateway) and '*netdev_name'
 * receives an xstrdup()'d copy of the interface name. */
2221 netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
2224 static const char fn[] = "/proc/net/route";
2229 *netdev_name = NULL;
2230 stream = fopen(fn, "r");
2231 if (stream == NULL) {
2232 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, ovs_strerror(errno));
2237 while (fgets(line, sizeof line, stream)) {
2240 ovs_be32 dest, gateway, mask;
2241 int refcnt, metric, mtu;
2242 unsigned int flags, use, window, irtt;
/* Each line must yield all 11 fields to be accepted. */
2245 "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
2247 iface, &dest, &gateway, &flags, &refcnt,
2248 &use, &metric, &mask, &mtu, &window, &irtt) != 11) {
2250 VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
2254 if (!(flags & RTF_UP)) {
2255 /* Skip routes that aren't up. */
2259 /* The output of 'dest', 'mask', and 'gateway' were given in
2260 * network byte order, so we don't need any endian
2261 * conversions here. */
2262 if ((dest & mask) == (host->s_addr & mask)) {
2264 /* The host is directly reachable. */
2265 next_hop->s_addr = 0;
2267 /* To reach the host, we must go through a gateway. */
2268 next_hop->s_addr = gateway;
2270 *netdev_name = xstrdup(iface);
/* Adds "driver_name", "driver_version" and "firmware_version" to 'smap' from
 * the device's driver information.
 *
 * The information is fetched with an ETHTOOL_GDRVINFO query into
 * netdev->drvinfo when VALID_DRVINFO is not yet set, and cached from then
 * on. */
2282 netdev_linux_get_status(const struct netdev *netdev_, struct smap *smap)
2284 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2287 if (!(netdev->cache_valid & VALID_DRVINFO)) {
/* netdev_linux_do_ethtool() takes a struct ethtool_cmd pointer, so the
 * drvinfo buffer is passed through a cast. */
2288 struct ethtool_cmd *cmd = (struct ethtool_cmd *) &netdev->drvinfo;
2290 COVERAGE_INC(netdev_get_ethtool);
2291 memset(&netdev->drvinfo, 0, sizeof netdev->drvinfo);
2292 error = netdev_linux_do_ethtool(netdev->up.name,
2295 "ETHTOOL_GDRVINFO");
2297 netdev->cache_valid |= VALID_DRVINFO;
2302 smap_add(smap, "driver_name", netdev->drvinfo.driver);
2303 smap_add(smap, "driver_version", netdev->drvinfo.version);
2304 smap_add(smap, "firmware_version", netdev->drvinfo.fw_version);
/* Status for internal devices: reports a fixed "driver_name" of
 * "openvswitch" in 'smap'.  'netdev' is not used. */
2310 netdev_internal_get_status(const struct netdev *netdev OVS_UNUSED,
2313 smap_add(smap, "driver_name", "openvswitch");
2317 /* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
2318 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
2319 * returns 0. Otherwise, it returns a positive errno value; in particular,
2320 * ENXIO indicates that there is no ARP table entry for 'ip' on 'netdev'. */
2322 netdev_linux_arp_lookup(const struct netdev *netdev,
2323 ovs_be32 ip, uint8_t mac[ETH_ADDR_LEN])
2326 struct sockaddr_in sin;
/* Build the request: protocol address 'ip', hardware type ARPHRD_ETHER and
 * the device name. */
2329 memset(&r, 0, sizeof r);
2330 memset(&sin, 0, sizeof sin);
2331 sin.sin_family = AF_INET;
2332 sin.sin_addr.s_addr = ip;
2334 memcpy(&r.arp_pa, &sin, sizeof sin);
2335 r.arp_ha.sa_family = ARPHRD_ETHER;
2337 ovs_strzcpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
2338 COVERAGE_INC(netdev_arp_lookup);
2339 retval = ioctl(af_inet_sock, SIOCGARP, &r) < 0 ? errno : 0;
2341 memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
/* ENXIO is not logged. */
2342 } else if (retval != ENXIO) {
2343 VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
2344 netdev_get_name(netdev), IP_ARGS(ip),
2345 ovs_strerror(retval));
/* Translates the NETDEV_UP and NETDEV_PROMISC bits of 'nd' into interface
 * flag bits; presumably IFF_UP and IFF_PROMISC, mirroring
 * iff_to_nd_flags(). */
2351 nd_to_iff_flags(enum netdev_flags nd)
2354 if (nd & NETDEV_UP) {
2357 if (nd & NETDEV_PROMISC) {
/* Translates interface flag bits in 'iff' into netdev flags, starting from an
 * empty set; IFF_PROMISC maps to NETDEV_PROMISC. */
2364 iff_to_nd_flags(int iff)
2366 enum netdev_flags nd = 0;
2370 if (iff & IFF_PROMISC) {
2371 nd |= NETDEV_PROMISC;
/* Clears the flags in 'off' and sets the flags in 'on' on 'netdev_',
 * reporting the previous flags in '*old_flagsp'.
 *
 * The previous flags come from the cached netdev->ifi_flags.  The device is
 * touched with set_flags() only when the computed flags differ from the
 * cached ones, after which the cache is re-read with get_flags(). */
2377 netdev_linux_update_flags(struct netdev *netdev_, enum netdev_flags off,
2378 enum netdev_flags on, enum netdev_flags *old_flagsp)
2380 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2381 int old_flags, new_flags;
2384 old_flags = netdev->ifi_flags;
2385 *old_flagsp = iff_to_nd_flags(old_flags);
2386 new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
2387 if (new_flags != old_flags) {
2388 error = set_flags(netdev_get_name(netdev_), new_flags);
2389 get_flags(netdev_, &netdev->ifi_flags);
/* Returns the 'change_seq' value stored for 'netdev'. */
2395 netdev_linux_change_seq(const struct netdev *netdev)
2397 return netdev_linux_cast(netdev)->change_seq;
/* Template for the netdev class definitions that follow.  NAME, CREATE,
 * GET_STATS, SET_STATS, GET_FEATURES and GET_STATUS are supplied per class;
 * the remaining entries are netdev_linux_* callbacks or NULL and are the same
 * for every class built from this macro. */
2400 #define NETDEV_LINUX_CLASS(NAME, CREATE, GET_STATS, SET_STATS, \
2401 GET_FEATURES, GET_STATUS) \
2405 netdev_linux_init, \
2407 netdev_linux_wait, \
2410 netdev_linux_destroy, \
2411 NULL, /* get_config */ \
2412 NULL, /* set_config */ \
2413 NULL, /* get_tunnel_config */ \
2415 netdev_linux_rx_open, \
2417 netdev_linux_send, \
2418 netdev_linux_send_wait, \
2420 netdev_linux_set_etheraddr, \
2421 netdev_linux_get_etheraddr, \
2422 netdev_linux_get_mtu, \
2423 netdev_linux_set_mtu, \
2424 netdev_linux_get_ifindex, \
2425 netdev_linux_get_carrier, \
2426 netdev_linux_get_carrier_resets, \
2427 netdev_linux_set_miimon_interval, \
2432 netdev_linux_set_advertisements, \
2434 netdev_linux_set_policing, \
2435 netdev_linux_get_qos_types, \
2436 netdev_linux_get_qos_capabilities, \
2437 netdev_linux_get_qos, \
2438 netdev_linux_set_qos, \
2439 netdev_linux_get_queue, \
2440 netdev_linux_set_queue, \
2441 netdev_linux_delete_queue, \
2442 netdev_linux_get_queue_stats, \
2443 netdev_linux_dump_queues, \
2444 netdev_linux_dump_queue_stats, \
2446 netdev_linux_get_in4, \
2447 netdev_linux_set_in4, \
2448 netdev_linux_get_in6, \
2449 netdev_linux_add_router, \
2450 netdev_linux_get_next_hop, \
2452 netdev_linux_arp_lookup, \
2454 netdev_linux_update_flags, \
2456 netdev_linux_change_seq \
/* Class created with netdev_linux_create: statistics come from
 * netdev_linux_get_stats(), there is no set_stats, and features and status
 * are read through the netdev_linux_* ethtool-backed functions. */
2459 const struct netdev_class netdev_linux_class =
2462 netdev_linux_create,
2463 netdev_linux_get_stats,
2464 NULL, /* set_stats */
2465 netdev_linux_get_features,
2466 netdev_linux_get_status);
/* Class created with netdev_linux_create_tap: statistics come from
 * netdev_tap_get_stats(), there is no set_stats, and features and status are
 * shared with netdev_linux_class. */
2468 const struct netdev_class netdev_tap_class =
2471 netdev_linux_create_tap,
2472 netdev_tap_get_stats,
2473 NULL, /* set_stats */
2474 netdev_linux_get_features,
2475 netdev_linux_get_status);
/* Class for internal devices: created with netdev_linux_create, with stats
 * read and written through the vport-based netdev_internal_* functions, no
 * get_features, and a fixed status from netdev_internal_get_status(). */
2477 const struct netdev_class netdev_internal_class =
2480 netdev_linux_create,
2481 netdev_internal_get_stats,
2482 netdev_internal_set_stats,
2483 NULL, /* get_features */
2484 netdev_internal_get_status);
/* Receive-handle callbacks (destroy, recv, wait, drain) for netdev-linux. */
2486 static const struct netdev_rx_class netdev_rx_linux_class = {
2487 netdev_rx_linux_destroy,
2488 netdev_rx_linux_recv,
2489 netdev_rx_linux_wait,
2490 netdev_rx_linux_drain,
2493 /* HTB traffic control class. */
/* NOTE(review): HTB_N_QUEUES is presumably the queue limit advertised for
 * HTB; confirm against its users. */
2495 #define HTB_N_QUEUES 0xf000
/* Qdisc-wide maximum rate. */
2499 unsigned int max_rate; /* In bytes/s. */
/* Per-queue parameters; 'tc_queue' is the embedded generic queue record. */
2503 struct tc_queue tc_queue;
2504 unsigned int min_rate; /* In bytes/s. */
2505 unsigned int max_rate; /* In bytes/s. */
2506 unsigned int burst; /* In bytes. */
2507 unsigned int priority; /* Lower values are higher priorities. */
/* Returns the struct htb that contains netdev's 'tc' as its 'tc' member.
 * Only meaningful while netdev->tc actually belongs to a struct htb. */
2511 htb_get__(const struct netdev *netdev_)
2513 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2514 return CONTAINER_OF(netdev->tc, struct htb, tc);
/* Allocates a struct htb, initializes its 'tc' with tc_ops_htb, records
 * 'max_rate', and attaches it to 'netdev_' as netdev->tc.
 *
 * NOTE(review): 'max_rate' arrives as uint64_t; confirm that htb->max_rate is
 * wide enough to hold it without truncation. */
2518 htb_install__(struct netdev *netdev_, uint64_t max_rate)
2520 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2523 htb = xmalloc(sizeof *htb);
2524 tc_init(&htb->tc, &tc_ops_htb);
2525 htb->max_rate = max_rate;
2527 netdev->tc = &htb->tc;
2530 /* Create an HTB qdisc.
2532 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1".
 *
 * Returns the result of the netlink transaction. */
2534 htb_setup_qdisc__(struct netdev *netdev)
2537 struct tc_htb_glob opt;
2538 struct ofpbuf request;
2539 struct tcmsg *tcmsg;
/* Remove whatever qdisc is currently installed; the result is ignored. */
2541 tc_del_qdisc(netdev);
2543 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2544 NLM_F_EXCL | NLM_F_CREATE, &request);
/* Root qdisc with handle 1:0. */
2548 tcmsg->tcm_handle = tc_make_handle(1, 0);
2549 tcmsg->tcm_parent = TC_H_ROOT;
2551 nl_msg_put_string(&request, TCA_KIND, "htb");
2553 memset(&opt, 0, sizeof opt);
2554 opt.rate2quantum = 10;
/* The HTB global options travel as TCA_HTB_INIT nested in TCA_OPTIONS. */
2558 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2559 nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
2560 nl_msg_end_nested(&request, opt_offset);
2562 return tc_transact(&request, NULL);
2565 /* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
2566 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
/* Sends an RTM_NEWTCLASS request (NLM_F_CREATE without NLM_F_EXCL) that sets
 * HTB class 'handle' under 'parent' on 'netdev' to the parameters in 'class'.
 * The device MTU is needed to build the rate tables; a warning is logged when
 * it cannot be obtained and another when the netlink transaction fails. */
2568 htb_setup_class__(struct netdev *netdev, unsigned int handle,
2569 unsigned int parent, struct htb_class *class)
2572 struct tc_htb_opt opt;
2573 struct ofpbuf request;
2574 struct tcmsg *tcmsg;
2578 error = netdev_get_mtu(netdev, &mtu);
2580 VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU",
2581 netdev_get_name(netdev));
/* 'rate' is filled from min_rate and 'ceil' from max_rate; both buffer sizes
 * are computed from the same 'burst' value. */
2585 memset(&opt, 0, sizeof opt);
2586 tc_fill_rate(&opt.rate, class->min_rate, mtu);
2587 tc_fill_rate(&opt.ceil, class->max_rate, mtu);
2588 opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
2589 opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
2590 opt.prio = class->priority;
2592 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
2596 tcmsg->tcm_handle = handle;
2597 tcmsg->tcm_parent = parent;
/* TCA_OPTIONS nests the HTB parameters plus one rate table for 'rate' and one
 * for 'ceil'. */
2599 nl_msg_put_string(&request, TCA_KIND, "htb");
2600 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2601 nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
2602 tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
2603 tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
2604 nl_msg_end_nested(&request, opt_offset);
2606 error = tc_transact(&request, NULL);
2608 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
2609 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
2610 netdev_get_name(netdev),
2611 tc_get_major(handle), tc_get_minor(handle),
2612 tc_get_major(parent), tc_get_minor(parent),
2613 class->min_rate, class->max_rate,
2614 class->burst, class->priority, ovs_strerror(error));
2619 /* Parses Netlink attributes in 'nl_options' for HTB parameters and stores a
2620 * description of them into 'class'. The description complies with the
2621 * specification given in the vswitch database documentation for linux-htb. */
/* Extracts the mandatory TCA_HTB_PARMS attribute (at least
 * sizeof(struct tc_htb_opt) bytes) from the nested attributes in 'nl_options'
 * and copies its rate, ceiling, burst and priority into 'class'. A warning is
 * logged when the nested attributes do not match the policy. */
2624 htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
2626 static const struct nl_policy tca_htb_policy[] = {
2627 [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
2628 .min_len = sizeof(struct tc_htb_opt) },
2631 struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
2632 const struct tc_htb_opt *htb;
2634 if (!nl_parse_nested(nl_options, tca_htb_policy,
2635 attrs, ARRAY_SIZE(tca_htb_policy))) {
2636 VLOG_WARN_RL(&rl, "failed to parse HTB class options");
2640 htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
2641 class->min_rate = htb->rate.rate;
2642 class->max_rate = htb->ceil.rate;
/* The kernel reports the buffer in ticks; convert it back to bytes at the
 * class's guaranteed rate. */
2643 class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
2644 class->priority = htb->prio;
/* Parses the class message 'tcmsg' with tc_parse_class(). When 'queue_id' is
 * nonnull and the class handle is 1:<minor> with 0 < minor <= HTB_N_QUEUES,
 * stores minor - 1 into '*queue_id' (what happens for other handles is not
 * visible in this listing). When 'options' is nonnull, fills it in from the
 * message's options via htb_parse_tca_options__(). 'stats' is passed
 * through to tc_parse_class(). */
2649 htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2650 struct htb_class *options,
2651 struct netdev_queue_stats *stats)
2653 struct nlattr *nl_options;
2654 unsigned int handle;
2657 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2658 if (!error && queue_id) {
2659 unsigned int major = tc_get_major(handle);
2660 unsigned int minor = tc_get_minor(handle);
2661 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2662 *queue_id = minor - 1;
2667 if (!error && options) {
2668 error = htb_parse_tca_options__(nl_options, options);
/* Fills in 'hc' with the qdisc-level HTB parameters taken from 'details'.
 *
 * "max-rate" is given in bits/s and stored in bytes/s. When it is absent or
 * works out to zero, the rate falls back to the link speed derived from the
 * current features of 'netdev' (netdev_features_to_bps() is passed 100 Mbps
 * as its second argument, presumably the value used when the speed is
 * unknown). 'hc->min_rate' is set equal to 'hc->max_rate'. */
2674 htb_parse_qdisc_details__(struct netdev *netdev,
2675 const struct smap *details, struct htb_class *hc)
2677 const char *max_rate_s;
2679 max_rate_s = smap_get(details, "max-rate");
2680 hc->max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
2681 if (!hc->max_rate) {
2682 enum netdev_features current;
/* Only the "current" feature set is requested; the other outputs are NULL. */
2684 netdev_get_features(netdev, &current, NULL, NULL, NULL);
2685 hc->max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
2687 hc->min_rate = hc->max_rate;
/* Fills in 'hc' with per-queue HTB parameters taken from 'details'.
 *
 * - "min-rate" (bits/s) is stored in bytes/s, raised to at least the device
 *   MTU and capped at the qdisc's max_rate.
 * - "max-rate" (bits/s) is stored in bytes/s, raised to at least min_rate and
 *   capped at the qdisc's max_rate. The value used when the key is absent is
 *   on a line not shown in this listing.
 * - "burst" is divided by 8 and raised to at least MTU + 64.
 * - "priority" defaults to 0.
 *
 * A warning is logged when the MTU of 'netdev' cannot be obtained.
 *
 * NOTE(review): the strtoull() results are narrowed into unsigned int fields,
 * so very large configured values would be truncated -- confirm whether that
 * matters for the supported link speeds. */
2693 htb_parse_class_details__(struct netdev *netdev,
2694 const struct smap *details, struct htb_class *hc)
2696 const struct htb *htb = htb_get__(netdev);
2697 const char *min_rate_s = smap_get(details, "min-rate");
2698 const char *max_rate_s = smap_get(details, "max-rate");
2699 const char *burst_s = smap_get(details, "burst");
2700 const char *priority_s = smap_get(details, "priority");
2703 error = netdev_get_mtu(netdev, &mtu);
2705 VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU",
2706 netdev_get_name(netdev));
2710 /* HTB requires at least an mtu sized min-rate to send any traffic even
2711 * on uncongested links. */
2712 hc->min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
2713 hc->min_rate = MAX(hc->min_rate, mtu);
2714 hc->min_rate = MIN(hc->min_rate, htb->max_rate);
2717 hc->max_rate = (max_rate_s
2718 ? strtoull(max_rate_s, NULL, 10) / 8
2720 hc->max_rate = MAX(hc->max_rate, hc->min_rate);
2721 hc->max_rate = MIN(hc->max_rate, htb->max_rate);
2725 * According to hints in the documentation that I've read, it is important
2726 * that 'burst' be at least as big as the largest frame that might be
2727 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
2728 * but having it a bit too small is a problem. Since netdev_get_mtu()
2729 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
2730 * the MTU. We actually add 64, instead of 14, as a guard against
2731 * additional headers get tacked on somewhere that we're not aware of. */
2732 hc->burst = burst_s ? strtoull(burst_s, NULL, 10) / 8 : 0;
2733 hc->burst = MAX(hc->burst, mtu + 64);
2736 hc->priority = priority_s ? strtoul(priority_s, NULL, 10) : 0;
/* Queries the kernel for class 'handle' (child of 'parent') on 'netdev' with
 * tc_query_class(), parses the reply with htb_parse_tcmsg__() into 'options'
 * and 'stats', and frees the reply. No queue ID is requested from the
 * parser. */
2742 htb_query_class__(const struct netdev *netdev, unsigned int handle,
2743 unsigned int parent, struct htb_class *options,
2744 struct netdev_queue_stats *stats)
2746 struct ofpbuf *reply;
2749 error = tc_query_class(netdev, handle, parent, &reply);
2751 error = htb_parse_tcmsg__(reply, NULL, options, stats);
2752 ofpbuf_delete(reply);
/* Installs HTB on 'netdev': creates the root qdisc, then sets up class 1:fffe
 * directly under it using the qdisc-level parameters parsed from 'details',
 * and finally records the resulting max_rate with htb_install__(). The error
 * checks between these steps are on lines not shown in this listing. */
2758 htb_tc_install(struct netdev *netdev, const struct smap *details)
2762 error = htb_setup_qdisc__(netdev);
2764 struct htb_class hc;
2766 htb_parse_qdisc_details__(netdev, details, &hc);
2767 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2768 tc_make_handle(1, 0), &hc);
2770 htb_install__(netdev, hc.max_rate);
/* Returns the 'struct htb_class' that embeds 'queue' as its 'tc_queue'
 * member. The cast is unchecked. */
2776 static struct htb_class *
2777 htb_class_cast__(const struct tc_queue *queue)
2779 return CONTAINER_OF(queue, struct htb_class, tc_queue);
/* Records the parameters in 'hc' as the cached state of queue 'queue_id' on
 * 'netdev'. An existing queue record found by tc_find_queue__() is reused;
 * otherwise a new one is allocated, stamped with the current time, and
 * inserted into the qdisc's queue map. */
2783 htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
2784 const struct htb_class *hc)
2786 struct htb *htb = htb_get__(netdev);
2787 size_t hash = hash_int(queue_id, 0);
2788 struct tc_queue *queue;
2789 struct htb_class *hcp;
2791 queue = tc_find_queue__(netdev, queue_id, hash);
2793 hcp = htb_class_cast__(queue);
2795 hcp = xmalloc(sizeof *hcp);
2796 queue = &hcp->tc_queue;
2797 queue->queue_id = queue_id;
2798 queue->created = time_msec();
2799 hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
/* Copy the four tunables; the embedded tc_queue is left as set up above. */
2802 hcp->min_rate = hc->min_rate;
2803 hcp->max_rate = hc->max_rate;
2804 hcp->burst = hc->burst;
2805 hcp->priority = hc->priority;
/* Rebuilds the in-memory HTB state for 'netdev' from what the kernel already
 * has: reads class 1:fffe to learn the qdisc's max_rate, installs a fresh
 * 'struct htb', then walks a class dump and caches every class that parses
 * as an HTB queue. 'nlmsg' is unused.
 *
 * NOTE(review): the result of htb_query_class__() is not checked on the
 * visible line, yet 'hc.max_rate' is used immediately afterwards; verify that
 * 'hc.max_rate' is given a defined value beforehand. */
2809 htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
2812 struct nl_dump dump;
2813 struct htb_class hc;
2815 /* Get qdisc options. */
2817 htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
2818 htb_install__(netdev, hc.max_rate);
2821 if (!start_queue_dump(netdev, &dump)) {
2824 while (nl_dump_next(&dump, &msg)) {
2825 unsigned int queue_id;
/* A zero return means the message parsed cleanly. */
2827 if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
2828 htb_update_queue__(netdev, queue_id, &hc);
2831 nl_dump_done(&dump);
/* Tears down the 'struct htb' that contains 'tc': every cached queue record
 * is unlinked from the queue map using the removal-safe iterator. Any
 * freeing is on lines not shown in this listing. */
2837 htb_tc_destroy(struct tc *tc)
2839 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
2840 struct htb_class *hc, *next;
2842 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &htb->tc.queues) {
2843 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
/* Reports the qdisc configuration of 'netdev' into 'details': adds
 * "max-rate", converting the cached bytes/s value to bits/s (multiplied by 8
 * in 64-bit arithmetic). */
2851 htb_qdisc_get(const struct netdev *netdev, struct smap *details)
2853 const struct htb *htb = htb_get__(netdev);
2854 smap_add_format(details, "max-rate", "%llu", 8ULL * htb->max_rate);
/* Reconfigures the HTB qdisc on 'netdev' from 'details' by re-sending class
 * 1:fffe with the newly parsed parameters, then updates the cached max_rate.
 * The error check between the two steps is not shown in this listing. */
2859 htb_qdisc_set(struct netdev *netdev, const struct smap *details)
2861 struct htb_class hc;
2864 htb_parse_qdisc_details__(netdev, details, &hc);
2865 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2866 tc_make_handle(1, 0), &hc);
2868 htb_get__(netdev)->max_rate = hc.max_rate;
/* Reports the cached configuration of 'queue' into 'details'. Rates and the
 * burst are multiplied by 8 before being reported. "max-rate" is added only
 * when it differs from "min-rate". Whether "burst" and "priority" are also
 * conditional depends on lines not shown in this listing. 'netdev' is
 * unused. */
2874 htb_class_get(const struct netdev *netdev OVS_UNUSED,
2875 const struct tc_queue *queue, struct smap *details)
2877 const struct htb_class *hc = htb_class_cast__(queue);
2879 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
2880 if (hc->min_rate != hc->max_rate) {
2881 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
2883 smap_add_format(details, "burst", "%llu", 8ULL * hc->burst);
2885 smap_add_format(details, "priority", "%u", hc->priority);
/* Creates or updates queue 'queue_id' on 'netdev' from 'details': parses the
 * details, programs kernel class 1:<queue_id + 1> under parent 1:fffe, and
 * then refreshes the cached queue record. The error checks between the steps
 * are not shown in this listing. */
2891 htb_class_set(struct netdev *netdev, unsigned int queue_id,
2892 const struct smap *details)
2894 struct htb_class hc;
2897 error = htb_parse_class_details__(netdev, details, &hc);
2902 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
2903 tc_make_handle(1, 0xfffe), &hc);
2908 htb_update_queue__(netdev, queue_id, &hc);
/* Deletes kernel class 1:<queue_id + 1> for 'queue' on 'netdev' and unlinks
 * the cached record from the qdisc's queue map. The condition guarding the
 * removal, and any freeing, are not shown in this listing. */
2913 htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
2915 struct htb_class *hc = htb_class_cast__(queue);
2916 struct htb *htb = htb_get__(netdev);
2919 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
2921 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
/* Fetches the kernel statistics of 'queue' on 'netdev' into 'stats' by
 * querying class 1:<queue_id + 1> under parent 1:fffe. No class options are
 * requested. Returns whatever htb_query_class__() returns. */
2928 htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
2929 struct netdev_queue_stats *stats)
2931 return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
2932 tc_make_handle(1, 0xfffe), NULL, stats);
/* Handles one class message 'nlmsg' from a dump: parses its handle and
 * statistics and, when the handle is 1:<minor> with 0 < minor <=
 * HTB_N_QUEUES, invokes 'cb' with queue ID minor - 1, the statistics, and
 * 'aux'. Other classes are not passed to the callback. 'netdev' is unused. */
2936 htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
2937 const struct ofpbuf *nlmsg,
2938 netdev_dump_queue_stats_cb *cb, void *aux)
2940 struct netdev_queue_stats stats;
2941 unsigned int handle, major, minor;
2944 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
2949 major = tc_get_major(handle);
2950 minor = tc_get_minor(handle);
2951 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2952 (*cb)(minor - 1, &stats, aux);
/* Operations table that binds the Linux "htb" qdisc to the OVS QoS type
 * "linux-htb". The entries between n_queues and the statistics callbacks
 * are not shown in this listing. */
2957 static const struct tc_ops tc_ops_htb = {
2958 "htb", /* linux_name */
2959 "linux-htb", /* ovs_name */
2960 HTB_N_QUEUES, /* n_queues */
2969 htb_class_get_stats,
2970 htb_class_dump_stats
2973 /* "linux-hfsc" traffic control class. */
/* Largest class minor number that is treated as a queue: a class 1:<minor>
 * with 0 < minor <= HFSC_N_QUEUES is reported as queue ID minor - 1 (see
 * hfsc_parse_tcmsg__() and hfsc_class_dump_stats()). */
2975 #define HFSC_N_QUEUES 0xf000
/* Presumably the first member of 'struct hfsc_class' (the struct's opening
 * line is not shown); hfsc_class_cast__() recovers the record from it. */
2983 struct tc_queue tc_queue;
/* Returns the 'struct hfsc' that contains the 'tc' object currently installed
 * on 'netdev_'. The cast is unchecked, so it is only meaningful when the
 * installed tc was created by hfsc_install__(). */
2988 static struct hfsc *
2989 hfsc_get__(const struct netdev *netdev_)
2991 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2992 return CONTAINER_OF(netdev->tc, struct hfsc, tc);
/* Returns the 'struct hfsc_class' that embeds 'queue' as its 'tc_queue'
 * member. The cast is unchecked. */
2995 static struct hfsc_class *
2996 hfsc_class_cast__(const struct tc_queue *queue)
2998 return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
/* Allocates a new 'struct hfsc', initializes its embedded tc with
 * tc_ops_hfsc, records 'max_rate' in it, and makes it the tc of 'netdev_'. */
3002 hfsc_install__(struct netdev *netdev_, uint32_t max_rate)
3004 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3007 hfsc = xmalloc(sizeof *hfsc);
3008 tc_init(&hfsc->tc, &tc_ops_hfsc);
3009 hfsc->max_rate = max_rate;
3010 netdev->tc = &hfsc->tc;
/* Records the rates in 'hc' as the cached state of queue 'queue_id' on
 * 'netdev'. An existing queue record found by tc_find_queue__() is reused;
 * otherwise a new one is allocated, stamped with the current time, and
 * inserted into the qdisc's queue map. */
3014 hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
3015 const struct hfsc_class *hc)
3019 struct hfsc_class *hcp;
3020 struct tc_queue *queue;
3022 hfsc = hfsc_get__(netdev);
3023 hash = hash_int(queue_id, 0);
3025 queue = tc_find_queue__(netdev, queue_id, hash);
3027 hcp = hfsc_class_cast__(queue);
3029 hcp = xmalloc(sizeof *hcp);
3030 queue = &hcp->tc_queue;
3031 queue->queue_id = queue_id;
3032 queue->created = time_msec();
3033 hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
3036 hcp->min_rate = hc->min_rate;
3037 hcp->max_rate = hc->max_rate;
/* Extracts the RSC, FSC and USC service curves from the nested attributes in
 * 'nl_options' and translates them into 'class'. Only the configuration
 * shape that this file itself programs is accepted:
 *
 * - every curve must be linear (m1 and d both zero),
 * - RSC and FSC must have the same m2, and
 * - RSC's m2 must not exceed USC's m2.
 *
 * Each violation logs a rate-limited warning; what is returned in those cases
 * is on lines not shown in this listing. On success min_rate is FSC's m2 and
 * max_rate is USC's m2. */
3041 hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
3043 const struct tc_service_curve *rsc, *fsc, *usc;
3044 static const struct nl_policy tca_hfsc_policy[] = {
3046 .type = NL_A_UNSPEC,
3048 .min_len = sizeof(struct tc_service_curve),
3051 .type = NL_A_UNSPEC,
3053 .min_len = sizeof(struct tc_service_curve),
3056 .type = NL_A_UNSPEC,
3058 .min_len = sizeof(struct tc_service_curve),
3061 struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];
3063 if (!nl_parse_nested(nl_options, tca_hfsc_policy,
3064 attrs, ARRAY_SIZE(tca_hfsc_policy))) {
3065 VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
3069 rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
3070 fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
3071 usc = nl_attr_get(attrs[TCA_HFSC_USC]);
3073 if (rsc->m1 != 0 || rsc->d != 0 ||
3074 fsc->m1 != 0 || fsc->d != 0 ||
3075 usc->m1 != 0 || usc->d != 0) {
3076 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3077 "Non-linear service curves are not supported.");
3081 if (rsc->m2 != fsc->m2) {
3082 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3083 "Real-time service curves are not supported ");
3087 if (rsc->m2 > usc->m2) {
3088 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3089 "Min-rate service curve is greater than "
3090 "the max-rate service curve.");
3094 class->min_rate = fsc->m2;
3095 class->max_rate = usc->m2;
/* Parses the class message 'tcmsg' with tc_parse_class(). For a class handle
 * 1:<minor> with 0 < minor <= HFSC_N_QUEUES, stores minor - 1 into
 * '*queue_id'. Also fills in 'options' via hfsc_parse_tca_options__().
 * 'stats' is passed through to tc_parse_class(). The conditions guarding
 * each step are on lines not shown in this listing; hfsc_query_class__()
 * passes a null 'queue_id'. */
3100 hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
3101 struct hfsc_class *options,
3102 struct netdev_queue_stats *stats)
3105 unsigned int handle;
3106 struct nlattr *nl_options;
3108 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
3114 unsigned int major, minor;
3116 major = tc_get_major(handle);
3117 minor = tc_get_minor(handle);
3118 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3119 *queue_id = minor - 1;
3126 error = hfsc_parse_tca_options__(nl_options, options);
/* Queries the kernel for class 'handle' (child of 'parent') on 'netdev' with
 * tc_query_class(), parses the reply with hfsc_parse_tcmsg__() into 'options'
 * and 'stats', and frees the reply. No queue ID is requested from the
 * parser. */
3133 hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
3134 unsigned int parent, struct hfsc_class *options,
3135 struct netdev_queue_stats *stats)
3138 struct ofpbuf *reply;
3140 error = tc_query_class(netdev, handle, parent, &reply);
3145 error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
3146 ofpbuf_delete(reply);
/* Fills in 'class' with the qdisc-level HFSC parameters taken from 'details'.
 *
 * "max-rate" is given in bits/s and converted to bytes/s. A fallback rate is
 * derived from the current features of 'netdev' (netdev_features_to_bps() is
 * passed 100 Mbps as its second argument, presumably the value used when the
 * speed is unknown); the condition selecting the fallback is on a line not
 * shown in this listing. Both 'min_rate' and 'max_rate' of 'class' are set
 * to the resulting value. */
3151 hfsc_parse_qdisc_details__(struct netdev *netdev, const struct smap *details,
3152 struct hfsc_class *class)
3155 const char *max_rate_s;
3157 max_rate_s = smap_get(details, "max-rate");
3158 max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
3161 enum netdev_features current;
/* Only the "current" feature set is requested; the other outputs are NULL. */
3163 netdev_get_features(netdev, &current, NULL, NULL, NULL);
3164 max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
3167 class->min_rate = max_rate;
3168 class->max_rate = max_rate;
/* Fills in 'class' with per-queue HFSC rates taken from 'details'.
 *
 * - "min-rate" (bits/s) is converted to bytes/s, raised to at least 1 and
 *   capped at the qdisc's max_rate.
 * - "max-rate" (bits/s) is converted to bytes/s, raised to at least min_rate
 *   and capped at the qdisc's max_rate. The value used when the key is
 *   absent is on a line not shown in this listing.
 *
 * NOTE(review): the strtoull() results are narrowed into uint32_t locals, so
 * very large configured values would be truncated -- confirm whether that
 * matters for the supported link speeds. */
3172 hfsc_parse_class_details__(struct netdev *netdev,
3173 const struct smap *details,
3174 struct hfsc_class * class)
3176 const struct hfsc *hfsc;
3177 uint32_t min_rate, max_rate;
3178 const char *min_rate_s, *max_rate_s;
3180 hfsc = hfsc_get__(netdev);
3181 min_rate_s = smap_get(details, "min-rate");
3182 max_rate_s = smap_get(details, "max-rate");
3184 min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
3185 min_rate = MAX(min_rate, 1);
3186 min_rate = MIN(min_rate, hfsc->max_rate);
3188 max_rate = (max_rate_s
3189 ? strtoull(max_rate_s, NULL, 10) / 8
3191 max_rate = MAX(max_rate, min_rate);
3192 max_rate = MIN(max_rate, hfsc->max_rate);
3194 class->min_rate = min_rate;
3195 class->max_rate = max_rate;
3200 /* Create an HFSC qdisc.
3202 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
/* Builds an RTM_NEWQDISC request (NLM_F_EXCL | NLM_F_CREATE) for a root qdisc
 * of kind "hfsc" with handle 1:0 on 'netdev', and returns the result of
 * tc_transact(). Unlike the HTB variant, the options struct is sent directly
 * as TCA_OPTIONS rather than nested. */
3204 hfsc_setup_qdisc__(struct netdev * netdev)
3206 struct tcmsg *tcmsg;
3207 struct ofpbuf request;
3208 struct tc_hfsc_qopt opt;
/* Presumably clears any qdisc already on the device, so that the exclusive
 * create below can succeed. */
3210 tc_del_qdisc(netdev);
3212 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3213 NLM_F_EXCL | NLM_F_CREATE, &request);
3219 tcmsg->tcm_handle = tc_make_handle(1, 0);
3220 tcmsg->tcm_parent = TC_H_ROOT;
3222 memset(&opt, 0, sizeof opt);
3225 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3226 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
3228 return tc_transact(&request, NULL);
3231 /* Create an HFSC class.
3233 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
3234 * sc rate <min_rate> ul rate <max_rate>" */
/* Sends an RTM_NEWTCLASS request (NLM_F_CREATE without NLM_F_EXCL) for HFSC
 * class 'handle' under 'parent' on 'netdev'. The same curve, whose m2 is
 * 'class->min_rate', is sent as both the RSC and FSC attributes; a curve
 * whose m2 is 'class->max_rate' is sent as USC. A rate-limited warning is
 * logged when the transaction fails. */
3236 hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
3237 unsigned int parent, struct hfsc_class *class)
3241 struct tcmsg *tcmsg;
3242 struct ofpbuf request;
3243 struct tc_service_curve min, max;
3245 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
3251 tcmsg->tcm_handle = handle;
3252 tcmsg->tcm_parent = parent;
3256 min.m2 = class->min_rate;
3260 max.m2 = class->max_rate;
3262 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3263 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3264 nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
3265 nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
3266 nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
3267 nl_msg_end_nested(&request, opt_offset);
3269 error = tc_transact(&request, NULL);
3271 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
3272 "min-rate %ubps, max-rate %ubps (%s)",
3273 netdev_get_name(netdev),
3274 tc_get_major(handle), tc_get_minor(handle),
3275 tc_get_major(parent), tc_get_minor(parent),
3276 class->min_rate, class->max_rate, ovs_strerror(error));
/* Installs HFSC on 'netdev': creates the root qdisc, then sets up class
 * 1:fffe directly under it using the qdisc-level parameters parsed from
 * 'details', and finally records the resulting max_rate with
 * hfsc_install__(). The error checks between these steps are on lines not
 * shown in this listing. */
3283 hfsc_tc_install(struct netdev *netdev, const struct smap *details)
3286 struct hfsc_class class;
3288 error = hfsc_setup_qdisc__(netdev);
3294 hfsc_parse_qdisc_details__(netdev, details, &class);
3295 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3296 tc_make_handle(1, 0), &class);
3302 hfsc_install__(netdev, class.max_rate);
/* Rebuilds the in-memory HFSC state for 'netdev' from what the kernel already
 * has: reads class 1:fffe to learn the qdisc's max_rate, installs a fresh
 * 'struct hfsc', then walks a class dump and caches every class that parses
 * as an HFSC queue. 'nlmsg' is unused.
 *
 * NOTE(review): the result of hfsc_query_class__() is not checked on the
 * visible line, yet 'hc.max_rate' is used immediately afterwards; verify that
 * 'hc.max_rate' is given a defined value beforehand. */
3307 hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3310 struct nl_dump dump;
3311 struct hfsc_class hc;
3314 hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
3315 hfsc_install__(netdev, hc.max_rate);
3317 if (!start_queue_dump(netdev, &dump)) {
3321 while (nl_dump_next(&dump, &msg)) {
3322 unsigned int queue_id;
/* A zero return means the message parsed cleanly. */
3324 if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3325 hfsc_update_queue__(netdev, queue_id, &hc);
3329 nl_dump_done(&dump);
/* Tears down the 'struct hfsc' that contains 'tc': every cached queue record
 * is unlinked from the queue map using the removal-safe iterator. Any
 * freeing is on lines not shown in this listing. */
3334 hfsc_tc_destroy(struct tc *tc)
3337 struct hfsc_class *hc, *next;
3339 hfsc = CONTAINER_OF(tc, struct hfsc, tc);
3341 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
3342 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
/* Reports the qdisc configuration of 'netdev' into 'details': adds
 * "max-rate", converting the cached bytes/s value to bits/s (multiplied by 8
 * in 64-bit arithmetic). */
3351 hfsc_qdisc_get(const struct netdev *netdev, struct smap *details)
3353 const struct hfsc *hfsc;
3354 hfsc = hfsc_get__(netdev);
3355 smap_add_format(details, "max-rate", "%llu", 8ULL * hfsc->max_rate);
/* Reconfigures the HFSC qdisc on 'netdev' from 'details' by re-sending class
 * 1:fffe with the newly parsed parameters, then updates the cached max_rate.
 * The error check between the two steps is not shown in this listing. */
3360 hfsc_qdisc_set(struct netdev *netdev, const struct smap *details)
3363 struct hfsc_class class;
3365 hfsc_parse_qdisc_details__(netdev, details, &class);
3366 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3367 tc_make_handle(1, 0), &class);
3370 hfsc_get__(netdev)->max_rate = class.max_rate;
/* Reports the cached configuration of 'queue' into 'details', converting
 * bytes/s to bits/s. "max-rate" is added only when it differs from
 * "min-rate". 'netdev' is unused. */
3377 hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
3378 const struct tc_queue *queue, struct smap *details)
3380 const struct hfsc_class *hc;
3382 hc = hfsc_class_cast__(queue);
3383 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
3384 if (hc->min_rate != hc->max_rate) {
3385 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
/* Creates or updates queue 'queue_id' on 'netdev' from 'details': parses the
 * details, programs kernel class 1:<queue_id + 1> under parent 1:fffe, and
 * then refreshes the cached queue record. The error checks between the steps
 * are not shown in this listing. */
3391 hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
3392 const struct smap *details)
3395 struct hfsc_class class;
3397 error = hfsc_parse_class_details__(netdev, details, &class);
3402 error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3403 tc_make_handle(1, 0xfffe), &class);
3408 hfsc_update_queue__(netdev, queue_id, &class);
/* Deletes kernel class 1:<queue_id + 1> for 'queue' on 'netdev' and unlinks
 * the cached record from the qdisc's queue map. The condition guarding the
 * removal, and any freeing, are not shown in this listing. */
3413 hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
3417 struct hfsc_class *hc;
3419 hc = hfsc_class_cast__(queue);
3420 hfsc = hfsc_get__(netdev);
3422 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3424 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
/* Fetches the kernel statistics of 'queue' on 'netdev' into 'stats' by
 * querying class 1:<queue_id + 1> under parent 1:fffe. No class options are
 * requested. Returns whatever hfsc_query_class__() returns. */
3431 hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3432 struct netdev_queue_stats *stats)
3434 return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3435 tc_make_handle(1, 0xfffe), NULL, stats);
/* Handles one class message 'nlmsg' from a dump: parses its handle and
 * statistics and, when the handle is 1:<minor> with 0 < minor <=
 * HFSC_N_QUEUES, invokes 'cb' with queue ID minor - 1, the statistics, and
 * 'aux'. Other classes are not passed to the callback. 'netdev' is unused. */
3439 hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3440 const struct ofpbuf *nlmsg,
3441 netdev_dump_queue_stats_cb *cb, void *aux)
3443 struct netdev_queue_stats stats;
3444 unsigned int handle, major, minor;
3447 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3452 major = tc_get_major(handle);
3453 minor = tc_get_minor(handle);
3454 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3455 (*cb)(minor - 1, &stats, aux);
/* Operations table that binds the Linux "hfsc" qdisc to the OVS QoS type
 * "linux-hfsc". Every callback slot is filled with the matching hfsc_*
 * function defined in this file. */
3460 static const struct tc_ops tc_ops_hfsc = {
3461 "hfsc", /* linux_name */
3462 "linux-hfsc", /* ovs_name */
3463 HFSC_N_QUEUES, /* n_queues */
3464 hfsc_tc_install, /* tc_install */
3465 hfsc_tc_load, /* tc_load */
3466 hfsc_tc_destroy, /* tc_destroy */
3467 hfsc_qdisc_get, /* qdisc_get */
3468 hfsc_qdisc_set, /* qdisc_set */
3469 hfsc_class_get, /* class_get */
3470 hfsc_class_set, /* class_set */
3471 hfsc_class_delete, /* class_delete */
3472 hfsc_class_get_stats, /* class_get_stats */
3473 hfsc_class_dump_stats /* class_dump_stats */
3476 /* "linux-default" traffic control class.
3478 * This class represents the default, unnamed Linux qdisc. It corresponds to
3479 * the "" (empty string) QoS type in the OVS database. */
/* Points the tc of 'netdev_' at a single function-local static 'struct tc'
 * bound to tc_ops_default. No memory is allocated, so the same object is
 * used for every netdev that this function is called on. */
3482 default_install__(struct netdev *netdev_)
3484 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3485 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_default);
3487 /* Nothing but a tc class implementation is allowed to write to a tc. This
3488 * class never does that, so we can legitimately use a const tc object. */
3489 netdev->tc = CONST_CAST(struct tc *, &tc);
/* tc_install hook for the default qdisc: ignores 'details' and simply
 * attaches the shared default tc object to 'netdev'. */
3493 default_tc_install(struct netdev *netdev,
3494 const struct smap *details OVS_UNUSED)
3496 default_install__(netdev);
/* tc_load hook for the default qdisc: ignores 'nlmsg' and simply attaches the
 * shared default tc object to 'netdev'. */
3501 default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3503 default_install__(netdev);
/* Operations table for the default qdisc. It has no Linux qdisc name, and
 * every callback from tc_destroy onward is NULL. The entries between
 * linux_name and tc_destroy are not shown in this listing. */
3507 static const struct tc_ops tc_ops_default = {
3508 NULL, /* linux_name */
3513 NULL, /* tc_destroy */
3514 NULL, /* qdisc_get */
3515 NULL, /* qdisc_set */
3516 NULL, /* class_get */
3517 NULL, /* class_set */
3518 NULL, /* class_delete */
3519 NULL, /* class_get_stats */
3520 NULL /* class_dump_stats */
3523 /* "linux-other" traffic control class. */
/* tc_load hook for the "linux-other" class: ignores 'nlmsg' and points the tc
 * of 'netdev_' at a single function-local static 'struct tc' bound to
 * tc_ops_other. */
3528 other_tc_load(struct netdev *netdev_, struct ofpbuf *nlmsg OVS_UNUSED)
3530 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3531 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_other);
3533 /* Nothing but a tc class implementation is allowed to write to a tc. This
3534 * class never does that, so we can legitimately use a const tc object. */
3535 netdev->tc = CONST_CAST(struct tc *, &tc);
/* Operations table for the OVS QoS type "linux-other". It has no Linux qdisc
 * name and no tc_install hook, and every callback from tc_destroy onward is
 * NULL. The n_queues and tc_load entries are not shown in this listing. */
3539 static const struct tc_ops tc_ops_other = {
3540 NULL, /* linux_name */
3541 "linux-other", /* ovs_name */
3543 NULL, /* tc_install */
3545 NULL, /* tc_destroy */
3546 NULL, /* qdisc_get */
3547 NULL, /* qdisc_set */
3548 NULL, /* class_get */
3549 NULL, /* class_set */
3550 NULL, /* class_delete */
3551 NULL, /* class_get_stats */
3552 NULL /* class_dump_stats */
3555 /* Traffic control. */
3557 /* Number of kernel "tc" ticks per second. */
/* Computed from the /proc/net/psched parameters as a * c / b, and used by
 * tc_ticks_to_bytes() and tc_bytes_to_ticks(). */
3558 static double ticks_per_s;
3560 /* Number of kernel "jiffies" per second. This is used for the purpose of
3561 * computing buffer sizes. Generally kernel qdiscs need to be able to buffer
3562 * one jiffy's worth of data.
3564 * There are two possibilities here:
3566 * - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
3567 * approximate range of 100 to 1024. That means that we really need to
3568 * make sure that the qdisc can buffer that much data.
3570 * - 'buffer_hz' is an absurdly large number. That means that the kernel
3571 * has finely granular timers and there's no need to fudge additional room
3572 * for buffers. (There's no extra effort needed to implement that: the
3573 * large 'buffer_hz' is used as a divisor, so practically any number will
3574 * come out as 0 in the division. Small integer results in the case of
3575 * really high dividends won't have any real effect anyhow.) */
/* Divisor used by tc_buffer_per_jiffy(); logged together with 'ticks_per_s'
 * once /proc/net/psched has been read. */
3577 static unsigned int buffer_hz;
3579 /* Returns tc handle 'major':'minor'. */
/* 'major' is shifted into the upper 16 bits before TC_H_MAKE combines it with
 * 'minor'. */
3581 tc_make_handle(unsigned int major, unsigned int minor)
3583 return TC_H_MAKE(major << 16, minor);
3586 /* Returns the major number from 'handle'. */
/* TC_H_MAJ leaves the major part in the upper 16 bits, so it is shifted back
 * down to yield a plain number. */
3588 tc_get_major(unsigned int handle)
3590 return TC_H_MAJ(handle) >> 16;
3593 /* Returns the minor number from 'handle'. */
/* No shift is applied to the TC_H_MIN result, unlike in tc_get_major(). */
3595 tc_get_minor(unsigned int handle)
3597 return TC_H_MIN(handle);
/* Starts a traffic-control Netlink request in 'request' for 'netdev'.
 *
 * Looks up the ifindex of 'netdev', initializes 'request' with a 512-byte
 * initial allocation, and appends a Netlink header of the given 'type' (with
 * NLM_F_REQUEST ORed into 'flags') followed by a zeroed 'struct tcmsg' whose
 * family is AF_UNSPEC and whose ifindex is that of the device. Returns a
 * pointer to that tcmsg so the caller can fill in the handle and parent.
 * Callers in this file treat a null return as failure; the failure path
 * itself is on lines not shown in this listing. */
3600 static struct tcmsg *
3601 tc_make_request(const struct netdev *netdev, int type, unsigned int flags,
3602 struct ofpbuf *request)
3604 struct tcmsg *tcmsg;
3608 error = get_ifindex(netdev, &ifindex);
3613 ofpbuf_init(request, 512);
3614 nl_msg_put_nlmsghdr(request, sizeof *tcmsg, type, NLM_F_REQUEST | flags);
3615 tcmsg = ofpbuf_put_zeros(request, sizeof *tcmsg);
3616 tcmsg->tcm_family = AF_UNSPEC;
3617 tcmsg->tcm_ifindex = ifindex;
3618 /* Caller should fill in tcmsg->tcm_handle. */
3619 /* Caller should fill in tcmsg->tcm_parent. */
/* Sends 'request' over NETLINK_ROUTE with nl_transact(), passing 'replyp'
 * through for the reply, and then uninitializes 'request' regardless of the
 * outcome, so the caller must not reuse or free it afterwards. */
3625 tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
3627 int error = nl_transact(NETLINK_ROUTE, request, replyp);
3628 ofpbuf_uninit(request);
3632 /* Adds or deletes a root ingress qdisc on 'netdev'. We use this for
3633 * policing configuration.
3635 * This function is equivalent to running the following when 'add' is true:
3636 * /sbin/tc qdisc add dev <devname> handle ffff: ingress
3638 * This function is equivalent to running the following when 'add' is false:
3639 * /sbin/tc qdisc del dev <devname> handle ffff: ingress
3641 * The configuration and stats may be seen with the following command:
3642 * /sbin/tc -s qdisc show dev <devname>
3644 * Returns 0 if successful, otherwise a positive errno value. */
/* Builds and sends the request that adds ('add' true) or deletes ('add'
 * false) the ingress qdisc with handle ffff:0 on 'netdev'. Adding uses
 * RTM_NEWQDISC with NLM_F_EXCL | NLM_F_CREATE; deleting uses RTM_DELQDISC
 * with no extra flags. ENOENT and EINVAL are singled out when deleting; how
 * they are then handled is on lines not shown in this listing. */
3647 tc_add_del_ingress_qdisc(struct netdev *netdev, bool add)
3649 struct ofpbuf request;
3650 struct tcmsg *tcmsg;
3652 int type = add ? RTM_NEWQDISC : RTM_DELQDISC;
3653 int flags = add ? NLM_F_EXCL | NLM_F_CREATE : 0;
3655 tcmsg = tc_make_request(netdev, type, flags, &request);
3659 tcmsg->tcm_handle = tc_make_handle(0xffff, 0);
3660 tcmsg->tcm_parent = TC_H_INGRESS;
3661 nl_msg_put_string(&request, TCA_KIND, "ingress");
/* The ingress qdisc takes an empty TCA_OPTIONS attribute. */
3662 nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0);
3664 error = tc_transact(&request, NULL);
3666 /* If we're deleting the qdisc, don't worry about some of the
3667 * error conditions. */
3668 if (!add && (error == ENOENT || error == EINVAL)) {
3677 /* Adds a policer to 'netdev' with a rate of 'kbits_rate' and a burst size
3680 * This function is equivalent to running:
3681 * /sbin/tc filter add dev <devname> parent ffff: protocol all prio 49
3682 * basic police rate <kbits_rate>kbit burst <kbits_burst>k
3685 * The configuration and stats may be seen with the following command:
3686 * /sbin/tc -s filter show dev <devname> parent ffff:
3688 * Returns 0 if successful, otherwise a positive errno value. */
/* Builds and sends an RTM_NEWTFILTER request (NLM_F_EXCL | NLM_F_CREATE) that
 * attaches a "basic" filter with a police action to parent ffff:0 of
 * 'netdev'. The filter has priority 49 and matches ETH_P_ALL. Packets
 * exceeding the policer are handled with TC_POLICE_SHOT.
 *
 * 'kbits_rate' is converted to bytes/s as kbits_rate * 1000 / 8, and
 * 'kbits_burst' to bytes as kbits_burst * 1024 before being expressed in
 * ticks at the policed rate. */
3691 tc_add_policer(struct netdev *netdev, int kbits_rate, int kbits_burst)
3693 struct tc_police tc_police;
3694 struct ofpbuf request;
3695 struct tcmsg *tcmsg;
3696 size_t basic_offset;
3697 size_t police_offset;
3701 memset(&tc_police, 0, sizeof tc_police);
3702 tc_police.action = TC_POLICE_SHOT;
3703 tc_police.mtu = mtu;
/* Widen before multiplying: in plain int arithmetic kbits_rate * 1000
 * overflows (undefined behavior) once the rate exceeds about 2.1 Gbps. The
 * result is unchanged for every rate that did not overflow before.
 *
 * NOTE(review): kbits_burst * 1024 below has the same int-overflow hazard
 * for bursts above 2097151 kb; left as is because the callee takes an
 * unsigned int size, so a correct fix needs an explicit clamp. */
3704 tc_fill_rate(&tc_police.rate, ((long long int) kbits_rate * 1000)/8, mtu);
3705 tc_police.burst = tc_bytes_to_ticks(tc_police.rate.rate,
3706 kbits_burst * 1024);
3708 tcmsg = tc_make_request(netdev, RTM_NEWTFILTER,
3709 NLM_F_EXCL | NLM_F_CREATE, &request);
3713 tcmsg->tcm_parent = tc_make_handle(0xffff, 0);
/* tcm_info packs the filter priority (49) into the upper half and the
 * protocol, in network byte order, into the lower half. */
3714 tcmsg->tcm_info = tc_make_handle(49,
3715 (OVS_FORCE uint16_t) htons(ETH_P_ALL));
3717 nl_msg_put_string(&request, TCA_KIND, "basic");
/* Nesting: TCA_OPTIONS > TCA_BASIC_POLICE > { TCA_POLICE_TBF, rate table }. */
3718 basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3719 police_offset = nl_msg_start_nested(&request, TCA_BASIC_POLICE);
3720 nl_msg_put_unspec(&request, TCA_POLICE_TBF, &tc_police, sizeof tc_police);
3721 tc_put_rtab(&request, TCA_POLICE_RATE, &tc_police.rate);
3722 nl_msg_end_nested(&request, police_offset);
3723 nl_msg_end_nested(&request, basic_offset);
3725 error = tc_transact(&request, NULL);
3736 /* The values in psched are not individually very meaningful, but they are
3737 * important. The tables below show some values seen in the wild.
3741 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
3742 * (Before that, there are hints that it was 1000000000.)
3744 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
3748 * -----------------------------------
3749 * [1] 000c8000 000f4240 000f4240 00000064
3750 * [2] 000003e8 00000400 000f4240 3b9aca00
3751 * [3] 000003e8 00000400 000f4240 3b9aca00
3752 * [4] 000003e8 00000400 000f4240 00000064
3753 * [5] 000003e8 00000040 000f4240 3b9aca00
3754 * [6] 000003e8 00000040 000f4240 000000f9
3756 * a b c d ticks_per_s buffer_hz
3757 * ------- --------- ---------- ------------- ----------- -------------
3758 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
3759 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3760 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3761 * [4] 1,000 1,024 1,000,000 100 976,562 100
3762 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
3763 * [6] 1,000 64 1,000,000 249 15,625,000 249
3765 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
3766 * [2] 2.6.26-1-686-bigmem from Debian lenny
3767 * [3] 2.6.26-2-sparc64 from Debian lenny
3768 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
3769 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
3770 * [6] 2.6.34 from kernel.org on KVM */
/* Body of the one-time /proc/net/psched reader (the function's signature is
 * not shown in this listing). Guarded by an ovsthread_once so the file is
 * parsed at most once. Four hexadecimal values a, b, c, d are read;
 * 'ticks_per_s' is computed as a * c / b in floating point. Warnings are
 * logged when the file cannot be opened or read, or when the parameters are
 * invalid or unexpected. How 'buffer_hz' is derived, and which values are
 * used after a failure, are on lines not shown here. */
3772 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
3773 static const char fn[] = "/proc/net/psched";
3774 unsigned int a, b, c, d;
3777 if (!ovsthread_once_start(&once)) {
3784 stream = fopen(fn, "r");
3786 VLOG_WARN("%s: open failed: %s", fn, ovs_strerror(errno));
3790 if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
3791 VLOG_WARN("%s: read failed", fn);
3795 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
3799 VLOG_WARN("%s: invalid scheduler parameters", fn);
3803 ticks_per_s = (double) a * c / b;
3807 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
3810 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);
3813 ovsthread_once_done(&once);
3816 /* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
3817 * rate of 'rate' bytes per second. */
3819 tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
/* Widen before multiplying: 'rate' * 'ticks' easily exceeds 32 bits (for
 * example 125,000,000 bytes/s times a few hundred ticks), and the product
 * would otherwise wrap before the division by 'ticks_per_s'. */
3822 return ((unsigned long long int) rate * ticks) / ticks_per_s;
3825 /* Returns the number of ticks that it would take to transmit 'size' bytes at a
3826 * rate of 'rate' bytes per second. */
/* Returns 0 when 'rate' is 0, which avoids a division by zero. The cast
 * binds to 'ticks_per_s', so the tick rate is truncated to an integer and the
 * multiplication by 'size' is carried out in unsigned long long. */
3828 tc_bytes_to_ticks(unsigned int rate, unsigned int size)
3831 return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
3834 /* Returns the number of bytes that need to be reserved for qdisc buffering at
3835 * a transmission rate of 'rate' bytes per second. */
/* Integer division of 'rate' by 'buffer_hz'.
 *
 * NOTE(review): this relies on 'buffer_hz' being nonzero by the time the
 * division runs; where it is initialized is not visible in this listing --
 * confirm. */
3837 tc_buffer_per_jiffy(unsigned int rate)
3840 return rate / buffer_hz;
3843 /* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
3844 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
3845 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
3846 * stores NULL into it if it is absent.
3848 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns 'msg'.
3851 * Returns 0 if successful, otherwise a positive errno value. */
/* Attributes are parsed starting just past the Netlink header and the
 * 'struct tcmsg'. TCA_KIND is mandatory and TCA_OPTIONS is optional. A
 * rate-limited warning is logged when the message does not match the
 * policy. */
3853 tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
3854 struct nlattr **options)
3856 static const struct nl_policy tca_policy[] = {
3857 [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
3858 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
3860 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3862 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3863 tca_policy, ta, ARRAY_SIZE(ta))) {
3864 VLOG_WARN_RL(&rl, "failed to parse qdisc message");
3869 *kind = nl_attr_get_string(ta[TCA_KIND]);
3873 *options = ta[TCA_OPTIONS];
3888 /* Given Netlink 'msg' that describes a class, extracts its class handle (whose
3889 * minor number identifies the queue) into '*handlep', its TCA_OPTIONS attribute
3890 * into '*options', and its queue statistics into '*stats'. Any of the output
3891 * arguments may be null.
3893 * Returns 0 if successful, otherwise a positive errno value. */
3895 tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
3896 struct nlattr **options, struct netdev_queue_stats *stats)
3898 static const struct nl_policy tca_policy[] = {
3899 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
3900 [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
3902 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3904 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3905 tca_policy, ta, ARRAY_SIZE(ta))) {
3906 VLOG_WARN_RL(&rl, "failed to parse class message");
3911 struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
3912 *handlep = tc->tcm_handle;
3916 *options = ta[TCA_OPTIONS];
3920 const struct gnet_stats_queue *gsq;
3921 struct gnet_stats_basic gsb;
3923 static const struct nl_policy stats_policy[] = {
3924 [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
3925 .min_len = sizeof gsb },
3926 [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
3927 .min_len = sizeof *gsq },
3929 struct nlattr *sa[ARRAY_SIZE(stats_policy)];
3931 if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
3932 sa, ARRAY_SIZE(sa))) {
3933 VLOG_WARN_RL(&rl, "failed to parse class stats");
3937 /* Alignment issues screw up the length of struct gnet_stats_basic on
3938 * some arch/bitsize combinations. Newer versions of Linux have a
3939 * struct gnet_stats_basic_packed, but we can't depend on that. The
3940 * easiest thing to do is just to make a copy. */
3941 memset(&gsb, 0, sizeof gsb);
3942 memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
3943 MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
3944 stats->tx_bytes = gsb.bytes;
3945 stats->tx_packets = gsb.packets;
3947 gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
3948 stats->tx_errors = gsq->drops;
3958 memset(stats, 0, sizeof *stats);
3963 /* Queries the kernel for class with identifier 'handle' and parent 'parent'
3966 tc_query_class(const struct netdev *netdev,
3967 unsigned int handle, unsigned int parent,
3968 struct ofpbuf **replyp)
3970 struct ofpbuf request;
3971 struct tcmsg *tcmsg;
3974 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO, &request);
3978 tcmsg->tcm_handle = handle;
3979 tcmsg->tcm_parent = parent;
3981 error = tc_transact(&request, replyp);
3983 VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
3984 netdev_get_name(netdev),
3985 tc_get_major(handle), tc_get_minor(handle),
3986 tc_get_major(parent), tc_get_minor(parent),
3987 ovs_strerror(error));
3992 /* Equivalent to "tc class del dev <name> handle <handle>". */
3994 tc_delete_class(const struct netdev *netdev, unsigned int handle)
3996 struct ofpbuf request;
3997 struct tcmsg *tcmsg;
4000 tcmsg = tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
4004 tcmsg->tcm_handle = handle;
4005 tcmsg->tcm_parent = 0;
4007 error = tc_transact(&request, NULL);
4009 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
4010 netdev_get_name(netdev),
4011 tc_get_major(handle), tc_get_minor(handle),
4012 ovs_strerror(error));
4017 /* Equivalent to "tc qdisc del dev <name> root". */
4019 tc_del_qdisc(struct netdev *netdev_)
4021 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4022 struct ofpbuf request;
4023 struct tcmsg *tcmsg;
4026 tcmsg = tc_make_request(netdev_, RTM_DELQDISC, 0, &request);
4030 tcmsg->tcm_handle = tc_make_handle(1, 0);
4031 tcmsg->tcm_parent = TC_H_ROOT;
4033 error = tc_transact(&request, NULL);
4034 if (error == EINVAL) {
4035 /* EINVAL probably means that the default qdisc was in use, in which
4036 * case we've accomplished our purpose. */
4039 if (!error && netdev->tc) {
4040 if (netdev->tc->ops->tc_destroy) {
4041 netdev->tc->ops->tc_destroy(netdev->tc);
4048 /* If 'netdev''s qdisc type and parameters are not yet known, queries the
4049 * kernel to determine what they are. Returns 0 if successful, otherwise a
4050 * positive errno value. */
4052 tc_query_qdisc(const struct netdev *netdev_)
4054 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4055 struct ofpbuf request, *qdisc;
4056 const struct tc_ops *ops;
4057 struct tcmsg *tcmsg;
4065 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
4066 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
4067 * 2.6.35 without that fix backported to it.
4069 * To avoid the OOPS, we must not make a request that would attempt to dump
4070 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
4071 * few others. There are a few ways that I can see to do this, but most of
4072 * them seem to be racy (and if you lose the race the kernel OOPSes). The
4073 * technique chosen here is to assume that any non-default qdisc that we
4074 * create will have a class with handle 1:0. The built-in qdiscs only have
4075 * a class with handle 0:0.
4077 * We could check for Linux 2.6.35+ and use a more straightforward method
4079 tcmsg = tc_make_request(netdev_, RTM_GETQDISC, NLM_F_ECHO, &request);
4083 tcmsg->tcm_handle = tc_make_handle(1, 0);
4084 tcmsg->tcm_parent = 0;
4086 /* Figure out what tc class to instantiate. */
4087 error = tc_transact(&request, &qdisc);
4091 error = tc_parse_qdisc(qdisc, &kind, NULL);
4093 ops = &tc_ops_other;
4095 ops = tc_lookup_linux_name(kind);
4097 static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
4098 VLOG_INFO_RL(&rl2, "unknown qdisc \"%s\"", kind);
4100 ops = &tc_ops_other;
4103 } else if (error == ENOENT) {
4104 /* Either it's a built-in qdisc, or it's a qdisc set up by some
4105 * other entity that doesn't have a handle 1:0. We will assume
4106 * that it's the system default qdisc. */
4107 ops = &tc_ops_default;
4110 /* Who knows? Maybe the device got deleted. */
4111 VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
4112 netdev_get_name(netdev_), ovs_strerror(error));
4113 ops = &tc_ops_other;
4116 /* Instantiate it. */
4117 load_error = ops->tc_load(CONST_CAST(struct netdev *, netdev_), qdisc);
4118 ovs_assert((load_error == 0) == (netdev->tc != NULL));
4119 ofpbuf_delete(qdisc);
4121 return error ? error : load_error;
4124 /* Linux traffic control uses tables with 256 entries ("rtab" tables) to
4125 approximate the time to transmit packets of various lengths. For an MTU of
4126 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
4127 represents two possible packet lengths; for a MTU of 513 through 1024, four
4128 possible lengths; and so on.
4130 Returns, for the specified 'mtu', the number of bits that packet lengths
4131 need to be shifted right to fit within such a 256-entry table. */
4133 tc_calc_cell_log(unsigned int mtu)
4138 mtu = ETH_PAYLOAD_MAX;
4140 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
4142 for (cell_log = 0; mtu >= 256; cell_log++) {
4149 /* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
4152 tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
4154 memset(rate, 0, sizeof *rate);
4155 rate->cell_log = tc_calc_cell_log(mtu);
4156 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
4157 /* rate->cell_align = 0; */ /* distro headers. */
4158 rate->mpu = ETH_TOTAL_MIN;
4162 /* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
4163 * attribute of the specified "type".
4165 * See tc_calc_cell_log() above for a description of "rtab"s. */
4167 tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
4172 rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
4173 for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
4174 unsigned packet_size = (i + 1) << rate->cell_log;
4175 if (packet_size < rate->mpu) {
4176 packet_size = rate->mpu;
4178 rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
/* Calculates the proper value of 'buffer' or 'cbuffer' in HTB options given a
 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
 * burst size of 'burst_bytes'.  (If no value was requested, a 'burst_bytes' of
 * 0 is fine.)
 *
 * The burst actually used is never smaller than one jiffy's worth of data at
 * 'Bps' plus one 'mtu'. */
static int
tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
{
    unsigned int min_burst = tc_buffer_per_jiffy(Bps) + mtu;
    uint64_t burst = burst_bytes > min_burst ? burst_bytes : min_burst;

    return tc_bytes_to_ticks(Bps, burst);
}
4193 /* Linux-only functions declared in netdev-linux.h */
4195 /* Returns a fd for an AF_INET socket or a negative errno value. */
4197 netdev_linux_get_af_inet_sock(void)
4199 int error = netdev_linux_init();
4200 return error ? -error : af_inet_sock;
4203 /* Modifies the 'flag' bit in ethtool's flags field for 'netdev'. If
4204 * 'enable' is true, the bit is set. Otherwise, it is cleared. */
4206 netdev_linux_ethtool_set_flag(struct netdev *netdev, uint32_t flag,
4207 const char *flag_name, bool enable)
4209 const char *netdev_name = netdev_get_name(netdev);
4210 struct ethtool_value evalue;
4214 COVERAGE_INC(netdev_get_ethtool);
4215 memset(&evalue, 0, sizeof evalue);
4216 error = netdev_linux_do_ethtool(netdev_name,
4217 (struct ethtool_cmd *)&evalue,
4218 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
4223 COVERAGE_INC(netdev_set_ethtool);
4224 evalue.data = new_flags = (evalue.data & ~flag) | (enable ? flag : 0);
4225 error = netdev_linux_do_ethtool(netdev_name,
4226 (struct ethtool_cmd *)&evalue,
4227 ETHTOOL_SFLAGS, "ETHTOOL_SFLAGS");
4232 COVERAGE_INC(netdev_get_ethtool);
4233 memset(&evalue, 0, sizeof evalue);
4234 error = netdev_linux_do_ethtool(netdev_name,
4235 (struct ethtool_cmd *)&evalue,
4236 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
4241 if (new_flags != evalue.data) {
4242 VLOG_WARN_RL(&rl, "attempt to %s ethtool %s flag on network "
4243 "device %s failed", enable ? "enable" : "disable",
4244 flag_name, netdev_name);
4251 /* Utility functions. */
4253 /* Copies 'src' into 'dst', performing format conversion in the process. */
4255 netdev_stats_from_rtnl_link_stats(struct netdev_stats *dst,
4256 const struct rtnl_link_stats *src)
4258 dst->rx_packets = src->rx_packets;
4259 dst->tx_packets = src->tx_packets;
4260 dst->rx_bytes = src->rx_bytes;
4261 dst->tx_bytes = src->tx_bytes;
4262 dst->rx_errors = src->rx_errors;
4263 dst->tx_errors = src->tx_errors;
4264 dst->rx_dropped = src->rx_dropped;
4265 dst->tx_dropped = src->tx_dropped;
4266 dst->multicast = src->multicast;
4267 dst->collisions = src->collisions;
4268 dst->rx_length_errors = src->rx_length_errors;
4269 dst->rx_over_errors = src->rx_over_errors;
4270 dst->rx_crc_errors = src->rx_crc_errors;
4271 dst->rx_frame_errors = src->rx_frame_errors;
4272 dst->rx_fifo_errors = src->rx_fifo_errors;
4273 dst->rx_missed_errors = src->rx_missed_errors;
4274 dst->tx_aborted_errors = src->tx_aborted_errors;
4275 dst->tx_carrier_errors = src->tx_carrier_errors;
4276 dst->tx_fifo_errors = src->tx_fifo_errors;
4277 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
4278 dst->tx_window_errors = src->tx_window_errors;
4282 get_stats_via_netlink(int ifindex, struct netdev_stats *stats)
4284 /* Policy for RTNLGRP_LINK messages.
4286 * There are *many* more fields in these messages, but currently we only
4287 * care about these fields. */
4288 static const struct nl_policy rtnlgrp_link_policy[] = {
4289 [IFLA_IFNAME] = { .type = NL_A_STRING, .optional = false },
4290 [IFLA_STATS] = { .type = NL_A_UNSPEC, .optional = true,
4291 .min_len = sizeof(struct rtnl_link_stats) },
4294 struct ofpbuf request;
4295 struct ofpbuf *reply;
4296 struct ifinfomsg *ifi;
4297 struct nlattr *attrs[ARRAY_SIZE(rtnlgrp_link_policy)];
4300 ofpbuf_init(&request, 0);
4301 nl_msg_put_nlmsghdr(&request, sizeof *ifi, RTM_GETLINK, NLM_F_REQUEST);
4302 ifi = ofpbuf_put_zeros(&request, sizeof *ifi);
4303 ifi->ifi_family = PF_UNSPEC;
4304 ifi->ifi_index = ifindex;
4305 error = nl_transact(NETLINK_ROUTE, &request, &reply);
4306 ofpbuf_uninit(&request);
4311 if (!nl_policy_parse(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg),
4312 rtnlgrp_link_policy,
4313 attrs, ARRAY_SIZE(rtnlgrp_link_policy))) {
4314 ofpbuf_delete(reply);
4318 if (!attrs[IFLA_STATS]) {
4319 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
4320 ofpbuf_delete(reply);
4324 netdev_stats_from_rtnl_link_stats(stats, nl_attr_get(attrs[IFLA_STATS]));
4326 ofpbuf_delete(reply);
4332 get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats)
4334 static const char fn[] = "/proc/net/dev";
4339 stream = fopen(fn, "r");
4341 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, ovs_strerror(errno));
4346 while (fgets(line, sizeof line, stream)) {
4349 #define X64 "%"SCNu64
4352 X64 X64 X64 X64 X64 X64 X64 "%*u"
4353 X64 X64 X64 X64 X64 X64 X64 "%*u",
4359 &stats->rx_fifo_errors,
4360 &stats->rx_frame_errors,
4366 &stats->tx_fifo_errors,
4368 &stats->tx_carrier_errors) != 15) {
4369 VLOG_WARN_RL(&rl, "%s:%d: parse error", fn, ln);
4370 } else if (!strcmp(devname, netdev_name)) {
4371 stats->rx_length_errors = UINT64_MAX;
4372 stats->rx_over_errors = UINT64_MAX;
4373 stats->rx_crc_errors = UINT64_MAX;
4374 stats->rx_missed_errors = UINT64_MAX;
4375 stats->tx_aborted_errors = UINT64_MAX;
4376 stats->tx_heartbeat_errors = UINT64_MAX;
4377 stats->tx_window_errors = UINT64_MAX;
4383 VLOG_WARN_RL(&rl, "%s: no stats for %s", fn, netdev_name);
4389 get_flags(const struct netdev *dev, unsigned int *flags)
4395 error = netdev_linux_do_ioctl(dev->name, &ifr, SIOCGIFFLAGS,
4398 *flags = ifr.ifr_flags;
4404 set_flags(const char *name, unsigned int flags)
4408 ifr.ifr_flags = flags;
4409 return netdev_linux_do_ioctl(name, &ifr, SIOCSIFFLAGS, "SIOCSIFFLAGS");
4413 do_get_ifindex(const char *netdev_name)
4417 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4418 COVERAGE_INC(netdev_get_ifindex);
4419 if (ioctl(af_inet_sock, SIOCGIFINDEX, &ifr) < 0) {
4420 VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
4421 netdev_name, ovs_strerror(errno));
4424 return ifr.ifr_ifindex;
4428 get_ifindex(const struct netdev *netdev_, int *ifindexp)
4430 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4432 if (!(netdev->cache_valid & VALID_IFINDEX)) {
4433 int ifindex = do_get_ifindex(netdev_get_name(netdev_));
4436 netdev->get_ifindex_error = -ifindex;
4437 netdev->ifindex = 0;
4439 netdev->get_ifindex_error = 0;
4440 netdev->ifindex = ifindex;
4442 netdev->cache_valid |= VALID_IFINDEX;
4445 *ifindexp = netdev->ifindex;
4446 return netdev->get_ifindex_error;
4450 get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN])
4455 memset(&ifr, 0, sizeof ifr);
4456 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4457 COVERAGE_INC(netdev_get_hwaddr);
4458 if (ioctl(af_inet_sock, SIOCGIFHWADDR, &ifr) < 0) {
4459 /* ENODEV probably means that a vif disappeared asynchronously and
4460 * hasn't been removed from the database yet, so reduce the log level
4461 * to INFO for that case. */
4462 VLOG(errno == ENODEV ? VLL_INFO : VLL_ERR,
4463 "ioctl(SIOCGIFHWADDR) on %s device failed: %s",
4464 netdev_name, ovs_strerror(errno));
4467 hwaddr_family = ifr.ifr_hwaddr.sa_family;
4468 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER) {
4469 VLOG_WARN("%s device has unknown hardware address family %d",
4470 netdev_name, hwaddr_family);
4472 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
4477 set_etheraddr(const char *netdev_name,
4478 const uint8_t mac[ETH_ADDR_LEN])
4482 memset(&ifr, 0, sizeof ifr);
4483 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4484 ifr.ifr_hwaddr.sa_family = ARPHRD_ETHER;
4485 memcpy(ifr.ifr_hwaddr.sa_data, mac, ETH_ADDR_LEN);
4486 COVERAGE_INC(netdev_set_hwaddr);
4487 if (ioctl(af_inet_sock, SIOCSIFHWADDR, &ifr) < 0) {
4488 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
4489 netdev_name, ovs_strerror(errno));
4496 netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
4497 int cmd, const char *cmd_name)
4501 memset(&ifr, 0, sizeof ifr);
4502 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
4503 ifr.ifr_data = (caddr_t) ecmd;
4506 if (ioctl(af_inet_sock, SIOCETHTOOL, &ifr) == 0) {
4509 if (errno != EOPNOTSUPP) {
4510 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
4511 "failed: %s", cmd_name, name, ovs_strerror(errno));
4513 /* The device doesn't support this operation. That's pretty
4514 * common, so there's no point in logging anything. */
4521 netdev_linux_do_ioctl(const char *name, struct ifreq *ifr, int cmd,
4522 const char *cmd_name)
4524 ovs_strzcpy(ifr->ifr_name, name, sizeof ifr->ifr_name);
4525 if (ioctl(af_inet_sock, cmd, ifr) == -1) {
4526 VLOG_DBG_RL(&rl, "%s: ioctl(%s) failed: %s", name, cmd_name,
4527 ovs_strerror(errno));
4534 netdev_linux_get_ipv4(const struct netdev *netdev, struct in_addr *ip,
4535 int cmd, const char *cmd_name)
4540 ifr.ifr_addr.sa_family = AF_INET;
4541 error = netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, cmd, cmd_name);
4543 const struct sockaddr_in *sin = ALIGNED_CAST(struct sockaddr_in *,
4545 *ip = sin->sin_addr;
4550 /* Returns an AF_PACKET raw socket or a negative errno value. */
4552 af_packet_sock(void)
4554 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
4557 if (ovsthread_once_start(&once)) {
4558 sock = socket(AF_PACKET, SOCK_RAW, 0);
4560 int error = set_nonblocking(sock);
4567 VLOG_ERR("failed to create packet socket: %s",
4568 ovs_strerror(errno));
4570 ovsthread_once_done(&once);