2 * Copyright (c) 2009, 2010, 2011, 2012, 2013 Nicira, Inc.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
19 #include "netdev-linux.h"
23 #include <arpa/inet.h>
25 #include <linux/filter.h>
26 #include <linux/gen_stats.h>
27 #include <linux/if_ether.h>
28 #include <linux/if_tun.h>
29 #include <linux/types.h>
30 #include <linux/ethtool.h>
31 #include <linux/mii.h>
32 #include <linux/pkt_cls.h>
33 #include <linux/pkt_sched.h>
34 #include <linux/rtnetlink.h>
35 #include <linux/sockios.h>
36 #include <linux/version.h>
37 #include <sys/types.h>
38 #include <sys/ioctl.h>
39 #include <sys/socket.h>
40 #include <netpacket/packet.h>
42 #include <net/if_arp.h>
43 #include <net/if_packet.h>
44 #include <net/route.h>
45 #include <netinet/in.h>
52 #include "dpif-linux.h"
53 #include "dynamic-string.h"
54 #include "fatal-signal.h"
57 #include "netdev-provider.h"
58 #include "netdev-vport.h"
59 #include "netlink-notifier.h"
60 #include "netlink-socket.h"
63 #include "openflow/openflow.h"
65 #include "poll-loop.h"
66 #include "rtnetlink-link.h"
68 #include "socket-util.h"
71 #include "unaligned.h"
74 VLOG_DEFINE_THIS_MODULE(netdev_linux);
76 COVERAGE_DEFINE(netdev_set_policing);
77 COVERAGE_DEFINE(netdev_arp_lookup);
78 COVERAGE_DEFINE(netdev_get_ifindex);
79 COVERAGE_DEFINE(netdev_get_hwaddr);
80 COVERAGE_DEFINE(netdev_set_hwaddr);
81 COVERAGE_DEFINE(netdev_get_ethtool);
82 COVERAGE_DEFINE(netdev_set_ethtool);
85 /* These were introduced in Linux 2.6.14, so they might be missing if we have
 * old headers. */
87 #ifndef ADVERTISED_Pause
88 #define ADVERTISED_Pause (1 << 13)
90 #ifndef ADVERTISED_Asym_Pause
91 #define ADVERTISED_Asym_Pause (1 << 14)
94 /* These were introduced in Linux 2.6.24, so they might be missing if we
95 * have old headers. */
96 #ifndef ETHTOOL_GFLAGS
97 #define ETHTOOL_GFLAGS 0x00000025 /* Get flags bitmap(ethtool_value) */
99 #ifndef ETHTOOL_SFLAGS
100 #define ETHTOOL_SFLAGS 0x00000026 /* Set flags bitmap(ethtool_value) */
103 /* This was introduced in Linux 2.6.25, so it might be missing if we have old
 * headers. */
106 #define TC_RTAB_SIZE 1024
109 static struct nln_notifier *netdev_linux_cache_notifier = NULL;
110 static int cache_notifier_refcount;
/* Bits for the 'cache_valid' member of struct netdev_linux.  Each bit marks
 * one group of cached, on-demand values as currently valid. */
113 VALID_IFINDEX = 1 << 0,
114 VALID_ETHERADDR = 1 << 1,
118 VALID_POLICING = 1 << 5,
119 VALID_VPORT_STAT_ERROR = 1 << 6,
120 VALID_DRVINFO = 1 << 7,
121 VALID_FEATURES = 1 << 8,
124 /* Traffic control. */
126 /* An instance of a traffic control class. Always associated with a particular
129 * Each TC implementation subclasses this with whatever additional data it
132 const struct tc_ops *ops;
133 struct hmap queues; /* Contains "struct tc_queue"s.
134 * Read by generic TC layer.
135 * Written only by TC implementation. */
138 #define TC_INITIALIZER(TC, OPS) { OPS, HMAP_INITIALIZER(&(TC)->queues) }
140 /* One traffic control queue.
142 * Each TC implementation subclasses this with whatever additional data it
145 struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
146 unsigned int queue_id; /* OpenFlow queue ID. */
147 long long int created; /* Time queue was created, in msecs. */
150 /* A particular kind of traffic control. Each implementation generally maps to
151 * one particular Linux qdisc class.
153 * The functions below return 0 if successful or a positive errno value on
154 * failure, except where otherwise noted. All of them must be provided, except
155 * where otherwise noted. */
157 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
158 * This is null for tc_ops_default and tc_ops_other, for which there are no
159 * appropriate values. */
160 const char *linux_name;
162 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
163 const char *ovs_name;
165 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
166 * queues. The queues are numbered 0 through n_queues - 1. */
167 unsigned int n_queues;
169 /* Called to install this TC class on 'netdev'. The implementation should
170 * make the Netlink calls required to set up 'netdev' with the right qdisc
171 * and configure it according to 'details'. The implementation may assume
172 * that the current qdisc is the default; that is, there is no need for it
173 * to delete the current qdisc before installing itself.
175 * The contents of 'details' should be documented as valid for 'ovs_name'
176 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
177 * (which is built as ovs-vswitchd.conf.db(8)).
179 * This function must return 0 if and only if it sets 'netdev->tc' to an
180 * initialized 'struct tc'.
182 * (This function is null for tc_ops_other, which cannot be installed. For
183 * other TC classes it should always be nonnull.) */
184 int (*tc_install)(struct netdev *netdev, const struct smap *details);
186 /* Called when the netdev code determines (through a Netlink query) that
187 * this TC class's qdisc is installed on 'netdev', but we didn't install
188 * it ourselves and so don't know any of the details.
190 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
191 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
192 * implementation should parse the other attributes of 'nlmsg' as
193 * necessary to determine its configuration. If necessary it should also
194 * use Netlink queries to determine the configuration of queues on
197 * This function must return 0 if and only if it sets 'netdev->tc' to an
198 * initialized 'struct tc'. */
199 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
201 /* Destroys the data structures allocated by the implementation as part of
202 * 'tc'. (This includes destroying 'tc->queues' by calling
205 * The implementation should not need to perform any Netlink calls. If
206 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
207 * (But it may not be desirable.)
209 * This function may be null if 'tc' is trivial. */
210 void (*tc_destroy)(struct tc *tc);
212 /* Retrieves details of 'netdev->tc' configuration into 'details'.
214 * The implementation should not need to perform any Netlink calls, because
215 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
216 * cached the configuration.
218 * The contents of 'details' should be documented as valid for 'ovs_name'
219 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
220 * (which is built as ovs-vswitchd.conf.db(8)).
222 * This function may be null if 'tc' is not configurable.
224 int (*qdisc_get)(const struct netdev *netdev, struct smap *details);
226 /* Reconfigures 'netdev->tc' according to 'details', performing any
227 * required Netlink calls to complete the reconfiguration.
229 * The contents of 'details' should be documented as valid for 'ovs_name'
230 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
231 * (which is built as ovs-vswitchd.conf.db(8)).
233 * This function may be null if 'tc' is not configurable.
235 int (*qdisc_set)(struct netdev *, const struct smap *details);
237 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
238 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
240 * The contents of 'details' should be documented as valid for 'ovs_name'
241 * in the "other_config" column in the "Queue" table in
242 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
244 * The implementation should not need to perform any Netlink calls, because
245 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
246 * cached the queue configuration.
248 * This function may be null if 'tc' does not have queues ('n_queues' is
250 int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
251 struct smap *details);
253 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
254 * 'details', performing any required Netlink calls to complete the
255 * reconfiguration. The caller ensures that 'queue_id' is less than
258 * The contents of 'details' should be documented as valid for 'ovs_name'
259 * in the "other_config" column in the "Queue" table in
260 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
262 * This function may be null if 'tc' does not have queues or its queues are
263 * not configurable. */
264 int (*class_set)(struct netdev *, unsigned int queue_id,
265 const struct smap *details);
267 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
268 * tc_queue's within 'netdev->tc->queues'.
270 * This function may be null if 'tc' does not have queues or its queues
271 * cannot be deleted. */
272 int (*class_delete)(struct netdev *, struct tc_queue *queue);
274 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
275 * 'struct tc_queue's within 'netdev->tc->queues'.
277 * On success, initializes '*stats'.
279 * This function may be null if 'tc' does not have queues or if it cannot
280 * report queue statistics. */
281 int (*class_get_stats)(const struct netdev *netdev,
282 const struct tc_queue *queue,
283 struct netdev_queue_stats *stats);
285 /* Extracts queue stats from 'nlmsg', which is a response to a
286 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
288 * This function may be null if 'tc' does not have queues or if it cannot
289 * report queue statistics. */
290 int (*class_dump_stats)(const struct netdev *netdev,
291 const struct ofpbuf *nlmsg,
292 netdev_dump_queue_stats_cb *cb, void *aux);
/* Initializes the generic part of 'tc'.  The visible statement sets up the
 * empty 'queues' hmap.
 *
 * NOTE(review): the statement that records 'ops' in 'tc' is not part of this
 * listing -- confirm against the full file. */
296 tc_init(struct tc *tc, const struct tc_ops *ops)
299 hmap_init(&tc->queues);
/* Destroys the generic part of 'tc', that is, its 'queues' hmap. */
303 tc_destroy(struct tc *tc)
305 hmap_destroy(&tc->queues);
/* Forward declarations of the traffic control implementations, followed by
 * the table 'tcs' that lists them. */
308 static const struct tc_ops tc_ops_htb;
309 static const struct tc_ops tc_ops_hfsc;
310 static const struct tc_ops tc_ops_default;
311 static const struct tc_ops tc_ops_other;
313 static const struct tc_ops *const tcs[] = {
314 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
315 &tc_ops_hfsc, /* Hierarchical fair service curve. */
316 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
317 &tc_ops_other, /* Some other qdisc. */
321 static unsigned int tc_make_handle(unsigned int major, unsigned int minor);
322 static unsigned int tc_get_major(unsigned int handle);
323 static unsigned int tc_get_minor(unsigned int handle);
325 static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
326 static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
327 static unsigned int tc_buffer_per_jiffy(unsigned int rate);
329 static struct tcmsg *tc_make_request(const struct netdev *, int type,
330 unsigned int flags, struct ofpbuf *);
331 static int tc_transact(struct ofpbuf *request, struct ofpbuf **replyp);
332 static int tc_add_del_ingress_qdisc(struct netdev *netdev, bool add);
333 static int tc_add_policer(struct netdev *netdev, int kbits_rate,
336 static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
337 struct nlattr **options);
338 static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
339 struct nlattr **options,
340 struct netdev_queue_stats *);
341 static int tc_query_class(const struct netdev *,
342 unsigned int handle, unsigned int parent,
343 struct ofpbuf **replyp);
344 static int tc_delete_class(const struct netdev *, unsigned int handle);
346 static int tc_del_qdisc(struct netdev *netdev);
347 static int tc_query_qdisc(const struct netdev *netdev);
349 static int tc_calc_cell_log(unsigned int mtu);
350 static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
351 static void tc_put_rtab(struct ofpbuf *, uint16_t type,
352 const struct tc_ratespec *rate);
353 static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
/* Per-device state for the netdev classes implemented in this file. */
355 struct netdev_linux {
358 unsigned int cache_valid; /* Bitmap of VALID_* bits. */
359 unsigned int change_seq; /* Set to 1 when the device is created. */
361 bool miimon; /* Link status of last poll. */
362 long long int miimon_interval; /* Miimon Poll rate. Disabled if <= 0. */
363 struct timer miimon_timer; /* Expires when the next MII poll is due. */
365 /* The following are figured out "on demand" only. They are only valid
366 * when the corresponding VALID_* bit in 'cache_valid' is set. */
368 uint8_t etheraddr[ETH_ADDR_LEN];
369 struct in_addr address, netmask;
372 unsigned int ifi_flags; /* Last interface flags seen for the device. */
373 long long int carrier_resets; /* Times IFF_RUNNING has changed. */
374 uint32_t kbits_rate; /* Policing data. */
375 uint32_t kbits_burst;
376 int vport_stats_error; /* Cached error code from vport_get_stats().
377 0 or an errno value. */
378 int netdev_mtu_error; /* Cached error code from SIOCGIFMTU or SIOCSIFMTU. */
379 int ether_addr_error; /* Cached error code from set/get etheraddr. */
380 int netdev_policing_error; /* Cached error code from set policing. */
381 int get_features_error; /* Cached error code from ETHTOOL_GSET. */
382 int get_ifindex_error; /* Cached error code from SIOCGIFINDEX. */
384 enum netdev_features current; /* Cached from ETHTOOL_GSET. */
385 enum netdev_features advertised; /* Cached from ETHTOOL_GSET. */
386 enum netdev_features supported; /* Cached from ETHTOOL_GSET. */
387 enum netdev_features peer; /* Cached from ETHTOOL_GSET. */
389 struct ethtool_drvinfo drvinfo; /* Cached from ETHTOOL_GDRVINFO. */
392 /* For devices of class netdev_tap_class only. */
396 struct netdev_rx_linux {
402 static const struct netdev_rx_class netdev_rx_linux_class;
404 /* Sockets used for ioctl operations. */
405 static int af_inet_sock = -1; /* AF_INET, SOCK_DGRAM. */
407 /* This is set pretty low because we probably won't learn anything from the
408 * additional log messages. */
409 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
411 static int netdev_linux_init(void);
413 static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
414 int cmd, const char *cmd_name);
415 static int netdev_linux_do_ioctl(const char *name, struct ifreq *, int cmd,
416 const char *cmd_name);
417 static int netdev_linux_get_ipv4(const struct netdev *, struct in_addr *,
418 int cmd, const char *cmd_name);
419 static int get_flags(const struct netdev *, unsigned int *flags);
420 static int set_flags(const char *, unsigned int flags);
421 static int do_get_ifindex(const char *netdev_name);
422 static int get_ifindex(const struct netdev *, int *ifindexp);
423 static int do_set_addr(struct netdev *netdev,
424 int ioctl_nr, const char *ioctl_name,
425 struct in_addr addr);
426 static int get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN]);
427 static int set_etheraddr(const char *netdev_name, const uint8_t[ETH_ADDR_LEN]);
428 static int get_stats_via_netlink(int ifindex, struct netdev_stats *stats);
429 static int get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats);
430 static int af_packet_sock(void);
431 static void netdev_linux_miimon_run(void);
432 static void netdev_linux_miimon_wait(void);
/* Returns true if 'netdev_class' uses netdev_linux_init() as its 'init'
 * function.  netdev_linux_cast() relies on this test before converting a
 * generic netdev to a struct netdev_linux. */
435 is_netdev_linux_class(const struct netdev_class *netdev_class)
437 return netdev_class->init == netdev_linux_init;
/* Returns true if the class of 'netdev' is netdev_tap_class. */
441 is_tap_netdev(const struct netdev *netdev)
443 return netdev_get_class(netdev) == &netdev_tap_class;
/* Returns the struct netdev_linux that embeds 'netdev' as its 'up' member.
 * Asserts that 'netdev' belongs to one of the Linux netdev classes. */
446 static struct netdev_linux *
447 netdev_linux_cast(const struct netdev *netdev)
449 ovs_assert(is_netdev_linux_class(netdev_get_class(netdev)));
451 return CONTAINER_OF(netdev, struct netdev_linux, up);
/* Returns the struct netdev_rx_linux that embeds 'rx' as its 'up' member,
 * after checking that 'rx' has class netdev_rx_linux_class. */
454 static struct netdev_rx_linux *
455 netdev_rx_linux_cast(const struct netdev_rx *rx)
457 netdev_rx_assert_class(rx, &netdev_rx_linux_class);
458 return CONTAINER_OF(rx, struct netdev_rx_linux, up);
/* Class 'init' function: creates the AF_INET datagram socket kept in
 * 'af_inet_sock'.
 *
 * 'status' is static and starts at -1; after the socket() call it holds 0 on
 * success or the errno value from the failure, which is also logged. */
462 netdev_linux_init(void)
464 static int status = -1;
466 /* Create AF_INET socket. */
467 af_inet_sock = socket(AF_INET, SOCK_DGRAM, 0);
468 status = af_inet_sock >= 0 ? 0 : errno;
470 VLOG_ERR("failed to create inet socket: %s", ovs_strerror(status));
/* Periodic work: runs the rtnetlink link notifier and then the MII link
 * monitor. */
477 netdev_linux_run(void)
479 rtnetlink_link_run();
480 netdev_linux_miimon_run();
/* Registers the wakeups needed by netdev_linux_run(): one for the rtnetlink
 * link notifier and one for the MII link monitor. */
484 netdev_linux_wait(void)
486 rtnetlink_link_wait();
487 netdev_linux_miimon_wait();
/* Records that 'dev' changed.
 *
 * 'ifi_flags' are the device's new interface flags.  'mask' selects the
 * VALID_* bits of 'dev->cache_valid' that survive the change; every other
 * bit is cleared, so the corresponding cached values will be refetched. */
491 netdev_linux_changed(struct netdev_linux *dev,
492 unsigned int ifi_flags, unsigned int mask)
495 if (!dev->change_seq) {
/* A change in IFF_RUNNING counts as one carrier reset. */
499 if ((dev->ifi_flags ^ ifi_flags) & IFF_RUNNING) {
500 dev->carrier_resets++;
502 dev->ifi_flags = ifi_flags;
504 dev->cache_valid &= mask;
/* Applies the rtnetlink link notification 'change' to 'dev'.
 *
 * For RTM_NEWLINK, only the driver-info cache is kept valid, and then the
 * MTU, Ethernet address (unless it is all-zeros) and ifindex carried by
 * 'change' are stored as fresh cached values with their error codes
 * cleared. */
508 netdev_linux_update(struct netdev_linux *dev,
509 const struct rtnetlink_link_change *change)
511 if (change->nlmsg_type == RTM_NEWLINK) {
/* Invalidate everything except the cached driver info. */
513 netdev_linux_changed(dev, change->ifi_flags, VALID_DRVINFO);
515 /* Update netdev from rtnl-change msg. */
517 dev->mtu = change->mtu;
518 dev->cache_valid |= VALID_MTU;
519 dev->netdev_mtu_error = 0;
522 if (!eth_addr_is_zero(change->addr)) {
523 memcpy(dev->etheraddr, change->addr, ETH_ADDR_LEN);
524 dev->cache_valid |= VALID_ETHERADDR;
525 dev->ether_addr_error = 0;
528 dev->ifindex = change->ifi_index;
529 dev->cache_valid |= VALID_IFINDEX;
530 dev->get_ifindex_error = 0;
/* A mask of 0 invalidates the whole cache.  NOTE(review): presumably this
 * is the branch for message types other than RTM_NEWLINK; the 'else' line is
 * not part of this listing. */
533 netdev_linux_changed(dev, change->ifi_flags, 0);
/* Callback registered with the rtnetlink link notifier (see
 * cache_notifier_ref()).
 *
 * One path looks up the device named in 'change' and, if it is a Linux
 * netdev, updates it from 'change'.  The other path walks every device of
 * netdev_linux_class, rereads its flags and invalidates its whole cache.
 *
 * NOTE(review): the condition that selects between the two paths is not part
 * of this listing -- presumably whether 'change' is null; confirm. */
538 netdev_linux_cache_cb(const struct rtnetlink_link_change *change,
539 void *aux OVS_UNUSED)
541 struct netdev_linux *dev;
543 struct netdev *base_dev = netdev_from_name(change->ifname);
544 if (base_dev && is_netdev_linux_class(netdev_get_class(base_dev))) {
545 netdev_linux_update(netdev_linux_cast(base_dev), change);
548 struct shash device_shash;
549 struct shash_node *node;
551 shash_init(&device_shash);
552 netdev_get_devices(&netdev_linux_class, &device_shash);
553 SHASH_FOR_EACH (node, &device_shash) {
554 struct netdev *netdev = node->data;
557 dev = netdev_linux_cast(netdev);
/* The return value of get_flags() is not checked here. */
559 get_flags(&dev->up, &flags);
560 netdev_linux_changed(dev, flags, 0);
562 shash_destroy(&device_shash);
/* Takes a reference on the shared rtnetlink link notifier.  The notifier is
 * created, with netdev_linux_cache_cb() as its callback, when the reference
 * count is zero. */
567 cache_notifier_ref(void)
569 if (!cache_notifier_refcount) {
570 ovs_assert(!netdev_linux_cache_notifier);
572 netdev_linux_cache_notifier =
573 rtnetlink_link_notifier_create(netdev_linux_cache_cb, NULL);
/* Creation failed. */
575 if (!netdev_linux_cache_notifier) {
579 cache_notifier_refcount++;
/* Drops a reference taken by cache_notifier_ref().  When the count reaches
 * zero the notifier is destroyed and the global pointer is reset to NULL. */
585 cache_notifier_unref(void)
587 ovs_assert(cache_notifier_refcount > 0);
588 if (!--cache_notifier_refcount) {
589 ovs_assert(netdev_linux_cache_notifier);
590 rtnetlink_link_notifier_destroy(netdev_linux_cache_notifier);
591 netdev_linux_cache_notifier = NULL;
595 /* Creates system and internal devices.  The new netdev, named 'name' and of
 * class 'class', is stored in '*netdevp'. */
597 netdev_linux_create(const struct netdev_class *class, const char *name,
598 struct netdev **netdevp)
600 struct netdev_linux *netdev;
/* Each device holds one reference on the shared rtnetlink notifier. */
603 error = cache_notifier_ref();
608 netdev = xzalloc(sizeof *netdev);
609 netdev->change_seq = 1;
610 netdev_init(&netdev->up, name, class);
/* Prime the cached interface flags from the kernel. */
611 error = get_flags(&netdev->up, &netdev->ifi_flags);
612 if (error == ENODEV) {
613 if (class != &netdev_internal_class) {
614 /* The device does not exist, so don't allow it to be opened. */
615 netdev_uninit(&netdev->up, false);
/* Give back the notifier reference taken above. */
616 cache_notifier_unref();
620 /* "Internal" netdevs have to be created as netdev objects before
621 * they exist in the kernel, because creating them in the kernel
622 * happens by passing a netdev object to dpif_port_add().
623 * Therefore, ignore the error. */
627 *netdevp = &netdev->up;
631 /* For most types of netdevs we open the device for each call of
632 * netdev_open(). However, this is not the case with tap devices,
633 * since it is only possible to open the device once. In this
634 * situation we share a single file descriptor, and consequently
635 * buffers, across all readers. Therefore once data is read it will
636 * be unavailable to other reads for tap devices. */
/* Creates a tap device named 'name' through "/dev/net/tun" and stores the
 * new netdev in '*netdevp'. */
638 netdev_linux_create_tap(const struct netdev_class *class OVS_UNUSED,
639 const char *name, struct netdev **netdevp)
641 struct netdev_linux *netdev;
642 static const char tap_dev[] = "/dev/net/tun";
646 netdev = xzalloc(sizeof *netdev);
647 netdev->change_seq = 1;
/* Each device holds one reference on the shared rtnetlink notifier. */
649 error = cache_notifier_ref();
654 /* Open tap device. */
655 netdev->tap_fd = open(tap_dev, O_RDWR);
656 if (netdev->tap_fd < 0) {
658 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, ovs_strerror(error));
659 goto error_unref_notifier;
662 /* Create tap device. */
663 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
664 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
665 if (ioctl(netdev->tap_fd, TUNSETIFF, &ifr) == -1) {
666 VLOG_WARN("%s: creating tap device failed: %s", name,
667 ovs_strerror(errno));
672 /* Make non-blocking. */
673 error = set_nonblocking(netdev->tap_fd);
678 netdev_init(&netdev->up, name, &netdev_tap_class);
679 *netdevp = &netdev->up;
/* Error cleanup: close the tap fd, then fall through to drop the notifier
 * reference. */
683 close(netdev->tap_fd);
684 error_unref_notifier:
685 cache_notifier_unref();
/* Releases what 'netdev_' holds: its traffic control state, its tap file
 * descriptor (tap devices only) and its reference on the shared rtnetlink
 * notifier. */
692 netdev_linux_destroy(struct netdev *netdev_)
694 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
/* 'tc_destroy' is optional for a TC implementation, hence the null check. */
696 if (netdev->tc && netdev->tc->ops->tc_destroy) {
697 netdev->tc->ops->tc_destroy(netdev->tc);
700 if (netdev_get_class(netdev_) == &netdev_tap_class
701 && netdev->tap_fd >= 0)
703 close(netdev->tap_fd);
707 cache_notifier_unref();
/* Opens a receive handle on 'netdev_'.
 *
 * The visible path creates a non-blocking PF_PACKET raw socket, binds it to
 * the device's ifindex for all protocols, and attaches a socket filter that
 * only passes inbound packets.  The handle itself is allocated with
 * xmalloc() and initialized with class netdev_rx_linux_class.
 *
 * NOTE(review): how 'is_tap' and 'rxp' are used is not part of this listing;
 * confirm against the full file. */
711 netdev_linux_rx_open(struct netdev *netdev_, struct netdev_rx **rxp)
713 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
714 bool is_tap = is_tap_netdev(netdev_);
715 struct netdev_rx_linux *rx;
722 struct sockaddr_ll sll;
724 /* Result of tcpdump -dd inbound */
/* 0xfffff004 is SKF_AD_OFF + SKF_AD_PKTTYPE, i.e. the packet type, and 4 is
 * PACKET_OUTGOING: outgoing packets are dropped, all others accepted. */
725 static struct sock_filter filt[] = {
726 { 0x28, 0, 0, 0xfffff004 }, /* ldh [0] */
727 { 0x15, 0, 1, 0x00000004 }, /* jeq #4 jt 2 jf 3 */
728 { 0x6, 0, 0, 0x00000000 }, /* ret #0 */
729 { 0x6, 0, 0, 0x0000ffff } /* ret #65535 */
731 static struct sock_fprog fprog = { ARRAY_SIZE(filt), filt };
733 /* Create file descriptor. */
734 fd = socket(PF_PACKET, SOCK_RAW, 0);
737 VLOG_ERR("failed to create raw socket (%s)", ovs_strerror(error));
741 /* Set non-blocking mode. */
742 error = set_nonblocking(fd);
747 /* Get ethernet device index. */
748 error = get_ifindex(&netdev->up, &ifindex);
753 /* Bind to specific ethernet device. */
754 memset(&sll, 0, sizeof sll);
755 sll.sll_family = AF_PACKET;
756 sll.sll_ifindex = ifindex;
757 sll.sll_protocol = (OVS_FORCE unsigned short int) htons(ETH_P_ALL);
758 if (bind(fd, (struct sockaddr *) &sll, sizeof sll) < 0) {
760 VLOG_ERR("%s: failed to bind raw socket (%s)",
761 netdev_get_name(netdev_), ovs_strerror(error));
765 /* Filter for only inbound packets. */
766 error = setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER, &fprog,
770 VLOG_ERR("%s: failed attach filter (%s)",
771 netdev_get_name(netdev_), ovs_strerror(error));
776 rx = xmalloc(sizeof *rx);
777 netdev_rx_init(&rx->up, netdev_, &netdev_rx_linux_class);
/* Destroys receive handle 'rx_'.
 *
 * NOTE(review): only the cast to struct netdev_rx_linux is part of this
 * listing; the statements that release the handle's resources are not
 * visible. */
792 netdev_rx_linux_destroy(struct netdev_rx *rx_)
794 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
/* Receives one packet from 'rx_' into the 'size' bytes at 'data'.
 *
 * The read()/recv() call is retried for as long as it fails with EINTR.
 * recv() is given MSG_TRUNC so that its result can exceed 'size' when the
 * packet did not fit; in that case -EMSGSIZE is returned instead of a byte
 * count.
 *
 * Errors other than EAGAIN are logged, rate-limited by 'rl'. */
803 netdev_rx_linux_recv(struct netdev_rx *rx_, void *data, size_t size)
805 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
810 ? read(rx->fd, data, size)
811 : recv(rx->fd, data, size, MSG_TRUNC));
812 } while (retval < 0 && errno == EINTR);
815 return retval > size ? -EMSGSIZE : retval;
817 if (errno != EAGAIN) {
/* The format string names the device first and the error second, so the
 * arguments have to be passed in that order. */
818 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
819 netdev_rx_get_name(rx_), ovs_strerror(errno));
/* Registers with the poll loop to wake up when the file descriptor of 'rx_'
 * becomes readable. */
826 netdev_rx_linux_wait(struct netdev_rx *rx_)
828 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
829 poll_fd_wait(rx->fd, POLLIN);
/* Discards the packets waiting on 'rx_'.
 *
 * One path asks the device for its transmit queue length (SIOCGIFTXQLEN) and
 * passes that count to drain_fd(); the other returns the result of
 * drain_rcvbuf().
 *
 * NOTE(review): the condition that selects between the two paths is not part
 * of this listing -- presumably tap versus non-tap handles; confirm. */
833 netdev_rx_linux_drain(struct netdev_rx *rx_)
835 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
838 int error = netdev_linux_do_ioctl(netdev_rx_get_name(rx_), &ifr,
839 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
843 drain_fd(rx->fd, ifr.ifr_qlen);
846 return drain_rcvbuf(rx->fd);
850 /* Sends the 'size' bytes at 'data' on 'netdev_'. Returns 0 if successful,
851 * otherwise a positive errno value. Returns EAGAIN without blocking if the
852 * packet cannot be queued immediately. Returns EMSGSIZE if a partial packet
853 * was transmitted or if the packet is too big or too small to transmit on
 * the device.
855 * The caller retains ownership of 'data' in all cases.
857 * The kernel maintains a packet transmission queue, so the caller is not
858 * expected to do additional queuing of packets. */
860 netdev_linux_send(struct netdev *netdev_, const void *data, size_t size)
865 if (!is_tap_netdev(netdev_)) {
866 /* Use our AF_PACKET socket to send to this device. */
867 struct sockaddr_ll sll;
874 sock = af_packet_sock();
879 error = get_ifindex(netdev_, &ifindex);
884 /* We don't bother setting most fields in sockaddr_ll because the
885 * kernel ignores them for SOCK_RAW. */
886 memset(&sll, 0, sizeof sll);
887 sll.sll_family = AF_PACKET;
888 sll.sll_ifindex = ifindex;
890 iov.iov_base = CONST_CAST(void *, data);
894 msg.msg_namelen = sizeof sll;
897 msg.msg_control = NULL;
898 msg.msg_controllen = 0;
901 retval = sendmsg(sock, &msg, 0);
903 /* Use the tap fd to send to this device. This is essential for
904 * tap devices, because packets sent to a tap device with an
905 * AF_PACKET socket will loop back to be *received* again on the
906 * tap device. This doesn't occur on other interface types
907 * because we attach a socket filter to the rx socket. */
908 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
910 retval = write(netdev->tap_fd, data, size);
914 /* The Linux AF_PACKET implementation never blocks waiting for room
915 * for packets, instead returning ENOBUFS. Translate this into
916 * EAGAIN for the caller. */
917 if (errno == ENOBUFS) {
919 } else if (errno == EINTR) {
921 } else if (errno != EAGAIN) {
922 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
923 netdev_get_name(netdev_), ovs_strerror(errno));
/* A short sendmsg() or write() is reported as a partial packet. */
926 } else if (retval != size) {
927 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%zd bytes of "
928 "%zu) on %s", retval, size, netdev_get_name(netdev_));
936 /* Registers with the poll loop to wake up from the next call to poll_block()
937 * when the packet transmission queue has sufficient room to transmit a packet
938 * with netdev_send().
940 * The kernel maintains a packet transmission queue, so the client is not
941 * expected to do additional queuing of packets. Thus, this function is
942 * unlikely to ever be used. It is included for completeness. */
944 netdev_linux_send_wait(struct netdev *netdev)
/* For tap devices the poll loop is simply woken immediately. */
946 if (is_tap_netdev(netdev)) {
947 /* TAP device always accepts packets.*/
948 poll_immediate_wake();
952 /* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
953 * otherwise a positive errno value.
 *
 * If the cached address is valid, a cached error is returned as-is and a
 * request for the address already in the cache is short-circuited. */
955 netdev_linux_set_etheraddr(struct netdev *netdev_,
956 const uint8_t mac[ETH_ADDR_LEN])
958 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
959 struct netdev_saved_flags *sf = NULL;
962 if (netdev->cache_valid & VALID_ETHERADDR) {
963 if (netdev->ether_addr_error) {
964 return netdev->ether_addr_error;
966 if (eth_addr_equals(netdev->etheraddr, mac)) {
/* The address is about to change, so the cached copy is stale. */
969 netdev->cache_valid &= ~VALID_ETHERADDR;
972 /* Tap devices must be brought down before setting the address. */
973 if (is_tap_netdev(netdev_)) {
974 netdev_turn_flags_off(netdev_, NETDEV_UP, &sf);
976 error = set_etheraddr(netdev_get_name(netdev_), mac);
/* Success and ENODEV are both cached; other errors leave the cache
 * invalid. */
977 if (!error || error == ENODEV) {
978 netdev->ether_addr_error = error;
979 netdev->cache_valid |= VALID_ETHERADDR;
981 memcpy(netdev->etheraddr, mac, ETH_ADDR_LEN);
/* Undo the flag change made for tap devices above. */
985 netdev_restore_flags(sf);
990 /* Copies 'netdev''s MAC address to 'mac' which is passed as param.
 *
 * The address is fetched with get_etheraddr() only when it is not already
 * cached; the outcome of that fetch, including an error, is cached.  Returns
 * the cached error code, which is 0 when 'mac' was filled in. */
992 netdev_linux_get_etheraddr(const struct netdev *netdev_,
993 uint8_t mac[ETH_ADDR_LEN])
995 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
997 if (!(netdev->cache_valid & VALID_ETHERADDR)) {
998 int error = get_etheraddr(netdev_get_name(netdev_),
1001 netdev->ether_addr_error = error;
1002 netdev->cache_valid |= VALID_ETHERADDR;
1005 if (!netdev->ether_addr_error) {
1006 memcpy(mac, netdev->etheraddr, ETH_ADDR_LEN);
1009 return netdev->ether_addr_error;
1012 /* Returns the maximum size of transmitted (and received) packets on 'netdev',
1013 * in bytes, not including the hardware header; thus, this is typically 1500
1014 * bytes for Ethernet devices.
 *
 * The value is stored in '*mtup' only when the cached error code is 0; that
 * error code is the return value.  The MTU is queried with SIOCGIFMTU only
 * when it is not already cached. */
1016 netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
1018 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1019 if (!(netdev->cache_valid & VALID_MTU)) {
1023 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
1024 SIOCGIFMTU, "SIOCGIFMTU");
/* Cache the result of the query, whether it succeeded or failed. */
1026 netdev->netdev_mtu_error = error;
1027 netdev->mtu = ifr.ifr_mtu;
1028 netdev->cache_valid |= VALID_MTU;
1031 if (!netdev->netdev_mtu_error) {
1032 *mtup = netdev->mtu;
1034 return netdev->netdev_mtu_error;
1037 /* Sets the MTU (the maximum size of transmitted packets) of 'netdev_' to
1038 * 'mtu' using the Linux networking ioctl interface.
 *
 * If the cached MTU is valid, a cached error is returned as-is and a request
 * for the MTU already in the cache is short-circuited. */
1041 netdev_linux_set_mtu(const struct netdev *netdev_, int mtu)
1043 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1047 if (netdev->cache_valid & VALID_MTU) {
1048 if (netdev->netdev_mtu_error) {
1049 return netdev->netdev_mtu_error;
1051 if (netdev->mtu == mtu) {
/* The MTU is about to change, so the cached copy is stale. */
1054 netdev->cache_valid &= ~VALID_MTU;
1057 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
1058 SIOCSIFMTU, "SIOCSIFMTU");
/* Success and ENODEV are both cached; other errors leave the cache
 * invalid. */
1059 if (!error || error == ENODEV) {
1060 netdev->netdev_mtu_error = error;
1061 netdev->mtu = ifr.ifr_mtu;
1062 netdev->cache_valid |= VALID_MTU;
1067 /* Returns the ifindex of 'netdev', if successful, as a positive number.
1068 * On failure, returns a negative errno value.
 *
 * This wraps get_ifindex(), which reports a positive errno value, and
 * negates that error for the caller. */
1070 netdev_linux_get_ifindex(const struct netdev *netdev)
1074 error = get_ifindex(netdev, &ifindex);
1075 return error ? -error : ifindex;
/* Stores the carrier status of 'netdev_' in '*carrier'.
 *
 * When MII monitoring is enabled (positive 'miimon_interval') the result of
 * the last MII poll is used; the other visible assignment derives the status
 * from IFF_RUNNING in the cached interface flags. */
1079 netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
1081 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1083 if (netdev->miimon_interval > 0) {
1084 *carrier = netdev->miimon;
1086 *carrier = (netdev->ifi_flags & IFF_RUNNING) != 0;
/* Returns the 'carrier_resets' counter of 'netdev', which
 * netdev_linux_changed() increments whenever IFF_RUNNING changes. */
1092 static long long int
1093 netdev_linux_get_carrier_resets(const struct netdev *netdev)
1095 return netdev_linux_cast(netdev)->carrier_resets;
/* Issues MII ioctl 'cmd' (named 'cmd_name' for logging) on device 'name'.
 *
 * The request travels in place: '*data' is copied over the storage of
 * 'ifr.ifr_data' itself, not into memory that the field points to, before
 * the ioctl, and copied back into '*data' afterward. */
1099 netdev_linux_do_miimon(const char *name, int cmd, const char *cmd_name,
1100 struct mii_ioctl_data *data)
1105 memset(&ifr, 0, sizeof ifr);
1106 memcpy(&ifr.ifr_data, data, sizeof *data);
1107 error = netdev_linux_do_ioctl(name, &ifr, cmd, cmd_name);
1108 memcpy(data, &ifr.ifr_data, sizeof *data);
/* Queries the link status of device 'name' and stores it in '*miimon'.
 *
 * The MII path reads the BMSR register of the PHY and tests BMSR_LSTATUS.
 * The fallback path uses ETHTOOL_GLINK instead. */
1114 netdev_linux_get_miimon(const char *name, bool *miimon)
1116 struct mii_ioctl_data data;
1121 memset(&data, 0, sizeof data);
1122 error = netdev_linux_do_miimon(name, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
1124 /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
1125 data.reg_num = MII_BMSR;
1126 error = netdev_linux_do_miimon(name, SIOCGMIIREG, "SIOCGMIIREG",
1130 *miimon = !!(data.val_out & BMSR_LSTATUS);
1132 VLOG_WARN_RL(&rl, "%s: failed to query MII", name);
1135 struct ethtool_cmd ecmd;
1137 VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
1140 COVERAGE_INC(netdev_get_ethtool);
1141 memset(&ecmd, 0, sizeof ecmd);
1142 error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
1145 struct ethtool_value eval;
/* The ETHTOOL_GLINK reply is read back as a struct ethtool_value from the
 * start of the struct ethtool_cmd buffer. */
1147 memcpy(&eval, &ecmd, sizeof eval);
1148 *miimon = !!eval.data;
1150 VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
/* Sets the MII polling interval of 'netdev_' to 'interval'.
 *
 * A positive 'interval' is raised to at least 100; any other value becomes
 * 0, which disables polling.  When the interval actually changes, the poll
 * timer is marked expired so that the next run polls right away. */
1158 netdev_linux_set_miimon_interval(struct netdev *netdev_,
1159 long long int interval)
1161 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1163 interval = interval > 0 ? MAX(interval, 100) : 0;
1164 if (netdev->miimon_interval != interval) {
1165 netdev->miimon_interval = interval;
1166 timer_set_expired(&netdev->miimon_timer);
/* Polls the MII link status of every netdev_linux_class device whose poll
 * timer has expired.  A device whose status changed is marked as changed
 * with its whole cache invalidated, and its timer is then rearmed for
 * another 'miimon_interval'. */
1173 netdev_linux_miimon_run(void)
1175 struct shash device_shash;
1176 struct shash_node *node;
1178 shash_init(&device_shash);
1179 netdev_get_devices(&netdev_linux_class, &device_shash);
1180 SHASH_FOR_EACH (node, &device_shash) {
1181 struct netdev *netdev = node->data;
1182 struct netdev_linux *dev = netdev_linux_cast(netdev);
/* Devices with monitoring disabled or a timer still running are not
 * polled. */
1185 if (dev->miimon_interval <= 0 || !timer_expired(&dev->miimon_timer)) {
1189 netdev_linux_get_miimon(dev->up.name, &miimon);
1190 if (miimon != dev->miimon) {
1191 dev->miimon = miimon;
1192 netdev_linux_changed(dev, dev->ifi_flags, 0);
1195 timer_set_duration(&dev->miimon_timer, dev->miimon_interval);
1198 shash_destroy(&device_shash);
/* Registers a wait on the miimon timer of every netdev_linux_class device
 * whose miimon interval is positive. */
1202 netdev_linux_miimon_wait(void)
1204 struct shash device_shash;
1205 struct shash_node *node;
1207 shash_init(&device_shash);
1208 netdev_get_devices(&netdev_linux_class, &device_shash);
1209 SHASH_FOR_EACH (node, &device_shash) {
1210 struct netdev *netdev = node->data;
1211 struct netdev_linux *dev = netdev_linux_cast(netdev);
1213 if (dev->miimon_interval > 0) {
1214 timer_wait(&dev->miimon_timer);
1217 shash_destroy(&device_shash);
/* Probes the "lo" device to decide between rtnetlink and /proc as the source
 * of device statistics; each outcome is logged.
 * NOTE(review): the return statements are not visible in this excerpt; the
 * result is stored by netdev_linux_sys_get_stats() in 'use_netlink_stats', so
 * nonzero presumably means "netlink works" -- confirm. */
1220 /* Checks whether we can use RTM_GETLINK to get network device statistics.
1221 * In pre-2.6.19 kernels, this was only available if wireless extensions were
1224 check_for_working_netlink_stats(void)
1226 /* Decide on the netdev_get_stats() implementation to use. Netlink is
1227 * preferable, so if that works, we'll use it. */
1228 int ifindex = do_get_ifindex("lo");
1230 VLOG_WARN("failed to get ifindex for lo, "
1231 "obtaining netdev stats from proc");
1234 struct netdev_stats stats;
/* A trial query on "lo" tells us whether the kernel answers RTM_GETLINK. */
1235 int error = get_stats_via_netlink(ifindex, &stats);
1237 VLOG_DBG("obtaining netdev stats via rtnetlink");
1240 VLOG_INFO("RTM_GETLINK failed (%s), obtaining netdev stats "
1241 "via proc (you are probably running a pre-2.6.19 "
1242 "kernel)", ovs_strerror(error));
/* Exchanges the 64-bit values stored at 'a' and 'b'.  Passing the same
 * pointer for both arguments leaves the value unchanged. */
static void
swap_uint64(uint64_t *a, uint64_t *b)
{
    uint64_t tmp = *a;

    *a = *b;
    *b = tmp;
}
/* The eight packet/byte/error/drop counters are read from 'src' with
 * unaligned-safe loads; the detailed error counters assigned below have no
 * vport counterpart and are set to 0. */
1256 /* Copies 'src' into 'dst', performing format conversion in the process.
1258 * 'src' is allowed to be misaligned. */
1260 netdev_stats_from_ovs_vport_stats(struct netdev_stats *dst,
1261 const struct ovs_vport_stats *src)
1263 dst->rx_packets = get_unaligned_u64(&src->rx_packets);
1264 dst->tx_packets = get_unaligned_u64(&src->tx_packets);
1265 dst->rx_bytes = get_unaligned_u64(&src->rx_bytes);
1266 dst->tx_bytes = get_unaligned_u64(&src->tx_bytes);
1267 dst->rx_errors = get_unaligned_u64(&src->rx_errors);
1268 dst->tx_errors = get_unaligned_u64(&src->tx_errors);
1269 dst->rx_dropped = get_unaligned_u64(&src->rx_dropped);
1270 dst->tx_dropped = get_unaligned_u64(&src->tx_dropped);
1272 dst->collisions = 0;
1273 dst->rx_length_errors = 0;
1274 dst->rx_over_errors = 0;
1275 dst->rx_crc_errors = 0;
1276 dst->rx_frame_errors = 0;
1277 dst->rx_fifo_errors = 0;
1278 dst->rx_missed_errors = 0;
1279 dst->tx_aborted_errors = 0;
1280 dst->tx_carrier_errors = 0;
1281 dst->tx_fifo_errors = 0;
1282 dst->tx_heartbeat_errors = 0;
1283 dst->tx_window_errors = 0;
/* Looks up the datapath vport named after 'netdev' and, when the reply
 * carries statistics, converts them into '*stats'.
 * NOTE(review): the error returns and the release of 'buf' are not visible in
 * this excerpt -- confirm. */
1287 get_stats_via_vport__(const struct netdev *netdev, struct netdev_stats *stats)
1289 struct dpif_linux_vport reply;
1293 error = dpif_linux_vport_get(netdev_get_name(netdev), &reply, &buf);
1296 } else if (!reply.stats) {
1301 netdev_stats_from_ovs_vport_stats(stats, reply.stats);
/* Fetches vport statistics for 'netdev_' into '*stats' and records the
 * outcome in netdev->vport_stats_error.  The query is skipped when a nonzero
 * error is already cached (VALID_VPORT_STAT_ERROR set).  Failures other than
 * ENOENT are logged. */
1309 get_stats_via_vport(const struct netdev *netdev_,
1310 struct netdev_stats *stats)
1312 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1314 if (!netdev->vport_stats_error ||
1315 !(netdev->cache_valid & VALID_VPORT_STAT_ERROR)) {
1318 error = get_stats_via_vport__(netdev_, stats);
1319 if (error && error != ENOENT) {
1320 VLOG_WARN_RL(&rl, "%s: obtaining netdev stats via vport failed "
1322 netdev_get_name(netdev_), ovs_strerror(error));
/* Cache the result, success or failure, for later callers. */
1324 netdev->vport_stats_error = error;
1325 netdev->cache_valid |= VALID_VPORT_STAT_ERROR;
/* Retrieves kernel statistics for 'netdev_' into '*stats', using rtnetlink
 * when the one-time probe check_for_working_netlink_stats() reported it
 * usable and get_stats_via_proc() otherwise.  Failures are logged with the
 * numeric error code. */
1330 netdev_linux_sys_get_stats(const struct netdev *netdev_,
1331 struct netdev_stats *stats)
1333 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
1334 static int use_netlink_stats;
/* The probe runs once per process, guarded by 'once'. */
1337 if (ovsthread_once_start(&once)) {
1338 use_netlink_stats = check_for_working_netlink_stats();
1339 ovsthread_once_done(&once);
1342 if (use_netlink_stats) {
1345 error = get_ifindex(netdev_, &ifindex);
1347 error = get_stats_via_netlink(ifindex, stats);
1350 error = get_stats_via_proc(netdev_get_name(netdev_), stats);
1354 VLOG_WARN_RL(&rl, "%s: linux-sys get stats failed %d",
1355 netdev_get_name(netdev_), error);
/* Combines two sources: vport statistics (written straight into '*stats') and
 * kernel statistics (read into the local 'dev_stats').  The visible
 * additions fold the kernel's error, drop, multicast and collision counters
 * into '*stats'.
 * NOTE(review): the bodies of the two 'vport_stats_error' branches and the
 * return statements are not visible in this excerpt -- confirm which
 * assignments belong to which branch. */
1361 /* Retrieves current device stats for 'netdev-linux'. */
1363 netdev_linux_get_stats(const struct netdev *netdev_,
1364 struct netdev_stats *stats)
1366 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1367 struct netdev_stats dev_stats;
1370 get_stats_via_vport(netdev_, stats);
1372 error = netdev_linux_sys_get_stats(netdev_, &dev_stats);
1375 if (netdev->vport_stats_error) {
1382 if (netdev->vport_stats_error) {
1383 /* stats not available from OVS then use ioctl stats. */
1386 stats->rx_errors += dev_stats.rx_errors;
1387 stats->tx_errors += dev_stats.tx_errors;
1388 stats->rx_dropped += dev_stats.rx_dropped;
1389 stats->tx_dropped += dev_stats.tx_dropped;
1390 stats->multicast += dev_stats.multicast;
1391 stats->collisions += dev_stats.collisions;
1392 stats->rx_length_errors += dev_stats.rx_length_errors;
1393 stats->rx_over_errors += dev_stats.rx_over_errors;
1394 stats->rx_crc_errors += dev_stats.rx_crc_errors;
1395 stats->rx_frame_errors += dev_stats.rx_frame_errors;
1396 stats->rx_fifo_errors += dev_stats.rx_fifo_errors;
1397 stats->rx_missed_errors += dev_stats.rx_missed_errors;
1398 stats->tx_aborted_errors += dev_stats.tx_aborted_errors;
1399 stats->tx_carrier_errors += dev_stats.tx_carrier_errors;
1400 stats->tx_fifo_errors += dev_stats.tx_fifo_errors;
1401 stats->tx_heartbeat_errors += dev_stats.tx_heartbeat_errors;
1402 stats->tx_window_errors += dev_stats.tx_window_errors;
/* Like netdev_linux_get_stats(), this reads vport statistics into '*stats'
 * and kernel statistics into 'dev_stats'.  The visible statements swap the
 * rx/tx packet, byte, error and drop counters, zero the detailed error
 * counters, and add the kernel's drop and error counters with rx and tx
 * crossed over (kernel tx_dropped is added to rx_dropped, and so on).
 * NOTE(review): branch bodies, 'else' arms and return statements are not
 * visible in this excerpt -- confirm which statements run on which path. */
1407 /* Retrieves current device stats for 'netdev-tap' netdev or
1408 * netdev-internal. */
1410 netdev_tap_get_stats(const struct netdev *netdev_, struct netdev_stats *stats)
1412 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1413 struct netdev_stats dev_stats;
1416 get_stats_via_vport(netdev_, stats);
1418 error = netdev_linux_sys_get_stats(netdev_, &dev_stats);
1420 if (netdev->vport_stats_error) {
1427 /* If this port is an internal port then the transmit and receive stats
1428 * will appear to be swapped relative to the other ports since we are the
1429 * one sending the data, not a remote computer. For consistency, we swap
1430 * them back here. This does not apply if we are getting stats from the
1431 * vport layer because it always tracks stats from the perspective of the
1433 if (netdev->vport_stats_error) {
1435 swap_uint64(&stats->rx_packets, &stats->tx_packets);
1436 swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
1437 swap_uint64(&stats->rx_errors, &stats->tx_errors);
1438 swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
1439 stats->rx_length_errors = 0;
1440 stats->rx_over_errors = 0;
1441 stats->rx_crc_errors = 0;
1442 stats->rx_frame_errors = 0;
1443 stats->rx_fifo_errors = 0;
1444 stats->rx_missed_errors = 0;
1445 stats->tx_aborted_errors = 0;
1446 stats->tx_carrier_errors = 0;
1447 stats->tx_fifo_errors = 0;
1448 stats->tx_heartbeat_errors = 0;
1449 stats->tx_window_errors = 0;
1451 stats->rx_dropped += dev_stats.tx_dropped;
1452 stats->tx_dropped += dev_stats.rx_dropped;
1454 stats->rx_errors += dev_stats.tx_errors;
1455 stats->tx_errors += dev_stats.rx_errors;
1457 stats->multicast += dev_stats.multicast;
1458 stats->collisions += dev_stats.collisions;
/* Retrieves statistics for an internal device purely from the vport layer.
 * Returns the value that get_stats_via_vport() recorded in
 * netdev->vport_stats_error. */
1464 netdev_internal_get_stats(const struct netdev *netdev_,
1465 struct netdev_stats *stats)
1467 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1469 get_stats_via_vport(netdev_, stats);
1470 return netdev->vport_stats_error;
/* Overwrites the vport-layer counters of 'netdev' with the eight
 * packet/byte/error/drop values taken from 'stats', by sending an
 * OVS_VPORT_CMD_SET request addressed by device name.
 * NOTE(review): the handling of ENODEV and the return statement are not
 * visible in this excerpt -- confirm. */
1474 netdev_internal_set_stats(struct netdev *netdev,
1475 const struct netdev_stats *stats)
1477 struct ovs_vport_stats vport_stats;
1478 struct dpif_linux_vport vport;
1481 vport_stats.rx_packets = stats->rx_packets;
1482 vport_stats.tx_packets = stats->tx_packets;
1483 vport_stats.rx_bytes = stats->rx_bytes;
1484 vport_stats.tx_bytes = stats->tx_bytes;
1485 vport_stats.rx_errors = stats->rx_errors;
1486 vport_stats.tx_errors = stats->tx_errors;
1487 vport_stats.rx_dropped = stats->rx_dropped;
1488 vport_stats.tx_dropped = stats->tx_dropped;
1490 dpif_linux_vport_init(&vport);
1491 vport.cmd = OVS_VPORT_CMD_SET;
1492 vport.name = netdev_get_name(netdev);
1493 vport.stats = &vport_stats;
1495 err = dpif_linux_vport_transact(&vport, NULL, NULL);
1497 /* If the vport layer doesn't know about the device, that doesn't mean it
1498 * doesn't exist (after all we were able to open it when netdev_open() was
1499 * called), it just means that it isn't attached and we'll be getting
1500 * stats a different way. */
1501 if (err == ENODEV) {
/* Refreshes the cached feature bitmaps of 'netdev' (supported, advertised,
 * current, peer) from an ETHTOOL_GSET query, translating ethtool's
 * SUPPORTED_* and ADVERTISED_* bits and the link speed/duplex/port fields
 * into NETDEV_F_* bits.  The result code is stored in
 * netdev->get_features_error and VALID_FEATURES is set in cache_valid.
 * NOTE(review): the early exits (cache hit, ioctl failure), the declaration
 * of 'speed' and the condition guarding NETDEV_F_AUTONEG are not visible in
 * this excerpt -- confirm. */
1509 netdev_linux_read_features(struct netdev_linux *netdev)
1511 struct ethtool_cmd ecmd;
1515 if (netdev->cache_valid & VALID_FEATURES) {
1519 COVERAGE_INC(netdev_get_ethtool);
1520 memset(&ecmd, 0, sizeof ecmd);
1521 error = netdev_linux_do_ethtool(netdev->up.name, &ecmd,
1522 ETHTOOL_GSET, "ETHTOOL_GSET");
1527 /* Supported features. */
1528 netdev->supported = 0;
1529 if (ecmd.supported & SUPPORTED_10baseT_Half) {
1530 netdev->supported |= NETDEV_F_10MB_HD;
1532 if (ecmd.supported & SUPPORTED_10baseT_Full) {
1533 netdev->supported |= NETDEV_F_10MB_FD;
1535 if (ecmd.supported & SUPPORTED_100baseT_Half) {
1536 netdev->supported |= NETDEV_F_100MB_HD;
1538 if (ecmd.supported & SUPPORTED_100baseT_Full) {
1539 netdev->supported |= NETDEV_F_100MB_FD;
1541 if (ecmd.supported & SUPPORTED_1000baseT_Half) {
1542 netdev->supported |= NETDEV_F_1GB_HD;
1544 if (ecmd.supported & SUPPORTED_1000baseT_Full) {
1545 netdev->supported |= NETDEV_F_1GB_FD;
1547 if (ecmd.supported & SUPPORTED_10000baseT_Full) {
1548 netdev->supported |= NETDEV_F_10GB_FD;
1550 if (ecmd.supported & SUPPORTED_TP) {
1551 netdev->supported |= NETDEV_F_COPPER;
1553 if (ecmd.supported & SUPPORTED_FIBRE) {
1554 netdev->supported |= NETDEV_F_FIBER;
1556 if (ecmd.supported & SUPPORTED_Autoneg) {
1557 netdev->supported |= NETDEV_F_AUTONEG;
1559 if (ecmd.supported & SUPPORTED_Pause) {
1560 netdev->supported |= NETDEV_F_PAUSE;
1562 if (ecmd.supported & SUPPORTED_Asym_Pause) {
1563 netdev->supported |= NETDEV_F_PAUSE_ASYM;
1566 /* Advertised features. */
1567 netdev->advertised = 0;
1568 if (ecmd.advertising & ADVERTISED_10baseT_Half) {
1569 netdev->advertised |= NETDEV_F_10MB_HD;
1571 if (ecmd.advertising & ADVERTISED_10baseT_Full) {
1572 netdev->advertised |= NETDEV_F_10MB_FD;
1574 if (ecmd.advertising & ADVERTISED_100baseT_Half) {
1575 netdev->advertised |= NETDEV_F_100MB_HD;
1577 if (ecmd.advertising & ADVERTISED_100baseT_Full) {
1578 netdev->advertised |= NETDEV_F_100MB_FD;
1580 if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
1581 netdev->advertised |= NETDEV_F_1GB_HD;
1583 if (ecmd.advertising & ADVERTISED_1000baseT_Full) {
1584 netdev->advertised |= NETDEV_F_1GB_FD;
1586 if (ecmd.advertising & ADVERTISED_10000baseT_Full) {
1587 netdev->advertised |= NETDEV_F_10GB_FD;
1589 if (ecmd.advertising & ADVERTISED_TP) {
1590 netdev->advertised |= NETDEV_F_COPPER;
1592 if (ecmd.advertising & ADVERTISED_FIBRE) {
1593 netdev->advertised |= NETDEV_F_FIBER;
1595 if (ecmd.advertising & ADVERTISED_Autoneg) {
1596 netdev->advertised |= NETDEV_F_AUTONEG;
1598 if (ecmd.advertising & ADVERTISED_Pause) {
1599 netdev->advertised |= NETDEV_F_PAUSE;
1601 if (ecmd.advertising & ADVERTISED_Asym_Pause) {
1602 netdev->advertised |= NETDEV_F_PAUSE_ASYM;
1605 /* Current settings. */
/* Speeds up to 1 Gb/s distinguish half and full duplex via ecmd.duplex;
 * 10 Gb/s and faster are always reported as full duplex.  An unrecognized
 * speed leaves 'current' at 0. */
1607 if (speed == SPEED_10) {
1608 netdev->current = ecmd.duplex ? NETDEV_F_10MB_FD : NETDEV_F_10MB_HD;
1609 } else if (speed == SPEED_100) {
1610 netdev->current = ecmd.duplex ? NETDEV_F_100MB_FD : NETDEV_F_100MB_HD;
1611 } else if (speed == SPEED_1000) {
1612 netdev->current = ecmd.duplex ? NETDEV_F_1GB_FD : NETDEV_F_1GB_HD;
1613 } else if (speed == SPEED_10000) {
1614 netdev->current = NETDEV_F_10GB_FD;
1615 } else if (speed == 40000) {
1616 netdev->current = NETDEV_F_40GB_FD;
1617 } else if (speed == 100000) {
1618 netdev->current = NETDEV_F_100GB_FD;
1619 } else if (speed == 1000000) {
1620 netdev->current = NETDEV_F_1TB_FD;
1622 netdev->current = 0;
1625 if (ecmd.port == PORT_TP) {
1626 netdev->current |= NETDEV_F_COPPER;
1627 } else if (ecmd.port == PORT_FIBRE) {
1628 netdev->current |= NETDEV_F_FIBER;
1632 netdev->current |= NETDEV_F_AUTONEG;
1635 /* Peer advertisements. */
1636 netdev->peer = 0; /* XXX */
1639 netdev->cache_valid |= VALID_FEATURES;
1640 netdev->get_features_error = error;
1643 /* Stores the features supported by 'netdev' into each of '*current',
1644 * '*advertised', '*supported', and '*peer' that are non-null. Each value is a
1645 * bitmap of NETDEV_* bits. Returns 0 if successful, otherwise a positive
1648 netdev_linux_get_features(const struct netdev *netdev_,
1649 enum netdev_features *current,
1650 enum netdev_features *advertised,
1651 enum netdev_features *supported,
1652 enum netdev_features *peer)
1654 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1656 netdev_linux_read_features(netdev);
1658 if (!netdev->get_features_error) {
1659 *current = netdev->current;
1660 *advertised = netdev->advertised;
1661 *supported = netdev->supported;
1662 *peer = netdev->peer;
1664 return netdev->get_features_error;
/* The current settings are read with ETHTOOL_GSET, the 'advertising' mask is
 * rebuilt from scratch out of the NETDEV_F_* bits in 'advertise', and the
 * result is written back with ETHTOOL_SSET, whose return value is returned.
 * NOTE(review): the handling of a failed ETHTOOL_GSET is not visible in this
 * excerpt -- confirm. */
1667 /* Set the features advertised by 'netdev' to 'advertise'. */
1669 netdev_linux_set_advertisements(struct netdev *netdev,
1670 enum netdev_features advertise)
1672 struct ethtool_cmd ecmd;
1675 COVERAGE_INC(netdev_get_ethtool);
1676 memset(&ecmd, 0, sizeof ecmd);
1677 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1678 ETHTOOL_GSET, "ETHTOOL_GSET");
1683 ecmd.advertising = 0;
1684 if (advertise & NETDEV_F_10MB_HD) {
1685 ecmd.advertising |= ADVERTISED_10baseT_Half;
1687 if (advertise & NETDEV_F_10MB_FD) {
1688 ecmd.advertising |= ADVERTISED_10baseT_Full;
1690 if (advertise & NETDEV_F_100MB_HD) {
1691 ecmd.advertising |= ADVERTISED_100baseT_Half;
1693 if (advertise & NETDEV_F_100MB_FD) {
1694 ecmd.advertising |= ADVERTISED_100baseT_Full;
1696 if (advertise & NETDEV_F_1GB_HD) {
1697 ecmd.advertising |= ADVERTISED_1000baseT_Half;
1699 if (advertise & NETDEV_F_1GB_FD) {
1700 ecmd.advertising |= ADVERTISED_1000baseT_Full;
1702 if (advertise & NETDEV_F_10GB_FD) {
1703 ecmd.advertising |= ADVERTISED_10000baseT_Full;
1705 if (advertise & NETDEV_F_COPPER) {
1706 ecmd.advertising |= ADVERTISED_TP;
1708 if (advertise & NETDEV_F_FIBER) {
1709 ecmd.advertising |= ADVERTISED_FIBRE;
1711 if (advertise & NETDEV_F_AUTONEG) {
1712 ecmd.advertising |= ADVERTISED_Autoneg;
1714 if (advertise & NETDEV_F_PAUSE) {
1715 ecmd.advertising |= ADVERTISED_Pause;
1717 if (advertise & NETDEV_F_PAUSE_ASYM) {
1718 ecmd.advertising |= ADVERTISED_Asym_Pause;
1720 COVERAGE_INC(netdev_set_ethtool);
1721 return netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1722 ETHTOOL_SSET, "ETHTOOL_SSET");
/* 'kbits_rate' and 'kbits_burst' are the requested policing rate and burst.
 * The visible steps are: normalize the burst, consult the cached settings
 * (VALID_POLICING), remove the existing ingress qdisc, add a fresh one, attach
 * a policer, and cache the new values.  A result of 0 or ENODEV is cached in
 * netdev->netdev_policing_error.
 * NOTE(review): the conditions around the qdisc/policer calls (for example
 * whether they are skipped for a zero rate) and the return statements are not
 * visible in this excerpt -- confirm. */
1725 /* Attempts to set input rate limiting (policing) policy. Returns 0 if
1726 * successful, otherwise a positive errno value. */
1728 netdev_linux_set_policing(struct netdev *netdev_,
1729 uint32_t kbits_rate, uint32_t kbits_burst)
1731 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1732 const char *netdev_name = netdev_get_name(netdev_);
1736 kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
1737 : !kbits_burst ? 1000 /* Default to 1000 kbits if 0. */
1738 : kbits_burst); /* Stick with user-specified value. */
1740 if (netdev->cache_valid & VALID_POLICING) {
1741 if (netdev->netdev_policing_error) {
1742 return netdev->netdev_policing_error;
1745 if (netdev->kbits_rate == kbits_rate &&
1746 netdev->kbits_burst == kbits_burst) {
1747 /* Assume that settings haven't changed since we last set them. */
1750 netdev->cache_valid &= ~VALID_POLICING;
1753 COVERAGE_INC(netdev_set_policing);
1754 /* Remove any existing ingress qdisc. */
1755 error = tc_add_del_ingress_qdisc(netdev_, false);
1757 VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
1758 netdev_name, ovs_strerror(error));
1763 error = tc_add_del_ingress_qdisc(netdev_, true);
1765 VLOG_WARN_RL(&rl, "%s: adding policing qdisc failed: %s",
1766 netdev_name, ovs_strerror(error));
1770 error = tc_add_policer(netdev_, kbits_rate, kbits_burst);
1772 VLOG_WARN_RL(&rl, "%s: adding policing action failed: %s",
1773 netdev_name, ovs_strerror(error));
1778 netdev->kbits_rate = kbits_rate;
1779 netdev->kbits_burst = kbits_burst;
1782 if (!error || error == ENODEV) {
1783 netdev->netdev_policing_error = error;
1784 netdev->cache_valid |= VALID_POLICING;
/* Adds to 'types' the OVS name of every entry in the NULL-terminated 'tcs'
 * table that provides a tc_install callback and has a non-empty ovs_name.
 * 'netdev' is unused. */
1790 netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
1793 const struct tc_ops *const *opsp;
1795 for (opsp = tcs; *opsp != NULL; opsp++) {
1796 const struct tc_ops *ops = *opsp;
1797 if (ops->tc_install && ops->ovs_name[0] != '\0') {
1798 sset_add(types, ops->ovs_name);
/* Searches the NULL-terminated 'tcs' table for the entry whose ovs_name
 * equals 'name'.
 * NOTE(review): the return statements are not visible in this excerpt;
 * presumably the match, or NULL when there is none -- confirm. */
1804 static const struct tc_ops *
1805 tc_lookup_ovs_name(const char *name)
1807 const struct tc_ops *const *opsp;
1809 for (opsp = tcs; *opsp != NULL; opsp++) {
1810 const struct tc_ops *ops = *opsp;
1811 if (!strcmp(name, ops->ovs_name)) {
/* Searches the NULL-terminated 'tcs' table for the entry whose linux_name
 * equals 'name'.  Entries with a null linux_name never match.
 * NOTE(review): the return statements are not visible in this excerpt;
 * presumably the match, or NULL when there is none -- confirm. */
1818 static const struct tc_ops *
1819 tc_lookup_linux_name(const char *name)
1821 const struct tc_ops *const *opsp;
1823 for (opsp = tcs; *opsp != NULL; opsp++) {
1824 const struct tc_ops *ops = *opsp;
1825 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
/* Scans the bucket selected by 'hash' in netdev->tc->queues for a queue whose
 * queue_id equals 'queue_id'.
 * NOTE(review): the third parameter and the return statements are not
 * visible in this excerpt -- confirm. */
1832 static struct tc_queue *
1833 tc_find_queue__(const struct netdev *netdev_, unsigned int queue_id,
1836 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1837 struct tc_queue *queue;
1839 HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev->tc->queues) {
1840 if (queue->queue_id == queue_id) {
/* Convenience wrapper around tc_find_queue__() that computes the hash as
 * hash_int(queue_id, 0). */
1847 static struct tc_queue *
1848 tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
1850 return tc_find_queue__(netdev, queue_id, hash_int(queue_id, 0));
/* Looks up the traffic-control implementation registered under the OVS name
 * 'type' and copies its queue count into caps->n_queues.  'netdev' is unused.
 * NOTE(review): the handling of an unknown 'type' and the return value are
 * not visible in this excerpt -- confirm. */
1854 netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
1856 struct netdev_qos_capabilities *caps)
1858 const struct tc_ops *ops = tc_lookup_ovs_name(type);
1862 caps->n_queues = ops->n_queues;
/* Queries the qdisc configured on 'netdev_', stores its OVS type name in
 * '*typep', and asks the implementation's qdisc_get callback, when it has
 * one, to fill in 'details'.
 * NOTE(review): the error path after tc_query_qdisc() and the value returned
 * when qdisc_get is null are not visible in this excerpt -- confirm. */
1867 netdev_linux_get_qos(const struct netdev *netdev_,
1868 const char **typep, struct smap *details)
1870 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1873 error = tc_query_qdisc(netdev_);
1878 *typep = netdev->tc->ops->ovs_name;
1879 return (netdev->tc->ops->qdisc_get
1880 ? netdev->tc->ops->qdisc_get(netdev_, details)
/* Configures QoS of OVS type 'type' with 'details' on 'netdev_'.  When the
 * requested type is the one already installed, only its qdisc_set callback
 * (if any) is invoked; otherwise the existing qdisc is deleted and the new
 * type's tc_install callback is run.  The assertions check that netdev->tc is
 * null after deletion and non-null exactly when installation succeeded.
 * NOTE(review): the error returns are not visible in this excerpt --
 * confirm. */
1885 netdev_linux_set_qos(struct netdev *netdev_,
1886 const char *type, const struct smap *details)
1888 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1889 const struct tc_ops *new_ops;
1892 new_ops = tc_lookup_ovs_name(type);
1893 if (!new_ops || !new_ops->tc_install) {
1897 error = tc_query_qdisc(netdev_);
1902 if (new_ops == netdev->tc->ops) {
1903 return new_ops->qdisc_set ? new_ops->qdisc_set(netdev_, details) : 0;
1905 /* Delete existing qdisc. */
1906 error = tc_del_qdisc(netdev_);
1910 ovs_assert(netdev->tc == NULL);
1912 /* Install new qdisc. */
1913 error = new_ops->tc_install(netdev_, details);
1914 ovs_assert((error == 0) == (netdev->tc != NULL));
/* Retrieves the configuration of queue 'queue_id' on 'netdev_' into 'details'
 * through the installed implementation's class_get callback.
 * NOTE(review): the conditions selecting the result (query failure, queue not
 * found) are not visible in this excerpt -- confirm. */
1921 netdev_linux_get_queue(const struct netdev *netdev_,
1922 unsigned int queue_id, struct smap *details)
1924 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1927 error = tc_query_qdisc(netdev_);
1931 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
1933 ? netdev->tc->ops->class_get(netdev_, queue, details)
/* Configures queue 'queue_id' on 'netdev_' with 'details' through the
 * installed implementation's class_set callback.  A queue ID at or beyond the
 * implementation's n_queues, or a missing class_set callback, is rejected.
 * NOTE(review): the error codes returned on those paths are not visible in
 * this excerpt -- confirm. */
1939 netdev_linux_set_queue(struct netdev *netdev_,
1940 unsigned int queue_id, const struct smap *details)
1942 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1945 error = tc_query_qdisc(netdev_);
1948 } else if (queue_id >= netdev->tc->ops->n_queues
1949 || !netdev->tc->ops->class_set) {
1953 return netdev->tc->ops->class_set(netdev_, queue_id, details);
/* Deletes queue 'queue_id' from 'netdev_' through the installed
 * implementation's class_delete callback.
 * NOTE(review): the results for a failed query, a missing callback and an
 * unknown queue are not visible in this excerpt -- confirm. */
1957 netdev_linux_delete_queue(struct netdev *netdev_, unsigned int queue_id)
1959 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1962 error = tc_query_qdisc(netdev_);
1965 } else if (!netdev->tc->ops->class_delete) {
1968 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
1970 ? netdev->tc->ops->class_delete(netdev_, queue)
/* Retrieves statistics for queue 'queue_id' on 'netdev_' into '*stats'.  The
 * queue's creation time is copied into stats->created and the remaining
 * fields are left to the implementation's class_get_stats callback.
 * NOTE(review): the handling of a failed query, a missing callback and an
 * unknown queue is not visible in this excerpt -- confirm. */
1976 netdev_linux_get_queue_stats(const struct netdev *netdev_,
1977 unsigned int queue_id,
1978 struct netdev_queue_stats *stats)
1980 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1983 error = tc_query_qdisc(netdev_);
1986 } else if (!netdev->tc->ops->class_get_stats) {
1989 const struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
1993 stats->created = queue->created;
1994 return netdev->tc->ops->class_get_stats(netdev_, queue, stats);
/* Starts a Netlink dump of the traffic classes of 'netdev' in '*dump' by
 * sending an RTM_GETTCLASS request with tcm_parent 0 over NETLINK_ROUTE.  The
 * request buffer is released once the dump has been started.
 * NOTE(review): the return values are not visible in this excerpt; the caller
 * treats a false result as failure -- confirm. */
1999 start_queue_dump(const struct netdev *netdev, struct nl_dump *dump)
2001 struct ofpbuf request;
2002 struct tcmsg *tcmsg;
2004 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
2008 tcmsg->tcm_parent = 0;
2009 nl_dump_start(dump, NETLINK_ROUTE, &request);
2010 ofpbuf_uninit(&request);
/* Invokes 'cb' with the ID and configuration of queues of 'netdev_', passing
 * 'aux' through.  Each queue's details are obtained from the implementation's
 * class_get callback into a scratch smap that is cleared before every queue
 * and destroyed at the end.  The _SAFE iterator is used for the walk.
 * NOTE(review): the condition under which 'cb' is called and the handling of
 * class_get errors are not visible in this excerpt -- confirm. */
2015 netdev_linux_dump_queues(const struct netdev *netdev_,
2016 netdev_dump_queues_cb *cb, void *aux)
2018 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2019 struct tc_queue *queue, *next_queue;
2020 struct smap details;
2024 error = tc_query_qdisc(netdev_);
2027 } else if (!netdev->tc->ops->class_get) {
2032 smap_init(&details);
2033 HMAP_FOR_EACH_SAFE (queue, next_queue, hmap_node,
2034 &netdev->tc->queues) {
2035 smap_clear(&details);
2037 error = netdev->tc->ops->class_get(netdev_, queue, &details);
2039 (*cb)(queue->queue_id, &details, aux);
2044 smap_destroy(&details);
/* Dumps the kernel's traffic classes for 'netdev_' over Netlink and hands
 * each reply to the implementation's class_dump_stats callback together with
 * 'cb' and 'aux'.  An error from finishing the dump takes precedence over
 * 'last_error' in the return value.
 * NOTE(review): the earlier error returns and the assignment of 'last_error'
 * are not visible in this excerpt -- confirm. */
2050 netdev_linux_dump_queue_stats(const struct netdev *netdev_,
2051 netdev_dump_queue_stats_cb *cb, void *aux)
2053 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2054 struct nl_dump dump;
2059 error = tc_query_qdisc(netdev_);
2062 } else if (!netdev->tc->ops->class_dump_stats) {
2067 if (!start_queue_dump(netdev_, &dump)) {
2070 while (nl_dump_next(&dump, &msg)) {
2071 error = netdev->tc->ops->class_dump_stats(netdev_, &msg, cb, aux);
2077 error = nl_dump_done(&dump);
2078 return error ? error : last_error;
/* Stores the IPv4 address and netmask of 'netdev_' in '*address' and
 * '*netmask'.  The values are fetched with SIOCGIFADDR and SIOCGIFNETMASK
 * when VALID_IN4 is not set, then cached in the device.  Returns
 * EADDRNOTAVAIL when the address is INADDR_ANY, otherwise 0.
 * NOTE(review): the returns taken when either ioctl fails are not visible in
 * this excerpt -- confirm. */
2082 netdev_linux_get_in4(const struct netdev *netdev_,
2083 struct in_addr *address, struct in_addr *netmask)
2085 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2087 if (!(netdev->cache_valid & VALID_IN4)) {
2090 error = netdev_linux_get_ipv4(netdev_, &netdev->address,
2091 SIOCGIFADDR, "SIOCGIFADDR");
2096 error = netdev_linux_get_ipv4(netdev_, &netdev->netmask,
2097 SIOCGIFNETMASK, "SIOCGIFNETMASK");
2102 netdev->cache_valid |= VALID_IN4;
2104 *address = netdev->address;
2105 *netmask = netdev->netmask;
2106 return address->s_addr == INADDR_ANY ? EADDRNOTAVAIL : 0;
2110 netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
2111 struct in_addr netmask)
2113 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2116 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
2118 netdev->cache_valid |= VALID_IN4;
2119 netdev->address = address;
2120 netdev->netmask = netmask;
2121 if (address.s_addr != INADDR_ANY) {
2122 error = do_set_addr(netdev_, SIOCSIFNETMASK,
2123 "SIOCSIFNETMASK", netmask);
/* Parses 'line', one line of /proc/net/if_inet6: 32 hex digits of IPv6
 * address, four hex fields that are skipped, and an interface name of at
 * most 16 characters.  Stores the address in '*in6' and the name in 'ifname'.
 * Returns true only if all 17 fields (16 address bytes plus the name) were
 * converted; on false the outputs may be partially written. */
static bool
parse_if_inet6_line(const char *line,
                    struct in6_addr *in6, char ifname[16 + 1])
{
    uint8_t *s6 = in6->s6_addr;
    int n_fields;

#define X8 "%2"SCNx8
    n_fields = sscanf(line,
                      " "X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8
                      "%*x %*x %*x %*x %16s\n",
                      &s6[0], &s6[1], &s6[2], &s6[3],
                      &s6[4], &s6[5], &s6[6], &s6[7],
                      &s6[8], &s6[9], &s6[10], &s6[11],
                      &s6[12], &s6[13], &s6[14], &s6[15],
                      ifname);
#undef X8  /* Keep the helper macro local to this function. */

    return n_fields == 17;
}
/* The address is looked up in /proc/net/if_inet6 when VALID_IN6 is not set:
 * netdev->in6 starts as in6addr_any and is overwritten by a line whose
 * interface name equals this device's name; the result is then cached.
 * NOTE(review): the loop exit after a match, the fclose() and the statements
 * that copy to '*in6' and return are not visible in this excerpt --
 * confirm. */
2145 /* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address (if
2146 * 'in6' is non-null) and returns true. Otherwise, returns false. */
2148 netdev_linux_get_in6(const struct netdev *netdev_, struct in6_addr *in6)
2150 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2151 if (!(netdev->cache_valid & VALID_IN6)) {
2155 netdev->in6 = in6addr_any;
2157 file = fopen("/proc/net/if_inet6", "r");
2159 const char *name = netdev_get_name(netdev_);
2160 while (fgets(line, sizeof line, file)) {
2161 struct in6_addr in6_tmp;
2162 char ifname[16 + 1];
2163 if (parse_if_inet6_line(line, &in6_tmp, ifname)
2164 && !strcmp(name, ifname))
2166 netdev->in6 = in6_tmp;
2172 netdev->cache_valid |= VALID_IN6;
/* Fills '*sa' with an AF_INET socket address for 'addr'.  Every byte of
 * '*sa' not occupied by the family or the address, including the port, is
 * zero. */
static void
make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
{
    struct sockaddr_in sin;

    /* Zeroing first also gives a zero port, so no explicit assignment is
     * needed for it. */
    memset(&sin, 0, sizeof sin);
    sin.sin_family = AF_INET;
    sin.sin_addr = addr;

    memset(sa, 0, sizeof *sa);
    memcpy(sa, &sin, sizeof sin);
}
/* Issues the address-setting ioctl 'ioctl_nr' (named 'ioctl_name' for log
 * messages) on 'netdev' with IPv4 address 'addr', and returns the result of
 * netdev_linux_do_ioctl(). */
2192 do_set_addr(struct netdev *netdev,
2193 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
2196 ovs_strzcpy(ifr.ifr_name, netdev_get_name(netdev), sizeof ifr.ifr_name);
2197 make_in4_sockaddr(&ifr.ifr_addr, addr);
2199 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
/* The route is installed with SIOCADDRT on 'af_inet_sock': destination and
 * mask are INADDR_ANY, the gateway is 'router', and the flags are
 * RTF_UP | RTF_GATEWAY.  A failure is logged.  'netdev' is unused.
 * NOTE(review): the return statement is not visible in this excerpt --
 * confirm. */
2203 /* Adds 'router' as a default IP gateway. */
2205 netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
2207 struct in_addr any = { INADDR_ANY };
2211 memset(&rt, 0, sizeof rt);
2212 make_in4_sockaddr(&rt.rt_dst, any);
2213 make_in4_sockaddr(&rt.rt_gateway, router);
2214 make_in4_sockaddr(&rt.rt_genmask, any);
2215 rt.rt_flags = RTF_UP | RTF_GATEWAY;
2216 error = ioctl(af_inet_sock, SIOCADDRT, &rt) < 0 ? errno : 0;
2218 VLOG_WARN("ioctl(SIOCADDRT): %s", ovs_strerror(error));
/* Searches /proc/net/route for a route that is up and whose masked
 * destination matches 'host'.  For a match, '*next_hop' is set to 0 when the
 * host is directly reachable or to the route's gateway otherwise, and
 * '*netdev_name' receives an xstrdup()'d copy of the interface name.
 * '*netdev_name' is set to NULL before the search starts.
 * NOTE(review): the third parameter, the header-line skip, the fclose() calls
 * and the return values are not visible in this excerpt -- confirm. */
2224 netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
2227 static const char fn[] = "/proc/net/route";
2232 *netdev_name = NULL;
2233 stream = fopen(fn, "r");
2234 if (stream == NULL) {
2235 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, ovs_strerror(errno));
2240 while (fgets(line, sizeof line, stream)) {
2243 ovs_be32 dest, gateway, mask;
2244 int refcnt, metric, mtu;
2245 unsigned int flags, use, window, irtt;
2248 "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
2250 iface, &dest, &gateway, &flags, &refcnt,
2251 &use, &metric, &mask, &mtu, &window, &irtt) != 11) {
2253 VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
2257 if (!(flags & RTF_UP)) {
2258 /* Skip routes that aren't up. */
2262 /* The output of 'dest', 'mask', and 'gateway' were given in
2263 * network byte order, so we don't need any endian
2264 * conversions here. */
2265 if ((dest & mask) == (host->s_addr & mask)) {
2267 /* The host is directly reachable. */
2268 next_hop->s_addr = 0;
2270 /* To reach the host, we must go through a gateway. */
2271 next_hop->s_addr = gateway;
2273 *netdev_name = xstrdup(iface);
/* Adds the driver name, driver version and firmware version of 'netdev_' to
 * 'smap'.  The information comes from an ETHTOOL_GDRVINFO query whose reply
 * is kept in netdev->drvinfo and marked valid with VALID_DRVINFO.
 * NOTE(review): the conditions around caching and around the smap_add() calls
 * and the return value are not visible in this excerpt -- confirm. */
2285 netdev_linux_get_status(const struct netdev *netdev_, struct smap *smap)
2287 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2290 if (!(netdev->cache_valid & VALID_DRVINFO)) {
/* netdev_linux_do_ethtool() takes a struct ethtool_cmd pointer, so the
 * drvinfo buffer is passed through a cast. */
2291 struct ethtool_cmd *cmd = (struct ethtool_cmd *) &netdev->drvinfo;
2293 COVERAGE_INC(netdev_get_ethtool);
2294 memset(&netdev->drvinfo, 0, sizeof netdev->drvinfo);
2295 error = netdev_linux_do_ethtool(netdev->up.name,
2298 "ETHTOOL_GDRVINFO");
2300 netdev->cache_valid |= VALID_DRVINFO;
2305 smap_add(smap, "driver_name", netdev->drvinfo.driver);
2306 smap_add(smap, "driver_version", netdev->drvinfo.version);
2307 smap_add(smap, "firmware_version", netdev->drvinfo.fw_version);
/* Status for internal devices: reports the fixed driver name "openvswitch"
 * in 'smap'.  'netdev' is unused. */
2313 netdev_internal_get_status(const struct netdev *netdev OVS_UNUSED,
2316 smap_add(smap, "driver_name", "openvswitch");
/* The lookup is a SIOCGARP ioctl on 'af_inet_sock', keyed by 'ip' and the
 * device name.  Failures other than ENXIO are logged. */
2320 /* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
2321 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
2322 * returns 0. Otherwise, it returns a positive errno value; in particular,
2323 * ENXIO indicates that there is no ARP table entry for 'ip' on 'netdev'. */
2325 netdev_linux_arp_lookup(const struct netdev *netdev,
2326 ovs_be32 ip, uint8_t mac[ETH_ADDR_LEN])
2329 struct sockaddr_in sin;
2332 memset(&r, 0, sizeof r);
2333 memset(&sin, 0, sizeof sin);
2334 sin.sin_family = AF_INET;
2335 sin.sin_addr.s_addr = ip;
2337 memcpy(&r.arp_pa, &sin, sizeof sin);
2338 r.arp_ha.sa_family = ARPHRD_ETHER;
2340 ovs_strzcpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
2341 COVERAGE_INC(netdev_arp_lookup);
2342 retval = ioctl(af_inet_sock, SIOCGARP, &r) < 0 ? errno : 0;
2344 memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
2345 } else if (retval != ENXIO) {
2346 VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
2347 netdev_get_name(netdev), IP_ARGS(ip),
2348 ovs_strerror(retval));
/* Translates the NETDEV_UP and NETDEV_PROMISC bits of 'nd' into interface
 * flags.
 * NOTE(review): the assignments and the return statement are not visible in
 * this excerpt; presumably the matching IFF_* bits -- confirm. */
2354 nd_to_iff_flags(enum netdev_flags nd)
2357 if (nd & NETDEV_UP) {
2360 if (nd & NETDEV_PROMISC) {
/* Translates interface flags 'iff' into enum netdev_flags; IFF_PROMISC maps
 * to NETDEV_PROMISC.
 * NOTE(review): the other mappings and the return statement are not visible
 * in this excerpt -- confirm. */
2367 iff_to_nd_flags(int iff)
2369 enum netdev_flags nd = 0;
2373 if (iff & IFF_PROMISC) {
2374 nd |= NETDEV_PROMISC;
/* Clears the flags in 'off' and sets the flags in 'on' on 'netdev_', storing
 * the previous flags, converted to enum netdev_flags, in '*old_flagsp'.  The
 * kernel is only contacted when the resulting flags differ from the cached
 * netdev->ifi_flags, after which the cache is refreshed with get_flags().
 * NOTE(review): the return statement is not visible in this excerpt --
 * confirm. */
2380 netdev_linux_update_flags(struct netdev *netdev_, enum netdev_flags off,
2381 enum netdev_flags on, enum netdev_flags *old_flagsp)
2383 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2384 int old_flags, new_flags;
2387 old_flags = netdev->ifi_flags;
2388 *old_flagsp = iff_to_nd_flags(old_flags);
2389 new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
2390 if (new_flags != old_flags) {
2391 error = set_flags(netdev_get_name(netdev_), new_flags);
2392 get_flags(netdev_, &netdev->ifi_flags);
/* Returns the change sequence number stored in 'netdev'. */
2398 netdev_linux_change_seq(const struct netdev *netdev)
2400 return netdev_linux_cast(netdev)->change_seq;
/* Expands to the initializer of a struct netdev_class.  NAME, CREATE,
 * GET_STATS, SET_STATS, GET_FEATURES and GET_STATUS are supplied per class;
 * the remaining members visible below are shared by every class built with
 * this macro.
 * NOTE(review): several initializer lines are not visible in this excerpt, so
 * the position of each macro argument cannot be confirmed from here. */
2403 #define NETDEV_LINUX_CLASS(NAME, CREATE, GET_STATS, SET_STATS, \
2404 GET_FEATURES, GET_STATUS) \
2408 netdev_linux_init, \
2410 netdev_linux_wait, \
2413 netdev_linux_destroy, \
2414 NULL, /* get_config */ \
2415 NULL, /* set_config */ \
2416 NULL, /* get_tunnel_config */ \
2418 netdev_linux_rx_open, \
2420 netdev_linux_send, \
2421 netdev_linux_send_wait, \
2423 netdev_linux_set_etheraddr, \
2424 netdev_linux_get_etheraddr, \
2425 netdev_linux_get_mtu, \
2426 netdev_linux_set_mtu, \
2427 netdev_linux_get_ifindex, \
2428 netdev_linux_get_carrier, \
2429 netdev_linux_get_carrier_resets, \
2430 netdev_linux_set_miimon_interval, \
2435 netdev_linux_set_advertisements, \
2437 netdev_linux_set_policing, \
2438 netdev_linux_get_qos_types, \
2439 netdev_linux_get_qos_capabilities, \
2440 netdev_linux_get_qos, \
2441 netdev_linux_set_qos, \
2442 netdev_linux_get_queue, \
2443 netdev_linux_set_queue, \
2444 netdev_linux_delete_queue, \
2445 netdev_linux_get_queue_stats, \
2446 netdev_linux_dump_queues, \
2447 netdev_linux_dump_queue_stats, \
2449 netdev_linux_get_in4, \
2450 netdev_linux_set_in4, \
2451 netdev_linux_get_in6, \
2452 netdev_linux_add_router, \
2453 netdev_linux_get_next_hop, \
2455 netdev_linux_arp_lookup, \
2457 netdev_linux_update_flags, \
2459 netdev_linux_change_seq \
/* Class for ordinary Linux network devices: combined vport + kernel
 * statistics, no set_stats, ethtool-based features and status. */
2462 const struct netdev_class netdev_linux_class =
2465 netdev_linux_create,
2466 netdev_linux_get_stats,
2467 NULL, /* set_stats */
2468 netdev_linux_get_features,
2469 netdev_linux_get_status);
/* Class for tap devices: created with netdev_linux_create_tap() and using
 * the tap-specific statistics function; otherwise the same callbacks as
 * netdev_linux_class. */
2471 const struct netdev_class netdev_tap_class =
2474 netdev_linux_create_tap,
2475 netdev_tap_get_stats,
2476 NULL, /* set_stats */
2477 netdev_linux_get_features,
2478 netdev_linux_get_status);
/* Class for internal devices: statistics are read from and written to the
 * vport layer, there is no get_features callback, and status reports the
 * "openvswitch" driver. */
2480 const struct netdev_class netdev_internal_class =
2483 netdev_linux_create,
2484 netdev_internal_get_stats,
2485 netdev_internal_set_stats,
2486 NULL, /* get_features */
2487 netdev_internal_get_status);
/* Receive-side callbacks (destroy, recv, wait, drain) for Linux netdevs. */
2489 static const struct netdev_rx_class netdev_rx_linux_class = {
2490 netdev_rx_linux_destroy,
2491 netdev_rx_linux_recv,
2492 netdev_rx_linux_wait,
2493 netdev_rx_linux_drain,
2496 /* HTB traffic control class. */
/* NOTE(review): HTB_N_QUEUES is presumably the number of queues the HTB
 * implementation advertises -- its use is not visible in this excerpt. */
2498 #define HTB_N_QUEUES 0xf000
/* Member of the per-device HTB state; htb_install__() stores the qdisc-wide
 * maximum rate here.  The enclosing struct header is not visible in this
 * excerpt. */
2502 unsigned int max_rate; /* In bytes/s. */
/* Members describing one HTB class; htb_setup_class__() reads min_rate,
 * max_rate, burst and priority from a struct htb_class.  The enclosing struct
 * header is not visible in this excerpt. */
2506 struct tc_queue tc_queue;
2507 unsigned int min_rate; /* In bytes/s. */
2508 unsigned int max_rate; /* In bytes/s. */
2509 unsigned int burst; /* In bytes. */
2510 unsigned int priority; /* Lower values are higher priorities. */
/* Returns the struct htb that embeds netdev->tc as its 'tc' member.
 * NOTE(review): presumably only valid while HTB is the installed qdisc --
 * confirm against callers. */
2514 htb_get__(const struct netdev *netdev_)
2516 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2517 return CONTAINER_OF(netdev->tc, struct htb, tc);
/* Allocates a new struct htb, initializes its embedded 'tc' with tc_ops_htb,
 * records 'max_rate', and makes it the traffic-control state of 'netdev_'.
 * Note that 'max_rate' is a uint64_t stored into an unsigned int member. */
2521 htb_install__(struct netdev *netdev_, uint64_t max_rate)
2523 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2526 htb = xmalloc(sizeof *htb);
2527 tc_init(&htb->tc, &tc_ops_htb);
2528 htb->max_rate = max_rate;
2530 netdev->tc = &htb->tc;
/* Any existing qdisc is deleted first.  The RTM_NEWQDISC request uses handle
 * 1:0 under TC_H_ROOT, kind "htb", and a TCA_HTB_INIT option block with
 * rate2quantum set to 10.  Returns the result of tc_transact().
 * NOTE(review): the handling of a failed tc_make_request() and any further
 * 'opt' fields are not visible in this excerpt -- confirm. */
2533 /* Create an HTB qdisc.
2535 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
2537 htb_setup_qdisc__(struct netdev *netdev)
2540 struct tc_htb_glob opt;
2541 struct ofpbuf request;
2542 struct tcmsg *tcmsg;
2544 tc_del_qdisc(netdev);
2546 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2547 NLM_F_EXCL | NLM_F_CREATE, &request);
2551 tcmsg->tcm_handle = tc_make_handle(1, 0);
2552 tcmsg->tcm_parent = TC_H_ROOT;
2554 nl_msg_put_string(&request, TCA_KIND, "htb");
2556 memset(&opt, 0, sizeof opt);
2557 opt.rate2quantum = 10;
2561 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2562 nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
2563 nl_msg_end_nested(&request, opt_offset);
2565 return tc_transact(&request, NULL);
2568 /* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
2569 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
2571 htb_setup_class__(struct netdev *netdev, unsigned int handle,
2572 unsigned int parent, struct htb_class *class)
2575 struct tc_htb_opt opt;
2576 struct ofpbuf request;
2577 struct tcmsg *tcmsg;
2581 error = netdev_get_mtu(netdev, &mtu);
2583 VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU",
2584 netdev_get_name(netdev));
2588 memset(&opt, 0, sizeof opt);
2589 tc_fill_rate(&opt.rate, class->min_rate, mtu);
2590 tc_fill_rate(&opt.ceil, class->max_rate, mtu);
2591 opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
2592 opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
2593 opt.prio = class->priority;
2595 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
2599 tcmsg->tcm_handle = handle;
2600 tcmsg->tcm_parent = parent;
2602 nl_msg_put_string(&request, TCA_KIND, "htb");
2603 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2604 nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
2605 tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
2606 tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
2607 nl_msg_end_nested(&request, opt_offset);
2609 error = tc_transact(&request, NULL);
2611 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
2612 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
2613 netdev_get_name(netdev),
2614 tc_get_major(handle), tc_get_minor(handle),
2615 tc_get_major(parent), tc_get_minor(parent),
2616 class->min_rate, class->max_rate,
2617 class->burst, class->priority, ovs_strerror(error));
2622 /* Parses Netlink attributes in 'options' for HTB parameters and stores a
2623 * description of them into 'details'. The description complies with the
2624 * specification given in the vswitch database documentation for linux-htb
2627 htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
2629 static const struct nl_policy tca_htb_policy[] = {
2630 [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
2631 .min_len = sizeof(struct tc_htb_opt) },
2634 struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
2635 const struct tc_htb_opt *htb;
2637 if (!nl_parse_nested(nl_options, tca_htb_policy,
2638 attrs, ARRAY_SIZE(tca_htb_policy))) {
2639 VLOG_WARN_RL(&rl, "failed to parse HTB class options");
2643 htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
2644 class->min_rate = htb->rate.rate;
2645 class->max_rate = htb->ceil.rate;
2646 class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
2647 class->priority = htb->prio;
2652 htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2653 struct htb_class *options,
2654 struct netdev_queue_stats *stats)
2656 struct nlattr *nl_options;
2657 unsigned int handle;
2660 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2661 if (!error && queue_id) {
2662 unsigned int major = tc_get_major(handle);
2663 unsigned int minor = tc_get_minor(handle);
2664 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2665 *queue_id = minor - 1;
2670 if (!error && options) {
2671 error = htb_parse_tca_options__(nl_options, options);
2677 htb_parse_qdisc_details__(struct netdev *netdev,
2678 const struct smap *details, struct htb_class *hc)
2680 const char *max_rate_s;
2682 max_rate_s = smap_get(details, "max-rate");
2683 hc->max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
2684 if (!hc->max_rate) {
2685 enum netdev_features current;
2687 netdev_get_features(netdev, ¤t, NULL, NULL, NULL);
2688 hc->max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
2690 hc->min_rate = hc->max_rate;
2696 htb_parse_class_details__(struct netdev *netdev,
2697 const struct smap *details, struct htb_class *hc)
2699 const struct htb *htb = htb_get__(netdev);
2700 const char *min_rate_s = smap_get(details, "min-rate");
2701 const char *max_rate_s = smap_get(details, "max-rate");
2702 const char *burst_s = smap_get(details, "burst");
2703 const char *priority_s = smap_get(details, "priority");
2706 error = netdev_get_mtu(netdev, &mtu);
2708 VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU",
2709 netdev_get_name(netdev));
2713 /* HTB requires at least an mtu sized min-rate to send any traffic even
2714 * on uncongested links. */
2715 hc->min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
2716 hc->min_rate = MAX(hc->min_rate, mtu);
2717 hc->min_rate = MIN(hc->min_rate, htb->max_rate);
2720 hc->max_rate = (max_rate_s
2721 ? strtoull(max_rate_s, NULL, 10) / 8
2723 hc->max_rate = MAX(hc->max_rate, hc->min_rate);
2724 hc->max_rate = MIN(hc->max_rate, htb->max_rate);
2728 * According to hints in the documentation that I've read, it is important
2729 * that 'burst' be at least as big as the largest frame that might be
2730 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
2731 * but having it a bit too small is a problem. Since netdev_get_mtu()
2732 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
2733 * the MTU. We actually add 64, instead of 14, as a guard against
2734 * additional headers get tacked on somewhere that we're not aware of. */
2735 hc->burst = burst_s ? strtoull(burst_s, NULL, 10) / 8 : 0;
2736 hc->burst = MAX(hc->burst, mtu + 64);
2739 hc->priority = priority_s ? strtoul(priority_s, NULL, 10) : 0;
/* Queries the kernel for class 'handle' (child of 'parent') on 'netdev' and
 * parses the reply into 'options' and 'stats', either of which may be null.
 *
 * Returns 0 if successful, otherwise a positive errno value. */
static int
htb_query_class__(const struct netdev *netdev, unsigned int handle,
                  unsigned int parent, struct htb_class *options,
                  struct netdev_queue_stats *stats)
{
    struct ofpbuf *reply;
    int error = tc_query_class(netdev, handle, parent, &reply);

    if (error) {
        return error;
    }

    error = htb_parse_tcmsg__(reply, NULL, options, stats);
    ofpbuf_delete(reply);
    return error;
}
2761 htb_tc_install(struct netdev *netdev, const struct smap *details)
2765 error = htb_setup_qdisc__(netdev);
2767 struct htb_class hc;
2769 htb_parse_qdisc_details__(netdev, details, &hc);
2770 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2771 tc_make_handle(1, 0), &hc);
2773 htb_install__(netdev, hc.max_rate);
2779 static struct htb_class *
2780 htb_class_cast__(const struct tc_queue *queue)
2782 return CONTAINER_OF(queue, struct htb_class, tc_queue);
2786 htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
2787 const struct htb_class *hc)
2789 struct htb *htb = htb_get__(netdev);
2790 size_t hash = hash_int(queue_id, 0);
2791 struct tc_queue *queue;
2792 struct htb_class *hcp;
2794 queue = tc_find_queue__(netdev, queue_id, hash);
2796 hcp = htb_class_cast__(queue);
2798 hcp = xmalloc(sizeof *hcp);
2799 queue = &hcp->tc_queue;
2800 queue->queue_id = queue_id;
2801 queue->created = time_msec();
2802 hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
2805 hcp->min_rate = hc->min_rate;
2806 hcp->max_rate = hc->max_rate;
2807 hcp->burst = hc->burst;
2808 hcp->priority = hc->priority;
2812 htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
2815 struct nl_dump dump;
2816 struct htb_class hc;
2818 /* Get qdisc options. */
2820 htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
2821 htb_install__(netdev, hc.max_rate);
2824 if (!start_queue_dump(netdev, &dump)) {
2827 while (nl_dump_next(&dump, &msg)) {
2828 unsigned int queue_id;
2830 if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
2831 htb_update_queue__(netdev, queue_id, &hc);
2834 nl_dump_done(&dump);
2840 htb_tc_destroy(struct tc *tc)
2842 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
2843 struct htb_class *hc, *next;
2845 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &htb->tc.queues) {
2846 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
2854 htb_qdisc_get(const struct netdev *netdev, struct smap *details)
2856 const struct htb *htb = htb_get__(netdev);
2857 smap_add_format(details, "max-rate", "%llu", 8ULL * htb->max_rate);
2862 htb_qdisc_set(struct netdev *netdev, const struct smap *details)
2864 struct htb_class hc;
2867 htb_parse_qdisc_details__(netdev, details, &hc);
2868 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2869 tc_make_handle(1, 0), &hc);
2871 htb_get__(netdev)->max_rate = hc.max_rate;
2877 htb_class_get(const struct netdev *netdev OVS_UNUSED,
2878 const struct tc_queue *queue, struct smap *details)
2880 const struct htb_class *hc = htb_class_cast__(queue);
2882 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
2883 if (hc->min_rate != hc->max_rate) {
2884 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
2886 smap_add_format(details, "burst", "%llu", 8ULL * hc->burst);
2888 smap_add_format(details, "priority", "%u", hc->priority);
2894 htb_class_set(struct netdev *netdev, unsigned int queue_id,
2895 const struct smap *details)
2897 struct htb_class hc;
2900 error = htb_parse_class_details__(netdev, details, &hc);
2905 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
2906 tc_make_handle(1, 0xfffe), &hc);
2911 htb_update_queue__(netdev, queue_id, &hc);
2916 htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
2918 struct htb_class *hc = htb_class_cast__(queue);
2919 struct htb *htb = htb_get__(netdev);
2922 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
2924 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
2931 htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
2932 struct netdev_queue_stats *stats)
2934 return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
2935 tc_make_handle(1, 0xfffe), NULL, stats);
2939 htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
2940 const struct ofpbuf *nlmsg,
2941 netdev_dump_queue_stats_cb *cb, void *aux)
2943 struct netdev_queue_stats stats;
2944 unsigned int handle, major, minor;
2947 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
2952 major = tc_get_major(handle);
2953 minor = tc_get_minor(handle);
2954 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2955 (*cb)(minor - 1, &stats, aux);
2960 static const struct tc_ops tc_ops_htb = {
2961 "htb", /* linux_name */
2962 "linux-htb", /* ovs_name */
2963 HTB_N_QUEUES, /* n_queues */
2972 htb_class_get_stats,
2973 htb_class_dump_stats
2976 /* "linux-hfsc" traffic control class. */
2978 #define HFSC_N_QUEUES 0xf000
2986 struct tc_queue tc_queue;
2991 static struct hfsc *
2992 hfsc_get__(const struct netdev *netdev_)
2994 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2995 return CONTAINER_OF(netdev->tc, struct hfsc, tc);
2998 static struct hfsc_class *
2999 hfsc_class_cast__(const struct tc_queue *queue)
3001 return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
3005 hfsc_install__(struct netdev *netdev_, uint32_t max_rate)
3007 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3010 hfsc = xmalloc(sizeof *hfsc);
3011 tc_init(&hfsc->tc, &tc_ops_hfsc);
3012 hfsc->max_rate = max_rate;
3013 netdev->tc = &hfsc->tc;
3017 hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
3018 const struct hfsc_class *hc)
3022 struct hfsc_class *hcp;
3023 struct tc_queue *queue;
3025 hfsc = hfsc_get__(netdev);
3026 hash = hash_int(queue_id, 0);
3028 queue = tc_find_queue__(netdev, queue_id, hash);
3030 hcp = hfsc_class_cast__(queue);
3032 hcp = xmalloc(sizeof *hcp);
3033 queue = &hcp->tc_queue;
3034 queue->queue_id = queue_id;
3035 queue->created = time_msec();
3036 hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
3039 hcp->min_rate = hc->min_rate;
3040 hcp->max_rate = hc->max_rate;
3044 hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
3046 const struct tc_service_curve *rsc, *fsc, *usc;
3047 static const struct nl_policy tca_hfsc_policy[] = {
3049 .type = NL_A_UNSPEC,
3051 .min_len = sizeof(struct tc_service_curve),
3054 .type = NL_A_UNSPEC,
3056 .min_len = sizeof(struct tc_service_curve),
3059 .type = NL_A_UNSPEC,
3061 .min_len = sizeof(struct tc_service_curve),
3064 struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];
3066 if (!nl_parse_nested(nl_options, tca_hfsc_policy,
3067 attrs, ARRAY_SIZE(tca_hfsc_policy))) {
3068 VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
3072 rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
3073 fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
3074 usc = nl_attr_get(attrs[TCA_HFSC_USC]);
3076 if (rsc->m1 != 0 || rsc->d != 0 ||
3077 fsc->m1 != 0 || fsc->d != 0 ||
3078 usc->m1 != 0 || usc->d != 0) {
3079 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3080 "Non-linear service curves are not supported.");
3084 if (rsc->m2 != fsc->m2) {
3085 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3086 "Real-time service curves are not supported ");
3090 if (rsc->m2 > usc->m2) {
3091 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3092 "Min-rate service curve is greater than "
3093 "the max-rate service curve.");
3097 class->min_rate = fsc->m2;
3098 class->max_rate = usc->m2;
3103 hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
3104 struct hfsc_class *options,
3105 struct netdev_queue_stats *stats)
3108 unsigned int handle;
3109 struct nlattr *nl_options;
3111 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
3117 unsigned int major, minor;
3119 major = tc_get_major(handle);
3120 minor = tc_get_minor(handle);
3121 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3122 *queue_id = minor - 1;
3129 error = hfsc_parse_tca_options__(nl_options, options);
/* Queries the kernel for class 'handle' (child of 'parent') on 'netdev' and
 * parses the reply into 'options' and 'stats', either of which may be null.
 *
 * Returns 0 if successful, otherwise a positive errno value. */
static int
hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
                   unsigned int parent, struct hfsc_class *options,
                   struct netdev_queue_stats *stats)
{
    struct ofpbuf *reply;
    int error;

    error = tc_query_class(netdev, handle, parent, &reply);
    if (!error) {
        error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
        ofpbuf_delete(reply);
    }
    return error;
}
3154 hfsc_parse_qdisc_details__(struct netdev *netdev, const struct smap *details,
3155 struct hfsc_class *class)
3158 const char *max_rate_s;
3160 max_rate_s = smap_get(details, "max-rate");
3161 max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
3164 enum netdev_features current;
3166 netdev_get_features(netdev, ¤t, NULL, NULL, NULL);
3167 max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
3170 class->min_rate = max_rate;
3171 class->max_rate = max_rate;
3175 hfsc_parse_class_details__(struct netdev *netdev,
3176 const struct smap *details,
3177 struct hfsc_class * class)
3179 const struct hfsc *hfsc;
3180 uint32_t min_rate, max_rate;
3181 const char *min_rate_s, *max_rate_s;
3183 hfsc = hfsc_get__(netdev);
3184 min_rate_s = smap_get(details, "min-rate");
3185 max_rate_s = smap_get(details, "max-rate");
3187 min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
3188 min_rate = MAX(min_rate, 1);
3189 min_rate = MIN(min_rate, hfsc->max_rate);
3191 max_rate = (max_rate_s
3192 ? strtoull(max_rate_s, NULL, 10) / 8
3194 max_rate = MAX(max_rate, min_rate);
3195 max_rate = MIN(max_rate, hfsc->max_rate);
3197 class->min_rate = min_rate;
3198 class->max_rate = max_rate;
3203 /* Create an HFSC qdisc.
3205 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
3207 hfsc_setup_qdisc__(struct netdev * netdev)
3209 struct tcmsg *tcmsg;
3210 struct ofpbuf request;
3211 struct tc_hfsc_qopt opt;
3213 tc_del_qdisc(netdev);
3215 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3216 NLM_F_EXCL | NLM_F_CREATE, &request);
3222 tcmsg->tcm_handle = tc_make_handle(1, 0);
3223 tcmsg->tcm_parent = TC_H_ROOT;
3225 memset(&opt, 0, sizeof opt);
3228 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3229 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
3231 return tc_transact(&request, NULL);
3234 /* Create an HFSC class.
3236 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
3237 * sc rate <min_rate> ul rate <max_rate>" */
3239 hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
3240 unsigned int parent, struct hfsc_class *class)
3244 struct tcmsg *tcmsg;
3245 struct ofpbuf request;
3246 struct tc_service_curve min, max;
3248 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
3254 tcmsg->tcm_handle = handle;
3255 tcmsg->tcm_parent = parent;
3259 min.m2 = class->min_rate;
3263 max.m2 = class->max_rate;
3265 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3266 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3267 nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
3268 nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
3269 nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
3270 nl_msg_end_nested(&request, opt_offset);
3272 error = tc_transact(&request, NULL);
3274 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
3275 "min-rate %ubps, max-rate %ubps (%s)",
3276 netdev_get_name(netdev),
3277 tc_get_major(handle), tc_get_minor(handle),
3278 tc_get_major(parent), tc_get_minor(parent),
3279 class->min_rate, class->max_rate, ovs_strerror(error));
3286 hfsc_tc_install(struct netdev *netdev, const struct smap *details)
3289 struct hfsc_class class;
3291 error = hfsc_setup_qdisc__(netdev);
3297 hfsc_parse_qdisc_details__(netdev, details, &class);
3298 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3299 tc_make_handle(1, 0), &class);
3305 hfsc_install__(netdev, class.max_rate);
3310 hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3313 struct nl_dump dump;
3314 struct hfsc_class hc;
3317 hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
3318 hfsc_install__(netdev, hc.max_rate);
3320 if (!start_queue_dump(netdev, &dump)) {
3324 while (nl_dump_next(&dump, &msg)) {
3325 unsigned int queue_id;
3327 if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3328 hfsc_update_queue__(netdev, queue_id, &hc);
3332 nl_dump_done(&dump);
3337 hfsc_tc_destroy(struct tc *tc)
3340 struct hfsc_class *hc, *next;
3342 hfsc = CONTAINER_OF(tc, struct hfsc, tc);
3344 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
3345 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
3354 hfsc_qdisc_get(const struct netdev *netdev, struct smap *details)
3356 const struct hfsc *hfsc;
3357 hfsc = hfsc_get__(netdev);
3358 smap_add_format(details, "max-rate", "%llu", 8ULL * hfsc->max_rate);
3363 hfsc_qdisc_set(struct netdev *netdev, const struct smap *details)
3366 struct hfsc_class class;
3368 hfsc_parse_qdisc_details__(netdev, details, &class);
3369 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3370 tc_make_handle(1, 0), &class);
3373 hfsc_get__(netdev)->max_rate = class.max_rate;
3380 hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
3381 const struct tc_queue *queue, struct smap *details)
3383 const struct hfsc_class *hc;
3385 hc = hfsc_class_cast__(queue);
3386 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
3387 if (hc->min_rate != hc->max_rate) {
3388 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
3394 hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
3395 const struct smap *details)
3398 struct hfsc_class class;
3400 error = hfsc_parse_class_details__(netdev, details, &class);
3405 error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3406 tc_make_handle(1, 0xfffe), &class);
3411 hfsc_update_queue__(netdev, queue_id, &class);
3416 hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
3420 struct hfsc_class *hc;
3422 hc = hfsc_class_cast__(queue);
3423 hfsc = hfsc_get__(netdev);
3425 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3427 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
3434 hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3435 struct netdev_queue_stats *stats)
3437 return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3438 tc_make_handle(1, 0xfffe), NULL, stats);
3442 hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3443 const struct ofpbuf *nlmsg,
3444 netdev_dump_queue_stats_cb *cb, void *aux)
3446 struct netdev_queue_stats stats;
3447 unsigned int handle, major, minor;
3450 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3455 major = tc_get_major(handle);
3456 minor = tc_get_minor(handle);
3457 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3458 (*cb)(minor - 1, &stats, aux);
3463 static const struct tc_ops tc_ops_hfsc = {
3464 "hfsc", /* linux_name */
3465 "linux-hfsc", /* ovs_name */
3466 HFSC_N_QUEUES, /* n_queues */
3467 hfsc_tc_install, /* tc_install */
3468 hfsc_tc_load, /* tc_load */
3469 hfsc_tc_destroy, /* tc_destroy */
3470 hfsc_qdisc_get, /* qdisc_get */
3471 hfsc_qdisc_set, /* qdisc_set */
3472 hfsc_class_get, /* class_get */
3473 hfsc_class_set, /* class_set */
3474 hfsc_class_delete, /* class_delete */
3475 hfsc_class_get_stats, /* class_get_stats */
3476 hfsc_class_dump_stats /* class_dump_stats */
3479 /* "linux-default" traffic control class.
3481 * This class represents the default, unnamed Linux qdisc. It corresponds to
3482 * the "" (empty string) QoS type in the OVS database. */
3485 default_install__(struct netdev *netdev_)
3487 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3488 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_default);
3490 /* Nothing but a tc class implementation is allowed to write to a tc. This
3491 * class never does that, so we can legitimately use a const tc object. */
3492 netdev->tc = CONST_CAST(struct tc *, &tc);
3496 default_tc_install(struct netdev *netdev,
3497 const struct smap *details OVS_UNUSED)
3499 default_install__(netdev);
3504 default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3506 default_install__(netdev);
3510 static const struct tc_ops tc_ops_default = {
3511 NULL, /* linux_name */
3516 NULL, /* tc_destroy */
3517 NULL, /* qdisc_get */
3518 NULL, /* qdisc_set */
3519 NULL, /* class_get */
3520 NULL, /* class_set */
3521 NULL, /* class_delete */
3522 NULL, /* class_get_stats */
3523 NULL /* class_dump_stats */
3526 /* "linux-other" traffic control class.
3531 other_tc_load(struct netdev *netdev_, struct ofpbuf *nlmsg OVS_UNUSED)
3533 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3534 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_other);
3536 /* Nothing but a tc class implementation is allowed to write to a tc. This
3537 * class never does that, so we can legitimately use a const tc object. */
3538 netdev->tc = CONST_CAST(struct tc *, &tc);
3542 static const struct tc_ops tc_ops_other = {
3543 NULL, /* linux_name */
3544 "linux-other", /* ovs_name */
3546 NULL, /* tc_install */
3548 NULL, /* tc_destroy */
3549 NULL, /* qdisc_get */
3550 NULL, /* qdisc_set */
3551 NULL, /* class_get */
3552 NULL, /* class_set */
3553 NULL, /* class_delete */
3554 NULL, /* class_get_stats */
3555 NULL /* class_dump_stats */
/* Traffic control. */

/* Number of kernel "tc" ticks per second.  Computed from the contents of
 * /proc/net/psched the first time it is needed. */
static double ticks_per_s;

/* Number of kernel "jiffies" per second.  This is used for the purpose of
 * computing buffer sizes.  Generally kernel qdiscs need to be able to buffer
 * one jiffy's worth of data.
 *
 * There are two possibilities here:
 *
 *    - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
 *      approximate range of 100 to 1024.  That means that we really need to
 *      make sure that the qdisc can buffer that much data.
 *
 *    - 'buffer_hz' is an absurdly large number.  That means that the kernel
 *      has finely granular timers and there's no need to fudge additional room
 *      for buffers.  (There's no extra effort needed to implement that: the
 *      large 'buffer_hz' is used as a divisor, so practically any number will
 *      come out as 0 in the division.  Small integer results in the case of
 *      really high dividends won't have any real effect anyhow.)
 *
 * Because this value is used as a divisor (see tc_buffer_per_jiffy()), it must
 * never be zero. */
static unsigned int buffer_hz;
/* Returns tc handle 'major':'minor', with 'major' in the upper 16 bits and
 * 'minor' in the lower 16 bits. */
static unsigned int
tc_make_handle(unsigned int major, unsigned int minor)
{
    unsigned int major_bits = major << 16;

    return TC_H_MAKE(major_bits, minor);
}
/* Returns the major number from 'handle', i.e. its upper 16 bits. */
static unsigned int
tc_get_major(unsigned int handle)
{
    unsigned int major_bits = TC_H_MAJ(handle);

    return major_bits >> 16;
}
/* Returns the minor number from 'handle', i.e. its lower 16 bits. */
static unsigned int
tc_get_minor(unsigned int handle)
{
    unsigned int minor_bits = TC_H_MIN(handle);

    return minor_bits;
}
3603 static struct tcmsg *
3604 tc_make_request(const struct netdev *netdev, int type, unsigned int flags,
3605 struct ofpbuf *request)
3607 struct tcmsg *tcmsg;
3611 error = get_ifindex(netdev, &ifindex);
3616 ofpbuf_init(request, 512);
3617 nl_msg_put_nlmsghdr(request, sizeof *tcmsg, type, NLM_F_REQUEST | flags);
3618 tcmsg = ofpbuf_put_zeros(request, sizeof *tcmsg);
3619 tcmsg->tcm_family = AF_UNSPEC;
3620 tcmsg->tcm_ifindex = ifindex;
3621 /* Caller should fill in tcmsg->tcm_handle. */
3622 /* Caller should fill in tcmsg->tcm_parent. */
3628 tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
3630 int error = nl_transact(NETLINK_ROUTE, request, replyp);
3631 ofpbuf_uninit(request);
3635 /* Adds or deletes a root ingress qdisc on 'netdev'. We use this for
3636 * policing configuration.
3638 * This function is equivalent to running the following when 'add' is true:
3639 * /sbin/tc qdisc add dev <devname> handle ffff: ingress
3641 * This function is equivalent to running the following when 'add' is false:
3642 * /sbin/tc qdisc del dev <devname> handle ffff: ingress
3644 * The configuration and stats may be seen with the following command:
3645 * /sbin/tc -s qdisc show dev <devname>
3647 * Returns 0 if successful, otherwise a positive errno value.
3650 tc_add_del_ingress_qdisc(struct netdev *netdev, bool add)
3652 struct ofpbuf request;
3653 struct tcmsg *tcmsg;
3655 int type = add ? RTM_NEWQDISC : RTM_DELQDISC;
3656 int flags = add ? NLM_F_EXCL | NLM_F_CREATE : 0;
3658 tcmsg = tc_make_request(netdev, type, flags, &request);
3662 tcmsg->tcm_handle = tc_make_handle(0xffff, 0);
3663 tcmsg->tcm_parent = TC_H_INGRESS;
3664 nl_msg_put_string(&request, TCA_KIND, "ingress");
3665 nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0);
3667 error = tc_transact(&request, NULL);
3669 /* If we're deleting the qdisc, don't worry about some of the
3670 * error conditions. */
3671 if (!add && (error == ENOENT || error == EINVAL)) {
3680 /* Adds a policer to 'netdev' with a rate of 'kbits_rate' and a burst size
3683 * This function is equivalent to running:
3684 * /sbin/tc filter add dev <devname> parent ffff: protocol all prio 49
3685 * basic police rate <kbits_rate>kbit burst <kbits_burst>k
3688 * The configuration and stats may be seen with the following command:
3689 * /sbin/tc -s filter show <devname> eth0 parent ffff:
3691 * Returns 0 if successful, otherwise a positive errno value.
3694 tc_add_policer(struct netdev *netdev, int kbits_rate, int kbits_burst)
3696 struct tc_police tc_police;
3697 struct ofpbuf request;
3698 struct tcmsg *tcmsg;
3699 size_t basic_offset;
3700 size_t police_offset;
3704 memset(&tc_police, 0, sizeof tc_police);
3705 tc_police.action = TC_POLICE_SHOT;
3706 tc_police.mtu = mtu;
3707 tc_fill_rate(&tc_police.rate, (kbits_rate * 1000)/8, mtu);
3708 tc_police.burst = tc_bytes_to_ticks(tc_police.rate.rate,
3709 kbits_burst * 1024);
3711 tcmsg = tc_make_request(netdev, RTM_NEWTFILTER,
3712 NLM_F_EXCL | NLM_F_CREATE, &request);
3716 tcmsg->tcm_parent = tc_make_handle(0xffff, 0);
3717 tcmsg->tcm_info = tc_make_handle(49,
3718 (OVS_FORCE uint16_t) htons(ETH_P_ALL));
3720 nl_msg_put_string(&request, TCA_KIND, "basic");
3721 basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3722 police_offset = nl_msg_start_nested(&request, TCA_BASIC_POLICE);
3723 nl_msg_put_unspec(&request, TCA_POLICE_TBF, &tc_police, sizeof tc_police);
3724 tc_put_rtab(&request, TCA_POLICE_RATE, &tc_police.rate);
3725 nl_msg_end_nested(&request, police_offset);
3726 nl_msg_end_nested(&request, basic_offset);
3728 error = tc_transact(&request, NULL);
3739 /* The values in psched are not individually very meaningful, but they are
3740 * important. The tables below show some values seen in the wild.
3744 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
3745 * (Before that, there are hints that it was 1000000000.)
3747 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
3751 * -----------------------------------
3752 * [1] 000c8000 000f4240 000f4240 00000064
3753 * [2] 000003e8 00000400 000f4240 3b9aca00
3754 * [3] 000003e8 00000400 000f4240 3b9aca00
3755 * [4] 000003e8 00000400 000f4240 00000064
3756 * [5] 000003e8 00000040 000f4240 3b9aca00
3757 * [6] 000003e8 00000040 000f4240 000000f9
3759 * a b c d ticks_per_s buffer_hz
3760 * ------- --------- ---------- ------------- ----------- -------------
3761 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
3762 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3763 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3764 * [4] 1,000 1,024 1,000,000 100 976,562 100
3765 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
3766 * [6] 1,000 64 1,000,000 249 15,625,000 249
3768 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
3769 * [2] 2.6.26-1-686-bigmem from Debian lenny
3770 * [3] 2.6.26-2-sparc64 from Debian lenny
3771 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
3772 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
3773 * [6] 2.6.34 from kernel.org on KVM
3775 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
3776 static const char fn[] = "/proc/net/psched";
3777 unsigned int a, b, c, d;
3780 if (!ovsthread_once_start(&once)) {
3787 stream = fopen(fn, "r");
3789 VLOG_WARN("%s: open failed: %s", fn, ovs_strerror(errno));
3793 if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
3794 VLOG_WARN("%s: read failed", fn);
3798 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
3802 VLOG_WARN("%s: invalid scheduler parameters", fn);
3806 ticks_per_s = (double) a * c / b;
3810 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
3813 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);
3816 ovsthread_once_done(&once);
3819 /* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
3820 * rate of 'rate' bytes per second. */
3822 tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
3825 return (rate * ticks) / ticks_per_s;
3828 /* Returns the number of ticks that it would take to transmit 'size' bytes at a
3829 * rate of 'rate' bytes per second. */
3831 tc_bytes_to_ticks(unsigned int rate, unsigned int size)
3834 return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
3837 /* Returns the number of bytes that need to be reserved for qdisc buffering at
3838 * a transmission rate of 'rate' bytes per second. */
3840 tc_buffer_per_jiffy(unsigned int rate)
3843 return rate / buffer_hz;
3846 /* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
3847 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
3848 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
3849 * stores NULL into it if it is absent.
3851 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
3854 * Returns 0 if successful, otherwise a positive errno value. */
3856 tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
3857 struct nlattr **options)
3859 static const struct nl_policy tca_policy[] = {
3860 [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
3861 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
3863 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3865 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3866 tca_policy, ta, ARRAY_SIZE(ta))) {
3867 VLOG_WARN_RL(&rl, "failed to parse qdisc message");
3872 *kind = nl_attr_get_string(ta[TCA_KIND]);
3876 *options = ta[TCA_OPTIONS];
3891 /* Given Netlink 'msg' that describes a class, extracts the queue ID (e.g. the
3892 * minor number of its class ID) into '*queue_id', its TCA_OPTIONS attribute
3893 * into '*options', and its queue statistics into '*stats'. Any of the output
3894 * arguments may be null.
3896 * Returns 0 if successful, otherwise a positive errno value. */
3898 tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
3899 struct nlattr **options, struct netdev_queue_stats *stats)
3901 static const struct nl_policy tca_policy[] = {
3902 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
3903 [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
3905 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3907 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3908 tca_policy, ta, ARRAY_SIZE(ta))) {
3909 VLOG_WARN_RL(&rl, "failed to parse class message");
3914 struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
3915 *handlep = tc->tcm_handle;
3919 *options = ta[TCA_OPTIONS];
3923 const struct gnet_stats_queue *gsq;
3924 struct gnet_stats_basic gsb;
3926 static const struct nl_policy stats_policy[] = {
3927 [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
3928 .min_len = sizeof gsb },
3929 [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
3930 .min_len = sizeof *gsq },
3932 struct nlattr *sa[ARRAY_SIZE(stats_policy)];
3934 if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
3935 sa, ARRAY_SIZE(sa))) {
3936 VLOG_WARN_RL(&rl, "failed to parse class stats");
3940 /* Alignment issues screw up the length of struct gnet_stats_basic on
3941 * some arch/bitsize combinations. Newer versions of Linux have a
3942 * struct gnet_stats_basic_packed, but we can't depend on that. The
3943 * easiest thing to do is just to make a copy. */
3944 memset(&gsb, 0, sizeof gsb);
3945 memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
3946 MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
3947 stats->tx_bytes = gsb.bytes;
3948 stats->tx_packets = gsb.packets;
3950 gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
3951 stats->tx_errors = gsq->drops;
3961 memset(stats, 0, sizeof *stats);
3966 /* Queries the kernel for class with identifier 'handle' and parent 'parent'
3969 tc_query_class(const struct netdev *netdev,
3970 unsigned int handle, unsigned int parent,
3971 struct ofpbuf **replyp)
3973 struct ofpbuf request;
3974 struct tcmsg *tcmsg;
3977 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO, &request);
3981 tcmsg->tcm_handle = handle;
3982 tcmsg->tcm_parent = parent;
3984 error = tc_transact(&request, replyp);
3986 VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
3987 netdev_get_name(netdev),
3988 tc_get_major(handle), tc_get_minor(handle),
3989 tc_get_major(parent), tc_get_minor(parent),
3990 ovs_strerror(error));
3995 /* Equivalent to "tc class del dev <name> handle <handle>". */
3997 tc_delete_class(const struct netdev *netdev, unsigned int handle)
3999 struct ofpbuf request;
4000 struct tcmsg *tcmsg;
4003 tcmsg = tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
4007 tcmsg->tcm_handle = handle;
4008 tcmsg->tcm_parent = 0;
4010 error = tc_transact(&request, NULL);
4012 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
4013 netdev_get_name(netdev),
4014 tc_get_major(handle), tc_get_minor(handle),
4015 ovs_strerror(error));
4020 /* Equivalent to "tc qdisc del dev <name> root". */
4022 tc_del_qdisc(struct netdev *netdev_)
4024 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4025 struct ofpbuf request;
4026 struct tcmsg *tcmsg;
4029 tcmsg = tc_make_request(netdev_, RTM_DELQDISC, 0, &request);
4033 tcmsg->tcm_handle = tc_make_handle(1, 0);
4034 tcmsg->tcm_parent = TC_H_ROOT;
4036 error = tc_transact(&request, NULL);
4037 if (error == EINVAL) {
4038 /* EINVAL probably means that the default qdisc was in use, in which
4039 * case we've accomplished our purpose. */
4042 if (!error && netdev->tc) {
4043 if (netdev->tc->ops->tc_destroy) {
4044 netdev->tc->ops->tc_destroy(netdev->tc);
4051 /* If 'netdev''s qdisc type and parameters are not yet known, queries the
4052 * kernel to determine what they are. Returns 0 if successful, otherwise a
4053 * positive errno value. */
4055 tc_query_qdisc(const struct netdev *netdev_)
4057 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4058 struct ofpbuf request, *qdisc;
4059 const struct tc_ops *ops;
4060 struct tcmsg *tcmsg;
4068 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
4069 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
4070 * 2.6.35 without that fix backported to it.
4072 * To avoid the OOPS, we must not make a request that would attempt to dump
4073 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
4074 * few others. There are a few ways that I can see to do this, but most of
4075 * them seem to be racy (and if you lose the race the kernel OOPSes). The
4076 * technique chosen here is to assume that any non-default qdisc that we
4077 * create will have a class with handle 1:0. The built-in qdiscs only have
4078 * a class with handle 0:0.
4080 * We could check for Linux 2.6.35+ and use a more straightforward method
4082 tcmsg = tc_make_request(netdev_, RTM_GETQDISC, NLM_F_ECHO, &request);
4086 tcmsg->tcm_handle = tc_make_handle(1, 0);
4087 tcmsg->tcm_parent = 0;
4089 /* Figure out what tc class to instantiate. */
4090 error = tc_transact(&request, &qdisc);
4094 error = tc_parse_qdisc(qdisc, &kind, NULL);
4096 ops = &tc_ops_other;
4098 ops = tc_lookup_linux_name(kind);
4100 static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
4101 VLOG_INFO_RL(&rl2, "unknown qdisc \"%s\"", kind);
4103 ops = &tc_ops_other;
4106 } else if (error == ENOENT) {
4107 /* Either it's a built-in qdisc, or it's a qdisc set up by some
4108 * other entity that doesn't have a handle 1:0. We will assume
4109 * that it's the system default qdisc. */
4110 ops = &tc_ops_default;
4113 /* Who knows? Maybe the device got deleted. */
4114 VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
4115 netdev_get_name(netdev_), ovs_strerror(error));
4116 ops = &tc_ops_other;
4119 /* Instantiate it. */
4120 load_error = ops->tc_load(CONST_CAST(struct netdev *, netdev_), qdisc);
4121 ovs_assert((load_error == 0) == (netdev->tc != NULL));
4122 ofpbuf_delete(qdisc);
4124 return error ? error : load_error;
4127 /* Linux traffic control uses tables with 256 entries ("rtab" tables) to
4128 approximate the time to transmit packets of various lengths. For an MTU of
4129 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
4130 represents two possible packet lengths; for a MTU of 513 through 1024, four
4131 possible lengths; and so on.
4133 Returns, for the specified 'mtu', the number of bits that packet lengths
4134 need to be shifted right to fit within such a 256-entry table. */
4136 tc_calc_cell_log(unsigned int mtu)
4141 mtu = ETH_PAYLOAD_MAX;
4143 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
4145 for (cell_log = 0; mtu >= 256; cell_log++) {
4152 /* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
4155 tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
4157 memset(rate, 0, sizeof *rate);
4158 rate->cell_log = tc_calc_cell_log(mtu);
4159 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
4160 /* rate->cell_align = 0; */ /* distro headers. */
4161 rate->mpu = ETH_TOTAL_MIN;
4165 /* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
4166 * attribute of the specified "type".
4168 * See tc_calc_cell_log() above for a description of "rtab"s. */
4170 tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
4175 rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
4176 for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
4177 unsigned packet_size = (i + 1) << rate->cell_log;
4178 if (packet_size < rate->mpu) {
4179 packet_size = rate->mpu;
4181 rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
/* Calculates the proper value of 'buffer' or 'cbuffer' in HTB options given a
 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
 * burst size of 'burst_bytes'.  (If no value was requested, a 'burst_bytes' of
 * 0 is fine.)
 *
 * The burst actually used is never smaller than one jiffy's worth of data
 * plus one MTU.  It is also clamped to UINT32_MAX, because
 * tc_bytes_to_ticks() takes the size as an unsigned int and a larger request
 * would otherwise be truncated to a much smaller burst. */
static int
tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
{
    unsigned int min_burst = tc_buffer_per_jiffy(Bps) + mtu;
    uint64_t burst = MAX(burst_bytes, min_burst);

    return tc_bytes_to_ticks(Bps, MIN(burst, UINT32_MAX));
}
4196 /* Linux-only functions declared in netdev-linux.h */
4198 /* Returns a fd for an AF_INET socket or a negative errno value. */
4200 netdev_linux_get_af_inet_sock(void)
4202 int error = netdev_linux_init();
4203 return error ? -error : af_inet_sock;
4206 /* Modifies the 'flag' bit in ethtool's flags field for 'netdev'. If
4207 * 'enable' is true, the bit is set. Otherwise, it is cleared. */
4209 netdev_linux_ethtool_set_flag(struct netdev *netdev, uint32_t flag,
4210 const char *flag_name, bool enable)
4212 const char *netdev_name = netdev_get_name(netdev);
4213 struct ethtool_value evalue;
4217 COVERAGE_INC(netdev_get_ethtool);
4218 memset(&evalue, 0, sizeof evalue);
4219 error = netdev_linux_do_ethtool(netdev_name,
4220 (struct ethtool_cmd *)&evalue,
4221 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
4226 COVERAGE_INC(netdev_set_ethtool);
4227 evalue.data = new_flags = (evalue.data & ~flag) | (enable ? flag : 0);
4228 error = netdev_linux_do_ethtool(netdev_name,
4229 (struct ethtool_cmd *)&evalue,
4230 ETHTOOL_SFLAGS, "ETHTOOL_SFLAGS");
4235 COVERAGE_INC(netdev_get_ethtool);
4236 memset(&evalue, 0, sizeof evalue);
4237 error = netdev_linux_do_ethtool(netdev_name,
4238 (struct ethtool_cmd *)&evalue,
4239 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
4244 if (new_flags != evalue.data) {
4245 VLOG_WARN_RL(&rl, "attempt to %s ethtool %s flag on network "
4246 "device %s failed", enable ? "enable" : "disable",
4247 flag_name, netdev_name);
4254 /* Utility functions. */
4256 /* Copies 'src' into 'dst', performing format conversion in the process. */
4258 netdev_stats_from_rtnl_link_stats(struct netdev_stats *dst,
4259 const struct rtnl_link_stats *src)
4261 dst->rx_packets = src->rx_packets;
4262 dst->tx_packets = src->tx_packets;
4263 dst->rx_bytes = src->rx_bytes;
4264 dst->tx_bytes = src->tx_bytes;
4265 dst->rx_errors = src->rx_errors;
4266 dst->tx_errors = src->tx_errors;
4267 dst->rx_dropped = src->rx_dropped;
4268 dst->tx_dropped = src->tx_dropped;
4269 dst->multicast = src->multicast;
4270 dst->collisions = src->collisions;
4271 dst->rx_length_errors = src->rx_length_errors;
4272 dst->rx_over_errors = src->rx_over_errors;
4273 dst->rx_crc_errors = src->rx_crc_errors;
4274 dst->rx_frame_errors = src->rx_frame_errors;
4275 dst->rx_fifo_errors = src->rx_fifo_errors;
4276 dst->rx_missed_errors = src->rx_missed_errors;
4277 dst->tx_aborted_errors = src->tx_aborted_errors;
4278 dst->tx_carrier_errors = src->tx_carrier_errors;
4279 dst->tx_fifo_errors = src->tx_fifo_errors;
4280 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
4281 dst->tx_window_errors = src->tx_window_errors;
4285 get_stats_via_netlink(int ifindex, struct netdev_stats *stats)
4287 /* Policy for RTNLGRP_LINK messages.
4289 * There are *many* more fields in these messages, but currently we only
4290 * care about these fields. */
4291 static const struct nl_policy rtnlgrp_link_policy[] = {
4292 [IFLA_IFNAME] = { .type = NL_A_STRING, .optional = false },
4293 [IFLA_STATS] = { .type = NL_A_UNSPEC, .optional = true,
4294 .min_len = sizeof(struct rtnl_link_stats) },
4297 struct ofpbuf request;
4298 struct ofpbuf *reply;
4299 struct ifinfomsg *ifi;
4300 struct nlattr *attrs[ARRAY_SIZE(rtnlgrp_link_policy)];
4303 ofpbuf_init(&request, 0);
4304 nl_msg_put_nlmsghdr(&request, sizeof *ifi, RTM_GETLINK, NLM_F_REQUEST);
4305 ifi = ofpbuf_put_zeros(&request, sizeof *ifi);
4306 ifi->ifi_family = PF_UNSPEC;
4307 ifi->ifi_index = ifindex;
4308 error = nl_transact(NETLINK_ROUTE, &request, &reply);
4309 ofpbuf_uninit(&request);
4314 if (!nl_policy_parse(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg),
4315 rtnlgrp_link_policy,
4316 attrs, ARRAY_SIZE(rtnlgrp_link_policy))) {
4317 ofpbuf_delete(reply);
4321 if (!attrs[IFLA_STATS]) {
4322 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
4323 ofpbuf_delete(reply);
4327 netdev_stats_from_rtnl_link_stats(stats, nl_attr_get(attrs[IFLA_STATS]));
4329 ofpbuf_delete(reply);
4335 get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats)
4337 static const char fn[] = "/proc/net/dev";
4342 stream = fopen(fn, "r");
4344 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, ovs_strerror(errno));
4349 while (fgets(line, sizeof line, stream)) {
4352 #define X64 "%"SCNu64
4355 X64 X64 X64 X64 X64 X64 X64 "%*u"
4356 X64 X64 X64 X64 X64 X64 X64 "%*u",
4362 &stats->rx_fifo_errors,
4363 &stats->rx_frame_errors,
4369 &stats->tx_fifo_errors,
4371 &stats->tx_carrier_errors) != 15) {
4372 VLOG_WARN_RL(&rl, "%s:%d: parse error", fn, ln);
4373 } else if (!strcmp(devname, netdev_name)) {
4374 stats->rx_length_errors = UINT64_MAX;
4375 stats->rx_over_errors = UINT64_MAX;
4376 stats->rx_crc_errors = UINT64_MAX;
4377 stats->rx_missed_errors = UINT64_MAX;
4378 stats->tx_aborted_errors = UINT64_MAX;
4379 stats->tx_heartbeat_errors = UINT64_MAX;
4380 stats->tx_window_errors = UINT64_MAX;
4386 VLOG_WARN_RL(&rl, "%s: no stats for %s", fn, netdev_name);
4392 get_flags(const struct netdev *dev, unsigned int *flags)
4398 error = netdev_linux_do_ioctl(dev->name, &ifr, SIOCGIFFLAGS,
4401 *flags = ifr.ifr_flags;
4407 set_flags(const char *name, unsigned int flags)
4411 ifr.ifr_flags = flags;
4412 return netdev_linux_do_ioctl(name, &ifr, SIOCSIFFLAGS, "SIOCSIFFLAGS");
4416 do_get_ifindex(const char *netdev_name)
4420 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4421 COVERAGE_INC(netdev_get_ifindex);
4422 if (ioctl(af_inet_sock, SIOCGIFINDEX, &ifr) < 0) {
4423 VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
4424 netdev_name, ovs_strerror(errno));
4427 return ifr.ifr_ifindex;
4431 get_ifindex(const struct netdev *netdev_, int *ifindexp)
4433 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4435 if (!(netdev->cache_valid & VALID_IFINDEX)) {
4436 int ifindex = do_get_ifindex(netdev_get_name(netdev_));
4439 netdev->get_ifindex_error = -ifindex;
4440 netdev->ifindex = 0;
4442 netdev->get_ifindex_error = 0;
4443 netdev->ifindex = ifindex;
4445 netdev->cache_valid |= VALID_IFINDEX;
4448 *ifindexp = netdev->ifindex;
4449 return netdev->get_ifindex_error;
4453 get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN])
4458 memset(&ifr, 0, sizeof ifr);
4459 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4460 COVERAGE_INC(netdev_get_hwaddr);
4461 if (ioctl(af_inet_sock, SIOCGIFHWADDR, &ifr) < 0) {
4462 /* ENODEV probably means that a vif disappeared asynchronously and
4463 * hasn't been removed from the database yet, so reduce the log level
4464 * to INFO for that case. */
4465 VLOG(errno == ENODEV ? VLL_INFO : VLL_ERR,
4466 "ioctl(SIOCGIFHWADDR) on %s device failed: %s",
4467 netdev_name, ovs_strerror(errno));
4470 hwaddr_family = ifr.ifr_hwaddr.sa_family;
4471 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER) {
4472 VLOG_WARN("%s device has unknown hardware address family %d",
4473 netdev_name, hwaddr_family);
4475 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
4480 set_etheraddr(const char *netdev_name,
4481 const uint8_t mac[ETH_ADDR_LEN])
4485 memset(&ifr, 0, sizeof ifr);
4486 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4487 ifr.ifr_hwaddr.sa_family = ARPHRD_ETHER;
4488 memcpy(ifr.ifr_hwaddr.sa_data, mac, ETH_ADDR_LEN);
4489 COVERAGE_INC(netdev_set_hwaddr);
4490 if (ioctl(af_inet_sock, SIOCSIFHWADDR, &ifr) < 0) {
4491 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
4492 netdev_name, ovs_strerror(errno));
4499 netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
4500 int cmd, const char *cmd_name)
4504 memset(&ifr, 0, sizeof ifr);
4505 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
4506 ifr.ifr_data = (caddr_t) ecmd;
4509 if (ioctl(af_inet_sock, SIOCETHTOOL, &ifr) == 0) {
4512 if (errno != EOPNOTSUPP) {
4513 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
4514 "failed: %s", cmd_name, name, ovs_strerror(errno));
4516 /* The device doesn't support this operation. That's pretty
4517 * common, so there's no point in logging anything. */
4524 netdev_linux_do_ioctl(const char *name, struct ifreq *ifr, int cmd,
4525 const char *cmd_name)
4527 ovs_strzcpy(ifr->ifr_name, name, sizeof ifr->ifr_name);
4528 if (ioctl(af_inet_sock, cmd, ifr) == -1) {
4529 VLOG_DBG_RL(&rl, "%s: ioctl(%s) failed: %s", name, cmd_name,
4530 ovs_strerror(errno));
4537 netdev_linux_get_ipv4(const struct netdev *netdev, struct in_addr *ip,
4538 int cmd, const char *cmd_name)
4543 ifr.ifr_addr.sa_family = AF_INET;
4544 error = netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, cmd, cmd_name);
4546 const struct sockaddr_in *sin = ALIGNED_CAST(struct sockaddr_in *,
4548 *ip = sin->sin_addr;
4553 /* Returns an AF_PACKET raw socket or a negative errno value. */
4555 af_packet_sock(void)
4557 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
4560 if (ovsthread_once_start(&once)) {
4561 sock = socket(AF_PACKET, SOCK_RAW, 0);
4563 int error = set_nonblocking(sock);
4570 VLOG_ERR("failed to create packet socket: %s",
4571 ovs_strerror(errno));
4573 ovsthread_once_done(&once);