2 * Copyright (c) 2009, 2010, 2011, 2012, 2013 Nicira, Inc.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
19 #include "netdev-linux.h"
23 #include <arpa/inet.h>
25 #include <linux/filter.h>
26 #include <linux/gen_stats.h>
27 #include <linux/if_ether.h>
28 #include <linux/if_tun.h>
29 #include <linux/types.h>
30 #include <linux/ethtool.h>
31 #include <linux/mii.h>
32 #include <linux/pkt_cls.h>
33 #include <linux/pkt_sched.h>
34 #include <linux/rtnetlink.h>
35 #include <linux/sockios.h>
36 #include <linux/version.h>
37 #include <sys/types.h>
38 #include <sys/ioctl.h>
39 #include <sys/socket.h>
40 #include <netpacket/packet.h>
42 #include <net/if_arp.h>
43 #include <net/if_packet.h>
44 #include <net/route.h>
45 #include <netinet/in.h>
52 #include "dpif-linux.h"
53 #include "dynamic-string.h"
54 #include "fatal-signal.h"
57 #include "netdev-provider.h"
58 #include "netdev-vport.h"
59 #include "netlink-notifier.h"
60 #include "netlink-socket.h"
63 #include "openflow/openflow.h"
65 #include "poll-loop.h"
66 #include "rtnetlink-link.h"
68 #include "socket-util.h"
71 #include "unaligned.h"
74 VLOG_DEFINE_THIS_MODULE(netdev_linux);
76 COVERAGE_DEFINE(netdev_set_policing);
77 COVERAGE_DEFINE(netdev_arp_lookup);
78 COVERAGE_DEFINE(netdev_get_ifindex);
79 COVERAGE_DEFINE(netdev_get_hwaddr);
80 COVERAGE_DEFINE(netdev_set_hwaddr);
81 COVERAGE_DEFINE(netdev_get_ethtool);
82 COVERAGE_DEFINE(netdev_set_ethtool);
85 /* These were introduced in Linux 2.6.14, so they might be missing if we have old headers. */
87 #ifndef ADVERTISED_Pause
88 #define ADVERTISED_Pause (1 << 13)
90 #ifndef ADVERTISED_Asym_Pause
91 #define ADVERTISED_Asym_Pause (1 << 14)
94 /* These were introduced in Linux 2.6.24, so they might be missing if we
95 * have old headers. */
96 #ifndef ETHTOOL_GFLAGS
97 #define ETHTOOL_GFLAGS 0x00000025 /* Get flags bitmap(ethtool_value) */
99 #ifndef ETHTOOL_SFLAGS
100 #define ETHTOOL_SFLAGS 0x00000026 /* Set flags bitmap(ethtool_value) */
103 /* This was introduced in Linux 2.6.25, so it might be missing if we have old headers. */
106 #define TC_RTAB_SIZE 1024
109 static struct nln_notifier *netdev_linux_cache_notifier = NULL;
110 static int cache_notifier_refcount;
113 VALID_IFINDEX = 1 << 0,
114 VALID_ETHERADDR = 1 << 1,
118 VALID_POLICING = 1 << 5,
119 VALID_VPORT_STAT_ERROR = 1 << 6,
120 VALID_DRVINFO = 1 << 7,
121 VALID_FEATURES = 1 << 8,
124 /* Traffic control. */
126 /* An instance of a traffic control class. Always associated with a particular
129 * Each TC implementation subclasses this with whatever additional data it
132 const struct tc_ops *ops;
133 struct hmap queues; /* Contains "struct tc_queue"s.
134 * Read by generic TC layer.
135 * Written only by TC implementation. */
138 #define TC_INITIALIZER(TC, OPS) { OPS, HMAP_INITIALIZER(&(TC)->queues) }
140 /* One traffic control queue.
142 * Each TC implementation subclasses this with whatever additional data it
145 struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
146 unsigned int queue_id; /* OpenFlow queue ID. */
147 long long int created; /* Time queue was created, in msecs. */
150 /* A particular kind of traffic control. Each implementation generally maps to
151 * one particular Linux qdisc class.
153 * The functions below return 0 if successful or a positive errno value on
154 * failure, except where otherwise noted. All of them must be provided, except
155 * where otherwise noted. */
157 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
158 * This is null for tc_ops_default and tc_ops_other, for which there are no
159 * appropriate values. */
160 const char *linux_name;
162 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
163 const char *ovs_name;
165 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
166 * queues. The queues are numbered 0 through n_queues - 1. */
167 unsigned int n_queues;
169 /* Called to install this TC class on 'netdev'. The implementation should
170 * make the Netlink calls required to set up 'netdev' with the right qdisc
171 * and configure it according to 'details'. The implementation may assume
172 * that the current qdisc is the default; that is, there is no need for it
173 * to delete the current qdisc before installing itself.
175 * The contents of 'details' should be documented as valid for 'ovs_name'
176 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
177 * (which is built as ovs-vswitchd.conf.db(8)).
179 * This function must return 0 if and only if it sets 'netdev->tc' to an
180 * initialized 'struct tc'.
182 * (This function is null for tc_ops_other, which cannot be installed. For
183 * other TC classes it should always be nonnull.) */
184 int (*tc_install)(struct netdev *netdev, const struct smap *details);
186 /* Called when the netdev code determines (through a Netlink query) that
187 * this TC class's qdisc is installed on 'netdev', but we didn't install
188 * it ourselves and so don't know any of the details.
190 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
191 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
192 * implementation should parse the other attributes of 'nlmsg' as
193 * necessary to determine its configuration. If necessary it should also
194 * use Netlink queries to determine the configuration of queues on
197 * This function must return 0 if and only if it sets 'netdev->tc' to an
198 * initialized 'struct tc'. */
199 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
201 /* Destroys the data structures allocated by the implementation as part of
202 * 'tc'. (This includes destroying 'tc->queues' by calling
205 * The implementation should not need to perform any Netlink calls. If
206 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
207 * (But it may not be desirable.)
209 * This function may be null if 'tc' is trivial. */
210 void (*tc_destroy)(struct tc *tc);
212 /* Retrieves details of 'netdev->tc' configuration into 'details'.
214 * The implementation should not need to perform any Netlink calls, because
215 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
216 * cached the configuration.
218 * The contents of 'details' should be documented as valid for 'ovs_name'
219 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
220 * (which is built as ovs-vswitchd.conf.db(8)).
222 * This function may be null if 'tc' is not configurable. */
224 int (*qdisc_get)(const struct netdev *netdev, struct smap *details);
226 /* Reconfigures 'netdev->tc' according to 'details', performing any
227 * required Netlink calls to complete the reconfiguration.
229 * The contents of 'details' should be documented as valid for 'ovs_name'
230 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
231 * (which is built as ovs-vswitchd.conf.db(8)).
233 * This function may be null if 'tc' is not configurable. */
235 int (*qdisc_set)(struct netdev *, const struct smap *details);
237 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
238 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
240 * The contents of 'details' should be documented as valid for 'ovs_name'
241 * in the "other_config" column in the "Queue" table in
242 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
244 * The implementation should not need to perform any Netlink calls, because
245 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
246 * cached the queue configuration.
248 * This function may be null if 'tc' does not have queues ('n_queues' is 0). */
250 int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
251 struct smap *details);
253 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
254 * 'details', performing any required Netlink calls to complete the
255 * reconfiguration. The caller ensures that 'queue_id' is less than 'n_queues'.
258 * The contents of 'details' should be documented as valid for 'ovs_name'
259 * in the "other_config" column in the "Queue" table in
260 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
262 * This function may be null if 'tc' does not have queues or its queues are
263 * not configurable. */
264 int (*class_set)(struct netdev *, unsigned int queue_id,
265 const struct smap *details);
267 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
268 * tc_queue's within 'netdev->tc->queues'.
270 * This function may be null if 'tc' does not have queues or its queues
271 * cannot be deleted. */
272 int (*class_delete)(struct netdev *, struct tc_queue *queue);
274 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
275 * 'struct tc_queue's within 'netdev->tc->queues'.
277 * On success, initializes '*stats'.
279 * This function may be null if 'tc' does not have queues or if it cannot
280 * report queue statistics. */
281 int (*class_get_stats)(const struct netdev *netdev,
282 const struct tc_queue *queue,
283 struct netdev_queue_stats *stats);
285 /* Extracts queue stats from 'nlmsg', which is a response to a
286 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
288 * This function may be null if 'tc' does not have queues or if it cannot
289 * report queue statistics. */
290 int (*class_dump_stats)(const struct netdev *netdev,
291 const struct ofpbuf *nlmsg,
292 netdev_dump_queue_stats_cb *cb, void *aux);
/* Initializer for a 'struct tc' that belongs to the traffic-control
 * implementation 'ops'.  The step shown here is hmap_init() on 'tc->queues'. */
296 tc_init(struct tc *tc, const struct tc_ops *ops)
299 hmap_init(&tc->queues);
/* Generic teardown for 'tc': hands 'tc->queues' to hmap_destroy(). */
303 tc_destroy(struct tc *tc)
305 hmap_destroy(&tc->queues);
308 static const struct tc_ops tc_ops_htb;
309 static const struct tc_ops tc_ops_hfsc;
310 static const struct tc_ops tc_ops_default;
311 static const struct tc_ops tc_ops_other;
313 static const struct tc_ops *const tcs[] = {
314 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
315 &tc_ops_hfsc, /* Hierarchical fair service curve. */
316 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
317 &tc_ops_other, /* Some other qdisc. */
321 static unsigned int tc_make_handle(unsigned int major, unsigned int minor);
322 static unsigned int tc_get_major(unsigned int handle);
323 static unsigned int tc_get_minor(unsigned int handle);
325 static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
326 static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
327 static unsigned int tc_buffer_per_jiffy(unsigned int rate);
329 static struct tcmsg *tc_make_request(const struct netdev *, int type,
330 unsigned int flags, struct ofpbuf *);
331 static int tc_transact(struct ofpbuf *request, struct ofpbuf **replyp);
332 static int tc_add_del_ingress_qdisc(struct netdev *netdev, bool add);
333 static int tc_add_policer(struct netdev *netdev, int kbits_rate,
336 static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
337 struct nlattr **options);
338 static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
339 struct nlattr **options,
340 struct netdev_queue_stats *);
341 static int tc_query_class(const struct netdev *,
342 unsigned int handle, unsigned int parent,
343 struct ofpbuf **replyp);
344 static int tc_delete_class(const struct netdev *, unsigned int handle);
346 static int tc_del_qdisc(struct netdev *netdev);
347 static int tc_query_qdisc(const struct netdev *netdev);
349 static int tc_calc_cell_log(unsigned int mtu);
350 static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
351 static void tc_put_rtab(struct ofpbuf *, uint16_t type,
352 const struct tc_ratespec *rate);
353 static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
355 struct netdev_linux {
358 unsigned int cache_valid;
359 unsigned int change_seq;
361 bool miimon; /* Link status of last poll. */
362 long long int miimon_interval; /* Miimon Poll rate. Disabled if <= 0. */
363 struct timer miimon_timer;
365 /* The following are figured out "on demand" only. They are only valid
366 * when the corresponding VALID_* bit in 'cache_valid' is set. */
368 uint8_t etheraddr[ETH_ADDR_LEN];
369 struct in_addr address, netmask;
372 unsigned int ifi_flags;
373 long long int carrier_resets;
374 uint32_t kbits_rate; /* Policing data. */
375 uint32_t kbits_burst;
376 int vport_stats_error; /* Cached error code from vport_get_stats().
377 0 or an errno value. */
378 int netdev_mtu_error; /* Cached error code from SIOCGIFMTU or SIOCSIFMTU. */
379 int ether_addr_error; /* Cached error code from set/get etheraddr. */
380 int netdev_policing_error; /* Cached error code from set policing. */
381 int get_features_error; /* Cached error code from ETHTOOL_GSET. */
382 int get_ifindex_error; /* Cached error code from SIOCGIFINDEX. */
384 enum netdev_features current; /* Cached from ETHTOOL_GSET. */
385 enum netdev_features advertised; /* Cached from ETHTOOL_GSET. */
386 enum netdev_features supported; /* Cached from ETHTOOL_GSET. */
388 struct ethtool_drvinfo drvinfo; /* Cached from ETHTOOL_GDRVINFO. */
391 /* For devices of class netdev_tap_class only. */
395 struct netdev_rx_linux {
401 static const struct netdev_rx_class netdev_rx_linux_class;
403 /* This is set pretty low because we probably won't learn anything from the
404 * additional log messages. */
405 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
407 static void netdev_linux_run(void);
409 static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
410 int cmd, const char *cmd_name);
411 static int netdev_linux_get_ipv4(const struct netdev *, struct in_addr *,
412 int cmd, const char *cmd_name);
413 static int get_flags(const struct netdev *, unsigned int *flags);
414 static int set_flags(const char *, unsigned int flags);
415 static int do_get_ifindex(const char *netdev_name);
416 static int get_ifindex(const struct netdev *, int *ifindexp);
417 static int do_set_addr(struct netdev *netdev,
418 int ioctl_nr, const char *ioctl_name,
419 struct in_addr addr);
420 static int get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN]);
421 static int set_etheraddr(const char *netdev_name, const uint8_t[ETH_ADDR_LEN]);
422 static int get_stats_via_netlink(int ifindex, struct netdev_stats *stats);
423 static int get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats);
424 static int af_packet_sock(void);
425 static void netdev_linux_miimon_run(void);
426 static void netdev_linux_miimon_wait(void);
/* Returns whether 'netdev_class' has netdev_linux_run() as its 'run' callback.
 * netdev_linux_cast() asserts this before converting a generic netdev. */
429 is_netdev_linux_class(const struct netdev_class *netdev_class)
431 return netdev_class->run == netdev_linux_run;
/* Returns whether the class of 'netdev' is netdev_tap_class. */
435 is_tap_netdev(const struct netdev *netdev)
437 return netdev_get_class(netdev) == &netdev_tap_class;
/* Converts generic 'netdev' into the 'struct netdev_linux' that embeds it as
 * member 'up'.  Asserts that the class of 'netdev' passes
 * is_netdev_linux_class() before doing the conversion. */
440 static struct netdev_linux *
441 netdev_linux_cast(const struct netdev *netdev)
443 ovs_assert(is_netdev_linux_class(netdev_get_class(netdev)));
445 return CONTAINER_OF(netdev, struct netdev_linux, up);
/* Converts generic 'rx' into the 'struct netdev_rx_linux' that embeds it as
 * member 'up', after netdev_rx_assert_class() has checked it against
 * netdev_rx_linux_class. */
448 static struct netdev_rx_linux *
449 netdev_rx_linux_cast(const struct netdev_rx *rx)
451 netdev_rx_assert_class(rx, &netdev_rx_linux_class);
452 return CONTAINER_OF(rx, struct netdev_rx_linux, up);
/* Periodic processing: runs the rtnetlink link notifier, then the MII monitor
 * poll.  This function is also what is_netdev_linux_class() compares a
 * class's 'run' callback against. */
456 netdev_linux_run(void)
458 rtnetlink_link_run();
459 netdev_linux_miimon_run();
/* Counterpart of netdev_linux_run(): calls the wait functions of the rtnetlink
 * link notifier and of the MII monitor. */
463 netdev_linux_wait(void)
465 rtnetlink_link_wait();
466 netdev_linux_miimon_wait();
/* Records a change to 'dev'.
 *
 * 'ifi_flags' is the device's new interface flag word; it replaces the stored
 * copy.  'mask' is the set of VALID_* bits in 'dev->cache_valid' that survive
 * the change: every bit not in 'mask' is cleared. */
470 netdev_linux_changed(struct netdev_linux *dev,
471 unsigned int ifi_flags, unsigned int mask)
/* NOTE(review): the statements around this zero test of 'change_seq' are not
 * shown here; presumably the counter is bumped and 0 is skipped -- confirm. */
474 if (!dev->change_seq) {
/* A toggle of IFF_RUNNING between the stored and the new flags counts as one
 * carrier reset. */
478 if ((dev->ifi_flags ^ ifi_flags) & IFF_RUNNING) {
479 dev->carrier_resets++;
481 dev->ifi_flags = ifi_flags;
483 dev->cache_valid &= mask;
/* Applies the rtnetlink link notification 'change' to 'dev'.
 *
 * For RTM_NEWLINK the cache is reduced to VALID_DRVINFO and then refilled from
 * the message: MTU, Ethernet address (only when nonzero) and ifindex are
 * stored, their VALID_* bits set and their cached error codes reset to 0. */
487 netdev_linux_update(struct netdev_linux *dev,
488 const struct rtnetlink_link_change *change)
490 if (change->nlmsg_type == RTM_NEWLINK) {
/* Keep only the cached driver info; everything else is invalidated. */
492 netdev_linux_changed(dev, change->ifi_flags, VALID_DRVINFO);
494 /* Update netdev from rtnl-change msg. */
496 dev->mtu = change->mtu;
497 dev->cache_valid |= VALID_MTU;
498 dev->netdev_mtu_error = 0;
/* An all-zero address in the message is not cached. */
501 if (!eth_addr_is_zero(change->addr)) {
502 memcpy(dev->etheraddr, change->addr, ETH_ADDR_LEN);
503 dev->cache_valid |= VALID_ETHERADDR;
504 dev->ether_addr_error = 0;
507 dev->ifindex = change->ifi_index;
508 dev->cache_valid |= VALID_IFINDEX;
509 dev->get_ifindex_error = 0;
/* Mask 0 clears every VALID_* bit.  NOTE(review): presumably this is the
 * branch for message types other than RTM_NEWLINK -- confirm. */
512 netdev_linux_changed(dev, change->ifi_flags, 0);
/* Callback that cache_notifier_ref() registers with
 * rtnetlink_link_notifier_create().  'aux' is unused.
 *
 * Two code paths are visible: one updates the single device named by
 * 'change->ifname'; the other walks every device of netdev_linux_class,
 * re-reads its flags and invalidates its whole cache.
 * NOTE(review): the condition choosing between the two paths is not shown
 * here; presumably the second runs when 'change' is null -- confirm. */
517 netdev_linux_cache_cb(const struct rtnetlink_link_change *change,
518 void *aux OVS_UNUSED)
521 struct netdev *base_dev = netdev_from_name(change->ifname);
/* Names that do not resolve to a netdev of this file's classes are skipped
 * by this test. */
522 if (base_dev && is_netdev_linux_class(netdev_get_class(base_dev))) {
523 netdev_linux_update(netdev_linux_cast(base_dev), change);
524 netdev_close(base_dev);
527 struct shash device_shash;
528 struct shash_node *node;
530 shash_init(&device_shash);
531 netdev_get_devices(&netdev_linux_class, &device_shash);
532 SHASH_FOR_EACH (node, &device_shash) {
533 struct netdev *netdev = node->data;
534 struct netdev_linux *dev = netdev_linux_cast(netdev);
/* The return value of get_flags() is not checked here. */
537 get_flags(&dev->up, &flags);
538 netdev_linux_changed(dev, flags, 0);
/* Each netdev obtained from netdev_get_devices() is closed again. */
539 netdev_close(netdev);
541 shash_destroy(&device_shash);
/* Takes a reference on the shared rtnetlink link notifier.  When the reference
 * count is zero the notifier is created first, with netdev_linux_cache_cb() as
 * its callback; the count is then incremented.
 * NOTE(review): the handling of a failed rtnetlink_link_notifier_create() and
 * the return statements are on lines not shown here. */
546 cache_notifier_ref(void)
548 if (!cache_notifier_refcount) {
/* With no references outstanding there must be no notifier either. */
549 ovs_assert(!netdev_linux_cache_notifier);
551 netdev_linux_cache_notifier =
552 rtnetlink_link_notifier_create(netdev_linux_cache_cb, NULL);
554 if (!netdev_linux_cache_notifier) {
558 cache_notifier_refcount++;
/* Drops one reference on the shared rtnetlink link notifier.  Asserts that at
 * least one reference is held.  When the count reaches zero the notifier is
 * destroyed and the global pointer is reset to NULL. */
564 cache_notifier_unref(void)
566 ovs_assert(cache_notifier_refcount > 0);
567 if (!--cache_notifier_refcount) {
568 ovs_assert(netdev_linux_cache_notifier);
569 rtnetlink_link_notifier_destroy(netdev_linux_cache_notifier);
570 netdev_linux_cache_notifier = NULL;
574 /* Creates system and internal devices. */
/* Takes a reference on the shared rtnetlink cache notifier, allocates a zeroed
 * 'struct netdev_linux' for 'name' with class 'class', and probes the
 * interface flags with get_flags().  On the path shown last, the new netdev is
 * stored in '*netdevp'. */
576 netdev_linux_create(const struct netdev_class *class, const char *name,
577 struct netdev **netdevp)
579 struct netdev_linux *netdev;
582 error = cache_notifier_ref();
587 netdev = xzalloc(sizeof *netdev);
/* 'change_seq' starts at 1; netdev_linux_changed() tests it against 0. */
588 netdev->change_seq = 1;
589 netdev_init(&netdev->up, name, class);
590 error = get_flags(&netdev->up, &netdev->ifi_flags);
/* ENODEV is fatal for every class except netdev_internal_class; on that
 * failure the netdev is uninitialized and the notifier reference dropped. */
591 if (error == ENODEV) {
592 if (class != &netdev_internal_class) {
593 /* The device does not exist, so don't allow it to be opened. */
594 netdev_uninit(&netdev->up, false);
595 cache_notifier_unref();
599 /* "Internal" netdevs have to be created as netdev objects before
600 * they exist in the kernel, because creating them in the kernel
601 * happens by passing a netdev object to dpif_port_add().
602 * Therefore, ignore the error. */
606 *netdevp = &netdev->up;
610 /* For most types of netdevs we open the device for each call of
611 * netdev_open(). However, this is not the case with tap devices,
612 * since it is only possible to open the device once. In this
613 * situation we share a single file descriptor, and consequently
614 * buffers, across all readers. Therefore once data is read it will
615 * be unavailable to other reads for tap devices. */
/* Creates the tap device 'name': opens "/dev/net/tun", issues TUNSETIFF with
 * IFF_TAP | IFF_NO_PI, makes the descriptor non-blocking and initializes the
 * netdev with netdev_tap_class (the 'class' argument is unused).  On success
 * the new netdev is stored in '*netdevp'. */
617 netdev_linux_create_tap(const struct netdev_class *class OVS_UNUSED,
618 const char *name, struct netdev **netdevp)
620 struct netdev_linux *netdev;
621 static const char tap_dev[] = "/dev/net/tun";
625 netdev = xzalloc(sizeof *netdev);
626 netdev->change_seq = 1;
/* The notifier reference taken here is released again on the error paths
 * below and otherwise by netdev_linux_destroy(). */
628 error = cache_notifier_ref();
633 /* Open tap device. */
634 netdev->tap_fd = open(tap_dev, O_RDWR);
635 if (netdev->tap_fd < 0) {
637 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, ovs_strerror(error));
638 goto error_unref_notifier;
641 /* Create tap device. */
642 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
/* ovs_strzcpy() is given the size of 'ifr_name' as its bound. */
643 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
644 if (ioctl(netdev->tap_fd, TUNSETIFF, &ifr) == -1) {
645 VLOG_WARN("%s: creating tap device failed: %s", name,
646 ovs_strerror(errno));
651 /* Make non-blocking. */
652 error = set_nonblocking(netdev->tap_fd);
657 netdev_init(&netdev->up, name, &netdev_tap_class);
658 *netdevp = &netdev->up;
/* Error unwinding: close the tap descriptor, then drop the notifier
 * reference. */
662 close(netdev->tap_fd);
663 error_unref_notifier:
664 cache_notifier_unref();
/* Tears down 'netdev_': destroys its traffic-control state, closes the tap
 * descriptor for tap devices, and releases the rtnetlink notifier reference
 * taken when the device was created. */
671 netdev_linux_destroy(struct netdev *netdev_)
673 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
/* 'tc_destroy' is optional in 'struct tc_ops', hence the null check. */
675 if (netdev->tc && netdev->tc->ops->tc_destroy) {
676 netdev->tc->ops->tc_destroy(netdev->tc);
/* Only tap devices own a descriptor in 'tap_fd'. */
679 if (netdev_get_class(netdev_) == &netdev_tap_class
680 && netdev->tap_fd >= 0)
682 close(netdev->tap_fd);
686 cache_notifier_unref();
/* Opens a receive handle on 'netdev_'.
 *
 * The visible steps create a non-blocking PF_PACKET/SOCK_RAW socket, bind it
 * to the device's ifindex for ETH_P_ALL, and attach a socket filter so that
 * only inbound packets are delivered.  Finally a 'struct netdev_rx_linux' is
 * allocated and initialized with netdev_rx_linux_class.
 * NOTE(review): how 'is_tap' changes this sequence, the error unwinding and
 * the store into '*rxp' are on lines not shown here. */
690 netdev_linux_rx_open(struct netdev *netdev_, struct netdev_rx **rxp)
692 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
693 bool is_tap = is_tap_netdev(netdev_);
694 struct netdev_rx_linux *rx;
701 struct sockaddr_ll sll;
/* NOTE(review): 0xfffff004 looks like SKF_AD_OFF + SKF_AD_PKTTYPE (load the
 * packet type) and 4 like PACKET_OUTGOING, so outgoing packets get "ret #0"
 * (dropped) -- confirm against <linux/filter.h> and <linux/if_packet.h>. */
703 /* Result of tcpdump -dd inbound */
704 static const struct sock_filter filt[] = {
705 { 0x28, 0, 0, 0xfffff004 }, /* ldh [0] */
706 { 0x15, 0, 1, 0x00000004 }, /* jeq #4 jt 2 jf 3 */
707 { 0x6, 0, 0, 0x00000000 }, /* ret #0 */
708 { 0x6, 0, 0, 0x0000ffff } /* ret #65535 */
/* The cast drops 'const' from 'filt' for the sock_fprog member. */
710 static const struct sock_fprog fprog = {
711 ARRAY_SIZE(filt), (struct sock_filter *) filt
714 /* Create file descriptor. */
715 fd = socket(PF_PACKET, SOCK_RAW, 0);
718 VLOG_ERR("failed to create raw socket (%s)", ovs_strerror(error));
722 /* Set non-blocking mode. */
723 error = set_nonblocking(fd);
728 /* Get ethernet device index. */
729 error = get_ifindex(&netdev->up, &ifindex);
734 /* Bind to specific ethernet device. */
735 memset(&sll, 0, sizeof sll);
736 sll.sll_family = AF_PACKET;
737 sll.sll_ifindex = ifindex;
738 sll.sll_protocol = (OVS_FORCE unsigned short int) htons(ETH_P_ALL);
739 if (bind(fd, (struct sockaddr *) &sll, sizeof sll) < 0) {
741 VLOG_ERR("%s: failed to bind raw socket (%s)",
742 netdev_get_name(netdev_), ovs_strerror(error));
746 /* Filter for only inbound packets. */
747 error = setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER, &fprog,
751 VLOG_ERR("%s: failed to attach filter (%s)",
752 netdev_get_name(netdev_), ovs_strerror(error));
757 rx = xmalloc(sizeof *rx);
758 netdev_rx_init(&rx->up, netdev_, &netdev_rx_linux_class);
/* Destructor for the receive handle 'rx_'.
 * NOTE(review): only the downcast of 'rx_' is visible; the statements that
 * release the handle's resources are on lines not shown here. */
773 netdev_rx_linux_destroy(struct netdev_rx *rx_)
775 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
/* Receives one packet from 'rx_' into the 'size'-byte buffer at 'data'.
 *
 * The packet is read with read() or with recv(..., MSG_TRUNC); the call is
 * retried for as long as it fails with EINTR.  With MSG_TRUNC, recv() reports
 * the real length of the packet even when it did not fit, which is what lets
 * the length check below detect truncation.
 *
 * Returns the number of bytes received, or -EMSGSIZE if the reported length
 * exceeds 'size'.  Failures other than EAGAIN are logged (rate-limited) with
 * the device name followed by the error text. */
784 netdev_rx_linux_recv(struct netdev_rx *rx_, void *data, size_t size)
786 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
791 ? read(rx->fd, data, size)
792 : recv(rx->fd, data, size, MSG_TRUNC));
793 } while (retval < 0 && errno == EINTR);
/* A length larger than the caller's buffer means the packet was truncated. */
796 return retval > size ? -EMSGSIZE : retval;
/* EAGAIN only means that no packet is waiting, so it is not logged. */
798 if (errno != EAGAIN) {
/* Argument order follows the format string: device name first, then the
 * error text (the same order netdev_linux_send() uses). */
799 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
800 netdev_rx_get_name(rx_), ovs_strerror(errno));
/* Registers the descriptor of 'rx_' with the poll loop for POLLIN, so that
 * the next poll wakes up when the descriptor becomes readable. */
807 netdev_rx_linux_wait(struct netdev_rx *rx_)
809 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
810 poll_fd_wait(rx->fd, POLLIN);
/* Discards the packets currently queued for 'rx_'.
 *
 * Two strategies are visible: drain_fd() bounded by the device's transmit
 * queue length (obtained with SIOCGIFTXQLEN), and drain_rcvbuf() on the
 * descriptor.
 * NOTE(review): the condition selecting between them is not shown here;
 * presumably tap versus non-tap handles -- confirm. */
814 netdev_rx_linux_drain(struct netdev_rx *rx_)
816 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
819 int error = af_inet_ifreq_ioctl(netdev_rx_get_name(rx_), &ifr,
820 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
824 drain_fd(rx->fd, ifr.ifr_qlen);
827 return drain_rcvbuf(rx->fd);
831 /* Sends the 'size' bytes at 'data' on 'netdev_'. Returns 0 if successful, otherwise a positive
832 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
833 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
834 * the packet is too big or too small to transmit on the device.
836 * The caller retains ownership of 'data' in all cases.
838 * The kernel maintains a packet transmission queue, so the caller is not
839 * expected to do additional queuing of packets. */
841 netdev_linux_send(struct netdev *netdev_, const void *data, size_t size)
/* Non-tap devices are sent to through the AF_PACKET socket returned by
 * af_packet_sock(); tap devices use write() on their own descriptor. */
846 if (!is_tap_netdev(netdev_)) {
847 /* Use our AF_PACKET socket to send to this device. */
848 struct sockaddr_ll sll;
855 sock = af_packet_sock();
860 error = get_ifindex(netdev_, &ifindex);
865 /* We don't bother setting most fields in sockaddr_ll because the
866 * kernel ignores them for SOCK_RAW. */
867 memset(&sll, 0, sizeof sll);
868 sll.sll_family = AF_PACKET;
869 sll.sll_ifindex = ifindex;
/* 'data' is const for the caller; CONST_CAST removes the qualifier only
 * because 'iov_base' is declared as a plain 'void *'. */
871 iov.iov_base = CONST_CAST(void *, data);
875 msg.msg_namelen = sizeof sll;
878 msg.msg_control = NULL;
879 msg.msg_controllen = 0;
882 retval = sendmsg(sock, &msg, 0);
884 /* Use the tap fd to send to this device. This is essential for
885 * tap devices, because packets sent to a tap device with an
886 * AF_PACKET socket will loop back to be *received* again on the
887 * tap device. This doesn't occur on other interface types
888 * because we attach a socket filter to the rx socket. */
889 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
891 retval = write(netdev->tap_fd, data, size);
895 /* The Linux AF_PACKET implementation never blocks waiting for room
896 * for packets, instead returning ENOBUFS. Translate this into
897 * EAGAIN for the caller. */
898 if (errno == ENOBUFS) {
900 } else if (errno == EINTR) {
/* EAGAIN is an expected outcome and is therefore not logged. */
902 } else if (errno != EAGAIN) {
903 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
904 netdev_get_name(netdev_), ovs_strerror(errno));
/* A short write or send is reported as a partial packet. */
907 } else if (retval != size) {
908 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%zd bytes of "
909 "%zu) on %s", retval, size, netdev_get_name(netdev_));
917 /* Registers with the poll loop to wake up from the next call to poll_block()
918 * when the packet transmission queue has sufficient room to transmit a packet
919 * with netdev_send().
921 * The kernel maintains a packet transmission queue, so the client is not
922 * expected to do additional queuing of packets. Thus, this function is
923 * unlikely to ever be used. It is included for completeness. */
/* The only action visible here is for tap devices, which request an immediate
 * wakeup. */
925 netdev_linux_send_wait(struct netdev *netdev)
927 if (is_tap_netdev(netdev)) {
928 /* TAP device always accepts packets. */
929 poll_immediate_wake();
933 /* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
934 * otherwise a positive errno value. */
936 netdev_linux_set_etheraddr(struct netdev *netdev_,
937 const uint8_t mac[ETH_ADDR_LEN])
939 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
940 struct netdev_saved_flags *sf = NULL;
/* Consult the cache first: a cached error is returned unchanged, a cached
 * address equal to 'mac' needs no work, and otherwise the cache entry is
 * invalidated before the change is attempted. */
943 if (netdev->cache_valid & VALID_ETHERADDR) {
944 if (netdev->ether_addr_error) {
945 return netdev->ether_addr_error;
947 if (eth_addr_equals(netdev->etheraddr, mac)) {
950 netdev->cache_valid &= ~VALID_ETHERADDR;
953 /* Tap devices must be brought down before setting the address. */
954 if (is_tap_netdev(netdev_)) {
955 netdev_turn_flags_off(netdev_, NETDEV_UP, &sf);
957 error = set_etheraddr(netdev_get_name(netdev_), mac);
/* Only success and ENODEV are cached; any other error leaves the cache entry
 * invalid. */
958 if (!error || error == ENODEV) {
959 netdev->ether_addr_error = error;
960 netdev->cache_valid |= VALID_ETHERADDR;
962 memcpy(netdev->etheraddr, mac, ETH_ADDR_LEN);
/* Restores the flags saved above; 'sf' is still NULL for non-tap devices. */
966 netdev_restore_flags(sf);
971 /* Copies 'netdev''s MAC address to 'mac' which is passed as param. */
/* Returns the cached error code of the lookup: 0 on success, in which case
 * 'mac' has been filled in; otherwise 'mac' is left untouched. */
973 netdev_linux_get_etheraddr(const struct netdev *netdev_,
974 uint8_t mac[ETH_ADDR_LEN])
976 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
/* Cache miss: query the address once and remember the outcome, including a
 * failure, so that later calls do not query again. */
978 if (!(netdev->cache_valid & VALID_ETHERADDR)) {
979 int error = get_etheraddr(netdev_get_name(netdev_),
982 netdev->ether_addr_error = error;
983 netdev->cache_valid |= VALID_ETHERADDR;
986 if (!netdev->ether_addr_error) {
987 memcpy(mac, netdev->etheraddr, ETH_ADDR_LEN);
990 return netdev->ether_addr_error;
993 /* Returns the maximum size of transmitted (and received) packets on 'netdev',
994 * in bytes, not including the hardware header; thus, this is typically 1500
995 * bytes for Ethernet devices. */
/* The MTU is stored in '*mtup'; the return value is the cached error code of
 * the query (0 on success).  '*mtup' is written only when that code is 0. */
997 netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
999 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
/* Cache miss: ask the kernel with SIOCGIFMTU and cache both the value and the
 * error code. */
1000 if (!(netdev->cache_valid & VALID_MTU)) {
1004 error = af_inet_ifreq_ioctl(netdev_get_name(netdev_), &ifr,
1005 SIOCGIFMTU, "SIOCGIFMTU");
1007 netdev->netdev_mtu_error = error;
1008 netdev->mtu = ifr.ifr_mtu;
1009 netdev->cache_valid |= VALID_MTU;
1012 if (!netdev->netdev_mtu_error) {
1013 *mtup = netdev->mtu;
1015 return netdev->netdev_mtu_error;
1018 /* Sets the maximum size of transmitted packets (MTU) for the given device
1019 * using the Linux networking ioctl interface. */
/* Sets the MTU of 'netdev_' to 'mtu' with the SIOCSIFMTU ioctl, going through
 * the same VALID_MTU cache that netdev_linux_get_mtu() uses. */
1022 netdev_linux_set_mtu(const struct netdev *netdev_, int mtu)
1024 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
/* Consult the cache first: a cached error is returned unchanged, a cached MTU
 * equal to 'mtu' needs no work, and otherwise the cache entry is invalidated
 * before the ioctl. */
1028 if (netdev->cache_valid & VALID_MTU) {
1029 if (netdev->netdev_mtu_error) {
1030 return netdev->netdev_mtu_error;
1032 if (netdev->mtu == mtu) {
1035 netdev->cache_valid &= ~VALID_MTU;
1038 error = af_inet_ifreq_ioctl(netdev_get_name(netdev_), &ifr,
1039 SIOCSIFMTU, "SIOCSIFMTU");
/* Only success and ENODEV are cached; any other error leaves the cache entry
 * invalid. */
1040 if (!error || error == ENODEV) {
1041 netdev->netdev_mtu_error = error;
1042 netdev->mtu = ifr.ifr_mtu;
1043 netdev->cache_valid |= VALID_MTU;
1048 /* Returns the ifindex of 'netdev', if successful, as a positive number.
1049 * On failure, returns a negative errno value. */
1051 netdev_linux_get_ifindex(const struct netdev *netdev)
1055 error = get_ifindex(netdev, &ifindex);
/* get_ifindex() reports a positive errno; it is negated to match the
 * contract above. */
1056 return error ? -error : ifindex;
/* Stores the carrier state of 'netdev_' in '*carrier'.  When MII monitoring is
 * enabled ('miimon_interval' > 0) the result of the last MII poll is used;
 * otherwise the IFF_RUNNING bit of the cached interface flags decides. */
1060 netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
1062 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1064 if (netdev->miimon_interval > 0) {
1065 *carrier = netdev->miimon;
1067 *carrier = (netdev->ifi_flags & IFF_RUNNING) != 0;
/* Returns the 'carrier_resets' counter of 'netdev', which
 * netdev_linux_changed() increments each time IFF_RUNNING toggles. */
1073 static long long int
1074 netdev_linux_get_carrier_resets(const struct netdev *netdev)
1076 return netdev_linux_cast(netdev)->carrier_resets;
/* Issues the MII ioctl 'cmd' (named 'cmd_name' for logging) on device 'name'.
 *
 * 'data' is an in/out argument: its contents are copied into the 'ifr_data'
 * member of a zeroed ifreq before the ioctl and copied back out afterwards. */
1080 netdev_linux_do_miimon(const char *name, int cmd, const char *cmd_name,
1081 struct mii_ioctl_data *data)
1086 memset(&ifr, 0, sizeof ifr);
1087 memcpy(&ifr.ifr_data, data, sizeof *data);
1088 error = af_inet_ifreq_ioctl(name, &ifr, cmd, cmd_name);
/* The copy back happens regardless of the value of 'error'. */
1089 memcpy(data, &ifr.ifr_data, sizeof *data);
/* Determines the link status of device 'name' and stores it in '*miimon'.
 *
 * The MII path issues SIOCGIFMIIPHY-style queries through
 * netdev_linux_do_miimon(): SIOCGMIIPHY first, then SIOCGMIIREG for the
 * MII_BMSR register, whose BMSR_LSTATUS bit is the link status.  A second
 * path falls back to the ethtool ETHTOOL_GLINK command. */
1095 netdev_linux_get_miimon(const char *name, bool *miimon)
1097 struct mii_ioctl_data data;
1102 memset(&data, 0, sizeof data);
1103 error = netdev_linux_do_miimon(name, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
1105 /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
1106 data.reg_num = MII_BMSR;
1107 error = netdev_linux_do_miimon(name, SIOCGMIIREG, "SIOCGMIIREG",
1111 *miimon = !!(data.val_out & BMSR_LSTATUS);
1113 VLOG_WARN_RL(&rl, "%s: failed to query MII", name);
/* Fallback when the MII query is unavailable: ask ethtool for link state. */
1116 struct ethtool_cmd ecmd;
1118 VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
1121 COVERAGE_INC(netdev_get_ethtool);
1122 memset(&ecmd, 0, sizeof ecmd);
1123 error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
1126 struct ethtool_value eval;
/* The reply is read through 'struct ethtool_value' by copying the leading
 * bytes of 'ecmd' rather than by casting the pointer. */
1128 memcpy(&eval, &ecmd, sizeof eval);
1129 *miimon = !!eval.data;
1131 VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
/* Sets the MII monitoring interval of 'netdev_'.
 *
 * A positive 'interval' is raised to at least 100; zero or a negative value
 * becomes 0, which disables monitoring (see 'miimon_interval' in
 * 'struct netdev_linux').
 * NOTE(review): the unit is not stated here; presumably milliseconds --
 * confirm. */
1139 netdev_linux_set_miimon_interval(struct netdev *netdev_,
1140 long long int interval)
1142 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1144 interval = interval > 0 ? MAX(interval, 100) : 0;
/* On a change, the timer is marked expired so that the next
 * netdev_linux_miimon_run() considers this device right away. */
1145 if (netdev->miimon_interval != interval) {
1146 netdev->miimon_interval = interval;
1147 timer_set_expired(&netdev->miimon_timer);
/* Polls MII link status for every netdev_linux_class device whose monitoring
 * is enabled and whose timer has expired.  A change in status is recorded and
 * reported through netdev_linux_changed(); the timer is then rearmed for
 * another 'miimon_interval'. */
1154 netdev_linux_miimon_run(void)
1156 struct shash device_shash;
1157 struct shash_node *node;
1159 shash_init(&device_shash);
1160 netdev_get_devices(&netdev_linux_class, &device_shash);
1161 SHASH_FOR_EACH (node, &device_shash) {
1162 struct netdev *netdev = node->data;
1163 struct netdev_linux *dev = netdev_linux_cast(netdev);
/* Devices that are skipped are still closed, as are the polled ones below. */
1166 if (dev->miimon_interval <= 0 || !timer_expired(&dev->miimon_timer)) {
1167 netdev_close(netdev);
/* The return value of netdev_linux_get_miimon() is not checked here. */
1171 netdev_linux_get_miimon(dev->up.name, &miimon);
1172 if (miimon != dev->miimon) {
1173 dev->miimon = miimon;
/* The flags are unchanged; mask 0 invalidates the whole cache. */
1174 netdev_linux_changed(dev, dev->ifi_flags, 0);
1177 timer_set_duration(&dev->miimon_timer, dev->miimon_interval);
1178 netdev_close(netdev);
1181 shash_destroy(&device_shash);
/* Counterpart of netdev_linux_miimon_run(): for every netdev_linux_class
 * device with MII monitoring enabled, waits on the device's MII timer. */
1185 netdev_linux_miimon_wait(void)
1187 struct shash device_shash;
1188 struct shash_node *node;
1190 shash_init(&device_shash);
1191 netdev_get_devices(&netdev_linux_class, &device_shash);
1192 SHASH_FOR_EACH (node, &device_shash) {
1193 struct netdev *netdev = node->data;
1194 struct netdev_linux *dev = netdev_linux_cast(netdev);
1196 if (dev->miimon_interval > 0) {
1197 timer_wait(&dev->miimon_timer);
/* Every netdev obtained from netdev_get_devices() is closed again. */
1199 netdev_close(netdev);
1201 shash_destroy(&device_shash);
1204 /* Check whether we can use RTM_GETLINK to get network device statistics.
1205 * In pre-2.6.19 kernels, this was only available if wireless extensions were
/* Probes rtnetlink statistics support by resolving the ifindex of "lo" and
 * calling get_stats_via_netlink() on it, and logs which statistics source will
 * be used.  The return statements are not visible in this excerpt; the caller
 * netdev_linux_sys_get_stats() stores the result in 'use_netlink_stats'. */
1208 check_for_working_netlink_stats(void)
1210 /* Decide on the netdev_get_stats() implementation to use. Netlink is
1211 * preferable, so if that works, we'll use it. */
1212 int ifindex = do_get_ifindex("lo");
1214 VLOG_WARN("failed to get ifindex for lo, "
1215 "obtaining netdev stats from proc");
1218 struct netdev_stats stats;
1219 int error = get_stats_via_netlink(ifindex, &stats);
1221 VLOG_DBG("obtaining netdev stats via rtnetlink");
1224 VLOG_INFO("RTM_GETLINK failed (%s), obtaining netdev stats "
1225 "via proc (you are probably running a pre-2.6.19 "
1226 "kernel)", ovs_strerror(error));
/* Helper taking two uint64_t pointers.  Its body is not visible in this
 * excerpt; presumably it exchanges '*a' and '*b' -- netdev_tap_get_stats()
 * calls it on matching rx/tx counter pairs. */
1233 swap_uint64(uint64_t *a, uint64_t *b)
1240 /* Copies 'src' into 'dst', performing format conversion in the process.
1242 * 'src' is allowed to be misaligned. */
1244 netdev_stats_from_ovs_vport_stats(struct netdev_stats *dst,
1245 const struct ovs_vport_stats *src)
/* Counters taken from 'src'; get_unaligned_u64() is used because 'src' may be
 * misaligned. */
1247 dst->rx_packets = get_unaligned_u64(&src->rx_packets);
1248 dst->tx_packets = get_unaligned_u64(&src->tx_packets);
1249 dst->rx_bytes = get_unaligned_u64(&src->rx_bytes);
1250 dst->tx_bytes = get_unaligned_u64(&src->tx_bytes);
1251 dst->rx_errors = get_unaligned_u64(&src->rx_errors);
1252 dst->tx_errors = get_unaligned_u64(&src->tx_errors);
1253 dst->rx_dropped = get_unaligned_u64(&src->rx_dropped);
1254 dst->tx_dropped = get_unaligned_u64(&src->tx_dropped);
/* Counters that are not read from 'src' here are reported as 0. */
1256 dst->collisions = 0;
1257 dst->rx_length_errors = 0;
1258 dst->rx_over_errors = 0;
1259 dst->rx_crc_errors = 0;
1260 dst->rx_frame_errors = 0;
1261 dst->rx_fifo_errors = 0;
1262 dst->rx_missed_errors = 0;
1263 dst->tx_aborted_errors = 0;
1264 dst->tx_carrier_errors = 0;
1265 dst->tx_fifo_errors = 0;
1266 dst->tx_heartbeat_errors = 0;
1267 dst->tx_window_errors = 0;
/* Looks up the datapath vport named after 'netdev' with
 * dpif_linux_vport_get() and, when the reply carries statistics, converts them
 * into 'stats' with netdev_stats_from_ovs_vport_stats().  The values returned
 * on the failure and "no stats" paths are not visible in this excerpt. */
1271 get_stats_via_vport__(const struct netdev *netdev, struct netdev_stats *stats)
1273 struct dpif_linux_vport reply;
1277 error = dpif_linux_vport_get(netdev_get_name(netdev), &reply, &buf);
1280 } else if (!reply.stats) {
1285 netdev_stats_from_ovs_vport_stats(stats, reply.stats);
/* Refreshes 'stats' from the vport layer unless a previous failure is cached.
 * The query runs when netdev->vport_stats_error is 0 or when the
 * VALID_VPORT_STAT_ERROR bit of cache_valid is clear.  Errors other than
 * ENOENT are logged (rate-limited).  The outcome is stored in
 * netdev->vport_stats_error and VALID_VPORT_STAT_ERROR is set, so callers
 * inspect that field to learn whether 'stats' was filled in. */
1293 get_stats_via_vport(const struct netdev *netdev_,
1294 struct netdev_stats *stats)
1296 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1298 if (!netdev->vport_stats_error ||
1299 !(netdev->cache_valid & VALID_VPORT_STAT_ERROR)) {
1302 error = get_stats_via_vport__(netdev_, stats);
1303 if (error && error != ENOENT) {
1304 VLOG_WARN_RL(&rl, "%s: obtaining netdev stats via vport failed "
1306 netdev_get_name(netdev_), ovs_strerror(error));
1308 netdev->vport_stats_error = error;
1309 netdev->cache_valid |= VALID_VPORT_STAT_ERROR;
/* Reads statistics for 'netdev_' from the kernel into 'stats'.  The choice of
 * source is made once (guarded by an ovsthread_once) by
 * check_for_working_netlink_stats(): with 'use_netlink_stats' set, the ifindex
 * is resolved and get_stats_via_netlink() is used; get_stats_via_proc() is the
 * other source (the keyword joining the two branches is not visible in this
 * excerpt).  A failure is logged (rate-limited) with the numeric error. */
1314 netdev_linux_sys_get_stats(const struct netdev *netdev_,
1315 struct netdev_stats *stats)
1317 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
1318 static int use_netlink_stats;
1321 if (ovsthread_once_start(&once)) {
1322 use_netlink_stats = check_for_working_netlink_stats();
1323 ovsthread_once_done(&once);
1326 if (use_netlink_stats) {
1329 error = get_ifindex(netdev_, &ifindex);
1331 error = get_stats_via_netlink(ifindex, stats);
1334 error = get_stats_via_proc(netdev_get_name(netdev_), stats);
1338 VLOG_WARN_RL(&rl, "%s: linux-sys get stats failed %d",
1339 netdev_get_name(netdev_), error);
1345 /* Retrieves current device stats for 'netdev-linux'. */
/* 'stats' is first filled from the vport layer, then kernel statistics are
 * read into the local 'dev_stats'.  netdev->vport_stats_error selects how the
 * two are combined; several lines of that branching are not visible in this
 * excerpt. */
1347 netdev_linux_get_stats(const struct netdev *netdev_,
1348 struct netdev_stats *stats)
1350 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1351 struct netdev_stats dev_stats;
1354 get_stats_via_vport(netdev_, stats);
1356 error = netdev_linux_sys_get_stats(netdev_, &dev_stats);
1359 if (netdev->vport_stats_error) {
1366 if (netdev->vport_stats_error) {
1367 /* stats not available from OVS then use ioctl stats. */
/* The additions below fold the kernel's error and drop counters into 'stats'
 * (presumably on the path where vport stats were obtained -- confirm). */
1370 stats->rx_errors += dev_stats.rx_errors;
1371 stats->tx_errors += dev_stats.tx_errors;
1372 stats->rx_dropped += dev_stats.rx_dropped;
1373 stats->tx_dropped += dev_stats.tx_dropped;
1374 stats->multicast += dev_stats.multicast;
1375 stats->collisions += dev_stats.collisions;
1376 stats->rx_length_errors += dev_stats.rx_length_errors;
1377 stats->rx_over_errors += dev_stats.rx_over_errors;
1378 stats->rx_crc_errors += dev_stats.rx_crc_errors;
1379 stats->rx_frame_errors += dev_stats.rx_frame_errors;
1380 stats->rx_fifo_errors += dev_stats.rx_fifo_errors;
1381 stats->rx_missed_errors += dev_stats.rx_missed_errors;
1382 stats->tx_aborted_errors += dev_stats.tx_aborted_errors;
1383 stats->tx_carrier_errors += dev_stats.tx_carrier_errors;
1384 stats->tx_fifo_errors += dev_stats.tx_fifo_errors;
1385 stats->tx_heartbeat_errors += dev_stats.tx_heartbeat_errors;
1386 stats->tx_window_errors += dev_stats.tx_window_errors;
1391 /* Retrieves current device stats for 'netdev-tap' netdev or
1392 * netdev-internal. */
/* Like netdev_linux_get_stats(), this queries the vport layer into 'stats' and
 * the kernel into 'dev_stats'.  Where kernel numbers are used, rx and tx are
 * crossed: the packet/byte/error/drop pairs are swapped and the error-detail
 * counters are cleared, and the final additions add the kernel's tx_* values
 * to rx_* and vice versa.  Parts of the branching on vport_stats_error are not
 * visible in this excerpt. */
1394 netdev_tap_get_stats(const struct netdev *netdev_, struct netdev_stats *stats)
1396 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1397 struct netdev_stats dev_stats;
1400 get_stats_via_vport(netdev_, stats);
1402 error = netdev_linux_sys_get_stats(netdev_, &dev_stats);
1404 if (netdev->vport_stats_error) {
1411 /* If this port is an internal port then the transmit and receive stats
1412 * will appear to be swapped relative to the other ports since we are the
1413 * one sending the data, not a remote computer. For consistency, we swap
1414 * them back here. This does not apply if we are getting stats from the
1415 * vport layer because it always tracks stats from the perspective of the
1417 if (netdev->vport_stats_error) {
1419 swap_uint64(&stats->rx_packets, &stats->tx_packets);
1420 swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
1421 swap_uint64(&stats->rx_errors, &stats->tx_errors);
1422 swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
1423 stats->rx_length_errors = 0;
1424 stats->rx_over_errors = 0;
1425 stats->rx_crc_errors = 0;
1426 stats->rx_frame_errors = 0;
1427 stats->rx_fifo_errors = 0;
1428 stats->rx_missed_errors = 0;
1429 stats->tx_aborted_errors = 0;
1430 stats->tx_carrier_errors = 0;
1431 stats->tx_fifo_errors = 0;
1432 stats->tx_heartbeat_errors = 0;
1433 stats->tx_window_errors = 0;
1435 stats->rx_dropped += dev_stats.tx_dropped;
1436 stats->tx_dropped += dev_stats.rx_dropped;
1438 stats->rx_errors += dev_stats.tx_errors;
1439 stats->tx_errors += dev_stats.rx_errors;
1441 stats->multicast += dev_stats.multicast;
1442 stats->collisions += dev_stats.collisions;
/* Retrieves stats for an internal device from the vport layer only.  Returns
 * the error recorded by get_stats_via_vport() in netdev->vport_stats_error
 * (0 when the vport query succeeded). */
1448 netdev_internal_get_stats(const struct netdev *netdev_,
1449 struct netdev_stats *stats)
1451 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1453 get_stats_via_vport(netdev_, stats);
1454 return netdev->vport_stats_error;
/* Pushes the packet, byte, error and drop counters from 'stats' to the vport
 * named after 'netdev' by sending an OVS_VPORT_CMD_SET request.  Only the
 * eight counters copied below are transmitted.  An ENODEV reply gets special
 * handling whose details are not visible in this excerpt. */
1458 netdev_internal_set_stats(struct netdev *netdev,
1459 const struct netdev_stats *stats)
1461 struct ovs_vport_stats vport_stats;
1462 struct dpif_linux_vport vport;
1465 vport_stats.rx_packets = stats->rx_packets;
1466 vport_stats.tx_packets = stats->tx_packets;
1467 vport_stats.rx_bytes = stats->rx_bytes;
1468 vport_stats.tx_bytes = stats->tx_bytes;
1469 vport_stats.rx_errors = stats->rx_errors;
1470 vport_stats.tx_errors = stats->tx_errors;
1471 vport_stats.rx_dropped = stats->rx_dropped;
1472 vport_stats.tx_dropped = stats->tx_dropped;
1474 dpif_linux_vport_init(&vport);
1475 vport.cmd = OVS_VPORT_CMD_SET;
1476 vport.name = netdev_get_name(netdev);
1477 vport.stats = &vport_stats;
1479 err = dpif_linux_vport_transact(&vport, NULL, NULL);
1481 /* If the vport layer doesn't know about the device, that doesn't mean it
1482 * doesn't exist (after all, we were able to open it when netdev_open() was
1483 * called), it just means that it isn't attached and we'll be getting
1484 * stats a different way. */
1485 if (err == ENODEV) {
/* Populates netdev->supported, netdev->advertised and netdev->current from an
 * ETHTOOL_GSET query, translating ethtool SUPPORTED_*/ADVERTISED_* bits and the
 * reported speed, duplex and port type into NETDEV_F_* bits.  When
 * VALID_FEATURES is already set in cache_valid the early branch is taken
 * instead (its body is not visible here).  At the end VALID_FEATURES is set
 * and the ethtool result is stored in netdev->get_features_error. */
1493 netdev_linux_read_features(struct netdev_linux *netdev)
1495 struct ethtool_cmd ecmd;
1499 if (netdev->cache_valid & VALID_FEATURES) {
1503 COVERAGE_INC(netdev_get_ethtool);
1504 memset(&ecmd, 0, sizeof ecmd);
1505 error = netdev_linux_do_ethtool(netdev->up.name, &ecmd,
1506 ETHTOOL_GSET, "ETHTOOL_GSET");
1511 /* Supported features. */
1512 netdev->supported = 0;
1513 if (ecmd.supported & SUPPORTED_10baseT_Half) {
1514 netdev->supported |= NETDEV_F_10MB_HD;
1516 if (ecmd.supported & SUPPORTED_10baseT_Full) {
1517 netdev->supported |= NETDEV_F_10MB_FD;
1519 if (ecmd.supported & SUPPORTED_100baseT_Half) {
1520 netdev->supported |= NETDEV_F_100MB_HD;
1522 if (ecmd.supported & SUPPORTED_100baseT_Full) {
1523 netdev->supported |= NETDEV_F_100MB_FD;
1525 if (ecmd.supported & SUPPORTED_1000baseT_Half) {
1526 netdev->supported |= NETDEV_F_1GB_HD;
1528 if (ecmd.supported & SUPPORTED_1000baseT_Full) {
1529 netdev->supported |= NETDEV_F_1GB_FD;
1531 if (ecmd.supported & SUPPORTED_10000baseT_Full) {
1532 netdev->supported |= NETDEV_F_10GB_FD;
1534 if (ecmd.supported & SUPPORTED_TP) {
1535 netdev->supported |= NETDEV_F_COPPER;
1537 if (ecmd.supported & SUPPORTED_FIBRE) {
1538 netdev->supported |= NETDEV_F_FIBER;
1540 if (ecmd.supported & SUPPORTED_Autoneg) {
1541 netdev->supported |= NETDEV_F_AUTONEG;
1543 if (ecmd.supported & SUPPORTED_Pause) {
1544 netdev->supported |= NETDEV_F_PAUSE;
1546 if (ecmd.supported & SUPPORTED_Asym_Pause) {
1547 netdev->supported |= NETDEV_F_PAUSE_ASYM;
1550 /* Advertised features. */
1551 netdev->advertised = 0;
1552 if (ecmd.advertising & ADVERTISED_10baseT_Half) {
1553 netdev->advertised |= NETDEV_F_10MB_HD;
1555 if (ecmd.advertising & ADVERTISED_10baseT_Full) {
1556 netdev->advertised |= NETDEV_F_10MB_FD;
1558 if (ecmd.advertising & ADVERTISED_100baseT_Half) {
1559 netdev->advertised |= NETDEV_F_100MB_HD;
1561 if (ecmd.advertising & ADVERTISED_100baseT_Full) {
1562 netdev->advertised |= NETDEV_F_100MB_FD;
1564 if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
1565 netdev->advertised |= NETDEV_F_1GB_HD;
1567 if (ecmd.advertising & ADVERTISED_1000baseT_Full) {
1568 netdev->advertised |= NETDEV_F_1GB_FD;
1570 if (ecmd.advertising & ADVERTISED_10000baseT_Full) {
1571 netdev->advertised |= NETDEV_F_10GB_FD;
1573 if (ecmd.advertising & ADVERTISED_TP) {
1574 netdev->advertised |= NETDEV_F_COPPER;
1576 if (ecmd.advertising & ADVERTISED_FIBRE) {
1577 netdev->advertised |= NETDEV_F_FIBER;
1579 if (ecmd.advertising & ADVERTISED_Autoneg) {
1580 netdev->advertised |= NETDEV_F_AUTONEG;
1582 if (ecmd.advertising & ADVERTISED_Pause) {
1583 netdev->advertised |= NETDEV_F_PAUSE;
1585 if (ecmd.advertising & ADVERTISED_Asym_Pause) {
1586 netdev->advertised |= NETDEV_F_PAUSE_ASYM;
1589 /* Current settings. */
/* 'speed' is compared against the SPEED_* constants up to 10G and against
 * literal values for 40G, 100G and 1T (presumably because no SPEED_* macro for
 * those is available in the headers -- confirm).  Half/full duplex is
 * distinguished only up to 1G. */
1591 if (speed == SPEED_10) {
1592 netdev->current = ecmd.duplex ? NETDEV_F_10MB_FD : NETDEV_F_10MB_HD;
1593 } else if (speed == SPEED_100) {
1594 netdev->current = ecmd.duplex ? NETDEV_F_100MB_FD : NETDEV_F_100MB_HD;
1595 } else if (speed == SPEED_1000) {
1596 netdev->current = ecmd.duplex ? NETDEV_F_1GB_FD : NETDEV_F_1GB_HD;
1597 } else if (speed == SPEED_10000) {
1598 netdev->current = NETDEV_F_10GB_FD;
1599 } else if (speed == 40000) {
1600 netdev->current = NETDEV_F_40GB_FD;
1601 } else if (speed == 100000) {
1602 netdev->current = NETDEV_F_100GB_FD;
1603 } else if (speed == 1000000) {
1604 netdev->current = NETDEV_F_1TB_FD;
1606 netdev->current = 0;
1609 if (ecmd.port == PORT_TP) {
1610 netdev->current |= NETDEV_F_COPPER;
1611 } else if (ecmd.port == PORT_FIBRE) {
1612 netdev->current |= NETDEV_F_FIBER;
1616 netdev->current |= NETDEV_F_AUTONEG;
/* Cache the outcome, success or failure, so the query is not repeated. */
1620 netdev->cache_valid |= VALID_FEATURES;
1621 netdev->get_features_error = error;
1624 /* Stores the features supported by 'netdev' into '*current', '*advertised',
1625 * '*supported', and '*peer'. Each value is a bitmap of NETDEV_* bits.
1626 * Returns 0 if successful, otherwise a positive errno value. */
/* The values come from the cache maintained by netdev_linux_read_features().
 * The output parameters are written only when that cache holds no error, and
 * '*peer' is always reported as 0. */
1628 netdev_linux_get_features(const struct netdev *netdev_,
1629 enum netdev_features *current,
1630 enum netdev_features *advertised,
1631 enum netdev_features *supported,
1632 enum netdev_features *peer)
1634 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1636 netdev_linux_read_features(netdev);
1638 if (!netdev->get_features_error) {
1639 *current = netdev->current;
1640 *advertised = netdev->advertised;
1641 *supported = netdev->supported;
1642 *peer = 0; /* XXX */
1644 return netdev->get_features_error;
1647 /* Set the features advertised by 'netdev' to 'advertise'. */
/* The current settings are first read with ETHTOOL_GSET so that every field
 * other than 'advertising' is written back unchanged; 'advertising' is then
 * rebuilt from the NETDEV_F_* bits in 'advertise' and the command is submitted
 * with ETHTOOL_SSET, whose result is returned. */
1649 netdev_linux_set_advertisements(struct netdev *netdev,
1650 enum netdev_features advertise)
1652 struct ethtool_cmd ecmd;
1655 COVERAGE_INC(netdev_get_ethtool);
1656 memset(&ecmd, 0, sizeof ecmd);
1657 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1658 ETHTOOL_GSET, "ETHTOOL_GSET");
1663 ecmd.advertising = 0;
1664 if (advertise & NETDEV_F_10MB_HD) {
1665 ecmd.advertising |= ADVERTISED_10baseT_Half;
1667 if (advertise & NETDEV_F_10MB_FD) {
1668 ecmd.advertising |= ADVERTISED_10baseT_Full;
1670 if (advertise & NETDEV_F_100MB_HD) {
1671 ecmd.advertising |= ADVERTISED_100baseT_Half;
1673 if (advertise & NETDEV_F_100MB_FD) {
1674 ecmd.advertising |= ADVERTISED_100baseT_Full;
1676 if (advertise & NETDEV_F_1GB_HD) {
1677 ecmd.advertising |= ADVERTISED_1000baseT_Half;
1679 if (advertise & NETDEV_F_1GB_FD) {
1680 ecmd.advertising |= ADVERTISED_1000baseT_Full;
1682 if (advertise & NETDEV_F_10GB_FD) {
1683 ecmd.advertising |= ADVERTISED_10000baseT_Full;
1685 if (advertise & NETDEV_F_COPPER) {
1686 ecmd.advertising |= ADVERTISED_TP;
1688 if (advertise & NETDEV_F_FIBER) {
1689 ecmd.advertising |= ADVERTISED_FIBRE;
1691 if (advertise & NETDEV_F_AUTONEG) {
1692 ecmd.advertising |= ADVERTISED_Autoneg;
1694 if (advertise & NETDEV_F_PAUSE) {
1695 ecmd.advertising |= ADVERTISED_Pause;
1697 if (advertise & NETDEV_F_PAUSE_ASYM) {
1698 ecmd.advertising |= ADVERTISED_Asym_Pause;
1700 COVERAGE_INC(netdev_set_ethtool);
1701 return netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1702 ETHTOOL_SSET, "ETHTOOL_SSET");
1705 /* Attempts to set input rate limiting (policing) policy. Returns 0 if
1706 * successful, otherwise a positive errno value. */
/* Results are cached under VALID_POLICING: a cached error is returned without
 * retrying, and a request whose rate and burst equal the cached values is
 * assumed to be already in effect.  Otherwise any existing ingress qdisc is
 * removed and then an ingress qdisc and a policer are added (the conditions
 * guarding those steps are not visible in this excerpt).  Only a result of
 * success or ENODEV is written back to the cache. */
1708 netdev_linux_set_policing(struct netdev *netdev_,
1709 uint32_t kbits_rate, uint32_t kbits_burst)
1711 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1712 const char *netdev_name = netdev_get_name(netdev_);
1716 kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
1717 : !kbits_burst ? 1000 /* Default to 1000 kbits if 0. */
1718 : kbits_burst); /* Stick with user-specified value. */
1720 if (netdev->cache_valid & VALID_POLICING) {
1721 if (netdev->netdev_policing_error) {
1722 return netdev->netdev_policing_error;
1725 if (netdev->kbits_rate == kbits_rate &&
1726 netdev->kbits_burst == kbits_burst) {
1727 /* Assume that settings haven't changed since we last set them. */
1730 netdev->cache_valid &= ~VALID_POLICING;
1733 COVERAGE_INC(netdev_set_policing);
1734 /* Remove any existing ingress qdisc. */
1735 error = tc_add_del_ingress_qdisc(netdev_, false);
1737 VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
1738 netdev_name, ovs_strerror(error));
1743 error = tc_add_del_ingress_qdisc(netdev_, true);
1745 VLOG_WARN_RL(&rl, "%s: adding policing qdisc failed: %s",
1746 netdev_name, ovs_strerror(error));
1750 error = tc_add_policer(netdev_, kbits_rate, kbits_burst);
1752 VLOG_WARN_RL(&rl, "%s: adding policing action failed: %s",
1753 netdev_name, ovs_strerror(error));
1758 netdev->kbits_rate = kbits_rate;
1759 netdev->kbits_burst = kbits_burst;
1762 if (!error || error == ENODEV) {
1763 netdev->netdev_policing_error = error;
1764 netdev->cache_valid |= VALID_POLICING;
/* Adds to 'types' the OVS name of every entry in the NULL-terminated 'tcs'
 * table that has a tc_install callback and a nonempty ovs_name.  'netdev' is
 * not used. */
1770 netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
1773 const struct tc_ops *const *opsp;
1775 for (opsp = tcs; *opsp != NULL; opsp++) {
1776 const struct tc_ops *ops = *opsp;
1777 if (ops->tc_install && ops->ovs_name[0] != '\0') {
1778 sset_add(types, ops->ovs_name);
/* Searches the NULL-terminated 'tcs' table for an entry whose ovs_name equals
 * 'name'.  The return statements are not visible in this excerpt; callers
 * test the result against NULL, so presumably NULL means "not found". */
1784 static const struct tc_ops *
1785 tc_lookup_ovs_name(const char *name)
1787 const struct tc_ops *const *opsp;
1789 for (opsp = tcs; *opsp != NULL; opsp++) {
1790 const struct tc_ops *ops = *opsp;
1791 if (!strcmp(name, ops->ovs_name)) {
/* Searches the NULL-terminated 'tcs' table for an entry whose linux_name
 * equals 'name'.  Entries with a null linux_name are never matched.  The
 * return statements are not visible in this excerpt. */
1798 static const struct tc_ops *
1799 tc_lookup_linux_name(const char *name)
1801 const struct tc_ops *const *opsp;
1803 for (opsp = tcs; *opsp != NULL; opsp++) {
1804 const struct tc_ops *ops = *opsp;
1805 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
/* Scans the bucket selected by 'hash' in netdev->tc->queues for a queue whose
 * queue_id equals 'queue_id'.  Dereferences netdev->tc unconditionally, so the
 * caller must ensure it is set.  The return statements are not visible in
 * this excerpt. */
1812 static struct tc_queue *
1813 tc_find_queue__(const struct netdev *netdev_, unsigned int queue_id,
1816 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1817 struct tc_queue *queue;
1819 HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev->tc->queues) {
1820 if (queue->queue_id == queue_id) {
/* Convenience wrapper around tc_find_queue__() that computes the bucket hash
 * as hash_int(queue_id, 0). */
1827 static struct tc_queue *
1828 tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
1830 return tc_find_queue__(netdev, queue_id, hash_int(queue_id, 0));
/* Looks up the traffic-control implementation registered under the OVS name
 * 'type' and reports its queue count in caps->n_queues.  'netdev' is not used.
 * Handling of an unknown 'type' is not visible in this excerpt. */
1834 netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
1836 struct netdev_qos_capabilities *caps)
1838 const struct tc_ops *ops = tc_lookup_ovs_name(type);
1842 caps->n_queues = ops->n_queues;
/* Queries the kernel qdisc with tc_query_qdisc(), then stores the OVS name of
 * the active traffic-control implementation in '*typep' and, when that
 * implementation provides a qdisc_get callback, lets it fill 'details'.  The
 * value used when qdisc_get is absent is not visible in this excerpt. */
1847 netdev_linux_get_qos(const struct netdev *netdev_,
1848 const char **typep, struct smap *details)
1850 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1853 error = tc_query_qdisc(netdev_);
1858 *typep = netdev->tc->ops->ovs_name;
1859 return (netdev->tc->ops->qdisc_get
1860 ? netdev->tc->ops->qdisc_get(netdev_, details)
/* Configures QoS of kind 'type' with 'details' on 'netdev_'.  'type' must name
 * a traffic-control implementation that has a tc_install callback.  When the
 * requested implementation is the one already installed, the request is
 * forwarded to its qdisc_set callback (or 0 is returned when it has none).
 * Otherwise the existing qdisc is deleted and the new one installed.  The
 * assertions state the invariants relied on: netdev->tc is NULL after a
 * deletion, and tc_install sets netdev->tc exactly when it succeeds. */
1865 netdev_linux_set_qos(struct netdev *netdev_,
1866 const char *type, const struct smap *details)
1868 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1869 const struct tc_ops *new_ops;
1872 new_ops = tc_lookup_ovs_name(type);
1873 if (!new_ops || !new_ops->tc_install) {
1877 error = tc_query_qdisc(netdev_);
1882 if (new_ops == netdev->tc->ops) {
1883 return new_ops->qdisc_set ? new_ops->qdisc_set(netdev_, details) : 0;
1885 /* Delete existing qdisc. */
1886 error = tc_del_qdisc(netdev_);
1890 ovs_assert(netdev->tc == NULL);
1892 /* Install new qdisc. */
1893 error = new_ops->tc_install(netdev_, details);
1894 ovs_assert((error == 0) == (netdev->tc != NULL));
/* Retrieves the configuration of queue 'queue_id' into 'details': after
 * tc_query_qdisc(), the queue is looked up with tc_find_queue() and passed to
 * the implementation's class_get callback.  The condition selecting between
 * class_get and its alternative is not visible in this excerpt. */
1901 netdev_linux_get_queue(const struct netdev *netdev_,
1902 unsigned int queue_id, struct smap *details)
1904 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1907 error = tc_query_qdisc(netdev_);
1911 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
1913 ? netdev->tc->ops->class_get(netdev_, queue, details)
/* Configures queue 'queue_id' with 'details' through the implementation's
 * class_set callback.  A 'queue_id' at or beyond the implementation's n_queues,
 * or a missing class_set callback, takes the error branch instead (the value
 * returned there is not visible in this excerpt). */
1919 netdev_linux_set_queue(struct netdev *netdev_,
1920 unsigned int queue_id, const struct smap *details)
1922 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1925 error = tc_query_qdisc(netdev_);
1928 } else if (queue_id >= netdev->tc->ops->n_queues
1929 || !netdev->tc->ops->class_set) {
1933 return netdev->tc->ops->class_set(netdev_, queue_id, details);
/* Deletes queue 'queue_id': after tc_query_qdisc(), the queue is looked up
 * with tc_find_queue() and handed to the implementation's class_delete
 * callback.  An implementation without class_delete takes the error branch
 * (the value returned there is not visible in this excerpt). */
1937 netdev_linux_delete_queue(struct netdev *netdev_, unsigned int queue_id)
1939 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1942 error = tc_query_qdisc(netdev_);
1945 } else if (!netdev->tc->ops->class_delete) {
1948 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
1950 ? netdev->tc->ops->class_delete(netdev_, queue)
/* Retrieves statistics for queue 'queue_id' into 'stats'.  stats->created is
 * copied from the queue record, and the remaining fields are left to the
 * implementation's class_get_stats callback, whose result is returned.  An
 * implementation without class_get_stats takes the error branch; the handling
 * of an unknown queue is not visible in this excerpt. */
1956 netdev_linux_get_queue_stats(const struct netdev *netdev_,
1957 unsigned int queue_id,
1958 struct netdev_queue_stats *stats)
1960 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1963 error = tc_query_qdisc(netdev_);
1966 } else if (!netdev->tc->ops->class_get_stats) {
1969 const struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
1973 stats->created = queue->created;
1974 return netdev->tc->ops->class_get_stats(netdev_, queue, stats);
/* Begins a Netlink dump of the traffic classes of 'netdev': builds an
 * RTM_GETTCLASS request with tcm_parent 0, starts 'dump' on NETLINK_ROUTE and
 * releases the request buffer.  The caller tests the result for truth; the
 * return statements are not visible in this excerpt. */
1979 start_queue_dump(const struct netdev *netdev, struct nl_dump *dump)
1981 struct ofpbuf request;
1982 struct tcmsg *tcmsg;
1984 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
1988 tcmsg->tcm_parent = 0;
1989 nl_dump_start(dump, NETLINK_ROUTE, &request);
1990 ofpbuf_uninit(&request);
/* Invokes 'cb' for the queues recorded in netdev->tc->queues, passing the
 * queue id, the details produced by the implementation's class_get callback,
 * and 'aux'.  A single smap is cleared and reused for every queue.  The
 * _SAFE iteration macro is used (presumably so that 'cb' may remove the
 * current queue -- confirm).  The condition guarding the callback is not
 * visible in this excerpt. */
1995 netdev_linux_dump_queues(const struct netdev *netdev_,
1996 netdev_dump_queues_cb *cb, void *aux)
1998 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1999 struct tc_queue *queue, *next_queue;
2000 struct smap details;
2004 error = tc_query_qdisc(netdev_);
2007 } else if (!netdev->tc->ops->class_get) {
2012 smap_init(&details);
2013 HMAP_FOR_EACH_SAFE (queue, next_queue, hmap_node,
2014 &netdev->tc->queues) {
2015 smap_clear(&details);
2017 error = netdev->tc->ops->class_get(netdev_, queue, &details);
2019 (*cb)(queue->queue_id, &details, aux);
2024 smap_destroy(&details);
/* Dumps per-queue statistics by iterating a kernel traffic-class dump and
 * passing each message, with 'cb' and 'aux', to the implementation's
 * class_dump_stats callback.  Returns the error from nl_dump_done() if there
 * is one, otherwise 'last_error' (whose assignment is not visible in this
 * excerpt). */
2030 netdev_linux_dump_queue_stats(const struct netdev *netdev_,
2031 netdev_dump_queue_stats_cb *cb, void *aux)
2033 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2034 struct nl_dump dump;
2039 error = tc_query_qdisc(netdev_);
2042 } else if (!netdev->tc->ops->class_dump_stats) {
2047 if (!start_queue_dump(netdev_, &dump)) {
2050 while (nl_dump_next(&dump, &msg)) {
2051 error = netdev->tc->ops->class_dump_stats(netdev_, &msg, cb, aux);
2057 error = nl_dump_done(&dump);
2058 return error ? error : last_error;
/* Stores the IPv4 address and netmask of 'netdev_' in '*address' and
 * '*netmask'.  The values are fetched with SIOCGIFADDR and SIOCGIFNETMASK the
 * first time and cached under VALID_IN4 afterwards.  Returns EADDRNOTAVAIL
 * when the address is INADDR_ANY, otherwise 0 (error returns from the two
 * queries are not visible in this excerpt). */
2062 netdev_linux_get_in4(const struct netdev *netdev_,
2063 struct in_addr *address, struct in_addr *netmask)
2065 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2067 if (!(netdev->cache_valid & VALID_IN4)) {
2070 error = netdev_linux_get_ipv4(netdev_, &netdev->address,
2071 SIOCGIFADDR, "SIOCGIFADDR");
2076 error = netdev_linux_get_ipv4(netdev_, &netdev->netmask,
2077 SIOCGIFNETMASK, "SIOCGIFNETMASK");
2082 netdev->cache_valid |= VALID_IN4;
2084 *address = netdev->address;
2085 *netmask = netdev->netmask;
2086 return address->s_addr == INADDR_ANY ? EADDRNOTAVAIL : 0;
/* Assigns IPv4 'address' and 'netmask' to 'netdev_'.  The address is set with
 * SIOCSIFADDR; the cached address and netmask are updated and marked valid;
 * and, unless the address is INADDR_ANY, the netmask is set with
 * SIOCSIFNETMASK.  Note that the cache is updated before the netmask ioctl is
 * issued. */
2090 netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
2091 struct in_addr netmask)
2093 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2096 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
2098 netdev->cache_valid |= VALID_IN4;
2099 netdev->address = address;
2100 netdev->netmask = netmask;
2101 if (address.s_addr != INADDR_ANY) {
2102 error = do_set_addr(netdev_, SIOCSIFNETMASK,
2103 "SIOCSIFNETMASK", netmask);
/* Parses one 'line' as read from /proc/net/if_inet6 by netdev_linux_get_in6().
 * The format consists of sixteen two-digit hex bytes stored into
 * in6->s6_addr, four hex fields that are skipped, and an interface name of at
 * most 16 characters stored into 'ifname'.  The scanning call and the return
 * expression are not visible in this excerpt; the caller tests the result for
 * truth. */
2110 parse_if_inet6_line(const char *line,
2111 struct in6_addr *in6, char ifname[16 + 1])
2113 uint8_t *s6 = in6->s6_addr;
2114 #define X8 "%2"SCNx8
2116 " "X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8
2117 "%*x %*x %*x %*x %16s\n",
2118 &s6[0], &s6[1], &s6[2], &s6[3],
2119 &s6[4], &s6[5], &s6[6], &s6[7],
2120 &s6[8], &s6[9], &s6[10], &s6[11],
2121 &s6[12], &s6[13], &s6[14], &s6[15],
2125 /* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address (if
2126 * 'in6' is non-null) and returns true. Otherwise, returns false. */
/* The address is looked up once by scanning /proc/net/if_inet6 for a line
 * whose interface name matches, and is cached in netdev->in6 under VALID_IN6;
 * the cache starts out as in6addr_any. */
2128 netdev_linux_get_in6(const struct netdev *netdev_, struct in6_addr *in6)
2130 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2131 if (!(netdev->cache_valid & VALID_IN6)) {
2135 netdev->in6 = in6addr_any;
2137 file = fopen("/proc/net/if_inet6", "r");
2139 const char *name = netdev_get_name(netdev_);
2140 while (fgets(line, sizeof line, file)) {
2141 struct in6_addr in6_tmp;
2142 char ifname[16 + 1];
2143 if (parse_if_inet6_line(line, &in6_tmp, ifname)
2144 && !strcmp(name, ifname))
2146 netdev->in6 = in6_tmp;
2152 netdev->cache_valid |= VALID_IN6;
/* Fills the generic '*sa' with an AF_INET socket address for 'addr'.  A
 * zeroed sockaddr_in is built locally and copied over a zeroed '*sa', so bytes
 * not covered by the copy remain 0. */
2159 make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
2161 struct sockaddr_in sin;
2162 memset(&sin, 0, sizeof sin);
2163 sin.sin_family = AF_INET;
2164 sin.sin_addr = addr;
2167 memset(sa, 0, sizeof *sa);
2168 memcpy(sa, &sin, sizeof sin);
/* Issues the address-setting ioctl 'ioctl_nr' for 'netdev' with 'addr' encoded
 * as an AF_INET sockaddr in ifr.ifr_addr, and returns the result of
 * af_inet_ifreq_ioctl().  'ioctl_name' is the ioctl's printable name
 * (presumably forwarded for logging; that argument line is not visible in
 * this excerpt). */
2172 do_set_addr(struct netdev *netdev,
2173 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
2177 make_in4_sockaddr(&ifr.ifr_addr, addr);
2178 return af_inet_ifreq_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
2182 /* Adds 'router' as a default IP gateway. */
/* The route is described by a destination and genmask of INADDR_ANY, the
 * gateway 'router', and flags RTF_UP | RTF_GATEWAY, and is installed with
 * SIOCADDRT.  A failure is logged.  'netdev' is not used. */
2184 netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
2186 struct in_addr any = { INADDR_ANY };
2190 memset(&rt, 0, sizeof rt);
2191 make_in4_sockaddr(&rt.rt_dst, any);
2192 make_in4_sockaddr(&rt.rt_gateway, router);
2193 make_in4_sockaddr(&rt.rt_genmask, any);
2194 rt.rt_flags = RTF_UP | RTF_GATEWAY;
2195 error = af_inet_ioctl(SIOCADDRT, &rt);
2197 VLOG_WARN("ioctl(SIOCADDRT): %s", ovs_strerror(error));
/* Finds how to reach 'host' by scanning /proc/net/route line by line.  Each
 * line must yield eleven fields; lines that do not are reported with a
 * rate-limited warning.  Routes without RTF_UP are skipped.  For a route
 * whose masked destination matches the masked 'host', '*next_hop' is set to 0
 * (directly reachable) or to the route's gateway, and '*netdev_name' receives
 * a newly allocated copy of the interface name.  '*netdev_name' is NULL
 * unless a match is found.  The condition choosing between the two next-hop
 * cases and the return statements are not visible in this excerpt. */
2203 netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
2206 static const char fn[] = "/proc/net/route";
2211 *netdev_name = NULL;
2212 stream = fopen(fn, "r");
2213 if (stream == NULL) {
2214 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, ovs_strerror(errno));
2219 while (fgets(line, sizeof line, stream)) {
2222 ovs_be32 dest, gateway, mask;
2223 int refcnt, metric, mtu;
2224 unsigned int flags, use, window, irtt;
2227 "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
2229 iface, &dest, &gateway, &flags, &refcnt,
2230 &use, &metric, &mask, &mtu, &window, &irtt) != 11) {
2232 VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
2236 if (!(flags & RTF_UP)) {
2237 /* Skip routes that aren't up. */
2241 /* The output of 'dest', 'mask', and 'gateway' were given in
2242 * network byte order, so we don't need any endian
2243 * conversions here. */
2244 if ((dest & mask) == (host->s_addr & mask)) {
2246 /* The host is directly reachable. */
2247 next_hop->s_addr = 0;
2249 /* To reach the host, we must go through a gateway. */
2250 next_hop->s_addr = gateway;
2252 *netdev_name = xstrdup(iface);
/* Adds driver information for 'netdev_' to 'smap' under the keys
 * "driver_name", "driver_version" and "firmware_version".  The information is
 * obtained with ETHTOOL_GDRVINFO into netdev->drvinfo the first time and
 * cached under VALID_DRVINFO.  netdev->drvinfo is passed to
 * netdev_linux_do_ethtool() through a cast to 'struct ethtool_cmd *'. */
2264 netdev_linux_get_status(const struct netdev *netdev_, struct smap *smap)
2266 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2269 if (!(netdev->cache_valid & VALID_DRVINFO)) {
2270 struct ethtool_cmd *cmd = (struct ethtool_cmd *) &netdev->drvinfo;
2272 COVERAGE_INC(netdev_get_ethtool);
2273 memset(&netdev->drvinfo, 0, sizeof netdev->drvinfo);
2274 error = netdev_linux_do_ethtool(netdev->up.name,
2277 "ETHTOOL_GDRVINFO");
2279 netdev->cache_valid |= VALID_DRVINFO;
2284 smap_add(smap, "driver_name", netdev->drvinfo.driver);
2285 smap_add(smap, "driver_version", netdev->drvinfo.version);
2286 smap_add(smap, "firmware_version", netdev->drvinfo.fw_version);
/* Status for internal devices: reports a fixed "driver_name" of "openvswitch"
 * in 'smap'.  'netdev' is not used. */
2292 netdev_internal_get_status(const struct netdev *netdev OVS_UNUSED,
2295 smap_add(smap, "driver_name", "openvswitch");
2299 /* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
2300 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
2301 * returns 0. Otherwise, it returns a positive errno value; in particular,
2302 * ENXIO indicates that there is no ARP table entry for 'ip' on 'netdev'. */
/* The lookup is an SIOCGARP ioctl on an arpreq carrying 'ip' as an AF_INET
 * protocol address, ARPHRD_ETHER as the hardware family and the device name.
 * Failures other than ENXIO are logged (rate-limited). */
2304 netdev_linux_arp_lookup(const struct netdev *netdev,
2305 ovs_be32 ip, uint8_t mac[ETH_ADDR_LEN])
2308 struct sockaddr_in sin;
2311 memset(&r, 0, sizeof r);
2312 memset(&sin, 0, sizeof sin);
2313 sin.sin_family = AF_INET;
2314 sin.sin_addr.s_addr = ip;
2316 memcpy(&r.arp_pa, &sin, sizeof sin);
2317 r.arp_ha.sa_family = ARPHRD_ETHER;
2319 ovs_strzcpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
2320 COVERAGE_INC(netdev_arp_lookup);
2321 retval = af_inet_ioctl(SIOCGARP, &r);
2323 memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
2324 } else if (retval != ENXIO) {
2325 VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
2326 netdev_get_name(netdev), IP_ARGS(ip),
2327 ovs_strerror(retval));
/* Converts 'nd', a set of NETDEV_* flags, for use with the kernel interface
 * flags: netdev_linux_update_flags() combines the result with 'ifi_flags'.
 * NETDEV_UP and NETDEV_PROMISC are the bits examined; the assignments made
 * for them are not visible in this excerpt (presumably IFF_UP and
 * IFF_PROMISC -- confirm). */
2333 nd_to_iff_flags(enum netdev_flags nd)
2336 if (nd & NETDEV_UP) {
2339 if (nd & NETDEV_PROMISC) {
/* Converts kernel interface flags 'iff' into NETDEV_* flags, starting from an
 * empty set.  IFF_PROMISC maps to NETDEV_PROMISC; other mappings are not
 * visible in this excerpt. */
2346 iff_to_nd_flags(int iff)
2348 enum netdev_flags nd = 0;
2352 if (iff & IFF_PROMISC) {
2353 nd |= NETDEV_PROMISC;
/* Turns the flags in 'off' off and the flags in 'on' on for 'netdev_',
 * reporting the previous flags (translated to NETDEV_* bits) in
 * '*old_flagsp'.  The starting point is the cached netdev->ifi_flags.  The
 * kernel is contacted only when the resulting flags differ, after which the
 * cache is refreshed with get_flags() (whose result is not checked). */
2359 netdev_linux_update_flags(struct netdev *netdev_, enum netdev_flags off,
2360 enum netdev_flags on, enum netdev_flags *old_flagsp)
2362 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2363 int old_flags, new_flags;
2366 old_flags = netdev->ifi_flags;
2367 *old_flagsp = iff_to_nd_flags(old_flags);
2368 new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
2369 if (new_flags != old_flags) {
2370 error = set_flags(netdev_get_name(netdev_), new_flags);
2371 get_flags(netdev_, &netdev->ifi_flags);
/* Returns the 'change_seq' value stored in the netdev_linux that backs
 * 'netdev'. */
2377 netdev_linux_change_seq(const struct netdev *netdev)
2379 return netdev_linux_cast(netdev)->change_seq;
/* Template for the netdev class tables defined after it.  Most slots are
 * filled with the netdev_linux_* functions listed here; the six macro
 * parameters supply the parts that differ between classes.  Several slot
 * lines, including those where the parameters are used, are not visible in
 * this excerpt. */
2382 #define NETDEV_LINUX_CLASS(NAME, CREATE, GET_STATS, SET_STATS, \
2383 GET_FEATURES, GET_STATUS) \
2389 netdev_linux_wait, \
2392 netdev_linux_destroy, \
2393 NULL, /* get_config */ \
2394 NULL, /* set_config */ \
2395 NULL, /* get_tunnel_config */ \
2397 netdev_linux_rx_open, \
2399 netdev_linux_send, \
2400 netdev_linux_send_wait, \
2402 netdev_linux_set_etheraddr, \
2403 netdev_linux_get_etheraddr, \
2404 netdev_linux_get_mtu, \
2405 netdev_linux_set_mtu, \
2406 netdev_linux_get_ifindex, \
2407 netdev_linux_get_carrier, \
2408 netdev_linux_get_carrier_resets, \
2409 netdev_linux_set_miimon_interval, \
2414 netdev_linux_set_advertisements, \
2416 netdev_linux_set_policing, \
2417 netdev_linux_get_qos_types, \
2418 netdev_linux_get_qos_capabilities, \
2419 netdev_linux_get_qos, \
2420 netdev_linux_set_qos, \
2421 netdev_linux_get_queue, \
2422 netdev_linux_set_queue, \
2423 netdev_linux_delete_queue, \
2424 netdev_linux_get_queue_stats, \
2425 netdev_linux_dump_queues, \
2426 netdev_linux_dump_queue_stats, \
2428 netdev_linux_get_in4, \
2429 netdev_linux_set_in4, \
2430 netdev_linux_get_in6, \
2431 netdev_linux_add_router, \
2432 netdev_linux_get_next_hop, \
2434 netdev_linux_arp_lookup, \
2436 netdev_linux_update_flags, \
2438 netdev_linux_change_seq \
/* Class table that pairs netdev_linux_create with netdev_linux_get_stats,
 * ethtool-based features and status, and no set_stats.  The macro invocation
 * line and the class name string are not visible in this excerpt. */
2441 const struct netdev_class netdev_linux_class =
2444 netdev_linux_create,
2445 netdev_linux_get_stats,
2446 NULL, /* set_stats */
2447 netdev_linux_get_features,
2448 netdev_linux_get_status);
/* Class table for tap devices: created with netdev_linux_create_tap, with
 * stats from netdev_tap_get_stats, ethtool-based features and status, and no
 * set_stats.  The macro invocation line and the class name string are not
 * visible in this excerpt. */
2450 const struct netdev_class netdev_tap_class =
2453 netdev_linux_create_tap,
2454 netdev_tap_get_stats,
2455 NULL, /* set_stats */
2456 netdev_linux_get_features,
2457 netdev_linux_get_status);
/* Class table for internal devices: stats are read and written through the
 * vport layer (netdev_internal_get_stats / netdev_internal_set_stats), there
 * is no get_features, and status comes from netdev_internal_get_status.  The
 * macro invocation line and the class name string are not visible in this
 * excerpt. */
2459 const struct netdev_class netdev_internal_class =
2462 netdev_linux_create,
2463 netdev_internal_get_stats,
2464 netdev_internal_set_stats,
2465 NULL, /* get_features */
2466 netdev_internal_get_status);
/* Receive-side operations table, listing the destroy, recv, wait and drain
 * implementations in that order. */
2468 static const struct netdev_rx_class netdev_rx_linux_class = {
2469 netdev_rx_linux_destroy,
2470 netdev_rx_linux_recv,
2471 netdev_rx_linux_wait,
2472 netdev_rx_linux_drain,
2475 /* HTB traffic control class. */
/* Queue-count constant for HTB.  Its use is not visible in this excerpt;
 * presumably it is the 'n_queues' value of the HTB tc_ops -- confirm. */
2477 #define HTB_N_QUEUES 0xf000
/* Member of the HTB qdisc state, presumably 'struct htb', whose opening line
 * is not visible in this excerpt; htb_install__() fills it in. */
2481 unsigned int max_rate; /* In bytes/s. */
/* Members describing one HTB class (the struct's opening line is not visible
 * in this excerpt).  htb_setup_class__() reads the four rate/burst/priority
 * fields from a 'struct htb_class'. */
2485 struct tc_queue tc_queue;
2486 unsigned int min_rate; /* In bytes/s. */
2487 unsigned int max_rate; /* In bytes/s. */
2488 unsigned int burst; /* In bytes. */
2489 unsigned int priority; /* Lower values are higher priorities. */
/* Returns the 'struct htb' that embeds netdev->tc as its 'tc' member.  Only
 * meaningful while the device's current tc really is an HTB instance; no
 * check is made here. */
2493 htb_get__(const struct netdev *netdev_)
2495 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2496 return CONTAINER_OF(netdev->tc, struct htb, tc);
/* Allocates a new HTB state object, initializes its embedded tc with
 * tc_ops_htb, records 'max_rate' and makes it the device's current tc.
 * NOTE(review): 'max_rate' is a uint64_t here, while the 'max_rate' field
 * visible earlier in this file is an unsigned int; if that is the field
 * assigned below, larger values would be truncated -- confirm. */
2500 htb_install__(struct netdev *netdev_, uint64_t max_rate)
2502 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2505 htb = xmalloc(sizeof *htb);
2506 tc_init(&htb->tc, &tc_ops_htb);
2507 htb->max_rate = max_rate;
2509 netdev->tc = &htb->tc;
2512 /* Create an HTB qdisc.
2514 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
/* Any existing qdisc is deleted first, and the result of that deletion is
 * ignored.  The request is an RTM_NEWQDISC with NLM_F_EXCL | NLM_F_CREATE at
 * the root with handle 1:0, kind "htb", and a TCA_HTB_INIT option block nested
 * in TCA_OPTIONS.  Returns the result of tc_transact(). */
2516 htb_setup_qdisc__(struct netdev *netdev)
2519 struct tc_htb_glob opt;
2520 struct ofpbuf request;
2521 struct tcmsg *tcmsg;
2523 tc_del_qdisc(netdev);
2525 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2526 NLM_F_EXCL | NLM_F_CREATE, &request);
2530 tcmsg->tcm_handle = tc_make_handle(1, 0);
2531 tcmsg->tcm_parent = TC_H_ROOT;
2533 nl_msg_put_string(&request, TCA_KIND, "htb");
2535 memset(&opt, 0, sizeof opt);
2536 opt.rate2quantum = 10;
2540 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2541 nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
2542 nl_msg_end_nested(&request, opt_offset);
2544 return tc_transact(&request, NULL);
2547 /* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
2548 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
/* The device MTU is needed to compute the rate tables and buffer sizes, so a
 * device without an MTU is rejected with a warning.  The request is an
 * RTM_NEWTCLASS with NLM_F_CREATE, carrying TCA_HTB_PARMS plus the rate and
 * ceil tables nested in TCA_OPTIONS.  A failed transaction is logged with the
 * class parameters. */
2550 htb_setup_class__(struct netdev *netdev, unsigned int handle,
2551 unsigned int parent, struct htb_class *class)
2554 struct tc_htb_opt opt;
2555 struct ofpbuf request;
2556 struct tcmsg *tcmsg;
2560 error = netdev_get_mtu(netdev, &mtu);
2562 VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU",
2563 netdev_get_name(netdev));
2567 memset(&opt, 0, sizeof opt);
2568 tc_fill_rate(&opt.rate, class->min_rate, mtu);
2569 tc_fill_rate(&opt.ceil, class->max_rate, mtu);
2570 opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
2571 opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
2572 opt.prio = class->priority;
2574 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
2578 tcmsg->tcm_handle = handle;
2579 tcmsg->tcm_parent = parent;
2581 nl_msg_put_string(&request, TCA_KIND, "htb");
2582 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2583 nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
2584 tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
2585 tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
2586 nl_msg_end_nested(&request, opt_offset);
2588 error = tc_transact(&request, NULL);
2590 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
2591 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
2592 netdev_get_name(netdev),
2593 tc_get_major(handle), tc_get_minor(handle),
2594 tc_get_major(parent), tc_get_minor(parent),
2595 class->min_rate, class->max_rate,
2596 class->burst, class->priority, ovs_strerror(error));
2601 /* Parses Netlink attributes in 'options' for HTB parameters and stores a
2602 * description of them into 'details'. The description complies with the
2603 * specification given in the vswitch database documentation for linux-htb
2606 htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
2608 static const struct nl_policy tca_htb_policy[] = {
2609 [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
2610 .min_len = sizeof(struct tc_htb_opt) },
2613 struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
2614 const struct tc_htb_opt *htb;
2616 if (!nl_parse_nested(nl_options, tca_htb_policy,
2617 attrs, ARRAY_SIZE(tca_htb_policy))) {
2618 VLOG_WARN_RL(&rl, "failed to parse HTB class options");
2622 htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
2623 class->min_rate = htb->rate.rate;
2624 class->max_rate = htb->ceil.rate;
2625 class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
2626 class->priority = htb->prio;
2631 htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2632 struct htb_class *options,
2633 struct netdev_queue_stats *stats)
2635 struct nlattr *nl_options;
2636 unsigned int handle;
2639 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2640 if (!error && queue_id) {
2641 unsigned int major = tc_get_major(handle);
2642 unsigned int minor = tc_get_minor(handle);
2643 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2644 *queue_id = minor - 1;
2649 if (!error && options) {
2650 error = htb_parse_tca_options__(nl_options, options);
2656 htb_parse_qdisc_details__(struct netdev *netdev,
2657 const struct smap *details, struct htb_class *hc)
2659 const char *max_rate_s;
2661 max_rate_s = smap_get(details, "max-rate");
2662 hc->max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
2663 if (!hc->max_rate) {
2664 enum netdev_features current;
2666 netdev_get_features(netdev, ¤t, NULL, NULL, NULL);
2667 hc->max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
2669 hc->min_rate = hc->max_rate;
2675 htb_parse_class_details__(struct netdev *netdev,
2676 const struct smap *details, struct htb_class *hc)
2678 const struct htb *htb = htb_get__(netdev);
2679 const char *min_rate_s = smap_get(details, "min-rate");
2680 const char *max_rate_s = smap_get(details, "max-rate");
2681 const char *burst_s = smap_get(details, "burst");
2682 const char *priority_s = smap_get(details, "priority");
2685 error = netdev_get_mtu(netdev, &mtu);
2687 VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU",
2688 netdev_get_name(netdev));
2692 /* HTB requires at least an mtu sized min-rate to send any traffic even
2693 * on uncongested links. */
2694 hc->min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
2695 hc->min_rate = MAX(hc->min_rate, mtu);
2696 hc->min_rate = MIN(hc->min_rate, htb->max_rate);
2699 hc->max_rate = (max_rate_s
2700 ? strtoull(max_rate_s, NULL, 10) / 8
2702 hc->max_rate = MAX(hc->max_rate, hc->min_rate);
2703 hc->max_rate = MIN(hc->max_rate, htb->max_rate);
2707 * According to hints in the documentation that I've read, it is important
2708 * that 'burst' be at least as big as the largest frame that might be
2709 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
2710 * but having it a bit too small is a problem. Since netdev_get_mtu()
2711 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
2712 * the MTU. We actually add 64, instead of 14, as a guard against
2713 * additional headers get tacked on somewhere that we're not aware of. */
2714 hc->burst = burst_s ? strtoull(burst_s, NULL, 10) / 8 : 0;
2715 hc->burst = MAX(hc->burst, mtu + 64);
2718 hc->priority = priority_s ? strtoul(priority_s, NULL, 10) : 0;
/* Asks the kernel about class 'handle' (child of 'parent') on 'netdev' and
 * parses the reply into 'options' and 'stats' via htb_parse_tcmsg__().
 *
 * Returns 0 if successful, otherwise the error from the query or the parse.
 * The reply buffer is released before returning. */
static int
htb_query_class__(const struct netdev *netdev, unsigned int handle,
                  unsigned int parent, struct htb_class *options,
                  struct netdev_queue_stats *stats)
{
    struct ofpbuf *reply;
    int error;

    error = tc_query_class(netdev, handle, parent, &reply);
    if (error) {
        return error;
    }

    error = htb_parse_tcmsg__(reply, NULL, options, stats);
    ofpbuf_delete(reply);
    return error;
}
2740 htb_tc_install(struct netdev *netdev, const struct smap *details)
2744 error = htb_setup_qdisc__(netdev);
2746 struct htb_class hc;
2748 htb_parse_qdisc_details__(netdev, details, &hc);
2749 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2750 tc_make_handle(1, 0), &hc);
2752 htb_install__(netdev, hc.max_rate);
2758 static struct htb_class *
2759 htb_class_cast__(const struct tc_queue *queue)
2761 return CONTAINER_OF(queue, struct htb_class, tc_queue);
2765 htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
2766 const struct htb_class *hc)
2768 struct htb *htb = htb_get__(netdev);
2769 size_t hash = hash_int(queue_id, 0);
2770 struct tc_queue *queue;
2771 struct htb_class *hcp;
2773 queue = tc_find_queue__(netdev, queue_id, hash);
2775 hcp = htb_class_cast__(queue);
2777 hcp = xmalloc(sizeof *hcp);
2778 queue = &hcp->tc_queue;
2779 queue->queue_id = queue_id;
2780 queue->created = time_msec();
2781 hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
2784 hcp->min_rate = hc->min_rate;
2785 hcp->max_rate = hc->max_rate;
2786 hcp->burst = hc->burst;
2787 hcp->priority = hc->priority;
2791 htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
2794 struct nl_dump dump;
2795 struct htb_class hc;
2797 /* Get qdisc options. */
2799 htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
2800 htb_install__(netdev, hc.max_rate);
2803 if (!start_queue_dump(netdev, &dump)) {
2806 while (nl_dump_next(&dump, &msg)) {
2807 unsigned int queue_id;
2809 if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
2810 htb_update_queue__(netdev, queue_id, &hc);
2813 nl_dump_done(&dump);
2819 htb_tc_destroy(struct tc *tc)
2821 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
2822 struct htb_class *hc, *next;
2824 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &htb->tc.queues) {
2825 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
2833 htb_qdisc_get(const struct netdev *netdev, struct smap *details)
2835 const struct htb *htb = htb_get__(netdev);
2836 smap_add_format(details, "max-rate", "%llu", 8ULL * htb->max_rate);
2841 htb_qdisc_set(struct netdev *netdev, const struct smap *details)
2843 struct htb_class hc;
2846 htb_parse_qdisc_details__(netdev, details, &hc);
2847 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2848 tc_make_handle(1, 0), &hc);
2850 htb_get__(netdev)->max_rate = hc.max_rate;
2856 htb_class_get(const struct netdev *netdev OVS_UNUSED,
2857 const struct tc_queue *queue, struct smap *details)
2859 const struct htb_class *hc = htb_class_cast__(queue);
2861 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
2862 if (hc->min_rate != hc->max_rate) {
2863 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
2865 smap_add_format(details, "burst", "%llu", 8ULL * hc->burst);
2867 smap_add_format(details, "priority", "%u", hc->priority);
2873 htb_class_set(struct netdev *netdev, unsigned int queue_id,
2874 const struct smap *details)
2876 struct htb_class hc;
2879 error = htb_parse_class_details__(netdev, details, &hc);
2884 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
2885 tc_make_handle(1, 0xfffe), &hc);
2890 htb_update_queue__(netdev, queue_id, &hc);
2895 htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
2897 struct htb_class *hc = htb_class_cast__(queue);
2898 struct htb *htb = htb_get__(netdev);
2901 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
2903 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
2910 htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
2911 struct netdev_queue_stats *stats)
2913 return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
2914 tc_make_handle(1, 0xfffe), NULL, stats);
2918 htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
2919 const struct ofpbuf *nlmsg,
2920 netdev_dump_queue_stats_cb *cb, void *aux)
2922 struct netdev_queue_stats stats;
2923 unsigned int handle, major, minor;
2926 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
2931 major = tc_get_major(handle);
2932 minor = tc_get_minor(handle);
2933 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2934 (*cb)(minor - 1, &stats, aux);
2939 static const struct tc_ops tc_ops_htb = {
2940 "htb", /* linux_name */
2941 "linux-htb", /* ovs_name */
2942 HTB_N_QUEUES, /* n_queues */
2951 htb_class_get_stats,
2952 htb_class_dump_stats
2955 /* "linux-hfsc" traffic control class. */
2957 #define HFSC_N_QUEUES 0xf000
/* One HFSC class (an OVS queue) together with its configured rates. */
struct hfsc_class {
    struct tc_queue tc_queue;   /* Base object; hfsc_class_cast__() maps it
                                 * back to the enclosing struct. */
    uint32_t min_rate;
    uint32_t max_rate;
};
2970 static struct hfsc *
2971 hfsc_get__(const struct netdev *netdev_)
2973 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2974 return CONTAINER_OF(netdev->tc, struct hfsc, tc);
2977 static struct hfsc_class *
2978 hfsc_class_cast__(const struct tc_queue *queue)
2980 return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
2984 hfsc_install__(struct netdev *netdev_, uint32_t max_rate)
2986 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2989 hfsc = xmalloc(sizeof *hfsc);
2990 tc_init(&hfsc->tc, &tc_ops_hfsc);
2991 hfsc->max_rate = max_rate;
2992 netdev->tc = &hfsc->tc;
2996 hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
2997 const struct hfsc_class *hc)
3001 struct hfsc_class *hcp;
3002 struct tc_queue *queue;
3004 hfsc = hfsc_get__(netdev);
3005 hash = hash_int(queue_id, 0);
3007 queue = tc_find_queue__(netdev, queue_id, hash);
3009 hcp = hfsc_class_cast__(queue);
3011 hcp = xmalloc(sizeof *hcp);
3012 queue = &hcp->tc_queue;
3013 queue->queue_id = queue_id;
3014 queue->created = time_msec();
3015 hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
3018 hcp->min_rate = hc->min_rate;
3019 hcp->max_rate = hc->max_rate;
3023 hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
3025 const struct tc_service_curve *rsc, *fsc, *usc;
3026 static const struct nl_policy tca_hfsc_policy[] = {
3028 .type = NL_A_UNSPEC,
3030 .min_len = sizeof(struct tc_service_curve),
3033 .type = NL_A_UNSPEC,
3035 .min_len = sizeof(struct tc_service_curve),
3038 .type = NL_A_UNSPEC,
3040 .min_len = sizeof(struct tc_service_curve),
3043 struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];
3045 if (!nl_parse_nested(nl_options, tca_hfsc_policy,
3046 attrs, ARRAY_SIZE(tca_hfsc_policy))) {
3047 VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
3051 rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
3052 fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
3053 usc = nl_attr_get(attrs[TCA_HFSC_USC]);
3055 if (rsc->m1 != 0 || rsc->d != 0 ||
3056 fsc->m1 != 0 || fsc->d != 0 ||
3057 usc->m1 != 0 || usc->d != 0) {
3058 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3059 "Non-linear service curves are not supported.");
3063 if (rsc->m2 != fsc->m2) {
3064 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3065 "Real-time service curves are not supported ");
3069 if (rsc->m2 > usc->m2) {
3070 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3071 "Min-rate service curve is greater than "
3072 "the max-rate service curve.");
3076 class->min_rate = fsc->m2;
3077 class->max_rate = usc->m2;
3082 hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
3083 struct hfsc_class *options,
3084 struct netdev_queue_stats *stats)
3087 unsigned int handle;
3088 struct nlattr *nl_options;
3090 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
3096 unsigned int major, minor;
3098 major = tc_get_major(handle);
3099 minor = tc_get_minor(handle);
3100 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3101 *queue_id = minor - 1;
3108 error = hfsc_parse_tca_options__(nl_options, options);
/* Asks the kernel about class 'handle' (child of 'parent') on 'netdev' and
 * parses the reply into 'options' and 'stats' via hfsc_parse_tcmsg__().
 *
 * Returns 0 if successful, otherwise the error from the query or the parse.
 * The reply buffer is released before returning. */
static int
hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
                   unsigned int parent, struct hfsc_class *options,
                   struct netdev_queue_stats *stats)
{
    struct ofpbuf *reply;
    int error = tc_query_class(netdev, handle, parent, &reply);

    if (!error) {
        error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
        ofpbuf_delete(reply);
    }
    return error;
}
3133 hfsc_parse_qdisc_details__(struct netdev *netdev, const struct smap *details,
3134 struct hfsc_class *class)
3137 const char *max_rate_s;
3139 max_rate_s = smap_get(details, "max-rate");
3140 max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
3143 enum netdev_features current;
3145 netdev_get_features(netdev, ¤t, NULL, NULL, NULL);
3146 max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
3149 class->min_rate = max_rate;
3150 class->max_rate = max_rate;
3154 hfsc_parse_class_details__(struct netdev *netdev,
3155 const struct smap *details,
3156 struct hfsc_class * class)
3158 const struct hfsc *hfsc;
3159 uint32_t min_rate, max_rate;
3160 const char *min_rate_s, *max_rate_s;
3162 hfsc = hfsc_get__(netdev);
3163 min_rate_s = smap_get(details, "min-rate");
3164 max_rate_s = smap_get(details, "max-rate");
3166 min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
3167 min_rate = MAX(min_rate, 1);
3168 min_rate = MIN(min_rate, hfsc->max_rate);
3170 max_rate = (max_rate_s
3171 ? strtoull(max_rate_s, NULL, 10) / 8
3173 max_rate = MAX(max_rate, min_rate);
3174 max_rate = MIN(max_rate, hfsc->max_rate);
3176 class->min_rate = min_rate;
3177 class->max_rate = max_rate;
3182 /* Create an HFSC qdisc.
3184 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
3186 hfsc_setup_qdisc__(struct netdev * netdev)
3188 struct tcmsg *tcmsg;
3189 struct ofpbuf request;
3190 struct tc_hfsc_qopt opt;
3192 tc_del_qdisc(netdev);
3194 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3195 NLM_F_EXCL | NLM_F_CREATE, &request);
3201 tcmsg->tcm_handle = tc_make_handle(1, 0);
3202 tcmsg->tcm_parent = TC_H_ROOT;
3204 memset(&opt, 0, sizeof opt);
3207 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3208 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
3210 return tc_transact(&request, NULL);
3213 /* Create an HFSC class.
3215 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
3216 * sc rate <min_rate> ul rate <max_rate>" */
3218 hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
3219 unsigned int parent, struct hfsc_class *class)
3223 struct tcmsg *tcmsg;
3224 struct ofpbuf request;
3225 struct tc_service_curve min, max;
3227 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
3233 tcmsg->tcm_handle = handle;
3234 tcmsg->tcm_parent = parent;
3238 min.m2 = class->min_rate;
3242 max.m2 = class->max_rate;
3244 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3245 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3246 nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
3247 nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
3248 nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
3249 nl_msg_end_nested(&request, opt_offset);
3251 error = tc_transact(&request, NULL);
3253 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
3254 "min-rate %ubps, max-rate %ubps (%s)",
3255 netdev_get_name(netdev),
3256 tc_get_major(handle), tc_get_minor(handle),
3257 tc_get_major(parent), tc_get_minor(parent),
3258 class->min_rate, class->max_rate, ovs_strerror(error));
3265 hfsc_tc_install(struct netdev *netdev, const struct smap *details)
3268 struct hfsc_class class;
3270 error = hfsc_setup_qdisc__(netdev);
3276 hfsc_parse_qdisc_details__(netdev, details, &class);
3277 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3278 tc_make_handle(1, 0), &class);
3284 hfsc_install__(netdev, class.max_rate);
3289 hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3292 struct nl_dump dump;
3293 struct hfsc_class hc;
3296 hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
3297 hfsc_install__(netdev, hc.max_rate);
3299 if (!start_queue_dump(netdev, &dump)) {
3303 while (nl_dump_next(&dump, &msg)) {
3304 unsigned int queue_id;
3306 if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3307 hfsc_update_queue__(netdev, queue_id, &hc);
3311 nl_dump_done(&dump);
3316 hfsc_tc_destroy(struct tc *tc)
3319 struct hfsc_class *hc, *next;
3321 hfsc = CONTAINER_OF(tc, struct hfsc, tc);
3323 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
3324 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
3333 hfsc_qdisc_get(const struct netdev *netdev, struct smap *details)
3335 const struct hfsc *hfsc;
3336 hfsc = hfsc_get__(netdev);
3337 smap_add_format(details, "max-rate", "%llu", 8ULL * hfsc->max_rate);
3342 hfsc_qdisc_set(struct netdev *netdev, const struct smap *details)
3345 struct hfsc_class class;
3347 hfsc_parse_qdisc_details__(netdev, details, &class);
3348 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3349 tc_make_handle(1, 0), &class);
3352 hfsc_get__(netdev)->max_rate = class.max_rate;
3359 hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
3360 const struct tc_queue *queue, struct smap *details)
3362 const struct hfsc_class *hc;
3364 hc = hfsc_class_cast__(queue);
3365 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
3366 if (hc->min_rate != hc->max_rate) {
3367 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
3373 hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
3374 const struct smap *details)
3377 struct hfsc_class class;
3379 error = hfsc_parse_class_details__(netdev, details, &class);
3384 error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3385 tc_make_handle(1, 0xfffe), &class);
3390 hfsc_update_queue__(netdev, queue_id, &class);
3395 hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
3399 struct hfsc_class *hc;
3401 hc = hfsc_class_cast__(queue);
3402 hfsc = hfsc_get__(netdev);
3404 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3406 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
3413 hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3414 struct netdev_queue_stats *stats)
3416 return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3417 tc_make_handle(1, 0xfffe), NULL, stats);
3421 hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3422 const struct ofpbuf *nlmsg,
3423 netdev_dump_queue_stats_cb *cb, void *aux)
3425 struct netdev_queue_stats stats;
3426 unsigned int handle, major, minor;
3429 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3434 major = tc_get_major(handle);
3435 minor = tc_get_minor(handle);
3436 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3437 (*cb)(minor - 1, &stats, aux);
3442 static const struct tc_ops tc_ops_hfsc = {
3443 "hfsc", /* linux_name */
3444 "linux-hfsc", /* ovs_name */
3445 HFSC_N_QUEUES, /* n_queues */
3446 hfsc_tc_install, /* tc_install */
3447 hfsc_tc_load, /* tc_load */
3448 hfsc_tc_destroy, /* tc_destroy */
3449 hfsc_qdisc_get, /* qdisc_get */
3450 hfsc_qdisc_set, /* qdisc_set */
3451 hfsc_class_get, /* class_get */
3452 hfsc_class_set, /* class_set */
3453 hfsc_class_delete, /* class_delete */
3454 hfsc_class_get_stats, /* class_get_stats */
3455 hfsc_class_dump_stats /* class_dump_stats */
3458 /* "linux-default" traffic control class.
3460 * This class represents the default, unnamed Linux qdisc. It corresponds to
3461 * the "" (empty string) QoS type in the OVS database. */
3464 default_install__(struct netdev *netdev_)
3466 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3467 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_default);
3469 /* Nothing but a tc class implementation is allowed to write to a tc. This
3470 * class never does that, so we can legitimately use a const tc object. */
3471 netdev->tc = CONST_CAST(struct tc *, &tc);
3475 default_tc_install(struct netdev *netdev,
3476 const struct smap *details OVS_UNUSED)
3478 default_install__(netdev);
3483 default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3485 default_install__(netdev);
3489 static const struct tc_ops tc_ops_default = {
3490 NULL, /* linux_name */
3495 NULL, /* tc_destroy */
3496 NULL, /* qdisc_get */
3497 NULL, /* qdisc_set */
3498 NULL, /* class_get */
3499 NULL, /* class_set */
3500 NULL, /* class_delete */
3501 NULL, /* class_get_stats */
3502 NULL /* class_dump_stats */
3505 /* "linux-other" traffic control class.
3510 other_tc_load(struct netdev *netdev_, struct ofpbuf *nlmsg OVS_UNUSED)
3512 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3513 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_other);
3515 /* Nothing but a tc class implementation is allowed to write to a tc. This
3516 * class never does that, so we can legitimately use a const tc object. */
3517 netdev->tc = CONST_CAST(struct tc *, &tc);
3521 static const struct tc_ops tc_ops_other = {
3522 NULL, /* linux_name */
3523 "linux-other", /* ovs_name */
3525 NULL, /* tc_install */
3527 NULL, /* tc_destroy */
3528 NULL, /* qdisc_get */
3529 NULL, /* qdisc_set */
3530 NULL, /* class_get */
3531 NULL, /* class_set */
3532 NULL, /* class_delete */
3533 NULL, /* class_get_stats */
3534 NULL /* class_dump_stats */
3537 /* Traffic control. */
/* Number of kernel "tc" ticks per second.  Assigned by read_psched(), so
 * users call read_psched() before reading it. */
static double ticks_per_s;
/* Number of kernel "jiffies" per second.  This is used for the purpose of
 * computing buffer sizes.  Generally kernel qdiscs need to be able to buffer
 * one jiffy's worth of data.
 *
 * There are two possibilities here:
 *
 *    - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
 *      approximate range of 100 to 1024.  That means that we really need to
 *      make sure that the qdisc can buffer that much data.
 *
 *    - 'buffer_hz' is an absurdly large number.  That means that the kernel
 *      has finely granular timers and there's no need to fudge additional room
 *      for buffers.  (There's no extra effort needed to implement that: the
 *      large 'buffer_hz' is used as a divisor, so practically any number will
 *      come out as 0 in the division.  Small integer results in the case of
 *      really high dividends won't have any real effect anyhow.)
 *
 * Assigned by read_psched() and used as the divisor in
 * tc_buffer_per_jiffy().
 */
static unsigned int buffer_hz;
/* Returns tc handle 'major':'minor': 'major' goes into the upper 16 bits and
 * 'minor' into the lower 16 bits.  Bits that do not fit are masked off by
 * TC_H_MAKE. */
static unsigned int
tc_make_handle(unsigned int major, unsigned int minor)
{
    unsigned int major_bits = major << 16;

    return TC_H_MAKE(major_bits, minor);
}
/* Returns the major number from 'handle', that is, its upper 16 bits. */
static unsigned int
tc_get_major(unsigned int handle)
{
    unsigned int major_bits = TC_H_MAJ(handle);

    return major_bits >> 16;
}
/* Returns the minor number from 'handle', that is, its lower 16 bits. */
static unsigned int
tc_get_minor(unsigned int handle)
{
    unsigned int minor_bits = TC_H_MIN(handle);

    return minor_bits;
}
3582 static struct tcmsg *
3583 tc_make_request(const struct netdev *netdev, int type, unsigned int flags,
3584 struct ofpbuf *request)
3586 struct tcmsg *tcmsg;
3590 error = get_ifindex(netdev, &ifindex);
3595 ofpbuf_init(request, 512);
3596 nl_msg_put_nlmsghdr(request, sizeof *tcmsg, type, NLM_F_REQUEST | flags);
3597 tcmsg = ofpbuf_put_zeros(request, sizeof *tcmsg);
3598 tcmsg->tcm_family = AF_UNSPEC;
3599 tcmsg->tcm_ifindex = ifindex;
3600 /* Caller should fill in tcmsg->tcm_handle. */
3601 /* Caller should fill in tcmsg->tcm_parent. */
3607 tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
3609 int error = nl_transact(NETLINK_ROUTE, request, replyp);
3610 ofpbuf_uninit(request);
3614 /* Adds or deletes a root ingress qdisc on 'netdev'. We use this for
3615 * policing configuration.
3617 * This function is equivalent to running the following when 'add' is true:
3618 * /sbin/tc qdisc add dev <devname> handle ffff: ingress
3620 * This function is equivalent to running the following when 'add' is false:
3621 * /sbin/tc qdisc del dev <devname> handle ffff: ingress
3623 * The configuration and stats may be seen with the following command:
3624 * /sbin/tc -s qdisc show dev <devname>
3626 * Returns 0 if successful, otherwise a positive errno value.
3629 tc_add_del_ingress_qdisc(struct netdev *netdev, bool add)
3631 struct ofpbuf request;
3632 struct tcmsg *tcmsg;
3634 int type = add ? RTM_NEWQDISC : RTM_DELQDISC;
3635 int flags = add ? NLM_F_EXCL | NLM_F_CREATE : 0;
3637 tcmsg = tc_make_request(netdev, type, flags, &request);
3641 tcmsg->tcm_handle = tc_make_handle(0xffff, 0);
3642 tcmsg->tcm_parent = TC_H_INGRESS;
3643 nl_msg_put_string(&request, TCA_KIND, "ingress");
3644 nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0);
3646 error = tc_transact(&request, NULL);
3648 /* If we're deleting the qdisc, don't worry about some of the
3649 * error conditions. */
3650 if (!add && (error == ENOENT || error == EINVAL)) {
3659 /* Adds a policer to 'netdev' with a rate of 'kbits_rate' and a burst size
3662 * This function is equivalent to running:
3663 * /sbin/tc filter add dev <devname> parent ffff: protocol all prio 49
3664 * basic police rate <kbits_rate>kbit burst <kbits_burst>k
3667 * The configuration and stats may be seen with the following command:
3668 * /sbin/tc -s filter show <devname> eth0 parent ffff:
3670 * Returns 0 if successful, otherwise a positive errno value.
3673 tc_add_policer(struct netdev *netdev, int kbits_rate, int kbits_burst)
3675 struct tc_police tc_police;
3676 struct ofpbuf request;
3677 struct tcmsg *tcmsg;
3678 size_t basic_offset;
3679 size_t police_offset;
3683 memset(&tc_police, 0, sizeof tc_police);
3684 tc_police.action = TC_POLICE_SHOT;
3685 tc_police.mtu = mtu;
3686 tc_fill_rate(&tc_police.rate, (kbits_rate * 1000)/8, mtu);
3687 tc_police.burst = tc_bytes_to_ticks(tc_police.rate.rate,
3688 kbits_burst * 1024);
3690 tcmsg = tc_make_request(netdev, RTM_NEWTFILTER,
3691 NLM_F_EXCL | NLM_F_CREATE, &request);
3695 tcmsg->tcm_parent = tc_make_handle(0xffff, 0);
3696 tcmsg->tcm_info = tc_make_handle(49,
3697 (OVS_FORCE uint16_t) htons(ETH_P_ALL));
3699 nl_msg_put_string(&request, TCA_KIND, "basic");
3700 basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3701 police_offset = nl_msg_start_nested(&request, TCA_BASIC_POLICE);
3702 nl_msg_put_unspec(&request, TCA_POLICE_TBF, &tc_police, sizeof tc_police);
3703 tc_put_rtab(&request, TCA_POLICE_RATE, &tc_police.rate);
3704 nl_msg_end_nested(&request, police_offset);
3705 nl_msg_end_nested(&request, basic_offset);
3707 error = tc_transact(&request, NULL);
3718 /* The values in psched are not individually very meaningful, but they are
3719 * important. The tables below show some values seen in the wild.
3723 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
3724 * (Before that, there are hints that it was 1000000000.)
3726 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
3730 * -----------------------------------
3731 * [1] 000c8000 000f4240 000f4240 00000064
3732 * [2] 000003e8 00000400 000f4240 3b9aca00
3733 * [3] 000003e8 00000400 000f4240 3b9aca00
3734 * [4] 000003e8 00000400 000f4240 00000064
3735 * [5] 000003e8 00000040 000f4240 3b9aca00
3736 * [6] 000003e8 00000040 000f4240 000000f9
3738 * a b c d ticks_per_s buffer_hz
3739 * ------- --------- ---------- ------------- ----------- -------------
3740 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
3741 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3742 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3743 * [4] 1,000 1,024 1,000,000 100 976,562 100
3744 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
3745 * [6] 1,000 64 1,000,000 249 15,625,000 249
3747 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
3748 * [2] 2.6.26-1-686-bigmem from Debian lenny
3749 * [3] 2.6.26-2-sparc64 from Debian lenny
3750 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
3751 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
3752 * [6] 2.6.34 from kernel.org on KVM
3754 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
3755 static const char fn[] = "/proc/net/psched";
3756 unsigned int a, b, c, d;
3759 if (!ovsthread_once_start(&once)) {
3766 stream = fopen(fn, "r");
3768 VLOG_WARN("%s: open failed: %s", fn, ovs_strerror(errno));
3772 if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
3773 VLOG_WARN("%s: read failed", fn);
3777 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
3781 VLOG_WARN("%s: invalid scheduler parameters", fn);
3785 ticks_per_s = (double) a * c / b;
3789 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
3792 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);
3795 ovsthread_once_done(&once);
3798 /* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
3799 * rate of 'rate' bytes per second. */
3801 tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
3804 return (rate * ticks) / ticks_per_s;
3807 /* Returns the number of ticks that it would take to transmit 'size' bytes at a
3808 * rate of 'rate' bytes per second. */
3810 tc_bytes_to_ticks(unsigned int rate, unsigned int size)
3813 return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
3816 /* Returns the number of bytes that need to be reserved for qdisc buffering at
3817 * a transmission rate of 'rate' bytes per second. */
3819 tc_buffer_per_jiffy(unsigned int rate)
3822 return rate / buffer_hz;
3825 /* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
3826 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
3827 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
3828 * stores NULL into it if it is absent.
3830 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
3833 * Returns 0 if successful, otherwise a positive errno value. */
3835 tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
3836 struct nlattr **options)
3838 static const struct nl_policy tca_policy[] = {
3839 [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
3840 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
3842 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3844 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3845 tca_policy, ta, ARRAY_SIZE(ta))) {
3846 VLOG_WARN_RL(&rl, "failed to parse qdisc message");
3851 *kind = nl_attr_get_string(ta[TCA_KIND]);
3855 *options = ta[TCA_OPTIONS];
3870 /* Given Netlink 'msg' that describes a class, extracts the class's handle
3871 * (the 'tcm_handle' field of its struct tcmsg) into '*handlep', its
3872 * TCA_OPTIONS attribute into '*options', and its queue statistics into
3873 * '*stats'. Any of the output arguments may be null.
3875 * Returns 0 if successful, otherwise a positive errno value. */
3877 tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
3878 struct nlattr **options, struct netdev_queue_stats *stats)
/* Both TCA_OPTIONS and TCA_STATS2 are declared mandatory, so a class message
 * lacking either one fails to parse. */
3880 static const struct nl_policy tca_policy[] = {
3881 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
3882 [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
3884 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3886 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3887 tca_policy, ta, ARRAY_SIZE(ta))) {
3888 VLOG_WARN_RL(&rl, "failed to parse class message");
/* The struct tcmsg is read from the offset directly after the Netlink header;
 * its handle is stored unmodified (no major/minor split is applied here). */
3893 struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
3894 *handlep = tc->tcm_handle;
3898 *options = ta[TCA_OPTIONS];
3902 const struct gnet_stats_queue *gsq;
3903 struct gnet_stats_basic gsb;
/* TCA_STATS2 is itself a nest.  Both the basic and the queue statistics are
 * mandatory, each with a minimum length equal to the size of the struct that
 * is read from it below. */
3905 static const struct nl_policy stats_policy[] = {
3906 [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
3907 .min_len = sizeof gsb },
3908 [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
3909 .min_len = sizeof *gsq },
3911 struct nlattr *sa[ARRAY_SIZE(stats_policy)];
3913 if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
3914 sa, ARRAY_SIZE(sa))) {
3915 VLOG_WARN_RL(&rl, "failed to parse class stats");
3919 /* Alignment issues screw up the length of struct gnet_stats_basic on
3920 * some arch/bitsize combinations. Newer versions of Linux have a
3921 * struct gnet_stats_basic_packed, but we can't depend on that. The
3922 * easiest thing to do is just to make a copy. */
/* The copy is zero-filled first and limited to the smaller of the attribute
 * payload and sizeof gsb, so it cannot overrun 'gsb'. */
3923 memset(&gsb, 0, sizeof gsb);
3924 memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
3925 MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
3926 stats->tx_bytes = gsb.bytes;
3927 stats->tx_packets = gsb.packets;
3929 gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
/* Packets dropped by the queue are reported as transmit errors. */
3930 stats->tx_errors = gsq->drops;
/* NOTE(review): the lines leading up to this memset are missing from this
 * excerpt; presumably it clears '*stats' on the failure path -- confirm. */
3940 memset(stats, 0, sizeof *stats);
3945 /* Queries the kernel for the class with identifier 'handle' and parent
 * 'parent' on 'netdev'. */
3948 tc_query_class(const struct netdev *netdev,
3949 unsigned int handle, unsigned int parent,
3950 struct ofpbuf **replyp)
3952 struct ofpbuf request;
3953 struct tcmsg *tcmsg;
/* Start an RTM_GETTCLASS request for 'netdev' with the NLM_F_ECHO flag. */
3956 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO, &request);
/* The class is identified by its own handle plus its parent's handle. */
3960 tcmsg->tcm_handle = handle;
3961 tcmsg->tcm_parent = parent;
/* Send the request; 'replyp' is passed through to tc_transact() to receive
 * the reply. */
3963 error = tc_transact(&request, replyp);
/* Handles are logged split into their major:minor parts.
 * NOTE(review): the condition guarding this warning is missing from this
 * excerpt; presumably it is logged only when 'error' is nonzero. */
3965 VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
3966 netdev_get_name(netdev),
3967 tc_get_major(handle), tc_get_minor(handle),
3968 tc_get_major(parent), tc_get_minor(parent),
3969 ovs_strerror(error));
3974 /* Equivalent to "tc class del dev <name> handle <handle>".
 *
 * Sends an RTM_DELTCLASS request for class 'handle' on 'netdev' and does not
 * ask for the reply.  A failure is logged with the class handle split into
 * its major:minor parts. */
3976 tc_delete_class(const struct netdev *netdev, unsigned int handle)
3978 struct ofpbuf request;
3979 struct tcmsg *tcmsg;
/* No extra Netlink flags are requested for the delete. */
3982 tcmsg = tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
/* Only the class's own handle is specified; the parent is left as 0. */
3986 tcmsg->tcm_handle = handle;
3987 tcmsg->tcm_parent = 0;
/* NULL reply pointer: the caller has no use for the kernel's response. */
3989 error = tc_transact(&request, NULL);
/* NOTE(review): the condition guarding this warning is missing from this
 * excerpt; presumably it is logged only when 'error' is nonzero. */
3991 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
3992 netdev_get_name(netdev),
3993 tc_get_major(handle), tc_get_minor(handle),
3994 ovs_strerror(error));
3999 /* Equivalent to "tc qdisc del dev <name> root".
 *
 * Besides asking the kernel to delete the qdisc, a successful call also
 * destroys the traffic-control state cached in the netdev ('netdev->tc')
 * through that state's optional 'tc_destroy' hook. */
4001 tc_del_qdisc(struct netdev *netdev_)
4003 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4004 struct ofpbuf request;
4005 struct tcmsg *tcmsg;
4008 tcmsg = tc_make_request(netdev_, RTM_DELQDISC, 0, &request);
/* Target the qdisc with handle 1:0 whose parent is the root. */
4012 tcmsg->tcm_handle = tc_make_handle(1, 0);
4013 tcmsg->tcm_parent = TC_H_ROOT;
4015 error = tc_transact(&request, NULL);
4016 if (error == EINVAL) {
4017 /* EINVAL probably means that the default qdisc was in use, in which
4018 * case we've accomplished our purpose. */
/* NOTE(review): the statement inside the EINVAL branch is missing from this
 * excerpt; given the comment above it presumably clears 'error'. */
/* Only tear down cached state when the deletion counted as a success and
 * there is cached state to tear down; the hook itself may be absent. */
4021 if (!error && netdev->tc) {
4022 if (netdev->tc->ops->tc_destroy) {
4023 netdev->tc->ops->tc_destroy(netdev->tc);
4030 /* If 'netdev''s qdisc type and parameters are not yet known, queries the
4031 * kernel to determine what they are. Returns 0 if successful, otherwise a
4032 * positive errno value. */
4034 tc_query_qdisc(const struct netdev *netdev_)
4036 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4037 struct ofpbuf request, *qdisc;
4038 const struct tc_ops *ops;
4039 struct tcmsg *tcmsg;
4047 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
4048 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
4049 * 2.6.35 without that fix backported to it.
4051 * To avoid the OOPS, we must not make a request that would attempt to dump
4052 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
4053 * few others. There are a few ways that I can see to do this, but most of
4054 * them seem to be racy (and if you lose the race the kernel OOPSes). The
4055 * technique chosen here is to assume that any non-default qdisc that we
4056 * create will have a class with handle 1:0. The built-in qdiscs only have
4057 * a class with handle 0:0.
4059 * We could check for Linux 2.6.35+ and use a more straightforward method
 * there. */
4061 tcmsg = tc_make_request(netdev_, RTM_GETQDISC, NLM_F_ECHO, &request);
4065 tcmsg->tcm_handle = tc_make_handle(1, 0);
4066 tcmsg->tcm_parent = 0;
4068 /* Figure out what tc class to instantiate. */
4069 error = tc_transact(&request, &qdisc);
4073 error = tc_parse_qdisc(qdisc, &kind, NULL);
4075 ops = &tc_ops_other;
4077 ops = tc_lookup_linux_name(kind);
4079 static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
4080 VLOG_INFO_RL(&rl2, "unknown qdisc \"%s\"", kind);
4082 ops = &tc_ops_other;
4085 } else if (error == ENOENT) {
4086 /* Either it's a built-in qdisc, or it's a qdisc set up by some
4087 * other entity that doesn't have a handle 1:0. We will assume
4088 * that it's the system default qdisc. */
4089 ops = &tc_ops_default;
4092 /* Who knows? Maybe the device got deleted. */
4093 VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
4094 netdev_get_name(netdev_), ovs_strerror(error));
4095 ops = &tc_ops_other;
4098 /* Instantiate it. */
4099 load_error = ops->tc_load(CONST_CAST(struct netdev *, netdev_), qdisc);
4100 ovs_assert((load_error == 0) == (netdev->tc != NULL));
4101 ofpbuf_delete(qdisc);
4103 return error ? error : load_error;
4106 /* Linux traffic control uses tables with 256 entries ("rtab" tables) to
4107 approximate the time to transmit packets of various lengths. For an MTU of
4108 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
4109 represents two possible packet lengths; for an MTU of 513 through 1024, four
4110 possible lengths; and so on.
4112 Returns, for the specified 'mtu', the number of bits that packet lengths
4113 need to be shifted right to fit within such a 256-entry table. */
4115 tc_calc_cell_log(unsigned int mtu)
/* NOTE(review): the condition guarding this default is missing from this
 * excerpt; presumably ETH_PAYLOAD_MAX is substituted only when the caller
 * did not supply a usable MTU -- confirm. */
4120 mtu = ETH_PAYLOAD_MAX;
/* Account for the Ethernet header and a VLAN tag on top of the payload. */
4122 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
/* Count iterations until 'mtu' drops below 256.  The loop body is missing
 * from this excerpt; presumably it halves 'mtu' each time -- confirm. */
4124 for (cell_log = 0; mtu >= 256; cell_log++) {
4131 /* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
 * of 'mtu'. */
4134 tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
/* Start from an all-zero ratespec so that every field not assigned below,
 * including the two commented-out ones, ends up as 0. */
4136 memset(rate, 0, sizeof *rate);
/* The table granularity depends only on the MTU; see tc_calc_cell_log(). */
4137 rate->cell_log = tc_calc_cell_log(mtu);
4138 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
4139 /* rate->cell_align = 0; */ /* distro headers. */
/* 'mpu' is set to ETH_TOTAL_MIN; tc_put_rtab() uses it as the smallest
 * packet size for which it computes a table entry. */
4140 rate->mpu = ETH_TOTAL_MIN;
/* NOTE(review): no use of 'Bps' is visible in this excerpt; presumably a
 * missing line stores it in 'rate->rate' -- confirm. */
4144 /* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
4145 * attribute of the specified "type".
4147 * See tc_calc_cell_log() above for a description of "rtab"s. */
4149 tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
/* Reserve TC_RTAB_SIZE bytes of uninitialized attribute payload in 'msg';
 * every entry is filled in by the loop below. */
4154 rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
4155 for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
/* Entry 'i' stands for packets of (i + 1) << cell_log bytes, raised to
 * 'rate->mpu' when that would be smaller. */
4156 unsigned packet_size = (i + 1) << rate->cell_log;
4157 if (packet_size < rate->mpu) {
4158 packet_size = rate->mpu;
/* Store the size converted to ticks at 'rate->rate' bytes per second. */
4160 rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
4164 /* Calculates the proper value of 'buffer' or 'cbuffer' in HTB options given a
4165 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
4166 * burst size of 'burst_bytes'. (If no value was requested, a 'burst_bytes' of
 * 0 is fine.) */
4169 tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
/* Lower bound on the burst: the per-jiffy buffer for this rate plus 'mtu'
 * bytes. */
4171 unsigned int min_burst = tc_buffer_per_jiffy(Bps) + mtu;
/* Use the larger of the requested burst and the lower bound, converted to
 * ticks at 'Bps'.
 * NOTE(review): 'burst_bytes' is 64 bits wide but tc_bytes_to_ticks() takes
 * an 'unsigned int' size, so a burst above UINT_MAX is narrowed on the way
 * in -- confirm that this is acceptable. */
4172 return tc_bytes_to_ticks(Bps, MAX(burst_bytes, min_burst));
4175 /* Linux-only functions declared in netdev-linux.h */
4177 /* Modifies the 'flag' bit in ethtool's flags field for 'netdev'. If
4178 * 'enable' is true, the bit is set. Otherwise, it is cleared.
 *
 * The change is done as read-modify-write (ETHTOOL_GFLAGS, ETHTOOL_SFLAGS)
 * followed by a second ETHTOOL_GFLAGS to check that the new value stuck.
 * 'flag_name' is used only in the warning logged when it did not. */
4180 netdev_linux_ethtool_set_flag(struct netdev *netdev, uint32_t flag,
4181 const char *flag_name, bool enable)
4183 const char *netdev_name = netdev_get_name(netdev);
4184 struct ethtool_value evalue;
/* Step 1: read the current flags into a zeroed ethtool_value. */
4188 COVERAGE_INC(netdev_get_ethtool);
4189 memset(&evalue, 0, sizeof evalue);
4190 error = netdev_linux_do_ethtool(netdev_name,
4191 (struct ethtool_cmd *)&evalue,
4192 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
/* Step 2: clear 'flag', set it again if 'enable', and write the result back.
 * 'new_flags' remembers what was requested for the check in step 3. */
4197 COVERAGE_INC(netdev_set_ethtool);
4198 evalue.data = new_flags = (evalue.data & ~flag) | (enable ? flag : 0);
4199 error = netdev_linux_do_ethtool(netdev_name,
4200 (struct ethtool_cmd *)&evalue,
4201 ETHTOOL_SFLAGS, "ETHTOOL_SFLAGS");
/* Step 3: read the flags again and compare with what was requested. */
4206 COVERAGE_INC(netdev_get_ethtool);
4207 memset(&evalue, 0, sizeof evalue);
4208 error = netdev_linux_do_ethtool(netdev_name,
4209 (struct ethtool_cmd *)&evalue,
4210 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
/* NOTE(review): the error checks after each ethtool call and the return
 * statements are missing from this excerpt. */
4215 if (new_flags != evalue.data) {
4216 VLOG_WARN_RL(&rl, "attempt to %s ethtool %s flag on network "
4217 "device %s failed", enable ? "enable" : "disable",
4218 flag_name, netdev_name);
4225 /* Utility functions. */
4227 /* Copies 'src' into 'dst', performing format conversion in the process.
 *
 * Every counter is copied to the member of the same name by plain
 * assignment, so the only conversion is the implicit one between the two
 * structs' member types (presumably narrower kernel counters widening to
 * the netdev_stats type -- TODO confirm against both struct definitions). */
4229 netdev_stats_from_rtnl_link_stats(struct netdev_stats *dst,
4230 const struct rtnl_link_stats *src)
/* Aggregate packet, byte, error and drop counters. */
4232 dst->rx_packets = src->rx_packets;
4233 dst->tx_packets = src->tx_packets;
4234 dst->rx_bytes = src->rx_bytes;
4235 dst->tx_bytes = src->tx_bytes;
4236 dst->rx_errors = src->rx_errors;
4237 dst->tx_errors = src->tx_errors;
4238 dst->rx_dropped = src->rx_dropped;
4239 dst->tx_dropped = src->tx_dropped;
4240 dst->multicast = src->multicast;
4241 dst->collisions = src->collisions;
/* Detailed receive error counters. */
4242 dst->rx_length_errors = src->rx_length_errors;
4243 dst->rx_over_errors = src->rx_over_errors;
4244 dst->rx_crc_errors = src->rx_crc_errors;
4245 dst->rx_frame_errors = src->rx_frame_errors;
4246 dst->rx_fifo_errors = src->rx_fifo_errors;
4247 dst->rx_missed_errors = src->rx_missed_errors;
/* Detailed transmit error counters. */
4248 dst->tx_aborted_errors = src->tx_aborted_errors;
4249 dst->tx_carrier_errors = src->tx_carrier_errors;
4250 dst->tx_fifo_errors = src->tx_fifo_errors;
4251 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
4252 dst->tx_window_errors = src->tx_window_errors;
/* Fetches the link statistics of the interface with index 'ifindex' by
 * sending an RTM_GETLINK request over NETLINK_ROUTE, and converts the
 * IFLA_STATS attribute of the reply into '*stats'.
 *
 * NOTE(review): the return statements are missing from this excerpt;
 * presumably 0 on success and a positive errno value otherwise. */
4256 get_stats_via_netlink(int ifindex, struct netdev_stats *stats)
4258 /* Policy for RTNLGRP_LINK messages.
4260 * There are *many* more fields in these messages, but currently we only
4261 * care about these fields. */
4262 static const struct nl_policy rtnlgrp_link_policy[] = {
4263 [IFLA_IFNAME] = { .type = NL_A_STRING, .optional = false },
4264 [IFLA_STATS] = { .type = NL_A_UNSPEC, .optional = true,
4265 .min_len = sizeof(struct rtnl_link_stats) },
4268 struct ofpbuf request;
4269 struct ofpbuf *reply;
4270 struct ifinfomsg *ifi;
4271 struct nlattr *attrs[ARRAY_SIZE(rtnlgrp_link_policy)];
/* Build the request: a Netlink header followed by a zeroed struct ifinfomsg
 * that selects the interface by index. */
4274 ofpbuf_init(&request, 0);
4275 nl_msg_put_nlmsghdr(&request, sizeof *ifi, RTM_GETLINK, NLM_F_REQUEST);
4276 ifi = ofpbuf_put_zeros(&request, sizeof *ifi);
4277 ifi->ifi_family = PF_UNSPEC;
4278 ifi->ifi_index = ifindex;
/* The request buffer is released right after the transaction, whatever its
 * outcome. */
4279 error = nl_transact(NETLINK_ROUTE, &request, &reply);
4280 ofpbuf_uninit(&request);
/* From here on 'reply' is deleted on every path visible in this excerpt. */
4285 if (!nl_policy_parse(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg),
4286 rtnlgrp_link_policy,
4287 attrs, ARRAY_SIZE(rtnlgrp_link_policy))) {
4288 ofpbuf_delete(reply);
/* IFLA_STATS is optional in the policy, so its absence has to be checked
 * separately. */
4292 if (!attrs[IFLA_STATS]) {
4293 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
4294 ofpbuf_delete(reply);
4298 netdev_stats_from_rtnl_link_stats(stats, nl_attr_get(attrs[IFLA_STATS]));
4300 ofpbuf_delete(reply);
/* Reads the counters of the device named 'netdev_name' from /proc/net/dev
 * into '*stats'.  The file is scanned line by line; counters that are not
 * filled in from the file are set to UINT64_MAX.
 *
 * NOTE(review): many lines of this function (declarations, most sscanf()
 * arguments, fclose() and the return statements) are missing from this
 * excerpt, so the comments below cover only what is visible. */
4306 get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats)
4308 static const char fn[] = "/proc/net/dev";
4313 stream = fopen(fn, "r");
4315 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, ovs_strerror(errno));
4320 while (fgets(line, sizeof line, stream)) {
4323 #define X64 "%"SCNu64
4326 X64 X64 X64 X64 X64 X64 X64 "%*u"
4327 X64 X64 X64 X64 X64 X64 X64 "%*u",
4333 &stats->rx_fifo_errors,
4334 &stats->rx_frame_errors,
4340 &stats->tx_fifo_errors,
4342 &stats->tx_carrier_errors) != 15) {
/* A line is rejected unless the scan reports exactly 15 conversions. */
4343 VLOG_WARN_RL(&rl, "%s:%d: parse error", fn, ln);
4344 } else if (!strcmp(devname, netdev_name)) {
/* These counters are not among the fields scanned above, so they are set
 * to UINT64_MAX rather than left holding stale values. */
4345 stats->rx_length_errors = UINT64_MAX;
4346 stats->rx_over_errors = UINT64_MAX;
4347 stats->rx_crc_errors = UINT64_MAX;
4348 stats->rx_missed_errors = UINT64_MAX;
4349 stats->tx_aborted_errors = UINT64_MAX;
4350 stats->tx_heartbeat_errors = UINT64_MAX;
4351 stats->tx_window_errors = UINT64_MAX;
/* Presumably reached only when no line matched 'netdev_name' -- confirm. */
4357 VLOG_WARN_RL(&rl, "%s: no stats for %s", fn, netdev_name);
/* Retrieves the interface flags of 'dev' with the SIOCGIFFLAGS ioctl, issued
 * by device name, and stores them in '*flags'.
 *
 * NOTE(review): the check of 'error' before the store and the return
 * statement are missing from this excerpt. */
4363 get_flags(const struct netdev *dev, unsigned int *flags)
4369 error = af_inet_ifreq_ioctl(dev->name, &ifr, SIOCGIFFLAGS, "SIOCGIFFLAGS");
4371 *flags = ifr.ifr_flags;
/* Sets the interface flags of the device named 'name' to 'flags' with the
 * SIOCSIFFLAGS ioctl.  Returns the result of af_inet_ifreq_ioctl()
 * unchanged. */
4377 set_flags(const char *name, unsigned int flags)
4381 ifr.ifr_flags = flags;
4382 return af_inet_ifreq_ioctl(name, &ifr, SIOCSIFFLAGS, "SIOCSIFFLAGS");
/* Asks the kernel, with the SIOCGIFINDEX ioctl, for the interface index of
 * the device named 'netdev_name' and returns it.  Nothing is cached here.
 *
 * NOTE(review): the failure return is missing from this excerpt.
 * get_ifindex() negates the result to obtain an error code, so presumably a
 * negative errno value is returned on failure -- confirm. */
4386 do_get_ifindex(const char *netdev_name)
/* Bounded copy of the device name into the ioctl request. */
4391 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4392 COVERAGE_INC(netdev_get_ifindex);
4394 error = af_inet_ioctl(SIOCGIFINDEX, &ifr);
4396 VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
4397 netdev_name, ovs_strerror(error));
4400 return ifr.ifr_ifindex;
/* Stores the interface index of 'netdev_' in '*ifindexp' and returns the
 * error recorded for the lookup.
 *
 * The lookup result, successful or not, is cached in the netdev: once
 * VALID_IFINDEX is set in 'cache_valid', later calls reuse the stored index
 * and error without calling do_get_ifindex() again. */
4404 get_ifindex(const struct netdev *netdev_, int *ifindexp)
4406 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4408 if (!(netdev->cache_valid & VALID_IFINDEX)) {
4409 int ifindex = do_get_ifindex(netdev_get_name(netdev_));
/* NOTE(review): the condition choosing between the two pairs of assignments
 * below is missing from this excerpt; presumably the first pair handles a
 * negative 'ifindex' (failure) and the second a valid index. */
4412 netdev->get_ifindex_error = -ifindex;
4413 netdev->ifindex = 0;
4415 netdev->get_ifindex_error = 0;
4416 netdev->ifindex = ifindex;
4418 netdev->cache_valid |= VALID_IFINDEX;
4421 *ifindexp = netdev->ifindex;
4422 return netdev->get_ifindex_error;
/* Reads the hardware address of the device named 'netdev_name' with the
 * SIOCGIFHWADDR ioctl and copies its first ETH_ADDR_LEN bytes into 'ea'.
 *
 * NOTE(review): the error check after the ioctl and the return statements
 * are missing from this excerpt. */
4426 get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN])
4432 memset(&ifr, 0, sizeof ifr);
4433 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4434 COVERAGE_INC(netdev_get_hwaddr);
4435 error = af_inet_ioctl(SIOCGIFHWADDR, &ifr);
4437 /* ENODEV probably means that a vif disappeared asynchronously and
4438 * hasn't been removed from the database yet, so reduce the log level
4439 * to INFO for that case. */
4440 VLOG(error == ENODEV ? VLL_INFO : VLL_ERR,
4441 "ioctl(SIOCGIFHWADDR) on %s device failed: %s",
4442 netdev_name, ovs_strerror(error));
/* A family other than AF_UNSPEC or ARPHRD_ETHER is only warned about; no
 * early exit for that case is visible here before the address is copied. */
4445 hwaddr_family = ifr.ifr_hwaddr.sa_family;
4446 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER) {
4447 VLOG_WARN("%s device has unknown hardware address family %d",
4448 netdev_name, hwaddr_family);
4450 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
/* Sets the hardware address of the device named 'netdev_name' to the
 * ETH_ADDR_LEN bytes in 'mac' with the SIOCSIFHWADDR ioctl.
 *
 * NOTE(review): the error check around the log call and the return statement
 * are missing from this excerpt. */
4455 set_etheraddr(const char *netdev_name,
4456 const uint8_t mac[ETH_ADDR_LEN])
/* Build the request from scratch: device name, Ethernet address family and
 * the new address bytes. */
4461 memset(&ifr, 0, sizeof ifr);
4462 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4463 ifr.ifr_hwaddr.sa_family = ARPHRD_ETHER;
4464 memcpy(ifr.ifr_hwaddr.sa_data, mac, ETH_ADDR_LEN);
4465 COVERAGE_INC(netdev_set_hwaddr);
4466 error = af_inet_ioctl(SIOCSIFHWADDR, &ifr);
4468 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
4469 netdev_name, ovs_strerror(error));
/* Issues the SIOCETHTOOL ioctl on the device named 'name', passing 'ecmd' as
 * the ioctl's data pointer.  'cmd_name' is the human-readable name of 'cmd'
 * and is used only in the warning logged on failure.
 *
 * NOTE(review): the line that stores 'cmd' into '*ecmd' and the return
 * statements are missing from this excerpt. */
4475 netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
4476 int cmd, const char *cmd_name)
4481 memset(&ifr, 0, sizeof ifr);
4482 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
4483 ifr.ifr_data = (caddr_t) ecmd;
4486 error = af_inet_ioctl(SIOCETHTOOL, &ifr);
/* Every failure except EOPNOTSUPP is logged (rate-limited). */
4488 if (error != EOPNOTSUPP) {
4489 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
4490 "failed: %s", cmd_name, name, ovs_strerror(error));
4492 /* The device doesn't support this operation. That's pretty
4493 * common, so there's no point in logging anything. */
/* Runs the ifreq ioctl 'cmd' (named 'cmd_name' for logging) on 'netdev' with
 * the request's address family set to AF_INET, and stores the IPv4 address
 * from the result in '*ip'.
 *
 * NOTE(review): the check of 'error' before the result is read, the second
 * argument of ALIGNED_CAST and the return statement are missing from this
 * excerpt. */
4500 netdev_linux_get_ipv4(const struct netdev *netdev, struct in_addr *ip,
4501 int cmd, const char *cmd_name)
4506 ifr.ifr_addr.sa_family = AF_INET;
4507 error = af_inet_ifreq_ioctl(netdev_get_name(netdev), &ifr, cmd, cmd_name);
4509 const struct sockaddr_in *sin = ALIGNED_CAST(struct sockaddr_in *,
4511 *ip = sin->sin_addr;
4516 /* Returns an AF_PACKET raw socket or a negative errno value. */
4518 af_packet_sock(void)
4520 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
4523 if (ovsthread_once_start(&once)) {
4524 sock = socket(AF_PACKET, SOCK_RAW, 0);
4526 int error = set_nonblocking(sock);
4533 VLOG_ERR("failed to create packet socket: %s",
4534 ovs_strerror(errno));
4536 ovsthread_once_done(&once);