2 * Copyright (c) 2009, 2010, 2011, 2012, 2013 Nicira, Inc.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
19 #include "netdev-linux.h"
23 #include <arpa/inet.h>
25 #include <linux/filter.h>
26 #include <linux/gen_stats.h>
27 #include <linux/if_ether.h>
28 #include <linux/if_tun.h>
29 #include <linux/types.h>
30 #include <linux/ethtool.h>
31 #include <linux/mii.h>
32 #include <linux/pkt_cls.h>
33 #include <linux/pkt_sched.h>
34 #include <linux/rtnetlink.h>
35 #include <linux/sockios.h>
36 #include <linux/version.h>
37 #include <sys/types.h>
38 #include <sys/ioctl.h>
39 #include <sys/socket.h>
40 #include <netpacket/packet.h>
42 #include <net/if_arp.h>
43 #include <net/if_packet.h>
44 #include <net/route.h>
45 #include <netinet/in.h>
52 #include "dpif-linux.h"
53 #include "dynamic-string.h"
54 #include "fatal-signal.h"
57 #include "netdev-provider.h"
58 #include "netdev-vport.h"
59 #include "netlink-notifier.h"
60 #include "netlink-socket.h"
63 #include "openflow/openflow.h"
65 #include "poll-loop.h"
66 #include "rtnetlink-link.h"
68 #include "socket-util.h"
71 #include "unaligned.h"
74 VLOG_DEFINE_THIS_MODULE(netdev_linux);
76 COVERAGE_DEFINE(netdev_set_policing);
77 COVERAGE_DEFINE(netdev_arp_lookup);
78 COVERAGE_DEFINE(netdev_get_ifindex);
79 COVERAGE_DEFINE(netdev_get_hwaddr);
80 COVERAGE_DEFINE(netdev_set_hwaddr);
81 COVERAGE_DEFINE(netdev_get_ethtool);
82 COVERAGE_DEFINE(netdev_set_ethtool);
85 /* These were introduced in Linux 2.6.14, so they might be missing if we have
87 #ifndef ADVERTISED_Pause
88 #define ADVERTISED_Pause (1 << 13)
90 #ifndef ADVERTISED_Asym_Pause
91 #define ADVERTISED_Asym_Pause (1 << 14)
94 /* These were introduced in Linux 2.6.24, so they might be missing if we
95 * have old headers. */
96 #ifndef ETHTOOL_GFLAGS
97 #define ETHTOOL_GFLAGS 0x00000025 /* Get flags bitmap(ethtool_value) */
99 #ifndef ETHTOOL_SFLAGS
100 #define ETHTOOL_SFLAGS 0x00000026 /* Set flags bitmap(ethtool_value) */
103 /* This was introduced in Linux 2.6.25, so it might be missing if we have old
106 #define TC_RTAB_SIZE 1024
109 static struct nln_notifier *netdev_linux_cache_notifier = NULL;
110 static int cache_notifier_refcount;
113 VALID_IFINDEX = 1 << 0,
114 VALID_ETHERADDR = 1 << 1,
118 VALID_POLICING = 1 << 5,
119 VALID_VPORT_STAT_ERROR = 1 << 6,
120 VALID_DRVINFO = 1 << 7,
121 VALID_FEATURES = 1 << 8,
124 /* Traffic control. */
126 /* An instance of a traffic control class. Always associated with a particular
129 * Each TC implementation subclasses this with whatever additional data it
132 const struct tc_ops *ops;
133 struct hmap queues; /* Contains "struct tc_queue"s.
134 * Read by generic TC layer.
135 * Written only by TC implementation. */
138 #define TC_INITIALIZER(TC, OPS) { OPS, HMAP_INITIALIZER(&(TC)->queues) }
140 /* One traffic control queue.
142 * Each TC implementation subclasses this with whatever additional data it
145 struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
146 unsigned int queue_id; /* OpenFlow queue ID. */
147 long long int created; /* Time queue was created, in msecs. */
150 /* A particular kind of traffic control. Each implementation generally maps to
151 * one particular Linux qdisc class.
153 * The functions below return 0 if successful or a positive errno value on
154 * failure, except where otherwise noted. All of them must be provided, except
155 * where otherwise noted. */
157 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
158 * This is null for tc_ops_default and tc_ops_other, for which there are no
159 * appropriate values. */
160 const char *linux_name;
162 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
163 const char *ovs_name;
165 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
166 * queues. The queues are numbered 0 through n_queues - 1. */
167 unsigned int n_queues;
169 /* Called to install this TC class on 'netdev'. The implementation should
170 * make the Netlink calls required to set up 'netdev' with the right qdisc
171 * and configure it according to 'details'. The implementation may assume
172 * that the current qdisc is the default; that is, there is no need for it
173 * to delete the current qdisc before installing itself.
175 * The contents of 'details' should be documented as valid for 'ovs_name'
176 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
177 * (which is built as ovs-vswitchd.conf.db(8)).
179 * This function must return 0 if and only if it sets 'netdev->tc' to an
180 * initialized 'struct tc'.
182 * (This function is null for tc_ops_other, which cannot be installed. For
183 * other TC classes it should always be nonnull.) */
184 int (*tc_install)(struct netdev *netdev, const struct smap *details);
186 /* Called when the netdev code determines (through a Netlink query) that
187 * this TC class's qdisc is installed on 'netdev', but we didn't install
188 * it ourselves and so don't know any of the details.
190 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
191 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
192 * implementation should parse the other attributes of 'nlmsg' as
193 * necessary to determine its configuration. If necessary it should also
194 * use Netlink queries to determine the configuration of queues on
197 * This function must return 0 if and only if it sets 'netdev->tc' to an
198 * initialized 'struct tc'. */
199 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
201 /* Destroys the data structures allocated by the implementation as part of
202 * 'tc'. (This includes destroying 'tc->queues' by calling
205 * The implementation should not need to perform any Netlink calls. If
206 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
207 * (But it may not be desirable.)
209 * This function may be null if 'tc' is trivial. */
210 void (*tc_destroy)(struct tc *tc);
212 /* Retrieves details of 'netdev->tc' configuration into 'details'.
214 * The implementation should not need to perform any Netlink calls, because
215 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
216 * cached the configuration.
218 * The contents of 'details' should be documented as valid for 'ovs_name'
219 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
220 * (which is built as ovs-vswitchd.conf.db(8)).
222 * This function may be null if 'tc' is not configurable.
224 int (*qdisc_get)(const struct netdev *netdev, struct smap *details);
226 /* Reconfigures 'netdev->tc' according to 'details', performing any
227 * required Netlink calls to complete the reconfiguration.
229 * The contents of 'details' should be documented as valid for 'ovs_name'
230 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
231 * (which is built as ovs-vswitchd.conf.db(8)).
233 * This function may be null if 'tc' is not configurable.
235 int (*qdisc_set)(struct netdev *, const struct smap *details);
237 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
238 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
240 * The contents of 'details' should be documented as valid for 'ovs_name'
241 * in the "other_config" column in the "Queue" table in
242 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
244 * The implementation should not need to perform any Netlink calls, because
245 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
246 * cached the queue configuration.
248 * This function may be null if 'tc' does not have queues ('n_queues' is
250 int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
251 struct smap *details);
253 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
254 * 'details', performing any required Netlink calls to complete the
255 * reconfiguration. The caller ensures that 'queue_id' is less than
258 * The contents of 'details' should be documented as valid for 'ovs_name'
259 * in the "other_config" column in the "Queue" table in
260 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
262 * This function may be null if 'tc' does not have queues or its queues are
263 * not configurable. */
264 int (*class_set)(struct netdev *, unsigned int queue_id,
265 const struct smap *details);
267 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
268 * tc_queue's within 'netdev->tc->queues'.
270 * This function may be null if 'tc' does not have queues or its queues
271 * cannot be deleted. */
272 int (*class_delete)(struct netdev *, struct tc_queue *queue);
274 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
275 * 'struct tc_queue's within 'netdev->tc->queues'.
277 * On success, initializes '*stats'.
279 * This function may be null if 'tc' does not have queues or if it cannot
280 * report queue statistics. */
281 int (*class_get_stats)(const struct netdev *netdev,
282 const struct tc_queue *queue,
283 struct netdev_queue_stats *stats);
285 /* Extracts queue stats from 'nlmsg', which is a response to a
286 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
288 * This function may be null if 'tc' does not have queues or if it cannot
289 * report queue statistics. */
290 int (*class_dump_stats)(const struct netdev *netdev,
291 const struct ofpbuf *nlmsg,
292 netdev_dump_queue_stats_cb *cb, void *aux);
296 tc_init(struct tc *tc, const struct tc_ops *ops)
299 hmap_init(&tc->queues);
303 tc_destroy(struct tc *tc)
305 hmap_destroy(&tc->queues);
308 static const struct tc_ops tc_ops_htb;
309 static const struct tc_ops tc_ops_hfsc;
310 static const struct tc_ops tc_ops_default;
311 static const struct tc_ops tc_ops_other;
313 static const struct tc_ops *const tcs[] = {
314 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
315 &tc_ops_hfsc, /* Hierarchical fair service curve. */
316 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
317 &tc_ops_other, /* Some other qdisc. */
321 static unsigned int tc_make_handle(unsigned int major, unsigned int minor);
322 static unsigned int tc_get_major(unsigned int handle);
323 static unsigned int tc_get_minor(unsigned int handle);
325 static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
326 static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
327 static unsigned int tc_buffer_per_jiffy(unsigned int rate);
329 static struct tcmsg *tc_make_request(const struct netdev *, int type,
330 unsigned int flags, struct ofpbuf *);
331 static int tc_transact(struct ofpbuf *request, struct ofpbuf **replyp);
332 static int tc_add_del_ingress_qdisc(struct netdev *netdev, bool add);
333 static int tc_add_policer(struct netdev *netdev, int kbits_rate,
336 static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
337 struct nlattr **options);
338 static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
339 struct nlattr **options,
340 struct netdev_queue_stats *);
341 static int tc_query_class(const struct netdev *,
342 unsigned int handle, unsigned int parent,
343 struct ofpbuf **replyp);
344 static int tc_delete_class(const struct netdev *, unsigned int handle);
346 static int tc_del_qdisc(struct netdev *netdev);
347 static int tc_query_qdisc(const struct netdev *netdev);
349 static int tc_calc_cell_log(unsigned int mtu);
350 static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
351 static void tc_put_rtab(struct ofpbuf *, uint16_t type,
352 const struct tc_ratespec *rate);
353 static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
355 struct netdev_linux {
358 unsigned int cache_valid;
359 unsigned int change_seq;
361 bool miimon; /* Link status of last poll. */
362 long long int miimon_interval; /* Miimon Poll rate. Disabled if <= 0. */
363 struct timer miimon_timer;
365 /* The following are figured out "on demand" only. They are only valid
366 * when the corresponding VALID_* bit in 'cache_valid' is set. */
368 uint8_t etheraddr[ETH_ADDR_LEN];
369 struct in_addr address, netmask;
372 unsigned int ifi_flags;
373 long long int carrier_resets;
374 uint32_t kbits_rate; /* Policing data. */
375 uint32_t kbits_burst;
376 int vport_stats_error; /* Cached error code from vport_get_stats().
377 0 or an errno value. */
378 int netdev_mtu_error; /* Cached error code from SIOCGIFMTU or SIOCSIFMTU. */
379 int ether_addr_error; /* Cached error code from set/get etheraddr. */
380 int netdev_policing_error; /* Cached error code from set policing. */
381 int get_features_error; /* Cached error code from ETHTOOL_GSET. */
382 int get_ifindex_error; /* Cached error code from SIOCGIFINDEX. */
384 enum netdev_features current; /* Cached from ETHTOOL_GSET. */
385 enum netdev_features advertised; /* Cached from ETHTOOL_GSET. */
386 enum netdev_features supported; /* Cached from ETHTOOL_GSET. */
388 struct ethtool_drvinfo drvinfo; /* Cached from ETHTOOL_GDRVINFO. */
391 /* For devices of class netdev_tap_class only. */
395 struct netdev_rx_linux {
401 static const struct netdev_rx_class netdev_rx_linux_class;
403 /* Sockets used for ioctl operations. */
404 static int af_inet_sock = -1; /* AF_INET, SOCK_DGRAM. */
406 /* This is set pretty low because we probably won't learn anything from the
407 * additional log messages. */
408 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
410 static int netdev_linux_init(void);
412 static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
413 int cmd, const char *cmd_name);
414 static int netdev_linux_do_ioctl(const char *name, struct ifreq *, int cmd,
415 const char *cmd_name);
416 static int netdev_linux_get_ipv4(const struct netdev *, struct in_addr *,
417 int cmd, const char *cmd_name);
418 static int get_flags(const struct netdev *, unsigned int *flags);
419 static int set_flags(const char *, unsigned int flags);
420 static int do_get_ifindex(const char *netdev_name);
421 static int get_ifindex(const struct netdev *, int *ifindexp);
422 static int do_set_addr(struct netdev *netdev,
423 int ioctl_nr, const char *ioctl_name,
424 struct in_addr addr);
425 static int get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN]);
426 static int set_etheraddr(const char *netdev_name, const uint8_t[ETH_ADDR_LEN]);
427 static int get_stats_via_netlink(int ifindex, struct netdev_stats *stats);
428 static int get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats);
429 static int af_packet_sock(void);
430 static void netdev_linux_miimon_run(void);
431 static void netdev_linux_miimon_wait(void);
434 is_netdev_linux_class(const struct netdev_class *netdev_class)
436 return netdev_class->init == netdev_linux_init;
440 is_tap_netdev(const struct netdev *netdev)
442 return netdev_get_class(netdev) == &netdev_tap_class;
445 static struct netdev_linux *
446 netdev_linux_cast(const struct netdev *netdev)
448 ovs_assert(is_netdev_linux_class(netdev_get_class(netdev)));
450 return CONTAINER_OF(netdev, struct netdev_linux, up);
453 static struct netdev_rx_linux *
454 netdev_rx_linux_cast(const struct netdev_rx *rx)
456 netdev_rx_assert_class(rx, &netdev_rx_linux_class);
457 return CONTAINER_OF(rx, struct netdev_rx_linux, up);
461 netdev_linux_init(void)
463 static int status = -1;
465 /* Create AF_INET socket. */
466 af_inet_sock = socket(AF_INET, SOCK_DGRAM, 0);
467 status = af_inet_sock >= 0 ? 0 : errno;
469 VLOG_ERR("failed to create inet socket: %s", ovs_strerror(status));
/* Periodic work for this netdev provider: runs the rtnetlink link notifier
 * and then the MII link monitor. */
static void
netdev_linux_run(void)
{
    rtnetlink_link_run();
    netdev_linux_miimon_run();
}
/* Registers the wakeups needed before netdev_linux_run() should be called
 * again: rtnetlink link notifications and MII monitor timers. */
static void
netdev_linux_wait(void)
{
    rtnetlink_link_wait();
    netdev_linux_miimon_wait();
}
490 netdev_linux_changed(struct netdev_linux *dev,
491 unsigned int ifi_flags, unsigned int mask)
494 if (!dev->change_seq) {
498 if ((dev->ifi_flags ^ ifi_flags) & IFF_RUNNING) {
499 dev->carrier_resets++;
501 dev->ifi_flags = ifi_flags;
503 dev->cache_valid &= mask;
507 netdev_linux_update(struct netdev_linux *dev,
508 const struct rtnetlink_link_change *change)
510 if (change->nlmsg_type == RTM_NEWLINK) {
512 netdev_linux_changed(dev, change->ifi_flags, VALID_DRVINFO);
514 /* Update netdev from rtnl-change msg. */
516 dev->mtu = change->mtu;
517 dev->cache_valid |= VALID_MTU;
518 dev->netdev_mtu_error = 0;
521 if (!eth_addr_is_zero(change->addr)) {
522 memcpy(dev->etheraddr, change->addr, ETH_ADDR_LEN);
523 dev->cache_valid |= VALID_ETHERADDR;
524 dev->ether_addr_error = 0;
527 dev->ifindex = change->ifi_index;
528 dev->cache_valid |= VALID_IFINDEX;
529 dev->get_ifindex_error = 0;
532 netdev_linux_changed(dev, change->ifi_flags, 0);
537 netdev_linux_cache_cb(const struct rtnetlink_link_change *change,
538 void *aux OVS_UNUSED)
541 struct netdev *base_dev = netdev_from_name(change->ifname);
542 if (base_dev && is_netdev_linux_class(netdev_get_class(base_dev))) {
543 netdev_linux_update(netdev_linux_cast(base_dev), change);
544 netdev_close(base_dev);
547 struct shash device_shash;
548 struct shash_node *node;
550 shash_init(&device_shash);
551 netdev_get_devices(&netdev_linux_class, &device_shash);
552 SHASH_FOR_EACH (node, &device_shash) {
553 struct netdev *netdev = node->data;
554 struct netdev_linux *dev = netdev_linux_cast(netdev);
557 get_flags(&dev->up, &flags);
558 netdev_linux_changed(dev, flags, 0);
559 netdev_close(netdev);
561 shash_destroy(&device_shash);
566 cache_notifier_ref(void)
568 if (!cache_notifier_refcount) {
569 ovs_assert(!netdev_linux_cache_notifier);
571 netdev_linux_cache_notifier =
572 rtnetlink_link_notifier_create(netdev_linux_cache_cb, NULL);
574 if (!netdev_linux_cache_notifier) {
578 cache_notifier_refcount++;
584 cache_notifier_unref(void)
586 ovs_assert(cache_notifier_refcount > 0);
587 if (!--cache_notifier_refcount) {
588 ovs_assert(netdev_linux_cache_notifier);
589 rtnetlink_link_notifier_destroy(netdev_linux_cache_notifier);
590 netdev_linux_cache_notifier = NULL;
594 /* Creates system and internal devices. */
596 netdev_linux_create(const struct netdev_class *class, const char *name,
597 struct netdev **netdevp)
599 struct netdev_linux *netdev;
602 error = cache_notifier_ref();
607 netdev = xzalloc(sizeof *netdev);
608 netdev->change_seq = 1;
609 netdev_init(&netdev->up, name, class);
610 error = get_flags(&netdev->up, &netdev->ifi_flags);
611 if (error == ENODEV) {
612 if (class != &netdev_internal_class) {
613 /* The device does not exist, so don't allow it to be opened. */
614 netdev_uninit(&netdev->up, false);
615 cache_notifier_unref();
619 /* "Internal" netdevs have to be created as netdev objects before
620 * they exist in the kernel, because creating them in the kernel
621 * happens by passing a netdev object to dpif_port_add().
622 * Therefore, ignore the error. */
626 *netdevp = &netdev->up;
630 /* For most types of netdevs we open the device for each call of
631 * netdev_open(). However, this is not the case with tap devices,
632 * since it is only possible to open the device once. In this
633 * situation we share a single file descriptor, and consequently
634 * buffers, across all readers. Therefore once data is read it will
635 * be unavailable to other reads for tap devices. */
637 netdev_linux_create_tap(const struct netdev_class *class OVS_UNUSED,
638 const char *name, struct netdev **netdevp)
640 struct netdev_linux *netdev;
641 static const char tap_dev[] = "/dev/net/tun";
645 netdev = xzalloc(sizeof *netdev);
646 netdev->change_seq = 1;
648 error = cache_notifier_ref();
653 /* Open tap device. */
654 netdev->tap_fd = open(tap_dev, O_RDWR);
655 if (netdev->tap_fd < 0) {
657 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, ovs_strerror(error));
658 goto error_unref_notifier;
661 /* Create tap device. */
662 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
663 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
664 if (ioctl(netdev->tap_fd, TUNSETIFF, &ifr) == -1) {
665 VLOG_WARN("%s: creating tap device failed: %s", name,
666 ovs_strerror(errno));
671 /* Make non-blocking. */
672 error = set_nonblocking(netdev->tap_fd);
677 netdev_init(&netdev->up, name, &netdev_tap_class);
678 *netdevp = &netdev->up;
682 close(netdev->tap_fd);
683 error_unref_notifier:
684 cache_notifier_unref();
691 netdev_linux_destroy(struct netdev *netdev_)
693 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
695 if (netdev->tc && netdev->tc->ops->tc_destroy) {
696 netdev->tc->ops->tc_destroy(netdev->tc);
699 if (netdev_get_class(netdev_) == &netdev_tap_class
700 && netdev->tap_fd >= 0)
702 close(netdev->tap_fd);
706 cache_notifier_unref();
710 netdev_linux_rx_open(struct netdev *netdev_, struct netdev_rx **rxp)
712 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
713 bool is_tap = is_tap_netdev(netdev_);
714 struct netdev_rx_linux *rx;
721 struct sockaddr_ll sll;
723 /* Result of tcpdump -dd inbound */
724 static struct sock_filter filt[] = {
725 { 0x28, 0, 0, 0xfffff004 }, /* ldh [0] */
726 { 0x15, 0, 1, 0x00000004 }, /* jeq #4 jt 2 jf 3 */
727 { 0x6, 0, 0, 0x00000000 }, /* ret #0 */
728 { 0x6, 0, 0, 0x0000ffff } /* ret #65535 */
730 static struct sock_fprog fprog = { ARRAY_SIZE(filt), filt };
732 /* Create file descriptor. */
733 fd = socket(PF_PACKET, SOCK_RAW, 0);
736 VLOG_ERR("failed to create raw socket (%s)", ovs_strerror(error));
740 /* Set non-blocking mode. */
741 error = set_nonblocking(fd);
746 /* Get ethernet device index. */
747 error = get_ifindex(&netdev->up, &ifindex);
752 /* Bind to specific ethernet device. */
753 memset(&sll, 0, sizeof sll);
754 sll.sll_family = AF_PACKET;
755 sll.sll_ifindex = ifindex;
756 sll.sll_protocol = (OVS_FORCE unsigned short int) htons(ETH_P_ALL);
757 if (bind(fd, (struct sockaddr *) &sll, sizeof sll) < 0) {
759 VLOG_ERR("%s: failed to bind raw socket (%s)",
760 netdev_get_name(netdev_), ovs_strerror(error));
764 /* Filter for only inbound packets. */
765 error = setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER, &fprog,
769 VLOG_ERR("%s: failed attach filter (%s)",
770 netdev_get_name(netdev_), ovs_strerror(error));
775 rx = xmalloc(sizeof *rx);
776 netdev_rx_init(&rx->up, netdev_, &netdev_rx_linux_class);
791 netdev_rx_linux_destroy(struct netdev_rx *rx_)
793 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
802 netdev_rx_linux_recv(struct netdev_rx *rx_, void *data, size_t size)
804 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
809 ? read(rx->fd, data, size)
810 : recv(rx->fd, data, size, MSG_TRUNC));
811 } while (retval < 0 && errno == EINTR);
814 return retval > size ? -EMSGSIZE : retval;
816 if (errno != EAGAIN) {
817 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
818 ovs_strerror(errno), netdev_rx_get_name(rx_));
825 netdev_rx_linux_wait(struct netdev_rx *rx_)
827 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
828 poll_fd_wait(rx->fd, POLLIN);
832 netdev_rx_linux_drain(struct netdev_rx *rx_)
834 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
837 int error = netdev_linux_do_ioctl(netdev_rx_get_name(rx_), &ifr,
838 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
842 drain_fd(rx->fd, ifr.ifr_qlen);
845 return drain_rcvbuf(rx->fd);
849 /* Sends 'buffer' on 'netdev'. Returns 0 if successful, otherwise a positive
850 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
851 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
852 * the packet is too big or too small to transmit on the device.
854 * The caller retains ownership of 'buffer' in all cases.
856 * The kernel maintains a packet transmission queue, so the caller is not
857 * expected to do additional queuing of packets. */
859 netdev_linux_send(struct netdev *netdev_, const void *data, size_t size)
864 if (!is_tap_netdev(netdev_)) {
865 /* Use our AF_PACKET socket to send to this device. */
866 struct sockaddr_ll sll;
873 sock = af_packet_sock();
878 error = get_ifindex(netdev_, &ifindex);
883 /* We don't bother setting most fields in sockaddr_ll because the
884 * kernel ignores them for SOCK_RAW. */
885 memset(&sll, 0, sizeof sll);
886 sll.sll_family = AF_PACKET;
887 sll.sll_ifindex = ifindex;
889 iov.iov_base = CONST_CAST(void *, data);
893 msg.msg_namelen = sizeof sll;
896 msg.msg_control = NULL;
897 msg.msg_controllen = 0;
900 retval = sendmsg(sock, &msg, 0);
902 /* Use the tap fd to send to this device. This is essential for
903 * tap devices, because packets sent to a tap device with an
904 * AF_PACKET socket will loop back to be *received* again on the
905 * tap device. This doesn't occur on other interface types
906 * because we attach a socket filter to the rx socket. */
907 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
909 retval = write(netdev->tap_fd, data, size);
913 /* The Linux AF_PACKET implementation never blocks waiting for room
914 * for packets, instead returning ENOBUFS. Translate this into
915 * EAGAIN for the caller. */
916 if (errno == ENOBUFS) {
918 } else if (errno == EINTR) {
920 } else if (errno != EAGAIN) {
921 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
922 netdev_get_name(netdev_), ovs_strerror(errno));
925 } else if (retval != size) {
926 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%zd bytes of "
927 "%zu) on %s", retval, size, netdev_get_name(netdev_));
935 /* Registers with the poll loop to wake up from the next call to poll_block()
936 * when the packet transmission queue has sufficient room to transmit a packet
937 * with netdev_send().
939 * The kernel maintains a packet transmission queue, so the client is not
940 * expected to do additional queuing of packets. Thus, this function is
941 * unlikely to ever be used. It is included for completeness. */
943 netdev_linux_send_wait(struct netdev *netdev)
945 if (is_tap_netdev(netdev)) {
946 /* TAP device always accepts packets.*/
947 poll_immediate_wake();
951 /* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
952 * otherwise a positive errno value. */
954 netdev_linux_set_etheraddr(struct netdev *netdev_,
955 const uint8_t mac[ETH_ADDR_LEN])
957 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
958 struct netdev_saved_flags *sf = NULL;
961 if (netdev->cache_valid & VALID_ETHERADDR) {
962 if (netdev->ether_addr_error) {
963 return netdev->ether_addr_error;
965 if (eth_addr_equals(netdev->etheraddr, mac)) {
968 netdev->cache_valid &= ~VALID_ETHERADDR;
971 /* Tap devices must be brought down before setting the address. */
972 if (is_tap_netdev(netdev_)) {
973 netdev_turn_flags_off(netdev_, NETDEV_UP, &sf);
975 error = set_etheraddr(netdev_get_name(netdev_), mac);
976 if (!error || error == ENODEV) {
977 netdev->ether_addr_error = error;
978 netdev->cache_valid |= VALID_ETHERADDR;
980 memcpy(netdev->etheraddr, mac, ETH_ADDR_LEN);
984 netdev_restore_flags(sf);
989 /* Copies 'netdev''s MAC address to 'mac' which is passed as param. */
991 netdev_linux_get_etheraddr(const struct netdev *netdev_,
992 uint8_t mac[ETH_ADDR_LEN])
994 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
996 if (!(netdev->cache_valid & VALID_ETHERADDR)) {
997 int error = get_etheraddr(netdev_get_name(netdev_),
1000 netdev->ether_addr_error = error;
1001 netdev->cache_valid |= VALID_ETHERADDR;
1004 if (!netdev->ether_addr_error) {
1005 memcpy(mac, netdev->etheraddr, ETH_ADDR_LEN);
1008 return netdev->ether_addr_error;
1011 /* Returns the maximum size of transmitted (and received) packets on 'netdev',
1012 * in bytes, not including the hardware header; thus, this is typically 1500
1013 * bytes for Ethernet devices. */
1015 netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
1017 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1018 if (!(netdev->cache_valid & VALID_MTU)) {
1022 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
1023 SIOCGIFMTU, "SIOCGIFMTU");
1025 netdev->netdev_mtu_error = error;
1026 netdev->mtu = ifr.ifr_mtu;
1027 netdev->cache_valid |= VALID_MTU;
1030 if (!netdev->netdev_mtu_error) {
1031 *mtup = netdev->mtu;
1033 return netdev->netdev_mtu_error;
1036 /* Sets the maximum size of transmitted (MTU) for given device using linux
1037 * networking ioctl interface.
1040 netdev_linux_set_mtu(const struct netdev *netdev_, int mtu)
1042 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1046 if (netdev->cache_valid & VALID_MTU) {
1047 if (netdev->netdev_mtu_error) {
1048 return netdev->netdev_mtu_error;
1050 if (netdev->mtu == mtu) {
1053 netdev->cache_valid &= ~VALID_MTU;
1056 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
1057 SIOCSIFMTU, "SIOCSIFMTU");
1058 if (!error || error == ENODEV) {
1059 netdev->netdev_mtu_error = error;
1060 netdev->mtu = ifr.ifr_mtu;
1061 netdev->cache_valid |= VALID_MTU;
/* Returns the ifindex of 'netdev', if successful, as a positive number.
 * On failure, returns a negative errno value. */
static int
netdev_linux_get_ifindex(const struct netdev *netdev)
{
    int ifindex;
    int error = get_ifindex(netdev, &ifindex);

    if (error) {
        return -error;
    }
    return ifindex;
}
1078 netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
1080 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1082 if (netdev->miimon_interval > 0) {
1083 *carrier = netdev->miimon;
1085 *carrier = (netdev->ifi_flags & IFF_RUNNING) != 0;
1091 static long long int
1092 netdev_linux_get_carrier_resets(const struct netdev *netdev)
1094 return netdev_linux_cast(netdev)->carrier_resets;
1098 netdev_linux_do_miimon(const char *name, int cmd, const char *cmd_name,
1099 struct mii_ioctl_data *data)
1104 memset(&ifr, 0, sizeof ifr);
1105 memcpy(&ifr.ifr_data, data, sizeof *data);
1106 error = netdev_linux_do_ioctl(name, &ifr, cmd, cmd_name);
1107 memcpy(data, &ifr.ifr_data, sizeof *data);
1113 netdev_linux_get_miimon(const char *name, bool *miimon)
1115 struct mii_ioctl_data data;
1120 memset(&data, 0, sizeof data);
1121 error = netdev_linux_do_miimon(name, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
1123 /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
1124 data.reg_num = MII_BMSR;
1125 error = netdev_linux_do_miimon(name, SIOCGMIIREG, "SIOCGMIIREG",
1129 *miimon = !!(data.val_out & BMSR_LSTATUS);
1131 VLOG_WARN_RL(&rl, "%s: failed to query MII", name);
1134 struct ethtool_cmd ecmd;
1136 VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
1139 COVERAGE_INC(netdev_get_ethtool);
1140 memset(&ecmd, 0, sizeof ecmd);
1141 error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
1144 struct ethtool_value eval;
1146 memcpy(&eval, &ecmd, sizeof eval);
1147 *miimon = !!eval.data;
1149 VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
1157 netdev_linux_set_miimon_interval(struct netdev *netdev_,
1158 long long int interval)
1160 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1162 interval = interval > 0 ? MAX(interval, 100) : 0;
1163 if (netdev->miimon_interval != interval) {
1164 netdev->miimon_interval = interval;
1165 timer_set_expired(&netdev->miimon_timer);
1172 netdev_linux_miimon_run(void)
1174 struct shash device_shash;
1175 struct shash_node *node;
1177 shash_init(&device_shash);
1178 netdev_get_devices(&netdev_linux_class, &device_shash);
1179 SHASH_FOR_EACH (node, &device_shash) {
1180 struct netdev *netdev = node->data;
1181 struct netdev_linux *dev = netdev_linux_cast(netdev);
1184 if (dev->miimon_interval <= 0 || !timer_expired(&dev->miimon_timer)) {
1185 netdev_close(netdev);
1189 netdev_linux_get_miimon(dev->up.name, &miimon);
1190 if (miimon != dev->miimon) {
1191 dev->miimon = miimon;
1192 netdev_linux_changed(dev, dev->ifi_flags, 0);
1195 timer_set_duration(&dev->miimon_timer, dev->miimon_interval);
1196 netdev_close(netdev);
1199 shash_destroy(&device_shash);
1203 netdev_linux_miimon_wait(void)
1205 struct shash device_shash;
1206 struct shash_node *node;
1208 shash_init(&device_shash);
1209 netdev_get_devices(&netdev_linux_class, &device_shash);
1210 SHASH_FOR_EACH (node, &device_shash) {
1211 struct netdev *netdev = node->data;
1212 struct netdev_linux *dev = netdev_linux_cast(netdev);
1214 if (dev->miimon_interval > 0) {
1215 timer_wait(&dev->miimon_timer);
1217 netdev_close(netdev);
1219 shash_destroy(&device_shash);
1222 /* Check whether we can use RTM_GETLINK to get network device statistics.
1223 * In pre-2.6.19 kernels, this was only available if wireless extensions were
1226 check_for_working_netlink_stats(void)
1228 /* Decide on the netdev_get_stats() implementation to use. Netlink is
1229 * preferable, so if that works, we'll use it. */
1230 int ifindex = do_get_ifindex("lo");
1232 VLOG_WARN("failed to get ifindex for lo, "
1233 "obtaining netdev stats from proc");
1236 struct netdev_stats stats;
1237 int error = get_stats_via_netlink(ifindex, &stats);
1239 VLOG_DBG("obtaining netdev stats via rtnetlink");
1242 VLOG_INFO("RTM_GETLINK failed (%s), obtaining netdev stats "
1243 "via proc (you are probably running a pre-2.6.19 "
1244 "kernel)", ovs_strerror(error));
1251 swap_uint64(uint64_t *a, uint64_t *b)
1258 /* Copies 'src' into 'dst', performing format conversion in the process.
1260 * 'src' is allowed to be misaligned. */
1262 netdev_stats_from_ovs_vport_stats(struct netdev_stats *dst,
1263 const struct ovs_vport_stats *src)
1265 dst->rx_packets = get_unaligned_u64(&src->rx_packets);
1266 dst->tx_packets = get_unaligned_u64(&src->tx_packets);
1267 dst->rx_bytes = get_unaligned_u64(&src->rx_bytes);
1268 dst->tx_bytes = get_unaligned_u64(&src->tx_bytes);
1269 dst->rx_errors = get_unaligned_u64(&src->rx_errors);
1270 dst->tx_errors = get_unaligned_u64(&src->tx_errors);
1271 dst->rx_dropped = get_unaligned_u64(&src->rx_dropped);
1272 dst->tx_dropped = get_unaligned_u64(&src->tx_dropped);
1274 dst->collisions = 0;
1275 dst->rx_length_errors = 0;
1276 dst->rx_over_errors = 0;
1277 dst->rx_crc_errors = 0;
1278 dst->rx_frame_errors = 0;
1279 dst->rx_fifo_errors = 0;
1280 dst->rx_missed_errors = 0;
1281 dst->tx_aborted_errors = 0;
1282 dst->tx_carrier_errors = 0;
1283 dst->tx_fifo_errors = 0;
1284 dst->tx_heartbeat_errors = 0;
1285 dst->tx_window_errors = 0;
1289 get_stats_via_vport__(const struct netdev *netdev, struct netdev_stats *stats)
1291 struct dpif_linux_vport reply;
1295 error = dpif_linux_vport_get(netdev_get_name(netdev), &reply, &buf);
1298 } else if (!reply.stats) {
1303 netdev_stats_from_ovs_vport_stats(stats, reply.stats);
1311 get_stats_via_vport(const struct netdev *netdev_,
1312 struct netdev_stats *stats)
1314 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1316 if (!netdev->vport_stats_error ||
1317 !(netdev->cache_valid & VALID_VPORT_STAT_ERROR)) {
1320 error = get_stats_via_vport__(netdev_, stats);
1321 if (error && error != ENOENT) {
1322 VLOG_WARN_RL(&rl, "%s: obtaining netdev stats via vport failed "
1324 netdev_get_name(netdev_), ovs_strerror(error));
1326 netdev->vport_stats_error = error;
1327 netdev->cache_valid |= VALID_VPORT_STAT_ERROR;
1332 netdev_linux_sys_get_stats(const struct netdev *netdev_,
1333 struct netdev_stats *stats)
1335 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
1336 static int use_netlink_stats;
1339 if (ovsthread_once_start(&once)) {
1340 use_netlink_stats = check_for_working_netlink_stats();
1341 ovsthread_once_done(&once);
1344 if (use_netlink_stats) {
1347 error = get_ifindex(netdev_, &ifindex);
1349 error = get_stats_via_netlink(ifindex, stats);
1352 error = get_stats_via_proc(netdev_get_name(netdev_), stats);
1356 VLOG_WARN_RL(&rl, "%s: linux-sys get stats failed %d",
1357 netdev_get_name(netdev_), error);
1363 /* Retrieves current device stats for 'netdev-linux'. */
1365 netdev_linux_get_stats(const struct netdev *netdev_,
1366 struct netdev_stats *stats)
1368 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1369 struct netdev_stats dev_stats;
1372 get_stats_via_vport(netdev_, stats);
1374 error = netdev_linux_sys_get_stats(netdev_, &dev_stats);
1377 if (netdev->vport_stats_error) {
1384 if (netdev->vport_stats_error) {
1385 /* Stats are not available from OVS, so use ioctl stats. */
1388 stats->rx_errors += dev_stats.rx_errors;
1389 stats->tx_errors += dev_stats.tx_errors;
1390 stats->rx_dropped += dev_stats.rx_dropped;
1391 stats->tx_dropped += dev_stats.tx_dropped;
1392 stats->multicast += dev_stats.multicast;
1393 stats->collisions += dev_stats.collisions;
1394 stats->rx_length_errors += dev_stats.rx_length_errors;
1395 stats->rx_over_errors += dev_stats.rx_over_errors;
1396 stats->rx_crc_errors += dev_stats.rx_crc_errors;
1397 stats->rx_frame_errors += dev_stats.rx_frame_errors;
1398 stats->rx_fifo_errors += dev_stats.rx_fifo_errors;
1399 stats->rx_missed_errors += dev_stats.rx_missed_errors;
1400 stats->tx_aborted_errors += dev_stats.tx_aborted_errors;
1401 stats->tx_carrier_errors += dev_stats.tx_carrier_errors;
1402 stats->tx_fifo_errors += dev_stats.tx_fifo_errors;
1403 stats->tx_heartbeat_errors += dev_stats.tx_heartbeat_errors;
1404 stats->tx_window_errors += dev_stats.tx_window_errors;
1409 /* Retrieves current device stats for 'netdev-tap' netdev or
1410 * netdev-internal. */
1412 netdev_tap_get_stats(const struct netdev *netdev_, struct netdev_stats *stats)
1414 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1415 struct netdev_stats dev_stats;
1418 get_stats_via_vport(netdev_, stats);
1420 error = netdev_linux_sys_get_stats(netdev_, &dev_stats);
1422 if (netdev->vport_stats_error) {
1429 /* If this port is an internal port then the transmit and receive stats
1430 * will appear to be swapped relative to the other ports since we are the
1431 * one sending the data, not a remote computer. For consistency, we swap
1432 * them back here. This does not apply if we are getting stats from the
1433 * vport layer because it always tracks stats from the perspective of the
1435 if (netdev->vport_stats_error) {
1437 swap_uint64(&stats->rx_packets, &stats->tx_packets);
1438 swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
1439 swap_uint64(&stats->rx_errors, &stats->tx_errors);
1440 swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
1441 stats->rx_length_errors = 0;
1442 stats->rx_over_errors = 0;
1443 stats->rx_crc_errors = 0;
1444 stats->rx_frame_errors = 0;
1445 stats->rx_fifo_errors = 0;
1446 stats->rx_missed_errors = 0;
1447 stats->tx_aborted_errors = 0;
1448 stats->tx_carrier_errors = 0;
1449 stats->tx_fifo_errors = 0;
1450 stats->tx_heartbeat_errors = 0;
1451 stats->tx_window_errors = 0;
1453 stats->rx_dropped += dev_stats.tx_dropped;
1454 stats->tx_dropped += dev_stats.rx_dropped;
1456 stats->rx_errors += dev_stats.tx_errors;
1457 stats->tx_errors += dev_stats.rx_errors;
1459 stats->multicast += dev_stats.multicast;
1460 stats->collisions += dev_stats.collisions;
1466 netdev_internal_get_stats(const struct netdev *netdev_,
1467 struct netdev_stats *stats)
1469 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1471 get_stats_via_vport(netdev_, stats);
1472 return netdev->vport_stats_error;
1476 netdev_internal_set_stats(struct netdev *netdev,
1477 const struct netdev_stats *stats)
1479 struct ovs_vport_stats vport_stats;
1480 struct dpif_linux_vport vport;
1483 vport_stats.rx_packets = stats->rx_packets;
1484 vport_stats.tx_packets = stats->tx_packets;
1485 vport_stats.rx_bytes = stats->rx_bytes;
1486 vport_stats.tx_bytes = stats->tx_bytes;
1487 vport_stats.rx_errors = stats->rx_errors;
1488 vport_stats.tx_errors = stats->tx_errors;
1489 vport_stats.rx_dropped = stats->rx_dropped;
1490 vport_stats.tx_dropped = stats->tx_dropped;
1492 dpif_linux_vport_init(&vport);
1493 vport.cmd = OVS_VPORT_CMD_SET;
1494 vport.name = netdev_get_name(netdev);
1495 vport.stats = &vport_stats;
1497 err = dpif_linux_vport_transact(&vport, NULL, NULL);
1499 /* If the vport layer doesn't know about the device, that doesn't mean it
1500 * doesn't exist (after all, we were able to open it when netdev_open() was
1501 * called), it just means that it isn't attached and we'll be getting
1502 * stats a different way. */
1503 if (err == ENODEV) {
1511 netdev_linux_read_features(struct netdev_linux *netdev)
1513 struct ethtool_cmd ecmd;
1517 if (netdev->cache_valid & VALID_FEATURES) {
1521 COVERAGE_INC(netdev_get_ethtool);
1522 memset(&ecmd, 0, sizeof ecmd);
1523 error = netdev_linux_do_ethtool(netdev->up.name, &ecmd,
1524 ETHTOOL_GSET, "ETHTOOL_GSET");
1529 /* Supported features. */
1530 netdev->supported = 0;
1531 if (ecmd.supported & SUPPORTED_10baseT_Half) {
1532 netdev->supported |= NETDEV_F_10MB_HD;
1534 if (ecmd.supported & SUPPORTED_10baseT_Full) {
1535 netdev->supported |= NETDEV_F_10MB_FD;
1537 if (ecmd.supported & SUPPORTED_100baseT_Half) {
1538 netdev->supported |= NETDEV_F_100MB_HD;
1540 if (ecmd.supported & SUPPORTED_100baseT_Full) {
1541 netdev->supported |= NETDEV_F_100MB_FD;
1543 if (ecmd.supported & SUPPORTED_1000baseT_Half) {
1544 netdev->supported |= NETDEV_F_1GB_HD;
1546 if (ecmd.supported & SUPPORTED_1000baseT_Full) {
1547 netdev->supported |= NETDEV_F_1GB_FD;
1549 if (ecmd.supported & SUPPORTED_10000baseT_Full) {
1550 netdev->supported |= NETDEV_F_10GB_FD;
1552 if (ecmd.supported & SUPPORTED_TP) {
1553 netdev->supported |= NETDEV_F_COPPER;
1555 if (ecmd.supported & SUPPORTED_FIBRE) {
1556 netdev->supported |= NETDEV_F_FIBER;
1558 if (ecmd.supported & SUPPORTED_Autoneg) {
1559 netdev->supported |= NETDEV_F_AUTONEG;
1561 if (ecmd.supported & SUPPORTED_Pause) {
1562 netdev->supported |= NETDEV_F_PAUSE;
1564 if (ecmd.supported & SUPPORTED_Asym_Pause) {
1565 netdev->supported |= NETDEV_F_PAUSE_ASYM;
1568 /* Advertised features. */
1569 netdev->advertised = 0;
1570 if (ecmd.advertising & ADVERTISED_10baseT_Half) {
1571 netdev->advertised |= NETDEV_F_10MB_HD;
1573 if (ecmd.advertising & ADVERTISED_10baseT_Full) {
1574 netdev->advertised |= NETDEV_F_10MB_FD;
1576 if (ecmd.advertising & ADVERTISED_100baseT_Half) {
1577 netdev->advertised |= NETDEV_F_100MB_HD;
1579 if (ecmd.advertising & ADVERTISED_100baseT_Full) {
1580 netdev->advertised |= NETDEV_F_100MB_FD;
1582 if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
1583 netdev->advertised |= NETDEV_F_1GB_HD;
1585 if (ecmd.advertising & ADVERTISED_1000baseT_Full) {
1586 netdev->advertised |= NETDEV_F_1GB_FD;
1588 if (ecmd.advertising & ADVERTISED_10000baseT_Full) {
1589 netdev->advertised |= NETDEV_F_10GB_FD;
1591 if (ecmd.advertising & ADVERTISED_TP) {
1592 netdev->advertised |= NETDEV_F_COPPER;
1594 if (ecmd.advertising & ADVERTISED_FIBRE) {
1595 netdev->advertised |= NETDEV_F_FIBER;
1597 if (ecmd.advertising & ADVERTISED_Autoneg) {
1598 netdev->advertised |= NETDEV_F_AUTONEG;
1600 if (ecmd.advertising & ADVERTISED_Pause) {
1601 netdev->advertised |= NETDEV_F_PAUSE;
1603 if (ecmd.advertising & ADVERTISED_Asym_Pause) {
1604 netdev->advertised |= NETDEV_F_PAUSE_ASYM;
1607 /* Current settings. */
1609 if (speed == SPEED_10) {
1610 netdev->current = ecmd.duplex ? NETDEV_F_10MB_FD : NETDEV_F_10MB_HD;
1611 } else if (speed == SPEED_100) {
1612 netdev->current = ecmd.duplex ? NETDEV_F_100MB_FD : NETDEV_F_100MB_HD;
1613 } else if (speed == SPEED_1000) {
1614 netdev->current = ecmd.duplex ? NETDEV_F_1GB_FD : NETDEV_F_1GB_HD;
1615 } else if (speed == SPEED_10000) {
1616 netdev->current = NETDEV_F_10GB_FD;
1617 } else if (speed == 40000) {
1618 netdev->current = NETDEV_F_40GB_FD;
1619 } else if (speed == 100000) {
1620 netdev->current = NETDEV_F_100GB_FD;
1621 } else if (speed == 1000000) {
1622 netdev->current = NETDEV_F_1TB_FD;
1624 netdev->current = 0;
1627 if (ecmd.port == PORT_TP) {
1628 netdev->current |= NETDEV_F_COPPER;
1629 } else if (ecmd.port == PORT_FIBRE) {
1630 netdev->current |= NETDEV_F_FIBER;
1634 netdev->current |= NETDEV_F_AUTONEG;
1638 netdev->cache_valid |= VALID_FEATURES;
1639 netdev->get_features_error = error;
1642 /* Stores the features supported by 'netdev' into '*current', '*advertised',
1643 * '*supported', and '*peer'. Each value is a bitmap of NETDEV_* bits.
1644 * Returns 0 if successful, otherwise a positive errno value. */
1646 netdev_linux_get_features(const struct netdev *netdev_,
1647 enum netdev_features *current,
1648 enum netdev_features *advertised,
1649 enum netdev_features *supported,
1650 enum netdev_features *peer)
1652 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1654 netdev_linux_read_features(netdev);
1656 if (!netdev->get_features_error) {
1657 *current = netdev->current;
1658 *advertised = netdev->advertised;
1659 *supported = netdev->supported;
1660 *peer = 0; /* XXX */
1662 return netdev->get_features_error;
1665 /* Set the features advertised by 'netdev' to 'advertise'. */
1667 netdev_linux_set_advertisements(struct netdev *netdev,
1668 enum netdev_features advertise)
1670 struct ethtool_cmd ecmd;
1673 COVERAGE_INC(netdev_get_ethtool);
1674 memset(&ecmd, 0, sizeof ecmd);
1675 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1676 ETHTOOL_GSET, "ETHTOOL_GSET");
1681 ecmd.advertising = 0;
1682 if (advertise & NETDEV_F_10MB_HD) {
1683 ecmd.advertising |= ADVERTISED_10baseT_Half;
1685 if (advertise & NETDEV_F_10MB_FD) {
1686 ecmd.advertising |= ADVERTISED_10baseT_Full;
1688 if (advertise & NETDEV_F_100MB_HD) {
1689 ecmd.advertising |= ADVERTISED_100baseT_Half;
1691 if (advertise & NETDEV_F_100MB_FD) {
1692 ecmd.advertising |= ADVERTISED_100baseT_Full;
1694 if (advertise & NETDEV_F_1GB_HD) {
1695 ecmd.advertising |= ADVERTISED_1000baseT_Half;
1697 if (advertise & NETDEV_F_1GB_FD) {
1698 ecmd.advertising |= ADVERTISED_1000baseT_Full;
1700 if (advertise & NETDEV_F_10GB_FD) {
1701 ecmd.advertising |= ADVERTISED_10000baseT_Full;
1703 if (advertise & NETDEV_F_COPPER) {
1704 ecmd.advertising |= ADVERTISED_TP;
1706 if (advertise & NETDEV_F_FIBER) {
1707 ecmd.advertising |= ADVERTISED_FIBRE;
1709 if (advertise & NETDEV_F_AUTONEG) {
1710 ecmd.advertising |= ADVERTISED_Autoneg;
1712 if (advertise & NETDEV_F_PAUSE) {
1713 ecmd.advertising |= ADVERTISED_Pause;
1715 if (advertise & NETDEV_F_PAUSE_ASYM) {
1716 ecmd.advertising |= ADVERTISED_Asym_Pause;
1718 COVERAGE_INC(netdev_set_ethtool);
1719 return netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1720 ETHTOOL_SSET, "ETHTOOL_SSET");
1723 /* Attempts to set input rate limiting (policing) policy. Returns 0 if
1724 * successful, otherwise a positive errno value. */
1726 netdev_linux_set_policing(struct netdev *netdev_,
1727 uint32_t kbits_rate, uint32_t kbits_burst)
1729 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1730 const char *netdev_name = netdev_get_name(netdev_);
1734 kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
1735 : !kbits_burst ? 1000 /* Default to 1000 kbits if 0. */
1736 : kbits_burst); /* Stick with user-specified value. */
1738 if (netdev->cache_valid & VALID_POLICING) {
1739 if (netdev->netdev_policing_error) {
1740 return netdev->netdev_policing_error;
1743 if (netdev->kbits_rate == kbits_rate &&
1744 netdev->kbits_burst == kbits_burst) {
1745 /* Assume that settings haven't changed since we last set them. */
1748 netdev->cache_valid &= ~VALID_POLICING;
1751 COVERAGE_INC(netdev_set_policing);
1752 /* Remove any existing ingress qdisc. */
1753 error = tc_add_del_ingress_qdisc(netdev_, false);
1755 VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
1756 netdev_name, ovs_strerror(error));
1761 error = tc_add_del_ingress_qdisc(netdev_, true);
1763 VLOG_WARN_RL(&rl, "%s: adding policing qdisc failed: %s",
1764 netdev_name, ovs_strerror(error));
1768 error = tc_add_policer(netdev_, kbits_rate, kbits_burst);
1770 VLOG_WARN_RL(&rl, "%s: adding policing action failed: %s",
1771 netdev_name, ovs_strerror(error));
1776 netdev->kbits_rate = kbits_rate;
1777 netdev->kbits_burst = kbits_burst;
1780 if (!error || error == ENODEV) {
1781 netdev->netdev_policing_error = error;
1782 netdev->cache_valid |= VALID_POLICING;
1788 netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
1791 const struct tc_ops *const *opsp;
1793 for (opsp = tcs; *opsp != NULL; opsp++) {
1794 const struct tc_ops *ops = *opsp;
1795 if (ops->tc_install && ops->ovs_name[0] != '\0') {
1796 sset_add(types, ops->ovs_name);
1802 static const struct tc_ops *
1803 tc_lookup_ovs_name(const char *name)
1805 const struct tc_ops *const *opsp;
1807 for (opsp = tcs; *opsp != NULL; opsp++) {
1808 const struct tc_ops *ops = *opsp;
1809 if (!strcmp(name, ops->ovs_name)) {
1816 static const struct tc_ops *
1817 tc_lookup_linux_name(const char *name)
1819 const struct tc_ops *const *opsp;
1821 for (opsp = tcs; *opsp != NULL; opsp++) {
1822 const struct tc_ops *ops = *opsp;
1823 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
1830 static struct tc_queue *
1831 tc_find_queue__(const struct netdev *netdev_, unsigned int queue_id,
1834 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1835 struct tc_queue *queue;
1837 HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev->tc->queues) {
1838 if (queue->queue_id == queue_id) {
1845 static struct tc_queue *
1846 tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
1848 return tc_find_queue__(netdev, queue_id, hash_int(queue_id, 0));
1852 netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
1854 struct netdev_qos_capabilities *caps)
1856 const struct tc_ops *ops = tc_lookup_ovs_name(type);
1860 caps->n_queues = ops->n_queues;
1865 netdev_linux_get_qos(const struct netdev *netdev_,
1866 const char **typep, struct smap *details)
1868 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1871 error = tc_query_qdisc(netdev_);
1876 *typep = netdev->tc->ops->ovs_name;
1877 return (netdev->tc->ops->qdisc_get
1878 ? netdev->tc->ops->qdisc_get(netdev_, details)
1883 netdev_linux_set_qos(struct netdev *netdev_,
1884 const char *type, const struct smap *details)
1886 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1887 const struct tc_ops *new_ops;
1890 new_ops = tc_lookup_ovs_name(type);
1891 if (!new_ops || !new_ops->tc_install) {
1895 error = tc_query_qdisc(netdev_);
1900 if (new_ops == netdev->tc->ops) {
1901 return new_ops->qdisc_set ? new_ops->qdisc_set(netdev_, details) : 0;
1903 /* Delete existing qdisc. */
1904 error = tc_del_qdisc(netdev_);
1908 ovs_assert(netdev->tc == NULL);
1910 /* Install new qdisc. */
1911 error = new_ops->tc_install(netdev_, details);
1912 ovs_assert((error == 0) == (netdev->tc != NULL));
1919 netdev_linux_get_queue(const struct netdev *netdev_,
1920 unsigned int queue_id, struct smap *details)
1922 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1925 error = tc_query_qdisc(netdev_);
1929 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
1931 ? netdev->tc->ops->class_get(netdev_, queue, details)
1937 netdev_linux_set_queue(struct netdev *netdev_,
1938 unsigned int queue_id, const struct smap *details)
1940 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1943 error = tc_query_qdisc(netdev_);
1946 } else if (queue_id >= netdev->tc->ops->n_queues
1947 || !netdev->tc->ops->class_set) {
1951 return netdev->tc->ops->class_set(netdev_, queue_id, details);
1955 netdev_linux_delete_queue(struct netdev *netdev_, unsigned int queue_id)
1957 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1960 error = tc_query_qdisc(netdev_);
1963 } else if (!netdev->tc->ops->class_delete) {
1966 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
1968 ? netdev->tc->ops->class_delete(netdev_, queue)
1974 netdev_linux_get_queue_stats(const struct netdev *netdev_,
1975 unsigned int queue_id,
1976 struct netdev_queue_stats *stats)
1978 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1981 error = tc_query_qdisc(netdev_);
1984 } else if (!netdev->tc->ops->class_get_stats) {
1987 const struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
1991 stats->created = queue->created;
1992 return netdev->tc->ops->class_get_stats(netdev_, queue, stats);
1997 start_queue_dump(const struct netdev *netdev, struct nl_dump *dump)
1999 struct ofpbuf request;
2000 struct tcmsg *tcmsg;
2002 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
2006 tcmsg->tcm_parent = 0;
2007 nl_dump_start(dump, NETLINK_ROUTE, &request);
2008 ofpbuf_uninit(&request);
2013 netdev_linux_dump_queues(const struct netdev *netdev_,
2014 netdev_dump_queues_cb *cb, void *aux)
2016 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2017 struct tc_queue *queue, *next_queue;
2018 struct smap details;
2022 error = tc_query_qdisc(netdev_);
2025 } else if (!netdev->tc->ops->class_get) {
2030 smap_init(&details);
2031 HMAP_FOR_EACH_SAFE (queue, next_queue, hmap_node,
2032 &netdev->tc->queues) {
2033 smap_clear(&details);
2035 error = netdev->tc->ops->class_get(netdev_, queue, &details);
2037 (*cb)(queue->queue_id, &details, aux);
2042 smap_destroy(&details);
2048 netdev_linux_dump_queue_stats(const struct netdev *netdev_,
2049 netdev_dump_queue_stats_cb *cb, void *aux)
2051 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2052 struct nl_dump dump;
2057 error = tc_query_qdisc(netdev_);
2060 } else if (!netdev->tc->ops->class_dump_stats) {
2065 if (!start_queue_dump(netdev_, &dump)) {
2068 while (nl_dump_next(&dump, &msg)) {
2069 error = netdev->tc->ops->class_dump_stats(netdev_, &msg, cb, aux);
2075 error = nl_dump_done(&dump);
2076 return error ? error : last_error;
2080 netdev_linux_get_in4(const struct netdev *netdev_,
2081 struct in_addr *address, struct in_addr *netmask)
2083 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2085 if (!(netdev->cache_valid & VALID_IN4)) {
2088 error = netdev_linux_get_ipv4(netdev_, &netdev->address,
2089 SIOCGIFADDR, "SIOCGIFADDR");
2094 error = netdev_linux_get_ipv4(netdev_, &netdev->netmask,
2095 SIOCGIFNETMASK, "SIOCGIFNETMASK");
2100 netdev->cache_valid |= VALID_IN4;
2102 *address = netdev->address;
2103 *netmask = netdev->netmask;
2104 return address->s_addr == INADDR_ANY ? EADDRNOTAVAIL : 0;
2108 netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
2109 struct in_addr netmask)
2111 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2114 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
2116 netdev->cache_valid |= VALID_IN4;
2117 netdev->address = address;
2118 netdev->netmask = netmask;
2119 if (address.s_addr != INADDR_ANY) {
2120 error = do_set_addr(netdev_, SIOCSIFNETMASK,
2121 "SIOCSIFNETMASK", netmask);
2128 parse_if_inet6_line(const char *line,
2129 struct in6_addr *in6, char ifname[16 + 1])
2131 uint8_t *s6 = in6->s6_addr;
2132 #define X8 "%2"SCNx8
2134 " "X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8
2135 "%*x %*x %*x %*x %16s\n",
2136 &s6[0], &s6[1], &s6[2], &s6[3],
2137 &s6[4], &s6[5], &s6[6], &s6[7],
2138 &s6[8], &s6[9], &s6[10], &s6[11],
2139 &s6[12], &s6[13], &s6[14], &s6[15],
2143 /* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address (if
2144 * 'in6' is non-null) and returns true. Otherwise, returns false. */
2146 netdev_linux_get_in6(const struct netdev *netdev_, struct in6_addr *in6)
2148 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2149 if (!(netdev->cache_valid & VALID_IN6)) {
2153 netdev->in6 = in6addr_any;
2155 file = fopen("/proc/net/if_inet6", "r");
2157 const char *name = netdev_get_name(netdev_);
2158 while (fgets(line, sizeof line, file)) {
2159 struct in6_addr in6_tmp;
2160 char ifname[16 + 1];
2161 if (parse_if_inet6_line(line, &in6_tmp, ifname)
2162 && !strcmp(name, ifname))
2164 netdev->in6 = in6_tmp;
2170 netdev->cache_valid |= VALID_IN6;
2177 make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
2179 struct sockaddr_in sin;
2180 memset(&sin, 0, sizeof sin);
2181 sin.sin_family = AF_INET;
2182 sin.sin_addr = addr;
2185 memset(sa, 0, sizeof *sa);
2186 memcpy(sa, &sin, sizeof sin);
2190 do_set_addr(struct netdev *netdev,
2191 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
2194 ovs_strzcpy(ifr.ifr_name, netdev_get_name(netdev), sizeof ifr.ifr_name);
2195 make_in4_sockaddr(&ifr.ifr_addr, addr);
2197 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
2201 /* Adds 'router' as a default IP gateway. */
2203 netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
2205 struct in_addr any = { INADDR_ANY };
2209 memset(&rt, 0, sizeof rt);
2210 make_in4_sockaddr(&rt.rt_dst, any);
2211 make_in4_sockaddr(&rt.rt_gateway, router);
2212 make_in4_sockaddr(&rt.rt_genmask, any);
2213 rt.rt_flags = RTF_UP | RTF_GATEWAY;
2214 error = ioctl(af_inet_sock, SIOCADDRT, &rt) < 0 ? errno : 0;
2216 VLOG_WARN("ioctl(SIOCADDRT): %s", ovs_strerror(error));
2222 netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
2225 static const char fn[] = "/proc/net/route";
2230 *netdev_name = NULL;
2231 stream = fopen(fn, "r");
2232 if (stream == NULL) {
2233 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, ovs_strerror(errno));
2238 while (fgets(line, sizeof line, stream)) {
2241 ovs_be32 dest, gateway, mask;
2242 int refcnt, metric, mtu;
2243 unsigned int flags, use, window, irtt;
2246 "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
2248 iface, &dest, &gateway, &flags, &refcnt,
2249 &use, &metric, &mask, &mtu, &window, &irtt) != 11) {
2251 VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
2255 if (!(flags & RTF_UP)) {
2256 /* Skip routes that aren't up. */
2260 /* The output of 'dest', 'mask', and 'gateway' were given in
1261 * network byte order, so we don't need any endian
2262 * conversions here. */
2263 if ((dest & mask) == (host->s_addr & mask)) {
2265 /* The host is directly reachable. */
2266 next_hop->s_addr = 0;
2268 /* To reach the host, we must go through a gateway. */
2269 next_hop->s_addr = gateway;
2271 *netdev_name = xstrdup(iface);
2283 netdev_linux_get_status(const struct netdev *netdev_, struct smap *smap)
2285 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2288 if (!(netdev->cache_valid & VALID_DRVINFO)) {
2289 struct ethtool_cmd *cmd = (struct ethtool_cmd *) &netdev->drvinfo;
2291 COVERAGE_INC(netdev_get_ethtool);
2292 memset(&netdev->drvinfo, 0, sizeof netdev->drvinfo);
2293 error = netdev_linux_do_ethtool(netdev->up.name,
2296 "ETHTOOL_GDRVINFO");
2298 netdev->cache_valid |= VALID_DRVINFO;
2303 smap_add(smap, "driver_name", netdev->drvinfo.driver);
2304 smap_add(smap, "driver_version", netdev->drvinfo.version);
2305 smap_add(smap, "firmware_version", netdev->drvinfo.fw_version);
2311 netdev_internal_get_status(const struct netdev *netdev OVS_UNUSED,
2314 smap_add(smap, "driver_name", "openvswitch");
2318 /* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
2319 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
2320 * returns 0. Otherwise, it returns a positive errno value; in particular,
1321 * ENXIO indicates that there is no ARP table entry for 'ip' on 'netdev'. */
2323 netdev_linux_arp_lookup(const struct netdev *netdev,
2324 ovs_be32 ip, uint8_t mac[ETH_ADDR_LEN])
2327 struct sockaddr_in sin;
2330 memset(&r, 0, sizeof r);
2331 memset(&sin, 0, sizeof sin);
2332 sin.sin_family = AF_INET;
2333 sin.sin_addr.s_addr = ip;
2335 memcpy(&r.arp_pa, &sin, sizeof sin);
2336 r.arp_ha.sa_family = ARPHRD_ETHER;
2338 ovs_strzcpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
2339 COVERAGE_INC(netdev_arp_lookup);
2340 retval = ioctl(af_inet_sock, SIOCGARP, &r) < 0 ? errno : 0;
2342 memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
2343 } else if (retval != ENXIO) {
2344 VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
2345 netdev_get_name(netdev), IP_ARGS(ip),
2346 ovs_strerror(retval));
2352 nd_to_iff_flags(enum netdev_flags nd)
2355 if (nd & NETDEV_UP) {
2358 if (nd & NETDEV_PROMISC) {
2365 iff_to_nd_flags(int iff)
2367 enum netdev_flags nd = 0;
2371 if (iff & IFF_PROMISC) {
2372 nd |= NETDEV_PROMISC;
2378 netdev_linux_update_flags(struct netdev *netdev_, enum netdev_flags off,
2379 enum netdev_flags on, enum netdev_flags *old_flagsp)
2381 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2382 int old_flags, new_flags;
2385 old_flags = netdev->ifi_flags;
2386 *old_flagsp = iff_to_nd_flags(old_flags);
2387 new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
2388 if (new_flags != old_flags) {
2389 error = set_flags(netdev_get_name(netdev_), new_flags);
2390 get_flags(netdev_, &netdev->ifi_flags);
2396 netdev_linux_change_seq(const struct netdev *netdev)
2398 return netdev_linux_cast(netdev)->change_seq;
2401 #define NETDEV_LINUX_CLASS(NAME, CREATE, GET_STATS, SET_STATS, \
2402 GET_FEATURES, GET_STATUS) \
2406 netdev_linux_init, \
2408 netdev_linux_wait, \
2411 netdev_linux_destroy, \
2412 NULL, /* get_config */ \
2413 NULL, /* set_config */ \
2414 NULL, /* get_tunnel_config */ \
2416 netdev_linux_rx_open, \
2418 netdev_linux_send, \
2419 netdev_linux_send_wait, \
2421 netdev_linux_set_etheraddr, \
2422 netdev_linux_get_etheraddr, \
2423 netdev_linux_get_mtu, \
2424 netdev_linux_set_mtu, \
2425 netdev_linux_get_ifindex, \
2426 netdev_linux_get_carrier, \
2427 netdev_linux_get_carrier_resets, \
2428 netdev_linux_set_miimon_interval, \
2433 netdev_linux_set_advertisements, \
2435 netdev_linux_set_policing, \
2436 netdev_linux_get_qos_types, \
2437 netdev_linux_get_qos_capabilities, \
2438 netdev_linux_get_qos, \
2439 netdev_linux_set_qos, \
2440 netdev_linux_get_queue, \
2441 netdev_linux_set_queue, \
2442 netdev_linux_delete_queue, \
2443 netdev_linux_get_queue_stats, \
2444 netdev_linux_dump_queues, \
2445 netdev_linux_dump_queue_stats, \
2447 netdev_linux_get_in4, \
2448 netdev_linux_set_in4, \
2449 netdev_linux_get_in6, \
2450 netdev_linux_add_router, \
2451 netdev_linux_get_next_hop, \
2453 netdev_linux_arp_lookup, \
2455 netdev_linux_update_flags, \
2457 netdev_linux_change_seq \
2460 const struct netdev_class netdev_linux_class =
2463 netdev_linux_create,
2464 netdev_linux_get_stats,
2465 NULL, /* set_stats */
2466 netdev_linux_get_features,
2467 netdev_linux_get_status);
2469 const struct netdev_class netdev_tap_class =
2472 netdev_linux_create_tap,
2473 netdev_tap_get_stats,
2474 NULL, /* set_stats */
2475 netdev_linux_get_features,
2476 netdev_linux_get_status);
2478 const struct netdev_class netdev_internal_class =
2481 netdev_linux_create,
2482 netdev_internal_get_stats,
2483 netdev_internal_set_stats,
2484 NULL, /* get_features */
2485 netdev_internal_get_status);
2487 static const struct netdev_rx_class netdev_rx_linux_class = {
2488 netdev_rx_linux_destroy,
2489 netdev_rx_linux_recv,
2490 netdev_rx_linux_wait,
2491 netdev_rx_linux_drain,
2494 /* HTB traffic control class. */
2496 #define HTB_N_QUEUES 0xf000
2500 unsigned int max_rate; /* In bytes/s. */
2504 struct tc_queue tc_queue;
2505 unsigned int min_rate; /* In bytes/s. */
2506 unsigned int max_rate; /* In bytes/s. */
2507 unsigned int burst; /* In bytes. */
2508 unsigned int priority; /* Lower values are higher priorities. */
2512 htb_get__(const struct netdev *netdev_)
2514 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2515 return CONTAINER_OF(netdev->tc, struct htb, tc);
2519 htb_install__(struct netdev *netdev_, uint64_t max_rate)
2521 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2524 htb = xmalloc(sizeof *htb);
2525 tc_init(&htb->tc, &tc_ops_htb);
2526 htb->max_rate = max_rate;
2528 netdev->tc = &htb->tc;
/* htb_setup_qdisc__(): removes any existing qdisc with tc_del_qdisc(), then
 * sends an RTM_NEWQDISC request (NLM_F_EXCL | NLM_F_CREATE) for a root "htb"
 * qdisc with handle 1:0 and rate2quantum 10, and returns tc_transact()'s
 * result.
 *
 * NOTE(review): several original lines are missing from this view (the
 * declaration of 'opt_offset', whatever follows the tc_make_request() call,
 * and possibly further 'opt' fields), so handling of a NULL 'tcmsg' and the
 * "default 1" setting cannot be confirmed here. */
2531 /* Create an HTB qdisc.
2533 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
2535 htb_setup_qdisc__(struct netdev *netdev)
2538 struct tc_htb_glob opt;
2539 struct ofpbuf request;
2540 struct tcmsg *tcmsg;
2542 tc_del_qdisc(netdev);
2544 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2545 NLM_F_EXCL | NLM_F_CREATE, &request);
2549 tcmsg->tcm_handle = tc_make_handle(1, 0);
2550 tcmsg->tcm_parent = TC_H_ROOT;
2552 nl_msg_put_string(&request, TCA_KIND, "htb");
2554 memset(&opt, 0, sizeof opt);
2555 opt.rate2quantum = 10;
2559 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2560 nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
2561 nl_msg_end_nested(&request, opt_offset);
2563 return tc_transact(&request, NULL);
/* htb_setup_class__(): creates or replaces HTB class 'handle' under 'parent'
 * on 'netdev' using the rates, burst and priority in 'class'.
 *
 * The device MTU feeds both the rate tables (tc_fill_rate) and the buffer
 * sizes (tc_calc_buffer).  The RTM_NEWTCLASS request carries TCA_HTB_PARMS
 * plus the rate and ceil tables.  A warning naming the class parameters and
 * ovs_strerror(error) is logged for a failed transaction.
 *
 * NOTE(review): the conditions guarding both warnings, the handling of a NULL
 * 'tcmsg', and the return statement are on lines missing from this view. */
2566 /* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
2567 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
2569 htb_setup_class__(struct netdev *netdev, unsigned int handle,
2570 unsigned int parent, struct htb_class *class)
2573 struct tc_htb_opt opt;
2574 struct ofpbuf request;
2575 struct tcmsg *tcmsg;
2579 error = netdev_get_mtu(netdev, &mtu);
2581 VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU",
2582 netdev_get_name(netdev));
2586 memset(&opt, 0, sizeof opt);
2587 tc_fill_rate(&opt.rate, class->min_rate, mtu);
2588 tc_fill_rate(&opt.ceil, class->max_rate, mtu);
2589 opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
2590 opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
2591 opt.prio = class->priority;
2593 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
2597 tcmsg->tcm_handle = handle;
2598 tcmsg->tcm_parent = parent;
2600 nl_msg_put_string(&request, TCA_KIND, "htb");
2601 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2602 nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
2603 tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
2604 tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
2605 nl_msg_end_nested(&request, opt_offset);
2607 error = tc_transact(&request, NULL);
2609 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
2610 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
2611 netdev_get_name(netdev),
2612 tc_get_major(handle), tc_get_minor(handle),
2613 tc_get_major(parent), tc_get_minor(parent),
2614 class->min_rate, class->max_rate,
2615 class->burst, class->priority, ovs_strerror(error));
/* htb_parse_tca_options__(): parses the nested attributes in 'nl_options',
 * requiring a TCA_HTB_PARMS attribute at least as large as struct tc_htb_opt,
 * and fills 'class' from it: min_rate from the rate spec, max_rate from the
 * ceil spec, burst by converting the buffer (in ticks) back to bytes at the
 * class rate, and priority from 'prio'.  A parse failure is logged.
 *
 * The older comment below calls these parameters 'options' and 'details';
 * the actual parameter names are 'nl_options' and 'class'.
 *
 * NOTE(review): the return statements are on lines missing from this view. */
2620 /* Parses Netlink attributes in 'options' for HTB parameters and stores a
2621 * description of them into 'details'. The description complies with the
2622 * specification given in the vswitch database documentation for linux-htb
2625 htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
2627 static const struct nl_policy tca_htb_policy[] = {
2628 [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
2629 .min_len = sizeof(struct tc_htb_opt) },
2632 struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
2633 const struct tc_htb_opt *htb;
2635 if (!nl_parse_nested(nl_options, tca_htb_policy,
2636 attrs, ARRAY_SIZE(tca_htb_policy))) {
2637 VLOG_WARN_RL(&rl, "failed to parse HTB class options");
2641 htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
2642 class->min_rate = htb->rate.rate;
2643 class->max_rate = htb->ceil.rate;
2644 class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
2645 class->priority = htb->prio;
/* htb_parse_tcmsg__(): decodes the class message 'tcmsg' with
 * tc_parse_class(), which also fills 'stats'.  If 'queue_id' is nonnull and
 * the class handle is 1:<minor> with 1 <= minor <= HTB_N_QUEUES, stores
 * minor - 1 in '*queue_id'.  If 'options' is nonnull and no error has
 * occurred, the class's HTB attributes are parsed into '*options'.
 *
 * NOTE(review): the branch taken for an out-of-range handle and the return
 * statement are on lines missing from this view. */
2650 htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2651 struct htb_class *options,
2652 struct netdev_queue_stats *stats)
2654 struct nlattr *nl_options;
2655 unsigned int handle;
2658 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2659 if (!error && queue_id) {
2660 unsigned int major = tc_get_major(handle);
2661 unsigned int minor = tc_get_minor(handle);
2662 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2663 *queue_id = minor - 1;
2668 if (!error && options) {
2669 error = htb_parse_tca_options__(nl_options, options);
2675 htb_parse_qdisc_details__(struct netdev *netdev,
2676 const struct smap *details, struct htb_class *hc)
2678 const char *max_rate_s;
2680 max_rate_s = smap_get(details, "max-rate");
2681 hc->max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
2682 if (!hc->max_rate) {
2683 enum netdev_features current;
2685 netdev_get_features(netdev, ¤t, NULL, NULL, NULL);
2686 hc->max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
2688 hc->min_rate = hc->max_rate;
/* htb_parse_class_details__(): derives HTB class parameters for 'hc' from the
 * "min-rate", "max-rate", "burst" and "priority" keys of 'details'.  Rates and
 * burst are divided by 8 on input (bits to bytes).  min_rate is raised to at
 * least the MTU and capped at the qdisc's max_rate; max_rate is kept between
 * min_rate and the qdisc's max_rate; burst is at least MTU + 64; priority
 * defaults to 0.  A warning is logged for a device without an MTU.
 *
 * NOTE(review): the default used for max_rate when "max-rate" is absent, the
 * error-path condition and the return statements are on lines missing from
 * this view. */
2694 htb_parse_class_details__(struct netdev *netdev,
2695 const struct smap *details, struct htb_class *hc)
2697 const struct htb *htb = htb_get__(netdev);
2698 const char *min_rate_s = smap_get(details, "min-rate");
2699 const char *max_rate_s = smap_get(details, "max-rate");
2700 const char *burst_s = smap_get(details, "burst");
2701 const char *priority_s = smap_get(details, "priority");
2704 error = netdev_get_mtu(netdev, &mtu);
2706 VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU",
2707 netdev_get_name(netdev));
2711 /* HTB requires at least an mtu sized min-rate to send any traffic even
2712 * on uncongested links. */
2713 hc->min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
2714 hc->min_rate = MAX(hc->min_rate, mtu);
2715 hc->min_rate = MIN(hc->min_rate, htb->max_rate);
2718 hc->max_rate = (max_rate_s
2719 ? strtoull(max_rate_s, NULL, 10) / 8
2721 hc->max_rate = MAX(hc->max_rate, hc->min_rate);
2722 hc->max_rate = MIN(hc->max_rate, htb->max_rate);
2726 * According to hints in the documentation that I've read, it is important
2727 * that 'burst' be at least as big as the largest frame that might be
2728 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
2729 * but having it a bit too small is a problem. Since netdev_get_mtu()
2730 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
2731 * the MTU. We actually add 64, instead of 14, as a guard against
2732 * additional headers get tacked on somewhere that we're not aware of. */
2733 hc->burst = burst_s ? strtoull(burst_s, NULL, 10) / 8 : 0;
2734 hc->burst = MAX(hc->burst, mtu + 64);
2737 hc->priority = priority_s ? strtoul(priority_s, NULL, 10) : 0;
/* htb_query_class__(): asks the kernel about class 'handle' (child of
 * 'parent') on 'netdev' via tc_query_class(), parses the reply with
 * htb_parse_tcmsg__() into 'options' and 'stats', and deletes the reply.
 *
 * NOTE(review): the check on tc_query_class()'s result and the return
 * statement are on lines missing from this view. */
2743 htb_query_class__(const struct netdev *netdev, unsigned int handle,
2744 unsigned int parent, struct htb_class *options,
2745 struct netdev_queue_stats *stats)
2747 struct ofpbuf *reply;
2750 error = tc_query_class(netdev, handle, parent, &reply);
2752 error = htb_parse_tcmsg__(reply, NULL, options, stats);
2753 ofpbuf_delete(reply);
/* htb_tc_install(): configures HTB on 'netdev' from 'details'.  Sets up the
 * root qdisc, creates root class 1:fffe (parent 1:0) from the qdisc details,
 * and installs the in-memory htb state with that class's max_rate.
 *
 * NOTE(review): the error checks between these steps and the return statement
 * are on lines missing from this view. */
2759 htb_tc_install(struct netdev *netdev, const struct smap *details)
2763 error = htb_setup_qdisc__(netdev);
2765 struct htb_class hc;
2767 htb_parse_qdisc_details__(netdev, details, &hc);
2768 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2769 tc_make_handle(1, 0), &hc);
2771 htb_install__(netdev, hc.max_rate);
2777 static struct htb_class *
2778 htb_class_cast__(const struct tc_queue *queue)
2780 return CONTAINER_OF(queue, struct htb_class, tc_queue);
/* htb_update_queue__(): records the parameters in 'hc' for queue 'queue_id' of
 * 'netdev'.  The queue is looked up with tc_find_queue__() using a hash of
 * the queue ID.  The visible code also allocates a new htb_class, stamps it
 * with the queue ID and creation time, and inserts it into the HTB queue map
 * -- presumably only when the lookup finds nothing (the branch lines are
 * missing from this view).  Finally the four class parameters are copied. */
2784 htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
2785 const struct htb_class *hc)
2787 struct htb *htb = htb_get__(netdev);
2788 size_t hash = hash_int(queue_id, 0);
2789 struct tc_queue *queue;
2790 struct htb_class *hcp;
2792 queue = tc_find_queue__(netdev, queue_id, hash);
2794 hcp = htb_class_cast__(queue);
2796 hcp = xmalloc(sizeof *hcp);
2797 queue = &hcp->tc_queue;
2798 queue->queue_id = queue_id;
2799 queue->created = time_msec();
2800 hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
2803 hcp->min_rate = hc->min_rate;
2804 hcp->max_rate = hc->max_rate;
2805 hcp->burst = hc->burst;
2806 hcp->priority = hc->priority;
/* htb_tc_load(): rebuilds the in-memory HTB state for 'netdev' from the
 * kernel.  Queries root class 1:fffe to learn the qdisc-wide max_rate,
 * installs a new htb object, then walks a queue dump and records every class
 * that htb_parse_tcmsg__() accepts.  'nlmsg' is unused.
 *
 * NOTE(review): the result of htb_query_class__() is ignored, so
 * 'hc.max_rate' needs a value beforehand; that initialization, the
 * declaration of 'msg' and the return statements are not visible in this
 * view -- confirm they exist. */
2810 htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
2813 struct nl_dump dump;
2814 struct htb_class hc;
2816 /* Get qdisc options. */
2818 htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
2819 htb_install__(netdev, hc.max_rate);
2822 if (!start_queue_dump(netdev, &dump)) {
2825 while (nl_dump_next(&dump, &msg)) {
2826 unsigned int queue_id;
2828 if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
2829 htb_update_queue__(netdev, queue_id, &hc);
2832 nl_dump_done(&dump);
/* htb_tc_destroy(): tears down the HTB state that embeds 'tc', unlinking
 * every htb_class from the queue map.  The _SAFE iterator is used because
 * entries are removed during the walk.
 *
 * NOTE(review): the statements that release each class and the htb object
 * itself are not visible in this view. */
2838 htb_tc_destroy(struct tc *tc)
2840 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
2841 struct htb_class *hc, *next;
2843 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &htb->tc.queues) {
2844 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
2852 htb_qdisc_get(const struct netdev *netdev, struct smap *details)
2854 const struct htb *htb = htb_get__(netdev);
2855 smap_add_format(details, "max-rate", "%llu", 8ULL * htb->max_rate);
/* htb_qdisc_set(): reconfigures the HTB root class 1:fffe of 'netdev' from
 * 'details' and stores the new max_rate in the in-memory htb state.
 *
 * NOTE(review): the condition guarding the in-memory update and the return
 * statement are on lines missing from this view. */
2860 htb_qdisc_set(struct netdev *netdev, const struct smap *details)
2862 struct htb_class hc;
2865 htb_parse_qdisc_details__(netdev, details, &hc);
2866 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2867 tc_make_handle(1, 0), &hc);
2869 htb_get__(netdev)->max_rate = hc.max_rate;
/* htb_class_get(): describes HTB 'queue' in 'details'.  Rates and burst are
 * stored in bytes and reported in bits (multiplied by 8).  "max-rate" is
 * reported only when it differs from "min-rate".
 *
 * NOTE(review): any condition guarding "priority" and the return statement
 * are on lines missing from this view. */
2875 htb_class_get(const struct netdev *netdev OVS_UNUSED,
2876 const struct tc_queue *queue, struct smap *details)
2878 const struct htb_class *hc = htb_class_cast__(queue);
2880 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
2881 if (hc->min_rate != hc->max_rate) {
2882 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
2884 smap_add_format(details, "burst", "%llu", 8ULL * hc->burst);
2886 smap_add_format(details, "priority", "%u", hc->priority);
/* htb_class_set(): creates or updates HTB queue 'queue_id' on 'netdev' from
 * 'details'.  Parses the details, programs kernel class 1:(queue_id + 1)
 * under root class 1:fffe, then records the parameters in memory.
 *
 * NOTE(review): the error checks after each step and the return statement are
 * on lines missing from this view. */
2892 htb_class_set(struct netdev *netdev, unsigned int queue_id,
2893 const struct smap *details)
2895 struct htb_class hc;
2898 error = htb_parse_class_details__(netdev, details, &hc);
2903 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
2904 tc_make_handle(1, 0xfffe), &hc);
2909 htb_update_queue__(netdev, queue_id, &hc);
/* htb_class_delete(): deletes kernel class 1:(queue_id + 1) for 'queue' on
 * 'netdev' and unlinks the queue from the in-memory HTB queue map.
 *
 * NOTE(review): the condition guarding the unlink, the release of 'hc' and
 * the return statement are on lines missing from this view. */
2914 htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
2916 struct htb_class *hc = htb_class_cast__(queue);
2917 struct htb *htb = htb_get__(netdev);
2920 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
2922 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
2929 htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
2930 struct netdev_queue_stats *stats)
2932 return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
2933 tc_make_handle(1, 0xfffe), NULL, stats);
/* htb_class_dump_stats(): parses one class dump message 'nlmsg' and, when its
 * handle is 1:<minor> with 1 <= minor <= HTB_N_QUEUES, invokes 'cb' with
 * queue ID minor - 1, the parsed statistics and 'aux'.  Other handles are
 * passed over without calling 'cb'.
 *
 * NOTE(review): the handling of a tc_parse_class() error and the return
 * statements are on lines missing from this view. */
2937 htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
2938 const struct ofpbuf *nlmsg,
2939 netdev_dump_queue_stats_cb *cb, void *aux)
2941 struct netdev_queue_stats stats;
2942 unsigned int handle, major, minor;
2945 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
2950 major = tc_get_major(handle);
2951 minor = tc_get_minor(handle);
2952 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2953 (*cb)(minor - 1, &stats, aux);
/* Operations table for the "linux-htb" QoS type (kernel qdisc "htb"), with
 * HTB_N_QUEUES queues.  NOTE(review): the entries between n_queues and
 * htb_class_get_stats are on lines missing from this view; the table is
 * positional, so order must match struct tc_ops. */
2958 static const struct tc_ops tc_ops_htb = {
2959 "htb", /* linux_name */
2960 "linux-htb", /* ovs_name */
2961 HTB_N_QUEUES, /* n_queues */
2970 htb_class_get_stats,
2971 htb_class_dump_stats
2974 /* "linux-hfsc" traffic control class. */
/* Number of HFSC queues.  Class minors 1..HFSC_N_QUEUES under major 1 are
 * reported as queue IDs 0..HFSC_N_QUEUES-1 (queue ID == minor - 1). */
2976 #define HFSC_N_QUEUES 0xf000
/* Embedded generic queue; hfsc_class_cast__() recovers the containing
 * hfsc_class from a pointer to this member.  NOTE(review): the rest of the
 * struct is on lines missing from this view. */
2984 struct tc_queue tc_queue;
2989 static struct hfsc *
2990 hfsc_get__(const struct netdev *netdev_)
2992 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2993 return CONTAINER_OF(netdev->tc, struct hfsc, tc);
2996 static struct hfsc_class *
2997 hfsc_class_cast__(const struct tc_queue *queue)
2999 return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
3003 hfsc_install__(struct netdev *netdev_, uint32_t max_rate)
3005 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3008 hfsc = xmalloc(sizeof *hfsc);
3009 tc_init(&hfsc->tc, &tc_ops_hfsc);
3010 hfsc->max_rate = max_rate;
3011 netdev->tc = &hfsc->tc;
/* hfsc_update_queue__(): records the rates in 'hc' for queue 'queue_id' of
 * 'netdev'.  The queue is looked up with tc_find_queue__() using a hash of
 * the queue ID.  The visible code also allocates a new hfsc_class, stamps it
 * with the queue ID and creation time, and inserts it into the HFSC queue map
 * -- presumably only when the lookup finds nothing (the branch lines are
 * missing from this view).  Finally min_rate and max_rate are copied. */
3015 hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
3016 const struct hfsc_class *hc)
3020 struct hfsc_class *hcp;
3021 struct tc_queue *queue;
3023 hfsc = hfsc_get__(netdev);
3024 hash = hash_int(queue_id, 0);
3026 queue = tc_find_queue__(netdev, queue_id, hash);
3028 hcp = hfsc_class_cast__(queue);
3030 hcp = xmalloc(sizeof *hcp);
3031 queue = &hcp->tc_queue;
3032 queue->queue_id = queue_id;
3033 queue->created = time_msec();
3034 hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
3037 hcp->min_rate = hc->min_rate;
3038 hcp->max_rate = hc->max_rate;
/* hfsc_parse_tca_options__(): parses the nested HFSC attributes in
 * 'nl_options' and fills 'class' from the real-time (RSC), fair (FSC) and
 * upper-limit (USC) service curves.  Only the configuration this file itself
 * writes is accepted; a warning is logged when:
 *
 *   - any curve is non-linear (nonzero m1 or d),
 *   - the RSC and FSC slopes (m2) differ, or
 *   - the RSC slope exceeds the USC slope.
 *
 * Otherwise min_rate is the FSC slope and max_rate is the USC slope.
 *
 * NOTE(review): the policy's attribute indexes and the return statements are
 * on lines missing from this view. */
3042 hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
3044 const struct tc_service_curve *rsc, *fsc, *usc;
3045 static const struct nl_policy tca_hfsc_policy[] = {
3047 .type = NL_A_UNSPEC,
3049 .min_len = sizeof(struct tc_service_curve),
3052 .type = NL_A_UNSPEC,
3054 .min_len = sizeof(struct tc_service_curve),
3057 .type = NL_A_UNSPEC,
3059 .min_len = sizeof(struct tc_service_curve),
3062 struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];
3064 if (!nl_parse_nested(nl_options, tca_hfsc_policy,
3065 attrs, ARRAY_SIZE(tca_hfsc_policy))) {
3066 VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
3070 rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
3071 fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
3072 usc = nl_attr_get(attrs[TCA_HFSC_USC]);
3074 if (rsc->m1 != 0 || rsc->d != 0 ||
3075 fsc->m1 != 0 || fsc->d != 0 ||
3076 usc->m1 != 0 || usc->d != 0) {
3077 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3078 "Non-linear service curves are not supported.");
3082 if (rsc->m2 != fsc->m2) {
3083 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3084 "Real-time service curves are not supported ");
3088 if (rsc->m2 > usc->m2) {
3089 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3090 "Min-rate service curve is greater than "
3091 "the max-rate service curve.");
3095 class->min_rate = fsc->m2;
3096 class->max_rate = usc->m2;
/* hfsc_parse_tcmsg__(): decodes the class message 'tcmsg' with
 * tc_parse_class(), which also fills 'stats'.  A handle of 1:<minor> with
 * 1 <= minor <= HFSC_N_QUEUES is translated to queue ID minor - 1, and the
 * class's HFSC attributes are parsed into 'options'.
 *
 * NOTE(review): the null checks on 'queue_id' and 'options', the branch for an
 * out-of-range handle and the return statements are on lines missing from
 * this view. */
3101 hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
3102 struct hfsc_class *options,
3103 struct netdev_queue_stats *stats)
3106 unsigned int handle;
3107 struct nlattr *nl_options;
3109 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
3115 unsigned int major, minor;
3117 major = tc_get_major(handle);
3118 minor = tc_get_minor(handle);
3119 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3120 *queue_id = minor - 1;
3127 error = hfsc_parse_tca_options__(nl_options, options);
/* hfsc_query_class__(): asks the kernel about class 'handle' (child of
 * 'parent') on 'netdev' via tc_query_class(), parses the reply with
 * hfsc_parse_tcmsg__() into 'options' and 'stats', and deletes the reply.
 *
 * NOTE(review): the check on tc_query_class()'s result and the return
 * statements are on lines missing from this view. */
3134 hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
3135 unsigned int parent, struct hfsc_class *options,
3136 struct netdev_queue_stats *stats)
3139 struct ofpbuf *reply;
3141 error = tc_query_class(netdev, handle, parent, &reply);
3146 error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
3147 ofpbuf_delete(reply);
3152 hfsc_parse_qdisc_details__(struct netdev *netdev, const struct smap *details,
3153 struct hfsc_class *class)
3156 const char *max_rate_s;
3158 max_rate_s = smap_get(details, "max-rate");
3159 max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
3162 enum netdev_features current;
3164 netdev_get_features(netdev, ¤t, NULL, NULL, NULL);
3165 max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
3168 class->min_rate = max_rate;
3169 class->max_rate = max_rate;
/* hfsc_parse_class_details__(): derives HFSC class rates for 'class' from the
 * "min-rate" and "max-rate" keys of 'details'.  Both are divided by 8 on
 * input (bits/s to bytes/s).  min_rate is at least 1 and at most the qdisc's
 * max_rate; max_rate is kept between min_rate and the qdisc's max_rate.
 *
 * NOTE(review): the default used for max_rate when "max-rate" is absent and
 * the return statement are on lines missing from this view. */
3173 hfsc_parse_class_details__(struct netdev *netdev,
3174 const struct smap *details,
3175 struct hfsc_class * class)
3177 const struct hfsc *hfsc;
3178 uint32_t min_rate, max_rate;
3179 const char *min_rate_s, *max_rate_s;
3181 hfsc = hfsc_get__(netdev);
3182 min_rate_s = smap_get(details, "min-rate");
3183 max_rate_s = smap_get(details, "max-rate");
3185 min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
3186 min_rate = MAX(min_rate, 1);
3187 min_rate = MIN(min_rate, hfsc->max_rate);
3189 max_rate = (max_rate_s
3190 ? strtoull(max_rate_s, NULL, 10) / 8
3192 max_rate = MAX(max_rate, min_rate);
3193 max_rate = MIN(max_rate, hfsc->max_rate);
3195 class->min_rate = min_rate;
3196 class->max_rate = max_rate;
/* hfsc_setup_qdisc__(): removes any existing qdisc with tc_del_qdisc(), then
 * sends an RTM_NEWQDISC request (NLM_F_EXCL | NLM_F_CREATE) for a root "hfsc"
 * qdisc with handle 1:0, passing a struct tc_hfsc_qopt as TCA_OPTIONS, and
 * returns tc_transact()'s result.
 *
 * NOTE(review): the handling of a NULL 'tcmsg' and any fields set in 'opt'
 * after the memset (e.g. for "default 1") are on lines missing from this
 * view. */
3201 /* Create an HFSC qdisc.
3203 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
3205 hfsc_setup_qdisc__(struct netdev * netdev)
3207 struct tcmsg *tcmsg;
3208 struct ofpbuf request;
3209 struct tc_hfsc_qopt opt;
3211 tc_del_qdisc(netdev);
3213 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3214 NLM_F_EXCL | NLM_F_CREATE, &request);
3220 tcmsg->tcm_handle = tc_make_handle(1, 0);
3221 tcmsg->tcm_parent = TC_H_ROOT;
3223 memset(&opt, 0, sizeof opt);
3226 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3227 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
3229 return tc_transact(&request, NULL);
/* hfsc_setup_class__(): creates or replaces HFSC class 'handle' under 'parent'
 * on 'netdev'.  The same curve 'min' (slope class->min_rate) is sent as both
 * the real-time (RSC) and fair (FSC) service curves, and 'max' (slope
 * class->max_rate) as the upper-limit (USC) curve.  A warning naming the
 * class and ovs_strerror(error) is logged for a failed transaction.
 *
 * NOTE(review): the initialization of the curves' m1 and d fields, the
 * handling of a NULL 'tcmsg', the condition guarding the warning and the
 * return statement are on lines missing from this view. */
3232 /* Create an HFSC class.
3234 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
3235 * sc rate <min_rate> ul rate <max_rate>" */
3237 hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
3238 unsigned int parent, struct hfsc_class *class)
3242 struct tcmsg *tcmsg;
3243 struct ofpbuf request;
3244 struct tc_service_curve min, max;
3246 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
3252 tcmsg->tcm_handle = handle;
3253 tcmsg->tcm_parent = parent;
3257 min.m2 = class->min_rate;
3261 max.m2 = class->max_rate;
3263 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3264 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3265 nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
3266 nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
3267 nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
3268 nl_msg_end_nested(&request, opt_offset);
3270 error = tc_transact(&request, NULL);
3272 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
3273 "min-rate %ubps, max-rate %ubps (%s)",
3274 netdev_get_name(netdev),
3275 tc_get_major(handle), tc_get_minor(handle),
3276 tc_get_major(parent), tc_get_minor(parent),
3277 class->min_rate, class->max_rate, ovs_strerror(error));
/* hfsc_tc_install(): configures HFSC on 'netdev' from 'details'.  Sets up the
 * root qdisc, creates root class 1:fffe (parent 1:0) from the qdisc details,
 * and installs the in-memory hfsc state with that class's max_rate.
 *
 * NOTE(review): the error checks between these steps and the return
 * statements are on lines missing from this view. */
3284 hfsc_tc_install(struct netdev *netdev, const struct smap *details)
3287 struct hfsc_class class;
3289 error = hfsc_setup_qdisc__(netdev);
3295 hfsc_parse_qdisc_details__(netdev, details, &class);
3296 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3297 tc_make_handle(1, 0), &class);
3303 hfsc_install__(netdev, class.max_rate);
/* hfsc_tc_load(): rebuilds the in-memory HFSC state for 'netdev' from the
 * kernel.  Queries root class 1:fffe to learn the qdisc-wide max_rate,
 * installs a new hfsc object, then walks a queue dump and records every class
 * that hfsc_parse_tcmsg__() accepts.  'nlmsg' is unused.
 *
 * NOTE(review): the result of hfsc_query_class__() is ignored, so
 * 'hc.max_rate' needs a value beforehand; that initialization, the
 * declaration of 'msg' and the return statements are not visible in this
 * view -- confirm they exist. */
3308 hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3311 struct nl_dump dump;
3312 struct hfsc_class hc;
3315 hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
3316 hfsc_install__(netdev, hc.max_rate);
3318 if (!start_queue_dump(netdev, &dump)) {
3322 while (nl_dump_next(&dump, &msg)) {
3323 unsigned int queue_id;
3325 if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3326 hfsc_update_queue__(netdev, queue_id, &hc);
3330 nl_dump_done(&dump);
/* hfsc_tc_destroy(): tears down the HFSC state that embeds 'tc', unlinking
 * every hfsc_class from the queue map.  The _SAFE iterator is used because
 * entries are removed during the walk.
 *
 * NOTE(review): the statements that release each class and the hfsc object
 * itself are not visible in this view. */
3335 hfsc_tc_destroy(struct tc *tc)
3338 struct hfsc_class *hc, *next;
3340 hfsc = CONTAINER_OF(tc, struct hfsc, tc);
3342 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
3343 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
3352 hfsc_qdisc_get(const struct netdev *netdev, struct smap *details)
3354 const struct hfsc *hfsc;
3355 hfsc = hfsc_get__(netdev);
3356 smap_add_format(details, "max-rate", "%llu", 8ULL * hfsc->max_rate);
/* hfsc_qdisc_set(): reconfigures the HFSC root class 1:fffe of 'netdev' from
 * 'details' and stores the new max_rate in the in-memory hfsc state.
 *
 * NOTE(review): the condition guarding the in-memory update and the return
 * statement are on lines missing from this view. */
3361 hfsc_qdisc_set(struct netdev *netdev, const struct smap *details)
3364 struct hfsc_class class;
3366 hfsc_parse_qdisc_details__(netdev, details, &class);
3367 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3368 tc_make_handle(1, 0), &class);
3371 hfsc_get__(netdev)->max_rate = class.max_rate;
/* hfsc_class_get(): describes HFSC 'queue' in 'details'.  Rates are stored in
 * bytes/s and reported in bits/s (multiplied by 8).  "max-rate" is reported
 * only when it differs from "min-rate".
 *
 * NOTE(review): the return statement is on a line missing from this view. */
3378 hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
3379 const struct tc_queue *queue, struct smap *details)
3381 const struct hfsc_class *hc;
3383 hc = hfsc_class_cast__(queue);
3384 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
3385 if (hc->min_rate != hc->max_rate) {
3386 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
/* hfsc_class_set(): creates or updates HFSC queue 'queue_id' on 'netdev' from
 * 'details'.  Parses the details, programs kernel class 1:(queue_id + 1)
 * under root class 1:fffe, then records the rates in memory.
 *
 * NOTE(review): the error checks after each step and the return statement are
 * on lines missing from this view. */
3392 hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
3393 const struct smap *details)
3396 struct hfsc_class class;
3398 error = hfsc_parse_class_details__(netdev, details, &class);
3403 error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3404 tc_make_handle(1, 0xfffe), &class);
3409 hfsc_update_queue__(netdev, queue_id, &class);
/* hfsc_class_delete(): deletes kernel class 1:(queue_id + 1) for 'queue' on
 * 'netdev' and unlinks the queue from the in-memory HFSC queue map.
 *
 * NOTE(review): the condition guarding the unlink, the release of 'hc' and
 * the return statement are on lines missing from this view. */
3414 hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
3418 struct hfsc_class *hc;
3420 hc = hfsc_class_cast__(queue);
3421 hfsc = hfsc_get__(netdev);
3423 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3425 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
3432 hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3433 struct netdev_queue_stats *stats)
3435 return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3436 tc_make_handle(1, 0xfffe), NULL, stats);
/* hfsc_class_dump_stats(): parses one class dump message 'nlmsg' and, when
 * its handle is 1:<minor> with 1 <= minor <= HFSC_N_QUEUES, invokes 'cb' with
 * queue ID minor - 1, the parsed statistics and 'aux'.  Other handles are
 * passed over without calling 'cb'.
 *
 * NOTE(review): the handling of a tc_parse_class() error and the return
 * statements are on lines missing from this view. */
3440 hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3441 const struct ofpbuf *nlmsg,
3442 netdev_dump_queue_stats_cb *cb, void *aux)
3444 struct netdev_queue_stats stats;
3445 unsigned int handle, major, minor;
3448 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3453 major = tc_get_major(handle);
3454 minor = tc_get_minor(handle);
3455 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3456 (*cb)(minor - 1, &stats, aux);
3461 static const struct tc_ops tc_ops_hfsc = {
3462 "hfsc", /* linux_name */
3463 "linux-hfsc", /* ovs_name */
3464 HFSC_N_QUEUES, /* n_queues */
3465 hfsc_tc_install, /* tc_install */
3466 hfsc_tc_load, /* tc_load */
3467 hfsc_tc_destroy, /* tc_destroy */
3468 hfsc_qdisc_get, /* qdisc_get */
3469 hfsc_qdisc_set, /* qdisc_set */
3470 hfsc_class_get, /* class_get */
3471 hfsc_class_set, /* class_set */
3472 hfsc_class_delete, /* class_delete */
3473 hfsc_class_get_stats, /* class_get_stats */
3474 hfsc_class_dump_stats /* class_dump_stats */
3477 /* "linux-default" traffic control class.
3479 * This class represents the default, unnamed Linux qdisc. It corresponds to
3480 * the "" (empty string) QoS type in the OVS database. */
3483 default_install__(struct netdev *netdev_)
3485 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3486 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_default);
3488 /* Nothing but a tc class implementation is allowed to write to a tc. This
3489 * class never does that, so we can legitimately use a const tc object. */
3490 netdev->tc = CONST_CAST(struct tc *, &tc);
/* default_tc_install(): tc_install hook for the default qdisc.  Ignores
 * 'details' and just attaches the shared default tc object to 'netdev'.
 * NOTE(review): the return statement is on a line missing from this view. */
3494 default_tc_install(struct netdev *netdev,
3495 const struct smap *details OVS_UNUSED)
3497 default_install__(netdev);
/* default_tc_load(): tc_load hook for the default qdisc.  Ignores 'nlmsg' and
 * just attaches the shared default tc object to 'netdev'.
 * NOTE(review): the return statement is on a line missing from this view. */
3502 default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3504 default_install__(netdev);
/* Operations table for the default qdisc.  It has no kernel qdisc name, and
 * every hook from tc_destroy onward is NULL.  NOTE(review): the entries
 * between linux_name and tc_destroy are on lines missing from this view. */
3508 static const struct tc_ops tc_ops_default = {
3509 NULL, /* linux_name */
3514 NULL, /* tc_destroy */
3515 NULL, /* qdisc_get */
3516 NULL, /* qdisc_set */
3517 NULL, /* class_get */
3518 NULL, /* class_set */
3519 NULL, /* class_delete */
3520 NULL, /* class_get_stats */
3521 NULL /* class_dump_stats */
3524 /* "linux-other" traffic control class.
/* other_tc_load(): tc_load hook for the "linux-other" class.  Ignores 'nlmsg'
 * and attaches one shared, read-only tc object to 'netdev_'.
 * NOTE(review): the return statement is on a line missing from this view. */
3529 other_tc_load(struct netdev *netdev_, struct ofpbuf *nlmsg OVS_UNUSED)
3531 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3532 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_other);
3534 /* Nothing but a tc class implementation is allowed to write to a tc. This
3535 * class never does that, so we can legitimately use a const tc object. */
3536 netdev->tc = CONST_CAST(struct tc *, &tc);
/* Operations table for the "linux-other" QoS type.  It has no kernel qdisc
 * name and no tc_install hook, and every hook from tc_destroy onward is NULL.
 * NOTE(review): the n_queues and tc_load entries are on lines missing from
 * this view. */
3540 static const struct tc_ops tc_ops_other = {
3541 NULL, /* linux_name */
3542 "linux-other", /* ovs_name */
3544 NULL, /* tc_install */
3546 NULL, /* tc_destroy */
3547 NULL, /* qdisc_get */
3548 NULL, /* qdisc_set */
3549 NULL, /* class_get */
3550 NULL, /* class_set */
3551 NULL, /* class_delete */
3552 NULL, /* class_get_stats */
3553 NULL /* class_dump_stats */
3556 /* Traffic control. */
3558 /* Number of kernel "tc" ticks per second. */
/* Assigned by read_psched() from the values in /proc/net/psched. */
3559 static double ticks_per_s;
3561 /* Number of kernel "jiffies" per second. This is used for the purpose of
3562 * computing buffer sizes. Generally kernel qdiscs need to be able to buffer
3563 * one jiffy's worth of data.
3565 * There are two possibilities here:
3567 * - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
3568 * approximate range of 100 to 1024. That means that we really need to
3569 * make sure that the qdisc can buffer that much data.
3571 * - 'buffer_hz' is an absurdly large number. That means that the kernel
3572 * has finely granular timers and there's no need to fudge additional room
3573 * for buffers. (There's no extra effort needed to implement that: the
3574 * large 'buffer_hz' is used as a divisor, so practically any number will
3575 * come out as 0 in the division. Small integer results in the case of
3576 * really high dividends won't have any real effect anyhow.)
3578 static unsigned int buffer_hz;
/* Builds the 32-bit tc handle 'major':'minor', with 'major' in the upper 16
 * bits and 'minor' in the lower 16 bits. */
static unsigned int
tc_make_handle(unsigned int major, unsigned int minor)
{
    const unsigned int major_bits = major << 16;

    return TC_H_MAKE(major_bits, minor);
}
/* Extracts the major number (upper 16 bits) from tc 'handle'. */
static unsigned int
tc_get_major(unsigned int handle)
{
    const unsigned int major_bits = TC_H_MAJ(handle);

    return major_bits >> 16;
}
/* Extracts the minor number (lower 16 bits) from tc 'handle'. */
static unsigned int
tc_get_minor(unsigned int handle)
{
    const unsigned int minor_bits = TC_H_MIN(handle);

    return minor_bits;
}
/* tc_make_request(): initializes 'request' as a Netlink message of the given
 * 'type' with flags NLM_F_REQUEST | 'flags', followed by a zeroed struct
 * tcmsg whose family is AF_UNSPEC and whose ifindex is that of 'netdev'.
 * The caller fills in tcm_handle and tcm_parent (and any attributes).
 *
 * Callers in this file treat a NULL result as failure.  NOTE(review): the
 * get_ifindex() error path and the return statements are on lines missing
 * from this view. */
3601 static struct tcmsg *
3602 tc_make_request(const struct netdev *netdev, int type, unsigned int flags,
3603 struct ofpbuf *request)
3605 struct tcmsg *tcmsg;
3609 error = get_ifindex(netdev, &ifindex);
3614 ofpbuf_init(request, 512);
3615 nl_msg_put_nlmsghdr(request, sizeof *tcmsg, type, NLM_F_REQUEST | flags);
3616 tcmsg = ofpbuf_put_zeros(request, sizeof *tcmsg);
3617 tcmsg->tcm_family = AF_UNSPEC;
3618 tcmsg->tcm_ifindex = ifindex;
3619 /* Caller should fill in tcmsg->tcm_handle. */
3620 /* Caller should fill in tcmsg->tcm_parent. */
3626 tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
3628 int error = nl_transact(NETLINK_ROUTE, request, replyp);
3629 ofpbuf_uninit(request);
/* tc_add_del_ingress_qdisc(): builds an RTM_NEWQDISC request (with
 * NLM_F_EXCL | NLM_F_CREATE) when 'add' is true, or an RTM_DELQDISC request
 * with no extra flags otherwise, for the "ingress" qdisc with handle ffff:0
 * and parent TC_H_INGRESS.  ENOENT and EINVAL get special treatment when
 * deleting.
 *
 * NOTE(review): the handling of a NULL 'tcmsg' and the values returned on
 * each path are on lines missing from this view. */
3633 /* Adds or deletes a root ingress qdisc on 'netdev'. We use this for
3634 * policing configuration.
3636 * This function is equivalent to running the following when 'add' is true:
3637 * /sbin/tc qdisc add dev <devname> handle ffff: ingress
3639 * This function is equivalent to running the following when 'add' is false:
3640 * /sbin/tc qdisc del dev <devname> handle ffff: ingress
3642 * The configuration and stats may be seen with the following command:
3643 * /sbin/tc -s qdisc show dev <devname>
3645 * Returns 0 if successful, otherwise a positive errno value.
3648 tc_add_del_ingress_qdisc(struct netdev *netdev, bool add)
3650 struct ofpbuf request;
3651 struct tcmsg *tcmsg;
3653 int type = add ? RTM_NEWQDISC : RTM_DELQDISC;
3654 int flags = add ? NLM_F_EXCL | NLM_F_CREATE : 0;
3656 tcmsg = tc_make_request(netdev, type, flags, &request);
3660 tcmsg->tcm_handle = tc_make_handle(0xffff, 0);
3661 tcmsg->tcm_parent = TC_H_INGRESS;
3662 nl_msg_put_string(&request, TCA_KIND, "ingress");
3663 nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0);
3665 error = tc_transact(&request, NULL);
3667 /* If we're deleting the qdisc, don't worry about some of the
3668 * error conditions. */
3669 if (!add && (error == ENOENT || error == EINVAL)) {
3678 /* Adds a policer to 'netdev' with a rate of 'kbits_rate' and a burst size
3681 * This function is equivalent to running:
3682 * /sbin/tc filter add dev <devname> parent ffff: protocol all prio 49
3683 * basic police rate <kbits_rate>kbit burst <kbits_burst>k
3686 * The configuration and stats may be seen with the following command:
3687 * /sbin/tc -s filter show <devname> eth0 parent ffff:
3689 * Returns 0 if successful, otherwise a positive errno value.
3692 tc_add_policer(struct netdev *netdev, int kbits_rate, int kbits_burst)
3694 struct tc_police tc_police;
3695 struct ofpbuf request;
3696 struct tcmsg *tcmsg;
3697 size_t basic_offset;
3698 size_t police_offset;
3702 memset(&tc_police, 0, sizeof tc_police);
3703 tc_police.action = TC_POLICE_SHOT;
3704 tc_police.mtu = mtu;
3705 tc_fill_rate(&tc_police.rate, (kbits_rate * 1000)/8, mtu);
3706 tc_police.burst = tc_bytes_to_ticks(tc_police.rate.rate,
3707 kbits_burst * 1024);
3709 tcmsg = tc_make_request(netdev, RTM_NEWTFILTER,
3710 NLM_F_EXCL | NLM_F_CREATE, &request);
3714 tcmsg->tcm_parent = tc_make_handle(0xffff, 0);
3715 tcmsg->tcm_info = tc_make_handle(49,
3716 (OVS_FORCE uint16_t) htons(ETH_P_ALL));
3718 nl_msg_put_string(&request, TCA_KIND, "basic");
3719 basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3720 police_offset = nl_msg_start_nested(&request, TCA_BASIC_POLICE);
3721 nl_msg_put_unspec(&request, TCA_POLICE_TBF, &tc_police, sizeof tc_police);
3722 tc_put_rtab(&request, TCA_POLICE_RATE, &tc_police.rate);
3723 nl_msg_end_nested(&request, police_offset);
3724 nl_msg_end_nested(&request, basic_offset);
3726 error = tc_transact(&request, NULL);
/* read_psched(): reads the four hexadecimal values "a b c d" from
 * /proc/net/psched, guarded by an ovsthread_once so the work is not repeated,
 * and derives ticks_per_s as a * c / b.  Open and read failures, invalid
 * parameters and unexpected parameters are logged as warnings; the raw and
 * derived values are logged at debug level.
 *
 * NOTE(review): many lines of this function are missing from this view (the
 * function header, the 'stream' declaration, default values, the validity
 * conditions, the assignment of buffer_hz, fclose() calls and the exit
 * label).  In particular, confirm that 'b' and 'd' cannot be zero where they
 * are later used as divisors. */
3737 /* The values in psched are not individually very meaningful, but they are
3738 * important. The tables below show some values seen in the wild.
3742 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
3743 * (Before that, there are hints that it was 1000000000.)
3745 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
3749 * -----------------------------------
3750 * [1] 000c8000 000f4240 000f4240 00000064
3751 * [2] 000003e8 00000400 000f4240 3b9aca00
3752 * [3] 000003e8 00000400 000f4240 3b9aca00
3753 * [4] 000003e8 00000400 000f4240 00000064
3754 * [5] 000003e8 00000040 000f4240 3b9aca00
3755 * [6] 000003e8 00000040 000f4240 000000f9
3757 * a b c d ticks_per_s buffer_hz
3758 * ------- --------- ---------- ------------- ----------- -------------
3759 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
3760 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3761 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3762 * [4] 1,000 1,024 1,000,000 100 976,562 100
3763 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
3764 * [6] 1,000 64 1,000,000 249 15,625,000 249
3766 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
3767 * [2] 2.6.26-1-686-bigmem from Debian lenny
3768 * [3] 2.6.26-2-sparc64 from Debian lenny
3769 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
3770 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
3771 * [6] 2.6.34 from kernel.org on KVM
3773 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
3774 static const char fn[] = "/proc/net/psched";
3775 unsigned int a, b, c, d;
3778 if (!ovsthread_once_start(&once)) {
3785 stream = fopen(fn, "r");
3787 VLOG_WARN("%s: open failed: %s", fn, ovs_strerror(errno));
3791 if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
3792 VLOG_WARN("%s: read failed", fn);
3796 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
3800 VLOG_WARN("%s: invalid scheduler parameters", fn);
3804 ticks_per_s = (double) a * c / b;
3808 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
3811 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);
3814 ovsthread_once_done(&once);
3817 /* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
3818 * rate of 'rate' bytes per second. */
3820 tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
3823 return (rate * ticks) / ticks_per_s;
/* tc_bytes_to_ticks(): a 'rate' of 0 yields 0 rather than dividing by zero.
 * ticks_per_s is converted to an integer before the multiplication, so any
 * fractional part of it is discarded and the product is formed in unsigned
 * long long.  NOTE(review): a line between the header and the return is
 * missing from this view. */
3826 /* Returns the number of ticks that it would take to transmit 'size' bytes at a
3827 * rate of 'rate' bytes per second. */
3829 tc_bytes_to_ticks(unsigned int rate, unsigned int size)
3832 return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
/* tc_buffer_per_jiffy(): integer division of 'rate' by buffer_hz, so a very
 * large buffer_hz yields 0.  NOTE(review): buffer_hz must be nonzero here; a
 * line between the header and the return is missing from this view. */
3835 /* Returns the number of bytes that need to be reserved for qdisc buffering at
3836 * a transmission rate of 'rate' bytes per second. */
3838 tc_buffer_per_jiffy(unsigned int rate)
3841 return rate / buffer_hz;
/* tc_parse_qdisc(): parsing starts after the Netlink header and struct tcmsg.
 * TCA_KIND is a mandatory string attribute and TCA_OPTIONS an optional nested
 * one; a policy failure is logged.
 *
 * NOTE(review): the null checks on 'kind' and 'options', the error path and
 * the return statements are on lines missing from this view. */
3844 /* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
3845 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
3846 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
3847 * stores NULL into it if it is absent.
3849 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
3852 * Returns 0 if successful, otherwise a positive errno value. */
3854 tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
3855 struct nlattr **options)
3857 static const struct nl_policy tca_policy[] = {
3858 [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
3859 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
3861 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3863 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3864 tca_policy, ta, ARRAY_SIZE(ta))) {
3865 VLOG_WARN_RL(&rl, "failed to parse qdisc message");
3870 *kind = nl_attr_get_string(ta[TCA_KIND]);
3874 *options = ta[TCA_OPTIONS];
3889 /* Given Netlink 'msg' that describes a class, extracts the queue ID (e.g. the
3890 * minor number of its class ID) into '*queue_id', its TCA_OPTIONS attribute
3891 * into '*options', and its queue statistics into '*stats'. Any of the output
3892 * arguments may be null.
3894 * Returns 0 if successful, otherwise a positive errno value. */
3896 tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
3897 struct nlattr **options, struct netdev_queue_stats *stats)
3899 static const struct nl_policy tca_policy[] = {
3900 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
3901 [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
3903 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3905 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3906 tca_policy, ta, ARRAY_SIZE(ta))) {
3907 VLOG_WARN_RL(&rl, "failed to parse class message");
3912 struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
3913 *handlep = tc->tcm_handle;
3917 *options = ta[TCA_OPTIONS];
3921 const struct gnet_stats_queue *gsq;
3922 struct gnet_stats_basic gsb;
3924 static const struct nl_policy stats_policy[] = {
3925 [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
3926 .min_len = sizeof gsb },
3927 [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
3928 .min_len = sizeof *gsq },
3930 struct nlattr *sa[ARRAY_SIZE(stats_policy)];
3932 if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
3933 sa, ARRAY_SIZE(sa))) {
3934 VLOG_WARN_RL(&rl, "failed to parse class stats");
3938 /* Alignment issues screw up the length of struct gnet_stats_basic on
3939 * some arch/bitsize combinations. Newer versions of Linux have a
3940 * struct gnet_stats_basic_packed, but we can't depend on that. The
3941 * easiest thing to do is just to make a copy. */
3942 memset(&gsb, 0, sizeof gsb);
3943 memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
3944 MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
3945 stats->tx_bytes = gsb.bytes;
3946 stats->tx_packets = gsb.packets;
3948 gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
3949 stats->tx_errors = gsq->drops;
3959 memset(stats, 0, sizeof *stats);
3964 /* Queries the kernel for class with identifier 'handle' and parent 'parent'
3967 tc_query_class(const struct netdev *netdev,
3968 unsigned int handle, unsigned int parent,
3969 struct ofpbuf **replyp)
3971 struct ofpbuf request;
3972 struct tcmsg *tcmsg;
3975 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO, &request);
3979 tcmsg->tcm_handle = handle;
3980 tcmsg->tcm_parent = parent;
3982 error = tc_transact(&request, replyp);
3984 VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
3985 netdev_get_name(netdev),
3986 tc_get_major(handle), tc_get_minor(handle),
3987 tc_get_major(parent), tc_get_minor(parent),
3988 ovs_strerror(error));
3993 /* Equivalent to "tc class del dev <name> handle <handle>". */
3995 tc_delete_class(const struct netdev *netdev, unsigned int handle)
3997 struct ofpbuf request;
3998 struct tcmsg *tcmsg;
4001 tcmsg = tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
4005 tcmsg->tcm_handle = handle;
4006 tcmsg->tcm_parent = 0;
4008 error = tc_transact(&request, NULL);
4010 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
4011 netdev_get_name(netdev),
4012 tc_get_major(handle), tc_get_minor(handle),
4013 ovs_strerror(error));
4018 /* Equivalent to "tc qdisc del dev <name> root". */
4020 tc_del_qdisc(struct netdev *netdev_)
4022 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4023 struct ofpbuf request;
4024 struct tcmsg *tcmsg;
4027 tcmsg = tc_make_request(netdev_, RTM_DELQDISC, 0, &request);
4031 tcmsg->tcm_handle = tc_make_handle(1, 0);
4032 tcmsg->tcm_parent = TC_H_ROOT;
4034 error = tc_transact(&request, NULL);
4035 if (error == EINVAL) {
4036 /* EINVAL probably means that the default qdisc was in use, in which
4037 * case we've accomplished our purpose. */
4040 if (!error && netdev->tc) {
4041 if (netdev->tc->ops->tc_destroy) {
4042 netdev->tc->ops->tc_destroy(netdev->tc);
4049 /* If 'netdev''s qdisc type and parameters are not yet known, queries the
4050 * kernel to determine what they are. Returns 0 if successful, otherwise a
4051 * positive errno value. */
4053 tc_query_qdisc(const struct netdev *netdev_)
4055 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4056 struct ofpbuf request, *qdisc;
4057 const struct tc_ops *ops;
4058 struct tcmsg *tcmsg;
4066 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
4067 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
4068 * 2.6.35 without that fix backported to it.
4070 * To avoid the OOPS, we must not make a request that would attempt to dump
4071 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
4072 * few others. There are a few ways that I can see to do this, but most of
4073 * them seem to be racy (and if you lose the race the kernel OOPSes). The
4074 * technique chosen here is to assume that any non-default qdisc that we
4075 * create will have a class with handle 1:0. The built-in qdiscs only have
4076 * a class with handle 0:0.
4078 * We could check for Linux 2.6.35+ and use a more straightforward method
4080 tcmsg = tc_make_request(netdev_, RTM_GETQDISC, NLM_F_ECHO, &request);
4084 tcmsg->tcm_handle = tc_make_handle(1, 0);
4085 tcmsg->tcm_parent = 0;
4087 /* Figure out what tc class to instantiate. */
4088 error = tc_transact(&request, &qdisc);
4092 error = tc_parse_qdisc(qdisc, &kind, NULL);
4094 ops = &tc_ops_other;
4096 ops = tc_lookup_linux_name(kind);
4098 static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
4099 VLOG_INFO_RL(&rl2, "unknown qdisc \"%s\"", kind);
4101 ops = &tc_ops_other;
4104 } else if (error == ENOENT) {
4105 /* Either it's a built-in qdisc, or it's a qdisc set up by some
4106 * other entity that doesn't have a handle 1:0. We will assume
4107 * that it's the system default qdisc. */
4108 ops = &tc_ops_default;
4111 /* Who knows? Maybe the device got deleted. */
4112 VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
4113 netdev_get_name(netdev_), ovs_strerror(error));
4114 ops = &tc_ops_other;
4117 /* Instantiate it. */
4118 load_error = ops->tc_load(CONST_CAST(struct netdev *, netdev_), qdisc);
4119 ovs_assert((load_error == 0) == (netdev->tc != NULL));
4120 ofpbuf_delete(qdisc);
4122 return error ? error : load_error;
4125 /* Linux traffic control uses tables with 256 entries ("rtab" tables) to
4126 approximate the time to transmit packets of various lengths. For an MTU of
4127 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
4128 represents two possible packet lengths; for a MTU of 513 through 1024, four
4129 possible lengths; and so on.
4131 Returns, for the specified 'mtu', the number of bits that packet lengths
4132 need to be shifted right to fit within such a 256-entry table. */
4134 tc_calc_cell_log(unsigned int mtu)
4139 mtu = ETH_PAYLOAD_MAX;
4141 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
4143 for (cell_log = 0; mtu >= 256; cell_log++) {
4150 /* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
4153 tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
4155 memset(rate, 0, sizeof *rate);
4156 rate->cell_log = tc_calc_cell_log(mtu);
4157 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
4158 /* rate->cell_align = 0; */ /* distro headers. */
4159 rate->mpu = ETH_TOTAL_MIN;
4163 /* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
4164 * attribute of the specified "type".
4166 * See tc_calc_cell_log() above for a description of "rtab"s. */
4168 tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
4173 rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
4174 for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
4175 unsigned packet_size = (i + 1) << rate->cell_log;
4176 if (packet_size < rate->mpu) {
4177 packet_size = rate->mpu;
4179 rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
/* Computes the value to use for 'buffer' or 'cbuffer' in HTB options, in
 * ticks, for a rate of 'Bps' bytes per second, the specified 'mtu', and a
 * user-requested burst size of 'burst_bytes'.
 *
 * The burst actually used is never less than one jiffy's worth of buffering
 * plus 'mtu', so a 'burst_bytes' of 0 is fine when no value was requested. */
static int
tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
{
    unsigned int least_burst = tc_buffer_per_jiffy(Bps) + mtu;
    uint64_t burst = burst_bytes > least_burst ? burst_bytes : least_burst;

    return tc_bytes_to_ticks(Bps, burst);
}
4194 /* Linux-only functions declared in netdev-linux.h */
4196 /* Returns a fd for an AF_INET socket or a negative errno value. */
4198 netdev_linux_get_af_inet_sock(void)
4200 int error = netdev_linux_init();
4201 return error ? -error : af_inet_sock;
4204 /* Modifies the 'flag' bit in ethtool's flags field for 'netdev'. If
4205 * 'enable' is true, the bit is set. Otherwise, it is cleared. */
4207 netdev_linux_ethtool_set_flag(struct netdev *netdev, uint32_t flag,
4208 const char *flag_name, bool enable)
4210 const char *netdev_name = netdev_get_name(netdev);
4211 struct ethtool_value evalue;
4215 COVERAGE_INC(netdev_get_ethtool);
4216 memset(&evalue, 0, sizeof evalue);
4217 error = netdev_linux_do_ethtool(netdev_name,
4218 (struct ethtool_cmd *)&evalue,
4219 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
4224 COVERAGE_INC(netdev_set_ethtool);
4225 evalue.data = new_flags = (evalue.data & ~flag) | (enable ? flag : 0);
4226 error = netdev_linux_do_ethtool(netdev_name,
4227 (struct ethtool_cmd *)&evalue,
4228 ETHTOOL_SFLAGS, "ETHTOOL_SFLAGS");
4233 COVERAGE_INC(netdev_get_ethtool);
4234 memset(&evalue, 0, sizeof evalue);
4235 error = netdev_linux_do_ethtool(netdev_name,
4236 (struct ethtool_cmd *)&evalue,
4237 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
4242 if (new_flags != evalue.data) {
4243 VLOG_WARN_RL(&rl, "attempt to %s ethtool %s flag on network "
4244 "device %s failed", enable ? "enable" : "disable",
4245 flag_name, netdev_name);
4252 /* Utility functions. */
4254 /* Copies 'src' into 'dst', performing format conversion in the process. */
4256 netdev_stats_from_rtnl_link_stats(struct netdev_stats *dst,
4257 const struct rtnl_link_stats *src)
4259 dst->rx_packets = src->rx_packets;
4260 dst->tx_packets = src->tx_packets;
4261 dst->rx_bytes = src->rx_bytes;
4262 dst->tx_bytes = src->tx_bytes;
4263 dst->rx_errors = src->rx_errors;
4264 dst->tx_errors = src->tx_errors;
4265 dst->rx_dropped = src->rx_dropped;
4266 dst->tx_dropped = src->tx_dropped;
4267 dst->multicast = src->multicast;
4268 dst->collisions = src->collisions;
4269 dst->rx_length_errors = src->rx_length_errors;
4270 dst->rx_over_errors = src->rx_over_errors;
4271 dst->rx_crc_errors = src->rx_crc_errors;
4272 dst->rx_frame_errors = src->rx_frame_errors;
4273 dst->rx_fifo_errors = src->rx_fifo_errors;
4274 dst->rx_missed_errors = src->rx_missed_errors;
4275 dst->tx_aborted_errors = src->tx_aborted_errors;
4276 dst->tx_carrier_errors = src->tx_carrier_errors;
4277 dst->tx_fifo_errors = src->tx_fifo_errors;
4278 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
4279 dst->tx_window_errors = src->tx_window_errors;
4283 get_stats_via_netlink(int ifindex, struct netdev_stats *stats)
4285 /* Policy for RTNLGRP_LINK messages.
4287 * There are *many* more fields in these messages, but currently we only
4288 * care about these fields. */
4289 static const struct nl_policy rtnlgrp_link_policy[] = {
4290 [IFLA_IFNAME] = { .type = NL_A_STRING, .optional = false },
4291 [IFLA_STATS] = { .type = NL_A_UNSPEC, .optional = true,
4292 .min_len = sizeof(struct rtnl_link_stats) },
4295 struct ofpbuf request;
4296 struct ofpbuf *reply;
4297 struct ifinfomsg *ifi;
4298 struct nlattr *attrs[ARRAY_SIZE(rtnlgrp_link_policy)];
4301 ofpbuf_init(&request, 0);
4302 nl_msg_put_nlmsghdr(&request, sizeof *ifi, RTM_GETLINK, NLM_F_REQUEST);
4303 ifi = ofpbuf_put_zeros(&request, sizeof *ifi);
4304 ifi->ifi_family = PF_UNSPEC;
4305 ifi->ifi_index = ifindex;
4306 error = nl_transact(NETLINK_ROUTE, &request, &reply);
4307 ofpbuf_uninit(&request);
4312 if (!nl_policy_parse(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg),
4313 rtnlgrp_link_policy,
4314 attrs, ARRAY_SIZE(rtnlgrp_link_policy))) {
4315 ofpbuf_delete(reply);
4319 if (!attrs[IFLA_STATS]) {
4320 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
4321 ofpbuf_delete(reply);
4325 netdev_stats_from_rtnl_link_stats(stats, nl_attr_get(attrs[IFLA_STATS]));
4327 ofpbuf_delete(reply);
4333 get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats)
4335 static const char fn[] = "/proc/net/dev";
4340 stream = fopen(fn, "r");
4342 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, ovs_strerror(errno));
4347 while (fgets(line, sizeof line, stream)) {
4350 #define X64 "%"SCNu64
4353 X64 X64 X64 X64 X64 X64 X64 "%*u"
4354 X64 X64 X64 X64 X64 X64 X64 "%*u",
4360 &stats->rx_fifo_errors,
4361 &stats->rx_frame_errors,
4367 &stats->tx_fifo_errors,
4369 &stats->tx_carrier_errors) != 15) {
4370 VLOG_WARN_RL(&rl, "%s:%d: parse error", fn, ln);
4371 } else if (!strcmp(devname, netdev_name)) {
4372 stats->rx_length_errors = UINT64_MAX;
4373 stats->rx_over_errors = UINT64_MAX;
4374 stats->rx_crc_errors = UINT64_MAX;
4375 stats->rx_missed_errors = UINT64_MAX;
4376 stats->tx_aborted_errors = UINT64_MAX;
4377 stats->tx_heartbeat_errors = UINT64_MAX;
4378 stats->tx_window_errors = UINT64_MAX;
4384 VLOG_WARN_RL(&rl, "%s: no stats for %s", fn, netdev_name);
4390 get_flags(const struct netdev *dev, unsigned int *flags)
4396 error = netdev_linux_do_ioctl(dev->name, &ifr, SIOCGIFFLAGS,
4399 *flags = ifr.ifr_flags;
4405 set_flags(const char *name, unsigned int flags)
4409 ifr.ifr_flags = flags;
4410 return netdev_linux_do_ioctl(name, &ifr, SIOCSIFFLAGS, "SIOCSIFFLAGS");
4414 do_get_ifindex(const char *netdev_name)
4418 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4419 COVERAGE_INC(netdev_get_ifindex);
4420 if (ioctl(af_inet_sock, SIOCGIFINDEX, &ifr) < 0) {
4421 VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
4422 netdev_name, ovs_strerror(errno));
4425 return ifr.ifr_ifindex;
4429 get_ifindex(const struct netdev *netdev_, int *ifindexp)
4431 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4433 if (!(netdev->cache_valid & VALID_IFINDEX)) {
4434 int ifindex = do_get_ifindex(netdev_get_name(netdev_));
4437 netdev->get_ifindex_error = -ifindex;
4438 netdev->ifindex = 0;
4440 netdev->get_ifindex_error = 0;
4441 netdev->ifindex = ifindex;
4443 netdev->cache_valid |= VALID_IFINDEX;
4446 *ifindexp = netdev->ifindex;
4447 return netdev->get_ifindex_error;
4451 get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN])
4456 memset(&ifr, 0, sizeof ifr);
4457 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4458 COVERAGE_INC(netdev_get_hwaddr);
4459 if (ioctl(af_inet_sock, SIOCGIFHWADDR, &ifr) < 0) {
4460 /* ENODEV probably means that a vif disappeared asynchronously and
4461 * hasn't been removed from the database yet, so reduce the log level
4462 * to INFO for that case. */
4463 VLOG(errno == ENODEV ? VLL_INFO : VLL_ERR,
4464 "ioctl(SIOCGIFHWADDR) on %s device failed: %s",
4465 netdev_name, ovs_strerror(errno));
4468 hwaddr_family = ifr.ifr_hwaddr.sa_family;
4469 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER) {
4470 VLOG_WARN("%s device has unknown hardware address family %d",
4471 netdev_name, hwaddr_family);
4473 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
4478 set_etheraddr(const char *netdev_name,
4479 const uint8_t mac[ETH_ADDR_LEN])
4483 memset(&ifr, 0, sizeof ifr);
4484 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4485 ifr.ifr_hwaddr.sa_family = ARPHRD_ETHER;
4486 memcpy(ifr.ifr_hwaddr.sa_data, mac, ETH_ADDR_LEN);
4487 COVERAGE_INC(netdev_set_hwaddr);
4488 if (ioctl(af_inet_sock, SIOCSIFHWADDR, &ifr) < 0) {
4489 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
4490 netdev_name, ovs_strerror(errno));
4497 netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
4498 int cmd, const char *cmd_name)
4502 memset(&ifr, 0, sizeof ifr);
4503 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
4504 ifr.ifr_data = (caddr_t) ecmd;
4507 if (ioctl(af_inet_sock, SIOCETHTOOL, &ifr) == 0) {
4510 if (errno != EOPNOTSUPP) {
4511 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
4512 "failed: %s", cmd_name, name, ovs_strerror(errno));
4514 /* The device doesn't support this operation. That's pretty
4515 * common, so there's no point in logging anything. */
4522 netdev_linux_do_ioctl(const char *name, struct ifreq *ifr, int cmd,
4523 const char *cmd_name)
4525 ovs_strzcpy(ifr->ifr_name, name, sizeof ifr->ifr_name);
4526 if (ioctl(af_inet_sock, cmd, ifr) == -1) {
4527 VLOG_DBG_RL(&rl, "%s: ioctl(%s) failed: %s", name, cmd_name,
4528 ovs_strerror(errno));
4535 netdev_linux_get_ipv4(const struct netdev *netdev, struct in_addr *ip,
4536 int cmd, const char *cmd_name)
4541 ifr.ifr_addr.sa_family = AF_INET;
4542 error = netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, cmd, cmd_name);
4544 const struct sockaddr_in *sin = ALIGNED_CAST(struct sockaddr_in *,
4546 *ip = sin->sin_addr;
4551 /* Returns an AF_PACKET raw socket or a negative errno value. */
4553 af_packet_sock(void)
4555 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
4558 if (ovsthread_once_start(&once)) {
4559 sock = socket(AF_PACKET, SOCK_RAW, 0);
4561 int error = set_nonblocking(sock);
4568 VLOG_ERR("failed to create packet socket: %s",
4569 ovs_strerror(errno));
4571 ovsthread_once_done(&once);