2 * Copyright (c) 2009, 2010, 2011, 2012, 2013 Nicira, Inc.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
19 #include "netdev-linux.h"
23 #include <arpa/inet.h>
25 #include <linux/filter.h>
26 #include <linux/gen_stats.h>
27 #include <linux/if_ether.h>
28 #include <linux/if_tun.h>
29 #include <linux/types.h>
30 #include <linux/ethtool.h>
31 #include <linux/mii.h>
32 #include <linux/pkt_cls.h>
33 #include <linux/pkt_sched.h>
34 #include <linux/rtnetlink.h>
35 #include <linux/sockios.h>
36 #include <linux/version.h>
37 #include <sys/types.h>
38 #include <sys/ioctl.h>
39 #include <sys/socket.h>
40 #include <netpacket/packet.h>
42 #include <net/if_arp.h>
43 #include <net/if_packet.h>
44 #include <net/route.h>
45 #include <netinet/in.h>
52 #include "dpif-linux.h"
53 #include "dynamic-string.h"
54 #include "fatal-signal.h"
57 #include "netdev-provider.h"
58 #include "netdev-vport.h"
59 #include "netlink-notifier.h"
60 #include "netlink-socket.h"
63 #include "openflow/openflow.h"
65 #include "poll-loop.h"
66 #include "rtnetlink-link.h"
68 #include "socket-util.h"
71 #include "unaligned.h"
74 VLOG_DEFINE_THIS_MODULE(netdev_linux);
76 COVERAGE_DEFINE(netdev_set_policing);
77 COVERAGE_DEFINE(netdev_arp_lookup);
78 COVERAGE_DEFINE(netdev_get_ifindex);
79 COVERAGE_DEFINE(netdev_get_hwaddr);
80 COVERAGE_DEFINE(netdev_set_hwaddr);
81 COVERAGE_DEFINE(netdev_get_ethtool);
82 COVERAGE_DEFINE(netdev_set_ethtool);
/* These were introduced in Linux 2.6.14, so they might be missing if we have
 * old headers. */
87 #ifndef ADVERTISED_Pause
88 #define ADVERTISED_Pause (1 << 13)
90 #ifndef ADVERTISED_Asym_Pause
91 #define ADVERTISED_Asym_Pause (1 << 14)
94 /* These were introduced in Linux 2.6.24, so they might be missing if we
95 * have old headers. */
96 #ifndef ETHTOOL_GFLAGS
97 #define ETHTOOL_GFLAGS 0x00000025 /* Get flags bitmap(ethtool_value) */
99 #ifndef ETHTOOL_SFLAGS
100 #define ETHTOOL_SFLAGS 0x00000026 /* Set flags bitmap(ethtool_value) */
/* This was introduced in Linux 2.6.25, so it might be missing if we have old
 * headers. */
106 #define TC_RTAB_SIZE 1024
109 static struct nln_notifier *netdev_linux_cache_notifier = NULL;
110 static int cache_notifier_refcount;
113 VALID_IFINDEX = 1 << 0,
114 VALID_ETHERADDR = 1 << 1,
118 VALID_POLICING = 1 << 5,
119 VALID_VPORT_STAT_ERROR = 1 << 6,
120 VALID_DRVINFO = 1 << 7,
121 VALID_FEATURES = 1 << 8,
124 /* Traffic control. */
126 /* An instance of a traffic control class. Always associated with a particular
129 * Each TC implementation subclasses this with whatever additional data it
132 const struct tc_ops *ops;
133 struct hmap queues; /* Contains "struct tc_queue"s.
134 * Read by generic TC layer.
135 * Written only by TC implementation. */
138 #define TC_INITIALIZER(TC, OPS) { OPS, HMAP_INITIALIZER(&(TC)->queues) }
140 /* One traffic control queue.
142 * Each TC implementation subclasses this with whatever additional data it
145 struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
146 unsigned int queue_id; /* OpenFlow queue ID. */
147 long long int created; /* Time queue was created, in msecs. */
150 /* A particular kind of traffic control. Each implementation generally maps to
151 * one particular Linux qdisc class.
153 * The functions below return 0 if successful or a positive errno value on
154 * failure, except where otherwise noted. All of them must be provided, except
155 * where otherwise noted. */
157 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
158 * This is null for tc_ops_default and tc_ops_other, for which there are no
159 * appropriate values. */
160 const char *linux_name;
162 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
163 const char *ovs_name;
165 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
166 * queues. The queues are numbered 0 through n_queues - 1. */
167 unsigned int n_queues;
169 /* Called to install this TC class on 'netdev'. The implementation should
170 * make the Netlink calls required to set up 'netdev' with the right qdisc
171 * and configure it according to 'details'. The implementation may assume
172 * that the current qdisc is the default; that is, there is no need for it
173 * to delete the current qdisc before installing itself.
175 * The contents of 'details' should be documented as valid for 'ovs_name'
176 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
177 * (which is built as ovs-vswitchd.conf.db(8)).
179 * This function must return 0 if and only if it sets 'netdev->tc' to an
180 * initialized 'struct tc'.
182 * (This function is null for tc_ops_other, which cannot be installed. For
183 * other TC classes it should always be nonnull.) */
184 int (*tc_install)(struct netdev *netdev, const struct smap *details);
186 /* Called when the netdev code determines (through a Netlink query) that
187 * this TC class's qdisc is installed on 'netdev', but we didn't install
188 * it ourselves and so don't know any of the details.
190 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
191 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
192 * implementation should parse the other attributes of 'nlmsg' as
193 * necessary to determine its configuration. If necessary it should also
194 * use Netlink queries to determine the configuration of queues on
197 * This function must return 0 if and only if it sets 'netdev->tc' to an
198 * initialized 'struct tc'. */
199 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
201 /* Destroys the data structures allocated by the implementation as part of
202 * 'tc'. (This includes destroying 'tc->queues' by calling
205 * The implementation should not need to perform any Netlink calls. If
206 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
207 * (But it may not be desirable.)
209 * This function may be null if 'tc' is trivial. */
210 void (*tc_destroy)(struct tc *tc);
212 /* Retrieves details of 'netdev->tc' configuration into 'details'.
214 * The implementation should not need to perform any Netlink calls, because
215 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
216 * cached the configuration.
218 * The contents of 'details' should be documented as valid for 'ovs_name'
219 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
220 * (which is built as ovs-vswitchd.conf.db(8)).
222 * This function may be null if 'tc' is not configurable.
224 int (*qdisc_get)(const struct netdev *netdev, struct smap *details);
226 /* Reconfigures 'netdev->tc' according to 'details', performing any
227 * required Netlink calls to complete the reconfiguration.
229 * The contents of 'details' should be documented as valid for 'ovs_name'
230 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
231 * (which is built as ovs-vswitchd.conf.db(8)).
233 * This function may be null if 'tc' is not configurable.
235 int (*qdisc_set)(struct netdev *, const struct smap *details);
237 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
238 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
240 * The contents of 'details' should be documented as valid for 'ovs_name'
241 * in the "other_config" column in the "Queue" table in
242 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
244 * The implementation should not need to perform any Netlink calls, because
245 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
246 * cached the queue configuration.
248 * This function may be null if 'tc' does not have queues ('n_queues' is
250 int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
251 struct smap *details);
253 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
254 * 'details', perfoming any required Netlink calls to complete the
255 * reconfiguration. The caller ensures that 'queue_id' is less than
258 * The contents of 'details' should be documented as valid for 'ovs_name'
259 * in the "other_config" column in the "Queue" table in
260 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
262 * This function may be null if 'tc' does not have queues or its queues are
263 * not configurable. */
264 int (*class_set)(struct netdev *, unsigned int queue_id,
265 const struct smap *details);
267 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
268 * tc_queue's within 'netdev->tc->queues'.
270 * This function may be null if 'tc' does not have queues or its queues
271 * cannot be deleted. */
272 int (*class_delete)(struct netdev *, struct tc_queue *queue);
274 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
275 * 'struct tc_queue's within 'netdev->tc->queues'.
277 * On success, initializes '*stats'.
279 * This function may be null if 'tc' does not have queues or if it cannot
280 * report queue statistics. */
281 int (*class_get_stats)(const struct netdev *netdev,
282 const struct tc_queue *queue,
283 struct netdev_queue_stats *stats);
285 /* Extracts queue stats from 'nlmsg', which is a response to a
286 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
288 * This function may be null if 'tc' does not have queues or if it cannot
289 * report queue statistics. */
290 int (*class_dump_stats)(const struct netdev *netdev,
291 const struct ofpbuf *nlmsg,
292 netdev_dump_queue_stats_cb *cb, void *aux);
296 tc_init(struct tc *tc, const struct tc_ops *ops)
299 hmap_init(&tc->queues);
303 tc_destroy(struct tc *tc)
305 hmap_destroy(&tc->queues);
308 static const struct tc_ops tc_ops_htb;
309 static const struct tc_ops tc_ops_hfsc;
310 static const struct tc_ops tc_ops_default;
311 static const struct tc_ops tc_ops_other;
313 static const struct tc_ops *const tcs[] = {
314 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
315 &tc_ops_hfsc, /* Hierarchical fair service curve. */
316 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
317 &tc_ops_other, /* Some other qdisc. */
321 static unsigned int tc_make_handle(unsigned int major, unsigned int minor);
322 static unsigned int tc_get_major(unsigned int handle);
323 static unsigned int tc_get_minor(unsigned int handle);
325 static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
326 static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
327 static unsigned int tc_buffer_per_jiffy(unsigned int rate);
329 static struct tcmsg *tc_make_request(const struct netdev *, int type,
330 unsigned int flags, struct ofpbuf *);
331 static int tc_transact(struct ofpbuf *request, struct ofpbuf **replyp);
332 static int tc_add_del_ingress_qdisc(struct netdev *netdev, bool add);
333 static int tc_add_policer(struct netdev *netdev, int kbits_rate,
336 static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
337 struct nlattr **options);
338 static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
339 struct nlattr **options,
340 struct netdev_queue_stats *);
341 static int tc_query_class(const struct netdev *,
342 unsigned int handle, unsigned int parent,
343 struct ofpbuf **replyp);
344 static int tc_delete_class(const struct netdev *, unsigned int handle);
346 static int tc_del_qdisc(struct netdev *netdev);
347 static int tc_query_qdisc(const struct netdev *netdev);
349 static int tc_calc_cell_log(unsigned int mtu);
350 static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
351 static void tc_put_rtab(struct ofpbuf *, uint16_t type,
352 const struct tc_ratespec *rate);
353 static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
355 struct netdev_linux {
358 unsigned int cache_valid;
359 unsigned int change_seq;
361 bool miimon; /* Link status of last poll. */
362 long long int miimon_interval; /* Miimon Poll rate. Disabled if <= 0. */
363 struct timer miimon_timer;
365 /* The following are figured out "on demand" only. They are only valid
366 * when the corresponding VALID_* bit in 'cache_valid' is set. */
368 uint8_t etheraddr[ETH_ADDR_LEN];
369 struct in_addr address, netmask;
372 unsigned int ifi_flags;
373 long long int carrier_resets;
374 uint32_t kbits_rate; /* Policing data. */
375 uint32_t kbits_burst;
376 int vport_stats_error; /* Cached error code from vport_get_stats().
377 0 or an errno value. */
378 int netdev_mtu_error; /* Cached error code from SIOCGIFMTU or SIOCSIFMTU. */
379 int ether_addr_error; /* Cached error code from set/get etheraddr. */
380 int netdev_policing_error; /* Cached error code from set policing. */
381 int get_features_error; /* Cached error code from ETHTOOL_GSET. */
382 int get_ifindex_error; /* Cached error code from SIOCGIFINDEX. */
384 enum netdev_features current; /* Cached from ETHTOOL_GSET. */
385 enum netdev_features advertised; /* Cached from ETHTOOL_GSET. */
386 enum netdev_features supported; /* Cached from ETHTOOL_GSET. */
388 struct ethtool_drvinfo drvinfo; /* Cached from ETHTOOL_GDRVINFO. */
391 /* For devices of class netdev_tap_class only. */
395 struct netdev_rx_linux {
401 /* This is set pretty low because we probably won't learn anything from the
402 * additional log messages. */
403 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
405 static void netdev_linux_run(void);
407 static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
408 int cmd, const char *cmd_name);
409 static int netdev_linux_get_ipv4(const struct netdev *, struct in_addr *,
410 int cmd, const char *cmd_name);
411 static int get_flags(const struct netdev *, unsigned int *flags);
412 static int set_flags(const char *, unsigned int flags);
413 static int do_get_ifindex(const char *netdev_name);
414 static int get_ifindex(const struct netdev *, int *ifindexp);
415 static int do_set_addr(struct netdev *netdev,
416 int ioctl_nr, const char *ioctl_name,
417 struct in_addr addr);
418 static int get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN]);
419 static int set_etheraddr(const char *netdev_name, const uint8_t[ETH_ADDR_LEN]);
420 static int get_stats_via_netlink(int ifindex, struct netdev_stats *stats);
421 static int get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats);
422 static int af_packet_sock(void);
423 static void netdev_linux_miimon_run(void);
424 static void netdev_linux_miimon_wait(void);
427 is_netdev_linux_class(const struct netdev_class *netdev_class)
429 return netdev_class->run == netdev_linux_run;
433 is_tap_netdev(const struct netdev *netdev)
435 return netdev_get_class(netdev) == &netdev_tap_class;
438 static struct netdev_linux *
439 netdev_linux_cast(const struct netdev *netdev)
441 ovs_assert(is_netdev_linux_class(netdev_get_class(netdev)));
443 return CONTAINER_OF(netdev, struct netdev_linux, up);
446 static struct netdev_rx_linux *
447 netdev_rx_linux_cast(const struct netdev_rx *rx)
449 ovs_assert(is_netdev_linux_class(netdev_get_class(rx->netdev)));
450 return CONTAINER_OF(rx, struct netdev_rx_linux, up);
/* Periodic work for all Linux netdevs: processes pending rtnetlink link
 * notifications and polls miimon timers.  This function doubles as the class
 * discriminator checked by is_netdev_linux_class(). */
static void
netdev_linux_run(void)
{
    rtnetlink_link_run();
    netdev_linux_miimon_run();
}
/* Arranges for the next poll_block() to wake up when netdev_linux_run() has
 * work to do (rtnetlink traffic or an expiring miimon timer). */
static void
netdev_linux_wait(void)
{
    rtnetlink_link_wait();
    netdev_linux_miimon_wait();
}
468 netdev_linux_changed(struct netdev_linux *dev,
469 unsigned int ifi_flags, unsigned int mask)
472 if (!dev->change_seq) {
476 if ((dev->ifi_flags ^ ifi_flags) & IFF_RUNNING) {
477 dev->carrier_resets++;
479 dev->ifi_flags = ifi_flags;
481 dev->cache_valid &= mask;
485 netdev_linux_update(struct netdev_linux *dev,
486 const struct rtnetlink_link_change *change)
488 if (change->nlmsg_type == RTM_NEWLINK) {
490 netdev_linux_changed(dev, change->ifi_flags, VALID_DRVINFO);
492 /* Update netdev from rtnl-change msg. */
494 dev->mtu = change->mtu;
495 dev->cache_valid |= VALID_MTU;
496 dev->netdev_mtu_error = 0;
499 if (!eth_addr_is_zero(change->addr)) {
500 memcpy(dev->etheraddr, change->addr, ETH_ADDR_LEN);
501 dev->cache_valid |= VALID_ETHERADDR;
502 dev->ether_addr_error = 0;
505 dev->ifindex = change->ifi_index;
506 dev->cache_valid |= VALID_IFINDEX;
507 dev->get_ifindex_error = 0;
510 netdev_linux_changed(dev, change->ifi_flags, 0);
515 netdev_linux_cache_cb(const struct rtnetlink_link_change *change,
516 void *aux OVS_UNUSED)
519 struct netdev *base_dev = netdev_from_name(change->ifname);
520 if (base_dev && is_netdev_linux_class(netdev_get_class(base_dev))) {
521 netdev_linux_update(netdev_linux_cast(base_dev), change);
522 netdev_close(base_dev);
525 struct shash device_shash;
526 struct shash_node *node;
528 shash_init(&device_shash);
529 netdev_get_devices(&netdev_linux_class, &device_shash);
530 SHASH_FOR_EACH (node, &device_shash) {
531 struct netdev *netdev = node->data;
532 struct netdev_linux *dev = netdev_linux_cast(netdev);
535 get_flags(&dev->up, &flags);
536 netdev_linux_changed(dev, flags, 0);
537 netdev_close(netdev);
539 shash_destroy(&device_shash);
544 cache_notifier_ref(void)
546 if (!cache_notifier_refcount) {
547 ovs_assert(!netdev_linux_cache_notifier);
549 netdev_linux_cache_notifier =
550 rtnetlink_link_notifier_create(netdev_linux_cache_cb, NULL);
552 if (!netdev_linux_cache_notifier) {
556 cache_notifier_refcount++;
562 cache_notifier_unref(void)
564 ovs_assert(cache_notifier_refcount > 0);
565 if (!--cache_notifier_refcount) {
566 ovs_assert(netdev_linux_cache_notifier);
567 rtnetlink_link_notifier_destroy(netdev_linux_cache_notifier);
568 netdev_linux_cache_notifier = NULL;
572 static struct netdev *
573 netdev_linux_alloc(void)
575 struct netdev_linux *netdev = xzalloc(sizeof *netdev);
580 netdev_linux_common_construct(struct netdev_linux *netdev)
582 netdev->change_seq = 1;
584 return cache_notifier_ref();
587 /* Creates system and internal devices. */
589 netdev_linux_construct(struct netdev *netdev_)
591 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
594 error = netdev_linux_common_construct(netdev);
599 error = get_flags(&netdev->up, &netdev->ifi_flags);
600 if (error == ENODEV) {
601 if (netdev->up.netdev_class != &netdev_internal_class) {
602 /* The device does not exist, so don't allow it to be opened. */
603 cache_notifier_unref();
606 /* "Internal" netdevs have to be created as netdev objects before
607 * they exist in the kernel, because creating them in the kernel
608 * happens by passing a netdev object to dpif_port_add().
609 * Therefore, ignore the error. */
616 /* For most types of netdevs we open the device for each call of
617 * netdev_open(). However, this is not the case with tap devices,
618 * since it is only possible to open the device once. In this
619 * situation we share a single file descriptor, and consequently
620 * buffers, across all readers. Therefore once data is read it will
621 * be unavailable to other reads for tap devices. */
623 netdev_linux_construct_tap(struct netdev *netdev_)
625 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
626 static const char tap_dev[] = "/dev/net/tun";
627 const char *name = netdev_->name;
631 error = netdev_linux_common_construct(netdev);
636 /* Open tap device. */
637 netdev->tap_fd = open(tap_dev, O_RDWR);
638 if (netdev->tap_fd < 0) {
640 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, ovs_strerror(error));
641 goto error_unref_notifier;
644 /* Create tap device. */
645 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
646 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
647 if (ioctl(netdev->tap_fd, TUNSETIFF, &ifr) == -1) {
648 VLOG_WARN("%s: creating tap device failed: %s", name,
649 ovs_strerror(errno));
654 /* Make non-blocking. */
655 error = set_nonblocking(netdev->tap_fd);
663 close(netdev->tap_fd);
664 error_unref_notifier:
665 cache_notifier_unref();
671 netdev_linux_destruct(struct netdev *netdev_)
673 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
675 if (netdev->tc && netdev->tc->ops->tc_destroy) {
676 netdev->tc->ops->tc_destroy(netdev->tc);
679 if (netdev_get_class(netdev_) == &netdev_tap_class
680 && netdev->tap_fd >= 0)
682 close(netdev->tap_fd);
685 cache_notifier_unref();
/* Frees the storage allocated by netdev_linux_alloc(). */
static void
netdev_linux_dealloc(struct netdev *netdev_)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);

    free(netdev);
}
695 static struct netdev_rx *
696 netdev_linux_rx_alloc(void)
698 struct netdev_rx_linux *rx = xzalloc(sizeof *rx);
703 netdev_linux_rx_construct(struct netdev_rx *rx_)
705 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
706 struct netdev *netdev_ = rx->up.netdev;
707 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
710 rx->is_tap = is_tap_netdev(netdev_);
712 rx->fd = netdev->tap_fd;
714 struct sockaddr_ll sll;
716 /* Result of tcpdump -dd inbound */
717 static const struct sock_filter filt[] = {
718 { 0x28, 0, 0, 0xfffff004 }, /* ldh [0] */
719 { 0x15, 0, 1, 0x00000004 }, /* jeq #4 jt 2 jf 3 */
720 { 0x6, 0, 0, 0x00000000 }, /* ret #0 */
721 { 0x6, 0, 0, 0x0000ffff } /* ret #65535 */
723 static const struct sock_fprog fprog = {
724 ARRAY_SIZE(filt), (struct sock_filter *) filt
727 /* Create file descriptor. */
728 rx->fd = socket(PF_PACKET, SOCK_RAW, 0);
731 VLOG_ERR("failed to create raw socket (%s)", ovs_strerror(error));
735 /* Set non-blocking mode. */
736 error = set_nonblocking(rx->fd);
741 /* Get ethernet device index. */
742 error = get_ifindex(&netdev->up, &ifindex);
747 /* Bind to specific ethernet device. */
748 memset(&sll, 0, sizeof sll);
749 sll.sll_family = AF_PACKET;
750 sll.sll_ifindex = ifindex;
751 sll.sll_protocol = (OVS_FORCE unsigned short int) htons(ETH_P_ALL);
752 if (bind(rx->fd, (struct sockaddr *) &sll, sizeof sll) < 0) {
754 VLOG_ERR("%s: failed to bind raw socket (%s)",
755 netdev_get_name(netdev_), ovs_strerror(error));
759 /* Filter for only inbound packets. */
760 error = setsockopt(rx->fd, SOL_SOCKET, SO_ATTACH_FILTER, &fprog,
764 VLOG_ERR("%s: failed to attach filter (%s)",
765 netdev_get_name(netdev_), ovs_strerror(error));
780 netdev_linux_rx_destruct(struct netdev_rx *rx_)
782 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
/* Frees the storage allocated by netdev_linux_rx_alloc(). */
static void
netdev_linux_rx_dealloc(struct netdev_rx *rx_)
{
    struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);

    free(rx);
}
798 netdev_linux_rx_recv(struct netdev_rx *rx_, void *data, size_t size)
800 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
805 ? read(rx->fd, data, size)
806 : recv(rx->fd, data, size, MSG_TRUNC));
807 } while (retval < 0 && errno == EINTR);
810 return retval > size ? -EMSGSIZE : retval;
812 if (errno != EAGAIN) {
813 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
814 ovs_strerror(errno), netdev_rx_get_name(rx_));
821 netdev_linux_rx_wait(struct netdev_rx *rx_)
823 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
824 poll_fd_wait(rx->fd, POLLIN);
828 netdev_linux_rx_drain(struct netdev_rx *rx_)
830 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
833 int error = af_inet_ifreq_ioctl(netdev_rx_get_name(rx_), &ifr,
834 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
838 drain_fd(rx->fd, ifr.ifr_qlen);
841 return drain_rcvbuf(rx->fd);
845 /* Sends 'buffer' on 'netdev'. Returns 0 if successful, otherwise a positive
846 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
847 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
848 * the packet is too big or too small to transmit on the device.
850 * The caller retains ownership of 'buffer' in all cases.
852 * The kernel maintains a packet transmission queue, so the caller is not
853 * expected to do additional queuing of packets. */
855 netdev_linux_send(struct netdev *netdev_, const void *data, size_t size)
860 if (!is_tap_netdev(netdev_)) {
861 /* Use our AF_PACKET socket to send to this device. */
862 struct sockaddr_ll sll;
869 sock = af_packet_sock();
874 error = get_ifindex(netdev_, &ifindex);
879 /* We don't bother setting most fields in sockaddr_ll because the
880 * kernel ignores them for SOCK_RAW. */
881 memset(&sll, 0, sizeof sll);
882 sll.sll_family = AF_PACKET;
883 sll.sll_ifindex = ifindex;
885 iov.iov_base = CONST_CAST(void *, data);
889 msg.msg_namelen = sizeof sll;
892 msg.msg_control = NULL;
893 msg.msg_controllen = 0;
896 retval = sendmsg(sock, &msg, 0);
898 /* Use the tap fd to send to this device. This is essential for
899 * tap devices, because packets sent to a tap device with an
900 * AF_PACKET socket will loop back to be *received* again on the
901 * tap device. This doesn't occur on other interface types
902 * because we attach a socket filter to the rx socket. */
903 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
905 retval = write(netdev->tap_fd, data, size);
909 /* The Linux AF_PACKET implementation never blocks waiting for room
910 * for packets, instead returning ENOBUFS. Translate this into
911 * EAGAIN for the caller. */
912 if (errno == ENOBUFS) {
914 } else if (errno == EINTR) {
916 } else if (errno != EAGAIN) {
917 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
918 netdev_get_name(netdev_), ovs_strerror(errno));
921 } else if (retval != size) {
922 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%zd bytes of "
923 "%zu) on %s", retval, size, netdev_get_name(netdev_));
/* Registers with the poll loop to wake up from the next call to poll_block()
 * when the packet transmission queue has sufficient room to transmit a packet
 * with netdev_send().
 *
 * The kernel maintains a packet transmission queue, so the client is not
 * expected to do additional queuing of packets.  Thus, this function is
 * unlikely to ever be used.  It is included for completeness. */
static void
netdev_linux_send_wait(struct netdev *netdev)
{
    if (is_tap_netdev(netdev)) {
        /* TAP device always accepts packets. */
        poll_immediate_wake();
    }
}
947 /* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
948 * otherwise a positive errno value. */
950 netdev_linux_set_etheraddr(struct netdev *netdev_,
951 const uint8_t mac[ETH_ADDR_LEN])
953 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
954 struct netdev_saved_flags *sf = NULL;
957 if (netdev->cache_valid & VALID_ETHERADDR) {
958 if (netdev->ether_addr_error) {
959 return netdev->ether_addr_error;
961 if (eth_addr_equals(netdev->etheraddr, mac)) {
964 netdev->cache_valid &= ~VALID_ETHERADDR;
967 /* Tap devices must be brought down before setting the address. */
968 if (is_tap_netdev(netdev_)) {
969 netdev_turn_flags_off(netdev_, NETDEV_UP, &sf);
971 error = set_etheraddr(netdev_get_name(netdev_), mac);
972 if (!error || error == ENODEV) {
973 netdev->ether_addr_error = error;
974 netdev->cache_valid |= VALID_ETHERADDR;
976 memcpy(netdev->etheraddr, mac, ETH_ADDR_LEN);
980 netdev_restore_flags(sf);
985 /* Copies 'netdev''s MAC address to 'mac' which is passed as param. */
987 netdev_linux_get_etheraddr(const struct netdev *netdev_,
988 uint8_t mac[ETH_ADDR_LEN])
990 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
992 if (!(netdev->cache_valid & VALID_ETHERADDR)) {
993 int error = get_etheraddr(netdev_get_name(netdev_),
996 netdev->ether_addr_error = error;
997 netdev->cache_valid |= VALID_ETHERADDR;
1000 if (!netdev->ether_addr_error) {
1001 memcpy(mac, netdev->etheraddr, ETH_ADDR_LEN);
1004 return netdev->ether_addr_error;
1007 /* Returns the maximum size of transmitted (and received) packets on 'netdev',
1008 * in bytes, not including the hardware header; thus, this is typically 1500
1009 * bytes for Ethernet devices. */
1011 netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
1013 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1014 if (!(netdev->cache_valid & VALID_MTU)) {
1018 error = af_inet_ifreq_ioctl(netdev_get_name(netdev_), &ifr,
1019 SIOCGIFMTU, "SIOCGIFMTU");
1021 netdev->netdev_mtu_error = error;
1022 netdev->mtu = ifr.ifr_mtu;
1023 netdev->cache_valid |= VALID_MTU;
1026 if (!netdev->netdev_mtu_error) {
1027 *mtup = netdev->mtu;
1029 return netdev->netdev_mtu_error;
1032 /* Sets the maximum size of transmitted (MTU) for given device using linux
1033 * networking ioctl interface.
1036 netdev_linux_set_mtu(const struct netdev *netdev_, int mtu)
1038 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1042 if (netdev->cache_valid & VALID_MTU) {
1043 if (netdev->netdev_mtu_error) {
1044 return netdev->netdev_mtu_error;
1046 if (netdev->mtu == mtu) {
1049 netdev->cache_valid &= ~VALID_MTU;
1052 error = af_inet_ifreq_ioctl(netdev_get_name(netdev_), &ifr,
1053 SIOCSIFMTU, "SIOCSIFMTU");
1054 if (!error || error == ENODEV) {
1055 netdev->netdev_mtu_error = error;
1056 netdev->mtu = ifr.ifr_mtu;
1057 netdev->cache_valid |= VALID_MTU;
/* Returns the ifindex of 'netdev', if successful, as a positive number.
 * On failure, returns a negative errno value. */
static int
netdev_linux_get_ifindex(const struct netdev *netdev)
{
    int ifindex, error;

    error = get_ifindex(netdev, &ifindex);
    return error ? -error : ifindex;
}
1074 netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
1076 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1078 if (netdev->miimon_interval > 0) {
1079 *carrier = netdev->miimon;
1081 *carrier = (netdev->ifi_flags & IFF_RUNNING) != 0;
1087 static long long int
1088 netdev_linux_get_carrier_resets(const struct netdev *netdev)
1090 return netdev_linux_cast(netdev)->carrier_resets;
1094 netdev_linux_do_miimon(const char *name, int cmd, const char *cmd_name,
1095 struct mii_ioctl_data *data)
1100 memset(&ifr, 0, sizeof ifr);
1101 memcpy(&ifr.ifr_data, data, sizeof *data);
1102 error = af_inet_ifreq_ioctl(name, &ifr, cmd, cmd_name);
1103 memcpy(data, &ifr.ifr_data, sizeof *data);
1109 netdev_linux_get_miimon(const char *name, bool *miimon)
1111 struct mii_ioctl_data data;
1116 memset(&data, 0, sizeof data);
1117 error = netdev_linux_do_miimon(name, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
1119 /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
1120 data.reg_num = MII_BMSR;
1121 error = netdev_linux_do_miimon(name, SIOCGMIIREG, "SIOCGMIIREG",
1125 *miimon = !!(data.val_out & BMSR_LSTATUS);
1127 VLOG_WARN_RL(&rl, "%s: failed to query MII", name);
1130 struct ethtool_cmd ecmd;
1132 VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
1135 COVERAGE_INC(netdev_get_ethtool);
1136 memset(&ecmd, 0, sizeof ecmd);
1137 error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
1140 struct ethtool_value eval;
1142 memcpy(&eval, &ecmd, sizeof eval);
1143 *miimon = !!eval.data;
1145 VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
1153 netdev_linux_set_miimon_interval(struct netdev *netdev_,
1154 long long int interval)
1156 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1158 interval = interval > 0 ? MAX(interval, 100) : 0;
1159 if (netdev->miimon_interval != interval) {
1160 netdev->miimon_interval = interval;
1161 timer_set_expired(&netdev->miimon_timer);
1168 netdev_linux_miimon_run(void)
1170 struct shash device_shash;
1171 struct shash_node *node;
1173 shash_init(&device_shash);
1174 netdev_get_devices(&netdev_linux_class, &device_shash);
1175 SHASH_FOR_EACH (node, &device_shash) {
1176 struct netdev *netdev = node->data;
1177 struct netdev_linux *dev = netdev_linux_cast(netdev);
1180 if (dev->miimon_interval <= 0 || !timer_expired(&dev->miimon_timer)) {
1181 netdev_close(netdev);
1185 netdev_linux_get_miimon(dev->up.name, &miimon);
1186 if (miimon != dev->miimon) {
1187 dev->miimon = miimon;
1188 netdev_linux_changed(dev, dev->ifi_flags, 0);
1191 timer_set_duration(&dev->miimon_timer, dev->miimon_interval);
1192 netdev_close(netdev);
1195 shash_destroy(&device_shash);
/* Registers a poll-loop wakeup for each device that has miimon enabled,
 * so the main loop wakes when a miimon timer is due to expire. */
1199 netdev_linux_miimon_wait(void)
1201 struct shash device_shash;
1202 struct shash_node *node;
1204 shash_init(&device_shash);
1205 netdev_get_devices(&netdev_linux_class, &device_shash);
1206 SHASH_FOR_EACH (node, &device_shash) {
1207 struct netdev *netdev = node->data;
1208 struct netdev_linux *dev = netdev_linux_cast(netdev);
1210 if (dev->miimon_interval > 0) {
1211 timer_wait(&dev->miimon_timer);
1213 netdev_close(netdev);
1215 shash_destroy(&device_shash);
1218 /* Check whether we can we use RTM_GETLINK to get network device statistics.
1219 * In pre-2.6.19 kernels, this was only available if wireless extensions were
/* Probes by resolving the ifindex of "lo" and issuing one RTM_GETLINK
 * stats query; on failure it logs and the caller falls back to /proc.
 * NOTE(review): fragmentary listing — the return statements are elided. */
1222 check_for_working_netlink_stats(void)
1224 /* Decide on the netdev_get_stats() implementation to use. Netlink is
1225 * preferable, so if that works, we'll use it. */
1226 int ifindex = do_get_ifindex("lo");
1228 VLOG_WARN("failed to get ifindex for lo, "
1229 "obtaining netdev stats from proc");
1232 struct netdev_stats stats;
1233 int error = get_stats_via_netlink(ifindex, &stats);
1235 VLOG_DBG("obtaining netdev stats via rtnetlink");
1238 VLOG_INFO("RTM_GETLINK failed (%s), obtaining netdev stats "
1239 "via proc (you are probably running a pre-2.6.19 "
1240 "kernel)", ovs_strerror(error));
/* Exchanges the values of the two uint64_t objects '*a' and '*b'.
 * NOTE(review): only the signature is visible in this excerpt; body elided. */
1247 swap_uint64(uint64_t *a, uint64_t *b)
1254 /* Copies 'src' into 'dst', performing format conversion in the process.
1256 * 'src' is allowed to be misaligned. */
/* Each 64-bit counter is read with get_unaligned_u64() because 'src' may
 * not be 8-byte aligned; fields that ovs_vport_stats does not carry are
 * explicitly zeroed so 'dst' is fully initialized. */
1258 netdev_stats_from_ovs_vport_stats(struct netdev_stats *dst,
1259 const struct ovs_vport_stats *src)
1261 dst->rx_packets = get_unaligned_u64(&src->rx_packets);
1262 dst->tx_packets = get_unaligned_u64(&src->tx_packets);
1263 dst->rx_bytes = get_unaligned_u64(&src->rx_bytes);
1264 dst->tx_bytes = get_unaligned_u64(&src->tx_bytes);
1265 dst->rx_errors = get_unaligned_u64(&src->rx_errors);
1266 dst->tx_errors = get_unaligned_u64(&src->tx_errors);
1267 dst->rx_dropped = get_unaligned_u64(&src->rx_dropped);
1268 dst->tx_dropped = get_unaligned_u64(&src->tx_dropped);
1270 dst->collisions = 0;
1271 dst->rx_length_errors = 0;
1272 dst->rx_over_errors = 0;
1273 dst->rx_crc_errors = 0;
1274 dst->rx_frame_errors = 0;
1275 dst->rx_fifo_errors = 0;
1276 dst->rx_missed_errors = 0;
1277 dst->tx_aborted_errors = 0;
1278 dst->tx_carrier_errors = 0;
1279 dst->tx_fifo_errors = 0;
1280 dst->tx_heartbeat_errors = 0;
1281 dst->tx_window_errors = 0;
/* Fetches vport stats for 'netdev' from the datapath via a Netlink vport
 * GET transaction and converts them into '*stats'.
 * NOTE(review): fragmentary listing — error returns and the reply-buffer
 * cleanup are elided here. */
1285 get_stats_via_vport__(const struct netdev *netdev, struct netdev_stats *stats)
1287 struct dpif_linux_vport reply;
1291 error = dpif_linux_vport_get(netdev_get_name(netdev), &reply, &buf);
/* A reply without a stats attribute is treated as a distinct case from a
 * transaction error (handling elided). */
1294 } else if (!reply.stats) {
1299 netdev_stats_from_ovs_vport_stats(stats, reply.stats);
/* Wrapper around get_stats_via_vport__() that caches its error status in
 * netdev->vport_stats_error.  The query is retried whenever the last
 * attempt succeeded or the cached status has not been established yet
 * (VALID_VPORT_STAT_ERROR not set).  ENOENT is expected for non-vport
 * devices and is not logged. */
1307 get_stats_via_vport(const struct netdev *netdev_,
1308 struct netdev_stats *stats)
1310 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1312 if (!netdev->vport_stats_error ||
1313 !(netdev->cache_valid & VALID_VPORT_STAT_ERROR)) {
1316 error = get_stats_via_vport__(netdev_, stats);
1317 if (error && error != ENOENT) {
1318 VLOG_WARN_RL(&rl, "%s: obtaining netdev stats via vport failed "
1320 netdev_get_name(netdev_), ovs_strerror(error));
1322 netdev->vport_stats_error = error;
1323 netdev->cache_valid |= VALID_VPORT_STAT_ERROR;
/* Retrieves kernel-side statistics for 'netdev_' into '*stats', using
 * rtnetlink when a one-time probe (check_for_working_netlink_stats, run
 * under ovsthread_once) says it works, otherwise falling back to /proc.
 * Failures are logged at rate-limited WARN level. */
1328 netdev_linux_sys_get_stats(const struct netdev *netdev_,
1329 struct netdev_stats *stats)
1331 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
1332 static int use_netlink_stats;
1335 if (ovsthread_once_start(&once)) {
1336 use_netlink_stats = check_for_working_netlink_stats();
1337 ovsthread_once_done(&once);
1340 if (use_netlink_stats) {
1343 error = get_ifindex(netdev_, &ifindex);
1345 error = get_stats_via_netlink(ifindex, stats);
1348 error = get_stats_via_proc(netdev_get_name(netdev_), stats);
1352 VLOG_WARN_RL(&rl, "%s: linux-sys get stats failed %d",
1353 netdev_get_name(netdev_), error);
1359 /* Retrieves current device stats for 'netdev-linux'. */
/* Prefers vport-layer stats; when those are unavailable
 * (netdev->vport_stats_error set) the system (ioctl/netlink/proc) stats
 * are used instead, and otherwise the system error counters are *added*
 * onto the vport byte/packet counts.
 * NOTE(review): fragmentary listing — the branch bodies for the two
 * vport_stats_error checks are partly elided. */
1361 netdev_linux_get_stats(const struct netdev *netdev_,
1362 struct netdev_stats *stats)
1364 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1365 struct netdev_stats dev_stats;
1368 get_stats_via_vport(netdev_, stats);
1370 error = netdev_linux_sys_get_stats(netdev_, &dev_stats);
1373 if (netdev->vport_stats_error) {
1380 if (netdev->vport_stats_error) {
1381 /* stats not available from OVS then use ioctl stats. */
/* Vport stats are authoritative for packets/bytes; only the error and
 * detail counters below are merged in from the system stats. */
1384 stats->rx_errors += dev_stats.rx_errors;
1385 stats->tx_errors += dev_stats.tx_errors;
1386 stats->rx_dropped += dev_stats.rx_dropped;
1387 stats->tx_dropped += dev_stats.tx_dropped;
1388 stats->multicast += dev_stats.multicast;
1389 stats->collisions += dev_stats.collisions;
1390 stats->rx_length_errors += dev_stats.rx_length_errors;
1391 stats->rx_over_errors += dev_stats.rx_over_errors;
1392 stats->rx_crc_errors += dev_stats.rx_crc_errors;
1393 stats->rx_frame_errors += dev_stats.rx_frame_errors;
1394 stats->rx_fifo_errors += dev_stats.rx_fifo_errors;
1395 stats->rx_missed_errors += dev_stats.rx_missed_errors;
1396 stats->tx_aborted_errors += dev_stats.tx_aborted_errors;
1397 stats->tx_carrier_errors += dev_stats.tx_carrier_errors;
1398 stats->tx_fifo_errors += dev_stats.tx_fifo_errors;
1399 stats->tx_heartbeat_errors += dev_stats.tx_heartbeat_errors;
1400 stats->tx_window_errors += dev_stats.tx_window_errors;
1405 /* Retrieves current device stats for 'netdev-tap' netdev or
1406 * netdev-internal. */
1408 netdev_tap_get_stats(const struct netdev *netdev_, struct netdev_stats *stats)
1410 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1411 struct netdev_stats dev_stats;
1414 get_stats_via_vport(netdev_, stats);
1416 error = netdev_linux_sys_get_stats(netdev_, &dev_stats);
1418 if (netdev->vport_stats_error) {
1425 /* If this port is an internal port then the transmit and receive stats
1426 * will appear to be swapped relative to the other ports since we are the
1427 * one sending the data, not a remote computer. For consistency, we swap
1428 * them back here. This does not apply if we are getting stats from the
1429 * vport layer because it always tracks stats from the perspective of the
1431 if (netdev->vport_stats_error) {
/* No vport stats: use system stats directly, swapping rx/tx and zeroing
 * detail counters that are meaningless after the swap. */
1433 swap_uint64(&stats->rx_packets, &stats->tx_packets);
1434 swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
1435 swap_uint64(&stats->rx_errors, &stats->tx_errors);
1436 swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
1437 stats->rx_length_errors = 0;
1438 stats->rx_over_errors = 0;
1439 stats->rx_crc_errors = 0;
1440 stats->rx_frame_errors = 0;
1441 stats->rx_fifo_errors = 0;
1442 stats->rx_missed_errors = 0;
1443 stats->tx_aborted_errors = 0;
1444 stats->tx_carrier_errors = 0;
1445 stats->tx_fifo_errors = 0;
1446 stats->tx_heartbeat_errors = 0;
1447 stats->tx_window_errors = 0;
/* Vport stats available: merge system error counters, crossing rx/tx
 * because the perspective of the system stats is inverted. */
1449 stats->rx_dropped += dev_stats.tx_dropped;
1450 stats->tx_dropped += dev_stats.rx_dropped;
1452 stats->rx_errors += dev_stats.tx_errors;
1453 stats->tx_errors += dev_stats.rx_errors;
1455 stats->multicast += dev_stats.multicast;
1456 stats->collisions += dev_stats.collisions;
/* Retrieves stats for an internal device purely from the vport layer and
 * returns the cached vport error status (0 on success). */
1462 netdev_internal_get_stats(const struct netdev *netdev_,
1463 struct netdev_stats *stats)
1465 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1467 get_stats_via_vport(netdev_, stats);
1468 return netdev->vport_stats_error;
/* Pushes '*stats' down to the datapath vport for 'netdev' via an
 * OVS_VPORT_CMD_SET transaction.  ENODEV from the transaction is tolerated
 * (the device may simply not be attached to the vport layer; see comment
 * below).  NOTE(review): fragmentary listing — final return is elided. */
1472 netdev_internal_set_stats(struct netdev *netdev,
1473 const struct netdev_stats *stats)
1475 struct ovs_vport_stats vport_stats;
1476 struct dpif_linux_vport vport;
1479 vport_stats.rx_packets = stats->rx_packets;
1480 vport_stats.tx_packets = stats->tx_packets;
1481 vport_stats.rx_bytes = stats->rx_bytes;
1482 vport_stats.tx_bytes = stats->tx_bytes;
1483 vport_stats.rx_errors = stats->rx_errors;
1484 vport_stats.tx_errors = stats->tx_errors;
1485 vport_stats.rx_dropped = stats->rx_dropped;
1486 vport_stats.tx_dropped = stats->tx_dropped;
1488 dpif_linux_vport_init(&vport);
1489 vport.cmd = OVS_VPORT_CMD_SET;
1490 vport.name = netdev_get_name(netdev);
1491 vport.stats = &vport_stats;
1493 err = dpif_linux_vport_transact(&vport, NULL, NULL);
1495 /* If the vport layer doesn't know about the device, that doesn't mean it
1496 * doesn't exist (after all were able to open it when netdev_open() was
1497 * called), it just means that it isn't attached and we'll be getting
1498 * stats a different way. */
1499 if (err == ENODEV) {
/* Queries the device's link features via ETHTOOL_GSET and caches them in
 * netdev->supported, ->advertised, and ->current as NETDEV_F_* bitmaps,
 * guarded by the VALID_FEATURES cache flag.  The final error (or 0) is
 * recorded in netdev->get_features_error.
 * NOTE(review): fragmentary listing — early-return/goto paths and closing
 * braces are elided. */
1507 netdev_linux_read_features(struct netdev_linux *netdev)
1509 struct ethtool_cmd ecmd;
/* Cached results are reused; skip the ioctl entirely. */
1513 if (netdev->cache_valid & VALID_FEATURES) {
1517 COVERAGE_INC(netdev_get_ethtool);
1518 memset(&ecmd, 0, sizeof ecmd);
1519 error = netdev_linux_do_ethtool(netdev->up.name, &ecmd,
1520 ETHTOOL_GSET, "ETHTOOL_GSET");
1525 /* Supported features. */
1526 netdev->supported = 0;
1527 if (ecmd.supported & SUPPORTED_10baseT_Half) {
1528 netdev->supported |= NETDEV_F_10MB_HD;
1530 if (ecmd.supported & SUPPORTED_10baseT_Full) {
1531 netdev->supported |= NETDEV_F_10MB_FD;
1533 if (ecmd.supported & SUPPORTED_100baseT_Half) {
1534 netdev->supported |= NETDEV_F_100MB_HD;
1536 if (ecmd.supported & SUPPORTED_100baseT_Full) {
1537 netdev->supported |= NETDEV_F_100MB_FD;
1539 if (ecmd.supported & SUPPORTED_1000baseT_Half) {
1540 netdev->supported |= NETDEV_F_1GB_HD;
1542 if (ecmd.supported & SUPPORTED_1000baseT_Full) {
1543 netdev->supported |= NETDEV_F_1GB_FD;
1545 if (ecmd.supported & SUPPORTED_10000baseT_Full) {
1546 netdev->supported |= NETDEV_F_10GB_FD;
1548 if (ecmd.supported & SUPPORTED_TP) {
1549 netdev->supported |= NETDEV_F_COPPER;
1551 if (ecmd.supported & SUPPORTED_FIBRE) {
1552 netdev->supported |= NETDEV_F_FIBER;
1554 if (ecmd.supported & SUPPORTED_Autoneg) {
1555 netdev->supported |= NETDEV_F_AUTONEG;
1557 if (ecmd.supported & SUPPORTED_Pause) {
1558 netdev->supported |= NETDEV_F_PAUSE;
1560 if (ecmd.supported & SUPPORTED_Asym_Pause) {
1561 netdev->supported |= NETDEV_F_PAUSE_ASYM;
1564 /* Advertised features. */
1565 netdev->advertised = 0;
1566 if (ecmd.advertising & ADVERTISED_10baseT_Half) {
1567 netdev->advertised |= NETDEV_F_10MB_HD;
1569 if (ecmd.advertising & ADVERTISED_10baseT_Full) {
1570 netdev->advertised |= NETDEV_F_10MB_FD;
1572 if (ecmd.advertising & ADVERTISED_100baseT_Half) {
1573 netdev->advertised |= NETDEV_F_100MB_HD;
1575 if (ecmd.advertising & ADVERTISED_100baseT_Full) {
1576 netdev->advertised |= NETDEV_F_100MB_FD;
1578 if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
1579 netdev->advertised |= NETDEV_F_1GB_HD;
1581 if (ecmd.advertising & ADVERTISED_1000baseT_Full) {
1582 netdev->advertised |= NETDEV_F_1GB_FD;
1584 if (ecmd.advertising & ADVERTISED_10000baseT_Full) {
1585 netdev->advertised |= NETDEV_F_10GB_FD;
1587 if (ecmd.advertising & ADVERTISED_TP) {
1588 netdev->advertised |= NETDEV_F_COPPER;
1590 if (ecmd.advertising & ADVERTISED_FIBRE) {
1591 netdev->advertised |= NETDEV_F_FIBER;
1593 if (ecmd.advertising & ADVERTISED_Autoneg) {
1594 netdev->advertised |= NETDEV_F_AUTONEG;
1596 if (ecmd.advertising & ADVERTISED_Pause) {
1597 netdev->advertised |= NETDEV_F_PAUSE;
1599 if (ecmd.advertising & ADVERTISED_Asym_Pause) {
1600 netdev->advertised |= NETDEV_F_PAUSE_ASYM;
1603 /* Current settings. */
/* 40G/100G/1T speeds predate SPEED_* macros in some kernel headers, hence
 * the literal values (presumably; the 'speed' assignment line is elided). */
1605 if (speed == SPEED_10) {
1606 netdev->current = ecmd.duplex ? NETDEV_F_10MB_FD : NETDEV_F_10MB_HD;
1607 } else if (speed == SPEED_100) {
1608 netdev->current = ecmd.duplex ? NETDEV_F_100MB_FD : NETDEV_F_100MB_HD;
1609 } else if (speed == SPEED_1000) {
1610 netdev->current = ecmd.duplex ? NETDEV_F_1GB_FD : NETDEV_F_1GB_HD;
1611 } else if (speed == SPEED_10000) {
1612 netdev->current = NETDEV_F_10GB_FD;
1613 } else if (speed == 40000) {
1614 netdev->current = NETDEV_F_40GB_FD;
1615 } else if (speed == 100000) {
1616 netdev->current = NETDEV_F_100GB_FD;
1617 } else if (speed == 1000000) {
1618 netdev->current = NETDEV_F_1TB_FD;
1620 netdev->current = 0;
1623 if (ecmd.port == PORT_TP) {
1624 netdev->current |= NETDEV_F_COPPER;
1625 } else if (ecmd.port == PORT_FIBRE) {
1626 netdev->current |= NETDEV_F_FIBER;
1630 netdev->current |= NETDEV_F_AUTONEG;
1634 netdev->cache_valid |= VALID_FEATURES;
1635 netdev->get_features_error = error;
1638 /* Stores the features supported by 'netdev' into of '*current', '*advertised',
1639 * '*supported', and '*peer'. Each value is a bitmap of NETDEV_* bits.
1640 * Returns 0 if successful, otherwise a positive errno value. */
1642 netdev_linux_get_features(const struct netdev *netdev_,
1643 enum netdev_features *current,
1644 enum netdev_features *advertised,
1645 enum netdev_features *supported,
1646 enum netdev_features *peer)
1648 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
/* Populates (or reuses) the cached ethtool feature bitmaps first. */
1650 netdev_linux_read_features(netdev);
1652 if (!netdev->get_features_error) {
1653 *current = netdev->current;
1654 *advertised = netdev->advertised;
1655 *supported = netdev->supported;
1656 *peer = 0; /* XXX */
1658 return netdev->get_features_error;
1661 /* Set the features advertised by 'netdev' to 'advertise'. */
/* Performs a read-modify-write: ETHTOOL_GSET to fetch current settings,
 * rebuild ecmd.advertising from the NETDEV_F_* bitmap, then ETHTOOL_SSET.
 * NOTE(review): fragmentary listing — the error check after GSET is
 * elided. */
1663 netdev_linux_set_advertisements(struct netdev *netdev,
1664 enum netdev_features advertise)
1666 struct ethtool_cmd ecmd;
1669 COVERAGE_INC(netdev_get_ethtool);
1670 memset(&ecmd, 0, sizeof ecmd);
1671 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1672 ETHTOOL_GSET, "ETHTOOL_GSET");
1677 ecmd.advertising = 0;
1678 if (advertise & NETDEV_F_10MB_HD) {
1679 ecmd.advertising |= ADVERTISED_10baseT_Half;
1681 if (advertise & NETDEV_F_10MB_FD) {
1682 ecmd.advertising |= ADVERTISED_10baseT_Full;
1684 if (advertise & NETDEV_F_100MB_HD) {
1685 ecmd.advertising |= ADVERTISED_100baseT_Half;
1687 if (advertise & NETDEV_F_100MB_FD) {
1688 ecmd.advertising |= ADVERTISED_100baseT_Full;
1690 if (advertise & NETDEV_F_1GB_HD) {
1691 ecmd.advertising |= ADVERTISED_1000baseT_Half;
1693 if (advertise & NETDEV_F_1GB_FD) {
1694 ecmd.advertising |= ADVERTISED_1000baseT_Full;
1696 if (advertise & NETDEV_F_10GB_FD) {
1697 ecmd.advertising |= ADVERTISED_10000baseT_Full;
1699 if (advertise & NETDEV_F_COPPER) {
1700 ecmd.advertising |= ADVERTISED_TP;
1702 if (advertise & NETDEV_F_FIBER) {
1703 ecmd.advertising |= ADVERTISED_FIBRE;
1705 if (advertise & NETDEV_F_AUTONEG) {
1706 ecmd.advertising |= ADVERTISED_Autoneg;
1708 if (advertise & NETDEV_F_PAUSE) {
1709 ecmd.advertising |= ADVERTISED_Pause;
1711 if (advertise & NETDEV_F_PAUSE_ASYM) {
1712 ecmd.advertising |= ADVERTISED_Asym_Pause;
1714 COVERAGE_INC(netdev_set_ethtool);
1715 return netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1716 ETHTOOL_SSET, "ETHTOOL_SSET");
1719 /* Attempts to set input rate limiting (policing) policy. Returns 0 if
1720 * successful, otherwise a positive errno value. */
/* Implemented with tc: any existing ingress qdisc is removed, then (for a
 * nonzero rate) an ingress qdisc plus policer action is installed.  The
 * outcome is cached (VALID_POLICING) so identical settings are a no-op.
 * NOTE(review): fragmentary listing — returns/gotos between steps are
 * elided. */
1722 netdev_linux_set_policing(struct netdev *netdev_,
1723 uint32_t kbits_rate, uint32_t kbits_burst)
1725 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1726 const char *netdev_name = netdev_get_name(netdev_);
1730 kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
1731 : !kbits_burst ? 1000 /* Default to 1000 kbits if 0. */
1732 : kbits_burst); /* Stick with user-specified value. */
1734 if (netdev->cache_valid & VALID_POLICING) {
1735 if (netdev->netdev_policing_error) {
1736 return netdev->netdev_policing_error;
1739 if (netdev->kbits_rate == kbits_rate &&
1740 netdev->kbits_burst == kbits_burst) {
1741 /* Assume that settings haven't changed since we last set them. */
1744 netdev->cache_valid &= ~VALID_POLICING;
1747 COVERAGE_INC(netdev_set_policing);
1748 /* Remove any existing ingress qdisc. */
1749 error = tc_add_del_ingress_qdisc(netdev_, false);
1751 VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
1752 netdev_name, ovs_strerror(error));
1757 error = tc_add_del_ingress_qdisc(netdev_, true);
1759 VLOG_WARN_RL(&rl, "%s: adding policing qdisc failed: %s",
1760 netdev_name, ovs_strerror(error));
1764 error = tc_add_policer(netdev_, kbits_rate, kbits_burst);
1766 VLOG_WARN_RL(&rl, "%s: adding policing action failed: %s",
1767 netdev_name, ovs_strerror(error));
1772 netdev->kbits_rate = kbits_rate;
1773 netdev->kbits_burst = kbits_burst;
/* ENODEV is cached too: a vanished device should not be retried forever. */
1776 if (!error || error == ENODEV) {
1777 netdev->netdev_policing_error = error;
1778 netdev->cache_valid |= VALID_POLICING;
/* Adds the OVS-visible name of every installable tc implementation in the
 * 'tcs' table to 'types' (entries with an empty ovs_name are internal-only
 * and excluded). */
1784 netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
1787 const struct tc_ops *const *opsp;
1789 for (opsp = tcs; *opsp != NULL; opsp++) {
1790 const struct tc_ops *ops = *opsp;
1791 if (ops->tc_install && ops->ovs_name[0] != '\0') {
1792 sset_add(types, ops->ovs_name);
/* Returns the tc_ops whose OVS-facing name equals 'name', or (in the elided
 * tail) a not-found result after the table is exhausted. */
1798 static const struct tc_ops *
1799 tc_lookup_ovs_name(const char *name)
1801 const struct tc_ops *const *opsp;
1803 for (opsp = tcs; *opsp != NULL; opsp++) {
1804 const struct tc_ops *ops = *opsp;
1805 if (!strcmp(name, ops->ovs_name)) {
/* Returns the tc_ops whose Linux qdisc name equals 'name'; entries with a
 * NULL linux_name are skipped.  (Not-found return is in elided lines.) */
1812 static const struct tc_ops *
1813 tc_lookup_linux_name(const char *name)
1815 const struct tc_ops *const *opsp;
1817 for (opsp = tcs; *opsp != NULL; opsp++) {
1818 const struct tc_ops *ops = *opsp;
1819 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
/* Looks up the queue with 'queue_id' in netdev->tc->queues using the
 * caller-supplied 'hash' bucket; returns it if found (NULL path elided). */
1826 static struct tc_queue *
1827 tc_find_queue__(const struct netdev *netdev_, unsigned int queue_id,
1830 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1831 struct tc_queue *queue;
1833 HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev->tc->queues) {
1834 if (queue->queue_id == queue_id) {
/* Convenience wrapper: hashes 'queue_id' and delegates to tc_find_queue__. */
1841 static struct tc_queue *
1842 tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
1844 return tc_find_queue__(netdev, queue_id, hash_int(queue_id, 0));
/* Reports the capabilities (currently just the queue count) of QoS 'type';
 * unknown types are rejected in an elided branch. */
1848 netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
1850 struct netdev_qos_capabilities *caps)
1852 const struct tc_ops *ops = tc_lookup_ovs_name(type);
1856 caps->n_queues = ops->n_queues;
/* Queries the installed qdisc for 'netdev_', stores its OVS type name in
 * '*typep', and fills 'details' via the implementation's qdisc_get hook
 * when one exists (implementations without one report empty details). */
1861 netdev_linux_get_qos(const struct netdev *netdev_,
1862 const char **typep, struct smap *details)
1864 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1867 error = tc_query_qdisc(netdev_);
1872 *typep = netdev->tc->ops->ovs_name;
1873 return (netdev->tc->ops->qdisc_get
1874 ? netdev->tc->ops->qdisc_get(netdev_, details)
/* Switches 'netdev_' to QoS type 'type' with 'details'.  If the requested
 * type matches the current one, only its settings are updated via
 * qdisc_set; otherwise the old qdisc is deleted and the new one installed.
 * Unknown/uninstallable types are rejected (elided return). */
1879 netdev_linux_set_qos(struct netdev *netdev_,
1880 const char *type, const struct smap *details)
1882 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1883 const struct tc_ops *new_ops;
1886 new_ops = tc_lookup_ovs_name(type);
1887 if (!new_ops || !new_ops->tc_install) {
1891 error = tc_query_qdisc(netdev_);
1896 if (new_ops == netdev->tc->ops) {
1897 return new_ops->qdisc_set ? new_ops->qdisc_set(netdev_, details) : 0;
1899 /* Delete existing qdisc. */
1900 error = tc_del_qdisc(netdev_);
1904 ovs_assert(netdev->tc == NULL);
1906 /* Install new qdisc. */
1907 error = new_ops->tc_install(netdev_, details);
/* tc_install succeeds iff it attached a tc state to the netdev. */
1908 ovs_assert((error == 0) == (netdev->tc != NULL));
/* Fetches the configuration of queue 'queue_id' into 'details' using the
 * current tc implementation's class_get hook (missing-queue handling is
 * in elided lines). */
1915 netdev_linux_get_queue(const struct netdev *netdev_,
1916 unsigned int queue_id, struct smap *details)
1918 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1921 error = tc_query_qdisc(netdev_);
1925 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
1927 ? netdev->tc->ops->class_get(netdev_, queue, details)
/* Configures queue 'queue_id' with 'details'.  Rejects ids beyond the
 * implementation's n_queues or implementations without class_set (the
 * rejection's return value is in an elided line). */
1933 netdev_linux_set_queue(struct netdev *netdev_,
1934 unsigned int queue_id, const struct smap *details)
1936 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1939 error = tc_query_qdisc(netdev_);
1942 } else if (queue_id >= netdev->tc->ops->n_queues
1943 || !netdev->tc->ops->class_set) {
1947 return netdev->tc->ops->class_set(netdev_, queue_id, details);
/* Deletes queue 'queue_id' via the implementation's class_delete hook;
 * implementations without the hook, and unknown queue ids, are rejected. */
1951 netdev_linux_delete_queue(struct netdev *netdev_, unsigned int queue_id)
1953 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1956 error = tc_query_qdisc(netdev_);
1959 } else if (!netdev->tc->ops->class_delete) {
1962 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
1964 ? netdev->tc->ops->class_delete(netdev_, queue)
/* Retrieves statistics for queue 'queue_id' into '*stats', stamping the
 * queue's creation time before delegating to class_get_stats. */
1970 netdev_linux_get_queue_stats(const struct netdev *netdev_,
1971 unsigned int queue_id,
1972 struct netdev_queue_stats *stats)
1974 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1977 error = tc_query_qdisc(netdev_);
1980 } else if (!netdev->tc->ops->class_get_stats) {
1983 const struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
1987 stats->created = queue->created;
1988 return netdev->tc->ops->class_get_stats(netdev_, queue, stats);
/* Begins an RTM_GETTCLASS Netlink dump of all tc classes on 'netdev'.
 * tcm_parent == 0 requests classes under every parent.  (The failure path
 * when tc_make_request cannot build the message is elided.) */
1993 start_queue_dump(const struct netdev *netdev, struct nl_dump *dump)
1995 struct ofpbuf request;
1996 struct tcmsg *tcmsg;
1998 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
2002 tcmsg->tcm_parent = 0;
2003 nl_dump_start(dump, NETLINK_ROUTE, &request);
2004 ofpbuf_uninit(&request);
/* Invokes 'cb' once per known queue with that queue's id and configuration
 * details.  Uses HMAP_FOR_EACH_SAFE so 'cb' may delete the current queue.
 * NOTE(review): fragmentary listing — error propagation between iterations
 * is partly elided. */
2009 netdev_linux_dump_queues(const struct netdev *netdev_,
2010 netdev_dump_queues_cb *cb, void *aux)
2012 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2013 struct tc_queue *queue, *next_queue;
2014 struct smap details;
2018 error = tc_query_qdisc(netdev_);
2021 } else if (!netdev->tc->ops->class_get) {
2026 smap_init(&details);
2027 HMAP_FOR_EACH_SAFE (queue, next_queue, hmap_node,
2028 &netdev->tc->queues) {
2029 smap_clear(&details);
2031 error = netdev->tc->ops->class_get(netdev_, queue, &details);
2033 (*cb)(queue->queue_id, &details, aux);
2038 smap_destroy(&details);
/* Streams per-class statistics to 'cb' by running a Netlink tc-class dump
 * and feeding each message to the implementation's class_dump_stats hook.
 * The last per-message error is remembered; the dump-completion error from
 * nl_dump_done() takes precedence when nonzero. */
2044 netdev_linux_dump_queue_stats(const struct netdev *netdev_,
2045 netdev_dump_queue_stats_cb *cb, void *aux)
2047 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2048 struct nl_dump dump;
2053 error = tc_query_qdisc(netdev_);
2056 } else if (!netdev->tc->ops->class_dump_stats) {
2061 if (!start_queue_dump(netdev_, &dump)) {
2064 while (nl_dump_next(&dump, &msg)) {
2065 error = netdev->tc->ops->class_dump_stats(netdev_, &msg, cb, aux);
2071 error = nl_dump_done(&dump);
2072 return error ? error : last_error;
/* Returns the device's IPv4 address and netmask, caching them under
 * VALID_IN4 via SIOCGIFADDR/SIOCGIFNETMASK ioctls.  Returns EADDRNOTAVAIL
 * when no address is assigned (INADDR_ANY), 0 otherwise. */
2076 netdev_linux_get_in4(const struct netdev *netdev_,
2077 struct in_addr *address, struct in_addr *netmask)
2079 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2081 if (!(netdev->cache_valid & VALID_IN4)) {
2084 error = netdev_linux_get_ipv4(netdev_, &netdev->address,
2085 SIOCGIFADDR, "SIOCGIFADDR");
2090 error = netdev_linux_get_ipv4(netdev_, &netdev->netmask,
2091 SIOCGIFNETMASK, "SIOCGIFNETMASK");
2096 netdev->cache_valid |= VALID_IN4;
2098 *address = netdev->address;
2099 *netmask = netdev->netmask;
2100 return address->s_addr == INADDR_ANY ? EADDRNOTAVAIL : 0;
/* Assigns 'address'/'netmask' to the device with SIOCSIFADDR, then (only
 * for a non-zero address) SIOCSIFNETMASK, and updates the IN4 cache. */
2104 netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
2105 struct in_addr netmask)
2107 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2110 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
2112 netdev->cache_valid |= VALID_IN4;
2113 netdev->address = address;
2114 netdev->netmask = netmask;
2115 if (address.s_addr != INADDR_ANY) {
2116 error = do_set_addr(netdev_, SIOCSIFNETMASK,
2117 "SIOCSIFNETMASK", netmask);
/* Parses one /proc/net/if_inet6 line: 32 hex digits of IPv6 address (read
 * as 16 bytes via the X8 scanf helper), four fields that are skipped
 * (ifindex, prefix length, scope, flags), and the interface name.
 * (The sscanf call itself and the success test are split across elided
 * lines.) */
2124 parse_if_inet6_line(const char *line,
2125 struct in6_addr *in6, char ifname[16 + 1])
2127 uint8_t *s6 = in6->s6_addr;
2128 #define X8 "%2"SCNx8
2130 " "X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8
2131 "%*x %*x %*x %*x %16s\n",
2132 &s6[0], &s6[1], &s6[2], &s6[3],
2133 &s6[4], &s6[5], &s6[6], &s6[7],
2134 &s6[8], &s6[9], &s6[10], &s6[11],
2135 &s6[12], &s6[13], &s6[14], &s6[15],
2139 /* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address (if
2140 * 'in6' is non-null) and returns true. Otherwise, returns false. */
/* Scans /proc/net/if_inet6 for a line naming this device, caching the
 * result (default in6addr_any) under VALID_IN6.
 * NOTE(review): fragmentary listing — fclose() and the return statement
 * are elided. */
2142 netdev_linux_get_in6(const struct netdev *netdev_, struct in6_addr *in6)
2144 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2145 if (!(netdev->cache_valid & VALID_IN6)) {
2149 netdev->in6 = in6addr_any;
2151 file = fopen("/proc/net/if_inet6", "r");
2153 const char *name = netdev_get_name(netdev_);
2154 while (fgets(line, sizeof line, file)) {
2155 struct in6_addr in6_tmp;
2156 char ifname[16 + 1];
2157 if (parse_if_inet6_line(line, &in6_tmp, ifname)
2158 && !strcmp(name, ifname))
2160 netdev->in6 = in6_tmp;
2166 netdev->cache_valid |= VALID_IN6;
/* Builds an AF_INET sockaddr containing 'addr' and copies it into the
 * generic 'sa' (zeroed first so trailing bytes are deterministic). */
2173 make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
2175 struct sockaddr_in sin;
2176 memset(&sin, 0, sizeof sin);
2177 sin.sin_family = AF_INET;
2178 sin.sin_addr = addr;
2181 memset(sa, 0, sizeof *sa);
2182 memcpy(sa, &sin, sizeof sin);
/* Issues the address-setting ioctl 'ioctl_nr' (named 'ioctl_name' for
 * logging) on 'netdev' with 'addr' packed into an ifreq. */
2186 do_set_addr(struct netdev *netdev,
2187 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
2191 make_in4_sockaddr(&ifr.ifr_addr, addr);
2192 return af_inet_ifreq_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
2196 /* Adds 'router' as a default IP gateway. */
/* Builds a default route (dst/genmask = 0.0.0.0, RTF_UP|RTF_GATEWAY) and
 * installs it with the SIOCADDRT ioctl; failure is logged but the error
 * value is still produced for the (elided) return. */
2198 netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
2200 struct in_addr any = { INADDR_ANY };
2204 memset(&rt, 0, sizeof rt);
2205 make_in4_sockaddr(&rt.rt_dst, any);
2206 make_in4_sockaddr(&rt.rt_gateway, router);
2207 make_in4_sockaddr(&rt.rt_genmask, any);
2208 rt.rt_flags = RTF_UP | RTF_GATEWAY;
2209 error = af_inet_ioctl(SIOCADDRT, &rt);
2211 VLOG_WARN("ioctl(SIOCADDRT): %s", ovs_strerror(error));
/* Consults /proc/net/route to find the route matching '*host': on a match,
 * sets '*next_hop' (0 for a directly-reachable host, else the gateway) and
 * stores a heap-allocated copy of the interface name in '*netdev_name'
 * which the caller owns.  Fields in the proc file are already in network
 * byte order.  NOTE(review): fragmentary listing — fclose() and the
 * no-route return path are elided. */
2217 netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
2220 static const char fn[] = "/proc/net/route";
2225 *netdev_name = NULL;
2226 stream = fopen(fn, "r");
2227 if (stream == NULL) {
2228 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, ovs_strerror(errno));
2233 while (fgets(line, sizeof line, stream)) {
2236 ovs_be32 dest, gateway, mask;
2237 int refcnt, metric, mtu;
2238 unsigned int flags, use, window, irtt;
2241 "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
2243 iface, &dest, &gateway, &flags, &refcnt,
2244 &use, &metric, &mask, &mtu, &window, &irtt) != 11) {
2246 VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
2250 if (!(flags & RTF_UP)) {
2251 /* Skip routes that aren't up. */
2255 /* The output of 'dest', 'mask', and 'gateway' were given in
2256 * network byte order, so we don't need need any endian
2257 * conversions here. */
2258 if ((dest & mask) == (host->s_addr & mask)) {
2260 /* The host is directly reachable. */
2261 next_hop->s_addr = 0;
2263 /* To reach the host, we must go through a gateway. */
2264 next_hop->s_addr = gateway;
2266 *netdev_name = xstrdup(iface);
/* Populates 'smap' with driver name/version and firmware version from a
 * cached ETHTOOL_GDRVINFO query (VALID_DRVINFO guards the ioctl).  The
 * drvinfo buffer is passed to the ethtool helper through an ethtool_cmd
 * pointer cast — TODO confirm the two structs are layout-compatible here;
 * the helper's exact contract is outside this excerpt. */
2278 netdev_linux_get_status(const struct netdev *netdev_, struct smap *smap)
2280 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2283 if (!(netdev->cache_valid & VALID_DRVINFO)) {
2284 struct ethtool_cmd *cmd = (struct ethtool_cmd *) &netdev->drvinfo;
2286 COVERAGE_INC(netdev_get_ethtool);
2287 memset(&netdev->drvinfo, 0, sizeof netdev->drvinfo);
2288 error = netdev_linux_do_ethtool(netdev->up.name,
2291 "ETHTOOL_GDRVINFO");
2293 netdev->cache_valid |= VALID_DRVINFO;
2298 smap_add(smap, "driver_name", netdev->drvinfo.driver);
2299 smap_add(smap, "driver_version", netdev->drvinfo.version);
2300 smap_add(smap, "firmware_version", netdev->drvinfo.fw_version);
/* Status for internal devices is fixed: the driver is openvswitch itself. */
2306 netdev_internal_get_status(const struct netdev *netdev OVS_UNUSED,
2309 smap_add(smap, "driver_name", "openvswitch");
2313 /* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
2314 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
2315 * returns 0. Otherwise, it returns a positive errno value; in particular,
2316 * ENXIO indicates that there is not ARP table entry for 'ip' on 'netdev'. */
/* Implemented with the SIOCGARP ioctl on an AF_INET arpreq scoped to this
 * device; ENXIO (no entry) is expected and not logged. */
2318 netdev_linux_arp_lookup(const struct netdev *netdev,
2319 ovs_be32 ip, uint8_t mac[ETH_ADDR_LEN])
2322 struct sockaddr_in sin;
2325 memset(&r, 0, sizeof r);
2326 memset(&sin, 0, sizeof sin);
2327 sin.sin_family = AF_INET;
2328 sin.sin_addr.s_addr = ip;
2330 memcpy(&r.arp_pa, &sin, sizeof sin);
2331 r.arp_ha.sa_family = ARPHRD_ETHER;
2333 ovs_strzcpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
2334 COVERAGE_INC(netdev_arp_lookup);
2335 retval = af_inet_ioctl(SIOCGARP, &r);
2337 memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
2338 } else if (retval != ENXIO) {
2339 VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
2340 netdev_get_name(netdev), IP_ARGS(ip),
2341 ovs_strerror(retval));
/* Translates NETDEV_* flags to the corresponding kernel IFF_* flags
 * (the IFF_* |= lines sit in elided lines). */
2347 nd_to_iff_flags(enum netdev_flags nd)
2350 if (nd & NETDEV_UP) {
2353 if (nd & NETDEV_PROMISC) {
/* Translates kernel IFF_* flags back to NETDEV_* flags (the IFF_UP case
 * is in elided lines). */
2360 iff_to_nd_flags(int iff)
2362 enum netdev_flags nd = 0;
2366 if (iff & IFF_PROMISC) {
2367 nd |= NETDEV_PROMISC;
/* Atomically-in-intent flag update: reports the old NETDEV_* flags via
 * '*old_flagsp', then applies 'off'/'on' by converting to IFF_* bits and
 * calling set_flags() only when something actually changes, finally
 * refreshing the cached ifi_flags from the kernel. */
2373 netdev_linux_update_flags(struct netdev *netdev_, enum netdev_flags off,
2374 enum netdev_flags on, enum netdev_flags *old_flagsp)
2376 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2377 int old_flags, new_flags;
2380 old_flags = netdev->ifi_flags;
2381 *old_flagsp = iff_to_nd_flags(old_flags);
2382 new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
2383 if (new_flags != old_flags) {
2384 error = set_flags(netdev_get_name(netdev_), new_flags);
2385 get_flags(netdev_, &netdev->ifi_flags);
/* Returns the device's change sequence number, bumped whenever the device
 * changes (used by callers to detect configuration/state changes). */
2391 netdev_linux_change_seq(const struct netdev *netdev)
2393 return netdev_linux_cast(netdev)->change_seq;
/* Template for a struct netdev_class initializer shared by the linux, tap,
 * and internal device classes.  The macro parameters plug in the functions
 * that differ between variants (constructor, stats get/set, features,
 * status); everything else uses the common netdev_linux_* implementations.
 * NOTE(review): fragmentary listing — several initializer lines (e.g. init,
 * run, construct, rx_open slots) are elided, so slot order here is not
 * fully visible. */
2396 #define NETDEV_LINUX_CLASS(NAME, CONSTRUCT, GET_STATS, SET_STATS, \
2397 GET_FEATURES, GET_STATUS) \
2403 netdev_linux_wait, \
2405 netdev_linux_alloc, \
2407 netdev_linux_destruct, \
2408 netdev_linux_dealloc, \
2409 NULL, /* get_config */ \
2410 NULL, /* set_config */ \
2411 NULL, /* get_tunnel_config */ \
2413 netdev_linux_send, \
2414 netdev_linux_send_wait, \
2416 netdev_linux_set_etheraddr, \
2417 netdev_linux_get_etheraddr, \
2418 netdev_linux_get_mtu, \
2419 netdev_linux_set_mtu, \
2420 netdev_linux_get_ifindex, \
2421 netdev_linux_get_carrier, \
2422 netdev_linux_get_carrier_resets, \
2423 netdev_linux_set_miimon_interval, \
2428 netdev_linux_set_advertisements, \
2430 netdev_linux_set_policing, \
2431 netdev_linux_get_qos_types, \
2432 netdev_linux_get_qos_capabilities, \
2433 netdev_linux_get_qos, \
2434 netdev_linux_set_qos, \
2435 netdev_linux_get_queue, \
2436 netdev_linux_set_queue, \
2437 netdev_linux_delete_queue, \
2438 netdev_linux_get_queue_stats, \
2439 netdev_linux_dump_queues, \
2440 netdev_linux_dump_queue_stats, \
2442 netdev_linux_get_in4, \
2443 netdev_linux_set_in4, \
2444 netdev_linux_get_in6, \
2445 netdev_linux_add_router, \
2446 netdev_linux_get_next_hop, \
2448 netdev_linux_arp_lookup, \
2450 netdev_linux_update_flags, \
2452 netdev_linux_change_seq, \
2454 netdev_linux_rx_alloc, \
2455 netdev_linux_rx_construct, \
2456 netdev_linux_rx_destruct, \
2457 netdev_linux_rx_dealloc, \
2458 netdev_linux_rx_recv, \
2459 netdev_linux_rx_wait, \
2460 netdev_linux_rx_drain, \
/* The three concrete device classes built from NETDEV_LINUX_CLASS:
 * - "system" devices: plain Linux netdevs, stats merged vport+system;
 * - "tap" devices: rx/tx stats swapped to the external perspective;
 * - "internal" devices: vport-only stats, settable stats, no features. */
2463 const struct netdev_class netdev_linux_class =
2466 netdev_linux_construct,
2467 netdev_linux_get_stats,
2468 NULL, /* set_stats */
2469 netdev_linux_get_features,
2470 netdev_linux_get_status);
2472 const struct netdev_class netdev_tap_class =
2475 netdev_linux_construct_tap,
2476 netdev_tap_get_stats,
2477 NULL, /* set_stats */
2478 netdev_linux_get_features,
2479 netdev_linux_get_status);
2481 const struct netdev_class netdev_internal_class =
2484 netdev_linux_construct,
2485 netdev_internal_get_stats,
2486 netdev_internal_set_stats,
2487 NULL, /* get_features */
2488 netdev_internal_get_status);
2490 /* HTB traffic control class. */
2492 #define HTB_N_QUEUES 0xf000
/* Per-qdisc HTB state (embeds the generic tc) and per-class HTB state
 * (embeds tc_queue); struct/field opening lines are elided in this
 * listing. */
2496 unsigned int max_rate; /* In bytes/s. */
2500 struct tc_queue tc_queue;
2501 unsigned int min_rate; /* In bytes/s. */
2502 unsigned int max_rate; /* In bytes/s. */
2503 unsigned int burst; /* In bytes. */
2504 unsigned int priority; /* Lower values are higher priorities. */
/* Downcasts the device's generic tc state to its enclosing struct htb. */
2508 htb_get__(const struct netdev *netdev_)
2510 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2511 return CONTAINER_OF(netdev->tc, struct htb, tc);
/* Allocates and attaches a struct htb with 'max_rate' as the device's tc
 * state (does not itself talk to the kernel). */
2515 htb_install__(struct netdev *netdev_, uint64_t max_rate)
2517 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2520 htb = xmalloc(sizeof *htb);
2521 tc_init(&htb->tc, &tc_ops_htb);
2522 htb->max_rate = max_rate;
2524 netdev->tc = &htb->tc;
2527 /* Create an HTB qdisc.
2529 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
/* Deletes any existing root qdisc first, then sends RTM_NEWQDISC with an
 * HTB TCA_OPTIONS payload (rate2quantum=10; the defcls assignment appears
 * to be in an elided line). */
2531 htb_setup_qdisc__(struct netdev *netdev)
2534 struct tc_htb_glob opt;
2535 struct ofpbuf request;
2536 struct tcmsg *tcmsg;
2538 tc_del_qdisc(netdev);
2540 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2541 NLM_F_EXCL | NLM_F_CREATE, &request);
2545 tcmsg->tcm_handle = tc_make_handle(1, 0);
2546 tcmsg->tcm_parent = TC_H_ROOT;
2548 nl_msg_put_string(&request, TCA_KIND, "htb");
2550 memset(&opt, 0, sizeof opt);
2551 opt.rate2quantum = 10;
2555 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2556 nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
2557 nl_msg_end_nested(&request, opt_offset);
2559 return tc_transact(&request, NULL);
2562 /* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
2563 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
/* Builds tc_htb_opt rate/ceil tables sized from the device MTU (required:
 * a missing MTU aborts with a warning), then sends RTM_NEWTCLASS with
 * TCA_HTB_PARMS plus the RTAB/CTAB rate tables.  Transaction failures are
 * logged with the full class parameters for diagnosis. */
2565 htb_setup_class__(struct netdev *netdev, unsigned int handle,
2566 unsigned int parent, struct htb_class *class)
2569 struct tc_htb_opt opt;
2570 struct ofpbuf request;
2571 struct tcmsg *tcmsg;
2575 error = netdev_get_mtu(netdev, &mtu);
2577 VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU",
2578 netdev_get_name(netdev));
2582 memset(&opt, 0, sizeof opt);
2583 tc_fill_rate(&opt.rate, class->min_rate, mtu);
2584 tc_fill_rate(&opt.ceil, class->max_rate, mtu);
2585 opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
2586 opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
2587 opt.prio = class->priority;
2589 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
2593 tcmsg->tcm_handle = handle;
2594 tcmsg->tcm_parent = parent;
2596 nl_msg_put_string(&request, TCA_KIND, "htb");
2597 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2598 nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
2599 tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
2600 tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
2601 nl_msg_end_nested(&request, opt_offset);
2603 error = tc_transact(&request, NULL);
2605 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
2606 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
2607 netdev_get_name(netdev),
2608 tc_get_major(handle), tc_get_minor(handle),
2609 tc_get_major(parent), tc_get_minor(parent),
2610 class->min_rate, class->max_rate,
2611 class->burst, class->priority, ovs_strerror(error));
/* Parses Netlink attributes in 'nl_options' for HTB parameters and stores a
 * description of them into 'class'.  The description complies with the
 * specification given in the vswitch database documentation for linux-htb.
 * Returns nonzero (review: exact value not visible here) on parse failure. */
htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
    static const struct nl_policy tca_htb_policy[] = {
        [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
                            .min_len = sizeof(struct tc_htb_opt) },

    struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
    const struct tc_htb_opt *htb;

    if (!nl_parse_nested(nl_options, tca_htb_policy,
                         attrs, ARRAY_SIZE(tca_htb_policy))) {
        VLOG_WARN_RL(&rl, "failed to parse HTB class options");

    /* Translate kernel tc_htb_opt back into our htb_class fields. */
    htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
    class->min_rate = htb->rate.rate;
    class->max_rate = htb->ceil.rate;
    /* 'buffer' is in ticks; convert back to a byte-sized burst. */
    class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
    class->priority = htb->prio;
/* Parses a tc class reply in 'tcmsg'.  On success stores the OVS queue ID
 * (tc minor number minus 1) into '*queue_id', the HTB parameters into
 * '*options', and statistics into '*stats'; any output argument may be null. */
htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
                  struct htb_class *options,
                  struct netdev_queue_stats *stats)
    struct nlattr *nl_options;
    unsigned int handle;

    error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
    if (!error && queue_id) {
        unsigned int major = tc_get_major(handle);
        unsigned int minor = tc_get_minor(handle);
        /* Only classes under "1:" with minors in [1, HTB_N_QUEUES] map to
         * OVS queues; queue IDs are 0-based. */
        if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
            *queue_id = minor - 1;

    if (!error && options) {
        error = htb_parse_tca_options__(nl_options, options);
2671 htb_parse_qdisc_details__(struct netdev *netdev,
2672 const struct smap *details, struct htb_class *hc)
2674 const char *max_rate_s;
2676 max_rate_s = smap_get(details, "max-rate");
2677 hc->max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
2678 if (!hc->max_rate) {
2679 enum netdev_features current;
2681 netdev_get_features(netdev, ¤t, NULL, NULL, NULL);
2682 hc->max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
2684 hc->min_rate = hc->max_rate;
/* Parses per-queue HTB settings from 'details' into 'hc', clamping each value
 * to a sane range.  All rate keys are in bits/s in the database and converted
 * to bytes/s here; requires the device to have an MTU. */
htb_parse_class_details__(struct netdev *netdev,
                          const struct smap *details, struct htb_class *hc)
    const struct htb *htb = htb_get__(netdev);
    const char *min_rate_s = smap_get(details, "min-rate");
    const char *max_rate_s = smap_get(details, "max-rate");
    const char *burst_s = smap_get(details, "burst");
    const char *priority_s = smap_get(details, "priority");

    error = netdev_get_mtu(netdev, &mtu);
        VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU",
                     netdev_get_name(netdev));

    /* HTB requires at least an mtu sized min-rate to send any traffic even
     * on uncongested links. */
    hc->min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
    hc->min_rate = MAX(hc->min_rate, mtu);
    hc->min_rate = MIN(hc->min_rate, htb->max_rate);

    /* max-rate defaults to the qdisc-wide maximum and is clamped to the
     * [min_rate, qdisc max_rate] range. */
    hc->max_rate = (max_rate_s
                    ? strtoull(max_rate_s, NULL, 10) / 8
    hc->max_rate = MAX(hc->max_rate, hc->min_rate);
    hc->max_rate = MIN(hc->max_rate, htb->max_rate);

    /* burst
     *
     * According to hints in the documentation that I've read, it is important
     * that 'burst' be at least as big as the largest frame that might be
     * transmitted.  Also, making 'burst' a bit bigger than necessary is OK,
     * but having it a bit too small is a problem.  Since netdev_get_mtu()
     * doesn't include the Ethernet header, we need to add at least 14 (18?) to
     * the MTU.  We actually add 64, instead of 14, as a guard against
     * additional headers getting tacked on somewhere that we're not aware of. */
    hc->burst = burst_s ? strtoull(burst_s, NULL, 10) / 8 : 0;
    hc->burst = MAX(hc->burst, mtu + 64);

    /* priority: 0 (highest) by default. */
    hc->priority = priority_s ? strtoul(priority_s, NULL, 10) : 0;
/* Queries the kernel for the HTB class 'handle' under 'parent' on 'netdev'
 * and parses the reply into '*options' and '*stats' (either may be null).
 * Returns 0 on success, a positive errno value otherwise. */
htb_query_class__(const struct netdev *netdev, unsigned int handle,
                  unsigned int parent, struct htb_class *options,
                  struct netdev_queue_stats *stats)
    struct ofpbuf *reply;

    error = tc_query_class(netdev, handle, parent, &reply);
        error = htb_parse_tcmsg__(reply, NULL, options, stats);
        ofpbuf_delete(reply);
/* tc_ops "tc_install" for linux-htb: creates the root HTB qdisc, then the
 * default class 1:fffe sized from 'details', and finally records the new
 * qdisc in the netdev's local state. */
htb_tc_install(struct netdev *netdev, const struct smap *details)
    error = htb_setup_qdisc__(netdev);
        struct htb_class hc;

        htb_parse_qdisc_details__(netdev, details, &hc);
        /* 1:fffe is the default class that unclassified traffic falls into. */
        error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
                                  tc_make_handle(1, 0), &hc);
            htb_install__(netdev, hc.max_rate);
/* Converts a generic 'tc_queue' back to its containing 'htb_class'. */
static struct htb_class *
htb_class_cast__(const struct tc_queue *queue)
    return CONTAINER_OF(queue, struct htb_class, tc_queue);
/* Creates or updates the local record for queue 'queue_id' on 'netdev' with
 * the parameters in 'hc'.  Local bookkeeping only; no kernel interaction. */
htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
                   const struct htb_class *hc)
    struct htb *htb = htb_get__(netdev);
    size_t hash = hash_int(queue_id, 0);
    struct tc_queue *queue;
    struct htb_class *hcp;

    queue = tc_find_queue__(netdev, queue_id, hash);
        /* Existing queue: update it in place. */
        hcp = htb_class_cast__(queue);
        /* New queue: allocate and insert into the qdisc's queue map. */
        hcp = xmalloc(sizeof *hcp);
        queue = &hcp->tc_queue;
        queue->queue_id = queue_id;
        queue->created = time_msec();
        hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);

    hcp->min_rate = hc->min_rate;
    hcp->max_rate = hc->max_rate;
    hcp->burst = hc->burst;
    hcp->priority = hc->priority;
/* tc_ops "tc_load" for linux-htb: reconstructs local state from an HTB qdisc
 * already installed in the kernel, by querying the default class for the
 * qdisc-wide max rate and then dumping all classes. */
htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
    struct nl_dump dump;
    struct htb_class hc;

    /* Get qdisc options. */
    htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
    htb_install__(netdev, hc.max_rate);

    /* Walk every class the kernel reports and mirror it locally. */
    if (!start_queue_dump(netdev, &dump)) {
    while (nl_dump_next(&dump, &msg)) {
        unsigned int queue_id;

        if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
            htb_update_queue__(netdev, queue_id, &hc);
    nl_dump_done(&dump);
/* tc_ops "tc_destroy" for linux-htb: frees all locally-recorded queues and
 * the HTB state itself (kernel qdisc removal is handled elsewhere). */
htb_tc_destroy(struct tc *tc)
    struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
    struct htb_class *hc, *next;

    HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &htb->tc.queues) {
        hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
/* tc_ops "qdisc_get" for linux-htb: reports "max-rate" in bits/s (stored
 * internally in bytes/s, hence the * 8). */
htb_qdisc_get(const struct netdev *netdev, struct smap *details)
    const struct htb *htb = htb_get__(netdev);
    smap_add_format(details, "max-rate", "%llu", 8ULL * htb->max_rate);
/* tc_ops "qdisc_set" for linux-htb: re-applies qdisc-wide settings by
 * replacing the default class 1:fffe, then updates the cached max rate. */
htb_qdisc_set(struct netdev *netdev, const struct smap *details)
    struct htb_class hc;

    htb_parse_qdisc_details__(netdev, details, &hc);
    error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
                              tc_make_handle(1, 0), &hc);
        htb_get__(netdev)->max_rate = hc.max_rate;
/* tc_ops "class_get" for linux-htb: reports the queue's settings in bits/s
 * (and bits for burst).  "max-rate" is omitted when equal to "min-rate". */
htb_class_get(const struct netdev *netdev OVS_UNUSED,
              const struct tc_queue *queue, struct smap *details)
    const struct htb_class *hc = htb_class_cast__(queue);

    smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
    if (hc->min_rate != hc->max_rate) {
        smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
    smap_add_format(details, "burst", "%llu", 8ULL * hc->burst);
    smap_add_format(details, "priority", "%u", hc->priority);
/* tc_ops "class_set" for linux-htb: parses 'details', installs class
 * 1:(queue_id+1) under the default class 1:fffe in the kernel, and mirrors
 * the result in local state. */
htb_class_set(struct netdev *netdev, unsigned int queue_id,
              const struct smap *details)
    struct htb_class hc;

    error = htb_parse_class_details__(netdev, details, &hc);

    /* OVS queue N maps to tc class 1:(N+1). */
    error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
                              tc_make_handle(1, 0xfffe), &hc);

    htb_update_queue__(netdev, queue_id, &hc);
/* tc_ops "class_delete" for linux-htb: removes the kernel class for 'queue'
 * and, on success, drops the local record. */
htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
    struct htb_class *hc = htb_class_cast__(queue);
    struct htb *htb = htb_get__(netdev);

    error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
        hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
/* tc_ops "class_get_stats" for linux-htb: fetches kernel statistics for the
 * tc class corresponding to 'queue'. */
htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
                    struct netdev_queue_stats *stats)
    return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
                             tc_make_handle(1, 0xfffe), NULL, stats);
/* tc_ops "class_dump_stats" for linux-htb: parses one class message from a
 * dump and, if it maps to an OVS queue, invokes 'cb' with its stats. */
htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
                     const struct ofpbuf *nlmsg,
                     netdev_dump_queue_stats_cb *cb, void *aux)
    struct netdev_queue_stats stats;
    unsigned int handle, major, minor;

    error = tc_parse_class(nlmsg, &handle, NULL, &stats);

    major = tc_get_major(handle);
    minor = tc_get_minor(handle);
    /* Skip classes (e.g. the default 1:fffe lies outside... review: 0xfffe
     * is within HTB_N_QUEUES=0xf000? no — 0xfffe > 0xf000, so it is skipped)
     * that do not correspond to OVS queues. */
    if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
        (*cb)(minor - 1, &stats, aux);
/* tc_ops vtable for the HTB ("linux-htb") qdisc.  NOTE(review): several
 * member entries of this initializer are missing from this extract. */
static const struct tc_ops tc_ops_htb = {
    "htb",                      /* linux_name */
    "linux-htb",                /* ovs_name */
    HTB_N_QUEUES,               /* n_queues */
    htb_class_get_stats,
    htb_class_dump_stats
/* "linux-hfsc" traffic control class. */

/* Maximum number of queues supported by the linux-hfsc qdisc. */
#define HFSC_N_QUEUES 0xf000

/* NOTE(review): struct headers missing from this extract; this is the
 * embedded per-queue handle of 'struct hfsc_class'. */
    struct tc_queue tc_queue;
/* Returns the 'struct hfsc' embedded in 'netdev_''s current tc state.  Only
 * valid when the installed qdisc is HFSC. */
static struct hfsc *
hfsc_get__(const struct netdev *netdev_)
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    return CONTAINER_OF(netdev->tc, struct hfsc, tc);
/* Converts a generic 'tc_queue' back to its containing 'hfsc_class'. */
static struct hfsc_class *
hfsc_class_cast__(const struct tc_queue *queue)
    return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
/* Records in 'netdev_''s local state that its qdisc is HFSC with qdisc-wide
 * maximum rate 'max_rate' (bytes/s).  Does not talk to the kernel. */
hfsc_install__(struct netdev *netdev_, uint32_t max_rate)
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);

    hfsc = xmalloc(sizeof *hfsc);
    tc_init(&hfsc->tc, &tc_ops_hfsc);
    hfsc->max_rate = max_rate;
    netdev->tc = &hfsc->tc;
/* Creates or updates the local record for queue 'queue_id' on 'netdev' with
 * the HFSC parameters in 'hc'.  Local bookkeeping only. */
hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
                    const struct hfsc_class *hc)
    struct hfsc_class *hcp;
    struct tc_queue *queue;

    hfsc = hfsc_get__(netdev);
    hash = hash_int(queue_id, 0);

    queue = tc_find_queue__(netdev, queue_id, hash);
        /* Existing queue: update in place. */
        hcp = hfsc_class_cast__(queue);
        /* New queue: allocate and insert. */
        hcp = xmalloc(sizeof *hcp);
        queue = &hcp->tc_queue;
        queue->queue_id = queue_id;
        queue->created = time_msec();
        hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);

    hcp->min_rate = hc->min_rate;
    hcp->max_rate = hc->max_rate;
/* Parses HFSC class options from 'nl_options' into 'class'.  Only linear
 * service curves with identical real-time/link-share curves are supported;
 * anything else is reported as a parse failure. */
hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
    const struct tc_service_curve *rsc, *fsc, *usc;
    static const struct nl_policy tca_hfsc_policy[] = {
            .type = NL_A_UNSPEC,
            .min_len = sizeof(struct tc_service_curve),
            .type = NL_A_UNSPEC,
            .min_len = sizeof(struct tc_service_curve),
            .type = NL_A_UNSPEC,
            .min_len = sizeof(struct tc_service_curve),
    struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];

    if (!nl_parse_nested(nl_options, tca_hfsc_policy,
                         attrs, ARRAY_SIZE(tca_hfsc_policy))) {
        VLOG_WARN_RL(&rl, "failed to parse HFSC class options");

    rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
    fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
    usc = nl_attr_get(attrs[TCA_HFSC_USC]);

    /* m1/d nonzero would mean a two-slope (non-linear) curve. */
    if (rsc->m1 != 0 || rsc->d != 0 ||
        fsc->m1 != 0 || fsc->d != 0 ||
        usc->m1 != 0 || usc->d != 0) {
        VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
                     "Non-linear service curves are not supported.");

    /* We always install RSC == FSC; a mismatch means someone else set it. */
    if (rsc->m2 != fsc->m2) {
        VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
                     "Real-time service curves are not supported ");

    if (rsc->m2 > usc->m2) {
        VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
                     "Min-rate service curve is greater than "
                     "the max-rate service curve.");

    /* Linear curves: m2 is the steady-state rate in bytes/s. */
    class->min_rate = fsc->m2;
    class->max_rate = usc->m2;
/* Parses a tc class reply in 'tcmsg'.  On success stores the OVS queue ID
 * into '*queue_id', HFSC parameters into '*options', and statistics into
 * '*stats'; any output argument may be null. */
hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
                   struct hfsc_class *options,
                   struct netdev_queue_stats *stats)
    unsigned int handle;
    struct nlattr *nl_options;

    error = tc_parse_class(tcmsg, &handle, &nl_options, stats);

        unsigned int major, minor;

        major = tc_get_major(handle);
        minor = tc_get_minor(handle);
        /* Only classes under "1:" with valid minors map to OVS queues. */
        if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
            *queue_id = minor - 1;

        error = hfsc_parse_tca_options__(nl_options, options);
/* Queries the kernel for the HFSC class 'handle' under 'parent' on 'netdev'
 * and parses the reply into '*options' and '*stats' (either may be null).
 * Returns 0 on success, a positive errno value otherwise. */
hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
                   unsigned int parent, struct hfsc_class *options,
                   struct netdev_queue_stats *stats)
    struct ofpbuf *reply;

    error = tc_query_class(netdev, handle, parent, &reply);

    error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
    ofpbuf_delete(reply);
3148 hfsc_parse_qdisc_details__(struct netdev *netdev, const struct smap *details,
3149 struct hfsc_class *class)
3152 const char *max_rate_s;
3154 max_rate_s = smap_get(details, "max-rate");
3155 max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
3158 enum netdev_features current;
3160 netdev_get_features(netdev, ¤t, NULL, NULL, NULL);
3161 max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
3164 class->min_rate = max_rate;
3165 class->max_rate = max_rate;
/* Parses per-queue HFSC settings from 'details' into 'class', clamping
 * min-rate to at least 1 byte/s and both rates to the qdisc-wide maximum.
 * Database rates are in bits/s and converted to bytes/s here. */
hfsc_parse_class_details__(struct netdev *netdev,
                           const struct smap *details,
                           struct hfsc_class * class)
    const struct hfsc *hfsc;
    uint32_t min_rate, max_rate;
    const char *min_rate_s, *max_rate_s;

    hfsc = hfsc_get__(netdev);
    min_rate_s = smap_get(details, "min-rate");
    max_rate_s = smap_get(details, "max-rate");

    min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
    min_rate = MAX(min_rate, 1);
    min_rate = MIN(min_rate, hfsc->max_rate);

    /* max-rate defaults to the qdisc-wide maximum. */
    max_rate = (max_rate_s
                ? strtoull(max_rate_s, NULL, 10) / 8
    max_rate = MAX(max_rate, min_rate);
    max_rate = MIN(max_rate, hfsc->max_rate);

    class->min_rate = min_rate;
    class->max_rate = max_rate;
/* Create an HFSC qdisc.
 *
 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
hfsc_setup_qdisc__(struct netdev * netdev)
    struct tcmsg *tcmsg;
    struct ofpbuf request;
    struct tc_hfsc_qopt opt;

    /* Remove any existing root qdisc before installing a new one. */
    tc_del_qdisc(netdev);

    tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
                            NLM_F_EXCL | NLM_F_CREATE, &request);

    tcmsg->tcm_handle = tc_make_handle(1, 0);   /* "handle 1:" */
    tcmsg->tcm_parent = TC_H_ROOT;              /* root qdisc */

    memset(&opt, 0, sizeof opt);

    nl_msg_put_string(&request, TCA_KIND, "hfsc");
    nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);

    return tc_transact(&request, NULL);
/* Create an HFSC class.
 *
 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
 * sc rate <min_rate> ul rate <max_rate>" */
hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
                   unsigned int parent, struct hfsc_class *class)
    struct tcmsg *tcmsg;
    struct ofpbuf request;
    struct tc_service_curve min, max;

    tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);

    tcmsg->tcm_handle = handle;
    tcmsg->tcm_parent = parent;

    /* Linear service curves: only the steady-state slope m2 is set. */
    min.m2 = class->min_rate;
    max.m2 = class->max_rate;

    nl_msg_put_string(&request, TCA_KIND, "hfsc");
    opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
    /* RSC and FSC both carry the min-rate curve; USC carries the ceiling. */
    nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
    nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
    nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
    nl_msg_end_nested(&request, opt_offset);

    error = tc_transact(&request, NULL);
        VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
                     "min-rate %ubps, max-rate %ubps (%s)",
                     netdev_get_name(netdev),
                     tc_get_major(handle), tc_get_minor(handle),
                     tc_get_major(parent), tc_get_minor(parent),
                     class->min_rate, class->max_rate, ovs_strerror(error));
/* tc_ops "tc_install" for linux-hfsc: creates the root HFSC qdisc, the
 * default class 1:fffe sized from 'details', and records the local state. */
hfsc_tc_install(struct netdev *netdev, const struct smap *details)
    struct hfsc_class class;

    error = hfsc_setup_qdisc__(netdev);

    hfsc_parse_qdisc_details__(netdev, details, &class);
    error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
                               tc_make_handle(1, 0), &class);

    hfsc_install__(netdev, class.max_rate);
/* tc_ops "tc_load" for linux-hfsc: reconstructs local state from an HFSC
 * qdisc already in the kernel by querying the default class, then dumping
 * and mirroring every class. */
hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
    struct nl_dump dump;
    struct hfsc_class hc;

    hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
    hfsc_install__(netdev, hc.max_rate);

    if (!start_queue_dump(netdev, &dump)) {
    while (nl_dump_next(&dump, &msg)) {
        unsigned int queue_id;

        if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
            hfsc_update_queue__(netdev, queue_id, &hc);
    nl_dump_done(&dump);
/* tc_ops "tc_destroy" for linux-hfsc: frees all locally-recorded queues and
 * the HFSC state itself. */
hfsc_tc_destroy(struct tc *tc)
    struct hfsc_class *hc, *next;

    hfsc = CONTAINER_OF(tc, struct hfsc, tc);

    HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
        hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
/* tc_ops "qdisc_get" for linux-hfsc: reports "max-rate" in bits/s. */
hfsc_qdisc_get(const struct netdev *netdev, struct smap *details)
    const struct hfsc *hfsc;
    hfsc = hfsc_get__(netdev);
    smap_add_format(details, "max-rate", "%llu", 8ULL * hfsc->max_rate);
/* tc_ops "qdisc_set" for linux-hfsc: re-applies qdisc-wide settings via the
 * default class 1:fffe, then updates the cached max rate. */
hfsc_qdisc_set(struct netdev *netdev, const struct smap *details)
    struct hfsc_class class;

    hfsc_parse_qdisc_details__(netdev, details, &class);
    error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
                               tc_make_handle(1, 0), &class);

    hfsc_get__(netdev)->max_rate = class.max_rate;
/* tc_ops "class_get" for linux-hfsc: reports queue rates in bits/s;
 * "max-rate" is omitted when equal to "min-rate". */
hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
               const struct tc_queue *queue, struct smap *details)
    const struct hfsc_class *hc;

    hc = hfsc_class_cast__(queue);
    smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
    if (hc->min_rate != hc->max_rate) {
        smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
/* tc_ops "class_set" for linux-hfsc: parses 'details', installs class
 * 1:(queue_id+1) under 1:fffe in the kernel, and mirrors it locally. */
hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
               const struct smap *details)
    struct hfsc_class class;

    error = hfsc_parse_class_details__(netdev, details, &class);

    /* OVS queue N maps to tc class 1:(N+1). */
    error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
                               tc_make_handle(1, 0xfffe), &class);

    hfsc_update_queue__(netdev, queue_id, &class);
/* tc_ops "class_delete" for linux-hfsc: removes the kernel class for 'queue'
 * and, on success, drops the local record. */
hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
    struct hfsc_class *hc;

    hc = hfsc_class_cast__(queue);
    hfsc = hfsc_get__(netdev);

    error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
        hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
/* tc_ops "class_get_stats" for linux-hfsc: fetches kernel statistics for the
 * tc class corresponding to 'queue'. */
hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
                     struct netdev_queue_stats *stats)
    return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
                              tc_make_handle(1, 0xfffe), NULL, stats);
/* tc_ops "class_dump_stats" for linux-hfsc: parses one class message from a
 * dump and, if it maps to an OVS queue, invokes 'cb' with its stats. */
hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
                      const struct ofpbuf *nlmsg,
                      netdev_dump_queue_stats_cb *cb, void *aux)
    struct netdev_queue_stats stats;
    unsigned int handle, major, minor;

    error = tc_parse_class(nlmsg, &handle, NULL, &stats);

    major = tc_get_major(handle);
    minor = tc_get_minor(handle);
    if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
        (*cb)(minor - 1, &stats, aux);
/* tc_ops vtable for the HFSC ("linux-hfsc") qdisc. */
static const struct tc_ops tc_ops_hfsc = {
    "hfsc",                     /* linux_name */
    "linux-hfsc",               /* ovs_name */
    HFSC_N_QUEUES,              /* n_queues */
    hfsc_tc_install,            /* tc_install */
    hfsc_tc_load,               /* tc_load */
    hfsc_tc_destroy,            /* tc_destroy */
    hfsc_qdisc_get,             /* qdisc_get */
    hfsc_qdisc_set,             /* qdisc_set */
    hfsc_class_get,             /* class_get */
    hfsc_class_set,             /* class_set */
    hfsc_class_delete,          /* class_delete */
    hfsc_class_get_stats,       /* class_get_stats */
    hfsc_class_dump_stats       /* class_dump_stats */
3473 /* "linux-default" traffic control class.
3475 * This class represents the default, unnamed Linux qdisc. It corresponds to
3476 * the "" (empty string) QoS type in the OVS database. */
/* Records in 'netdev_' that its qdisc is the kernel default (no explicit
 * OVS-managed qdisc).  Shares a single static, immutable tc object. */
default_install__(struct netdev *netdev_)
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_default);

    /* Nothing but a tc class implementation is allowed to write to a tc.  This
     * class never does that, so we can legitimately use a const tc object. */
    netdev->tc = CONST_CAST(struct tc *, &tc);
/* tc_ops "tc_install" for the default class: no kernel work needed. */
default_tc_install(struct netdev *netdev,
                   const struct smap *details OVS_UNUSED)
    default_install__(netdev);
/* tc_ops "tc_load" for the default class: just records the local state. */
default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
    default_install__(netdev);
/* tc_ops vtable for the default (empty-string) QoS type.  NOTE(review): a
 * few member entries are missing from this extract. */
static const struct tc_ops tc_ops_default = {
    NULL,                       /* linux_name */
    NULL,                       /* tc_destroy */
    NULL,                       /* qdisc_get */
    NULL,                       /* qdisc_set */
    NULL,                       /* class_get */
    NULL,                       /* class_set */
    NULL,                       /* class_delete */
    NULL,                       /* class_get_stats */
    NULL                        /* class_dump_stats */
3520 /* "linux-other" traffic control class.
/* tc_ops "tc_load" for "linux-other": records that some qdisc OVS does not
 * manage is installed, using a shared immutable tc object. */
other_tc_load(struct netdev *netdev_, struct ofpbuf *nlmsg OVS_UNUSED)
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_other);

    /* Nothing but a tc class implementation is allowed to write to a tc.  This
     * class never does that, so we can legitimately use a const tc object. */
    netdev->tc = CONST_CAST(struct tc *, &tc);
/* tc_ops vtable for unrecognized qdiscs ("linux-other").  NOTE(review): a
 * few member entries are missing from this extract. */
static const struct tc_ops tc_ops_other = {
    NULL,                       /* linux_name */
    "linux-other",              /* ovs_name */
    NULL,                       /* tc_install */
    NULL,                       /* tc_destroy */
    NULL,                       /* qdisc_get */
    NULL,                       /* qdisc_set */
    NULL,                       /* class_get */
    NULL,                       /* class_set */
    NULL,                       /* class_delete */
    NULL,                       /* class_get_stats */
    NULL                        /* class_dump_stats */
3552 /* Traffic control. */
/* Number of kernel "tc" ticks per second.  Initialized from
 * /proc/net/psched (see the psched-reading code below). */
static double ticks_per_s;

/* Number of kernel "jiffies" per second.  This is used for the purpose of
 * computing buffer sizes.  Generally kernel qdiscs need to be able to buffer
 * one jiffy's worth of data.
 *
 * There are two possibilities here:
 *
 *    - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
 *      approximate range of 100 to 1024.  That means that we really need to
 *      make sure that the qdisc can buffer that much data.
 *
 *    - 'buffer_hz' is an absurdly large number.  That means that the kernel
 *      has finely granular timers and there's no need to fudge additional room
 *      for buffers.  (There's no extra effort needed to implement that: the
 *      large 'buffer_hz' is used as a divisor, so practically any number will
 *      come out as 0 in the division.  Small integer results in the case of
 *      really high dividends won't have any real effect anyhow.)
 */
static unsigned int buffer_hz;
/* Returns tc handle 'major':'minor'.
 *
 * Fix: restores the return type and braces, which were missing from this
 * extract, making the function syntactically valid again.  'major' occupies
 * the upper 16 bits of the handle, 'minor' the lower 16 bits. */
static unsigned int
tc_make_handle(unsigned int major, unsigned int minor)
{
    return TC_H_MAKE(major << 16, minor);
}
/* Returns the major number from 'handle' (the upper 16 bits).
 *
 * Fix: restores the return type and braces, which were missing from this
 * extract, making the function syntactically valid again. */
static unsigned int
tc_get_major(unsigned int handle)
{
    return TC_H_MAJ(handle) >> 16;
}
/* Returns the minor number from 'handle' (the lower 16 bits).
 *
 * Fix: restores the return type and braces, which were missing from this
 * extract, making the function syntactically valid again. */
static unsigned int
tc_get_minor(unsigned int handle)
{
    return TC_H_MIN(handle);
}
/* Composes an RTM_* 'type' Netlink request for 'netdev' into '*request' with
 * the given extra nlmsg 'flags' and returns its tcmsg header, partially
 * filled in, for the caller to complete.  NOTE(review): error handling
 * around get_ifindex() is missing from this extract. */
static struct tcmsg *
tc_make_request(const struct netdev *netdev, int type, unsigned int flags,
                struct ofpbuf *request)
    struct tcmsg *tcmsg;

    error = get_ifindex(netdev, &ifindex);

    ofpbuf_init(request, 512);
    nl_msg_put_nlmsghdr(request, sizeof *tcmsg, type, NLM_F_REQUEST | flags);
    tcmsg = ofpbuf_put_zeros(request, sizeof *tcmsg);
    tcmsg->tcm_family = AF_UNSPEC;
    tcmsg->tcm_ifindex = ifindex;
    /* Caller should fill in tcmsg->tcm_handle. */
    /* Caller should fill in tcmsg->tcm_parent. */
/* Sends 'request' on the rtnetlink socket and, if 'replyp' is nonnull, stores
 * the reply there.  Always uninitializes 'request' (the caller's buffer is
 * consumed either way).  Returns 0 on success, a positive errno otherwise. */
tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
    int error = nl_transact(NETLINK_ROUTE, request, replyp);
    ofpbuf_uninit(request);
/* Adds or deletes a root ingress qdisc on 'netdev'.  We use this for
 * policing configuration.
 *
 * This function is equivalent to running the following when 'add' is true:
 *     /sbin/tc qdisc add dev <devname> handle ffff: ingress
 *
 * This function is equivalent to running the following when 'add' is false:
 *     /sbin/tc qdisc del dev <devname> handle ffff: ingress
 *
 * The configuration and stats may be seen with the following command:
 *     /sbin/tc -s qdisc show dev <devname>
 *
 * Returns 0 if successful, otherwise a positive errno value.
 */
tc_add_del_ingress_qdisc(struct netdev *netdev, bool add)
    struct ofpbuf request;
    struct tcmsg *tcmsg;
    int type = add ? RTM_NEWQDISC : RTM_DELQDISC;
    int flags = add ? NLM_F_EXCL | NLM_F_CREATE : 0;

    tcmsg = tc_make_request(netdev, type, flags, &request);

    tcmsg->tcm_handle = tc_make_handle(0xffff, 0);
    tcmsg->tcm_parent = TC_H_INGRESS;
    nl_msg_put_string(&request, TCA_KIND, "ingress");
    nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0);

    error = tc_transact(&request, NULL);
        /* If we're deleting the qdisc, don't worry about some of the
         * error conditions. */
        if (!add && (error == ENOENT || error == EINVAL)) {
/* Adds a policer to 'netdev' with a rate of 'kbits_rate' and a burst size
 * of 'kbits_burst'.
 *
 * This function is equivalent to running:
 *     /sbin/tc filter add dev <devname> parent ffff: protocol all prio 49
 *              basic police rate <kbits_rate>kbit burst <kbits_burst>k
 *
 * The configuration and stats may be seen with the following command:
 *     /sbin/tc -s filter show <devname> eth0 parent ffff:
 *
 * Returns 0 if successful, otherwise a positive errno value.
 */
tc_add_policer(struct netdev *netdev, int kbits_rate, int kbits_burst)
    struct tc_police tc_police;
    struct ofpbuf request;
    struct tcmsg *tcmsg;
    size_t basic_offset;
    size_t police_offset;

    memset(&tc_police, 0, sizeof tc_police);
    tc_police.action = TC_POLICE_SHOT;  /* drop packets over the rate */
    tc_police.mtu = mtu;
    /* kbits -> bytes/s for the rate table. */
    tc_fill_rate(&tc_police.rate, (kbits_rate * 1000)/8, mtu);
    tc_police.burst = tc_bytes_to_ticks(tc_police.rate.rate,
                                        kbits_burst * 1024);

    tcmsg = tc_make_request(netdev, RTM_NEWTFILTER,
                            NLM_F_EXCL | NLM_F_CREATE, &request);

    /* Attach under the ffff: ingress qdisc, priority 49, all protocols. */
    tcmsg->tcm_parent = tc_make_handle(0xffff, 0);
    tcmsg->tcm_info = tc_make_handle(49,
                                     (OVS_FORCE uint16_t) htons(ETH_P_ALL));

    nl_msg_put_string(&request, TCA_KIND, "basic");
    basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
    police_offset = nl_msg_start_nested(&request, TCA_BASIC_POLICE);
    nl_msg_put_unspec(&request, TCA_POLICE_TBF, &tc_police, sizeof tc_police);
    tc_put_rtab(&request, TCA_POLICE_RATE, &tc_police.rate);
    nl_msg_end_nested(&request, police_offset);
    nl_msg_end_nested(&request, basic_offset);

    error = tc_transact(&request, NULL);
3733 /* The values in psched are not individually very meaningful, but they are
3734 * important. The tables below show some values seen in the wild.
3738 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
3739 * (Before that, there are hints that it was 1000000000.)
3741 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
3745 * -----------------------------------
3746 * [1] 000c8000 000f4240 000f4240 00000064
3747 * [2] 000003e8 00000400 000f4240 3b9aca00
3748 * [3] 000003e8 00000400 000f4240 3b9aca00
3749 * [4] 000003e8 00000400 000f4240 00000064
3750 * [5] 000003e8 00000040 000f4240 3b9aca00
3751 * [6] 000003e8 00000040 000f4240 000000f9
3753 * a b c d ticks_per_s buffer_hz
3754 * ------- --------- ---------- ------------- ----------- -------------
3755 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
3756 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3757 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3758 * [4] 1,000 1,024 1,000,000 100 976,562 100
3759 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
3760 * [6] 1,000 64 1,000,000 249 15,625,000 249
3762 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
3763 * [2] 2.6.26-1-686-bigmem from Debian lenny
3764 * [3] 2.6.26-2-sparc64 from Debian lenny
3765 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
3766 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
3767 * [6] 2.6.34 from kernel.org on KVM
    /* Body of the /proc/net/psched reader (the function signature is not
     * visible in this extract).  Runs once per process; parses the four hex
     * psched parameters a, b, c, d and derives 'ticks_per_s' (= a*c/b) and
     * 'buffer_hz'.  On any failure it logs a warning and presumably keeps
     * defaults (review: default-setting lines not visible here). */
    static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
    static const char fn[] = "/proc/net/psched";
    unsigned int a, b, c, d;

    if (!ovsthread_once_start(&once)) {
    stream = fopen(fn, "r");
        VLOG_WARN("%s: open failed: %s", fn, ovs_strerror(errno));
    if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
        VLOG_WARN("%s: read failed", fn);
    VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
        VLOG_WARN("%s: invalid scheduler parameters", fn);
    /* ticks_per_s = a * c / b, per the table in the comment above. */
    ticks_per_s = (double) a * c / b;
        VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
    VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);
    ovsthread_once_done(&once);
/* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
 * rate of 'rate' bytes per second. */
tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
    return (rate * ticks) / ticks_per_s;
/* Returns the number of ticks that it would take to transmit 'size' bytes at a
 * rate of 'rate' bytes per second.  Returns 0 when 'rate' is 0 (avoids
 * division by zero). */
tc_bytes_to_ticks(unsigned int rate, unsigned int size)
    return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
/* Returns the number of bytes that need to be reserved for qdisc buffering at
 * a transmission rate of 'rate' bytes per second (one jiffy's worth; see the
 * 'buffer_hz' comment above for why a huge buffer_hz yields 0). */
tc_buffer_per_jiffy(unsigned int rate)
    return rate / buffer_hz;
/* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
 * e.g. "htb", into '*kind' (if it is nonnull).  If 'options' is nonnull,
 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
 * stores NULL into it if it is absent.
 *
 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
 * 'msg'.
 *
 * Returns 0 if successful, otherwise a positive errno value. */
tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
               struct nlattr **options)
    static const struct nl_policy tca_policy[] = {
        [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
        [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
    struct nlattr *ta[ARRAY_SIZE(tca_policy)];

    /* Attributes start after the netlink and tcmsg headers. */
    if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
                         tca_policy, ta, ARRAY_SIZE(ta))) {
        VLOG_WARN_RL(&rl, "failed to parse qdisc message");

        *kind = nl_attr_get_string(ta[TCA_KIND]);

        *options = ta[TCA_OPTIONS];
/* Given Netlink 'msg' that describes a class, extracts the queue ID (e.g. the
 * minor number of its class ID) into '*queue_id', its TCA_OPTIONS attribute
 * into '*options', and its queue statistics into '*stats'.  Any of the output
 * arguments may be null.
 *
 * Returns 0 if successful, otherwise a positive errno value. */
tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
               struct nlattr **options, struct netdev_queue_stats *stats)
    static const struct nl_policy tca_policy[] = {
        [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
        [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
    struct nlattr *ta[ARRAY_SIZE(tca_policy)];

    if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
                         tca_policy, ta, ARRAY_SIZE(ta))) {
        VLOG_WARN_RL(&rl, "failed to parse class message");

        /* The class handle lives in the tcmsg header, not the attributes. */
        struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
        *handlep = tc->tcm_handle;

        *options = ta[TCA_OPTIONS];

        const struct gnet_stats_queue *gsq;
        struct gnet_stats_basic gsb;

        static const struct nl_policy stats_policy[] = {
            [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
                                  .min_len = sizeof gsb },
            [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
                                  .min_len = sizeof *gsq },
        struct nlattr *sa[ARRAY_SIZE(stats_policy)];

        if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
                             sa, ARRAY_SIZE(sa))) {
            VLOG_WARN_RL(&rl, "failed to parse class stats");

        /* Alignment issues screw up the length of struct gnet_stats_basic on
         * some arch/bitsize combinations.  Newer versions of Linux have a
         * struct gnet_stats_basic_packed, but we can't depend on that.  The
         * easiest thing to do is just to make a copy. */
        memset(&gsb, 0, sizeof gsb);
        memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
               MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
        stats->tx_bytes = gsb.bytes;
        stats->tx_packets = gsb.packets;

        /* Report queue drops as tx_errors; tc has no closer match. */
        gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
        stats->tx_errors = gsq->drops;

        memset(stats, 0, sizeof *stats);
3960 /* Queries the kernel for class with identifier 'handle' and parent 'parent'
/* Sends an RTM_GETTCLASS request for 'netdev' and stores the kernel's echoed
 * reply in '*replyp'; logs (rate-limited) and returns the error on failure. */
3963 tc_query_class(const struct netdev *netdev,
3964 unsigned int handle, unsigned int parent,
3965 struct ofpbuf **replyp)
3967 struct ofpbuf request;
3968 struct tcmsg *tcmsg;
/* NLM_F_ECHO asks the kernel to echo the class back in the reply. */
3971 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO, &request);
3975 tcmsg->tcm_handle = handle;
3976 tcmsg->tcm_parent = parent;
3978 error = tc_transact(&request, replyp);
3980 VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
3981 netdev_get_name(netdev),
3982 tc_get_major(handle), tc_get_minor(handle),
3983 tc_get_major(parent), tc_get_minor(parent),
3984 ovs_strerror(error));
3989 /* Equivalent to "tc class del dev <name> handle <handle>". */
/* Sends an RTM_DELTCLASS request; failures are logged rate-limited and the
 * errno value is reported to the caller. */
3991 tc_delete_class(const struct netdev *netdev, unsigned int handle)
3993 struct ofpbuf request;
3994 struct tcmsg *tcmsg;
3997 tcmsg = tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
4001 tcmsg->tcm_handle = handle;
4002 tcmsg->tcm_parent = 0;
/* No reply is requested (NULL), only success/failure matters here. */
4004 error = tc_transact(&request, NULL);
4006 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
4007 netdev_get_name(netdev),
4008 tc_get_major(handle), tc_get_minor(handle),
4009 ovs_strerror(error));
4014 /* Equivalent to "tc qdisc del dev <name> root". */
/* Deletes the root qdisc on 'netdev_' and tears down the cached local tc
 * state ('netdev->tc') that mirrored it. */
4016 tc_del_qdisc(struct netdev *netdev_)
4018 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4019 struct ofpbuf request;
4020 struct tcmsg *tcmsg;
/* Handle 1:0 with parent TC_H_ROOT names the root qdisc we installed. */
4023 tcmsg = tc_make_request(netdev_, RTM_DELQDISC, 0, &request);
4027 tcmsg->tcm_handle = tc_make_handle(1, 0);
4028 tcmsg->tcm_parent = TC_H_ROOT;
4030 error = tc_transact(&request, NULL);
4031 if (error == EINVAL) {
4032 /* EINVAL probably means that the default qdisc was in use, in which
4033 * case we've accomplished our purpose. */
/* Free our cached qdisc representation so it is re-queried next time. */
4036 if (!error && netdev->tc) {
4037 if (netdev->tc->ops->tc_destroy) {
4038 netdev->tc->ops->tc_destroy(netdev->tc);
4045 /* If 'netdev''s qdisc type and parameters are not yet known, queries the
4046 * kernel to determine what they are. Returns 0 if successful, otherwise a
4047 * positive errno value. */
4049 tc_query_qdisc(const struct netdev *netdev_)
4051 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4052 struct ofpbuf request, *qdisc;
4053 const struct tc_ops *ops;
4054 struct tcmsg *tcmsg;
4062 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
4063 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
4064 * 2.6.35 without that fix backported to it.
4066 * To avoid the OOPS, we must not make a request that would attempt to dump
4067 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
4068 * few others. There are a few ways that I can see to do this, but most of
4069 * them seem to be racy (and if you lose the race the kernel OOPSes). The
4070 * technique chosen here is to assume that any non-default qdisc that we
4071 * create will have a class with handle 1:0. The built-in qdiscs only have
4072 * a class with handle 0:0.
4074 * We could check for Linux 2.6.35+ and use a more straightforward method
4076 tcmsg = tc_make_request(netdev_, RTM_GETQDISC, NLM_F_ECHO, &request);
4080 tcmsg->tcm_handle = tc_make_handle(1, 0);
4081 tcmsg->tcm_parent = 0;
4083 /* Figure out what tc class to instantiate. */
4084 error = tc_transact(&request, &qdisc);
/* Success: parse the qdisc kind and map it to a known tc_ops. */
4088 error = tc_parse_qdisc(qdisc, &kind, NULL);
4090 ops = &tc_ops_other;
4092 ops = tc_lookup_linux_name(kind);
/* A qdisc kind we do not manage: log once in a while, treat as "other". */
4094 static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
4095 VLOG_INFO_RL(&rl2, "unknown qdisc \"%s\"", kind);
4097 ops = &tc_ops_other;
4100 } else if (error == ENOENT) {
4101 /* Either it's a built-in qdisc, or it's a qdisc set up by some
4102 * other entity that doesn't have a handle 1:0. We will assume
4103 * that it's the system default qdisc. */
4104 ops = &tc_ops_default;
4107 /* Who knows? Maybe the device got deleted. */
4108 VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
4109 netdev_get_name(netdev_), ovs_strerror(error));
4110 ops = &tc_ops_other;
4113 /* Instantiate it. */
/* Invariant: tc_load() succeeds iff it populated netdev->tc. */
4114 load_error = ops->tc_load(CONST_CAST(struct netdev *, netdev_), qdisc);
4115 ovs_assert((load_error == 0) == (netdev->tc != NULL));
4116 ofpbuf_delete(qdisc);
4118 return error ? error : load_error;
4121 /* Linux traffic control uses tables with 256 entries ("rtab" tables) to
4122 approximate the time to transmit packets of various lengths. For an MTU of
4123 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
4124 represents two possible packet lengths; for a MTU of 513 through 1024, four
4125 possible lengths; and so on.
4127 Returns, for the specified 'mtu', the number of bits that packet lengths
4128 need to be shifted right to fit within such a 256-entry table. */
4130 tc_calc_cell_log(unsigned int mtu)
/* A zero/unset 'mtu' falls back to the standard Ethernet payload size. */
4135 mtu = ETH_PAYLOAD_MAX;
/* Account for L2 framing on top of the payload MTU. */
4137 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
/* Smallest shift such that (mtu >> cell_log) < 256. */
4139 for (cell_log = 0; mtu >= 256; cell_log++) {
4146 /* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
4149 tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
4151 memset(rate, 0, sizeof *rate);
4152 rate->cell_log = tc_calc_cell_log(mtu);
4153 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
4154 /* rate->cell_align = 0; */ /* distro headers. */
/* Minimum packet unit: no frame is billed below the Ethernet minimum. */
4155 rate->mpu = ETH_TOTAL_MIN;
4159 /* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
4160 * attribute of the specified "type".
4162 * See tc_calc_cell_log() above for a description of "rtab"s. */
4164 tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
4169 rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
/* Entry i covers packet sizes up to (i + 1) << cell_log; each entry holds
 * the transmission time, in ticks, of that packet size at 'rate'. */
4170 for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
4171 unsigned packet_size = (i + 1) << rate->cell_log;
/* Clamp to the minimum packet unit, matching rate->mpu semantics. */
4172 if (packet_size < rate->mpu) {
4173 packet_size = rate->mpu;
4175 rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
4179 /* Calculates the proper value of 'buffer' or 'cbuffer' in HTB options given a
4180 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
4181 * burst size of 'burst_bytes'. (If no value was requested, a 'burst_bytes' of
4184 tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
/* The burst must cover at least one jiffy of traffic plus one MTU-sized
 * packet; the result is expressed in kernel ticks, as HTB expects. */
4186 unsigned int min_burst = tc_buffer_per_jiffy(Bps) + mtu;
4187 return tc_bytes_to_ticks(Bps, MAX(burst_bytes, min_burst));
4190 /* Linux-only functions declared in netdev-linux.h */
4192 /* Modifies the 'flag' bit in ethtool's flags field for 'netdev'. If
4193 * 'enable' is true, the bit is set. Otherwise, it is cleared. */
4195 netdev_linux_ethtool_set_flag(struct netdev *netdev, uint32_t flag,
4196 const char *flag_name, bool enable)
4198 const char *netdev_name = netdev_get_name(netdev);
4199 struct ethtool_value evalue;
/* Step 1: read the current flags (ETHTOOL_GFLAGS). */
4203 COVERAGE_INC(netdev_get_ethtool);
4204 memset(&evalue, 0, sizeof evalue);
4205 error = netdev_linux_do_ethtool(netdev_name,
4206 (struct ethtool_cmd *)&evalue,
4207 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
/* Step 2: write the flags back with 'flag' set or cleared. */
4212 COVERAGE_INC(netdev_set_ethtool);
4213 evalue.data = new_flags = (evalue.data & ~flag) | (enable ? flag : 0);
4214 error = netdev_linux_do_ethtool(netdev_name,
4215 (struct ethtool_cmd *)&evalue,
4216 ETHTOOL_SFLAGS, "ETHTOOL_SFLAGS");
/* Step 3: read the flags again to verify the driver actually applied the
 * change, since some drivers silently ignore unsupported flags. */
4221 COVERAGE_INC(netdev_get_ethtool);
4222 memset(&evalue, 0, sizeof evalue);
4223 error = netdev_linux_do_ethtool(netdev_name,
4224 (struct ethtool_cmd *)&evalue,
4225 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
4230 if (new_flags != evalue.data) {
4231 VLOG_WARN_RL(&rl, "attempt to %s ethtool %s flag on network "
4232 "device %s failed", enable ? "enable" : "disable",
4233 flag_name, netdev_name);
4240 /* Utility functions. */
4242 /* Copies 'src' into 'dst', performing format conversion in the process. */
/* Field-for-field widening copy from the kernel's 32-bit rtnl_link_stats
 * counters into OVS's 64-bit netdev_stats. */
4244 netdev_stats_from_rtnl_link_stats(struct netdev_stats *dst,
4245 const struct rtnl_link_stats *src)
4247 dst->rx_packets = src->rx_packets;
4248 dst->tx_packets = src->tx_packets;
4249 dst->rx_bytes = src->rx_bytes;
4250 dst->tx_bytes = src->tx_bytes;
4251 dst->rx_errors = src->rx_errors;
4252 dst->tx_errors = src->tx_errors;
4253 dst->rx_dropped = src->rx_dropped;
4254 dst->tx_dropped = src->tx_dropped;
4255 dst->multicast = src->multicast;
4256 dst->collisions = src->collisions;
4257 dst->rx_length_errors = src->rx_length_errors;
4258 dst->rx_over_errors = src->rx_over_errors;
4259 dst->rx_crc_errors = src->rx_crc_errors;
4260 dst->rx_frame_errors = src->rx_frame_errors;
4261 dst->rx_fifo_errors = src->rx_fifo_errors;
4262 dst->rx_missed_errors = src->rx_missed_errors;
4263 dst->tx_aborted_errors = src->tx_aborted_errors;
4264 dst->tx_carrier_errors = src->tx_carrier_errors;
4265 dst->tx_fifo_errors = src->tx_fifo_errors;
4266 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
4267 dst->tx_window_errors = src->tx_window_errors;
/* Fetches interface statistics for 'ifindex' via an RTM_GETLINK netlink
 * request and converts them into '*stats'. */
4271 get_stats_via_netlink(int ifindex, struct netdev_stats *stats)
4273 /* Policy for RTNLGRP_LINK messages.
4275 * There are *many* more fields in these messages, but currently we only
4276 * care about these fields. */
4277 static const struct nl_policy rtnlgrp_link_policy[] = {
4278 [IFLA_IFNAME] = { .type = NL_A_STRING, .optional = false },
4279 [IFLA_STATS] = { .type = NL_A_UNSPEC, .optional = true,
4280 .min_len = sizeof(struct rtnl_link_stats) },
4283 struct ofpbuf request;
4284 struct ofpbuf *reply;
4285 struct ifinfomsg *ifi;
4286 struct nlattr *attrs[ARRAY_SIZE(rtnlgrp_link_policy)];
/* Build and send the RTM_GETLINK request for this ifindex. */
4289 ofpbuf_init(&request, 0);
4290 nl_msg_put_nlmsghdr(&request, sizeof *ifi, RTM_GETLINK, NLM_F_REQUEST);
4291 ifi = ofpbuf_put_zeros(&request, sizeof *ifi);
4292 ifi->ifi_family = PF_UNSPEC;
4293 ifi->ifi_index = ifindex;
4294 error = nl_transact(NETLINK_ROUTE, &request, &reply);
4295 ofpbuf_uninit(&request);
4300 if (!nl_policy_parse(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg),
4301 rtnlgrp_link_policy,
4302 attrs, ARRAY_SIZE(rtnlgrp_link_policy))) {
4303 ofpbuf_delete(reply);
/* IFLA_STATS is optional in the policy, so its absence must be checked. */
4307 if (!attrs[IFLA_STATS]) {
4308 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
4309 ofpbuf_delete(reply);
4313 netdev_stats_from_rtnl_link_stats(stats, nl_attr_get(attrs[IFLA_STATS]));
4315 ofpbuf_delete(reply);
/* Fallback statistics source: parses /proc/net/dev and fills '*stats' for
 * 'netdev_name'.  Counters that /proc/net/dev does not provide are set to
 * UINT64_MAX to mark them as unavailable. */
4321 get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats)
4323 static const char fn[] = "/proc/net/dev";
4328 stream = fopen(fn, "r");
4330 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, ovs_strerror(errno));
/* Scan every line for a matching device name; each data row carries 15
 * numeric columns (the "%*u" fields are skipped on purpose). */
4335 while (fgets(line, sizeof line, stream)) {
4338 #define X64 "%"SCNu64
4341 X64 X64 X64 X64 X64 X64 X64 "%*u"
4342 X64 X64 X64 X64 X64 X64 X64 "%*u",
4348 &stats->rx_fifo_errors,
4349 &stats->rx_frame_errors,
4355 &stats->tx_fifo_errors,
4357 &stats->tx_carrier_errors) != 15) {
4358 VLOG_WARN_RL(&rl, "%s:%d: parse error", fn, ln);
4359 } else if (!strcmp(devname, netdev_name)) {
/* These counters are not present in /proc/net/dev. */
4360 stats->rx_length_errors = UINT64_MAX;
4361 stats->rx_over_errors = UINT64_MAX;
4362 stats->rx_crc_errors = UINT64_MAX;
4363 stats->rx_missed_errors = UINT64_MAX;
4364 stats->tx_aborted_errors = UINT64_MAX;
4365 stats->tx_heartbeat_errors = UINT64_MAX;
4366 stats->tx_window_errors = UINT64_MAX;
/* Reaching here means the device never appeared in the file. */
4372 VLOG_WARN_RL(&rl, "%s: no stats for %s", fn, netdev_name);
/* Reads the IFF_* interface flags of 'dev' via SIOCGIFFLAGS into '*flags'. */
4378 get_flags(const struct netdev *dev, unsigned int *flags)
4384 error = af_inet_ifreq_ioctl(dev->name, &ifr, SIOCGIFFLAGS, "SIOCGIFFLAGS");
4386 *flags = ifr.ifr_flags;
/* Sets the IFF_* interface flags of device 'name' via SIOCSIFFLAGS. */
4392 set_flags(const char *name, unsigned int flags)
4396 ifr.ifr_flags = flags;
4397 return af_inet_ifreq_ioctl(name, &ifr, SIOCSIFFLAGS, "SIOCSIFFLAGS");
/* Looks up the kernel ifindex of 'netdev_name' with SIOCGIFINDEX.  Returns
 * the ifindex on success; on failure the error is logged (rate-limited) —
 * presumably a negative errno is returned above, per get_ifindex()'s
 * negation of the result — TODO confirm against the elided lines. */
4401 do_get_ifindex(const char *netdev_name)
4406 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4407 COVERAGE_INC(netdev_get_ifindex);
4409 error = af_inet_ioctl(SIOCGIFINDEX, &ifr);
4411 VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
4412 netdev_name, ovs_strerror(error));
4415 return ifr.ifr_ifindex;
/* Returns 'netdev_''s ifindex in '*ifindexp', caching the result (including
 * a failure) under the VALID_IFINDEX cache flag so the ioctl runs at most
 * once per device. */
4419 get_ifindex(const struct netdev *netdev_, int *ifindexp)
4421 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4423 if (!(netdev->cache_valid & VALID_IFINDEX)) {
4424 int ifindex = do_get_ifindex(netdev_get_name(netdev_));
/* A negative result encodes -errno; store it positively for callers. */
4427 netdev->get_ifindex_error = -ifindex;
4428 netdev->ifindex = 0;
4430 netdev->get_ifindex_error = 0;
4431 netdev->ifindex = ifindex;
4433 netdev->cache_valid |= VALID_IFINDEX;
4436 *ifindexp = netdev->ifindex;
4437 return netdev->get_ifindex_error;
/* Reads the Ethernet hardware address of 'netdev_name' into 'ea' using
 * SIOCGIFHWADDR. */
4441 get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN])
4447 memset(&ifr, 0, sizeof ifr);
4448 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4449 COVERAGE_INC(netdev_get_hwaddr);
4450 error = af_inet_ioctl(SIOCGIFHWADDR, &ifr);
4452 /* ENODEV probably means that a vif disappeared asynchronously and
4453 * hasn't been removed from the database yet, so reduce the log level
4454 * to INFO for that case. */
4455 VLOG(error == ENODEV ? VLL_INFO : VLL_ERR,
4456 "ioctl(SIOCGIFHWADDR) on %s device failed: %s",
4457 netdev_name, ovs_strerror(error));
/* Only Ethernet (or unspecified) hardware address families are accepted. */
4460 hwaddr_family = ifr.ifr_hwaddr.sa_family;
4461 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER) {
4462 VLOG_WARN("%s device has unknown hardware address family %d",
4463 netdev_name, hwaddr_family);
4465 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
/* Sets the Ethernet hardware address of 'netdev_name' to 'mac' using
 * SIOCSIFHWADDR; logs an error (not rate-limited) on failure. */
4470 set_etheraddr(const char *netdev_name,
4471 const uint8_t mac[ETH_ADDR_LEN])
4476 memset(&ifr, 0, sizeof ifr);
4477 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4478 ifr.ifr_hwaddr.sa_family = ARPHRD_ETHER;
4479 memcpy(ifr.ifr_hwaddr.sa_data, mac, ETH_ADDR_LEN);
4480 COVERAGE_INC(netdev_set_hwaddr);
4481 error = af_inet_ioctl(SIOCSIFHWADDR, &ifr);
4483 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
4484 netdev_name, ovs_strerror(error));
/* Issues ethtool command 'cmd' (named 'cmd_name' for logging) on device
 * 'name', with 'ecmd' as the in/out command buffer.  EOPNOTSUPP is common
 * and deliberately not logged. */
4490 netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
4491 int cmd, const char *cmd_name)
4496 memset(&ifr, 0, sizeof ifr);
4497 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
/* The kernel ethtool interface passes the command struct via ifr_data. */
4498 ifr.ifr_data = (caddr_t) ecmd;
4501 error = af_inet_ioctl(SIOCETHTOOL, &ifr);
4503 if (error != EOPNOTSUPP) {
4504 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
4505 "failed: %s", cmd_name, name, ovs_strerror(error));
4507 /* The device doesn't support this operation. That's pretty
4508 * common, so there's no point in logging anything. */
/* Retrieves an IPv4 address of 'netdev' into '*ip' using the interface
 * ioctl 'cmd' (e.g. SIOCGIFADDR); 'cmd_name' is used for error logging. */
4515 netdev_linux_get_ipv4(const struct netdev *netdev, struct in_addr *ip,
4516 int cmd, const char *cmd_name)
4521 ifr.ifr_addr.sa_family = AF_INET;
4522 error = af_inet_ifreq_ioctl(netdev_get_name(netdev), &ifr, cmd, cmd_name);
/* ALIGNED_CAST avoids a strict-alignment warning when reinterpreting the
 * generic sockaddr as sockaddr_in. */
4524 const struct sockaddr_in *sin = ALIGNED_CAST(struct sockaddr_in *,
4526 *ip = sin->sin_addr;
4531 /* Returns an AF_PACKET raw socket or a negative errno value. */
4533 af_packet_sock(void)
4535 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
4538 if (ovsthread_once_start(&once)) {
4539 sock = socket(AF_PACKET, SOCK_RAW, 0);
4541 int error = set_nonblocking(sock);
4548 VLOG_ERR("failed to create packet socket: %s",
4549 ovs_strerror(errno));
4551 ovsthread_once_done(&once);