2 * Copyright (c) 2009, 2010, 2011, 2012, 2013 Nicira, Inc.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
19 #include "netdev-linux.h"
23 #include <arpa/inet.h>
25 #include <linux/filter.h>
26 #include <linux/gen_stats.h>
27 #include <linux/if_ether.h>
28 #include <linux/if_tun.h>
29 #include <linux/types.h>
30 #include <linux/ethtool.h>
31 #include <linux/mii.h>
32 #include <linux/pkt_cls.h>
33 #include <linux/pkt_sched.h>
34 #include <linux/rtnetlink.h>
35 #include <linux/sockios.h>
36 #include <linux/version.h>
37 #include <sys/types.h>
38 #include <sys/ioctl.h>
39 #include <sys/socket.h>
40 #include <netpacket/packet.h>
42 #include <net/if_arp.h>
43 #include <net/if_packet.h>
44 #include <net/route.h>
45 #include <netinet/in.h>
52 #include "dpif-linux.h"
53 #include "dynamic-string.h"
54 #include "fatal-signal.h"
57 #include "netdev-provider.h"
58 #include "netdev-vport.h"
59 #include "netlink-notifier.h"
60 #include "netlink-socket.h"
63 #include "openflow/openflow.h"
65 #include "poll-loop.h"
66 #include "rtnetlink-link.h"
68 #include "socket-util.h"
71 #include "unaligned.h"
74 VLOG_DEFINE_THIS_MODULE(netdev_linux);
76 COVERAGE_DEFINE(netdev_set_policing);
77 COVERAGE_DEFINE(netdev_arp_lookup);
78 COVERAGE_DEFINE(netdev_get_ifindex);
79 COVERAGE_DEFINE(netdev_get_hwaddr);
80 COVERAGE_DEFINE(netdev_set_hwaddr);
81 COVERAGE_DEFINE(netdev_get_ethtool);
82 COVERAGE_DEFINE(netdev_set_ethtool);
85 /* These were introduced in Linux 2.6.14, so they might be missing if we have
87 #ifndef ADVERTISED_Pause
88 #define ADVERTISED_Pause (1 << 13)
90 #ifndef ADVERTISED_Asym_Pause
91 #define ADVERTISED_Asym_Pause (1 << 14)
94 /* These were introduced in Linux 2.6.24, so they might be missing if we
95 * have old headers. */
96 #ifndef ETHTOOL_GFLAGS
97 #define ETHTOOL_GFLAGS 0x00000025 /* Get flags bitmap(ethtool_value) */
99 #ifndef ETHTOOL_SFLAGS
100 #define ETHTOOL_SFLAGS 0x00000026 /* Set flags bitmap(ethtool_value) */
103 /* This was introduced in Linux 2.6.25, so it might be missing if we have old
106 #define TC_RTAB_SIZE 1024
109 static struct nln_notifier *netdev_linux_cache_notifier = NULL;
110 static int cache_notifier_refcount;
113 VALID_IFINDEX = 1 << 0,
114 VALID_ETHERADDR = 1 << 1,
118 VALID_POLICING = 1 << 5,
119 VALID_VPORT_STAT_ERROR = 1 << 6,
120 VALID_DRVINFO = 1 << 7,
121 VALID_FEATURES = 1 << 8,
128 /* Traffic control. */
130 /* An instance of a traffic control class. Always associated with a particular
133 * Each TC implementation subclasses this with whatever additional data it
136 const struct tc_ops *ops;
137 struct hmap queues; /* Contains "struct tc_queue"s.
138 * Read by generic TC layer.
139 * Written only by TC implementation. */
142 #define TC_INITIALIZER(TC, OPS) { OPS, HMAP_INITIALIZER(&(TC)->queues) }
144 /* One traffic control queue.
146 * Each TC implementation subclasses this with whatever additional data it
149 struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
150 unsigned int queue_id; /* OpenFlow queue ID. */
153 /* A particular kind of traffic control. Each implementation generally maps to
154 * one particular Linux qdisc class.
156 * The functions below return 0 if successful or a positive errno value on
157 * failure, except where otherwise noted. All of them must be provided, except
158 * where otherwise noted. */
160 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
161 * This is null for tc_ops_default and tc_ops_other, for which there are no
162 * appropriate values. */
163 const char *linux_name;
165 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
166 const char *ovs_name;
168 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
169 * queues. The queues are numbered 0 through n_queues - 1. */
170 unsigned int n_queues;
172 /* Called to install this TC class on 'netdev'. The implementation should
173 * make the Netlink calls required to set up 'netdev' with the right qdisc
174 * and configure it according to 'details'. The implementation may assume
175 * that the current qdisc is the default; that is, there is no need for it
176 * to delete the current qdisc before installing itself.
178 * The contents of 'details' should be documented as valid for 'ovs_name'
179 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
180 * (which is built as ovs-vswitchd.conf.db(8)).
182 * This function must return 0 if and only if it sets 'netdev->tc' to an
183 * initialized 'struct tc'.
185 * (This function is null for tc_ops_other, which cannot be installed. For
186 * other TC classes it should always be nonnull.) */
187 int (*tc_install)(struct netdev *netdev, const struct smap *details);
189 /* Called when the netdev code determines (through a Netlink query) that
190 * this TC class's qdisc is installed on 'netdev', but we didn't install
191 * it ourselves and so don't know any of the details.
193 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
194 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
195 * implementation should parse the other attributes of 'nlmsg' as
196 * necessary to determine its configuration. If necessary it should also
197 * use Netlink queries to determine the configuration of queues on
200 * This function must return 0 if and only if it sets 'netdev->tc' to an
201 * initialized 'struct tc'. */
202 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
204 /* Destroys the data structures allocated by the implementation as part of
205 * 'tc'. (This includes destroying 'tc->queues' by calling
208 * The implementation should not need to perform any Netlink calls. If
209 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
210 * (But it may not be desirable.)
212 * This function may be null if 'tc' is trivial. */
213 void (*tc_destroy)(struct tc *tc);
215 /* Retrieves details of 'netdev->tc' configuration into 'details'.
217 * The implementation should not need to perform any Netlink calls, because
218 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
219 * cached the configuration.
221 * The contents of 'details' should be documented as valid for 'ovs_name'
222 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
223 * (which is built as ovs-vswitchd.conf.db(8)).
225 * This function may be null if 'tc' is not configurable.
227 int (*qdisc_get)(const struct netdev *netdev, struct smap *details);
229 /* Reconfigures 'netdev->tc' according to 'details', performing any
230 * required Netlink calls to complete the reconfiguration.
232 * The contents of 'details' should be documented as valid for 'ovs_name'
233 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
234 * (which is built as ovs-vswitchd.conf.db(8)).
236 * This function may be null if 'tc' is not configurable.
238 int (*qdisc_set)(struct netdev *, const struct smap *details);
240 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
241 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
243 * The contents of 'details' should be documented as valid for 'ovs_name'
244 * in the "other_config" column in the "Queue" table in
245 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
247 * The implementation should not need to perform any Netlink calls, because
248 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
249 * cached the queue configuration.
251 * This function may be null if 'tc' does not have queues ('n_queues' is
253 int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
254 struct smap *details);
256 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
257 * 'details', performing any required Netlink calls to complete the
258 * reconfiguration. The caller ensures that 'queue_id' is less than
261 * The contents of 'details' should be documented as valid for 'ovs_name'
262 * in the "other_config" column in the "Queue" table in
263 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
265 * This function may be null if 'tc' does not have queues or its queues are
266 * not configurable. */
267 int (*class_set)(struct netdev *, unsigned int queue_id,
268 const struct smap *details);
270 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
271 * tc_queue's within 'netdev->tc->queues'.
273 * This function may be null if 'tc' does not have queues or its queues
274 * cannot be deleted. */
275 int (*class_delete)(struct netdev *, struct tc_queue *queue);
277 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
278 * 'struct tc_queue's within 'netdev->tc->queues'.
280 * On success, initializes '*stats'.
282 * This function may be null if 'tc' does not have queues or if it cannot
283 * report queue statistics. */
284 int (*class_get_stats)(const struct netdev *netdev,
285 const struct tc_queue *queue,
286 struct netdev_queue_stats *stats);
288 /* Extracts queue stats from 'nlmsg', which is a response to a
289 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
291 * This function may be null if 'tc' does not have queues or if it cannot
292 * report queue statistics. */
293 int (*class_dump_stats)(const struct netdev *netdev,
294 const struct ofpbuf *nlmsg,
295 netdev_dump_queue_stats_cb *cb, void *aux);
299 tc_init(struct tc *tc, const struct tc_ops *ops)
302 hmap_init(&tc->queues);
306 tc_destroy(struct tc *tc)
308 hmap_destroy(&tc->queues);
311 static const struct tc_ops tc_ops_htb;
312 static const struct tc_ops tc_ops_hfsc;
313 static const struct tc_ops tc_ops_default;
314 static const struct tc_ops tc_ops_other;
316 static const struct tc_ops *const tcs[] = {
317 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
318 &tc_ops_hfsc, /* Hierarchical fair service curve. */
319 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
320 &tc_ops_other, /* Some other qdisc. */
324 static unsigned int tc_make_handle(unsigned int major, unsigned int minor);
325 static unsigned int tc_get_major(unsigned int handle);
326 static unsigned int tc_get_minor(unsigned int handle);
328 static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
329 static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
330 static unsigned int tc_buffer_per_jiffy(unsigned int rate);
332 static struct tcmsg *tc_make_request(const struct netdev *, int type,
333 unsigned int flags, struct ofpbuf *);
334 static int tc_transact(struct ofpbuf *request, struct ofpbuf **replyp);
335 static int tc_add_del_ingress_qdisc(struct netdev *netdev, bool add);
336 static int tc_add_policer(struct netdev *netdev, int kbits_rate,
339 static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
340 struct nlattr **options);
341 static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
342 struct nlattr **options,
343 struct netdev_queue_stats *);
344 static int tc_query_class(const struct netdev *,
345 unsigned int handle, unsigned int parent,
346 struct ofpbuf **replyp);
347 static int tc_delete_class(const struct netdev *, unsigned int handle);
349 static int tc_del_qdisc(struct netdev *netdev);
350 static int tc_query_qdisc(const struct netdev *netdev);
352 static int tc_calc_cell_log(unsigned int mtu);
353 static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
354 static void tc_put_rtab(struct ofpbuf *, uint16_t type,
355 const struct tc_ratespec *rate);
356 static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
358 struct netdev_linux {
361 struct shash_node *shash_node;
362 unsigned int cache_valid;
363 unsigned int change_seq;
365 bool miimon; /* Link status of last poll. */
366 long long int miimon_interval; /* Miimon Poll rate. Disabled if <= 0. */
367 struct timer miimon_timer;
369 /* The following are figured out "on demand" only. They are only valid
370 * when the corresponding VALID_* bit in 'cache_valid' is set. */
372 uint8_t etheraddr[ETH_ADDR_LEN];
373 struct in_addr address, netmask;
376 unsigned int ifi_flags;
377 long long int carrier_resets;
378 uint32_t kbits_rate; /* Policing data. */
379 uint32_t kbits_burst;
380 int vport_stats_error; /* Cached error code from vport_get_stats().
381 0 or an errno value. */
382 int netdev_mtu_error; /* Cached error code from SIOCGIFMTU or SIOCSIFMTU. */
383 int ether_addr_error; /* Cached error code from set/get etheraddr. */
384 int netdev_policing_error; /* Cached error code from set policing. */
385 int get_features_error; /* Cached error code from ETHTOOL_GSET. */
386 int get_ifindex_error; /* Cached error code from SIOCGIFINDEX. */
388 enum netdev_features current; /* Cached from ETHTOOL_GSET. */
389 enum netdev_features advertised; /* Cached from ETHTOOL_GSET. */
390 enum netdev_features supported; /* Cached from ETHTOOL_GSET. */
391 enum netdev_features peer; /* Cached from ETHTOOL_GSET. */
393 struct ethtool_drvinfo drvinfo; /* Cached from ETHTOOL_GDRVINFO. */
397 struct tap_state tap;
401 struct netdev_rx_linux {
407 static const struct netdev_rx_class netdev_rx_linux_class;
409 /* Sockets used for ioctl operations. */
410 static int af_inet_sock = -1; /* AF_INET, SOCK_DGRAM. */
412 /* This is set pretty low because we probably won't learn anything from the
413 * additional log messages. */
414 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
416 static int netdev_linux_init(void);
418 static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
419 int cmd, const char *cmd_name);
420 static int netdev_linux_do_ioctl(const char *name, struct ifreq *, int cmd,
421 const char *cmd_name);
422 static int netdev_linux_get_ipv4(const struct netdev *, struct in_addr *,
423 int cmd, const char *cmd_name);
424 static int get_flags(const struct netdev *, unsigned int *flags);
425 static int set_flags(const char *, unsigned int flags);
426 static int do_get_ifindex(const char *netdev_name);
427 static int get_ifindex(const struct netdev *, int *ifindexp);
428 static int do_set_addr(struct netdev *netdev,
429 int ioctl_nr, const char *ioctl_name,
430 struct in_addr addr);
431 static int get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN]);
432 static int set_etheraddr(const char *netdev_name, const uint8_t[ETH_ADDR_LEN]);
433 static int get_stats_via_netlink(int ifindex, struct netdev_stats *stats);
434 static int get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats);
435 static int af_packet_sock(void);
436 static void netdev_linux_miimon_run(void);
437 static void netdev_linux_miimon_wait(void);
440 is_netdev_linux_class(const struct netdev_class *netdev_class)
442 return netdev_class->init == netdev_linux_init;
446 is_tap_netdev(const struct netdev *netdev)
448 return netdev_get_class(netdev) == &netdev_tap_class;
451 static struct netdev_linux *
452 netdev_linux_cast(const struct netdev *netdev)
454 ovs_assert(is_netdev_linux_class(netdev_get_class(netdev)));
456 return CONTAINER_OF(netdev, struct netdev_linux, up);
459 static struct netdev_rx_linux *
460 netdev_rx_linux_cast(const struct netdev_rx *rx)
462 netdev_rx_assert_class(rx, &netdev_rx_linux_class);
463 return CONTAINER_OF(rx, struct netdev_rx_linux, up);
467 netdev_linux_init(void)
469 static int status = -1;
471 /* Create AF_INET socket. */
472 af_inet_sock = socket(AF_INET, SOCK_DGRAM, 0);
473 status = af_inet_sock >= 0 ? 0 : errno;
475 VLOG_ERR("failed to create inet socket: %s", ovs_strerror(status));
482 netdev_linux_run(void)
484 rtnetlink_link_run();
485 netdev_linux_miimon_run();
489 netdev_linux_wait(void)
491 rtnetlink_link_wait();
492 netdev_linux_miimon_wait();
496 netdev_linux_changed(struct netdev_linux *dev,
497 unsigned int ifi_flags, unsigned int mask)
500 if (!dev->change_seq) {
504 if ((dev->ifi_flags ^ ifi_flags) & IFF_RUNNING) {
505 dev->carrier_resets++;
507 dev->ifi_flags = ifi_flags;
509 dev->cache_valid &= mask;
513 netdev_linux_update(struct netdev_linux *dev,
514 const struct rtnetlink_link_change *change)
516 if (change->nlmsg_type == RTM_NEWLINK) {
518 netdev_linux_changed(dev, change->ifi_flags, VALID_DRVINFO);
520 /* Update netdev from rtnl-change msg. */
522 dev->mtu = change->mtu;
523 dev->cache_valid |= VALID_MTU;
524 dev->netdev_mtu_error = 0;
527 if (!eth_addr_is_zero(change->addr)) {
528 memcpy(dev->etheraddr, change->addr, ETH_ADDR_LEN);
529 dev->cache_valid |= VALID_ETHERADDR;
530 dev->ether_addr_error = 0;
533 dev->ifindex = change->ifi_index;
534 dev->cache_valid |= VALID_IFINDEX;
535 dev->get_ifindex_error = 0;
538 netdev_linux_changed(dev, change->ifi_flags, 0);
543 netdev_linux_cache_cb(const struct rtnetlink_link_change *change,
544 void *aux OVS_UNUSED)
546 struct netdev_linux *dev;
548 struct netdev *base_dev = netdev_from_name(change->ifname);
549 if (base_dev && is_netdev_linux_class(netdev_get_class(base_dev))) {
550 netdev_linux_update(netdev_linux_cast(base_dev), change);
553 struct shash device_shash;
554 struct shash_node *node;
556 shash_init(&device_shash);
557 netdev_get_devices(&netdev_linux_class, &device_shash);
558 SHASH_FOR_EACH (node, &device_shash) {
563 get_flags(&dev->up, &flags);
564 netdev_linux_changed(dev, flags, 0);
566 shash_destroy(&device_shash);
571 cache_notifier_ref(void)
573 if (!cache_notifier_refcount) {
574 ovs_assert(!netdev_linux_cache_notifier);
576 netdev_linux_cache_notifier =
577 rtnetlink_link_notifier_create(netdev_linux_cache_cb, NULL);
579 if (!netdev_linux_cache_notifier) {
583 cache_notifier_refcount++;
589 cache_notifier_unref(void)
591 ovs_assert(cache_notifier_refcount > 0);
592 if (!--cache_notifier_refcount) {
593 ovs_assert(netdev_linux_cache_notifier);
594 rtnetlink_link_notifier_destroy(netdev_linux_cache_notifier);
595 netdev_linux_cache_notifier = NULL;
599 /* Creates system and internal devices. */
601 netdev_linux_create(const struct netdev_class *class, const char *name,
602 struct netdev **netdevp)
604 struct netdev_linux *netdev;
607 error = cache_notifier_ref();
612 netdev = xzalloc(sizeof *netdev);
613 netdev->change_seq = 1;
614 netdev_init(&netdev->up, name, class);
615 error = get_flags(&netdev->up, &netdev->ifi_flags);
616 if (error == ENODEV) {
617 if (class != &netdev_internal_class) {
618 /* The device does not exist, so don't allow it to be opened. */
619 netdev_uninit(&netdev->up, false);
620 cache_notifier_unref();
624 /* "Internal" netdevs have to be created as netdev objects before
625 * they exist in the kernel, because creating them in the kernel
626 * happens by passing a netdev object to dpif_port_add().
627 * Therefore, ignore the error. */
631 *netdevp = &netdev->up;
635 /* For most types of netdevs we open the device for each call of
636 * netdev_open(). However, this is not the case with tap devices,
637 * since it is only possible to open the device once. In this
638 * situation we share a single file descriptor, and consequently
639 * buffers, across all readers. Therefore once data is read it will
640 * be unavailable to other reads for tap devices. */
642 netdev_linux_create_tap(const struct netdev_class *class OVS_UNUSED,
643 const char *name, struct netdev **netdevp)
645 struct netdev_linux *netdev;
646 struct tap_state *state;
647 static const char tap_dev[] = "/dev/net/tun";
651 netdev = xzalloc(sizeof *netdev);
652 state = &netdev->state.tap;
654 error = cache_notifier_ref();
659 /* Open tap device. */
660 state->fd = open(tap_dev, O_RDWR);
663 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, ovs_strerror(error));
664 goto error_unref_notifier;
667 /* Create tap device. */
668 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
669 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
670 if (ioctl(state->fd, TUNSETIFF, &ifr) == -1) {
671 VLOG_WARN("%s: creating tap device failed: %s", name,
672 ovs_strerror(errno));
674 goto error_unref_notifier;
677 /* Make non-blocking. */
678 error = set_nonblocking(state->fd);
680 goto error_unref_notifier;
683 netdev_init(&netdev->up, name, &netdev_tap_class);
684 *netdevp = &netdev->up;
687 error_unref_notifier:
688 cache_notifier_unref();
695 destroy_tap(struct netdev_linux *netdev)
697 struct tap_state *state = &netdev->state.tap;
699 if (state->fd >= 0) {
704 /* Destroys the netdev device 'netdev_'. */
706 netdev_linux_destroy(struct netdev *netdev_)
708 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
710 if (netdev->tc && netdev->tc->ops->tc_destroy) {
711 netdev->tc->ops->tc_destroy(netdev->tc);
714 if (netdev_get_class(netdev_) == &netdev_tap_class) {
719 cache_notifier_unref();
723 netdev_linux_rx_open(struct netdev *netdev_, struct netdev_rx **rxp)
725 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
726 bool is_tap = is_tap_netdev(netdev_);
727 struct netdev_rx_linux *rx;
732 fd = netdev->state.tap.fd;
734 struct sockaddr_ll sll;
736 /* Result of tcpdump -dd inbound */
737 static struct sock_filter filt[] = {
738 { 0x28, 0, 0, 0xfffff004 }, /* ldh [0] */
739 { 0x15, 0, 1, 0x00000004 }, /* jeq #4 jt 2 jf 3 */
740 { 0x6, 0, 0, 0x00000000 }, /* ret #0 */
741 { 0x6, 0, 0, 0x0000ffff } /* ret #65535 */
743 static struct sock_fprog fprog = { ARRAY_SIZE(filt), filt };
745 /* Create file descriptor. */
746 fd = socket(PF_PACKET, SOCK_RAW, 0);
749 VLOG_ERR("failed to create raw socket (%s)", ovs_strerror(error));
753 /* Set non-blocking mode. */
754 error = set_nonblocking(fd);
759 /* Get ethernet device index. */
760 error = get_ifindex(&netdev->up, &ifindex);
765 /* Bind to specific ethernet device. */
766 memset(&sll, 0, sizeof sll);
767 sll.sll_family = AF_PACKET;
768 sll.sll_ifindex = ifindex;
769 sll.sll_protocol = (OVS_FORCE unsigned short int) htons(ETH_P_ALL);
770 if (bind(fd, (struct sockaddr *) &sll, sizeof sll) < 0) {
772 VLOG_ERR("%s: failed to bind raw socket (%s)",
773 netdev_get_name(netdev_), ovs_strerror(error));
777 /* Filter for only inbound packets. */
778 error = setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER, &fprog,
782 VLOG_ERR("%s: failed attach filter (%s)",
783 netdev_get_name(netdev_), ovs_strerror(error));
788 rx = xmalloc(sizeof *rx);
789 netdev_rx_init(&rx->up, netdev_, &netdev_rx_linux_class);
804 netdev_rx_linux_destroy(struct netdev_rx *rx_)
806 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
815 netdev_rx_linux_recv(struct netdev_rx *rx_, void *data, size_t size)
817 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
822 ? read(rx->fd, data, size)
823 : recv(rx->fd, data, size, MSG_TRUNC));
824 } while (retval < 0 && errno == EINTR);
827 return retval > size ? -EMSGSIZE : retval;
829 if (errno != EAGAIN) {
830 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
831 ovs_strerror(errno), netdev_rx_get_name(rx_));
838 netdev_rx_linux_wait(struct netdev_rx *rx_)
840 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
841 poll_fd_wait(rx->fd, POLLIN);
845 netdev_rx_linux_drain(struct netdev_rx *rx_)
847 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
850 int error = netdev_linux_do_ioctl(netdev_rx_get_name(rx_), &ifr,
851 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
855 drain_fd(rx->fd, ifr.ifr_qlen);
858 return drain_rcvbuf(rx->fd);
862 /* Sends 'buffer' on 'netdev'. Returns 0 if successful, otherwise a positive
863 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
864 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
865 * the packet is too big or too small to transmit on the device.
867 * The caller retains ownership of 'buffer' in all cases.
869 * The kernel maintains a packet transmission queue, so the caller is not
870 * expected to do additional queuing of packets. */
872 netdev_linux_send(struct netdev *netdev_, const void *data, size_t size)
877 if (!is_tap_netdev(netdev_)) {
878 /* Use our AF_PACKET socket to send to this device. */
879 struct sockaddr_ll sll;
886 sock = af_packet_sock();
891 error = get_ifindex(netdev_, &ifindex);
896 /* We don't bother setting most fields in sockaddr_ll because the
897 * kernel ignores them for SOCK_RAW. */
898 memset(&sll, 0, sizeof sll);
899 sll.sll_family = AF_PACKET;
900 sll.sll_ifindex = ifindex;
902 iov.iov_base = CONST_CAST(void *, data);
906 msg.msg_namelen = sizeof sll;
909 msg.msg_control = NULL;
910 msg.msg_controllen = 0;
913 retval = sendmsg(sock, &msg, 0);
915 /* Use the tap fd to send to this device. This is essential for
916 * tap devices, because packets sent to a tap device with an
917 * AF_PACKET socket will loop back to be *received* again on the
918 * tap device. This doesn't occur on other interface types
919 * because we attach a socket filter to the rx socket. */
920 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
922 retval = write(netdev->state.tap.fd, data, size);
926 /* The Linux AF_PACKET implementation never blocks waiting for room
927 * for packets, instead returning ENOBUFS. Translate this into
928 * EAGAIN for the caller. */
929 if (errno == ENOBUFS) {
931 } else if (errno == EINTR) {
933 } else if (errno != EAGAIN) {
934 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
935 netdev_get_name(netdev_), ovs_strerror(errno));
938 } else if (retval != size) {
939 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%zd bytes of "
940 "%zu) on %s", retval, size, netdev_get_name(netdev_));
948 /* Registers with the poll loop to wake up from the next call to poll_block()
949 * when the packet transmission queue has sufficient room to transmit a packet
950 * with netdev_send().
952 * The kernel maintains a packet transmission queue, so the client is not
953 * expected to do additional queuing of packets. Thus, this function is
954 * unlikely to ever be used. It is included for completeness. */
956 netdev_linux_send_wait(struct netdev *netdev)
958 if (is_tap_netdev(netdev)) {
959 /* TAP device always accepts packets. */
960 poll_immediate_wake();
964 /* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
965 * otherwise a positive errno value. */
967 netdev_linux_set_etheraddr(struct netdev *netdev_,
968 const uint8_t mac[ETH_ADDR_LEN])
970 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
971 struct netdev_saved_flags *sf = NULL;
974 if (netdev->cache_valid & VALID_ETHERADDR) {
975 if (netdev->ether_addr_error) {
976 return netdev->ether_addr_error;
978 if (eth_addr_equals(netdev->etheraddr, mac)) {
981 netdev->cache_valid &= ~VALID_ETHERADDR;
984 /* Tap devices must be brought down before setting the address. */
985 if (is_tap_netdev(netdev_)) {
986 netdev_turn_flags_off(netdev_, NETDEV_UP, &sf);
988 error = set_etheraddr(netdev_get_name(netdev_), mac);
989 if (!error || error == ENODEV) {
990 netdev->ether_addr_error = error;
991 netdev->cache_valid |= VALID_ETHERADDR;
993 memcpy(netdev->etheraddr, mac, ETH_ADDR_LEN);
997 netdev_restore_flags(sf);
1002 /* Copies 'netdev''s MAC address to 'mac' which is passed as param. */
1004 netdev_linux_get_etheraddr(const struct netdev *netdev_,
1005 uint8_t mac[ETH_ADDR_LEN])
1007 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1009 if (!(netdev->cache_valid & VALID_ETHERADDR)) {
1010 int error = get_etheraddr(netdev_get_name(netdev_),
1013 netdev->ether_addr_error = error;
1014 netdev->cache_valid |= VALID_ETHERADDR;
1017 if (!netdev->ether_addr_error) {
1018 memcpy(mac, netdev->etheraddr, ETH_ADDR_LEN);
1021 return netdev->ether_addr_error;
1024 /* Returns the maximum size of transmitted (and received) packets on 'netdev',
1025 * in bytes, not including the hardware header; thus, this is typically 1500
1026 * bytes for Ethernet devices. */
1028 netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
1030 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1031 if (!(netdev->cache_valid & VALID_MTU)) {
1035 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
1036 SIOCGIFMTU, "SIOCGIFMTU");
1038 netdev->netdev_mtu_error = error;
1039 netdev->mtu = ifr.ifr_mtu;
1040 netdev->cache_valid |= VALID_MTU;
1043 if (!netdev->netdev_mtu_error) {
1044 *mtup = netdev->mtu;
1046 return netdev->netdev_mtu_error;
1049 /* Sets the maximum size of transmitted packets (MTU) for the given device using
1050 * the Linux networking ioctl interface.
1053 netdev_linux_set_mtu(const struct netdev *netdev_, int mtu)
1055 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1059 if (netdev->cache_valid & VALID_MTU) {
1060 if (netdev->netdev_mtu_error) {
1061 return netdev->netdev_mtu_error;
1063 if (netdev->mtu == mtu) {
1066 netdev->cache_valid &= ~VALID_MTU;
1069 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
1070 SIOCSIFMTU, "SIOCSIFMTU");
1071 if (!error || error == ENODEV) {
1072 netdev->netdev_mtu_error = error;
1073 netdev->mtu = ifr.ifr_mtu;
1074 netdev->cache_valid |= VALID_MTU;
1079 /* Returns the ifindex of 'netdev', if successful, as a positive number.
1080 * On failure, returns a negative errno value. */
1082 netdev_linux_get_ifindex(const struct netdev *netdev)
1086 error = get_ifindex(netdev, &ifindex);
1087 return error ? -error : ifindex;
1091 netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
1093 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1095 if (netdev->miimon_interval > 0) {
1096 *carrier = netdev->miimon;
1098 *carrier = (netdev->ifi_flags & IFF_RUNNING) != 0;
1104 static long long int
1105 netdev_linux_get_carrier_resets(const struct netdev *netdev)
1107 return netdev_linux_cast(netdev)->carrier_resets;
1111 netdev_linux_do_miimon(const char *name, int cmd, const char *cmd_name,
1112 struct mii_ioctl_data *data)
1117 memset(&ifr, 0, sizeof ifr);
1118 memcpy(&ifr.ifr_data, data, sizeof *data);
1119 error = netdev_linux_do_ioctl(name, &ifr, cmd, cmd_name);
1120 memcpy(data, &ifr.ifr_data, sizeof *data);
1126 netdev_linux_get_miimon(const char *name, bool *miimon)
1128 struct mii_ioctl_data data;
1133 memset(&data, 0, sizeof data);
1134 error = netdev_linux_do_miimon(name, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
1136 /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
1137 data.reg_num = MII_BMSR;
1138 error = netdev_linux_do_miimon(name, SIOCGMIIREG, "SIOCGMIIREG",
1142 *miimon = !!(data.val_out & BMSR_LSTATUS);
1144 VLOG_WARN_RL(&rl, "%s: failed to query MII", name);
1147 struct ethtool_cmd ecmd;
1149 VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
1152 COVERAGE_INC(netdev_get_ethtool);
1153 memset(&ecmd, 0, sizeof ecmd);
1154 error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
1157 struct ethtool_value eval;
1159 memcpy(&eval, &ecmd, sizeof eval);
1160 *miimon = !!eval.data;
1162 VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
1170 netdev_linux_set_miimon_interval(struct netdev *netdev_,
1171 long long int interval)
1173 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1175 interval = interval > 0 ? MAX(interval, 100) : 0;
1176 if (netdev->miimon_interval != interval) {
1177 netdev->miimon_interval = interval;
1178 timer_set_expired(&netdev->miimon_timer);
1185 netdev_linux_miimon_run(void)
1187 struct shash device_shash;
1188 struct shash_node *node;
1190 shash_init(&device_shash);
1191 netdev_get_devices(&netdev_linux_class, &device_shash);
1192 SHASH_FOR_EACH (node, &device_shash) {
1193 struct netdev_linux *dev = node->data;
1196 if (dev->miimon_interval <= 0 || !timer_expired(&dev->miimon_timer)) {
1200 netdev_linux_get_miimon(dev->up.name, &miimon);
1201 if (miimon != dev->miimon) {
1202 dev->miimon = miimon;
1203 netdev_linux_changed(dev, dev->ifi_flags, 0);
1206 timer_set_duration(&dev->miimon_timer, dev->miimon_interval);
1209 shash_destroy(&device_shash);
1213 netdev_linux_miimon_wait(void)
1215 struct shash device_shash;
1216 struct shash_node *node;
1218 shash_init(&device_shash);
1219 netdev_get_devices(&netdev_linux_class, &device_shash);
1220 SHASH_FOR_EACH (node, &device_shash) {
1221 struct netdev_linux *dev = node->data;
1223 if (dev->miimon_interval > 0) {
1224 timer_wait(&dev->miimon_timer);
1227 shash_destroy(&device_shash);
1230 /* Check whether we can we use RTM_GETLINK to get network device statistics.
1231 * In pre-2.6.19 kernels, this was only available if wireless extensions were
1234 check_for_working_netlink_stats(void)
1236 /* Decide on the netdev_get_stats() implementation to use. Netlink is
1237 * preferable, so if that works, we'll use it. */
1238 int ifindex = do_get_ifindex("lo");
1240 VLOG_WARN("failed to get ifindex for lo, "
1241 "obtaining netdev stats from proc");
1244 struct netdev_stats stats;
1245 int error = get_stats_via_netlink(ifindex, &stats);
1247 VLOG_DBG("obtaining netdev stats via rtnetlink");
1250 VLOG_INFO("RTM_GETLINK failed (%s), obtaining netdev stats "
1251 "via proc (you are probably running a pre-2.6.19 "
1252 "kernel)", ovs_strerror(error));
/* Exchanges the 64-bit values stored at 'a' and 'b'. */
static void
swap_uint64(uint64_t *a, uint64_t *b)
{
    const uint64_t first = *a;
    const uint64_t second = *b;

    *a = second;
    *b = first;
}
1266 /* Copies 'src' into 'dst', performing format conversion in the process.
1268 * 'src' is allowed to be misaligned. */
1270 netdev_stats_from_ovs_vport_stats(struct netdev_stats *dst,
1271 const struct ovs_vport_stats *src)
1273 dst->rx_packets = get_unaligned_u64(&src->rx_packets);
1274 dst->tx_packets = get_unaligned_u64(&src->tx_packets);
1275 dst->rx_bytes = get_unaligned_u64(&src->rx_bytes);
1276 dst->tx_bytes = get_unaligned_u64(&src->tx_bytes);
1277 dst->rx_errors = get_unaligned_u64(&src->rx_errors);
1278 dst->tx_errors = get_unaligned_u64(&src->tx_errors);
1279 dst->rx_dropped = get_unaligned_u64(&src->rx_dropped);
1280 dst->tx_dropped = get_unaligned_u64(&src->tx_dropped);
1282 dst->collisions = 0;
1283 dst->rx_length_errors = 0;
1284 dst->rx_over_errors = 0;
1285 dst->rx_crc_errors = 0;
1286 dst->rx_frame_errors = 0;
1287 dst->rx_fifo_errors = 0;
1288 dst->rx_missed_errors = 0;
1289 dst->tx_aborted_errors = 0;
1290 dst->tx_carrier_errors = 0;
1291 dst->tx_fifo_errors = 0;
1292 dst->tx_heartbeat_errors = 0;
1293 dst->tx_window_errors = 0;
1297 get_stats_via_vport__(const struct netdev *netdev, struct netdev_stats *stats)
1299 struct dpif_linux_vport reply;
1303 error = dpif_linux_vport_get(netdev_get_name(netdev), &reply, &buf);
1306 } else if (!reply.stats) {
1311 netdev_stats_from_ovs_vport_stats(stats, reply.stats);
1319 get_stats_via_vport(const struct netdev *netdev_,
1320 struct netdev_stats *stats)
1322 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1324 if (!netdev->vport_stats_error ||
1325 !(netdev->cache_valid & VALID_VPORT_STAT_ERROR)) {
1328 error = get_stats_via_vport__(netdev_, stats);
1329 if (error && error != ENOENT) {
1330 VLOG_WARN_RL(&rl, "%s: obtaining netdev stats via vport failed "
1332 netdev_get_name(netdev_), ovs_strerror(error));
1334 netdev->vport_stats_error = error;
1335 netdev->cache_valid |= VALID_VPORT_STAT_ERROR;
1340 netdev_linux_sys_get_stats(const struct netdev *netdev_,
1341 struct netdev_stats *stats)
1343 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
1344 static int use_netlink_stats;
1347 if (ovsthread_once_start(&once)) {
1348 use_netlink_stats = check_for_working_netlink_stats();
1349 ovsthread_once_done(&once);
1352 if (use_netlink_stats) {
1355 error = get_ifindex(netdev_, &ifindex);
1357 error = get_stats_via_netlink(ifindex, stats);
1360 error = get_stats_via_proc(netdev_get_name(netdev_), stats);
1364 VLOG_WARN_RL(&rl, "%s: linux-sys get stats failed %d",
1365 netdev_get_name(netdev_), error);
1371 /* Retrieves current device stats for 'netdev-linux'. */
1373 netdev_linux_get_stats(const struct netdev *netdev_,
1374 struct netdev_stats *stats)
1376 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1377 struct netdev_stats dev_stats;
1380 get_stats_via_vport(netdev_, stats);
1382 error = netdev_linux_sys_get_stats(netdev_, &dev_stats);
1385 if (netdev->vport_stats_error) {
1392 if (netdev->vport_stats_error) {
1393 /* stats not available from OVS then use ioctl stats. */
1396 stats->rx_errors += dev_stats.rx_errors;
1397 stats->tx_errors += dev_stats.tx_errors;
1398 stats->rx_dropped += dev_stats.rx_dropped;
1399 stats->tx_dropped += dev_stats.tx_dropped;
1400 stats->multicast += dev_stats.multicast;
1401 stats->collisions += dev_stats.collisions;
1402 stats->rx_length_errors += dev_stats.rx_length_errors;
1403 stats->rx_over_errors += dev_stats.rx_over_errors;
1404 stats->rx_crc_errors += dev_stats.rx_crc_errors;
1405 stats->rx_frame_errors += dev_stats.rx_frame_errors;
1406 stats->rx_fifo_errors += dev_stats.rx_fifo_errors;
1407 stats->rx_missed_errors += dev_stats.rx_missed_errors;
1408 stats->tx_aborted_errors += dev_stats.tx_aborted_errors;
1409 stats->tx_carrier_errors += dev_stats.tx_carrier_errors;
1410 stats->tx_fifo_errors += dev_stats.tx_fifo_errors;
1411 stats->tx_heartbeat_errors += dev_stats.tx_heartbeat_errors;
1412 stats->tx_window_errors += dev_stats.tx_window_errors;
1417 /* Retrieves current device stats for 'netdev-tap' netdev or
1418 * netdev-internal. */
1420 netdev_tap_get_stats(const struct netdev *netdev_,
1421 struct netdev_stats *stats)
1423 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1424 struct netdev_stats dev_stats;
1427 get_stats_via_vport(netdev_, stats);
1429 error = netdev_linux_sys_get_stats(netdev_, &dev_stats);
1431 if (netdev->vport_stats_error) {
1438 /* If this port is an internal port then the transmit and receive stats
1439 * will appear to be swapped relative to the other ports since we are the
1440 * one sending the data, not a remote computer. For consistency, we swap
1441 * them back here. This does not apply if we are getting stats from the
1442 * vport layer because it always tracks stats from the perspective of the
1444 if (netdev->vport_stats_error) {
1446 swap_uint64(&stats->rx_packets, &stats->tx_packets);
1447 swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
1448 swap_uint64(&stats->rx_errors, &stats->tx_errors);
1449 swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
1450 stats->rx_length_errors = 0;
1451 stats->rx_over_errors = 0;
1452 stats->rx_crc_errors = 0;
1453 stats->rx_frame_errors = 0;
1454 stats->rx_fifo_errors = 0;
1455 stats->rx_missed_errors = 0;
1456 stats->tx_aborted_errors = 0;
1457 stats->tx_carrier_errors = 0;
1458 stats->tx_fifo_errors = 0;
1459 stats->tx_heartbeat_errors = 0;
1460 stats->tx_window_errors = 0;
1462 stats->rx_dropped += dev_stats.tx_dropped;
1463 stats->tx_dropped += dev_stats.rx_dropped;
1465 stats->rx_errors += dev_stats.tx_errors;
1466 stats->tx_errors += dev_stats.rx_errors;
1468 stats->multicast += dev_stats.multicast;
1469 stats->collisions += dev_stats.collisions;
1475 netdev_internal_get_stats(const struct netdev *netdev_,
1476 struct netdev_stats *stats)
1478 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1480 get_stats_via_vport(netdev_, stats);
1481 return netdev->vport_stats_error;
1485 netdev_internal_set_stats(struct netdev *netdev,
1486 const struct netdev_stats *stats)
1488 struct ovs_vport_stats vport_stats;
1489 struct dpif_linux_vport vport;
1492 vport_stats.rx_packets = stats->rx_packets;
1493 vport_stats.tx_packets = stats->tx_packets;
1494 vport_stats.rx_bytes = stats->rx_bytes;
1495 vport_stats.tx_bytes = stats->tx_bytes;
1496 vport_stats.rx_errors = stats->rx_errors;
1497 vport_stats.tx_errors = stats->tx_errors;
1498 vport_stats.rx_dropped = stats->rx_dropped;
1499 vport_stats.tx_dropped = stats->tx_dropped;
1501 dpif_linux_vport_init(&vport);
1502 vport.cmd = OVS_VPORT_CMD_SET;
1503 vport.name = netdev_get_name(netdev);
1504 vport.stats = &vport_stats;
1506 err = dpif_linux_vport_transact(&vport, NULL, NULL);
1508 /* If the vport layer doesn't know about the device, that doesn't mean it
1509 * doesn't exist (after all were able to open it when netdev_open() was
1510 * called), it just means that it isn't attached and we'll be getting
1511 * stats a different way. */
1512 if (err == ENODEV) {
1520 netdev_linux_read_features(struct netdev_linux *netdev)
1522 struct ethtool_cmd ecmd;
1526 if (netdev->cache_valid & VALID_FEATURES) {
1530 COVERAGE_INC(netdev_get_ethtool);
1531 memset(&ecmd, 0, sizeof ecmd);
1532 error = netdev_linux_do_ethtool(netdev->up.name, &ecmd,
1533 ETHTOOL_GSET, "ETHTOOL_GSET");
1538 /* Supported features. */
1539 netdev->supported = 0;
1540 if (ecmd.supported & SUPPORTED_10baseT_Half) {
1541 netdev->supported |= NETDEV_F_10MB_HD;
1543 if (ecmd.supported & SUPPORTED_10baseT_Full) {
1544 netdev->supported |= NETDEV_F_10MB_FD;
1546 if (ecmd.supported & SUPPORTED_100baseT_Half) {
1547 netdev->supported |= NETDEV_F_100MB_HD;
1549 if (ecmd.supported & SUPPORTED_100baseT_Full) {
1550 netdev->supported |= NETDEV_F_100MB_FD;
1552 if (ecmd.supported & SUPPORTED_1000baseT_Half) {
1553 netdev->supported |= NETDEV_F_1GB_HD;
1555 if (ecmd.supported & SUPPORTED_1000baseT_Full) {
1556 netdev->supported |= NETDEV_F_1GB_FD;
1558 if (ecmd.supported & SUPPORTED_10000baseT_Full) {
1559 netdev->supported |= NETDEV_F_10GB_FD;
1561 if (ecmd.supported & SUPPORTED_TP) {
1562 netdev->supported |= NETDEV_F_COPPER;
1564 if (ecmd.supported & SUPPORTED_FIBRE) {
1565 netdev->supported |= NETDEV_F_FIBER;
1567 if (ecmd.supported & SUPPORTED_Autoneg) {
1568 netdev->supported |= NETDEV_F_AUTONEG;
1570 if (ecmd.supported & SUPPORTED_Pause) {
1571 netdev->supported |= NETDEV_F_PAUSE;
1573 if (ecmd.supported & SUPPORTED_Asym_Pause) {
1574 netdev->supported |= NETDEV_F_PAUSE_ASYM;
1577 /* Advertised features. */
1578 netdev->advertised = 0;
1579 if (ecmd.advertising & ADVERTISED_10baseT_Half) {
1580 netdev->advertised |= NETDEV_F_10MB_HD;
1582 if (ecmd.advertising & ADVERTISED_10baseT_Full) {
1583 netdev->advertised |= NETDEV_F_10MB_FD;
1585 if (ecmd.advertising & ADVERTISED_100baseT_Half) {
1586 netdev->advertised |= NETDEV_F_100MB_HD;
1588 if (ecmd.advertising & ADVERTISED_100baseT_Full) {
1589 netdev->advertised |= NETDEV_F_100MB_FD;
1591 if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
1592 netdev->advertised |= NETDEV_F_1GB_HD;
1594 if (ecmd.advertising & ADVERTISED_1000baseT_Full) {
1595 netdev->advertised |= NETDEV_F_1GB_FD;
1597 if (ecmd.advertising & ADVERTISED_10000baseT_Full) {
1598 netdev->advertised |= NETDEV_F_10GB_FD;
1600 if (ecmd.advertising & ADVERTISED_TP) {
1601 netdev->advertised |= NETDEV_F_COPPER;
1603 if (ecmd.advertising & ADVERTISED_FIBRE) {
1604 netdev->advertised |= NETDEV_F_FIBER;
1606 if (ecmd.advertising & ADVERTISED_Autoneg) {
1607 netdev->advertised |= NETDEV_F_AUTONEG;
1609 if (ecmd.advertising & ADVERTISED_Pause) {
1610 netdev->advertised |= NETDEV_F_PAUSE;
1612 if (ecmd.advertising & ADVERTISED_Asym_Pause) {
1613 netdev->advertised |= NETDEV_F_PAUSE_ASYM;
1616 /* Current settings. */
1618 if (speed == SPEED_10) {
1619 netdev->current = ecmd.duplex ? NETDEV_F_10MB_FD : NETDEV_F_10MB_HD;
1620 } else if (speed == SPEED_100) {
1621 netdev->current = ecmd.duplex ? NETDEV_F_100MB_FD : NETDEV_F_100MB_HD;
1622 } else if (speed == SPEED_1000) {
1623 netdev->current = ecmd.duplex ? NETDEV_F_1GB_FD : NETDEV_F_1GB_HD;
1624 } else if (speed == SPEED_10000) {
1625 netdev->current = NETDEV_F_10GB_FD;
1626 } else if (speed == 40000) {
1627 netdev->current = NETDEV_F_40GB_FD;
1628 } else if (speed == 100000) {
1629 netdev->current = NETDEV_F_100GB_FD;
1630 } else if (speed == 1000000) {
1631 netdev->current = NETDEV_F_1TB_FD;
1633 netdev->current = 0;
1636 if (ecmd.port == PORT_TP) {
1637 netdev->current |= NETDEV_F_COPPER;
1638 } else if (ecmd.port == PORT_FIBRE) {
1639 netdev->current |= NETDEV_F_FIBER;
1643 netdev->current |= NETDEV_F_AUTONEG;
1646 /* Peer advertisements. */
1647 netdev->peer = 0; /* XXX */
1650 netdev->cache_valid |= VALID_FEATURES;
1651 netdev->get_features_error = error;
1654 /* Stores the features supported by 'netdev' into each of '*current',
1655 * '*advertised', '*supported', and '*peer' that are non-null. Each value is a
1656 * bitmap of NETDEV_* bits. Returns 0 if successful, otherwise a positive
1659 netdev_linux_get_features(const struct netdev *netdev_,
1660 enum netdev_features *current,
1661 enum netdev_features *advertised,
1662 enum netdev_features *supported,
1663 enum netdev_features *peer)
1665 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1667 netdev_linux_read_features(netdev);
1669 if (!netdev->get_features_error) {
1670 *current = netdev->current;
1671 *advertised = netdev->advertised;
1672 *supported = netdev->supported;
1673 *peer = netdev->peer;
1675 return netdev->get_features_error;
1678 /* Set the features advertised by 'netdev' to 'advertise'. */
1680 netdev_linux_set_advertisements(struct netdev *netdev,
1681 enum netdev_features advertise)
1683 struct ethtool_cmd ecmd;
1686 COVERAGE_INC(netdev_get_ethtool);
1687 memset(&ecmd, 0, sizeof ecmd);
1688 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1689 ETHTOOL_GSET, "ETHTOOL_GSET");
1694 ecmd.advertising = 0;
1695 if (advertise & NETDEV_F_10MB_HD) {
1696 ecmd.advertising |= ADVERTISED_10baseT_Half;
1698 if (advertise & NETDEV_F_10MB_FD) {
1699 ecmd.advertising |= ADVERTISED_10baseT_Full;
1701 if (advertise & NETDEV_F_100MB_HD) {
1702 ecmd.advertising |= ADVERTISED_100baseT_Half;
1704 if (advertise & NETDEV_F_100MB_FD) {
1705 ecmd.advertising |= ADVERTISED_100baseT_Full;
1707 if (advertise & NETDEV_F_1GB_HD) {
1708 ecmd.advertising |= ADVERTISED_1000baseT_Half;
1710 if (advertise & NETDEV_F_1GB_FD) {
1711 ecmd.advertising |= ADVERTISED_1000baseT_Full;
1713 if (advertise & NETDEV_F_10GB_FD) {
1714 ecmd.advertising |= ADVERTISED_10000baseT_Full;
1716 if (advertise & NETDEV_F_COPPER) {
1717 ecmd.advertising |= ADVERTISED_TP;
1719 if (advertise & NETDEV_F_FIBER) {
1720 ecmd.advertising |= ADVERTISED_FIBRE;
1722 if (advertise & NETDEV_F_AUTONEG) {
1723 ecmd.advertising |= ADVERTISED_Autoneg;
1725 if (advertise & NETDEV_F_PAUSE) {
1726 ecmd.advertising |= ADVERTISED_Pause;
1728 if (advertise & NETDEV_F_PAUSE_ASYM) {
1729 ecmd.advertising |= ADVERTISED_Asym_Pause;
1731 COVERAGE_INC(netdev_set_ethtool);
1732 return netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1733 ETHTOOL_SSET, "ETHTOOL_SSET");
1736 /* Attempts to set input rate limiting (policing) policy. Returns 0 if
1737 * successful, otherwise a positive errno value. */
1739 netdev_linux_set_policing(struct netdev *netdev_,
1740 uint32_t kbits_rate, uint32_t kbits_burst)
1742 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1743 const char *netdev_name = netdev_get_name(netdev_);
1747 kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
1748 : !kbits_burst ? 1000 /* Default to 1000 kbits if 0. */
1749 : kbits_burst); /* Stick with user-specified value. */
1751 if (netdev->cache_valid & VALID_POLICING) {
1752 if (netdev->netdev_policing_error) {
1753 return netdev->netdev_policing_error;
1756 if (netdev->kbits_rate == kbits_rate &&
1757 netdev->kbits_burst == kbits_burst) {
1758 /* Assume that settings haven't changed since we last set them. */
1761 netdev->cache_valid &= ~VALID_POLICING;
1764 COVERAGE_INC(netdev_set_policing);
1765 /* Remove any existing ingress qdisc. */
1766 error = tc_add_del_ingress_qdisc(netdev_, false);
1768 VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
1769 netdev_name, ovs_strerror(error));
1774 error = tc_add_del_ingress_qdisc(netdev_, true);
1776 VLOG_WARN_RL(&rl, "%s: adding policing qdisc failed: %s",
1777 netdev_name, ovs_strerror(error));
1781 error = tc_add_policer(netdev_, kbits_rate, kbits_burst);
1783 VLOG_WARN_RL(&rl, "%s: adding policing action failed: %s",
1784 netdev_name, ovs_strerror(error));
1789 netdev->kbits_rate = kbits_rate;
1790 netdev->kbits_burst = kbits_burst;
1793 if (!error || error == ENODEV) {
1794 netdev->netdev_policing_error = error;
1795 netdev->cache_valid |= VALID_POLICING;
1801 netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
1804 const struct tc_ops *const *opsp;
1806 for (opsp = tcs; *opsp != NULL; opsp++) {
1807 const struct tc_ops *ops = *opsp;
1808 if (ops->tc_install && ops->ovs_name[0] != '\0') {
1809 sset_add(types, ops->ovs_name);
1815 static const struct tc_ops *
1816 tc_lookup_ovs_name(const char *name)
1818 const struct tc_ops *const *opsp;
1820 for (opsp = tcs; *opsp != NULL; opsp++) {
1821 const struct tc_ops *ops = *opsp;
1822 if (!strcmp(name, ops->ovs_name)) {
1829 static const struct tc_ops *
1830 tc_lookup_linux_name(const char *name)
1832 const struct tc_ops *const *opsp;
1834 for (opsp = tcs; *opsp != NULL; opsp++) {
1835 const struct tc_ops *ops = *opsp;
1836 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
1843 static struct tc_queue *
1844 tc_find_queue__(const struct netdev *netdev_, unsigned int queue_id,
1847 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1848 struct tc_queue *queue;
1850 HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev->tc->queues) {
1851 if (queue->queue_id == queue_id) {
/* Returns the queue of 'netdev' whose ID is 'queue_id', or NULL if there is
 * none.  The queue map is keyed on hash_int(queue_id, 0). */
static struct tc_queue *
tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
{
    const size_t hash = hash_int(queue_id, 0);

    return tc_find_queue__(netdev, queue_id, hash);
}
1865 netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
1867 struct netdev_qos_capabilities *caps)
1869 const struct tc_ops *ops = tc_lookup_ovs_name(type);
1873 caps->n_queues = ops->n_queues;
1878 netdev_linux_get_qos(const struct netdev *netdev_,
1879 const char **typep, struct smap *details)
1881 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1884 error = tc_query_qdisc(netdev_);
1889 *typep = netdev->tc->ops->ovs_name;
1890 return (netdev->tc->ops->qdisc_get
1891 ? netdev->tc->ops->qdisc_get(netdev_, details)
1896 netdev_linux_set_qos(struct netdev *netdev_,
1897 const char *type, const struct smap *details)
1899 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1900 const struct tc_ops *new_ops;
1903 new_ops = tc_lookup_ovs_name(type);
1904 if (!new_ops || !new_ops->tc_install) {
1908 error = tc_query_qdisc(netdev_);
1913 if (new_ops == netdev->tc->ops) {
1914 return new_ops->qdisc_set ? new_ops->qdisc_set(netdev_, details) : 0;
1916 /* Delete existing qdisc. */
1917 error = tc_del_qdisc(netdev_);
1921 ovs_assert(netdev->tc == NULL);
1923 /* Install new qdisc. */
1924 error = new_ops->tc_install(netdev_, details);
1925 ovs_assert((error == 0) == (netdev->tc != NULL));
1932 netdev_linux_get_queue(const struct netdev *netdev_,
1933 unsigned int queue_id, struct smap *details)
1935 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1938 error = tc_query_qdisc(netdev_);
1942 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
1944 ? netdev->tc->ops->class_get(netdev_, queue, details)
1950 netdev_linux_set_queue(struct netdev *netdev_,
1951 unsigned int queue_id, const struct smap *details)
1953 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1956 error = tc_query_qdisc(netdev_);
1959 } else if (queue_id >= netdev->tc->ops->n_queues
1960 || !netdev->tc->ops->class_set) {
1964 return netdev->tc->ops->class_set(netdev_, queue_id, details);
1968 netdev_linux_delete_queue(struct netdev *netdev_, unsigned int queue_id)
1970 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1973 error = tc_query_qdisc(netdev_);
1976 } else if (!netdev->tc->ops->class_delete) {
1979 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
1981 ? netdev->tc->ops->class_delete(netdev_, queue)
1987 netdev_linux_get_queue_stats(const struct netdev *netdev_,
1988 unsigned int queue_id,
1989 struct netdev_queue_stats *stats)
1991 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1994 error = tc_query_qdisc(netdev_);
1997 } else if (!netdev->tc->ops->class_get_stats) {
2000 const struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2002 ? netdev->tc->ops->class_get_stats(netdev_, queue, stats)
2008 start_queue_dump(const struct netdev *netdev, struct nl_dump *dump)
2010 struct ofpbuf request;
2011 struct tcmsg *tcmsg;
2013 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
2017 tcmsg->tcm_parent = 0;
2018 nl_dump_start(dump, NETLINK_ROUTE, &request);
2019 ofpbuf_uninit(&request);
2024 netdev_linux_dump_queues(const struct netdev *netdev_,
2025 netdev_dump_queues_cb *cb, void *aux)
2027 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2028 struct tc_queue *queue, *next_queue;
2029 struct smap details;
2033 error = tc_query_qdisc(netdev_);
2036 } else if (!netdev->tc->ops->class_get) {
2041 smap_init(&details);
2042 HMAP_FOR_EACH_SAFE (queue, next_queue, hmap_node,
2043 &netdev->tc->queues) {
2044 smap_clear(&details);
2046 error = netdev->tc->ops->class_get(netdev_, queue, &details);
2048 (*cb)(queue->queue_id, &details, aux);
2053 smap_destroy(&details);
2059 netdev_linux_dump_queue_stats(const struct netdev *netdev_,
2060 netdev_dump_queue_stats_cb *cb, void *aux)
2062 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2063 struct nl_dump dump;
2068 error = tc_query_qdisc(netdev_);
2071 } else if (!netdev->tc->ops->class_dump_stats) {
2076 if (!start_queue_dump(netdev_, &dump)) {
2079 while (nl_dump_next(&dump, &msg)) {
2080 error = netdev->tc->ops->class_dump_stats(netdev_, &msg, cb, aux);
2086 error = nl_dump_done(&dump);
2087 return error ? error : last_error;
2091 netdev_linux_get_in4(const struct netdev *netdev_,
2092 struct in_addr *address, struct in_addr *netmask)
2094 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2096 if (!(netdev->cache_valid & VALID_IN4)) {
2099 error = netdev_linux_get_ipv4(netdev_, &netdev->address,
2100 SIOCGIFADDR, "SIOCGIFADDR");
2105 error = netdev_linux_get_ipv4(netdev_, &netdev->netmask,
2106 SIOCGIFNETMASK, "SIOCGIFNETMASK");
2111 netdev->cache_valid |= VALID_IN4;
2113 *address = netdev->address;
2114 *netmask = netdev->netmask;
2115 return address->s_addr == INADDR_ANY ? EADDRNOTAVAIL : 0;
2119 netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
2120 struct in_addr netmask)
2122 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2125 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
2127 netdev->cache_valid |= VALID_IN4;
2128 netdev->address = address;
2129 netdev->netmask = netmask;
2130 if (address.s_addr != INADDR_ANY) {
2131 error = do_set_addr(netdev_, SIOCSIFNETMASK,
2132 "SIOCSIFNETMASK", netmask);
/* Parses 'line', one line in the format of /proc/net/if_inet6: 32 hex digits
 * of IPv6 address, four hex fields that are skipped, and an interface name
 * of at most 16 characters.  Stores the address in '*in6' and the name in
 * 'ifname'.
 *
 * Returns true only if all 16 address bytes and the name were parsed. */
static bool
parse_if_inet6_line(const char *line,
                    struct in6_addr *in6, char ifname[16 + 1])
{
    uint8_t *s6 = in6->s6_addr;
    int n_fields;

#define X8 "%2"SCNx8
    n_fields = sscanf(line,
                      " "X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8
                      "%*x %*x %*x %*x %16s\n",
                      &s6[0], &s6[1], &s6[2], &s6[3],
                      &s6[4], &s6[5], &s6[6], &s6[7],
                      &s6[8], &s6[9], &s6[10], &s6[11],
                      &s6[12], &s6[13], &s6[14], &s6[15],
                      ifname);

    /* 16 address bytes plus the interface name. */
    return n_fields == 17;
}
2154 /* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address (if
2155 * 'in6' is non-null) and returns true. Otherwise, returns false. */
2157 netdev_linux_get_in6(const struct netdev *netdev_, struct in6_addr *in6)
2159 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2160 if (!(netdev->cache_valid & VALID_IN6)) {
2164 netdev->in6 = in6addr_any;
2166 file = fopen("/proc/net/if_inet6", "r");
2168 const char *name = netdev_get_name(netdev_);
2169 while (fgets(line, sizeof line, file)) {
2170 struct in6_addr in6_tmp;
2171 char ifname[16 + 1];
2172 if (parse_if_inet6_line(line, &in6_tmp, ifname)
2173 && !strcmp(name, ifname))
2175 netdev->in6 = in6_tmp;
2181 netdev->cache_valid |= VALID_IN6;
/* Fills '*sa' with an AF_INET socket address that holds 'addr' and port 0.
 * All other bytes of '*sa' are zeroed. */
static void
make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
{
    struct sockaddr_in in4;

    memset(&in4, 0, sizeof in4);
    in4.sin_family = AF_INET;
    in4.sin_port = 0;
    in4.sin_addr = addr;

    memset(sa, 0, sizeof *sa);
    memcpy(sa, &in4, sizeof in4);
}
2201 do_set_addr(struct netdev *netdev,
2202 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
2205 ovs_strzcpy(ifr.ifr_name, netdev_get_name(netdev), sizeof ifr.ifr_name);
2206 make_in4_sockaddr(&ifr.ifr_addr, addr);
2208 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
2212 /* Adds 'router' as a default IP gateway. */
2214 netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
2216 struct in_addr any = { INADDR_ANY };
2220 memset(&rt, 0, sizeof rt);
2221 make_in4_sockaddr(&rt.rt_dst, any);
2222 make_in4_sockaddr(&rt.rt_gateway, router);
2223 make_in4_sockaddr(&rt.rt_genmask, any);
2224 rt.rt_flags = RTF_UP | RTF_GATEWAY;
2225 error = ioctl(af_inet_sock, SIOCADDRT, &rt) < 0 ? errno : 0;
2227 VLOG_WARN("ioctl(SIOCADDRT): %s", ovs_strerror(error));
2233 netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
2236 static const char fn[] = "/proc/net/route";
2241 *netdev_name = NULL;
2242 stream = fopen(fn, "r");
2243 if (stream == NULL) {
2244 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, ovs_strerror(errno));
2249 while (fgets(line, sizeof line, stream)) {
2252 ovs_be32 dest, gateway, mask;
2253 int refcnt, metric, mtu;
2254 unsigned int flags, use, window, irtt;
2257 "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
2259 iface, &dest, &gateway, &flags, &refcnt,
2260 &use, &metric, &mask, &mtu, &window, &irtt) != 11) {
2262 VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
2266 if (!(flags & RTF_UP)) {
2267 /* Skip routes that aren't up. */
2271 /* The output of 'dest', 'mask', and 'gateway' were given in
2272 * network byte order, so we don't need need any endian
2273 * conversions here. */
2274 if ((dest & mask) == (host->s_addr & mask)) {
2276 /* The host is directly reachable. */
2277 next_hop->s_addr = 0;
2279 /* To reach the host, we must go through a gateway. */
2280 next_hop->s_addr = gateway;
2282 *netdev_name = xstrdup(iface);
2294 netdev_linux_get_status(const struct netdev *netdev_, struct smap *smap)
2296 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2299 if (!(netdev->cache_valid & VALID_DRVINFO)) {
2300 struct ethtool_cmd *cmd = (struct ethtool_cmd *) &netdev->drvinfo;
2302 COVERAGE_INC(netdev_get_ethtool);
2303 memset(&netdev->drvinfo, 0, sizeof netdev->drvinfo);
2304 error = netdev_linux_do_ethtool(netdev->up.name,
2307 "ETHTOOL_GDRVINFO");
2309 netdev->cache_valid |= VALID_DRVINFO;
2314 smap_add(smap, "driver_name", netdev->drvinfo.driver);
2315 smap_add(smap, "driver_version", netdev->drvinfo.version);
2316 smap_add(smap, "firmware_version", netdev->drvinfo.fw_version);
2322 netdev_internal_get_status(const struct netdev *netdev OVS_UNUSED,
2325 smap_add(smap, "driver_name", "openvswitch");
2329 /* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
2330 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
2331 * returns 0. Otherwise, it returns a positive errno value; in particular,
2332 * ENXIO indicates that there is not ARP table entry for 'ip' on 'netdev'. */
2334 netdev_linux_arp_lookup(const struct netdev *netdev,
2335 ovs_be32 ip, uint8_t mac[ETH_ADDR_LEN])
2338 struct sockaddr_in sin;
2341 memset(&r, 0, sizeof r);
2342 memset(&sin, 0, sizeof sin);
2343 sin.sin_family = AF_INET;
2344 sin.sin_addr.s_addr = ip;
2346 memcpy(&r.arp_pa, &sin, sizeof sin);
2347 r.arp_ha.sa_family = ARPHRD_ETHER;
2349 ovs_strzcpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
2350 COVERAGE_INC(netdev_arp_lookup);
2351 retval = ioctl(af_inet_sock, SIOCGARP, &r) < 0 ? errno : 0;
2353 memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
2354 } else if (retval != ENXIO) {
2355 VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
2356 netdev_get_name(netdev), IP_ARGS(ip),
2357 ovs_strerror(retval));
2363 nd_to_iff_flags(enum netdev_flags nd)
2366 if (nd & NETDEV_UP) {
2369 if (nd & NETDEV_PROMISC) {
2376 iff_to_nd_flags(int iff)
2378 enum netdev_flags nd = 0;
2382 if (iff & IFF_PROMISC) {
2383 nd |= NETDEV_PROMISC;
2389 netdev_linux_update_flags(struct netdev *netdev_, enum netdev_flags off,
2390 enum netdev_flags on, enum netdev_flags *old_flagsp)
2392 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2393 int old_flags, new_flags;
2396 old_flags = netdev->ifi_flags;
2397 *old_flagsp = iff_to_nd_flags(old_flags);
2398 new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
2399 if (new_flags != old_flags) {
2400 error = set_flags(netdev_get_name(netdev_), new_flags);
2401 get_flags(netdev_, &netdev->ifi_flags);
2407 netdev_linux_change_seq(const struct netdev *netdev)
2409 return netdev_linux_cast(netdev)->change_seq;
/* Expands to a positional initializer for a 'struct netdev_class'.
 *
 * The netdev types defined in this file share every callback except the six
 * that are macro parameters: the type name (NAME), the create function
 * (CREATE), the statistics getter and setter (GET_STATS, SET_STATS), the
 * features getter (GET_FEATURES) and the status getter (GET_STATUS).
 *
 * The initializer is positional, so the order of the entries below must
 * match the member order of 'struct netdev_class'. */
#define NETDEV_LINUX_CLASS(NAME, CREATE, GET_STATS, SET_STATS,  \
                           GET_FEATURES, GET_STATUS)            \
{                                                               \
    NAME,                                                       \
                                                                \
    netdev_linux_init,                                          \
    netdev_linux_run,                                           \
    netdev_linux_wait,                                          \
                                                                \
    CREATE,                                                     \
    netdev_linux_destroy,                                       \
    NULL,                       /* get_config */                \
    NULL,                       /* set_config */                \
    NULL,                       /* get_tunnel_config */         \
                                                                \
    netdev_linux_rx_open,                                       \
                                                                \
    netdev_linux_send,                                          \
    netdev_linux_send_wait,                                     \
                                                                \
    netdev_linux_set_etheraddr,                                 \
    netdev_linux_get_etheraddr,                                 \
    netdev_linux_get_mtu,                                       \
    netdev_linux_set_mtu,                                       \
    netdev_linux_get_ifindex,                                   \
    netdev_linux_get_carrier,                                   \
    netdev_linux_get_carrier_resets,                            \
    netdev_linux_set_miimon_interval,                           \
    GET_STATS,                                                  \
    SET_STATS,                                                  \
                                                                \
    GET_FEATURES,                                               \
    netdev_linux_set_advertisements,                            \
                                                                \
    netdev_linux_set_policing,                                  \
    netdev_linux_get_qos_types,                                 \
    netdev_linux_get_qos_capabilities,                          \
    netdev_linux_get_qos,                                       \
    netdev_linux_set_qos,                                       \
    netdev_linux_get_queue,                                     \
    netdev_linux_set_queue,                                     \
    netdev_linux_delete_queue,                                  \
    netdev_linux_get_queue_stats,                               \
    netdev_linux_dump_queues,                                   \
    netdev_linux_dump_queue_stats,                              \
                                                                \
    netdev_linux_get_in4,                                       \
    netdev_linux_set_in4,                                       \
    netdev_linux_get_in6,                                       \
    netdev_linux_add_router,                                    \
    netdev_linux_get_next_hop,                                  \
    GET_STATUS,                                                 \
    netdev_linux_arp_lookup,                                    \
                                                                \
    netdev_linux_update_flags,                                  \
                                                                \
    netdev_linux_change_seq                                     \
}
/* The "system" netdev class: statistics come from netdev_linux_get_stats(),
 * statistics cannot be set, and features and status are read with ethtool. */
const struct netdev_class netdev_linux_class =
    NETDEV_LINUX_CLASS(
        "system",
        netdev_linux_create,
        netdev_linux_get_stats,
        NULL,                    /* set_stats */
        netdev_linux_get_features,
        netdev_linux_get_status);
/* The "tap" netdev class: like the "system" class, but devices are created
 * with netdev_linux_create_tap() and statistics come from
 * netdev_tap_get_stats(). */
const struct netdev_class netdev_tap_class =
    NETDEV_LINUX_CLASS(
        "tap",
        netdev_linux_create_tap,
        netdev_tap_get_stats,
        NULL,                   /* set_stats */
        netdev_linux_get_features,
        netdev_linux_get_status);
/* The "internal" netdev class: statistics are read from and written to the
 * vport layer, features are not available, and the status only reports the
 * "openvswitch" driver name. */
const struct netdev_class netdev_internal_class =
    NETDEV_LINUX_CLASS(
        "internal",
        netdev_linux_create,
        netdev_internal_get_stats,
        netdev_internal_set_stats,
        NULL,                  /* get_features */
        netdev_internal_get_status);
/* Receive-side callbacks shared by the netdev classes in this file.  The
 * initializer is positional, so the entries must stay in the member order of
 * 'struct netdev_rx_class'. */
static const struct netdev_rx_class netdev_rx_linux_class = {
    netdev_rx_linux_destroy,
    netdev_rx_linux_recv,
    netdev_rx_linux_wait,
    netdev_rx_linux_drain,
};
/* HTB traffic control class. */

/* NOTE(review): presumably the number of queue IDs that the HTB
 * implementation offers -- confirm against its use in 'tc_ops_htb'. */
#define HTB_N_QUEUES 0xf000
/* Per-netdev HTB state.  It embeds the generic 'struct tc' that
 * 'netdev->tc' points to, so that htb_get__() can recover the enclosing
 * structure with CONTAINER_OF. */
struct htb {
    struct tc tc;
    unsigned int max_rate;      /* In bytes/s. */
};
/* Configuration of one HTB class.  htb_setup_class__() translates these
 * fields into the rate, ceil, buffer, cbuffer and prio of a 'struct
 * tc_htb_opt'. */
struct htb_class {
    struct tc_queue tc_queue;
    unsigned int min_rate;      /* In bytes/s. */
    unsigned int max_rate;      /* In bytes/s. */
    unsigned int burst;         /* In bytes. */
    unsigned int priority;      /* Lower values are higher priorities. */
};
2523 htb_get__(const struct netdev *netdev_)
2525 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2526 return CONTAINER_OF(netdev->tc, struct htb, tc);
2530 htb_install__(struct netdev *netdev_, uint64_t max_rate)
2532 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2535 htb = xmalloc(sizeof *htb);
2536 tc_init(&htb->tc, &tc_ops_htb);
2537 htb->max_rate = max_rate;
2539 netdev->tc = &htb->tc;
2542 /* Create an HTB qdisc.
2544 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
2546 htb_setup_qdisc__(struct netdev *netdev)
2549 struct tc_htb_glob opt;
2550 struct ofpbuf request;
2551 struct tcmsg *tcmsg;
2553 tc_del_qdisc(netdev);
2555 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2556 NLM_F_EXCL | NLM_F_CREATE, &request);
2560 tcmsg->tcm_handle = tc_make_handle(1, 0);
2561 tcmsg->tcm_parent = TC_H_ROOT;
2563 nl_msg_put_string(&request, TCA_KIND, "htb");
2565 memset(&opt, 0, sizeof opt);
2566 opt.rate2quantum = 10;
2570 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2571 nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
2572 nl_msg_end_nested(&request, opt_offset);
2574 return tc_transact(&request, NULL);
2577 /* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
2578 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
2580 htb_setup_class__(struct netdev *netdev, unsigned int handle,
2581 unsigned int parent, struct htb_class *class)
2584 struct tc_htb_opt opt;
2585 struct ofpbuf request;
2586 struct tcmsg *tcmsg;
2590 error = netdev_get_mtu(netdev, &mtu);
2592 VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU",
2593 netdev_get_name(netdev));
2597 memset(&opt, 0, sizeof opt);
2598 tc_fill_rate(&opt.rate, class->min_rate, mtu);
2599 tc_fill_rate(&opt.ceil, class->max_rate, mtu);
2600 opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
2601 opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
2602 opt.prio = class->priority;
2604 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
2608 tcmsg->tcm_handle = handle;
2609 tcmsg->tcm_parent = parent;
2611 nl_msg_put_string(&request, TCA_KIND, "htb");
2612 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2613 nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
2614 tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
2615 tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
2616 nl_msg_end_nested(&request, opt_offset);
2618 error = tc_transact(&request, NULL);
2620 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
2621 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
2622 netdev_get_name(netdev),
2623 tc_get_major(handle), tc_get_minor(handle),
2624 tc_get_major(parent), tc_get_minor(parent),
2625 class->min_rate, class->max_rate,
2626 class->burst, class->priority, ovs_strerror(error));
2631 /* Parses Netlink attributes in 'options' for HTB parameters and stores a
2632 * description of them into 'details'. The description complies with the
2633 * specification given in the vswitch database documentation for linux-htb
2636 htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
2638 static const struct nl_policy tca_htb_policy[] = {
2639 [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
2640 .min_len = sizeof(struct tc_htb_opt) },
2643 struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
2644 const struct tc_htb_opt *htb;
2646 if (!nl_parse_nested(nl_options, tca_htb_policy,
2647 attrs, ARRAY_SIZE(tca_htb_policy))) {
2648 VLOG_WARN_RL(&rl, "failed to parse HTB class options");
2652 htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
2653 class->min_rate = htb->rate.rate;
2654 class->max_rate = htb->ceil.rate;
2655 class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
2656 class->priority = htb->prio;
2661 htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2662 struct htb_class *options,
2663 struct netdev_queue_stats *stats)
2665 struct nlattr *nl_options;
2666 unsigned int handle;
2669 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2670 if (!error && queue_id) {
2671 unsigned int major = tc_get_major(handle);
2672 unsigned int minor = tc_get_minor(handle);
2673 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2674 *queue_id = minor - 1;
2679 if (!error && options) {
2680 error = htb_parse_tca_options__(nl_options, options);
2686 htb_parse_qdisc_details__(struct netdev *netdev,
2687 const struct smap *details, struct htb_class *hc)
2689 const char *max_rate_s;
2691 max_rate_s = smap_get(details, "max-rate");
2692 hc->max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
2693 if (!hc->max_rate) {
2694 enum netdev_features current;
2696 netdev_get_features(netdev, ¤t, NULL, NULL, NULL);
2697 hc->max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
2699 hc->min_rate = hc->max_rate;
2705 htb_parse_class_details__(struct netdev *netdev,
2706 const struct smap *details, struct htb_class *hc)
2708 const struct htb *htb = htb_get__(netdev);
2709 const char *min_rate_s = smap_get(details, "min-rate");
2710 const char *max_rate_s = smap_get(details, "max-rate");
2711 const char *burst_s = smap_get(details, "burst");
2712 const char *priority_s = smap_get(details, "priority");
2715 error = netdev_get_mtu(netdev, &mtu);
2717 VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU",
2718 netdev_get_name(netdev));
2722 /* HTB requires at least an mtu sized min-rate to send any traffic even
2723 * on uncongested links. */
2724 hc->min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
2725 hc->min_rate = MAX(hc->min_rate, mtu);
2726 hc->min_rate = MIN(hc->min_rate, htb->max_rate);
2729 hc->max_rate = (max_rate_s
2730 ? strtoull(max_rate_s, NULL, 10) / 8
2732 hc->max_rate = MAX(hc->max_rate, hc->min_rate);
2733 hc->max_rate = MIN(hc->max_rate, htb->max_rate);
2737 * According to hints in the documentation that I've read, it is important
2738 * that 'burst' be at least as big as the largest frame that might be
2739 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
2740 * but having it a bit too small is a problem. Since netdev_get_mtu()
2741 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
2742 * the MTU. We actually add 64, instead of 14, as a guard against
2743 * additional headers getting tacked on somewhere that we're not aware of. */
2744 hc->burst = burst_s ? strtoull(burst_s, NULL, 10) / 8 : 0;
2745 hc->burst = MAX(hc->burst, mtu + 64);
2748 hc->priority = priority_s ? strtoul(priority_s, NULL, 10) : 0;
/* Queries the kernel for the class 'handle' with parent 'parent' on 'netdev'
 * and parses the reply as an HTB class, storing its parameters into
 * 'options' and its statistics into 'stats' as htb_parse_tcmsg__() does.
 *
 * Returns 0 if successful, otherwise the error from the query or from
 * parsing the reply. */
static int
htb_query_class__(const struct netdev *netdev, unsigned int handle,
                  unsigned int parent, struct htb_class *options,
                  struct netdev_queue_stats *stats)
{
    struct ofpbuf *reply;
    int error = tc_query_class(netdev, handle, parent, &reply);

    if (error) {
        return error;
    }

    error = htb_parse_tcmsg__(reply, NULL, options, stats);
    ofpbuf_delete(reply);
    return error;
}
2770 htb_tc_install(struct netdev *netdev, const struct smap *details)
2774 error = htb_setup_qdisc__(netdev);
2776 struct htb_class hc;
2778 htb_parse_qdisc_details__(netdev, details, &hc);
2779 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2780 tc_make_handle(1, 0), &hc);
2782 htb_install__(netdev, hc.max_rate);
2788 static struct htb_class *
2789 htb_class_cast__(const struct tc_queue *queue)
2791 return CONTAINER_OF(queue, struct htb_class, tc_queue);
2795 htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
2796 const struct htb_class *hc)
2798 struct htb *htb = htb_get__(netdev);
2799 size_t hash = hash_int(queue_id, 0);
2800 struct tc_queue *queue;
2801 struct htb_class *hcp;
2803 queue = tc_find_queue__(netdev, queue_id, hash);
2805 hcp = htb_class_cast__(queue);
2807 hcp = xmalloc(sizeof *hcp);
2808 queue = &hcp->tc_queue;
2809 queue->queue_id = queue_id;
2810 hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
2813 hcp->min_rate = hc->min_rate;
2814 hcp->max_rate = hc->max_rate;
2815 hcp->burst = hc->burst;
2816 hcp->priority = hc->priority;
2820 htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
2823 struct nl_dump dump;
2824 struct htb_class hc;
2826 /* Get qdisc options. */
2828 htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
2829 htb_install__(netdev, hc.max_rate);
2832 if (!start_queue_dump(netdev, &dump)) {
2835 while (nl_dump_next(&dump, &msg)) {
2836 unsigned int queue_id;
2838 if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
2839 htb_update_queue__(netdev, queue_id, &hc);
2842 nl_dump_done(&dump);
2848 htb_tc_destroy(struct tc *tc)
2850 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
2851 struct htb_class *hc, *next;
2853 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &htb->tc.queues) {
2854 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
2862 htb_qdisc_get(const struct netdev *netdev, struct smap *details)
2864 const struct htb *htb = htb_get__(netdev);
2865 smap_add_format(details, "max-rate", "%llu", 8ULL * htb->max_rate);
2870 htb_qdisc_set(struct netdev *netdev, const struct smap *details)
2872 struct htb_class hc;
2875 htb_parse_qdisc_details__(netdev, details, &hc);
2876 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2877 tc_make_handle(1, 0), &hc);
2879 htb_get__(netdev)->max_rate = hc.max_rate;
2885 htb_class_get(const struct netdev *netdev OVS_UNUSED,
2886 const struct tc_queue *queue, struct smap *details)
2888 const struct htb_class *hc = htb_class_cast__(queue);
2890 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
2891 if (hc->min_rate != hc->max_rate) {
2892 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
2894 smap_add_format(details, "burst", "%llu", 8ULL * hc->burst);
2896 smap_add_format(details, "priority", "%u", hc->priority);
2902 htb_class_set(struct netdev *netdev, unsigned int queue_id,
2903 const struct smap *details)
2905 struct htb_class hc;
2908 error = htb_parse_class_details__(netdev, details, &hc);
2913 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
2914 tc_make_handle(1, 0xfffe), &hc);
2919 htb_update_queue__(netdev, queue_id, &hc);
2924 htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
2926 struct htb_class *hc = htb_class_cast__(queue);
2927 struct htb *htb = htb_get__(netdev);
2930 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
2932 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
2939 htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
2940 struct netdev_queue_stats *stats)
2942 return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
2943 tc_make_handle(1, 0xfffe), NULL, stats);
2947 htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
2948 const struct ofpbuf *nlmsg,
2949 netdev_dump_queue_stats_cb *cb, void *aux)
2951 struct netdev_queue_stats stats;
2952 unsigned int handle, major, minor;
2955 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
2960 major = tc_get_major(handle);
2961 minor = tc_get_minor(handle);
2962 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2963 (*cb)(minor - 1, &stats, aux);
2968 static const struct tc_ops tc_ops_htb = {
2969 "htb", /* linux_name */
2970 "linux-htb", /* ovs_name */
2971 HTB_N_QUEUES, /* n_queues */
2980 htb_class_get_stats,
2981 htb_class_dump_stats
2984 /* "linux-hfsc" traffic control class. */
2986 #define HFSC_N_QUEUES 0xf000
2994 struct tc_queue tc_queue;
2999 static struct hfsc *
3000 hfsc_get__(const struct netdev *netdev_)
3002 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3003 return CONTAINER_OF(netdev->tc, struct hfsc, tc);
3006 static struct hfsc_class *
3007 hfsc_class_cast__(const struct tc_queue *queue)
3009 return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
3013 hfsc_install__(struct netdev *netdev_, uint32_t max_rate)
3015 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3018 hfsc = xmalloc(sizeof *hfsc);
3019 tc_init(&hfsc->tc, &tc_ops_hfsc);
3020 hfsc->max_rate = max_rate;
3021 netdev->tc = &hfsc->tc;
3025 hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
3026 const struct hfsc_class *hc)
3030 struct hfsc_class *hcp;
3031 struct tc_queue *queue;
3033 hfsc = hfsc_get__(netdev);
3034 hash = hash_int(queue_id, 0);
3036 queue = tc_find_queue__(netdev, queue_id, hash);
3038 hcp = hfsc_class_cast__(queue);
3040 hcp = xmalloc(sizeof *hcp);
3041 queue = &hcp->tc_queue;
3042 queue->queue_id = queue_id;
3043 hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
3046 hcp->min_rate = hc->min_rate;
3047 hcp->max_rate = hc->max_rate;
3051 hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
3053 const struct tc_service_curve *rsc, *fsc, *usc;
3054 static const struct nl_policy tca_hfsc_policy[] = {
3056 .type = NL_A_UNSPEC,
3058 .min_len = sizeof(struct tc_service_curve),
3061 .type = NL_A_UNSPEC,
3063 .min_len = sizeof(struct tc_service_curve),
3066 .type = NL_A_UNSPEC,
3068 .min_len = sizeof(struct tc_service_curve),
3071 struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];
3073 if (!nl_parse_nested(nl_options, tca_hfsc_policy,
3074 attrs, ARRAY_SIZE(tca_hfsc_policy))) {
3075 VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
3079 rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
3080 fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
3081 usc = nl_attr_get(attrs[TCA_HFSC_USC]);
3083 if (rsc->m1 != 0 || rsc->d != 0 ||
3084 fsc->m1 != 0 || fsc->d != 0 ||
3085 usc->m1 != 0 || usc->d != 0) {
3086 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3087 "Non-linear service curves are not supported.");
3091 if (rsc->m2 != fsc->m2) {
3092 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3093 "Real-time service curves are not supported ");
3097 if (rsc->m2 > usc->m2) {
3098 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3099 "Min-rate service curve is greater than "
3100 "the max-rate service curve.");
3104 class->min_rate = fsc->m2;
3105 class->max_rate = usc->m2;
3110 hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
3111 struct hfsc_class *options,
3112 struct netdev_queue_stats *stats)
3115 unsigned int handle;
3116 struct nlattr *nl_options;
3118 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
3124 unsigned int major, minor;
3126 major = tc_get_major(handle);
3127 minor = tc_get_minor(handle);
3128 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3129 *queue_id = minor - 1;
3136 error = hfsc_parse_tca_options__(nl_options, options);
3143 hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
3144 unsigned int parent, struct hfsc_class *options,
3145 struct netdev_queue_stats *stats)
3148 struct ofpbuf *reply;
3150 error = tc_query_class(netdev, handle, parent, &reply);
3155 error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
3156 ofpbuf_delete(reply);
3161 hfsc_parse_qdisc_details__(struct netdev *netdev, const struct smap *details,
3162 struct hfsc_class *class)
3165 const char *max_rate_s;
3167 max_rate_s = smap_get(details, "max-rate");
3168 max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
3171 enum netdev_features current;
3173 netdev_get_features(netdev, ¤t, NULL, NULL, NULL);
3174 max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
3177 class->min_rate = max_rate;
3178 class->max_rate = max_rate;
3182 hfsc_parse_class_details__(struct netdev *netdev,
3183 const struct smap *details,
3184 struct hfsc_class * class)
3186 const struct hfsc *hfsc;
3187 uint32_t min_rate, max_rate;
3188 const char *min_rate_s, *max_rate_s;
3190 hfsc = hfsc_get__(netdev);
3191 min_rate_s = smap_get(details, "min-rate");
3192 max_rate_s = smap_get(details, "max-rate");
3194 min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
3195 min_rate = MAX(min_rate, 1);
3196 min_rate = MIN(min_rate, hfsc->max_rate);
3198 max_rate = (max_rate_s
3199 ? strtoull(max_rate_s, NULL, 10) / 8
3201 max_rate = MAX(max_rate, min_rate);
3202 max_rate = MIN(max_rate, hfsc->max_rate);
3204 class->min_rate = min_rate;
3205 class->max_rate = max_rate;
3210 /* Create an HFSC qdisc.
3212 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
3214 hfsc_setup_qdisc__(struct netdev * netdev)
3216 struct tcmsg *tcmsg;
3217 struct ofpbuf request;
3218 struct tc_hfsc_qopt opt;
3220 tc_del_qdisc(netdev);
3222 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3223 NLM_F_EXCL | NLM_F_CREATE, &request);
3229 tcmsg->tcm_handle = tc_make_handle(1, 0);
3230 tcmsg->tcm_parent = TC_H_ROOT;
3232 memset(&opt, 0, sizeof opt);
3235 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3236 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
3238 return tc_transact(&request, NULL);
3241 /* Create an HFSC class.
3243 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
3244 * sc rate <min_rate> ul rate <max_rate>" */
3246 hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
3247 unsigned int parent, struct hfsc_class *class)
3251 struct tcmsg *tcmsg;
3252 struct ofpbuf request;
3253 struct tc_service_curve min, max;
3255 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
3261 tcmsg->tcm_handle = handle;
3262 tcmsg->tcm_parent = parent;
3266 min.m2 = class->min_rate;
3270 max.m2 = class->max_rate;
3272 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3273 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3274 nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
3275 nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
3276 nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
3277 nl_msg_end_nested(&request, opt_offset);
3279 error = tc_transact(&request, NULL);
3281 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
3282 "min-rate %ubps, max-rate %ubps (%s)",
3283 netdev_get_name(netdev),
3284 tc_get_major(handle), tc_get_minor(handle),
3285 tc_get_major(parent), tc_get_minor(parent),
3286 class->min_rate, class->max_rate, ovs_strerror(error));
3293 hfsc_tc_install(struct netdev *netdev, const struct smap *details)
3296 struct hfsc_class class;
3298 error = hfsc_setup_qdisc__(netdev);
3304 hfsc_parse_qdisc_details__(netdev, details, &class);
3305 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3306 tc_make_handle(1, 0), &class);
3312 hfsc_install__(netdev, class.max_rate);
3317 hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3320 struct nl_dump dump;
3321 struct hfsc_class hc;
3324 hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
3325 hfsc_install__(netdev, hc.max_rate);
3327 if (!start_queue_dump(netdev, &dump)) {
3331 while (nl_dump_next(&dump, &msg)) {
3332 unsigned int queue_id;
3334 if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3335 hfsc_update_queue__(netdev, queue_id, &hc);
3339 nl_dump_done(&dump);
3344 hfsc_tc_destroy(struct tc *tc)
3347 struct hfsc_class *hc, *next;
3349 hfsc = CONTAINER_OF(tc, struct hfsc, tc);
3351 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
3352 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
3361 hfsc_qdisc_get(const struct netdev *netdev, struct smap *details)
3363 const struct hfsc *hfsc;
3364 hfsc = hfsc_get__(netdev);
3365 smap_add_format(details, "max-rate", "%llu", 8ULL * hfsc->max_rate);
3370 hfsc_qdisc_set(struct netdev *netdev, const struct smap *details)
3373 struct hfsc_class class;
3375 hfsc_parse_qdisc_details__(netdev, details, &class);
3376 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3377 tc_make_handle(1, 0), &class);
3380 hfsc_get__(netdev)->max_rate = class.max_rate;
3387 hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
3388 const struct tc_queue *queue, struct smap *details)
3390 const struct hfsc_class *hc;
3392 hc = hfsc_class_cast__(queue);
3393 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
3394 if (hc->min_rate != hc->max_rate) {
3395 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
3401 hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
3402 const struct smap *details)
3405 struct hfsc_class class;
3407 error = hfsc_parse_class_details__(netdev, details, &class);
3412 error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3413 tc_make_handle(1, 0xfffe), &class);
3418 hfsc_update_queue__(netdev, queue_id, &class);
3423 hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
3427 struct hfsc_class *hc;
3429 hc = hfsc_class_cast__(queue);
3430 hfsc = hfsc_get__(netdev);
3432 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3434 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
3441 hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3442 struct netdev_queue_stats *stats)
3444 return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3445 tc_make_handle(1, 0xfffe), NULL, stats);
3449 hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3450 const struct ofpbuf *nlmsg,
3451 netdev_dump_queue_stats_cb *cb, void *aux)
3453 struct netdev_queue_stats stats;
3454 unsigned int handle, major, minor;
3457 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3462 major = tc_get_major(handle);
3463 minor = tc_get_minor(handle);
3464 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3465 (*cb)(minor - 1, &stats, aux);
3470 static const struct tc_ops tc_ops_hfsc = {
3471 "hfsc", /* linux_name */
3472 "linux-hfsc", /* ovs_name */
3473 HFSC_N_QUEUES, /* n_queues */
3474 hfsc_tc_install, /* tc_install */
3475 hfsc_tc_load, /* tc_load */
3476 hfsc_tc_destroy, /* tc_destroy */
3477 hfsc_qdisc_get, /* qdisc_get */
3478 hfsc_qdisc_set, /* qdisc_set */
3479 hfsc_class_get, /* class_get */
3480 hfsc_class_set, /* class_set */
3481 hfsc_class_delete, /* class_delete */
3482 hfsc_class_get_stats, /* class_get_stats */
3483 hfsc_class_dump_stats /* class_dump_stats */
3486 /* "linux-default" traffic control class.
3488 * This class represents the default, unnamed Linux qdisc. It corresponds to
3489 * the "" (empty string) QoS type in the OVS database. */
3492 default_install__(struct netdev *netdev_)
3494 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3495 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_default);
3497 /* Nothing but a tc class implementation is allowed to write to a tc. This
3498 * class never does that, so we can legitimately use a const tc object. */
3499 netdev->tc = CONST_CAST(struct tc *, &tc);
3503 default_tc_install(struct netdev *netdev,
3504 const struct smap *details OVS_UNUSED)
3506 default_install__(netdev);
3511 default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3513 default_install__(netdev);
3517 static const struct tc_ops tc_ops_default = {
3518 NULL, /* linux_name */
3523 NULL, /* tc_destroy */
3524 NULL, /* qdisc_get */
3525 NULL, /* qdisc_set */
3526 NULL, /* class_get */
3527 NULL, /* class_set */
3528 NULL, /* class_delete */
3529 NULL, /* class_get_stats */
3530 NULL /* class_dump_stats */
3533 /* "linux-other" traffic control class.
3538 other_tc_load(struct netdev *netdev_, struct ofpbuf *nlmsg OVS_UNUSED)
3540 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3541 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_other);
3543 /* Nothing but a tc class implementation is allowed to write to a tc. This
3544 * class never does that, so we can legitimately use a const tc object. */
3545 netdev->tc = CONST_CAST(struct tc *, &tc);
3549 static const struct tc_ops tc_ops_other = {
3550 NULL, /* linux_name */
3551 "linux-other", /* ovs_name */
3553 NULL, /* tc_install */
3555 NULL, /* tc_destroy */
3556 NULL, /* qdisc_get */
3557 NULL, /* qdisc_set */
3558 NULL, /* class_get */
3559 NULL, /* class_set */
3560 NULL, /* class_delete */
3561 NULL, /* class_get_stats */
3562 NULL /* class_dump_stats */
3565 /* Traffic control. */
3567 /* Number of kernel "tc" ticks per second. */
3568 static double ticks_per_s;
3570 /* Number of kernel "jiffies" per second. This is used for the purpose of
3571 * computing buffer sizes. Generally kernel qdiscs need to be able to buffer
3572 * one jiffy's worth of data.
3574 * There are two possibilities here:
3576 * - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
3577 * approximate range of 100 to 1024. That means that we really need to
3578 * make sure that the qdisc can buffer that much data.
3580 * - 'buffer_hz' is an absurdly large number. That means that the kernel
3581 * has finely granular timers and there's no need to fudge additional room
3582 * for buffers. (There's no extra effort needed to implement that: the
3583 * large 'buffer_hz' is used as a divisor, so practically any number will
3584 * come out as 0 in the division. Small integer results in the case of
3585 * really high dividends won't have any real effect anyhow.)
3587 static unsigned int buffer_hz;
/* Combines 'major' and 'minor' into the tc handle 'major':'minor'. */
static unsigned int
tc_make_handle(unsigned int major, unsigned int minor)
{
    unsigned int shifted_major = major << 16;

    return TC_H_MAKE(shifted_major, minor);
}
/* Extracts the major number from tc handle 'handle'. */
static unsigned int
tc_get_major(unsigned int handle)
{
    unsigned int major_bits = TC_H_MAJ(handle);

    return major_bits >> 16;
}
/* Extracts the minor number from tc handle 'handle'. */
static unsigned int
tc_get_minor(unsigned int handle)
{
    unsigned int minor = TC_H_MIN(handle);

    return minor;
}
3610 static struct tcmsg *
3611 tc_make_request(const struct netdev *netdev, int type, unsigned int flags,
3612 struct ofpbuf *request)
3614 struct tcmsg *tcmsg;
3618 error = get_ifindex(netdev, &ifindex);
3623 ofpbuf_init(request, 512);
3624 nl_msg_put_nlmsghdr(request, sizeof *tcmsg, type, NLM_F_REQUEST | flags);
3625 tcmsg = ofpbuf_put_zeros(request, sizeof *tcmsg);
3626 tcmsg->tcm_family = AF_UNSPEC;
3627 tcmsg->tcm_ifindex = ifindex;
3628 /* Caller should fill in tcmsg->tcm_handle. */
3629 /* Caller should fill in tcmsg->tcm_parent. */
3635 tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
3637 int error = nl_transact(NETLINK_ROUTE, request, replyp);
3638 ofpbuf_uninit(request);
3642 /* Adds or deletes a root ingress qdisc on 'netdev'. We use this for
3643 * policing configuration.
3645 * This function is equivalent to running the following when 'add' is true:
3646 * /sbin/tc qdisc add dev <devname> handle ffff: ingress
3648 * This function is equivalent to running the following when 'add' is false:
3649 * /sbin/tc qdisc del dev <devname> handle ffff: ingress
3651 * The configuration and stats may be seen with the following command:
3652 * /sbin/tc -s qdisc show dev <devname>
3654 * Returns 0 if successful, otherwise a positive errno value.
3657 tc_add_del_ingress_qdisc(struct netdev *netdev, bool add)
3659 struct ofpbuf request;
3660 struct tcmsg *tcmsg;
3662 int type = add ? RTM_NEWQDISC : RTM_DELQDISC;
3663 int flags = add ? NLM_F_EXCL | NLM_F_CREATE : 0;
3665 tcmsg = tc_make_request(netdev, type, flags, &request);
3669 tcmsg->tcm_handle = tc_make_handle(0xffff, 0);
3670 tcmsg->tcm_parent = TC_H_INGRESS;
3671 nl_msg_put_string(&request, TCA_KIND, "ingress");
3672 nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0);
3674 error = tc_transact(&request, NULL);
3676 /* If we're deleting the qdisc, don't worry about some of the
3677 * error conditions. */
3678 if (!add && (error == ENOENT || error == EINVAL)) {
3687 /* Adds a policer to 'netdev' with a rate of 'kbits_rate' and a burst size
3690 * This function is equivalent to running:
3691 * /sbin/tc filter add dev <devname> parent ffff: protocol all prio 49
3692 * basic police rate <kbits_rate>kbit burst <kbits_burst>k
3695 * The configuration and stats may be seen with the following command:
3696 * /sbin/tc -s filter show <devname> eth0 parent ffff:
3698 * Returns 0 if successful, otherwise a positive errno value.
3701 tc_add_policer(struct netdev *netdev, int kbits_rate, int kbits_burst)
3703 struct tc_police tc_police;
3704 struct ofpbuf request;
3705 struct tcmsg *tcmsg;
3706 size_t basic_offset;
3707 size_t police_offset;
3711 memset(&tc_police, 0, sizeof tc_police);
3712 tc_police.action = TC_POLICE_SHOT;
3713 tc_police.mtu = mtu;
3714 tc_fill_rate(&tc_police.rate, (kbits_rate * 1000)/8, mtu);
3715 tc_police.burst = tc_bytes_to_ticks(tc_police.rate.rate,
3716 kbits_burst * 1024);
3718 tcmsg = tc_make_request(netdev, RTM_NEWTFILTER,
3719 NLM_F_EXCL | NLM_F_CREATE, &request);
3723 tcmsg->tcm_parent = tc_make_handle(0xffff, 0);
3724 tcmsg->tcm_info = tc_make_handle(49,
3725 (OVS_FORCE uint16_t) htons(ETH_P_ALL));
3727 nl_msg_put_string(&request, TCA_KIND, "basic");
3728 basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3729 police_offset = nl_msg_start_nested(&request, TCA_BASIC_POLICE);
3730 nl_msg_put_unspec(&request, TCA_POLICE_TBF, &tc_police, sizeof tc_police);
3731 tc_put_rtab(&request, TCA_POLICE_RATE, &tc_police.rate);
3732 nl_msg_end_nested(&request, police_offset);
3733 nl_msg_end_nested(&request, basic_offset);
3735 error = tc_transact(&request, NULL);
3746 /* The values in psched are not individually very meaningful, but they are
3747 * important. The tables below show some values seen in the wild.
3751 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
3752 * (Before that, there are hints that it was 1000000000.)
3754 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
3758 * -----------------------------------
3759 * [1] 000c8000 000f4240 000f4240 00000064
3760 * [2] 000003e8 00000400 000f4240 3b9aca00
3761 * [3] 000003e8 00000400 000f4240 3b9aca00
3762 * [4] 000003e8 00000400 000f4240 00000064
3763 * [5] 000003e8 00000040 000f4240 3b9aca00
3764 * [6] 000003e8 00000040 000f4240 000000f9
3766 * a b c d ticks_per_s buffer_hz
3767 * ------- --------- ---------- ------------- ----------- -------------
3768 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
3769 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3770 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3771 * [4] 1,000 1,024 1,000,000 100 976,562 100
3772 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
3773 * [6] 1,000 64 1,000,000 249 15,625,000 249
3775 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
3776 * [2] 2.6.26-1-686-bigmem from Debian lenny
3777 * [3] 2.6.26-2-sparc64 from Debian lenny
3778 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
3779 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
3780 * [6] 2.6.34 from kernel.org on KVM
3782 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
3783 static const char fn[] = "/proc/net/psched";
3784 unsigned int a, b, c, d;
3787 if (!ovsthread_once_start(&once)) {
3794 stream = fopen(fn, "r");
3796 VLOG_WARN("%s: open failed: %s", fn, ovs_strerror(errno));
3800 if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
3801 VLOG_WARN("%s: read failed", fn);
3805 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
3809 VLOG_WARN("%s: invalid scheduler parameters", fn);
3813 ticks_per_s = (double) a * c / b;
3817 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
3820 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);
3823 ovsthread_once_done(&once);
3826 /* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
3827 * rate of 'rate' bytes per second. */
3829 tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
3832 return (rate * ticks) / ticks_per_s;
3835 /* Returns the number of ticks that it would take to transmit 'size' bytes at a
3836 * rate of 'rate' bytes per second. */
3838 tc_bytes_to_ticks(unsigned int rate, unsigned int size)
3841 return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
3844 /* Returns the number of bytes that need to be reserved for qdisc buffering at
3845 * a transmission rate of 'rate' bytes per second. */
3847 tc_buffer_per_jiffy(unsigned int rate)
3850 return rate / buffer_hz;
3853 /* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
3854 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
3855 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
3856 * stores NULL into it if it is absent.
3858 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
3861 * Returns 0 if successful, otherwise a positive errno value. */
3863 tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
3864 struct nlattr **options)
3866 static const struct nl_policy tca_policy[] = {
3867 [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
3868 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
3870 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3872 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3873 tca_policy, ta, ARRAY_SIZE(ta))) {
3874 VLOG_WARN_RL(&rl, "failed to parse qdisc message");
3879 *kind = nl_attr_get_string(ta[TCA_KIND]);
3883 *options = ta[TCA_OPTIONS];
3898 /* Given Netlink 'msg' that describes a class, extracts the class's handle
3899 * (its tcm_handle) into '*handlep', its TCA_OPTIONS attribute
3900 * into '*options', and its queue statistics into '*stats'. Any of the output
3901 * arguments may be null.
3903 * Returns 0 if successful, otherwise a positive errno value. */
3905 tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
3906 struct nlattr **options, struct netdev_queue_stats *stats)
3908 static const struct nl_policy tca_policy[] = {
3909 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
3910 [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
3912 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3914 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3915 tca_policy, ta, ARRAY_SIZE(ta))) {
3916 VLOG_WARN_RL(&rl, "failed to parse class message");
3921 struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
3922 *handlep = tc->tcm_handle;
3926 *options = ta[TCA_OPTIONS];
3930 const struct gnet_stats_queue *gsq;
3931 struct gnet_stats_basic gsb;
3933 static const struct nl_policy stats_policy[] = {
3934 [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
3935 .min_len = sizeof gsb },
3936 [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
3937 .min_len = sizeof *gsq },
3939 struct nlattr *sa[ARRAY_SIZE(stats_policy)];
3941 if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
3942 sa, ARRAY_SIZE(sa))) {
3943 VLOG_WARN_RL(&rl, "failed to parse class stats");
3947 /* Alignment issues screw up the length of struct gnet_stats_basic on
3948 * some arch/bitsize combinations. Newer versions of Linux have a
3949 * struct gnet_stats_basic_packed, but we can't depend on that. The
3950 * easiest thing to do is just to make a copy. */
3951 memset(&gsb, 0, sizeof gsb);
3952 memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
3953 MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
3954 stats->tx_bytes = gsb.bytes;
3955 stats->tx_packets = gsb.packets;
3957 gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
3958 stats->tx_errors = gsq->drops;
3968 memset(stats, 0, sizeof *stats);
3973 /* Queries the kernel for class with identifier 'handle' and parent 'parent'
3976 tc_query_class(const struct netdev *netdev,
3977 unsigned int handle, unsigned int parent,
3978 struct ofpbuf **replyp)
3980 struct ofpbuf request;
3981 struct tcmsg *tcmsg;
3984 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO, &request);
3988 tcmsg->tcm_handle = handle;
3989 tcmsg->tcm_parent = parent;
3991 error = tc_transact(&request, replyp);
3993 VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
3994 netdev_get_name(netdev),
3995 tc_get_major(handle), tc_get_minor(handle),
3996 tc_get_major(parent), tc_get_minor(parent),
3997 ovs_strerror(error));
4002 /* Equivalent to "tc class del dev <name> handle <handle>". */
4004 tc_delete_class(const struct netdev *netdev, unsigned int handle)
4006 struct ofpbuf request;
4007 struct tcmsg *tcmsg;
4010 tcmsg = tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
4014 tcmsg->tcm_handle = handle;
4015 tcmsg->tcm_parent = 0;
4017 error = tc_transact(&request, NULL);
4019 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
4020 netdev_get_name(netdev),
4021 tc_get_major(handle), tc_get_minor(handle),
4022 ovs_strerror(error));
4027 /* Equivalent to "tc qdisc del dev <name> root". */
4029 tc_del_qdisc(struct netdev *netdev_)
4031 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4032 struct ofpbuf request;
4033 struct tcmsg *tcmsg;
4036 tcmsg = tc_make_request(netdev_, RTM_DELQDISC, 0, &request);
4040 tcmsg->tcm_handle = tc_make_handle(1, 0);
4041 tcmsg->tcm_parent = TC_H_ROOT;
4043 error = tc_transact(&request, NULL);
4044 if (error == EINVAL) {
4045 /* EINVAL probably means that the default qdisc was in use, in which
4046 * case we've accomplished our purpose. */
4049 if (!error && netdev->tc) {
4050 if (netdev->tc->ops->tc_destroy) {
4051 netdev->tc->ops->tc_destroy(netdev->tc);
4058 /* If 'netdev''s qdisc type and parameters are not yet known, queries the
4059 * kernel to determine what they are. Returns 0 if successful, otherwise a
4060 * positive errno value. */
4062 tc_query_qdisc(const struct netdev *netdev_)
4064 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4065 struct ofpbuf request, *qdisc;
4066 const struct tc_ops *ops;
4067 struct tcmsg *tcmsg;
4075 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
4076 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
4077 * 2.6.35 without that fix backported to it.
4079 * To avoid the OOPS, we must not make a request that would attempt to dump
4080 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
4081 * few others. There are a few ways that I can see to do this, but most of
4082 * them seem to be racy (and if you lose the race the kernel OOPSes). The
4083 * technique chosen here is to assume that any non-default qdisc that we
4084 * create will have a class with handle 1:0. The built-in qdiscs only have
4085 * a class with handle 0:0.
4087 * We could check for Linux 2.6.35+ and use a more straightforward method
4089 tcmsg = tc_make_request(netdev_, RTM_GETQDISC, NLM_F_ECHO, &request);
4093 tcmsg->tcm_handle = tc_make_handle(1, 0);
4094 tcmsg->tcm_parent = 0;
4096 /* Figure out what tc class to instantiate. */
4097 error = tc_transact(&request, &qdisc);
4101 error = tc_parse_qdisc(qdisc, &kind, NULL);
4103 ops = &tc_ops_other;
4105 ops = tc_lookup_linux_name(kind);
4107 static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
4108 VLOG_INFO_RL(&rl2, "unknown qdisc \"%s\"", kind);
4110 ops = &tc_ops_other;
4113 } else if (error == ENOENT) {
4114 /* Either it's a built-in qdisc, or it's a qdisc set up by some
4115 * other entity that doesn't have a handle 1:0. We will assume
4116 * that it's the system default qdisc. */
4117 ops = &tc_ops_default;
4120 /* Who knows? Maybe the device got deleted. */
4121 VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
4122 netdev_get_name(netdev_), ovs_strerror(error));
4123 ops = &tc_ops_other;
4126 /* Instantiate it. */
4127 load_error = ops->tc_load(CONST_CAST(struct netdev *, netdev_), qdisc);
4128 ovs_assert((load_error == 0) == (netdev->tc != NULL));
4129 ofpbuf_delete(qdisc);
4131 return error ? error : load_error;
4134 /* Linux traffic control uses tables with 256 entries ("rtab" tables) to
4135 approximate the time to transmit packets of various lengths. For an MTU of
4136 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
4137 represents two possible packet lengths; for a MTU of 513 through 1024, four
4138 possible lengths; and so on.
4140 Returns, for the specified 'mtu', the number of bits that packet lengths
4141 need to be shifted right to fit within such a 256-entry table. */
4143 tc_calc_cell_log(unsigned int mtu)
4148 mtu = ETH_PAYLOAD_MAX;
4150 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
4152 for (cell_log = 0; mtu >= 256; cell_log++) {
4159 /* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
4162 tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
4164 memset(rate, 0, sizeof *rate);
4165 rate->cell_log = tc_calc_cell_log(mtu);
4166 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
4167 /* rate->cell_align = 0; */ /* distro headers. */
4168 rate->mpu = ETH_TOTAL_MIN;
4172 /* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
4173 * attribute of the specified "type".
4175 * See tc_calc_cell_log() above for a description of "rtab"s. */
4177 tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
4182 rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
4183 for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
4184 unsigned packet_size = (i + 1) << rate->cell_log;
4185 if (packet_size < rate->mpu) {
4186 packet_size = rate->mpu;
4188 rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
/* Calculates the proper value of 'buffer' or 'cbuffer' in HTB options given a
 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
 * burst size of 'burst_bytes'.  (If no value was requested, a 'burst_bytes' of
 * 0 is fine.)
 *
 * The burst used is never smaller than one jiffy's worth of buffering plus
 * 'mtu'.  A burst too large for 'unsigned int' is clamped to the largest
 * 'unsigned int' value rather than silently truncated when it is passed to
 * tc_bytes_to_ticks(). */
static int
tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
{
    const unsigned int max_size = ~0u;
    unsigned int min_burst = tc_buffer_per_jiffy(Bps) + mtu;
    uint64_t size = MAX(burst_bytes, min_burst);

    if (size > max_size) {
        size = max_size;
    }
    return tc_bytes_to_ticks(Bps, (unsigned int) size);
}
/* Linux-only functions declared in netdev-linux.h. */
4205 /* Returns a fd for an AF_INET socket or a negative errno value. */
4207 netdev_linux_get_af_inet_sock(void)
4209 int error = netdev_linux_init();
4210 return error ? -error : af_inet_sock;
4213 /* Modifies the 'flag' bit in ethtool's flags field for 'netdev'. If
4214 * 'enable' is true, the bit is set. Otherwise, it is cleared. */
4216 netdev_linux_ethtool_set_flag(struct netdev *netdev, uint32_t flag,
4217 const char *flag_name, bool enable)
4219 const char *netdev_name = netdev_get_name(netdev);
4220 struct ethtool_value evalue;
4224 COVERAGE_INC(netdev_get_ethtool);
4225 memset(&evalue, 0, sizeof evalue);
4226 error = netdev_linux_do_ethtool(netdev_name,
4227 (struct ethtool_cmd *)&evalue,
4228 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
4233 COVERAGE_INC(netdev_set_ethtool);
4234 evalue.data = new_flags = (evalue.data & ~flag) | (enable ? flag : 0);
4235 error = netdev_linux_do_ethtool(netdev_name,
4236 (struct ethtool_cmd *)&evalue,
4237 ETHTOOL_SFLAGS, "ETHTOOL_SFLAGS");
4242 COVERAGE_INC(netdev_get_ethtool);
4243 memset(&evalue, 0, sizeof evalue);
4244 error = netdev_linux_do_ethtool(netdev_name,
4245 (struct ethtool_cmd *)&evalue,
4246 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
4251 if (new_flags != evalue.data) {
4252 VLOG_WARN_RL(&rl, "attempt to %s ethtool %s flag on network "
4253 "device %s failed", enable ? "enable" : "disable",
4254 flag_name, netdev_name);
/* Utility functions. */
4263 /* Copies 'src' into 'dst', performing format conversion in the process. */
4265 netdev_stats_from_rtnl_link_stats(struct netdev_stats *dst,
4266 const struct rtnl_link_stats *src)
4268 dst->rx_packets = src->rx_packets;
4269 dst->tx_packets = src->tx_packets;
4270 dst->rx_bytes = src->rx_bytes;
4271 dst->tx_bytes = src->tx_bytes;
4272 dst->rx_errors = src->rx_errors;
4273 dst->tx_errors = src->tx_errors;
4274 dst->rx_dropped = src->rx_dropped;
4275 dst->tx_dropped = src->tx_dropped;
4276 dst->multicast = src->multicast;
4277 dst->collisions = src->collisions;
4278 dst->rx_length_errors = src->rx_length_errors;
4279 dst->rx_over_errors = src->rx_over_errors;
4280 dst->rx_crc_errors = src->rx_crc_errors;
4281 dst->rx_frame_errors = src->rx_frame_errors;
4282 dst->rx_fifo_errors = src->rx_fifo_errors;
4283 dst->rx_missed_errors = src->rx_missed_errors;
4284 dst->tx_aborted_errors = src->tx_aborted_errors;
4285 dst->tx_carrier_errors = src->tx_carrier_errors;
4286 dst->tx_fifo_errors = src->tx_fifo_errors;
4287 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
4288 dst->tx_window_errors = src->tx_window_errors;
4292 get_stats_via_netlink(int ifindex, struct netdev_stats *stats)
4294 /* Policy for RTNLGRP_LINK messages.
4296 * There are *many* more fields in these messages, but currently we only
4297 * care about these fields. */
4298 static const struct nl_policy rtnlgrp_link_policy[] = {
4299 [IFLA_IFNAME] = { .type = NL_A_STRING, .optional = false },
4300 [IFLA_STATS] = { .type = NL_A_UNSPEC, .optional = true,
4301 .min_len = sizeof(struct rtnl_link_stats) },
4304 struct ofpbuf request;
4305 struct ofpbuf *reply;
4306 struct ifinfomsg *ifi;
4307 struct nlattr *attrs[ARRAY_SIZE(rtnlgrp_link_policy)];
4310 ofpbuf_init(&request, 0);
4311 nl_msg_put_nlmsghdr(&request, sizeof *ifi, RTM_GETLINK, NLM_F_REQUEST);
4312 ifi = ofpbuf_put_zeros(&request, sizeof *ifi);
4313 ifi->ifi_family = PF_UNSPEC;
4314 ifi->ifi_index = ifindex;
4315 error = nl_transact(NETLINK_ROUTE, &request, &reply);
4316 ofpbuf_uninit(&request);
4321 if (!nl_policy_parse(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg),
4322 rtnlgrp_link_policy,
4323 attrs, ARRAY_SIZE(rtnlgrp_link_policy))) {
4324 ofpbuf_delete(reply);
4328 if (!attrs[IFLA_STATS]) {
4329 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
4330 ofpbuf_delete(reply);
4334 netdev_stats_from_rtnl_link_stats(stats, nl_attr_get(attrs[IFLA_STATS]));
4336 ofpbuf_delete(reply);
4342 get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats)
4344 static const char fn[] = "/proc/net/dev";
4349 stream = fopen(fn, "r");
4351 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, ovs_strerror(errno));
4356 while (fgets(line, sizeof line, stream)) {
4359 #define X64 "%"SCNu64
4362 X64 X64 X64 X64 X64 X64 X64 "%*u"
4363 X64 X64 X64 X64 X64 X64 X64 "%*u",
4369 &stats->rx_fifo_errors,
4370 &stats->rx_frame_errors,
4376 &stats->tx_fifo_errors,
4378 &stats->tx_carrier_errors) != 15) {
4379 VLOG_WARN_RL(&rl, "%s:%d: parse error", fn, ln);
4380 } else if (!strcmp(devname, netdev_name)) {
4381 stats->rx_length_errors = UINT64_MAX;
4382 stats->rx_over_errors = UINT64_MAX;
4383 stats->rx_crc_errors = UINT64_MAX;
4384 stats->rx_missed_errors = UINT64_MAX;
4385 stats->tx_aborted_errors = UINT64_MAX;
4386 stats->tx_heartbeat_errors = UINT64_MAX;
4387 stats->tx_window_errors = UINT64_MAX;
4393 VLOG_WARN_RL(&rl, "%s: no stats for %s", fn, netdev_name);
4399 get_flags(const struct netdev *dev, unsigned int *flags)
4405 error = netdev_linux_do_ioctl(dev->name, &ifr, SIOCGIFFLAGS,
4408 *flags = ifr.ifr_flags;
4414 set_flags(const char *name, unsigned int flags)
4418 ifr.ifr_flags = flags;
4419 return netdev_linux_do_ioctl(name, &ifr, SIOCSIFFLAGS, "SIOCSIFFLAGS");
4423 do_get_ifindex(const char *netdev_name)
4427 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4428 COVERAGE_INC(netdev_get_ifindex);
4429 if (ioctl(af_inet_sock, SIOCGIFINDEX, &ifr) < 0) {
4430 VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
4431 netdev_name, ovs_strerror(errno));
4434 return ifr.ifr_ifindex;
4438 get_ifindex(const struct netdev *netdev_, int *ifindexp)
4440 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4442 if (!(netdev->cache_valid & VALID_IFINDEX)) {
4443 int ifindex = do_get_ifindex(netdev_get_name(netdev_));
4446 netdev->get_ifindex_error = -ifindex;
4447 netdev->ifindex = 0;
4449 netdev->get_ifindex_error = 0;
4450 netdev->ifindex = ifindex;
4452 netdev->cache_valid |= VALID_IFINDEX;
4455 *ifindexp = netdev->ifindex;
4456 return netdev->get_ifindex_error;
4460 get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN])
4465 memset(&ifr, 0, sizeof ifr);
4466 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4467 COVERAGE_INC(netdev_get_hwaddr);
4468 if (ioctl(af_inet_sock, SIOCGIFHWADDR, &ifr) < 0) {
4469 /* ENODEV probably means that a vif disappeared asynchronously and
4470 * hasn't been removed from the database yet, so reduce the log level
4471 * to INFO for that case. */
4472 VLOG(errno == ENODEV ? VLL_INFO : VLL_ERR,
4473 "ioctl(SIOCGIFHWADDR) on %s device failed: %s",
4474 netdev_name, ovs_strerror(errno));
4477 hwaddr_family = ifr.ifr_hwaddr.sa_family;
4478 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER) {
4479 VLOG_WARN("%s device has unknown hardware address family %d",
4480 netdev_name, hwaddr_family);
4482 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
4487 set_etheraddr(const char *netdev_name,
4488 const uint8_t mac[ETH_ADDR_LEN])
4492 memset(&ifr, 0, sizeof ifr);
4493 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4494 ifr.ifr_hwaddr.sa_family = ARPHRD_ETHER;
4495 memcpy(ifr.ifr_hwaddr.sa_data, mac, ETH_ADDR_LEN);
4496 COVERAGE_INC(netdev_set_hwaddr);
4497 if (ioctl(af_inet_sock, SIOCSIFHWADDR, &ifr) < 0) {
4498 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
4499 netdev_name, ovs_strerror(errno));
4506 netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
4507 int cmd, const char *cmd_name)
4511 memset(&ifr, 0, sizeof ifr);
4512 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
4513 ifr.ifr_data = (caddr_t) ecmd;
4516 if (ioctl(af_inet_sock, SIOCETHTOOL, &ifr) == 0) {
4519 if (errno != EOPNOTSUPP) {
4520 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
4521 "failed: %s", cmd_name, name, ovs_strerror(errno));
4523 /* The device doesn't support this operation. That's pretty
4524 * common, so there's no point in logging anything. */
4531 netdev_linux_do_ioctl(const char *name, struct ifreq *ifr, int cmd,
4532 const char *cmd_name)
4534 ovs_strzcpy(ifr->ifr_name, name, sizeof ifr->ifr_name);
4535 if (ioctl(af_inet_sock, cmd, ifr) == -1) {
4536 VLOG_DBG_RL(&rl, "%s: ioctl(%s) failed: %s", name, cmd_name,
4537 ovs_strerror(errno));
4544 netdev_linux_get_ipv4(const struct netdev *netdev, struct in_addr *ip,
4545 int cmd, const char *cmd_name)
4550 ifr.ifr_addr.sa_family = AF_INET;
4551 error = netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, cmd, cmd_name);
4553 const struct sockaddr_in *sin = ALIGNED_CAST(struct sockaddr_in *,
4555 *ip = sin->sin_addr;
4560 /* Returns an AF_PACKET raw socket or a negative errno value. */
4562 af_packet_sock(void)
4564 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
4567 if (ovsthread_once_start(&once)) {
4568 sock = socket(AF_PACKET, SOCK_RAW, 0);
4570 int error = set_nonblocking(sock);
4577 VLOG_ERR("failed to create packet socket: %s",
4578 ovs_strerror(errno));
4580 ovsthread_once_done(&once);