2 * Copyright (c) 2009, 2010, 2011, 2012, 2013 Nicira, Inc.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
19 #include "netdev-linux.h"
23 #include <arpa/inet.h>
25 #include <linux/filter.h>
26 #include <linux/gen_stats.h>
27 #include <linux/if_ether.h>
28 #include <linux/if_tun.h>
29 #include <linux/types.h>
30 #include <linux/ethtool.h>
31 #include <linux/mii.h>
32 #include <linux/pkt_cls.h>
33 #include <linux/pkt_sched.h>
34 #include <linux/rtnetlink.h>
35 #include <linux/sockios.h>
36 #include <linux/version.h>
37 #include <sys/types.h>
38 #include <sys/ioctl.h>
39 #include <sys/socket.h>
40 #include <netpacket/packet.h>
42 #include <net/if_arp.h>
43 #include <net/if_packet.h>
44 #include <net/route.h>
45 #include <netinet/in.h>
52 #include "dpif-linux.h"
53 #include "dynamic-string.h"
54 #include "fatal-signal.h"
57 #include "netdev-provider.h"
58 #include "netdev-vport.h"
59 #include "netlink-notifier.h"
60 #include "netlink-socket.h"
63 #include "openflow/openflow.h"
65 #include "poll-loop.h"
66 #include "rtnetlink-link.h"
68 #include "socket-util.h"
71 #include "unaligned.h"
74 VLOG_DEFINE_THIS_MODULE(netdev_linux);
76 COVERAGE_DEFINE(netdev_set_policing);
77 COVERAGE_DEFINE(netdev_arp_lookup);
78 COVERAGE_DEFINE(netdev_get_ifindex);
79 COVERAGE_DEFINE(netdev_get_hwaddr);
80 COVERAGE_DEFINE(netdev_set_hwaddr);
81 COVERAGE_DEFINE(netdev_get_ethtool);
82 COVERAGE_DEFINE(netdev_set_ethtool);
85 /* These were introduced in Linux 2.6.14, so they might be missing if we have
87 #ifndef ADVERTISED_Pause
88 #define ADVERTISED_Pause (1 << 13)
90 #ifndef ADVERTISED_Asym_Pause
91 #define ADVERTISED_Asym_Pause (1 << 14)
94 /* These were introduced in Linux 2.6.24, so they might be missing if we
95 * have old headers. */
96 #ifndef ETHTOOL_GFLAGS
97 #define ETHTOOL_GFLAGS 0x00000025 /* Get flags bitmap(ethtool_value) */
99 #ifndef ETHTOOL_SFLAGS
100 #define ETHTOOL_SFLAGS 0x00000026 /* Set flags bitmap(ethtool_value) */
103 /* This was introduced in Linux 2.6.25, so it might be missing if we have old
106 #define TC_RTAB_SIZE 1024
109 static struct nln_notifier *netdev_linux_cache_notifier = NULL;
110 static int cache_notifier_refcount;
113 VALID_IFINDEX = 1 << 0,
114 VALID_ETHERADDR = 1 << 1,
118 VALID_POLICING = 1 << 5,
119 VALID_VPORT_STAT_ERROR = 1 << 6,
120 VALID_DRVINFO = 1 << 7,
121 VALID_FEATURES = 1 << 8,
128 /* Traffic control. */
130 /* An instance of a traffic control class. Always associated with a particular
133 * Each TC implementation subclasses this with whatever additional data it
136 const struct tc_ops *ops;
137 struct hmap queues; /* Contains "struct tc_queue"s.
138 * Read by generic TC layer.
139 * Written only by TC implementation. */
142 #define TC_INITIALIZER(TC, OPS) { OPS, HMAP_INITIALIZER(&(TC)->queues) }
144 /* One traffic control queue.
146 * Each TC implementation subclasses this with whatever additional data it
149 struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
150 unsigned int queue_id; /* OpenFlow queue ID. */
153 /* A particular kind of traffic control. Each implementation generally maps to
154 * one particular Linux qdisc class.
156 * The functions below return 0 if successful or a positive errno value on
157 * failure, except where otherwise noted. All of them must be provided, except
158 * where otherwise noted. */
160 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
161 * This is null for tc_ops_default and tc_ops_other, for which there are no
162 * appropriate values. */
163 const char *linux_name;
165 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
166 const char *ovs_name;
168 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
169 * queues. The queues are numbered 0 through n_queues - 1. */
170 unsigned int n_queues;
172 /* Called to install this TC class on 'netdev'. The implementation should
173 * make the Netlink calls required to set up 'netdev' with the right qdisc
174 * and configure it according to 'details'. The implementation may assume
175 * that the current qdisc is the default; that is, there is no need for it
176 * to delete the current qdisc before installing itself.
178 * The contents of 'details' should be documented as valid for 'ovs_name'
179 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
180 * (which is built as ovs-vswitchd.conf.db(8)).
182 * This function must return 0 if and only if it sets 'netdev->tc' to an
183 * initialized 'struct tc'.
185 * (This function is null for tc_ops_other, which cannot be installed. For
186 * other TC classes it should always be nonnull.) */
187 int (*tc_install)(struct netdev *netdev, const struct smap *details);
189 /* Called when the netdev code determines (through a Netlink query) that
190 * this TC class's qdisc is installed on 'netdev', but we didn't install
191 * it ourselves and so don't know any of the details.
193 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
194 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
195 * implementation should parse the other attributes of 'nlmsg' as
196 * necessary to determine its configuration. If necessary it should also
197 * use Netlink queries to determine the configuration of queues on
200 * This function must return 0 if and only if it sets 'netdev->tc' to an
201 * initialized 'struct tc'. */
202 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
204 /* Destroys the data structures allocated by the implementation as part of
205 * 'tc'. (This includes destroying 'tc->queues' by calling
208 * The implementation should not need to perform any Netlink calls. If
209 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
210 * (But it may not be desirable.)
212 * This function may be null if 'tc' is trivial. */
213 void (*tc_destroy)(struct tc *tc);
215 /* Retrieves details of 'netdev->tc' configuration into 'details'.
217 * The implementation should not need to perform any Netlink calls, because
218 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
219 * cached the configuration.
221 * The contents of 'details' should be documented as valid for 'ovs_name'
222 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
223 * (which is built as ovs-vswitchd.conf.db(8)).
225 * This function may be null if 'tc' is not configurable.
227 int (*qdisc_get)(const struct netdev *netdev, struct smap *details);
229 /* Reconfigures 'netdev->tc' according to 'details', performing any
230 * required Netlink calls to complete the reconfiguration.
232 * The contents of 'details' should be documented as valid for 'ovs_name'
233 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
234 * (which is built as ovs-vswitchd.conf.db(8)).
236 * This function may be null if 'tc' is not configurable.
238 int (*qdisc_set)(struct netdev *, const struct smap *details);
240 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
241 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
243 * The contents of 'details' should be documented as valid for 'ovs_name'
244 * in the "other_config" column in the "Queue" table in
245 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
247 * The implementation should not need to perform any Netlink calls, because
248 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
249 * cached the queue configuration.
251 * This function may be null if 'tc' does not have queues ('n_queues' is
253 int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
254 struct smap *details);
256 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
257 * 'details', performing any required Netlink calls to complete the
258 * reconfiguration. The caller ensures that 'queue_id' is less than
261 * The contents of 'details' should be documented as valid for 'ovs_name'
262 * in the "other_config" column in the "Queue" table in
263 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
265 * This function may be null if 'tc' does not have queues or its queues are
266 * not configurable. */
267 int (*class_set)(struct netdev *, unsigned int queue_id,
268 const struct smap *details);
270 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
271 * tc_queue's within 'netdev->tc->queues'.
273 * This function may be null if 'tc' does not have queues or its queues
274 * cannot be deleted. */
275 int (*class_delete)(struct netdev *, struct tc_queue *queue);
277 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
278 * 'struct tc_queue's within 'netdev->tc->queues'.
280 * On success, initializes '*stats'.
282 * This function may be null if 'tc' does not have queues or if it cannot
283 * report queue statistics. */
284 int (*class_get_stats)(const struct netdev *netdev,
285 const struct tc_queue *queue,
286 struct netdev_queue_stats *stats);
288 /* Extracts queue stats from 'nlmsg', which is a response to a
289 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
291 * This function may be null if 'tc' does not have queues or if it cannot
292 * report queue statistics. */
293 int (*class_dump_stats)(const struct netdev *netdev,
294 const struct ofpbuf *nlmsg,
295 netdev_dump_queue_stats_cb *cb, void *aux);
299 tc_init(struct tc *tc, const struct tc_ops *ops)
302 hmap_init(&tc->queues);
306 tc_destroy(struct tc *tc)
308 hmap_destroy(&tc->queues);
311 static const struct tc_ops tc_ops_htb;
312 static const struct tc_ops tc_ops_hfsc;
313 static const struct tc_ops tc_ops_default;
314 static const struct tc_ops tc_ops_other;
316 static const struct tc_ops *const tcs[] = {
317 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
318 &tc_ops_hfsc, /* Hierarchical fair service curve. */
319 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
320 &tc_ops_other, /* Some other qdisc. */
324 static unsigned int tc_make_handle(unsigned int major, unsigned int minor);
325 static unsigned int tc_get_major(unsigned int handle);
326 static unsigned int tc_get_minor(unsigned int handle);
328 static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
329 static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
330 static unsigned int tc_buffer_per_jiffy(unsigned int rate);
332 static struct tcmsg *tc_make_request(const struct netdev *, int type,
333 unsigned int flags, struct ofpbuf *);
334 static int tc_transact(struct ofpbuf *request, struct ofpbuf **replyp);
335 static int tc_add_del_ingress_qdisc(struct netdev *netdev, bool add);
336 static int tc_add_policer(struct netdev *netdev, int kbits_rate,
339 static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
340 struct nlattr **options);
341 static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
342 struct nlattr **options,
343 struct netdev_queue_stats *);
344 static int tc_query_class(const struct netdev *,
345 unsigned int handle, unsigned int parent,
346 struct ofpbuf **replyp);
347 static int tc_delete_class(const struct netdev *, unsigned int handle);
349 static int tc_del_qdisc(struct netdev *netdev);
350 static int tc_query_qdisc(const struct netdev *netdev);
352 static int tc_calc_cell_log(unsigned int mtu);
353 static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
354 static void tc_put_rtab(struct ofpbuf *, uint16_t type,
355 const struct tc_ratespec *rate);
356 static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
358 struct netdev_linux {
361 struct shash_node *shash_node;
362 unsigned int cache_valid;
363 unsigned int change_seq;
365 bool miimon; /* Link status of last poll. */
366 long long int miimon_interval; /* Miimon Poll rate. Disabled if <= 0. */
367 struct timer miimon_timer;
369 /* The following are figured out "on demand" only. They are only valid
370 * when the corresponding VALID_* bit in 'cache_valid' is set. */
372 uint8_t etheraddr[ETH_ADDR_LEN];
373 struct in_addr address, netmask;
376 unsigned int ifi_flags;
377 long long int carrier_resets;
378 uint32_t kbits_rate; /* Policing data. */
379 uint32_t kbits_burst;
380 int vport_stats_error; /* Cached error code from vport_get_stats().
381 0 or an errno value. */
382 int netdev_mtu_error; /* Cached error code from SIOCGIFMTU or SIOCSIFMTU. */
383 int ether_addr_error; /* Cached error code from set/get etheraddr. */
384 int netdev_policing_error; /* Cached error code from set policing. */
385 int get_features_error; /* Cached error code from ETHTOOL_GSET. */
386 int get_ifindex_error; /* Cached error code from SIOCGIFINDEX. */
388 enum netdev_features current; /* Cached from ETHTOOL_GSET. */
389 enum netdev_features advertised; /* Cached from ETHTOOL_GSET. */
390 enum netdev_features supported; /* Cached from ETHTOOL_GSET. */
391 enum netdev_features peer; /* Cached from ETHTOOL_GSET. */
393 struct ethtool_drvinfo drvinfo; /* Cached from ETHTOOL_GDRVINFO. */
397 struct tap_state tap;
401 struct netdev_rx_linux {
407 static const struct netdev_rx_class netdev_rx_linux_class;
409 /* Sockets used for ioctl operations. */
410 static int af_inet_sock = -1; /* AF_INET, SOCK_DGRAM. */
412 /* This is set pretty low because we probably won't learn anything from the
413 * additional log messages. */
414 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
416 static int netdev_linux_init(void);
418 static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
419 int cmd, const char *cmd_name);
420 static int netdev_linux_do_ioctl(const char *name, struct ifreq *, int cmd,
421 const char *cmd_name);
422 static int netdev_linux_get_ipv4(const struct netdev *, struct in_addr *,
423 int cmd, const char *cmd_name);
424 static int get_flags(const struct netdev *, unsigned int *flags);
425 static int set_flags(const char *, unsigned int flags);
426 static int do_get_ifindex(const char *netdev_name);
427 static int get_ifindex(const struct netdev *, int *ifindexp);
428 static int do_set_addr(struct netdev *netdev,
429 int ioctl_nr, const char *ioctl_name,
430 struct in_addr addr);
431 static int get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN]);
432 static int set_etheraddr(const char *netdev_name, const uint8_t[ETH_ADDR_LEN]);
433 static int get_stats_via_netlink(int ifindex, struct netdev_stats *stats);
434 static int get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats);
435 static int af_packet_sock(void);
436 static void netdev_linux_miimon_run(void);
437 static void netdev_linux_miimon_wait(void);
440 is_netdev_linux_class(const struct netdev_class *netdev_class)
442 return netdev_class->init == netdev_linux_init;
446 is_tap_netdev(const struct netdev *netdev)
448 return netdev_get_class(netdev) == &netdev_tap_class;
451 static struct netdev_linux *
452 netdev_linux_cast(const struct netdev *netdev)
454 ovs_assert(is_netdev_linux_class(netdev_get_class(netdev)));
456 return CONTAINER_OF(netdev, struct netdev_linux, up);
459 static struct netdev_rx_linux *
460 netdev_rx_linux_cast(const struct netdev_rx *rx)
462 netdev_rx_assert_class(rx, &netdev_rx_linux_class);
463 return CONTAINER_OF(rx, struct netdev_rx_linux, up);
467 netdev_linux_init(void)
469 static int status = -1;
471 /* Create AF_INET socket. */
472 af_inet_sock = socket(AF_INET, SOCK_DGRAM, 0);
473 status = af_inet_sock >= 0 ? 0 : errno;
475 VLOG_ERR("failed to create inet socket: %s", ovs_strerror(status));
482 netdev_linux_run(void)
484 rtnetlink_link_run();
485 netdev_linux_miimon_run();
489 netdev_linux_wait(void)
491 rtnetlink_link_wait();
492 netdev_linux_miimon_wait();
496 netdev_linux_changed(struct netdev_linux *dev,
497 unsigned int ifi_flags, unsigned int mask)
500 if (!dev->change_seq) {
504 if ((dev->ifi_flags ^ ifi_flags) & IFF_RUNNING) {
505 dev->carrier_resets++;
507 dev->ifi_flags = ifi_flags;
509 dev->cache_valid &= mask;
513 netdev_linux_update(struct netdev_linux *dev,
514 const struct rtnetlink_link_change *change)
516 if (change->nlmsg_type == RTM_NEWLINK) {
518 netdev_linux_changed(dev, change->ifi_flags, VALID_DRVINFO);
520 /* Update netdev from rtnl-change msg. */
522 dev->mtu = change->mtu;
523 dev->cache_valid |= VALID_MTU;
524 dev->netdev_mtu_error = 0;
527 if (!eth_addr_is_zero(change->addr)) {
528 memcpy(dev->etheraddr, change->addr, ETH_ADDR_LEN);
529 dev->cache_valid |= VALID_ETHERADDR;
530 dev->ether_addr_error = 0;
533 dev->ifindex = change->ifi_index;
534 dev->cache_valid |= VALID_IFINDEX;
535 dev->get_ifindex_error = 0;
538 netdev_linux_changed(dev, change->ifi_flags, 0);
543 netdev_linux_cache_cb(const struct rtnetlink_link_change *change,
544 void *aux OVS_UNUSED)
546 struct netdev_linux *dev;
548 struct netdev *base_dev = netdev_from_name(change->ifname);
549 if (base_dev && is_netdev_linux_class(netdev_get_class(base_dev))) {
550 netdev_linux_update(netdev_linux_cast(base_dev), change);
553 struct shash device_shash;
554 struct shash_node *node;
556 shash_init(&device_shash);
557 netdev_get_devices(&netdev_linux_class, &device_shash);
558 SHASH_FOR_EACH (node, &device_shash) {
563 get_flags(&dev->up, &flags);
564 netdev_linux_changed(dev, flags, 0);
566 shash_destroy(&device_shash);
571 cache_notifier_ref(void)
573 if (!cache_notifier_refcount) {
574 ovs_assert(!netdev_linux_cache_notifier);
576 netdev_linux_cache_notifier =
577 rtnetlink_link_notifier_create(netdev_linux_cache_cb, NULL);
579 if (!netdev_linux_cache_notifier) {
583 cache_notifier_refcount++;
589 cache_notifier_unref(void)
591 ovs_assert(cache_notifier_refcount > 0);
592 if (!--cache_notifier_refcount) {
593 ovs_assert(netdev_linux_cache_notifier);
594 rtnetlink_link_notifier_destroy(netdev_linux_cache_notifier);
595 netdev_linux_cache_notifier = NULL;
599 /* Creates system and internal devices. */
601 netdev_linux_create(const struct netdev_class *class, const char *name,
602 struct netdev **netdevp)
604 struct netdev_linux *netdev;
607 error = cache_notifier_ref();
612 netdev = xzalloc(sizeof *netdev);
613 netdev->change_seq = 1;
614 netdev_init(&netdev->up, name, class);
615 error = get_flags(&netdev->up, &netdev->ifi_flags);
616 if (error == ENODEV) {
617 if (class != &netdev_internal_class) {
618 /* The device does not exist, so don't allow it to be opened. */
619 netdev_uninit(&netdev->up, false);
620 cache_notifier_unref();
624 /* "Internal" netdevs have to be created as netdev objects before
625 * they exist in the kernel, because creating them in the kernel
626 * happens by passing a netdev object to dpif_port_add().
627 * Therefore, ignore the error. */
631 *netdevp = &netdev->up;
635 /* For most types of netdevs we open the device for each call of
636 * netdev_open(). However, this is not the case with tap devices,
637 * since it is only possible to open the device once. In this
638 * situation we share a single file descriptor, and consequently
639 * buffers, across all readers. Therefore once data is read it will
640 * be unavailable to other reads for tap devices. */
642 netdev_linux_create_tap(const struct netdev_class *class OVS_UNUSED,
643 const char *name, struct netdev **netdevp)
645 struct netdev_linux *netdev;
646 struct tap_state *state;
647 static const char tap_dev[] = "/dev/net/tun";
651 netdev = xzalloc(sizeof *netdev);
652 state = &netdev->state.tap;
654 error = cache_notifier_ref();
659 /* Open tap device. */
660 state->fd = open(tap_dev, O_RDWR);
663 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, ovs_strerror(error));
664 goto error_unref_notifier;
667 /* Create tap device. */
668 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
669 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
670 if (ioctl(state->fd, TUNSETIFF, &ifr) == -1) {
671 VLOG_WARN("%s: creating tap device failed: %s", name,
672 ovs_strerror(errno));
674 goto error_unref_notifier;
677 /* Make non-blocking. */
678 error = set_nonblocking(state->fd);
680 goto error_unref_notifier;
683 netdev_init(&netdev->up, name, &netdev_tap_class);
684 *netdevp = &netdev->up;
687 error_unref_notifier:
688 cache_notifier_unref();
695 destroy_tap(struct netdev_linux *netdev)
697 struct tap_state *state = &netdev->state.tap;
699 if (state->fd >= 0) {
704 /* Destroys the netdev device 'netdev_'. */
706 netdev_linux_destroy(struct netdev *netdev_)
708 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
710 if (netdev->tc && netdev->tc->ops->tc_destroy) {
711 netdev->tc->ops->tc_destroy(netdev->tc);
714 if (netdev_get_class(netdev_) == &netdev_tap_class) {
719 cache_notifier_unref();
723 netdev_linux_rx_open(struct netdev *netdev_, struct netdev_rx **rxp)
725 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
726 bool is_tap = is_tap_netdev(netdev_);
727 struct netdev_rx_linux *rx;
732 fd = netdev->state.tap.fd;
734 struct sockaddr_ll sll;
736 /* Result of tcpdump -dd inbound */
737 static struct sock_filter filt[] = {
738 { 0x28, 0, 0, 0xfffff004 }, /* ldh [0] */
739 { 0x15, 0, 1, 0x00000004 }, /* jeq #4 jt 2 jf 3 */
740 { 0x6, 0, 0, 0x00000000 }, /* ret #0 */
741 { 0x6, 0, 0, 0x0000ffff } /* ret #65535 */
743 static struct sock_fprog fprog = { ARRAY_SIZE(filt), filt };
745 /* Create file descriptor. */
746 fd = socket(PF_PACKET, SOCK_RAW, 0);
749 VLOG_ERR("failed to create raw socket (%s)", ovs_strerror(error));
753 /* Set non-blocking mode. */
754 error = set_nonblocking(fd);
759 /* Get ethernet device index. */
760 error = get_ifindex(&netdev->up, &ifindex);
765 /* Bind to specific ethernet device. */
766 memset(&sll, 0, sizeof sll);
767 sll.sll_family = AF_PACKET;
768 sll.sll_ifindex = ifindex;
769 sll.sll_protocol = (OVS_FORCE unsigned short int) htons(ETH_P_ALL);
770 if (bind(fd, (struct sockaddr *) &sll, sizeof sll) < 0) {
772 VLOG_ERR("%s: failed to bind raw socket (%s)",
773 netdev_get_name(netdev_), ovs_strerror(error));
777 /* Filter for only inbound packets. */
778 error = setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER, &fprog,
782 VLOG_ERR("%s: failed attach filter (%s)",
783 netdev_get_name(netdev_), ovs_strerror(error));
788 rx = xmalloc(sizeof *rx);
789 netdev_rx_init(&rx->up, netdev_, &netdev_rx_linux_class);
804 netdev_rx_linux_destroy(struct netdev_rx *rx_)
806 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
815 netdev_rx_linux_recv(struct netdev_rx *rx_, void *data, size_t size)
817 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
822 ? read(rx->fd, data, size)
823 : recv(rx->fd, data, size, MSG_TRUNC));
824 } while (retval < 0 && errno == EINTR);
827 return retval > size ? -EMSGSIZE : retval;
829 if (errno != EAGAIN) {
830 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
831 netdev_rx_get_name(rx_), ovs_strerror(errno));
838 netdev_rx_linux_wait(struct netdev_rx *rx_)
840 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
841 poll_fd_wait(rx->fd, POLLIN);
845 netdev_rx_linux_drain(struct netdev_rx *rx_)
847 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
850 int error = netdev_linux_do_ioctl(netdev_rx_get_name(rx_), &ifr,
851 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
855 drain_fd(rx->fd, ifr.ifr_qlen);
858 return drain_rcvbuf(rx->fd);
862 /* Sends 'data' on 'netdev'. Returns 0 if successful, otherwise a positive
863 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
864 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
865 * the packet is too big or too small to transmit on the device.
867 * The caller retains ownership of 'data' in all cases.
869 * The kernel maintains a packet transmission queue, so the caller is not
870 * expected to do additional queuing of packets. */
872 netdev_linux_send(struct netdev *netdev_, const void *data, size_t size)
877 if (!is_tap_netdev(netdev_)) {
878 /* Use our AF_PACKET socket to send to this device. */
879 struct sockaddr_ll sll;
886 sock = af_packet_sock();
891 error = get_ifindex(netdev_, &ifindex);
896 /* We don't bother setting most fields in sockaddr_ll because the
897 * kernel ignores them for SOCK_RAW. */
898 memset(&sll, 0, sizeof sll);
899 sll.sll_family = AF_PACKET;
900 sll.sll_ifindex = ifindex;
902 iov.iov_base = CONST_CAST(void *, data);
906 msg.msg_namelen = sizeof sll;
909 msg.msg_control = NULL;
910 msg.msg_controllen = 0;
913 retval = sendmsg(sock, &msg, 0);
915 /* Use the tap fd to send to this device. This is essential for
916 * tap devices, because packets sent to a tap device with an
917 * AF_PACKET socket will loop back to be *received* again on the
918 * tap device. This doesn't occur on other interface types
919 * because we attach a socket filter to the rx socket. */
920 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
922 retval = write(netdev->state.tap.fd, data, size);
926 /* The Linux AF_PACKET implementation never blocks waiting for room
927 * for packets, instead returning ENOBUFS. Translate this into
928 * EAGAIN for the caller. */
929 if (errno == ENOBUFS) {
931 } else if (errno == EINTR) {
933 } else if (errno != EAGAIN) {
934 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
935 netdev_get_name(netdev_), ovs_strerror(errno));
938 } else if (retval != size) {
939 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%zd bytes of "
940 "%zu) on %s", retval, size, netdev_get_name(netdev_));
948 /* Registers with the poll loop to wake up from the next call to poll_block()
949 * when the packet transmission queue has sufficient room to transmit a packet
950 * with netdev_send().
952 * The kernel maintains a packet transmission queue, so the client is not
953 * expected to do additional queuing of packets. Thus, this function is
954 * unlikely to ever be used. It is included for completeness. */
956 netdev_linux_send_wait(struct netdev *netdev)
958 if (is_tap_netdev(netdev)) {
959 /* TAP device always accepts packets. */
960 poll_immediate_wake();
964 /* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
965 * otherwise a positive errno value. */
967 netdev_linux_set_etheraddr(struct netdev *netdev_,
968 const uint8_t mac[ETH_ADDR_LEN])
970 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
971 struct netdev_saved_flags *sf = NULL;
974 if (netdev->cache_valid & VALID_ETHERADDR) {
975 if (netdev->ether_addr_error) {
976 return netdev->ether_addr_error;
978 if (eth_addr_equals(netdev->etheraddr, mac)) {
981 netdev->cache_valid &= ~VALID_ETHERADDR;
984 /* Tap devices must be brought down before setting the address. */
985 if (is_tap_netdev(netdev_)) {
986 netdev_turn_flags_off(netdev_, NETDEV_UP, &sf);
988 error = set_etheraddr(netdev_get_name(netdev_), mac);
989 if (!error || error == ENODEV) {
990 netdev->ether_addr_error = error;
991 netdev->cache_valid |= VALID_ETHERADDR;
993 memcpy(netdev->etheraddr, mac, ETH_ADDR_LEN);
997 netdev_restore_flags(sf);
1002 /* Copies 'netdev''s MAC address to 'mac' which is passed as param. */
1004 netdev_linux_get_etheraddr(const struct netdev *netdev_,
1005 uint8_t mac[ETH_ADDR_LEN])
1007 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1009 if (!(netdev->cache_valid & VALID_ETHERADDR)) {
1010 int error = get_etheraddr(netdev_get_name(netdev_),
1013 netdev->ether_addr_error = error;
1014 netdev->cache_valid |= VALID_ETHERADDR;
1017 if (!netdev->ether_addr_error) {
1018 memcpy(mac, netdev->etheraddr, ETH_ADDR_LEN);
1021 return netdev->ether_addr_error;
1024 /* Returns the maximum size of transmitted (and received) packets on 'netdev',
1025 * in bytes, not including the hardware header; thus, this is typically 1500
1026 * bytes for Ethernet devices. */
1028 netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
1030 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1031 if (!(netdev->cache_valid & VALID_MTU)) {
1035 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
1036 SIOCGIFMTU, "SIOCGIFMTU");
1038 netdev->netdev_mtu_error = error;
1039 netdev->mtu = ifr.ifr_mtu;
1040 netdev->cache_valid |= VALID_MTU;
1043 if (!netdev->netdev_mtu_error) {
1044 *mtup = netdev->mtu;
1046 return netdev->netdev_mtu_error;
1049 /* Sets the maximum size of transmitted packets (MTU) for the given device
1050 * using the Linux networking ioctl interface.
1053 netdev_linux_set_mtu(const struct netdev *netdev_, int mtu)
1055 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1059 if (netdev->cache_valid & VALID_MTU) {
1060 if (netdev->netdev_mtu_error) {
1061 return netdev->netdev_mtu_error;
1063 if (netdev->mtu == mtu) {
1066 netdev->cache_valid &= ~VALID_MTU;
1069 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
1070 SIOCSIFMTU, "SIOCSIFMTU");
1071 if (!error || error == ENODEV) {
1072 netdev->netdev_mtu_error = error;
1073 netdev->mtu = ifr.ifr_mtu;
1074 netdev->cache_valid |= VALID_MTU;
1079 /* Returns the ifindex of 'netdev', if successful, as a positive number.
1080 * On failure, returns a negative errno value. */
1082 netdev_linux_get_ifindex(const struct netdev *netdev)
1086 error = get_ifindex(netdev, &ifindex);
1087 return error ? -error : ifindex;
1091 netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
1093 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1095 if (netdev->miimon_interval > 0) {
1096 *carrier = netdev->miimon;
1098 *carrier = (netdev->ifi_flags & IFF_RUNNING) != 0;
1104 static long long int
1105 netdev_linux_get_carrier_resets(const struct netdev *netdev)
1107 return netdev_linux_cast(netdev)->carrier_resets;
1111 netdev_linux_do_miimon(const char *name, int cmd, const char *cmd_name,
1112 struct mii_ioctl_data *data)
1117 memset(&ifr, 0, sizeof ifr);
1118 memcpy(&ifr.ifr_data, data, sizeof *data);
1119 error = netdev_linux_do_ioctl(name, &ifr, cmd, cmd_name);
1120 memcpy(data, &ifr.ifr_data, sizeof *data);
1126 netdev_linux_get_miimon(const char *name, bool *miimon)
1128 struct mii_ioctl_data data;
1133 memset(&data, 0, sizeof data);
1134 error = netdev_linux_do_miimon(name, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
1136 /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
1137 data.reg_num = MII_BMSR;
1138 error = netdev_linux_do_miimon(name, SIOCGMIIREG, "SIOCGMIIREG",
1142 *miimon = !!(data.val_out & BMSR_LSTATUS);
1144 VLOG_WARN_RL(&rl, "%s: failed to query MII", name);
1147 struct ethtool_cmd ecmd;
1149 VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
1152 COVERAGE_INC(netdev_get_ethtool);
1153 memset(&ecmd, 0, sizeof ecmd);
1154 error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
1157 struct ethtool_value eval;
1159 memcpy(&eval, &ecmd, sizeof eval);
1160 *miimon = !!eval.data;
1162 VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
1170 netdev_linux_set_miimon_interval(struct netdev *netdev_,
1171 long long int interval)
1173 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1175 interval = interval > 0 ? MAX(interval, 100) : 0;
1176 if (netdev->miimon_interval != interval) {
1177 netdev->miimon_interval = interval;
1178 timer_set_expired(&netdev->miimon_timer);
/* Walks every device of 'netdev_linux_class' and refreshes its MII link
 * status via netdev_linux_get_miimon().  When the observed status differs
 * from the cached 'dev->miimon', the cache is updated and
 * netdev_linux_changed() is called.  The miimon timer is then re-armed for
 * 'dev->miimon_interval'. */
1185 netdev_linux_miimon_run(void)
1187 struct shash device_shash;
1188 struct shash_node *node;
1190 shash_init(&device_shash);
1191 netdev_get_devices(&netdev_linux_class, &device_shash);
1192 SHASH_FOR_EACH (node, &device_shash) {
1193 struct netdev_linux *dev = node->data;
/* The test below matches devices with monitoring disabled or whose timer has
 * not expired yet.  NOTE(review): its body is not visible in this excerpt;
 * presumably such devices are skipped. */
1196 if (dev->miimon_interval <= 0 || !timer_expired(&dev->miimon_timer)) {
1200 netdev_linux_get_miimon(dev->up.name, &miimon);
1201 if (miimon != dev->miimon) {
1202 dev->miimon = miimon;
1203 netdev_linux_changed(dev, dev->ifi_flags, 0);
1206 timer_set_duration(&dev->miimon_timer, dev->miimon_interval);
1209 shash_destroy(&device_shash);
/* Calls timer_wait() on the miimon timer of every 'netdev_linux_class' device
 * whose 'miimon_interval' is positive. */
1213 netdev_linux_miimon_wait(void)
1215 struct shash device_shash;
1216 struct shash_node *node;
1218 shash_init(&device_shash);
1219 netdev_get_devices(&netdev_linux_class, &device_shash);
1220 SHASH_FOR_EACH (node, &device_shash) {
1221 struct netdev_linux *dev = node->data;
1223 if (dev->miimon_interval > 0) {
1224 timer_wait(&dev->miimon_timer);
1227 shash_destroy(&device_shash);
1230 /* Check whether we can use RTM_GETLINK to get network device statistics.
1231 * In pre-2.6.19 kernels, this was only available if wireless extensions were
1234 check_for_working_netlink_stats(void)
1236 /* Decide on the netdev_get_stats() implementation to use. Netlink is
1237 * preferable, so if that works, we'll use it. */
1238 int ifindex = do_get_ifindex("lo");
/* "lo" is the probe device: if its ifindex cannot be obtained, or
 * get_stats_via_netlink() fails on it, the messages below report that stats
 * will come from proc instead.
 * NOTE(review): the conditions and return statements are not visible in this
 * excerpt. */
1240 VLOG_WARN("failed to get ifindex for lo, "
1241 "obtaining netdev stats from proc");
1244 struct netdev_stats stats;
1245 int error = get_stats_via_netlink(ifindex, &stats);
1247 VLOG_DBG("obtaining netdev stats via rtnetlink");
1250 VLOG_INFO("RTM_GETLINK failed (%s), obtaining netdev stats "
1251 "via proc (you are probably running a pre-2.6.19 "
1252 "kernel)", ovs_strerror(error));
/* Helper taking two uint64_t pointers; netdev_tap_get_stats() calls it on
 * matching rx/tx counter pairs.
 * NOTE(review): the body is not visible in this excerpt -- presumably it
 * exchanges '*a' and '*b'; confirm. */
1259 swap_uint64(uint64_t *a, uint64_t *b)
1266 /* Copies 'src' into 'dst', performing format conversion in the process.
1268 * 'src' is allowed to be misaligned. */
1270 netdev_stats_from_ovs_vport_stats(struct netdev_stats *dst,
1271 const struct ovs_vport_stats *src)
1273 dst->rx_packets = get_unaligned_u64(&src->rx_packets);
1274 dst->tx_packets = get_unaligned_u64(&src->tx_packets);
1275 dst->rx_bytes = get_unaligned_u64(&src->rx_bytes);
1276 dst->tx_bytes = get_unaligned_u64(&src->tx_bytes);
1277 dst->rx_errors = get_unaligned_u64(&src->rx_errors);
1278 dst->tx_errors = get_unaligned_u64(&src->tx_errors);
1279 dst->rx_dropped = get_unaligned_u64(&src->rx_dropped);
1280 dst->tx_dropped = get_unaligned_u64(&src->tx_dropped);
/* The remaining counters are not taken from 'src'; they are set to zero. */
1282 dst->collisions = 0;
1283 dst->rx_length_errors = 0;
1284 dst->rx_over_errors = 0;
1285 dst->rx_crc_errors = 0;
1286 dst->rx_frame_errors = 0;
1287 dst->rx_fifo_errors = 0;
1288 dst->rx_missed_errors = 0;
1289 dst->tx_aborted_errors = 0;
1290 dst->tx_carrier_errors = 0;
1291 dst->tx_fifo_errors = 0;
1292 dst->tx_heartbeat_errors = 0;
1293 dst->tx_window_errors = 0;
/* Looks up the vport named after 'netdev' with dpif_linux_vport_get() and
 * converts the statistics carried in the reply into '*stats'.  A reply
 * without statistics ('!reply.stats') is handled as a separate case.
 * NOTE(review): the error returns and the release of 'buf' are not visible in
 * this excerpt. */
1297 get_stats_via_vport__(const struct netdev *netdev, struct netdev_stats *stats)
1299 struct dpif_linux_vport reply;
1303 error = dpif_linux_vport_get(netdev_get_name(netdev), &reply, &buf);
1306 } else if (!reply.stats) {
1311 netdev_stats_from_ovs_vport_stats(stats, reply.stats);
/* Fills '*stats' from the vport layer and records the outcome in
 * 'netdev->vport_stats_error', marking it valid with VALID_VPORT_STAT_ERROR.
 * The query is attempted when no error is cached, or when the cached error is
 * not marked valid.  Failures other than ENOENT are logged (rate-limited). */
1319 get_stats_via_vport(const struct netdev *netdev_,
1320 struct netdev_stats *stats)
1322 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1324 if (!netdev->vport_stats_error ||
1325 !(netdev->cache_valid & VALID_VPORT_STAT_ERROR)) {
1328 error = get_stats_via_vport__(netdev_, stats);
1329 if (error && error != ENOENT) {
1330 VLOG_WARN_RL(&rl, "%s: obtaining netdev stats via vport failed "
1332 netdev_get_name(netdev_), ovs_strerror(error));
1334 netdev->vport_stats_error = error;
1335 netdev->cache_valid |= VALID_VPORT_STAT_ERROR;
/* Obtains '*stats' for 'netdev_' from the kernel, either through rtnetlink
 * (get_stats_via_netlink()) or through proc (get_stats_via_proc()).  The
 * choice is made once by check_for_working_netlink_stats() and kept in the
 * function-local static 'use_netlink_stats', where a negative value means
 * "not probed yet".  A warning with the numeric error is logged on failure.
 * NOTE(review): the return statement is not visible in this excerpt. */
1340 netdev_linux_sys_get_stats(const struct netdev *netdev_,
1341 struct netdev_stats *stats)
1343 static int use_netlink_stats = -1;
1346 if (use_netlink_stats < 0) {
1347 use_netlink_stats = check_for_working_netlink_stats();
1350 if (use_netlink_stats) {
1353 error = get_ifindex(netdev_, &ifindex);
1355 error = get_stats_via_netlink(ifindex, stats);
1358 error = get_stats_via_proc(netdev_get_name(netdev_), stats);
1362 VLOG_WARN_RL(&rl, "%s: linux-sys get stats failed %d",
1363 netdev_get_name(netdev_), error);
1369 /* Retrieves current device stats for 'netdev-linux'. */
1371 netdev_linux_get_stats(const struct netdev *netdev_,
1372 struct netdev_stats *stats)
1374 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1375 struct netdev_stats dev_stats;
/* Two sources are consulted: the vport layer writes into '*stats', while the
 * kernel's own counters are read into 'dev_stats'. */
1378 get_stats_via_vport(netdev_, stats);
1380 error = netdev_linux_sys_get_stats(netdev_, &dev_stats);
1383 if (netdev->vport_stats_error) {
1390 if (netdev->vport_stats_error) {
1391 /* stats not available from OVS then use ioctl stats. */
/* The additions below fold the kernel's error, drop, multicast and collision
 * counters into '*stats'.
 * NOTE(review): the branch that encloses them is not visible in this
 * excerpt. */
1394 stats->rx_errors += dev_stats.rx_errors;
1395 stats->tx_errors += dev_stats.tx_errors;
1396 stats->rx_dropped += dev_stats.rx_dropped;
1397 stats->tx_dropped += dev_stats.tx_dropped;
1398 stats->multicast += dev_stats.multicast;
1399 stats->collisions += dev_stats.collisions;
1400 stats->rx_length_errors += dev_stats.rx_length_errors;
1401 stats->rx_over_errors += dev_stats.rx_over_errors;
1402 stats->rx_crc_errors += dev_stats.rx_crc_errors;
1403 stats->rx_frame_errors += dev_stats.rx_frame_errors;
1404 stats->rx_fifo_errors += dev_stats.rx_fifo_errors;
1405 stats->rx_missed_errors += dev_stats.rx_missed_errors;
1406 stats->tx_aborted_errors += dev_stats.tx_aborted_errors;
1407 stats->tx_carrier_errors += dev_stats.tx_carrier_errors;
1408 stats->tx_fifo_errors += dev_stats.tx_fifo_errors;
1409 stats->tx_heartbeat_errors += dev_stats.tx_heartbeat_errors;
1410 stats->tx_window_errors += dev_stats.tx_window_errors;
1415 /* Retrieves current device stats for 'netdev-tap' netdev or
1416 * netdev-internal. */
1418 netdev_tap_get_stats(const struct netdev *netdev_,
1419 struct netdev_stats *stats)
1421 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1422 struct netdev_stats dev_stats;
/* As in netdev_linux_get_stats(): vport stats go into '*stats', kernel
 * counters into 'dev_stats'. */
1425 get_stats_via_vport(netdev_, stats);
1427 error = netdev_linux_sys_get_stats(netdev_, &dev_stats);
1429 if (netdev->vport_stats_error) {
1436 /* If this port is an internal port then the transmit and receive stats
1437 * will appear to be swapped relative to the other ports since we are the
1438 * one sending the data, not a remote computer. For consistency, we swap
1439 * them back here. This does not apply if we are getting stats from the
1440 * vport layer because it always tracks stats from the perspective of the
1442 if (netdev->vport_stats_error) {
1444 swap_uint64(&stats->rx_packets, &stats->tx_packets);
1445 swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
1446 swap_uint64(&stats->rx_errors, &stats->tx_errors);
1447 swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
/* The detailed rx/tx error breakdowns are cleared rather than swapped. */
1448 stats->rx_length_errors = 0;
1449 stats->rx_over_errors = 0;
1450 stats->rx_crc_errors = 0;
1451 stats->rx_frame_errors = 0;
1452 stats->rx_fifo_errors = 0;
1453 stats->rx_missed_errors = 0;
1454 stats->tx_aborted_errors = 0;
1455 stats->tx_carrier_errors = 0;
1456 stats->tx_fifo_errors = 0;
1457 stats->tx_heartbeat_errors = 0;
1458 stats->tx_window_errors = 0;
/* Kernel drop and error counters are folded in with rx and tx exchanged;
 * multicast and collisions are added as-is.
 * NOTE(review): the branch that encloses these additions is not visible in
 * this excerpt. */
1460 stats->rx_dropped += dev_stats.tx_dropped;
1461 stats->tx_dropped += dev_stats.rx_dropped;
1463 stats->rx_errors += dev_stats.tx_errors;
1464 stats->tx_errors += dev_stats.rx_errors;
1466 stats->multicast += dev_stats.multicast;
1467 stats->collisions += dev_stats.collisions;
/* Retrieves stats for 'netdev_' from the vport layer only, and returns
 * 'netdev->vport_stats_error' as left by get_stats_via_vport(). */
1473 netdev_internal_get_stats(const struct netdev *netdev_,
1474 struct netdev_stats *stats)
1476 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1478 get_stats_via_vport(netdev_, stats);
1479 return netdev->vport_stats_error;
/* Pushes the packet, byte, error and drop counters from 'stats' to the vport
 * named after 'netdev', using an OVS_VPORT_CMD_SET transaction.  Only the
 * eight counters copied below are transferred.
 * NOTE(review): the return statements are not visible in this excerpt. */
1483 netdev_internal_set_stats(struct netdev *netdev,
1484 const struct netdev_stats *stats)
1486 struct ovs_vport_stats vport_stats;
1487 struct dpif_linux_vport vport;
1490 vport_stats.rx_packets = stats->rx_packets;
1491 vport_stats.tx_packets = stats->tx_packets;
1492 vport_stats.rx_bytes = stats->rx_bytes;
1493 vport_stats.tx_bytes = stats->tx_bytes;
1494 vport_stats.rx_errors = stats->rx_errors;
1495 vport_stats.tx_errors = stats->tx_errors;
1496 vport_stats.rx_dropped = stats->rx_dropped;
1497 vport_stats.tx_dropped = stats->tx_dropped;
1499 dpif_linux_vport_init(&vport);
1500 vport.cmd = OVS_VPORT_CMD_SET;
1501 vport.name = netdev_get_name(netdev);
1502 vport.stats = &vport_stats;
1504 err = dpif_linux_vport_transact(&vport, NULL, NULL);
1506 /* If the vport layer doesn't know about the device, that doesn't mean it
1507 * doesn't exist (after all we were able to open it when netdev_open() was
1508 * called), it just means that it isn't attached and we'll be getting
1509 * stats a different way. */
1510 if (err == ENODEV) {
/* Refreshes the cached 'supported', 'advertised', 'current' and 'peer'
 * feature bitmaps of 'netdev' from an ETHTOOL_GSET query.  The work is guarded
 * by the VALID_FEATURES bit in 'cache_valid'.  At the end, VALID_FEATURES is
 * set and the outcome is stored in 'netdev->get_features_error'.
 * NOTE(review): the early-exit paths are not visible in this excerpt. */
1518 netdev_linux_read_features(struct netdev_linux *netdev)
1520 struct ethtool_cmd ecmd;
1524 if (netdev->cache_valid & VALID_FEATURES) {
1528 COVERAGE_INC(netdev_get_ethtool);
1529 memset(&ecmd, 0, sizeof ecmd);
1530 error = netdev_linux_do_ethtool(netdev->up.name, &ecmd,
1531 ETHTOOL_GSET, "ETHTOOL_GSET");
1536 /* Supported features. */
1537 netdev->supported = 0;
1538 if (ecmd.supported & SUPPORTED_10baseT_Half) {
1539 netdev->supported |= NETDEV_F_10MB_HD;
1541 if (ecmd.supported & SUPPORTED_10baseT_Full) {
1542 netdev->supported |= NETDEV_F_10MB_FD;
1544 if (ecmd.supported & SUPPORTED_100baseT_Half) {
1545 netdev->supported |= NETDEV_F_100MB_HD;
1547 if (ecmd.supported & SUPPORTED_100baseT_Full) {
1548 netdev->supported |= NETDEV_F_100MB_FD;
1550 if (ecmd.supported & SUPPORTED_1000baseT_Half) {
1551 netdev->supported |= NETDEV_F_1GB_HD;
1553 if (ecmd.supported & SUPPORTED_1000baseT_Full) {
1554 netdev->supported |= NETDEV_F_1GB_FD;
1556 if (ecmd.supported & SUPPORTED_10000baseT_Full) {
1557 netdev->supported |= NETDEV_F_10GB_FD;
1559 if (ecmd.supported & SUPPORTED_TP) {
1560 netdev->supported |= NETDEV_F_COPPER;
1562 if (ecmd.supported & SUPPORTED_FIBRE) {
1563 netdev->supported |= NETDEV_F_FIBER;
1565 if (ecmd.supported & SUPPORTED_Autoneg) {
1566 netdev->supported |= NETDEV_F_AUTONEG;
1568 if (ecmd.supported & SUPPORTED_Pause) {
1569 netdev->supported |= NETDEV_F_PAUSE;
1571 if (ecmd.supported & SUPPORTED_Asym_Pause) {
1572 netdev->supported |= NETDEV_F_PAUSE_ASYM;
1575 /* Advertised features. */
1576 netdev->advertised = 0;
1577 if (ecmd.advertising & ADVERTISED_10baseT_Half) {
1578 netdev->advertised |= NETDEV_F_10MB_HD;
1580 if (ecmd.advertising & ADVERTISED_10baseT_Full) {
1581 netdev->advertised |= NETDEV_F_10MB_FD;
1583 if (ecmd.advertising & ADVERTISED_100baseT_Half) {
1584 netdev->advertised |= NETDEV_F_100MB_HD;
1586 if (ecmd.advertising & ADVERTISED_100baseT_Full) {
1587 netdev->advertised |= NETDEV_F_100MB_FD;
1589 if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
1590 netdev->advertised |= NETDEV_F_1GB_HD;
1592 if (ecmd.advertising & ADVERTISED_1000baseT_Full) {
1593 netdev->advertised |= NETDEV_F_1GB_FD;
1595 if (ecmd.advertising & ADVERTISED_10000baseT_Full) {
1596 netdev->advertised |= NETDEV_F_10GB_FD;
1598 if (ecmd.advertising & ADVERTISED_TP) {
1599 netdev->advertised |= NETDEV_F_COPPER;
1601 if (ecmd.advertising & ADVERTISED_FIBRE) {
1602 netdev->advertised |= NETDEV_F_FIBER;
1604 if (ecmd.advertising & ADVERTISED_Autoneg) {
1605 netdev->advertised |= NETDEV_F_AUTONEG;
1607 if (ecmd.advertising & ADVERTISED_Pause) {
1608 netdev->advertised |= NETDEV_F_PAUSE;
1610 if (ecmd.advertising & ADVERTISED_Asym_Pause) {
1611 netdev->advertised |= NETDEV_F_PAUSE_ASYM;
1614 /* Current settings. */
/* 'ecmd.duplex' selects between the _FD and _HD bit only up to SPEED_1000;
 * from SPEED_10000 upward the full-duplex bit is always used.  Speeds above
 * SPEED_10000 are compared as numeric literals.
 * NOTE(review): 'speed' is assigned on a line not visible in this excerpt. */
1616 if (speed == SPEED_10) {
1617 netdev->current = ecmd.duplex ? NETDEV_F_10MB_FD : NETDEV_F_10MB_HD;
1618 } else if (speed == SPEED_100) {
1619 netdev->current = ecmd.duplex ? NETDEV_F_100MB_FD : NETDEV_F_100MB_HD;
1620 } else if (speed == SPEED_1000) {
1621 netdev->current = ecmd.duplex ? NETDEV_F_1GB_FD : NETDEV_F_1GB_HD;
1622 } else if (speed == SPEED_10000) {
1623 netdev->current = NETDEV_F_10GB_FD;
1624 } else if (speed == 40000) {
1625 netdev->current = NETDEV_F_40GB_FD;
1626 } else if (speed == 100000) {
1627 netdev->current = NETDEV_F_100GB_FD;
1628 } else if (speed == 1000000) {
1629 netdev->current = NETDEV_F_1TB_FD;
1631 netdev->current = 0;
1634 if (ecmd.port == PORT_TP) {
1635 netdev->current |= NETDEV_F_COPPER;
1636 } else if (ecmd.port == PORT_FIBRE) {
1637 netdev->current |= NETDEV_F_FIBER;
1641 netdev->current |= NETDEV_F_AUTONEG;
1644 /* Peer advertisements. */
1645 netdev->peer = 0; /* XXX */
1648 netdev->cache_valid |= VALID_FEATURES;
1649 netdev->get_features_error = error;
1652 /* Stores the features supported by 'netdev' into each of '*current',
1653 * '*advertised', '*supported', and '*peer' that are non-null. Each value is a
1654 * bitmap of NETDEV_* bits. Returns 0 if successful, otherwise a positive
1657 netdev_linux_get_features(const struct netdev *netdev_,
1658 enum netdev_features *current,
1659 enum netdev_features *advertised,
1660 enum netdev_features *supported,
1661 enum netdev_features *peer)
1663 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1665 netdev_linux_read_features(netdev);
1667 if (!netdev->get_features_error) {
/* NOTE(review): all four output pointers are written unconditionally here,
 * although the header comment speaks of those "that are non-null".  Callers
 * presumably always pass valid pointers -- confirm. */
1668 *current = netdev->current;
1669 *advertised = netdev->advertised;
1670 *supported = netdev->supported;
1671 *peer = netdev->peer;
1673 return netdev->get_features_error;
1676 /* Set the features advertised by 'netdev' to 'advertise'. */
1678 netdev_linux_set_advertisements(struct netdev *netdev,
1679 enum netdev_features advertise)
1681 struct ethtool_cmd ecmd;
/* Read-modify-write: the current settings are fetched with ETHTOOL_GSET, only
 * 'ecmd.advertising' is rebuilt from 'advertise', and the result is written
 * back with ETHTOOL_SSET. */
1684 COVERAGE_INC(netdev_get_ethtool);
1685 memset(&ecmd, 0, sizeof ecmd);
1686 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1687 ETHTOOL_GSET, "ETHTOOL_GSET");
1692 ecmd.advertising = 0;
1693 if (advertise & NETDEV_F_10MB_HD) {
1694 ecmd.advertising |= ADVERTISED_10baseT_Half;
1696 if (advertise & NETDEV_F_10MB_FD) {
1697 ecmd.advertising |= ADVERTISED_10baseT_Full;
1699 if (advertise & NETDEV_F_100MB_HD) {
1700 ecmd.advertising |= ADVERTISED_100baseT_Half;
1702 if (advertise & NETDEV_F_100MB_FD) {
1703 ecmd.advertising |= ADVERTISED_100baseT_Full;
1705 if (advertise & NETDEV_F_1GB_HD) {
1706 ecmd.advertising |= ADVERTISED_1000baseT_Half;
1708 if (advertise & NETDEV_F_1GB_FD) {
1709 ecmd.advertising |= ADVERTISED_1000baseT_Full;
1711 if (advertise & NETDEV_F_10GB_FD) {
1712 ecmd.advertising |= ADVERTISED_10000baseT_Full;
1714 if (advertise & NETDEV_F_COPPER) {
1715 ecmd.advertising |= ADVERTISED_TP;
1717 if (advertise & NETDEV_F_FIBER) {
1718 ecmd.advertising |= ADVERTISED_FIBRE;
1720 if (advertise & NETDEV_F_AUTONEG) {
1721 ecmd.advertising |= ADVERTISED_Autoneg;
1723 if (advertise & NETDEV_F_PAUSE) {
1724 ecmd.advertising |= ADVERTISED_Pause;
1726 if (advertise & NETDEV_F_PAUSE_ASYM) {
1727 ecmd.advertising |= ADVERTISED_Asym_Pause;
1729 COVERAGE_INC(netdev_set_ethtool);
1730 return netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1731 ETHTOOL_SSET, "ETHTOOL_SSET");
1734 /* Attempts to set input rate limiting (policing) policy. Returns 0 if
1735 * successful, otherwise a positive errno value. */
1737 netdev_linux_set_policing(struct netdev *netdev_,
1738 uint32_t kbits_rate, uint32_t kbits_burst)
1740 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1741 const char *netdev_name = netdev_get_name(netdev_);
1745 kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
1746 : !kbits_burst ? 1000 /* Default to 1000 kbits if 0. */
1747 : kbits_burst); /* Stick with user-specified value. */
/* With a cached result (VALID_POLICING): a cached error is returned again,
 * and an unchanged rate/burst pair is treated as already applied.  Otherwise
 * the cache bit is cleared before reprogramming. */
1749 if (netdev->cache_valid & VALID_POLICING) {
1750 if (netdev->netdev_policing_error) {
1751 return netdev->netdev_policing_error;
1754 if (netdev->kbits_rate == kbits_rate &&
1755 netdev->kbits_burst == kbits_burst) {
1756 /* Assume that settings haven't changed since we last set them. */
1759 netdev->cache_valid &= ~VALID_POLICING;
1762 COVERAGE_INC(netdev_set_policing);
1763 /* Remove any existing ingress qdisc. */
1764 error = tc_add_del_ingress_qdisc(netdev_, false);
1766 VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
1767 netdev_name, ovs_strerror(error));
/* NOTE(review): the condition guarding the re-add of the ingress qdisc and
 * the policer is not visible in this excerpt. */
1772 error = tc_add_del_ingress_qdisc(netdev_, true);
1774 VLOG_WARN_RL(&rl, "%s: adding policing qdisc failed: %s",
1775 netdev_name, ovs_strerror(error));
1779 error = tc_add_policer(netdev_, kbits_rate, kbits_burst);
1781 VLOG_WARN_RL(&rl, "%s: adding policing action failed: %s",
1782 netdev_name, ovs_strerror(error));
1787 netdev->kbits_rate = kbits_rate;
1788 netdev->kbits_burst = kbits_burst;
/* Only success and ENODEV are cached; any other error leaves VALID_POLICING
 * unset here. */
1791 if (!error || error == ENODEV) {
1792 netdev->netdev_policing_error = error;
1793 netdev->cache_valid |= VALID_POLICING;
/* Adds to 'types' the OVS name of every entry in the NULL-terminated 'tcs'
 * array that has a 'tc_install' callback and a non-empty 'ovs_name'.
 * NOTE(review): the declaration of 'types' and the return statement are not
 * visible in this excerpt. */
1799 netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
1802 const struct tc_ops *const *opsp;
1804 for (opsp = tcs; *opsp != NULL; opsp++) {
1805 const struct tc_ops *ops = *opsp;
1806 if (ops->tc_install && ops->ovs_name[0] != '\0') {
1807 sset_add(types, ops->ovs_name);
/* Scans the NULL-terminated 'tcs' array for an entry whose 'ovs_name' equals
 * 'name'.
 * NOTE(review): the return statements are not visible in this excerpt;
 * presumably the match is returned, or NULL when nothing matches. */
1813 static const struct tc_ops *
1814 tc_lookup_ovs_name(const char *name)
1816 const struct tc_ops *const *opsp;
1818 for (opsp = tcs; *opsp != NULL; opsp++) {
1819 const struct tc_ops *ops = *opsp;
1820 if (!strcmp(name, ops->ovs_name)) {
/* Scans the NULL-terminated 'tcs' array for an entry whose 'linux_name' equals
 * 'name'.  Entries with a null 'linux_name' are never matched.
 * NOTE(review): the return statements are not visible in this excerpt;
 * presumably the match is returned, or NULL when nothing matches. */
1827 static const struct tc_ops *
1828 tc_lookup_linux_name(const char *name)
1830 const struct tc_ops *const *opsp;
1832 for (opsp = tcs; *opsp != NULL; opsp++) {
1833 const struct tc_ops *ops = *opsp;
1834 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
/* Searches the 'hash' bucket of 'netdev->tc->queues' for a queue whose
 * 'queue_id' equals the given 'queue_id'.
 * NOTE(review): the 'hash' parameter declaration and the return statements
 * are not visible in this excerpt. */
1841 static struct tc_queue *
1842 tc_find_queue__(const struct netdev *netdev_, unsigned int queue_id,
1845 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1846 struct tc_queue *queue;
1848 HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev->tc->queues) {
1849 if (queue->queue_id == queue_id) {
/* Convenience wrapper: looks up 'queue_id' on 'netdev' through
 * tc_find_queue__(), computing the hash as hash_int(queue_id, 0). */
1856 static struct tc_queue *
1857 tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
1859 return tc_find_queue__(netdev, queue_id, hash_int(queue_id, 0));
/* Looks up the traffic-control ops registered under OVS name 'type' and
 * reports their 'n_queues' in 'caps->n_queues'.
 * NOTE(review): the handling of an unknown 'type' and the return value are
 * not visible in this excerpt. */
1863 netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
1865 struct netdev_qos_capabilities *caps)
1867 const struct tc_ops *ops = tc_lookup_ovs_name(type);
1871 caps->n_queues = ops->n_queues;
/* Refreshes qdisc state with tc_query_qdisc(), stores the OVS name of the
 * active ops in '*typep', and, when the ops provide 'qdisc_get', calls it to
 * fill 'details'.
 * NOTE(review): the error path and the alternative of the conditional
 * expression are not visible in this excerpt. */
1876 netdev_linux_get_qos(const struct netdev *netdev_,
1877 const char **typep, struct smap *details)
1879 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1882 error = tc_query_qdisc(netdev_);
1887 *typep = netdev->tc->ops->ovs_name;
1888 return (netdev->tc->ops->qdisc_get
1889 ? netdev->tc->ops->qdisc_get(netdev_, details)
/* Switches 'netdev_' to the QoS type named 'type', configured by 'details'.
 * A 'type' with no ops or no 'tc_install' callback is rejected.  If the
 * requested ops are already active, only 'qdisc_set' (when present) is
 * invoked.  Otherwise the existing qdisc is deleted and the new type's
 * 'tc_install' is run.
 * NOTE(review): the error values returned on the failure paths are not
 * visible in this excerpt. */
1894 netdev_linux_set_qos(struct netdev *netdev_,
1895 const char *type, const struct smap *details)
1897 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1898 const struct tc_ops *new_ops;
1901 new_ops = tc_lookup_ovs_name(type);
1902 if (!new_ops || !new_ops->tc_install) {
1906 error = tc_query_qdisc(netdev_);
1911 if (new_ops == netdev->tc->ops) {
1912 return new_ops->qdisc_set ? new_ops->qdisc_set(netdev_, details) : 0;
1914 /* Delete existing qdisc. */
1915 error = tc_del_qdisc(netdev_);
1919 ovs_assert(netdev->tc == NULL);
1921 /* Install new qdisc. */
1922 error = new_ops->tc_install(netdev_, details);
/* Invariant: 'netdev->tc' is non-null exactly when installation succeeded. */
1923 ovs_assert((error == 0) == (netdev->tc != NULL));
/* Refreshes qdisc state with tc_query_qdisc(), looks up 'queue_id' with
 * tc_find_queue(), and retrieves that queue's configuration into 'details'
 * through the ops' 'class_get' callback.
 * NOTE(review): the error path and the handling of a missing queue are not
 * visible in this excerpt. */
1930 netdev_linux_get_queue(const struct netdev *netdev_,
1931 unsigned int queue_id, struct smap *details)
1933 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1936 error = tc_query_qdisc(netdev_);
1940 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
1942 ? netdev->tc->ops->class_get(netdev_, queue, details)
/* Configures queue 'queue_id' on 'netdev_' from 'details' by delegating to
 * the active ops' 'class_set' callback.  A 'queue_id' at or beyond the ops'
 * 'n_queues', or ops without 'class_set', take the separate branch below.
 * NOTE(review): the values returned on the rejecting branches are not visible
 * in this excerpt. */
1948 netdev_linux_set_queue(struct netdev *netdev_,
1949 unsigned int queue_id, const struct smap *details)
1951 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1954 error = tc_query_qdisc(netdev_);
1957 } else if (queue_id >= netdev->tc->ops->n_queues
1958 || !netdev->tc->ops->class_set) {
1962 return netdev->tc->ops->class_set(netdev_, queue_id, details);
/* Deletes queue 'queue_id' from 'netdev_' through the active ops'
 * 'class_delete' callback, after refreshing qdisc state and looking the queue
 * up with tc_find_queue().  Ops without 'class_delete' take a separate branch.
 * NOTE(review): the error values and the handling of a missing queue are not
 * visible in this excerpt. */
1966 netdev_linux_delete_queue(struct netdev *netdev_, unsigned int queue_id)
1968 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1971 error = tc_query_qdisc(netdev_);
1974 } else if (!netdev->tc->ops->class_delete) {
1977 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
1979 ? netdev->tc->ops->class_delete(netdev_, queue)
/* Retrieves statistics for queue 'queue_id' of 'netdev_' into 'stats' through
 * the active ops' 'class_get_stats' callback, after refreshing qdisc state and
 * looking the queue up with tc_find_queue().  Ops without 'class_get_stats'
 * take a separate branch.
 * NOTE(review): the error values and the handling of a missing queue are not
 * visible in this excerpt. */
1985 netdev_linux_get_queue_stats(const struct netdev *netdev_,
1986 unsigned int queue_id,
1987 struct netdev_queue_stats *stats)
1989 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1992 error = tc_query_qdisc(netdev_);
1995 } else if (!netdev->tc->ops->class_get_stats) {
1998 const struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2000 ? netdev->tc->ops->class_get_stats(netdev_, queue, stats)
/* Builds an RTM_GETTCLASS request for 'netdev' with 'tcm_parent' 0 and starts
 * a NETLINK_ROUTE dump of it into 'dump'.  The request buffer is released once
 * the dump has been started.
 * NOTE(review): the handling of a failed tc_make_request() and the return
 * value are not visible in this excerpt. */
2006 start_queue_dump(const struct netdev *netdev, struct nl_dump *dump)
2008 struct ofpbuf request;
2009 struct tcmsg *tcmsg;
2011 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
2015 tcmsg->tcm_parent = 0;
2016 nl_dump_start(dump, NETLINK_ROUTE, &request);
2017 ofpbuf_uninit(&request);
/* Iterates over every queue in 'netdev->tc->queues', fetches its configuration
 * with the ops' 'class_get' callback into a scratch smap, and reports it to
 * 'cb' together with the queue id and 'aux'.  The _SAFE iteration macro is
 * used, and the scratch smap is cleared before each queue and destroyed at
 * the end.
 * NOTE(review): the error handling around 'class_get' and the return value
 * are not visible in this excerpt. */
2022 netdev_linux_dump_queues(const struct netdev *netdev_,
2023 netdev_dump_queues_cb *cb, void *aux)
2025 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2026 struct tc_queue *queue, *next_queue;
2027 struct smap details;
2031 error = tc_query_qdisc(netdev_);
2034 } else if (!netdev->tc->ops->class_get) {
2039 smap_init(&details);
2040 HMAP_FOR_EACH_SAFE (queue, next_queue, hmap_node,
2041 &netdev->tc->queues) {
2042 smap_clear(&details);
2044 error = netdev->tc->ops->class_get(netdev_, queue, &details);
2046 (*cb)(queue->queue_id, &details, aux);
2051 smap_destroy(&details);
/* Dumps per-queue statistics for 'netdev_': starts a class dump with
 * start_queue_dump() and hands each received message to the ops'
 * 'class_dump_stats' callback along with 'cb' and 'aux'.  The result of
 * nl_dump_done() takes precedence over 'last_error' in the return value.
 * NOTE(review): how 'last_error' is updated inside the loop is not visible in
 * this excerpt. */
2057 netdev_linux_dump_queue_stats(const struct netdev *netdev_,
2058 netdev_dump_queue_stats_cb *cb, void *aux)
2060 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2061 struct nl_dump dump;
2066 error = tc_query_qdisc(netdev_);
2069 } else if (!netdev->tc->ops->class_dump_stats) {
2074 if (!start_queue_dump(netdev_, &dump)) {
2077 while (nl_dump_next(&dump, &msg)) {
2078 error = netdev->tc->ops->class_dump_stats(netdev_, &msg, cb, aux);
2084 error = nl_dump_done(&dump);
2085 return error ? error : last_error;
/* Stores the IPv4 address and netmask of 'netdev_' in '*address' and
 * '*netmask'.  The values are fetched with SIOCGIFADDR and SIOCGIFNETMASK and
 * cached under the VALID_IN4 bit.  Returns EADDRNOTAVAIL when the address is
 * INADDR_ANY, otherwise 0.
 * NOTE(review): the early returns on ioctl failure are not visible in this
 * excerpt. */
2089 netdev_linux_get_in4(const struct netdev *netdev_,
2090 struct in_addr *address, struct in_addr *netmask)
2092 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2094 if (!(netdev->cache_valid & VALID_IN4)) {
2097 error = netdev_linux_get_ipv4(netdev_, &netdev->address,
2098 SIOCGIFADDR, "SIOCGIFADDR");
2103 error = netdev_linux_get_ipv4(netdev_, &netdev->netmask,
2104 SIOCGIFNETMASK, "SIOCGIFNETMASK");
2109 netdev->cache_valid |= VALID_IN4;
2111 *address = netdev->address;
2112 *netmask = netdev->netmask;
2113 return address->s_addr == INADDR_ANY ? EADDRNOTAVAIL : 0;
/* Assigns IPv4 'address' to 'netdev_' with SIOCSIFADDR and updates the cached
 * address/netmask (VALID_IN4).  The netmask is applied with SIOCSIFNETMASK
 * only when 'address' is not INADDR_ANY.
 * NOTE(review): the condition guarding the cache update and the return value
 * are not visible in this excerpt. */
2117 netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
2118 struct in_addr netmask)
2120 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2123 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
2125 netdev->cache_valid |= VALID_IN4;
2126 netdev->address = address;
2127 netdev->netmask = netmask;
2128 if (address.s_addr != INADDR_ANY) {
2129 error = do_set_addr(netdev_, SIOCSIFNETMASK,
2130 "SIOCSIFNETMASK", netmask);
/* Parses one text line into an IPv6 address and an interface name.  The format
 * consumes sixteen two-digit hex bytes into 'in6->s6_addr', skips four hex
 * fields, then reads a name of at most 16 characters into 'ifname'.
 * netdev_linux_get_in6() feeds it lines read from /proc/net/if_inet6.
 * NOTE(review): the scanning call and the return expression are not visible in
 * this excerpt. */
2137 parse_if_inet6_line(const char *line,
2138 struct in6_addr *in6, char ifname[16 + 1])
2140 uint8_t *s6 = in6->s6_addr;
2141 #define X8 "%2"SCNx8
2143 " "X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8
2144 "%*x %*x %*x %*x %16s\n",
2145 &s6[0], &s6[1], &s6[2], &s6[3],
2146 &s6[4], &s6[5], &s6[6], &s6[7],
2147 &s6[8], &s6[9], &s6[10], &s6[11],
2148 &s6[12], &s6[13], &s6[14], &s6[15],
2152 /* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address (if
2153 * 'in6' is non-null) and returns true. Otherwise, returns false. */
2155 netdev_linux_get_in6(const struct netdev *netdev_, struct in6_addr *in6)
2157 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2158 if (!(netdev->cache_valid & VALID_IN6)) {
/* Start from the unspecified address, then scan /proc/net/if_inet6 for a line
 * whose interface name equals this device's name.  The result is cached under
 * VALID_IN6. */
2162 netdev->in6 = in6addr_any;
2164 file = fopen("/proc/net/if_inet6", "r");
2166 const char *name = netdev_get_name(netdev_);
2167 while (fgets(line, sizeof line, file)) {
2168 struct in6_addr in6_tmp;
2169 char ifname[16 + 1];
2170 if (parse_if_inet6_line(line, &in6_tmp, ifname)
2171 && !strcmp(name, ifname))
2173 netdev->in6 = in6_tmp;
2179 netdev->cache_valid |= VALID_IN6;
/* Fills '*sa' with an AF_INET socket address for 'addr'.  A zeroed
 * 'struct sockaddr_in' is built locally and then copied over a zeroed '*sa'. */
2186 make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
2188 struct sockaddr_in sin;
2189 memset(&sin, 0, sizeof sin);
2190 sin.sin_family = AF_INET;
2191 sin.sin_addr = addr;
2194 memset(sa, 0, sizeof *sa);
2195 memcpy(sa, &sin, sizeof sin);
/* Issues ioctl 'ioctl_nr' on 'netdev' with an interface request that carries
 * the device name and IPv4 address 'addr', via netdev_linux_do_ioctl().
 * NOTE(review): 'ioctl_name' is presumably passed on for logging; the tail of
 * the call is not visible in this excerpt. */
2199 do_set_addr(struct netdev *netdev,
2200 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
2203 ovs_strzcpy(ifr.ifr_name, netdev_get_name(netdev), sizeof ifr.ifr_name);
2204 make_in4_sockaddr(&ifr.ifr_addr, addr);
2206 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
2210 /* Adds 'router' as a default IP gateway. */
2212 netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
2214 struct in_addr any = { INADDR_ANY };
/* The route entry uses INADDR_ANY for both destination and mask, 'router' as
 * gateway, and is installed with SIOCADDRT on 'af_inet_sock'. */
2218 memset(&rt, 0, sizeof rt);
2219 make_in4_sockaddr(&rt.rt_dst, any);
2220 make_in4_sockaddr(&rt.rt_gateway, router);
2221 make_in4_sockaddr(&rt.rt_genmask, any);
2222 rt.rt_flags = RTF_UP | RTF_GATEWAY;
2223 error = ioctl(af_inet_sock, SIOCADDRT, &rt) < 0 ? errno : 0;
2225 VLOG_WARN("ioctl(SIOCADDRT): %s", ovs_strerror(error));
/* Scans /proc/net/route for an up route whose masked destination matches
 * 'host'.  On a match, '*next_hop' receives either 0 (host directly
 * reachable) or the route's gateway, and '*netdev_name' receives an xstrdup()
 * copy of the interface name.  '*netdev_name' is set to NULL up front.
 * NOTE(review): ownership of '*netdev_name' presumably passes to the caller;
 * the loop exit, fclose() and return values are not visible in this
 * excerpt. */
2231 netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
2234 static const char fn[] = "/proc/net/route";
2239 *netdev_name = NULL;
2240 stream = fopen(fn, "r");
2241 if (stream == NULL) {
2242 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, ovs_strerror(errno));
2247 while (fgets(line, sizeof line, stream)) {
2250 ovs_be32 dest, gateway, mask;
2251 int refcnt, metric, mtu;
2252 unsigned int flags, use, window, irtt;
2255 "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
2257 iface, &dest, &gateway, &flags, &refcnt,
2258 &use, &metric, &mask, &mtu, &window, &irtt) != 11) {
2260 VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
2264 if (!(flags & RTF_UP)) {
2265 /* Skip routes that aren't up. */
2269 /* The output of 'dest', 'mask', and 'gateway' were given in
2270 * network byte order, so we don't need any endian
2271 * conversions here. */
2272 if ((dest & mask) == (host->s_addr & mask)) {
2274 /* The host is directly reachable. */
2275 next_hop->s_addr = 0;
2277 /* To reach the host, we must go through a gateway. */
2278 next_hop->s_addr = gateway;
2280 *netdev_name = xstrdup(iface);
/* Adds "driver_name", "driver_version" and "firmware_version" entries to
 * 'smap' from 'netdev->drvinfo'.  The driver info is fetched once with
 * ETHTOOL_GDRVINFO and cached under the VALID_DRVINFO bit.
 * NOTE(review): the error handling and the return value are not visible in
 * this excerpt. */
2292 netdev_linux_get_status(const struct netdev *netdev_, struct smap *smap)
2294 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2297 if (!(netdev->cache_valid & VALID_DRVINFO)) {
/* 'netdev->drvinfo' is handed to netdev_linux_do_ethtool() through a
 * 'struct ethtool_cmd *' cast. */
2298 struct ethtool_cmd *cmd = (struct ethtool_cmd *) &netdev->drvinfo;
2300 COVERAGE_INC(netdev_get_ethtool);
2301 memset(&netdev->drvinfo, 0, sizeof netdev->drvinfo);
2302 error = netdev_linux_do_ethtool(netdev->up.name,
2305 "ETHTOOL_GDRVINFO");
2307 netdev->cache_valid |= VALID_DRVINFO;
2312 smap_add(smap, "driver_name", netdev->drvinfo.driver);
2313 smap_add(smap, "driver_version", netdev->drvinfo.version);
2314 smap_add(smap, "firmware_version", netdev->drvinfo.fw_version);
/* Reports a fixed "driver_name" of "openvswitch" in 'smap'; 'netdev' itself is
 * not consulted. */
2320 netdev_internal_get_status(const struct netdev *netdev OVS_UNUSED,
2323 smap_add(smap, "driver_name", "openvswitch");
2327 /* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
2328 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
2329 * returns 0. Otherwise, it returns a positive errno value; in particular,
2330 * ENXIO indicates that there is no ARP table entry for 'ip' on 'netdev'. */
2332 netdev_linux_arp_lookup(const struct netdev *netdev,
2333 ovs_be32 ip, uint8_t mac[ETH_ADDR_LEN])
2336 struct sockaddr_in sin;
2339 memset(&r, 0, sizeof r);
2340 memset(&sin, 0, sizeof sin);
2341 sin.sin_family = AF_INET;
2342 sin.sin_addr.s_addr = ip;
/* The request carries the IPv4 protocol address, an Ethernet hardware-address
 * family and the device name; it is issued with SIOCGARP on 'af_inet_sock'. */
2344 memcpy(&r.arp_pa, &sin, sizeof sin);
2345 r.arp_ha.sa_family = ARPHRD_ETHER;
2347 ovs_strzcpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
2348 COVERAGE_INC(netdev_arp_lookup);
2349 retval = ioctl(af_inet_sock, SIOCGARP, &r) < 0 ? errno : 0;
2351 memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
2352 } else if (retval != ENXIO) {
2353 VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
2354 netdev_get_name(netdev), IP_ARGS(ip),
2355 ovs_strerror(retval));
/* Translates NETDEV_* flag bits in 'nd' to their interface-flag equivalents;
 * NETDEV_UP and NETDEV_PROMISC are the bits tested.
 * NOTE(review): the resulting bits and the return statement are not visible in
 * this excerpt. */
2361 nd_to_iff_flags(enum netdev_flags nd)
2364 if (nd & NETDEV_UP) {
2367 if (nd & NETDEV_PROMISC) {
/* Translates interface flags 'iff' to NETDEV_* flag bits; IFF_PROMISC maps to
 * NETDEV_PROMISC.
 * NOTE(review): other mappings and the return statement are not visible in
 * this excerpt. */
2374 iff_to_nd_flags(int iff)
2376 enum netdev_flags nd = 0;
2380 if (iff & IFF_PROMISC) {
2381 nd |= NETDEV_PROMISC;
/* Clears the 'off' flags and sets the 'on' flags on 'netdev_', storing the
 * previous flags (as NETDEV_* bits) in '*old_flagsp'.  set_flags() is called
 * only when the computed interface flags differ from the cached
 * 'netdev->ifi_flags'; the cache is then re-read with get_flags().
 * NOTE(review): the return statement is not visible in this excerpt. */
2387 netdev_linux_update_flags(struct netdev *netdev_, enum netdev_flags off,
2388 enum netdev_flags on, enum netdev_flags *old_flagsp)
2390 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2391 int old_flags, new_flags;
2394 old_flags = netdev->ifi_flags;
2395 *old_flagsp = iff_to_nd_flags(old_flags);
2396 new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
2397 if (new_flags != old_flags) {
2398 error = set_flags(netdev_get_name(netdev_), new_flags);
2399 get_flags(netdev_, &netdev->ifi_flags);
/* Returns the 'change_seq' member of the netdev_linux behind 'netdev'. */
2405 netdev_linux_change_seq(const struct netdev *netdev)
2407 return netdev_linux_cast(netdev)->change_seq;
/* Initializer template used by the 'struct netdev_class' definitions below.
 * Only NAME, CREATE, GET_STATS, SET_STATS, GET_FEATURES and GET_STATUS vary
 * per class; the other slots visible here are shared netdev_linux_*
 * callbacks or NULL.
 * NOTE(review): several slots of the expansion are not visible in this
 * excerpt. */
2410 #define NETDEV_LINUX_CLASS(NAME, CREATE, GET_STATS, SET_STATS, \
2411 GET_FEATURES, GET_STATUS) \
2415 netdev_linux_init, \
2417 netdev_linux_wait, \
2420 netdev_linux_destroy, \
2421 NULL, /* get_config */ \
2422 NULL, /* set_config */ \
2423 NULL, /* get_tunnel_config */ \
2425 netdev_linux_rx_open, \
2427 netdev_linux_send, \
2428 netdev_linux_send_wait, \
2430 netdev_linux_set_etheraddr, \
2431 netdev_linux_get_etheraddr, \
2432 netdev_linux_get_mtu, \
2433 netdev_linux_set_mtu, \
2434 netdev_linux_get_ifindex, \
2435 netdev_linux_get_carrier, \
2436 netdev_linux_get_carrier_resets, \
2437 netdev_linux_set_miimon_interval, \
2442 netdev_linux_set_advertisements, \
2444 netdev_linux_set_policing, \
2445 netdev_linux_get_qos_types, \
2446 netdev_linux_get_qos_capabilities, \
2447 netdev_linux_get_qos, \
2448 netdev_linux_set_qos, \
2449 netdev_linux_get_queue, \
2450 netdev_linux_set_queue, \
2451 netdev_linux_delete_queue, \
2452 netdev_linux_get_queue_stats, \
2453 netdev_linux_dump_queues, \
2454 netdev_linux_dump_queue_stats, \
2456 netdev_linux_get_in4, \
2457 netdev_linux_set_in4, \
2458 netdev_linux_get_in6, \
2459 netdev_linux_add_router, \
2460 netdev_linux_get_next_hop, \
2462 netdev_linux_arp_lookup, \
2464 netdev_linux_update_flags, \
2466 netdev_linux_change_seq \
/* Class using netdev_linux_create(), netdev_linux_get_stats(), ethtool-based
 * features and status, and no set_stats callback.
 * NOTE(review): the class name string is not visible in this excerpt. */
2469 const struct netdev_class netdev_linux_class =
2472 netdev_linux_create,
2473 netdev_linux_get_stats,
2474 NULL, /* set_stats */
2475 netdev_linux_get_features,
2476 netdev_linux_get_status);
/* Class for tap devices: created with netdev_linux_create_tap() and reporting
 * stats through netdev_tap_get_stats(); features and status are shared with
 * netdev_linux_class.
 * NOTE(review): the class name string is not visible in this excerpt. */
2478 const struct netdev_class netdev_tap_class =
2481 netdev_linux_create_tap,
2482 netdev_tap_get_stats,
2483 NULL, /* set_stats */
2484 netdev_linux_get_features,
2485 netdev_linux_get_status);
/* Class for internal devices: stats are read from and written to the vport
 * layer (netdev_internal_get_stats() / netdev_internal_set_stats()), there is
 * no get_features callback, and status comes from
 * netdev_internal_get_status().
 * NOTE(review): the class name string is not visible in this excerpt. */
2487 const struct netdev_class netdev_internal_class =
2490 netdev_linux_create,
2491 netdev_internal_get_stats,
2492 netdev_internal_set_stats,
2493 NULL, /* get_features */
2494 netdev_internal_get_status);
/* Receive-side class table: destroy, recv, wait and drain callbacks, in that
 * order. */
2496 static const struct netdev_rx_class netdev_rx_linux_class = {
2497 netdev_rx_linux_destroy,
2498 netdev_rx_linux_recv,
2499 netdev_rx_linux_wait,
2500 netdev_rx_linux_drain,
2503 /* HTB traffic control class. */
/* NOTE(review): HTB_N_QUEUES is presumably the queue-count limit advertised
 * for HTB; its use is not visible in this excerpt. */
2505 #define HTB_N_QUEUES 0xf000
/* The fields below belong to the HTB qdisc state ('struct htb', which holds a
 * 'tc' member and 'max_rate' per htb_install__()) and to the per-queue
 * 'struct htb_class'.  The 'struct' header lines themselves are not visible in
 * this excerpt. */
2509 unsigned int max_rate; /* In bytes/s. */
2513 struct tc_queue tc_queue;
2514 unsigned int min_rate; /* In bytes/s. */
2515 unsigned int max_rate; /* In bytes/s. */
2516 unsigned int burst; /* In bytes. */
2517 unsigned int priority; /* Lower values are higher priorities. */
/* Returns the 'struct htb' that embeds 'netdev->tc' as its 'tc' member.
 * NOTE(review): this is only meaningful while the device's active tc is an
 * HTB instance; no check is made here. */
2521 htb_get__(const struct netdev *netdev_)
2523 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2524 return CONTAINER_OF(netdev->tc, struct htb, tc);
/* Allocates a 'struct htb', initializes its 'tc' with 'tc_ops_htb', records
 * 'max_rate', and installs it as 'netdev->tc'.
 * NOTE(review): 'max_rate' is uint64_t here, while a 'max_rate' field declared
 * just above is 'unsigned int'.  If that field is struct htb's, the assignment
 * below narrows -- confirm. */
2528 htb_install__(struct netdev *netdev_, uint64_t max_rate)
2530 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2533 htb = xmalloc(sizeof *htb);
2534 tc_init(&htb->tc, &tc_ops_htb);
2535 htb->max_rate = max_rate;
2537 netdev->tc = &htb->tc;
2540 /* Create an HTB qdisc.
2542 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
2544 htb_setup_qdisc__(struct netdev *netdev)
2547 struct tc_htb_glob opt;
2548 struct ofpbuf request;
2549 struct tcmsg *tcmsg;
/* Any existing qdisc is removed first; the result of that call is not
 * checked. */
2551 tc_del_qdisc(netdev);
2553 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2554 NLM_F_EXCL | NLM_F_CREATE, &request);
2558 tcmsg->tcm_handle = tc_make_handle(1, 0);
2559 tcmsg->tcm_parent = TC_H_ROOT;
2561 nl_msg_put_string(&request, TCA_KIND, "htb");
2563 memset(&opt, 0, sizeof opt);
2564 opt.rate2quantum = 10;
/* The global HTB options travel as TCA_HTB_INIT nested inside TCA_OPTIONS.
 * NOTE(review): further 'opt' fields may be set on lines not visible in this
 * excerpt. */
2568 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2569 nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
2570 nl_msg_end_nested(&request, opt_offset);
2572 return tc_transact(&request, NULL);
2575 /* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
2576 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
/* 'class' supplies min_rate, max_rate, burst and priority.  The device MTU
 * is required because the rate tables and buffer sizes are computed from it;
 * a warning is logged when it cannot be obtained. */
2578 htb_setup_class__(struct netdev *netdev, unsigned int handle,
2579 unsigned int parent, struct htb_class *class)
2582 struct tc_htb_opt opt;
2583 struct ofpbuf request;
2584 struct tcmsg *tcmsg;
2588 error = netdev_get_mtu(netdev, &mtu);
2590 VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU",
2591 netdev_get_name(netdev));
/* 'rate' is the guaranteed rate and 'ceil' the upper limit; each gets its own
 * buffer sized from the same burst value. */
2595 memset(&opt, 0, sizeof opt);
2596 tc_fill_rate(&opt.rate, class->min_rate, mtu);
2597 tc_fill_rate(&opt.ceil, class->max_rate, mtu);
2598 opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
2599 opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
2600 opt.prio = class->priority;
/* NLM_F_CREATE without NLM_F_EXCL, matching the "replace" in the header. */
2602 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
2606 tcmsg->tcm_handle = handle;
2607 tcmsg->tcm_parent = parent;
2609 nl_msg_put_string(&request, TCA_KIND, "htb");
/* TCA_OPTIONS carries the parameters plus one rate table for each of 'rate'
 * and 'ceil'. */
2610 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2611 nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
2612 tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
2613 tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
2614 nl_msg_end_nested(&request, opt_offset);
2616 error = tc_transact(&request, NULL);
2618 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
2619 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
2620 netdev_get_name(netdev),
2621 tc_get_major(handle), tc_get_minor(handle),
2622 tc_get_major(parent), tc_get_minor(parent),
2623 class->min_rate, class->max_rate,
2624 class->burst, class->priority, ovs_strerror(error));
2629 /* Parses Netlink attributes in 'nl_options' for HTB parameters and stores a
2630 * description of them into 'class'. The description complies with the
2631 * specification given in the vswitch database documentation for linux-htb
/* Only TCA_HTB_PARMS is consulted.  It is mandatory and must be at least as
 * large as struct tc_htb_opt; a warning is logged when parsing fails. */
2634 htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
2636 static const struct nl_policy tca_htb_policy[] = {
2637 [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
2638 .min_len = sizeof(struct tc_htb_opt) },
2641 struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
2642 const struct tc_htb_opt *htb;
2644 if (!nl_parse_nested(nl_options, tca_htb_policy,
2645 attrs, ARRAY_SIZE(tca_htb_policy))) {
2646 VLOG_WARN_RL(&rl, "failed to parse HTB class options");
/* min_rate/max_rate come straight from the kernel's rate/ceil; the burst is
 * recovered by converting the buffer (in ticks) back to bytes at 'rate'. */
2650 htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
2651 class->min_rate = htb->rate.rate;
2652 class->max_rate = htb->ceil.rate;
2653 class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
2654 class->priority = htb->prio;
/* Parses the class message in 'tcmsg'.  When 'queue_id' is nonnull, a class
 * handle of the form 1:N with 1 <= N <= HTB_N_QUEUES yields queue ID N - 1.
 * When 'options' is nonnull, the HTB parameters are decoded into it.  'stats'
 * is handed through to tc_parse_class(). */
2659 htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2660 struct htb_class *options,
2661 struct netdev_queue_stats *stats)
2663 struct nlattr *nl_options;
2664 unsigned int handle;
2667 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2668 if (!error && queue_id) {
2669 unsigned int major = tc_get_major(handle);
2670 unsigned int minor = tc_get_minor(handle);
2671 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2672 *queue_id = minor - 1;
2677 if (!error && options) {
2678 error = htb_parse_tca_options__(nl_options, options);
/* Fills in 'hc' with the parameters of the HTB root class of 'netdev' from
 * the qdisc-level 'details'.
 *
 * "max-rate" in 'details' is divided by 8 on the way in (bits/s to bytes/s).
 * If it is absent or converts to zero, the rate is derived from the device's
 * current link features, with 100 Mbps as the fallback passed to
 * netdev_features_to_bps().  'hc->min_rate' is set equal to 'hc->max_rate'. */
2684 htb_parse_qdisc_details__(struct netdev *netdev,
2685 const struct smap *details, struct htb_class *hc)
2687 const char *max_rate_s;
2689 max_rate_s = smap_get(details, "max-rate");
2690 hc->max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
2691 if (!hc->max_rate) {
2692 enum netdev_features current;
/* Only the current feature set is requested; the other outputs are NULL. */
2694 netdev_get_features(netdev, &current, NULL, NULL, NULL);
2695 hc->max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
2697 hc->min_rate = hc->max_rate;
/* Fills in 'hc' from the per-queue 'details' keys "min-rate", "max-rate",
 * "burst" and "priority".  The rates and the burst are divided by 8 on the
 * way in, i.e. configured in bits and stored in bytes.  The device MTU is
 * needed: it is the floor for the minimum rate and, plus 64, for the burst.
 * Both rates are capped at the qdisc-wide htb->max_rate.
 *
 * NOTE(review): the strtoull() results are stored into unsigned int fields,
 * so very large configured values are truncated -- confirm acceptable. */
2703 htb_parse_class_details__(struct netdev *netdev,
2704 const struct smap *details, struct htb_class *hc)
2706 const struct htb *htb = htb_get__(netdev);
2707 const char *min_rate_s = smap_get(details, "min-rate");
2708 const char *max_rate_s = smap_get(details, "max-rate");
2709 const char *burst_s = smap_get(details, "burst");
2710 const char *priority_s = smap_get(details, "priority");
2713 error = netdev_get_mtu(netdev, &mtu);
2715 VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU",
2716 netdev_get_name(netdev));
2720 /* HTB requires at least an mtu sized min-rate to send any traffic even
2721 * on uncongested links. */
2722 hc->min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
2723 hc->min_rate = MAX(hc->min_rate, mtu);
2724 hc->min_rate = MIN(hc->min_rate, htb->max_rate);
2727 hc->max_rate = (max_rate_s
2728 ? strtoull(max_rate_s, NULL, 10) / 8
2730 hc->max_rate = MAX(hc->max_rate, hc->min_rate);
2731 hc->max_rate = MIN(hc->max_rate, htb->max_rate);
2735 * According to hints in the documentation that I've read, it is important
2736 * that 'burst' be at least as big as the largest frame that might be
2737 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
2738 * but having it a bit too small is a problem. Since netdev_get_mtu()
2739 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
2740 * the MTU. We actually add 64, instead of 14, as a guard against
2741 * additional headers get tacked on somewhere that we're not aware of. */
2742 hc->burst = burst_s ? strtoull(burst_s, NULL, 10) / 8 : 0;
2743 hc->burst = MAX(hc->burst, mtu + 64);
2746 hc->priority = priority_s ? strtoul(priority_s, NULL, 10) : 0;
/* Asks the kernel about class 'handle' under 'parent' on 'netdev' and decodes
 * the reply into 'options' and 'stats' via htb_parse_tcmsg__().  No queue ID
 * is requested from the parser.  The reply buffer is deleted after parsing. */
2752 htb_query_class__(const struct netdev *netdev, unsigned int handle,
2753 unsigned int parent, struct htb_class *options,
2754 struct netdev_queue_stats *stats)
2756 struct ofpbuf *reply;
2759 error = tc_query_class(netdev, handle, parent, &reply);
2761 error = htb_parse_tcmsg__(reply, NULL, options, stats);
2762 ofpbuf_delete(reply);
/* Installs HTB on 'netdev': creates the qdisc, then the class 1:fffe under
 * 1:0 with the parameters parsed from 'details', and finally records the
 * resulting max rate in a new struct htb attached to 'netdev'. */
2768 htb_tc_install(struct netdev *netdev, const struct smap *details)
2772 error = htb_setup_qdisc__(netdev);
2774 struct htb_class hc;
2776 htb_parse_qdisc_details__(netdev, details, &hc);
2777 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2778 tc_make_handle(1, 0), &hc);
2780 htb_install__(netdev, hc.max_rate);
/* Returns the struct htb_class that embeds 'queue' as its 'tc_queue' member.
 * Only valid for queues that really are part of an htb_class. */
2786 static struct htb_class *
2787 htb_class_cast__(const struct tc_queue *queue)
2789 return CONTAINER_OF(queue, struct htb_class, tc_queue);
/* Records the parameters in 'hc' for queue 'queue_id' in 'netdev''s HTB
 * state.  The visible code either casts the queue that tc_find_queue__()
 * returned or allocates a fresh htb_class and inserts it into the queue map;
 * either way the four class parameters are then copied from 'hc'. */
2793 htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
2794 const struct htb_class *hc)
2796 struct htb *htb = htb_get__(netdev);
2797 size_t hash = hash_int(queue_id, 0);
2798 struct tc_queue *queue;
2799 struct htb_class *hcp;
2801 queue = tc_find_queue__(netdev, queue_id, hash);
2803 hcp = htb_class_cast__(queue);
2805 hcp = xmalloc(sizeof *hcp);
2806 queue = &hcp->tc_queue;
2807 queue->queue_id = queue_id;
2808 hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
2811 hcp->min_rate = hc->min_rate;
2812 hcp->max_rate = hc->max_rate;
2813 hcp->burst = hc->burst;
2814 hcp->priority = hc->priority;
/* Builds in-memory HTB state for a qdisc that already exists on 'netdev'.
 * The qdisc-wide max rate is read back from class 1:fffe, then every class
 * returned by the queue dump that parses successfully is recorded as a
 * queue.  'nlmsg' is unused.
 *
 * NOTE(review): the return value of htb_query_class__() is not checked on
 * the visible line, so 'hc.max_rate' must have a defined value beforehand --
 * confirm. */
2818 htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
2821 struct nl_dump dump;
2822 struct htb_class hc;
2824 /* Get qdisc options. */
2826 htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
2827 htb_install__(netdev, hc.max_rate);
2830 if (!start_queue_dump(netdev, &dump)) {
2833 while (nl_dump_next(&dump, &msg)) {
2834 unsigned int queue_id;
/* Messages that fail to parse are skipped. */
2836 if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
2837 htb_update_queue__(netdev, queue_id, &hc);
2840 nl_dump_done(&dump);
/* Tears down the HTB state that embeds 'tc'.  The visible loop unlinks every
 * queue from the queue map; the _SAFE iterator is used because entries are
 * removed while iterating. */
2846 htb_tc_destroy(struct tc *tc)
2848 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
2849 struct htb_class *hc, *next;
2851 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &htb->tc.queues) {
2852 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
/* Adds the qdisc-wide "max-rate" to 'details'.  The stored bytes/s value is
 * multiplied by 8 in 64-bit arithmetic so the reported value is in bits/s. */
2860 htb_qdisc_get(const struct netdev *netdev, struct smap *details)
2862 const struct htb *htb = htb_get__(netdev);
2863 smap_add_format(details, "max-rate", "%llu", 8ULL * htb->max_rate);
/* Reconfigures the HTB qdisc on 'netdev' from 'details': class 1:fffe under
 * 1:0 is set up again with the newly parsed parameters and the cached
 * qdisc-wide max rate is updated. */
2868 htb_qdisc_set(struct netdev *netdev, const struct smap *details)
2870 struct htb_class hc;
2873 htb_parse_qdisc_details__(netdev, details, &hc);
2874 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2875 tc_make_handle(1, 0), &hc);
2877 htb_get__(netdev)->max_rate = hc.max_rate;
/* Describes 'queue' in 'details'.  Rates and burst are converted from bytes
 * to bits (multiplied by 8 in 64-bit arithmetic).  "max-rate" is reported
 * only when it differs from "min-rate".  'netdev' is unused. */
2883 htb_class_get(const struct netdev *netdev OVS_UNUSED,
2884 const struct tc_queue *queue, struct smap *details)
2886 const struct htb_class *hc = htb_class_cast__(queue);
2888 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
2889 if (hc->min_rate != hc->max_rate) {
2890 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
2892 smap_add_format(details, "burst", "%llu", 8ULL * hc->burst);
2894 smap_add_format(details, "priority", "%u", hc->priority);
/* Configures queue 'queue_id' on 'netdev' from 'details'.  The queue maps to
 * kernel class 1:(queue_id + 1) under parent 1:fffe; the parsed parameters
 * are then stored in the in-memory queue table. */
2900 htb_class_set(struct netdev *netdev, unsigned int queue_id,
2901 const struct smap *details)
2903 struct htb_class hc;
2906 error = htb_parse_class_details__(netdev, details, &hc);
2911 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
2912 tc_make_handle(1, 0xfffe), &hc);
2917 htb_update_queue__(netdev, queue_id, &hc);
/* Deletes kernel class 1:(queue_id + 1) for 'queue' on 'netdev' and unlinks
 * the queue from the in-memory queue map. */
2922 htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
2924 struct htb_class *hc = htb_class_cast__(queue);
2925 struct htb *htb = htb_get__(netdev);
2928 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
2930 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
/* Fetches statistics for 'queue' into 'stats' by querying kernel class
 * 1:(queue_id + 1) under parent 1:fffe.  No class options are requested. */
2937 htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
2938 struct netdev_queue_stats *stats)
2940 return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
2941 tc_make_handle(1, 0xfffe), NULL, stats);
/* Handles one class message from a statistics dump.  If 'nlmsg' describes a
 * class 1:N with 1 <= N <= HTB_N_QUEUES, 'cb' is invoked with queue ID N - 1,
 * the parsed statistics and 'aux'.  'netdev' is unused. */
2945 htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
2946 const struct ofpbuf *nlmsg,
2947 netdev_dump_queue_stats_cb *cb, void *aux)
2949 struct netdev_queue_stats stats;
2950 unsigned int handle, major, minor;
2953 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
2958 major = tc_get_major(handle);
2959 minor = tc_get_minor(handle);
2960 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2961 (*cb)(minor - 1, &stats, aux);
/* Operations table for the HTB implementation: kernel qdisc name "htb",
 * exposed to OVS as "linux-htb", with HTB_N_QUEUES queues. */
2966 static const struct tc_ops tc_ops_htb = {
2967 "htb", /* linux_name */
2968 "linux-htb", /* ovs_name */
2969 HTB_N_QUEUES, /* n_queues */
2978 htb_class_get_stats,
2979 htb_class_dump_stats
2982 /* "linux-hfsc" traffic control class. */
/* Largest class minor number treated as a queue: when class messages are
 * parsed, minors 1...HFSC_N_QUEUES map to queue IDs 0...HFSC_N_QUEUES - 1. */
2984 #define HFSC_N_QUEUES 0xf000
/* Generic queue node embedded in each HFSC class; hfsc_class_cast__() goes
 * from this member back to the containing struct hfsc_class. */
2992 struct tc_queue tc_queue;
/* Returns the struct hfsc that contains 'netdev_''s current tc object.  The
 * result is meaningful only while netdev->tc points at the 'tc' member of a
 * struct hfsc. */
2997 static struct hfsc *
2998 hfsc_get__(const struct netdev *netdev_)
3000 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3001 return CONTAINER_OF(netdev->tc, struct hfsc, tc);
/* Returns the struct hfsc_class that embeds 'queue' as its 'tc_queue' member.
 * Only valid for queues that really are part of an hfsc_class. */
3004 static struct hfsc_class *
3005 hfsc_class_cast__(const struct tc_queue *queue)
3007 return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
/* Allocates a new struct hfsc that records 'max_rate', initializes its tc
 * member with the HFSC ops, and makes it 'netdev_''s tc object. */
3011 hfsc_install__(struct netdev *netdev_, uint32_t max_rate)
3013 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3016 hfsc = xmalloc(sizeof *hfsc);
3017 tc_init(&hfsc->tc, &tc_ops_hfsc);
3018 hfsc->max_rate = max_rate;
3019 netdev->tc = &hfsc->tc;
/* Records the rates in 'hc' for queue 'queue_id' in 'netdev''s HFSC state.
 * The visible code either casts the queue that tc_find_queue__() returned or
 * allocates a fresh hfsc_class and inserts it into the queue map; either way
 * min_rate and max_rate are then copied from 'hc'. */
3023 hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
3024 const struct hfsc_class *hc)
3028 struct hfsc_class *hcp;
3029 struct tc_queue *queue;
3031 hfsc = hfsc_get__(netdev);
3032 hash = hash_int(queue_id, 0);
3034 queue = tc_find_queue__(netdev, queue_id, hash);
3036 hcp = hfsc_class_cast__(queue);
3038 hcp = xmalloc(sizeof *hcp);
3039 queue = &hcp->tc_queue;
3040 queue->queue_id = queue_id;
3041 hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
3044 hcp->min_rate = hc->min_rate;
3045 hcp->max_rate = hc->max_rate;
/* Decodes the HFSC service curves nested in 'nl_options' into 'class'.
 *
 * Three curves are read (RSC, FSC and USC), each required to be at least
 * sizeof(struct tc_service_curve) bytes.  Only the configuration shape that
 * this file itself generates is accepted:
 *
 *   - every curve is linear (m1 == 0 and d == 0),
 *   - RSC and FSC have the same m2, and
 *   - RSC's m2 does not exceed USC's m2.
 *
 * Each violation logs its own warning.  On success min_rate is FSC's m2 and
 * max_rate is USC's m2. */
3049 hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
3051 const struct tc_service_curve *rsc, *fsc, *usc;
3052 static const struct nl_policy tca_hfsc_policy[] = {
3054 .type = NL_A_UNSPEC,
3056 .min_len = sizeof(struct tc_service_curve),
3059 .type = NL_A_UNSPEC,
3061 .min_len = sizeof(struct tc_service_curve),
3064 .type = NL_A_UNSPEC,
3066 .min_len = sizeof(struct tc_service_curve),
3069 struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];
3071 if (!nl_parse_nested(nl_options, tca_hfsc_policy,
3072 attrs, ARRAY_SIZE(tca_hfsc_policy))) {
3073 VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
3077 rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
3078 fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
3079 usc = nl_attr_get(attrs[TCA_HFSC_USC]);
3081 if (rsc->m1 != 0 || rsc->d != 0 ||
3082 fsc->m1 != 0 || fsc->d != 0 ||
3083 usc->m1 != 0 || usc->d != 0) {
3084 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3085 "Non-linear service curves are not supported.");
3089 if (rsc->m2 != fsc->m2) {
3090 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3091 "Real-time service curves are not supported ");
3095 if (rsc->m2 > usc->m2) {
3096 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3097 "Min-rate service curve is greater than "
3098 "the max-rate service curve.");
3102 class->min_rate = fsc->m2;
3103 class->max_rate = usc->m2;
/* Parses the class message in 'tcmsg'.  A class handle of the form 1:N with
 * 1 <= N <= HFSC_N_QUEUES yields queue ID N - 1 in '*queue_id'; the class
 * options are decoded into 'options' by hfsc_parse_tca_options__().  'stats'
 * is handed through to tc_parse_class(). */
3108 hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
3109 struct hfsc_class *options,
3110 struct netdev_queue_stats *stats)
3113 unsigned int handle;
3114 struct nlattr *nl_options;
3116 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
3122 unsigned int major, minor;
3124 major = tc_get_major(handle);
3125 minor = tc_get_minor(handle);
3126 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3127 *queue_id = minor - 1;
3134 error = hfsc_parse_tca_options__(nl_options, options);
/* Asks the kernel about class 'handle' under 'parent' on 'netdev' and decodes
 * the reply into 'options' and 'stats' via hfsc_parse_tcmsg__().  No queue ID
 * is requested from the parser.  The reply buffer is deleted after parsing. */
3141 hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
3142 unsigned int parent, struct hfsc_class *options,
3143 struct netdev_queue_stats *stats)
3146 struct ofpbuf *reply;
3148 error = tc_query_class(netdev, handle, parent, &reply);
3153 error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
3154 ofpbuf_delete(reply);
/* Fills in 'class' with the parameters of the HFSC root class of 'netdev'
 * from the qdisc-level 'details'.
 *
 * "max-rate" in 'details' is divided by 8 on the way in (bits/s to bytes/s).
 * When no usable value is configured, the rate is derived from the device's
 * current link features, with 100 Mbps as the fallback passed to
 * netdev_features_to_bps().  Both 'class->min_rate' and 'class->max_rate'
 * receive the resulting value. */
3159 hfsc_parse_qdisc_details__(struct netdev *netdev, const struct smap *details,
3160 struct hfsc_class *class)
3163 const char *max_rate_s;
3165 max_rate_s = smap_get(details, "max-rate");
3166 max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
3169 enum netdev_features current;
/* Only the current feature set is requested; the other outputs are NULL. */
3171 netdev_get_features(netdev, &current, NULL, NULL, NULL);
3172 max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
3175 class->min_rate = max_rate;
3176 class->max_rate = max_rate;
/* Fills in 'class' from the per-queue 'details' keys "min-rate" and
 * "max-rate", both divided by 8 on the way in (bits/s to bytes/s).  The
 * minimum rate is forced to at least 1, the maximum rate to at least the
 * minimum rate, and both are capped at the qdisc-wide hfsc->max_rate. */
3180 hfsc_parse_class_details__(struct netdev *netdev,
3181 const struct smap *details,
3182 struct hfsc_class * class)
3184 const struct hfsc *hfsc;
3185 uint32_t min_rate, max_rate;
3186 const char *min_rate_s, *max_rate_s;
3188 hfsc = hfsc_get__(netdev);
3189 min_rate_s = smap_get(details, "min-rate");
3190 max_rate_s = smap_get(details, "max-rate");
3192 min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
3193 min_rate = MAX(min_rate, 1);
3194 min_rate = MIN(min_rate, hfsc->max_rate);
3196 max_rate = (max_rate_s
3197 ? strtoull(max_rate_s, NULL, 10) / 8
3199 max_rate = MAX(max_rate, min_rate);
3200 max_rate = MIN(max_rate, hfsc->max_rate);
3202 class->min_rate = min_rate;
3203 class->max_rate = max_rate;
3208 /* Create an HFSC qdisc.
3210 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
/* Any existing qdisc on 'netdev' is deleted first.  The value returned on the
 * last visible line is the result of the Netlink transaction. */
3212 hfsc_setup_qdisc__(struct netdev * netdev)
3214 struct tcmsg *tcmsg;
3215 struct ofpbuf request;
3216 struct tc_hfsc_qopt opt;
3218 tc_del_qdisc(netdev);
/* NLM_F_EXCL | NLM_F_CREATE: create the qdisc, failing if one exists. */
3220 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3221 NLM_F_EXCL | NLM_F_CREATE, &request);
/* Root qdisc with handle 1:0. */
3227 tcmsg->tcm_handle = tc_make_handle(1, 0);
3228 tcmsg->tcm_parent = TC_H_ROOT;
3230 memset(&opt, 0, sizeof opt);
/* Unlike HTB, the options structure is the direct payload of TCA_OPTIONS
 * rather than a nested attribute. */
3233 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3234 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
3236 return tc_transact(&request, NULL);
3239 /* Create an HFSC class.
3241 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
3242 * sc rate <min_rate> ul rate <max_rate>" */
/* Two service curves are built from 'class': 'min' (m2 = min_rate) is sent
 * as both the real-time (RSC) and fair-share (FSC) curve, and 'max'
 * (m2 = max_rate) as the upper-limit (USC) curve.  A warning is logged when
 * the transaction fails. */
3244 hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
3245 unsigned int parent, struct hfsc_class *class)
3249 struct tcmsg *tcmsg;
3250 struct ofpbuf request;
3251 struct tc_service_curve min, max;
3253 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
3259 tcmsg->tcm_handle = handle;
3260 tcmsg->tcm_parent = parent;
3264 min.m2 = class->min_rate;
3268 max.m2 = class->max_rate;
3270 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3271 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3272 nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
3273 nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
3274 nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
3275 nl_msg_end_nested(&request, opt_offset);
3277 error = tc_transact(&request, NULL);
3279 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
3280 "min-rate %ubps, max-rate %ubps (%s)",
3281 netdev_get_name(netdev),
3282 tc_get_major(handle), tc_get_minor(handle),
3283 tc_get_major(parent), tc_get_minor(parent),
3284 class->min_rate, class->max_rate, ovs_strerror(error));
/* Installs HFSC on 'netdev': creates the qdisc, then the class 1:fffe under
 * 1:0 with the parameters parsed from 'details', and finally records the
 * resulting max rate in a new struct hfsc attached to 'netdev'. */
3291 hfsc_tc_install(struct netdev *netdev, const struct smap *details)
3294 struct hfsc_class class;
3296 error = hfsc_setup_qdisc__(netdev);
3302 hfsc_parse_qdisc_details__(netdev, details, &class);
3303 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3304 tc_make_handle(1, 0), &class);
3310 hfsc_install__(netdev, class.max_rate);
/* Builds in-memory HFSC state for a qdisc that already exists on 'netdev'.
 * The qdisc-wide max rate is read back from class 1:fffe, then every class
 * returned by the queue dump that parses successfully is recorded as a
 * queue.  'nlmsg' is unused.
 *
 * NOTE(review): the return value of hfsc_query_class__() is not checked on
 * the visible line, so 'hc.max_rate' must have a defined value beforehand --
 * confirm. */
3315 hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3318 struct nl_dump dump;
3319 struct hfsc_class hc;
3322 hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
3323 hfsc_install__(netdev, hc.max_rate);
3325 if (!start_queue_dump(netdev, &dump)) {
3329 while (nl_dump_next(&dump, &msg)) {
3330 unsigned int queue_id;
/* Messages that fail to parse are skipped. */
3332 if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3333 hfsc_update_queue__(netdev, queue_id, &hc);
3337 nl_dump_done(&dump);
/* Tears down the HFSC state that embeds 'tc'.  The visible loop unlinks every
 * queue from the queue map; the _SAFE iterator is used because entries are
 * removed while iterating. */
3342 hfsc_tc_destroy(struct tc *tc)
3345 struct hfsc_class *hc, *next;
3347 hfsc = CONTAINER_OF(tc, struct hfsc, tc);
3349 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
3350 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
/* Adds the qdisc-wide "max-rate" to 'details'.  The stored value is
 * multiplied by 8 in 64-bit arithmetic before being formatted. */
3359 hfsc_qdisc_get(const struct netdev *netdev, struct smap *details)
3361 const struct hfsc *hfsc;
3362 hfsc = hfsc_get__(netdev);
3363 smap_add_format(details, "max-rate", "%llu", 8ULL * hfsc->max_rate);
/* Reconfigures the HFSC qdisc on 'netdev' from 'details': class 1:fffe under
 * 1:0 is set up again with the newly parsed parameters and the cached
 * qdisc-wide max rate is updated. */
3368 hfsc_qdisc_set(struct netdev *netdev, const struct smap *details)
3371 struct hfsc_class class;
3373 hfsc_parse_qdisc_details__(netdev, details, &class);
3374 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3375 tc_make_handle(1, 0), &class);
3378 hfsc_get__(netdev)->max_rate = class.max_rate;
/* Describes 'queue' in 'details'.  Rates are multiplied by 8 in 64-bit
 * arithmetic before being formatted.  "max-rate" is reported only when it
 * differs from "min-rate".  'netdev' is unused. */
3385 hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
3386 const struct tc_queue *queue, struct smap *details)
3388 const struct hfsc_class *hc;
3390 hc = hfsc_class_cast__(queue);
3391 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
3392 if (hc->min_rate != hc->max_rate) {
3393 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
/* Configures queue 'queue_id' on 'netdev' from 'details'.  The queue maps to
 * kernel class 1:(queue_id + 1) under parent 1:fffe; the parsed parameters
 * are then stored in the in-memory queue table. */
3399 hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
3400 const struct smap *details)
3403 struct hfsc_class class;
3405 error = hfsc_parse_class_details__(netdev, details, &class);
3410 error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3411 tc_make_handle(1, 0xfffe), &class);
3416 hfsc_update_queue__(netdev, queue_id, &class);
/* Deletes kernel class 1:(queue_id + 1) for 'queue' on 'netdev' and unlinks
 * the queue from the in-memory queue map. */
3421 hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
3425 struct hfsc_class *hc;
3427 hc = hfsc_class_cast__(queue);
3428 hfsc = hfsc_get__(netdev);
3430 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3432 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
/* Fetches statistics for 'queue' into 'stats' by querying kernel class
 * 1:(queue_id + 1) under parent 1:fffe.  No class options are requested. */
3439 hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3440 struct netdev_queue_stats *stats)
3442 return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3443 tc_make_handle(1, 0xfffe), NULL, stats);
/* Handles one class message from a statistics dump.  If 'nlmsg' describes a
 * class 1:N with 1 <= N <= HFSC_N_QUEUES, 'cb' is invoked with queue ID
 * N - 1, the parsed statistics and 'aux'.  'netdev' is unused. */
3447 hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3448 const struct ofpbuf *nlmsg,
3449 netdev_dump_queue_stats_cb *cb, void *aux)
3451 struct netdev_queue_stats stats;
3452 unsigned int handle, major, minor;
3455 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3460 major = tc_get_major(handle);
3461 minor = tc_get_minor(handle);
3462 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3463 (*cb)(minor - 1, &stats, aux);
/* Operations table for the HFSC implementation: kernel qdisc name "hfsc",
 * exposed to OVS as "linux-hfsc", with HFSC_N_QUEUES queues.  Every callback
 * slot is populated. */
3468 static const struct tc_ops tc_ops_hfsc = {
3469 "hfsc", /* linux_name */
3470 "linux-hfsc", /* ovs_name */
3471 HFSC_N_QUEUES, /* n_queues */
3472 hfsc_tc_install, /* tc_install */
3473 hfsc_tc_load, /* tc_load */
3474 hfsc_tc_destroy, /* tc_destroy */
3475 hfsc_qdisc_get, /* qdisc_get */
3476 hfsc_qdisc_set, /* qdisc_set */
3477 hfsc_class_get, /* class_get */
3478 hfsc_class_set, /* class_set */
3479 hfsc_class_delete, /* class_delete */
3480 hfsc_class_get_stats, /* class_get_stats */
3481 hfsc_class_dump_stats /* class_dump_stats */
3484 /* "linux-default" traffic control class.
3486 * This class represents the default, unnamed Linux qdisc. It corresponds to
3487 * the "" (empty string) QoS type in the OVS database. */
/* Points 'netdev_''s tc at a single function-local static tc object that is
 * shared by every caller; nothing is allocated. */
3490 default_install__(struct netdev *netdev_)
3492 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3493 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_default);
3495 /* Nothing but a tc class implementation is allowed to write to a tc. This
3496 * class never does that, so we can legitimately use a const tc object. */
3497 netdev->tc = CONST_CAST(struct tc *, &tc);
/* tc_install callback for the default class: attaches the shared default tc
 * object to 'netdev'.  'details' is unused. */
3501 default_tc_install(struct netdev *netdev,
3502 const struct smap *details OVS_UNUSED)
3504 default_install__(netdev);
/* tc_load callback for the default class: attaches the shared default tc
 * object to 'netdev'.  'nlmsg' is unused. */
3509 default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3511 default_install__(netdev);
/* Operations table for the default class.  It has no kernel qdisc name, and
 * none of the destroy, qdisc or per-class callbacks listed here is
 * provided. */
3515 static const struct tc_ops tc_ops_default = {
3516 NULL, /* linux_name */
3521 NULL, /* tc_destroy */
3522 NULL, /* qdisc_get */
3523 NULL, /* qdisc_set */
3524 NULL, /* class_get */
3525 NULL, /* class_set */
3526 NULL, /* class_delete */
3527 NULL, /* class_get_stats */
3528 NULL /* class_dump_stats */
3531 /* "linux-other" traffic control class.
/* tc_load callback for "linux-other": points 'netdev_''s tc at a single
 * function-local static tc object; nothing is allocated.  'nlmsg' is
 * unused. */
3536 other_tc_load(struct netdev *netdev_, struct ofpbuf *nlmsg OVS_UNUSED)
3538 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3539 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_other);
3541 /* Nothing but a tc class implementation is allowed to write to a tc. This
3542 * class never does that, so we can legitimately use a const tc object. */
3543 netdev->tc = CONST_CAST(struct tc *, &tc);
/* Operations table for "linux-other".  It has no kernel qdisc name and no
 * tc_install callback, and none of the destroy, qdisc or per-class callbacks
 * listed here is provided. */
3547 static const struct tc_ops tc_ops_other = {
3548 NULL, /* linux_name */
3549 "linux-other", /* ovs_name */
3551 NULL, /* tc_install */
3553 NULL, /* tc_destroy */
3554 NULL, /* qdisc_get */
3555 NULL, /* qdisc_set */
3556 NULL, /* class_get */
3557 NULL, /* class_set */
3558 NULL, /* class_delete */
3559 NULL, /* class_get_stats */
3560 NULL /* class_dump_stats */
3563 /* Traffic control. */
/* Both values below are derived from /proc/net/psched.  'ticks_per_s' is the
 * divisor or multiplier used by the tick/byte conversion helpers. */
3565 /* Number of kernel "tc" ticks per second. */
3566 static double ticks_per_s;
3568 /* Number of kernel "jiffies" per second. This is used for the purpose of
3569 * computing buffer sizes. Generally kernel qdiscs need to be able to buffer
3570 * one jiffy's worth of data.
3572 * There are two possibilities here:
3574 * - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
3575 * approximate range of 100 to 1024. That means that we really need to
3576 * make sure that the qdisc can buffer that much data.
3578 * - 'buffer_hz' is an absurdly large number. That means that the kernel
3579 * has finely granular timers and there's no need to fudge additional room
3580 * for buffers. (There's no extra effort needed to implement that: the
3581 * large 'buffer_hz' is used as a divisor, so practically any number will
3582 * come out as 0 in the division. Small integer results in the case of
3583 * really high dividends won't have any real effect anyhow.)
3585 static unsigned int buffer_hz;
3587 /* Returns tc handle 'major':'minor'. */
/* 'major' is shifted into the upper 16 bits before TC_H_MAKE combines it
 * with 'minor'. */
3589 tc_make_handle(unsigned int major, unsigned int minor)
3591 return TC_H_MAKE(major << 16, minor);
3594 /* Returns the major number from 'handle'. */
/* TC_H_MAJ leaves the major in the upper 16 bits, so it is shifted down,
 * mirroring the shift applied by tc_make_handle(). */
3596 tc_get_major(unsigned int handle)
3598 return TC_H_MAJ(handle) >> 16;
3601 /* Returns the minor number from 'handle'. */
/* No shift is needed: TC_H_MIN's result is returned as is. */
3603 tc_get_minor(unsigned int handle)
3605 return TC_H_MIN(handle);
/* Starts a traffic-control Netlink request of the given 'type' for 'netdev'
 * in 'request'.  The buffer is initialized here with an initial size of 512
 * bytes; NLM_F_REQUEST is always added to 'flags'.  The tcmsg header is
 * zero-filled apart from the address family and the device's ifindex, so the
 * caller must set the handle and parent before sending. */
3608 static struct tcmsg *
3609 tc_make_request(const struct netdev *netdev, int type, unsigned int flags,
3610 struct ofpbuf *request)
3612 struct tcmsg *tcmsg;
3616 error = get_ifindex(netdev, &ifindex);
3621 ofpbuf_init(request, 512);
3622 nl_msg_put_nlmsghdr(request, sizeof *tcmsg, type, NLM_F_REQUEST | flags);
3623 tcmsg = ofpbuf_put_zeros(request, sizeof *tcmsg);
3624 tcmsg->tcm_family = AF_UNSPEC;
3625 tcmsg->tcm_ifindex = ifindex;
3626 /* Caller should fill in tcmsg->tcm_handle. */
3627 /* Caller should fill in tcmsg->tcm_parent. */
/* Sends 'request' over NETLINK_ROUTE, storing any reply through 'replyp'
 * (callers in this file pass NULL when they want none).  'request' is
 * uninitialized here whatever the outcome, so callers must not reuse or free
 * it afterwards. */
3633 tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
3635 int error = nl_transact(NETLINK_ROUTE, request, replyp);
3636 ofpbuf_uninit(request);
3640 /* Adds or deletes a root ingress qdisc on 'netdev'. We use this for
3641 * policing configuration.
3643 * This function is equivalent to running the following when 'add' is true:
3644 * /sbin/tc qdisc add dev <devname> handle ffff: ingress
3646 * This function is equivalent to running the following when 'add' is false:
3647 * /sbin/tc qdisc del dev <devname> handle ffff: ingress
3649 * The configuration and stats may be seen with the following command:
3650 * /sbin/tc -s qdisc show dev <devname>
3652 * Returns 0 if successful, otherwise a positive errno value.
3655 tc_add_del_ingress_qdisc(struct netdev *netdev, bool add)
3657 struct ofpbuf request;
3658 struct tcmsg *tcmsg;
/* Adding uses NLM_F_EXCL | NLM_F_CREATE; deleting needs no extra flags. */
3660 int type = add ? RTM_NEWQDISC : RTM_DELQDISC;
3661 int flags = add ? NLM_F_EXCL | NLM_F_CREATE : 0;
3663 tcmsg = tc_make_request(netdev, type, flags, &request);
/* Handle ffff:0 under the special ingress parent, with empty options. */
3667 tcmsg->tcm_handle = tc_make_handle(0xffff, 0);
3668 tcmsg->tcm_parent = TC_H_INGRESS;
3669 nl_msg_put_string(&request, TCA_KIND, "ingress");
3670 nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0);
3672 error = tc_transact(&request, NULL);
3674 /* If we're deleting the qdisc, don't worry about some of the
3675 * error conditions. */
3676 if (!add && (error == ENOENT || error == EINVAL)) {
3685 /* Adds a policer to 'netdev' with a rate of 'kbits_rate' and a burst size
3688 * This function is equivalent to running:
3689 * /sbin/tc filter add dev <devname> parent ffff: protocol all prio 49
3690 * basic police rate <kbits_rate>kbit burst <kbits_burst>k
3693 * The configuration and stats may be seen with the following command:
3694 * /sbin/tc -s filter show dev <devname> parent ffff:
3696 * Returns 0 if successful, otherwise a positive errno value.
3699 tc_add_policer(struct netdev *netdev, int kbits_rate, int kbits_burst)
3701 struct tc_police tc_police;
3702 struct ofpbuf request;
3703 struct tcmsg *tcmsg;
3704 size_t basic_offset;
3705 size_t police_offset;
/* Packets over the limit are dropped (TC_POLICE_SHOT).  The rate is converted
 * from kbit/s to bytes/s, and the burst from units of 1024 to ticks at that
 * rate.
 *
 * NOTE(review): 'kbits_rate * 1000' and 'kbits_burst * 1024' are evaluated in
 * int, so arguments above roughly 2.1 million overflow -- confirm that
 * callers bound them. */
3709 memset(&tc_police, 0, sizeof tc_police);
3710 tc_police.action = TC_POLICE_SHOT;
3711 tc_police.mtu = mtu;
3712 tc_fill_rate(&tc_police.rate, (kbits_rate * 1000)/8, mtu);
3713 tc_police.burst = tc_bytes_to_ticks(tc_police.rate.rate,
3714 kbits_burst * 1024);
3716 tcmsg = tc_make_request(netdev, RTM_NEWTFILTER,
3717 NLM_F_EXCL | NLM_F_CREATE, &request);
/* The filter hangs off the ingress qdisc ffff:0; tcm_info packs priority 49
 * with the protocol ETH_P_ALL in network byte order. */
3721 tcmsg->tcm_parent = tc_make_handle(0xffff, 0);
3722 tcmsg->tcm_info = tc_make_handle(49,
3723 (OVS_FORCE uint16_t) htons(ETH_P_ALL));
/* Nesting: TCA_OPTIONS > TCA_BASIC_POLICE > { TCA_POLICE_TBF, rate table }. */
3725 nl_msg_put_string(&request, TCA_KIND, "basic");
3726 basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3727 police_offset = nl_msg_start_nested(&request, TCA_BASIC_POLICE);
3728 nl_msg_put_unspec(&request, TCA_POLICE_TBF, &tc_police, sizeof tc_police);
3729 tc_put_rtab(&request, TCA_POLICE_RATE, &tc_police.rate);
3730 nl_msg_end_nested(&request, police_offset);
3731 nl_msg_end_nested(&request, basic_offset);
3733 error = tc_transact(&request, NULL);
3744 /* The values in psched are not individually very meaningful, but they are
3745 * important. The tables below show some values seen in the wild.
3749 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
3750 * (Before that, there are hints that it was 1000000000.)
3752 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
3756 * -----------------------------------
3757 * [1] 000c8000 000f4240 000f4240 00000064
3758 * [2] 000003e8 00000400 000f4240 3b9aca00
3759 * [3] 000003e8 00000400 000f4240 3b9aca00
3760 * [4] 000003e8 00000400 000f4240 00000064
3761 * [5] 000003e8 00000040 000f4240 3b9aca00
3762 * [6] 000003e8 00000040 000f4240 000000f9
3764 * a b c d ticks_per_s buffer_hz
3765 * ------- --------- ---------- ------------- ----------- -------------
3766 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
3767 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3768 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3769 * [4] 1,000 1,024 1,000,000 100 976,562 100
3770 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
3771 * [6] 1,000 64 1,000,000 249 15,625,000 249
3773 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
3774 * [2] 2.6.26-1-686-bigmem from Debian lenny
3775 * [3] 2.6.26-2-sparc64 from Debian lenny
3776 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
3777 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
3778 * [6] 2.6.34 from kernel.org on KVM
3780 static const char fn[] = "/proc/net/psched";
3781 unsigned int a, b, c, d;
3787 stream = fopen(fn, "r");
3789 VLOG_WARN("%s: open failed: %s", fn, ovs_strerror(errno));
/* The file holds four hexadecimal fields; all four must be present. */
3793 if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
3794 VLOG_WARN("%s: read failed", fn);
3798 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
3802 VLOG_WARN("%s: invalid scheduler parameters", fn);
/* Computed in double so that a * c cannot overflow.  Check against row [2]
 * of the table above: 1000 * 1000000 / 1024 = 976562.5. */
3806 ticks_per_s = (double) a * c / b;
3810 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
3813 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);
3816 /* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
3817 * rate of 'rate' bytes per second. */
3819 tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
/* The product is formed in floating point.  In unsigned int arithmetic
 * 'rate * ticks' wraps around for realistic inputs (for example 125,000,000
 * bytes/s times a few dozen ticks), which corrupted the result before the
 * division by 'ticks_per_s' was even applied. */
3824 return ((double) rate * ticks) / ticks_per_s;
3827 /* Returns the number of ticks that it would take to transmit 'size' bytes at a
3828 * rate of 'rate' bytes per second. */
/* A 'rate' of zero yields 0 rather than a division by zero.  The cast
 * truncates 'ticks_per_s' to an integer and makes the multiplication 64-bit,
 * so the product cannot wrap at 32 bits. */
3830 tc_bytes_to_ticks(unsigned int rate, unsigned int size)
3835 return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
3838 /* Returns the number of bytes that need to be reserved for qdisc buffering at
3839 * a transmission rate of 'rate' bytes per second. */
/* Integer division: a 'buffer_hz' larger than 'rate' gives 0.
 * NOTE(review): relies on 'buffer_hz' being nonzero by the time this line
 * runs -- confirm against the initialization path. */
3841 tc_buffer_per_jiffy(unsigned int rate)
3846 return rate / buffer_hz;
3849 /* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
3850 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
3851 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
3852 * stores NULL into it if it is absent.
3854 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
3857 * Returns 0 if successful, otherwise a positive errno value. */
3859 tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
3860 struct nlattr **options)
/* TCA_KIND is mandatory; TCA_OPTIONS is optional. */
3862 static const struct nl_policy tca_policy[] = {
3863 [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
3864 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
3866 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
/* Attributes start after the Netlink header and the fixed-size tcmsg. */
3868 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3869 tca_policy, ta, ARRAY_SIZE(ta))) {
3870 VLOG_WARN_RL(&rl, "failed to parse qdisc message");
3875 *kind = nl_attr_get_string(ta[TCA_KIND]);
3879 *options = ta[TCA_OPTIONS];
3894 /* Given Netlink 'msg' that describes a class, extracts its class handle
3895 * (the message's tcm_handle) into '*handlep', its TCA_OPTIONS attribute
3896 * into '*options', and its queue statistics into '*stats'. Any of the output
3897 * arguments may be null.
3899 * Returns 0 if successful, otherwise a positive errno value. */
3901 tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
3902 struct nlattr **options, struct netdev_queue_stats *stats)
/* Unlike for qdiscs, both TCA_OPTIONS and TCA_STATS2 are mandatory here. */
3904 static const struct nl_policy tca_policy[] = {
3905 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
3906 [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
3908 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
/* Attributes start after the Netlink header and the fixed-size tcmsg. */
3910 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3911 tca_policy, ta, ARRAY_SIZE(ta))) {
3912 VLOG_WARN_RL(&rl, "failed to parse class message");
3917 struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
3918 *handlep = tc->tcm_handle;
3922 *options = ta[TCA_OPTIONS];
3926 const struct gnet_stats_queue *gsq;
3927 struct gnet_stats_basic gsb;
/* Within TCA_STATS2, both the basic and the queue statistics are required. */
3929 static const struct nl_policy stats_policy[] = {
3930 [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
3931 .min_len = sizeof gsb },
3932 [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
3933 .min_len = sizeof *gsq },
3935 struct nlattr *sa[ARRAY_SIZE(stats_policy)];
3937 if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
3938 sa, ARRAY_SIZE(sa))) {
3939 VLOG_WARN_RL(&rl, "failed to parse class stats");
3943 /* Alignment issues screw up the length of struct gnet_stats_basic on
3944 * some arch/bitsize combinations. Newer versions of Linux have a
3945 * struct gnet_stats_basic_packed, but we can't depend on that. The
3946 * easiest thing to do is just to make a copy. */
3947 memset(&gsb, 0, sizeof gsb);
3948 memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
3949 MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
3950 stats->tx_bytes = gsb.bytes;
3951 stats->tx_packets = gsb.packets;
/* Queue drops are reported as transmit errors. */
3953 gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
3954 stats->tx_errors = gsq->drops;
3964 memset(stats, 0, sizeof *stats);
3969 /* Queries the kernel for class with identifier 'handle' and parent 'parent'
 * on 'netdev'. */
3972 tc_query_class(const struct netdev *netdev,
3973 unsigned int handle, unsigned int parent,
3974 struct ofpbuf **replyp)
3976 struct ofpbuf request;
3977 struct tcmsg *tcmsg;
/* Build an RTM_GETTCLASS request (with NLM_F_ECHO) for 'netdev' and fill in
 * the class handle and parent that identify the class to query. */
3980 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO, &request);
3984 tcmsg->tcm_handle = handle;
3985 tcmsg->tcm_parent = parent;
/* 'replyp' is handed straight to tc_transact(), which receives the reply. */
3987 error = tc_transact(&request, replyp);
/* The warning prints 'handle' and 'parent' as major:minor pairs. */
3989 VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
3990 netdev_get_name(netdev),
3991 tc_get_major(handle), tc_get_minor(handle),
3992 tc_get_major(parent), tc_get_minor(parent),
3993 ovs_strerror(error));
3998 /* Equivalent to "tc class del dev <name> handle <handle>".
 *
 * 'netdev' is the device whose class is deleted and 'handle' identifies the
 * class.  A rate-limited warning is logged with the error string when the
 * request fails. */
4000 tc_delete_class(const struct netdev *netdev, unsigned int handle)
4002 struct ofpbuf request;
4003 struct tcmsg *tcmsg;
/* Build an RTM_DELTCLASS request with no extra Netlink flags. */
4006 tcmsg = tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
4010 tcmsg->tcm_handle = handle;
4011 tcmsg->tcm_parent = 0;
/* No reply buffer is requested (NULL). */
4013 error = tc_transact(&request, NULL);
4015 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
4016 netdev_get_name(netdev),
4017 tc_get_major(handle), tc_get_minor(handle),
4018 ovs_strerror(error));
4023 /* Equivalent to "tc qdisc del dev <name> root".
 *
 * If the deletion succeeds and 'netdev_' has a traffic-control instance
 * ('netdev->tc'), that instance's 'tc_destroy' callback is invoked when one is
 * provided. */
4025 tc_del_qdisc(struct netdev *netdev_)
4027 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4028 struct ofpbuf request;
4029 struct tcmsg *tcmsg;
/* Build an RTM_DELQDISC request for the qdisc with handle 1:0 attached at the
 * root (TC_H_ROOT). */
4032 tcmsg = tc_make_request(netdev_, RTM_DELQDISC, 0, &request);
4036 tcmsg->tcm_handle = tc_make_handle(1, 0);
4037 tcmsg->tcm_parent = TC_H_ROOT;
4039 error = tc_transact(&request, NULL);
4040 if (error == EINVAL) {
4041 /* EINVAL probably means that the default qdisc was in use, in which
4042 * case we've accomplished our purpose. */
/* NOTE(review): the statement handling the EINVAL case is not visible in this
 * listing; presumably the error is treated as success -- confirm. */
4045 if (!error && netdev->tc) {
/* 'tc_destroy' is optional; it is only called when the ops table sets it. */
4046 if (netdev->tc->ops->tc_destroy) {
4047 netdev->tc->ops->tc_destroy(netdev->tc);
4054 /* If 'netdev''s qdisc type and parameters are not yet known, queries the
4055 * kernel to determine what they are. Returns 0 if successful, otherwise a
4056 * positive errno value.
 *
 * The qdisc is looked up by handle 1:0.  The reply (or the lack of one)
 * selects a 'struct tc_ops', whose 'tc_load' callback is then invoked.  The
 * return value is the query error if there was one, otherwise the result of
 * 'tc_load'. */
4058 tc_query_qdisc(const struct netdev *netdev_)
4060 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4061 struct ofpbuf request, *qdisc;
4062 const struct tc_ops *ops;
4063 struct tcmsg *tcmsg;
4071 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
4072 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
4073 * 2.6.35 without that fix backported to it.
4075 * To avoid the OOPS, we must not make a request that would attempt to dump
4076 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
4077 * few others. There are a few ways that I can see to do this, but most of
4078 * them seem to be racy (and if you lose the race the kernel OOPSes). The
4079 * technique chosen here is to assume that any non-default qdisc that we
4080 * create will have a class with handle 1:0. The built-in qdiscs only have
4081 * a class with handle 0:0.
4083 * We could check for Linux 2.6.35+ and use a more straightforward method
 * there. */
4085 tcmsg = tc_make_request(netdev_, RTM_GETQDISC, NLM_F_ECHO, &request);
4089 tcmsg->tcm_handle = tc_make_handle(1, 0);
4090 tcmsg->tcm_parent = 0;
4092 /* Figure out what tc class to instantiate. */
4093 error = tc_transact(&request, &qdisc);
/* A reply arrived: extract the qdisc kind name from it. */
4097 error = tc_parse_qdisc(qdisc, &kind, NULL);
4099 ops = &tc_ops_other;
/* Map the kernel's qdisc name to one of the known tc implementations. */
4101 ops = tc_lookup_linux_name(kind);
4103 static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
4104 VLOG_INFO_RL(&rl2, "unknown qdisc \"%s\"", kind);
4106 ops = &tc_ops_other;
4109 } else if (error == ENOENT) {
4110 /* Either it's a built-in qdisc, or it's a qdisc set up by some
4111 * other entity that doesn't have a handle 1:0. We will assume
4112 * that it's the system default qdisc. */
4113 ops = &tc_ops_default;
4116 /* Who knows? Maybe the device got deleted. */
4117 VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
4118 netdev_get_name(netdev_), ovs_strerror(error));
4119 ops = &tc_ops_other;
4122 /* Instantiate it. */
4123 load_error = ops->tc_load(CONST_CAST(struct netdev *, netdev_), qdisc);
/* Invariant: 'tc_load' succeeds exactly when it leaves 'netdev->tc' set. */
4124 ovs_assert((load_error == 0) == (netdev->tc != NULL));
4125 ofpbuf_delete(qdisc);
4127 return error ? error : load_error;
4130 /* Linux traffic control uses tables with 256 entries ("rtab" tables) to
4131 approximate the time to transmit packets of various lengths. For an MTU of
4132 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
4133 represents two possible packet lengths; for an MTU of 513 through 1024, four
4134 possible lengths; and so on.
4136 Returns, for the specified 'mtu', the number of bits that packet lengths
4137 need to be shifted right to fit within such a 256-entry table. */
4139 tc_calc_cell_log(unsigned int mtu)
/* NOTE(review): the condition guarding this default is not visible in this
 * listing; presumably it applies when no MTU was supplied -- confirm. */
4144 mtu = ETH_PAYLOAD_MAX;
/* Account for the Ethernet header and a VLAN header on top of the payload. */
4146 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
/* Counts loop iterations while 'mtu' is at least 256.  NOTE(review): the loop
 * body is not visible here; presumably it halves 'mtu' each time -- confirm. */
4148 for (cell_log = 0; mtu >= 256; cell_log++) {
4155 /* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
 * of 'mtu'. */
4158 tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
/* Start from an all-zero ratespec so that every field not set below is 0. */
4160 memset(rate, 0, sizeof *rate);
/* 'cell_log' is derived from 'mtu' by tc_calc_cell_log(). */
4161 rate->cell_log = tc_calc_cell_log(mtu);
4162 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
4163 /* rate->cell_align = 0; */ /* distro headers. */
/* The minimum packet unit is the minimum total Ethernet frame size. */
4164 rate->mpu = ETH_TOTAL_MIN;
/* NOTE(review): no statement using 'Bps' is visible in this listing;
 * presumably 'rate->rate' is assigned from it -- confirm. */
4168 /* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
4169 * attribute of the specified "type".
4171 * See tc_calc_cell_log() above for a description of "rtab"s.
 *
 * Entry 'i' holds the number of ticks needed to send a packet of
 * '(i + 1) << rate->cell_log' bytes (raised to 'rate->mpu' if smaller) at
 * 'rate->rate'. */
4173 tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
/* Obtain the TC_RTAB_SIZE-byte attribute payload that the loop below fills
 * in. */
4178 rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
/* The number of entries is the payload size divided by the entry size. */
4179 for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
4180 unsigned packet_size = (i + 1) << rate->cell_log;
/* Never compute a transmit time for less than the minimum packet unit. */
4181 if (packet_size < rate->mpu) {
4182 packet_size = rate->mpu;
4184 rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
4188 /* Calculates the proper value of 'buffer' or 'cbuffer' in HTB options given a
4189 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
4190 * burst size of 'burst_bytes'. (If no value was requested, a 'burst_bytes' of
 * 0 is fine.) */
4193 tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
/* The smallest burst allowed is one jiffy's worth of buffering at 'Bps' plus
 * one 'mtu'. */
4195 unsigned int min_burst = tc_buffer_per_jiffy(Bps) + mtu;
/* The larger of the requested and minimum burst is converted to ticks, so a
 * 'burst_bytes' of 0 simply selects 'min_burst'.
 * NOTE(review): tc_bytes_to_ticks() takes its size as unsigned int, so a
 * 'burst_bytes' wider than that would be narrowed by this call -- confirm
 * whether callers can pass such values. */
4196 return tc_bytes_to_ticks(Bps, MAX(burst_bytes, min_burst));
4199 /* Linux-only functions declared in netdev-linux.h */
4201 /* Returns a fd for an AF_INET socket or a negative errno value.
 *
 * netdev_linux_init() is called first; if it reports an error, that error is
 * returned negated, otherwise the file-level 'af_inet_sock' is returned. */
4203 netdev_linux_get_af_inet_sock(void)
4205 int error = netdev_linux_init();
4206 return error ? -error : af_inet_sock;
4209 /* Modifies the 'flag' bit in ethtool's flags field for 'netdev'. If
4210 * 'enable' is true, the bit is set. Otherwise, it is cleared.
 *
 * 'flag_name' is used only in the warning logged when the flags read back
 * after the update differ from the value that was written. */
4212 netdev_linux_ethtool_set_flag(struct netdev *netdev, uint32_t flag,
4213 const char *flag_name, bool enable)
4215 const char *netdev_name = netdev_get_name(netdev);
4216 struct ethtool_value evalue;
/* Step 1: read the current flags with ETHTOOL_GFLAGS. */
4220 COVERAGE_INC(netdev_get_ethtool);
4221 memset(&evalue, 0, sizeof evalue);
4222 error = netdev_linux_do_ethtool(netdev_name,
4223 (struct ethtool_cmd *)&evalue,
4224 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
/* Step 2: clear 'flag', set it again if 'enable', and write the result back
 * with ETHTOOL_SFLAGS.  'new_flags' remembers the value that was written. */
4229 COVERAGE_INC(netdev_set_ethtool);
4230 evalue.data = new_flags = (evalue.data & ~flag) | (enable ? flag : 0);
4231 error = netdev_linux_do_ethtool(netdev_name,
4232 (struct ethtool_cmd *)&evalue,
4233 ETHTOOL_SFLAGS, "ETHTOOL_SFLAGS");
/* Step 3: read the flags again so the result can be compared with
 * 'new_flags'. */
4238 COVERAGE_INC(netdev_get_ethtool);
4239 memset(&evalue, 0, sizeof evalue);
4240 error = netdev_linux_do_ethtool(netdev_name,
4241 (struct ethtool_cmd *)&evalue,
4242 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
/* A mismatch means the requested change is not reflected in the flags that
 * were read back. */
4247 if (new_flags != evalue.data) {
4248 VLOG_WARN_RL(&rl, "attempt to %s ethtool %s flag on network "
4249 "device %s failed", enable ? "enable" : "disable",
4250 flag_name, netdev_name);
4257 /* Utility functions. */
4259 /* Copies 'src' into 'dst', performing format conversion in the process.
 *
 * Each counter listed below is assigned from the field of the same name in
 * 'src'; no other field of 'dst' is written by the visible code. */
4261 netdev_stats_from_rtnl_link_stats(struct netdev_stats *dst,
4262 const struct rtnl_link_stats *src)
/* Packet and byte totals. */
4264 dst->rx_packets = src->rx_packets;
4265 dst->tx_packets = src->tx_packets;
4266 dst->rx_bytes = src->rx_bytes;
4267 dst->tx_bytes = src->tx_bytes;
/* Aggregate error and drop counters. */
4268 dst->rx_errors = src->rx_errors;
4269 dst->tx_errors = src->tx_errors;
4270 dst->rx_dropped = src->rx_dropped;
4271 dst->tx_dropped = src->tx_dropped;
4272 dst->multicast = src->multicast;
4273 dst->collisions = src->collisions;
/* Detailed receive error counters. */
4274 dst->rx_length_errors = src->rx_length_errors;
4275 dst->rx_over_errors = src->rx_over_errors;
4276 dst->rx_crc_errors = src->rx_crc_errors;
4277 dst->rx_frame_errors = src->rx_frame_errors;
4278 dst->rx_fifo_errors = src->rx_fifo_errors;
4279 dst->rx_missed_errors = src->rx_missed_errors;
/* Detailed transmit error counters. */
4280 dst->tx_aborted_errors = src->tx_aborted_errors;
4281 dst->tx_carrier_errors = src->tx_carrier_errors;
4282 dst->tx_fifo_errors = src->tx_fifo_errors;
4283 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
4284 dst->tx_window_errors = src->tx_window_errors;
/* Fetches statistics for the interface with index 'ifindex' by sending an
 * RTM_GETLINK request over NETLINK_ROUTE and, when the reply carries an
 * IFLA_STATS attribute, converting it into '*stats'. */
4288 get_stats_via_netlink(int ifindex, struct netdev_stats *stats)
4290 /* Policy for RTNLGRP_LINK messages.
4292 * There are *many* more fields in these messages, but currently we only
4293 * care about these fields. */
4294 static const struct nl_policy rtnlgrp_link_policy[] = {
4295 [IFLA_IFNAME] = { .type = NL_A_STRING, .optional = false },
4296 [IFLA_STATS] = { .type = NL_A_UNSPEC, .optional = true,
4297 .min_len = sizeof(struct rtnl_link_stats) },
4300 struct ofpbuf request;
4301 struct ofpbuf *reply;
4302 struct ifinfomsg *ifi;
4303 struct nlattr *attrs[ARRAY_SIZE(rtnlgrp_link_policy)];
/* Build the request: a Netlink header followed by a zeroed struct ifinfomsg
 * that names the interface by index. */
4306 ofpbuf_init(&request, 0);
4307 nl_msg_put_nlmsghdr(&request, sizeof *ifi, RTM_GETLINK, NLM_F_REQUEST);
4308 ifi = ofpbuf_put_zeros(&request, sizeof *ifi);
4309 ifi->ifi_family = PF_UNSPEC;
4310 ifi->ifi_index = ifindex;
/* The request buffer is released as soon as the transaction has run. */
4311 error = nl_transact(NETLINK_ROUTE, &request, &reply);
4312 ofpbuf_uninit(&request);
/* Attributes start just past the Netlink header and the struct ifinfomsg.
 * 'reply' is deleted on each of the paths below. */
4317 if (!nl_policy_parse(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg),
4318 rtnlgrp_link_policy,
4319 attrs, ARRAY_SIZE(rtnlgrp_link_policy))) {
4320 ofpbuf_delete(reply);
/* IFLA_STATS is optional in the policy, so its absence is checked here. */
4324 if (!attrs[IFLA_STATS]) {
4325 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
4326 ofpbuf_delete(reply);
4330 netdev_stats_from_rtnl_link_stats(stats, nl_attr_get(attrs[IFLA_STATS]));
4332 ofpbuf_delete(reply);
/* Fetches statistics for the device named 'netdev_name' by scanning
 * "/proc/net/dev" line by line.  Counters that the visible code does not read
 * from the file are set to UINT64_MAX on the matching line.  A rate-limited
 * warning is logged if the file cannot be opened, if a line fails to parse, or
 * if no statistics are found for 'netdev_name'. */
4338 get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats)
4340 static const char fn[] = "/proc/net/dev";
4345 stream = fopen(fn, "r");
4347 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, ovs_strerror(errno));
4352 while (fgets(line, sizeof line, stream)) {
/* X64 is the scanf conversion for one uint64_t counter; each "%*u" parses a
 * field and discards it.  A successfully parsed line yields 15 conversions. */
4355 #define X64 "%"SCNu64
4358 X64 X64 X64 X64 X64 X64 X64 "%*u"
4359 X64 X64 X64 X64 X64 X64 X64 "%*u",
4365 &stats->rx_fifo_errors,
4366 &stats->rx_frame_errors,
4372 &stats->tx_fifo_errors,
4374 &stats->tx_carrier_errors) != 15) {
4375 VLOG_WARN_RL(&rl, "%s:%d: parse error", fn, ln);
4376 } else if (!strcmp(devname, netdev_name)) {
/* These counters are marked with the all-ones value instead of being read
 * from the file. */
4377 stats->rx_length_errors = UINT64_MAX;
4378 stats->rx_over_errors = UINT64_MAX;
4379 stats->rx_crc_errors = UINT64_MAX;
4380 stats->rx_missed_errors = UINT64_MAX;
4381 stats->tx_aborted_errors = UINT64_MAX;
4382 stats->tx_heartbeat_errors = UINT64_MAX;
4383 stats->tx_window_errors = UINT64_MAX;
4389 VLOG_WARN_RL(&rl, "%s: no stats for %s", fn, netdev_name);
/* Reads the interface flags of 'dev' with the SIOCGIFFLAGS ioctl and stores
 * them into '*flags'. */
4395 get_flags(const struct netdev *dev, unsigned int *flags)
4401 error = netdev_linux_do_ioctl(dev->name, &ifr, SIOCGIFFLAGS,
/* The flags come from the 'ifr_flags' member filled in by the ioctl. */
4404 *flags = ifr.ifr_flags;
/* Sets the interface flags of the device named 'name' to 'flags' with the
 * SIOCSIFFLAGS ioctl.  Returns whatever netdev_linux_do_ioctl() returns. */
4410 set_flags(const char *name, unsigned int flags)
4414 ifr.ifr_flags = flags;
4415 return netdev_linux_do_ioctl(name, &ifr, SIOCSIFFLAGS, "SIOCSIFFLAGS");
/* Looks up the interface index of the device named 'netdev_name' with the
 * SIOCGIFINDEX ioctl on 'af_inet_sock' and returns it.  A rate-limited warning
 * is logged when the ioctl fails.
 * NOTE(review): the value returned on failure is not visible in this listing;
 * get_ifindex() negates a failing result to obtain an error code, so it is
 * presumably a negative errno value -- confirm. */
4419 do_get_ifindex(const char *netdev_name)
4423 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4424 COVERAGE_INC(netdev_get_ifindex);
4425 if (ioctl(af_inet_sock, SIOCGIFINDEX, &ifr) < 0) {
4426 VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
4427 netdev_name, ovs_strerror(errno));
4430 return ifr.ifr_ifindex;
/* Stores the cached interface index of 'netdev_' into '*ifindexp' and returns
 * the cached lookup error ('get_ifindex_error').  The lookup is performed only
 * when the VALID_IFINDEX bit is not yet set in 'cache_valid'; its outcome,
 * success or failure, is then cached. */
4434 get_ifindex(const struct netdev *netdev_, int *ifindexp)
4436 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4438 if (!(netdev->cache_valid & VALID_IFINDEX)) {
4439 int ifindex = do_get_ifindex(netdev_get_name(netdev_));
/* Failure branch: the negated lookup result is stored as the error and the
 * cached index is reset to 0.  NOTE(review): the condition selecting this
 * branch is not visible in this listing. */
4442 netdev->get_ifindex_error = -ifindex;
4443 netdev->ifindex = 0;
/* Success branch: no error, and the index is cached. */
4445 netdev->get_ifindex_error = 0;
4446 netdev->ifindex = ifindex;
/* Mark the cache entry valid so the lookup is not repeated. */
4448 netdev->cache_valid |= VALID_IFINDEX;
4451 *ifindexp = netdev->ifindex;
4452 return netdev->get_ifindex_error;
/* Reads the hardware address of the device named 'netdev_name' with the
 * SIOCGIFHWADDR ioctl and copies its first ETH_ADDR_LEN bytes into 'ea'.  A
 * warning is logged if the address family reported by the kernel is neither
 * AF_UNSPEC nor ARPHRD_ETHER. */
4456 get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN])
4461 memset(&ifr, 0, sizeof ifr);
4462 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4463 COVERAGE_INC(netdev_get_hwaddr);
4464 if (ioctl(af_inet_sock, SIOCGIFHWADDR, &ifr) < 0) {
4465 /* ENODEV probably means that a vif disappeared asynchronously and
4466 * hasn't been removed from the database yet, so reduce the log level
4467 * to INFO for that case. */
4468 VLOG(errno == ENODEV ? VLL_INFO : VLL_ERR,
4469 "ioctl(SIOCGIFHWADDR) on %s device failed: %s",
4470 netdev_name, ovs_strerror(errno));
/* An unexpected address family is only logged by the visible code. */
4473 hwaddr_family = ifr.ifr_hwaddr.sa_family;
4474 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER) {
4475 VLOG_WARN("%s device has unknown hardware address family %d",
4476 netdev_name, hwaddr_family);
4478 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
/* Sets the hardware address of the device named 'netdev_name' to 'mac' with
 * the SIOCSIFHWADDR ioctl.  An error is logged when the ioctl fails. */
4483 set_etheraddr(const char *netdev_name,
4484 const uint8_t mac[ETH_ADDR_LEN])
4488 memset(&ifr, 0, sizeof ifr);
4489 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
/* The address is passed to the kernel tagged with the ARPHRD_ETHER family. */
4490 ifr.ifr_hwaddr.sa_family = ARPHRD_ETHER;
4491 memcpy(ifr.ifr_hwaddr.sa_data, mac, ETH_ADDR_LEN);
4492 COVERAGE_INC(netdev_set_hwaddr);
4493 if (ioctl(af_inet_sock, SIOCSIFHWADDR, &ifr) < 0) {
4494 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
4495 netdev_name, ovs_strerror(errno));
/* Issues a SIOCETHTOOL ioctl on the device named 'name', passing 'ecmd' as
 * the request data.  'cmd_name' is used only in the warning logged on failure;
 * failures with EOPNOTSUPP are not logged.
 * NOTE(review): the statement that consumes 'cmd' is not visible in this
 * listing -- confirm how it is stored into 'ecmd'. */
4502 netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
4503 int cmd, const char *cmd_name)
4507 memset(&ifr, 0, sizeof ifr);
4508 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
/* The ethtool request is handed to the kernel through 'ifr_data'. */
4509 ifr.ifr_data = (caddr_t) ecmd;
4512 if (ioctl(af_inet_sock, SIOCETHTOOL, &ifr) == 0) {
4515 if (errno != EOPNOTSUPP) {
4516 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
4517 "failed: %s", cmd_name, name, ovs_strerror(errno));
4519 /* The device doesn't support this operation. That's pretty
4520 * common, so there's no point in logging anything. */
/* Copies 'name' into 'ifr->ifr_name' and issues ioctl 'cmd' on 'af_inet_sock'
 * with 'ifr' as its argument.  'cmd_name' is used only in the rate-limited
 * debug message logged when the ioctl fails. */
4527 netdev_linux_do_ioctl(const char *name, struct ifreq *ifr, int cmd,
4528 const char *cmd_name)
4530 ovs_strzcpy(ifr->ifr_name, name, sizeof ifr->ifr_name);
4531 if (ioctl(af_inet_sock, cmd, ifr) == -1) {
4532 VLOG_DBG_RL(&rl, "%s: ioctl(%s) failed: %s", name, cmd_name,
4533 ovs_strerror(errno));
/* Retrieves an IPv4 address of 'netdev' into '*ip' using ioctl 'cmd', whose
 * printable name 'cmd_name' is passed on to netdev_linux_do_ioctl(). */
4540 netdev_linux_get_ipv4(const struct netdev *netdev, struct in_addr *ip,
4541 int cmd, const char *cmd_name)
/* The request is tagged as AF_INET before the ioctl is issued. */
4546 ifr.ifr_addr.sa_family = AF_INET;
4547 error = netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, cmd, cmd_name);
/* The address is read by viewing 'ifr_addr' as a struct sockaddr_in. */
4549 const struct sockaddr_in *sin = (struct sockaddr_in *) &ifr.ifr_addr;
4550 *ip = sin->sin_addr;
4555 /* Returns an AF_PACKET raw socket or a negative errno value. */
4557 af_packet_sock(void)
4559 static int sock = INT_MIN;
4561 if (sock == INT_MIN) {
4562 sock = socket(AF_PACKET, SOCK_RAW, 0);
4564 int error = set_nonblocking(sock);
4571 VLOG_ERR("failed to create packet socket: %s",
4572 ovs_strerror(errno));