2 * Copyright (c) 2009, 2010, 2011, 2012 Nicira, Inc.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
#include "netdev-linux.h"

#include <arpa/inet.h>
#include <linux/gen_stats.h>
#include <linux/if_ether.h>
#include <linux/if_tun.h>
#include <linux/types.h>
#include <linux/ethtool.h>
#include <linux/mii.h>
#include <linux/pkt_cls.h>
#include <linux/pkt_sched.h>
#include <linux/rtnetlink.h>
#include <linux/sockios.h>
#include <linux/version.h>
#include <sys/types.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <netpacket/packet.h>
#include <net/if_arp.h>
#include <net/if_packet.h>
#include <net/route.h>
#include <netinet/in.h>
#include <unistd.h>

#include "dpif-linux.h"
#include "dynamic-string.h"
#include "fatal-signal.h"
#include "netdev-provider.h"
#include "netdev-vport.h"
#include "netlink-notifier.h"
#include "netlink-socket.h"
#include "openflow/openflow.h"
#include "poll-loop.h"
#include "rtnetlink-link.h"
#include "socket-util.h"
73 VLOG_DEFINE_THIS_MODULE(netdev_linux);
75 COVERAGE_DEFINE(netdev_set_policing);
76 COVERAGE_DEFINE(netdev_arp_lookup);
77 COVERAGE_DEFINE(netdev_get_ifindex);
78 COVERAGE_DEFINE(netdev_get_hwaddr);
79 COVERAGE_DEFINE(netdev_set_hwaddr);
80 COVERAGE_DEFINE(netdev_get_ethtool);
81 COVERAGE_DEFINE(netdev_set_ethtool);
/* These were introduced in Linux 2.6.14, so they might be missing if we have
 * old headers. */
86 #ifndef ADVERTISED_Pause
87 #define ADVERTISED_Pause (1 << 13)
89 #ifndef ADVERTISED_Asym_Pause
90 #define ADVERTISED_Asym_Pause (1 << 14)
93 /* These were introduced in Linux 2.6.24, so they might be missing if we
94 * have old headers. */
95 #ifndef ETHTOOL_GFLAGS
96 #define ETHTOOL_GFLAGS 0x00000025 /* Get flags bitmap(ethtool_value) */
98 #ifndef ETHTOOL_SFLAGS
99 #define ETHTOOL_SFLAGS 0x00000026 /* Set flags bitmap(ethtool_value) */
/* This was introduced in Linux 2.6.25, so it might be missing if we have old
 * headers. */
105 #define TC_RTAB_SIZE 1024
108 static struct nln_notifier *netdev_linux_cache_notifier = NULL;
109 static int cache_notifier_refcount;
112 VALID_IFINDEX = 1 << 0,
113 VALID_ETHERADDR = 1 << 1,
117 VALID_POLICING = 1 << 5,
118 VALID_VPORT_STAT_ERROR = 1 << 6,
119 VALID_DRVINFO = 1 << 7,
120 VALID_FEATURES = 1 << 8,
128 /* Traffic control. */
130 /* An instance of a traffic control class. Always associated with a particular
133 * Each TC implementation subclasses this with whatever additional data it
136 const struct tc_ops *ops;
137 struct hmap queues; /* Contains "struct tc_queue"s.
138 * Read by generic TC layer.
139 * Written only by TC implementation. */
142 /* One traffic control queue.
144 * Each TC implementation subclasses this with whatever additional data it
147 struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
148 unsigned int queue_id; /* OpenFlow queue ID. */
151 /* A particular kind of traffic control. Each implementation generally maps to
152 * one particular Linux qdisc class.
154 * The functions below return 0 if successful or a positive errno value on
155 * failure, except where otherwise noted. All of them must be provided, except
156 * where otherwise noted. */
158 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
159 * This is null for tc_ops_default and tc_ops_other, for which there are no
160 * appropriate values. */
161 const char *linux_name;
163 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
164 const char *ovs_name;
166 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
167 * queues. The queues are numbered 0 through n_queues - 1. */
168 unsigned int n_queues;
170 /* Called to install this TC class on 'netdev'. The implementation should
171 * make the Netlink calls required to set up 'netdev' with the right qdisc
172 * and configure it according to 'details'. The implementation may assume
173 * that the current qdisc is the default; that is, there is no need for it
174 * to delete the current qdisc before installing itself.
176 * The contents of 'details' should be documented as valid for 'ovs_name'
177 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
178 * (which is built as ovs-vswitchd.conf.db(8)).
180 * This function must return 0 if and only if it sets 'netdev->tc' to an
181 * initialized 'struct tc'.
183 * (This function is null for tc_ops_other, which cannot be installed. For
184 * other TC classes it should always be nonnull.) */
185 int (*tc_install)(struct netdev *netdev, const struct smap *details);
187 /* Called when the netdev code determines (through a Netlink query) that
188 * this TC class's qdisc is installed on 'netdev', but we didn't install
189 * it ourselves and so don't know any of the details.
191 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
192 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
193 * implementation should parse the other attributes of 'nlmsg' as
194 * necessary to determine its configuration. If necessary it should also
195 * use Netlink queries to determine the configuration of queues on
198 * This function must return 0 if and only if it sets 'netdev->tc' to an
199 * initialized 'struct tc'. */
200 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
202 /* Destroys the data structures allocated by the implementation as part of
203 * 'tc'. (This includes destroying 'tc->queues' by calling
206 * The implementation should not need to perform any Netlink calls. If
207 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
208 * (But it may not be desirable.)
210 * This function may be null if 'tc' is trivial. */
211 void (*tc_destroy)(struct tc *tc);
213 /* Retrieves details of 'netdev->tc' configuration into 'details'.
215 * The implementation should not need to perform any Netlink calls, because
216 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
217 * cached the configuration.
219 * The contents of 'details' should be documented as valid for 'ovs_name'
220 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
221 * (which is built as ovs-vswitchd.conf.db(8)).
223 * This function may be null if 'tc' is not configurable.
225 int (*qdisc_get)(const struct netdev *netdev, struct smap *details);
227 /* Reconfigures 'netdev->tc' according to 'details', performing any
228 * required Netlink calls to complete the reconfiguration.
230 * The contents of 'details' should be documented as valid for 'ovs_name'
231 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
232 * (which is built as ovs-vswitchd.conf.db(8)).
234 * This function may be null if 'tc' is not configurable.
236 int (*qdisc_set)(struct netdev *, const struct smap *details);
238 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
239 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
241 * The contents of 'details' should be documented as valid for 'ovs_name'
242 * in the "other_config" column in the "Queue" table in
243 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
245 * The implementation should not need to perform any Netlink calls, because
246 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
247 * cached the queue configuration.
249 * This function may be null if 'tc' does not have queues ('n_queues' is
251 int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
252 struct smap *details);
/* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
 * 'details', performing any required Netlink calls to complete the
 * reconfiguration.  The caller ensures that 'queue_id' is less than
 * 'n_queues'.
 *
259 * The contents of 'details' should be documented as valid for 'ovs_name'
260 * in the "other_config" column in the "Queue" table in
261 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
263 * This function may be null if 'tc' does not have queues or its queues are
264 * not configurable. */
265 int (*class_set)(struct netdev *, unsigned int queue_id,
266 const struct smap *details);
268 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
269 * tc_queue's within 'netdev->tc->queues'.
271 * This function may be null if 'tc' does not have queues or its queues
272 * cannot be deleted. */
273 int (*class_delete)(struct netdev *, struct tc_queue *queue);
275 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
276 * 'struct tc_queue's within 'netdev->tc->queues'.
278 * On success, initializes '*stats'.
280 * This function may be null if 'tc' does not have queues or if it cannot
281 * report queue statistics. */
282 int (*class_get_stats)(const struct netdev *netdev,
283 const struct tc_queue *queue,
284 struct netdev_queue_stats *stats);
286 /* Extracts queue stats from 'nlmsg', which is a response to a
287 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
289 * This function may be null if 'tc' does not have queues or if it cannot
290 * report queue statistics. */
291 int (*class_dump_stats)(const struct netdev *netdev,
292 const struct ofpbuf *nlmsg,
293 netdev_dump_queue_stats_cb *cb, void *aux);
297 tc_init(struct tc *tc, const struct tc_ops *ops)
300 hmap_init(&tc->queues);
304 tc_destroy(struct tc *tc)
306 hmap_destroy(&tc->queues);
309 static const struct tc_ops tc_ops_htb;
310 static const struct tc_ops tc_ops_hfsc;
311 static const struct tc_ops tc_ops_default;
312 static const struct tc_ops tc_ops_other;
314 static const struct tc_ops *tcs[] = {
315 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
316 &tc_ops_hfsc, /* Hierarchical fair service curve. */
317 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
318 &tc_ops_other, /* Some other qdisc. */
322 static unsigned int tc_make_handle(unsigned int major, unsigned int minor);
323 static unsigned int tc_get_major(unsigned int handle);
324 static unsigned int tc_get_minor(unsigned int handle);
326 static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
327 static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
328 static unsigned int tc_buffer_per_jiffy(unsigned int rate);
330 static struct tcmsg *tc_make_request(const struct netdev *, int type,
331 unsigned int flags, struct ofpbuf *);
332 static int tc_transact(struct ofpbuf *request, struct ofpbuf **replyp);
333 static int tc_add_del_ingress_qdisc(struct netdev *netdev, bool add);
334 static int tc_add_policer(struct netdev *netdev, int kbits_rate,
337 static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
338 struct nlattr **options);
339 static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
340 struct nlattr **options,
341 struct netdev_queue_stats *);
342 static int tc_query_class(const struct netdev *,
343 unsigned int handle, unsigned int parent,
344 struct ofpbuf **replyp);
345 static int tc_delete_class(const struct netdev *, unsigned int handle);
347 static int tc_del_qdisc(struct netdev *netdev);
348 static int tc_query_qdisc(const struct netdev *netdev);
350 static int tc_calc_cell_log(unsigned int mtu);
351 static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
352 static void tc_put_rtab(struct ofpbuf *, uint16_t type,
353 const struct tc_ratespec *rate);
354 static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
356 struct netdev_dev_linux {
357 struct netdev_dev netdev_dev;
359 struct shash_node *shash_node;
360 unsigned int cache_valid;
361 unsigned int change_seq;
363 bool miimon; /* Link status of last poll. */
364 long long int miimon_interval; /* Miimon Poll rate. Disabled if <= 0. */
365 struct timer miimon_timer;
367 /* The following are figured out "on demand" only. They are only valid
368 * when the corresponding VALID_* bit in 'cache_valid' is set. */
370 uint8_t etheraddr[ETH_ADDR_LEN];
371 struct in_addr address, netmask;
374 unsigned int ifi_flags;
375 long long int carrier_resets;
376 uint32_t kbits_rate; /* Policing data. */
377 uint32_t kbits_burst;
378 int vport_stats_error; /* Cached error code from vport_get_stats().
379 0 or an errno value. */
380 int netdev_mtu_error; /* Cached error code from SIOCGIFMTU or SIOCSIFMTU. */
381 int ether_addr_error; /* Cached error code from set/get etheraddr. */
382 int netdev_policing_error; /* Cached error code from set policing. */
383 int get_features_error; /* Cached error code from ETHTOOL_GSET. */
384 int get_ifindex_error; /* Cached error code from SIOCGIFINDEX. */
386 enum netdev_features current; /* Cached from ETHTOOL_GSET. */
387 enum netdev_features advertised; /* Cached from ETHTOOL_GSET. */
388 enum netdev_features supported; /* Cached from ETHTOOL_GSET. */
389 enum netdev_features peer; /* Cached from ETHTOOL_GSET. */
391 struct ethtool_drvinfo drvinfo; /* Cached from ETHTOOL_GDRVINFO. */
395 struct tap_state tap;
399 struct netdev_linux {
400 struct netdev netdev;
404 /* Sockets used for ioctl operations. */
405 static int af_inet_sock = -1; /* AF_INET, SOCK_DGRAM. */
407 /* A Netlink routing socket that is not subscribed to any multicast groups. */
408 static struct nl_sock *rtnl_sock;
410 /* This is set pretty low because we probably won't learn anything from the
411 * additional log messages. */
412 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
414 static int netdev_linux_init(void);
416 static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
417 int cmd, const char *cmd_name);
418 static int netdev_linux_do_ioctl(const char *name, struct ifreq *, int cmd,
419 const char *cmd_name);
420 static int netdev_linux_get_ipv4(const struct netdev *, struct in_addr *,
421 int cmd, const char *cmd_name);
422 static int get_flags(const struct netdev_dev *, unsigned int *flags);
423 static int set_flags(struct netdev *, unsigned int flags);
424 static int do_get_ifindex(const char *netdev_name);
425 static int get_ifindex(const struct netdev *, int *ifindexp);
426 static int do_set_addr(struct netdev *netdev,
427 int ioctl_nr, const char *ioctl_name,
428 struct in_addr addr);
429 static int get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN]);
430 static int set_etheraddr(const char *netdev_name, const uint8_t[ETH_ADDR_LEN]);
431 static int get_stats_via_netlink(int ifindex, struct netdev_stats *stats);
432 static int get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats);
433 static int af_packet_sock(void);
434 static void netdev_linux_miimon_run(void);
435 static void netdev_linux_miimon_wait(void);
438 is_netdev_linux_class(const struct netdev_class *netdev_class)
440 return netdev_class->init == netdev_linux_init;
443 static struct netdev_dev_linux *
444 netdev_dev_linux_cast(const struct netdev_dev *netdev_dev)
446 const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
447 assert(is_netdev_linux_class(netdev_class));
449 return CONTAINER_OF(netdev_dev, struct netdev_dev_linux, netdev_dev);
452 static struct netdev_linux *
453 netdev_linux_cast(const struct netdev *netdev)
455 struct netdev_dev *netdev_dev = netdev_get_dev(netdev);
456 const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
457 assert(is_netdev_linux_class(netdev_class));
459 return CONTAINER_OF(netdev, struct netdev_linux, netdev);
463 netdev_linux_init(void)
465 static int status = -1;
467 /* Create AF_INET socket. */
468 af_inet_sock = socket(AF_INET, SOCK_DGRAM, 0);
469 status = af_inet_sock >= 0 ? 0 : errno;
471 VLOG_ERR("failed to create inet socket: %s", strerror(status));
474 /* Create rtnetlink socket. */
476 status = nl_sock_create(NETLINK_ROUTE, &rtnl_sock);
478 VLOG_ERR_RL(&rl, "failed to create rtnetlink socket: %s",
/* Periodic work for the Linux netdev classes: runs the rtnetlink link
 * monitor and then MII link monitoring. */
static void
netdev_linux_run(void)
{
    rtnetlink_link_run();
    netdev_linux_miimon_run();
}
/* Counterpart of netdev_linux_run(): calls the wait functions of the
 * rtnetlink link monitor and of MII link monitoring. */
static void
netdev_linux_wait(void)
{
    rtnetlink_link_wait();
    netdev_linux_miimon_wait();
}
501 netdev_linux_get_drvinfo(struct netdev_dev_linux *netdev_dev)
506 if (netdev_dev->cache_valid & VALID_DRVINFO) {
510 COVERAGE_INC(netdev_get_ethtool);
511 memset(&netdev_dev->drvinfo, 0, sizeof netdev_dev->drvinfo);
512 error = netdev_linux_do_ethtool(netdev_dev->netdev_dev.name,
513 (struct ethtool_cmd *)&netdev_dev->drvinfo,
517 netdev_dev->cache_valid |= VALID_DRVINFO;
523 netdev_dev_linux_changed(struct netdev_dev_linux *dev,
524 unsigned int ifi_flags,
528 if (!dev->change_seq) {
532 if ((dev->ifi_flags ^ ifi_flags) & IFF_RUNNING) {
533 dev->carrier_resets++;
535 dev->ifi_flags = ifi_flags;
537 dev->cache_valid &= mask;
541 netdev_dev_linux_update(struct netdev_dev_linux *dev,
542 const struct rtnetlink_link_change *change)
544 if (change->nlmsg_type == RTM_NEWLINK) {
546 netdev_dev_linux_changed(dev, change->ifi_flags, VALID_DRVINFO);
548 /* Update netdev from rtnl-change msg. */
550 dev->mtu = change->mtu;
551 dev->cache_valid |= VALID_MTU;
552 dev->netdev_mtu_error = 0;
555 if (!eth_addr_is_zero(change->addr)) {
556 memcpy(dev->etheraddr, change->addr, ETH_ADDR_LEN);
557 dev->cache_valid |= VALID_ETHERADDR;
558 dev->ether_addr_error = 0;
561 dev->ifindex = change->ifi_index;
562 dev->cache_valid |= VALID_IFINDEX;
563 dev->get_ifindex_error = 0;
566 netdev_dev_linux_changed(dev, change->ifi_flags, 0);
571 netdev_linux_cache_cb(const struct rtnetlink_link_change *change,
572 void *aux OVS_UNUSED)
574 struct netdev_dev_linux *dev;
576 struct netdev_dev *base_dev = netdev_dev_from_name(change->ifname);
578 const struct netdev_class *netdev_class =
579 netdev_dev_get_class(base_dev);
581 if (is_netdev_linux_class(netdev_class)) {
582 dev = netdev_dev_linux_cast(base_dev);
583 netdev_dev_linux_update(dev, change);
587 struct shash device_shash;
588 struct shash_node *node;
590 shash_init(&device_shash);
591 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
592 SHASH_FOR_EACH (node, &device_shash) {
597 get_flags(&dev->netdev_dev, &flags);
598 netdev_dev_linux_changed(dev, flags, 0);
600 shash_destroy(&device_shash);
605 cache_notifier_ref(void)
607 if (!cache_notifier_refcount) {
608 assert(!netdev_linux_cache_notifier);
610 netdev_linux_cache_notifier =
611 rtnetlink_link_notifier_create(netdev_linux_cache_cb, NULL);
613 if (!netdev_linux_cache_notifier) {
617 cache_notifier_refcount++;
623 cache_notifier_unref(void)
625 assert(cache_notifier_refcount > 0);
626 if (!--cache_notifier_refcount) {
627 assert(netdev_linux_cache_notifier);
628 rtnetlink_link_notifier_destroy(netdev_linux_cache_notifier);
629 netdev_linux_cache_notifier = NULL;
633 /* Creates system and internal devices. */
635 netdev_linux_create(const struct netdev_class *class, const char *name,
636 struct netdev_dev **netdev_devp)
638 struct netdev_dev_linux *netdev_dev;
641 error = cache_notifier_ref();
646 netdev_dev = xzalloc(sizeof *netdev_dev);
647 netdev_dev->change_seq = 1;
648 netdev_dev_init(&netdev_dev->netdev_dev, name, class);
649 get_flags(&netdev_dev->netdev_dev, &netdev_dev->ifi_flags);
651 *netdev_devp = &netdev_dev->netdev_dev;
655 /* For most types of netdevs we open the device for each call of
656 * netdev_open(). However, this is not the case with tap devices,
657 * since it is only possible to open the device once. In this
658 * situation we share a single file descriptor, and consequently
659 * buffers, across all readers. Therefore once data is read it will
660 * be unavailable to other reads for tap devices. */
662 netdev_linux_create_tap(const struct netdev_class *class OVS_UNUSED,
663 const char *name, struct netdev_dev **netdev_devp)
665 struct netdev_dev_linux *netdev_dev;
666 struct tap_state *state;
667 static const char tap_dev[] = "/dev/net/tun";
671 netdev_dev = xzalloc(sizeof *netdev_dev);
672 state = &netdev_dev->state.tap;
674 error = cache_notifier_ref();
679 /* Open tap device. */
680 state->fd = open(tap_dev, O_RDWR);
683 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, strerror(error));
684 goto error_unref_notifier;
687 /* Create tap device. */
688 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
689 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
690 if (ioctl(state->fd, TUNSETIFF, &ifr) == -1) {
691 VLOG_WARN("%s: creating tap device failed: %s", name,
694 goto error_unref_notifier;
697 /* Make non-blocking. */
698 error = set_nonblocking(state->fd);
700 goto error_unref_notifier;
703 netdev_dev_init(&netdev_dev->netdev_dev, name, &netdev_tap_class);
704 *netdev_devp = &netdev_dev->netdev_dev;
707 error_unref_notifier:
708 cache_notifier_unref();
715 destroy_tap(struct netdev_dev_linux *netdev_dev)
717 struct tap_state *state = &netdev_dev->state.tap;
719 if (state->fd >= 0) {
724 /* Destroys the netdev device 'netdev_dev_'. */
726 netdev_linux_destroy(struct netdev_dev *netdev_dev_)
728 struct netdev_dev_linux *netdev_dev = netdev_dev_linux_cast(netdev_dev_);
729 const struct netdev_class *class = netdev_dev_get_class(netdev_dev_);
731 if (netdev_dev->tc && netdev_dev->tc->ops->tc_destroy) {
732 netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
735 if (class == &netdev_tap_class) {
736 destroy_tap(netdev_dev);
740 cache_notifier_unref();
744 netdev_linux_open(struct netdev_dev *netdev_dev_, struct netdev **netdevp)
746 struct netdev_linux *netdev;
747 enum netdev_flags flags;
750 /* Allocate network device. */
751 netdev = xzalloc(sizeof *netdev);
753 netdev_init(&netdev->netdev, netdev_dev_);
755 /* Verify that the device really exists, by attempting to read its flags.
756 * (The flags might be cached, in which case this won't actually do an
759 * Don't do this for "internal" netdevs, though, because those have to be
760 * created as netdev objects before they exist in the kernel, because
761 * creating them in the kernel happens by passing a netdev object to
762 * dpif_port_add(). */
763 if (netdev_dev_get_class(netdev_dev_) != &netdev_internal_class) {
764 error = netdev_get_flags(&netdev->netdev, &flags);
765 if (error == ENODEV) {
770 *netdevp = &netdev->netdev;
774 netdev_uninit(&netdev->netdev, true);
778 /* Closes and destroys 'netdev'. */
780 netdev_linux_close(struct netdev *netdev_)
782 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
784 if (netdev->fd > 0 && strcmp(netdev_get_type(netdev_), "tap")) {
791 netdev_linux_listen(struct netdev *netdev_)
793 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
794 struct netdev_dev_linux *netdev_dev =
795 netdev_dev_linux_cast(netdev_get_dev(netdev_));
796 struct sockaddr_ll sll;
801 if (netdev->fd >= 0) {
805 if (!strcmp(netdev_get_type(netdev_), "tap")
806 && !netdev_dev->state.tap.opened) {
807 netdev->fd = netdev_dev->state.tap.fd;
808 netdev_dev->state.tap.opened = true;
812 /* Create file descriptor. */
813 fd = socket(PF_PACKET, SOCK_RAW, 0);
816 VLOG_ERR("failed to create raw socket (%s)", strerror(error));
820 /* Set non-blocking mode. */
821 error = set_nonblocking(fd);
826 /* Get ethernet device index. */
827 error = get_ifindex(&netdev->netdev, &ifindex);
832 /* Bind to specific ethernet device. */
833 memset(&sll, 0, sizeof sll);
834 sll.sll_family = AF_PACKET;
835 sll.sll_ifindex = ifindex;
836 sll.sll_protocol = (OVS_FORCE unsigned short int) htons(ETH_P_ALL);
837 if (bind(fd, (struct sockaddr *) &sll, sizeof sll) < 0) {
839 VLOG_ERR("%s: failed to bind raw socket (%s)",
840 netdev_get_name(netdev_), strerror(error));
855 netdev_linux_recv(struct netdev *netdev_, void *data, size_t size)
857 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
859 if (netdev->fd < 0) {
860 /* Device is not listening. */
867 retval = (netdev_->netdev_dev->netdev_class == &netdev_tap_class
868 ? read(netdev->fd, data, size)
869 : recv(netdev->fd, data, size, MSG_TRUNC));
871 return retval <= size ? retval : -EMSGSIZE;
872 } else if (errno != EINTR) {
873 if (errno != EAGAIN) {
874 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
875 strerror(errno), netdev_get_name(netdev_));
882 /* Registers with the poll loop to wake up from the next call to poll_block()
883 * when a packet is ready to be received with netdev_recv() on 'netdev'. */
885 netdev_linux_recv_wait(struct netdev *netdev_)
887 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
888 if (netdev->fd >= 0) {
889 poll_fd_wait(netdev->fd, POLLIN);
893 /* Discards all packets waiting to be received from 'netdev'. */
895 netdev_linux_drain(struct netdev *netdev_)
897 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
898 if (netdev->fd < 0) {
900 } else if (!strcmp(netdev_get_type(netdev_), "tap")) {
902 int error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
903 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
907 drain_fd(netdev->fd, ifr.ifr_qlen);
910 return drain_rcvbuf(netdev->fd);
914 /* Sends 'buffer' on 'netdev'. Returns 0 if successful, otherwise a positive
915 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
916 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
917 * the packet is too big or too small to transmit on the device.
919 * The caller retains ownership of 'buffer' in all cases.
921 * The kernel maintains a packet transmission queue, so the caller is not
922 * expected to do additional queuing of packets. */
924 netdev_linux_send(struct netdev *netdev_, const void *data, size_t size)
926 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
930 if (netdev->fd < 0) {
931 /* Use our AF_PACKET socket to send to this device. */
932 struct sockaddr_ll sll;
939 sock = af_packet_sock();
944 error = get_ifindex(netdev_, &ifindex);
949 /* We don't bother setting most fields in sockaddr_ll because the
950 * kernel ignores them for SOCK_RAW. */
951 memset(&sll, 0, sizeof sll);
952 sll.sll_family = AF_PACKET;
953 sll.sll_ifindex = ifindex;
955 iov.iov_base = CONST_CAST(void *, data);
959 msg.msg_namelen = sizeof sll;
962 msg.msg_control = NULL;
963 msg.msg_controllen = 0;
966 retval = sendmsg(sock, &msg, 0);
968 /* Use the netdev's own fd to send to this device. This is
969 * essential for tap devices, because packets sent to a tap device
970 * with an AF_PACKET socket will loop back to be *received* again
971 * on the tap device. */
972 retval = write(netdev->fd, data, size);
976 /* The Linux AF_PACKET implementation never blocks waiting for room
977 * for packets, instead returning ENOBUFS. Translate this into
978 * EAGAIN for the caller. */
979 if (errno == ENOBUFS) {
981 } else if (errno == EINTR) {
983 } else if (errno != EAGAIN) {
984 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
985 netdev_get_name(netdev_), strerror(errno));
988 } else if (retval != size) {
989 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%zd bytes of "
990 "%zu) on %s", retval, size, netdev_get_name(netdev_));
998 /* Registers with the poll loop to wake up from the next call to poll_block()
999 * when the packet transmission queue has sufficient room to transmit a packet
1000 * with netdev_send().
1002 * The kernel maintains a packet transmission queue, so the client is not
1003 * expected to do additional queuing of packets. Thus, this function is
1004 * unlikely to ever be used. It is included for completeness. */
1006 netdev_linux_send_wait(struct netdev *netdev_)
1008 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1009 if (netdev->fd < 0) {
1010 /* Nothing to do. */
1011 } else if (strcmp(netdev_get_type(netdev_), "tap")) {
1012 poll_fd_wait(netdev->fd, POLLOUT);
1014 /* TAP device always accepts packets.*/
1015 poll_immediate_wake();
1019 /* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
1020 * otherwise a positive errno value. */
1022 netdev_linux_set_etheraddr(struct netdev *netdev_,
1023 const uint8_t mac[ETH_ADDR_LEN])
1025 struct netdev_dev_linux *netdev_dev =
1026 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1028 bool up_again = false;
1030 if (netdev_dev->cache_valid & VALID_ETHERADDR) {
1031 if (netdev_dev->ether_addr_error) {
1032 return netdev_dev->ether_addr_error;
1034 if (eth_addr_equals(netdev_dev->etheraddr, mac)) {
1037 netdev_dev->cache_valid &= ~VALID_ETHERADDR;
1040 /* Tap devices must be brought down before setting the address. */
1041 if (!strcmp(netdev_get_type(netdev_), "tap")) {
1042 enum netdev_flags flags;
1044 if (!netdev_get_flags(netdev_, &flags) && (flags & NETDEV_UP)) {
1045 netdev_turn_flags_off(netdev_, NETDEV_UP, false);
1049 error = set_etheraddr(netdev_get_name(netdev_), mac);
1050 if (!error || error == ENODEV) {
1051 netdev_dev->ether_addr_error = error;
1052 netdev_dev->cache_valid |= VALID_ETHERADDR;
1054 memcpy(netdev_dev->etheraddr, mac, ETH_ADDR_LEN);
1059 netdev_turn_flags_on(netdev_, NETDEV_UP, false);
1065 /* Copies 'netdev''s MAC address to 'mac' which is passed as param. */
1067 netdev_linux_get_etheraddr(const struct netdev *netdev_,
1068 uint8_t mac[ETH_ADDR_LEN])
1070 struct netdev_dev_linux *netdev_dev =
1071 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1073 if (!(netdev_dev->cache_valid & VALID_ETHERADDR)) {
1074 int error = get_etheraddr(netdev_get_name(netdev_),
1075 netdev_dev->etheraddr);
1077 netdev_dev->ether_addr_error = error;
1078 netdev_dev->cache_valid |= VALID_ETHERADDR;
1081 if (!netdev_dev->ether_addr_error) {
1082 memcpy(mac, netdev_dev->etheraddr, ETH_ADDR_LEN);
1085 return netdev_dev->ether_addr_error;
1088 /* Returns the maximum size of transmitted (and received) packets on 'netdev',
1089 * in bytes, not including the hardware header; thus, this is typically 1500
1090 * bytes for Ethernet devices. */
1092 netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
1094 struct netdev_dev_linux *netdev_dev =
1095 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1096 if (!(netdev_dev->cache_valid & VALID_MTU)) {
1100 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
1101 SIOCGIFMTU, "SIOCGIFMTU");
1103 netdev_dev->netdev_mtu_error = error;
1104 netdev_dev->mtu = ifr.ifr_mtu;
1105 netdev_dev->cache_valid |= VALID_MTU;
1108 if (!netdev_dev->netdev_mtu_error) {
1109 *mtup = netdev_dev->mtu;
1111 return netdev_dev->netdev_mtu_error;
1114 /* Sets the maximum size of transmitted (MTU) for given device using linux
1115 * networking ioctl interface.
1118 netdev_linux_set_mtu(const struct netdev *netdev_, int mtu)
1120 struct netdev_dev_linux *netdev_dev =
1121 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1125 if (netdev_dev->cache_valid & VALID_MTU) {
1126 if (netdev_dev->netdev_mtu_error) {
1127 return netdev_dev->netdev_mtu_error;
1129 if (netdev_dev->mtu == mtu) {
1132 netdev_dev->cache_valid &= ~VALID_MTU;
1135 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
1136 SIOCSIFMTU, "SIOCSIFMTU");
1137 if (!error || error == ENODEV) {
1138 netdev_dev->netdev_mtu_error = error;
1139 netdev_dev->mtu = ifr.ifr_mtu;
1140 netdev_dev->cache_valid |= VALID_MTU;
/* Returns the ifindex of 'netdev', if successful, as a positive number.
 * On failure, returns a negative errno value. */
static int
netdev_linux_get_ifindex(const struct netdev *netdev)
{
    int ifindex;
    int error = get_ifindex(netdev, &ifindex);

    if (error) {
        return -error;
    }
    return ifindex;
}
1157 netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
1159 struct netdev_dev_linux *netdev_dev =
1160 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1162 if (netdev_dev->miimon_interval > 0) {
1163 *carrier = netdev_dev->miimon;
1165 *carrier = (netdev_dev->ifi_flags & IFF_RUNNING) != 0;
1171 static long long int
1172 netdev_linux_get_carrier_resets(const struct netdev *netdev)
1174 return netdev_dev_linux_cast(netdev_get_dev(netdev))->carrier_resets;
1178 netdev_linux_do_miimon(const char *name, int cmd, const char *cmd_name,
1179 struct mii_ioctl_data *data)
1184 memset(&ifr, 0, sizeof ifr);
1185 memcpy(&ifr.ifr_data, data, sizeof *data);
1186 error = netdev_linux_do_ioctl(name, &ifr, cmd, cmd_name);
1187 memcpy(data, &ifr.ifr_data, sizeof *data);
1193 netdev_linux_get_miimon(const char *name, bool *miimon)
1195 struct mii_ioctl_data data;
1200 memset(&data, 0, sizeof data);
1201 error = netdev_linux_do_miimon(name, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
1203 /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
1204 data.reg_num = MII_BMSR;
1205 error = netdev_linux_do_miimon(name, SIOCGMIIREG, "SIOCGMIIREG",
1209 *miimon = !!(data.val_out & BMSR_LSTATUS);
1211 VLOG_WARN_RL(&rl, "%s: failed to query MII", name);
1214 struct ethtool_cmd ecmd;
1216 VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
1219 COVERAGE_INC(netdev_get_ethtool);
1220 memset(&ecmd, 0, sizeof ecmd);
1221 error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
1224 struct ethtool_value eval;
1226 memcpy(&eval, &ecmd, sizeof eval);
1227 *miimon = !!eval.data;
1229 VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
1237 netdev_linux_set_miimon_interval(struct netdev *netdev_,
1238 long long int interval)
1240 struct netdev_dev_linux *netdev_dev;
1242 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev_));
1244 interval = interval > 0 ? MAX(interval, 100) : 0;
1245 if (netdev_dev->miimon_interval != interval) {
1246 netdev_dev->miimon_interval = interval;
1247 timer_set_expired(&netdev_dev->miimon_timer);
1254 netdev_linux_miimon_run(void)
1256 struct shash device_shash;
1257 struct shash_node *node;
1259 shash_init(&device_shash);
1260 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
1261 SHASH_FOR_EACH (node, &device_shash) {
1262 struct netdev_dev_linux *dev = node->data;
1265 if (dev->miimon_interval <= 0 || !timer_expired(&dev->miimon_timer)) {
1269 netdev_linux_get_miimon(dev->netdev_dev.name, &miimon);
1270 if (miimon != dev->miimon) {
1271 dev->miimon = miimon;
1272 netdev_dev_linux_changed(dev, dev->ifi_flags, 0);
1275 timer_set_duration(&dev->miimon_timer, dev->miimon_interval);
1278 shash_destroy(&device_shash);
1282 netdev_linux_miimon_wait(void)
1284 struct shash device_shash;
1285 struct shash_node *node;
1287 shash_init(&device_shash);
1288 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
1289 SHASH_FOR_EACH (node, &device_shash) {
1290 struct netdev_dev_linux *dev = node->data;
1292 if (dev->miimon_interval > 0) {
1293 timer_wait(&dev->miimon_timer);
1296 shash_destroy(&device_shash);
1299 /* Check whether we can use RTM_GETLINK to get network device statistics.
1300 * In pre-2.6.19 kernels, this was only available if wireless extensions were
1303 check_for_working_netlink_stats(void)
1305 /* Decide on the netdev_get_stats() implementation to use. Netlink is
1306 * preferable, so if that works, we'll use it. */
1307 int ifindex = do_get_ifindex("lo");
1309 VLOG_WARN("failed to get ifindex for lo, "
1310 "obtaining netdev stats from proc");
1313 struct netdev_stats stats;
1314 int error = get_stats_via_netlink(ifindex, &stats);
1316 VLOG_DBG("obtaining netdev stats via rtnetlink");
1319 VLOG_INFO("RTM_GETLINK failed (%s), obtaining netdev stats "
1320 "via proc (you are probably running a pre-2.6.19 "
1321 "kernel)", strerror(error));
1328 swap_uint64(uint64_t *a, uint64_t *b)
1336 get_stats_via_vport(const struct netdev *netdev_,
1337 struct netdev_stats *stats)
1339 struct netdev_dev_linux *netdev_dev =
1340 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1342 if (!netdev_dev->vport_stats_error ||
1343 !(netdev_dev->cache_valid & VALID_VPORT_STAT_ERROR)) {
1346 error = netdev_vport_get_stats(netdev_, stats);
1347 if (error && error != ENOENT) {
1348 VLOG_WARN_RL(&rl, "%s: obtaining netdev stats via vport failed "
1349 "(%s)", netdev_get_name(netdev_), strerror(error));
1351 netdev_dev->vport_stats_error = error;
1352 netdev_dev->cache_valid |= VALID_VPORT_STAT_ERROR;
1357 netdev_linux_sys_get_stats(const struct netdev *netdev_,
1358 struct netdev_stats *stats)
1360 static int use_netlink_stats = -1;
1363 if (use_netlink_stats < 0) {
1364 use_netlink_stats = check_for_working_netlink_stats();
1367 if (use_netlink_stats) {
1370 error = get_ifindex(netdev_, &ifindex);
1372 error = get_stats_via_netlink(ifindex, stats);
1375 error = get_stats_via_proc(netdev_get_name(netdev_), stats);
1379 VLOG_WARN_RL(&rl, "%s: linux-sys get stats failed %d",
1380 netdev_get_name(netdev_), error);
1386 /* Retrieves current device stats for 'netdev-linux'. */
1388 netdev_linux_get_stats(const struct netdev *netdev_,
1389 struct netdev_stats *stats)
1391 struct netdev_dev_linux *netdev_dev =
1392 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1393 struct netdev_stats dev_stats;
1396 get_stats_via_vport(netdev_, stats);
1398 error = netdev_linux_sys_get_stats(netdev_, &dev_stats);
1401 if (netdev_dev->vport_stats_error) {
1408 if (netdev_dev->vport_stats_error) {
1409 /* Stats are not available from OVS, so use the ioctl stats. */
1412 stats->rx_errors += dev_stats.rx_errors;
1413 stats->tx_errors += dev_stats.tx_errors;
1414 stats->rx_dropped += dev_stats.rx_dropped;
1415 stats->tx_dropped += dev_stats.tx_dropped;
1416 stats->multicast += dev_stats.multicast;
1417 stats->collisions += dev_stats.collisions;
1418 stats->rx_length_errors += dev_stats.rx_length_errors;
1419 stats->rx_over_errors += dev_stats.rx_over_errors;
1420 stats->rx_crc_errors += dev_stats.rx_crc_errors;
1421 stats->rx_frame_errors += dev_stats.rx_frame_errors;
1422 stats->rx_fifo_errors += dev_stats.rx_fifo_errors;
1423 stats->rx_missed_errors += dev_stats.rx_missed_errors;
1424 stats->tx_aborted_errors += dev_stats.tx_aborted_errors;
1425 stats->tx_carrier_errors += dev_stats.tx_carrier_errors;
1426 stats->tx_fifo_errors += dev_stats.tx_fifo_errors;
1427 stats->tx_heartbeat_errors += dev_stats.tx_heartbeat_errors;
1428 stats->tx_window_errors += dev_stats.tx_window_errors;
1433 /* Retrieves current device stats for 'netdev-tap' netdev or
1434 * netdev-internal. */
1436 netdev_tap_get_stats(const struct netdev *netdev_,
1437 struct netdev_stats *stats)
1439 struct netdev_dev_linux *netdev_dev =
1440 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1441 struct netdev_stats dev_stats;
1444 get_stats_via_vport(netdev_, stats);
1446 error = netdev_linux_sys_get_stats(netdev_, &dev_stats);
1448 if (netdev_dev->vport_stats_error) {
1455 /* If this port is an internal port then the transmit and receive stats
1456 * will appear to be swapped relative to the other ports since we are the
1457 * one sending the data, not a remote computer. For consistency, we swap
1458 * them back here. This does not apply if we are getting stats from the
1459 * vport layer because it always tracks stats from the perspective of the
1461 if (netdev_dev->vport_stats_error) {
1463 swap_uint64(&stats->rx_packets, &stats->tx_packets);
1464 swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
1465 swap_uint64(&stats->rx_errors, &stats->tx_errors);
1466 swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
1467 stats->rx_length_errors = 0;
1468 stats->rx_over_errors = 0;
1469 stats->rx_crc_errors = 0;
1470 stats->rx_frame_errors = 0;
1471 stats->rx_fifo_errors = 0;
1472 stats->rx_missed_errors = 0;
1473 stats->tx_aborted_errors = 0;
1474 stats->tx_carrier_errors = 0;
1475 stats->tx_fifo_errors = 0;
1476 stats->tx_heartbeat_errors = 0;
1477 stats->tx_window_errors = 0;
1479 stats->rx_dropped += dev_stats.tx_dropped;
1480 stats->tx_dropped += dev_stats.rx_dropped;
1482 stats->rx_errors += dev_stats.tx_errors;
1483 stats->tx_errors += dev_stats.rx_errors;
1485 stats->multicast += dev_stats.multicast;
1486 stats->collisions += dev_stats.collisions;
1492 netdev_internal_get_stats(const struct netdev *netdev_,
1493 struct netdev_stats *stats)
1495 struct netdev_dev_linux *netdev_dev =
1496 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1498 get_stats_via_vport(netdev_, stats);
1499 return netdev_dev->vport_stats_error;
1503 netdev_internal_set_stats(struct netdev *netdev,
1504 const struct netdev_stats *stats)
1506 struct ovs_vport_stats vport_stats;
1507 struct dpif_linux_vport vport;
1510 vport_stats.rx_packets = stats->rx_packets;
1511 vport_stats.tx_packets = stats->tx_packets;
1512 vport_stats.rx_bytes = stats->rx_bytes;
1513 vport_stats.tx_bytes = stats->tx_bytes;
1514 vport_stats.rx_errors = stats->rx_errors;
1515 vport_stats.tx_errors = stats->tx_errors;
1516 vport_stats.rx_dropped = stats->rx_dropped;
1517 vport_stats.tx_dropped = stats->tx_dropped;
1519 dpif_linux_vport_init(&vport);
1520 vport.cmd = OVS_VPORT_CMD_SET;
1521 vport.name = netdev_get_name(netdev);
1522 vport.stats = &vport_stats;
1524 err = dpif_linux_vport_transact(&vport, NULL, NULL);
1526 /* If the vport layer doesn't know about the device, that doesn't mean it
1527 * doesn't exist (after all, we were able to open it when netdev_open()
1528 * was called), it just means that it isn't attached and we'll be getting
1529 * stats a different way. */
1530 if (err == ENODEV) {
1538 netdev_linux_read_features(struct netdev_dev_linux *netdev_dev)
1540 struct ethtool_cmd ecmd;
1544 if (netdev_dev->cache_valid & VALID_FEATURES) {
1548 COVERAGE_INC(netdev_get_ethtool);
1549 memset(&ecmd, 0, sizeof ecmd);
1550 error = netdev_linux_do_ethtool(netdev_dev->netdev_dev.name, &ecmd,
1551 ETHTOOL_GSET, "ETHTOOL_GSET");
1556 /* Supported features. */
1557 netdev_dev->supported = 0;
1558 if (ecmd.supported & SUPPORTED_10baseT_Half) {
1559 netdev_dev->supported |= NETDEV_F_10MB_HD;
1561 if (ecmd.supported & SUPPORTED_10baseT_Full) {
1562 netdev_dev->supported |= NETDEV_F_10MB_FD;
1564 if (ecmd.supported & SUPPORTED_100baseT_Half) {
1565 netdev_dev->supported |= NETDEV_F_100MB_HD;
1567 if (ecmd.supported & SUPPORTED_100baseT_Full) {
1568 netdev_dev->supported |= NETDEV_F_100MB_FD;
1570 if (ecmd.supported & SUPPORTED_1000baseT_Half) {
1571 netdev_dev->supported |= NETDEV_F_1GB_HD;
1573 if (ecmd.supported & SUPPORTED_1000baseT_Full) {
1574 netdev_dev->supported |= NETDEV_F_1GB_FD;
1576 if (ecmd.supported & SUPPORTED_10000baseT_Full) {
1577 netdev_dev->supported |= NETDEV_F_10GB_FD;
1579 if (ecmd.supported & SUPPORTED_TP) {
1580 netdev_dev->supported |= NETDEV_F_COPPER;
1582 if (ecmd.supported & SUPPORTED_FIBRE) {
1583 netdev_dev->supported |= NETDEV_F_FIBER;
1585 if (ecmd.supported & SUPPORTED_Autoneg) {
1586 netdev_dev->supported |= NETDEV_F_AUTONEG;
1588 if (ecmd.supported & SUPPORTED_Pause) {
1589 netdev_dev->supported |= NETDEV_F_PAUSE;
1591 if (ecmd.supported & SUPPORTED_Asym_Pause) {
1592 netdev_dev->supported |= NETDEV_F_PAUSE_ASYM;
1595 /* Advertised features. */
1596 netdev_dev->advertised = 0;
1597 if (ecmd.advertising & ADVERTISED_10baseT_Half) {
1598 netdev_dev->advertised |= NETDEV_F_10MB_HD;
1600 if (ecmd.advertising & ADVERTISED_10baseT_Full) {
1601 netdev_dev->advertised |= NETDEV_F_10MB_FD;
1603 if (ecmd.advertising & ADVERTISED_100baseT_Half) {
1604 netdev_dev->advertised |= NETDEV_F_100MB_HD;
1606 if (ecmd.advertising & ADVERTISED_100baseT_Full) {
1607 netdev_dev->advertised |= NETDEV_F_100MB_FD;
1609 if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
1610 netdev_dev->advertised |= NETDEV_F_1GB_HD;
1612 if (ecmd.advertising & ADVERTISED_1000baseT_Full) {
1613 netdev_dev->advertised |= NETDEV_F_1GB_FD;
1615 if (ecmd.advertising & ADVERTISED_10000baseT_Full) {
1616 netdev_dev->advertised |= NETDEV_F_10GB_FD;
1618 if (ecmd.advertising & ADVERTISED_TP) {
1619 netdev_dev->advertised |= NETDEV_F_COPPER;
1621 if (ecmd.advertising & ADVERTISED_FIBRE) {
1622 netdev_dev->advertised |= NETDEV_F_FIBER;
1624 if (ecmd.advertising & ADVERTISED_Autoneg) {
1625 netdev_dev->advertised |= NETDEV_F_AUTONEG;
1627 if (ecmd.advertising & ADVERTISED_Pause) {
1628 netdev_dev->advertised |= NETDEV_F_PAUSE;
1630 if (ecmd.advertising & ADVERTISED_Asym_Pause) {
1631 netdev_dev->advertised |= NETDEV_F_PAUSE_ASYM;
1634 /* Current settings. */
1636 if (speed == SPEED_10) {
1637 netdev_dev->current = ecmd.duplex ? NETDEV_F_10MB_FD : NETDEV_F_10MB_HD;
1638 } else if (speed == SPEED_100) {
1639 netdev_dev->current = ecmd.duplex ? NETDEV_F_100MB_FD : NETDEV_F_100MB_HD;
1640 } else if (speed == SPEED_1000) {
1641 netdev_dev->current = ecmd.duplex ? NETDEV_F_1GB_FD : NETDEV_F_1GB_HD;
1642 } else if (speed == SPEED_10000) {
1643 netdev_dev->current = NETDEV_F_10GB_FD;
1644 } else if (speed == 40000) {
1645 netdev_dev->current = NETDEV_F_40GB_FD;
1646 } else if (speed == 100000) {
1647 netdev_dev->current = NETDEV_F_100GB_FD;
1648 } else if (speed == 1000000) {
1649 netdev_dev->current = NETDEV_F_1TB_FD;
1651 netdev_dev->current = 0;
1654 if (ecmd.port == PORT_TP) {
1655 netdev_dev->current |= NETDEV_F_COPPER;
1656 } else if (ecmd.port == PORT_FIBRE) {
1657 netdev_dev->current |= NETDEV_F_FIBER;
1661 netdev_dev->current |= NETDEV_F_AUTONEG;
1664 /* Peer advertisements. */
1665 netdev_dev->peer = 0; /* XXX */
1668 netdev_dev->cache_valid |= VALID_FEATURES;
1669 netdev_dev->get_features_error = error;
1672 /* Stores the features supported by 'netdev' into '*current', '*advertised',
1673 * '*supported', and '*peer', all of which must be non-null. Each value is a
1674 * bitmap of NETDEV_* bits. Returns 0 if successful, otherwise a positive
1677 netdev_linux_get_features(const struct netdev *netdev_,
1678 enum netdev_features *current,
1679 enum netdev_features *advertised,
1680 enum netdev_features *supported,
1681 enum netdev_features *peer)
1683 struct netdev_dev_linux *netdev_dev =
1684 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1686 netdev_linux_read_features(netdev_dev);
1688 if (!netdev_dev->get_features_error) {
1689 *current = netdev_dev->current;
1690 *advertised = netdev_dev->advertised;
1691 *supported = netdev_dev->supported;
1692 *peer = netdev_dev->peer;
1694 return netdev_dev->get_features_error;
1697 /* Set the features advertised by 'netdev' to 'advertise'. */
1699 netdev_linux_set_advertisements(struct netdev *netdev,
1700 enum netdev_features advertise)
1702 struct ethtool_cmd ecmd;
1705 COVERAGE_INC(netdev_get_ethtool);
1706 memset(&ecmd, 0, sizeof ecmd);
1707 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1708 ETHTOOL_GSET, "ETHTOOL_GSET");
1713 ecmd.advertising = 0;
1714 if (advertise & NETDEV_F_10MB_HD) {
1715 ecmd.advertising |= ADVERTISED_10baseT_Half;
1717 if (advertise & NETDEV_F_10MB_FD) {
1718 ecmd.advertising |= ADVERTISED_10baseT_Full;
1720 if (advertise & NETDEV_F_100MB_HD) {
1721 ecmd.advertising |= ADVERTISED_100baseT_Half;
1723 if (advertise & NETDEV_F_100MB_FD) {
1724 ecmd.advertising |= ADVERTISED_100baseT_Full;
1726 if (advertise & NETDEV_F_1GB_HD) {
1727 ecmd.advertising |= ADVERTISED_1000baseT_Half;
1729 if (advertise & NETDEV_F_1GB_FD) {
1730 ecmd.advertising |= ADVERTISED_1000baseT_Full;
1732 if (advertise & NETDEV_F_10GB_FD) {
1733 ecmd.advertising |= ADVERTISED_10000baseT_Full;
1735 if (advertise & NETDEV_F_COPPER) {
1736 ecmd.advertising |= ADVERTISED_TP;
1738 if (advertise & NETDEV_F_FIBER) {
1739 ecmd.advertising |= ADVERTISED_FIBRE;
1741 if (advertise & NETDEV_F_AUTONEG) {
1742 ecmd.advertising |= ADVERTISED_Autoneg;
1744 if (advertise & NETDEV_F_PAUSE) {
1745 ecmd.advertising |= ADVERTISED_Pause;
1747 if (advertise & NETDEV_F_PAUSE_ASYM) {
1748 ecmd.advertising |= ADVERTISED_Asym_Pause;
1750 COVERAGE_INC(netdev_set_ethtool);
1751 return netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1752 ETHTOOL_SSET, "ETHTOOL_SSET");
1755 /* Attempts to set input rate limiting (policing) policy. Returns 0 if
1756 * successful, otherwise a positive errno value. */
1758 netdev_linux_set_policing(struct netdev *netdev,
1759 uint32_t kbits_rate, uint32_t kbits_burst)
1761 struct netdev_dev_linux *netdev_dev =
1762 netdev_dev_linux_cast(netdev_get_dev(netdev));
1763 const char *netdev_name = netdev_get_name(netdev);
1767 kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
1768 : !kbits_burst ? 1000 /* Default to 1000 kbits if 0. */
1769 : kbits_burst); /* Stick with user-specified value. */
1771 if (netdev_dev->cache_valid & VALID_POLICING) {
1772 if (netdev_dev->netdev_policing_error) {
1773 return netdev_dev->netdev_policing_error;
1776 if (netdev_dev->kbits_rate == kbits_rate &&
1777 netdev_dev->kbits_burst == kbits_burst) {
1778 /* Assume that settings haven't changed since we last set them. */
1781 netdev_dev->cache_valid &= ~VALID_POLICING;
1784 COVERAGE_INC(netdev_set_policing);
1785 /* Remove any existing ingress qdisc. */
1786 error = tc_add_del_ingress_qdisc(netdev, false);
1788 VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
1789 netdev_name, strerror(error));
1794 error = tc_add_del_ingress_qdisc(netdev, true);
1796 VLOG_WARN_RL(&rl, "%s: adding policing qdisc failed: %s",
1797 netdev_name, strerror(error));
1801 error = tc_add_policer(netdev, kbits_rate, kbits_burst);
1803 VLOG_WARN_RL(&rl, "%s: adding policing action failed: %s",
1804 netdev_name, strerror(error));
1809 netdev_dev->kbits_rate = kbits_rate;
1810 netdev_dev->kbits_burst = kbits_burst;
1813 if (!error || error == ENODEV) {
1814 netdev_dev->netdev_policing_error = error;
1815 netdev_dev->cache_valid |= VALID_POLICING;
1821 netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
1824 const struct tc_ops **opsp;
1826 for (opsp = tcs; *opsp != NULL; opsp++) {
1827 const struct tc_ops *ops = *opsp;
1828 if (ops->tc_install && ops->ovs_name[0] != '\0') {
1829 sset_add(types, ops->ovs_name);
1835 static const struct tc_ops *
1836 tc_lookup_ovs_name(const char *name)
1838 const struct tc_ops **opsp;
1840 for (opsp = tcs; *opsp != NULL; opsp++) {
1841 const struct tc_ops *ops = *opsp;
1842 if (!strcmp(name, ops->ovs_name)) {
1849 static const struct tc_ops *
1850 tc_lookup_linux_name(const char *name)
1852 const struct tc_ops **opsp;
1854 for (opsp = tcs; *opsp != NULL; opsp++) {
1855 const struct tc_ops *ops = *opsp;
1856 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
1863 static struct tc_queue *
1864 tc_find_queue__(const struct netdev *netdev, unsigned int queue_id,
1867 struct netdev_dev_linux *netdev_dev =
1868 netdev_dev_linux_cast(netdev_get_dev(netdev));
1869 struct tc_queue *queue;
1871 HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev_dev->tc->queues) {
1872 if (queue->queue_id == queue_id) {
1879 static struct tc_queue *
1880 tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
1882 return tc_find_queue__(netdev, queue_id, hash_int(queue_id, 0));
1886 netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
1888 struct netdev_qos_capabilities *caps)
1890 const struct tc_ops *ops = tc_lookup_ovs_name(type);
1894 caps->n_queues = ops->n_queues;
1899 netdev_linux_get_qos(const struct netdev *netdev,
1900 const char **typep, struct smap *details)
1902 struct netdev_dev_linux *netdev_dev =
1903 netdev_dev_linux_cast(netdev_get_dev(netdev));
1906 error = tc_query_qdisc(netdev);
1911 *typep = netdev_dev->tc->ops->ovs_name;
1912 return (netdev_dev->tc->ops->qdisc_get
1913 ? netdev_dev->tc->ops->qdisc_get(netdev, details)
1918 netdev_linux_set_qos(struct netdev *netdev,
1919 const char *type, const struct smap *details)
1921 struct netdev_dev_linux *netdev_dev =
1922 netdev_dev_linux_cast(netdev_get_dev(netdev));
1923 const struct tc_ops *new_ops;
1926 new_ops = tc_lookup_ovs_name(type);
1927 if (!new_ops || !new_ops->tc_install) {
1931 error = tc_query_qdisc(netdev);
1936 if (new_ops == netdev_dev->tc->ops) {
1937 return new_ops->qdisc_set ? new_ops->qdisc_set(netdev, details) : 0;
1939 /* Delete existing qdisc. */
1940 error = tc_del_qdisc(netdev);
1944 assert(netdev_dev->tc == NULL);
1946 /* Install new qdisc. */
1947 error = new_ops->tc_install(netdev, details);
1948 assert((error == 0) == (netdev_dev->tc != NULL));
1955 netdev_linux_get_queue(const struct netdev *netdev,
1956 unsigned int queue_id, struct smap *details)
1958 struct netdev_dev_linux *netdev_dev =
1959 netdev_dev_linux_cast(netdev_get_dev(netdev));
1962 error = tc_query_qdisc(netdev);
1966 struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1968 ? netdev_dev->tc->ops->class_get(netdev, queue, details)
1974 netdev_linux_set_queue(struct netdev *netdev,
1975 unsigned int queue_id, const struct smap *details)
1977 struct netdev_dev_linux *netdev_dev =
1978 netdev_dev_linux_cast(netdev_get_dev(netdev));
1981 error = tc_query_qdisc(netdev);
1984 } else if (queue_id >= netdev_dev->tc->ops->n_queues
1985 || !netdev_dev->tc->ops->class_set) {
1989 return netdev_dev->tc->ops->class_set(netdev, queue_id, details);
1993 netdev_linux_delete_queue(struct netdev *netdev, unsigned int queue_id)
1995 struct netdev_dev_linux *netdev_dev =
1996 netdev_dev_linux_cast(netdev_get_dev(netdev));
1999 error = tc_query_qdisc(netdev);
2002 } else if (!netdev_dev->tc->ops->class_delete) {
2005 struct tc_queue *queue = tc_find_queue(netdev, queue_id);
2007 ? netdev_dev->tc->ops->class_delete(netdev, queue)
2013 netdev_linux_get_queue_stats(const struct netdev *netdev,
2014 unsigned int queue_id,
2015 struct netdev_queue_stats *stats)
2017 struct netdev_dev_linux *netdev_dev =
2018 netdev_dev_linux_cast(netdev_get_dev(netdev));
2021 error = tc_query_qdisc(netdev);
2024 } else if (!netdev_dev->tc->ops->class_get_stats) {
2027 const struct tc_queue *queue = tc_find_queue(netdev, queue_id);
2029 ? netdev_dev->tc->ops->class_get_stats(netdev, queue, stats)
2035 start_queue_dump(const struct netdev *netdev, struct nl_dump *dump)
2037 struct ofpbuf request;
2038 struct tcmsg *tcmsg;
2040 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
2044 tcmsg->tcm_parent = 0;
2045 nl_dump_start(dump, rtnl_sock, &request);
2046 ofpbuf_uninit(&request);
2051 netdev_linux_dump_queues(const struct netdev *netdev,
2052 netdev_dump_queues_cb *cb, void *aux)
2054 struct netdev_dev_linux *netdev_dev =
2055 netdev_dev_linux_cast(netdev_get_dev(netdev));
2056 struct tc_queue *queue, *next_queue;
2057 struct smap details;
2061 error = tc_query_qdisc(netdev);
2064 } else if (!netdev_dev->tc->ops->class_get) {
2069 smap_init(&details);
2070 HMAP_FOR_EACH_SAFE (queue, next_queue, hmap_node,
2071 &netdev_dev->tc->queues) {
2072 smap_clear(&details);
2074 error = netdev_dev->tc->ops->class_get(netdev, queue, &details);
2076 (*cb)(queue->queue_id, &details, aux);
2081 smap_destroy(&details);
2087 netdev_linux_dump_queue_stats(const struct netdev *netdev,
2088 netdev_dump_queue_stats_cb *cb, void *aux)
2090 struct netdev_dev_linux *netdev_dev =
2091 netdev_dev_linux_cast(netdev_get_dev(netdev));
2092 struct nl_dump dump;
2097 error = tc_query_qdisc(netdev);
2100 } else if (!netdev_dev->tc->ops->class_dump_stats) {
2105 if (!start_queue_dump(netdev, &dump)) {
2108 while (nl_dump_next(&dump, &msg)) {
2109 error = netdev_dev->tc->ops->class_dump_stats(netdev, &msg, cb, aux);
2115 error = nl_dump_done(&dump);
2116 return error ? error : last_error;
2120 netdev_linux_get_in4(const struct netdev *netdev_,
2121 struct in_addr *address, struct in_addr *netmask)
2123 struct netdev_dev_linux *netdev_dev =
2124 netdev_dev_linux_cast(netdev_get_dev(netdev_));
2126 if (!(netdev_dev->cache_valid & VALID_IN4)) {
2129 error = netdev_linux_get_ipv4(netdev_, &netdev_dev->address,
2130 SIOCGIFADDR, "SIOCGIFADDR");
2135 error = netdev_linux_get_ipv4(netdev_, &netdev_dev->netmask,
2136 SIOCGIFNETMASK, "SIOCGIFNETMASK");
2141 netdev_dev->cache_valid |= VALID_IN4;
2143 *address = netdev_dev->address;
2144 *netmask = netdev_dev->netmask;
2145 return address->s_addr == INADDR_ANY ? EADDRNOTAVAIL : 0;
2149 netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
2150 struct in_addr netmask)
2152 struct netdev_dev_linux *netdev_dev =
2153 netdev_dev_linux_cast(netdev_get_dev(netdev_));
2156 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
2158 netdev_dev->cache_valid |= VALID_IN4;
2159 netdev_dev->address = address;
2160 netdev_dev->netmask = netmask;
2161 if (address.s_addr != INADDR_ANY) {
2162 error = do_set_addr(netdev_, SIOCSIFNETMASK,
2163 "SIOCSIFNETMASK", netmask);
2170 parse_if_inet6_line(const char *line,
2171 struct in6_addr *in6, char ifname[16 + 1])
2173 uint8_t *s6 = in6->s6_addr;
2174 #define X8 "%2"SCNx8
2176 " "X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8
2177 "%*x %*x %*x %*x %16s\n",
2178 &s6[0], &s6[1], &s6[2], &s6[3],
2179 &s6[4], &s6[5], &s6[6], &s6[7],
2180 &s6[8], &s6[9], &s6[10], &s6[11],
2181 &s6[12], &s6[13], &s6[14], &s6[15],
2185 /* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address (if
2186 * 'in6' is non-null) and returns true. Otherwise, returns false. */
2188 netdev_linux_get_in6(const struct netdev *netdev_, struct in6_addr *in6)
2190 struct netdev_dev_linux *netdev_dev =
2191 netdev_dev_linux_cast(netdev_get_dev(netdev_));
2192 if (!(netdev_dev->cache_valid & VALID_IN6)) {
2196 netdev_dev->in6 = in6addr_any;
2198 file = fopen("/proc/net/if_inet6", "r");
2200 const char *name = netdev_get_name(netdev_);
2201 while (fgets(line, sizeof line, file)) {
2202 struct in6_addr in6_tmp;
2203 char ifname[16 + 1];
2204 if (parse_if_inet6_line(line, &in6_tmp, ifname)
2205 && !strcmp(name, ifname))
2207 netdev_dev->in6 = in6_tmp;
2213 netdev_dev->cache_valid |= VALID_IN6;
2215 *in6 = netdev_dev->in6;
2220 make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
2222 struct sockaddr_in sin;
2223 memset(&sin, 0, sizeof sin);
2224 sin.sin_family = AF_INET;
2225 sin.sin_addr = addr;
2228 memset(sa, 0, sizeof *sa);
2229 memcpy(sa, &sin, sizeof sin);
2233 do_set_addr(struct netdev *netdev,
2234 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
2237 ovs_strzcpy(ifr.ifr_name, netdev_get_name(netdev), sizeof ifr.ifr_name);
2238 make_in4_sockaddr(&ifr.ifr_addr, addr);
2240 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
2244 /* Adds 'router' as a default IP gateway. */
2246 netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
2248 struct in_addr any = { INADDR_ANY };
2252 memset(&rt, 0, sizeof rt);
2253 make_in4_sockaddr(&rt.rt_dst, any);
2254 make_in4_sockaddr(&rt.rt_gateway, router);
2255 make_in4_sockaddr(&rt.rt_genmask, any);
2256 rt.rt_flags = RTF_UP | RTF_GATEWAY;
2257 error = ioctl(af_inet_sock, SIOCADDRT, &rt) < 0 ? errno : 0;
2259 VLOG_WARN("ioctl(SIOCADDRT): %s", strerror(error));
2265 netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
2268 static const char fn[] = "/proc/net/route";
2273 *netdev_name = NULL;
2274 stream = fopen(fn, "r");
2275 if (stream == NULL) {
2276 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
2281 while (fgets(line, sizeof line, stream)) {
2284 ovs_be32 dest, gateway, mask;
2285 int refcnt, metric, mtu;
2286 unsigned int flags, use, window, irtt;
2289 "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
2291 iface, &dest, &gateway, &flags, &refcnt,
2292 &use, &metric, &mask, &mtu, &window, &irtt) != 11) {
2294 VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
2298 if (!(flags & RTF_UP)) {
2299 /* Skip routes that aren't up. */
2303 /* The output of 'dest', 'mask', and 'gateway' were given in
2304 * network byte order, so we don't need any endian
2305 * conversions here. */
2306 if ((dest & mask) == (host->s_addr & mask)) {
2308 /* The host is directly reachable. */
2309 next_hop->s_addr = 0;
2311 /* To reach the host, we must go through a gateway. */
2312 next_hop->s_addr = gateway;
2314 *netdev_name = xstrdup(iface);
2326 netdev_linux_get_drv_info(const struct netdev *netdev, struct smap *smap)
2329 struct netdev_dev_linux *netdev_dev =
2330 netdev_dev_linux_cast(netdev_get_dev(netdev));
2332 error = netdev_linux_get_drvinfo(netdev_dev);
2334 smap_add(smap, "driver_name", netdev_dev->drvinfo.driver);
2335 smap_add(smap, "driver_version", netdev_dev->drvinfo.version);
2336 smap_add(smap, "firmware_version", netdev_dev->drvinfo.fw_version);
2342 netdev_internal_get_drv_info(const struct netdev *netdev OVS_UNUSED,
2345 smap_add(smap, "driver_name", "openvswitch");
2349 /* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
2350 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
2351 * returns 0. Otherwise, it returns a positive errno value; in particular,
2352 * ENXIO indicates that there is no ARP table entry for 'ip' on 'netdev'. */
2354 netdev_linux_arp_lookup(const struct netdev *netdev,
2355 ovs_be32 ip, uint8_t mac[ETH_ADDR_LEN])
2358 struct sockaddr_in sin;
2361 memset(&r, 0, sizeof r);
2362 memset(&sin, 0, sizeof sin);
2363 sin.sin_family = AF_INET;
2364 sin.sin_addr.s_addr = ip;
2366 memcpy(&r.arp_pa, &sin, sizeof sin);
2367 r.arp_ha.sa_family = ARPHRD_ETHER;
2369 ovs_strzcpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
2370 COVERAGE_INC(netdev_arp_lookup);
2371 retval = ioctl(af_inet_sock, SIOCGARP, &r) < 0 ? errno : 0;
2373 memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
2374 } else if (retval != ENXIO) {
2375 VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
2376 netdev_get_name(netdev), IP_ARGS(ip), strerror(retval));
2382 nd_to_iff_flags(enum netdev_flags nd)
2385 if (nd & NETDEV_UP) {
2388 if (nd & NETDEV_PROMISC) {
2395 iff_to_nd_flags(int iff)
2397 enum netdev_flags nd = 0;
2401 if (iff & IFF_PROMISC) {
2402 nd |= NETDEV_PROMISC;
2408 netdev_linux_update_flags(struct netdev *netdev, enum netdev_flags off,
2409 enum netdev_flags on, enum netdev_flags *old_flagsp)
2411 struct netdev_dev_linux *netdev_dev;
2412 int old_flags, new_flags;
2415 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
2416 old_flags = netdev_dev->ifi_flags;
2417 *old_flagsp = iff_to_nd_flags(old_flags);
2418 new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
2419 if (new_flags != old_flags) {
2420 error = set_flags(netdev, new_flags);
2421 get_flags(&netdev_dev->netdev_dev, &netdev_dev->ifi_flags);
2427 netdev_linux_change_seq(const struct netdev *netdev)
2429 return netdev_dev_linux_cast(netdev_get_dev(netdev))->change_seq;
2432 #define NETDEV_LINUX_CLASS(NAME, CREATE, GET_STATS, SET_STATS, \
2433 GET_FEATURES, GET_STATUS) \
2437 netdev_linux_init, \
2439 netdev_linux_wait, \
2442 netdev_linux_destroy, \
2443 NULL, /* get_config */ \
2444 NULL, /* set_config */ \
2446 netdev_linux_open, \
2447 netdev_linux_close, \
2449 netdev_linux_listen, \
2450 netdev_linux_recv, \
2451 netdev_linux_recv_wait, \
2452 netdev_linux_drain, \
2454 netdev_linux_send, \
2455 netdev_linux_send_wait, \
2457 netdev_linux_set_etheraddr, \
2458 netdev_linux_get_etheraddr, \
2459 netdev_linux_get_mtu, \
2460 netdev_linux_set_mtu, \
2461 netdev_linux_get_ifindex, \
2462 netdev_linux_get_carrier, \
2463 netdev_linux_get_carrier_resets, \
2464 netdev_linux_set_miimon_interval, \
2469 netdev_linux_set_advertisements, \
2471 netdev_linux_set_policing, \
2472 netdev_linux_get_qos_types, \
2473 netdev_linux_get_qos_capabilities, \
2474 netdev_linux_get_qos, \
2475 netdev_linux_set_qos, \
2476 netdev_linux_get_queue, \
2477 netdev_linux_set_queue, \
2478 netdev_linux_delete_queue, \
2479 netdev_linux_get_queue_stats, \
2480 netdev_linux_dump_queues, \
2481 netdev_linux_dump_queue_stats, \
2483 netdev_linux_get_in4, \
2484 netdev_linux_set_in4, \
2485 netdev_linux_get_in6, \
2486 netdev_linux_add_router, \
2487 netdev_linux_get_next_hop, \
2489 netdev_linux_arp_lookup, \
2491 netdev_linux_update_flags, \
2493 netdev_linux_change_seq \
2496 const struct netdev_class netdev_linux_class =
2499 netdev_linux_create,
2500 netdev_linux_get_stats,
2501 NULL, /* set_stats */
2502 netdev_linux_get_features,
2503 netdev_linux_get_drv_info);
2505 const struct netdev_class netdev_tap_class =
2508 netdev_linux_create_tap,
2509 netdev_tap_get_stats,
2510 NULL, /* set_stats */
2511 netdev_linux_get_features,
2512 netdev_linux_get_drv_info);
2514 const struct netdev_class netdev_internal_class =
2517 netdev_linux_create,
2518 netdev_internal_get_stats,
2519 netdev_internal_set_stats,
2520 NULL, /* get_features */
2521 netdev_internal_get_drv_info);
2523 /* HTB traffic control class. */
2525 #define HTB_N_QUEUES 0xf000
2529 unsigned int max_rate; /* In bytes/s. */
2533 struct tc_queue tc_queue;
2534 unsigned int min_rate; /* In bytes/s. */
2535 unsigned int max_rate; /* In bytes/s. */
2536 unsigned int burst; /* In bytes. */
2537 unsigned int priority; /* Lower values are higher priorities. */
2541 htb_get__(const struct netdev *netdev)
2543 struct netdev_dev_linux *netdev_dev =
2544 netdev_dev_linux_cast(netdev_get_dev(netdev));
2545 return CONTAINER_OF(netdev_dev->tc, struct htb, tc);
2549 htb_install__(struct netdev *netdev, uint64_t max_rate)
2551 struct netdev_dev_linux *netdev_dev =
2552 netdev_dev_linux_cast(netdev_get_dev(netdev));
2555 htb = xmalloc(sizeof *htb);
2556 tc_init(&htb->tc, &tc_ops_htb);
2557 htb->max_rate = max_rate;
2559 netdev_dev->tc = &htb->tc;
2562 /* Create an HTB qdisc.
2564 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
2566 htb_setup_qdisc__(struct netdev *netdev)
2569 struct tc_htb_glob opt;
2570 struct ofpbuf request;
2571 struct tcmsg *tcmsg;
2573 tc_del_qdisc(netdev);
2575 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2576 NLM_F_EXCL | NLM_F_CREATE, &request);
2580 tcmsg->tcm_handle = tc_make_handle(1, 0);
2581 tcmsg->tcm_parent = TC_H_ROOT;
2583 nl_msg_put_string(&request, TCA_KIND, "htb");
2585 memset(&opt, 0, sizeof opt);
2586 opt.rate2quantum = 10;
2590 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2591 nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
2592 nl_msg_end_nested(&request, opt_offset);
2594 return tc_transact(&request, NULL);
2597 /* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
2598 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
2600 htb_setup_class__(struct netdev *netdev, unsigned int handle,
2601 unsigned int parent, struct htb_class *class)
2604 struct tc_htb_opt opt;
2605 struct ofpbuf request;
2606 struct tcmsg *tcmsg;
2610 error = netdev_get_mtu(netdev, &mtu);
2612 VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU",
2613 netdev_get_name(netdev));
2617 memset(&opt, 0, sizeof opt);
2618 tc_fill_rate(&opt.rate, class->min_rate, mtu);
2619 tc_fill_rate(&opt.ceil, class->max_rate, mtu);
2620 opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
2621 opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
2622 opt.prio = class->priority;
2624 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
2628 tcmsg->tcm_handle = handle;
2629 tcmsg->tcm_parent = parent;
2631 nl_msg_put_string(&request, TCA_KIND, "htb");
2632 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2633 nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
2634 tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
2635 tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
2636 nl_msg_end_nested(&request, opt_offset);
2638 error = tc_transact(&request, NULL);
2640 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
2641 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
2642 netdev_get_name(netdev),
2643 tc_get_major(handle), tc_get_minor(handle),
2644 tc_get_major(parent), tc_get_minor(parent),
2645 class->min_rate, class->max_rate,
2646 class->burst, class->priority, strerror(error));
2651 /* Parses Netlink attributes in 'options' for HTB parameters and stores a
2652 * description of them into 'details'. The description complies with the
2653 * specification given in the vswitch database documentation for linux-htb
2656 htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
2658 static const struct nl_policy tca_htb_policy[] = {
2659 [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
2660 .min_len = sizeof(struct tc_htb_opt) },
2663 struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
2664 const struct tc_htb_opt *htb;
2666 if (!nl_parse_nested(nl_options, tca_htb_policy,
2667 attrs, ARRAY_SIZE(tca_htb_policy))) {
2668 VLOG_WARN_RL(&rl, "failed to parse HTB class options");
2672 htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
2673 class->min_rate = htb->rate.rate;
2674 class->max_rate = htb->ceil.rate;
2675 class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
2676 class->priority = htb->prio;
2681 htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2682 struct htb_class *options,
2683 struct netdev_queue_stats *stats)
2685 struct nlattr *nl_options;
2686 unsigned int handle;
2689 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2690 if (!error && queue_id) {
2691 unsigned int major = tc_get_major(handle);
2692 unsigned int minor = tc_get_minor(handle);
2693 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2694 *queue_id = minor - 1;
2699 if (!error && options) {
2700 error = htb_parse_tca_options__(nl_options, options);
2706 htb_parse_qdisc_details__(struct netdev *netdev,
2707 const struct smap *details, struct htb_class *hc)
2709 const char *max_rate_s;
2711 max_rate_s = smap_get(details, "max-rate");
2712 hc->max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
2713 if (!hc->max_rate) {
2714 enum netdev_features current;
2716 netdev_get_features(netdev, ¤t, NULL, NULL, NULL);
2717 hc->max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
2719 hc->min_rate = hc->max_rate;
2725 htb_parse_class_details__(struct netdev *netdev,
2726 const struct smap *details, struct htb_class *hc)
2728 const struct htb *htb = htb_get__(netdev);
2729 const char *min_rate_s = smap_get(details, "min-rate");
2730 const char *max_rate_s = smap_get(details, "max-rate");
2731 const char *burst_s = smap_get(details, "burst");
2732 const char *priority_s = smap_get(details, "priority");
2735 error = netdev_get_mtu(netdev, &mtu);
2737 VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU",
2738 netdev_get_name(netdev));
2742 /* HTB requires at least an mtu sized min-rate to send any traffic even
2743 * on uncongested links. */
2744 hc->min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
2745 hc->min_rate = MAX(hc->min_rate, mtu);
2746 hc->min_rate = MIN(hc->min_rate, htb->max_rate);
2749 hc->max_rate = (max_rate_s
2750 ? strtoull(max_rate_s, NULL, 10) / 8
2752 hc->max_rate = MAX(hc->max_rate, hc->min_rate);
2753 hc->max_rate = MIN(hc->max_rate, htb->max_rate);
2757 * According to hints in the documentation that I've read, it is important
2758 * that 'burst' be at least as big as the largest frame that might be
2759 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
2760 * but having it a bit too small is a problem. Since netdev_get_mtu()
2761 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
2762 * the MTU. We actually add 64, instead of 14, as a guard against
2763 * additional headers get tacked on somewhere that we're not aware of. */
2764 hc->burst = burst_s ? strtoull(burst_s, NULL, 10) / 8 : 0;
2765 hc->burst = MAX(hc->burst, mtu + 64);
2768 hc->priority = priority_s ? strtoul(priority_s, NULL, 10) : 0;
/* Queries the kernel for the class 'handle' with parent 'parent' on 'netdev'
 * and parses the reply into 'options' and 'stats' (either may be null, as
 * accepted by htb_parse_tcmsg__()).
 *
 * Returns 0 if successful, otherwise a positive errno value.  On a query
 * failure 'options' and 'stats' are left untouched. */
static int
htb_query_class__(const struct netdev *netdev, unsigned int handle,
                  unsigned int parent, struct htb_class *options,
                  struct netdev_queue_stats *stats)
{
    struct ofpbuf *reply;
    int error;

    error = tc_query_class(netdev, handle, parent, &reply);
    if (!error) {
        error = htb_parse_tcmsg__(reply, NULL, options, stats);
        ofpbuf_delete(reply);
    }
    return error;
}
2790 htb_tc_install(struct netdev *netdev, const struct smap *details)
2794 error = htb_setup_qdisc__(netdev);
2796 struct htb_class hc;
2798 htb_parse_qdisc_details__(netdev, details, &hc);
2799 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2800 tc_make_handle(1, 0), &hc);
2802 htb_install__(netdev, hc.max_rate);
2808 static struct htb_class *
2809 htb_class_cast__(const struct tc_queue *queue)
2811 return CONTAINER_OF(queue, struct htb_class, tc_queue);
2815 htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
2816 const struct htb_class *hc)
2818 struct htb *htb = htb_get__(netdev);
2819 size_t hash = hash_int(queue_id, 0);
2820 struct tc_queue *queue;
2821 struct htb_class *hcp;
2823 queue = tc_find_queue__(netdev, queue_id, hash);
2825 hcp = htb_class_cast__(queue);
2827 hcp = xmalloc(sizeof *hcp);
2828 queue = &hcp->tc_queue;
2829 queue->queue_id = queue_id;
2830 hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
2833 hcp->min_rate = hc->min_rate;
2834 hcp->max_rate = hc->max_rate;
2835 hcp->burst = hc->burst;
2836 hcp->priority = hc->priority;
2840 htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
2843 struct nl_dump dump;
2844 struct htb_class hc;
2846 /* Get qdisc options. */
2848 htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
2849 htb_install__(netdev, hc.max_rate);
2852 if (!start_queue_dump(netdev, &dump)) {
2855 while (nl_dump_next(&dump, &msg)) {
2856 unsigned int queue_id;
2858 if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
2859 htb_update_queue__(netdev, queue_id, &hc);
2862 nl_dump_done(&dump);
2868 htb_tc_destroy(struct tc *tc)
2870 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
2871 struct htb_class *hc, *next;
2873 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &htb->tc.queues) {
2874 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
2882 htb_qdisc_get(const struct netdev *netdev, struct smap *details)
2884 const struct htb *htb = htb_get__(netdev);
2885 smap_add_format(details, "max-rate", "%llu", 8ULL * htb->max_rate);
2890 htb_qdisc_set(struct netdev *netdev, const struct smap *details)
2892 struct htb_class hc;
2895 htb_parse_qdisc_details__(netdev, details, &hc);
2896 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2897 tc_make_handle(1, 0), &hc);
2899 htb_get__(netdev)->max_rate = hc.max_rate;
2905 htb_class_get(const struct netdev *netdev OVS_UNUSED,
2906 const struct tc_queue *queue, struct smap *details)
2908 const struct htb_class *hc = htb_class_cast__(queue);
2910 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
2911 if (hc->min_rate != hc->max_rate) {
2912 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
2914 smap_add_format(details, "burst", "%llu", 8ULL * hc->burst);
2916 smap_add_format(details, "priority", "%u", hc->priority);
2922 htb_class_set(struct netdev *netdev, unsigned int queue_id,
2923 const struct smap *details)
2925 struct htb_class hc;
2928 error = htb_parse_class_details__(netdev, details, &hc);
2933 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
2934 tc_make_handle(1, 0xfffe), &hc);
2939 htb_update_queue__(netdev, queue_id, &hc);
2944 htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
2946 struct htb_class *hc = htb_class_cast__(queue);
2947 struct htb *htb = htb_get__(netdev);
2950 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
2952 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
2959 htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
2960 struct netdev_queue_stats *stats)
2962 return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
2963 tc_make_handle(1, 0xfffe), NULL, stats);
2967 htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
2968 const struct ofpbuf *nlmsg,
2969 netdev_dump_queue_stats_cb *cb, void *aux)
2971 struct netdev_queue_stats stats;
2972 unsigned int handle, major, minor;
2975 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
2980 major = tc_get_major(handle);
2981 minor = tc_get_minor(handle);
2982 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2983 (*cb)(minor - 1, &stats, aux);
2988 static const struct tc_ops tc_ops_htb = {
2989 "htb", /* linux_name */
2990 "linux-htb", /* ovs_name */
2991 HTB_N_QUEUES, /* n_queues */
3000 htb_class_get_stats,
3001 htb_class_dump_stats
3004 /* "linux-hfsc" traffic control class. */
3006 #define HFSC_N_QUEUES 0xf000
3014 struct tc_queue tc_queue;
3019 static struct hfsc *
3020 hfsc_get__(const struct netdev *netdev)
3022 struct netdev_dev_linux *netdev_dev;
3023 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
3024 return CONTAINER_OF(netdev_dev->tc, struct hfsc, tc);
3027 static struct hfsc_class *
3028 hfsc_class_cast__(const struct tc_queue *queue)
3030 return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
3034 hfsc_install__(struct netdev *netdev, uint32_t max_rate)
3036 struct netdev_dev_linux * netdev_dev;
3039 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
3040 hfsc = xmalloc(sizeof *hfsc);
3041 tc_init(&hfsc->tc, &tc_ops_hfsc);
3042 hfsc->max_rate = max_rate;
3043 netdev_dev->tc = &hfsc->tc;
3047 hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
3048 const struct hfsc_class *hc)
3052 struct hfsc_class *hcp;
3053 struct tc_queue *queue;
3055 hfsc = hfsc_get__(netdev);
3056 hash = hash_int(queue_id, 0);
3058 queue = tc_find_queue__(netdev, queue_id, hash);
3060 hcp = hfsc_class_cast__(queue);
3062 hcp = xmalloc(sizeof *hcp);
3063 queue = &hcp->tc_queue;
3064 queue->queue_id = queue_id;
3065 hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
3068 hcp->min_rate = hc->min_rate;
3069 hcp->max_rate = hc->max_rate;
3073 hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
3075 const struct tc_service_curve *rsc, *fsc, *usc;
3076 static const struct nl_policy tca_hfsc_policy[] = {
3078 .type = NL_A_UNSPEC,
3080 .min_len = sizeof(struct tc_service_curve),
3083 .type = NL_A_UNSPEC,
3085 .min_len = sizeof(struct tc_service_curve),
3088 .type = NL_A_UNSPEC,
3090 .min_len = sizeof(struct tc_service_curve),
3093 struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];
3095 if (!nl_parse_nested(nl_options, tca_hfsc_policy,
3096 attrs, ARRAY_SIZE(tca_hfsc_policy))) {
3097 VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
3101 rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
3102 fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
3103 usc = nl_attr_get(attrs[TCA_HFSC_USC]);
3105 if (rsc->m1 != 0 || rsc->d != 0 ||
3106 fsc->m1 != 0 || fsc->d != 0 ||
3107 usc->m1 != 0 || usc->d != 0) {
3108 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3109 "Non-linear service curves are not supported.");
3113 if (rsc->m2 != fsc->m2) {
3114 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3115 "Real-time service curves are not supported ");
3119 if (rsc->m2 > usc->m2) {
3120 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3121 "Min-rate service curve is greater than "
3122 "the max-rate service curve.");
3126 class->min_rate = fsc->m2;
3127 class->max_rate = usc->m2;
3132 hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
3133 struct hfsc_class *options,
3134 struct netdev_queue_stats *stats)
3137 unsigned int handle;
3138 struct nlattr *nl_options;
3140 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
3146 unsigned int major, minor;
3148 major = tc_get_major(handle);
3149 minor = tc_get_minor(handle);
3150 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3151 *queue_id = minor - 1;
3158 error = hfsc_parse_tca_options__(nl_options, options);
/* Queries the kernel for the class 'handle' with parent 'parent' on 'netdev'
 * and parses the reply into 'options' and 'stats' (either may be null, as
 * accepted by hfsc_parse_tcmsg__()).
 *
 * Returns 0 if successful, otherwise a positive errno value.  On a query
 * failure 'options' and 'stats' are left untouched. */
static int
hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
                   unsigned int parent, struct hfsc_class *options,
                   struct netdev_queue_stats *stats)
{
    int error;
    struct ofpbuf *reply;

    error = tc_query_class(netdev, handle, parent, &reply);
    if (error) {
        return error;
    }

    error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
    ofpbuf_delete(reply);
    return error;
}
3183 hfsc_parse_qdisc_details__(struct netdev *netdev, const struct smap *details,
3184 struct hfsc_class *class)
3187 const char *max_rate_s;
3189 max_rate_s = smap_get(details, "max-rate");
3190 max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
3193 enum netdev_features current;
3195 netdev_get_features(netdev, ¤t, NULL, NULL, NULL);
3196 max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
3199 class->min_rate = max_rate;
3200 class->max_rate = max_rate;
3204 hfsc_parse_class_details__(struct netdev *netdev,
3205 const struct smap *details,
3206 struct hfsc_class * class)
3208 const struct hfsc *hfsc;
3209 uint32_t min_rate, max_rate;
3210 const char *min_rate_s, *max_rate_s;
3212 hfsc = hfsc_get__(netdev);
3213 min_rate_s = smap_get(details, "min-rate");
3214 max_rate_s = smap_get(details, "max-rate");
3216 min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
3217 min_rate = MAX(min_rate, 1);
3218 min_rate = MIN(min_rate, hfsc->max_rate);
3220 max_rate = (max_rate_s
3221 ? strtoull(max_rate_s, NULL, 10) / 8
3223 max_rate = MAX(max_rate, min_rate);
3224 max_rate = MIN(max_rate, hfsc->max_rate);
3226 class->min_rate = min_rate;
3227 class->max_rate = max_rate;
3232 /* Create an HFSC qdisc.
3234 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
3236 hfsc_setup_qdisc__(struct netdev * netdev)
3238 struct tcmsg *tcmsg;
3239 struct ofpbuf request;
3240 struct tc_hfsc_qopt opt;
3242 tc_del_qdisc(netdev);
3244 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3245 NLM_F_EXCL | NLM_F_CREATE, &request);
3251 tcmsg->tcm_handle = tc_make_handle(1, 0);
3252 tcmsg->tcm_parent = TC_H_ROOT;
3254 memset(&opt, 0, sizeof opt);
3257 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3258 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
3260 return tc_transact(&request, NULL);
3263 /* Create an HFSC class.
3265 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
3266 * sc rate <min_rate> ul rate <max_rate>" */
3268 hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
3269 unsigned int parent, struct hfsc_class *class)
3273 struct tcmsg *tcmsg;
3274 struct ofpbuf request;
3275 struct tc_service_curve min, max;
3277 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
3283 tcmsg->tcm_handle = handle;
3284 tcmsg->tcm_parent = parent;
3288 min.m2 = class->min_rate;
3292 max.m2 = class->max_rate;
3294 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3295 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3296 nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
3297 nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
3298 nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
3299 nl_msg_end_nested(&request, opt_offset);
3301 error = tc_transact(&request, NULL);
3303 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
3304 "min-rate %ubps, max-rate %ubps (%s)",
3305 netdev_get_name(netdev),
3306 tc_get_major(handle), tc_get_minor(handle),
3307 tc_get_major(parent), tc_get_minor(parent),
3308 class->min_rate, class->max_rate, strerror(error));
3315 hfsc_tc_install(struct netdev *netdev, const struct smap *details)
3318 struct hfsc_class class;
3320 error = hfsc_setup_qdisc__(netdev);
3326 hfsc_parse_qdisc_details__(netdev, details, &class);
3327 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3328 tc_make_handle(1, 0), &class);
3334 hfsc_install__(netdev, class.max_rate);
3339 hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3342 struct nl_dump dump;
3343 struct hfsc_class hc;
3346 hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
3347 hfsc_install__(netdev, hc.max_rate);
3349 if (!start_queue_dump(netdev, &dump)) {
3353 while (nl_dump_next(&dump, &msg)) {
3354 unsigned int queue_id;
3356 if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3357 hfsc_update_queue__(netdev, queue_id, &hc);
3361 nl_dump_done(&dump);
3366 hfsc_tc_destroy(struct tc *tc)
3369 struct hfsc_class *hc, *next;
3371 hfsc = CONTAINER_OF(tc, struct hfsc, tc);
3373 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
3374 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
3383 hfsc_qdisc_get(const struct netdev *netdev, struct smap *details)
3385 const struct hfsc *hfsc;
3386 hfsc = hfsc_get__(netdev);
3387 smap_add_format(details, "max-rate", "%llu", 8ULL * hfsc->max_rate);
3392 hfsc_qdisc_set(struct netdev *netdev, const struct smap *details)
3395 struct hfsc_class class;
3397 hfsc_parse_qdisc_details__(netdev, details, &class);
3398 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3399 tc_make_handle(1, 0), &class);
3402 hfsc_get__(netdev)->max_rate = class.max_rate;
3409 hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
3410 const struct tc_queue *queue, struct smap *details)
3412 const struct hfsc_class *hc;
3414 hc = hfsc_class_cast__(queue);
3415 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
3416 if (hc->min_rate != hc->max_rate) {
3417 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
3423 hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
3424 const struct smap *details)
3427 struct hfsc_class class;
3429 error = hfsc_parse_class_details__(netdev, details, &class);
3434 error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3435 tc_make_handle(1, 0xfffe), &class);
3440 hfsc_update_queue__(netdev, queue_id, &class);
3445 hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
3449 struct hfsc_class *hc;
3451 hc = hfsc_class_cast__(queue);
3452 hfsc = hfsc_get__(netdev);
3454 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3456 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
3463 hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3464 struct netdev_queue_stats *stats)
3466 return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3467 tc_make_handle(1, 0xfffe), NULL, stats);
3471 hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3472 const struct ofpbuf *nlmsg,
3473 netdev_dump_queue_stats_cb *cb, void *aux)
3475 struct netdev_queue_stats stats;
3476 unsigned int handle, major, minor;
3479 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3484 major = tc_get_major(handle);
3485 minor = tc_get_minor(handle);
3486 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3487 (*cb)(minor - 1, &stats, aux);
3492 static const struct tc_ops tc_ops_hfsc = {
3493 "hfsc", /* linux_name */
3494 "linux-hfsc", /* ovs_name */
3495 HFSC_N_QUEUES, /* n_queues */
3496 hfsc_tc_install, /* tc_install */
3497 hfsc_tc_load, /* tc_load */
3498 hfsc_tc_destroy, /* tc_destroy */
3499 hfsc_qdisc_get, /* qdisc_get */
3500 hfsc_qdisc_set, /* qdisc_set */
3501 hfsc_class_get, /* class_get */
3502 hfsc_class_set, /* class_set */
3503 hfsc_class_delete, /* class_delete */
3504 hfsc_class_get_stats, /* class_get_stats */
3505 hfsc_class_dump_stats /* class_dump_stats */
3508 /* "linux-default" traffic control class.
3510 * This class represents the default, unnamed Linux qdisc. It corresponds to
3511 * the "" (empty string) QoS type in the OVS database. */
3514 default_install__(struct netdev *netdev)
3516 struct netdev_dev_linux *netdev_dev =
3517 netdev_dev_linux_cast(netdev_get_dev(netdev));
3518 static struct tc *tc;
3521 tc = xmalloc(sizeof *tc);
3522 tc_init(tc, &tc_ops_default);
3524 netdev_dev->tc = tc;
3528 default_tc_install(struct netdev *netdev,
3529 const struct smap *details OVS_UNUSED)
3531 default_install__(netdev);
3536 default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3538 default_install__(netdev);
3542 static const struct tc_ops tc_ops_default = {
3543 NULL, /* linux_name */
3548 NULL, /* tc_destroy */
3549 NULL, /* qdisc_get */
3550 NULL, /* qdisc_set */
3551 NULL, /* class_get */
3552 NULL, /* class_set */
3553 NULL, /* class_delete */
3554 NULL, /* class_get_stats */
3555 NULL /* class_dump_stats */
3558 /* "linux-other" traffic control class.
3563 other_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3565 struct netdev_dev_linux *netdev_dev =
3566 netdev_dev_linux_cast(netdev_get_dev(netdev));
3567 static struct tc *tc;
3570 tc = xmalloc(sizeof *tc);
3571 tc_init(tc, &tc_ops_other);
3573 netdev_dev->tc = tc;
3577 static const struct tc_ops tc_ops_other = {
3578 NULL, /* linux_name */
3579 "linux-other", /* ovs_name */
3581 NULL, /* tc_install */
3583 NULL, /* tc_destroy */
3584 NULL, /* qdisc_get */
3585 NULL, /* qdisc_set */
3586 NULL, /* class_get */
3587 NULL, /* class_set */
3588 NULL, /* class_delete */
3589 NULL, /* class_get_stats */
3590 NULL /* class_dump_stats */
/* Traffic control. */

/* Number of kernel "tc" ticks per second. */
static double ticks_per_s;

/* Number of kernel "jiffies" per second.  This is used for the purpose of
 * computing buffer sizes.  Generally kernel qdiscs need to be able to buffer
 * one jiffy's worth of data.
 *
 * There are two possibilities here:
 *
 *    - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
 *      approximate range of 100 to 1024.  That means that we really need to
 *      make sure that the qdisc can buffer that much data.
 *
 *    - 'buffer_hz' is an absurdly large number.  That means that the kernel
 *      has finely granular timers and there's no need to fudge additional room
 *      for buffers.  (There's no extra effort needed to implement that: the
 *      large 'buffer_hz' is used as a divisor, so practically any number will
 *      come out as 0 in the division.  Small integer results in the case of
 *      really high dividends won't have any real effect anyhow.)
 */
static unsigned int buffer_hz;
/* Returns tc handle 'major':'minor', i.e. 'major' in the upper 16 bits and
 * 'minor' in the lower 16 bits.  TC_H_MAKE masks each part to its field. */
static unsigned int
tc_make_handle(unsigned int major, unsigned int minor)
{
    return TC_H_MAKE(major << 16, minor);
}
/* Returns the major number from 'handle', i.e. its upper 16 bits. */
static unsigned int
tc_get_major(unsigned int handle)
{
    return TC_H_MAJ(handle) >> 16;
}
/* Returns the minor number from 'handle', i.e. its lower 16 bits. */
static unsigned int
tc_get_minor(unsigned int handle)
{
    return TC_H_MIN(handle);
}
3638 static struct tcmsg *
3639 tc_make_request(const struct netdev *netdev, int type, unsigned int flags,
3640 struct ofpbuf *request)
3642 struct tcmsg *tcmsg;
3646 error = get_ifindex(netdev, &ifindex);
3651 ofpbuf_init(request, 512);
3652 nl_msg_put_nlmsghdr(request, sizeof *tcmsg, type, NLM_F_REQUEST | flags);
3653 tcmsg = ofpbuf_put_zeros(request, sizeof *tcmsg);
3654 tcmsg->tcm_family = AF_UNSPEC;
3655 tcmsg->tcm_ifindex = ifindex;
3656 /* Caller should fill in tcmsg->tcm_handle. */
3657 /* Caller should fill in tcmsg->tcm_parent. */
3663 tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
3665 int error = nl_sock_transact(rtnl_sock, request, replyp);
3666 ofpbuf_uninit(request);
3670 /* Adds or deletes a root ingress qdisc on 'netdev'. We use this for
3671 * policing configuration.
3673 * This function is equivalent to running the following when 'add' is true:
3674 * /sbin/tc qdisc add dev <devname> handle ffff: ingress
3676 * This function is equivalent to running the following when 'add' is false:
3677 * /sbin/tc qdisc del dev <devname> handle ffff: ingress
3679 * The configuration and stats may be seen with the following command:
3680 * /sbin/tc -s qdisc show dev <devname>
3682 * Returns 0 if successful, otherwise a positive errno value.
3685 tc_add_del_ingress_qdisc(struct netdev *netdev, bool add)
3687 struct ofpbuf request;
3688 struct tcmsg *tcmsg;
3690 int type = add ? RTM_NEWQDISC : RTM_DELQDISC;
3691 int flags = add ? NLM_F_EXCL | NLM_F_CREATE : 0;
3693 tcmsg = tc_make_request(netdev, type, flags, &request);
3697 tcmsg->tcm_handle = tc_make_handle(0xffff, 0);
3698 tcmsg->tcm_parent = TC_H_INGRESS;
3699 nl_msg_put_string(&request, TCA_KIND, "ingress");
3700 nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0);
3702 error = tc_transact(&request, NULL);
3704 /* If we're deleting the qdisc, don't worry about some of the
3705 * error conditions. */
3706 if (!add && (error == ENOENT || error == EINVAL)) {
3715 /* Adds a policer to 'netdev' with a rate of 'kbits_rate' and a burst size
3718 * This function is equivalent to running:
3719 * /sbin/tc filter add dev <devname> parent ffff: protocol all prio 49
3720 * basic police rate <kbits_rate>kbit burst <kbits_burst>k
3723 * The configuration and stats may be seen with the following command:
3724 * /sbin/tc -s filter show <devname> eth0 parent ffff:
3726 * Returns 0 if successful, otherwise a positive errno value.
3729 tc_add_policer(struct netdev *netdev, int kbits_rate, int kbits_burst)
3731 struct tc_police tc_police;
3732 struct ofpbuf request;
3733 struct tcmsg *tcmsg;
3734 size_t basic_offset;
3735 size_t police_offset;
3739 memset(&tc_police, 0, sizeof tc_police);
3740 tc_police.action = TC_POLICE_SHOT;
3741 tc_police.mtu = mtu;
3742 tc_fill_rate(&tc_police.rate, kbits_rate/8 * 1000, mtu);
3743 tc_police.burst = tc_bytes_to_ticks(tc_police.rate.rate,
3744 kbits_burst * 1024);
3746 tcmsg = tc_make_request(netdev, RTM_NEWTFILTER,
3747 NLM_F_EXCL | NLM_F_CREATE, &request);
3751 tcmsg->tcm_parent = tc_make_handle(0xffff, 0);
3752 tcmsg->tcm_info = tc_make_handle(49,
3753 (OVS_FORCE uint16_t) htons(ETH_P_ALL));
3755 nl_msg_put_string(&request, TCA_KIND, "basic");
3756 basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3757 police_offset = nl_msg_start_nested(&request, TCA_BASIC_POLICE);
3758 nl_msg_put_unspec(&request, TCA_POLICE_TBF, &tc_police, sizeof tc_police);
3759 tc_put_rtab(&request, TCA_POLICE_RATE, &tc_police.rate);
3760 nl_msg_end_nested(&request, police_offset);
3761 nl_msg_end_nested(&request, basic_offset);
3763 error = tc_transact(&request, NULL);
3774 /* The values in psched are not individually very meaningful, but they are
3775 * important. The tables below show some values seen in the wild.
3779 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
3780 * (Before that, there are hints that it was 1000000000.)
3782 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
3786 * -----------------------------------
3787 * [1] 000c8000 000f4240 000f4240 00000064
3788 * [2] 000003e8 00000400 000f4240 3b9aca00
3789 * [3] 000003e8 00000400 000f4240 3b9aca00
3790 * [4] 000003e8 00000400 000f4240 00000064
3791 * [5] 000003e8 00000040 000f4240 3b9aca00
3792 * [6] 000003e8 00000040 000f4240 000000f9
3794 * a b c d ticks_per_s buffer_hz
3795 * ------- --------- ---------- ------------- ----------- -------------
3796 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
3797 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3798 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3799 * [4] 1,000 1,024 1,000,000 100 976,562 100
3800 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
3801 * [6] 1,000 64 1,000,000 249 15,625,000 249
3803 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
3804 * [2] 2.6.26-1-686-bigmem from Debian lenny
3805 * [3] 2.6.26-2-sparc64 from Debian lenny
3806 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
3807 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
3808 * [6] 2.6.34 from kernel.org on KVM
3810 static const char fn[] = "/proc/net/psched";
3811 unsigned int a, b, c, d;
3817 stream = fopen(fn, "r");
3819 VLOG_WARN("%s: open failed: %s", fn, strerror(errno));
3823 if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
3824 VLOG_WARN("%s: read failed", fn);
3828 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
3832 VLOG_WARN("%s: invalid scheduler parameters", fn);
3836 ticks_per_s = (double) a * c / b;
3840 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
3843 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);
3846 /* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
3847 * rate of 'rate' bytes per second. */
3849 tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
3854 return (rate * ticks) / ticks_per_s;
3857 /* Returns the number of ticks that it would take to transmit 'size' bytes at a
3858 * rate of 'rate' bytes per second. */
3860 tc_bytes_to_ticks(unsigned int rate, unsigned int size)
3865 return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
3868 /* Returns the number of bytes that need to be reserved for qdisc buffering at
3869 * a transmission rate of 'rate' bytes per second. */
3871 tc_buffer_per_jiffy(unsigned int rate)
3876 return rate / buffer_hz;
3879 /* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
3880 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
3881 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
3882 * stores NULL into it if it is absent.
3884 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
3887 * Returns 0 if successful, otherwise a positive errno value. */
3889 tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
3890 struct nlattr **options)
3892 static const struct nl_policy tca_policy[] = {
3893 [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
3894 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
3896 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3898 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3899 tca_policy, ta, ARRAY_SIZE(ta))) {
3900 VLOG_WARN_RL(&rl, "failed to parse qdisc message");
3905 *kind = nl_attr_get_string(ta[TCA_KIND]);
3909 *options = ta[TCA_OPTIONS];
3924 /* Given Netlink 'msg' that describes a class, extracts the queue ID (e.g. the
3925 * minor number of its class ID) into '*queue_id', its TCA_OPTIONS attribute
3926 * into '*options', and its queue statistics into '*stats'. Any of the output
3927 * arguments may be null.
3929 * Returns 0 if successful, otherwise a positive errno value. */
3931 tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
3932 struct nlattr **options, struct netdev_queue_stats *stats)
3934 static const struct nl_policy tca_policy[] = {
3935 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
3936 [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
3938 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3940 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3941 tca_policy, ta, ARRAY_SIZE(ta))) {
3942 VLOG_WARN_RL(&rl, "failed to parse class message");
3947 struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
3948 *handlep = tc->tcm_handle;
3952 *options = ta[TCA_OPTIONS];
3956 const struct gnet_stats_queue *gsq;
3957 struct gnet_stats_basic gsb;
3959 static const struct nl_policy stats_policy[] = {
3960 [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
3961 .min_len = sizeof gsb },
3962 [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
3963 .min_len = sizeof *gsq },
3965 struct nlattr *sa[ARRAY_SIZE(stats_policy)];
3967 if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
3968 sa, ARRAY_SIZE(sa))) {
3969 VLOG_WARN_RL(&rl, "failed to parse class stats");
3973 /* Alignment issues screw up the length of struct gnet_stats_basic on
3974 * some arch/bitsize combinations. Newer versions of Linux have a
3975 * struct gnet_stats_basic_packed, but we can't depend on that. The
3976 * easiest thing to do is just to make a copy. */
3977 memset(&gsb, 0, sizeof gsb);
3978 memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
3979 MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
3980 stats->tx_bytes = gsb.bytes;
3981 stats->tx_packets = gsb.packets;
3983 gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
3984 stats->tx_errors = gsq->drops;
3994 memset(stats, 0, sizeof *stats);
3999 /* Queries the kernel for class with identifier 'handle' and parent 'parent'
4002 tc_query_class(const struct netdev *netdev,
4003 unsigned int handle, unsigned int parent,
4004 struct ofpbuf **replyp)
4006 struct ofpbuf request;
4007 struct tcmsg *tcmsg;
4010 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO, &request);
4014 tcmsg->tcm_handle = handle;
4015 tcmsg->tcm_parent = parent;
4017 error = tc_transact(&request, replyp);
4019 VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
4020 netdev_get_name(netdev),
4021 tc_get_major(handle), tc_get_minor(handle),
4022 tc_get_major(parent), tc_get_minor(parent),
4028 /* Equivalent to "tc class del dev <name> handle <handle>". */
4030 tc_delete_class(const struct netdev *netdev, unsigned int handle)
4032 struct ofpbuf request;
4033 struct tcmsg *tcmsg;
4036 tcmsg = tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
4040 tcmsg->tcm_handle = handle;
4041 tcmsg->tcm_parent = 0;
4043 error = tc_transact(&request, NULL);
4045 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
4046 netdev_get_name(netdev),
4047 tc_get_major(handle), tc_get_minor(handle),
4053 /* Equivalent to "tc qdisc del dev <name> root". */
4055 tc_del_qdisc(struct netdev *netdev)
4057 struct netdev_dev_linux *netdev_dev =
4058 netdev_dev_linux_cast(netdev_get_dev(netdev));
4059 struct ofpbuf request;
4060 struct tcmsg *tcmsg;
4063 tcmsg = tc_make_request(netdev, RTM_DELQDISC, 0, &request);
4067 tcmsg->tcm_handle = tc_make_handle(1, 0);
4068 tcmsg->tcm_parent = TC_H_ROOT;
4070 error = tc_transact(&request, NULL);
4071 if (error == EINVAL) {
4072 /* EINVAL probably means that the default qdisc was in use, in which
4073 * case we've accomplished our purpose. */
4076 if (!error && netdev_dev->tc) {
4077 if (netdev_dev->tc->ops->tc_destroy) {
4078 netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
4080 netdev_dev->tc = NULL;
4085 /* If 'netdev''s qdisc type and parameters are not yet known, queries the
4086 * kernel to determine what they are. Returns 0 if successful, otherwise a
4087 * positive errno value. */
4089 tc_query_qdisc(const struct netdev *netdev)
4091 struct netdev_dev_linux *netdev_dev =
4092 netdev_dev_linux_cast(netdev_get_dev(netdev));
4093 struct ofpbuf request, *qdisc;
4094 const struct tc_ops *ops;
4095 struct tcmsg *tcmsg;
4099 if (netdev_dev->tc) {
4103 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
4104 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
4105 * 2.6.35 without that fix backported to it.
4107 * To avoid the OOPS, we must not make a request that would attempt to dump
4108 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
4109 * few others. There are a few ways that I can see to do this, but most of
4110 * them seem to be racy (and if you lose the race the kernel OOPSes). The
4111 * technique chosen here is to assume that any non-default qdisc that we
4112 * create will have a class with handle 1:0. The built-in qdiscs only have
4113 * a class with handle 0:0.
4115 * We could check for Linux 2.6.35+ and use a more straightforward method
4117 tcmsg = tc_make_request(netdev, RTM_GETQDISC, NLM_F_ECHO, &request);
4121 tcmsg->tcm_handle = tc_make_handle(1, 0);
4122 tcmsg->tcm_parent = 0;
4124 /* Figure out what tc class to instantiate. */
4125 error = tc_transact(&request, &qdisc);
4129 error = tc_parse_qdisc(qdisc, &kind, NULL);
4131 ops = &tc_ops_other;
4133 ops = tc_lookup_linux_name(kind);
4135 static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
4136 VLOG_INFO_RL(&rl2, "unknown qdisc \"%s\"", kind);
4138 ops = &tc_ops_other;
4141 } else if (error == ENOENT) {
4142 /* Either it's a built-in qdisc, or it's a qdisc set up by some
4143 * other entity that doesn't have a handle 1:0. We will assume
4144 * that it's the system default qdisc. */
4145 ops = &tc_ops_default;
4148 /* Who knows? Maybe the device got deleted. */
4149 VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
4150 netdev_get_name(netdev), strerror(error));
4151 ops = &tc_ops_other;
4154 /* Instantiate it. */
4155 load_error = ops->tc_load(CONST_CAST(struct netdev *, netdev), qdisc);
4156 assert((load_error == 0) == (netdev_dev->tc != NULL));
4157 ofpbuf_delete(qdisc);
4159 return error ? error : load_error;
4162 /* Linux traffic control uses tables with 256 entries ("rtab" tables) to
4163 approximate the time to transmit packets of various lengths. For an MTU of
4164 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
4165 represents two possible packet lengths; for a MTU of 513 through 1024, four
4166 possible lengths; and so on.
4168 Returns, for the specified 'mtu', the number of bits that packet lengths
4169 need to be shifted right to fit within such a 256-entry table. */
4171 tc_calc_cell_log(unsigned int mtu)
4176 mtu = ETH_PAYLOAD_MAX;
4178 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
4180 for (cell_log = 0; mtu >= 256; cell_log++) {
4187 /* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
4190 tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
4192 memset(rate, 0, sizeof *rate);
4193 rate->cell_log = tc_calc_cell_log(mtu);
4194 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
4195 /* rate->cell_align = 0; */ /* distro headers. */
4196 rate->mpu = ETH_TOTAL_MIN;
4200 /* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
4201 * attribute of the specified "type".
4203 * See tc_calc_cell_log() above for a description of "rtab"s. */
4205 tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
4210 rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
4211 for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
4212 unsigned packet_size = (i + 1) << rate->cell_log;
4213 if (packet_size < rate->mpu) {
4214 packet_size = rate->mpu;
4216 rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
/* Calculates the proper value of 'buffer' or 'cbuffer' in HTB options given a
 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
 * burst size of 'burst_bytes'.  (If no value was requested, a 'burst_bytes' of
 * 0 is fine.)
 *
 * The burst actually used is never smaller than one jiffy's worth of data
 * plus one MTU.  A 'burst_bytes' that does not fit in 'unsigned int' is
 * clamped to UINT_MAX rather than being truncated to its low 32 bits when it
 * is passed on to tc_bytes_to_ticks(). */
static int
tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
{
    unsigned int min_burst = tc_buffer_per_jiffy(Bps) + mtu;
    uint64_t burst = MAX(burst_bytes, min_burst);
    unsigned int clamped = burst < UINT_MAX ? (unsigned int) burst : UINT_MAX;

    return tc_bytes_to_ticks(Bps, clamped);
}
4231 /* Linux-only functions declared in netdev-linux.h */
4233 /* Returns a fd for an AF_INET socket or a negative errno value. */
4235 netdev_linux_get_af_inet_sock(void)
4237 int error = netdev_linux_init();
4238 return error ? -error : af_inet_sock;
4241 /* Modifies the 'flag' bit in ethtool's flags field for 'netdev'. If
4242 * 'enable' is true, the bit is set. Otherwise, it is cleared. */
4244 netdev_linux_ethtool_set_flag(struct netdev *netdev, uint32_t flag,
4245 const char *flag_name, bool enable)
4247 const char *netdev_name = netdev_get_name(netdev);
4248 struct ethtool_value evalue;
4252 COVERAGE_INC(netdev_get_ethtool);
4253 memset(&evalue, 0, sizeof evalue);
4254 error = netdev_linux_do_ethtool(netdev_name,
4255 (struct ethtool_cmd *)&evalue,
4256 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
4261 COVERAGE_INC(netdev_set_ethtool);
4262 evalue.data = new_flags = (evalue.data & ~flag) | (enable ? flag : 0);
4263 error = netdev_linux_do_ethtool(netdev_name,
4264 (struct ethtool_cmd *)&evalue,
4265 ETHTOOL_SFLAGS, "ETHTOOL_SFLAGS");
4270 COVERAGE_INC(netdev_get_ethtool);
4271 memset(&evalue, 0, sizeof evalue);
4272 error = netdev_linux_do_ethtool(netdev_name,
4273 (struct ethtool_cmd *)&evalue,
4274 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
4279 if (new_flags != evalue.data) {
4280 VLOG_WARN_RL(&rl, "attempt to %s ethtool %s flag on network "
4281 "device %s failed", enable ? "enable" : "disable",
4282 flag_name, netdev_name);
4289 /* Utility functions. */
4291 /* Copies 'src' into 'dst', performing format conversion in the process. */
4293 netdev_stats_from_rtnl_link_stats(struct netdev_stats *dst,
4294 const struct rtnl_link_stats *src)
4296 dst->rx_packets = src->rx_packets;
4297 dst->tx_packets = src->tx_packets;
4298 dst->rx_bytes = src->rx_bytes;
4299 dst->tx_bytes = src->tx_bytes;
4300 dst->rx_errors = src->rx_errors;
4301 dst->tx_errors = src->tx_errors;
4302 dst->rx_dropped = src->rx_dropped;
4303 dst->tx_dropped = src->tx_dropped;
4304 dst->multicast = src->multicast;
4305 dst->collisions = src->collisions;
4306 dst->rx_length_errors = src->rx_length_errors;
4307 dst->rx_over_errors = src->rx_over_errors;
4308 dst->rx_crc_errors = src->rx_crc_errors;
4309 dst->rx_frame_errors = src->rx_frame_errors;
4310 dst->rx_fifo_errors = src->rx_fifo_errors;
4311 dst->rx_missed_errors = src->rx_missed_errors;
4312 dst->tx_aborted_errors = src->tx_aborted_errors;
4313 dst->tx_carrier_errors = src->tx_carrier_errors;
4314 dst->tx_fifo_errors = src->tx_fifo_errors;
4315 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
4316 dst->tx_window_errors = src->tx_window_errors;
4320 get_stats_via_netlink(int ifindex, struct netdev_stats *stats)
4322 /* Policy for RTNLGRP_LINK messages.
4324 * There are *many* more fields in these messages, but currently we only
4325 * care about these fields. */
4326 static const struct nl_policy rtnlgrp_link_policy[] = {
4327 [IFLA_IFNAME] = { .type = NL_A_STRING, .optional = false },
4328 [IFLA_STATS] = { .type = NL_A_UNSPEC, .optional = true,
4329 .min_len = sizeof(struct rtnl_link_stats) },
4332 struct ofpbuf request;
4333 struct ofpbuf *reply;
4334 struct ifinfomsg *ifi;
4335 struct nlattr *attrs[ARRAY_SIZE(rtnlgrp_link_policy)];
4338 ofpbuf_init(&request, 0);
4339 nl_msg_put_nlmsghdr(&request, sizeof *ifi, RTM_GETLINK, NLM_F_REQUEST);
4340 ifi = ofpbuf_put_zeros(&request, sizeof *ifi);
4341 ifi->ifi_family = PF_UNSPEC;
4342 ifi->ifi_index = ifindex;
4343 error = nl_sock_transact(rtnl_sock, &request, &reply);
4344 ofpbuf_uninit(&request);
4349 if (!nl_policy_parse(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg),
4350 rtnlgrp_link_policy,
4351 attrs, ARRAY_SIZE(rtnlgrp_link_policy))) {
4352 ofpbuf_delete(reply);
4356 if (!attrs[IFLA_STATS]) {
4357 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
4358 ofpbuf_delete(reply);
4362 netdev_stats_from_rtnl_link_stats(stats, nl_attr_get(attrs[IFLA_STATS]));
4364 ofpbuf_delete(reply);
4370 get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats)
4372 static const char fn[] = "/proc/net/dev";
4377 stream = fopen(fn, "r");
4379 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
4384 while (fgets(line, sizeof line, stream)) {
4387 #define X64 "%"SCNu64
4390 X64 X64 X64 X64 X64 X64 X64 "%*u"
4391 X64 X64 X64 X64 X64 X64 X64 "%*u",
4397 &stats->rx_fifo_errors,
4398 &stats->rx_frame_errors,
4404 &stats->tx_fifo_errors,
4406 &stats->tx_carrier_errors) != 15) {
4407 VLOG_WARN_RL(&rl, "%s:%d: parse error", fn, ln);
4408 } else if (!strcmp(devname, netdev_name)) {
4409 stats->rx_length_errors = UINT64_MAX;
4410 stats->rx_over_errors = UINT64_MAX;
4411 stats->rx_crc_errors = UINT64_MAX;
4412 stats->rx_missed_errors = UINT64_MAX;
4413 stats->tx_aborted_errors = UINT64_MAX;
4414 stats->tx_heartbeat_errors = UINT64_MAX;
4415 stats->tx_window_errors = UINT64_MAX;
4421 VLOG_WARN_RL(&rl, "%s: no stats for %s", fn, netdev_name);
4427 get_flags(const struct netdev_dev *dev, unsigned int *flags)
4433 error = netdev_linux_do_ioctl(dev->name, &ifr, SIOCGIFFLAGS,
4436 *flags = ifr.ifr_flags;
4442 set_flags(struct netdev *netdev, unsigned int flags)
4446 ifr.ifr_flags = flags;
4447 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, SIOCSIFFLAGS,
4452 do_get_ifindex(const char *netdev_name)
4456 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4457 COVERAGE_INC(netdev_get_ifindex);
4458 if (ioctl(af_inet_sock, SIOCGIFINDEX, &ifr) < 0) {
4459 VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
4460 netdev_name, strerror(errno));
4463 return ifr.ifr_ifindex;
4467 get_ifindex(const struct netdev *netdev_, int *ifindexp)
4469 struct netdev_dev_linux *netdev_dev =
4470 netdev_dev_linux_cast(netdev_get_dev(netdev_));
4472 if (!(netdev_dev->cache_valid & VALID_IFINDEX)) {
4473 int ifindex = do_get_ifindex(netdev_get_name(netdev_));
4476 netdev_dev->get_ifindex_error = -ifindex;
4477 netdev_dev->ifindex = 0;
4479 netdev_dev->get_ifindex_error = 0;
4480 netdev_dev->ifindex = ifindex;
4482 netdev_dev->cache_valid |= VALID_IFINDEX;
4485 *ifindexp = netdev_dev->ifindex;
4486 return netdev_dev->get_ifindex_error;
4490 get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN])
4495 memset(&ifr, 0, sizeof ifr);
4496 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4497 COVERAGE_INC(netdev_get_hwaddr);
4498 if (ioctl(af_inet_sock, SIOCGIFHWADDR, &ifr) < 0) {
4499 /* ENODEV probably means that a vif disappeared asynchronously and
4500 * hasn't been removed from the database yet, so reduce the log level
4501 * to INFO for that case. */
4502 VLOG(errno == ENODEV ? VLL_INFO : VLL_ERR,
4503 "ioctl(SIOCGIFHWADDR) on %s device failed: %s",
4504 netdev_name, strerror(errno));
4507 hwaddr_family = ifr.ifr_hwaddr.sa_family;
4508 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER) {
4509 VLOG_WARN("%s device has unknown hardware address family %d",
4510 netdev_name, hwaddr_family);
4512 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
4517 set_etheraddr(const char *netdev_name,
4518 const uint8_t mac[ETH_ADDR_LEN])
4522 memset(&ifr, 0, sizeof ifr);
4523 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4524 ifr.ifr_hwaddr.sa_family = ARPHRD_ETHER;
4525 memcpy(ifr.ifr_hwaddr.sa_data, mac, ETH_ADDR_LEN);
4526 COVERAGE_INC(netdev_set_hwaddr);
4527 if (ioctl(af_inet_sock, SIOCSIFHWADDR, &ifr) < 0) {
4528 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
4529 netdev_name, strerror(errno));
4536 netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
4537 int cmd, const char *cmd_name)
4541 memset(&ifr, 0, sizeof ifr);
4542 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
4543 ifr.ifr_data = (caddr_t) ecmd;
4546 if (ioctl(af_inet_sock, SIOCETHTOOL, &ifr) == 0) {
4549 if (errno != EOPNOTSUPP) {
4550 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
4551 "failed: %s", cmd_name, name, strerror(errno));
4553 /* The device doesn't support this operation. That's pretty
4554 * common, so there's no point in logging anything. */
4561 netdev_linux_do_ioctl(const char *name, struct ifreq *ifr, int cmd,
4562 const char *cmd_name)
4564 ovs_strzcpy(ifr->ifr_name, name, sizeof ifr->ifr_name);
4565 if (ioctl(af_inet_sock, cmd, ifr) == -1) {
4566 VLOG_DBG_RL(&rl, "%s: ioctl(%s) failed: %s", name, cmd_name,
4574 netdev_linux_get_ipv4(const struct netdev *netdev, struct in_addr *ip,
4575 int cmd, const char *cmd_name)
4580 ifr.ifr_addr.sa_family = AF_INET;
4581 error = netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, cmd, cmd_name);
4583 const struct sockaddr_in *sin = (struct sockaddr_in *) &ifr.ifr_addr;
4584 *ip = sin->sin_addr;
4589 /* Returns an AF_PACKET raw socket or a negative errno value. */
4591 af_packet_sock(void)
4593 static int sock = INT_MIN;
4595 if (sock == INT_MIN) {
4596 sock = socket(AF_PACKET, SOCK_RAW, 0);
4598 set_nonblocking(sock);
4601 VLOG_ERR("failed to create packet socket: %s", strerror(errno));