2 * Copyright (c) 2009, 2010, 2011, 2012, 2013 Nicira, Inc.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
19 #include "netdev-linux.h"
23 #include <arpa/inet.h>
25 #include <linux/gen_stats.h>
26 #include <linux/if_ether.h>
27 #include <linux/if_tun.h>
28 #include <linux/types.h>
29 #include <linux/ethtool.h>
30 #include <linux/mii.h>
31 #include <linux/pkt_cls.h>
32 #include <linux/pkt_sched.h>
33 #include <linux/rtnetlink.h>
34 #include <linux/sockios.h>
35 #include <linux/version.h>
36 #include <sys/types.h>
37 #include <sys/ioctl.h>
38 #include <sys/socket.h>
39 #include <netpacket/packet.h>
41 #include <net/if_arp.h>
42 #include <net/if_packet.h>
43 #include <net/route.h>
44 #include <netinet/in.h>
51 #include "dpif-linux.h"
52 #include "dynamic-string.h"
53 #include "fatal-signal.h"
56 #include "netdev-provider.h"
57 #include "netdev-vport.h"
58 #include "netlink-notifier.h"
59 #include "netlink-socket.h"
62 #include "openflow/openflow.h"
64 #include "poll-loop.h"
65 #include "rtnetlink-link.h"
67 #include "socket-util.h"
70 #include "unaligned.h"
73 VLOG_DEFINE_THIS_MODULE(netdev_linux);
75 COVERAGE_DEFINE(netdev_set_policing);
76 COVERAGE_DEFINE(netdev_arp_lookup);
77 COVERAGE_DEFINE(netdev_get_ifindex);
78 COVERAGE_DEFINE(netdev_get_hwaddr);
79 COVERAGE_DEFINE(netdev_set_hwaddr);
80 COVERAGE_DEFINE(netdev_get_ethtool);
81 COVERAGE_DEFINE(netdev_set_ethtool);
84 /* These were introduced in Linux 2.6.14, so they might be missing if we have old headers. */
86 #ifndef ADVERTISED_Pause
87 #define ADVERTISED_Pause (1 << 13)
89 #ifndef ADVERTISED_Asym_Pause
90 #define ADVERTISED_Asym_Pause (1 << 14)
93 /* These were introduced in Linux 2.6.24, so they might be missing if we
94 * have old headers. */
95 #ifndef ETHTOOL_GFLAGS
96 #define ETHTOOL_GFLAGS 0x00000025 /* Get flags bitmap(ethtool_value) */
98 #ifndef ETHTOOL_SFLAGS
99 #define ETHTOOL_SFLAGS 0x00000026 /* Set flags bitmap(ethtool_value) */
102 /* This was introduced in Linux 2.6.25, so it might be missing if we have old headers. */
105 #define TC_RTAB_SIZE 1024
108 static struct nln_notifier *netdev_linux_cache_notifier = NULL;
109 static int cache_notifier_refcount;
112 VALID_IFINDEX = 1 << 0,
113 VALID_ETHERADDR = 1 << 1,
117 VALID_POLICING = 1 << 5,
118 VALID_VPORT_STAT_ERROR = 1 << 6,
119 VALID_DRVINFO = 1 << 7,
120 VALID_FEATURES = 1 << 8,
127 /* Traffic control. */
129 /* An instance of a traffic control class. Always associated with a particular network device.
132 * Each TC implementation subclasses this with whatever additional data it needs. */
135 const struct tc_ops *ops;
136 struct hmap queues; /* Contains "struct tc_queue"s.
137 * Read by generic TC layer.
138 * Written only by TC implementation. */
141 #define TC_INITIALIZER(TC, OPS) { OPS, HMAP_INITIALIZER(&(TC)->queues) }
143 /* One traffic control queue.
145 * Each TC implementation subclasses this with whatever additional data it needs. */
148 struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
149 unsigned int queue_id; /* OpenFlow queue ID. */
152 /* A particular kind of traffic control. Each implementation generally maps to
153 * one particular Linux qdisc class.
155 * The functions below return 0 if successful or a positive errno value on
156 * failure, except where otherwise noted. All of them must be provided, except
157 * where otherwise noted. */
159 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
160 * This is null for tc_ops_default and tc_ops_other, for which there are no
161 * appropriate values. */
162 const char *linux_name;
164 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
165 const char *ovs_name;
167 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
168 * queues. The queues are numbered 0 through n_queues - 1. */
169 unsigned int n_queues;
171 /* Called to install this TC class on 'netdev'. The implementation should
172 * make the Netlink calls required to set up 'netdev' with the right qdisc
173 * and configure it according to 'details'. The implementation may assume
174 * that the current qdisc is the default; that is, there is no need for it
175 * to delete the current qdisc before installing itself.
177 * The contents of 'details' should be documented as valid for 'ovs_name'
178 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
179 * (which is built as ovs-vswitchd.conf.db(8)).
181 * This function must return 0 if and only if it sets 'netdev->tc' to an
182 * initialized 'struct tc'.
184 * (This function is null for tc_ops_other, which cannot be installed. For
185 * other TC classes it should always be nonnull.) */
186 int (*tc_install)(struct netdev *netdev, const struct smap *details);
188 /* Called when the netdev code determines (through a Netlink query) that
189 * this TC class's qdisc is installed on 'netdev', but we didn't install
190 * it ourselves and so don't know any of the details.
192 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
193 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
194 * implementation should parse the other attributes of 'nlmsg' as
195 * necessary to determine its configuration. If necessary it should also
196 * use Netlink queries to determine the configuration of queues on 'netdev'.
199 * This function must return 0 if and only if it sets 'netdev->tc' to an
200 * initialized 'struct tc'. */
201 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
203 /* Destroys the data structures allocated by the implementation as part of
204 * 'tc'. (This includes destroying 'tc->queues' by calling tc_destroy(tc).)
207 * The implementation should not need to perform any Netlink calls. If
208 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
209 * (But it may not be desirable.)
211 * This function may be null if 'tc' is trivial. */
212 void (*tc_destroy)(struct tc *tc);
214 /* Retrieves details of 'netdev->tc' configuration into 'details'.
216 * The implementation should not need to perform any Netlink calls, because
217 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
218 * cached the configuration.
220 * The contents of 'details' should be documented as valid for 'ovs_name'
221 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
222 * (which is built as ovs-vswitchd.conf.db(8)).
224 * This function may be null if 'tc' is not configurable. */
226 int (*qdisc_get)(const struct netdev *netdev, struct smap *details);
228 /* Reconfigures 'netdev->tc' according to 'details', performing any
229 * required Netlink calls to complete the reconfiguration.
231 * The contents of 'details' should be documented as valid for 'ovs_name'
232 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
233 * (which is built as ovs-vswitchd.conf.db(8)).
235 * This function may be null if 'tc' is not configurable. */
237 int (*qdisc_set)(struct netdev *, const struct smap *details);
239 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
240 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
242 * The contents of 'details' should be documented as valid for 'ovs_name'
243 * in the "other_config" column in the "Queue" table in
244 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
246 * The implementation should not need to perform any Netlink calls, because
247 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
248 * cached the queue configuration.
250 * This function may be null if 'tc' does not have queues ('n_queues' is 0). */
252 int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
253 struct smap *details);
255 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
256 * 'details', performing any required Netlink calls to complete the
257 * reconfiguration. The caller ensures that 'queue_id' is less than 'n_queues'.
260 * The contents of 'details' should be documented as valid for 'ovs_name'
261 * in the "other_config" column in the "Queue" table in
262 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
264 * This function may be null if 'tc' does not have queues or its queues are
265 * not configurable. */
266 int (*class_set)(struct netdev *, unsigned int queue_id,
267 const struct smap *details);
269 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
270 * tc_queue's within 'netdev->tc->queues'.
272 * This function may be null if 'tc' does not have queues or its queues
273 * cannot be deleted. */
274 int (*class_delete)(struct netdev *, struct tc_queue *queue);
276 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
277 * 'struct tc_queue's within 'netdev->tc->queues'.
279 * On success, initializes '*stats'.
281 * This function may be null if 'tc' does not have queues or if it cannot
282 * report queue statistics. */
283 int (*class_get_stats)(const struct netdev *netdev,
284 const struct tc_queue *queue,
285 struct netdev_queue_stats *stats);
287 /* Extracts queue stats from 'nlmsg', which is a response to a
288 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
290 * This function may be null if 'tc' does not have queues or if it cannot
291 * report queue statistics. */
292 int (*class_dump_stats)(const struct netdev *netdev,
293 const struct ofpbuf *nlmsg,
294 netdev_dump_queue_stats_cb *cb, void *aux);
298 tc_init(struct tc *tc, const struct tc_ops *ops)
301 hmap_init(&tc->queues);
305 tc_destroy(struct tc *tc)
307 hmap_destroy(&tc->queues);
310 static const struct tc_ops tc_ops_htb;
311 static const struct tc_ops tc_ops_hfsc;
312 static const struct tc_ops tc_ops_default;
313 static const struct tc_ops tc_ops_other;
315 static const struct tc_ops *const tcs[] = {
316 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
317 &tc_ops_hfsc, /* Hierarchical fair service curve. */
318 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
319 &tc_ops_other, /* Some other qdisc. */
323 static unsigned int tc_make_handle(unsigned int major, unsigned int minor);
324 static unsigned int tc_get_major(unsigned int handle);
325 static unsigned int tc_get_minor(unsigned int handle);
327 static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
328 static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
329 static unsigned int tc_buffer_per_jiffy(unsigned int rate);
331 static struct tcmsg *tc_make_request(const struct netdev *, int type,
332 unsigned int flags, struct ofpbuf *);
333 static int tc_transact(struct ofpbuf *request, struct ofpbuf **replyp);
334 static int tc_add_del_ingress_qdisc(struct netdev *netdev, bool add);
335 static int tc_add_policer(struct netdev *netdev, int kbits_rate,
338 static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
339 struct nlattr **options);
340 static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
341 struct nlattr **options,
342 struct netdev_queue_stats *);
343 static int tc_query_class(const struct netdev *,
344 unsigned int handle, unsigned int parent,
345 struct ofpbuf **replyp);
346 static int tc_delete_class(const struct netdev *, unsigned int handle);
348 static int tc_del_qdisc(struct netdev *netdev);
349 static int tc_query_qdisc(const struct netdev *netdev);
351 static int tc_calc_cell_log(unsigned int mtu);
352 static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
353 static void tc_put_rtab(struct ofpbuf *, uint16_t type,
354 const struct tc_ratespec *rate);
355 static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
357 struct netdev_dev_linux {
358 struct netdev_dev up;
360 struct shash_node *shash_node;
361 unsigned int cache_valid;
362 unsigned int change_seq;
364 bool miimon; /* Link status of last poll. */
365 long long int miimon_interval; /* Miimon Poll rate. Disabled if <= 0. */
366 struct timer miimon_timer;
368 /* The following are figured out "on demand" only. They are only valid
369 * when the corresponding VALID_* bit in 'cache_valid' is set. */
371 uint8_t etheraddr[ETH_ADDR_LEN];
372 struct in_addr address, netmask;
375 unsigned int ifi_flags;
376 long long int carrier_resets;
377 uint32_t kbits_rate; /* Policing data. */
378 uint32_t kbits_burst;
379 int vport_stats_error; /* Cached error code from vport_get_stats().
380 0 or an errno value. */
381 int netdev_mtu_error; /* Cached error code from SIOCGIFMTU or SIOCSIFMTU. */
382 int ether_addr_error; /* Cached error code from set/get etheraddr. */
383 int netdev_policing_error; /* Cached error code from set policing. */
384 int get_features_error; /* Cached error code from ETHTOOL_GSET. */
385 int get_ifindex_error; /* Cached error code from SIOCGIFINDEX. */
387 enum netdev_features current; /* Cached from ETHTOOL_GSET. */
388 enum netdev_features advertised; /* Cached from ETHTOOL_GSET. */
389 enum netdev_features supported; /* Cached from ETHTOOL_GSET. */
390 enum netdev_features peer; /* Cached from ETHTOOL_GSET. */
392 struct ethtool_drvinfo drvinfo; /* Cached from ETHTOOL_GDRVINFO. */
396 struct tap_state tap;
400 struct netdev_linux {
404 struct netdev_rx_linux {
410 static const struct netdev_rx_class netdev_rx_linux_class;
412 /* Sockets used for ioctl operations. */
413 static int af_inet_sock = -1; /* AF_INET, SOCK_DGRAM. */
415 /* A Netlink routing socket that is not subscribed to any multicast groups. */
416 static struct nl_sock *rtnl_sock;
418 /* This is set pretty low because we probably won't learn anything from the
419 * additional log messages. */
420 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
422 static int netdev_linux_init(void);
424 static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
425 int cmd, const char *cmd_name);
426 static int netdev_linux_do_ioctl(const char *name, struct ifreq *, int cmd,
427 const char *cmd_name);
428 static int netdev_linux_get_ipv4(const struct netdev *, struct in_addr *,
429 int cmd, const char *cmd_name);
430 static int get_flags(const struct netdev_dev *, unsigned int *flags);
431 static int set_flags(const char *, unsigned int flags);
432 static int do_get_ifindex(const char *netdev_name);
433 static int get_ifindex(const struct netdev *, int *ifindexp);
434 static int do_set_addr(struct netdev *netdev,
435 int ioctl_nr, const char *ioctl_name,
436 struct in_addr addr);
437 static int get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN]);
438 static int set_etheraddr(const char *netdev_name, const uint8_t[ETH_ADDR_LEN]);
439 static int get_stats_via_netlink(int ifindex, struct netdev_stats *stats);
440 static int get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats);
441 static int af_packet_sock(void);
442 static void netdev_linux_miimon_run(void);
443 static void netdev_linux_miimon_wait(void);
446 is_netdev_linux_class(const struct netdev_class *netdev_class)
448 return netdev_class->init == netdev_linux_init;
452 is_tap_netdev(const struct netdev *netdev)
454 return netdev_dev_get_class(netdev_get_dev(netdev)) == &netdev_tap_class;
457 static struct netdev_dev_linux *
458 netdev_dev_linux_cast(const struct netdev_dev *netdev_dev)
460 const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
461 ovs_assert(is_netdev_linux_class(netdev_class));
463 return CONTAINER_OF(netdev_dev, struct netdev_dev_linux, up);
466 static struct netdev_linux *
467 netdev_linux_cast(const struct netdev *netdev)
469 struct netdev_dev *netdev_dev = netdev_get_dev(netdev);
470 const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
471 ovs_assert(is_netdev_linux_class(netdev_class));
473 return CONTAINER_OF(netdev, struct netdev_linux, up);
476 static struct netdev_rx_linux *
477 netdev_rx_linux_cast(const struct netdev_rx *rx)
479 netdev_rx_assert_class(rx, &netdev_rx_linux_class);
480 return CONTAINER_OF(rx, struct netdev_rx_linux, up);
/* Creates the sockets shared by this file: an AF_INET datagram socket stored
 * in 'af_inet_sock' and a NETLINK_ROUTE socket stored in 'rtnl_sock'.
 *
 * 'status' is static, so the outcome of one call is still available to the
 * next one. Each failure is logged with the text of the error code.
 *
 * NOTE(review): the conditions that guard each step and the return statement
 * are on lines absent from this listing; confirm them in the complete file. */
484 netdev_linux_init(void)
486 static int status = -1;
488 /* Create AF_INET socket. */
489 af_inet_sock = socket(AF_INET, SOCK_DGRAM, 0);
/* 0 when the socket was created, otherwise the errno left by socket(). */
490 status = af_inet_sock >= 0 ? 0 : errno;
492 VLOG_ERR("failed to create inet socket: %s", strerror(status));
495 /* Create rtnetlink socket. */
497 status = nl_sock_create(NETLINK_ROUTE, &rtnl_sock);
499 VLOG_ERR_RL(&rl, "failed to create rtnetlink socket: %s",
508 netdev_linux_run(void)
510 rtnetlink_link_run();
511 netdev_linux_miimon_run();
515 netdev_linux_wait(void)
517 rtnetlink_link_wait();
518 netdev_linux_miimon_wait();
/* Records a state change on 'dev'.
 *
 * Counts a carrier reset when IFF_RUNNING differs between the flags stored in
 * 'dev' and the new 'ifi_flags', then stores 'ifi_flags' and clears every
 * 'cache_valid' bit that is not set in 'mask'. Passing 0 as the mask therefore
 * invalidates the whole cache.
 *
 * NOTE(review): the declaration of 'mask' and the handling of 'change_seq'
 * (including what happens when it is zero) are on lines absent from this
 * listing. */
522 netdev_dev_linux_changed(struct netdev_dev_linux *dev,
523 unsigned int ifi_flags,
527 if (!dev->change_seq) {
/* XOR isolates the bits that changed; only IFF_RUNNING matters here. */
531 if ((dev->ifi_flags ^ ifi_flags) & IFF_RUNNING) {
532 dev->carrier_resets++;
534 dev->ifi_flags = ifi_flags;
/* Keep only the cached items that the caller says are still valid. */
536 dev->cache_valid &= mask;
/* Applies the rtnetlink link notification 'change' to 'dev'.
 *
 * For RTM_NEWLINK the cache is invalidated except for VALID_DRVINFO, and then
 * the MTU, the Ethernet address (when it is not all-zero) and the ifindex are
 * refilled from the message, each marked valid with its cached error cleared.
 *
 * The last call passes a mask of 0, which invalidates the whole cache.
 * NOTE(review): that call presumably belongs to the non-RTM_NEWLINK branch,
 * and the MTU update may have its own guard; the lines that would show this
 * are absent from this listing. */
540 netdev_dev_linux_update(struct netdev_dev_linux *dev,
541 const struct rtnetlink_link_change *change)
543 if (change->nlmsg_type == RTM_NEWLINK) {
/* Driver info survives a link update; everything else is re-derived. */
545 netdev_dev_linux_changed(dev, change->ifi_flags, VALID_DRVINFO);
547 /* Update netdev from rtnl-change msg. */
549 dev->mtu = change->mtu;
550 dev->cache_valid |= VALID_MTU;
551 dev->netdev_mtu_error = 0;
/* An all-zero address in the message is not copied into the cache. */
554 if (!eth_addr_is_zero(change->addr)) {
555 memcpy(dev->etheraddr, change->addr, ETH_ADDR_LEN);
556 dev->cache_valid |= VALID_ETHERADDR;
557 dev->ether_addr_error = 0;
560 dev->ifindex = change->ifi_index;
561 dev->cache_valid |= VALID_IFINDEX;
562 dev->get_ifindex_error = 0;
565 netdev_dev_linux_changed(dev, change->ifi_flags, 0);
570 netdev_linux_cache_cb(const struct rtnetlink_link_change *change,
571 void *aux OVS_UNUSED)
573 struct netdev_dev_linux *dev;
575 struct netdev_dev *base_dev = netdev_dev_from_name(change->ifname);
577 const struct netdev_class *netdev_class =
578 netdev_dev_get_class(base_dev);
580 if (is_netdev_linux_class(netdev_class)) {
581 dev = netdev_dev_linux_cast(base_dev);
582 netdev_dev_linux_update(dev, change);
586 struct shash device_shash;
587 struct shash_node *node;
589 shash_init(&device_shash);
590 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
591 SHASH_FOR_EACH (node, &device_shash) {
596 get_flags(&dev->up, &flags);
597 netdev_dev_linux_changed(dev, flags, 0);
599 shash_destroy(&device_shash);
/* Takes a reference on the shared rtnetlink link notifier.
 *
 * When the reference count is zero the notifier is created first, with
 * netdev_linux_cache_cb() as its callback and no auxiliary data; the assertion
 * checks that no notifier is left over from an earlier use.
 *
 * NOTE(review): what happens when creation fails, and the value returned, are
 * on lines absent from this listing. */
604 cache_notifier_ref(void)
606 if (!cache_notifier_refcount) {
607 ovs_assert(!netdev_linux_cache_notifier);
609 netdev_linux_cache_notifier =
610 rtnetlink_link_notifier_create(netdev_linux_cache_cb, NULL);
612 if (!netdev_linux_cache_notifier) {
616 cache_notifier_refcount++;
/* Drops a reference on the shared rtnetlink link notifier.
 *
 * Asserts that at least one reference is outstanding. When the count reaches
 * zero the notifier is destroyed and the global pointer is reset to NULL, so
 * that a later cache_notifier_ref() creates a fresh one. */
622 cache_notifier_unref(void)
624 ovs_assert(cache_notifier_refcount > 0);
625 if (!--cache_notifier_refcount) {
626 ovs_assert(netdev_linux_cache_notifier);
627 rtnetlink_link_notifier_destroy(netdev_linux_cache_notifier);
628 netdev_linux_cache_notifier = NULL;
632 /* Creates system and internal devices. */
634 netdev_linux_create(const struct netdev_class *class, const char *name,
635 struct netdev_dev **netdev_devp)
637 struct netdev_dev_linux *netdev_dev;
640 error = cache_notifier_ref();
645 netdev_dev = xzalloc(sizeof *netdev_dev);
646 netdev_dev->change_seq = 1;
647 netdev_dev_init(&netdev_dev->up, name, class);
648 get_flags(&netdev_dev->up, &netdev_dev->ifi_flags);
650 *netdev_devp = &netdev_dev->up;
654 /* For most types of netdevs we open the device for each call of
655 * netdev_open(). However, this is not the case with tap devices,
656 * since it is only possible to open the device once. In this
657 * situation we share a single file descriptor, and consequently
658 * buffers, across all readers. Therefore once data is read it will
659 * be unavailable to other reads for tap devices. */
661 netdev_linux_create_tap(const struct netdev_class *class OVS_UNUSED,
662 const char *name, struct netdev_dev **netdev_devp)
664 struct netdev_dev_linux *netdev_dev;
665 struct tap_state *state;
666 static const char tap_dev[] = "/dev/net/tun";
670 netdev_dev = xzalloc(sizeof *netdev_dev);
671 state = &netdev_dev->state.tap;
673 error = cache_notifier_ref();
678 /* Open tap device. */
679 state->fd = open(tap_dev, O_RDWR);
682 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, strerror(error));
683 goto error_unref_notifier;
686 /* Create tap device. */
687 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
688 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
689 if (ioctl(state->fd, TUNSETIFF, &ifr) == -1) {
690 VLOG_WARN("%s: creating tap device failed: %s", name,
693 goto error_unref_notifier;
696 /* Make non-blocking. */
697 error = set_nonblocking(state->fd);
699 goto error_unref_notifier;
702 netdev_dev_init(&netdev_dev->up, name, &netdev_tap_class);
703 *netdev_devp = &netdev_dev->up;
706 error_unref_notifier:
707 cache_notifier_unref();
714 destroy_tap(struct netdev_dev_linux *netdev_dev)
716 struct tap_state *state = &netdev_dev->state.tap;
718 if (state->fd >= 0) {
723 /* Destroys the netdev device 'netdev_dev_'. */
725 netdev_linux_destroy(struct netdev_dev *netdev_dev_)
727 struct netdev_dev_linux *netdev_dev = netdev_dev_linux_cast(netdev_dev_);
728 const struct netdev_class *class = netdev_dev_get_class(netdev_dev_);
730 if (netdev_dev->tc && netdev_dev->tc->ops->tc_destroy) {
731 netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
734 if (class == &netdev_tap_class) {
735 destroy_tap(netdev_dev);
739 cache_notifier_unref();
743 netdev_linux_open(struct netdev_dev *netdev_dev_, struct netdev **netdevp)
745 struct netdev_linux *netdev;
746 enum netdev_flags flags;
749 /* Allocate network device. */
750 netdev = xzalloc(sizeof *netdev);
751 netdev_init(&netdev->up, netdev_dev_);
753 /* Verify that the device really exists, by attempting to read its flags.
754 * (The flags might be cached, in which case this won't actually do an
757 * Don't do this for "internal" netdevs, though, because those have to be
758 * created as netdev objects before they exist in the kernel, because
759 * creating them in the kernel happens by passing a netdev object to
760 * dpif_port_add(). */
761 if (netdev_dev_get_class(netdev_dev_) != &netdev_internal_class) {
762 error = netdev_get_flags(&netdev->up, &flags);
763 if (error == ENODEV) {
768 *netdevp = &netdev->up;
772 netdev_uninit(&netdev->up, true);
776 /* Closes and destroys 'netdev'. */
778 netdev_linux_close(struct netdev *netdev_)
780 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
786 netdev_linux_rx_open(struct netdev *netdev_, struct netdev_rx **rxp)
788 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
789 struct netdev_dev_linux *netdev_dev =
790 netdev_dev_linux_cast(netdev_get_dev(netdev_));
791 bool is_tap = is_tap_netdev(netdev_);
792 struct netdev_rx_linux *rx;
797 fd = netdev_dev->state.tap.fd;
799 struct sockaddr_ll sll;
802 /* Create file descriptor. */
803 fd = socket(PF_PACKET, SOCK_RAW, 0);
806 VLOG_ERR("failed to create raw socket (%s)", strerror(error));
810 /* Set non-blocking mode. */
811 error = set_nonblocking(fd);
816 /* Get ethernet device index. */
817 error = get_ifindex(&netdev->up, &ifindex);
822 /* Bind to specific ethernet device. */
823 memset(&sll, 0, sizeof sll);
824 sll.sll_family = AF_PACKET;
825 sll.sll_ifindex = ifindex;
826 sll.sll_protocol = (OVS_FORCE unsigned short int) htons(ETH_P_ALL);
827 if (bind(fd, (struct sockaddr *) &sll, sizeof sll) < 0) {
829 VLOG_ERR("%s: failed to bind raw socket (%s)",
830 netdev_get_name(netdev_), strerror(error));
835 rx = xmalloc(sizeof *rx);
836 netdev_rx_init(&rx->up, netdev_get_dev(netdev_), &netdev_rx_linux_class);
851 netdev_rx_linux_destroy(struct netdev_rx *rx_)
853 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
/* Receives one packet from 'rx_' into 'data', which has room for 'size'
 * bytes. The transfer uses read() or recv(..., MSG_TRUNC) on 'rx->fd' and is
 * retried for as long as it fails with EINTR. A failure other than EAGAIN is
 * logged through the rate-limited 'rl'.
 *
 * NOTE(review): the condition that selects read() versus recv(), the
 * declaration of 'retval' and the return statements are on lines absent from
 * this listing; confirm them against the complete file. */
862 netdev_rx_linux_recv(struct netdev_rx *rx_, void *data, size_t size)
864 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
869 ? read(rx->fd, data, size)
870 : recv(rx->fd, data, size, MSG_TRUNC));
871 } while (retval < 0 && errno == EINTR);
875 } else if (retval >= 0) {
878 if (errno != EAGAIN) {
/* The format reads "... on <device>: <error>", so the receiver's name
 * has to be passed before the strerror() text. */
879 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
880 netdev_rx_get_name(rx_), strerror(errno));
887 netdev_rx_linux_wait(struct netdev_rx *rx_)
889 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
890 poll_fd_wait(rx->fd, POLLIN);
894 netdev_rx_linux_drain(struct netdev_rx *rx_)
896 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
899 int error = netdev_linux_do_ioctl(netdev_rx_get_name(rx_), &ifr,
900 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
904 drain_fd(rx->fd, ifr.ifr_qlen);
907 return drain_rcvbuf(rx->fd);
911 /* Sends 'buffer' on 'netdev'. Returns 0 if successful, otherwise a positive
912 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
913 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
914 * the packet is too big or too small to transmit on the device.
916 * The caller retains ownership of 'buffer' in all cases.
918 * The kernel maintains a packet transmission queue, so the caller is not
919 * expected to do additional queuing of packets. */
921 netdev_linux_send(struct netdev *netdev_, const void *data, size_t size)
926 if (!is_tap_netdev(netdev_)) {
927 /* Use our AF_PACKET socket to send to this device. */
928 struct sockaddr_ll sll;
935 sock = af_packet_sock();
940 error = get_ifindex(netdev_, &ifindex);
945 /* We don't bother setting most fields in sockaddr_ll because the
946 * kernel ignores them for SOCK_RAW. */
947 memset(&sll, 0, sizeof sll);
948 sll.sll_family = AF_PACKET;
949 sll.sll_ifindex = ifindex;
951 iov.iov_base = CONST_CAST(void *, data);
955 msg.msg_namelen = sizeof sll;
958 msg.msg_control = NULL;
959 msg.msg_controllen = 0;
962 retval = sendmsg(sock, &msg, 0);
964 /* Use the tap fd to send to this device. This is essential for
965 * tap devices, because packets sent to a tap device with an
966 * AF_PACKET socket will loop back to be *received* again on the
968 struct netdev_dev_linux *dev
969 = netdev_dev_linux_cast(netdev_get_dev(netdev_));
971 retval = write(dev->state.tap.fd, data, size);
975 /* The Linux AF_PACKET implementation never blocks waiting for room
976 * for packets, instead returning ENOBUFS. Translate this into
977 * EAGAIN for the caller. */
978 if (errno == ENOBUFS) {
980 } else if (errno == EINTR) {
982 } else if (errno != EAGAIN) {
983 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
984 netdev_get_name(netdev_), strerror(errno));
987 } else if (retval != size) {
988 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%zd bytes of "
989 "%zu) on %s", retval, size, netdev_get_name(netdev_));
997 /* Registers with the poll loop to wake up from the next call to poll_block()
998 * when the packet transmission queue has sufficient room to transmit a packet
999 * with netdev_send().
1001 * The kernel maintains a packet transmission queue, so the client is not
1002 * expected to do additional queuing of packets. Thus, this function is
1003 * unlikely to ever be used. It is included for completeness. */
1005 netdev_linux_send_wait(struct netdev *netdev)
1007 if (is_tap_netdev(netdev)) {
1008 /* TAP device always accepts packets.*/
1009 poll_immediate_wake();
1013 /* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
1014 * otherwise a positive errno value. */
1016 netdev_linux_set_etheraddr(struct netdev *netdev_,
1017 const uint8_t mac[ETH_ADDR_LEN])
1019 struct netdev_dev_linux *netdev_dev =
1020 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1021 struct netdev_saved_flags *sf = NULL;
1024 if (netdev_dev->cache_valid & VALID_ETHERADDR) {
1025 if (netdev_dev->ether_addr_error) {
1026 return netdev_dev->ether_addr_error;
1028 if (eth_addr_equals(netdev_dev->etheraddr, mac)) {
1031 netdev_dev->cache_valid &= ~VALID_ETHERADDR;
1034 /* Tap devices must be brought down before setting the address. */
1035 if (is_tap_netdev(netdev_)) {
1036 enum netdev_flags flags;
1038 if (!netdev_get_flags(netdev_, &flags) && (flags & NETDEV_UP)) {
1039 netdev_turn_flags_off(netdev_, NETDEV_UP, &sf);
1042 error = set_etheraddr(netdev_get_name(netdev_), mac);
1043 if (!error || error == ENODEV) {
1044 netdev_dev->ether_addr_error = error;
1045 netdev_dev->cache_valid |= VALID_ETHERADDR;
1047 memcpy(netdev_dev->etheraddr, mac, ETH_ADDR_LEN);
1051 netdev_restore_flags(sf);
1056 /* Copies 'netdev''s MAC address to 'mac' which is passed as param. */
1058 netdev_linux_get_etheraddr(const struct netdev *netdev_,
1059 uint8_t mac[ETH_ADDR_LEN])
1061 struct netdev_dev_linux *netdev_dev =
1062 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1064 if (!(netdev_dev->cache_valid & VALID_ETHERADDR)) {
1065 int error = get_etheraddr(netdev_get_name(netdev_),
1066 netdev_dev->etheraddr);
1068 netdev_dev->ether_addr_error = error;
1069 netdev_dev->cache_valid |= VALID_ETHERADDR;
1072 if (!netdev_dev->ether_addr_error) {
1073 memcpy(mac, netdev_dev->etheraddr, ETH_ADDR_LEN);
1076 return netdev_dev->ether_addr_error;
1079 /* Returns the maximum size of transmitted (and received) packets on 'netdev',
1080 * in bytes, not including the hardware header; thus, this is typically 1500
1081 * bytes for Ethernet devices. */
1083 netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
1085 struct netdev_dev_linux *netdev_dev =
1086 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1087 if (!(netdev_dev->cache_valid & VALID_MTU)) {
1091 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
1092 SIOCGIFMTU, "SIOCGIFMTU");
1094 netdev_dev->netdev_mtu_error = error;
1095 netdev_dev->mtu = ifr.ifr_mtu;
1096 netdev_dev->cache_valid |= VALID_MTU;
1099 if (!netdev_dev->netdev_mtu_error) {
1100 *mtup = netdev_dev->mtu;
1102 return netdev_dev->netdev_mtu_error;
1105 /* Sets the maximum size of transmitted (MTU) for given device using linux
1106 * networking ioctl interface. */
1109 netdev_linux_set_mtu(const struct netdev *netdev_, int mtu)
1111 struct netdev_dev_linux *netdev_dev =
1112 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1116 if (netdev_dev->cache_valid & VALID_MTU) {
1117 if (netdev_dev->netdev_mtu_error) {
1118 return netdev_dev->netdev_mtu_error;
1120 if (netdev_dev->mtu == mtu) {
1123 netdev_dev->cache_valid &= ~VALID_MTU;
1126 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
1127 SIOCSIFMTU, "SIOCSIFMTU");
1128 if (!error || error == ENODEV) {
1129 netdev_dev->netdev_mtu_error = error;
1130 netdev_dev->mtu = ifr.ifr_mtu;
1131 netdev_dev->cache_valid |= VALID_MTU;
1136 /* Returns the ifindex of 'netdev', if successful, as a positive number.
1137 * On failure, returns a negative errno value. */
1139 netdev_linux_get_ifindex(const struct netdev *netdev)
1143 error = get_ifindex(netdev, &ifindex);
1144 return error ? -error : ifindex;
/* Stores the carrier (link) state of 'netdev_' in '*carrier'.
 *
 * When MII monitoring is enabled for the device ('miimon_interval' > 0) the
 * result is the link status recorded by the last MII poll. Otherwise it is
 * whether IFF_RUNNING is set in the cached interface flags.
 *
 * NOTE(review): the return statement is on a line absent from this listing. */
1148 netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
1150 struct netdev_dev_linux *netdev_dev =
1151 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1153 if (netdev_dev->miimon_interval > 0) {
1154 *carrier = netdev_dev->miimon;
1156 *carrier = (netdev_dev->ifi_flags & IFF_RUNNING) != 0;
1162 static long long int
1163 netdev_linux_get_carrier_resets(const struct netdev *netdev)
1165 return netdev_dev_linux_cast(netdev_get_dev(netdev))->carrier_resets;
/* Issues the MII ioctl 'cmd' (named 'cmd_name' for logging) on device 'name'.
 *
 * '*data' is copied into a zeroed ifreq starting at 'ifr_data', the ioctl is
 * performed through netdev_linux_do_ioctl(), and the same bytes are copied
 * back so that the caller sees whatever the ioctl wrote into the request.
 *
 * NOTE(review): the declarations of 'ifr' and 'error' and the return statement
 * are on lines absent from this listing. */
1169 netdev_linux_do_miimon(const char *name, int cmd, const char *cmd_name,
1170 struct mii_ioctl_data *data)
1175 memset(&ifr, 0, sizeof ifr);
1176 memcpy(&ifr.ifr_data, data, sizeof *data);
1177 error = netdev_linux_do_ioctl(name, &ifr, cmd, cmd_name);
/* Copy back unconditionally; 'error' tells the caller whether to trust it. */
1178 memcpy(data, &ifr.ifr_data, sizeof *data);
/* Determines the link status of device 'name' and stores it in '*miimon'.
 *
 * MII is tried first: SIOCGMIIPHY, then SIOCGMIIREG for the MII_BMSR register,
 * with the link considered up when BMSR_LSTATUS is set in the value read.
 * The fallback is ethtool's ETHTOOL_GLINK, whose reply is reinterpreted as a
 * 'struct ethtool_value'.
 *
 * NOTE(review): the conditions that connect these steps and the return
 * statement are on lines absent from this listing. */
1184 netdev_linux_get_miimon(const char *name, bool *miimon)
1186 struct mii_ioctl_data data;
1191 memset(&data, 0, sizeof data);
1192 error = netdev_linux_do_miimon(name, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
1194 /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
1195 data.reg_num = MII_BMSR;
1196 error = netdev_linux_do_miimon(name, SIOCGMIIREG, "SIOCGMIIREG",
1200 *miimon = !!(data.val_out & BMSR_LSTATUS);
1202 VLOG_WARN_RL(&rl, "%s: failed to query MII", name);
1205 struct ethtool_cmd ecmd;
1207 VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
1210 COVERAGE_INC(netdev_get_ethtool);
1211 memset(&ecmd, 0, sizeof ecmd);
1212 error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
1215 struct ethtool_value eval;
/* The helper takes an ethtool_cmd buffer; read the start of it back as the
 * ethtool_value that carries the link state in 'data'. */
1217 memcpy(&eval, &ecmd, sizeof eval);
1218 *miimon = !!eval.data;
1220 VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
1228 netdev_linux_set_miimon_interval(struct netdev *netdev_,
1229 long long int interval)
1231 struct netdev_dev_linux *netdev_dev;
1233 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev_));
1235 interval = interval > 0 ? MAX(interval, 100) : 0;
1236 if (netdev_dev->miimon_interval != interval) {
1237 netdev_dev->miimon_interval = interval;
1238 timer_set_expired(&netdev_dev->miimon_timer);
1245 netdev_linux_miimon_run(void)
1247 struct shash device_shash;
1248 struct shash_node *node;
1250 shash_init(&device_shash);
1251 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
1252 SHASH_FOR_EACH (node, &device_shash) {
1253 struct netdev_dev_linux *dev = node->data;
1256 if (dev->miimon_interval <= 0 || !timer_expired(&dev->miimon_timer)) {
1260 netdev_linux_get_miimon(dev->up.name, &miimon);
1261 if (miimon != dev->miimon) {
1262 dev->miimon = miimon;
1263 netdev_dev_linux_changed(dev, dev->ifi_flags, 0);
1266 timer_set_duration(&dev->miimon_timer, dev->miimon_interval);
1269 shash_destroy(&device_shash);
1273 netdev_linux_miimon_wait(void)
1275 struct shash device_shash;
1276 struct shash_node *node;
1278 shash_init(&device_shash);
1279 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
1280 SHASH_FOR_EACH (node, &device_shash) {
1281 struct netdev_dev_linux *dev = node->data;
1283 if (dev->miimon_interval > 0) {
1284 timer_wait(&dev->miimon_timer);
1287 shash_destroy(&device_shash);
1290 /* Check whether we can we use RTM_GETLINK to get network device statistics.
1291 * In pre-2.6.19 kernels, this was only available if wireless extensions were
1294 check_for_working_netlink_stats(void)
1296 /* Decide on the netdev_get_stats() implementation to use. Netlink is
1297 * preferable, so if that works, we'll use it. */
1298 int ifindex = do_get_ifindex("lo");
1300 VLOG_WARN("failed to get ifindex for lo, "
1301 "obtaining netdev stats from proc");
1304 struct netdev_stats stats;
1305 int error = get_stats_via_netlink(ifindex, &stats);
1307 VLOG_DBG("obtaining netdev stats via rtnetlink");
1310 VLOG_INFO("RTM_GETLINK failed (%s), obtaining netdev stats "
1311 "via proc (you are probably running a pre-2.6.19 "
1312 "kernel)", strerror(error));
/* Exchanges the values stored in '*a' and '*b'. */
static void
swap_uint64(uint64_t *a, uint64_t *b)
{
    uint64_t tmp = *a;
    *a = *b;
    *b = tmp;
}
1326 /* Copies 'src' into 'dst', performing format conversion in the process.
1328 * 'src' is allowed to be misaligned. */
1330 netdev_stats_from_ovs_vport_stats(struct netdev_stats *dst,
1331 const struct ovs_vport_stats *src)
1333 dst->rx_packets = get_unaligned_u64(&src->rx_packets);
1334 dst->tx_packets = get_unaligned_u64(&src->tx_packets);
1335 dst->rx_bytes = get_unaligned_u64(&src->rx_bytes);
1336 dst->tx_bytes = get_unaligned_u64(&src->tx_bytes);
1337 dst->rx_errors = get_unaligned_u64(&src->rx_errors);
1338 dst->tx_errors = get_unaligned_u64(&src->tx_errors);
1339 dst->rx_dropped = get_unaligned_u64(&src->rx_dropped);
1340 dst->tx_dropped = get_unaligned_u64(&src->tx_dropped);
1342 dst->collisions = 0;
1343 dst->rx_length_errors = 0;
1344 dst->rx_over_errors = 0;
1345 dst->rx_crc_errors = 0;
1346 dst->rx_frame_errors = 0;
1347 dst->rx_fifo_errors = 0;
1348 dst->rx_missed_errors = 0;
1349 dst->tx_aborted_errors = 0;
1350 dst->tx_carrier_errors = 0;
1351 dst->tx_fifo_errors = 0;
1352 dst->tx_heartbeat_errors = 0;
1353 dst->tx_window_errors = 0;
1357 get_stats_via_vport__(const struct netdev *netdev, struct netdev_stats *stats)
1359 struct dpif_linux_vport reply;
1363 error = dpif_linux_vport_get(netdev_get_name(netdev), &reply, &buf);
1366 } else if (!reply.stats) {
1371 netdev_stats_from_ovs_vport_stats(stats, reply.stats);
1379 get_stats_via_vport(const struct netdev *netdev_,
1380 struct netdev_stats *stats)
1382 struct netdev_dev_linux *netdev_dev =
1383 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1385 if (!netdev_dev->vport_stats_error ||
1386 !(netdev_dev->cache_valid & VALID_VPORT_STAT_ERROR)) {
1389 error = get_stats_via_vport__(netdev_, stats);
1390 if (error && error != ENOENT) {
1391 VLOG_WARN_RL(&rl, "%s: obtaining netdev stats via vport failed "
1392 "(%s)", netdev_get_name(netdev_), strerror(error));
1394 netdev_dev->vport_stats_error = error;
1395 netdev_dev->cache_valid |= VALID_VPORT_STAT_ERROR;
1400 netdev_linux_sys_get_stats(const struct netdev *netdev_,
1401 struct netdev_stats *stats)
1403 static int use_netlink_stats = -1;
1406 if (use_netlink_stats < 0) {
1407 use_netlink_stats = check_for_working_netlink_stats();
1410 if (use_netlink_stats) {
1413 error = get_ifindex(netdev_, &ifindex);
1415 error = get_stats_via_netlink(ifindex, stats);
1418 error = get_stats_via_proc(netdev_get_name(netdev_), stats);
1422 VLOG_WARN_RL(&rl, "%s: linux-sys get stats failed %d",
1423 netdev_get_name(netdev_), error);
1429 /* Retrieves current device stats for 'netdev-linux'. */
1431 netdev_linux_get_stats(const struct netdev *netdev_,
1432 struct netdev_stats *stats)
1434 struct netdev_dev_linux *netdev_dev =
1435 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1436 struct netdev_stats dev_stats;
1439 get_stats_via_vport(netdev_, stats);
1441 error = netdev_linux_sys_get_stats(netdev_, &dev_stats);
1444 if (netdev_dev->vport_stats_error) {
1451 if (netdev_dev->vport_stats_error) {
1452 /* stats not available from OVS then use ioctl stats. */
1455 stats->rx_errors += dev_stats.rx_errors;
1456 stats->tx_errors += dev_stats.tx_errors;
1457 stats->rx_dropped += dev_stats.rx_dropped;
1458 stats->tx_dropped += dev_stats.tx_dropped;
1459 stats->multicast += dev_stats.multicast;
1460 stats->collisions += dev_stats.collisions;
1461 stats->rx_length_errors += dev_stats.rx_length_errors;
1462 stats->rx_over_errors += dev_stats.rx_over_errors;
1463 stats->rx_crc_errors += dev_stats.rx_crc_errors;
1464 stats->rx_frame_errors += dev_stats.rx_frame_errors;
1465 stats->rx_fifo_errors += dev_stats.rx_fifo_errors;
1466 stats->rx_missed_errors += dev_stats.rx_missed_errors;
1467 stats->tx_aborted_errors += dev_stats.tx_aborted_errors;
1468 stats->tx_carrier_errors += dev_stats.tx_carrier_errors;
1469 stats->tx_fifo_errors += dev_stats.tx_fifo_errors;
1470 stats->tx_heartbeat_errors += dev_stats.tx_heartbeat_errors;
1471 stats->tx_window_errors += dev_stats.tx_window_errors;
1476 /* Retrieves current device stats for 'netdev-tap' netdev or
1477 * netdev-internal. */
1479 netdev_tap_get_stats(const struct netdev *netdev_,
1480 struct netdev_stats *stats)
1482 struct netdev_dev_linux *netdev_dev =
1483 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1484 struct netdev_stats dev_stats;
1487 get_stats_via_vport(netdev_, stats);
1489 error = netdev_linux_sys_get_stats(netdev_, &dev_stats);
1491 if (netdev_dev->vport_stats_error) {
1498 /* If this port is an internal port then the transmit and receive stats
1499 * will appear to be swapped relative to the other ports since we are the
1500 * one sending the data, not a remote computer. For consistency, we swap
1501 * them back here. This does not apply if we are getting stats from the
1502 * vport layer because it always tracks stats from the perspective of the
1504 if (netdev_dev->vport_stats_error) {
1506 swap_uint64(&stats->rx_packets, &stats->tx_packets);
1507 swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
1508 swap_uint64(&stats->rx_errors, &stats->tx_errors);
1509 swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
1510 stats->rx_length_errors = 0;
1511 stats->rx_over_errors = 0;
1512 stats->rx_crc_errors = 0;
1513 stats->rx_frame_errors = 0;
1514 stats->rx_fifo_errors = 0;
1515 stats->rx_missed_errors = 0;
1516 stats->tx_aborted_errors = 0;
1517 stats->tx_carrier_errors = 0;
1518 stats->tx_fifo_errors = 0;
1519 stats->tx_heartbeat_errors = 0;
1520 stats->tx_window_errors = 0;
1522 stats->rx_dropped += dev_stats.tx_dropped;
1523 stats->tx_dropped += dev_stats.rx_dropped;
1525 stats->rx_errors += dev_stats.tx_errors;
1526 stats->tx_errors += dev_stats.rx_errors;
1528 stats->multicast += dev_stats.multicast;
1529 stats->collisions += dev_stats.collisions;
1535 netdev_internal_get_stats(const struct netdev *netdev_,
1536 struct netdev_stats *stats)
1538 struct netdev_dev_linux *netdev_dev =
1539 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1541 get_stats_via_vport(netdev_, stats);
1542 return netdev_dev->vport_stats_error;
1546 netdev_internal_set_stats(struct netdev *netdev,
1547 const struct netdev_stats *stats)
1549 struct ovs_vport_stats vport_stats;
1550 struct dpif_linux_vport vport;
1553 vport_stats.rx_packets = stats->rx_packets;
1554 vport_stats.tx_packets = stats->tx_packets;
1555 vport_stats.rx_bytes = stats->rx_bytes;
1556 vport_stats.tx_bytes = stats->tx_bytes;
1557 vport_stats.rx_errors = stats->rx_errors;
1558 vport_stats.tx_errors = stats->tx_errors;
1559 vport_stats.rx_dropped = stats->rx_dropped;
1560 vport_stats.tx_dropped = stats->tx_dropped;
1562 dpif_linux_vport_init(&vport);
1563 vport.cmd = OVS_VPORT_CMD_SET;
1564 vport.name = netdev_get_name(netdev);
1565 vport.stats = &vport_stats;
1567 err = dpif_linux_vport_transact(&vport, NULL, NULL);
1569 /* If the vport layer doesn't know about the device, that doesn't mean it
1570 * doesn't exist (after all were able to open it when netdev_open() was
1571 * called), it just means that it isn't attached and we'll be getting
1572 * stats a different way. */
1573 if (err == ENODEV) {
1581 netdev_linux_read_features(struct netdev_dev_linux *netdev_dev)
1583 struct ethtool_cmd ecmd;
1587 if (netdev_dev->cache_valid & VALID_FEATURES) {
1591 COVERAGE_INC(netdev_get_ethtool);
1592 memset(&ecmd, 0, sizeof ecmd);
1593 error = netdev_linux_do_ethtool(netdev_dev->up.name, &ecmd,
1594 ETHTOOL_GSET, "ETHTOOL_GSET");
1599 /* Supported features. */
1600 netdev_dev->supported = 0;
1601 if (ecmd.supported & SUPPORTED_10baseT_Half) {
1602 netdev_dev->supported |= NETDEV_F_10MB_HD;
1604 if (ecmd.supported & SUPPORTED_10baseT_Full) {
1605 netdev_dev->supported |= NETDEV_F_10MB_FD;
1607 if (ecmd.supported & SUPPORTED_100baseT_Half) {
1608 netdev_dev->supported |= NETDEV_F_100MB_HD;
1610 if (ecmd.supported & SUPPORTED_100baseT_Full) {
1611 netdev_dev->supported |= NETDEV_F_100MB_FD;
1613 if (ecmd.supported & SUPPORTED_1000baseT_Half) {
1614 netdev_dev->supported |= NETDEV_F_1GB_HD;
1616 if (ecmd.supported & SUPPORTED_1000baseT_Full) {
1617 netdev_dev->supported |= NETDEV_F_1GB_FD;
1619 if (ecmd.supported & SUPPORTED_10000baseT_Full) {
1620 netdev_dev->supported |= NETDEV_F_10GB_FD;
1622 if (ecmd.supported & SUPPORTED_TP) {
1623 netdev_dev->supported |= NETDEV_F_COPPER;
1625 if (ecmd.supported & SUPPORTED_FIBRE) {
1626 netdev_dev->supported |= NETDEV_F_FIBER;
1628 if (ecmd.supported & SUPPORTED_Autoneg) {
1629 netdev_dev->supported |= NETDEV_F_AUTONEG;
1631 if (ecmd.supported & SUPPORTED_Pause) {
1632 netdev_dev->supported |= NETDEV_F_PAUSE;
1634 if (ecmd.supported & SUPPORTED_Asym_Pause) {
1635 netdev_dev->supported |= NETDEV_F_PAUSE_ASYM;
1638 /* Advertised features. */
1639 netdev_dev->advertised = 0;
1640 if (ecmd.advertising & ADVERTISED_10baseT_Half) {
1641 netdev_dev->advertised |= NETDEV_F_10MB_HD;
1643 if (ecmd.advertising & ADVERTISED_10baseT_Full) {
1644 netdev_dev->advertised |= NETDEV_F_10MB_FD;
1646 if (ecmd.advertising & ADVERTISED_100baseT_Half) {
1647 netdev_dev->advertised |= NETDEV_F_100MB_HD;
1649 if (ecmd.advertising & ADVERTISED_100baseT_Full) {
1650 netdev_dev->advertised |= NETDEV_F_100MB_FD;
1652 if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
1653 netdev_dev->advertised |= NETDEV_F_1GB_HD;
1655 if (ecmd.advertising & ADVERTISED_1000baseT_Full) {
1656 netdev_dev->advertised |= NETDEV_F_1GB_FD;
1658 if (ecmd.advertising & ADVERTISED_10000baseT_Full) {
1659 netdev_dev->advertised |= NETDEV_F_10GB_FD;
1661 if (ecmd.advertising & ADVERTISED_TP) {
1662 netdev_dev->advertised |= NETDEV_F_COPPER;
1664 if (ecmd.advertising & ADVERTISED_FIBRE) {
1665 netdev_dev->advertised |= NETDEV_F_FIBER;
1667 if (ecmd.advertising & ADVERTISED_Autoneg) {
1668 netdev_dev->advertised |= NETDEV_F_AUTONEG;
1670 if (ecmd.advertising & ADVERTISED_Pause) {
1671 netdev_dev->advertised |= NETDEV_F_PAUSE;
1673 if (ecmd.advertising & ADVERTISED_Asym_Pause) {
1674 netdev_dev->advertised |= NETDEV_F_PAUSE_ASYM;
1677 /* Current settings. */
1679 if (speed == SPEED_10) {
1680 netdev_dev->current = ecmd.duplex ? NETDEV_F_10MB_FD : NETDEV_F_10MB_HD;
1681 } else if (speed == SPEED_100) {
1682 netdev_dev->current = ecmd.duplex ? NETDEV_F_100MB_FD : NETDEV_F_100MB_HD;
1683 } else if (speed == SPEED_1000) {
1684 netdev_dev->current = ecmd.duplex ? NETDEV_F_1GB_FD : NETDEV_F_1GB_HD;
1685 } else if (speed == SPEED_10000) {
1686 netdev_dev->current = NETDEV_F_10GB_FD;
1687 } else if (speed == 40000) {
1688 netdev_dev->current = NETDEV_F_40GB_FD;
1689 } else if (speed == 100000) {
1690 netdev_dev->current = NETDEV_F_100GB_FD;
1691 } else if (speed == 1000000) {
1692 netdev_dev->current = NETDEV_F_1TB_FD;
1694 netdev_dev->current = 0;
1697 if (ecmd.port == PORT_TP) {
1698 netdev_dev->current |= NETDEV_F_COPPER;
1699 } else if (ecmd.port == PORT_FIBRE) {
1700 netdev_dev->current |= NETDEV_F_FIBER;
1704 netdev_dev->current |= NETDEV_F_AUTONEG;
1707 /* Peer advertisements. */
1708 netdev_dev->peer = 0; /* XXX */
1711 netdev_dev->cache_valid |= VALID_FEATURES;
1712 netdev_dev->get_features_error = error;
1715 /* Stores the features supported by 'netdev' into each of '*current',
1716 * '*advertised', '*supported', and '*peer' that are non-null. Each value is a
1717 * bitmap of NETDEV_* bits. Returns 0 if successful, otherwise a positive
1720 netdev_linux_get_features(const struct netdev *netdev_,
1721 enum netdev_features *current,
1722 enum netdev_features *advertised,
1723 enum netdev_features *supported,
1724 enum netdev_features *peer)
1726 struct netdev_dev_linux *netdev_dev =
1727 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1729 netdev_linux_read_features(netdev_dev);
1731 if (!netdev_dev->get_features_error) {
1732 *current = netdev_dev->current;
1733 *advertised = netdev_dev->advertised;
1734 *supported = netdev_dev->supported;
1735 *peer = netdev_dev->peer;
1737 return netdev_dev->get_features_error;
1740 /* Set the features advertised by 'netdev' to 'advertise'. */
1742 netdev_linux_set_advertisements(struct netdev *netdev,
1743 enum netdev_features advertise)
1745 struct ethtool_cmd ecmd;
1748 COVERAGE_INC(netdev_get_ethtool);
1749 memset(&ecmd, 0, sizeof ecmd);
1750 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1751 ETHTOOL_GSET, "ETHTOOL_GSET");
1756 ecmd.advertising = 0;
1757 if (advertise & NETDEV_F_10MB_HD) {
1758 ecmd.advertising |= ADVERTISED_10baseT_Half;
1760 if (advertise & NETDEV_F_10MB_FD) {
1761 ecmd.advertising |= ADVERTISED_10baseT_Full;
1763 if (advertise & NETDEV_F_100MB_HD) {
1764 ecmd.advertising |= ADVERTISED_100baseT_Half;
1766 if (advertise & NETDEV_F_100MB_FD) {
1767 ecmd.advertising |= ADVERTISED_100baseT_Full;
1769 if (advertise & NETDEV_F_1GB_HD) {
1770 ecmd.advertising |= ADVERTISED_1000baseT_Half;
1772 if (advertise & NETDEV_F_1GB_FD) {
1773 ecmd.advertising |= ADVERTISED_1000baseT_Full;
1775 if (advertise & NETDEV_F_10GB_FD) {
1776 ecmd.advertising |= ADVERTISED_10000baseT_Full;
1778 if (advertise & NETDEV_F_COPPER) {
1779 ecmd.advertising |= ADVERTISED_TP;
1781 if (advertise & NETDEV_F_FIBER) {
1782 ecmd.advertising |= ADVERTISED_FIBRE;
1784 if (advertise & NETDEV_F_AUTONEG) {
1785 ecmd.advertising |= ADVERTISED_Autoneg;
1787 if (advertise & NETDEV_F_PAUSE) {
1788 ecmd.advertising |= ADVERTISED_Pause;
1790 if (advertise & NETDEV_F_PAUSE_ASYM) {
1791 ecmd.advertising |= ADVERTISED_Asym_Pause;
1793 COVERAGE_INC(netdev_set_ethtool);
1794 return netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1795 ETHTOOL_SSET, "ETHTOOL_SSET");
1798 /* Attempts to set input rate limiting (policing) policy. Returns 0 if
1799 * successful, otherwise a positive errno value. */
1801 netdev_linux_set_policing(struct netdev *netdev,
1802 uint32_t kbits_rate, uint32_t kbits_burst)
1804 struct netdev_dev_linux *netdev_dev =
1805 netdev_dev_linux_cast(netdev_get_dev(netdev));
1806 const char *netdev_name = netdev_get_name(netdev);
1810 kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
1811 : !kbits_burst ? 1000 /* Default to 1000 kbits if 0. */
1812 : kbits_burst); /* Stick with user-specified value. */
1814 if (netdev_dev->cache_valid & VALID_POLICING) {
1815 if (netdev_dev->netdev_policing_error) {
1816 return netdev_dev->netdev_policing_error;
1819 if (netdev_dev->kbits_rate == kbits_rate &&
1820 netdev_dev->kbits_burst == kbits_burst) {
1821 /* Assume that settings haven't changed since we last set them. */
1824 netdev_dev->cache_valid &= ~VALID_POLICING;
1827 COVERAGE_INC(netdev_set_policing);
1828 /* Remove any existing ingress qdisc. */
1829 error = tc_add_del_ingress_qdisc(netdev, false);
1831 VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
1832 netdev_name, strerror(error));
1837 error = tc_add_del_ingress_qdisc(netdev, true);
1839 VLOG_WARN_RL(&rl, "%s: adding policing qdisc failed: %s",
1840 netdev_name, strerror(error));
1844 error = tc_add_policer(netdev, kbits_rate, kbits_burst);
1846 VLOG_WARN_RL(&rl, "%s: adding policing action failed: %s",
1847 netdev_name, strerror(error));
1852 netdev_dev->kbits_rate = kbits_rate;
1853 netdev_dev->kbits_burst = kbits_burst;
1856 if (!error || error == ENODEV) {
1857 netdev_dev->netdev_policing_error = error;
1858 netdev_dev->cache_valid |= VALID_POLICING;
1864 netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
1867 const struct tc_ops *const *opsp;
1869 for (opsp = tcs; *opsp != NULL; opsp++) {
1870 const struct tc_ops *ops = *opsp;
1871 if (ops->tc_install && ops->ovs_name[0] != '\0') {
1872 sset_add(types, ops->ovs_name);
1878 static const struct tc_ops *
1879 tc_lookup_ovs_name(const char *name)
1881 const struct tc_ops *const *opsp;
1883 for (opsp = tcs; *opsp != NULL; opsp++) {
1884 const struct tc_ops *ops = *opsp;
1885 if (!strcmp(name, ops->ovs_name)) {
1892 static const struct tc_ops *
1893 tc_lookup_linux_name(const char *name)
1895 const struct tc_ops *const *opsp;
1897 for (opsp = tcs; *opsp != NULL; opsp++) {
1898 const struct tc_ops *ops = *opsp;
1899 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
1906 static struct tc_queue *
1907 tc_find_queue__(const struct netdev *netdev, unsigned int queue_id,
1910 struct netdev_dev_linux *netdev_dev =
1911 netdev_dev_linux_cast(netdev_get_dev(netdev));
1912 struct tc_queue *queue;
1914 HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev_dev->tc->queues) {
1915 if (queue->queue_id == queue_id) {
/* Returns the queue numbered 'queue_id' on 'netdev', or NULL if there is
 * none, hashing 'queue_id' with hash_int() to pick the bucket. */
static struct tc_queue *
tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
{
    return tc_find_queue__(netdev, queue_id, hash_int(queue_id, 0));
}
1929 netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
1931 struct netdev_qos_capabilities *caps)
1933 const struct tc_ops *ops = tc_lookup_ovs_name(type);
1937 caps->n_queues = ops->n_queues;
1942 netdev_linux_get_qos(const struct netdev *netdev,
1943 const char **typep, struct smap *details)
1945 struct netdev_dev_linux *netdev_dev =
1946 netdev_dev_linux_cast(netdev_get_dev(netdev));
1949 error = tc_query_qdisc(netdev);
1954 *typep = netdev_dev->tc->ops->ovs_name;
1955 return (netdev_dev->tc->ops->qdisc_get
1956 ? netdev_dev->tc->ops->qdisc_get(netdev, details)
1961 netdev_linux_set_qos(struct netdev *netdev,
1962 const char *type, const struct smap *details)
1964 struct netdev_dev_linux *netdev_dev =
1965 netdev_dev_linux_cast(netdev_get_dev(netdev));
1966 const struct tc_ops *new_ops;
1969 new_ops = tc_lookup_ovs_name(type);
1970 if (!new_ops || !new_ops->tc_install) {
1974 error = tc_query_qdisc(netdev);
1979 if (new_ops == netdev_dev->tc->ops) {
1980 return new_ops->qdisc_set ? new_ops->qdisc_set(netdev, details) : 0;
1982 /* Delete existing qdisc. */
1983 error = tc_del_qdisc(netdev);
1987 ovs_assert(netdev_dev->tc == NULL);
1989 /* Install new qdisc. */
1990 error = new_ops->tc_install(netdev, details);
1991 ovs_assert((error == 0) == (netdev_dev->tc != NULL));
1998 netdev_linux_get_queue(const struct netdev *netdev,
1999 unsigned int queue_id, struct smap *details)
2001 struct netdev_dev_linux *netdev_dev =
2002 netdev_dev_linux_cast(netdev_get_dev(netdev));
2005 error = tc_query_qdisc(netdev);
2009 struct tc_queue *queue = tc_find_queue(netdev, queue_id);
2011 ? netdev_dev->tc->ops->class_get(netdev, queue, details)
2017 netdev_linux_set_queue(struct netdev *netdev,
2018 unsigned int queue_id, const struct smap *details)
2020 struct netdev_dev_linux *netdev_dev =
2021 netdev_dev_linux_cast(netdev_get_dev(netdev));
2024 error = tc_query_qdisc(netdev);
2027 } else if (queue_id >= netdev_dev->tc->ops->n_queues
2028 || !netdev_dev->tc->ops->class_set) {
2032 return netdev_dev->tc->ops->class_set(netdev, queue_id, details);
2036 netdev_linux_delete_queue(struct netdev *netdev, unsigned int queue_id)
2038 struct netdev_dev_linux *netdev_dev =
2039 netdev_dev_linux_cast(netdev_get_dev(netdev));
2042 error = tc_query_qdisc(netdev);
2045 } else if (!netdev_dev->tc->ops->class_delete) {
2048 struct tc_queue *queue = tc_find_queue(netdev, queue_id);
2050 ? netdev_dev->tc->ops->class_delete(netdev, queue)
2056 netdev_linux_get_queue_stats(const struct netdev *netdev,
2057 unsigned int queue_id,
2058 struct netdev_queue_stats *stats)
2060 struct netdev_dev_linux *netdev_dev =
2061 netdev_dev_linux_cast(netdev_get_dev(netdev));
2064 error = tc_query_qdisc(netdev);
2067 } else if (!netdev_dev->tc->ops->class_get_stats) {
2070 const struct tc_queue *queue = tc_find_queue(netdev, queue_id);
2072 ? netdev_dev->tc->ops->class_get_stats(netdev, queue, stats)
2078 start_queue_dump(const struct netdev *netdev, struct nl_dump *dump)
2080 struct ofpbuf request;
2081 struct tcmsg *tcmsg;
2083 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
2087 tcmsg->tcm_parent = 0;
2088 nl_dump_start(dump, rtnl_sock, &request);
2089 ofpbuf_uninit(&request);
2094 netdev_linux_dump_queues(const struct netdev *netdev,
2095 netdev_dump_queues_cb *cb, void *aux)
2097 struct netdev_dev_linux *netdev_dev =
2098 netdev_dev_linux_cast(netdev_get_dev(netdev));
2099 struct tc_queue *queue, *next_queue;
2100 struct smap details;
2104 error = tc_query_qdisc(netdev);
2107 } else if (!netdev_dev->tc->ops->class_get) {
2112 smap_init(&details);
2113 HMAP_FOR_EACH_SAFE (queue, next_queue, hmap_node,
2114 &netdev_dev->tc->queues) {
2115 smap_clear(&details);
2117 error = netdev_dev->tc->ops->class_get(netdev, queue, &details);
2119 (*cb)(queue->queue_id, &details, aux);
2124 smap_destroy(&details);
2130 netdev_linux_dump_queue_stats(const struct netdev *netdev,
2131 netdev_dump_queue_stats_cb *cb, void *aux)
2133 struct netdev_dev_linux *netdev_dev =
2134 netdev_dev_linux_cast(netdev_get_dev(netdev));
2135 struct nl_dump dump;
2140 error = tc_query_qdisc(netdev);
2143 } else if (!netdev_dev->tc->ops->class_dump_stats) {
2148 if (!start_queue_dump(netdev, &dump)) {
2151 while (nl_dump_next(&dump, &msg)) {
2152 error = netdev_dev->tc->ops->class_dump_stats(netdev, &msg, cb, aux);
2158 error = nl_dump_done(&dump);
2159 return error ? error : last_error;
2163 netdev_linux_get_in4(const struct netdev *netdev_,
2164 struct in_addr *address, struct in_addr *netmask)
2166 struct netdev_dev_linux *netdev_dev =
2167 netdev_dev_linux_cast(netdev_get_dev(netdev_));
2169 if (!(netdev_dev->cache_valid & VALID_IN4)) {
2172 error = netdev_linux_get_ipv4(netdev_, &netdev_dev->address,
2173 SIOCGIFADDR, "SIOCGIFADDR");
2178 error = netdev_linux_get_ipv4(netdev_, &netdev_dev->netmask,
2179 SIOCGIFNETMASK, "SIOCGIFNETMASK");
2184 netdev_dev->cache_valid |= VALID_IN4;
2186 *address = netdev_dev->address;
2187 *netmask = netdev_dev->netmask;
2188 return address->s_addr == INADDR_ANY ? EADDRNOTAVAIL : 0;
2192 netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
2193 struct in_addr netmask)
2195 struct netdev_dev_linux *netdev_dev =
2196 netdev_dev_linux_cast(netdev_get_dev(netdev_));
2199 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
2201 netdev_dev->cache_valid |= VALID_IN4;
2202 netdev_dev->address = address;
2203 netdev_dev->netmask = netmask;
2204 if (address.s_addr != INADDR_ANY) {
2205 error = do_set_addr(netdev_, SIOCSIFNETMASK,
2206 "SIOCSIFNETMASK", netmask);
/* Parses 'line', one line in the format of /proc/net/if_inet6: 32 hex digits
 * of IPv6 address, four hex fields that are skipped, and an interface name
 * of at most 16 characters.
 *
 * On success stores the address in '*in6' and the name in 'ifname' and
 * returns true.  Returns false if the line does not match; '*in6' and
 * 'ifname' may then be partially written. */
static bool
parse_if_inet6_line(const char *line,
                    struct in6_addr *in6, char ifname[16 + 1])
{
    uint8_t *s6 = in6->s6_addr;
#define X8 "%2"SCNx8
    /* 16 address bytes plus the interface name make 17 conversions. */
    return sscanf(line,
                  " "X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8
                  "%*x %*x %*x %*x %16s\n",
                  &s6[0], &s6[1], &s6[2], &s6[3],
                  &s6[4], &s6[5], &s6[6], &s6[7],
                  &s6[8], &s6[9], &s6[10], &s6[11],
                  &s6[12], &s6[13], &s6[14], &s6[15],
                  ifname) == 17;
}
2228 /* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address (if
2229 * 'in6' is non-null) and returns true. Otherwise, returns false. */
2231 netdev_linux_get_in6(const struct netdev *netdev_, struct in6_addr *in6)
2233 struct netdev_dev_linux *netdev_dev =
2234 netdev_dev_linux_cast(netdev_get_dev(netdev_));
2235 if (!(netdev_dev->cache_valid & VALID_IN6)) {
2239 netdev_dev->in6 = in6addr_any;
2241 file = fopen("/proc/net/if_inet6", "r");
2243 const char *name = netdev_get_name(netdev_);
2244 while (fgets(line, sizeof line, file)) {
2245 struct in6_addr in6_tmp;
2246 char ifname[16 + 1];
2247 if (parse_if_inet6_line(line, &in6_tmp, ifname)
2248 && !strcmp(name, ifname))
2250 netdev_dev->in6 = in6_tmp;
2256 netdev_dev->cache_valid |= VALID_IN6;
2258 *in6 = netdev_dev->in6;
/* Fills '*sa' with an AF_INET socket address for 'addr' with port 0.  '*sa'
 * is zeroed first, so no stale bytes remain. */
static void
make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
{
    struct sockaddr_in sin;
    memset(&sin, 0, sizeof sin);
    sin.sin_family = AF_INET;
    sin.sin_addr = addr;
    sin.sin_port = 0;

    memset(sa, 0, sizeof *sa);
    memcpy(sa, &sin, sizeof sin);
}
2276 do_set_addr(struct netdev *netdev,
2277 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
2280 ovs_strzcpy(ifr.ifr_name, netdev_get_name(netdev), sizeof ifr.ifr_name);
2281 make_in4_sockaddr(&ifr.ifr_addr, addr);
2283 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
2287 /* Adds 'router' as a default IP gateway. */
2289 netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
2291 struct in_addr any = { INADDR_ANY };
2295 memset(&rt, 0, sizeof rt);
2296 make_in4_sockaddr(&rt.rt_dst, any);
2297 make_in4_sockaddr(&rt.rt_gateway, router);
2298 make_in4_sockaddr(&rt.rt_genmask, any);
2299 rt.rt_flags = RTF_UP | RTF_GATEWAY;
2300 error = ioctl(af_inet_sock, SIOCADDRT, &rt) < 0 ? errno : 0;
2302 VLOG_WARN("ioctl(SIOCADDRT): %s", strerror(error));
2308 netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
2311 static const char fn[] = "/proc/net/route";
2316 *netdev_name = NULL;
2317 stream = fopen(fn, "r");
2318 if (stream == NULL) {
2319 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
2324 while (fgets(line, sizeof line, stream)) {
2327 ovs_be32 dest, gateway, mask;
2328 int refcnt, metric, mtu;
2329 unsigned int flags, use, window, irtt;
2332 "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
2334 iface, &dest, &gateway, &flags, &refcnt,
2335 &use, &metric, &mask, &mtu, &window, &irtt) != 11) {
2337 VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
2341 if (!(flags & RTF_UP)) {
2342 /* Skip routes that aren't up. */
2346 /* The output of 'dest', 'mask', and 'gateway' were given in
2347 * network byte order, so we don't need need any endian
2348 * conversions here. */
2349 if ((dest & mask) == (host->s_addr & mask)) {
2351 /* The host is directly reachable. */
2352 next_hop->s_addr = 0;
2354 /* To reach the host, we must go through a gateway. */
2355 next_hop->s_addr = gateway;
2357 *netdev_name = xstrdup(iface);
2369 netdev_linux_get_status(const struct netdev *netdev, struct smap *smap)
2371 struct netdev_dev_linux *netdev_dev;
2374 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
2375 if (!(netdev_dev->cache_valid & VALID_DRVINFO)) {
2376 struct ethtool_cmd *cmd = (struct ethtool_cmd *) &netdev_dev->drvinfo;
2378 COVERAGE_INC(netdev_get_ethtool);
2379 memset(&netdev_dev->drvinfo, 0, sizeof netdev_dev->drvinfo);
2380 error = netdev_linux_do_ethtool(netdev_dev->up.name,
2383 "ETHTOOL_GDRVINFO");
2385 netdev_dev->cache_valid |= VALID_DRVINFO;
2390 smap_add(smap, "driver_name", netdev_dev->drvinfo.driver);
2391 smap_add(smap, "driver_version", netdev_dev->drvinfo.version);
2392 smap_add(smap, "firmware_version", netdev_dev->drvinfo.fw_version);
2398 netdev_internal_get_status(const struct netdev *netdev OVS_UNUSED,
2401 smap_add(smap, "driver_name", "openvswitch");
2405 /* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
2406 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
2407 * returns 0. Otherwise, it returns a positive errno value; in particular,
2408 * ENXIO indicates that there is not ARP table entry for 'ip' on 'netdev'. */
2410 netdev_linux_arp_lookup(const struct netdev *netdev,
2411 ovs_be32 ip, uint8_t mac[ETH_ADDR_LEN])
2414 struct sockaddr_in sin;
2417 memset(&r, 0, sizeof r);
2418 memset(&sin, 0, sizeof sin);
2419 sin.sin_family = AF_INET;
2420 sin.sin_addr.s_addr = ip;
2422 memcpy(&r.arp_pa, &sin, sizeof sin);
2423 r.arp_ha.sa_family = ARPHRD_ETHER;
2425 ovs_strzcpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
2426 COVERAGE_INC(netdev_arp_lookup);
2427 retval = ioctl(af_inet_sock, SIOCGARP, &r) < 0 ? errno : 0;
2429 memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
2430 } else if (retval != ENXIO) {
2431 VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
2432 netdev_get_name(netdev), IP_ARGS(ip), strerror(retval));
2438 nd_to_iff_flags(enum netdev_flags nd)
2441 if (nd & NETDEV_UP) {
2444 if (nd & NETDEV_PROMISC) {
2451 iff_to_nd_flags(int iff)
2453 enum netdev_flags nd = 0;
2457 if (iff & IFF_PROMISC) {
2458 nd |= NETDEV_PROMISC;
2464 netdev_linux_update_flags(struct netdev_dev *dev_, enum netdev_flags off,
2465 enum netdev_flags on, enum netdev_flags *old_flagsp)
2467 struct netdev_dev_linux *netdev_dev;
2468 int old_flags, new_flags;
2471 netdev_dev = netdev_dev_linux_cast(dev_);
2472 old_flags = netdev_dev->ifi_flags;
2473 *old_flagsp = iff_to_nd_flags(old_flags);
2474 new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
2475 if (new_flags != old_flags) {
2476 error = set_flags(netdev_dev_get_name(dev_), new_flags);
2477 get_flags(&netdev_dev->up, &netdev_dev->ifi_flags);
2483 netdev_linux_change_seq(const struct netdev *netdev)
2485 return netdev_dev_linux_cast(netdev_get_dev(netdev))->change_seq;
/* Expands to an initializer for a 'struct netdev_class'.  The callbacks that
 * differ between the classes defined with it are taken from the macro
 * arguments; every other slot is shared.  The initializer is positional, so
 * the order of entries must match the member order of 'struct
 * netdev_class'. */
#define NETDEV_LINUX_CLASS(NAME, CREATE, GET_STATS, SET_STATS,  \
                           GET_FEATURES, GET_STATUS)            \
{                                                               \
    NAME,                                                       \
                                                                \
    netdev_linux_init,                                          \
    netdev_linux_run,                                           \
    netdev_linux_wait,                                          \
                                                                \
    CREATE,                                                     \
    netdev_linux_destroy,                                       \
    NULL,                       /* get_config */                \
    NULL,                       /* set_config */                \
    NULL,                       /* get_tunnel_config */         \
                                                                \
    netdev_linux_open,                                          \
    netdev_linux_close,                                         \
                                                                \
    netdev_linux_rx_open,                                       \
                                                                \
    netdev_linux_send,                                          \
    netdev_linux_send_wait,                                     \
                                                                \
    netdev_linux_set_etheraddr,                                 \
    netdev_linux_get_etheraddr,                                 \
    netdev_linux_get_mtu,                                       \
    netdev_linux_set_mtu,                                       \
    netdev_linux_get_ifindex,                                   \
    netdev_linux_get_carrier,                                   \
    netdev_linux_get_carrier_resets,                            \
    netdev_linux_set_miimon_interval,                           \
    GET_STATS,                                                  \
    SET_STATS,                                                  \
                                                                \
    GET_FEATURES,                                               \
    netdev_linux_set_advertisements,                            \
                                                                \
    netdev_linux_set_policing,                                  \
    netdev_linux_get_qos_types,                                 \
    netdev_linux_get_qos_capabilities,                          \
    netdev_linux_get_qos,                                       \
    netdev_linux_set_qos,                                       \
    netdev_linux_get_queue,                                     \
    netdev_linux_set_queue,                                     \
    netdev_linux_delete_queue,                                  \
    netdev_linux_get_queue_stats,                               \
    netdev_linux_dump_queues,                                   \
    netdev_linux_dump_queue_stats,                              \
                                                                \
    netdev_linux_get_in4,                                       \
    netdev_linux_set_in4,                                       \
    netdev_linux_get_in6,                                       \
    netdev_linux_add_router,                                    \
    netdev_linux_get_next_hop,                                  \
    GET_STATUS,                                                 \
    netdev_linux_arp_lookup,                                    \
                                                                \
    netdev_linux_update_flags,                                  \
                                                                \
    netdev_linux_change_seq                                     \
}
/* Netdev class created by netdev_linux_create().  It reports statistics,
 * features and status through the netdev_linux_* callbacks and has no
 * set_stats implementation. */
2550 const struct netdev_class netdev_linux_class =
2553 netdev_linux_create,
2554 netdev_linux_get_stats,
2555 NULL, /* set_stats */
2556 netdev_linux_get_features,
2557 netdev_linux_get_status);
/* Netdev class created by netdev_linux_create_tap().  It differs from
 * netdev_linux_class in its create function and in using
 * netdev_tap_get_stats() for statistics. */
2559 const struct netdev_class netdev_tap_class =
2562 netdev_linux_create_tap,
2563 netdev_tap_get_stats,
2564 NULL, /* set_stats */
2565 netdev_linux_get_features,
2566 netdev_linux_get_status);
/* Netdev class that shares netdev_linux_create() with netdev_linux_class but
 * uses the netdev_internal_* callbacks for getting and setting statistics and
 * for status.  It provides no get_features implementation. */
2568 const struct netdev_class netdev_internal_class =
2571 netdev_linux_create,
2572 netdev_internal_get_stats,
2573 netdev_internal_set_stats,
2574 NULL, /* get_features */
2575 netdev_internal_get_status);
/* Receive-side operations (destroy, recv, wait, drain) for this file's
 * netdev_rx implementation. */
2577 static const struct netdev_rx_class netdev_rx_linux_class = {
2578 netdev_rx_linux_destroy,
2579 netdev_rx_linux_recv,
2580 netdev_rx_linux_wait,
2581 netdev_rx_linux_drain,
2584 /* HTB traffic control class. */
/* HTB_N_QUEUES is the largest class minor number that the HTB parsing code
 * accepts as a queue: class 1:N with 1 <= N <= HTB_N_QUEUES is reported as
 * queue N - 1.  It is also the 'n_queues' value in tc_ops_htb.
 *
 * 'max_rate' below is the qdisc-wide rate limit that htb_install__() records
 * and htb_parse_class_details__() uses to clamp per-class rates. */
2586 #define HTB_N_QUEUES 0xf000
2590 unsigned int max_rate; /* In bytes/s. */
/* Parameters of one HTB class (queue).  'tc_queue' is embedded so that
 * htb_class_cast__() can recover the enclosing htb_class from a tc_queue
 * pointer with CONTAINER_OF. */
2594 struct tc_queue tc_queue;
2595 unsigned int min_rate; /* In bytes/s. */
2596 unsigned int max_rate; /* In bytes/s. */
2597 unsigned int burst; /* In bytes. */
2598 unsigned int priority; /* Lower values are higher priorities. */
/* Returns the struct htb that contains the 'tc' currently attached to
 * 'netdev''s device.  CONTAINER_OF performs no type check, so the result is
 * only meaningful when that tc was installed by htb_install__(). */
2602 htb_get__(const struct netdev *netdev)
2604 struct netdev_dev_linux *netdev_dev =
2605 netdev_dev_linux_cast(netdev_get_dev(netdev));
2606 return CONTAINER_OF(netdev_dev->tc, struct htb, tc);
/* Allocates a new struct htb, initializes its embedded tc with tc_ops_htb,
 * records 'max_rate', and makes it the current tc of 'netdev''s device.
 *
 * NOTE(review): 'max_rate' arrives as uint64_t while the max_rate fields
 * declared in this file are unsigned int, so the assignment narrows; confirm
 * that callers never pass larger values. */
2610 htb_install__(struct netdev *netdev, uint64_t max_rate)
2612 struct netdev_dev_linux *netdev_dev =
2613 netdev_dev_linux_cast(netdev_get_dev(netdev));
2616 htb = xmalloc(sizeof *htb);
2617 tc_init(&htb->tc, &tc_ops_htb);
2618 htb->max_rate = max_rate;
2620 netdev_dev->tc = &htb->tc;
2623 /* Create an HTB qdisc.
2625 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
/* Steps: delete whatever qdisc is present (tc_del_qdisc()), then send an
 * RTM_NEWQDISC request (NLM_F_EXCL | NLM_F_CREATE) for a root qdisc of kind
 * "htb" with handle 1:0.  The only nonzero tc_htb_glob field set here is
 * 'rate2quantum' (10).  The function's result is that of tc_transact(). */
2627 htb_setup_qdisc__(struct netdev *netdev)
2630 struct tc_htb_glob opt;
2631 struct ofpbuf request;
2632 struct tcmsg *tcmsg;
2634 tc_del_qdisc(netdev);
2636 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2637 NLM_F_EXCL | NLM_F_CREATE, &request);
2641 tcmsg->tcm_handle = tc_make_handle(1, 0);
2642 tcmsg->tcm_parent = TC_H_ROOT;
2644 nl_msg_put_string(&request, TCA_KIND, "htb");
2646 memset(&opt, 0, sizeof opt);
2647 opt.rate2quantum = 10;
2651 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2652 nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
2653 nl_msg_end_nested(&request, opt_offset);
2655 return tc_transact(&request, NULL);
2658 /* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
2659 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
/* 'handle' and 'parent' are tc handles as built by tc_make_handle().
 * 'class' supplies min_rate (HTB "rate"), max_rate (HTB "ceil"), burst and
 * priority.
 *
 * The device MTU is needed for the rate tables and buffer sizes, so a device
 * whose MTU cannot be obtained is rejected with a rate-limited warning.  The
 * request uses RTM_NEWTCLASS with NLM_F_CREATE and carries TCA_HTB_PARMS plus
 * the rate and ceil tables.  A failed transaction is logged with every class
 * parameter. */
2661 htb_setup_class__(struct netdev *netdev, unsigned int handle,
2662 unsigned int parent, struct htb_class *class)
2665 struct tc_htb_opt opt;
2666 struct ofpbuf request;
2667 struct tcmsg *tcmsg;
2671 error = netdev_get_mtu(netdev, &mtu);
2673 VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU",
2674 netdev_get_name(netdev));
2678 memset(&opt, 0, sizeof opt);
2679 tc_fill_rate(&opt.rate, class->min_rate, mtu);
2680 tc_fill_rate(&opt.ceil, class->max_rate, mtu);
2681 opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
2682 opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
2683 opt.prio = class->priority;
2685 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
2689 tcmsg->tcm_handle = handle;
2690 tcmsg->tcm_parent = parent;
2692 nl_msg_put_string(&request, TCA_KIND, "htb");
2693 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2694 nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
2695 tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
2696 tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
2697 nl_msg_end_nested(&request, opt_offset);
2699 error = tc_transact(&request, NULL);
2701 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
2702 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
2703 netdev_get_name(netdev),
2704 tc_get_major(handle), tc_get_minor(handle),
2705 tc_get_major(parent), tc_get_minor(parent),
2706 class->min_rate, class->max_rate,
2707 class->burst, class->priority, strerror(error));
2712 /* Parses Netlink attributes in 'nl_options' for HTB parameters and stores a
2713 * description of them into 'class'. The description complies with the
2714 * specification given in the vswitch database documentation for linux-htb
 * queues. */
/* Validates the nested attributes in 'nl_options' against tca_htb_policy,
 * which requires TCA_HTB_PARMS to be present and at least
 * sizeof(struct tc_htb_opt) bytes long, logging a rate-limited warning on
 * failure.  On success fills '*class' from the kernel's tc_htb_opt: rate ->
 * min_rate, ceil -> max_rate, prio -> priority, and 'buffer' (in ticks)
 * converted back to bytes at the class rate -> burst. */
2717 htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
2719 static const struct nl_policy tca_htb_policy[] = {
2720 [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
2721 .min_len = sizeof(struct tc_htb_opt) },
2724 struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
2725 const struct tc_htb_opt *htb;
2727 if (!nl_parse_nested(nl_options, tca_htb_policy,
2728 attrs, ARRAY_SIZE(tca_htb_policy))) {
2729 VLOG_WARN_RL(&rl, "failed to parse HTB class options");
2733 htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
2734 class->min_rate = htb->rate.rate;
2735 class->max_rate = htb->ceil.rate;
2736 class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
2737 class->priority = htb->prio;
/* Parses the tc class message 'tcmsg' with tc_parse_class(), which also
 * receives 'stats'.
 *
 * If 'queue_id' is nonnull, a class handle 1:N with 1 <= N <= HTB_N_QUEUES
 * is translated to queue ID N - 1.  If 'options' is nonnull, the class's HTB
 * options are parsed into it by htb_parse_tca_options__().  Either pointer
 * may be null to skip that part (htb_query_class__() passes a null
 * 'queue_id'). */
2742 htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2743 struct htb_class *options,
2744 struct netdev_queue_stats *stats)
2746 struct nlattr *nl_options;
2747 unsigned int handle;
2750 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2751 if (!error && queue_id) {
2752 unsigned int major = tc_get_major(handle);
2753 unsigned int minor = tc_get_minor(handle);
2754 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2755 *queue_id = minor - 1;
2760 if (!error && options) {
2761 error = htb_parse_tca_options__(nl_options, options);
2767 htb_parse_qdisc_details__(struct netdev *netdev,
2768 const struct smap *details, struct htb_class *hc)
2770 const char *max_rate_s;
2772 max_rate_s = smap_get(details, "max-rate");
2773 hc->max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
2774 if (!hc->max_rate) {
2775 enum netdev_features current;
2777 netdev_get_features(netdev, ¤t, NULL, NULL, NULL);
2778 hc->max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
2780 hc->min_rate = hc->max_rate;
/* Fills '*hc' from the "min-rate", "max-rate", "burst" and "priority" keys of
 * 'details'.  The first three values are divided by 8 on the way in, since
 * the htb_class fields are kept in bytes.
 *
 * Clamping applied below:
 *   - min_rate is at least the device MTU and at most the qdisc's max_rate;
 *   - max_rate is at least min_rate and at most the qdisc's max_rate;
 *   - burst is at least MTU + 64.
 * A missing "priority" yields 0.  A device whose MTU cannot be obtained is
 * reported with a rate-limited warning.
 *
 * NOTE(review): the strtoull() results are narrowed to unsigned int, so very
 * large configured values wrap before clamping. */
2786 htb_parse_class_details__(struct netdev *netdev,
2787 const struct smap *details, struct htb_class *hc)
2789 const struct htb *htb = htb_get__(netdev);
2790 const char *min_rate_s = smap_get(details, "min-rate");
2791 const char *max_rate_s = smap_get(details, "max-rate");
2792 const char *burst_s = smap_get(details, "burst");
2793 const char *priority_s = smap_get(details, "priority");
2796 error = netdev_get_mtu(netdev, &mtu);
2798 VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU",
2799 netdev_get_name(netdev));
2803 /* HTB requires at least an mtu sized min-rate to send any traffic even
2804 * on uncongested links. */
2805 hc->min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
2806 hc->min_rate = MAX(hc->min_rate, mtu);
2807 hc->min_rate = MIN(hc->min_rate, htb->max_rate);
2810 hc->max_rate = (max_rate_s
2811 ? strtoull(max_rate_s, NULL, 10) / 8
2813 hc->max_rate = MAX(hc->max_rate, hc->min_rate);
2814 hc->max_rate = MIN(hc->max_rate, htb->max_rate);
2818 * According to hints in the documentation that I've read, it is important
2819 * that 'burst' be at least as big as the largest frame that might be
2820 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
2821 * but having it a bit too small is a problem. Since netdev_get_mtu()
2822 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
2823 * the MTU. We actually add 64, instead of 14, as a guard against
2824 * additional headers get tacked on somewhere that we're not aware of. */
2825 hc->burst = burst_s ? strtoull(burst_s, NULL, 10) / 8 : 0;
2826 hc->burst = MAX(hc->burst, mtu + 64);
2829 hc->priority = priority_s ? strtoul(priority_s, NULL, 10) : 0;
/* Asks the kernel about class 'handle' under 'parent' with tc_query_class()
 * and parses the reply through htb_parse_tcmsg__() into 'options' and
 * 'stats'; no queue ID is requested.  The reply buffer is released with
 * ofpbuf_delete() after parsing. */
2835 htb_query_class__(const struct netdev *netdev, unsigned int handle,
2836 unsigned int parent, struct htb_class *options,
2837 struct netdev_queue_stats *stats)
2839 struct ofpbuf *reply;
2842 error = tc_query_class(netdev, handle, parent, &reply);
2844 error = htb_parse_tcmsg__(reply, NULL, options, stats);
2845 ofpbuf_delete(reply);
/* 'tc_install' implementation for HTB: creates the HTB qdisc, configures the
 * root class 1:fffe (parent 1:0) with the rates derived from 'details', and
 * then records the in-memory state with htb_install__(). */
2851 htb_tc_install(struct netdev *netdev, const struct smap *details)
2855 error = htb_setup_qdisc__(netdev);
2857 struct htb_class hc;
2859 htb_parse_qdisc_details__(netdev, details, &hc);
2860 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2861 tc_make_handle(1, 0), &hc);
2863 htb_install__(netdev, hc.max_rate);
/* Returns the htb_class that embeds 'queue' as its 'tc_queue' member.  No
 * type check is performed. */
2869 static struct htb_class *
2870 htb_class_cast__(const struct tc_queue *queue)
2872 return CONTAINER_OF(queue, struct htb_class, tc_queue);
/* Records the parameters in 'hc' for queue 'queue_id' in the in-memory queue
 * map of 'netdev''s HTB state.  An entry found by tc_find_queue__() is
 * reused; otherwise a new htb_class is allocated and inserted under the hash
 * of 'queue_id'.  Only the four parameter fields are copied from 'hc'. */
2876 htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
2877 const struct htb_class *hc)
2879 struct htb *htb = htb_get__(netdev);
2880 size_t hash = hash_int(queue_id, 0);
2881 struct tc_queue *queue;
2882 struct htb_class *hcp;
2884 queue = tc_find_queue__(netdev, queue_id, hash);
2886 hcp = htb_class_cast__(queue);
2888 hcp = xmalloc(sizeof *hcp);
2889 queue = &hcp->tc_queue;
2890 queue->queue_id = queue_id;
2891 hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
2894 hcp->min_rate = hc->min_rate;
2895 hcp->max_rate = hc->max_rate;
2896 hcp->burst = hc->burst;
2897 hcp->priority = hc->priority;
/* 'tc_load' implementation for HTB: rebuilds the in-memory state from what
 * the kernel already has configured.  The root class 1:fffe is queried for
 * the qdisc-wide max rate, the HTB state is installed, and every class in
 * the queue dump that parses successfully is added with
 * htb_update_queue__(); classes that fail to parse are ignored.  'nlmsg' is
 * unused.
 *
 * NOTE(review): the result of htb_query_class__() is not checked here. */
2901 htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
2904 struct nl_dump dump;
2905 struct htb_class hc;
2907 /* Get qdisc options. */
2909 htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
2910 htb_install__(netdev, hc.max_rate);
2913 if (!start_queue_dump(netdev, &dump)) {
2916 while (nl_dump_next(&dump, &msg)) {
2917 unsigned int queue_id;
2919 if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
2920 htb_update_queue__(netdev, queue_id, &hc);
2923 nl_dump_done(&dump);
/* 'tc_destroy' implementation for HTB: empties the queue map of the htb that
 * embeds 'tc'.  The _SAFE iterator is used because entries are removed from
 * the map while it is being walked. */
2929 htb_tc_destroy(struct tc *tc)
2931 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
2932 struct htb_class *hc, *next;
2934 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &htb->tc.queues) {
2935 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
/* 'qdisc_get' implementation for HTB: adds "max-rate" to 'details'.  The
 * stored rate is in bytes/s, so it is multiplied by 8 (in 64-bit arithmetic)
 * to report bits/s. */
2943 htb_qdisc_get(const struct netdev *netdev, struct smap *details)
2945 const struct htb *htb = htb_get__(netdev);
2946 smap_add_format(details, "max-rate", "%llu", 8ULL * htb->max_rate);
/* 'qdisc_set' implementation for HTB: reconfigures the root class 1:fffe
 * from 'details' and stores the new max rate in the in-memory HTB state. */
2951 htb_qdisc_set(struct netdev *netdev, const struct smap *details)
2953 struct htb_class hc;
2956 htb_parse_qdisc_details__(netdev, details, &hc);
2957 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2958 tc_make_handle(1, 0), &hc);
2960 htb_get__(netdev)->max_rate = hc.max_rate;
/* 'class_get' implementation for HTB: describes 'queue' in 'details'.
 * Rates and burst are stored in bytes and reported multiplied by 8.
 * "max-rate" is emitted only when it differs from "min-rate".  'netdev' is
 * unused. */
2966 htb_class_get(const struct netdev *netdev OVS_UNUSED,
2967 const struct tc_queue *queue, struct smap *details)
2969 const struct htb_class *hc = htb_class_cast__(queue);
2971 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
2972 if (hc->min_rate != hc->max_rate) {
2973 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
2975 smap_add_format(details, "burst", "%llu", 8ULL * hc->burst);
2977 smap_add_format(details, "priority", "%u", hc->priority);
/* 'class_set' implementation for HTB: parses 'details', configures kernel
 * class 1:(queue_id + 1) under the root class 1:fffe, and then mirrors the
 * parameters into the in-memory queue map. */
2983 htb_class_set(struct netdev *netdev, unsigned int queue_id,
2984 const struct smap *details)
2986 struct htb_class hc;
2989 error = htb_parse_class_details__(netdev, details, &hc);
2994 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
2995 tc_make_handle(1, 0xfffe), &hc);
3000 htb_update_queue__(netdev, queue_id, &hc);
/* 'class_delete' implementation for HTB: deletes kernel class
 * 1:(queue_id + 1) and removes 'queue' from the in-memory queue map. */
3005 htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
3007 struct htb_class *hc = htb_class_cast__(queue);
3008 struct htb *htb = htb_get__(netdev);
3011 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3013 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
/* 'class_get_stats' implementation for HTB: queries kernel class
 * 1:(queue_id + 1) under 1:fffe for statistics only (no class options are
 * requested). */
3020 htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3021 struct netdev_queue_stats *stats)
3023 return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3024 tc_make_handle(1, 0xfffe), NULL, stats);
/* 'class_dump_stats' implementation for HTB: extracts the handle and
 * statistics from the class message 'nlmsg' and, when the handle is 1:N
 * with 1 <= N <= HTB_N_QUEUES, invokes 'cb' with queue ID N - 1, the
 * statistics and 'aux'.  Other handles do not trigger the callback. */
3028 htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3029 const struct ofpbuf *nlmsg,
3030 netdev_dump_queue_stats_cb *cb, void *aux)
3032 struct netdev_queue_stats stats;
3033 unsigned int handle, major, minor;
3036 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3041 major = tc_get_major(handle);
3042 minor = tc_get_minor(handle);
3043 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
3044 (*cb)(minor - 1, &stats, aux);
/* Operations table for the HTB traffic-control class: kernel qdisc name
 * "htb", OVS QoS type "linux-htb", up to HTB_N_QUEUES queues. */
3049 static const struct tc_ops tc_ops_htb = {
3050 "htb", /* linux_name */
3051 "linux-htb", /* ovs_name */
3052 HTB_N_QUEUES, /* n_queues */
3061 htb_class_get_stats,
3062 htb_class_dump_stats
3065 /* "linux-hfsc" traffic control class. */
/* HFSC_N_QUEUES is the largest class minor number that the HFSC parsing code
 * accepts as a queue: class 1:N with 1 <= N <= HFSC_N_QUEUES is reported as
 * queue N - 1.  It is also the 'n_queues' value in tc_ops_hfsc.
 *
 * The 'tc_queue' member below is what hfsc_class_cast__() uses to recover an
 * hfsc_class from a tc_queue pointer. */
3067 #define HFSC_N_QUEUES 0xf000
3075 struct tc_queue tc_queue;
/* Returns the struct hfsc that contains the 'tc' currently attached to
 * 'netdev''s device.  CONTAINER_OF performs no type check, so the result is
 * only meaningful when that tc was installed by hfsc_install__(). */
3080 static struct hfsc *
3081 hfsc_get__(const struct netdev *netdev)
3083 struct netdev_dev_linux *netdev_dev;
3084 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
3085 return CONTAINER_OF(netdev_dev->tc, struct hfsc, tc);
/* Returns the hfsc_class that embeds 'queue' as its 'tc_queue' member.  No
 * type check is performed. */
3088 static struct hfsc_class *
3089 hfsc_class_cast__(const struct tc_queue *queue)
3091 return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
/* Allocates a new struct hfsc, initializes its embedded tc with tc_ops_hfsc,
 * records 'max_rate', and makes it the current tc of 'netdev''s device. */
3095 hfsc_install__(struct netdev *netdev, uint32_t max_rate)
3097 struct netdev_dev_linux * netdev_dev;
3100 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
3101 hfsc = xmalloc(sizeof *hfsc);
3102 tc_init(&hfsc->tc, &tc_ops_hfsc);
3103 hfsc->max_rate = max_rate;
3104 netdev_dev->tc = &hfsc->tc;
/* Records the rates in 'hc' for queue 'queue_id' in the in-memory queue map
 * of 'netdev''s HFSC state.  An entry found by tc_find_queue__() is reused;
 * otherwise a new hfsc_class is allocated and inserted under the hash of
 * 'queue_id'. */
3108 hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
3109 const struct hfsc_class *hc)
3113 struct hfsc_class *hcp;
3114 struct tc_queue *queue;
3116 hfsc = hfsc_get__(netdev);
3117 hash = hash_int(queue_id, 0);
3119 queue = tc_find_queue__(netdev, queue_id, hash);
3121 hcp = hfsc_class_cast__(queue);
3123 hcp = xmalloc(sizeof *hcp);
3124 queue = &hcp->tc_queue;
3125 queue->queue_id = queue_id;
3126 hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
3129 hcp->min_rate = hc->min_rate;
3130 hcp->max_rate = hc->max_rate;
/* Parses the nested HFSC attributes in 'nl_options' into '*class'.
 *
 * Three service curves (TCA_HFSC_RSC, TCA_HFSC_FSC, TCA_HFSC_USC) are read,
 * each at least sizeof(struct tc_service_curve) bytes.  Only the
 * configuration shape that this file itself produces is accepted; each
 * rejection logs a rate-limited warning:
 *   - every curve must be linear (m1 == 0 and d == 0);
 *   - the RSC and FSC curves must have the same m2;
 *   - the RSC m2 must not exceed the USC m2.
 * On success min_rate is the FSC m2 and max_rate is the USC m2. */
3134 hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
3136 const struct tc_service_curve *rsc, *fsc, *usc;
3137 static const struct nl_policy tca_hfsc_policy[] = {
3139 .type = NL_A_UNSPEC,
3141 .min_len = sizeof(struct tc_service_curve),
3144 .type = NL_A_UNSPEC,
3146 .min_len = sizeof(struct tc_service_curve),
3149 .type = NL_A_UNSPEC,
3151 .min_len = sizeof(struct tc_service_curve),
3154 struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];
3156 if (!nl_parse_nested(nl_options, tca_hfsc_policy,
3157 attrs, ARRAY_SIZE(tca_hfsc_policy))) {
3158 VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
3162 rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
3163 fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
3164 usc = nl_attr_get(attrs[TCA_HFSC_USC]);
3166 if (rsc->m1 != 0 || rsc->d != 0 ||
3167 fsc->m1 != 0 || fsc->d != 0 ||
3168 usc->m1 != 0 || usc->d != 0) {
3169 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3170 "Non-linear service curves are not supported.");
3174 if (rsc->m2 != fsc->m2) {
3175 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3176 "Real-time service curves are not supported ");
3180 if (rsc->m2 > usc->m2) {
3181 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3182 "Min-rate service curve is greater than "
3183 "the max-rate service curve.");
3187 class->min_rate = fsc->m2;
3188 class->max_rate = usc->m2;
/* Parses the tc class message 'tcmsg' with tc_parse_class(), which also
 * receives 'stats'.  A class handle 1:N with 1 <= N <= HFSC_N_QUEUES is
 * translated to queue ID N - 1 in '*queue_id', and the class's HFSC options
 * are parsed into 'options' by hfsc_parse_tca_options__().
 *
 * NOTE(review): hfsc_query_class__() passes a null 'queue_id' and the stats
 * callers pass null 'options'; the guards for those cases are presumably on
 * lines not shown here -- confirm. */
3193 hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
3194 struct hfsc_class *options,
3195 struct netdev_queue_stats *stats)
3198 unsigned int handle;
3199 struct nlattr *nl_options;
3201 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
3207 unsigned int major, minor;
3209 major = tc_get_major(handle);
3210 minor = tc_get_minor(handle);
3211 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3212 *queue_id = minor - 1;
3219 error = hfsc_parse_tca_options__(nl_options, options);
/* Asks the kernel about class 'handle' under 'parent' with tc_query_class()
 * and parses the reply through hfsc_parse_tcmsg__() into 'options' and
 * 'stats'; no queue ID is requested.  The reply buffer is released with
 * ofpbuf_delete() after parsing. */
3226 hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
3227 unsigned int parent, struct hfsc_class *options,
3228 struct netdev_queue_stats *stats)
3231 struct ofpbuf *reply;
3233 error = tc_query_class(netdev, handle, parent, &reply);
3238 error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
3239 ofpbuf_delete(reply);
3244 hfsc_parse_qdisc_details__(struct netdev *netdev, const struct smap *details,
3245 struct hfsc_class *class)
3248 const char *max_rate_s;
3250 max_rate_s = smap_get(details, "max-rate");
3251 max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
3254 enum netdev_features current;
3256 netdev_get_features(netdev, ¤t, NULL, NULL, NULL);
3257 max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
3260 class->min_rate = max_rate;
3261 class->max_rate = max_rate;
/* Fills '*class' from the "min-rate" and "max-rate" keys of 'details'.  Both
 * values are divided by 8 on the way in (bits to bytes).
 *
 * Clamping applied below:
 *   - min_rate is at least 1 and at most the qdisc's max_rate;
 *   - max_rate is at least min_rate and at most the qdisc's max_rate.
 *
 * NOTE(review): the strtoull() results are narrowed to uint32_t, so very
 * large configured values wrap before clamping. */
3265 hfsc_parse_class_details__(struct netdev *netdev,
3266 const struct smap *details,
3267 struct hfsc_class * class)
3269 const struct hfsc *hfsc;
3270 uint32_t min_rate, max_rate;
3271 const char *min_rate_s, *max_rate_s;
3273 hfsc = hfsc_get__(netdev);
3274 min_rate_s = smap_get(details, "min-rate");
3275 max_rate_s = smap_get(details, "max-rate");
3277 min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
3278 min_rate = MAX(min_rate, 1);
3279 min_rate = MIN(min_rate, hfsc->max_rate);
3281 max_rate = (max_rate_s
3282 ? strtoull(max_rate_s, NULL, 10) / 8
3284 max_rate = MAX(max_rate, min_rate);
3285 max_rate = MIN(max_rate, hfsc->max_rate);
3287 class->min_rate = min_rate;
3288 class->max_rate = max_rate;
3293 /* Create an HFSC qdisc.
3295 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
/* Steps: delete whatever qdisc is present (tc_del_qdisc()), then send an
 * RTM_NEWQDISC request (NLM_F_EXCL | NLM_F_CREATE) for a root qdisc of kind
 * "hfsc" with handle 1:0.  Unlike the HTB variant, the tc_hfsc_qopt options
 * are placed directly in TCA_OPTIONS rather than in a nested attribute.  The
 * function's result is that of tc_transact(). */
3297 hfsc_setup_qdisc__(struct netdev * netdev)
3299 struct tcmsg *tcmsg;
3300 struct ofpbuf request;
3301 struct tc_hfsc_qopt opt;
3303 tc_del_qdisc(netdev);
3305 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3306 NLM_F_EXCL | NLM_F_CREATE, &request);
3312 tcmsg->tcm_handle = tc_make_handle(1, 0);
3313 tcmsg->tcm_parent = TC_H_ROOT;
3315 memset(&opt, 0, sizeof opt);
3318 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3319 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
3321 return tc_transact(&request, NULL);
3324 /* Create an HFSC class.
3326 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
3327 * sc rate <min_rate> ul rate <max_rate>" */
/* 'handle' and 'parent' are tc handles as built by tc_make_handle().
 *
 * Two service curves are built from 'class': 'min' (m2 = min_rate) is sent
 * as both TCA_HFSC_RSC and TCA_HFSC_FSC, and 'max' (m2 = max_rate) is sent as
 * TCA_HFSC_USC.  This is the shape that hfsc_parse_tca_options__() accepts
 * when reading classes back.  A failed transaction is logged with a
 * rate-limited warning. */
3329 hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
3330 unsigned int parent, struct hfsc_class *class)
3334 struct tcmsg *tcmsg;
3335 struct ofpbuf request;
3336 struct tc_service_curve min, max;
3338 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
3344 tcmsg->tcm_handle = handle;
3345 tcmsg->tcm_parent = parent;
3349 min.m2 = class->min_rate;
3353 max.m2 = class->max_rate;
3355 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3356 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3357 nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
3358 nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
3359 nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
3360 nl_msg_end_nested(&request, opt_offset);
3362 error = tc_transact(&request, NULL);
3364 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
3365 "min-rate %ubps, max-rate %ubps (%s)",
3366 netdev_get_name(netdev),
3367 tc_get_major(handle), tc_get_minor(handle),
3368 tc_get_major(parent), tc_get_minor(parent),
3369 class->min_rate, class->max_rate, strerror(error));
/* 'tc_install' implementation for HFSC: creates the HFSC qdisc, configures
 * the root class 1:fffe (parent 1:0) with the rates derived from 'details',
 * and then records the in-memory state with hfsc_install__(). */
3376 hfsc_tc_install(struct netdev *netdev, const struct smap *details)
3379 struct hfsc_class class;
3381 error = hfsc_setup_qdisc__(netdev);
3387 hfsc_parse_qdisc_details__(netdev, details, &class);
3388 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3389 tc_make_handle(1, 0), &class);
3395 hfsc_install__(netdev, class.max_rate);
/* 'tc_load' implementation for HFSC: rebuilds the in-memory state from what
 * the kernel already has configured.  The root class 1:fffe is queried for
 * the qdisc-wide max rate, the HFSC state is installed, and every class in
 * the queue dump that parses successfully is added with
 * hfsc_update_queue__(); classes that fail to parse are ignored.  'nlmsg' is
 * unused.
 *
 * NOTE(review): the result of hfsc_query_class__() is not checked here. */
3400 hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3403 struct nl_dump dump;
3404 struct hfsc_class hc;
3407 hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
3408 hfsc_install__(netdev, hc.max_rate);
3410 if (!start_queue_dump(netdev, &dump)) {
3414 while (nl_dump_next(&dump, &msg)) {
3415 unsigned int queue_id;
3417 if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3418 hfsc_update_queue__(netdev, queue_id, &hc);
3422 nl_dump_done(&dump);
/* 'tc_destroy' implementation for HFSC: empties the queue map of the hfsc
 * that embeds 'tc'.  The _SAFE iterator is used because entries are removed
 * from the map while it is being walked. */
3427 hfsc_tc_destroy(struct tc *tc)
3430 struct hfsc_class *hc, *next;
3432 hfsc = CONTAINER_OF(tc, struct hfsc, tc);
3434 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
3435 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
/* 'qdisc_get' implementation for HFSC: adds "max-rate" to 'details', the
 * stored rate multiplied by 8 in 64-bit arithmetic. */
3444 hfsc_qdisc_get(const struct netdev *netdev, struct smap *details)
3446 const struct hfsc *hfsc;
3447 hfsc = hfsc_get__(netdev);
3448 smap_add_format(details, "max-rate", "%llu", 8ULL * hfsc->max_rate);
/* 'qdisc_set' implementation for HFSC: reconfigures the root class 1:fffe
 * from 'details' and stores the new max rate in the in-memory HFSC state. */
3453 hfsc_qdisc_set(struct netdev *netdev, const struct smap *details)
3456 struct hfsc_class class;
3458 hfsc_parse_qdisc_details__(netdev, details, &class);
3459 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3460 tc_make_handle(1, 0), &class);
3463 hfsc_get__(netdev)->max_rate = class.max_rate;
/* 'class_get' implementation for HFSC: describes 'queue' in 'details'.
 * Rates are reported multiplied by 8, and "max-rate" is emitted only when it
 * differs from "min-rate".  'netdev' is unused. */
3470 hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
3471 const struct tc_queue *queue, struct smap *details)
3473 const struct hfsc_class *hc;
3475 hc = hfsc_class_cast__(queue);
3476 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
3477 if (hc->min_rate != hc->max_rate) {
3478 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
/* 'class_set' implementation for HFSC: parses 'details', configures kernel
 * class 1:(queue_id + 1) under the root class 1:fffe, and then mirrors the
 * parameters into the in-memory queue map. */
3484 hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
3485 const struct smap *details)
3488 struct hfsc_class class;
3490 error = hfsc_parse_class_details__(netdev, details, &class);
3495 error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3496 tc_make_handle(1, 0xfffe), &class);
3501 hfsc_update_queue__(netdev, queue_id, &class);
/* 'class_delete' implementation for HFSC: deletes kernel class
 * 1:(queue_id + 1) and removes 'queue' from the in-memory queue map. */
3506 hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
3510 struct hfsc_class *hc;
3512 hc = hfsc_class_cast__(queue);
3513 hfsc = hfsc_get__(netdev);
3515 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3517 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
/* 'class_get_stats' implementation for HFSC: queries kernel class
 * 1:(queue_id + 1) under 1:fffe for statistics only (no class options are
 * requested). */
3524 hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3525 struct netdev_queue_stats *stats)
3527 return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3528 tc_make_handle(1, 0xfffe), NULL, stats);
/* 'class_dump_stats' implementation for HFSC: extracts the handle and
 * statistics from the class message 'nlmsg' and, when the handle is 1:N
 * with 1 <= N <= HFSC_N_QUEUES, invokes 'cb' with queue ID N - 1, the
 * statistics and 'aux'.  Other handles do not trigger the callback. */
3532 hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3533 const struct ofpbuf *nlmsg,
3534 netdev_dump_queue_stats_cb *cb, void *aux)
3536 struct netdev_queue_stats stats;
3537 unsigned int handle, major, minor;
3540 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3545 major = tc_get_major(handle);
3546 minor = tc_get_minor(handle);
3547 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3548 (*cb)(minor - 1, &stats, aux);
/* Operations table for the HFSC traffic-control class: kernel qdisc name
 * "hfsc", OVS QoS type "linux-hfsc", up to HFSC_N_QUEUES queues. */
3553 static const struct tc_ops tc_ops_hfsc = {
3554 "hfsc", /* linux_name */
3555 "linux-hfsc", /* ovs_name */
3556 HFSC_N_QUEUES, /* n_queues */
3557 hfsc_tc_install, /* tc_install */
3558 hfsc_tc_load, /* tc_load */
3559 hfsc_tc_destroy, /* tc_destroy */
3560 hfsc_qdisc_get, /* qdisc_get */
3561 hfsc_qdisc_set, /* qdisc_set */
3562 hfsc_class_get, /* class_get */
3563 hfsc_class_set, /* class_set */
3564 hfsc_class_delete, /* class_delete */
3565 hfsc_class_get_stats, /* class_get_stats */
3566 hfsc_class_dump_stats /* class_dump_stats */
3569 /* "linux-default" traffic control class.
3571 * This class represents the default, unnamed Linux qdisc. It corresponds to
3572 * the "" (empty string) QoS type in the OVS database. */
/* Points 'netdev''s device at a single function-local static 'struct tc'
 * bound to tc_ops_default.  No memory is allocated: every device using the
 * default class shares that one object. */
3575 default_install__(struct netdev *netdev)
3577 struct netdev_dev_linux *netdev_dev =
3578 netdev_dev_linux_cast(netdev_get_dev(netdev));
3579 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_default);
3581 /* Nothing but a tc class implementation is allowed to write to a tc. This
3582 * class never does that, so we can legitimately use a const tc object. */
3583 netdev_dev->tc = CONST_CAST(struct tc *, &tc);
/* 'tc_install' implementation for the default class: ignores 'details' and
 * delegates to default_install__(). */
3587 default_tc_install(struct netdev *netdev,
3588 const struct smap *details OVS_UNUSED)
3590 default_install__(netdev);
/* 'tc_load' implementation for the default class: ignores 'nlmsg' and
 * delegates to default_install__(). */
3595 default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3597 default_install__(netdev);
/* Operations table for the default traffic-control class.  It has no Linux
 * qdisc name, and the destroy, qdisc and per-class operations shown below
 * are all NULL. */
3601 static const struct tc_ops tc_ops_default = {
3602 NULL, /* linux_name */
3607 NULL, /* tc_destroy */
3608 NULL, /* qdisc_get */
3609 NULL, /* qdisc_set */
3610 NULL, /* class_get */
3611 NULL, /* class_set */
3612 NULL, /* class_delete */
3613 NULL, /* class_get_stats */
3614 NULL /* class_dump_stats */
3617 /* "linux-other" traffic control class. */
/* 'tc_load' implementation for the "linux-other" class: points 'netdev''s
 * device at a single function-local static 'struct tc' bound to
 * tc_ops_other.  'nlmsg' is unused. */
3622 other_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3624 struct netdev_dev_linux *netdev_dev =
3625 netdev_dev_linux_cast(netdev_get_dev(netdev));
3626 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_other);
3628 /* Nothing but a tc class implementation is allowed to write to a tc. This
3629 * class never does that, so we can legitimately use a const tc object. */
3630 netdev_dev->tc = CONST_CAST(struct tc *, &tc);
/* Operations table for the "linux-other" class.  It cannot be installed
 * (tc_install is NULL), and the destroy, qdisc and per-class operations are
 * all NULL. */
3634 static const struct tc_ops tc_ops_other = {
3635 NULL, /* linux_name */
3636 "linux-other", /* ovs_name */
3638 NULL, /* tc_install */
3640 NULL, /* tc_destroy */
3641 NULL, /* qdisc_get */
3642 NULL, /* qdisc_set */
3643 NULL, /* class_get */
3644 NULL, /* class_set */
3645 NULL, /* class_delete */
3646 NULL, /* class_get_stats */
3647 NULL /* class_dump_stats */
3650 /* Traffic control. */
3652 /* Number of kernel "tc" ticks per second. */
/* Kept as a double because read_psched() derives it from a floating-point
 * expression (a * c / b over the values read from /proc/net/psched). */
3653 static double ticks_per_s;
3655 /* Number of kernel "jiffies" per second. This is used for the purpose of
3656 * computing buffer sizes. Generally kernel qdiscs need to be able to buffer
3657 * one jiffy's worth of data.
3659 * There are two possibilities here:
3661 * - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
3662 * approximate range of 100 to 1024. That means that we really need to
3663 * make sure that the qdisc can buffer that much data.
3665 * - 'buffer_hz' is an absurdly large number. That means that the kernel
3666 * has finely granular timers and there's no need to fudge additional room
3667 * for buffers. (There's no extra effort needed to implement that: the
3668 * large 'buffer_hz' is used as a divisor, so practically any number will
3669 * come out as 0 in the division. Small integer results in the case of
3670 * really high dividends won't have any real effect anyhow.) */
/* Divisor used by tc_buffer_per_jiffy(); its value is logged by
 * read_psched() alongside 'ticks_per_s'. */
3672 static unsigned int buffer_hz;
3674 /* Returns tc handle 'major':'minor'. */
/* 'major' is shifted into the upper 16 bits and combined with 'minor' via
 * TC_H_MAKE; tc_get_major() and tc_get_minor() undo this. */
3676 tc_make_handle(unsigned int major, unsigned int minor)
3678 return TC_H_MAKE(major << 16, minor);
3681 /* Returns the major number from 'handle'. */
/* Isolates the major part with TC_H_MAJ and shifts it down 16 bits, the
 * inverse of the shift applied by tc_make_handle(). */
3683 tc_get_major(unsigned int handle)
3685 return TC_H_MAJ(handle) >> 16;
3688 /* Returns the minor number from 'handle'. */
/* Isolates the minor part with TC_H_MIN; no shift is needed. */
3690 tc_get_minor(unsigned int handle)
3692 return TC_H_MIN(handle);
/* Begins a traffic-control Netlink request in 'request' for 'netdev'.
 *
 * The device's ifindex is looked up first.  'request' is then initialized
 * with a Netlink header of the given 'type' and NLM_F_REQUEST | 'flags',
 * followed by a zeroed struct tcmsg whose family is AF_UNSPEC and whose
 * ifindex is the device's.  The caller completes the returned tcmsg
 * (tcm_handle, tcm_parent) and appends attributes to 'request'.
 *
 * NOTE(review): the failure path after get_ifindex() is not shown here;
 * presumably no tcmsg is returned in that case -- confirm. */
3695 static struct tcmsg *
3696 tc_make_request(const struct netdev *netdev, int type, unsigned int flags,
3697 struct ofpbuf *request)
3699 struct tcmsg *tcmsg;
3703 error = get_ifindex(netdev, &ifindex);
3708 ofpbuf_init(request, 512);
3709 nl_msg_put_nlmsghdr(request, sizeof *tcmsg, type, NLM_F_REQUEST | flags);
3710 tcmsg = ofpbuf_put_zeros(request, sizeof *tcmsg);
3711 tcmsg->tcm_family = AF_UNSPEC;
3712 tcmsg->tcm_ifindex = ifindex;
3713 /* Caller should fill in tcmsg->tcm_handle. */
3714 /* Caller should fill in tcmsg->tcm_parent. */
/* Sends 'request' on 'rtnl_sock' with nl_sock_transact(), passing 'replyp'
 * through (callers in this file pass NULL when they want no reply).
 * 'request' is uninitialized right after the transaction, so callers must
 * not reuse or free it afterwards. */
3720 tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
3722 int error = nl_sock_transact(rtnl_sock, request, replyp);
3723 ofpbuf_uninit(request);
3727 /* Adds or deletes a root ingress qdisc on 'netdev'. We use this for
3728 * policing configuration.
3730 * This function is equivalent to running the following when 'add' is true:
3731 * /sbin/tc qdisc add dev <devname> handle ffff: ingress
3733 * This function is equivalent to running the following when 'add' is false:
3734 * /sbin/tc qdisc del dev <devname> handle ffff: ingress
3736 * The configuration and stats may be seen with the following command:
3737 * /sbin/tc -s qdisc show dev <devname>
3739 * Returns 0 if successful, otherwise a positive errno value. */
/* Builds an RTM_NEWQDISC request (with NLM_F_EXCL | NLM_F_CREATE) when 'add'
 * is true, or an RTM_DELQDISC request with no extra flags otherwise, for
 * qdisc kind "ingress" with handle ffff:0 and parent TC_H_INGRESS; an empty
 * TCA_OPTIONS attribute is attached.  On deletion, ENOENT and EINVAL results
 * are given special treatment, per the comment in the body. */
3742 tc_add_del_ingress_qdisc(struct netdev *netdev, bool add)
3744 struct ofpbuf request;
3745 struct tcmsg *tcmsg;
3747 int type = add ? RTM_NEWQDISC : RTM_DELQDISC;
3748 int flags = add ? NLM_F_EXCL | NLM_F_CREATE : 0;
3750 tcmsg = tc_make_request(netdev, type, flags, &request);
3754 tcmsg->tcm_handle = tc_make_handle(0xffff, 0);
3755 tcmsg->tcm_parent = TC_H_INGRESS;
3756 nl_msg_put_string(&request, TCA_KIND, "ingress");
3757 nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0);
3759 error = tc_transact(&request, NULL);
3761 /* If we're deleting the qdisc, don't worry about some of the
3762 * error conditions. */
3763 if (!add && (error == ENOENT || error == EINVAL)) {
3772 /* Adds a policer to 'netdev' with a rate of 'kbits_rate' and a burst size
 * of 'kbits_burst'.
 *
3775 * This function is equivalent to running:
3776 * /sbin/tc filter add dev <devname> parent ffff: protocol all prio 49
3777 * basic police rate <kbits_rate>kbit burst <kbits_burst>k
 *
3780 * The configuration and stats may be seen with the following command:
3781 * /sbin/tc -s filter show dev <devname> parent ffff:
 *
3783 * Returns 0 if successful, otherwise a positive errno value. */
/* Builds a tc_police structure with action TC_POLICE_SHOT and attaches it,
 * nested as TCA_OPTIONS > TCA_BASIC_POLICE, to a new "basic" filter on
 * parent ffff:0 with priority 49 matching ETH_P_ALL.  The rate is
 * 'kbits_rate' * 1000 / 8 bytes/s; the burst is 'kbits_burst' * 1024
 * converted to ticks at that rate.
 *
 * NOTE(review): '(kbits_rate * 1000)/8' and 'kbits_burst * 1024' are
 * evaluated in int.  They overflow for kbits_rate above about 2,147,483
 * (roughly 2.1 Gbps) and for kbits_burst above 2,097,151; consider widening
 * to 64 bits before multiplying. */
3786 tc_add_policer(struct netdev *netdev, int kbits_rate, int kbits_burst)
3788 struct tc_police tc_police;
3789 struct ofpbuf request;
3790 struct tcmsg *tcmsg;
3791 size_t basic_offset;
3792 size_t police_offset;
3796 memset(&tc_police, 0, sizeof tc_police);
3797 tc_police.action = TC_POLICE_SHOT;
3798 tc_police.mtu = mtu;
3799 tc_fill_rate(&tc_police.rate, (kbits_rate * 1000)/8, mtu);
3800 tc_police.burst = tc_bytes_to_ticks(tc_police.rate.rate,
3801 kbits_burst * 1024);
3803 tcmsg = tc_make_request(netdev, RTM_NEWTFILTER,
3804 NLM_F_EXCL | NLM_F_CREATE, &request);
3808 tcmsg->tcm_parent = tc_make_handle(0xffff, 0);
3809 tcmsg->tcm_info = tc_make_handle(49,
3810 (OVS_FORCE uint16_t) htons(ETH_P_ALL));
3812 nl_msg_put_string(&request, TCA_KIND, "basic");
3813 basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3814 police_offset = nl_msg_start_nested(&request, TCA_BASIC_POLICE);
3815 nl_msg_put_unspec(&request, TCA_POLICE_TBF, &tc_police, sizeof tc_police);
3816 tc_put_rtab(&request, TCA_POLICE_RATE, &tc_police.rate);
3817 nl_msg_end_nested(&request, police_offset);
3818 nl_msg_end_nested(&request, basic_offset);
3820 error = tc_transact(&request, NULL);
3831 /* The values in psched are not individually very meaningful, but they are
3832 * important. The tables below show some values seen in the wild.
3836 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
3837 * (Before that, there are hints that it was 1000000000.)
3839 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
3843 * -----------------------------------
3844 * [1] 000c8000 000f4240 000f4240 00000064
3845 * [2] 000003e8 00000400 000f4240 3b9aca00
3846 * [3] 000003e8 00000400 000f4240 3b9aca00
3847 * [4] 000003e8 00000400 000f4240 00000064
3848 * [5] 000003e8 00000040 000f4240 3b9aca00
3849 * [6] 000003e8 00000040 000f4240 000000f9
3851 * a b c d ticks_per_s buffer_hz
3852 * ------- --------- ---------- ------------- ----------- -------------
3853 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
3854 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3855 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3856 * [4] 1,000 1,024 1,000,000 100 976,562 100
3857 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
3858 * [6] 1,000 64 1,000,000 249 15,625,000 249
3860 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
3861 * [2] 2.6.26-1-686-bigmem from Debian lenny
3862 * [3] 2.6.26-2-sparc64 from Debian lenny
3863 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
3864 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
3865 * [6] 2.6.34 from kernel.org on KVM
3867 static const char fn[] = "/proc/net/psched";
3868 unsigned int a, b, c, d;
3874 stream = fopen(fn, "r");
3876 VLOG_WARN("%s: open failed: %s", fn, strerror(errno));
3880 if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
3881 VLOG_WARN("%s: read failed", fn);
3885 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
3889 VLOG_WARN("%s: invalid scheduler parameters", fn);
3893 ticks_per_s = (double) a * c / b;
3897 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
3900 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);
3903 /* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
3904 * rate of 'rate' bytes per second. */
3906 tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
3911 return (rate * ticks) / ticks_per_s;
3914 /* Returns the number of ticks that it would take to transmit 'size' bytes at a
3915 * rate of 'rate' bytes per second. */
3917 tc_bytes_to_ticks(unsigned int rate, unsigned int size)
3922 return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
3925 /* Returns the number of bytes that need to be reserved for qdisc buffering at
3926 * a transmission rate of 'rate' bytes per second. */
3928 tc_buffer_per_jiffy(unsigned int rate)
3933 return rate / buffer_hz;
3936 /* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
3937 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
3938 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
3939 * stores NULL into it if it is absent.
3941 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
3944 * Returns 0 if successful, otherwise a positive errno value. */
3946 tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
3947 struct nlattr **options)
3949 static const struct nl_policy tca_policy[] = {
3950 [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
3951 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
3953 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3955 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3956 tca_policy, ta, ARRAY_SIZE(ta))) {
3957 VLOG_WARN_RL(&rl, "failed to parse qdisc message");
3962 *kind = nl_attr_get_string(ta[TCA_KIND]);
3966 *options = ta[TCA_OPTIONS];
3981 /* Given Netlink 'msg' that describes a class, extracts the queue ID (e.g. the
3982 * minor number of its class ID) into '*queue_id', its TCA_OPTIONS attribute
3983 * into '*options', and its queue statistics into '*stats'. Any of the output
3984 * arguments may be null.
3986 * Returns 0 if successful, otherwise a positive errno value. */
3988 tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
3989 struct nlattr **options, struct netdev_queue_stats *stats)
3991 static const struct nl_policy tca_policy[] = {
3992 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
3993 [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
3995 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3997 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3998 tca_policy, ta, ARRAY_SIZE(ta))) {
3999 VLOG_WARN_RL(&rl, "failed to parse class message");
4004 struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
4005 *handlep = tc->tcm_handle;
4009 *options = ta[TCA_OPTIONS];
4013 const struct gnet_stats_queue *gsq;
4014 struct gnet_stats_basic gsb;
4016 static const struct nl_policy stats_policy[] = {
4017 [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
4018 .min_len = sizeof gsb },
4019 [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
4020 .min_len = sizeof *gsq },
4022 struct nlattr *sa[ARRAY_SIZE(stats_policy)];
4024 if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
4025 sa, ARRAY_SIZE(sa))) {
4026 VLOG_WARN_RL(&rl, "failed to parse class stats");
4030 /* Alignment issues screw up the length of struct gnet_stats_basic on
4031 * some arch/bitsize combinations. Newer versions of Linux have a
4032 * struct gnet_stats_basic_packed, but we can't depend on that. The
4033 * easiest thing to do is just to make a copy. */
4034 memset(&gsb, 0, sizeof gsb);
4035 memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
4036 MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
4037 stats->tx_bytes = gsb.bytes;
4038 stats->tx_packets = gsb.packets;
4040 gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
4041 stats->tx_errors = gsq->drops;
4051 memset(stats, 0, sizeof *stats);
4056 /* Queries the kernel for class with identifier 'handle' and parent 'parent'
4059 tc_query_class(const struct netdev *netdev,
4060 unsigned int handle, unsigned int parent,
4061 struct ofpbuf **replyp)
4063 struct ofpbuf request;
4064 struct tcmsg *tcmsg;
4067 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO, &request);
4071 tcmsg->tcm_handle = handle;
4072 tcmsg->tcm_parent = parent;
4074 error = tc_transact(&request, replyp);
4076 VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
4077 netdev_get_name(netdev),
4078 tc_get_major(handle), tc_get_minor(handle),
4079 tc_get_major(parent), tc_get_minor(parent),
4085 /* Equivalent to "tc class del dev <name> handle <handle>". */
4087 tc_delete_class(const struct netdev *netdev, unsigned int handle)
4089 struct ofpbuf request;
4090 struct tcmsg *tcmsg;
4093 tcmsg = tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
4097 tcmsg->tcm_handle = handle;
4098 tcmsg->tcm_parent = 0;
4100 error = tc_transact(&request, NULL);
4102 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
4103 netdev_get_name(netdev),
4104 tc_get_major(handle), tc_get_minor(handle),
4110 /* Equivalent to "tc qdisc del dev <name> root". */
4112 tc_del_qdisc(struct netdev *netdev)
4114 struct netdev_dev_linux *netdev_dev =
4115 netdev_dev_linux_cast(netdev_get_dev(netdev));
4116 struct ofpbuf request;
4117 struct tcmsg *tcmsg;
4120 tcmsg = tc_make_request(netdev, RTM_DELQDISC, 0, &request);
4124 tcmsg->tcm_handle = tc_make_handle(1, 0);
4125 tcmsg->tcm_parent = TC_H_ROOT;
4127 error = tc_transact(&request, NULL);
4128 if (error == EINVAL) {
4129 /* EINVAL probably means that the default qdisc was in use, in which
4130 * case we've accomplished our purpose. */
4133 if (!error && netdev_dev->tc) {
4134 if (netdev_dev->tc->ops->tc_destroy) {
4135 netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
4137 netdev_dev->tc = NULL;
4142 /* If 'netdev''s qdisc type and parameters are not yet known, queries the
4143 * kernel to determine what they are. Returns 0 if successful, otherwise a
4144 * positive errno value. */
4146 tc_query_qdisc(const struct netdev *netdev)
4148 struct netdev_dev_linux *netdev_dev =
4149 netdev_dev_linux_cast(netdev_get_dev(netdev));
4150 struct ofpbuf request, *qdisc;
4151 const struct tc_ops *ops;
4152 struct tcmsg *tcmsg;
4156 if (netdev_dev->tc) {
4160 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
4161 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
4162 * 2.6.35 without that fix backported to it.
4164 * To avoid the OOPS, we must not make a request that would attempt to dump
4165 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
4166 * few others. There are a few ways that I can see to do this, but most of
4167 * them seem to be racy (and if you lose the race the kernel OOPSes). The
4168 * technique chosen here is to assume that any non-default qdisc that we
4169 * create will have a class with handle 1:0. The built-in qdiscs only have
4170 * a class with handle 0:0.
4172 * We could check for Linux 2.6.35+ and use a more straightforward method
4174 tcmsg = tc_make_request(netdev, RTM_GETQDISC, NLM_F_ECHO, &request);
4178 tcmsg->tcm_handle = tc_make_handle(1, 0);
4179 tcmsg->tcm_parent = 0;
4181 /* Figure out what tc class to instantiate. */
4182 error = tc_transact(&request, &qdisc);
4186 error = tc_parse_qdisc(qdisc, &kind, NULL);
4188 ops = &tc_ops_other;
4190 ops = tc_lookup_linux_name(kind);
4192 static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
4193 VLOG_INFO_RL(&rl2, "unknown qdisc \"%s\"", kind);
4195 ops = &tc_ops_other;
4198 } else if (error == ENOENT) {
4199 /* Either it's a built-in qdisc, or it's a qdisc set up by some
4200 * other entity that doesn't have a handle 1:0. We will assume
4201 * that it's the system default qdisc. */
4202 ops = &tc_ops_default;
4205 /* Who knows? Maybe the device got deleted. */
4206 VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
4207 netdev_get_name(netdev), strerror(error));
4208 ops = &tc_ops_other;
4211 /* Instantiate it. */
4212 load_error = ops->tc_load(CONST_CAST(struct netdev *, netdev), qdisc);
4213 ovs_assert((load_error == 0) == (netdev_dev->tc != NULL));
4214 ofpbuf_delete(qdisc);
4216 return error ? error : load_error;
4219 /* Linux traffic control uses tables with 256 entries ("rtab" tables) to
4220 approximate the time to transmit packets of various lengths. For an MTU of
4221 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
4222 represents two possible packet lengths; for a MTU of 513 through 1024, four
4223 possible lengths; and so on.
4225 Returns, for the specified 'mtu', the number of bits that packet lengths
4226 need to be shifted right to fit within such a 256-entry table. */
4228 tc_calc_cell_log(unsigned int mtu)
4233 mtu = ETH_PAYLOAD_MAX;
4235 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
4237 for (cell_log = 0; mtu >= 256; cell_log++) {
4244 /* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
4247 tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
4249 memset(rate, 0, sizeof *rate);
4250 rate->cell_log = tc_calc_cell_log(mtu);
4251 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
4252 /* rate->cell_align = 0; */ /* distro headers. */
4253 rate->mpu = ETH_TOTAL_MIN;
4257 /* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
4258 * attribute of the specified "type".
4260 * See tc_calc_cell_log() above for a description of "rtab"s. */
4262 tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
4267 rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
4268 for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
4269 unsigned packet_size = (i + 1) << rate->cell_log;
4270 if (packet_size < rate->mpu) {
4271 packet_size = rate->mpu;
4273 rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
/* Computes the value to use for 'buffer' or 'cbuffer' in HTB options, given a
 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
 * burst size of 'burst_bytes'.  The burst is never allowed to be smaller than
 * one jiffy's worth of buffering plus 'mtu', so a 'burst_bytes' of 0 is fine
 * when no value was requested. */
static int
tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
{
    unsigned int floor_bytes = tc_buffer_per_jiffy(Bps) + mtu;
    uint64_t burst = burst_bytes;

    if (burst <= floor_bytes) {
        burst = floor_bytes;
    }
    return tc_bytes_to_ticks(Bps, burst);
}
4288 /* Linux-only functions declared in netdev-linux.h */
4290 /* Returns a fd for an AF_INET socket or a negative errno value. */
4292 netdev_linux_get_af_inet_sock(void)
4294 int error = netdev_linux_init();
4295 return error ? -error : af_inet_sock;
4298 /* Modifies the 'flag' bit in ethtool's flags field for 'netdev'. If
4299 * 'enable' is true, the bit is set. Otherwise, it is cleared. */
4301 netdev_linux_ethtool_set_flag(struct netdev *netdev, uint32_t flag,
4302 const char *flag_name, bool enable)
4304 const char *netdev_name = netdev_get_name(netdev);
4305 struct ethtool_value evalue;
4309 COVERAGE_INC(netdev_get_ethtool);
4310 memset(&evalue, 0, sizeof evalue);
4311 error = netdev_linux_do_ethtool(netdev_name,
4312 (struct ethtool_cmd *)&evalue,
4313 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
4318 COVERAGE_INC(netdev_set_ethtool);
4319 evalue.data = new_flags = (evalue.data & ~flag) | (enable ? flag : 0);
4320 error = netdev_linux_do_ethtool(netdev_name,
4321 (struct ethtool_cmd *)&evalue,
4322 ETHTOOL_SFLAGS, "ETHTOOL_SFLAGS");
4327 COVERAGE_INC(netdev_get_ethtool);
4328 memset(&evalue, 0, sizeof evalue);
4329 error = netdev_linux_do_ethtool(netdev_name,
4330 (struct ethtool_cmd *)&evalue,
4331 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
4336 if (new_flags != evalue.data) {
4337 VLOG_WARN_RL(&rl, "attempt to %s ethtool %s flag on network "
4338 "device %s failed", enable ? "enable" : "disable",
4339 flag_name, netdev_name);
4346 /* Utility functions. */
4348 /* Copies 'src' into 'dst', performing format conversion in the process. */
4350 netdev_stats_from_rtnl_link_stats(struct netdev_stats *dst,
4351 const struct rtnl_link_stats *src)
4353 dst->rx_packets = src->rx_packets;
4354 dst->tx_packets = src->tx_packets;
4355 dst->rx_bytes = src->rx_bytes;
4356 dst->tx_bytes = src->tx_bytes;
4357 dst->rx_errors = src->rx_errors;
4358 dst->tx_errors = src->tx_errors;
4359 dst->rx_dropped = src->rx_dropped;
4360 dst->tx_dropped = src->tx_dropped;
4361 dst->multicast = src->multicast;
4362 dst->collisions = src->collisions;
4363 dst->rx_length_errors = src->rx_length_errors;
4364 dst->rx_over_errors = src->rx_over_errors;
4365 dst->rx_crc_errors = src->rx_crc_errors;
4366 dst->rx_frame_errors = src->rx_frame_errors;
4367 dst->rx_fifo_errors = src->rx_fifo_errors;
4368 dst->rx_missed_errors = src->rx_missed_errors;
4369 dst->tx_aborted_errors = src->tx_aborted_errors;
4370 dst->tx_carrier_errors = src->tx_carrier_errors;
4371 dst->tx_fifo_errors = src->tx_fifo_errors;
4372 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
4373 dst->tx_window_errors = src->tx_window_errors;
4377 get_stats_via_netlink(int ifindex, struct netdev_stats *stats)
4379 /* Policy for RTNLGRP_LINK messages.
4381 * There are *many* more fields in these messages, but currently we only
4382 * care about these fields. */
4383 static const struct nl_policy rtnlgrp_link_policy[] = {
4384 [IFLA_IFNAME] = { .type = NL_A_STRING, .optional = false },
4385 [IFLA_STATS] = { .type = NL_A_UNSPEC, .optional = true,
4386 .min_len = sizeof(struct rtnl_link_stats) },
4389 struct ofpbuf request;
4390 struct ofpbuf *reply;
4391 struct ifinfomsg *ifi;
4392 struct nlattr *attrs[ARRAY_SIZE(rtnlgrp_link_policy)];
4395 ofpbuf_init(&request, 0);
4396 nl_msg_put_nlmsghdr(&request, sizeof *ifi, RTM_GETLINK, NLM_F_REQUEST);
4397 ifi = ofpbuf_put_zeros(&request, sizeof *ifi);
4398 ifi->ifi_family = PF_UNSPEC;
4399 ifi->ifi_index = ifindex;
4400 error = nl_sock_transact(rtnl_sock, &request, &reply);
4401 ofpbuf_uninit(&request);
4406 if (!nl_policy_parse(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg),
4407 rtnlgrp_link_policy,
4408 attrs, ARRAY_SIZE(rtnlgrp_link_policy))) {
4409 ofpbuf_delete(reply);
4413 if (!attrs[IFLA_STATS]) {
4414 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
4415 ofpbuf_delete(reply);
4419 netdev_stats_from_rtnl_link_stats(stats, nl_attr_get(attrs[IFLA_STATS]));
4421 ofpbuf_delete(reply);
4427 get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats)
4429 static const char fn[] = "/proc/net/dev";
4434 stream = fopen(fn, "r");
4436 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
4441 while (fgets(line, sizeof line, stream)) {
4444 #define X64 "%"SCNu64
4447 X64 X64 X64 X64 X64 X64 X64 "%*u"
4448 X64 X64 X64 X64 X64 X64 X64 "%*u",
4454 &stats->rx_fifo_errors,
4455 &stats->rx_frame_errors,
4461 &stats->tx_fifo_errors,
4463 &stats->tx_carrier_errors) != 15) {
4464 VLOG_WARN_RL(&rl, "%s:%d: parse error", fn, ln);
4465 } else if (!strcmp(devname, netdev_name)) {
4466 stats->rx_length_errors = UINT64_MAX;
4467 stats->rx_over_errors = UINT64_MAX;
4468 stats->rx_crc_errors = UINT64_MAX;
4469 stats->rx_missed_errors = UINT64_MAX;
4470 stats->tx_aborted_errors = UINT64_MAX;
4471 stats->tx_heartbeat_errors = UINT64_MAX;
4472 stats->tx_window_errors = UINT64_MAX;
4478 VLOG_WARN_RL(&rl, "%s: no stats for %s", fn, netdev_name);
4484 get_flags(const struct netdev_dev *dev, unsigned int *flags)
4490 error = netdev_linux_do_ioctl(dev->name, &ifr, SIOCGIFFLAGS,
4493 *flags = ifr.ifr_flags;
4499 set_flags(const char *name, unsigned int flags)
4503 ifr.ifr_flags = flags;
4504 return netdev_linux_do_ioctl(name, &ifr, SIOCSIFFLAGS, "SIOCSIFFLAGS");
4508 do_get_ifindex(const char *netdev_name)
4512 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4513 COVERAGE_INC(netdev_get_ifindex);
4514 if (ioctl(af_inet_sock, SIOCGIFINDEX, &ifr) < 0) {
4515 VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
4516 netdev_name, strerror(errno));
4519 return ifr.ifr_ifindex;
4523 get_ifindex(const struct netdev *netdev_, int *ifindexp)
4525 struct netdev_dev_linux *netdev_dev =
4526 netdev_dev_linux_cast(netdev_get_dev(netdev_));
4528 if (!(netdev_dev->cache_valid & VALID_IFINDEX)) {
4529 int ifindex = do_get_ifindex(netdev_get_name(netdev_));
4532 netdev_dev->get_ifindex_error = -ifindex;
4533 netdev_dev->ifindex = 0;
4535 netdev_dev->get_ifindex_error = 0;
4536 netdev_dev->ifindex = ifindex;
4538 netdev_dev->cache_valid |= VALID_IFINDEX;
4541 *ifindexp = netdev_dev->ifindex;
4542 return netdev_dev->get_ifindex_error;
4546 get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN])
4551 memset(&ifr, 0, sizeof ifr);
4552 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4553 COVERAGE_INC(netdev_get_hwaddr);
4554 if (ioctl(af_inet_sock, SIOCGIFHWADDR, &ifr) < 0) {
4555 /* ENODEV probably means that a vif disappeared asynchronously and
4556 * hasn't been removed from the database yet, so reduce the log level
4557 * to INFO for that case. */
4558 VLOG(errno == ENODEV ? VLL_INFO : VLL_ERR,
4559 "ioctl(SIOCGIFHWADDR) on %s device failed: %s",
4560 netdev_name, strerror(errno));
4563 hwaddr_family = ifr.ifr_hwaddr.sa_family;
4564 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER) {
4565 VLOG_WARN("%s device has unknown hardware address family %d",
4566 netdev_name, hwaddr_family);
4568 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
4573 set_etheraddr(const char *netdev_name,
4574 const uint8_t mac[ETH_ADDR_LEN])
4578 memset(&ifr, 0, sizeof ifr);
4579 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4580 ifr.ifr_hwaddr.sa_family = ARPHRD_ETHER;
4581 memcpy(ifr.ifr_hwaddr.sa_data, mac, ETH_ADDR_LEN);
4582 COVERAGE_INC(netdev_set_hwaddr);
4583 if (ioctl(af_inet_sock, SIOCSIFHWADDR, &ifr) < 0) {
4584 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
4585 netdev_name, strerror(errno));
4592 netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
4593 int cmd, const char *cmd_name)
4597 memset(&ifr, 0, sizeof ifr);
4598 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
4599 ifr.ifr_data = (caddr_t) ecmd;
4602 if (ioctl(af_inet_sock, SIOCETHTOOL, &ifr) == 0) {
4605 if (errno != EOPNOTSUPP) {
4606 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
4607 "failed: %s", cmd_name, name, strerror(errno));
4609 /* The device doesn't support this operation. That's pretty
4610 * common, so there's no point in logging anything. */
4617 netdev_linux_do_ioctl(const char *name, struct ifreq *ifr, int cmd,
4618 const char *cmd_name)
4620 ovs_strzcpy(ifr->ifr_name, name, sizeof ifr->ifr_name);
4621 if (ioctl(af_inet_sock, cmd, ifr) == -1) {
4622 VLOG_DBG_RL(&rl, "%s: ioctl(%s) failed: %s", name, cmd_name,
4630 netdev_linux_get_ipv4(const struct netdev *netdev, struct in_addr *ip,
4631 int cmd, const char *cmd_name)
4636 ifr.ifr_addr.sa_family = AF_INET;
4637 error = netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, cmd, cmd_name);
4639 const struct sockaddr_in *sin = (struct sockaddr_in *) &ifr.ifr_addr;
4640 *ip = sin->sin_addr;
4645 /* Returns an AF_PACKET raw socket or a negative errno value. */
4647 af_packet_sock(void)
4649 static int sock = INT_MIN;
4651 if (sock == INT_MIN) {
4652 sock = socket(AF_PACKET, SOCK_RAW, 0);
4654 int error = set_nonblocking(sock);
4661 VLOG_ERR("failed to create packet socket: %s", strerror(errno));