2 * Copyright (c) 2009, 2010, 2011, 2012, 2013 Nicira, Inc.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
19 #include "netdev-linux.h"
23 #include <arpa/inet.h>
25 #include <linux/gen_stats.h>
26 #include <linux/if_ether.h>
27 #include <linux/if_tun.h>
28 #include <linux/types.h>
29 #include <linux/ethtool.h>
30 #include <linux/mii.h>
31 #include <linux/pkt_cls.h>
32 #include <linux/pkt_sched.h>
33 #include <linux/rtnetlink.h>
34 #include <linux/sockios.h>
35 #include <linux/version.h>
36 #include <sys/types.h>
37 #include <sys/ioctl.h>
38 #include <sys/socket.h>
39 #include <netpacket/packet.h>
41 #include <net/if_arp.h>
42 #include <net/if_packet.h>
43 #include <net/route.h>
44 #include <netinet/in.h>
51 #include "dpif-linux.h"
52 #include "dynamic-string.h"
53 #include "fatal-signal.h"
56 #include "netdev-provider.h"
57 #include "netdev-vport.h"
59 #include "netlink-notifier.h"
60 #include "netlink-socket.h"
62 #include "openflow/openflow.h"
64 #include "poll-loop.h"
65 #include "rtnetlink-link.h"
66 #include "socket-util.h"
72 VLOG_DEFINE_THIS_MODULE(netdev_linux);
74 COVERAGE_DEFINE(netdev_set_policing);
75 COVERAGE_DEFINE(netdev_arp_lookup);
76 COVERAGE_DEFINE(netdev_get_ifindex);
77 COVERAGE_DEFINE(netdev_get_hwaddr);
78 COVERAGE_DEFINE(netdev_set_hwaddr);
79 COVERAGE_DEFINE(netdev_get_ethtool);
80 COVERAGE_DEFINE(netdev_set_ethtool);
83 /* These were introduced in Linux 2.6.14, so they might be missing if we have
85 #ifndef ADVERTISED_Pause
86 #define ADVERTISED_Pause (1 << 13)
88 #ifndef ADVERTISED_Asym_Pause
89 #define ADVERTISED_Asym_Pause (1 << 14)
92 /* These were introduced in Linux 2.6.24, so they might be missing if we
93 * have old headers. */
94 #ifndef ETHTOOL_GFLAGS
95 #define ETHTOOL_GFLAGS 0x00000025 /* Get flags bitmap(ethtool_value) */
97 #ifndef ETHTOOL_SFLAGS
98 #define ETHTOOL_SFLAGS 0x00000026 /* Set flags bitmap(ethtool_value) */
101 /* This was introduced in Linux 2.6.25, so it might be missing if we have old
104 #define TC_RTAB_SIZE 1024
107 static struct nln_notifier *netdev_linux_cache_notifier = NULL;
108 static int cache_notifier_refcount;
111 VALID_IFINDEX = 1 << 0,
112 VALID_ETHERADDR = 1 << 1,
116 VALID_POLICING = 1 << 5,
117 VALID_VPORT_STAT_ERROR = 1 << 6,
118 VALID_DRVINFO = 1 << 7,
119 VALID_FEATURES = 1 << 8,
127 /* Traffic control. */
129 /* An instance of a traffic control class. Always associated with a particular
132 * Each TC implementation subclasses this with whatever additional data it
135 const struct tc_ops *ops;
136 struct hmap queues; /* Contains "struct tc_queue"s.
137 * Read by generic TC layer.
138 * Written only by TC implementation. */
141 /* One traffic control queue.
143 * Each TC implementation subclasses this with whatever additional data it
146 struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
147 unsigned int queue_id; /* OpenFlow queue ID. */
150 /* A particular kind of traffic control. Each implementation generally maps to
151 * one particular Linux qdisc class.
153 * The functions below return 0 if successful or a positive errno value on
154 * failure, except where otherwise noted. All of them must be provided, except
155 * where otherwise noted. */
157 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
158 * This is null for tc_ops_default and tc_ops_other, for which there are no
159 * appropriate values. */
160 const char *linux_name;
162 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
163 const char *ovs_name;
165 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
166 * queues. The queues are numbered 0 through n_queues - 1. */
167 unsigned int n_queues;
169 /* Called to install this TC class on 'netdev'. The implementation should
170 * make the Netlink calls required to set up 'netdev' with the right qdisc
171 * and configure it according to 'details'. The implementation may assume
172 * that the current qdisc is the default; that is, there is no need for it
173 * to delete the current qdisc before installing itself.
175 * The contents of 'details' should be documented as valid for 'ovs_name'
176 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
177 * (which is built as ovs-vswitchd.conf.db(8)).
179 * This function must return 0 if and only if it sets 'netdev->tc' to an
180 * initialized 'struct tc'.
182 * (This function is null for tc_ops_other, which cannot be installed. For
183 * other TC classes it should always be nonnull.) */
184 int (*tc_install)(struct netdev *netdev, const struct smap *details);
186 /* Called when the netdev code determines (through a Netlink query) that
187 * this TC class's qdisc is installed on 'netdev', but we didn't install
188 * it ourselves and so don't know any of the details.
190 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
191 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
192 * implementation should parse the other attributes of 'nlmsg' as
193 * necessary to determine its configuration. If necessary it should also
194 * use Netlink queries to determine the configuration of queues on
197 * This function must return 0 if and only if it sets 'netdev->tc' to an
198 * initialized 'struct tc'. */
199 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
201 /* Destroys the data structures allocated by the implementation as part of
202 * 'tc'. (This includes destroying 'tc->queues' by calling
205 * The implementation should not need to perform any Netlink calls. If
206 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
207 * (But it may not be desirable.)
209 * This function may be null if 'tc' is trivial. */
210 void (*tc_destroy)(struct tc *tc);
212 /* Retrieves details of 'netdev->tc' configuration into 'details'.
214 * The implementation should not need to perform any Netlink calls, because
215 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
216 * cached the configuration.
218 * The contents of 'details' should be documented as valid for 'ovs_name'
219 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
220 * (which is built as ovs-vswitchd.conf.db(8)).
222 * This function may be null if 'tc' is not configurable.
224 int (*qdisc_get)(const struct netdev *netdev, struct smap *details);
226 /* Reconfigures 'netdev->tc' according to 'details', performing any
227 * required Netlink calls to complete the reconfiguration.
229 * The contents of 'details' should be documented as valid for 'ovs_name'
230 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
231 * (which is built as ovs-vswitchd.conf.db(8)).
233 * This function may be null if 'tc' is not configurable.
235 int (*qdisc_set)(struct netdev *, const struct smap *details);
237 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
238 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
240 * The contents of 'details' should be documented as valid for 'ovs_name'
241 * in the "other_config" column in the "Queue" table in
242 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
244 * The implementation should not need to perform any Netlink calls, because
245 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
246 * cached the queue configuration.
248 * This function may be null if 'tc' does not have queues ('n_queues' is
250 int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
251 struct smap *details);
253 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
254 * 'details', performing any required Netlink calls to complete the
255 * reconfiguration. The caller ensures that 'queue_id' is less than
258 * The contents of 'details' should be documented as valid for 'ovs_name'
259 * in the "other_config" column in the "Queue" table in
260 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
262 * This function may be null if 'tc' does not have queues or its queues are
263 * not configurable. */
264 int (*class_set)(struct netdev *, unsigned int queue_id,
265 const struct smap *details);
267 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
268 * tc_queue's within 'netdev->tc->queues'.
270 * This function may be null if 'tc' does not have queues or its queues
271 * cannot be deleted. */
272 int (*class_delete)(struct netdev *, struct tc_queue *queue);
274 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
275 * 'struct tc_queue's within 'netdev->tc->queues'.
277 * On success, initializes '*stats'.
279 * This function may be null if 'tc' does not have queues or if it cannot
280 * report queue statistics. */
281 int (*class_get_stats)(const struct netdev *netdev,
282 const struct tc_queue *queue,
283 struct netdev_queue_stats *stats);
285 /* Extracts queue stats from 'nlmsg', which is a response to a
286 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
288 * This function may be null if 'tc' does not have queues or if it cannot
289 * report queue statistics. */
290 int (*class_dump_stats)(const struct netdev *netdev,
291 const struct ofpbuf *nlmsg,
292 netdev_dump_queue_stats_cb *cb, void *aux);
296 tc_init(struct tc *tc, const struct tc_ops *ops)
299 hmap_init(&tc->queues);
303 tc_destroy(struct tc *tc)
305 hmap_destroy(&tc->queues);
308 static const struct tc_ops tc_ops_htb;
309 static const struct tc_ops tc_ops_hfsc;
310 static const struct tc_ops tc_ops_default;
311 static const struct tc_ops tc_ops_other;
313 static const struct tc_ops *tcs[] = {
314 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
315 &tc_ops_hfsc, /* Hierarchical fair service curve. */
316 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
317 &tc_ops_other, /* Some other qdisc. */
321 static unsigned int tc_make_handle(unsigned int major, unsigned int minor);
322 static unsigned int tc_get_major(unsigned int handle);
323 static unsigned int tc_get_minor(unsigned int handle);
325 static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
326 static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
327 static unsigned int tc_buffer_per_jiffy(unsigned int rate);
329 static struct tcmsg *tc_make_request(const struct netdev *, int type,
330 unsigned int flags, struct ofpbuf *);
331 static int tc_transact(struct ofpbuf *request, struct ofpbuf **replyp);
332 static int tc_add_del_ingress_qdisc(struct netdev *netdev, bool add);
333 static int tc_add_policer(struct netdev *netdev, int kbits_rate,
336 static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
337 struct nlattr **options);
338 static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
339 struct nlattr **options,
340 struct netdev_queue_stats *);
341 static int tc_query_class(const struct netdev *,
342 unsigned int handle, unsigned int parent,
343 struct ofpbuf **replyp);
344 static int tc_delete_class(const struct netdev *, unsigned int handle);
346 static int tc_del_qdisc(struct netdev *netdev);
347 static int tc_query_qdisc(const struct netdev *netdev);
349 static int tc_calc_cell_log(unsigned int mtu);
350 static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
351 static void tc_put_rtab(struct ofpbuf *, uint16_t type,
352 const struct tc_ratespec *rate);
353 static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
355 struct netdev_dev_linux {
356 struct netdev_dev netdev_dev;
358 struct shash_node *shash_node;
359 unsigned int cache_valid;
360 unsigned int change_seq;
362 bool miimon; /* Link status of last poll. */
363 long long int miimon_interval; /* Miimon Poll rate. Disabled if <= 0. */
364 struct timer miimon_timer;
366 /* The following are figured out "on demand" only. They are only valid
367 * when the corresponding VALID_* bit in 'cache_valid' is set. */
369 uint8_t etheraddr[ETH_ADDR_LEN];
370 struct in_addr address, netmask;
373 unsigned int ifi_flags;
374 long long int carrier_resets;
375 uint32_t kbits_rate; /* Policing data. */
376 uint32_t kbits_burst;
377 int vport_stats_error; /* Cached error code from vport_get_stats().
378 0 or an errno value. */
379 int netdev_mtu_error; /* Cached error code from SIOCGIFMTU or SIOCSIFMTU. */
380 int ether_addr_error; /* Cached error code from set/get etheraddr. */
381 int netdev_policing_error; /* Cached error code from set policing. */
382 int get_features_error; /* Cached error code from ETHTOOL_GSET. */
383 int get_ifindex_error; /* Cached error code from SIOCGIFINDEX. */
385 enum netdev_features current; /* Cached from ETHTOOL_GSET. */
386 enum netdev_features advertised; /* Cached from ETHTOOL_GSET. */
387 enum netdev_features supported; /* Cached from ETHTOOL_GSET. */
388 enum netdev_features peer; /* Cached from ETHTOOL_GSET. */
390 struct ethtool_drvinfo drvinfo; /* Cached from ETHTOOL_GDRVINFO. */
394 struct tap_state tap;
398 struct netdev_linux {
399 struct netdev netdev;
403 /* Sockets used for ioctl operations. */
404 static int af_inet_sock = -1; /* AF_INET, SOCK_DGRAM. */
406 /* A Netlink routing socket that is not subscribed to any multicast groups. */
407 static struct nl_sock *rtnl_sock;
409 /* This is set pretty low because we probably won't learn anything from the
410 * additional log messages. */
411 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
413 static int netdev_linux_init(void);
415 static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
416 int cmd, const char *cmd_name);
417 static int netdev_linux_do_ioctl(const char *name, struct ifreq *, int cmd,
418 const char *cmd_name);
419 static int netdev_linux_get_ipv4(const struct netdev *, struct in_addr *,
420 int cmd, const char *cmd_name);
421 static int get_flags(const struct netdev_dev *, unsigned int *flags);
422 static int set_flags(struct netdev *, unsigned int flags);
423 static int do_get_ifindex(const char *netdev_name);
424 static int get_ifindex(const struct netdev *, int *ifindexp);
425 static int do_set_addr(struct netdev *netdev,
426 int ioctl_nr, const char *ioctl_name,
427 struct in_addr addr);
428 static int get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN]);
429 static int set_etheraddr(const char *netdev_name, const uint8_t[ETH_ADDR_LEN]);
430 static int get_stats_via_netlink(int ifindex, struct netdev_stats *stats);
431 static int get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats);
432 static int af_packet_sock(void);
433 static void netdev_linux_miimon_run(void);
434 static void netdev_linux_miimon_wait(void);
437 is_netdev_linux_class(const struct netdev_class *netdev_class)
439 return netdev_class->init == netdev_linux_init;
442 static struct netdev_dev_linux *
443 netdev_dev_linux_cast(const struct netdev_dev *netdev_dev)
445 const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
446 ovs_assert(is_netdev_linux_class(netdev_class));
448 return CONTAINER_OF(netdev_dev, struct netdev_dev_linux, netdev_dev);
451 static struct netdev_linux *
452 netdev_linux_cast(const struct netdev *netdev)
454 struct netdev_dev *netdev_dev = netdev_get_dev(netdev);
455 const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
456 ovs_assert(is_netdev_linux_class(netdev_class));
458 return CONTAINER_OF(netdev, struct netdev_linux, netdev);
462 netdev_linux_init(void)
464 static int status = -1;
466 /* Create AF_INET socket. */
467 af_inet_sock = socket(AF_INET, SOCK_DGRAM, 0);
468 status = af_inet_sock >= 0 ? 0 : errno;
470 VLOG_ERR("failed to create inet socket: %s", strerror(status));
473 /* Create rtnetlink socket. */
475 status = nl_sock_create(NETLINK_ROUTE, &rtnl_sock);
477 VLOG_ERR_RL(&rl, "failed to create rtnetlink socket: %s",
486 netdev_linux_run(void)
488 rtnetlink_link_run();
489 netdev_linux_miimon_run();
493 netdev_linux_wait(void)
495 rtnetlink_link_wait();
496 netdev_linux_miimon_wait();
500 netdev_dev_linux_changed(struct netdev_dev_linux *dev,
501 unsigned int ifi_flags,
505 if (!dev->change_seq) {
509 if ((dev->ifi_flags ^ ifi_flags) & IFF_RUNNING) {
510 dev->carrier_resets++;
512 dev->ifi_flags = ifi_flags;
514 dev->cache_valid &= mask;
518 netdev_dev_linux_update(struct netdev_dev_linux *dev,
519 const struct rtnetlink_link_change *change)
521 if (change->nlmsg_type == RTM_NEWLINK) {
523 netdev_dev_linux_changed(dev, change->ifi_flags, VALID_DRVINFO);
525 /* Update netdev from rtnl-change msg. */
527 dev->mtu = change->mtu;
528 dev->cache_valid |= VALID_MTU;
529 dev->netdev_mtu_error = 0;
532 if (!eth_addr_is_zero(change->addr)) {
533 memcpy(dev->etheraddr, change->addr, ETH_ADDR_LEN);
534 dev->cache_valid |= VALID_ETHERADDR;
535 dev->ether_addr_error = 0;
538 dev->ifindex = change->ifi_index;
539 dev->cache_valid |= VALID_IFINDEX;
540 dev->get_ifindex_error = 0;
543 netdev_dev_linux_changed(dev, change->ifi_flags, 0);
548 netdev_linux_cache_cb(const struct rtnetlink_link_change *change,
549 void *aux OVS_UNUSED)
551 struct netdev_dev_linux *dev;
553 struct netdev_dev *base_dev = netdev_dev_from_name(change->ifname);
555 const struct netdev_class *netdev_class =
556 netdev_dev_get_class(base_dev);
558 if (is_netdev_linux_class(netdev_class)) {
559 dev = netdev_dev_linux_cast(base_dev);
560 netdev_dev_linux_update(dev, change);
564 struct shash device_shash;
565 struct shash_node *node;
567 shash_init(&device_shash);
568 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
569 SHASH_FOR_EACH (node, &device_shash) {
574 get_flags(&dev->netdev_dev, &flags);
575 netdev_dev_linux_changed(dev, flags, 0);
577 shash_destroy(&device_shash);
582 cache_notifier_ref(void)
584 if (!cache_notifier_refcount) {
585 ovs_assert(!netdev_linux_cache_notifier);
587 netdev_linux_cache_notifier =
588 rtnetlink_link_notifier_create(netdev_linux_cache_cb, NULL);
590 if (!netdev_linux_cache_notifier) {
594 cache_notifier_refcount++;
600 cache_notifier_unref(void)
602 ovs_assert(cache_notifier_refcount > 0);
603 if (!--cache_notifier_refcount) {
604 ovs_assert(netdev_linux_cache_notifier);
605 rtnetlink_link_notifier_destroy(netdev_linux_cache_notifier);
606 netdev_linux_cache_notifier = NULL;
610 /* Creates system and internal devices. */
612 netdev_linux_create(const struct netdev_class *class, const char *name,
613 struct netdev_dev **netdev_devp)
615 struct netdev_dev_linux *netdev_dev;
618 error = cache_notifier_ref();
623 netdev_dev = xzalloc(sizeof *netdev_dev);
624 netdev_dev->change_seq = 1;
625 netdev_dev_init(&netdev_dev->netdev_dev, name, class);
626 get_flags(&netdev_dev->netdev_dev, &netdev_dev->ifi_flags);
628 *netdev_devp = &netdev_dev->netdev_dev;
632 /* For most types of netdevs we open the device for each call of
633 * netdev_open(). However, this is not the case with tap devices,
634 * since it is only possible to open the device once. In this
635 * situation we share a single file descriptor, and consequently
636 * buffers, across all readers. Therefore once data is read it will
637 * be unavailable to other reads for tap devices. */
639 netdev_linux_create_tap(const struct netdev_class *class OVS_UNUSED,
640 const char *name, struct netdev_dev **netdev_devp)
642 struct netdev_dev_linux *netdev_dev;
643 struct tap_state *state;
644 static const char tap_dev[] = "/dev/net/tun";
648 netdev_dev = xzalloc(sizeof *netdev_dev);
649 state = &netdev_dev->state.tap;
651 error = cache_notifier_ref();
656 /* Open tap device. */
657 state->fd = open(tap_dev, O_RDWR);
660 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, strerror(error));
661 goto error_unref_notifier;
664 /* Create tap device. */
665 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
666 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
667 if (ioctl(state->fd, TUNSETIFF, &ifr) == -1) {
668 VLOG_WARN("%s: creating tap device failed: %s", name,
671 goto error_unref_notifier;
674 /* Make non-blocking. */
675 error = set_nonblocking(state->fd);
677 goto error_unref_notifier;
680 netdev_dev_init(&netdev_dev->netdev_dev, name, &netdev_tap_class);
681 *netdev_devp = &netdev_dev->netdev_dev;
684 error_unref_notifier:
685 cache_notifier_unref();
692 destroy_tap(struct netdev_dev_linux *netdev_dev)
694 struct tap_state *state = &netdev_dev->state.tap;
696 if (state->fd >= 0) {
701 /* Destroys the netdev device 'netdev_dev_'. */
703 netdev_linux_destroy(struct netdev_dev *netdev_dev_)
705 struct netdev_dev_linux *netdev_dev = netdev_dev_linux_cast(netdev_dev_);
706 const struct netdev_class *class = netdev_dev_get_class(netdev_dev_);
708 if (netdev_dev->tc && netdev_dev->tc->ops->tc_destroy) {
709 netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
712 if (class == &netdev_tap_class) {
713 destroy_tap(netdev_dev);
717 cache_notifier_unref();
721 netdev_linux_open(struct netdev_dev *netdev_dev_, struct netdev **netdevp)
723 struct netdev_linux *netdev;
724 enum netdev_flags flags;
727 /* Allocate network device. */
728 netdev = xzalloc(sizeof *netdev);
730 netdev_init(&netdev->netdev, netdev_dev_);
732 /* Verify that the device really exists, by attempting to read its flags.
733 * (The flags might be cached, in which case this won't actually do an
736 * Don't do this for "internal" netdevs, though, because those have to be
737 * created as netdev objects before they exist in the kernel, because
738 * creating them in the kernel happens by passing a netdev object to
739 * dpif_port_add(). */
740 if (netdev_dev_get_class(netdev_dev_) != &netdev_internal_class) {
741 error = netdev_get_flags(&netdev->netdev, &flags);
742 if (error == ENODEV) {
747 *netdevp = &netdev->netdev;
751 netdev_uninit(&netdev->netdev, true);
755 /* Closes and destroys 'netdev'. */
757 netdev_linux_close(struct netdev *netdev_)
759 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
761 if (netdev->fd >= 0 && strcmp(netdev_get_type(netdev_), "tap")) {
768 netdev_linux_listen(struct netdev *netdev_)
770 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
771 struct netdev_dev_linux *netdev_dev =
772 netdev_dev_linux_cast(netdev_get_dev(netdev_));
773 struct sockaddr_ll sll;
778 if (netdev->fd >= 0) {
782 if (!strcmp(netdev_get_type(netdev_), "tap")
783 && !netdev_dev->state.tap.opened) {
784 netdev->fd = netdev_dev->state.tap.fd;
785 netdev_dev->state.tap.opened = true;
789 /* Create file descriptor. */
790 fd = socket(PF_PACKET, SOCK_RAW, 0);
793 VLOG_ERR("failed to create raw socket (%s)", strerror(error));
797 /* Set non-blocking mode. */
798 error = set_nonblocking(fd);
803 /* Get ethernet device index. */
804 error = get_ifindex(&netdev->netdev, &ifindex);
809 /* Bind to specific ethernet device. */
810 memset(&sll, 0, sizeof sll);
811 sll.sll_family = AF_PACKET;
812 sll.sll_ifindex = ifindex;
813 sll.sll_protocol = (OVS_FORCE unsigned short int) htons(ETH_P_ALL);
814 if (bind(fd, (struct sockaddr *) &sll, sizeof sll) < 0) {
816 VLOG_ERR("%s: failed to bind raw socket (%s)",
817 netdev_get_name(netdev_), strerror(error));
832 netdev_linux_recv(struct netdev *netdev_, void *data, size_t size)
834 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
836 if (netdev->fd < 0) {
837 /* Device is not listening. */
844 retval = (netdev_->netdev_dev->netdev_class == &netdev_tap_class
845 ? read(netdev->fd, data, size)
846 : recv(netdev->fd, data, size, MSG_TRUNC));
848 return retval <= size ? retval : -EMSGSIZE;
849 } else if (errno != EINTR) {
850 if (errno != EAGAIN) {
851 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
852 netdev_get_name(netdev_), strerror(errno));
859 /* Registers with the poll loop to wake up from the next call to poll_block()
860 * when a packet is ready to be received with netdev_recv() on 'netdev'. */
862 netdev_linux_recv_wait(struct netdev *netdev_)
864 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
865 if (netdev->fd >= 0) {
866 poll_fd_wait(netdev->fd, POLLIN);
870 /* Discards all packets waiting to be received from 'netdev'. */
872 netdev_linux_drain(struct netdev *netdev_)
874 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
875 if (netdev->fd < 0) {
877 } else if (!strcmp(netdev_get_type(netdev_), "tap")) {
879 int error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
880 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
884 drain_fd(netdev->fd, ifr.ifr_qlen);
887 return drain_rcvbuf(netdev->fd);
891 /* Sends 'buffer' on 'netdev'. Returns 0 if successful, otherwise a positive
892 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
893 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
894 * the packet is too big or too small to transmit on the device.
896 * The caller retains ownership of 'buffer' in all cases.
898 * The kernel maintains a packet transmission queue, so the caller is not
899 * expected to do additional queuing of packets. */
901 netdev_linux_send(struct netdev *netdev_, const void *data, size_t size)
903 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
907 if (netdev->fd < 0) {
908 /* Use our AF_PACKET socket to send to this device. */
909 struct sockaddr_ll sll;
916 sock = af_packet_sock();
921 error = get_ifindex(netdev_, &ifindex);
926 /* We don't bother setting most fields in sockaddr_ll because the
927 * kernel ignores them for SOCK_RAW. */
928 memset(&sll, 0, sizeof sll);
929 sll.sll_family = AF_PACKET;
930 sll.sll_ifindex = ifindex;
932 iov.iov_base = CONST_CAST(void *, data);
936 msg.msg_namelen = sizeof sll;
939 msg.msg_control = NULL;
940 msg.msg_controllen = 0;
943 retval = sendmsg(sock, &msg, 0);
945 /* Use the netdev's own fd to send to this device. This is
946 * essential for tap devices, because packets sent to a tap device
947 * with an AF_PACKET socket will loop back to be *received* again
948 * on the tap device. */
949 retval = write(netdev->fd, data, size);
953 /* The Linux AF_PACKET implementation never blocks waiting for room
954 * for packets, instead returning ENOBUFS. Translate this into
955 * EAGAIN for the caller. */
956 if (errno == ENOBUFS) {
958 } else if (errno == EINTR) {
960 } else if (errno != EAGAIN) {
961 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
962 netdev_get_name(netdev_), strerror(errno));
965 } else if (retval != size) {
966 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%zd bytes of "
967 "%zu) on %s", retval, size, netdev_get_name(netdev_));
975 /* Registers with the poll loop to wake up from the next call to poll_block()
976 * when the packet transmission queue has sufficient room to transmit a packet
977 * with netdev_send().
979 * The kernel maintains a packet transmission queue, so the client is not
980 * expected to do additional queuing of packets. Thus, this function is
981 * unlikely to ever be used. It is included for completeness. */
983 netdev_linux_send_wait(struct netdev *netdev_)
985 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
986 if (netdev->fd < 0) {
988 } else if (strcmp(netdev_get_type(netdev_), "tap")) {
989 poll_fd_wait(netdev->fd, POLLOUT);
991 /* TAP device always accepts packets.*/
992 poll_immediate_wake();
996 /* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
997 * otherwise a positive errno value. */
999 netdev_linux_set_etheraddr(struct netdev *netdev_,
1000 const uint8_t mac[ETH_ADDR_LEN])
1002 struct netdev_dev_linux *netdev_dev =
1003 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1005 bool up_again = false;
1007 if (netdev_dev->cache_valid & VALID_ETHERADDR) {
1008 if (netdev_dev->ether_addr_error) {
1009 return netdev_dev->ether_addr_error;
1011 if (eth_addr_equals(netdev_dev->etheraddr, mac)) {
1014 netdev_dev->cache_valid &= ~VALID_ETHERADDR;
1017 /* Tap devices must be brought down before setting the address. */
1018 if (!strcmp(netdev_get_type(netdev_), "tap")) {
1019 enum netdev_flags flags;
1021 if (!netdev_get_flags(netdev_, &flags) && (flags & NETDEV_UP)) {
1022 netdev_turn_flags_off(netdev_, NETDEV_UP, false);
1026 error = set_etheraddr(netdev_get_name(netdev_), mac);
1027 if (!error || error == ENODEV) {
1028 netdev_dev->ether_addr_error = error;
1029 netdev_dev->cache_valid |= VALID_ETHERADDR;
1031 memcpy(netdev_dev->etheraddr, mac, ETH_ADDR_LEN);
1036 netdev_turn_flags_on(netdev_, NETDEV_UP, false);
1042 /* Copies 'netdev''s MAC address into the caller-supplied buffer 'mac'. */
1044 netdev_linux_get_etheraddr(const struct netdev *netdev_,
1045 uint8_t mac[ETH_ADDR_LEN])
1047 struct netdev_dev_linux *netdev_dev =
1048 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1050 if (!(netdev_dev->cache_valid & VALID_ETHERADDR)) {
1051 int error = get_etheraddr(netdev_get_name(netdev_),
1052 netdev_dev->etheraddr);
1054 netdev_dev->ether_addr_error = error;
1055 netdev_dev->cache_valid |= VALID_ETHERADDR;
1058 if (!netdev_dev->ether_addr_error) {
1059 memcpy(mac, netdev_dev->etheraddr, ETH_ADDR_LEN);
1062 return netdev_dev->ether_addr_error;
1065 /* Returns the maximum size of transmitted (and received) packets on 'netdev',
1066 * in bytes, not including the hardware header; thus, this is typically 1500
1067 * bytes for Ethernet devices. */
1069 netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
1071 struct netdev_dev_linux *netdev_dev =
1072 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1073 if (!(netdev_dev->cache_valid & VALID_MTU)) {
1077 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
1078 SIOCGIFMTU, "SIOCGIFMTU");
1080 netdev_dev->netdev_mtu_error = error;
1081 netdev_dev->mtu = ifr.ifr_mtu;
1082 netdev_dev->cache_valid |= VALID_MTU;
1085 if (!netdev_dev->netdev_mtu_error) {
1086 *mtup = netdev_dev->mtu;
1088 return netdev_dev->netdev_mtu_error;
1091 /* Sets the maximum transmission unit (MTU) of the given device using the
1092 * Linux networking ioctl interface.
1095 netdev_linux_set_mtu(const struct netdev *netdev_, int mtu)
1097 struct netdev_dev_linux *netdev_dev =
1098 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1102 if (netdev_dev->cache_valid & VALID_MTU) {
1103 if (netdev_dev->netdev_mtu_error) {
1104 return netdev_dev->netdev_mtu_error;
1106 if (netdev_dev->mtu == mtu) {
1109 netdev_dev->cache_valid &= ~VALID_MTU;
1112 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
1113 SIOCSIFMTU, "SIOCSIFMTU");
1114 if (!error || error == ENODEV) {
1115 netdev_dev->netdev_mtu_error = error;
1116 netdev_dev->mtu = ifr.ifr_mtu;
1117 netdev_dev->cache_valid |= VALID_MTU;
1122 /* Returns the ifindex of 'netdev', if successful, as a positive number.
1123 * On failure, returns a negative errno value. */
1125 netdev_linux_get_ifindex(const struct netdev *netdev)
1129 error = get_ifindex(netdev, &ifindex);
1130 return error ? -error : ifindex;
1134 netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
1136 struct netdev_dev_linux *netdev_dev =
1137 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1139 if (netdev_dev->miimon_interval > 0) {
1140 *carrier = netdev_dev->miimon;
1142 *carrier = (netdev_dev->ifi_flags & IFF_RUNNING) != 0;
1148 static long long int
1149 netdev_linux_get_carrier_resets(const struct netdev *netdev)
1151 return netdev_dev_linux_cast(netdev_get_dev(netdev))->carrier_resets;
1155 netdev_linux_do_miimon(const char *name, int cmd, const char *cmd_name,
1156 struct mii_ioctl_data *data)
1161 memset(&ifr, 0, sizeof ifr);
1162 memcpy(&ifr.ifr_data, data, sizeof *data);
1163 error = netdev_linux_do_ioctl(name, &ifr, cmd, cmd_name);
1164 memcpy(data, &ifr.ifr_data, sizeof *data);
1170 netdev_linux_get_miimon(const char *name, bool *miimon)
1172 struct mii_ioctl_data data;
1177 memset(&data, 0, sizeof data);
1178 error = netdev_linux_do_miimon(name, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
1180 /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
1181 data.reg_num = MII_BMSR;
1182 error = netdev_linux_do_miimon(name, SIOCGMIIREG, "SIOCGMIIREG",
1186 *miimon = !!(data.val_out & BMSR_LSTATUS);
1188 VLOG_WARN_RL(&rl, "%s: failed to query MII", name);
1191 struct ethtool_cmd ecmd;
1193 VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
1196 COVERAGE_INC(netdev_get_ethtool);
1197 memset(&ecmd, 0, sizeof ecmd);
1198 error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
1201 struct ethtool_value eval;
1203 memcpy(&eval, &ecmd, sizeof eval);
1204 *miimon = !!eval.data;
1206 VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
1214 netdev_linux_set_miimon_interval(struct netdev *netdev_,
1215 long long int interval)
1217 struct netdev_dev_linux *netdev_dev;
1219 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev_));
1221 interval = interval > 0 ? MAX(interval, 100) : 0;
1222 if (netdev_dev->miimon_interval != interval) {
1223 netdev_dev->miimon_interval = interval;
1224 timer_set_expired(&netdev_dev->miimon_timer);
1231 netdev_linux_miimon_run(void)
1233 struct shash device_shash;
1234 struct shash_node *node;
1236 shash_init(&device_shash);
1237 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
1238 SHASH_FOR_EACH (node, &device_shash) {
1239 struct netdev_dev_linux *dev = node->data;
1242 if (dev->miimon_interval <= 0 || !timer_expired(&dev->miimon_timer)) {
1246 netdev_linux_get_miimon(dev->netdev_dev.name, &miimon);
1247 if (miimon != dev->miimon) {
1248 dev->miimon = miimon;
1249 netdev_dev_linux_changed(dev, dev->ifi_flags, 0);
1252 timer_set_duration(&dev->miimon_timer, dev->miimon_interval);
1255 shash_destroy(&device_shash);
1259 netdev_linux_miimon_wait(void)
1261 struct shash device_shash;
1262 struct shash_node *node;
1264 shash_init(&device_shash);
1265 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
1266 SHASH_FOR_EACH (node, &device_shash) {
1267 struct netdev_dev_linux *dev = node->data;
1269 if (dev->miimon_interval > 0) {
1270 timer_wait(&dev->miimon_timer);
1273 shash_destroy(&device_shash);
1276 /* Check whether we can we use RTM_GETLINK to get network device statistics.
1277 * In pre-2.6.19 kernels, this was only available if wireless extensions were
1280 check_for_working_netlink_stats(void)
1282 /* Decide on the netdev_get_stats() implementation to use. Netlink is
1283 * preferable, so if that works, we'll use it. */
1284 int ifindex = do_get_ifindex("lo");
1286 VLOG_WARN("failed to get ifindex for lo, "
1287 "obtaining netdev stats from proc");
1290 struct netdev_stats stats;
1291 int error = get_stats_via_netlink(ifindex, &stats);
1293 VLOG_DBG("obtaining netdev stats via rtnetlink");
1296 VLOG_INFO("RTM_GETLINK failed (%s), obtaining netdev stats "
1297 "via proc (you are probably running a pre-2.6.19 "
1298 "kernel)", strerror(error));
/* Exchanges the values of '*a' and '*b'.  Safe when 'a' and 'b' alias. */
static void
swap_uint64(uint64_t *a, uint64_t *b)
{
    const uint64_t first = *a;
    const uint64_t second = *b;

    *a = second;
    *b = first;
}
1313 get_stats_via_vport(const struct netdev *netdev_,
1314 struct netdev_stats *stats)
1316 struct netdev_dev_linux *netdev_dev =
1317 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1319 if (!netdev_dev->vport_stats_error ||
1320 !(netdev_dev->cache_valid & VALID_VPORT_STAT_ERROR)) {
1323 error = netdev_vport_get_stats(netdev_, stats);
1324 if (error && error != ENOENT) {
1325 VLOG_WARN_RL(&rl, "%s: obtaining netdev stats via vport failed "
1326 "(%s)", netdev_get_name(netdev_), strerror(error));
1328 netdev_dev->vport_stats_error = error;
1329 netdev_dev->cache_valid |= VALID_VPORT_STAT_ERROR;
1334 netdev_linux_sys_get_stats(const struct netdev *netdev_,
1335 struct netdev_stats *stats)
1337 static int use_netlink_stats = -1;
1340 if (use_netlink_stats < 0) {
1341 use_netlink_stats = check_for_working_netlink_stats();
1344 if (use_netlink_stats) {
1347 error = get_ifindex(netdev_, &ifindex);
1349 error = get_stats_via_netlink(ifindex, stats);
1352 error = get_stats_via_proc(netdev_get_name(netdev_), stats);
1356 VLOG_WARN_RL(&rl, "%s: linux-sys get stats failed %d",
1357 netdev_get_name(netdev_), error);
1363 /* Retrieves current device stats for 'netdev-linux'. */
1365 netdev_linux_get_stats(const struct netdev *netdev_,
1366 struct netdev_stats *stats)
1368 struct netdev_dev_linux *netdev_dev =
1369 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1370 struct netdev_stats dev_stats;
1373 get_stats_via_vport(netdev_, stats);
1375 error = netdev_linux_sys_get_stats(netdev_, &dev_stats);
1378 if (netdev_dev->vport_stats_error) {
1385 if (netdev_dev->vport_stats_error) {
1386 /* stats not available from OVS then use ioctl stats. */
1389 stats->rx_errors += dev_stats.rx_errors;
1390 stats->tx_errors += dev_stats.tx_errors;
1391 stats->rx_dropped += dev_stats.rx_dropped;
1392 stats->tx_dropped += dev_stats.tx_dropped;
1393 stats->multicast += dev_stats.multicast;
1394 stats->collisions += dev_stats.collisions;
1395 stats->rx_length_errors += dev_stats.rx_length_errors;
1396 stats->rx_over_errors += dev_stats.rx_over_errors;
1397 stats->rx_crc_errors += dev_stats.rx_crc_errors;
1398 stats->rx_frame_errors += dev_stats.rx_frame_errors;
1399 stats->rx_fifo_errors += dev_stats.rx_fifo_errors;
1400 stats->rx_missed_errors += dev_stats.rx_missed_errors;
1401 stats->tx_aborted_errors += dev_stats.tx_aborted_errors;
1402 stats->tx_carrier_errors += dev_stats.tx_carrier_errors;
1403 stats->tx_fifo_errors += dev_stats.tx_fifo_errors;
1404 stats->tx_heartbeat_errors += dev_stats.tx_heartbeat_errors;
1405 stats->tx_window_errors += dev_stats.tx_window_errors;
1410 /* Retrieves current device stats for 'netdev-tap' netdev or
1411 * netdev-internal. */
1413 netdev_tap_get_stats(const struct netdev *netdev_,
1414 struct netdev_stats *stats)
1416 struct netdev_dev_linux *netdev_dev =
1417 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1418 struct netdev_stats dev_stats;
1421 get_stats_via_vport(netdev_, stats);
1423 error = netdev_linux_sys_get_stats(netdev_, &dev_stats);
1425 if (netdev_dev->vport_stats_error) {
1432 /* If this port is an internal port then the transmit and receive stats
1433 * will appear to be swapped relative to the other ports since we are the
1434 * one sending the data, not a remote computer. For consistency, we swap
1435 * them back here. This does not apply if we are getting stats from the
1436 * vport layer because it always tracks stats from the perspective of the
1438 if (netdev_dev->vport_stats_error) {
1440 swap_uint64(&stats->rx_packets, &stats->tx_packets);
1441 swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
1442 swap_uint64(&stats->rx_errors, &stats->tx_errors);
1443 swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
1444 stats->rx_length_errors = 0;
1445 stats->rx_over_errors = 0;
1446 stats->rx_crc_errors = 0;
1447 stats->rx_frame_errors = 0;
1448 stats->rx_fifo_errors = 0;
1449 stats->rx_missed_errors = 0;
1450 stats->tx_aborted_errors = 0;
1451 stats->tx_carrier_errors = 0;
1452 stats->tx_fifo_errors = 0;
1453 stats->tx_heartbeat_errors = 0;
1454 stats->tx_window_errors = 0;
1456 stats->rx_dropped += dev_stats.tx_dropped;
1457 stats->tx_dropped += dev_stats.rx_dropped;
1459 stats->rx_errors += dev_stats.tx_errors;
1460 stats->tx_errors += dev_stats.rx_errors;
1462 stats->multicast += dev_stats.multicast;
1463 stats->collisions += dev_stats.collisions;
1469 netdev_internal_get_stats(const struct netdev *netdev_,
1470 struct netdev_stats *stats)
1472 struct netdev_dev_linux *netdev_dev =
1473 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1475 get_stats_via_vport(netdev_, stats);
1476 return netdev_dev->vport_stats_error;
1480 netdev_internal_set_stats(struct netdev *netdev,
1481 const struct netdev_stats *stats)
1483 struct ovs_vport_stats vport_stats;
1484 struct dpif_linux_vport vport;
1487 vport_stats.rx_packets = stats->rx_packets;
1488 vport_stats.tx_packets = stats->tx_packets;
1489 vport_stats.rx_bytes = stats->rx_bytes;
1490 vport_stats.tx_bytes = stats->tx_bytes;
1491 vport_stats.rx_errors = stats->rx_errors;
1492 vport_stats.tx_errors = stats->tx_errors;
1493 vport_stats.rx_dropped = stats->rx_dropped;
1494 vport_stats.tx_dropped = stats->tx_dropped;
1496 dpif_linux_vport_init(&vport);
1497 vport.cmd = OVS_VPORT_CMD_SET;
1498 vport.name = netdev_get_name(netdev);
1499 vport.stats = &vport_stats;
1501 err = dpif_linux_vport_transact(&vport, NULL, NULL);
1503 /* If the vport layer doesn't know about the device, that doesn't mean it
1504 * doesn't exist (after all were able to open it when netdev_open() was
1505 * called), it just means that it isn't attached and we'll be getting
1506 * stats a different way. */
1507 if (err == ENODEV) {
1515 netdev_linux_read_features(struct netdev_dev_linux *netdev_dev)
1517 struct ethtool_cmd ecmd;
1521 if (netdev_dev->cache_valid & VALID_FEATURES) {
1525 COVERAGE_INC(netdev_get_ethtool);
1526 memset(&ecmd, 0, sizeof ecmd);
1527 error = netdev_linux_do_ethtool(netdev_dev->netdev_dev.name, &ecmd,
1528 ETHTOOL_GSET, "ETHTOOL_GSET");
1533 /* Supported features. */
1534 netdev_dev->supported = 0;
1535 if (ecmd.supported & SUPPORTED_10baseT_Half) {
1536 netdev_dev->supported |= NETDEV_F_10MB_HD;
1538 if (ecmd.supported & SUPPORTED_10baseT_Full) {
1539 netdev_dev->supported |= NETDEV_F_10MB_FD;
1541 if (ecmd.supported & SUPPORTED_100baseT_Half) {
1542 netdev_dev->supported |= NETDEV_F_100MB_HD;
1544 if (ecmd.supported & SUPPORTED_100baseT_Full) {
1545 netdev_dev->supported |= NETDEV_F_100MB_FD;
1547 if (ecmd.supported & SUPPORTED_1000baseT_Half) {
1548 netdev_dev->supported |= NETDEV_F_1GB_HD;
1550 if (ecmd.supported & SUPPORTED_1000baseT_Full) {
1551 netdev_dev->supported |= NETDEV_F_1GB_FD;
1553 if (ecmd.supported & SUPPORTED_10000baseT_Full) {
1554 netdev_dev->supported |= NETDEV_F_10GB_FD;
1556 if (ecmd.supported & SUPPORTED_TP) {
1557 netdev_dev->supported |= NETDEV_F_COPPER;
1559 if (ecmd.supported & SUPPORTED_FIBRE) {
1560 netdev_dev->supported |= NETDEV_F_FIBER;
1562 if (ecmd.supported & SUPPORTED_Autoneg) {
1563 netdev_dev->supported |= NETDEV_F_AUTONEG;
1565 if (ecmd.supported & SUPPORTED_Pause) {
1566 netdev_dev->supported |= NETDEV_F_PAUSE;
1568 if (ecmd.supported & SUPPORTED_Asym_Pause) {
1569 netdev_dev->supported |= NETDEV_F_PAUSE_ASYM;
1572 /* Advertised features. */
1573 netdev_dev->advertised = 0;
1574 if (ecmd.advertising & ADVERTISED_10baseT_Half) {
1575 netdev_dev->advertised |= NETDEV_F_10MB_HD;
1577 if (ecmd.advertising & ADVERTISED_10baseT_Full) {
1578 netdev_dev->advertised |= NETDEV_F_10MB_FD;
1580 if (ecmd.advertising & ADVERTISED_100baseT_Half) {
1581 netdev_dev->advertised |= NETDEV_F_100MB_HD;
1583 if (ecmd.advertising & ADVERTISED_100baseT_Full) {
1584 netdev_dev->advertised |= NETDEV_F_100MB_FD;
1586 if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
1587 netdev_dev->advertised |= NETDEV_F_1GB_HD;
1589 if (ecmd.advertising & ADVERTISED_1000baseT_Full) {
1590 netdev_dev->advertised |= NETDEV_F_1GB_FD;
1592 if (ecmd.advertising & ADVERTISED_10000baseT_Full) {
1593 netdev_dev->advertised |= NETDEV_F_10GB_FD;
1595 if (ecmd.advertising & ADVERTISED_TP) {
1596 netdev_dev->advertised |= NETDEV_F_COPPER;
1598 if (ecmd.advertising & ADVERTISED_FIBRE) {
1599 netdev_dev->advertised |= NETDEV_F_FIBER;
1601 if (ecmd.advertising & ADVERTISED_Autoneg) {
1602 netdev_dev->advertised |= NETDEV_F_AUTONEG;
1604 if (ecmd.advertising & ADVERTISED_Pause) {
1605 netdev_dev->advertised |= NETDEV_F_PAUSE;
1607 if (ecmd.advertising & ADVERTISED_Asym_Pause) {
1608 netdev_dev->advertised |= NETDEV_F_PAUSE_ASYM;
1611 /* Current settings. */
1613 if (speed == SPEED_10) {
1614 netdev_dev->current = ecmd.duplex ? NETDEV_F_10MB_FD : NETDEV_F_10MB_HD;
1615 } else if (speed == SPEED_100) {
1616 netdev_dev->current = ecmd.duplex ? NETDEV_F_100MB_FD : NETDEV_F_100MB_HD;
1617 } else if (speed == SPEED_1000) {
1618 netdev_dev->current = ecmd.duplex ? NETDEV_F_1GB_FD : NETDEV_F_1GB_HD;
1619 } else if (speed == SPEED_10000) {
1620 netdev_dev->current = NETDEV_F_10GB_FD;
1621 } else if (speed == 40000) {
1622 netdev_dev->current = NETDEV_F_40GB_FD;
1623 } else if (speed == 100000) {
1624 netdev_dev->current = NETDEV_F_100GB_FD;
1625 } else if (speed == 1000000) {
1626 netdev_dev->current = NETDEV_F_1TB_FD;
1628 netdev_dev->current = 0;
1631 if (ecmd.port == PORT_TP) {
1632 netdev_dev->current |= NETDEV_F_COPPER;
1633 } else if (ecmd.port == PORT_FIBRE) {
1634 netdev_dev->current |= NETDEV_F_FIBER;
1638 netdev_dev->current |= NETDEV_F_AUTONEG;
1641 /* Peer advertisements. */
1642 netdev_dev->peer = 0; /* XXX */
1645 netdev_dev->cache_valid |= VALID_FEATURES;
1646 netdev_dev->get_features_error = error;
1649 /* Stores the features supported by 'netdev' into each of '*current',
1650 * '*advertised', '*supported', and '*peer' that are non-null. Each value is a
1651 * bitmap of NETDEV_* bits. Returns 0 if successful, otherwise a positive
1654 netdev_linux_get_features(const struct netdev *netdev_,
1655 enum netdev_features *current,
1656 enum netdev_features *advertised,
1657 enum netdev_features *supported,
1658 enum netdev_features *peer)
1660 struct netdev_dev_linux *netdev_dev =
1661 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1663 netdev_linux_read_features(netdev_dev);
1665 if (!netdev_dev->get_features_error) {
1666 *current = netdev_dev->current;
1667 *advertised = netdev_dev->advertised;
1668 *supported = netdev_dev->supported;
1669 *peer = netdev_dev->peer;
1671 return netdev_dev->get_features_error;
1674 /* Set the features advertised by 'netdev' to 'advertise'. */
1676 netdev_linux_set_advertisements(struct netdev *netdev,
1677 enum netdev_features advertise)
1679 struct ethtool_cmd ecmd;
1682 COVERAGE_INC(netdev_get_ethtool);
1683 memset(&ecmd, 0, sizeof ecmd);
1684 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1685 ETHTOOL_GSET, "ETHTOOL_GSET");
1690 ecmd.advertising = 0;
1691 if (advertise & NETDEV_F_10MB_HD) {
1692 ecmd.advertising |= ADVERTISED_10baseT_Half;
1694 if (advertise & NETDEV_F_10MB_FD) {
1695 ecmd.advertising |= ADVERTISED_10baseT_Full;
1697 if (advertise & NETDEV_F_100MB_HD) {
1698 ecmd.advertising |= ADVERTISED_100baseT_Half;
1700 if (advertise & NETDEV_F_100MB_FD) {
1701 ecmd.advertising |= ADVERTISED_100baseT_Full;
1703 if (advertise & NETDEV_F_1GB_HD) {
1704 ecmd.advertising |= ADVERTISED_1000baseT_Half;
1706 if (advertise & NETDEV_F_1GB_FD) {
1707 ecmd.advertising |= ADVERTISED_1000baseT_Full;
1709 if (advertise & NETDEV_F_10GB_FD) {
1710 ecmd.advertising |= ADVERTISED_10000baseT_Full;
1712 if (advertise & NETDEV_F_COPPER) {
1713 ecmd.advertising |= ADVERTISED_TP;
1715 if (advertise & NETDEV_F_FIBER) {
1716 ecmd.advertising |= ADVERTISED_FIBRE;
1718 if (advertise & NETDEV_F_AUTONEG) {
1719 ecmd.advertising |= ADVERTISED_Autoneg;
1721 if (advertise & NETDEV_F_PAUSE) {
1722 ecmd.advertising |= ADVERTISED_Pause;
1724 if (advertise & NETDEV_F_PAUSE_ASYM) {
1725 ecmd.advertising |= ADVERTISED_Asym_Pause;
1727 COVERAGE_INC(netdev_set_ethtool);
1728 return netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1729 ETHTOOL_SSET, "ETHTOOL_SSET");
1732 /* Attempts to set input rate limiting (policing) policy. Returns 0 if
1733 * successful, otherwise a positive errno value. */
1735 netdev_linux_set_policing(struct netdev *netdev,
1736 uint32_t kbits_rate, uint32_t kbits_burst)
1738 struct netdev_dev_linux *netdev_dev =
1739 netdev_dev_linux_cast(netdev_get_dev(netdev));
1740 const char *netdev_name = netdev_get_name(netdev);
1744 kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
1745 : !kbits_burst ? 1000 /* Default to 1000 kbits if 0. */
1746 : kbits_burst); /* Stick with user-specified value. */
1748 if (netdev_dev->cache_valid & VALID_POLICING) {
1749 if (netdev_dev->netdev_policing_error) {
1750 return netdev_dev->netdev_policing_error;
1753 if (netdev_dev->kbits_rate == kbits_rate &&
1754 netdev_dev->kbits_burst == kbits_burst) {
1755 /* Assume that settings haven't changed since we last set them. */
1758 netdev_dev->cache_valid &= ~VALID_POLICING;
1761 COVERAGE_INC(netdev_set_policing);
1762 /* Remove any existing ingress qdisc. */
1763 error = tc_add_del_ingress_qdisc(netdev, false);
1765 VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
1766 netdev_name, strerror(error));
1771 error = tc_add_del_ingress_qdisc(netdev, true);
1773 VLOG_WARN_RL(&rl, "%s: adding policing qdisc failed: %s",
1774 netdev_name, strerror(error));
1778 error = tc_add_policer(netdev, kbits_rate, kbits_burst);
1780 VLOG_WARN_RL(&rl, "%s: adding policing action failed: %s",
1781 netdev_name, strerror(error));
1786 netdev_dev->kbits_rate = kbits_rate;
1787 netdev_dev->kbits_burst = kbits_burst;
1790 if (!error || error == ENODEV) {
1791 netdev_dev->netdev_policing_error = error;
1792 netdev_dev->cache_valid |= VALID_POLICING;
1798 netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
1801 const struct tc_ops **opsp;
1803 for (opsp = tcs; *opsp != NULL; opsp++) {
1804 const struct tc_ops *ops = *opsp;
1805 if (ops->tc_install && ops->ovs_name[0] != '\0') {
1806 sset_add(types, ops->ovs_name);
1812 static const struct tc_ops *
1813 tc_lookup_ovs_name(const char *name)
1815 const struct tc_ops **opsp;
1817 for (opsp = tcs; *opsp != NULL; opsp++) {
1818 const struct tc_ops *ops = *opsp;
1819 if (!strcmp(name, ops->ovs_name)) {
1826 static const struct tc_ops *
1827 tc_lookup_linux_name(const char *name)
1829 const struct tc_ops **opsp;
1831 for (opsp = tcs; *opsp != NULL; opsp++) {
1832 const struct tc_ops *ops = *opsp;
1833 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
1840 static struct tc_queue *
1841 tc_find_queue__(const struct netdev *netdev, unsigned int queue_id,
1844 struct netdev_dev_linux *netdev_dev =
1845 netdev_dev_linux_cast(netdev_get_dev(netdev));
1846 struct tc_queue *queue;
1848 HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev_dev->tc->queues) {
1849 if (queue->queue_id == queue_id) {
/* Returns the queue numbered 'queue_id' on 'netdev', or NULL if none
 * exists.  The hash is derived from 'queue_id' with hash_int(). */
static struct tc_queue *
tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
{
    size_t hash = hash_int(queue_id, 0);

    return tc_find_queue__(netdev, queue_id, hash);
}
1863 netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
1865 struct netdev_qos_capabilities *caps)
1867 const struct tc_ops *ops = tc_lookup_ovs_name(type);
1871 caps->n_queues = ops->n_queues;
1876 netdev_linux_get_qos(const struct netdev *netdev,
1877 const char **typep, struct smap *details)
1879 struct netdev_dev_linux *netdev_dev =
1880 netdev_dev_linux_cast(netdev_get_dev(netdev));
1883 error = tc_query_qdisc(netdev);
1888 *typep = netdev_dev->tc->ops->ovs_name;
1889 return (netdev_dev->tc->ops->qdisc_get
1890 ? netdev_dev->tc->ops->qdisc_get(netdev, details)
1895 netdev_linux_set_qos(struct netdev *netdev,
1896 const char *type, const struct smap *details)
1898 struct netdev_dev_linux *netdev_dev =
1899 netdev_dev_linux_cast(netdev_get_dev(netdev));
1900 const struct tc_ops *new_ops;
1903 new_ops = tc_lookup_ovs_name(type);
1904 if (!new_ops || !new_ops->tc_install) {
1908 error = tc_query_qdisc(netdev);
1913 if (new_ops == netdev_dev->tc->ops) {
1914 return new_ops->qdisc_set ? new_ops->qdisc_set(netdev, details) : 0;
1916 /* Delete existing qdisc. */
1917 error = tc_del_qdisc(netdev);
1921 ovs_assert(netdev_dev->tc == NULL);
1923 /* Install new qdisc. */
1924 error = new_ops->tc_install(netdev, details);
1925 ovs_assert((error == 0) == (netdev_dev->tc != NULL));
1932 netdev_linux_get_queue(const struct netdev *netdev,
1933 unsigned int queue_id, struct smap *details)
1935 struct netdev_dev_linux *netdev_dev =
1936 netdev_dev_linux_cast(netdev_get_dev(netdev));
1939 error = tc_query_qdisc(netdev);
1943 struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1945 ? netdev_dev->tc->ops->class_get(netdev, queue, details)
1951 netdev_linux_set_queue(struct netdev *netdev,
1952 unsigned int queue_id, const struct smap *details)
1954 struct netdev_dev_linux *netdev_dev =
1955 netdev_dev_linux_cast(netdev_get_dev(netdev));
1958 error = tc_query_qdisc(netdev);
1961 } else if (queue_id >= netdev_dev->tc->ops->n_queues
1962 || !netdev_dev->tc->ops->class_set) {
1966 return netdev_dev->tc->ops->class_set(netdev, queue_id, details);
1970 netdev_linux_delete_queue(struct netdev *netdev, unsigned int queue_id)
1972 struct netdev_dev_linux *netdev_dev =
1973 netdev_dev_linux_cast(netdev_get_dev(netdev));
1976 error = tc_query_qdisc(netdev);
1979 } else if (!netdev_dev->tc->ops->class_delete) {
1982 struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1984 ? netdev_dev->tc->ops->class_delete(netdev, queue)
1990 netdev_linux_get_queue_stats(const struct netdev *netdev,
1991 unsigned int queue_id,
1992 struct netdev_queue_stats *stats)
1994 struct netdev_dev_linux *netdev_dev =
1995 netdev_dev_linux_cast(netdev_get_dev(netdev));
1998 error = tc_query_qdisc(netdev);
2001 } else if (!netdev_dev->tc->ops->class_get_stats) {
2004 const struct tc_queue *queue = tc_find_queue(netdev, queue_id);
2006 ? netdev_dev->tc->ops->class_get_stats(netdev, queue, stats)
2012 start_queue_dump(const struct netdev *netdev, struct nl_dump *dump)
2014 struct ofpbuf request;
2015 struct tcmsg *tcmsg;
2017 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
2021 tcmsg->tcm_parent = 0;
2022 nl_dump_start(dump, rtnl_sock, &request);
2023 ofpbuf_uninit(&request);
2028 netdev_linux_dump_queues(const struct netdev *netdev,
2029 netdev_dump_queues_cb *cb, void *aux)
2031 struct netdev_dev_linux *netdev_dev =
2032 netdev_dev_linux_cast(netdev_get_dev(netdev));
2033 struct tc_queue *queue, *next_queue;
2034 struct smap details;
2038 error = tc_query_qdisc(netdev);
2041 } else if (!netdev_dev->tc->ops->class_get) {
2046 smap_init(&details);
2047 HMAP_FOR_EACH_SAFE (queue, next_queue, hmap_node,
2048 &netdev_dev->tc->queues) {
2049 smap_clear(&details);
2051 error = netdev_dev->tc->ops->class_get(netdev, queue, &details);
2053 (*cb)(queue->queue_id, &details, aux);
2058 smap_destroy(&details);
2064 netdev_linux_dump_queue_stats(const struct netdev *netdev,
2065 netdev_dump_queue_stats_cb *cb, void *aux)
2067 struct netdev_dev_linux *netdev_dev =
2068 netdev_dev_linux_cast(netdev_get_dev(netdev));
2069 struct nl_dump dump;
2074 error = tc_query_qdisc(netdev);
2077 } else if (!netdev_dev->tc->ops->class_dump_stats) {
2082 if (!start_queue_dump(netdev, &dump)) {
2085 while (nl_dump_next(&dump, &msg)) {
2086 error = netdev_dev->tc->ops->class_dump_stats(netdev, &msg, cb, aux);
2092 error = nl_dump_done(&dump);
2093 return error ? error : last_error;
2097 netdev_linux_get_in4(const struct netdev *netdev_,
2098 struct in_addr *address, struct in_addr *netmask)
2100 struct netdev_dev_linux *netdev_dev =
2101 netdev_dev_linux_cast(netdev_get_dev(netdev_));
2103 if (!(netdev_dev->cache_valid & VALID_IN4)) {
2106 error = netdev_linux_get_ipv4(netdev_, &netdev_dev->address,
2107 SIOCGIFADDR, "SIOCGIFADDR");
2112 error = netdev_linux_get_ipv4(netdev_, &netdev_dev->netmask,
2113 SIOCGIFNETMASK, "SIOCGIFNETMASK");
2118 netdev_dev->cache_valid |= VALID_IN4;
2120 *address = netdev_dev->address;
2121 *netmask = netdev_dev->netmask;
2122 return address->s_addr == INADDR_ANY ? EADDRNOTAVAIL : 0;
2126 netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
2127 struct in_addr netmask)
2129 struct netdev_dev_linux *netdev_dev =
2130 netdev_dev_linux_cast(netdev_get_dev(netdev_));
2133 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
2135 netdev_dev->cache_valid |= VALID_IN4;
2136 netdev_dev->address = address;
2137 netdev_dev->netmask = netmask;
2138 if (address.s_addr != INADDR_ANY) {
2139 error = do_set_addr(netdev_, SIOCSIFNETMASK,
2140 "SIOCSIFNETMASK", netmask);
/* Parses 'line', one line in the format of /proc/net/if_inet6: 32 hex digits
 * of IPv6 address, four hex fields that are ignored, and an interface name
 * of at most 16 characters.
 *
 * On success stores the address in '*in6' and the name in 'ifname' and
 * returns true.  Returns false if the line does not match; '*in6' and
 * 'ifname' may then be partially written. */
static bool
parse_if_inet6_line(const char *line,
                    struct in6_addr *in6, char ifname[16 + 1])
{
    uint8_t *s6 = in6->s6_addr;
    int n_fields;

#define X8 "%2"SCNx8
    n_fields = sscanf(line,
                      " "X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8
                      "%*x %*x %*x %*x %16s\n",
                      &s6[0], &s6[1], &s6[2], &s6[3],
                      &s6[4], &s6[5], &s6[6], &s6[7],
                      &s6[8], &s6[9], &s6[10], &s6[11],
                      &s6[12], &s6[13], &s6[14], &s6[15],
                      ifname);

    /* 16 address bytes plus the interface name. */
    return n_fields == 17;
}
2162 /* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address (if
2163 * 'in6' is non-null) and returns true. Otherwise, returns false. */
2165 netdev_linux_get_in6(const struct netdev *netdev_, struct in6_addr *in6)
2167 struct netdev_dev_linux *netdev_dev =
2168 netdev_dev_linux_cast(netdev_get_dev(netdev_));
2169 if (!(netdev_dev->cache_valid & VALID_IN6)) {
2173 netdev_dev->in6 = in6addr_any;
2175 file = fopen("/proc/net/if_inet6", "r");
2177 const char *name = netdev_get_name(netdev_);
2178 while (fgets(line, sizeof line, file)) {
2179 struct in6_addr in6_tmp;
2180 char ifname[16 + 1];
2181 if (parse_if_inet6_line(line, &in6_tmp, ifname)
2182 && !strcmp(name, ifname))
2184 netdev_dev->in6 = in6_tmp;
2190 netdev_dev->cache_valid |= VALID_IN6;
2192 *in6 = netdev_dev->in6;
/* Initializes '*sa' as an AF_INET socket address for 'addr' with port 0.
 * All remaining bytes of '*sa' are zeroed. */
static void
make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
{
    struct sockaddr_in in4;

    memset(&in4, 0, sizeof in4);
    in4.sin_port = 0;
    in4.sin_addr = addr;
    in4.sin_family = AF_INET;

    memset(sa, 0, sizeof *sa);
    memcpy(sa, &in4, sizeof in4);
}
2210 do_set_addr(struct netdev *netdev,
2211 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
2214 ovs_strzcpy(ifr.ifr_name, netdev_get_name(netdev), sizeof ifr.ifr_name);
2215 make_in4_sockaddr(&ifr.ifr_addr, addr);
2217 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
2221 /* Adds 'router' as a default IP gateway. */
2223 netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
2225 struct in_addr any = { INADDR_ANY };
2229 memset(&rt, 0, sizeof rt);
2230 make_in4_sockaddr(&rt.rt_dst, any);
2231 make_in4_sockaddr(&rt.rt_gateway, router);
2232 make_in4_sockaddr(&rt.rt_genmask, any);
2233 rt.rt_flags = RTF_UP | RTF_GATEWAY;
2234 error = ioctl(af_inet_sock, SIOCADDRT, &rt) < 0 ? errno : 0;
2236 VLOG_WARN("ioctl(SIOCADDRT): %s", strerror(error));
2242 netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
2245 static const char fn[] = "/proc/net/route";
2250 *netdev_name = NULL;
2251 stream = fopen(fn, "r");
2252 if (stream == NULL) {
2253 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
2258 while (fgets(line, sizeof line, stream)) {
2261 ovs_be32 dest, gateway, mask;
2262 int refcnt, metric, mtu;
2263 unsigned int flags, use, window, irtt;
2266 "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
2268 iface, &dest, &gateway, &flags, &refcnt,
2269 &use, &metric, &mask, &mtu, &window, &irtt) != 11) {
2271 VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
2275 if (!(flags & RTF_UP)) {
2276 /* Skip routes that aren't up. */
2280 /* The output of 'dest', 'mask', and 'gateway' were given in
2281 * network byte order, so we don't need need any endian
2282 * conversions here. */
2283 if ((dest & mask) == (host->s_addr & mask)) {
2285 /* The host is directly reachable. */
2286 next_hop->s_addr = 0;
2288 /* To reach the host, we must go through a gateway. */
2289 next_hop->s_addr = gateway;
2291 *netdev_name = xstrdup(iface);
2303 netdev_linux_get_status(const struct netdev *netdev, struct smap *smap)
2305 struct netdev_dev_linux *netdev_dev;
2308 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
2309 if (!(netdev_dev->cache_valid & VALID_DRVINFO)) {
2310 struct ethtool_cmd *cmd = (struct ethtool_cmd *) &netdev_dev->drvinfo;
2312 COVERAGE_INC(netdev_get_ethtool);
2313 memset(&netdev_dev->drvinfo, 0, sizeof netdev_dev->drvinfo);
2314 error = netdev_linux_do_ethtool(netdev_dev->netdev_dev.name,
2317 "ETHTOOL_GDRVINFO");
2319 netdev_dev->cache_valid |= VALID_DRVINFO;
2324 smap_add(smap, "driver_name", netdev_dev->drvinfo.driver);
2325 smap_add(smap, "driver_version", netdev_dev->drvinfo.version);
2326 smap_add(smap, "firmware_version", netdev_dev->drvinfo.fw_version);
2332 netdev_internal_get_status(const struct netdev *netdev OVS_UNUSED,
2335 smap_add(smap, "driver_name", "openvswitch");
2339 /* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
2340 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
2341 * returns 0. Otherwise, it returns a positive errno value; in particular,
2342 * ENXIO indicates that there is not ARP table entry for 'ip' on 'netdev'. */
2344 netdev_linux_arp_lookup(const struct netdev *netdev,
2345 ovs_be32 ip, uint8_t mac[ETH_ADDR_LEN])
2348 struct sockaddr_in sin;
2351 memset(&r, 0, sizeof r);
2352 memset(&sin, 0, sizeof sin);
2353 sin.sin_family = AF_INET;
2354 sin.sin_addr.s_addr = ip;
2356 memcpy(&r.arp_pa, &sin, sizeof sin);
2357 r.arp_ha.sa_family = ARPHRD_ETHER;
2359 ovs_strzcpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
2360 COVERAGE_INC(netdev_arp_lookup);
2361 retval = ioctl(af_inet_sock, SIOCGARP, &r) < 0 ? errno : 0;
2363 memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
2364 } else if (retval != ENXIO) {
2365 VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
2366 netdev_get_name(netdev), IP_ARGS(ip), strerror(retval));
2372 nd_to_iff_flags(enum netdev_flags nd)
2375 if (nd & NETDEV_UP) {
2378 if (nd & NETDEV_PROMISC) {
2385 iff_to_nd_flags(int iff)
2387 enum netdev_flags nd = 0;
2391 if (iff & IFF_PROMISC) {
2392 nd |= NETDEV_PROMISC;
2398 netdev_linux_update_flags(struct netdev *netdev, enum netdev_flags off,
2399 enum netdev_flags on, enum netdev_flags *old_flagsp)
2401 struct netdev_dev_linux *netdev_dev;
2402 int old_flags, new_flags;
2405 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
2406 old_flags = netdev_dev->ifi_flags;
2407 *old_flagsp = iff_to_nd_flags(old_flags);
2408 new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
2409 if (new_flags != old_flags) {
2410 error = set_flags(netdev, new_flags);
2411 get_flags(&netdev_dev->netdev_dev, &netdev_dev->ifi_flags);
2417 netdev_linux_change_seq(const struct netdev *netdev)
2419 return netdev_dev_linux_cast(netdev_get_dev(netdev))->change_seq;
/* Expands to an initializer for a 'struct netdev_class' that shares the
 * netdev-linux implementation.  Only the class name, the create function,
 * the stats getter/setter, the features getter and the status getter vary
 * between the classes defined with it.  Entries are positional, so their
 * order must match the member order of 'struct netdev_class'. */
#define NETDEV_LINUX_CLASS(NAME, CREATE, GET_STATS, SET_STATS,  \
                           GET_FEATURES, GET_STATUS)            \
{                                                               \
    NAME,                                                       \
                                                                \
    /* Provider-wide hooks. */                                  \
    netdev_linux_init,                                          \
    netdev_linux_run,                                           \
    netdev_linux_wait,                                          \
                                                                \
    /* Device lifetime and configuration. */                    \
    CREATE,                                                     \
    netdev_linux_destroy,                                       \
    NULL,                       /* get_config */                \
    NULL,                       /* set_config */                \
    NULL,                       /* get_tunnel_config */         \
                                                                \
    netdev_linux_open,                                          \
    netdev_linux_close,                                         \
                                                                \
    /* Receive path. */                                         \
    netdev_linux_listen,                                        \
    netdev_linux_recv,                                          \
    netdev_linux_recv_wait,                                     \
    netdev_linux_drain,                                         \
                                                                \
    /* Transmit path. */                                        \
    netdev_linux_send,                                          \
    netdev_linux_send_wait,                                     \
                                                                \
    /* Link properties and statistics. */                       \
    netdev_linux_set_etheraddr,                                 \
    netdev_linux_get_etheraddr,                                 \
    netdev_linux_get_mtu,                                       \
    netdev_linux_set_mtu,                                       \
    netdev_linux_get_ifindex,                                   \
    netdev_linux_get_carrier,                                   \
    netdev_linux_get_carrier_resets,                            \
    netdev_linux_set_miimon_interval,                           \
    GET_STATS,                                                  \
    SET_STATS,                                                  \
                                                                \
    GET_FEATURES,                                               \
    netdev_linux_set_advertisements,                            \
                                                                \
    /* Policing and QoS. */                                     \
    netdev_linux_set_policing,                                  \
    netdev_linux_get_qos_types,                                 \
    netdev_linux_get_qos_capabilities,                          \
    netdev_linux_get_qos,                                       \
    netdev_linux_set_qos,                                       \
    netdev_linux_get_queue,                                     \
    netdev_linux_set_queue,                                     \
    netdev_linux_delete_queue,                                  \
    netdev_linux_get_queue_stats,                               \
    netdev_linux_dump_queues,                                   \
    netdev_linux_dump_queue_stats,                              \
                                                                \
    /* IP addressing and routing. */                            \
    netdev_linux_get_in4,                                       \
    netdev_linux_set_in4,                                       \
    netdev_linux_get_in6,                                       \
    netdev_linux_add_router,                                    \
    netdev_linux_get_next_hop,                                  \
    GET_STATUS,                                                 \
    netdev_linux_arp_lookup,                                    \
                                                                \
    netdev_linux_update_flags,                                  \
                                                                \
    netdev_linux_change_seq                                     \
}
2487 const struct netdev_class netdev_linux_class =
2490 netdev_linux_create,
2491 netdev_linux_get_stats,
2492 NULL, /* set_stats */
2493 netdev_linux_get_features,
2494 netdev_linux_get_status);
2496 const struct netdev_class netdev_tap_class =
2499 netdev_linux_create_tap,
2500 netdev_tap_get_stats,
2501 NULL, /* set_stats */
2502 netdev_linux_get_features,
2503 netdev_linux_get_status);
2505 const struct netdev_class netdev_internal_class =
2508 netdev_linux_create,
2509 netdev_internal_get_stats,
2510 netdev_internal_set_stats,
2511 NULL, /* get_features */
2512 netdev_internal_get_status);
2514 /* HTB traffic control class. */
/* Largest class minor number that the HTB code converts to a queue ID (the
 * queue ID is the minor number minus 1); also used as 'n_queues' in
 * tc_ops_htb. */
2516 #define HTB_N_QUEUES 0xf000
2520 unsigned int max_rate; /* In bytes/s. */
/* Embedded generic queue; htb_class_cast__() converts a pointer to it back
 * into the containing 'struct htb_class'. */
2524 struct tc_queue tc_queue;
2525 unsigned int min_rate; /* In bytes/s. */
2526 unsigned int max_rate; /* In bytes/s. */
2527 unsigned int burst; /* In bytes. */
2528 unsigned int priority; /* Lower values are higher priorities. */
/* Returns the 'struct htb' that contains the 'tc' pointed to by 'netdev''s
 * underlying netdev_dev_linux.
 * NOTE(review): nothing here checks that netdev_dev->tc really is embedded in
 * a 'struct htb' -- confirm that callers only use this while HTB is the
 * installed tc. */
2532 htb_get__(const struct netdev *netdev)
2534 struct netdev_dev_linux *netdev_dev =
2535 netdev_dev_linux_cast(netdev_get_dev(netdev));
2536 return CONTAINER_OF(netdev_dev->tc, struct htb, tc);
/* Allocates a new 'struct htb', initializes its embedded 'tc' with
 * tc_ops_htb, records 'max_rate' in it, and makes it the tc of 'netdev''s
 * underlying device.
 * NOTE(review): 'max_rate' is uint64_t here while the 'max_rate' members
 * declared above are 'unsigned int' -- confirm that truncation is intended. */
2540 htb_install__(struct netdev *netdev, uint64_t max_rate)
2542 struct netdev_dev_linux *netdev_dev =
2543 netdev_dev_linux_cast(netdev_get_dev(netdev));
2546 htb = xmalloc(sizeof *htb);
2547 tc_init(&htb->tc, &tc_ops_htb);
2548 htb->max_rate = max_rate;
2550 netdev_dev->tc = &htb->tc;
2553 /* Create an HTB qdisc.
 *
 * Any qdisc already on 'netdev' is deleted with tc_del_qdisc() first.  The
 * new qdisc is requested with RTM_NEWQDISC (NLM_F_EXCL | NLM_F_CREATE),
 * handle 1:0 and parent TC_H_ROOT, with 'rate2quantum' set to 10 in the
 * TCA_HTB_INIT options.  Returns the result of tc_transact().
 *
2555 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
2557 htb_setup_qdisc__(struct netdev *netdev)
2560 struct tc_htb_glob opt;
2561 struct ofpbuf request;
2562 struct tcmsg *tcmsg;
2564 tc_del_qdisc(netdev);
2566 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2567 NLM_F_EXCL | NLM_F_CREATE, &request);
2571 tcmsg->tcm_handle = tc_make_handle(1, 0);
2572 tcmsg->tcm_parent = TC_H_ROOT;
2574 nl_msg_put_string(&request, TCA_KIND, "htb");
2576 memset(&opt, 0, sizeof opt);
2577 opt.rate2quantum = 10;
2581 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2582 nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
2583 nl_msg_end_nested(&request, opt_offset);
2585 return tc_transact(&request, NULL);
2588 /* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
2589 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
/* The rate tables and buffer sizes sent to the kernel are computed from the
 * device MTU, so the MTU is fetched first and a warning is logged when that
 * fails.  The request is RTM_NEWTCLASS with NLM_F_CREATE and carries
 * TCA_HTB_PARMS plus the rate and ceil tables; a failed transaction is logged
 * together with the class parameters. */
2591 htb_setup_class__(struct netdev *netdev, unsigned int handle,
2592 unsigned int parent, struct htb_class *class)
2595 struct tc_htb_opt opt;
2596 struct ofpbuf request;
2597 struct tcmsg *tcmsg;
2601 error = netdev_get_mtu(netdev, &mtu);
2603 VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU",
2604 netdev_get_name(netdev));
2608 memset(&opt, 0, sizeof opt);
2609 tc_fill_rate(&opt.rate, class->min_rate, mtu);
2610 tc_fill_rate(&opt.ceil, class->max_rate, mtu);
2611 opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
2612 opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
2613 opt.prio = class->priority;
2615 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
2619 tcmsg->tcm_handle = handle;
2620 tcmsg->tcm_parent = parent;
2622 nl_msg_put_string(&request, TCA_KIND, "htb");
2623 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2624 nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
2625 tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
2626 tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
2627 nl_msg_end_nested(&request, opt_offset);
2629 error = tc_transact(&request, NULL);
2631 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
2632 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
2633 netdev_get_name(netdev),
2634 tc_get_major(handle), tc_get_minor(handle),
2635 tc_get_major(parent), tc_get_minor(parent),
2636 class->min_rate, class->max_rate,
2637 class->burst, class->priority, strerror(error));
2642 /* Parses Netlink attributes in 'nl_options' for HTB parameters and stores a
2643 * description of them into '*class'. The description complies with the
2644 * specification given in the vswitch database documentation for linux-htb
/* Requires a TCA_HTB_PARMS attribute that is at least as long as a
 * 'struct tc_htb_opt'; a rate-limited warning is logged when the nested
 * attributes cannot be parsed.  min_rate, max_rate and priority are copied
 * from the rate, ceil and prio fields; burst is the 'buffer' tick count
 * converted to bytes at the class rate. */
2647 htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
2649 static const struct nl_policy tca_htb_policy[] = {
2650 [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
2651 .min_len = sizeof(struct tc_htb_opt) },
2654 struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
2655 const struct tc_htb_opt *htb;
2657 if (!nl_parse_nested(nl_options, tca_htb_policy,
2658 attrs, ARRAY_SIZE(tca_htb_policy))) {
2659 VLOG_WARN_RL(&rl, "failed to parse HTB class options");
2663 htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
2664 class->min_rate = htb->rate.rate;
2665 class->max_rate = htb->ceil.rate;
2666 class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
2667 class->priority = htb->prio;
/* Parses the class message 'tcmsg' with tc_parse_class(), passing 'stats'
 * through.  If that succeeds and 'queue_id' is nonnull, a handle of the form
 * 1:N with 0 < N <= HTB_N_QUEUES is reported as queue ID N - 1.  If 'options'
 * is nonnull, the class options are then parsed into it with
 * htb_parse_tca_options__(). */
2672 htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2673 struct htb_class *options,
2674 struct netdev_queue_stats *stats)
2676 struct nlattr *nl_options;
2677 unsigned int handle;
2680 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2681 if (!error && queue_id) {
2682 unsigned int major = tc_get_major(handle);
2683 unsigned int minor = tc_get_minor(handle);
2684 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2685 *queue_id = minor - 1;
2690 if (!error && options) {
2691 error = htb_parse_tca_options__(nl_options, options);
/* Fills in '*hc' for the qdisc-level HTB class from 'details'.
 *
 * The "max-rate" value is parsed as a decimal number and divided by 8 (the
 * stored rates are in bytes/s).  When the key is absent or the result is 0,
 * the rate is instead derived from the device's current features with
 * netdev_features_to_bps(), passing 100 * 1000 * 1000 as its second argument
 * (presumably the fallback bit rate -- confirm).  'min_rate' is set equal to
 * 'max_rate'. */
2697 htb_parse_qdisc_details__(struct netdev *netdev,
2698 const struct smap *details, struct htb_class *hc)
2700 const char *max_rate_s;
2702 max_rate_s = smap_get(details, "max-rate");
2703 hc->max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
2704 if (!hc->max_rate) {
2705 enum netdev_features current;
2707 netdev_get_features(netdev, &current, NULL, NULL, NULL);
2708 hc->max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
2710 hc->min_rate = hc->max_rate;
/* Converts the "min-rate", "max-rate", "burst" and "priority" keys in
 * 'details' into '*hc'.  The rates and the burst are parsed as decimal
 * numbers and divided by 8.  'min_rate' is raised to at least the device MTU
 * and then capped at the qdisc's max_rate; 'max_rate' is raised to at least
 * 'min_rate' and then capped at the qdisc's max_rate; 'burst' is raised to at
 * least the MTU plus 64.  A warning is logged when the MTU is unavailable. */
2716 htb_parse_class_details__(struct netdev *netdev,
2717 const struct smap *details, struct htb_class *hc)
2719 const struct htb *htb = htb_get__(netdev);
2720 const char *min_rate_s = smap_get(details, "min-rate");
2721 const char *max_rate_s = smap_get(details, "max-rate");
2722 const char *burst_s = smap_get(details, "burst");
2723 const char *priority_s = smap_get(details, "priority");
2726 error = netdev_get_mtu(netdev, &mtu);
2728 VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU",
2729 netdev_get_name(netdev));
2733 /* HTB requires at least an mtu sized min-rate to send any traffic even
2734 * on uncongested links. */
2735 hc->min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
2736 hc->min_rate = MAX(hc->min_rate, mtu);
2737 hc->min_rate = MIN(hc->min_rate, htb->max_rate);
2740 hc->max_rate = (max_rate_s
2741 ? strtoull(max_rate_s, NULL, 10) / 8
2743 hc->max_rate = MAX(hc->max_rate, hc->min_rate);
2744 hc->max_rate = MIN(hc->max_rate, htb->max_rate);
2748 * According to hints in the documentation that I've read, it is important
2749 * that 'burst' be at least as big as the largest frame that might be
2750 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
2751 * but having it a bit too small is a problem. Since netdev_get_mtu()
2752 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
2753 * the MTU. We actually add 64, instead of 14, as a guard against
2754 * additional headers get tacked on somewhere that we're not aware of. */
2755 hc->burst = burst_s ? strtoull(burst_s, NULL, 10) / 8 : 0;
2756 hc->burst = MAX(hc->burst, mtu + 64);
2759 hc->priority = priority_s ? strtoul(priority_s, NULL, 10) : 0;
/* Asks the kernel about class 'handle' under 'parent' with tc_query_class()
 * and parses the reply with htb_parse_tcmsg__() into 'options' and 'stats'
 * (no queue ID is requested).  The reply buffer is deleted after parsing. */
2765 htb_query_class__(const struct netdev *netdev, unsigned int handle,
2766 unsigned int parent, struct htb_class *options,
2767 struct netdev_queue_stats *stats)
2769 struct ofpbuf *reply;
2772 error = tc_query_class(netdev, handle, parent, &reply);
2774 error = htb_parse_tcmsg__(reply, NULL, options, stats);
2775 ofpbuf_delete(reply);
/* Installs HTB on 'netdev': sets up the qdisc, builds the qdisc-level class
 * parameters from 'details', creates class 1:0xfffe under 1:0 with them, and
 * records the resulting max_rate with htb_install__(). */
2781 htb_tc_install(struct netdev *netdev, const struct smap *details)
2785 error = htb_setup_qdisc__(netdev);
2787 struct htb_class hc;
2789 htb_parse_qdisc_details__(netdev, details, &hc);
2790 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2791 tc_make_handle(1, 0), &hc);
2793 htb_install__(netdev, hc.max_rate);
/* Returns the 'struct htb_class' in which 'queue' is embedded as the
 * 'tc_queue' member. */
2799 static struct htb_class *
2800 htb_class_cast__(const struct tc_queue *queue)
2802 return CONTAINER_OF(queue, struct htb_class, tc_queue);
/* Copies the parameters in '*hc' into the htb_class kept for 'queue_id' on
 * 'netdev'.  The queue is looked up with tc_find_queue__(); the code below
 * can also allocate a new htb_class and insert it into htb->tc.queues
 * (presumably when the lookup finds nothing -- confirm). */
2806 htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
2807 const struct htb_class *hc)
2809 struct htb *htb = htb_get__(netdev);
2810 size_t hash = hash_int(queue_id, 0);
2811 struct tc_queue *queue;
2812 struct htb_class *hcp;
2814 queue = tc_find_queue__(netdev, queue_id, hash);
2816 hcp = htb_class_cast__(queue);
2818 hcp = xmalloc(sizeof *hcp);
2819 queue = &hcp->tc_queue;
2820 queue->queue_id = queue_id;
2821 hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
2824 hcp->min_rate = hc->min_rate;
2825 hcp->max_rate = hc->max_rate;
2826 hcp->burst = hc->burst;
2827 hcp->priority = hc->priority;
/* Rebuilds the in-memory HTB state for 'netdev' from the kernel: queries
 * class 1:0xfffe for the qdisc-level max_rate, installs a 'struct htb' with
 * it, then dumps the classes and records every one that
 * htb_parse_tcmsg__() parses successfully.  'nlmsg' is unused. */
2831 htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
2834 struct nl_dump dump;
2835 struct htb_class hc;
2837 /* Get qdisc options. */
2839 htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
2840 htb_install__(netdev, hc.max_rate);
2843 if (!start_queue_dump(netdev, &dump)) {
2846 while (nl_dump_next(&dump, &msg)) {
2847 unsigned int queue_id;
2849 if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
2850 htb_update_queue__(netdev, queue_id, &hc);
2853 nl_dump_done(&dump);
/* Tears down the 'struct htb' that contains 'tc', removing every htb_class
 * from its queue map.  The _SAFE iterator is used because entries are
 * removed while iterating. */
2859 htb_tc_destroy(struct tc *tc)
2861 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
2862 struct htb_class *hc, *next;
2864 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &htb->tc.queues) {
2865 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
/* Adds a "max-rate" entry to 'details' holding 8 times the qdisc's stored
 * max_rate (the stored value is in bytes/s). */
2873 htb_qdisc_get(const struct netdev *netdev, struct smap *details)
2875 const struct htb *htb = htb_get__(netdev);
2876 smap_add_format(details, "max-rate", "%llu", 8ULL * htb->max_rate);
/* Reconfigures the qdisc-level class 1:0xfffe (parent 1:0) from 'details'
 * and stores the new max_rate in the netdev's 'struct htb'. */
2881 htb_qdisc_set(struct netdev *netdev, const struct smap *details)
2883 struct htb_class hc;
2886 htb_parse_qdisc_details__(netdev, details, &hc);
2887 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2888 tc_make_handle(1, 0), &hc);
2890 htb_get__(netdev)->max_rate = hc.max_rate;
/* Describes 'queue' in 'details'.  The keys this function can add are
 * "min-rate", "max-rate" (only when it differs from min-rate), "burst" and
 * "priority".  Rates and burst are multiplied by 8 relative to the stored
 * values.  'netdev' is unused. */
2896 htb_class_get(const struct netdev *netdev OVS_UNUSED,
2897 const struct tc_queue *queue, struct smap *details)
2899 const struct htb_class *hc = htb_class_cast__(queue);
2901 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
2902 if (hc->min_rate != hc->max_rate) {
2903 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
2905 smap_add_format(details, "burst", "%llu", 8ULL * hc->burst);
2907 smap_add_format(details, "priority", "%u", hc->priority);
/* Configures queue 'queue_id' from 'details': parses the details, sets up
 * kernel class 1:(queue_id + 1) under parent 1:0xfffe, and updates the
 * in-memory queue with the parsed parameters. */
2913 htb_class_set(struct netdev *netdev, unsigned int queue_id,
2914 const struct smap *details)
2916 struct htb_class hc;
2919 error = htb_parse_class_details__(netdev, details, &hc);
2924 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
2925 tc_make_handle(1, 0xfffe), &hc);
2930 htb_update_queue__(netdev, queue_id, &hc);
/* Deletes kernel class 1:(queue_id + 1) for 'queue' and removes the queue
 * from the netdev's HTB queue map. */
2935 htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
2937 struct htb_class *hc = htb_class_cast__(queue);
2938 struct htb *htb = htb_get__(netdev);
2941 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
2943 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
/* Retrieves statistics for 'queue' into '*stats' by querying kernel class
 * 1:(queue_id + 1) under parent 1:0xfffe; class options are not requested. */
2950 htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
2951 struct netdev_queue_stats *stats)
2953 return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
2954 tc_make_handle(1, 0xfffe), NULL, stats);
/* Parses the class message 'nlmsg' for its handle and statistics.  When the
 * handle is 1:N with 0 < N <= HTB_N_QUEUES, invokes 'cb' with queue ID N - 1,
 * the statistics and 'aux'.  'netdev' is unused. */
2958 htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
2959 const struct ofpbuf *nlmsg,
2960 netdev_dump_queue_stats_cb *cb, void *aux)
2962 struct netdev_queue_stats stats;
2963 unsigned int handle, major, minor;
2966 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
2971 major = tc_get_major(handle);
2972 minor = tc_get_minor(handle);
2973 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2974 (*cb)(minor - 1, &stats, aux);
/* Operations table for the HTB implementation: Linux qdisc name "htb", OVS
 * QoS type "linux-htb", and HTB_N_QUEUES as the queue count. */
2979 static const struct tc_ops tc_ops_htb = {
2980 "htb", /* linux_name */
2981 "linux-htb", /* ovs_name */
2982 HTB_N_QUEUES, /* n_queues */
2991 htb_class_get_stats,
2992 htb_class_dump_stats
2995 /* "linux-hfsc" traffic control class. */
/* Largest class minor number that the HFSC code converts to a queue ID (the
 * queue ID is the minor number minus 1); also used as 'n_queues' in
 * tc_ops_hfsc. */
2997 #define HFSC_N_QUEUES 0xf000
/* Embedded generic queue; hfsc_class_cast__() converts a pointer to it back
 * into the containing 'struct hfsc_class'. */
3005 struct tc_queue tc_queue;
/* Returns the 'struct hfsc' that contains the 'tc' pointed to by 'netdev''s
 * underlying netdev_dev_linux.
 * NOTE(review): nothing here checks that netdev_dev->tc really is embedded in
 * a 'struct hfsc' -- confirm that callers only use this while HFSC is the
 * installed tc. */
3010 static struct hfsc *
3011 hfsc_get__(const struct netdev *netdev)
3013 struct netdev_dev_linux *netdev_dev;
3014 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
3015 return CONTAINER_OF(netdev_dev->tc, struct hfsc, tc);
/* Returns the 'struct hfsc_class' in which 'queue' is embedded as the
 * 'tc_queue' member. */
3018 static struct hfsc_class *
3019 hfsc_class_cast__(const struct tc_queue *queue)
3021 return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
/* Allocates a new 'struct hfsc', initializes its embedded 'tc' with
 * tc_ops_hfsc, records 'max_rate' in it, and makes it the tc of 'netdev''s
 * underlying device. */
3025 hfsc_install__(struct netdev *netdev, uint32_t max_rate)
3027 struct netdev_dev_linux * netdev_dev;
3030 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
3031 hfsc = xmalloc(sizeof *hfsc);
3032 tc_init(&hfsc->tc, &tc_ops_hfsc);
3033 hfsc->max_rate = max_rate;
3034 netdev_dev->tc = &hfsc->tc;
/* Copies min_rate and max_rate from '*hc' into the hfsc_class kept for
 * 'queue_id' on 'netdev'.  The queue is looked up with tc_find_queue__(); the
 * code below can also allocate a new hfsc_class and insert it into
 * hfsc->tc.queues (presumably when the lookup finds nothing -- confirm). */
3038 hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
3039 const struct hfsc_class *hc)
3043 struct hfsc_class *hcp;
3044 struct tc_queue *queue;
3046 hfsc = hfsc_get__(netdev);
3047 hash = hash_int(queue_id, 0);
3049 queue = tc_find_queue__(netdev, queue_id, hash);
3051 hcp = hfsc_class_cast__(queue);
3053 hcp = xmalloc(sizeof *hcp);
3054 queue = &hcp->tc_queue;
3055 queue->queue_id = queue_id;
3056 hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
3059 hcp->min_rate = hc->min_rate;
3060 hcp->max_rate = hc->max_rate;
/* Extracts the TCA_HFSC_RSC, TCA_HFSC_FSC and TCA_HFSC_USC service curves
 * nested in 'nl_options' and converts them into '*class'.
 *
 * Only a restricted form is accepted; each of the following is logged with a
 * rate-limited warning: any curve with nonzero 'm1' or 'd', an RSC 'm2' that
 * differs from the FSC 'm2', or an RSC 'm2' greater than the USC 'm2'.
 * Otherwise min_rate is taken from the FSC 'm2' and max_rate from the USC
 * 'm2'. */
3064 hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
3066 const struct tc_service_curve *rsc, *fsc, *usc;
3067 static const struct nl_policy tca_hfsc_policy[] = {
3069 .type = NL_A_UNSPEC,
3071 .min_len = sizeof(struct tc_service_curve),
3074 .type = NL_A_UNSPEC,
3076 .min_len = sizeof(struct tc_service_curve),
3079 .type = NL_A_UNSPEC,
3081 .min_len = sizeof(struct tc_service_curve),
3084 struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];
3086 if (!nl_parse_nested(nl_options, tca_hfsc_policy,
3087 attrs, ARRAY_SIZE(tca_hfsc_policy))) {
3088 VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
3092 rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
3093 fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
3094 usc = nl_attr_get(attrs[TCA_HFSC_USC]);
3096 if (rsc->m1 != 0 || rsc->d != 0 ||
3097 fsc->m1 != 0 || fsc->d != 0 ||
3098 usc->m1 != 0 || usc->d != 0) {
3099 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3100 "Non-linear service curves are not supported.");
3104 if (rsc->m2 != fsc->m2) {
3105 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3106 "Real-time service curves are not supported ");
3110 if (rsc->m2 > usc->m2) {
3111 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3112 "Min-rate service curve is greater than "
3113 "the max-rate service curve.");
3117 class->min_rate = fsc->m2;
3118 class->max_rate = usc->m2;
/* Parses the class message 'tcmsg' with tc_parse_class(), passing 'stats'
 * through.  A handle of the form 1:N with 0 < N <= HFSC_N_QUEUES is reported
 * through '*queue_id' as N - 1, and the class options are parsed into
 * 'options' with hfsc_parse_tca_options__(). */
3123 hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
3124 struct hfsc_class *options,
3125 struct netdev_queue_stats *stats)
3128 unsigned int handle;
3129 struct nlattr *nl_options;
3131 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
3137 unsigned int major, minor;
3139 major = tc_get_major(handle);
3140 minor = tc_get_minor(handle);
3141 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3142 *queue_id = minor - 1;
3149 error = hfsc_parse_tca_options__(nl_options, options);
/* Asks the kernel about class 'handle' under 'parent' with tc_query_class()
 * and parses the reply with hfsc_parse_tcmsg__() into 'options' and 'stats'
 * (no queue ID is requested).  The reply buffer is deleted after parsing. */
3156 hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
3157 unsigned int parent, struct hfsc_class *options,
3158 struct netdev_queue_stats *stats)
3161 struct ofpbuf *reply;
3163 error = tc_query_class(netdev, handle, parent, &reply);
3168 error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
3169 ofpbuf_delete(reply);
/* Fills in '*class' for the qdisc-level HFSC class from 'details'.
 *
 * The "max-rate" value is parsed as a decimal number and divided by 8.  The
 * code can instead derive the rate from the device's current features with
 * netdev_features_to_bps(), passing 100 * 1000 * 1000 as its second argument
 * (presumably when "max-rate" is absent or zero, as in the HTB counterpart
 * -- confirm).  Both 'min_rate' and 'max_rate' are set to the result. */
3174 hfsc_parse_qdisc_details__(struct netdev *netdev, const struct smap *details,
3175 struct hfsc_class *class)
3178 const char *max_rate_s;
3180 max_rate_s = smap_get(details, "max-rate");
3181 max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
3184 enum netdev_features current;
3186 netdev_get_features(netdev, &current, NULL, NULL, NULL);
3187 max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
3190 class->min_rate = max_rate;
3191 class->max_rate = max_rate;
/* Converts the "min-rate" and "max-rate" keys in 'details' into '*class'.
 * Both are parsed as decimal numbers and divided by 8.  'min_rate' is raised
 * to at least 1 and then capped at the qdisc's max_rate; 'max_rate' is raised
 * to at least 'min_rate' and then capped at the qdisc's max_rate. */
3195 hfsc_parse_class_details__(struct netdev *netdev,
3196 const struct smap *details,
3197 struct hfsc_class * class)
3199 const struct hfsc *hfsc;
3200 uint32_t min_rate, max_rate;
3201 const char *min_rate_s, *max_rate_s;
3203 hfsc = hfsc_get__(netdev);
3204 min_rate_s = smap_get(details, "min-rate");
3205 max_rate_s = smap_get(details, "max-rate");
3207 min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
3208 min_rate = MAX(min_rate, 1);
3209 min_rate = MIN(min_rate, hfsc->max_rate);
3211 max_rate = (max_rate_s
3212 ? strtoull(max_rate_s, NULL, 10) / 8
3214 max_rate = MAX(max_rate, min_rate);
3215 max_rate = MIN(max_rate, hfsc->max_rate);
3217 class->min_rate = min_rate;
3218 class->max_rate = max_rate;
3223 /* Create an HFSC qdisc.
 *
 * Any qdisc already on 'netdev' is deleted with tc_del_qdisc() first.  The
 * new qdisc is requested with RTM_NEWQDISC (NLM_F_EXCL | NLM_F_CREATE),
 * handle 1:0 and parent TC_H_ROOT; the 'struct tc_hfsc_qopt' is sent directly
 * as the TCA_OPTIONS attribute.  Returns the result of tc_transact().
 *
3225 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
3227 hfsc_setup_qdisc__(struct netdev * netdev)
3229 struct tcmsg *tcmsg;
3230 struct ofpbuf request;
3231 struct tc_hfsc_qopt opt;
3233 tc_del_qdisc(netdev);
3235 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3236 NLM_F_EXCL | NLM_F_CREATE, &request);
3242 tcmsg->tcm_handle = tc_make_handle(1, 0);
3243 tcmsg->tcm_parent = TC_H_ROOT;
3245 memset(&opt, 0, sizeof opt);
3248 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3249 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
3251 return tc_transact(&request, NULL);
3254 /* Create an HFSC class.
 *
 * The request is RTM_NEWTCLASS with NLM_F_CREATE but without NLM_F_EXCL, the
 * same flags that htb_setup_class__() uses for its "tc class replace"
 * equivalent.  The 'min' curve (m2 = min_rate) is sent as both TCA_HFSC_RSC
 * and TCA_HFSC_FSC, and the 'max' curve (m2 = max_rate) as TCA_HFSC_USC.  A
 * failed transaction is logged together with the class parameters.
 *
3256 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
3257 * sc rate <min_rate> ul rate <max_rate>" */
3259 hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
3260 unsigned int parent, struct hfsc_class *class)
3264 struct tcmsg *tcmsg;
3265 struct ofpbuf request;
3266 struct tc_service_curve min, max;
3268 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
3274 tcmsg->tcm_handle = handle;
3275 tcmsg->tcm_parent = parent;
3279 min.m2 = class->min_rate;
3283 max.m2 = class->max_rate;
3285 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3286 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3287 nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
3288 nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
3289 nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
3290 nl_msg_end_nested(&request, opt_offset);
3292 error = tc_transact(&request, NULL);
3294 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
3295 "min-rate %ubps, max-rate %ubps (%s)",
3296 netdev_get_name(netdev),
3297 tc_get_major(handle), tc_get_minor(handle),
3298 tc_get_major(parent), tc_get_minor(parent),
3299 class->min_rate, class->max_rate, strerror(error));
/* Installs HFSC on 'netdev': sets up the qdisc, builds the qdisc-level class
 * parameters from 'details', creates class 1:0xfffe under 1:0 with them, and
 * records the resulting max_rate with hfsc_install__(). */
3306 hfsc_tc_install(struct netdev *netdev, const struct smap *details)
3309 struct hfsc_class class;
3311 error = hfsc_setup_qdisc__(netdev);
3317 hfsc_parse_qdisc_details__(netdev, details, &class);
3318 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3319 tc_make_handle(1, 0), &class);
3325 hfsc_install__(netdev, class.max_rate);
/* Rebuilds the in-memory HFSC state for 'netdev' from the kernel: queries
 * class 1:0xfffe for the qdisc-level max_rate, installs a 'struct hfsc' with
 * it, then dumps the classes and records every one that
 * hfsc_parse_tcmsg__() parses successfully.  'nlmsg' is unused. */
3330 hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3333 struct nl_dump dump;
3334 struct hfsc_class hc;
3337 hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
3338 hfsc_install__(netdev, hc.max_rate);
3340 if (!start_queue_dump(netdev, &dump)) {
3344 while (nl_dump_next(&dump, &msg)) {
3345 unsigned int queue_id;
3347 if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3348 hfsc_update_queue__(netdev, queue_id, &hc);
3352 nl_dump_done(&dump);
/* Tears down the 'struct hfsc' that contains 'tc', removing every hfsc_class
 * from its queue map.  The _SAFE iterator is used because entries are
 * removed while iterating. */
3357 hfsc_tc_destroy(struct tc *tc)
3360 struct hfsc_class *hc, *next;
3362 hfsc = CONTAINER_OF(tc, struct hfsc, tc);
3364 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
3365 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
/* Adds a "max-rate" entry to 'details' holding 8 times the qdisc's stored
 * max_rate. */
3374 hfsc_qdisc_get(const struct netdev *netdev, struct smap *details)
3376 const struct hfsc *hfsc;
3377 hfsc = hfsc_get__(netdev);
3378 smap_add_format(details, "max-rate", "%llu", 8ULL * hfsc->max_rate);
/* Reconfigures the qdisc-level class 1:0xfffe (parent 1:0) from 'details'
 * and stores the new max_rate in the netdev's 'struct hfsc'. */
3383 hfsc_qdisc_set(struct netdev *netdev, const struct smap *details)
3386 struct hfsc_class class;
3388 hfsc_parse_qdisc_details__(netdev, details, &class);
3389 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3390 tc_make_handle(1, 0), &class);
3393 hfsc_get__(netdev)->max_rate = class.max_rate;
/* Describes 'queue' in 'details': adds "min-rate", and adds "max-rate" only
 * when it differs from min-rate.  Both are 8 times the stored values.
 * 'netdev' is unused. */
3400 hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
3401 const struct tc_queue *queue, struct smap *details)
3403 const struct hfsc_class *hc;
3405 hc = hfsc_class_cast__(queue);
3406 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
3407 if (hc->min_rate != hc->max_rate) {
3408 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
/* Configures queue 'queue_id' from 'details': parses the details, sets up
 * kernel class 1:(queue_id + 1) under parent 1:0xfffe, and updates the
 * in-memory queue with the parsed parameters. */
3414 hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
3415 const struct smap *details)
3418 struct hfsc_class class;
3420 error = hfsc_parse_class_details__(netdev, details, &class);
3425 error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3426 tc_make_handle(1, 0xfffe), &class);
3431 hfsc_update_queue__(netdev, queue_id, &class);
/* Deletes kernel class 1:(queue_id + 1) for 'queue' and removes the queue
 * from the netdev's HFSC queue map. */
3436 hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
3440 struct hfsc_class *hc;
3442 hc = hfsc_class_cast__(queue);
3443 hfsc = hfsc_get__(netdev);
3445 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3447 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
/* Retrieves statistics for 'queue' into '*stats' by querying kernel class
 * 1:(queue_id + 1) under parent 1:0xfffe; class options are not requested. */
3454 hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3455 struct netdev_queue_stats *stats)
3457 return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3458 tc_make_handle(1, 0xfffe), NULL, stats);
/* Parses the class message 'nlmsg' for its handle and statistics.  When the
 * handle is 1:N with 0 < N <= HFSC_N_QUEUES, invokes 'cb' with queue ID
 * N - 1, the statistics and 'aux'.  'netdev' is unused. */
3462 hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3463 const struct ofpbuf *nlmsg,
3464 netdev_dump_queue_stats_cb *cb, void *aux)
3466 struct netdev_queue_stats stats;
3467 unsigned int handle, major, minor;
3470 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3475 major = tc_get_major(handle);
3476 minor = tc_get_minor(handle);
3477 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3478 (*cb)(minor - 1, &stats, aux);
/* Operations table for the HFSC implementation: Linux qdisc name "hfsc", OVS
 * QoS type "linux-hfsc", HFSC_N_QUEUES as the queue count, and an hfsc_*()
 * function for every operation. */
3483 static const struct tc_ops tc_ops_hfsc = {
3484 "hfsc", /* linux_name */
3485 "linux-hfsc", /* ovs_name */
3486 HFSC_N_QUEUES, /* n_queues */
3487 hfsc_tc_install, /* tc_install */
3488 hfsc_tc_load, /* tc_load */
3489 hfsc_tc_destroy, /* tc_destroy */
3490 hfsc_qdisc_get, /* qdisc_get */
3491 hfsc_qdisc_set, /* qdisc_set */
3492 hfsc_class_get, /* class_get */
3493 hfsc_class_set, /* class_set */
3494 hfsc_class_delete, /* class_delete */
3495 hfsc_class_get_stats, /* class_get_stats */
3496 hfsc_class_dump_stats /* class_dump_stats */
3499 /* "linux-default" traffic control class.
3501 * This class represents the default, unnamed Linux qdisc. It corresponds to
3502 * the "" (empty string) QoS type in the OVS database. */
/* Makes a 'struct tc' initialized with tc_ops_default the tc of 'netdev''s
 * underlying device.  The pointer is kept in a function-level static
 * variable.
 * NOTE(review): presumably the tc is allocated only once and then shared by
 * all devices -- confirm against the guard around the allocation. */
3505 default_install__(struct netdev *netdev)
3507 struct netdev_dev_linux *netdev_dev =
3508 netdev_dev_linux_cast(netdev_get_dev(netdev));
3509 static struct tc *tc;
3512 tc = xmalloc(sizeof *tc);
3513 tc_init(tc, &tc_ops_default);
3515 netdev_dev->tc = tc;
/* tc_install implementation for the default class: ignores 'details' and
 * attaches the default tc to 'netdev' with default_install__(). */
3519 default_tc_install(struct netdev *netdev,
3520 const struct smap *details OVS_UNUSED)
3522 default_install__(netdev);
/* tc_load implementation for the default class: ignores 'nlmsg' and attaches
 * the default tc to 'netdev' with default_install__(). */
3527 default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3529 default_install__(netdev);
/* Operations table for the default class.  It has no Linux qdisc name and
 * provides none of the destroy, qdisc or per-class operations. */
3533 static const struct tc_ops tc_ops_default = {
3534 NULL, /* linux_name */
3539 NULL, /* tc_destroy */
3540 NULL, /* qdisc_get */
3541 NULL, /* qdisc_set */
3542 NULL, /* class_get */
3543 NULL, /* class_set */
3544 NULL, /* class_delete */
3545 NULL, /* class_get_stats */
3546 NULL /* class_dump_stats */
/* other_tc_load() makes a 'struct tc' initialized with tc_ops_other the tc of
 * 'netdev''s underlying device and ignores 'nlmsg'.  The pointer is kept in a
 * function-level static variable.
 * NOTE(review): presumably the tc is allocated only once and then shared --
 * confirm against the guard around the allocation. */
3549 /* "linux-other" traffic control class.
3554 other_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3556 struct netdev_dev_linux *netdev_dev =
3557 netdev_dev_linux_cast(netdev_get_dev(netdev));
3558 static struct tc *tc;
3561 tc = xmalloc(sizeof *tc);
3562 tc_init(tc, &tc_ops_other);
3564 netdev_dev->tc = tc;
/* Operations table for the "linux-other" class.  It has no Linux qdisc name
 * and no tc_install, and provides none of the destroy, qdisc or per-class
 * operations. */
3568 static const struct tc_ops tc_ops_other = {
3569 NULL, /* linux_name */
3570 "linux-other", /* ovs_name */
3572 NULL, /* tc_install */
3574 NULL, /* tc_destroy */
3575 NULL, /* qdisc_get */
3576 NULL, /* qdisc_set */
3577 NULL, /* class_get */
3578 NULL, /* class_set */
3579 NULL, /* class_delete */
3580 NULL, /* class_get_stats */
3581 NULL /* class_dump_stats */
3584 /* Traffic control. */
3586 /* Number of kernel "tc" ticks per second.  Computed from the values read
 * from /proc/net/psched as (double) a * c / b. */
3587 static double ticks_per_s;
3589 /* Number of kernel "jiffies" per second. This is used for the purpose of
3590 * computing buffer sizes. Generally kernel qdiscs need to be able to buffer
3591 * one jiffy's worth of data.
3593 * There are two possibilities here:
3595 * - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
3596 * approximate range of 100 to 1024. That means that we really need to
3597 * make sure that the qdisc can buffer that much data.
3599 * - 'buffer_hz' is an absurdly large number. That means that the kernel
3600 * has finely granular timers and there's no need to fudge additional room
3601 * for buffers. (There's no extra effort needed to implement that: the
3602 * large 'buffer_hz' is used as a divisor, so practically any number will
3603 * come out as 0 in the division. Small integer results in the case of
3604 * really high dividends won't have any real effect anyhow.)
3606 static unsigned int buffer_hz;
3608 /* Returns tc handle 'major':'minor'.  'major' is shifted into the upper 16
 * bits before being combined with 'minor' by TC_H_MAKE. */
3610 tc_make_handle(unsigned int major, unsigned int minor)
3612 return TC_H_MAKE(major << 16, minor);
3615 /* Returns the major number from 'handle', that is, the part selected by
 * TC_H_MAJ shifted down by 16 bits. */
3617 tc_get_major(unsigned int handle)
3619 return TC_H_MAJ(handle) >> 16;
3622 /* Returns the minor number from 'handle', that is, the part selected by
 * TC_H_MIN. */
3624 tc_get_minor(unsigned int handle)
3626 return TC_H_MIN(handle);
/* Starts a tc Netlink request for 'netdev' in 'request'.
 *
 * Looks up the device's ifindex, initializes 'request' with a 512-byte
 * initial allocation, and appends a Netlink header of the given 'type' with
 * NLM_F_REQUEST | 'flags' followed by a zeroed 'struct tcmsg' whose family is
 * AF_UNSPEC and whose ifindex is the device's.  The handle and parent fields
 * are left for the caller to fill in. */
3629 static struct tcmsg *
3630 tc_make_request(const struct netdev *netdev, int type, unsigned int flags,
3631 struct ofpbuf *request)
3633 struct tcmsg *tcmsg;
3637 error = get_ifindex(netdev, &ifindex);
3642 ofpbuf_init(request, 512);
3643 nl_msg_put_nlmsghdr(request, sizeof *tcmsg, type, NLM_F_REQUEST | flags);
3644 tcmsg = ofpbuf_put_zeros(request, sizeof *tcmsg);
3645 tcmsg->tcm_family = AF_UNSPEC;
3646 tcmsg->tcm_ifindex = ifindex;
3647 /* Caller should fill in tcmsg->tcm_handle. */
3648 /* Caller should fill in tcmsg->tcm_parent. */
/* Sends 'request' on rtnl_sock with nl_sock_transact(), passing 'replyp'
 * through for the reply, and then uninitializes 'request', so the caller
 * must not use the request buffer afterward. */
3654 tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
3656 int error = nl_sock_transact(rtnl_sock, request, replyp);
3657 ofpbuf_uninit(request);
3661 /* Adds or deletes a root ingress qdisc on 'netdev'. We use this for
3662 * policing configuration.
3664 * This function is equivalent to running the following when 'add' is true:
3665 * /sbin/tc qdisc add dev <devname> handle ffff: ingress
3667 * This function is equivalent to running the following when 'add' is false:
3668 * /sbin/tc qdisc del dev <devname> handle ffff: ingress
3670 * The configuration and stats may be seen with the following command:
3671 * /sbin/tc -s qdisc show dev <devname>
3673 * Returns 0 if successful, otherwise a positive errno value.
/* The request is RTM_NEWQDISC with NLM_F_EXCL | NLM_F_CREATE when 'add' is
 * true and RTM_DELQDISC with no extra flags otherwise.  It uses handle
 * ffff:0, parent TC_H_INGRESS, kind "ingress" and an empty TCA_OPTIONS
 * attribute.  When deleting, ENOENT and EINVAL results get special
 * treatment. */
3676 tc_add_del_ingress_qdisc(struct netdev *netdev, bool add)
3678 struct ofpbuf request;
3679 struct tcmsg *tcmsg;
3681 int type = add ? RTM_NEWQDISC : RTM_DELQDISC;
3682 int flags = add ? NLM_F_EXCL | NLM_F_CREATE : 0;
3684 tcmsg = tc_make_request(netdev, type, flags, &request);
3688 tcmsg->tcm_handle = tc_make_handle(0xffff, 0);
3689 tcmsg->tcm_parent = TC_H_INGRESS;
3690 nl_msg_put_string(&request, TCA_KIND, "ingress");
3691 nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0);
3693 error = tc_transact(&request, NULL);
3695 /* If we're deleting the qdisc, don't worry about some of the
3696 * error conditions. */
3697 if (!add && (error == ENOENT || error == EINVAL)) {
3706 /* Adds a policer to 'netdev' with a rate of 'kbits_rate' and a burst size
3709 * This function is equivalent to running:
3710 * /sbin/tc filter add dev <devname> parent ffff: protocol all prio 49
3711 * basic police rate <kbits_rate>kbit burst <kbits_burst>k
3714 * The configuration and stats may be seen with the following command:
3715 * /sbin/tc -s filter show dev <devname> parent ffff:
3717 * Returns 0 if successful, otherwise a positive errno value.
/* Builds a 'struct tc_police' with action TC_POLICE_SHOT, a rate of
 * 'kbits_rate' * 1000 / 8 bytes per second and a burst of
 * 'kbits_burst' * 1024 bytes converted to ticks at that rate, then sends it
 * in an RTM_NEWTFILTER request (NLM_F_EXCL | NLM_F_CREATE) for a "basic"
 * filter with parent ffff:0 and tcm_info built from 49 and ETH_P_ALL.
 * NOTE(review): both products are evaluated in 'int', so very large
 * arguments could overflow -- confirm that callers bound them. */
3720 tc_add_policer(struct netdev *netdev, int kbits_rate, int kbits_burst)
3722 struct tc_police tc_police;
3723 struct ofpbuf request;
3724 struct tcmsg *tcmsg;
3725 size_t basic_offset;
3726 size_t police_offset;
3730 memset(&tc_police, 0, sizeof tc_police);
3731 tc_police.action = TC_POLICE_SHOT;
3732 tc_police.mtu = mtu;
3733 tc_fill_rate(&tc_police.rate, (kbits_rate * 1000)/8, mtu);
3734 tc_police.burst = tc_bytes_to_ticks(tc_police.rate.rate,
3735 kbits_burst * 1024);
3737 tcmsg = tc_make_request(netdev, RTM_NEWTFILTER,
3738 NLM_F_EXCL | NLM_F_CREATE, &request);
3742 tcmsg->tcm_parent = tc_make_handle(0xffff, 0);
3743 tcmsg->tcm_info = tc_make_handle(49,
3744 (OVS_FORCE uint16_t) htons(ETH_P_ALL));
3746 nl_msg_put_string(&request, TCA_KIND, "basic");
3747 basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3748 police_offset = nl_msg_start_nested(&request, TCA_BASIC_POLICE);
3749 nl_msg_put_unspec(&request, TCA_POLICE_TBF, &tc_police, sizeof tc_police);
3750 tc_put_rtab(&request, TCA_POLICE_RATE, &tc_police.rate);
3751 nl_msg_end_nested(&request, police_offset);
3752 nl_msg_end_nested(&request, basic_offset);
3754 error = tc_transact(&request, NULL);
/* Reads four hexadecimal values a, b, c and d from /proc/net/psched, logs
 * them at debug level, and derives ticks_per_s as (double) a * c / b.
 * Warnings are logged when the file cannot be opened or read and when the
 * parameters are rejected or unexpected. */
3765 /* The values in psched are not individually very meaningful, but they are
3766 * important. The tables below show some values seen in the wild.
3770 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
3771 * (Before that, there are hints that it was 1000000000.)
3773 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
3777 * -----------------------------------
3778 * [1] 000c8000 000f4240 000f4240 00000064
3779 * [2] 000003e8 00000400 000f4240 3b9aca00
3780 * [3] 000003e8 00000400 000f4240 3b9aca00
3781 * [4] 000003e8 00000400 000f4240 00000064
3782 * [5] 000003e8 00000040 000f4240 3b9aca00
3783 * [6] 000003e8 00000040 000f4240 000000f9
3785 * a b c d ticks_per_s buffer_hz
3786 * ------- --------- ---------- ------------- ----------- -------------
3787 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
3788 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3789 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3790 * [4] 1,000 1,024 1,000,000 100 976,562 100
3791 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
3792 * [6] 1,000 64 1,000,000 249 15,625,000 249
3794 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
3795 * [2] 2.6.26-1-686-bigmem from Debian lenny
3796 * [3] 2.6.26-2-sparc64 from Debian lenny
3797 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
3798 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
3799 * [6] 2.6.34 from kernel.org on KVM
3801 static const char fn[] = "/proc/net/psched";
3802 unsigned int a, b, c, d;
3808 stream = fopen(fn, "r");
3810 VLOG_WARN("%s: open failed: %s", fn, strerror(errno));
3814 if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
3815 VLOG_WARN("%s: read failed", fn);
3819 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
3823 VLOG_WARN("%s: invalid scheduler parameters", fn);
3827 ticks_per_s = (double) a * c / b;
3831 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
3834 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);
3837 /* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
3838 * rate of 'rate' bytes per second.
 *
 * The product is formed in 'unsigned long long int', as tc_bytes_to_ticks()
 * does, because 'rate' * 'ticks' can exceed the range of 'unsigned int'
 * (for example 125,000,000 bytes/s over 200 ticks) and would otherwise wrap
 * around before the division. */
3840 tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
3845 return ((unsigned long long int) rate * ticks) / ticks_per_s;
3848 /* Returns the number of ticks that it would take to transmit 'size' bytes at a
3849 * rate of 'rate' bytes per second.
 *
 * Returns 0 when 'rate' is 0.  'ticks_per_s' is converted to
 * 'unsigned long long int' before the multiplication, so the product is
 * formed in integer arithmetic. */
3851 tc_bytes_to_ticks(unsigned int rate, unsigned int size)
3856 return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
3859 /* Returns the number of bytes that need to be reserved for qdisc buffering at
3860 * a transmission rate of 'rate' bytes per second.
 *
 * The result is 'rate' divided by 'buffer_hz' using integer division. */
3862 tc_buffer_per_jiffy(unsigned int rate)
3867 return rate / buffer_hz;
3870 /* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
3871 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
3872 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
3873 * stores NULL into it if it is absent.
 *
 * TCA_KIND is required and must be a string; TCA_OPTIONS is optional and
 * must be nested.  A rate-limited warning is logged when 'msg' does not
 * satisfy this policy.
 *
3875 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
 * 'msg'.
 *
3878 * Returns 0 if successful, otherwise a positive errno value. */
3880 tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
3881 struct nlattr **options)
3883 static const struct nl_policy tca_policy[] = {
3884 [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
3885 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
3887 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3889 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3890 tca_policy, ta, ARRAY_SIZE(ta))) {
3891 VLOG_WARN_RL(&rl, "failed to parse qdisc message");
3896 *kind = nl_attr_get_string(ta[TCA_KIND]);
3900 *options = ta[TCA_OPTIONS];
3915 /* Given Netlink 'msg' that describes a class, extracts the queue ID (e.g. the
3916 * minor number of its class ID) into '*queue_id', its TCA_OPTIONS attribute
3917 * into '*options', and its queue statistics into '*stats'. Any of the output
3918 * arguments may be null.
3920 * Returns 0 if successful, otherwise a positive errno value. */
/* NOTE(review): the comment above names the first output '*queue_id', but the
 * parameter is 'handlep' and the code below stores the complete 'tcm_handle'
 * value in it, not only a minor number. */
3922 tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
3923 struct nlattr **options, struct netdev_queue_stats *stats)
/* Both TCA_OPTIONS and TCA_STATS2 are required nested attributes. */
3925 static const struct nl_policy tca_policy[] = {
3926 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
3927 [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
3929 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
/* Attributes are parsed starting at the offset just past the Netlink header
 * and the struct tcmsg that follows it. */
3931 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3932 tca_policy, ta, ARRAY_SIZE(ta))) {
3933 VLOG_WARN_RL(&rl, "failed to parse class message");
/* The struct tcmsg sits immediately after the Netlink header. */
3938 struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
3939 *handlep = tc->tcm_handle;
3943 *options = ta[TCA_OPTIONS];
3947 const struct gnet_stats_queue *gsq;
3948 struct gnet_stats_basic gsb;
/* Within TCA_STATS2, the basic and queue statistics are both required and
 * must each be at least as large as the corresponding structure. */
3950 static const struct nl_policy stats_policy[] = {
3951 [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
3952 .min_len = sizeof gsb },
3953 [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
3954 .min_len = sizeof *gsq },
3956 struct nlattr *sa[ARRAY_SIZE(stats_policy)];
3958 if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
3959 sa, ARRAY_SIZE(sa))) {
3960 VLOG_WARN_RL(&rl, "failed to parse class stats");
3964 /* Alignment issues screw up the length of struct gnet_stats_basic on
3965 * some arch/bitsize combinations. Newer versions of Linux have a
3966 * struct gnet_stats_basic_packed, but we can't depend on that. The
3967 * easiest thing to do is just to make a copy. */
/* The copy is capped at sizeof gsb; bytes not supplied by the attribute stay
 * zero because of the preceding memset. */
3968 memset(&gsb, 0, sizeof gsb);
3969 memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
3970 MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
3971 stats->tx_bytes = gsb.bytes;
3972 stats->tx_packets = gsb.packets;
/* The queue's drop counter is reported as 'tx_errors'. */
3974 gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
3975 stats->tx_errors = gsq->drops;
/* Clears every field of '*stats'. */
3985 memset(stats, 0, sizeof *stats);
/* Builds an RTM_GETTCLASS request for 'netdev' with NLM_F_ECHO, fills in its
 * tcm_handle and tcm_parent from 'handle' and 'parent', and hands it to
 * tc_transact() together with 'replyp'.  The rate-limited warning reports a
 * failed query, printing the class and its parent as major:minor pairs. */
3990 /* Queries the kernel for class with identifier 'handle' and parent 'parent'
3993 tc_query_class(const struct netdev *netdev,
3994 unsigned int handle, unsigned int parent,
3995 struct ofpbuf **replyp)
3997 struct ofpbuf request;
3998 struct tcmsg *tcmsg;
4001 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO, &request);
4005 tcmsg->tcm_handle = handle;
4006 tcmsg->tcm_parent = parent;
4008 error = tc_transact(&request, replyp);
4010 VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
4011 netdev_get_name(netdev),
4012 tc_get_major(handle), tc_get_minor(handle),
4013 tc_get_major(parent), tc_get_minor(parent),
4019 /* Equivalent to "tc class del dev <name> handle <handle>". */
4021 tc_delete_class(const struct netdev *netdev, unsigned int handle)
4023 struct ofpbuf request;
4024 struct tcmsg *tcmsg;
/* RTM_DELTCLASS request with no extra Netlink flags. */
4027 tcmsg = tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
/* The request carries 'handle' as tcm_handle and a tcm_parent of 0. */
4031 tcmsg->tcm_handle = handle;
4032 tcmsg->tcm_parent = 0;
/* NULL is passed as tc_transact()'s reply argument. */
4034 error = tc_transact(&request, NULL);
/* Rate-limited warning for a failed delete; the class is printed as
 * major:minor. */
4036 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
4037 netdev_get_name(netdev),
4038 tc_get_major(handle), tc_get_minor(handle),
4044 /* Equivalent to "tc qdisc del dev <name> root". */
4046 tc_del_qdisc(struct netdev *netdev)
4048 struct netdev_dev_linux *netdev_dev =
4049 netdev_dev_linux_cast(netdev_get_dev(netdev));
4050 struct ofpbuf request;
4051 struct tcmsg *tcmsg;
4054 tcmsg = tc_make_request(netdev, RTM_DELQDISC, 0, &request);
/* The request names the handle built from major 1, minor 0, with TC_H_ROOT as
 * its parent. */
4058 tcmsg->tcm_handle = tc_make_handle(1, 0);
4059 tcmsg->tcm_parent = TC_H_ROOT;
4061 error = tc_transact(&request, NULL);
4062 if (error == EINVAL) {
4063 /* EINVAL probably means that the default qdisc was in use, in which
4064 * case we've accomplished our purpose. */
/* With no error and a cached 'tc' object, call its tc_destroy hook when one
 * is provided, then clear the cached pointer. */
4067 if (!error && netdev_dev->tc) {
4068 if (netdev_dev->tc->ops->tc_destroy) {
4069 netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
4071 netdev_dev->tc = NULL;
4076 /* If 'netdev''s qdisc type and parameters are not yet known, queries the
4077 * kernel to determine what they are. Returns 0 if successful, otherwise a
4078 * positive errno value. */
4080 tc_query_qdisc(const struct netdev *netdev)
4082 struct netdev_dev_linux *netdev_dev =
4083 netdev_dev_linux_cast(netdev_get_dev(netdev));
4084 struct ofpbuf request, *qdisc;
4085 const struct tc_ops *ops;
4086 struct tcmsg *tcmsg;
/* A nonnull 'tc' is the cached result of an earlier query. */
4090 if (netdev_dev->tc) {
4094 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
4095 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
4096 * 2.6.35 without that fix backported to it.
4098 * To avoid the OOPS, we must not make a request that would attempt to dump
4099 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
4100 * few others. There are a few ways that I can see to do this, but most of
4101 * them seem to be racy (and if you lose the race the kernel OOPSes). The
4102 * technique chosen here is to assume that any non-default qdisc that we
4103 * create will have a class with handle 1:0. The built-in qdiscs only have
4104 * a class with handle 0:0.
4106 * We could check for Linux 2.6.35+ and use a more straightforward method
4108 tcmsg = tc_make_request(netdev, RTM_GETQDISC, NLM_F_ECHO, &request);
4112 tcmsg->tcm_handle = tc_make_handle(1, 0);
4113 tcmsg->tcm_parent = 0;
4115 /* Figure out what tc class to instantiate. */
4116 error = tc_transact(&request, &qdisc);
/* The qdisc's kind string selects the tc_ops implementation. */
4120 error = tc_parse_qdisc(qdisc, &kind, NULL);
4122 ops = &tc_ops_other;
4124 ops = tc_lookup_linux_name(kind);
/* A kind with no matching tc_ops is logged and handled by 'tc_ops_other'. */
4126 static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
4127 VLOG_INFO_RL(&rl2, "unknown qdisc \"%s\"", kind);
4129 ops = &tc_ops_other;
4132 } else if (error == ENOENT) {
4133 /* Either it's a built-in qdisc, or it's a qdisc set up by some
4134 * other entity that doesn't have a handle 1:0. We will assume
4135 * that it's the system default qdisc. */
4136 ops = &tc_ops_default;
4139 /* Who knows? Maybe the device got deleted. */
4140 VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
4141 netdev_get_name(netdev), strerror(error));
4142 ops = &tc_ops_other;
4145 /* Instantiate it. */
/* The assertion requires tc_load() to leave 'netdev_dev->tc' nonnull exactly
 * when it returns 0. */
4146 load_error = ops->tc_load(CONST_CAST(struct netdev *, netdev), qdisc);
4147 ovs_assert((load_error == 0) == (netdev_dev->tc != NULL));
4148 ofpbuf_delete(qdisc);
/* A query error takes precedence over a load error. */
4150 return error ? error : load_error;
4153 /* Linux traffic control uses tables with 256 entries ("rtab" tables) to
4154 approximate the time to transmit packets of various lengths. For an MTU of
4155 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
4156 represents two possible packet lengths; for a MTU of 513 through 1024, four
4157 possible lengths; and so on.
4159 Returns, for the specified 'mtu', the number of bits that packet lengths
4160 need to be shifted right to fit within such a 256-entry table. */
4162 tc_calc_cell_log(unsigned int mtu)
/* Substitutes ETH_PAYLOAD_MAX for the caller's 'mtu'. */
4167 mtu = ETH_PAYLOAD_MAX;
/* Adds the Ethernet and VLAN header lengths on top of 'mtu'. */
4169 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
/* 'cell_log' counts the iterations performed while 'mtu' is still at least
 * 256. */
4171 for (cell_log = 0; mtu >= 256; cell_log++) {
/* Zeroes '*rate', then sets its 'cell_log' from tc_calc_cell_log(mtu) and its
 * 'mpu' to ETH_TOTAL_MIN.  The 'overhead' and 'cell_align' assignments are
 * kept only as commented-out text; the inline notes say those fields are new
 * in 2.6.24 and not yet in some distro headers. */
4178 /* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
4181 tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
4183 memset(rate, 0, sizeof *rate);
4184 rate->cell_log = tc_calc_cell_log(mtu);
4185 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
4186 /* rate->cell_align = 0; */ /* distro headers. */
/* tc_put_rtab() raises smaller packet sizes to this 'mpu' value. */
4187 rate->mpu = ETH_TOTAL_MIN;
4191 /* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
4192 * attribute of the specified "type".
4194 * See tc_calc_cell_log() above for a description of "rtab"s. */
4196 tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
/* Reserves TC_RTAB_SIZE bytes of attribute payload; the loop below fills in
 * every entry. */
4201 rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
/* Entry 'i' holds the tick count for a packet of (i + 1) << cell_log bytes,
 * with sizes smaller than 'rate->mpu' raised to 'rate->mpu'. */
4202 for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
4203 unsigned packet_size = (i + 1) << rate->cell_log;
4204 if (packet_size < rate->mpu) {
4205 packet_size = rate->mpu;
4207 rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
/* The burst converted to ticks is the larger of 'burst_bytes' and a floor of
 * tc_buffer_per_jiffy(Bps) + 'mtu'; tc_bytes_to_ticks() performs the
 * conversion at rate 'Bps'. */
4211 /* Calculates the proper value of 'buffer' or 'cbuffer' in HTB options given a
4212 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
4213 * burst size of 'burst_bytes'. (If no value was requested, a 'burst_bytes' of
4216 tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
4218 unsigned int min_burst = tc_buffer_per_jiffy(Bps) + mtu;
4219 return tc_bytes_to_ticks(Bps, MAX(burst_bytes, min_burst));
4222 /* Linux-only functions declared in netdev-linux.h */
4224 /* Returns a fd for an AF_INET socket or a negative errno value. */
4226 netdev_linux_get_af_inet_sock(void)
/* A nonzero result from netdev_linux_init() is negated and returned;
 * otherwise the file-level 'af_inet_sock' descriptor is returned. */
4228 int error = netdev_linux_init();
4229 return error ? -error : af_inet_sock;
4232 /* Modifies the 'flag' bit in ethtool's flags field for 'netdev'. If
4233 * 'enable' is true, the bit is set. Otherwise, it is cleared. */
4235 netdev_linux_ethtool_set_flag(struct netdev *netdev, uint32_t flag,
4236 const char *flag_name, bool enable)
4238 const char *netdev_name = netdev_get_name(netdev);
4239 struct ethtool_value evalue;
/* Step 1: read the current flags with ETHTOOL_GFLAGS. */
4243 COVERAGE_INC(netdev_get_ethtool);
4244 memset(&evalue, 0, sizeof evalue);
4245 error = netdev_linux_do_ethtool(netdev_name,
4246 (struct ethtool_cmd *)&evalue,
4247 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
/* Step 2: clear 'flag', set it again only if 'enable' is true, remember the
 * result in 'new_flags', and write it back with ETHTOOL_SFLAGS. */
4252 COVERAGE_INC(netdev_set_ethtool);
4253 evalue.data = new_flags = (evalue.data & ~flag) | (enable ? flag : 0);
4254 error = netdev_linux_do_ethtool(netdev_name,
4255 (struct ethtool_cmd *)&evalue,
4256 ETHTOOL_SFLAGS, "ETHTOOL_SFLAGS");
/* Step 3: read the flags again so that they can be compared with what was
 * requested. */
4261 COVERAGE_INC(netdev_get_ethtool);
4262 memset(&evalue, 0, sizeof evalue);
4263 error = netdev_linux_do_ethtool(netdev_name,
4264 (struct ethtool_cmd *)&evalue,
4265 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
/* A mismatch between the requested and re-read flags is reported as a failed
 * attempt. */
4270 if (new_flags != evalue.data) {
4271 VLOG_WARN_RL(&rl, "attempt to %s ethtool %s flag on network "
4272 "device %s failed", enable ? "enable" : "disable",
4273 flag_name, netdev_name);
4280 /* Utility functions. */
4282 /* Copies 'src' into 'dst', performing format conversion in the process. */
4284 netdev_stats_from_rtnl_link_stats(struct netdev_stats *dst,
4285 const struct rtnl_link_stats *src)
/* Each counter is assigned to the field of the same name in 'dst'. */
/* Packet, byte, error and drop totals. */
4287 dst->rx_packets = src->rx_packets;
4288 dst->tx_packets = src->tx_packets;
4289 dst->rx_bytes = src->rx_bytes;
4290 dst->tx_bytes = src->tx_bytes;
4291 dst->rx_errors = src->rx_errors;
4292 dst->tx_errors = src->tx_errors;
4293 dst->rx_dropped = src->rx_dropped;
4294 dst->tx_dropped = src->tx_dropped;
4295 dst->multicast = src->multicast;
4296 dst->collisions = src->collisions;
/* Individual receive error counters. */
4297 dst->rx_length_errors = src->rx_length_errors;
4298 dst->rx_over_errors = src->rx_over_errors;
4299 dst->rx_crc_errors = src->rx_crc_errors;
4300 dst->rx_frame_errors = src->rx_frame_errors;
4301 dst->rx_fifo_errors = src->rx_fifo_errors;
4302 dst->rx_missed_errors = src->rx_missed_errors;
/* Individual transmit error counters. */
4303 dst->tx_aborted_errors = src->tx_aborted_errors;
4304 dst->tx_carrier_errors = src->tx_carrier_errors;
4305 dst->tx_fifo_errors = src->tx_fifo_errors;
4306 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
4307 dst->tx_window_errors = src->tx_window_errors;
/* Sends an RTM_GETLINK request for interface index 'ifindex' over 'rtnl_sock'
 * and converts the IFLA_STATS attribute of the reply into '*stats'. */
4311 get_stats_via_netlink(int ifindex, struct netdev_stats *stats)
4313 /* Policy for RTNLGRP_LINK messages.
4315 * There are *many* more fields in these messages, but currently we only
4316 * care about these fields. */
4317 static const struct nl_policy rtnlgrp_link_policy[] = {
4318 [IFLA_IFNAME] = { .type = NL_A_STRING, .optional = false },
4319 [IFLA_STATS] = { .type = NL_A_UNSPEC, .optional = true,
4320 .min_len = sizeof(struct rtnl_link_stats) },
4323 struct ofpbuf request;
4324 struct ofpbuf *reply;
4325 struct ifinfomsg *ifi;
4326 struct nlattr *attrs[ARRAY_SIZE(rtnlgrp_link_policy)];
/* Request layout: a Netlink header followed by a zeroed struct ifinfomsg in
 * which only the family and interface index are filled in. */
4329 ofpbuf_init(&request, 0);
4330 nl_msg_put_nlmsghdr(&request, sizeof *ifi, RTM_GETLINK, NLM_F_REQUEST);
4331 ifi = ofpbuf_put_zeros(&request, sizeof *ifi);
4332 ifi->ifi_family = PF_UNSPEC;
4333 ifi->ifi_index = ifindex;
/* The request buffer is released as soon as the transaction returns. */
4334 error = nl_sock_transact(rtnl_sock, &request, &reply);
4335 ofpbuf_uninit(&request);
/* Attributes are parsed starting just past the Netlink header and the
 * struct ifinfomsg; 'reply' is deleted when parsing fails. */
4340 if (!nl_policy_parse(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg),
4341 rtnlgrp_link_policy,
4342 attrs, ARRAY_SIZE(rtnlgrp_link_policy))) {
4343 ofpbuf_delete(reply);
/* IFLA_STATS is optional in the policy, so its absence is checked here. */
4347 if (!attrs[IFLA_STATS]) {
4348 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
4349 ofpbuf_delete(reply);
/* The attribute payload is handed to the converter as the source
 * statistics. */
4353 netdev_stats_from_rtnl_link_stats(stats, nl_attr_get(attrs[IFLA_STATS]));
4355 ofpbuf_delete(reply);
/* Reads "/proc/net/dev" one line at a time, looking for the entry whose
 * device name equals 'netdev_name', and fills '*stats' from it.
 *
 * NOTE(review): the scan stores directly into the fields of '*stats' before
 * the device name is compared, so lines for other devices appear to
 * overwrite those fields as well -- confirm that callers ignore '*stats'
 * when no matching device is found. */
4361 get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats)
4363 static const char fn[] = "/proc/net/dev";
4368 stream = fopen(fn, "r");
4370 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
4375 while (fgets(line, sizeof line, stream)) {
/* X64 is the conversion for one stored 64-bit counter; each "%*u" in the
 * format reads a value without storing it.  A successful scan yields 15
 * conversions. */
4378 #define X64 "%"SCNu64
4381 X64 X64 X64 X64 X64 X64 X64 "%*u"
4382 X64 X64 X64 X64 X64 X64 X64 "%*u",
4388 &stats->rx_fifo_errors,
4389 &stats->rx_frame_errors,
4395 &stats->tx_fifo_errors,
4397 &stats->tx_carrier_errors) != 15) {
4398 VLOG_WARN_RL(&rl, "%s:%d: parse error", fn, ln);
4399 } else if (!strcmp(devname, netdev_name)) {
/* These counters are not among the scanned fields; they are set to
 * UINT64_MAX, presumably to mark them as unavailable -- confirm. */
4400 stats->rx_length_errors = UINT64_MAX;
4401 stats->rx_over_errors = UINT64_MAX;
4402 stats->rx_crc_errors = UINT64_MAX;
4403 stats->rx_missed_errors = UINT64_MAX;
4404 stats->tx_aborted_errors = UINT64_MAX;
4405 stats->tx_heartbeat_errors = UINT64_MAX;
4406 stats->tx_window_errors = UINT64_MAX;
/* Warning for a device that has no line in the file. */
4412 VLOG_WARN_RL(&rl, "%s: no stats for %s", fn, netdev_name);
/* Issues the SIOCGIFFLAGS ioctl for the device named 'dev->name' and stores
 * the resulting 'ifr_flags' value in '*flags'. */
4418 get_flags(const struct netdev_dev *dev, unsigned int *flags)
4424 error = netdev_linux_do_ioctl(dev->name, &ifr, SIOCGIFFLAGS,
4427 *flags = ifr.ifr_flags;
/* Places 'flags' in 'ifr_flags' and issues the SIOCSIFFLAGS ioctl for
 * 'netdev', returning netdev_linux_do_ioctl()'s result. */
4433 set_flags(struct netdev *netdev, unsigned int flags)
4437 ifr.ifr_flags = flags;
4438 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, SIOCSIFFLAGS,
/* Looks up the interface index of 'netdev_name' with the SIOCGIFINDEX ioctl
 * on 'af_inet_sock' and returns 'ifr_ifindex'.  A failed ioctl is logged with
 * a rate-limited warning.  NOTE(review): get_ifindex() negates this function's
 * result to obtain an error code, so a failure is presumably returned as a
 * negative errno value -- confirm. */
4443 do_get_ifindex(const char *netdev_name)
/* ovs_strzcpy() is given the size of 'ifr_name' as its bound. */
4447 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4448 COVERAGE_INC(netdev_get_ifindex);
4449 if (ioctl(af_inet_sock, SIOCGIFINDEX, &ifr) < 0) {
4450 VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
4451 netdev_name, strerror(errno));
4454 return ifr.ifr_ifindex;
/* Stores the interface index of 'netdev_' in '*ifindexp' and returns the
 * cached lookup error.  The lookup runs only while VALID_IFINDEX is not set
 * in 'cache_valid'; its outcome is then cached in the device. */
4458 get_ifindex(const struct netdev *netdev_, int *ifindexp)
4460 struct netdev_dev_linux *netdev_dev =
4461 netdev_dev_linux_cast(netdev_get_dev(netdev_));
4463 if (!(netdev_dev->cache_valid & VALID_IFINDEX)) {
4464 int ifindex = do_get_ifindex(netdev_get_name(netdev_));
/* Failure branch: keep the negated result as the error and reset the cached
 * index to 0. */
4467 netdev_dev->get_ifindex_error = -ifindex;
4468 netdev_dev->ifindex = 0;
/* Success branch: no error, cache the index. */
4470 netdev_dev->get_ifindex_error = 0;
4471 netdev_dev->ifindex = ifindex;
/* Mark the cached values as valid so that the lookup is not repeated. */
4473 netdev_dev->cache_valid |= VALID_IFINDEX;
4476 *ifindexp = netdev_dev->ifindex;
4477 return netdev_dev->get_ifindex_error;
/* Reads the hardware address of 'netdev_name' with the SIOCGIFHWADDR ioctl on
 * 'af_inet_sock' and copies ETH_ADDR_LEN bytes of it into 'ea'. */
4481 get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN])
4486 memset(&ifr, 0, sizeof ifr);
4487 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4488 COVERAGE_INC(netdev_get_hwaddr);
4489 if (ioctl(af_inet_sock, SIOCGIFHWADDR, &ifr) < 0) {
4490 /* ENODEV probably means that a vif disappeared asynchronously and
4491 * hasn't been removed from the database yet, so reduce the log level
4492 * to INFO for that case. */
4493 VLOG(errno == ENODEV ? VLL_INFO : VLL_ERR,
4494 "ioctl(SIOCGIFHWADDR) on %s device failed: %s",
4495 netdev_name, strerror(errno));
/* Address families other than AF_UNSPEC and ARPHRD_ETHER draw a warning. */
4498 hwaddr_family = ifr.ifr_hwaddr.sa_family;
4499 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER) {
4500 VLOG_WARN("%s device has unknown hardware address family %d",
4501 netdev_name, hwaddr_family);
/* The address is the first ETH_ADDR_LEN bytes of 'ifr_hwaddr.sa_data'. */
4503 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
/* Sets the hardware address of 'netdev_name' to 'mac' with the SIOCSIFHWADDR
 * ioctl on 'af_inet_sock', logging an error if the ioctl fails. */
4508 set_etheraddr(const char *netdev_name,
4509 const uint8_t mac[ETH_ADDR_LEN])
/* The request is zeroed, then given the device name, the ARPHRD_ETHER
 * address family and the ETH_ADDR_LEN address bytes. */
4513 memset(&ifr, 0, sizeof ifr);
4514 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4515 ifr.ifr_hwaddr.sa_family = ARPHRD_ETHER;
4516 memcpy(ifr.ifr_hwaddr.sa_data, mac, ETH_ADDR_LEN);
4517 COVERAGE_INC(netdev_set_hwaddr);
4518 if (ioctl(af_inet_sock, SIOCSIFHWADDR, &ifr) < 0) {
4519 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
4520 netdev_name, strerror(errno));
/* Issues the SIOCETHTOOL ioctl for the device called 'name', passing 'ecmd'
 * to the kernel through 'ifr_data'.  'cmd_name' is used only in the log
 * message. */
4527 netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
4528 int cmd, const char *cmd_name)
4532 memset(&ifr, 0, sizeof ifr);
4533 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
4534 ifr.ifr_data = (caddr_t) ecmd;
/* An ioctl result of 0 selects the first branch. */
4537 if (ioctl(af_inet_sock, SIOCETHTOOL, &ifr) == 0) {
/* A warning is logged for every errno except EOPNOTSUPP. */
4540 if (errno != EOPNOTSUPP) {
4541 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
4542 "failed: %s", cmd_name, name, strerror(errno));
4544 /* The device doesn't support this operation. That's pretty
4545 * common, so there's no point in logging anything. */
/* Copies 'name' into 'ifr->ifr_name' and issues ioctl 'cmd' on 'af_inet_sock'
 * with 'ifr' as its argument.  A failure is logged at debug level, rate
 * limited, using 'cmd_name' to identify the request. */
4552 netdev_linux_do_ioctl(const char *name, struct ifreq *ifr, int cmd,
4553 const char *cmd_name)
4555 ovs_strzcpy(ifr->ifr_name, name, sizeof ifr->ifr_name);
4556 if (ioctl(af_inet_sock, cmd, ifr) == -1) {
4557 VLOG_DBG_RL(&rl, "%s: ioctl(%s) failed: %s", name, cmd_name,
/* Runs the address-fetching ioctl 'cmd' for 'netdev' with the request's
 * address family set to AF_INET, and stores the IPv4 address from the result
 * in '*ip'.  'cmd_name' is forwarded to netdev_linux_do_ioctl(). */
4565 netdev_linux_get_ipv4(const struct netdev *netdev, struct in_addr *ip,
4566 int cmd, const char *cmd_name)
4571 ifr.ifr_addr.sa_family = AF_INET;
4572 error = netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, cmd, cmd_name);
/* 'ifr_addr' is read as a struct sockaddr_in to get at 'sin_addr'. */
4574 const struct sockaddr_in *sin = (struct sockaddr_in *) &ifr.ifr_addr;
4575 *ip = sin->sin_addr;
4580 /* Returns an AF_PACKET raw socket or a negative errno value. */
4582 af_packet_sock(void)
4584 static int sock = INT_MIN;
4586 if (sock == INT_MIN) {
4587 sock = socket(AF_PACKET, SOCK_RAW, 0);
4589 set_nonblocking(sock);
4592 VLOG_ERR("failed to create packet socket: %s", strerror(errno));