2 * Copyright (c) 2009, 2010, 2011, 2012 Nicira Networks.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
19 #include "netdev-linux.h"
24 #include <arpa/inet.h>
26 #include <linux/gen_stats.h>
27 #include <linux/if_ether.h>
28 #include <linux/if_tun.h>
29 #include <linux/types.h>
30 #include <linux/ethtool.h>
31 #include <linux/mii.h>
32 #include <linux/pkt_cls.h>
33 #include <linux/pkt_sched.h>
34 #include <linux/rtnetlink.h>
35 #include <linux/sockios.h>
36 #include <linux/version.h>
37 #include <sys/types.h>
38 #include <sys/ioctl.h>
39 #include <sys/socket.h>
40 #include <netpacket/packet.h>
42 #include <net/if_arp.h>
43 #include <net/if_packet.h>
44 #include <net/route.h>
45 #include <netinet/in.h>
52 #include "dpif-linux.h"
53 #include "dynamic-string.h"
54 #include "fatal-signal.h"
57 #include "netdev-provider.h"
58 #include "netdev-vport.h"
60 #include "netlink-notifier.h"
61 #include "netlink-socket.h"
63 #include "openflow/openflow.h"
65 #include "poll-loop.h"
66 #include "rtnetlink-link.h"
67 #include "socket-util.h"
74 VLOG_DEFINE_THIS_MODULE(netdev_linux);
76 COVERAGE_DEFINE(netdev_set_policing);
77 COVERAGE_DEFINE(netdev_arp_lookup);
78 COVERAGE_DEFINE(netdev_get_ifindex);
79 COVERAGE_DEFINE(netdev_get_hwaddr);
80 COVERAGE_DEFINE(netdev_set_hwaddr);
81 COVERAGE_DEFINE(netdev_ethtool);
84 /* These were introduced in Linux 2.6.14, so they might be missing if we have
86 #ifndef ADVERTISED_Pause
87 #define ADVERTISED_Pause (1 << 13)
89 #ifndef ADVERTISED_Asym_Pause
90 #define ADVERTISED_Asym_Pause (1 << 14)
93 /* These were introduced in Linux 2.6.24, so they might be missing if we
94 * have old headers. */
95 #ifndef ETHTOOL_GFLAGS
96 #define ETHTOOL_GFLAGS 0x00000025 /* Get flags bitmap(ethtool_value) */
98 #ifndef ETHTOOL_SFLAGS
99 #define ETHTOOL_SFLAGS 0x00000026 /* Set flags bitmap(ethtool_value) */
102 /* This was introduced in Linux 2.6.25, so it might be missing if we have old
105 #define TC_RTAB_SIZE 1024
108 static struct nln_notifier *netdev_linux_cache_notifier = NULL;
109 static int cache_notifier_refcount;
112 VALID_IFINDEX = 1 << 0,
113 VALID_ETHERADDR = 1 << 1,
117 VALID_POLICING = 1 << 5,
118 VALID_VPORT_STAT_ERROR = 1 << 6,
119 VALID_DRVINFO = 1 << 7,
120 VALID_FEATURES = 1 << 8,
128 /* Traffic control. */
130 /* An instance of a traffic control class. Always associated with a particular
133 * Each TC implementation subclasses this with whatever additional data it
136 const struct tc_ops *ops;
137 struct hmap queues; /* Contains "struct tc_queue"s.
138 * Read by generic TC layer.
139 * Written only by TC implementation. */
142 /* One traffic control queue.
144 * Each TC implementation subclasses this with whatever additional data it
147 struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
148 unsigned int queue_id; /* OpenFlow queue ID. */
151 /* A particular kind of traffic control. Each implementation generally maps to
152 * one particular Linux qdisc class.
154 * The functions below return 0 if successful or a positive errno value on
155 * failure, except where otherwise noted. All of them must be provided, except
156 * where otherwise noted. */
158 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
159 * This is null for tc_ops_default and tc_ops_other, for which there are no
160 * appropriate values. */
161 const char *linux_name;
163 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
164 const char *ovs_name;
166 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
167 * queues. The queues are numbered 0 through n_queues - 1. */
168 unsigned int n_queues;
170 /* Called to install this TC class on 'netdev'. The implementation should
171 * make the Netlink calls required to set up 'netdev' with the right qdisc
172 * and configure it according to 'details'. The implementation may assume
173 * that the current qdisc is the default; that is, there is no need for it
174 * to delete the current qdisc before installing itself.
176 * The contents of 'details' should be documented as valid for 'ovs_name'
177 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
178 * (which is built as ovs-vswitchd.conf.db(8)).
180 * This function must return 0 if and only if it sets 'netdev->tc' to an
181 * initialized 'struct tc'.
183 * (This function is null for tc_ops_other, which cannot be installed. For
184 * other TC classes it should always be nonnull.) */
185 int (*tc_install)(struct netdev *netdev, const struct shash *details);
187 /* Called when the netdev code determines (through a Netlink query) that
188 * this TC class's qdisc is installed on 'netdev', but we didn't install
189 * it ourselves and so don't know any of the details.
191 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
192 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
193 * implementation should parse the other attributes of 'nlmsg' as
194 * necessary to determine its configuration. If necessary it should also
195 * use Netlink queries to determine the configuration of queues on
198 * This function must return 0 if and only if it sets 'netdev->tc' to an
199 * initialized 'struct tc'. */
200 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
202 /* Destroys the data structures allocated by the implementation as part of
203 * 'tc'. (This includes destroying 'tc->queues' by calling
206 * The implementation should not need to perform any Netlink calls. If
207 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
208 * (But it may not be desirable.)
210 * This function may be null if 'tc' is trivial. */
211 void (*tc_destroy)(struct tc *tc);
213 /* Retrieves details of 'netdev->tc' configuration into 'details'.
215 * The implementation should not need to perform any Netlink calls, because
216 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
217 * cached the configuration.
219 * The contents of 'details' should be documented as valid for 'ovs_name'
220 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
221 * (which is built as ovs-vswitchd.conf.db(8)).
223 * This function may be null if 'tc' is not configurable.
225 int (*qdisc_get)(const struct netdev *netdev, struct shash *details);
227 /* Reconfigures 'netdev->tc' according to 'details', performing any
228 * required Netlink calls to complete the reconfiguration.
230 * The contents of 'details' should be documented as valid for 'ovs_name'
231 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
232 * (which is built as ovs-vswitchd.conf.db(8)).
234 * This function may be null if 'tc' is not configurable.
236 int (*qdisc_set)(struct netdev *, const struct shash *details);
238 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
239 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
241 * The contents of 'details' should be documented as valid for 'ovs_name'
242 * in the "other_config" column in the "Queue" table in
243 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
245 * The implementation should not need to perform any Netlink calls, because
246 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
247 * cached the queue configuration.
249 * This function may be null if 'tc' does not have queues ('n_queues' is
251 int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
252 struct shash *details);
254 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
255 * 'details', performing any required Netlink calls to complete the
256 * reconfiguration. The caller ensures that 'queue_id' is less than
259 * The contents of 'details' should be documented as valid for 'ovs_name'
260 * in the "other_config" column in the "Queue" table in
261 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
263 * This function may be null if 'tc' does not have queues or its queues are
264 * not configurable. */
265 int (*class_set)(struct netdev *, unsigned int queue_id,
266 const struct shash *details);
268 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
269 * tc_queue's within 'netdev->tc->queues'.
271 * This function may be null if 'tc' does not have queues or its queues
272 * cannot be deleted. */
273 int (*class_delete)(struct netdev *, struct tc_queue *queue);
275 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
276 * 'struct tc_queue's within 'netdev->tc->queues'.
278 * On success, initializes '*stats'.
280 * This function may be null if 'tc' does not have queues or if it cannot
281 * report queue statistics. */
282 int (*class_get_stats)(const struct netdev *netdev,
283 const struct tc_queue *queue,
284 struct netdev_queue_stats *stats);
286 /* Extracts queue stats from 'nlmsg', which is a response to a
287 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
289 * This function may be null if 'tc' does not have queues or if it cannot
290 * report queue statistics. */
291 int (*class_dump_stats)(const struct netdev *netdev,
292 const struct ofpbuf *nlmsg,
293 netdev_dump_queue_stats_cb *cb, void *aux);
297 tc_init(struct tc *tc, const struct tc_ops *ops)
300 hmap_init(&tc->queues);
304 tc_destroy(struct tc *tc)
306 hmap_destroy(&tc->queues);
309 static const struct tc_ops tc_ops_htb;
310 static const struct tc_ops tc_ops_hfsc;
311 static const struct tc_ops tc_ops_default;
312 static const struct tc_ops tc_ops_other;
314 static const struct tc_ops *tcs[] = {
315 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
316 &tc_ops_hfsc, /* Hierarchical fair service curve. */
317 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
318 &tc_ops_other, /* Some other qdisc. */
322 static unsigned int tc_make_handle(unsigned int major, unsigned int minor);
323 static unsigned int tc_get_major(unsigned int handle);
324 static unsigned int tc_get_minor(unsigned int handle);
326 static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
327 static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
328 static unsigned int tc_buffer_per_jiffy(unsigned int rate);
330 static struct tcmsg *tc_make_request(const struct netdev *, int type,
331 unsigned int flags, struct ofpbuf *);
332 static int tc_transact(struct ofpbuf *request, struct ofpbuf **replyp);
333 static int tc_add_del_ingress_qdisc(struct netdev *netdev, bool add);
334 static int tc_add_policer(struct netdev *netdev, int kbits_rate,
337 static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
338 struct nlattr **options);
339 static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
340 struct nlattr **options,
341 struct netdev_queue_stats *);
342 static int tc_query_class(const struct netdev *,
343 unsigned int handle, unsigned int parent,
344 struct ofpbuf **replyp);
345 static int tc_delete_class(const struct netdev *, unsigned int handle);
347 static int tc_del_qdisc(struct netdev *netdev);
348 static int tc_query_qdisc(const struct netdev *netdev);
350 static int tc_calc_cell_log(unsigned int mtu);
351 static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
352 static void tc_put_rtab(struct ofpbuf *, uint16_t type,
353 const struct tc_ratespec *rate);
354 static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
356 struct netdev_dev_linux {
357 struct netdev_dev netdev_dev;
359 struct shash_node *shash_node;
360 unsigned int cache_valid;
361 unsigned int change_seq;
363 bool miimon; /* Link status of last poll. */
364 long long int miimon_interval; /* Miimon Poll rate. Disabled if <= 0. */
365 struct timer miimon_timer;
367 /* The following are figured out "on demand" only. They are only valid
368 * when the corresponding VALID_* bit in 'cache_valid' is set. */
370 uint8_t etheraddr[ETH_ADDR_LEN];
371 struct in_addr address, netmask;
374 unsigned int ifi_flags;
375 long long int carrier_resets;
376 uint32_t kbits_rate; /* Policing data. */
377 uint32_t kbits_burst;
378 int vport_stats_error; /* Cached error code from vport_get_stats().
379 0 or an errno value. */
380 int netdev_mtu_error; /* Cached error code from SIOCGIFMTU or SIOCSIFMTU. */
381 int ether_addr_error; /* Cached error code from set/get etheraddr. */
382 int netdev_policing_error; /* Cached error code from set policing. */
383 int get_features_error; /* Cached error code from ETHTOOL_GSET. */
384 int get_ifindex_error; /* Cached error code from SIOCGIFINDEX. */
386 enum netdev_features current; /* Cached from ETHTOOL_GSET. */
387 enum netdev_features advertised; /* Cached from ETHTOOL_GSET. */
388 enum netdev_features supported; /* Cached from ETHTOOL_GSET. */
389 enum netdev_features peer; /* Cached from ETHTOOL_GSET. */
391 struct ethtool_drvinfo drvinfo; /* Cached from ETHTOOL_GDRVINFO. */
395 struct tap_state tap;
399 struct netdev_linux {
400 struct netdev netdev;
404 /* Sockets used for ioctl operations. */
405 static int af_inet_sock = -1; /* AF_INET, SOCK_DGRAM. */
407 /* A Netlink routing socket that is not subscribed to any multicast groups. */
408 static struct nl_sock *rtnl_sock;
410 /* This is set pretty low because we probably won't learn anything from the
411 * additional log messages. */
412 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
414 static int netdev_linux_init(void);
416 static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
417 int cmd, const char *cmd_name);
418 static int netdev_linux_do_ioctl(const char *name, struct ifreq *, int cmd,
419 const char *cmd_name);
420 static int netdev_linux_get_ipv4(const struct netdev *, struct in_addr *,
421 int cmd, const char *cmd_name);
422 static int get_flags(const struct netdev_dev *, unsigned int *flags);
423 static int set_flags(struct netdev *, unsigned int flags);
424 static int do_get_ifindex(const char *netdev_name);
425 static int get_ifindex(const struct netdev *, int *ifindexp);
426 static int do_set_addr(struct netdev *netdev,
427 int ioctl_nr, const char *ioctl_name,
428 struct in_addr addr);
429 static int get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN]);
430 static int set_etheraddr(const char *netdev_name, const uint8_t[ETH_ADDR_LEN]);
431 static int get_stats_via_netlink(int ifindex, struct netdev_stats *stats);
432 static int get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats);
433 static int af_packet_sock(void);
434 static void netdev_linux_miimon_run(void);
435 static void netdev_linux_miimon_wait(void);
438 is_netdev_linux_class(const struct netdev_class *netdev_class)
440 return netdev_class->init == netdev_linux_init;
443 static struct netdev_dev_linux *
444 netdev_dev_linux_cast(const struct netdev_dev *netdev_dev)
446 const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
447 assert(is_netdev_linux_class(netdev_class));
449 return CONTAINER_OF(netdev_dev, struct netdev_dev_linux, netdev_dev);
452 static struct netdev_linux *
453 netdev_linux_cast(const struct netdev *netdev)
455 struct netdev_dev *netdev_dev = netdev_get_dev(netdev);
456 const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
457 assert(is_netdev_linux_class(netdev_class));
459 return CONTAINER_OF(netdev, struct netdev_linux, netdev);
463 netdev_linux_init(void)
465 static int status = -1;
467 /* Create AF_INET socket. */
468 af_inet_sock = socket(AF_INET, SOCK_DGRAM, 0);
469 status = af_inet_sock >= 0 ? 0 : errno;
471 VLOG_ERR("failed to create inet socket: %s", strerror(status));
474 /* Create rtnetlink socket. */
476 status = nl_sock_create(NETLINK_ROUTE, &rtnl_sock);
478 VLOG_ERR_RL(&rl, "failed to create rtnetlink socket: %s",
/* Periodic work for this provider: first processes rtnetlink link
 * notifications, then runs MII link monitoring. */
static void
netdev_linux_run(void)
{
    rtnetlink_link_run();
    netdev_linux_miimon_run();
}
/* Counterpart of netdev_linux_run(): registers the wakeups needed by the
 * rtnetlink link notifier and by MII link monitoring. */
static void
netdev_linux_wait(void)
{
    rtnetlink_link_wait();
    netdev_linux_miimon_wait();
}
501 netdev_linux_get_drvinfo(struct netdev_dev_linux *netdev_dev)
506 if (netdev_dev->cache_valid & VALID_DRVINFO) {
510 memset(&netdev_dev->drvinfo, 0, sizeof netdev_dev->drvinfo);
511 error = netdev_linux_do_ethtool(netdev_dev->netdev_dev.name,
512 (struct ethtool_cmd *)&netdev_dev->drvinfo,
516 netdev_dev->cache_valid |= VALID_DRVINFO;
522 netdev_dev_linux_changed(struct netdev_dev_linux *dev,
523 unsigned int ifi_flags,
527 if (!dev->change_seq) {
531 if ((dev->ifi_flags ^ ifi_flags) & IFF_RUNNING) {
532 dev->carrier_resets++;
534 dev->ifi_flags = ifi_flags;
536 dev->cache_valid &= mask;
540 netdev_dev_linux_update(struct netdev_dev_linux *dev,
541 const struct rtnetlink_link_change *change)
543 if (change->nlmsg_type == RTM_NEWLINK) {
545 netdev_dev_linux_changed(dev, change->ifi_flags, VALID_DRVINFO);
547 /* Update netdev from rtnl-change msg. */
549 dev->mtu = change->mtu;
550 dev->cache_valid |= VALID_MTU;
551 dev->netdev_mtu_error = 0;
554 if (!eth_addr_is_zero(change->addr)) {
555 memcpy(dev->etheraddr, change->addr, ETH_ADDR_LEN);
556 dev->cache_valid |= VALID_ETHERADDR;
557 dev->ether_addr_error = 0;
560 dev->ifindex = change->ifi_index;
561 dev->cache_valid |= VALID_IFINDEX;
562 dev->get_ifindex_error = 0;
565 netdev_dev_linux_changed(dev, change->ifi_flags, 0);
570 netdev_linux_cache_cb(const struct rtnetlink_link_change *change,
571 void *aux OVS_UNUSED)
573 struct netdev_dev_linux *dev;
575 struct netdev_dev *base_dev = netdev_dev_from_name(change->ifname);
577 const struct netdev_class *netdev_class =
578 netdev_dev_get_class(base_dev);
580 if (is_netdev_linux_class(netdev_class)) {
581 dev = netdev_dev_linux_cast(base_dev);
582 netdev_dev_linux_update(dev, change);
586 struct shash device_shash;
587 struct shash_node *node;
589 shash_init(&device_shash);
590 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
591 SHASH_FOR_EACH (node, &device_shash) {
596 get_flags(&dev->netdev_dev, &flags);
597 netdev_dev_linux_changed(dev, flags, 0);
599 shash_destroy(&device_shash);
604 cache_notifier_ref(void)
606 if (!cache_notifier_refcount) {
607 assert(!netdev_linux_cache_notifier);
609 netdev_linux_cache_notifier =
610 rtnetlink_link_notifier_create(netdev_linux_cache_cb, NULL);
612 if (!netdev_linux_cache_notifier) {
616 cache_notifier_refcount++;
622 cache_notifier_unref(void)
624 assert(cache_notifier_refcount > 0);
625 if (!--cache_notifier_refcount) {
626 assert(netdev_linux_cache_notifier);
627 rtnetlink_link_notifier_destroy(netdev_linux_cache_notifier);
628 netdev_linux_cache_notifier = NULL;
632 /* Creates system and internal devices. */
634 netdev_linux_create(const struct netdev_class *class, const char *name,
635 struct netdev_dev **netdev_devp)
637 struct netdev_dev_linux *netdev_dev;
640 error = cache_notifier_ref();
645 netdev_dev = xzalloc(sizeof *netdev_dev);
646 netdev_dev->change_seq = 1;
647 netdev_dev_init(&netdev_dev->netdev_dev, name, class);
648 get_flags(&netdev_dev->netdev_dev, &netdev_dev->ifi_flags);
650 *netdev_devp = &netdev_dev->netdev_dev;
654 /* For most types of netdevs we open the device for each call of
655 * netdev_open(). However, this is not the case with tap devices,
656 * since it is only possible to open the device once. In this
657 * situation we share a single file descriptor, and consequently
658 * buffers, across all readers. Therefore once data is read it will
659 * be unavailable to other reads for tap devices. */
661 netdev_linux_create_tap(const struct netdev_class *class OVS_UNUSED,
662 const char *name, struct netdev_dev **netdev_devp)
664 struct netdev_dev_linux *netdev_dev;
665 struct tap_state *state;
666 static const char tap_dev[] = "/dev/net/tun";
670 netdev_dev = xzalloc(sizeof *netdev_dev);
671 state = &netdev_dev->state.tap;
673 error = cache_notifier_ref();
678 /* Open tap device. */
679 state->fd = open(tap_dev, O_RDWR);
682 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, strerror(error));
683 goto error_unref_notifier;
686 /* Create tap device. */
687 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
688 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
689 if (ioctl(state->fd, TUNSETIFF, &ifr) == -1) {
690 VLOG_WARN("%s: creating tap device failed: %s", name,
693 goto error_unref_notifier;
696 /* Make non-blocking. */
697 error = set_nonblocking(state->fd);
699 goto error_unref_notifier;
702 netdev_dev_init(&netdev_dev->netdev_dev, name, &netdev_tap_class);
703 *netdev_devp = &netdev_dev->netdev_dev;
706 error_unref_notifier:
707 cache_notifier_unref();
714 destroy_tap(struct netdev_dev_linux *netdev_dev)
716 struct tap_state *state = &netdev_dev->state.tap;
718 if (state->fd >= 0) {
723 /* Destroys the netdev device 'netdev_dev_'. */
725 netdev_linux_destroy(struct netdev_dev *netdev_dev_)
727 struct netdev_dev_linux *netdev_dev = netdev_dev_linux_cast(netdev_dev_);
728 const struct netdev_class *class = netdev_dev_get_class(netdev_dev_);
730 if (netdev_dev->tc && netdev_dev->tc->ops->tc_destroy) {
731 netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
734 if (class == &netdev_tap_class || class == &netdev_tap_pl_class) {
735 destroy_tap(netdev_dev);
739 cache_notifier_unref();
743 netdev_linux_open(struct netdev_dev *netdev_dev_, struct netdev **netdevp)
745 struct netdev_dev_linux *netdev_dev = netdev_dev_linux_cast(netdev_dev_);
746 struct netdev_linux *netdev;
747 enum netdev_flags flags;
750 /* Allocate network device. */
751 netdev = xzalloc(sizeof *netdev);
753 netdev_init(&netdev->netdev, netdev_dev_);
755 /* Verify that the device really exists, by attempting to read its flags.
756 * (The flags might be cached, in which case this won't actually do an
759 * Don't do this for "internal" netdevs, though, because those have to be
760 * created as netdev objects before they exist in the kernel, because
761 * creating them in the kernel happens by passing a netdev object to
762 * dpif_port_add(). */
763 if (netdev_dev_get_class(netdev_dev_) != &netdev_internal_class) {
764 error = netdev_get_flags(&netdev->netdev, &flags);
765 if (error == ENODEV) {
770 if (!strncmp(netdev_dev_get_type(netdev_dev_), "tap", 3) &&
771 !netdev_dev->state.tap.opened) {
773 /* We assume that the first user of the tap device is the primary user
774 * and give them the tap FD. Subsequent users probably just expect
775 * this to be a system device so open it normally to avoid send/receive
776 * directions appearing to be reversed. */
777 netdev->fd = netdev_dev->state.tap.fd;
778 netdev_dev->state.tap.opened = true;
781 *netdevp = &netdev->netdev;
785 netdev_uninit(&netdev->netdev, true);
789 /* Closes and destroys 'netdev'. */
791 netdev_linux_close(struct netdev *netdev_)
793 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
795 if (netdev->fd > 0 && strncmp(netdev_get_type(netdev_), "tap", 3)) {
802 netdev_linux_listen(struct netdev *netdev_)
804 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
805 struct sockaddr_ll sll;
810 if (netdev->fd >= 0) {
814 /* Create file descriptor. */
815 fd = socket(PF_PACKET, SOCK_RAW, 0);
818 VLOG_ERR("failed to create raw socket (%s)", strerror(error));
822 /* Set non-blocking mode. */
823 error = set_nonblocking(fd);
828 /* Get ethernet device index. */
829 error = get_ifindex(&netdev->netdev, &ifindex);
834 /* Bind to specific ethernet device. */
835 memset(&sll, 0, sizeof sll);
836 sll.sll_family = AF_PACKET;
837 sll.sll_ifindex = ifindex;
838 sll.sll_protocol = (OVS_FORCE unsigned short int) htons(ETH_P_ALL);
839 if (bind(fd, (struct sockaddr *) &sll, sizeof sll) < 0) {
841 VLOG_ERR("%s: failed to bind raw socket (%s)",
842 netdev_get_name(netdev_), strerror(error));
857 netdev_linux_recv(struct netdev *netdev_, void *data, size_t size)
859 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
861 if (netdev->fd < 0) {
862 /* Device is not listening. */
869 retval = ((netdev_->netdev_dev->netdev_class == &netdev_tap_class ||
870 netdev_->netdev_dev->netdev_class == &netdev_tap_pl_class)
871 ? read(netdev->fd, data, size)
872 : recv(netdev->fd, data, size, MSG_TRUNC));
874 return retval <= size ? retval : -EMSGSIZE;
875 } else if (errno != EINTR) {
876 if (errno != EAGAIN) {
877 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
878 strerror(errno), netdev_get_name(netdev_));
885 /* Registers with the poll loop to wake up from the next call to poll_block()
886 * when a packet is ready to be received with netdev_recv() on 'netdev'. */
888 netdev_linux_recv_wait(struct netdev *netdev_)
890 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
891 if (netdev->fd >= 0) {
892 poll_fd_wait(netdev->fd, POLLIN);
896 /* Discards all packets waiting to be received from 'netdev'. */
898 netdev_linux_drain(struct netdev *netdev_)
900 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
901 if (netdev->fd < 0) {
903 } else if (!strncmp(netdev_get_type(netdev_), "tap", 3)) {
905 int error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
906 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
910 drain_fd(netdev->fd, ifr.ifr_qlen);
913 return drain_rcvbuf(netdev->fd);
917 /* Sends 'buffer' on 'netdev'. Returns 0 if successful, otherwise a positive
918 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
919 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
920 * the packet is too big or too small to transmit on the device.
922 * The caller retains ownership of 'buffer' in all cases.
924 * The kernel maintains a packet transmission queue, so the caller is not
925 * expected to do additional queuing of packets. */
927 netdev_linux_send(struct netdev *netdev_, const void *data, size_t size)
929 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
933 if (netdev->fd < 0) {
934 /* Use our AF_PACKET socket to send to this device. */
935 struct sockaddr_ll sll;
942 sock = af_packet_sock();
947 error = get_ifindex(netdev_, &ifindex);
952 /* We don't bother setting most fields in sockaddr_ll because the
953 * kernel ignores them for SOCK_RAW. */
954 memset(&sll, 0, sizeof sll);
955 sll.sll_family = AF_PACKET;
956 sll.sll_ifindex = ifindex;
958 iov.iov_base = (void *) data;
962 msg.msg_namelen = sizeof sll;
965 msg.msg_control = NULL;
966 msg.msg_controllen = 0;
969 retval = sendmsg(sock, &msg, 0);
971 /* Use the netdev's own fd to send to this device. This is
972 * essential for tap devices, because packets sent to a tap device
973 * with an AF_PACKET socket will loop back to be *received* again
974 * on the tap device. */
975 retval = write(netdev->fd, data, size);
979 /* The Linux AF_PACKET implementation never blocks waiting for room
980 * for packets, instead returning ENOBUFS. Translate this into
981 * EAGAIN for the caller. */
982 if (errno == ENOBUFS) {
984 } else if (errno == EINTR) {
986 } else if (errno != EAGAIN) {
987 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
988 netdev_get_name(netdev_), strerror(errno));
991 } else if (retval != size) {
992 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%zd bytes of "
993 "%zu) on %s", retval, size, netdev_get_name(netdev_));
1001 /* Registers with the poll loop to wake up from the next call to poll_block()
1002 * when the packet transmission queue has sufficient room to transmit a packet
1003 * with netdev_send().
1005 * The kernel maintains a packet transmission queue, so the client is not
1006 * expected to do additional queuing of packets. Thus, this function is
1007 * unlikely to ever be used. It is included for completeness. */
1009 netdev_linux_send_wait(struct netdev *netdev_)
1011 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1012 if (netdev->fd < 0) {
1013 /* Nothing to do. */
1014 } else if (strncmp(netdev_get_type(netdev_), "tap", 3)) {
1015 poll_fd_wait(netdev->fd, POLLOUT);
1017 /* TAP device always accepts packets.*/
1018 poll_immediate_wake();
1022 /* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
1023 * otherwise a positive errno value. */
1025 netdev_linux_set_etheraddr(struct netdev *netdev_,
1026 const uint8_t mac[ETH_ADDR_LEN])
1028 struct netdev_dev_linux *netdev_dev =
1029 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1032 if (netdev_dev->cache_valid & VALID_ETHERADDR) {
1033 if (netdev_dev->ether_addr_error) {
1034 return netdev_dev->ether_addr_error;
1036 if (eth_addr_equals(netdev_dev->etheraddr, mac)) {
1039 netdev_dev->cache_valid &= ~VALID_ETHERADDR;
1042 error = set_etheraddr(netdev_get_name(netdev_), mac);
1043 if (!error || error == ENODEV) {
1044 netdev_dev->ether_addr_error = error;
1045 netdev_dev->cache_valid |= VALID_ETHERADDR;
1047 memcpy(netdev_dev->etheraddr, mac, ETH_ADDR_LEN);
1054 /* Copies 'netdev''s MAC address to 'mac' which is passed as param. */
1056 netdev_linux_get_etheraddr(const struct netdev *netdev_,
1057 uint8_t mac[ETH_ADDR_LEN])
1059 struct netdev_dev_linux *netdev_dev =
1060 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1062 if (!(netdev_dev->cache_valid & VALID_ETHERADDR)) {
1063 int error = get_etheraddr(netdev_get_name(netdev_),
1064 netdev_dev->etheraddr);
1066 netdev_dev->ether_addr_error = error;
1067 netdev_dev->cache_valid |= VALID_ETHERADDR;
1070 if (!netdev_dev->ether_addr_error) {
1071 memcpy(mac, netdev_dev->etheraddr, ETH_ADDR_LEN);
1074 return netdev_dev->ether_addr_error;
1077 /* Returns the maximum size of transmitted (and received) packets on 'netdev',
1078 * in bytes, not including the hardware header; thus, this is typically 1500
1079 * bytes for Ethernet devices. */
1081 netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
1083 struct netdev_dev_linux *netdev_dev =
1084 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1085 if (!(netdev_dev->cache_valid & VALID_MTU)) {
1089 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
1090 SIOCGIFMTU, "SIOCGIFMTU");
1092 netdev_dev->netdev_mtu_error = error;
1093 netdev_dev->mtu = ifr.ifr_mtu;
1094 netdev_dev->cache_valid |= VALID_MTU;
1097 if (!netdev_dev->netdev_mtu_error) {
1098 *mtup = netdev_dev->mtu;
1100 return netdev_dev->netdev_mtu_error;
1103 /* Sets the maximum size of transmitted (MTU) for given device using linux
1104 * networking ioctl interface.
1107 netdev_linux_set_mtu(const struct netdev *netdev_, int mtu)
1109 struct netdev_dev_linux *netdev_dev =
1110 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1114 if (netdev_dev->cache_valid & VALID_MTU) {
1115 if (netdev_dev->netdev_mtu_error) {
1116 return netdev_dev->netdev_mtu_error;
1118 if (netdev_dev->mtu == mtu) {
1121 netdev_dev->cache_valid &= ~VALID_MTU;
1124 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
1125 SIOCSIFMTU, "SIOCSIFMTU");
1126 if (!error || error == ENODEV) {
1127 netdev_dev->netdev_mtu_error = error;
1128 netdev_dev->mtu = ifr.ifr_mtu;
1129 netdev_dev->cache_valid |= VALID_MTU;
/* Returns the ifindex of 'netdev', if successful, as a positive number.
 * On failure, returns a negative errno value. */
static int
netdev_linux_get_ifindex(const struct netdev *netdev)
{
    int ifindex;
    int error = get_ifindex(netdev, &ifindex);

    if (error) {
        return -error;
    }
    return ifindex;
}
1146 netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
1148 struct netdev_dev_linux *netdev_dev =
1149 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1151 if (netdev_dev->miimon_interval > 0) {
1152 *carrier = netdev_dev->miimon;
1154 *carrier = (netdev_dev->ifi_flags & IFF_RUNNING) != 0;
1160 static long long int
1161 netdev_linux_get_carrier_resets(const struct netdev *netdev)
1163 return netdev_dev_linux_cast(netdev_get_dev(netdev))->carrier_resets;
/* Issues the MII ioctl 'cmd' on device 'name' through netdev_linux_do_ioctl(),
 * passing 'cmd_name' along with it.
 *
 * '*data' is both input and output: it is copied into the ifr_data area of a
 * zeroed ifreq before the call and copied back out afterwards.  The copy back
 * is done right after the ioctl regardless of its result. */
1167 netdev_linux_do_miimon(const char *name, int cmd, const char *cmd_name,
1168 struct mii_ioctl_data *data)
1173 memset(&ifr, 0, sizeof ifr);
1174 memcpy(&ifr.ifr_data, data, sizeof *data);
1175 error = netdev_linux_do_ioctl(name, &ifr, cmd, cmd_name);
1176 memcpy(data, &ifr.ifr_data, sizeof *data);
/* Queries the link status of device 'name' and stores it in '*miimon'.
 *
 * The PHY is asked first through the MII ioctls (SIOCGMIIPHY, then
 * SIOCGMIIREG).  The function also contains an ETHTOOL_GLINK query which,
 * according to its debug message, is the fallback used when the MII query
 * fails.
 *
 * NOTE(review): the branch structure connecting these steps and the return
 * statements are not visible in this listing. */
1182 netdev_linux_get_miimon(const char *name, bool *miimon)
1184 struct mii_ioctl_data data;
1189 memset(&data, 0, sizeof data);
1190 error = netdev_linux_do_miimon(name, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
/* Read the MII_BMSR register; its BMSR_LSTATUS bit becomes the reported link
 * status. */
1192 /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
1193 data.reg_num = MII_BMSR;
1194 error = netdev_linux_do_miimon(name, SIOCGMIIREG, "SIOCGMIIREG",
1198 *miimon = !!(data.val_out & BMSR_LSTATUS);
1200 VLOG_WARN_RL(&rl, "%s: failed to query MII", name);
/* ETHTOOL_GLINK is issued through an ethtool_cmd buffer; the reply is then
 * read by copying the start of that buffer into a struct ethtool_value. */
1203 struct ethtool_cmd ecmd;
1205 VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
1208 memset(&ecmd, 0, sizeof ecmd);
1209 error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
1212 struct ethtool_value eval;
1214 memcpy(&eval, &ecmd, sizeof eval);
1215 *miimon = !!eval.data;
1217 VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
/* Sets the MII monitoring interval of 'netdev_' to 'interval'.
 *
 * Positive values are raised to at least 100; zero and negative values are
 * stored as 0, which the other miimon functions in this file treat as
 * "monitoring disabled".  When the stored interval changes, the miimon timer
 * is marked expired (presumably so the next netdev_linux_miimon_run() polls
 * the device without waiting). */
1225 netdev_linux_set_miimon_interval(struct netdev *netdev_,
1226 long long int interval)
1228 struct netdev_dev_linux *netdev_dev;
1230 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev_));
1232 interval = interval > 0 ? MAX(interval, 100) : 0;
1233 if (netdev_dev->miimon_interval != interval) {
1234 netdev_dev->miimon_interval = interval;
1235 timer_set_expired(&netdev_dev->miimon_timer);
/* Polls the link status of the netdev_linux_class devices.
 *
 * Devices with monitoring disabled (miimon_interval <= 0) or whose miimon
 * timer has not expired are filtered out by the first test in the loop
 * (presumably skipped; the statement is not visible in this listing).  For
 * the others the link status is queried; a change is recorded in dev->miimon
 * and reported through netdev_dev_linux_changed().  The timer is re-armed
 * with the device interval. */
1242 netdev_linux_miimon_run(void)
1244 struct shash device_shash;
1245 struct shash_node *node;
1247 shash_init(&device_shash);
1248 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
1249 SHASH_FOR_EACH (node, &device_shash) {
1250 struct netdev_dev_linux *dev = node->data;
1253 if (dev->miimon_interval <= 0 || !timer_expired(&dev->miimon_timer)) {
1257 netdev_linux_get_miimon(dev->netdev_dev.name, &miimon);
1258 if (miimon != dev->miimon) {
1259 dev->miimon = miimon;
1260 netdev_dev_linux_changed(dev, dev->ifi_flags, 0);
1263 timer_set_duration(&dev->miimon_timer, dev->miimon_interval);
1266 shash_destroy(&device_shash);
/* Calls timer_wait() on the miimon timer of every netdev_linux_class device
 * that has MII monitoring enabled (miimon_interval > 0). */
1270 netdev_linux_miimon_wait(void)
1272 struct shash device_shash;
1273 struct shash_node *node;
1275 shash_init(&device_shash);
1276 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
1277 SHASH_FOR_EACH (node, &device_shash) {
1278 struct netdev_dev_linux *dev = node->data;
1280 if (dev->miimon_interval > 0) {
1281 timer_wait(&dev->miimon_timer);
1284 shash_destroy(&device_shash);
1287 /* Check whether we can use RTM_GETLINK to get network device statistics.
1288 * In pre-2.6.19 kernels, this was only available if wireless extensions were
/* Probes the loopback device "lo" to decide whether statistics can be
 * obtained with get_stats_via_netlink() or must come from proc, logging which
 * source was chosen.
 *
 * NOTE(review): the return statements are not visible in this listing; the
 * caller stores the result in 'use_netlink_stats' and tests it as a
 * boolean. */
1291 check_for_working_netlink_stats(void)
1293 /* Decide on the netdev_get_stats() implementation to use. Netlink is
1294 * preferable, so if that works, we'll use it. */
1295 int ifindex = do_get_ifindex("lo");
1297 VLOG_WARN("failed to get ifindex for lo, "
1298 "obtaining netdev stats from proc");
1301 struct netdev_stats stats;
1302 int error = get_stats_via_netlink(ifindex, &stats);
1304 VLOG_DBG("obtaining netdev stats via rtnetlink");
1307 VLOG_INFO("RTM_GETLINK failed (%s), obtaining netdev stats "
1308 "via proc (you are probably running a pre-2.6.19 "
1309 "kernel)", strerror(error));
/* Helper taking two uint64_t pointers; netdev_tap_get_stats() calls it on
 * pairs of rx/tx counters.
 *
 * NOTE(review): the body is not visible in this listing; presumably it
 * exchanges '*a' and '*b'. */
1316 swap_uint64(uint64_t *a, uint64_t *b)
/* Attempts to fill '*stats' for 'netdev_' from the vport layer through
 * netdev_vport_get_stats().
 *
 * The query is made when no vport error is recorded or when the recorded
 * error has not been marked valid in the cache.  The outcome is stored in
 * vport_stats_error and VALID_VPORT_STAT_ERROR is set; callers read
 * vport_stats_error afterwards to learn whether the vport stats are usable.
 * A failure is logged with a rate-limited warning. */
1324 get_stats_via_vport(const struct netdev *netdev_,
1325 struct netdev_stats *stats)
1327 struct netdev_dev_linux *netdev_dev =
1328 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1330 if (!netdev_dev->vport_stats_error ||
1331 !(netdev_dev->cache_valid & VALID_VPORT_STAT_ERROR)) {
1334 error = netdev_vport_get_stats(netdev_, stats);
1336 VLOG_WARN_RL(&rl, "%s: obtaining netdev stats via vport failed "
1337 "(%s)", netdev_get_name(netdev_), strerror(error));
1339 netdev_dev->vport_stats_error = error;
1340 netdev_dev->cache_valid |= VALID_VPORT_STAT_ERROR;
/* Retrieves kernel statistics for 'netdev_' into '*stats'.
 *
 * get_stats_via_netlink() is used when check_for_working_netlink_stats()
 * reported that it works; the get_stats_via_proc() call is presumably the
 * alternative.  A failure is logged with a rate-limited warning. */
1345 netdev_linux_sys_get_stats(const struct netdev *netdev_,
1346 struct netdev_stats *stats)
/* -1 means "not probed yet"; the probe result is kept for later calls. */
1348 static int use_netlink_stats = -1;
1351 if (use_netlink_stats < 0) {
1352 use_netlink_stats = check_for_working_netlink_stats();
1355 if (use_netlink_stats) {
1358 error = get_ifindex(netdev_, &ifindex);
1360 error = get_stats_via_netlink(ifindex, stats);
1363 error = get_stats_via_proc(netdev_get_name(netdev_), stats);
1367 VLOG_WARN_RL(&rl, "%s: linux-sys get stats failed %d",
1368 netdev_get_name(netdev_), error);
1374 /* Retrieves current device stats for 'netdev-linux'. */
/* Both sources are queried: the vport layer fills '*stats' and the kernel
 * fills the local 'dev_stats'.  The visible accumulation adds the kernel
 * error, drop, multicast and collision counters from 'dev_stats' into
 * '*stats'.
 *
 * NOTE(review): the bodies of the two tests on vport_stats_error and the
 * return statements are not visible in this listing. */
1376 netdev_linux_get_stats(const struct netdev *netdev_,
1377 struct netdev_stats *stats)
1379 struct netdev_dev_linux *netdev_dev =
1380 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1381 struct netdev_stats dev_stats;
1384 get_stats_via_vport(netdev_, stats);
1386 error = netdev_linux_sys_get_stats(netdev_, &dev_stats);
1389 if (netdev_dev->vport_stats_error) {
1396 if (netdev_dev->vport_stats_error) {
1397 /* Stats are not available from OVS, so use ioctl stats. */
1400 stats->rx_errors += dev_stats.rx_errors;
1401 stats->tx_errors += dev_stats.tx_errors;
1402 stats->rx_dropped += dev_stats.rx_dropped;
1403 stats->tx_dropped += dev_stats.tx_dropped;
1404 stats->multicast += dev_stats.multicast;
1405 stats->collisions += dev_stats.collisions;
1406 stats->rx_length_errors += dev_stats.rx_length_errors;
1407 stats->rx_over_errors += dev_stats.rx_over_errors;
1408 stats->rx_crc_errors += dev_stats.rx_crc_errors;
1409 stats->rx_frame_errors += dev_stats.rx_frame_errors;
1410 stats->rx_fifo_errors += dev_stats.rx_fifo_errors;
1411 stats->rx_missed_errors += dev_stats.rx_missed_errors;
1412 stats->tx_aborted_errors += dev_stats.tx_aborted_errors;
1413 stats->tx_carrier_errors += dev_stats.tx_carrier_errors;
1414 stats->tx_fifo_errors += dev_stats.tx_fifo_errors;
1415 stats->tx_heartbeat_errors += dev_stats.tx_heartbeat_errors;
1416 stats->tx_window_errors += dev_stats.tx_window_errors;
1421 /* Retrieves current device stats for 'netdev-tap' netdev or
1422 * netdev-internal. */
/* Both the vport layer ('*stats') and the kernel ('dev_stats') are queried.
 * The visible code swaps the rx/tx packet, byte, error and drop counters of
 * '*stats', zeroes the detailed rx_*_errors and tx_*_errors counters, and
 * separately adds the kernel drop and error counters with rx and tx crossed
 * over, plus the multicast and collision counters.
 *
 * NOTE(review): which branch each of those groups belongs to, and the return
 * statements, are not visible in this listing. */
1424 netdev_tap_get_stats(const struct netdev *netdev_,
1425 struct netdev_stats *stats)
1427 struct netdev_dev_linux *netdev_dev =
1428 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1429 struct netdev_stats dev_stats;
1432 get_stats_via_vport(netdev_, stats);
1434 error = netdev_linux_sys_get_stats(netdev_, &dev_stats);
1436 if (netdev_dev->vport_stats_error) {
1443 /* If this port is an internal port then the transmit and receive stats
1444 * will appear to be swapped relative to the other ports since we are the
1445 * one sending the data, not a remote computer. For consistency, we swap
1446 * them back here. This does not apply if we are getting stats from the
1447 * vport layer because it always tracks stats from the perspective of the
1449 if (netdev_dev->vport_stats_error) {
1451 swap_uint64(&stats->rx_packets, &stats->tx_packets);
1452 swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
1453 swap_uint64(&stats->rx_errors, &stats->tx_errors);
1454 swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
1455 stats->rx_length_errors = 0;
1456 stats->rx_over_errors = 0;
1457 stats->rx_crc_errors = 0;
1458 stats->rx_frame_errors = 0;
1459 stats->rx_fifo_errors = 0;
1460 stats->rx_missed_errors = 0;
1461 stats->tx_aborted_errors = 0;
1462 stats->tx_carrier_errors = 0;
1463 stats->tx_fifo_errors = 0;
1464 stats->tx_heartbeat_errors = 0;
1465 stats->tx_window_errors = 0;
1467 stats->rx_dropped += dev_stats.tx_dropped;
1468 stats->tx_dropped += dev_stats.rx_dropped;
1470 stats->rx_errors += dev_stats.tx_errors;
1471 stats->tx_errors += dev_stats.rx_errors;
1473 stats->multicast += dev_stats.multicast;
1474 stats->collisions += dev_stats.collisions;
/* Retrieves stats for 'netdev_' from the vport layer only and returns the
 * error code that get_stats_via_vport() recorded in vport_stats_error. */
1480 netdev_internal_get_stats(const struct netdev *netdev_,
1481 struct netdev_stats *stats)
1483 struct netdev_dev_linux *netdev_dev =
1484 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1486 get_stats_via_vport(netdev_, stats);
1487 return netdev_dev->vport_stats_error;
/* Refreshes the feature bitmaps cached in 'netdev_dev' (supported,
 * advertised, current and peer) from an ETHTOOL_GSET query.
 *
 * The ethtool SUPPORTED_* and ADVERTISED_* bits are translated one by one
 * into NETDEV_F_* bits, and the current speed, duplex and port type are
 * translated likewise.  The result code of the query is stored in
 * get_features_error and VALID_FEATURES is set.
 *
 * NOTE(review): the handling of an already-valid cache, the error exit after
 * the ethtool call, the computation of 'speed' and the test guarding
 * NETDEV_F_AUTONEG are not visible in this listing. */
1491 netdev_linux_read_features(struct netdev_dev_linux *netdev_dev)
1493 struct ethtool_cmd ecmd;
1497 if (netdev_dev->cache_valid & VALID_FEATURES) {
1501 memset(&ecmd, 0, sizeof ecmd);
1502 error = netdev_linux_do_ethtool(netdev_dev->netdev_dev.name, &ecmd,
1503 ETHTOOL_GSET, "ETHTOOL_GSET");
1508 /* Supported features. */
1509 netdev_dev->supported = 0;
1510 if (ecmd.supported & SUPPORTED_10baseT_Half) {
1511 netdev_dev->supported |= NETDEV_F_10MB_HD;
1513 if (ecmd.supported & SUPPORTED_10baseT_Full) {
1514 netdev_dev->supported |= NETDEV_F_10MB_FD;
1516 if (ecmd.supported & SUPPORTED_100baseT_Half) {
1517 netdev_dev->supported |= NETDEV_F_100MB_HD;
1519 if (ecmd.supported & SUPPORTED_100baseT_Full) {
1520 netdev_dev->supported |= NETDEV_F_100MB_FD;
1522 if (ecmd.supported & SUPPORTED_1000baseT_Half) {
1523 netdev_dev->supported |= NETDEV_F_1GB_HD;
1525 if (ecmd.supported & SUPPORTED_1000baseT_Full) {
1526 netdev_dev->supported |= NETDEV_F_1GB_FD;
1528 if (ecmd.supported & SUPPORTED_10000baseT_Full) {
1529 netdev_dev->supported |= NETDEV_F_10GB_FD;
1531 if (ecmd.supported & SUPPORTED_TP) {
1532 netdev_dev->supported |= NETDEV_F_COPPER;
1534 if (ecmd.supported & SUPPORTED_FIBRE) {
1535 netdev_dev->supported |= NETDEV_F_FIBER;
1537 if (ecmd.supported & SUPPORTED_Autoneg) {
1538 netdev_dev->supported |= NETDEV_F_AUTONEG;
1540 if (ecmd.supported & SUPPORTED_Pause) {
1541 netdev_dev->supported |= NETDEV_F_PAUSE;
1543 if (ecmd.supported & SUPPORTED_Asym_Pause) {
1544 netdev_dev->supported |= NETDEV_F_PAUSE_ASYM;
1547 /* Advertised features. */
1548 netdev_dev->advertised = 0;
1549 if (ecmd.advertising & ADVERTISED_10baseT_Half) {
1550 netdev_dev->advertised |= NETDEV_F_10MB_HD;
1552 if (ecmd.advertising & ADVERTISED_10baseT_Full) {
1553 netdev_dev->advertised |= NETDEV_F_10MB_FD;
1555 if (ecmd.advertising & ADVERTISED_100baseT_Half) {
1556 netdev_dev->advertised |= NETDEV_F_100MB_HD;
1558 if (ecmd.advertising & ADVERTISED_100baseT_Full) {
1559 netdev_dev->advertised |= NETDEV_F_100MB_FD;
1561 if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
1562 netdev_dev->advertised |= NETDEV_F_1GB_HD;
1564 if (ecmd.advertising & ADVERTISED_1000baseT_Full) {
1565 netdev_dev->advertised |= NETDEV_F_1GB_FD;
1567 if (ecmd.advertising & ADVERTISED_10000baseT_Full) {
1568 netdev_dev->advertised |= NETDEV_F_10GB_FD;
1570 if (ecmd.advertising & ADVERTISED_TP) {
1571 netdev_dev->advertised |= NETDEV_F_COPPER;
1573 if (ecmd.advertising & ADVERTISED_FIBRE) {
1574 netdev_dev->advertised |= NETDEV_F_FIBER;
1576 if (ecmd.advertising & ADVERTISED_Autoneg) {
1577 netdev_dev->advertised |= NETDEV_F_AUTONEG;
1579 if (ecmd.advertising & ADVERTISED_Pause) {
1580 netdev_dev->advertised |= NETDEV_F_PAUSE;
1582 if (ecmd.advertising & ADVERTISED_Asym_Pause) {
1583 netdev_dev->advertised |= NETDEV_F_PAUSE_ASYM;
1586 /* Current settings. */
/* Speeds up to 10G are matched against SPEED_* constants; the higher ones are
 * matched against plain numeric literals.  Half duplex is only distinguished
 * up to 1G.  An unrecognized speed yields no speed bit. */
1588 if (speed == SPEED_10) {
1589 netdev_dev->current = ecmd.duplex ? NETDEV_F_10MB_FD : NETDEV_F_10MB_HD;
1590 } else if (speed == SPEED_100) {
1591 netdev_dev->current = ecmd.duplex ? NETDEV_F_100MB_FD : NETDEV_F_100MB_HD;
1592 } else if (speed == SPEED_1000) {
1593 netdev_dev->current = ecmd.duplex ? NETDEV_F_1GB_FD : NETDEV_F_1GB_HD;
1594 } else if (speed == SPEED_10000) {
1595 netdev_dev->current = NETDEV_F_10GB_FD;
1596 } else if (speed == 40000) {
1597 netdev_dev->current = NETDEV_F_40GB_FD;
1598 } else if (speed == 100000) {
1599 netdev_dev->current = NETDEV_F_100GB_FD;
1600 } else if (speed == 1000000) {
1601 netdev_dev->current = NETDEV_F_1TB_FD;
1603 netdev_dev->current = 0;
1606 if (ecmd.port == PORT_TP) {
1607 netdev_dev->current |= NETDEV_F_COPPER;
1608 } else if (ecmd.port == PORT_FIBRE) {
1609 netdev_dev->current |= NETDEV_F_FIBER;
1613 netdev_dev->current |= NETDEV_F_AUTONEG;
1616 /* Peer advertisements. */
1617 netdev_dev->peer = 0; /* XXX */
1620 netdev_dev->cache_valid |= VALID_FEATURES;
1621 netdev_dev->get_features_error = error;
1624 /* Stores the features supported by 'netdev' into each of '*current',
1625 * '*advertised', '*supported', and '*peer' that are non-null. Each value is a
1626 * bitmap of NETDEV_* bits. Returns 0 if successful, otherwise a positive
1629 netdev_linux_get_features(const struct netdev *netdev_,
1630 enum netdev_features *current,
1631 enum netdev_features *advertised,
1632 enum netdev_features *supported,
1633 enum netdev_features *peer)
1635 struct netdev_dev_linux *netdev_dev =
1636 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1638 netdev_linux_read_features(netdev_dev);
1640 if (!netdev_dev->get_features_error) {
1641 *current = netdev_dev->current;
1642 *advertised = netdev_dev->advertised;
1643 *supported = netdev_dev->supported;
1644 *peer = netdev_dev->peer;
1646 return netdev_dev->get_features_error;
1649 /* Set the features advertised by 'netdev' to 'advertise'. */
/* The current ethtool settings are read with ETHTOOL_GSET, the advertising
 * mask is rebuilt from scratch out of the NETDEV_F_* bits in 'advertise', and
 * the result is written back with ETHTOOL_SSET, whose result is returned.
 *
 * NOTE(review): no invalidation of the VALID_FEATURES cache is visible here,
 * so the cached 'advertised' bitmap may go stale after a successful call --
 * confirm. */
1651 netdev_linux_set_advertisements(struct netdev *netdev,
1652 enum netdev_features advertise)
1654 struct ethtool_cmd ecmd;
1657 memset(&ecmd, 0, sizeof ecmd);
1658 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1659 ETHTOOL_GSET, "ETHTOOL_GSET");
1664 ecmd.advertising = 0;
1665 if (advertise & NETDEV_F_10MB_HD) {
1666 ecmd.advertising |= ADVERTISED_10baseT_Half;
1668 if (advertise & NETDEV_F_10MB_FD) {
1669 ecmd.advertising |= ADVERTISED_10baseT_Full;
1671 if (advertise & NETDEV_F_100MB_HD) {
1672 ecmd.advertising |= ADVERTISED_100baseT_Half;
1674 if (advertise & NETDEV_F_100MB_FD) {
1675 ecmd.advertising |= ADVERTISED_100baseT_Full;
1677 if (advertise & NETDEV_F_1GB_HD) {
1678 ecmd.advertising |= ADVERTISED_1000baseT_Half;
1680 if (advertise & NETDEV_F_1GB_FD) {
1681 ecmd.advertising |= ADVERTISED_1000baseT_Full;
1683 if (advertise & NETDEV_F_10GB_FD) {
1684 ecmd.advertising |= ADVERTISED_10000baseT_Full;
1686 if (advertise & NETDEV_F_COPPER) {
1687 ecmd.advertising |= ADVERTISED_TP;
1689 if (advertise & NETDEV_F_FIBER) {
1690 ecmd.advertising |= ADVERTISED_FIBRE;
1692 if (advertise & NETDEV_F_AUTONEG) {
1693 ecmd.advertising |= ADVERTISED_Autoneg;
1695 if (advertise & NETDEV_F_PAUSE) {
1696 ecmd.advertising |= ADVERTISED_Pause;
1698 if (advertise & NETDEV_F_PAUSE_ASYM) {
1699 ecmd.advertising |= ADVERTISED_Asym_Pause;
1701 return netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1702 ETHTOOL_SSET, "ETHTOOL_SSET");
1705 /* Attempts to set input rate limiting (policing) policy. Returns 0 if
1706 * successful, otherwise a positive errno value. */
/* The visible steps are: normalize 'kbits_burst', consult the VALID_POLICING
 * cache, remove the ingress qdisc, add it again, attach a policer, and record
 * the new settings.  Success and ENODEV are cached as the policing result.
 *
 * NOTE(review): the conditions guarding the re-add of the qdisc and the
 * error exits between the steps are not visible in this listing. */
1708 netdev_linux_set_policing(struct netdev *netdev,
1709 uint32_t kbits_rate, uint32_t kbits_burst)
1711 struct netdev_dev_linux *netdev_dev =
1712 netdev_dev_linux_cast(netdev_get_dev(netdev));
1713 const char *netdev_name = netdev_get_name(netdev);
1717 kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
1718 : !kbits_burst ? 1000 /* Default to 1000 kbits if 0. */
1719 : kbits_burst); /* Stick with user-specified value. */
/* A cached error is returned as-is; a request identical to the cached
 * settings is presumed to need no work. */
1721 if (netdev_dev->cache_valid & VALID_POLICING) {
1722 if (netdev_dev->netdev_policing_error) {
1723 return netdev_dev->netdev_policing_error;
1726 if (netdev_dev->kbits_rate == kbits_rate &&
1727 netdev_dev->kbits_burst == kbits_burst) {
1728 /* Assume that settings haven't changed since we last set them. */
1731 netdev_dev->cache_valid &= ~VALID_POLICING;
1734 COVERAGE_INC(netdev_set_policing);
1735 /* Remove any existing ingress qdisc. */
1736 error = tc_add_del_ingress_qdisc(netdev, false);
1738 VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
1739 netdev_name, strerror(error));
1744 error = tc_add_del_ingress_qdisc(netdev, true);
1746 VLOG_WARN_RL(&rl, "%s: adding policing qdisc failed: %s",
1747 netdev_name, strerror(error));
1751 error = tc_add_policer(netdev, kbits_rate, kbits_burst);
1753 VLOG_WARN_RL(&rl, "%s: adding policing action failed: %s",
1754 netdev_name, strerror(error));
1759 netdev_dev->kbits_rate = kbits_rate;
1760 netdev_dev->kbits_burst = kbits_burst;
/* Only success and ENODEV are cached; other errors leave the cache invalid. */
1763 if (!error || error == ENODEV) {
1764 netdev_dev->netdev_policing_error = error;
1765 netdev_dev->cache_valid |= VALID_POLICING;
/* Adds to 'types' the ovs_name of every entry in the null-terminated 'tcs'
 * array that has a tc_install callback and a nonempty ovs_name.  'netdev' is
 * unused.
 *
 * NOTE(review): the declaration of the 'types' parameter is not visible in
 * this listing. */
1771 netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
1774 const struct tc_ops **opsp;
1776 for (opsp = tcs; *opsp != NULL; opsp++) {
1777 const struct tc_ops *ops = *opsp;
1778 if (ops->tc_install && ops->ovs_name[0] != '\0') {
1779 sset_add(types, ops->ovs_name);
/* Creates a tap device for 'name' through tun_alloc(IFF_TAP, ...), makes its
 * file descriptor non-blocking, initializes the netdev_dev as a
 * netdev_tap_pl_class device and stores it in '*netdev_devp'.  'class' is
 * unused.  A warning is logged if the name of the device actually created
 * differs from 'name'.
 *
 * The visible failure paths jump to error_unref_notifier, which drops the
 * reference taken with cache_notifier_ref().
 *
 * NOTE(review): the cleanup following that label is not visible in this
 * listing.  No close of state->fd is visible on the set_nonblocking() failure
 * path -- confirm the descriptor is not leaked. */
1786 netdev_linux_create_tap_pl(const struct netdev_class *class OVS_UNUSED,
1787 const char *name, struct netdev_dev **netdev_devp)
1789 struct netdev_dev_linux *netdev_dev;
1790 struct tap_state *state;
1791 char real_name[IFNAMSIZ];
1794 netdev_dev = xzalloc(sizeof *netdev_dev);
1795 state = &netdev_dev->state.tap;
1797 error = cache_notifier_ref();
1802 /* Open tap device. */
1803 state->fd = tun_alloc(IFF_TAP, real_name);
1804 if (state->fd < 0) {
1806 VLOG_WARN("tun_alloc(IFF_TAP, %s) failed: %s", name, strerror(error));
1807 goto error_unref_notifier;
1809 if (strcmp(name, real_name)) {
1810 VLOG_WARN("tap_pl: requested %s, created %s", name, real_name);
1813 /* Make non-blocking. */
1814 error = set_nonblocking(state->fd);
1816 goto error_unref_notifier;
1819 netdev_dev_init(&netdev_dev->netdev_dev, name, &netdev_tap_pl_class);
1820 *netdev_devp = &netdev_dev->netdev_dev;
1823 error_unref_notifier:
1824 cache_notifier_unref();
/* Searches the null-terminated 'tcs' array for the entry whose ovs_name
 * equals 'name'.
 *
 * NOTE(review): the return statements are not visible in this listing;
 * callers test the result against NULL. */
1830 static const struct tc_ops *
1831 tc_lookup_ovs_name(const char *name)
1833 const struct tc_ops **opsp;
1835 for (opsp = tcs; *opsp != NULL; opsp++) {
1836 const struct tc_ops *ops = *opsp;
1837 if (!strcmp(name, ops->ovs_name)) {
/* Searches the null-terminated 'tcs' array for the entry whose linux_name
 * equals 'name'.  Entries with a null linux_name are never matched.
 *
 * NOTE(review): the return statements are not visible in this listing. */
1844 static const struct tc_ops *
1845 tc_lookup_linux_name(const char *name)
1847 const struct tc_ops **opsp;
1849 for (opsp = tcs; *opsp != NULL; opsp++) {
1850 const struct tc_ops *ops = *opsp;
1851 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
/* Searches the hash bucket selected by 'hash' in the queue map of 'netdev'
 * for the queue whose queue_id equals 'queue_id'.
 *
 * NOTE(review): the declaration of the 'hash' parameter and the return
 * statements are not visible in this listing. */
1858 static struct tc_queue *
1859 tc_find_queue__(const struct netdev *netdev, unsigned int queue_id,
1862 struct netdev_dev_linux *netdev_dev =
1863 netdev_dev_linux_cast(netdev_get_dev(netdev));
1864 struct tc_queue *queue;
1866 HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev_dev->tc->queues) {
1867 if (queue->queue_id == queue_id) {
/* Convenience wrapper around tc_find_queue__() that derives the hash from
 * 'queue_id' with hash_int(queue_id, 0). */
1874 static struct tc_queue *
1875 tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
1877 return tc_find_queue__(netdev, queue_id, hash_int(queue_id, 0));
/* Looks up the traffic-control implementation named 'type' and reports its
 * queue count in 'caps->n_queues'.  'netdev' is unused.
 *
 * NOTE(review): the declaration of the 'type' parameter, the handling of an
 * unknown type and the return statements are not visible in this listing. */
1881 netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
1883 struct netdev_qos_capabilities *caps)
1885 const struct tc_ops *ops = tc_lookup_ovs_name(type);
1889 caps->n_queues = ops->n_queues;
/* Reports the QoS type configured on 'netdev' in '*typep' (the ovs_name of
 * the active tc_ops) and, when that implementation provides a qdisc_get
 * callback, returns its result after it has been given 'details' to fill.
 * tc_query_qdisc() is called first.
 *
 * NOTE(review): the error exit after tc_query_qdisc() and the value returned
 * when qdisc_get is null are not visible in this listing. */
1894 netdev_linux_get_qos(const struct netdev *netdev,
1895 const char **typep, struct shash *details)
1897 struct netdev_dev_linux *netdev_dev =
1898 netdev_dev_linux_cast(netdev_get_dev(netdev));
1901 error = tc_query_qdisc(netdev);
1906 *typep = netdev_dev->tc->ops->ovs_name;
1907 return (netdev_dev->tc->ops->qdisc_get
1908 ? netdev_dev->tc->ops->qdisc_get(netdev, details)
/* Changes the QoS configuration of 'netdev' to 'type' with 'details'.
 *
 * The requested implementation must exist and provide tc_install.  If it is
 * the implementation already in use, only its qdisc_set callback (if any) is
 * invoked.  Otherwise the existing qdisc is deleted and the new one is
 * installed through tc_install.
 *
 * NOTE(review): the error exits and the final return are not visible in this
 * listing. */
1913 netdev_linux_set_qos(struct netdev *netdev,
1914 const char *type, const struct shash *details)
1916 struct netdev_dev_linux *netdev_dev =
1917 netdev_dev_linux_cast(netdev_get_dev(netdev));
1918 const struct tc_ops *new_ops;
1921 new_ops = tc_lookup_ovs_name(type);
1922 if (!new_ops || !new_ops->tc_install) {
1926 error = tc_query_qdisc(netdev);
1931 if (new_ops == netdev_dev->tc->ops) {
1932 return new_ops->qdisc_set ? new_ops->qdisc_set(netdev, details) : 0;
1934 /* Delete existing qdisc. */
1935 error = tc_del_qdisc(netdev);
/* Invariant: after deletion no tc object may remain attached. */
1939 assert(netdev_dev->tc == NULL);
1941 /* Install new qdisc. */
1942 error = new_ops->tc_install(netdev, details);
/* Invariant: tc_install attaches a tc object exactly when it succeeds. */
1943 assert((error == 0) == (netdev_dev->tc != NULL));
/* Retrieves the configuration of queue 'queue_id' on 'netdev' into 'details'
 * through the class_get callback of the active tc_ops, after calling
 * tc_query_qdisc() and looking the queue up with tc_find_queue().
 *
 * NOTE(review): the condition of the final conditional expression and the
 * value returned when it is false are not visible in this listing. */
1950 netdev_linux_get_queue(const struct netdev *netdev,
1951 unsigned int queue_id, struct shash *details)
1953 struct netdev_dev_linux *netdev_dev =
1954 netdev_dev_linux_cast(netdev_get_dev(netdev));
1957 error = tc_query_qdisc(netdev);
1961 struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1963 ? netdev_dev->tc->ops->class_get(netdev, queue, details)
/* Configures queue 'queue_id' on 'netdev' from 'details' through the
 * class_set callback of the active tc_ops, after calling tc_query_qdisc().
 * A 'queue_id' at or beyond the implementation's n_queues, or an
 * implementation without class_set, takes a separate branch.
 *
 * NOTE(review): the values returned on that branch and on a tc_query_qdisc()
 * failure are not visible in this listing. */
1969 netdev_linux_set_queue(struct netdev *netdev,
1970 unsigned int queue_id, const struct shash *details)
1972 struct netdev_dev_linux *netdev_dev =
1973 netdev_dev_linux_cast(netdev_get_dev(netdev));
1976 error = tc_query_qdisc(netdev);
1979 } else if (queue_id >= netdev_dev->tc->ops->n_queues
1980 || !netdev_dev->tc->ops->class_set) {
1984 return netdev_dev->tc->ops->class_set(netdev, queue_id, details);
/* Deletes queue 'queue_id' from 'netdev' through the class_delete callback of
 * the active tc_ops, after calling tc_query_qdisc() and looking the queue up
 * with tc_find_queue().  An implementation without class_delete takes a
 * separate branch.
 *
 * NOTE(review): the values returned on the non-delete paths are not visible
 * in this listing. */
1988 netdev_linux_delete_queue(struct netdev *netdev, unsigned int queue_id)
1990 struct netdev_dev_linux *netdev_dev =
1991 netdev_dev_linux_cast(netdev_get_dev(netdev));
1994 error = tc_query_qdisc(netdev);
1997 } else if (!netdev_dev->tc->ops->class_delete) {
2000 struct tc_queue *queue = tc_find_queue(netdev, queue_id);
2002 ? netdev_dev->tc->ops->class_delete(netdev, queue)
/* Retrieves statistics for queue 'queue_id' on 'netdev' into 'stats' through
 * the class_get_stats callback of the active tc_ops, after calling
 * tc_query_qdisc() and looking the queue up with tc_find_queue().  An
 * implementation without class_get_stats takes a separate branch.
 *
 * NOTE(review): the values returned on the other paths are not visible in
 * this listing. */
2008 netdev_linux_get_queue_stats(const struct netdev *netdev,
2009 unsigned int queue_id,
2010 struct netdev_queue_stats *stats)
2012 struct netdev_dev_linux *netdev_dev =
2013 netdev_dev_linux_cast(netdev_get_dev(netdev));
2016 error = tc_query_qdisc(netdev);
2019 } else if (!netdev_dev->tc->ops->class_get_stats) {
2022 const struct tc_queue *queue = tc_find_queue(netdev, queue_id);
2024 ? netdev_dev->tc->ops->class_get_stats(netdev, queue, stats)
/* Starts a netlink dump on 'dump' using an RTM_GETTCLASS request for 'netdev'
 * with tcm_parent set to 0, sent over 'rtnl_sock'.  The request buffer is
 * released once the dump has been started.
 *
 * NOTE(review): the handling of a failed tc_make_request() and the return
 * statements are not visible in this listing; the caller treats a false
 * result as failure. */
2030 start_queue_dump(const struct netdev *netdev, struct nl_dump *dump)
2032 struct ofpbuf request;
2033 struct tcmsg *tcmsg;
2035 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
2039 tcmsg->tcm_parent = 0;
2040 nl_dump_start(dump, rtnl_sock, &request);
2041 ofpbuf_uninit(&request);
/* Iterates over every queue in the queue map of 'netdev', fetching its
 * details with the class_get callback of the active tc_ops and passing the
 * queue id, the details and 'aux' to 'cb'.  The details map is cleared before
 * each queue and destroyed at the end.
 *
 * NOTE(review): the condition under which 'cb' is invoked, the error
 * bookkeeping and the return statements are not visible in this listing. */
2046 netdev_linux_dump_queues(const struct netdev *netdev,
2047 netdev_dump_queues_cb *cb, void *aux)
2049 struct netdev_dev_linux *netdev_dev =
2050 netdev_dev_linux_cast(netdev_get_dev(netdev));
2051 struct tc_queue *queue, *next_queue;
2052 struct shash details;
2056 error = tc_query_qdisc(netdev);
2059 } else if (!netdev_dev->tc->ops->class_get) {
2064 shash_init(&details);
2065 HMAP_FOR_EACH_SAFE (queue, next_queue, hmap_node,
2066 &netdev_dev->tc->queues) {
2067 shash_clear(&details);
2069 error = netdev_dev->tc->ops->class_get(netdev, queue, &details);
2071 (*cb)(queue->queue_id, &details, aux);
2076 shash_destroy(&details);
/* Dumps per-queue statistics of 'netdev' by starting a netlink class dump and
 * handing every reply message, together with 'cb' and 'aux', to the
 * class_dump_stats callback of the active tc_ops.
 *
 * The final result is the error from nl_dump_done() if there is one,
 * otherwise 'last_error'.
 *
 * NOTE(review): the maintenance of 'last_error' inside the loop and the early
 * returns are not visible in this listing. */
2082 netdev_linux_dump_queue_stats(const struct netdev *netdev,
2083 netdev_dump_queue_stats_cb *cb, void *aux)
2085 struct netdev_dev_linux *netdev_dev =
2086 netdev_dev_linux_cast(netdev_get_dev(netdev));
2087 struct nl_dump dump;
2092 error = tc_query_qdisc(netdev);
2095 } else if (!netdev_dev->tc->ops->class_dump_stats) {
2100 if (!start_queue_dump(netdev, &dump)) {
2103 while (nl_dump_next(&dump, &msg)) {
2104 error = netdev_dev->tc->ops->class_dump_stats(netdev, &msg, cb, aux);
2110 error = nl_dump_done(&dump);
2111 return error ? error : last_error;
/* Stores the IPv4 address and netmask of 'netdev_' in '*address' and
 * '*netmask'.
 *
 * When VALID_IN4 is not set, both values are fetched (SIOCGIFADDR and
 * SIOCGIFNETMASK) into the netdev_dev cache and VALID_IN4 is set.  The
 * visible return yields EADDRNOTAVAIL when the address is INADDR_ANY and 0
 * otherwise.
 *
 * NOTE(review): the error exits after the two queries are not visible in this
 * listing. */
2115 netdev_linux_get_in4(const struct netdev *netdev_,
2116 struct in_addr *address, struct in_addr *netmask)
2118 struct netdev_dev_linux *netdev_dev =
2119 netdev_dev_linux_cast(netdev_get_dev(netdev_));
2121 if (!(netdev_dev->cache_valid & VALID_IN4)) {
2124 error = netdev_linux_get_ipv4(netdev_, &netdev_dev->address,
2125 SIOCGIFADDR, "SIOCGIFADDR");
2130 error = netdev_linux_get_ipv4(netdev_, &netdev_dev->netmask,
2131 SIOCGIFNETMASK, "SIOCGIFNETMASK");
2136 netdev_dev->cache_valid |= VALID_IN4;
2138 *address = netdev_dev->address;
2139 *netmask = netdev_dev->netmask;
2140 return address->s_addr == INADDR_ANY ? EADDRNOTAVAIL : 0;
/* Assigns IPv4 'address' and 'netmask' to 'netdev_'.
 *
 * The address is set with SIOCSIFADDR and both values are stored in the
 * netdev_dev cache with VALID_IN4 set.  The netmask is then set with
 * SIOCSIFNETMASK, but only when the address is not INADDR_ANY.
 *
 * NOTE(review): in the visible order the cache is updated before the netmask
 * ioctl runs, so a failing SIOCSIFNETMASK would leave a cached netmask that
 * was never applied -- confirm whether the cache should be invalidated in
 * that case.  The test on the first ioctl result and the return statement
 * are not visible in this listing. */
2144 netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
2145 struct in_addr netmask)
2147 struct netdev_dev_linux *netdev_dev =
2148 netdev_dev_linux_cast(netdev_get_dev(netdev_));
2151 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
2153 netdev_dev->cache_valid |= VALID_IN4;
2154 netdev_dev->address = address;
2155 netdev_dev->netmask = netmask;
2156 if (address.s_addr != INADDR_ANY) {
2157 error = do_set_addr(netdev_, SIOCSIFNETMASK,
2158 "SIOCSIFNETMASK", netmask);
/* Parses 'line' (the caller in this file passes lines read from
 * /proc/net/if_inet6).  The format expects sixteen two-digit hexadecimal
 * bytes, which are stored in the bytes of 'in6', then four hexadecimal fields
 * that are skipped, then an interface name of at most 16 characters, which is
 * stored in 'ifname'.
 *
 * NOTE(review): the scanning call itself and the return expression are not
 * visible in this listing; the caller uses the result as a boolean. */
2165 parse_if_inet6_line(const char *line,
2166 struct in6_addr *in6, char ifname[16 + 1])
2168 uint8_t *s6 = in6->s6_addr;
2169 #define X8 "%2"SCNx8
2171 " "X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8
2172 "%*x %*x %*x %*x %16s\n",
2173 &s6[0], &s6[1], &s6[2], &s6[3],
2174 &s6[4], &s6[5], &s6[6], &s6[7],
2175 &s6[8], &s6[9], &s6[10], &s6[11],
2176 &s6[12], &s6[13], &s6[14], &s6[15],
2180 /* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address (if
2181 * 'in6' is non-null) and returns true. Otherwise, returns false. */
/* The address is looked up on first use by scanning /proc/net/if_inet6 for a
 * line whose interface name matches the device name, and is cached under
 * VALID_IN6; it stays in6addr_any when no line matches.
 *
 * NOTE(review): the visible assignment to '*in6' is not guarded by a null
 * check, which contradicts the "(if 'in6' is non-null)" wording above --
 * confirm which is intended.  The return statement and the closing of the
 * file are not visible in this listing. */
2183 netdev_linux_get_in6(const struct netdev *netdev_, struct in6_addr *in6)
2185 struct netdev_dev_linux *netdev_dev =
2186 netdev_dev_linux_cast(netdev_get_dev(netdev_));
2187 if (!(netdev_dev->cache_valid & VALID_IN6)) {
2191 netdev_dev->in6 = in6addr_any;
2193 file = fopen("/proc/net/if_inet6", "r");
2195 const char *name = netdev_get_name(netdev_);
2196 while (fgets(line, sizeof line, file)) {
2197 struct in6_addr in6_tmp;
2198 char ifname[16 + 1];
2199 if (parse_if_inet6_line(line, &in6_tmp, ifname)
2200 && !strcmp(name, ifname))
2202 netdev_dev->in6 = in6_tmp;
2208 netdev_dev->cache_valid |= VALID_IN6;
2210 *in6 = netdev_dev->in6;
/* Fills the generic socket address '*sa' with an AF_INET address holding
 * 'addr'.  A zeroed sockaddr_in is built locally, then '*sa' is zeroed and the
 * sockaddr_in is copied over it.
 *
 * NOTE(review): some lines of this function are not visible in this
 * listing. */
2215 make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
2217 struct sockaddr_in sin;
2218 memset(&sin, 0, sizeof sin);
2219 sin.sin_family = AF_INET;
2220 sin.sin_addr = addr;
2223 memset(sa, 0, sizeof *sa);
2224 memcpy(sa, &sin, sizeof sin);
/* Applies IPv4 address 'addr' to 'netdev' with the address-setting ioctl
 * 'ioctl_nr'.  The ifreq carries the device name and 'addr' encoded as an
 * AF_INET socket address; the result of netdev_linux_do_ioctl() is returned.
 *
 * NOTE(review): the declaration of 'ifr' and the last argument of the ioctl
 * call (presumably 'ioctl_name') are not visible in this listing. */
2228 do_set_addr(struct netdev *netdev,
2229 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
2232 ovs_strzcpy(ifr.ifr_name, netdev_get_name(netdev), sizeof ifr.ifr_name);
2233 make_in4_sockaddr(&ifr.ifr_addr, addr);
2235 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
2239 /* Adds 'router' as a default IP gateway. */
/* The route is added with the SIOCADDRT ioctl on 'af_inet_sock': destination
 * and genmask are both INADDR_ANY, the gateway is 'router', and the flags are
 * RTF_UP | RTF_GATEWAY.  'netdev' is unused.  A failure is logged with a
 * warning.
 *
 * NOTE(review): the declaration of 'rt' and the return statement are not
 * visible in this listing. */
2241 netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
2243 struct in_addr any = { INADDR_ANY };
2247 memset(&rt, 0, sizeof rt);
2248 make_in4_sockaddr(&rt.rt_dst, any);
2249 make_in4_sockaddr(&rt.rt_gateway, router);
2250 make_in4_sockaddr(&rt.rt_genmask, any);
2251 rt.rt_flags = RTF_UP | RTF_GATEWAY;
2252 error = ioctl(af_inet_sock, SIOCADDRT, &rt) < 0 ? errno : 0;
2254 VLOG_WARN("ioctl(SIOCADDRT): %s", strerror(error));
/* Looks up the route to 'host' by scanning /proc/net/route.
 *
 * '*netdev_name' is set to NULL on entry.  For the matching route that is
 * up, '*next_hop' receives either 0 (host directly reachable) or the gateway
 * address, and '*netdev_name' receives a copy of the interface name made with
 * xstrdup(), which the caller presumably has to free.
 *
 * NOTE(review): the third parameter declaration, several local declarations,
 * the return statements and the closing of the stream are not visible in this
 * listing. */
2260 netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
2263 static const char fn[] = "/proc/net/route";
2268 *netdev_name = NULL;
2269 stream = fopen(fn, "r");
2270 if (stream == NULL) {
2271 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
2276 while (fgets(line, sizeof line, stream)) {
2279 ovs_be32 dest, gateway, mask;
2280 int refcnt, metric, mtu;
2281 unsigned int flags, use, window, irtt;
/* A line must yield all 11 fields to be accepted; otherwise a warning is
 * logged. */
2284 "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
2286 iface, &dest, &gateway, &flags, &refcnt,
2287 &use, &metric, &mask, &mtu, &window, &irtt) != 11) {
2289 VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
2293 if (!(flags & RTF_UP)) {
2294 /* Skip routes that aren't up. */
2298 /* The output of 'dest', 'mask', and 'gateway' were given in
2299 * network byte order, so we don't need any endian
2300 * conversions here. */
2301 if ((dest & mask) == (host->s_addr & mask)) {
2303 /* The host is directly reachable. */
2304 next_hop->s_addr = 0;
2306 /* To reach the host, we must go through a gateway. */
2307 next_hop->s_addr = gateway;
2309 *netdev_name = xstrdup(iface);
/* Adds driver information for 'netdev' to 'sh' under the keys "driver_name",
 * "driver_version" and "firmware_version".  The values are heap copies
 * (xstrdup) of the strings in the drvinfo obtained through
 * netdev_linux_get_drvinfo().
 *
 * NOTE(review): the test on 'error' and the return statement are not visible
 * in this listing. */
2321 netdev_linux_get_drv_info(const struct netdev *netdev, struct shash *sh)
2324 struct netdev_dev_linux *netdev_dev =
2325 netdev_dev_linux_cast(netdev_get_dev(netdev));
2327 error = netdev_linux_get_drvinfo(netdev_dev);
2329 shash_add(sh, "driver_name", xstrdup(netdev_dev->drvinfo.driver));
2330 shash_add(sh, "driver_version", xstrdup(netdev_dev->drvinfo.version));
2331 shash_add(sh, "firmware_version", xstrdup(netdev_dev->drvinfo.fw_version));
/* Adds the fixed driver name "openvswitch" to 'sh' under the key
 * "driver_name".  'netdev' is unused. */
2337 netdev_internal_get_drv_info(const struct netdev *netdev OVS_UNUSED, struct shash *sh)
2339 shash_add(sh, "driver_name", xstrdup("openvswitch"));
2343 /* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
2344 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
2345 * returns 0. Otherwise, it returns a positive errno value; in particular,
2346 * ENXIO indicates that there is no ARP table entry for 'ip' on 'netdev'. */
2348 netdev_linux_arp_lookup(const struct netdev *netdev,
2349 ovs_be32 ip, uint8_t mac[ETH_ADDR_LEN])
2352 struct sockaddr_in sin;
/* Build the SIOCGARP request: protocol address 'ip', hardware type Ethernet,
 * restricted to the device named after 'netdev'. */
2355 memset(&r, 0, sizeof r);
2356 memset(&sin, 0, sizeof sin);
2357 sin.sin_family = AF_INET;
2358 sin.sin_addr.s_addr = ip;
2360 memcpy(&r.arp_pa, &sin, sizeof sin);
2361 r.arp_ha.sa_family = ARPHRD_ETHER;
2363 ovs_strzcpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
2364 COVERAGE_INC(netdev_arp_lookup);
2365 retval = ioctl(af_inet_sock, SIOCGARP, &r) < 0 ? errno : 0;
/* ENXIO is an expected outcome and is therefore not logged. */
2367 memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
2368 } else if (retval != ENXIO) {
2369 VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
2370 netdev_get_name(netdev), IP_ARGS(&ip), strerror(retval));
/* Converts the NETDEV_* flags in 'nd' into interface flag bits;
 * netdev_linux_update_flags() combines the result with the cached ifi_flags.
 * NETDEV_UP and NETDEV_PROMISC are the flags examined.
 *
 * NOTE(review): the assignments and the return statement are not visible in
 * this listing. */
2376 nd_to_iff_flags(enum netdev_flags nd)
2379 if (nd & NETDEV_UP) {
2382 if (nd & NETDEV_PROMISC) {
/* Converts the interface flag bits in 'iff' into NETDEV_* flags, starting
 * from an empty set.  IFF_PROMISC maps to NETDEV_PROMISC.
 *
 * NOTE(review): any other mappings and the return statement are not visible
 * in this listing. */
2389 iff_to_nd_flags(int iff)
2391 enum netdev_flags nd = 0;
2395 if (iff & IFF_PROMISC) {
2396 nd |= NETDEV_PROMISC;
/* Turns off the flags in 'off' and turns on the flags in 'on' for 'netdev'.
 * The flags in effect before the change, taken from the cached ifi_flags, are
 * stored in '*old_flagsp'.
 *
 * The kernel is only contacted when the resulting flag word differs from the
 * cached one; afterwards the cached flags are re-read with get_flags().
 *
 * NOTE(review): the return statement is not visible in this listing. */
2402 netdev_linux_update_flags(struct netdev *netdev, enum netdev_flags off,
2403 enum netdev_flags on, enum netdev_flags *old_flagsp)
2405 struct netdev_dev_linux *netdev_dev;
2406 int old_flags, new_flags;
2409 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
2410 old_flags = netdev_dev->ifi_flags;
2411 *old_flagsp = iff_to_nd_flags(old_flags);
2412 new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
2413 if (new_flags != old_flags) {
2414 error = set_flags(netdev, new_flags);
2415 get_flags(&netdev_dev->netdev_dev, &netdev_dev->ifi_flags);
/* Flag-update function passed as the last argument when netdev_tap_pl_class
 * is defined; it has the same parameters as netdev_linux_update_flags().
 *
 * NOTE(review): the body is not visible in this listing. */
2421 netdev_tap_pl_update_flags(struct netdev *netdev, enum netdev_flags off,
2422 enum netdev_flags on, enum netdev_flags *old_flagsp)
/* Returns the change_seq value kept in the netdev_dev of 'netdev'. */
2428 netdev_linux_change_seq(const struct netdev *netdev)
2430 return netdev_dev_linux_cast(netdev_get_dev(netdev))->change_seq;
/* Expands to the initializer of a struct netdev_class.  Most entries are the
 * shared netdev_linux_* functions; the macro arguments are the entries that
 * differ between the classes defined with this macro.
 *
 * NOTE(review): part of the parameter list and some entries of the
 * initializer are not visible in this listing. */
2433 #define NETDEV_LINUX_CLASS(NAME, CREATE, GET_STATS, SET_STATS, \
2434 GET_FEATURES, GET_STATUS, \
2439 netdev_linux_init, \
2441 netdev_linux_wait, \
2444 netdev_linux_destroy, \
2445 NULL, /* get_config */ \
2446 NULL, /* set_config */ \
2448 netdev_linux_open, \
2449 netdev_linux_close, \
2451 netdev_linux_listen, \
2452 netdev_linux_recv, \
2453 netdev_linux_recv_wait, \
2454 netdev_linux_drain, \
2456 netdev_linux_send, \
2457 netdev_linux_send_wait, \
2459 netdev_linux_set_etheraddr, \
2460 netdev_linux_get_etheraddr, \
2461 netdev_linux_get_mtu, \
2462 netdev_linux_set_mtu, \
2463 netdev_linux_get_ifindex, \
2464 netdev_linux_get_carrier, \
2465 netdev_linux_get_carrier_resets, \
2466 netdev_linux_set_miimon_interval, \
2471 netdev_linux_set_advertisements, \
2473 netdev_linux_set_policing, \
2474 netdev_linux_get_qos_types, \
2475 netdev_linux_get_qos_capabilities, \
2476 netdev_linux_get_qos, \
2477 netdev_linux_set_qos, \
2478 netdev_linux_get_queue, \
2479 netdev_linux_set_queue, \
2480 netdev_linux_delete_queue, \
2481 netdev_linux_get_queue_stats, \
2482 netdev_linux_dump_queues, \
2483 netdev_linux_dump_queue_stats, \
2485 netdev_linux_get_in4, \
2486 netdev_linux_set_in4, \
2487 netdev_linux_get_in6, \
2488 netdev_linux_add_router, \
2489 netdev_linux_get_next_hop, \
2491 netdev_linux_arp_lookup, \
2495 netdev_linux_change_seq \
/* Class definition that combines netdev_linux_create with the full stats
 * function netdev_linux_get_stats, no set_stats, ethtool-based features and
 * driver info, and the regular flag update function.
 *
 * NOTE(review): the macro invocation line and the class name string are not
 * visible in this listing. */
2498 const struct netdev_class netdev_linux_class =
2501 netdev_linux_create,
2502 netdev_linux_get_stats,
2503 NULL, /* set_stats */
2504 netdev_linux_get_features,
2505 netdev_linux_get_drv_info,
2506 netdev_linux_update_flags);
/* Class definition for tap devices: created with netdev_linux_create_tap and
 * using netdev_tap_get_stats; the remaining arguments match those of
 * netdev_linux_class.
 *
 * NOTE(review): the macro invocation line and the class name string are not
 * visible in this listing. */
2508 const struct netdev_class netdev_tap_class =
2511 netdev_linux_create_tap,
2512 netdev_tap_get_stats,
2513 NULL, /* set_stats */
2514 netdev_linux_get_features,
2515 netdev_linux_get_drv_info,
2516 netdev_linux_update_flags);
/* Class definition for internal devices: stats come from the vport layer only
 * (netdev_internal_get_stats), stats can be set through
 * netdev_vport_set_stats, there is no get_features function, and the driver
 * info is the fixed one from netdev_internal_get_drv_info.
 *
 * NOTE(review): the macro invocation line and the class name string are not
 * visible in this listing. */
2518 const struct netdev_class netdev_internal_class =
2521 netdev_linux_create,
2522 netdev_internal_get_stats,
2523 netdev_vport_set_stats,
2524 NULL, /* get_features */
2525 netdev_internal_get_drv_info,
2526 netdev_linux_update_flags);
/* Class definition for the "tap_pl" variant: created with
 * netdev_linux_create_tap_pl and using netdev_tap_pl_update_flags for flag
 * changes; stats, features and driver info match netdev_tap_class.
 *
 * NOTE(review): the macro invocation line and the class name string are not
 * visible in this listing. */
2528 const struct netdev_class netdev_tap_pl_class =
2531 netdev_linux_create_tap_pl,
2532 netdev_tap_get_stats,
2533 NULL, /* set_stats */
2534 netdev_linux_get_features,
2535 netdev_linux_get_drv_info,
2536 netdev_tap_pl_update_flags);
2538 /* HTB traffic control class. */
/* Largest class minor number accepted as an HTB queue (queue id is
 * minor - 1); also the n_queues value published in tc_ops_htb. */
2540 #define HTB_N_QUEUES 0xf000
/* Qdisc-wide rate cap; htb_parse_class_details__() clamps per-class rates to
 * htb->max_rate.  NOTE(review): presumably a member of 'struct htb' --
 * confirm against the full definition. */
2544 unsigned int max_rate; /* In bytes/s. */
/* Per-queue HTB parameters.  'tc_queue' is the embedded generic queue that
 * htb_class_cast__() converts back from with CONTAINER_OF. */
2548 struct tc_queue tc_queue;
2549 unsigned int min_rate; /* In bytes/s. */
2550 unsigned int max_rate; /* In bytes/s. */
2551 unsigned int burst; /* In bytes. */
2552 unsigned int priority; /* Lower values are higher priorities. */
/* Returns the 'struct htb' that embeds the tc currently attached to
 * 'netdev''s device.  This is a plain CONTAINER_OF conversion, so it is only
 * meaningful when the attached tc really is an HTB one. */
2556 htb_get__(const struct netdev *netdev)
2558 struct netdev_dev_linux *netdev_dev =
2559 netdev_dev_linux_cast(netdev_get_dev(netdev));
2560 return CONTAINER_OF(netdev_dev->tc, struct htb, tc);
/* Allocates a new 'struct htb', initializes its embedded tc with tc_ops_htb,
 * records 'max_rate' in it and makes it the tc of 'netdev''s device.  Only
 * in-memory state is touched; no kernel request is made here. */
2564 htb_install__(struct netdev *netdev, uint64_t max_rate)
2566 struct netdev_dev_linux *netdev_dev =
2567 netdev_dev_linux_cast(netdev_get_dev(netdev));
2570 htb = xmalloc(sizeof *htb);
2571 tc_init(&htb->tc, &tc_ops_htb);
2572 htb->max_rate = max_rate;
2574 netdev_dev->tc = &htb->tc;
2577 /* Create an HTB qdisc.
2579 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
2581 htb_setup_qdisc__(struct netdev *netdev)
2584 struct tc_htb_glob opt;
2585 struct ofpbuf request;
2586 struct tcmsg *tcmsg;
/* Called before the NLM_F_EXCL create request below. */
2588 tc_del_qdisc(netdev);
2590 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2591 NLM_F_EXCL | NLM_F_CREATE, &request);
/* The new qdisc gets handle 1:0 and is attached at the root. */
2595 tcmsg->tcm_handle = tc_make_handle(1, 0);
2596 tcmsg->tcm_parent = TC_H_ROOT;
2598 nl_msg_put_string(&request, TCA_KIND, "htb");
/* All global HTB options start out zeroed. */
2600 memset(&opt, 0, sizeof opt);
2601 opt.rate2quantum = 10;
/* The options travel as TCA_HTB_INIT nested inside TCA_OPTIONS. */
2605 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2606 nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
2607 nl_msg_end_nested(&request, opt_offset);
/* The result of the netlink transaction is the return value. */
2609 return tc_transact(&request, NULL);
2612 /* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
2613 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
2615 htb_setup_class__(struct netdev *netdev, unsigned int handle,
2616 unsigned int parent, struct htb_class *class)
2619 struct tc_htb_opt opt;
2620 struct ofpbuf request;
2621 struct tcmsg *tcmsg;
/* The MTU feeds both rate specs and both buffer computations below. */
2625 error = netdev_get_mtu(netdev, &mtu);
2627 VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU",
2628 netdev_get_name(netdev));
/* 'rate' carries min_rate and 'ceil' carries max_rate; the same 'burst'
 * value sizes both the rate buffer and the ceil buffer. */
2632 memset(&opt, 0, sizeof opt);
2633 tc_fill_rate(&opt.rate, class->min_rate, mtu);
2634 tc_fill_rate(&opt.ceil, class->max_rate, mtu);
2635 opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
2636 opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
2637 opt.prio = class->priority;
2639 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
2643 tcmsg->tcm_handle = handle;
2644 tcmsg->tcm_parent = parent;
/* TCA_OPTIONS nests the parameters plus one rate table for 'rate' and one
 * for 'ceil'. */
2646 nl_msg_put_string(&request, TCA_KIND, "htb");
2647 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2648 nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
2649 tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
2650 tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
2651 nl_msg_end_nested(&request, opt_offset);
2653 error = tc_transact(&request, NULL);
2655 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
2656 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
2657 netdev_get_name(netdev),
2658 tc_get_major(handle), tc_get_minor(handle),
2659 tc_get_major(parent), tc_get_minor(parent),
2660 class->min_rate, class->max_rate,
2661 class->burst, class->priority, strerror(error));
/* Fills 'class' from the mandatory TCA_HTB_PARMS attribute nested in
 * 'nl_options': min_rate from rate.rate, max_rate from ceil.rate, priority
 * from prio, and burst converted from ticks to bytes at the class rate. */
2666 /* Parses Netlink attributes in 'options' for HTB parameters and stores a
2667 * description of them into 'details'. The description complies with the
2668 * specification given in the vswitch database documentation for linux-htb
2671 htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
2673 static const struct nl_policy tca_htb_policy[] = {
2674 [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
2675 .min_len = sizeof(struct tc_htb_opt) },
2678 struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
2679 const struct tc_htb_opt *htb;
2681 if (!nl_parse_nested(nl_options, tca_htb_policy,
2682 attrs, ARRAY_SIZE(tca_htb_policy))) {
2683 VLOG_WARN_RL(&rl, "failed to parse HTB class options");
2687 htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
2688 class->min_rate = htb->rate.rate;
2689 class->max_rate = htb->ceil.rate;
2690 class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
2691 class->priority = htb->prio;
/* Parses the class message 'tcmsg' with tc_parse_class().  If that succeeds
 * and 'queue_id' is nonnull, a handle of the form 1:<minor> with minor in
 * 1..HTB_N_QUEUES yields queue id minor - 1.  If 'options' is nonnull (and
 * no error occurred so far) the class options are decoded into it by
 * htb_parse_tca_options__().  'stats' is passed straight to
 * tc_parse_class(). */
2696 htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2697 struct htb_class *options,
2698 struct netdev_queue_stats *stats)
2700 struct nlattr *nl_options;
2701 unsigned int handle;
2704 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2705 if (!error && queue_id) {
2706 unsigned int major = tc_get_major(handle);
2707 unsigned int minor = tc_get_minor(handle);
2708 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2709 *queue_id = minor - 1;
2714 if (!error && options) {
2715 error = htb_parse_tca_options__(nl_options, options);
/* Derives the parameters of the HTB root class from the qdisc 'details'.
 *
 * "max-rate" is read as a decimal number of bits/s and stored in 'hc' as
 * bytes/s (hence the division by 8).  When it is absent or works out to 0,
 * the rate is taken from the device's current link features instead.  The
 * root class gets min_rate equal to max_rate. */
2721 htb_parse_qdisc_details__(struct netdev *netdev,
2722 const struct shash *details, struct htb_class *hc)
2724 const char *max_rate_s;
2726 max_rate_s = shash_find_data(details, "max-rate");
2727 hc->max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
2728 if (!hc->max_rate) {
2729 enum netdev_features current;
/* Pass the address of 'current' so the current feature set is written
 * into it; only that output is requested. */
2731 netdev_get_features(netdev, &current, NULL, NULL, NULL);
2732 hc->max_rate = netdev_features_to_bps(current) / 8;
2734 hc->min_rate = hc->max_rate;
/* Parses the per-queue 'details' ("min-rate", "max-rate", "burst",
 * "priority") into 'hc'.  The numeric strings for the rates and burst are
 * divided by 8 before being stored in the bytes-based fields of 'hc'.
 * Rates are clamped against the device MTU, each other, and the qdisc-wide
 * htb->max_rate as shown below; burst is raised to at least mtu + 64. */
2740 htb_parse_class_details__(struct netdev *netdev,
2741 const struct shash *details, struct htb_class *hc)
2743 const struct htb *htb = htb_get__(netdev);
2744 const char *min_rate_s = shash_find_data(details, "min-rate");
2745 const char *max_rate_s = shash_find_data(details, "max-rate");
2746 const char *burst_s = shash_find_data(details, "burst");
2747 const char *priority_s = shash_find_data(details, "priority");
2750 error = netdev_get_mtu(netdev, &mtu);
2752 VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU",
2753 netdev_get_name(netdev));
2757 /* HTB requires at least an mtu sized min-rate to send any traffic even
2758 * on uncongested links. */
2759 hc->min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
2760 hc->min_rate = MAX(hc->min_rate, mtu);
2761 hc->min_rate = MIN(hc->min_rate, htb->max_rate);
2764 hc->max_rate = (max_rate_s
2765 ? strtoull(max_rate_s, NULL, 10) / 8
2767 hc->max_rate = MAX(hc->max_rate, hc->min_rate);
2768 hc->max_rate = MIN(hc->max_rate, htb->max_rate);
2772 * According to hints in the documentation that I've read, it is important
2773 * that 'burst' be at least as big as the largest frame that might be
2774 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
2775 * but having it a bit too small is a problem. Since netdev_get_mtu()
2776 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
2777 * the MTU. We actually add 64, instead of 14, as a guard against
2778 * additional headers get tacked on somewhere that we're not aware of. */
2779 hc->burst = burst_s ? strtoull(burst_s, NULL, 10) / 8 : 0;
2780 hc->burst = MAX(hc->burst, mtu + 64);
2783 hc->priority = priority_s ? strtoul(priority_s, NULL, 10) : 0;
/* Queries the kernel for class 'handle' under 'parent' on 'netdev' and
 * decodes the reply with htb_parse_tcmsg__() (no queue id requested) into
 * 'options' and 'stats'.  The reply buffer is released after parsing. */
2789 htb_query_class__(const struct netdev *netdev, unsigned int handle,
2790 unsigned int parent, struct htb_class *options,
2791 struct netdev_queue_stats *stats)
2793 struct ofpbuf *reply;
2796 error = tc_query_class(netdev, handle, parent, &reply);
2798 error = htb_parse_tcmsg__(reply, NULL, options, stats);
2799 ofpbuf_delete(reply);
/* Installs HTB on 'netdev': creates the qdisc, sets up class 1:fffe under
 * 1:0 from the qdisc 'details', then records the in-memory HTB state with
 * the resulting max_rate. */
2805 htb_tc_install(struct netdev *netdev, const struct shash *details)
2809 error = htb_setup_qdisc__(netdev);
2811 struct htb_class hc;
2813 htb_parse_qdisc_details__(netdev, details, &hc);
2814 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2815 tc_make_handle(1, 0), &hc);
2817 htb_install__(netdev, hc.max_rate);
/* Converts a generic 'queue' into the 'struct htb_class' that embeds it as
 * its 'tc_queue' member. */
2823 static struct htb_class *
2824 htb_class_cast__(const struct tc_queue *queue)
2826 return CONTAINER_OF(queue, struct htb_class, tc_queue);
/* Records the parameters in 'hc' for queue 'queue_id' in the in-memory HTB
 * state of 'netdev'.  The queue is looked up by id; a new htb_class is
 * allocated and inserted into htb->tc.queues for an id that is not yet
 * known (the branch condition is outside the lines shown here).  The four
 * parameters are then copied from 'hc'. */
2830 htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
2831 const struct htb_class *hc)
2833 struct htb *htb = htb_get__(netdev);
2834 size_t hash = hash_int(queue_id, 0);
2835 struct tc_queue *queue;
2836 struct htb_class *hcp;
2838 queue = tc_find_queue__(netdev, queue_id, hash);
2840 hcp = htb_class_cast__(queue);
2842 hcp = xmalloc(sizeof *hcp);
2843 queue = &hcp->tc_queue;
2844 queue->queue_id = queue_id;
2845 hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
2848 hcp->min_rate = hc->min_rate;
2849 hcp->max_rate = hc->max_rate;
2850 hcp->burst = hc->burst;
2851 hcp->priority = hc->priority;
/* Rebuilds the in-memory HTB state of 'netdev' from the kernel: class 1:fffe
 * supplies the qdisc-wide max_rate, then every class message in the queue
 * dump that htb_parse_tcmsg__() accepts is recorded as a queue.  'nlmsg' is
 * unused. */
2855 htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
2858 struct nl_dump dump;
2859 struct htb_class hc;
2861 /* Get qdisc options. */
2863 htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
2864 htb_install__(netdev, hc.max_rate);
2867 if (!start_queue_dump(netdev, &dump)) {
2870 while (nl_dump_next(&dump, &msg)) {
2871 unsigned int queue_id;
2873 if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
2874 htb_update_queue__(netdev, queue_id, &hc);
2877 nl_dump_done(&dump);
/* Tears down the HTB state that embeds 'tc'.  Every queue is unlinked from
 * htb->tc.queues; the _SAFE iterator is used because entries are removed
 * while iterating. */
2883 htb_tc_destroy(struct tc *tc)
2885 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
2886 struct htb_class *hc, *next;
2888 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &htb->tc.queues) {
2889 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
/* Adds "max-rate" to 'details' as a decimal string holding 8 times the
 * stored htb->max_rate.  The value string is heap-allocated by
 * xasprintf(). */
2897 htb_qdisc_get(const struct netdev *netdev, struct shash *details)
2899 const struct htb *htb = htb_get__(netdev);
2900 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * htb->max_rate));
/* Reconfigures the HTB qdisc from 'details' by re-issuing the setup of class
 * 1:fffe under 1:0, then stores the new max_rate in the in-memory state. */
2905 htb_qdisc_set(struct netdev *netdev, const struct shash *details)
2907 struct htb_class hc;
2910 htb_parse_qdisc_details__(netdev, details, &hc);
2911 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2912 tc_make_handle(1, 0), &hc);
2914 htb_get__(netdev)->max_rate = hc.max_rate;
/* Describes 'queue' in 'details'.  Rates and burst are emitted as 8 times
 * the stored value; "max-rate" is emitted only when it differs from
 * "min-rate".  'netdev' is unused. */
2920 htb_class_get(const struct netdev *netdev OVS_UNUSED,
2921 const struct tc_queue *queue, struct shash *details)
2923 const struct htb_class *hc = htb_class_cast__(queue);
2925 shash_add(details, "min-rate", xasprintf("%llu", 8ULL * hc->min_rate));
2926 if (hc->min_rate != hc->max_rate) {
2927 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hc->max_rate));
2929 shash_add(details, "burst", xasprintf("%llu", 8ULL * hc->burst));
2931 shash_add(details, "priority", xasprintf("%u", hc->priority));
/* Configures queue 'queue_id' from 'details': parses them, sets up kernel
 * class 1:<queue_id + 1> under parent 1:fffe, and records the parameters in
 * the in-memory queue table. */
2937 htb_class_set(struct netdev *netdev, unsigned int queue_id,
2938 const struct shash *details)
2940 struct htb_class hc;
2943 error = htb_parse_class_details__(netdev, details, &hc);
2948 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
2949 tc_make_handle(1, 0xfffe), &hc);
2954 htb_update_queue__(netdev, queue_id, &hc);
/* Deletes kernel class 1:<queue_id + 1> for 'queue' and unlinks the queue
 * from htb->tc.queues. */
2959 htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
2961 struct htb_class *hc = htb_class_cast__(queue);
2962 struct htb *htb = htb_get__(netdev);
2965 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
2967 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
/* Fetches statistics for 'queue' by querying class 1:<queue_id + 1> under
 * parent 1:fffe; class options are not requested.  Returns the result of
 * htb_query_class__(). */
2974 htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
2975 struct netdev_queue_stats *stats)
2977 return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
2978 tc_make_handle(1, 0xfffe), NULL, stats);
/* Parses the handle and statistics out of the class message 'nlmsg' and, for
 * a handle 1:<minor> with minor in 1..HTB_N_QUEUES, invokes 'cb' with queue
 * id minor - 1, the statistics and 'aux'.  'netdev' is unused. */
2982 htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
2983 const struct ofpbuf *nlmsg,
2984 netdev_dump_queue_stats_cb *cb, void *aux)
2986 struct netdev_queue_stats stats;
2987 unsigned int handle, major, minor;
2990 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
2995 major = tc_get_major(handle);
2996 minor = tc_get_minor(handle);
2997 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2998 (*cb)(minor - 1, &stats, aux);
/* Operations table for HTB: kernel qdisc name "htb", OVS QoS type
 * "linux-htb", with HTB_N_QUEUES queues. */
3003 static const struct tc_ops tc_ops_htb = {
3004 "htb", /* linux_name */
3005 "linux-htb", /* ovs_name */
3006 HTB_N_QUEUES, /* n_queues */
3015 htb_class_get_stats,
3016 htb_class_dump_stats
3019 /* "linux-hfsc" traffic control class. */
/* Largest class minor number accepted as an HFSC queue (queue id is
 * minor - 1); also the n_queues value published in tc_ops_hfsc. */
3021 #define HFSC_N_QUEUES 0xf000
/* Embedded generic queue; hfsc_class_cast__() converts a 'struct tc_queue'
 * pointer back to the 'struct hfsc_class' that contains this member. */
3029 struct tc_queue tc_queue;
/* Returns the 'struct hfsc' that embeds the tc currently attached to
 * 'netdev''s device.  This is a plain CONTAINER_OF conversion, so it is only
 * meaningful when the attached tc really is an HFSC one. */
3034 static struct hfsc *
3035 hfsc_get__(const struct netdev *netdev)
3037 struct netdev_dev_linux *netdev_dev;
3038 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
3039 return CONTAINER_OF(netdev_dev->tc, struct hfsc, tc);
/* Converts a generic 'queue' into the 'struct hfsc_class' that embeds it as
 * its 'tc_queue' member. */
3042 static struct hfsc_class *
3043 hfsc_class_cast__(const struct tc_queue *queue)
3045 return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
/* Allocates a new 'struct hfsc', initializes its embedded tc with
 * tc_ops_hfsc, records 'max_rate' in it and makes it the tc of 'netdev''s
 * device.  Only in-memory state is touched here. */
3049 hfsc_install__(struct netdev *netdev, uint32_t max_rate)
3051 struct netdev_dev_linux * netdev_dev;
3054 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
3055 hfsc = xmalloc(sizeof *hfsc);
3056 tc_init(&hfsc->tc, &tc_ops_hfsc);
3057 hfsc->max_rate = max_rate;
3058 netdev_dev->tc = &hfsc->tc;
/* Records min_rate and max_rate from 'hc' for queue 'queue_id' in the
 * in-memory HFSC state of 'netdev'.  The queue is looked up by id; a new
 * hfsc_class is allocated and inserted into hfsc->tc.queues for an id that
 * is not yet known (the branch condition is outside the lines shown
 * here). */
3062 hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
3063 const struct hfsc_class *hc)
3067 struct hfsc_class *hcp;
3068 struct tc_queue *queue;
3070 hfsc = hfsc_get__(netdev);
3071 hash = hash_int(queue_id, 0);
3073 queue = tc_find_queue__(netdev, queue_id, hash);
3075 hcp = hfsc_class_cast__(queue);
3077 hcp = xmalloc(sizeof *hcp);
3078 queue = &hcp->tc_queue;
3079 queue->queue_id = queue_id;
3080 hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
3083 hcp->min_rate = hc->min_rate;
3084 hcp->max_rate = hc->max_rate;
/* Extracts an HFSC class's rates from the RSC, FSC and USC service curves
 * nested in 'nl_options'.  A warning is logged when any curve has a nonzero
 * m1 or d, when rsc->m2 differs from fsc->m2, or when rsc->m2 exceeds
 * usc->m2.  'class'->min_rate is taken from fsc->m2 and 'class'->max_rate
 * from usc->m2. */
3088 hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
3090 const struct tc_service_curve *rsc, *fsc, *usc;
3091 static const struct nl_policy tca_hfsc_policy[] = {
3093 .type = NL_A_UNSPEC,
3095 .min_len = sizeof(struct tc_service_curve),
3098 .type = NL_A_UNSPEC,
3100 .min_len = sizeof(struct tc_service_curve),
3103 .type = NL_A_UNSPEC,
3105 .min_len = sizeof(struct tc_service_curve),
3108 struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];
3110 if (!nl_parse_nested(nl_options, tca_hfsc_policy,
3111 attrs, ARRAY_SIZE(tca_hfsc_policy))) {
3112 VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
3116 rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
3117 fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
3118 usc = nl_attr_get(attrs[TCA_HFSC_USC]);
3120 if (rsc->m1 != 0 || rsc->d != 0 ||
3121 fsc->m1 != 0 || fsc->d != 0 ||
3122 usc->m1 != 0 || usc->d != 0) {
3123 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3124 "Non-linear service curves are not supported.");
3128 if (rsc->m2 != fsc->m2) {
3129 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3130 "Real-time service curves are not supported ");
3134 if (rsc->m2 > usc->m2) {
3135 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3136 "Min-rate service curve is greater than "
3137 "the max-rate service curve.");
3141 class->min_rate = fsc->m2;
3142 class->max_rate = usc->m2;
/* Parses the class message 'tcmsg' with tc_parse_class().  A handle of the
 * form 1:<minor> with minor in 1..HFSC_N_QUEUES yields queue id minor - 1,
 * and the class options are decoded by hfsc_parse_tca_options__().
 * NOTE(review): presumably each output is produced only when its pointer is
 * nonnull (hfsc_query_class__() passes a NULL 'queue_id') -- confirm. */
3147 hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
3148 struct hfsc_class *options,
3149 struct netdev_queue_stats *stats)
3152 unsigned int handle;
3153 struct nlattr *nl_options;
3155 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
3161 unsigned int major, minor;
3163 major = tc_get_major(handle);
3164 minor = tc_get_minor(handle);
3165 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3166 *queue_id = minor - 1;
3173 error = hfsc_parse_tca_options__(nl_options, options);
/* Queries the kernel for class 'handle' under 'parent' on 'netdev' and
 * decodes the reply with hfsc_parse_tcmsg__() (no queue id requested) into
 * 'options' and 'stats'.  The reply buffer is released after parsing. */
3180 hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
3181 unsigned int parent, struct hfsc_class *options,
3182 struct netdev_queue_stats *stats)
3185 struct ofpbuf *reply;
3187 error = tc_query_class(netdev, handle, parent, &reply);
3192 error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
3193 ofpbuf_delete(reply);
/* Derives the parameters of the HFSC root class from the qdisc 'details'.
 *
 * "max-rate" is read as a decimal number and divided by 8 before use.  A
 * missing "max-rate" counts as 0, and the fallback below takes the rate from
 * the device's current link features.  Both min_rate and max_rate of
 * 'class' are set to the resulting value. */
3198 hfsc_parse_qdisc_details__(struct netdev *netdev, const struct shash *details,
3199 struct hfsc_class *class)
3202 const char *max_rate_s;
3204 max_rate_s = shash_find_data(details, "max-rate");
3205 max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
3208 enum netdev_features current;
/* Pass the address of 'current' so the current feature set is written
 * into it; only that output is requested. */
3210 netdev_get_features(netdev, &current, NULL, NULL, NULL);
3211 max_rate = netdev_features_to_bps(current) / 8;
3214 class->min_rate = max_rate;
3215 class->max_rate = max_rate;
/* Parses the per-queue 'details' ("min-rate", "max-rate") into 'class'.
 * Both strings are read as decimal numbers and divided by 8.  min_rate is
 * forced into the range [1, hfsc->max_rate]; max_rate is forced to be at
 * least min_rate and at most hfsc->max_rate. */
3219 hfsc_parse_class_details__(struct netdev *netdev,
3220 const struct shash *details,
3221 struct hfsc_class * class)
3223 const struct hfsc *hfsc;
3224 uint32_t min_rate, max_rate;
3225 const char *min_rate_s, *max_rate_s;
3227 hfsc = hfsc_get__(netdev);
3228 min_rate_s = shash_find_data(details, "min-rate");
3229 max_rate_s = shash_find_data(details, "max-rate");
3231 min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
3232 min_rate = MAX(min_rate, 1);
3233 min_rate = MIN(min_rate, hfsc->max_rate);
3235 max_rate = (max_rate_s
3236 ? strtoull(max_rate_s, NULL, 10) / 8
3238 max_rate = MAX(max_rate, min_rate);
3239 max_rate = MIN(max_rate, hfsc->max_rate);
3241 class->min_rate = min_rate;
3242 class->max_rate = max_rate;
3247 /* Create an HFSC qdisc.
3249 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
3251 hfsc_setup_qdisc__(struct netdev * netdev)
3253 struct tcmsg *tcmsg;
3254 struct ofpbuf request;
3255 struct tc_hfsc_qopt opt;
/* Called before the NLM_F_EXCL create request below. */
3257 tc_del_qdisc(netdev);
3259 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3260 NLM_F_EXCL | NLM_F_CREATE, &request);
/* The new qdisc gets handle 1:0 and is attached at the root. */
3266 tcmsg->tcm_handle = tc_make_handle(1, 0);
3267 tcmsg->tcm_parent = TC_H_ROOT;
3269 memset(&opt, 0, sizeof opt);
/* The options struct is attached directly as the TCA_OPTIONS payload; no
 * nested attribute is opened here. */
3272 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3273 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
/* The result of the netlink transaction is the return value. */
3275 return tc_transact(&request, NULL);
3278 /* Create an HFSC class.
3280 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
3281 * sc rate <min_rate> ul rate <max_rate>" */
3283 hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
3284 unsigned int parent, struct hfsc_class *class)
3288 struct tcmsg *tcmsg;
3289 struct ofpbuf request;
3290 struct tc_service_curve min, max;
3292 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
3298 tcmsg->tcm_handle = handle;
3299 tcmsg->tcm_parent = parent;
/* The class rates become the m2 slopes of the two curves. */
3303 min.m2 = class->min_rate;
3307 max.m2 = class->max_rate;
/* The 'min' curve is sent twice, as both RSC and FSC; 'max' is sent as
 * USC. */
3309 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3310 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3311 nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
3312 nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
3313 nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
3314 nl_msg_end_nested(&request, opt_offset);
3316 error = tc_transact(&request, NULL);
3318 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
3319 "min-rate %ubps, max-rate %ubps (%s)",
3320 netdev_get_name(netdev),
3321 tc_get_major(handle), tc_get_minor(handle),
3322 tc_get_major(parent), tc_get_minor(parent),
3323 class->min_rate, class->max_rate, strerror(error));
/* Installs HFSC on 'netdev': creates the qdisc, sets up class 1:fffe under
 * 1:0 from the qdisc 'details', then records the in-memory HFSC state with
 * the resulting max_rate. */
3330 hfsc_tc_install(struct netdev *netdev, const struct shash *details)
3333 struct hfsc_class class;
3335 error = hfsc_setup_qdisc__(netdev);
3341 hfsc_parse_qdisc_details__(netdev, details, &class);
3342 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3343 tc_make_handle(1, 0), &class);
3349 hfsc_install__(netdev, class.max_rate);
/* Rebuilds the in-memory HFSC state of 'netdev' from the kernel: class
 * 1:fffe supplies the qdisc-wide max_rate, then every class message in the
 * queue dump that hfsc_parse_tcmsg__() accepts is recorded as a queue.
 * 'nlmsg' is unused. */
3354 hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3357 struct nl_dump dump;
3358 struct hfsc_class hc;
3361 hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
3362 hfsc_install__(netdev, hc.max_rate);
3364 if (!start_queue_dump(netdev, &dump)) {
3368 while (nl_dump_next(&dump, &msg)) {
3369 unsigned int queue_id;
3371 if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3372 hfsc_update_queue__(netdev, queue_id, &hc);
3376 nl_dump_done(&dump);
/* Tears down the HFSC state that embeds 'tc'.  Every queue is unlinked from
 * hfsc->tc.queues; the _SAFE iterator is used because entries are removed
 * while iterating. */
3381 hfsc_tc_destroy(struct tc *tc)
3384 struct hfsc_class *hc, *next;
3386 hfsc = CONTAINER_OF(tc, struct hfsc, tc);
3388 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
3389 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
/* Adds "max-rate" to 'details' as a decimal string holding 8 times the
 * stored hfsc->max_rate.  The value string is heap-allocated by
 * xasprintf(). */
3398 hfsc_qdisc_get(const struct netdev *netdev, struct shash *details)
3400 const struct hfsc *hfsc;
3401 hfsc = hfsc_get__(netdev);
3402 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hfsc->max_rate));
/* Reconfigures the HFSC qdisc from 'details' by re-issuing the setup of
 * class 1:fffe under 1:0, then stores the new max_rate in the in-memory
 * state. */
3407 hfsc_qdisc_set(struct netdev *netdev, const struct shash *details)
3410 struct hfsc_class class;
3412 hfsc_parse_qdisc_details__(netdev, details, &class);
3413 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3414 tc_make_handle(1, 0), &class);
3417 hfsc_get__(netdev)->max_rate = class.max_rate;
/* Describes 'queue' in 'details'.  Rates are emitted as 8 times the stored
 * value; "max-rate" is emitted only when it differs from "min-rate".
 * 'netdev' is unused. */
3424 hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
3425 const struct tc_queue *queue, struct shash *details)
3427 const struct hfsc_class *hc;
3429 hc = hfsc_class_cast__(queue);
3430 shash_add(details, "min-rate", xasprintf("%llu", 8ULL * hc->min_rate));
3431 if (hc->min_rate != hc->max_rate) {
3432 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hc->max_rate));
/* Configures queue 'queue_id' from 'details': parses them, sets up kernel
 * class 1:<queue_id + 1> under parent 1:fffe, and records the rates in the
 * in-memory queue table. */
3438 hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
3439 const struct shash *details)
3442 struct hfsc_class class;
3444 error = hfsc_parse_class_details__(netdev, details, &class);
3449 error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3450 tc_make_handle(1, 0xfffe), &class);
3455 hfsc_update_queue__(netdev, queue_id, &class);
/* Deletes kernel class 1:<queue_id + 1> for 'queue' and unlinks the queue
 * from hfsc->tc.queues. */
3460 hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
3464 struct hfsc_class *hc;
3466 hc = hfsc_class_cast__(queue);
3467 hfsc = hfsc_get__(netdev);
3469 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3471 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
/* Fetches statistics for 'queue' by querying class 1:<queue_id + 1> under
 * parent 1:fffe; class options are not requested.  Returns the result of
 * hfsc_query_class__(). */
3478 hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3479 struct netdev_queue_stats *stats)
3481 return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3482 tc_make_handle(1, 0xfffe), NULL, stats);
/* Parses the handle and statistics out of the class message 'nlmsg' and, for
 * a handle 1:<minor> with minor in 1..HFSC_N_QUEUES, invokes 'cb' with queue
 * id minor - 1, the statistics and 'aux'.  'netdev' is unused. */
3486 hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3487 const struct ofpbuf *nlmsg,
3488 netdev_dump_queue_stats_cb *cb, void *aux)
3490 struct netdev_queue_stats stats;
3491 unsigned int handle, major, minor;
3494 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3499 major = tc_get_major(handle);
3500 minor = tc_get_minor(handle);
3501 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3502 (*cb)(minor - 1, &stats, aux);
/* Operations table for HFSC: kernel qdisc name "hfsc", OVS QoS type
 * "linux-hfsc", HFSC_N_QUEUES queues, and every operation slot filled with
 * the matching hfsc_* function. */
3507 static const struct tc_ops tc_ops_hfsc = {
3508 "hfsc", /* linux_name */
3509 "linux-hfsc", /* ovs_name */
3510 HFSC_N_QUEUES, /* n_queues */
3511 hfsc_tc_install, /* tc_install */
3512 hfsc_tc_load, /* tc_load */
3513 hfsc_tc_destroy, /* tc_destroy */
3514 hfsc_qdisc_get, /* qdisc_get */
3515 hfsc_qdisc_set, /* qdisc_set */
3516 hfsc_class_get, /* class_get */
3517 hfsc_class_set, /* class_set */
3518 hfsc_class_delete, /* class_delete */
3519 hfsc_class_get_stats, /* class_get_stats */
3520 hfsc_class_dump_stats /* class_dump_stats */
3523 /* "linux-default" traffic control class.
3525 * This class represents the default, unnamed Linux qdisc. It corresponds to
3526 * the "" (empty string) QoS type in the OVS database. */
/* Attaches the "default" tc to 'netdev''s device.  The tc object lives in a
 * function-level static pointer, so the same object is reused across calls
 * rather than being owned by one device. */
3529 default_install__(struct netdev *netdev)
3531 struct netdev_dev_linux *netdev_dev =
3532 netdev_dev_linux_cast(netdev_get_dev(netdev));
3533 static struct tc *tc;
3536 tc = xmalloc(sizeof *tc);
3537 tc_init(tc, &tc_ops_default);
3539 netdev_dev->tc = tc;
/* tc_install hook for the default qdisc: attaches the default tc to
 * 'netdev'.  'details' is unused. */
3543 default_tc_install(struct netdev *netdev,
3544 const struct shash *details OVS_UNUSED)
3546 default_install__(netdev);
/* tc_load hook for the default qdisc: attaches the default tc to 'netdev'.
 * 'nlmsg' is unused. */
3551 default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3553 default_install__(netdev);
/* Operations table for the default qdisc.  It has no kernel qdisc name, and
 * the destroy, qdisc and class operation slots shown here are all NULL. */
3557 static const struct tc_ops tc_ops_default = {
3558 NULL, /* linux_name */
3563 NULL, /* tc_destroy */
3564 NULL, /* qdisc_get */
3565 NULL, /* qdisc_set */
3566 NULL, /* class_get */
3567 NULL, /* class_set */
3568 NULL, /* class_delete */
3569 NULL, /* class_get_stats */
3570 NULL /* class_dump_stats */
3573 /* "linux-other" traffic control class.
/* tc_load hook for the "linux-other" class: attaches a tc initialized with
 * tc_ops_other to 'netdev''s device.  The tc object lives in a
 * function-level static pointer, so it is reused across calls.  'nlmsg' is
 * unused. */
3578 other_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3580 struct netdev_dev_linux *netdev_dev =
3581 netdev_dev_linux_cast(netdev_get_dev(netdev));
3582 static struct tc *tc;
3585 tc = xmalloc(sizeof *tc);
3586 tc_init(tc, &tc_ops_other);
3588 netdev_dev->tc = tc;
/* Operations table for "linux-other".  It has no kernel qdisc name and no
 * tc_install hook, and the destroy, qdisc and class operation slots are
 * all NULL. */
3592 static const struct tc_ops tc_ops_other = {
3593 NULL, /* linux_name */
3594 "linux-other", /* ovs_name */
3596 NULL, /* tc_install */
3598 NULL, /* tc_destroy */
3599 NULL, /* qdisc_get */
3600 NULL, /* qdisc_set */
3601 NULL, /* class_get */
3602 NULL, /* class_set */
3603 NULL, /* class_delete */
3604 NULL, /* class_get_stats */
3605 NULL /* class_dump_stats */
3608 /* Traffic control. */
/* Computed from the /proc/net/psched fields as a * c / b; used by
 * tc_ticks_to_bytes() and tc_bytes_to_ticks() to convert between ticks and
 * bytes at a given rate. */
3610 /* Number of kernel "tc" ticks per second. */
3611 static double ticks_per_s;
/* Used as the divisor in tc_buffer_per_jiffy(): bytes of buffering for a
 * rate are rate / buffer_hz. */
3613 /* Number of kernel "jiffies" per second. This is used for the purpose of
3614 * computing buffer sizes. Generally kernel qdiscs need to be able to buffer
3615 * one jiffy's worth of data.
3617 * There are two possibilities here:
3619 * - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
3620 * approximate range of 100 to 1024. That means that we really need to
3621 * make sure that the qdisc can buffer that much data.
3623 * - 'buffer_hz' is an absurdly large number. That means that the kernel
3624 * has finely granular timers and there's no need to fudge additional room
3625 * for buffers. (There's no extra effort needed to implement that: the
3626 * large 'buffer_hz' is used as a divisor, so practically any number will
3627 * come out as 0 in the division. Small integer results in the case of
3628 * really high dividends won't have any real effect anyhow.)
3630 static unsigned int buffer_hz;
3632 /* Returns tc handle 'major':'minor'. */
/* 'major' is shifted into the upper 16 bits before being combined with
 * 'minor' by TC_H_MAKE. */
3634 tc_make_handle(unsigned int major, unsigned int minor)
3636 return TC_H_MAKE(major << 16, minor);
3639 /* Returns the major number from 'handle'. */
/* The TC_H_MAJ part of 'handle' is shifted down by 16 bits, the inverse of
 * the shift applied in tc_make_handle(). */
3641 tc_get_major(unsigned int handle)
3643 return TC_H_MAJ(handle) >> 16;
3646 /* Returns the minor number from 'handle'. */
/* The minor number is the TC_H_MIN part of 'handle', used unshifted. */
3648 tc_get_minor(unsigned int handle)
3650 return TC_H_MIN(handle);
/* Starts a traffic-control netlink request of the given 'type' and 'flags'
 * (NLM_F_REQUEST is always added) for 'netdev' in 'request'.  'request' is
 * initialized here, a zeroed tcmsg is appended with the family and the
 * device's ifindex filled in, and a pointer to that tcmsg is returned so the
 * caller can set the handle and parent. */
3653 static struct tcmsg *
3654 tc_make_request(const struct netdev *netdev, int type, unsigned int flags,
3655 struct ofpbuf *request)
3657 struct tcmsg *tcmsg;
3661 error = get_ifindex(netdev, &ifindex);
3666 ofpbuf_init(request, 512);
3667 nl_msg_put_nlmsghdr(request, sizeof *tcmsg, type, NLM_F_REQUEST | flags);
3668 tcmsg = ofpbuf_put_zeros(request, sizeof *tcmsg);
3669 tcmsg->tcm_family = AF_UNSPEC;
3670 tcmsg->tcm_ifindex = ifindex;
3671 /* Caller should fill in tcmsg->tcm_handle. */
3672 /* Caller should fill in tcmsg->tcm_parent. */
/* Sends 'request' on rtnl_sock and stores any reply through 'replyp'
 * (callers in this file pass NULL when no reply is wanted).  'request' is
 * uninitialized here after the transaction, so the caller must not reuse
 * it. */
3678 tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
3680 int error = nl_sock_transact(rtnl_sock, request, replyp);
3681 ofpbuf_uninit(request);
/* The request uses handle ffff:0 with parent TC_H_INGRESS and kind
 * "ingress", with an empty TCA_OPTIONS payload.  Adding uses
 * NLM_F_EXCL | NLM_F_CREATE; deleting uses no extra flags, and the delete
 * path has a dedicated branch for ENOENT and EINVAL results. */
3685 /* Adds or deletes a root ingress qdisc on 'netdev'. We use this for
3686 * policing configuration.
3688 * This function is equivalent to running the following when 'add' is true:
3689 * /sbin/tc qdisc add dev <devname> handle ffff: ingress
3691 * This function is equivalent to running the following when 'add' is false:
3692 * /sbin/tc qdisc del dev <devname> handle ffff: ingress
3694 * The configuration and stats may be seen with the following command:
3695 * /sbin/tc -s qdisc show dev <devname>
3697 * Returns 0 if successful, otherwise a positive errno value.
3700 tc_add_del_ingress_qdisc(struct netdev *netdev, bool add)
3702 struct ofpbuf request;
3703 struct tcmsg *tcmsg;
3705 int type = add ? RTM_NEWQDISC : RTM_DELQDISC;
3706 int flags = add ? NLM_F_EXCL | NLM_F_CREATE : 0;
3708 tcmsg = tc_make_request(netdev, type, flags, &request);
3712 tcmsg->tcm_handle = tc_make_handle(0xffff, 0);
3713 tcmsg->tcm_parent = TC_H_INGRESS;
3714 nl_msg_put_string(&request, TCA_KIND, "ingress");
3715 nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0);
3717 error = tc_transact(&request, NULL);
3719 /* If we're deleting the qdisc, don't worry about some of the
3720 * error conditions. */
3721 if (!add && (error == ENOENT || error == EINVAL)) {
3730 /* Adds a policer to 'netdev' with a rate of 'kbits_rate' and a burst size
3733 * This function is equivalent to running:
3734 * /sbin/tc filter add dev <devname> parent ffff: protocol all prio 49
3735 * basic police rate <kbits_rate>kbit burst <kbits_burst>k
3738 * The configuration and stats may be seen with the following command:
3739 * /sbin/tc -s filter show <devname> eth0 parent ffff:
3741 * Returns 0 if successful, otherwise a positive errno value.
3744 tc_add_policer(struct netdev *netdev, int kbits_rate, int kbits_burst)
3746 struct tc_police tc_police;
3747 struct ofpbuf request;
3748 struct tcmsg *tcmsg;
3749 size_t basic_offset;
3750 size_t police_offset;
3754 memset(&tc_police, 0, sizeof tc_police);
3755 tc_police.action = TC_POLICE_SHOT;
3756 tc_police.mtu = mtu;
3757 tc_fill_rate(&tc_police.rate, kbits_rate/8 * 1000, mtu);
3758 tc_police.burst = tc_bytes_to_ticks(tc_police.rate.rate,
3759 kbits_burst * 1024);
3761 tcmsg = tc_make_request(netdev, RTM_NEWTFILTER,
3762 NLM_F_EXCL | NLM_F_CREATE, &request);
3766 tcmsg->tcm_parent = tc_make_handle(0xffff, 0);
3767 tcmsg->tcm_info = tc_make_handle(49,
3768 (OVS_FORCE uint16_t) htons(ETH_P_ALL));
3770 nl_msg_put_string(&request, TCA_KIND, "basic");
3771 basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3772 police_offset = nl_msg_start_nested(&request, TCA_BASIC_POLICE);
3773 nl_msg_put_unspec(&request, TCA_POLICE_TBF, &tc_police, sizeof tc_police);
3774 tc_put_rtab(&request, TCA_POLICE_RATE, &tc_police.rate);
3775 nl_msg_end_nested(&request, police_offset);
3776 nl_msg_end_nested(&request, basic_offset);
3778 error = tc_transact(&request, NULL);
/* Reads four hexadecimal fields a, b, c, d from /proc/net/psched and derives
 * 'ticks_per_s' as a * c / b.  Failure to open or read the file, and values
 * that fail the checks, are reported with VLOG_WARN.  The parsed values and
 * the derived 'ticks_per_s' and 'buffer_hz' are logged at debug level. */
3789 /* The values in psched are not individually very meaningful, but they are
3790 * important. The tables below show some values seen in the wild.
3794 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
3795 * (Before that, there are hints that it was 1000000000.)
3797 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
3801 * -----------------------------------
3802 * [1] 000c8000 000f4240 000f4240 00000064
3803 * [2] 000003e8 00000400 000f4240 3b9aca00
3804 * [3] 000003e8 00000400 000f4240 3b9aca00
3805 * [4] 000003e8 00000400 000f4240 00000064
3806 * [5] 000003e8 00000040 000f4240 3b9aca00
3807 * [6] 000003e8 00000040 000f4240 000000f9
3809 * a b c d ticks_per_s buffer_hz
3810 * ------- --------- ---------- ------------- ----------- -------------
3811 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
3812 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3813 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3814 * [4] 1,000 1,024 1,000,000 100 976,562 100
3815 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
3816 * [6] 1,000 64 1,000,000 249 15,625,000 249
3818 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
3819 * [2] 2.6.26-1-686-bigmem from Debian lenny
3820 * [3] 2.6.26-2-sparc64 from Debian lenny
3821 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
3822 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
3823 * [6] 2.6.34 from kernel.org on KVM
3825 static const char fn[] = "/proc/net/psched";
3826 unsigned int a, b, c, d;
3832 stream = fopen(fn, "r");
3834 VLOG_WARN("%s: open failed: %s", fn, strerror(errno));
3838 if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
3839 VLOG_WARN("%s: read failed", fn);
3843 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
3847 VLOG_WARN("%s: invalid scheduler parameters", fn);
3851 ticks_per_s = (double) a * c / b;
3855 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
3858 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);
3861 /* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
3862 * rate of 'rate' bytes per second.
 *
 * The product of 'rate' and 'ticks' is formed in 64 bits: both operands are
 * 32-bit, and at realistic link rates their product exceeds 32 bits and
 * would otherwise wrap before the division by 'ticks_per_s'. */
3864 tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
3869 return ((unsigned long long int) rate * ticks) / ticks_per_s;
3872 /* Returns the number of ticks that it would take to transmit 'size' bytes at a
3873 * rate of 'rate' bytes per second. */
/* A 'rate' of 0 yields 0 rather than dividing by zero.  'ticks_per_s' is
 * converted to an integer before the multiplication, so the arithmetic is
 * done in unsigned long long. */
3875 tc_bytes_to_ticks(unsigned int rate, unsigned int size)
3880 return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
3883 /* Returns the number of bytes that need to be reserved for qdisc buffering at
3884 * a transmission rate of 'rate' bytes per second. */
/* Integer division: the result is 0 whenever 'buffer_hz' exceeds 'rate'. */
3886 tc_buffer_per_jiffy(unsigned int rate)
3891 return rate / buffer_hz;
3894 /* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
3895 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
3896 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
3897 * stores NULL into it if it is absent.
3899 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
3902 * Returns 0 if successful, otherwise a positive errno value. */
3904 tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
3905 struct nlattr **options)
3907 static const struct nl_policy tca_policy[] = {
3908 [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
3909 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
3911 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3913 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3914 tca_policy, ta, ARRAY_SIZE(ta))) {
3915 VLOG_WARN_RL(&rl, "failed to parse qdisc message");
3920 *kind = nl_attr_get_string(ta[TCA_KIND]);
3924 *options = ta[TCA_OPTIONS];
3939 /* Given Netlink 'msg' that describes a class, extracts the queue ID (e.g. the
3940 * minor number of its class ID) into '*queue_id', its TCA_OPTIONS attribute
3941 * into '*options', and its queue statistics into '*stats'. Any of the output
3942 * arguments may be null.
3944 * Returns 0 if successful, otherwise a positive errno value. */
3946 tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
3947 struct nlattr **options, struct netdev_queue_stats *stats)
3949 static const struct nl_policy tca_policy[] = {
3950 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
3951 [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
3953 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3955 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3956 tca_policy, ta, ARRAY_SIZE(ta))) {
3957 VLOG_WARN_RL(&rl, "failed to parse class message");
3962 struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
3963 *handlep = tc->tcm_handle;
3967 *options = ta[TCA_OPTIONS];
3971 const struct gnet_stats_queue *gsq;
3972 struct gnet_stats_basic gsb;
3974 static const struct nl_policy stats_policy[] = {
3975 [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
3976 .min_len = sizeof gsb },
3977 [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
3978 .min_len = sizeof *gsq },
3980 struct nlattr *sa[ARRAY_SIZE(stats_policy)];
3982 if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
3983 sa, ARRAY_SIZE(sa))) {
3984 VLOG_WARN_RL(&rl, "failed to parse class stats");
3988 /* Alignment issues screw up the length of struct gnet_stats_basic on
3989 * some arch/bitsize combinations. Newer versions of Linux have a
3990 * struct gnet_stats_basic_packed, but we can't depend on that. The
3991 * easiest thing to do is just to make a copy. */
3992 memset(&gsb, 0, sizeof gsb);
3993 memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
3994 MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
3995 stats->tx_bytes = gsb.bytes;
3996 stats->tx_packets = gsb.packets;
3998 gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
3999 stats->tx_errors = gsq->drops;
4009 memset(stats, 0, sizeof *stats);
4014 /* Queries the kernel for class with identifier 'handle' and parent 'parent'
4017 tc_query_class(const struct netdev *netdev,
4018 unsigned int handle, unsigned int parent,
4019 struct ofpbuf **replyp)
4021 struct ofpbuf request;
4022 struct tcmsg *tcmsg;
4025 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO, &request);
4029 tcmsg->tcm_handle = handle;
4030 tcmsg->tcm_parent = parent;
4032 error = tc_transact(&request, replyp);
4034 VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
4035 netdev_get_name(netdev),
4036 tc_get_major(handle), tc_get_minor(handle),
4037 tc_get_major(parent), tc_get_minor(parent),
4043 /* Equivalent to "tc class del dev <name> handle <handle>". */
4045 tc_delete_class(const struct netdev *netdev, unsigned int handle)
4047 struct ofpbuf request;
4048 struct tcmsg *tcmsg;
4051 tcmsg = tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
4055 tcmsg->tcm_handle = handle;
4056 tcmsg->tcm_parent = 0;
4058 error = tc_transact(&request, NULL);
4060 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
4061 netdev_get_name(netdev),
4062 tc_get_major(handle), tc_get_minor(handle),
4068 /* Equivalent to "tc qdisc del dev <name> root". */
4070 tc_del_qdisc(struct netdev *netdev)
4072 struct netdev_dev_linux *netdev_dev =
4073 netdev_dev_linux_cast(netdev_get_dev(netdev));
4074 struct ofpbuf request;
4075 struct tcmsg *tcmsg;
4078 tcmsg = tc_make_request(netdev, RTM_DELQDISC, 0, &request);
4082 tcmsg->tcm_handle = tc_make_handle(1, 0);
4083 tcmsg->tcm_parent = TC_H_ROOT;
4085 error = tc_transact(&request, NULL);
4086 if (error == EINVAL) {
4087 /* EINVAL probably means that the default qdisc was in use, in which
4088 * case we've accomplished our purpose. */
4091 if (!error && netdev_dev->tc) {
4092 if (netdev_dev->tc->ops->tc_destroy) {
4093 netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
4095 netdev_dev->tc = NULL;
4100 /* If 'netdev''s qdisc type and parameters are not yet known, queries the
4101 * kernel to determine what they are. Returns 0 if successful, otherwise a
4102 * positive errno value. */
4104 tc_query_qdisc(const struct netdev *netdev)
4106 struct netdev_dev_linux *netdev_dev =
4107 netdev_dev_linux_cast(netdev_get_dev(netdev));
4108 struct ofpbuf request, *qdisc;
4109 const struct tc_ops *ops;
4110 struct tcmsg *tcmsg;
4114 if (netdev_dev->tc) {
4118 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
4119 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
4120 * 2.6.35 without that fix backported to it.
4122 * To avoid the OOPS, we must not make a request that would attempt to dump
4123 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
4124 * few others. There are a few ways that I can see to do this, but most of
4125 * them seem to be racy (and if you lose the race the kernel OOPSes). The
4126 * technique chosen here is to assume that any non-default qdisc that we
4127 * create will have a class with handle 1:0. The built-in qdiscs only have
4128 * a class with handle 0:0.
4130 * We could check for Linux 2.6.35+ and use a more straightforward method
4132 tcmsg = tc_make_request(netdev, RTM_GETQDISC, NLM_F_ECHO, &request);
4136 tcmsg->tcm_handle = tc_make_handle(1, 0);
4137 tcmsg->tcm_parent = 0;
4139 /* Figure out what tc class to instantiate. */
4140 error = tc_transact(&request, &qdisc);
4144 error = tc_parse_qdisc(qdisc, &kind, NULL);
4146 ops = &tc_ops_other;
4148 ops = tc_lookup_linux_name(kind);
4150 static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
4151 VLOG_INFO_RL(&rl2, "unknown qdisc \"%s\"", kind);
4153 ops = &tc_ops_other;
4156 } else if (error == ENOENT) {
4157 /* Either it's a built-in qdisc, or it's a qdisc set up by some
4158 * other entity that doesn't have a handle 1:0. We will assume
4159 * that it's the system default qdisc. */
4160 ops = &tc_ops_default;
4163 /* Who knows? Maybe the device got deleted. */
4164 VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
4165 netdev_get_name(netdev), strerror(error));
4166 ops = &tc_ops_other;
4169 /* Instantiate it. */
4170 load_error = ops->tc_load((struct netdev *) netdev, qdisc);
4171 assert((load_error == 0) == (netdev_dev->tc != NULL));
4172 ofpbuf_delete(qdisc);
4174 return error ? error : load_error;
4177 /* Linux traffic control uses tables with 256 entries ("rtab" tables) to
4178 approximate the time to transmit packets of various lengths. For an MTU of
4179 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
4180 represents two possible packet lengths; for a MTU of 513 through 1024, four
4181 possible lengths; and so on.
4183 Returns, for the specified 'mtu', the number of bits that packet lengths
4184 need to be shifted right to fit within such a 256-entry table. */
4186 tc_calc_cell_log(unsigned int mtu)
4191 mtu = ETH_PAYLOAD_MAX;
4193 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
4195 for (cell_log = 0; mtu >= 256; cell_log++) {
4202 /* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
4205 tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
4207 memset(rate, 0, sizeof *rate);
4208 rate->cell_log = tc_calc_cell_log(mtu);
4209 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
4210 /* rate->cell_align = 0; */ /* distro headers. */
4211 rate->mpu = ETH_TOTAL_MIN;
4215 /* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
4216 * attribute of the specified "type".
4218 * See tc_calc_cell_log() above for a description of "rtab"s. */
4220 tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
4225 rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
4226 for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
4227 unsigned packet_size = (i + 1) << rate->cell_log;
4228 if (packet_size < rate->mpu) {
4229 packet_size = rate->mpu;
4231 rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
/* Calculates the proper value of 'buffer' or 'cbuffer' in HTB options given a
 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
 * burst size of 'burst_bytes'.  (If no value was requested, a 'burst_bytes' of
 * 0 is fine.)
 *
 * The burst actually used is never less than one jiffy's worth of buffering
 * at 'Bps' plus one 'mtu'.  The result is expressed in ticks. */
static int
tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
{
    unsigned int min_burst = tc_buffer_per_jiffy(Bps) + mtu;
    uint64_t burst = burst_bytes > min_burst ? burst_bytes : min_burst;

    return tc_bytes_to_ticks(Bps, burst);
}
4246 /* Linux-only functions declared in netdev-linux.h */
4248 /* Returns a fd for an AF_INET socket or a negative errno value. */
4250 netdev_linux_get_af_inet_sock(void)
4252 int error = netdev_linux_init();
4253 return error ? -error : af_inet_sock;
4256 /* Modifies the 'flag' bit in ethtool's flags field for 'netdev'. If
4257 * 'enable' is true, the bit is set. Otherwise, it is cleared. */
4259 netdev_linux_ethtool_set_flag(struct netdev *netdev, uint32_t flag,
4260 const char *flag_name, bool enable)
4262 const char *netdev_name = netdev_get_name(netdev);
4263 struct ethtool_value evalue;
4267 memset(&evalue, 0, sizeof evalue);
4268 error = netdev_linux_do_ethtool(netdev_name,
4269 (struct ethtool_cmd *)&evalue,
4270 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
4275 evalue.data = new_flags = (evalue.data & ~flag) | (enable ? flag : 0);
4276 error = netdev_linux_do_ethtool(netdev_name,
4277 (struct ethtool_cmd *)&evalue,
4278 ETHTOOL_SFLAGS, "ETHTOOL_SFLAGS");
4283 memset(&evalue, 0, sizeof evalue);
4284 error = netdev_linux_do_ethtool(netdev_name,
4285 (struct ethtool_cmd *)&evalue,
4286 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
4291 if (new_flags != evalue.data) {
4292 VLOG_WARN_RL(&rl, "attempt to %s ethtool %s flag on network "
4293 "device %s failed", enable ? "enable" : "disable",
4294 flag_name, netdev_name);
4301 /* Utility functions. */
4303 /* Copies 'src' into 'dst', performing format conversion in the process. */
4305 netdev_stats_from_rtnl_link_stats(struct netdev_stats *dst,
4306 const struct rtnl_link_stats *src)
4308 dst->rx_packets = src->rx_packets;
4309 dst->tx_packets = src->tx_packets;
4310 dst->rx_bytes = src->rx_bytes;
4311 dst->tx_bytes = src->tx_bytes;
4312 dst->rx_errors = src->rx_errors;
4313 dst->tx_errors = src->tx_errors;
4314 dst->rx_dropped = src->rx_dropped;
4315 dst->tx_dropped = src->tx_dropped;
4316 dst->multicast = src->multicast;
4317 dst->collisions = src->collisions;
4318 dst->rx_length_errors = src->rx_length_errors;
4319 dst->rx_over_errors = src->rx_over_errors;
4320 dst->rx_crc_errors = src->rx_crc_errors;
4321 dst->rx_frame_errors = src->rx_frame_errors;
4322 dst->rx_fifo_errors = src->rx_fifo_errors;
4323 dst->rx_missed_errors = src->rx_missed_errors;
4324 dst->tx_aborted_errors = src->tx_aborted_errors;
4325 dst->tx_carrier_errors = src->tx_carrier_errors;
4326 dst->tx_fifo_errors = src->tx_fifo_errors;
4327 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
4328 dst->tx_window_errors = src->tx_window_errors;
4332 get_stats_via_netlink(int ifindex, struct netdev_stats *stats)
4334 /* Policy for RTNLGRP_LINK messages.
4336 * There are *many* more fields in these messages, but currently we only
4337 * care about these fields. */
4338 static const struct nl_policy rtnlgrp_link_policy[] = {
4339 [IFLA_IFNAME] = { .type = NL_A_STRING, .optional = false },
4340 [IFLA_STATS] = { .type = NL_A_UNSPEC, .optional = true,
4341 .min_len = sizeof(struct rtnl_link_stats) },
4344 struct ofpbuf request;
4345 struct ofpbuf *reply;
4346 struct ifinfomsg *ifi;
4347 struct nlattr *attrs[ARRAY_SIZE(rtnlgrp_link_policy)];
4350 ofpbuf_init(&request, 0);
4351 nl_msg_put_nlmsghdr(&request, sizeof *ifi, RTM_GETLINK, NLM_F_REQUEST);
4352 ifi = ofpbuf_put_zeros(&request, sizeof *ifi);
4353 ifi->ifi_family = PF_UNSPEC;
4354 ifi->ifi_index = ifindex;
4355 error = nl_sock_transact(rtnl_sock, &request, &reply);
4356 ofpbuf_uninit(&request);
4361 if (!nl_policy_parse(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg),
4362 rtnlgrp_link_policy,
4363 attrs, ARRAY_SIZE(rtnlgrp_link_policy))) {
4364 ofpbuf_delete(reply);
4368 if (!attrs[IFLA_STATS]) {
4369 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
4370 ofpbuf_delete(reply);
4374 netdev_stats_from_rtnl_link_stats(stats, nl_attr_get(attrs[IFLA_STATS]));
4376 ofpbuf_delete(reply);
4382 get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats)
4384 static const char fn[] = "/proc/net/dev";
4389 stream = fopen(fn, "r");
4391 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
4396 while (fgets(line, sizeof line, stream)) {
4399 #define X64 "%"SCNu64
4402 X64 X64 X64 X64 X64 X64 X64 "%*u"
4403 X64 X64 X64 X64 X64 X64 X64 "%*u",
4409 &stats->rx_fifo_errors,
4410 &stats->rx_frame_errors,
4416 &stats->tx_fifo_errors,
4418 &stats->tx_carrier_errors) != 15) {
4419 VLOG_WARN_RL(&rl, "%s:%d: parse error", fn, ln);
4420 } else if (!strcmp(devname, netdev_name)) {
4421 stats->rx_length_errors = UINT64_MAX;
4422 stats->rx_over_errors = UINT64_MAX;
4423 stats->rx_crc_errors = UINT64_MAX;
4424 stats->rx_missed_errors = UINT64_MAX;
4425 stats->tx_aborted_errors = UINT64_MAX;
4426 stats->tx_heartbeat_errors = UINT64_MAX;
4427 stats->tx_window_errors = UINT64_MAX;
4433 VLOG_WARN_RL(&rl, "%s: no stats for %s", fn, netdev_name);
4439 get_flags(const struct netdev_dev *dev, unsigned int *flags)
4445 error = netdev_linux_do_ioctl(dev->name, &ifr, SIOCGIFFLAGS,
4448 *flags = ifr.ifr_flags;
4454 set_flags(struct netdev *netdev, unsigned int flags)
4458 ifr.ifr_flags = flags;
4459 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, SIOCSIFFLAGS,
4464 do_get_ifindex(const char *netdev_name)
4468 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4469 COVERAGE_INC(netdev_get_ifindex);
4470 if (ioctl(af_inet_sock, SIOCGIFINDEX, &ifr) < 0) {
4471 VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
4472 netdev_name, strerror(errno));
4475 return ifr.ifr_ifindex;
4479 get_ifindex(const struct netdev *netdev_, int *ifindexp)
4481 struct netdev_dev_linux *netdev_dev =
4482 netdev_dev_linux_cast(netdev_get_dev(netdev_));
4484 if (!(netdev_dev->cache_valid & VALID_IFINDEX)) {
4485 int ifindex = do_get_ifindex(netdev_get_name(netdev_));
4488 netdev_dev->get_ifindex_error = -ifindex;
4489 netdev_dev->ifindex = 0;
4491 netdev_dev->get_ifindex_error = 0;
4492 netdev_dev->ifindex = ifindex;
4494 netdev_dev->cache_valid |= VALID_IFINDEX;
4497 *ifindexp = netdev_dev->ifindex;
4498 return netdev_dev->get_ifindex_error;
4502 get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN])
4507 memset(&ifr, 0, sizeof ifr);
4508 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4509 COVERAGE_INC(netdev_get_hwaddr);
4510 if (ioctl(af_inet_sock, SIOCGIFHWADDR, &ifr) < 0) {
4511 /* ENODEV probably means that a vif disappeared asynchronously and
4512 * hasn't been removed from the database yet, so reduce the log level
4513 * to INFO for that case. */
4514 VLOG(errno == ENODEV ? VLL_INFO : VLL_ERR,
4515 "ioctl(SIOCGIFHWADDR) on %s device failed: %s",
4516 netdev_name, strerror(errno));
4519 hwaddr_family = ifr.ifr_hwaddr.sa_family;
4520 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER) {
4521 VLOG_WARN("%s device has unknown hardware address family %d",
4522 netdev_name, hwaddr_family);
4524 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
4529 set_etheraddr(const char *netdev_name,
4530 const uint8_t mac[ETH_ADDR_LEN])
4534 memset(&ifr, 0, sizeof ifr);
4535 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4536 ifr.ifr_hwaddr.sa_family = ARPHRD_ETHER;
4537 memcpy(ifr.ifr_hwaddr.sa_data, mac, ETH_ADDR_LEN);
4538 COVERAGE_INC(netdev_set_hwaddr);
4539 if (ioctl(af_inet_sock, SIOCSIFHWADDR, &ifr) < 0) {
4540 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
4541 netdev_name, strerror(errno));
4548 netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
4549 int cmd, const char *cmd_name)
4553 memset(&ifr, 0, sizeof ifr);
4554 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
4555 ifr.ifr_data = (caddr_t) ecmd;
4558 COVERAGE_INC(netdev_ethtool);
4559 if (ioctl(af_inet_sock, SIOCETHTOOL, &ifr) == 0) {
4562 if (errno != EOPNOTSUPP) {
4563 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
4564 "failed: %s", cmd_name, name, strerror(errno));
4566 /* The device doesn't support this operation. That's pretty
4567 * common, so there's no point in logging anything. */
4574 netdev_linux_do_ioctl(const char *name, struct ifreq *ifr, int cmd,
4575 const char *cmd_name)
4577 ovs_strzcpy(ifr->ifr_name, name, sizeof ifr->ifr_name);
4578 if (ioctl(af_inet_sock, cmd, ifr) == -1) {
4579 VLOG_DBG_RL(&rl, "%s: ioctl(%s) failed: %s", name, cmd_name,
4587 netdev_linux_get_ipv4(const struct netdev *netdev, struct in_addr *ip,
4588 int cmd, const char *cmd_name)
4593 ifr.ifr_addr.sa_family = AF_INET;
4594 error = netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, cmd, cmd_name);
4596 const struct sockaddr_in *sin = (struct sockaddr_in *) &ifr.ifr_addr;
4597 *ip = sin->sin_addr;
4602 /* Returns an AF_PACKET raw socket or a negative errno value. */
4604 af_packet_sock(void)
4606 static int sock = INT_MIN;
4608 if (sock == INT_MIN) {
4609 sock = socket(AF_PACKET, SOCK_RAW, 0);
4611 set_nonblocking(sock);
4614 VLOG_ERR("failed to create packet socket: %s", strerror(errno));