2 * Copyright (c) 2009, 2010, 2011, 2012 Nicira Networks.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
19 #include "netdev-linux.h"
24 #include <arpa/inet.h>
26 #include <linux/gen_stats.h>
27 #include <linux/if_ether.h>
28 #include <linux/if_tun.h>
29 #include <linux/types.h>
30 #include <linux/ethtool.h>
31 #include <linux/mii.h>
32 #include <linux/pkt_cls.h>
33 #include <linux/pkt_sched.h>
34 #include <linux/rtnetlink.h>
35 #include <linux/sockios.h>
36 #include <linux/version.h>
37 #include <sys/types.h>
38 #include <sys/ioctl.h>
39 #include <sys/socket.h>
40 #include <netpacket/packet.h>
42 #include <net/if_arp.h>
43 #include <net/if_packet.h>
44 #include <net/route.h>
45 #include <netinet/in.h>
52 #include "dpif-linux.h"
53 #include "dynamic-string.h"
54 #include "fatal-signal.h"
57 #include "netdev-provider.h"
58 #include "netdev-vport.h"
60 #include "netlink-notifier.h"
61 #include "netlink-socket.h"
63 #include "openflow/openflow.h"
65 #include "poll-loop.h"
66 #include "rtnetlink-link.h"
67 #include "socket-util.h"
74 VLOG_DEFINE_THIS_MODULE(netdev_linux);
76 COVERAGE_DEFINE(netdev_set_policing);
77 COVERAGE_DEFINE(netdev_arp_lookup);
78 COVERAGE_DEFINE(netdev_get_ifindex);
79 COVERAGE_DEFINE(netdev_get_hwaddr);
80 COVERAGE_DEFINE(netdev_set_hwaddr);
81 COVERAGE_DEFINE(netdev_ethtool);
84 /* These were introduced in Linux 2.6.14, so old headers might lack them. */
86 #ifndef ADVERTISED_Pause
87 #define ADVERTISED_Pause (1 << 13)
89 #ifndef ADVERTISED_Asym_Pause
90 #define ADVERTISED_Asym_Pause (1 << 14)
93 /* These were introduced in Linux 2.6.24, so they might be missing if we
94 * have old headers. */
95 #ifndef ETHTOOL_GFLAGS
96 #define ETHTOOL_GFLAGS 0x00000025 /* Get flags bitmap(ethtool_value) */
98 #ifndef ETHTOOL_SFLAGS
99 #define ETHTOOL_SFLAGS 0x00000026 /* Set flags bitmap(ethtool_value) */
102 /* This was introduced in Linux 2.6.25, so old headers might lack it. */
105 #define TC_RTAB_SIZE 1024
108 static struct nln_notifier *netdev_linux_cache_notifier = NULL;
109 static int cache_notifier_refcount;
112 VALID_IFINDEX = 1 << 0,
113 VALID_ETHERADDR = 1 << 1,
117 VALID_POLICING = 1 << 5,
118 VALID_VPORT_STAT_ERROR = 1 << 6,
119 VALID_DRVINFO = 1 << 7,
120 VALID_FEATURES = 1 << 8,
128 /* Traffic control. */
130 /* An instance of a traffic control class. Always associated with a particular
133 * Each TC implementation subclasses this with whatever additional data it
136 const struct tc_ops *ops;
137 struct hmap queues; /* Contains "struct tc_queue"s.
138 * Read by generic TC layer.
139 * Written only by TC implementation. */
142 /* One traffic control queue.
144 * Each TC implementation subclasses this with whatever additional data it
147 struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
148 unsigned int queue_id; /* OpenFlow queue ID. */
151 /* A particular kind of traffic control. Each implementation generally maps to
152 * one particular Linux qdisc class.
154 * The functions below return 0 if successful or a positive errno value on
155 * failure, except where otherwise noted. All of them must be provided, except
156 * where otherwise noted. */
158 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
159 * This is null for tc_ops_default and tc_ops_other, for which there are no
160 * appropriate values. */
161 const char *linux_name;
163 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
164 const char *ovs_name;
166 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
167 * queues. The queues are numbered 0 through n_queues - 1. */
168 unsigned int n_queues;
170 /* Called to install this TC class on 'netdev'. The implementation should
171 * make the Netlink calls required to set up 'netdev' with the right qdisc
172 * and configure it according to 'details'. The implementation may assume
173 * that the current qdisc is the default; that is, there is no need for it
174 * to delete the current qdisc before installing itself.
176 * The contents of 'details' should be documented as valid for 'ovs_name'
177 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
178 * (which is built as ovs-vswitchd.conf.db(8)).
180 * This function must return 0 if and only if it sets 'netdev->tc' to an
181 * initialized 'struct tc'.
183 * (This function is null for tc_ops_other, which cannot be installed. For
184 * other TC classes it should always be nonnull.) */
185 int (*tc_install)(struct netdev *netdev, const struct shash *details);
187 /* Called when the netdev code determines (through a Netlink query) that
188 * this TC class's qdisc is installed on 'netdev', but we didn't install
189 * it ourselves and so don't know any of the details.
191 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
192 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
193 * implementation should parse the other attributes of 'nlmsg' as
194 * necessary to determine its configuration. If necessary it should also
195 * use Netlink queries to determine the configuration of queues on
198 * This function must return 0 if and only if it sets 'netdev->tc' to an
199 * initialized 'struct tc'. */
200 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
202 /* Destroys the data structures allocated by the implementation as part of
203 * 'tc'. (This includes destroying 'tc->queues' by calling
206 * The implementation should not need to perform any Netlink calls. If
207 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
208 * (But it may not be desirable.)
210 * This function may be null if 'tc' is trivial. */
211 void (*tc_destroy)(struct tc *tc);
213 /* Retrieves details of 'netdev->tc' configuration into 'details'.
215 * The implementation should not need to perform any Netlink calls, because
216 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
217 * cached the configuration.
219 * The contents of 'details' should be documented as valid for 'ovs_name'
220 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
221 * (which is built as ovs-vswitchd.conf.db(8)).
223 * This function may be null if 'tc' is not configurable.
225 int (*qdisc_get)(const struct netdev *netdev, struct shash *details);
227 /* Reconfigures 'netdev->tc' according to 'details', performing any
228 * required Netlink calls to complete the reconfiguration.
230 * The contents of 'details' should be documented as valid for 'ovs_name'
231 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
232 * (which is built as ovs-vswitchd.conf.db(8)).
234 * This function may be null if 'tc' is not configurable.
236 int (*qdisc_set)(struct netdev *, const struct shash *details);
238 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
239 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
241 * The contents of 'details' should be documented as valid for 'ovs_name'
242 * in the "other_config" column in the "Queue" table in
243 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
245 * The implementation should not need to perform any Netlink calls, because
246 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
247 * cached the queue configuration.
249 * This function may be null if 'tc' does not have queues ('n_queues' is
251 int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
252 struct shash *details);
254 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
255 * 'details', performing any required Netlink calls to complete the
256 * reconfiguration. The caller ensures that 'queue_id' is less than
259 * The contents of 'details' should be documented as valid for 'ovs_name'
260 * in the "other_config" column in the "Queue" table in
261 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
263 * This function may be null if 'tc' does not have queues or its queues are
264 * not configurable. */
265 int (*class_set)(struct netdev *, unsigned int queue_id,
266 const struct shash *details);
268 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
269 * tc_queue's within 'netdev->tc->queues'.
271 * This function may be null if 'tc' does not have queues or its queues
272 * cannot be deleted. */
273 int (*class_delete)(struct netdev *, struct tc_queue *queue);
275 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
276 * 'struct tc_queue's within 'netdev->tc->queues'.
278 * On success, initializes '*stats'.
280 * This function may be null if 'tc' does not have queues or if it cannot
281 * report queue statistics. */
282 int (*class_get_stats)(const struct netdev *netdev,
283 const struct tc_queue *queue,
284 struct netdev_queue_stats *stats);
286 /* Extracts queue stats from 'nlmsg', which is a response to a
287 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
289 * This function may be null if 'tc' does not have queues or if it cannot
290 * report queue statistics. */
291 int (*class_dump_stats)(const struct netdev *netdev,
292 const struct ofpbuf *nlmsg,
293 netdev_dump_queue_stats_cb *cb, void *aux);
297 tc_init(struct tc *tc, const struct tc_ops *ops)
300 hmap_init(&tc->queues);
304 tc_destroy(struct tc *tc)
306 hmap_destroy(&tc->queues);
309 static const struct tc_ops tc_ops_htb;
310 static const struct tc_ops tc_ops_hfsc;
311 static const struct tc_ops tc_ops_default;
312 static const struct tc_ops tc_ops_other;
314 static const struct tc_ops *tcs[] = {
315 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
316 &tc_ops_hfsc, /* Hierarchical fair service curve. */
317 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
318 &tc_ops_other, /* Some other qdisc. */
322 static unsigned int tc_make_handle(unsigned int major, unsigned int minor);
323 static unsigned int tc_get_major(unsigned int handle);
324 static unsigned int tc_get_minor(unsigned int handle);
326 static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
327 static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
328 static unsigned int tc_buffer_per_jiffy(unsigned int rate);
330 static struct tcmsg *tc_make_request(const struct netdev *, int type,
331 unsigned int flags, struct ofpbuf *);
332 static int tc_transact(struct ofpbuf *request, struct ofpbuf **replyp);
333 static int tc_add_del_ingress_qdisc(struct netdev *netdev, bool add);
334 static int tc_add_policer(struct netdev *netdev, int kbits_rate,
337 static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
338 struct nlattr **options);
339 static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
340 struct nlattr **options,
341 struct netdev_queue_stats *);
342 static int tc_query_class(const struct netdev *,
343 unsigned int handle, unsigned int parent,
344 struct ofpbuf **replyp);
345 static int tc_delete_class(const struct netdev *, unsigned int handle);
347 static int tc_del_qdisc(struct netdev *netdev);
348 static int tc_query_qdisc(const struct netdev *netdev);
350 static int tc_calc_cell_log(unsigned int mtu);
351 static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
352 static void tc_put_rtab(struct ofpbuf *, uint16_t type,
353 const struct tc_ratespec *rate);
354 static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
356 struct netdev_dev_linux {
357 struct netdev_dev netdev_dev;
359 struct shash_node *shash_node;
360 unsigned int cache_valid;
361 unsigned int change_seq;
363 bool miimon; /* Link status of last poll. */
364 long long int miimon_interval; /* Miimon Poll rate. Disabled if <= 0. */
365 struct timer miimon_timer;
367 /* The following are figured out "on demand" only. They are only valid
368 * when the corresponding VALID_* bit in 'cache_valid' is set. */
370 uint8_t etheraddr[ETH_ADDR_LEN];
371 struct in_addr address, netmask;
374 unsigned int ifi_flags;
375 long long int carrier_resets;
376 uint32_t kbits_rate; /* Policing data. */
377 uint32_t kbits_burst;
378 int vport_stats_error; /* Cached error code from vport_get_stats().
379 0 or an errno value. */
380 int netdev_mtu_error; /* Cached error code from SIOCGIFMTU or SIOCSIFMTU. */
381 int ether_addr_error; /* Cached error code from set/get etheraddr. */
382 int netdev_policing_error; /* Cached error code from set policing. */
383 int get_features_error; /* Cached error code from ETHTOOL_GSET. */
384 int get_ifindex_error; /* Cached error code from SIOCGIFINDEX. */
386 enum netdev_features current; /* Cached from ETHTOOL_GSET. */
387 enum netdev_features advertised; /* Cached from ETHTOOL_GSET. */
388 enum netdev_features supported; /* Cached from ETHTOOL_GSET. */
389 enum netdev_features peer; /* Cached from ETHTOOL_GSET. */
391 struct ethtool_drvinfo drvinfo; /* Cached from ETHTOOL_GDRVINFO. */
395 struct tap_state tap;
399 struct netdev_linux {
400 struct netdev netdev;
404 /* Sockets used for ioctl operations. */
405 static int af_inet_sock = -1; /* AF_INET, SOCK_DGRAM. */
407 /* A Netlink routing socket that is not subscribed to any multicast groups. */
408 static struct nl_sock *rtnl_sock;
410 /* This is set pretty low because we probably won't learn anything from the
411 * additional log messages. */
412 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
414 static int netdev_linux_init(void);
416 static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
417 int cmd, const char *cmd_name);
418 static int netdev_linux_do_ioctl(const char *name, struct ifreq *, int cmd,
419 const char *cmd_name);
420 static int netdev_linux_get_ipv4(const struct netdev *, struct in_addr *,
421 int cmd, const char *cmd_name);
422 static int get_flags(const struct netdev_dev *, unsigned int *flags);
423 static int set_flags(struct netdev *, unsigned int flags);
424 static int do_get_ifindex(const char *netdev_name);
425 static int get_ifindex(const struct netdev *, int *ifindexp);
426 static int do_set_addr(struct netdev *netdev,
427 int ioctl_nr, const char *ioctl_name,
428 struct in_addr addr);
429 static int get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN]);
430 static int set_etheraddr(const char *netdev_name, const uint8_t[ETH_ADDR_LEN]);
431 static int get_stats_via_netlink(int ifindex, struct netdev_stats *stats);
432 static int get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats);
433 static int af_packet_sock(void);
434 static void netdev_linux_miimon_run(void);
435 static void netdev_linux_miimon_wait(void);
438 is_netdev_linux_class(const struct netdev_class *netdev_class)
440 return netdev_class->init == netdev_linux_init;
443 static struct netdev_dev_linux *
444 netdev_dev_linux_cast(const struct netdev_dev *netdev_dev)
446 const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
447 assert(is_netdev_linux_class(netdev_class));
449 return CONTAINER_OF(netdev_dev, struct netdev_dev_linux, netdev_dev);
452 static struct netdev_linux *
453 netdev_linux_cast(const struct netdev *netdev)
455 struct netdev_dev *netdev_dev = netdev_get_dev(netdev);
456 const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
457 assert(is_netdev_linux_class(netdev_class));
459 return CONTAINER_OF(netdev, struct netdev_linux, netdev);
463 netdev_linux_init(void)
465 static int status = -1;
467 /* Create AF_INET socket. */
468 af_inet_sock = socket(AF_INET, SOCK_DGRAM, 0);
469 status = af_inet_sock >= 0 ? 0 : errno;
471 VLOG_ERR("failed to create inet socket: %s", strerror(status));
474 /* Create rtnetlink socket. */
476 status = nl_sock_create(NETLINK_ROUTE, &rtnl_sock);
478 VLOG_ERR_RL(&rl, "failed to create rtnetlink socket: %s",
487 netdev_linux_run(void)
489 rtnetlink_link_run();
490 netdev_linux_miimon_run();
494 netdev_linux_wait(void)
496 rtnetlink_link_wait();
497 netdev_linux_miimon_wait();
501 netdev_linux_get_drvinfo(struct netdev_dev_linux *netdev_dev)
506 if (netdev_dev->cache_valid & VALID_DRVINFO) {
510 memset(&netdev_dev->drvinfo, 0, sizeof netdev_dev->drvinfo);
511 error = netdev_linux_do_ethtool(netdev_dev->netdev_dev.name,
512 (struct ethtool_cmd *)&netdev_dev->drvinfo,
516 netdev_dev->cache_valid |= VALID_DRVINFO;
522 netdev_dev_linux_changed(struct netdev_dev_linux *dev,
523 unsigned int ifi_flags,
527 if (!dev->change_seq) {
531 if ((dev->ifi_flags ^ ifi_flags) & IFF_RUNNING) {
532 dev->carrier_resets++;
534 dev->ifi_flags = ifi_flags;
536 dev->cache_valid &= mask;
540 netdev_dev_linux_update(struct netdev_dev_linux *dev,
541 const struct rtnetlink_link_change *change)
543 if (change->nlmsg_type == RTM_NEWLINK) {
545 netdev_dev_linux_changed(dev, change->ifi_flags, VALID_DRVINFO);
547 /* Update netdev from rtnl-change msg. */
549 dev->mtu = change->mtu;
550 dev->cache_valid |= VALID_MTU;
551 dev->netdev_mtu_error = 0;
554 if (!eth_addr_is_zero(change->addr)) {
555 memcpy(dev->etheraddr, change->addr, ETH_ADDR_LEN);
556 dev->cache_valid |= VALID_ETHERADDR;
557 dev->ether_addr_error = 0;
560 dev->ifindex = change->ifi_index;
561 dev->cache_valid |= VALID_IFINDEX;
562 dev->get_ifindex_error = 0;
565 netdev_dev_linux_changed(dev, change->ifi_flags, 0);
570 netdev_linux_cache_cb(const struct rtnetlink_link_change *change,
571 void *aux OVS_UNUSED)
573 struct netdev_dev_linux *dev;
575 struct netdev_dev *base_dev = netdev_dev_from_name(change->ifname);
577 const struct netdev_class *netdev_class =
578 netdev_dev_get_class(base_dev);
580 if (is_netdev_linux_class(netdev_class)) {
581 dev = netdev_dev_linux_cast(base_dev);
582 netdev_dev_linux_update(dev, change);
586 struct shash device_shash;
587 struct shash_node *node;
589 shash_init(&device_shash);
590 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
591 SHASH_FOR_EACH (node, &device_shash) {
596 get_flags(&dev->netdev_dev, &flags);
597 netdev_dev_linux_changed(dev, flags, 0);
599 shash_destroy(&device_shash);
604 cache_notifier_ref(void)
606 if (!cache_notifier_refcount) {
607 assert(!netdev_linux_cache_notifier);
609 netdev_linux_cache_notifier =
610 rtnetlink_link_notifier_create(netdev_linux_cache_cb, NULL);
612 if (!netdev_linux_cache_notifier) {
616 cache_notifier_refcount++;
622 cache_notifier_unref(void)
624 assert(cache_notifier_refcount > 0);
625 if (!--cache_notifier_refcount) {
626 assert(netdev_linux_cache_notifier);
627 rtnetlink_link_notifier_destroy(netdev_linux_cache_notifier);
628 netdev_linux_cache_notifier = NULL;
632 /* Creates system and internal devices. */
634 netdev_linux_create(const struct netdev_class *class, const char *name,
635 struct netdev_dev **netdev_devp)
637 struct netdev_dev_linux *netdev_dev;
640 error = cache_notifier_ref();
645 netdev_dev = xzalloc(sizeof *netdev_dev);
646 netdev_dev->change_seq = 1;
647 netdev_dev_init(&netdev_dev->netdev_dev, name, class);
648 get_flags(&netdev_dev->netdev_dev, &netdev_dev->ifi_flags);
650 *netdev_devp = &netdev_dev->netdev_dev;
654 /* For most types of netdevs we open the device for each call of
655 * netdev_open(). However, this is not the case with tap devices,
656 * since it is only possible to open the device once. In this
657 * situation we share a single file descriptor, and consequently
658 * buffers, across all readers. Therefore once data is read it will
659 * be unavailable to other reads for tap devices. */
661 netdev_linux_create_tap(const struct netdev_class *class OVS_UNUSED,
662 const char *name, struct netdev_dev **netdev_devp)
664 struct netdev_dev_linux *netdev_dev;
665 struct tap_state *state;
666 static const char tap_dev[] = "/dev/net/tun";
670 netdev_dev = xzalloc(sizeof *netdev_dev);
671 state = &netdev_dev->state.tap;
673 error = cache_notifier_ref();
678 /* Open tap device. */
679 state->fd = open(tap_dev, O_RDWR);
682 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, strerror(error));
683 goto error_unref_notifier;
686 /* Create tap device. */
687 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
688 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
689 if (ioctl(state->fd, TUNSETIFF, &ifr) == -1) {
690 VLOG_WARN("%s: creating tap device failed: %s", name,
693 goto error_unref_notifier;
696 /* Make non-blocking. */
697 error = set_nonblocking(state->fd);
699 goto error_unref_notifier;
702 netdev_dev_init(&netdev_dev->netdev_dev, name, &netdev_tap_class);
703 *netdev_devp = &netdev_dev->netdev_dev;
706 error_unref_notifier:
707 cache_notifier_unref();
714 destroy_tap(struct netdev_dev_linux *netdev_dev)
716 struct tap_state *state = &netdev_dev->state.tap;
718 if (state->fd >= 0) {
723 /* Destroys the netdev device 'netdev_dev_'. */
725 netdev_linux_destroy(struct netdev_dev *netdev_dev_)
727 struct netdev_dev_linux *netdev_dev = netdev_dev_linux_cast(netdev_dev_);
728 const struct netdev_class *class = netdev_dev_get_class(netdev_dev_);
730 if (netdev_dev->tc && netdev_dev->tc->ops->tc_destroy) {
731 netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
734 if (class == &netdev_tap_class) {
735 destroy_tap(netdev_dev);
739 cache_notifier_unref();
743 netdev_linux_open(struct netdev_dev *netdev_dev_, struct netdev **netdevp)
745 struct netdev_dev_linux *netdev_dev = netdev_dev_linux_cast(netdev_dev_);
746 struct netdev_linux *netdev;
747 enum netdev_flags flags;
750 /* Allocate network device. */
751 netdev = xzalloc(sizeof *netdev);
753 netdev_init(&netdev->netdev, netdev_dev_);
755 /* Verify that the device really exists, by attempting to read its flags.
756 * (The flags might be cached, in which case this won't actually do an
759 * Don't do this for "internal" netdevs, though, because those have to be
760 * created as netdev objects before they exist in the kernel, because
761 * creating them in the kernel happens by passing a netdev object to
762 * dpif_port_add(). */
763 if (netdev_dev_get_class(netdev_dev_) != &netdev_internal_class) {
764 error = netdev_get_flags(&netdev->netdev, &flags);
765 if (error == ENODEV) {
770 if (!strncmp(netdev_dev_get_type(netdev_dev_), "tap", 3) &&
771 !netdev_dev->state.tap.opened) {
773 /* We assume that the first user of the tap device is the primary user
774 * and give them the tap FD. Subsequent users probably just expect
775 * this to be a system device so open it normally to avoid send/receive
776 * directions appearing to be reversed. */
777 netdev->fd = netdev_dev->state.tap.fd;
778 netdev_dev->state.tap.opened = true;
781 *netdevp = &netdev->netdev;
785 netdev_uninit(&netdev->netdev, true);
789 /* Closes and destroys 'netdev'. */
791 netdev_linux_close(struct netdev *netdev_)
793 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
795 if (netdev->fd > 0 && strncmp(netdev_get_type(netdev_), "tap", 3)) {
802 netdev_linux_listen(struct netdev *netdev_)
804 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
805 struct sockaddr_ll sll;
810 if (netdev->fd >= 0) {
814 /* Create file descriptor. */
815 fd = socket(PF_PACKET, SOCK_RAW, 0);
818 VLOG_ERR("failed to create raw socket (%s)", strerror(error));
822 /* Set non-blocking mode. */
823 error = set_nonblocking(fd);
828 /* Get ethernet device index. */
829 error = get_ifindex(&netdev->netdev, &ifindex);
834 /* Bind to specific ethernet device. */
835 memset(&sll, 0, sizeof sll);
836 sll.sll_family = AF_PACKET;
837 sll.sll_ifindex = ifindex;
838 sll.sll_protocol = (OVS_FORCE unsigned short int) htons(ETH_P_ALL);
839 if (bind(fd, (struct sockaddr *) &sll, sizeof sll) < 0) {
841 VLOG_ERR("%s: failed to bind raw socket (%s)",
842 netdev_get_name(netdev_), strerror(error));
857 netdev_linux_recv(struct netdev *netdev_, void *data, size_t size)
859 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
861 if (netdev->fd < 0) {
862 /* Device is not listening. */
869 retval = (netdev_->netdev_dev->netdev_class == &netdev_tap_class
870 ? read(netdev->fd, data, size)
871 : recv(netdev->fd, data, size, MSG_TRUNC));
873 return retval <= size ? retval : -EMSGSIZE;
874 } else if (errno != EINTR) {
875 if (errno != EAGAIN) {
876 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
877 strerror(errno), netdev_get_name(netdev_));
884 /* Registers with the poll loop to wake up from the next call to poll_block()
885 * when a packet is ready to be received with netdev_recv() on 'netdev'. */
887 netdev_linux_recv_wait(struct netdev *netdev_)
889 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
890 if (netdev->fd >= 0) {
891 poll_fd_wait(netdev->fd, POLLIN);
895 /* Discards all packets waiting to be received from 'netdev'. */
897 netdev_linux_drain(struct netdev *netdev_)
899 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
900 if (netdev->fd < 0) {
902 } else if (!strncmp(netdev_get_type(netdev_), "tap", 3)) {
904 int error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
905 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
909 drain_fd(netdev->fd, ifr.ifr_qlen);
912 return drain_rcvbuf(netdev->fd);
916 /* Sends 'buffer' on 'netdev'. Returns 0 if successful, otherwise a positive
917 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
918 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
919 * the packet is too big or too small to transmit on the device.
921 * The caller retains ownership of 'buffer' in all cases.
923 * The kernel maintains a packet transmission queue, so the caller is not
924 * expected to do additional queuing of packets. */
926 netdev_linux_send(struct netdev *netdev_, const void *data, size_t size)
928 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
932 if (netdev->fd < 0) {
933 /* Use our AF_PACKET socket to send to this device. */
934 struct sockaddr_ll sll;
941 sock = af_packet_sock();
946 error = get_ifindex(netdev_, &ifindex);
951 /* We don't bother setting most fields in sockaddr_ll because the
952 * kernel ignores them for SOCK_RAW. */
953 memset(&sll, 0, sizeof sll);
954 sll.sll_family = AF_PACKET;
955 sll.sll_ifindex = ifindex;
957 iov.iov_base = (void *) data;
961 msg.msg_namelen = sizeof sll;
964 msg.msg_control = NULL;
965 msg.msg_controllen = 0;
968 retval = sendmsg(sock, &msg, 0);
970 /* Use the netdev's own fd to send to this device. This is
971 * essential for tap devices, because packets sent to a tap device
972 * with an AF_PACKET socket will loop back to be *received* again
973 * on the tap device. */
974 retval = write(netdev->fd, data, size);
978 /* The Linux AF_PACKET implementation never blocks waiting for room
979 * for packets, instead returning ENOBUFS. Translate this into
980 * EAGAIN for the caller. */
981 if (errno == ENOBUFS) {
983 } else if (errno == EINTR) {
985 } else if (errno != EAGAIN) {
986 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
987 netdev_get_name(netdev_), strerror(errno));
990 } else if (retval != size) {
991 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%zd bytes of "
992 "%zu) on %s", retval, size, netdev_get_name(netdev_));
1000 /* Registers with the poll loop to wake up from the next call to poll_block()
1001 * when the packet transmission queue has sufficient room to transmit a packet
1002 * with netdev_send().
1004 * The kernel maintains a packet transmission queue, so the client is not
1005 * expected to do additional queuing of packets. Thus, this function is
1006 * unlikely to ever be used. It is included for completeness. */
1008 netdev_linux_send_wait(struct netdev *netdev_)
1010 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1011 if (netdev->fd < 0) {
1012 /* Nothing to do. */
1013 } else if (strncmp(netdev_get_type(netdev_), "tap", 3)) {
1014 poll_fd_wait(netdev->fd, POLLOUT);
1016 /* TAP device always accepts packets.*/
1017 poll_immediate_wake();
1021 /* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
1022 * otherwise a positive errno value. */
1024 netdev_linux_set_etheraddr(struct netdev *netdev_,
1025 const uint8_t mac[ETH_ADDR_LEN])
1027 struct netdev_dev_linux *netdev_dev =
1028 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1031 if (netdev_dev->cache_valid & VALID_ETHERADDR) {
1032 if (netdev_dev->ether_addr_error) {
1033 return netdev_dev->ether_addr_error;
1035 if (eth_addr_equals(netdev_dev->etheraddr, mac)) {
1038 netdev_dev->cache_valid &= ~VALID_ETHERADDR;
1041 error = set_etheraddr(netdev_get_name(netdev_), mac);
1042 if (!error || error == ENODEV) {
1043 netdev_dev->ether_addr_error = error;
1044 netdev_dev->cache_valid |= VALID_ETHERADDR;
1046 memcpy(netdev_dev->etheraddr, mac, ETH_ADDR_LEN);
1053 /* Copies 'netdev''s MAC address to 'mac' which is passed as param. */
1055 netdev_linux_get_etheraddr(const struct netdev *netdev_,
1056 uint8_t mac[ETH_ADDR_LEN])
1058 struct netdev_dev_linux *netdev_dev =
1059 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1061 if (!(netdev_dev->cache_valid & VALID_ETHERADDR)) {
1062 int error = get_etheraddr(netdev_get_name(netdev_),
1063 netdev_dev->etheraddr);
1065 netdev_dev->ether_addr_error = error;
1066 netdev_dev->cache_valid |= VALID_ETHERADDR;
1069 if (!netdev_dev->ether_addr_error) {
1070 memcpy(mac, netdev_dev->etheraddr, ETH_ADDR_LEN);
1073 return netdev_dev->ether_addr_error;
1076 /* Returns the maximum size of transmitted (and received) packets on 'netdev',
1077 * in bytes, not including the hardware header; thus, this is typically 1500
1078 * bytes for Ethernet devices. */
1080 netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
1082 struct netdev_dev_linux *netdev_dev =
1083 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1084 if (!(netdev_dev->cache_valid & VALID_MTU)) {
1088 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
1089 SIOCGIFMTU, "SIOCGIFMTU");
1091 netdev_dev->netdev_mtu_error = error;
1092 netdev_dev->mtu = ifr.ifr_mtu;
1093 netdev_dev->cache_valid |= VALID_MTU;
1096 if (!netdev_dev->netdev_mtu_error) {
1097 *mtup = netdev_dev->mtu;
1099 return netdev_dev->netdev_mtu_error;
1102 /* Sets the maximum size of transmitted (MTU) for given device using linux
1103 * networking ioctl interface.
1106 netdev_linux_set_mtu(const struct netdev *netdev_, int mtu)
1108 struct netdev_dev_linux *netdev_dev =
1109 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1113 if (netdev_dev->cache_valid & VALID_MTU) {
1114 if (netdev_dev->netdev_mtu_error) {
1115 return netdev_dev->netdev_mtu_error;
1117 if (netdev_dev->mtu == mtu) {
1120 netdev_dev->cache_valid &= ~VALID_MTU;
1123 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
1124 SIOCSIFMTU, "SIOCSIFMTU");
1125 if (!error || error == ENODEV) {
1126 netdev_dev->netdev_mtu_error = error;
1127 netdev_dev->mtu = ifr.ifr_mtu;
1128 netdev_dev->cache_valid |= VALID_MTU;
/* Looks up the ifindex of 'netdev' with get_ifindex().  Returns the ifindex,
 * as a positive number, on success.  Returns a negative errno value on
 * failure. */
static int
netdev_linux_get_ifindex(const struct netdev *netdev)
{
    int ifindex;
    int error = get_ifindex(netdev, &ifindex);

    if (error) {
        return -error;
    }
    return ifindex;
}
1145 netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
1147 struct netdev_dev_linux *netdev_dev =
1148 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1150 if (netdev_dev->miimon_interval > 0) {
1151 *carrier = netdev_dev->miimon;
1153 *carrier = (netdev_dev->ifi_flags & IFF_RUNNING) != 0;
1159 static long long int
1160 netdev_linux_get_carrier_resets(const struct netdev *netdev)
1162 return netdev_dev_linux_cast(netdev_get_dev(netdev))->carrier_resets;
/* Runs ioctl 'cmd' (passed along with its name 'cmd_name') on device 'name'
 * with '*data' copied into the ifreq's ifr_data area, then copies that area
 * back into '*data' so the caller sees any values written by the ioctl. */
1166 netdev_linux_do_miimon(const char *name, int cmd, const char *cmd_name,
1167 struct mii_ioctl_data *data)
1172 memset(&ifr, 0, sizeof ifr);
1173 memcpy(&ifr.ifr_data, data, sizeof *data);
1174 error = netdev_linux_do_ioctl(name, &ifr, cmd, cmd_name);
1175 memcpy(data, &ifr.ifr_data, sizeof *data);
/* Stores in '*miimon' whether device 'name' has link.  The MII status
 * register is queried first; ethtool's ETHTOOL_GLINK serves as a fallback
 * (see the debug message below). */
1181 netdev_linux_get_miimon(const char *name, bool *miimon)
1183 struct mii_ioctl_data data;
1188 memset(&data, 0, sizeof data);
1189 error = netdev_linux_do_miimon(name, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
/* Read the basic mode status register; its link-status bit (BMSR_LSTATUS)
 * provides the result. */
1191 /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
1192 data.reg_num = MII_BMSR;
1193 error = netdev_linux_do_miimon(name, SIOCGMIIREG, "SIOCGMIIREG",
1197 *miimon = !!(data.val_out & BMSR_LSTATUS);
1199 VLOG_WARN_RL(&rl, "%s: failed to query MII", name);
/* Fallback: ask ethtool for the link state.  The reply is read back from the
 * same buffer as a struct ethtool_value. */
1202 struct ethtool_cmd ecmd;
1204 VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
1207 memset(&ecmd, 0, sizeof ecmd);
1208 error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
1211 struct ethtool_value eval;
1213 memcpy(&eval, &ecmd, sizeof eval);
1214 *miimon = !!eval.data;
1216 VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
/* Configures how often MII link monitoring polls 'netdev_'.  NOTE(review):
 * the unit of 'interval' is presumably milliseconds -- confirm against the
 * timer API. */
1224 netdev_linux_set_miimon_interval(struct netdev *netdev_,
1225 long long int interval)
1227 struct netdev_dev_linux *netdev_dev;
1229 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev_));
/* Positive intervals are raised to at least 100; anything else becomes 0,
 * the value the other miimon functions treat as "monitoring off". */
1231 interval = interval > 0 ? MAX(interval, 100) : 0;
1232 if (netdev_dev->miimon_interval != interval) {
1233 netdev_dev->miimon_interval = interval;
/* Expire the timer so that the next run acts on the new interval. */
1234 timer_set_expired(&netdev_dev->miimon_timer);
/* Polls the link state of the netdev_linux_class devices, stores a changed
 * state in the device, reports the change, and rearms each polled device's
 * timer. */
1241 netdev_linux_miimon_run(void)
1243 struct shash device_shash;
1244 struct shash_node *node;
1246 shash_init(&device_shash);
1247 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
1248 SHASH_FOR_EACH (node, &device_shash) {
1249 struct netdev_dev_linux *dev = node->data;
/* Monitoring disabled, or this device is not due yet. */
1252 if (dev->miimon_interval <= 0 || !timer_expired(&dev->miimon_timer)) {
1256 netdev_linux_get_miimon(dev->netdev_dev.name, &miimon);
/* Only a change in link state is recorded and reported. */
1257 if (miimon != dev->miimon) {
1258 dev->miimon = miimon;
1259 netdev_dev_linux_changed(dev, dev->ifi_flags, 0);
/* Schedule the next poll. */
1262 timer_set_duration(&dev->miimon_timer, dev->miimon_interval);
1265 shash_destroy(&device_shash);
/* Calls timer_wait() on the miimon timer of every netdev_linux_class device
 * that has a positive miimon interval. */
1269 netdev_linux_miimon_wait(void)
1271 struct shash device_shash;
1272 struct shash_node *node;
1274 shash_init(&device_shash);
1275 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
1276 SHASH_FOR_EACH (node, &device_shash) {
1277 struct netdev_dev_linux *dev = node->data;
1279 if (dev->miimon_interval > 0) {
1280 timer_wait(&dev->miimon_timer);
1283 shash_destroy(&device_shash);
1286 /* Check whether we can use RTM_GETLINK to get network device statistics.
1287 * In pre-2.6.19 kernels, this was only available if wireless extensions were enabled. */
1290 check_for_working_netlink_stats(void)
1292 /* Decide on the netdev_get_stats() implementation to use. Netlink is
1293 * preferable, so if that works, we'll use it. */
/* The loopback device is used as the probe target. */
1294 int ifindex = do_get_ifindex("lo");
1296 VLOG_WARN("failed to get ifindex for lo, "
1297 "obtaining netdev stats from proc");
/* Try an actual statistics request over netlink on that device. */
1300 struct netdev_stats stats;
1301 int error = get_stats_via_netlink(ifindex, &stats);
1303 VLOG_DBG("obtaining netdev stats via rtnetlink");
1306 VLOG_INFO("RTM_GETLINK failed (%s), obtaining netdev stats "
1307 "via proc (you are probably running a pre-2.6.19 "
1308 "kernel)", strerror(error));
/* Helper applied by netdev_tap_get_stats() to pairs of rx/tx counters;
 * presumably exchanges '*a' and '*b' -- TODO confirm against the body. */
1315 swap_uint64(uint64_t *a, uint64_t *b)
/* Fetches statistics for 'netdev_' from the vport layer into '*stats' and
 * records the outcome in the device's 'vport_stats_error', marking
 * VALID_VPORT_STAT_ERROR in the cache. */
1323 get_stats_via_vport(const struct netdev *netdev_,
1324 struct netdev_stats *stats)
1326 struct netdev_dev_linux *netdev_dev =
1327 netdev_dev_linux_cast(netdev_get_dev(netdev_));
/* Query when the last attempt succeeded or when no outcome has been recorded
 * yet; a cached failure suppresses further queries. */
1329 if (!netdev_dev->vport_stats_error ||
1330 !(netdev_dev->cache_valid & VALID_VPORT_STAT_ERROR)) {
1333 error = netdev_vport_get_stats(netdev_, stats);
1335 VLOG_WARN_RL(&rl, "%s: obtaining netdev stats via vport failed "
1336 "(%s)", netdev_get_name(netdev_), strerror(error));
1338 netdev_dev->vport_stats_error = error;
1339 netdev_dev->cache_valid |= VALID_VPORT_STAT_ERROR;
/* Reads statistics for 'netdev_' into '*stats' through rtnetlink when
 * check_for_working_netlink_stats() reported it usable, and through
 * get_stats_via_proc() otherwise.  Failures are logged. */
1344 netdev_linux_sys_get_stats(const struct netdev *netdev_,
1345 struct netdev_stats *stats)
/* Negative until the probe has run once; afterwards it holds the probe's
 * answer for the life of the process. */
1347 static int use_netlink_stats = -1;
1350 if (use_netlink_stats < 0) {
1351 use_netlink_stats = check_for_working_netlink_stats();
1354 if (use_netlink_stats) {
1357 error = get_ifindex(netdev_, &ifindex);
1359 error = get_stats_via_netlink(ifindex, stats);
1362 error = get_stats_via_proc(netdev_get_name(netdev_), stats);
1366 VLOG_WARN_RL(&rl, "%s: linux-sys get stats failed %d",
1367 netdev_get_name(netdev_), error);
1373 /* Retrieves current device stats for 'netdev-linux'. */
1375 netdev_linux_get_stats(const struct netdev *netdev_,
1376 struct netdev_stats *stats)
1378 struct netdev_dev_linux *netdev_dev =
1379 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1380 struct netdev_stats dev_stats;
/* Collect both views: the vport layer's counters go to 'stats', the
 * system-level counters to 'dev_stats'. */
1383 get_stats_via_vport(netdev_, stats);
1385 error = netdev_linux_sys_get_stats(netdev_, &dev_stats);
1388 if (netdev_dev->vport_stats_error) {
1395 if (netdev_dev->vport_stats_error) {
1396 /* stats not available from OVS then use ioctl stats. */
/* Fold the system-level error, drop, multicast and collision counters into
 * 'stats'. */
1399 stats->rx_errors += dev_stats.rx_errors;
1400 stats->tx_errors += dev_stats.tx_errors;
1401 stats->rx_dropped += dev_stats.rx_dropped;
1402 stats->tx_dropped += dev_stats.tx_dropped;
1403 stats->multicast += dev_stats.multicast;
1404 stats->collisions += dev_stats.collisions;
1405 stats->rx_length_errors += dev_stats.rx_length_errors;
1406 stats->rx_over_errors += dev_stats.rx_over_errors;
1407 stats->rx_crc_errors += dev_stats.rx_crc_errors;
1408 stats->rx_frame_errors += dev_stats.rx_frame_errors;
1409 stats->rx_fifo_errors += dev_stats.rx_fifo_errors;
1410 stats->rx_missed_errors += dev_stats.rx_missed_errors;
1411 stats->tx_aborted_errors += dev_stats.tx_aborted_errors;
1412 stats->tx_carrier_errors += dev_stats.tx_carrier_errors;
1413 stats->tx_fifo_errors += dev_stats.tx_fifo_errors;
1414 stats->tx_heartbeat_errors += dev_stats.tx_heartbeat_errors;
1415 stats->tx_window_errors += dev_stats.tx_window_errors;
1420 /* Retrieves current device stats for 'netdev-tap' netdev or
1421 * netdev-internal. */
1423 netdev_tap_get_stats(const struct netdev *netdev_,
1424 struct netdev_stats *stats)
1426 struct netdev_dev_linux *netdev_dev =
1427 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1428 struct netdev_stats dev_stats;
/* Collect both views: the vport layer's counters go to 'stats', the
 * system-level counters to 'dev_stats'. */
1431 get_stats_via_vport(netdev_, stats);
1433 error = netdev_linux_sys_get_stats(netdev_, &dev_stats);
1435 if (netdev_dev->vport_stats_error) {
1442 /* If this port is an internal port then the transmit and receive stats
1443 * will appear to be swapped relative to the other ports since we are the
1444 * one sending the data, not a remote computer. For consistency, we swap
1445 * them back here. This does not apply if we are getting stats from the
1446 * vport layer because it always tracks stats from the perspective of the
1448 if (netdev_dev->vport_stats_error) {
1450 swap_uint64(&stats->rx_packets, &stats->tx_packets);
1451 swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
1452 swap_uint64(&stats->rx_errors, &stats->tx_errors);
1453 swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
1454 stats->rx_length_errors = 0;
1455 stats->rx_over_errors = 0;
1456 stats->rx_crc_errors = 0;
1457 stats->rx_frame_errors = 0;
1458 stats->rx_fifo_errors = 0;
1459 stats->rx_missed_errors = 0;
1460 stats->tx_aborted_errors = 0;
1461 stats->tx_carrier_errors = 0;
1462 stats->tx_fifo_errors = 0;
1463 stats->tx_heartbeat_errors = 0;
1464 stats->tx_window_errors = 0;
1466 stats->rx_dropped += dev_stats.tx_dropped;
1467 stats->tx_dropped += dev_stats.rx_dropped;
1469 stats->rx_errors += dev_stats.tx_errors;
1470 stats->tx_errors += dev_stats.rx_errors;
1472 stats->multicast += dev_stats.multicast;
1473 stats->collisions += dev_stats.collisions;
/* Retrieves stats for 'netdev_' from the vport layer only and returns the
 * error recorded by that query. */
1479 netdev_internal_get_stats(const struct netdev *netdev_,
1480 struct netdev_stats *stats)
1482 struct netdev_dev_linux *netdev_dev =
1483 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1485 get_stats_via_vport(netdev_, stats);
1486 return netdev_dev->vport_stats_error;
/* Refreshes the feature bitmaps cached in 'netdev_dev' ('supported',
 * 'advertised', 'current', 'peer') from an ETHTOOL_GSET query.  The
 * VALID_FEATURES cache bit is checked first, and the outcome of the query is
 * kept in 'get_features_error'. */
1490 netdev_linux_read_features(struct netdev_dev_linux *netdev_dev)
1492 struct ethtool_cmd ecmd;
1496 if (netdev_dev->cache_valid & VALID_FEATURES) {
1500 memset(&ecmd, 0, sizeof ecmd);
1501 error = netdev_linux_do_ethtool(netdev_dev->netdev_dev.name, &ecmd,
1502 ETHTOOL_GSET, "ETHTOOL_GSET");
1507 /* Supported features. */
1508 netdev_dev->supported = 0;
1509 if (ecmd.supported & SUPPORTED_10baseT_Half) {
1510 netdev_dev->supported |= NETDEV_F_10MB_HD;
1512 if (ecmd.supported & SUPPORTED_10baseT_Full) {
1513 netdev_dev->supported |= NETDEV_F_10MB_FD;
1515 if (ecmd.supported & SUPPORTED_100baseT_Half) {
1516 netdev_dev->supported |= NETDEV_F_100MB_HD;
1518 if (ecmd.supported & SUPPORTED_100baseT_Full) {
1519 netdev_dev->supported |= NETDEV_F_100MB_FD;
1521 if (ecmd.supported & SUPPORTED_1000baseT_Half) {
1522 netdev_dev->supported |= NETDEV_F_1GB_HD;
1524 if (ecmd.supported & SUPPORTED_1000baseT_Full) {
1525 netdev_dev->supported |= NETDEV_F_1GB_FD;
1527 if (ecmd.supported & SUPPORTED_10000baseT_Full) {
1528 netdev_dev->supported |= NETDEV_F_10GB_FD;
1530 if (ecmd.supported & SUPPORTED_TP) {
1531 netdev_dev->supported |= NETDEV_F_COPPER;
1533 if (ecmd.supported & SUPPORTED_FIBRE) {
1534 netdev_dev->supported |= NETDEV_F_FIBER;
1536 if (ecmd.supported & SUPPORTED_Autoneg) {
1537 netdev_dev->supported |= NETDEV_F_AUTONEG;
1539 if (ecmd.supported & SUPPORTED_Pause) {
1540 netdev_dev->supported |= NETDEV_F_PAUSE;
1542 if (ecmd.supported & SUPPORTED_Asym_Pause) {
1543 netdev_dev->supported |= NETDEV_F_PAUSE_ASYM;
1546 /* Advertised features. */
1547 netdev_dev->advertised = 0;
1548 if (ecmd.advertising & ADVERTISED_10baseT_Half) {
1549 netdev_dev->advertised |= NETDEV_F_10MB_HD;
1551 if (ecmd.advertising & ADVERTISED_10baseT_Full) {
1552 netdev_dev->advertised |= NETDEV_F_10MB_FD;
1554 if (ecmd.advertising & ADVERTISED_100baseT_Half) {
1555 netdev_dev->advertised |= NETDEV_F_100MB_HD;
1557 if (ecmd.advertising & ADVERTISED_100baseT_Full) {
1558 netdev_dev->advertised |= NETDEV_F_100MB_FD;
1560 if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
1561 netdev_dev->advertised |= NETDEV_F_1GB_HD;
1563 if (ecmd.advertising & ADVERTISED_1000baseT_Full) {
1564 netdev_dev->advertised |= NETDEV_F_1GB_FD;
1566 if (ecmd.advertising & ADVERTISED_10000baseT_Full) {
1567 netdev_dev->advertised |= NETDEV_F_10GB_FD;
1569 if (ecmd.advertising & ADVERTISED_TP) {
1570 netdev_dev->advertised |= NETDEV_F_COPPER;
1572 if (ecmd.advertising & ADVERTISED_FIBRE) {
1573 netdev_dev->advertised |= NETDEV_F_FIBER;
1575 if (ecmd.advertising & ADVERTISED_Autoneg) {
1576 netdev_dev->advertised |= NETDEV_F_AUTONEG;
1578 if (ecmd.advertising & ADVERTISED_Pause) {
1579 netdev_dev->advertised |= NETDEV_F_PAUSE;
1581 if (ecmd.advertising & ADVERTISED_Asym_Pause) {
1582 netdev_dev->advertised |= NETDEV_F_PAUSE_ASYM;
1585 /* Current settings. */
/* Half duplex is only distinguished up to 1 Gb; 10 Gb and faster map to the
 * full-duplex bit.  The three fastest speeds are compared as literals whose
 * values line up with the NETDEV_F_* names they select. */
1587 if (speed == SPEED_10) {
1588 netdev_dev->current = ecmd.duplex ? NETDEV_F_10MB_FD : NETDEV_F_10MB_HD;
1589 } else if (speed == SPEED_100) {
1590 netdev_dev->current = ecmd.duplex ? NETDEV_F_100MB_FD : NETDEV_F_100MB_HD;
1591 } else if (speed == SPEED_1000) {
1592 netdev_dev->current = ecmd.duplex ? NETDEV_F_1GB_FD : NETDEV_F_1GB_HD;
1593 } else if (speed == SPEED_10000) {
1594 netdev_dev->current = NETDEV_F_10GB_FD;
1595 } else if (speed == 40000) {
1596 netdev_dev->current = NETDEV_F_40GB_FD;
1597 } else if (speed == 100000) {
1598 netdev_dev->current = NETDEV_F_100GB_FD;
1599 } else if (speed == 1000000) {
1600 netdev_dev->current = NETDEV_F_1TB_FD;
1602 netdev_dev->current = 0;
/* Medium in use, taken from the reported port type. */
1605 if (ecmd.port == PORT_TP) {
1606 netdev_dev->current |= NETDEV_F_COPPER;
1607 } else if (ecmd.port == PORT_FIBRE) {
1608 netdev_dev->current |= NETDEV_F_FIBER;
1612 netdev_dev->current |= NETDEV_F_AUTONEG;
1615 /* Peer advertisements. */
1616 netdev_dev->peer = 0; /* XXX */
/* Record the result together with the validity bit so later calls can reuse
 * it. */
1619 netdev_dev->cache_valid |= VALID_FEATURES;
1620 netdev_dev->get_features_error = error;
1623 /* Stores the features supported by 'netdev' into each of '*current',
1624 * '*advertised', '*supported', and '*peer' that are non-null. Each value is a
1625 * bitmap of NETDEV_* bits. Returns 0 if successful, otherwise a positive errno value. */
1628 netdev_linux_get_features(const struct netdev *netdev_,
1629 enum netdev_features *current,
1630 enum netdev_features *advertised,
1631 enum netdev_features *supported,
1632 enum netdev_features *peer)
1634 struct netdev_dev_linux *netdev_dev =
1635 netdev_dev_linux_cast(netdev_get_dev(netdev_));
/* Populate, or reuse, the feature bitmaps cached in the device. */
1637 netdev_linux_read_features(netdev_dev);
/* NOTE(review): all four output pointers are dereferenced unconditionally
 * here -- confirm that callers never pass NULL. */
1639 if (!netdev_dev->get_features_error) {
1640 *current = netdev_dev->current;
1641 *advertised = netdev_dev->advertised;
1642 *supported = netdev_dev->supported;
1643 *peer = netdev_dev->peer;
1645 return netdev_dev->get_features_error;
1648 /* Set the features advertised by 'netdev' to 'advertise'. */
1650 netdev_linux_set_advertisements(struct netdev *netdev,
1651 enum netdev_features advertise)
1653 struct ethtool_cmd ecmd;
/* Read the current settings first so that only the advertising mask differs
 * in what is written back. */
1656 memset(&ecmd, 0, sizeof ecmd);
1657 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1658 ETHTOOL_GSET, "ETHTOOL_GSET");
/* Rebuild the mask from scratch: only bits named in 'advertise' are set. */
1663 ecmd.advertising = 0;
1664 if (advertise & NETDEV_F_10MB_HD) {
1665 ecmd.advertising |= ADVERTISED_10baseT_Half;
1667 if (advertise & NETDEV_F_10MB_FD) {
1668 ecmd.advertising |= ADVERTISED_10baseT_Full;
1670 if (advertise & NETDEV_F_100MB_HD) {
1671 ecmd.advertising |= ADVERTISED_100baseT_Half;
1673 if (advertise & NETDEV_F_100MB_FD) {
1674 ecmd.advertising |= ADVERTISED_100baseT_Full;
1676 if (advertise & NETDEV_F_1GB_HD) {
1677 ecmd.advertising |= ADVERTISED_1000baseT_Half;
1679 if (advertise & NETDEV_F_1GB_FD) {
1680 ecmd.advertising |= ADVERTISED_1000baseT_Full;
1682 if (advertise & NETDEV_F_10GB_FD) {
1683 ecmd.advertising |= ADVERTISED_10000baseT_Full;
1685 if (advertise & NETDEV_F_COPPER) {
1686 ecmd.advertising |= ADVERTISED_TP;
1688 if (advertise & NETDEV_F_FIBER) {
1689 ecmd.advertising |= ADVERTISED_FIBRE;
1691 if (advertise & NETDEV_F_AUTONEG) {
1692 ecmd.advertising |= ADVERTISED_Autoneg;
1694 if (advertise & NETDEV_F_PAUSE) {
1695 ecmd.advertising |= ADVERTISED_Pause;
1697 if (advertise & NETDEV_F_PAUSE_ASYM) {
1698 ecmd.advertising |= ADVERTISED_Asym_Pause;
/* Write the modified settings back to the device. */
1700 return netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1701 ETHTOOL_SSET, "ETHTOOL_SSET");
1704 /* Attempts to set input rate limiting (policing) policy. Returns 0 if
1705 * successful, otherwise a positive errno value. */
1707 netdev_linux_set_policing(struct netdev *netdev,
1708 uint32_t kbits_rate, uint32_t kbits_burst)
1710 struct netdev_dev_linux *netdev_dev =
1711 netdev_dev_linux_cast(netdev_get_dev(netdev));
1712 const char *netdev_name = netdev_get_name(netdev);
1716 kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
1717 : !kbits_burst ? 1000 /* Default to 1000 kbits if 0. */
1718 : kbits_burst); /* Stick with user-specified value. */
/* A valid cache entry is consulted first: a remembered error is returned
 * as-is, and the cached rate and burst are compared with the request. */
1720 if (netdev_dev->cache_valid & VALID_POLICING) {
1721 if (netdev_dev->netdev_policing_error) {
1722 return netdev_dev->netdev_policing_error;
1725 if (netdev_dev->kbits_rate == kbits_rate &&
1726 netdev_dev->kbits_burst == kbits_burst) {
1727 /* Assume that settings haven't changed since we last set them. */
1730 netdev_dev->cache_valid &= ~VALID_POLICING;
1733 COVERAGE_INC(netdev_set_policing);
1734 /* Remove any existing ingress qdisc. */
1735 error = tc_add_del_ingress_qdisc(netdev, false);
1737 VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
1738 netdev_name, strerror(error));
/* Install a fresh ingress qdisc and attach the policer to it. */
1743 error = tc_add_del_ingress_qdisc(netdev, true);
1745 VLOG_WARN_RL(&rl, "%s: adding policing qdisc failed: %s",
1746 netdev_name, strerror(error));
1750 error = tc_add_policer(netdev, kbits_rate, kbits_burst);
1752 VLOG_WARN_RL(&rl, "%s: adding policing action failed: %s",
1753 netdev_name, strerror(error));
1758 netdev_dev->kbits_rate = kbits_rate;
1759 netdev_dev->kbits_burst = kbits_burst;
/* Only success and ENODEV are remembered in the cache. */
1762 if (!error || error == ENODEV) {
1763 netdev_dev->netdev_policing_error = error;
1764 netdev_dev->cache_valid |= VALID_POLICING;
/* Adds to 'types' the OVS name of every entry in the NULL-terminated 'tcs'
 * array that provides a tc_install callback and has a non-empty name. */
1770 netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
1773 const struct tc_ops **opsp;
1775 for (opsp = tcs; *opsp != NULL; opsp++) {
1776 const struct tc_ops *ops = *opsp;
1777 if (ops->tc_install && ops->ovs_name[0] != '\0') {
1778 sset_add(types, ops->ovs_name);
/* Creates a tap device through tun_alloc(), makes its fd non-blocking, and
 * initializes it as a netdev_tap_pl_class device that is handed back in
 * '*netdev_devp'. */
1785 netdev_linux_create_tap_pl(const struct netdev_class *class OVS_UNUSED,
1786 const char *name, struct netdev_dev **netdev_devp)
1788 struct netdev_dev_linux *netdev_dev;
1789 struct tap_state *state;
1790 char real_name[IFNAMSIZ];
1793 netdev_dev = xzalloc(sizeof *netdev_dev);
1794 state = &netdev_dev->state.tap;
1796 error = cache_notifier_ref();
1801 /* Open tap device. */
1802 state->fd = tun_alloc(IFF_TAP, real_name);
1803 if (state->fd < 0) {
1805 VLOG_WARN("tun_alloc(IFF_TAP, %s) failed: %s", name, strerror(error));
1806 goto error_unref_notifier;
/* The device may come back under a different name than requested; this is
 * only logged, and 'name' is still what the netdev is registered under. */
1808 if (strcmp(name, real_name)) {
1809 VLOG_WARN("tap_pl: requested %s, created %s", name, real_name);
/* NOTE(review): the failure path below jumps to error_unref_notifier while
 * state->fd is open -- confirm that the fd is closed somewhere on that
 * path. */
1812 /* Make non-blocking. */
1813 error = set_nonblocking(state->fd);
1815 goto error_unref_notifier;
1818 netdev_dev_init(&netdev_dev->netdev_dev, name, &netdev_tap_pl_class);
1819 *netdev_devp = &netdev_dev->netdev_dev;
1822 error_unref_notifier:
1823 cache_notifier_unref();
/* Scans the NULL-terminated 'tcs' array for the entry whose ovs_name equals
 * 'name'. */
1829 static const struct tc_ops *
1830 tc_lookup_ovs_name(const char *name)
1832 const struct tc_ops **opsp;
1834 for (opsp = tcs; *opsp != NULL; opsp++) {
1835 const struct tc_ops *ops = *opsp;
1836 if (!strcmp(name, ops->ovs_name)) {
/* Scans the NULL-terminated 'tcs' array for the entry whose linux_name equals
 * 'name'.  Entries with a null linux_name never match. */
1843 static const struct tc_ops *
1844 tc_lookup_linux_name(const char *name)
1846 const struct tc_ops **opsp;
1848 for (opsp = tcs; *opsp != NULL; opsp++) {
1849 const struct tc_ops *ops = *opsp;
1850 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
/* Searches the 'hash' bucket of the device's queue map for the queue whose
 * id is 'queue_id'. */
1857 static struct tc_queue *
1858 tc_find_queue__(const struct netdev *netdev, unsigned int queue_id,
1861 struct netdev_dev_linux *netdev_dev =
1862 netdev_dev_linux_cast(netdev_get_dev(netdev));
1863 struct tc_queue *queue;
1865 HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev_dev->tc->queues) {
1866 if (queue->queue_id == queue_id) {
/* Convenience wrapper: hashes 'queue_id' with hash_int(queue_id, 0) and
 * delegates to tc_find_queue__(). */
1873 static struct tc_queue *
1874 tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
1876 return tc_find_queue__(netdev, queue_id, hash_int(queue_id, 0));
/* Fills '*caps' for QoS type 'type': the queue count is taken from the tc
 * implementation registered under that OVS name. */
1880 netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
1882 struct netdev_qos_capabilities *caps)
1884 const struct tc_ops *ops = tc_lookup_ovs_name(type);
1888 caps->n_queues = ops->n_queues;
/* Reports the OVS name of the qdisc on 'netdev' in '*typep' and, when the tc
 * implementation offers a qdisc_get callback, lets it fill in 'details'. */
1893 netdev_linux_get_qos(const struct netdev *netdev,
1894 const char **typep, struct shash *details)
1896 struct netdev_dev_linux *netdev_dev =
1897 netdev_dev_linux_cast(netdev_get_dev(netdev));
/* Query the kernel's qdisc state before reading netdev_dev->tc. */
1900 error = tc_query_qdisc(netdev);
1905 *typep = netdev_dev->tc->ops->ovs_name;
1906 return (netdev_dev->tc->ops->qdisc_get
1907 ? netdev_dev->tc->ops->qdisc_get(netdev, details)
/* Configures QoS type 'type' with 'details' on 'netdev'.  A type that is
 * already installed is reconfigured in place; otherwise the existing qdisc
 * is deleted and the new one installed. */
1912 netdev_linux_set_qos(struct netdev *netdev,
1913 const char *type, const struct shash *details)
1915 struct netdev_dev_linux *netdev_dev =
1916 netdev_dev_linux_cast(netdev_get_dev(netdev));
1917 const struct tc_ops *new_ops;
/* The type must be known and installable. */
1920 new_ops = tc_lookup_ovs_name(type);
1921 if (!new_ops || !new_ops->tc_install) {
1925 error = tc_query_qdisc(netdev);
/* Same type as currently installed: reconfigure through qdisc_set when the
 * implementation has one, otherwise there is nothing to do. */
1930 if (new_ops == netdev_dev->tc->ops) {
1931 return new_ops->qdisc_set ? new_ops->qdisc_set(netdev, details) : 0;
1933 /* Delete existing qdisc. */
1934 error = tc_del_qdisc(netdev);
1938 assert(netdev_dev->tc == NULL);
1940 /* Install new qdisc. */
1941 error = new_ops->tc_install(netdev, details);
/* tc_install must leave netdev_dev->tc set exactly when it succeeds. */
1942 assert((error == 0) == (netdev_dev->tc != NULL));
/* Looks up queue 'queue_id' on 'netdev' and has the tc implementation's
 * class_get callback describe it in 'details'. */
1949 netdev_linux_get_queue(const struct netdev *netdev,
1950 unsigned int queue_id, struct shash *details)
1952 struct netdev_dev_linux *netdev_dev =
1953 netdev_dev_linux_cast(netdev_get_dev(netdev));
/* Query the kernel's qdisc state before reading netdev_dev->tc. */
1956 error = tc_query_qdisc(netdev);
1960 struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1962 ? netdev_dev->tc->ops->class_get(netdev, queue, details)
/* Configures queue 'queue_id' on 'netdev' with 'details' through the tc
 * implementation's class_set callback. */
1968 netdev_linux_set_queue(struct netdev *netdev,
1969 unsigned int queue_id, const struct shash *details)
1971 struct netdev_dev_linux *netdev_dev =
1972 netdev_dev_linux_cast(netdev_get_dev(netdev));
/* Query the kernel's qdisc state before reading netdev_dev->tc.  The branch
 * after it covers a queue id beyond the type's n_queues and a type without
 * class_set. */
1975 error = tc_query_qdisc(netdev);
1978 } else if (queue_id >= netdev_dev->tc->ops->n_queues
1979 || !netdev_dev->tc->ops->class_set) {
1983 return netdev_dev->tc->ops->class_set(netdev, queue_id, details);
/* Deletes queue 'queue_id' from 'netdev' through the tc implementation's
 * class_delete callback; types without that callback get separate handling
 * in the branch below. */
1987 netdev_linux_delete_queue(struct netdev *netdev, unsigned int queue_id)
1989 struct netdev_dev_linux *netdev_dev =
1990 netdev_dev_linux_cast(netdev_get_dev(netdev));
/* Query the kernel's qdisc state before reading netdev_dev->tc. */
1993 error = tc_query_qdisc(netdev);
1996 } else if (!netdev_dev->tc->ops->class_delete) {
1999 struct tc_queue *queue = tc_find_queue(netdev, queue_id);
2001 ? netdev_dev->tc->ops->class_delete(netdev, queue)
/* Retrieves statistics for queue 'queue_id' of 'netdev' into '*stats' through
 * the tc implementation's class_get_stats callback; types without that
 * callback get separate handling in the branch below. */
2007 netdev_linux_get_queue_stats(const struct netdev *netdev,
2008 unsigned int queue_id,
2009 struct netdev_queue_stats *stats)
2011 struct netdev_dev_linux *netdev_dev =
2012 netdev_dev_linux_cast(netdev_get_dev(netdev));
/* Query the kernel's qdisc state before reading netdev_dev->tc. */
2015 error = tc_query_qdisc(netdev);
2018 } else if (!netdev_dev->tc->ops->class_get_stats) {
2021 const struct tc_queue *queue = tc_find_queue(netdev, queue_id);
2023 ? netdev_dev->tc->ops->class_get_stats(netdev, queue, stats)
/* Starts a netlink dump of the traffic classes (RTM_GETTCLASS) of 'netdev'
 * into 'dump', sending the request on 'rtnl_sock'. */
2029 start_queue_dump(const struct netdev *netdev, struct nl_dump *dump)
2031 struct ofpbuf request;
2032 struct tcmsg *tcmsg;
2034 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
2038 tcmsg->tcm_parent = 0;
2039 nl_dump_start(dump, rtnl_sock, &request);
/* The request buffer is released once the dump has been started. */
2040 ofpbuf_uninit(&request);
/* Walks the queues of 'netdev', obtains each one's details from the tc
 * implementation's class_get callback, and passes the queue id, the details
 * and 'aux' to 'cb'. */
2045 netdev_linux_dump_queues(const struct netdev *netdev,
2046 netdev_dump_queues_cb *cb, void *aux)
2048 struct netdev_dev_linux *netdev_dev =
2049 netdev_dev_linux_cast(netdev_get_dev(netdev));
2050 struct tc_queue *queue, *next_queue;
2051 struct shash details;
/* Query the kernel's qdisc state before reading netdev_dev->tc. */
2055 error = tc_query_qdisc(netdev);
2058 } else if (!netdev_dev->tc->ops->class_get) {
/* One 'details' map is reused for all queues and cleared before each. */
2063 shash_init(&details);
2064 HMAP_FOR_EACH_SAFE (queue, next_queue, hmap_node,
2065 &netdev_dev->tc->queues) {
2066 shash_clear(&details);
2068 error = netdev_dev->tc->ops->class_get(netdev, queue, &details);
2070 (*cb)(queue->queue_id, &details, aux);
2075 shash_destroy(&details);
/* Dumps the traffic classes of 'netdev' over netlink and hands every reply,
 * together with 'cb' and 'aux', to the tc implementation's class_dump_stats
 * callback. */
2081 netdev_linux_dump_queue_stats(const struct netdev *netdev,
2082 netdev_dump_queue_stats_cb *cb, void *aux)
2084 struct netdev_dev_linux *netdev_dev =
2085 netdev_dev_linux_cast(netdev_get_dev(netdev));
2086 struct nl_dump dump;
/* Query the kernel's qdisc state before reading netdev_dev->tc. */
2091 error = tc_query_qdisc(netdev);
2094 } else if (!netdev_dev->tc->ops->class_dump_stats) {
2099 if (!start_queue_dump(netdev, &dump)) {
2102 while (nl_dump_next(&dump, &msg)) {
2103 error = netdev_dev->tc->ops->class_dump_stats(netdev, &msg, cb, aux);
/* An error from finishing the dump takes precedence over 'last_error'. */
2109 error = nl_dump_done(&dump);
2110 return error ? error : last_error;
/* Stores the IPv4 address and netmask of 'netdev_' in '*address' and
 * '*netmask'.  They are queried with SIOCGIFADDR and SIOCGIFNETMASK while the
 * VALID_IN4 cache bit is clear and served from the device's cache afterwards.
 * The final return is EADDRNOTAVAIL if the address is INADDR_ANY and 0
 * otherwise. */
2114 netdev_linux_get_in4(const struct netdev *netdev_,
2115 struct in_addr *address, struct in_addr *netmask)
2117 struct netdev_dev_linux *netdev_dev =
2118 netdev_dev_linux_cast(netdev_get_dev(netdev_));
2120 if (!(netdev_dev->cache_valid & VALID_IN4)) {
2123 error = netdev_linux_get_ipv4(netdev_, &netdev_dev->address,
2124 SIOCGIFADDR, "SIOCGIFADDR");
2129 error = netdev_linux_get_ipv4(netdev_, &netdev_dev->netmask,
2130 SIOCGIFNETMASK, "SIOCGIFNETMASK");
2135 netdev_dev->cache_valid |= VALID_IN4;
2137 *address = netdev_dev->address;
2138 *netmask = netdev_dev->netmask;
2139 return address->s_addr == INADDR_ANY ? EADDRNOTAVAIL : 0;
/* Assigns 'address' to 'netdev_' with SIOCSIFADDR, updates the cached address
 * and netmask, and sets 'netmask' with SIOCSIFNETMASK unless the address is
 * INADDR_ANY. */
2143 netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
2144 struct in_addr netmask)
2146 struct netdev_dev_linux *netdev_dev =
2147 netdev_dev_linux_cast(netdev_get_dev(netdev_));
2150 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
2152 netdev_dev->cache_valid |= VALID_IN4;
2153 netdev_dev->address = address;
2154 netdev_dev->netmask = netmask;
2155 if (address.s_addr != INADDR_ANY) {
2156 error = do_set_addr(netdev_, SIOCSIFNETMASK,
2157 "SIOCSIFNETMASK", netmask);
/* Parses 'line' with a format that expects sixteen two-digit hex bytes (the
 * address, stored in 'in6'), four further hex fields that are skipped, and
 * an interface name of at most 16 characters (stored in 'ifname'). */
2164 parse_if_inet6_line(const char *line,
2165 struct in6_addr *in6, char ifname[16 + 1])
2167 uint8_t *s6 = in6->s6_addr;
/* X8 matches a single two-digit hex byte. */
2168 #define X8 "%2"SCNx8
2170 " "X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8
2171 "%*x %*x %*x %*x %16s\n",
2172 &s6[0], &s6[1], &s6[2], &s6[3],
2173 &s6[4], &s6[5], &s6[6], &s6[7],
2174 &s6[8], &s6[9], &s6[10], &s6[11],
2175 &s6[12], &s6[13], &s6[14], &s6[15],
2179 /* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address (if
2180 * 'in6' is non-null) and returns true. Otherwise, returns false. */
/* NOTE(review): '*in6' is written unconditionally at the end of this
 * function -- confirm that callers never pass a null 'in6'. */
2182 netdev_linux_get_in6(const struct netdev *netdev_, struct in6_addr *in6)
2184 struct netdev_dev_linux *netdev_dev =
2185 netdev_dev_linux_cast(netdev_get_dev(netdev_));
2186 if (!(netdev_dev->cache_valid & VALID_IN6)) {
/* Default to the unspecified address in case no line matches. */
2190 netdev_dev->in6 = in6addr_any;
2192 file = fopen("/proc/net/if_inet6", "r");
2194 const char *name = netdev_get_name(netdev_);
2195 while (fgets(line, sizeof line, file)) {
2196 struct in6_addr in6_tmp;
2197 char ifname[16 + 1];
2198 if (parse_if_inet6_line(line, &in6_tmp, ifname)
2199 && !strcmp(name, ifname))
2201 netdev_dev->in6 = in6_tmp;
2207 netdev_dev->cache_valid |= VALID_IN6;
2209 *in6 = netdev_dev->in6;
/* Builds an AF_INET socket address for 'addr' and stores it in '*sa'. */
2214 make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
2216 struct sockaddr_in sin;
2217 memset(&sin, 0, sizeof sin);
2218 sin.sin_family = AF_INET;
2219 sin.sin_addr = addr;
/* Clear '*sa' first so that any bytes the copy does not overwrite are
 * zero. */
2222 memset(sa, 0, sizeof *sa);
2223 memcpy(sa, &sin, sizeof sin);
/* Runs ioctl 'ioctl_nr' on 'netdev' with an ifreq that carries the device
 * name and 'addr' as an AF_INET address. */
2227 do_set_addr(struct netdev *netdev,
2228 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
2231 ovs_strzcpy(ifr.ifr_name, netdev_get_name(netdev), sizeof ifr.ifr_name);
2232 make_in4_sockaddr(&ifr.ifr_addr, addr);
2234 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
2238 /* Adds 'router' as a default IP gateway. */
2240 netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
2242 struct in_addr any = { INADDR_ANY };
/* A destination and genmask of INADDR_ANY, with 'router' as the gateway,
 * describe the default route. */
2246 memset(&rt, 0, sizeof rt);
2247 make_in4_sockaddr(&rt.rt_dst, any);
2248 make_in4_sockaddr(&rt.rt_gateway, router);
2249 make_in4_sockaddr(&rt.rt_genmask, any);
2250 rt.rt_flags = RTF_UP | RTF_GATEWAY;
2251 error = ioctl(af_inet_sock, SIOCADDRT, &rt) < 0 ? errno : 0;
2253 VLOG_WARN("ioctl(SIOCADDRT): %s", strerror(error));
/* Scans /proc/net/route for a route that is up and whose masked destination
 * matches 'host'.  On a match '*next_hop' is set to 0 for a directly
 * reachable host or to the route's gateway, and '*netdev_name' receives a
 * copy of the interface name made with xstrdup(). */
2259 netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
2262 static const char fn[] = "/proc/net/route";
2267 *netdev_name = NULL;
2268 stream = fopen(fn, "r");
2269 if (stream == NULL) {
2270 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
2275 while (fgets(line, sizeof line, stream)) {
2278 ovs_be32 dest, gateway, mask;
2279 int refcnt, metric, mtu;
2280 unsigned int flags, use, window, irtt;
2283 "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
2285 iface, &dest, &gateway, &flags, &refcnt,
2286 &use, &metric, &mask, &mtu, &window, &irtt) != 11) {
2288 VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
2292 if (!(flags & RTF_UP)) {
2293 /* Skip routes that aren't up. */
2297 /* The output of 'dest', 'mask', and 'gateway' were given in
2298 * network byte order, so we don't need any endian
2299 * conversions here. */
2300 if ((dest & mask) == (host->s_addr & mask)) {
2302 /* The host is directly reachable. */
2303 next_hop->s_addr = 0;
2305 /* To reach the host, we must go through a gateway. */
2306 next_hop->s_addr = gateway;
2308 *netdev_name = xstrdup(iface);
/* Adds the driver name, driver version and firmware version from the
 * device's drvinfo to 'sh', each as a copy made with xstrdup(). */
2320 netdev_linux_get_drv_info(const struct netdev *netdev, struct shash *sh)
2323 struct netdev_dev_linux *netdev_dev =
2324 netdev_dev_linux_cast(netdev_get_dev(netdev));
2326 error = netdev_linux_get_drvinfo(netdev_dev);
2328 shash_add(sh, "driver_name", xstrdup(netdev_dev->drvinfo.driver));
2329 shash_add(sh, "driver_version", xstrdup(netdev_dev->drvinfo.version));
2330 shash_add(sh, "firmware_version", xstrdup(netdev_dev->drvinfo.fw_version));
/* Internal devices report a fixed driver name and nothing else. */
2336 netdev_internal_get_drv_info(const struct netdev *netdev OVS_UNUSED, struct shash *sh)
2338 shash_add(sh, "driver_name", xstrdup("openvswitch"));
2342 /* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
2343 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
2344 * returns 0. Otherwise, it returns a positive errno value; in particular,
2345 * ENXIO indicates that there is no ARP table entry for 'ip' on 'netdev'. */
2347 netdev_linux_arp_lookup(const struct netdev *netdev,
2348 ovs_be32 ip, uint8_t mac[ETH_ADDR_LEN])
2351 struct sockaddr_in sin;
/* Build the request: protocol address 'ip', Ethernet hardware type, and the
 * name of the device to look on. */
2354 memset(&r, 0, sizeof r);
2355 memset(&sin, 0, sizeof sin);
2356 sin.sin_family = AF_INET;
2357 sin.sin_addr.s_addr = ip;
2359 memcpy(&r.arp_pa, &sin, sizeof sin);
2360 r.arp_ha.sa_family = ARPHRD_ETHER;
2362 ovs_strzcpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
2363 COVERAGE_INC(netdev_arp_lookup);
2364 retval = ioctl(af_inet_sock, SIOCGARP, &r) < 0 ? errno : 0;
2366 memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
2367 } else if (retval != ENXIO) {
2368 VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
2369 netdev_get_name(netdev), IP_ARGS(&ip), strerror(retval));
/* Converts the NETDEV_UP and NETDEV_PROMISC bits of 'nd' into interface
 * flags -- presumably the matching IFF_* bits; TODO confirm. */
2375 nd_to_iff_flags(enum netdev_flags nd)
2378 if (nd & NETDEV_UP) {
2381 if (nd & NETDEV_PROMISC) {
/* Converts interface flags 'iff' into netdev flags; IFF_PROMISC becomes
 * NETDEV_PROMISC. */
2388 iff_to_nd_flags(int iff)
2390 enum netdev_flags nd = 0;
2394 if (iff & IFF_PROMISC) {
2395 nd |= NETDEV_PROMISC;
/* Clears the flags in 'off' and sets the flags in 'on' for 'netdev', storing
 * the previous flags, converted to netdev flags, in '*old_flagsp'. */
2401 netdev_linux_update_flags(struct netdev *netdev, enum netdev_flags off,
2402 enum netdev_flags on, enum netdev_flags *old_flagsp)
2404 struct netdev_dev_linux *netdev_dev;
2405 int old_flags, new_flags;
2408 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
/* The starting point is the cached copy of the interface flags. */
2409 old_flags = netdev_dev->ifi_flags;
2410 *old_flagsp = iff_to_nd_flags(old_flags);
2411 new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
/* The device is touched only when something changes; the cached flags are
 * then re-read. */
2412 if (new_flags != old_flags) {
2413 error = set_flags(netdev, new_flags);
2414 get_flags(&netdev_dev->netdev_dev, &netdev_dev->ifi_flags);
/* Flag-update callback installed in netdev_tap_pl_class in place of
 * netdev_linux_update_flags(). */
2420 netdev_tap_pl_update_flags(struct netdev *netdev, enum netdev_flags off,
2421 enum netdev_flags on, enum netdev_flags *old_flagsp)
/* Returns the 'change_seq' value stored in the device behind 'netdev'. */
2427 netdev_linux_change_seq(const struct netdev *netdev)
2429 return netdev_dev_linux_cast(netdev_get_dev(netdev))->change_seq;
/* Initializer shared by the struct netdev_class definitions that follow.  The
 * callbacks listed in the macro body are common to all of them; the macro
 * arguments supply the parts that differ per class. */
2432 #define NETDEV_LINUX_CLASS(NAME, CREATE, GET_STATS, SET_STATS, \
2433 GET_FEATURES, GET_STATUS, \
2438 netdev_linux_init, \
2440 netdev_linux_wait, \
2443 netdev_linux_destroy, \
2444 NULL, /* get_config */ \
2445 NULL, /* set_config */ \
2447 netdev_linux_open, \
2448 netdev_linux_close, \
2450 netdev_linux_listen, \
2451 netdev_linux_recv, \
2452 netdev_linux_recv_wait, \
2453 netdev_linux_drain, \
2455 netdev_linux_send, \
2456 netdev_linux_send_wait, \
2458 netdev_linux_set_etheraddr, \
2459 netdev_linux_get_etheraddr, \
2460 netdev_linux_get_mtu, \
2461 netdev_linux_set_mtu, \
2462 netdev_linux_get_ifindex, \
2463 netdev_linux_get_carrier, \
2464 netdev_linux_get_carrier_resets, \
2465 netdev_linux_set_miimon_interval, \
2470 netdev_linux_set_advertisements, \
2472 netdev_linux_set_policing, \
2473 netdev_linux_get_qos_types, \
2474 netdev_linux_get_qos_capabilities, \
2475 netdev_linux_get_qos, \
2476 netdev_linux_set_qos, \
2477 netdev_linux_get_queue, \
2478 netdev_linux_set_queue, \
2479 netdev_linux_delete_queue, \
2480 netdev_linux_get_queue_stats, \
2481 netdev_linux_dump_queues, \
2482 netdev_linux_dump_queue_stats, \
2484 netdev_linux_get_in4, \
2485 netdev_linux_set_in4, \
2486 netdev_linux_get_in6, \
2487 netdev_linux_add_router, \
2488 netdev_linux_get_next_hop, \
2490 netdev_linux_arp_lookup, \
2494 netdev_linux_change_seq \
/* Class created by netdev_linux_create(), reporting statistics through
 * netdev_linux_get_stats() and features through ethtool. */
2497 const struct netdev_class netdev_linux_class =
2500 netdev_linux_create,
2501 netdev_linux_get_stats,
2502 NULL, /* set_stats */
2503 netdev_linux_get_features,
2504 netdev_linux_get_drv_info,
2505 netdev_linux_update_flags);
/* Class for tap devices: created by netdev_linux_create_tap(), with
 * statistics from netdev_tap_get_stats(). */
2507 const struct netdev_class netdev_tap_class =
2510 netdev_linux_create_tap,
2511 netdev_tap_get_stats,
2512 NULL, /* set_stats */
2513 netdev_linux_get_features,
2514 netdev_linux_get_drv_info,
2515 netdev_linux_update_flags);
/* Class for internal devices: statistics are read from, and can be set
 * through, the vport layer, and there is no get_features callback. */
2517 const struct netdev_class netdev_internal_class =
2520 netdev_linux_create,
2521 netdev_internal_get_stats,
2522 netdev_vport_set_stats,
2523 NULL, /* get_features */
2524 netdev_internal_get_drv_info,
2525 netdev_linux_update_flags);
/* Variant of netdev_tap_class that differs in its constructor
 * (netdev_linux_create_tap_pl) and its flag-update callback
 * (netdev_tap_pl_update_flags). */
2527 const struct netdev_class netdev_tap_pl_class =
2530 netdev_linux_create_tap_pl,
2531 netdev_tap_get_stats,
2532 NULL, /* set_stats */
2533 netdev_linux_get_features,
2534 netdev_linux_get_drv_info,
2535 netdev_tap_pl_update_flags);
/* Definitions used by the HTB code in this file.  HTB_N_QUEUES is the queue
 * count advertised in tc_ops_htb and also the largest class minor number that
 * the HTB parsers accept as a queue (queue N is class minor N + 1).  All rates
 * below are kept in bytes/s, while the database-facing strings are in
 * bits/s. */
2537 /* HTB traffic control class. */
2539 #define HTB_N_QUEUES 0xf000
2543 unsigned int max_rate; /* In bytes/s. */
2547 struct tc_queue tc_queue;
2548 unsigned int min_rate; /* In bytes/s. */
2549 unsigned int max_rate; /* In bytes/s. */
2550 unsigned int burst; /* In bytes. */
2551 unsigned int priority; /* Lower values are higher priorities. */
/* Returns the HTB state that contains 'netdev''s current 'tc' member.
 *
 * This is a plain CONTAINER_OF conversion: nothing here checks that the tc
 * installed on 'netdev' really is an HTB one, so callers must only use it on
 * HTB-configured devices. */
2555 htb_get__(const struct netdev *netdev)
2557 struct netdev_dev_linux *netdev_dev =
2558 netdev_dev_linux_cast(netdev_get_dev(netdev));
2559 return CONTAINER_OF(netdev_dev->tc, struct htb, tc);
/* Allocates a new struct htb, initializes its embedded tc with tc_ops_htb,
 * records 'max_rate' (bytes/s) in it and makes it 'netdev''s tc.
 *
 * NOTE(review): 'max_rate' arrives as a uint64_t but the max_rate fields in
 * this file are declared "unsigned int", so a value that does not fit would be
 * truncated by the assignment below -- confirm whether callers can pass one. */
2563 htb_install__(struct netdev *netdev, uint64_t max_rate)
2565 struct netdev_dev_linux *netdev_dev =
2566 netdev_dev_linux_cast(netdev_get_dev(netdev));
2569 htb = xmalloc(sizeof *htb);
2570 tc_init(&htb->tc, &tc_ops_htb);
2571 htb->max_rate = max_rate;
2573 netdev_dev->tc = &htb->tc;
2576 /* Create an HTB qdisc.
2578 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
2580 htb_setup_qdisc__(struct netdev *netdev)
2583 struct tc_htb_glob opt;
2584 struct ofpbuf request;
2585 struct tcmsg *tcmsg;
/* Delete the qdisc that is currently installed first; the creation request
 * below is sent with NLM_F_EXCL. */
2587 tc_del_qdisc(netdev);
2589 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2590 NLM_F_EXCL | NLM_F_CREATE, &request);
/* Handle 1:0 attached at the root, i.e. "root handle 1:" in the command
 * quoted above. */
2594 tcmsg->tcm_handle = tc_make_handle(1, 0);
2595 tcmsg->tcm_parent = TC_H_ROOT;
2597 nl_msg_put_string(&request, TCA_KIND, "htb");
/* The qdisc-wide HTB options travel as a TCA_HTB_INIT attribute nested inside
 * TCA_OPTIONS. */
2599 memset(&opt, 0, sizeof opt);
2600 opt.rate2quantum = 10;
2604 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2605 nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
2606 nl_msg_end_nested(&request, opt_offset);
/* The result of tc_transact() is returned to the caller unchanged. */
2608 return tc_transact(&request, NULL);
2611 /* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
2612 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
2614 htb_setup_class__(struct netdev *netdev, unsigned int handle,
2615 unsigned int parent, struct htb_class *class)
2618 struct tc_htb_opt opt;
2619 struct ofpbuf request;
2620 struct tcmsg *tcmsg;
/* The MTU is an input to both the rate specs and the buffer sizes computed
 * below, so a device without one cannot be configured. */
2624 error = netdev_get_mtu(netdev, &mtu);
2626 VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU",
2627 netdev_get_name(netdev));
/* 'rate' carries the class's min_rate and 'ceil' its max_rate.  Both buffers
 * are derived from the same class->burst. */
2631 memset(&opt, 0, sizeof opt);
2632 tc_fill_rate(&opt.rate, class->min_rate, mtu);
2633 tc_fill_rate(&opt.ceil, class->max_rate, mtu);
2634 opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
2635 opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
2636 opt.prio = class->priority;
/* NLM_F_CREATE without NLM_F_EXCL: the "replace" of the command above. */
2638 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
2642 tcmsg->tcm_handle = handle;
2643 tcmsg->tcm_parent = parent;
/* Besides the parameters themselves, a rate table is sent for each of the two
 * rate specs. */
2645 nl_msg_put_string(&request, TCA_KIND, "htb");
2646 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2647 nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
2648 tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
2649 tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
2650 nl_msg_end_nested(&request, opt_offset);
/* A failure is logged (rate-limited) together with everything that was
 * requested, which is the only place the rejected parameters are recorded. */
2652 error = tc_transact(&request, NULL);
2654 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
2655 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
2656 netdev_get_name(netdev),
2657 tc_get_major(handle), tc_get_minor(handle),
2658 tc_get_major(parent), tc_get_minor(parent),
2659 class->min_rate, class->max_rate,
2660 class->burst, class->priority, strerror(error));
/* Extracts HTB class parameters from the nested attributes 'nl_options' into
 * 'class' (the older comment that follows still calls the output 'details').
 *
 * TCA_HTB_PARMS is mandatory and must be at least as large as a struct
 * tc_htb_opt.  min_rate and max_rate are taken from its 'rate' and 'ceil' rate
 * specs, priority from 'prio', and burst is the 'buffer' tick count converted
 * to bytes by tc_ticks_to_bytes() at the 'rate' rate.  A parse failure is
 * logged with a rate-limited warning. */
2665 /* Parses Netlink attributes in 'options' for HTB parameters and stores a
2666 * description of them into 'details'. The description complies with the
2667 * specification given in the vswitch database documentation for linux-htb
2670 htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
2672 static const struct nl_policy tca_htb_policy[] = {
2673 [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
2674 .min_len = sizeof(struct tc_htb_opt) },
2677 struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
2678 const struct tc_htb_opt *htb;
2680 if (!nl_parse_nested(nl_options, tca_htb_policy,
2681 attrs, ARRAY_SIZE(tca_htb_policy))) {
2682 VLOG_WARN_RL(&rl, "failed to parse HTB class options");
2686 htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
2687 class->min_rate = htb->rate.rate;
2688 class->max_rate = htb->ceil.rate;
2689 class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
2690 class->priority = htb->prio;
/* Parses the class message 'tcmsg'.
 *
 * If 'queue_id' is nonnull and parsing succeeded, the class handle is turned
 * into a queue ID: for a handle 1:N with 1 <= N <= HTB_N_QUEUES, '*queue_id'
 * becomes N - 1.  If 'options' is nonnull and no error occurred, the class's
 * TCA_OPTIONS are parsed into it as HTB parameters.  'stats' is handed to
 * tc_parse_class() as is. */
2695 htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2696 struct htb_class *options,
2697 struct netdev_queue_stats *stats)
2699 struct nlattr *nl_options;
2700 unsigned int handle;
2703 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2704 if (!error && queue_id) {
2705 unsigned int major = tc_get_major(handle);
2706 unsigned int minor = tc_get_minor(handle);
2707 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2708 *queue_id = minor - 1;
2713 if (!error && options) {
2714 error = htb_parse_tca_options__(nl_options, options);
2720 htb_parse_qdisc_details__(struct netdev *netdev,
2721 const struct shash *details, struct htb_class *hc)
2723 const char *max_rate_s;
2725 max_rate_s = shash_find_data(details, "max-rate");
2726 hc->max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
2727 if (!hc->max_rate) {
2728 enum netdev_features current;
2730 netdev_get_features(netdev, ¤t, NULL, NULL, NULL);
2731 hc->max_rate = netdev_features_to_bps(current) / 8;
2733 hc->min_rate = hc->max_rate;
/* Parses the "min-rate", "max-rate", "burst" and "priority" keys of 'details'
 * into 'hc' for a queue on 'netdev'.
 *
 * The rate and burst strings are decimal values in bits (per second); they are
 * divided by 8 because 'hc' stores bytes.  The results are then clamped:
 *
 *   - min_rate to the range [MTU, qdisc max_rate],
 *   - max_rate to the range [min_rate, qdisc max_rate],
 *   - burst to at least MTU + 64.
 *
 * A missing "min-rate", "burst" or "priority" is treated as 0 before clamping.
 * The device's MTU is required; without it a rate-limited warning is
 * logged. */
2739 htb_parse_class_details__(struct netdev *netdev,
2740 const struct shash *details, struct htb_class *hc)
2742 const struct htb *htb = htb_get__(netdev);
2743 const char *min_rate_s = shash_find_data(details, "min-rate");
2744 const char *max_rate_s = shash_find_data(details, "max-rate");
2745 const char *burst_s = shash_find_data(details, "burst");
2746 const char *priority_s = shash_find_data(details, "priority");
2749 error = netdev_get_mtu(netdev, &mtu);
2751 VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU",
2752 netdev_get_name(netdev));
2756 /* HTB requires at least an mtu sized min-rate to send any traffic even
2757 * on uncongested links. */
2758 hc->min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
2759 hc->min_rate = MAX(hc->min_rate, mtu);
2760 hc->min_rate = MIN(hc->min_rate, htb->max_rate);
2763 hc->max_rate = (max_rate_s
2764 ? strtoull(max_rate_s, NULL, 10) / 8
2766 hc->max_rate = MAX(hc->max_rate, hc->min_rate);
2767 hc->max_rate = MIN(hc->max_rate, htb->max_rate);
2771 * According to hints in the documentation that I've read, it is important
2772 * that 'burst' be at least as big as the largest frame that might be
2773 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
2774 * but having it a bit too small is a problem. Since netdev_get_mtu()
2775 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
2776 * the MTU. We actually add 64, instead of 14, as a guard against
2777 * additional headers get tacked on somewhere that we're not aware of. */
2778 hc->burst = burst_s ? strtoull(burst_s, NULL, 10) / 8 : 0;
2779 hc->burst = MAX(hc->burst, mtu + 64);
2782 hc->priority = priority_s ? strtoul(priority_s, NULL, 10) : 0;
/* Queries the kernel for class 'handle' with parent 'parent' on 'netdev' and
 * parses the reply into 'options' and 'stats' using htb_parse_tcmsg__().  No
 * queue ID is requested from the parser.  The reply buffer is freed here. */
2788 htb_query_class__(const struct netdev *netdev, unsigned int handle,
2789 unsigned int parent, struct htb_class *options,
2790 struct netdev_queue_stats *stats)
2792 struct ofpbuf *reply;
2795 error = tc_query_class(netdev, handle, parent, &reply);
2797 error = htb_parse_tcmsg__(reply, NULL, options, stats);
2798 ofpbuf_delete(reply);
/* Installs HTB on 'netdev': sets up the HTB qdisc, configures root class
 * 1:fffe (parent 1:0) from 'details', and attaches a new struct htb holding
 * the resulting max_rate to 'netdev'. */
2804 htb_tc_install(struct netdev *netdev, const struct shash *details)
2808 error = htb_setup_qdisc__(netdev);
2810 struct htb_class hc;
2812 htb_parse_qdisc_details__(netdev, details, &hc);
2813 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2814 tc_make_handle(1, 0), &hc);
2816 htb_install__(netdev, hc.max_rate);
/* Converts 'queue' to the struct htb_class that embeds it as its 'tc_queue'
 * member.  Only meaningful for queues that were created as HTB classes. */
2822 static struct htb_class *
2823 htb_class_cast__(const struct tc_queue *queue)
2825 return CONTAINER_OF(queue, struct htb_class, tc_queue);
/* Records 'hc''s parameters for queue 'queue_id' in 'netdev''s HTB state.
 *
 * The queue is looked up with tc_find_queue__(); the code below also has a
 * path that allocates a fresh struct htb_class and inserts it into the queue
 * hmap under hash_int(queue_id, 0).  Only min_rate, max_rate, burst and
 * priority are copied from 'hc'; its tc_queue member is not. */
2829 htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
2830 const struct htb_class *hc)
2832 struct htb *htb = htb_get__(netdev);
2833 size_t hash = hash_int(queue_id, 0);
2834 struct tc_queue *queue;
2835 struct htb_class *hcp;
2837 queue = tc_find_queue__(netdev, queue_id, hash);
2839 hcp = htb_class_cast__(queue);
2841 hcp = xmalloc(sizeof *hcp);
2842 queue = &hcp->tc_queue;
2843 queue->queue_id = queue_id;
2844 hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
2847 hcp->min_rate = hc->min_rate;
2848 hcp->max_rate = hc->max_rate;
2849 hcp->burst = hc->burst;
2850 hcp->priority = hc->priority;
/* Rebuilds 'netdev''s HTB state from what the kernel reports: root class
 * 1:fffe is queried for the qdisc-wide max_rate, a struct htb carrying that
 * rate is installed, and then every class returned by the queue dump that
 * parses as an HTB queue is recorded.  'nlmsg' is unused. */
2854 htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
2857 struct nl_dump dump;
2858 struct htb_class hc;
2860 /* Get qdisc options. */
2862 htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
2863 htb_install__(netdev, hc.max_rate);
2866 if (!start_queue_dump(netdev, &dump)) {
2869 while (nl_dump_next(&dump, &msg)) {
2870 unsigned int queue_id;
/* Classes that fail to parse as HTB queues are skipped. */
2872 if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
2873 htb_update_queue__(netdev, queue_id, &hc);
2876 nl_dump_done(&dump);
/* Tears down the HTB state that embeds 'tc', removing every queue from its
 * hmap.  The _SAFE iterator is needed because entries are removed from the
 * map while it is being traversed. */
2882 htb_tc_destroy(struct tc *tc)
2884 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
2885 struct htb_class *hc, *next;
2887 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &htb->tc.queues) {
2888 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
/* Adds the qdisc-wide "max-rate" to 'details' as a newly allocated decimal
 * string.  The rate is stored in bytes/s and reported in bits/s, hence the
 * multiplication by 8 (done in unsigned long long to avoid overflow). */
2896 htb_qdisc_get(const struct netdev *netdev, struct shash *details)
2898 const struct htb *htb = htb_get__(netdev);
2899 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * htb->max_rate));
/* Reconfigures root class 1:fffe (parent 1:0) on 'netdev' from 'details' and
 * stores the new max_rate in 'netdev''s HTB state. */
2904 htb_qdisc_set(struct netdev *netdev, const struct shash *details)
2906 struct htb_class hc;
2909 htb_parse_qdisc_details__(netdev, details, &hc);
2910 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2911 tc_make_handle(1, 0), &hc);
2913 htb_get__(netdev)->max_rate = hc.max_rate;
/* Reports 'queue''s configuration in 'details' as newly allocated strings.
 *
 * Rates and burst are stored in bytes and reported in bits, hence the factor
 * of 8.  "max-rate" is only added when it differs from "min-rate".  'netdev'
 * is unused. */
2919 htb_class_get(const struct netdev *netdev OVS_UNUSED,
2920 const struct tc_queue *queue, struct shash *details)
2922 const struct htb_class *hc = htb_class_cast__(queue);
2924 shash_add(details, "min-rate", xasprintf("%llu", 8ULL * hc->min_rate));
2925 if (hc->min_rate != hc->max_rate) {
2926 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hc->max_rate));
2928 shash_add(details, "burst", xasprintf("%llu", 8ULL * hc->burst));
2930 shash_add(details, "priority", xasprintf("%u", hc->priority));
/* Configures queue 'queue_id' on 'netdev' from 'details': the details are
 * parsed, kernel class 1:(queue_id + 1) is set up under root class 1:fffe, and
 * the parsed parameters are mirrored in the local queue table. */
2936 htb_class_set(struct netdev *netdev, unsigned int queue_id,
2937 const struct shash *details)
2939 struct htb_class hc;
2942 error = htb_parse_class_details__(netdev, details, &hc);
2947 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
2948 tc_make_handle(1, 0xfffe), &hc);
2953 htb_update_queue__(netdev, queue_id, &hc);
/* Deletes kernel class 1:(queue_id + 1) that backs 'queue' and removes 'queue'
 * from 'netdev''s HTB queue hmap. */
2958 htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
2960 struct htb_class *hc = htb_class_cast__(queue);
2961 struct htb *htb = htb_get__(netdev);
2964 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
2966 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
/* Fetches statistics for 'queue' into 'stats' by querying kernel class
 * 1:(queue_id + 1), child of root class 1:fffe.  The class options are not
 * requested.  Returns whatever htb_query_class__() returns. */
2973 htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
2974 struct netdev_queue_stats *stats)
2976 return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
2977 tc_make_handle(1, 0xfffe), NULL, stats);
/* Handles one class message 'nlmsg' from a statistics dump.  If the class
 * handle is 1:N with 1 <= N <= HTB_N_QUEUES, 'cb' is invoked with queue ID
 * N - 1, the parsed statistics and 'aux'; classes outside that range do not
 * reach the callback.  'netdev' is unused. */
2981 htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
2982 const struct ofpbuf *nlmsg,
2983 netdev_dump_queue_stats_cb *cb, void *aux)
2985 struct netdev_queue_stats stats;
2986 unsigned int handle, major, minor;
2989 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
2994 major = tc_get_major(handle);
2995 minor = tc_get_minor(handle);
2996 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2997 (*cb)(minor - 1, &stats, aux);
/* Operations table for HTB: kernel qdisc name "htb", exposed under the QoS
 * type name "linux-htb", with room for HTB_N_QUEUES queues.  The table ends
 * with the two statistics callbacks. */
3002 static const struct tc_ops tc_ops_htb = {
3003 "htb", /* linux_name */
3004 "linux-htb", /* ovs_name */
3005 HTB_N_QUEUES, /* n_queues */
3014 htb_class_get_stats,
3015 htb_class_dump_stats
/* Definitions used by the HFSC code in this file.  HFSC_N_QUEUES is the queue
 * count advertised in tc_ops_hfsc and the largest class minor number that the
 * HFSC parsers accept as a queue (queue N is class minor N + 1). */
3018 /* "linux-hfsc" traffic control class. */
3020 #define HFSC_N_QUEUES 0xf000
3028 struct tc_queue tc_queue;
/* Returns the HFSC state that contains 'netdev''s current 'tc' member.
 *
 * This is a plain CONTAINER_OF conversion: nothing here checks that the tc
 * installed on 'netdev' really is an HFSC one, so callers must only use it on
 * HFSC-configured devices. */
3033 static struct hfsc *
3034 hfsc_get__(const struct netdev *netdev)
3036 struct netdev_dev_linux *netdev_dev;
3037 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
3038 return CONTAINER_OF(netdev_dev->tc, struct hfsc, tc);
/* Converts 'queue' to the struct hfsc_class that embeds it as its 'tc_queue'
 * member.  Only meaningful for queues that were created as HFSC classes. */
3041 static struct hfsc_class *
3042 hfsc_class_cast__(const struct tc_queue *queue)
3044 return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
/* Allocates a new struct hfsc, initializes its embedded tc with tc_ops_hfsc,
 * records 'max_rate' in it and makes it 'netdev''s tc. */
3048 hfsc_install__(struct netdev *netdev, uint32_t max_rate)
3050 struct netdev_dev_linux * netdev_dev;
3053 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
3054 hfsc = xmalloc(sizeof *hfsc);
3055 tc_init(&hfsc->tc, &tc_ops_hfsc);
3056 hfsc->max_rate = max_rate;
3057 netdev_dev->tc = &hfsc->tc;
/* Records 'hc''s rates for queue 'queue_id' in 'netdev''s HFSC state.
 *
 * The queue is looked up with tc_find_queue__(); the code below also has a
 * path that allocates a fresh struct hfsc_class and inserts it into the queue
 * hmap under hash_int(queue_id, 0).  Only min_rate and max_rate are copied
 * from 'hc'. */
3061 hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
3062 const struct hfsc_class *hc)
3066 struct hfsc_class *hcp;
3067 struct tc_queue *queue;
3069 hfsc = hfsc_get__(netdev);
3070 hash = hash_int(queue_id, 0);
3072 queue = tc_find_queue__(netdev, queue_id, hash);
3074 hcp = hfsc_class_cast__(queue);
3076 hcp = xmalloc(sizeof *hcp);
3077 queue = &hcp->tc_queue;
3078 queue->queue_id = queue_id;
3079 hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
3082 hcp->min_rate = hc->min_rate;
3083 hcp->max_rate = hc->max_rate;
/* Extracts HFSC class parameters from the nested attributes 'nl_options' into
 * 'class'.
 *
 * Three service curves are read: TCA_HFSC_RSC, TCA_HFSC_FSC and TCA_HFSC_USC,
 * each at least a struct tc_service_curve.  Only the simple shape that this
 * file itself configures is understood; a rate-limited warning is logged when
 *
 *   - any curve has a nonzero 'm1' or 'd' (a non-linear curve),
 *   - the RSC and FSC slopes 'm2' differ, or
 *   - the RSC slope exceeds the USC slope.
 *
 * Otherwise min_rate is the FSC slope and max_rate the USC slope. */
3087 hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
3089 const struct tc_service_curve *rsc, *fsc, *usc;
3090 static const struct nl_policy tca_hfsc_policy[] = {
3092 .type = NL_A_UNSPEC,
3094 .min_len = sizeof(struct tc_service_curve),
3097 .type = NL_A_UNSPEC,
3099 .min_len = sizeof(struct tc_service_curve),
3102 .type = NL_A_UNSPEC,
3104 .min_len = sizeof(struct tc_service_curve),
3107 struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];
3109 if (!nl_parse_nested(nl_options, tca_hfsc_policy,
3110 attrs, ARRAY_SIZE(tca_hfsc_policy))) {
3111 VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
3115 rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
3116 fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
3117 usc = nl_attr_get(attrs[TCA_HFSC_USC]);
3119 if (rsc->m1 != 0 || rsc->d != 0 ||
3120 fsc->m1 != 0 || fsc->d != 0 ||
3121 usc->m1 != 0 || usc->d != 0) {
3122 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3123 "Non-linear service curves are not supported.");
3127 if (rsc->m2 != fsc->m2) {
3128 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3129 "Real-time service curves are not supported ");
3133 if (rsc->m2 > usc->m2) {
3134 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3135 "Min-rate service curve is greater than "
3136 "the max-rate service curve.");
3140 class->min_rate = fsc->m2;
3141 class->max_rate = usc->m2;
/* Parses the class message 'tcmsg' with tc_parse_class().
 *
 * The class handle can be turned into a queue ID: for a handle 1:N with
 * 1 <= N <= HFSC_N_QUEUES, '*queue_id' becomes N - 1.  The class's
 * TCA_OPTIONS can be parsed into 'options' as HFSC parameters.  'stats' is
 * handed to tc_parse_class() as is.
 *
 * NOTE(review): callers in this file pass NULL for 'queue_id' or 'options'
 * when they do not want them -- the guards for that are presumably on lines
 * not shown here; confirm. */
3146 hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
3147 struct hfsc_class *options,
3148 struct netdev_queue_stats *stats)
3151 unsigned int handle;
3152 struct nlattr *nl_options;
3154 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
3160 unsigned int major, minor;
3162 major = tc_get_major(handle);
3163 minor = tc_get_minor(handle);
3164 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3165 *queue_id = minor - 1;
3172 error = hfsc_parse_tca_options__(nl_options, options);
/* Queries the kernel for class 'handle' with parent 'parent' on 'netdev' and
 * parses the reply into 'options' and 'stats' using hfsc_parse_tcmsg__().  No
 * queue ID is requested from the parser.  The reply buffer is freed here. */
3179 hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
3180 unsigned int parent, struct hfsc_class *options,
3181 struct netdev_queue_stats *stats)
3184 struct ofpbuf *reply;
3186 error = tc_query_class(netdev, handle, parent, &reply);
3191 error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
3192 ofpbuf_delete(reply);
3197 hfsc_parse_qdisc_details__(struct netdev *netdev, const struct shash *details,
3198 struct hfsc_class *class)
3201 const char *max_rate_s;
3203 max_rate_s = shash_find_data(details, "max-rate");
3204 max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
3207 enum netdev_features current;
3209 netdev_get_features(netdev, ¤t, NULL, NULL, NULL);
3210 max_rate = netdev_features_to_bps(current) / 8;
3213 class->min_rate = max_rate;
3214 class->max_rate = max_rate;
/* Parses the "min-rate" and "max-rate" keys of 'details' into 'class' for a
 * queue on 'netdev'.
 *
 * Both strings are decimal values in bits/s and are divided by 8 because
 * 'class' stores bytes/s.  min_rate is then clamped to the range
 * [1, qdisc max_rate] and max_rate to [min_rate, qdisc max_rate].  A missing
 * "min-rate" is treated as 0 before clamping. */
3218 hfsc_parse_class_details__(struct netdev *netdev,
3219 const struct shash *details,
3220 struct hfsc_class * class)
3222 const struct hfsc *hfsc;
3223 uint32_t min_rate, max_rate;
3224 const char *min_rate_s, *max_rate_s;
3226 hfsc = hfsc_get__(netdev);
3227 min_rate_s = shash_find_data(details, "min-rate");
3228 max_rate_s = shash_find_data(details, "max-rate");
3230 min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
3231 min_rate = MAX(min_rate, 1);
3232 min_rate = MIN(min_rate, hfsc->max_rate);
3234 max_rate = (max_rate_s
3235 ? strtoull(max_rate_s, NULL, 10) / 8
3237 max_rate = MAX(max_rate, min_rate);
3238 max_rate = MIN(max_rate, hfsc->max_rate);
3240 class->min_rate = min_rate;
3241 class->max_rate = max_rate;
3246 /* Create an HFSC qdisc.
3248 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
3250 hfsc_setup_qdisc__(struct netdev * netdev)
3252 struct tcmsg *tcmsg;
3253 struct ofpbuf request;
3254 struct tc_hfsc_qopt opt;
/* Delete the qdisc that is currently installed first; the creation request
 * below is sent with NLM_F_EXCL. */
3256 tc_del_qdisc(netdev);
3258 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3259 NLM_F_EXCL | NLM_F_CREATE, &request);
/* Handle 1:0 attached at the root, i.e. "root handle 1:" in the command
 * quoted above. */
3265 tcmsg->tcm_handle = tc_make_handle(1, 0);
3266 tcmsg->tcm_parent = TC_H_ROOT;
3268 memset(&opt, 0, sizeof opt);
/* Unlike the per-class options, the qdisc options are sent as the raw
 * TCA_OPTIONS payload rather than as a nested attribute. */
3271 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3272 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
/* The result of tc_transact() is returned to the caller unchanged. */
3274 return tc_transact(&request, NULL);
3277 /* Create an HFSC class.
3279 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
3280 * sc rate <min_rate> ul rate <max_rate>" */
3282 hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
3283 unsigned int parent, struct hfsc_class *class)
3287 struct tcmsg *tcmsg;
3288 struct ofpbuf request;
3289 struct tc_service_curve min, max;
/* NLM_F_CREATE without NLM_F_EXCL, so an existing class with this handle does
 * not make the request fail on that account. */
3291 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
3297 tcmsg->tcm_handle = handle;
3298 tcmsg->tcm_parent = parent;
/* Only the 'm2' slope of each curve carries a rate: min_rate for 'min' and
 * max_rate for 'max'. */
3302 min.m2 = class->min_rate;
3306 max.m2 = class->max_rate;
/* 'min' is sent as both TCA_HFSC_RSC and TCA_HFSC_FSC (the "sc" of the tc
 * command quoted above) and 'max' as TCA_HFSC_USC (the "ul"). */
3308 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3309 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3310 nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
3311 nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
3312 nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
3313 nl_msg_end_nested(&request, opt_offset);
/* A failure is logged (rate-limited) together with the requested rates. */
3315 error = tc_transact(&request, NULL);
3317 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
3318 "min-rate %ubps, max-rate %ubps (%s)",
3319 netdev_get_name(netdev),
3320 tc_get_major(handle), tc_get_minor(handle),
3321 tc_get_major(parent), tc_get_minor(parent),
3322 class->min_rate, class->max_rate, strerror(error));
/* Installs HFSC on 'netdev': sets up the HFSC qdisc, configures root class
 * 1:fffe (parent 1:0) from 'details', and attaches a new struct hfsc holding
 * the resulting max_rate to 'netdev'. */
3329 hfsc_tc_install(struct netdev *netdev, const struct shash *details)
3332 struct hfsc_class class;
3334 error = hfsc_setup_qdisc__(netdev);
3340 hfsc_parse_qdisc_details__(netdev, details, &class);
3341 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3342 tc_make_handle(1, 0), &class);
3348 hfsc_install__(netdev, class.max_rate);
/* Rebuilds 'netdev''s HFSC state from what the kernel reports: root class
 * 1:fffe is queried for the qdisc-wide max_rate, a struct hfsc carrying that
 * rate is installed, and then every class returned by the queue dump that
 * parses as an HFSC queue is recorded.  'nlmsg' is unused. */
3353 hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3356 struct nl_dump dump;
3357 struct hfsc_class hc;
3360 hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
3361 hfsc_install__(netdev, hc.max_rate);
3363 if (!start_queue_dump(netdev, &dump)) {
3367 while (nl_dump_next(&dump, &msg)) {
3368 unsigned int queue_id;
/* Classes that fail to parse as HFSC queues are skipped. */
3370 if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3371 hfsc_update_queue__(netdev, queue_id, &hc);
3375 nl_dump_done(&dump);
/* Tears down the HFSC state that embeds 'tc', removing every queue from its
 * hmap.  The _SAFE iterator is needed because entries are removed from the
 * map while it is being traversed. */
3380 hfsc_tc_destroy(struct tc *tc)
3383 struct hfsc_class *hc, *next;
3385 hfsc = CONTAINER_OF(tc, struct hfsc, tc);
3387 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
3388 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
/* Adds the qdisc-wide "max-rate" to 'details' as a newly allocated decimal
 * string.  The stored rate is multiplied by 8 (in unsigned long long, so the
 * product cannot overflow) to report it in bits/s. */
3397 hfsc_qdisc_get(const struct netdev *netdev, struct shash *details)
3399 const struct hfsc *hfsc;
3400 hfsc = hfsc_get__(netdev);
3401 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hfsc->max_rate));
/* Reconfigures root class 1:fffe (parent 1:0) on 'netdev' from 'details' and
 * stores the new max_rate in 'netdev''s HFSC state. */
3406 hfsc_qdisc_set(struct netdev *netdev, const struct shash *details)
3409 struct hfsc_class class;
3411 hfsc_parse_qdisc_details__(netdev, details, &class);
3412 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3413 tc_make_handle(1, 0), &class);
3416 hfsc_get__(netdev)->max_rate = class.max_rate;
/* Reports 'queue''s rates in 'details' as newly allocated strings, converted
 * from the stored bytes/s to bits/s (factor 8).  "max-rate" is only added when
 * it differs from "min-rate".  'netdev' is unused. */
3423 hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
3424 const struct tc_queue *queue, struct shash *details)
3426 const struct hfsc_class *hc;
3428 hc = hfsc_class_cast__(queue);
3429 shash_add(details, "min-rate", xasprintf("%llu", 8ULL * hc->min_rate));
3430 if (hc->min_rate != hc->max_rate) {
3431 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hc->max_rate));
/* Configures queue 'queue_id' on 'netdev' from 'details': the details are
 * parsed, kernel class 1:(queue_id + 1) is set up under root class 1:fffe, and
 * the parsed rates are mirrored in the local queue table. */
3437 hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
3438 const struct shash *details)
3441 struct hfsc_class class;
3443 error = hfsc_parse_class_details__(netdev, details, &class);
3448 error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3449 tc_make_handle(1, 0xfffe), &class);
3454 hfsc_update_queue__(netdev, queue_id, &class);
/* Deletes kernel class 1:(queue_id + 1) that backs 'queue' and removes 'queue'
 * from 'netdev''s HFSC queue hmap. */
3459 hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
3463 struct hfsc_class *hc;
3465 hc = hfsc_class_cast__(queue);
3466 hfsc = hfsc_get__(netdev);
3468 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3470 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
/* Fetches statistics for 'queue' into 'stats' by querying kernel class
 * 1:(queue_id + 1), child of root class 1:fffe.  The class options are not
 * requested.  Returns whatever hfsc_query_class__() returns. */
3477 hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3478 struct netdev_queue_stats *stats)
3480 return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3481 tc_make_handle(1, 0xfffe), NULL, stats);
/* Handles one class message 'nlmsg' from a statistics dump.  If the class
 * handle is 1:N with 1 <= N <= HFSC_N_QUEUES, 'cb' is invoked with queue ID
 * N - 1, the parsed statistics and 'aux'; classes outside that range do not
 * reach the callback.  'netdev' is unused. */
3485 hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3486 const struct ofpbuf *nlmsg,
3487 netdev_dump_queue_stats_cb *cb, void *aux)
3489 struct netdev_queue_stats stats;
3490 unsigned int handle, major, minor;
3493 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3498 major = tc_get_major(handle);
3499 minor = tc_get_minor(handle);
3500 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3501 (*cb)(minor - 1, &stats, aux);
/* Operations table for HFSC: kernel qdisc name "hfsc", exposed under the QoS
 * type name "linux-hfsc", with room for HFSC_N_QUEUES queues.  Every callback
 * slot is implemented. */
3506 static const struct tc_ops tc_ops_hfsc = {
3507 "hfsc", /* linux_name */
3508 "linux-hfsc", /* ovs_name */
3509 HFSC_N_QUEUES, /* n_queues */
3510 hfsc_tc_install, /* tc_install */
3511 hfsc_tc_load, /* tc_load */
3512 hfsc_tc_destroy, /* tc_destroy */
3513 hfsc_qdisc_get, /* qdisc_get */
3514 hfsc_qdisc_set, /* qdisc_set */
3515 hfsc_class_get, /* class_get */
3516 hfsc_class_set, /* class_set */
3517 hfsc_class_delete, /* class_delete */
3518 hfsc_class_get_stats, /* class_get_stats */
3519 hfsc_class_dump_stats /* class_dump_stats */
3522 /* "linux-default" traffic control class.
3524 * This class represents the default, unnamed Linux qdisc. It corresponds to
3525 * the "" (empty string) QoS type in the OVS database. */
/* Makes 'netdev' use the default tc object.
 *
 * 'tc' below has static storage, so the pointer survives between calls;
 * presumably this lets a single tc, initialized with tc_ops_default, be shared
 * by every device that uses the default qdisc -- verify against the
 * allocation guard. */
3528 default_install__(struct netdev *netdev)
3530 struct netdev_dev_linux *netdev_dev =
3531 netdev_dev_linux_cast(netdev_get_dev(netdev));
3532 static struct tc *tc;
3535 tc = xmalloc(sizeof *tc);
3536 tc_init(tc, &tc_ops_default);
3538 netdev_dev->tc = tc;
/* tc_install callback for the default qdisc: nothing is configured in the
 * kernel here, the default tc object is simply attached to 'netdev'.
 * 'details' is unused. */
3542 default_tc_install(struct netdev *netdev,
3543 const struct shash *details OVS_UNUSED)
3545 default_install__(netdev);
/* tc_load callback for the default qdisc: attaches the default tc object to
 * 'netdev' without reading anything from 'nlmsg', which is unused. */
3550 default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3552 default_install__(netdev);
/* Operations table for the default qdisc.  It has no kernel qdisc name, and
 * none of the destroy, qdisc or class callbacks listed below is
 * implemented. */
3556 static const struct tc_ops tc_ops_default = {
3557 NULL, /* linux_name */
3562 NULL, /* tc_destroy */
3563 NULL, /* qdisc_get */
3564 NULL, /* qdisc_set */
3565 NULL, /* class_get */
3566 NULL, /* class_set */
3567 NULL, /* class_delete */
3568 NULL, /* class_get_stats */
3569 NULL /* class_dump_stats */
/* tc_load callback for the "linux-other" class: attaches a tc object
 * initialized with tc_ops_other to 'netdev'.  'nlmsg' is unused.
 *
 * 'tc' below has static storage, so the pointer survives between calls;
 * presumably a single instance is shared by all such devices -- verify against
 * the allocation guard. */
3572 /* "linux-other" traffic control class.
3577 other_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3579 struct netdev_dev_linux *netdev_dev =
3580 netdev_dev_linux_cast(netdev_get_dev(netdev));
3581 static struct tc *tc;
3584 tc = xmalloc(sizeof *tc);
3585 tc_init(tc, &tc_ops_other);
3587 netdev_dev->tc = tc;
/* Operations table for the "linux-other" QoS type.  It has no kernel qdisc
 * name and cannot be installed (tc_install is NULL); none of the destroy,
 * qdisc or class callbacks is implemented either. */
3591 static const struct tc_ops tc_ops_other = {
3592 NULL, /* linux_name */
3593 "linux-other", /* ovs_name */
3595 NULL, /* tc_install */
3597 NULL, /* tc_destroy */
3598 NULL, /* qdisc_get */
3599 NULL, /* qdisc_set */
3600 NULL, /* class_get */
3601 NULL, /* class_set */
3602 NULL, /* class_delete */
3603 NULL, /* class_get_stats */
3604 NULL /* class_dump_stats */
3607 /* Traffic control. */
3609 /* Number of kernel "tc" ticks per second.  It is derived from the first
 * three values in /proc/net/psched as a * c / b and is used to convert between
 * ticks and bytes at a given rate. */
3610 static double ticks_per_s;
3612 /* Number of kernel "jiffies" per second. This is used for the purpose of
3613 * computing buffer sizes. Generally kernel qdiscs need to be able to buffer
3614 * one jiffy's worth of data.
3616 * There are two possibilities here:
3618 * - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
3619 * approximate range of 100 to 1024. That means that we really need to
3620 * make sure that the qdisc can buffer that much data.
3622 * - 'buffer_hz' is an absurdly large number. That means that the kernel
3623 * has finely granular timers and there's no need to fudge additional room
3624 * for buffers. (There's no extra effort needed to implement that: the
3625 * large 'buffer_hz' is used as a divisor, so practically any number will
3626 * come out as 0 in the division. Small integer results in the case of
3627 * really high dividends won't have any real effect anyhow.)
3629 static unsigned int buffer_hz;
3631 /* Returns tc handle 'major':'minor', that is, 'major' shifted into the
 * upper 16 bits and combined with 'minor' by TC_H_MAKE. */
3633 tc_make_handle(unsigned int major, unsigned int minor)
3635 return TC_H_MAKE(major << 16, minor);
3638 /* Returns the major number from 'handle': the part selected by TC_H_MAJ,
 * shifted down by 16 bits so that handle 1:0 yields 1. */
3640 tc_get_major(unsigned int handle)
3642 return TC_H_MAJ(handle) >> 16;
3645 /* Returns the minor number from 'handle', as selected by TC_H_MIN.  No
 * shift is applied. */
3647 tc_get_minor(unsigned int handle)
3649 return TC_H_MIN(handle);
/* Starts a traffic-control Netlink request for 'netdev' in 'request'.
 *
 * 'request' is initialized with a 512-byte initial allocation and receives a
 * Netlink header of the given 'type' with NLM_F_REQUEST | 'flags', followed by
 * a zeroed struct tcmsg whose family is AF_UNSPEC and whose ifindex is that of
 * 'netdev'.  The caller gets a pointer to that struct tcmsg and is expected to
 * fill in tcm_handle and tcm_parent, then append attributes and send the
 * request.
 *
 * Callers in this file test the result for null, which is how a failure to
 * obtain the ifindex is reported. */
3652 static struct tcmsg *
3653 tc_make_request(const struct netdev *netdev, int type, unsigned int flags,
3654 struct ofpbuf *request)
3656 struct tcmsg *tcmsg;
3660 error = get_ifindex(netdev, &ifindex);
3665 ofpbuf_init(request, 512);
3666 nl_msg_put_nlmsghdr(request, sizeof *tcmsg, type, NLM_F_REQUEST | flags);
3667 tcmsg = ofpbuf_put_zeros(request, sizeof *tcmsg);
3668 tcmsg->tcm_family = AF_UNSPEC;
3669 tcmsg->tcm_ifindex = ifindex;
3670 /* Caller should fill in tcmsg->tcm_handle. */
3671 /* Caller should fill in tcmsg->tcm_parent. */
/* Sends 'request' on rtnl_sock with nl_sock_transact(), passing 'replyp'
 * through for the reply.
 *
 * 'request' is uninitialized here whether or not the transaction succeeded,
 * so callers must neither reuse nor release it afterward. */
3677 tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
3679 int error = nl_sock_transact(rtnl_sock, request, replyp);
3680 ofpbuf_uninit(request);
/* Implementation notes for tc_add_del_ingress_qdisc(): adding sends
 * RTM_NEWQDISC with NLM_F_EXCL | NLM_F_CREATE, deleting sends RTM_DELQDISC
 * with no extra flags.  Either way the request names qdisc kind "ingress" with
 * handle ffff:0 and parent TC_H_INGRESS, plus an empty TCA_OPTIONS attribute.
 * When deleting, ENOENT and EINVAL results get special treatment (see the
 * check at the end of the function). */
3684 /* Adds or deletes a root ingress qdisc on 'netdev'. We use this for
3685 * policing configuration.
3687 * This function is equivalent to running the following when 'add' is true:
3688 * /sbin/tc qdisc add dev <devname> handle ffff: ingress
3690 * This function is equivalent to running the following when 'add' is false:
3691 * /sbin/tc qdisc del dev <devname> handle ffff: ingress
3693 * The configuration and stats may be seen with the following command:
3694 * /sbin/tc -s qdisc show dev <devname>
3696 * Returns 0 if successful, otherwise a positive errno value.
3699 tc_add_del_ingress_qdisc(struct netdev *netdev, bool add)
3701 struct ofpbuf request;
3702 struct tcmsg *tcmsg;
3704 int type = add ? RTM_NEWQDISC : RTM_DELQDISC;
3705 int flags = add ? NLM_F_EXCL | NLM_F_CREATE : 0;
3707 tcmsg = tc_make_request(netdev, type, flags, &request);
3711 tcmsg->tcm_handle = tc_make_handle(0xffff, 0);
3712 tcmsg->tcm_parent = TC_H_INGRESS;
3713 nl_msg_put_string(&request, TCA_KIND, "ingress");
3714 nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0);
3716 error = tc_transact(&request, NULL);
3718 /* If we're deleting the qdisc, don't worry about some of the
3719 * error conditions. */
3720 if (!add && (error == ENOENT || error == EINVAL)) {
3729 /* Adds a policer to 'netdev' with a rate of 'kbits_rate' and a burst size
3732 * This function is equivalent to running:
3733 * /sbin/tc filter add dev <devname> parent ffff: protocol all prio 49
3734 * basic police rate <kbits_rate>kbit burst <kbits_burst>k
3737 * The configuration and stats may be seen with the following command:
3738 * /sbin/tc -s filter show <devname> eth0 parent ffff:
3740 * Returns 0 if successful, otherwise a positive errno value.
3743 tc_add_policer(struct netdev *netdev, int kbits_rate, int kbits_burst)
3745 struct tc_police tc_police;
3746 struct ofpbuf request;
3747 struct tcmsg *tcmsg;
3748 size_t basic_offset;
3749 size_t police_offset;
3753 memset(&tc_police, 0, sizeof tc_police);
3754 tc_police.action = TC_POLICE_SHOT;
3755 tc_police.mtu = mtu;
3756 tc_fill_rate(&tc_police.rate, kbits_rate/8 * 1000, mtu);
3757 tc_police.burst = tc_bytes_to_ticks(tc_police.rate.rate,
3758 kbits_burst * 1024);
3760 tcmsg = tc_make_request(netdev, RTM_NEWTFILTER,
3761 NLM_F_EXCL | NLM_F_CREATE, &request);
3765 tcmsg->tcm_parent = tc_make_handle(0xffff, 0);
3766 tcmsg->tcm_info = tc_make_handle(49,
3767 (OVS_FORCE uint16_t) htons(ETH_P_ALL));
3769 nl_msg_put_string(&request, TCA_KIND, "basic");
3770 basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3771 police_offset = nl_msg_start_nested(&request, TCA_BASIC_POLICE);
3772 nl_msg_put_unspec(&request, TCA_POLICE_TBF, &tc_police, sizeof tc_police);
3773 tc_put_rtab(&request, TCA_POLICE_RATE, &tc_police.rate);
3774 nl_msg_end_nested(&request, police_offset);
3775 nl_msg_end_nested(&request, basic_offset);
3777 error = tc_transact(&request, NULL);
/* What the code after the long comment below does: it opens /proc/net/psched,
 * reads four hexadecimal values a, b, c and d from it, logs them at debug
 * level, and computes ticks_per_s as a * c / b in floating point.  Failure to
 * open or read the file, invalid parameters and unexpected parameters are each
 * reported with a warning.  The resulting ticks_per_s and buffer_hz are logged
 * at debug level at the end. */
3788 /* The values in psched are not individually very meaningful, but they are
3789 * important. The tables below show some values seen in the wild.
3793 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
3794 * (Before that, there are hints that it was 1000000000.)
3796 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
3800 * -----------------------------------
3801 * [1] 000c8000 000f4240 000f4240 00000064
3802 * [2] 000003e8 00000400 000f4240 3b9aca00
3803 * [3] 000003e8 00000400 000f4240 3b9aca00
3804 * [4] 000003e8 00000400 000f4240 00000064
3805 * [5] 000003e8 00000040 000f4240 3b9aca00
3806 * [6] 000003e8 00000040 000f4240 000000f9
3808 * a b c d ticks_per_s buffer_hz
3809 * ------- --------- ---------- ------------- ----------- -------------
3810 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
3811 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3812 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3813 * [4] 1,000 1,024 1,000,000 100 976,562 100
3814 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
3815 * [6] 1,000 64 1,000,000 249 15,625,000 249
3817 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
3818 * [2] 2.6.26-1-686-bigmem from Debian lenny
3819 * [3] 2.6.26-2-sparc64 from Debian lenny
3820 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
3821 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
3822 * [6] 2.6.34 from kernel.org on KVM
3824 static const char fn[] = "/proc/net/psched";
3825 unsigned int a, b, c, d;
3831 stream = fopen(fn, "r");
3833 VLOG_WARN("%s: open failed: %s", fn, strerror(errno));
3837 if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
3838 VLOG_WARN("%s: read failed", fn);
3842 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
3846 VLOG_WARN("%s: invalid scheduler parameters", fn);
3850 ticks_per_s = (double) a * c / b;
3854 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
3857 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);
3860 /* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
3861 * rate of 'rate' bytes per second. */
3863 tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
3868 return (rate * ticks) / ticks_per_s;
3871 /* Returns the number of ticks that it would take to transmit 'size' bytes at a
3872 * rate of 'rate' bytes per second. */
3874 tc_bytes_to_ticks(unsigned int rate, unsigned int size)
/* A 'rate' of 0 yields 0 instead of dividing by zero.  ticks_per_s is
 * truncated to an integer before the multiplication, which is carried out in
 * unsigned long long so that it does not overflow 32 bits. */
3879 return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
3882 /* Returns the number of bytes that need to be reserved for qdisc buffering at
3883 * a transmission rate of 'rate' bytes per second. */
3885 tc_buffer_per_jiffy(unsigned int rate)
/* Integer division: when buffer_hz is very large the result is simply 0,
 * meaning that no extra buffering is reserved. */
3890 return rate / buffer_hz;
3893 /* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
3894 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
3895 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
3896 * stores NULL into it if it is absent.
3898 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
3901 * Returns 0 if successful, otherwise a positive errno value. */
3903 tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
3904 struct nlattr **options)
3906 static const struct nl_policy tca_policy[] = {
3907 [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
3908 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
3910 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3912 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3913 tca_policy, ta, ARRAY_SIZE(ta))) {
3914 VLOG_WARN_RL(&rl, "failed to parse qdisc message");
3919 *kind = nl_attr_get_string(ta[TCA_KIND]);
3923 *options = ta[TCA_OPTIONS];
3938 /* Given Netlink 'msg' that describes a class, extracts the queue ID (e.g. the
3939 * minor number of its class ID) into '*queue_id', its TCA_OPTIONS attribute
3940 * into '*options', and its queue statistics into '*stats'. Any of the output
3941 * arguments may be null.
3943 * Returns 0 if successful, otherwise a positive errno value. */
3945 tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
3946 struct nlattr **options, struct netdev_queue_stats *stats)
3948 static const struct nl_policy tca_policy[] = {
3949 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
3950 [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
3952 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3954 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3955 tca_policy, ta, ARRAY_SIZE(ta))) {
3956 VLOG_WARN_RL(&rl, "failed to parse class message");
3961 struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
3962 *handlep = tc->tcm_handle;
3966 *options = ta[TCA_OPTIONS];
3970 const struct gnet_stats_queue *gsq;
3971 struct gnet_stats_basic gsb;
3973 static const struct nl_policy stats_policy[] = {
3974 [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
3975 .min_len = sizeof gsb },
3976 [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
3977 .min_len = sizeof *gsq },
3979 struct nlattr *sa[ARRAY_SIZE(stats_policy)];
3981 if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
3982 sa, ARRAY_SIZE(sa))) {
3983 VLOG_WARN_RL(&rl, "failed to parse class stats");
3987 /* Alignment issues screw up the length of struct gnet_stats_basic on
3988 * some arch/bitsize combinations. Newer versions of Linux have a
3989 * struct gnet_stats_basic_packed, but we can't depend on that. The
3990 * easiest thing to do is just to make a copy. */
3991 memset(&gsb, 0, sizeof gsb);
3992 memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
3993 MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
3994 stats->tx_bytes = gsb.bytes;
3995 stats->tx_packets = gsb.packets;
3997 gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
3998 stats->tx_errors = gsq->drops;
4008 memset(stats, 0, sizeof *stats);
4013 /* Queries the kernel for class with identifier 'handle' and parent 'parent'
4016 tc_query_class(const struct netdev *netdev,
4017 unsigned int handle, unsigned int parent,
4018 struct ofpbuf **replyp)
4020 struct ofpbuf request;
4021 struct tcmsg *tcmsg;
4024 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO, &request);
4028 tcmsg->tcm_handle = handle;
4029 tcmsg->tcm_parent = parent;
4031 error = tc_transact(&request, replyp);
4033 VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
4034 netdev_get_name(netdev),
4035 tc_get_major(handle), tc_get_minor(handle),
4036 tc_get_major(parent), tc_get_minor(parent),
4042 /* Equivalent to "tc class del dev <name> handle <handle>". */
4044 tc_delete_class(const struct netdev *netdev, unsigned int handle)
4046 struct ofpbuf request;
4047 struct tcmsg *tcmsg;
4050 tcmsg = tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
4054 tcmsg->tcm_handle = handle;
4055 tcmsg->tcm_parent = 0;
4057 error = tc_transact(&request, NULL);
4059 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
4060 netdev_get_name(netdev),
4061 tc_get_major(handle), tc_get_minor(handle),
4067 /* Equivalent to "tc qdisc del dev <name> root". */
4069 tc_del_qdisc(struct netdev *netdev)
4071 struct netdev_dev_linux *netdev_dev =
4072 netdev_dev_linux_cast(netdev_get_dev(netdev));
4073 struct ofpbuf request;
4074 struct tcmsg *tcmsg;
4077 tcmsg = tc_make_request(netdev, RTM_DELQDISC, 0, &request);
4081 tcmsg->tcm_handle = tc_make_handle(1, 0);
4082 tcmsg->tcm_parent = TC_H_ROOT;
4084 error = tc_transact(&request, NULL);
4085 if (error == EINVAL) {
4086 /* EINVAL probably means that the default qdisc was in use, in which
4087 * case we've accomplished our purpose. */
4090 if (!error && netdev_dev->tc) {
4091 if (netdev_dev->tc->ops->tc_destroy) {
4092 netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
4094 netdev_dev->tc = NULL;
4099 /* If 'netdev''s qdisc type and parameters are not yet known, queries the
4100 * kernel to determine what they are. Returns 0 if successful, otherwise a
4101 * positive errno value. */
4103 tc_query_qdisc(const struct netdev *netdev)
4105 struct netdev_dev_linux *netdev_dev =
4106 netdev_dev_linux_cast(netdev_get_dev(netdev));
4107 struct ofpbuf request, *qdisc;
4108 const struct tc_ops *ops;
4109 struct tcmsg *tcmsg;
4113 if (netdev_dev->tc) {
4117 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
4118 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
4119 * 2.6.35 without that fix backported to it.
4121 * To avoid the OOPS, we must not make a request that would attempt to dump
4122 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
4123 * few others. There are a few ways that I can see to do this, but most of
4124 * them seem to be racy (and if you lose the race the kernel OOPSes). The
4125 * technique chosen here is to assume that any non-default qdisc that we
4126 * create will have a class with handle 1:0. The built-in qdiscs only have
4127 * a class with handle 0:0.
4129 * We could check for Linux 2.6.35+ and use a more straightforward method
4131 tcmsg = tc_make_request(netdev, RTM_GETQDISC, NLM_F_ECHO, &request);
4135 tcmsg->tcm_handle = tc_make_handle(1, 0);
4136 tcmsg->tcm_parent = 0;
4138 /* Figure out what tc class to instantiate. */
4139 error = tc_transact(&request, &qdisc);
4143 error = tc_parse_qdisc(qdisc, &kind, NULL);
4145 ops = &tc_ops_other;
4147 ops = tc_lookup_linux_name(kind);
4149 static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
4150 VLOG_INFO_RL(&rl2, "unknown qdisc \"%s\"", kind);
4152 ops = &tc_ops_other;
4155 } else if (error == ENOENT) {
4156 /* Either it's a built-in qdisc, or it's a qdisc set up by some
4157 * other entity that doesn't have a handle 1:0. We will assume
4158 * that it's the system default qdisc. */
4159 ops = &tc_ops_default;
4162 /* Who knows? Maybe the device got deleted. */
4163 VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
4164 netdev_get_name(netdev), strerror(error));
4165 ops = &tc_ops_other;
4168 /* Instantiate it. */
4169 load_error = ops->tc_load((struct netdev *) netdev, qdisc);
4170 assert((load_error == 0) == (netdev_dev->tc != NULL));
4171 ofpbuf_delete(qdisc);
4173 return error ? error : load_error;
4176 /* Linux traffic control uses tables with 256 entries ("rtab" tables) to
4177 approximate the time to transmit packets of various lengths. For an MTU of
4178 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
4179 represents two possible packet lengths; for a MTU of 513 through 1024, four
4180 possible lengths; and so on.
4182 Returns, for the specified 'mtu', the number of bits that packet lengths
4183 need to be shifted right to fit within such a 256-entry table. */
4185 tc_calc_cell_log(unsigned int mtu)
4190 mtu = ETH_PAYLOAD_MAX;
4192 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
4194 for (cell_log = 0; mtu >= 256; cell_log++) {
4201 /* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
4204 tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
4206 memset(rate, 0, sizeof *rate);
4207 rate->cell_log = tc_calc_cell_log(mtu);
4208 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
4209 /* rate->cell_align = 0; */ /* distro headers. */
4210 rate->mpu = ETH_TOTAL_MIN;
4214 /* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
4215 * attribute of the specified "type".
4217 * See tc_calc_cell_log() above for a description of "rtab"s. */
4219 tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
4224 rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
4225 for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
4226 unsigned packet_size = (i + 1) << rate->cell_log;
4227 if (packet_size < rate->mpu) {
4228 packet_size = rate->mpu;
4230 rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
/* Calculates the proper value of 'buffer' or 'cbuffer' in HTB options given a
 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
 * burst size of 'burst_bytes'.  (If no value was requested, a 'burst_bytes' of
 * 0 is fine.)
 *
 * The burst actually used is never smaller than one jiffy's worth of buffering
 * plus one MTU.  It is clamped to UINT_MAX before being handed to
 * tc_bytes_to_ticks(), whose size parameter is only 'unsigned int'; without
 * the clamp a 64-bit request above that limit would silently wrap to a much
 * smaller burst. */
static int
tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
{
    unsigned int min_burst = tc_buffer_per_jiffy(Bps) + mtu;
    uint64_t burst = MAX(burst_bytes, min_burst);

    return tc_bytes_to_ticks(Bps, MIN(burst, UINT_MAX));
}
4245 /* Linux-only functions declared in netdev-linux.h */
4247 /* Returns a fd for an AF_INET socket or a negative errno value. */
4249 netdev_linux_get_af_inet_sock(void)
4251 int error = netdev_linux_init();
4252 return error ? -error : af_inet_sock;
4255 /* Modifies the 'flag' bit in ethtool's flags field for 'netdev'. If
4256 * 'enable' is true, the bit is set. Otherwise, it is cleared. */
4258 netdev_linux_ethtool_set_flag(struct netdev *netdev, uint32_t flag,
4259 const char *flag_name, bool enable)
4261 const char *netdev_name = netdev_get_name(netdev);
4262 struct ethtool_value evalue;
4266 memset(&evalue, 0, sizeof evalue);
4267 error = netdev_linux_do_ethtool(netdev_name,
4268 (struct ethtool_cmd *)&evalue,
4269 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
4274 evalue.data = new_flags = (evalue.data & ~flag) | (enable ? flag : 0);
4275 error = netdev_linux_do_ethtool(netdev_name,
4276 (struct ethtool_cmd *)&evalue,
4277 ETHTOOL_SFLAGS, "ETHTOOL_SFLAGS");
4282 memset(&evalue, 0, sizeof evalue);
4283 error = netdev_linux_do_ethtool(netdev_name,
4284 (struct ethtool_cmd *)&evalue,
4285 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
4290 if (new_flags != evalue.data) {
4291 VLOG_WARN_RL(&rl, "attempt to %s ethtool %s flag on network "
4292 "device %s failed", enable ? "enable" : "disable",
4293 flag_name, netdev_name);
4300 /* Utility functions. */
4302 /* Copies 'src' into 'dst', performing format conversion in the process. */
4304 netdev_stats_from_rtnl_link_stats(struct netdev_stats *dst,
4305 const struct rtnl_link_stats *src)
4307 dst->rx_packets = src->rx_packets;
4308 dst->tx_packets = src->tx_packets;
4309 dst->rx_bytes = src->rx_bytes;
4310 dst->tx_bytes = src->tx_bytes;
4311 dst->rx_errors = src->rx_errors;
4312 dst->tx_errors = src->tx_errors;
4313 dst->rx_dropped = src->rx_dropped;
4314 dst->tx_dropped = src->tx_dropped;
4315 dst->multicast = src->multicast;
4316 dst->collisions = src->collisions;
4317 dst->rx_length_errors = src->rx_length_errors;
4318 dst->rx_over_errors = src->rx_over_errors;
4319 dst->rx_crc_errors = src->rx_crc_errors;
4320 dst->rx_frame_errors = src->rx_frame_errors;
4321 dst->rx_fifo_errors = src->rx_fifo_errors;
4322 dst->rx_missed_errors = src->rx_missed_errors;
4323 dst->tx_aborted_errors = src->tx_aborted_errors;
4324 dst->tx_carrier_errors = src->tx_carrier_errors;
4325 dst->tx_fifo_errors = src->tx_fifo_errors;
4326 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
4327 dst->tx_window_errors = src->tx_window_errors;
4331 get_stats_via_netlink(int ifindex, struct netdev_stats *stats)
4333 /* Policy for RTNLGRP_LINK messages.
4335 * There are *many* more fields in these messages, but currently we only
4336 * care about these fields. */
4337 static const struct nl_policy rtnlgrp_link_policy[] = {
4338 [IFLA_IFNAME] = { .type = NL_A_STRING, .optional = false },
4339 [IFLA_STATS] = { .type = NL_A_UNSPEC, .optional = true,
4340 .min_len = sizeof(struct rtnl_link_stats) },
4343 struct ofpbuf request;
4344 struct ofpbuf *reply;
4345 struct ifinfomsg *ifi;
4346 struct nlattr *attrs[ARRAY_SIZE(rtnlgrp_link_policy)];
4349 ofpbuf_init(&request, 0);
4350 nl_msg_put_nlmsghdr(&request, sizeof *ifi, RTM_GETLINK, NLM_F_REQUEST);
4351 ifi = ofpbuf_put_zeros(&request, sizeof *ifi);
4352 ifi->ifi_family = PF_UNSPEC;
4353 ifi->ifi_index = ifindex;
4354 error = nl_sock_transact(rtnl_sock, &request, &reply);
4355 ofpbuf_uninit(&request);
4360 if (!nl_policy_parse(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg),
4361 rtnlgrp_link_policy,
4362 attrs, ARRAY_SIZE(rtnlgrp_link_policy))) {
4363 ofpbuf_delete(reply);
4367 if (!attrs[IFLA_STATS]) {
4368 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
4369 ofpbuf_delete(reply);
4373 netdev_stats_from_rtnl_link_stats(stats, nl_attr_get(attrs[IFLA_STATS]));
4375 ofpbuf_delete(reply);
4381 get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats)
4383 static const char fn[] = "/proc/net/dev";
4388 stream = fopen(fn, "r");
4390 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
4395 while (fgets(line, sizeof line, stream)) {
4398 #define X64 "%"SCNu64
4401 X64 X64 X64 X64 X64 X64 X64 "%*u"
4402 X64 X64 X64 X64 X64 X64 X64 "%*u",
4408 &stats->rx_fifo_errors,
4409 &stats->rx_frame_errors,
4415 &stats->tx_fifo_errors,
4417 &stats->tx_carrier_errors) != 15) {
4418 VLOG_WARN_RL(&rl, "%s:%d: parse error", fn, ln);
4419 } else if (!strcmp(devname, netdev_name)) {
4420 stats->rx_length_errors = UINT64_MAX;
4421 stats->rx_over_errors = UINT64_MAX;
4422 stats->rx_crc_errors = UINT64_MAX;
4423 stats->rx_missed_errors = UINT64_MAX;
4424 stats->tx_aborted_errors = UINT64_MAX;
4425 stats->tx_heartbeat_errors = UINT64_MAX;
4426 stats->tx_window_errors = UINT64_MAX;
4432 VLOG_WARN_RL(&rl, "%s: no stats for %s", fn, netdev_name);
4438 get_flags(const struct netdev_dev *dev, unsigned int *flags)
4444 error = netdev_linux_do_ioctl(dev->name, &ifr, SIOCGIFFLAGS,
4447 *flags = ifr.ifr_flags;
4453 set_flags(struct netdev *netdev, unsigned int flags)
4457 ifr.ifr_flags = flags;
4458 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, SIOCSIFFLAGS,
4463 do_get_ifindex(const char *netdev_name)
4467 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4468 COVERAGE_INC(netdev_get_ifindex);
4469 if (ioctl(af_inet_sock, SIOCGIFINDEX, &ifr) < 0) {
4470 VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
4471 netdev_name, strerror(errno));
4474 return ifr.ifr_ifindex;
4478 get_ifindex(const struct netdev *netdev_, int *ifindexp)
4480 struct netdev_dev_linux *netdev_dev =
4481 netdev_dev_linux_cast(netdev_get_dev(netdev_));
4483 if (!(netdev_dev->cache_valid & VALID_IFINDEX)) {
4484 int ifindex = do_get_ifindex(netdev_get_name(netdev_));
4487 netdev_dev->get_ifindex_error = -ifindex;
4488 netdev_dev->ifindex = 0;
4490 netdev_dev->get_ifindex_error = 0;
4491 netdev_dev->ifindex = ifindex;
4493 netdev_dev->cache_valid |= VALID_IFINDEX;
4496 *ifindexp = netdev_dev->ifindex;
4497 return netdev_dev->get_ifindex_error;
4501 get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN])
4506 memset(&ifr, 0, sizeof ifr);
4507 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4508 COVERAGE_INC(netdev_get_hwaddr);
4509 if (ioctl(af_inet_sock, SIOCGIFHWADDR, &ifr) < 0) {
4510 /* ENODEV probably means that a vif disappeared asynchronously and
4511 * hasn't been removed from the database yet, so reduce the log level
4512 * to INFO for that case. */
4513 VLOG(errno == ENODEV ? VLL_INFO : VLL_ERR,
4514 "ioctl(SIOCGIFHWADDR) on %s device failed: %s",
4515 netdev_name, strerror(errno));
4518 hwaddr_family = ifr.ifr_hwaddr.sa_family;
4519 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER) {
4520 VLOG_WARN("%s device has unknown hardware address family %d",
4521 netdev_name, hwaddr_family);
4523 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
4528 set_etheraddr(const char *netdev_name,
4529 const uint8_t mac[ETH_ADDR_LEN])
4533 memset(&ifr, 0, sizeof ifr);
4534 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4535 ifr.ifr_hwaddr.sa_family = ARPHRD_ETHER;
4536 memcpy(ifr.ifr_hwaddr.sa_data, mac, ETH_ADDR_LEN);
4537 COVERAGE_INC(netdev_set_hwaddr);
4538 if (ioctl(af_inet_sock, SIOCSIFHWADDR, &ifr) < 0) {
4539 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
4540 netdev_name, strerror(errno));
4547 netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
4548 int cmd, const char *cmd_name)
4552 memset(&ifr, 0, sizeof ifr);
4553 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
4554 ifr.ifr_data = (caddr_t) ecmd;
4557 COVERAGE_INC(netdev_ethtool);
4558 if (ioctl(af_inet_sock, SIOCETHTOOL, &ifr) == 0) {
4561 if (errno != EOPNOTSUPP) {
4562 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
4563 "failed: %s", cmd_name, name, strerror(errno));
4565 /* The device doesn't support this operation. That's pretty
4566 * common, so there's no point in logging anything. */
4573 netdev_linux_do_ioctl(const char *name, struct ifreq *ifr, int cmd,
4574 const char *cmd_name)
4576 ovs_strzcpy(ifr->ifr_name, name, sizeof ifr->ifr_name);
4577 if (ioctl(af_inet_sock, cmd, ifr) == -1) {
4578 VLOG_DBG_RL(&rl, "%s: ioctl(%s) failed: %s", name, cmd_name,
4586 netdev_linux_get_ipv4(const struct netdev *netdev, struct in_addr *ip,
4587 int cmd, const char *cmd_name)
4592 ifr.ifr_addr.sa_family = AF_INET;
4593 error = netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, cmd, cmd_name);
4595 const struct sockaddr_in *sin = (struct sockaddr_in *) &ifr.ifr_addr;
4596 *ip = sin->sin_addr;
4601 /* Returns an AF_PACKET raw socket or a negative errno value. */
4603 af_packet_sock(void)
4605 static int sock = INT_MIN;
4607 if (sock == INT_MIN) {
4608 sock = socket(AF_PACKET, SOCK_RAW, 0);
4610 set_nonblocking(sock);
4613 VLOG_ERR("failed to create packet socket: %s", strerror(errno));