2 * Copyright (c) 2009, 2010, 2011, 2012 Nicira Networks.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
19 #include "netdev-linux.h"
24 #include <arpa/inet.h>
26 #include <linux/gen_stats.h>
27 #include <linux/if_ether.h>
28 #include <linux/if_tun.h>
30 #include <linux/types.h>
31 #include <linux/ethtool.h>
32 #include <linux/mii.h>
33 #include <linux/pkt_cls.h>
34 #include <linux/pkt_sched.h>
35 #include <linux/rtnetlink.h>
36 #include <linux/sockios.h>
37 #include <linux/version.h>
38 #include <sys/types.h>
39 #include <sys/ioctl.h>
40 #include <sys/socket.h>
41 #include <netpacket/packet.h>
43 #include <net/if_arp.h>
44 #include <net/if_packet.h>
45 #include <net/route.h>
46 #include <netinet/in.h>
53 #include "dpif-linux.h"
54 #include "dynamic-string.h"
55 #include "fatal-signal.h"
58 #include "netdev-provider.h"
59 #include "netdev-vport.h"
61 #include "netlink-notifier.h"
62 #include "netlink-socket.h"
64 #include "openflow/openflow.h"
66 #include "poll-loop.h"
67 #include "rtnetlink-link.h"
68 #include "socket-util.h"
74 VLOG_DEFINE_THIS_MODULE(netdev_linux);
76 COVERAGE_DEFINE(netdev_set_policing);
77 COVERAGE_DEFINE(netdev_arp_lookup);
78 COVERAGE_DEFINE(netdev_get_ifindex);
79 COVERAGE_DEFINE(netdev_get_hwaddr);
80 COVERAGE_DEFINE(netdev_set_hwaddr);
81 COVERAGE_DEFINE(netdev_ethtool);
84 /* These were introduced in Linux 2.6.14, so they might be missing if we have
 * old headers. */
86 #ifndef ADVERTISED_Pause
87 #define ADVERTISED_Pause (1 << 13)
89 #ifndef ADVERTISED_Asym_Pause
90 #define ADVERTISED_Asym_Pause (1 << 14)
93 /* These were introduced in Linux 2.6.24, so they might be missing if we
94 * have old headers. */
95 #ifndef ETHTOOL_GFLAGS
96 #define ETHTOOL_GFLAGS 0x00000025 /* Get flags bitmap(ethtool_value) */
98 #ifndef ETHTOOL_SFLAGS
99 #define ETHTOOL_SFLAGS 0x00000026 /* Set flags bitmap(ethtool_value) */
102 /* This was introduced in Linux 2.6.25, so it might be missing if we have old
 * headers. */
105 #define TC_RTAB_SIZE 1024
108 static struct nln_notifier *netdev_linux_cache_notifier = NULL;
109 static int cache_notifier_refcount;
112 VALID_IFINDEX = 1 << 0,
113 VALID_ETHERADDR = 1 << 1,
117 VALID_POLICING = 1 << 5,
118 VALID_VPORT_STAT_ERROR = 1 << 6,
119 VALID_DRVINFO = 1 << 7,
120 VALID_FEATURES = 1 << 8,
128 /* Traffic control. */
130 /* An instance of a traffic control class. Always associated with a particular
133 * Each TC implementation subclasses this with whatever additional data it
136 const struct tc_ops *ops;
137 struct hmap queues; /* Contains "struct tc_queue"s.
138 * Read by generic TC layer.
139 * Written only by TC implementation. */
142 /* One traffic control queue.
144 * Each TC implementation subclasses this with whatever additional data it
147 struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
148 unsigned int queue_id; /* OpenFlow queue ID. */
151 /* A particular kind of traffic control. Each implementation generally maps to
152 * one particular Linux qdisc class.
154 * The functions below return 0 if successful or a positive errno value on
155 * failure, except where otherwise noted. All of them must be provided, except
156 * where otherwise noted. */
158 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
159 * This is null for tc_ops_default and tc_ops_other, for which there are no
160 * appropriate values. */
161 const char *linux_name;
163 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
164 const char *ovs_name;
166 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
167 * queues. The queues are numbered 0 through n_queues - 1. */
168 unsigned int n_queues;
170 /* Called to install this TC class on 'netdev'. The implementation should
171 * make the Netlink calls required to set up 'netdev' with the right qdisc
172 * and configure it according to 'details'. The implementation may assume
173 * that the current qdisc is the default; that is, there is no need for it
174 * to delete the current qdisc before installing itself.
176 * The contents of 'details' should be documented as valid for 'ovs_name'
177 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
178 * (which is built as ovs-vswitchd.conf.db(8)).
180 * This function must return 0 if and only if it sets 'netdev->tc' to an
181 * initialized 'struct tc'.
183 * (This function is null for tc_ops_other, which cannot be installed. For
184 * other TC classes it should always be nonnull.) */
185 int (*tc_install)(struct netdev *netdev, const struct shash *details);
187 /* Called when the netdev code determines (through a Netlink query) that
188 * this TC class's qdisc is installed on 'netdev', but we didn't install
189 * it ourselves and so don't know any of the details.
191 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
192 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
193 * implementation should parse the other attributes of 'nlmsg' as
194 * necessary to determine its configuration. If necessary it should also
195 * use Netlink queries to determine the configuration of queues on
 * 'netdev'.
198 * This function must return 0 if and only if it sets 'netdev->tc' to an
199 * initialized 'struct tc'. */
200 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
202 /* Destroys the data structures allocated by the implementation as part of
203 * 'tc'. (This includes destroying 'tc->queues' by calling
 * tc_destroy(tc).)
206 * The implementation should not need to perform any Netlink calls. If
207 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
208 * (But it may not be desirable.)
210 * This function may be null if 'tc' is trivial. */
211 void (*tc_destroy)(struct tc *tc);
213 /* Retrieves details of 'netdev->tc' configuration into 'details'.
215 * The implementation should not need to perform any Netlink calls, because
216 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
217 * cached the configuration.
219 * The contents of 'details' should be documented as valid for 'ovs_name'
220 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
221 * (which is built as ovs-vswitchd.conf.db(8)).
223 * This function may be null if 'tc' is not configurable. */
225 int (*qdisc_get)(const struct netdev *netdev, struct shash *details);
227 /* Reconfigures 'netdev->tc' according to 'details', performing any
228 * required Netlink calls to complete the reconfiguration.
230 * The contents of 'details' should be documented as valid for 'ovs_name'
231 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
232 * (which is built as ovs-vswitchd.conf.db(8)).
234 * This function may be null if 'tc' is not configurable. */
236 int (*qdisc_set)(struct netdev *, const struct shash *details);
238 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
239 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
241 * The contents of 'details' should be documented as valid for 'ovs_name'
242 * in the "other_config" column in the "Queue" table in
243 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
245 * The implementation should not need to perform any Netlink calls, because
246 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
247 * cached the queue configuration.
249 * This function may be null if 'tc' does not have queues ('n_queues' is
 * zero). */
251 int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
252 struct shash *details);
254 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
255 * 'details', performing any required Netlink calls to complete the
256 * reconfiguration. The caller ensures that 'queue_id' is less than
 * 'n_queues'.
259 * The contents of 'details' should be documented as valid for 'ovs_name'
260 * in the "other_config" column in the "Queue" table in
261 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
263 * This function may be null if 'tc' does not have queues or its queues are
264 * not configurable. */
265 int (*class_set)(struct netdev *, unsigned int queue_id,
266 const struct shash *details);
268 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
269 * tc_queue's within 'netdev->tc->queues'.
271 * This function may be null if 'tc' does not have queues or its queues
272 * cannot be deleted. */
273 int (*class_delete)(struct netdev *, struct tc_queue *queue);
275 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
276 * 'struct tc_queue's within 'netdev->tc->queues'.
278 * On success, initializes '*stats'.
280 * This function may be null if 'tc' does not have queues or if it cannot
281 * report queue statistics. */
282 int (*class_get_stats)(const struct netdev *netdev,
283 const struct tc_queue *queue,
284 struct netdev_queue_stats *stats);
286 /* Extracts queue stats from 'nlmsg', which is a response to a
287 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
289 * This function may be null if 'tc' does not have queues or if it cannot
290 * report queue statistics. */
291 int (*class_dump_stats)(const struct netdev *netdev,
292 const struct ofpbuf *nlmsg,
293 netdev_dump_queue_stats_cb *cb, void *aux);
/* Initializes the generic part of 'tc' by setting up its 'queues' hmap.
 *
 * NOTE(review): 'ops' is presumably stored in 'tc->ops' by a line that is not
 * visible here -- confirm. */
297 tc_init(struct tc *tc, const struct tc_ops *ops)
300 hmap_init(&tc->queues);
/* Tears down the generic part of 'tc' by destroying its 'queues' hmap. */
304 tc_destroy(struct tc *tc)
306 hmap_destroy(&tc->queues);
309 static const struct tc_ops tc_ops_htb;
310 static const struct tc_ops tc_ops_hfsc;
311 static const struct tc_ops tc_ops_default;
312 static const struct tc_ops tc_ops_other;
314 static const struct tc_ops *tcs[] = {
315 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
316 &tc_ops_hfsc, /* Hierarchical fair service curve. */
317 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
318 &tc_ops_other, /* Some other qdisc. */
322 static unsigned int tc_make_handle(unsigned int major, unsigned int minor);
323 static unsigned int tc_get_major(unsigned int handle);
324 static unsigned int tc_get_minor(unsigned int handle);
326 static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
327 static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
328 static unsigned int tc_buffer_per_jiffy(unsigned int rate);
330 static struct tcmsg *tc_make_request(const struct netdev *, int type,
331 unsigned int flags, struct ofpbuf *);
332 static int tc_transact(struct ofpbuf *request, struct ofpbuf **replyp);
333 static int tc_add_del_ingress_qdisc(struct netdev *netdev, bool add);
334 static int tc_add_policer(struct netdev *netdev, int kbits_rate,
337 static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
338 struct nlattr **options);
339 static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
340 struct nlattr **options,
341 struct netdev_queue_stats *);
342 static int tc_query_class(const struct netdev *,
343 unsigned int handle, unsigned int parent,
344 struct ofpbuf **replyp);
345 static int tc_delete_class(const struct netdev *, unsigned int handle);
347 static int tc_del_qdisc(struct netdev *netdev);
348 static int tc_query_qdisc(const struct netdev *netdev);
350 static int tc_calc_cell_log(unsigned int mtu);
351 static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
352 static void tc_put_rtab(struct ofpbuf *, uint16_t type,
353 const struct tc_ratespec *rate);
354 static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
/* Linux-specific state for one network device.  It embeds the generic
 * 'struct netdev_dev' as member 'netdev_dev'; netdev_dev_linux_cast() recovers
 * the enclosing structure from a pointer to that member. */
356 struct netdev_dev_linux {
357 struct netdev_dev netdev_dev;
359 struct shash_node *shash_node;
/* Bitwise OR of VALID_* flags naming which cached members are current. */
360 unsigned int cache_valid;
/* Set to 1 at creation; netdev_dev_linux_changed() tests it for zero. */
361 unsigned int change_seq;
363 bool miimon; /* Link status of last poll. */
364 long long int miimon_interval; /* Miimon Poll rate. Disabled if <= 0. */
/* Re-armed with 'miimon_interval' after each poll in the miimon run loop. */
365 struct timer miimon_timer;
367 /* The following are figured out "on demand" only. They are only valid
368 * when the corresponding VALID_* bit in 'cache_valid' is set. */
370 uint8_t etheraddr[ETH_ADDR_LEN];
371 struct in_addr address, netmask;
/* Interface flags as last recorded by netdev_dev_linux_changed(). */
374 unsigned int ifi_flags;
/* Incremented whenever IFF_RUNNING differs between old and new flags. */
375 long long int carrier_resets;
376 uint32_t kbits_rate; /* Policing data. */
377 uint32_t kbits_burst;
378 int vport_stats_error; /* Cached error code from vport_get_stats().
379 0 or an errno value. */
380 int netdev_mtu_error; /* Cached error code from SIOCGIFMTU or SIOCSIFMTU. */
381 int ether_addr_error; /* Cached error code from set/get etheraddr. */
382 int netdev_policing_error; /* Cached error code from set policing. */
383 int get_features_error; /* Cached error code from ETHTOOL_GSET. */
384 int get_ifindex_error; /* Cached error code from SIOCGIFINDEX. */
386 uint32_t current; /* Cached from ETHTOOL_GSET. */
387 uint32_t advertised; /* Cached from ETHTOOL_GSET. */
388 uint32_t supported; /* Cached from ETHTOOL_GSET. */
389 uint32_t peer; /* Cached from ETHTOOL_GSET. */
391 struct ethtool_drvinfo drvinfo; /* Cached from ETHTOOL_GDRVINFO. */
/* Tap-only state, reached elsewhere in this file as 'state.tap' (its 'fd'
 * and 'opened' members are used by the create/open/destroy functions). */
395 struct tap_state tap;
/* One open handle on a Linux network device.  It embeds the generic
 * 'struct netdev'; netdev_linux_cast() recovers the enclosing structure.  The
 * rest of this file also uses an 'fd' member, where a negative value means
 * that no descriptor is attached to the handle. */
399 struct netdev_linux {
400 struct netdev netdev;
404 /* Sockets used for ioctl operations. */
405 static int af_inet_sock = -1; /* AF_INET, SOCK_DGRAM. */
407 /* A Netlink routing socket that is not subscribed to any multicast groups. */
408 static struct nl_sock *rtnl_sock;
410 /* This is set pretty low because we probably won't learn anything from the
411 * additional log messages. */
412 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
414 static int netdev_linux_init(void);
416 static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
417 int cmd, const char *cmd_name);
418 static int netdev_linux_do_ioctl(const char *name, struct ifreq *, int cmd,
419 const char *cmd_name);
420 static int netdev_linux_get_ipv4(const struct netdev *, struct in_addr *,
421 int cmd, const char *cmd_name);
422 static int get_flags(const struct netdev_dev *, unsigned int *flags);
423 static int set_flags(struct netdev *, unsigned int flags);
424 static int do_get_ifindex(const char *netdev_name);
425 static int get_ifindex(const struct netdev *, int *ifindexp);
426 static int do_set_addr(struct netdev *netdev,
427 int ioctl_nr, const char *ioctl_name,
428 struct in_addr addr);
429 static int get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN]);
430 static int set_etheraddr(const char *netdev_name, const uint8_t[ETH_ADDR_LEN]);
431 static int get_stats_via_netlink(int ifindex, struct netdev_stats *stats);
432 static int get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats);
433 static int af_packet_sock(void);
434 static void netdev_linux_miimon_run(void);
435 static void netdev_linux_miimon_wait(void);
/* Reports whether 'netdev_class' belongs to this file's implementation, which
 * is decided solely by its 'init' member pointing at netdev_linux_init(). */
438 is_netdev_linux_class(const struct netdev_class *netdev_class)
440 return netdev_class->init == netdev_linux_init;
/* Converts the generic 'netdev_dev' into the 'struct netdev_dev_linux' that
 * contains it.  Asserts that the device's class passes
 * is_netdev_linux_class() before converting. */
443 static struct netdev_dev_linux *
444 netdev_dev_linux_cast(const struct netdev_dev *netdev_dev)
446 const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
447 assert(is_netdev_linux_class(netdev_class));
449 return CONTAINER_OF(netdev_dev, struct netdev_dev_linux, netdev_dev);
/* Converts the generic 'netdev' into the 'struct netdev_linux' that contains
 * it.  Asserts that the class of the underlying device passes
 * is_netdev_linux_class() before converting. */
452 static struct netdev_linux *
453 netdev_linux_cast(const struct netdev *netdev)
455 struct netdev_dev *netdev_dev = netdev_get_dev(netdev);
456 const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
457 assert(is_netdev_linux_class(netdev_class));
459 return CONTAINER_OF(netdev, struct netdev_linux, netdev);
/* Sets up the file-scope sockets shared by this module: 'af_inet_sock' (an
 * AF_INET datagram socket used for ioctls) and 'rtnl_sock' (a NETLINK_ROUTE
 * socket).  Each failure is logged.
 *
 * NOTE(review): 'status' is static and starts at -1, presumably so that the
 * setup runs only once and later calls return the remembered result; the
 * guarding condition is not visible here -- confirm. */
463 netdev_linux_init(void)
465 static int status = -1;
467 /* Create AF_INET socket. */
468 af_inet_sock = socket(AF_INET, SOCK_DGRAM, 0);
/* 0 on success, otherwise the errno left by socket(). */
469 status = af_inet_sock >= 0 ? 0 : errno;
471 VLOG_ERR("failed to create inet socket: %s", strerror(status));
474 /* Create rtnetlink socket. */
476 status = nl_sock_create(NETLINK_ROUTE, &rtnl_sock);
478 VLOG_ERR_RL(&rl, "failed to create rtnetlink socket: %s",
/* Periodic work for this module: runs the rtnetlink link notifier and then
 * the MII link-status polling. */
487 netdev_linux_run(void)
489 rtnetlink_link_run();
490 netdev_linux_miimon_run();
/* Counterpart of netdev_linux_run(): calls the wait functions of the
 * rtnetlink link notifier and of the MII link-status polling. */
494 netdev_linux_wait(void)
496 rtnetlink_link_wait();
497 netdev_linux_miimon_wait();
/* Fills 'netdev_dev->drvinfo' with driver information obtained through
 * ethtool and marks it cached by setting VALID_DRVINFO in 'cache_valid'.
 * When that bit is already set, the branch below handles the cached case
 * (its body is not visible here). */
501 netdev_linux_get_drvinfo(struct netdev_dev_linux *netdev_dev)
506 if (netdev_dev->cache_valid & VALID_DRVINFO) {
510 memset(&netdev_dev->drvinfo, 0, sizeof netdev_dev->drvinfo);
/* netdev_linux_do_ethtool() is declared to take a 'struct ethtool_cmd *',
 * hence the cast of the drvinfo buffer. */
511 error = netdev_linux_do_ethtool(netdev_dev->netdev_dev.name,
512 (struct ethtool_cmd *)&netdev_dev->drvinfo,
516 netdev_dev->cache_valid |= VALID_DRVINFO;
/* Records that 'dev' changed and that its interface flags are now
 * 'ifi_flags'.
 *
 * A difference in IFF_RUNNING between the stored and the new flags counts as
 * one carrier reset.  Every bit of 'cache_valid' that is not also set in
 * 'mask' is cleared, so a mask of 0 invalidates the whole cache.  ('mask' is
 * a parameter declared on a line not visible here; callers in this file pass
 * VALID_DRVINFO or 0.) */
522 netdev_dev_linux_changed(struct netdev_dev_linux *dev,
523 unsigned int ifi_flags,
527 if (!dev->change_seq) {
531 if ((dev->ifi_flags ^ ifi_flags) & IFF_RUNNING) {
532 dev->carrier_resets++;
534 dev->ifi_flags = ifi_flags;
536 dev->cache_valid &= mask;
/* Applies one rtnetlink link notification, 'change', to 'dev'.
 *
 * For RTM_NEWLINK the cache is invalidated except for VALID_DRVINFO and is
 * then refilled with the MTU, Ethernet address and ifindex carried by
 * 'change', clearing the matching cached error codes.  Some of the guarding
 * conditions are on lines not visible here.
 *
 * NOTE(review): the last call, with mask 0, drops every cached value; it is
 * presumably the branch for message types other than RTM_NEWLINK -- confirm. */
540 netdev_dev_linux_update(struct netdev_dev_linux *dev,
541 const struct rtnetlink_link_change *change)
543 if (change->nlmsg_type == RTM_NEWLINK) {
545 netdev_dev_linux_changed(dev, change->ifi_flags, VALID_DRVINFO);
547 /* Update netdev from rtnl-change msg. */
549 dev->mtu = change->mtu;
550 dev->cache_valid |= VALID_MTU;
551 dev->netdev_mtu_error = 0;
/* An all-zero address in the notification is not cached. */
554 if (!eth_addr_is_zero(change->addr)) {
555 memcpy(dev->etheraddr, change->addr, ETH_ADDR_LEN);
556 dev->cache_valid |= VALID_ETHERADDR;
557 dev->ether_addr_error = 0;
560 dev->ifindex = change->ifi_index;
561 dev->cache_valid |= VALID_IFINDEX;
562 dev->get_ifindex_error = 0;
565 netdev_dev_linux_changed(dev, change->ifi_flags, 0);
/* Callback registered with the rtnetlink link notifier (see
 * cache_notifier_ref()); 'aux' is unused.
 *
 * The first part looks up the device named by 'change->ifname' and, if it
 * belongs to this implementation, feeds 'change' to
 * netdev_dev_linux_update().  The second part walks every device of
 * 'netdev_linux_class', re-reads its flags and invalidates its whole cache.
 *
 * NOTE(review): the condition selecting between the two parts is not visible
 * here; presumably the second part runs when 'change' is null -- confirm. */
570 netdev_linux_cache_cb(const struct rtnetlink_link_change *change,
571 void *aux OVS_UNUSED)
573 struct netdev_dev_linux *dev;
575 struct netdev_dev *base_dev = netdev_dev_from_name(change->ifname);
577 const struct netdev_class *netdev_class =
578 netdev_dev_get_class(base_dev);
580 if (is_netdev_linux_class(netdev_class)) {
581 dev = netdev_dev_linux_cast(base_dev);
582 netdev_dev_linux_update(dev, change);
586 struct shash device_shash;
587 struct shash_node *node;
589 shash_init(&device_shash);
590 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
591 SHASH_FOR_EACH (node, &device_shash) {
/* The result of get_flags() is not checked on the visible lines. */
596 get_flags(&dev->netdev_dev, &flags);
597 netdev_dev_linux_changed(dev, flags, 0);
599 shash_destroy(&device_shash);
/* Takes one reference on the shared rtnetlink link notifier.  The first
 * reference creates the notifier with netdev_linux_cache_cb() as its
 * callback; a failed creation is handled in the branch below (body not
 * visible here).  The matching release is cache_notifier_unref(). */
604 cache_notifier_ref(void)
606 if (!cache_notifier_refcount) {
607 assert(!netdev_linux_cache_notifier);
609 netdev_linux_cache_notifier =
610 rtnetlink_link_notifier_create(netdev_linux_cache_cb, NULL);
612 if (!netdev_linux_cache_notifier) {
616 cache_notifier_refcount++;
/* Drops one reference taken by cache_notifier_ref().  Releasing the last
 * reference destroys the notifier and resets the file-scope pointer to
 * NULL. */
622 cache_notifier_unref(void)
624 assert(cache_notifier_refcount > 0);
625 if (!--cache_notifier_refcount) {
626 assert(netdev_linux_cache_notifier);
627 rtnetlink_link_notifier_destroy(netdev_linux_cache_notifier);
628 netdev_linux_cache_notifier = NULL;
632 /* Creates system and internal devices. */
/* Takes a reference on the shared link notifier, allocates a zeroed
 * 'struct netdev_dev_linux' for 'name' with 'change_seq' starting at 1,
 * primes 'ifi_flags' from get_flags(), and stores the generic part in
 * '*netdev_devp'. */
634 netdev_linux_create(const struct netdev_class *class, const char *name,
635 struct netdev_dev **netdev_devp)
637 struct netdev_dev_linux *netdev_dev;
640 error = cache_notifier_ref();
645 netdev_dev = xzalloc(sizeof *netdev_dev);
646 netdev_dev->change_seq = 1;
647 netdev_dev_init(&netdev_dev->netdev_dev, name, class);
/* The result of get_flags() is not checked on the visible lines. */
648 get_flags(&netdev_dev->netdev_dev, &netdev_dev->ifi_flags);
650 *netdev_devp = &netdev_dev->netdev_dev;
654 /* For most types of netdevs we open the device for each call of
655 * netdev_open(). However, this is not the case with tap devices,
656 * since it is only possible to open the device once. In this
657 * situation we share a single file descriptor, and consequently
658 * buffers, across all readers. Therefore once data is read it will
659 * be unavailable to other reads for tap devices. */
/* Creates the tap device 'name': opens /dev/net/tun, attaches it to 'name'
 * with TUNSETIFF, makes the descriptor non-blocking, and stores the new
 * device in '*netdev_devp'.  'class' is unused; the device is always
 * initialized with 'netdev_tap_class'.
 *
 * NOTE(review): on the visible lines, the failure paths after open() has
 * succeeded (TUNSETIFF, set_nonblocking()) jump to 'error_unref_notifier'
 * without a visible close(state->fd) -- confirm the descriptor is not
 * leaked. */
661 netdev_linux_create_tap(const struct netdev_class *class OVS_UNUSED,
662 const char *name, struct netdev_dev **netdev_devp)
664 struct netdev_dev_linux *netdev_dev;
665 struct tap_state *state;
666 static const char tap_dev[] = "/dev/net/tun";
670 netdev_dev = xzalloc(sizeof *netdev_dev);
671 state = &netdev_dev->state.tap;
673 error = cache_notifier_ref();
678 /* Open tap device. */
679 state->fd = open(tap_dev, O_RDWR);
682 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, strerror(error));
683 goto error_unref_notifier;
686 /* Create tap device. */
687 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
688 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
689 if (ioctl(state->fd, TUNSETIFF, &ifr) == -1) {
690 VLOG_WARN("%s: creating tap device failed: %s", name,
693 goto error_unref_notifier;
696 /* Make non-blocking. */
697 error = set_nonblocking(state->fd);
699 goto error_unref_notifier;
702 netdev_dev_init(&netdev_dev->netdev_dev, name, &netdev_tap_class);
703 *netdev_devp = &netdev_dev->netdev_dev;
/* Failure path: give back the notifier reference taken above. */
706 error_unref_notifier:
707 cache_notifier_unref();
/* Releases the tap-specific state of 'netdev_dev'.  Acts only when the tap
 * descriptor is valid (non-negative); the action itself is on a line not
 * visible here. */
714 destroy_tap(struct netdev_dev_linux *netdev_dev)
716 struct tap_state *state = &netdev_dev->state.tap;
718 if (state->fd >= 0) {
723 /* Destroys the netdev device 'netdev_dev_'. */
/* In order: destroys any traffic-control state through its 'tc_destroy' hook
 * (skipped when the hook is null), releases tap state for tap-class devices,
 * and drops the link-notifier reference taken at creation. */
725 netdev_linux_destroy(struct netdev_dev *netdev_dev_)
727 struct netdev_dev_linux *netdev_dev = netdev_dev_linux_cast(netdev_dev_);
728 const struct netdev_class *class = netdev_dev_get_class(netdev_dev_);
730 if (netdev_dev->tc && netdev_dev->tc->ops->tc_destroy) {
731 netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
734 if (class == &netdev_tap_class) {
735 destroy_tap(netdev_dev);
739 cache_notifier_unref();
/* Opens a new handle on 'netdev_dev_' and stores it in '*netdevp'.
 *
 * Except for internal devices, the device's existence is checked by reading
 * its flags.  The first handle opened on a tap device receives the tap file
 * descriptor.  The trailing netdev_uninit() call is presumably the failure
 * path (its label is not visible here). */
743 netdev_linux_open(struct netdev_dev *netdev_dev_, struct netdev **netdevp)
745 struct netdev_dev_linux *netdev_dev = netdev_dev_linux_cast(netdev_dev_);
746 struct netdev_linux *netdev;
747 enum netdev_flags flags;
750 /* Allocate network device. */
751 netdev = xzalloc(sizeof *netdev);
753 netdev_init(&netdev->netdev, netdev_dev_);
755 /* Verify that the device really exists, by attempting to read its flags.
756 * (The flags might be cached, in which case this won't actually do an
759 * Don't do this for "internal" netdevs, though, because those have to be
760 * created as netdev objects before they exist in the kernel, because
761 * creating them in the kernel happens by passing a netdev object to
762 * dpif_port_add(). */
763 if (netdev_dev_get_class(netdev_dev_) != &netdev_internal_class) {
764 error = netdev_get_flags(&netdev->netdev, &flags);
765 if (error == ENODEV) {
/* 'state.tap.opened' records whether the tap descriptor has already been
 * handed to an earlier handle. */
770 if (!strcmp(netdev_dev_get_type(netdev_dev_), "tap") &&
771 !netdev_dev->state.tap.opened) {
773 /* We assume that the first user of the tap device is the primary user
774 * and give them the tap FD. Subsequent users probably just expect
775 * this to be a system device so open it normally to avoid send/receive
776 * directions appearing to be reversed. */
777 netdev->fd = netdev_dev->state.tap.fd;
778 netdev_dev->state.tap.opened = true;
781 *netdevp = &netdev->netdev;
785 netdev_uninit(&netdev->netdev, true);
789 /* Closes and destroys 'netdev'. */
/* The handle's descriptor is dealt with here only when the handle is not of
 * type "tap" (strcmp() returns nonzero for a non-tap type); a tap handle may
 * be using the device-wide tap descriptor, which destroy_tap() handles.
 *
 * Any descriptor >= 0 counts as open, matching the 'fd >= 0' / 'fd < 0' tests
 * in netdev_linux_listen(), netdev_linux_recv() and netdev_linux_send();
 * 0 is a valid descriptor value. */
791 netdev_linux_close(struct netdev *netdev_)
793 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
795 if (netdev->fd >= 0 && strcmp(netdev_get_type(netdev_), "tap")) {
/* Prepares 'netdev_' for receiving packets: creates a raw PF_PACKET socket,
 * makes it non-blocking, and binds it to the device's ifindex.  When the
 * handle already has a descriptor (fd >= 0), the early branch below handles
 * that case (its body is not visible here). */
802 netdev_linux_listen(struct netdev *netdev_)
804 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
805 struct sockaddr_ll sll;
810 if (netdev->fd >= 0) {
814 /* Create file descriptor. */
815 fd = socket(PF_PACKET, SOCK_RAW, 0);
818 VLOG_ERR("failed to create raw socket (%s)", strerror(error));
822 /* Set non-blocking mode. */
823 error = set_nonblocking(fd);
828 /* Get ethernet device index. */
829 error = get_ifindex(&netdev->netdev, &ifindex);
834 /* Bind to specific ethernet device. */
835 memset(&sll, 0, sizeof sll);
836 sll.sll_family = AF_PACKET;
837 sll.sll_ifindex = ifindex;
/* The protocol is passed in network byte order; ETH_P_ALL selects every
 * protocol. */
838 sll.sll_protocol = (OVS_FORCE unsigned short int) htons(ETH_P_ALL);
839 if (bind(fd, (struct sockaddr *) &sll, sizeof sll) < 0) {
841 VLOG_ERR("%s: failed to bind raw socket (%s)",
842 netdev_get_name(netdev_), strerror(error));
/* Receives one packet from 'netdev_' into the 'size'-byte buffer 'data'.
 *
 * A tap-class device is read with read(); any other device is read with
 * recv() and MSG_TRUNC, so that an oversized packet can be detected from the
 * returned length.  On the success path the byte count is returned, or
 * -EMSGSIZE when that count exceeds 'size'.  The remaining error handling is
 * on lines not visible here; the visible part logs every error except EINTR
 * and EAGAIN.
 *
 * The log arguments follow the format string: device name first, then the
 * error text, as in netdev_linux_send(). */
857 netdev_linux_recv(struct netdev *netdev_, void *data, size_t size)
859 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
861 if (netdev->fd < 0) {
862 /* Device is not listening. */
869 retval = (netdev_->netdev_dev->netdev_class == &netdev_tap_class
870 ? read(netdev->fd, data, size)
871 : recv(netdev->fd, data, size, MSG_TRUNC));
873 return retval <= size ? retval : -EMSGSIZE;
874 } else if (errno != EINTR) {
875 if (errno != EAGAIN) {
876 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
877 netdev_get_name(netdev_), strerror(errno));
884 /* Registers with the poll loop to wake up from the next call to poll_block()
885 * when a packet is ready to be received with netdev_recv() on 'netdev'. */
887 netdev_linux_recv_wait(struct netdev *netdev_)
889 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
/* Nothing is registered when the handle has no descriptor (fd < 0). */
890 if (netdev->fd >= 0) {
891 poll_fd_wait(netdev->fd, POLLIN);
895 /* Discards all packets waiting to be received from 'netdev'. */
/* Three cases: a handle without a descriptor (fd < 0), a tap handle, and
 * everything else.  A tap handle is drained with drain_fd(), using the
 * queue length reported by the SIOCGIFTXQLEN ioctl as the count; other
 * handles are drained with drain_rcvbuf(). */
897 netdev_linux_drain(struct netdev *netdev_)
899 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
900 if (netdev->fd < 0) {
902 } else if (!strcmp(netdev_get_type(netdev_), "tap")) {
904 int error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
905 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
909 drain_fd(netdev->fd, ifr.ifr_qlen);
912 return drain_rcvbuf(netdev->fd);
916 /* Sends the 'size' bytes in 'data' on 'netdev'. Returns 0 if successful,
917 * otherwise a positive errno value. Returns EAGAIN without blocking if the
918 * packet cannot be queued immediately. Returns EMSGSIZE if a partial packet
919 * was transmitted or if the packet is too big or too small to transmit on
 * the device.
921 * The caller retains ownership of 'data' in all cases.
923 * The kernel maintains a packet transmission queue, so the caller is not
924 * expected to do additional queuing of packets. */
926 netdev_linux_send(struct netdev *netdev_, const void *data, size_t size)
928 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
932 if (netdev->fd < 0) {
933 /* Use our AF_PACKET socket to send to this device. */
934 struct sockaddr_ll sll;
941 sock = af_packet_sock();
946 error = get_ifindex(netdev_, &ifindex);
951 /* We don't bother setting most fields in sockaddr_ll because the
952 * kernel ignores them for SOCK_RAW. */
953 memset(&sll, 0, sizeof sll);
954 sll.sll_family = AF_PACKET;
955 sll.sll_ifindex = ifindex;
/* 'iov_base' is not const-qualified, so the const on 'data' is cast away
 * to build the iovec. */
957 iov.iov_base = (void *) data;
961 msg.msg_namelen = sizeof sll;
964 msg.msg_control = NULL;
965 msg.msg_controllen = 0;
968 retval = sendmsg(sock, &msg, 0);
970 /* Use the netdev's own fd to send to this device. This is
971 * essential for tap devices, because packets sent to a tap device
972 * with an AF_PACKET socket will loop back to be *received* again
973 * on the tap device. */
974 retval = write(netdev->fd, data, size);
978 /* The Linux AF_PACKET implementation never blocks waiting for room
979 * for packets, instead returning ENOBUFS. Translate this into
980 * EAGAIN for the caller. */
981 if (errno == ENOBUFS) {
983 } else if (errno == EINTR) {
/* EAGAIN is the one error that is not logged. */
985 } else if (errno != EAGAIN) {
986 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
987 netdev_get_name(netdev_), strerror(errno));
/* A short write is logged as a partial packet. */
990 } else if (retval != size) {
991 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%zd bytes of "
992 "%zu) on %s", retval, size, netdev_get_name(netdev_));
1000 /* Registers with the poll loop to wake up from the next call to poll_block()
1001 * when the packet transmission queue has sufficient room to transmit a packet
1002 * with netdev_send().
1004 * The kernel maintains a packet transmission queue, so the client is not
1005 * expected to do additional queuing of packets. Thus, this function is
1006 * unlikely to ever be used. It is included for completeness. */
1008 netdev_linux_send_wait(struct netdev *netdev_)
1010 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1011 if (netdev->fd < 0) {
1012 /* Nothing to do. */
/* strcmp() returns nonzero here, i.e. the handle is not a tap device. */
1013 } else if (strcmp(netdev_get_type(netdev_), "tap")) {
1014 poll_fd_wait(netdev->fd, POLLOUT);
1016 /* TAP device always accepts packets.*/
1017 poll_immediate_wake();
1021 /* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
1022 * otherwise a positive errno value. */
1024 netdev_linux_set_etheraddr(struct netdev *netdev_,
1025 const uint8_t mac[ETH_ADDR_LEN])
1027 struct netdev_dev_linux *netdev_dev =
1028 netdev_dev_linux_cast(netdev_get_dev(netdev_));
/* With a valid cache entry: a cached error is returned as is, an address
 * equal to 'mac' needs no further work, and otherwise the entry is
 * invalidated before the address is changed. */
1031 if (netdev_dev->cache_valid & VALID_ETHERADDR) {
1032 if (netdev_dev->ether_addr_error) {
1033 return netdev_dev->ether_addr_error;
1035 if (eth_addr_equals(netdev_dev->etheraddr, mac)) {
1038 netdev_dev->cache_valid &= ~VALID_ETHERADDR;
1041 error = set_etheraddr(netdev_get_name(netdev_), mac);
/* Only success and ENODEV are cached; other errors leave the cache
 * invalid. */
1042 if (!error || error == ENODEV) {
1043 netdev_dev->ether_addr_error = error;
1044 netdev_dev->cache_valid |= VALID_ETHERADDR;
1046 memcpy(netdev_dev->etheraddr, mac, ETH_ADDR_LEN);
1053 /* Copies 'netdev''s MAC address to 'mac' which is passed as param. */
/* Returns the cached error code for the address: 0 on success, in which case
 * 'mac' has been filled in; otherwise 'mac' is left untouched. */
1055 netdev_linux_get_etheraddr(const struct netdev *netdev_,
1056 uint8_t mac[ETH_ADDR_LEN])
1058 struct netdev_dev_linux *netdev_dev =
1059 netdev_dev_linux_cast(netdev_get_dev(netdev_));
/* On a cache miss, query the address and cache the result together with
 * its error code. */
1061 if (!(netdev_dev->cache_valid & VALID_ETHERADDR)) {
1062 int error = get_etheraddr(netdev_get_name(netdev_),
1063 netdev_dev->etheraddr);
1065 netdev_dev->ether_addr_error = error;
1066 netdev_dev->cache_valid |= VALID_ETHERADDR;
1069 if (!netdev_dev->ether_addr_error) {
1070 memcpy(mac, netdev_dev->etheraddr, ETH_ADDR_LEN);
1073 return netdev_dev->ether_addr_error;
1076 /* Returns the maximum size of transmitted (and received) packets on 'netdev',
1077 * in bytes, not including the hardware header; thus, this is typically 1500
1078 * bytes for Ethernet devices. */
/* The MTU is stored in '*mtup'; the return value is the cached error code
 * (0 on success).  '*mtup' is written only when that code is 0. */
1080 netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
1082 struct netdev_dev_linux *netdev_dev =
1083 netdev_dev_linux_cast(netdev_get_dev(netdev_));
/* On a cache miss, query the kernel with SIOCGIFMTU and cache both the
 * value and the error code. */
1084 if (!(netdev_dev->cache_valid & VALID_MTU)) {
1088 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
1089 SIOCGIFMTU, "SIOCGIFMTU");
1091 netdev_dev->netdev_mtu_error = error;
1092 netdev_dev->mtu = ifr.ifr_mtu;
1093 netdev_dev->cache_valid |= VALID_MTU;
1096 if (!netdev_dev->netdev_mtu_error) {
1097 *mtup = netdev_dev->mtu;
1099 return netdev_dev->netdev_mtu_error;
1102 /* Sets the maximum size of transmitted packets (MTU) for the given device
1103 * using the Linux networking ioctl interface. */
1106 netdev_linux_set_mtu(const struct netdev *netdev_, int mtu)
1108 struct netdev_dev_linux *netdev_dev =
1109 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1113 if (netdev_dev->cache_valid & VALID_MTU) {
1114 if (netdev_dev->netdev_mtu_error) {
1115 return netdev_dev->netdev_mtu_error;
1117 if (netdev_dev->mtu == mtu) {
1120 netdev_dev->cache_valid &= ~VALID_MTU;
1123 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
1124 SIOCSIFMTU, "SIOCSIFMTU");
1125 if (!error || error == ENODEV) {
1126 netdev_dev->netdev_mtu_error = error;
1127 netdev_dev->mtu = ifr.ifr_mtu;
1128 netdev_dev->cache_valid |= VALID_MTU;
1133 /* Returns the ifindex of 'netdev', if successful, as a positive number.
1134 * On failure, returns a negative errno value. */
1136 netdev_linux_get_ifindex(const struct netdev *netdev)
1140 error = get_ifindex(netdev, &ifindex);
/* get_ifindex() reports a positive errno value; negate it so that a failure
 * cannot be mistaken for an ifindex. */
1141 return error ? -error : ifindex;
/* Stores the link (carrier) state of 'netdev_' in '*carrier'.  With MII
 * polling enabled (a positive 'miimon_interval') the result of the last poll
 * is used; the other assignment derives the state from the IFF_RUNNING bit
 * of the recorded interface flags. */
1145 netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
1147 struct netdev_dev_linux *netdev_dev =
1148 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1150 if (netdev_dev->miimon_interval > 0) {
1151 *carrier = netdev_dev->miimon;
1153 *carrier = (netdev_dev->ifi_flags & IFF_RUNNING) != 0;
/* Returns the 'carrier_resets' counter of the device underlying 'netdev',
 * which netdev_dev_linux_changed() increments whenever IFF_RUNNING differs
 * between the old and new flags. */
1159 static long long int
1160 netdev_linux_get_carrier_resets(const struct netdev *netdev)
1162 return netdev_dev_linux_cast(netdev_get_dev(netdev))->carrier_resets;
/* Issues the MII ioctl 'cmd' (named 'cmd_name' for logging) on device 'name',
 * using '*data' as both request and reply.
 *
 * The MII data is copied into the ifreq at the address of its 'ifr_data'
 * member, not to a buffer that member points to, and copied back out of the
 * same place after the ioctl. */
1166 netdev_linux_do_miimon(const char *name, int cmd, const char *cmd_name,
1167 struct mii_ioctl_data *data)
1172 memset(&ifr, 0, sizeof ifr);
1173 memcpy(&ifr.ifr_data, data, sizeof *data);
1174 error = netdev_linux_do_ioctl(name, &ifr, cmd, cmd_name);
/* The copy-back is unconditional on the visible lines. */
1175 memcpy(data, &ifr.ifr_data, sizeof *data);
/* Determines the link status of device 'name' and stores it in '*miimon'.
 * The MII path issues SIOCGMIIPHY, then reads the BMSR register with
 * SIOCGMIIREG and tests BMSR_LSTATUS.  There is also an ETHTOOL_GLINK path,
 * which its log message describes as the fallback when the MII query fails;
 * there the reply is reinterpreted as a struct ethtool_value. */
1181 netdev_linux_get_miimon(const char *name, bool *miimon)
1183 struct mii_ioctl_data data;
1188 memset(&data, 0, sizeof data);
1189 error = netdev_linux_do_miimon(name, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
1191 /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
1192 data.reg_num = MII_BMSR;
1193 error = netdev_linux_do_miimon(name, SIOCGMIIREG, "SIOCGMIIREG",
1197 *miimon = !!(data.val_out & BMSR_LSTATUS);
1199 VLOG_WARN_RL(&rl, "%s: failed to query MII", name);
1202 struct ethtool_cmd ecmd;
1204 VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
1207 memset(&ecmd, 0, sizeof ecmd);
1208 error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
1211 struct ethtool_value eval;
1213 memcpy(&eval, &ecmd, sizeof eval);
1214 *miimon = !!eval.data;
1216 VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
/* Sets the MII monitoring interval of 'netdev_'.  A positive 'interval' is
 * raised to at least 100 and anything else becomes 0; when the stored value
 * changes, the miimon timer is marked expired. */
1224 netdev_linux_set_miimon_interval(struct netdev *netdev_,
1225 long long int interval)
1227 struct netdev_dev_linux *netdev_dev;
1229 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev_));
1231 interval = interval > 0 ? MAX(interval, 100) : 0;
1232 if (netdev_dev->miimon_interval != interval) {
1233 netdev_dev->miimon_interval = interval;
1234 timer_set_expired(&netdev_dev->miimon_timer);
/* Walks every device of netdev_linux_class.  The visible test singles out
 * devices with no positive miimon interval or an unexpired timer (what is
 * done for them is not visible in this view).  The link status is queried
 * with netdev_linux_get_miimon(); when it differs from the stored value it is
 * saved and netdev_dev_linux_changed() is called.  The timer is then re-armed
 * with the device's interval. */
1241 netdev_linux_miimon_run(void)
1243 struct shash device_shash;
1244 struct shash_node *node;
1246 shash_init(&device_shash);
1247 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
1248 SHASH_FOR_EACH (node, &device_shash) {
1249 struct netdev_dev_linux *dev = node->data;
1252 if (dev->miimon_interval <= 0 || !timer_expired(&dev->miimon_timer)) {
1256 netdev_linux_get_miimon(dev->netdev_dev.name, &miimon);
1257 if (miimon != dev->miimon) {
1258 dev->miimon = miimon;
1259 netdev_dev_linux_changed(dev, dev->ifi_flags, 0);
1262 timer_set_duration(&dev->miimon_timer, dev->miimon_interval);
1265 shash_destroy(&device_shash);
/* Calls timer_wait() on the miimon timer of every netdev_linux_class device
 * that has a positive miimon interval. */
1269 netdev_linux_miimon_wait(void)
1271 struct shash device_shash;
1272 struct shash_node *node;
1274 shash_init(&device_shash);
1275 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
1276 SHASH_FOR_EACH (node, &device_shash) {
1277 struct netdev_dev_linux *dev = node->data;
1279 if (dev->miimon_interval > 0) {
1280 timer_wait(&dev->miimon_timer);
1283 shash_destroy(&device_shash);
1286 /* Check whether we can use RTM_GETLINK to get network device statistics.
1287 * In pre-2.6.19 kernels, this was only available if wireless extensions were
/* Probes netlink statistics support by looking up the ifindex of "lo" and
 * calling get_stats_via_netlink() on it.  The log messages name proc as the
 * source used instead when either step fails.  The values returned for each
 * outcome are not visible in this view. */
1290 check_for_working_netlink_stats(void)
1292 /* Decide on the netdev_get_stats() implementation to use. Netlink is
1293 * preferable, so if that works, we'll use it. */
1294 int ifindex = do_get_ifindex("lo");
1296 VLOG_WARN("failed to get ifindex for lo, "
1297 "obtaining netdev stats from proc");
1300 struct netdev_stats stats;
1301 int error = get_stats_via_netlink(ifindex, &stats);
1303 VLOG_DBG("obtaining netdev stats via rtnetlink");
1306 VLOG_INFO("RTM_GETLINK failed (%s), obtaining netdev stats "
1307 "via proc (you are probably running a pre-2.6.19 "
1308 "kernel)", strerror(error));
/* NOTE(review): only the signature is visible in this view; presumably it
 * exchanges '*a' and '*b' -- confirm against the body. */
1315 swap_uint64(uint64_t *a, uint64_t *b)
/* Fetches statistics for 'netdev_' from the vport layer into '*stats'.  The
 * query is attempted unless an earlier attempt left a nonzero error that is
 * marked valid in the cache.  The outcome is stored in 'vport_stats_error'
 * and VALID_VPORT_STAT_ERROR is set. */
1323 get_stats_via_vport(const struct netdev *netdev_,
1324 struct netdev_stats *stats)
1326 struct netdev_dev_linux *netdev_dev =
1327 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1329 if (!netdev_dev->vport_stats_error ||
1330 !(netdev_dev->cache_valid & VALID_VPORT_STAT_ERROR)) {
1333 error = netdev_vport_get_stats(netdev_, stats);
1335 VLOG_WARN_RL(&rl, "%s: obtaining netdev stats via vport failed "
1336 "(%s)", netdev_get_name(netdev_), strerror(error));
1338 netdev_dev->vport_stats_error = error;
1339 netdev_dev->cache_valid |= VALID_VPORT_STAT_ERROR;
/* Reads statistics for 'netdev_' from the kernel into '*stats'.  The first
 * call decides, through check_for_working_netlink_stats(), whether netlink is
 * used; that decision is kept in a function-local static.  The netlink path
 * resolves the ifindex first, and get_stats_via_proc() is the other source.
 * The visible warning logs the numeric error code. */
1344 netdev_linux_sys_get_stats(const struct netdev *netdev_,
1345 struct netdev_stats *stats)
1347 static int use_netlink_stats = -1;
1350 if (use_netlink_stats < 0) {
1351 use_netlink_stats = check_for_working_netlink_stats();
1354 if (use_netlink_stats) {
1357 error = get_ifindex(netdev_, &ifindex);
1359 error = get_stats_via_netlink(ifindex, stats);
1362 error = get_stats_via_proc(netdev_get_name(netdev_), stats);
1366 VLOG_WARN_RL(&rl, "%s: linux-sys get stats failed %d",
1367 netdev_get_name(netdev_), error);
1373 /* Retrieves current device stats for 'netdev-linux'. */
1375 netdev_linux_get_stats(const struct netdev *netdev_,
1376 struct netdev_stats *stats)
1378 struct netdev_dev_linux *netdev_dev =
1379 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1380 struct netdev_stats dev_stats;
/* Two sources are read: the vport layer fills '*stats' and the kernel fills
 * 'dev_stats'.  'vport_stats_error' then selects how they are combined. */
1383 get_stats_via_vport(netdev_, stats);
1385 error = netdev_linux_sys_get_stats(netdev_, &dev_stats);
1388 if (netdev_dev->vport_stats_error) {
1395 if (netdev_dev->vport_stats_error) {
1396 /* stats not available from OVS then use ioctl stats. */
/* Add the kernel's error, drop, multicast and collision counters to the
 * values already present in '*stats'. */
1399 stats->rx_errors += dev_stats.rx_errors;
1400 stats->tx_errors += dev_stats.tx_errors;
1401 stats->rx_dropped += dev_stats.rx_dropped;
1402 stats->tx_dropped += dev_stats.tx_dropped;
1403 stats->multicast += dev_stats.multicast;
1404 stats->collisions += dev_stats.collisions;
1405 stats->rx_length_errors += dev_stats.rx_length_errors;
1406 stats->rx_over_errors += dev_stats.rx_over_errors;
1407 stats->rx_crc_errors += dev_stats.rx_crc_errors;
1408 stats->rx_frame_errors += dev_stats.rx_frame_errors;
1409 stats->rx_fifo_errors += dev_stats.rx_fifo_errors;
1410 stats->rx_missed_errors += dev_stats.rx_missed_errors;
1411 stats->tx_aborted_errors += dev_stats.tx_aborted_errors;
1412 stats->tx_carrier_errors += dev_stats.tx_carrier_errors;
1413 stats->tx_fifo_errors += dev_stats.tx_fifo_errors;
1414 stats->tx_heartbeat_errors += dev_stats.tx_heartbeat_errors;
1415 stats->tx_window_errors += dev_stats.tx_window_errors;
1420 /* Retrieves current device stats for 'netdev-tap' netdev or
1421 * netdev-internal. */
1423 netdev_tap_get_stats(const struct netdev *netdev_,
1424 struct netdev_stats *stats)
1426 struct netdev_dev_linux *netdev_dev =
1427 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1428 struct netdev_stats dev_stats;
/* Read both sources: the vport layer into '*stats' and the kernel into
 * 'dev_stats'.  Where kernel counters are added below, rx and tx are crossed
 * (e.g. rx_dropped += dev_stats.tx_dropped). */
1431 get_stats_via_vport(netdev_, stats);
1433 error = netdev_linux_sys_get_stats(netdev_, &dev_stats);
1435 if (netdev_dev->vport_stats_error) {
1442 /* If this port is an internal port then the transmit and receive stats
1443 * will appear to be swapped relative to the other ports since we are the
1444 * one sending the data, not a remote computer. For consistency, we swap
1445 * them back here. This does not apply if we are getting stats from the
1446 * vport layer because it always tracks stats from the perspective of the
1448 if (netdev_dev->vport_stats_error) {
1450 swap_uint64(&stats->rx_packets, &stats->tx_packets);
1451 swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
1452 swap_uint64(&stats->rx_errors, &stats->tx_errors);
1453 swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
1454 stats->rx_length_errors = 0;
1455 stats->rx_over_errors = 0;
1456 stats->rx_crc_errors = 0;
1457 stats->rx_frame_errors = 0;
1458 stats->rx_fifo_errors = 0;
1459 stats->rx_missed_errors = 0;
1460 stats->tx_aborted_errors = 0;
1461 stats->tx_carrier_errors = 0;
1462 stats->tx_fifo_errors = 0;
1463 stats->tx_heartbeat_errors = 0;
1464 stats->tx_window_errors = 0;
1466 stats->rx_dropped += dev_stats.tx_dropped;
1467 stats->tx_dropped += dev_stats.rx_dropped;
1469 stats->rx_errors += dev_stats.tx_errors;
1470 stats->tx_errors += dev_stats.rx_errors;
1472 stats->multicast += dev_stats.multicast;
1473 stats->collisions += dev_stats.collisions;
/* Fills '*stats' from the vport layer only and returns the resulting
 * 'vport_stats_error' of the device. */
1479 netdev_internal_get_stats(const struct netdev *netdev_,
1480 struct netdev_stats *stats)
1482 struct netdev_dev_linux *netdev_dev =
1483 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1485 get_stats_via_vport(netdev_, stats);
1486 return netdev_dev->vport_stats_error;
/* Refreshes the feature bitmaps stored in 'netdev_dev' ('supported',
 * 'advertised', 'current' and 'peer') from an ETHTOOL_GSET query.  The ethtool
 * SUPPORTED_* and ADVERTISED_* bits, and the reported speed, duplex and port,
 * are translated into OFPPF_* bits.  VALID_FEATURES is tested first and set
 * at the end, and the ethtool result is recorded in 'get_features_error'. */
1490 netdev_linux_read_features(struct netdev_dev_linux *netdev_dev)
1492 struct ethtool_cmd ecmd;
1495 if (netdev_dev->cache_valid & VALID_FEATURES) {
1499 memset(&ecmd, 0, sizeof ecmd);
1500 error = netdev_linux_do_ethtool(netdev_dev->netdev_dev.name, &ecmd,
1501 ETHTOOL_GSET, "ETHTOOL_GSET");
1506 /* Supported features. */
1507 netdev_dev->supported = 0;
1508 if (ecmd.supported & SUPPORTED_10baseT_Half) {
1509 netdev_dev->supported |= OFPPF_10MB_HD;
1511 if (ecmd.supported & SUPPORTED_10baseT_Full) {
1512 netdev_dev->supported |= OFPPF_10MB_FD;
1514 if (ecmd.supported & SUPPORTED_100baseT_Half) {
1515 netdev_dev->supported |= OFPPF_100MB_HD;
1517 if (ecmd.supported & SUPPORTED_100baseT_Full) {
1518 netdev_dev->supported |= OFPPF_100MB_FD;
1520 if (ecmd.supported & SUPPORTED_1000baseT_Half) {
1521 netdev_dev->supported |= OFPPF_1GB_HD;
1523 if (ecmd.supported & SUPPORTED_1000baseT_Full) {
1524 netdev_dev->supported |= OFPPF_1GB_FD;
1526 if (ecmd.supported & SUPPORTED_10000baseT_Full) {
1527 netdev_dev->supported |= OFPPF_10GB_FD;
1529 if (ecmd.supported & SUPPORTED_TP) {
1530 netdev_dev->supported |= OFPPF_COPPER;
1532 if (ecmd.supported & SUPPORTED_FIBRE) {
1533 netdev_dev->supported |= OFPPF_FIBER;
1535 if (ecmd.supported & SUPPORTED_Autoneg) {
1536 netdev_dev->supported |= OFPPF_AUTONEG;
1538 if (ecmd.supported & SUPPORTED_Pause) {
1539 netdev_dev->supported |= OFPPF_PAUSE;
1541 if (ecmd.supported & SUPPORTED_Asym_Pause) {
1542 netdev_dev->supported |= OFPPF_PAUSE_ASYM;
1545 /* Advertised features. */
1546 netdev_dev->advertised = 0;
1547 if (ecmd.advertising & ADVERTISED_10baseT_Half) {
1548 netdev_dev->advertised |= OFPPF_10MB_HD;
1550 if (ecmd.advertising & ADVERTISED_10baseT_Full) {
1551 netdev_dev->advertised |= OFPPF_10MB_FD;
1553 if (ecmd.advertising & ADVERTISED_100baseT_Half) {
1554 netdev_dev->advertised |= OFPPF_100MB_HD;
1556 if (ecmd.advertising & ADVERTISED_100baseT_Full) {
1557 netdev_dev->advertised |= OFPPF_100MB_FD;
1559 if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
1560 netdev_dev->advertised |= OFPPF_1GB_HD;
1562 if (ecmd.advertising & ADVERTISED_1000baseT_Full) {
1563 netdev_dev->advertised |= OFPPF_1GB_FD;
1565 if (ecmd.advertising & ADVERTISED_10000baseT_Full) {
1566 netdev_dev->advertised |= OFPPF_10GB_FD;
1568 if (ecmd.advertising & ADVERTISED_TP) {
1569 netdev_dev->advertised |= OFPPF_COPPER;
1571 if (ecmd.advertising & ADVERTISED_FIBRE) {
1572 netdev_dev->advertised |= OFPPF_FIBER;
1574 if (ecmd.advertising & ADVERTISED_Autoneg) {
1575 netdev_dev->advertised |= OFPPF_AUTONEG;
1577 if (ecmd.advertising & ADVERTISED_Pause) {
1578 netdev_dev->advertised |= OFPPF_PAUSE;
1580 if (ecmd.advertising & ADVERTISED_Asym_Pause) {
1581 netdev_dev->advertised |= OFPPF_PAUSE_ASYM;
1584 /* Current settings. */
1585 if (ecmd.speed == SPEED_10) {
1586 netdev_dev->current = ecmd.duplex ? OFPPF_10MB_FD : OFPPF_10MB_HD;
1587 } else if (ecmd.speed == SPEED_100) {
1588 netdev_dev->current = ecmd.duplex ? OFPPF_100MB_FD : OFPPF_100MB_HD;
1589 } else if (ecmd.speed == SPEED_1000) {
1590 netdev_dev->current = ecmd.duplex ? OFPPF_1GB_FD : OFPPF_1GB_HD;
1591 } else if (ecmd.speed == SPEED_10000) {
1592 netdev_dev->current = OFPPF_10GB_FD;
1594 netdev_dev->current = 0;
1597 if (ecmd.port == PORT_TP) {
1598 netdev_dev->current |= OFPPF_COPPER;
1599 } else if (ecmd.port == PORT_FIBRE) {
1600 netdev_dev->current |= OFPPF_FIBER;
1604 netdev_dev->current |= OFPPF_AUTONEG;
1607 /* Peer advertisements. */
1608 netdev_dev->peer = 0; /* XXX */
1611 netdev_dev->cache_valid |= VALID_FEATURES;
1612 netdev_dev->get_features_error = error;
1615 /* Stores the features supported by 'netdev' into each of '*current',
1616 * '*advertised', '*supported', and '*peer' that are non-null. Each value is a
1617 * bitmap of NETDEV_* bits. Returns 0 if successful, otherwise a positive
/* Refreshes the feature state with netdev_linux_read_features() and, when no
 * error is recorded, copies the stored OFPPF_* bitmaps to the four output
 * arguments.  Returns the recorded 'get_features_error'.
 *
 * NOTE(review): on success all four output pointers are dereferenced without
 * a null check -- confirm that callers never pass NULL. */
1620 netdev_linux_get_features(const struct netdev *netdev_,
1621 uint32_t *current, uint32_t *advertised,
1622 uint32_t *supported, uint32_t *peer)
1624 struct netdev_dev_linux *netdev_dev =
1625 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1627 netdev_linux_read_features(netdev_dev);
1629 if (!netdev_dev->get_features_error) {
1630 *current = netdev_dev->current;
1631 *advertised = netdev_dev->advertised;
1632 *supported = netdev_dev->supported;
1633 *peer = netdev_dev->peer;
1635 return netdev_dev->get_features_error;
1638 /* Set the features advertised by 'netdev' to 'advertise'. */
1640 netdev_linux_set_advertisements(struct netdev *netdev, uint32_t advertise)
1642 struct ethtool_cmd ecmd;
/* Fetch the current settings with ETHTOOL_GSET; only 'advertising' is
 * rewritten below before the structure is sent back with ETHTOOL_SSET. */
1645 memset(&ecmd, 0, sizeof ecmd);
1646 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1647 ETHTOOL_GSET, "ETHTOOL_GSET");
/* Translate each OFPPF_* bit of 'advertise' into its ADVERTISED_* bit. */
1652 ecmd.advertising = 0;
1653 if (advertise & OFPPF_10MB_HD) {
1654 ecmd.advertising |= ADVERTISED_10baseT_Half;
1656 if (advertise & OFPPF_10MB_FD) {
1657 ecmd.advertising |= ADVERTISED_10baseT_Full;
1659 if (advertise & OFPPF_100MB_HD) {
1660 ecmd.advertising |= ADVERTISED_100baseT_Half;
1662 if (advertise & OFPPF_100MB_FD) {
1663 ecmd.advertising |= ADVERTISED_100baseT_Full;
1665 if (advertise & OFPPF_1GB_HD) {
1666 ecmd.advertising |= ADVERTISED_1000baseT_Half;
1668 if (advertise & OFPPF_1GB_FD) {
1669 ecmd.advertising |= ADVERTISED_1000baseT_Full;
1671 if (advertise & OFPPF_10GB_FD) {
1672 ecmd.advertising |= ADVERTISED_10000baseT_Full;
1674 if (advertise & OFPPF_COPPER) {
1675 ecmd.advertising |= ADVERTISED_TP;
1677 if (advertise & OFPPF_FIBER) {
1678 ecmd.advertising |= ADVERTISED_FIBRE;
1680 if (advertise & OFPPF_AUTONEG) {
1681 ecmd.advertising |= ADVERTISED_Autoneg;
1683 if (advertise & OFPPF_PAUSE) {
1684 ecmd.advertising |= ADVERTISED_Pause;
1686 if (advertise & OFPPF_PAUSE_ASYM) {
1687 ecmd.advertising |= ADVERTISED_Asym_Pause;
1689 return netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1690 ETHTOOL_SSET, "ETHTOOL_SSET");
1693 /* Attempts to set input rate limiting (policing) policy. Returns 0 if
1694 * successful, otherwise a positive errno value. */
1696 netdev_linux_set_policing(struct netdev *netdev,
1697 uint32_t kbits_rate, uint32_t kbits_burst)
1699 struct netdev_dev_linux *netdev_dev =
1700 netdev_dev_linux_cast(netdev_get_dev(netdev));
1701 const char *netdev_name = netdev_get_name(netdev);
1705 kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
1706 : !kbits_burst ? 1000 /* Default to 1000 kbits if 0. */
1707 : kbits_burst); /* Stick with user-specified value. */
/* With a valid cache, a recorded error is returned as is, and the request is
 * compared against the rate and burst that were last stored. */
1709 if (netdev_dev->cache_valid & VALID_POLICING) {
1710 if (netdev_dev->netdev_policing_error) {
1711 return netdev_dev->netdev_policing_error;
1714 if (netdev_dev->kbits_rate == kbits_rate &&
1715 netdev_dev->kbits_burst == kbits_burst) {
1716 /* Assume that settings haven't changed since we last set them. */
1719 netdev_dev->cache_valid &= ~VALID_POLICING;
1722 COVERAGE_INC(netdev_set_policing);
1723 /* Remove any existing ingress qdisc. */
1724 error = tc_add_del_ingress_qdisc(netdev, false);
1726 VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
1727 netdev_name, strerror(error));
/* Policing is rebuilt in two steps: a fresh ingress qdisc, then a policer
 * action carrying the requested rate and burst. */
1732 error = tc_add_del_ingress_qdisc(netdev, true);
1734 VLOG_WARN_RL(&rl, "%s: adding policing qdisc failed: %s",
1735 netdev_name, strerror(error));
1739 error = tc_add_policer(netdev, kbits_rate, kbits_burst);
1741 VLOG_WARN_RL(&rl, "%s: adding policing action failed: %s",
1742 netdev_name, strerror(error));
1747 netdev_dev->kbits_rate = kbits_rate;
1748 netdev_dev->kbits_burst = kbits_burst;
/* Only success and ENODEV are remembered as the cached policing result. */
1751 if (!error || error == ENODEV) {
1752 netdev_dev->netdev_policing_error = error;
1753 netdev_dev->cache_valid |= VALID_POLICING;
/* Adds to 'types' the OVS name of every entry in the NULL-terminated 'tcs'
 * array that has a 'tc_install' hook and a non-empty 'ovs_name'.  'netdev' is
 * unused. */
1759 netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
1762 const struct tc_ops **opsp;
1764 for (opsp = tcs; *opsp != NULL; opsp++) {
1765 const struct tc_ops *ops = *opsp;
1766 if (ops->tc_install && ops->ovs_name[0] != '\0') {
1767 sset_add(types, ops->ovs_name);
/* Walks the NULL-terminated 'tcs' array looking for an entry whose 'ovs_name'
 * equals 'name'.  The return statements are not visible in this view;
 * presumably the matching entry, or NULL when there is none. */
1773 static const struct tc_ops *
1774 tc_lookup_ovs_name(const char *name)
1776 const struct tc_ops **opsp;
1778 for (opsp = tcs; *opsp != NULL; opsp++) {
1779 const struct tc_ops *ops = *opsp;
1780 if (!strcmp(name, ops->ovs_name)) {
/* Walks the NULL-terminated 'tcs' array looking for an entry whose
 * 'linux_name' equals 'name'; entries with a null 'linux_name' never match.
 * The return statements are not visible in this view. */
1787 static const struct tc_ops *
1788 tc_lookup_linux_name(const char *name)
1790 const struct tc_ops **opsp;
1792 for (opsp = tcs; *opsp != NULL; opsp++) {
1793 const struct tc_ops *ops = *opsp;
1794 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
/* Searches the bucket selected by 'hash' in the 'queues' hmap of the device's
 * tc for an entry whose 'queue_id' equals 'queue_id'.  The caller supplies
 * the hash value. */
1801 static struct tc_queue *
1802 tc_find_queue__(const struct netdev *netdev, unsigned int queue_id,
1805 struct netdev_dev_linux *netdev_dev =
1806 netdev_dev_linux_cast(netdev_get_dev(netdev));
1807 struct tc_queue *queue;
1809 HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev_dev->tc->queues) {
1810 if (queue->queue_id == queue_id) {
/* Looks up 'queue_id' on 'netdev' through tc_find_queue__(), hashing the id
 * with hash_int(queue_id, 0). */
1817 static struct tc_queue *
1818 tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
1820 return tc_find_queue__(netdev, queue_id, hash_int(queue_id, 0));
/* Looks up the tc_ops registered under OVS name 'type' and reports its
 * 'n_queues' through 'caps->n_queues'.  'netdev' is unused.  How a failed
 * lookup is handled is not visible in this view. */
1824 netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
1826 struct netdev_qos_capabilities *caps)
1828 const struct tc_ops *ops = tc_lookup_ovs_name(type);
1832 caps->n_queues = ops->n_queues;
/* Queries the qdisc of 'netdev' with tc_query_qdisc(), stores the OVS name of
 * its tc_ops in '*typep' and, when the ops provide 'qdisc_get', lets that
 * hook describe the qdisc in 'details'. */
1837 netdev_linux_get_qos(const struct netdev *netdev,
1838 const char **typep, struct shash *details)
1840 struct netdev_dev_linux *netdev_dev =
1841 netdev_dev_linux_cast(netdev_get_dev(netdev));
1844 error = tc_query_qdisc(netdev);
1849 *typep = netdev_dev->tc->ops->ovs_name;
1850 return (netdev_dev->tc->ops->qdisc_get
1851 ? netdev_dev->tc->ops->qdisc_get(netdev, details)
/* Changes the QoS type of 'netdev' to 'type' with parameters 'details'.  The
 * type must map to tc_ops that have a 'tc_install' hook.  If the current
 * qdisc already uses those ops, only 'qdisc_set' (when present) is invoked;
 * otherwise the existing qdisc is deleted and the new ops' 'tc_install' is
 * called.  The assertions require 'netdev_dev->tc' to be null after deletion
 * and non-null exactly when installation succeeds. */
1856 netdev_linux_set_qos(struct netdev *netdev,
1857 const char *type, const struct shash *details)
1859 struct netdev_dev_linux *netdev_dev =
1860 netdev_dev_linux_cast(netdev_get_dev(netdev));
1861 const struct tc_ops *new_ops;
1864 new_ops = tc_lookup_ovs_name(type);
1865 if (!new_ops || !new_ops->tc_install) {
1869 error = tc_query_qdisc(netdev);
1874 if (new_ops == netdev_dev->tc->ops) {
1875 return new_ops->qdisc_set ? new_ops->qdisc_set(netdev, details) : 0;
1877 /* Delete existing qdisc. */
1878 error = tc_del_qdisc(netdev);
1882 assert(netdev_dev->tc == NULL);
1884 /* Install new qdisc. */
1885 error = new_ops->tc_install(netdev, details);
1886 assert((error == 0) == (netdev_dev->tc != NULL));
/* Queries the qdisc of 'netdev', looks up queue 'queue_id' with
 * tc_find_queue() and conditionally asks the ops' 'class_get' hook to
 * describe it in 'details'.  The condition and the alternative result are
 * not visible in this view. */
1893 netdev_linux_get_queue(const struct netdev *netdev,
1894 unsigned int queue_id, struct shash *details)
1896 struct netdev_dev_linux *netdev_dev =
1897 netdev_dev_linux_cast(netdev_get_dev(netdev));
1900 error = tc_query_qdisc(netdev);
1904 struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1906 ? netdev_dev->tc->ops->class_get(netdev, queue, details)
/* Configures queue 'queue_id' on 'netdev' from 'details' through the ops'
 * 'class_set' hook.  A 'queue_id' at or above the ops' 'n_queues', or ops
 * without 'class_set', take a separate branch whose result is not visible in
 * this view. */
1912 netdev_linux_set_queue(struct netdev *netdev,
1913 unsigned int queue_id, const struct shash *details)
1915 struct netdev_dev_linux *netdev_dev =
1916 netdev_dev_linux_cast(netdev_get_dev(netdev));
1919 error = tc_query_qdisc(netdev);
1922 } else if (queue_id >= netdev_dev->tc->ops->n_queues
1923 || !netdev_dev->tc->ops->class_set) {
1927 return netdev_dev->tc->ops->class_set(netdev, queue_id, details);
/* Deletes queue 'queue_id' from 'netdev' through the ops' 'class_delete'
 * hook, after querying the qdisc and locating the queue with tc_find_queue().
 * Ops without 'class_delete' take a separate branch whose result is not
 * visible in this view. */
1931 netdev_linux_delete_queue(struct netdev *netdev, unsigned int queue_id)
1933 struct netdev_dev_linux *netdev_dev =
1934 netdev_dev_linux_cast(netdev_get_dev(netdev));
1937 error = tc_query_qdisc(netdev);
1940 } else if (!netdev_dev->tc->ops->class_delete) {
1943 struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1945 ? netdev_dev->tc->ops->class_delete(netdev, queue)
/* Retrieves statistics for queue 'queue_id' of 'netdev' into '*stats' through
 * the ops' 'class_get_stats' hook, after querying the qdisc and locating the
 * queue with tc_find_queue().  Ops without 'class_get_stats' take a separate
 * branch whose result is not visible in this view. */
1951 netdev_linux_get_queue_stats(const struct netdev *netdev,
1952 unsigned int queue_id,
1953 struct netdev_queue_stats *stats)
1955 struct netdev_dev_linux *netdev_dev =
1956 netdev_dev_linux_cast(netdev_get_dev(netdev));
1959 error = tc_query_qdisc(netdev);
1962 } else if (!netdev_dev->tc->ops->class_get_stats) {
1965 const struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1967 ? netdev_dev->tc->ops->class_get_stats(netdev, queue, stats)
/* Builds an RTM_GETTCLASS request for 'netdev' with 'tcm_parent' 0, starts
 * 'dump' with it on 'rtnl_sock', and then releases the request buffer. */
1973 start_queue_dump(const struct netdev *netdev, struct nl_dump *dump)
1975 struct ofpbuf request;
1976 struct tcmsg *tcmsg;
1978 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
1982 tcmsg->tcm_parent = 0;
1983 nl_dump_start(dump, rtnl_sock, &request);
1984 ofpbuf_uninit(&request);
/* Reports the queues of 'netdev' to 'cb'.  After querying the qdisc, each
 * entry of the tc's 'queues' hmap is described by the ops' 'class_get' hook
 * into a scratch shash that is cleared before every call; 'cb' receives the
 * queue id, those details and 'aux'.  Ops without 'class_get' take a separate
 * branch whose result is not visible in this view. */
1989 netdev_linux_dump_queues(const struct netdev *netdev,
1990 netdev_dump_queues_cb *cb, void *aux)
1992 struct netdev_dev_linux *netdev_dev =
1993 netdev_dev_linux_cast(netdev_get_dev(netdev));
1994 struct tc_queue *queue, *next_queue;
1995 struct shash details;
1999 error = tc_query_qdisc(netdev);
2002 } else if (!netdev_dev->tc->ops->class_get) {
2007 shash_init(&details);
2008 HMAP_FOR_EACH_SAFE (queue, next_queue, hmap_node,
2009 &netdev_dev->tc->queues) {
2010 shash_clear(&details);
2012 error = netdev_dev->tc->ops->class_get(netdev, queue, &details);
2014 (*cb)(queue->queue_id, &details, aux);
2019 shash_destroy(&details);
/* Dumps per-queue statistics of 'netdev'.  After querying the qdisc, a
 * netlink class dump is started with start_queue_dump() and every reply is
 * handed to the ops' 'class_dump_stats' hook together with 'cb' and 'aux'.
 * The result of nl_dump_done() takes precedence over 'last_error' in the
 * return value. */
2025 netdev_linux_dump_queue_stats(const struct netdev *netdev,
2026 netdev_dump_queue_stats_cb *cb, void *aux)
2028 struct netdev_dev_linux *netdev_dev =
2029 netdev_dev_linux_cast(netdev_get_dev(netdev));
2030 struct nl_dump dump;
2035 error = tc_query_qdisc(netdev);
2038 } else if (!netdev_dev->tc->ops->class_dump_stats) {
2043 if (!start_queue_dump(netdev, &dump)) {
2046 while (nl_dump_next(&dump, &msg)) {
2047 error = netdev_dev->tc->ops->class_dump_stats(netdev, &msg, cb, aux);
2053 error = nl_dump_done(&dump);
2054 return error ? error : last_error;
/* Stores the IPv4 address and netmask of 'netdev_' in '*address' and
 * '*netmask'.  When VALID_IN4 is not set, both are fetched with SIOCGIFADDR
 * and SIOCGIFNETMASK into the device structure and the bit is set.  Returns
 * EADDRNOTAVAIL if the address is INADDR_ANY, otherwise 0. */
2058 netdev_linux_get_in4(const struct netdev *netdev_,
2059 struct in_addr *address, struct in_addr *netmask)
2061 struct netdev_dev_linux *netdev_dev =
2062 netdev_dev_linux_cast(netdev_get_dev(netdev_));
2064 if (!(netdev_dev->cache_valid & VALID_IN4)) {
2067 error = netdev_linux_get_ipv4(netdev_, &netdev_dev->address,
2068 SIOCGIFADDR, "SIOCGIFADDR");
2073 error = netdev_linux_get_ipv4(netdev_, &netdev_dev->netmask,
2074 SIOCGIFNETMASK, "SIOCGIFNETMASK");
2079 netdev_dev->cache_valid |= VALID_IN4;
2081 *address = netdev_dev->address;
2082 *netmask = netdev_dev->netmask;
2083 return address->s_addr == INADDR_ANY ? EADDRNOTAVAIL : 0;
/* Assigns IPv4 'address' to 'netdev_' with SIOCSIFADDR and records 'address'
 * and 'netmask' in the device structure, marking VALID_IN4.  For an address
 * other than INADDR_ANY the netmask is also applied, with SIOCSIFNETMASK. */
2087 netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
2088 struct in_addr netmask)
2090 struct netdev_dev_linux *netdev_dev =
2091 netdev_dev_linux_cast(netdev_get_dev(netdev_));
2094 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
2096 netdev_dev->cache_valid |= VALID_IN4;
2097 netdev_dev->address = address;
2098 netdev_dev->netmask = netmask;
2099 if (address.s_addr != INADDR_ANY) {
2100 error = do_set_addr(netdev_, SIOCSIFNETMASK,
2101 "SIOCSIFNETMASK", netmask);
/* Parses one text line, as read from /proc/net/if_inet6 by
 * netdev_linux_get_in6().  The format is 16 two-digit hex bytes that fill
 * 'in6->s6_addr', four hex fields that are skipped, and an interface name of
 * at most 16 characters stored in 'ifname'. */
2108 parse_if_inet6_line(const char *line,
2109 struct in6_addr *in6, char ifname[16 + 1])
2111 uint8_t *s6 = in6->s6_addr;
2112 #define X8 "%2"SCNx8
2114 " "X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8
2115 "%*x %*x %*x %*x %16s\n",
2116 &s6[0], &s6[1], &s6[2], &s6[3],
2117 &s6[4], &s6[5], &s6[6], &s6[7],
2118 &s6[8], &s6[9], &s6[10], &s6[11],
2119 &s6[12], &s6[13], &s6[14], &s6[15],
2123 /* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address (if
2124 * 'in6' is non-null) and returns true. Otherwise, returns false. */
/* NOTE(review): the visible store to '*in6' below has no null check, unlike
 * what the comment above promises -- confirm against callers. */
2126 netdev_linux_get_in6(const struct netdev *netdev_, struct in6_addr *in6)
2128 struct netdev_dev_linux *netdev_dev =
2129 netdev_dev_linux_cast(netdev_get_dev(netdev_));
2130 if (!(netdev_dev->cache_valid & VALID_IN6)) {
/* Start from the unspecified address; it is replaced when a line of
 * /proc/net/if_inet6 names this device. */
2134 netdev_dev->in6 = in6addr_any;
2136 file = fopen("/proc/net/if_inet6", "r");
2138 const char *name = netdev_get_name(netdev_);
2139 while (fgets(line, sizeof line, file)) {
2140 struct in6_addr in6_tmp;
2141 char ifname[16 + 1];
2142 if (parse_if_inet6_line(line, &in6_tmp, ifname)
2143 && !strcmp(name, ifname))
2145 netdev_dev->in6 = in6_tmp;
2151 netdev_dev->cache_valid |= VALID_IN6;
2153 *in6 = netdev_dev->in6;
/* Fills '*sa' with an AF_INET socket address for 'addr'.  A zeroed
 * sockaddr_in receives the family and address; '*sa' is zeroed and then
 * overwritten with it. */
2158 make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
2160 struct sockaddr_in sin;
2161 memset(&sin, 0, sizeof sin);
2162 sin.sin_family = AF_INET;
2163 sin.sin_addr = addr;
2166 memset(sa, 0, sizeof *sa);
2167 memcpy(sa, &sin, sizeof sin);
/* Issues address ioctl 'ioctl_nr' for 'netdev'.  The ifreq carries the device
 * name and 'addr' encoded as an AF_INET socket address; 'ioctl_name' is
 * passed along to netdev_linux_do_ioctl(). */
2171 do_set_addr(struct netdev *netdev,
2172 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
2175 ovs_strzcpy(ifr.ifr_name, netdev_get_name(netdev), sizeof ifr.ifr_name);
2176 make_in4_sockaddr(&ifr.ifr_addr, addr);
2178 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
2182 /* Adds 'router' as a default IP gateway. */
2184 netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
2186 struct in_addr any = { INADDR_ANY };
/* The route entry uses INADDR_ANY for both destination and netmask, with
 * 'router' as the gateway, and is added with SIOCADDRT on 'af_inet_sock'. */
2190 memset(&rt, 0, sizeof rt);
2191 make_in4_sockaddr(&rt.rt_dst, any);
2192 make_in4_sockaddr(&rt.rt_gateway, router);
2193 make_in4_sockaddr(&rt.rt_genmask, any);
2194 rt.rt_flags = RTF_UP | RTF_GATEWAY;
2195 error = ioctl(af_inet_sock, SIOCADDRT, &rt) < 0 ? errno : 0;
2197 VLOG_WARN("ioctl(SIOCADDRT): %s", strerror(error));
/* Scans /proc/net/route for an up route whose masked destination matches
 * '*host'.  '*netdev_name' starts out NULL.  For a matching route,
 * '*next_hop' is set either to 0 (host directly reachable) or to the route's
 * gateway, and '*netdev_name' receives an xstrdup()'d copy of the interface
 * name.  Lines that do not yield all 11 fields are reported with a warning. */
2203 netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
2206 static const char fn[] = "/proc/net/route";
2211 *netdev_name = NULL;
2212 stream = fopen(fn, "r");
2213 if (stream == NULL) {
2214 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
2219 while (fgets(line, sizeof line, stream)) {
2222 ovs_be32 dest, gateway, mask;
2223 int refcnt, metric, mtu;
2224 unsigned int flags, use, window, irtt;
2227 "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
2229 iface, &dest, &gateway, &flags, &refcnt,
2230 &use, &metric, &mask, &mtu, &window, &irtt) != 11) {
2232 VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
2236 if (!(flags & RTF_UP)) {
2237 /* Skip routes that aren't up. */
2241 /* The output of 'dest', 'mask', and 'gateway' were given in
2242 * network byte order, so we don't need any endian
2243 * conversions here. */
2244 if ((dest & mask) == (host->s_addr & mask)) {
2246 /* The host is directly reachable. */
2247 next_hop->s_addr = 0;
2249 /* To reach the host, we must go through a gateway. */
2250 next_hop->s_addr = gateway;
2252 *netdev_name = xstrdup(iface);
/* Adds "driver_name", "driver_version" and "firmware_version" entries to 'sh',
 * each an xstrdup()'d copy of the corresponding 'drvinfo' field, after
 * refreshing that information with netdev_linux_get_drvinfo(). */
2264 netdev_linux_get_status(const struct netdev *netdev, struct shash *sh)
2267 struct netdev_dev_linux *netdev_dev =
2268 netdev_dev_linux_cast(netdev_get_dev(netdev));
2270 error = netdev_linux_get_drvinfo(netdev_dev);
2272 shash_add(sh, "driver_name", xstrdup(netdev_dev->drvinfo.driver));
2273 shash_add(sh, "driver_version", xstrdup(netdev_dev->drvinfo.version));
2274 shash_add(sh, "firmware_version", xstrdup(netdev_dev->drvinfo.fw_version));
/* Adds a "driver_name" entry with the fixed value "openvswitch" to 'sh'.
 * 'netdev' is unused. */
2280 netdev_internal_get_status(const struct netdev *netdev OVS_UNUSED, struct shash *sh)
2282 shash_add(sh, "driver_name", xstrdup("openvswitch"));
2286 /* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
2287 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
2288 * returns 0. Otherwise, it returns a positive errno value; in particular,
2289 * ENXIO indicates that there is no ARP table entry for 'ip' on 'netdev'. */
2291 netdev_linux_arp_lookup(const struct netdev *netdev,
2292 ovs_be32 ip, uint8_t mac[ETH_ADDR_LEN])
2295 struct sockaddr_in sin;
/* Build the SIOCGARP request: protocol address 'ip', an Ethernet hardware
 * address family, and the device name. */
2298 memset(&r, 0, sizeof r);
2299 memset(&sin, 0, sizeof sin);
2300 sin.sin_family = AF_INET;
2301 sin.sin_addr.s_addr = ip;
2303 memcpy(&r.arp_pa, &sin, sizeof sin);
2304 r.arp_ha.sa_family = ARPHRD_ETHER;
2306 ovs_strzcpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
2307 COVERAGE_INC(netdev_arp_lookup);
2308 retval = ioctl(af_inet_sock, SIOCGARP, &r) < 0 ? errno : 0;
2310 memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
/* ENXIO is not logged; every other failure produces a warning. */
2311 } else if (retval != ENXIO) {
2312 VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
2313 netdev_get_name(netdev), IP_ARGS(&ip), strerror(retval));
/* Tests the NETDEV_UP and NETDEV_PROMISC bits of 'nd'.  The statements that
 * build the result are not visible in this view; presumably the matching
 * IFF_* bits are set -- confirm against the body. */
2319 nd_to_iff_flags(enum netdev_flags nd)
2322 if (nd & NETDEV_UP) {
2325 if (nd & NETDEV_PROMISC) {
/* Builds a netdev_flags value from the interface flags in 'iff'.  The visible
 * part maps IFF_PROMISC to NETDEV_PROMISC; other mappings, if any, are not
 * visible in this view. */
2332 iff_to_nd_flags(int iff)
2334 enum netdev_flags nd = 0;
2338 if (iff & IFF_PROMISC) {
2339 nd |= NETDEV_PROMISC;
/* Clears the flags in 'off' and sets those in 'on' for 'netdev'.  The previous
 * flags, taken from the stored 'ifi_flags', are reported through
 * '*old_flagsp'.  set_flags() is called only when the combined value differs
 * from the old one, and the stored flags are then re-read with get_flags(). */
2345 netdev_linux_update_flags(struct netdev *netdev, enum netdev_flags off,
2346 enum netdev_flags on, enum netdev_flags *old_flagsp)
2348 struct netdev_dev_linux *netdev_dev;
2349 int old_flags, new_flags;
2352 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
2353 old_flags = netdev_dev->ifi_flags;
2354 *old_flagsp = iff_to_nd_flags(old_flags);
2355 new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
2356 if (new_flags != old_flags) {
2357 error = set_flags(netdev, new_flags);
2358 get_flags(&netdev_dev->netdev_dev, &netdev_dev->ifi_flags);
/* Returns the 'change_seq' value kept in the device structure behind
 * 'netdev'. */
2364 netdev_linux_change_seq(const struct netdev *netdev)
2366 return netdev_dev_linux_cast(netdev_get_dev(netdev))->change_seq;
/* Shared body for the netdev class definitions that follow.  The listed
 * netdev_linux_* functions are common to all of them, while NAME, CREATE,
 * GET_STATS, SET_STATS, GET_FEATURES and GET_STATUS supply the per-class
 * parts.  (Presumably expands to a struct netdev_class initializer; several
 * lines of the expansion are not visible in this view.) */
2369 #define NETDEV_LINUX_CLASS(NAME, CREATE, GET_STATS, SET_STATS, \
2370 GET_FEATURES, GET_STATUS) \
2374 netdev_linux_init, \
2376 netdev_linux_wait, \
2379 netdev_linux_destroy, \
2380 NULL, /* get_config */ \
2381 NULL, /* set_config */ \
2383 netdev_linux_open, \
2384 netdev_linux_close, \
2386 netdev_linux_listen, \
2387 netdev_linux_recv, \
2388 netdev_linux_recv_wait, \
2389 netdev_linux_drain, \
2391 netdev_linux_send, \
2392 netdev_linux_send_wait, \
2394 netdev_linux_set_etheraddr, \
2395 netdev_linux_get_etheraddr, \
2396 netdev_linux_get_mtu, \
2397 netdev_linux_set_mtu, \
2398 netdev_linux_get_ifindex, \
2399 netdev_linux_get_carrier, \
2400 netdev_linux_get_carrier_resets, \
2401 netdev_linux_set_miimon_interval, \
2406 netdev_linux_set_advertisements, \
2408 netdev_linux_set_policing, \
2409 netdev_linux_get_qos_types, \
2410 netdev_linux_get_qos_capabilities, \
2411 netdev_linux_get_qos, \
2412 netdev_linux_set_qos, \
2413 netdev_linux_get_queue, \
2414 netdev_linux_set_queue, \
2415 netdev_linux_delete_queue, \
2416 netdev_linux_get_queue_stats, \
2417 netdev_linux_dump_queues, \
2418 netdev_linux_dump_queue_stats, \
2420 netdev_linux_get_in4, \
2421 netdev_linux_set_in4, \
2422 netdev_linux_get_in6, \
2423 netdev_linux_add_router, \
2424 netdev_linux_get_next_hop, \
2426 netdev_linux_arp_lookup, \
2428 netdev_linux_update_flags, \
2430 netdev_linux_change_seq \
/* Class created by netdev_linux_create(): statistics come from
 * netdev_linux_get_stats(), there is no set_stats hook, and features and
 * status are read through the ethtool-based netdev_linux_* helpers. */
2433 const struct netdev_class netdev_linux_class =
2436 netdev_linux_create,
2437 netdev_linux_get_stats,
2438 NULL, /* set_stats */
2439 netdev_linux_get_features,
2440 netdev_linux_get_status);
/* Class created by netdev_linux_create_tap(): statistics come from
 * netdev_tap_get_stats(); the remaining per-class hooks match
 * netdev_linux_class. */
2442 const struct netdev_class netdev_tap_class =
2445 netdev_linux_create_tap,
2446 netdev_tap_get_stats,
2447 NULL, /* set_stats */
2448 netdev_linux_get_features,
2449 netdev_linux_get_status);
/* Class whose statistics are read with netdev_internal_get_stats() and
 * written with netdev_vport_set_stats(); it has no get_features hook and
 * reports its status through netdev_internal_get_status(). */
2451 const struct netdev_class netdev_internal_class =
2454 netdev_linux_create,
2455 netdev_internal_get_stats,
2456 netdev_vport_set_stats,
2457 NULL, /* get_features */
2458 netdev_internal_get_status);
2460 /* HTB traffic control class. */
/* NOTE(review): presumably the number of queues advertised for HTB; its use
 * is outside this view. */
2462 #define HTB_N_QUEUES 0xf000
/* Qdisc-level rate limit.  Presumably the 'max_rate' member of struct htb
 * that htb_install__() assigns (the struct header is not visible here). */
2466 unsigned int max_rate; /* In bytes/s. */
/* Per-queue parameters.  htb_setup_class__() reads 'min_rate', 'max_rate',
 * 'burst' and 'priority' from a struct htb_class. */
2470 struct tc_queue tc_queue;
2471 unsigned int min_rate; /* In bytes/s. */
2472 unsigned int max_rate; /* In bytes/s. */
2473 unsigned int burst; /* In bytes. */
2474 unsigned int priority; /* Lower values are higher priorities. */
/* Returns the struct htb that embeds, as member 'tc', the tc object pointed
 * to by the device of 'netdev'. */
2478 htb_get__(const struct netdev *netdev)
2480 struct netdev_dev_linux *netdev_dev =
2481 netdev_dev_linux_cast(netdev_get_dev(netdev));
2482 return CONTAINER_OF(netdev_dev->tc, struct htb, tc);
/* Allocates a struct htb, initializes its embedded tc with 'tc_ops_htb',
 * records 'max_rate' in it and makes it the tc of the device of 'netdev'.
 *
 * NOTE(review): 'max_rate' is a uint64_t here; if the member it is stored in
 * is an unsigned int, larger values are narrowed -- confirm. */
2486 htb_install__(struct netdev *netdev, uint64_t max_rate)
2488 struct netdev_dev_linux *netdev_dev =
2489 netdev_dev_linux_cast(netdev_get_dev(netdev));
2492 htb = xmalloc(sizeof *htb);
2493 tc_init(&htb->tc, &tc_ops_htb);
2494 htb->max_rate = max_rate;
2496 netdev_dev->tc = &htb->tc;
2499 /* Create an HTB qdisc.
2501 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
2503 htb_setup_qdisc__(struct netdev *netdev)
2506 struct tc_htb_glob opt;
2507 struct ofpbuf request;
2508 struct tcmsg *tcmsg;
/* Any existing qdisc is removed first; the result of tc_del_qdisc() is not
 * checked here. */
2510 tc_del_qdisc(netdev);
2512 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2513 NLM_F_EXCL | NLM_F_CREATE, &request);
/* Handle 1:0 attached at the root, as in the tc command above. */
2517 tcmsg->tcm_handle = tc_make_handle(1, 0);
2518 tcmsg->tcm_parent = TC_H_ROOT;
2520 nl_msg_put_string(&request, TCA_KIND, "htb");
2522 memset(&opt, 0, sizeof opt);
2523 opt.rate2quantum = 10;
/* The HTB global options travel as TCA_HTB_INIT nested inside TCA_OPTIONS. */
2527 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2528 nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
2529 nl_msg_end_nested(&request, opt_offset);
2531 return tc_transact(&request, NULL);
2534 /* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
2535 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
2537 htb_setup_class__(struct netdev *netdev, unsigned int handle,
2538 unsigned int parent, struct htb_class *class)
2541 struct tc_htb_opt opt;
2542 struct ofpbuf request;
2543 struct tcmsg *tcmsg;
/* The MTU feeds the rate and buffer calculations below. */
2547 error = netdev_get_mtu(netdev, &mtu);
2549 VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU",
2550 netdev_get_name(netdev));
/* 'rate' is derived from the class's min_rate and 'ceil' from its max_rate;
 * both buffers use the same burst. */
2554 memset(&opt, 0, sizeof opt);
2555 tc_fill_rate(&opt.rate, class->min_rate, mtu);
2556 tc_fill_rate(&opt.ceil, class->max_rate, mtu);
2557 opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
2558 opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
2559 opt.prio = class->priority;
2561 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
2565 tcmsg->tcm_handle = handle;
2566 tcmsg->tcm_parent = parent;
/* TCA_OPTIONS nests the HTB parameters plus rate tables for 'rate' and
 * 'ceil'. */
2568 nl_msg_put_string(&request, TCA_KIND, "htb");
2569 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2570 nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
2571 tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
2572 tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
2573 nl_msg_end_nested(&request, opt_offset);
2575 error = tc_transact(&request, NULL);
2577 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
2578 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
2579 netdev_get_name(netdev),
2580 tc_get_major(handle), tc_get_minor(handle),
2581 tc_get_major(parent), tc_get_minor(parent),
2582 class->min_rate, class->max_rate,
2583 class->burst, class->priority, strerror(error));
2588 /* Parses Netlink attributes in 'options' for HTB parameters and stores a
2589 * description of them into 'details'. The description complies with the
2590 * specification given in the vswitch database documentation for linux-htb
2593 htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
2595 static const struct nl_policy tca_htb_policy[] = {
2596 [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
2597 .min_len = sizeof(struct tc_htb_opt) },
2600 struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
2601 const struct tc_htb_opt *htb;
2603 if (!nl_parse_nested(nl_options, tca_htb_policy,
2604 attrs, ARRAY_SIZE(tca_htb_policy))) {
2605 VLOG_WARN_RL(&rl, "failed to parse HTB class options");
2609 htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
2610 class->min_rate = htb->rate.rate;
2611 class->max_rate = htb->ceil.rate;
2612 class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
2613 class->priority = htb->prio;
2618 htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2619 struct htb_class *options,
2620 struct netdev_queue_stats *stats)
2622 struct nlattr *nl_options;
2623 unsigned int handle;
2626 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2627 if (!error && queue_id) {
2628 unsigned int major = tc_get_major(handle);
2629 unsigned int minor = tc_get_minor(handle);
2630 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2631 *queue_id = minor - 1;
2636 if (!error && options) {
2637 error = htb_parse_tca_options__(nl_options, options);
2643 htb_parse_qdisc_details__(struct netdev *netdev,
2644 const struct shash *details, struct htb_class *hc)
2646 const char *max_rate_s;
2648 max_rate_s = shash_find_data(details, "max-rate");
2649 hc->max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
2650 if (!hc->max_rate) {
2653 netdev_get_features(netdev, ¤t, NULL, NULL, NULL);
2654 hc->max_rate = netdev_features_to_bps(current) / 8;
2656 hc->min_rate = hc->max_rate;
2662 htb_parse_class_details__(struct netdev *netdev,
2663 const struct shash *details, struct htb_class *hc)
2665 const struct htb *htb = htb_get__(netdev);
2666 const char *min_rate_s = shash_find_data(details, "min-rate");
2667 const char *max_rate_s = shash_find_data(details, "max-rate");
2668 const char *burst_s = shash_find_data(details, "burst");
2669 const char *priority_s = shash_find_data(details, "priority");
2672 error = netdev_get_mtu(netdev, &mtu);
2674 VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU",
2675 netdev_get_name(netdev));
2679 /* HTB requires at least an mtu sized min-rate to send any traffic even
2680 * on uncongested links. */
2681 hc->min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
2682 hc->min_rate = MAX(hc->min_rate, mtu);
2683 hc->min_rate = MIN(hc->min_rate, htb->max_rate);
2686 hc->max_rate = (max_rate_s
2687 ? strtoull(max_rate_s, NULL, 10) / 8
2689 hc->max_rate = MAX(hc->max_rate, hc->min_rate);
2690 hc->max_rate = MIN(hc->max_rate, htb->max_rate);
2694 * According to hints in the documentation that I've read, it is important
2695 * that 'burst' be at least as big as the largest frame that might be
2696 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
2697 * but having it a bit too small is a problem. Since netdev_get_mtu()
2698 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
2699 * the MTU. We actually add 64, instead of 14, as a guard against
2700 * additional headers get tacked on somewhere that we're not aware of. */
2701 hc->burst = burst_s ? strtoull(burst_s, NULL, 10) / 8 : 0;
2702 hc->burst = MAX(hc->burst, mtu + 64);
2705 hc->priority = priority_s ? strtoul(priority_s, NULL, 10) : 0;
/* Asks the kernel about class 'handle' (child of 'parent') on 'netdev' and
 * parses the reply into 'options' and 'stats', as htb_parse_tcmsg__() does.
 *
 * Returns 0 if successful, otherwise a positive errno value. */
static int
htb_query_class__(const struct netdev *netdev, unsigned int handle,
                  unsigned int parent, struct htb_class *options,
                  struct netdev_queue_stats *stats)
{
    struct ofpbuf *reply;
    int error = tc_query_class(netdev, handle, parent, &reply);

    if (error) {
        return error;
    }

    error = htb_parse_tcmsg__(reply, NULL, options, stats);
    ofpbuf_delete(reply);
    return error;
}
2727 htb_tc_install(struct netdev *netdev, const struct shash *details)
2731 error = htb_setup_qdisc__(netdev);
2733 struct htb_class hc;
2735 htb_parse_qdisc_details__(netdev, details, &hc);
2736 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2737 tc_make_handle(1, 0), &hc);
2739 htb_install__(netdev, hc.max_rate);
2745 static struct htb_class *
2746 htb_class_cast__(const struct tc_queue *queue)
2748 return CONTAINER_OF(queue, struct htb_class, tc_queue);
2752 htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
2753 const struct htb_class *hc)
2755 struct htb *htb = htb_get__(netdev);
2756 size_t hash = hash_int(queue_id, 0);
2757 struct tc_queue *queue;
2758 struct htb_class *hcp;
2760 queue = tc_find_queue__(netdev, queue_id, hash);
2762 hcp = htb_class_cast__(queue);
2764 hcp = xmalloc(sizeof *hcp);
2765 queue = &hcp->tc_queue;
2766 queue->queue_id = queue_id;
2767 hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
2770 hcp->min_rate = hc->min_rate;
2771 hcp->max_rate = hc->max_rate;
2772 hcp->burst = hc->burst;
2773 hcp->priority = hc->priority;
2777 htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
2780 struct nl_dump dump;
2781 struct htb_class hc;
2783 /* Get qdisc options. */
2785 htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
2786 htb_install__(netdev, hc.max_rate);
2789 if (!start_queue_dump(netdev, &dump)) {
2792 while (nl_dump_next(&dump, &msg)) {
2793 unsigned int queue_id;
2795 if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
2796 htb_update_queue__(netdev, queue_id, &hc);
2799 nl_dump_done(&dump);
2805 htb_tc_destroy(struct tc *tc)
2807 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
2808 struct htb_class *hc, *next;
2810 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &htb->tc.queues) {
2811 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
2819 htb_qdisc_get(const struct netdev *netdev, struct shash *details)
2821 const struct htb *htb = htb_get__(netdev);
2822 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * htb->max_rate));
2827 htb_qdisc_set(struct netdev *netdev, const struct shash *details)
2829 struct htb_class hc;
2832 htb_parse_qdisc_details__(netdev, details, &hc);
2833 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2834 tc_make_handle(1, 0), &hc);
2836 htb_get__(netdev)->max_rate = hc.max_rate;
2842 htb_class_get(const struct netdev *netdev OVS_UNUSED,
2843 const struct tc_queue *queue, struct shash *details)
2845 const struct htb_class *hc = htb_class_cast__(queue);
2847 shash_add(details, "min-rate", xasprintf("%llu", 8ULL * hc->min_rate));
2848 if (hc->min_rate != hc->max_rate) {
2849 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hc->max_rate));
2851 shash_add(details, "burst", xasprintf("%llu", 8ULL * hc->burst));
2853 shash_add(details, "priority", xasprintf("%u", hc->priority));
2859 htb_class_set(struct netdev *netdev, unsigned int queue_id,
2860 const struct shash *details)
2862 struct htb_class hc;
2865 error = htb_parse_class_details__(netdev, details, &hc);
2870 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
2871 tc_make_handle(1, 0xfffe), &hc);
2876 htb_update_queue__(netdev, queue_id, &hc);
2881 htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
2883 struct htb_class *hc = htb_class_cast__(queue);
2884 struct htb *htb = htb_get__(netdev);
2887 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
2889 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
2896 htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
2897 struct netdev_queue_stats *stats)
2899 return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
2900 tc_make_handle(1, 0xfffe), NULL, stats);
2904 htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
2905 const struct ofpbuf *nlmsg,
2906 netdev_dump_queue_stats_cb *cb, void *aux)
2908 struct netdev_queue_stats stats;
2909 unsigned int handle, major, minor;
2912 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
2917 major = tc_get_major(handle);
2918 minor = tc_get_minor(handle);
2919 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2920 (*cb)(minor - 1, &stats, aux);
2925 static const struct tc_ops tc_ops_htb = {
2926 "htb", /* linux_name */
2927 "linux-htb", /* ovs_name */
2928 HTB_N_QUEUES, /* n_queues */
2937 htb_class_get_stats,
2938 htb_class_dump_stats
/* "linux-hfsc" traffic control class. */

/* Upper bound on HFSC queues: class minor numbers 1...HFSC_N_QUEUES map to
 * queue IDs 0...HFSC_N_QUEUES-1. */
#define HFSC_N_QUEUES 0xf000

/* HFSC qdisc state attached to a network device. */
struct hfsc {
    struct tc tc;               /* Embedded generic traffic-control state. */
    uint32_t max_rate;
};

/* A single HFSC class (one queue). */
struct hfsc_class {
    struct tc_queue tc_queue;   /* Embedded generic queue. */
    uint32_t min_rate;
    uint32_t max_rate;
};
2956 static struct hfsc *
2957 hfsc_get__(const struct netdev *netdev)
2959 struct netdev_dev_linux *netdev_dev;
2960 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
2961 return CONTAINER_OF(netdev_dev->tc, struct hfsc, tc);
2964 static struct hfsc_class *
2965 hfsc_class_cast__(const struct tc_queue *queue)
2967 return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
2971 hfsc_install__(struct netdev *netdev, uint32_t max_rate)
2973 struct netdev_dev_linux * netdev_dev;
2976 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
2977 hfsc = xmalloc(sizeof *hfsc);
2978 tc_init(&hfsc->tc, &tc_ops_hfsc);
2979 hfsc->max_rate = max_rate;
2980 netdev_dev->tc = &hfsc->tc;
2984 hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
2985 const struct hfsc_class *hc)
2989 struct hfsc_class *hcp;
2990 struct tc_queue *queue;
2992 hfsc = hfsc_get__(netdev);
2993 hash = hash_int(queue_id, 0);
2995 queue = tc_find_queue__(netdev, queue_id, hash);
2997 hcp = hfsc_class_cast__(queue);
2999 hcp = xmalloc(sizeof *hcp);
3000 queue = &hcp->tc_queue;
3001 queue->queue_id = queue_id;
3002 hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
3005 hcp->min_rate = hc->min_rate;
3006 hcp->max_rate = hc->max_rate;
3010 hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
3012 const struct tc_service_curve *rsc, *fsc, *usc;
3013 static const struct nl_policy tca_hfsc_policy[] = {
3015 .type = NL_A_UNSPEC,
3017 .min_len = sizeof(struct tc_service_curve),
3020 .type = NL_A_UNSPEC,
3022 .min_len = sizeof(struct tc_service_curve),
3025 .type = NL_A_UNSPEC,
3027 .min_len = sizeof(struct tc_service_curve),
3030 struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];
3032 if (!nl_parse_nested(nl_options, tca_hfsc_policy,
3033 attrs, ARRAY_SIZE(tca_hfsc_policy))) {
3034 VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
3038 rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
3039 fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
3040 usc = nl_attr_get(attrs[TCA_HFSC_USC]);
3042 if (rsc->m1 != 0 || rsc->d != 0 ||
3043 fsc->m1 != 0 || fsc->d != 0 ||
3044 usc->m1 != 0 || usc->d != 0) {
3045 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3046 "Non-linear service curves are not supported.");
3050 if (rsc->m2 != fsc->m2) {
3051 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3052 "Real-time service curves are not supported ");
3056 if (rsc->m2 > usc->m2) {
3057 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3058 "Min-rate service curve is greater than "
3059 "the max-rate service curve.");
3063 class->min_rate = fsc->m2;
3064 class->max_rate = usc->m2;
3069 hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
3070 struct hfsc_class *options,
3071 struct netdev_queue_stats *stats)
3074 unsigned int handle;
3075 struct nlattr *nl_options;
3077 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
3083 unsigned int major, minor;
3085 major = tc_get_major(handle);
3086 minor = tc_get_minor(handle);
3087 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3088 *queue_id = minor - 1;
3095 error = hfsc_parse_tca_options__(nl_options, options);
/* Asks the kernel about class 'handle' (child of 'parent') on 'netdev' and
 * parses the reply into 'options' and 'stats', as hfsc_parse_tcmsg__() does.
 *
 * Returns 0 if successful, otherwise a positive errno value. */
static int
hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
                   unsigned int parent, struct hfsc_class *options,
                   struct netdev_queue_stats *stats)
{
    struct ofpbuf *reply;
    int error = tc_query_class(netdev, handle, parent, &reply);

    if (!error) {
        error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
        ofpbuf_delete(reply);
    }
    return error;
}
3120 hfsc_parse_qdisc_details__(struct netdev *netdev, const struct shash *details,
3121 struct hfsc_class *class)
3124 const char *max_rate_s;
3126 max_rate_s = shash_find_data(details, "max-rate");
3127 max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
3132 netdev_get_features(netdev, ¤t, NULL, NULL, NULL);
3133 max_rate = netdev_features_to_bps(current) / 8;
3136 class->min_rate = max_rate;
3137 class->max_rate = max_rate;
3141 hfsc_parse_class_details__(struct netdev *netdev,
3142 const struct shash *details,
3143 struct hfsc_class * class)
3145 const struct hfsc *hfsc;
3146 uint32_t min_rate, max_rate;
3147 const char *min_rate_s, *max_rate_s;
3149 hfsc = hfsc_get__(netdev);
3150 min_rate_s = shash_find_data(details, "min-rate");
3151 max_rate_s = shash_find_data(details, "max-rate");
3153 min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
3154 min_rate = MAX(min_rate, 1);
3155 min_rate = MIN(min_rate, hfsc->max_rate);
3157 max_rate = (max_rate_s
3158 ? strtoull(max_rate_s, NULL, 10) / 8
3160 max_rate = MAX(max_rate, min_rate);
3161 max_rate = MIN(max_rate, hfsc->max_rate);
3163 class->min_rate = min_rate;
3164 class->max_rate = max_rate;
3169 /* Create an HFSC qdisc.
3171 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
3173 hfsc_setup_qdisc__(struct netdev * netdev)
3175 struct tcmsg *tcmsg;
3176 struct ofpbuf request;
3177 struct tc_hfsc_qopt opt;
3179 tc_del_qdisc(netdev);
3181 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3182 NLM_F_EXCL | NLM_F_CREATE, &request);
3188 tcmsg->tcm_handle = tc_make_handle(1, 0);
3189 tcmsg->tcm_parent = TC_H_ROOT;
3191 memset(&opt, 0, sizeof opt);
3194 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3195 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
3197 return tc_transact(&request, NULL);
3200 /* Create an HFSC class.
3202 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
3203 * sc rate <min_rate> ul rate <max_rate>" */
3205 hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
3206 unsigned int parent, struct hfsc_class *class)
3210 struct tcmsg *tcmsg;
3211 struct ofpbuf request;
3212 struct tc_service_curve min, max;
3214 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
3220 tcmsg->tcm_handle = handle;
3221 tcmsg->tcm_parent = parent;
3225 min.m2 = class->min_rate;
3229 max.m2 = class->max_rate;
3231 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3232 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3233 nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
3234 nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
3235 nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
3236 nl_msg_end_nested(&request, opt_offset);
3238 error = tc_transact(&request, NULL);
3240 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
3241 "min-rate %ubps, max-rate %ubps (%s)",
3242 netdev_get_name(netdev),
3243 tc_get_major(handle), tc_get_minor(handle),
3244 tc_get_major(parent), tc_get_minor(parent),
3245 class->min_rate, class->max_rate, strerror(error));
3252 hfsc_tc_install(struct netdev *netdev, const struct shash *details)
3255 struct hfsc_class class;
3257 error = hfsc_setup_qdisc__(netdev);
3263 hfsc_parse_qdisc_details__(netdev, details, &class);
3264 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3265 tc_make_handle(1, 0), &class);
3271 hfsc_install__(netdev, class.max_rate);
3276 hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3279 struct nl_dump dump;
3280 struct hfsc_class hc;
3283 hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
3284 hfsc_install__(netdev, hc.max_rate);
3286 if (!start_queue_dump(netdev, &dump)) {
3290 while (nl_dump_next(&dump, &msg)) {
3291 unsigned int queue_id;
3293 if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3294 hfsc_update_queue__(netdev, queue_id, &hc);
3298 nl_dump_done(&dump);
3303 hfsc_tc_destroy(struct tc *tc)
3306 struct hfsc_class *hc, *next;
3308 hfsc = CONTAINER_OF(tc, struct hfsc, tc);
3310 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
3311 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
3320 hfsc_qdisc_get(const struct netdev *netdev, struct shash *details)
3322 const struct hfsc *hfsc;
3323 hfsc = hfsc_get__(netdev);
3324 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hfsc->max_rate));
3329 hfsc_qdisc_set(struct netdev *netdev, const struct shash *details)
3332 struct hfsc_class class;
3334 hfsc_parse_qdisc_details__(netdev, details, &class);
3335 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3336 tc_make_handle(1, 0), &class);
3339 hfsc_get__(netdev)->max_rate = class.max_rate;
3346 hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
3347 const struct tc_queue *queue, struct shash *details)
3349 const struct hfsc_class *hc;
3351 hc = hfsc_class_cast__(queue);
3352 shash_add(details, "min-rate", xasprintf("%llu", 8ULL * hc->min_rate));
3353 if (hc->min_rate != hc->max_rate) {
3354 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hc->max_rate));
3360 hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
3361 const struct shash *details)
3364 struct hfsc_class class;
3366 error = hfsc_parse_class_details__(netdev, details, &class);
3371 error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3372 tc_make_handle(1, 0xfffe), &class);
3377 hfsc_update_queue__(netdev, queue_id, &class);
3382 hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
3386 struct hfsc_class *hc;
3388 hc = hfsc_class_cast__(queue);
3389 hfsc = hfsc_get__(netdev);
3391 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3393 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
3400 hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3401 struct netdev_queue_stats *stats)
3403 return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3404 tc_make_handle(1, 0xfffe), NULL, stats);
3408 hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3409 const struct ofpbuf *nlmsg,
3410 netdev_dump_queue_stats_cb *cb, void *aux)
3412 struct netdev_queue_stats stats;
3413 unsigned int handle, major, minor;
3416 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3421 major = tc_get_major(handle);
3422 minor = tc_get_minor(handle);
3423 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3424 (*cb)(minor - 1, &stats, aux);
3429 static const struct tc_ops tc_ops_hfsc = {
3430 "hfsc", /* linux_name */
3431 "linux-hfsc", /* ovs_name */
3432 HFSC_N_QUEUES, /* n_queues */
3433 hfsc_tc_install, /* tc_install */
3434 hfsc_tc_load, /* tc_load */
3435 hfsc_tc_destroy, /* tc_destroy */
3436 hfsc_qdisc_get, /* qdisc_get */
3437 hfsc_qdisc_set, /* qdisc_set */
3438 hfsc_class_get, /* class_get */
3439 hfsc_class_set, /* class_set */
3440 hfsc_class_delete, /* class_delete */
3441 hfsc_class_get_stats, /* class_get_stats */
3442 hfsc_class_dump_stats /* class_dump_stats */
3445 /* "linux-default" traffic control class.
3447 * This class represents the default, unnamed Linux qdisc. It corresponds to
3448 * the "" (empty string) QoS type in the OVS database. */
3451 default_install__(struct netdev *netdev)
3453 struct netdev_dev_linux *netdev_dev =
3454 netdev_dev_linux_cast(netdev_get_dev(netdev));
3455 static struct tc *tc;
3458 tc = xmalloc(sizeof *tc);
3459 tc_init(tc, &tc_ops_default);
3461 netdev_dev->tc = tc;
3465 default_tc_install(struct netdev *netdev,
3466 const struct shash *details OVS_UNUSED)
3468 default_install__(netdev);
3473 default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3475 default_install__(netdev);
3479 static const struct tc_ops tc_ops_default = {
3480 NULL, /* linux_name */
3485 NULL, /* tc_destroy */
3486 NULL, /* qdisc_get */
3487 NULL, /* qdisc_set */
3488 NULL, /* class_get */
3489 NULL, /* class_set */
3490 NULL, /* class_delete */
3491 NULL, /* class_get_stats */
3492 NULL /* class_dump_stats */
3495 /* "linux-other" traffic control class.
3500 other_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3502 struct netdev_dev_linux *netdev_dev =
3503 netdev_dev_linux_cast(netdev_get_dev(netdev));
3504 static struct tc *tc;
3507 tc = xmalloc(sizeof *tc);
3508 tc_init(tc, &tc_ops_other);
3510 netdev_dev->tc = tc;
3514 static const struct tc_ops tc_ops_other = {
3515 NULL, /* linux_name */
3516 "linux-other", /* ovs_name */
3518 NULL, /* tc_install */
3520 NULL, /* tc_destroy */
3521 NULL, /* qdisc_get */
3522 NULL, /* qdisc_set */
3523 NULL, /* class_get */
3524 NULL, /* class_set */
3525 NULL, /* class_delete */
3526 NULL, /* class_get_stats */
3527 NULL /* class_dump_stats */
/* Traffic control. */

/* Number of kernel "tc" ticks per second.  Used to convert between ticks and
 * bytes at a given rate. */
static double ticks_per_s;

/* Number of kernel "jiffies" per second.  This is used for the purpose of
 * computing buffer sizes.  Generally kernel qdiscs need to be able to buffer
 * one jiffy's worth of data.
 *
 * There are two possibilities here:
 *
 *    - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
 *      approximate range of 100 to 1024.  That means that we really need to
 *      make sure that the qdisc can buffer that much data.
 *
 *    - 'buffer_hz' is an absurdly large number.  That means that the kernel
 *      has finely granular timers and there's no need to fudge additional room
 *      for buffers.  (There's no extra effort needed to implement that: the
 *      large 'buffer_hz' is used as a divisor, so practically any number will
 *      come out as 0 in the division.  Small integer results in the case of
 *      really high dividends won't have any real effect anyhow.)
 */
static unsigned int buffer_hz;
/* Returns tc handle 'major':'minor': 'major' in the upper 16 bits and
 * 'minor' in the lower 16 bits. */
static unsigned int
tc_make_handle(unsigned int major, unsigned int minor)
{
    unsigned int major_bits = major << 16;

    return TC_H_MAKE(major_bits, minor);
}
/* Returns the major number from 'handle', i.e. its upper 16 bits. */
static unsigned int
tc_get_major(unsigned int handle)
{
    unsigned int major_bits = TC_H_MAJ(handle);

    return major_bits >> 16;
}
/* Returns the minor number from 'handle', i.e. its lower 16 bits. */
static unsigned int
tc_get_minor(unsigned int handle)
{
    unsigned int minor_bits = TC_H_MIN(handle);

    return minor_bits;
}
3575 static struct tcmsg *
3576 tc_make_request(const struct netdev *netdev, int type, unsigned int flags,
3577 struct ofpbuf *request)
3579 struct tcmsg *tcmsg;
3583 error = get_ifindex(netdev, &ifindex);
3588 ofpbuf_init(request, 512);
3589 nl_msg_put_nlmsghdr(request, sizeof *tcmsg, type, NLM_F_REQUEST | flags);
3590 tcmsg = ofpbuf_put_zeros(request, sizeof *tcmsg);
3591 tcmsg->tcm_family = AF_UNSPEC;
3592 tcmsg->tcm_ifindex = ifindex;
3593 /* Caller should fill in tcmsg->tcm_handle. */
3594 /* Caller should fill in tcmsg->tcm_parent. */
3600 tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
3602 int error = nl_sock_transact(rtnl_sock, request, replyp);
3603 ofpbuf_uninit(request);
3607 /* Adds or deletes a root ingress qdisc on 'netdev'. We use this for
3608 * policing configuration.
3610 * This function is equivalent to running the following when 'add' is true:
3611 * /sbin/tc qdisc add dev <devname> handle ffff: ingress
3613 * This function is equivalent to running the following when 'add' is false:
3614 * /sbin/tc qdisc del dev <devname> handle ffff: ingress
3616 * The configuration and stats may be seen with the following command:
3617 * /sbin/tc -s qdisc show dev <devname>
3619 * Returns 0 if successful, otherwise a positive errno value.
3622 tc_add_del_ingress_qdisc(struct netdev *netdev, bool add)
3624 struct ofpbuf request;
3625 struct tcmsg *tcmsg;
3627 int type = add ? RTM_NEWQDISC : RTM_DELQDISC;
3628 int flags = add ? NLM_F_EXCL | NLM_F_CREATE : 0;
3630 tcmsg = tc_make_request(netdev, type, flags, &request);
3634 tcmsg->tcm_handle = tc_make_handle(0xffff, 0);
3635 tcmsg->tcm_parent = TC_H_INGRESS;
3636 nl_msg_put_string(&request, TCA_KIND, "ingress");
3637 nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0);
3639 error = tc_transact(&request, NULL);
3641 /* If we're deleting the qdisc, don't worry about some of the
3642 * error conditions. */
3643 if (!add && (error == ENOENT || error == EINVAL)) {
3652 /* Adds a policer to 'netdev' with a rate of 'kbits_rate' and a burst size
3655 * This function is equivalent to running:
3656 * /sbin/tc filter add dev <devname> parent ffff: protocol all prio 49
3657 * basic police rate <kbits_rate>kbit burst <kbits_burst>k
3660 * The configuration and stats may be seen with the following command:
3661 * /sbin/tc -s filter show <devname> eth0 parent ffff:
3663 * Returns 0 if successful, otherwise a positive errno value.
3666 tc_add_policer(struct netdev *netdev, int kbits_rate, int kbits_burst)
3668 struct tc_police tc_police;
3669 struct ofpbuf request;
3670 struct tcmsg *tcmsg;
3671 size_t basic_offset;
3672 size_t police_offset;
3676 memset(&tc_police, 0, sizeof tc_police);
3677 tc_police.action = TC_POLICE_SHOT;
3678 tc_police.mtu = mtu;
3679 tc_fill_rate(&tc_police.rate, kbits_rate/8 * 1000, mtu);
3680 tc_police.burst = tc_bytes_to_ticks(tc_police.rate.rate,
3681 kbits_burst * 1024);
3683 tcmsg = tc_make_request(netdev, RTM_NEWTFILTER,
3684 NLM_F_EXCL | NLM_F_CREATE, &request);
3688 tcmsg->tcm_parent = tc_make_handle(0xffff, 0);
3689 tcmsg->tcm_info = tc_make_handle(49,
3690 (OVS_FORCE uint16_t) htons(ETH_P_ALL));
3692 nl_msg_put_string(&request, TCA_KIND, "basic");
3693 basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3694 police_offset = nl_msg_start_nested(&request, TCA_BASIC_POLICE);
3695 nl_msg_put_unspec(&request, TCA_POLICE_TBF, &tc_police, sizeof tc_police);
3696 tc_put_rtab(&request, TCA_POLICE_RATE, &tc_police.rate);
3697 nl_msg_end_nested(&request, police_offset);
3698 nl_msg_end_nested(&request, basic_offset);
3700 error = tc_transact(&request, NULL);
3711 /* The values in psched are not individually very meaningful, but they are
3712 * important. The tables below show some values seen in the wild.
3716 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
3717 * (Before that, there are hints that it was 1000000000.)
3719 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
3723 * -----------------------------------
3724 * [1] 000c8000 000f4240 000f4240 00000064
3725 * [2] 000003e8 00000400 000f4240 3b9aca00
3726 * [3] 000003e8 00000400 000f4240 3b9aca00
3727 * [4] 000003e8 00000400 000f4240 00000064
3728 * [5] 000003e8 00000040 000f4240 3b9aca00
3729 * [6] 000003e8 00000040 000f4240 000000f9
3731 * a b c d ticks_per_s buffer_hz
3732 * ------- --------- ---------- ------------- ----------- -------------
3733 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
3734 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3735 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3736 * [4] 1,000 1,024 1,000,000 100 976,562 100
3737 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
3738 * [6] 1,000 64 1,000,000 249 15,625,000 249
3740 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
3741 * [2] 2.6.26-1-686-bigmem from Debian lenny
3742 * [3] 2.6.26-2-sparc64 from Debian lenny
3743 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
3744 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
3745 * [6] 2.6.34 from kernel.org on KVM
3747 static const char fn[] = "/proc/net/psched";
3748 unsigned int a, b, c, d;
3754 stream = fopen(fn, "r");
3756 VLOG_WARN("%s: open failed: %s", fn, strerror(errno));
3760 if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
3761 VLOG_WARN("%s: read failed", fn);
3765 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
3769 VLOG_WARN("%s: invalid scheduler parameters", fn);
3773 ticks_per_s = (double) a * c / b;
3777 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
3780 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);
3783 /* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
3784 * rate of 'rate' bytes per second. */
3786 tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
3791 return (rate * ticks) / ticks_per_s;
3794 /* Returns the number of ticks that it would take to transmit 'size' bytes at a
3795 * rate of 'rate' bytes per second. */
3797 tc_bytes_to_ticks(unsigned int rate, unsigned int size)
3802 return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
3805 /* Returns the number of bytes that need to be reserved for qdisc buffering at
3806 * a transmission rate of 'rate' bytes per second. */
3808 tc_buffer_per_jiffy(unsigned int rate)
3813 return rate / buffer_hz;
3816 /* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
3817 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
3818 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
3819 * stores NULL into it if it is absent.
3821 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
3824 * Returns 0 if successful, otherwise a positive errno value. */
3826 tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
3827 struct nlattr **options)
3829 static const struct nl_policy tca_policy[] = {
3830 [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
3831 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
3833 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3835 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3836 tca_policy, ta, ARRAY_SIZE(ta))) {
3837 VLOG_WARN_RL(&rl, "failed to parse qdisc message");
3842 *kind = nl_attr_get_string(ta[TCA_KIND]);
3846 *options = ta[TCA_OPTIONS];
3861 /* Given Netlink 'msg' that describes a class, extracts the queue ID (e.g. the
3862 * minor number of its class ID) into '*queue_id', its TCA_OPTIONS attribute
3863 * into '*options', and its queue statistics into '*stats'. Any of the output
3864 * arguments may be null.
3866 * Returns 0 if successful, otherwise a positive errno value. */
3868 tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
3869 struct nlattr **options, struct netdev_queue_stats *stats)
3871 static const struct nl_policy tca_policy[] = {
3872 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
3873 [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
3875 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3877 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3878 tca_policy, ta, ARRAY_SIZE(ta))) {
3879 VLOG_WARN_RL(&rl, "failed to parse class message");
3884 struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
3885 *handlep = tc->tcm_handle;
3889 *options = ta[TCA_OPTIONS];
3893 const struct gnet_stats_queue *gsq;
3894 struct gnet_stats_basic gsb;
3896 static const struct nl_policy stats_policy[] = {
3897 [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
3898 .min_len = sizeof gsb },
3899 [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
3900 .min_len = sizeof *gsq },
3902 struct nlattr *sa[ARRAY_SIZE(stats_policy)];
3904 if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
3905 sa, ARRAY_SIZE(sa))) {
3906 VLOG_WARN_RL(&rl, "failed to parse class stats");
3910 /* Alignment issues screw up the length of struct gnet_stats_basic on
3911 * some arch/bitsize combinations. Newer versions of Linux have a
3912 * struct gnet_stats_basic_packed, but we can't depend on that. The
3913 * easiest thing to do is just to make a copy. */
3914 memset(&gsb, 0, sizeof gsb);
3915 memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
3916 MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
3917 stats->tx_bytes = gsb.bytes;
3918 stats->tx_packets = gsb.packets;
3920 gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
3921 stats->tx_errors = gsq->drops;
3931 memset(stats, 0, sizeof *stats);
3936 /* Queries the kernel for class with identifier 'handle' and parent 'parent'
3939 tc_query_class(const struct netdev *netdev,
3940 unsigned int handle, unsigned int parent,
3941 struct ofpbuf **replyp)
3943 struct ofpbuf request;
3944 struct tcmsg *tcmsg;
3947 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO, &request);
3951 tcmsg->tcm_handle = handle;
3952 tcmsg->tcm_parent = parent;
3954 error = tc_transact(&request, replyp);
3956 VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
3957 netdev_get_name(netdev),
3958 tc_get_major(handle), tc_get_minor(handle),
3959 tc_get_major(parent), tc_get_minor(parent),
3965 /* Equivalent to "tc class del dev <name> handle <handle>". */
3967 tc_delete_class(const struct netdev *netdev, unsigned int handle)
3969 struct ofpbuf request;
3970 struct tcmsg *tcmsg;
3973 tcmsg = tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
3977 tcmsg->tcm_handle = handle;
3978 tcmsg->tcm_parent = 0;
3980 error = tc_transact(&request, NULL);
3982 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
3983 netdev_get_name(netdev),
3984 tc_get_major(handle), tc_get_minor(handle),
3990 /* Equivalent to "tc qdisc del dev <name> root". */
3992 tc_del_qdisc(struct netdev *netdev)
3994 struct netdev_dev_linux *netdev_dev =
3995 netdev_dev_linux_cast(netdev_get_dev(netdev));
3996 struct ofpbuf request;
3997 struct tcmsg *tcmsg;
4000 tcmsg = tc_make_request(netdev, RTM_DELQDISC, 0, &request);
4004 tcmsg->tcm_handle = tc_make_handle(1, 0);
4005 tcmsg->tcm_parent = TC_H_ROOT;
4007 error = tc_transact(&request, NULL);
4008 if (error == EINVAL) {
4009 /* EINVAL probably means that the default qdisc was in use, in which
4010 * case we've accomplished our purpose. */
4013 if (!error && netdev_dev->tc) {
4014 if (netdev_dev->tc->ops->tc_destroy) {
4015 netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
4017 netdev_dev->tc = NULL;
4022 /* If 'netdev''s qdisc type and parameters are not yet known, queries the
4023 * kernel to determine what they are. Returns 0 if successful, otherwise a
4024 * positive errno value. */
4026 tc_query_qdisc(const struct netdev *netdev)
4028 struct netdev_dev_linux *netdev_dev =
4029 netdev_dev_linux_cast(netdev_get_dev(netdev));
4030 struct ofpbuf request, *qdisc;
4031 const struct tc_ops *ops;
4032 struct tcmsg *tcmsg;
4036 if (netdev_dev->tc) {
4040 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
4041 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
4042 * 2.6.35 without that fix backported to it.
4044 * To avoid the OOPS, we must not make a request that would attempt to dump
4045 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
4046 * few others. There are a few ways that I can see to do this, but most of
4047 * them seem to be racy (and if you lose the race the kernel OOPSes). The
4048 * technique chosen here is to assume that any non-default qdisc that we
4049 * create will have a class with handle 1:0. The built-in qdiscs only have
4050 * a class with handle 0:0.
4052 * We could check for Linux 2.6.35+ and use a more straightforward method
4054 tcmsg = tc_make_request(netdev, RTM_GETQDISC, NLM_F_ECHO, &request);
4058 tcmsg->tcm_handle = tc_make_handle(1, 0);
4059 tcmsg->tcm_parent = 0;
4061 /* Figure out what tc class to instantiate. */
4062 error = tc_transact(&request, &qdisc);
4066 error = tc_parse_qdisc(qdisc, &kind, NULL);
4068 ops = &tc_ops_other;
4070 ops = tc_lookup_linux_name(kind);
4072 static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
4073 VLOG_INFO_RL(&rl2, "unknown qdisc \"%s\"", kind);
4075 ops = &tc_ops_other;
4078 } else if (error == ENOENT) {
4079 /* Either it's a built-in qdisc, or it's a qdisc set up by some
4080 * other entity that doesn't have a handle 1:0. We will assume
4081 * that it's the system default qdisc. */
4082 ops = &tc_ops_default;
4085 /* Who knows? Maybe the device got deleted. */
4086 VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
4087 netdev_get_name(netdev), strerror(error));
4088 ops = &tc_ops_other;
4091 /* Instantiate it. */
4092 load_error = ops->tc_load((struct netdev *) netdev, qdisc);
4093 assert((load_error == 0) == (netdev_dev->tc != NULL));
4094 ofpbuf_delete(qdisc);
4096 return error ? error : load_error;
4099 /* Linux traffic control uses tables with 256 entries ("rtab" tables) to
4100 approximate the time to transmit packets of various lengths. For an MTU of
4101 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
4102 represents two possible packet lengths; for a MTU of 513 through 1024, four
4103 possible lengths; and so on.
4105 Returns, for the specified 'mtu', the number of bits that packet lengths
4106 need to be shifted right to fit within such a 256-entry table. */
4108 tc_calc_cell_log(unsigned int mtu)
4113 mtu = ETH_PAYLOAD_MAX;
4115 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
4117 for (cell_log = 0; mtu >= 256; cell_log++) {
4124 /* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
4127 tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
4129 memset(rate, 0, sizeof *rate);
4130 rate->cell_log = tc_calc_cell_log(mtu);
4131 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
4132 /* rate->cell_align = 0; */ /* distro headers. */
4133 rate->mpu = ETH_TOTAL_MIN;
4137 /* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
4138 * attribute of the specified "type".
4140 * See tc_calc_cell_log() above for a description of "rtab"s. */
4142 tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
4147 rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
4148 for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
4149 unsigned packet_size = (i + 1) << rate->cell_log;
4150 if (packet_size < rate->mpu) {
4151 packet_size = rate->mpu;
4153 rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
/* Calculates the proper value of 'buffer' or 'cbuffer' in HTB options given a
 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
 * burst size of 'burst_bytes'.  (If no value was requested, a 'burst_bytes' of
 * 0 is fine.)
 *
 * The burst actually used is never smaller than one jiffy's worth of data at
 * 'Bps' plus one 'mtu'. */
static int
tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
{
    unsigned int floor_bytes = tc_buffer_per_jiffy(Bps) + mtu;
    uint64_t burst = burst_bytes;

    if (burst < floor_bytes) {
        burst = floor_bytes;
    }
    return tc_bytes_to_ticks(Bps, burst);
}
4168 /* Linux-only functions declared in netdev-linux.h */
4170 /* Returns a fd for an AF_INET socket or a negative errno value. */
4172 netdev_linux_get_af_inet_sock(void)
4174 int error = netdev_linux_init();
4175 return error ? -error : af_inet_sock;
4178 /* Modifies the 'flag' bit in ethtool's flags field for 'netdev'. If
4179 * 'enable' is true, the bit is set. Otherwise, it is cleared. */
4181 netdev_linux_ethtool_set_flag(struct netdev *netdev, uint32_t flag,
4182 const char *flag_name, bool enable)
4184 const char *netdev_name = netdev_get_name(netdev);
4185 struct ethtool_value evalue;
4189 memset(&evalue, 0, sizeof evalue);
4190 error = netdev_linux_do_ethtool(netdev_name,
4191 (struct ethtool_cmd *)&evalue,
4192 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
4197 evalue.data = new_flags = (evalue.data & ~flag) | (enable ? flag : 0);
4198 error = netdev_linux_do_ethtool(netdev_name,
4199 (struct ethtool_cmd *)&evalue,
4200 ETHTOOL_SFLAGS, "ETHTOOL_SFLAGS");
4205 memset(&evalue, 0, sizeof evalue);
4206 error = netdev_linux_do_ethtool(netdev_name,
4207 (struct ethtool_cmd *)&evalue,
4208 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
4213 if (new_flags != evalue.data) {
4214 VLOG_WARN_RL(&rl, "attempt to %s ethtool %s flag on network "
4215 "device %s failed", enable ? "enable" : "disable",
4216 flag_name, netdev_name);
4223 /* Utility functions. */
4225 /* Copies 'src' into 'dst', performing format conversion in the process. */
4227 netdev_stats_from_rtnl_link_stats(struct netdev_stats *dst,
4228 const struct rtnl_link_stats *src)
4230 dst->rx_packets = src->rx_packets;
4231 dst->tx_packets = src->tx_packets;
4232 dst->rx_bytes = src->rx_bytes;
4233 dst->tx_bytes = src->tx_bytes;
4234 dst->rx_errors = src->rx_errors;
4235 dst->tx_errors = src->tx_errors;
4236 dst->rx_dropped = src->rx_dropped;
4237 dst->tx_dropped = src->tx_dropped;
4238 dst->multicast = src->multicast;
4239 dst->collisions = src->collisions;
4240 dst->rx_length_errors = src->rx_length_errors;
4241 dst->rx_over_errors = src->rx_over_errors;
4242 dst->rx_crc_errors = src->rx_crc_errors;
4243 dst->rx_frame_errors = src->rx_frame_errors;
4244 dst->rx_fifo_errors = src->rx_fifo_errors;
4245 dst->rx_missed_errors = src->rx_missed_errors;
4246 dst->tx_aborted_errors = src->tx_aborted_errors;
4247 dst->tx_carrier_errors = src->tx_carrier_errors;
4248 dst->tx_fifo_errors = src->tx_fifo_errors;
4249 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
4250 dst->tx_window_errors = src->tx_window_errors;
4254 get_stats_via_netlink(int ifindex, struct netdev_stats *stats)
4256 /* Policy for RTNLGRP_LINK messages.
4258 * There are *many* more fields in these messages, but currently we only
4259 * care about these fields. */
4260 static const struct nl_policy rtnlgrp_link_policy[] = {
4261 [IFLA_IFNAME] = { .type = NL_A_STRING, .optional = false },
4262 [IFLA_STATS] = { .type = NL_A_UNSPEC, .optional = true,
4263 .min_len = sizeof(struct rtnl_link_stats) },
4266 struct ofpbuf request;
4267 struct ofpbuf *reply;
4268 struct ifinfomsg *ifi;
4269 struct nlattr *attrs[ARRAY_SIZE(rtnlgrp_link_policy)];
4272 ofpbuf_init(&request, 0);
4273 nl_msg_put_nlmsghdr(&request, sizeof *ifi, RTM_GETLINK, NLM_F_REQUEST);
4274 ifi = ofpbuf_put_zeros(&request, sizeof *ifi);
4275 ifi->ifi_family = PF_UNSPEC;
4276 ifi->ifi_index = ifindex;
4277 error = nl_sock_transact(rtnl_sock, &request, &reply);
4278 ofpbuf_uninit(&request);
4283 if (!nl_policy_parse(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg),
4284 rtnlgrp_link_policy,
4285 attrs, ARRAY_SIZE(rtnlgrp_link_policy))) {
4286 ofpbuf_delete(reply);
4290 if (!attrs[IFLA_STATS]) {
4291 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
4292 ofpbuf_delete(reply);
4296 netdev_stats_from_rtnl_link_stats(stats, nl_attr_get(attrs[IFLA_STATS]));
4298 ofpbuf_delete(reply);
4304 get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats)
4306 static const char fn[] = "/proc/net/dev";
4311 stream = fopen(fn, "r");
4313 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
4318 while (fgets(line, sizeof line, stream)) {
4321 #define X64 "%"SCNu64
4324 X64 X64 X64 X64 X64 X64 X64 "%*u"
4325 X64 X64 X64 X64 X64 X64 X64 "%*u",
4331 &stats->rx_fifo_errors,
4332 &stats->rx_frame_errors,
4338 &stats->tx_fifo_errors,
4340 &stats->tx_carrier_errors) != 15) {
4341 VLOG_WARN_RL(&rl, "%s:%d: parse error", fn, ln);
4342 } else if (!strcmp(devname, netdev_name)) {
4343 stats->rx_length_errors = UINT64_MAX;
4344 stats->rx_over_errors = UINT64_MAX;
4345 stats->rx_crc_errors = UINT64_MAX;
4346 stats->rx_missed_errors = UINT64_MAX;
4347 stats->tx_aborted_errors = UINT64_MAX;
4348 stats->tx_heartbeat_errors = UINT64_MAX;
4349 stats->tx_window_errors = UINT64_MAX;
4355 VLOG_WARN_RL(&rl, "%s: no stats for %s", fn, netdev_name);
4361 get_flags(const struct netdev_dev *dev, unsigned int *flags)
4367 error = netdev_linux_do_ioctl(dev->name, &ifr, SIOCGIFFLAGS,
4370 *flags = ifr.ifr_flags;
4376 set_flags(struct netdev *netdev, unsigned int flags)
4380 ifr.ifr_flags = flags;
4381 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, SIOCSIFFLAGS,
4386 do_get_ifindex(const char *netdev_name)
4390 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4391 COVERAGE_INC(netdev_get_ifindex);
4392 if (ioctl(af_inet_sock, SIOCGIFINDEX, &ifr) < 0) {
4393 VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
4394 netdev_name, strerror(errno));
4397 return ifr.ifr_ifindex;
4401 get_ifindex(const struct netdev *netdev_, int *ifindexp)
4403 struct netdev_dev_linux *netdev_dev =
4404 netdev_dev_linux_cast(netdev_get_dev(netdev_));
4406 if (!(netdev_dev->cache_valid & VALID_IFINDEX)) {
4407 int ifindex = do_get_ifindex(netdev_get_name(netdev_));
4410 netdev_dev->get_ifindex_error = -ifindex;
4411 netdev_dev->ifindex = 0;
4413 netdev_dev->get_ifindex_error = 0;
4414 netdev_dev->ifindex = ifindex;
4416 netdev_dev->cache_valid |= VALID_IFINDEX;
4419 *ifindexp = netdev_dev->ifindex;
4420 return netdev_dev->get_ifindex_error;
4424 get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN])
4429 memset(&ifr, 0, sizeof ifr);
4430 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4431 COVERAGE_INC(netdev_get_hwaddr);
4432 if (ioctl(af_inet_sock, SIOCGIFHWADDR, &ifr) < 0) {
4433 /* ENODEV probably means that a vif disappeared asynchronously and
4434 * hasn't been removed from the database yet, so reduce the log level
4435 * to INFO for that case. */
4436 VLOG(errno == ENODEV ? VLL_INFO : VLL_ERR,
4437 "ioctl(SIOCGIFHWADDR) on %s device failed: %s",
4438 netdev_name, strerror(errno));
4441 hwaddr_family = ifr.ifr_hwaddr.sa_family;
4442 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER) {
4443 VLOG_WARN("%s device has unknown hardware address family %d",
4444 netdev_name, hwaddr_family);
4446 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
4451 set_etheraddr(const char *netdev_name,
4452 const uint8_t mac[ETH_ADDR_LEN])
4456 memset(&ifr, 0, sizeof ifr);
4457 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4458 ifr.ifr_hwaddr.sa_family = ARPHRD_ETHER;
4459 memcpy(ifr.ifr_hwaddr.sa_data, mac, ETH_ADDR_LEN);
4460 COVERAGE_INC(netdev_set_hwaddr);
4461 if (ioctl(af_inet_sock, SIOCSIFHWADDR, &ifr) < 0) {
4462 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
4463 netdev_name, strerror(errno));
4470 netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
4471 int cmd, const char *cmd_name)
4475 memset(&ifr, 0, sizeof ifr);
4476 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
4477 ifr.ifr_data = (caddr_t) ecmd;
4480 COVERAGE_INC(netdev_ethtool);
4481 if (ioctl(af_inet_sock, SIOCETHTOOL, &ifr) == 0) {
4484 if (errno != EOPNOTSUPP) {
4485 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
4486 "failed: %s", cmd_name, name, strerror(errno));
4488 /* The device doesn't support this operation. That's pretty
4489 * common, so there's no point in logging anything. */
4496 netdev_linux_do_ioctl(const char *name, struct ifreq *ifr, int cmd,
4497 const char *cmd_name)
4499 ovs_strzcpy(ifr->ifr_name, name, sizeof ifr->ifr_name);
4500 if (ioctl(af_inet_sock, cmd, ifr) == -1) {
4501 VLOG_DBG_RL(&rl, "%s: ioctl(%s) failed: %s", name, cmd_name,
4509 netdev_linux_get_ipv4(const struct netdev *netdev, struct in_addr *ip,
4510 int cmd, const char *cmd_name)
4515 ifr.ifr_addr.sa_family = AF_INET;
4516 error = netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, cmd, cmd_name);
4518 const struct sockaddr_in *sin = (struct sockaddr_in *) &ifr.ifr_addr;
4519 *ip = sin->sin_addr;
4524 /* Returns an AF_PACKET raw socket or a negative errno value. */
4526 af_packet_sock(void)
4528 static int sock = INT_MIN;
4530 if (sock == INT_MIN) {
4531 sock = socket(AF_PACKET, SOCK_RAW, 0);
4533 set_nonblocking(sock);
4536 VLOG_ERR("failed to create packet socket: %s", strerror(errno));