2 * Copyright (c) 2009, 2010, 2011, 2012, 2013 Nicira, Inc.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
19 #include "netdev-linux.h"
23 #include <arpa/inet.h>
25 #include <linux/gen_stats.h>
26 #include <linux/if_ether.h>
27 #include <linux/if_tun.h>
28 #include <linux/types.h>
29 #include <linux/ethtool.h>
30 #include <linux/mii.h>
31 #include <linux/pkt_cls.h>
32 #include <linux/pkt_sched.h>
33 #include <linux/rtnetlink.h>
34 #include <linux/sockios.h>
35 #include <linux/version.h>
36 #include <sys/types.h>
37 #include <sys/ioctl.h>
38 #include <sys/socket.h>
39 #include <netpacket/packet.h>
41 #include <net/if_arp.h>
42 #include <net/if_packet.h>
43 #include <net/route.h>
44 #include <netinet/in.h>
51 #include "dpif-linux.h"
52 #include "dynamic-string.h"
53 #include "fatal-signal.h"
56 #include "netdev-provider.h"
57 #include "netdev-vport.h"
58 #include "netlink-notifier.h"
59 #include "netlink-socket.h"
62 #include "openflow/openflow.h"
64 #include "poll-loop.h"
65 #include "rtnetlink-link.h"
67 #include "socket-util.h"
70 #include "unaligned.h"
73 VLOG_DEFINE_THIS_MODULE(netdev_linux);
75 COVERAGE_DEFINE(netdev_set_policing);
76 COVERAGE_DEFINE(netdev_arp_lookup);
77 COVERAGE_DEFINE(netdev_get_ifindex);
78 COVERAGE_DEFINE(netdev_get_hwaddr);
79 COVERAGE_DEFINE(netdev_set_hwaddr);
80 COVERAGE_DEFINE(netdev_get_ethtool);
81 COVERAGE_DEFINE(netdev_set_ethtool);
84 /* These were introduced in Linux 2.6.14, so they might be missing if we have
 * old headers. */
86 #ifndef ADVERTISED_Pause
87 #define ADVERTISED_Pause (1 << 13)
89 #ifndef ADVERTISED_Asym_Pause
90 #define ADVERTISED_Asym_Pause (1 << 14)
93 /* These were introduced in Linux 2.6.24, so they might be missing if we
94 * have old headers. */
95 #ifndef ETHTOOL_GFLAGS
96 #define ETHTOOL_GFLAGS 0x00000025 /* Get flags bitmap(ethtool_value) */
98 #ifndef ETHTOOL_SFLAGS
99 #define ETHTOOL_SFLAGS 0x00000026 /* Set flags bitmap(ethtool_value) */
102 /* This was introduced in Linux 2.6.25, so it might be missing if we have old
 * headers. */
105 #define TC_RTAB_SIZE 1024
108 static struct nln_notifier *netdev_linux_cache_notifier = NULL;
109 static int cache_notifier_refcount;
112 VALID_IFINDEX = 1 << 0,
113 VALID_ETHERADDR = 1 << 1,
117 VALID_POLICING = 1 << 5,
118 VALID_VPORT_STAT_ERROR = 1 << 6,
119 VALID_DRVINFO = 1 << 7,
120 VALID_FEATURES = 1 << 8,
127 /* Traffic control. */
129 /* An instance of a traffic control class. Always associated with a particular
 * network device.
132 * Each TC implementation subclasses this with whatever additional data it
 * needs. */
135 const struct tc_ops *ops;
136 struct hmap queues; /* Contains "struct tc_queue"s.
137 * Read by generic TC layer.
138 * Written only by TC implementation. */
141 #define TC_INITIALIZER(TC, OPS) { OPS, HMAP_INITIALIZER(&(TC)->queues) }
143 /* One traffic control queue.
145 * Each TC implementation subclasses this with whatever additional data it
 * needs. */
148 struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
149 unsigned int queue_id; /* OpenFlow queue ID. */
152 /* A particular kind of traffic control. Each implementation generally maps to
153 * one particular Linux qdisc class.
155 * The functions below return 0 if successful or a positive errno value on
156 * failure, except where otherwise noted. All of them must be provided, except
157 * where otherwise noted. */
159 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
160 * This is null for tc_ops_default and tc_ops_other, for which there are no
161 * appropriate values. */
162 const char *linux_name;
164 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
165 const char *ovs_name;
167 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
168 * queues. The queues are numbered 0 through n_queues - 1. */
169 unsigned int n_queues;
171 /* Called to install this TC class on 'netdev'. The implementation should
172 * make the Netlink calls required to set up 'netdev' with the right qdisc
173 * and configure it according to 'details'. The implementation may assume
174 * that the current qdisc is the default; that is, there is no need for it
175 * to delete the current qdisc before installing itself.
177 * The contents of 'details' should be documented as valid for 'ovs_name'
178 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
179 * (which is built as ovs-vswitchd.conf.db(8)).
181 * This function must return 0 if and only if it sets 'netdev->tc' to an
182 * initialized 'struct tc'.
184 * (This function is null for tc_ops_other, which cannot be installed. For
185 * other TC classes it should always be nonnull.) */
186 int (*tc_install)(struct netdev *netdev, const struct smap *details);
188 /* Called when the netdev code determines (through a Netlink query) that
189 * this TC class's qdisc is installed on 'netdev', but we didn't install
190 * it ourselves and so don't know any of the details.
192 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
193 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
194 * implementation should parse the other attributes of 'nlmsg' as
195 * necessary to determine its configuration. If necessary it should also
196 * use Netlink queries to determine the configuration of queues on 'netdev'.
199 * This function must return 0 if and only if it sets 'netdev->tc' to an
200 * initialized 'struct tc'. */
201 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
203 /* Destroys the data structures allocated by the implementation as part of
204 * 'tc'. (This includes destroying 'tc->queues' by calling tc_destroy(tc).)
207 * The implementation should not need to perform any Netlink calls. If
208 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
209 * (But it may not be desirable.)
211 * This function may be null if 'tc' is trivial. */
212 void (*tc_destroy)(struct tc *tc);
214 /* Retrieves details of 'netdev->tc' configuration into 'details'.
216 * The implementation should not need to perform any Netlink calls, because
217 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
218 * cached the configuration.
220 * The contents of 'details' should be documented as valid for 'ovs_name'
221 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
222 * (which is built as ovs-vswitchd.conf.db(8)).
224 * This function may be null if 'tc' is not configurable. */
226 int (*qdisc_get)(const struct netdev *netdev, struct smap *details);
228 /* Reconfigures 'netdev->tc' according to 'details', performing any
229 * required Netlink calls to complete the reconfiguration.
231 * The contents of 'details' should be documented as valid for 'ovs_name'
232 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
233 * (which is built as ovs-vswitchd.conf.db(8)).
235 * This function may be null if 'tc' is not configurable. */
237 int (*qdisc_set)(struct netdev *, const struct smap *details);
239 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
240 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
242 * The contents of 'details' should be documented as valid for 'ovs_name'
243 * in the "other_config" column in the "Queue" table in
244 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
246 * The implementation should not need to perform any Netlink calls, because
247 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
248 * cached the queue configuration.
250 * This function may be null if 'tc' does not have queues ('n_queues' is
 * 0). */
252 int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
253 struct smap *details);
255 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
256 * 'details', performing any required Netlink calls to complete the
257 * reconfiguration. The caller ensures that 'queue_id' is less than
 * 'n_queues'.
260 * The contents of 'details' should be documented as valid for 'ovs_name'
261 * in the "other_config" column in the "Queue" table in
262 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
264 * This function may be null if 'tc' does not have queues or its queues are
265 * not configurable. */
266 int (*class_set)(struct netdev *, unsigned int queue_id,
267 const struct smap *details);
269 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
270 * tc_queue's within 'netdev->tc->queues'.
272 * This function may be null if 'tc' does not have queues or its queues
273 * cannot be deleted. */
274 int (*class_delete)(struct netdev *, struct tc_queue *queue);
276 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
277 * 'struct tc_queue's within 'netdev->tc->queues'.
279 * On success, initializes '*stats'.
281 * This function may be null if 'tc' does not have queues or if it cannot
282 * report queue statistics. */
283 int (*class_get_stats)(const struct netdev *netdev,
284 const struct tc_queue *queue,
285 struct netdev_queue_stats *stats);
287 /* Extracts queue stats from 'nlmsg', which is a response to a
288 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
290 * This function may be null if 'tc' does not have queues or if it cannot
291 * report queue statistics. */
292 int (*class_dump_stats)(const struct netdev *netdev,
293 const struct ofpbuf *nlmsg,
294 netdev_dump_queue_stats_cb *cb, void *aux);
/* Initializes 'tc'.  The code visible here initializes the 'tc->queues' hmap.
 * NOTE(review): 'ops' is presumably stored into 'tc' on a line that is not
 * visible in this excerpt -- confirm. */
298 tc_init(struct tc *tc, const struct tc_ops *ops)
301 hmap_init(&tc->queues);
/* Releases the 'tc->queues' hmap of 'tc' by calling hmap_destroy() on it. */
305 tc_destroy(struct tc *tc)
307 hmap_destroy(&tc->queues);
310 static const struct tc_ops tc_ops_htb;
311 static const struct tc_ops tc_ops_hfsc;
312 static const struct tc_ops tc_ops_default;
313 static const struct tc_ops tc_ops_other;
315 static const struct tc_ops *const tcs[] = {
316 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
317 &tc_ops_hfsc, /* Hierarchical fair service curve. */
318 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
319 &tc_ops_other, /* Some other qdisc. */
323 static unsigned int tc_make_handle(unsigned int major, unsigned int minor);
324 static unsigned int tc_get_major(unsigned int handle);
325 static unsigned int tc_get_minor(unsigned int handle);
327 static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
328 static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
329 static unsigned int tc_buffer_per_jiffy(unsigned int rate);
331 static struct tcmsg *tc_make_request(const struct netdev *, int type,
332 unsigned int flags, struct ofpbuf *);
333 static int tc_transact(struct ofpbuf *request, struct ofpbuf **replyp);
334 static int tc_add_del_ingress_qdisc(struct netdev *netdev, bool add);
335 static int tc_add_policer(struct netdev *netdev, int kbits_rate,
338 static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
339 struct nlattr **options);
340 static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
341 struct nlattr **options,
342 struct netdev_queue_stats *);
343 static int tc_query_class(const struct netdev *,
344 unsigned int handle, unsigned int parent,
345 struct ofpbuf **replyp);
346 static int tc_delete_class(const struct netdev *, unsigned int handle);
348 static int tc_del_qdisc(struct netdev *netdev);
349 static int tc_query_qdisc(const struct netdev *netdev);
351 static int tc_calc_cell_log(unsigned int mtu);
352 static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
353 static void tc_put_rtab(struct ofpbuf *, uint16_t type,
354 const struct tc_ratespec *rate);
355 static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
357 struct netdev_linux {
360 struct shash_node *shash_node;
361 unsigned int cache_valid;
362 unsigned int change_seq;
364 bool miimon; /* Link status of last poll. */
365 long long int miimon_interval; /* Miimon Poll rate. Disabled if <= 0. */
366 struct timer miimon_timer;
368 /* The following are figured out "on demand" only. They are only valid
369 * when the corresponding VALID_* bit in 'cache_valid' is set. */
371 uint8_t etheraddr[ETH_ADDR_LEN];
372 struct in_addr address, netmask;
375 unsigned int ifi_flags;
376 long long int carrier_resets;
377 uint32_t kbits_rate; /* Policing data. */
378 uint32_t kbits_burst;
379 int vport_stats_error; /* Cached error code from vport_get_stats().
380 0 or an errno value. */
381 int netdev_mtu_error; /* Cached error code from SIOCGIFMTU or SIOCSIFMTU. */
382 int ether_addr_error; /* Cached error code from set/get etheraddr. */
383 int netdev_policing_error; /* Cached error code from set policing. */
384 int get_features_error; /* Cached error code from ETHTOOL_GSET. */
385 int get_ifindex_error; /* Cached error code from SIOCGIFINDEX. */
387 enum netdev_features current; /* Cached from ETHTOOL_GSET. */
388 enum netdev_features advertised; /* Cached from ETHTOOL_GSET. */
389 enum netdev_features supported; /* Cached from ETHTOOL_GSET. */
390 enum netdev_features peer; /* Cached from ETHTOOL_GSET. */
392 struct ethtool_drvinfo drvinfo; /* Cached from ETHTOOL_GDRVINFO. */
396 struct tap_state tap;
400 struct netdev_rx_linux {
406 static const struct netdev_rx_class netdev_rx_linux_class;
408 /* Sockets used for ioctl operations. */
409 static int af_inet_sock = -1; /* AF_INET, SOCK_DGRAM. */
411 /* A Netlink routing socket that is not subscribed to any multicast groups. */
412 static struct nl_sock *rtnl_sock;
414 /* This is set pretty low because we probably won't learn anything from the
415 * additional log messages. */
416 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
418 static int netdev_linux_init(void);
420 static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
421 int cmd, const char *cmd_name);
422 static int netdev_linux_do_ioctl(const char *name, struct ifreq *, int cmd,
423 const char *cmd_name);
424 static int netdev_linux_get_ipv4(const struct netdev *, struct in_addr *,
425 int cmd, const char *cmd_name);
426 static int get_flags(const struct netdev *, unsigned int *flags);
427 static int set_flags(const char *, unsigned int flags);
428 static int do_get_ifindex(const char *netdev_name);
429 static int get_ifindex(const struct netdev *, int *ifindexp);
430 static int do_set_addr(struct netdev *netdev,
431 int ioctl_nr, const char *ioctl_name,
432 struct in_addr addr);
433 static int get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN]);
434 static int set_etheraddr(const char *netdev_name, const uint8_t[ETH_ADDR_LEN]);
435 static int get_stats_via_netlink(int ifindex, struct netdev_stats *stats);
436 static int get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats);
437 static int af_packet_sock(void);
438 static void netdev_linux_miimon_run(void);
439 static void netdev_linux_miimon_wait(void);
/* Returns whether 'netdev_class' uses netdev_linux_init() as its 'init'
 * callback.  netdev_linux_cast() relies on this test to recognize netdevs
 * that may be converted to 'struct netdev_linux'. */
442 is_netdev_linux_class(const struct netdev_class *netdev_class)
444 return netdev_class->init == netdev_linux_init;
/* Returns whether the class of 'netdev' is netdev_tap_class. */
448 is_tap_netdev(const struct netdev *netdev)
450 return netdev_get_class(netdev) == &netdev_tap_class;
/* Converts 'netdev' into the 'struct netdev_linux' that embeds it as member
 * 'up'.  Asserts that the class of 'netdev' passes is_netdev_linux_class(). */
453 static struct netdev_linux *
454 netdev_linux_cast(const struct netdev *netdev)
456 ovs_assert(is_netdev_linux_class(netdev_get_class(netdev)));
458 return CONTAINER_OF(netdev, struct netdev_linux, up);
/* Converts 'rx' into the 'struct netdev_rx_linux' that embeds it as member
 * 'up', after checking with netdev_rx_assert_class() that 'rx' belongs to
 * netdev_rx_linux_class. */
461 static struct netdev_rx_linux *
462 netdev_rx_linux_cast(const struct netdev_rx *rx)
464 netdev_rx_assert_class(rx, &netdev_rx_linux_class);
465 return CONTAINER_OF(rx, struct netdev_rx_linux, up);
/* Sets up the file-level sockets: the AF_INET datagram socket 'af_inet_sock'
 * and the rtnetlink socket 'rtnl_sock'.  Failures are logged.  'status' is
 * static, so its value persists across calls.
 * NOTE(review): the conditions guarding these statements and the return
 * statement are not visible in this excerpt. */
469 netdev_linux_init(void)
471 static int status = -1;
473 /* Create AF_INET socket. */
474 af_inet_sock = socket(AF_INET, SOCK_DGRAM, 0);
/* socket() reports failure through errno; 0 means success. */
475 status = af_inet_sock >= 0 ? 0 : errno;
477 VLOG_ERR("failed to create inet socket: %s", strerror(status));
480 /* Create rtnetlink socket. */
482 status = nl_sock_create(NETLINK_ROUTE, &rtnl_sock);
484 VLOG_ERR_RL(&rl, "failed to create rtnetlink socket: %s",
/* Performs periodic work: runs the rtnetlink link machinery, then the MII
 * link monitor. */
493 netdev_linux_run(void)
495 rtnetlink_link_run();
496 netdev_linux_miimon_run();
/* Counterpart of netdev_linux_run(): calls the wait function of the rtnetlink
 * link machinery and of the MII link monitor. */
500 netdev_linux_wait(void)
502 rtnetlink_link_wait();
503 netdev_linux_miimon_wait();
/* Records a change to 'dev'.  'ifi_flags' becomes the cached interface flags;
 * if IFF_RUNNING differs between the old and new flags, 'carrier_resets' is
 * incremented.  Only the 'cache_valid' bits that are also set in 'mask'
 * remain set afterwards, so a 'mask' of 0 invalidates the whole cache. */
507 netdev_linux_changed(struct netdev_linux *dev,
508 unsigned int ifi_flags, unsigned int mask)
/* NOTE(review): presumably handles 'change_seq' having wrapped to 0; the
 * surrounding lines are not visible in this excerpt. */
511 if (!dev->change_seq) {
515 if ((dev->ifi_flags ^ ifi_flags) & IFF_RUNNING) {
516 dev->carrier_resets++;
518 dev->ifi_flags = ifi_flags;
520 dev->cache_valid &= mask;
/* Applies rtnetlink notification 'change' to 'dev'.
 *
 * For RTM_NEWLINK, every cached value except VALID_DRVINFO is invalidated
 * and then the MTU, the Ethernet address (only if nonzero) and the ifindex
 * carried by 'change' are stored as valid cached values with their error
 * codes cleared.  The other visible path invalidates the whole cache.
 * NOTE(review): the condition guarding the MTU update is not visible in
 * this excerpt. */
524 netdev_linux_update(struct netdev_linux *dev,
525 const struct rtnetlink_link_change *change)
527 if (change->nlmsg_type == RTM_NEWLINK) {
529 netdev_linux_changed(dev, change->ifi_flags, VALID_DRVINFO);
531 /* Update netdev from rtnl-change msg. */
533 dev->mtu = change->mtu;
534 dev->cache_valid |= VALID_MTU;
535 dev->netdev_mtu_error = 0;
538 if (!eth_addr_is_zero(change->addr)) {
539 memcpy(dev->etheraddr, change->addr, ETH_ADDR_LEN);
540 dev->cache_valid |= VALID_ETHERADDR;
541 dev->ether_addr_error = 0;
544 dev->ifindex = change->ifi_index;
545 dev->cache_valid |= VALID_IFINDEX;
546 dev->get_ifindex_error = 0;
549 netdev_linux_changed(dev, change->ifi_flags, 0);
/* Callback registered with rtnetlink_link_notifier_create().
 *
 * One visible branch looks up the netdev named by 'change->ifname' and, if it
 * passes is_netdev_linux_class(), updates just that netdev.  The other
 * visible branch walks every netdev_linux_class device, re-reads its flags
 * and invalidates its whole cache.
 * NOTE(review): the condition selecting between the two branches is not
 * visible in this excerpt -- presumably whether 'change' is null. */
554 netdev_linux_cache_cb(const struct rtnetlink_link_change *change,
555 void *aux OVS_UNUSED)
557 struct netdev_linux *dev;
559 struct netdev *base_dev = netdev_from_name(change->ifname);
560 if (base_dev && is_netdev_linux_class(netdev_get_class(base_dev))) {
561 netdev_linux_update(netdev_linux_cast(base_dev), change);
564 struct shash device_shash;
565 struct shash_node *node;
567 shash_init(&device_shash);
568 netdev_get_devices(&netdev_linux_class, &device_shash);
569 SHASH_FOR_EACH (node, &device_shash) {
/* NOTE(review): the result of get_flags() is ignored here, so 'flags' is
 * used below even if the query failed. */
574 get_flags(&dev->up, &flags);
575 netdev_linux_changed(dev, flags, 0);
577 shash_destroy(&device_shash);
/* Takes a reference on the shared rtnetlink link notifier.  When the
 * reference count is zero the notifier is first created with
 * netdev_linux_cache_cb() as its callback; a null result from the creation
 * call is treated specially (handling not visible in this excerpt). */
582 cache_notifier_ref(void)
584 if (!cache_notifier_refcount) {
585 ovs_assert(!netdev_linux_cache_notifier);
587 netdev_linux_cache_notifier =
588 rtnetlink_link_notifier_create(netdev_linux_cache_cb, NULL);
590 if (!netdev_linux_cache_notifier) {
594 cache_notifier_refcount++;
/* Drops a reference taken by cache_notifier_ref().  Asserts that at least one
 * reference is held.  When the count reaches zero, the notifier is destroyed
 * and the file-level pointer is reset to NULL. */
600 cache_notifier_unref(void)
602 ovs_assert(cache_notifier_refcount > 0);
603 if (!--cache_notifier_refcount) {
604 ovs_assert(netdev_linux_cache_notifier);
605 rtnetlink_link_notifier_destroy(netdev_linux_cache_notifier);
606 netdev_linux_cache_notifier = NULL;
610 /* Creates system and internal devices.
 *
 * Takes a cache-notifier reference, allocates a zeroed 'struct netdev_linux'
 * with 'change_seq' set to 1, and reads the device's flags.  If the flags
 * query reports ENODEV, the device is rejected (and the reference dropped)
 * unless 'class' is netdev_internal_class.  On the path visible at the end,
 * the new netdev is stored in '*netdevp'. */
612 netdev_linux_create(const struct netdev_class *class, const char *name,
613 struct netdev **netdevp)
615 struct netdev_linux *netdev;
618 error = cache_notifier_ref();
623 netdev = xzalloc(sizeof *netdev);
624 netdev->change_seq = 1;
625 netdev_init(&netdev->up, name, class);
626 error = get_flags(&netdev->up, &netdev->ifi_flags);
627 if (error == ENODEV) {
628 if (class != &netdev_internal_class) {
629 /* The device does not exist, so don't allow it to be opened. */
630 netdev_uninit(&netdev->up, false);
631 cache_notifier_unref();
635 /* "Internal" netdevs have to be created as netdev objects before
636 * they exist in the kernel, because creating them in the kernel
637 * happens by passing a netdev object to dpif_port_add().
638 * Therefore, ignore the error. */
642 *netdevp = &netdev->up;
646 /* For most types of netdevs we open the device for each call of
647 * netdev_open(). However, this is not the case with tap devices,
648 * since it is only possible to open the device once. In this
649 * situation we share a single file descriptor, and consequently
650 * buffers, across all readers. Therefore once data is read it will
651 * be unavailable to other reads for tap devices.
 *
 * Creates the tap device 'name': opens "/dev/net/tun", issues TUNSETIFF with
 * IFF_TAP | IFF_NO_PI, makes the descriptor non-blocking and, on the success
 * path, initializes the netdev with netdev_tap_class (the 'class' argument
 * is unused) and stores it in '*netdevp'. */
653 netdev_linux_create_tap(const struct netdev_class *class OVS_UNUSED,
654 const char *name, struct netdev **netdevp)
656 struct netdev_linux *netdev;
657 struct tap_state *state;
658 static const char tap_dev[] = "/dev/net/tun";
662 netdev = xzalloc(sizeof *netdev);
663 state = &netdev->state.tap;
665 error = cache_notifier_ref();
670 /* Open tap device. */
671 state->fd = open(tap_dev, O_RDWR);
674 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, strerror(error));
675 goto error_unref_notifier;
678 /* Create tap device. */
679 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
680 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
681 if (ioctl(state->fd, TUNSETIFF, &ifr) == -1) {
682 VLOG_WARN("%s: creating tap device failed: %s", name,
685 goto error_unref_notifier;
688 /* Make non-blocking. */
689 error = set_nonblocking(state->fd);
691 goto error_unref_notifier;
694 netdev_init(&netdev->up, name, &netdev_tap_class);
695 *netdevp = &netdev->up;
/* NOTE(review): the TUNSETIFF and set_nonblocking() failure paths jump here
 * with 'state->fd' open, and no close() is visible in this excerpt; confirm
 * that the descriptor is closed on a line not shown. */
698 error_unref_notifier:
699 cache_notifier_unref();
/* Tears down the tap state of 'netdev'.  Acts only when 'state->fd' is
 * nonnegative; the action itself is not visible in this excerpt (presumably
 * closing the descriptor). */
706 destroy_tap(struct netdev_linux *netdev)
708 struct tap_state *state = &netdev->state.tap;
710 if (state->fd >= 0) {
715 /* Destroys the netdev device 'netdev_'.
 *
 * Invokes the traffic-control implementation's 'tc_destroy' hook when both a
 * 'tc' and the hook are present, takes a tap-specific path for
 * netdev_tap_class devices, and drops the cache-notifier reference. */
717 netdev_linux_destroy(struct netdev *netdev_)
719 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
721 if (netdev->tc && netdev->tc->ops->tc_destroy) {
722 netdev->tc->ops->tc_destroy(netdev->tc);
725 if (netdev_get_class(netdev_) == &netdev_tap_class) {
730 cache_notifier_unref();
/* Opens a receive handle on 'netdev_'.
 *
 * One path reuses the tap file descriptor stored in the netdev.  The other
 * creates a PF_PACKET/SOCK_RAW socket, makes it non-blocking, and binds it
 * to the device's ifindex for ETH_P_ALL.  A 'struct netdev_rx_linux' is then
 * allocated and initialized with netdev_rx_linux_class.
 * NOTE(review): the branch condition (presumably 'is_tap'), the error
 * cleanup and the assignment to '*rxp' are not visible in this excerpt. */
734 netdev_linux_rx_open(struct netdev *netdev_, struct netdev_rx **rxp)
736 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
737 bool is_tap = is_tap_netdev(netdev_);
738 struct netdev_rx_linux *rx;
743 fd = netdev->state.tap.fd;
745 struct sockaddr_ll sll;
748 /* Create file descriptor. */
749 fd = socket(PF_PACKET, SOCK_RAW, 0);
752 VLOG_ERR("failed to create raw socket (%s)", strerror(error));
756 /* Set non-blocking mode. */
757 error = set_nonblocking(fd);
762 /* Get ethernet device index. */
763 error = get_ifindex(&netdev->up, &ifindex);
768 /* Bind to specific ethernet device. */
769 memset(&sll, 0, sizeof sll);
770 sll.sll_family = AF_PACKET;
771 sll.sll_ifindex = ifindex;
772 sll.sll_protocol = (OVS_FORCE unsigned short int) htons(ETH_P_ALL);
773 if (bind(fd, (struct sockaddr *) &sll, sizeof sll) < 0) {
775 VLOG_ERR("%s: failed to bind raw socket (%s)",
776 netdev_get_name(netdev_), strerror(error));
781 rx = xmalloc(sizeof *rx);
782 netdev_rx_init(&rx->up, netdev_, &netdev_rx_linux_class);
/* Destroys receive handle 'rx_'.  Only the conversion to
 * 'struct netdev_rx_linux' is visible in this excerpt; the cleanup steps are
 * on lines not shown. */
797 netdev_rx_linux_destroy(struct netdev_rx *rx_)
799 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
/* Receives a packet from 'rx_' into the 'size' bytes at 'data', using either
 * read() or recv() with MSG_TRUNC on 'rx->fd' and retrying while the call
 * fails with EINTR.  Failures other than EAGAIN are logged (rate-limited).
 * NOTE(review): the read()/recv() selector and the return statements are on
 * lines not visible in this excerpt; they are left untouched. */
808 netdev_rx_linux_recv(struct netdev_rx *rx_, void *data, size_t size)
810 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
815 ? read(rx->fd, data, size)
816 : recv(rx->fd, data, size, MSG_TRUNC));
817 } while (retval < 0 && errno == EINTR);
821 } else if (retval >= 0) {
824 if (errno != EAGAIN) {
/* Argument order must follow the format string: device name first, then
 * the error text (this matches the message in netdev_linux_send()). */
825 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
826 netdev_rx_get_name(rx_), strerror(errno));
/* Registers with the poll loop to wake up when the file descriptor of 'rx_'
 * becomes readable (POLLIN). */
833 netdev_rx_linux_wait(struct netdev_rx *rx_)
835 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
836 poll_fd_wait(rx->fd, POLLIN);
/* Discards packets pending on 'rx_'.
 *
 * One visible path queries the device's transmit queue length with
 * SIOCGIFTXQLEN and passes it to drain_fd(); the other returns the result of
 * drain_rcvbuf() on the descriptor.
 * NOTE(review): the condition choosing between the paths is not visible in
 * this excerpt -- presumably tap versus packet socket. */
840 netdev_rx_linux_drain(struct netdev_rx *rx_)
842 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
845 int error = netdev_linux_do_ioctl(netdev_rx_get_name(rx_), &ifr,
846 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
850 drain_fd(rx->fd, ifr.ifr_qlen);
853 return drain_rcvbuf(rx->fd);
857 /* Sends 'buffer' on 'netdev'. Returns 0 if successful, otherwise a positive
858 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
859 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
860 * the packet is too big or too small to transmit on the device.
862 * The caller retains ownership of 'buffer' in all cases.
864 * The kernel maintains a packet transmission queue, so the caller is not
865 * expected to do additional queuing of packets. */
867 netdev_linux_send(struct netdev *netdev_, const void *data, size_t size)
872 if (!is_tap_netdev(netdev_)) {
873 /* Use our AF_PACKET socket to send to this device. */
874 struct sockaddr_ll sll;
881 sock = af_packet_sock();
886 error = get_ifindex(netdev_, &ifindex);
891 /* We don't bother setting most fields in sockaddr_ll because the
892 * kernel ignores them for SOCK_RAW. */
893 memset(&sll, 0, sizeof sll);
894 sll.sll_family = AF_PACKET;
895 sll.sll_ifindex = ifindex;
897 iov.iov_base = CONST_CAST(void *, data);
901 msg.msg_namelen = sizeof sll;
904 msg.msg_control = NULL;
905 msg.msg_controllen = 0;
908 retval = sendmsg(sock, &msg, 0);
910 /* Use the tap fd to send to this device. This is essential for
911 * tap devices, because packets sent to a tap device with an
912 * AF_PACKET socket will loop back to be *received* again on the
 * tap device. */
914 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
916 retval = write(netdev->state.tap.fd, data, size);
920 /* The Linux AF_PACKET implementation never blocks waiting for room
921 * for packets, instead returning ENOBUFS. Translate this into
922 * EAGAIN for the caller. */
923 if (errno == ENOBUFS) {
925 } else if (errno == EINTR) {
927 } else if (errno != EAGAIN) {
928 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
929 netdev_get_name(netdev_), strerror(errno));
932 } else if (retval != size) {
933 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%zd bytes of "
934 "%zu) on %s", retval, size, netdev_get_name(netdev_));
942 /* Registers with the poll loop to wake up from the next call to poll_block()
943 * when the packet transmission queue has sufficient room to transmit a packet
944 * with netdev_send().
946 * The kernel maintains a packet transmission queue, so the client is not
947 * expected to do additional queuing of packets. Thus, this function is
948 * unlikely to ever be used. It is included for completeness.
 *
 * For tap devices an immediate wake-up is requested.  No action for other
 * device types is visible in this excerpt. */
950 netdev_linux_send_wait(struct netdev *netdev)
952 if (is_tap_netdev(netdev)) {
953 /* TAP device always accepts packets.*/
954 poll_immediate_wake();
958 /* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
959 * otherwise a positive errno value.
 *
 * A valid cached error is returned without retrying, and the cache is also
 * consulted to detect that the address already equals 'mac'. */
961 netdev_linux_set_etheraddr(struct netdev *netdev_,
962 const uint8_t mac[ETH_ADDR_LEN])
964 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
965 struct netdev_saved_flags *sf = NULL;
968 if (netdev->cache_valid & VALID_ETHERADDR) {
969 if (netdev->ether_addr_error) {
970 return netdev->ether_addr_error;
/* NOTE(review): presumably returns early when nothing would change; the
 * body of this 'if' is not visible in this excerpt. */
972 if (eth_addr_equals(netdev->etheraddr, mac)) {
/* The address is about to change, so the cached copy is no longer valid. */
975 netdev->cache_valid &= ~VALID_ETHERADDR;
978 /* Tap devices must be brought down before setting the address. */
979 if (is_tap_netdev(netdev_)) {
980 enum netdev_flags flags;
982 if (!netdev_get_flags(netdev_, &flags) && (flags & NETDEV_UP)) {
983 netdev_turn_flags_off(netdev_, NETDEV_UP, &sf);
986 error = set_etheraddr(netdev_get_name(netdev_), mac);
/* Only success and ENODEV are cached as the outcome for this device. */
987 if (!error || error == ENODEV) {
988 netdev->ether_addr_error = error;
989 netdev->cache_valid |= VALID_ETHERADDR;
991 memcpy(netdev->etheraddr, mac, ETH_ADDR_LEN);
/* Restores any flags saved in 'sf' above ('sf' is NULL if none were). */
995 netdev_restore_flags(sf);
1000 /* Copies 'netdev_''s MAC address into 'mac', the caller-supplied buffer of
 * ETH_ADDR_LEN bytes.
 *
 * On a cache miss the address is fetched with get_etheraddr() and both the
 * result and its error code are cached.  Returns the cached error code;
 * 'mac' is written only when that code is 0. */
1002 netdev_linux_get_etheraddr(const struct netdev *netdev_,
1003 uint8_t mac[ETH_ADDR_LEN])
1005 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1007 if (!(netdev->cache_valid & VALID_ETHERADDR)) {
1008 int error = get_etheraddr(netdev_get_name(netdev_),
1011 netdev->ether_addr_error = error;
1012 netdev->cache_valid |= VALID_ETHERADDR;
1015 if (!netdev->ether_addr_error) {
1016 memcpy(mac, netdev->etheraddr, ETH_ADDR_LEN);
1019 return netdev->ether_addr_error;
1022 /* Stores in '*mtup' the maximum size of transmitted (and received) packets on
1023 * 'netdev_', in bytes, not including the hardware header; thus, this is
1024 * typically 1500 bytes for Ethernet devices.
 *
 * On a cache miss the value is fetched with the SIOCGIFMTU ioctl and cached
 * together with its error code.  Returns the cached error code; '*mtup' is
 * written only when that code is 0. */
1026 netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
1028 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1029 if (!(netdev->cache_valid & VALID_MTU)) {
1033 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
1034 SIOCGIFMTU, "SIOCGIFMTU");
1036 netdev->netdev_mtu_error = error;
1037 netdev->mtu = ifr.ifr_mtu;
1038 netdev->cache_valid |= VALID_MTU;
1041 if (!netdev->netdev_mtu_error) {
1042 *mtup = netdev->mtu;
1044 return netdev->netdev_mtu_error;
1047 /* Sets the maximum transmission unit (MTU) of the given device using the
1048 * Linux networking ioctl interface. */
1051 netdev_linux_set_mtu(const struct netdev *netdev_, int mtu)
1053 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
/* Consult the cache first: a cached error is returned as is, and the cache
 * is also used to detect that the MTU already equals 'mtu'. */
1057 if (netdev->cache_valid & VALID_MTU) {
1058 if (netdev->netdev_mtu_error) {
1059 return netdev->netdev_mtu_error;
/* NOTE(review): presumably returns early when nothing would change; the
 * body of this 'if' is not visible in this excerpt. */
1061 if (netdev->mtu == mtu) {
/* The MTU is about to change, so the cached value is no longer valid. */
1064 netdev->cache_valid &= ~VALID_MTU;
1067 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
1068 SIOCSIFMTU, "SIOCSIFMTU");
/* Only success and ENODEV are cached as the outcome for this device. */
1069 if (!error || error == ENODEV) {
1070 netdev->netdev_mtu_error = error;
1071 netdev->mtu = ifr.ifr_mtu;
1072 netdev->cache_valid |= VALID_MTU;
1077 /* Returns the ifindex of 'netdev', if successful, as a positive number.
1078 * On failure, returns a negative errno value. */
1080 netdev_linux_get_ifindex(const struct netdev *netdev)
1084 error = get_ifindex(netdev, &ifindex);
/* get_ifindex() yields a positive errno; negate it so that errors and
 * ifindexes share one return value. */
1085 return error ? -error : ifindex;
/* Reports the carrier state of 'netdev_' in '*carrier'.  When MII monitoring
 * is enabled ('miimon_interval' > 0) the result of the last MII poll is
 * used; the other visible assignment derives it from the IFF_RUNNING bit of
 * the cached interface flags. */
1089 netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
1091 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1093 if (netdev->miimon_interval > 0) {
1094 *carrier = netdev->miimon;
1096 *carrier = (netdev->ifi_flags & IFF_RUNNING) != 0;
/* Returns the 'carrier_resets' counter of 'netdev', which
 * netdev_linux_changed() increments whenever IFF_RUNNING toggles. */
1102 static long long int
1103 netdev_linux_get_carrier_resets(const struct netdev *netdev)
1105 return netdev_linux_cast(netdev)->carrier_resets;
/* Issues MII ioctl 'cmd' (named 'cmd_name' for logging) on device 'name'.
 * '*data' is copied into a zeroed ifreq, starting at its 'ifr_data' member,
 * before the ioctl and copied back from the same place afterwards, so the
 * caller sees whatever the ioctl wrote there. */
1109 netdev_linux_do_miimon(const char *name, int cmd, const char *cmd_name,
1110 struct mii_ioctl_data *data)
1115 memset(&ifr, 0, sizeof ifr);
1116 memcpy(&ifr.ifr_data, data, sizeof *data);
1117 error = netdev_linux_do_ioctl(name, &ifr, cmd, cmd_name);
1118 memcpy(data, &ifr.ifr_data, sizeof *data);
/* Determines the link status of device 'name' and stores it in '*miimon'.
 *
 * The MII path issues SIOCGMIIPHY, then reads register MII_BMSR with
 * SIOCGMIIREG and tests its BMSR_LSTATUS bit.  The fallback path queries
 * ETHTOOL_GLINK and uses the returned 'data' value.
 * NOTE(review): the conditions linking these steps and the return statement
 * are not visible in this excerpt. */
1124 netdev_linux_get_miimon(const char *name, bool *miimon)
1126 struct mii_ioctl_data data;
1131 memset(&data, 0, sizeof data);
1132 error = netdev_linux_do_miimon(name, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
1134 /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
1135 data.reg_num = MII_BMSR;
1136 error = netdev_linux_do_miimon(name, SIOCGMIIREG, "SIOCGMIIREG",
1140 *miimon = !!(data.val_out & BMSR_LSTATUS);
1142 VLOG_WARN_RL(&rl, "%s: failed to query MII", name);
1145 struct ethtool_cmd ecmd;
1147 VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
1150 COVERAGE_INC(netdev_get_ethtool);
1151 memset(&ecmd, 0, sizeof ecmd);
1152 error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
1155 struct ethtool_value eval;
/* The reply is reinterpreted as a 'struct ethtool_value' by copying its
 * leading bytes out of 'ecmd'. */
1157 memcpy(&eval, &ecmd, sizeof eval);
1158 *miimon = !!eval.data;
1160 VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
/* Sets the MII polling interval of 'netdev_'.  A positive 'interval' is
 * raised to at least 100; any other value becomes 0, which disables polling
 * (see the 'miimon_interval' member).  If the stored interval changes, the
 * poll timer is marked as expired. */
1168 netdev_linux_set_miimon_interval(struct netdev *netdev_,
1169 long long int interval)
1171 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1173 interval = interval > 0 ? MAX(interval, 100) : 0;
1174 if (netdev->miimon_interval != interval) {
1175 netdev->miimon_interval = interval;
1176 timer_set_expired(&netdev->miimon_timer);
/* Polls MII link status for every netdev_linux_class device.  Devices with
 * polling disabled or whose timer has not expired are filtered out by the
 * check below.  For the rest, the link status is queried; if it differs from
 * the stored value it is recorded and netdev_linux_changed() is called with
 * mask 0, and the device's timer is re-armed with its interval. */
1183 netdev_linux_miimon_run(void)
1185 struct shash device_shash;
1186 struct shash_node *node;
1188 shash_init(&device_shash);
1189 netdev_get_devices(&netdev_linux_class, &device_shash);
1190 SHASH_FOR_EACH (node, &device_shash) {
1191 struct netdev_linux *dev = node->data;
1194 if (dev->miimon_interval <= 0 || !timer_expired(&dev->miimon_timer)) {
/* NOTE(review): the return value of netdev_linux_get_miimon() is ignored. */
1198 netdev_linux_get_miimon(dev->up.name, &miimon);
1199 if (miimon != dev->miimon) {
1200 dev->miimon = miimon;
1201 netdev_linux_changed(dev, dev->ifi_flags, 0);
1204 timer_set_duration(&dev->miimon_timer, dev->miimon_interval);
1207 shash_destroy(&device_shash);
/* Counterpart of netdev_linux_miimon_run(): for every netdev_linux_class
 * device with MII polling enabled, waits on that device's poll timer. */
1211 netdev_linux_miimon_wait(void)
1213 struct shash device_shash;
1214 struct shash_node *node;
1216 shash_init(&device_shash);
1217 netdev_get_devices(&netdev_linux_class, &device_shash);
1218 SHASH_FOR_EACH (node, &device_shash) {
1219 struct netdev_linux *dev = node->data;
1221 if (dev->miimon_interval > 0) {
1222 timer_wait(&dev->miimon_timer);
1225 shash_destroy(&device_shash);
1228 /* Check whether we can use RTM_GETLINK to get network device statistics.
1229 * In pre-2.6.19 kernels, this was only available if wireless extensions were
1232 check_for_working_netlink_stats(void)
1234 /* Decide on the netdev_get_stats() implementation to use. Netlink is
1235 * preferable, so if that works, we'll use it. */
1236 int ifindex = do_get_ifindex("lo");
1238 VLOG_WARN("failed to get ifindex for lo, "
1239 "obtaining netdev stats from proc");
/* Probe by actually requesting statistics for "lo" over netlink. */
1242 struct netdev_stats stats;
1243 int error = get_stats_via_netlink(ifindex, &stats);
1245 VLOG_DBG("obtaining netdev stats via rtnetlink");
1248 VLOG_INFO("RTM_GETLINK failed (%s), obtaining netdev stats "
1249 "via proc (you are probably running a pre-2.6.19 "
1250 "kernel)", strerror(error));
/* NOTE(review): presumably exchanges the values of '*a' and '*b'; the body
 * is not visible in this excerpt -- confirm. */
1257 swap_uint64(uint64_t *a, uint64_t *b)
1264 /* Copies 'src' into 'dst', performing format conversion in the process.
1266 * 'src' is allowed to be misaligned. */
1268 netdev_stats_from_ovs_vport_stats(struct netdev_stats *dst,
1269 const struct ovs_vport_stats *src)
/* Counters taken from 'src'; get_unaligned_u64() is used for every read
 * because 'src' may be misaligned. */
1271 dst->rx_packets = get_unaligned_u64(&src->rx_packets);
1272 dst->tx_packets = get_unaligned_u64(&src->tx_packets);
1273 dst->rx_bytes = get_unaligned_u64(&src->rx_bytes);
1274 dst->tx_bytes = get_unaligned_u64(&src->tx_bytes);
1275 dst->rx_errors = get_unaligned_u64(&src->rx_errors);
1276 dst->tx_errors = get_unaligned_u64(&src->tx_errors);
1277 dst->rx_dropped = get_unaligned_u64(&src->rx_dropped);
1278 dst->tx_dropped = get_unaligned_u64(&src->tx_dropped);
/* The remaining counters are not read from 'src'; they are zeroed. */
1280 dst->collisions = 0;
1281 dst->rx_length_errors = 0;
1282 dst->rx_over_errors = 0;
1283 dst->rx_crc_errors = 0;
1284 dst->rx_frame_errors = 0;
1285 dst->rx_fifo_errors = 0;
1286 dst->rx_missed_errors = 0;
1287 dst->tx_aborted_errors = 0;
1288 dst->tx_carrier_errors = 0;
1289 dst->tx_fifo_errors = 0;
1290 dst->tx_heartbeat_errors = 0;
1291 dst->tx_window_errors = 0;
/* Looks up the vport named after 'netdev' with dpif_linux_vport_get() and,
 * when the reply carries statistics, converts them into 'stats' with
 * netdev_stats_from_ovs_vport_stats().  A reply without statistics is
 * handled by a separate branch. */
1295 get_stats_via_vport__(const struct netdev *netdev, struct netdev_stats *stats)
1297 struct dpif_linux_vport reply;
1301 error = dpif_linux_vport_get(netdev_get_name(netdev), &reply, &buf);
1304 } else if (!reply.stats) {
1309 netdev_stats_from_ovs_vport_stats(stats, reply.stats);
/* Fetches vport statistics for 'netdev_' into 'stats' and caches the outcome.
 *
 * The query is attempted unless a nonzero vport_stats_error is already cached
 * (VALID_VPORT_STAT_ERROR set in cache_valid).  The error code of the attempt
 * is stored in vport_stats_error; failures other than ENOENT are logged with
 * rate limiting. */
1317 get_stats_via_vport(const struct netdev *netdev_,
1318 struct netdev_stats *stats)
1320 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1322 if (!netdev->vport_stats_error ||
1323 !(netdev->cache_valid & VALID_VPORT_STAT_ERROR)) {
1326 error = get_stats_via_vport__(netdev_, stats);
1327 if (error && error != ENOENT) {
1328 VLOG_WARN_RL(&rl, "%s: obtaining netdev stats via vport failed "
1329 "(%s)", netdev_get_name(netdev_), strerror(error));
1331 netdev->vport_stats_error = error;
1332 netdev->cache_valid |= VALID_VPORT_STAT_ERROR;
/* Reads statistics for 'netdev_' from the kernel into 'stats'.
 *
 * The choice between get_stats_via_netlink() and get_stats_via_proc() is made
 * once, by check_for_working_netlink_stats(), and remembered in the
 * function-local static 'use_netlink_stats' (-1 means "not decided yet").
 * A failure is logged with rate limiting. */
1337 netdev_linux_sys_get_stats(const struct netdev *netdev_,
1338 struct netdev_stats *stats)
1340 static int use_netlink_stats = -1;
1343 if (use_netlink_stats < 0) {
1344 use_netlink_stats = check_for_working_netlink_stats();
1347 if (use_netlink_stats) {
1350 error = get_ifindex(netdev_, &ifindex);
1352 error = get_stats_via_netlink(ifindex, stats);
1355 error = get_stats_via_proc(netdev_get_name(netdev_), stats);
1359 VLOG_WARN_RL(&rl, "%s: linux-sys get stats failed %d",
1360 netdev_get_name(netdev_), error);
1366 /* Retrieves current device stats for 'netdev-linux'. */
1368 netdev_linux_get_stats(const struct netdev *netdev_,
1369 struct netdev_stats *stats)
1371 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1372 struct netdev_stats dev_stats;
/* Ask the vport layer first (result lands in 'stats'), then read the
 * kernel's own counters into 'dev_stats'. */
1375 get_stats_via_vport(netdev_, stats);
1377 error = netdev_linux_sys_get_stats(netdev_, &dev_stats);
1380 if (netdev->vport_stats_error) {
1387 if (netdev->vport_stats_error) {
1388 /* stats not available from OVS then use ioctl stats. */
/* Accumulate the kernel's error-type counters on top of what 'stats'
 * already holds. */
1391 stats->rx_errors += dev_stats.rx_errors;
1392 stats->tx_errors += dev_stats.tx_errors;
1393 stats->rx_dropped += dev_stats.rx_dropped;
1394 stats->tx_dropped += dev_stats.tx_dropped;
1395 stats->multicast += dev_stats.multicast;
1396 stats->collisions += dev_stats.collisions;
1397 stats->rx_length_errors += dev_stats.rx_length_errors;
1398 stats->rx_over_errors += dev_stats.rx_over_errors;
1399 stats->rx_crc_errors += dev_stats.rx_crc_errors;
1400 stats->rx_frame_errors += dev_stats.rx_frame_errors;
1401 stats->rx_fifo_errors += dev_stats.rx_fifo_errors;
1402 stats->rx_missed_errors += dev_stats.rx_missed_errors;
1403 stats->tx_aborted_errors += dev_stats.tx_aborted_errors;
1404 stats->tx_carrier_errors += dev_stats.tx_carrier_errors;
1405 stats->tx_fifo_errors += dev_stats.tx_fifo_errors;
1406 stats->tx_heartbeat_errors += dev_stats.tx_heartbeat_errors;
1407 stats->tx_window_errors += dev_stats.tx_window_errors;
1412 /* Retrieves current device stats for 'netdev-tap' netdev or
1413 * netdev-internal. */
1415 netdev_tap_get_stats(const struct netdev *netdev_,
1416 struct netdev_stats *stats)
1418 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1419 struct netdev_stats dev_stats;
/* Ask the vport layer first (result lands in 'stats'), then read the
 * kernel's own counters into 'dev_stats'.  Further down, kernel rx/tx
 * counters are combined crosswise (rx with tx) with the vport values. */
1422 get_stats_via_vport(netdev_, stats);
1424 error = netdev_linux_sys_get_stats(netdev_, &dev_stats);
1426 if (netdev->vport_stats_error) {
1433 /* If this port is an internal port then the transmit and receive stats
1434 * will appear to be swapped relative to the other ports since we are the
1435 * one sending the data, not a remote computer. For consistency, we swap
1436 * them back here. This does not apply if we are getting stats from the
1437 * vport layer because it always tracks stats from the perspective of the
1439 if (netdev->vport_stats_error) {
1441 swap_uint64(&stats->rx_packets, &stats->tx_packets);
1442 swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
1443 swap_uint64(&stats->rx_errors, &stats->tx_errors);
1444 swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
1445 stats->rx_length_errors = 0;
1446 stats->rx_over_errors = 0;
1447 stats->rx_crc_errors = 0;
1448 stats->rx_frame_errors = 0;
1449 stats->rx_fifo_errors = 0;
1450 stats->rx_missed_errors = 0;
1451 stats->tx_aborted_errors = 0;
1452 stats->tx_carrier_errors = 0;
1453 stats->tx_fifo_errors = 0;
1454 stats->tx_heartbeat_errors = 0;
1455 stats->tx_window_errors = 0;
1457 stats->rx_dropped += dev_stats.tx_dropped;
1458 stats->tx_dropped += dev_stats.rx_dropped;
1460 stats->rx_errors += dev_stats.tx_errors;
1461 stats->tx_errors += dev_stats.rx_errors;
1463 stats->multicast += dev_stats.multicast;
1464 stats->collisions += dev_stats.collisions;
/* Fills 'stats' from the vport layer only and returns the vport_stats_error
 * value that get_stats_via_vport() cached for 'netdev_'. */
1470 netdev_internal_get_stats(const struct netdev *netdev_,
1471 struct netdev_stats *stats)
1473 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1475 get_stats_via_vport(netdev_, stats);
1476 return netdev->vport_stats_error;
/* Pushes the packet, byte, error and drop counters from 'stats' to the vport
 * named after 'netdev' using an OVS_VPORT_CMD_SET transaction.  An ENODEV
 * result from the transaction gets special treatment (see below). */
1480 netdev_internal_set_stats(struct netdev *netdev,
1481 const struct netdev_stats *stats)
1483 struct ovs_vport_stats vport_stats;
1484 struct dpif_linux_vport vport;
1487 vport_stats.rx_packets = stats->rx_packets;
1488 vport_stats.tx_packets = stats->tx_packets;
1489 vport_stats.rx_bytes = stats->rx_bytes;
1490 vport_stats.tx_bytes = stats->tx_bytes;
1491 vport_stats.rx_errors = stats->rx_errors;
1492 vport_stats.tx_errors = stats->tx_errors;
1493 vport_stats.rx_dropped = stats->rx_dropped;
1494 vport_stats.tx_dropped = stats->tx_dropped;
1496 dpif_linux_vport_init(&vport);
1497 vport.cmd = OVS_VPORT_CMD_SET;
1498 vport.name = netdev_get_name(netdev);
1499 vport.stats = &vport_stats;
1501 err = dpif_linux_vport_transact(&vport, NULL, NULL);
1503 /* If the vport layer doesn't know about the device, that doesn't mean it
1504 * doesn't exist (after all we were able to open it when netdev_open() was
1505 * called), it just means that it isn't attached and we'll be getting
1506 * stats a different way. */
1507 if (err == ENODEV) {
/* Refreshes the cached feature bitmaps of 'netdev' ('supported',
 * 'advertised', 'current' and 'peer') from an ETHTOOL_GSET query, unless
 * VALID_FEATURES is already set in cache_valid.  The error code of the query
 * is stored in get_features_error and VALID_FEATURES is set afterwards. */
1515 netdev_linux_read_features(struct netdev_linux *netdev)
1517 struct ethtool_cmd ecmd;
1521 if (netdev->cache_valid & VALID_FEATURES) {
1525 COVERAGE_INC(netdev_get_ethtool);
1526 memset(&ecmd, 0, sizeof ecmd);
1527 error = netdev_linux_do_ethtool(netdev->up.name, &ecmd,
1528 ETHTOOL_GSET, "ETHTOOL_GSET");
1533 /* Supported features. */
1534 netdev->supported = 0;
1535 if (ecmd.supported & SUPPORTED_10baseT_Half) {
1536 netdev->supported |= NETDEV_F_10MB_HD;
1538 if (ecmd.supported & SUPPORTED_10baseT_Full) {
1539 netdev->supported |= NETDEV_F_10MB_FD;
1541 if (ecmd.supported & SUPPORTED_100baseT_Half) {
1542 netdev->supported |= NETDEV_F_100MB_HD;
1544 if (ecmd.supported & SUPPORTED_100baseT_Full) {
1545 netdev->supported |= NETDEV_F_100MB_FD;
1547 if (ecmd.supported & SUPPORTED_1000baseT_Half) {
1548 netdev->supported |= NETDEV_F_1GB_HD;
1550 if (ecmd.supported & SUPPORTED_1000baseT_Full) {
1551 netdev->supported |= NETDEV_F_1GB_FD;
1553 if (ecmd.supported & SUPPORTED_10000baseT_Full) {
1554 netdev->supported |= NETDEV_F_10GB_FD;
1556 if (ecmd.supported & SUPPORTED_TP) {
1557 netdev->supported |= NETDEV_F_COPPER;
1559 if (ecmd.supported & SUPPORTED_FIBRE) {
1560 netdev->supported |= NETDEV_F_FIBER;
1562 if (ecmd.supported & SUPPORTED_Autoneg) {
1563 netdev->supported |= NETDEV_F_AUTONEG;
1565 if (ecmd.supported & SUPPORTED_Pause) {
1566 netdev->supported |= NETDEV_F_PAUSE;
1568 if (ecmd.supported & SUPPORTED_Asym_Pause) {
1569 netdev->supported |= NETDEV_F_PAUSE_ASYM;
1572 /* Advertised features. */
1573 netdev->advertised = 0;
1574 if (ecmd.advertising & ADVERTISED_10baseT_Half) {
1575 netdev->advertised |= NETDEV_F_10MB_HD;
1577 if (ecmd.advertising & ADVERTISED_10baseT_Full) {
1578 netdev->advertised |= NETDEV_F_10MB_FD;
1580 if (ecmd.advertising & ADVERTISED_100baseT_Half) {
1581 netdev->advertised |= NETDEV_F_100MB_HD;
1583 if (ecmd.advertising & ADVERTISED_100baseT_Full) {
1584 netdev->advertised |= NETDEV_F_100MB_FD;
1586 if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
1587 netdev->advertised |= NETDEV_F_1GB_HD;
1589 if (ecmd.advertising & ADVERTISED_1000baseT_Full) {
1590 netdev->advertised |= NETDEV_F_1GB_FD;
1592 if (ecmd.advertising & ADVERTISED_10000baseT_Full) {
1593 netdev->advertised |= NETDEV_F_10GB_FD;
1595 if (ecmd.advertising & ADVERTISED_TP) {
1596 netdev->advertised |= NETDEV_F_COPPER;
1598 if (ecmd.advertising & ADVERTISED_FIBRE) {
1599 netdev->advertised |= NETDEV_F_FIBER;
1601 if (ecmd.advertising & ADVERTISED_Autoneg) {
1602 netdev->advertised |= NETDEV_F_AUTONEG;
1604 if (ecmd.advertising & ADVERTISED_Pause) {
1605 netdev->advertised |= NETDEV_F_PAUSE;
1607 if (ecmd.advertising & ADVERTISED_Asym_Pause) {
1608 netdev->advertised |= NETDEV_F_PAUSE_ASYM;
1611 /* Current settings. */
/* Speeds up to 1000 distinguish full from half duplex via ecmd.duplex;
 * faster speeds are always recorded as full duplex, and an unrecognized
 * speed yields 0. */
1613 if (speed == SPEED_10) {
1614 netdev->current = ecmd.duplex ? NETDEV_F_10MB_FD : NETDEV_F_10MB_HD;
1615 } else if (speed == SPEED_100) {
1616 netdev->current = ecmd.duplex ? NETDEV_F_100MB_FD : NETDEV_F_100MB_HD;
1617 } else if (speed == SPEED_1000) {
1618 netdev->current = ecmd.duplex ? NETDEV_F_1GB_FD : NETDEV_F_1GB_HD;
1619 } else if (speed == SPEED_10000) {
1620 netdev->current = NETDEV_F_10GB_FD;
1621 } else if (speed == 40000) {
1622 netdev->current = NETDEV_F_40GB_FD;
1623 } else if (speed == 100000) {
1624 netdev->current = NETDEV_F_100GB_FD;
1625 } else if (speed == 1000000) {
1626 netdev->current = NETDEV_F_1TB_FD;
1628 netdev->current = 0;
1631 if (ecmd.port == PORT_TP) {
1632 netdev->current |= NETDEV_F_COPPER;
1633 } else if (ecmd.port == PORT_FIBRE) {
1634 netdev->current |= NETDEV_F_FIBER;
1638 netdev->current |= NETDEV_F_AUTONEG;
1641 /* Peer advertisements. */
1642 netdev->peer = 0; /* XXX */
1645 netdev->cache_valid |= VALID_FEATURES;
1646 netdev->get_features_error = error;
1649 /* Stores the features supported by 'netdev' into each of '*current',
1650 * '*advertised', '*supported', and '*peer' that are non-null. Each value is a
1651 * bitmap of NETDEV_* bits. Returns 0 if successful, otherwise a positive
1654 netdev_linux_get_features(const struct netdev *netdev_,
1655 enum netdev_features *current,
1656 enum netdev_features *advertised,
1657 enum netdev_features *supported,
1658 enum netdev_features *peer)
1660 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1662 netdev_linux_read_features(netdev);
1664 if (!netdev->get_features_error) {
1665 *current = netdev->current;
1666 *advertised = netdev->advertised;
1667 *supported = netdev->supported;
1668 *peer = netdev->peer;
1670 return netdev->get_features_error;
1673 /* Set the features advertised by 'netdev' to 'advertise'. */
1675 netdev_linux_set_advertisements(struct netdev *netdev,
1676 enum netdev_features advertise)
1678 struct ethtool_cmd ecmd;
/* Read the current ethtool settings first; the same 'ecmd' is written back
 * below with only its 'advertising' mask rebuilt. */
1681 COVERAGE_INC(netdev_get_ethtool);
1682 memset(&ecmd, 0, sizeof ecmd);
1683 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1684 ETHTOOL_GSET, "ETHTOOL_GSET");
/* Translate NETDEV_F_* bits into ethtool ADVERTISED_* bits. */
1689 ecmd.advertising = 0;
1690 if (advertise & NETDEV_F_10MB_HD) {
1691 ecmd.advertising |= ADVERTISED_10baseT_Half;
1693 if (advertise & NETDEV_F_10MB_FD) {
1694 ecmd.advertising |= ADVERTISED_10baseT_Full;
1696 if (advertise & NETDEV_F_100MB_HD) {
1697 ecmd.advertising |= ADVERTISED_100baseT_Half;
1699 if (advertise & NETDEV_F_100MB_FD) {
1700 ecmd.advertising |= ADVERTISED_100baseT_Full;
1702 if (advertise & NETDEV_F_1GB_HD) {
1703 ecmd.advertising |= ADVERTISED_1000baseT_Half;
1705 if (advertise & NETDEV_F_1GB_FD) {
1706 ecmd.advertising |= ADVERTISED_1000baseT_Full;
1708 if (advertise & NETDEV_F_10GB_FD) {
1709 ecmd.advertising |= ADVERTISED_10000baseT_Full;
1711 if (advertise & NETDEV_F_COPPER) {
1712 ecmd.advertising |= ADVERTISED_TP;
1714 if (advertise & NETDEV_F_FIBER) {
1715 ecmd.advertising |= ADVERTISED_FIBRE;
1717 if (advertise & NETDEV_F_AUTONEG) {
1718 ecmd.advertising |= ADVERTISED_Autoneg;
1720 if (advertise & NETDEV_F_PAUSE) {
1721 ecmd.advertising |= ADVERTISED_Pause;
1723 if (advertise & NETDEV_F_PAUSE_ASYM) {
1724 ecmd.advertising |= ADVERTISED_Asym_Pause;
1726 COVERAGE_INC(netdev_set_ethtool);
1727 return netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1728 ETHTOOL_SSET, "ETHTOOL_SSET");
1731 /* Attempts to set input rate limiting (policing) policy. Returns 0 if
1732 * successful, otherwise a positive errno value. */
1734 netdev_linux_set_policing(struct netdev *netdev_,
1735 uint32_t kbits_rate, uint32_t kbits_burst)
1737 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1738 const char *netdev_name = netdev_get_name(netdev_);
1742 kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
1743 : !kbits_burst ? 1000 /* Default to 1000 kbits if 0. */
1744 : kbits_burst); /* Stick with user-specified value. */
/* With a cached result: a cached error is returned as-is, and the cached
 * rate/burst pair is compared against the (normalized) request. */
1746 if (netdev->cache_valid & VALID_POLICING) {
1747 if (netdev->netdev_policing_error) {
1748 return netdev->netdev_policing_error;
1751 if (netdev->kbits_rate == kbits_rate &&
1752 netdev->kbits_burst == kbits_burst) {
1753 /* Assume that settings haven't changed since we last set them. */
1756 netdev->cache_valid &= ~VALID_POLICING;
1759 COVERAGE_INC(netdev_set_policing);
1760 /* Remove any existing ingress qdisc. */
1761 error = tc_add_del_ingress_qdisc(netdev_, false);
1763 VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
1764 netdev_name, strerror(error));
1769 error = tc_add_del_ingress_qdisc(netdev_, true);
1771 VLOG_WARN_RL(&rl, "%s: adding policing qdisc failed: %s",
1772 netdev_name, strerror(error));
1776 error = tc_add_policer(netdev_, kbits_rate, kbits_burst);
1778 VLOG_WARN_RL(&rl, "%s: adding policing action failed: %s",
1779 netdev_name, strerror(error));
1784 netdev->kbits_rate = kbits_rate;
1785 netdev->kbits_burst = kbits_burst;
/* Only success and ENODEV are remembered in the cache; other errors leave
 * VALID_POLICING clear. */
1788 if (!error || error == ENODEV) {
1789 netdev->netdev_policing_error = error;
1790 netdev->cache_valid |= VALID_POLICING;
/* Adds to 'types' the OVS name of every entry in the null-terminated 'tcs'
 * array that has a tc_install callback and a non-empty ovs_name. */
1796 netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
1799 const struct tc_ops *const *opsp;
1801 for (opsp = tcs; *opsp != NULL; opsp++) {
1802 const struct tc_ops *ops = *opsp;
1803 if (ops->tc_install && ops->ovs_name[0] != '\0') {
1804 sset_add(types, ops->ovs_name);
/* Searches the null-terminated 'tcs' array for an entry whose ovs_name
 * equals 'name'. */
1810 static const struct tc_ops *
1811 tc_lookup_ovs_name(const char *name)
1813 const struct tc_ops *const *opsp;
1815 for (opsp = tcs; *opsp != NULL; opsp++) {
1816 const struct tc_ops *ops = *opsp;
1817 if (!strcmp(name, ops->ovs_name)) {
/* Searches the null-terminated 'tcs' array for an entry whose linux_name is
 * non-null and equals 'name'. */
1824 static const struct tc_ops *
1825 tc_lookup_linux_name(const char *name)
1827 const struct tc_ops *const *opsp;
1829 for (opsp = tcs; *opsp != NULL; opsp++) {
1830 const struct tc_ops *ops = *opsp;
1831 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
/* Searches the 'hash' bucket of netdev->tc->queues for a queue whose
 * queue_id equals 'queue_id'. */
1838 static struct tc_queue *
1839 tc_find_queue__(const struct netdev *netdev_, unsigned int queue_id,
1842 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1843 struct tc_queue *queue;
1845 HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev->tc->queues) {
1846 if (queue->queue_id == queue_id) {
/* Convenience wrapper around tc_find_queue__() that derives the hash as
 * hash_int(queue_id, 0). */
1853 static struct tc_queue *
1854 tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
1856 return tc_find_queue__(netdev, queue_id, hash_int(queue_id, 0));
/* Looks up the tc_ops registered under the OVS name 'type' and reports its
 * n_queues value in 'caps'. */
1860 netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
1862 struct netdev_qos_capabilities *caps)
1864 const struct tc_ops *ops = tc_lookup_ovs_name(type);
1868 caps->n_queues = ops->n_queues;
/* Queries the qdisc of 'netdev_' with tc_query_qdisc(), stores the OVS name
 * of its type in '*typep', and lets the type's qdisc_get callback, when one
 * is defined, fill in 'details'. */
1873 netdev_linux_get_qos(const struct netdev *netdev_,
1874 const char **typep, struct smap *details)
1876 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1879 error = tc_query_qdisc(netdev_);
1884 *typep = netdev->tc->ops->ovs_name;
1885 return (netdev->tc->ops->qdisc_get
1886 ? netdev->tc->ops->qdisc_get(netdev_, details)
/* Switches 'netdev_' to the QoS type named 'type', configured by 'details'.
 *
 * 'type' must name a tc_ops that has a tc_install callback.  If the device
 * already uses that type, only its qdisc_set callback (when present) is
 * applied; otherwise the existing qdisc is deleted and the new type's
 * tc_install callback is run.  The assertions document the invariant that
 * netdev->tc is null after deletion and non-null exactly when installation
 * succeeded. */
1891 netdev_linux_set_qos(struct netdev *netdev_,
1892 const char *type, const struct smap *details)
1894 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1895 const struct tc_ops *new_ops;
1898 new_ops = tc_lookup_ovs_name(type);
1899 if (!new_ops || !new_ops->tc_install) {
1903 error = tc_query_qdisc(netdev_);
1908 if (new_ops == netdev->tc->ops) {
1909 return new_ops->qdisc_set ? new_ops->qdisc_set(netdev_, details) : 0;
1911 /* Delete existing qdisc. */
1912 error = tc_del_qdisc(netdev_);
1916 ovs_assert(netdev->tc == NULL);
1918 /* Install new qdisc. */
1919 error = new_ops->tc_install(netdev_, details);
1920 ovs_assert((error == 0) == (netdev->tc != NULL));
/* After querying the qdisc of 'netdev_', looks up 'queue_id' with
 * tc_find_queue() and hands the queue to the qdisc type's class_get callback
 * so that it can describe the queue in 'details'. */
1927 netdev_linux_get_queue(const struct netdev *netdev_,
1928 unsigned int queue_id, struct smap *details)
1930 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1933 error = tc_query_qdisc(netdev_);
1937 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
1939 ? netdev->tc->ops->class_get(netdev_, queue, details)
/* Configures queue 'queue_id' of 'netdev_' from 'details' through the qdisc
 * type's class_set callback.  A 'queue_id' at or above the type's n_queues,
 * or a type without class_set, is diverted to a separate branch before the
 * callback is reached. */
1945 netdev_linux_set_queue(struct netdev *netdev_,
1946 unsigned int queue_id, const struct smap *details)
1948 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1951 error = tc_query_qdisc(netdev_);
1954 } else if (queue_id >= netdev->tc->ops->n_queues
1955 || !netdev->tc->ops->class_set) {
1959 return netdev->tc->ops->class_set(netdev_, queue_id, details);
/* Deletes queue 'queue_id' of 'netdev_' by looking it up with
 * tc_find_queue() and passing it to the qdisc type's class_delete callback.
 * Types without class_delete are handled by a separate branch. */
1963 netdev_linux_delete_queue(struct netdev *netdev_, unsigned int queue_id)
1965 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1968 error = tc_query_qdisc(netdev_);
1971 } else if (!netdev->tc->ops->class_delete) {
1974 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
1976 ? netdev->tc->ops->class_delete(netdev_, queue)
/* Retrieves statistics for queue 'queue_id' of 'netdev_' into 'stats' by
 * looking the queue up with tc_find_queue() and passing it to the qdisc
 * type's class_get_stats callback.  Types without class_get_stats are
 * handled by a separate branch. */
1982 netdev_linux_get_queue_stats(const struct netdev *netdev_,
1983 unsigned int queue_id,
1984 struct netdev_queue_stats *stats)
1986 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1989 error = tc_query_qdisc(netdev_);
1992 } else if (!netdev->tc->ops->class_get_stats) {
1995 const struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
1997 ? netdev->tc->ops->class_get_stats(netdev_, queue, stats)
/* Builds an RTM_GETTCLASS request for 'netdev' with tcm_parent set to 0,
 * starts a netlink dump of it on 'rtnl_sock' in 'dump', and releases the
 * request buffer. */
2003 start_queue_dump(const struct netdev *netdev, struct nl_dump *dump)
2005 struct ofpbuf request;
2006 struct tcmsg *tcmsg;
2008 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
2012 tcmsg->tcm_parent = 0;
2013 nl_dump_start(dump, rtnl_sock, &request);
2014 ofpbuf_uninit(&request);
/* Iterates over the queues in netdev->tc->queues, obtains each queue's
 * configuration through the qdisc type's class_get callback, and reports it
 * to 'cb' together with the queue id and the caller's 'aux' pointer.  A
 * single 'details' map is reused (cleared before every queue).  The
 * _SAFE iteration macro is used, presumably so that 'cb' may remove the
 * current queue -- confirm. */
2019 netdev_linux_dump_queues(const struct netdev *netdev_,
2020 netdev_dump_queues_cb *cb, void *aux)
2022 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2023 struct tc_queue *queue, *next_queue;
2024 struct smap details;
2028 error = tc_query_qdisc(netdev_);
2031 } else if (!netdev->tc->ops->class_get) {
2036 smap_init(&details);
2037 HMAP_FOR_EACH_SAFE (queue, next_queue, hmap_node,
2038 &netdev->tc->queues) {
2039 smap_clear(&details);
2041 error = netdev->tc->ops->class_get(netdev_, queue, &details);
2043 (*cb)(queue->queue_id, &details, aux);
2048 smap_destroy(&details);
/* Dumps the kernel's traffic classes for 'netdev_' and feeds every reply
 * message to the qdisc type's class_dump_stats callback along with 'cb' and
 * 'aux'.  The final result is the nl_dump_done() error if there is one,
 * otherwise 'last_error'. */
2054 netdev_linux_dump_queue_stats(const struct netdev *netdev_,
2055 netdev_dump_queue_stats_cb *cb, void *aux)
2057 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2058 struct nl_dump dump;
2063 error = tc_query_qdisc(netdev_);
2066 } else if (!netdev->tc->ops->class_dump_stats) {
2071 if (!start_queue_dump(netdev_, &dump)) {
2074 while (nl_dump_next(&dump, &msg)) {
2075 error = netdev->tc->ops->class_dump_stats(netdev_, &msg, cb, aux);
2081 error = nl_dump_done(&dump);
2082 return error ? error : last_error;
/* Stores the IPv4 address and netmask of 'netdev_' in '*address' and
 * '*netmask'.  When VALID_IN4 is not set in cache_valid, both values are
 * first fetched with SIOCGIFADDR and SIOCGIFNETMASK and cached.
 *
 * The final return yields EADDRNOTAVAIL when the address is INADDR_ANY and
 * 0 otherwise. */
2086 netdev_linux_get_in4(const struct netdev *netdev_,
2087 struct in_addr *address, struct in_addr *netmask)
2089 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2091 if (!(netdev->cache_valid & VALID_IN4)) {
2094 error = netdev_linux_get_ipv4(netdev_, &netdev->address,
2095 SIOCGIFADDR, "SIOCGIFADDR");
2100 error = netdev_linux_get_ipv4(netdev_, &netdev->netmask,
2101 SIOCGIFNETMASK, "SIOCGIFNETMASK");
2106 netdev->cache_valid |= VALID_IN4;
2108 *address = netdev->address;
2109 *netmask = netdev->netmask;
2110 return address->s_addr == INADDR_ANY ? EADDRNOTAVAIL : 0;
/* Assigns IPv4 'address' to 'netdev_' with SIOCSIFADDR and, unless 'address'
 * is INADDR_ANY, also sets 'netmask' with SIOCSIFNETMASK.  The cached
 * address and netmask are updated and VALID_IN4 is set.
 *
 * NOTE(review): the cache is updated before the SIOCSIFNETMASK call, so a
 * failure of that call would leave the requested netmask in the cache --
 * confirm that this is intended. */
2114 netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
2115 struct in_addr netmask)
2117 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2120 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
2122 netdev->cache_valid |= VALID_IN4;
2123 netdev->address = address;
2124 netdev->netmask = netmask;
2125 if (address.s_addr != INADDR_ANY) {
2126 error = do_set_addr(netdev_, SIOCSIFNETMASK,
2127 "SIOCSIFNETMASK", netmask);
/* Parses one text line in the format that netdev_linux_get_in6() reads from
 * /proc/net/if_inet6: 16 two-digit hexadecimal bytes, which are stored in
 * 'in6', followed by four hexadecimal fields that are skipped and an
 * interface name of at most 16 characters, which is stored in 'ifname'. */
2134 parse_if_inet6_line(const char *line,
2135 struct in6_addr *in6, char ifname[16 + 1])
2137 uint8_t *s6 = in6->s6_addr;
2138 #define X8 "%2"SCNx8
2140 " "X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8
2141 "%*x %*x %*x %*x %16s\n",
2142 &s6[0], &s6[1], &s6[2], &s6[3],
2143 &s6[4], &s6[5], &s6[6], &s6[7],
2144 &s6[8], &s6[9], &s6[10], &s6[11],
2145 &s6[12], &s6[13], &s6[14], &s6[15],
2149 /* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address (if
2150 * 'in6' is non-null) and returns true. Otherwise, returns false. */
/* NOTE(review): the return statements are not visible in this excerpt;
 * confirm that the true/false wording above matches the declared return
 * type.  The lookup result is cached under VALID_IN6. */
2152 netdev_linux_get_in6(const struct netdev *netdev_, struct in6_addr *in6)
2154 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2155 if (!(netdev->cache_valid & VALID_IN6)) {
/* Start from the unspecified address; it is replaced when a line for this
 * device is found. */
2159 netdev->in6 = in6addr_any;
2161 file = fopen("/proc/net/if_inet6", "r");
2163 const char *name = netdev_get_name(netdev_);
2164 while (fgets(line, sizeof line, file)) {
2165 struct in6_addr in6_tmp;
2166 char ifname[16 + 1];
2167 if (parse_if_inet6_line(line, &in6_tmp, ifname)
2168 && !strcmp(name, ifname))
2170 netdev->in6 = in6_tmp;
2176 netdev->cache_valid |= VALID_IN6;
/* Fills '*sa' with an AF_INET socket address for 'addr'.  '*sa' is zeroed
 * first and then overwritten with a zero-initialized sockaddr_in that carries
 * the family and address. */
2183 make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
2185 struct sockaddr_in sin;
2186 memset(&sin, 0, sizeof sin);
2187 sin.sin_family = AF_INET;
2188 sin.sin_addr = addr;
2191 memset(sa, 0, sizeof *sa);
2192 memcpy(sa, &sin, sizeof sin);
/* Issues the ioctl 'ioctl_nr' (described by 'ioctl_name') for 'netdev' via
 * netdev_linux_do_ioctl(), using an ifreq that carries the device name and
 * 'addr' as its IPv4 socket address. */
2196 do_set_addr(struct netdev *netdev,
2197 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
2200 ovs_strzcpy(ifr.ifr_name, netdev_get_name(netdev), sizeof ifr.ifr_name);
2201 make_in4_sockaddr(&ifr.ifr_addr, addr);
2203 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
2207 /* Adds 'router' as a default IP gateway. */
2209 netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
2211 struct in_addr any = { INADDR_ANY };
/* The route's destination and genmask are both INADDR_ANY and its gateway
 * is 'router'; it is installed with SIOCADDRT on 'af_inet_sock'. */
2215 memset(&rt, 0, sizeof rt);
2216 make_in4_sockaddr(&rt.rt_dst, any);
2217 make_in4_sockaddr(&rt.rt_gateway, router);
2218 make_in4_sockaddr(&rt.rt_genmask, any);
2219 rt.rt_flags = RTF_UP | RTF_GATEWAY;
2220 error = ioctl(af_inet_sock, SIOCADDRT, &rt) < 0 ? errno : 0;
2222 VLOG_WARN("ioctl(SIOCADDRT): %s", strerror(error));
/* Scans /proc/net/route for a route that is up and whose destination matches
 * 'host' under the route's mask.  On a match, '*next_hop' receives 0 for a
 * directly reachable host or the gateway address otherwise, and
 * '*netdev_name' receives an xstrdup()'d copy of the interface name
 * (presumably owned by the caller -- confirm).  '*netdev_name' is set to NULL
 * before the search starts. */
2228 netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
2231 static const char fn[] = "/proc/net/route";
2236 *netdev_name = NULL;
2237 stream = fopen(fn, "r");
2238 if (stream == NULL) {
2239 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
2244 while (fgets(line, sizeof line, stream)) {
2247 ovs_be32 dest, gateway, mask;
2248 int refcnt, metric, mtu;
2249 unsigned int flags, use, window, irtt;
2252 "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
2254 iface, &dest, &gateway, &flags, &refcnt,
2255 &use, &metric, &mask, &mtu, &window, &irtt) != 11) {
2257 VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
2261 if (!(flags & RTF_UP)) {
2262 /* Skip routes that aren't up. */
2266 /* The output of 'dest', 'mask', and 'gateway' were given in
2267 * network byte order, so we don't need any endian
2268 * conversions here. */
2269 if ((dest & mask) == (host->s_addr & mask)) {
2271 /* The host is directly reachable. */
2272 next_hop->s_addr = 0;
2274 /* To reach the host, we must go through a gateway. */
2275 next_hop->s_addr = gateway;
2277 *netdev_name = xstrdup(iface);
/* Adds "driver_name", "driver_version" and "firmware_version" entries to
 * 'smap' from the ethtool driver information cached in netdev->drvinfo.
 * When VALID_DRVINFO is not set, the information is first queried with
 * ETHTOOL_GDRVINFO; the drvinfo buffer is zeroed and then passed to
 * netdev_linux_do_ethtool() through a struct ethtool_cmd pointer cast. */
2289 netdev_linux_get_status(const struct netdev *netdev_, struct smap *smap)
2291 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2294 if (!(netdev->cache_valid & VALID_DRVINFO)) {
2295 struct ethtool_cmd *cmd = (struct ethtool_cmd *) &netdev->drvinfo;
2297 COVERAGE_INC(netdev_get_ethtool);
2298 memset(&netdev->drvinfo, 0, sizeof netdev->drvinfo);
2299 error = netdev_linux_do_ethtool(netdev->up.name,
2302 "ETHTOOL_GDRVINFO");
2304 netdev->cache_valid |= VALID_DRVINFO;
2309 smap_add(smap, "driver_name", netdev->drvinfo.driver);
2310 smap_add(smap, "driver_version", netdev->drvinfo.version);
2311 smap_add(smap, "firmware_version", netdev->drvinfo.fw_version);
/* Reports the fixed driver name "openvswitch" in 'smap'; 'netdev' is not
 * consulted. */
2317 netdev_internal_get_status(const struct netdev *netdev OVS_UNUSED,
2320 smap_add(smap, "driver_name", "openvswitch");
2324 /* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
2325 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
2326 * returns 0. Otherwise, it returns a positive errno value; in particular,
2327 * ENXIO indicates that there is no ARP table entry for 'ip' on 'netdev'. */
2329 netdev_linux_arp_lookup(const struct netdev *netdev,
2330 ovs_be32 ip, uint8_t mac[ETH_ADDR_LEN])
2333 struct sockaddr_in sin;
/* Build the SIOCGARP request: protocol address 'ip', hardware type
 * ARPHRD_ETHER, restricted to this device by name. */
2336 memset(&r, 0, sizeof r);
2337 memset(&sin, 0, sizeof sin);
2338 sin.sin_family = AF_INET;
2339 sin.sin_addr.s_addr = ip;
2341 memcpy(&r.arp_pa, &sin, sizeof sin);
2342 r.arp_ha.sa_family = ARPHRD_ETHER;
2344 ovs_strzcpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
2345 COVERAGE_INC(netdev_arp_lookup);
2346 retval = ioctl(af_inet_sock, SIOCGARP, &r) < 0 ? errno : 0;
2348 memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
/* ENXIO (no entry) is an expected outcome and is not logged. */
2349 } else if (retval != ENXIO) {
2350 VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
2351 netdev_get_name(netdev), IP_ARGS(ip), strerror(retval));
/* Translates the NETDEV_UP and NETDEV_PROMISC bits of 'nd' into interface
 * flags.  NOTE(review): the flag values assigned in each branch are not
 * visible in this excerpt. */
2357 nd_to_iff_flags(enum netdev_flags nd)
2360 if (nd & NETDEV_UP) {
2363 if (nd & NETDEV_PROMISC) {
/* Builds a netdev_flags value from the interface flags in 'iff'; an
 * IFF_PROMISC bit is translated to NETDEV_PROMISC. */
2370 iff_to_nd_flags(int iff)
2372 enum netdev_flags nd = 0;
2376 if (iff & IFF_PROMISC) {
2377 nd |= NETDEV_PROMISC;
/* Clears the flags in 'off' and sets the flags in 'on' for 'netdev_'.  The
 * flags in effect beforehand (from the cached ifi_flags, converted to
 * NETDEV_* form) are stored in '*old_flagsp'.  set_flags() is only called
 * when the resulting flag word differs from the cached one; afterwards the
 * cached ifi_flags is re-read with get_flags(). */
2383 netdev_linux_update_flags(struct netdev *netdev_, enum netdev_flags off,
2384 enum netdev_flags on, enum netdev_flags *old_flagsp)
2386 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2387 int old_flags, new_flags;
2390 old_flags = netdev->ifi_flags;
2391 *old_flagsp = iff_to_nd_flags(old_flags);
2392 new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
2393 if (new_flags != old_flags) {
2394 error = set_flags(netdev_get_name(netdev_), new_flags);
2395 get_flags(netdev_, &netdev->ifi_flags);
/* Returns the change_seq value stored in the netdev_linux that backs
 * 'netdev'. */
2401 netdev_linux_change_seq(const struct netdev *netdev)
2403 return netdev_linux_cast(netdev)->change_seq;
/* Template for the netdev classes defined below.  The netdev_linux_*
 * callbacks listed here are shared by every class; the macro parameters
 * (NAME, CREATE, GET_STATS, SET_STATS, GET_FEATURES, GET_STATUS) supply the
 * pieces that differ per class. */
2406 #define NETDEV_LINUX_CLASS(NAME, CREATE, GET_STATS, SET_STATS, \
2407 GET_FEATURES, GET_STATUS) \
2411 netdev_linux_init, \
2413 netdev_linux_wait, \
2416 netdev_linux_destroy, \
2417 NULL, /* get_config */ \
2418 NULL, /* set_config */ \
2419 NULL, /* get_tunnel_config */ \
2421 netdev_linux_rx_open, \
2423 netdev_linux_send, \
2424 netdev_linux_send_wait, \
2426 netdev_linux_set_etheraddr, \
2427 netdev_linux_get_etheraddr, \
2428 netdev_linux_get_mtu, \
2429 netdev_linux_set_mtu, \
2430 netdev_linux_get_ifindex, \
2431 netdev_linux_get_carrier, \
2432 netdev_linux_get_carrier_resets, \
2433 netdev_linux_set_miimon_interval, \
2438 netdev_linux_set_advertisements, \
2440 netdev_linux_set_policing, \
2441 netdev_linux_get_qos_types, \
2442 netdev_linux_get_qos_capabilities, \
2443 netdev_linux_get_qos, \
2444 netdev_linux_set_qos, \
2445 netdev_linux_get_queue, \
2446 netdev_linux_set_queue, \
2447 netdev_linux_delete_queue, \
2448 netdev_linux_get_queue_stats, \
2449 netdev_linux_dump_queues, \
2450 netdev_linux_dump_queue_stats, \
2452 netdev_linux_get_in4, \
2453 netdev_linux_set_in4, \
2454 netdev_linux_get_in6, \
2455 netdev_linux_add_router, \
2456 netdev_linux_get_next_hop, \
2458 netdev_linux_arp_lookup, \
2460 netdev_linux_update_flags, \
2462 netdev_linux_change_seq \
/* Class created with netdev_linux_create(): statistics come from
 * netdev_linux_get_stats(), there is no set_stats callback, and features and
 * status are read through ethtool. */
2465 const struct netdev_class netdev_linux_class =
2468 netdev_linux_create,
2469 netdev_linux_get_stats,
2470 NULL, /* set_stats */
2471 netdev_linux_get_features,
2472 netdev_linux_get_status);
/* Class created with netdev_linux_create_tap(): statistics come from
 * netdev_tap_get_stats(); there is no set_stats callback. */
2474 const struct netdev_class netdev_tap_class =
2477 netdev_linux_create_tap,
2478 netdev_tap_get_stats,
2479 NULL, /* set_stats */
2480 netdev_linux_get_features,
2481 netdev_linux_get_status);
/* Class whose statistics are read and written through the vport layer
 * (netdev_internal_get_stats() / netdev_internal_set_stats()); it has no
 * get_features callback. */
2483 const struct netdev_class netdev_internal_class =
2486 netdev_linux_create,
2487 netdev_internal_get_stats,
2488 netdev_internal_set_stats,
2489 NULL, /* get_features */
2490 netdev_internal_get_status);
/* Receive-side callbacks: destroy, recv, wait and drain, in that order. */
2492 static const struct netdev_rx_class netdev_rx_linux_class = {
2493 netdev_rx_linux_destroy,
2494 netdev_rx_linux_recv,
2495 netdev_rx_linux_wait,
2496 netdev_rx_linux_drain,
2499 /* HTB traffic control class. */
/* NOTE(review): presumably the number of queues advertised for the HTB type
 * (its n_queues value) -- confirm against tc_ops_htb. */
2501 #define HTB_N_QUEUES 0xf000
2505 unsigned int max_rate; /* In bytes/s. */
/* Per-queue HTB settings; the embedded tc_queue is the generic queue
 * record. */
2509 struct tc_queue tc_queue;
2510 unsigned int min_rate; /* In bytes/s. */
2511 unsigned int max_rate; /* In bytes/s. */
2512 unsigned int burst; /* In bytes. */
2513 unsigned int priority; /* Lower values are higher priorities. */
/* Returns the struct htb that contains the 'tc' member pointed to by
 * netdev->tc (via CONTAINER_OF); the device is assumed to be using HTB. */
2517 htb_get__(const struct netdev *netdev_)
2519 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2520 return CONTAINER_OF(netdev->tc, struct htb, tc);
/* Allocates a struct htb, initializes its 'tc' member for tc_ops_htb,
 * records 'max_rate' and attaches the result as netdev->tc.
 *
 * NOTE(review): 'max_rate' is a uint64_t here while the max_rate fields
 * declared earlier in this file are 'unsigned int'; if htb->max_rate is one
 * of those, larger values are narrowed on assignment -- confirm. */
2524 htb_install__(struct netdev *netdev_, uint64_t max_rate)
2526 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2529 htb = xmalloc(sizeof *htb);
2530 tc_init(&htb->tc, &tc_ops_htb);
2531 htb->max_rate = max_rate;
2533 netdev->tc = &htb->tc;
2536 /* Create an HTB qdisc.
2538 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
2540 htb_setup_qdisc__(struct netdev *netdev)
2543 struct tc_htb_glob opt;
2544 struct ofpbuf request;
2545 struct tcmsg *tcmsg;
/* Remove whatever qdisc is currently installed; the result of the deletion
 * is not checked. */
2547 tc_del_qdisc(netdev);
2549 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2550 NLM_F_EXCL | NLM_F_CREATE, &request);
/* Root qdisc with handle 1:0. */
2554 tcmsg->tcm_handle = tc_make_handle(1, 0);
2555 tcmsg->tcm_parent = TC_H_ROOT;
2557 nl_msg_put_string(&request, TCA_KIND, "htb");
2559 memset(&opt, 0, sizeof opt);
2560 opt.rate2quantum = 10;
/* The HTB global options travel as TCA_HTB_INIT nested inside
 * TCA_OPTIONS. */
2564 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2565 nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
2566 nl_msg_end_nested(&request, opt_offset);
2568 return tc_transact(&request, NULL);
2571 /* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
2572 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
2574 htb_setup_class__(struct netdev *netdev, unsigned int handle,
2575 unsigned int parent, struct htb_class *class)
2578 struct tc_htb_opt opt;
2579 struct ofpbuf request;
2580 struct tcmsg *tcmsg;
/* The MTU feeds the rate specifications and buffer sizes computed below. */
2584 error = netdev_get_mtu(netdev, &mtu);
2586 VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU",
2587 netdev_get_name(netdev));
/* min_rate becomes the HTB "rate", max_rate the "ceil"; both buffers are
 * derived from the same 'burst' value. */
2591 memset(&opt, 0, sizeof opt);
2592 tc_fill_rate(&opt.rate, class->min_rate, mtu);
2593 tc_fill_rate(&opt.ceil, class->max_rate, mtu);
2594 opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
2595 opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
2596 opt.prio = class->priority;
2598 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
2602 tcmsg->tcm_handle = handle;
2603 tcmsg->tcm_parent = parent;
/* TCA_OPTIONS nests the HTB parameters plus the rate tables for 'rate' and
 * 'ceil'. */
2605 nl_msg_put_string(&request, TCA_KIND, "htb");
2606 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2607 nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
2608 tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
2609 tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
2610 nl_msg_end_nested(&request, opt_offset);
2612 error = tc_transact(&request, NULL);
2614 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
2615 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
2616 netdev_get_name(netdev),
2617 tc_get_major(handle), tc_get_minor(handle),
2618 tc_get_major(parent), tc_get_minor(parent),
2619 class->min_rate, class->max_rate,
2620 class->burst, class->priority, strerror(error));
2625 /* Parses Netlink attributes in 'nl_options' for HTB parameters and stores a
2626 * description of them into 'class'. The description complies with the
2627 * specification given in the vswitch database documentation for linux-htb
2630 htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
2632 static const struct nl_policy tca_htb_policy[] = {
2633 [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
2634 .min_len = sizeof(struct tc_htb_opt) },
2637 struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
2638 const struct tc_htb_opt *htb;
2640 if (!nl_parse_nested(nl_options, tca_htb_policy,
2641 attrs, ARRAY_SIZE(tca_htb_policy))) {
2642 VLOG_WARN_RL(&rl, "failed to parse HTB class options");
2646 htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
2647 class->min_rate = htb->rate.rate;
2648 class->max_rate = htb->ceil.rate;
2649 class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
2650 class->priority = htb->prio;
2655 htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2656 struct htb_class *options,
2657 struct netdev_queue_stats *stats)
2659 struct nlattr *nl_options;
2660 unsigned int handle;
2663 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2664 if (!error && queue_id) {
2665 unsigned int major = tc_get_major(handle);
2666 unsigned int minor = tc_get_minor(handle);
2667 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2668 *queue_id = minor - 1;
2673 if (!error && options) {
2674 error = htb_parse_tca_options__(nl_options, options);
/* Fills in 'hc' with the qdisc-wide HTB settings taken from 'details'.
 *
 * "max-rate" is read as a decimal string and divided by 8 before being stored
 * in 'hc->max_rate' (which is kept in bytes/s).  If the key is absent or the
 * result is zero, the rate is derived instead from the device's current
 * features via netdev_features_to_bps(), with 100 * 1000 * 1000 passed as its
 * second argument (presumably the fallback bit rate -- confirm against
 * netdev_features_to_bps()).  'hc->min_rate' is then set equal to
 * 'hc->max_rate'. */
2680 htb_parse_qdisc_details__(struct netdev *netdev,
2681 const struct smap *details, struct htb_class *hc)
2683 const char *max_rate_s;
2685 max_rate_s = smap_get(details, "max-rate");
2686 hc->max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
2687 if (!hc->max_rate) {
2688 enum netdev_features current;
/* Pass the address of the local 'current' so that it can be filled in; it is
 * read on the next line. */
2690 netdev_get_features(netdev, &current, NULL, NULL, NULL);
2691 hc->max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
2693 hc->min_rate = hc->max_rate;
2699 htb_parse_class_details__(struct netdev *netdev,
2700 const struct smap *details, struct htb_class *hc)
2702 const struct htb *htb = htb_get__(netdev);
2703 const char *min_rate_s = smap_get(details, "min-rate");
2704 const char *max_rate_s = smap_get(details, "max-rate");
2705 const char *burst_s = smap_get(details, "burst");
2706 const char *priority_s = smap_get(details, "priority");
2709 error = netdev_get_mtu(netdev, &mtu);
2711 VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU",
2712 netdev_get_name(netdev));
2716 /* HTB requires at least an mtu sized min-rate to send any traffic even
2717 * on uncongested links. */
2718 hc->min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
2719 hc->min_rate = MAX(hc->min_rate, mtu);
2720 hc->min_rate = MIN(hc->min_rate, htb->max_rate);
2723 hc->max_rate = (max_rate_s
2724 ? strtoull(max_rate_s, NULL, 10) / 8
2726 hc->max_rate = MAX(hc->max_rate, hc->min_rate);
2727 hc->max_rate = MIN(hc->max_rate, htb->max_rate);
2731 * According to hints in the documentation that I've read, it is important
2732 * that 'burst' be at least as big as the largest frame that might be
2733 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
2734 * but having it a bit too small is a problem. Since netdev_get_mtu()
2735 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
2736 * the MTU. We actually add 64, instead of 14, as a guard against
2737 * additional headers that get tacked on somewhere we're not aware of. */
2738 hc->burst = burst_s ? strtoull(burst_s, NULL, 10) / 8 : 0;
2739 hc->burst = MAX(hc->burst, mtu + 64);
2742 hc->priority = priority_s ? strtoul(priority_s, NULL, 10) : 0;
2748 htb_query_class__(const struct netdev *netdev, unsigned int handle,
2749 unsigned int parent, struct htb_class *options,
2750 struct netdev_queue_stats *stats)
2752 struct ofpbuf *reply;
2755 error = tc_query_class(netdev, handle, parent, &reply);
2757 error = htb_parse_tcmsg__(reply, NULL, options, stats);
2758 ofpbuf_delete(reply);
2764 htb_tc_install(struct netdev *netdev, const struct smap *details)
2768 error = htb_setup_qdisc__(netdev);
2770 struct htb_class hc;
2772 htb_parse_qdisc_details__(netdev, details, &hc);
2773 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2774 tc_make_handle(1, 0), &hc);
2776 htb_install__(netdev, hc.max_rate);
2782 static struct htb_class *
2783 htb_class_cast__(const struct tc_queue *queue)
2785 return CONTAINER_OF(queue, struct htb_class, tc_queue);
2789 htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
2790 const struct htb_class *hc)
2792 struct htb *htb = htb_get__(netdev);
2793 size_t hash = hash_int(queue_id, 0);
2794 struct tc_queue *queue;
2795 struct htb_class *hcp;
2797 queue = tc_find_queue__(netdev, queue_id, hash);
2799 hcp = htb_class_cast__(queue);
2801 hcp = xmalloc(sizeof *hcp);
2802 queue = &hcp->tc_queue;
2803 queue->queue_id = queue_id;
2804 hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
2807 hcp->min_rate = hc->min_rate;
2808 hcp->max_rate = hc->max_rate;
2809 hcp->burst = hc->burst;
2810 hcp->priority = hc->priority;
2814 htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
2817 struct nl_dump dump;
2818 struct htb_class hc;
2820 /* Get qdisc options. */
2822 htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
2823 htb_install__(netdev, hc.max_rate);
2826 if (!start_queue_dump(netdev, &dump)) {
2829 while (nl_dump_next(&dump, &msg)) {
2830 unsigned int queue_id;
2832 if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
2833 htb_update_queue__(netdev, queue_id, &hc);
2836 nl_dump_done(&dump);
2842 htb_tc_destroy(struct tc *tc)
2844 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
2845 struct htb_class *hc, *next;
2847 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &htb->tc.queues) {
2848 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
2856 htb_qdisc_get(const struct netdev *netdev, struct smap *details)
2858 const struct htb *htb = htb_get__(netdev);
2859 smap_add_format(details, "max-rate", "%llu", 8ULL * htb->max_rate);
2864 htb_qdisc_set(struct netdev *netdev, const struct smap *details)
2866 struct htb_class hc;
2869 htb_parse_qdisc_details__(netdev, details, &hc);
2870 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2871 tc_make_handle(1, 0), &hc);
2873 htb_get__(netdev)->max_rate = hc.max_rate;
2879 htb_class_get(const struct netdev *netdev OVS_UNUSED,
2880 const struct tc_queue *queue, struct smap *details)
2882 const struct htb_class *hc = htb_class_cast__(queue);
2884 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
2885 if (hc->min_rate != hc->max_rate) {
2886 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
2888 smap_add_format(details, "burst", "%llu", 8ULL * hc->burst);
2890 smap_add_format(details, "priority", "%u", hc->priority);
2896 htb_class_set(struct netdev *netdev, unsigned int queue_id,
2897 const struct smap *details)
2899 struct htb_class hc;
2902 error = htb_parse_class_details__(netdev, details, &hc);
2907 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
2908 tc_make_handle(1, 0xfffe), &hc);
2913 htb_update_queue__(netdev, queue_id, &hc);
2918 htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
2920 struct htb_class *hc = htb_class_cast__(queue);
2921 struct htb *htb = htb_get__(netdev);
2924 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
2926 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
2933 htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
2934 struct netdev_queue_stats *stats)
2936 return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
2937 tc_make_handle(1, 0xfffe), NULL, stats);
2941 htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
2942 const struct ofpbuf *nlmsg,
2943 netdev_dump_queue_stats_cb *cb, void *aux)
2945 struct netdev_queue_stats stats;
2946 unsigned int handle, major, minor;
2949 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
2954 major = tc_get_major(handle);
2955 minor = tc_get_minor(handle);
2956 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2957 (*cb)(minor - 1, &stats, aux);
2962 static const struct tc_ops tc_ops_htb = {
2963 "htb", /* linux_name */
2964 "linux-htb", /* ovs_name */
2965 HTB_N_QUEUES, /* n_queues */
2974 htb_class_get_stats,
2975 htb_class_dump_stats
2978 /* "linux-hfsc" traffic control class. */
2980 #define HFSC_N_QUEUES 0xf000
2988 struct tc_queue tc_queue;
2993 static struct hfsc *
2994 hfsc_get__(const struct netdev *netdev_)
2996 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2997 return CONTAINER_OF(netdev->tc, struct hfsc, tc);
3000 static struct hfsc_class *
3001 hfsc_class_cast__(const struct tc_queue *queue)
3003 return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
3007 hfsc_install__(struct netdev *netdev_, uint32_t max_rate)
3009 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3012 hfsc = xmalloc(sizeof *hfsc);
3013 tc_init(&hfsc->tc, &tc_ops_hfsc);
3014 hfsc->max_rate = max_rate;
3015 netdev->tc = &hfsc->tc;
3019 hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
3020 const struct hfsc_class *hc)
3024 struct hfsc_class *hcp;
3025 struct tc_queue *queue;
3027 hfsc = hfsc_get__(netdev);
3028 hash = hash_int(queue_id, 0);
3030 queue = tc_find_queue__(netdev, queue_id, hash);
3032 hcp = hfsc_class_cast__(queue);
3034 hcp = xmalloc(sizeof *hcp);
3035 queue = &hcp->tc_queue;
3036 queue->queue_id = queue_id;
3037 hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
3040 hcp->min_rate = hc->min_rate;
3041 hcp->max_rate = hc->max_rate;
/* Parses the nested HFSC class attributes in 'nl_options' and fills in
 * 'class'.
 *
 * The RSC, FSC and USC attributes are read as 'struct tc_service_curve'
 * values.  Only a restricted configuration is accepted:
 *
 *   - All three curves must be linear, that is, 'm1' and 'd' must be zero.
 *
 *   - The RSC and FSC curves must have the same 'm2'.  (hfsc_setup_class__()
 *     in this file sends the same curve for both attributes.)
 *
 *   - The RSC 'm2' must not exceed the USC 'm2'.
 *
 * Each rejected case logs a rate-limited warning.  When all checks pass,
 * 'class->min_rate' is taken from the FSC 'm2' and 'class->max_rate' from the
 * USC 'm2'.
 *
 * NOTE(review): the return statements are not visible in this listing;
 * presumably 0 on success and a positive errno value otherwise -- confirm. */
3045 hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
3047 const struct tc_service_curve *rsc, *fsc, *usc;
3048 static const struct nl_policy tca_hfsc_policy[] = {
3050 .type = NL_A_UNSPEC,
3052 .min_len = sizeof(struct tc_service_curve),
3055 .type = NL_A_UNSPEC,
3057 .min_len = sizeof(struct tc_service_curve),
3060 .type = NL_A_UNSPEC,
3062 .min_len = sizeof(struct tc_service_curve),
3065 struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];
3067 if (!nl_parse_nested(nl_options, tca_hfsc_policy,
3068 attrs, ARRAY_SIZE(tca_hfsc_policy))) {
3069 VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
3073 rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
3074 fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
3075 usc = nl_attr_get(attrs[TCA_HFSC_USC]);
/* Reject any curve that has a first segment ('m1' slope or 'd' duration). */
3077 if (rsc->m1 != 0 || rsc->d != 0 ||
3078 fsc->m1 != 0 || fsc->d != 0 ||
3079 usc->m1 != 0 || usc->d != 0) {
3080 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3081 "Non-linear service curves are not supported.");
/* RSC and FSC must describe the same rate. */
3085 if (rsc->m2 != fsc->m2) {
3086 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3087 "Real-time service curves are not supported ");
/* The minimum rate may not exceed the upper limit. */
3091 if (rsc->m2 > usc->m2) {
3092 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3093 "Min-rate service curve is greater than "
3094 "the max-rate service curve.");
3098 class->min_rate = fsc->m2;
3099 class->max_rate = usc->m2;
3104 hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
3105 struct hfsc_class *options,
3106 struct netdev_queue_stats *stats)
3109 unsigned int handle;
3110 struct nlattr *nl_options;
3112 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
3118 unsigned int major, minor;
3120 major = tc_get_major(handle);
3121 minor = tc_get_minor(handle);
3122 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3123 *queue_id = minor - 1;
3130 error = hfsc_parse_tca_options__(nl_options, options);
3137 hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
3138 unsigned int parent, struct hfsc_class *options,
3139 struct netdev_queue_stats *stats)
3142 struct ofpbuf *reply;
3144 error = tc_query_class(netdev, handle, parent, &reply);
3149 error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
3150 ofpbuf_delete(reply);
/* Fills in 'class' with the qdisc-wide HFSC settings taken from 'details'.
 *
 * "max-rate" is read as a decimal string and divided by 8.  A replacement
 * rate can be derived from the device's current features via
 * netdev_features_to_bps(), with 100 * 1000 * 1000 passed as its second
 * argument.  Both 'class->min_rate' and 'class->max_rate' are set to the
 * resulting rate.
 *
 * NOTE(review): the condition guarding the feature-based fallback is not
 * visible in this listing; presumably it applies only when "max-rate" is
 * absent or zero, as in htb_parse_qdisc_details__() -- confirm. */
3155 hfsc_parse_qdisc_details__(struct netdev *netdev, const struct smap *details,
3156 struct hfsc_class *class)
3159 const char *max_rate_s;
3161 max_rate_s = smap_get(details, "max-rate");
3162 max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
3165 enum netdev_features current;
/* Pass the address of the local 'current' so that it can be filled in; it is
 * read on the next line. */
3167 netdev_get_features(netdev, &current, NULL, NULL, NULL);
3168 max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
3171 class->min_rate = max_rate;
3172 class->max_rate = max_rate;
3176 hfsc_parse_class_details__(struct netdev *netdev,
3177 const struct smap *details,
3178 struct hfsc_class * class)
3180 const struct hfsc *hfsc;
3181 uint32_t min_rate, max_rate;
3182 const char *min_rate_s, *max_rate_s;
3184 hfsc = hfsc_get__(netdev);
3185 min_rate_s = smap_get(details, "min-rate");
3186 max_rate_s = smap_get(details, "max-rate");
3188 min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
3189 min_rate = MAX(min_rate, 1);
3190 min_rate = MIN(min_rate, hfsc->max_rate);
3192 max_rate = (max_rate_s
3193 ? strtoull(max_rate_s, NULL, 10) / 8
3195 max_rate = MAX(max_rate, min_rate);
3196 max_rate = MIN(max_rate, hfsc->max_rate);
3198 class->min_rate = min_rate;
3199 class->max_rate = max_rate;
3204 /* Create an HFSC qdisc.
3206 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
3208 hfsc_setup_qdisc__(struct netdev * netdev)
3210 struct tcmsg *tcmsg;
3211 struct ofpbuf request;
3212 struct tc_hfsc_qopt opt;
3214 tc_del_qdisc(netdev);
3216 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3217 NLM_F_EXCL | NLM_F_CREATE, &request);
3223 tcmsg->tcm_handle = tc_make_handle(1, 0);
3224 tcmsg->tcm_parent = TC_H_ROOT;
3226 memset(&opt, 0, sizeof opt);
3229 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3230 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
3232 return tc_transact(&request, NULL);
3235 /* Create an HFSC class.
3237 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
3238 * sc rate <min_rate> ul rate <max_rate>" */
3240 hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
3241 unsigned int parent, struct hfsc_class *class)
3245 struct tcmsg *tcmsg;
3246 struct ofpbuf request;
3247 struct tc_service_curve min, max;
3249 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
3255 tcmsg->tcm_handle = handle;
3256 tcmsg->tcm_parent = parent;
3260 min.m2 = class->min_rate;
3264 max.m2 = class->max_rate;
3266 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3267 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3268 nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
3269 nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
3270 nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
3271 nl_msg_end_nested(&request, opt_offset);
3273 error = tc_transact(&request, NULL);
3275 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
3276 "min-rate %ubps, max-rate %ubps (%s)",
3277 netdev_get_name(netdev),
3278 tc_get_major(handle), tc_get_minor(handle),
3279 tc_get_major(parent), tc_get_minor(parent),
3280 class->min_rate, class->max_rate, strerror(error));
3287 hfsc_tc_install(struct netdev *netdev, const struct smap *details)
3290 struct hfsc_class class;
3292 error = hfsc_setup_qdisc__(netdev);
3298 hfsc_parse_qdisc_details__(netdev, details, &class);
3299 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3300 tc_make_handle(1, 0), &class);
3306 hfsc_install__(netdev, class.max_rate);
3311 hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3314 struct nl_dump dump;
3315 struct hfsc_class hc;
3318 hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
3319 hfsc_install__(netdev, hc.max_rate);
3321 if (!start_queue_dump(netdev, &dump)) {
3325 while (nl_dump_next(&dump, &msg)) {
3326 unsigned int queue_id;
3328 if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3329 hfsc_update_queue__(netdev, queue_id, &hc);
3333 nl_dump_done(&dump);
3338 hfsc_tc_destroy(struct tc *tc)
3341 struct hfsc_class *hc, *next;
3343 hfsc = CONTAINER_OF(tc, struct hfsc, tc);
3345 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
3346 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
3355 hfsc_qdisc_get(const struct netdev *netdev, struct smap *details)
3357 const struct hfsc *hfsc;
3358 hfsc = hfsc_get__(netdev);
3359 smap_add_format(details, "max-rate", "%llu", 8ULL * hfsc->max_rate);
3364 hfsc_qdisc_set(struct netdev *netdev, const struct smap *details)
3367 struct hfsc_class class;
3369 hfsc_parse_qdisc_details__(netdev, details, &class);
3370 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3371 tc_make_handle(1, 0), &class);
3374 hfsc_get__(netdev)->max_rate = class.max_rate;
3381 hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
3382 const struct tc_queue *queue, struct smap *details)
3384 const struct hfsc_class *hc;
3386 hc = hfsc_class_cast__(queue);
3387 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
3388 if (hc->min_rate != hc->max_rate) {
3389 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
3395 hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
3396 const struct smap *details)
3399 struct hfsc_class class;
3401 error = hfsc_parse_class_details__(netdev, details, &class);
3406 error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3407 tc_make_handle(1, 0xfffe), &class);
3412 hfsc_update_queue__(netdev, queue_id, &class);
3417 hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
3421 struct hfsc_class *hc;
3423 hc = hfsc_class_cast__(queue);
3424 hfsc = hfsc_get__(netdev);
3426 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3428 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
3435 hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3436 struct netdev_queue_stats *stats)
3438 return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3439 tc_make_handle(1, 0xfffe), NULL, stats);
3443 hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3444 const struct ofpbuf *nlmsg,
3445 netdev_dump_queue_stats_cb *cb, void *aux)
3447 struct netdev_queue_stats stats;
3448 unsigned int handle, major, minor;
3451 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3456 major = tc_get_major(handle);
3457 minor = tc_get_minor(handle);
3458 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3459 (*cb)(minor - 1, &stats, aux);
3464 static const struct tc_ops tc_ops_hfsc = {
3465 "hfsc", /* linux_name */
3466 "linux-hfsc", /* ovs_name */
3467 HFSC_N_QUEUES, /* n_queues */
3468 hfsc_tc_install, /* tc_install */
3469 hfsc_tc_load, /* tc_load */
3470 hfsc_tc_destroy, /* tc_destroy */
3471 hfsc_qdisc_get, /* qdisc_get */
3472 hfsc_qdisc_set, /* qdisc_set */
3473 hfsc_class_get, /* class_get */
3474 hfsc_class_set, /* class_set */
3475 hfsc_class_delete, /* class_delete */
3476 hfsc_class_get_stats, /* class_get_stats */
3477 hfsc_class_dump_stats /* class_dump_stats */
3480 /* "linux-default" traffic control class.
3482 * This class represents the default, unnamed Linux qdisc. It corresponds to
3483 * the "" (empty string) QoS type in the OVS database. */
3486 default_install__(struct netdev *netdev_)
3488 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3489 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_default);
3491 /* Nothing but a tc class implementation is allowed to write to a tc. This
3492 * class never does that, so we can legitimately use a const tc object. */
3493 netdev->tc = CONST_CAST(struct tc *, &tc);
3497 default_tc_install(struct netdev *netdev,
3498 const struct smap *details OVS_UNUSED)
3500 default_install__(netdev);
3505 default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3507 default_install__(netdev);
3511 static const struct tc_ops tc_ops_default = {
3512 NULL, /* linux_name */
3517 NULL, /* tc_destroy */
3518 NULL, /* qdisc_get */
3519 NULL, /* qdisc_set */
3520 NULL, /* class_get */
3521 NULL, /* class_set */
3522 NULL, /* class_delete */
3523 NULL, /* class_get_stats */
3524 NULL /* class_dump_stats */
3527 /* "linux-other" traffic control class.
3532 other_tc_load(struct netdev *netdev_, struct ofpbuf *nlmsg OVS_UNUSED)
3534 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3535 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_other);
3537 /* Nothing but a tc class implementation is allowed to write to a tc. This
3538 * class never does that, so we can legitimately use a const tc object. */
3539 netdev->tc = CONST_CAST(struct tc *, &tc);
3543 static const struct tc_ops tc_ops_other = {
3544 NULL, /* linux_name */
3545 "linux-other", /* ovs_name */
3547 NULL, /* tc_install */
3549 NULL, /* tc_destroy */
3550 NULL, /* qdisc_get */
3551 NULL, /* qdisc_set */
3552 NULL, /* class_get */
3553 NULL, /* class_set */
3554 NULL, /* class_delete */
3555 NULL, /* class_get_stats */
3556 NULL /* class_dump_stats */
3559 /* Traffic control. */
3561 /* Number of kernel "tc" ticks per second. */
3562 static double ticks_per_s;
3564 /* Number of kernel "jiffies" per second. This is used for the purpose of
3565 * computing buffer sizes. Generally kernel qdiscs need to be able to buffer
3566 * one jiffy's worth of data.
3568 * There are two possibilities here:
3570 * - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
3571 * approximate range of 100 to 1024. That means that we really need to
3572 * make sure that the qdisc can buffer that much data.
3574 * - 'buffer_hz' is an absurdly large number. That means that the kernel
3575 * has finely granular timers and there's no need to fudge additional room
3576 * for buffers. (There's no extra effort needed to implement that: the
3577 * large 'buffer_hz' is used as a divisor, so practically any number will
3578 * come out as 0 in the division. Small integer results in the case of
3579 * really high dividends won't have any real effect anyhow.)
3581 static unsigned int buffer_hz;
3583 /* Returns tc handle 'major':'minor'. */
3585 tc_make_handle(unsigned int major, unsigned int minor)
3587 return TC_H_MAKE(major << 16, minor);
3590 /* Returns the major number from 'handle'. */
3592 tc_get_major(unsigned int handle)
3594 return TC_H_MAJ(handle) >> 16;
3597 /* Returns the minor number from 'handle'. */
3599 tc_get_minor(unsigned int handle)
3601 return TC_H_MIN(handle);
/* Starts a traffic-control Netlink request for 'netdev' in 'request'.
 *
 * Looks up 'netdev''s ifindex with get_ifindex(), initializes 'request' with
 * an initial size of 512 bytes, appends a Netlink header of the given 'type'
 * whose flags are NLM_F_REQUEST | 'flags', and then appends a zeroed 'struct
 * tcmsg' with tcm_family set to AF_UNSPEC and tcm_ifindex set to the ifindex.
 *
 * The tcm_handle and tcm_parent members are left zeroed for the caller to
 * fill in through the returned pointer, which points into 'request'.
 *
 * NOTE(review): the handling of a get_ifindex() failure and the return
 * statement are not visible in this listing -- confirm what is returned on
 * error and whether 'request' is initialized in that case. */
3604 static struct tcmsg *
3605 tc_make_request(const struct netdev *netdev, int type, unsigned int flags,
3606 struct ofpbuf *request)
3608 struct tcmsg *tcmsg;
3612 error = get_ifindex(netdev, &ifindex);
3617 ofpbuf_init(request, 512);
3618 nl_msg_put_nlmsghdr(request, sizeof *tcmsg, type, NLM_F_REQUEST | flags);
3619 tcmsg = ofpbuf_put_zeros(request, sizeof *tcmsg);
3620 tcmsg->tcm_family = AF_UNSPEC;
3621 tcmsg->tcm_ifindex = ifindex;
3622 /* Caller should fill in tcmsg->tcm_handle. */
3623 /* Caller should fill in tcmsg->tcm_parent. */
/* Sends 'request' on 'rtnl_sock' with nl_sock_transact(), passing 'replyp'
 * through for the reply.
 *
 * 'request' is uninitialized with ofpbuf_uninit() whether or not the
 * transaction succeeded, so the caller must not use or free it afterward.
 *
 * NOTE(review): the return statement is not visible in this listing;
 * presumably the nl_sock_transact() result is returned -- confirm. */
3629 tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
3631 int error = nl_sock_transact(rtnl_sock, request, replyp);
3632 ofpbuf_uninit(request);
3636 /* Adds or deletes a root ingress qdisc on 'netdev'. We use this for
3637 * policing configuration.
3639 * This function is equivalent to running the following when 'add' is true:
3640 * /sbin/tc qdisc add dev <devname> handle ffff: ingress
3642 * This function is equivalent to running the following when 'add' is false:
3643 * /sbin/tc qdisc del dev <devname> handle ffff: ingress
3645 * The configuration and stats may be seen with the following command:
3646 * /sbin/tc -s qdisc show dev <devname>
3648 * Returns 0 if successful, otherwise a positive errno value.
3651 tc_add_del_ingress_qdisc(struct netdev *netdev, bool add)
3653 struct ofpbuf request;
3654 struct tcmsg *tcmsg;
3656 int type = add ? RTM_NEWQDISC : RTM_DELQDISC;
3657 int flags = add ? NLM_F_EXCL | NLM_F_CREATE : 0;
3659 tcmsg = tc_make_request(netdev, type, flags, &request);
3663 tcmsg->tcm_handle = tc_make_handle(0xffff, 0);
3664 tcmsg->tcm_parent = TC_H_INGRESS;
3665 nl_msg_put_string(&request, TCA_KIND, "ingress");
3666 nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0);
3668 error = tc_transact(&request, NULL);
3670 /* If we're deleting the qdisc, don't worry about some of the
3671 * error conditions. */
3672 if (!add && (error == ENOENT || error == EINVAL)) {
3681 /* Adds a policer to 'netdev' with a rate of 'kbits_rate' and a burst size
3684 * This function is equivalent to running:
3685 * /sbin/tc filter add dev <devname> parent ffff: protocol all prio 49
3686 * basic police rate <kbits_rate>kbit burst <kbits_burst>k
3689 * The configuration and stats may be seen with the following command:
3690 * /sbin/tc -s filter show dev <devname> parent ffff:
3692 * Returns 0 if successful, otherwise a positive errno value.
3695 tc_add_policer(struct netdev *netdev, int kbits_rate, int kbits_burst)
3697 struct tc_police tc_police;
3698 struct ofpbuf request;
3699 struct tcmsg *tcmsg;
3700 size_t basic_offset;
3701 size_t police_offset;
3705 memset(&tc_police, 0, sizeof tc_police);
3706 tc_police.action = TC_POLICE_SHOT;
3707 tc_police.mtu = mtu;
/* Convert kbit/s to bytes/s in 64-bit arithmetic.  'kbits_rate' is an int, so
 * 'kbits_rate * 1000' evaluated as int overflows (undefined behavior) for
 * rates above INT_MAX / 1000 kbit/s, about 2.1 Gbps with a 32-bit int. */
3708 tc_fill_rate(&tc_police.rate, ((uint64_t) kbits_rate * 1000) / 8, mtu);
3709 tc_police.burst = tc_bytes_to_ticks(tc_police.rate.rate,
3710 kbits_burst * 1024);
3712 tcmsg = tc_make_request(netdev, RTM_NEWTFILTER,
3713 NLM_F_EXCL | NLM_F_CREATE, &request);
3717 tcmsg->tcm_parent = tc_make_handle(0xffff, 0);
3718 tcmsg->tcm_info = tc_make_handle(49,
3719 (OVS_FORCE uint16_t) htons(ETH_P_ALL));
3721 nl_msg_put_string(&request, TCA_KIND, "basic");
3722 basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3723 police_offset = nl_msg_start_nested(&request, TCA_BASIC_POLICE);
3724 nl_msg_put_unspec(&request, TCA_POLICE_TBF, &tc_police, sizeof tc_police);
3725 tc_put_rtab(&request, TCA_POLICE_RATE, &tc_police.rate);
3726 nl_msg_end_nested(&request, police_offset);
3727 nl_msg_end_nested(&request, basic_offset);
3729 error = tc_transact(&request, NULL);
3740 /* The values in psched are not individually very meaningful, but they are
3741 * important. The tables below show some values seen in the wild.
3745 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
3746 * (Before that, there are hints that it was 1000000000.)
3748 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
3752 * -----------------------------------
3753 * [1] 000c8000 000f4240 000f4240 00000064
3754 * [2] 000003e8 00000400 000f4240 3b9aca00
3755 * [3] 000003e8 00000400 000f4240 3b9aca00
3756 * [4] 000003e8 00000400 000f4240 00000064
3757 * [5] 000003e8 00000040 000f4240 3b9aca00
3758 * [6] 000003e8 00000040 000f4240 000000f9
3760 * a b c d ticks_per_s buffer_hz
3761 * ------- --------- ---------- ------------- ----------- -------------
3762 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
3763 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3764 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3765 * [4] 1,000 1,024 1,000,000 100 976,562 100
3766 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
3767 * [6] 1,000 64 1,000,000 249 15,625,000 249
3769 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
3770 * [2] 2.6.26-1-686-bigmem from Debian lenny
3771 * [3] 2.6.26-2-sparc64 from Debian lenny
3772 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
3773 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
3774 * [6] 2.6.34 from kernel.org on KVM
3776 static const char fn[] = "/proc/net/psched";
3777 unsigned int a, b, c, d;
3783 stream = fopen(fn, "r");
3785 VLOG_WARN("%s: open failed: %s", fn, strerror(errno));
3789 if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
3790 VLOG_WARN("%s: read failed", fn);
3794 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
3798 VLOG_WARN("%s: invalid scheduler parameters", fn);
3802 ticks_per_s = (double) a * c / b;
3806 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
3809 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);
3812 /* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
3813 * rate of 'rate' bytes per second.
 *
 * The product is formed in double precision.  'rate' and 'ticks' are both
 * unsigned int, so multiplying them directly would wrap around before the
 * division by 'ticks_per_s'; with a 32-bit unsigned int, 125,000,000 bytes/s
 * times 195 ticks already exceeds the representable range. */
3815 tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
3820 return ((double) rate * ticks) / ticks_per_s;
3823 /* Returns the number of ticks that it would take to transmit 'size' bytes at a
3824 * rate of 'rate' bytes per second. */
3826 tc_bytes_to_ticks(unsigned int rate, unsigned int size)
3831 return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
3834 /* Returns the number of bytes that need to be reserved for qdisc buffering at
3835 * a transmission rate of 'rate' bytes per second. */
3837 tc_buffer_per_jiffy(unsigned int rate)
3842 return rate / buffer_hz;
3845 /* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
3846 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
3847 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
3848 * stores NULL into it if it is absent.
3850 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
3853 * Returns 0 if successful, otherwise a positive errno value. */
3855 tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
3856 struct nlattr **options)
3858 static const struct nl_policy tca_policy[] = {
3859 [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
3860 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
3862 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3864 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3865 tca_policy, ta, ARRAY_SIZE(ta))) {
3866 VLOG_WARN_RL(&rl, "failed to parse qdisc message");
3871 *kind = nl_attr_get_string(ta[TCA_KIND]);
3875 *options = ta[TCA_OPTIONS];
3890 /* Given Netlink 'msg' that describes a class, extracts the class's full handle
3891 * (its tcm_handle, not just the minor number) into '*handlep', its TCA_OPTIONS
3892 * attribute into '*options', and its queue statistics into '*stats'. Any of
3893 * the output arguments may be null.
3895 * Returns 0 if successful, otherwise a positive errno value. */
3897 tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
3898 struct nlattr **options, struct netdev_queue_stats *stats)
3900 static const struct nl_policy tca_policy[] = {
3901 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
3902 [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
3904 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3906 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3907 tca_policy, ta, ARRAY_SIZE(ta))) {
3908 VLOG_WARN_RL(&rl, "failed to parse class message");
3913 struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
3914 *handlep = tc->tcm_handle;
3918 *options = ta[TCA_OPTIONS];
3922 const struct gnet_stats_queue *gsq;
3923 struct gnet_stats_basic gsb;
3925 static const struct nl_policy stats_policy[] = {
3926 [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
3927 .min_len = sizeof gsb },
3928 [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
3929 .min_len = sizeof *gsq },
3931 struct nlattr *sa[ARRAY_SIZE(stats_policy)];
3933 if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
3934 sa, ARRAY_SIZE(sa))) {
3935 VLOG_WARN_RL(&rl, "failed to parse class stats");
3939 /* Alignment issues screw up the length of struct gnet_stats_basic on
3940 * some arch/bitsize combinations. Newer versions of Linux have a
3941 * struct gnet_stats_basic_packed, but we can't depend on that. The
3942 * easiest thing to do is just to make a copy. */
3943 memset(&gsb, 0, sizeof gsb);
3944 memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
3945 MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
3946 stats->tx_bytes = gsb.bytes;
3947 stats->tx_packets = gsb.packets;
3949 gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
3950 stats->tx_errors = gsq->drops;
3960 memset(stats, 0, sizeof *stats);
3965 /* Queries the kernel for class with identifier 'handle' and parent 'parent'
3968 tc_query_class(const struct netdev *netdev,
3969 unsigned int handle, unsigned int parent,
3970 struct ofpbuf **replyp)
3972 struct ofpbuf request;
3973 struct tcmsg *tcmsg;
3976 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO, &request);
3980 tcmsg->tcm_handle = handle;
3981 tcmsg->tcm_parent = parent;
3983 error = tc_transact(&request, replyp);
3985 VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
3986 netdev_get_name(netdev),
3987 tc_get_major(handle), tc_get_minor(handle),
3988 tc_get_major(parent), tc_get_minor(parent),
3994 /* Equivalent to "tc class del dev <name> handle <handle>". */
3996 tc_delete_class(const struct netdev *netdev, unsigned int handle)
3998 struct ofpbuf request;
3999 struct tcmsg *tcmsg;
4002 tcmsg = tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
4006 tcmsg->tcm_handle = handle;
4007 tcmsg->tcm_parent = 0;
4009 error = tc_transact(&request, NULL);
4011 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
4012 netdev_get_name(netdev),
4013 tc_get_major(handle), tc_get_minor(handle),
4019 /* Equivalent to "tc qdisc del dev <name> root". */
4021 tc_del_qdisc(struct netdev *netdev_)
4023 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4024 struct ofpbuf request;
4025 struct tcmsg *tcmsg;
4028 tcmsg = tc_make_request(netdev_, RTM_DELQDISC, 0, &request);
4032 tcmsg->tcm_handle = tc_make_handle(1, 0);
4033 tcmsg->tcm_parent = TC_H_ROOT;
4035 error = tc_transact(&request, NULL);
4036 if (error == EINVAL) {
4037 /* EINVAL probably means that the default qdisc was in use, in which
4038 * case we've accomplished our purpose. */
4041 if (!error && netdev->tc) {
4042 if (netdev->tc->ops->tc_destroy) {
4043 netdev->tc->ops->tc_destroy(netdev->tc);
4050 /* If 'netdev''s qdisc type and parameters are not yet known, queries the
4051 * kernel to determine what they are. Returns 0 if successful, otherwise a
4052 * positive errno value. */
4054 tc_query_qdisc(const struct netdev *netdev_)
4056 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4057 struct ofpbuf request, *qdisc;
4058 const struct tc_ops *ops;
4059 struct tcmsg *tcmsg;
4067 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
4068 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
4069 * 2.6.35 without that fix backported to it.
4071 * To avoid the OOPS, we must not make a request that would attempt to dump
4072 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
4073 * few others. There are a few ways that I can see to do this, but most of
4074 * them seem to be racy (and if you lose the race the kernel OOPSes). The
4075 * technique chosen here is to assume that any non-default qdisc that we
4076 * create will have a class with handle 1:0. The built-in qdiscs only have
4077 * a class with handle 0:0.
4079 * We could check for Linux 2.6.35+ and use a more straightforward method
4081 tcmsg = tc_make_request(netdev_, RTM_GETQDISC, NLM_F_ECHO, &request);
4085 tcmsg->tcm_handle = tc_make_handle(1, 0);
4086 tcmsg->tcm_parent = 0;
4088 /* Figure out what tc class to instantiate. */
4089 error = tc_transact(&request, &qdisc);
4093 error = tc_parse_qdisc(qdisc, &kind, NULL);
4095 ops = &tc_ops_other;
4097 ops = tc_lookup_linux_name(kind);
4099 static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
4100 VLOG_INFO_RL(&rl2, "unknown qdisc \"%s\"", kind);
4102 ops = &tc_ops_other;
4105 } else if (error == ENOENT) {
4106 /* Either it's a built-in qdisc, or it's a qdisc set up by some
4107 * other entity that doesn't have a handle 1:0. We will assume
4108 * that it's the system default qdisc. */
4109 ops = &tc_ops_default;
4112 /* Who knows? Maybe the device got deleted. */
4113 VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
4114 netdev_get_name(netdev_), strerror(error));
4115 ops = &tc_ops_other;
4118 /* Instantiate it. */
4119 load_error = ops->tc_load(CONST_CAST(struct netdev *, netdev_), qdisc);
4120 ovs_assert((load_error == 0) == (netdev->tc != NULL));
4121 ofpbuf_delete(qdisc);
4123 return error ? error : load_error;
4126 /* Linux traffic control uses tables with 256 entries ("rtab" tables) to
4127 approximate the time to transmit packets of various lengths. For an MTU of
4128 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
4129 represents two possible packet lengths; for a MTU of 513 through 1024, four
4130 possible lengths; and so on.
4132 Returns, for the specified 'mtu', the number of bits that packet lengths
4133 need to be shifted right to fit within such a 256-entry table. */
4135 tc_calc_cell_log(unsigned int mtu)
4140 mtu = ETH_PAYLOAD_MAX;
4142 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
4144 for (cell_log = 0; mtu >= 256; cell_log++) {
4151 /* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
4154 tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
4156 memset(rate, 0, sizeof *rate);
4157 rate->cell_log = tc_calc_cell_log(mtu);
4158 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
4159 /* rate->cell_align = 0; */ /* distro headers. */
4160 rate->mpu = ETH_TOTAL_MIN;
/* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
 * attribute of the specified "type".
 *
 * Each entry holds the transmission time, in ticks, of the largest packet
 * that maps to that entry, with packet sizes below 'rate->mpu' rounded up to
 * 'rate->mpu'.
 *
 * See tc_calc_cell_log() above for a description of "rtab"s. */
static void
tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
{
    uint32_t *rtab;
    unsigned int i;

    rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
    for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
        unsigned packet_size = (i + 1) << rate->cell_log;
        if (packet_size < rate->mpu) {
            packet_size = rate->mpu;
        }
        rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
    }
}
/* Calculates the proper value of 'buffer' or 'cbuffer' in HTB options given a
 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
 * burst size of 'burst_bytes'.  (If no value was requested, a 'burst_bytes' of
 * 0 is fine.)
 *
 * The burst actually used is never smaller than one jiffy's worth of data at
 * 'Bps' plus one 'mtu'.  The result is expressed in ticks. */
static int
tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
{
    unsigned int min_burst = tc_buffer_per_jiffy(Bps) + mtu;
    return tc_bytes_to_ticks(Bps, MAX(burst_bytes, min_burst));
}
4195 /* Linux-only functions declared in netdev-linux.h */
4197 /* Returns a fd for an AF_INET socket or a negative errno value. */
4199 netdev_linux_get_af_inet_sock(void)
4201 int error = netdev_linux_init();
4202 return error ? -error : af_inet_sock;
4205 /* Modifies the 'flag' bit in ethtool's flags field for 'netdev'. If
4206 * 'enable' is true, the bit is set. Otherwise, it is cleared. */
4208 netdev_linux_ethtool_set_flag(struct netdev *netdev, uint32_t flag,
4209 const char *flag_name, bool enable)
4211 const char *netdev_name = netdev_get_name(netdev);
4212 struct ethtool_value evalue;
4216 COVERAGE_INC(netdev_get_ethtool);
4217 memset(&evalue, 0, sizeof evalue);
4218 error = netdev_linux_do_ethtool(netdev_name,
4219 (struct ethtool_cmd *)&evalue,
4220 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
4225 COVERAGE_INC(netdev_set_ethtool);
4226 evalue.data = new_flags = (evalue.data & ~flag) | (enable ? flag : 0);
4227 error = netdev_linux_do_ethtool(netdev_name,
4228 (struct ethtool_cmd *)&evalue,
4229 ETHTOOL_SFLAGS, "ETHTOOL_SFLAGS");
4234 COVERAGE_INC(netdev_get_ethtool);
4235 memset(&evalue, 0, sizeof evalue);
4236 error = netdev_linux_do_ethtool(netdev_name,
4237 (struct ethtool_cmd *)&evalue,
4238 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
4243 if (new_flags != evalue.data) {
4244 VLOG_WARN_RL(&rl, "attempt to %s ethtool %s flag on network "
4245 "device %s failed", enable ? "enable" : "disable",
4246 flag_name, netdev_name);
4253 /* Utility functions. */
4255 /* Copies 'src' into 'dst', performing format conversion in the process. */
4257 netdev_stats_from_rtnl_link_stats(struct netdev_stats *dst,
4258 const struct rtnl_link_stats *src)
4260 dst->rx_packets = src->rx_packets;
4261 dst->tx_packets = src->tx_packets;
4262 dst->rx_bytes = src->rx_bytes;
4263 dst->tx_bytes = src->tx_bytes;
4264 dst->rx_errors = src->rx_errors;
4265 dst->tx_errors = src->tx_errors;
4266 dst->rx_dropped = src->rx_dropped;
4267 dst->tx_dropped = src->tx_dropped;
4268 dst->multicast = src->multicast;
4269 dst->collisions = src->collisions;
4270 dst->rx_length_errors = src->rx_length_errors;
4271 dst->rx_over_errors = src->rx_over_errors;
4272 dst->rx_crc_errors = src->rx_crc_errors;
4273 dst->rx_frame_errors = src->rx_frame_errors;
4274 dst->rx_fifo_errors = src->rx_fifo_errors;
4275 dst->rx_missed_errors = src->rx_missed_errors;
4276 dst->tx_aborted_errors = src->tx_aborted_errors;
4277 dst->tx_carrier_errors = src->tx_carrier_errors;
4278 dst->tx_fifo_errors = src->tx_fifo_errors;
4279 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
4280 dst->tx_window_errors = src->tx_window_errors;
4284 get_stats_via_netlink(int ifindex, struct netdev_stats *stats)
4286 /* Policy for RTNLGRP_LINK messages.
4288 * There are *many* more fields in these messages, but currently we only
4289 * care about these fields. */
4290 static const struct nl_policy rtnlgrp_link_policy[] = {
4291 [IFLA_IFNAME] = { .type = NL_A_STRING, .optional = false },
4292 [IFLA_STATS] = { .type = NL_A_UNSPEC, .optional = true,
4293 .min_len = sizeof(struct rtnl_link_stats) },
4296 struct ofpbuf request;
4297 struct ofpbuf *reply;
4298 struct ifinfomsg *ifi;
4299 struct nlattr *attrs[ARRAY_SIZE(rtnlgrp_link_policy)];
4302 ofpbuf_init(&request, 0);
4303 nl_msg_put_nlmsghdr(&request, sizeof *ifi, RTM_GETLINK, NLM_F_REQUEST);
4304 ifi = ofpbuf_put_zeros(&request, sizeof *ifi);
4305 ifi->ifi_family = PF_UNSPEC;
4306 ifi->ifi_index = ifindex;
4307 error = nl_sock_transact(rtnl_sock, &request, &reply);
4308 ofpbuf_uninit(&request);
4313 if (!nl_policy_parse(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg),
4314 rtnlgrp_link_policy,
4315 attrs, ARRAY_SIZE(rtnlgrp_link_policy))) {
4316 ofpbuf_delete(reply);
4320 if (!attrs[IFLA_STATS]) {
4321 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
4322 ofpbuf_delete(reply);
4326 netdev_stats_from_rtnl_link_stats(stats, nl_attr_get(attrs[IFLA_STATS]));
4328 ofpbuf_delete(reply);
4334 get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats)
4336 static const char fn[] = "/proc/net/dev";
4341 stream = fopen(fn, "r");
4343 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
4348 while (fgets(line, sizeof line, stream)) {
4351 #define X64 "%"SCNu64
4354 X64 X64 X64 X64 X64 X64 X64 "%*u"
4355 X64 X64 X64 X64 X64 X64 X64 "%*u",
4361 &stats->rx_fifo_errors,
4362 &stats->rx_frame_errors,
4368 &stats->tx_fifo_errors,
4370 &stats->tx_carrier_errors) != 15) {
4371 VLOG_WARN_RL(&rl, "%s:%d: parse error", fn, ln);
4372 } else if (!strcmp(devname, netdev_name)) {
4373 stats->rx_length_errors = UINT64_MAX;
4374 stats->rx_over_errors = UINT64_MAX;
4375 stats->rx_crc_errors = UINT64_MAX;
4376 stats->rx_missed_errors = UINT64_MAX;
4377 stats->tx_aborted_errors = UINT64_MAX;
4378 stats->tx_heartbeat_errors = UINT64_MAX;
4379 stats->tx_window_errors = UINT64_MAX;
4385 VLOG_WARN_RL(&rl, "%s: no stats for %s", fn, netdev_name);
4391 get_flags(const struct netdev *dev, unsigned int *flags)
4397 error = netdev_linux_do_ioctl(dev->name, &ifr, SIOCGIFFLAGS,
4400 *flags = ifr.ifr_flags;
4406 set_flags(const char *name, unsigned int flags)
4410 ifr.ifr_flags = flags;
4411 return netdev_linux_do_ioctl(name, &ifr, SIOCSIFFLAGS, "SIOCSIFFLAGS");
4415 do_get_ifindex(const char *netdev_name)
4419 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4420 COVERAGE_INC(netdev_get_ifindex);
4421 if (ioctl(af_inet_sock, SIOCGIFINDEX, &ifr) < 0) {
4422 VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
4423 netdev_name, strerror(errno));
4426 return ifr.ifr_ifindex;
4430 get_ifindex(const struct netdev *netdev_, int *ifindexp)
4432 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4434 if (!(netdev->cache_valid & VALID_IFINDEX)) {
4435 int ifindex = do_get_ifindex(netdev_get_name(netdev_));
4438 netdev->get_ifindex_error = -ifindex;
4439 netdev->ifindex = 0;
4441 netdev->get_ifindex_error = 0;
4442 netdev->ifindex = ifindex;
4444 netdev->cache_valid |= VALID_IFINDEX;
4447 *ifindexp = netdev->ifindex;
4448 return netdev->get_ifindex_error;
4452 get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN])
4457 memset(&ifr, 0, sizeof ifr);
4458 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4459 COVERAGE_INC(netdev_get_hwaddr);
4460 if (ioctl(af_inet_sock, SIOCGIFHWADDR, &ifr) < 0) {
4461 /* ENODEV probably means that a vif disappeared asynchronously and
4462 * hasn't been removed from the database yet, so reduce the log level
4463 * to INFO for that case. */
4464 VLOG(errno == ENODEV ? VLL_INFO : VLL_ERR,
4465 "ioctl(SIOCGIFHWADDR) on %s device failed: %s",
4466 netdev_name, strerror(errno));
4469 hwaddr_family = ifr.ifr_hwaddr.sa_family;
4470 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER) {
4471 VLOG_WARN("%s device has unknown hardware address family %d",
4472 netdev_name, hwaddr_family);
4474 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
4479 set_etheraddr(const char *netdev_name,
4480 const uint8_t mac[ETH_ADDR_LEN])
4484 memset(&ifr, 0, sizeof ifr);
4485 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4486 ifr.ifr_hwaddr.sa_family = ARPHRD_ETHER;
4487 memcpy(ifr.ifr_hwaddr.sa_data, mac, ETH_ADDR_LEN);
4488 COVERAGE_INC(netdev_set_hwaddr);
4489 if (ioctl(af_inet_sock, SIOCSIFHWADDR, &ifr) < 0) {
4490 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
4491 netdev_name, strerror(errno));
4498 netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
4499 int cmd, const char *cmd_name)
4503 memset(&ifr, 0, sizeof ifr);
4504 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
4505 ifr.ifr_data = (caddr_t) ecmd;
4508 if (ioctl(af_inet_sock, SIOCETHTOOL, &ifr) == 0) {
4511 if (errno != EOPNOTSUPP) {
4512 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
4513 "failed: %s", cmd_name, name, strerror(errno));
4515 /* The device doesn't support this operation. That's pretty
4516 * common, so there's no point in logging anything. */
4523 netdev_linux_do_ioctl(const char *name, struct ifreq *ifr, int cmd,
4524 const char *cmd_name)
4526 ovs_strzcpy(ifr->ifr_name, name, sizeof ifr->ifr_name);
4527 if (ioctl(af_inet_sock, cmd, ifr) == -1) {
4528 VLOG_DBG_RL(&rl, "%s: ioctl(%s) failed: %s", name, cmd_name,
4536 netdev_linux_get_ipv4(const struct netdev *netdev, struct in_addr *ip,
4537 int cmd, const char *cmd_name)
4542 ifr.ifr_addr.sa_family = AF_INET;
4543 error = netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, cmd, cmd_name);
4545 const struct sockaddr_in *sin = (struct sockaddr_in *) &ifr.ifr_addr;
4546 *ip = sin->sin_addr;
4551 /* Returns an AF_PACKET raw socket or a negative errno value. */
4553 af_packet_sock(void)
4555 static int sock = INT_MIN;
4557 if (sock == INT_MIN) {
4558 sock = socket(AF_PACKET, SOCK_RAW, 0);
4560 int error = set_nonblocking(sock);
4567 VLOG_ERR("failed to create packet socket: %s", strerror(errno));