2 * Copyright (c) 2009, 2010, 2011, 2012, 2013 Nicira, Inc.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
19 #include "netdev-linux.h"
23 #include <arpa/inet.h>
25 #include <linux/gen_stats.h>
26 #include <linux/if_ether.h>
27 #include <linux/if_tun.h>
28 #include <linux/types.h>
29 #include <linux/ethtool.h>
30 #include <linux/mii.h>
31 #include <linux/pkt_cls.h>
32 #include <linux/pkt_sched.h>
33 #include <linux/rtnetlink.h>
34 #include <linux/sockios.h>
35 #include <linux/version.h>
36 #include <sys/types.h>
37 #include <sys/ioctl.h>
38 #include <sys/socket.h>
39 #include <netpacket/packet.h>
41 #include <net/if_arp.h>
42 #include <net/if_packet.h>
43 #include <net/route.h>
44 #include <netinet/in.h>
51 #include "dpif-linux.h"
52 #include "dynamic-string.h"
53 #include "fatal-signal.h"
56 #include "netdev-provider.h"
57 #include "netdev-vport.h"
58 #include "netlink-notifier.h"
59 #include "netlink-socket.h"
62 #include "openflow/openflow.h"
64 #include "poll-loop.h"
65 #include "rtnetlink-link.h"
67 #include "socket-util.h"
70 #include "unaligned.h"
73 VLOG_DEFINE_THIS_MODULE(netdev_linux);
75 COVERAGE_DEFINE(netdev_set_policing);
76 COVERAGE_DEFINE(netdev_arp_lookup);
77 COVERAGE_DEFINE(netdev_get_ifindex);
78 COVERAGE_DEFINE(netdev_get_hwaddr);
79 COVERAGE_DEFINE(netdev_set_hwaddr);
80 COVERAGE_DEFINE(netdev_get_ethtool);
81 COVERAGE_DEFINE(netdev_set_ethtool);
84 /* These were introduced in Linux 2.6.14, so they might be missing if we have
86 #ifndef ADVERTISED_Pause
87 #define ADVERTISED_Pause (1 << 13)
89 #ifndef ADVERTISED_Asym_Pause
90 #define ADVERTISED_Asym_Pause (1 << 14)
93 /* These were introduced in Linux 2.6.24, so they might be missing if we
94 * have old headers. */
95 #ifndef ETHTOOL_GFLAGS
96 #define ETHTOOL_GFLAGS 0x00000025 /* Get flags bitmap(ethtool_value) */
98 #ifndef ETHTOOL_SFLAGS
99 #define ETHTOOL_SFLAGS 0x00000026 /* Set flags bitmap(ethtool_value) */
102 /* This was introduced in Linux 2.6.25, so it might be missing if we have old
105 #define TC_RTAB_SIZE 1024
108 static struct nln_notifier *netdev_linux_cache_notifier = NULL;
109 static int cache_notifier_refcount;
112 VALID_IFINDEX = 1 << 0,
113 VALID_ETHERADDR = 1 << 1,
117 VALID_POLICING = 1 << 5,
118 VALID_VPORT_STAT_ERROR = 1 << 6,
119 VALID_DRVINFO = 1 << 7,
120 VALID_FEATURES = 1 << 8,
127 /* Traffic control. */
129 /* An instance of a traffic control class. Always associated with a particular
132 * Each TC implementation subclasses this with whatever additional data it
135 const struct tc_ops *ops;
136 struct hmap queues; /* Contains "struct tc_queue"s.
137 * Read by generic TC layer.
138 * Written only by TC implementation. */
141 #define TC_INITIALIZER(TC, OPS) { OPS, HMAP_INITIALIZER(&(TC)->queues) }
143 /* One traffic control queue.
145 * Each TC implementation subclasses this with whatever additional data it
148 struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
149 unsigned int queue_id; /* OpenFlow queue ID. */
152 /* A particular kind of traffic control. Each implementation generally maps to
153 * one particular Linux qdisc class.
155 * The functions below return 0 if successful or a positive errno value on
156 * failure, except where otherwise noted. All of them must be provided, except
157 * where otherwise noted. */
159 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
160 * This is null for tc_ops_default and tc_ops_other, for which there are no
161 * appropriate values. */
162 const char *linux_name;
164 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
165 const char *ovs_name;
167 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
168 * queues. The queues are numbered 0 through n_queues - 1. */
169 unsigned int n_queues;
171 /* Called to install this TC class on 'netdev'. The implementation should
172 * make the Netlink calls required to set up 'netdev' with the right qdisc
173 * and configure it according to 'details'. The implementation may assume
174 * that the current qdisc is the default; that is, there is no need for it
175 * to delete the current qdisc before installing itself.
177 * The contents of 'details' should be documented as valid for 'ovs_name'
178 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
179 * (which is built as ovs-vswitchd.conf.db(8)).
181 * This function must return 0 if and only if it sets 'netdev->tc' to an
182 * initialized 'struct tc'.
184 * (This function is null for tc_ops_other, which cannot be installed. For
185 * other TC classes it should always be nonnull.) */
186 int (*tc_install)(struct netdev *netdev, const struct smap *details);
188 /* Called when the netdev code determines (through a Netlink query) that
189 * this TC class's qdisc is installed on 'netdev', but we didn't install
190 * it ourselves and so don't know any of the details.
192 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
193 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
194 * implementation should parse the other attributes of 'nlmsg' as
195 * necessary to determine its configuration. If necessary it should also
196 * use Netlink queries to determine the configuration of queues on
199 * This function must return 0 if and only if it sets 'netdev->tc' to an
200 * initialized 'struct tc'. */
201 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
203 /* Destroys the data structures allocated by the implementation as part of
204 * 'tc'. (This includes destroying 'tc->queues' by calling
207 * The implementation should not need to perform any Netlink calls. If
208 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
209 * (But it may not be desirable.)
211 * This function may be null if 'tc' is trivial. */
212 void (*tc_destroy)(struct tc *tc);
214 /* Retrieves details of 'netdev->tc' configuration into 'details'.
216 * The implementation should not need to perform any Netlink calls, because
217 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
218 * cached the configuration.
220 * The contents of 'details' should be documented as valid for 'ovs_name'
221 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
222 * (which is built as ovs-vswitchd.conf.db(8)).
224 * This function may be null if 'tc' is not configurable.
226 int (*qdisc_get)(const struct netdev *netdev, struct smap *details);
228 /* Reconfigures 'netdev->tc' according to 'details', performing any
229 * required Netlink calls to complete the reconfiguration.
231 * The contents of 'details' should be documented as valid for 'ovs_name'
232 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
233 * (which is built as ovs-vswitchd.conf.db(8)).
235 * This function may be null if 'tc' is not configurable.
237 int (*qdisc_set)(struct netdev *, const struct smap *details);
239 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
240 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
242 * The contents of 'details' should be documented as valid for 'ovs_name'
243 * in the "other_config" column in the "Queue" table in
244 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
246 * The implementation should not need to perform any Netlink calls, because
247 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
248 * cached the queue configuration.
250 * This function may be null if 'tc' does not have queues ('n_queues' is
252 int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
253 struct smap *details);
255 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
 * 'details', performing any required Netlink calls to complete the
257 * reconfiguration. The caller ensures that 'queue_id' is less than
260 * The contents of 'details' should be documented as valid for 'ovs_name'
261 * in the "other_config" column in the "Queue" table in
262 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
264 * This function may be null if 'tc' does not have queues or its queues are
265 * not configurable. */
266 int (*class_set)(struct netdev *, unsigned int queue_id,
267 const struct smap *details);
269 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
270 * tc_queue's within 'netdev->tc->queues'.
272 * This function may be null if 'tc' does not have queues or its queues
273 * cannot be deleted. */
274 int (*class_delete)(struct netdev *, struct tc_queue *queue);
276 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
277 * 'struct tc_queue's within 'netdev->tc->queues'.
279 * On success, initializes '*stats'.
281 * This function may be null if 'tc' does not have queues or if it cannot
282 * report queue statistics. */
283 int (*class_get_stats)(const struct netdev *netdev,
284 const struct tc_queue *queue,
285 struct netdev_queue_stats *stats);
287 /* Extracts queue stats from 'nlmsg', which is a response to a
288 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
290 * This function may be null if 'tc' does not have queues or if it cannot
291 * report queue statistics. */
292 int (*class_dump_stats)(const struct netdev *netdev,
293 const struct ofpbuf *nlmsg,
294 netdev_dump_queue_stats_cb *cb, void *aux);
298 tc_init(struct tc *tc, const struct tc_ops *ops)
301 hmap_init(&tc->queues);
305 tc_destroy(struct tc *tc)
307 hmap_destroy(&tc->queues);
310 static const struct tc_ops tc_ops_htb;
311 static const struct tc_ops tc_ops_hfsc;
312 static const struct tc_ops tc_ops_default;
313 static const struct tc_ops tc_ops_other;
315 static const struct tc_ops *const tcs[] = {
316 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
317 &tc_ops_hfsc, /* Hierarchical fair service curve. */
318 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
319 &tc_ops_other, /* Some other qdisc. */
323 static unsigned int tc_make_handle(unsigned int major, unsigned int minor);
324 static unsigned int tc_get_major(unsigned int handle);
325 static unsigned int tc_get_minor(unsigned int handle);
327 static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
328 static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
329 static unsigned int tc_buffer_per_jiffy(unsigned int rate);
331 static struct tcmsg *tc_make_request(const struct netdev *, int type,
332 unsigned int flags, struct ofpbuf *);
333 static int tc_transact(struct ofpbuf *request, struct ofpbuf **replyp);
334 static int tc_add_del_ingress_qdisc(struct netdev *netdev, bool add);
335 static int tc_add_policer(struct netdev *netdev, int kbits_rate,
338 static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
339 struct nlattr **options);
340 static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
341 struct nlattr **options,
342 struct netdev_queue_stats *);
343 static int tc_query_class(const struct netdev *,
344 unsigned int handle, unsigned int parent,
345 struct ofpbuf **replyp);
346 static int tc_delete_class(const struct netdev *, unsigned int handle);
348 static int tc_del_qdisc(struct netdev *netdev);
349 static int tc_query_qdisc(const struct netdev *netdev);
351 static int tc_calc_cell_log(unsigned int mtu);
352 static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
353 static void tc_put_rtab(struct ofpbuf *, uint16_t type,
354 const struct tc_ratespec *rate);
355 static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
357 struct netdev_linux {
360 struct shash_node *shash_node;
361 unsigned int cache_valid;
362 unsigned int change_seq;
364 bool miimon; /* Link status of last poll. */
365 long long int miimon_interval; /* Miimon Poll rate. Disabled if <= 0. */
366 struct timer miimon_timer;
368 /* The following are figured out "on demand" only. They are only valid
369 * when the corresponding VALID_* bit in 'cache_valid' is set. */
371 uint8_t etheraddr[ETH_ADDR_LEN];
372 struct in_addr address, netmask;
375 unsigned int ifi_flags;
376 long long int carrier_resets;
377 uint32_t kbits_rate; /* Policing data. */
378 uint32_t kbits_burst;
379 int vport_stats_error; /* Cached error code from vport_get_stats().
380 0 or an errno value. */
381 int netdev_mtu_error; /* Cached error code from SIOCGIFMTU or SIOCSIFMTU. */
382 int ether_addr_error; /* Cached error code from set/get etheraddr. */
383 int netdev_policing_error; /* Cached error code from set policing. */
384 int get_features_error; /* Cached error code from ETHTOOL_GSET. */
385 int get_ifindex_error; /* Cached error code from SIOCGIFINDEX. */
387 enum netdev_features current; /* Cached from ETHTOOL_GSET. */
388 enum netdev_features advertised; /* Cached from ETHTOOL_GSET. */
389 enum netdev_features supported; /* Cached from ETHTOOL_GSET. */
390 enum netdev_features peer; /* Cached from ETHTOOL_GSET. */
392 struct ethtool_drvinfo drvinfo; /* Cached from ETHTOOL_GDRVINFO. */
396 struct tap_state tap;
400 struct netdev_rx_linux {
406 static const struct netdev_rx_class netdev_rx_linux_class;
408 /* Sockets used for ioctl operations. */
409 static int af_inet_sock = -1; /* AF_INET, SOCK_DGRAM. */
411 /* A Netlink routing socket that is not subscribed to any multicast groups. */
412 static struct nl_sock *rtnl_sock;
414 /* This is set pretty low because we probably won't learn anything from the
415 * additional log messages. */
416 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
418 static int netdev_linux_init(void);
420 static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
421 int cmd, const char *cmd_name);
422 static int netdev_linux_do_ioctl(const char *name, struct ifreq *, int cmd,
423 const char *cmd_name);
424 static int netdev_linux_get_ipv4(const struct netdev *, struct in_addr *,
425 int cmd, const char *cmd_name);
426 static int get_flags(const struct netdev *, unsigned int *flags);
427 static int set_flags(const char *, unsigned int flags);
428 static int do_get_ifindex(const char *netdev_name);
429 static int get_ifindex(const struct netdev *, int *ifindexp);
430 static int do_set_addr(struct netdev *netdev,
431 int ioctl_nr, const char *ioctl_name,
432 struct in_addr addr);
433 static int get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN]);
434 static int set_etheraddr(const char *netdev_name, const uint8_t[ETH_ADDR_LEN]);
435 static int get_stats_via_netlink(int ifindex, struct netdev_stats *stats);
436 static int get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats);
437 static int af_packet_sock(void);
438 static void netdev_linux_miimon_run(void);
439 static void netdev_linux_miimon_wait(void);
442 is_netdev_linux_class(const struct netdev_class *netdev_class)
444 return netdev_class->init == netdev_linux_init;
448 is_tap_netdev(const struct netdev *netdev)
450 return netdev_get_class(netdev) == &netdev_tap_class;
453 static struct netdev_linux *
454 netdev_linux_cast(const struct netdev *netdev)
456 ovs_assert(is_netdev_linux_class(netdev_get_class(netdev)));
458 return CONTAINER_OF(netdev, struct netdev_linux, up);
461 static struct netdev_rx_linux *
462 netdev_rx_linux_cast(const struct netdev_rx *rx)
464 netdev_rx_assert_class(rx, &netdev_rx_linux_class);
465 return CONTAINER_OF(rx, struct netdev_rx_linux, up);
469 netdev_linux_init(void)
471 static int status = -1;
473 /* Create AF_INET socket. */
474 af_inet_sock = socket(AF_INET, SOCK_DGRAM, 0);
475 status = af_inet_sock >= 0 ? 0 : errno;
477 VLOG_ERR("failed to create inet socket: %s", strerror(status));
480 /* Create rtnetlink socket. */
482 status = nl_sock_create(NETLINK_ROUTE, &rtnl_sock);
484 VLOG_ERR_RL(&rl, "failed to create rtnetlink socket: %s",
493 netdev_linux_run(void)
495 rtnetlink_link_run();
496 netdev_linux_miimon_run();
500 netdev_linux_wait(void)
502 rtnetlink_link_wait();
503 netdev_linux_miimon_wait();
507 netdev_linux_changed(struct netdev_linux *dev,
508 unsigned int ifi_flags, unsigned int mask)
511 if (!dev->change_seq) {
515 if ((dev->ifi_flags ^ ifi_flags) & IFF_RUNNING) {
516 dev->carrier_resets++;
518 dev->ifi_flags = ifi_flags;
520 dev->cache_valid &= mask;
524 netdev_linux_update(struct netdev_linux *dev,
525 const struct rtnetlink_link_change *change)
527 if (change->nlmsg_type == RTM_NEWLINK) {
529 netdev_linux_changed(dev, change->ifi_flags, VALID_DRVINFO);
531 /* Update netdev from rtnl-change msg. */
533 dev->mtu = change->mtu;
534 dev->cache_valid |= VALID_MTU;
535 dev->netdev_mtu_error = 0;
538 if (!eth_addr_is_zero(change->addr)) {
539 memcpy(dev->etheraddr, change->addr, ETH_ADDR_LEN);
540 dev->cache_valid |= VALID_ETHERADDR;
541 dev->ether_addr_error = 0;
544 dev->ifindex = change->ifi_index;
545 dev->cache_valid |= VALID_IFINDEX;
546 dev->get_ifindex_error = 0;
549 netdev_linux_changed(dev, change->ifi_flags, 0);
554 netdev_linux_cache_cb(const struct rtnetlink_link_change *change,
555 void *aux OVS_UNUSED)
557 struct netdev_linux *dev;
559 struct netdev *base_dev = netdev_from_name(change->ifname);
560 if (base_dev && is_netdev_linux_class(netdev_get_class(base_dev))) {
561 netdev_linux_update(netdev_linux_cast(base_dev), change);
564 struct shash device_shash;
565 struct shash_node *node;
567 shash_init(&device_shash);
568 netdev_get_devices(&netdev_linux_class, &device_shash);
569 SHASH_FOR_EACH (node, &device_shash) {
574 get_flags(&dev->up, &flags);
575 netdev_linux_changed(dev, flags, 0);
577 shash_destroy(&device_shash);
582 cache_notifier_ref(void)
584 if (!cache_notifier_refcount) {
585 ovs_assert(!netdev_linux_cache_notifier);
587 netdev_linux_cache_notifier =
588 rtnetlink_link_notifier_create(netdev_linux_cache_cb, NULL);
590 if (!netdev_linux_cache_notifier) {
594 cache_notifier_refcount++;
600 cache_notifier_unref(void)
602 ovs_assert(cache_notifier_refcount > 0);
603 if (!--cache_notifier_refcount) {
604 ovs_assert(netdev_linux_cache_notifier);
605 rtnetlink_link_notifier_destroy(netdev_linux_cache_notifier);
606 netdev_linux_cache_notifier = NULL;
610 /* Creates system and internal devices. */
612 netdev_linux_create(const struct netdev_class *class, const char *name,
613 struct netdev **netdevp)
615 struct netdev_linux *netdev;
618 error = cache_notifier_ref();
623 netdev = xzalloc(sizeof *netdev);
624 netdev->change_seq = 1;
625 netdev_init(&netdev->up, name, class);
626 error = get_flags(&netdev->up, &netdev->ifi_flags);
627 if (error == ENODEV) {
628 if (class != &netdev_internal_class) {
629 /* The device does not exist, so don't allow it to be opened. */
630 netdev_uninit(&netdev->up, false);
631 cache_notifier_unref();
635 /* "Internal" netdevs have to be created as netdev objects before
636 * they exist in the kernel, because creating them in the kernel
637 * happens by passing a netdev object to dpif_port_add().
638 * Therefore, ignore the error. */
642 *netdevp = &netdev->up;
646 /* For most types of netdevs we open the device for each call of
647 * netdev_open(). However, this is not the case with tap devices,
648 * since it is only possible to open the device once. In this
649 * situation we share a single file descriptor, and consequently
650 * buffers, across all readers. Therefore once data is read it will
651 * be unavailable to other reads for tap devices. */
653 netdev_linux_create_tap(const struct netdev_class *class OVS_UNUSED,
654 const char *name, struct netdev **netdevp)
656 struct netdev_linux *netdev;
657 struct tap_state *state;
658 static const char tap_dev[] = "/dev/net/tun";
662 netdev = xzalloc(sizeof *netdev);
663 state = &netdev->state.tap;
665 error = cache_notifier_ref();
670 /* Open tap device. */
671 state->fd = open(tap_dev, O_RDWR);
674 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, strerror(error));
675 goto error_unref_notifier;
678 /* Create tap device. */
679 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
680 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
681 if (ioctl(state->fd, TUNSETIFF, &ifr) == -1) {
682 VLOG_WARN("%s: creating tap device failed: %s", name,
685 goto error_unref_notifier;
688 /* Make non-blocking. */
689 error = set_nonblocking(state->fd);
691 goto error_unref_notifier;
694 netdev_init(&netdev->up, name, &netdev_tap_class);
695 *netdevp = &netdev->up;
698 error_unref_notifier:
699 cache_notifier_unref();
706 destroy_tap(struct netdev_linux *netdev)
708 struct tap_state *state = &netdev->state.tap;
710 if (state->fd >= 0) {
715 /* Destroys the netdev device 'netdev_'. */
717 netdev_linux_destroy(struct netdev *netdev_)
719 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
721 if (netdev->tc && netdev->tc->ops->tc_destroy) {
722 netdev->tc->ops->tc_destroy(netdev->tc);
725 if (netdev_get_class(netdev_) == &netdev_tap_class) {
730 cache_notifier_unref();
734 netdev_linux_rx_open(struct netdev *netdev_, struct netdev_rx **rxp)
736 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
737 bool is_tap = is_tap_netdev(netdev_);
738 struct netdev_rx_linux *rx;
743 fd = netdev->state.tap.fd;
745 struct sockaddr_ll sll;
748 /* Create file descriptor. */
749 fd = socket(PF_PACKET, SOCK_RAW, 0);
752 VLOG_ERR("failed to create raw socket (%s)", strerror(error));
756 /* Set non-blocking mode. */
757 error = set_nonblocking(fd);
762 /* Get ethernet device index. */
763 error = get_ifindex(&netdev->up, &ifindex);
768 /* Bind to specific ethernet device. */
769 memset(&sll, 0, sizeof sll);
770 sll.sll_family = AF_PACKET;
771 sll.sll_ifindex = ifindex;
772 sll.sll_protocol = (OVS_FORCE unsigned short int) htons(ETH_P_ALL);
773 if (bind(fd, (struct sockaddr *) &sll, sizeof sll) < 0) {
775 VLOG_ERR("%s: failed to bind raw socket (%s)",
776 netdev_get_name(netdev_), strerror(error));
781 rx = xmalloc(sizeof *rx);
782 netdev_rx_init(&rx->up, netdev_, &netdev_rx_linux_class);
797 netdev_rx_linux_destroy(struct netdev_rx *rx_)
799 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
808 netdev_rx_linux_recv(struct netdev_rx *rx_, void *data, size_t size)
810 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
815 ? read(rx->fd, data, size)
816 : recv(rx->fd, data, size, MSG_TRUNC));
817 } while (retval < 0 && errno == EINTR);
821 } else if (retval >= 0) {
824 if (errno != EAGAIN) {
825 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
826 strerror(errno), netdev_rx_get_name(rx_));
833 netdev_rx_linux_wait(struct netdev_rx *rx_)
835 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
836 poll_fd_wait(rx->fd, POLLIN);
840 netdev_rx_linux_drain(struct netdev_rx *rx_)
842 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
845 int error = netdev_linux_do_ioctl(netdev_rx_get_name(rx_), &ifr,
846 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
850 drain_fd(rx->fd, ifr.ifr_qlen);
853 return drain_rcvbuf(rx->fd);
857 /* Sends 'buffer' on 'netdev'. Returns 0 if successful, otherwise a positive
858 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
859 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
860 * the packet is too big or too small to transmit on the device.
862 * The caller retains ownership of 'buffer' in all cases.
864 * The kernel maintains a packet transmission queue, so the caller is not
865 * expected to do additional queuing of packets. */
867 netdev_linux_send(struct netdev *netdev_, const void *data, size_t size)
872 if (!is_tap_netdev(netdev_)) {
873 /* Use our AF_PACKET socket to send to this device. */
874 struct sockaddr_ll sll;
881 sock = af_packet_sock();
886 error = get_ifindex(netdev_, &ifindex);
891 /* We don't bother setting most fields in sockaddr_ll because the
892 * kernel ignores them for SOCK_RAW. */
893 memset(&sll, 0, sizeof sll);
894 sll.sll_family = AF_PACKET;
895 sll.sll_ifindex = ifindex;
897 iov.iov_base = CONST_CAST(void *, data);
901 msg.msg_namelen = sizeof sll;
904 msg.msg_control = NULL;
905 msg.msg_controllen = 0;
908 retval = sendmsg(sock, &msg, 0);
910 /* Use the tap fd to send to this device. This is essential for
911 * tap devices, because packets sent to a tap device with an
912 * AF_PACKET socket will loop back to be *received* again on the
914 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
916 retval = write(netdev->state.tap.fd, data, size);
920 /* The Linux AF_PACKET implementation never blocks waiting for room
921 * for packets, instead returning ENOBUFS. Translate this into
922 * EAGAIN for the caller. */
923 if (errno == ENOBUFS) {
925 } else if (errno == EINTR) {
927 } else if (errno != EAGAIN) {
928 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
929 netdev_get_name(netdev_), strerror(errno));
932 } else if (retval != size) {
933 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%zd bytes of "
934 "%zu) on %s", retval, size, netdev_get_name(netdev_));
942 /* Registers with the poll loop to wake up from the next call to poll_block()
943 * when the packet transmission queue has sufficient room to transmit a packet
944 * with netdev_send().
946 * The kernel maintains a packet transmission queue, so the client is not
947 * expected to do additional queuing of packets. Thus, this function is
948 * unlikely to ever be used. It is included for completeness. */
950 netdev_linux_send_wait(struct netdev *netdev)
952 if (is_tap_netdev(netdev)) {
953 /* TAP device always accepts packets.*/
954 poll_immediate_wake();
958 /* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
959 * otherwise a positive errno value. */
961 netdev_linux_set_etheraddr(struct netdev *netdev_,
962 const uint8_t mac[ETH_ADDR_LEN])
964 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
965 struct netdev_saved_flags *sf = NULL;
968 if (netdev->cache_valid & VALID_ETHERADDR) {
969 if (netdev->ether_addr_error) {
970 return netdev->ether_addr_error;
972 if (eth_addr_equals(netdev->etheraddr, mac)) {
975 netdev->cache_valid &= ~VALID_ETHERADDR;
978 /* Tap devices must be brought down before setting the address. */
979 if (is_tap_netdev(netdev_)) {
980 netdev_turn_flags_off(netdev_, NETDEV_UP, &sf);
982 error = set_etheraddr(netdev_get_name(netdev_), mac);
983 if (!error || error == ENODEV) {
984 netdev->ether_addr_error = error;
985 netdev->cache_valid |= VALID_ETHERADDR;
987 memcpy(netdev->etheraddr, mac, ETH_ADDR_LEN);
991 netdev_restore_flags(sf);
996 /* Copies 'netdev''s MAC address to 'mac' which is passed as param. */
998 netdev_linux_get_etheraddr(const struct netdev *netdev_,
999 uint8_t mac[ETH_ADDR_LEN])
1001 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1003 if (!(netdev->cache_valid & VALID_ETHERADDR)) {
1004 int error = get_etheraddr(netdev_get_name(netdev_),
1007 netdev->ether_addr_error = error;
1008 netdev->cache_valid |= VALID_ETHERADDR;
1011 if (!netdev->ether_addr_error) {
1012 memcpy(mac, netdev->etheraddr, ETH_ADDR_LEN);
1015 return netdev->ether_addr_error;
1018 /* Returns the maximum size of transmitted (and received) packets on 'netdev',
1019 * in bytes, not including the hardware header; thus, this is typically 1500
1020 * bytes for Ethernet devices. */
1022 netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
1024 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1025 if (!(netdev->cache_valid & VALID_MTU)) {
1029 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
1030 SIOCGIFMTU, "SIOCGIFMTU");
1032 netdev->netdev_mtu_error = error;
1033 netdev->mtu = ifr.ifr_mtu;
1034 netdev->cache_valid |= VALID_MTU;
1037 if (!netdev->netdev_mtu_error) {
1038 *mtup = netdev->mtu;
1040 return netdev->netdev_mtu_error;
/* Sets the maximum transmission unit (MTU) of the given device using the Linux
 * networking ioctl interface.
1047 netdev_linux_set_mtu(const struct netdev *netdev_, int mtu)
1049 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1053 if (netdev->cache_valid & VALID_MTU) {
1054 if (netdev->netdev_mtu_error) {
1055 return netdev->netdev_mtu_error;
1057 if (netdev->mtu == mtu) {
1060 netdev->cache_valid &= ~VALID_MTU;
1063 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
1064 SIOCSIFMTU, "SIOCSIFMTU");
1065 if (!error || error == ENODEV) {
1066 netdev->netdev_mtu_error = error;
1067 netdev->mtu = ifr.ifr_mtu;
1068 netdev->cache_valid |= VALID_MTU;
1073 /* Returns the ifindex of 'netdev', if successful, as a positive number.
1074 * On failure, returns a negative errno value. */
1076 netdev_linux_get_ifindex(const struct netdev *netdev)
1080 error = get_ifindex(netdev, &ifindex);
1081 return error ? -error : ifindex;
1085 netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
1087 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1089 if (netdev->miimon_interval > 0) {
1090 *carrier = netdev->miimon;
1092 *carrier = (netdev->ifi_flags & IFF_RUNNING) != 0;
1098 static long long int
1099 netdev_linux_get_carrier_resets(const struct netdev *netdev)
1101 return netdev_linux_cast(netdev)->carrier_resets;
1105 netdev_linux_do_miimon(const char *name, int cmd, const char *cmd_name,
1106 struct mii_ioctl_data *data)
1111 memset(&ifr, 0, sizeof ifr);
1112 memcpy(&ifr.ifr_data, data, sizeof *data);
1113 error = netdev_linux_do_ioctl(name, &ifr, cmd, cmd_name);
1114 memcpy(data, &ifr.ifr_data, sizeof *data);
1120 netdev_linux_get_miimon(const char *name, bool *miimon)
1122 struct mii_ioctl_data data;
1127 memset(&data, 0, sizeof data);
1128 error = netdev_linux_do_miimon(name, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
1130 /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
1131 data.reg_num = MII_BMSR;
1132 error = netdev_linux_do_miimon(name, SIOCGMIIREG, "SIOCGMIIREG",
1136 *miimon = !!(data.val_out & BMSR_LSTATUS);
1138 VLOG_WARN_RL(&rl, "%s: failed to query MII", name);
1141 struct ethtool_cmd ecmd;
1143 VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
1146 COVERAGE_INC(netdev_get_ethtool);
1147 memset(&ecmd, 0, sizeof ecmd);
1148 error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
1151 struct ethtool_value eval;
1153 memcpy(&eval, &ecmd, sizeof eval);
1154 *miimon = !!eval.data;
1156 VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
1164 netdev_linux_set_miimon_interval(struct netdev *netdev_,
1165 long long int interval)
1167 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1169 interval = interval > 0 ? MAX(interval, 100) : 0;
1170 if (netdev->miimon_interval != interval) {
1171 netdev->miimon_interval = interval;
1172 timer_set_expired(&netdev->miimon_timer);
1179 netdev_linux_miimon_run(void)
1181 struct shash device_shash;
1182 struct shash_node *node;
1184 shash_init(&device_shash);
1185 netdev_get_devices(&netdev_linux_class, &device_shash);
1186 SHASH_FOR_EACH (node, &device_shash) {
1187 struct netdev_linux *dev = node->data;
1190 if (dev->miimon_interval <= 0 || !timer_expired(&dev->miimon_timer)) {
1194 netdev_linux_get_miimon(dev->up.name, &miimon);
1195 if (miimon != dev->miimon) {
1196 dev->miimon = miimon;
1197 netdev_linux_changed(dev, dev->ifi_flags, 0);
1200 timer_set_duration(&dev->miimon_timer, dev->miimon_interval);
1203 shash_destroy(&device_shash);
1207 netdev_linux_miimon_wait(void)
1209 struct shash device_shash;
1210 struct shash_node *node;
1212 shash_init(&device_shash);
1213 netdev_get_devices(&netdev_linux_class, &device_shash);
1214 SHASH_FOR_EACH (node, &device_shash) {
1215 struct netdev_linux *dev = node->data;
1217 if (dev->miimon_interval > 0) {
1218 timer_wait(&dev->miimon_timer);
1221 shash_destroy(&device_shash);
/* Check whether we can use RTM_GETLINK to get network device statistics.
1225 * In pre-2.6.19 kernels, this was only available if wireless extensions were
1228 check_for_working_netlink_stats(void)
1230 /* Decide on the netdev_get_stats() implementation to use. Netlink is
1231 * preferable, so if that works, we'll use it. */
1232 int ifindex = do_get_ifindex("lo");
1234 VLOG_WARN("failed to get ifindex for lo, "
1235 "obtaining netdev stats from proc");
1238 struct netdev_stats stats;
1239 int error = get_stats_via_netlink(ifindex, &stats);
1241 VLOG_DBG("obtaining netdev stats via rtnetlink");
1244 VLOG_INFO("RTM_GETLINK failed (%s), obtaining netdev stats "
1245 "via proc (you are probably running a pre-2.6.19 "
1246 "kernel)", strerror(error));
/* Helper taking pointers to two uint64_t values.  Only the signature is
 * visible in this listing; judging by the name and by its callers (which pass
 * matching rx/tx counter pairs), it presumably exchanges '*a' and '*b' --
 * confirm against the full source. */
1253 swap_uint64(uint64_t *a, uint64_t *b)
1260 /* Copies 'src' into 'dst', performing format conversion in the process.
1262 * 'src' is allowed to be misaligned. */
1264 netdev_stats_from_ovs_vport_stats(struct netdev_stats *dst,
1265 const struct ovs_vport_stats *src)
/* The eight counters taken from 'src' are read with get_unaligned_u64()
 * because 'src' may be misaligned. */
1267 dst->rx_packets = get_unaligned_u64(&src->rx_packets);
1268 dst->tx_packets = get_unaligned_u64(&src->tx_packets);
1269 dst->rx_bytes = get_unaligned_u64(&src->rx_bytes);
1270 dst->tx_bytes = get_unaligned_u64(&src->tx_bytes);
1271 dst->rx_errors = get_unaligned_u64(&src->rx_errors);
1272 dst->tx_errors = get_unaligned_u64(&src->tx_errors);
1273 dst->rx_dropped = get_unaligned_u64(&src->rx_dropped);
1274 dst->tx_dropped = get_unaligned_u64(&src->tx_dropped);
/* The fields below are not read from 'src'; they are reset to zero. */
1276 dst->collisions = 0;
1277 dst->rx_length_errors = 0;
1278 dst->rx_over_errors = 0;
1279 dst->rx_crc_errors = 0;
1280 dst->rx_frame_errors = 0;
1281 dst->rx_fifo_errors = 0;
1282 dst->rx_missed_errors = 0;
1283 dst->tx_aborted_errors = 0;
1284 dst->tx_carrier_errors = 0;
1285 dst->tx_fifo_errors = 0;
1286 dst->tx_heartbeat_errors = 0;
1287 dst->tx_window_errors = 0;
/* Fetches the vport named after 'netdev' with dpif_linux_vport_get() and, when
 * the reply carries statistics, converts them into '*stats' with
 * netdev_stats_from_ovs_vport_stats().  What happens on a lookup error or on
 * a reply without stats is on lines not visible in this listing. */
1291 get_stats_via_vport__(const struct netdev *netdev, struct netdev_stats *stats)
1293 struct dpif_linux_vport reply;
1297 error = dpif_linux_vport_get(netdev_get_name(netdev), &reply, &buf);
1300 } else if (!reply.stats) {
1305 netdev_stats_from_ovs_vport_stats(stats, reply.stats);
/* Refreshes '*stats' from the vport layer and records the outcome in
 * 'netdev->vport_stats_error'.  The query is attempted when no error is
 * currently recorded or when VALID_VPORT_STAT_ERROR is not set in
 * 'cache_valid'.  Errors other than ENOENT are logged with VLOG_WARN_RL. */
1313 get_stats_via_vport(const struct netdev *netdev_,
1314 struct netdev_stats *stats)
1316 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1318 if (!netdev->vport_stats_error ||
1319 !(netdev->cache_valid & VALID_VPORT_STAT_ERROR)) {
1322 error = get_stats_via_vport__(netdev_, stats);
1323 if (error && error != ENOENT) {
1324 VLOG_WARN_RL(&rl, "%s: obtaining netdev stats via vport failed "
1325 "(%s)", netdev_get_name(netdev_), strerror(error));
/* Remember the result (including success) and mark it as valid. */
1327 netdev->vport_stats_error = error;
1328 netdev->cache_valid |= VALID_VPORT_STAT_ERROR;
/* Retrieves statistics for 'netdev_' from the kernel, either through
 * get_stats_via_netlink() (using the device's ifindex) or through
 * get_stats_via_proc() (using the device's name).  Which one is used is
 * decided by check_for_working_netlink_stats() the first time this function
 * runs and remembered in a function-local static. */
1333 netdev_linux_sys_get_stats(const struct netdev *netdev_,
1334 struct netdev_stats *stats)
/* -1 means "not probed yet". */
1336 static int use_netlink_stats = -1;
1339 if (use_netlink_stats < 0) {
1340 use_netlink_stats = check_for_working_netlink_stats();
1343 if (use_netlink_stats) {
1346 error = get_ifindex(netdev_, &ifindex);
1348 error = get_stats_via_netlink(ifindex, stats);
1351 error = get_stats_via_proc(netdev_get_name(netdev_), stats);
1355 VLOG_WARN_RL(&rl, "%s: linux-sys get stats failed %d",
1356 netdev_get_name(netdev_), error);
1362 /* Retrieves current device stats for 'netdev-linux'. */
/* Vport-layer statistics are requested first (into '*stats'), then system
 * statistics (into 'dev_stats').  'netdev->vport_stats_error' then selects
 * between relying on the system statistics (see the comment below) and
 * keeping the vport statistics with the system error, drop, multicast and
 * collision counters added on top. */
1364 netdev_linux_get_stats(const struct netdev *netdev_,
1365 struct netdev_stats *stats)
1367 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1368 struct netdev_stats dev_stats;
1371 get_stats_via_vport(netdev_, stats);
1373 error = netdev_linux_sys_get_stats(netdev_, &dev_stats);
1376 if (netdev->vport_stats_error) {
1383 if (netdev->vport_stats_error) {
1384 /* stats not available from OVS then use ioctl stats. */
1387 stats->rx_errors += dev_stats.rx_errors;
1388 stats->tx_errors += dev_stats.tx_errors;
1389 stats->rx_dropped += dev_stats.rx_dropped;
1390 stats->tx_dropped += dev_stats.tx_dropped;
1391 stats->multicast += dev_stats.multicast;
1392 stats->collisions += dev_stats.collisions;
1393 stats->rx_length_errors += dev_stats.rx_length_errors;
1394 stats->rx_over_errors += dev_stats.rx_over_errors;
1395 stats->rx_crc_errors += dev_stats.rx_crc_errors;
1396 stats->rx_frame_errors += dev_stats.rx_frame_errors;
1397 stats->rx_fifo_errors += dev_stats.rx_fifo_errors;
1398 stats->rx_missed_errors += dev_stats.rx_missed_errors;
1399 stats->tx_aborted_errors += dev_stats.tx_aborted_errors;
1400 stats->tx_carrier_errors += dev_stats.tx_carrier_errors;
1401 stats->tx_fifo_errors += dev_stats.tx_fifo_errors;
1402 stats->tx_heartbeat_errors += dev_stats.tx_heartbeat_errors;
1403 stats->tx_window_errors += dev_stats.tx_window_errors;
1408 /* Retrieves current device stats for 'netdev-tap' netdev or
1409 * netdev-internal. */
/* Same overall flow as netdev_linux_get_stats(): vport statistics first, then
 * system statistics in 'dev_stats'.  Note the direction swap further down:
 * the rx and tx packet/byte/error/drop counters in '*stats' are exchanged, the
 * detailed rx_ and tx_ error counters are zeroed, and system tx_dropped and
 * tx_errors are added to the rx_ results (and vice versa). */
1411 netdev_tap_get_stats(const struct netdev *netdev_,
1412 struct netdev_stats *stats)
1414 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1415 struct netdev_stats dev_stats;
1418 get_stats_via_vport(netdev_, stats);
1420 error = netdev_linux_sys_get_stats(netdev_, &dev_stats);
1422 if (netdev->vport_stats_error) {
1429 /* If this port is an internal port then the transmit and receive stats
1430 * will appear to be swapped relative to the other ports since we are the
1431 * one sending the data, not a remote computer. For consistency, we swap
1432 * them back here. This does not apply if we are getting stats from the
1433 * vport layer because it always tracks stats from the perspective of the
1435 if (netdev->vport_stats_error) {
1437 swap_uint64(&stats->rx_packets, &stats->tx_packets);
1438 swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
1439 swap_uint64(&stats->rx_errors, &stats->tx_errors);
1440 swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
1441 stats->rx_length_errors = 0;
1442 stats->rx_over_errors = 0;
1443 stats->rx_crc_errors = 0;
1444 stats->rx_frame_errors = 0;
1445 stats->rx_fifo_errors = 0;
1446 stats->rx_missed_errors = 0;
1447 stats->tx_aborted_errors = 0;
1448 stats->tx_carrier_errors = 0;
1449 stats->tx_fifo_errors = 0;
1450 stats->tx_heartbeat_errors = 0;
1451 stats->tx_window_errors = 0;
1453 stats->rx_dropped += dev_stats.tx_dropped;
1454 stats->tx_dropped += dev_stats.rx_dropped;
1456 stats->rx_errors += dev_stats.tx_errors;
1457 stats->tx_errors += dev_stats.rx_errors;
1459 stats->multicast += dev_stats.multicast;
1460 stats->collisions += dev_stats.collisions;
// Statistics for an internal device come from the vport layer only:
// get_stats_via_vport() fills '*stats', and the error it recorded in
// 'netdev->vport_stats_error' is the return value.
1466 netdev_internal_get_stats(const struct netdev *netdev_,
1467 struct netdev_stats *stats)
1469 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1471 get_stats_via_vport(netdev_, stats);
1472 return netdev->vport_stats_error;
// Pushes the eight packet/byte/error/drop counters from 'stats' to the vport
// named after 'netdev' by sending an OVS_VPORT_CMD_SET transaction.  ENODEV
// from the transaction gets special treatment, as explained further down.
1476 netdev_internal_set_stats(struct netdev *netdev,
1477 const struct netdev_stats *stats)
1479 struct ovs_vport_stats vport_stats;
1480 struct dpif_linux_vport vport;
1483 vport_stats.rx_packets = stats->rx_packets;
1484 vport_stats.tx_packets = stats->tx_packets;
1485 vport_stats.rx_bytes = stats->rx_bytes;
1486 vport_stats.tx_bytes = stats->tx_bytes;
1487 vport_stats.rx_errors = stats->rx_errors;
1488 vport_stats.tx_errors = stats->tx_errors;
1489 vport_stats.rx_dropped = stats->rx_dropped;
1490 vport_stats.tx_dropped = stats->tx_dropped;
1492 dpif_linux_vport_init(&vport);
1493 vport.cmd = OVS_VPORT_CMD_SET;
1494 vport.name = netdev_get_name(netdev);
1495 vport.stats = &vport_stats;
1497 err = dpif_linux_vport_transact(&vport, NULL, NULL);
1499 /* If the vport layer doesn't know about the device, that doesn't mean it
1500 * doesn't exist (after all we were able to open it when netdev_open() was
1501 * called), it just means that it isn't attached and we'll be getting
1502 * stats a different way. */
1503 if (err == ENODEV) {
/* Fills in the cached 'supported', 'advertised', 'current' and 'peer' feature
 * bitmaps of 'netdev' from an ETHTOOL_GSET query, translating ethtool
 * SUPPORTED_*, ADVERTISED_*, speed/duplex and port values into NETDEV_F_*
 * bits.  At the end VALID_FEATURES is set in 'cache_valid' and the query
 * result is stored in 'get_features_error'.
 *
 * The first check consults VALID_FEATURES, presumably so that a cached
 * result is reused instead of querying again (the statement it guards is not
 * visible in this listing). */
1511 netdev_linux_read_features(struct netdev_linux *netdev)
1513 struct ethtool_cmd ecmd;
1517 if (netdev->cache_valid & VALID_FEATURES) {
1521 COVERAGE_INC(netdev_get_ethtool);
1522 memset(&ecmd, 0, sizeof ecmd);
1523 error = netdev_linux_do_ethtool(netdev->up.name, &ecmd,
1524 ETHTOOL_GSET, "ETHTOOL_GSET");
1529 /* Supported features. */
1530 netdev->supported = 0;
1531 if (ecmd.supported & SUPPORTED_10baseT_Half) {
1532 netdev->supported |= NETDEV_F_10MB_HD;
1534 if (ecmd.supported & SUPPORTED_10baseT_Full) {
1535 netdev->supported |= NETDEV_F_10MB_FD;
1537 if (ecmd.supported & SUPPORTED_100baseT_Half) {
1538 netdev->supported |= NETDEV_F_100MB_HD;
1540 if (ecmd.supported & SUPPORTED_100baseT_Full) {
1541 netdev->supported |= NETDEV_F_100MB_FD;
1543 if (ecmd.supported & SUPPORTED_1000baseT_Half) {
1544 netdev->supported |= NETDEV_F_1GB_HD;
1546 if (ecmd.supported & SUPPORTED_1000baseT_Full) {
1547 netdev->supported |= NETDEV_F_1GB_FD;
1549 if (ecmd.supported & SUPPORTED_10000baseT_Full) {
1550 netdev->supported |= NETDEV_F_10GB_FD;
1552 if (ecmd.supported & SUPPORTED_TP) {
1553 netdev->supported |= NETDEV_F_COPPER;
1555 if (ecmd.supported & SUPPORTED_FIBRE) {
1556 netdev->supported |= NETDEV_F_FIBER;
1558 if (ecmd.supported & SUPPORTED_Autoneg) {
1559 netdev->supported |= NETDEV_F_AUTONEG;
1561 if (ecmd.supported & SUPPORTED_Pause) {
1562 netdev->supported |= NETDEV_F_PAUSE;
1564 if (ecmd.supported & SUPPORTED_Asym_Pause) {
1565 netdev->supported |= NETDEV_F_PAUSE_ASYM;
1568 /* Advertised features. */
1569 netdev->advertised = 0;
1570 if (ecmd.advertising & ADVERTISED_10baseT_Half) {
1571 netdev->advertised |= NETDEV_F_10MB_HD;
1573 if (ecmd.advertising & ADVERTISED_10baseT_Full) {
1574 netdev->advertised |= NETDEV_F_10MB_FD;
1576 if (ecmd.advertising & ADVERTISED_100baseT_Half) {
1577 netdev->advertised |= NETDEV_F_100MB_HD;
1579 if (ecmd.advertising & ADVERTISED_100baseT_Full) {
1580 netdev->advertised |= NETDEV_F_100MB_FD;
1582 if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
1583 netdev->advertised |= NETDEV_F_1GB_HD;
1585 if (ecmd.advertising & ADVERTISED_1000baseT_Full) {
1586 netdev->advertised |= NETDEV_F_1GB_FD;
1588 if (ecmd.advertising & ADVERTISED_10000baseT_Full) {
1589 netdev->advertised |= NETDEV_F_10GB_FD;
1591 if (ecmd.advertising & ADVERTISED_TP) {
1592 netdev->advertised |= NETDEV_F_COPPER;
1594 if (ecmd.advertising & ADVERTISED_FIBRE) {
1595 netdev->advertised |= NETDEV_F_FIBER;
1597 if (ecmd.advertising & ADVERTISED_Autoneg) {
1598 netdev->advertised |= NETDEV_F_AUTONEG;
1600 if (ecmd.advertising & ADVERTISED_Pause) {
1601 netdev->advertised |= NETDEV_F_PAUSE;
1603 if (ecmd.advertising & ADVERTISED_Asym_Pause) {
1604 netdev->advertised |= NETDEV_F_PAUSE_ASYM;
1607 /* Current settings. */
1609 if (speed == SPEED_10) {
1610 netdev->current = ecmd.duplex ? NETDEV_F_10MB_FD : NETDEV_F_10MB_HD;
1611 } else if (speed == SPEED_100) {
1612 netdev->current = ecmd.duplex ? NETDEV_F_100MB_FD : NETDEV_F_100MB_HD;
1613 } else if (speed == SPEED_1000) {
1614 netdev->current = ecmd.duplex ? NETDEV_F_1GB_FD : NETDEV_F_1GB_HD;
1615 } else if (speed == SPEED_10000) {
1616 netdev->current = NETDEV_F_10GB_FD;
1617 } else if (speed == 40000) {
1618 netdev->current = NETDEV_F_40GB_FD;
1619 } else if (speed == 100000) {
1620 netdev->current = NETDEV_F_100GB_FD;
1621 } else if (speed == 1000000) {
1622 netdev->current = NETDEV_F_1TB_FD;
1624 netdev->current = 0;
1627 if (ecmd.port == PORT_TP) {
1628 netdev->current |= NETDEV_F_COPPER;
1629 } else if (ecmd.port == PORT_FIBRE) {
1630 netdev->current |= NETDEV_F_FIBER;
1634 netdev->current |= NETDEV_F_AUTONEG;
1637 /* Peer advertisements. */
1638 netdev->peer = 0; /* XXX */
1641 netdev->cache_valid |= VALID_FEATURES;
1642 netdev->get_features_error = error;
1645 /* Stores the features supported by 'netdev' into each of '*current',
1646 * '*advertised', '*supported', and '*peer' that are non-null. Each value is a
1647 * bitmap of NETDEV_* bits. Returns 0 if successful, otherwise a positive
1650 netdev_linux_get_features(const struct netdev *netdev_,
1651 enum netdev_features *current,
1652 enum netdev_features *advertised,
1653 enum netdev_features *supported,
1654 enum netdev_features *peer)
1656 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1658 netdev_linux_read_features(netdev);
1660 if (!netdev->get_features_error) {
1661 *current = netdev->current;
1662 *advertised = netdev->advertised;
1663 *supported = netdev->supported;
1664 *peer = netdev->peer;
1666 return netdev->get_features_error;
1669 /* Set the features advertised by 'netdev' to 'advertise'. */
/* The current settings are read with ETHTOOL_GSET, 'ecmd.advertising' is
 * rebuilt from the NETDEV_F_* bits in 'advertise', and the result is written
 * back with ETHTOOL_SSET, whose result is returned. */
1671 netdev_linux_set_advertisements(struct netdev *netdev,
1672 enum netdev_features advertise)
1674 struct ethtool_cmd ecmd;
1677 COVERAGE_INC(netdev_get_ethtool);
1678 memset(&ecmd, 0, sizeof ecmd);
1679 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1680 ETHTOOL_GSET, "ETHTOOL_GSET");
/* Start from an empty mask so that only the requested bits are advertised. */
1685 ecmd.advertising = 0;
1686 if (advertise & NETDEV_F_10MB_HD) {
1687 ecmd.advertising |= ADVERTISED_10baseT_Half;
1689 if (advertise & NETDEV_F_10MB_FD) {
1690 ecmd.advertising |= ADVERTISED_10baseT_Full;
1692 if (advertise & NETDEV_F_100MB_HD) {
1693 ecmd.advertising |= ADVERTISED_100baseT_Half;
1695 if (advertise & NETDEV_F_100MB_FD) {
1696 ecmd.advertising |= ADVERTISED_100baseT_Full;
1698 if (advertise & NETDEV_F_1GB_HD) {
1699 ecmd.advertising |= ADVERTISED_1000baseT_Half;
1701 if (advertise & NETDEV_F_1GB_FD) {
1702 ecmd.advertising |= ADVERTISED_1000baseT_Full;
1704 if (advertise & NETDEV_F_10GB_FD) {
1705 ecmd.advertising |= ADVERTISED_10000baseT_Full;
1707 if (advertise & NETDEV_F_COPPER) {
1708 ecmd.advertising |= ADVERTISED_TP;
1710 if (advertise & NETDEV_F_FIBER) {
1711 ecmd.advertising |= ADVERTISED_FIBRE;
1713 if (advertise & NETDEV_F_AUTONEG) {
1714 ecmd.advertising |= ADVERTISED_Autoneg;
1716 if (advertise & NETDEV_F_PAUSE) {
1717 ecmd.advertising |= ADVERTISED_Pause;
1719 if (advertise & NETDEV_F_PAUSE_ASYM) {
1720 ecmd.advertising |= ADVERTISED_Asym_Pause;
1722 COVERAGE_INC(netdev_set_ethtool);
1723 return netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1724 ETHTOOL_SSET, "ETHTOOL_SSET");
1727 /* Attempts to set input rate limiting (policing) policy. Returns 0 if
1728 * successful, otherwise a positive errno value. */
/* 'kbits_burst' is normalized first (see the inline comments).  When a
 * previous result is cached (VALID_POLICING), a cached error is returned
 * as-is and an unchanged rate/burst pair is assumed to be applied already.
 * Otherwise the existing ingress qdisc is removed and the code that follows
 * adds a qdisc and a policer.  A result of 0 or ENODEV is cached at the
 * end. */
1730 netdev_linux_set_policing(struct netdev *netdev_,
1731 uint32_t kbits_rate, uint32_t kbits_burst)
1733 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1734 const char *netdev_name = netdev_get_name(netdev_);
1738 kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
1739 : !kbits_burst ? 1000 /* Default to 1000 kbits if 0. */
1740 : kbits_burst); /* Stick with user-specified value. */
1742 if (netdev->cache_valid & VALID_POLICING) {
1743 if (netdev->netdev_policing_error) {
1744 return netdev->netdev_policing_error;
1747 if (netdev->kbits_rate == kbits_rate &&
1748 netdev->kbits_burst == kbits_burst) {
1749 /* Assume that settings haven't changed since we last set them. */
1752 netdev->cache_valid &= ~VALID_POLICING;
1755 COVERAGE_INC(netdev_set_policing);
1756 /* Remove any existing ingress qdisc. */
1757 error = tc_add_del_ingress_qdisc(netdev_, false);
1759 VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
1760 netdev_name, strerror(error));
1765 error = tc_add_del_ingress_qdisc(netdev_, true);
1767 VLOG_WARN_RL(&rl, "%s: adding policing qdisc failed: %s",
1768 netdev_name, strerror(error));
1772 error = tc_add_policer(netdev_, kbits_rate, kbits_burst);
1774 VLOG_WARN_RL(&rl, "%s: adding policing action failed: %s",
1775 netdev_name, strerror(error));
1780 netdev->kbits_rate = kbits_rate;
1781 netdev->kbits_burst = kbits_burst;
/* Only success and ENODEV are remembered; other errors leave the cache
 * invalid, so the next call tries again. */
1784 if (!error || error == ENODEV) {
1785 netdev->netdev_policing_error = error;
1786 netdev->cache_valid |= VALID_POLICING;
/* Adds to 'types' the OVS name of every entry in the null-terminated 'tcs'
 * array that has a 'tc_install' callback and a nonempty 'ovs_name'. */
1792 netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
1795 const struct tc_ops *const *opsp;
1797 for (opsp = tcs; *opsp != NULL; opsp++) {
1798 const struct tc_ops *ops = *opsp;
1799 if (ops->tc_install && ops->ovs_name[0] != '\0') {
1800 sset_add(types, ops->ovs_name);
/* Scans the null-terminated 'tcs' array for an entry whose 'ovs_name' equals
 * 'name'.  (The return statements are not visible in this listing; callers
 * in this file treat a null result as "no such type".) */
1806 static const struct tc_ops *
1807 tc_lookup_ovs_name(const char *name)
1809 const struct tc_ops *const *opsp;
1811 for (opsp = tcs; *opsp != NULL; opsp++) {
1812 const struct tc_ops *ops = *opsp;
1813 if (!strcmp(name, ops->ovs_name)) {
/* Scans the null-terminated 'tcs' array for an entry whose 'linux_name' is
 * non-null and equals 'name'.  (The return statements are not visible in this
 * listing.) */
1820 static const struct tc_ops *
1821 tc_lookup_linux_name(const char *name)
1823 const struct tc_ops *const *opsp;
1825 for (opsp = tcs; *opsp != NULL; opsp++) {
1826 const struct tc_ops *ops = *opsp;
/* 'linux_name' may be null, so test it before comparing. */
1827 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
/* Searches the bucket selected by 'hash' in the queue map of 'netdev_''s
 * traffic control state for a queue whose 'queue_id' equals 'queue_id'.
 * Dereferences 'netdev->tc' without a null check. */
1834 static struct tc_queue *
1835 tc_find_queue__(const struct netdev *netdev_, unsigned int queue_id,
1838 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1839 struct tc_queue *queue;
1841 HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev->tc->queues) {
1842 if (queue->queue_id == queue_id) {
/* Convenience wrapper around tc_find_queue__() that derives the hash from
 * 'queue_id' with hash_int(queue_id, 0). */
1849 static struct tc_queue *
1850 tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
1852 return tc_find_queue__(netdev, queue_id, hash_int(queue_id, 0));
/* Looks up the traffic control implementation for 'type' by its OVS name and
 * reports its 'n_queues' in 'caps->n_queues'.  How an unknown 'type' is
 * handled is on lines not visible in this listing. */
1856 netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
1858 struct netdev_qos_capabilities *caps)
1860 const struct tc_ops *ops = tc_lookup_ovs_name(type);
1864 caps->n_queues = ops->n_queues;
/* Queries the qdisc of 'netdev_' with tc_query_qdisc(), stores the OVS name of
 * the current traffic control implementation in '*typep', and, if that
 * implementation has a 'qdisc_get' callback, lets it fill in 'details'. */
1869 netdev_linux_get_qos(const struct netdev *netdev_,
1870 const char **typep, struct smap *details)
1872 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1875 error = tc_query_qdisc(netdev_);
1880 *typep = netdev->tc->ops->ovs_name;
1881 return (netdev->tc->ops->qdisc_get
1882 ? netdev->tc->ops->qdisc_get(netdev_, details)
/* Configures QoS of kind 'type' with 'details' on 'netdev_'.  'type' must name
 * an implementation that has a 'tc_install' callback.  If it is the
 * implementation already in use, the request is delegated to its 'qdisc_set'
 * callback (or yields 0 when there is none).  Otherwise the existing qdisc is
 * deleted and the new one installed. */
1887 netdev_linux_set_qos(struct netdev *netdev_,
1888 const char *type, const struct smap *details)
1890 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1891 const struct tc_ops *new_ops;
1894 new_ops = tc_lookup_ovs_name(type);
1895 if (!new_ops || !new_ops->tc_install) {
1899 error = tc_query_qdisc(netdev_);
1904 if (new_ops == netdev->tc->ops) {
1905 return new_ops->qdisc_set ? new_ops->qdisc_set(netdev_, details) : 0;
1907 /* Delete existing qdisc. */
1908 error = tc_del_qdisc(netdev_);
/* Invariant checked here: after the delete step 'netdev->tc' is null. */
1912 ovs_assert(netdev->tc == NULL);
1914 /* Install new qdisc. */
1915 error = new_ops->tc_install(netdev_, details);
/* Invariant checked here: 'netdev->tc' is set exactly when installation
 * succeeded. */
1916 ovs_assert((error == 0) == (netdev->tc != NULL));
/* After a successful tc_query_qdisc(), looks up queue 'queue_id' with
 * tc_find_queue() and uses the current implementation's 'class_get' callback
 * to fill in 'details'.  The condition selecting between the callback and the
 * alternative result is not visible in this listing. */
1923 netdev_linux_get_queue(const struct netdev *netdev_,
1924 unsigned int queue_id, struct smap *details)
1926 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1929 error = tc_query_qdisc(netdev_);
1933 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
1935 ? netdev->tc->ops->class_get(netdev_, queue, details)
/* After a successful tc_query_qdisc(), configures queue 'queue_id' with
 * 'details' through the current implementation's 'class_set' callback.  The
 * request is not forwarded when 'queue_id' is outside the implementation's
 * 'n_queues' range or when there is no 'class_set' callback. */
1941 netdev_linux_set_queue(struct netdev *netdev_,
1942 unsigned int queue_id, const struct smap *details)
1944 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1947 error = tc_query_qdisc(netdev_);
1950 } else if (queue_id >= netdev->tc->ops->n_queues
1951 || !netdev->tc->ops->class_set) {
1955 return netdev->tc->ops->class_set(netdev_, queue_id, details);
/* After a successful tc_query_qdisc(), and provided the current
 * implementation has a 'class_delete' callback, looks up queue 'queue_id' and
 * passes it to that callback.  The condition selecting between the callback
 * and the alternative result is not visible in this listing. */
1959 netdev_linux_delete_queue(struct netdev *netdev_, unsigned int queue_id)
1961 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1964 error = tc_query_qdisc(netdev_);
1967 } else if (!netdev->tc->ops->class_delete) {
1970 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
1972 ? netdev->tc->ops->class_delete(netdev_, queue)
/* After a successful tc_query_qdisc(), and provided the current
 * implementation has a 'class_get_stats' callback, looks up queue 'queue_id'
 * and lets that callback fill in 'stats'.  The condition selecting between
 * the callback and the alternative result is not visible in this listing. */
1978 netdev_linux_get_queue_stats(const struct netdev *netdev_,
1979 unsigned int queue_id,
1980 struct netdev_queue_stats *stats)
1982 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1985 error = tc_query_qdisc(netdev_);
1988 } else if (!netdev->tc->ops->class_get_stats) {
1991 const struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
1993 ? netdev->tc->ops->class_get_stats(netdev_, queue, stats)
/* Builds an RTM_GETTCLASS request for 'netdev' with 'tcm_parent' 0, starts a
 * Netlink dump for it on 'rtnl_sock' in '*dump', and then releases the
 * request buffer.  Callers in this file test the return value for
 * success. */
1999 start_queue_dump(const struct netdev *netdev, struct nl_dump *dump)
2001 struct ofpbuf request;
2002 struct tcmsg *tcmsg;
2004 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
2008 tcmsg->tcm_parent = 0;
2009 nl_dump_start(dump, rtnl_sock, &request);
2010 ofpbuf_uninit(&request);
/* Iterates over the queues known for 'netdev_' and, for each one, obtains its
 * configuration through the current implementation's 'class_get' callback and
 * passes the queue id, the details and 'aux' to 'cb'.  Requires a successful
 * tc_query_qdisc() and the presence of a 'class_get' callback. */
2015 netdev_linux_dump_queues(const struct netdev *netdev_,
2016 netdev_dump_queues_cb *cb, void *aux)
2018 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2019 struct tc_queue *queue, *next_queue;
2020 struct smap details;
2024 error = tc_query_qdisc(netdev_);
2027 } else if (!netdev->tc->ops->class_get) {
2032 smap_init(&details);
/* The _SAFE iteration variant is used with an explicit 'next_queue'. */
2033 HMAP_FOR_EACH_SAFE (queue, next_queue, hmap_node,
2034 &netdev->tc->queues) {
/* 'details' is reused for every queue, so empty it first. */
2035 smap_clear(&details);
2037 error = netdev->tc->ops->class_get(netdev_, queue, &details);
2039 (*cb)(queue->queue_id, &details, aux);
2044 smap_destroy(&details);
/* Dumps the traffic classes of 'netdev_' over Netlink and hands every reply
 * message, together with 'cb' and 'aux', to the current implementation's
 * 'class_dump_stats' callback.  Requires a successful tc_query_qdisc() and
 * the presence of that callback.  An error from finishing the dump takes
 * precedence over 'last_error' in the return value. */
2050 netdev_linux_dump_queue_stats(const struct netdev *netdev_,
2051 netdev_dump_queue_stats_cb *cb, void *aux)
2053 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2054 struct nl_dump dump;
2059 error = tc_query_qdisc(netdev_);
2062 } else if (!netdev->tc->ops->class_dump_stats) {
2067 if (!start_queue_dump(netdev_, &dump)) {
2070 while (nl_dump_next(&dump, &msg)) {
2071 error = netdev->tc->ops->class_dump_stats(netdev_, &msg, cb, aux);
2077 error = nl_dump_done(&dump);
2078 return error ? error : last_error;
/* Stores the IPv4 address and netmask of 'netdev_' in '*address' and
 * '*netmask'.  When VALID_IN4 is not set in 'cache_valid', both values are
 * first fetched with SIOCGIFADDR and SIOCGIFNETMASK into the cache.  Returns
 * EADDRNOTAVAIL if the resulting address is INADDR_ANY, otherwise 0. */
2082 netdev_linux_get_in4(const struct netdev *netdev_,
2083 struct in_addr *address, struct in_addr *netmask)
2085 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2087 if (!(netdev->cache_valid & VALID_IN4)) {
2090 error = netdev_linux_get_ipv4(netdev_, &netdev->address,
2091 SIOCGIFADDR, "SIOCGIFADDR");
2096 error = netdev_linux_get_ipv4(netdev_, &netdev->netmask,
2097 SIOCGIFNETMASK, "SIOCGIFNETMASK");
2102 netdev->cache_valid |= VALID_IN4;
2104 *address = netdev->address;
2105 *netmask = netdev->netmask;
2106 return address->s_addr == INADDR_ANY ? EADDRNOTAVAIL : 0;
2110 netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
2111 struct in_addr netmask)
2113 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2116 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
2118 netdev->cache_valid |= VALID_IN4;
2119 netdev->address = address;
2120 netdev->netmask = netmask;
2121 if (address.s_addr != INADDR_ANY) {
2122 error = do_set_addr(netdev_, SIOCSIFNETMASK,
2123 "SIOCSIFNETMASK", netmask);
/* Parses 'line', one line in the format of /proc/net/if_inet6: the address as
 * 32 contiguous hex digits, four hex fields that are skipped, and finally the
 * interface name.  On success, stores the 16 address bytes in '*in6' and the
 * name (at most 16 characters plus the terminating null) in 'ifname', and
 * returns true.  Returns false if 'line' does not have this form, in which
 * case '*in6' and 'ifname' may have been partially written. */
static bool
parse_if_inet6_line(const char *line,
                    struct in6_addr *in6, char ifname[16 + 1])
{
    uint8_t *s6 = in6->s6_addr;
    bool ok;

#define X8 "%2"SCNx8
    /* 16 address bytes plus the interface name make 17 conversions; the four
     * "%*x" fields are suppressed, so sscanf() does not count them. */
    ok = sscanf(line,
                " "X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8
                "%*x %*x %*x %*x %16s\n",
                &s6[0], &s6[1], &s6[2], &s6[3],
                &s6[4], &s6[5], &s6[6], &s6[7],
                &s6[8], &s6[9], &s6[10], &s6[11],
                &s6[12], &s6[13], &s6[14], &s6[15],
                ifname) == 17;
    /* X8 is only meaningful for the format string above; do not leak it to
     * the rest of the file. */
#undef X8
    return ok;
}
2145 /* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address (if
2146 * 'in6' is non-null) and returns true. Otherwise, returns false. */
/* NOTE(review): the return statements and the store to '*in6' are not visible
 * in this listing; confirm that the "true/false" and "if non-null" wording
 * above matches what the function really does.
 *
 * On a cache miss (VALID_IN6 not set) the cached address starts out as
 * in6addr_any and /proc/net/if_inet6 is scanned line by line for an entry
 * whose interface name equals this device's name.  VALID_IN6 is set
 * afterwards, so the file is not read again while the cache stays valid. */
2148 netdev_linux_get_in6(const struct netdev *netdev_, struct in6_addr *in6)
2150 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2151 if (!(netdev->cache_valid & VALID_IN6)) {
2155 netdev->in6 = in6addr_any;
2157 file = fopen("/proc/net/if_inet6", "r");
2159 const char *name = netdev_get_name(netdev_);
2160 while (fgets(line, sizeof line, file)) {
2161 struct in6_addr in6_tmp;
2162 char ifname[16 + 1];
2163 if (parse_if_inet6_line(line, &in6_tmp, ifname)
2164 && !strcmp(name, ifname))
2166 netdev->in6 = in6_tmp;
2172 netdev->cache_valid |= VALID_IN6;
/* Initializes '*sa' as an AF_INET socket address holding 'addr': a zeroed
 * 'struct sockaddr_in' is filled in and then copied over the zeroed '*sa'. */
2179 make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
2181 struct sockaddr_in sin;
2182 memset(&sin, 0, sizeof sin);
2183 sin.sin_family = AF_INET;
2184 sin.sin_addr = addr;
/* Clear all of '*sa' first, then overlay the IPv4 form. */
2187 memset(sa, 0, sizeof *sa);
2188 memcpy(sa, &sin, sizeof sin);
/* Prepares an interface request for 'netdev' (name copied into 'ifr_name',
 * 'addr' stored in 'ifr_addr' via make_in4_sockaddr()) and submits it with
 * netdev_linux_do_ioctl() using 'ioctl_nr'.  Returns that function's
 * result. */
2192 do_set_addr(struct netdev *netdev,
2193 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
2196 ovs_strzcpy(ifr.ifr_name, netdev_get_name(netdev), sizeof ifr.ifr_name);
2197 make_in4_sockaddr(&ifr.ifr_addr, addr);
2199 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
2203 /* Adds 'router' as a default IP gateway. */
/* The route entry uses INADDR_ANY for both destination and genmask and
 * 'router' as gateway, with flags RTF_UP | RTF_GATEWAY, and is installed with
 * the SIOCADDRT ioctl on 'af_inet_sock'.  'netdev' itself is not used. */
2205 netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
2207 struct in_addr any = { INADDR_ANY };
2211 memset(&rt, 0, sizeof rt);
2212 make_in4_sockaddr(&rt.rt_dst, any);
2213 make_in4_sockaddr(&rt.rt_gateway, router);
2214 make_in4_sockaddr(&rt.rt_genmask, any);
2215 rt.rt_flags = RTF_UP | RTF_GATEWAY;
2216 error = ioctl(af_inet_sock, SIOCADDRT, &rt) < 0 ? errno : 0;
2218 VLOG_WARN("ioctl(SIOCADDRT): %s", strerror(error));
/* Looks for a route to 'host' in /proc/net/route.  '*netdev_name' starts out
 * as NULL.  For a parsed route that is up and whose masked destination equals
 * the masked 'host' address, '*next_hop' is set (0 for a directly reachable
 * host, otherwise the gateway) and '*netdev_name' is set to a copy of the
 * interface name made with xstrdup().  Lines that cannot be parsed are logged
 * with VLOG_WARN_RL. */
2224 netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
2227 static const char fn[] = "/proc/net/route";
2232 *netdev_name = NULL;
2233 stream = fopen(fn, "r");
2234 if (stream == NULL) {
2235 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
2240 while (fgets(line, sizeof line, stream)) {
2243 ovs_be32 dest, gateway, mask;
2244 int refcnt, metric, mtu;
2245 unsigned int flags, use, window, irtt;
/* A complete route line yields 11 converted fields. */
2248 "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
2250 iface, &dest, &gateway, &flags, &refcnt,
2251 &use, &metric, &mask, &mtu, &window, &irtt) != 11) {
2253 VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
2257 if (!(flags & RTF_UP)) {
2258 /* Skip routes that aren't up. */
2262 /* The output of 'dest', 'mask', and 'gateway' were given in
2263 * network byte order, so we don't need any endian
2264 * conversions here. */
2265 if ((dest & mask) == (host->s_addr & mask)) {
2267 /* The host is directly reachable. */
2268 next_hop->s_addr = 0;
2270 /* To reach the host, we must go through a gateway. */
2271 next_hop->s_addr = gateway;
2273 *netdev_name = xstrdup(iface);
/* Adds the driver name, driver version and firmware version of 'netdev_' to
 * 'smap'.  The values come from 'netdev->drvinfo', which is filled by an
 * ETHTOOL_GDRVINFO query when VALID_DRVINFO is not yet set in
 * 'cache_valid'. */
2285 netdev_linux_get_status(const struct netdev *netdev_, struct smap *smap)
2287 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2290 if (!(netdev->cache_valid & VALID_DRVINFO)) {
/* netdev_linux_do_ethtool() is given the drvinfo buffer through a
 * 'struct ethtool_cmd *', hence the cast. */
2291 struct ethtool_cmd *cmd = (struct ethtool_cmd *) &netdev->drvinfo;
2293 COVERAGE_INC(netdev_get_ethtool);
2294 memset(&netdev->drvinfo, 0, sizeof netdev->drvinfo);
2295 error = netdev_linux_do_ethtool(netdev->up.name,
2298 "ETHTOOL_GDRVINFO");
2300 netdev->cache_valid |= VALID_DRVINFO;
2305 smap_add(smap, "driver_name", netdev->drvinfo.driver);
2306 smap_add(smap, "driver_version", netdev->drvinfo.version);
2307 smap_add(smap, "firmware_version", netdev->drvinfo.fw_version);
/* Status for an internal device: reports the fixed driver name "openvswitch"
 * in 'smap'.  'netdev' is not used. */
2313 netdev_internal_get_status(const struct netdev *netdev OVS_UNUSED,
2316 smap_add(smap, "driver_name", "openvswitch");
2320 /* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
2321 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
2322 * returns 0. Otherwise, it returns a positive errno value; in particular,
2323 * ENXIO indicates that there is no ARP table entry for 'ip' on 'netdev'. */
2325 netdev_linux_arp_lookup(const struct netdev *netdev,
2326 ovs_be32 ip, uint8_t mac[ETH_ADDR_LEN])
2329 struct sockaddr_in sin;
/* Build the SIOCGARP request: protocol address 'ip', Ethernet hardware
 * type, and the device name. */
2332 memset(&r, 0, sizeof r);
2333 memset(&sin, 0, sizeof sin);
2334 sin.sin_family = AF_INET;
2335 sin.sin_addr.s_addr = ip;
2337 memcpy(&r.arp_pa, &sin, sizeof sin);
2338 r.arp_ha.sa_family = ARPHRD_ETHER;
2340 ovs_strzcpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
2341 COVERAGE_INC(netdev_arp_lookup);
2342 retval = ioctl(af_inet_sock, SIOCGARP, &r) < 0 ? errno : 0;
2344 memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
/* ENXIO (no entry) is an expected outcome and is not logged. */
2345 } else if (retval != ENXIO) {
2346 VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
2347 netdev_get_name(netdev), IP_ARGS(ip), strerror(retval));
/* Translates the NETDEV_UP and NETDEV_PROMISC bits of 'nd' into interface
 * flag bits (presumably IFF_UP and IFF_PROMISC; the assignments are not
 * visible in this listing). */
2353 nd_to_iff_flags(enum netdev_flags nd)
2356 if (nd & NETDEV_UP) {
2359 if (nd & NETDEV_PROMISC) {
/* Translates interface flag bits in 'iff' into 'enum netdev_flags' bits,
 * starting from 0.  IFF_PROMISC maps to NETDEV_PROMISC; any other mappings
 * are on lines not visible in this listing. */
2366 iff_to_nd_flags(int iff)
2368 enum netdev_flags nd = 0;
2372 if (iff & IFF_PROMISC) {
2373 nd |= NETDEV_PROMISC;
/* Turns off the flags in 'off' and turns on the flags in 'on' for 'netdev_'.
 * The previous flags, taken from the cached 'ifi_flags', are stored in
 * '*old_flagsp' as netdev flags.  The device is only touched when the
 * resulting flag word differs from the cached one; in that case the cached
 * flags are re-read afterwards with get_flags(). */
2379 netdev_linux_update_flags(struct netdev *netdev_, enum netdev_flags off,
2380 enum netdev_flags on, enum netdev_flags *old_flagsp)
2382 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2383 int old_flags, new_flags;
2386 old_flags = netdev->ifi_flags;
2387 *old_flagsp = iff_to_nd_flags(old_flags);
/* Clear the 'off' bits first, then set the 'on' bits. */
2388 new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
2389 if (new_flags != old_flags) {
2390 error = set_flags(netdev_get_name(netdev_), new_flags);
2391 get_flags(netdev_, &netdev->ifi_flags);
/* Returns the 'change_seq' member of the netdev_linux that 'netdev' is cast
 * to. */
2397 netdev_linux_change_seq(const struct netdev *netdev)
2399 return netdev_linux_cast(netdev)->change_seq;
/* Template for the 'struct netdev_class' definitions that follow.  The
 * parameters select the class name, the create function and the
 * stats/features/status callbacks; the remaining slots visible here are
 * filled with the shared netdev_linux_* functions or with NULL. */
2402 #define NETDEV_LINUX_CLASS(NAME, CREATE, GET_STATS, SET_STATS, \
2403 GET_FEATURES, GET_STATUS) \
2407 netdev_linux_init, \
2409 netdev_linux_wait, \
2412 netdev_linux_destroy, \
2413 NULL, /* get_config */ \
2414 NULL, /* set_config */ \
2415 NULL, /* get_tunnel_config */ \
2417 netdev_linux_rx_open, \
2419 netdev_linux_send, \
2420 netdev_linux_send_wait, \
2422 netdev_linux_set_etheraddr, \
2423 netdev_linux_get_etheraddr, \
2424 netdev_linux_get_mtu, \
2425 netdev_linux_set_mtu, \
2426 netdev_linux_get_ifindex, \
2427 netdev_linux_get_carrier, \
2428 netdev_linux_get_carrier_resets, \
2429 netdev_linux_set_miimon_interval, \
2434 netdev_linux_set_advertisements, \
2436 netdev_linux_set_policing, \
2437 netdev_linux_get_qos_types, \
2438 netdev_linux_get_qos_capabilities, \
2439 netdev_linux_get_qos, \
2440 netdev_linux_set_qos, \
2441 netdev_linux_get_queue, \
2442 netdev_linux_set_queue, \
2443 netdev_linux_delete_queue, \
2444 netdev_linux_get_queue_stats, \
2445 netdev_linux_dump_queues, \
2446 netdev_linux_dump_queue_stats, \
2448 netdev_linux_get_in4, \
2449 netdev_linux_set_in4, \
2450 netdev_linux_get_in6, \
2451 netdev_linux_add_router, \
2452 netdev_linux_get_next_hop, \
2454 netdev_linux_arp_lookup, \
2456 netdev_linux_update_flags, \
2458 netdev_linux_change_seq \
/* Class using netdev_linux_create, netdev_linux_get_stats (combined vport and
 * system statistics), no set_stats, and the ethtool-based features and status
 * callbacks. */
2461 const struct netdev_class netdev_linux_class =
2464 netdev_linux_create,
2465 netdev_linux_get_stats,
2466 NULL, /* set_stats */
2467 netdev_linux_get_features,
2468 netdev_linux_get_status);
/* Class using netdev_linux_create_tap and netdev_tap_get_stats (which swaps
 * the rx/tx direction of the counters), no set_stats, and the same features
 * and status callbacks as netdev_linux_class. */
2470 const struct netdev_class netdev_tap_class =
2473 netdev_linux_create_tap,
2474 netdev_tap_get_stats,
2475 NULL, /* set_stats */
2476 netdev_linux_get_features,
2477 netdev_linux_get_status);
/* Class using netdev_linux_create with the vport-based
 * netdev_internal_get_stats/netdev_internal_set_stats pair, no get_features
 * callback, and a status callback that reports the "openvswitch" driver. */
2479 const struct netdev_class netdev_internal_class =
2482 netdev_linux_create,
2483 netdev_internal_get_stats,
2484 netdev_internal_set_stats,
2485 NULL, /* get_features */
2486 netdev_internal_get_status);
/* Receive-side callback table: destroy, recv, wait and drain, in that
 * order. */
2488 static const struct netdev_rx_class netdev_rx_linux_class = {
2489 netdev_rx_linux_destroy,
2490 netdev_rx_linux_recv,
2491 netdev_rx_linux_wait,
2492 netdev_rx_linux_drain,
2495 /* HTB traffic control class. */
/* NOTE(review): the struct header lines are not visible in this listing.
 * Judging by htb_install__() and htb_setup_class__(), the first 'max_rate'
 * below presumably belongs to the qdisc-wide 'struct htb', and the fields
 * starting with 'tc_queue' to the per-queue 'struct htb_class' -- confirm
 * against the full source. */
2497 #define HTB_N_QUEUES 0xf000
2501 unsigned int max_rate; /* In bytes/s. */
2505 struct tc_queue tc_queue;
2506 unsigned int min_rate; /* In bytes/s. */
2507 unsigned int max_rate; /* In bytes/s. */
2508 unsigned int burst; /* In bytes. */
2509 unsigned int priority; /* Lower values are higher priorities. */
/* Returns the 'struct htb' whose 'tc' member is 'netdev_''s current traffic
 * control state, computed with CONTAINER_OF.  No check is made here that
 * 'netdev->tc' really is part of a 'struct htb'; that is up to the caller. */
2513 htb_get__(const struct netdev *netdev_)
2515 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2516 return CONTAINER_OF(netdev->tc, struct htb, tc);
/* Allocates a 'struct htb', initializes its 'tc' member for 'tc_ops_htb',
 * records 'max_rate' in it and makes it 'netdev_''s traffic control state.
 *
 * NOTE(review): 'max_rate' is a uint64_t here, while the 'max_rate' fields
 * declared above are 'unsigned int' -- confirm that the narrowing in the
 * assignment below is intended. */
2520 htb_install__(struct netdev *netdev_, uint64_t max_rate)
2522 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2525 htb = xmalloc(sizeof *htb);
2526 tc_init(&htb->tc, &tc_ops_htb);
2527 htb->max_rate = max_rate;
2529 netdev->tc = &htb->tc;
2532 /* Create an HTB qdisc.
2534 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
/* Any existing qdisc is deleted first.  The RTM_NEWQDISC request uses handle
 * 1:0 with TC_H_ROOT as parent and kind "htb"; its TCA_OPTIONS attribute
 * nests a TCA_HTB_INIT attribute carrying 'opt'.  Returns the result of
 * tc_transact(). */
2536 htb_setup_qdisc__(struct netdev *netdev)
2539 struct tc_htb_glob opt;
2540 struct ofpbuf request;
2541 struct tcmsg *tcmsg;
2543 tc_del_qdisc(netdev);
2545 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2546 NLM_F_EXCL | NLM_F_CREATE, &request);
2550 tcmsg->tcm_handle = tc_make_handle(1, 0);
2551 tcmsg->tcm_parent = TC_H_ROOT;
2553 nl_msg_put_string(&request, TCA_KIND, "htb");
2555 memset(&opt, 0, sizeof opt);
2556 opt.rate2quantum = 10;
2560 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2561 nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
2562 nl_msg_end_nested(&request, opt_offset);
2564 return tc_transact(&request, NULL);
2567 /* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
2568 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
/* The device MTU is needed to fill in the rate specifications and buffer
 * sizes, so it is fetched first with netdev_get_mtu().  The RTM_NEWTCLASS
 * request carries kind "htb" and, nested in TCA_OPTIONS, the TCA_HTB_PARMS
 * options plus rate tables for 'opt.rate' and 'opt.ceil'.  A failed
 * transaction is logged together with all class parameters. */
2570 htb_setup_class__(struct netdev *netdev, unsigned int handle,
2571 unsigned int parent, struct htb_class *class)
2574 struct tc_htb_opt opt;
2575 struct ofpbuf request;
2576 struct tcmsg *tcmsg;
2580 error = netdev_get_mtu(netdev, &mtu);
2582 VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU",
2583 netdev_get_name(netdev));
2587 memset(&opt, 0, sizeof opt);
2588 tc_fill_rate(&opt.rate, class->min_rate, mtu);
2589 tc_fill_rate(&opt.ceil, class->max_rate, mtu);
2590 opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
2591 opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
2592 opt.prio = class->priority;
2594 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
2598 tcmsg->tcm_handle = handle;
2599 tcmsg->tcm_parent = parent;
2601 nl_msg_put_string(&request, TCA_KIND, "htb");
2602 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2603 nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
2604 tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
2605 tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
2606 nl_msg_end_nested(&request, opt_offset);
2608 error = tc_transact(&request, NULL);
2610 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
2611 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
2612 netdev_get_name(netdev),
2613 tc_get_major(handle), tc_get_minor(handle),
2614 tc_get_major(parent), tc_get_minor(parent),
2615 class->min_rate, class->max_rate,
2616 class->burst, class->priority, strerror(error));
2621 /* Parses Netlink attributes in 'options' for HTB parameters and stores a
2622 * description of them into 'details'. The description complies with the
2623 * specification given in the vswitch database documentation for linux-htb
2626 htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
2628 static const struct nl_policy tca_htb_policy[] = {
2629 [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
2630 .min_len = sizeof(struct tc_htb_opt) },
2633 struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
2634 const struct tc_htb_opt *htb;
2636 if (!nl_parse_nested(nl_options, tca_htb_policy,
2637 attrs, ARRAY_SIZE(tca_htb_policy))) {
2638 VLOG_WARN_RL(&rl, "failed to parse HTB class options");
2642 htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
2643 class->min_rate = htb->rate.rate;
2644 class->max_rate = htb->ceil.rate;
2645 class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
2646 class->priority = htb->prio;
2651 htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2652 struct htb_class *options,
2653 struct netdev_queue_stats *stats)
2655 struct nlattr *nl_options;
2656 unsigned int handle;
2659 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2660 if (!error && queue_id) {
2661 unsigned int major = tc_get_major(handle);
2662 unsigned int minor = tc_get_minor(handle);
2663 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2664 *queue_id = minor - 1;
2669 if (!error && options) {
2670 error = htb_parse_tca_options__(nl_options, options);
2676 htb_parse_qdisc_details__(struct netdev *netdev,
2677 const struct smap *details, struct htb_class *hc)
2679 const char *max_rate_s;
2681 max_rate_s = smap_get(details, "max-rate");
2682 hc->max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
2683 if (!hc->max_rate) {
2684 enum netdev_features current;
2686 netdev_get_features(netdev, ¤t, NULL, NULL, NULL);
2687 hc->max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
2689 hc->min_rate = hc->max_rate;
2695 htb_parse_class_details__(struct netdev *netdev,
2696 const struct smap *details, struct htb_class *hc)
2698 const struct htb *htb = htb_get__(netdev);
2699 const char *min_rate_s = smap_get(details, "min-rate");
2700 const char *max_rate_s = smap_get(details, "max-rate");
2701 const char *burst_s = smap_get(details, "burst");
2702 const char *priority_s = smap_get(details, "priority");
2705 error = netdev_get_mtu(netdev, &mtu);
2707 VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU",
2708 netdev_get_name(netdev));
2712 /* HTB requires at least an mtu sized min-rate to send any traffic even
2713 * on uncongested links. */
2714 hc->min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
2715 hc->min_rate = MAX(hc->min_rate, mtu);
2716 hc->min_rate = MIN(hc->min_rate, htb->max_rate);
2719 hc->max_rate = (max_rate_s
2720 ? strtoull(max_rate_s, NULL, 10) / 8
2722 hc->max_rate = MAX(hc->max_rate, hc->min_rate);
2723 hc->max_rate = MIN(hc->max_rate, htb->max_rate);
2727 * According to hints in the documentation that I've read, it is important
2728 * that 'burst' be at least as big as the largest frame that might be
2729 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
2730 * but having it a bit too small is a problem. Since netdev_get_mtu()
2731 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
2732 * the MTU. We actually add 64, instead of 14, as a guard against
2733 * additional headers get tacked on somewhere that we're not aware of. */
2734 hc->burst = burst_s ? strtoull(burst_s, NULL, 10) / 8 : 0;
2735 hc->burst = MAX(hc->burst, mtu + 64);
2738 hc->priority = priority_s ? strtoul(priority_s, NULL, 10) : 0;
/* Queries the kernel for HTB class 'handle' (with parent 'parent') on
 * 'netdev' and parses the reply into 'options' and 'stats', either of which
 * may be null.
 *
 * Returns 0 if successful, otherwise a positive errno value. */
static int
htb_query_class__(const struct netdev *netdev, unsigned int handle,
                  unsigned int parent, struct htb_class *options,
                  struct netdev_queue_stats *stats)
{
    struct ofpbuf *reply;
    int error;

    error = tc_query_class(netdev, handle, parent, &reply);
    if (error) {
        return error;
    }

    error = htb_parse_tcmsg__(reply, NULL, options, stats);
    ofpbuf_delete(reply);
    return error;
}
2760 htb_tc_install(struct netdev *netdev, const struct smap *details)
2764 error = htb_setup_qdisc__(netdev);
2766 struct htb_class hc;
2768 htb_parse_qdisc_details__(netdev, details, &hc);
2769 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2770 tc_make_handle(1, 0), &hc);
2772 htb_install__(netdev, hc.max_rate);
2778 static struct htb_class *
2779 htb_class_cast__(const struct tc_queue *queue)
2781 return CONTAINER_OF(queue, struct htb_class, tc_queue);
2785 htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
2786 const struct htb_class *hc)
2788 struct htb *htb = htb_get__(netdev);
2789 size_t hash = hash_int(queue_id, 0);
2790 struct tc_queue *queue;
2791 struct htb_class *hcp;
2793 queue = tc_find_queue__(netdev, queue_id, hash);
2795 hcp = htb_class_cast__(queue);
2797 hcp = xmalloc(sizeof *hcp);
2798 queue = &hcp->tc_queue;
2799 queue->queue_id = queue_id;
2800 hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
2803 hcp->min_rate = hc->min_rate;
2804 hcp->max_rate = hc->max_rate;
2805 hcp->burst = hc->burst;
2806 hcp->priority = hc->priority;
2810 htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
2813 struct nl_dump dump;
2814 struct htb_class hc;
2816 /* Get qdisc options. */
2818 htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
2819 htb_install__(netdev, hc.max_rate);
2822 if (!start_queue_dump(netdev, &dump)) {
2825 while (nl_dump_next(&dump, &msg)) {
2826 unsigned int queue_id;
2828 if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
2829 htb_update_queue__(netdev, queue_id, &hc);
2832 nl_dump_done(&dump);
2838 htb_tc_destroy(struct tc *tc)
2840 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
2841 struct htb_class *hc, *next;
2843 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &htb->tc.queues) {
2844 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
2852 htb_qdisc_get(const struct netdev *netdev, struct smap *details)
2854 const struct htb *htb = htb_get__(netdev);
2855 smap_add_format(details, "max-rate", "%llu", 8ULL * htb->max_rate);
2860 htb_qdisc_set(struct netdev *netdev, const struct smap *details)
2862 struct htb_class hc;
2865 htb_parse_qdisc_details__(netdev, details, &hc);
2866 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2867 tc_make_handle(1, 0), &hc);
2869 htb_get__(netdev)->max_rate = hc.max_rate;
2875 htb_class_get(const struct netdev *netdev OVS_UNUSED,
2876 const struct tc_queue *queue, struct smap *details)
2878 const struct htb_class *hc = htb_class_cast__(queue);
2880 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
2881 if (hc->min_rate != hc->max_rate) {
2882 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
2884 smap_add_format(details, "burst", "%llu", 8ULL * hc->burst);
2886 smap_add_format(details, "priority", "%u", hc->priority);
2892 htb_class_set(struct netdev *netdev, unsigned int queue_id,
2893 const struct smap *details)
2895 struct htb_class hc;
2898 error = htb_parse_class_details__(netdev, details, &hc);
2903 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
2904 tc_make_handle(1, 0xfffe), &hc);
2909 htb_update_queue__(netdev, queue_id, &hc);
2914 htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
2916 struct htb_class *hc = htb_class_cast__(queue);
2917 struct htb *htb = htb_get__(netdev);
2920 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
2922 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
2929 htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
2930 struct netdev_queue_stats *stats)
2932 return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
2933 tc_make_handle(1, 0xfffe), NULL, stats);
2937 htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
2938 const struct ofpbuf *nlmsg,
2939 netdev_dump_queue_stats_cb *cb, void *aux)
2941 struct netdev_queue_stats stats;
2942 unsigned int handle, major, minor;
2945 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
2950 major = tc_get_major(handle);
2951 minor = tc_get_minor(handle);
2952 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2953 (*cb)(minor - 1, &stats, aux);
2958 static const struct tc_ops tc_ops_htb = {
2959 "htb", /* linux_name */
2960 "linux-htb", /* ovs_name */
2961 HTB_N_QUEUES, /* n_queues */
2970 htb_class_get_stats,
2971 htb_class_dump_stats
2974 /* "linux-hfsc" traffic control class. */
2976 #define HFSC_N_QUEUES 0xf000
2984 struct tc_queue tc_queue;
2989 static struct hfsc *
2990 hfsc_get__(const struct netdev *netdev_)
2992 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2993 return CONTAINER_OF(netdev->tc, struct hfsc, tc);
2996 static struct hfsc_class *
2997 hfsc_class_cast__(const struct tc_queue *queue)
2999 return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
3003 hfsc_install__(struct netdev *netdev_, uint32_t max_rate)
3005 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3008 hfsc = xmalloc(sizeof *hfsc);
3009 tc_init(&hfsc->tc, &tc_ops_hfsc);
3010 hfsc->max_rate = max_rate;
3011 netdev->tc = &hfsc->tc;
3015 hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
3016 const struct hfsc_class *hc)
3020 struct hfsc_class *hcp;
3021 struct tc_queue *queue;
3023 hfsc = hfsc_get__(netdev);
3024 hash = hash_int(queue_id, 0);
3026 queue = tc_find_queue__(netdev, queue_id, hash);
3028 hcp = hfsc_class_cast__(queue);
3030 hcp = xmalloc(sizeof *hcp);
3031 queue = &hcp->tc_queue;
3032 queue->queue_id = queue_id;
3033 hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
3036 hcp->min_rate = hc->min_rate;
3037 hcp->max_rate = hc->max_rate;
3041 hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
3043 const struct tc_service_curve *rsc, *fsc, *usc;
3044 static const struct nl_policy tca_hfsc_policy[] = {
3046 .type = NL_A_UNSPEC,
3048 .min_len = sizeof(struct tc_service_curve),
3051 .type = NL_A_UNSPEC,
3053 .min_len = sizeof(struct tc_service_curve),
3056 .type = NL_A_UNSPEC,
3058 .min_len = sizeof(struct tc_service_curve),
3061 struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];
3063 if (!nl_parse_nested(nl_options, tca_hfsc_policy,
3064 attrs, ARRAY_SIZE(tca_hfsc_policy))) {
3065 VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
3069 rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
3070 fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
3071 usc = nl_attr_get(attrs[TCA_HFSC_USC]);
3073 if (rsc->m1 != 0 || rsc->d != 0 ||
3074 fsc->m1 != 0 || fsc->d != 0 ||
3075 usc->m1 != 0 || usc->d != 0) {
3076 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3077 "Non-linear service curves are not supported.");
3081 if (rsc->m2 != fsc->m2) {
3082 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3083 "Real-time service curves are not supported ");
3087 if (rsc->m2 > usc->m2) {
3088 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3089 "Min-rate service curve is greater than "
3090 "the max-rate service curve.");
3094 class->min_rate = fsc->m2;
3095 class->max_rate = usc->m2;
3100 hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
3101 struct hfsc_class *options,
3102 struct netdev_queue_stats *stats)
3105 unsigned int handle;
3106 struct nlattr *nl_options;
3108 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
3114 unsigned int major, minor;
3116 major = tc_get_major(handle);
3117 minor = tc_get_minor(handle);
3118 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3119 *queue_id = minor - 1;
3126 error = hfsc_parse_tca_options__(nl_options, options);
/* Queries the kernel for HFSC class 'handle' (with parent 'parent') on
 * 'netdev' and parses the reply into 'options' and 'stats', either of which
 * may be null.
 *
 * Returns 0 if successful, otherwise a positive errno value. */
static int
hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
                   unsigned int parent, struct hfsc_class *options,
                   struct netdev_queue_stats *stats)
{
    struct ofpbuf *reply;
    int error;

    error = tc_query_class(netdev, handle, parent, &reply);
    if (!error) {
        error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
        ofpbuf_delete(reply);
    }
    return error;
}
3151 hfsc_parse_qdisc_details__(struct netdev *netdev, const struct smap *details,
3152 struct hfsc_class *class)
3155 const char *max_rate_s;
3157 max_rate_s = smap_get(details, "max-rate");
3158 max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
3161 enum netdev_features current;
3163 netdev_get_features(netdev, ¤t, NULL, NULL, NULL);
3164 max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
3167 class->min_rate = max_rate;
3168 class->max_rate = max_rate;
3172 hfsc_parse_class_details__(struct netdev *netdev,
3173 const struct smap *details,
3174 struct hfsc_class * class)
3176 const struct hfsc *hfsc;
3177 uint32_t min_rate, max_rate;
3178 const char *min_rate_s, *max_rate_s;
3180 hfsc = hfsc_get__(netdev);
3181 min_rate_s = smap_get(details, "min-rate");
3182 max_rate_s = smap_get(details, "max-rate");
3184 min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
3185 min_rate = MAX(min_rate, 1);
3186 min_rate = MIN(min_rate, hfsc->max_rate);
3188 max_rate = (max_rate_s
3189 ? strtoull(max_rate_s, NULL, 10) / 8
3191 max_rate = MAX(max_rate, min_rate);
3192 max_rate = MIN(max_rate, hfsc->max_rate);
3194 class->min_rate = min_rate;
3195 class->max_rate = max_rate;
3200 /* Create an HFSC qdisc.
3202 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
3204 hfsc_setup_qdisc__(struct netdev * netdev)
3206 struct tcmsg *tcmsg;
3207 struct ofpbuf request;
3208 struct tc_hfsc_qopt opt;
3210 tc_del_qdisc(netdev);
3212 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3213 NLM_F_EXCL | NLM_F_CREATE, &request);
3219 tcmsg->tcm_handle = tc_make_handle(1, 0);
3220 tcmsg->tcm_parent = TC_H_ROOT;
3222 memset(&opt, 0, sizeof opt);
3225 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3226 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
3228 return tc_transact(&request, NULL);
3231 /* Create an HFSC class.
3233 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
3234 * sc rate <min_rate> ul rate <max_rate>" */
3236 hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
3237 unsigned int parent, struct hfsc_class *class)
3241 struct tcmsg *tcmsg;
3242 struct ofpbuf request;
3243 struct tc_service_curve min, max;
3245 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
3251 tcmsg->tcm_handle = handle;
3252 tcmsg->tcm_parent = parent;
3256 min.m2 = class->min_rate;
3260 max.m2 = class->max_rate;
3262 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3263 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3264 nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
3265 nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
3266 nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
3267 nl_msg_end_nested(&request, opt_offset);
3269 error = tc_transact(&request, NULL);
3271 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
3272 "min-rate %ubps, max-rate %ubps (%s)",
3273 netdev_get_name(netdev),
3274 tc_get_major(handle), tc_get_minor(handle),
3275 tc_get_major(parent), tc_get_minor(parent),
3276 class->min_rate, class->max_rate, strerror(error));
3283 hfsc_tc_install(struct netdev *netdev, const struct smap *details)
3286 struct hfsc_class class;
3288 error = hfsc_setup_qdisc__(netdev);
3294 hfsc_parse_qdisc_details__(netdev, details, &class);
3295 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3296 tc_make_handle(1, 0), &class);
3302 hfsc_install__(netdev, class.max_rate);
3307 hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3310 struct nl_dump dump;
3311 struct hfsc_class hc;
3314 hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
3315 hfsc_install__(netdev, hc.max_rate);
3317 if (!start_queue_dump(netdev, &dump)) {
3321 while (nl_dump_next(&dump, &msg)) {
3322 unsigned int queue_id;
3324 if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3325 hfsc_update_queue__(netdev, queue_id, &hc);
3329 nl_dump_done(&dump);
3334 hfsc_tc_destroy(struct tc *tc)
3337 struct hfsc_class *hc, *next;
3339 hfsc = CONTAINER_OF(tc, struct hfsc, tc);
3341 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
3342 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
3351 hfsc_qdisc_get(const struct netdev *netdev, struct smap *details)
3353 const struct hfsc *hfsc;
3354 hfsc = hfsc_get__(netdev);
3355 smap_add_format(details, "max-rate", "%llu", 8ULL * hfsc->max_rate);
3360 hfsc_qdisc_set(struct netdev *netdev, const struct smap *details)
3363 struct hfsc_class class;
3365 hfsc_parse_qdisc_details__(netdev, details, &class);
3366 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3367 tc_make_handle(1, 0), &class);
3370 hfsc_get__(netdev)->max_rate = class.max_rate;
3377 hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
3378 const struct tc_queue *queue, struct smap *details)
3380 const struct hfsc_class *hc;
3382 hc = hfsc_class_cast__(queue);
3383 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
3384 if (hc->min_rate != hc->max_rate) {
3385 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
3391 hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
3392 const struct smap *details)
3395 struct hfsc_class class;
3397 error = hfsc_parse_class_details__(netdev, details, &class);
3402 error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3403 tc_make_handle(1, 0xfffe), &class);
3408 hfsc_update_queue__(netdev, queue_id, &class);
3413 hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
3417 struct hfsc_class *hc;
3419 hc = hfsc_class_cast__(queue);
3420 hfsc = hfsc_get__(netdev);
3422 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3424 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
3431 hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3432 struct netdev_queue_stats *stats)
3434 return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3435 tc_make_handle(1, 0xfffe), NULL, stats);
3439 hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3440 const struct ofpbuf *nlmsg,
3441 netdev_dump_queue_stats_cb *cb, void *aux)
3443 struct netdev_queue_stats stats;
3444 unsigned int handle, major, minor;
3447 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3452 major = tc_get_major(handle);
3453 minor = tc_get_minor(handle);
3454 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3455 (*cb)(minor - 1, &stats, aux);
3460 static const struct tc_ops tc_ops_hfsc = {
3461 "hfsc", /* linux_name */
3462 "linux-hfsc", /* ovs_name */
3463 HFSC_N_QUEUES, /* n_queues */
3464 hfsc_tc_install, /* tc_install */
3465 hfsc_tc_load, /* tc_load */
3466 hfsc_tc_destroy, /* tc_destroy */
3467 hfsc_qdisc_get, /* qdisc_get */
3468 hfsc_qdisc_set, /* qdisc_set */
3469 hfsc_class_get, /* class_get */
3470 hfsc_class_set, /* class_set */
3471 hfsc_class_delete, /* class_delete */
3472 hfsc_class_get_stats, /* class_get_stats */
3473 hfsc_class_dump_stats /* class_dump_stats */
3476 /* "linux-default" traffic control class.
3478 * This class represents the default, unnamed Linux qdisc. It corresponds to
3479 * the "" (empty string) QoS type in the OVS database. */
3482 default_install__(struct netdev *netdev_)
3484 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3485 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_default);
3487 /* Nothing but a tc class implementation is allowed to write to a tc. This
3488 * class never does that, so we can legitimately use a const tc object. */
3489 netdev->tc = CONST_CAST(struct tc *, &tc);
3493 default_tc_install(struct netdev *netdev,
3494 const struct smap *details OVS_UNUSED)
3496 default_install__(netdev);
3501 default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3503 default_install__(netdev);
3507 static const struct tc_ops tc_ops_default = {
3508 NULL, /* linux_name */
3513 NULL, /* tc_destroy */
3514 NULL, /* qdisc_get */
3515 NULL, /* qdisc_set */
3516 NULL, /* class_get */
3517 NULL, /* class_set */
3518 NULL, /* class_delete */
3519 NULL, /* class_get_stats */
3520 NULL /* class_dump_stats */
3523 /* "linux-other" traffic control class.
3528 other_tc_load(struct netdev *netdev_, struct ofpbuf *nlmsg OVS_UNUSED)
3530 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3531 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_other);
3533 /* Nothing but a tc class implementation is allowed to write to a tc. This
3534 * class never does that, so we can legitimately use a const tc object. */
3535 netdev->tc = CONST_CAST(struct tc *, &tc);
3539 static const struct tc_ops tc_ops_other = {
3540 NULL, /* linux_name */
3541 "linux-other", /* ovs_name */
3543 NULL, /* tc_install */
3545 NULL, /* tc_destroy */
3546 NULL, /* qdisc_get */
3547 NULL, /* qdisc_set */
3548 NULL, /* class_get */
3549 NULL, /* class_set */
3550 NULL, /* class_delete */
3551 NULL, /* class_get_stats */
3552 NULL /* class_dump_stats */
3555 /* Traffic control. */
3557 /* Number of kernel "tc" ticks per second. */
3558 static double ticks_per_s;
3560 /* Number of kernel "jiffies" per second. This is used for the purpose of
3561 * computing buffer sizes. Generally kernel qdiscs need to be able to buffer
3562 * one jiffy's worth of data.
3564 * There are two possibilities here:
3566 * - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
3567 * approximate range of 100 to 1024. That means that we really need to
3568 * make sure that the qdisc can buffer that much data.
3570 * - 'buffer_hz' is an absurdly large number. That means that the kernel
3571 * has finely granular timers and there's no need to fudge additional room
3572 * for buffers. (There's no extra effort needed to implement that: the
3573 * large 'buffer_hz' is used as a divisor, so practically any number will
3574 * come out as 0 in the division. Small integer results in the case of
3575 * really high dividends won't have any real effect anyhow.)
3577 static unsigned int buffer_hz;
/* Returns tc handle 'major':'minor', with 'major' placed in the upper 16 bits
 * of the result. */
static unsigned int
tc_make_handle(unsigned int major, unsigned int minor)
{
    unsigned int shifted_major = major << 16;

    return TC_H_MAKE(shifted_major, minor);
}
/* Returns the major number from 'handle', shifted down out of the upper 16
 * bits. */
static unsigned int
tc_get_major(unsigned int handle)
{
    unsigned int major_bits = TC_H_MAJ(handle);

    return major_bits >> 16;
}
/* Returns the minor number from 'handle'. */
static unsigned int
tc_get_minor(unsigned int handle)
{
    unsigned int minor = TC_H_MIN(handle);

    return minor;
}
3600 static struct tcmsg *
3601 tc_make_request(const struct netdev *netdev, int type, unsigned int flags,
3602 struct ofpbuf *request)
3604 struct tcmsg *tcmsg;
3608 error = get_ifindex(netdev, &ifindex);
3613 ofpbuf_init(request, 512);
3614 nl_msg_put_nlmsghdr(request, sizeof *tcmsg, type, NLM_F_REQUEST | flags);
3615 tcmsg = ofpbuf_put_zeros(request, sizeof *tcmsg);
3616 tcmsg->tcm_family = AF_UNSPEC;
3617 tcmsg->tcm_ifindex = ifindex;
3618 /* Caller should fill in tcmsg->tcm_handle. */
3619 /* Caller should fill in tcmsg->tcm_parent. */
3625 tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
3627 int error = nl_sock_transact(rtnl_sock, request, replyp);
3628 ofpbuf_uninit(request);
3632 /* Adds or deletes a root ingress qdisc on 'netdev'. We use this for
3633 * policing configuration.
3635 * This function is equivalent to running the following when 'add' is true:
3636 * /sbin/tc qdisc add dev <devname> handle ffff: ingress
3638 * This function is equivalent to running the following when 'add' is false:
3639 * /sbin/tc qdisc del dev <devname> handle ffff: ingress
3641 * The configuration and stats may be seen with the following command:
3642 * /sbin/tc -s qdisc show dev <devname>
3644 * Returns 0 if successful, otherwise a positive errno value.
3647 tc_add_del_ingress_qdisc(struct netdev *netdev, bool add)
3649 struct ofpbuf request;
3650 struct tcmsg *tcmsg;
3652 int type = add ? RTM_NEWQDISC : RTM_DELQDISC;
3653 int flags = add ? NLM_F_EXCL | NLM_F_CREATE : 0;
3655 tcmsg = tc_make_request(netdev, type, flags, &request);
3659 tcmsg->tcm_handle = tc_make_handle(0xffff, 0);
3660 tcmsg->tcm_parent = TC_H_INGRESS;
3661 nl_msg_put_string(&request, TCA_KIND, "ingress");
3662 nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0);
3664 error = tc_transact(&request, NULL);
3666 /* If we're deleting the qdisc, don't worry about some of the
3667 * error conditions. */
3668 if (!add && (error == ENOENT || error == EINVAL)) {
3677 /* Adds a policer to 'netdev' with a rate of 'kbits_rate' and a burst size
3680 * This function is equivalent to running:
3681 * /sbin/tc filter add dev <devname> parent ffff: protocol all prio 49
3682 * basic police rate <kbits_rate>kbit burst <kbits_burst>k
3685 * The configuration and stats may be seen with the following command:
3686 * /sbin/tc -s filter show <devname> eth0 parent ffff:
3688 * Returns 0 if successful, otherwise a positive errno value.
3691 tc_add_policer(struct netdev *netdev, int kbits_rate, int kbits_burst)
3693 struct tc_police tc_police;
3694 struct ofpbuf request;
3695 struct tcmsg *tcmsg;
3696 size_t basic_offset;
3697 size_t police_offset;
3701 memset(&tc_police, 0, sizeof tc_police);
3702 tc_police.action = TC_POLICE_SHOT;
3703 tc_police.mtu = mtu;
3704 tc_fill_rate(&tc_police.rate, (kbits_rate * 1000)/8, mtu);
3705 tc_police.burst = tc_bytes_to_ticks(tc_police.rate.rate,
3706 kbits_burst * 1024);
3708 tcmsg = tc_make_request(netdev, RTM_NEWTFILTER,
3709 NLM_F_EXCL | NLM_F_CREATE, &request);
3713 tcmsg->tcm_parent = tc_make_handle(0xffff, 0);
3714 tcmsg->tcm_info = tc_make_handle(49,
3715 (OVS_FORCE uint16_t) htons(ETH_P_ALL));
3717 nl_msg_put_string(&request, TCA_KIND, "basic");
3718 basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3719 police_offset = nl_msg_start_nested(&request, TCA_BASIC_POLICE);
3720 nl_msg_put_unspec(&request, TCA_POLICE_TBF, &tc_police, sizeof tc_police);
3721 tc_put_rtab(&request, TCA_POLICE_RATE, &tc_police.rate);
3722 nl_msg_end_nested(&request, police_offset);
3723 nl_msg_end_nested(&request, basic_offset);
3725 error = tc_transact(&request, NULL);
3736 /* The values in psched are not individually very meaningful, but they are
3737 * important. The tables below show some values seen in the wild.
3741 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
3742 * (Before that, there are hints that it was 1000000000.)
3744 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
3748 * -----------------------------------
3749 * [1] 000c8000 000f4240 000f4240 00000064
3750 * [2] 000003e8 00000400 000f4240 3b9aca00
3751 * [3] 000003e8 00000400 000f4240 3b9aca00
3752 * [4] 000003e8 00000400 000f4240 00000064
3753 * [5] 000003e8 00000040 000f4240 3b9aca00
3754 * [6] 000003e8 00000040 000f4240 000000f9
3756 * a b c d ticks_per_s buffer_hz
3757 * ------- --------- ---------- ------------- ----------- -------------
3758 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
3759 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3760 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3761 * [4] 1,000 1,024 1,000,000 100 976,562 100
3762 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
3763 * [6] 1,000 64 1,000,000 249 15,625,000 249
3765 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
3766 * [2] 2.6.26-1-686-bigmem from Debian lenny
3767 * [3] 2.6.26-2-sparc64 from Debian lenny
3768 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
3769 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
3770 * [6] 2.6.34 from kernel.org on KVM
3772 static const char fn[] = "/proc/net/psched";
3773 unsigned int a, b, c, d;
3779 stream = fopen(fn, "r");
3781 VLOG_WARN("%s: open failed: %s", fn, strerror(errno));
3785 if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
3786 VLOG_WARN("%s: read failed", fn);
3790 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
3794 VLOG_WARN("%s: invalid scheduler parameters", fn);
3798 ticks_per_s = (double) a * c / b;
3802 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
3805 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);
3808 /* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
3809 * rate of 'rate' bytes per second. */
3811 tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
3816 return (rate * ticks) / ticks_per_s;
3819 /* Returns the number of ticks that it would take to transmit 'size' bytes at a
3820 * rate of 'rate' bytes per second. */
3822 tc_bytes_to_ticks(unsigned int rate, unsigned int size)
3827 return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
3830 /* Returns the number of bytes that need to be reserved for qdisc buffering at
3831 * a transmission rate of 'rate' bytes per second. */
3833 tc_buffer_per_jiffy(unsigned int rate)
3838 return rate / buffer_hz;
3841 /* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
3842 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
3843 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
3844 * stores NULL into it if it is absent.
3846 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
3849 * Returns 0 if successful, otherwise a positive errno value. */
3851 tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
3852 struct nlattr **options)
3854 static const struct nl_policy tca_policy[] = {
3855 [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
3856 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
3858 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3860 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3861 tca_policy, ta, ARRAY_SIZE(ta))) {
3862 VLOG_WARN_RL(&rl, "failed to parse qdisc message");
3867 *kind = nl_attr_get_string(ta[TCA_KIND]);
3871 *options = ta[TCA_OPTIONS];
3886 /* Given Netlink 'msg' that describes a class, extracts the queue ID (e.g. the
3887 * minor number of its class ID) into '*queue_id', its TCA_OPTIONS attribute
3888 * into '*options', and its queue statistics into '*stats'. Any of the output
3889 * arguments may be null.
3891 * Returns 0 if successful, otherwise a positive errno value. */
3893 tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
3894 struct nlattr **options, struct netdev_queue_stats *stats)
3896 static const struct nl_policy tca_policy[] = {
3897 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
3898 [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
3900 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3902 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3903 tca_policy, ta, ARRAY_SIZE(ta))) {
3904 VLOG_WARN_RL(&rl, "failed to parse class message");
3909 struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
3910 *handlep = tc->tcm_handle;
3914 *options = ta[TCA_OPTIONS];
3918 const struct gnet_stats_queue *gsq;
3919 struct gnet_stats_basic gsb;
3921 static const struct nl_policy stats_policy[] = {
3922 [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
3923 .min_len = sizeof gsb },
3924 [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
3925 .min_len = sizeof *gsq },
3927 struct nlattr *sa[ARRAY_SIZE(stats_policy)];
3929 if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
3930 sa, ARRAY_SIZE(sa))) {
3931 VLOG_WARN_RL(&rl, "failed to parse class stats");
3935 /* Alignment issues screw up the length of struct gnet_stats_basic on
3936 * some arch/bitsize combinations. Newer versions of Linux have a
3937 * struct gnet_stats_basic_packed, but we can't depend on that. The
3938 * easiest thing to do is just to make a copy. */
3939 memset(&gsb, 0, sizeof gsb);
3940 memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
3941 MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
3942 stats->tx_bytes = gsb.bytes;
3943 stats->tx_packets = gsb.packets;
3945 gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
3946 stats->tx_errors = gsq->drops;
3956 memset(stats, 0, sizeof *stats);
3961 /* Queries the kernel for class with identifier 'handle' and parent 'parent'
3964 tc_query_class(const struct netdev *netdev,
3965 unsigned int handle, unsigned int parent,
3966 struct ofpbuf **replyp)
3968 struct ofpbuf request;
3969 struct tcmsg *tcmsg;
3972 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO, &request);
3976 tcmsg->tcm_handle = handle;
3977 tcmsg->tcm_parent = parent;
3979 error = tc_transact(&request, replyp);
3981 VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
3982 netdev_get_name(netdev),
3983 tc_get_major(handle), tc_get_minor(handle),
3984 tc_get_major(parent), tc_get_minor(parent),
3990 /* Equivalent to "tc class del dev <name> handle <handle>". */
3992 tc_delete_class(const struct netdev *netdev, unsigned int handle)
3994 struct ofpbuf request;
3995 struct tcmsg *tcmsg;
3998 tcmsg = tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
4002 tcmsg->tcm_handle = handle;
4003 tcmsg->tcm_parent = 0;
4005 error = tc_transact(&request, NULL);
4007 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
4008 netdev_get_name(netdev),
4009 tc_get_major(handle), tc_get_minor(handle),
4015 /* Equivalent to "tc qdisc del dev <name> root". */
4017 tc_del_qdisc(struct netdev *netdev_)
4019 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4020 struct ofpbuf request;
4021 struct tcmsg *tcmsg;
4024 tcmsg = tc_make_request(netdev_, RTM_DELQDISC, 0, &request);
4028 tcmsg->tcm_handle = tc_make_handle(1, 0);
4029 tcmsg->tcm_parent = TC_H_ROOT;
4031 error = tc_transact(&request, NULL);
4032 if (error == EINVAL) {
4033 /* EINVAL probably means that the default qdisc was in use, in which
4034 * case we've accomplished our purpose. */
4037 if (!error && netdev->tc) {
4038 if (netdev->tc->ops->tc_destroy) {
4039 netdev->tc->ops->tc_destroy(netdev->tc);
4046 /* If 'netdev''s qdisc type and parameters are not yet known, queries the
4047 * kernel to determine what they are. Returns 0 if successful, otherwise a
4048 * positive errno value. */
4050 tc_query_qdisc(const struct netdev *netdev_)
4052 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4053 struct ofpbuf request, *qdisc;
4054 const struct tc_ops *ops;
4055 struct tcmsg *tcmsg;
4063 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
4064 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
4065 * 2.6.35 without that fix backported to it.
4067 * To avoid the OOPS, we must not make a request that would attempt to dump
4068 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
4069 * few others. There are a few ways that I can see to do this, but most of
4070 * them seem to be racy (and if you lose the race the kernel OOPSes). The
4071 * technique chosen here is to assume that any non-default qdisc that we
4072 * create will have a class with handle 1:0. The built-in qdiscs only have
4073 * a class with handle 0:0.
4075 * We could check for Linux 2.6.35+ and use a more straightforward method
4077 tcmsg = tc_make_request(netdev_, RTM_GETQDISC, NLM_F_ECHO, &request);
4081 tcmsg->tcm_handle = tc_make_handle(1, 0);
4082 tcmsg->tcm_parent = 0;
4084 /* Figure out what tc class to instantiate. */
4085 error = tc_transact(&request, &qdisc);
4089 error = tc_parse_qdisc(qdisc, &kind, NULL);
4091 ops = &tc_ops_other;
4093 ops = tc_lookup_linux_name(kind);
4095 static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
4096 VLOG_INFO_RL(&rl2, "unknown qdisc \"%s\"", kind);
4098 ops = &tc_ops_other;
4101 } else if (error == ENOENT) {
4102 /* Either it's a built-in qdisc, or it's a qdisc set up by some
4103 * other entity that doesn't have a handle 1:0. We will assume
4104 * that it's the system default qdisc. */
4105 ops = &tc_ops_default;
4108 /* Who knows? Maybe the device got deleted. */
4109 VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
4110 netdev_get_name(netdev_), strerror(error));
4111 ops = &tc_ops_other;
4114 /* Instantiate it. */
4115 load_error = ops->tc_load(CONST_CAST(struct netdev *, netdev_), qdisc);
4116 ovs_assert((load_error == 0) == (netdev->tc != NULL));
4117 ofpbuf_delete(qdisc);
4119 return error ? error : load_error;
4122 /* Linux traffic control uses tables with 256 entries ("rtab" tables) to
4123 approximate the time to transmit packets of various lengths. For an MTU of
4124 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
4125 represents two possible packet lengths; for a MTU of 513 through 1024, four
4126 possible lengths; and so on.
4128 Returns, for the specified 'mtu', the number of bits that packet lengths
4129 need to be shifted right to fit within such a 256-entry table. */
4131 tc_calc_cell_log(unsigned int mtu)
4136 mtu = ETH_PAYLOAD_MAX;
4138 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
4140 for (cell_log = 0; mtu >= 256; cell_log++) {
4147 /* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
4150 tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
4152 memset(rate, 0, sizeof *rate);
4153 rate->cell_log = tc_calc_cell_log(mtu);
4154 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
4155 /* rate->cell_align = 0; */ /* distro headers. */
4156 rate->mpu = ETH_TOTAL_MIN;
4160 /* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
4161 * attribute of the specified "type".
4163 * See tc_calc_cell_log() above for a description of "rtab"s. */
4165 tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
4170 rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
4171 for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
4172 unsigned packet_size = (i + 1) << rate->cell_log;
4173 if (packet_size < rate->mpu) {
4174 packet_size = rate->mpu;
4176 rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
/* Calculates the proper value of 'buffer' or 'cbuffer' in HTB options given a
 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
 * burst size of 'burst_bytes'.  (If no value was requested, a 'burst_bytes' of
 * 0 is fine.)
 *
 * The burst actually used is never smaller than tc_buffer_per_jiffy(Bps) plus
 * 'mtu'. */
static int
tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
{
    unsigned int min_burst = tc_buffer_per_jiffy(Bps) + mtu;
    return tc_bytes_to_ticks(Bps, MAX(burst_bytes, min_burst));
}
/* Linux-only functions declared in netdev-linux.h. */
4193 /* Returns a fd for an AF_INET socket or a negative errno value. */
4195 netdev_linux_get_af_inet_sock(void)
4197 int error = netdev_linux_init();
4198 return error ? -error : af_inet_sock;
4201 /* Modifies the 'flag' bit in ethtool's flags field for 'netdev'. If
4202 * 'enable' is true, the bit is set. Otherwise, it is cleared. */
4204 netdev_linux_ethtool_set_flag(struct netdev *netdev, uint32_t flag,
4205 const char *flag_name, bool enable)
4207 const char *netdev_name = netdev_get_name(netdev);
4208 struct ethtool_value evalue;
4212 COVERAGE_INC(netdev_get_ethtool);
4213 memset(&evalue, 0, sizeof evalue);
4214 error = netdev_linux_do_ethtool(netdev_name,
4215 (struct ethtool_cmd *)&evalue,
4216 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
4221 COVERAGE_INC(netdev_set_ethtool);
4222 evalue.data = new_flags = (evalue.data & ~flag) | (enable ? flag : 0);
4223 error = netdev_linux_do_ethtool(netdev_name,
4224 (struct ethtool_cmd *)&evalue,
4225 ETHTOOL_SFLAGS, "ETHTOOL_SFLAGS");
4230 COVERAGE_INC(netdev_get_ethtool);
4231 memset(&evalue, 0, sizeof evalue);
4232 error = netdev_linux_do_ethtool(netdev_name,
4233 (struct ethtool_cmd *)&evalue,
4234 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
4239 if (new_flags != evalue.data) {
4240 VLOG_WARN_RL(&rl, "attempt to %s ethtool %s flag on network "
4241 "device %s failed", enable ? "enable" : "disable",
4242 flag_name, netdev_name);
/* Utility functions. */
4251 /* Copies 'src' into 'dst', performing format conversion in the process. */
4253 netdev_stats_from_rtnl_link_stats(struct netdev_stats *dst,
4254 const struct rtnl_link_stats *src)
4256 dst->rx_packets = src->rx_packets;
4257 dst->tx_packets = src->tx_packets;
4258 dst->rx_bytes = src->rx_bytes;
4259 dst->tx_bytes = src->tx_bytes;
4260 dst->rx_errors = src->rx_errors;
4261 dst->tx_errors = src->tx_errors;
4262 dst->rx_dropped = src->rx_dropped;
4263 dst->tx_dropped = src->tx_dropped;
4264 dst->multicast = src->multicast;
4265 dst->collisions = src->collisions;
4266 dst->rx_length_errors = src->rx_length_errors;
4267 dst->rx_over_errors = src->rx_over_errors;
4268 dst->rx_crc_errors = src->rx_crc_errors;
4269 dst->rx_frame_errors = src->rx_frame_errors;
4270 dst->rx_fifo_errors = src->rx_fifo_errors;
4271 dst->rx_missed_errors = src->rx_missed_errors;
4272 dst->tx_aborted_errors = src->tx_aborted_errors;
4273 dst->tx_carrier_errors = src->tx_carrier_errors;
4274 dst->tx_fifo_errors = src->tx_fifo_errors;
4275 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
4276 dst->tx_window_errors = src->tx_window_errors;
4280 get_stats_via_netlink(int ifindex, struct netdev_stats *stats)
4282 /* Policy for RTNLGRP_LINK messages.
4284 * There are *many* more fields in these messages, but currently we only
4285 * care about these fields. */
4286 static const struct nl_policy rtnlgrp_link_policy[] = {
4287 [IFLA_IFNAME] = { .type = NL_A_STRING, .optional = false },
4288 [IFLA_STATS] = { .type = NL_A_UNSPEC, .optional = true,
4289 .min_len = sizeof(struct rtnl_link_stats) },
4292 struct ofpbuf request;
4293 struct ofpbuf *reply;
4294 struct ifinfomsg *ifi;
4295 struct nlattr *attrs[ARRAY_SIZE(rtnlgrp_link_policy)];
4298 ofpbuf_init(&request, 0);
4299 nl_msg_put_nlmsghdr(&request, sizeof *ifi, RTM_GETLINK, NLM_F_REQUEST);
4300 ifi = ofpbuf_put_zeros(&request, sizeof *ifi);
4301 ifi->ifi_family = PF_UNSPEC;
4302 ifi->ifi_index = ifindex;
4303 error = nl_sock_transact(rtnl_sock, &request, &reply);
4304 ofpbuf_uninit(&request);
4309 if (!nl_policy_parse(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg),
4310 rtnlgrp_link_policy,
4311 attrs, ARRAY_SIZE(rtnlgrp_link_policy))) {
4312 ofpbuf_delete(reply);
4316 if (!attrs[IFLA_STATS]) {
4317 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
4318 ofpbuf_delete(reply);
4322 netdev_stats_from_rtnl_link_stats(stats, nl_attr_get(attrs[IFLA_STATS]));
4324 ofpbuf_delete(reply);
4330 get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats)
4332 static const char fn[] = "/proc/net/dev";
4337 stream = fopen(fn, "r");
4339 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
4344 while (fgets(line, sizeof line, stream)) {
4347 #define X64 "%"SCNu64
4350 X64 X64 X64 X64 X64 X64 X64 "%*u"
4351 X64 X64 X64 X64 X64 X64 X64 "%*u",
4357 &stats->rx_fifo_errors,
4358 &stats->rx_frame_errors,
4364 &stats->tx_fifo_errors,
4366 &stats->tx_carrier_errors) != 15) {
4367 VLOG_WARN_RL(&rl, "%s:%d: parse error", fn, ln);
4368 } else if (!strcmp(devname, netdev_name)) {
4369 stats->rx_length_errors = UINT64_MAX;
4370 stats->rx_over_errors = UINT64_MAX;
4371 stats->rx_crc_errors = UINT64_MAX;
4372 stats->rx_missed_errors = UINT64_MAX;
4373 stats->tx_aborted_errors = UINT64_MAX;
4374 stats->tx_heartbeat_errors = UINT64_MAX;
4375 stats->tx_window_errors = UINT64_MAX;
4381 VLOG_WARN_RL(&rl, "%s: no stats for %s", fn, netdev_name);
4387 get_flags(const struct netdev *dev, unsigned int *flags)
4393 error = netdev_linux_do_ioctl(dev->name, &ifr, SIOCGIFFLAGS,
4396 *flags = ifr.ifr_flags;
4402 set_flags(const char *name, unsigned int flags)
4406 ifr.ifr_flags = flags;
4407 return netdev_linux_do_ioctl(name, &ifr, SIOCSIFFLAGS, "SIOCSIFFLAGS");
4411 do_get_ifindex(const char *netdev_name)
4415 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4416 COVERAGE_INC(netdev_get_ifindex);
4417 if (ioctl(af_inet_sock, SIOCGIFINDEX, &ifr) < 0) {
4418 VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
4419 netdev_name, strerror(errno));
4422 return ifr.ifr_ifindex;
4426 get_ifindex(const struct netdev *netdev_, int *ifindexp)
4428 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4430 if (!(netdev->cache_valid & VALID_IFINDEX)) {
4431 int ifindex = do_get_ifindex(netdev_get_name(netdev_));
4434 netdev->get_ifindex_error = -ifindex;
4435 netdev->ifindex = 0;
4437 netdev->get_ifindex_error = 0;
4438 netdev->ifindex = ifindex;
4440 netdev->cache_valid |= VALID_IFINDEX;
4443 *ifindexp = netdev->ifindex;
4444 return netdev->get_ifindex_error;
4448 get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN])
4453 memset(&ifr, 0, sizeof ifr);
4454 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4455 COVERAGE_INC(netdev_get_hwaddr);
4456 if (ioctl(af_inet_sock, SIOCGIFHWADDR, &ifr) < 0) {
4457 /* ENODEV probably means that a vif disappeared asynchronously and
4458 * hasn't been removed from the database yet, so reduce the log level
4459 * to INFO for that case. */
4460 VLOG(errno == ENODEV ? VLL_INFO : VLL_ERR,
4461 "ioctl(SIOCGIFHWADDR) on %s device failed: %s",
4462 netdev_name, strerror(errno));
4465 hwaddr_family = ifr.ifr_hwaddr.sa_family;
4466 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER) {
4467 VLOG_WARN("%s device has unknown hardware address family %d",
4468 netdev_name, hwaddr_family);
4470 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
4475 set_etheraddr(const char *netdev_name,
4476 const uint8_t mac[ETH_ADDR_LEN])
4480 memset(&ifr, 0, sizeof ifr);
4481 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4482 ifr.ifr_hwaddr.sa_family = ARPHRD_ETHER;
4483 memcpy(ifr.ifr_hwaddr.sa_data, mac, ETH_ADDR_LEN);
4484 COVERAGE_INC(netdev_set_hwaddr);
4485 if (ioctl(af_inet_sock, SIOCSIFHWADDR, &ifr) < 0) {
4486 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
4487 netdev_name, strerror(errno));
4494 netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
4495 int cmd, const char *cmd_name)
4499 memset(&ifr, 0, sizeof ifr);
4500 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
4501 ifr.ifr_data = (caddr_t) ecmd;
4504 if (ioctl(af_inet_sock, SIOCETHTOOL, &ifr) == 0) {
4507 if (errno != EOPNOTSUPP) {
4508 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
4509 "failed: %s", cmd_name, name, strerror(errno));
4511 /* The device doesn't support this operation. That's pretty
4512 * common, so there's no point in logging anything. */
4519 netdev_linux_do_ioctl(const char *name, struct ifreq *ifr, int cmd,
4520 const char *cmd_name)
4522 ovs_strzcpy(ifr->ifr_name, name, sizeof ifr->ifr_name);
4523 if (ioctl(af_inet_sock, cmd, ifr) == -1) {
4524 VLOG_DBG_RL(&rl, "%s: ioctl(%s) failed: %s", name, cmd_name,
4532 netdev_linux_get_ipv4(const struct netdev *netdev, struct in_addr *ip,
4533 int cmd, const char *cmd_name)
4538 ifr.ifr_addr.sa_family = AF_INET;
4539 error = netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, cmd, cmd_name);
4541 const struct sockaddr_in *sin = (struct sockaddr_in *) &ifr.ifr_addr;
4542 *ip = sin->sin_addr;
/* Returns an AF_PACKET raw socket or a negative errno value.
 *
 * The socket is created and made nonblocking on the first call and kept in a
 * function-local static.  A failure is remembered the same way, so later
 * calls return the same negative value without trying again. */
static int
af_packet_sock(void)
{
    /* INT_MIN means that no attempt has been made yet. */
    static int sock = INT_MIN;

    if (sock == INT_MIN) {
        sock = socket(AF_PACKET, SOCK_RAW, 0);
        if (sock >= 0) {
            int error = set_nonblocking(sock);
            if (error) {
                close(sock);
                sock = -error;
            }
        } else {
            sock = -errno;
            VLOG_ERR("failed to create packet socket: %s", strerror(errno));
        }
    }

    return sock;
}