2 * Copyright (c) 2009, 2010, 2011, 2012 Nicira Networks.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
19 #include "netdev-linux.h"
24 #include <arpa/inet.h>
26 #include <linux/gen_stats.h>
27 #include <linux/if_ether.h>
28 #include <linux/if_tun.h>
30 #include <linux/types.h>
31 #include <linux/ethtool.h>
32 #include <linux/mii.h>
33 #include <linux/pkt_cls.h>
34 #include <linux/pkt_sched.h>
35 #include <linux/rtnetlink.h>
36 #include <linux/sockios.h>
37 #include <linux/version.h>
38 #include <sys/types.h>
39 #include <sys/ioctl.h>
40 #include <sys/socket.h>
41 #include <netpacket/packet.h>
43 #include <net/if_arp.h>
44 #include <net/if_packet.h>
45 #include <net/route.h>
46 #include <netinet/in.h>
53 #include "dpif-linux.h"
54 #include "dynamic-string.h"
55 #include "fatal-signal.h"
58 #include "netdev-provider.h"
59 #include "netdev-vport.h"
61 #include "netlink-notifier.h"
62 #include "netlink-socket.h"
64 #include "openflow/openflow.h"
66 #include "poll-loop.h"
67 #include "rtnetlink-link.h"
68 #include "socket-util.h"
74 VLOG_DEFINE_THIS_MODULE(netdev_linux);
76 COVERAGE_DEFINE(netdev_set_policing);
77 COVERAGE_DEFINE(netdev_arp_lookup);
78 COVERAGE_DEFINE(netdev_get_ifindex);
79 COVERAGE_DEFINE(netdev_get_hwaddr);
80 COVERAGE_DEFINE(netdev_set_hwaddr);
81 COVERAGE_DEFINE(netdev_ethtool);
83 /* These were introduced in Linux 2.6.14, so they might be missing if we have
85 #ifndef ADVERTISED_Pause
86 #define ADVERTISED_Pause (1 << 13)
88 #ifndef ADVERTISED_Asym_Pause
89 #define ADVERTISED_Asym_Pause (1 << 14)
92 /* These were introduced in Linux 2.6.24, so they might be missing if we
93 * have old headers. */
94 #ifndef ETHTOOL_GFLAGS
95 #define ETHTOOL_GFLAGS 0x00000025 /* Get flags bitmap(ethtool_value) */
97 #ifndef ETHTOOL_SFLAGS
98 #define ETHTOOL_SFLAGS 0x00000026 /* Set flags bitmap(ethtool_value) */
101 /* This was introduced in Linux 2.6.25, so it might be missing if we have old
104 #define TC_RTAB_SIZE 1024
107 static struct nln_notifier *netdev_linux_cache_notifier = NULL;
108 static int cache_notifier_refcount;
111 VALID_IFINDEX = 1 << 0,
112 VALID_ETHERADDR = 1 << 1,
116 VALID_POLICING = 1 << 5,
117 VALID_VPORT_STAT_ERROR = 1 << 6
125 /* Traffic control. */
127 /* An instance of a traffic control class. Always associated with a particular
130 * Each TC implementation subclasses this with whatever additional data it
133 const struct tc_ops *ops;
134 struct hmap queues; /* Contains "struct tc_queue"s.
135 * Read by generic TC layer.
136 * Written only by TC implementation. */
139 /* One traffic control queue.
141 * Each TC implementation subclasses this with whatever additional data it
144 struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
145 unsigned int queue_id; /* OpenFlow queue ID. */
148 /* A particular kind of traffic control. Each implementation generally maps to
149 * one particular Linux qdisc class.
151 * The functions below return 0 if successful or a positive errno value on
152 * failure, except where otherwise noted. All of them must be provided, except
153 * where otherwise noted. */
155 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
156 * This is null for tc_ops_default and tc_ops_other, for which there are no
157 * appropriate values. */
158 const char *linux_name;
160 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
161 const char *ovs_name;
163 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
164 * queues. The queues are numbered 0 through n_queues - 1. */
165 unsigned int n_queues;
167 /* Called to install this TC class on 'netdev'. The implementation should
168 * make the Netlink calls required to set up 'netdev' with the right qdisc
169 * and configure it according to 'details'. The implementation may assume
170 * that the current qdisc is the default; that is, there is no need for it
171 * to delete the current qdisc before installing itself.
173 * The contents of 'details' should be documented as valid for 'ovs_name'
174 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
175 * (which is built as ovs-vswitchd.conf.db(8)).
177 * This function must return 0 if and only if it sets 'netdev->tc' to an
178 * initialized 'struct tc'.
180 * (This function is null for tc_ops_other, which cannot be installed. For
181 * other TC classes it should always be nonnull.) */
182 int (*tc_install)(struct netdev *netdev, const struct shash *details);
184 /* Called when the netdev code determines (through a Netlink query) that
185 * this TC class's qdisc is installed on 'netdev', but we didn't install
186 * it ourselves and so don't know any of the details.
188 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
189 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
190 * implementation should parse the other attributes of 'nlmsg' as
191 * necessary to determine its configuration. If necessary it should also
192 * use Netlink queries to determine the configuration of queues on
195 * This function must return 0 if and only if it sets 'netdev->tc' to an
196 * initialized 'struct tc'. */
197 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
199 /* Destroys the data structures allocated by the implementation as part of
200 * 'tc'. (This includes destroying 'tc->queues' by calling
203 * The implementation should not need to perform any Netlink calls. If
204 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
205 * (But it may not be desirable.)
207 * This function may be null if 'tc' is trivial. */
208 void (*tc_destroy)(struct tc *tc);
210 /* Retrieves details of 'netdev->tc' configuration into 'details'.
212 * The implementation should not need to perform any Netlink calls, because
213 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
214 * cached the configuration.
216 * The contents of 'details' should be documented as valid for 'ovs_name'
217 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
218 * (which is built as ovs-vswitchd.conf.db(8)).
220 * This function may be null if 'tc' is not configurable.
222 int (*qdisc_get)(const struct netdev *netdev, struct shash *details);
224 /* Reconfigures 'netdev->tc' according to 'details', performing any
225 * required Netlink calls to complete the reconfiguration.
227 * The contents of 'details' should be documented as valid for 'ovs_name'
228 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
229 * (which is built as ovs-vswitchd.conf.db(8)).
231 * This function may be null if 'tc' is not configurable.
233 int (*qdisc_set)(struct netdev *, const struct shash *details);
235 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
236 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
238 * The contents of 'details' should be documented as valid for 'ovs_name'
239 * in the "other_config" column in the "Queue" table in
240 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
242 * The implementation should not need to perform any Netlink calls, because
243 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
244 * cached the queue configuration.
246 * This function may be null if 'tc' does not have queues ('n_queues' is
248 int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
249 struct shash *details);
251 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
252 * 'details', performing any required Netlink calls to complete the
253 * reconfiguration. The caller ensures that 'queue_id' is less than
256 * The contents of 'details' should be documented as valid for 'ovs_name'
257 * in the "other_config" column in the "Queue" table in
258 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
260 * This function may be null if 'tc' does not have queues or its queues are
261 * not configurable. */
262 int (*class_set)(struct netdev *, unsigned int queue_id,
263 const struct shash *details);
265 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
266 * tc_queue's within 'netdev->tc->queues'.
268 * This function may be null if 'tc' does not have queues or its queues
269 * cannot be deleted. */
270 int (*class_delete)(struct netdev *, struct tc_queue *queue);
272 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
273 * 'struct tc_queue's within 'netdev->tc->queues'.
275 * On success, initializes '*stats'.
277 * This function may be null if 'tc' does not have queues or if it cannot
278 * report queue statistics. */
279 int (*class_get_stats)(const struct netdev *netdev,
280 const struct tc_queue *queue,
281 struct netdev_queue_stats *stats);
283 /* Extracts queue stats from 'nlmsg', which is a response to a
284 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
286 * This function may be null if 'tc' does not have queues or if it cannot
287 * report queue statistics. */
288 int (*class_dump_stats)(const struct netdev *netdev,
289 const struct ofpbuf *nlmsg,
290 netdev_dump_queue_stats_cb *cb, void *aux);
294 tc_init(struct tc *tc, const struct tc_ops *ops)
297 hmap_init(&tc->queues);
301 tc_destroy(struct tc *tc)
303 hmap_destroy(&tc->queues);
306 static const struct tc_ops tc_ops_htb;
307 static const struct tc_ops tc_ops_hfsc;
308 static const struct tc_ops tc_ops_default;
309 static const struct tc_ops tc_ops_other;
311 static const struct tc_ops *tcs[] = {
312 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
313 &tc_ops_hfsc, /* Hierarchical fair service curve. */
314 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
315 &tc_ops_other, /* Some other qdisc. */
319 static unsigned int tc_make_handle(unsigned int major, unsigned int minor);
320 static unsigned int tc_get_major(unsigned int handle);
321 static unsigned int tc_get_minor(unsigned int handle);
323 static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
324 static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
325 static unsigned int tc_buffer_per_jiffy(unsigned int rate);
327 static struct tcmsg *tc_make_request(const struct netdev *, int type,
328 unsigned int flags, struct ofpbuf *);
329 static int tc_transact(struct ofpbuf *request, struct ofpbuf **replyp);
330 static int tc_add_del_ingress_qdisc(struct netdev *netdev, bool add);
331 static int tc_add_policer(struct netdev *netdev, int kbits_rate,
334 static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
335 struct nlattr **options);
336 static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
337 struct nlattr **options,
338 struct netdev_queue_stats *);
339 static int tc_query_class(const struct netdev *,
340 unsigned int handle, unsigned int parent,
341 struct ofpbuf **replyp);
342 static int tc_delete_class(const struct netdev *, unsigned int handle);
344 static int tc_del_qdisc(struct netdev *netdev);
345 static int tc_query_qdisc(const struct netdev *netdev);
347 static int tc_calc_cell_log(unsigned int mtu);
348 static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
349 static void tc_put_rtab(struct ofpbuf *, uint16_t type,
350 const struct tc_ratespec *rate);
351 static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
353 struct netdev_dev_linux {
354 struct netdev_dev netdev_dev;
356 struct shash_node *shash_node;
357 unsigned int cache_valid;
358 unsigned int change_seq;
360 bool miimon; /* Link status of last poll. */
361 long long int miimon_interval; /* Miimon Poll rate. Disabled if <= 0. */
362 struct timer miimon_timer;
364 /* The following are figured out "on demand" only. They are only valid
365 * when the corresponding VALID_* bit in 'cache_valid' is set. */
367 uint8_t etheraddr[ETH_ADDR_LEN];
368 struct in_addr address, netmask;
371 unsigned int ifi_flags;
372 long long int carrier_resets;
373 uint32_t kbits_rate; /* Policing data. */
374 uint32_t kbits_burst;
375 int vport_stats_error; /* Cached error code from vport_get_stats().
376 0 or an errno value. */
380 struct tap_state tap;
384 struct netdev_linux {
385 struct netdev netdev;
389 /* Sockets used for ioctl operations. */
390 static int af_inet_sock = -1; /* AF_INET, SOCK_DGRAM. */
392 /* A Netlink routing socket that is not subscribed to any multicast groups. */
393 static struct nl_sock *rtnl_sock;
395 /* This is set pretty low because we probably won't learn anything from the
396 * additional log messages. */
397 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
399 static int netdev_linux_init(void);
401 static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
402 int cmd, const char *cmd_name);
403 static int netdev_linux_do_ioctl(const char *name, struct ifreq *, int cmd,
404 const char *cmd_name);
405 static int netdev_linux_get_ipv4(const struct netdev *, struct in_addr *,
406 int cmd, const char *cmd_name);
407 static int get_flags(const struct netdev_dev *, unsigned int *flags);
408 static int set_flags(struct netdev *, unsigned int flags);
409 static int do_get_ifindex(const char *netdev_name);
410 static int get_ifindex(const struct netdev *, int *ifindexp);
411 static int do_set_addr(struct netdev *netdev,
412 int ioctl_nr, const char *ioctl_name,
413 struct in_addr addr);
414 static int get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN]);
415 static int set_etheraddr(const char *netdev_name, int hwaddr_family,
416 const uint8_t[ETH_ADDR_LEN]);
417 static int get_stats_via_netlink(int ifindex, struct netdev_stats *stats);
418 static int get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats);
419 static int af_packet_sock(void);
420 static void netdev_linux_miimon_run(void);
421 static void netdev_linux_miimon_wait(void);
424 is_netdev_linux_class(const struct netdev_class *netdev_class)
426 return netdev_class->init == netdev_linux_init;
429 static struct netdev_dev_linux *
430 netdev_dev_linux_cast(const struct netdev_dev *netdev_dev)
432 const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
433 assert(is_netdev_linux_class(netdev_class));
435 return CONTAINER_OF(netdev_dev, struct netdev_dev_linux, netdev_dev);
438 static struct netdev_linux *
439 netdev_linux_cast(const struct netdev *netdev)
441 struct netdev_dev *netdev_dev = netdev_get_dev(netdev);
442 const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
443 assert(is_netdev_linux_class(netdev_class));
445 return CONTAINER_OF(netdev, struct netdev_linux, netdev);
449 netdev_linux_init(void)
451 static int status = -1;
453 /* Create AF_INET socket. */
454 af_inet_sock = socket(AF_INET, SOCK_DGRAM, 0);
455 status = af_inet_sock >= 0 ? 0 : errno;
457 VLOG_ERR("failed to create inet socket: %s", strerror(status));
460 /* Create rtnetlink socket. */
462 status = nl_sock_create(NETLINK_ROUTE, &rtnl_sock);
464 VLOG_ERR_RL(&rl, "failed to create rtnetlink socket: %s",
473 netdev_linux_run(void)
475 rtnetlink_link_run();
476 netdev_linux_miimon_run();
480 netdev_linux_wait(void)
482 rtnetlink_link_wait();
483 netdev_linux_miimon_wait();
487 netdev_dev_linux_changed(struct netdev_dev_linux *dev, unsigned int ifi_flags)
490 if (!dev->change_seq) {
494 if ((dev->ifi_flags ^ ifi_flags) & IFF_RUNNING) {
495 dev->carrier_resets++;
497 dev->ifi_flags = ifi_flags;
499 dev->cache_valid = 0;
503 netdev_linux_cache_cb(const struct rtnetlink_link_change *change,
504 void *aux OVS_UNUSED)
506 struct netdev_dev_linux *dev;
508 struct netdev_dev *base_dev = netdev_dev_from_name(change->ifname);
510 const struct netdev_class *netdev_class =
511 netdev_dev_get_class(base_dev);
513 if (is_netdev_linux_class(netdev_class)) {
514 dev = netdev_dev_linux_cast(base_dev);
515 netdev_dev_linux_changed(dev, change->ifi_flags);
519 struct shash device_shash;
520 struct shash_node *node;
522 shash_init(&device_shash);
523 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
524 SHASH_FOR_EACH (node, &device_shash) {
529 get_flags(&dev->netdev_dev, &flags);
530 netdev_dev_linux_changed(dev, flags);
532 shash_destroy(&device_shash);
537 cache_notifier_ref(void)
539 if (!cache_notifier_refcount) {
540 assert(!netdev_linux_cache_notifier);
542 netdev_linux_cache_notifier =
543 rtnetlink_link_notifier_create(netdev_linux_cache_cb, NULL);
545 if (!netdev_linux_cache_notifier) {
549 cache_notifier_refcount++;
555 cache_notifier_unref(void)
557 assert(cache_notifier_refcount > 0);
558 if (!--cache_notifier_refcount) {
559 assert(netdev_linux_cache_notifier);
560 rtnetlink_link_notifier_destroy(netdev_linux_cache_notifier);
561 netdev_linux_cache_notifier = NULL;
565 /* Creates system and internal devices. */
567 netdev_linux_create(const struct netdev_class *class, const char *name,
568 struct netdev_dev **netdev_devp)
570 struct netdev_dev_linux *netdev_dev;
573 error = cache_notifier_ref();
578 netdev_dev = xzalloc(sizeof *netdev_dev);
579 netdev_dev->change_seq = 1;
580 netdev_dev_init(&netdev_dev->netdev_dev, name, class);
581 get_flags(&netdev_dev->netdev_dev, &netdev_dev->ifi_flags);
583 *netdev_devp = &netdev_dev->netdev_dev;
587 /* For most types of netdevs we open the device for each call of
588 * netdev_open(). However, this is not the case with tap devices,
589 * since it is only possible to open the device once. In this
590 * situation we share a single file descriptor, and consequently
591 * buffers, across all readers. Therefore once data is read it will
592 * be unavailable to other reads for tap devices. */
594 netdev_linux_create_tap(const struct netdev_class *class OVS_UNUSED,
595 const char *name, struct netdev_dev **netdev_devp)
597 struct netdev_dev_linux *netdev_dev;
598 struct tap_state *state;
599 static const char tap_dev[] = "/dev/net/tun";
603 netdev_dev = xzalloc(sizeof *netdev_dev);
604 state = &netdev_dev->state.tap;
606 error = cache_notifier_ref();
611 /* Open tap device. */
612 state->fd = open(tap_dev, O_RDWR);
615 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, strerror(error));
616 goto error_unref_notifier;
619 /* Create tap device. */
620 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
621 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
622 if (ioctl(state->fd, TUNSETIFF, &ifr) == -1) {
623 VLOG_WARN("%s: creating tap device failed: %s", name,
626 goto error_unref_notifier;
629 /* Make non-blocking. */
630 error = set_nonblocking(state->fd);
632 goto error_unref_notifier;
635 netdev_dev_init(&netdev_dev->netdev_dev, name, &netdev_tap_class);
636 *netdev_devp = &netdev_dev->netdev_dev;
639 error_unref_notifier:
640 cache_notifier_unref();
647 destroy_tap(struct netdev_dev_linux *netdev_dev)
649 struct tap_state *state = &netdev_dev->state.tap;
651 if (state->fd >= 0) {
656 /* Destroys the netdev device 'netdev_dev_'. */
658 netdev_linux_destroy(struct netdev_dev *netdev_dev_)
660 struct netdev_dev_linux *netdev_dev = netdev_dev_linux_cast(netdev_dev_);
661 const struct netdev_class *class = netdev_dev_get_class(netdev_dev_);
663 if (netdev_dev->tc && netdev_dev->tc->ops->tc_destroy) {
664 netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
667 if (class == &netdev_tap_class) {
668 destroy_tap(netdev_dev);
672 cache_notifier_unref();
676 netdev_linux_open(struct netdev_dev *netdev_dev_, struct netdev **netdevp)
678 struct netdev_dev_linux *netdev_dev = netdev_dev_linux_cast(netdev_dev_);
679 struct netdev_linux *netdev;
680 enum netdev_flags flags;
683 /* Allocate network device. */
684 netdev = xzalloc(sizeof *netdev);
686 netdev_init(&netdev->netdev, netdev_dev_);
688 /* Verify that the device really exists, by attempting to read its flags.
689 * (The flags might be cached, in which case this won't actually do an
692 * Don't do this for "internal" netdevs, though, because those have to be
693 * created as netdev objects before they exist in the kernel, because
694 * creating them in the kernel happens by passing a netdev object to
695 * dpif_port_add(). */
696 if (netdev_dev_get_class(netdev_dev_) != &netdev_internal_class) {
697 error = netdev_get_flags(&netdev->netdev, &flags);
698 if (error == ENODEV) {
703 if (!strcmp(netdev_dev_get_type(netdev_dev_), "tap") &&
704 !netdev_dev->state.tap.opened) {
706 /* We assume that the first user of the tap device is the primary user
707 * and give them the tap FD. Subsequent users probably just expect
708 * this to be a system device so open it normally to avoid send/receive
709 * directions appearing to be reversed. */
710 netdev->fd = netdev_dev->state.tap.fd;
711 netdev_dev->state.tap.opened = true;
714 *netdevp = &netdev->netdev;
718 netdev_uninit(&netdev->netdev, true);
722 /* Closes and destroys 'netdev'. */
724 netdev_linux_close(struct netdev *netdev_)
726 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
/* 0 is a valid file descriptor; the rest of this file treats only negative
 * values as "no fd", so use the same test here.  Tap devices are excluded
 * from this check. */
728 if (netdev->fd >= 0 && strcmp(netdev_get_type(netdev_), "tap")) {
735 netdev_linux_listen(struct netdev *netdev_)
737 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
738 struct sockaddr_ll sll;
743 if (netdev->fd >= 0) {
747 /* Create file descriptor. */
748 fd = socket(PF_PACKET, SOCK_RAW, 0);
751 VLOG_ERR("failed to create raw socket (%s)", strerror(error));
755 /* Set non-blocking mode. */
756 error = set_nonblocking(fd);
761 /* Get ethernet device index. */
762 error = get_ifindex(&netdev->netdev, &ifindex);
767 /* Bind to specific ethernet device. */
768 memset(&sll, 0, sizeof sll);
769 sll.sll_family = AF_PACKET;
770 sll.sll_ifindex = ifindex;
771 sll.sll_protocol = (OVS_FORCE unsigned short int) htons(ETH_P_ALL);
772 if (bind(fd, (struct sockaddr *) &sll, sizeof sll) < 0) {
774 VLOG_ERR("%s: failed to bind raw socket (%s)",
775 netdev_get_name(netdev_), strerror(error));
/* Receives one packet from 'netdev_' into 'data', which has room for 'size'
 * bytes.  Tap devices are read with read(); all other devices use recv() with
 * MSG_TRUNC, so a packet longer than 'size' is reported as -EMSGSIZE instead
 * of being returned truncated.  EINTR is not treated as an error, and EAGAIN
 * is not logged. */
790 netdev_linux_recv(struct netdev *netdev_, void *data, size_t size)
792 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
794 if (netdev->fd < 0) {
795 /* Device is not listening. */
802 retval = (netdev_->netdev_dev->netdev_class == &netdev_tap_class
803 ? read(netdev->fd, data, size)
804 : recv(netdev->fd, data, size, MSG_TRUNC));
806 return retval <= size ? retval : -EMSGSIZE;
807 } else if (errno != EINTR) {
808 if (errno != EAGAIN) {
809 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
810 netdev_get_name(netdev_), strerror(errno));
817 /* Registers with the poll loop to wake up from the next call to poll_block()
818 * when a packet is ready to be received with netdev_recv() on 'netdev'. */
820 netdev_linux_recv_wait(struct netdev *netdev_)
822 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
823 if (netdev->fd >= 0) {
824 poll_fd_wait(netdev->fd, POLLIN);
828 /* Discards all packets waiting to be received from 'netdev'. */
830 netdev_linux_drain(struct netdev *netdev_)
832 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
833 if (netdev->fd < 0) {
835 } else if (!strcmp(netdev_get_type(netdev_), "tap")) {
837 int error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
838 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
842 drain_fd(netdev->fd, ifr.ifr_qlen);
845 return drain_rcvbuf(netdev->fd);
849 /* Sends 'buffer' on 'netdev'. Returns 0 if successful, otherwise a positive
850 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
851 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
852 * the packet is too big or too small to transmit on the device.
854 * The caller retains ownership of 'buffer' in all cases.
856 * The kernel maintains a packet transmission queue, so the caller is not
857 * expected to do additional queuing of packets. */
859 netdev_linux_send(struct netdev *netdev_, const void *data, size_t size)
861 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
865 if (netdev->fd < 0) {
866 /* Use our AF_PACKET socket to send to this device. */
867 struct sockaddr_ll sll;
874 sock = af_packet_sock();
879 error = get_ifindex(netdev_, &ifindex);
884 /* We don't bother setting most fields in sockaddr_ll because the
885 * kernel ignores them for SOCK_RAW. */
886 memset(&sll, 0, sizeof sll);
887 sll.sll_family = AF_PACKET;
888 sll.sll_ifindex = ifindex;
890 iov.iov_base = (void *) data;
894 msg.msg_namelen = sizeof sll;
897 msg.msg_control = NULL;
898 msg.msg_controllen = 0;
901 retval = sendmsg(sock, &msg, 0);
903 /* Use the netdev's own fd to send to this device. This is
904 * essential for tap devices, because packets sent to a tap device
905 * with an AF_PACKET socket will loop back to be *received* again
906 * on the tap device. */
907 retval = write(netdev->fd, data, size);
911 /* The Linux AF_PACKET implementation never blocks waiting for room
912 * for packets, instead returning ENOBUFS. Translate this into
913 * EAGAIN for the caller. */
914 if (errno == ENOBUFS) {
916 } else if (errno == EINTR) {
918 } else if (errno != EAGAIN) {
919 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
920 netdev_get_name(netdev_), strerror(errno));
923 } else if (retval != size) {
924 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%zd bytes of "
925 "%zu) on %s", retval, size, netdev_get_name(netdev_));
933 /* Registers with the poll loop to wake up from the next call to poll_block()
934 * when the packet transmission queue has sufficient room to transmit a packet
935 * with netdev_send().
937 * The kernel maintains a packet transmission queue, so the client is not
938 * expected to do additional queuing of packets. Thus, this function is
939 * unlikely to ever be used. It is included for completeness. */
941 netdev_linux_send_wait(struct netdev *netdev_)
943 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
944 if (netdev->fd < 0) {
946 } else if (strcmp(netdev_get_type(netdev_), "tap")) {
947 poll_fd_wait(netdev->fd, POLLOUT);
949 /* TAP device always accepts packets. */
950 poll_immediate_wake();
954 /* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
955 * otherwise a positive errno value. */
957 netdev_linux_set_etheraddr(struct netdev *netdev_,
958 const uint8_t mac[ETH_ADDR_LEN])
960 struct netdev_dev_linux *netdev_dev =
961 netdev_dev_linux_cast(netdev_get_dev(netdev_));
964 if (!(netdev_dev->cache_valid & VALID_ETHERADDR)
965 || !eth_addr_equals(netdev_dev->etheraddr, mac)) {
966 error = set_etheraddr(netdev_get_name(netdev_), ARPHRD_ETHER, mac);
968 netdev_dev->cache_valid |= VALID_ETHERADDR;
969 memcpy(netdev_dev->etheraddr, mac, ETH_ADDR_LEN);
977 /* Copies 'netdev''s MAC address into the caller-provided buffer 'mac'. The
978 * address is cached in the device once it has been looked up. */
980 netdev_linux_get_etheraddr(const struct netdev *netdev_,
981 uint8_t mac[ETH_ADDR_LEN])
983 struct netdev_dev_linux *netdev_dev =
984 netdev_dev_linux_cast(netdev_get_dev(netdev_));
985 if (!(netdev_dev->cache_valid & VALID_ETHERADDR)) {
986 int error = get_etheraddr(netdev_get_name(netdev_),
987 netdev_dev->etheraddr);
991 netdev_dev->cache_valid |= VALID_ETHERADDR;
993 memcpy(mac, netdev_dev->etheraddr, ETH_ADDR_LEN);
997 /* Returns the maximum size of transmitted (and received) packets on 'netdev',
998 * in bytes, not including the hardware header; thus, this is typically 1500
999 * bytes for Ethernet devices. */
1001 netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
1003 struct netdev_dev_linux *netdev_dev =
1004 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1005 if (!(netdev_dev->cache_valid & VALID_MTU)) {
1009 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
1010 SIOCGIFMTU, "SIOCGIFMTU");
1014 netdev_dev->mtu = ifr.ifr_mtu;
1015 netdev_dev->cache_valid |= VALID_MTU;
1017 *mtup = netdev_dev->mtu;
1021 /* Sets the maximum transmission unit (MTU) of the given device using the Linux
1022 * networking ioctl interface.
1025 netdev_linux_set_mtu(const struct netdev *netdev_, int mtu)
1027 struct netdev_dev_linux *netdev_dev =
1028 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1032 if (netdev_dev->cache_valid & VALID_MTU &&
1033 netdev_dev->mtu == mtu) {
1037 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
1038 SIOCSIFMTU, "SIOCSIFMTU");
1043 netdev_dev->mtu = ifr.ifr_mtu;
1044 netdev_dev->cache_valid |= VALID_MTU;
1048 /* Returns the ifindex of 'netdev', if successful, as a positive number.
1049 * On failure, returns a negative errno value. */
1051 netdev_linux_get_ifindex(const struct netdev *netdev)
1055 error = get_ifindex(netdev, &ifindex);
1056 return error ? -error : ifindex;
1060 netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
1062 struct netdev_dev_linux *netdev_dev =
1063 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1065 if (netdev_dev->miimon_interval > 0) {
1066 *carrier = netdev_dev->miimon;
1068 *carrier = (netdev_dev->ifi_flags & IFF_RUNNING) != 0;
1074 static long long int
1075 netdev_linux_get_carrier_resets(const struct netdev *netdev)
1077 return netdev_dev_linux_cast(netdev_get_dev(netdev))->carrier_resets;
1081 netdev_linux_do_miimon(const char *name, int cmd, const char *cmd_name,
1082 struct mii_ioctl_data *data)
1087 memset(&ifr, 0, sizeof ifr);
1088 memcpy(&ifr.ifr_data, data, sizeof *data);
1089 error = netdev_linux_do_ioctl(name, &ifr, cmd, cmd_name);
1090 memcpy(data, &ifr.ifr_data, sizeof *data);
1096 netdev_linux_get_miimon(const char *name, bool *miimon)
1098 struct mii_ioctl_data data;
1103 memset(&data, 0, sizeof data);
1104 error = netdev_linux_do_miimon(name, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
1106 /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
1107 data.reg_num = MII_BMSR;
1108 error = netdev_linux_do_miimon(name, SIOCGMIIREG, "SIOCGMIIREG",
1112 *miimon = !!(data.val_out & BMSR_LSTATUS);
1114 VLOG_WARN_RL(&rl, "%s: failed to query MII", name);
1117 struct ethtool_cmd ecmd;
1119 VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
1122 memset(&ecmd, 0, sizeof ecmd);
1123 error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
1126 struct ethtool_value eval;
1128 memcpy(&eval, &ecmd, sizeof eval);
1129 *miimon = !!eval.data;
1131 VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
1139 netdev_linux_set_miimon_interval(struct netdev *netdev_,
1140 long long int interval)
1142 struct netdev_dev_linux *netdev_dev;
1144 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev_));
1146 interval = interval > 0 ? MAX(interval, 100) : 0;
1147 if (netdev_dev->miimon_interval != interval) {
1148 netdev_dev->miimon_interval = interval;
1149 timer_set_expired(&netdev_dev->miimon_timer);
1156 netdev_linux_miimon_run(void)
1158 struct shash device_shash;
1159 struct shash_node *node;
1161 shash_init(&device_shash);
1162 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
1163 SHASH_FOR_EACH (node, &device_shash) {
1164 struct netdev_dev_linux *dev = node->data;
1167 if (dev->miimon_interval <= 0 || !timer_expired(&dev->miimon_timer)) {
1171 netdev_linux_get_miimon(dev->netdev_dev.name, &miimon);
1172 if (miimon != dev->miimon) {
1173 dev->miimon = miimon;
1174 netdev_dev_linux_changed(dev, dev->ifi_flags);
1177 timer_set_duration(&dev->miimon_timer, dev->miimon_interval);
1180 shash_destroy(&device_shash);
1184 netdev_linux_miimon_wait(void)
1186 struct shash device_shash;
1187 struct shash_node *node;
1189 shash_init(&device_shash);
1190 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
1191 SHASH_FOR_EACH (node, &device_shash) {
1192 struct netdev_dev_linux *dev = node->data;
1194 if (dev->miimon_interval > 0) {
1195 timer_wait(&dev->miimon_timer);
1198 shash_destroy(&device_shash);
1201 /* Check whether we can use RTM_GETLINK to get network device statistics.
1202 * In pre-2.6.19 kernels, this was only available if wireless extensions were
/* Probes the stats path by resolving the ifindex of "lo" and, when that
 * works, calling get_stats_via_netlink() on it.  Each outcome is logged:
 * a warning when the ifindex lookup fails, a debug message when rtnetlink is
 * used, and an info message carrying strerror() of the failure when the code
 * falls back to proc.  netdev_linux_sys_get_stats() stores the result and
 * tests it as a boolean. */
1205 check_for_working_netlink_stats(void)
1207 /* Decide on the netdev_get_stats() implementation to use. Netlink is
1208 * preferable, so if that works, we'll use it. */
1209 int ifindex = do_get_ifindex("lo");
1211 VLOG_WARN("failed to get ifindex for lo, "
1212 "obtaining netdev stats from proc");
1215 struct netdev_stats stats;
1216 int error = get_stats_via_netlink(ifindex, &stats);
1218 VLOG_DBG("obtaining netdev stats via rtnetlink");
1221 VLOG_INFO("RTM_GETLINK failed (%s), obtaining netdev stats "
1222 "via proc (you are probably running a pre-2.6.19 "
1223 "kernel)", strerror(error));
/* Helper taking pointers to two uint64_t values.  netdev_tap_get_stats()
 * applies it to matching rx/tx counter pairs.
 * NOTE(review): presumably exchanges '*a' and '*b' -- confirm against the
 * body. */
1230 swap_uint64(uint64_t *a, uint64_t *b)
/* Fetches stats for 'netdev_' from the vport layer into 'stats'.
 *
 * netdev_vport_get_stats() is called when no error is recorded in
 * 'vport_stats_error' or when VALID_VPORT_STAT_ERROR is not set in
 * 'cache_valid'; in other words a cached failure suppresses further attempts.
 * The result of the call is stored in 'vport_stats_error' and flagged as
 * cached.  A rate-limited warning names the device and the error text. */
1238 get_stats_via_vport(const struct netdev *netdev_,
1239 struct netdev_stats *stats)
1241 struct netdev_dev_linux *netdev_dev =
1242 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1244 if (!netdev_dev->vport_stats_error ||
1245 !(netdev_dev->cache_valid & VALID_VPORT_STAT_ERROR)) {
1248 error = netdev_vport_get_stats(netdev_, stats);
1250 VLOG_WARN_RL(&rl, "%s: obtaining netdev stats via vport failed "
1251 "(%s)", netdev_get_name(netdev_), strerror(error));
1253 netdev_dev->vport_stats_error = error;
1254 netdev_dev->cache_valid |= VALID_VPORT_STAT_ERROR;
/* Retrieves stats for 'netdev_' from the kernel into 'stats'.
 *
 * The first call probes check_for_working_netlink_stats() and keeps the
 * answer in a function-level static.  With netlink available the device's
 * ifindex is resolved and get_stats_via_netlink() is used; otherwise
 * get_stats_via_proc() is called with the device name.  A rate-limited
 * warning reports the numeric error. */
1259 netdev_linux_sys_get_stats(const struct netdev *netdev_,
1260 struct netdev_stats *stats)
1262 static int use_netlink_stats = -1;
1265 if (use_netlink_stats < 0) {
1266 use_netlink_stats = check_for_working_netlink_stats();
1269 if (use_netlink_stats) {
1272 error = get_ifindex(netdev_, &ifindex);
1274 error = get_stats_via_netlink(ifindex, stats);
1277 error = get_stats_via_proc(netdev_get_name(netdev_), stats);
1281 VLOG_WARN_RL(&rl, "%s: linux-sys get stats failed %d",
1282 netdev_get_name(netdev_), error);
1288 /* Retrieves current device stats for 'netdev-linux'. */
/* Two sources are consulted: the vport layer fills 'stats' and the kernel
 * fills the local 'dev_stats'.  'vport_stats_error' selects how they are
 * combined.  The accumulation shown adds the kernel's error, drop, multicast
 * and collision counters onto 'stats' and leaves the packet and byte counters
 * of 'stats' as they are. */
1290 netdev_linux_get_stats(const struct netdev *netdev_,
1291 struct netdev_stats *stats)
1293 struct netdev_dev_linux *netdev_dev =
1294 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1295 struct netdev_stats dev_stats;
1298 get_stats_via_vport(netdev_, stats);
1300 error = netdev_linux_sys_get_stats(netdev_, &dev_stats);
1303 if (netdev_dev->vport_stats_error) {
1310 if (netdev_dev->vport_stats_error) {
1311 /* stats not available from OVS then use ioctl stats. */
1314 stats->rx_errors += dev_stats.rx_errors;
1315 stats->tx_errors += dev_stats.tx_errors;
1316 stats->rx_dropped += dev_stats.rx_dropped;
1317 stats->tx_dropped += dev_stats.tx_dropped;
1318 stats->multicast += dev_stats.multicast;
1319 stats->collisions += dev_stats.collisions;
1320 stats->rx_length_errors += dev_stats.rx_length_errors;
1321 stats->rx_over_errors += dev_stats.rx_over_errors;
1322 stats->rx_crc_errors += dev_stats.rx_crc_errors;
1323 stats->rx_frame_errors += dev_stats.rx_frame_errors;
1324 stats->rx_fifo_errors += dev_stats.rx_fifo_errors;
1325 stats->rx_missed_errors += dev_stats.rx_missed_errors;
1326 stats->tx_aborted_errors += dev_stats.tx_aborted_errors;
1327 stats->tx_carrier_errors += dev_stats.tx_carrier_errors;
1328 stats->tx_fifo_errors += dev_stats.tx_fifo_errors;
1329 stats->tx_heartbeat_errors += dev_stats.tx_heartbeat_errors;
1330 stats->tx_window_errors += dev_stats.tx_window_errors;
1335 /* Retrieves current device stats for 'netdev-tap' netdev or
1336 * netdev-internal. */
/* The vport layer fills 'stats' and the kernel fills the local 'dev_stats'.
 * Two treatments are visible below, selected by 'vport_stats_error':
 * one swaps the rx/tx packet, byte, error and drop counters in 'stats' and
 * zeroes the detailed error counters; the other adds the kernel's drop and
 * error counters crosswise (kernel tx onto rx and kernel rx onto tx) and adds
 * multicast and collisions directly. */
1338 netdev_tap_get_stats(const struct netdev *netdev_,
1339 struct netdev_stats *stats)
1341 struct netdev_dev_linux *netdev_dev =
1342 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1343 struct netdev_stats dev_stats;
1346 get_stats_via_vport(netdev_, stats);
1348 error = netdev_linux_sys_get_stats(netdev_, &dev_stats);
1350 if (netdev_dev->vport_stats_error) {
1357 /* If this port is an internal port then the transmit and receive stats
1358 * will appear to be swapped relative to the other ports since we are the
1359 * one sending the data, not a remote computer. For consistency, we swap
1360 * them back here. This does not apply if we are getting stats from the
1361 * vport layer because it always tracks stats from the perspective of the
1363 if (netdev_dev->vport_stats_error) {
1365 swap_uint64(&stats->rx_packets, &stats->tx_packets);
1366 swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
1367 swap_uint64(&stats->rx_errors, &stats->tx_errors);
1368 swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
1369 stats->rx_length_errors = 0;
1370 stats->rx_over_errors = 0;
1371 stats->rx_crc_errors = 0;
1372 stats->rx_frame_errors = 0;
1373 stats->rx_fifo_errors = 0;
1374 stats->rx_missed_errors = 0;
1375 stats->tx_aborted_errors = 0;
1376 stats->tx_carrier_errors = 0;
1377 stats->tx_fifo_errors = 0;
1378 stats->tx_heartbeat_errors = 0;
1379 stats->tx_window_errors = 0;
1381 stats->rx_dropped += dev_stats.tx_dropped;
1382 stats->tx_dropped += dev_stats.rx_dropped;
1384 stats->rx_errors += dev_stats.tx_errors;
1385 stats->tx_errors += dev_stats.rx_errors;
1387 stats->multicast += dev_stats.multicast;
1388 stats->collisions += dev_stats.collisions;
/* Retrieves stats for an internal device purely from the vport layer: calls
 * get_stats_via_vport() to fill 'stats' and returns the resulting
 * 'vport_stats_error' (0 on success). */
1394 netdev_internal_get_stats(const struct netdev *netdev_,
1395 struct netdev_stats *stats)
1397 struct netdev_dev_linux *netdev_dev =
1398 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1400 get_stats_via_vport(netdev_, stats);
1401 return netdev_dev->vport_stats_error;
1404 /* Stores the features supported by 'netdev' into each of '*current',
1405 * '*advertised', '*supported', and '*peer' that are non-null. Each value is a
1406 * bitmap of NETDEV_* bits. Returns 0 if successful, otherwise a positive errno value. */
/* Implementation: issues ETHTOOL_GSET for the device, then translates the
 * SUPPORTED_* bits of 'ecmd.supported' and the ADVERTISED_* bits of
 * 'ecmd.advertising' into NETDEV_F_* bits.  '*current' is derived from the
 * link speed and duplex, then extended with the port medium and autoneg.
 * '*peer' is always set to 0.
 * NOTE(review): the statements shown write through all four output pointers
 * without a null check -- confirm that callers never pass NULL here. */
1409 netdev_linux_get_features(const struct netdev *netdev,
1410 enum netdev_features *current,
1411 enum netdev_features *advertised,
1412 enum netdev_features *supported,
1413 enum netdev_features *peer)
1415 struct ethtool_cmd ecmd;
1419 memset(&ecmd, 0, sizeof ecmd);
1420 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1421 ETHTOOL_GSET, "ETHTOOL_GSET");
1426 /* Supported features. */
1428 if (ecmd.supported & SUPPORTED_10baseT_Half) {
1429 *supported |= NETDEV_F_10MB_HD;
1431 if (ecmd.supported & SUPPORTED_10baseT_Full) {
1432 *supported |= NETDEV_F_10MB_FD;
1434 if (ecmd.supported & SUPPORTED_100baseT_Half) {
1435 *supported |= NETDEV_F_100MB_HD;
1437 if (ecmd.supported & SUPPORTED_100baseT_Full) {
1438 *supported |= NETDEV_F_100MB_FD;
1440 if (ecmd.supported & SUPPORTED_1000baseT_Half) {
1441 *supported |= NETDEV_F_1GB_HD;
1443 if (ecmd.supported & SUPPORTED_1000baseT_Full) {
1444 *supported |= NETDEV_F_1GB_FD;
1446 if (ecmd.supported & SUPPORTED_10000baseT_Full) {
1447 *supported |= NETDEV_F_10GB_FD;
1449 if (ecmd.supported & SUPPORTED_TP) {
1450 *supported |= NETDEV_F_COPPER;
1452 if (ecmd.supported & SUPPORTED_FIBRE) {
1453 *supported |= NETDEV_F_FIBER;
1455 if (ecmd.supported & SUPPORTED_Autoneg) {
1456 *supported |= NETDEV_F_AUTONEG;
1458 if (ecmd.supported & SUPPORTED_Pause) {
1459 *supported |= NETDEV_F_PAUSE;
1461 if (ecmd.supported & SUPPORTED_Asym_Pause) {
1462 *supported |= NETDEV_F_PAUSE_ASYM;
1465 /* Advertised features. */
1467 if (ecmd.advertising & ADVERTISED_10baseT_Half) {
1468 *advertised |= NETDEV_F_10MB_HD;
1470 if (ecmd.advertising & ADVERTISED_10baseT_Full) {
1471 *advertised |= NETDEV_F_10MB_FD;
1473 if (ecmd.advertising & ADVERTISED_100baseT_Half) {
1474 *advertised |= NETDEV_F_100MB_HD;
1476 if (ecmd.advertising & ADVERTISED_100baseT_Full) {
1477 *advertised |= NETDEV_F_100MB_FD;
1479 if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
1480 *advertised |= NETDEV_F_1GB_HD;
1482 if (ecmd.advertising & ADVERTISED_1000baseT_Full) {
1483 *advertised |= NETDEV_F_1GB_FD;
1485 if (ecmd.advertising & ADVERTISED_10000baseT_Full) {
1486 *advertised |= NETDEV_F_10GB_FD;
1488 if (ecmd.advertising & ADVERTISED_TP) {
1489 *advertised |= NETDEV_F_COPPER;
1491 if (ecmd.advertising & ADVERTISED_FIBRE) {
1492 *advertised |= NETDEV_F_FIBER;
1494 if (ecmd.advertising & ADVERTISED_Autoneg) {
1495 *advertised |= NETDEV_F_AUTONEG;
1497 if (ecmd.advertising & ADVERTISED_Pause) {
1498 *advertised |= NETDEV_F_PAUSE;
1500 if (ecmd.advertising & ADVERTISED_Asym_Pause) {
1501 *advertised |= NETDEV_F_PAUSE_ASYM;
1504 /* Current settings. */
1506 if (speed == SPEED_10) {
1507 *current = ecmd.duplex ? NETDEV_F_10MB_FD : NETDEV_F_10MB_HD;
1508 } else if (speed == SPEED_100) {
1509 *current = ecmd.duplex ? NETDEV_F_100MB_FD : NETDEV_F_100MB_HD;
1510 } else if (speed == SPEED_1000) {
1511 *current = ecmd.duplex ? NETDEV_F_1GB_FD : NETDEV_F_1GB_HD;
1512 } else if (speed == SPEED_10000) {
1513 *current = NETDEV_F_10GB_FD;
1514 } else if (speed == 40000) {
1515 *current = NETDEV_F_40GB_FD;
1516 } else if (speed == 100000) {
1517 *current = NETDEV_F_100GB_FD;
1518 } else if (speed == 1000000) {
1519 *current = NETDEV_F_1TB_FD;
1524 if (ecmd.port == PORT_TP) {
1525 *current |= NETDEV_F_COPPER;
1526 } else if (ecmd.port == PORT_FIBRE) {
1527 *current |= NETDEV_F_FIBER;
1531 *current |= NETDEV_F_AUTONEG;
1534 /* Peer advertisements. */
1535 *peer = 0; /* XXX */
1540 /* Set the features advertised by 'netdev' to 'advertise'. */
/* The current ethtool settings are read with ETHTOOL_GSET, the advertising
 * mask is rebuilt from scratch out of the NETDEV_F_* bits in 'advertise', and
 * the structure is written back with ETHTOOL_SSET, whose result is
 * returned. */
1542 netdev_linux_set_advertisements(struct netdev *netdev,
1543 enum netdev_features advertise)
1545 struct ethtool_cmd ecmd;
1548 memset(&ecmd, 0, sizeof ecmd);
1549 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1550 ETHTOOL_GSET, "ETHTOOL_GSET");
1555 ecmd.advertising = 0;
1556 if (advertise & NETDEV_F_10MB_HD) {
1557 ecmd.advertising |= ADVERTISED_10baseT_Half;
1559 if (advertise & NETDEV_F_10MB_FD) {
1560 ecmd.advertising |= ADVERTISED_10baseT_Full;
1562 if (advertise & NETDEV_F_100MB_HD) {
1563 ecmd.advertising |= ADVERTISED_100baseT_Half;
1565 if (advertise & NETDEV_F_100MB_FD) {
1566 ecmd.advertising |= ADVERTISED_100baseT_Full;
1568 if (advertise & NETDEV_F_1GB_HD) {
1569 ecmd.advertising |= ADVERTISED_1000baseT_Half;
1571 if (advertise & NETDEV_F_1GB_FD) {
1572 ecmd.advertising |= ADVERTISED_1000baseT_Full;
1574 if (advertise & NETDEV_F_10GB_FD) {
1575 ecmd.advertising |= ADVERTISED_10000baseT_Full;
1577 if (advertise & NETDEV_F_COPPER) {
1578 ecmd.advertising |= ADVERTISED_TP;
1580 if (advertise & NETDEV_F_FIBER) {
1581 ecmd.advertising |= ADVERTISED_FIBRE;
1583 if (advertise & NETDEV_F_AUTONEG) {
1584 ecmd.advertising |= ADVERTISED_Autoneg;
1586 if (advertise & NETDEV_F_PAUSE) {
1587 ecmd.advertising |= ADVERTISED_Pause;
1589 if (advertise & NETDEV_F_PAUSE_ASYM) {
1590 ecmd.advertising |= ADVERTISED_Asym_Pause;
1592 return netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1593 ETHTOOL_SSET, "ETHTOOL_SSET");
1596 /* Attempts to set input rate limiting (policing) policy. Returns 0 if
1597 * successful, otherwise a positive errno value. */
/* 'kbits_burst' is normalized first: 0 when no rate is given, 1000 when a
 * rate is given without a burst, otherwise the caller's value.  If the
 * normalized pair equals the values cached under VALID_POLICING, the kernel
 * is assumed to be up to date already.  Otherwise the ingress qdisc is
 * removed, and the code shown re-adds it and attaches a policer via
 * tc_add_policer().  The new values are then cached and VALID_POLICING is
 * set.  Each failing step logs a rate-limited warning. */
1599 netdev_linux_set_policing(struct netdev *netdev,
1600 uint32_t kbits_rate, uint32_t kbits_burst)
1602 struct netdev_dev_linux *netdev_dev =
1603 netdev_dev_linux_cast(netdev_get_dev(netdev));
1604 const char *netdev_name = netdev_get_name(netdev);
1608 kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
1609 : !kbits_burst ? 1000 /* Default to 1000 kbits if 0. */
1610 : kbits_burst); /* Stick with user-specified value. */
1612 if (netdev_dev->cache_valid & VALID_POLICING
1613 && netdev_dev->kbits_rate == kbits_rate
1614 && netdev_dev->kbits_burst == kbits_burst) {
1615 /* Assume that settings haven't changed since we last set them. */
1619 COVERAGE_INC(netdev_set_policing);
1620 /* Remove any existing ingress qdisc. */
1621 error = tc_add_del_ingress_qdisc(netdev, false);
1623 VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
1624 netdev_name, strerror(error));
1629 error = tc_add_del_ingress_qdisc(netdev, true);
1631 VLOG_WARN_RL(&rl, "%s: adding policing qdisc failed: %s",
1632 netdev_name, strerror(error));
1636 error = tc_add_policer(netdev, kbits_rate, kbits_burst);
1638 VLOG_WARN_RL(&rl, "%s: adding policing action failed: %s",
1639 netdev_name, strerror(error));
1644 netdev_dev->kbits_rate = kbits_rate;
1645 netdev_dev->kbits_burst = kbits_burst;
1646 netdev_dev->cache_valid |= VALID_POLICING;
/* Adds to 'types' the OVS name of every entry in the NULL-terminated 'tcs'
 * table that has a 'tc_install' hook and a non-empty 'ovs_name'.  'netdev' is
 * not used. */
1652 netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
1655 const struct tc_ops **opsp;
1657 for (opsp = tcs; *opsp != NULL; opsp++) {
1658 const struct tc_ops *ops = *opsp;
1659 if (ops->tc_install && ops->ovs_name[0] != '\0') {
1660 sset_add(types, ops->ovs_name);
/* Scans the NULL-terminated 'tcs' table for the entry whose 'ovs_name'
 * equals 'name' (exact strcmp() match). */
1666 static const struct tc_ops *
1667 tc_lookup_ovs_name(const char *name)
1669 const struct tc_ops **opsp;
1671 for (opsp = tcs; *opsp != NULL; opsp++) {
1672 const struct tc_ops *ops = *opsp;
1673 if (!strcmp(name, ops->ovs_name)) {
/* Scans the NULL-terminated 'tcs' table for the entry whose 'linux_name'
 * equals 'name'.  Entries with a null 'linux_name' are never matched. */
1680 static const struct tc_ops *
1681 tc_lookup_linux_name(const char *name)
1683 const struct tc_ops **opsp;
1685 for (opsp = tcs; *opsp != NULL; opsp++) {
1686 const struct tc_ops *ops = *opsp;
1687 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
/* Searches the 'queues' hmap of the traffic-control state attached to
 * 'netdev', within the bucket selected by 'hash', for a queue whose
 * 'queue_id' equals the given one. */
1694 static struct tc_queue *
1695 tc_find_queue__(const struct netdev *netdev, unsigned int queue_id,
1698 struct netdev_dev_linux *netdev_dev =
1699 netdev_dev_linux_cast(netdev_get_dev(netdev));
1700 struct tc_queue *queue;
1702 HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev_dev->tc->queues) {
1703 if (queue->queue_id == queue_id) {
/* Convenience wrapper around tc_find_queue__() that computes the hash as
 * hash_int(queue_id, 0). */
1710 static struct tc_queue *
1711 tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
1713 return tc_find_queue__(netdev, queue_id, hash_int(queue_id, 0));
/* Looks up the traffic-control implementation registered under OVS name
 * 'type' and reports its 'n_queues' in 'caps->n_queues'.  'netdev' is not
 * used. */
1717 netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
1719 struct netdev_qos_capabilities *caps)
1721 const struct tc_ops *ops = tc_lookup_ovs_name(type);
1725 caps->n_queues = ops->n_queues;
/* Reports the QoS configuration of 'netdev'.  After tc_query_qdisc(), the
 * OVS name of the active qdisc implementation is stored in '*typep', and the
 * implementation's 'qdisc_get' hook, when present, is called with
 * 'details'. */
1730 netdev_linux_get_qos(const struct netdev *netdev,
1731 const char **typep, struct shash *details)
1733 struct netdev_dev_linux *netdev_dev =
1734 netdev_dev_linux_cast(netdev_get_dev(netdev));
1737 error = tc_query_qdisc(netdev);
1742 *typep = netdev_dev->tc->ops->ovs_name;
1743 return (netdev_dev->tc->ops->qdisc_get
1744 ? netdev_dev->tc->ops->qdisc_get(netdev, details)
/* Switches 'netdev' to the qdisc implementation with OVS name 'type',
 * configured from 'details'.
 *
 * 'type' must resolve to an implementation with a 'tc_install' hook.  When
 * the requested implementation is already the active one, only its
 * 'qdisc_set' hook is invoked (0 is returned if it has none).  Otherwise the
 * existing qdisc is deleted and the new one installed.  The assertions
 * document the invariants: 'tc' is NULL after a successful delete, and is
 * non-NULL exactly when the install succeeded. */
1749 netdev_linux_set_qos(struct netdev *netdev,
1750 const char *type, const struct shash *details)
1752 struct netdev_dev_linux *netdev_dev =
1753 netdev_dev_linux_cast(netdev_get_dev(netdev));
1754 const struct tc_ops *new_ops;
1757 new_ops = tc_lookup_ovs_name(type);
1758 if (!new_ops || !new_ops->tc_install) {
1762 error = tc_query_qdisc(netdev);
1767 if (new_ops == netdev_dev->tc->ops) {
1768 return new_ops->qdisc_set ? new_ops->qdisc_set(netdev, details) : 0;
1770 /* Delete existing qdisc. */
1771 error = tc_del_qdisc(netdev);
1775 assert(netdev_dev->tc == NULL);
1777 /* Install new qdisc. */
1778 error = new_ops->tc_install(netdev, details);
1779 assert((error == 0) == (netdev_dev->tc != NULL));
/* Retrieves the configuration of queue 'queue_id' on 'netdev' into
 * 'details'.  After tc_query_qdisc(), the queue is looked up with
 * tc_find_queue() and handed to the active implementation's 'class_get'
 * hook. */
1786 netdev_linux_get_queue(const struct netdev *netdev,
1787 unsigned int queue_id, struct shash *details)
1789 struct netdev_dev_linux *netdev_dev =
1790 netdev_dev_linux_cast(netdev_get_dev(netdev));
1793 error = tc_query_qdisc(netdev);
1797 struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1799 ? netdev_dev->tc->ops->class_get(netdev, queue, details)
/* Configures queue 'queue_id' on 'netdev' from 'details'.  After
 * tc_query_qdisc(), a 'queue_id' at or beyond the implementation's
 * 'n_queues', or an implementation without a 'class_set' hook, takes the
 * rejection branch; otherwise the request is delegated to 'class_set'. */
1805 netdev_linux_set_queue(struct netdev *netdev,
1806 unsigned int queue_id, const struct shash *details)
1808 struct netdev_dev_linux *netdev_dev =
1809 netdev_dev_linux_cast(netdev_get_dev(netdev));
1812 error = tc_query_qdisc(netdev);
1815 } else if (queue_id >= netdev_dev->tc->ops->n_queues
1816 || !netdev_dev->tc->ops->class_set) {
1820 return netdev_dev->tc->ops->class_set(netdev, queue_id, details);
/* Deletes queue 'queue_id' from 'netdev'.  After tc_query_qdisc(), an
 * implementation without a 'class_delete' hook takes a separate branch;
 * otherwise the queue is looked up with tc_find_queue() and passed to
 * 'class_delete'. */
1824 netdev_linux_delete_queue(struct netdev *netdev, unsigned int queue_id)
1826 struct netdev_dev_linux *netdev_dev =
1827 netdev_dev_linux_cast(netdev_get_dev(netdev));
1830 error = tc_query_qdisc(netdev);
1833 } else if (!netdev_dev->tc->ops->class_delete) {
1836 struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1838 ? netdev_dev->tc->ops->class_delete(netdev, queue)
/* Retrieves statistics of queue 'queue_id' on 'netdev' into 'stats'.  After
 * tc_query_qdisc(), an implementation without a 'class_get_stats' hook takes
 * a separate branch; otherwise the queue is looked up with tc_find_queue()
 * and passed to 'class_get_stats'. */
1844 netdev_linux_get_queue_stats(const struct netdev *netdev,
1845 unsigned int queue_id,
1846 struct netdev_queue_stats *stats)
1848 struct netdev_dev_linux *netdev_dev =
1849 netdev_dev_linux_cast(netdev_get_dev(netdev));
1852 error = tc_query_qdisc(netdev);
1855 } else if (!netdev_dev->tc->ops->class_get_stats) {
1858 const struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1860 ? netdev_dev->tc->ops->class_get_stats(netdev, queue, stats)
/* Starts a Netlink dump of the traffic classes on 'netdev': builds an
 * RTM_GETTCLASS request with 'tcm_parent' 0, starts 'dump' on 'rtnl_sock',
 * and releases the request buffer.  netdev_linux_dump_queue_stats() treats a
 * false result as failure. */
1866 start_queue_dump(const struct netdev *netdev, struct nl_dump *dump)
1868 struct ofpbuf request;
1869 struct tcmsg *tcmsg;
1871 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
1875 tcmsg->tcm_parent = 0;
1876 nl_dump_start(dump, rtnl_sock, &request);
1877 ofpbuf_uninit(&request);
/* Reports every queue configured on 'netdev' through 'cb'.
 *
 * After tc_query_qdisc(), and provided the implementation has a 'class_get'
 * hook, each queue in the 'queues' hmap is passed to 'class_get' with a
 * freshly cleared 'details' table, and 'cb' receives the queue id, the
 * details, and 'aux'.  The table is destroyed at the end. */
1882 netdev_linux_dump_queues(const struct netdev *netdev,
1883 netdev_dump_queues_cb *cb, void *aux)
1885 struct netdev_dev_linux *netdev_dev =
1886 netdev_dev_linux_cast(netdev_get_dev(netdev));
1887 struct tc_queue *queue;
1888 struct shash details;
1892 error = tc_query_qdisc(netdev);
1895 } else if (!netdev_dev->tc->ops->class_get) {
1900 shash_init(&details);
1901 HMAP_FOR_EACH (queue, hmap_node, &netdev_dev->tc->queues) {
1902 shash_clear(&details);
1904 error = netdev_dev->tc->ops->class_get(netdev, queue, &details);
1906 (*cb)(queue->queue_id, &details, aux);
1911 shash_destroy(&details);
/* Reports statistics for the queues on 'netdev' through 'cb'.
 *
 * After tc_query_qdisc(), and provided the implementation has a
 * 'class_dump_stats' hook, a class dump is started and every reply message is
 * passed to 'class_dump_stats' together with 'cb' and 'aux'.  The result is
 * the error from nl_dump_done() if there is one, otherwise 'last_error'. */
1917 netdev_linux_dump_queue_stats(const struct netdev *netdev,
1918 netdev_dump_queue_stats_cb *cb, void *aux)
1920 struct netdev_dev_linux *netdev_dev =
1921 netdev_dev_linux_cast(netdev_get_dev(netdev));
1922 struct nl_dump dump;
1927 error = tc_query_qdisc(netdev);
1930 } else if (!netdev_dev->tc->ops->class_dump_stats) {
1935 if (!start_queue_dump(netdev, &dump)) {
1938 while (nl_dump_next(&dump, &msg)) {
1939 error = netdev_dev->tc->ops->class_dump_stats(netdev, &msg, cb, aux);
1945 error = nl_dump_done(&dump);
1946 return error ? error : last_error;
/* Stores the IPv4 address and netmask of 'netdev_' in '*address' and
 * '*netmask'.
 *
 * The values are cached under VALID_IN4.  On a cache miss they are fetched
 * with SIOCGIFADDR and SIOCGIFNETMASK.  Returns EADDRNOTAVAIL when the
 * resulting address is INADDR_ANY, otherwise 0. */
1950 netdev_linux_get_in4(const struct netdev *netdev_,
1951 struct in_addr *address, struct in_addr *netmask)
1953 struct netdev_dev_linux *netdev_dev =
1954 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1956 if (!(netdev_dev->cache_valid & VALID_IN4)) {
1959 error = netdev_linux_get_ipv4(netdev_, &netdev_dev->address,
1960 SIOCGIFADDR, "SIOCGIFADDR");
1965 error = netdev_linux_get_ipv4(netdev_, &netdev_dev->netmask,
1966 SIOCGIFNETMASK, "SIOCGIFNETMASK");
1971 netdev_dev->cache_valid |= VALID_IN4;
1973 *address = netdev_dev->address;
1974 *netmask = netdev_dev->netmask;
1975 return address->s_addr == INADDR_ANY ? EADDRNOTAVAIL : 0;
/* Assigns IPv4 'address' and 'netmask' to 'netdev_'.
 *
 * The address is set with SIOCSIFADDR.  The cache (VALID_IN4, 'address',
 * 'netmask') is then updated, and the netmask is pushed to the kernel with
 * SIOCSIFNETMASK only when 'address' is not INADDR_ANY. */
1979 netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
1980 struct in_addr netmask)
1982 struct netdev_dev_linux *netdev_dev =
1983 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1986 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
1988 netdev_dev->cache_valid |= VALID_IN4;
1989 netdev_dev->address = address;
1990 netdev_dev->netmask = netmask;
1991 if (address.s_addr != INADDR_ANY) {
1992 error = do_set_addr(netdev_, SIOCSIFNETMASK,
1993 "SIOCSIFNETMASK", netmask);
/* Parses one text line in the format read from /proc/net/if_inet6 by
 * netdev_linux_get_in6().
 *
 * The format consumes sixteen two-digit hex bytes into 'in6->s6_addr', skips
 * four further hex fields, and reads an interface name of at most 16
 * characters into 'ifname'.  netdev_linux_get_in6() treats a true result as
 * a successful parse. */
2000 parse_if_inet6_line(const char *line,
2001 struct in6_addr *in6, char ifname[16 + 1])
2003 uint8_t *s6 = in6->s6_addr;
2004 #define X8 "%2"SCNx8
2006 " "X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8
2007 "%*x %*x %*x %*x %16s\n",
2008 &s6[0], &s6[1], &s6[2], &s6[3],
2009 &s6[4], &s6[5], &s6[6], &s6[7],
2010 &s6[8], &s6[9], &s6[10], &s6[11],
2011 &s6[12], &s6[13], &s6[14], &s6[15],
2015 /* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address (if
2016 * 'in6' is non-null) and returns true. Otherwise, returns false. */
/* The address is cached under VALID_IN6.  On a cache miss it defaults to
 * 'in6addr_any', and /proc/net/if_inet6 is scanned line by line for an entry
 * whose interface name equals the device name.
 * NOTE(review): the statement shown assigns through 'in6' without a null
 * check, which does not match the wording above -- confirm the contract and
 * the return value. */
2018 netdev_linux_get_in6(const struct netdev *netdev_, struct in6_addr *in6)
2020 struct netdev_dev_linux *netdev_dev =
2021 netdev_dev_linux_cast(netdev_get_dev(netdev_));
2022 if (!(netdev_dev->cache_valid & VALID_IN6)) {
2026 netdev_dev->in6 = in6addr_any;
2028 file = fopen("/proc/net/if_inet6", "r");
2030 const char *name = netdev_get_name(netdev_);
2031 while (fgets(line, sizeof line, file)) {
2032 struct in6_addr in6_tmp;
2033 char ifname[16 + 1];
2034 if (parse_if_inet6_line(line, &in6_tmp, ifname)
2035 && !strcmp(name, ifname))
2037 netdev_dev->in6 = in6_tmp;
2043 netdev_dev->cache_valid |= VALID_IN6;
2045 *in6 = netdev_dev->in6;
/* Fills '*sa' with an AF_INET socket address for 'addr': a zeroed
 * 'struct sockaddr_in' is populated, '*sa' is cleared, and the structure is
 * copied over it. */
2050 make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
2052 struct sockaddr_in sin;
2053 memset(&sin, 0, sizeof sin);
2054 sin.sin_family = AF_INET;
2055 sin.sin_addr = addr;
2058 memset(sa, 0, sizeof *sa);
2059 memcpy(sa, &sin, sizeof sin);
/* Issues address-setting ioctl 'ioctl_nr' (named 'ioctl_name' for logging)
 * on 'netdev' with IPv4 address 'addr'.  The request carries the device name
 * and 'addr' encoded by make_in4_sockaddr(); the result of
 * netdev_linux_do_ioctl() is returned. */
2063 do_set_addr(struct netdev *netdev,
2064 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
2067 ovs_strzcpy(ifr.ifr_name, netdev_get_name(netdev), sizeof ifr.ifr_name);
2068 make_in4_sockaddr(&ifr.ifr_addr, addr);
2070 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
2074 /* Adds 'router' as a default IP gateway. */
/* The route entry uses INADDR_ANY for both destination and genmask, 'router'
 * as gateway, and flags RTF_UP | RTF_GATEWAY.  It is installed with
 * SIOCADDRT on 'af_inet_sock'; a failure is logged with its error text.
 * 'netdev' is not used. */
2076 netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
2078 struct in_addr any = { INADDR_ANY };
2082 memset(&rt, 0, sizeof rt);
2083 make_in4_sockaddr(&rt.rt_dst, any);
2084 make_in4_sockaddr(&rt.rt_gateway, router);
2085 make_in4_sockaddr(&rt.rt_genmask, any);
2086 rt.rt_flags = RTF_UP | RTF_GATEWAY;
2087 error = ioctl(af_inet_sock, SIOCADDRT, &rt) < 0 ? errno : 0;
2089 VLOG_WARN("ioctl(SIOCADDRT): %s", strerror(error));
/* Finds the route that covers 'host' by scanning /proc/net/route.
 *
 * '*netdev_name' starts out NULL.  Each parsed line must yield 11 fields,
 * otherwise a rate-limited warning is logged.  Routes without RTF_UP are
 * ignored.  For the matching route, '*next_hop' is set to 0 when the host is
 * directly reachable or to the route's gateway otherwise, and '*netdev_name'
 * receives an xstrdup() copy of the interface name. */
2095 netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
2098 static const char fn[] = "/proc/net/route";
2103 *netdev_name = NULL;
2104 stream = fopen(fn, "r");
2105 if (stream == NULL) {
2106 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
2111 while (fgets(line, sizeof line, stream)) {
2114 ovs_be32 dest, gateway, mask;
2115 int refcnt, metric, mtu;
2116 unsigned int flags, use, window, irtt;
2119 "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
2121 iface, &dest, &gateway, &flags, &refcnt,
2122 &use, &metric, &mask, &mtu, &window, &irtt) != 11) {
2124 VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
2128 if (!(flags & RTF_UP)) {
2129 /* Skip routes that aren't up. */
2133 /* The output of 'dest', 'mask', and 'gateway' were given in
2134 * network byte order, so we don't need any endian
2135 * conversions here. */
2136 if ((dest & mask) == (host->s_addr & mask)) {
2138 /* The host is directly reachable. */
2139 next_hop->s_addr = 0;
2141 /* To reach the host, we must go through a gateway. */
2142 next_hop->s_addr = gateway;
2144 *netdev_name = xstrdup(iface);
/* Queries driver information for 'netdev' with ETHTOOL_GDRVINFO and adds
 * heap-allocated copies of the driver name, driver version, and firmware
 * version to 'sh' under the keys "driver_name", "driver_version", and
 * "firmware_version". */
2156 netdev_linux_get_status(const struct netdev *netdev, struct shash *sh)
2158 struct ethtool_drvinfo drvinfo;
2161 memset(&drvinfo, 0, sizeof drvinfo);
2162 error = netdev_linux_do_ethtool(netdev_get_name(netdev),
2163 (struct ethtool_cmd *)&drvinfo,
2165 "ETHTOOL_GDRVINFO");
2167 shash_add(sh, "driver_name", xstrdup(drvinfo.driver));
2168 shash_add(sh, "driver_version", xstrdup(drvinfo.version));
2169 shash_add(sh, "firmware_version", xstrdup(drvinfo.fw_version));
2175 /* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
2176 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
2177 * returns 0. Otherwise, it returns a positive errno value; in particular,
2178 * ENXIO indicates that there is no ARP table entry for 'ip' on 'netdev'. */
/* The request is an SIOCGARP ioctl on 'af_inet_sock' carrying 'ip' as an
 * AF_INET protocol address, ARPHRD_ETHER as hardware family, and the device
 * name.  Errors other than ENXIO are logged with a rate-limited warning. */
2180 netdev_linux_arp_lookup(const struct netdev *netdev,
2181 ovs_be32 ip, uint8_t mac[ETH_ADDR_LEN])
2184 struct sockaddr_in sin;
2187 memset(&r, 0, sizeof r);
2188 memset(&sin, 0, sizeof sin);
2189 sin.sin_family = AF_INET;
2190 sin.sin_addr.s_addr = ip;
2192 memcpy(&r.arp_pa, &sin, sizeof sin);
2193 r.arp_ha.sa_family = ARPHRD_ETHER;
2195 ovs_strzcpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
2196 COVERAGE_INC(netdev_arp_lookup);
2197 retval = ioctl(af_inet_sock, SIOCGARP, &r) < 0 ? errno : 0;
2199 memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
2200 } else if (retval != ENXIO) {
2201 VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
2202 netdev_get_name(netdev), IP_ARGS(&ip), strerror(retval));
/* Translates the NETDEV_UP and NETDEV_PROMISC bits of 'nd' into the flag
 * representation that netdev_linux_update_flags() applies to 'ifi_flags'. */
2208 nd_to_iff_flags(enum netdev_flags nd)
2211 if (nd & NETDEV_UP) {
2214 if (nd & NETDEV_PROMISC) {
/* Translates interface flags 'iff' into 'enum netdev_flags' bits, starting
 * from 0.  IFF_PROMISC maps to NETDEV_PROMISC. */
2221 iff_to_nd_flags(int iff)
2223 enum netdev_flags nd = 0;
2227 if (iff & IFF_PROMISC) {
2228 nd |= NETDEV_PROMISC;
/* Clears the flags in 'off' and sets the flags in 'on' for 'netdev'.
 *
 * '*old_flagsp' receives the previous flags, translated from the cached
 * 'ifi_flags'.  The kernel is touched only when the computed flag word
 * differs from the cached one; in that case set_flags() is called and the
 * cache is refreshed with get_flags(). */
2234 netdev_linux_update_flags(struct netdev *netdev, enum netdev_flags off,
2235 enum netdev_flags on, enum netdev_flags *old_flagsp)
2237 struct netdev_dev_linux *netdev_dev;
2238 int old_flags, new_flags;
2241 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
2242 old_flags = netdev_dev->ifi_flags;
2243 *old_flagsp = iff_to_nd_flags(old_flags);
2244 new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
2245 if (new_flags != old_flags) {
2246 error = set_flags(netdev, new_flags);
2247 get_flags(&netdev_dev->netdev_dev, &netdev_dev->ifi_flags);
/* Returns the 'change_seq' counter of the device underlying 'netdev'. */
2253 netdev_linux_change_seq(const struct netdev *netdev)
2255 return netdev_dev_linux_cast(netdev_get_dev(netdev))->change_seq;
/* Initializer template listing the netdev-provider callbacks implemented in
 * this file.  NAME, CREATE, GET_STATS and SET_STATS are the macro's
 * parameters; every other entry is fixed.  Entries given as NULL (get_config,
 * set_config) are not implemented.
 * NOTE(review): presumably expanded once per netdev class defined below --
 * confirm. */
2258 #define NETDEV_LINUX_CLASS(NAME, CREATE, GET_STATS, SET_STATS) \
2262 netdev_linux_init, \
2264 netdev_linux_wait, \
2267 netdev_linux_destroy, \
2268 NULL, /* get_config */ \
2269 NULL, /* set_config */ \
2271 netdev_linux_open, \
2272 netdev_linux_close, \
2274 netdev_linux_listen, \
2275 netdev_linux_recv, \
2276 netdev_linux_recv_wait, \
2277 netdev_linux_drain, \
2279 netdev_linux_send, \
2280 netdev_linux_send_wait, \
2282 netdev_linux_set_etheraddr, \
2283 netdev_linux_get_etheraddr, \
2284 netdev_linux_get_mtu, \
2285 netdev_linux_set_mtu, \
2286 netdev_linux_get_ifindex, \
2287 netdev_linux_get_carrier, \
2288 netdev_linux_get_carrier_resets, \
2289 netdev_linux_set_miimon_interval, \
2293 netdev_linux_get_features, \
2294 netdev_linux_set_advertisements, \
2296 netdev_linux_set_policing, \
2297 netdev_linux_get_qos_types, \
2298 netdev_linux_get_qos_capabilities, \
2299 netdev_linux_get_qos, \
2300 netdev_linux_set_qos, \
2301 netdev_linux_get_queue, \
2302 netdev_linux_set_queue, \
2303 netdev_linux_delete_queue, \
2304 netdev_linux_get_queue_stats, \
2305 netdev_linux_dump_queues, \
2306 netdev_linux_dump_queue_stats, \
2308 netdev_linux_get_in4, \
2309 netdev_linux_set_in4, \
2310 netdev_linux_get_in6, \
2311 netdev_linux_add_router, \
2312 netdev_linux_get_next_hop, \
2313 netdev_linux_get_status, \
2314 netdev_linux_arp_lookup, \
2316 netdev_linux_update_flags, \
2318 netdev_linux_change_seq \
/* netdev_linux_class: devices are created with netdev_linux_create(), stats
 * come from netdev_linux_get_stats(), and there is no set_stats hook. */
2321 const struct netdev_class netdev_linux_class =
2324 netdev_linux_create,
2325 netdev_linux_get_stats,
2326 NULL); /* set_stats */
/* netdev_tap_class: devices are created with netdev_linux_create_tap(), stats
 * come from netdev_tap_get_stats(), and there is no set_stats hook. */
2328 const struct netdev_class netdev_tap_class =
2331 netdev_linux_create_tap,
2332 netdev_tap_get_stats,
2333 NULL); /* set_stats */
/* netdev_internal_class: devices are created with netdev_linux_create(),
 * stats come from netdev_internal_get_stats(), and set_stats is delegated to
 * netdev_vport_set_stats(). */
2335 const struct netdev_class netdev_internal_class =
2338 netdev_linux_create,
2339 netdev_internal_get_stats,
2340 netdev_vport_set_stats);
2342 /* HTB traffic control class. */
/* Largest class minor number that htb_parse_tcmsg__() accepts; the queue ID
 * it reports is the minor number minus 1. */
2344 #define HTB_N_QUEUES 0xf000
/* Qdisc-wide maximum rate, stored by htb_install__() in 'struct htb'. */
2348 unsigned int max_rate; /* In bytes/s. */
/* Per-queue HTB parameters ('struct htb_class'), as filled in by the
 * htb_parse_*() helpers and consumed by htb_setup_class__(). */
2352 struct tc_queue tc_queue;
2353 unsigned int min_rate; /* In bytes/s. */
2354 unsigned int max_rate; /* In bytes/s. */
2355 unsigned int burst; /* In bytes. */
2356 unsigned int priority; /* Lower values are higher priorities. */
/* Returns the 'struct htb' that embeds the 'tc' object currently attached to
 * 'netdev' (via CONTAINER_OF on its 'tc' member).
 * NOTE(review): only meaningful while the attached qdisc is HTB -- confirm
 * callers guarantee this. */
2360 htb_get__(const struct netdev *netdev)
2362 struct netdev_dev_linux *netdev_dev =
2363 netdev_dev_linux_cast(netdev_get_dev(netdev));
2364 return CONTAINER_OF(netdev_dev->tc, struct htb, tc);
/* Allocates a 'struct htb', initializes its embedded 'tc' with 'tc_ops_htb',
 * records 'max_rate', and attaches it to the device as 'netdev_dev->tc'.
 * NOTE(review): 'max_rate' arrives as uint64_t but the struct field is
 * 'unsigned int' -- confirm that larger values cannot occur. */
2368 htb_install__(struct netdev *netdev, uint64_t max_rate)
2370 struct netdev_dev_linux *netdev_dev =
2371 netdev_dev_linux_cast(netdev_get_dev(netdev));
2374 htb = xmalloc(sizeof *htb);
2375 tc_init(&htb->tc, &tc_ops_htb);
2376 htb->max_rate = max_rate;
2378 netdev_dev->tc = &htb->tc;
2381 /* Create an HTB qdisc.
2383 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
/* Any existing qdisc is deleted first (the result of that call is not
 * checked).  The RTM_NEWQDISC request uses handle 1:0 under TC_H_ROOT, kind
 * "htb", and a nested TCA_OPTIONS attribute holding a 'struct tc_htb_glob'
 * with 'rate2quantum' 10.  The result of tc_transact() is returned. */
2385 htb_setup_qdisc__(struct netdev *netdev)
2388 struct tc_htb_glob opt;
2389 struct ofpbuf request;
2390 struct tcmsg *tcmsg;
2392 tc_del_qdisc(netdev);
2394 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2395 NLM_F_EXCL | NLM_F_CREATE, &request);
2399 tcmsg->tcm_handle = tc_make_handle(1, 0);
2400 tcmsg->tcm_parent = TC_H_ROOT;
2402 nl_msg_put_string(&request, TCA_KIND, "htb");
2404 memset(&opt, 0, sizeof opt);
2405 opt.rate2quantum = 10;
2409 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2410 nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
2411 nl_msg_end_nested(&request, opt_offset);
2413 return tc_transact(&request, NULL);
2416 /* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
2417 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
/* The device MTU is required: it feeds tc_fill_rate() and tc_calc_buffer()
 * for both the rate and the ceiling.  The RTM_NEWTCLASS request carries kind
 * "htb" and a nested TCA_OPTIONS attribute with the 'struct tc_htb_opt' plus
 * rate tables for rate and ceil.  A failed transaction is logged with the
 * class and parent handles and all four class parameters. */
2419 htb_setup_class__(struct netdev *netdev, unsigned int handle,
2420 unsigned int parent, struct htb_class *class)
2423 struct tc_htb_opt opt;
2424 struct ofpbuf request;
2425 struct tcmsg *tcmsg;
2429 error = netdev_get_mtu(netdev, &mtu);
2431 VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU",
2432 netdev_get_name(netdev));
2436 memset(&opt, 0, sizeof opt);
2437 tc_fill_rate(&opt.rate, class->min_rate, mtu);
2438 tc_fill_rate(&opt.ceil, class->max_rate, mtu);
2439 opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
2440 opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
2441 opt.prio = class->priority;
2443 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
2447 tcmsg->tcm_handle = handle;
2448 tcmsg->tcm_parent = parent;
2450 nl_msg_put_string(&request, TCA_KIND, "htb");
2451 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2452 nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
2453 tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
2454 tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
2455 nl_msg_end_nested(&request, opt_offset);
2457 error = tc_transact(&request, NULL);
2459 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
2460 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
2461 netdev_get_name(netdev),
2462 tc_get_major(handle), tc_get_minor(handle),
2463 tc_get_major(parent), tc_get_minor(parent),
2464 class->min_rate, class->max_rate,
2465 class->burst, class->priority, strerror(error));
2470 /* Parses Netlink attributes in 'nl_options' for HTB parameters and stores a
2471 * description of them into 'class'. The description complies with the
2472 * specification given in the vswitch database documentation for linux-htb. */
/* Extracts HTB class parameters from the nested attributes in 'nl_options'
 * into 'class'.
 *
 * TCA_HTB_PARMS is mandatory and must be at least as large as
 * 'struct tc_htb_opt'; a parse failure is logged with a rate-limited
 * warning.  'min_rate' and 'max_rate' come from the rate and ceil fields,
 * 'burst' is converted from ticks with tc_ticks_to_bytes(), and 'priority'
 * is copied from 'prio'. */
2475 htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
2477 static const struct nl_policy tca_htb_policy[] = {
2478 [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
2479 .min_len = sizeof(struct tc_htb_opt) },
2482 struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
2483 const struct tc_htb_opt *htb;
2485 if (!nl_parse_nested(nl_options, tca_htb_policy,
2486 attrs, ARRAY_SIZE(tca_htb_policy))) {
2487 VLOG_WARN_RL(&rl, "failed to parse HTB class options");
2491 htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
2492 class->min_rate = htb->rate.rate;
2493 class->max_rate = htb->ceil.rate;
2494 class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
2495 class->priority = htb->prio;
/* Parses the traffic-class message 'tcmsg'.
 *
 * tc_parse_class() supplies the class handle, its nested options, and
 * 'stats'.  When 'queue_id' is non-null, a handle with major number 1 and a
 * minor number in the range 1...HTB_N_QUEUES yields queue ID minor - 1.  When
 * 'options' is non-null, the nested options are decoded with
 * htb_parse_tca_options__().  Both steps run only while no error has
 * occurred. */
2500 htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2501 struct htb_class *options,
2502 struct netdev_queue_stats *stats)
2504 struct nlattr *nl_options;
2505 unsigned int handle;
2508 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2509 if (!error && queue_id) {
2510 unsigned int major = tc_get_major(handle);
2511 unsigned int minor = tc_get_minor(handle);
2512 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2513 *queue_id = minor - 1;
2518 if (!error && options) {
2519 error = htb_parse_tca_options__(nl_options, options);
/* Derives qdisc-level HTB parameters for 'netdev' from 'details' into 'hc'.
 *
 * The "max-rate" entry of 'details' is parsed as a decimal number and divided
 * by 8 to obtain bytes/s.  When the entry is absent or yields 0, the rate
 * falls back to the device's current link features, converted with
 * netdev_features_to_bps() and likewise divided by 8.  'hc->min_rate' is set
 * equal to 'hc->max_rate'. */
2525 htb_parse_qdisc_details__(struct netdev *netdev,
2526 const struct shash *details, struct htb_class *hc)
2528 const char *max_rate_s;
2530 max_rate_s = shash_find_data(details, "max-rate");
2531 hc->max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
2532 if (!hc->max_rate) {
2535 netdev_get_features(netdev, &current, NULL, NULL, NULL);
2536 hc->max_rate = netdev_features_to_bps(current) / 8;
2538 hc->min_rate = hc->max_rate;
/* Derives per-queue HTB parameters for 'netdev' from 'details' into 'hc'.
 *
 * The device MTU is required.  "min-rate", "max-rate" and "burst" are parsed
 * as decimal numbers and divided by 8; "priority" is parsed as a decimal
 * number.  'min_rate' is raised to at least the MTU and capped at the qdisc's
 * 'max_rate'; 'max_rate' is raised to at least 'min_rate' and capped the same
 * way; 'burst' is raised to at least MTU + 64. */
2544 htb_parse_class_details__(struct netdev *netdev,
2545 const struct shash *details, struct htb_class *hc)
2547 const struct htb *htb = htb_get__(netdev);
2548 const char *min_rate_s = shash_find_data(details, "min-rate");
2549 const char *max_rate_s = shash_find_data(details, "max-rate");
2550 const char *burst_s = shash_find_data(details, "burst");
2551 const char *priority_s = shash_find_data(details, "priority");
2554 error = netdev_get_mtu(netdev, &mtu);
2556 VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU",
2557 netdev_get_name(netdev));
2561 /* HTB requires at least an mtu sized min-rate to send any traffic even
2562 * on uncongested links. */
2563 hc->min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
2564 hc->min_rate = MAX(hc->min_rate, mtu);
2565 hc->min_rate = MIN(hc->min_rate, htb->max_rate);
2568 hc->max_rate = (max_rate_s
2569 ? strtoull(max_rate_s, NULL, 10) / 8
2571 hc->max_rate = MAX(hc->max_rate, hc->min_rate);
2572 hc->max_rate = MIN(hc->max_rate, htb->max_rate);
2576 * According to hints in the documentation that I've read, it is important
2577 * that 'burst' be at least as big as the largest frame that might be
2578 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
2579 * but having it a bit too small is a problem. Since netdev_get_mtu()
2580 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
2581 * the MTU. We actually add 64, instead of 14, as a guard against
2582 * additional headers get tacked on somewhere that we're not aware of. */
2583 hc->burst = burst_s ? strtoull(burst_s, NULL, 10) / 8 : 0;
2584 hc->burst = MAX(hc->burst, mtu + 64);
2587 hc->priority = priority_s ? strtoul(priority_s, NULL, 10) : 0;
/* Queries the kernel for class 'handle' with parent 'parent' on 'netdev' using
 * tc_query_class() and parses the reply with htb_parse_tcmsg__() into
 * 'options' and 'stats'.  No queue ID is requested from the parser.  The reply
 * buffer is deleted here, so callers never see it. */
2593 htb_query_class__(const struct netdev *netdev, unsigned int handle,
2594 unsigned int parent, struct htb_class *options,
2595 struct netdev_queue_stats *stats)
2597 struct ofpbuf *reply;
2600 error = tc_query_class(netdev, handle, parent, &reply);
2602 error = htb_parse_tcmsg__(reply, NULL, options, stats);
2603 ofpbuf_delete(reply);
/* Installs HTB on 'netdev'.
 *
 * Sets up the HTB qdisc, creates class 1:fffe under 1:0 with the qdisc-wide
 * rates parsed from 'details', and records the resulting max_rate with
 * htb_install__(). */
2609 htb_tc_install(struct netdev *netdev, const struct shash *details)
2613 error = htb_setup_qdisc__(netdev);
2615 struct htb_class hc;
2617 htb_parse_qdisc_details__(netdev, details, &hc);
2618 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2619 tc_make_handle(1, 0), &hc);
2621 htb_install__(netdev, hc.max_rate);
/* Returns the htb_class that embeds 'queue' as its 'tc_queue' member. */
2627 static struct htb_class *
2628 htb_class_cast__(const struct tc_queue *queue)
2630 return CONTAINER_OF(queue, struct htb_class, tc_queue);
/* Records the parameters in 'hc' for queue 'queue_id' in the HTB state of
 * 'netdev'.
 *
 * The queue is looked up with tc_find_queue__().  The function also contains
 * a path that allocates a new htb_class and inserts it into htb->tc.queues
 * under hash_int(queue_id, 0).  In either case min_rate, max_rate, burst and
 * priority are then copied from 'hc'. */
2634 htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
2635 const struct htb_class *hc)
2637 struct htb *htb = htb_get__(netdev);
2638 size_t hash = hash_int(queue_id, 0);
2639 struct tc_queue *queue;
2640 struct htb_class *hcp;
2642 queue = tc_find_queue__(netdev, queue_id, hash);
2644 hcp = htb_class_cast__(queue);
2646 hcp = xmalloc(sizeof *hcp);
2647 queue = &hcp->tc_queue;
2648 queue->queue_id = queue_id;
2649 hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
2652 hcp->min_rate = hc->min_rate;
2653 hcp->max_rate = hc->max_rate;
2654 hcp->burst = hc->burst;
2655 hcp->priority = hc->priority;
/* Builds the in-memory HTB state for 'netdev' from what the kernel reports.
 *
 * Class 1:fffe supplies the qdisc-wide max_rate, then every class returned by
 * the queue dump that htb_parse_tcmsg__() accepts is recorded with
 * htb_update_queue__().  'nlmsg' is unused. */
2659 htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
2662 struct nl_dump dump;
2663 struct htb_class hc;
2665 /* Get qdisc options. */
/* NOTE(review): the result of htb_query_class__() is ignored here; confirm
 * that hc.max_rate holds a defined value when the query fails. */
2667 htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
2668 htb_install__(netdev, hc.max_rate);
2671 if (!start_queue_dump(netdev, &dump)) {
2674 while (nl_dump_next(&dump, &msg)) {
2675 unsigned int queue_id;
/* 'hc' is reused as scratch space for each dumped class. */
2677 if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
2678 htb_update_queue__(netdev, queue_id, &hc);
2681 nl_dump_done(&dump);
/* Tears down the HTB state that contains 'tc'.
 *
 * Every htb_class is removed from the queue map; the _SAFE iterator is used
 * because nodes are unlinked while the map is being walked. */
2687 htb_tc_destroy(struct tc *tc)
2689 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
2690 struct htb_class *hc, *next;
2692 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &htb->tc.queues) {
2693 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
/* Adds a "max-rate" entry to 'details' holding the qdisc-wide htb->max_rate
 * multiplied by 8 (the inverse of the division by 8 applied when the value is
 * parsed).  The string is allocated with xasprintf(). */
2701 htb_qdisc_get(const struct netdev *netdev, struct shash *details)
2703 const struct htb *htb = htb_get__(netdev);
2704 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * htb->max_rate));
/* Reconfigures the qdisc-wide HTB parameters of 'netdev' from 'details'.
 *
 * Class 1:fffe (parent 1:0) is set up again with the parsed rates, and the new
 * max_rate is stored in the netdev's HTB state. */
2709 htb_qdisc_set(struct netdev *netdev, const struct shash *details)
2711 struct htb_class hc;
2714 htb_parse_qdisc_details__(netdev, details, &hc);
2715 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2716 tc_make_handle(1, 0), &hc);
2718 htb_get__(netdev)->max_rate = hc.max_rate;
/* Describes the HTB class behind 'queue' in 'details'.
 *
 * "min-rate" is reported as min_rate times 8.  "max-rate" (max_rate times 8)
 * is added only when it differs from min_rate.  The "burst" entry is
 * hc->burst times 8 and the "priority" entry is printed unscaled.  'netdev'
 * is unused. */
2724 htb_class_get(const struct netdev *netdev OVS_UNUSED,
2725 const struct tc_queue *queue, struct shash *details)
2727 const struct htb_class *hc = htb_class_cast__(queue);
2729 shash_add(details, "min-rate", xasprintf("%llu", 8ULL * hc->min_rate));
2730 if (hc->min_rate != hc->max_rate) {
2731 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hc->max_rate));
2733 shash_add(details, "burst", xasprintf("%llu", 8ULL * hc->burst));
2735 shash_add(details, "priority", xasprintf("%u", hc->priority));
/* Configures queue 'queue_id' on 'netdev' from 'details'.
 *
 * The details are parsed into an htb_class, kernel class 1:(queue_id + 1) is
 * set up under parent 1:fffe, and the parameters are recorded with
 * htb_update_queue__(). */
2741 htb_class_set(struct netdev *netdev, unsigned int queue_id,
2742 const struct shash *details)
2744 struct htb_class hc;
2747 error = htb_parse_class_details__(netdev, details, &hc);
2752 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
2753 tc_make_handle(1, 0xfffe), &hc);
2758 htb_update_queue__(netdev, queue_id, &hc);
/* Deletes kernel class 1:(queue->queue_id + 1) from 'netdev' with
 * tc_delete_class() and unlinks 'queue' from the HTB queue map. */
2763 htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
2765 struct htb_class *hc = htb_class_cast__(queue);
2766 struct htb *htb = htb_get__(netdev);
2769 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
2771 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
/* Fetches statistics for 'queue' into 'stats' by querying kernel class
 * 1:(queue->queue_id + 1) under parent 1:fffe.  Class options are not
 * requested.  Returns whatever htb_query_class__() returns. */
2778 htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
2779 struct netdev_queue_stats *stats)
2781 return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
2782 tc_make_handle(1, 0xfffe), NULL, stats);
/* Handles one class message 'nlmsg' from a statistics dump.
 *
 * The handle and statistics are extracted with tc_parse_class().  When the
 * handle is 1:N with 1 <= N <= HTB_N_QUEUES, 'cb' is invoked with the
 * zero-based queue ID N - 1, the statistics and 'aux'.  'netdev' is unused. */
2786 htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
2787 const struct ofpbuf *nlmsg,
2788 netdev_dump_queue_stats_cb *cb, void *aux)
2790 struct netdev_queue_stats stats;
2791 unsigned int handle, major, minor;
2794 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
2799 major = tc_get_major(handle);
2800 minor = tc_get_minor(handle);
2801 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2802 (*cb)(minor - 1, &stats, aux);
/* Operations table for the "linux-htb" QoS type, backed by the Linux "htb"
 * qdisc.  Members are positional; the trailing comments name each slot. */
2807 static const struct tc_ops tc_ops_htb = {
2808 "htb", /* linux_name */
2809 "linux-htb", /* ovs_name */
2810 HTB_N_QUEUES, /* n_queues */
2819 htb_class_get_stats,
2820 htb_class_dump_stats
2823 /* "linux-hfsc" traffic control class. */
/* Largest class minor that is treated as a queue: class 1:N with
 * 1 <= N <= HFSC_N_QUEUES maps to queue ID N - 1. */
2825 #define HFSC_N_QUEUES 0xf000
/* Embedded generic queue; hfsc_class_cast__() recovers the containing
 * hfsc_class from a pointer to this member. */
2833 struct tc_queue tc_queue;
/* Returns the hfsc structure that embeds the 'tc' currently attached to the
 * netdev_dev of 'netdev'.
 *
 * NOTE(review): this is only meaningful while netdev_dev->tc points at a tc
 * that was installed by hfsc_install__() - confirm callers guarantee that. */
2838 static struct hfsc *
2839 hfsc_get__(const struct netdev *netdev)
2841 struct netdev_dev_linux *netdev_dev;
2842 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
2843 return CONTAINER_OF(netdev_dev->tc, struct hfsc, tc);
/* Returns the hfsc_class that embeds 'queue' as its 'tc_queue' member. */
2846 static struct hfsc_class *
2847 hfsc_class_cast__(const struct tc_queue *queue)
2849 return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
/* Allocates a new hfsc structure, initializes its 'tc' with tc_ops_hfsc,
 * stores 'max_rate' in it and attaches it to the netdev_dev of 'netdev' by
 * assigning netdev_dev->tc. */
2853 hfsc_install__(struct netdev *netdev, uint32_t max_rate)
2855 struct netdev_dev_linux * netdev_dev;
2858 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
2859 hfsc = xmalloc(sizeof *hfsc);
2860 tc_init(&hfsc->tc, &tc_ops_hfsc);
2861 hfsc->max_rate = max_rate;
2862 netdev_dev->tc = &hfsc->tc;
/* Records the rates in 'hc' for queue 'queue_id' in the HFSC state of
 * 'netdev'.
 *
 * The queue is looked up with tc_find_queue__().  The function also contains
 * a path that allocates a new hfsc_class and inserts it into hfsc->tc.queues
 * under hash_int(queue_id, 0).  min_rate and max_rate are then copied from
 * 'hc'. */
2866 hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
2867 const struct hfsc_class *hc)
2871 struct hfsc_class *hcp;
2872 struct tc_queue *queue;
2874 hfsc = hfsc_get__(netdev);
2875 hash = hash_int(queue_id, 0);
2877 queue = tc_find_queue__(netdev, queue_id, hash);
2879 hcp = hfsc_class_cast__(queue);
2881 hcp = xmalloc(sizeof *hcp);
2882 queue = &hcp->tc_queue;
2883 queue->queue_id = queue_id;
2884 hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
2887 hcp->min_rate = hc->min_rate;
2888 hcp->max_rate = hc->max_rate;
/* Parses the nested HFSC class attributes in 'nl_options' into 'class'.
 *
 * Three service curves are read: TCA_HFSC_RSC, TCA_HFSC_FSC and TCA_HFSC_USC,
 * each at least sizeof(struct tc_service_curve) bytes long.  Only the
 * configuration that this file itself creates is understood: linear curves
 * (m1 == 0 and d == 0), identical RSC and FSC slopes, and an RSC slope that
 * does not exceed the USC slope.  Anything else is reported with a
 * rate-limited warning. */
2892 hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
2894 const struct tc_service_curve *rsc, *fsc, *usc;
2895 static const struct nl_policy tca_hfsc_policy[] = {
2897 .type = NL_A_UNSPEC,
2899 .min_len = sizeof(struct tc_service_curve),
2902 .type = NL_A_UNSPEC,
2904 .min_len = sizeof(struct tc_service_curve),
2907 .type = NL_A_UNSPEC,
2909 .min_len = sizeof(struct tc_service_curve),
2912 struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];
2914 if (!nl_parse_nested(nl_options, tca_hfsc_policy,
2915 attrs, ARRAY_SIZE(tca_hfsc_policy))) {
2916 VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
2920 rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
2921 fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
2922 usc = nl_attr_get(attrs[TCA_HFSC_USC]);
/* A nonzero m1 or d means the curve has a second segment. */
2924 if (rsc->m1 != 0 || rsc->d != 0 ||
2925 fsc->m1 != 0 || fsc->d != 0 ||
2926 usc->m1 != 0 || usc->d != 0) {
2927 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
2928 "Non-linear service curves are not supported.");
/* hfsc_setup_class__() always writes the same curve for RSC and FSC. */
2932 if (rsc->m2 != fsc->m2) {
2933 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
2934 "Real-time service curves are not supported ");
2938 if (rsc->m2 > usc->m2) {
2939 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
2940 "Min-rate service curve is greater than "
2941 "the max-rate service curve.");
/* The FSC slope is the minimum rate, the USC slope the maximum rate. */
2945 class->min_rate = fsc->m2;
2946 class->max_rate = usc->m2;
/* Parses the class message 'tcmsg' with tc_parse_class(), which receives
 * 'stats' unchanged.
 *
 * A class handle of 1:N with 1 <= N <= HFSC_N_QUEUES yields the zero-based
 * queue ID N - 1 in '*queue_id'.  The class's HFSC options are parsed into
 * 'options' with hfsc_parse_tca_options__(). */
2951 hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2952 struct hfsc_class *options,
2953 struct netdev_queue_stats *stats)
2956 unsigned int handle;
2957 struct nlattr *nl_options;
2959 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2965 unsigned int major, minor;
2967 major = tc_get_major(handle);
2968 minor = tc_get_minor(handle);
/* Class minors are one-based, queue IDs are zero-based. */
2969 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
2970 *queue_id = minor - 1;
2977 error = hfsc_parse_tca_options__(nl_options, options);
/* Queries the kernel for class 'handle' with parent 'parent' on 'netdev' using
 * tc_query_class() and parses the reply with hfsc_parse_tcmsg__() into
 * 'options' and 'stats'.  No queue ID is requested from the parser.  The reply
 * buffer is deleted here. */
2984 hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
2985 unsigned int parent, struct hfsc_class *options,
2986 struct netdev_queue_stats *stats)
2989 struct ofpbuf *reply;
2991 error = tc_query_class(netdev, handle, parent, &reply);
2996 error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
2997 ofpbuf_delete(reply);
/* Fills 'class' with the qdisc-wide HFSC rates described by 'details'.
 *
 * The "max-rate" string is divided by 8 before use (the same factor of 8 is
 * applied to netdev_features_to_bps(), so the rates are presumably in bytes
 * per second).  The function can also derive the rate from the current
 * features of 'netdev'.  Both 'min_rate' and 'max_rate' receive the same
 * value. */
3002 hfsc_parse_qdisc_details__(struct netdev *netdev, const struct shash *details,
3003 struct hfsc_class *class)
3006 const char *max_rate_s;
3008 max_rate_s = shash_find_data(details, "max-rate");
3009 max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
/* Pass the address of 'current' so that netdev_get_features() can fill it in;
 * the value is consumed on the next line. */
3014 netdev_get_features(netdev, &current, NULL, NULL, NULL);
3015 max_rate = netdev_features_to_bps(current) / 8;
3018 class->min_rate = max_rate;
3019 class->max_rate = max_rate;
/* Fills 'class' with the per-queue HFSC rates found in 'details'.
 *
 * "min-rate" and "max-rate" are divided by 8 before use.  The minimum rate is
 * forced into the range [1, hfsc->max_rate] (a missing "min-rate" starts out
 * as 0), and the maximum rate into [min_rate, hfsc->max_rate]. */
3023 hfsc_parse_class_details__(struct netdev *netdev,
3024 const struct shash *details,
3025 struct hfsc_class * class)
3027 const struct hfsc *hfsc;
3028 uint32_t min_rate, max_rate;
3029 const char *min_rate_s, *max_rate_s;
3031 hfsc = hfsc_get__(netdev);
3032 min_rate_s = shash_find_data(details, "min-rate");
3033 max_rate_s = shash_find_data(details, "max-rate");
3035 min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
/* Never hand the kernel a zero minimum rate. */
3036 min_rate = MAX(min_rate, 1);
3037 min_rate = MIN(min_rate, hfsc->max_rate);
3039 max_rate = (max_rate_s
3040 ? strtoull(max_rate_s, NULL, 10) / 8
3042 max_rate = MAX(max_rate, min_rate);
3043 max_rate = MIN(max_rate, hfsc->max_rate);
3045 class->min_rate = min_rate;
3046 class->max_rate = max_rate;
3051 /* Creates an HFSC qdisc on 'netdev', after first deleting whatever root
 * qdisc tc_del_qdisc() removes.
 *
 * The request is an RTM_NEWQDISC with NLM_F_EXCL | NLM_F_CREATE for handle
 * 1:0 under TC_H_ROOT.  Returns the result of tc_transact().
 *
3053 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
3055 hfsc_setup_qdisc__(struct netdev * netdev)
3057 struct tcmsg *tcmsg;
3058 struct ofpbuf request;
3059 struct tc_hfsc_qopt opt;
3061 tc_del_qdisc(netdev);
3063 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3064 NLM_F_EXCL | NLM_F_CREATE, &request);
3070 tcmsg->tcm_handle = tc_make_handle(1, 0);
3071 tcmsg->tcm_parent = TC_H_ROOT;
/* Start from all-zero qdisc options. */
3073 memset(&opt, 0, sizeof opt);
3076 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3077 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
3079 return tc_transact(&request, NULL);
3082 /* Creates or replaces HFSC class 'handle' under 'parent' on 'netdev' with
 * the rates in 'class' (an RTM_NEWTCLASS request with NLM_F_CREATE).  A
 * rate-limited warning that names the class, its parent and both rates is
 * logged for a failed transaction.
 *
3084 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
3085 * sc rate <min_rate> ul rate <max_rate>" */
3087 hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
3088 unsigned int parent, struct hfsc_class *class)
3092 struct tcmsg *tcmsg;
3093 struct ofpbuf request;
3094 struct tc_service_curve min, max;
3096 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
3102 tcmsg->tcm_handle = handle;
3103 tcmsg->tcm_parent = parent;
3107 min.m2 = class->min_rate;
3111 max.m2 = class->max_rate;
3113 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3114 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
/* TCA_HFSC_RSC and TCA_HFSC_FSC both carry 'min' (tc's "sc"); TCA_HFSC_USC
 * carries 'max' (tc's "ul"). */
3115 nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
3116 nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
3117 nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
3118 nl_msg_end_nested(&request, opt_offset);
3120 error = tc_transact(&request, NULL);
3122 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
3123 "min-rate %ubps, max-rate %ubps (%s)",
3124 netdev_get_name(netdev),
3125 tc_get_major(handle), tc_get_minor(handle),
3126 tc_get_major(parent), tc_get_minor(parent),
3127 class->min_rate, class->max_rate, strerror(error));
/* Installs HFSC on 'netdev'.
 *
 * Sets up the HFSC qdisc, creates class 1:fffe under 1:0 with the qdisc-wide
 * rates parsed from 'details', and records the resulting max_rate with
 * hfsc_install__(). */
3134 hfsc_tc_install(struct netdev *netdev, const struct shash *details)
3137 struct hfsc_class class;
3139 error = hfsc_setup_qdisc__(netdev);
3145 hfsc_parse_qdisc_details__(netdev, details, &class);
3146 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3147 tc_make_handle(1, 0), &class);
3153 hfsc_install__(netdev, class.max_rate);
/* Builds the in-memory HFSC state for 'netdev' from what the kernel reports.
 *
 * Class 1:fffe supplies the qdisc-wide max_rate, then every class returned by
 * the queue dump that hfsc_parse_tcmsg__() accepts is recorded with
 * hfsc_update_queue__().  'nlmsg' is unused. */
3158 hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3161 struct nl_dump dump;
3162 struct hfsc_class hc;
/* NOTE(review): the result of hfsc_query_class__() is ignored here; confirm
 * that hc.max_rate holds a defined value when the query fails. */
3165 hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
3166 hfsc_install__(netdev, hc.max_rate);
3168 if (!start_queue_dump(netdev, &dump)) {
3172 while (nl_dump_next(&dump, &msg)) {
3173 unsigned int queue_id;
/* 'hc' is reused as scratch space for each dumped class. */
3175 if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3176 hfsc_update_queue__(netdev, queue_id, &hc);
3180 nl_dump_done(&dump);
/* Tears down the HFSC state that contains 'tc'.
 *
 * Every hfsc_class is removed from the queue map; the _SAFE iterator is used
 * because nodes are unlinked while the map is being walked. */
3185 hfsc_tc_destroy(struct tc *tc)
3188 struct hfsc_class *hc, *next;
3190 hfsc = CONTAINER_OF(tc, struct hfsc, tc);
3192 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
3193 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
/* Adds a "max-rate" entry to 'details' holding the qdisc-wide hfsc->max_rate
 * multiplied by 8 (the inverse of the division by 8 applied when the value is
 * parsed).  The string is allocated with xasprintf(). */
3202 hfsc_qdisc_get(const struct netdev *netdev, struct shash *details)
3204 const struct hfsc *hfsc;
3205 hfsc = hfsc_get__(netdev);
3206 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hfsc->max_rate));
/* Reconfigures the qdisc-wide HFSC parameters of 'netdev' from 'details'.
 *
 * Class 1:fffe (parent 1:0) is set up again with the parsed rates, and the new
 * max_rate is stored in the netdev's HFSC state. */
3211 hfsc_qdisc_set(struct netdev *netdev, const struct shash *details)
3214 struct hfsc_class class;
3216 hfsc_parse_qdisc_details__(netdev, details, &class);
3217 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3218 tc_make_handle(1, 0), &class);
3221 hfsc_get__(netdev)->max_rate = class.max_rate;
/* Describes the HFSC class behind 'queue' in 'details'.
 *
 * "min-rate" is reported as min_rate times 8.  "max-rate" (max_rate times 8)
 * is added only when it differs from min_rate.  'netdev' is unused. */
3228 hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
3229 const struct tc_queue *queue, struct shash *details)
3231 const struct hfsc_class *hc;
3233 hc = hfsc_class_cast__(queue);
3234 shash_add(details, "min-rate", xasprintf("%llu", 8ULL * hc->min_rate));
3235 if (hc->min_rate != hc->max_rate) {
3236 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hc->max_rate));
/* Configures queue 'queue_id' on 'netdev' from 'details'.
 *
 * The details are parsed into an hfsc_class, kernel class 1:(queue_id + 1) is
 * set up under parent 1:fffe, and the rates are recorded with
 * hfsc_update_queue__(). */
3242 hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
3243 const struct shash *details)
3246 struct hfsc_class class;
3248 error = hfsc_parse_class_details__(netdev, details, &class);
3253 error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3254 tc_make_handle(1, 0xfffe), &class);
3259 hfsc_update_queue__(netdev, queue_id, &class);
/* Deletes kernel class 1:(queue->queue_id + 1) from 'netdev' with
 * tc_delete_class() and unlinks 'queue' from the HFSC queue map. */
3264 hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
3268 struct hfsc_class *hc;
3270 hc = hfsc_class_cast__(queue);
3271 hfsc = hfsc_get__(netdev);
3273 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3275 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
/* Fetches statistics for 'queue' into 'stats' by querying kernel class
 * 1:(queue->queue_id + 1) under parent 1:fffe.  Class options are not
 * requested.  Returns whatever hfsc_query_class__() returns. */
3282 hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3283 struct netdev_queue_stats *stats)
3285 return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3286 tc_make_handle(1, 0xfffe), NULL, stats);
/* Handles one class message 'nlmsg' from a statistics dump.
 *
 * The handle and statistics are extracted with tc_parse_class().  When the
 * handle is 1:N with 1 <= N <= HFSC_N_QUEUES, 'cb' is invoked with the
 * zero-based queue ID N - 1, the statistics and 'aux'.  'netdev' is unused. */
3290 hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3291 const struct ofpbuf *nlmsg,
3292 netdev_dump_queue_stats_cb *cb, void *aux)
3294 struct netdev_queue_stats stats;
3295 unsigned int handle, major, minor;
3298 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3303 major = tc_get_major(handle);
3304 minor = tc_get_minor(handle);
3305 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3306 (*cb)(minor - 1, &stats, aux);
/* Operations table for the "linux-hfsc" QoS type, backed by the Linux "hfsc"
 * qdisc.  Members are positional; the trailing comments name each slot. */
3311 static const struct tc_ops tc_ops_hfsc = {
3312 "hfsc", /* linux_name */
3313 "linux-hfsc", /* ovs_name */
3314 HFSC_N_QUEUES, /* n_queues */
3315 hfsc_tc_install, /* tc_install */
3316 hfsc_tc_load, /* tc_load */
3317 hfsc_tc_destroy, /* tc_destroy */
3318 hfsc_qdisc_get, /* qdisc_get */
3319 hfsc_qdisc_set, /* qdisc_set */
3320 hfsc_class_get, /* class_get */
3321 hfsc_class_set, /* class_set */
3322 hfsc_class_delete, /* class_delete */
3323 hfsc_class_get_stats, /* class_get_stats */
3324 hfsc_class_dump_stats /* class_dump_stats */
3327 /* "linux-default" traffic control class.
3329 * This class represents the default, unnamed Linux qdisc. It corresponds to
3330 * the "" (empty string) QoS type in the OVS database. */
/* Attaches the "linux-default" tc to the netdev_dev of 'netdev'.
 *
 * 'tc' has static storage duration, so the pointer persists across calls and
 * is shared by every caller rather than being per-device. */
3333 default_install__(struct netdev *netdev)
3335 struct netdev_dev_linux *netdev_dev =
3336 netdev_dev_linux_cast(netdev_get_dev(netdev));
3337 static struct tc *tc;
3340 tc = xmalloc(sizeof *tc);
3341 tc_init(tc, &tc_ops_default);
3343 netdev_dev->tc = tc;
/* tc_install hook for the default qdisc: attaches the shared default tc to
 * 'netdev'.  'details' is unused. */
3347 default_tc_install(struct netdev *netdev,
3348 const struct shash *details OVS_UNUSED)
3350 default_install__(netdev);
/* tc_load hook for the default qdisc: attaches the shared default tc to
 * 'netdev'.  'nlmsg' is unused. */
3355 default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3357 default_install__(netdev);
/* Operations table for the default qdisc.  It has no Linux qdisc name, and the
 * tc_destroy, qdisc and class slots are all NULL. */
3361 static const struct tc_ops tc_ops_default = {
3362 NULL, /* linux_name */
3367 NULL, /* tc_destroy */
3368 NULL, /* qdisc_get */
3369 NULL, /* qdisc_set */
3370 NULL, /* class_get */
3371 NULL, /* class_set */
3372 NULL, /* class_delete */
3373 NULL, /* class_get_stats */
3374 NULL /* class_dump_stats */
3377 /* "linux-other" traffic control class.
/* tc_load hook for the "linux-other" class: attaches a tc initialized with
 * tc_ops_other to the netdev_dev of 'netdev'.  'nlmsg' is unused.
 *
 * 'tc' has static storage duration, so the pointer persists across calls and
 * is shared by every caller rather than being per-device. */
3382 other_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3384 struct netdev_dev_linux *netdev_dev =
3385 netdev_dev_linux_cast(netdev_get_dev(netdev));
3386 static struct tc *tc;
3389 tc = xmalloc(sizeof *tc);
3390 tc_init(tc, &tc_ops_other);
3392 netdev_dev->tc = tc;
/* Operations table for the "linux-other" QoS type.  It has no Linux qdisc
 * name and cannot be installed (tc_install is NULL); the tc_destroy, qdisc
 * and class slots are all NULL as well. */
3396 static const struct tc_ops tc_ops_other = {
3397 NULL, /* linux_name */
3398 "linux-other", /* ovs_name */
3400 NULL, /* tc_install */
3402 NULL, /* tc_destroy */
3403 NULL, /* qdisc_get */
3404 NULL, /* qdisc_set */
3405 NULL, /* class_get */
3406 NULL, /* class_set */
3407 NULL, /* class_delete */
3408 NULL, /* class_get_stats */
3409 NULL /* class_dump_stats */
3412 /* Traffic control. */
3414 /* Number of kernel "tc" ticks per second. */
/* Derived from the values a, b, c read from /proc/net/psched as a * c / b. */
3415 static double ticks_per_s;
3417 /* Number of kernel "jiffies" per second. This is used for the purpose of
3418 * computing buffer sizes. Generally kernel qdiscs need to be able to buffer
3419 * one jiffy's worth of data.
3421 * There are two possibilities here:
3423 * - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
3424 * approximate range of 100 to 1024. That means that we really need to
3425 * make sure that the qdisc can buffer that much data.
3427 * - 'buffer_hz' is an absurdly large number. That means that the kernel
3428 * has finely granular timers and there's no need to fudge additional room
3429 * for buffers. (There's no extra effort needed to implement that: the
3430 * large 'buffer_hz' is used as a divisor, so practically any number will
3431 * come out as 0 in the division. Small integer results in the case of
3432 * really high dividends won't have any real effect anyhow.)
3434 static unsigned int buffer_hz;
3436 /* Returns tc handle 'major':'minor', built with TC_H_MAKE from 'major'
 * shifted into the upper 16 bits and 'minor'. */
3438 tc_make_handle(unsigned int major, unsigned int minor)
3440 return TC_H_MAKE(major << 16, minor);
3443 /* Returns the major number from 'handle', i.e. TC_H_MAJ(handle) shifted
 * down by 16 bits (the inverse of tc_make_handle()). */
3445 tc_get_major(unsigned int handle)
3447 return TC_H_MAJ(handle) >> 16;
3450 /* Returns the minor number from 'handle', i.e. TC_H_MIN(handle). */
3452 tc_get_minor(unsigned int handle)
3454 return TC_H_MIN(handle);
/* Starts a traffic-control Netlink request for 'netdev' in 'request'.
 *
 * 'request' is initialized with a Netlink header of the given 'type' and
 * NLM_F_REQUEST | 'flags', followed by a zeroed struct tcmsg whose family is
 * AF_UNSPEC and whose ifindex is that of 'netdev' (from get_ifindex()).  The
 * tcmsg's handle and parent are left for the caller to fill in. */
3457 static struct tcmsg *
3458 tc_make_request(const struct netdev *netdev, int type, unsigned int flags,
3459 struct ofpbuf *request)
3461 struct tcmsg *tcmsg;
3465 error = get_ifindex(netdev, &ifindex);
3470 ofpbuf_init(request, 512);
3471 nl_msg_put_nlmsghdr(request, sizeof *tcmsg, type, NLM_F_REQUEST | flags);
3472 tcmsg = ofpbuf_put_zeros(request, sizeof *tcmsg);
3473 tcmsg->tcm_family = AF_UNSPEC;
3474 tcmsg->tcm_ifindex = ifindex;
3475 /* Caller should fill in tcmsg->tcm_handle. */
3476 /* Caller should fill in tcmsg->tcm_parent. */
/* Sends 'request' on rtnl_sock with nl_sock_transact(), passing 'replyp'
 * through for the reply.
 *
 * 'request' is always uninitialized afterwards, whatever the outcome, so the
 * caller must not use it again. */
3482 tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
3484 int error = nl_sock_transact(rtnl_sock, request, replyp);
3485 ofpbuf_uninit(request);
3489 /* Adds or deletes a root ingress qdisc on 'netdev'. We use this for
3490 * policing configuration.
3492 * This function is equivalent to running the following when 'add' is true:
3493 * /sbin/tc qdisc add dev <devname> handle ffff: ingress
3495 * This function is equivalent to running the following when 'add' is false:
3496 * /sbin/tc qdisc del dev <devname> handle ffff: ingress
3498 * The configuration and stats may be seen with the following command:
3499 * /sbin/tc -s qdisc show dev <devname>
3501 * Returns 0 if successful, otherwise a positive errno value.
/* Builds and sends the request for the "ingress" qdisc with handle ffff:0
 * under TC_H_INGRESS: RTM_NEWQDISC with NLM_F_EXCL | NLM_F_CREATE when 'add'
 * is true, RTM_DELQDISC with no extra flags otherwise. */
3504 tc_add_del_ingress_qdisc(struct netdev *netdev, bool add)
3506 struct ofpbuf request;
3507 struct tcmsg *tcmsg;
3509 int type = add ? RTM_NEWQDISC : RTM_DELQDISC;
3510 int flags = add ? NLM_F_EXCL | NLM_F_CREATE : 0;
3512 tcmsg = tc_make_request(netdev, type, flags, &request);
3516 tcmsg->tcm_handle = tc_make_handle(0xffff, 0);
3517 tcmsg->tcm_parent = TC_H_INGRESS;
3518 nl_msg_put_string(&request, TCA_KIND, "ingress");
/* The ingress qdisc takes an empty TCA_OPTIONS attribute. */
3519 nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0);
3521 error = tc_transact(&request, NULL);
3523 /* If we're deleting the qdisc, don't worry about some of the
3524 * error conditions. */
3525 if (!add && (error == ENOENT || error == EINVAL)) {
3534 /* Adds a policer to 'netdev' with a rate of 'kbits_rate' and a burst size
3537 * This function is equivalent to running:
3538 * /sbin/tc filter add dev <devname> parent ffff: protocol all prio 49
3539 * basic police rate <kbits_rate>kbit burst <kbits_burst>k
3542 * The configuration and stats may be seen with the following command:
3543 * /sbin/tc -s filter show dev <devname> parent ffff:
3545 * Returns 0 if successful, otherwise a positive errno value.
/* Installs a "basic" filter with a police action on the ingress qdisc (parent
 * ffff:) of 'netdev' at priority 49 for all protocols.  Packets that exceed
 * 'kbits_rate' kbit/s are dropped (TC_POLICE_SHOT); the burst allowance is
 * derived from 'kbits_burst'.
 *
 * The rate handed to the kernel is in bytes per second.  It is computed as
 * kbits_rate * 1000 / 8 with the multiplication done first, in 64 bits, so
 * that rates that are not a multiple of 8 kbit are not rounded down to one
 * (rates below 8 kbit used to become 0) and large rates cannot overflow
 * 'int'. */
3548 tc_add_policer(struct netdev *netdev, int kbits_rate, int kbits_burst)
3550 struct tc_police tc_police;
3551 struct ofpbuf request;
3552 struct tcmsg *tcmsg;
3553 size_t basic_offset;
3554 size_t police_offset;
3558 memset(&tc_police, 0, sizeof tc_police);
3559 tc_police.action = TC_POLICE_SHOT;
3560 tc_police.mtu = mtu;
3561 tc_fill_rate(&tc_police.rate, ((uint64_t) kbits_rate * 1000) / 8, mtu);
3562 tc_police.burst = tc_bytes_to_ticks(tc_police.rate.rate,
3563 kbits_burst * 1024);
3565 tcmsg = tc_make_request(netdev, RTM_NEWTFILTER,
3566 NLM_F_EXCL | NLM_F_CREATE, &request);
3570 tcmsg->tcm_parent = tc_make_handle(0xffff, 0);
/* tcm_info packs the filter priority (49) and the protocol (ETH_P_ALL in
 * network byte order) in the same layout as a handle. */
3571 tcmsg->tcm_info = tc_make_handle(49,
3572 (OVS_FORCE uint16_t) htons(ETH_P_ALL));
3574 nl_msg_put_string(&request, TCA_KIND, "basic");
3575 basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3576 police_offset = nl_msg_start_nested(&request, TCA_BASIC_POLICE);
3577 nl_msg_put_unspec(&request, TCA_POLICE_TBF, &tc_police, sizeof tc_police);
3578 tc_put_rtab(&request, TCA_POLICE_RATE, &tc_police.rate);
3579 nl_msg_end_nested(&request, police_offset);
3580 nl_msg_end_nested(&request, basic_offset);
3582 error = tc_transact(&request, NULL);
/* Body of the routine that reads the kernel's packet scheduler parameters.
 *
 * Four hexadecimal values a, b, c, d are read from /proc/net/psched.
 * 'ticks_per_s' is computed from them as a * c / b.  Warnings are logged when
 * the file cannot be opened or parsed, when the parameters are invalid, and
 * when they are unexpected; the resulting 'ticks_per_s' and 'buffer_hz' are
 * logged at debug level. */
3593 /* The values in psched are not individually very meaningful, but they are
3594 * important. The tables below show some values seen in the wild.
3598 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
3599 * (Before that, there are hints that it was 1000000000.)
3601 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
3605 * -----------------------------------
3606 * [1] 000c8000 000f4240 000f4240 00000064
3607 * [2] 000003e8 00000400 000f4240 3b9aca00
3608 * [3] 000003e8 00000400 000f4240 3b9aca00
3609 * [4] 000003e8 00000400 000f4240 00000064
3610 * [5] 000003e8 00000040 000f4240 3b9aca00
3611 * [6] 000003e8 00000040 000f4240 000000f9
3613 * a b c d ticks_per_s buffer_hz
3614 * ------- --------- ---------- ------------- ----------- -------------
3615 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
3616 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3617 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3618 * [4] 1,000 1,024 1,000,000 100 976,562 100
3619 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
3620 * [6] 1,000 64 1,000,000 249 15,625,000 249
3622 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
3623 * [2] 2.6.26-1-686-bigmem from Debian lenny
3624 * [3] 2.6.26-2-sparc64 from Debian lenny
3625 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
3626 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
3627 * [6] 2.6.34 from kernel.org on KVM
3629 static const char fn[] = "/proc/net/psched";
3630 unsigned int a, b, c, d;
3636 stream = fopen(fn, "r");
3638 VLOG_WARN("%s: open failed: %s", fn, strerror(errno));
3642 if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
3643 VLOG_WARN("%s: read failed", fn);
3647 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
3651 VLOG_WARN("%s: invalid scheduler parameters", fn);
3655 ticks_per_s = (double) a * c / b;
3659 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
3662 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);
3665 /* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
3666 * rate of 'rate' bytes per second.
 *
 * The product 'rate' * 'ticks' is formed in 64 bits.  Both operands are
 * 32-bit, and realistic values (for example 125,000,000 bytes/s times a few
 * thousand ticks) exceed UINT_MAX, so a 32-bit product would wrap before the
 * division by 'ticks_per_s' and yield a far too small byte count. */
3668 tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
3673 return ((unsigned long long int) rate * ticks) / ticks_per_s;
3676 /* Returns the number of ticks that it would take to transmit 'size' bytes at a
3677 * rate of 'rate' bytes per second.
 *
 * A 'rate' of 0 yields 0 rather than dividing by zero.  'ticks_per_s' is
 * converted to an integer before the multiplication, which is carried out in
 * unsigned long long. */
3679 tc_bytes_to_ticks(unsigned int rate, unsigned int size)
3684 return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
3687 /* Returns the number of bytes that need to be reserved for qdisc buffering at
3688 * a transmission rate of 'rate' bytes per second.
 *
 * This is 'rate' divided by 'buffer_hz' using integer division, so a very
 * large 'buffer_hz' gives 0. */
3690 tc_buffer_per_jiffy(unsigned int rate)
3695 return rate / buffer_hz;
3698 /* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
3699 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
3700 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
3701 * stores NULL into it if it is absent.
3703 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
 * 'msg'.
 *
 * The attributes are expected after the Netlink header and the struct tcmsg.
 * TCA_KIND is a mandatory string and TCA_OPTIONS an optional nested
 * attribute; a rate-limited warning is logged when that policy is not met.
 *
3706 * Returns 0 if successful, otherwise a positive errno value. */
3708 tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
3709 struct nlattr **options)
3711 static const struct nl_policy tca_policy[] = {
3712 [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
3713 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
3715 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3717 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3718 tca_policy, ta, ARRAY_SIZE(ta))) {
3719 VLOG_WARN_RL(&rl, "failed to parse qdisc message");
3724 *kind = nl_attr_get_string(ta[TCA_KIND]);
3728 *options = ta[TCA_OPTIONS];
3743 /* Given Netlink 'msg' that describes a class, extracts the class's full tc
3744 * handle (the raw tcm_handle; callers split it with tc_get_major() and
 * tc_get_minor()) into '*handlep', its TCA_OPTIONS attribute
3745 * into '*options', and its queue statistics into '*stats'. Any of the output
3746 * arguments may be null.
 *
 * Both TCA_OPTIONS and TCA_STATS2 are mandatory nested attributes.  Within
 * TCA_STATS2, TCA_STATS_BASIC supplies tx_bytes and tx_packets, and the
 * 'drops' counter of TCA_STATS_QUEUE is reported as tx_errors.
 *
3748 * Returns 0 if successful, otherwise a positive errno value. */
3750 tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
3751 struct nlattr **options, struct netdev_queue_stats *stats)
3753 static const struct nl_policy tca_policy[] = {
3754 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
3755 [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
3757 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3759 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3760 tca_policy, ta, ARRAY_SIZE(ta))) {
3761 VLOG_WARN_RL(&rl, "failed to parse class message");
/* The struct tcmsg sits immediately after the Netlink header. */
3766 struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
3767 *handlep = tc->tcm_handle;
3771 *options = ta[TCA_OPTIONS];
3775 const struct gnet_stats_queue *gsq;
3776 struct gnet_stats_basic gsb;
3778 static const struct nl_policy stats_policy[] = {
3779 [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
3780 .min_len = sizeof gsb },
3781 [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
3782 .min_len = sizeof *gsq },
3784 struct nlattr *sa[ARRAY_SIZE(stats_policy)];
3786 if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
3787 sa, ARRAY_SIZE(sa))) {
3788 VLOG_WARN_RL(&rl, "failed to parse class stats");
3792 /* Alignment issues screw up the length of struct gnet_stats_basic on
3793 * some arch/bitsize combinations. Newer versions of Linux have a
3794 * struct gnet_stats_basic_packed, but we can't depend on that. The
3795 * easiest thing to do is just to make a copy. */
3796 memset(&gsb, 0, sizeof gsb);
3797 memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
3798 MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
3799 stats->tx_bytes = gsb.bytes;
3800 stats->tx_packets = gsb.packets;
3802 gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
3803 stats->tx_errors = gsq->drops;
3813 memset(stats, 0, sizeof *stats);
3818 /* Queries the kernel for class with identifier 'handle' and parent 'parent'
/* Sends an RTM_GETTCLASS request (with NLM_F_ECHO) for class 'handle' with
 * parent 'parent' on 'netdev' and passes 'replyp' to tc_transact() for the
 * reply.  A rate-limited warning that names the device, class and parent is
 * logged for a failed query. */
3821 tc_query_class(const struct netdev *netdev,
3822 unsigned int handle, unsigned int parent,
3823 struct ofpbuf **replyp)
3825 struct ofpbuf request;
3826 struct tcmsg *tcmsg;
3829 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO, &request);
3833 tcmsg->tcm_handle = handle;
3834 tcmsg->tcm_parent = parent;
3836 error = tc_transact(&request, replyp);
3838 VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
3839 netdev_get_name(netdev),
3840 tc_get_major(handle), tc_get_minor(handle),
3841 tc_get_major(parent), tc_get_minor(parent),
3847 /* Equivalent to "tc class del dev <name> handle <handle>".
 *
 * Sends an RTM_DELTCLASS request for class 'handle' with parent 0 on
 * 'netdev'.  A rate-limited warning that names the device and class is logged
 * for a failed deletion. */
3849 tc_delete_class(const struct netdev *netdev, unsigned int handle)
3851 struct ofpbuf request;
3852 struct tcmsg *tcmsg;
3855 tcmsg = tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
3859 tcmsg->tcm_handle = handle;
3860 tcmsg->tcm_parent = 0;
3862 error = tc_transact(&request, NULL);
3864 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
3865 netdev_get_name(netdev),
3866 tc_get_major(handle), tc_get_minor(handle),
3872 /* Equivalent to "tc qdisc del dev <name> root".
 *
 * Sends an RTM_DELQDISC request for handle 1:0 under TC_H_ROOT.  When the
 * deletion succeeds and 'netdev' has a cached tc, that tc's tc_destroy hook
 * is invoked if it has one, and the cached pointer is cleared. */
3874 tc_del_qdisc(struct netdev *netdev)
3876 struct netdev_dev_linux *netdev_dev =
3877 netdev_dev_linux_cast(netdev_get_dev(netdev));
3878 struct ofpbuf request;
3879 struct tcmsg *tcmsg;
3882 tcmsg = tc_make_request(netdev, RTM_DELQDISC, 0, &request);
3886 tcmsg->tcm_handle = tc_make_handle(1, 0);
3887 tcmsg->tcm_parent = TC_H_ROOT;
3889 error = tc_transact(&request, NULL);
3890 if (error == EINVAL) {
3891 /* EINVAL probably means that the default qdisc was in use, in which
3892 * case we've accomplished our purpose. */
3895 if (!error && netdev_dev->tc) {
/* tc_destroy is optional: the default and "other" tables leave it NULL. */
3896 if (netdev_dev->tc->ops->tc_destroy) {
3897 netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
3899 netdev_dev->tc = NULL;
3904 /* If 'netdev''s qdisc type and parameters are not yet known, queries the
3905 * kernel to determine what they are. Returns 0 if successful, otherwise a
3906 * positive errno value.
 *
 * The qdisc with handle 1:0 is requested.  Its kind is looked up with
 * tc_lookup_linux_name(); tc_ops_other is used for a kind that is unknown, and
 * tc_ops_default when the query fails with ENOENT.  The selected table's
 * tc_load hook is then called to instantiate the tc.  The return value is the
 * query error if there was one, otherwise the tc_load result. */
3908 tc_query_qdisc(const struct netdev *netdev)
3910 struct netdev_dev_linux *netdev_dev =
3911 netdev_dev_linux_cast(netdev_get_dev(netdev));
3912 struct ofpbuf request, *qdisc;
3913 const struct tc_ops *ops;
3914 struct tcmsg *tcmsg;
3918 if (netdev_dev->tc) {
3922 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
3923 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
3924 * 2.6.35 without that fix backported to it.
3926 * To avoid the OOPS, we must not make a request that would attempt to dump
3927 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
3928 * few others. There are a few ways that I can see to do this, but most of
3929 * them seem to be racy (and if you lose the race the kernel OOPSes). The
3930 * technique chosen here is to assume that any non-default qdisc that we
3931 * create will have a class with handle 1:0. The built-in qdiscs only have
3932 * a class with handle 0:0.
3934 * We could check for Linux 2.6.35+ and use a more straightforward method
3936 tcmsg = tc_make_request(netdev, RTM_GETQDISC, NLM_F_ECHO, &request);
3940 tcmsg->tcm_handle = tc_make_handle(1, 0);
3941 tcmsg->tcm_parent = 0;
3943 /* Figure out what tc class to instantiate. */
3944 error = tc_transact(&request, &qdisc);
3948 error = tc_parse_qdisc(qdisc, &kind, NULL);
3950 ops = &tc_ops_other;
3952 ops = tc_lookup_linux_name(kind);
3954 static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
3955 VLOG_INFO_RL(&rl2, "unknown qdisc \"%s\"", kind);
3957 ops = &tc_ops_other;
3960 } else if (error == ENOENT) {
3961 /* Either it's a built-in qdisc, or it's a qdisc set up by some
3962 * other entity that doesn't have a handle 1:0. We will assume
3963 * that it's the system default qdisc. */
3964 ops = &tc_ops_default;
3967 /* Who knows? Maybe the device got deleted. */
3968 VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
3969 netdev_get_name(netdev), strerror(error));
3970 ops = &tc_ops_other;
3973 /* Instantiate it. */
/* The const is cast away because tc_load takes a non-const netdev. */
3974 load_error = ops->tc_load((struct netdev *) netdev, qdisc);
/* tc_load must leave netdev_dev->tc set exactly when it reports success. */
3975 assert((load_error == 0) == (netdev_dev->tc != NULL));
3976 ofpbuf_delete(qdisc);
3978 return error ? error : load_error;
3981 /* Linux traffic control uses tables with 256 entries ("rtab" tables) to
3982 approximate the time to transmit packets of various lengths. For an MTU of
3983 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
3984 represents two possible packet lengths; for a MTU of 513 through 1024, four
3985 possible lengths; and so on.
3987 Returns, for the specified 'mtu', the number of bits that packet lengths
3988 need to be shifted right to fit within such a 256-entry table. */
/* Implementation notes from the visible lines:
 *
 *   - 'mtu' can be replaced by ETH_PAYLOAD_MAX, and it is then enlarged by
 *     ETH_HEADER_LEN + VLAN_HEADER_LEN, so the shift is computed for the
 *     frame size with headers rather than for the bare payload size.
 *
 *   - 'cell_log' counts the iterations of a loop that runs while 'mtu' is
 *     still >= 256.
 *
 * NOTE(review): the condition that selects the ETH_PAYLOAD_MAX fallback, the
 * loop body (presumably a right shift of 'mtu' by one bit), the declaration
 * of 'cell_log', the return type and the return statement are among the
 * short lines missing from this listing -- confirm against the complete
 * source. */
3990 tc_calc_cell_log(unsigned int mtu)
3995 mtu = ETH_PAYLOAD_MAX;
3997 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
3999 for (cell_log = 0; mtu >= 256; cell_log++) {
/* tc_fill_rate(): visible effects on '*rate'
 *
 *   - the whole structure is zero-filled with memset();
 *   - cell_log is set to tc_calc_cell_log(mtu);
 *   - mpu is set to ETH_TOTAL_MIN.
 *
 * The 'overhead' and 'cell_align' members are deliberately not assigned by
 * name: according to the inline remarks they were added in Linux 2.6.24 and
 * are absent from some distribution headers.  The memset() still zeroes
 * whatever members the structure has.
 *
 * NOTE(review): the original header comment that follows has lost its last
 * line in this listing, and the assignments that store 'Bps' and 'mtu' into
 * '*rate' (if any) are among the missing short lines -- confirm against the
 * complete source. */
4006 /* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
4009 tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
4011 memset(rate, 0, sizeof *rate);
4012 rate->cell_log = tc_calc_cell_log(mtu);
4013 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
4014 /* rate->cell_align = 0; */ /* distro headers. */
4015 rate->mpu = ETH_TOTAL_MIN;
4019 /* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
4020 * attribute of the specified "type".
4022 * See tc_calc_cell_log() above for a description of "rtab"s. */
/* The attribute payload is TC_RTAB_SIZE bytes and is treated as an array of
 * TC_RTAB_SIZE / sizeof *rtab entries.  Entry 'i' stands for packets of up to
 * (i + 1) << rate->cell_log bytes; sizes below rate->mpu are raised to
 * rate->mpu, and the size is converted with tc_bytes_to_ticks() at
 * rate->rate.
 *
 * NOTE(review): the declarations of 'rtab' and 'i', the return type and the
 * closing braces are among the short lines missing from this listing. */
4024 tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
/* As the helper's name suggests, the space it reserves is presumably left
 * uninitialized; the loop below writes every entry. */
4029 rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
4030 for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
4031 unsigned packet_size = (i + 1) << rate->cell_log;
4032 if (packet_size < rate->mpu) {
4033 packet_size = rate->mpu;
4035 rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
/* tc_calc_buffer(): returns tc_bytes_to_ticks(Bps, burst), where 'burst' is
 * the larger of the caller's 'burst_bytes' and a floor of
 * tc_buffer_per_jiffy(Bps) + mtu.  A 'burst_bytes' smaller than that floor
 * (including 0) therefore yields the floor.
 *
 * NOTE(review): the original header comment that follows has lost its last
 * line in this listing; the return type and braces are missing as well. */
4039 /* Calculates the proper value of 'buffer' or 'cbuffer' in HTB options given a
4040 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
4041 * burst size of 'burst_bytes'. (If no value was requested, a 'burst_bytes' of
4044 tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
4046 unsigned int min_burst = tc_buffer_per_jiffy(Bps) + mtu;
4047 return tc_bytes_to_ticks(Bps, MAX(burst_bytes, min_burst));
4050 /* Linux-only functions declared in netdev-linux.h */
4052 /* Returns a fd for an AF_INET socket or a negative errno value. */
/* netdev_linux_init() is called first.  If it returns nonzero, that value is
 * negated and returned; otherwise the file-level 'af_inet_sock' descriptor is
 * returned.  Nothing in the visible lines duplicates the descriptor, so the
 * caller receives the shared one. */
4054 netdev_linux_get_af_inet_sock(void)
4056 int error = netdev_linux_init();
4057 return error ? -error : af_inet_sock;
4060 /* Modifies the 'flag' bit in ethtool's flags field for 'netdev'. If
4061 * 'enable' is true, the bit is set. Otherwise, it is cleared. */
/* Works in three ethtool round trips on the device named by 'netdev':
 *
 *   1. ETHTOOL_GFLAGS reads the current flags into 'evalue'.
 *   2. ETHTOOL_SFLAGS writes them back with 'flag' cleared or set.
 *   3. ETHTOOL_GFLAGS reads the flags again; a rate-limited warning that uses
 *      'flag_name' is logged if they differ from what was written.
 *
 * 'evalue' is a struct ethtool_value and is passed through a cast because
 * netdev_linux_do_ethtool() takes a struct ethtool_cmd pointer.
 *
 * NOTE(review): the return type, the declarations of 'new_flags' and
 * 'error', the error checks after each round trip and the return statements
 * are among the short lines missing from this listing -- confirm against the
 * complete source. */
4063 netdev_linux_ethtool_set_flag(struct netdev *netdev, uint32_t flag,
4064 const char *flag_name, bool enable)
4066 const char *netdev_name = netdev_get_name(netdev);
4067 struct ethtool_value evalue;
/* Step 1: fetch the current flags. */
4071 memset(&evalue, 0, sizeof evalue);
4072 error = netdev_linux_do_ethtool(netdev_name,
4073 (struct ethtool_cmd *)&evalue,
4074 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
/* Step 2: clear 'flag', then set it again only when 'enable' is true.  The
 * value written is kept in 'new_flags' for the comparison in step 3. */
4079 evalue.data = new_flags = (evalue.data & ~flag) | (enable ? flag : 0);
4080 error = netdev_linux_do_ethtool(netdev_name,
4081 (struct ethtool_cmd *)&evalue,
4082 ETHTOOL_SFLAGS, "ETHTOOL_SFLAGS");
/* Step 3: read the flags back into a freshly zeroed 'evalue'. */
4087 memset(&evalue, 0, sizeof evalue);
4088 error = netdev_linux_do_ethtool(netdev_name,
4089 (struct ethtool_cmd *)&evalue,
4090 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
/* The value read back differs from the value that was written. */
4095 if (new_flags != evalue.data) {
4096 VLOG_WARN_RL(&rl, "attempt to %s ethtool %s flag on network "
4097 "device %s failed", enable ? "enable" : "disable",
4098 flag_name, netdev_name);
4105 /* Utility functions. */
4107 /* Copies 'src' into 'dst', performing format conversion in the process. */
/* All 21 counters below are copied member by member, and each member has the
 * same name in both structures.  No other member of '*dst' is written by the
 * visible lines.
 *
 * NOTE(review): presumably the "format conversion" is just the implicit
 * integer conversion performed by each assignment; the two structure
 * definitions are not visible here, so verify the member widths. */
4109 netdev_stats_from_rtnl_link_stats(struct netdev_stats *dst,
4110 const struct rtnl_link_stats *src)
4112 dst->rx_packets = src->rx_packets;
4113 dst->tx_packets = src->tx_packets;
4114 dst->rx_bytes = src->rx_bytes;
4115 dst->tx_bytes = src->tx_bytes;
4116 dst->rx_errors = src->rx_errors;
4117 dst->tx_errors = src->tx_errors;
4118 dst->rx_dropped = src->rx_dropped;
4119 dst->tx_dropped = src->tx_dropped;
4120 dst->multicast = src->multicast;
4121 dst->collisions = src->collisions;
4122 dst->rx_length_errors = src->rx_length_errors;
4123 dst->rx_over_errors = src->rx_over_errors;
4124 dst->rx_crc_errors = src->rx_crc_errors;
4125 dst->rx_frame_errors = src->rx_frame_errors;
4126 dst->rx_fifo_errors = src->rx_fifo_errors;
4127 dst->rx_missed_errors = src->rx_missed_errors;
4128 dst->tx_aborted_errors = src->tx_aborted_errors;
4129 dst->tx_carrier_errors = src->tx_carrier_errors;
4130 dst->tx_fifo_errors = src->tx_fifo_errors;
4131 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
4132 dst->tx_window_errors = src->tx_window_errors;
/* Retrieves the statistics of the interface with index 'ifindex' over
 * rtnetlink and stores them in '*stats'.
 *
 * An RTM_GETLINK request carrying a zeroed ifinfomsg (family PF_UNSPEC, index
 * 'ifindex') is sent on 'rtnl_sock'.  The reply's attributes are parsed with
 * a policy that requires IFLA_IFNAME and accepts an optional IFLA_STATS of at
 * least sizeof(struct rtnl_link_stats) bytes.  When IFLA_STATS is present its
 * payload is converted with netdev_stats_from_rtnl_link_stats().  The reply
 * buffer is freed on each of the three visible exit paths.
 *
 * NOTE(review): the return type, the declaration of 'error', the check that
 * follows nl_sock_transact() and every return statement are among the short
 * lines missing from this listing, so the exact error values cannot be
 * stated here -- confirm against the complete source. */
4136 get_stats_via_netlink(int ifindex, struct netdev_stats *stats)
4138 /* Policy for RTNLGRP_LINK messages.
4140 * There are *many* more fields in these messages, but currently we only
4141 * care about these fields. */
4142 static const struct nl_policy rtnlgrp_link_policy[] = {
4143 [IFLA_IFNAME] = { .type = NL_A_STRING, .optional = false },
4144 [IFLA_STATS] = { .type = NL_A_UNSPEC, .optional = true,
4145 .min_len = sizeof(struct rtnl_link_stats) },
4148 struct ofpbuf request;
4149 struct ofpbuf *reply;
4150 struct ifinfomsg *ifi;
4151 struct nlattr *attrs[ARRAY_SIZE(rtnlgrp_link_policy)];
/* Build and send the request.  The request buffer is released right after the
 * transaction, whatever its outcome. */
4154 ofpbuf_init(&request, 0);
4155 nl_msg_put_nlmsghdr(&request, sizeof *ifi, RTM_GETLINK, NLM_F_REQUEST);
4156 ifi = ofpbuf_put_zeros(&request, sizeof *ifi);
4157 ifi->ifi_family = PF_UNSPEC;
4158 ifi->ifi_index = ifindex;
4159 error = nl_sock_transact(rtnl_sock, &request, &reply);
4160 ofpbuf_uninit(&request);
/* Attributes start after the netlink header and the ifinfomsg. */
4165 if (!nl_policy_parse(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg),
4166 rtnlgrp_link_policy,
4167 attrs, ARRAY_SIZE(rtnlgrp_link_policy))) {
4168 ofpbuf_delete(reply);
/* IFLA_STATS is optional in the policy, so a successful parse does not
 * guarantee that it is present. */
4172 if (!attrs[IFLA_STATS]) {
4173 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
4174 ofpbuf_delete(reply);
4178 netdev_stats_from_rtnl_link_stats(stats, nl_attr_get(attrs[IFLA_STATS]));
4180 ofpbuf_delete(reply);
/* Fills '*stats' for the device named 'netdev_name' by scanning the text file
 * "/proc/net/dev" line by line with fgets() and sscanf().
 *
 * A line counts as parsed only if sscanf() reports exactly 15 conversions.
 * The visible part of the format holds two groups of seven 64-bit counters
 * (X64 expands to "%" followed by SCNu64), each group followed by one field
 * that is read and discarded ("%*u").  A line that does not match produces a
 * rate-limited "parse error" warning with its line number.  For a line whose
 * device name equals 'netdev_name', the counters assigned in that branch are
 * set to UINT64_MAX.  The last warning reports that the file held no
 * statistics for 'netdev_name'.
 *
 * NOTE(review): presumably the 15th conversion is the device name stored in
 * 'devname', and UINT64_MAX marks counters for which the line supplies no
 * value.  The start of the format string, most of the sscanf() argument
 * list, the declarations of 'line', 'stream', 'ln' and 'devname', the
 * closing of 'stream' and every return statement are among the lines missing
 * from this listing -- confirm against the complete source. */
4186 get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats)
4188 static const char fn[] = "/proc/net/dev";
4193 stream = fopen(fn, "r");
4195 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
4200 while (fgets(line, sizeof line, stream)) {
4203 #define X64 "%"SCNu64
4206 X64 X64 X64 X64 X64 X64 X64 "%*u"
4207 X64 X64 X64 X64 X64 X64 X64 "%*u",
4213 &stats->rx_fifo_errors,
4214 &stats->rx_frame_errors,
4220 &stats->tx_fifo_errors,
4222 &stats->tx_carrier_errors) != 15) {
4223 VLOG_WARN_RL(&rl, "%s:%d: parse error", fn, ln);
4224 } else if (!strcmp(devname, netdev_name)) {
4225 stats->rx_length_errors = UINT64_MAX;
4226 stats->rx_over_errors = UINT64_MAX;
4227 stats->rx_crc_errors = UINT64_MAX;
4228 stats->rx_missed_errors = UINT64_MAX;
4229 stats->tx_aborted_errors = UINT64_MAX;
4230 stats->tx_heartbeat_errors = UINT64_MAX;
4231 stats->tx_window_errors = UINT64_MAX;
4237 VLOG_WARN_RL(&rl, "%s: no stats for %s", fn, netdev_name);
/* Reads the interface flags of 'dev' with the SIOCGIFFLAGS ioctl, issued
 * through netdev_linux_do_ioctl() under the name dev->name, and stores
 * ifr.ifr_flags in '*flags'.
 *
 * NOTE(review): the return type, the declaration of 'ifr', the rest of the
 * ioctl call, any check of 'error' before '*flags' is written and the return
 * statement are among the lines missing from this listing -- confirm against
 * the complete source. */
4243 get_flags(const struct netdev_dev *dev, unsigned int *flags)
4249 error = netdev_linux_do_ioctl(dev->name, &ifr, SIOCGIFFLAGS,
4252 *flags = ifr.ifr_flags;
/* Sets the interface flags of 'netdev' to 'flags' with the SIOCSIFFLAGS
 * ioctl, issued through netdev_linux_do_ioctl() under the device's name, and
 * returns that helper's result unchanged.
 *
 * NOTE(review): the return type, the declaration of 'ifr' and the end of the
 * ioctl call are among the lines missing from this listing. */
4258 set_flags(struct netdev *netdev, unsigned int flags)
4262 ifr.ifr_flags = flags;
4263 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, SIOCSIFFLAGS,
/* Looks up the interface index of the device named 'netdev_name' with the
 * SIOCGIFINDEX ioctl on 'af_inet_sock'.  The name is copied into
 * ifr.ifr_name with ovs_strzcpy(), bounded by the size of that field.
 *
 * Returns ifr.ifr_ifindex when the ioctl succeeds.  When it fails, a
 * rate-limited warning with the errno text is logged.
 *
 * NOTE(review): the return type, the declaration of 'ifr' and the value
 * returned on failure are among the lines missing from this listing --
 * confirm against the complete source. */
4268 do_get_ifindex(const char *netdev_name)
4272 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4273 COVERAGE_INC(netdev_get_ifindex);
4274 if (ioctl(af_inet_sock, SIOCGIFINDEX, &ifr) < 0) {
4275 VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
4276 netdev_name, strerror(errno));
4279 return ifr.ifr_ifindex;
/* Stores the interface index of 'netdev_' in '*ifindexp', using a per-device
 * cache.
 *
 * The index is looked up with do_get_ifindex() only while the VALID_IFINDEX
 * bit is clear in netdev_dev->cache_valid; the result is then saved in
 * netdev_dev->ifindex and the bit is set, so later calls reuse the saved
 * value.
 *
 * NOTE(review): the return type, the handling of a failed lookup between the
 * call and the cache update, and the return statements are among the lines
 * missing from this listing -- confirm against the complete source. */
4283 get_ifindex(const struct netdev *netdev_, int *ifindexp)
4285 struct netdev_dev_linux *netdev_dev =
4286 netdev_dev_linux_cast(netdev_get_dev(netdev_));
4288 if (!(netdev_dev->cache_valid & VALID_IFINDEX)) {
4289 int ifindex = do_get_ifindex(netdev_get_name(netdev_));
4293 netdev_dev->cache_valid |= VALID_IFINDEX;
4294 netdev_dev->ifindex = ifindex;
4296 *ifindexp = netdev_dev->ifindex;
/* Reads the hardware address of the device named 'netdev_name' with the
 * SIOCGIFHWADDR ioctl on 'af_inet_sock' and copies the first ETH_ADDR_LEN
 * bytes of ifr.ifr_hwaddr.sa_data into 'ea'.
 *
 * A failed ioctl is logged at INFO level for ENODEV and at ERR level for any
 * other errno.  An address family other than AF_UNSPEC or ARPHRD_ETHER
 * produces a warning that is not rate-limited.
 *
 * NOTE(review): the return type, the declarations of 'ifr' and
 * 'hwaddr_family', the return statements and the closing braces are among
 * the lines missing from this listing.  In particular it cannot be seen here
 * whether the memcpy() also runs after the unknown-family warning -- confirm
 * against the complete source. */
4301 get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN])
4306 memset(&ifr, 0, sizeof ifr);
4307 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4308 COVERAGE_INC(netdev_get_hwaddr);
4309 if (ioctl(af_inet_sock, SIOCGIFHWADDR, &ifr) < 0) {
4310 /* ENODEV probably means that a vif disappeared asynchronously and
4311 * hasn't been removed from the database yet, so reduce the log level
4312 * to INFO for that case. */
4313 VLOG(errno == ENODEV ? VLL_INFO : VLL_ERR,
4314 "ioctl(SIOCGIFHWADDR) on %s device failed: %s",
4315 netdev_name, strerror(errno));
4318 hwaddr_family = ifr.ifr_hwaddr.sa_family;
4319 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER) {
4320 VLOG_WARN("%s device has unknown hardware address family %d",
4321 netdev_name, hwaddr_family);
4323 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
/* Sets the hardware address of the device named 'netdev_name' to the
 * ETH_ADDR_LEN bytes at 'mac' with the SIOCSIFHWADDR ioctl on 'af_inet_sock'.
 * 'hwaddr_family' is stored unchanged in ifr.ifr_hwaddr.sa_family.
 *
 * The request structure is zeroed first, so the bytes of sa_data beyond
 * ETH_ADDR_LEN are 0.  A failed ioctl is logged at ERR level, without rate
 * limiting, together with the errno text.
 *
 * NOTE(review): the return type, the declaration of 'ifr' and the return
 * statements are among the lines missing from this listing. */
4328 set_etheraddr(const char *netdev_name, int hwaddr_family,
4329 const uint8_t mac[ETH_ADDR_LEN])
4333 memset(&ifr, 0, sizeof ifr);
4334 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4335 ifr.ifr_hwaddr.sa_family = hwaddr_family;
4336 memcpy(ifr.ifr_hwaddr.sa_data, mac, ETH_ADDR_LEN);
4337 COVERAGE_INC(netdev_set_hwaddr);
4338 if (ioctl(af_inet_sock, SIOCSIFHWADDR, &ifr) < 0) {
4339 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
4340 netdev_name, strerror(errno));
/* Issues an ethtool request for the device named 'name' through the
 * SIOCETHTOOL ioctl on 'af_inet_sock', with ifr.ifr_data pointing at the
 * caller's 'ecmd' buffer.
 *
 * 'cmd_name' is used only in the log message.  A failure is logged with a
 * rate-limited warning unless errno is EOPNOTSUPP, which stays silent
 * because, per the comment below, unsupported operations are common.
 *
 * NOTE(review): 'cmd' is not used by any visible line; presumably it is
 * stored in '*ecmd' before the ioctl.  The return type, the declaration of
 * 'ifr' and the return statements are also among the lines missing from this
 * listing -- confirm against the complete source. */
4347 netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
4348 int cmd, const char *cmd_name)
4352 memset(&ifr, 0, sizeof ifr);
4353 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
4354 ifr.ifr_data = (caddr_t) ecmd;
4357 COVERAGE_INC(netdev_ethtool);
4358 if (ioctl(af_inet_sock, SIOCETHTOOL, &ifr) == 0) {
4361 if (errno != EOPNOTSUPP) {
4362 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
4363 "failed: %s", cmd_name, name, strerror(errno));
4365 /* The device doesn't support this operation. That's pretty
4366 * common, so there's no point in logging anything. */
/* Runs the ioctl 'cmd' on 'af_inet_sock' for the device named 'name'.
 *
 * 'name' is copied into ifr->ifr_name, bounded by the size of that field; the
 * other members of '*ifr' are passed to the kernel as the caller prepared
 * them.  'cmd_name' is used only in the debug-level, rate-limited message
 * logged when ioctl() returns -1.
 *
 * NOTE(review): the return type, the end of the log call and the return
 * statements are among the lines missing from this listing. */
4373 netdev_linux_do_ioctl(const char *name, struct ifreq *ifr, int cmd,
4374 const char *cmd_name)
4376 ovs_strzcpy(ifr->ifr_name, name, sizeof ifr->ifr_name);
4377 if (ioctl(af_inet_sock, cmd, ifr) == -1) {
4378 VLOG_DBG_RL(&rl, "%s: ioctl(%s) failed: %s", name, cmd_name,
/* Queries an IPv4 address of 'netdev' with the ioctl 'cmd' (named 'cmd_name'
 * for logging) and stores it in '*ip'.
 *
 * The request is issued through netdev_linux_do_ioctl() with
 * ifr.ifr_addr.sa_family preset to AF_INET.  The result is read by viewing
 * ifr.ifr_addr as a struct sockaddr_in and copying its sin_addr member.
 *
 * NOTE(review): the return type, the declarations of 'ifr' and 'error', the
 * condition under which '*ip' is written (presumably only when 'error' is 0)
 * and the return statement are among the lines missing from this listing --
 * confirm against the complete source. */
4386 netdev_linux_get_ipv4(const struct netdev *netdev, struct in_addr *ip,
4387 int cmd, const char *cmd_name)
4392 ifr.ifr_addr.sa_family = AF_INET;
4393 error = netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, cmd, cmd_name);
4395 const struct sockaddr_in *sin = (struct sockaddr_in *) &ifr.ifr_addr;
4396 *ip = sin->sin_addr;
4401 /* Returns an AF_PACKET raw socket or a negative errno value. */
4403 af_packet_sock(void)
4405 static int sock = INT_MIN;
4407 if (sock == INT_MIN) {
4408 sock = socket(AF_PACKET, SOCK_RAW, 0);
4410 set_nonblocking(sock);
4413 VLOG_ERR("failed to create packet socket: %s", strerror(errno));