2 * Copyright (c) 2009, 2010, 2011, 2012 Nicira Networks.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
19 #include "netdev-linux.h"
24 #include <arpa/inet.h>
26 #include <linux/gen_stats.h>
27 #include <linux/if_ether.h>
28 #include <linux/if_tun.h>
30 #include <linux/types.h>
31 #include <linux/ethtool.h>
32 #include <linux/mii.h>
33 #include <linux/pkt_cls.h>
34 #include <linux/pkt_sched.h>
35 #include <linux/rtnetlink.h>
36 #include <linux/sockios.h>
37 #include <linux/version.h>
38 #include <sys/types.h>
39 #include <sys/ioctl.h>
40 #include <sys/socket.h>
41 #include <netpacket/packet.h>
43 #include <net/if_arp.h>
44 #include <net/if_packet.h>
45 #include <net/route.h>
46 #include <netinet/in.h>
53 #include "dpif-linux.h"
54 #include "dynamic-string.h"
55 #include "fatal-signal.h"
58 #include "netdev-provider.h"
59 #include "netdev-vport.h"
61 #include "netlink-notifier.h"
62 #include "netlink-socket.h"
64 #include "openflow/openflow.h"
66 #include "poll-loop.h"
67 #include "rtnetlink-link.h"
68 #include "socket-util.h"
74 VLOG_DEFINE_THIS_MODULE(netdev_linux);
76 COVERAGE_DEFINE(netdev_set_policing);
77 COVERAGE_DEFINE(netdev_arp_lookup);
78 COVERAGE_DEFINE(netdev_get_ifindex);
79 COVERAGE_DEFINE(netdev_get_hwaddr);
80 COVERAGE_DEFINE(netdev_set_hwaddr);
81 COVERAGE_DEFINE(netdev_ethtool);
83 /* These were introduced in Linux 2.6.14, so they might be missing if we have
85 #ifndef ADVERTISED_Pause
86 #define ADVERTISED_Pause (1 << 13)
88 #ifndef ADVERTISED_Asym_Pause
89 #define ADVERTISED_Asym_Pause (1 << 14)
92 /* These were introduced in Linux 2.6.24, so they might be missing if we
93 * have old headers. */
94 #ifndef ETHTOOL_GFLAGS
95 #define ETHTOOL_GFLAGS 0x00000025 /* Get flags bitmap(ethtool_value) */
97 #ifndef ETHTOOL_SFLAGS
98 #define ETHTOOL_SFLAGS 0x00000026 /* Set flags bitmap(ethtool_value) */
101 /* This was introduced in Linux 2.6.25, so it might be missing if we have old
104 #define TC_RTAB_SIZE 1024
107 static struct nln_notifier *netdev_linux_cache_notifier = NULL;
108 static int cache_notifier_refcount;
111 VALID_IFINDEX = 1 << 0,
112 VALID_ETHERADDR = 1 << 1,
116 VALID_POLICING = 1 << 5,
117 VALID_VPORT_STAT_ERROR = 1 << 6
125 /* Traffic control. */
127 /* An instance of a traffic control class. Always associated with a particular
130 * Each TC implementation subclasses this with whatever additional data it
133 const struct tc_ops *ops;
134 struct hmap queues; /* Contains "struct tc_queue"s.
135 * Read by generic TC layer.
136 * Written only by TC implementation. */
139 /* One traffic control queue.
141 * Each TC implementation subclasses this with whatever additional data it
144 struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
145 unsigned int queue_id; /* OpenFlow queue ID. */
148 /* A particular kind of traffic control. Each implementation generally maps to
149 * one particular Linux qdisc class.
151 * The functions below return 0 if successful or a positive errno value on
152 * failure, except where otherwise noted. All of them must be provided, except
153 * where otherwise noted. */
155 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
156 * This is null for tc_ops_default and tc_ops_other, for which there are no
157 * appropriate values. */
158 const char *linux_name;
160 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
161 const char *ovs_name;
163 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
164 * queues. The queues are numbered 0 through n_queues - 1. */
165 unsigned int n_queues;
167 /* Called to install this TC class on 'netdev'. The implementation should
168 * make the Netlink calls required to set up 'netdev' with the right qdisc
169 * and configure it according to 'details'. The implementation may assume
170 * that the current qdisc is the default; that is, there is no need for it
171 * to delete the current qdisc before installing itself.
173 * The contents of 'details' should be documented as valid for 'ovs_name'
174 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
175 * (which is built as ovs-vswitchd.conf.db(8)).
177 * This function must return 0 if and only if it sets 'netdev->tc' to an
178 * initialized 'struct tc'.
180 * (This function is null for tc_ops_other, which cannot be installed. For
181 * other TC classes it should always be nonnull.) */
182 int (*tc_install)(struct netdev *netdev, const struct shash *details);
184 /* Called when the netdev code determines (through a Netlink query) that
185 * this TC class's qdisc is installed on 'netdev', but we didn't install
186 * it ourselves and so don't know any of the details.
188 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
189 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
190 * implementation should parse the other attributes of 'nlmsg' as
191 * necessary to determine its configuration. If necessary it should also
192 * use Netlink queries to determine the configuration of queues on
195 * This function must return 0 if and only if it sets 'netdev->tc' to an
196 * initialized 'struct tc'. */
197 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
199 /* Destroys the data structures allocated by the implementation as part of
200 * 'tc'. (This includes destroying 'tc->queues' by calling
203 * The implementation should not need to perform any Netlink calls. If
204 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
205 * (But it may not be desirable.)
207 * This function may be null if 'tc' is trivial. */
208 void (*tc_destroy)(struct tc *tc);
210 /* Retrieves details of 'netdev->tc' configuration into 'details'.
212 * The implementation should not need to perform any Netlink calls, because
213 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
214 * cached the configuration.
216 * The contents of 'details' should be documented as valid for 'ovs_name'
217 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
218 * (which is built as ovs-vswitchd.conf.db(8)).
220 * This function may be null if 'tc' is not configurable.
222 int (*qdisc_get)(const struct netdev *netdev, struct shash *details);
224 /* Reconfigures 'netdev->tc' according to 'details', performing any
225 * required Netlink calls to complete the reconfiguration.
227 * The contents of 'details' should be documented as valid for 'ovs_name'
228 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
229 * (which is built as ovs-vswitchd.conf.db(8)).
231 * This function may be null if 'tc' is not configurable.
233 int (*qdisc_set)(struct netdev *, const struct shash *details);
235 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
236 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
238 * The contents of 'details' should be documented as valid for 'ovs_name'
239 * in the "other_config" column in the "Queue" table in
240 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
242 * The implementation should not need to perform any Netlink calls, because
243 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
244 * cached the queue configuration.
246 * This function may be null if 'tc' does not have queues ('n_queues' is
248 int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
249 struct shash *details);
251 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
252 * 'details', performing any required Netlink calls to complete the
253 * reconfiguration. The caller ensures that 'queue_id' is less than
256 * The contents of 'details' should be documented as valid for 'ovs_name'
257 * in the "other_config" column in the "Queue" table in
258 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
260 * This function may be null if 'tc' does not have queues or its queues are
261 * not configurable. */
262 int (*class_set)(struct netdev *, unsigned int queue_id,
263 const struct shash *details);
265 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
266 * tc_queue's within 'netdev->tc->queues'.
268 * This function may be null if 'tc' does not have queues or its queues
269 * cannot be deleted. */
270 int (*class_delete)(struct netdev *, struct tc_queue *queue);
272 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
273 * 'struct tc_queue's within 'netdev->tc->queues'.
275 * On success, initializes '*stats'.
277 * This function may be null if 'tc' does not have queues or if it cannot
278 * report queue statistics. */
279 int (*class_get_stats)(const struct netdev *netdev,
280 const struct tc_queue *queue,
281 struct netdev_queue_stats *stats);
283 /* Extracts queue stats from 'nlmsg', which is a response to a
284 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
286 * This function may be null if 'tc' does not have queues or if it cannot
287 * report queue statistics. */
288 int (*class_dump_stats)(const struct netdev *netdev,
289 const struct ofpbuf *nlmsg,
290 netdev_dump_queue_stats_cb *cb, void *aux);
/* Prepares the generic 'struct tc' state in 'tc'.  The visible step
 * initializes the 'queues' hmap, which the TC implementation fills in. */
294 tc_init(struct tc *tc, const struct tc_ops *ops)
297 hmap_init(&tc->queues);
/* Releases the generic part of 'tc' by destroying its 'queues' hmap. */
301 tc_destroy(struct tc *tc)
303 hmap_destroy(&tc->queues);
306 static const struct tc_ops tc_ops_htb;
307 static const struct tc_ops tc_ops_hfsc;
308 static const struct tc_ops tc_ops_default;
309 static const struct tc_ops tc_ops_other;
311 static const struct tc_ops *tcs[] = {
312 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
313 &tc_ops_hfsc, /* Hierarchical fair service curve. */
314 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
315 &tc_ops_other, /* Some other qdisc. */
319 static unsigned int tc_make_handle(unsigned int major, unsigned int minor);
320 static unsigned int tc_get_major(unsigned int handle);
321 static unsigned int tc_get_minor(unsigned int handle);
323 static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
324 static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
325 static unsigned int tc_buffer_per_jiffy(unsigned int rate);
327 static struct tcmsg *tc_make_request(const struct netdev *, int type,
328 unsigned int flags, struct ofpbuf *);
329 static int tc_transact(struct ofpbuf *request, struct ofpbuf **replyp);
330 static int tc_add_del_ingress_qdisc(struct netdev *netdev, bool add);
331 static int tc_add_policer(struct netdev *netdev, int kbits_rate,
334 static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
335 struct nlattr **options);
336 static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
337 struct nlattr **options,
338 struct netdev_queue_stats *);
339 static int tc_query_class(const struct netdev *,
340 unsigned int handle, unsigned int parent,
341 struct ofpbuf **replyp);
342 static int tc_delete_class(const struct netdev *, unsigned int handle);
344 static int tc_del_qdisc(struct netdev *netdev);
345 static int tc_query_qdisc(const struct netdev *netdev);
347 static int tc_calc_cell_log(unsigned int mtu);
348 static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
349 static void tc_put_rtab(struct ofpbuf *, uint16_t type,
350 const struct tc_ratespec *rate);
351 static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
/* Per-device state for the Linux netdev classes.  It embeds the generic
 * 'struct netdev_dev' (see netdev_dev_linux_cast()) and caches values
 * obtained from the kernel. */
353 struct netdev_dev_linux {
354 struct netdev_dev netdev_dev;
356 struct shash_node *shash_node;
/* Bitmap of VALID_* flags saying which cached members below are current.
 * netdev_dev_linux_changed() clears it to invalidate everything. */
357 unsigned int cache_valid;
/* NOTE(review): looks like a change counter; netdev_linux_create() starts it
 * at 1 and netdev_dev_linux_changed() tests it for zero -- confirm. */
358 unsigned int change_seq;
360 bool miimon; /* Link status of last poll. */
361 long long int miimon_interval; /* Miimon Poll rate. Disabled if <= 0. */
362 struct timer miimon_timer;
364 /* The following are figured out "on demand" only. They are only valid
365 * when the corresponding VALID_* bit in 'cache_valid' is set. */
367 uint8_t etheraddr[ETH_ADDR_LEN];
368 struct in_addr address, netmask;
/* Interface flags most recently stored by netdev_dev_linux_changed(). */
371 unsigned int ifi_flags;
/* Incremented whenever IFF_RUNNING differs between old and new flags. */
372 long long int carrier_resets;
373 uint32_t kbits_rate; /* Policing data. */
374 uint32_t kbits_burst;
375 int vport_stats_error; /* Cached error code from vport_get_stats().
376 0 or an errno value. */
380 struct tap_state tap;
384 struct netdev_linux {
385 struct netdev netdev;
389 /* Sockets used for ioctl operations. */
390 static int af_inet_sock = -1; /* AF_INET, SOCK_DGRAM. */
392 /* A Netlink routing socket that is not subscribed to any multicast groups. */
393 static struct nl_sock *rtnl_sock;
395 /* This is set pretty low because we probably won't learn anything from the
396 * additional log messages. */
397 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
399 static int netdev_linux_init(void);
401 static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
402 int cmd, const char *cmd_name);
403 static int netdev_linux_do_ioctl(const char *name, struct ifreq *, int cmd,
404 const char *cmd_name);
405 static int netdev_linux_get_ipv4(const struct netdev *, struct in_addr *,
406 int cmd, const char *cmd_name);
407 static int get_flags(const struct netdev_dev *, unsigned int *flags);
408 static int set_flags(struct netdev *, unsigned int flags);
409 static int do_get_ifindex(const char *netdev_name);
410 static int get_ifindex(const struct netdev *, int *ifindexp);
411 static int do_set_addr(struct netdev *netdev,
412 int ioctl_nr, const char *ioctl_name,
413 struct in_addr addr);
414 static int get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN]);
415 static int set_etheraddr(const char *netdev_name, int hwaddr_family,
416 const uint8_t[ETH_ADDR_LEN]);
417 static int get_stats_via_netlink(int ifindex, struct netdev_stats *stats);
418 static int get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats);
419 static int af_packet_sock(void);
420 static void netdev_linux_miimon_run(void);
421 static void netdev_linux_miimon_wait(void);
/* Reports whether 'netdev_class' is one of the classes implemented here,
 * recognized by its 'init' callback being netdev_linux_init(). */
424 is_netdev_linux_class(const struct netdev_class *netdev_class)
426 return netdev_class->init == netdev_linux_init;
/* Converts the generic 'netdev_dev' to the 'struct netdev_dev_linux' that
 * contains it, asserting first that its class is a Linux netdev class. */
429 static struct netdev_dev_linux *
430 netdev_dev_linux_cast(const struct netdev_dev *netdev_dev)
432 const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
433 assert(is_netdev_linux_class(netdev_class));
435 return CONTAINER_OF(netdev_dev, struct netdev_dev_linux, netdev_dev);
/* Converts the generic 'netdev' to the 'struct netdev_linux' that contains
 * it, asserting first that its device's class is a Linux netdev class. */
438 static struct netdev_linux *
439 netdev_linux_cast(const struct netdev *netdev)
441 struct netdev_dev *netdev_dev = netdev_get_dev(netdev);
442 const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
443 assert(is_netdev_linux_class(netdev_class));
445 return CONTAINER_OF(netdev, struct netdev_linux, netdev);
/* Sets up the file-wide sockets: 'af_inet_sock' (AF_INET datagram socket used
 * for ioctls) and 'rtnl_sock' (NETLINK_ROUTE socket).  A failure to create
 * either one is logged.  'status' is static, so its value persists across
 * calls. */
449 netdev_linux_init(void)
451 static int status = -1;
453 /* Create AF_INET socket. */
454 af_inet_sock = socket(AF_INET, SOCK_DGRAM, 0);
455 status = af_inet_sock >= 0 ? 0 : errno;
457 VLOG_ERR("failed to create inet socket: %s", strerror(status));
460 /* Create rtnetlink socket. */
462 status = nl_sock_create(NETLINK_ROUTE, &rtnl_sock);
464 VLOG_ERR_RL(&rl, "failed to create rtnetlink socket: %s",
/* Periodic work for the Linux netdev classes: runs the rtnetlink link
 * machinery and then the MII link-status poller. */
473 netdev_linux_run(void)
475 rtnetlink_link_run();
476 netdev_linux_miimon_run();
/* Counterpart of netdev_linux_run(): calls the wait functions of the
 * rtnetlink link machinery and of the MII link-status poller. */
480 netdev_linux_wait(void)
482 rtnetlink_link_wait();
483 netdev_linux_miimon_wait();
/* Records that 'dev' changed.  Counts a carrier reset when IFF_RUNNING differs
 * between the stored flags and 'ifi_flags', stores 'ifi_flags' as the new
 * flags, and clears 'cache_valid' so every cached value is re-read. */
487 netdev_dev_linux_changed(struct netdev_dev_linux *dev, unsigned int ifi_flags)
490 if (!dev->change_seq) {
494 if ((dev->ifi_flags ^ ifi_flags) & IFF_RUNNING) {
495 dev->carrier_resets++;
497 dev->ifi_flags = ifi_flags;
499 dev->cache_valid = 0;
/* rtnetlink link-change callback, registered by cache_notifier_ref().
 *
 * It has two paths.  One looks up the device named in 'change' and, if it
 * belongs to a Linux netdev class, passes the reported flags to
 * netdev_dev_linux_changed().  The other walks every device of
 * 'netdev_linux_class', re-reads its flags with get_flags(), and marks each
 * one changed.
 *
 * NOTE(review): presumably the second path runs when 'change' is null;
 * confirm against the branch condition. */
503 netdev_linux_cache_cb(const struct rtnetlink_link_change *change,
504 void *aux OVS_UNUSED)
506 struct netdev_dev_linux *dev;
508 struct netdev_dev *base_dev = netdev_dev_from_name(change->ifname);
510 const struct netdev_class *netdev_class =
511 netdev_dev_get_class(base_dev);
513 if (is_netdev_linux_class(netdev_class)) {
514 dev = netdev_dev_linux_cast(base_dev);
515 netdev_dev_linux_changed(dev, change->ifi_flags);
519 struct shash device_shash;
520 struct shash_node *node;
522 shash_init(&device_shash);
523 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
524 SHASH_FOR_EACH (node, &device_shash) {
529 get_flags(&dev->netdev_dev, &flags);
530 netdev_dev_linux_changed(dev, flags);
532 shash_destroy(&device_shash);
/* Takes a reference on the shared rtnetlink link notifier.  When the count is
 * zero the notifier is created first, with netdev_linux_cache_cb() as its
 * callback; a failed creation is detected by the null check below. */
537 cache_notifier_ref(void)
539 if (!cache_notifier_refcount) {
540 assert(!netdev_linux_cache_notifier);
542 netdev_linux_cache_notifier =
543 rtnetlink_link_notifier_create(netdev_linux_cache_cb, NULL);
545 if (!netdev_linux_cache_notifier) {
549 cache_notifier_refcount++;
/* Drops a reference taken by cache_notifier_ref().  When the count reaches
 * zero the shared notifier is destroyed and its pointer reset to NULL. */
555 cache_notifier_unref(void)
557 assert(cache_notifier_refcount > 0);
558 if (!--cache_notifier_refcount) {
559 assert(netdev_linux_cache_notifier);
560 rtnetlink_link_notifier_destroy(netdev_linux_cache_notifier);
561 netdev_linux_cache_notifier = NULL;
565 /* Creates system and internal devices.
 *
 * Takes a reference on the shared link notifier, allocates a zeroed
 * 'struct netdev_dev_linux' named 'name' of class 'class', seeds its
 * 'ifi_flags' with get_flags() (whose result is not checked here), and
 * stores the new device in '*netdev_devp'. */
567 netdev_linux_create(const struct netdev_class *class, const char *name,
568 struct netdev_dev **netdev_devp)
570 struct netdev_dev_linux *netdev_dev;
573 error = cache_notifier_ref();
578 netdev_dev = xzalloc(sizeof *netdev_dev);
579 netdev_dev->change_seq = 1;
580 netdev_dev_init(&netdev_dev->netdev_dev, name, class);
581 get_flags(&netdev_dev->netdev_dev, &netdev_dev->ifi_flags);
583 *netdev_devp = &netdev_dev->netdev_dev;
587 /* For most types of netdevs we open the device for each call of
588 * netdev_open(). However, this is not the case with tap devices,
589 * since it is only possible to open the device once. In this
590 * situation we share a single file descriptor, and consequently
591 * buffers, across all readers. Therefore once data is read it will
592 * be unavailable to other reads for tap devices.
 *
 * Creates the tap device 'name': opens "/dev/net/tun", issues TUNSETIFF with
 * IFF_TAP | IFF_NO_PI, makes the descriptor non-blocking, and stores the new
 * device, always of class 'netdev_tap_class', in '*netdev_devp'. */
594 netdev_linux_create_tap(const struct netdev_class *class OVS_UNUSED,
595 const char *name, struct netdev_dev **netdev_devp)
597 struct netdev_dev_linux *netdev_dev;
598 struct tap_state *state;
599 static const char tap_dev[] = "/dev/net/tun";
603 netdev_dev = xzalloc(sizeof *netdev_dev);
604 state = &netdev_dev->state.tap;
606 error = cache_notifier_ref();
611 /* Open tap device. */
612 state->fd = open(tap_dev, O_RDWR);
615 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, strerror(error));
616 goto error_unref_notifier;
619 /* Create tap device. */
620 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
621 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
622 if (ioctl(state->fd, TUNSETIFF, &ifr) == -1) {
623 VLOG_WARN("%s: creating tap device failed: %s", name,
626 goto error_unref_notifier;
629 /* Make non-blocking. */
630 error = set_nonblocking(state->fd);
632 goto error_unref_notifier;
635 netdev_dev_init(&netdev_dev->netdev_dev, name, &netdev_tap_class);
636 *netdev_devp = &netdev_dev->netdev_dev;
/* NOTE(review): the TUNSETIFF and set_nonblocking() failures jump here after
 * open() has succeeded; confirm that 'state->fd' is closed on those paths,
 * since this label only drops the notifier reference. */
639 error_unref_notifier:
640 cache_notifier_unref();
/* Tap-specific teardown for 'netdev_dev'; it acts only when the shared tap
 * file descriptor is valid (non-negative). */
647 destroy_tap(struct netdev_dev_linux *netdev_dev)
649 struct tap_state *state = &netdev_dev->state.tap;
651 if (state->fd >= 0) {
656 /* Destroys the netdev device 'netdev_dev_'.
 *
 * Invokes the traffic-control implementation's 'tc_destroy' hook when a 'tc'
 * is attached and the hook is nonnull, performs tap teardown for devices of
 * 'netdev_tap_class', and drops the reference on the shared link notifier. */
658 netdev_linux_destroy(struct netdev_dev *netdev_dev_)
660 struct netdev_dev_linux *netdev_dev = netdev_dev_linux_cast(netdev_dev_);
661 const struct netdev_class *class = netdev_dev_get_class(netdev_dev_);
663 if (netdev_dev->tc && netdev_dev->tc->ops->tc_destroy) {
664 netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
667 if (class == &netdev_tap_class) {
668 destroy_tap(netdev_dev);
672 cache_notifier_unref();
/* Opens a new 'struct netdev_linux' handle on 'netdev_dev_' and stores it in
 * '*netdevp'.  The first opener of a "tap" device is handed the device's
 * shared tap file descriptor.  On the failure path the half-built handle is
 * released with netdev_uninit(). */
676 netdev_linux_open(struct netdev_dev *netdev_dev_, struct netdev **netdevp)
678 struct netdev_dev_linux *netdev_dev = netdev_dev_linux_cast(netdev_dev_);
679 struct netdev_linux *netdev;
680 enum netdev_flags flags;
683 /* Allocate network device. */
684 netdev = xzalloc(sizeof *netdev);
686 netdev_init(&netdev->netdev, netdev_dev_);
688 /* Verify that the device really exists, by attempting to read its flags.
689 * (The flags might be cached, in which case this won't actually do an
692 * Don't do this for "internal" netdevs, though, because those have to be
693 * created as netdev objects before they exist in the kernel, because
694 * creating them in the kernel happens by passing a netdev object to
695 * dpif_port_add(). */
696 if (netdev_dev_get_class(netdev_dev_) != &netdev_internal_class) {
697 error = netdev_get_flags(&netdev->netdev, &flags);
698 if (error == ENODEV) {
703 if (!strcmp(netdev_dev_get_type(netdev_dev_), "tap") &&
704 !netdev_dev->state.tap.opened) {
706 /* We assume that the first user of the tap device is the primary user
707 * and give them the tap FD. Subsequent users probably just expect
708 * this to be a system device so open it normally to avoid send/receive
709 * directions appearing to be reversed. */
710 netdev->fd = netdev_dev->state.tap.fd;
711 netdev_dev->state.tap.opened = true;
714 *netdevp = &netdev->netdev;
718 netdev_uninit(&netdev->netdev, true);
722 /* Closes and destroys 'netdev'.
 *
 * The descriptor test below applies only to non-"tap" handles, because a tap
 * handle's fd is the one shared through the device (see netdev_linux_open()).
 *
 * NOTE(review): this tests 'fd > 0' while the rest of the file treats
 * 'fd >= 0' as a valid descriptor; confirm whether fd 0 is meant to be
 * skipped here. */
724 netdev_linux_close(struct netdev *netdev_)
726 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
728 if (netdev->fd > 0 && strcmp(netdev_get_type(netdev_), "tap")) {
/* Prepares 'netdev_' for receiving packets.  A handle that already has a
 * valid fd is detected up front.  Otherwise this creates a PF_PACKET raw
 * socket, makes it non-blocking, and binds it to the device's ifindex with
 * protocol ETH_P_ALL.  Socket-creation and bind failures are logged. */
735 netdev_linux_listen(struct netdev *netdev_)
737 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
738 struct sockaddr_ll sll;
743 if (netdev->fd >= 0) {
747 /* Create file descriptor. */
748 fd = socket(PF_PACKET, SOCK_RAW, 0);
751 VLOG_ERR("failed to create raw socket (%s)", strerror(error));
755 /* Set non-blocking mode. */
756 error = set_nonblocking(fd);
761 /* Get ethernet device index. */
762 error = get_ifindex(&netdev->netdev, &ifindex);
767 /* Bind to specific ethernet device. */
768 memset(&sll, 0, sizeof sll);
769 sll.sll_family = AF_PACKET;
770 sll.sll_ifindex = ifindex;
771 sll.sll_protocol = (OVS_FORCE unsigned short int) htons(ETH_P_ALL);
772 if (bind(fd, (struct sockaddr *) &sll, sizeof sll) < 0) {
774 VLOG_ERR("%s: failed to bind raw socket (%s)",
775 netdev_get_name(netdev_), strerror(error));
/* Receives one packet from 'netdev_' into the 'size'-byte buffer at 'data'.
 *
 * Devices of 'netdev_tap_class' are read with read(); all others use recv()
 * with MSG_TRUNC, so the result reflects the full packet length even when the
 * packet does not fit.  A result no larger than 'size' is returned as the
 * byte count; a larger one yields -EMSGSIZE.  Errors other than EINTR and
 * EAGAIN are logged (rate-limited) with the device name followed by the error
 * text, matching the "on %s: %s" format string. */
790 netdev_linux_recv(struct netdev *netdev_, void *data, size_t size)
792 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
794 if (netdev->fd < 0) {
795 /* Device is not listening. */
802 retval = (netdev_->netdev_dev->netdev_class == &netdev_tap_class
803 ? read(netdev->fd, data, size)
804 : recv(netdev->fd, data, size, MSG_TRUNC));
806 return retval <= size ? retval : -EMSGSIZE;
807 } else if (errno != EINTR) {
808 if (errno != EAGAIN) {
809 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
810 netdev_get_name(netdev_), strerror(errno));
817 /* Registers with the poll loop to wake up from the next call to poll_block()
818 * when a packet is ready to be received with netdev_recv() on 'netdev'.
 *
 * Nothing is registered when the handle has no valid fd. */
820 netdev_linux_recv_wait(struct netdev *netdev_)
822 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
823 if (netdev->fd >= 0) {
824 poll_fd_wait(netdev->fd, POLLIN);
828 /* Discards all packets waiting to be received from 'netdev'.
 *
 * For "tap" devices the transmit queue length is fetched with SIOCGIFTXQLEN
 * and passed to drain_fd() as the count to drain.  Other devices return the
 * result of drain_rcvbuf() on the handle's fd. */
830 netdev_linux_drain(struct netdev *netdev_)
832 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
833 if (netdev->fd < 0) {
835 } else if (!strcmp(netdev_get_type(netdev_), "tap")) {
837 int error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
838 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
842 drain_fd(netdev->fd, ifr.ifr_qlen);
845 return drain_rcvbuf(netdev->fd);
849 /* Sends the 'size' bytes at 'data' on 'netdev_'. Returns 0 if successful,
850 * otherwise a positive errno value. Returns EAGAIN without blocking if the
851 * packet cannot be queued immediately. Returns EMSGSIZE if a partial packet
852 * was transmitted or if the packet is too big or too small to transmit on
 * the device.
854 * The caller retains ownership of 'data' in all cases.
856 * The kernel maintains a packet transmission queue, so the caller is not
857 * expected to do additional queuing of packets. */
859 netdev_linux_send(struct netdev *netdev_, const void *data, size_t size)
861 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
865 if (netdev->fd < 0) {
866 /* Use our AF_PACKET socket to send to this device. */
867 struct sockaddr_ll sll;
874 sock = af_packet_sock();
879 error = get_ifindex(netdev_, &ifindex);
884 /* We don't bother setting most fields in sockaddr_ll because the
885 * kernel ignores them for SOCK_RAW. */
886 memset(&sll, 0, sizeof sll);
887 sll.sll_family = AF_PACKET;
888 sll.sll_ifindex = ifindex;
890 iov.iov_base = (void *) data;
894 msg.msg_namelen = sizeof sll;
897 msg.msg_control = NULL;
898 msg.msg_controllen = 0;
901 retval = sendmsg(sock, &msg, 0);
903 /* Use the netdev's own fd to send to this device. This is
904 * essential for tap devices, because packets sent to a tap device
905 * with an AF_PACKET socket will loop back to be *received* again
906 * on the tap device. */
907 retval = write(netdev->fd, data, size);
911 /* The Linux AF_PACKET implementation never blocks waiting for room
912 * for packets, instead returning ENOBUFS. Translate this into
913 * EAGAIN for the caller. */
914 if (errno == ENOBUFS) {
916 } else if (errno == EINTR) {
918 } else if (errno != EAGAIN) {
919 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
920 netdev_get_name(netdev_), strerror(errno));
923 } else if (retval != size) {
924 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%zd bytes of "
925 "%zu) on %s", retval, size, netdev_get_name(netdev_));
933 /* Registers with the poll loop to wake up from the next call to poll_block()
934 * when the packet transmission queue has sufficient room to transmit a packet
935 * with netdev_send().
937 * The kernel maintains a packet transmission queue, so the client is not
938 * expected to do additional queuing of packets. Thus, this function is
939 * unlikely to ever be used. It is included for completeness.
 *
 * Non-"tap" handles with a valid fd wait for POLLOUT on it; "tap" handles
 * request an immediate wakeup instead. */
941 netdev_linux_send_wait(struct netdev *netdev_)
943 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
944 if (netdev->fd < 0) {
946 } else if (strcmp(netdev_get_type(netdev_), "tap")) {
947 poll_fd_wait(netdev->fd, POLLOUT);
949 /* TAP device always accepts packets.*/
950 poll_immediate_wake();
954 /* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
955 * otherwise a positive errno value.
 *
 * The kernel call is made only when no valid address is cached or the cached
 * address differs from 'mac'; the cache is then updated to 'mac'. */
957 netdev_linux_set_etheraddr(struct netdev *netdev_,
958 const uint8_t mac[ETH_ADDR_LEN])
960 struct netdev_dev_linux *netdev_dev =
961 netdev_dev_linux_cast(netdev_get_dev(netdev_));
964 if (!(netdev_dev->cache_valid & VALID_ETHERADDR)
965 || !eth_addr_equals(netdev_dev->etheraddr, mac)) {
966 error = set_etheraddr(netdev_get_name(netdev_), ARPHRD_ETHER, mac);
968 netdev_dev->cache_valid |= VALID_ETHERADDR;
969 memcpy(netdev_dev->etheraddr, mac, ETH_ADDR_LEN);
977 /* Copies 'netdev''s MAC address into the caller-provided buffer 'mac'.  The
978 * address is queried with get_etheraddr() only when VALID_ETHERADDR is not
 * set in 'cache_valid'; otherwise the cached copy is used. */
980 netdev_linux_get_etheraddr(const struct netdev *netdev_,
981 uint8_t mac[ETH_ADDR_LEN])
983 struct netdev_dev_linux *netdev_dev =
984 netdev_dev_linux_cast(netdev_get_dev(netdev_));
985 if (!(netdev_dev->cache_valid & VALID_ETHERADDR)) {
986 int error = get_etheraddr(netdev_get_name(netdev_),
987 netdev_dev->etheraddr);
991 netdev_dev->cache_valid |= VALID_ETHERADDR;
993 memcpy(mac, netdev_dev->etheraddr, ETH_ADDR_LEN);
997 /* Returns the maximum size of transmitted (and received) packets on 'netdev',
998 * in bytes, not including the hardware header; thus, this is typically 1500
999 * bytes for Ethernet devices.
 *
 * The value is stored in '*mtup'.  It is fetched with SIOCGIFMTU and cached
 * when VALID_MTU is not set in 'cache_valid'. */
1001 netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
1003 struct netdev_dev_linux *netdev_dev =
1004 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1005 if (!(netdev_dev->cache_valid & VALID_MTU)) {
1009 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
1010 SIOCGIFMTU, "SIOCGIFMTU");
1014 netdev_dev->mtu = ifr.ifr_mtu;
1015 netdev_dev->cache_valid |= VALID_MTU;
1017 *mtup = netdev_dev->mtu;
1021 /* Sets the maximum transmission unit (MTU) of the given device using the
1022 * Linux networking ioctl interface.
/* Applies 'mtu' to 'netdev_' with SIOCSIFMTU.  A cached MTU equal to 'mtu' is
 * detected before the ioctl.  Afterwards the cache is refreshed from the
 * 'ifr_mtu' value in the ioctl request and marked valid. */
1025 netdev_linux_set_mtu(const struct netdev *netdev_, int mtu)
1027 struct netdev_dev_linux *netdev_dev =
1028 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1032 if (netdev_dev->cache_valid & VALID_MTU &&
1033 netdev_dev->mtu == mtu) {
1037 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
1038 SIOCSIFMTU, "SIOCSIFMTU");
1043 netdev_dev->mtu = ifr.ifr_mtu;
1044 netdev_dev->cache_valid |= VALID_MTU;
1048 /* Returns the ifindex of 'netdev', if successful, as a positive number.
1049 * On failure, returns a negative errno value.
 *
 * Thin wrapper that negates the positive error from get_ifindex(). */
1051 netdev_linux_get_ifindex(const struct netdev *netdev)
1055 error = get_ifindex(netdev, &ifindex);
1056 return error ? -error : ifindex;
/* Stores the link state of 'netdev_' in '*carrier'.  When MII monitoring is
 * enabled ('miimon_interval' > 0) the last polled MII status is used;
 * otherwise the state is the IFF_RUNNING bit of the stored interface
 * flags. */
1060 netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
1062 struct netdev_dev_linux *netdev_dev =
1063 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1065 if (netdev_dev->miimon_interval > 0) {
1066 *carrier = netdev_dev->miimon;
1068 *carrier = (netdev_dev->ifi_flags & IFF_RUNNING) != 0;
/* Returns the device's 'carrier_resets' counter, which
 * netdev_dev_linux_changed() increments whenever IFF_RUNNING toggles. */
1074 static long long int
1075 netdev_linux_get_carrier_resets(const struct netdev *netdev)
1077 return netdev_dev_linux_cast(netdev_get_dev(netdev))->carrier_resets;
/* Issues the MII ioctl 'cmd' (named 'cmd_name' for logging) on device 'name'.
 * '*data' is copied into the 'ifr_data' area of a zeroed ifreq before the
 * call and copied back out afterwards, so it is both input and output. */
1081 netdev_linux_do_miimon(const char *name, int cmd, const char *cmd_name,
1082 struct mii_ioctl_data *data)
1087 memset(&ifr, 0, sizeof ifr);
1088 memcpy(&ifr.ifr_data, data, sizeof *data);
1089 error = netdev_linux_do_ioctl(name, &ifr, cmd, cmd_name);
1090 memcpy(data, &ifr.ifr_data, sizeof *data);
/* Determines the link status of device 'name' and stores it in '*miimon'.
 *
 * The MII path issues SIOCGMIIPHY followed by SIOCGMIIREG on the MII_BMSR
 * register and reports the BMSR_LSTATUS bit.  The fallback path queries
 * ETHTOOL_GLINK and reports whether the returned value is nonzero.  Failures
 * on either path are logged. */
1096 netdev_linux_get_miimon(const char *name, bool *miimon)
1098 struct mii_ioctl_data data;
1103 memset(&data, 0, sizeof data);
1104 error = netdev_linux_do_miimon(name, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
1106 /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
1107 data.reg_num = MII_BMSR;
1108 error = netdev_linux_do_miimon(name, SIOCGMIIREG, "SIOCGMIIREG",
1112 *miimon = !!(data.val_out & BMSR_LSTATUS);
1114 VLOG_WARN_RL(&rl, "%s: failed to query MII", name);
1117 struct ethtool_cmd ecmd;
1119 VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
1122 memset(&ecmd, 0, sizeof ecmd);
1123 error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
1126 struct ethtool_value eval;
1128 memcpy(&eval, &ecmd, sizeof eval);
1129 *miimon = !!eval.data;
1131 VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
/* Sets the MII polling interval of 'netdev_' to 'interval'.  Positive values
 * are raised to at least 100; non-positive values become 0, which disables
 * polling.  When the interval changes, the poll timer is marked expired. */
1139 netdev_linux_set_miimon_interval(struct netdev *netdev_,
1140 long long int interval)
1142 struct netdev_dev_linux *netdev_dev;
1144 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev_));
1146 interval = interval > 0 ? MAX(interval, 100) : 0;
1147 if (netdev_dev->miimon_interval != interval) {
1148 netdev_dev->miimon_interval = interval;
1149 timer_set_expired(&netdev_dev->miimon_timer);
/* Polls MII link status on every device of 'netdev_linux_class' that has
 * monitoring enabled and whose poll timer has expired.  A status that
 * differs from the stored one is recorded and reported through
 * netdev_dev_linux_changed(); the timer is then re-armed with the device's
 * interval. */
1156 netdev_linux_miimon_run(void)
1158 struct shash device_shash;
1159 struct shash_node *node;
1161 shash_init(&device_shash);
1162 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
1163 SHASH_FOR_EACH (node, &device_shash) {
1164 struct netdev_dev_linux *dev = node->data;
1167 if (dev->miimon_interval <= 0 || !timer_expired(&dev->miimon_timer)) {
1171 netdev_linux_get_miimon(dev->netdev_dev.name, &miimon);
1172 if (miimon != dev->miimon) {
1173 dev->miimon = miimon;
1174 netdev_dev_linux_changed(dev, dev->ifi_flags);
1177 timer_set_duration(&dev->miimon_timer, dev->miimon_interval);
1180 shash_destroy(&device_shash);
/* Calls timer_wait() on the MII poll timer of every device of
 * 'netdev_linux_class' that has monitoring enabled. */
1184 netdev_linux_miimon_wait(void)
1186 struct shash device_shash;
1187 struct shash_node *node;
1189 shash_init(&device_shash);
1190 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
1191 SHASH_FOR_EACH (node, &device_shash) {
1192 struct netdev_dev_linux *dev = node->data;
1194 if (dev->miimon_interval > 0) {
1195 timer_wait(&dev->miimon_timer);
1198 shash_destroy(&device_shash);
1201 /* Checks whether we can use RTM_GETLINK to get network device statistics.
1202 * In pre-2.6.19 kernels, this was only available if wireless extensions were
1205 check_for_working_netlink_stats(void)
1207 /* Decide on the netdev_get_stats() implementation to use. Netlink is
1208 * preferable, so if that works, we'll use it. */
1209 int ifindex = do_get_ifindex("lo");
1211 VLOG_WARN("failed to get ifindex for lo, "
1212 "obtaining netdev stats from proc");
1215 struct netdev_stats stats;
1216 int error = get_stats_via_netlink(ifindex, &stats);
1218 VLOG_DBG("obtaining netdev stats via rtnetlink");
1221 VLOG_INFO("RTM_GETLINK failed (%s), obtaining netdev stats "
1222 "via proc (you are probably running a pre-2.6.19 "
1223 "kernel)", strerror(error));
/* NOTE(review): only the signature is visible in this listing.  Judging by the
 * name and by its use on rx/tx counter pairs in netdev_tap_get_stats(), it
 * presumably exchanges '*a' and '*b' -- confirm against the body. */
1230 swap_uint64(uint64_t *a, uint64_t *b)
/* Fetches statistics for 'netdev_' from the vport layer into '*stats' and
 * records the outcome in 'netdev_dev->vport_stats_error'.  The query is made
 * when no error is recorded, or when the recorded value is not flagged valid
 * by VALID_VPORT_STAT_ERROR in 'cache_valid'. */
1238 get_stats_via_vport(const struct netdev *netdev_,
1239 struct netdev_stats *stats)
1241 struct netdev_dev_linux *netdev_dev =
1242 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1244 if (!netdev_dev->vport_stats_error ||
1245 !(netdev_dev->cache_valid & VALID_VPORT_STAT_ERROR)) {
1248 error = netdev_vport_get_stats(netdev_, stats);
1250 VLOG_WARN_RL(&rl, "%s: obtaining netdev stats via vport failed "
1251 "(%s)", netdev_get_name(netdev_), strerror(error));
/* Remember the result and flag it as cached. */
1253 netdev_dev->vport_stats_error = error;
1254 netdev_dev->cache_valid |= VALID_VPORT_STAT_ERROR;
/* Obtains kernel statistics for 'netdev_' into '*stats'.  On first use it asks
 * check_for_working_netlink_stats() whether netlink can be used and keeps the
 * answer in a function-local static.  A failure is logged with the numeric
 * error. */
1259 netdev_linux_sys_get_stats(const struct netdev *netdev_,
1260 struct netdev_stats *stats)
/* -1 means the probe has not run yet. */
1262 static int use_netlink_stats = -1;
1265 if (use_netlink_stats < 0) {
1266 use_netlink_stats = check_for_working_netlink_stats();
1269 if (use_netlink_stats) {
1272 error = get_ifindex(netdev_, &ifindex);
1274 error = get_stats_via_netlink(ifindex, stats);
/* Proc-based path (presumably the 'else' of the test above; that line is not
 * visible in this listing). */
1277 error = get_stats_via_proc(netdev_get_name(netdev_), stats);
1281 VLOG_WARN_RL(&rl, "%s: linux-sys get stats failed %d",
1282 netdev_get_name(netdev_), error);
1288 /* Retrieves current device stats for 'netdev-linux'. */
/* Statistics are gathered twice: from the vport layer into '*stats' and from
 * the kernel into 'dev_stats'.  The visible tail adds the kernel's error,
 * drop, multicast and collision counters onto '*stats'.
 * NOTE(review): the branch structure around the two 'vport_stats_error' tests
 * is only partly visible in this listing; confirm which branch the additions
 * belong to. */
1290 netdev_linux_get_stats(const struct netdev *netdev_,
1291 struct netdev_stats *stats)
1293 struct netdev_dev_linux *netdev_dev =
1294 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1295 struct netdev_stats dev_stats;
1298 get_stats_via_vport(netdev_, stats);
1300 error = netdev_linux_sys_get_stats(netdev_, &dev_stats);
1303 if (netdev_dev->vport_stats_error) {
1310 if (netdev_dev->vport_stats_error) {
1311 /* stats not available from OVS then use ioctl stats. */
1314 stats->rx_errors += dev_stats.rx_errors;
1315 stats->tx_errors += dev_stats.tx_errors;
1316 stats->rx_dropped += dev_stats.rx_dropped;
1317 stats->tx_dropped += dev_stats.tx_dropped;
1318 stats->multicast += dev_stats.multicast;
1319 stats->collisions += dev_stats.collisions;
1320 stats->rx_length_errors += dev_stats.rx_length_errors;
1321 stats->rx_over_errors += dev_stats.rx_over_errors;
1322 stats->rx_crc_errors += dev_stats.rx_crc_errors;
1323 stats->rx_frame_errors += dev_stats.rx_frame_errors;
1324 stats->rx_fifo_errors += dev_stats.rx_fifo_errors;
1325 stats->rx_missed_errors += dev_stats.rx_missed_errors;
1326 stats->tx_aborted_errors += dev_stats.tx_aborted_errors;
1327 stats->tx_carrier_errors += dev_stats.tx_carrier_errors;
1328 stats->tx_fifo_errors += dev_stats.tx_fifo_errors;
1329 stats->tx_heartbeat_errors += dev_stats.tx_heartbeat_errors;
1330 stats->tx_window_errors += dev_stats.tx_window_errors;
1335 /* Retrieves current device stats for 'netdev-tap' netdev or
1336 * netdev-internal. */
/* Statistics are gathered from the vport layer into '*stats' and from the
 * kernel into 'dev_stats'.  The visible code exchanges the rx/tx packet,
 * byte, error and drop counters of '*stats', zeroes its detailed error
 * counters, and adds the kernel's tx drops/errors to the rx side and its rx
 * drops/errors to the tx side, plus multicast and collisions.
 * NOTE(review): which 'vport_stats_error' branch each group belongs to is not
 * visible in this listing -- confirm. */
1338 netdev_tap_get_stats(const struct netdev *netdev_,
1339 struct netdev_stats *stats)
1341 struct netdev_dev_linux *netdev_dev =
1342 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1343 struct netdev_stats dev_stats;
1346 get_stats_via_vport(netdev_, stats);
1348 error = netdev_linux_sys_get_stats(netdev_, &dev_stats);
1350 if (netdev_dev->vport_stats_error) {
1357 /* If this port is an internal port then the transmit and receive stats
1358 * will appear to be swapped relative to the other ports since we are the
1359 * one sending the data, not a remote computer. For consistency, we swap
1360 * them back here. This does not apply if we are getting stats from the
1361 * vport layer because it always tracks stats from the perspective of the
1363 if (netdev_dev->vport_stats_error) {
1365 swap_uint64(&stats->rx_packets, &stats->tx_packets);
1366 swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
1367 swap_uint64(&stats->rx_errors, &stats->tx_errors);
1368 swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
1369 stats->rx_length_errors = 0;
1370 stats->rx_over_errors = 0;
1371 stats->rx_crc_errors = 0;
1372 stats->rx_frame_errors = 0;
1373 stats->rx_fifo_errors = 0;
1374 stats->rx_missed_errors = 0;
1375 stats->tx_aborted_errors = 0;
1376 stats->tx_carrier_errors = 0;
1377 stats->tx_fifo_errors = 0;
1378 stats->tx_heartbeat_errors = 0;
1379 stats->tx_window_errors = 0;
1381 stats->rx_dropped += dev_stats.tx_dropped;
1382 stats->tx_dropped += dev_stats.rx_dropped;
1384 stats->rx_errors += dev_stats.tx_errors;
1385 stats->tx_errors += dev_stats.rx_errors;
1387 stats->multicast += dev_stats.multicast;
1388 stats->collisions += dev_stats.collisions;
/* Retrieves statistics for 'netdev_' from the vport layer only, and returns
 * the error value that get_stats_via_vport() recorded in
 * 'netdev_dev->vport_stats_error'. */
1394 netdev_internal_get_stats(const struct netdev *netdev_,
1395 struct netdev_stats *stats)
1397 struct netdev_dev_linux *netdev_dev =
1398 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1400 get_stats_via_vport(netdev_, stats);
1401 return netdev_dev->vport_stats_error;
1404 /* Stores the features supported by 'netdev' into each of '*current',
1405 * '*advertised', '*supported', and '*peer' that are non-null. Each value is a
1406 * bitmap of "enum ofp_port_features" bits, in host byte order. Returns 0 if
1407 * successful, otherwise a positive errno value. */
/* All values are derived from a single ETHTOOL_GSET query.
 * NOTE(review): the visible code writes through 'current', 'advertised',
 * 'supported' and 'peer' without any NULL test, although the comment above
 * says null outputs are allowed -- confirm that callers always pass non-null
 * pointers. */
1409 netdev_linux_get_features(const struct netdev *netdev,
1410 uint32_t *current, uint32_t *advertised,
1411 uint32_t *supported, uint32_t *peer)
1413 struct ethtool_cmd ecmd;
1416 memset(&ecmd, 0, sizeof ecmd);
1417 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1418 ETHTOOL_GSET, "ETHTOOL_GSET");
1423 /* Supported features. */
/* Each SUPPORTED_* bit reported by ethtool maps to one OFPPF_* bit. */
1425 if (ecmd.supported & SUPPORTED_10baseT_Half) {
1426 *supported |= OFPPF_10MB_HD;
1428 if (ecmd.supported & SUPPORTED_10baseT_Full) {
1429 *supported |= OFPPF_10MB_FD;
1431 if (ecmd.supported & SUPPORTED_100baseT_Half) {
1432 *supported |= OFPPF_100MB_HD;
1434 if (ecmd.supported & SUPPORTED_100baseT_Full) {
1435 *supported |= OFPPF_100MB_FD;
1437 if (ecmd.supported & SUPPORTED_1000baseT_Half) {
1438 *supported |= OFPPF_1GB_HD;
1440 if (ecmd.supported & SUPPORTED_1000baseT_Full) {
1441 *supported |= OFPPF_1GB_FD;
1443 if (ecmd.supported & SUPPORTED_10000baseT_Full) {
1444 *supported |= OFPPF_10GB_FD;
1446 if (ecmd.supported & SUPPORTED_TP) {
1447 *supported |= OFPPF_COPPER;
1449 if (ecmd.supported & SUPPORTED_FIBRE) {
1450 *supported |= OFPPF_FIBER;
1452 if (ecmd.supported & SUPPORTED_Autoneg) {
1453 *supported |= OFPPF_AUTONEG;
1455 if (ecmd.supported & SUPPORTED_Pause) {
1456 *supported |= OFPPF_PAUSE;
1458 if (ecmd.supported & SUPPORTED_Asym_Pause) {
1459 *supported |= OFPPF_PAUSE_ASYM;
1462 /* Advertised features. */
/* Same mapping, applied to the ADVERTISED_* bits. */
1464 if (ecmd.advertising & ADVERTISED_10baseT_Half) {
1465 *advertised |= OFPPF_10MB_HD;
1467 if (ecmd.advertising & ADVERTISED_10baseT_Full) {
1468 *advertised |= OFPPF_10MB_FD;
1470 if (ecmd.advertising & ADVERTISED_100baseT_Half) {
1471 *advertised |= OFPPF_100MB_HD;
1473 if (ecmd.advertising & ADVERTISED_100baseT_Full) {
1474 *advertised |= OFPPF_100MB_FD;
1476 if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
1477 *advertised |= OFPPF_1GB_HD;
1479 if (ecmd.advertising & ADVERTISED_1000baseT_Full) {
1480 *advertised |= OFPPF_1GB_FD;
1482 if (ecmd.advertising & ADVERTISED_10000baseT_Full) {
1483 *advertised |= OFPPF_10GB_FD;
1485 if (ecmd.advertising & ADVERTISED_TP) {
1486 *advertised |= OFPPF_COPPER;
1488 if (ecmd.advertising & ADVERTISED_FIBRE) {
1489 *advertised |= OFPPF_FIBER;
1491 if (ecmd.advertising & ADVERTISED_Autoneg) {
1492 *advertised |= OFPPF_AUTONEG;
1494 if (ecmd.advertising & ADVERTISED_Pause) {
1495 *advertised |= OFPPF_PAUSE;
1497 if (ecmd.advertising & ADVERTISED_Asym_Pause) {
1498 *advertised |= OFPPF_PAUSE_ASYM;
1501 /* Current settings. */
/* The speed selects the rate bit and 'ecmd.duplex' chooses between the
 * full- and half-duplex variants; 10 Gb/s is always reported as full
 * duplex. */
1502 if (ecmd.speed == SPEED_10) {
1503 *current = ecmd.duplex ? OFPPF_10MB_FD : OFPPF_10MB_HD;
1504 } else if (ecmd.speed == SPEED_100) {
1505 *current = ecmd.duplex ? OFPPF_100MB_FD : OFPPF_100MB_HD;
1506 } else if (ecmd.speed == SPEED_1000) {
1507 *current = ecmd.duplex ? OFPPF_1GB_FD : OFPPF_1GB_HD;
1508 } else if (ecmd.speed == SPEED_10000) {
1509 *current = OFPPF_10GB_FD;
1514 if (ecmd.port == PORT_TP) {
1515 *current |= OFPPF_COPPER;
1516 } else if (ecmd.port == PORT_FIBRE) {
1517 *current |= OFPPF_FIBER;
1521 *current |= OFPPF_AUTONEG;
1524 /* Peer advertisements. */
1525 *peer = 0; /* XXX */
1530 /* Set the features advertised by 'netdev' to 'advertise'. */
/* Reads the current settings with ETHTOOL_GSET, rebuilds 'ecmd.advertising'
 * from the OFPPF_* bits set in 'advertise', and writes the result back with
 * ETHTOOL_SSET, whose status is returned. */
1532 netdev_linux_set_advertisements(struct netdev *netdev, uint32_t advertise)
1534 struct ethtool_cmd ecmd;
1537 memset(&ecmd, 0, sizeof ecmd);
1538 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1539 ETHTOOL_GSET, "ETHTOOL_GSET");
/* Start from an empty mask so that only the requested bits end up set. */
1544 ecmd.advertising = 0;
1545 if (advertise & OFPPF_10MB_HD) {
1546 ecmd.advertising |= ADVERTISED_10baseT_Half;
1548 if (advertise & OFPPF_10MB_FD) {
1549 ecmd.advertising |= ADVERTISED_10baseT_Full;
1551 if (advertise & OFPPF_100MB_HD) {
1552 ecmd.advertising |= ADVERTISED_100baseT_Half;
1554 if (advertise & OFPPF_100MB_FD) {
1555 ecmd.advertising |= ADVERTISED_100baseT_Full;
1557 if (advertise & OFPPF_1GB_HD) {
1558 ecmd.advertising |= ADVERTISED_1000baseT_Half;
1560 if (advertise & OFPPF_1GB_FD) {
1561 ecmd.advertising |= ADVERTISED_1000baseT_Full;
1563 if (advertise & OFPPF_10GB_FD) {
1564 ecmd.advertising |= ADVERTISED_10000baseT_Full;
1566 if (advertise & OFPPF_COPPER) {
1567 ecmd.advertising |= ADVERTISED_TP;
1569 if (advertise & OFPPF_FIBER) {
1570 ecmd.advertising |= ADVERTISED_FIBRE;
1572 if (advertise & OFPPF_AUTONEG) {
1573 ecmd.advertising |= ADVERTISED_Autoneg;
1575 if (advertise & OFPPF_PAUSE) {
1576 ecmd.advertising |= ADVERTISED_Pause;
1578 if (advertise & OFPPF_PAUSE_ASYM) {
1579 ecmd.advertising |= ADVERTISED_Asym_Pause;
1581 return netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1582 ETHTOOL_SSET, "ETHTOOL_SSET");
1585 /* Attempts to set input rate limiting (policing) policy. Returns 0 if
1586 * successful, otherwise a positive errno value. */
/* The visible sequence is: normalize 'kbits_burst', compare against the
 * values cached under VALID_POLICING, remove the ingress qdisc, add a fresh
 * ingress qdisc and a policer, then cache the applied values.  Each failing
 * step logs a rate-limited warning.  (Conditions guarding the individual steps
 * fall on lines not visible in this listing.) */
1588 netdev_linux_set_policing(struct netdev *netdev,
1589 uint32_t kbits_rate, uint32_t kbits_burst)
1591 struct netdev_dev_linux *netdev_dev =
1592 netdev_dev_linux_cast(netdev_get_dev(netdev));
1593 const char *netdev_name = netdev_get_name(netdev);
1597 kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
1598 : !kbits_burst ? 1000 /* Default to 1000 kbits if 0. */
1599 : kbits_burst); /* Stick with user-specified value. */
1601 if (netdev_dev->cache_valid & VALID_POLICING
1602 && netdev_dev->kbits_rate == kbits_rate
1603 && netdev_dev->kbits_burst == kbits_burst) {
1604 /* Assume that settings haven't changed since we last set them. */
1608 COVERAGE_INC(netdev_set_policing);
1609 /* Remove any existing ingress qdisc. */
1610 error = tc_add_del_ingress_qdisc(netdev, false);
1612 VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
1613 netdev_name, strerror(error));
1618 error = tc_add_del_ingress_qdisc(netdev, true);
1620 VLOG_WARN_RL(&rl, "%s: adding policing qdisc failed: %s",
1621 netdev_name, strerror(error));
1625 error = tc_add_policer(netdev, kbits_rate, kbits_burst);
1627 VLOG_WARN_RL(&rl, "%s: adding policing action failed: %s",
1628 netdev_name, strerror(error));
/* Record what was applied so that an identical request can be recognized by
 * the cache test above. */
1633 netdev_dev->kbits_rate = kbits_rate;
1634 netdev_dev->kbits_burst = kbits_burst;
1635 netdev_dev->cache_valid |= VALID_POLICING;
/* Adds to 'types' the 'ovs_name' of every entry in the NULL-terminated 'tcs'
 * array that has a 'tc_install' callback and a non-empty 'ovs_name'. */
1641 netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
1644 const struct tc_ops **opsp;
1646 for (opsp = tcs; *opsp != NULL; opsp++) {
1647 const struct tc_ops *ops = *opsp;
1648 if (ops->tc_install && ops->ovs_name[0] != '\0') {
1649 sset_add(types, ops->ovs_name);
/* Searches the NULL-terminated 'tcs' array for the entry whose 'ovs_name'
 * equals 'name'.  The statements executed on a match and after an unsuccessful
 * search are not visible in this listing; callers treat a null result as "not
 * found". */
1655 static const struct tc_ops *
1656 tc_lookup_ovs_name(const char *name)
1658 const struct tc_ops **opsp;
1660 for (opsp = tcs; *opsp != NULL; opsp++) {
1661 const struct tc_ops *ops = *opsp;
1662 if (!strcmp(name, ops->ovs_name)) {
/* Searches the NULL-terminated 'tcs' array for the entry whose 'linux_name'
 * equals 'name'.  Entries with a null 'linux_name' are never matched.  The
 * statements executed on a match and after an unsuccessful search are not
 * visible in this listing. */
1669 static const struct tc_ops *
1670 tc_lookup_linux_name(const char *name)
1672 const struct tc_ops **opsp;
1674 for (opsp = tcs; *opsp != NULL; opsp++) {
1675 const struct tc_ops *ops = *opsp;
1676 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
/* Scans the bucket selected by 'hash' in the queue map of 'netdev''s traffic
 * control state for the queue whose id equals 'queue_id'.  (The 'hash'
 * parameter declaration and the return statements fall on lines not visible in
 * this listing.) */
1683 static struct tc_queue *
1684 tc_find_queue__(const struct netdev *netdev, unsigned int queue_id,
1687 struct netdev_dev_linux *netdev_dev =
1688 netdev_dev_linux_cast(netdev_get_dev(netdev));
1689 struct tc_queue *queue;
1691 HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev_dev->tc->queues) {
1692 if (queue->queue_id == queue_id) {
/* Convenience wrapper around tc_find_queue__() that computes the hash as
 * hash_int(queue_id, 0). */
1699 static struct tc_queue *
1700 tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
1702 return tc_find_queue__(netdev, queue_id, hash_int(queue_id, 0));
/* Looks up the traffic-control implementation named 'type' and copies its
 * 'n_queues' into 'caps->n_queues'.  (The 'type' parameter declaration and the
 * handling of an unknown type fall on lines not visible in this listing.) */
1706 netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
1708 struct netdev_qos_capabilities *caps)
1710 const struct tc_ops *ops = tc_lookup_ovs_name(type);
1714 caps->n_queues = ops->n_queues;
/* Refreshes the qdisc state of 'netdev' with tc_query_qdisc(), stores the OVS
 * name of the active implementation in '*typep', and lets its 'qdisc_get'
 * callback, when there is one, fill in 'details'. */
1719 netdev_linux_get_qos(const struct netdev *netdev,
1720 const char **typep, struct shash *details)
1722 struct netdev_dev_linux *netdev_dev =
1723 netdev_dev_linux_cast(netdev_get_dev(netdev));
1726 error = tc_query_qdisc(netdev);
1731 *typep = netdev_dev->tc->ops->ovs_name;
1732 return (netdev_dev->tc->ops->qdisc_get
1733 ? netdev_dev->tc->ops->qdisc_get(netdev, details)
/* Configures QoS of kind 'type' with 'details' on 'netdev'.  If the requested
 * implementation is the one already active, only its 'qdisc_set' callback is
 * invoked (0 is returned when it has none).  Otherwise the existing qdisc is
 * deleted and the new implementation's 'tc_install' callback is run. */
1738 netdev_linux_set_qos(struct netdev *netdev,
1739 const char *type, const struct shash *details)
1741 struct netdev_dev_linux *netdev_dev =
1742 netdev_dev_linux_cast(netdev_get_dev(netdev));
1743 const struct tc_ops *new_ops;
1746 new_ops = tc_lookup_ovs_name(type);
/* Unknown types and types without an install callback are rejected here (the
 * returned value is on a line not visible in this listing). */
1747 if (!new_ops || !new_ops->tc_install) {
1751 error = tc_query_qdisc(netdev);
1756 if (new_ops == netdev_dev->tc->ops) {
1757 return new_ops->qdisc_set ? new_ops->qdisc_set(netdev, details) : 0;
1759 /* Delete existing qdisc. */
1760 error = tc_del_qdisc(netdev);
1764 assert(netdev_dev->tc == NULL);
1766 /* Install new qdisc. */
1767 error = new_ops->tc_install(netdev, details);
/* Invariant: 'tc' is non-null exactly when installation succeeded. */
1768 assert((error == 0) == (netdev_dev->tc != NULL));
/* Refreshes the qdisc state of 'netdev', looks up queue 'queue_id' with
 * tc_find_queue(), and asks the active implementation's 'class_get' callback
 * to describe it in 'details'.  (Error paths fall on lines not visible in this
 * listing.) */
1775 netdev_linux_get_queue(const struct netdev *netdev,
1776 unsigned int queue_id, struct shash *details)
1778 struct netdev_dev_linux *netdev_dev =
1779 netdev_dev_linux_cast(netdev_get_dev(netdev));
1782 error = tc_query_qdisc(netdev);
1786 struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1788 ? netdev_dev->tc->ops->class_get(netdev, queue, details)
/* Refreshes the qdisc state of 'netdev' and forwards the request to the
 * active implementation's 'class_set' callback.  Requests with 'queue_id' at
 * or beyond the implementation's 'n_queues', or for an implementation without
 * 'class_set', take a separate branch whose result is not visible in this
 * listing. */
1794 netdev_linux_set_queue(struct netdev *netdev,
1795 unsigned int queue_id, const struct shash *details)
1797 struct netdev_dev_linux *netdev_dev =
1798 netdev_dev_linux_cast(netdev_get_dev(netdev));
1801 error = tc_query_qdisc(netdev);
1804 } else if (queue_id >= netdev_dev->tc->ops->n_queues
1805 || !netdev_dev->tc->ops->class_set) {
1809 return netdev_dev->tc->ops->class_set(netdev, queue_id, details);
/* Refreshes the qdisc state of 'netdev', looks up queue 'queue_id', and hands
 * it to the active implementation's 'class_delete' callback.  Implementations
 * without that callback take a separate branch whose result is not visible in
 * this listing. */
1813 netdev_linux_delete_queue(struct netdev *netdev, unsigned int queue_id)
1815 struct netdev_dev_linux *netdev_dev =
1816 netdev_dev_linux_cast(netdev_get_dev(netdev));
1819 error = tc_query_qdisc(netdev);
1822 } else if (!netdev_dev->tc->ops->class_delete) {
1825 struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1827 ? netdev_dev->tc->ops->class_delete(netdev, queue)
/* Refreshes the qdisc state of 'netdev', looks up queue 'queue_id', and asks
 * the active implementation's 'class_get_stats' callback to fill in '*stats'.
 * Implementations without that callback take a separate branch whose result is
 * not visible in this listing. */
1833 netdev_linux_get_queue_stats(const struct netdev *netdev,
1834 unsigned int queue_id,
1835 struct netdev_queue_stats *stats)
1837 struct netdev_dev_linux *netdev_dev =
1838 netdev_dev_linux_cast(netdev_get_dev(netdev));
1841 error = tc_query_qdisc(netdev);
1844 } else if (!netdev_dev->tc->ops->class_get_stats) {
1847 const struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1849 ? netdev_dev->tc->ops->class_get_stats(netdev, queue, stats)
/* Builds an RTM_GETTCLASS request for 'netdev' with 'tcm_parent' set to 0 and
 * starts a netlink dump of it on 'rtnl_sock' into 'dump'.  The request buffer
 * is released once the dump has been started.  The caller in this file treats
 * the result as a boolean (true on success); the return statements themselves
 * are not visible in this listing. */
1855 start_queue_dump(const struct netdev *netdev, struct nl_dump *dump)
1857 struct ofpbuf request;
1858 struct tcmsg *tcmsg;
1860 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
1864 tcmsg->tcm_parent = 0;
1865 nl_dump_start(dump, rtnl_sock, &request);
1866 ofpbuf_uninit(&request);
/* Iterates over every queue known for 'netdev'.  For each one the active
 * implementation's 'class_get' callback fills a scratch 'details' map, which
 * is then passed to '*cb' together with the queue id and 'aux'.  The scratch
 * map is cleared before each queue and destroyed at the end.  (Error handling
 * falls on lines not visible in this listing.) */
1871 netdev_linux_dump_queues(const struct netdev *netdev,
1872 netdev_dump_queues_cb *cb, void *aux)
1874 struct netdev_dev_linux *netdev_dev =
1875 netdev_dev_linux_cast(netdev_get_dev(netdev));
1876 struct tc_queue *queue;
1877 struct shash details;
1881 error = tc_query_qdisc(netdev);
1884 } else if (!netdev_dev->tc->ops->class_get) {
1889 shash_init(&details);
1890 HMAP_FOR_EACH (queue, hmap_node, &netdev_dev->tc->queues) {
1891 shash_clear(&details);
1893 error = netdev_dev->tc->ops->class_get(netdev, queue, &details);
1895 (*cb)(queue->queue_id, &details, aux);
1900 shash_destroy(&details);
/* Dumps the traffic classes of 'netdev' over netlink and passes every reply to
 * the active implementation's 'class_dump_stats' callback along with 'cb' and
 * 'aux'.  The result is the status of nl_dump_done() if that is nonzero,
 * otherwise 'last_error' (maintained on lines not visible in this listing). */
1906 netdev_linux_dump_queue_stats(const struct netdev *netdev,
1907 netdev_dump_queue_stats_cb *cb, void *aux)
1909 struct netdev_dev_linux *netdev_dev =
1910 netdev_dev_linux_cast(netdev_get_dev(netdev));
1911 struct nl_dump dump;
1916 error = tc_query_qdisc(netdev);
1919 } else if (!netdev_dev->tc->ops->class_dump_stats) {
1924 if (!start_queue_dump(netdev, &dump)) {
1927 while (nl_dump_next(&dump, &msg)) {
1928 error = netdev_dev->tc->ops->class_dump_stats(netdev, &msg, cb, aux);
1934 error = nl_dump_done(&dump);
1935 return error ? error : last_error;
/* Stores the IPv4 address and netmask of 'netdev_' in '*address' and
 * '*netmask'.  The values are read with SIOCGIFADDR and SIOCGIFNETMASK when
 * VALID_IN4 is not set in 'cache_valid' and are served from the cache
 * afterwards.  Returns EADDRNOTAVAIL when the address is INADDR_ANY and 0
 * otherwise (ioctl error paths are on lines not visible in this listing). */
1939 netdev_linux_get_in4(const struct netdev *netdev_,
1940 struct in_addr *address, struct in_addr *netmask)
1942 struct netdev_dev_linux *netdev_dev =
1943 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1945 if (!(netdev_dev->cache_valid & VALID_IN4)) {
1948 error = netdev_linux_get_ipv4(netdev_, &netdev_dev->address,
1949 SIOCGIFADDR, "SIOCGIFADDR");
1954 error = netdev_linux_get_ipv4(netdev_, &netdev_dev->netmask,
1955 SIOCGIFNETMASK, "SIOCGIFNETMASK");
1960 netdev_dev->cache_valid |= VALID_IN4;
1962 *address = netdev_dev->address;
1963 *netmask = netdev_dev->netmask;
1964 return address->s_addr == INADDR_ANY ? EADDRNOTAVAIL : 0;
/* Assigns IPv4 'address' to 'netdev_' with SIOCSIFADDR and updates the cached
 * address and netmask.  The netmask is applied with SIOCSIFNETMASK only when
 * 'address' is not INADDR_ANY.
 * NOTE(review): the cache is updated before SIOCSIFNETMASK is attempted, so a
 * failure of that ioctl would leave a cached netmask that was never applied --
 * confirm this is intended. */
1968 netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
1969 struct in_addr netmask)
1971 struct netdev_dev_linux *netdev_dev =
1972 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1975 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
1977 netdev_dev->cache_valid |= VALID_IN4;
1978 netdev_dev->address = address;
1979 netdev_dev->netmask = netmask;
1980 if (address.s_addr != INADDR_ANY) {
1981 error = do_set_addr(netdev_, SIOCSIFNETMASK,
1982 "SIOCSIFNETMASK", netmask);
/* Parses 'line' with a scanf-style format: sixteen two-digit hexadecimal
 * bytes stored into 'in6->s6_addr', four hexadecimal fields that are skipped,
 * and an interface name of at most 16 characters stored into 'ifname'.  The
 * caller in this file feeds it lines of /proc/net/if_inet6 and treats the
 * result as a boolean; the call and its comparison are on lines not visible in
 * this listing. */
1989 parse_if_inet6_line(const char *line,
1990 struct in6_addr *in6, char ifname[16 + 1])
1992 uint8_t *s6 = in6->s6_addr;
1993 #define X8 "%2"SCNx8
1995 " "X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8
1996 "%*x %*x %*x %*x %16s\n",
1997 &s6[0], &s6[1], &s6[2], &s6[3],
1998 &s6[4], &s6[5], &s6[6], &s6[7],
1999 &s6[8], &s6[9], &s6[10], &s6[11],
2000 &s6[12], &s6[13], &s6[14], &s6[15],
2004 /* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address (if
2005 * 'in6' is non-null) and returns true. Otherwise, returns false. */
/* The address is looked up once by scanning /proc/net/if_inet6 for a line
 * whose interface name equals the device name; the result (in6addr_any when
 * nothing matches) is cached under VALID_IN6. */
2007 netdev_linux_get_in6(const struct netdev *netdev_, struct in6_addr *in6)
2009 struct netdev_dev_linux *netdev_dev =
2010 netdev_dev_linux_cast(netdev_get_dev(netdev_));
2011 if (!(netdev_dev->cache_valid & VALID_IN6)) {
/* Default used when no matching line is found. */
2015 netdev_dev->in6 = in6addr_any;
2017 file = fopen("/proc/net/if_inet6", "r");
2019 const char *name = netdev_get_name(netdev_);
2020 while (fgets(line, sizeof line, file)) {
2021 struct in6_addr in6_tmp;
2022 char ifname[16 + 1];
2023 if (parse_if_inet6_line(line, &in6_tmp, ifname)
2024 && !strcmp(name, ifname))
2026 netdev_dev->in6 = in6_tmp;
2032 netdev_dev->cache_valid |= VALID_IN6;
2034 *in6 = netdev_dev->in6;
/* Initializes '*sa' as an AF_INET socket address holding 'addr'.  The address
 * is built in a zeroed 'struct sockaddr_in' and then copied over the zeroed
 * destination. */
2039 make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
2041 struct sockaddr_in sin;
2042 memset(&sin, 0, sizeof sin);
2043 sin.sin_family = AF_INET;
2044 sin.sin_addr = addr;
2047 memset(sa, 0, sizeof *sa);
2048 memcpy(sa, &sin, sizeof sin);
/* Issues the address-setting ioctl 'ioctl_nr' for 'netdev' with 'addr' as the
 * IPv4 address in the request, and returns the result of
 * netdev_linux_do_ioctl().  'ioctl_name' is the ioctl's name as a string (its
 * use is on a line not visible in this listing). */
2052 do_set_addr(struct netdev *netdev,
2053 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
2056 ovs_strzcpy(ifr.ifr_name, netdev_get_name(netdev), sizeof ifr.ifr_name);
2057 make_in4_sockaddr(&ifr.ifr_addr, addr);
2059 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
2063 /* Adds 'router' as a default IP gateway. */
/* The route is installed with the SIOCADDRT ioctl on 'af_inet_sock': the
 * destination and genmask are both INADDR_ANY and the flags are
 * RTF_UP | RTF_GATEWAY.  A failure is logged as a warning. */
2065 netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
2067 struct in_addr any = { INADDR_ANY };
2071 memset(&rt, 0, sizeof rt);
2072 make_in4_sockaddr(&rt.rt_dst, any);
2073 make_in4_sockaddr(&rt.rt_gateway, router);
2074 make_in4_sockaddr(&rt.rt_genmask, any);
2075 rt.rt_flags = RTF_UP | RTF_GATEWAY;
2076 error = ioctl(af_inet_sock, SIOCADDRT, &rt) < 0 ? errno : 0;
2078 VLOG_WARN("ioctl(SIOCADDRT): %s", strerror(error));
/* Looks for a route to 'host' in /proc/net/route.  For the first usable line
 * whose masked destination matches 'host', '*next_hop' receives the gateway
 * (or 0 when the host is directly reachable) and '*netdev_name' receives a
 * copy of the interface name made with xstrdup().  '*netdev_name' is set to
 * NULL up front.  (Return values and the test that distinguishes the two
 * next-hop cases are on lines not visible in this listing.) */
2084 netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
2087 static const char fn[] = "/proc/net/route";
2092 *netdev_name = NULL;
2093 stream = fopen(fn, "r");
2094 if (stream == NULL) {
2095 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
2100 while (fgets(line, sizeof line, stream)) {
2103 ovs_be32 dest, gateway, mask;
2104 int refcnt, metric, mtu;
2105 unsigned int flags, use, window, irtt;
/* A line is accepted only when all eleven fields convert. */
2108 "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
2110 iface, &dest, &gateway, &flags, &refcnt,
2111 &use, &metric, &mask, &mtu, &window, &irtt) != 11) {
2113 VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
2117 if (!(flags & RTF_UP)) {
2118 /* Skip routes that aren't up. */
2122 /* The output of 'dest', 'mask', and 'gateway' were given in
2123 * network byte order, so we don't need any endian
2124 * conversions here. */
2125 if ((dest & mask) == (host->s_addr & mask)) {
2127 /* The host is directly reachable. */
2128 next_hop->s_addr = 0;
2130 /* To reach the host, we must go through a gateway. */
2131 next_hop->s_addr = gateway;
2133 *netdev_name = xstrdup(iface);
/* Queries the driver information of 'netdev' with ETHTOOL_GDRVINFO and adds
 * heap-allocated copies of the driver name, driver version and firmware
 * version to 'sh' under the keys "driver_name", "driver_version" and
 * "firmware_version". */
2145 netdev_linux_get_status(const struct netdev *netdev, struct shash *sh)
2147 struct ethtool_drvinfo drvinfo;
2150 memset(&drvinfo, 0, sizeof drvinfo);
/* netdev_linux_do_ethtool() takes a 'struct ethtool_cmd *', hence the cast of
 * the drvinfo buffer. */
2151 error = netdev_linux_do_ethtool(netdev_get_name(netdev),
2152 (struct ethtool_cmd *)&drvinfo,
2154 "ETHTOOL_GDRVINFO");
2156 shash_add(sh, "driver_name", xstrdup(drvinfo.driver));
2157 shash_add(sh, "driver_version", xstrdup(drvinfo.version));
2158 shash_add(sh, "firmware_version", xstrdup(drvinfo.fw_version));
2164 /* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
2165 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
2166 * returns 0. Otherwise, it returns a positive errno value; in particular,
2167 * ENXIO indicates that there is no ARP table entry for 'ip' on 'netdev'. */
2169 netdev_linux_arp_lookup(const struct netdev *netdev,
2170 ovs_be32 ip, uint8_t mac[ETH_ADDR_LEN])
2173 struct sockaddr_in sin;
/* Build the SIOCGARP request: protocol address 'ip', hardware type Ethernet,
 * restricted to the device named after 'netdev'. */
2176 memset(&r, 0, sizeof r);
2177 memset(&sin, 0, sizeof sin);
2178 sin.sin_family = AF_INET;
2179 sin.sin_addr.s_addr = ip;
2181 memcpy(&r.arp_pa, &sin, sizeof sin);
2182 r.arp_ha.sa_family = ARPHRD_ETHER;
2184 ovs_strzcpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
2185 COVERAGE_INC(netdev_arp_lookup);
2186 retval = ioctl(af_inet_sock, SIOCGARP, &r) < 0 ? errno : 0;
2188 memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
/* ENXIO (no entry) is an expected outcome and is not logged. */
2189 } else if (retval != ENXIO) {
2190 VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
2191 netdev_get_name(netdev), IP_ARGS(&ip), strerror(retval));
/* Translates 'enum netdev_flags' bits in 'nd' into interface flag bits.  The
 * visible code tests NETDEV_UP and NETDEV_PROMISC; the assignments made for
 * each, and the return statement, are on lines not visible in this listing
 * (presumably IFF_UP and IFF_PROMISC -- confirm). */
2197 nd_to_iff_flags(enum netdev_flags nd)
2200 if (nd & NETDEV_UP) {
2203 if (nd & NETDEV_PROMISC) {
/* Translates interface flag bits in 'iff' into 'enum netdev_flags'.  The
 * visible code maps IFF_PROMISC to NETDEV_PROMISC; other mappings and the
 * return statement are on lines not visible in this listing. */
2210 iff_to_nd_flags(int iff)
2212 enum netdev_flags nd = 0;
2216 if (iff & IFF_PROMISC) {
2217 nd |= NETDEV_PROMISC;
/* Clears the flags in 'off' and sets the flags in 'on' for 'netdev', starting
 * from the cached 'ifi_flags'.  '*old_flagsp' receives the flags in effect
 * before the change.  The device is touched only when the computed flags
 * differ from the cached ones; the cache is then refreshed with
 * get_flags(). */
2223 netdev_linux_update_flags(struct netdev *netdev, enum netdev_flags off,
2224 enum netdev_flags on, enum netdev_flags *old_flagsp)
2226 struct netdev_dev_linux *netdev_dev;
2227 int old_flags, new_flags;
2230 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
2231 old_flags = netdev_dev->ifi_flags;
2232 *old_flagsp = iff_to_nd_flags(old_flags);
2233 new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
2234 if (new_flags != old_flags) {
2235 error = set_flags(netdev, new_flags);
2236 get_flags(&netdev_dev->netdev_dev, &netdev_dev->ifi_flags);
/* Returns the 'change_seq' value stored in the device behind 'netdev'. */
2242 netdev_linux_change_seq(const struct netdev *netdev)
2244 return netdev_dev_linux_cast(netdev_get_dev(netdev))->change_seq;
/* Lists the netdev provider callbacks shared by the Linux netdev classes
 * defined after this macro.  NAME, CREATE, GET_STATS and SET_STATS are the
 * per-class parameters; the places where they are substituted fall on lines
 * not visible in this listing. */
2247 #define NETDEV_LINUX_CLASS(NAME, CREATE, GET_STATS, SET_STATS) \
2251 netdev_linux_init, \
2253 netdev_linux_wait, \
2256 netdev_linux_destroy, \
2257 NULL, /* get_config */ \
2258 NULL, /* set_config */ \
2260 netdev_linux_open, \
2261 netdev_linux_close, \
2263 netdev_linux_listen, \
2264 netdev_linux_recv, \
2265 netdev_linux_recv_wait, \
2266 netdev_linux_drain, \
2268 netdev_linux_send, \
2269 netdev_linux_send_wait, \
2271 netdev_linux_set_etheraddr, \
2272 netdev_linux_get_etheraddr, \
2273 netdev_linux_get_mtu, \
2274 netdev_linux_set_mtu, \
2275 netdev_linux_get_ifindex, \
2276 netdev_linux_get_carrier, \
2277 netdev_linux_get_carrier_resets, \
2278 netdev_linux_set_miimon_interval, \
2282 netdev_linux_get_features, \
2283 netdev_linux_set_advertisements, \
2285 netdev_linux_set_policing, \
2286 netdev_linux_get_qos_types, \
2287 netdev_linux_get_qos_capabilities, \
2288 netdev_linux_get_qos, \
2289 netdev_linux_set_qos, \
2290 netdev_linux_get_queue, \
2291 netdev_linux_set_queue, \
2292 netdev_linux_delete_queue, \
2293 netdev_linux_get_queue_stats, \
2294 netdev_linux_dump_queues, \
2295 netdev_linux_dump_queue_stats, \
2297 netdev_linux_get_in4, \
2298 netdev_linux_set_in4, \
2299 netdev_linux_get_in6, \
2300 netdev_linux_add_router, \
2301 netdev_linux_get_next_hop, \
2302 netdev_linux_get_status, \
2303 netdev_linux_arp_lookup, \
2305 netdev_linux_update_flags, \
2307 netdev_linux_change_seq \
/* The three netdev classes defined in this file.  They differ in the create
 * callback and in the statistics callbacks listed here; the class-name
 * arguments are on lines not visible in this listing. */
/* Class using netdev_linux_create() and netdev_linux_get_stats(). */
2310 const struct netdev_class netdev_linux_class =
2313 netdev_linux_create,
2314 netdev_linux_get_stats,
2315 NULL); /* set_stats */
/* Class using netdev_linux_create_tap() and netdev_tap_get_stats(). */
2317 const struct netdev_class netdev_tap_class =
2320 netdev_linux_create_tap,
2321 netdev_tap_get_stats,
2322 NULL); /* set_stats */
/* Class whose statistics come from the vport layer only; it is the only one
 * with a set_stats callback (netdev_vport_set_stats). */
2324 const struct netdev_class netdev_internal_class =
2327 netdev_linux_create,
2328 netdev_internal_get_stats,
2329 netdev_vport_set_stats);
2331 /* HTB traffic control class. */
/* Largest class minor number that htb_parse_tcmsg__() accepts when converting
 * a class handle into a queue id. */
2333 #define HTB_N_QUEUES 0xf000
/* Member of the qdisc-level HTB state, 'struct htb' (the enclosing struct
 * lines are not visible in this listing). */
2337 unsigned int max_rate; /* In bytes/s. */
/* Members of the per-queue HTB state, 'struct htb_class'.  'tc_queue' is the
 * embedded generic queue from which htb_class_cast__() recovers the containing
 * structure. */
2341 struct tc_queue tc_queue;
2342 unsigned int min_rate; /* In bytes/s. */
2343 unsigned int max_rate; /* In bytes/s. */
2344 unsigned int burst; /* In bytes. */
2345 unsigned int priority; /* Lower values are higher priorities. */
/* Returns the 'struct htb' that contains the traffic-control state
 * ('netdev_dev->tc') of 'netdev'.  The conversion is a plain CONTAINER_OF, so
 * it is meaningful only while that state was installed by the HTB code. */
2349 htb_get__(const struct netdev *netdev)
2351 struct netdev_dev_linux *netdev_dev =
2352 netdev_dev_linux_cast(netdev_get_dev(netdev));
2353 return CONTAINER_OF(netdev_dev->tc, struct htb, tc);
/* Allocates a 'struct htb', initializes its embedded traffic-control state
 * with 'tc_ops_htb', records 'max_rate', and attaches it to 'netdev' as
 * 'netdev_dev->tc'.
 * NOTE(review): 'max_rate' arrives as uint64_t but the 'max_rate' member is
 * declared 'unsigned int', so larger values would be truncated -- confirm the
 * callers' ranges. */
2357 htb_install__(struct netdev *netdev, uint64_t max_rate)
2359 struct netdev_dev_linux *netdev_dev =
2360 netdev_dev_linux_cast(netdev_get_dev(netdev));
2363 htb = xmalloc(sizeof *htb);
2364 tc_init(&htb->tc, &tc_ops_htb);
2365 htb->max_rate = max_rate;
2367 netdev_dev->tc = &htb->tc;
2370 /* Create an HTB qdisc.
2372 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
/* Any existing qdisc is deleted first.  The request is an RTM_NEWQDISC message
 * with handle 1:0 at the root, kind "htb", and a nested TCA_OPTIONS attribute
 * carrying a 'struct tc_htb_glob'.  Returns the result of tc_transact(). */
2374 htb_setup_qdisc__(struct netdev *netdev)
2377 struct tc_htb_glob opt;
2378 struct ofpbuf request;
2379 struct tcmsg *tcmsg;
2381 tc_del_qdisc(netdev);
2383 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2384 NLM_F_EXCL | NLM_F_CREATE, &request);
2388 tcmsg->tcm_handle = tc_make_handle(1, 0);
2389 tcmsg->tcm_parent = TC_H_ROOT;
2391 nl_msg_put_string(&request, TCA_KIND, "htb");
2393 memset(&opt, 0, sizeof opt);
2394 opt.rate2quantum = 10;
2398 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2399 nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
2400 nl_msg_end_nested(&request, opt_offset);
2402 return tc_transact(&request, NULL);
2405 /* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
2406 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
/* The device MTU is needed to fill in the rate specifications and buffer
 * sizes; a device without one is rejected with a warning.  The request is an
 * RTM_NEWTCLASS message of kind "htb" whose nested options carry the
 * 'struct tc_htb_opt' plus the rate and ceil tables.  A failed transaction is
 * logged together with all class parameters. */
2408 htb_setup_class__(struct netdev *netdev, unsigned int handle,
2409 unsigned int parent, struct htb_class *class)
2412 struct tc_htb_opt opt;
2413 struct ofpbuf request;
2414 struct tcmsg *tcmsg;
2418 error = netdev_get_mtu(netdev, &mtu);
2420 VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU",
2421 netdev_get_name(netdev));
2425 memset(&opt, 0, sizeof opt);
2426 tc_fill_rate(&opt.rate, class->min_rate, mtu);
2427 tc_fill_rate(&opt.ceil, class->max_rate, mtu);
/* The same 'burst' is used for both the rate and the ceil buffer. */
2428 opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
2429 opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
2430 opt.prio = class->priority;
2432 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
2436 tcmsg->tcm_handle = handle;
2437 tcmsg->tcm_parent = parent;
2439 nl_msg_put_string(&request, TCA_KIND, "htb");
2440 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2441 nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
2442 tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
2443 tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
2444 nl_msg_end_nested(&request, opt_offset);
2446 error = tc_transact(&request, NULL);
2448 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
2449 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
2450 netdev_get_name(netdev),
2451 tc_get_major(handle), tc_get_minor(handle),
2452 tc_get_major(parent), tc_get_minor(parent),
2453 class->min_rate, class->max_rate,
2454 class->burst, class->priority, strerror(error));
/* Summary of the visible code: the nested attributes are validated against a
 * policy that requires TCA_HTB_PARMS to be present and at least as large as
 * 'struct tc_htb_opt'; a parse failure is logged.  The rate, ceil and priority
 * are copied from that structure and the burst is derived from its 'buffer'
 * field with tc_ticks_to_bytes(). */
2459 /* Parses Netlink attributes in 'nl_options' for HTB parameters and stores a
2460 * description of them into '*class'. The description complies with the
2461 * specification given in the vswitch database documentation for linux-htb
2464 htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
2466 static const struct nl_policy tca_htb_policy[] = {
2467 [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
2468 .min_len = sizeof(struct tc_htb_opt) },
2471 struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
2472 const struct tc_htb_opt *htb;
2474 if (!nl_parse_nested(nl_options, tca_htb_policy,
2475 attrs, ARRAY_SIZE(tca_htb_policy))) {
2476 VLOG_WARN_RL(&rl, "failed to parse HTB class options");
2480 htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
2481 class->min_rate = htb->rate.rate;
2482 class->max_rate = htb->ceil.rate;
2483 class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
2484 class->priority = htb->prio;
/* Parses the traffic-class message 'tcmsg' with tc_parse_class(), which also
 * receives 'stats'.  When 'queue_id' is non-null, a handle with major number 1
 * and a minor number in 1..HTB_N_QUEUES yields '*queue_id' = minor - 1 (the
 * handling of other handles is on lines not visible in this listing).  When
 * 'options' is non-null, the HTB options are parsed into it. */
2489 htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2490 struct htb_class *options,
2491 struct netdev_queue_stats *stats)
2493 struct nlattr *nl_options;
2494 unsigned int handle;
2497 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2498 if (!error && queue_id) {
2499 unsigned int major = tc_get_major(handle);
2500 unsigned int minor = tc_get_minor(handle);
2501 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2502 *queue_id = minor - 1;
2507 if (!error && options) {
2508 error = htb_parse_tca_options__(nl_options, options);
/* Fills in '*hc' for the qdisc-level HTB class of 'netdev' from 'details'.
 * The "max-rate" detail is parsed as a decimal number and divided by 8 (the
 * htb_class rate members are documented as bytes/s, so the detail is
 * presumably given in bits/s).  When it is absent or zero, the rate is derived
 * from the device's current link features instead.  'min_rate' is set equal to
 * 'max_rate'. */
2514 htb_parse_qdisc_details__(struct netdev *netdev,
2515 const struct shash *details, struct htb_class *hc)
2517 const char *max_rate_s;
2519 max_rate_s = shash_find_data(details, "max-rate");
2520 hc->max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
2521 if (!hc->max_rate) {
/* Fall back to the link speed: only the "current" feature bitmap is requested
 * from netdev_get_features(). */
2524 netdev_get_features(netdev, &current, NULL, NULL, NULL);
2525 hc->max_rate = netdev_features_to_bps(current) / 8;
2527 hc->min_rate = hc->max_rate;
/* Fills in '*hc' for one HTB queue of 'netdev' from the "min-rate",
 * "max-rate", "burst" and "priority" entries of 'details'.  Rates and burst
 * are parsed as decimal numbers and divided by 8.  'min_rate' is raised to at
 * least the MTU and then capped at the qdisc's 'max_rate'; 'max_rate' is
 * raised to at least 'min_rate' and capped the same way; 'burst' is raised to
 * at least MTU + 64.  A device without an MTU is rejected with a warning. */
2533 htb_parse_class_details__(struct netdev *netdev,
2534 const struct shash *details, struct htb_class *hc)
2536 const struct htb *htb = htb_get__(netdev);
2537 const char *min_rate_s = shash_find_data(details, "min-rate");
2538 const char *max_rate_s = shash_find_data(details, "max-rate");
2539 const char *burst_s = shash_find_data(details, "burst");
2540 const char *priority_s = shash_find_data(details, "priority");
2543 error = netdev_get_mtu(netdev, &mtu);
2545 VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU",
2546 netdev_get_name(netdev));
2550 /* HTB requires at least an mtu sized min-rate to send any traffic even
2551 * on uncongested links. */
2552 hc->min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
2553 hc->min_rate = MAX(hc->min_rate, mtu);
2554 hc->min_rate = MIN(hc->min_rate, htb->max_rate);
2557 hc->max_rate = (max_rate_s
2558 ? strtoull(max_rate_s, NULL, 10) / 8
2560 hc->max_rate = MAX(hc->max_rate, hc->min_rate);
2561 hc->max_rate = MIN(hc->max_rate, htb->max_rate);
2565 * According to hints in the documentation that I've read, it is important
2566 * that 'burst' be at least as big as the largest frame that might be
2567 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
2568 * but having it a bit too small is a problem. Since netdev_get_mtu()
2569 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
2570 * the MTU. We actually add 64, instead of 14, as a guard against
2571 * additional headers get tacked on somewhere that we're not aware of. */
2572 hc->burst = burst_s ? strtoull(burst_s, NULL, 10) / 8 : 0;
2573 hc->burst = MAX(hc->burst, mtu + 64);
/* A missing "priority" entry yields priority 0. */
2576 hc->priority = priority_s ? strtoul(priority_s, NULL, 10) : 0;
/* Asks the kernel about class 'handle' with parent 'parent' on 'netdev' via
 * tc_query_class() and decodes the reply with htb_parse_tcmsg__() into
 * 'options' and 'stats'.  No queue ID is requested from the parser (NULL is
 * passed).  The reply buffer is released with ofpbuf_delete() after
 * parsing. */
2582 htb_query_class__(const struct netdev *netdev, unsigned int handle,
2583 unsigned int parent, struct htb_class *options,
2584 struct netdev_queue_stats *stats)
2586 struct ofpbuf *reply;
2589 error = tc_query_class(netdev, handle, parent, &reply);
2591 error = htb_parse_tcmsg__(reply, NULL, options, stats);
2592 ofpbuf_delete(reply);
/* Sets up HTB on 'netdev' from 'details'.
 *
 * Calls htb_setup_qdisc__(), then configures class 1:fffe under parent 1:0
 * with the parameters produced by htb_parse_qdisc_details__(), and finally
 * calls htb_install__() with the resulting 'max_rate'.
 * NOTE(review): the error checks between these steps are not visible
 * here. */
2598 htb_tc_install(struct netdev *netdev, const struct shash *details)
2602 error = htb_setup_qdisc__(netdev);
2604 struct htb_class hc;
2606 htb_parse_qdisc_details__(netdev, details, &hc);
2607 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2608 tc_make_handle(1, 0), &hc);
2610 htb_install__(netdev, hc.max_rate);
/* Returns the struct htb_class that contains 'queue' as its 'tc_queue'
 * member. */
2616 static struct htb_class *
2617 htb_class_cast__(const struct tc_queue *queue)
2619 return CONTAINER_OF(queue, struct htb_class, tc_queue);
/* Stores the parameters in '*hc' as the cached state of queue 'queue_id' on
 * 'netdev'.
 *
 * The queue is looked up with tc_find_queue__() using hash_int(queue_id, 0).
 * A freshly allocated htb_class gets its 'queue_id' set and is inserted into
 * htb->tc.queues under that same hash.  (Presumably the allocation happens
 * only when the lookup finds nothing; the branch lines are not visible
 * here.)  Finally min_rate, max_rate, burst and priority are copied from
 * '*hc'. */
2623 htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
2624 const struct htb_class *hc)
2626 struct htb *htb = htb_get__(netdev);
2627 size_t hash = hash_int(queue_id, 0);
2628 struct tc_queue *queue;
2629 struct htb_class *hcp;
2631 queue = tc_find_queue__(netdev, queue_id, hash);
2633 hcp = htb_class_cast__(queue);
2635 hcp = xmalloc(sizeof *hcp);
2636 queue = &hcp->tc_queue;
2637 queue->queue_id = queue_id;
2638 hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
2641 hcp->min_rate = hc->min_rate;
2642 hcp->max_rate = hc->max_rate;
2643 hcp->burst = hc->burst;
2644 hcp->priority = hc->priority;
/* Rebuilds the cached HTB state of 'netdev' from the kernel.  'nlmsg' is
 * unused.
 *
 * Class 1:fffe is queried for the qdisc-wide 'max_rate', which is passed to
 * htb_install__().  Then every message of the queue dump that
 * htb_parse_tcmsg__() parses without error is recorded with
 * htb_update_queue__(); messages that fail to parse are skipped.
 *
 * NOTE(review): the return value of htb_query_class__() is ignored on the
 * line below -- confirm that 'hc.max_rate' has a defined value when the
 * query fails. */
2648 htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
2651 struct nl_dump dump;
2652 struct htb_class hc;
2654 /* Get qdisc options. */
2656 htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
2657 htb_install__(netdev, hc.max_rate);
2660 if (!start_queue_dump(netdev, &dump)) {
2663 while (nl_dump_next(&dump, &msg)) {
2664 unsigned int queue_id;
2666 if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
2667 htb_update_queue__(netdev, queue_id, &hc);
2670 nl_dump_done(&dump);
/* Tears down the struct htb that contains 'tc'.
 *
 * Every htb_class is removed from htb->tc.queues; the _SAFE iterator is used
 * because nodes are unlinked while iterating.
 * NOTE(review): the statements that release each class and 'htb' itself are
 * not visible here. */
2676 htb_tc_destroy(struct tc *tc)
2678 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
2679 struct htb_class *hc, *next;
2681 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &htb->tc.queues) {
2682 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
/* Reports the qdisc-wide configuration of 'netdev' in 'details': adds
 * "max-rate" as the decimal value of 8 * htb->max_rate (the inverse of the
 * division by 8 applied when the setting is parsed). */
2690 htb_qdisc_get(const struct netdev *netdev, struct shash *details)
2692 const struct htb *htb = htb_get__(netdev);
2693 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * htb->max_rate));
/* Applies new qdisc-level 'details' to 'netdev': reconfigures class 1:fffe
 * (parent 1:0) with the parsed parameters and stores the new 'max_rate' in
 * the cached HTB state.
 * NOTE(review): the check that guards the final assignment is not visible
 * here. */
2698 htb_qdisc_set(struct netdev *netdev, const struct shash *details)
2700 struct htb_class hc;
2703 htb_parse_qdisc_details__(netdev, details, &hc);
2704 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2705 tc_make_handle(1, 0), &hc);
2707 htb_get__(netdev)->max_rate = hc.max_rate;
/* Reports the cached configuration of 'queue' in 'details'.  'netdev' is
 * unused.
 *
 * "min-rate", "max-rate" and "burst" are written as 8 times the stored
 * value; "max-rate" is written under a check that it differs from
 * "min-rate".  "priority" is written as an unsigned decimal.
 * NOTE(review): any conditions on the "burst" and "priority" entries are
 * not visible here. */
2713 htb_class_get(const struct netdev *netdev OVS_UNUSED,
2714 const struct tc_queue *queue, struct shash *details)
2716 const struct htb_class *hc = htb_class_cast__(queue);
2718 shash_add(details, "min-rate", xasprintf("%llu", 8ULL * hc->min_rate));
2719 if (hc->min_rate != hc->max_rate) {
2720 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hc->max_rate));
2722 shash_add(details, "burst", xasprintf("%llu", 8ULL * hc->burst));
2724 shash_add(details, "priority", xasprintf("%u", hc->priority));
/* Configures queue 'queue_id' on 'netdev' from 'details'.
 *
 * The details are parsed by htb_parse_class_details__(), the kernel class
 * 1:(queue_id + 1) is set up under parent 1:fffe, and the cached copy is
 * refreshed with htb_update_queue__().
 * NOTE(review): the error checks between these steps are not visible
 * here. */
2730 htb_class_set(struct netdev *netdev, unsigned int queue_id,
2731 const struct shash *details)
2733 struct htb_class hc;
2736 error = htb_parse_class_details__(netdev, details, &hc);
2741 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
2742 tc_make_handle(1, 0xfffe), &hc);
2747 htb_update_queue__(netdev, queue_id, &hc);
/* Deletes 'queue' from 'netdev': asks the kernel to delete class
 * 1:(queue_id + 1) with tc_delete_class() and unlinks the queue from
 * htb->tc.queues.
 * NOTE(review): the check on 'error' and the release of 'hc' are not visible
 * here. */
2752 htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
2754 struct htb_class *hc = htb_class_cast__(queue);
2755 struct htb *htb = htb_get__(netdev);
2758 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
2760 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
/* Retrieves the kernel statistics of 'queue' into '*stats' by querying class
 * 1:(queue_id + 1) with parent 1:fffe.  Class options are not requested
 * (NULL is passed).  Returns whatever htb_query_class__() returns. */
2767 htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
2768 struct netdev_queue_stats *stats)
2770 return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
2771 tc_make_handle(1, 0xfffe), NULL, stats);
/* Handles one class message 'nlmsg' of a statistics dump.  'netdev' is
 * unused.
 *
 * The handle and statistics are extracted with tc_parse_class() (options are
 * not requested).  When the handle is 1:N with 1 <= N <= HTB_N_QUEUES, 'cb'
 * is invoked with queue ID N - 1, the statistics and 'aux'; classes with any
 * other handle do not reach the callback. */
2775 htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
2776 const struct ofpbuf *nlmsg,
2777 netdev_dump_queue_stats_cb *cb, void *aux)
2779 struct netdev_queue_stats stats;
2780 unsigned int handle, major, minor;
2783 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
2788 major = tc_get_major(handle);
2789 minor = tc_get_minor(handle);
2790 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2791 (*cb)(minor - 1, &stats, aux);
/* Operations table for the HTB traffic-control class: Linux qdisc name "htb",
 * OVS name "linux-htb", 'n_queues' set to HTB_N_QUEUES.  The entries between
 * 'n_queues' and htb_class_get_stats are not visible here. */
2796 static const struct tc_ops tc_ops_htb = {
2797 "htb", /* linux_name */
2798 "linux-htb", /* ovs_name */
2799 HTB_N_QUEUES, /* n_queues */
2808 htb_class_get_stats,
2809 htb_class_dump_stats
2812 /* "linux-hfsc" traffic control class. */
/* Largest class minor number that the HFSC code maps to a queue (handle 1:N
 * is queue N - 1 for 1 <= N <= HFSC_N_QUEUES); also the 'n_queues' value in
 * tc_ops_hfsc. */
2814 #define HFSC_N_QUEUES 0xf000
/* Embedded queue node; hfsc_class_cast__() recovers the containing
 * struct hfsc_class from a pointer to this member. */
2822 struct tc_queue tc_queue;
/* Returns the struct hfsc that contains the 'tc' currently attached to
 * 'netdev''s netdev_dev_linux.
 * NOTE(review): no check of netdev_dev->tc is visible, so callers presumably
 * guarantee that an HFSC tc is installed -- confirm. */
2827 static struct hfsc *
2828 hfsc_get__(const struct netdev *netdev)
2830 struct netdev_dev_linux *netdev_dev;
2831 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
2832 return CONTAINER_OF(netdev_dev->tc, struct hfsc, tc);
/* Returns the struct hfsc_class that contains 'queue' as its 'tc_queue'
 * member. */
2835 static struct hfsc_class *
2836 hfsc_class_cast__(const struct tc_queue *queue)
2838 return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
/* Creates the cached HFSC state for 'netdev': allocates a struct hfsc,
 * initializes its embedded tc with tc_ops_hfsc, records 'max_rate' and
 * attaches the tc to 'netdev''s netdev_dev_linux. */
2842 hfsc_install__(struct netdev *netdev, uint32_t max_rate)
2844 struct netdev_dev_linux * netdev_dev;
2847 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
2848 hfsc = xmalloc(sizeof *hfsc);
2849 tc_init(&hfsc->tc, &tc_ops_hfsc);
2850 hfsc->max_rate = max_rate;
2851 netdev_dev->tc = &hfsc->tc;
/* Stores the rates in '*hc' as the cached state of queue 'queue_id' on
 * 'netdev'.
 *
 * The queue is looked up with tc_find_queue__() using hash_int(queue_id, 0).
 * A freshly allocated hfsc_class gets its 'queue_id' set and is inserted into
 * hfsc->tc.queues under that same hash.  (Presumably the allocation happens
 * only when the lookup finds nothing; the branch lines are not visible
 * here.)  Finally min_rate and max_rate are copied from '*hc'. */
2855 hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
2856 const struct hfsc_class *hc)
2860 struct hfsc_class *hcp;
2861 struct tc_queue *queue;
2863 hfsc = hfsc_get__(netdev);
2864 hash = hash_int(queue_id, 0);
2866 queue = tc_find_queue__(netdev, queue_id, hash);
2868 hcp = hfsc_class_cast__(queue);
2870 hcp = xmalloc(sizeof *hcp);
2871 queue = &hcp->tc_queue;
2872 queue->queue_id = queue_id;
2873 hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
2876 hcp->min_rate = hc->min_rate;
2877 hcp->max_rate = hc->max_rate;
/* Decodes the nested HFSC class options 'nl_options' into '*class'.
 *
 * The policy has three NL_A_UNSPEC entries, each at least
 * sizeof(struct tc_service_curve) long; the parsed attributes are read as
 * TCA_HFSC_RSC, TCA_HFSC_FSC and TCA_HFSC_USC.  A configuration is rejected,
 * with a rate-limited warning, when:
 *
 *   - the nested attributes cannot be parsed;
 *   - any curve has a nonzero 'm1' or 'd' (non-linear curve);
 *   - rsc->m2 differs from fsc->m2;
 *   - rsc->m2 is greater than usc->m2.
 *
 * Otherwise 'min_rate' is taken from fsc->m2 and 'max_rate' from usc->m2.
 * NOTE(review): the values returned on each path are not visible here. */
2881 hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
2883 const struct tc_service_curve *rsc, *fsc, *usc;
2884 static const struct nl_policy tca_hfsc_policy[] = {
2886 .type = NL_A_UNSPEC,
2888 .min_len = sizeof(struct tc_service_curve),
2891 .type = NL_A_UNSPEC,
2893 .min_len = sizeof(struct tc_service_curve),
2896 .type = NL_A_UNSPEC,
2898 .min_len = sizeof(struct tc_service_curve),
2901 struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];
2903 if (!nl_parse_nested(nl_options, tca_hfsc_policy,
2904 attrs, ARRAY_SIZE(tca_hfsc_policy))) {
2905 VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
2909 rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
2910 fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
2911 usc = nl_attr_get(attrs[TCA_HFSC_USC]);
2913 if (rsc->m1 != 0 || rsc->d != 0 ||
2914 fsc->m1 != 0 || fsc->d != 0 ||
2915 usc->m1 != 0 || usc->d != 0) {
2916 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
2917 "Non-linear service curves are not supported.");
2921 if (rsc->m2 != fsc->m2) {
2922 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
2923 "Real-time service curves are not supported ");
2927 if (rsc->m2 > usc->m2) {
2928 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
2929 "Min-rate service curve is greater than "
2930 "the max-rate service curve.");
2934 class->min_rate = fsc->m2;
2935 class->max_rate = usc->m2;
/* Parses the Netlink class message 'tcmsg' with tc_parse_class(), which is
 * also handed 'stats'.
 *
 * A class handle 1:N with 1 <= N <= HFSC_N_QUEUES yields queue ID N - 1 in
 * '*queue_id'; the class options are decoded into '*options' by
 * hfsc_parse_tca_options__().
 * NOTE(review): the null checks on 'queue_id' and 'options' and the error
 * handling between the steps are not visible here. */
2940 hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2941 struct hfsc_class *options,
2942 struct netdev_queue_stats *stats)
2945 unsigned int handle;
2946 struct nlattr *nl_options;
2948 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2954 unsigned int major, minor;
2956 major = tc_get_major(handle);
2957 minor = tc_get_minor(handle);
2958 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
2959 *queue_id = minor - 1;
2966 error = hfsc_parse_tca_options__(nl_options, options);
/* Asks the kernel about class 'handle' with parent 'parent' on 'netdev' via
 * tc_query_class() and decodes the reply with hfsc_parse_tcmsg__() into
 * 'options' and 'stats'.  No queue ID is requested from the parser (NULL is
 * passed).  The reply buffer is released with ofpbuf_delete() after
 * parsing. */
2973 hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
2974 unsigned int parent, struct hfsc_class *options,
2975 struct netdev_queue_stats *stats)
2978 struct ofpbuf *reply;
2980 error = tc_query_class(netdev, handle, parent, &reply);
2985 error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
2986 ofpbuf_delete(reply);
/* Derives the parameters of the root HFSC class in '*class' from the
 * qdisc-level 'details'.
 *
 * "max-rate" is parsed as a decimal integer and divided by 8; a missing key
 * yields 0.  The fallback shown below replaces the rate with
 * netdev_features_to_bps() / 8 for the features that netdev_get_features()
 * stores in 'current' (NOTE(review): the condition guarding the fallback is
 * not visible here).  Both 'min_rate' and 'max_rate' are set to the
 * result. */
2991 hfsc_parse_qdisc_details__(struct netdev *netdev, const struct shash *details,
2992 struct hfsc_class *class)
2995 const char *max_rate_s;
2997 max_rate_s = shash_find_data(details, "max-rate");
2998 max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
3003 netdev_get_features(netdev, &current, NULL, NULL, NULL);
3004 max_rate = netdev_features_to_bps(current) / 8;
3007 class->min_rate = max_rate;
3008 class->max_rate = max_rate;
/* Computes the HFSC class rates in '*class' from the per-queue 'details'
 * keys "min-rate" and "max-rate".
 *
 * Both values are parsed as decimal integers and divided by 8.  'min_rate'
 * is raised to at least 1 and capped at the qdisc-wide hfsc->max_rate;
 * 'max_rate' is raised to at least 'min_rate' and capped the same way.
 * NOTE(review): the value used for 'max_rate' when "max-rate" is absent is
 * on a line not visible here. */
3012 hfsc_parse_class_details__(struct netdev *netdev,
3013 const struct shash *details,
3014 struct hfsc_class * class)
3016 const struct hfsc *hfsc;
3017 uint32_t min_rate, max_rate;
3018 const char *min_rate_s, *max_rate_s;
3020 hfsc = hfsc_get__(netdev);
3021 min_rate_s = shash_find_data(details, "min-rate");
3022 max_rate_s = shash_find_data(details, "max-rate");
3024 min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
3025 min_rate = MAX(min_rate, 1);
3026 min_rate = MIN(min_rate, hfsc->max_rate);
3028 max_rate = (max_rate_s
3029 ? strtoull(max_rate_s, NULL, 10) / 8
3031 max_rate = MAX(max_rate, min_rate);
3032 max_rate = MIN(max_rate, hfsc->max_rate);
3034 class->min_rate = min_rate;
3035 class->max_rate = max_rate;
3040 /* Create an HFSC qdisc.
3042 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
/* Any existing root qdisc is first removed with tc_del_qdisc().  The request
 * is an RTM_NEWQDISC with NLM_F_EXCL | NLM_F_CREATE for handle 1:0 at
 * TC_H_ROOT, kind "hfsc", whose TCA_OPTIONS payload is a struct tc_hfsc_qopt
 * that is zeroed with memset() (any field assigned afterwards is on a line
 * not visible here).  Returns the result of tc_transact(). */
3044 hfsc_setup_qdisc__(struct netdev * netdev)
3046 struct tcmsg *tcmsg;
3047 struct ofpbuf request;
3048 struct tc_hfsc_qopt opt;
3050 tc_del_qdisc(netdev);
3052 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3053 NLM_F_EXCL | NLM_F_CREATE, &request);
3059 tcmsg->tcm_handle = tc_make_handle(1, 0);
3060 tcmsg->tcm_parent = TC_H_ROOT;
3062 memset(&opt, 0, sizeof opt);
3065 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3066 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
3068 return tc_transact(&request, NULL);
3071 /* Create an HFSC class.
3073 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
3074 * sc rate <min_rate> ul rate <max_rate>" */
/* The request is an RTM_NEWTCLASS with NLM_F_CREATE for 'handle' under
 * 'parent'.  The curve 'min' (m2 = class->min_rate) is sent as both
 * TCA_HFSC_RSC and TCA_HFSC_FSC, and 'max' (m2 = class->max_rate) as
 * TCA_HFSC_USC, all nested inside TCA_OPTIONS.  A rate-limited warning with
 * the device name, handles and rates is logged for a failed transaction
 * (the guarding check is not visible here). */
3076 hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
3077 unsigned int parent, struct hfsc_class *class)
3081 struct tcmsg *tcmsg;
3082 struct ofpbuf request;
3083 struct tc_service_curve min, max;
3085 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
3091 tcmsg->tcm_handle = handle;
3092 tcmsg->tcm_parent = parent;
3096 min.m2 = class->min_rate;
3100 max.m2 = class->max_rate;
3102 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3103 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3104 nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
3105 nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
3106 nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
3107 nl_msg_end_nested(&request, opt_offset);
3109 error = tc_transact(&request, NULL);
3111 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
3112 "min-rate %ubps, max-rate %ubps (%s)",
3113 netdev_get_name(netdev),
3114 tc_get_major(handle), tc_get_minor(handle),
3115 tc_get_major(parent), tc_get_minor(parent),
3116 class->min_rate, class->max_rate, strerror(error));
/* Sets up HFSC on 'netdev' from 'details'.
 *
 * Calls hfsc_setup_qdisc__(), then configures class 1:fffe under parent 1:0
 * with the parameters produced by hfsc_parse_qdisc_details__(), and finally
 * calls hfsc_install__() with the resulting 'max_rate'.
 * NOTE(review): the error checks between these steps are not visible
 * here. */
3123 hfsc_tc_install(struct netdev *netdev, const struct shash *details)
3126 struct hfsc_class class;
3128 error = hfsc_setup_qdisc__(netdev);
3134 hfsc_parse_qdisc_details__(netdev, details, &class);
3135 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3136 tc_make_handle(1, 0), &class);
3142 hfsc_install__(netdev, class.max_rate);
/* Rebuilds the cached HFSC state of 'netdev' from the kernel.  'nlmsg' is
 * unused.
 *
 * Class 1:fffe is queried for the qdisc-wide 'max_rate', which is passed to
 * hfsc_install__().  Then every message of the queue dump that
 * hfsc_parse_tcmsg__() parses without error is recorded with
 * hfsc_update_queue__(); messages that fail to parse are skipped.
 *
 * NOTE(review): the return value of hfsc_query_class__() is ignored on the
 * line below -- confirm that 'hc.max_rate' has a defined value when the
 * query fails. */
3147 hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3150 struct nl_dump dump;
3151 struct hfsc_class hc;
3154 hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
3155 hfsc_install__(netdev, hc.max_rate);
3157 if (!start_queue_dump(netdev, &dump)) {
3161 while (nl_dump_next(&dump, &msg)) {
3162 unsigned int queue_id;
3164 if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3165 hfsc_update_queue__(netdev, queue_id, &hc);
3169 nl_dump_done(&dump);
/* Tears down the struct hfsc that contains 'tc'.
 *
 * Every hfsc_class is removed from hfsc->tc.queues; the _SAFE iterator is
 * used because nodes are unlinked while iterating.
 * NOTE(review): the statements that release each class and 'hfsc' itself
 * are not visible here. */
3174 hfsc_tc_destroy(struct tc *tc)
3177 struct hfsc_class *hc, *next;
3179 hfsc = CONTAINER_OF(tc, struct hfsc, tc);
3181 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
3182 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
/* Reports the qdisc-wide configuration of 'netdev' in 'details': adds
 * "max-rate" as the decimal value of 8 * hfsc->max_rate (the inverse of the
 * division by 8 applied when the setting is parsed). */
3191 hfsc_qdisc_get(const struct netdev *netdev, struct shash *details)
3193 const struct hfsc *hfsc;
3194 hfsc = hfsc_get__(netdev);
3195 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hfsc->max_rate));
/* Applies new qdisc-level 'details' to 'netdev': reconfigures class 1:fffe
 * (parent 1:0) with the parsed parameters and stores the new 'max_rate' in
 * the cached HFSC state.
 * NOTE(review): the check that guards the final assignment is not visible
 * here. */
3200 hfsc_qdisc_set(struct netdev *netdev, const struct shash *details)
3203 struct hfsc_class class;
3205 hfsc_parse_qdisc_details__(netdev, details, &class);
3206 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3207 tc_make_handle(1, 0), &class);
3210 hfsc_get__(netdev)->max_rate = class.max_rate;
/* Reports the cached configuration of 'queue' in 'details'.  'netdev' is
 * unused.
 *
 * "min-rate" is written as 8 * min_rate; "max-rate" is written as
 * 8 * max_rate under a check that it differs from the minimum rate. */
3217 hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
3218 const struct tc_queue *queue, struct shash *details)
3220 const struct hfsc_class *hc;
3222 hc = hfsc_class_cast__(queue);
3223 shash_add(details, "min-rate", xasprintf("%llu", 8ULL * hc->min_rate));
3224 if (hc->min_rate != hc->max_rate) {
3225 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hc->max_rate));
/* Configures queue 'queue_id' on 'netdev' from 'details'.
 *
 * The details are parsed by hfsc_parse_class_details__(), the kernel class
 * 1:(queue_id + 1) is set up under parent 1:fffe, and the cached copy is
 * refreshed with hfsc_update_queue__().
 * NOTE(review): the error checks between these steps are not visible
 * here. */
3231 hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
3232 const struct shash *details)
3235 struct hfsc_class class;
3237 error = hfsc_parse_class_details__(netdev, details, &class);
3242 error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3243 tc_make_handle(1, 0xfffe), &class);
3248 hfsc_update_queue__(netdev, queue_id, &class);
/* Deletes 'queue' from 'netdev': asks the kernel to delete class
 * 1:(queue_id + 1) with tc_delete_class() and unlinks the queue from
 * hfsc->tc.queues.
 * NOTE(review): the check on 'error' and the release of 'hc' are not visible
 * here. */
3253 hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
3257 struct hfsc_class *hc;
3259 hc = hfsc_class_cast__(queue);
3260 hfsc = hfsc_get__(netdev);
3262 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3264 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
/* Retrieves the kernel statistics of 'queue' into '*stats' by querying class
 * 1:(queue_id + 1) with parent 1:fffe.  Class options are not requested
 * (NULL is passed).  Returns whatever hfsc_query_class__() returns. */
3271 hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3272 struct netdev_queue_stats *stats)
3274 return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3275 tc_make_handle(1, 0xfffe), NULL, stats);
/* Handles one class message 'nlmsg' of a statistics dump.  'netdev' is
 * unused.
 *
 * The handle and statistics are extracted with tc_parse_class() (options are
 * not requested).  When the handle is 1:N with 1 <= N <= HFSC_N_QUEUES, 'cb'
 * is invoked with queue ID N - 1, the statistics and 'aux'; classes with any
 * other handle do not reach the callback. */
3279 hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3280 const struct ofpbuf *nlmsg,
3281 netdev_dump_queue_stats_cb *cb, void *aux)
3283 struct netdev_queue_stats stats;
3284 unsigned int handle, major, minor;
3287 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3292 major = tc_get_major(handle);
3293 minor = tc_get_minor(handle);
3294 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3295 (*cb)(minor - 1, &stats, aux);
/* Operations table for the HFSC traffic-control class: Linux qdisc name
 * "hfsc", OVS name "linux-hfsc", 'n_queues' set to HFSC_N_QUEUES, and every
 * operation slot filled with the corresponding hfsc_*() function. */
3300 static const struct tc_ops tc_ops_hfsc = {
3301 "hfsc", /* linux_name */
3302 "linux-hfsc", /* ovs_name */
3303 HFSC_N_QUEUES, /* n_queues */
3304 hfsc_tc_install, /* tc_install */
3305 hfsc_tc_load, /* tc_load */
3306 hfsc_tc_destroy, /* tc_destroy */
3307 hfsc_qdisc_get, /* qdisc_get */
3308 hfsc_qdisc_set, /* qdisc_set */
3309 hfsc_class_get, /* class_get */
3310 hfsc_class_set, /* class_set */
3311 hfsc_class_delete, /* class_delete */
3312 hfsc_class_get_stats, /* class_get_stats */
3313 hfsc_class_dump_stats /* class_dump_stats */
3316 /* "linux-default" traffic control class.
3318 * This class represents the default, unnamed Linux qdisc. It corresponds to
3319 * the "" (empty string) QoS type in the OVS database. */
/* Attaches the "default" tc to 'netdev'.
 *
 * The struct tc is held in a function-static pointer, so the object that is
 * allocated and initialized with tc_ops_default is the one stored in every
 * netdev_dev that goes through this function.
 * NOTE(review): the guard that makes the allocation happen only once is
 * presumably on a line not visible here -- confirm. */
3322 default_install__(struct netdev *netdev)
3324 struct netdev_dev_linux *netdev_dev =
3325 netdev_dev_linux_cast(netdev_get_dev(netdev));
3326 static struct tc *tc;
3329 tc = xmalloc(sizeof *tc);
3330 tc_init(tc, &tc_ops_default);
3332 netdev_dev->tc = tc;
/* 'tc_install' operation of the default class: ignores 'details' and
 * attaches the default tc to 'netdev' via default_install__(). */
3336 default_tc_install(struct netdev *netdev,
3337 const struct shash *details OVS_UNUSED)
3339 default_install__(netdev);
/* 'tc_load' operation of the default class: ignores 'nlmsg' and attaches the
 * default tc to 'netdev' via default_install__(). */
3344 default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3346 default_install__(netdev);
/* Operations table for the default qdisc.  'linux_name' and every operation
 * from 'tc_destroy' through 'class_dump_stats' are NULL; the entries in
 * between are not visible here. */
3350 static const struct tc_ops tc_ops_default = {
3351 NULL, /* linux_name */
3356 NULL, /* tc_destroy */
3357 NULL, /* qdisc_get */
3358 NULL, /* qdisc_set */
3359 NULL, /* class_get */
3360 NULL, /* class_set */
3361 NULL, /* class_delete */
3362 NULL, /* class_get_stats */
3363 NULL /* class_dump_stats */
3366 /* "linux-other" traffic control class.
/* 'tc_load' operation of the "linux-other" class.  'nlmsg' is unused.
 *
 * The struct tc is held in a function-static pointer, so the object that is
 * allocated and initialized with tc_ops_other is the one stored in every
 * netdev_dev that goes through this function.
 * NOTE(review): the guard that makes the allocation happen only once is
 * presumably on a line not visible here -- confirm. */
3371 other_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3373 struct netdev_dev_linux *netdev_dev =
3374 netdev_dev_linux_cast(netdev_get_dev(netdev));
3375 static struct tc *tc;
3378 tc = xmalloc(sizeof *tc);
3379 tc_init(tc, &tc_ops_other);
3381 netdev_dev->tc = tc;
/* Operations table for qdiscs that are present but not otherwise handled:
 * OVS name "linux-other", no Linux name, no 'tc_install', and NULL for every
 * operation from 'tc_destroy' through 'class_dump_stats'.  The 'n_queues'
 * and 'tc_load' entries are not visible here. */
3385 static const struct tc_ops tc_ops_other = {
3386 NULL, /* linux_name */
3387 "linux-other", /* ovs_name */
3389 NULL, /* tc_install */
3391 NULL, /* tc_destroy */
3392 NULL, /* qdisc_get */
3393 NULL, /* qdisc_set */
3394 NULL, /* class_get */
3395 NULL, /* class_set */
3396 NULL, /* class_delete */
3397 NULL, /* class_get_stats */
3398 NULL /* class_dump_stats */
3401 /* Traffic control. */
3403 /* Number of kernel "tc" ticks per second. */
/* Computed as (double) a * c / b from the four values read out of
 * /proc/net/psched; used by tc_ticks_to_bytes() and tc_bytes_to_ticks(). */
3404 static double ticks_per_s;
3406 /* Number of kernel "jiffies" per second. This is used for the purpose of
3407 * computing buffer sizes. Generally kernel qdiscs need to be able to buffer
3408 * one jiffy's worth of data.
3410 * There are two possibilities here:
3412 * - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
3413 * approximate range of 100 to 1024. That means that we really need to
3414 * make sure that the qdisc can buffer that much data.
3416 * - 'buffer_hz' is an absurdly large number. That means that the kernel
3417 * has finely granular timers and there's no need to fudge additional room
3418 * for buffers. (There's no extra effort needed to implement that: the
3419 * large 'buffer_hz' is used as a divisor, so practically any number will
3420 * come out as 0 in the division. Small integer results in the case of
3421 * really high dividends won't have any real effect anyhow.)
3423 static unsigned int buffer_hz;
3425 /* Returns tc handle 'major':'minor'. */
/* 'major' is shifted into the upper 16 bits before TC_H_MAKE combines it
 * with 'minor'. */
3427 tc_make_handle(unsigned int major, unsigned int minor)
3429 return TC_H_MAKE(major << 16, minor);
3432 /* Returns the major number from 'handle'. */
/* The TC_H_MAJ() result is shifted right by 16 bits, undoing the shift that
 * tc_make_handle() applies to the major number. */
3434 tc_get_major(unsigned int handle)
3436 return TC_H_MAJ(handle) >> 16;
3439 /* Returns the minor number from 'handle'. */
/* Thin wrapper around TC_H_MIN(); no shift is applied to the result. */
3441 tc_get_minor(unsigned int handle)
3443 return TC_H_MIN(handle);
/* Starts a traffic-control Netlink request for 'netdev' in 'request'.
 *
 * 'request' is initialized as an ofpbuf with 512 bytes of initial space, a
 * Netlink header of the given 'type' with flags NLM_F_REQUEST | 'flags' is
 * appended, followed by a zero-filled struct tcmsg whose 'tcm_family' is
 * AF_UNSPEC and whose 'tcm_ifindex' is the interface index obtained from
 * get_ifindex().  The caller fills in 'tcm_handle' and 'tcm_parent'.
 *
 * NOTE(review): the return statements are not visible here; callers treat
 * the result as a pointer to the tcmsg inside 'request', and presumably a
 * null pointer signals that get_ifindex() failed -- confirm. */
3446 static struct tcmsg *
3447 tc_make_request(const struct netdev *netdev, int type, unsigned int flags,
3448 struct ofpbuf *request)
3450 struct tcmsg *tcmsg;
3454 error = get_ifindex(netdev, &ifindex);
3459 ofpbuf_init(request, 512);
3460 nl_msg_put_nlmsghdr(request, sizeof *tcmsg, type, NLM_F_REQUEST | flags);
3461 tcmsg = ofpbuf_put_zeros(request, sizeof *tcmsg);
3462 tcmsg->tcm_family = AF_UNSPEC;
3463 tcmsg->tcm_ifindex = ifindex;
3464 /* Caller should fill in tcmsg->tcm_handle. */
3465 /* Caller should fill in tcmsg->tcm_parent. */
/* Sends 'request' on 'rtnl_sock' with nl_sock_transact(), passing 'replyp'
 * through for the reply, and then releases 'request' with ofpbuf_uninit()
 * whatever the outcome.  Callers therefore must not touch 'request' again
 * after this call. */
3471 tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
3473 int error = nl_sock_transact(rtnl_sock, request, replyp);
3474 ofpbuf_uninit(request);
/* Implementation notes for tc_add_del_ingress_qdisc(), documented below:
 * the request is RTM_NEWQDISC with NLM_F_EXCL | NLM_F_CREATE when 'add' is
 * true and RTM_DELQDISC with no extra flags otherwise.  It targets handle
 * ffff:0 with parent TC_H_INGRESS, kind "ingress", and carries an empty
 * TCA_OPTIONS attribute.  When deleting, ENOENT and EINVAL from the kernel
 * are singled out for special treatment. */
3478 /* Adds or deletes a root ingress qdisc on 'netdev'. We use this for
3479 * policing configuration.
3481 * This function is equivalent to running the following when 'add' is true:
3482 * /sbin/tc qdisc add dev <devname> handle ffff: ingress
3484 * This function is equivalent to running the following when 'add' is false:
3485 * /sbin/tc qdisc del dev <devname> handle ffff: ingress
3487 * The configuration and stats may be seen with the following command:
3488 * /sbin/tc -s qdisc show dev <devname>
3490 * Returns 0 if successful, otherwise a positive errno value.
3493 tc_add_del_ingress_qdisc(struct netdev *netdev, bool add)
3495 struct ofpbuf request;
3496 struct tcmsg *tcmsg;
3498 int type = add ? RTM_NEWQDISC : RTM_DELQDISC;
3499 int flags = add ? NLM_F_EXCL | NLM_F_CREATE : 0;
3501 tcmsg = tc_make_request(netdev, type, flags, &request);
3505 tcmsg->tcm_handle = tc_make_handle(0xffff, 0);
3506 tcmsg->tcm_parent = TC_H_INGRESS;
3507 nl_msg_put_string(&request, TCA_KIND, "ingress");
3508 nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0);
3510 error = tc_transact(&request, NULL);
3512 /* If we're deleting the qdisc, don't worry about some of the
3513 * error conditions. */
3514 if (!add && (error == ENOENT || error == EINVAL)) {
/* Implementation notes for tc_add_policer(), documented below:
 *
 *   - The policer drops non-conforming packets (TC_POLICE_SHOT).
 *
 *   - The rate handed to tc_fill_rate() is 'kbits_rate' * 1000 / 8, with the
 *     multiplication done first in a 64-bit type.  Dividing first, as in
 *     'kbits_rate/8 * 1000', throws away up to 7 kbit/s of the requested
 *     rate and turns any rate below 8 kbit/s into 0.
 *
 *   - The burst is 'kbits_burst' * 1024 converted to ticks at the resulting
 *     rate by tc_bytes_to_ticks().
 *
 *   - The filter is attached to parent ffff:0 with priority 49 and protocol
 *     ETH_P_ALL, using the "basic" classifier with a nested
 *     TCA_BASIC_POLICE attribute that carries the policer and its rate
 *     table. */
3523 /* Adds a policer to 'netdev' with a rate of 'kbits_rate' and a burst size
3526 * This function is equivalent to running:
3527 * /sbin/tc filter add dev <devname> parent ffff: protocol all prio 49
3528 * basic police rate <kbits_rate>kbit burst <kbits_burst>k
3531 * The configuration and stats may be seen with the following command:
3532 * /sbin/tc -s filter show dev <devname> parent ffff:
3534 * Returns 0 if successful, otherwise a positive errno value.
3537 tc_add_policer(struct netdev *netdev, int kbits_rate, int kbits_burst)
3539 struct tc_police tc_police;
3540 struct ofpbuf request;
3541 struct tcmsg *tcmsg;
3542 size_t basic_offset;
3543 size_t police_offset;
3547 memset(&tc_police, 0, sizeof tc_police);
3548 tc_police.action = TC_POLICE_SHOT;
3549 tc_police.mtu = mtu;
3550 tc_fill_rate(&tc_police.rate, (unsigned long long int) kbits_rate * 1000 / 8, mtu);
3551 tc_police.burst = tc_bytes_to_ticks(tc_police.rate.rate,
3552 kbits_burst * 1024);
3554 tcmsg = tc_make_request(netdev, RTM_NEWTFILTER,
3555 NLM_F_EXCL | NLM_F_CREATE, &request);
3559 tcmsg->tcm_parent = tc_make_handle(0xffff, 0);
3560 tcmsg->tcm_info = tc_make_handle(49,
3561 (OVS_FORCE uint16_t) htons(ETH_P_ALL));
3563 nl_msg_put_string(&request, TCA_KIND, "basic");
3564 basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3565 police_offset = nl_msg_start_nested(&request, TCA_BASIC_POLICE);
3566 nl_msg_put_unspec(&request, TCA_POLICE_TBF, &tc_police, sizeof tc_police);
3567 tc_put_rtab(&request, TCA_POLICE_RATE, &tc_police.rate);
3568 nl_msg_end_nested(&request, police_offset);
3569 nl_msg_end_nested(&request, basic_offset);
3571 error = tc_transact(&request, NULL);
/* Summary of the code that follows the long comment below: four hexadecimal
 * values a, b, c and d are read from /proc/net/psched with fscanf().
 * 'ticks_per_s' is computed as (double) a * c / b.  Warnings are logged when
 * the file cannot be opened, when fewer than four values are read, and for
 * parameters reported as invalid or unexpected; the raw values and the
 * derived 'ticks_per_s' and 'buffer_hz' are logged at debug level.
 * NOTE(review): the function's signature, the validity checks and the
 * assignment to 'buffer_hz' are on lines not visible here. */
3582 /* The values in psched are not individually very meaningful, but they are
3583 * important. The tables below show some values seen in the wild.
3587 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
3588 * (Before that, there are hints that it was 1000000000.)
3590 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
3594 * -----------------------------------
3595 * [1] 000c8000 000f4240 000f4240 00000064
3596 * [2] 000003e8 00000400 000f4240 3b9aca00
3597 * [3] 000003e8 00000400 000f4240 3b9aca00
3598 * [4] 000003e8 00000400 000f4240 00000064
3599 * [5] 000003e8 00000040 000f4240 3b9aca00
3600 * [6] 000003e8 00000040 000f4240 000000f9
3602 * a b c d ticks_per_s buffer_hz
3603 * ------- --------- ---------- ------------- ----------- -------------
3604 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
3605 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3606 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3607 * [4] 1,000 1,024 1,000,000 100 976,562 100
3608 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
3609 * [6] 1,000 64 1,000,000 249 15,625,000 249
3611 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
3612 * [2] 2.6.26-1-686-bigmem from Debian lenny
3613 * [3] 2.6.26-2-sparc64 from Debian lenny
3614 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
3615 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
3616 * [6] 2.6.34 from kernel.org on KVM
3618 static const char fn[] = "/proc/net/psched";
3619 unsigned int a, b, c, d;
3625 stream = fopen(fn, "r");
3627 VLOG_WARN("%s: open failed: %s", fn, strerror(errno));
3631 if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
3632 VLOG_WARN("%s: read failed", fn);
3636 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
3640 VLOG_WARN("%s: invalid scheduler parameters", fn);
3644 ticks_per_s = (double) a * c / b;
3648 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
3651 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);
3654 /* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
3655 * rate of 'rate' bytes per second. */
/* The product 'rate' * 'ticks' is formed in 'unsigned long long int', the
 * same widening that tc_bytes_to_ticks() uses.  In plain 'unsigned int' the
 * product wraps around as soon as it reaches 2**32 (for example 125,000,000
 * bytes/s over 40 ticks), which would silently yield a far too small byte
 * count.  The division by the double 'ticks_per_s' is done in floating
 * point. */
3657 tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
3662 return ((unsigned long long int) rate * ticks) / ticks_per_s;
3665 /* Returns the number of ticks that it would take to transmit 'size' bytes at a
3666 * rate of 'rate' bytes per second. */
/* Returns 0 when 'rate' is 0 instead of dividing by zero.  The cast applies
 * to 'ticks_per_s' alone, so its fractional part is discarded before the
 * multiplication and the whole computation is done in
 * 'unsigned long long int'. */
3668 tc_bytes_to_ticks(unsigned int rate, unsigned int size)
3673 return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
3676 /* Returns the number of bytes that need to be reserved for qdisc buffering at
3677 * a transmission rate of 'rate' bytes per second. */
/* Computed as the integer quotient 'rate' / 'buffer_hz', so a 'buffer_hz'
 * larger than 'rate' gives 0.
 * NOTE(review): this divides by 'buffer_hz'; the code that ensures it has
 * been initialized to a nonzero value is not visible here -- confirm. */
3679 tc_buffer_per_jiffy(unsigned int rate)
3684 return rate / buffer_hz;
3687 /* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
3688 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
3689 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
3690 * stores NULL into it if it is absent.
3692 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
3695 * Returns 0 if successful, otherwise a positive errno value. */
/* Attributes are parsed starting after the Netlink header and the struct
 * tcmsg.  The policy requires a TCA_KIND string and accepts an optional
 * nested TCA_OPTIONS.  A rate-limited warning is logged when the message
 * does not satisfy the policy. */
3697 tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
3698 struct nlattr **options)
3700 static const struct nl_policy tca_policy[] = {
3701 [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
3702 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
3704 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3706 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3707 tca_policy, ta, ARRAY_SIZE(ta))) {
3708 VLOG_WARN_RL(&rl, "failed to parse qdisc message");
3713 *kind = nl_attr_get_string(ta[TCA_KIND]);
3717 *options = ta[TCA_OPTIONS];
/* NOTE(review): the description below still speaks of a queue ID stored in
 * '*queue_id', but the parameter is named 'handlep' and receives the whole
 * 'tcm_handle' of the class; callers derive the queue ID from the handle's
 * minor number themselves.
 *
 * Both TCA_OPTIONS and TCA_STATS2 are required nested attributes.  Within
 * TCA_STATS2, TCA_STATS_BASIC and TCA_STATS_QUEUE are required; 'tx_bytes'
 * and 'tx_packets' come from the former and 'tx_errors' from the 'drops'
 * field of the latter.  '*stats' is also cleared with memset() on a path
 * whose condition is not visible here (presumably the failure path). */
3732 /* Given Netlink 'msg' that describes a class, extracts the queue ID (e.g. the
3733 * minor number of its class ID) into '*queue_id', its TCA_OPTIONS attribute
3734 * into '*options', and its queue statistics into '*stats'. Any of the output
3735 * arguments may be null.
3737 * Returns 0 if successful, otherwise a positive errno value. */
3739 tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
3740 struct nlattr **options, struct netdev_queue_stats *stats)
3742 static const struct nl_policy tca_policy[] = {
3743 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
3744 [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
3746 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3748 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3749 tca_policy, ta, ARRAY_SIZE(ta))) {
3750 VLOG_WARN_RL(&rl, "failed to parse class message");
3755 struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
3756 *handlep = tc->tcm_handle;
3760 *options = ta[TCA_OPTIONS];
3764 const struct gnet_stats_queue *gsq;
3765 struct gnet_stats_basic gsb;
3767 static const struct nl_policy stats_policy[] = {
3768 [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
3769 .min_len = sizeof gsb },
3770 [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
3771 .min_len = sizeof *gsq },
3773 struct nlattr *sa[ARRAY_SIZE(stats_policy)];
3775 if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
3776 sa, ARRAY_SIZE(sa))) {
3777 VLOG_WARN_RL(&rl, "failed to parse class stats");
3781 /* Alignment issues screw up the length of struct gnet_stats_basic on
3782 * some arch/bitsize combinations. Newer versions of Linux have a
3783 * struct gnet_stats_basic_packed, but we can't depend on that. The
3784 * easiest thing to do is just to make a copy. */
3785 memset(&gsb, 0, sizeof gsb);
3786 memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
3787 MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
3788 stats->tx_bytes = gsb.bytes;
3789 stats->tx_packets = gsb.packets;
3791 gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
3792 stats->tx_errors = gsq->drops;
3802 memset(stats, 0, sizeof *stats);
/* Implementation notes for tc_query_class(), documented below: the request
 * is an RTM_GETTCLASS with NLM_F_ECHO carrying 'handle' and 'parent'; the
 * kernel's reply is handed back through '*replyp' by tc_transact().  A
 * rate-limited warning naming the device, the class and its parent is logged
 * for a failed query (the guarding check is not visible here). */
3807 /* Queries the kernel for class with identifier 'handle' and parent 'parent'
3810 tc_query_class(const struct netdev *netdev,
3811 unsigned int handle, unsigned int parent,
3812 struct ofpbuf **replyp)
3814 struct ofpbuf request;
3815 struct tcmsg *tcmsg;
3818 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO, &request);
3822 tcmsg->tcm_handle = handle;
3823 tcmsg->tcm_parent = parent;
3825 error = tc_transact(&request, replyp);
3827 VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
3828 netdev_get_name(netdev),
3829 tc_get_major(handle), tc_get_minor(handle),
3830 tc_get_major(parent), tc_get_minor(parent),
3836 /* Equivalent to "tc class del dev <name> handle <handle>". */
/* The request is an RTM_DELTCLASS with no extra flags, 'tcm_handle' set to
 * 'handle' and 'tcm_parent' set to 0; no reply is requested.  A rate-limited
 * warning naming the device and the class is logged for a failed deletion
 * (the guarding check is not visible here). */
3838 tc_delete_class(const struct netdev *netdev, unsigned int handle)
3840 struct ofpbuf request;
3841 struct tcmsg *tcmsg;
3844 tcmsg = tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
3848 tcmsg->tcm_handle = handle;
3849 tcmsg->tcm_parent = 0;
3851 error = tc_transact(&request, NULL);
3853 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
3854 netdev_get_name(netdev),
3855 tc_get_major(handle), tc_get_minor(handle),
3861 /* Equivalent to "tc qdisc del dev <name> root". */
/* The request is an RTM_DELQDISC for handle 1:0 with parent TC_H_ROOT.
 * EINVAL from the kernel gets special treatment, as explained in the inline
 * comment.  When 'error' is zero and a tc is cached for the device, the tc
 * is destroyed through its 'tc_destroy' operation (if it has one) and the
 * cached pointer is cleared. */
3863 tc_del_qdisc(struct netdev *netdev)
3865 struct netdev_dev_linux *netdev_dev =
3866 netdev_dev_linux_cast(netdev_get_dev(netdev));
3867 struct ofpbuf request;
3868 struct tcmsg *tcmsg;
3871 tcmsg = tc_make_request(netdev, RTM_DELQDISC, 0, &request);
3875 tcmsg->tcm_handle = tc_make_handle(1, 0);
3876 tcmsg->tcm_parent = TC_H_ROOT;
3878 error = tc_transact(&request, NULL);
3879 if (error == EINVAL) {
3880 /* EINVAL probably means that the default qdisc was in use, in which
3881 * case we've accomplished our purpose. */
3884 if (!error && netdev_dev->tc) {
3885 if (netdev_dev->tc->ops->tc_destroy) {
3886 netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
3888 netdev_dev->tc = NULL;
3893 /* If 'netdev''s qdisc type and parameters are not yet known, queries the
3894 * kernel to determine what they are. Returns 0 if successful, otherwise a
3895 * positive errno value. */
/* Outline:
 *
 *   1. A cached netdev_dev->tc is checked first (what happens in that case
 *      is on lines not visible here).
 *
 *   2. An RTM_GETQDISC with NLM_F_ECHO is sent for handle 1:0, parent 0.
 *
 *   3. The tc_ops to use are chosen from the outcome: the ops found by
 *      tc_lookup_linux_name() for the qdisc kind reported by
 *      tc_parse_qdisc(), with tc_ops_other as the fallback on the parse and
 *      lookup paths; tc_ops_default when the kernel answers ENOENT;
 *      tc_ops_other, after a warning, for any other error.
 *
 *   4. The chosen ops' 'tc_load' is called.  The assertion requires that
 *      netdev_dev->tc is set exactly when loading succeeded.  The reply
 *      buffer is then deleted.
 *
 * The query error takes precedence over 'load_error' in the return value. */
3897 tc_query_qdisc(const struct netdev *netdev)
3899 struct netdev_dev_linux *netdev_dev =
3900 netdev_dev_linux_cast(netdev_get_dev(netdev));
3901 struct ofpbuf request, *qdisc;
3902 const struct tc_ops *ops;
3903 struct tcmsg *tcmsg;
3907 if (netdev_dev->tc) {
3911 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
3912 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
3913 * 2.6.35 without that fix backported to it.
3915 * To avoid the OOPS, we must not make a request that would attempt to dump
3916 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
3917 * few others. There are a few ways that I can see to do this, but most of
3918 * them seem to be racy (and if you lose the race the kernel OOPSes). The
3919 * technique chosen here is to assume that any non-default qdisc that we
3920 * create will have a class with handle 1:0. The built-in qdiscs only have
3921 * a class with handle 0:0.
3923 * We could check for Linux 2.6.35+ and use a more straightforward method
3925 tcmsg = tc_make_request(netdev, RTM_GETQDISC, NLM_F_ECHO, &request);
3929 tcmsg->tcm_handle = tc_make_handle(1, 0);
3930 tcmsg->tcm_parent = 0;
3932 /* Figure out what tc class to instantiate. */
3933 error = tc_transact(&request, &qdisc);
3937 error = tc_parse_qdisc(qdisc, &kind, NULL);
3939 ops = &tc_ops_other;
3941 ops = tc_lookup_linux_name(kind);
3943 static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
3944 VLOG_INFO_RL(&rl2, "unknown qdisc \"%s\"", kind);
3946 ops = &tc_ops_other;
3949 } else if (error == ENOENT) {
3950 /* Either it's a built-in qdisc, or it's a qdisc set up by some
3951 * other entity that doesn't have a handle 1:0. We will assume
3952 * that it's the system default qdisc. */
3953 ops = &tc_ops_default;
3956 /* Who knows? Maybe the device got deleted. */
3957 VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
3958 netdev_get_name(netdev), strerror(error));
3959 ops = &tc_ops_other;
3962 /* Instantiate it. */
3963 load_error = ops->tc_load((struct netdev *) netdev, qdisc);
3964 assert((load_error == 0) == (netdev_dev->tc != NULL));
3965 ofpbuf_delete(qdisc);
3967 return error ? error : load_error;
3970 /* Linux traffic control uses tables with 256 entries ("rtab" tables) to
3971 approximate the time to transmit packets of various lengths. For an MTU of
3972 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
3973 represents two possible packet lengths; for a MTU of 513 through 1024, four
3974 possible lengths; and so on.
3976 Returns, for the specified 'mtu', the number of bits that packet lengths
3977 need to be shifted right to fit within such a 256-entry table. */
3979 tc_calc_cell_log(unsigned int mtu)
/* Fallback value for 'mtu'.  NOTE(review): the condition that selects it is
 * not shown here; presumably it applies when the caller passes 0 -- confirm. */
3984 mtu = ETH_PAYLOAD_MAX;
/* Work with the size including the Ethernet and VLAN header lengths rather
 * than the bare MTU. */
3986 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
/* Count iterations until the size drops below 256.  NOTE(review): the loop
 * body is not shown here; presumably it halves 'mtu' each time -- confirm. */
3988 for (cell_log = 0; mtu >= 256; cell_log++) {
3995 /* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
 * of 'mtu'. */
/* Clears '*rate', then sets its 'cell_log' from tc_calc_cell_log(mtu) and its
 * 'mpu' to ETH_TOTAL_MIN.  The 'overhead' and 'cell_align' members are only
 * zeroed by the memset(); their explicit assignments are commented out.
 *
 * NOTE(review): the statement that stores 'Bps' into '*rate' is not shown
 * here -- confirm against the complete file. */
3998 tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
/* Start from all-zero so that members not assigned below are well defined. */
4000 memset(rate, 0, sizeof *rate);
4001 rate->cell_log = tc_calc_cell_log(mtu);
4002 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
4003 /* rate->cell_align = 0; */ /* distro headers. */
4004 rate->mpu = ETH_TOTAL_MIN;
4008 /* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
4009 * attribute of the specified "type".
4011 * See tc_calc_cell_log() above for a description of "rtab"s. */
4013 tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
/* Reserve TC_RTAB_SIZE bytes of attribute payload in 'msg'; the entries are
 * written in place by the loop below. */
4018 rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
/* The number of entries is TC_RTAB_SIZE divided by the size of one entry. */
4019 for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
/* Entry 'i' stands for a packet of (i + 1) << cell_log bytes, raised to
 * rate->mpu when it would otherwise be smaller. */
4020 unsigned packet_size = (i + 1) << rate->cell_log;
4021 if (packet_size < rate->mpu) {
4022 packet_size = rate->mpu;
/* Store what tc_bytes_to_ticks() yields for that size at rate->rate. */
4024 rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
4028 /* Calculates the proper value of 'buffer' or 'cbuffer' in HTB options given a
4029 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
4030 * burst size of 'burst_bytes'. (If no value was requested, a 'burst_bytes' of
 * 0 is fine.) */
/* Returns tc_bytes_to_ticks() applied to 'Bps' and the effective burst size,
 * where the effective burst is the larger of 'burst_bytes' and a floor of
 * tc_buffer_per_jiffy(Bps) + 'mtu'. */
4033 tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
/* Lower bound for the burst: the per-jiffy buffer for this rate plus one
 * 'mtu'. */
4035 unsigned int min_burst = tc_buffer_per_jiffy(Bps) + mtu;
/* A 'burst_bytes' below the floor (including 0) is replaced by the floor. */
4036 return tc_bytes_to_ticks(Bps, MAX(burst_bytes, min_burst));
4039 /* Linux-only functions declared in netdev-linux.h */
4041 /* Returns a fd for an AF_INET socket or a negative errno value. */
4043 netdev_linux_get_af_inet_sock(void)
/* Run netdev_linux_init() before handing out the file-level 'af_inet_sock';
 * a nonzero result is reported to the caller instead of the socket. */
4045 int error = netdev_linux_init();
/* The init result is negated to form the error return value. */
4046 return error ? -error : af_inet_sock;
4049 /* Modifies the 'flag' bit in ethtool's flags field for 'netdev'. If
4050 * 'enable' is true, the bit is set. Otherwise, it is cleared. */
/* Works as read-modify-write-verify: the current flags are fetched with
 * ETHTOOL_GFLAGS, the updated value is written with ETHTOOL_SFLAGS, and the
 * flags are fetched again to check that the device accepted the new value.
 * 'flag_name' is used only in the warning message.
 *
 * NOTE(review): the checks of 'error' after each ethtool call and the return
 * statements are not shown here -- confirm against the complete file. */
4052 netdev_linux_ethtool_set_flag(struct netdev *netdev, uint32_t flag,
4053 const char *flag_name, bool enable)
4055 const char *netdev_name = netdev_get_name(netdev);
4056 struct ethtool_value evalue;
/* Step 1: read the current flags.  netdev_linux_do_ethtool() is declared to
 * take a 'struct ethtool_cmd *', so the ethtool_value is passed through a
 * cast. */
4060 memset(&evalue, 0, sizeof evalue);
4061 error = netdev_linux_do_ethtool(netdev_name,
4062 (struct ethtool_cmd *)&evalue,
4063 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
/* Step 2: clear 'flag', set it again only if 'enable', and write the result
 * back.  The intended value is remembered in 'new_flags'. */
4068 evalue.data = new_flags = (evalue.data & ~flag) | (enable ? flag : 0);
4069 error = netdev_linux_do_ethtool(netdev_name,
4070 (struct ethtool_cmd *)&evalue,
4071 ETHTOOL_SFLAGS, "ETHTOOL_SFLAGS");
/* Step 3: read the flags back into a freshly zeroed structure. */
4076 memset(&evalue, 0, sizeof evalue);
4077 error = netdev_linux_do_ethtool(netdev_name,
4078 (struct ethtool_cmd *)&evalue,
4079 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
/* A mismatch means the value read back is not the value that was written. */
4084 if (new_flags != evalue.data) {
4085 VLOG_WARN_RL(&rl, "attempt to %s ethtool %s flag on network "
4086 "device %s failed", enable ? "enable" : "disable",
4087 flag_name, netdev_name);
4094 /* Utility functions. */
4096 /* Copies 'src' into 'dst', performing format conversion in the process. */
/* Every counter is assigned member by member between fields of the same
 * name.  Only the members listed below are written; any other member of
 * '*dst' is left as the caller provided it.
 *
 * NOTE(review): the "format conversion" is presumably the implicit
 * integer conversion performed by each assignment (struct netdev_stats
 * members are read with SCNu64 elsewhere in this file) -- confirm the
 * member widths of struct rtnl_link_stats. */
4098 netdev_stats_from_rtnl_link_stats(struct netdev_stats *dst,
4099 const struct rtnl_link_stats *src)
4101 dst->rx_packets = src->rx_packets;
4102 dst->tx_packets = src->tx_packets;
4103 dst->rx_bytes = src->rx_bytes;
4104 dst->tx_bytes = src->tx_bytes;
4105 dst->rx_errors = src->rx_errors;
4106 dst->tx_errors = src->tx_errors;
4107 dst->rx_dropped = src->rx_dropped;
4108 dst->tx_dropped = src->tx_dropped;
4109 dst->multicast = src->multicast;
4110 dst->collisions = src->collisions;
4111 dst->rx_length_errors = src->rx_length_errors;
4112 dst->rx_over_errors = src->rx_over_errors;
4113 dst->rx_crc_errors = src->rx_crc_errors;
4114 dst->rx_frame_errors = src->rx_frame_errors;
4115 dst->rx_fifo_errors = src->rx_fifo_errors;
4116 dst->rx_missed_errors = src->rx_missed_errors;
4117 dst->tx_aborted_errors = src->tx_aborted_errors;
4118 dst->tx_carrier_errors = src->tx_carrier_errors;
4119 dst->tx_fifo_errors = src->tx_fifo_errors;
4120 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
4121 dst->tx_window_errors = src->tx_window_errors;
/* Fetches the counters of the interface whose index is 'ifindex' with an
 * RTM_GETLINK transaction on 'rtnl_sock' and stores them in '*stats' by way
 * of netdev_stats_from_rtnl_link_stats().
 *
 * The reply is validated against 'rtnlgrp_link_policy'.  IFLA_STATS is marked
 * optional in that policy, so a reply without it passes parsing and is
 * caught by the separate check that logs "RTM_GETLINK reply lacks stats".
 * The reply buffer is released with ofpbuf_delete() on each path shown.
 *
 * NOTE(review): the check of 'error' after the transaction and the return
 * statements are not shown here -- confirm against the complete file. */
4125 get_stats_via_netlink(int ifindex, struct netdev_stats *stats)
4127 /* Policy for RTNLGRP_LINK messages.
4129 * There are *many* more fields in these messages, but currently we only
4130 * care about these fields. */
4131 static const struct nl_policy rtnlgrp_link_policy[] = {
4132 [IFLA_IFNAME] = { .type = NL_A_STRING, .optional = false },
4133 [IFLA_STATS] = { .type = NL_A_UNSPEC, .optional = true,
4134 .min_len = sizeof(struct rtnl_link_stats) },
4137 struct ofpbuf request;
4138 struct ofpbuf *reply;
4139 struct ifinfomsg *ifi;
4140 struct nlattr *attrs[ARRAY_SIZE(rtnlgrp_link_policy)];
/* Build the request: a Netlink header followed by a zeroed ifinfomsg in which
 * only the address family and the interface index are filled in. */
4143 ofpbuf_init(&request, 0);
4144 nl_msg_put_nlmsghdr(&request, sizeof *ifi, RTM_GETLINK, NLM_F_REQUEST);
4145 ifi = ofpbuf_put_zeros(&request, sizeof *ifi);
4146 ifi->ifi_family = PF_UNSPEC;
4147 ifi->ifi_index = ifindex;
4148 error = nl_sock_transact(rtnl_sock, &request, &reply);
/* The request buffer is released right after the transaction. */
4149 ofpbuf_uninit(&request);
/* Attributes are parsed starting after the Netlink header and the
 * ifinfomsg. */
4154 if (!nl_policy_parse(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg),
4155 rtnlgrp_link_policy,
4156 attrs, ARRAY_SIZE(rtnlgrp_link_policy))) {
4157 ofpbuf_delete(reply);
4161 if (!attrs[IFLA_STATS]) {
4162 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
4163 ofpbuf_delete(reply);
/* The policy's min_len guarantees that a present IFLA_STATS attribute is at
 * least as large as struct rtnl_link_stats. */
4167 netdev_stats_from_rtnl_link_stats(stats, nl_attr_get(attrs[IFLA_STATS]));
4169 ofpbuf_delete(reply);
/* Looks up the counters for the device named 'netdev_name' by scanning
 * "/proc/net/dev" line by line and stores them in '*stats'.
 *
 * Each line is parsed with a format made of 64-bit unsigned conversions
 * (X64) plus "%*u" items, which consume a column without storing it.  A line
 * that does not yield 15 converted items is reported as a parse error.  On
 * the line whose device name equals 'netdev_name', the members listed in
 * that branch are set to UINT64_MAX.
 *
 * NOTE(review): UINT64_MAX presumably marks counters that this file does not
 * provide.  The start of the scan call, most of its output arguments, the
 * stream cleanup and the return statements are not shown here -- confirm
 * against the complete file. */
4175 get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats)
4177 static const char fn[] = "/proc/net/dev";
4182 stream = fopen(fn, "r");
4184 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
4189 while (fgets(line, sizeof line, stream)) {
/* X64 is shorthand for one uint64_t conversion in the scan format below. */
4192 #define X64 "%"SCNu64
4195 X64 X64 X64 X64 X64 X64 X64 "%*u"
4196 X64 X64 X64 X64 X64 X64 X64 "%*u",
4202 &stats->rx_fifo_errors,
4203 &stats->rx_frame_errors,
4209 &stats->tx_fifo_errors,
4211 &stats->tx_carrier_errors) != 15) {
4212 VLOG_WARN_RL(&rl, "%s:%d: parse error", fn, ln);
4213 } else if (!strcmp(devname, netdev_name)) {
4214 stats->rx_length_errors = UINT64_MAX;
4215 stats->rx_over_errors = UINT64_MAX;
4216 stats->rx_crc_errors = UINT64_MAX;
4217 stats->rx_missed_errors = UINT64_MAX;
4218 stats->tx_aborted_errors = UINT64_MAX;
4219 stats->tx_heartbeat_errors = UINT64_MAX;
4220 stats->tx_window_errors = UINT64_MAX;
4226 VLOG_WARN_RL(&rl, "%s: no stats for %s", fn, netdev_name);
/* Reads the interface flags of 'dev' with the SIOCGIFFLAGS ioctl, issued
 * through netdev_linux_do_ioctl() using dev->name, and stores the resulting
 * ifr_flags value in '*flags'.
 *
 * NOTE(review): the end of the ioctl call, any check of 'error' and the
 * return statement are not shown here -- confirm against the complete file. */
4232 get_flags(const struct netdev_dev *dev, unsigned int *flags)
4238 error = netdev_linux_do_ioctl(dev->name, &ifr, SIOCGIFFLAGS,
4241 *flags = ifr.ifr_flags;
/* Sets the interface flags of 'netdev' to 'flags' with the SIOCSIFFLAGS
 * ioctl and returns whatever netdev_linux_do_ioctl() returns. */
4247 set_flags(struct netdev *netdev, unsigned int flags)
/* 'flags' replaces the whole ifr_flags value; nothing is merged with the
 * current flags in this function. */
4251 ifr.ifr_flags = flags;
4252 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, SIOCSIFFLAGS,
/* Asks the kernel for the interface index of the device named 'netdev_name'
 * with the SIOCGIFINDEX ioctl on 'af_inet_sock'.  On success the index from
 * the ioctl result is returned; on failure a rate-limited warning is logged.
 *
 * NOTE(review): the value returned on failure is not shown here; the
 * caller in get_ifindex() stores the result in an 'int' -- confirm. */
4257 do_get_ifindex(const char *netdev_name)
/* Copy the name into the fixed-size ifr_name field, bounded by its size. */
4261 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4262 COVERAGE_INC(netdev_get_ifindex);
4263 if (ioctl(af_inet_sock, SIOCGIFINDEX, &ifr) < 0) {
4264 VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
4265 netdev_name, strerror(errno));
4268 return ifr.ifr_ifindex;
/* Stores the interface index of 'netdev_' in '*ifindexp', using a per-device
 * cache: do_get_ifindex() is called only while the VALID_IFINDEX bit is
 * clear in netdev_dev->cache_valid, after which the bit is set and the index
 * is kept in netdev_dev->ifindex.
 *
 * NOTE(review): the handling of a failed do_get_ifindex() call and the return
 * statements are not shown here -- confirm against the complete file. */
4272 get_ifindex(const struct netdev *netdev_, int *ifindexp)
4274 struct netdev_dev_linux *netdev_dev =
4275 netdev_dev_linux_cast(netdev_get_dev(netdev_));
/* Cache miss: query the kernel and remember the answer. */
4277 if (!(netdev_dev->cache_valid & VALID_IFINDEX)) {
4278 int ifindex = do_get_ifindex(netdev_get_name(netdev_));
4282 netdev_dev->cache_valid |= VALID_IFINDEX;
4283 netdev_dev->ifindex = ifindex;
/* The caller always receives the cached value. */
4285 *ifindexp = netdev_dev->ifindex;
/* Reads the hardware address of the device named 'netdev_name' with the
 * SIOCGIFHWADDR ioctl on 'af_inet_sock' and copies ETH_ADDR_LEN bytes of it
 * into 'ea'.
 *
 * An ioctl failure is logged at INFO level for ENODEV and at ERR level
 * otherwise.  An address family other than AF_UNSPEC or ARPHRD_ETHER is
 * reported with a warning.
 *
 * NOTE(review): the return statements are not shown here, so whether the
 * unknown-family case still counts as success cannot be told from this
 * listing -- confirm against the complete file. */
4290 get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN])
4295 memset(&ifr, 0, sizeof ifr);
4296 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4297 COVERAGE_INC(netdev_get_hwaddr);
4298 if (ioctl(af_inet_sock, SIOCGIFHWADDR, &ifr) < 0) {
4299 /* ENODEV probably means that a vif disappeared asynchronously and
4300 * hasn't been removed from the database yet, so reduce the log level
4301 * to INFO for that case. */
4302 VLOG(errno == ENODEV ? VLL_INFO : VLL_ERR,
4303 "ioctl(SIOCGIFHWADDR) on %s device failed: %s",
4304 netdev_name, strerror(errno));
/* The kernel reports the address family in sa_family of the returned
 * hardware address. */
4307 hwaddr_family = ifr.ifr_hwaddr.sa_family;
4308 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER) {
4309 VLOG_WARN("%s device has unknown hardware address family %d",
4310 netdev_name, hwaddr_family);
4312 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
/* Sets the hardware address of the device named 'netdev_name' to the
 * ETH_ADDR_LEN bytes in 'mac' with the SIOCSIFHWADDR ioctl on
 * 'af_inet_sock'.  'hwaddr_family' is passed to the kernel as the address
 * family of the new address.  An ioctl failure is logged at ERR level.
 *
 * NOTE(review): the return statements are not shown here -- confirm against
 * the complete file. */
4317 set_etheraddr(const char *netdev_name, int hwaddr_family,
4318 const uint8_t mac[ETH_ADDR_LEN])
/* Zero the whole request so that bytes beyond the name and the address are
 * well defined. */
4322 memset(&ifr, 0, sizeof ifr);
4323 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4324 ifr.ifr_hwaddr.sa_family = hwaddr_family;
4325 memcpy(ifr.ifr_hwaddr.sa_data, mac, ETH_ADDR_LEN);
4326 COVERAGE_INC(netdev_set_hwaddr);
4327 if (ioctl(af_inet_sock, SIOCSIFHWADDR, &ifr) < 0) {
4328 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
4329 netdev_name, strerror(errno));
/* Issues an ethtool request for the device named 'name' with the SIOCETHTOOL
 * ioctl on 'af_inet_sock', passing 'ecmd' as the request data.  'cmd_name' is
 * the human-readable command name used in the warning message.
 *
 * A failure other than EOPNOTSUPP is logged with a rate-limited warning;
 * EOPNOTSUPP is deliberately not logged.
 *
 * NOTE(review): the use of 'cmd' and the return statements are not shown
 * here -- confirm against the complete file. */
4336 netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
4337 int cmd, const char *cmd_name)
4341 memset(&ifr, 0, sizeof ifr);
4342 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
/* The ioctl reaches the ethtool structure through the ifr_data pointer. */
4343 ifr.ifr_data = (caddr_t) ecmd;
4346 COVERAGE_INC(netdev_ethtool);
4347 if (ioctl(af_inet_sock, SIOCETHTOOL, &ifr) == 0) {
4350 if (errno != EOPNOTSUPP) {
4351 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
4352 "failed: %s", cmd_name, name, strerror(errno));
4354 /* The device doesn't support this operation. That's pretty
4355 * common, so there's no point in logging anything. */
/* Runs the ioctl 'cmd' on 'af_inet_sock' for the device named 'name'.  The
 * caller supplies 'ifr' with any command-specific members already set; this
 * function fills in only ifr_name.  'cmd_name' is used for logging: a failed
 * ioctl produces a rate-limited debug message.
 *
 * NOTE(review): the return statements are not shown here; set_flags() returns
 * this function's result directly -- confirm the error convention. */
4362 netdev_linux_do_ioctl(const char *name, struct ifreq *ifr, int cmd,
4363 const char *cmd_name)
4365 ovs_strzcpy(ifr->ifr_name, name, sizeof ifr->ifr_name);
4366 if (ioctl(af_inet_sock, cmd, ifr) == -1) {
4367 VLOG_DBG_RL(&rl, "%s: ioctl(%s) failed: %s", name, cmd_name,
/* Obtains an IPv4 address of 'netdev' with the ioctl 'cmd' (described by
 * 'cmd_name' for logging) and stores it in '*ip'.  The request is issued
 * through netdev_linux_do_ioctl() with the address family preset to AF_INET,
 * and the answer is read from the ifr_addr member as a sockaddr_in.
 *
 * NOTE(review): the check of 'error' before '*ip' is written and the return
 * statement are not shown here -- confirm against the complete file. */
4375 netdev_linux_get_ipv4(const struct netdev *netdev, struct in_addr *ip,
4376 int cmd, const char *cmd_name)
4381 ifr.ifr_addr.sa_family = AF_INET;
4382 error = netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, cmd, cmd_name);
/* Reinterpret the generic socket address as an IPv4 socket address. */
4384 const struct sockaddr_in *sin = (struct sockaddr_in *) &ifr.ifr_addr;
4385 *ip = sin->sin_addr;
4390 /* Returns an AF_PACKET raw socket or a negative errno value. */
/* The socket is created on first use and kept in the function-local static
 * 'sock' for later calls; INT_MIN marks "not yet attempted".  A newly
 * created socket is switched to non-blocking mode, and a creation failure
 * is logged at ERR level.
 *
 * NOTE(review): the test that separates the success and failure paths, the
 * value stored in 'sock' on failure and the return statement are not shown
 * here -- confirm against the complete file. */
4392 af_packet_sock(void)
4394 static int sock = INT_MIN;
4396 if (sock == INT_MIN) {
4397 sock = socket(AF_PACKET, SOCK_RAW, 0);
4399 set_nonblocking(sock);
4402 VLOG_ERR("failed to create packet socket: %s", strerror(errno));