2 * Copyright (c) 2009, 2010, 2011 Nicira Networks.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
19 #include "netdev-linux.h"
24 #include <arpa/inet.h>
26 #include <linux/gen_stats.h>
27 #include <linux/if_ether.h>
28 #include <linux/if_tun.h>
30 #include <linux/types.h>
31 #include <linux/ethtool.h>
32 #include <linux/mii.h>
33 #include <linux/pkt_cls.h>
34 #include <linux/pkt_sched.h>
35 #include <linux/rtnetlink.h>
36 #include <linux/sockios.h>
37 #include <linux/version.h>
38 #include <sys/types.h>
39 #include <sys/ioctl.h>
40 #include <sys/socket.h>
41 #include <netpacket/packet.h>
43 #include <net/if_arp.h>
44 #include <net/if_packet.h>
45 #include <net/route.h>
46 #include <netinet/in.h>
53 #include "dpif-linux.h"
54 #include "dynamic-string.h"
55 #include "fatal-signal.h"
58 #include "netdev-provider.h"
59 #include "netdev-vport.h"
61 #include "netlink-notifier.h"
62 #include "netlink-socket.h"
64 #include "openflow/openflow.h"
66 #include "poll-loop.h"
67 #include "rtnetlink-link.h"
68 #include "socket-util.h"
74 VLOG_DEFINE_THIS_MODULE(netdev_linux);
76 COVERAGE_DEFINE(netdev_set_policing);
77 COVERAGE_DEFINE(netdev_arp_lookup);
78 COVERAGE_DEFINE(netdev_get_ifindex);
79 COVERAGE_DEFINE(netdev_get_hwaddr);
80 COVERAGE_DEFINE(netdev_set_hwaddr);
81 COVERAGE_DEFINE(netdev_ethtool);
83 /* These were introduced in Linux 2.6.14, so they might be missing if we have
85 #ifndef ADVERTISED_Pause
86 #define ADVERTISED_Pause (1 << 13)
88 #ifndef ADVERTISED_Asym_Pause
89 #define ADVERTISED_Asym_Pause (1 << 14)
92 /* These were introduced in Linux 2.6.24, so they might be missing if we
93 * have old headers. */
94 #ifndef ETHTOOL_GFLAGS
95 #define ETHTOOL_GFLAGS 0x00000025 /* Get flags bitmap(ethtool_value) */
97 #ifndef ETHTOOL_SFLAGS
98 #define ETHTOOL_SFLAGS 0x00000026 /* Set flags bitmap(ethtool_value) */
101 /* This was introduced in Linux 2.6.25, so it might be missing if we have old
104 #define TC_RTAB_SIZE 1024
107 static struct nln_notifier *netdev_linux_cache_notifier = NULL;
108 static int cache_notifier_refcount;
111 VALID_IFINDEX = 1 << 0,
112 VALID_ETHERADDR = 1 << 1,
116 VALID_POLICING = 1 << 5,
117 VALID_HAVE_VPORT_STATS = 1 << 6
125 /* Traffic control. */
127 /* An instance of a traffic control class. Always associated with a particular
130 * Each TC implementation subclasses this with whatever additional data it
133 const struct tc_ops *ops;
134 struct hmap queues; /* Contains "struct tc_queue"s.
135 * Read by generic TC layer.
136 * Written only by TC implementation. */
139 /* One traffic control queue.
141 * Each TC implementation subclasses this with whatever additional data it
144 struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
145 unsigned int queue_id; /* OpenFlow queue ID. */
148 /* A particular kind of traffic control. Each implementation generally maps to
149 * one particular Linux qdisc class.
151 * The functions below return 0 if successful or a positive errno value on
152 * failure, except where otherwise noted. All of them must be provided, except
153 * where otherwise noted. */
155 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
156 * This is null for tc_ops_default and tc_ops_other, for which there are no
157 * appropriate values. */
158 const char *linux_name;
160 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
161 const char *ovs_name;
163 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
164 * queues. The queues are numbered 0 through n_queues - 1. */
165 unsigned int n_queues;
167 /* Called to install this TC class on 'netdev'. The implementation should
168 * make the Netlink calls required to set up 'netdev' with the right qdisc
169 * and configure it according to 'details'. The implementation may assume
170 * that the current qdisc is the default; that is, there is no need for it
171 * to delete the current qdisc before installing itself.
173 * The contents of 'details' should be documented as valid for 'ovs_name'
174 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
175 * (which is built as ovs-vswitchd.conf.db(8)).
177 * This function must return 0 if and only if it sets 'netdev->tc' to an
178 * initialized 'struct tc'.
180 * (This function is null for tc_ops_other, which cannot be installed. For
181 * other TC classes it should always be nonnull.) */
182 int (*tc_install)(struct netdev *netdev, const struct shash *details);
184 /* Called when the netdev code determines (through a Netlink query) that
185 * this TC class's qdisc is installed on 'netdev', but we didn't install
186 * it ourselves and so don't know any of the details.
188 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
189 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
190 * implementation should parse the other attributes of 'nlmsg' as
191 * necessary to determine its configuration. If necessary it should also
192 * use Netlink queries to determine the configuration of queues on
195 * This function must return 0 if and only if it sets 'netdev->tc' to an
196 * initialized 'struct tc'. */
197 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
199 /* Destroys the data structures allocated by the implementation as part of
200 * 'tc'. (This includes destroying 'tc->queues' by calling
203 * The implementation should not need to perform any Netlink calls. If
204 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
205 * (But it may not be desirable.)
207 * This function may be null if 'tc' is trivial. */
208 void (*tc_destroy)(struct tc *tc);
210 /* Retrieves details of 'netdev->tc' configuration into 'details'.
212 * The implementation should not need to perform any Netlink calls, because
213 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
214 * cached the configuration.
216 * The contents of 'details' should be documented as valid for 'ovs_name'
217 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
218 * (which is built as ovs-vswitchd.conf.db(8)).
220 * This function may be null if 'tc' is not configurable.
222 int (*qdisc_get)(const struct netdev *netdev, struct shash *details);
224 /* Reconfigures 'netdev->tc' according to 'details', performing any
225 * required Netlink calls to complete the reconfiguration.
227 * The contents of 'details' should be documented as valid for 'ovs_name'
228 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
229 * (which is built as ovs-vswitchd.conf.db(8)).
231 * This function may be null if 'tc' is not configurable.
233 int (*qdisc_set)(struct netdev *, const struct shash *details);
235 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
236 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
238 * The contents of 'details' should be documented as valid for 'ovs_name'
239 * in the "other_config" column in the "Queue" table in
240 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
242 * The implementation should not need to perform any Netlink calls, because
243 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
244 * cached the queue configuration.
246 * This function may be null if 'tc' does not have queues ('n_queues' is
248 int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
249 struct shash *details);
251 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
252 * 'details', performing any required Netlink calls to complete the
253 * reconfiguration. The caller ensures that 'queue_id' is less than
256 * The contents of 'details' should be documented as valid for 'ovs_name'
257 * in the "other_config" column in the "Queue" table in
258 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
260 * This function may be null if 'tc' does not have queues or its queues are
261 * not configurable. */
262 int (*class_set)(struct netdev *, unsigned int queue_id,
263 const struct shash *details);
265 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
266 * tc_queue's within 'netdev->tc->queues'.
268 * This function may be null if 'tc' does not have queues or its queues
269 * cannot be deleted. */
270 int (*class_delete)(struct netdev *, struct tc_queue *queue);
272 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
273 * 'struct tc_queue's within 'netdev->tc->queues'.
275 * On success, initializes '*stats'.
277 * This function may be null if 'tc' does not have queues or if it cannot
278 * report queue statistics. */
279 int (*class_get_stats)(const struct netdev *netdev,
280 const struct tc_queue *queue,
281 struct netdev_queue_stats *stats);
283 /* Extracts queue stats from 'nlmsg', which is a response to a
284 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
286 * This function may be null if 'tc' does not have queues or if it cannot
287 * report queue statistics. */
288 int (*class_dump_stats)(const struct netdev *netdev,
289 const struct ofpbuf *nlmsg,
290 netdev_dump_queue_stats_cb *cb, void *aux);
294 tc_init(struct tc *tc, const struct tc_ops *ops)
297 hmap_init(&tc->queues);
301 tc_destroy(struct tc *tc)
303 hmap_destroy(&tc->queues);
306 static const struct tc_ops tc_ops_htb;
307 static const struct tc_ops tc_ops_hfsc;
308 static const struct tc_ops tc_ops_default;
309 static const struct tc_ops tc_ops_other;
311 static const struct tc_ops *tcs[] = {
312 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
313 &tc_ops_hfsc, /* Hierarchical fair service curve. */
314 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
315 &tc_ops_other, /* Some other qdisc. */
319 static unsigned int tc_make_handle(unsigned int major, unsigned int minor);
320 static unsigned int tc_get_major(unsigned int handle);
321 static unsigned int tc_get_minor(unsigned int handle);
323 static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
324 static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
325 static unsigned int tc_buffer_per_jiffy(unsigned int rate);
327 static struct tcmsg *tc_make_request(const struct netdev *, int type,
328 unsigned int flags, struct ofpbuf *);
329 static int tc_transact(struct ofpbuf *request, struct ofpbuf **replyp);
330 static int tc_add_del_ingress_qdisc(struct netdev *netdev, bool add);
331 static int tc_add_policer(struct netdev *netdev, int kbits_rate,
334 static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
335 struct nlattr **options);
336 static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
337 struct nlattr **options,
338 struct netdev_queue_stats *);
339 static int tc_query_class(const struct netdev *,
340 unsigned int handle, unsigned int parent,
341 struct ofpbuf **replyp);
342 static int tc_delete_class(const struct netdev *, unsigned int handle);
344 static int tc_del_qdisc(struct netdev *netdev);
345 static int tc_query_qdisc(const struct netdev *netdev);
347 static int tc_calc_cell_log(unsigned int mtu);
348 static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
349 static void tc_put_rtab(struct ofpbuf *, uint16_t type,
350 const struct tc_ratespec *rate);
351 static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
353 struct netdev_dev_linux {
354 struct netdev_dev netdev_dev;
356 struct shash_node *shash_node;
357 unsigned int cache_valid;
358 unsigned int change_seq;
360 bool miimon; /* Link status of last poll. */
361 long long int miimon_interval; /* Miimon Poll rate. Disabled if <= 0. */
362 struct timer miimon_timer;
364 /* The following are figured out "on demand" only. They are only valid
365 * when the corresponding VALID_* bit in 'cache_valid' is set. */
367 uint8_t etheraddr[ETH_ADDR_LEN];
368 struct in_addr address, netmask;
372 long long int carrier_resets;
373 uint32_t kbits_rate; /* Policing data. */
374 uint32_t kbits_burst;
375 bool have_vport_stats;
379 struct tap_state tap;
383 struct netdev_linux {
384 struct netdev netdev;
388 /* Sockets used for ioctl operations. */
389 static int af_inet_sock = -1; /* AF_INET, SOCK_DGRAM. */
391 /* A Netlink routing socket that is not subscribed to any multicast groups. */
392 static struct nl_sock *rtnl_sock;
394 /* This is set pretty low because we probably won't learn anything from the
395 * additional log messages. */
396 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
398 static int netdev_linux_init(void);
400 static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
401 int cmd, const char *cmd_name);
402 static int netdev_linux_do_ioctl(const char *name, struct ifreq *, int cmd,
403 const char *cmd_name);
404 static int netdev_linux_get_ipv4(const struct netdev *, struct in_addr *,
405 int cmd, const char *cmd_name);
406 static int get_flags(const struct netdev *, int *flagsp);
407 static int set_flags(struct netdev *, int flags);
408 static int do_get_ifindex(const char *netdev_name);
409 static int get_ifindex(const struct netdev *, int *ifindexp);
410 static int do_set_addr(struct netdev *netdev,
411 int ioctl_nr, const char *ioctl_name,
412 struct in_addr addr);
413 static int get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN]);
414 static int set_etheraddr(const char *netdev_name, int hwaddr_family,
415 const uint8_t[ETH_ADDR_LEN]);
416 static int get_stats_via_netlink(int ifindex, struct netdev_stats *stats);
417 static int get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats);
418 static int get_carrier_via_sysfs(const char *name, bool *carrier);
419 static int af_packet_sock(void);
420 static void netdev_linux_miimon_run(void);
421 static void netdev_linux_miimon_wait(void);
424 is_netdev_linux_class(const struct netdev_class *netdev_class)
426 return netdev_class->init == netdev_linux_init;
429 static struct netdev_dev_linux *
430 netdev_dev_linux_cast(const struct netdev_dev *netdev_dev)
432 const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
433 assert(is_netdev_linux_class(netdev_class));
435 return CONTAINER_OF(netdev_dev, struct netdev_dev_linux, netdev_dev);
438 static struct netdev_linux *
439 netdev_linux_cast(const struct netdev *netdev)
441 struct netdev_dev *netdev_dev = netdev_get_dev(netdev);
442 const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
443 assert(is_netdev_linux_class(netdev_class));
445 return CONTAINER_OF(netdev, struct netdev_linux, netdev);
449 netdev_linux_init(void)
451 static int status = -1;
453 /* Create AF_INET socket. */
454 af_inet_sock = socket(AF_INET, SOCK_DGRAM, 0);
455 status = af_inet_sock >= 0 ? 0 : errno;
457 VLOG_ERR("failed to create inet socket: %s", strerror(status));
460 /* Create rtnetlink socket. */
462 status = nl_sock_create(NETLINK_ROUTE, &rtnl_sock);
464 VLOG_ERR_RL(&rl, "failed to create rtnetlink socket: %s",
473 netdev_linux_run(void)
475 rtnetlink_link_run();
476 netdev_linux_miimon_run();
480 netdev_linux_wait(void)
482 rtnetlink_link_wait();
483 netdev_linux_miimon_wait();
487 netdev_dev_linux_changed(struct netdev_dev_linux *dev)
490 if (!dev->change_seq) {
493 dev->cache_valid = 0;
497 netdev_linux_cache_cb(const struct rtnetlink_link_change *change,
498 void *aux OVS_UNUSED)
500 struct netdev_dev_linux *dev;
502 struct netdev_dev *base_dev = netdev_dev_from_name(change->ifname);
504 const struct netdev_class *netdev_class =
505 netdev_dev_get_class(base_dev);
507 if (is_netdev_linux_class(netdev_class)) {
508 dev = netdev_dev_linux_cast(base_dev);
510 if (dev->carrier != change->running) {
511 dev->carrier = change->running;
512 dev->carrier_resets++;
515 netdev_dev_linux_changed(dev);
519 struct shash device_shash;
520 struct shash_node *node;
522 shash_init(&device_shash);
523 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
524 SHASH_FOR_EACH (node, &device_shash) {
529 get_carrier_via_sysfs(node->name, &carrier);
530 if (dev->carrier != carrier) {
531 dev->carrier = carrier;
532 dev->carrier_resets++;
535 netdev_dev_linux_changed(dev);
537 shash_destroy(&device_shash);
542 cache_notifier_ref(void)
544 if (!cache_notifier_refcount) {
545 assert(!netdev_linux_cache_notifier);
547 netdev_linux_cache_notifier =
548 rtnetlink_link_notifier_create(netdev_linux_cache_cb, NULL);
550 if (!netdev_linux_cache_notifier) {
554 cache_notifier_refcount++;
560 cache_notifier_unref(void)
562 assert(cache_notifier_refcount > 0);
563 if (!--cache_notifier_refcount) {
564 assert(netdev_linux_cache_notifier);
565 rtnetlink_link_notifier_destroy(netdev_linux_cache_notifier);
566 netdev_linux_cache_notifier = NULL;
570 /* Creates system and internal devices. */
572 netdev_linux_create(const struct netdev_class *class, const char *name,
573 struct netdev_dev **netdev_devp)
575 struct netdev_dev_linux *netdev_dev;
578 error = cache_notifier_ref();
583 netdev_dev = xzalloc(sizeof *netdev_dev);
584 netdev_dev->change_seq = 1;
585 netdev_dev_init(&netdev_dev->netdev_dev, name, class);
586 get_carrier_via_sysfs(name, &netdev_dev->carrier);
588 *netdev_devp = &netdev_dev->netdev_dev;
592 /* For most types of netdevs we open the device for each call of
593 * netdev_open(). However, this is not the case with tap devices,
594 * since it is only possible to open the device once. In this
595 * situation we share a single file descriptor, and consequently
596 * buffers, across all readers. Therefore once data is read it will
597 * be unavailable to other reads for tap devices. */
599 netdev_linux_create_tap(const struct netdev_class *class OVS_UNUSED,
600 const char *name, struct netdev_dev **netdev_devp)
602 struct netdev_dev_linux *netdev_dev;
603 struct tap_state *state;
604 static const char tap_dev[] = "/dev/net/tun";
608 netdev_dev = xzalloc(sizeof *netdev_dev);
609 state = &netdev_dev->state.tap;
611 error = cache_notifier_ref();
616 /* Open tap device. */
617 state->fd = open(tap_dev, O_RDWR);
620 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, strerror(error));
621 goto error_unref_notifier;
624 /* Create tap device. */
625 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
626 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
627 if (ioctl(state->fd, TUNSETIFF, &ifr) == -1) {
628 VLOG_WARN("%s: creating tap device failed: %s", name,
631 goto error_unref_notifier;
634 /* Make non-blocking. */
635 error = set_nonblocking(state->fd);
637 goto error_unref_notifier;
640 netdev_dev_init(&netdev_dev->netdev_dev, name, &netdev_tap_class);
641 *netdev_devp = &netdev_dev->netdev_dev;
644 error_unref_notifier:
645 cache_notifier_unref();
652 destroy_tap(struct netdev_dev_linux *netdev_dev)
654 struct tap_state *state = &netdev_dev->state.tap;
656 if (state->fd >= 0) {
661 /* Destroys the netdev device 'netdev_dev_'. */
663 netdev_linux_destroy(struct netdev_dev *netdev_dev_)
665 struct netdev_dev_linux *netdev_dev = netdev_dev_linux_cast(netdev_dev_);
666 const struct netdev_class *class = netdev_dev_get_class(netdev_dev_);
668 if (netdev_dev->tc && netdev_dev->tc->ops->tc_destroy) {
669 netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
672 if (class == &netdev_tap_class) {
673 destroy_tap(netdev_dev);
677 cache_notifier_unref();
681 netdev_linux_open(struct netdev_dev *netdev_dev_, struct netdev **netdevp)
683 struct netdev_dev_linux *netdev_dev = netdev_dev_linux_cast(netdev_dev_);
684 struct netdev_linux *netdev;
685 enum netdev_flags flags;
688 /* Allocate network device. */
689 netdev = xzalloc(sizeof *netdev);
691 netdev_init(&netdev->netdev, netdev_dev_);
693 /* Verify that the device really exists, by attempting to read its flags.
694 * (The flags might be cached, in which case this won't actually do an
697 * Don't do this for "internal" netdevs, though, because those have to be
698 * created as netdev objects before they exist in the kernel, because
699 * creating them in the kernel happens by passing a netdev object to
700 * dpif_port_add(). */
701 if (netdev_dev_get_class(netdev_dev_) != &netdev_internal_class) {
702 error = netdev_get_flags(&netdev->netdev, &flags);
703 if (error == ENODEV) {
708 if (!strcmp(netdev_dev_get_type(netdev_dev_), "tap") &&
709 !netdev_dev->state.tap.opened) {
711 /* We assume that the first user of the tap device is the primary user
712 * and give them the tap FD. Subsequent users probably just expect
713 * this to be a system device so open it normally to avoid send/receive
714 * directions appearing to be reversed. */
715 netdev->fd = netdev_dev->state.tap.fd;
716 netdev_dev->state.tap.opened = true;
719 *netdevp = &netdev->netdev;
723 netdev_uninit(&netdev->netdev, true);
727 /* Closes and destroys 'netdev'. */
729 netdev_linux_close(struct netdev *netdev_)
731 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
733 if (netdev->fd >= 0 && strcmp(netdev_get_type(netdev_), "tap")) {
740 netdev_linux_listen(struct netdev *netdev_)
742 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
743 struct sockaddr_ll sll;
748 if (netdev->fd >= 0) {
752 /* Create file descriptor. */
753 fd = socket(PF_PACKET, SOCK_RAW, 0);
756 VLOG_ERR("failed to create raw socket (%s)", strerror(error));
760 /* Set non-blocking mode. */
761 error = set_nonblocking(fd);
766 /* Get ethernet device index. */
767 error = get_ifindex(&netdev->netdev, &ifindex);
772 /* Bind to specific ethernet device. */
773 memset(&sll, 0, sizeof sll);
774 sll.sll_family = AF_PACKET;
775 sll.sll_ifindex = ifindex;
776 sll.sll_protocol = (OVS_FORCE unsigned short int) htons(ETH_P_ALL);
777 if (bind(fd, (struct sockaddr *) &sll, sizeof sll) < 0) {
779 VLOG_ERR("%s: failed to bind raw socket (%s)",
780 netdev_get_name(netdev_), strerror(error));
795 netdev_linux_recv(struct netdev *netdev_, void *data, size_t size)
797 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
799 if (netdev->fd < 0) {
800 /* Device is not listening. */
805 ssize_t retval = recv(netdev->fd, data, size, MSG_TRUNC);
807 /* Received packet was longer than supplied buffer. */
809 } else if (retval >= 0) {
811 } else if (errno != EINTR) {
812 if (errno != EAGAIN) {
813 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
814 netdev_get_name(netdev_), strerror(errno));
821 /* Registers with the poll loop to wake up from the next call to poll_block()
822 * when a packet is ready to be received with netdev_recv() on 'netdev'. */
824 netdev_linux_recv_wait(struct netdev *netdev_)
826 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
827 if (netdev->fd >= 0) {
828 poll_fd_wait(netdev->fd, POLLIN);
832 /* Discards all packets waiting to be received from 'netdev'. */
834 netdev_linux_drain(struct netdev *netdev_)
836 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
837 if (netdev->fd < 0) {
839 } else if (!strcmp(netdev_get_type(netdev_), "tap")) {
841 int error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
842 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
846 drain_fd(netdev->fd, ifr.ifr_qlen);
849 return drain_rcvbuf(netdev->fd);
853 /* Sends 'buffer' on 'netdev'. Returns 0 if successful, otherwise a positive
854 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
855 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
856 * the packet is too big or too small to transmit on the device.
858 * The caller retains ownership of 'buffer' in all cases.
860 * The kernel maintains a packet transmission queue, so the caller is not
861 * expected to do additional queuing of packets. */
863 netdev_linux_send(struct netdev *netdev_, const void *data, size_t size)
865 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
869 if (netdev->fd < 0) {
870 /* Use our AF_PACKET socket to send to this device. */
871 struct sockaddr_ll sll;
878 sock = af_packet_sock();
883 error = get_ifindex(netdev_, &ifindex);
888 /* We don't bother setting most fields in sockaddr_ll because the
889 * kernel ignores them for SOCK_RAW. */
890 memset(&sll, 0, sizeof sll);
891 sll.sll_family = AF_PACKET;
892 sll.sll_ifindex = ifindex;
894 iov.iov_base = (void *) data;
898 msg.msg_namelen = sizeof sll;
901 msg.msg_control = NULL;
902 msg.msg_controllen = 0;
905 retval = sendmsg(sock, &msg, 0);
907 /* Use the netdev's own fd to send to this device. This is
908 * essential for tap devices, because packets sent to a tap device
909 * with an AF_PACKET socket will loop back to be *received* again
910 * on the tap device. */
911 retval = write(netdev->fd, data, size);
915 /* The Linux AF_PACKET implementation never blocks waiting for room
916 * for packets, instead returning ENOBUFS. Translate this into
917 * EAGAIN for the caller. */
918 if (errno == ENOBUFS) {
920 } else if (errno == EINTR) {
922 } else if (errno != EAGAIN) {
923 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
924 netdev_get_name(netdev_), strerror(errno));
927 } else if (retval != size) {
928 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%zd bytes of "
929 "%zu) on %s", retval, size, netdev_get_name(netdev_));
937 /* Registers with the poll loop to wake up from the next call to poll_block()
938 * when the packet transmission queue has sufficient room to transmit a packet
939 * with netdev_send().
941 * The kernel maintains a packet transmission queue, so the client is not
942 * expected to do additional queuing of packets. Thus, this function is
943 * unlikely to ever be used. It is included for completeness. */
945 netdev_linux_send_wait(struct netdev *netdev_)
947 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
948 if (netdev->fd < 0) {
950 } else if (strcmp(netdev_get_type(netdev_), "tap")) {
951 poll_fd_wait(netdev->fd, POLLOUT);
953 /* TAP device always accepts packets.*/
954 poll_immediate_wake();
958 /* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
959 * otherwise a positive errno value. */
961 netdev_linux_set_etheraddr(struct netdev *netdev_,
962 const uint8_t mac[ETH_ADDR_LEN])
964 struct netdev_dev_linux *netdev_dev =
965 netdev_dev_linux_cast(netdev_get_dev(netdev_));
968 if (!(netdev_dev->cache_valid & VALID_ETHERADDR)
969 || !eth_addr_equals(netdev_dev->etheraddr, mac)) {
970 error = set_etheraddr(netdev_get_name(netdev_), ARPHRD_ETHER, mac);
972 netdev_dev->cache_valid |= VALID_ETHERADDR;
973 memcpy(netdev_dev->etheraddr, mac, ETH_ADDR_LEN);
981 /* Copies 'netdev''s MAC address into 'mac', using the cached address when
982 * it is valid and querying the device otherwise. */
984 netdev_linux_get_etheraddr(const struct netdev *netdev_,
985 uint8_t mac[ETH_ADDR_LEN])
987 struct netdev_dev_linux *netdev_dev =
988 netdev_dev_linux_cast(netdev_get_dev(netdev_));
989 if (!(netdev_dev->cache_valid & VALID_ETHERADDR)) {
990 int error = get_etheraddr(netdev_get_name(netdev_),
991 netdev_dev->etheraddr);
995 netdev_dev->cache_valid |= VALID_ETHERADDR;
997 memcpy(mac, netdev_dev->etheraddr, ETH_ADDR_LEN);
1001 /* Returns the maximum size of transmitted (and received) packets on 'netdev',
1002 * in bytes, not including the hardware header; thus, this is typically 1500
1003 * bytes for Ethernet devices. */
1005 netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
1007 struct netdev_dev_linux *netdev_dev =
1008 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1009 if (!(netdev_dev->cache_valid & VALID_MTU)) {
1013 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
1014 SIOCGIFMTU, "SIOCGIFMTU");
1018 netdev_dev->mtu = ifr.ifr_mtu;
1019 netdev_dev->cache_valid |= VALID_MTU;
1021 *mtup = netdev_dev->mtu;
1025 /* Sets the maximum transmitted packet size (MTU) of the given device using
1026 * the Linux networking ioctl interface.
1029 netdev_linux_set_mtu(const struct netdev *netdev_, int mtu)
1031 struct netdev_dev_linux *netdev_dev =
1032 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1037 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
1038 SIOCSIFMTU, "SIOCSIFMTU");
1043 netdev_dev->mtu = ifr.ifr_mtu;
1044 netdev_dev->cache_valid |= VALID_MTU;
1048 /* Returns the ifindex of 'netdev', if successful, as a positive number.
1049 * On failure, returns a negative errno value. */
1051 netdev_linux_get_ifindex(const struct netdev *netdev)
1055 error = get_ifindex(netdev, &ifindex);
1056 return error ? -error : ifindex;
1060 netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
1062 struct netdev_dev_linux *netdev_dev =
1063 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1065 if (netdev_dev->miimon_interval > 0) {
1066 *carrier = netdev_dev->miimon;
1068 *carrier = netdev_dev->carrier;
1074 static long long int
1075 netdev_linux_get_carrier_resets(const struct netdev *netdev)
1077 return netdev_dev_linux_cast(netdev_get_dev(netdev))->carrier_resets;
1081 netdev_linux_do_miimon(const char *name, int cmd, const char *cmd_name,
1082 struct mii_ioctl_data *data)
1087 memset(&ifr, 0, sizeof ifr);
1088 memcpy(&ifr.ifr_data, data, sizeof *data);
1089 error = netdev_linux_do_ioctl(name, &ifr, cmd, cmd_name);
1090 memcpy(data, &ifr.ifr_data, sizeof *data);
1096 netdev_linux_get_miimon(const char *name, bool *miimon)
1098 struct mii_ioctl_data data;
1103 memset(&data, 0, sizeof data);
1104 error = netdev_linux_do_miimon(name, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
1106 /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
1107 data.reg_num = MII_BMSR;
1108 error = netdev_linux_do_miimon(name, SIOCGMIIREG, "SIOCGMIIREG",
1112 *miimon = !!(data.val_out & BMSR_LSTATUS);
1114 VLOG_WARN_RL(&rl, "%s: failed to query MII", name);
1117 struct ethtool_cmd ecmd;
1119 VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
1122 memset(&ecmd, 0, sizeof ecmd);
1123 error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
1126 struct ethtool_value eval;
1128 memcpy(&eval, &ecmd, sizeof eval);
1129 *miimon = !!eval.data;
1131 VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
1139 netdev_linux_set_miimon_interval(struct netdev *netdev_,
1140 long long int interval)
1142 struct netdev_dev_linux *netdev_dev;
1144 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev_));
1146 interval = interval > 0 ? MAX(interval, 100) : 0;
1147 if (netdev_dev->miimon_interval != interval) {
1148 netdev_dev->miimon_interval = interval;
1149 timer_set_expired(&netdev_dev->miimon_timer);
1156 netdev_linux_miimon_run(void)
1158 struct shash device_shash;
1159 struct shash_node *node;
1161 shash_init(&device_shash);
1162 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
1163 SHASH_FOR_EACH (node, &device_shash) {
1164 struct netdev_dev_linux *dev = node->data;
1167 if (dev->miimon_interval <= 0 || !timer_expired(&dev->miimon_timer)) {
1171 netdev_linux_get_miimon(dev->netdev_dev.name, &miimon);
1172 if (miimon != dev->miimon) {
1173 dev->miimon = miimon;
1174 netdev_dev_linux_changed(dev);
1177 timer_set_duration(&dev->miimon_timer, dev->miimon_interval);
1180 shash_destroy(&device_shash);
1184 netdev_linux_miimon_wait(void)
1186 struct shash device_shash;
1187 struct shash_node *node;
1189 shash_init(&device_shash);
1190 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
1191 SHASH_FOR_EACH (node, &device_shash) {
1192 struct netdev_dev_linux *dev = node->data;
1194 if (dev->miimon_interval > 0) {
1195 timer_wait(&dev->miimon_timer);
1198 shash_destroy(&device_shash);
1201 /* Check whether we can use RTM_GETLINK to get network device statistics.
1202 * In pre-2.6.19 kernels, this was only available if wireless extensions were
/* Probes, using the loopback device "lo", whether statistics can be fetched
 * with get_stats_via_netlink().  The result is consumed by
 * netdev_linux_sys_get_stats() as its 'use_netlink_stats' flag.
 * NOTE(review): the return statements are not visible in this copy; the log
 * messages indicate which path selects netlink and which selects proc. */
1205 check_for_working_netlink_stats(void)
1207 /* Decide on the netdev_get_stats() implementation to use. Netlink is
1208 * preferable, so if that works, we'll use it. */
1209 int ifindex = do_get_ifindex("lo");
1211 VLOG_WARN("failed to get ifindex for lo, "
1212 "obtaining netdev stats from proc");
/* The ifindex lookup worked, so try an actual netlink stats request. */
1215 struct netdev_stats stats;
1216 int error = get_stats_via_netlink(ifindex, &stats);
1218 VLOG_DBG("obtaining netdev stats via rtnetlink");
1221 VLOG_INFO("RTM_GETLINK failed (%s), obtaining netdev stats "
1222 "via proc (you are probably running a pre-2.6.19 "
1223 "kernel)", strerror(error));
/* Helper taking two uint64_t pointers; netdev_pseudo_get_stats() calls it on
 * paired rx/tx counters.  NOTE(review): the body is not visible in this copy,
 * so "exchanges *a and *b" is inferred from the name and the call sites. */
1230 swap_uint64(uint64_t *a, uint64_t *b)
/* Tries to fill 'stats' for 'netdev_' from the vport layer via
 * netdev_vport_get_stats().  The attempt is made when a previous attempt
 * succeeded, or when no result has been cached yet; the outcome is cached in
 * 'have_vport_stats' together with the VALID_HAVE_VPORT_STATS bit, so a device
 * for which the query failed is not queried again while the cache is valid. */
1238 get_stats_via_vport(const struct netdev *netdev_,
1239 struct netdev_stats *stats)
1241 struct netdev_dev_linux *netdev_dev =
1242 netdev_dev_linux_cast(netdev_get_dev(netdev_));
/* "Known to work" or "not tried yet". */
1244 if (netdev_dev->have_vport_stats ||
1245 !(netdev_dev->cache_valid & VALID_HAVE_VPORT_STATS)) {
1248 error = netdev_vport_get_stats(netdev_, stats);
1250 VLOG_WARN_RL(&rl, "%s: obtaining netdev stats via vport failed "
1251 "(%s)", netdev_get_name(netdev_), strerror(error));
/* Remember whether the vport query worked. */
1253 netdev_dev->have_vport_stats = !error;
1254 netdev_dev->cache_valid |= VALID_HAVE_VPORT_STATS;
/* Fetches kernel statistics for 'netdev_' into 'stats', using either netlink
 * (get_ifindex() followed by get_stats_via_netlink()) or the proc-based
 * get_stats_via_proc().  Which of the two is used is decided once, on the
 * first call, by check_for_working_netlink_stats() and remembered in a
 * function-local static. */
1259 netdev_linux_sys_get_stats(const struct netdev *netdev_,
1260 struct netdev_stats *stats)
/* -1 means "not probed yet". */
1262 static int use_netlink_stats = -1;
1265 if (use_netlink_stats < 0) {
1266 use_netlink_stats = check_for_working_netlink_stats();
1269 if (use_netlink_stats) {
1272 error = get_ifindex(netdev_, &ifindex);
1274 error = get_stats_via_netlink(ifindex, stats);
1277 error = get_stats_via_proc(netdev_get_name(netdev_), stats);
/* Note that the numeric error code, not strerror() text, is logged here. */
1281 VLOG_WARN_RL(&rl, "%s: linux-sys get stats failed %d",
1282 netdev_get_name(netdev_), error);
1288 /* Retrieves current device stats for 'netdev-linux'.
 *
 * Two sources are queried: the vport layer (written directly into 'stats')
 * and the kernel (written into the local 'dev_stats').  The visible code then
 * adds the kernel's error, drop, multicast and collision counters into
 * 'stats'.  NOTE(review): the bodies of the two 'have_vport_stats' branches
 * are only partly visible in this copy. */
1290 netdev_linux_get_stats(const struct netdev *netdev_,
1291 struct netdev_stats *stats)
1293 struct netdev_dev_linux *netdev_dev =
1294 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1295 struct netdev_stats dev_stats;
/* The vport query also refreshes netdev_dev->have_vport_stats, which is
 * tested below. */
1298 get_stats_via_vport(netdev_, stats);
1300 error = netdev_linux_sys_get_stats(netdev_, &dev_stats);
1303 if (!netdev_dev->have_vport_stats) {
1310 if (!netdev_dev->have_vport_stats) {
1311 /* If stats are not available from OVS, use the ioctl stats instead. */
/* Counters taken from the kernel's view of the device and added to 'stats'. */
1314 stats->rx_errors += dev_stats.rx_errors;
1315 stats->tx_errors += dev_stats.tx_errors;
1316 stats->rx_dropped += dev_stats.rx_dropped;
1317 stats->tx_dropped += dev_stats.tx_dropped;
1318 stats->multicast += dev_stats.multicast;
1319 stats->collisions += dev_stats.collisions;
1320 stats->rx_length_errors += dev_stats.rx_length_errors;
1321 stats->rx_over_errors += dev_stats.rx_over_errors;
1322 stats->rx_crc_errors += dev_stats.rx_crc_errors;
1323 stats->rx_frame_errors += dev_stats.rx_frame_errors;
1324 stats->rx_fifo_errors += dev_stats.rx_fifo_errors;
1325 stats->rx_missed_errors += dev_stats.rx_missed_errors;
1326 stats->tx_aborted_errors += dev_stats.tx_aborted_errors;
1327 stats->tx_carrier_errors += dev_stats.tx_carrier_errors;
1328 stats->tx_fifo_errors += dev_stats.tx_fifo_errors;
1329 stats->tx_heartbeat_errors += dev_stats.tx_heartbeat_errors;
1330 stats->tx_window_errors += dev_stats.tx_window_errors;
1335 /* Retrieves current device stats for 'netdev-tap' netdev or
1336 * netdev-internal.
 *
 * Queries the vport layer (into 'stats') and the kernel (into the local
 * 'dev_stats').  When vport stats are not available, the rx/tx pairs in
 * 'stats' are exchanged and the detailed error counters are cleared.  The
 * final statements add the kernel's drop and error counters with rx and tx
 * crossed, plus multicast and collisions.  NOTE(review): some branch
 * structure (e.g. the 'else' between the two groups) is not visible in this
 * copy. */
1338 netdev_pseudo_get_stats(const struct netdev *netdev_,
1339 struct netdev_stats *stats)
1341 struct netdev_dev_linux *netdev_dev =
1342 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1343 struct netdev_stats dev_stats;
/* The vport query also refreshes netdev_dev->have_vport_stats, which is
 * tested below. */
1346 get_stats_via_vport(netdev_, stats);
1348 error = netdev_linux_sys_get_stats(netdev_, &dev_stats);
1350 if (!netdev_dev->have_vport_stats) {
1357 /* If this port is an internal port then the transmit and receive stats
1358 * will appear to be swapped relative to the other ports since we are the
1359 * one sending the data, not a remote computer. For consistency, we swap
1360 * them back here. This does not apply if we are getting stats from the
1361 * vport layer because it always tracks stats from the same perspective. */
1363 if (!netdev_dev->have_vport_stats) {
1365 swap_uint64(&stats->rx_packets, &stats->tx_packets);
1366 swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
1367 swap_uint64(&stats->rx_errors, &stats->tx_errors);
1368 swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
/* The detailed error counters are cleared on this path. */
1369 stats->rx_length_errors = 0;
1370 stats->rx_over_errors = 0;
1371 stats->rx_crc_errors = 0;
1372 stats->rx_frame_errors = 0;
1373 stats->rx_fifo_errors = 0;
1374 stats->rx_missed_errors = 0;
1375 stats->tx_aborted_errors = 0;
1376 stats->tx_carrier_errors = 0;
1377 stats->tx_fifo_errors = 0;
1378 stats->tx_heartbeat_errors = 0;
1379 stats->tx_window_errors = 0;
/* Kernel drop/error counters are merged with rx and tx crossed over. */
1381 stats->rx_dropped += dev_stats.tx_dropped;
1382 stats->tx_dropped += dev_stats.rx_dropped;
1384 stats->rx_errors += dev_stats.tx_errors;
1385 stats->tx_errors += dev_stats.rx_errors;
1387 stats->multicast += dev_stats.multicast;
1388 stats->collisions += dev_stats.collisions;
1393 /* Stores the features supported by 'netdev' into each of '*current',
1394 * '*advertised', '*supported', and '*peer' that are non-null. Each value is a
1395 * bitmap of "enum ofp_port_features" bits, in host byte order. Returns 0 if
1396 * successful, otherwise a positive errno value.
 *
 * The data comes from a single ETHTOOL_GSET query; ethtool SUPPORTED_* and
 * ADVERTISED_* bits, and the speed/duplex/port fields, are translated one by
 * one into OFPPF_* bits.
 *
 * NOTE(review): although the text above says "that are non-null", every
 * visible statement dereferences its output pointer unconditionally --
 * confirm that callers of this provider function never pass NULL. */
1398 netdev_linux_get_features(const struct netdev *netdev,
1399 uint32_t *current, uint32_t *advertised,
1400 uint32_t *supported, uint32_t *peer)
1402 struct ethtool_cmd ecmd;
1405 memset(&ecmd, 0, sizeof ecmd);
1406 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1407 ETHTOOL_GSET, "ETHTOOL_GSET");
1412 /* Supported features. */
1414 if (ecmd.supported & SUPPORTED_10baseT_Half) {
1415 *supported |= OFPPF_10MB_HD;
1417 if (ecmd.supported & SUPPORTED_10baseT_Full) {
1418 *supported |= OFPPF_10MB_FD;
1420 if (ecmd.supported & SUPPORTED_100baseT_Half) {
1421 *supported |= OFPPF_100MB_HD;
1423 if (ecmd.supported & SUPPORTED_100baseT_Full) {
1424 *supported |= OFPPF_100MB_FD;
1426 if (ecmd.supported & SUPPORTED_1000baseT_Half) {
1427 *supported |= OFPPF_1GB_HD;
1429 if (ecmd.supported & SUPPORTED_1000baseT_Full) {
1430 *supported |= OFPPF_1GB_FD;
1432 if (ecmd.supported & SUPPORTED_10000baseT_Full) {
1433 *supported |= OFPPF_10GB_FD;
1435 if (ecmd.supported & SUPPORTED_TP) {
1436 *supported |= OFPPF_COPPER;
1438 if (ecmd.supported & SUPPORTED_FIBRE) {
1439 *supported |= OFPPF_FIBER;
1441 if (ecmd.supported & SUPPORTED_Autoneg) {
1442 *supported |= OFPPF_AUTONEG;
1444 if (ecmd.supported & SUPPORTED_Pause) {
1445 *supported |= OFPPF_PAUSE;
1447 if (ecmd.supported & SUPPORTED_Asym_Pause) {
1448 *supported |= OFPPF_PAUSE_ASYM;
1451 /* Advertised features. */
1453 if (ecmd.advertising & ADVERTISED_10baseT_Half) {
1454 *advertised |= OFPPF_10MB_HD;
1456 if (ecmd.advertising & ADVERTISED_10baseT_Full) {
1457 *advertised |= OFPPF_10MB_FD;
1459 if (ecmd.advertising & ADVERTISED_100baseT_Half) {
1460 *advertised |= OFPPF_100MB_HD;
1462 if (ecmd.advertising & ADVERTISED_100baseT_Full) {
1463 *advertised |= OFPPF_100MB_FD;
1465 if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
1466 *advertised |= OFPPF_1GB_HD;
1468 if (ecmd.advertising & ADVERTISED_1000baseT_Full) {
1469 *advertised |= OFPPF_1GB_FD;
1471 if (ecmd.advertising & ADVERTISED_10000baseT_Full) {
1472 *advertised |= OFPPF_10GB_FD;
1474 if (ecmd.advertising & ADVERTISED_TP) {
1475 *advertised |= OFPPF_COPPER;
1477 if (ecmd.advertising & ADVERTISED_FIBRE) {
1478 *advertised |= OFPPF_FIBER;
1480 if (ecmd.advertising & ADVERTISED_Autoneg) {
1481 *advertised |= OFPPF_AUTONEG;
1483 if (ecmd.advertising & ADVERTISED_Pause) {
1484 *advertised |= OFPPF_PAUSE;
1486 if (ecmd.advertising & ADVERTISED_Asym_Pause) {
1487 *advertised |= OFPPF_PAUSE_ASYM;
1490 /* Current settings. */
/* A nonzero 'duplex' selects the full-duplex bit; 10G has no half-duplex
 * variant here. */
1491 if (ecmd.speed == SPEED_10) {
1492 *current = ecmd.duplex ? OFPPF_10MB_FD : OFPPF_10MB_HD;
1493 } else if (ecmd.speed == SPEED_100) {
1494 *current = ecmd.duplex ? OFPPF_100MB_FD : OFPPF_100MB_HD;
1495 } else if (ecmd.speed == SPEED_1000) {
1496 *current = ecmd.duplex ? OFPPF_1GB_FD : OFPPF_1GB_HD;
1497 } else if (ecmd.speed == SPEED_10000) {
1498 *current = OFPPF_10GB_FD;
/* Medium type of the port currently in use. */
1503 if (ecmd.port == PORT_TP) {
1504 *current |= OFPPF_COPPER;
1505 } else if (ecmd.port == PORT_FIBRE) {
1506 *current |= OFPPF_FIBER;
1510 *current |= OFPPF_AUTONEG;
1513 /* Peer advertisements. */
/* Peer advertisements are not implemented: '*peer' is always reported as 0. */
1514 *peer = 0; /* XXX */
1519 /* Sets the features advertised by 'netdev' to 'advertise', a bitmap of
 * OFPPF_* bits.
 *
 * The current settings are first read with ETHTOOL_GSET, the 'advertising'
 * field is rebuilt from scratch out of 'advertise', and the result is written
 * back with ETHTOOL_SSET.  The return value is that of the final
 * netdev_linux_do_ethtool() call. */
1521 netdev_linux_set_advertisements(struct netdev *netdev, uint32_t advertise)
1523 struct ethtool_cmd ecmd;
/* Read-modify-write: fetch the existing settings so that only 'advertising'
 * is changed by the ETHTOOL_SSET below. */
1526 memset(&ecmd, 0, sizeof ecmd);
1527 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1528 ETHTOOL_GSET, "ETHTOOL_GSET");
/* Translate each OFPPF_* bit into its ADVERTISED_* counterpart. */
1533 ecmd.advertising = 0;
1534 if (advertise & OFPPF_10MB_HD) {
1535 ecmd.advertising |= ADVERTISED_10baseT_Half;
1537 if (advertise & OFPPF_10MB_FD) {
1538 ecmd.advertising |= ADVERTISED_10baseT_Full;
1540 if (advertise & OFPPF_100MB_HD) {
1541 ecmd.advertising |= ADVERTISED_100baseT_Half;
1543 if (advertise & OFPPF_100MB_FD) {
1544 ecmd.advertising |= ADVERTISED_100baseT_Full;
1546 if (advertise & OFPPF_1GB_HD) {
1547 ecmd.advertising |= ADVERTISED_1000baseT_Half;
1549 if (advertise & OFPPF_1GB_FD) {
1550 ecmd.advertising |= ADVERTISED_1000baseT_Full;
1552 if (advertise & OFPPF_10GB_FD) {
1553 ecmd.advertising |= ADVERTISED_10000baseT_Full;
1555 if (advertise & OFPPF_COPPER) {
1556 ecmd.advertising |= ADVERTISED_TP;
1558 if (advertise & OFPPF_FIBER) {
1559 ecmd.advertising |= ADVERTISED_FIBRE;
1561 if (advertise & OFPPF_AUTONEG) {
1562 ecmd.advertising |= ADVERTISED_Autoneg;
1564 if (advertise & OFPPF_PAUSE) {
1565 ecmd.advertising |= ADVERTISED_Pause;
1567 if (advertise & OFPPF_PAUSE_ASYM) {
1568 ecmd.advertising |= ADVERTISED_Asym_Pause;
1570 return netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1571 ETHTOOL_SSET, "ETHTOOL_SSET");
1574 /* Attempts to set input rate limiting (policing) policy. Returns 0 if
1575 * successful, otherwise a positive errno value.
 *
 * 'kbits_burst' is normalized first: it is forced to 0 when 'kbits_rate' is
 * 0, and defaults to 1000 when a rate is given without a burst.  If the
 * normalized pair equals the cached pair, the settings are assumed unchanged.
 * Otherwise the visible sequence is: remove the ingress qdisc, add it again,
 * add the policer, then update the cache.  NOTE(review): the conditions and
 * early returns around those steps are not visible in this copy. */
1577 netdev_linux_set_policing(struct netdev *netdev,
1578 uint32_t kbits_rate, uint32_t kbits_burst)
1580 struct netdev_dev_linux *netdev_dev =
1581 netdev_dev_linux_cast(netdev_get_dev(netdev));
1582 const char *netdev_name = netdev_get_name(netdev);
1585 COVERAGE_INC(netdev_set_policing);
1587 kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
1588 : !kbits_burst ? 1000 /* Default to 1000 kbits if 0. */
1589 : kbits_burst); /* Stick with user-specified value. */
/* Compare against the values recorded by the last successful call. */
1591 if (netdev_dev->cache_valid & VALID_POLICING
1592 && netdev_dev->kbits_rate == kbits_rate
1593 && netdev_dev->kbits_burst == kbits_burst) {
1594 /* Assume that settings haven't changed since we last set them. */
1598 /* Remove any existing ingress qdisc. */
1599 error = tc_add_del_ingress_qdisc(netdev, false);
1601 VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
1602 netdev_name, strerror(error));
/* Re-create the ingress qdisc and attach the policer to it. */
1607 error = tc_add_del_ingress_qdisc(netdev, true);
1609 VLOG_WARN_RL(&rl, "%s: adding policing qdisc failed: %s",
1610 netdev_name, strerror(error));
1614 error = tc_add_policer(netdev, kbits_rate, kbits_burst);
1616 VLOG_WARN_RL(&rl, "%s: adding policing action failed: %s",
1617 netdev_name, strerror(error));
/* Record what was installed so that an identical request can be skipped. */
1622 netdev_dev->kbits_rate = kbits_rate;
1623 netdev_dev->kbits_burst = kbits_burst;
1624 netdev_dev->cache_valid |= VALID_POLICING;
/* Adds to 'types' the OVS name of every traffic-control implementation in the
 * NULL-terminated 'tcs' array that can be installed (has a 'tc_install'
 * callback) and has a non-empty 'ovs_name'.  'netdev' is unused. */
1630 netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
1633 const struct tc_ops **opsp;
1635 for (opsp = tcs; *opsp != NULL; opsp++) {
1636 const struct tc_ops *ops = *opsp;
/* Entries without an installer or with an empty OVS name are not offered. */
1637 if (ops->tc_install && ops->ovs_name[0] != '\0') {
1638 sset_add(types, ops->ovs_name);
/* Searches the NULL-terminated 'tcs' array for the entry whose 'ovs_name'
 * equals 'name'.  Callers in this file test the result against NULL, so a
 * failed lookup presumably yields NULL (the return statements are not visible
 * in this copy). */
1644 static const struct tc_ops *
1645 tc_lookup_ovs_name(const char *name)
1647 const struct tc_ops **opsp;
1649 for (opsp = tcs; *opsp != NULL; opsp++) {
1650 const struct tc_ops *ops = *opsp;
1651 if (!strcmp(name, ops->ovs_name)) {
/* Searches the NULL-terminated 'tcs' array for the entry whose 'linux_name'
 * equals 'name'.  Entries with a null 'linux_name' are skipped.  The return
 * statements are not visible in this copy; presumably the match, or NULL. */
1658 static const struct tc_ops *
1659 tc_lookup_linux_name(const char *name)
1661 const struct tc_ops **opsp;
1663 for (opsp = tcs; *opsp != NULL; opsp++) {
1664 const struct tc_ops *ops = *opsp;
/* 'linux_name' may be null, so it is checked before strcmp(). */
1665 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
/* Looks in the queue hmap of 'netdev''s traffic-control state for the queue
 * whose id is 'queue_id', scanning only the bucket selected by 'hash'.
 * Dereferences netdev_dev->tc without a null check, so the caller must have
 * made sure that 'tc' is set. */
1672 static struct tc_queue *
1673 tc_find_queue__(const struct netdev *netdev, unsigned int queue_id,
1676 struct netdev_dev_linux *netdev_dev =
1677 netdev_dev_linux_cast(netdev_get_dev(netdev));
1678 struct tc_queue *queue;
1680 HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev_dev->tc->queues) {
1681 if (queue->queue_id == queue_id) {
/* Convenience wrapper around tc_find_queue__() that derives the hash from
 * 'queue_id' with hash_int(queue_id, 0). */
1688 static struct tc_queue *
1689 tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
1691 return tc_find_queue__(netdev, queue_id, hash_int(queue_id, 0));
/* Reports the capabilities of the QoS type named 'type' in '*caps': the
 * visible code copies the implementation's 'n_queues'.  'netdev' is unused.
 * NOTE(review): handling of an unknown 'type' (null 'ops') is not visible in
 * this copy. */
1695 netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
1697 struct netdev_qos_capabilities *caps)
1699 const struct tc_ops *ops = tc_lookup_ovs_name(type);
1703 caps->n_queues = ops->n_queues;
/* Reports the QoS configuration of 'netdev': '*typep' receives the OVS name
 * of the active traffic-control implementation, and 'details' is filled in by
 * that implementation's 'qdisc_get' callback when it has one. */
1708 netdev_linux_get_qos(const struct netdev *netdev,
1709 const char **typep, struct shash *details)
1711 struct netdev_dev_linux *netdev_dev =
1712 netdev_dev_linux_cast(netdev_get_dev(netdev));
/* tc_query_qdisc() is called before netdev_dev->tc is used below. */
1715 error = tc_query_qdisc(netdev);
1720 *typep = netdev_dev->tc->ops->ovs_name;
1721 return (netdev_dev->tc->ops->qdisc_get
1722 ? netdev_dev->tc->ops->qdisc_get(netdev, details)
/* Configures QoS type 'type' with settings 'details' on 'netdev'.
 *
 * 'type' must name an implementation that has a 'tc_install' callback.  If
 * that implementation is already the active one, its 'qdisc_set' callback (if
 * any) is used to reconfigure in place.  Otherwise the existing qdisc is
 * deleted and the new one is installed from scratch. */
1727 netdev_linux_set_qos(struct netdev *netdev,
1728 const char *type, const struct shash *details)
1730 struct netdev_dev_linux *netdev_dev =
1731 netdev_dev_linux_cast(netdev_get_dev(netdev));
1732 const struct tc_ops *new_ops;
1735 new_ops = tc_lookup_ovs_name(type);
1736 if (!new_ops || !new_ops->tc_install) {
1740 error = tc_query_qdisc(netdev);
/* Same implementation as the active one: reconfigure instead of replacing. */
1745 if (new_ops == netdev_dev->tc->ops) {
1746 return new_ops->qdisc_set ? new_ops->qdisc_set(netdev, details) : 0;
1748 /* Delete existing qdisc. */
1749 error = tc_del_qdisc(netdev);
/* Deleting the qdisc is expected to have cleared netdev_dev->tc. */
1753 assert(netdev_dev->tc == NULL);
1755 /* Install new qdisc. */
1756 error = new_ops->tc_install(netdev, details);
/* Invariant: 'tc' is set exactly when installation succeeded. */
1757 assert((error == 0) == (netdev_dev->tc != NULL));
/* Retrieves the configuration of queue 'queue_id' on 'netdev' into 'details'
 * by looking the queue up with tc_find_queue() and handing it to the active
 * implementation's 'class_get' callback.  NOTE(review): the error paths
 * (query failure, unknown queue) are not visible in this copy. */
1764 netdev_linux_get_queue(const struct netdev *netdev,
1765 unsigned int queue_id, struct shash *details)
1767 struct netdev_dev_linux *netdev_dev =
1768 netdev_dev_linux_cast(netdev_get_dev(netdev));
1771 error = tc_query_qdisc(netdev);
1775 struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1777 ? netdev_dev->tc->ops->class_get(netdev, queue, details)
/* Configures queue 'queue_id' on 'netdev' from 'details' through the active
 * implementation's 'class_set' callback.  The request is rejected when
 * 'queue_id' is not below the implementation's 'n_queues' or when there is no
 * 'class_set' callback; the error value returned in that case is not visible
 * in this copy. */
1783 netdev_linux_set_queue(struct netdev *netdev,
1784 unsigned int queue_id, const struct shash *details)
1786 struct netdev_dev_linux *netdev_dev =
1787 netdev_dev_linux_cast(netdev_get_dev(netdev));
1790 error = tc_query_qdisc(netdev);
1793 } else if (queue_id >= netdev_dev->tc->ops->n_queues
1794 || !netdev_dev->tc->ops->class_set) {
1798 return netdev_dev->tc->ops->class_set(netdev, queue_id, details);
/* Deletes queue 'queue_id' from 'netdev' through the active implementation's
 * 'class_delete' callback, after locating the queue with tc_find_queue().
 * Implementations without 'class_delete' take a separate branch whose result
 * is not visible in this copy. */
1802 netdev_linux_delete_queue(struct netdev *netdev, unsigned int queue_id)
1804 struct netdev_dev_linux *netdev_dev =
1805 netdev_dev_linux_cast(netdev_get_dev(netdev));
1808 error = tc_query_qdisc(netdev);
1811 } else if (!netdev_dev->tc->ops->class_delete) {
1814 struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1816 ? netdev_dev->tc->ops->class_delete(netdev, queue)
/* Retrieves statistics for queue 'queue_id' on 'netdev' into 'stats' through
 * the active implementation's 'class_get_stats' callback, after locating the
 * queue with tc_find_queue().  Implementations without 'class_get_stats' take
 * a separate branch whose result is not visible in this copy. */
1822 netdev_linux_get_queue_stats(const struct netdev *netdev,
1823 unsigned int queue_id,
1824 struct netdev_queue_stats *stats)
1826 struct netdev_dev_linux *netdev_dev =
1827 netdev_dev_linux_cast(netdev_get_dev(netdev));
1830 error = tc_query_qdisc(netdev);
1833 } else if (!netdev_dev->tc->ops->class_get_stats) {
1836 const struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1838 ? netdev_dev->tc->ops->class_get_stats(netdev, queue, stats)
/* Starts a Netlink dump of the traffic classes of 'netdev' into 'dump': an
 * RTM_GETTCLASS request with tcm_parent 0 is built and sent on 'rtnl_sock'.
 * netdev_linux_dump_queue_stats() treats a false/zero result as failure. */
1844 start_queue_dump(const struct netdev *netdev, struct nl_dump *dump)
1846 struct ofpbuf request;
1847 struct tcmsg *tcmsg;
1849 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
1853 tcmsg->tcm_parent = 0;
1854 nl_dump_start(dump, rtnl_sock, &request);
/* The request buffer is released right after the dump has been started. */
1855 ofpbuf_uninit(&request);
/* Calls '*cb' with (queue id, details, 'aux') for the queues recorded in
 * 'netdev''s traffic-control state.  The details of each queue are obtained
 * from the active implementation's 'class_get' callback into a shash that is
 * cleared before every queue and destroyed at the end.  Implementations
 * without 'class_get' take a separate branch not visible in this copy. */
1860 netdev_linux_dump_queues(const struct netdev *netdev,
1861 netdev_dump_queues_cb *cb, void *aux)
1863 struct netdev_dev_linux *netdev_dev =
1864 netdev_dev_linux_cast(netdev_get_dev(netdev));
1865 struct tc_queue *queue;
1866 struct shash details;
1870 error = tc_query_qdisc(netdev);
1873 } else if (!netdev_dev->tc->ops->class_get) {
1878 shash_init(&details);
1879 HMAP_FOR_EACH (queue, hmap_node, &netdev_dev->tc->queues) {
/* Start every queue with an empty 'details'. */
1880 shash_clear(&details);
1882 error = netdev_dev->tc->ops->class_get(netdev, queue, &details);
1884 (*cb)(queue->queue_id, &details, aux);
1889 shash_destroy(&details);
/* Dumps per-queue statistics of 'netdev': a Netlink class dump is started
 * with start_queue_dump() and every reply message is handed, together with
 * 'cb' and 'aux', to the active implementation's 'class_dump_stats' callback.
 * The result is the error from nl_dump_done() if there is one, otherwise
 * 'last_error' (whose assignment is not visible in this copy). */
1895 netdev_linux_dump_queue_stats(const struct netdev *netdev,
1896 netdev_dump_queue_stats_cb *cb, void *aux)
1898 struct netdev_dev_linux *netdev_dev =
1899 netdev_dev_linux_cast(netdev_get_dev(netdev));
1900 struct nl_dump dump;
1905 error = tc_query_qdisc(netdev);
1908 } else if (!netdev_dev->tc->ops->class_dump_stats) {
1913 if (!start_queue_dump(netdev, &dump)) {
1916 while (nl_dump_next(&dump, &msg)) {
1917 error = netdev_dev->tc->ops->class_dump_stats(netdev, &msg, cb, aux);
/* A failure of the dump itself takes precedence over a per-message error. */
1923 error = nl_dump_done(&dump);
1924 return error ? error : last_error;
/* Stores the IPv4 address and netmask of 'netdev_' in '*address' and
 * '*netmask'.  The values are fetched with SIOCGIFADDR / SIOCGIFNETMASK on the
 * first call and cached under the VALID_IN4 bit afterwards.  Returns
 * EADDRNOTAVAIL when the address is INADDR_ANY, otherwise 0. */
1928 netdev_linux_get_in4(const struct netdev *netdev_,
1929 struct in_addr *address, struct in_addr *netmask)
1931 struct netdev_dev_linux *netdev_dev =
1932 netdev_dev_linux_cast(netdev_get_dev(netdev_));
/* Cache miss: query the kernel for both values. */
1934 if (!(netdev_dev->cache_valid & VALID_IN4)) {
1937 error = netdev_linux_get_ipv4(netdev_, &netdev_dev->address,
1938 SIOCGIFADDR, "SIOCGIFADDR");
1943 error = netdev_linux_get_ipv4(netdev_, &netdev_dev->netmask,
1944 SIOCGIFNETMASK, "SIOCGIFNETMASK");
1949 netdev_dev->cache_valid |= VALID_IN4;
1951 *address = netdev_dev->address;
1952 *netmask = netdev_dev->netmask;
/* The outputs are written even when the result is EADDRNOTAVAIL. */
1953 return address->s_addr == INADDR_ANY ? EADDRNOTAVAIL : 0;
/* Assigns IPv4 'address' and 'netmask' to 'netdev_' with SIOCSIFADDR and,
 * unless 'address' is INADDR_ANY, SIOCSIFNETMASK, and records both values in
 * the VALID_IN4 cache.
 *
 * NOTE(review): the cache is updated with 'netmask' before the netmask ioctl
 * is issued, so if that second ioctl fails the cached netmask may not match
 * the kernel's -- confirm whether this is intended. */
1957 netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
1958 struct in_addr netmask)
1960 struct netdev_dev_linux *netdev_dev =
1961 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1964 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
1966 netdev_dev->cache_valid |= VALID_IN4;
1967 netdev_dev->address = address;
1968 netdev_dev->netmask = netmask;
/* No netmask is set when the address is being cleared to INADDR_ANY. */
1969 if (address.s_addr != INADDR_ANY) {
1970 error = do_set_addr(netdev_, SIOCSIFNETMASK,
1971 "SIOCSIFNETMASK", netmask);
/* Parses one line of the if_inet6 table (see netdev_linux_get_in6(), which
 * reads it from /proc/net/if_inet6): 16 two-digit hex bytes of address are
 * scanned into 'in6', four further hex fields are skipped, and an interface
 * name of at most 16 characters is scanned into 'ifname'.  The call and the
 * comparison of its result are on lines not visible in this copy;
 * netdev_linux_get_in6() uses the result as a success flag. */
1978 parse_if_inet6_line(const char *line,
1979 struct in6_addr *in6, char ifname[16 + 1])
1981 uint8_t *s6 = in6->s6_addr;
/* X8 scans exactly two hex digits into one uint8_t. */
1982 #define X8 "%2"SCNx8
1984 " "X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8
1985 "%*x %*x %*x %*x %16s\n",
1986 &s6[0], &s6[1], &s6[2], &s6[3],
1987 &s6[4], &s6[5], &s6[6], &s6[7],
1988 &s6[8], &s6[9], &s6[10], &s6[11],
1989 &s6[12], &s6[13], &s6[14], &s6[15],
1993 /* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address (if
1994 * 'in6' is non-null) and returns true. Otherwise, returns false.
 *
 * The address is looked up by scanning /proc/net/if_inet6 for a line whose
 * interface name equals the netdev's name, and is cached under the VALID_IN6
 * bit; when no line matches, the cached value stays 'in6addr_any'.
 * NOTE(review): the return statement is not visible in this copy, so the
 * true/false contract above is taken from the original comment. */
1996 netdev_linux_get_in6(const struct netdev *netdev_, struct in6_addr *in6)
1998 struct netdev_dev_linux *netdev_dev =
1999 netdev_dev_linux_cast(netdev_get_dev(netdev_));
2000 if (!(netdev_dev->cache_valid & VALID_IN6)) {
/* Default used when the device has no entry in the table. */
2004 netdev_dev->in6 = in6addr_any;
2006 file = fopen("/proc/net/if_inet6", "r");
2008 const char *name = netdev_get_name(netdev_);
2009 while (fgets(line, sizeof line, file)) {
2010 struct in6_addr in6_tmp;
2011 char ifname[16 + 1];
2012 if (parse_if_inet6_line(line, &in6_tmp, ifname)
2013 && !strcmp(name, ifname))
2015 netdev_dev->in6 = in6_tmp;
2021 netdev_dev->cache_valid |= VALID_IN6;
2023 *in6 = netdev_dev->in6;
/* Fills '*sa' with an AF_INET socket address carrying 'addr'.  A zeroed
 * 'struct sockaddr_in' is built locally and copied over a zeroed '*sa'.  The
 * memcpy() length is sizeof(struct sockaddr_in), so this relies on that being
 * no larger than sizeof(struct sockaddr). */
2028 make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
2030 struct sockaddr_in sin;
2031 memset(&sin, 0, sizeof sin);
2032 sin.sin_family = AF_INET;
2033 sin.sin_addr = addr;
2036 memset(sa, 0, sizeof *sa);
2037 memcpy(sa, &sin, sizeof sin);
/* Issues the address-setting ioctl 'ioctl_nr' (named 'ioctl_name' for
 * logging) on 'netdev' with IPv4 address 'addr': the interface name and the
 * address are placed in a 'struct ifreq' and passed to
 * netdev_linux_do_ioctl(), whose result is returned. */
2041 do_set_addr(struct netdev *netdev,
2042 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
2045 ovs_strzcpy(ifr.ifr_name, netdev_get_name(netdev), sizeof ifr.ifr_name);
2046 make_in4_sockaddr(&ifr.ifr_addr, addr);
2048 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
2052 /* Adds 'router' as a default IP gateway.
 *
 * A route with destination and genmask INADDR_ANY and gateway 'router' is
 * added with the SIOCADDRT ioctl on 'af_inet_sock'; a failure is logged with
 * the errno text.  'netdev' is unused. */
2054 netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
2056 struct in_addr any = { INADDR_ANY };
2060 memset(&rt, 0, sizeof rt);
/* Destination 0.0.0.0 with mask 0.0.0.0 matches every address. */
2061 make_in4_sockaddr(&rt.rt_dst, any);
2062 make_in4_sockaddr(&rt.rt_gateway, router);
2063 make_in4_sockaddr(&rt.rt_genmask, any);
2064 rt.rt_flags = RTF_UP | RTF_GATEWAY;
2065 error = ioctl(af_inet_sock, SIOCADDRT, &rt) < 0 ? errno : 0;
2067 VLOG_WARN("ioctl(SIOCADDRT): %s", strerror(error));
/* Looks up the route to 'host' by scanning /proc/net/route.  For a route that
 * is up and whose masked destination equals the masked 'host', '*next_hop' is
 * set (0 for a directly reachable host, else the gateway) and '*netdev_name'
 * receives an xstrdup()'d copy of the interface name -- presumably to be freed
 * by the caller, TODO confirm.  '*netdev_name' is set to NULL up front.
 * NOTE(review): the return values, and the condition choosing between the
 * "direct" and "gateway" cases, are not visible in this copy. */
2073 netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
2076 static const char fn[] = "/proc/net/route";
2081 *netdev_name = NULL;
2082 stream = fopen(fn, "r");
2083 if (stream == NULL) {
2084 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
2089 while (fgets(line, sizeof line, stream)) {
2092 ovs_be32 dest, gateway, mask;
2093 int refcnt, metric, mtu;
2094 unsigned int flags, use, window, irtt;
2097 "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
2099 iface, &dest, &gateway, &flags, &refcnt,
2100 &use, &metric, &mask, &mtu, &window, &irtt) != 11) {
2102 VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
2106 if (!(flags & RTF_UP)) {
2107 /* Skip routes that aren't up. */
2111 /* The values of 'dest', 'mask', and 'gateway' were given in
2112 * network byte order, so we don't need any endian
2113 * conversions here. */
2114 if ((dest & mask) == (host->s_addr & mask)) {
2116 /* The host is directly reachable. */
2117 next_hop->s_addr = 0;
2119 /* To reach the host, we must go through a gateway. */
2120 next_hop->s_addr = gateway;
2122 *netdev_name = xstrdup(iface);
/* Adds driver information for 'netdev' to 'sh' under the keys "driver_name",
 * "driver_version" and "firmware_version".  The data is obtained with an
 * ethtool query (logged as "ETHTOOL_GDRVINFO") into a 'struct
 * ethtool_drvinfo'; each value stored in 'sh' is an xstrdup()'d copy. */
2134 netdev_linux_get_status(const struct netdev *netdev, struct shash *sh)
2136 struct ethtool_drvinfo drvinfo;
2139 memset(&drvinfo, 0, sizeof drvinfo);
/* netdev_linux_do_ethtool() takes a 'struct ethtool_cmd *', hence the cast. */
2140 error = netdev_linux_do_ethtool(netdev_get_name(netdev),
2141 (struct ethtool_cmd *)&drvinfo,
2143 "ETHTOOL_GDRVINFO");
2145 shash_add(sh, "driver_name", xstrdup(drvinfo.driver));
2146 shash_add(sh, "driver_version", xstrdup(drvinfo.version));
2147 shash_add(sh, "firmware_version", xstrdup(drvinfo.fw_version));
2153 /* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
2154 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
2155 * returns 0. Otherwise, it returns a positive errno value; in particular,
2156 * ENXIO indicates that there is no ARP table entry for 'ip' on 'netdev'.
 *
 * The lookup is a SIOCGARP ioctl on 'af_inet_sock'.  ENXIO is not logged;
 * any other failure produces a rate-limited warning. */
2158 netdev_linux_arp_lookup(const struct netdev *netdev,
2159 ovs_be32 ip, uint8_t mac[ETH_ADDR_LEN])
2162 struct sockaddr_in sin;
/* Build the request: protocol address 'ip', Ethernet hardware type, and the
 * device name. */
2165 memset(&r, 0, sizeof r);
2166 memset(&sin, 0, sizeof sin);
2167 sin.sin_family = AF_INET;
2168 sin.sin_addr.s_addr = ip;
2170 memcpy(&r.arp_pa, &sin, sizeof sin);
2171 r.arp_ha.sa_family = ARPHRD_ETHER;
2173 ovs_strzcpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
2174 COVERAGE_INC(netdev_arp_lookup);
2175 retval = ioctl(af_inet_sock, SIOCGARP, &r) < 0 ? errno : 0;
2177 memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
2178 } else if (retval != ENXIO) {
2179 VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
2180 netdev_get_name(netdev), IP_ARGS(&ip), strerror(retval));
/* Translates 'nd', a set of NETDEV_* flags, into interface (IFF_*) flags; it
 * is used by netdev_linux_update_flags() to build masks.  The visible code
 * tests NETDEV_UP and NETDEV_PROMISC; the assignments made for each are not
 * visible in this copy. */
2186 nd_to_iff_flags(enum netdev_flags nd)
2189 if (nd & NETDEV_UP) {
2192 if (nd & NETDEV_PROMISC) {
/* Translates interface flags 'iff' into a set of NETDEV_* flags, starting
 * from an empty set.  The visible code maps IFF_PROMISC to NETDEV_PROMISC;
 * any further mappings are on lines not visible in this copy. */
2199 iff_to_nd_flags(int iff)
2201 enum netdev_flags nd = 0;
2205 if (iff & IFF_PROMISC) {
2206 nd |= NETDEV_PROMISC;
/* Clears the flags in 'off' and sets the flags in 'on' on 'netdev', storing
 * the previous flags (as NETDEV_* bits) in '*old_flagsp'.  The kernel is only
 * written to, via set_flags(), when the resulting interface flags differ from
 * the current ones. */
2212 netdev_linux_update_flags(struct netdev *netdev, enum netdev_flags off,
2213 enum netdev_flags on, enum netdev_flags *old_flagsp)
2215 int old_flags, new_flags;
2218 error = get_flags(netdev, &old_flags);
2220 *old_flagsp = iff_to_nd_flags(old_flags);
/* 'off' is applied before 'on', so a flag present in both ends up set. */
2221 new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
2222 if (new_flags != old_flags) {
2223 error = set_flags(netdev, new_flags);
/* Returns the 'change_seq' value of the device underlying 'netdev'. */
2230 netdev_linux_change_seq(const struct netdev *netdev)
2232 return netdev_dev_linux_cast(netdev_get_dev(netdev))->change_seq;
/* Expands to the member list shared by the netdev classes defined below.  The
 * callbacks listed in the macro body are common to all of them; the macro
 * parameters carry what differs per class.  The uses below pass a create
 * function, a get_stats function and a set_stats function (or NULL) after the
 * first argument.  NOTE(review): the lines of the body that reference the
 * parameters are not visible in this copy. */
2235 #define NETDEV_LINUX_CLASS(NAME, CREATE, GET_STATS, SET_STATS) \
2239 netdev_linux_init, \
2241 netdev_linux_wait, \
2244 netdev_linux_destroy, \
2245 NULL, /* get_config */ \
2246 NULL, /* set_config */ \
2248 netdev_linux_open, \
2249 netdev_linux_close, \
2251 netdev_linux_listen, \
2252 netdev_linux_recv, \
2253 netdev_linux_recv_wait, \
2254 netdev_linux_drain, \
2256 netdev_linux_send, \
2257 netdev_linux_send_wait, \
2259 netdev_linux_set_etheraddr, \
2260 netdev_linux_get_etheraddr, \
2261 netdev_linux_get_mtu, \
2262 netdev_linux_set_mtu, \
2263 netdev_linux_get_ifindex, \
2264 netdev_linux_get_carrier, \
2265 netdev_linux_get_carrier_resets, \
2266 netdev_linux_set_miimon_interval, \
2270 netdev_linux_get_features, \
2271 netdev_linux_set_advertisements, \
2273 netdev_linux_set_policing, \
2274 netdev_linux_get_qos_types, \
2275 netdev_linux_get_qos_capabilities, \
2276 netdev_linux_get_qos, \
2277 netdev_linux_set_qos, \
2278 netdev_linux_get_queue, \
2279 netdev_linux_set_queue, \
2280 netdev_linux_delete_queue, \
2281 netdev_linux_get_queue_stats, \
2282 netdev_linux_dump_queues, \
2283 netdev_linux_dump_queue_stats, \
2285 netdev_linux_get_in4, \
2286 netdev_linux_set_in4, \
2287 netdev_linux_get_in6, \
2288 netdev_linux_add_router, \
2289 netdev_linux_get_next_hop, \
2290 netdev_linux_get_status, \
2291 netdev_linux_arp_lookup, \
2293 netdev_linux_update_flags, \
2295 netdev_linux_change_seq \
/* Netdev class that creates devices with netdev_linux_create() and reports
 * stats with netdev_linux_get_stats(); it has no set_stats callback. */
2298 const struct netdev_class netdev_linux_class =
2301 netdev_linux_create,
2302 netdev_linux_get_stats,
2303 NULL); /* set_stats */
/* Netdev class that creates devices with netdev_linux_create_tap() and
 * reports stats with netdev_pseudo_get_stats(); it has no set_stats
 * callback. */
2305 const struct netdev_class netdev_tap_class =
2308 netdev_linux_create_tap,
2309 netdev_pseudo_get_stats,
2310 NULL); /* set_stats */
/* Netdev class that creates devices with netdev_linux_create(), reports stats
 * with netdev_pseudo_get_stats(), and sets stats through
 * netdev_vport_set_stats(). */
2312 const struct netdev_class netdev_internal_class =
2315 netdev_linux_create,
2316 netdev_pseudo_get_stats,
2317 netdev_vport_set_stats);
2319 /* HTB traffic control class. */
/* Upper bound on HTB class minor numbers accepted as queues:
 * htb_parse_tcmsg__() maps minor numbers 1..HTB_N_QUEUES of major 1 to queue
 * ids 0..HTB_N_QUEUES-1. */
2321 #define HTB_N_QUEUES 0xf000
/* NOTE(review): the 'struct' header lines are not visible in this copy.  The
 * first 'max_rate' below is accessed as htb->max_rate (qdisc-wide ceiling);
 * the remaining fields are accessed through 'struct htb_class' pointers. */
2325 unsigned int max_rate; /* In bytes/s. */
2329 struct tc_queue tc_queue;
2330 unsigned int min_rate; /* In bytes/s. */
2331 unsigned int max_rate; /* In bytes/s. */
2332 unsigned int burst; /* In bytes. */
2333 unsigned int priority; /* Lower values are higher priorities. */
/* Returns the 'struct htb' that embeds 'netdev''s traffic-control state as its
 * 'tc' member.  No check is made that the active implementation really is
 * HTB, so this is only meaningful when called from HTB code paths. */
2337 htb_get__(const struct netdev *netdev)
2339 struct netdev_dev_linux *netdev_dev =
2340 netdev_dev_linux_cast(netdev_get_dev(netdev));
2341 return CONTAINER_OF(netdev_dev->tc, struct htb, tc);
/* Allocates a 'struct htb', initializes its embedded 'tc' with 'tc_ops_htb',
 * records 'max_rate', and makes it the traffic-control state of 'netdev'.
 * Only in-memory state is touched here; no request is sent to the kernel.
 *
 * NOTE(review): 'max_rate' is a uint64_t parameter, while the field it is
 * stored in appears to be declared 'unsigned int' -- confirm that values above
 * UINT_MAX cannot occur or that truncation is acceptable. */
2345 htb_install__(struct netdev *netdev, uint64_t max_rate)
2347 struct netdev_dev_linux *netdev_dev =
2348 netdev_dev_linux_cast(netdev_get_dev(netdev));
2351 htb = xmalloc(sizeof *htb);
2352 tc_init(&htb->tc, &tc_ops_htb);
2353 htb->max_rate = max_rate;
2355 netdev_dev->tc = &htb->tc;
2358 /* Create an HTB qdisc.
2360 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1".
 *
 * Any existing qdisc is deleted first (the result of that deletion is
 * ignored).  Returns the result of the Netlink transaction. */
2362 htb_setup_qdisc__(struct netdev *netdev)
2365 struct tc_htb_glob opt;
2366 struct ofpbuf request;
2367 struct tcmsg *tcmsg;
2369 tc_del_qdisc(netdev);
/* NLM_F_EXCL | NLM_F_CREATE: create a new qdisc and fail if one exists. */
2371 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2372 NLM_F_EXCL | NLM_F_CREATE, &request);
/* Handle "1:" attached at the root. */
2376 tcmsg->tcm_handle = tc_make_handle(1, 0);
2377 tcmsg->tcm_parent = TC_H_ROOT;
2379 nl_msg_put_string(&request, TCA_KIND, "htb");
2381 memset(&opt, 0, sizeof opt);
2382 opt.rate2quantum = 10;
/* The HTB global options travel as TCA_HTB_INIT nested in TCA_OPTIONS. */
2386 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2387 nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
2388 nl_msg_end_nested(&request, opt_offset);
2390 return tc_transact(&request, NULL);
2393 /* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
2394 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>".
 *
 * The rate, ceiling, burst and priority come from 'class'.  The device MTU is
 * needed to fill in the rate tables and buffers, so a device whose MTU cannot
 * be obtained is rejected with a warning.  A failed Netlink transaction is
 * logged with all of the class parameters. */
2396 htb_setup_class__(struct netdev *netdev, unsigned int handle,
2397 unsigned int parent, struct htb_class *class)
2400 struct tc_htb_opt opt;
2401 struct ofpbuf request;
2402 struct tcmsg *tcmsg;
2406 error = netdev_get_mtu(netdev, &mtu);
2408 VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU",
2409 netdev_get_name(netdev));
/* 'rate' is derived from min_rate and 'ceil' from max_rate; both buffers use
 * the same 'burst'. */
2413 memset(&opt, 0, sizeof opt);
2414 tc_fill_rate(&opt.rate, class->min_rate, mtu);
2415 tc_fill_rate(&opt.ceil, class->max_rate, mtu);
2416 opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
2417 opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
2418 opt.prio = class->priority;
2420 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
2424 tcmsg->tcm_handle = handle;
2425 tcmsg->tcm_parent = parent;
/* TCA_OPTIONS nests the HTB parameters plus the rate and ceil tables. */
2427 nl_msg_put_string(&request, TCA_KIND, "htb");
2428 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2429 nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
2430 tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
2431 tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
2432 nl_msg_end_nested(&request, opt_offset);
2434 error = tc_transact(&request, NULL);
2436 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
2437 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
2438 netdev_get_name(netdev),
2439 tc_get_major(handle), tc_get_minor(handle),
2440 tc_get_major(parent), tc_get_minor(parent),
2441 class->min_rate, class->max_rate,
2442 class->burst, class->priority, strerror(error));
2447 /* Parses Netlink attributes in 'nl_options' for HTB parameters and stores a
2448 * description of them into 'class'. The description complies with the
2449 * specification given in the vswitch database documentation for linux-htb. */
/* Extracts the mandatory TCA_HTB_PARMS attribute (a 'struct tc_htb_opt') from
 * the nested attributes 'nl_options' and copies rate, ceiling, burst and
 * priority from it into 'class'.  A parse failure is logged with a
 * rate-limited warning. */
2452 htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
2454 static const struct nl_policy tca_htb_policy[] = {
2455 [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
2456 .min_len = sizeof(struct tc_htb_opt) },
2459 struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
2460 const struct tc_htb_opt *htb;
2462 if (!nl_parse_nested(nl_options, tca_htb_policy,
2463 attrs, ARRAY_SIZE(tca_htb_policy))) {
2464 VLOG_WARN_RL(&rl, "failed to parse HTB class options");
2468 htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
2469 class->min_rate = htb->rate.rate;
2470 class->max_rate = htb->ceil.rate;
/* The kernel reports the buffer in ticks; convert it back to bytes using the
 * class rate. */
2471 class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
2472 class->priority = htb->prio;
/* Parses the traffic-class message 'tcmsg' with tc_parse_class().  Each of
 * the outputs is optional as far as the visible code shows: 'queue_id' and
 * 'options' are only filled in when non-null, and 'stats' is passed straight
 * through to tc_parse_class().
 *
 * A class handle 1:N with 1 <= N <= HTB_N_QUEUES maps to queue id N - 1.
 * NOTE(review): the handling of handles outside that range is not visible in
 * this copy. */
2477 htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2478 struct htb_class *options,
2479 struct netdev_queue_stats *stats)
2481 struct nlattr *nl_options;
2482 unsigned int handle;
2485 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2486 if (!error && queue_id) {
2487 unsigned int major = tc_get_major(handle);
2488 unsigned int minor = tc_get_minor(handle);
2489 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2490 *queue_id = minor - 1;
2495 if (!error && options) {
2496 error = htb_parse_tca_options__(nl_options, options);
/* Derives the qdisc-wide HTB class parameters in 'hc' from 'details'.
 *
 * "max-rate" in 'details' is parsed as a decimal number and divided by 8 (the
 * htb_class rate fields are documented as bytes/s).  When it is absent or
 * evaluates to 0, the rate is derived instead from the device's current
 * features via netdev_features_to_bps(), again divided by 8.  'min_rate' is
 * set equal to 'max_rate'. */
2502 htb_parse_qdisc_details__(struct netdev *netdev,
2503 const struct shash *details, struct htb_class *hc)
2505 const char *max_rate_s;
2507 max_rate_s = shash_find_data(details, "max-rate");
2508 hc->max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
2509 if (!hc->max_rate) {
/* Only the 'current' feature bitmap is requested; the other outputs are
 * passed as NULL. */
2512 netdev_get_features(netdev, &current, NULL, NULL, NULL);
2513 hc->max_rate = netdev_features_to_bps(current) / 8;
2515 hc->min_rate = hc->max_rate;
/* Derives the HTB class parameters in 'hc' for one queue from 'details'.
 *
 * The keys "min-rate", "max-rate" and "burst" are parsed as decimal numbers
 * and divided by 8; "priority" is parsed as a decimal number and used as is.
 * Each value is then clamped:
 *
 *   - min_rate: at least the MTU, at most the qdisc's max_rate.
 *   - max_rate: at least min_rate, at most the qdisc's max_rate.
 *   - burst: at least MTU + 64.
 *
 * A device whose MTU cannot be obtained is rejected with a warning. */
2521 htb_parse_class_details__(struct netdev *netdev,
2522 const struct shash *details, struct htb_class *hc)
2524 const struct htb *htb = htb_get__(netdev);
2525 const char *min_rate_s = shash_find_data(details, "min-rate");
2526 const char *max_rate_s = shash_find_data(details, "max-rate");
2527 const char *burst_s = shash_find_data(details, "burst");
2528 const char *priority_s = shash_find_data(details, "priority");
2531 error = netdev_get_mtu(netdev, &mtu);
2533 VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU",
2534 netdev_get_name(netdev));
2538 /* HTB requires at least an mtu sized min-rate to send any traffic even
2539 * on uncongested links. */
2540 hc->min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
2541 hc->min_rate = MAX(hc->min_rate, mtu);
2542 hc->min_rate = MIN(hc->min_rate, htb->max_rate);
2545 hc->max_rate = (max_rate_s
2546 ? strtoull(max_rate_s, NULL, 10) / 8
2548 hc->max_rate = MAX(hc->max_rate, hc->min_rate);
2549 hc->max_rate = MIN(hc->max_rate, htb->max_rate);
2553 /* According to hints in the documentation that I've read, it is important
2554 * that 'burst' be at least as big as the largest frame that might be
2555 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
2556 * but having it a bit too small is a problem. Since netdev_get_mtu()
2557 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
2558 * the MTU. We actually add 64, instead of 14, as a guard against
2559 * additional headers getting tacked on somewhere that we're not aware of. */
2560 hc->burst = burst_s ? strtoull(burst_s, NULL, 10) / 8 : 0;
2561 hc->burst = MAX(hc->burst, mtu + 64);
2564 hc->priority = priority_s ? strtoul(priority_s, NULL, 10) : 0;
/* Asks the kernel about the class identified by 'handle' and 'parent' on
 * 'netdev' using tc_query_class(), then parses the reply with
 * htb_parse_tcmsg__(), passing NULL for the queue ID and forwarding 'options'
 * and 'stats'.  The reply buffer is deleted after it has been parsed. */
2570 htb_query_class__(const struct netdev *netdev, unsigned int handle,
2571 unsigned int parent, struct htb_class *options,
2572 struct netdev_queue_stats *stats)
2574 struct ofpbuf *reply;
2577 error = tc_query_class(netdev, handle, parent, &reply);
2579 error = htb_parse_tcmsg__(reply, NULL, options, stats);
2580 ofpbuf_delete(reply);
/* Installs an HTB qdisc on 'netdev': sets up the qdisc with
 * htb_setup_qdisc__(), derives the qdisc-wide class parameters from
 * 'details', configures class 1:fffe under parent 1:0 with them, and records
 * the resulting max_rate through htb_install__(). */
2586 htb_tc_install(struct netdev *netdev, const struct shash *details)
2590 error = htb_setup_qdisc__(netdev);
2592 struct htb_class hc;
2594 htb_parse_qdisc_details__(netdev, details, &hc);
2595 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2596 tc_make_handle(1, 0), &hc);
2598 htb_install__(netdev, hc.max_rate);
/* Returns the htb_class that embeds 'queue' as its 'tc_queue' member. */
2604 static struct htb_class *
2605 htb_class_cast__(const struct tc_queue *queue)
2607 return CONTAINER_OF(queue, struct htb_class, tc_queue);
/* Records the settings in 'hc' as the local state of queue 'queue_id' on
 * 'netdev'.
 *
 * The queue is looked up with tc_find_queue__() under the hash of 'queue_id'.
 * The function either uses the htb_class that contains the queue found, or
 * allocates a new htb_class and inserts it into the qdisc's queue map.  In
 * both cases min_rate, max_rate, burst and priority are then copied from
 * 'hc'. */
2611 htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
2612 const struct htb_class *hc)
2614 struct htb *htb = htb_get__(netdev);
2615 size_t hash = hash_int(queue_id, 0);
2616 struct tc_queue *queue;
2617 struct htb_class *hcp;
2619 queue = tc_find_queue__(netdev, queue_id, hash);
2621 hcp = htb_class_cast__(queue);
2623 hcp = xmalloc(sizeof *hcp);
2624 queue = &hcp->tc_queue;
2625 queue->queue_id = queue_id;
2626 hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
2629 hcp->min_rate = hc->min_rate;
2630 hcp->max_rate = hc->max_rate;
2631 hcp->burst = hc->burst;
2632 hcp->priority = hc->priority;
/* Loads the state of an HTB qdisc that already exists on 'netdev'.
 *
 * Class 1:fffe is queried for the qdisc-wide options and its max_rate is
 * handed to htb_install__().  The device's queues are then dumped, and every
 * message that htb_parse_tcmsg__() parses successfully is recorded with
 * htb_update_queue__().  'nlmsg' is unused. */
2636 htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
2639 struct nl_dump dump;
2640 struct htb_class hc;
2642 /* Get qdisc options. */
2644 htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
2645 htb_install__(netdev, hc.max_rate);
2648 if (!start_queue_dump(netdev, &dump)) {
/* Record each class in the dump that parses as an HTB queue. */
2651 while (nl_dump_next(&dump, &msg)) {
2652 unsigned int queue_id;
2654 if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
2655 htb_update_queue__(netdev, queue_id, &hc);
2658 nl_dump_done(&dump);
/* Removes every htb_class from the queue map of the htb that embeds 'tc'.
 * The "safe" iterator is used because entries are removed while the map is
 * being walked. */
2664 htb_tc_destroy(struct tc *tc)
2666 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
2667 struct htb_class *hc, *next;
2669 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &htb->tc.queues) {
2670 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
/* Adds the qdisc-wide "max-rate" of 'netdev' to 'details', formatted as 8
 * times the stored max_rate. */
2678 htb_qdisc_get(const struct netdev *netdev, struct shash *details)
2680 const struct htb *htb = htb_get__(netdev);
2681 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * htb->max_rate));
/* Reconfigures class 1:fffe (parent 1:0) on 'netdev' from the qdisc-wide
 * settings in 'details' and stores the new max_rate in the netdev's htb
 * state. */
2686 htb_qdisc_set(struct netdev *netdev, const struct shash *details)
2688 struct htb_class hc;
2691 htb_parse_qdisc_details__(netdev, details, &hc);
2692 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2693 tc_make_handle(1, 0), &hc);
2695 htb_get__(netdev)->max_rate = hc.max_rate;
/* Adds the settings of 'queue' to 'details'.  "min-rate", "max-rate" and
 * "burst" are formatted as 8 times the stored value, and "max-rate" is
 * emitted only when it differs from the minimum rate.  "priority" is
 * formatted as a plain unsigned value.  'netdev' is unused. */
2701 htb_class_get(const struct netdev *netdev OVS_UNUSED,
2702 const struct tc_queue *queue, struct shash *details)
2704 const struct htb_class *hc = htb_class_cast__(queue);
2706 shash_add(details, "min-rate", xasprintf("%llu", 8ULL * hc->min_rate));
2707 if (hc->min_rate != hc->max_rate) {
2708 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hc->max_rate));
2710 shash_add(details, "burst", xasprintf("%llu", 8ULL * hc->burst));
2712 shash_add(details, "priority", xasprintf("%u", hc->priority));
/* Configures queue 'queue_id' on 'netdev' from 'details': parses the
 * settings, sets up kernel class 1:(queue_id + 1) under parent 1:fffe with
 * them, and records them locally through htb_update_queue__(). */
2718 htb_class_set(struct netdev *netdev, unsigned int queue_id,
2719 const struct shash *details)
2721 struct htb_class hc;
2724 error = htb_parse_class_details__(netdev, details, &hc);
2729 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
2730 tc_make_handle(1, 0xfffe), &hc);
2735 htb_update_queue__(netdev, queue_id, &hc);
/* Deletes kernel class 1:(queue_id + 1), the class that backs 'queue', and
 * removes the queue from the qdisc's queue map. */
2740 htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
2742 struct htb_class *hc = htb_class_cast__(queue);
2743 struct htb *htb = htb_get__(netdev);
2746 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
2748 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
/* Fetches the statistics of 'queue' into 'stats' by querying kernel class
 * 1:(queue_id + 1) under parent 1:fffe.  No class options are requested. */
2755 htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
2756 struct netdev_queue_stats *stats)
2758 return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
2759 tc_make_handle(1, 0xfffe), NULL, stats);
/* Parses the class message 'nlmsg' and, when its handle has the form 1:N with
 * 1 <= N <= HTB_N_QUEUES, invokes 'cb' with queue ID N - 1, the parsed
 * statistics and 'aux'.  'netdev' is unused. */
2763 htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
2764 const struct ofpbuf *nlmsg,
2765 netdev_dump_queue_stats_cb *cb, void *aux)
2767 struct netdev_queue_stats stats;
2768 unsigned int handle, major, minor;
2771 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
2776 major = tc_get_major(handle);
2777 minor = tc_get_minor(handle);
2778 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2779 (*cb)(minor - 1, &stats, aux);
/* Operations table for the "linux-htb" QoS type, whose kernel qdisc name is
 * "htb" and which supports HTB_N_QUEUES queues. */
2784 static const struct tc_ops tc_ops_htb = {
2785 "htb", /* linux_name */
2786 "linux-htb", /* ovs_name */
2787 HTB_N_QUEUES, /* n_queues */
2796 htb_class_get_stats,
2797 htb_class_dump_stats
2800 /* "linux-hfsc" traffic control class. */
/* Largest class minor number that the HFSC code treats as a queue; also the
 * 'n_queues' value advertised in tc_ops_hfsc. */
2802 #define HFSC_N_QUEUES 0xf000
2810 struct tc_queue tc_queue;
/* Returns the hfsc structure that embeds the 'tc' currently attached to the
 * Linux device underlying 'netdev'. */
2815 static struct hfsc *
2816 hfsc_get__(const struct netdev *netdev)
2818 struct netdev_dev_linux *netdev_dev;
2819 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
2820 return CONTAINER_OF(netdev_dev->tc, struct hfsc, tc);
/* Returns the hfsc_class that embeds 'queue' as its 'tc_queue' member. */
2823 static struct hfsc_class *
2824 hfsc_class_cast__(const struct tc_queue *queue)
2826 return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
/* Allocates a new hfsc, initializes its 'tc' with tc_ops_hfsc, stores
 * 'max_rate' in it, and attaches it to the Linux device underlying
 * 'netdev'. */
2830 hfsc_install__(struct netdev *netdev, uint32_t max_rate)
2832 struct netdev_dev_linux * netdev_dev;
2835 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
2836 hfsc = xmalloc(sizeof *hfsc);
2837 tc_init(&hfsc->tc, &tc_ops_hfsc);
2838 hfsc->max_rate = max_rate;
2839 netdev_dev->tc = &hfsc->tc;
/* Records the rates in 'hc' as the local state of queue 'queue_id' on
 * 'netdev'.
 *
 * The queue is looked up with tc_find_queue__() under the hash of 'queue_id'.
 * The function either uses the hfsc_class that contains the queue found, or
 * allocates a new hfsc_class and inserts it into the qdisc's queue map.  In
 * both cases min_rate and max_rate are then copied from 'hc'. */
2843 hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
2844 const struct hfsc_class *hc)
2848 struct hfsc_class *hcp;
2849 struct tc_queue *queue;
2851 hfsc = hfsc_get__(netdev);
2852 hash = hash_int(queue_id, 0);
2854 queue = tc_find_queue__(netdev, queue_id, hash);
2856 hcp = hfsc_class_cast__(queue);
2858 hcp = xmalloc(sizeof *hcp);
2859 queue = &hcp->tc_queue;
2860 queue->queue_id = queue_id;
2861 hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
2864 hcp->min_rate = hc->min_rate;
2865 hcp->max_rate = hc->max_rate;
/* Decodes the nested HFSC class options in 'nl_options' into 'class'.
 *
 * The RSC, FSC and USC service curves are read, each of which must be at
 * least sizeof(struct tc_service_curve) bytes long.  A rate-limited warning
 * is logged when the attributes cannot be parsed, when any curve has a
 * nonzero 'm1' or 'd' (a non-linear curve), when the 'm2' of the RSC differs
 * from that of the FSC, and when the 'm2' of the RSC exceeds that of the USC.
 * For an accepted set of curves, min_rate is taken from the 'm2' of the FSC
 * and max_rate from the 'm2' of the USC. */
2869 hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
2871 const struct tc_service_curve *rsc, *fsc, *usc;
2872 static const struct nl_policy tca_hfsc_policy[] = {
2874 .type = NL_A_UNSPEC,
2876 .min_len = sizeof(struct tc_service_curve),
2879 .type = NL_A_UNSPEC,
2881 .min_len = sizeof(struct tc_service_curve),
2884 .type = NL_A_UNSPEC,
2886 .min_len = sizeof(struct tc_service_curve),
2889 struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];
2891 if (!nl_parse_nested(nl_options, tca_hfsc_policy,
2892 attrs, ARRAY_SIZE(tca_hfsc_policy))) {
2893 VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
2897 rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
2898 fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
2899 usc = nl_attr_get(attrs[TCA_HFSC_USC]);
2901 if (rsc->m1 != 0 || rsc->d != 0 ||
2902 fsc->m1 != 0 || fsc->d != 0 ||
2903 usc->m1 != 0 || usc->d != 0) {
2904 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
2905 "Non-linear service curves are not supported.");
2909 if (rsc->m2 != fsc->m2) {
2910 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
2911 "Real-time service curves are not supported ");
2915 if (rsc->m2 > usc->m2) {
2916 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
2917 "Min-rate service curve is greater than "
2918 "the max-rate service curve.");
2922 class->min_rate = fsc->m2;
2923 class->max_rate = usc->m2;
/* Parses the class message 'tcmsg'.
 *
 * tc_parse_class() supplies the class handle, the options attribute and the
 * statistics.  A handle of the form 1:N with 1 <= N <= HFSC_N_QUEUES yields
 * queue ID N - 1 in '*queue_id', and the class options are decoded into
 * 'options' by hfsc_parse_tca_options__(). */
2928 hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2929 struct hfsc_class *options,
2930 struct netdev_queue_stats *stats)
2933 unsigned int handle;
2934 struct nlattr *nl_options;
2936 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2942 unsigned int major, minor;
2944 major = tc_get_major(handle);
2945 minor = tc_get_minor(handle);
2946 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
2947 *queue_id = minor - 1;
2954 error = hfsc_parse_tca_options__(nl_options, options);
/* Asks the kernel about the class identified by 'handle' and 'parent' on
 * 'netdev' using tc_query_class(), then parses the reply with
 * hfsc_parse_tcmsg__(), passing NULL for the queue ID and forwarding
 * 'options' and 'stats'.  The reply buffer is deleted after parsing. */
2961 hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
2962 unsigned int parent, struct hfsc_class *options,
2963 struct netdev_queue_stats *stats)
2966 struct ofpbuf *reply;
2968 error = tc_query_class(netdev, handle, parent, &reply);
2973 error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
2974 ofpbuf_delete(reply);
/* Fills in 'class' with the qdisc-wide HFSC rates from 'details'.
 *
 * "max-rate" is parsed as a decimal number and divided by 8 (hfsc_qdisc_get()
 * multiplies by 8 again when reporting it).  The rate can alternatively be
 * derived from the feature bits that netdev_get_features() stores in
 * 'current'.  Both min_rate and max_rate of 'class' are set to the resulting
 * value. */
2979 hfsc_parse_qdisc_details__(struct netdev *netdev, const struct shash *details,
2980 struct hfsc_class *class)
2983 const char *max_rate_s;
2985 max_rate_s = shash_find_data(details, "max-rate");
2986 max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
2991 netdev_get_features(netdev, &current, NULL, NULL, NULL);
2992 max_rate = netdev_features_to_bps(current) / 8;
2995 class->min_rate = max_rate;
2996 class->max_rate = max_rate;
/* Computes the rates of an HFSC class from 'details' and stores them in
 * 'class'.
 *
 * "min-rate", divided by 8, is raised to at least 1 and capped at the qdisc's
 * max_rate.  "max-rate", divided by 8, is kept between that minimum and the
 * qdisc's max_rate.  Both intermediate values are held in uint32_t. */
3000 hfsc_parse_class_details__(struct netdev *netdev,
3001 const struct shash *details,
3002 struct hfsc_class * class)
3004 const struct hfsc *hfsc;
3005 uint32_t min_rate, max_rate;
3006 const char *min_rate_s, *max_rate_s;
3008 hfsc = hfsc_get__(netdev);
3009 min_rate_s = shash_find_data(details, "min-rate");
3010 max_rate_s = shash_find_data(details, "max-rate");
3012 min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
3013 min_rate = MAX(min_rate, 1);
3014 min_rate = MIN(min_rate, hfsc->max_rate);
3016 max_rate = (max_rate_s
3017 ? strtoull(max_rate_s, NULL, 10) / 8
3019 max_rate = MAX(max_rate, min_rate);
3020 max_rate = MIN(max_rate, hfsc->max_rate);
3022 class->min_rate = min_rate;
3023 class->max_rate = max_rate;
3028 /* Create an HFSC qdisc.
3030 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
3032 hfsc_setup_qdisc__(struct netdev * netdev)
3034 struct tcmsg *tcmsg;
3035 struct ofpbuf request;
3036 struct tc_hfsc_qopt opt;
/* Remove the root qdisc that is currently installed before creating the new
 * one. */
3038 tc_del_qdisc(netdev);
3040 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3041 NLM_F_EXCL | NLM_F_CREATE, &request);
/* The new qdisc is the root qdisc and gets handle 1:0. */
3047 tcmsg->tcm_handle = tc_make_handle(1, 0);
3048 tcmsg->tcm_parent = TC_H_ROOT;
/* Start from zeroed qdisc options. */
3050 memset(&opt, 0, sizeof opt);
3053 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3054 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
3056 return tc_transact(&request, NULL);
3059 /* Create an HFSC class.
3061 * Equivalent to "tc class add dev <dev> parent <parent> classid <handle> hfsc
3062 * sc rate <min_rate> ul rate <max_rate>" */
3064 hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
3065 unsigned int parent, struct hfsc_class *class)
3069 struct tcmsg *tcmsg;
3070 struct ofpbuf request;
3071 struct tc_service_curve min, max;
/* NLM_F_CREATE without NLM_F_EXCL: the warning below speaks of "replacing"
 * the class. */
3073 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
3079 tcmsg->tcm_handle = handle;
3080 tcmsg->tcm_parent = parent;
/* The minimum rate is the slope of the curve sent as both RSC and FSC; the
 * maximum rate is the slope of the curve sent as USC. */
3084 min.m2 = class->min_rate;
3088 max.m2 = class->max_rate;
3090 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3091 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3092 nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
3093 nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
3094 nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
3095 nl_msg_end_nested(&request, opt_offset);
3097 error = tc_transact(&request, NULL);
3099 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
3100 "min-rate %ubps, max-rate %ubps (%s)",
3101 netdev_get_name(netdev),
3102 tc_get_major(handle), tc_get_minor(handle),
3103 tc_get_major(parent), tc_get_minor(parent),
3104 class->min_rate, class->max_rate, strerror(error));
/* Installs an HFSC qdisc on 'netdev': sets up the qdisc with
 * hfsc_setup_qdisc__(), derives the qdisc-wide class rates from 'details',
 * configures class 1:fffe under parent 1:0 with them, and records the
 * resulting max_rate through hfsc_install__(). */
3111 hfsc_tc_install(struct netdev *netdev, const struct shash *details)
3114 struct hfsc_class class;
3116 error = hfsc_setup_qdisc__(netdev);
3122 hfsc_parse_qdisc_details__(netdev, details, &class);
3123 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3124 tc_make_handle(1, 0), &class);
3130 hfsc_install__(netdev, class.max_rate);
/* Loads the state of an HFSC qdisc that already exists on 'netdev'.
 *
 * Class 1:fffe is queried for the qdisc-wide options and its max_rate is
 * handed to hfsc_install__().  The device's queues are then dumped, and every
 * message that hfsc_parse_tcmsg__() parses successfully is recorded with
 * hfsc_update_queue__().  'nlmsg' is unused. */
3135 hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3138 struct nl_dump dump;
3139 struct hfsc_class hc;
3142 hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
3143 hfsc_install__(netdev, hc.max_rate);
3145 if (!start_queue_dump(netdev, &dump)) {
3149 while (nl_dump_next(&dump, &msg)) {
3150 unsigned int queue_id;
3152 if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3153 hfsc_update_queue__(netdev, queue_id, &hc);
3157 nl_dump_done(&dump);
/* Removes every hfsc_class from the queue map of the hfsc that embeds 'tc'.
 * The "safe" iterator is used because entries are removed while the map is
 * being walked. */
3162 hfsc_tc_destroy(struct tc *tc)
3165 struct hfsc_class *hc, *next;
3167 hfsc = CONTAINER_OF(tc, struct hfsc, tc);
3169 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
3170 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
/* Adds the qdisc-wide "max-rate" of 'netdev' to 'details', formatted as 8
 * times the stored max_rate. */
3179 hfsc_qdisc_get(const struct netdev *netdev, struct shash *details)
3181 const struct hfsc *hfsc;
3182 hfsc = hfsc_get__(netdev);
3183 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hfsc->max_rate));
/* Reconfigures class 1:fffe (parent 1:0) on 'netdev' from the qdisc-wide
 * settings in 'details' and stores the new max_rate in the netdev's hfsc
 * state. */
3188 hfsc_qdisc_set(struct netdev *netdev, const struct shash *details)
3191 struct hfsc_class class;
3193 hfsc_parse_qdisc_details__(netdev, details, &class);
3194 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3195 tc_make_handle(1, 0), &class);
3198 hfsc_get__(netdev)->max_rate = class.max_rate;
/* Adds the rates of 'queue' to 'details', each formatted as 8 times the
 * stored value.  "max-rate" is emitted only when it differs from the minimum
 * rate.  'netdev' is unused. */
3205 hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
3206 const struct tc_queue *queue, struct shash *details)
3208 const struct hfsc_class *hc;
3210 hc = hfsc_class_cast__(queue);
3211 shash_add(details, "min-rate", xasprintf("%llu", 8ULL * hc->min_rate));
3212 if (hc->min_rate != hc->max_rate) {
3213 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hc->max_rate));
/* Configures queue 'queue_id' on 'netdev' from 'details': parses the rates,
 * sets up kernel class 1:(queue_id + 1) under parent 1:fffe with them, and
 * records them locally through hfsc_update_queue__(). */
3219 hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
3220 const struct shash *details)
3223 struct hfsc_class class;
3225 error = hfsc_parse_class_details__(netdev, details, &class);
3230 error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3231 tc_make_handle(1, 0xfffe), &class);
3236 hfsc_update_queue__(netdev, queue_id, &class);
/* Deletes kernel class 1:(queue_id + 1), the class that backs 'queue', and
 * removes the queue from the qdisc's queue map. */
3241 hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
3245 struct hfsc_class *hc;
3247 hc = hfsc_class_cast__(queue);
3248 hfsc = hfsc_get__(netdev);
3250 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3252 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
/* Fetches the statistics of 'queue' into 'stats' by querying kernel class
 * 1:(queue_id + 1) under parent 1:fffe.  No class options are requested. */
3259 hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3260 struct netdev_queue_stats *stats)
3262 return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3263 tc_make_handle(1, 0xfffe), NULL, stats);
/* Parses the class message 'nlmsg' and, when its handle has the form 1:N with
 * 1 <= N <= HFSC_N_QUEUES, invokes 'cb' with queue ID N - 1, the parsed
 * statistics and 'aux'.  'netdev' is unused. */
3267 hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3268 const struct ofpbuf *nlmsg,
3269 netdev_dump_queue_stats_cb *cb, void *aux)
3271 struct netdev_queue_stats stats;
3272 unsigned int handle, major, minor;
3275 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3280 major = tc_get_major(handle);
3281 minor = tc_get_minor(handle);
3282 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3283 (*cb)(minor - 1, &stats, aux);
/* Operations table that binds the "linux-hfsc" QoS type, whose kernel qdisc
 * name is "hfsc", to the hfsc_*() functions in this file. */
3288 static const struct tc_ops tc_ops_hfsc = {
3289 "hfsc", /* linux_name */
3290 "linux-hfsc", /* ovs_name */
3291 HFSC_N_QUEUES, /* n_queues */
3292 hfsc_tc_install, /* tc_install */
3293 hfsc_tc_load, /* tc_load */
3294 hfsc_tc_destroy, /* tc_destroy */
3295 hfsc_qdisc_get, /* qdisc_get */
3296 hfsc_qdisc_set, /* qdisc_set */
3297 hfsc_class_get, /* class_get */
3298 hfsc_class_set, /* class_set */
3299 hfsc_class_delete, /* class_delete */
3300 hfsc_class_get_stats, /* class_get_stats */
3301 hfsc_class_dump_stats /* class_dump_stats */
3304 /* "linux-default" traffic control class.
3306 * This class represents the default, unnamed Linux qdisc. It corresponds to
3307 * the "" (empty string) QoS type in the OVS database. */
/* Attaches the default-qdisc 'tc' to the Linux device underlying 'netdev'.
 *
 * The 'tc' pointer is a function-level static, so it persists across calls;
 * the object it points to is allocated with xmalloc() and initialized with
 * tc_ops_default. */
3310 default_install__(struct netdev *netdev)
3312 struct netdev_dev_linux *netdev_dev =
3313 netdev_dev_linux_cast(netdev_get_dev(netdev));
3314 static struct tc *tc;
3317 tc = xmalloc(sizeof *tc);
3318 tc_init(tc, &tc_ops_default);
3320 netdev_dev->tc = tc;
/* 'tc_install' hook for the default qdisc: delegates to default_install__().
 * 'details' is unused. */
3324 default_tc_install(struct netdev *netdev,
3325 const struct shash *details OVS_UNUSED)
3327 default_install__(netdev);
/* 'tc_load' hook for the default qdisc: delegates to default_install__().
 * 'nlmsg' is unused. */
3332 default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3334 default_install__(netdev);
/* Operations table for the default qdisc.  It has no kernel qdisc name, and
 * every hook listed below, from 'tc_destroy' to 'class_dump_stats', is
 * NULL. */
3338 static const struct tc_ops tc_ops_default = {
3339 NULL, /* linux_name */
3344 NULL, /* tc_destroy */
3345 NULL, /* qdisc_get */
3346 NULL, /* qdisc_set */
3347 NULL, /* class_get */
3348 NULL, /* class_set */
3349 NULL, /* class_delete */
3350 NULL, /* class_get_stats */
3351 NULL /* class_dump_stats */
3354 /* "linux-other" traffic control class.
/* 'tc_load' hook for "linux-other": attaches a 'tc' initialized with
 * tc_ops_other to the Linux device underlying 'netdev'.
 *
 * The 'tc' pointer is a function-level static, so it persists across calls.
 * 'nlmsg' is unused. */
3359 other_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3361 struct netdev_dev_linux *netdev_dev =
3362 netdev_dev_linux_cast(netdev_get_dev(netdev));
3363 static struct tc *tc;
3366 tc = xmalloc(sizeof *tc);
3367 tc_init(tc, &tc_ops_other);
3369 netdev_dev->tc = tc;
/* Operations table for the "linux-other" QoS type.  It has no kernel qdisc
 * name, and every hook listed below is NULL. */
3373 static const struct tc_ops tc_ops_other = {
3374 NULL, /* linux_name */
3375 "linux-other", /* ovs_name */
3377 NULL, /* tc_install */
3379 NULL, /* tc_destroy */
3380 NULL, /* qdisc_get */
3381 NULL, /* qdisc_set */
3382 NULL, /* class_get */
3383 NULL, /* class_set */
3384 NULL, /* class_delete */
3385 NULL, /* class_get_stats */
3386 NULL /* class_dump_stats */
3389 /* Traffic control. */
/* Computed from the first three psched parameters as (double) a * c / b. */
3391 /* Number of kernel "tc" ticks per second. */
3392 static double ticks_per_s;
3394 /* Number of kernel "jiffies" per second. This is used for the purpose of
3395 * computing buffer sizes. Generally kernel qdiscs need to be able to buffer
3396 * one jiffy's worth of data.
3398 * There are two possibilities here:
3400 * - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
3401 * approximate range of 100 to 1024. That means that we really need to
3402 * make sure that the qdisc can buffer that much data.
3404 * - 'buffer_hz' is an absurdly large number. That means that the kernel
3405 * has finely granular timers and there's no need to fudge additional room
3406 * for buffers. (There's no extra effort needed to implement that: the
3407 * large 'buffer_hz' is used as a divisor, so practically any number will
3408 * come out as 0 in the division. Small integer results in the case of
3409 * really high dividends won't have any real effect anyhow.)
3411 static unsigned int buffer_hz;
3413 /* Returns tc handle 'major':'minor'. */
3415 tc_make_handle(unsigned int major, unsigned int minor)
/* 'major' is shifted into the upper 16 bits before TC_H_MAKE() combines it
 * with 'minor'. */
3417 return TC_H_MAKE(major << 16, minor);
3420 /* Returns the major number from 'handle'. */
3422 tc_get_major(unsigned int handle)
/* The inverse of the shift in tc_make_handle(): bring the major field down
 * by 16 bits. */
3424 return TC_H_MAJ(handle) >> 16;
3427 /* Returns the minor number from 'handle'. */
3429 tc_get_minor(unsigned int handle)
/* No shift is applied here, unlike in tc_get_major(). */
3431 return TC_H_MIN(handle);
/* Starts a traffic-control Netlink request in 'request'.
 *
 * 'request' is initialized with a Netlink header of the given 'type' and
 * NLM_F_REQUEST | 'flags', followed by a zeroed struct tcmsg whose family is
 * AF_UNSPEC and whose ifindex is the one that get_ifindex() reports for
 * 'netdev'.  The return type is a pointer to struct tcmsg; callers assign
 * tcm_handle and tcm_parent through it. */
3434 static struct tcmsg *
3435 tc_make_request(const struct netdev *netdev, int type, unsigned int flags,
3436 struct ofpbuf *request)
3438 struct tcmsg *tcmsg;
3442 error = get_ifindex(netdev, &ifindex);
3447 ofpbuf_init(request, 512);
3448 nl_msg_put_nlmsghdr(request, sizeof *tcmsg, type, NLM_F_REQUEST | flags);
3449 tcmsg = ofpbuf_put_zeros(request, sizeof *tcmsg);
3450 tcmsg->tcm_family = AF_UNSPEC;
3451 tcmsg->tcm_ifindex = ifindex;
3452 /* Caller should fill in tcmsg->tcm_handle. */
3453 /* Caller should fill in tcmsg->tcm_parent. */
/* Sends 'request' on 'rtnl_sock' with nl_sock_transact(), passing 'replyp'
 * through for the reply, and then uninitializes 'request' whatever the
 * outcome of the transaction. */
3459 tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
3461 int error = nl_sock_transact(rtnl_sock, request, replyp);
3462 ofpbuf_uninit(request);
3466 /* Adds or deletes a root ingress qdisc on 'netdev'. We use this for
3467 * policing configuration.
3469 * This function is equivalent to running the following when 'add' is true:
3470 * /sbin/tc qdisc add dev <devname> handle ffff: ingress
3472 * This function is equivalent to running the following when 'add' is false:
3473 * /sbin/tc qdisc del dev <devname> handle ffff: ingress
3475 * The configuration and stats may be seen with the following command:
3476 * /sbin/tc -s qdisc show dev <devname>
3478 * Returns 0 if successful, otherwise a positive errno value.
/* Builds an RTM_NEWQDISC or RTM_DELQDISC request for the "ingress" qdisc
 * (handle ffff:0, parent TC_H_INGRESS) with an empty TCA_OPTIONS attribute
 * and sends it with tc_transact(). */
3481 tc_add_del_ingress_qdisc(struct netdev *netdev, bool add)
3483 struct ofpbuf request;
3484 struct tcmsg *tcmsg;
/* Adding uses NLM_F_EXCL | NLM_F_CREATE; deleting needs no extra flags. */
3486 int type = add ? RTM_NEWQDISC : RTM_DELQDISC;
3487 int flags = add ? NLM_F_EXCL | NLM_F_CREATE : 0;
3489 tcmsg = tc_make_request(netdev, type, flags, &request);
3493 tcmsg->tcm_handle = tc_make_handle(0xffff, 0);
3494 tcmsg->tcm_parent = TC_H_INGRESS;
3495 nl_msg_put_string(&request, TCA_KIND, "ingress");
3496 nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0);
3498 error = tc_transact(&request, NULL);
3500 /* If we're deleting the qdisc, don't worry about some of the
3501 * error conditions. */
3502 if (!add && (error == ENOENT || error == EINVAL)) {
3511 /* Adds a policer to 'netdev' with a rate of 'kbits_rate' and a burst size
3514 * This function is equivalent to running:
3515 * /sbin/tc filter add dev <devname> parent ffff: protocol all prio 49
3516 * basic police rate <kbits_rate>kbit burst <kbits_burst>k
3519 * The configuration and stats may be seen with the following command:
3520 * /sbin/tc -s filter show <devname> eth0 parent ffff:
3522 * Returns 0 if successful, otherwise a positive errno value.
3525 tc_add_policer(struct netdev *netdev, int kbits_rate, int kbits_burst)
3527 struct tc_police tc_police;
3528 struct ofpbuf request;
3529 struct tcmsg *tcmsg;
3530 size_t basic_offset;
3531 size_t police_offset;
3535 memset(&tc_police, 0, sizeof tc_police);
3536 tc_police.action = TC_POLICE_SHOT;
3537 tc_police.mtu = mtu;
3538 tc_fill_rate(&tc_police.rate, kbits_rate/8 * 1000, mtu);
3539 tc_police.burst = tc_bytes_to_ticks(tc_police.rate.rate,
3540 kbits_burst * 1024);
3542 tcmsg = tc_make_request(netdev, RTM_NEWTFILTER,
3543 NLM_F_EXCL | NLM_F_CREATE, &request);
3547 tcmsg->tcm_parent = tc_make_handle(0xffff, 0);
3548 tcmsg->tcm_info = tc_make_handle(49,
3549 (OVS_FORCE uint16_t) htons(ETH_P_ALL));
3551 nl_msg_put_string(&request, TCA_KIND, "basic");
3552 basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3553 police_offset = nl_msg_start_nested(&request, TCA_BASIC_POLICE);
3554 nl_msg_put_unspec(&request, TCA_POLICE_TBF, &tc_police, sizeof tc_police);
3555 tc_put_rtab(&request, TCA_POLICE_RATE, &tc_police.rate);
3556 nl_msg_end_nested(&request, police_offset);
3557 nl_msg_end_nested(&request, basic_offset);
3559 error = tc_transact(&request, NULL);
3570 /* The values in psched are not individually very meaningful, but they are
3571 * important. The tables below show some values seen in the wild.
3575 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
3576 * (Before that, there are hints that it was 1000000000.)
3578 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
3582 * -----------------------------------
3583 * [1] 000c8000 000f4240 000f4240 00000064
3584 * [2] 000003e8 00000400 000f4240 3b9aca00
3585 * [3] 000003e8 00000400 000f4240 3b9aca00
3586 * [4] 000003e8 00000400 000f4240 00000064
3587 * [5] 000003e8 00000040 000f4240 3b9aca00
3588 * [6] 000003e8 00000040 000f4240 000000f9
3590 * a b c d ticks_per_s buffer_hz
3591 * ------- --------- ---------- ------------- ----------- -------------
3592 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
3593 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3594 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3595 * [4] 1,000 1,024 1,000,000 100 976,562 100
3596 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
3597 * [6] 1,000 64 1,000,000 249 15,625,000 249
3599 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
3600 * [2] 2.6.26-1-686-bigmem from Debian lenny
3601 * [3] 2.6.26-2-sparc64 from Debian lenny
3602 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
3603 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
3604 * [6] 2.6.34 from kernel.org on KVM
3606 static const char fn[] = "/proc/net/psched";
3607 unsigned int a, b, c, d;
3613 stream = fopen(fn, "r");
3615 VLOG_WARN("%s: open failed: %s", fn, strerror(errno));
3619 if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
3620 VLOG_WARN("%s: read failed", fn);
3624 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
3628 VLOG_WARN("%s: invalid scheduler parameters", fn);
3632 ticks_per_s = (double) a * c / b;
3636 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
3639 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);
3642 /* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
3643 * rate of 'rate' bytes per second. */
3645 tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
/* Widen to double before multiplying: 'rate' * 'ticks' evaluated in unsigned
 * int arithmetic wraps around as soon as the product exceeds UINT_MAX, which
 * would corrupt the result before the division by 'ticks_per_s'. */
3650 return ((double) rate * ticks) / ticks_per_s;
3653 /* Returns the number of ticks that it would take to transmit 'size' bytes at a
3654 * rate of 'rate' bytes per second. */
3656 tc_bytes_to_ticks(unsigned int rate, unsigned int size)
/* 'ticks_per_s' is converted to unsigned long long before the multiplication,
 * so the product is formed in that type.  A 'rate' of zero yields 0 instead
 * of dividing by zero. */
3661 return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
3664 /* Returns the number of bytes that need to be reserved for qdisc buffering at
3665 * a transmission rate of 'rate' bytes per second. */
3667 tc_buffer_per_jiffy(unsigned int rate)
/* Integer division by 'buffer_hz'; the comment on that variable explains why
 * a very large divisor is acceptable here. */
3672 return rate / buffer_hz;
3675 /* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
3676 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
3677 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
3678 * stores NULL into it if it is absent.
3680 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
3683 * Returns 0 if successful, otherwise a positive errno value. */
3685 tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
3686 struct nlattr **options)
/* TCA_KIND is mandatory; TCA_OPTIONS may be absent. */
3688 static const struct nl_policy tca_policy[] = {
3689 [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
3690 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
3692 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
/* The attributes start after the Netlink header and the struct tcmsg. */
3694 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3695 tca_policy, ta, ARRAY_SIZE(ta))) {
3696 VLOG_WARN_RL(&rl, "failed to parse qdisc message");
3701 *kind = nl_attr_get_string(ta[TCA_KIND]);
3705 *options = ta[TCA_OPTIONS];
3720 /* Given Netlink 'msg' that describes a class, extracts its class handle
3721 * (callers derive the queue ID from the handle's minor number) into
3722 * '*handlep', its TCA_OPTIONS attribute into '*options', and its queue
3723 * statistics into '*stats'. Any of the output arguments may be null.
3725 * Returns 0 if successful, otherwise a positive errno value. */
3727 tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
3728 struct nlattr **options, struct netdev_queue_stats *stats)
/* Unlike in a qdisc message, both TCA_OPTIONS and TCA_STATS2 are required. */
3730 static const struct nl_policy tca_policy[] = {
3731 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
3732 [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
3734 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3736 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3737 tca_policy, ta, ARRAY_SIZE(ta))) {
3738 VLOG_WARN_RL(&rl, "failed to parse class message");
/* The handle is read from the struct tcmsg that follows the Netlink header. */
3743 struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
3744 *handlep = tc->tcm_handle;
3748 *options = ta[TCA_OPTIONS];
3752 const struct gnet_stats_queue *gsq;
3753 struct gnet_stats_basic gsb;
3755 static const struct nl_policy stats_policy[] = {
3756 [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
3757 .min_len = sizeof gsb },
3758 [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
3759 .min_len = sizeof *gsq },
3761 struct nlattr *sa[ARRAY_SIZE(stats_policy)];
3763 if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
3764 sa, ARRAY_SIZE(sa))) {
3765 VLOG_WARN_RL(&rl, "failed to parse class stats");
3769 /* Alignment issues screw up the length of struct gnet_stats_basic on
3770 * some arch/bitsize combinations. Newer versions of Linux have a
3771 * struct gnet_stats_basic_packed, but we can't depend on that. The
3772 * easiest thing to do is just to make a copy. */
3773 memset(&gsb, 0, sizeof gsb);
3774 memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
3775 MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
3776 stats->tx_bytes = gsb.bytes;
3777 stats->tx_packets = gsb.packets;
3779 gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
3780 stats->tx_errors = gsq->drops;
3790 memset(stats, 0, sizeof *stats);
3795 /* Queries the kernel for class with identifier 'handle' and parent 'parent'
/* Sends an RTM_GETTCLASS request with NLM_F_ECHO for class 'handle' under
 * 'parent' on 'netdev'; the reply is passed back through 'replyp' by
 * tc_transact().  The rate-limited warning below names the device, the class
 * and the parent of a failed query. */
3798 tc_query_class(const struct netdev *netdev,
3799 unsigned int handle, unsigned int parent,
3800 struct ofpbuf **replyp)
3802 struct ofpbuf request;
3803 struct tcmsg *tcmsg;
3806 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO, &request);
3810 tcmsg->tcm_handle = handle;
3811 tcmsg->tcm_parent = parent;
3813 error = tc_transact(&request, replyp);
3815 VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
3816 netdev_get_name(netdev),
3817 tc_get_major(handle), tc_get_minor(handle),
3818 tc_get_major(parent), tc_get_minor(parent),
3824 /* Equivalent to "tc class del dev <name> handle <handle>". */
3826 tc_delete_class(const struct netdev *netdev, unsigned int handle)
3828 struct ofpbuf request;
3829 struct tcmsg *tcmsg;
3832 tcmsg = tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
/* Only the handle identifies the class; the parent field is left at 0. */
3836 tcmsg->tcm_handle = handle;
3837 tcmsg->tcm_parent = 0;
3839 error = tc_transact(&request, NULL);
3841 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
3842 netdev_get_name(netdev),
3843 tc_get_major(handle), tc_get_minor(handle),
3849 /* Equivalent to "tc qdisc del dev <name> root". */
3851 tc_del_qdisc(struct netdev *netdev)
3853 struct netdev_dev_linux *netdev_dev =
3854 netdev_dev_linux_cast(netdev_get_dev(netdev));
3855 struct ofpbuf request;
3856 struct tcmsg *tcmsg;
3859 tcmsg = tc_make_request(netdev, RTM_DELQDISC, 0, &request);
/* Target the root qdisc, which carries handle 1:0. */
3863 tcmsg->tcm_handle = tc_make_handle(1, 0);
3864 tcmsg->tcm_parent = TC_H_ROOT;
3866 error = tc_transact(&request, NULL);
3867 if (error == EINVAL) {
3868 /* EINVAL probably means that the default qdisc was in use, in which
3869 * case we've accomplished our purpose. */
/* With the kernel qdisc gone, drop the cached 'tc' state as well, running
 * the type-specific destructor when the ops table provides one. */
3872 if (!error && netdev_dev->tc) {
3873 if (netdev_dev->tc->ops->tc_destroy) {
3874 netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
3876 netdev_dev->tc = NULL;
3881 /* If 'netdev''s qdisc type and parameters are not yet known, queries the
3882 * kernel to determine what they are. Returns 0 if successful, otherwise a
3883 * positive errno value. */
3885 tc_query_qdisc(const struct netdev *netdev)
3887 struct netdev_dev_linux *netdev_dev =
3888 netdev_dev_linux_cast(netdev_get_dev(netdev));
3889 struct ofpbuf request, *qdisc;
3890 const struct tc_ops *ops;
3891 struct tcmsg *tcmsg;
/* A 'tc' that is already attached means the qdisc type is known. */
3895 if (netdev_dev->tc) {
3899 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
3900 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
3901 * 2.6.35 without that fix backported to it.
3903 * To avoid the OOPS, we must not make a request that would attempt to dump
3904 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
3905 * few others. There are a few ways that I can see to do this, but most of
3906 * them seem to be racy (and if you lose the race the kernel OOPSes). The
3907 * technique chosen here is to assume that any non-default qdisc that we
3908 * create will have a class with handle 1:0. The built-in qdiscs only have
3909 * a class with handle 0:0.
3911 * We could check for Linux 2.6.35+ and use a more straightforward method
3913 tcmsg = tc_make_request(netdev, RTM_GETQDISC, NLM_F_ECHO, &request);
3917 tcmsg->tcm_handle = tc_make_handle(1, 0);
3918 tcmsg->tcm_parent = 0;
3920 /* Figure out what tc class to instantiate. */
3921 error = tc_transact(&request, &qdisc);
3925 error = tc_parse_qdisc(qdisc, &kind, NULL);
3927 ops = &tc_ops_other;
3929 ops = tc_lookup_linux_name(kind);
3931 static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
3932 VLOG_INFO_RL(&rl2, "unknown qdisc \"%s\"", kind);
3934 ops = &tc_ops_other;
3937 } else if (error == ENOENT) {
3938 /* Either it's a built-in qdisc, or it's a qdisc set up by some
3939 * other entity that doesn't have a handle 1:0. We will assume
3940 * that it's the system default qdisc. */
3941 ops = &tc_ops_default;
3944 /* Who knows? Maybe the device got deleted. */
3945 VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
3946 netdev_get_name(netdev), strerror(error));
3947 ops = &tc_ops_other;
3950 /* Instantiate it. */
3951 load_error = ops->tc_load((struct netdev *) netdev, qdisc);
/* The load hook must attach a 'tc' to the device exactly when it reports
 * success. */
3952 assert((load_error == 0) == (netdev_dev->tc != NULL));
3953 ofpbuf_delete(qdisc);
/* An error from the query itself takes precedence over one from loading. */
3955 return error ? error : load_error;
3958 /* Linux traffic control uses tables with 256 entries ("rtab" tables) to
3959 approximate the time to transmit packets of various lengths. For an MTU of
3960 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
3961 represents two possible packet lengths; for a MTU of 513 through 1024, four
3962 possible lengths; and so on.
3964 Returns, for the specified 'mtu', the number of bits that packet lengths
3965 need to be shifted right to fit within such a 256-entry table. */
/* NOTE(review): the return type, the declaration of 'cell_log', the loop body
 * and the return statement are not present in this copy. */
3967 tc_calc_cell_log(unsigned int mtu)
/* Substitutes ETH_PAYLOAD_MAX for 'mtu'.  The condition guarding this
 * assignment is not visible; presumably it applies only when the caller
 * supplied no MTU -- confirm. */
3972 mtu = ETH_PAYLOAD_MAX;
/* Add Ethernet and VLAN header lengths on top of 'mtu'. */
3974 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
/* Counts iterations until 'mtu' drops below 256.  The loop body is not
 * visible; presumably it shifts 'mtu' right by one bit per pass. */
3976 for (cell_log = 0; mtu >= 256; cell_log++) {
3983 /* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
3986 tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
3988 memset(rate, 0, sizeof *rate);
3989 rate->cell_log = tc_calc_cell_log(mtu);
3990 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
3991 /* rate->cell_align = 0; */ /* distro headers. */
3992 rate->mpu = ETH_TOTAL_MIN;
3996 /* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
3997 * attribute of the specified "type".
3999 * See tc_calc_cell_log() above for a description of "rtab"s. */
4001 tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
4006 rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
4007 for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
4008 unsigned packet_size = (i + 1) << rate->cell_log;
4009 if (packet_size < rate->mpu) {
4010 packet_size = rate->mpu;
4012 rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
/* Calculates the proper value of 'buffer' or 'cbuffer' in HTB options given a
 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
 * burst size of 'burst_bytes'.  (If no value was requested, a 'burst_bytes' of
 * 0 is fine: the result is then based on the minimum burst computed below.)
 *
 * The burst actually used is the larger of 'burst_bytes' and
 * tc_buffer_per_jiffy(Bps) + 'mtu'; it is converted with tc_bytes_to_ticks().
 *
 * NOTE(review): the return type line was absent from the copy this was
 * restored from; 'int' is assumed -- confirm against the prototype. */
static int
tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
{
    unsigned int min_burst = tc_buffer_per_jiffy(Bps) + mtu;
    return tc_bytes_to_ticks(Bps, MAX(burst_bytes, min_burst));
}
4027 /* Linux-only functions declared in netdev-linux.h */
4029 /* Returns a fd for an AF_INET socket or a negative errno value. */
4031 netdev_linux_get_af_inet_sock(void)
4033 int error = netdev_linux_init();
4034 return error ? -error : af_inet_sock;
4037 /* Modifies the 'flag' bit in ethtool's flags field for 'netdev'. If
4038 * 'enable' is true, the bit is set. Otherwise, it is cleared. */
/* Visible sequence: read the current flags (ETHTOOL_GFLAGS), write them back
 * with 'flag' set or cleared (ETHTOOL_SFLAGS), read them again, and warn if
 * the value read back differs from the value written.  'flag_name' is used
 * only in that warning.
 *
 * NOTE(review): the return type, the declarations of 'error' and 'new_flags',
 * and the error checks/returns between the three calls are not present in
 * this copy. */
4040 netdev_linux_ethtool_set_flag(struct netdev *netdev, uint32_t flag,
4041 const char *flag_name, bool enable)
4043 const char *netdev_name = netdev_get_name(netdev);
4044 struct ethtool_value evalue;
/* Step 1: fetch the current flags.  The cast is needed because
 * netdev_linux_do_ethtool() takes a 'struct ethtool_cmd *'. */
4048 memset(&evalue, 0, sizeof evalue);
4049 error = netdev_linux_do_ethtool(netdev_name,
4050 (struct ethtool_cmd *)&evalue,
4051 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
/* Step 2: clear 'flag', set it again if 'enable', remember the intended value
 * in 'new_flags', and write it back. */
4056 evalue.data = new_flags = (evalue.data & ~flag) | (enable ? flag : 0);
4057 error = netdev_linux_do_ethtool(netdev_name,
4058 (struct ethtool_cmd *)&evalue,
4059 ETHTOOL_SFLAGS, "ETHTOOL_SFLAGS");
/* Step 3: re-read the flags so the result can be compared with 'new_flags'. */
4064 memset(&evalue, 0, sizeof evalue);
4065 error = netdev_linux_do_ethtool(netdev_name,
4066 (struct ethtool_cmd *)&evalue,
4067 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
/* The device reports different flags from the ones just written. */
4072 if (new_flags != evalue.data) {
4073 VLOG_WARN_RL(&rl, "attempt to %s ethtool %s flag on network "
4074 "device %s failed", enable ? "enable" : "disable",
4075 flag_name, netdev_name);
4082 /* Utility functions. */
4084 /* Copies 'src' into 'dst', performing format conversion in the process. */
4086 netdev_stats_from_rtnl_link_stats(struct netdev_stats *dst,
4087 const struct rtnl_link_stats *src)
4089 dst->rx_packets = src->rx_packets;
4090 dst->tx_packets = src->tx_packets;
4091 dst->rx_bytes = src->rx_bytes;
4092 dst->tx_bytes = src->tx_bytes;
4093 dst->rx_errors = src->rx_errors;
4094 dst->tx_errors = src->tx_errors;
4095 dst->rx_dropped = src->rx_dropped;
4096 dst->tx_dropped = src->tx_dropped;
4097 dst->multicast = src->multicast;
4098 dst->collisions = src->collisions;
4099 dst->rx_length_errors = src->rx_length_errors;
4100 dst->rx_over_errors = src->rx_over_errors;
4101 dst->rx_crc_errors = src->rx_crc_errors;
4102 dst->rx_frame_errors = src->rx_frame_errors;
4103 dst->rx_fifo_errors = src->rx_fifo_errors;
4104 dst->rx_missed_errors = src->rx_missed_errors;
4105 dst->tx_aborted_errors = src->tx_aborted_errors;
4106 dst->tx_carrier_errors = src->tx_carrier_errors;
4107 dst->tx_fifo_errors = src->tx_fifo_errors;
4108 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
4109 dst->tx_window_errors = src->tx_window_errors;
/* Fetches statistics for the interface with index 'ifindex' by sending an
 * RTM_GETLINK request on 'rtnl_sock' and converting the IFLA_STATS attribute
 * of the reply into '*stats'.
 *
 * NOTE(review): the return type, the declaration of 'error', the check after
 * the transaction, and the return statements on each path are not present in
 * this copy; the error values returned cannot be stated from here. */
4113 get_stats_via_netlink(int ifindex, struct netdev_stats *stats)
4115 /* Policy for RTNLGRP_LINK messages.
4117 * There are *many* more fields in these messages, but currently we only
4118 * care about these fields. */
4119 static const struct nl_policy rtnlgrp_link_policy[] = {
4120 [IFLA_IFNAME] = { .type = NL_A_STRING, .optional = false },
4121 [IFLA_STATS] = { .type = NL_A_UNSPEC, .optional = true,
4122 .min_len = sizeof(struct rtnl_link_stats) },
4125 struct ofpbuf request;
4126 struct ofpbuf *reply;
4127 struct ifinfomsg *ifi;
4128 struct nlattr *attrs[ARRAY_SIZE(rtnlgrp_link_policy)];
/* Build the request: a netlink header followed by a zeroed ifinfomsg that
 * names the interface by index. */
4131 ofpbuf_init(&request, 0);
4132 nl_msg_put_nlmsghdr(&request, sizeof *ifi, RTM_GETLINK, NLM_F_REQUEST);
4133 ifi = ofpbuf_put_zeros(&request, sizeof *ifi);
4134 ifi->ifi_family = PF_UNSPEC;
4135 ifi->ifi_index = ifindex;
/* The request buffer is released as soon as the transaction is done. */
4136 error = nl_sock_transact(rtnl_sock, &request, &reply);
4137 ofpbuf_uninit(&request);
/* Parse the attributes that follow the netlink and ifinfomsg headers; the
 * reply is deleted when parsing fails. */
4142 if (!nl_policy_parse(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg),
4143 rtnlgrp_link_policy,
4144 attrs, ARRAY_SIZE(rtnlgrp_link_policy))) {
4145 ofpbuf_delete(reply);
/* IFLA_STATS is optional in the policy, so its presence is checked here. */
4149 if (!attrs[IFLA_STATS]) {
4150 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
4151 ofpbuf_delete(reply);
/* Convert the attribute payload, then delete the reply it points into. */
4155 netdev_stats_from_rtnl_link_stats(stats, nl_attr_get(attrs[IFLA_STATS]));
4157 ofpbuf_delete(reply);
/* Looks up statistics for the device named 'netdev_name' in /proc/net/dev,
 * reading the file line by line and parsing counters into '*stats'.
 *
 * NOTE(review): the return type, several local declarations ('stream',
 * 'line', 'ln', 'devname'), the scan call itself and most of its arguments,
 * and the return/close statements are not present in this copy. */
4163 get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats)
4165 static const char fn[] = "/proc/net/dev";
4170 stream = fopen(fn, "r");
4172 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
4177 while (fgets(line, sizeof line, stream)) {
/* Conversion for one 64-bit unsigned counter.  Each of the two format rows
 * below has seven such counters followed by one skipped column ("%*u"). */
4180 #define X64 "%"SCNu64
4183 X64 X64 X64 X64 X64 X64 X64 "%*u"
4184 X64 X64 X64 X64 X64 X64 X64 "%*u",
4190 &stats->rx_fifo_errors,
4191 &stats->rx_frame_errors,
4197 &stats->tx_fifo_errors,
/* 15 successful conversions are required; presumably the device name plus
 * the 14 counters -- confirm against the full format string. */
4199 &stats->tx_carrier_errors) != 15) {
4200 VLOG_WARN_RL(&rl, "%s:%d: parse error", fn, ln);
4201 } else if (!strcmp(devname, netdev_name)) {
/* For the matching device these counters are set to UINT64_MAX; presumably
 * this marks them as unavailable from this source -- confirm. */
4202 stats->rx_length_errors = UINT64_MAX;
4203 stats->rx_over_errors = UINT64_MAX;
4204 stats->rx_crc_errors = UINT64_MAX;
4205 stats->rx_missed_errors = UINT64_MAX;
4206 stats->tx_aborted_errors = UINT64_MAX;
4207 stats->tx_heartbeat_errors = UINT64_MAX;
4208 stats->tx_window_errors = UINT64_MAX;
/* Reached when no line matched 'netdev_name' (presumably after the loop). */
4214 VLOG_WARN_RL(&rl, "%s: no stats for %s", fn, netdev_name);
/* Reads the carrier state of the device named 'name' from
 * /sys/class/net/<name>/carrier and stores it in '*carrier': false when the
 * first byte read is '0', true when it is '1'.
 *
 * NOTE(review): the return type, the local declarations ('line', 'retval',
 * 'error', 'fn', 'fd'), the conditions around the open/read failures, and
 * the exit path are not present in this copy.  In particular, neither the
 * release of 'fn' nor the close of 'fd' is visible; confirm both happen on
 * every exit path. */
4220 get_carrier_via_sysfs(const char *name, bool *carrier)
4231 fn = xasprintf("/sys/class/net/%s/carrier", name);
4232 fd = open(fn, O_RDONLY);
4235 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(error));
4239 retval = read(fd, line, sizeof line);
4242 if (error == EINVAL) {
4243 /* This is the normal return value when we try to check carrier if
4244 * the network device is not up. */
4246 VLOG_WARN_RL(&rl, "%s: read failed: %s", fn, strerror(error));
/* A zero-length read means the file was empty. */
4249 } else if (retval == 0) {
4251 VLOG_WARN_RL(&rl, "%s: unexpected end of file", fn);
/* Only the first byte is inspected, and it must be '0' or '1'. */
4255 if (line[0] != '0' && line[0] != '1') {
4257 VLOG_WARN_RL(&rl, "%s: value is %c (expected 0 or 1)", fn, line[0]);
4260 *carrier = line[0] != '0';
4272 get_flags(const struct netdev *netdev, int *flags)
4277 error = netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, SIOCGIFFLAGS,
4279 *flags = ifr.ifr_flags;
4284 set_flags(struct netdev *netdev, int flags)
4288 ifr.ifr_flags = flags;
4289 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, SIOCSIFFLAGS,
/* Asks the kernel, via the SIOCGIFINDEX ioctl on 'af_inet_sock', for the
 * interface index of the device named 'netdev_name' and returns it.
 *
 * NOTE(review): the return type, the declaration of 'ifr', and the value
 * returned when the ioctl fails are not present in this copy. */
4294 do_get_ifindex(const char *netdev_name)
/* The name is copied with ovs_strzcpy(), bounded by the size of ifr_name. */
4298 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4299 COVERAGE_INC(netdev_get_ifindex);
4300 if (ioctl(af_inet_sock, SIOCGIFINDEX, &ifr) < 0) {
4301 VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
4302 netdev_name, strerror(errno));
4305 return ifr.ifr_ifindex;
/* Stores the interface index of 'netdev_' in '*ifindexp', using the value
 * cached in the device when the VALID_IFINDEX bit of 'cache_valid' is set and
 * otherwise querying it with do_get_ifindex() and caching the result.
 *
 * NOTE(review): the return type, the handling of a failed
 * do_get_ifindex() call, and the return statements are not present in this
 * copy. */
4309 get_ifindex(const struct netdev *netdev_, int *ifindexp)
4311 struct netdev_dev_linux *netdev_dev =
4312 netdev_dev_linux_cast(netdev_get_dev(netdev_));
/* Cache miss: look the index up by device name. */
4314 if (!(netdev_dev->cache_valid & VALID_IFINDEX)) {
4315 int ifindex = do_get_ifindex(netdev_get_name(netdev_));
/* Mark the cached index valid and store it. */
4319 netdev_dev->cache_valid |= VALID_IFINDEX;
4320 netdev_dev->ifindex = ifindex;
4322 *ifindexp = netdev_dev->ifindex;
/* Reads the hardware address of the device named 'netdev_name' with the
 * SIOCGIFHWADDR ioctl and copies its first ETH_ADDR_LEN bytes into 'ea'.
 *
 * NOTE(review): the return type, the declarations of 'ifr' and
 * 'hwaddr_family', and the return statements are not present in this copy. */
4327 get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN])
4332 memset(&ifr, 0, sizeof ifr);
4333 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4334 COVERAGE_INC(netdev_get_hwaddr);
4335 if (ioctl(af_inet_sock, SIOCGIFHWADDR, &ifr) < 0) {
4336 /* ENODEV probably means that a vif disappeared asynchronously and
4337 * hasn't been removed from the database yet, so reduce the log level
4338 * to INFO for that case. */
4339 VLOG(errno == ENODEV ? VLL_INFO : VLL_ERR,
4340 "ioctl(SIOCGIFHWADDR) on %s device failed: %s",
4341 netdev_name, strerror(errno));
/* Families other than AF_UNSPEC and ARPHRD_ETHER only trigger a warning; the
 * address copy below appears to happen regardless -- confirm. */
4344 hwaddr_family = ifr.ifr_hwaddr.sa_family;
4345 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER) {
4346 VLOG_WARN("%s device has unknown hardware address family %d",
4347 netdev_name, hwaddr_family);
4349 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
/* Sets the hardware address of the device named 'netdev_name' to 'mac', using
 * address family 'hwaddr_family', with the SIOCSIFHWADDR ioctl.  A failure is
 * logged at error level.
 *
 * NOTE(review): the return type, the declaration of 'ifr', and the return
 * statements are not present in this copy. */
4354 set_etheraddr(const char *netdev_name, int hwaddr_family,
4355 const uint8_t mac[ETH_ADDR_LEN])
/* Build the request: zeroed ifreq, device name, family, then address bytes. */
4359 memset(&ifr, 0, sizeof ifr);
4360 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4361 ifr.ifr_hwaddr.sa_family = hwaddr_family;
4362 memcpy(ifr.ifr_hwaddr.sa_data, mac, ETH_ADDR_LEN);
4363 COVERAGE_INC(netdev_set_hwaddr);
4364 if (ioctl(af_inet_sock, SIOCSIFHWADDR, &ifr) < 0) {
4365 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
4366 netdev_name, strerror(errno));
/* Issues a SIOCETHTOOL ioctl on 'af_inet_sock' for the device named 'name',
 * passing 'ecmd' as the request data.  'cmd_name' is used only in the log
 * message emitted on failure.
 *
 * NOTE(review): the return type, the declaration of 'ifr', the return
 * statements, and the line(s) between setting 'ifr_data' and the ioctl are
 * not present in this copy.  How 'cmd' is applied to 'ecmd' is therefore not
 * visible -- confirm. */
4373 netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
4374 int cmd, const char *cmd_name)
4378 memset(&ifr, 0, sizeof ifr);
4379 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
4380 ifr.ifr_data = (caddr_t) ecmd;
4383 COVERAGE_INC(netdev_ethtool);
4384 if (ioctl(af_inet_sock, SIOCETHTOOL, &ifr) == 0) {
/* Failures other than EOPNOTSUPP are logged (rate-limited). */
4387 if (errno != EOPNOTSUPP) {
4388 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
4389 "failed: %s", cmd_name, name, strerror(errno));
4391 /* The device doesn't support this operation. That's pretty
4392 * common, so there's no point in logging anything. */
/* Copies 'name' into 'ifr->ifr_name' and issues ioctl 'cmd' on 'af_inet_sock'
 * with 'ifr' as its argument.  A failure is logged at debug level using
 * 'cmd_name'.  The other members of '*ifr' are neither initialized nor
 * cleared here, so callers must set up whatever 'cmd' reads.
 *
 * NOTE(review): the return type, the remainder of the log call, and the
 * return statements are not present in this copy. */
4399 netdev_linux_do_ioctl(const char *name, struct ifreq *ifr, int cmd,
4400 const char *cmd_name)
4402 ovs_strzcpy(ifr->ifr_name, name, sizeof ifr->ifr_name);
4403 if (ioctl(af_inet_sock, cmd, ifr) == -1) {
4404 VLOG_DBG_RL(&rl, "%s: ioctl(%s) failed: %s", name, cmd_name,
/* Runs ioctl 'cmd' (named 'cmd_name' for logging) on 'netdev' through
 * netdev_linux_do_ioctl(), with the request's address family set to AF_INET,
 * and stores the IPv4 address found in the reply's 'ifr_addr' in '*ip'.
 *
 * NOTE(review): the return type, the declarations of 'ifr' and 'error', the
 * condition guarding the copy into '*ip' (presumably success of the ioctl),
 * and the return statement are not present in this copy.  No zeroing of 'ifr'
 * is visible either -- confirm. */
4412 netdev_linux_get_ipv4(const struct netdev *netdev, struct in_addr *ip,
4413 int cmd, const char *cmd_name)
4418 ifr.ifr_addr.sa_family = AF_INET;
4419 error = netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, cmd, cmd_name);
/* View the generic sockaddr as a sockaddr_in to reach the IPv4 address. */
4421 const struct sockaddr_in *sin = (struct sockaddr_in *) &ifr.ifr_addr;
4422 *ip = sin->sin_addr;
4427 /* Returns an AF_PACKET raw socket or a negative errno value. */
4429 af_packet_sock(void)
4431 static int sock = INT_MIN;
4433 if (sock == INT_MIN) {
4434 sock = socket(AF_PACKET, SOCK_RAW, 0);
4436 set_nonblocking(sock);
4439 VLOG_ERR("failed to create packet socket: %s", strerror(errno));