2 * Copyright (c) 2009, 2010, 2011 Nicira Networks.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
19 #include "netdev-linux.h"
24 #include <arpa/inet.h>
26 #include <linux/gen_stats.h>
27 #include <linux/if_ether.h>
28 #include <linux/if_tun.h>
30 #include <linux/types.h>
31 #include <linux/ethtool.h>
32 #include <linux/mii.h>
33 #include <linux/pkt_cls.h>
34 #include <linux/pkt_sched.h>
35 #include <linux/rtnetlink.h>
36 #include <linux/sockios.h>
37 #include <linux/version.h>
38 #include <sys/types.h>
39 #include <sys/ioctl.h>
40 #include <sys/socket.h>
41 #include <netpacket/packet.h>
43 #include <net/if_arp.h>
44 #include <net/if_packet.h>
45 #include <net/route.h>
46 #include <netinet/in.h>
53 #include "dpif-linux.h"
54 #include "dynamic-string.h"
55 #include "fatal-signal.h"
58 #include "netdev-provider.h"
59 #include "netdev-vport.h"
61 #include "netlink-notifier.h"
62 #include "netlink-socket.h"
64 #include "openflow/openflow.h"
66 #include "poll-loop.h"
67 #include "rtnetlink-link.h"
68 #include "socket-util.h"
74 VLOG_DEFINE_THIS_MODULE(netdev_linux);
76 COVERAGE_DEFINE(netdev_set_policing);
77 COVERAGE_DEFINE(netdev_arp_lookup);
78 COVERAGE_DEFINE(netdev_get_ifindex);
79 COVERAGE_DEFINE(netdev_get_hwaddr);
80 COVERAGE_DEFINE(netdev_set_hwaddr);
81 COVERAGE_DEFINE(netdev_ethtool);
83 /* These were introduced in Linux 2.6.14, so they might be missing if we have
 * old headers. */
85 #ifndef ADVERTISED_Pause
86 #define ADVERTISED_Pause (1 << 13)
88 #ifndef ADVERTISED_Asym_Pause
89 #define ADVERTISED_Asym_Pause (1 << 14)
92 /* These were introduced in Linux 2.6.24, so they might be missing if we
93 * have old headers. */
94 #ifndef ETHTOOL_GFLAGS
95 #define ETHTOOL_GFLAGS 0x00000025 /* Get flags bitmap(ethtool_value) */
97 #ifndef ETHTOOL_SFLAGS
98 #define ETHTOOL_SFLAGS 0x00000026 /* Set flags bitmap(ethtool_value) */
101 /* This was introduced in Linux 2.6.25, so it might be missing if we have old
 * headers. */
104 #define TC_RTAB_SIZE 1024
107 static struct nln_notifier *netdev_linux_cache_notifier = NULL;
108 static int cache_notifier_refcount;
111 VALID_IFINDEX = 1 << 0,
112 VALID_ETHERADDR = 1 << 1,
116 VALID_POLICING = 1 << 5,
117 VALID_HAVE_VPORT_STATS = 1 << 6
125 /* Traffic control. */
127 /* An instance of a traffic control class. Always associated with a particular
 * network device.
 *
130 * Each TC implementation subclasses this with whatever additional data it
 * needs. */
133 const struct tc_ops *ops;
134 struct hmap queues; /* Contains "struct tc_queue"s.
135 * Read by generic TC layer.
136 * Written only by TC implementation. */
139 /* One traffic control queue.
 *
141 * Each TC implementation subclasses this with whatever additional data it
 * needs. */
144 struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
145 unsigned int queue_id; /* OpenFlow queue ID. */
148 /* A particular kind of traffic control. Each implementation generally maps to
149 * one particular Linux qdisc class.
151 * The functions below return 0 if successful or a positive errno value on
152 * failure, except where otherwise noted. All of them must be provided, except
153 * where otherwise noted. */
155 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
156 * This is null for tc_ops_default and tc_ops_other, for which there are no
157 * appropriate values. */
158 const char *linux_name;
160 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
161 const char *ovs_name;
163 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
164 * queues. The queues are numbered 0 through n_queues - 1. */
165 unsigned int n_queues;
167 /* Called to install this TC class on 'netdev'. The implementation should
168 * make the Netlink calls required to set up 'netdev' with the right qdisc
169 * and configure it according to 'details'. The implementation may assume
170 * that the current qdisc is the default; that is, there is no need for it
171 * to delete the current qdisc before installing itself.
173 * The contents of 'details' should be documented as valid for 'ovs_name'
174 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
175 * (which is built as ovs-vswitchd.conf.db(8)).
177 * This function must return 0 if and only if it sets 'netdev->tc' to an
178 * initialized 'struct tc'.
180 * (This function is null for tc_ops_other, which cannot be installed. For
181 * other TC classes it should always be nonnull.) */
182 int (*tc_install)(struct netdev *netdev, const struct shash *details);
184 /* Called when the netdev code determines (through a Netlink query) that
185 * this TC class's qdisc is installed on 'netdev', but we didn't install
186 * it ourselves and so don't know any of the details.
188 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
189 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
190 * implementation should parse the other attributes of 'nlmsg' as
191 * necessary to determine its configuration. If necessary it should also
192 * use Netlink queries to determine the configuration of queues on
 * 'netdev'.
195 * This function must return 0 if and only if it sets 'netdev->tc' to an
196 * initialized 'struct tc'. */
197 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
199 /* Destroys the data structures allocated by the implementation as part of
200 * 'tc'. (This includes destroying 'tc->queues' by calling
 * tc_destroy(tc).)
203 * The implementation should not need to perform any Netlink calls. If
204 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
205 * (But it may not be desirable.)
207 * This function may be null if 'tc' is trivial. */
208 void (*tc_destroy)(struct tc *tc);
210 /* Retrieves details of 'netdev->tc' configuration into 'details'.
212 * The implementation should not need to perform any Netlink calls, because
213 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
214 * cached the configuration.
216 * The contents of 'details' should be documented as valid for 'ovs_name'
217 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
218 * (which is built as ovs-vswitchd.conf.db(8)).
220 * This function may be null if 'tc' is not configurable.
 */
222 int (*qdisc_get)(const struct netdev *netdev, struct shash *details);
224 /* Reconfigures 'netdev->tc' according to 'details', performing any
225 * required Netlink calls to complete the reconfiguration.
227 * The contents of 'details' should be documented as valid for 'ovs_name'
228 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
229 * (which is built as ovs-vswitchd.conf.db(8)).
231 * This function may be null if 'tc' is not configurable.
 */
233 int (*qdisc_set)(struct netdev *, const struct shash *details);
235 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
236 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
238 * The contents of 'details' should be documented as valid for 'ovs_name'
239 * in the "other_config" column in the "Queue" table in
240 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
242 * The implementation should not need to perform any Netlink calls, because
243 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
244 * cached the queue configuration.
246 * This function may be null if 'tc' does not have queues ('n_queues' is
 * 0). */
248 int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
249 struct shash *details);
251 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
252 * 'details', performing any required Netlink calls to complete the
253 * reconfiguration. The caller ensures that 'queue_id' is less than
 * 'n_queues'.
256 * The contents of 'details' should be documented as valid for 'ovs_name'
257 * in the "other_config" column in the "Queue" table in
258 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
260 * This function may be null if 'tc' does not have queues or its queues are
261 * not configurable. */
262 int (*class_set)(struct netdev *, unsigned int queue_id,
263 const struct shash *details);
265 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
266 * tc_queue's within 'netdev->tc->queues'.
268 * This function may be null if 'tc' does not have queues or its queues
269 * cannot be deleted. */
270 int (*class_delete)(struct netdev *, struct tc_queue *queue);
272 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
273 * 'struct tc_queue's within 'netdev->tc->queues'.
275 * On success, initializes '*stats'.
277 * This function may be null if 'tc' does not have queues or if it cannot
278 * report queue statistics. */
279 int (*class_get_stats)(const struct netdev *netdev,
280 const struct tc_queue *queue,
281 struct netdev_queue_stats *stats);
283 /* Extracts queue stats from 'nlmsg', which is a response to a
284 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
286 * This function may be null if 'tc' does not have queues or if it cannot
287 * report queue statistics. */
288 int (*class_dump_stats)(const struct netdev *netdev,
289 const struct ofpbuf *nlmsg,
290 netdev_dump_queue_stats_cb *cb, void *aux);
/* Initializes the generic traffic-control state in 'tc'.  The visible
 * statement creates the empty 'queues' hmap.  'ops' names the TC
 * implementation ('struct tc' has an 'ops' member).
 *
 * NOTE(review): the statement that stores 'ops' into 'tc' is not present in
 * this listing -- presumably 'tc->ops = ops'; confirm against the full file. */
294 tc_init(struct tc *tc, const struct tc_ops *ops)
297 hmap_init(&tc->queues);
/* Releases the generic part of 'tc' by destroying its 'queues' hmap.  Only
 * the hmap itself is destroyed here; nothing visible frees the queue elements
 * or 'tc'. */
301 tc_destroy(struct tc *tc)
303 hmap_destroy(&tc->queues);
306 static const struct tc_ops tc_ops_htb;
307 static const struct tc_ops tc_ops_hfsc;
308 static const struct tc_ops tc_ops_default;
309 static const struct tc_ops tc_ops_other;
311 static const struct tc_ops *tcs[] = {
312 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
313 &tc_ops_hfsc, /* Hierarchical fair service curve. */
314 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
315 &tc_ops_other, /* Some other qdisc. */
319 static unsigned int tc_make_handle(unsigned int major, unsigned int minor);
320 static unsigned int tc_get_major(unsigned int handle);
321 static unsigned int tc_get_minor(unsigned int handle);
323 static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
324 static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
325 static unsigned int tc_buffer_per_jiffy(unsigned int rate);
327 static struct tcmsg *tc_make_request(const struct netdev *, int type,
328 unsigned int flags, struct ofpbuf *);
329 static int tc_transact(struct ofpbuf *request, struct ofpbuf **replyp);
330 static int tc_add_del_ingress_qdisc(struct netdev *netdev, bool add);
331 static int tc_add_policer(struct netdev *netdev, int kbits_rate,
334 static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
335 struct nlattr **options);
336 static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
337 struct nlattr **options,
338 struct netdev_queue_stats *);
339 static int tc_query_class(const struct netdev *,
340 unsigned int handle, unsigned int parent,
341 struct ofpbuf **replyp);
342 static int tc_delete_class(const struct netdev *, unsigned int handle);
344 static int tc_del_qdisc(struct netdev *netdev);
345 static int tc_query_qdisc(const struct netdev *netdev);
347 static int tc_calc_cell_log(unsigned int mtu);
348 static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
349 static void tc_put_rtab(struct ofpbuf *, uint16_t type,
350 const struct tc_ratespec *rate);
351 static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
353 struct netdev_dev_linux {
354 struct netdev_dev netdev_dev;
356 struct shash_node *shash_node;
357 unsigned int cache_valid;
358 unsigned int change_seq;
360 bool miimon; /* Link status of last poll. */
361 long long int miimon_interval; /* Miimon Poll rate. Disabled if <= 0. */
362 struct timer miimon_timer;
364 /* The following are figured out "on demand" only. They are only valid
365 * when the corresponding VALID_* bit in 'cache_valid' is set. */
367 uint8_t etheraddr[ETH_ADDR_LEN];
368 struct in_addr address, netmask;
372 long long int carrier_resets;
373 uint32_t kbits_rate; /* Policing data. */
374 uint32_t kbits_burst;
375 bool have_vport_stats;
379 struct tap_state tap;
383 struct netdev_linux {
384 struct netdev netdev;
388 /* Sockets used for ioctl operations. */
389 static int af_inet_sock = -1; /* AF_INET, SOCK_DGRAM. */
391 /* A Netlink routing socket that is not subscribed to any multicast groups. */
392 static struct nl_sock *rtnl_sock;
394 /* This is set pretty low because we probably won't learn anything from the
395 * additional log messages. */
396 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
398 static int netdev_linux_init(void);
400 static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
401 int cmd, const char *cmd_name);
402 static int netdev_linux_do_ioctl(const char *name, struct ifreq *, int cmd,
403 const char *cmd_name);
404 static int netdev_linux_get_ipv4(const struct netdev *, struct in_addr *,
405 int cmd, const char *cmd_name);
406 static int get_flags(const struct netdev *, int *flagsp);
407 static int set_flags(struct netdev *, int flags);
408 static int do_get_ifindex(const char *netdev_name);
409 static int get_ifindex(const struct netdev *, int *ifindexp);
410 static int do_set_addr(struct netdev *netdev,
411 int ioctl_nr, const char *ioctl_name,
412 struct in_addr addr);
413 static int get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN]);
414 static int set_etheraddr(const char *netdev_name, int hwaddr_family,
415 const uint8_t[ETH_ADDR_LEN]);
416 static int get_stats_via_netlink(int ifindex, struct netdev_stats *stats);
417 static int get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats);
418 static int get_carrier_via_sysfs(const char *name, bool *carrier);
419 static int af_packet_sock(void);
420 static void netdev_linux_miimon_run(void);
421 static void netdev_linux_miimon_wait(void);
/* Returns whether 'netdev_class' uses netdev_linux_init() as its 'init'
 * callback.  The *_cast() helpers in this file assert on this test before
 * downcasting, so it is how this file recognizes its own netdev classes. */
424 is_netdev_linux_class(const struct netdev_class *netdev_class)
426 return netdev_class->init == netdev_linux_init;
/* Downcasts 'netdev_dev' to the 'struct netdev_dev_linux' that embeds it.
 * Asserts first that the device's class passes is_netdev_linux_class(), so a
 * device of a foreign class aborts instead of being misinterpreted. */
429 static struct netdev_dev_linux *
430 netdev_dev_linux_cast(const struct netdev_dev *netdev_dev)
432 const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
433 assert(is_netdev_linux_class(netdev_class));
/* 'netdev_dev' is the member named 'netdev_dev' inside netdev_dev_linux. */
435 return CONTAINER_OF(netdev_dev, struct netdev_dev_linux, netdev_dev);
/* Downcasts 'netdev' to the 'struct netdev_linux' that embeds it.  The class
 * is looked up through the netdev's device and must pass
 * is_netdev_linux_class(), otherwise the assertion fails. */
438 static struct netdev_linux *
439 netdev_linux_cast(const struct netdev *netdev)
441 struct netdev_dev *netdev_dev = netdev_get_dev(netdev);
442 const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
443 assert(is_netdev_linux_class(netdev_class));
445 return CONTAINER_OF(netdev, struct netdev_linux, netdev);
/* Class 'init' callback: creates the file-scope sockets used by the rest of
 * this file -- the AF_INET datagram socket 'af_inet_sock' and the rtnetlink
 * socket 'rtnl_sock'.  The outcome of each step is kept in the static
 * 'status' (0 or an errno value) and failures are logged.
 *
 * NOTE(review): the conditionals guarding these statements and the final
 * return are missing from this listing.  Presumably the work is done only on
 * the first call (while 'status' is still -1) and 'status' is returned every
 * time -- confirm against the full file. */
449 netdev_linux_init(void)
451 static int status = -1;
453 /* Create AF_INET socket. */
454 af_inet_sock = socket(AF_INET, SOCK_DGRAM, 0);
/* errno is captured immediately so that later calls cannot clobber it. */
455 status = af_inet_sock >= 0 ? 0 : errno;
457 VLOG_ERR("failed to create inet socket: %s", strerror(status));
460 /* Create rtnetlink socket. */
462 status = nl_sock_create(NETLINK_ROUTE, &rtnl_sock);
464 VLOG_ERR_RL(&rl, "failed to create rtnetlink socket: %s",
/* Periodic work for this netdev class: runs the rtnetlink link notifier and
 * then one MII monitoring pass over the devices. */
473 netdev_linux_run(void)
475 rtnetlink_link_run();
476 netdev_linux_miimon_run();
/* Counterpart of netdev_linux_run(): registers the wake-up conditions of the
 * rtnetlink link notifier and of the MII monitoring timers. */
480 netdev_linux_wait(void)
482 rtnetlink_link_wait();
483 netdev_linux_miimon_wait();
/* Records that 'dev' changed.  Clearing 'cache_valid' drops every VALID_* bit,
 * so all cached attributes are re-read on demand.
 *
 * NOTE(review): the statements that advance 'change_seq' are missing from this
 * listing.  The visible test on a zero 'change_seq' presumably skips 0 when
 * the counter wraps -- confirm against the full file. */
487 netdev_dev_linux_changed(struct netdev_dev_linux *dev)
490 if (!dev->change_seq) {
493 dev->cache_valid = 0;
/* rtnetlink link-change callback (registered by cache_notifier_ref()).
 *
 * Two paths are visible:
 *
 *   - Using 'change': the device named 'change->ifname' is looked up and, if
 *     it belongs to this file's classes, its carrier state is synchronized
 *     with 'change->running' (counting a carrier reset on every flip) and its
 *     cache is invalidated.
 *
 *   - Without using 'change': every netdev_linux_class device has its carrier
 *     re-read from sysfs, with the same reset counting and invalidation.
 *
 * NOTE(review): the conditionals that select between the two paths, and some
 * declarations inside the loop, are missing from this listing.  Presumably the
 * second path runs when 'change' is null -- confirm against the full file. */
497 netdev_linux_cache_cb(const struct rtnetlink_link_change *change,
498 void *aux OVS_UNUSED)
500 struct netdev_dev_linux *dev;
502 struct netdev_dev *base_dev = netdev_dev_from_name(change->ifname);
504 const struct netdev_class *netdev_class =
505 netdev_dev_get_class(base_dev);
/* Ignore devices that are owned by other netdev providers. */
507 if (is_netdev_linux_class(netdev_class)) {
508 dev = netdev_dev_linux_cast(base_dev);
/* A carrier transition in either direction counts as one reset. */
510 if (dev->carrier != change->running) {
511 dev->carrier = change->running;
512 dev->carrier_resets++;
515 netdev_dev_linux_changed(dev);
519 struct shash device_shash;
520 struct shash_node *node;
/* Snapshot the netdev_linux_class devices into a temporary shash. */
522 shash_init(&device_shash);
523 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
524 SHASH_FOR_EACH (node, &device_shash) {
/* The return value of get_carrier_via_sysfs() is not checked here. */
529 get_carrier_via_sysfs(node->name, &carrier);
530 if (dev->carrier != carrier) {
531 dev->carrier = carrier;
532 dev->carrier_resets++;
535 netdev_dev_linux_changed(dev);
537 shash_destroy(&device_shash);
/* Takes a reference on the shared rtnetlink link notifier.  When the
 * reference count is zero the notifier is created first, with
 * netdev_linux_cache_cb() as its callback.  Callers treat the return value as
 * an error code (see netdev_linux_create()).
 *
 * NOTE(review): the handling of a failed notifier creation and the return
 * statements are missing from this listing. */
542 cache_notifier_ref(void)
544 if (!cache_notifier_refcount) {
/* Count and pointer must agree: no references means no notifier. */
545 assert(!netdev_linux_cache_notifier);
547 netdev_linux_cache_notifier =
548 rtnetlink_link_notifier_create(netdev_linux_cache_cb, NULL);
550 if (!netdev_linux_cache_notifier) {
554 cache_notifier_refcount++;
/* Drops one reference on the shared rtnetlink link notifier.  When the count
 * reaches zero the notifier is destroyed and the global pointer is reset to
 * NULL so that a later cache_notifier_ref() recreates it.  Calling this
 * without an outstanding reference trips the first assertion. */
560 cache_notifier_unref(void)
562 assert(cache_notifier_refcount > 0);
563 if (!--cache_notifier_refcount) {
564 assert(netdev_linux_cache_notifier);
565 rtnetlink_link_notifier_destroy(netdev_linux_cache_notifier);
566 netdev_linux_cache_notifier = NULL;
570 /* Creates system and internal devices.
 *
 * Takes a reference on the link notifier, allocates a zeroed
 * netdev_dev_linux for 'name' with 'change_seq' starting at 1, initializes the
 * generic part with 'class', primes the carrier state from sysfs, and stores
 * the generic device pointer in '*netdev_devp'.
 *
 * NOTE(review): the error check after cache_notifier_ref() and the return
 * statements are missing from this listing. */
572 netdev_linux_create(const struct netdev_class *class, const char *name,
573 struct netdev_dev **netdev_devp)
575 struct netdev_dev_linux *netdev_dev;
578 error = cache_notifier_ref();
583 netdev_dev = xzalloc(sizeof *netdev_dev);
584 netdev_dev->change_seq = 1;
585 netdev_dev_init(&netdev_dev->netdev_dev, name, class);
/* The return value of the sysfs carrier query is not checked here. */
586 get_carrier_via_sysfs(name, &netdev_dev->carrier);
588 *netdev_devp = &netdev_dev->netdev_dev;
592 /* For most types of netdevs we open the device for each call of
593 * netdev_open(). However, this is not the case with tap devices,
594 * since it is only possible to open the device once. In this
595 * situation we share a single file descriptor, and consequently
596 * buffers, across all readers. Therefore once data is read it will
597 * be unavailable to other reads for tap devices.
 *
 * Creates the tap device 'name': opens "/dev/net/tun", attaches to 'name'
 * with TUNSETIFF (IFF_TAP | IFF_NO_PI), makes the descriptor non-blocking,
 * and publishes the new device in '*netdev_devp'.  'class' is unused; the
 * device is always initialized with netdev_tap_class.
 *
 * NOTE(review): the cleanup after the 'error_unref_notifier' label (closing
 * 'state->fd', freeing 'netdev_dev') and the return statements are missing
 * from this listing.  Also, unlike netdev_linux_create(), no visible line
 * sets 'change_seq' to 1 here -- confirm whether that is intentional. */
599 netdev_linux_create_tap(const struct netdev_class *class OVS_UNUSED,
600 const char *name, struct netdev_dev **netdev_devp)
602 struct netdev_dev_linux *netdev_dev;
603 struct tap_state *state;
604 static const char tap_dev[] = "/dev/net/tun";
608 netdev_dev = xzalloc(sizeof *netdev_dev);
609 state = &netdev_dev->state.tap;
611 error = cache_notifier_ref();
616 /* Open tap device. */
617 state->fd = open(tap_dev, O_RDWR);
620 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, strerror(error));
621 goto error_unref_notifier;
624 /* Create tap device. */
625 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
626 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
627 if (ioctl(state->fd, TUNSETIFF, &ifr) == -1) {
628 VLOG_WARN("%s: creating tap device failed: %s", name,
631 goto error_unref_notifier;
634 /* Make non-blocking. */
635 error = set_nonblocking(state->fd);
637 goto error_unref_notifier;
640 netdev_dev_init(&netdev_dev->netdev_dev, name, &netdev_tap_class);
641 *netdev_devp = &netdev_dev->netdev_dev;
/* Failure path: give back the notifier reference taken above. */
644 error_unref_notifier:
645 cache_notifier_unref();
/* Releases the tap-specific state of 'netdev_dev'.  Acts only when the tap
 * file descriptor is valid (>= 0).
 *
 * NOTE(review): the body of the conditional is missing from this listing --
 * presumably it closes 'state->fd'; confirm against the full file. */
652 destroy_tap(struct netdev_dev_linux *netdev_dev)
654 struct tap_state *state = &netdev_dev->state.tap;
656 if (state->fd >= 0) {
661 /* Destroys the netdev device 'netdev_dev_'.
 *
 * Lets the traffic-control implementation tear down its state when it
 * provides a 'tc_destroy' hook (the hook may be null), releases tap state for
 * devices of netdev_tap_class, and drops the link-notifier reference.
 *
 * NOTE(review): the statement that frees 'netdev_dev' itself is missing from
 * this listing. */
663 netdev_linux_destroy(struct netdev_dev *netdev_dev_)
665 struct netdev_dev_linux *netdev_dev = netdev_dev_linux_cast(netdev_dev_);
666 const struct netdev_class *class = netdev_dev_get_class(netdev_dev_);
668 if (netdev_dev->tc && netdev_dev->tc->ops->tc_destroy) {
669 netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
672 if (class == &netdev_tap_class) {
673 destroy_tap(netdev_dev);
677 cache_notifier_unref();
/* Opens a new netdev handle on 'netdev_dev_' and stores it in '*netdevp'.
 *
 * The handle is allocated zeroed and initialized against the device.  For
 * every class except netdev_internal_class the device's existence is verified
 * by reading its flags.  The first handle opened on a "tap" device receives
 * the shared tap file descriptor.
 *
 * NOTE(review): the initial value given to 'netdev->fd', the handling of the
 * ENODEV case, and the return statements are missing from this listing.  The
 * final netdev_uninit() call is presumably the error path. */
681 netdev_linux_open(struct netdev_dev *netdev_dev_, struct netdev **netdevp)
683 struct netdev_dev_linux *netdev_dev = netdev_dev_linux_cast(netdev_dev_);
684 struct netdev_linux *netdev;
685 enum netdev_flags flags;
688 /* Allocate network device. */
689 netdev = xzalloc(sizeof *netdev);
691 netdev_init(&netdev->netdev, netdev_dev_);
693 /* Verify that the device really exists, by attempting to read its flags.
694 * (The flags might be cached, in which case this won't actually do an
697 * Don't do this for "internal" netdevs, though, because those have to be
698 * created as netdev objects before they exist in the kernel, because
699 * creating them in the kernel happens by passing a netdev object to
700 * dpif_port_add(). */
701 if (netdev_dev_get_class(netdev_dev_) != &netdev_internal_class) {
702 error = netdev_get_flags(&netdev->netdev, &flags);
703 if (error == ENODEV) {
708 if (!strcmp(netdev_dev_get_type(netdev_dev_), "tap") &&
709 !netdev_dev->state.tap.opened) {
711 /* We assume that the first user of the tap device is the primary user
712 * and give them the tap FD. Subsequent users probably just expect
713 * this to be a system device so open it normally to avoid send/receive
714 * directions appearing to be reversed. */
715 netdev->fd = netdev_dev->state.tap.fd;
716 netdev_dev->state.tap.opened = true;
719 *netdevp = &netdev->netdev;
723 netdev_uninit(&netdev->netdev, true);
727 /* Closes and destroys 'netdev'.
 *
 * The visible condition selects handles with a descriptor that are not of
 * type "tap".  The tap descriptor is shared and owned by the device (see
 * netdev_linux_open()).
 *
 * NOTE(review): the body of the conditional and the freeing of 'netdev' are
 * missing from this listing.  The test uses 'fd > 0' whereas the other checks
 * in this file use 'fd >= 0' or 'fd < 0' -- confirm whether descriptor 0 is
 * meant to be excluded. */
729 netdev_linux_close(struct netdev *netdev_)
731 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
733 if (netdev->fd > 0 && strcmp(netdev_get_type(netdev_), "tap")) {
/* Prepares 'netdev_' for receiving packets.  Creates a raw PF_PACKET socket,
 * makes it non-blocking, and binds it to the device's ifindex for all
 * protocols (ETH_P_ALL).
 *
 * NOTE(review): the branch taken when 'netdev->fd' is already valid, the
 * error checks between the steps, the assignment of the new descriptor to
 * 'netdev->fd', and the return statements are missing from this listing. */
740 netdev_linux_listen(struct netdev *netdev_)
742 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
743 struct sockaddr_ll sll;
748 if (netdev->fd >= 0) {
752 /* Create file descriptor. */
753 fd = socket(PF_PACKET, SOCK_RAW, 0);
756 VLOG_ERR("failed to create raw socket (%s)", strerror(error));
760 /* Set non-blocking mode. */
761 error = set_nonblocking(fd);
766 /* Get ethernet device index. */
767 error = get_ifindex(&netdev->netdev, &ifindex);
772 /* Bind to specific ethernet device. */
773 memset(&sll, 0, sizeof sll);
774 sll.sll_family = AF_PACKET;
775 sll.sll_ifindex = ifindex;
/* htons() is applied to the protocol before it is stored in sll_protocol. */
776 sll.sll_protocol = (OVS_FORCE unsigned short int) htons(ETH_P_ALL);
777 if (bind(fd, (struct sockaddr *) &sll, sizeof sll) < 0) {
779 VLOG_ERR("%s: failed to bind raw socket (%s)",
780 netdev_get_name(netdev_), strerror(error));
/* Receives one packet from 'netdev_' into the 'size'-byte buffer at 'data'.
 *
 * Devices of netdev_tap_class are read with read(); all others use recv()
 * with MSG_TRUNC.  A result larger than 'size' is reported as -EMSGSIZE
 * instead of a byte count.  EINTR is not treated as a failure and EAGAIN is
 * not logged; any other errno produces a rate-limited warning naming the
 * device and the error.
 *
 * NOTE(review): the return for a device that is not listening, the retry
 * structure around the read, and the final error return are missing from this
 * listing. */
795 netdev_linux_recv(struct netdev *netdev_, void *data, size_t size)
797 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
799 if (netdev->fd < 0) {
800 /* Device is not listening. */
807 retval = (netdev_->netdev_dev->netdev_class == &netdev_tap_class
808 ? read(netdev->fd, data, size)
809 : recv(netdev->fd, data, size, MSG_TRUNC));
811 return retval <= size ? retval : -EMSGSIZE;
812 } else if (errno != EINTR) {
813 if (errno != EAGAIN) {
/* Argument order follows the format string: device name, then error text
 * (same order as the warning in netdev_linux_send()). */
814 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
815 netdev_get_name(netdev_), strerror(errno));
822 /* Registers with the poll loop to wake up from the next call to poll_block()
823 * when a packet is ready to be received with netdev_recv() on 'netdev'.
 *
 * Nothing is registered when the handle has no descriptor (fd < 0). */
825 netdev_linux_recv_wait(struct netdev *netdev_)
827 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
828 if (netdev->fd >= 0) {
829 poll_fd_wait(netdev->fd, POLLIN);
833 /* Discards all packets waiting to be received from 'netdev'.
 *
 * For "tap" devices the transmit queue length is queried with SIOCGIFTXQLEN
 * and passed to drain_fd() as the amount to drain from the tap descriptor;
 * for other devices the socket receive buffer is drained with drain_rcvbuf().
 *
 * NOTE(review): the return for a handle without a descriptor and the error
 * check after the ioctl are missing from this listing. */
835 netdev_linux_drain(struct netdev *netdev_)
837 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
838 if (netdev->fd < 0) {
840 } else if (!strcmp(netdev_get_type(netdev_), "tap")) {
842 int error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
843 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
847 drain_fd(netdev->fd, ifr.ifr_qlen);
850 return drain_rcvbuf(netdev->fd);
854 /* Sends the 'size' bytes at 'data' on 'netdev'. Returns 0 if successful,
855 * otherwise a positive errno value. Returns EAGAIN without blocking if the
856 * packet cannot be queued immediately. Returns EMSGSIZE if a partial packet
857 * was transmitted or if the packet is too big or too small to transmit on the
 * device.
 *
859 * The caller retains ownership of 'data' in all cases.
 *
861 * The kernel maintains a packet transmission queue, so the caller is not
862 * expected to do additional queuing of packets.
 *
 * NOTE(review): the retry structure, several assignments that fill in 'iov'
 * and 'msg', and the return statements are missing from this listing. */
864 netdev_linux_send(struct netdev *netdev_, const void *data, size_t size)
866 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
870 if (netdev->fd < 0) {
871 /* Use our AF_PACKET socket to send to this device. */
872 struct sockaddr_ll sll;
879 sock = af_packet_sock();
884 error = get_ifindex(netdev_, &ifindex);
889 /* We don't bother setting most fields in sockaddr_ll because the
890 * kernel ignores them for SOCK_RAW. */
891 memset(&sll, 0, sizeof sll);
892 sll.sll_family = AF_PACKET;
893 sll.sll_ifindex = ifindex;
/* The cast only drops 'const' to fit iov_base's type. */
895 iov.iov_base = (void *) data;
899 msg.msg_namelen = sizeof sll;
902 msg.msg_control = NULL;
903 msg.msg_controllen = 0;
906 retval = sendmsg(sock, &msg, 0);
908 /* Use the netdev's own fd to send to this device. This is
909 * essential for tap devices, because packets sent to a tap device
910 * with an AF_PACKET socket will loop back to be *received* again
911 * on the tap device. */
912 retval = write(netdev->fd, data, size);
916 /* The Linux AF_PACKET implementation never blocks waiting for room
917 * for packets, instead returning ENOBUFS. Translate this into
918 * EAGAIN for the caller. */
919 if (errno == ENOBUFS) {
921 } else if (errno == EINTR) {
/* EAGAIN is expected back-pressure, so it is not logged. */
923 } else if (errno != EAGAIN) {
924 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
925 netdev_get_name(netdev_), strerror(errno));
/* A short write is logged as a partial packet. */
928 } else if (retval != size) {
929 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%zd bytes of "
930 "%zu) on %s", retval, size, netdev_get_name(netdev_));
938 /* Registers with the poll loop to wake up from the next call to poll_block()
939 * when the packet transmission queue has sufficient room to transmit a packet
940 * with netdev_send().
 *
942 * The kernel maintains a packet transmission queue, so the client is not
943 * expected to do additional queuing of packets. Thus, this function is
944 * unlikely to ever be used. It is included for completeness.
 *
 * Non-tap handles with a descriptor wait for POLLOUT on it; tap handles wake
 * the poll loop immediately.
 *
 * NOTE(review): the body of the 'fd < 0' branch is missing from this
 * listing. */
946 netdev_linux_send_wait(struct netdev *netdev_)
948 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
949 if (netdev->fd < 0) {
/* strcmp() is nonzero here for every type other than "tap". */
951 } else if (strcmp(netdev_get_type(netdev_), "tap")) {
952 poll_fd_wait(netdev->fd, POLLOUT);
954 /* TAP device always accepts packets.*/
955 poll_immediate_wake();
959 /* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
960 * otherwise a positive errno value.
 *
 * The kernel is only contacted when the cached address is not valid or
 * differs from 'mac'.  The cache is then updated with 'mac' and marked valid.
 *
 * NOTE(review): the condition guarding the cache update (presumably "no
 * error") and the return statements are missing from this listing. */
962 netdev_linux_set_etheraddr(struct netdev *netdev_,
963 const uint8_t mac[ETH_ADDR_LEN])
965 struct netdev_dev_linux *netdev_dev =
966 netdev_dev_linux_cast(netdev_get_dev(netdev_));
969 if (!(netdev_dev->cache_valid & VALID_ETHERADDR)
970 || !eth_addr_equals(netdev_dev->etheraddr, mac)) {
971 error = set_etheraddr(netdev_get_name(netdev_), ARPHRD_ETHER, mac);
973 netdev_dev->cache_valid |= VALID_ETHERADDR;
974 memcpy(netdev_dev->etheraddr, mac, ETH_ADDR_LEN);
982 /* Copies 'netdev''s MAC address into 'mac', which must have room for
983 * ETH_ADDR_LEN bytes.
 *
 * The address is fetched with get_etheraddr() only when the cached copy is
 * not valid; afterwards the cache is marked valid and later calls are served
 * from it.
 *
 * NOTE(review): the error check after get_etheraddr() and the return
 * statements are missing from this listing. */
985 netdev_linux_get_etheraddr(const struct netdev *netdev_,
986 uint8_t mac[ETH_ADDR_LEN])
988 struct netdev_dev_linux *netdev_dev =
989 netdev_dev_linux_cast(netdev_get_dev(netdev_));
990 if (!(netdev_dev->cache_valid & VALID_ETHERADDR)) {
991 int error = get_etheraddr(netdev_get_name(netdev_),
992 netdev_dev->etheraddr);
996 netdev_dev->cache_valid |= VALID_ETHERADDR;
998 memcpy(mac, netdev_dev->etheraddr, ETH_ADDR_LEN);
1002 /* Returns the maximum size of transmitted (and received) packets on 'netdev',
1003 * in bytes, not including the hardware header; thus, this is typically 1500
1004 * bytes for Ethernet devices.
 *
 * The value is stored in '*mtup'.  It is queried with SIOCGIFMTU only when the
 * cached MTU is not valid and is served from the cache otherwise.
 *
 * NOTE(review): the error check after the ioctl and the return statements are
 * missing from this listing. */
1006 netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
1008 struct netdev_dev_linux *netdev_dev =
1009 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1010 if (!(netdev_dev->cache_valid & VALID_MTU)) {
1014 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
1015 SIOCGIFMTU, "SIOCGIFMTU");
1019 netdev_dev->mtu = ifr.ifr_mtu;
1020 netdev_dev->cache_valid |= VALID_MTU;
1022 *mtup = netdev_dev->mtu;
1026 /* Sets the MTU (the maximum size of transmitted packets) of the given device
1027 * using the Linux networking ioctl interface. */
/* Applies 'mtu' to 'netdev_' with the SIOCSIFMTU ioctl, then caches the value
 * found in 'ifr.ifr_mtu' after the call and marks the cached MTU valid.
 *
 * NOTE(review): the statement that loads 'mtu' into 'ifr', the error check
 * after the ioctl, and the return statements are missing from this listing. */
1030 netdev_linux_set_mtu(const struct netdev *netdev_, int mtu)
1032 struct netdev_dev_linux *netdev_dev =
1033 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1038 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
1039 SIOCSIFMTU, "SIOCSIFMTU");
1044 netdev_dev->mtu = ifr.ifr_mtu;
1045 netdev_dev->cache_valid |= VALID_MTU;
1049 /* Returns the ifindex of 'netdev', if successful, as a positive number.
1050 * On failure, returns a negative errno value.
 *
 * get_ifindex() reports a positive errno value, which is negated here so that
 * one return value can carry either the index or the error. */
1052 netdev_linux_get_ifindex(const struct netdev *netdev)
1056 error = get_ifindex(netdev, &ifindex);
1057 return error ? -error : ifindex;
/* Stores the link state of 'netdev_' in '*carrier'.  When MII monitoring is
 * enabled (miimon_interval > 0) the result of the last MII poll is reported.
 * Otherwise the cached carrier state is used; that state is maintained by
 * netdev_linux_cache_cb() and primed in netdev_linux_create().
 *
 * NOTE(review): the return statement is missing from this listing. */
1061 netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
1063 struct netdev_dev_linux *netdev_dev =
1064 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1066 if (netdev_dev->miimon_interval > 0) {
1067 *carrier = netdev_dev->miimon;
1069 *carrier = netdev_dev->carrier;
/* Returns the number of carrier transitions counted for 'netdev's device.
 * The counter is incremented in netdev_linux_cache_cb() each time the carrier
 * state flips. */
1075 static long long int
1076 netdev_linux_get_carrier_resets(const struct netdev *netdev)
1078 return netdev_dev_linux_cast(netdev_get_dev(netdev))->carrier_resets;
/* Issues the MII ioctl 'cmd' (named 'cmd_name' for logging) on device 'name'.
 *
 * '*data' is copied into the storage of 'ifr.ifr_data' before the call and
 * copied back afterwards, so values filled in by the ioctl reach the caller
 * through 'data'.  The copy-back happens whether or not the ioctl succeeded.
 *
 * NOTE(review): the return statement is missing from this listing --
 * presumably 'error' is returned. */
1082 netdev_linux_do_miimon(const char *name, int cmd, const char *cmd_name,
1083 struct mii_ioctl_data *data)
1088 memset(&ifr, 0, sizeof ifr);
1089 memcpy(&ifr.ifr_data, data, sizeof *data);
1090 error = netdev_linux_do_ioctl(name, &ifr, cmd, cmd_name);
1091 memcpy(data, &ifr.ifr_data, sizeof *data);
/* Queries the link status of device 'name' and stores it in '*miimon'.
 *
 * The MII path asks for the PHY with SIOCGMIIPHY, then reads the MII_BMSR
 * register with SIOCGMIIREG and reports its BMSR_LSTATUS bit.  The other
 * visible path uses the ethtool ETHTOOL_GLINK command and reports whether the
 * returned value is nonzero.
 *
 * NOTE(review): the conditionals that connect these steps and the return
 * statements are missing from this listing.  The debug message suggests the
 * ethtool path is the fallback when MII fails. */
1097 netdev_linux_get_miimon(const char *name, bool *miimon)
1099 struct mii_ioctl_data data;
1104 memset(&data, 0, sizeof data);
1105 error = netdev_linux_do_miimon(name, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
1107 /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
1108 data.reg_num = MII_BMSR;
1109 error = netdev_linux_do_miimon(name, SIOCGMIIREG, "SIOCGMIIREG",
1113 *miimon = !!(data.val_out & BMSR_LSTATUS);
1115 VLOG_WARN_RL(&rl, "%s: failed to query MII", name);
1118 struct ethtool_cmd ecmd;
1120 VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
1123 memset(&ecmd, 0, sizeof ecmd);
1124 error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
1127 struct ethtool_value eval;
/* The reply is copied out of 'ecmd' into an ethtool_value to read 'data'. */
1129 memcpy(&eval, &ecmd, sizeof eval);
1130 *miimon = !!eval.data;
1132 VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
/* Sets the MII polling interval of 'netdev_'.  A positive 'interval' is
 * raised to at least 100; any other value becomes 0, which disables polling
 * (the run and wait functions only act on intervals > 0).  When the effective
 * interval changes, the timer is marked expired.
 *
 * NOTE(review): the unit of 'interval' is not stated in the visible code
 * (presumably milliseconds), and the return statement is missing from this
 * listing. */
1140 netdev_linux_set_miimon_interval(struct netdev *netdev_,
1141 long long int interval)
1143 struct netdev_dev_linux *netdev_dev;
1145 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev_));
1147 interval = interval > 0 ? MAX(interval, 100) : 0;
1148 if (netdev_dev->miimon_interval != interval) {
1149 netdev_dev->miimon_interval = interval;
1150 timer_set_expired(&netdev_dev->miimon_timer);
/* Performs one MII monitoring pass over all netdev_linux_class devices.
 * Devices with monitoring disabled (interval <= 0) or whose timer has not
 * expired are filtered by the visible condition.  For the others the link
 * status is polled, a change is recorded with netdev_dev_linux_changed(), and
 * the timer is re-armed with the device's interval.
 *
 * NOTE(review): the declaration of 'miimon' and the body of the filtering
 * conditional (presumably 'continue') are missing from this listing. */
1157 netdev_linux_miimon_run(void)
1159 struct shash device_shash;
1160 struct shash_node *node;
1162 shash_init(&device_shash);
1163 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
1164 SHASH_FOR_EACH (node, &device_shash) {
1165 struct netdev_dev_linux *dev = node->data;
1168 if (dev->miimon_interval <= 0 || !timer_expired(&dev->miimon_timer)) {
/* The return value of the MII query is not checked here. */
1172 netdev_linux_get_miimon(dev->netdev_dev.name, &miimon);
1173 if (miimon != dev->miimon) {
1174 dev->miimon = miimon;
1175 netdev_dev_linux_changed(dev);
1178 timer_set_duration(&dev->miimon_timer, dev->miimon_interval);
1181 shash_destroy(&device_shash);
/* Registers with the poll loop the MII timers of all netdev_linux_class
 * devices that have monitoring enabled (interval > 0). */
1185 netdev_linux_miimon_wait(void)
1187 struct shash device_shash;
1188 struct shash_node *node;
1190 shash_init(&device_shash);
1191 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
1192 SHASH_FOR_EACH (node, &device_shash) {
1193 struct netdev_dev_linux *dev = node->data;
1195 if (dev->miimon_interval > 0) {
1196 timer_wait(&dev->miimon_timer);
1199 shash_destroy(&device_shash);
1202 /* Checks whether we can use RTM_GETLINK to get network device statistics.
1203 * In pre-2.6.19 kernels, this was only available if wireless extensions were
 * enabled. */
/* Probes the loopback device "lo" to decide how statistics are obtained.  Its
 * ifindex is looked up and a Netlink statistics query is attempted on it.
 * The log messages record which source (rtnetlink or proc) will be used.
 *
 * NOTE(review): the conditionals and the return statements are missing from
 * this listing.  The caller, netdev_linux_sys_get_stats(), treats the result
 * as a boolean meaning "use Netlink". */
1206 check_for_working_netlink_stats(void)
1208 /* Decide on the netdev_get_stats() implementation to use. Netlink is
1209 * preferable, so if that works, we'll use it. */
1210 int ifindex = do_get_ifindex("lo");
1212 VLOG_WARN("failed to get ifindex for lo, "
1213 "obtaining netdev stats from proc");
1216 struct netdev_stats stats;
1217 int error = get_stats_via_netlink(ifindex, &stats);
1219 VLOG_DBG("obtaining netdev stats via rtnetlink");
1222 VLOG_INFO("RTM_GETLINK failed (%s), obtaining netdev stats "
1223 "via proc (you are probably running a pre-2.6.19 "
1224 "kernel)", strerror(error));
/* Exchanges the 64-bit values stored at '*a' and '*b'.
 *
 * Both pointers must be valid.  Passing the same address twice is harmless:
 * the value is left unchanged. */
static void
swap_uint64(uint64_t *a, uint64_t *b)
{
    uint64_t tmp = *a;

    *a = *b;
    *b = tmp;
}
/* Tries to fill 'stats' for 'netdev_' from the vport layer.
 *
 * The query runs when vport stats are known to work ('have_vport_stats') or
 * when that has not been determined yet (VALID_HAVE_VPORT_STATS clear in
 * 'cache_valid').  Each attempt records whether it succeeded, so after a
 * failed attempt later calls skip the query. */
1239 get_stats_via_vport(const struct netdev *netdev_,
1240 struct netdev_stats *stats)
1242 struct netdev_dev_linux *netdev_dev =
1243 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1245 if (netdev_dev->have_vport_stats ||
1246 !(netdev_dev->cache_valid & VALID_HAVE_VPORT_STATS)) {
1249 error = netdev_vport_get_stats(netdev_, stats);
    /* NOTE(review): the condition guarding this warning is not visible in
     * this excerpt; presumably it is "if (error)". */
1251 VLOG_WARN_RL(&rl, "%s: obtaining netdev stats via vport failed "
1252 "(%s)", netdev_get_name(netdev_), strerror(error));
    /* Remember the outcome so the test above can short-circuit next time. */
1254 netdev_dev->have_vport_stats = !error;
1255 netdev_dev->cache_valid |= VALID_HAVE_VPORT_STATS;
/* Reads system-level statistics for 'netdev_' into 'stats', through
 * rtnetlink or through proc.
 *
 * Which path to use is decided once, on first call, by
 * check_for_working_netlink_stats() and kept in a function-local static.
 * NOTE(review): the condition around the warning and the return statement are
 * not visible in this excerpt; callers in this file assign the result to
 * 'error'. */
1260 netdev_linux_sys_get_stats(const struct netdev *netdev_,
1261 struct netdev_stats *stats)
    /* -1 means "not probed yet". */
1263 static int use_netlink_stats = -1;
1266 if (use_netlink_stats < 0) {
1267 use_netlink_stats = check_for_working_netlink_stats();
1270 if (use_netlink_stats) {
    /* The netlink path is keyed by ifindex, the proc path by device name. */
1273 error = get_ifindex(netdev_, &ifindex);
1275 error = get_stats_via_netlink(ifindex, stats);
1278 error = get_stats_via_proc(netdev_get_name(netdev_), stats);
1282 VLOG_WARN_RL(&rl, "%s: linux-sys get stats failed %d",
1283 netdev_get_name(netdev_), error);
1289 /* Retrieves current device stats for 'netdev-linux'. */
/* Two sources are consulted: get_stats_via_vport() writes into 'stats' and
 * netdev_linux_sys_get_stats() writes into the local 'dev_stats'.  The error,
 * drop, multicast and collision counters listed at the end are added from
 * 'dev_stats' onto 'stats'.
 *
 * NOTE(review): the bodies of the two '!have_vport_stats' checks and the
 * branch that guards the additions are not visible in this excerpt; the
 * inline comment indicates that without vport stats the system ("ioctl")
 * stats are used instead. */
1291 netdev_linux_get_stats(const struct netdev *netdev_,
1292 struct netdev_stats *stats)
1294 struct netdev_dev_linux *netdev_dev =
1295 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1296 struct netdev_stats dev_stats;
1299 get_stats_via_vport(netdev_, stats);
1301 error = netdev_linux_sys_get_stats(netdev_, &dev_stats);
1304 if (!netdev_dev->have_vport_stats) {
1311 if (!netdev_dev->have_vport_stats) {
1312 /* stats not available from OVS then use ioctl stats. */
1315 stats->rx_errors += dev_stats.rx_errors;
1316 stats->tx_errors += dev_stats.tx_errors;
1317 stats->rx_dropped += dev_stats.rx_dropped;
1318 stats->tx_dropped += dev_stats.tx_dropped;
1319 stats->multicast += dev_stats.multicast;
1320 stats->collisions += dev_stats.collisions;
1321 stats->rx_length_errors += dev_stats.rx_length_errors;
1322 stats->rx_over_errors += dev_stats.rx_over_errors;
1323 stats->rx_crc_errors += dev_stats.rx_crc_errors;
1324 stats->rx_frame_errors += dev_stats.rx_frame_errors;
1325 stats->rx_fifo_errors += dev_stats.rx_fifo_errors;
1326 stats->rx_missed_errors += dev_stats.rx_missed_errors;
1327 stats->tx_aborted_errors += dev_stats.tx_aborted_errors;
1328 stats->tx_carrier_errors += dev_stats.tx_carrier_errors;
1329 stats->tx_fifo_errors += dev_stats.tx_fifo_errors;
1330 stats->tx_heartbeat_errors += dev_stats.tx_heartbeat_errors;
1331 stats->tx_window_errors += dev_stats.tx_window_errors;
1336 /* Retrieves current device stats for 'netdev-tap' netdev or
1337 * netdev-internal. */
/* As in netdev_linux_get_stats(), get_stats_via_vport() writes into 'stats'
 * and netdev_linux_sys_get_stats() writes into the local 'dev_stats'.
 *
 * Two groups of statements follow the explanatory comment in the body.  The
 * first, under '!have_vport_stats', swaps the rx/tx packet, byte, error and
 * drop counters with swap_uint64() and zeroes the detailed rx_* and tx_*
 * error counters.  The second adds 'dev_stats' drop and error counters onto
 * 'stats' with rx and tx crossed, plus multicast and collisions.
 *
 * NOTE(review): the error handling after the system query, the source of the
 * values being swapped, and the 'else' that presumably separates the two
 * groups are not visible in this excerpt. */
1339 netdev_pseudo_get_stats(const struct netdev *netdev_,
1340 struct netdev_stats *stats)
1342 struct netdev_dev_linux *netdev_dev =
1343 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1344 struct netdev_stats dev_stats;
1347 get_stats_via_vport(netdev_, stats);
1349 error = netdev_linux_sys_get_stats(netdev_, &dev_stats);
1351 if (!netdev_dev->have_vport_stats) {
1358 /* If this port is an internal port then the transmit and receive stats
1359 * will appear to be swapped relative to the other ports since we are the
1360 * one sending the data, not a remote computer. For consistency, we swap
1361 * them back here. This does not apply if we are getting stats from the
1362 * vport layer because it always tracks stats from the perspective of the
1364 if (!netdev_dev->have_vport_stats) {
1366 swap_uint64(&stats->rx_packets, &stats->tx_packets);
1367 swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
1368 swap_uint64(&stats->rx_errors, &stats->tx_errors);
1369 swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
1370 stats->rx_length_errors = 0;
1371 stats->rx_over_errors = 0;
1372 stats->rx_crc_errors = 0;
1373 stats->rx_frame_errors = 0;
1374 stats->rx_fifo_errors = 0;
1375 stats->rx_missed_errors = 0;
1376 stats->tx_aborted_errors = 0;
1377 stats->tx_carrier_errors = 0;
1378 stats->tx_fifo_errors = 0;
1379 stats->tx_heartbeat_errors = 0;
1380 stats->tx_window_errors = 0;
1382 stats->rx_dropped += dev_stats.tx_dropped;
1383 stats->tx_dropped += dev_stats.rx_dropped;
1385 stats->rx_errors += dev_stats.tx_errors;
1386 stats->tx_errors += dev_stats.rx_errors;
1388 stats->multicast += dev_stats.multicast;
1389 stats->collisions += dev_stats.collisions;
1394 /* Stores the features supported by 'netdev' into each of '*current',
1395 * '*advertised', '*supported', and '*peer' that are non-null. Each value is a
1396 * bitmap of "enum ofp_port_features" bits, in host byte order. Returns 0 if
1397 * successful, otherwise a positive errno value. */
/* The data comes from a single ETHTOOL_GSET query; each SUPPORTED_* and
 * ADVERTISED_* bit is translated to the matching OFPPF_* bit.
 *
 * NOTE(review): the visible statements write through 'supported',
 * 'advertised', 'current' and 'peer' without any null check, which does not
 * match "that are non-null" above -- presumably callers always pass valid
 * pointers; confirm.  The initial clearing of '*supported' and '*advertised'
 * and the error return after the query are not visible in this excerpt. */
1399 netdev_linux_get_features(const struct netdev *netdev,
1400 uint32_t *current, uint32_t *advertised,
1401 uint32_t *supported, uint32_t *peer)
1403 struct ethtool_cmd ecmd;
1406 memset(&ecmd, 0, sizeof ecmd);
1407 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1408 ETHTOOL_GSET, "ETHTOOL_GSET");
1413 /* Supported features. */
1415 if (ecmd.supported & SUPPORTED_10baseT_Half) {
1416 *supported |= OFPPF_10MB_HD;
1418 if (ecmd.supported & SUPPORTED_10baseT_Full) {
1419 *supported |= OFPPF_10MB_FD;
1421 if (ecmd.supported & SUPPORTED_100baseT_Half) {
1422 *supported |= OFPPF_100MB_HD;
1424 if (ecmd.supported & SUPPORTED_100baseT_Full) {
1425 *supported |= OFPPF_100MB_FD;
1427 if (ecmd.supported & SUPPORTED_1000baseT_Half) {
1428 *supported |= OFPPF_1GB_HD;
1430 if (ecmd.supported & SUPPORTED_1000baseT_Full) {
1431 *supported |= OFPPF_1GB_FD;
1433 if (ecmd.supported & SUPPORTED_10000baseT_Full) {
1434 *supported |= OFPPF_10GB_FD;
1436 if (ecmd.supported & SUPPORTED_TP) {
1437 *supported |= OFPPF_COPPER;
1439 if (ecmd.supported & SUPPORTED_FIBRE) {
1440 *supported |= OFPPF_FIBER;
1442 if (ecmd.supported & SUPPORTED_Autoneg) {
1443 *supported |= OFPPF_AUTONEG;
1445 if (ecmd.supported & SUPPORTED_Pause) {
1446 *supported |= OFPPF_PAUSE;
1448 if (ecmd.supported & SUPPORTED_Asym_Pause) {
1449 *supported |= OFPPF_PAUSE_ASYM;
1452 /* Advertised features. */
1454 if (ecmd.advertising & ADVERTISED_10baseT_Half) {
1455 *advertised |= OFPPF_10MB_HD;
1457 if (ecmd.advertising & ADVERTISED_10baseT_Full) {
1458 *advertised |= OFPPF_10MB_FD;
1460 if (ecmd.advertising & ADVERTISED_100baseT_Half) {
1461 *advertised |= OFPPF_100MB_HD;
1463 if (ecmd.advertising & ADVERTISED_100baseT_Full) {
1464 *advertised |= OFPPF_100MB_FD;
1466 if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
1467 *advertised |= OFPPF_1GB_HD;
1469 if (ecmd.advertising & ADVERTISED_1000baseT_Full) {
1470 *advertised |= OFPPF_1GB_FD;
1472 if (ecmd.advertising & ADVERTISED_10000baseT_Full) {
1473 *advertised |= OFPPF_10GB_FD;
1475 if (ecmd.advertising & ADVERTISED_TP) {
1476 *advertised |= OFPPF_COPPER;
1478 if (ecmd.advertising & ADVERTISED_FIBRE) {
1479 *advertised |= OFPPF_FIBER;
1481 if (ecmd.advertising & ADVERTISED_Autoneg) {
1482 *advertised |= OFPPF_AUTONEG;
1484 if (ecmd.advertising & ADVERTISED_Pause) {
1485 *advertised |= OFPPF_PAUSE;
1487 if (ecmd.advertising & ADVERTISED_Asym_Pause) {
1488 *advertised |= OFPPF_PAUSE_ASYM;
1491 /* Current settings. */
    /* The speed selects the rate bit; a nonzero 'duplex' selects the
     * full-duplex variant.  10 Gb/s is always reported as full duplex. */
1492 if (ecmd.speed == SPEED_10) {
1493 *current = ecmd.duplex ? OFPPF_10MB_FD : OFPPF_10MB_HD;
1494 } else if (ecmd.speed == SPEED_100) {
1495 *current = ecmd.duplex ? OFPPF_100MB_FD : OFPPF_100MB_HD;
1496 } else if (ecmd.speed == SPEED_1000) {
1497 *current = ecmd.duplex ? OFPPF_1GB_FD : OFPPF_1GB_HD;
1498 } else if (ecmd.speed == SPEED_10000) {
1499 *current = OFPPF_10GB_FD;
    /* The port type contributes the medium bit, if it is one of the two
     * recognized here. */
1504 if (ecmd.port == PORT_TP) {
1505 *current |= OFPPF_COPPER;
1506 } else if (ecmd.port == PORT_FIBRE) {
1507 *current |= OFPPF_FIBER;
    /* NOTE(review): the condition guarding the next line is not visible in
     * this excerpt. */
1511 *current |= OFPPF_AUTONEG;
1514 /* Peer advertisements. */
1515 *peer = 0; /* XXX */
1520 /* Set the features advertised by 'netdev' to 'advertise'. */
/* 'advertise' is a bitmap of OFPPF_* bits.  The current ethtool settings are
 * read with ETHTOOL_GSET, the 'advertising' field is rebuilt from scratch
 * out of 'advertise', and the structure is written back with ETHTOOL_SSET.
 * The value returned is the result of that final ethtool call.
 * NOTE(review): the handling of a failed ETHTOOL_GSET is not visible in this
 * excerpt. */
1522 netdev_linux_set_advertisements(struct netdev *netdev, uint32_t advertise)
1524 struct ethtool_cmd ecmd;
1527 memset(&ecmd, 0, sizeof ecmd);
1528 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1529 ETHTOOL_GSET, "ETHTOOL_GSET");
    /* Start from nothing so bits absent from 'advertise' are withdrawn. */
1534 ecmd.advertising = 0;
1535 if (advertise & OFPPF_10MB_HD) {
1536 ecmd.advertising |= ADVERTISED_10baseT_Half;
1538 if (advertise & OFPPF_10MB_FD) {
1539 ecmd.advertising |= ADVERTISED_10baseT_Full;
1541 if (advertise & OFPPF_100MB_HD) {
1542 ecmd.advertising |= ADVERTISED_100baseT_Half;
1544 if (advertise & OFPPF_100MB_FD) {
1545 ecmd.advertising |= ADVERTISED_100baseT_Full;
1547 if (advertise & OFPPF_1GB_HD) {
1548 ecmd.advertising |= ADVERTISED_1000baseT_Half;
1550 if (advertise & OFPPF_1GB_FD) {
1551 ecmd.advertising |= ADVERTISED_1000baseT_Full;
1553 if (advertise & OFPPF_10GB_FD) {
1554 ecmd.advertising |= ADVERTISED_10000baseT_Full;
1556 if (advertise & OFPPF_COPPER) {
1557 ecmd.advertising |= ADVERTISED_TP;
1559 if (advertise & OFPPF_FIBER) {
1560 ecmd.advertising |= ADVERTISED_FIBRE;
1562 if (advertise & OFPPF_AUTONEG) {
1563 ecmd.advertising |= ADVERTISED_Autoneg;
1565 if (advertise & OFPPF_PAUSE) {
1566 ecmd.advertising |= ADVERTISED_Pause;
1568 if (advertise & OFPPF_PAUSE_ASYM) {
1569 ecmd.advertising |= ADVERTISED_Asym_Pause;
1571 return netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1572 ETHTOOL_SSET, "ETHTOOL_SSET");
1575 /* Attempts to set input rate limiting (policing) policy. Returns 0 if
1576 * successful, otherwise a positive errno value. */
/* 'kbits_rate' and 'kbits_burst' are the requested policer rate and burst.
 * The last values applied are cached in the device (guarded by
 * VALID_POLICING), and a request equal to the cached values is assumed to
 * need no work.  Otherwise the ingress qdisc is removed, then an ingress
 * qdisc and a policer are added, and the cache is updated.
 * NOTE(review): the early returns after each failure, and the condition
 * under which the qdisc and policer are added, are not visible in this
 * excerpt. */
1578 netdev_linux_set_policing(struct netdev *netdev,
1579 uint32_t kbits_rate, uint32_t kbits_burst)
1581 struct netdev_dev_linux *netdev_dev =
1582 netdev_dev_linux_cast(netdev_get_dev(netdev));
1583 const char *netdev_name = netdev_get_name(netdev);
1586 COVERAGE_INC(netdev_set_policing);
1588 kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
1589 : !kbits_burst ? 1000 /* Default to 1000 kbits if 0. */
1590 : kbits_burst); /* Stick with user-specified value. */
1592 if (netdev_dev->cache_valid & VALID_POLICING
1593 && netdev_dev->kbits_rate == kbits_rate
1594 && netdev_dev->kbits_burst == kbits_burst) {
1595 /* Assume that settings haven't changed since we last set them. */
1599 /* Remove any existing ingress qdisc. */
1600 error = tc_add_del_ingress_qdisc(netdev, false);
1602 VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
1603 netdev_name, strerror(error));
1608 error = tc_add_del_ingress_qdisc(netdev, true);
1610 VLOG_WARN_RL(&rl, "%s: adding policing qdisc failed: %s",
1611 netdev_name, strerror(error));
1615 error = tc_add_policer(netdev, kbits_rate, kbits_burst);
1617 VLOG_WARN_RL(&rl, "%s: adding policing action failed: %s",
1618 netdev_name, strerror(error));
    /* Record what was applied so an identical request can be skipped. */
1623 netdev_dev->kbits_rate = kbits_rate;
1624 netdev_dev->kbits_burst = kbits_burst;
1625 netdev_dev->cache_valid |= VALID_POLICING;
/* Adds to 'types' the OVS name of every traffic-control implementation in
 * the null-terminated 'tcs' table that can be installed (has a 'tc_install'
 * hook) and has a non-empty 'ovs_name'.  'netdev' is unused. */
1631 netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
1634 const struct tc_ops **opsp;
1636 for (opsp = tcs; *opsp != NULL; opsp++) {
1637 const struct tc_ops *ops = *opsp;
1638 if (ops->tc_install && ops->ovs_name[0] != '\0') {
1639 sset_add(types, ops->ovs_name);
/* Searches the null-terminated 'tcs' table for the entry whose 'ovs_name'
 * equals 'name' exactly.  NOTE(review): the return statements are not
 * visible in this excerpt; netdev_linux_set_qos() treats a null result as
 * "no such type". */
1645 static const struct tc_ops *
1646 tc_lookup_ovs_name(const char *name)
1648 const struct tc_ops **opsp;
1650 for (opsp = tcs; *opsp != NULL; opsp++) {
1651 const struct tc_ops *ops = *opsp;
1652 if (!strcmp(name, ops->ovs_name)) {
/* Searches the null-terminated 'tcs' table for the entry whose 'linux_name'
 * equals 'name' exactly.  Entries with a null 'linux_name' are never matched.
 * NOTE(review): the return statements are not visible in this excerpt;
 * presumably the match is returned, or NULL when there is none. */
1659 static const struct tc_ops *
1660 tc_lookup_linux_name(const char *name)
1662 const struct tc_ops **opsp;
1664 for (opsp = tcs; *opsp != NULL; opsp++) {
1665 const struct tc_ops *ops = *opsp;
1666 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
/* Searches the queue map of the traffic-control state attached to 'netdev'
 * for the queue numbered 'queue_id', looking only in the bucket selected by
 * 'hash' (a parameter whose declaration line is not visible in this
 * excerpt).  tc_find_queue() passes hash_int(queue_id, 0).  Dereferences
 * 'netdev_dev->tc', so the device must already have its tc state set up.
 * NOTE(review): the return statements are not visible here. */
1673 static struct tc_queue *
1674 tc_find_queue__(const struct netdev *netdev, unsigned int queue_id,
1677 struct netdev_dev_linux *netdev_dev =
1678 netdev_dev_linux_cast(netdev_get_dev(netdev));
1679 struct tc_queue *queue;
1681 HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev_dev->tc->queues) {
1682 if (queue->queue_id == queue_id) {
/* Convenience wrapper around tc_find_queue__() that computes the hash of
 * 'queue_id' itself, as hash_int(queue_id, 0). */
1689 static struct tc_queue *
1690 tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
1692 return tc_find_queue__(netdev, queue_id, hash_int(queue_id, 0));
/* Reports in 'caps' the number of queues supported by the QoS type named
 * 'type', taken from the matching entry found by tc_lookup_ovs_name().
 * 'netdev' is unused.  NOTE(review): the 'type' parameter line and the
 * handling of an unknown type are not visible in this excerpt. */
1696 netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
1698 struct netdev_qos_capabilities *caps)
1700 const struct tc_ops *ops = tc_lookup_ovs_name(type);
1704 caps->n_queues = ops->n_queues;
/* Reports the QoS configuration currently attached to 'netdev'.
 *
 * After tc_query_qdisc(), '*typep' is set to the OVS name of the active
 * traffic-control implementation; the string belongs to that implementation's
 * ops table, not to the caller.  If the implementation has a 'qdisc_get'
 * hook, its result, produced with 'details' as output, is returned.
 * NOTE(review): the handling of a tc_query_qdisc() failure and the value
 * returned when there is no hook are not visible in this excerpt. */
1709 netdev_linux_get_qos(const struct netdev *netdev,
1710 const char **typep, struct shash *details)
1712 struct netdev_dev_linux *netdev_dev =
1713 netdev_dev_linux_cast(netdev_get_dev(netdev));
1716 error = tc_query_qdisc(netdev);
1721 *typep = netdev_dev->tc->ops->ovs_name;
1722 return (netdev_dev->tc->ops->qdisc_get
1723 ? netdev_dev->tc->ops->qdisc_get(netdev, details)
/* Configures QoS of kind 'type' on 'netdev' with the settings in 'details'.
 *
 * 'type' must name an installable implementation (one with a 'tc_install'
 * hook).  When that implementation is already the active one, only its
 * optional 'qdisc_set' hook is invoked (0 is returned if it has none).
 * Otherwise the existing qdisc is deleted and the new one installed; the
 * assertions state that deletion leaves no tc state behind and that a
 * successful install, and only a successful one, leaves tc state in place.
 * NOTE(review): the error returns between the steps are not visible in this
 * excerpt. */
1728 netdev_linux_set_qos(struct netdev *netdev,
1729 const char *type, const struct shash *details)
1731 struct netdev_dev_linux *netdev_dev =
1732 netdev_dev_linux_cast(netdev_get_dev(netdev));
1733 const struct tc_ops *new_ops;
1736 new_ops = tc_lookup_ovs_name(type);
1737 if (!new_ops || !new_ops->tc_install) {
1741 error = tc_query_qdisc(netdev);
1746 if (new_ops == netdev_dev->tc->ops) {
1747 return new_ops->qdisc_set ? new_ops->qdisc_set(netdev, details) : 0;
1749 /* Delete existing qdisc. */
1750 error = tc_del_qdisc(netdev);
1754 assert(netdev_dev->tc == NULL);
1756 /* Install new qdisc. */
1757 error = new_ops->tc_install(netdev, details);
1758 assert((error == 0) == (netdev_dev->tc != NULL));
/* Describes queue 'queue_id' of 'netdev' in 'details'.
 *
 * After tc_query_qdisc(), the queue is looked up with tc_find_queue() and
 * handed to the active implementation's 'class_get' hook.
 * NOTE(review): the handling of a query failure and of a queue that is not
 * found is not visible in this excerpt. */
1765 netdev_linux_get_queue(const struct netdev *netdev,
1766 unsigned int queue_id, struct shash *details)
1768 struct netdev_dev_linux *netdev_dev =
1769 netdev_dev_linux_cast(netdev_get_dev(netdev));
1772 error = tc_query_qdisc(netdev);
1776 struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1778 ? netdev_dev->tc->ops->class_get(netdev, queue, details)
/* Creates or reconfigures queue 'queue_id' of 'netdev' from 'details' by
 * calling the active implementation's 'class_set' hook and returning its
 * result.
 *
 * The hook is not reached when 'queue_id' is not below the implementation's
 * 'n_queues' or when the implementation has no 'class_set' hook.
 * NOTE(review): the values returned in those cases and after a failed
 * tc_query_qdisc() are not visible in this excerpt. */
1784 netdev_linux_set_queue(struct netdev *netdev,
1785 unsigned int queue_id, const struct shash *details)
1787 struct netdev_dev_linux *netdev_dev =
1788 netdev_dev_linux_cast(netdev_get_dev(netdev));
1791 error = tc_query_qdisc(netdev);
1794 } else if (queue_id >= netdev_dev->tc->ops->n_queues
1795 || !netdev_dev->tc->ops->class_set) {
1799 return netdev_dev->tc->ops->class_set(netdev, queue_id, details);
/* Deletes queue 'queue_id' from 'netdev' through the active implementation's
 * 'class_delete' hook, after looking the queue up with tc_find_queue().
 *
 * An implementation without a 'class_delete' hook takes a separate branch.
 * NOTE(review): the values returned in that branch, after a failed
 * tc_query_qdisc(), and for a queue that is not found are not visible in
 * this excerpt. */
1803 netdev_linux_delete_queue(struct netdev *netdev, unsigned int queue_id)
1805 struct netdev_dev_linux *netdev_dev =
1806 netdev_dev_linux_cast(netdev_get_dev(netdev));
1809 error = tc_query_qdisc(netdev);
1812 } else if (!netdev_dev->tc->ops->class_delete) {
1815 struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1817 ? netdev_dev->tc->ops->class_delete(netdev, queue)
/* Fetches statistics for queue 'queue_id' of 'netdev' into 'stats' through
 * the active implementation's 'class_get_stats' hook, after looking the
 * queue up with tc_find_queue().
 *
 * An implementation without that hook takes a separate branch.
 * NOTE(review): the values returned in that branch, after a failed
 * tc_query_qdisc(), and for a queue that is not found are not visible in
 * this excerpt. */
1823 netdev_linux_get_queue_stats(const struct netdev *netdev,
1824 unsigned int queue_id,
1825 struct netdev_queue_stats *stats)
1827 struct netdev_dev_linux *netdev_dev =
1828 netdev_dev_linux_cast(netdev_get_dev(netdev));
1831 error = tc_query_qdisc(netdev);
1834 } else if (!netdev_dev->tc->ops->class_get_stats) {
1837 const struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1839 ? netdev_dev->tc->ops->class_get_stats(netdev, queue, stats)
/* Starts a Netlink dump of the traffic classes of 'netdev' into 'dump'.
 *
 * Builds an RTM_GETTCLASS request with 'tcm_parent' set to 0, starts the dump
 * on 'rtnl_sock', and releases the request buffer.
 * netdev_linux_dump_queue_stats() treats a false result as failure.
 * NOTE(review): the check on tc_make_request()'s result and the return
 * statements are not visible in this excerpt. */
1845 start_queue_dump(const struct netdev *netdev, struct nl_dump *dump)
1847 struct ofpbuf request;
1848 struct tcmsg *tcmsg;
1850 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
1854 tcmsg->tcm_parent = 0;
1855 nl_dump_start(dump, rtnl_sock, &request);
1856 ofpbuf_uninit(&request);
/* Calls 'cb' once for each queue known on 'netdev', passing the queue id, a
 * description obtained from the active implementation's 'class_get' hook,
 * and 'aux'.
 *
 * The same 'details' table is cleared and refilled for every queue and is
 * destroyed at the end, so 'cb' must not keep references into it.  The
 * _SAFE iteration macro is used for the walk, as one would if 'cb' may
 * delete the queue it is given -- presumably; confirm.
 * NOTE(review): how a 'class_get' failure is recorded and what is returned
 * are not visible in this excerpt. */
1861 netdev_linux_dump_queues(const struct netdev *netdev,
1862 netdev_dump_queues_cb *cb, void *aux)
1864 struct netdev_dev_linux *netdev_dev =
1865 netdev_dev_linux_cast(netdev_get_dev(netdev));
1866 struct tc_queue *queue, *next_queue;
1867 struct shash details;
1871 error = tc_query_qdisc(netdev);
1874 } else if (!netdev_dev->tc->ops->class_get) {
1879 shash_init(&details);
1880 HMAP_FOR_EACH_SAFE (queue, next_queue, hmap_node,
1881 &netdev_dev->tc->queues) {
1882 shash_clear(&details);
1884 error = netdev_dev->tc->ops->class_get(netdev, queue, &details);
1886 (*cb)(queue->queue_id, &details, aux);
1891 shash_destroy(&details);
/* Dumps the traffic classes of 'netdev' over Netlink and passes each reply
 * to the active implementation's 'class_dump_stats' hook, which receives
 * 'cb' and 'aux'.
 *
 * The result of nl_dump_done() takes precedence in the return value;
 * 'last_error' is returned only when finishing the dump succeeded.
 * NOTE(review): how 'last_error' is maintained inside the loop, and the
 * values returned by the early-exit branches, are not visible in this
 * excerpt. */
1897 netdev_linux_dump_queue_stats(const struct netdev *netdev,
1898 netdev_dump_queue_stats_cb *cb, void *aux)
1900 struct netdev_dev_linux *netdev_dev =
1901 netdev_dev_linux_cast(netdev_get_dev(netdev));
1902 struct nl_dump dump;
1907 error = tc_query_qdisc(netdev);
1910 } else if (!netdev_dev->tc->ops->class_dump_stats) {
1915 if (!start_queue_dump(netdev, &dump)) {
1918 while (nl_dump_next(&dump, &msg)) {
1919 error = netdev_dev->tc->ops->class_dump_stats(netdev, &msg, cb, aux);
1925 error = nl_dump_done(&dump);
1926 return error ? error : last_error;
/* Stores the IPv4 address and netmask of 'netdev_' in '*address' and
 * '*netmask'.
 *
 * The pair is cached in the device: when VALID_IN4 is clear it is fetched
 * with SIOCGIFADDR and SIOCGIFNETMASK and the bit is then set.  Returns
 * EADDRNOTAVAIL when the resulting address is INADDR_ANY, otherwise 0.
 * NOTE(review): the error returns after each of the two fetches are not
 * visible in this excerpt. */
1930 netdev_linux_get_in4(const struct netdev *netdev_,
1931 struct in_addr *address, struct in_addr *netmask)
1933 struct netdev_dev_linux *netdev_dev =
1934 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1936 if (!(netdev_dev->cache_valid & VALID_IN4)) {
1939 error = netdev_linux_get_ipv4(netdev_, &netdev_dev->address,
1940 SIOCGIFADDR, "SIOCGIFADDR");
1945 error = netdev_linux_get_ipv4(netdev_, &netdev_dev->netmask,
1946 SIOCGIFNETMASK, "SIOCGIFNETMASK");
1951 netdev_dev->cache_valid |= VALID_IN4;
1953 *address = netdev_dev->address;
1954 *netmask = netdev_dev->netmask;
1955 return address->s_addr == INADDR_ANY ? EADDRNOTAVAIL : 0;
/* Assigns IPv4 'address' and 'netmask' to 'netdev_'.
 *
 * The address is set first with SIOCSIFADDR.  The device's cached
 * address/netmask pair is then updated and marked valid, and the netmask is
 * set with SIOCSIFNETMASK only when 'address' is not INADDR_ANY.
 * NOTE(review): the condition under which the cache is updated (presumably
 * success of the first call) and the return statement are not visible in
 * this excerpt.  The cache is written before the netmask ioctl runs, so it
 * keeps 'netmask' even if that call fails. */
1959 netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
1960 struct in_addr netmask)
1962 struct netdev_dev_linux *netdev_dev =
1963 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1966 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
1968 netdev_dev->cache_valid |= VALID_IN4;
1969 netdev_dev->address = address;
1970 netdev_dev->netmask = netmask;
1971 if (address.s_addr != INADDR_ANY) {
1972 error = do_set_addr(netdev_, SIOCSIFNETMASK,
1973 "SIOCSIFNETMASK", netmask);
/* Parses 'line', one line of /proc/net/if_inet6, into an address and an
 * interface name.
 *
 * The expected layout is 32 hexadecimal digits (the 16 address bytes, most
 * significant first), four hexadecimal fields that are skipped, and the
 * interface name.  On success the bytes are stored in 'in6', the name (at
 * most 16 characters plus the terminating null) in 'ifname', and true is
 * returned.  On a malformed line false is returned and 'in6' and 'ifname'
 * may have been partially written. */
static bool
parse_if_inet6_line(const char *line,
                    struct in6_addr *in6, char ifname[16 + 1])
{
    uint8_t *s6 = in6->s6_addr;
    int n_converted;

    /* One two-digit hexadecimal byte.  Scoped to this function: undefined
     * again below so it cannot leak into the rest of the file. */
#define X8 "%2"SCNx8
    n_converted = sscanf(line,
                         " "X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8
                         "%*x %*x %*x %*x %16s\n",
                         &s6[0], &s6[1], &s6[2], &s6[3],
                         &s6[4], &s6[5], &s6[6], &s6[7],
                         &s6[8], &s6[9], &s6[10], &s6[11],
                         &s6[12], &s6[13], &s6[14], &s6[15],
                         ifname);
#undef X8

    /* 16 address bytes plus the interface name; the "%*x" fields are
     * suppressed and do not count. */
    return n_converted == 17;
}
1995 /* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address (if
1996 * 'in6' is non-null) and returns true. Otherwise, returns false. */
/* The address is cached in the device.  When VALID_IN6 is clear, the cache
 * is first reset to in6addr_any and /proc/net/if_inet6 is scanned line by
 * line for an entry whose interface name equals this device's name; the
 * VALID_IN6 bit is then set.
 *
 * NOTE(review): the visible code stores into '*in6' without a null check,
 * which does not match "(if 'in6' is non-null)" above.  The return
 * statements, whether the scan stops at the first match, and the closing of
 * the file are not visible in this excerpt. */
1998 netdev_linux_get_in6(const struct netdev *netdev_, struct in6_addr *in6)
2000 struct netdev_dev_linux *netdev_dev =
2001 netdev_dev_linux_cast(netdev_get_dev(netdev_));
2002 if (!(netdev_dev->cache_valid & VALID_IN6)) {
2006 netdev_dev->in6 = in6addr_any;
2008 file = fopen("/proc/net/if_inet6", "r");
2010 const char *name = netdev_get_name(netdev_);
2011 while (fgets(line, sizeof line, file)) {
2012 struct in6_addr in6_tmp;
2013 char ifname[16 + 1];
2014 if (parse_if_inet6_line(line, &in6_tmp, ifname)
2015 && !strcmp(name, ifname))
2017 netdev_dev->in6 = in6_tmp;
2023 netdev_dev->cache_valid |= VALID_IN6;
2025 *in6 = netdev_dev->in6;
/* Fills '*sa' with an AF_INET socket address for 'addr' and port 0.
 *
 * The address is assembled in a zeroed 'struct sockaddr_in' and then copied
 * over a zeroed '*sa', so no stale bytes survive in either structure.  This
 * relies on 'struct sockaddr_in' being no larger than 'struct sockaddr'. */
static void
make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
{
    struct sockaddr_in sin;

    memset(&sin, 0, sizeof sin);
    sin.sin_family = AF_INET;
    sin.sin_addr = addr;
    sin.sin_port = 0;

    memset(sa, 0, sizeof *sa);
    memcpy(sa, &sin, sizeof sin);
}
/* Issues the address-setting ioctl 'ioctl_nr' on 'netdev' with 'addr'.
 *
 * The request is an 'ifreq' carrying the device name and 'addr' wrapped by
 * make_in4_sockaddr().  'ioctl_name' is the request's printable name; the
 * line that passes it on is not visible in this excerpt.  Returns whatever
 * netdev_linux_do_ioctl() returns. */
2043 do_set_addr(struct netdev *netdev,
2044 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
2047 ovs_strzcpy(ifr.ifr_name, netdev_get_name(netdev), sizeof ifr.ifr_name);
2048 make_in4_sockaddr(&ifr.ifr_addr, addr);
2050 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
2054 /* Adds 'router' as a default IP gateway. */
/* The route is described by an all-zero destination and an all-zero netmask
 * with 'router' as the gateway, flagged RTF_UP | RTF_GATEWAY, and is
 * installed with SIOCADDRT on 'af_inet_sock'.  'netdev' is unused, so the
 * route is not tied to a particular device.  A failed ioctl is turned into
 * its errno value.  NOTE(review): the condition guarding the warning and the
 * return statement are not visible in this excerpt. */
2056 netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
2058 struct in_addr any = { INADDR_ANY };
2062 memset(&rt, 0, sizeof rt);
2063 make_in4_sockaddr(&rt.rt_dst, any);
2064 make_in4_sockaddr(&rt.rt_gateway, router);
2065 make_in4_sockaddr(&rt.rt_genmask, any);
2066 rt.rt_flags = RTF_UP | RTF_GATEWAY;
2067 error = ioctl(af_inet_sock, SIOCADDRT, &rt) < 0 ? errno : 0;
2069 VLOG_WARN("ioctl(SIOCADDRT): %s", strerror(error));
/* Finds, in /proc/net/route, a route that reaches 'host'.
 *
 * '*netdev_name' starts out null.  Each line of the file is parsed into 11
 * fields; lines that fail to parse are reported and routes without RTF_UP
 * are passed over.  A route matches when 'host' and the route's destination
 * agree under the route's mask.  For a match, '*next_hop' is set either to 0
 * (host directly reachable) or to the route's gateway, and '*netdev_name'
 * receives an xstrdup() copy of the interface name, which the caller
 * presumably has to free.
 *
 * NOTE(review): the 'netdev_name' parameter line, the test that chooses
 * between "direct" and "gateway", the closing of the stream and the return
 * statements are not visible in this excerpt. */
2075 netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
2078 static const char fn[] = "/proc/net/route";
2083 *netdev_name = NULL;
2084 stream = fopen(fn, "r");
2085 if (stream == NULL) {
2086 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
2091 while (fgets(line, sizeof line, stream)) {
2094 ovs_be32 dest, gateway, mask;
2095 int refcnt, metric, mtu;
2096 unsigned int flags, use, window, irtt;
2099 "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
2101 iface, &dest, &gateway, &flags, &refcnt,
2102 &use, &metric, &mask, &mtu, &window, &irtt) != 11) {
2104 VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
2108 if (!(flags & RTF_UP)) {
2109 /* Skip routes that aren't up. */
2113 /* The output of 'dest', 'mask', and 'gateway' were given in
2114 * network byte order, so we don't need any endian
2115 * conversions here. */
2116 if ((dest & mask) == (host->s_addr & mask)) {
2118 /* The host is directly reachable. */
2119 next_hop->s_addr = 0;
2121 /* To reach the host, we must go through a gateway. */
2122 next_hop->s_addr = gateway;
2124 *netdev_name = xstrdup(iface);
/* Adds driver information about 'netdev' to the string table 'sh'.
 *
 * The data comes from an ethtool driver-info query (the request name passed
 * along is "ETHTOOL_GDRVINFO").  Three entries are added, "driver_name",
 * "driver_version" and "firmware_version", each with an xstrdup() copy of
 * the corresponding field.  NOTE(review): the condition under which the
 * entries are added (presumably success of the query) and the return
 * statement are not visible in this excerpt. */
2136 netdev_linux_get_status(const struct netdev *netdev, struct shash *sh)
2138 struct ethtool_drvinfo drvinfo;
2141 memset(&drvinfo, 0, sizeof drvinfo);
    /* The ethtool helper takes a 'struct ethtool_cmd *'; the driver-info
     * buffer is passed through that type with a cast. */
2142 error = netdev_linux_do_ethtool(netdev_get_name(netdev),
2143 (struct ethtool_cmd *)&drvinfo,
2145 "ETHTOOL_GDRVINFO");
2147 shash_add(sh, "driver_name", xstrdup(drvinfo.driver));
2148 shash_add(sh, "driver_version", xstrdup(drvinfo.version));
2149 shash_add(sh, "firmware_version", xstrdup(drvinfo.fw_version));
2155 /* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
2156 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
2157 * returns 0. Otherwise, it returns a positive errno value; in particular,
2158 * ENXIO indicates that there is no ARP table entry for 'ip' on 'netdev'. */
2160 netdev_linux_arp_lookup(const struct netdev *netdev,
2161 ovs_be32 ip, uint8_t mac[ETH_ADDR_LEN])
2164 struct sockaddr_in sin;
    /* Build the SIOCGARP request: protocol address 'ip', Ethernet hardware
     * type, restricted to this device by name. */
2167 memset(&r, 0, sizeof r);
2168 memset(&sin, 0, sizeof sin);
2169 sin.sin_family = AF_INET;
2170 sin.sin_addr.s_addr = ip;
2172 memcpy(&r.arp_pa, &sin, sizeof sin);
2173 r.arp_ha.sa_family = ARPHRD_ETHER;
2175 ovs_strzcpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
2176 COVERAGE_INC(netdev_arp_lookup);
2177 retval = ioctl(af_inet_sock, SIOCGARP, &r) < 0 ? errno : 0;
    /* ENXIO is an expected outcome and is not logged; any other failure
     * gets a rate-limited warning. */
2179 memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
2180 } else if (retval != ENXIO) {
2181 VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
2182 netdev_get_name(netdev), IP_ARGS(&ip), strerror(retval));
/* Translates the NETDEV_* flag set 'nd' to kernel interface flags.  Two
 * flags are examined, NETDEV_UP and NETDEV_PROMISC.  NOTE(review): the
 * statements that set the corresponding bits and the return statement are
 * not visible in this excerpt; netdev_linux_update_flags() uses the result
 * as a bit mask over the interface flags. */
2188 nd_to_iff_flags(enum netdev_flags nd)
2191 if (nd & NETDEV_UP) {
2194 if (nd & NETDEV_PROMISC) {
/* Translates kernel interface flags 'iff' to a NETDEV_* flag set, starting
 * from an empty set.  IFF_PROMISC maps to NETDEV_PROMISC.  NOTE(review):
 * the mapping of any other flag and the return statement are not visible in
 * this excerpt. */
2201 iff_to_nd_flags(int iff)
2203 enum netdev_flags nd = 0;
2207 if (iff & IFF_PROMISC) {
2208 nd |= NETDEV_PROMISC;
/* Turns off the flags in 'off' and turns on the flags in 'on' for 'netdev',
 * reporting the previous flags through '*old_flagsp'.
 *
 * The current interface flags are read with get_flags() and reported, in
 * NETDEV_* form, before any change.  Bits named in 'off' are cleared first
 * and bits named in 'on' are set afterwards, so a flag present in both ends
 * up set.  set_flags() is called only when the result differs from the
 * current flags.  NOTE(review): the handling of a get_flags() failure and
 * the return statement are not visible in this excerpt. */
2214 netdev_linux_update_flags(struct netdev *netdev, enum netdev_flags off,
2215 enum netdev_flags on, enum netdev_flags *old_flagsp)
2217 int old_flags, new_flags;
2220 error = get_flags(netdev, &old_flags);
2222 *old_flagsp = iff_to_nd_flags(old_flags);
2223 new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
2224 if (new_flags != old_flags) {
2225 error = set_flags(netdev, new_flags);
/* Returns the 'change_seq' value stored in the device that backs 'netdev'. */
2232 netdev_linux_change_seq(const struct netdev *netdev)
2234 return netdev_dev_linux_cast(netdev_get_dev(netdev))->change_seq;
/* Expands to the initializer of a 'struct netdev_class' whose hooks are the
 * netdev_linux_*() functions of this file.  Only the arguments vary between
 * the classes defined with it below: NAME, CREATE, GET_STATS and SET_STATS.
 * The 'get_config' and 'set_config' hooks are left null.  NOTE(review):
 * several entries of the initializer, including the ones where the macro
 * arguments are substituted, are not visible in this excerpt. */
2237 #define NETDEV_LINUX_CLASS(NAME, CREATE, GET_STATS, SET_STATS) \
2241 netdev_linux_init, \
2243 netdev_linux_wait, \
2246 netdev_linux_destroy, \
2247 NULL, /* get_config */ \
2248 NULL, /* set_config */ \
2250 netdev_linux_open, \
2251 netdev_linux_close, \
2253 netdev_linux_listen, \
2254 netdev_linux_recv, \
2255 netdev_linux_recv_wait, \
2256 netdev_linux_drain, \
2258 netdev_linux_send, \
2259 netdev_linux_send_wait, \
2261 netdev_linux_set_etheraddr, \
2262 netdev_linux_get_etheraddr, \
2263 netdev_linux_get_mtu, \
2264 netdev_linux_set_mtu, \
2265 netdev_linux_get_ifindex, \
2266 netdev_linux_get_carrier, \
2267 netdev_linux_get_carrier_resets, \
2268 netdev_linux_set_miimon_interval, \
2272 netdev_linux_get_features, \
2273 netdev_linux_set_advertisements, \
2275 netdev_linux_set_policing, \
2276 netdev_linux_get_qos_types, \
2277 netdev_linux_get_qos_capabilities, \
2278 netdev_linux_get_qos, \
2279 netdev_linux_set_qos, \
2280 netdev_linux_get_queue, \
2281 netdev_linux_set_queue, \
2282 netdev_linux_delete_queue, \
2283 netdev_linux_get_queue_stats, \
2284 netdev_linux_dump_queues, \
2285 netdev_linux_dump_queue_stats, \
2287 netdev_linux_get_in4, \
2288 netdev_linux_set_in4, \
2289 netdev_linux_get_in6, \
2290 netdev_linux_add_router, \
2291 netdev_linux_get_next_hop, \
2292 netdev_linux_get_status, \
2293 netdev_linux_arp_lookup, \
2295 netdev_linux_update_flags, \
2297 netdev_linux_change_seq \
/* Device class created by netdev_linux_create(); statistics come from
 * netdev_linux_get_stats() and there is no 'set_stats' hook.  This is the
 * class whose devices the miimon run/wait functions iterate over.
 * NOTE(review): the class name argument is not visible in this excerpt. */
2300 const struct netdev_class netdev_linux_class =
2303 netdev_linux_create,
2304 netdev_linux_get_stats,
2305 NULL); /* set_stats */
/* Device class created by netdev_linux_create_tap(); statistics come from
 * netdev_pseudo_get_stats() and there is no 'set_stats' hook.
 * NOTE(review): the class name argument is not visible in this excerpt. */
2307 const struct netdev_class netdev_tap_class =
2310 netdev_linux_create_tap,
2311 netdev_pseudo_get_stats,
2312 NULL); /* set_stats */
/* Device class created by netdev_linux_create(); statistics come from
 * netdev_pseudo_get_stats() and can be overridden through
 * netdev_vport_set_stats().  NOTE(review): the class name argument is not
 * visible in this excerpt. */
2314 const struct netdev_class netdev_internal_class =
2317 netdev_linux_create,
2318 netdev_pseudo_get_stats,
2319 netdev_vport_set_stats);
2321 /* HTB traffic control class. */
/* Number of queue ids available under HTB: htb_parse_tcmsg__() accepts
 * class minor numbers 1 through HTB_N_QUEUES and maps minor N to queue id
 * N - 1. */
2323 #define HTB_N_QUEUES 0xf000
/* Member of the per-device HTB state; htb_install__() fills it in and
 * htb_parse_class_details__() uses it as the ceiling for per-queue rates.
 * NOTE(review): the enclosing struct's header and its other members are not
 * visible in this excerpt. */
2327 unsigned int max_rate; /* In bytes/s. */
/* Members of the per-queue HTB parameter record ('struct htb_class', whose
 * header line is not visible in this excerpt).  'tc_queue' is the embedded
 * generic queue; htb_class_cast__() maps a pointer to it back to the
 * enclosing record with CONTAINER_OF. */
2331 struct tc_queue tc_queue;
2332 unsigned int min_rate; /* In bytes/s. */
2333 unsigned int max_rate; /* In bytes/s. */
2334 unsigned int burst; /* In bytes. */
2335 unsigned int priority; /* Lower values are higher priorities. */
/* Returns the HTB state that contains the generic traffic-control state
 * attached to 'netdev'.  The conversion is a plain CONTAINER_OF, so the
 * result is only meaningful when the attached state really is the 'tc'
 * member of a 'struct htb'. */
2339 htb_get__(const struct netdev *netdev)
2341 struct netdev_dev_linux *netdev_dev =
2342 netdev_dev_linux_cast(netdev_get_dev(netdev));
2343 return CONTAINER_OF(netdev_dev->tc, struct htb, tc);
/* Allocates HTB state for 'netdev', initializes its generic part with
 * tc_init() and 'tc_ops_htb', records 'max_rate', and attaches the state to
 * the device as its traffic-control state.  Any state previously attached is
 * overwritten without being released here.  'max_rate' arrives as a 64-bit
 * value but is stored in a member documented as 'unsigned int' bytes/s. */
2347 htb_install__(struct netdev *netdev, uint64_t max_rate)
2349 struct netdev_dev_linux *netdev_dev =
2350 netdev_dev_linux_cast(netdev_get_dev(netdev));
2353 htb = xmalloc(sizeof *htb);
2354 tc_init(&htb->tc, &tc_ops_htb);
2355 htb->max_rate = max_rate;
2357 netdev_dev->tc = &htb->tc;
2360 /* Create an HTB qdisc.
2362 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
/* Any existing qdisc is deleted first; the result of that deletion is
 * ignored.  The request is an RTM_NEWQDISC with NLM_F_EXCL | NLM_F_CREATE,
 * handle 1:0 and the root as parent, kind "htb", and a TCA_HTB_INIT option
 * block nested inside TCA_OPTIONS.  Returns the result of tc_transact().
 * NOTE(review): the check on tc_make_request()'s result and the other fields
 * of the option block are not visible in this excerpt. */
2364 htb_setup_qdisc__(struct netdev *netdev)
2367 struct tc_htb_glob opt;
2368 struct ofpbuf request;
2369 struct tcmsg *tcmsg;
2371 tc_del_qdisc(netdev);
2373 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2374 NLM_F_EXCL | NLM_F_CREATE, &request);
2378 tcmsg->tcm_handle = tc_make_handle(1, 0);
2379 tcmsg->tcm_parent = TC_H_ROOT;
2381 nl_msg_put_string(&request, TCA_KIND, "htb");
2383 memset(&opt, 0, sizeof opt);
2384 opt.rate2quantum = 10;
2388 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2389 nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
2390 nl_msg_end_nested(&request, opt_offset);
2392 return tc_transact(&request, NULL);
2395 /* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
2396 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
/* 'handle' and 'parent' are packed major:minor class handles and 'class'
 * supplies the rates, burst and priority.
 *
 * The device MTU is needed to fill in the rate tables and buffer sizes; a
 * device without one is reported with a warning.  The request is an
 * RTM_NEWTCLASS with NLM_F_CREATE, kind "htb", carrying TCA_HTB_PARMS plus
 * the rate and ceil tables nested inside TCA_OPTIONS.
 * NOTE(review): the conditions guarding the two warnings, the check on
 * tc_make_request()'s result and the return statements are not visible in
 * this excerpt. */
2398 htb_setup_class__(struct netdev *netdev, unsigned int handle,
2399 unsigned int parent, struct htb_class *class)
2402 struct tc_htb_opt opt;
2403 struct ofpbuf request;
2404 struct tcmsg *tcmsg;
2408 error = netdev_get_mtu(netdev, &mtu);
2410 VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU",
2411 netdev_get_name(netdev));
    /* 'rate' is built from the class's minimum rate and 'ceil' from its
     * maximum; both buffers are sized from the same burst value. */
2415 memset(&opt, 0, sizeof opt);
2416 tc_fill_rate(&opt.rate, class->min_rate, mtu);
2417 tc_fill_rate(&opt.ceil, class->max_rate, mtu);
2418 opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
2419 opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
2420 opt.prio = class->priority;
2422 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
2426 tcmsg->tcm_handle = handle;
2427 tcmsg->tcm_parent = parent;
2429 nl_msg_put_string(&request, TCA_KIND, "htb");
2430 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2431 nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
2432 tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
2433 tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
2434 nl_msg_end_nested(&request, opt_offset);
2436 error = tc_transact(&request, NULL);
2438 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
2439 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
2440 netdev_get_name(netdev),
2441 tc_get_major(handle), tc_get_minor(handle),
2442 tc_get_major(parent), tc_get_minor(parent),
2443 class->min_rate, class->max_rate,
2444 class->burst, class->priority, strerror(error));
2449 /* Parses Netlink attributes in 'nl_options' for HTB parameters and stores a
2450 * description of them into 'class'. The description complies with the
2451 * specification given in the vswitch database documentation for linux-htb
 * queue details. */
/* Extracts the HTB parameters nested in 'nl_options' into 'class'.
 *
 * The policy requires a TCA_HTB_PARMS attribute at least as large as a
 * 'struct tc_htb_opt'; a message that does not satisfy it is reported with a
 * warning.  From that structure the minimum rate is taken from 'rate', the
 * maximum rate from 'ceil', the burst is converted from the 'buffer' tick
 * count with tc_ticks_to_bytes(), and the priority is copied.
 * NOTE(review): the return statements are not visible in this excerpt;
 * htb_parse_tcmsg__() assigns the result to 'error'. */
2454 htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
2456 static const struct nl_policy tca_htb_policy[] = {
2457 [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
2458 .min_len = sizeof(struct tc_htb_opt) },
2461 struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
2462 const struct tc_htb_opt *htb;
2464 if (!nl_parse_nested(nl_options, tca_htb_policy,
2465 attrs, ARRAY_SIZE(tca_htb_policy))) {
2466 VLOG_WARN_RL(&rl, "failed to parse HTB class options");
2470 htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
2471 class->min_rate = htb->rate.rate;
2472 class->max_rate = htb->ceil.rate;
2473 class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
2474 class->priority = htb->prio;
/* Parses the traffic-class message 'tcmsg'.
 *
 * tc_parse_class() extracts the class handle, the nested options and (into
 * 'stats') the statistics.  'queue_id' and 'options' are optional outputs:
 *
 *   - If 'queue_id' is nonnull, the handle must have major number 1 and a
 *     minor number in the range 1...HTB_N_QUEUES; the queue id is the minor
 *     number minus 1.
 *
 *   - If 'options' is nonnull, the nested options are decoded into it with
 *     htb_parse_tca_options__().
 *
 * NOTE(review): the error assigned for a handle outside that range and the
 * return statement are not visible in this excerpt. */
2479 htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2480 struct htb_class *options,
2481 struct netdev_queue_stats *stats)
2483 struct nlattr *nl_options;
2484 unsigned int handle;
2487 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2488 if (!error && queue_id) {
2489 unsigned int major = tc_get_major(handle);
2490 unsigned int minor = tc_get_minor(handle);
2491 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2492 *queue_id = minor - 1;
2497 if (!error && options) {
2498 error = htb_parse_tca_options__(nl_options, options);
2504 htb_parse_qdisc_details__(struct netdev *netdev,
2505 const struct shash *details, struct htb_class *hc)
2507 const char *max_rate_s;
2509 max_rate_s = shash_find_data(details, "max-rate");
2510 hc->max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
2511 if (!hc->max_rate) {
2514 netdev_get_features(netdev, ¤t, NULL, NULL, NULL);
2515 hc->max_rate = netdev_features_to_bps(current) / 8;
2517 hc->min_rate = hc->max_rate;
/* Fills '*hc' with per-queue HTB parameters taken from 'details'.
 *
 * The entries read are "min-rate", "max-rate", "burst" and "priority", all
 * decimal strings.  The first three are divided by 8 before being stored in
 * fields documented as bytes/s or bytes, so they are presumably expressed in
 * bits -- confirm against the database documentation.  The device MTU and
 * the qdisc-wide maximum rate ('htb->max_rate') bound the results:
 *
 *   - min_rate is raised to at least the MTU, then capped at the qdisc
 *     maximum.
 *
 *   - max_rate is raised to at least min_rate, then capped at the qdisc
 *     maximum.
 *
 *   - burst is raised to at least MTU + 64.
 *
 *   - priority defaults to 0.
 *
 * NOTE(review): the default used when "max-rate" is absent, the condition
 * guarding the MTU warning and the return statements are not visible in
 * this excerpt. */
2523 htb_parse_class_details__(struct netdev *netdev,
2524 const struct shash *details, struct htb_class *hc)
2526 const struct htb *htb = htb_get__(netdev);
2527 const char *min_rate_s = shash_find_data(details, "min-rate");
2528 const char *max_rate_s = shash_find_data(details, "max-rate");
2529 const char *burst_s = shash_find_data(details, "burst");
2530 const char *priority_s = shash_find_data(details, "priority");
2533 error = netdev_get_mtu(netdev, &mtu);
2535 VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU",
2536 netdev_get_name(netdev));
2540 /* HTB requires at least an mtu sized min-rate to send any traffic even
2541 * on uncongested links. */
2542 hc->min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
2543 hc->min_rate = MAX(hc->min_rate, mtu);
2544 hc->min_rate = MIN(hc->min_rate, htb->max_rate);
2547 hc->max_rate = (max_rate_s
2548 ? strtoull(max_rate_s, NULL, 10) / 8
2550 hc->max_rate = MAX(hc->max_rate, hc->min_rate);
2551 hc->max_rate = MIN(hc->max_rate, htb->max_rate);
2555 * According to hints in the documentation that I've read, it is important
2556 * that 'burst' be at least as big as the largest frame that might be
2557 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
2558 * but having it a bit too small is a problem. Since netdev_get_mtu()
2559 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
2560 * the MTU. We actually add 64, instead of 14, as a guard against
2561 * additional headers get tacked on somewhere that we're not aware of. */
2562 hc->burst = burst_s ? strtoull(burst_s, NULL, 10) / 8 : 0;
2563 hc->burst = MAX(hc->burst, mtu + 64);
2566 hc->priority = priority_s ? strtoul(priority_s, NULL, 10) : 0;
2572 htb_query_class__(const struct netdev *netdev, unsigned int handle,
2573 unsigned int parent, struct htb_class *options,
2574 struct netdev_queue_stats *stats)
2576 struct ofpbuf *reply;
2579 error = tc_query_class(netdev, handle, parent, &reply);
2581 error = htb_parse_tcmsg__(reply, NULL, options, stats);
2582 ofpbuf_delete(reply);
2588 htb_tc_install(struct netdev *netdev, const struct shash *details)
2592 error = htb_setup_qdisc__(netdev);
2594 struct htb_class hc;
2596 htb_parse_qdisc_details__(netdev, details, &hc);
2597 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2598 tc_make_handle(1, 0), &hc);
2600 htb_install__(netdev, hc.max_rate);
2606 static struct htb_class *
2607 htb_class_cast__(const struct tc_queue *queue)
2609 return CONTAINER_OF(queue, struct htb_class, tc_queue);
2613 htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
2614 const struct htb_class *hc)
2616 struct htb *htb = htb_get__(netdev);
2617 size_t hash = hash_int(queue_id, 0);
2618 struct tc_queue *queue;
2619 struct htb_class *hcp;
2621 queue = tc_find_queue__(netdev, queue_id, hash);
2623 hcp = htb_class_cast__(queue);
2625 hcp = xmalloc(sizeof *hcp);
2626 queue = &hcp->tc_queue;
2627 queue->queue_id = queue_id;
2628 hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
2631 hcp->min_rate = hc->min_rate;
2632 hcp->max_rate = hc->max_rate;
2633 hcp->burst = hc->burst;
2634 hcp->priority = hc->priority;
2638 htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
2641 struct nl_dump dump;
2642 struct htb_class hc;
2644 /* Get qdisc options. */
2646 htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
2647 htb_install__(netdev, hc.max_rate);
2650 if (!start_queue_dump(netdev, &dump)) {
2653 while (nl_dump_next(&dump, &msg)) {
2654 unsigned int queue_id;
2656 if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
2657 htb_update_queue__(netdev, queue_id, &hc);
2660 nl_dump_done(&dump);
2666 htb_tc_destroy(struct tc *tc)
2668 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
2669 struct htb_class *hc, *next;
2671 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &htb->tc.queues) {
2672 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
2680 htb_qdisc_get(const struct netdev *netdev, struct shash *details)
2682 const struct htb *htb = htb_get__(netdev);
2683 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * htb->max_rate));
2688 htb_qdisc_set(struct netdev *netdev, const struct shash *details)
2690 struct htb_class hc;
2693 htb_parse_qdisc_details__(netdev, details, &hc);
2694 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2695 tc_make_handle(1, 0), &hc);
2697 htb_get__(netdev)->max_rate = hc.max_rate;
2703 htb_class_get(const struct netdev *netdev OVS_UNUSED,
2704 const struct tc_queue *queue, struct shash *details)
2706 const struct htb_class *hc = htb_class_cast__(queue);
2708 shash_add(details, "min-rate", xasprintf("%llu", 8ULL * hc->min_rate));
2709 if (hc->min_rate != hc->max_rate) {
2710 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hc->max_rate));
2712 shash_add(details, "burst", xasprintf("%llu", 8ULL * hc->burst));
2714 shash_add(details, "priority", xasprintf("%u", hc->priority));
2720 htb_class_set(struct netdev *netdev, unsigned int queue_id,
2721 const struct shash *details)
2723 struct htb_class hc;
2726 error = htb_parse_class_details__(netdev, details, &hc);
2731 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
2732 tc_make_handle(1, 0xfffe), &hc);
2737 htb_update_queue__(netdev, queue_id, &hc);
2742 htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
2744 struct htb_class *hc = htb_class_cast__(queue);
2745 struct htb *htb = htb_get__(netdev);
2748 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
2750 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
2757 htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
2758 struct netdev_queue_stats *stats)
2760 return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
2761 tc_make_handle(1, 0xfffe), NULL, stats);
2765 htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
2766 const struct ofpbuf *nlmsg,
2767 netdev_dump_queue_stats_cb *cb, void *aux)
2769 struct netdev_queue_stats stats;
2770 unsigned int handle, major, minor;
2773 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
2778 major = tc_get_major(handle);
2779 minor = tc_get_minor(handle);
2780 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2781 (*cb)(minor - 1, &stats, aux);
2786 static const struct tc_ops tc_ops_htb = {
2787 "htb", /* linux_name */
2788 "linux-htb", /* ovs_name */
2789 HTB_N_QUEUES, /* n_queues */
2798 htb_class_get_stats,
2799 htb_class_dump_stats
2802 /* "linux-hfsc" traffic control class. */
2804 #define HFSC_N_QUEUES 0xf000
2812 struct tc_queue tc_queue;
2817 static struct hfsc *
2818 hfsc_get__(const struct netdev *netdev)
2820 struct netdev_dev_linux *netdev_dev;
2821 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
2822 return CONTAINER_OF(netdev_dev->tc, struct hfsc, tc);
2825 static struct hfsc_class *
2826 hfsc_class_cast__(const struct tc_queue *queue)
2828 return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
2832 hfsc_install__(struct netdev *netdev, uint32_t max_rate)
2834 struct netdev_dev_linux * netdev_dev;
2837 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
2838 hfsc = xmalloc(sizeof *hfsc);
2839 tc_init(&hfsc->tc, &tc_ops_hfsc);
2840 hfsc->max_rate = max_rate;
2841 netdev_dev->tc = &hfsc->tc;
2845 hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
2846 const struct hfsc_class *hc)
2850 struct hfsc_class *hcp;
2851 struct tc_queue *queue;
2853 hfsc = hfsc_get__(netdev);
2854 hash = hash_int(queue_id, 0);
2856 queue = tc_find_queue__(netdev, queue_id, hash);
2858 hcp = hfsc_class_cast__(queue);
2860 hcp = xmalloc(sizeof *hcp);
2861 queue = &hcp->tc_queue;
2862 queue->queue_id = queue_id;
2863 hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
2866 hcp->min_rate = hc->min_rate;
2867 hcp->max_rate = hc->max_rate;
2871 hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
2873 const struct tc_service_curve *rsc, *fsc, *usc;
2874 static const struct nl_policy tca_hfsc_policy[] = {
2876 .type = NL_A_UNSPEC,
2878 .min_len = sizeof(struct tc_service_curve),
2881 .type = NL_A_UNSPEC,
2883 .min_len = sizeof(struct tc_service_curve),
2886 .type = NL_A_UNSPEC,
2888 .min_len = sizeof(struct tc_service_curve),
2891 struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];
2893 if (!nl_parse_nested(nl_options, tca_hfsc_policy,
2894 attrs, ARRAY_SIZE(tca_hfsc_policy))) {
2895 VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
2899 rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
2900 fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
2901 usc = nl_attr_get(attrs[TCA_HFSC_USC]);
2903 if (rsc->m1 != 0 || rsc->d != 0 ||
2904 fsc->m1 != 0 || fsc->d != 0 ||
2905 usc->m1 != 0 || usc->d != 0) {
2906 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
2907 "Non-linear service curves are not supported.");
2911 if (rsc->m2 != fsc->m2) {
2912 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
2913 "Real-time service curves are not supported ");
2917 if (rsc->m2 > usc->m2) {
2918 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
2919 "Min-rate service curve is greater than "
2920 "the max-rate service curve.");
2924 class->min_rate = fsc->m2;
2925 class->max_rate = usc->m2;
2930 hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2931 struct hfsc_class *options,
2932 struct netdev_queue_stats *stats)
2935 unsigned int handle;
2936 struct nlattr *nl_options;
2938 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2944 unsigned int major, minor;
2946 major = tc_get_major(handle);
2947 minor = tc_get_minor(handle);
2948 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
2949 *queue_id = minor - 1;
2956 error = hfsc_parse_tca_options__(nl_options, options);
/* Queries the kernel for the HFSC class 'handle' (child of 'parent') on
 * 'netdev' and parses the reply into '*options' and '*stats', either of which
 * may be null.
 *
 * Returns 0 if successful, otherwise a positive errno value. */
static int
hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
                   unsigned int parent, struct hfsc_class *options,
                   struct netdev_queue_stats *stats)
{
    int error;
    struct ofpbuf *reply;

    error = tc_query_class(netdev, handle, parent, &reply);
    if (error) {
        return error;
    }

    error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
    ofpbuf_delete(reply);
    return error;
}
2981 hfsc_parse_qdisc_details__(struct netdev *netdev, const struct shash *details,
2982 struct hfsc_class *class)
2985 const char *max_rate_s;
2987 max_rate_s = shash_find_data(details, "max-rate");
2988 max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
2993 netdev_get_features(netdev, ¤t, NULL, NULL, NULL);
2994 max_rate = netdev_features_to_bps(current) / 8;
2997 class->min_rate = max_rate;
2998 class->max_rate = max_rate;
3002 hfsc_parse_class_details__(struct netdev *netdev,
3003 const struct shash *details,
3004 struct hfsc_class * class)
3006 const struct hfsc *hfsc;
3007 uint32_t min_rate, max_rate;
3008 const char *min_rate_s, *max_rate_s;
3010 hfsc = hfsc_get__(netdev);
3011 min_rate_s = shash_find_data(details, "min-rate");
3012 max_rate_s = shash_find_data(details, "max-rate");
3014 min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
3015 min_rate = MAX(min_rate, 1);
3016 min_rate = MIN(min_rate, hfsc->max_rate);
3018 max_rate = (max_rate_s
3019 ? strtoull(max_rate_s, NULL, 10) / 8
3021 max_rate = MAX(max_rate, min_rate);
3022 max_rate = MIN(max_rate, hfsc->max_rate);
3024 class->min_rate = min_rate;
3025 class->max_rate = max_rate;
3030 /* Create an HFSC qdisc.
3032 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
3034 hfsc_setup_qdisc__(struct netdev * netdev)
3036 struct tcmsg *tcmsg;
3037 struct ofpbuf request;
3038 struct tc_hfsc_qopt opt;
3040 tc_del_qdisc(netdev);
3042 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3043 NLM_F_EXCL | NLM_F_CREATE, &request);
3049 tcmsg->tcm_handle = tc_make_handle(1, 0);
3050 tcmsg->tcm_parent = TC_H_ROOT;
3052 memset(&opt, 0, sizeof opt);
3055 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3056 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
3058 return tc_transact(&request, NULL);
3061 /* Create an HFSC class.
3063 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
3064 * sc rate <min_rate> ul rate <max_rate>" */
3066 hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
3067 unsigned int parent, struct hfsc_class *class)
3071 struct tcmsg *tcmsg;
3072 struct ofpbuf request;
3073 struct tc_service_curve min, max;
3075 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
3081 tcmsg->tcm_handle = handle;
3082 tcmsg->tcm_parent = parent;
3086 min.m2 = class->min_rate;
3090 max.m2 = class->max_rate;
3092 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3093 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3094 nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
3095 nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
3096 nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
3097 nl_msg_end_nested(&request, opt_offset);
3099 error = tc_transact(&request, NULL);
3101 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
3102 "min-rate %ubps, max-rate %ubps (%s)",
3103 netdev_get_name(netdev),
3104 tc_get_major(handle), tc_get_minor(handle),
3105 tc_get_major(parent), tc_get_minor(parent),
3106 class->min_rate, class->max_rate, strerror(error));
3113 hfsc_tc_install(struct netdev *netdev, const struct shash *details)
3116 struct hfsc_class class;
3118 error = hfsc_setup_qdisc__(netdev);
3124 hfsc_parse_qdisc_details__(netdev, details, &class);
3125 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3126 tc_make_handle(1, 0), &class);
3132 hfsc_install__(netdev, class.max_rate);
3137 hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3140 struct nl_dump dump;
3141 struct hfsc_class hc;
3144 hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
3145 hfsc_install__(netdev, hc.max_rate);
3147 if (!start_queue_dump(netdev, &dump)) {
3151 while (nl_dump_next(&dump, &msg)) {
3152 unsigned int queue_id;
3154 if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3155 hfsc_update_queue__(netdev, queue_id, &hc);
3159 nl_dump_done(&dump);
3164 hfsc_tc_destroy(struct tc *tc)
3167 struct hfsc_class *hc, *next;
3169 hfsc = CONTAINER_OF(tc, struct hfsc, tc);
3171 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
3172 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
3181 hfsc_qdisc_get(const struct netdev *netdev, struct shash *details)
3183 const struct hfsc *hfsc;
3184 hfsc = hfsc_get__(netdev);
3185 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hfsc->max_rate));
3190 hfsc_qdisc_set(struct netdev *netdev, const struct shash *details)
3193 struct hfsc_class class;
3195 hfsc_parse_qdisc_details__(netdev, details, &class);
3196 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3197 tc_make_handle(1, 0), &class);
3200 hfsc_get__(netdev)->max_rate = class.max_rate;
3207 hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
3208 const struct tc_queue *queue, struct shash *details)
3210 const struct hfsc_class *hc;
3212 hc = hfsc_class_cast__(queue);
3213 shash_add(details, "min-rate", xasprintf("%llu", 8ULL * hc->min_rate));
3214 if (hc->min_rate != hc->max_rate) {
3215 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hc->max_rate));
3221 hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
3222 const struct shash *details)
3225 struct hfsc_class class;
3227 error = hfsc_parse_class_details__(netdev, details, &class);
3232 error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3233 tc_make_handle(1, 0xfffe), &class);
3238 hfsc_update_queue__(netdev, queue_id, &class);
3243 hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
3247 struct hfsc_class *hc;
3249 hc = hfsc_class_cast__(queue);
3250 hfsc = hfsc_get__(netdev);
3252 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3254 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
3261 hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3262 struct netdev_queue_stats *stats)
3264 return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3265 tc_make_handle(1, 0xfffe), NULL, stats);
3269 hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3270 const struct ofpbuf *nlmsg,
3271 netdev_dump_queue_stats_cb *cb, void *aux)
3273 struct netdev_queue_stats stats;
3274 unsigned int handle, major, minor;
3277 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3282 major = tc_get_major(handle);
3283 minor = tc_get_minor(handle);
3284 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3285 (*cb)(minor - 1, &stats, aux);
3290 static const struct tc_ops tc_ops_hfsc = {
3291 "hfsc", /* linux_name */
3292 "linux-hfsc", /* ovs_name */
3293 HFSC_N_QUEUES, /* n_queues */
3294 hfsc_tc_install, /* tc_install */
3295 hfsc_tc_load, /* tc_load */
3296 hfsc_tc_destroy, /* tc_destroy */
3297 hfsc_qdisc_get, /* qdisc_get */
3298 hfsc_qdisc_set, /* qdisc_set */
3299 hfsc_class_get, /* class_get */
3300 hfsc_class_set, /* class_set */
3301 hfsc_class_delete, /* class_delete */
3302 hfsc_class_get_stats, /* class_get_stats */
3303 hfsc_class_dump_stats /* class_dump_stats */
3306 /* "linux-default" traffic control class.
3308 * This class represents the default, unnamed Linux qdisc. It corresponds to
3309 * the "" (empty string) QoS type in the OVS database. */
3312 default_install__(struct netdev *netdev)
3314 struct netdev_dev_linux *netdev_dev =
3315 netdev_dev_linux_cast(netdev_get_dev(netdev));
3316 static struct tc *tc;
3319 tc = xmalloc(sizeof *tc);
3320 tc_init(tc, &tc_ops_default);
3322 netdev_dev->tc = tc;
3326 default_tc_install(struct netdev *netdev,
3327 const struct shash *details OVS_UNUSED)
3329 default_install__(netdev);
3334 default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3336 default_install__(netdev);
3340 static const struct tc_ops tc_ops_default = {
3341 NULL, /* linux_name */
3346 NULL, /* tc_destroy */
3347 NULL, /* qdisc_get */
3348 NULL, /* qdisc_set */
3349 NULL, /* class_get */
3350 NULL, /* class_set */
3351 NULL, /* class_delete */
3352 NULL, /* class_get_stats */
3353 NULL /* class_dump_stats */
3356 /* "linux-other" traffic control class.
3361 other_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3363 struct netdev_dev_linux *netdev_dev =
3364 netdev_dev_linux_cast(netdev_get_dev(netdev));
3365 static struct tc *tc;
3368 tc = xmalloc(sizeof *tc);
3369 tc_init(tc, &tc_ops_other);
3371 netdev_dev->tc = tc;
3375 static const struct tc_ops tc_ops_other = {
3376 NULL, /* linux_name */
3377 "linux-other", /* ovs_name */
3379 NULL, /* tc_install */
3381 NULL, /* tc_destroy */
3382 NULL, /* qdisc_get */
3383 NULL, /* qdisc_set */
3384 NULL, /* class_get */
3385 NULL, /* class_set */
3386 NULL, /* class_delete */
3387 NULL, /* class_get_stats */
3388 NULL /* class_dump_stats */
/* Traffic control. */

/* Number of kernel "tc" ticks per second. */
static double ticks_per_s;

/* Number of kernel "jiffies" per second.  This is used for the purpose of
 * computing buffer sizes.  Generally kernel qdiscs need to be able to buffer
 * one jiffy's worth of data.
 *
 * There are two possibilities here:
 *
 *    - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
 *      approximate range of 100 to 1024.  That means that we really need to
 *      make sure that the qdisc can buffer that much data.
 *
 *    - 'buffer_hz' is an absurdly large number.  That means that the kernel
 *      has finely granular timers and there's no need to fudge additional room
 *      for buffers.  (There's no extra effort needed to implement that: the
 *      large 'buffer_hz' is used as a divisor, so practically any number will
 *      come out as 0 in the division.  Small integer results in the case of
 *      really high dividends won't have any real effect anyhow.)
 */
static unsigned int buffer_hz;
/* Returns tc handle 'major':'minor'.  'major' occupies the upper 16 bits of
 * the handle and 'minor' the lower 16 bits; excess bits of either are
 * discarded. */
static unsigned int
tc_make_handle(unsigned int major, unsigned int minor)
{
    return TC_H_MAKE(major << 16, minor);
}
/* Returns the major number from 'handle', that is, its upper 16 bits. */
static unsigned int
tc_get_major(unsigned int handle)
{
    return TC_H_MAJ(handle) >> 16;
}
/* Returns the minor number from 'handle', that is, its lower 16 bits. */
static unsigned int
tc_get_minor(unsigned int handle)
{
    return TC_H_MIN(handle);
}
3436 static struct tcmsg *
3437 tc_make_request(const struct netdev *netdev, int type, unsigned int flags,
3438 struct ofpbuf *request)
3440 struct tcmsg *tcmsg;
3444 error = get_ifindex(netdev, &ifindex);
3449 ofpbuf_init(request, 512);
3450 nl_msg_put_nlmsghdr(request, sizeof *tcmsg, type, NLM_F_REQUEST | flags);
3451 tcmsg = ofpbuf_put_zeros(request, sizeof *tcmsg);
3452 tcmsg->tcm_family = AF_UNSPEC;
3453 tcmsg->tcm_ifindex = ifindex;
3454 /* Caller should fill in tcmsg->tcm_handle. */
3455 /* Caller should fill in tcmsg->tcm_parent. */
3461 tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
3463 int error = nl_sock_transact(rtnl_sock, request, replyp);
3464 ofpbuf_uninit(request);
3468 /* Adds or deletes a root ingress qdisc on 'netdev'. We use this for
3469 * policing configuration.
3471 * This function is equivalent to running the following when 'add' is true:
3472 * /sbin/tc qdisc add dev <devname> handle ffff: ingress
3474 * This function is equivalent to running the following when 'add' is false:
3475 * /sbin/tc qdisc del dev <devname> handle ffff: ingress
3477 * The configuration and stats may be seen with the following command:
3478 * /sbin/tc -s qdisc show dev <devname>
3480 * Returns 0 if successful, otherwise a positive errno value.
3483 tc_add_del_ingress_qdisc(struct netdev *netdev, bool add)
3485 struct ofpbuf request;
3486 struct tcmsg *tcmsg;
3488 int type = add ? RTM_NEWQDISC : RTM_DELQDISC;
3489 int flags = add ? NLM_F_EXCL | NLM_F_CREATE : 0;
3491 tcmsg = tc_make_request(netdev, type, flags, &request);
3495 tcmsg->tcm_handle = tc_make_handle(0xffff, 0);
3496 tcmsg->tcm_parent = TC_H_INGRESS;
3497 nl_msg_put_string(&request, TCA_KIND, "ingress");
3498 nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0);
3500 error = tc_transact(&request, NULL);
3502 /* If we're deleting the qdisc, don't worry about some of the
3503 * error conditions. */
3504 if (!add && (error == ENOENT || error == EINVAL)) {
3513 /* Adds a policer to 'netdev' with a rate of 'kbits_rate' and a burst size
3516 * This function is equivalent to running:
3517 * /sbin/tc filter add dev <devname> parent ffff: protocol all prio 49
3518 * basic police rate <kbits_rate>kbit burst <kbits_burst>k
3521 * The configuration and stats may be seen with the following command:
3522 * /sbin/tc -s filter show <devname> eth0 parent ffff:
3524 * Returns 0 if successful, otherwise a positive errno value.
3527 tc_add_policer(struct netdev *netdev, int kbits_rate, int kbits_burst)
3529 struct tc_police tc_police;
3530 struct ofpbuf request;
3531 struct tcmsg *tcmsg;
3532 size_t basic_offset;
3533 size_t police_offset;
3537 memset(&tc_police, 0, sizeof tc_police);
3538 tc_police.action = TC_POLICE_SHOT;
3539 tc_police.mtu = mtu;
3540 tc_fill_rate(&tc_police.rate, kbits_rate/8 * 1000, mtu);
3541 tc_police.burst = tc_bytes_to_ticks(tc_police.rate.rate,
3542 kbits_burst * 1024);
3544 tcmsg = tc_make_request(netdev, RTM_NEWTFILTER,
3545 NLM_F_EXCL | NLM_F_CREATE, &request);
3549 tcmsg->tcm_parent = tc_make_handle(0xffff, 0);
3550 tcmsg->tcm_info = tc_make_handle(49,
3551 (OVS_FORCE uint16_t) htons(ETH_P_ALL));
3553 nl_msg_put_string(&request, TCA_KIND, "basic");
3554 basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3555 police_offset = nl_msg_start_nested(&request, TCA_BASIC_POLICE);
3556 nl_msg_put_unspec(&request, TCA_POLICE_TBF, &tc_police, sizeof tc_police);
3557 tc_put_rtab(&request, TCA_POLICE_RATE, &tc_police.rate);
3558 nl_msg_end_nested(&request, police_offset);
3559 nl_msg_end_nested(&request, basic_offset);
3561 error = tc_transact(&request, NULL);
3572 /* The values in psched are not individually very meaningful, but they are
3573 * important. The tables below show some values seen in the wild.
3577 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
3578 * (Before that, there are hints that it was 1000000000.)
3580 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
3584 * -----------------------------------
3585 * [1] 000c8000 000f4240 000f4240 00000064
3586 * [2] 000003e8 00000400 000f4240 3b9aca00
3587 * [3] 000003e8 00000400 000f4240 3b9aca00
3588 * [4] 000003e8 00000400 000f4240 00000064
3589 * [5] 000003e8 00000040 000f4240 3b9aca00
3590 * [6] 000003e8 00000040 000f4240 000000f9
3592 * a b c d ticks_per_s buffer_hz
3593 * ------- --------- ---------- ------------- ----------- -------------
3594 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
3595 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3596 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3597 * [4] 1,000 1,024 1,000,000 100 976,562 100
3598 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
3599 * [6] 1,000 64 1,000,000 249 15,625,000 249
3601 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
3602 * [2] 2.6.26-1-686-bigmem from Debian lenny
3603 * [3] 2.6.26-2-sparc64 from Debian lenny
3604 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
3605 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
3606 * [6] 2.6.34 from kernel.org on KVM
3608 static const char fn[] = "/proc/net/psched";
3609 unsigned int a, b, c, d;
3615 stream = fopen(fn, "r");
3617 VLOG_WARN("%s: open failed: %s", fn, strerror(errno));
3621 if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
3622 VLOG_WARN("%s: read failed", fn);
3626 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
3630 VLOG_WARN("%s: invalid scheduler parameters", fn);
3634 ticks_per_s = (double) a * c / b;
3638 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
3641 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);
3644 /* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
3645 * rate of 'rate' bytes per second. */
3647 tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
3652 return (rate * ticks) / ticks_per_s;
3655 /* Returns the number of ticks that it would take to transmit 'size' bytes at a
3656 * rate of 'rate' bytes per second. */
3658 tc_bytes_to_ticks(unsigned int rate, unsigned int size)
3663 return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
3666 /* Returns the number of bytes that need to be reserved for qdisc buffering at
3667 * a transmission rate of 'rate' bytes per second. */
3669 tc_buffer_per_jiffy(unsigned int rate)
3674 return rate / buffer_hz;
3677 /* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
3678 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
3679 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
3680 * stores NULL into it if it is absent.
3682 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
3685 * Returns 0 if successful, otherwise a positive errno value. */
3687 tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
3688 struct nlattr **options)
3690 static const struct nl_policy tca_policy[] = {
3691 [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
3692 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
3694 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3696 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3697 tca_policy, ta, ARRAY_SIZE(ta))) {
3698 VLOG_WARN_RL(&rl, "failed to parse qdisc message");
3703 *kind = nl_attr_get_string(ta[TCA_KIND]);
3707 *options = ta[TCA_OPTIONS];
3722 /* Given Netlink 'msg' that describes a class, extracts the queue ID (e.g. the
3723 * minor number of its class ID) into '*queue_id', its TCA_OPTIONS attribute
3724 * into '*options', and its queue statistics into '*stats'. Any of the output
3725 * arguments may be null.
3727 * Returns 0 if successful, otherwise a positive errno value. */
3729 tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
3730 struct nlattr **options, struct netdev_queue_stats *stats)
3732 static const struct nl_policy tca_policy[] = {
3733 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
3734 [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
3736 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3738 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3739 tca_policy, ta, ARRAY_SIZE(ta))) {
3740 VLOG_WARN_RL(&rl, "failed to parse class message");
3745 struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
3746 *handlep = tc->tcm_handle;
3750 *options = ta[TCA_OPTIONS];
3754 const struct gnet_stats_queue *gsq;
3755 struct gnet_stats_basic gsb;
3757 static const struct nl_policy stats_policy[] = {
3758 [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
3759 .min_len = sizeof gsb },
3760 [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
3761 .min_len = sizeof *gsq },
3763 struct nlattr *sa[ARRAY_SIZE(stats_policy)];
3765 if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
3766 sa, ARRAY_SIZE(sa))) {
3767 VLOG_WARN_RL(&rl, "failed to parse class stats");
3771 /* Alignment issues screw up the length of struct gnet_stats_basic on
3772 * some arch/bitsize combinations. Newer versions of Linux have a
3773 * struct gnet_stats_basic_packed, but we can't depend on that. The
3774 * easiest thing to do is just to make a copy. */
3775 memset(&gsb, 0, sizeof gsb);
3776 memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
3777 MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
3778 stats->tx_bytes = gsb.bytes;
3779 stats->tx_packets = gsb.packets;
3781 gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
3782 stats->tx_errors = gsq->drops;
3792 memset(stats, 0, sizeof *stats);
3797 /* Queries the kernel for class with identifier 'handle' and parent 'parent'
3800 tc_query_class(const struct netdev *netdev,
3801 unsigned int handle, unsigned int parent,
3802 struct ofpbuf **replyp)
3804 struct ofpbuf request;
3805 struct tcmsg *tcmsg;
3808 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO, &request);
3812 tcmsg->tcm_handle = handle;
3813 tcmsg->tcm_parent = parent;
3815 error = tc_transact(&request, replyp);
3817 VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
3818 netdev_get_name(netdev),
3819 tc_get_major(handle), tc_get_minor(handle),
3820 tc_get_major(parent), tc_get_minor(parent),
3826 /* Equivalent to "tc class del dev <name> handle <handle>". */
3828 tc_delete_class(const struct netdev *netdev, unsigned int handle)
3830 struct ofpbuf request;
3831 struct tcmsg *tcmsg;
3834 tcmsg = tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
3838 tcmsg->tcm_handle = handle;
3839 tcmsg->tcm_parent = 0;
3841 error = tc_transact(&request, NULL);
3843 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
3844 netdev_get_name(netdev),
3845 tc_get_major(handle), tc_get_minor(handle),
3851 /* Equivalent to "tc qdisc del dev <name> root". */
3853 tc_del_qdisc(struct netdev *netdev)
3855 struct netdev_dev_linux *netdev_dev =
3856 netdev_dev_linux_cast(netdev_get_dev(netdev));
3857 struct ofpbuf request;
3858 struct tcmsg *tcmsg;
3861 tcmsg = tc_make_request(netdev, RTM_DELQDISC, 0, &request);
3865 tcmsg->tcm_handle = tc_make_handle(1, 0);
3866 tcmsg->tcm_parent = TC_H_ROOT;
3868 error = tc_transact(&request, NULL);
3869 if (error == EINVAL) {
3870 /* EINVAL probably means that the default qdisc was in use, in which
3871 * case we've accomplished our purpose. */
3874 if (!error && netdev_dev->tc) {
3875 if (netdev_dev->tc->ops->tc_destroy) {
3876 netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
3878 netdev_dev->tc = NULL;
/* tc_query_qdisc(): works out which tc_ops implementation matches the qdisc
 * currently installed on 'netdev' and asks it to load its state.
 *
 * Flow shown here: build an RTM_GETQDISC request for handle 1:0 with parent
 * 0, run it through tc_transact(), pick 'ops' from the outcome, call
 * ops->tc_load() with the reply, free the reply, and return the query error
 * if there was one, otherwise the load error.
 *
 * NOTE(review): some declarations, conditions and branch bodies of this
 * function are not shown here, so the notes below describe only the
 * statements that are. */
3883 /* If 'netdev''s qdisc type and parameters are not yet known, queries the
3884 * kernel to determine what they are. Returns 0 if successful, otherwise a
3885 * positive errno value. */
3887 tc_query_qdisc(const struct netdev *netdev)
3889 struct netdev_dev_linux *netdev_dev =
3890 netdev_dev_linux_cast(netdev_get_dev(netdev));
3891 struct ofpbuf request, *qdisc;
3892 const struct tc_ops *ops;
3893 struct tcmsg *tcmsg;
/* A non-NULL 'tc' means a traffic-control object is already attached to the
 * device.  NOTE(review): the body of this branch is not shown here;
 * presumably nothing more needs to be queried in that case. */
3897 if (netdev_dev->tc) {
3901 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
3902 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
3903 * 2.6.35 without that fix backported to it.
3905 * To avoid the OOPS, we must not make a request that would attempt to dump
3906 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
3907 * few others. There are a few ways that I can see to do this, but most of
3908 * them seem to be racy (and if you lose the race the kernel OOPSes). The
3909 * technique chosen here is to assume that any non-default qdisc that we
3910 * create will have a class with handle 1:0. The built-in qdiscs only have
3911 * a class with handle 0:0.
3913 * We could check for Linux 2.6.35+ and use a more straightforward method
3915 tcmsg = tc_make_request(netdev, RTM_GETQDISC, NLM_F_ECHO, &request);
3919 tcmsg->tcm_handle = tc_make_handle(1, 0);
3920 tcmsg->tcm_parent = 0;
3922 /* Figure out what tc class to instantiate. */
/* 'error' from this call selects between the ENOENT branch and the
 * catch-all branch further down. */
3923 error = tc_transact(&request, &qdisc);
/* NOTE(review): presumably this extracts the qdisc's kind string from the
 * reply; 'kind' is what gets looked up and logged just below. */
3927 error = tc_parse_qdisc(qdisc, &kind, NULL);
/* Generic fallback.  NOTE(review): the condition guarding this assignment is
 * not shown here; presumably it is the parse-failure path. */
3929 ops = &tc_ops_other;
/* Look 'kind' up among the known tc_ops implementations. */
3931 ops = tc_lookup_linux_name(kind);
/* This message has its own rate limiter, 'rl2', separate from the 'rl' used
 * by the warning further down.  NOTE(review): presumably reached only when
 * the lookup found nothing. */
3933 static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
3934 VLOG_INFO_RL(&rl2, "unknown qdisc \"%s\"", kind);
3936 ops = &tc_ops_other;
3939 } else if (error == ENOENT) {
3940 /* Either it's a built-in qdisc, or it's a qdisc set up by some
3941 * other entity that doesn't have a handle 1:0. We will assume
3942 * that it's the system default qdisc. */
3943 ops = &tc_ops_default;
3946 /* Who knows? Maybe the device got deleted. */
3947 VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
3948 netdev_get_name(netdev), strerror(error));
3949 ops = &tc_ops_other;
3952 /* Instantiate it. */
/* The cast drops the 'const' from this function's parameter before handing
 * the netdev to tc_load(). */
3953 load_error = ops->tc_load((struct netdev *) netdev, qdisc);
/* Invariant: tc_load() returned 0 exactly when it left netdev_dev->tc
 * non-NULL. */
3954 assert((load_error == 0) == (netdev_dev->tc != NULL));
/* Release the reply buffer that tc_transact() was asked to fill in. */
3955 ofpbuf_delete(qdisc);
/* A query error takes precedence over a load error. */
3957 return error ? error : load_error;
3960 /* Linux traffic control uses tables with 256 entries ("rtab" tables) to
3961 approximate the time to transmit packets of various lengths. For an MTU of
3962 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
3963 represents two possible packet lengths; for an MTU of 513 through 1024, four
3964 possible lengths; and so on.
3966 Returns, for the specified 'mtu', the number of bits that packet lengths
3967 need to be shifted right to fit within such a 256-entry table. */
3969 tc_calc_cell_log(unsigned int mtu)
/* Default value for 'mtu'.  NOTE(review): the condition guarding this
 * assignment is not shown here; presumably it applies when the caller
 * passes 0 -- confirm. */
3974 mtu = ETH_PAYLOAD_MAX;
/* Add the Ethernet header and VLAN header lengths on top of 'mtu'. */
3976 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
/* 'cell_log' counts iterations while 'mtu' is still 256 or more.
 * NOTE(review): the loop body is not shown here; going by the header
 * comment it presumably halves 'mtu' on each pass. */
3978 for (cell_log = 0; mtu >= 256; cell_log++) {
/* tc_fill_rate(): zeroes '*rate', then sets its 'cell_log' from
 * tc_calc_cell_log(mtu) and its 'mpu' to ETH_TOTAL_MIN.
 *
 * NOTE(review): the original header comment that follows is cut off, and no
 * statement that stores 'Bps' into '*rate' is shown here; confirm both
 * against the complete file. */
3985 /* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
3988 tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
3990 memset(rate, 0, sizeof *rate);
3991 rate->cell_log = tc_calc_cell_log(mtu);
3992 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
3993 /* rate->cell_align = 0; */ /* distro headers. */
/* tc_put_rtab() uses 'mpu' as the smallest packet size it computes a time
 * for. */
3994 rate->mpu = ETH_TOTAL_MIN;
3998 /* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
3999 * attribute of the specified "type".
4001 * See tc_calc_cell_log() above for a description of "rtab"s. */
4003 tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
/* Reserve TC_RTAB_SIZE bytes of attribute payload in 'msg'.  The loop below
 * writes every element, TC_RTAB_SIZE / sizeof *rtab of them. */
4008 rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
4009 for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
/* Packet size represented by entry 'i': (i + 1) shifted left by 'cell_log'
 * bits. */
4010 unsigned packet_size = (i + 1) << rate->cell_log;
/* Never compute a time for fewer than rate->mpu bytes. */
4011 if (packet_size < rate->mpu) {
4012 packet_size = rate->mpu;
/* Store what tc_bytes_to_ticks() yields for that size at rate->rate. */
4014 rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
/* tc_calc_buffer(): computes a lower bound 'min_burst' as
 * tc_buffer_per_jiffy(Bps) + mtu, takes the larger of that and
 * 'burst_bytes', and returns that byte count converted by
 * tc_bytes_to_ticks() at rate 'Bps'.
 *
 * NOTE(review): the original header comment that follows is cut off
 * mid-sentence; restore its ending from the complete file. */
4018 /* Calculates the proper value of 'buffer' or 'cbuffer' in HTB options given a
4019 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
4020 * burst size of 'burst_bytes'. (If no value was requested, a 'burst_bytes' of
4023 tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
4025 unsigned int min_burst = tc_buffer_per_jiffy(Bps) + mtu;
4026 return tc_bytes_to_ticks(Bps, MAX(burst_bytes, min_burst));
4029 /* Linux-only functions declared in netdev-linux.h */
4031 /* Returns a fd for an AF_INET socket or a negative errno value. */
4033 netdev_linux_get_af_inet_sock(void)
/* Run netdev_linux_init() before touching 'af_inet_sock'.
 * NOTE(review): presumably that call is what creates the socket -- confirm. */
4035 int error = netdev_linux_init();
/* A nonzero 'error' is negated before being returned, matching the "negative
 * errno" contract above; otherwise the shared socket fd is returned. */
4036 return error ? -error : af_inet_sock;
4039 /* Modifies the 'flag' bit in ethtool's flags field for 'netdev'. If
4040 * 'enable' is true, the bit is set. Otherwise, it is cleared. */
/* 'flag_name' is used only in the warning logged when the read-back value
 * does not match what was written.
 *
 * NOTE(review): the error checks after each ethtool call and the return
 * statements are not shown here. */
4042 netdev_linux_ethtool_set_flag(struct netdev *netdev, uint32_t flag,
4043 const char *flag_name, bool enable)
4045 const char *netdev_name = netdev_get_name(netdev);
4046 struct ethtool_value evalue;
/* Step 1: read the current flags into 'evalue'.  The cast is needed because
 * netdev_linux_do_ethtool() is declared to take a struct ethtool_cmd
 * pointer. */
4050 memset(&evalue, 0, sizeof evalue);
4051 error = netdev_linux_do_ethtool(netdev_name,
4052 (struct ethtool_cmd *)&evalue,
4053 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
/* Step 2: clear 'flag', set it again only if 'enable', and write the result
 * back.  'new_flags' remembers the value that was requested. */
4058 evalue.data = new_flags = (evalue.data & ~flag) | (enable ? flag : 0);
4059 error = netdev_linux_do_ethtool(netdev_name,
4060 (struct ethtool_cmd *)&evalue,
4061 ETHTOOL_SFLAGS, "ETHTOOL_SFLAGS");
/* Step 3: read the flags again so the result can be compared with
 * 'new_flags'. */
4066 memset(&evalue, 0, sizeof evalue);
4067 error = netdev_linux_do_ethtool(netdev_name,
4068 (struct ethtool_cmd *)&evalue,
4069 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
/* The value read back differs from the value written: log which flag and
 * which direction (enable or disable) did not stick. */
4074 if (new_flags != evalue.data) {
4075 VLOG_WARN_RL(&rl, "attempt to %s ethtool %s flag on network "
4076 "device %s failed", enable ? "enable" : "disable",
4077 flag_name, netdev_name);
4084 /* Utility functions. */
4086 /* Copies 'src' into 'dst', performing format conversion in the process. */
/* The copy is field by field, 21 same-named members in all.
 * NOTE(review): the member types of the two structs are not shown here;
 * presumably the "format conversion" is the implicit integer conversion
 * performed by each assignment -- confirm. */
4088 netdev_stats_from_rtnl_link_stats(struct netdev_stats *dst,
4089 const struct rtnl_link_stats *src)
/* Packet, byte, error and drop totals. */
4091 dst->rx_packets = src->rx_packets;
4092 dst->tx_packets = src->tx_packets;
4093 dst->rx_bytes = src->rx_bytes;
4094 dst->tx_bytes = src->tx_bytes;
4095 dst->rx_errors = src->rx_errors;
4096 dst->tx_errors = src->tx_errors;
4097 dst->rx_dropped = src->rx_dropped;
4098 dst->tx_dropped = src->tx_dropped;
4099 dst->multicast = src->multicast;
4100 dst->collisions = src->collisions;
/* Receive error breakdown. */
4101 dst->rx_length_errors = src->rx_length_errors;
4102 dst->rx_over_errors = src->rx_over_errors;
4103 dst->rx_crc_errors = src->rx_crc_errors;
4104 dst->rx_frame_errors = src->rx_frame_errors;
4105 dst->rx_fifo_errors = src->rx_fifo_errors;
4106 dst->rx_missed_errors = src->rx_missed_errors;
/* Transmit error breakdown. */
4107 dst->tx_aborted_errors = src->tx_aborted_errors;
4108 dst->tx_carrier_errors = src->tx_carrier_errors;
4109 dst->tx_fifo_errors = src->tx_fifo_errors;
4110 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
4111 dst->tx_window_errors = src->tx_window_errors;
/* Sends an RTM_GETLINK request for the interface with index 'ifindex' over
 * 'rtnl_sock' and, when the reply carries an IFLA_STATS attribute, converts
 * it into '*stats' with netdev_stats_from_rtnl_link_stats().
 *
 * The reply buffer is deleted on each of the three exits shown below.
 *
 * NOTE(review): the check of 'error' after the transaction and all return
 * statements are not shown here. */
4115 get_stats_via_netlink(int ifindex, struct netdev_stats *stats)
4117 /* Policy for RTNLGRP_LINK messages.
4119 * There are *many* more fields in these messages, but currently we only
4120 * care about these fields. */
4121 static const struct nl_policy rtnlgrp_link_policy[] = {
4122 [IFLA_IFNAME] = { .type = NL_A_STRING, .optional = false },
4123 [IFLA_STATS] = { .type = NL_A_UNSPEC, .optional = true,
4124 .min_len = sizeof(struct rtnl_link_stats) },
4127 struct ofpbuf request;
4128 struct ofpbuf *reply;
4129 struct ifinfomsg *ifi;
4130 struct nlattr *attrs[ARRAY_SIZE(rtnlgrp_link_policy)];
/* Build the request: a netlink header followed by a zeroed ifinfomsg that
 * identifies the device by index. */
4133 ofpbuf_init(&request, 0);
4134 nl_msg_put_nlmsghdr(&request, sizeof *ifi, RTM_GETLINK, NLM_F_REQUEST);
4135 ifi = ofpbuf_put_zeros(&request, sizeof *ifi);
4136 ifi->ifi_family = PF_UNSPEC;
4137 ifi->ifi_index = ifindex;
4138 error = nl_sock_transact(rtnl_sock, &request, &reply);
/* The request buffer is released right after the transaction call. */
4139 ofpbuf_uninit(&request);
/* Attribute parsing starts past the netlink header and the ifinfomsg. */
4144 if (!nl_policy_parse(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg),
4145 rtnlgrp_link_policy,
4146 attrs, ARRAY_SIZE(rtnlgrp_link_policy))) {
4147 ofpbuf_delete(reply);
/* IFLA_STATS is marked optional in the policy, so a successful parse does
 * not guarantee it is present. */
4151 if (!attrs[IFLA_STATS]) {
4152 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
4153 ofpbuf_delete(reply);
/* The policy's min_len ensures the attribute payload is at least as large
 * as struct rtnl_link_stats. */
4157 netdev_stats_from_rtnl_link_stats(stats, nl_attr_get(attrs[IFLA_STATS]));
4159 ofpbuf_delete(reply);
/* Reads "/proc/net/dev" line by line looking for the entry whose device name
 * equals 'netdev_name', parsing counters directly into '*stats'.
 *
 * A line counts as parsed only when the scanf-style call converts exactly 15
 * items.  For the matching device, seven counters are then set to
 * UINT64_MAX.
 *
 * NOTE(review): local declarations, most of the scanf argument list, the
 * handling of the stream after the loop, and the return statements are not
 * shown here. */
4165 get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats)
4167 static const char fn[] = "/proc/net/dev";
4172 stream = fopen(fn, "r");
4174 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
4179 while (fgets(line, sizeof line, stream)) {
/* Shorthand for one unsigned 64-bit scanf conversion. */
4182 #define X64 "%"SCNu64
/* Two groups, each seven 64-bit counters followed by one unsigned field that
 * is read but not stored. */
4185 X64 X64 X64 X64 X64 X64 X64 "%*u"
4186 X64 X64 X64 X64 X64 X64 X64 "%*u",
4192 &stats->rx_fifo_errors,
4193 &stats->rx_frame_errors,
4199 &stats->tx_fifo_errors,
4201 &stats->tx_carrier_errors) != 15) {
4202 VLOG_WARN_RL(&rl, "%s:%d: parse error", fn, ln);
4203 } else if (!strcmp(devname, netdev_name)) {
/* NOTE(review): presumably UINT64_MAX marks counters that this source does
 * not provide -- confirm against the callers. */
4204 stats->rx_length_errors = UINT64_MAX;
4205 stats->rx_over_errors = UINT64_MAX;
4206 stats->rx_crc_errors = UINT64_MAX;
4207 stats->rx_missed_errors = UINT64_MAX;
4208 stats->tx_aborted_errors = UINT64_MAX;
4209 stats->tx_heartbeat_errors = UINT64_MAX;
4210 stats->tx_window_errors = UINT64_MAX;
/* NOTE(review): presumably reached only when no line matched
 * 'netdev_name'. */
4216 VLOG_WARN_RL(&rl, "%s: no stats for %s", fn, netdev_name);
/* Reads "/sys/class/net/<name>/carrier" and sets '*carrier' from its first
 * byte: false for '0', true otherwise.  A first byte other than '0' or '1'
 * is logged as unexpected.
 *
 * NOTE(review): local declarations, cleanup of 'fn' and 'fd', and the return
 * statements are not shown here; 'fn' comes from xasprintf(), so presumably
 * it has to be freed -- confirm. */
4222 get_carrier_via_sysfs(const char *name, bool *carrier)
4233 fn = xasprintf("/sys/class/net/%s/carrier", name);
4234 fd = open(fn, O_RDONLY);
4237 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(error));
4241 retval = read(fd, line, sizeof line);
/* EINVAL is singled out from the other read errors, which are logged with
 * the "read failed" warning below. */
4244 if (error == EINVAL) {
4245 /* This is the normal return value when we try to check carrier if
4246 * the network device is not up. */
4248 VLOG_WARN_RL(&rl, "%s: read failed: %s", fn, strerror(error));
/* A zero-byte read means the file was empty. */
4251 } else if (retval == 0) {
4253 VLOG_WARN_RL(&rl, "%s: unexpected end of file", fn);
/* Only the first byte of what was read is examined. */
4257 if (line[0] != '0' && line[0] != '1') {
4259 VLOG_WARN_RL(&rl, "%s: value is %c (expected 0 or 1)", fn, line[0]);
4262 *carrier = line[0] != '0';
/* Queries the interface flags of 'netdev' with the SIOCGIFFLAGS ioctl, via
 * netdev_linux_do_ioctl(), and stores ifr.ifr_flags into '*flags'.
 *
 * NOTE(review): the declaration of 'ifr', the rest of the ioctl call and the
 * return statement are not shown here. */
4274 get_flags(const struct netdev *netdev, int *flags)
4279 error = netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, SIOCGIFFLAGS,
4281 *flags = ifr.ifr_flags;
/* Sets the interface flags of 'netdev' to 'flags' with the SIOCSIFFLAGS
 * ioctl and returns whatever netdev_linux_do_ioctl() returns.
 *
 * NOTE(review): the declaration of 'ifr' and the end of the call are not
 * shown here. */
4286 set_flags(struct netdev *netdev, int flags)
4290 ifr.ifr_flags = flags;
4291 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, SIOCSIFFLAGS,
/* Returns ifr.ifr_ifindex for the device named 'netdev_name', as filled in
 * by the SIOCGIFINDEX ioctl on 'af_inet_sock'.  A failed ioctl is logged
 * with a rate-limited warning.
 *
 * NOTE(review): the value returned on failure is not shown here. */
4296 do_get_ifindex(const char *netdev_name)
/* The copy is bounded by the size of ifr.ifr_name. */
4300 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4301 COVERAGE_INC(netdev_get_ifindex);
4302 if (ioctl(af_inet_sock, SIOCGIFINDEX, &ifr) < 0) {
4303 VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
4304 netdev_name, strerror(errno));
4307 return ifr.ifr_ifindex;
/* Stores the interface index of 'netdev_' into '*ifindexp', using the value
 * cached in the netdev_dev when the VALID_IFINDEX bit of 'cache_valid' is
 * set and calling do_get_ifindex() otherwise.
 *
 * NOTE(review): the handling of a failed do_get_ifindex() call and the
 * return statements are not shown here. */
4311 get_ifindex(const struct netdev *netdev_, int *ifindexp)
4313 struct netdev_dev_linux *netdev_dev =
4314 netdev_dev_linux_cast(netdev_get_dev(netdev_));
/* Cache miss: ask the kernel, then record the answer and mark it valid. */
4316 if (!(netdev_dev->cache_valid & VALID_IFINDEX)) {
4317 int ifindex = do_get_ifindex(netdev_get_name(netdev_));
4321 netdev_dev->cache_valid |= VALID_IFINDEX;
4322 netdev_dev->ifindex = ifindex;
4324 *ifindexp = netdev_dev->ifindex;
/* Fetches the hardware address of the device named 'netdev_name' with the
 * SIOCGIFHWADDR ioctl on 'af_inet_sock' and copies ETH_ADDR_LEN bytes of it
 * into 'ea'.
 *
 * NOTE(review): local declarations and the return statements are not shown
 * here. */
4329 get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN])
4334 memset(&ifr, 0, sizeof ifr);
4335 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4336 COVERAGE_INC(netdev_get_hwaddr);
4337 if (ioctl(af_inet_sock, SIOCGIFHWADDR, &ifr) < 0) {
4338 /* ENODEV probably means that a vif disappeared asynchronously and
4339 * hasn't been removed from the database yet, so reduce the log level
4340 * to INFO for that case. */
4341 VLOG(errno == ENODEV ? VLL_INFO : VLL_ERR,
4342 "ioctl(SIOCGIFHWADDR) on %s device failed: %s",
4343 netdev_name, strerror(errno));
/* An address family other than AF_UNSPEC or ARPHRD_ETHER triggers the
 * warning below.  NOTE(review): the lines shown do not establish whether the
 * copy into 'ea' is skipped in that case -- confirm. */
4346 hwaddr_family = ifr.ifr_hwaddr.sa_family;
4347 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER) {
4348 VLOG_WARN("%s device has unknown hardware address family %d",
4349 netdev_name, hwaddr_family);
4351 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
/* Sets the hardware address of the device named 'netdev_name' to 'mac',
 * tagged with address family 'hwaddr_family', using the SIOCSIFHWADDR ioctl
 * on 'af_inet_sock'.  A failed ioctl is logged at error level.
 *
 * NOTE(review): the declaration of 'ifr' and the return statements are not
 * shown here. */
4356 set_etheraddr(const char *netdev_name, int hwaddr_family,
4357 const uint8_t mac[ETH_ADDR_LEN])
4361 memset(&ifr, 0, sizeof ifr);
4362 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4363 ifr.ifr_hwaddr.sa_family = hwaddr_family;
4364 memcpy(ifr.ifr_hwaddr.sa_data, mac, ETH_ADDR_LEN);
4365 COVERAGE_INC(netdev_set_hwaddr);
4366 if (ioctl(af_inet_sock, SIOCSIFHWADDR, &ifr) < 0) {
4367 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
4368 netdev_name, strerror(errno));
/* Issues the SIOCETHTOOL ioctl on 'af_inet_sock' for the device named
 * 'name', passing 'ecmd' as the request's data pointer.  'cmd_name' is used
 * only in the log message.
 *
 * A failure with errno other than EOPNOTSUPP is logged with a rate-limited
 * warning; EOPNOTSUPP is deliberately not logged.
 *
 * NOTE(review): no statement that stores 'cmd' into '*ecmd' is shown here,
 * nor are the return statements; confirm against the complete file. */
4375 netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
4376 int cmd, const char *cmd_name)
4380 memset(&ifr, 0, sizeof ifr);
4381 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
4382 ifr.ifr_data = (caddr_t) ecmd;
4385 COVERAGE_INC(netdev_ethtool);
4386 if (ioctl(af_inet_sock, SIOCETHTOOL, &ifr) == 0) {
4389 if (errno != EOPNOTSUPP) {
4390 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
4391 "failed: %s", cmd_name, name, strerror(errno));
4393 /* The device doesn't support this operation. That's pretty
4394 * common, so there's no point in logging anything. */
/* Copies 'name' into ifr->ifr_name and issues ioctl 'cmd' on 'af_inet_sock'
 * with 'ifr' as its argument.  A result of -1 is logged at debug level using
 * 'cmd_name'.
 *
 * NOTE(review): the rest of the log call and the return statements are not
 * shown here. */
4401 netdev_linux_do_ioctl(const char *name, struct ifreq *ifr, int cmd,
4402 const char *cmd_name)
4404 ovs_strzcpy(ifr->ifr_name, name, sizeof ifr->ifr_name);
4405 if (ioctl(af_inet_sock, cmd, ifr) == -1) {
4406 VLOG_DBG_RL(&rl, "%s: ioctl(%s) failed: %s", name, cmd_name,
/* Runs the address-returning ioctl 'cmd' for 'netdev' through
 * netdev_linux_do_ioctl() and copies the resulting IPv4 address out of
 * ifr.ifr_addr into '*ip'.  'cmd_name' is passed through for logging.
 *
 * NOTE(review): the declaration of 'ifr', the condition guarding the copy
 * and the return statement are not shown here. */
4414 netdev_linux_get_ipv4(const struct netdev *netdev, struct in_addr *ip,
4415 int cmd, const char *cmd_name)
/* The request is tagged as AF_INET before the ioctl is issued. */
4420 ifr.ifr_addr.sa_family = AF_INET;
4421 error = netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, cmd, cmd_name);
/* Reinterpret the generic sockaddr in 'ifr' as a sockaddr_in to reach
 * sin_addr. */
4423 const struct sockaddr_in *sin = (struct sockaddr_in *) &ifr.ifr_addr;
4424 *ip = sin->sin_addr;
4429 /* Returns an AF_PACKET raw socket or a negative errno value. */
4431 af_packet_sock(void)
4433 static int sock = INT_MIN;
4435 if (sock == INT_MIN) {
4436 sock = socket(AF_PACKET, SOCK_RAW, 0);
4438 set_nonblocking(sock);
4441 VLOG_ERR("failed to create packet socket: %s", strerror(errno));