2 * Copyright (c) 2009, 2010, 2011 Nicira Networks.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
19 #include "netdev-linux.h"
24 #include <arpa/inet.h>
26 #include <linux/gen_stats.h>
27 #include <linux/if_ether.h>
28 #include <linux/if_tun.h>
30 #include <linux/types.h>
31 #include <linux/ethtool.h>
32 #include <linux/mii.h>
33 #include <linux/pkt_cls.h>
34 #include <linux/pkt_sched.h>
35 #include <linux/rtnetlink.h>
36 #include <linux/sockios.h>
37 #include <linux/version.h>
38 #include <sys/types.h>
39 #include <sys/ioctl.h>
40 #include <sys/socket.h>
41 #include <netpacket/packet.h>
43 #include <net/if_arp.h>
44 #include <net/if_packet.h>
45 #include <net/route.h>
46 #include <netinet/in.h>
53 #include "dpif-linux.h"
54 #include "dynamic-string.h"
55 #include "fatal-signal.h"
58 #include "netdev-provider.h"
59 #include "netdev-vport.h"
61 #include "netlink-notifier.h"
62 #include "netlink-socket.h"
64 #include "openflow/openflow.h"
66 #include "poll-loop.h"
67 #include "rtnetlink-link.h"
68 #include "socket-util.h"
74 VLOG_DEFINE_THIS_MODULE(netdev_linux);
76 COVERAGE_DEFINE(netdev_set_policing);
77 COVERAGE_DEFINE(netdev_arp_lookup);
78 COVERAGE_DEFINE(netdev_get_ifindex);
79 COVERAGE_DEFINE(netdev_get_hwaddr);
80 COVERAGE_DEFINE(netdev_set_hwaddr);
81 COVERAGE_DEFINE(netdev_ethtool);
83 /* These were introduced in Linux 2.6.14, so they might be missing if we have
85 #ifndef ADVERTISED_Pause
86 #define ADVERTISED_Pause (1 << 13)
88 #ifndef ADVERTISED_Asym_Pause
89 #define ADVERTISED_Asym_Pause (1 << 14)
92 /* These were introduced in Linux 2.6.24, so they might be missing if we
93 * have old headers. */
94 #ifndef ETHTOOL_GFLAGS
95 #define ETHTOOL_GFLAGS 0x00000025 /* Get flags bitmap(ethtool_value) */
97 #ifndef ETHTOOL_SFLAGS
98 #define ETHTOOL_SFLAGS 0x00000026 /* Set flags bitmap(ethtool_value) */
101 /* This was introduced in Linux 2.6.25, so it might be missing if we have old
104 #define TC_RTAB_SIZE 1024
107 static struct nln_notifier *netdev_linux_cache_notifier = NULL;
108 static int cache_notifier_refcount;
111 VALID_IFINDEX = 1 << 0,
112 VALID_ETHERADDR = 1 << 1,
116 VALID_POLICING = 1 << 5,
117 VALID_HAVE_VPORT_STATS = 1 << 6
125 /* Traffic control. */
127 /* An instance of a traffic control class. Always associated with a particular
130 * Each TC implementation subclasses this with whatever additional data it
133 const struct tc_ops *ops;
134 struct hmap queues; /* Contains "struct tc_queue"s.
135 * Read by generic TC layer.
136 * Written only by TC implementation. */
139 /* One traffic control queue.
141 * Each TC implementation subclasses this with whatever additional data it
144 struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
145 unsigned int queue_id; /* OpenFlow queue ID. */
148 /* A particular kind of traffic control. Each implementation generally maps to
149 * one particular Linux qdisc class.
151 * The functions below return 0 if successful or a positive errno value on
152 * failure, except where otherwise noted. All of them must be provided, except
153 * where otherwise noted. */
155 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
156 * This is null for tc_ops_default and tc_ops_other, for which there are no
157 * appropriate values. */
158 const char *linux_name;
160 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
161 const char *ovs_name;
163 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
164 * queues. The queues are numbered 0 through n_queues - 1. */
165 unsigned int n_queues;
167 /* Called to install this TC class on 'netdev'. The implementation should
168 * make the Netlink calls required to set up 'netdev' with the right qdisc
169 * and configure it according to 'details'. The implementation may assume
170 * that the current qdisc is the default; that is, there is no need for it
171 * to delete the current qdisc before installing itself.
173 * The contents of 'details' should be documented as valid for 'ovs_name'
174 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
175 * (which is built as ovs-vswitchd.conf.db(8)).
177 * This function must return 0 if and only if it sets 'netdev->tc' to an
178 * initialized 'struct tc'.
180 * (This function is null for tc_ops_other, which cannot be installed. For
181 * other TC classes it should always be nonnull.) */
182 int (*tc_install)(struct netdev *netdev, const struct shash *details);
184 /* Called when the netdev code determines (through a Netlink query) that
185 * this TC class's qdisc is installed on 'netdev', but we didn't install
186 * it ourselves and so don't know any of the details.
188 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
189 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
190 * implementation should parse the other attributes of 'nlmsg' as
191 * necessary to determine its configuration. If necessary it should also
192 * use Netlink queries to determine the configuration of queues on
195 * This function must return 0 if and only if it sets 'netdev->tc' to an
196 * initialized 'struct tc'. */
197 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
199 /* Destroys the data structures allocated by the implementation as part of
200 * 'tc'. (This includes destroying 'tc->queues' by calling
203 * The implementation should not need to perform any Netlink calls. If
204 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
205 * (But it may not be desirable.)
207 * This function may be null if 'tc' is trivial. */
208 void (*tc_destroy)(struct tc *tc);
210 /* Retrieves details of 'netdev->tc' configuration into 'details'.
212 * The implementation should not need to perform any Netlink calls, because
213 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
214 * cached the configuration.
216 * The contents of 'details' should be documented as valid for 'ovs_name'
217 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
218 * (which is built as ovs-vswitchd.conf.db(8)).
220 * This function may be null if 'tc' is not configurable.
222 int (*qdisc_get)(const struct netdev *netdev, struct shash *details);
224 /* Reconfigures 'netdev->tc' according to 'details', performing any
225 * required Netlink calls to complete the reconfiguration.
227 * The contents of 'details' should be documented as valid for 'ovs_name'
228 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
229 * (which is built as ovs-vswitchd.conf.db(8)).
231 * This function may be null if 'tc' is not configurable.
233 int (*qdisc_set)(struct netdev *, const struct shash *details);
235 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
236 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
238 * The contents of 'details' should be documented as valid for 'ovs_name'
239 * in the "other_config" column in the "Queue" table in
240 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
242 * The implementation should not need to perform any Netlink calls, because
243 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
244 * cached the queue configuration.
246 * This function may be null if 'tc' does not have queues ('n_queues' is
248 int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
249 struct shash *details);
251 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
252 * 'details', performing any required Netlink calls to complete the
253 * reconfiguration. The caller ensures that 'queue_id' is less than
256 * The contents of 'details' should be documented as valid for 'ovs_name'
257 * in the "other_config" column in the "Queue" table in
258 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
260 * This function may be null if 'tc' does not have queues or its queues are
261 * not configurable. */
262 int (*class_set)(struct netdev *, unsigned int queue_id,
263 const struct shash *details);
265 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
266 * tc_queue's within 'netdev->tc->queues'.
268 * This function may be null if 'tc' does not have queues or its queues
269 * cannot be deleted. */
270 int (*class_delete)(struct netdev *, struct tc_queue *queue);
272 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
273 * 'struct tc_queue's within 'netdev->tc->queues'.
275 * On success, initializes '*stats'.
277 * This function may be null if 'tc' does not have queues or if it cannot
278 * report queue statistics. */
279 int (*class_get_stats)(const struct netdev *netdev,
280 const struct tc_queue *queue,
281 struct netdev_queue_stats *stats);
283 /* Extracts queue stats from 'nlmsg', which is a response to a
284 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
286 * This function may be null if 'tc' does not have queues or if it cannot
287 * report queue statistics. */
288 int (*class_dump_stats)(const struct netdev *netdev,
289 const struct ofpbuf *nlmsg,
290 netdev_dump_queue_stats_cb *cb, void *aux);
294 tc_init(struct tc *tc, const struct tc_ops *ops)
297 hmap_init(&tc->queues);
301 tc_destroy(struct tc *tc)
303 hmap_destroy(&tc->queues);
306 static const struct tc_ops tc_ops_htb;
307 static const struct tc_ops tc_ops_hfsc;
308 static const struct tc_ops tc_ops_default;
309 static const struct tc_ops tc_ops_other;
311 static const struct tc_ops *tcs[] = {
312 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
313 &tc_ops_hfsc, /* Hierarchical fair service curve. */
314 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
315 &tc_ops_other, /* Some other qdisc. */
319 static unsigned int tc_make_handle(unsigned int major, unsigned int minor);
320 static unsigned int tc_get_major(unsigned int handle);
321 static unsigned int tc_get_minor(unsigned int handle);
323 static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
324 static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
325 static unsigned int tc_buffer_per_jiffy(unsigned int rate);
327 static struct tcmsg *tc_make_request(const struct netdev *, int type,
328 unsigned int flags, struct ofpbuf *);
329 static int tc_transact(struct ofpbuf *request, struct ofpbuf **replyp);
330 static int tc_add_del_ingress_qdisc(struct netdev *netdev, bool add);
331 static int tc_add_policer(struct netdev *netdev, int kbits_rate,
334 static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
335 struct nlattr **options);
336 static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
337 struct nlattr **options,
338 struct netdev_queue_stats *);
339 static int tc_query_class(const struct netdev *,
340 unsigned int handle, unsigned int parent,
341 struct ofpbuf **replyp);
342 static int tc_delete_class(const struct netdev *, unsigned int handle);
344 static int tc_del_qdisc(struct netdev *netdev);
345 static int tc_query_qdisc(const struct netdev *netdev);
347 static int tc_calc_cell_log(unsigned int mtu);
348 static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
349 static void tc_put_rtab(struct ofpbuf *, uint16_t type,
350 const struct tc_ratespec *rate);
351 static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
353 struct netdev_dev_linux {
354 struct netdev_dev netdev_dev;
356 struct shash_node *shash_node;
357 unsigned int cache_valid;
358 unsigned int change_seq;
360 bool miimon; /* Link status of last poll. */
361 long long int miimon_interval; /* Miimon Poll rate. Disabled if <= 0. */
362 struct timer miimon_timer;
364 /* The following are figured out "on demand" only. They are only valid
365 * when the corresponding VALID_* bit in 'cache_valid' is set. */
367 uint8_t etheraddr[ETH_ADDR_LEN];
368 struct in_addr address, netmask;
372 long long int carrier_resets;
373 uint32_t kbits_rate; /* Policing data. */
374 uint32_t kbits_burst;
375 bool have_vport_stats;
379 struct tap_state tap;
383 struct netdev_linux {
384 struct netdev netdev;
388 /* Sockets used for ioctl operations. */
389 static int af_inet_sock = -1; /* AF_INET, SOCK_DGRAM. */
391 /* A Netlink routing socket that is not subscribed to any multicast groups. */
392 static struct nl_sock *rtnl_sock;
394 /* This is set pretty low because we probably won't learn anything from the
395 * additional log messages. */
396 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
398 static int netdev_linux_init(void);
400 static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
401 int cmd, const char *cmd_name);
402 static int netdev_linux_do_ioctl(const char *name, struct ifreq *, int cmd,
403 const char *cmd_name);
404 static int netdev_linux_get_ipv4(const struct netdev *, struct in_addr *,
405 int cmd, const char *cmd_name);
406 static int get_flags(const struct netdev_dev *, int *flagsp);
407 static int set_flags(struct netdev *, int flags);
408 static int do_get_ifindex(const char *netdev_name);
409 static int get_ifindex(const struct netdev *, int *ifindexp);
410 static int do_set_addr(struct netdev *netdev,
411 int ioctl_nr, const char *ioctl_name,
412 struct in_addr addr);
413 static int get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN]);
414 static int set_etheraddr(const char *netdev_name, int hwaddr_family,
415 const uint8_t[ETH_ADDR_LEN]);
416 static int get_stats_via_netlink(int ifindex, struct netdev_stats *stats);
417 static int get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats);
418 static int af_packet_sock(void);
419 static void netdev_linux_miimon_run(void);
420 static void netdev_linux_miimon_wait(void);
423 is_netdev_linux_class(const struct netdev_class *netdev_class)
425 return netdev_class->init == netdev_linux_init;
428 static struct netdev_dev_linux *
429 netdev_dev_linux_cast(const struct netdev_dev *netdev_dev)
431 const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
432 assert(is_netdev_linux_class(netdev_class));
434 return CONTAINER_OF(netdev_dev, struct netdev_dev_linux, netdev_dev);
437 static struct netdev_linux *
438 netdev_linux_cast(const struct netdev *netdev)
440 struct netdev_dev *netdev_dev = netdev_get_dev(netdev);
441 const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
442 assert(is_netdev_linux_class(netdev_class));
444 return CONTAINER_OF(netdev, struct netdev_linux, netdev);
448 netdev_linux_init(void)
450 static int status = -1;
452 /* Create AF_INET socket. */
453 af_inet_sock = socket(AF_INET, SOCK_DGRAM, 0);
454 status = af_inet_sock >= 0 ? 0 : errno;
456 VLOG_ERR("failed to create inet socket: %s", strerror(status));
459 /* Create rtnetlink socket. */
461 status = nl_sock_create(NETLINK_ROUTE, &rtnl_sock);
463 VLOG_ERR_RL(&rl, "failed to create rtnetlink socket: %s",
472 netdev_linux_run(void)
474 rtnetlink_link_run();
475 netdev_linux_miimon_run();
479 netdev_linux_wait(void)
481 rtnetlink_link_wait();
482 netdev_linux_miimon_wait();
486 netdev_dev_linux_changed(struct netdev_dev_linux *dev)
489 if (!dev->change_seq) {
492 dev->cache_valid = 0;
496 netdev_linux_cache_cb(const struct rtnetlink_link_change *change,
497 void *aux OVS_UNUSED)
499 struct netdev_dev_linux *dev;
501 struct netdev_dev *base_dev = netdev_dev_from_name(change->ifname);
503 const struct netdev_class *netdev_class =
504 netdev_dev_get_class(base_dev);
506 if (is_netdev_linux_class(netdev_class)) {
507 dev = netdev_dev_linux_cast(base_dev);
509 if (dev->carrier != change->running) {
510 dev->carrier = change->running;
511 dev->carrier_resets++;
514 netdev_dev_linux_changed(dev);
518 struct shash device_shash;
519 struct shash_node *node;
521 shash_init(&device_shash);
522 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
523 SHASH_FOR_EACH (node, &device_shash) {
529 get_flags(&dev->netdev_dev, &flags);
530 carrier = (flags & IFF_RUNNING) != 0;
531 if (dev->carrier != carrier) {
532 dev->carrier = carrier;
533 dev->carrier_resets++;
536 netdev_dev_linux_changed(dev);
538 shash_destroy(&device_shash);
543 cache_notifier_ref(void)
545 if (!cache_notifier_refcount) {
546 assert(!netdev_linux_cache_notifier);
548 netdev_linux_cache_notifier =
549 rtnetlink_link_notifier_create(netdev_linux_cache_cb, NULL);
551 if (!netdev_linux_cache_notifier) {
555 cache_notifier_refcount++;
561 cache_notifier_unref(void)
563 assert(cache_notifier_refcount > 0);
564 if (!--cache_notifier_refcount) {
565 assert(netdev_linux_cache_notifier);
566 rtnetlink_link_notifier_destroy(netdev_linux_cache_notifier);
567 netdev_linux_cache_notifier = NULL;
571 /* Creates system and internal devices. */
573 netdev_linux_create(const struct netdev_class *class, const char *name,
574 struct netdev_dev **netdev_devp)
576 struct netdev_dev_linux *netdev_dev;
580 error = cache_notifier_ref();
585 netdev_dev = xzalloc(sizeof *netdev_dev);
586 netdev_dev->change_seq = 1;
587 netdev_dev_init(&netdev_dev->netdev_dev, name, class);
588 get_flags(&netdev_dev->netdev_dev, &flags);
589 netdev_dev->carrier = (flags & IFF_RUNNING) != 0;
591 *netdev_devp = &netdev_dev->netdev_dev;
595 /* For most types of netdevs we open the device for each call of
596 * netdev_open(). However, this is not the case with tap devices,
597 * since it is only possible to open the device once. In this
598 * situation we share a single file descriptor, and consequently
599 * buffers, across all readers. Therefore once data is read it will
600 * be unavailable to other reads for tap devices. */
602 netdev_linux_create_tap(const struct netdev_class *class OVS_UNUSED,
603 const char *name, struct netdev_dev **netdev_devp)
605 struct netdev_dev_linux *netdev_dev;
606 struct tap_state *state;
607 static const char tap_dev[] = "/dev/net/tun";
611 netdev_dev = xzalloc(sizeof *netdev_dev);
612 state = &netdev_dev->state.tap;
614 error = cache_notifier_ref();
619 /* Open tap device. */
620 state->fd = open(tap_dev, O_RDWR);
623 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, strerror(error));
624 goto error_unref_notifier;
627 /* Create tap device. */
628 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
629 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
630 if (ioctl(state->fd, TUNSETIFF, &ifr) == -1) {
631 VLOG_WARN("%s: creating tap device failed: %s", name,
634 goto error_unref_notifier;
637 /* Make non-blocking. */
638 error = set_nonblocking(state->fd);
640 goto error_unref_notifier;
643 netdev_dev_init(&netdev_dev->netdev_dev, name, &netdev_tap_class);
644 *netdev_devp = &netdev_dev->netdev_dev;
647 error_unref_notifier:
648 cache_notifier_unref();
655 destroy_tap(struct netdev_dev_linux *netdev_dev)
657 struct tap_state *state = &netdev_dev->state.tap;
659 if (state->fd >= 0) {
664 /* Destroys the netdev device 'netdev_dev_'. */
666 netdev_linux_destroy(struct netdev_dev *netdev_dev_)
668 struct netdev_dev_linux *netdev_dev = netdev_dev_linux_cast(netdev_dev_);
669 const struct netdev_class *class = netdev_dev_get_class(netdev_dev_);
671 if (netdev_dev->tc && netdev_dev->tc->ops->tc_destroy) {
672 netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
675 if (class == &netdev_tap_class) {
676 destroy_tap(netdev_dev);
680 cache_notifier_unref();
684 netdev_linux_open(struct netdev_dev *netdev_dev_, struct netdev **netdevp)
686 struct netdev_dev_linux *netdev_dev = netdev_dev_linux_cast(netdev_dev_);
687 struct netdev_linux *netdev;
688 enum netdev_flags flags;
691 /* Allocate network device. */
692 netdev = xzalloc(sizeof *netdev);
694 netdev_init(&netdev->netdev, netdev_dev_);
696 /* Verify that the device really exists, by attempting to read its flags.
697 * (The flags might be cached, in which case this won't actually do an
700 * Don't do this for "internal" netdevs, though, because those have to be
701 * created as netdev objects before they exist in the kernel, because
702 * creating them in the kernel happens by passing a netdev object to
703 * dpif_port_add(). */
704 if (netdev_dev_get_class(netdev_dev_) != &netdev_internal_class) {
705 error = netdev_get_flags(&netdev->netdev, &flags);
706 if (error == ENODEV) {
711 if (!strcmp(netdev_dev_get_type(netdev_dev_), "tap") &&
712 !netdev_dev->state.tap.opened) {
714 /* We assume that the first user of the tap device is the primary user
715 * and give them the tap FD. Subsequent users probably just expect
716 * this to be a system device so open it normally to avoid send/receive
717 * directions appearing to be reversed. */
718 netdev->fd = netdev_dev->state.tap.fd;
719 netdev_dev->state.tap.opened = true;
722 *netdevp = &netdev->netdev;
726 netdev_uninit(&netdev->netdev, true);
730 /* Closes and destroys 'netdev'. */
732 netdev_linux_close(struct netdev *netdev_)
734 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
736 if (netdev->fd > 0 && strcmp(netdev_get_type(netdev_), "tap")) {
743 netdev_linux_listen(struct netdev *netdev_)
745 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
746 struct sockaddr_ll sll;
751 if (netdev->fd >= 0) {
755 /* Create file descriptor. */
756 fd = socket(PF_PACKET, SOCK_RAW, 0);
759 VLOG_ERR("failed to create raw socket (%s)", strerror(error));
763 /* Set non-blocking mode. */
764 error = set_nonblocking(fd);
769 /* Get ethernet device index. */
770 error = get_ifindex(&netdev->netdev, &ifindex);
775 /* Bind to specific ethernet device. */
776 memset(&sll, 0, sizeof sll);
777 sll.sll_family = AF_PACKET;
778 sll.sll_ifindex = ifindex;
779 sll.sll_protocol = (OVS_FORCE unsigned short int) htons(ETH_P_ALL);
780 if (bind(fd, (struct sockaddr *) &sll, sizeof sll) < 0) {
782 VLOG_ERR("%s: failed to bind raw socket (%s)",
783 netdev_get_name(netdev_), strerror(error));
798 netdev_linux_recv(struct netdev *netdev_, void *data, size_t size)
800 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
802 if (netdev->fd < 0) {
803 /* Device is not listening. */
808 ssize_t retval = recv(netdev->fd, data, size, MSG_TRUNC);
810 return retval <= size ? retval : -EMSGSIZE;
811 } else if (errno != EINTR) {
812 if (errno != EAGAIN) {
813 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
814 strerror(errno), netdev_get_name(netdev_));
821 /* Registers with the poll loop to wake up from the next call to poll_block()
822 * when a packet is ready to be received with netdev_recv() on 'netdev'. */
824 netdev_linux_recv_wait(struct netdev *netdev_)
826 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
827 if (netdev->fd >= 0) {
828 poll_fd_wait(netdev->fd, POLLIN);
832 /* Discards all packets waiting to be received from 'netdev'. */
834 netdev_linux_drain(struct netdev *netdev_)
836 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
837 if (netdev->fd < 0) {
839 } else if (!strcmp(netdev_get_type(netdev_), "tap")) {
841 int error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
842 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
846 drain_fd(netdev->fd, ifr.ifr_qlen);
849 return drain_rcvbuf(netdev->fd);
853 /* Sends 'buffer' on 'netdev'. Returns 0 if successful, otherwise a positive
854 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
855 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
856 * the packet is too big or too small to transmit on the device.
858 * The caller retains ownership of 'buffer' in all cases.
860 * The kernel maintains a packet transmission queue, so the caller is not
861 * expected to do additional queuing of packets. */
863 netdev_linux_send(struct netdev *netdev_, const void *data, size_t size)
865 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
869 if (netdev->fd < 0) {
870 /* Use our AF_PACKET socket to send to this device. */
871 struct sockaddr_ll sll;
878 sock = af_packet_sock();
883 error = get_ifindex(netdev_, &ifindex);
888 /* We don't bother setting most fields in sockaddr_ll because the
889 * kernel ignores them for SOCK_RAW. */
890 memset(&sll, 0, sizeof sll);
891 sll.sll_family = AF_PACKET;
892 sll.sll_ifindex = ifindex;
894 iov.iov_base = (void *) data;
898 msg.msg_namelen = sizeof sll;
901 msg.msg_control = NULL;
902 msg.msg_controllen = 0;
905 retval = sendmsg(sock, &msg, 0);
907 /* Use the netdev's own fd to send to this device. This is
908 * essential for tap devices, because packets sent to a tap device
909 * with an AF_PACKET socket will loop back to be *received* again
910 * on the tap device. */
911 retval = write(netdev->fd, data, size);
915 /* The Linux AF_PACKET implementation never blocks waiting for room
916 * for packets, instead returning ENOBUFS. Translate this into
917 * EAGAIN for the caller. */
918 if (errno == ENOBUFS) {
920 } else if (errno == EINTR) {
922 } else if (errno != EAGAIN) {
923 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
924 netdev_get_name(netdev_), strerror(errno));
927 } else if (retval != size) {
928 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%zd bytes of "
929 "%zu) on %s", retval, size, netdev_get_name(netdev_));
937 /* Registers with the poll loop to wake up from the next call to poll_block()
938 * when the packet transmission queue has sufficient room to transmit a packet
939 * with netdev_send().
941 * The kernel maintains a packet transmission queue, so the client is not
942 * expected to do additional queuing of packets. Thus, this function is
943 * unlikely to ever be used. It is included for completeness. */
945 netdev_linux_send_wait(struct netdev *netdev_)
947 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
948 if (netdev->fd < 0) {
950 } else if (strcmp(netdev_get_type(netdev_), "tap")) {
951 poll_fd_wait(netdev->fd, POLLOUT);
953 /* TAP device always accepts packets.*/
954 poll_immediate_wake();
958 /* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
959 * otherwise a positive errno value. */
961 netdev_linux_set_etheraddr(struct netdev *netdev_,
962 const uint8_t mac[ETH_ADDR_LEN])
964 struct netdev_dev_linux *netdev_dev =
965 netdev_dev_linux_cast(netdev_get_dev(netdev_));
968 if (!(netdev_dev->cache_valid & VALID_ETHERADDR)
969 || !eth_addr_equals(netdev_dev->etheraddr, mac)) {
970 error = set_etheraddr(netdev_get_name(netdev_), ARPHRD_ETHER, mac);
972 netdev_dev->cache_valid |= VALID_ETHERADDR;
973 memcpy(netdev_dev->etheraddr, mac, ETH_ADDR_LEN);
981 /* Copies 'netdev''s MAC address into 'mac', a caller-provided buffer of
982 * ETH_ADDR_LEN bytes. */
984 netdev_linux_get_etheraddr(const struct netdev *netdev_,
985 uint8_t mac[ETH_ADDR_LEN])
987 struct netdev_dev_linux *netdev_dev =
988 netdev_dev_linux_cast(netdev_get_dev(netdev_));
989 if (!(netdev_dev->cache_valid & VALID_ETHERADDR)) {
990 int error = get_etheraddr(netdev_get_name(netdev_),
991 netdev_dev->etheraddr);
995 netdev_dev->cache_valid |= VALID_ETHERADDR;
997 memcpy(mac, netdev_dev->etheraddr, ETH_ADDR_LEN);
1001 /* Returns the maximum size of transmitted (and received) packets on 'netdev',
1002 * in bytes, not including the hardware header; thus, this is typically 1500
1003 * bytes for Ethernet devices. */
1005 netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
1007 struct netdev_dev_linux *netdev_dev =
1008 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1009 if (!(netdev_dev->cache_valid & VALID_MTU)) {
1013 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
1014 SIOCGIFMTU, "SIOCGIFMTU");
1018 netdev_dev->mtu = ifr.ifr_mtu;
1019 netdev_dev->cache_valid |= VALID_MTU;
1021 *mtup = netdev_dev->mtu;
1025 /* Sets the maximum transmission unit (MTU) of the given device using the
1026 * Linux networking ioctl interface.
1029 netdev_linux_set_mtu(const struct netdev *netdev_, int mtu)
1031 struct netdev_dev_linux *netdev_dev =
1032 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1036 if (netdev_dev->cache_valid & VALID_MTU &&
1037 netdev_dev->mtu == mtu) {
1041 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
1042 SIOCSIFMTU, "SIOCSIFMTU");
1047 netdev_dev->mtu = ifr.ifr_mtu;
1048 netdev_dev->cache_valid |= VALID_MTU;
/* Looks up the ifindex of 'netdev'.  On success, returns the ifindex as a
 * positive number.  On failure, returns a negative errno value. */
static int
netdev_linux_get_ifindex(const struct netdev *netdev)
{
    int ifindex;
    int error = get_ifindex(netdev, &ifindex);

    if (error) {
        return -error;
    }
    return ifindex;
}
1064 netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
1066 struct netdev_dev_linux *netdev_dev =
1067 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1069 if (netdev_dev->miimon_interval > 0) {
1070 *carrier = netdev_dev->miimon;
1072 *carrier = netdev_dev->carrier;
1078 static long long int
1079 netdev_linux_get_carrier_resets(const struct netdev *netdev)
1081 return netdev_dev_linux_cast(netdev_get_dev(netdev))->carrier_resets;
1085 netdev_linux_do_miimon(const char *name, int cmd, const char *cmd_name,
1086 struct mii_ioctl_data *data)
1091 memset(&ifr, 0, sizeof ifr);
1092 memcpy(&ifr.ifr_data, data, sizeof *data);
1093 error = netdev_linux_do_ioctl(name, &ifr, cmd, cmd_name);
1094 memcpy(data, &ifr.ifr_data, sizeof *data);
1100 netdev_linux_get_miimon(const char *name, bool *miimon)
1102 struct mii_ioctl_data data;
1107 memset(&data, 0, sizeof data);
1108 error = netdev_linux_do_miimon(name, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
1110 /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
1111 data.reg_num = MII_BMSR;
1112 error = netdev_linux_do_miimon(name, SIOCGMIIREG, "SIOCGMIIREG",
1116 *miimon = !!(data.val_out & BMSR_LSTATUS);
1118 VLOG_WARN_RL(&rl, "%s: failed to query MII", name);
1121 struct ethtool_cmd ecmd;
1123 VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
1126 memset(&ecmd, 0, sizeof ecmd);
1127 error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
1130 struct ethtool_value eval;
1132 memcpy(&eval, &ecmd, sizeof eval);
1133 *miimon = !!eval.data;
1135 VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
1143 netdev_linux_set_miimon_interval(struct netdev *netdev_,
1144 long long int interval)
1146 struct netdev_dev_linux *netdev_dev;
1148 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev_));
1150 interval = interval > 0 ? MAX(interval, 100) : 0;
1151 if (netdev_dev->miimon_interval != interval) {
1152 netdev_dev->miimon_interval = interval;
1153 timer_set_expired(&netdev_dev->miimon_timer);
1160 netdev_linux_miimon_run(void)
1162 struct shash device_shash;
1163 struct shash_node *node;
1165 shash_init(&device_shash);
1166 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
1167 SHASH_FOR_EACH (node, &device_shash) {
1168 struct netdev_dev_linux *dev = node->data;
1171 if (dev->miimon_interval <= 0 || !timer_expired(&dev->miimon_timer)) {
1175 netdev_linux_get_miimon(dev->netdev_dev.name, &miimon);
1176 if (miimon != dev->miimon) {
1177 dev->miimon = miimon;
1178 netdev_dev_linux_changed(dev);
1181 timer_set_duration(&dev->miimon_timer, dev->miimon_interval);
1184 shash_destroy(&device_shash);
1188 netdev_linux_miimon_wait(void)
1190 struct shash device_shash;
1191 struct shash_node *node;
1193 shash_init(&device_shash);
1194 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
1195 SHASH_FOR_EACH (node, &device_shash) {
1196 struct netdev_dev_linux *dev = node->data;
1198 if (dev->miimon_interval > 0) {
1199 timer_wait(&dev->miimon_timer);
1202 shash_destroy(&device_shash);
1205 /* Checks whether we can use RTM_GETLINK to get network device statistics.
1206 * In pre-2.6.19 kernels, this was only available if wireless extensions were
1209 check_for_working_netlink_stats(void)
1211 /* Decide on the netdev_get_stats() implementation to use. Netlink is
1212 * preferable, so if that works, we'll use it. */
1213 int ifindex = do_get_ifindex("lo");
1215 VLOG_WARN("failed to get ifindex for lo, "
1216 "obtaining netdev stats from proc");
1219 struct netdev_stats stats;
1220 int error = get_stats_via_netlink(ifindex, &stats);
1222 VLOG_DBG("obtaining netdev stats via rtnetlink");
1225 VLOG_INFO("RTM_GETLINK failed (%s), obtaining netdev stats "
1226 "via proc (you are probably running a pre-2.6.19 "
1227 "kernel)", strerror(error));
/* Helper taking two uint64_t pointers.  The body is not visible here;
 * presumably it exchanges '*a' and '*b' -- netdev_pseudo_get_stats() calls it
 * on pairs of rx/tx counters. */
1234 swap_uint64(uint64_t *a, uint64_t *b)
/* Tries to fetch statistics for 'netdev_' from the vport layer into 'stats'.
 * The attempt is made when vport stats are already known to work, or when
 * that has not been determined yet (VALID_HAVE_VPORT_STATS is not set in
 * 'cache_valid').  A failure is logged with a rate-limited warning.
 * 'have_vport_stats' then records whether the call succeeded and
 * VALID_HAVE_VPORT_STATS is set in 'cache_valid'. */
1242 get_stats_via_vport(const struct netdev *netdev_,
1243 struct netdev_stats *stats)
1245 struct netdev_dev_linux *netdev_dev =
1246 netdev_dev_linux_cast(netdev_get_dev(netdev_));
/* Either vport stats worked before, or they have never been tried. */
1248 if (netdev_dev->have_vport_stats ||
1249 !(netdev_dev->cache_valid & VALID_HAVE_VPORT_STATS)) {
1252 error = netdev_vport_get_stats(netdev_, stats);
1254 VLOG_WARN_RL(&rl, "%s: obtaining netdev stats via vport failed "
1255 "(%s)", netdev_get_name(netdev_), strerror(error));
/* Remember the outcome of this attempt. */
1257 netdev_dev->have_vport_stats = !error;
1258 netdev_dev->cache_valid |= VALID_HAVE_VPORT_STATS;
/* Fetches statistics for 'netdev_' from the kernel into 'stats', either via
 * netlink (looked up by ifindex) or via proc (looked up by name).  Which of
 * the two is used is decided on the first call by
 * check_for_working_netlink_stats() and kept in a function-local static.
 * A failure is logged with a rate-limited warning that includes the error
 * number. */
1263 netdev_linux_sys_get_stats(const struct netdev *netdev_,
1264 struct netdev_stats *stats)
/* -1 means "not decided yet". */
1266 static int use_netlink_stats = -1;
1269 if (use_netlink_stats < 0) {
1270 use_netlink_stats = check_for_working_netlink_stats();
1273 if (use_netlink_stats) {
1276 error = get_ifindex(netdev_, &ifindex);
1278 error = get_stats_via_netlink(ifindex, stats);
1281 error = get_stats_via_proc(netdev_get_name(netdev_), stats);
1285 VLOG_WARN_RL(&rl, "%s: linux-sys get stats failed %d",
1286 netdev_get_name(netdev_), error);
1292 /* Retrieves current device stats for 'netdev-linux'. */
/* Two sources are consulted: the vport layer writes directly into 'stats',
 * and the kernel (see netdev_linux_sys_get_stats()) writes into the local
 * 'dev_stats'.  The visible statements below add the kernel's error, drop,
 * multicast and collision counters onto 'stats'; the exact branch structure
 * around them is not fully visible in this view. */
1294 netdev_linux_get_stats(const struct netdev *netdev_,
1295 struct netdev_stats *stats)
1297 struct netdev_dev_linux *netdev_dev =
1298 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1299 struct netdev_stats dev_stats;
/* First whatever the vport layer can provide ... */
1302 get_stats_via_vport(netdev_, stats);
/* ... then the kernel's own counters, into a scratch copy. */
1304 error = netdev_linux_sys_get_stats(netdev_, &dev_stats);
1307 if (!netdev_dev->have_vport_stats) {
1314 if (!netdev_dev->have_vport_stats) {
1315 /* stats not available from OVS then use ioctl stats. */
/* Accumulate the kernel's counters onto the values already in 'stats'. */
1318 stats->rx_errors += dev_stats.rx_errors;
1319 stats->tx_errors += dev_stats.tx_errors;
1320 stats->rx_dropped += dev_stats.rx_dropped;
1321 stats->tx_dropped += dev_stats.tx_dropped;
1322 stats->multicast += dev_stats.multicast;
1323 stats->collisions += dev_stats.collisions;
1324 stats->rx_length_errors += dev_stats.rx_length_errors;
1325 stats->rx_over_errors += dev_stats.rx_over_errors;
1326 stats->rx_crc_errors += dev_stats.rx_crc_errors;
1327 stats->rx_frame_errors += dev_stats.rx_frame_errors;
1328 stats->rx_fifo_errors += dev_stats.rx_fifo_errors;
1329 stats->rx_missed_errors += dev_stats.rx_missed_errors;
1330 stats->tx_aborted_errors += dev_stats.tx_aborted_errors;
1331 stats->tx_carrier_errors += dev_stats.tx_carrier_errors;
1332 stats->tx_fifo_errors += dev_stats.tx_fifo_errors;
1333 stats->tx_heartbeat_errors += dev_stats.tx_heartbeat_errors;
1334 stats->tx_window_errors += dev_stats.tx_window_errors;
1339 /* Retrieves current device stats for 'netdev-tap' netdev or
1340 * netdev-internal. */
/* As in netdev_linux_get_stats(), the vport layer fills 'stats' and the
 * kernel fills the local 'dev_stats'.  Further down, the visible statements
 * swap the rx/tx packet, byte, error and drop counters in 'stats', zero the
 * detailed rx_*_errors and tx_*_errors fields, and add the kernel's counters
 * with rx and tx crossed over (kernel tx_dropped onto rx_dropped, and so
 * on).  The branch structure selecting between those paths is not fully
 * visible in this view. */
1342 netdev_pseudo_get_stats(const struct netdev *netdev_,
1343 struct netdev_stats *stats)
1345 struct netdev_dev_linux *netdev_dev =
1346 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1347 struct netdev_stats dev_stats;
1350 get_stats_via_vport(netdev_, stats);
1352 error = netdev_linux_sys_get_stats(netdev_, &dev_stats);
1354 if (!netdev_dev->have_vport_stats) {
1361 /* If this port is an internal port then the transmit and receive stats
1362 * will appear to be swapped relative to the other ports since we are the
1363 * one sending the data, not a remote computer. For consistency, we swap
1364 * them back here. This does not apply if we are getting stats from the
1365 * vport layer because it always tracks stats from the perspective of the
1367 if (!netdev_dev->have_vport_stats) {
1369 swap_uint64(&stats->rx_packets, &stats->tx_packets);
1370 swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
1371 swap_uint64(&stats->rx_errors, &stats->tx_errors);
1372 swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
1373 stats->rx_length_errors = 0;
1374 stats->rx_over_errors = 0;
1375 stats->rx_crc_errors = 0;
1376 stats->rx_frame_errors = 0;
1377 stats->rx_fifo_errors = 0;
1378 stats->rx_missed_errors = 0;
1379 stats->tx_aborted_errors = 0;
1380 stats->tx_carrier_errors = 0;
1381 stats->tx_fifo_errors = 0;
1382 stats->tx_heartbeat_errors = 0;
1383 stats->tx_window_errors = 0;
1385 stats->rx_dropped += dev_stats.tx_dropped;
1386 stats->tx_dropped += dev_stats.rx_dropped;
1388 stats->rx_errors += dev_stats.tx_errors;
1389 stats->tx_errors += dev_stats.rx_errors;
1391 stats->multicast += dev_stats.multicast;
1392 stats->collisions += dev_stats.collisions;
1397 /* Stores the features supported by 'netdev' into each of '*current',
1398 * '*advertised', '*supported', and '*peer' that are non-null. Each value is a
1399 * bitmap of "enum ofp_port_features" bits, in host byte order. Returns 0 if
1400 * successful, otherwise a positive errno value. */
/* NOTE(review): the assignments visible below write through 'supported',
 * 'advertised', 'current' and 'peer' without a visible null check; confirm
 * that callers always pass valid pointers for all four. */
1402 netdev_linux_get_features(const struct netdev *netdev,
1403 uint32_t *current, uint32_t *advertised,
1404 uint32_t *supported, uint32_t *peer)
1406 struct ethtool_cmd ecmd;
/* Read the link settings with ETHTOOL_GSET; everything below is a
 * translation of the returned ethtool_cmd fields into OFPPF_* bits. */
1409 memset(&ecmd, 0, sizeof ecmd);
1410 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1411 ETHTOOL_GSET, "ETHTOOL_GSET");
1416 /* Supported features. */
1418 if (ecmd.supported & SUPPORTED_10baseT_Half) {
1419 *supported |= OFPPF_10MB_HD;
1421 if (ecmd.supported & SUPPORTED_10baseT_Full) {
1422 *supported |= OFPPF_10MB_FD;
1424 if (ecmd.supported & SUPPORTED_100baseT_Half) {
1425 *supported |= OFPPF_100MB_HD;
1427 if (ecmd.supported & SUPPORTED_100baseT_Full) {
1428 *supported |= OFPPF_100MB_FD;
1430 if (ecmd.supported & SUPPORTED_1000baseT_Half) {
1431 *supported |= OFPPF_1GB_HD;
1433 if (ecmd.supported & SUPPORTED_1000baseT_Full) {
1434 *supported |= OFPPF_1GB_FD;
1436 if (ecmd.supported & SUPPORTED_10000baseT_Full) {
1437 *supported |= OFPPF_10GB_FD;
1439 if (ecmd.supported & SUPPORTED_TP) {
1440 *supported |= OFPPF_COPPER;
1442 if (ecmd.supported & SUPPORTED_FIBRE) {
1443 *supported |= OFPPF_FIBER;
1445 if (ecmd.supported & SUPPORTED_Autoneg) {
1446 *supported |= OFPPF_AUTONEG;
1448 if (ecmd.supported & SUPPORTED_Pause) {
1449 *supported |= OFPPF_PAUSE;
1451 if (ecmd.supported & SUPPORTED_Asym_Pause) {
1452 *supported |= OFPPF_PAUSE_ASYM;
1455 /* Advertised features. */
1457 if (ecmd.advertising & ADVERTISED_10baseT_Half) {
1458 *advertised |= OFPPF_10MB_HD;
1460 if (ecmd.advertising & ADVERTISED_10baseT_Full) {
1461 *advertised |= OFPPF_10MB_FD;
1463 if (ecmd.advertising & ADVERTISED_100baseT_Half) {
1464 *advertised |= OFPPF_100MB_HD;
1466 if (ecmd.advertising & ADVERTISED_100baseT_Full) {
1467 *advertised |= OFPPF_100MB_FD;
1469 if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
1470 *advertised |= OFPPF_1GB_HD;
1472 if (ecmd.advertising & ADVERTISED_1000baseT_Full) {
1473 *advertised |= OFPPF_1GB_FD;
1475 if (ecmd.advertising & ADVERTISED_10000baseT_Full) {
1476 *advertised |= OFPPF_10GB_FD;
1478 if (ecmd.advertising & ADVERTISED_TP) {
1479 *advertised |= OFPPF_COPPER;
1481 if (ecmd.advertising & ADVERTISED_FIBRE) {
1482 *advertised |= OFPPF_FIBER;
1484 if (ecmd.advertising & ADVERTISED_Autoneg) {
1485 *advertised |= OFPPF_AUTONEG;
1487 if (ecmd.advertising & ADVERTISED_Pause) {
1488 *advertised |= OFPPF_PAUSE;
1490 if (ecmd.advertising & ADVERTISED_Asym_Pause) {
1491 *advertised |= OFPPF_PAUSE_ASYM;
1494 /* Current settings. */
/* Speed and duplex select exactly one rate bit; 10G is reported as full
 * duplex regardless of 'ecmd.duplex'. */
1495 if (ecmd.speed == SPEED_10) {
1496 *current = ecmd.duplex ? OFPPF_10MB_FD : OFPPF_10MB_HD;
1497 } else if (ecmd.speed == SPEED_100) {
1498 *current = ecmd.duplex ? OFPPF_100MB_FD : OFPPF_100MB_HD;
1499 } else if (ecmd.speed == SPEED_1000) {
1500 *current = ecmd.duplex ? OFPPF_1GB_FD : OFPPF_1GB_HD;
1501 } else if (ecmd.speed == SPEED_10000) {
1502 *current = OFPPF_10GB_FD;
/* Add the medium reported by ethtool, if it is twisted pair or fibre. */
1507 if (ecmd.port == PORT_TP) {
1508 *current |= OFPPF_COPPER;
1509 } else if (ecmd.port == PORT_FIBRE) {
1510 *current |= OFPPF_FIBER;
1514 *current |= OFPPF_AUTONEG;
1517 /* Peer advertisements. */
1518 *peer = 0; /* XXX */
1523 /* Set the features advertised by 'netdev' to 'advertise'. */
/* 'advertise' is a bitmap of OFPPF_* bits; each recognised bit is mapped to
 * the corresponding ethtool ADVERTISED_* bit.  Returns the result of the
 * final ETHTOOL_SSET call. */
1525 netdev_linux_set_advertisements(struct netdev *netdev, uint32_t advertise)
1527 struct ethtool_cmd ecmd;
/* Read the current settings first, so that the ETHTOOL_SSET below writes
 * back the same structure with only 'advertising' replaced. */
1530 memset(&ecmd, 0, sizeof ecmd);
1531 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1532 ETHTOOL_GSET, "ETHTOOL_GSET");
/* Rebuild the advertised mask from scratch. */
1537 ecmd.advertising = 0;
1538 if (advertise & OFPPF_10MB_HD) {
1539 ecmd.advertising |= ADVERTISED_10baseT_Half;
1541 if (advertise & OFPPF_10MB_FD) {
1542 ecmd.advertising |= ADVERTISED_10baseT_Full;
1544 if (advertise & OFPPF_100MB_HD) {
1545 ecmd.advertising |= ADVERTISED_100baseT_Half;
1547 if (advertise & OFPPF_100MB_FD) {
1548 ecmd.advertising |= ADVERTISED_100baseT_Full;
1550 if (advertise & OFPPF_1GB_HD) {
1551 ecmd.advertising |= ADVERTISED_1000baseT_Half;
1553 if (advertise & OFPPF_1GB_FD) {
1554 ecmd.advertising |= ADVERTISED_1000baseT_Full;
1556 if (advertise & OFPPF_10GB_FD) {
1557 ecmd.advertising |= ADVERTISED_10000baseT_Full;
1559 if (advertise & OFPPF_COPPER) {
1560 ecmd.advertising |= ADVERTISED_TP;
1562 if (advertise & OFPPF_FIBER) {
1563 ecmd.advertising |= ADVERTISED_FIBRE;
1565 if (advertise & OFPPF_AUTONEG) {
1566 ecmd.advertising |= ADVERTISED_Autoneg;
1568 if (advertise & OFPPF_PAUSE) {
1569 ecmd.advertising |= ADVERTISED_Pause;
1571 if (advertise & OFPPF_PAUSE_ASYM) {
1572 ecmd.advertising |= ADVERTISED_Asym_Pause;
/* Write the modified settings back to the device. */
1574 return netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1575 ETHTOOL_SSET, "ETHTOOL_SSET");
1578 /* Attempts to set input rate limiting (policing) policy. Returns 0 if
1579 * successful, otherwise a positive errno value. */
/* 'kbits_rate' and 'kbits_burst' are the requested rate and burst.  The
 * visible steps are: normalise the burst, return early when the cached
 * settings already match, remove any existing ingress qdisc, add a new
 * ingress qdisc plus a policer, and finally cache the applied values.  Each
 * failing step logs a rate-limited warning. */
1581 netdev_linux_set_policing(struct netdev *netdev,
1582 uint32_t kbits_rate, uint32_t kbits_burst)
1584 struct netdev_dev_linux *netdev_dev =
1585 netdev_dev_linux_cast(netdev_get_dev(netdev));
1586 const char *netdev_name = netdev_get_name(netdev);
1589 COVERAGE_INC(netdev_set_policing);
1591 kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
1592 : !kbits_burst ? 1000 /* Default to 1000 kbits if 0. */
1593 : kbits_burst); /* Stick with user-specified value. */
/* The comparison uses the normalised burst computed just above. */
1595 if (netdev_dev->cache_valid & VALID_POLICING
1596 && netdev_dev->kbits_rate == kbits_rate
1597 && netdev_dev->kbits_burst == kbits_burst) {
1598 /* Assume that settings haven't changed since we last set them. */
1602 /* Remove any existing ingress qdisc. */
1603 error = tc_add_del_ingress_qdisc(netdev, false);
1605 VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
1606 netdev_name, strerror(error));
/* Add a fresh ingress qdisc, then attach the policer to it.  (The condition
 * guarding these two steps is not visible in this view.) */
1611 error = tc_add_del_ingress_qdisc(netdev, true);
1613 VLOG_WARN_RL(&rl, "%s: adding policing qdisc failed: %s",
1614 netdev_name, strerror(error));
1618 error = tc_add_policer(netdev, kbits_rate, kbits_burst);
1620 VLOG_WARN_RL(&rl, "%s: adding policing action failed: %s",
1621 netdev_name, strerror(error));
/* Remember what was applied so that an identical request can be skipped. */
1626 netdev_dev->kbits_rate = kbits_rate;
1627 netdev_dev->kbits_burst = kbits_burst;
1628 netdev_dev->cache_valid |= VALID_POLICING;
/* Adds to 'types' the 'ovs_name' of every entry in the NULL-terminated 'tcs'
 * array that has a 'tc_install' callback and a non-empty 'ovs_name'.
 * 'netdev' is unused. */
1634 netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
1637 const struct tc_ops **opsp;
1639 for (opsp = tcs; *opsp != NULL; opsp++) {
1640 const struct tc_ops *ops = *opsp;
1641 if (ops->tc_install && ops->ovs_name[0] != '\0') {
1642 sset_add(types, ops->ovs_name);
/* Searches the NULL-terminated 'tcs' array for an entry whose 'ovs_name'
 * equals 'name'.  The return statements are not visible here; callers test
 * the result for null, so presumably the match or NULL is returned. */
1648 static const struct tc_ops *
1649 tc_lookup_ovs_name(const char *name)
1651 const struct tc_ops **opsp;
1653 for (opsp = tcs; *opsp != NULL; opsp++) {
1654 const struct tc_ops *ops = *opsp;
1655 if (!strcmp(name, ops->ovs_name)) {
/* Searches the NULL-terminated 'tcs' array for an entry whose 'linux_name'
 * equals 'name'.  Entries with a null 'linux_name' never match.  The return
 * statements are not visible here; presumably the match or NULL. */
1662 static const struct tc_ops *
1663 tc_lookup_linux_name(const char *name)
1665 const struct tc_ops **opsp;
1667 for (opsp = tcs; *opsp != NULL; opsp++) {
1668 const struct tc_ops *ops = *opsp;
/* 'linux_name' may be null, so check it before comparing. */
1669 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
/* Looks in the queue hmap of 'netdev''s traffic-control state, in the bucket
 * selected by 'hash', for a queue whose 'queue_id' equals 'queue_id'.
 * 'netdev_dev->tc' is dereferenced without a check here.  The return
 * statements are not visible; presumably the queue or NULL. */
1676 static struct tc_queue *
1677 tc_find_queue__(const struct netdev *netdev, unsigned int queue_id,
1680 struct netdev_dev_linux *netdev_dev =
1681 netdev_dev_linux_cast(netdev_get_dev(netdev));
1682 struct tc_queue *queue;
1684 HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev_dev->tc->queues) {
1685 if (queue->queue_id == queue_id) {
/* Convenience wrapper around tc_find_queue__() that computes the hash as
 * hash_int(queue_id, 0). */
1692 static struct tc_queue *
1693 tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
1695 return tc_find_queue__(netdev, queue_id, hash_int(queue_id, 0));
/* Looks up the tc_ops registered under the OVS QoS type name 'type' and
 * copies its 'n_queues' into 'caps->n_queues'.  'netdev' is unused.  The
 * handling of an unknown 'type' is not visible in this view. */
1699 netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
1701 struct netdev_qos_capabilities *caps)
1703 const struct tc_ops *ops = tc_lookup_ovs_name(type);
1707 caps->n_queues = ops->n_queues;
/* Queries the qdisc on 'netdev', stores the OVS name of its tc_ops in
 * '*typep', and, when the tc_ops provides a 'qdisc_get' callback, calls it
 * with 'details'.  (The alternative of the conditional when 'qdisc_get' is
 * null is not visible in this view.) */
1712 netdev_linux_get_qos(const struct netdev *netdev,
1713 const char **typep, struct shash *details)
1715 struct netdev_dev_linux *netdev_dev =
1716 netdev_dev_linux_cast(netdev_get_dev(netdev));
/* 'netdev_dev->tc' is dereferenced right after this call. */
1719 error = tc_query_qdisc(netdev);
1724 *typep = netdev_dev->tc->ops->ovs_name;
1725 return (netdev_dev->tc->ops->qdisc_get
1726 ? netdev_dev->tc->ops->qdisc_get(netdev, details)
/* Configures QoS of OVS type 'type' with parameters 'details' on 'netdev'.
 * If the requested tc_ops is the one already in place, only its optional
 * 'qdisc_set' callback is invoked (0 is returned when there is none).
 * Otherwise the existing qdisc is deleted and the new one is installed via
 * 'tc_install'. */
1731 netdev_linux_set_qos(struct netdev *netdev,
1732 const char *type, const struct shash *details)
1734 struct netdev_dev_linux *netdev_dev =
1735 netdev_dev_linux_cast(netdev_get_dev(netdev));
1736 const struct tc_ops *new_ops;
/* Unknown types and types that cannot be installed are rejected here (the
 * error value returned is not visible in this view). */
1739 new_ops = tc_lookup_ovs_name(type);
1740 if (!new_ops || !new_ops->tc_install) {
1744 error = tc_query_qdisc(netdev);
/* Same kind of qdisc as before: reconfigure in place. */
1749 if (new_ops == netdev_dev->tc->ops) {
1750 return new_ops->qdisc_set ? new_ops->qdisc_set(netdev, details) : 0;
1752 /* Delete existing qdisc. */
1753 error = tc_del_qdisc(netdev);
/* Deleting the qdisc is expected to have cleared 'netdev_dev->tc'. */
1757 assert(netdev_dev->tc == NULL);
1759 /* Install new qdisc. */
1760 error = new_ops->tc_install(netdev, details);
/* Invariant: 'tc' is set exactly when installation succeeded. */
1761 assert((error == 0) == (netdev_dev->tc != NULL));
/* Queries the qdisc on 'netdev', looks up queue 'queue_id' with
 * tc_find_queue(), and passes it to the tc_ops 'class_get' callback together
 * with 'details'.  Only one arm of the final conditional is visible here;
 * the behaviour for a missing queue is not shown. */
1768 netdev_linux_get_queue(const struct netdev *netdev,
1769 unsigned int queue_id, struct shash *details)
1771 struct netdev_dev_linux *netdev_dev =
1772 netdev_dev_linux_cast(netdev_get_dev(netdev));
1775 error = tc_query_qdisc(netdev);
1779 struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1781 ? netdev_dev->tc->ops->class_get(netdev, queue, details)
/* Queries the qdisc on 'netdev' and configures queue 'queue_id' through the
 * tc_ops 'class_set' callback with 'details'.  A 'queue_id' that is not
 * below the tc_ops 'n_queues', or a tc_ops without 'class_set', takes a
 * separate branch whose return value is not visible in this view. */
1787 netdev_linux_set_queue(struct netdev *netdev,
1788 unsigned int queue_id, const struct shash *details)
1790 struct netdev_dev_linux *netdev_dev =
1791 netdev_dev_linux_cast(netdev_get_dev(netdev));
1794 error = tc_query_qdisc(netdev);
1797 } else if (queue_id >= netdev_dev->tc->ops->n_queues
1798 || !netdev_dev->tc->ops->class_set) {
1802 return netdev_dev->tc->ops->class_set(netdev, queue_id, details);
/* Queries the qdisc on 'netdev', looks up queue 'queue_id', and passes it to
 * the tc_ops 'class_delete' callback.  A tc_ops without 'class_delete' takes
 * a separate branch; its return value, and the behaviour for a missing
 * queue, are not visible in this view. */
1806 netdev_linux_delete_queue(struct netdev *netdev, unsigned int queue_id)
1808 struct netdev_dev_linux *netdev_dev =
1809 netdev_dev_linux_cast(netdev_get_dev(netdev));
1812 error = tc_query_qdisc(netdev);
1815 } else if (!netdev_dev->tc->ops->class_delete) {
1818 struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1820 ? netdev_dev->tc->ops->class_delete(netdev, queue)
/* Queries the qdisc on 'netdev', looks up queue 'queue_id', and passes it to
 * the tc_ops 'class_get_stats' callback, which fills 'stats'.  A tc_ops
 * without 'class_get_stats' takes a separate branch; its return value, and
 * the behaviour for a missing queue, are not visible in this view. */
1826 netdev_linux_get_queue_stats(const struct netdev *netdev,
1827 unsigned int queue_id,
1828 struct netdev_queue_stats *stats)
1830 struct netdev_dev_linux *netdev_dev =
1831 netdev_dev_linux_cast(netdev_get_dev(netdev));
1834 error = tc_query_qdisc(netdev);
1837 } else if (!netdev_dev->tc->ops->class_get_stats) {
1840 const struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1842 ? netdev_dev->tc->ops->class_get_stats(netdev, queue, stats)
/* Starts a netlink dump of the traffic classes on 'netdev': builds an
 * RTM_GETTCLASS request with 'tcm_parent' 0, starts 'dump' on 'rtnl_sock'
 * with it, and releases the request buffer.  The caller
 * netdev_linux_dump_queue_stats() treats the result as a truth value (the
 * return statements are not visible in this view). */
1848 start_queue_dump(const struct netdev *netdev, struct nl_dump *dump)
1850 struct ofpbuf request;
1851 struct tcmsg *tcmsg;
1853 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
1857 tcmsg->tcm_parent = 0;
1858 nl_dump_start(dump, rtnl_sock, &request);
/* The request buffer is released right after the dump has been started. */
1859 ofpbuf_uninit(&request);
/* Iterates over the queues recorded in 'netdev''s traffic-control state.
 * For each queue, the tc_ops 'class_get' callback fills a scratch shash
 * with the queue's details, and 'cb' is invoked with the queue id, those
 * details and 'aux'.  A tc_ops without 'class_get' takes a separate branch
 * whose return value is not visible in this view. */
1864 netdev_linux_dump_queues(const struct netdev *netdev,
1865 netdev_dump_queues_cb *cb, void *aux)
1867 struct netdev_dev_linux *netdev_dev =
1868 netdev_dev_linux_cast(netdev_get_dev(netdev));
1869 struct tc_queue *queue;
1870 struct shash details;
1874 error = tc_query_qdisc(netdev);
1877 } else if (!netdev_dev->tc->ops->class_get) {
/* One scratch shash is reused for all queues and cleared per iteration. */
1882 shash_init(&details);
1883 HMAP_FOR_EACH (queue, hmap_node, &netdev_dev->tc->queues) {
1884 shash_clear(&details);
1886 error = netdev_dev->tc->ops->class_get(netdev, queue, &details);
1888 (*cb)(queue->queue_id, &details, aux);
1893 shash_destroy(&details);
/* Dumps per-queue statistics for 'netdev' by running a netlink class dump
 * and handing every reply message to the tc_ops 'class_dump_stats'
 * callback, together with 'cb' and 'aux'.  The final return value is the
 * error from nl_dump_done() if there is one, otherwise 'last_error' (whose
 * assignment is not visible in this view). */
1899 netdev_linux_dump_queue_stats(const struct netdev *netdev,
1900 netdev_dump_queue_stats_cb *cb, void *aux)
1902 struct netdev_dev_linux *netdev_dev =
1903 netdev_dev_linux_cast(netdev_get_dev(netdev));
1904 struct nl_dump dump;
1909 error = tc_query_qdisc(netdev);
1912 } else if (!netdev_dev->tc->ops->class_dump_stats) {
1917 if (!start_queue_dump(netdev, &dump)) {
/* Feed each dumped message to the qdisc-specific stats parser. */
1920 while (nl_dump_next(&dump, &msg)) {
1921 error = netdev_dev->tc->ops->class_dump_stats(netdev, &msg, cb, aux);
/* An error from finishing the dump takes precedence over 'last_error'. */
1927 error = nl_dump_done(&dump);
1928 return error ? error : last_error;
/* Stores the IPv4 address and netmask of 'netdev_' in '*address' and
 * '*netmask'.  When the VALID_IN4 cache bit is not set, both values are
 * first fetched with SIOCGIFADDR and SIOCGIFNETMASK and cached.  Returns
 * EADDRNOTAVAIL when the resulting address is INADDR_ANY, otherwise 0 (the
 * handling of ioctl errors is not visible in this view). */
1932 netdev_linux_get_in4(const struct netdev *netdev_,
1933 struct in_addr *address, struct in_addr *netmask)
1935 struct netdev_dev_linux *netdev_dev =
1936 netdev_dev_linux_cast(netdev_get_dev(netdev_));
/* Cache miss: query the kernel for both values. */
1938 if (!(netdev_dev->cache_valid & VALID_IN4)) {
1941 error = netdev_linux_get_ipv4(netdev_, &netdev_dev->address,
1942 SIOCGIFADDR, "SIOCGIFADDR");
1947 error = netdev_linux_get_ipv4(netdev_, &netdev_dev->netmask,
1948 SIOCGIFNETMASK, "SIOCGIFNETMASK");
1953 netdev_dev->cache_valid |= VALID_IN4;
1955 *address = netdev_dev->address;
1956 *netmask = netdev_dev->netmask;
/* An all-zero address is reported as "no address assigned". */
1957 return address->s_addr == INADDR_ANY ? EADDRNOTAVAIL : 0;
/* Assigns IPv4 'address' to 'netdev_' with SIOCSIFADDR and records 'address'
 * and 'netmask' in the device's cache (VALID_IN4).  The netmask itself is
 * only pushed to the kernel, with SIOCSIFNETMASK, when 'address' is not
 * INADDR_ANY.
 *
 * NOTE(review): in the visible statement order the cache is updated before
 * SIOCSIFNETMASK is issued, so if that second ioctl fails the cached
 * netmask may not match the device -- confirm whether this is intended. */
1961 netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
1962 struct in_addr netmask)
1964 struct netdev_dev_linux *netdev_dev =
1965 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1968 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
1970 netdev_dev->cache_valid |= VALID_IN4;
1971 netdev_dev->address = address;
1972 netdev_dev->netmask = netmask;
/* No netmask is set when the address is being cleared. */
1973 if (address.s_addr != INADDR_ANY) {
1974 error = do_set_addr(netdev_, SIOCSIFNETMASK,
1975 "SIOCSIFNETMASK", netmask);
/* Parses one text 'line' into 16 address bytes, stored in 'in6->s6_addr',
 * and an interface name of at most 16 characters, stored in 'ifname'.  Per
 * the visible format string, the line holds 16 two-digit hex bytes, four
 * hex fields that are skipped, and the name.  netdev_linux_get_in6() feeds
 * it lines from /proc/net/if_inet6 and treats the result as a truth value
 * (the return expression is not fully visible in this view). */
1982 parse_if_inet6_line(const char *line,
1983 struct in6_addr *in6, char ifname[16 + 1])
1985 uint8_t *s6 = in6->s6_addr;
1986 #define X8 "%2"SCNx8
1988 " "X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8
1989 "%*x %*x %*x %*x %16s\n",
1990 &s6[0], &s6[1], &s6[2], &s6[3],
1991 &s6[4], &s6[5], &s6[6], &s6[7],
1992 &s6[8], &s6[9], &s6[10], &s6[11],
1993 &s6[12], &s6[13], &s6[14], &s6[15],
1997 /* Stores in '*in6' the IPv6 address of 'netdev_' as listed in
 * /proc/net/if_inet6, or in6addr_any when no line for the device is found.
 * The result is cached in the device under the VALID_IN6 bit, so the file
 * is only read on a cache miss.
 *
 * NOTE(review): an earlier version of this comment said that a null 'in6'
 * is tolerated and that a boolean is returned.  The visible code writes
 * through 'in6' unconditionally, so callers must pass a non-null pointer;
 * the return statement is not visible in this view -- confirm the return
 * convention against the netdev provider interface. */
2000 netdev_linux_get_in6(const struct netdev *netdev_, struct in6_addr *in6)
2002 struct netdev_dev_linux *netdev_dev =
2003 netdev_dev_linux_cast(netdev_get_dev(netdev_));
2004 if (!(netdev_dev->cache_valid & VALID_IN6)) {
/* Default when nothing matches below. */
2008 netdev_dev->in6 = in6addr_any;
2010 file = fopen("/proc/net/if_inet6", "r");
2012 const char *name = netdev_get_name(netdev_);
/* Scan line by line for an entry whose interface name equals ours. */
2013 while (fgets(line, sizeof line, file)) {
2014 struct in6_addr in6_tmp;
2015 char ifname[16 + 1];
2016 if (parse_if_inet6_line(line, &in6_tmp, ifname)
2017 && !strcmp(name, ifname))
2019 netdev_dev->in6 = in6_tmp;
2025 netdev_dev->cache_valid |= VALID_IN6;
2027 *in6 = netdev_dev->in6;
/* Initializes '*sa' as an AF_INET socket address holding 'addr'.  A zeroed
 * struct sockaddr_in is filled in locally, then '*sa' is zeroed and the
 * sockaddr_in is copied over it. */
2032 make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
2034 struct sockaddr_in sin;
2035 memset(&sin, 0, sizeof sin);
2036 sin.sin_family = AF_INET;
2037 sin.sin_addr = addr;
/* Clear all of '*sa' first, then overlay the sockaddr_in bytes. */
2040 memset(sa, 0, sizeof *sa);
2041 memcpy(sa, &sin, sizeof sin);
/* Issues address-setting ioctl 'ioctl_nr' for 'netdev' with IPv4 address
 * 'addr'.  The request carries the device name and 'addr' encoded as an
 * AF_INET sockaddr; 'ioctl_name' is the ioctl's name as a string.  Returns
 * the result of netdev_linux_do_ioctl(). */
2045 do_set_addr(struct netdev *netdev,
2046 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
2049 ovs_strzcpy(ifr.ifr_name, netdev_get_name(netdev), sizeof ifr.ifr_name);
2050 make_in4_sockaddr(&ifr.ifr_addr, addr);
2052 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
2056 /* Adds 'router' as a default IP gateway. */
/* The route is added with the SIOCADDRT ioctl on 'af_inet_sock', using
 * destination and genmask INADDR_ANY and flags RTF_UP | RTF_GATEWAY.
 * 'netdev' is unused.  A failure is logged with the errno description. */
2058 netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
2060 struct in_addr any = { INADDR_ANY };
2064 memset(&rt, 0, sizeof rt);
/* Destination 0.0.0.0 with mask 0.0.0.0, i.e. the default route. */
2065 make_in4_sockaddr(&rt.rt_dst, any);
2066 make_in4_sockaddr(&rt.rt_gateway, router);
2067 make_in4_sockaddr(&rt.rt_genmask, any);
2068 rt.rt_flags = RTF_UP | RTF_GATEWAY;
2069 error = ioctl(af_inet_sock, SIOCADDRT, &rt) < 0 ? errno : 0;
2071 VLOG_WARN("ioctl(SIOCADDRT): %s", strerror(error));
/* Looks up the route to 'host' by scanning /proc/net/route.  For the first
 * route that is up and whose destination matches 'host' under the route's
 * mask, '*next_hop' is set to the gateway (or to 0 for the directly
 * reachable case) and '*netdev_name' to a copy of the interface name made
 * with xstrdup().  '*netdev_name' is set to NULL on entry.  Return values
 * are not visible in this view. */
2077 netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
2080 static const char fn[] = "/proc/net/route";
2085 *netdev_name = NULL;
2086 stream = fopen(fn, "r");
2087 if (stream == NULL) {
2088 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
2093 while (fgets(line, sizeof line, stream)) {
2096 ovs_be32 dest, gateway, mask;
2097 int refcnt, metric, mtu;
2098 unsigned int flags, use, window, irtt;
/* A route line must yield all 11 fields; otherwise it is reported as
 * unparseable. */
2101 "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
2103 iface, &dest, &gateway, &flags, &refcnt,
2104 &use, &metric, &mask, &mtu, &window, &irtt) != 11) {
2106 VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
2110 if (!(flags & RTF_UP)) {
2111 /* Skip routes that aren't up. */
2115 /* The output of 'dest', 'mask', and 'gateway' were given in
2116 * network byte order, so we don't need any endian
2117 * conversions here. */
2118 if ((dest & mask) == (host->s_addr & mask)) {
2120 /* The host is directly reachable. */
2121 next_hop->s_addr = 0;
2123 /* To reach the host, we must go through a gateway. */
2124 next_hop->s_addr = gateway;
/* Ownership of the duplicated name passes to the caller via 'netdev_name'. */
2126 *netdev_name = xstrdup(iface);
/* Queries driver information for 'netdev' with ETHTOOL_GDRVINFO and adds
 * three entries to 'sh': "driver_name", "driver_version" and
 * "firmware_version".  Each value is a copy made with xstrdup(). */
2138 netdev_linux_get_status(const struct netdev *netdev, struct shash *sh)
2140 struct ethtool_drvinfo drvinfo;
2143 memset(&drvinfo, 0, sizeof drvinfo);
/* netdev_linux_do_ethtool() takes a struct ethtool_cmd pointer, so the
 * drvinfo buffer is passed through a cast. */
2144 error = netdev_linux_do_ethtool(netdev_get_name(netdev),
2145 (struct ethtool_cmd *)&drvinfo,
2147 "ETHTOOL_GDRVINFO");
2149 shash_add(sh, "driver_name", xstrdup(drvinfo.driver));
2150 shash_add(sh, "driver_version", xstrdup(drvinfo.version));
2151 shash_add(sh, "firmware_version", xstrdup(drvinfo.fw_version));
2157 /* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
2158 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
2159 * returns 0. Otherwise, it returns a positive errno value; in particular,
2160 * ENXIO indicates that there is no ARP table entry for 'ip' on 'netdev'. */
2162 netdev_linux_arp_lookup(const struct netdev *netdev,
2163 ovs_be32 ip, uint8_t mac[ETH_ADDR_LEN])
2166 struct sockaddr_in sin;
/* Build the SIOCGARP request: protocol address 'ip', hardware type
 * Ethernet, restricted to the device named after 'netdev'. */
2169 memset(&r, 0, sizeof r);
2170 memset(&sin, 0, sizeof sin);
2171 sin.sin_family = AF_INET;
2172 sin.sin_addr.s_addr = ip;
2174 memcpy(&r.arp_pa, &sin, sizeof sin);
2175 r.arp_ha.sa_family = ARPHRD_ETHER;
2177 ovs_strzcpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
2178 COVERAGE_INC(netdev_arp_lookup);
2179 retval = ioctl(af_inet_sock, SIOCGARP, &r) < 0 ? errno : 0;
2181 memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
/* ENXIO is the expected "no entry" result, so it is not logged. */
2182 } else if (retval != ENXIO) {
2183 VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
2184 netdev_get_name(netdev), IP_ARGS(&ip), strerror(retval));
/* Translates 'nd', a set of NETDEV_* flags, for use with the interface-flag
 * helpers; netdev_linux_update_flags() combines the result with the value
 * read by get_flags().  NETDEV_UP and NETDEV_PROMISC are the bits examined;
 * the values they map to are not visible in this view (presumably IFF_UP
 * and IFF_PROMISC -- confirm). */
2190 nd_to_iff_flags(enum netdev_flags nd)
2193 if (nd & NETDEV_UP) {
2196 if (nd & NETDEV_PROMISC) {
/* Translates interface flags 'iff' into a set of NETDEV_* flags.
 * IFF_PROMISC maps to NETDEV_PROMISC; any other mappings are not visible in
 * this view. */
2203 iff_to_nd_flags(int iff)
2205 enum netdev_flags nd = 0;
2209 if (iff & IFF_PROMISC) {
2210 nd |= NETDEV_PROMISC;
/* Clears the flags in 'off' and sets the flags in 'on' on 'netdev'.  The
 * flags in effect before the change are stored in '*old_flagsp' as
 * NETDEV_* bits.  set_flags() is only called when the computed value
 * differs from the current one. */
2216 netdev_linux_update_flags(struct netdev *netdev, enum netdev_flags off,
2217 enum netdev_flags on, enum netdev_flags *old_flagsp)
2219 int old_flags, new_flags;
2222 error = get_flags(netdev_get_dev(netdev), &old_flags);
2224 *old_flagsp = iff_to_nd_flags(old_flags);
/* Because 'on' is OR-ed in after 'off' is masked out, 'on' wins for any bit
 * named in both. */
2225 new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
2226 if (new_flags != old_flags) {
2227 error = set_flags(netdev, new_flags);
/* Returns the 'change_seq' member of the netdev_dev_linux underlying
 * 'netdev'. */
2234 netdev_linux_change_seq(const struct netdev *netdev)
2236 return netdev_dev_linux_cast(netdev_get_dev(netdev))->change_seq;
/* Expands to an initializer for a netdev class whose callbacks are the
 * netdev_linux_* functions listed below.  The macro parameters (NAME,
 * CREATE, GET_STATS, SET_STATS) are the parts that differ between the
 * classes defined with it; the lines that use them are not visible in this
 * view.  The order of the entries matters because the initializer is
 * positional. */
2239 #define NETDEV_LINUX_CLASS(NAME, CREATE, GET_STATS, SET_STATS) \
2243 netdev_linux_init, \
2245 netdev_linux_wait, \
2248 netdev_linux_destroy, \
2249 NULL, /* get_config */ \
2250 NULL, /* set_config */ \
2252 netdev_linux_open, \
2253 netdev_linux_close, \
2255 netdev_linux_listen, \
2256 netdev_linux_recv, \
2257 netdev_linux_recv_wait, \
2258 netdev_linux_drain, \
2260 netdev_linux_send, \
2261 netdev_linux_send_wait, \
2263 netdev_linux_set_etheraddr, \
2264 netdev_linux_get_etheraddr, \
2265 netdev_linux_get_mtu, \
2266 netdev_linux_set_mtu, \
2267 netdev_linux_get_ifindex, \
2268 netdev_linux_get_carrier, \
2269 netdev_linux_get_carrier_resets, \
2270 netdev_linux_set_miimon_interval, \
2274 netdev_linux_get_features, \
2275 netdev_linux_set_advertisements, \
2277 netdev_linux_set_policing, \
2278 netdev_linux_get_qos_types, \
2279 netdev_linux_get_qos_capabilities, \
2280 netdev_linux_get_qos, \
2281 netdev_linux_set_qos, \
2282 netdev_linux_get_queue, \
2283 netdev_linux_set_queue, \
2284 netdev_linux_delete_queue, \
2285 netdev_linux_get_queue_stats, \
2286 netdev_linux_dump_queues, \
2287 netdev_linux_dump_queue_stats, \
2289 netdev_linux_get_in4, \
2290 netdev_linux_set_in4, \
2291 netdev_linux_get_in6, \
2292 netdev_linux_add_router, \
2293 netdev_linux_get_next_hop, \
2294 netdev_linux_get_status, \
2295 netdev_linux_arp_lookup, \
2297 netdev_linux_update_flags, \
2299 netdev_linux_change_seq \
/* The three netdev classes built from NETDEV_LINUX_CLASS.  They differ in
 * the create, get_stats and set_stats arguments visible below (the NAME
 * arguments are not visible in this view). */
/* Created with netdev_linux_create(); stats come from
 * netdev_linux_get_stats() and there is no set_stats. */
2302 const struct netdev_class netdev_linux_class =
2305 netdev_linux_create,
2306 netdev_linux_get_stats,
2307 NULL); /* set_stats */
/* Created with netdev_linux_create_tap(); stats come from
 * netdev_pseudo_get_stats() and there is no set_stats. */
2309 const struct netdev_class netdev_tap_class =
2312 netdev_linux_create_tap,
2313 netdev_pseudo_get_stats,
2314 NULL); /* set_stats */
/* Created with netdev_linux_create(); stats come from
 * netdev_pseudo_get_stats() and set_stats is netdev_vport_set_stats(). */
2316 const struct netdev_class netdev_internal_class =
2319 netdev_linux_create,
2320 netdev_pseudo_get_stats,
2321 netdev_vport_set_stats);
2323 /* HTB traffic control class. */
/* Upper bound on HTB queue numbers: htb_parse_tcmsg__() accepts class minor
 * numbers 1 through HTB_N_QUEUES and maps them to queue ids by subtracting
 * one. */
2325 #define HTB_N_QUEUES 0xf000
/* The struct headers are not visible in this view.  The first 'max_rate'
 * below presumably belongs to struct htb (the per-device state returned by
 * htb_get__()); the members after it, starting with 'tc_queue', are those
 * that htb_class_cast__() and the htb_parse_* functions access through
 * struct htb_class. */
2329 unsigned int max_rate; /* In bytes/s. */
2333 struct tc_queue tc_queue;
2334 unsigned int min_rate; /* In bytes/s. */
2335 unsigned int max_rate; /* In bytes/s. */
2336 unsigned int burst; /* In bytes. */
2337 unsigned int priority; /* Lower values are higher priorities. */
/* Returns the struct htb that embeds the 'tc' object currently attached to
 * 'netdev' (recovered with CONTAINER_OF on 'netdev_dev->tc').  Only
 * meaningful when that 'tc' really is the 'tc' member of a struct htb. */
2341 htb_get__(const struct netdev *netdev)
2343 struct netdev_dev_linux *netdev_dev =
2344 netdev_dev_linux_cast(netdev_get_dev(netdev));
2345 return CONTAINER_OF(netdev_dev->tc, struct htb, tc);
/* Allocates a struct htb, initializes its embedded 'tc' with 'tc_ops_htb',
 * records 'max_rate' in it, and attaches it to 'netdev' as the device's
 * traffic-control state.
 *
 * NOTE(review): 'max_rate' arrives as uint64_t, while the 'max_rate' member
 * it is stored in appears to be declared unsigned int; a value above
 * UINT_MAX would be truncated -- confirm against the struct definition. */
2349 htb_install__(struct netdev *netdev, uint64_t max_rate)
2351 struct netdev_dev_linux *netdev_dev =
2352 netdev_dev_linux_cast(netdev_get_dev(netdev));
2355 htb = xmalloc(sizeof *htb);
2356 tc_init(&htb->tc, &tc_ops_htb);
2357 htb->max_rate = max_rate;
/* From here on the device owns the new object through 'netdev_dev->tc'. */
2359 netdev_dev->tc = &htb->tc;
2362 /* Create an HTB qdisc.
2364 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
/* Returns the result of tc_transact() on the request built below. */
2366 htb_setup_qdisc__(struct netdev *netdev)
2369 struct tc_htb_glob opt;
2370 struct ofpbuf request;
2371 struct tcmsg *tcmsg;
/* Remove whatever qdisc is currently installed; the result is ignored. */
2373 tc_del_qdisc(netdev);
2375 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2376 NLM_F_EXCL | NLM_F_CREATE, &request);
/* Handle 1:0, attached at the root. */
2380 tcmsg->tcm_handle = tc_make_handle(1, 0);
2381 tcmsg->tcm_parent = TC_H_ROOT;
2383 nl_msg_put_string(&request, TCA_KIND, "htb");
2385 memset(&opt, 0, sizeof opt);
2386 opt.rate2quantum = 10;
/* The HTB global options go into a nested TCA_OPTIONS attribute. */
2390 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2391 nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
2392 nl_msg_end_nested(&request, opt_offset);
2394 return tc_transact(&request, NULL);
2397 /* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
2398 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
/* 'handle' and 'parent' are tc handles; 'class' supplies the rates, burst
 * and priority.  The device MTU is needed to fill in the rate tables, so a
 * failure to obtain it is logged with a rate-limited warning.  A failing
 * netlink transaction is logged with all class parameters. */
2400 htb_setup_class__(struct netdev *netdev, unsigned int handle,
2401 unsigned int parent, struct htb_class *class)
2404 struct tc_htb_opt opt;
2405 struct ofpbuf request;
2406 struct tcmsg *tcmsg;
2410 error = netdev_get_mtu(netdev, &mtu);
2412 VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU",
2413 netdev_get_name(netdev));
/* 'rate' is built from the class min_rate and 'ceil' from its max_rate;
 * both buffer sizes use the same class burst. */
2417 memset(&opt, 0, sizeof opt);
2418 tc_fill_rate(&opt.rate, class->min_rate, mtu);
2419 tc_fill_rate(&opt.ceil, class->max_rate, mtu);
2420 opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
2421 opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
2422 opt.prio = class->priority;
2424 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
2428 tcmsg->tcm_handle = handle;
2429 tcmsg->tcm_parent = parent;
/* Nested TCA_OPTIONS: the HTB parameters plus the rate and ceil tables. */
2431 nl_msg_put_string(&request, TCA_KIND, "htb");
2432 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2433 nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
2434 tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
2435 tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
2436 nl_msg_end_nested(&request, opt_offset);
2438 error = tc_transact(&request, NULL);
2440 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
2441 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
2442 netdev_get_name(netdev),
2443 tc_get_major(handle), tc_get_minor(handle),
2444 tc_get_major(parent), tc_get_minor(parent),
2445 class->min_rate, class->max_rate,
2446 class->burst, class->priority, strerror(error));
/* NOTE(review): the comment below says the result is stored into 'details',
 * but the function has no such parameter; the visible code fills in the
 * members of 'class' (min_rate, max_rate, burst, priority) from the
 * TCA_HTB_PARMS attribute, which must be at least as large as
 * struct tc_htb_opt.  A parse failure is logged with a rate-limited
 * warning. */
2451 /* Parses Netlink attributes in 'options' for HTB parameters and stores a
2452 * description of them into 'details'. The description complies with the
2453 * specification given in the vswitch database documentation for linux-htb
2456 htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
2458 static const struct nl_policy tca_htb_policy[] = {
2459 [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
2460 .min_len = sizeof(struct tc_htb_opt) },
2463 struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
2464 const struct tc_htb_opt *htb;
2466 if (!nl_parse_nested(nl_options, tca_htb_policy,
2467 attrs, ARRAY_SIZE(tca_htb_policy))) {
2468 VLOG_WARN_RL(&rl, "failed to parse HTB class options");
2472 htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
2473 class->min_rate = htb->rate.rate;
2474 class->max_rate = htb->ceil.rate;
2475 class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
2476 class->priority = htb->prio;
/* Parses the traffic-class message 'tcmsg' with tc_parse_class(), which is
 * also handed 'stats'.  If 'queue_id' is nonnull, the class handle is
 * translated to a queue id: a handle with major 1 and a minor in the range
 * 1..HTB_N_QUEUES yields minor - 1 (the handling of other handles is not
 * visible in this view).  If 'options' is nonnull, the HTB parameters are
 * parsed into it with htb_parse_tca_options__(). */
2481 htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2482 struct htb_class *options,
2483 struct netdev_queue_stats *stats)
2485 struct nlattr *nl_options;
2486 unsigned int handle;
2489 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2490 if (!error && queue_id) {
2491 unsigned int major = tc_get_major(handle);
2492 unsigned int minor = tc_get_minor(handle);
/* Class minors are 1-based, queue ids are 0-based. */
2493 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2494 *queue_id = minor - 1;
2499 if (!error && options) {
2500 error = htb_parse_tca_options__(nl_options, options);
/* Fills in 'hc' with the parameters for the HTB root class of 'netdev',
 * taken from 'details'.  The "max-rate" entry is parsed as a decimal number
 * and divided by 8 before it is stored in 'hc->max_rate' (documented as
 * bytes/s).  When the entry is missing or yields 0, the rate is derived
 * from the device's current features with netdev_features_to_bps(), again
 * divided by 8.  'hc->min_rate' is set equal to 'hc->max_rate'. */
2506 htb_parse_qdisc_details__(struct netdev *netdev,
2507 const struct shash *details, struct htb_class *hc)
2509 const char *max_rate_s;
2511 max_rate_s = shash_find_data(details, "max-rate");
2512 hc->max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
2513 if (!hc->max_rate) {
/* No usable "max-rate": fall back to the rate implied by the device's
 * current features.  Only 'current' is requested; the other three outputs
 * are passed as NULL. */
2516 netdev_get_features(netdev, &current, NULL, NULL, NULL);
2517 hc->max_rate = netdev_features_to_bps(current) / 8;
2519 hc->min_rate = hc->max_rate;
/* Fills in 'hc' with the parameters for one HTB class of 'netdev', taken
 * from the "min-rate", "max-rate", "burst" and "priority" entries of
 * 'details'.  The three size/rate values are parsed as decimal numbers and
 * divided by 8; a missing "min-rate", "burst" or "priority" entry counts
 * as 0 (the fallback for a missing "max-rate" is not visible in this
 * view).  The values are then clamped as follows:
 *
 *   - min_rate: at least the device MTU, at most the qdisc's max_rate.
 *   - max_rate: at least min_rate, at most the qdisc's max_rate.
 *   - burst: at least MTU + 64.
 *
 * A device whose MTU cannot be obtained is reported with a rate-limited
 * warning. */
2525 htb_parse_class_details__(struct netdev *netdev,
2526 const struct shash *details, struct htb_class *hc)
2528 const struct htb *htb = htb_get__(netdev);
2529 const char *min_rate_s = shash_find_data(details, "min-rate");
2530 const char *max_rate_s = shash_find_data(details, "max-rate");
2531 const char *burst_s = shash_find_data(details, "burst");
2532 const char *priority_s = shash_find_data(details, "priority");
2535 error = netdev_get_mtu(netdev, &mtu);
2537 VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU",
2538 netdev_get_name(netdev));
2542 /* HTB requires at least an mtu sized min-rate to send any traffic even
2543 * on uncongested links. */
2544 hc->min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
2545 hc->min_rate = MAX(hc->min_rate, mtu);
2546 hc->min_rate = MIN(hc->min_rate, htb->max_rate);
2549 hc->max_rate = (max_rate_s
2550 ? strtoull(max_rate_s, NULL, 10) / 8
2552 hc->max_rate = MAX(hc->max_rate, hc->min_rate);
2553 hc->max_rate = MIN(hc->max_rate, htb->max_rate);
2557 * According to hints in the documentation that I've read, it is important
2558 * that 'burst' be at least as big as the largest frame that might be
2559 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
2560 * but having it a bit too small is a problem. Since netdev_get_mtu()
2561 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
2562 * the MTU. We actually add 64, instead of 14, as a guard against
2563 * additional headers get tacked on somewhere that we're not aware of. */
2564 hc->burst = burst_s ? strtoull(burst_s, NULL, 10) / 8 : 0;
2565 hc->burst = MAX(hc->burst, mtu + 64);
2568 hc->priority = priority_s ? strtoul(priority_s, NULL, 10) : 0;
/* Asks the kernel about class 'handle' (child of 'parent') on 'netdev' and
 * parses the reply into 'options' and 'stats'.
 *
 * Returns 0 if successful, otherwise a positive errno value. */
static int
htb_query_class__(const struct netdev *netdev, unsigned int handle,
                  unsigned int parent, struct htb_class *options,
                  struct netdev_queue_stats *stats)
{
    struct ofpbuf *reply;
    int error = tc_query_class(netdev, handle, parent, &reply);

    if (error) {
        return error;
    }

    error = htb_parse_tcmsg__(reply, NULL, options, stats);
    ofpbuf_delete(reply);
    return error;
}
2590 htb_tc_install(struct netdev *netdev, const struct shash *details)
2594 error = htb_setup_qdisc__(netdev);
2596 struct htb_class hc;
2598 htb_parse_qdisc_details__(netdev, details, &hc);
2599 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2600 tc_make_handle(1, 0), &hc);
2602 htb_install__(netdev, hc.max_rate);
2608 static struct htb_class *
2609 htb_class_cast__(const struct tc_queue *queue)
2611 return CONTAINER_OF(queue, struct htb_class, tc_queue);
2615 htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
2616 const struct htb_class *hc)
2618 struct htb *htb = htb_get__(netdev);
2619 size_t hash = hash_int(queue_id, 0);
2620 struct tc_queue *queue;
2621 struct htb_class *hcp;
2623 queue = tc_find_queue__(netdev, queue_id, hash);
2625 hcp = htb_class_cast__(queue);
2627 hcp = xmalloc(sizeof *hcp);
2628 queue = &hcp->tc_queue;
2629 queue->queue_id = queue_id;
2630 hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
2633 hcp->min_rate = hc->min_rate;
2634 hcp->max_rate = hc->max_rate;
2635 hcp->burst = hc->burst;
2636 hcp->priority = hc->priority;
2640 htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
2643 struct nl_dump dump;
2644 struct htb_class hc;
2646 /* Get qdisc options. */
2648 htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
2649 htb_install__(netdev, hc.max_rate);
2652 if (!start_queue_dump(netdev, &dump)) {
2655 while (nl_dump_next(&dump, &msg)) {
2656 unsigned int queue_id;
2658 if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
2659 htb_update_queue__(netdev, queue_id, &hc);
2662 nl_dump_done(&dump);
2668 htb_tc_destroy(struct tc *tc)
2670 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
2671 struct htb_class *hc, *next;
2673 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &htb->tc.queues) {
2674 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
2682 htb_qdisc_get(const struct netdev *netdev, struct shash *details)
2684 const struct htb *htb = htb_get__(netdev);
2685 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * htb->max_rate));
2690 htb_qdisc_set(struct netdev *netdev, const struct shash *details)
2692 struct htb_class hc;
2695 htb_parse_qdisc_details__(netdev, details, &hc);
2696 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2697 tc_make_handle(1, 0), &hc);
2699 htb_get__(netdev)->max_rate = hc.max_rate;
2705 htb_class_get(const struct netdev *netdev OVS_UNUSED,
2706 const struct tc_queue *queue, struct shash *details)
2708 const struct htb_class *hc = htb_class_cast__(queue);
2710 shash_add(details, "min-rate", xasprintf("%llu", 8ULL * hc->min_rate));
2711 if (hc->min_rate != hc->max_rate) {
2712 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hc->max_rate));
2714 shash_add(details, "burst", xasprintf("%llu", 8ULL * hc->burst));
2716 shash_add(details, "priority", xasprintf("%u", hc->priority));
2722 htb_class_set(struct netdev *netdev, unsigned int queue_id,
2723 const struct shash *details)
2725 struct htb_class hc;
2728 error = htb_parse_class_details__(netdev, details, &hc);
2733 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
2734 tc_make_handle(1, 0xfffe), &hc);
2739 htb_update_queue__(netdev, queue_id, &hc);
2744 htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
2746 struct htb_class *hc = htb_class_cast__(queue);
2747 struct htb *htb = htb_get__(netdev);
2750 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
2752 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
2759 htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
2760 struct netdev_queue_stats *stats)
2762 return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
2763 tc_make_handle(1, 0xfffe), NULL, stats);
2767 htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
2768 const struct ofpbuf *nlmsg,
2769 netdev_dump_queue_stats_cb *cb, void *aux)
2771 struct netdev_queue_stats stats;
2772 unsigned int handle, major, minor;
2775 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
2780 major = tc_get_major(handle);
2781 minor = tc_get_minor(handle);
2782 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2783 (*cb)(minor - 1, &stats, aux);
2788 static const struct tc_ops tc_ops_htb = {
2789 "htb", /* linux_name */
2790 "linux-htb", /* ovs_name */
2791 HTB_N_QUEUES, /* n_queues */
2800 htb_class_get_stats,
2801 htb_class_dump_stats
2804 /* "linux-hfsc" traffic control class. */
2806 #define HFSC_N_QUEUES 0xf000
2814 struct tc_queue tc_queue;
2819 static struct hfsc *
2820 hfsc_get__(const struct netdev *netdev)
2822 struct netdev_dev_linux *netdev_dev;
2823 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
2824 return CONTAINER_OF(netdev_dev->tc, struct hfsc, tc);
2827 static struct hfsc_class *
2828 hfsc_class_cast__(const struct tc_queue *queue)
2830 return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
2834 hfsc_install__(struct netdev *netdev, uint32_t max_rate)
2836 struct netdev_dev_linux * netdev_dev;
2839 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
2840 hfsc = xmalloc(sizeof *hfsc);
2841 tc_init(&hfsc->tc, &tc_ops_hfsc);
2842 hfsc->max_rate = max_rate;
2843 netdev_dev->tc = &hfsc->tc;
2847 hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
2848 const struct hfsc_class *hc)
2852 struct hfsc_class *hcp;
2853 struct tc_queue *queue;
2855 hfsc = hfsc_get__(netdev);
2856 hash = hash_int(queue_id, 0);
2858 queue = tc_find_queue__(netdev, queue_id, hash);
2860 hcp = hfsc_class_cast__(queue);
2862 hcp = xmalloc(sizeof *hcp);
2863 queue = &hcp->tc_queue;
2864 queue->queue_id = queue_id;
2865 hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
2868 hcp->min_rate = hc->min_rate;
2869 hcp->max_rate = hc->max_rate;
2873 hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
2875 const struct tc_service_curve *rsc, *fsc, *usc;
2876 static const struct nl_policy tca_hfsc_policy[] = {
2878 .type = NL_A_UNSPEC,
2880 .min_len = sizeof(struct tc_service_curve),
2883 .type = NL_A_UNSPEC,
2885 .min_len = sizeof(struct tc_service_curve),
2888 .type = NL_A_UNSPEC,
2890 .min_len = sizeof(struct tc_service_curve),
2893 struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];
2895 if (!nl_parse_nested(nl_options, tca_hfsc_policy,
2896 attrs, ARRAY_SIZE(tca_hfsc_policy))) {
2897 VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
2901 rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
2902 fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
2903 usc = nl_attr_get(attrs[TCA_HFSC_USC]);
2905 if (rsc->m1 != 0 || rsc->d != 0 ||
2906 fsc->m1 != 0 || fsc->d != 0 ||
2907 usc->m1 != 0 || usc->d != 0) {
2908 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
2909 "Non-linear service curves are not supported.");
2913 if (rsc->m2 != fsc->m2) {
2914 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
2915 "Real-time service curves are not supported ");
2919 if (rsc->m2 > usc->m2) {
2920 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
2921 "Min-rate service curve is greater than "
2922 "the max-rate service curve.");
2926 class->min_rate = fsc->m2;
2927 class->max_rate = usc->m2;
2932 hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2933 struct hfsc_class *options,
2934 struct netdev_queue_stats *stats)
2937 unsigned int handle;
2938 struct nlattr *nl_options;
2940 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2946 unsigned int major, minor;
2948 major = tc_get_major(handle);
2949 minor = tc_get_minor(handle);
2950 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
2951 *queue_id = minor - 1;
2958 error = hfsc_parse_tca_options__(nl_options, options);
/* Asks the kernel about class 'handle' (child of 'parent') on 'netdev' and
 * parses the reply into 'options' and 'stats'.
 *
 * Returns 0 if successful, otherwise a positive errno value. */
static int
hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
                   unsigned int parent, struct hfsc_class *options,
                   struct netdev_queue_stats *stats)
{
    struct ofpbuf *reply;
    int error = tc_query_class(netdev, handle, parent, &reply);

    if (!error) {
        error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
        ofpbuf_delete(reply);
    }
    return error;
}
2983 hfsc_parse_qdisc_details__(struct netdev *netdev, const struct shash *details,
2984 struct hfsc_class *class)
2987 const char *max_rate_s;
2989 max_rate_s = shash_find_data(details, "max-rate");
2990 max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
2995 netdev_get_features(netdev, ¤t, NULL, NULL, NULL);
2996 max_rate = netdev_features_to_bps(current) / 8;
2999 class->min_rate = max_rate;
3000 class->max_rate = max_rate;
3004 hfsc_parse_class_details__(struct netdev *netdev,
3005 const struct shash *details,
3006 struct hfsc_class * class)
3008 const struct hfsc *hfsc;
3009 uint32_t min_rate, max_rate;
3010 const char *min_rate_s, *max_rate_s;
3012 hfsc = hfsc_get__(netdev);
3013 min_rate_s = shash_find_data(details, "min-rate");
3014 max_rate_s = shash_find_data(details, "max-rate");
3016 min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
3017 min_rate = MAX(min_rate, 1);
3018 min_rate = MIN(min_rate, hfsc->max_rate);
3020 max_rate = (max_rate_s
3021 ? strtoull(max_rate_s, NULL, 10) / 8
3023 max_rate = MAX(max_rate, min_rate);
3024 max_rate = MIN(max_rate, hfsc->max_rate);
3026 class->min_rate = min_rate;
3027 class->max_rate = max_rate;
3032 /* Create an HFSC qdisc.
3034 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
3036 hfsc_setup_qdisc__(struct netdev * netdev)
3038 struct tcmsg *tcmsg;
3039 struct ofpbuf request;
3040 struct tc_hfsc_qopt opt;
3042 tc_del_qdisc(netdev);
3044 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3045 NLM_F_EXCL | NLM_F_CREATE, &request);
3051 tcmsg->tcm_handle = tc_make_handle(1, 0);
3052 tcmsg->tcm_parent = TC_H_ROOT;
3054 memset(&opt, 0, sizeof opt);
3057 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3058 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
3060 return tc_transact(&request, NULL);
3063 /* Create an HFSC class.
3065 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
3066 * sc rate <min_rate> ul rate <max_rate>" */
3068 hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
3069 unsigned int parent, struct hfsc_class *class)
3073 struct tcmsg *tcmsg;
3074 struct ofpbuf request;
3075 struct tc_service_curve min, max;
3077 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
3083 tcmsg->tcm_handle = handle;
3084 tcmsg->tcm_parent = parent;
3088 min.m2 = class->min_rate;
3092 max.m2 = class->max_rate;
3094 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3095 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3096 nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
3097 nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
3098 nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
3099 nl_msg_end_nested(&request, opt_offset);
3101 error = tc_transact(&request, NULL);
3103 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
3104 "min-rate %ubps, max-rate %ubps (%s)",
3105 netdev_get_name(netdev),
3106 tc_get_major(handle), tc_get_minor(handle),
3107 tc_get_major(parent), tc_get_minor(parent),
3108 class->min_rate, class->max_rate, strerror(error));
3115 hfsc_tc_install(struct netdev *netdev, const struct shash *details)
3118 struct hfsc_class class;
3120 error = hfsc_setup_qdisc__(netdev);
3126 hfsc_parse_qdisc_details__(netdev, details, &class);
3127 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3128 tc_make_handle(1, 0), &class);
3134 hfsc_install__(netdev, class.max_rate);
3139 hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3142 struct nl_dump dump;
3143 struct hfsc_class hc;
3146 hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
3147 hfsc_install__(netdev, hc.max_rate);
3149 if (!start_queue_dump(netdev, &dump)) {
3153 while (nl_dump_next(&dump, &msg)) {
3154 unsigned int queue_id;
3156 if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3157 hfsc_update_queue__(netdev, queue_id, &hc);
3161 nl_dump_done(&dump);
3166 hfsc_tc_destroy(struct tc *tc)
3169 struct hfsc_class *hc, *next;
3171 hfsc = CONTAINER_OF(tc, struct hfsc, tc);
3173 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
3174 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
3183 hfsc_qdisc_get(const struct netdev *netdev, struct shash *details)
3185 const struct hfsc *hfsc;
3186 hfsc = hfsc_get__(netdev);
3187 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hfsc->max_rate));
3192 hfsc_qdisc_set(struct netdev *netdev, const struct shash *details)
3195 struct hfsc_class class;
3197 hfsc_parse_qdisc_details__(netdev, details, &class);
3198 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3199 tc_make_handle(1, 0), &class);
3202 hfsc_get__(netdev)->max_rate = class.max_rate;
3209 hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
3210 const struct tc_queue *queue, struct shash *details)
3212 const struct hfsc_class *hc;
3214 hc = hfsc_class_cast__(queue);
3215 shash_add(details, "min-rate", xasprintf("%llu", 8ULL * hc->min_rate));
3216 if (hc->min_rate != hc->max_rate) {
3217 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hc->max_rate));
3223 hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
3224 const struct shash *details)
3227 struct hfsc_class class;
3229 error = hfsc_parse_class_details__(netdev, details, &class);
3234 error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3235 tc_make_handle(1, 0xfffe), &class);
3240 hfsc_update_queue__(netdev, queue_id, &class);
3245 hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
3249 struct hfsc_class *hc;
3251 hc = hfsc_class_cast__(queue);
3252 hfsc = hfsc_get__(netdev);
3254 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3256 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
3263 hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3264 struct netdev_queue_stats *stats)
3266 return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3267 tc_make_handle(1, 0xfffe), NULL, stats);
3271 hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3272 const struct ofpbuf *nlmsg,
3273 netdev_dump_queue_stats_cb *cb, void *aux)
3275 struct netdev_queue_stats stats;
3276 unsigned int handle, major, minor;
3279 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3284 major = tc_get_major(handle);
3285 minor = tc_get_minor(handle);
3286 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3287 (*cb)(minor - 1, &stats, aux);
3292 static const struct tc_ops tc_ops_hfsc = {
3293 "hfsc", /* linux_name */
3294 "linux-hfsc", /* ovs_name */
3295 HFSC_N_QUEUES, /* n_queues */
3296 hfsc_tc_install, /* tc_install */
3297 hfsc_tc_load, /* tc_load */
3298 hfsc_tc_destroy, /* tc_destroy */
3299 hfsc_qdisc_get, /* qdisc_get */
3300 hfsc_qdisc_set, /* qdisc_set */
3301 hfsc_class_get, /* class_get */
3302 hfsc_class_set, /* class_set */
3303 hfsc_class_delete, /* class_delete */
3304 hfsc_class_get_stats, /* class_get_stats */
3305 hfsc_class_dump_stats /* class_dump_stats */
3308 /* "linux-default" traffic control class.
3310 * This class represents the default, unnamed Linux qdisc. It corresponds to
3311 * the "" (empty string) QoS type in the OVS database. */
3314 default_install__(struct netdev *netdev)
3316 struct netdev_dev_linux *netdev_dev =
3317 netdev_dev_linux_cast(netdev_get_dev(netdev));
3318 static struct tc *tc;
3321 tc = xmalloc(sizeof *tc);
3322 tc_init(tc, &tc_ops_default);
3324 netdev_dev->tc = tc;
3328 default_tc_install(struct netdev *netdev,
3329 const struct shash *details OVS_UNUSED)
3331 default_install__(netdev);
3336 default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3338 default_install__(netdev);
3342 static const struct tc_ops tc_ops_default = {
3343 NULL, /* linux_name */
3348 NULL, /* tc_destroy */
3349 NULL, /* qdisc_get */
3350 NULL, /* qdisc_set */
3351 NULL, /* class_get */
3352 NULL, /* class_set */
3353 NULL, /* class_delete */
3354 NULL, /* class_get_stats */
3355 NULL /* class_dump_stats */
3358 /* "linux-other" traffic control class.
3363 other_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3365 struct netdev_dev_linux *netdev_dev =
3366 netdev_dev_linux_cast(netdev_get_dev(netdev));
3367 static struct tc *tc;
3370 tc = xmalloc(sizeof *tc);
3371 tc_init(tc, &tc_ops_other);
3373 netdev_dev->tc = tc;
3377 static const struct tc_ops tc_ops_other = {
3378 NULL, /* linux_name */
3379 "linux-other", /* ovs_name */
3381 NULL, /* tc_install */
3383 NULL, /* tc_destroy */
3384 NULL, /* qdisc_get */
3385 NULL, /* qdisc_set */
3386 NULL, /* class_get */
3387 NULL, /* class_set */
3388 NULL, /* class_delete */
3389 NULL, /* class_get_stats */
3390 NULL /* class_dump_stats */
/* Traffic control. */

/* Number of kernel "tc" ticks per second. */
static double ticks_per_s;

/* Number of kernel "jiffies" per second.  This is used for the purpose of
 * computing buffer sizes.  Generally kernel qdiscs need to be able to buffer
 * one jiffy's worth of data.
 *
 * There are two possibilities here:
 *
 *    - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
 *      approximate range of 100 to 1024.  That means that we really need to
 *      make sure that the qdisc can buffer that much data.
 *
 *    - 'buffer_hz' is an absurdly large number.  That means that the kernel
 *      has finely granular timers and there's no need to fudge additional room
 *      for buffers.  (There's no extra effort needed to implement that: the
 *      large 'buffer_hz' is used as a divisor, so practically any number will
 *      come out as 0 in the division.  Small integer results in the case of
 *      really high dividends won't have any real effect anyhow.)
 */
static unsigned int buffer_hz;
/* Returns tc handle 'major':'minor', that is, 'major' in the upper 16 bits
 * and 'minor' in the lower 16 bits. */
static unsigned int
tc_make_handle(unsigned int major, unsigned int minor)
{
    const unsigned int major_bits = major << 16;

    return TC_H_MAKE(major_bits, minor);
}
/* Returns the major number (the upper 16 bits) of 'handle'. */
static unsigned int
tc_get_major(unsigned int handle)
{
    const unsigned int major_bits = TC_H_MAJ(handle);

    return major_bits >> 16;
}
/* Returns the minor number (the lower 16 bits) of 'handle'. */
static unsigned int
tc_get_minor(unsigned int handle)
{
    const unsigned int minor = TC_H_MIN(handle);

    return minor;
}
3438 static struct tcmsg *
3439 tc_make_request(const struct netdev *netdev, int type, unsigned int flags,
3440 struct ofpbuf *request)
3442 struct tcmsg *tcmsg;
3446 error = get_ifindex(netdev, &ifindex);
3451 ofpbuf_init(request, 512);
3452 nl_msg_put_nlmsghdr(request, sizeof *tcmsg, type, NLM_F_REQUEST | flags);
3453 tcmsg = ofpbuf_put_zeros(request, sizeof *tcmsg);
3454 tcmsg->tcm_family = AF_UNSPEC;
3455 tcmsg->tcm_ifindex = ifindex;
3456 /* Caller should fill in tcmsg->tcm_handle. */
3457 /* Caller should fill in tcmsg->tcm_parent. */
3463 tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
3465 int error = nl_sock_transact(rtnl_sock, request, replyp);
3466 ofpbuf_uninit(request);
3470 /* Adds or deletes a root ingress qdisc on 'netdev'. We use this for
3471 * policing configuration.
3473 * This function is equivalent to running the following when 'add' is true:
3474 * /sbin/tc qdisc add dev <devname> handle ffff: ingress
3476 * This function is equivalent to running the following when 'add' is false:
3477 * /sbin/tc qdisc del dev <devname> handle ffff: ingress
3479 * The configuration and stats may be seen with the following command:
3480 * /sbin/tc -s qdisc show dev <devname>
3482 * Returns 0 if successful, otherwise a positive errno value.
3485 tc_add_del_ingress_qdisc(struct netdev *netdev, bool add)
3487 struct ofpbuf request;
3488 struct tcmsg *tcmsg;
3490 int type = add ? RTM_NEWQDISC : RTM_DELQDISC;
3491 int flags = add ? NLM_F_EXCL | NLM_F_CREATE : 0;
3493 tcmsg = tc_make_request(netdev, type, flags, &request);
3497 tcmsg->tcm_handle = tc_make_handle(0xffff, 0);
3498 tcmsg->tcm_parent = TC_H_INGRESS;
3499 nl_msg_put_string(&request, TCA_KIND, "ingress");
3500 nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0);
3502 error = tc_transact(&request, NULL);
3504 /* If we're deleting the qdisc, don't worry about some of the
3505 * error conditions. */
3506 if (!add && (error == ENOENT || error == EINVAL)) {
3515 /* Adds a policer to 'netdev' with a rate of 'kbits_rate' and a burst size
3518 * This function is equivalent to running:
3519 * /sbin/tc filter add dev <devname> parent ffff: protocol all prio 49
3520 * basic police rate <kbits_rate>kbit burst <kbits_burst>k
3523 * The configuration and stats may be seen with the following command:
3524 * /sbin/tc -s filter show <devname> eth0 parent ffff:
3526 * Returns 0 if successful, otherwise a positive errno value.
3529 tc_add_policer(struct netdev *netdev, int kbits_rate, int kbits_burst)
3531 struct tc_police tc_police;
3532 struct ofpbuf request;
3533 struct tcmsg *tcmsg;
3534 size_t basic_offset;
3535 size_t police_offset;
3539 memset(&tc_police, 0, sizeof tc_police);
3540 tc_police.action = TC_POLICE_SHOT;
3541 tc_police.mtu = mtu;
3542 tc_fill_rate(&tc_police.rate, kbits_rate/8 * 1000, mtu);
3543 tc_police.burst = tc_bytes_to_ticks(tc_police.rate.rate,
3544 kbits_burst * 1024);
3546 tcmsg = tc_make_request(netdev, RTM_NEWTFILTER,
3547 NLM_F_EXCL | NLM_F_CREATE, &request);
3551 tcmsg->tcm_parent = tc_make_handle(0xffff, 0);
3552 tcmsg->tcm_info = tc_make_handle(49,
3553 (OVS_FORCE uint16_t) htons(ETH_P_ALL));
3555 nl_msg_put_string(&request, TCA_KIND, "basic");
3556 basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3557 police_offset = nl_msg_start_nested(&request, TCA_BASIC_POLICE);
3558 nl_msg_put_unspec(&request, TCA_POLICE_TBF, &tc_police, sizeof tc_police);
3559 tc_put_rtab(&request, TCA_POLICE_RATE, &tc_police.rate);
3560 nl_msg_end_nested(&request, police_offset);
3561 nl_msg_end_nested(&request, basic_offset);
3563 error = tc_transact(&request, NULL);
3574 /* The values in psched are not individually very meaningful, but they are
3575 * important. The tables below show some values seen in the wild.
3579 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
3580 * (Before that, there are hints that it was 1000000000.)
3582 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
3586 * -----------------------------------
3587 * [1] 000c8000 000f4240 000f4240 00000064
3588 * [2] 000003e8 00000400 000f4240 3b9aca00
3589 * [3] 000003e8 00000400 000f4240 3b9aca00
3590 * [4] 000003e8 00000400 000f4240 00000064
3591 * [5] 000003e8 00000040 000f4240 3b9aca00
3592 * [6] 000003e8 00000040 000f4240 000000f9
3594 * a b c d ticks_per_s buffer_hz
3595 * ------- --------- ---------- ------------- ----------- -------------
3596 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
3597 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3598 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3599 * [4] 1,000 1,024 1,000,000 100 976,562 100
3600 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
3601 * [6] 1,000 64 1,000,000 249 15,625,000 249
3603 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
3604 * [2] 2.6.26-1-686-bigmem from Debian lenny
3605 * [3] 2.6.26-2-sparc64 from Debian lenny
3606 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
3607 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
3608 * [6] 2.6.34 from kernel.org on KVM
3610 static const char fn[] = "/proc/net/psched";
3611 unsigned int a, b, c, d;
3617 stream = fopen(fn, "r");
3619 VLOG_WARN("%s: open failed: %s", fn, strerror(errno));
3623 if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
3624 VLOG_WARN("%s: read failed", fn);
3628 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
3632 VLOG_WARN("%s: invalid scheduler parameters", fn);
3636 ticks_per_s = (double) a * c / b;
3640 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
3643 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);
3646 /* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
3647 * rate of 'rate' bytes per second. */
3649 tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
3654 return (rate * ticks) / ticks_per_s;
3657 /* Returns the number of ticks that it would take to transmit 'size' bytes at a
3658 * rate of 'rate' bytes per second. */
3660 tc_bytes_to_ticks(unsigned int rate, unsigned int size)
3665 return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
3668 /* Returns the number of bytes that need to be reserved for qdisc buffering at
3669 * a transmission rate of 'rate' bytes per second. */
3671 tc_buffer_per_jiffy(unsigned int rate)
3676 return rate / buffer_hz;
3679 /* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
3680 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
3681 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
3682 * stores NULL into it if it is absent.
3684 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
3687 * Returns 0 if successful, otherwise a positive errno value. */
3689 tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
3690 struct nlattr **options)
3692 static const struct nl_policy tca_policy[] = {
3693 [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
3694 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
3696 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3698 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3699 tca_policy, ta, ARRAY_SIZE(ta))) {
3700 VLOG_WARN_RL(&rl, "failed to parse qdisc message");
3705 *kind = nl_attr_get_string(ta[TCA_KIND]);
3709 *options = ta[TCA_OPTIONS];
3724 /* Given Netlink 'msg' that describes a class, extracts the queue ID (e.g. the
3725 * minor number of its class ID) into '*queue_id', its TCA_OPTIONS attribute
3726 * into '*options', and its queue statistics into '*stats'. Any of the output
3727 * arguments may be null.
3729 * Returns 0 if successful, otherwise a positive errno value. */
3731 tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
3732 struct nlattr **options, struct netdev_queue_stats *stats)
3734 static const struct nl_policy tca_policy[] = {
3735 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
3736 [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
3738 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3740 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3741 tca_policy, ta, ARRAY_SIZE(ta))) {
3742 VLOG_WARN_RL(&rl, "failed to parse class message");
3747 struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
3748 *handlep = tc->tcm_handle;
3752 *options = ta[TCA_OPTIONS];
3756 const struct gnet_stats_queue *gsq;
3757 struct gnet_stats_basic gsb;
3759 static const struct nl_policy stats_policy[] = {
3760 [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
3761 .min_len = sizeof gsb },
3762 [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
3763 .min_len = sizeof *gsq },
3765 struct nlattr *sa[ARRAY_SIZE(stats_policy)];
3767 if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
3768 sa, ARRAY_SIZE(sa))) {
3769 VLOG_WARN_RL(&rl, "failed to parse class stats");
3773 /* Alignment issues screw up the length of struct gnet_stats_basic on
3774 * some arch/bitsize combinations. Newer versions of Linux have a
3775 * struct gnet_stats_basic_packed, but we can't depend on that. The
3776 * easiest thing to do is just to make a copy. */
3777 memset(&gsb, 0, sizeof gsb);
3778 memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
3779 MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
3780 stats->tx_bytes = gsb.bytes;
3781 stats->tx_packets = gsb.packets;
3783 gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
3784 stats->tx_errors = gsq->drops;
3794 memset(stats, 0, sizeof *stats);
3799 /* Queries the kernel for class with identifier 'handle' and parent 'parent'
3802 tc_query_class(const struct netdev *netdev,
3803 unsigned int handle, unsigned int parent,
3804 struct ofpbuf **replyp)
3806 struct ofpbuf request;
3807 struct tcmsg *tcmsg;
3810 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO, &request);
3814 tcmsg->tcm_handle = handle;
3815 tcmsg->tcm_parent = parent;
3817 error = tc_transact(&request, replyp);
3819 VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
3820 netdev_get_name(netdev),
3821 tc_get_major(handle), tc_get_minor(handle),
3822 tc_get_major(parent), tc_get_minor(parent),
3828 /* Equivalent to "tc class del dev <name> handle <handle>". */
3830 tc_delete_class(const struct netdev *netdev, unsigned int handle)
3832 struct ofpbuf request;
3833 struct tcmsg *tcmsg;
3836 tcmsg = tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
3840 tcmsg->tcm_handle = handle;
3841 tcmsg->tcm_parent = 0;
3843 error = tc_transact(&request, NULL);
3845 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
3846 netdev_get_name(netdev),
3847 tc_get_major(handle), tc_get_minor(handle),
3853 /* Equivalent to "tc qdisc del dev <name> root". */
3855 tc_del_qdisc(struct netdev *netdev)
3857 struct netdev_dev_linux *netdev_dev =
3858 netdev_dev_linux_cast(netdev_get_dev(netdev));
3859 struct ofpbuf request;
3860 struct tcmsg *tcmsg;
3863 tcmsg = tc_make_request(netdev, RTM_DELQDISC, 0, &request);
3867 tcmsg->tcm_handle = tc_make_handle(1, 0);
3868 tcmsg->tcm_parent = TC_H_ROOT;
3870 error = tc_transact(&request, NULL);
3871 if (error == EINVAL) {
3872 /* EINVAL probably means that the default qdisc was in use, in which
3873 * case we've accomplished our purpose. */
3876 if (!error && netdev_dev->tc) {
3877 if (netdev_dev->tc->ops->tc_destroy) {
3878 netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
3880 netdev_dev->tc = NULL;
3885 /* If 'netdev''s qdisc type and parameters are not yet known, queries the
3886 * kernel to determine what they are. Returns 0 if successful, otherwise a
3887 * positive errno value. */
3889 tc_query_qdisc(const struct netdev *netdev)
3891 struct netdev_dev_linux *netdev_dev =
3892 netdev_dev_linux_cast(netdev_get_dev(netdev));
3893 struct ofpbuf request, *qdisc;
3894 const struct tc_ops *ops;
3895 struct tcmsg *tcmsg;
3899 if (netdev_dev->tc) {
3903 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
3904 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
3905 * 2.6.35 without that fix backported to it.
3907 * To avoid the OOPS, we must not make a request that would attempt to dump
3908 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
3909 * few others. There are a few ways that I can see to do this, but most of
3910 * them seem to be racy (and if you lose the race the kernel OOPSes). The
3911 * technique chosen here is to assume that any non-default qdisc that we
3912 * create will have a class with handle 1:0. The built-in qdiscs only have
3913 * a class with handle 0:0.
3915 * We could check for Linux 2.6.35+ and use a more straightforward method
3917 tcmsg = tc_make_request(netdev, RTM_GETQDISC, NLM_F_ECHO, &request);
3921 tcmsg->tcm_handle = tc_make_handle(1, 0);
3922 tcmsg->tcm_parent = 0;
3924 /* Figure out what tc class to instantiate. */
3925 error = tc_transact(&request, &qdisc);
3929 error = tc_parse_qdisc(qdisc, &kind, NULL);
3931 ops = &tc_ops_other;
3933 ops = tc_lookup_linux_name(kind);
3935 static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
3936 VLOG_INFO_RL(&rl2, "unknown qdisc \"%s\"", kind);
3938 ops = &tc_ops_other;
3941 } else if (error == ENOENT) {
3942 /* Either it's a built-in qdisc, or it's a qdisc set up by some
3943 * other entity that doesn't have a handle 1:0. We will assume
3944 * that it's the system default qdisc. */
3945 ops = &tc_ops_default;
3948 /* Who knows? Maybe the device got deleted. */
3949 VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
3950 netdev_get_name(netdev), strerror(error));
3951 ops = &tc_ops_other;
3954 /* Instantiate it. */
3955 load_error = ops->tc_load((struct netdev *) netdev, qdisc);
3956 assert((load_error == 0) == (netdev_dev->tc != NULL));
3957 ofpbuf_delete(qdisc);
3959 return error ? error : load_error;
3962 /* Linux traffic control uses tables with 256 entries ("rtab" tables) to
3963 approximate the time to transmit packets of various lengths. For an MTU of
3964 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
3965 represents two possible packet lengths; for a MTU of 513 through 1024, four
3966 possible lengths; and so on.
3968 Returns, for the specified 'mtu', the number of bits that packet lengths
3969 need to be shifted right to fit within such a 256-entry table. */
3971 tc_calc_cell_log(unsigned int mtu)
3976 mtu = ETH_PAYLOAD_MAX;
3978 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
3980 for (cell_log = 0; mtu >= 256; cell_log++) {
3987 /* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
3990 tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
3992 memset(rate, 0, sizeof *rate);
3993 rate->cell_log = tc_calc_cell_log(mtu);
3994 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
3995 /* rate->cell_align = 0; */ /* distro headers. */
3996 rate->mpu = ETH_TOTAL_MIN;
4000 /* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
4001 * attribute of the specified "type".
4003 * See tc_calc_cell_log() above for a description of "rtab"s. */
4005 tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
4010 rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
4011 for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
4012 unsigned packet_size = (i + 1) << rate->cell_log;
4013 if (packet_size < rate->mpu) {
4014 packet_size = rate->mpu;
4016 rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
/* Computes the value to use for 'buffer' or 'cbuffer' in HTB options, given a
 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
 * burst size of 'burst_bytes'.  (If no value was requested, a 'burst_bytes' of
 * 0 is fine.)
 *
 * The burst actually used is never smaller than tc_buffer_per_jiffy(Bps) plus
 * 'mtu'.  The result is that burst converted by tc_bytes_to_ticks(). */
static int
tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
{
    const unsigned int floor_bytes = tc_buffer_per_jiffy(Bps) + mtu;
    const uint64_t burst = (burst_bytes > floor_bytes
                            ? burst_bytes
                            : floor_bytes);

    return tc_bytes_to_ticks(Bps, burst);
}
4031 /* Linux-only functions declared in netdev-linux.h */
4033 /* Returns a fd for an AF_INET socket or a negative errno value. */
4035 netdev_linux_get_af_inet_sock(void)
4037 int error = netdev_linux_init();
4038 return error ? -error : af_inet_sock;
4041 /* Modifies the 'flag' bit in ethtool's flags field for 'netdev'. If
4042 * 'enable' is true, the bit is set. Otherwise, it is cleared. */
4044 netdev_linux_ethtool_set_flag(struct netdev *netdev, uint32_t flag,
4045 const char *flag_name, bool enable)
4047 const char *netdev_name = netdev_get_name(netdev);
4048 struct ethtool_value evalue;
4052 memset(&evalue, 0, sizeof evalue);
4053 error = netdev_linux_do_ethtool(netdev_name,
4054 (struct ethtool_cmd *)&evalue,
4055 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
4060 evalue.data = new_flags = (evalue.data & ~flag) | (enable ? flag : 0);
4061 error = netdev_linux_do_ethtool(netdev_name,
4062 (struct ethtool_cmd *)&evalue,
4063 ETHTOOL_SFLAGS, "ETHTOOL_SFLAGS");
4068 memset(&evalue, 0, sizeof evalue);
4069 error = netdev_linux_do_ethtool(netdev_name,
4070 (struct ethtool_cmd *)&evalue,
4071 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
4076 if (new_flags != evalue.data) {
4077 VLOG_WARN_RL(&rl, "attempt to %s ethtool %s flag on network "
4078 "device %s failed", enable ? "enable" : "disable",
4079 flag_name, netdev_name);
4086 /* Utility functions. */
4088 /* Copies 'src' into 'dst', performing format conversion in the process. */
4090 netdev_stats_from_rtnl_link_stats(struct netdev_stats *dst,
4091 const struct rtnl_link_stats *src)
4093 dst->rx_packets = src->rx_packets;
4094 dst->tx_packets = src->tx_packets;
4095 dst->rx_bytes = src->rx_bytes;
4096 dst->tx_bytes = src->tx_bytes;
4097 dst->rx_errors = src->rx_errors;
4098 dst->tx_errors = src->tx_errors;
4099 dst->rx_dropped = src->rx_dropped;
4100 dst->tx_dropped = src->tx_dropped;
4101 dst->multicast = src->multicast;
4102 dst->collisions = src->collisions;
4103 dst->rx_length_errors = src->rx_length_errors;
4104 dst->rx_over_errors = src->rx_over_errors;
4105 dst->rx_crc_errors = src->rx_crc_errors;
4106 dst->rx_frame_errors = src->rx_frame_errors;
4107 dst->rx_fifo_errors = src->rx_fifo_errors;
4108 dst->rx_missed_errors = src->rx_missed_errors;
4109 dst->tx_aborted_errors = src->tx_aborted_errors;
4110 dst->tx_carrier_errors = src->tx_carrier_errors;
4111 dst->tx_fifo_errors = src->tx_fifo_errors;
4112 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
4113 dst->tx_window_errors = src->tx_window_errors;
4117 get_stats_via_netlink(int ifindex, struct netdev_stats *stats)
4119 /* Policy for RTNLGRP_LINK messages.
4121 * There are *many* more fields in these messages, but currently we only
4122 * care about these fields. */
4123 static const struct nl_policy rtnlgrp_link_policy[] = {
4124 [IFLA_IFNAME] = { .type = NL_A_STRING, .optional = false },
4125 [IFLA_STATS] = { .type = NL_A_UNSPEC, .optional = true,
4126 .min_len = sizeof(struct rtnl_link_stats) },
4129 struct ofpbuf request;
4130 struct ofpbuf *reply;
4131 struct ifinfomsg *ifi;
4132 struct nlattr *attrs[ARRAY_SIZE(rtnlgrp_link_policy)];
4135 ofpbuf_init(&request, 0);
4136 nl_msg_put_nlmsghdr(&request, sizeof *ifi, RTM_GETLINK, NLM_F_REQUEST);
4137 ifi = ofpbuf_put_zeros(&request, sizeof *ifi);
4138 ifi->ifi_family = PF_UNSPEC;
4139 ifi->ifi_index = ifindex;
4140 error = nl_sock_transact(rtnl_sock, &request, &reply);
4141 ofpbuf_uninit(&request);
4146 if (!nl_policy_parse(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg),
4147 rtnlgrp_link_policy,
4148 attrs, ARRAY_SIZE(rtnlgrp_link_policy))) {
4149 ofpbuf_delete(reply);
4153 if (!attrs[IFLA_STATS]) {
4154 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
4155 ofpbuf_delete(reply);
4159 netdev_stats_from_rtnl_link_stats(stats, nl_attr_get(attrs[IFLA_STATS]));
4161 ofpbuf_delete(reply);
4167 get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats)
4169 static const char fn[] = "/proc/net/dev";
4174 stream = fopen(fn, "r");
4176 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
4181 while (fgets(line, sizeof line, stream)) {
4184 #define X64 "%"SCNu64
4187 X64 X64 X64 X64 X64 X64 X64 "%*u"
4188 X64 X64 X64 X64 X64 X64 X64 "%*u",
4194 &stats->rx_fifo_errors,
4195 &stats->rx_frame_errors,
4201 &stats->tx_fifo_errors,
4203 &stats->tx_carrier_errors) != 15) {
4204 VLOG_WARN_RL(&rl, "%s:%d: parse error", fn, ln);
4205 } else if (!strcmp(devname, netdev_name)) {
4206 stats->rx_length_errors = UINT64_MAX;
4207 stats->rx_over_errors = UINT64_MAX;
4208 stats->rx_crc_errors = UINT64_MAX;
4209 stats->rx_missed_errors = UINT64_MAX;
4210 stats->tx_aborted_errors = UINT64_MAX;
4211 stats->tx_heartbeat_errors = UINT64_MAX;
4212 stats->tx_window_errors = UINT64_MAX;
4218 VLOG_WARN_RL(&rl, "%s: no stats for %s", fn, netdev_name);
4224 get_flags(const struct netdev_dev *dev, int *flags)
4230 error = netdev_linux_do_ioctl(dev->name, &ifr, SIOCGIFFLAGS,
4233 *flags = ifr.ifr_flags;
4239 set_flags(struct netdev *netdev, int flags)
4243 ifr.ifr_flags = flags;
4244 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, SIOCSIFFLAGS,
4249 do_get_ifindex(const char *netdev_name)
4253 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4254 COVERAGE_INC(netdev_get_ifindex);
4255 if (ioctl(af_inet_sock, SIOCGIFINDEX, &ifr) < 0) {
4256 VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
4257 netdev_name, strerror(errno));
4260 return ifr.ifr_ifindex;
4264 get_ifindex(const struct netdev *netdev_, int *ifindexp)
4266 struct netdev_dev_linux *netdev_dev =
4267 netdev_dev_linux_cast(netdev_get_dev(netdev_));
4269 if (!(netdev_dev->cache_valid & VALID_IFINDEX)) {
4270 int ifindex = do_get_ifindex(netdev_get_name(netdev_));
4274 netdev_dev->cache_valid |= VALID_IFINDEX;
4275 netdev_dev->ifindex = ifindex;
4277 *ifindexp = netdev_dev->ifindex;
4282 get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN])
4287 memset(&ifr, 0, sizeof ifr);
4288 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4289 COVERAGE_INC(netdev_get_hwaddr);
4290 if (ioctl(af_inet_sock, SIOCGIFHWADDR, &ifr) < 0) {
4291 /* ENODEV probably means that a vif disappeared asynchronously and
4292 * hasn't been removed from the database yet, so reduce the log level
4293 * to INFO for that case. */
4294 VLOG(errno == ENODEV ? VLL_INFO : VLL_ERR,
4295 "ioctl(SIOCGIFHWADDR) on %s device failed: %s",
4296 netdev_name, strerror(errno));
4299 hwaddr_family = ifr.ifr_hwaddr.sa_family;
4300 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER) {
4301 VLOG_WARN("%s device has unknown hardware address family %d",
4302 netdev_name, hwaddr_family);
4304 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
4309 set_etheraddr(const char *netdev_name, int hwaddr_family,
4310 const uint8_t mac[ETH_ADDR_LEN])
4314 memset(&ifr, 0, sizeof ifr);
4315 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4316 ifr.ifr_hwaddr.sa_family = hwaddr_family;
4317 memcpy(ifr.ifr_hwaddr.sa_data, mac, ETH_ADDR_LEN);
4318 COVERAGE_INC(netdev_set_hwaddr);
4319 if (ioctl(af_inet_sock, SIOCSIFHWADDR, &ifr) < 0) {
4320 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
4321 netdev_name, strerror(errno));
4328 netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
4329 int cmd, const char *cmd_name)
4333 memset(&ifr, 0, sizeof ifr);
4334 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
4335 ifr.ifr_data = (caddr_t) ecmd;
4338 COVERAGE_INC(netdev_ethtool);
4339 if (ioctl(af_inet_sock, SIOCETHTOOL, &ifr) == 0) {
4342 if (errno != EOPNOTSUPP) {
4343 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
4344 "failed: %s", cmd_name, name, strerror(errno));
4346 /* The device doesn't support this operation. That's pretty
4347 * common, so there's no point in logging anything. */
4354 netdev_linux_do_ioctl(const char *name, struct ifreq *ifr, int cmd,
4355 const char *cmd_name)
4357 ovs_strzcpy(ifr->ifr_name, name, sizeof ifr->ifr_name);
4358 if (ioctl(af_inet_sock, cmd, ifr) == -1) {
4359 VLOG_DBG_RL(&rl, "%s: ioctl(%s) failed: %s", name, cmd_name,
4367 netdev_linux_get_ipv4(const struct netdev *netdev, struct in_addr *ip,
4368 int cmd, const char *cmd_name)
4373 ifr.ifr_addr.sa_family = AF_INET;
4374 error = netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, cmd, cmd_name);
4376 const struct sockaddr_in *sin = (struct sockaddr_in *) &ifr.ifr_addr;
4377 *ip = sin->sin_addr;
4382 /* Returns an AF_PACKET raw socket or a negative errno value. */
4384 af_packet_sock(void)
4386 static int sock = INT_MIN;
4388 if (sock == INT_MIN) {
4389 sock = socket(AF_PACKET, SOCK_RAW, 0);
4391 set_nonblocking(sock);
4394 VLOG_ERR("failed to create packet socket: %s", strerror(errno));