2 * Copyright (c) 2009, 2010, 2011, 2012, 2013 Nicira, Inc.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
19 #include "netdev-linux.h"
23 #include <arpa/inet.h>
25 #include <linux/filter.h>
26 #include <linux/gen_stats.h>
27 #include <linux/if_ether.h>
28 #include <linux/if_tun.h>
29 #include <linux/types.h>
30 #include <linux/ethtool.h>
31 #include <linux/mii.h>
32 #include <linux/pkt_cls.h>
33 #include <linux/pkt_sched.h>
34 #include <linux/rtnetlink.h>
35 #include <linux/sockios.h>
36 #include <linux/version.h>
37 #include <sys/types.h>
38 #include <sys/ioctl.h>
39 #include <sys/socket.h>
40 #include <netpacket/packet.h>
42 #include <net/if_arp.h>
43 #include <net/if_packet.h>
44 #include <net/route.h>
45 #include <netinet/in.h>
52 #include "dpif-linux.h"
53 #include "dynamic-string.h"
54 #include "fatal-signal.h"
57 #include "netdev-provider.h"
58 #include "netdev-vport.h"
59 #include "netlink-notifier.h"
60 #include "netlink-socket.h"
63 #include "openflow/openflow.h"
65 #include "poll-loop.h"
66 #include "rtnetlink-link.h"
68 #include "socket-util.h"
71 #include "unaligned.h"
74 VLOG_DEFINE_THIS_MODULE(netdev_linux);
76 COVERAGE_DEFINE(netdev_set_policing);
77 COVERAGE_DEFINE(netdev_arp_lookup);
78 COVERAGE_DEFINE(netdev_get_ifindex);
79 COVERAGE_DEFINE(netdev_get_hwaddr);
80 COVERAGE_DEFINE(netdev_set_hwaddr);
81 COVERAGE_DEFINE(netdev_get_ethtool);
82 COVERAGE_DEFINE(netdev_set_ethtool);
85 /* These were introduced in Linux 2.6.14, so they might be missing if we have
87 #ifndef ADVERTISED_Pause
88 #define ADVERTISED_Pause (1 << 13)
90 #ifndef ADVERTISED_Asym_Pause
91 #define ADVERTISED_Asym_Pause (1 << 14)
94 /* These were introduced in Linux 2.6.24, so they might be missing if we
95 * have old headers. */
96 #ifndef ETHTOOL_GFLAGS
97 #define ETHTOOL_GFLAGS 0x00000025 /* Get flags bitmap(ethtool_value) */
99 #ifndef ETHTOOL_SFLAGS
100 #define ETHTOOL_SFLAGS 0x00000026 /* Set flags bitmap(ethtool_value) */
103 /* This was introduced in Linux 2.6.25, so it might be missing if we have old
106 #define TC_RTAB_SIZE 1024
110 VALID_IFINDEX = 1 << 0,
111 VALID_ETHERADDR = 1 << 1,
115 VALID_POLICING = 1 << 5,
116 VALID_VPORT_STAT_ERROR = 1 << 6,
117 VALID_DRVINFO = 1 << 7,
118 VALID_FEATURES = 1 << 8,
121 /* Traffic control. */
123 /* An instance of a traffic control class. Always associated with a particular
126 * Each TC implementation subclasses this with whatever additional data it
129 const struct tc_ops *ops;
130 struct hmap queues; /* Contains "struct tc_queue"s.
131 * Read by generic TC layer.
132 * Written only by TC implementation. */
135 #define TC_INITIALIZER(TC, OPS) { OPS, HMAP_INITIALIZER(&(TC)->queues) }
137 /* One traffic control queue.
139 * Each TC implementation subclasses this with whatever additional data it
142 struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
143 unsigned int queue_id; /* OpenFlow queue ID. */
144 long long int created; /* Time queue was created, in msecs. */
147 /* A particular kind of traffic control. Each implementation generally maps to
148 * one particular Linux qdisc class.
150 * The functions below return 0 if successful or a positive errno value on
151 * failure, except where otherwise noted. All of them must be provided, except
152 * where otherwise noted. */
154 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
155 * This is null for tc_ops_default and tc_ops_other, for which there are no
156 * appropriate values. */
157 const char *linux_name;
159 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
160 const char *ovs_name;
162 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
163 * queues. The queues are numbered 0 through n_queues - 1. */
164 unsigned int n_queues;
166 /* Called to install this TC class on 'netdev'. The implementation should
167 * make the Netlink calls required to set up 'netdev' with the right qdisc
168 * and configure it according to 'details'. The implementation may assume
169 * that the current qdisc is the default; that is, there is no need for it
170 * to delete the current qdisc before installing itself.
172 * The contents of 'details' should be documented as valid for 'ovs_name'
173 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
174 * (which is built as ovs-vswitchd.conf.db(8)).
176 * This function must return 0 if and only if it sets 'netdev->tc' to an
177 * initialized 'struct tc'.
179 * (This function is null for tc_ops_other, which cannot be installed. For
180 * other TC classes it should always be nonnull.) */
181 int (*tc_install)(struct netdev *netdev, const struct smap *details);
183 /* Called when the netdev code determines (through a Netlink query) that
184 * this TC class's qdisc is installed on 'netdev', but we didn't install
185 * it ourselves and so don't know any of the details.
187 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
188 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
189 * implementation should parse the other attributes of 'nlmsg' as
190 * necessary to determine its configuration. If necessary it should also
191 * use Netlink queries to determine the configuration of queues on
194 * This function must return 0 if and only if it sets 'netdev->tc' to an
195 * initialized 'struct tc'. */
196 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
198 /* Destroys the data structures allocated by the implementation as part of
199 * 'tc'. (This includes destroying 'tc->queues' by calling
202 * The implementation should not need to perform any Netlink calls. If
203 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
204 * (But it may not be desirable.)
206 * This function may be null if 'tc' is trivial. */
207 void (*tc_destroy)(struct tc *tc);
209 /* Retrieves details of 'netdev->tc' configuration into 'details'.
211 * The implementation should not need to perform any Netlink calls, because
212 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
213 * cached the configuration.
215 * The contents of 'details' should be documented as valid for 'ovs_name'
216 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
217 * (which is built as ovs-vswitchd.conf.db(8)).
219 * This function may be null if 'tc' is not configurable.
221 int (*qdisc_get)(const struct netdev *netdev, struct smap *details);
223 /* Reconfigures 'netdev->tc' according to 'details', performing any
224 * required Netlink calls to complete the reconfiguration.
226 * The contents of 'details' should be documented as valid for 'ovs_name'
227 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
228 * (which is built as ovs-vswitchd.conf.db(8)).
230 * This function may be null if 'tc' is not configurable.
232 int (*qdisc_set)(struct netdev *, const struct smap *details);
234 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
235 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
237 * The contents of 'details' should be documented as valid for 'ovs_name'
238 * in the "other_config" column in the "Queue" table in
239 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
241 * The implementation should not need to perform any Netlink calls, because
242 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
243 * cached the queue configuration.
245 * This function may be null if 'tc' does not have queues ('n_queues' is
247 int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
248 struct smap *details);
250 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
251 * 'details', perfoming any required Netlink calls to complete the
252 * reconfiguration. The caller ensures that 'queue_id' is less than
255 * The contents of 'details' should be documented as valid for 'ovs_name'
256 * in the "other_config" column in the "Queue" table in
257 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
259 * This function may be null if 'tc' does not have queues or its queues are
260 * not configurable. */
261 int (*class_set)(struct netdev *, unsigned int queue_id,
262 const struct smap *details);
264 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
265 * tc_queue's within 'netdev->tc->queues'.
267 * This function may be null if 'tc' does not have queues or its queues
268 * cannot be deleted. */
269 int (*class_delete)(struct netdev *, struct tc_queue *queue);
271 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
272 * 'struct tc_queue's within 'netdev->tc->queues'.
274 * On success, initializes '*stats'.
276 * This function may be null if 'tc' does not have queues or if it cannot
277 * report queue statistics. */
278 int (*class_get_stats)(const struct netdev *netdev,
279 const struct tc_queue *queue,
280 struct netdev_queue_stats *stats);
282 /* Extracts queue stats from 'nlmsg', which is a response to a
283 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
285 * This function may be null if 'tc' does not have queues or if it cannot
286 * report queue statistics. */
287 int (*class_dump_stats)(const struct netdev *netdev,
288 const struct ofpbuf *nlmsg,
289 netdev_dump_queue_stats_cb *cb, void *aux);
/* Initializes 'tc' as a new instance of TC class 'ops'.
 * NOTE(review): this chunk is a damaged extraction -- brace-only and other
 * short lines were dropped (e.g. the assignment of 'ops' into 'tc' is not
 * visible) and a stale line number is fused onto each line.  Code is kept
 * byte-identical; only comments are added. */
293 tc_init(struct tc *tc, const struct tc_ops *ops)
296 hmap_init(&tc->queues);

/* Destroys the data structures held directly by 'tc' (its 'queues' hmap). */
300 tc_destroy(struct tc *tc)
302 hmap_destroy(&tc->queues);
305 static const struct tc_ops tc_ops_htb;
306 static const struct tc_ops tc_ops_hfsc;
307 static const struct tc_ops tc_ops_default;
308 static const struct tc_ops tc_ops_other;
310 static const struct tc_ops *const tcs[] = {
311 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
312 &tc_ops_hfsc, /* Hierarchical fair service curve. */
313 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
314 &tc_ops_other, /* Some other qdisc. */
318 static unsigned int tc_make_handle(unsigned int major, unsigned int minor);
319 static unsigned int tc_get_major(unsigned int handle);
320 static unsigned int tc_get_minor(unsigned int handle);
322 static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
323 static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
324 static unsigned int tc_buffer_per_jiffy(unsigned int rate);
326 static struct tcmsg *tc_make_request(const struct netdev *, int type,
327 unsigned int flags, struct ofpbuf *);
328 static int tc_transact(struct ofpbuf *request, struct ofpbuf **replyp);
329 static int tc_add_del_ingress_qdisc(struct netdev *netdev, bool add);
330 static int tc_add_policer(struct netdev *netdev, int kbits_rate,
333 static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
334 struct nlattr **options);
335 static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
336 struct nlattr **options,
337 struct netdev_queue_stats *);
338 static int tc_query_class(const struct netdev *,
339 unsigned int handle, unsigned int parent,
340 struct ofpbuf **replyp);
341 static int tc_delete_class(const struct netdev *, unsigned int handle);
343 static int tc_del_qdisc(struct netdev *netdev);
344 static int tc_query_qdisc(const struct netdev *netdev);
346 static int tc_calc_cell_log(unsigned int mtu);
347 static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
348 static void tc_put_rtab(struct ofpbuf *, uint16_t type,
349 const struct tc_ratespec *rate);
350 static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
/* Per-device state for a Linux network device.  Most fields are lazily
 * computed caches guarded by 'cache_valid' (a bitmap of the VALID_* enum
 * values declared earlier in this file).  NOTE(review): several members
 * referenced by the functions below (e.g. 'up', 'mtu', 'ifindex', 'tc',
 * 'tap_fd') are missing from this damaged extraction. */
352 struct netdev_linux {
355 /* Protects all members below. */
356 struct ovs_mutex mutex;
358 unsigned int cache_valid;
359 unsigned int change_seq;
361 bool miimon; /* Link status of last poll. */
362 long long int miimon_interval; /* Miimon Poll rate. Disabled if <= 0. */
363 struct timer miimon_timer;
365 /* The following are figured out "on demand" only. They are only valid
366 * when the corresponding VALID_* bit in 'cache_valid' is set. */
368 uint8_t etheraddr[ETH_ADDR_LEN];
369 struct in_addr address, netmask;
372 unsigned int ifi_flags;
373 long long int carrier_resets;
374 uint32_t kbits_rate; /* Policing data. */
375 uint32_t kbits_burst;
376 int vport_stats_error; /* Cached error code from vport_get_stats().
377 0 or an errno value. */
378 int netdev_mtu_error; /* Cached error code from SIOCGIFMTU or SIOCSIFMTU. */
379 int ether_addr_error; /* Cached error code from set/get etheraddr. */
380 int netdev_policing_error; /* Cached error code from set policing. */
381 int get_features_error; /* Cached error code from ETHTOOL_GSET. */
382 int get_ifindex_error; /* Cached error code from SIOCGIFINDEX. */
384 enum netdev_features current; /* Cached from ETHTOOL_GSET. */
385 enum netdev_features advertised; /* Cached from ETHTOOL_GSET. */
386 enum netdev_features supported; /* Cached from ETHTOOL_GSET. */
388 struct ethtool_drvinfo drvinfo; /* Cached from ETHTOOL_GDRVINFO. */
391 /* For devices of class netdev_tap_class only. */
395 struct netdev_rx_linux {
401 /* This is set pretty low because we probably won't learn anything from the
402 * additional log messages. */
403 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
405 static void netdev_linux_run(void);
407 static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
408 int cmd, const char *cmd_name);
409 static int netdev_linux_get_ipv4(const struct netdev *, struct in_addr *,
410 int cmd, const char *cmd_name);
411 static int get_flags(const struct netdev *, unsigned int *flags);
412 static int set_flags(const char *, unsigned int flags);
413 static int update_flags(struct netdev_linux *netdev, enum netdev_flags off,
414 enum netdev_flags on, enum netdev_flags *old_flagsp)
415 OVS_REQUIRES(netdev->mutex);
416 static int do_get_ifindex(const char *netdev_name);
417 static int get_ifindex(const struct netdev *, int *ifindexp);
418 static int do_set_addr(struct netdev *netdev,
419 int ioctl_nr, const char *ioctl_name,
420 struct in_addr addr);
421 static int get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN]);
422 static int set_etheraddr(const char *netdev_name, const uint8_t[ETH_ADDR_LEN]);
423 static int get_stats_via_netlink(int ifindex, struct netdev_stats *stats);
424 static int get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats);
425 static int af_packet_sock(void);
426 static void netdev_linux_miimon_run(void);
427 static void netdev_linux_miimon_wait(void);
/* Returns true if 'netdev_class' is a Linux netdev class, identified by its
 * 'run' callback being netdev_linux_run. */
430 is_netdev_linux_class(const struct netdev_class *netdev_class)
432 return netdev_class->run == netdev_linux_run;

/* Returns true if 'netdev' is a tap device (netdev_tap_class). */
436 is_tap_netdev(const struct netdev *netdev)
438 return netdev_get_class(netdev) == &netdev_tap_class;

/* Downcasts generic 'netdev' to its containing netdev_linux.  Asserts first
 * that the device really belongs to a Linux netdev class. */
441 static struct netdev_linux *
442 netdev_linux_cast(const struct netdev *netdev)
444 ovs_assert(is_netdev_linux_class(netdev_get_class(netdev)));
446 return CONTAINER_OF(netdev, struct netdev_linux, up);

/* Downcasts generic 'rx' to its containing netdev_rx_linux, with the same
 * class assertion as netdev_linux_cast(). */
449 static struct netdev_rx_linux *
450 netdev_rx_linux_cast(const struct netdev_rx *rx)
452 ovs_assert(is_netdev_linux_class(netdev_get_class(rx->netdev)));
453 return CONTAINER_OF(rx, struct netdev_rx_linux, up);
456 static void netdev_linux_update(struct netdev_linux *netdev,
457 const struct rtnetlink_link_change *)
458 OVS_REQUIRES(netdev->mutex);
459 static void netdev_linux_changed(struct netdev_linux *netdev,
460 unsigned int ifi_flags, unsigned int mask)
461 OVS_REQUIRES(netdev->mutex);
463 /* Returns a NETLINK_ROUTE socket listening for RTNLGRP_LINK changes, or NULL
464 * if no such socket could be created. */
465 static struct nl_sock *
466 netdev_linux_notify_sock(void)
/* 'sock' is created exactly once, on the first call, using the ovsthread_once
 * pattern; later calls just return the cached pointer. */
468 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
469 static struct nl_sock *sock;
471 if (ovsthread_once_start(&once)) {
474 error = nl_sock_create(NETLINK_ROUTE, &sock);
/* Join the link-change multicast group; on failure the socket is destroyed
 * (presumably 'sock' is also reset to NULL -- that line is not visible in
 * this extraction). */
476 error = nl_sock_join_mcgroup(sock, RTNLGRP_LINK);
478 nl_sock_destroy(sock);
482 ovsthread_once_done(&once);
/* Class-level 'run' callback: polls miimon state, then drains the netlink
 * notification socket and pushes each RTNLGRP_LINK change into the matching
 * netdev_linux instance. */
489 netdev_linux_run(void)
491 struct nl_sock *sock;
494 netdev_linux_miimon_run();
496 sock = netdev_linux_notify_sock();
502 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
503 uint64_t buf_stub[4096 / 8];
506 ofpbuf_use_stub(&buf, buf_stub, sizeof buf_stub);
507 error = nl_sock_recv(sock, &buf, false);
509 struct rtnetlink_link_change change;
511 if (rtnetlink_link_parse(&buf, &change)) {
/* Look up the affected device by name; netdev_from_name() takes a reference
 * that is released via netdev_close() below. */
512 struct netdev *netdev_ = netdev_from_name(change.ifname);
513 if (netdev_ && is_netdev_linux_class(netdev_->netdev_class)) {
514 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
516 ovs_mutex_lock(&netdev->mutex);
517 netdev_linux_update(netdev, &change);
518 ovs_mutex_unlock(&netdev->mutex);
520 netdev_close(netdev_);
/* ENOBUFS means the kernel dropped notifications: we may have missed
 * changes, so re-read the flags of every Linux netdev and invalidate
 * all cached state (mask 0). */
522 } else if (error == ENOBUFS) {
523 struct shash device_shash;
524 struct shash_node *node;
528 shash_init(&device_shash);
529 netdev_get_devices(&netdev_linux_class, &device_shash);
530 SHASH_FOR_EACH (node, &device_shash) {
531 struct netdev *netdev_ = node->data;
532 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
535 ovs_mutex_lock(&netdev->mutex);
536 get_flags(netdev_, &flags);
537 netdev_linux_changed(netdev, flags, 0);
538 ovs_mutex_unlock(&netdev->mutex);
540 netdev_close(netdev_);
542 shash_destroy(&device_shash);
/* EAGAIN simply means "no more messages"; anything else is logged
 * (rate-limited). */
543 } else if (error != EAGAIN) {
544 VLOG_WARN_RL(&rl, "error reading or parsing netlink (%s)",
545 ovs_strerror(error));
/* Class-level 'wait' callback: arranges for poll_block() to wake up when
 * miimon polling is due or the netlink notification socket is readable. */
552 netdev_linux_wait(void)
554 struct nl_sock *sock;
556 netdev_linux_miimon_wait();
557 sock = netdev_linux_notify_sock();
559 nl_sock_wait(sock, POLLIN);
/* Records that 'dev' changed: counts a carrier reset if IFF_RUNNING toggled,
 * stores the new interface flags, and invalidates every cached-state bit not
 * set in 'mask'.  Caller must hold dev->mutex (OVS_REQUIRES). */
564 netdev_linux_changed(struct netdev_linux *dev,
565 unsigned int ifi_flags, unsigned int mask)
566 OVS_REQUIRES(dev->mutex)
/* NOTE(review): the body of this 'if' (likely a change_seq wrap-around fixup)
 * is missing from this damaged extraction -- confirm against upstream. */
569 if (!dev->change_seq) {
573 if ((dev->ifi_flags ^ ifi_flags) & IFF_RUNNING) {
574 dev->carrier_resets++;
576 dev->ifi_flags = ifi_flags;
/* Keep only the cache bits the caller says are still valid. */
578 dev->cache_valid &= mask;
/* Applies a parsed rtnetlink link-change message to 'dev'.  For RTM_NEWLINK,
 * refreshes the cached MTU, Ethernet address, and ifindex directly from the
 * message (marking each VALID_* bit); other message types just invalidate all
 * cached state.  Caller must hold dev->mutex. */
582 netdev_linux_update(struct netdev_linux *dev,
583 const struct rtnetlink_link_change *change)
584 OVS_REQUIRES(dev->mutex)
586 if (change->nlmsg_type == RTM_NEWLINK) {
/* VALID_DRVINFO is kept: driver info does not change on a link event. */
588 netdev_linux_changed(dev, change->ifi_flags, VALID_DRVINFO);
590 /* Update netdev from rtnl-change msg. */
592 dev->mtu = change->mtu;
593 dev->cache_valid |= VALID_MTU;
594 dev->netdev_mtu_error = 0;
597 if (!eth_addr_is_zero(change->addr)) {
598 memcpy(dev->etheraddr, change->addr, ETH_ADDR_LEN);
599 dev->cache_valid |= VALID_ETHERADDR;
600 dev->ether_addr_error = 0;
603 dev->ifindex = change->ifi_index;
604 dev->cache_valid |= VALID_IFINDEX;
605 dev->get_ifindex_error = 0;
/* Non-RTM_NEWLINK (e.g. deletion): invalidate everything (mask 0). */
608 netdev_linux_changed(dev, change->ifi_flags, 0);
/* 'alloc' callback: zero-allocates a netdev_linux and (in lines not visible
 * here) returns its embedded generic netdev. */
612 static struct netdev *
613 netdev_linux_alloc(void)
615 struct netdev_linux *netdev = xzalloc(sizeof *netdev);

/* Shared construction for all netdev_linux flavors: initializes the mutex and
 * seeds the change sequence number at 1 (0 is reserved -- see
 * netdev_linux_changed()). */
620 netdev_linux_common_construct(struct netdev_linux *netdev)
622 ovs_mutex_init(&netdev->mutex);
623 netdev->change_seq = 1;
626 /* Creates system and internal devices. */
628 netdev_linux_construct(struct netdev *netdev_)
630 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
633 netdev_linux_common_construct(netdev);
/* Probe the device by reading its flags.  ENODEV is fatal for system devices
 * but tolerated for "internal" ones, which may not exist in the kernel yet. */
635 error = get_flags(&netdev->up, &netdev->ifi_flags);
636 if (error == ENODEV) {
637 if (netdev->up.netdev_class != &netdev_internal_class) {
638 /* The device does not exist, so don't allow it to be opened. */
641 /* "Internal" netdevs have to be created as netdev objects before
642 * they exist in the kernel, because creating them in the kernel
643 * happens by passing a netdev object to dpif_port_add().
644 * Therefore, ignore the error. */
651 /* For most types of netdevs we open the device for each call of
652 * netdev_open(). However, this is not the case with tap devices,
653 * since it is only possible to open the device once. In this
654 * situation we share a single file descriptor, and consequently
655 * buffers, across all readers. Therefore once data is read it will
656 * be unavailable to other reads for tap devices. */
658 netdev_linux_construct_tap(struct netdev *netdev_)
660 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
661 static const char tap_dev[] = "/dev/net/tun";
662 const char *name = netdev_->name;
666 netdev_linux_common_construct(netdev);
668 /* Open tap device. */
669 netdev->tap_fd = open(tap_dev, O_RDWR);
670 if (netdev->tap_fd < 0) {
672 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, ovs_strerror(error));
676 /* Create tap device. */
/* IFF_NO_PI: kernel does not prepend packet-info headers, so reads/writes
 * carry raw Ethernet frames. */
677 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
678 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
679 if (ioctl(netdev->tap_fd, TUNSETIFF, &ifr) == -1) {
680 VLOG_WARN("%s: creating tap device failed: %s", name,
681 ovs_strerror(errno));
686 /* Make non-blocking. */
687 error = set_nonblocking(netdev->tap_fd);
/* Error path: close the fd before returning (cleanup label lines are missing
 * from this extraction). */
695 close(netdev->tap_fd);
/* 'destruct' callback: tears down any installed TC state, closes the tap fd
 * for tap devices, and destroys the mutex. */
700 netdev_linux_destruct(struct netdev *netdev_)
702 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
704 if (netdev->tc && netdev->tc->ops->tc_destroy) {
705 netdev->tc->ops->tc_destroy(netdev->tc);
708 if (netdev_get_class(netdev_) == &netdev_tap_class
709 && netdev->tap_fd >= 0)
711 close(netdev->tap_fd);
714 ovs_mutex_destroy(&netdev->mutex);

/* 'dealloc' callback: frees the container allocated by netdev_linux_alloc()
 * (the free() line is not visible in this extraction). */
718 netdev_linux_dealloc(struct netdev *netdev_)
720 struct netdev_linux *netdev = netdev_linux_cast(netdev_);

/* 'rx_alloc' callback: zero-allocates a netdev_rx_linux. */
724 static struct netdev_rx *
725 netdev_linux_rx_alloc(void)
727 struct netdev_rx_linux *rx = xzalloc(sizeof *rx);
/* 'rx_construct' callback.  For tap devices the shared tap fd is reused; for
 * everything else a per-rx AF_PACKET raw socket is created, made nonblocking,
 * bound to the device's ifindex, and fitted with a BPF filter that accepts
 * only inbound packets (so locally transmitted frames are not looped back). */
732 netdev_linux_rx_construct(struct netdev_rx *rx_)
734 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
735 struct netdev *netdev_ = rx->up.netdev;
736 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
739 ovs_mutex_lock(&netdev->mutex);
740 rx->is_tap = is_tap_netdev(netdev_);
742 rx->fd = netdev->tap_fd;
744 struct sockaddr_ll sll;
746 /* Result of tcpdump -dd inbound */
747 static const struct sock_filter filt[] = {
748 { 0x28, 0, 0, 0xfffff004 }, /* ldh [0] */
749 { 0x15, 0, 1, 0x00000004 }, /* jeq #4 jt 2 jf 3 */
750 { 0x6, 0, 0, 0x00000000 }, /* ret #0 */
751 { 0x6, 0, 0, 0x0000ffff } /* ret #65535 */
753 static const struct sock_fprog fprog = {
754 ARRAY_SIZE(filt), (struct sock_filter *) filt
757 /* Create file descriptor. */
758 rx->fd = socket(PF_PACKET, SOCK_RAW, 0);
761 VLOG_ERR("failed to create raw socket (%s)", ovs_strerror(error));
765 /* Set non-blocking mode. */
766 error = set_nonblocking(rx->fd);
771 /* Get ethernet device index. */
772 error = get_ifindex(&netdev->up, &ifindex);
777 /* Bind to specific ethernet device. */
778 memset(&sll, 0, sizeof sll);
779 sll.sll_family = AF_PACKET;
780 sll.sll_ifindex = ifindex;
781 sll.sll_protocol = (OVS_FORCE unsigned short int) htons(ETH_P_ALL);
782 if (bind(rx->fd, (struct sockaddr *) &sll, sizeof sll) < 0) {
784 VLOG_ERR("%s: failed to bind raw socket (%s)",
785 netdev_get_name(netdev_), ovs_strerror(error));
789 /* Filter for only inbound packets. */
790 error = setsockopt(rx->fd, SOL_SOCKET, SO_ATTACH_FILTER, &fprog,
794 VLOG_ERR("%s: failed to attach filter (%s)",
795 netdev_get_name(netdev_), ovs_strerror(error));
/* Success and error paths both release the device mutex; the error path also
 * closes rx->fd in lines not visible in this extraction. */
799 ovs_mutex_unlock(&netdev->mutex);
807 ovs_mutex_unlock(&netdev->mutex);
/* 'rx_destruct' callback (fd close, if any, is in lines not visible here). */
812 netdev_linux_rx_destruct(struct netdev_rx *rx_)
814 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);

/* 'rx_dealloc' callback: frees the netdev_rx_linux container. */
822 netdev_linux_rx_dealloc(struct netdev_rx *rx_)
824 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);

/* 'recv' callback: reads one packet into 'data'.  Tap fds use read(); raw
 * sockets use recv() with MSG_TRUNC so an oversized packet reports its full
 * length, which the caller sees as -EMSGSIZE.  EINTR is retried; EAGAIN is
 * silent, other errors are logged (rate-limited). */
830 netdev_linux_rx_recv(struct netdev_rx *rx_, void *data, size_t size)
832 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
837 ? read(rx->fd, data, size)
838 : recv(rx->fd, data, size, MSG_TRUNC));
839 } while (retval < 0 && errno == EINTR);
842 return retval > size ? -EMSGSIZE : retval;
844 if (errno != EAGAIN) {
/* NOTE(review): the name and the error string appear to be in swapped
 * positions for this format string -- "%s: %s" here expands to
 * (strerror, name); compare the sendmsg warning below.  Confirm against
 * upstream before relying on the log text. */
845 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
846 ovs_strerror(errno), netdev_rx_get_name(rx_));
/* 'rx_wait' callback: wakes the poll loop when rx->fd becomes readable. */
853 netdev_linux_rx_wait(struct netdev_rx *rx_)
855 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
856 poll_fd_wait(rx->fd, POLLIN);

/* 'drain' callback: discards queued packets.  Tap fds drain up to the
 * device's tx queue length (SIOCGIFTXQLEN); raw sockets drain the socket
 * receive buffer. */
860 netdev_linux_rx_drain(struct netdev_rx *rx_)
862 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
865 int error = af_inet_ifreq_ioctl(netdev_rx_get_name(rx_), &ifr,
866 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
870 drain_fd(rx->fd, ifr.ifr_qlen);
873 return drain_rcvbuf(rx->fd);
877 /* Sends 'buffer' on 'netdev'. Returns 0 if successful, otherwise a positive
878 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
879 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
880 * the packet is too big or too small to transmit on the device.
882 * The caller retains ownership of 'buffer' in all cases.
884 * The kernel maintains a packet transmission queue, so the caller is not
885 * expected to do additional queuing of packets. */
887 netdev_linux_send(struct netdev *netdev_, const void *data, size_t size)
892 if (!is_tap_netdev(netdev_)) {
893 /* Use our AF_PACKET socket to send to this device. */
894 struct sockaddr_ll sll;
900 sock = af_packet_sock();
/* netdev_get_ifindex() returns a negative errno on failure (handled in
 * lines not visible in this extraction). */
905 ifindex = netdev_get_ifindex(netdev_);
910 /* We don't bother setting most fields in sockaddr_ll because the
911 * kernel ignores them for SOCK_RAW. */
912 memset(&sll, 0, sizeof sll);
913 sll.sll_family = AF_PACKET;
914 sll.sll_ifindex = ifindex;
916 iov.iov_base = CONST_CAST(void *, data);
920 msg.msg_namelen = sizeof sll;
923 msg.msg_control = NULL;
924 msg.msg_controllen = 0;
927 retval = sendmsg(sock, &msg, 0);
929 /* Use the tap fd to send to this device. This is essential for
930 * tap devices, because packets sent to a tap device with an
931 * AF_PACKET socket will loop back to be *received* again on the
932 * tap device. This doesn't occur on other interface types
933 * because we attach a socket filter to the rx socket. */
934 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
936 retval = write(netdev->tap_fd, data, size);
940 /* The Linux AF_PACKET implementation never blocks waiting for room
941 * for packets, instead returning ENOBUFS. Translate this into
942 * EAGAIN for the caller. */
943 if (errno == ENOBUFS) {
/* EINTR: retry (loop-back lines are missing from this extraction). */
945 } else if (errno == EINTR) {
947 } else if (errno != EAGAIN) {
948 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
949 netdev_get_name(netdev_), ovs_strerror(errno));
/* A short write counts as a failure (EMSGSIZE per the contract above). */
952 } else if (retval != size) {
953 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%zd bytes of "
954 "%zu) on %s", retval, size, netdev_get_name(netdev_));
962 /* Registers with the poll loop to wake up from the next call to poll_block()
963 * when the packet transmission queue has sufficient room to transmit a packet
964 * with netdev_send().
966 * The kernel maintains a packet transmission queue, so the client is not
967 * expected to do additional queuing of packets. Thus, this function is
968 * unlikely to ever be used. It is included for completeness. */
970 netdev_linux_send_wait(struct netdev *netdev)
972 if (is_tap_netdev(netdev)) {
973 /* TAP device always accepts packets.*/
974 poll_immediate_wake();
978 /* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
979 * otherwise a positive errno value. */
981 netdev_linux_set_etheraddr(struct netdev *netdev_,
982 const uint8_t mac[ETH_ADDR_LEN])
984 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
985 enum netdev_flags old_flags = 0;
988 ovs_mutex_lock(&netdev->mutex);
/* Fast path: if the cached address already matches (or a cached error is
 * pending), skip the ioctl; otherwise drop the stale cache entry. */
990 if (netdev->cache_valid & VALID_ETHERADDR) {
991 error = netdev->ether_addr_error;
992 if (error || eth_addr_equals(netdev->etheraddr, mac)) {
995 netdev->cache_valid &= ~VALID_ETHERADDR;
998 /* Tap devices must be brought down before setting the address. */
999 if (is_tap_netdev(netdev_)) {
1000 update_flags(netdev, NETDEV_UP, 0, &old_flags);
1002 error = set_etheraddr(netdev_get_name(netdev_), mac);
/* Cache the outcome even for ENODEV so repeated attempts on a vanished
 * device do not re-issue the ioctl. */
1003 if (!error || error == ENODEV) {
1004 netdev->ether_addr_error = error;
1005 netdev->cache_valid |= VALID_ETHERADDR;
1007 memcpy(netdev->etheraddr, mac, ETH_ADDR_LEN);
/* Bring a tap device back up if it was up before we touched it. */
1011 if (is_tap_netdev(netdev_) && old_flags & NETDEV_UP) {
1012 update_flags(netdev, 0, NETDEV_UP, &old_flags);
1016 ovs_mutex_unlock(&netdev->mutex);
1020 /* Copies 'netdev''s MAC address to 'mac' which is passed as param. */
1022 netdev_linux_get_etheraddr(const struct netdev *netdev_,
1023 uint8_t mac[ETH_ADDR_LEN])
1025 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1028 ovs_mutex_lock(&netdev->mutex);
/* Populate the cache on first use; afterwards serve from it. */
1029 if (!(netdev->cache_valid & VALID_ETHERADDR)) {
1030 netdev->ether_addr_error = get_etheraddr(netdev_get_name(netdev_),
1032 netdev->cache_valid |= VALID_ETHERADDR;
1035 error = netdev->ether_addr_error;
1037 memcpy(mac, netdev->etheraddr, ETH_ADDR_LEN);
1039 ovs_mutex_unlock(&netdev->mutex);
/* Internal helper: returns the cached MTU in '*mtup', querying the kernel via
 * SIOCGIFMTU on a cache miss.  Caller must hold netdev->mutex. */
1045 netdev_linux_get_mtu__(struct netdev_linux *netdev, int *mtup)
1049 if (!(netdev->cache_valid & VALID_MTU)) {
1052 netdev->netdev_mtu_error = af_inet_ifreq_ioctl(
1053 netdev_get_name(&netdev->up), &ifr, SIOCGIFMTU, "SIOCGIFMTU");
1054 netdev->mtu = ifr.ifr_mtu;
1055 netdev->cache_valid |= VALID_MTU;
1058 error = netdev->netdev_mtu_error;
1060 *mtup = netdev->mtu;

1066 /* Returns the maximum size of transmitted (and received) packets on 'netdev',
1067 * in bytes, not including the hardware header; thus, this is typically 1500
1068 * bytes for Ethernet devices. */
1070 netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
1072 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1075 ovs_mutex_lock(&netdev->mutex);
1076 error = netdev_linux_get_mtu__(netdev, mtup);
1077 ovs_mutex_unlock(&netdev->mutex);
1082 /* Sets the maximum size of transmitted (MTU) for given device using linux
1083 * networking ioctl interface.
1086 netdev_linux_set_mtu(const struct netdev *netdev_, int mtu)
1088 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1092 ovs_mutex_lock(&netdev->mutex);
/* Fast path: skip the ioctl when the cached MTU already matches or a cached
 * error is pending; otherwise invalidate the stale cache entry first. */
1093 if (netdev->cache_valid & VALID_MTU) {
1094 error = netdev->netdev_mtu_error;
1095 if (error || netdev->mtu == mtu) {
1098 netdev->cache_valid &= ~VALID_MTU;
1101 error = af_inet_ifreq_ioctl(netdev_get_name(netdev_), &ifr,
1102 SIOCSIFMTU, "SIOCSIFMTU");
/* Cache success and ENODEV outcomes, mirroring set_etheraddr() above. */
1103 if (!error || error == ENODEV) {
1104 netdev->netdev_mtu_error = error;
1105 netdev->mtu = ifr.ifr_mtu;
1106 netdev->cache_valid |= VALID_MTU;
1109 ovs_mutex_unlock(&netdev->mutex);
1113 /* Returns the ifindex of 'netdev', if successful, as a positive number.
1114 * On failure, returns a negative errno value. */
1116 netdev_linux_get_ifindex(const struct netdev *netdev_)
1118 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1121 ovs_mutex_lock(&netdev->mutex);
1122 error = get_ifindex(netdev_, &ifindex);
1123 ovs_mutex_unlock(&netdev->mutex);
1125 return error ? -error : ifindex;

/* 'get_carrier' callback: reports miimon state when miimon polling is
 * enabled, otherwise the IFF_RUNNING bit of the cached interface flags. */
1129 netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
1131 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1133 ovs_mutex_lock(&netdev->mutex);
1134 if (netdev->miimon_interval > 0) {
1135 *carrier = netdev->miimon;
1137 *carrier = (netdev->ifi_flags & IFF_RUNNING) != 0;
1139 ovs_mutex_unlock(&netdev->mutex);

/* Returns how many times 'netdev_''s carrier has toggled (counted in
 * netdev_linux_changed()). */
1144 static long long int
1145 netdev_linux_get_carrier_resets(const struct netdev *netdev_)
1147 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1148 long long int carrier_resets;
1150 ovs_mutex_lock(&netdev->mutex);
1151 carrier_resets = netdev->carrier_resets;
1152 ovs_mutex_unlock(&netdev->mutex);
1154 return carrier_resets;
/* Issues MII ioctl 'cmd' ("cmd_name" for logging) on device 'name', copying
 * 'data' in and out through ifr_data. */
1158 netdev_linux_do_miimon(const char *name, int cmd, const char *cmd_name,
1159 struct mii_ioctl_data *data)
1164 memset(&ifr, 0, sizeof ifr);
1165 memcpy(&ifr.ifr_data, data, sizeof *data);
1166 error = af_inet_ifreq_ioctl(name, &ifr, cmd, cmd_name);
1167 memcpy(data, &ifr.ifr_data, sizeof *data);

/* Queries link state of 'name' into '*miimon'.  Tries MII registers first
 * (SIOCGMIIPHY to find the PHY, then SIOCGMIIREG to read BMSR); if MII is
 * unavailable, falls back to ETHTOOL_GLINK. */
1173 netdev_linux_get_miimon(const char *name, bool *miimon)
1175 struct mii_ioctl_data data;
1180 memset(&data, 0, sizeof data);
1181 error = netdev_linux_do_miimon(name, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
1183 /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
1184 data.reg_num = MII_BMSR;
1185 error = netdev_linux_do_miimon(name, SIOCGMIIREG, "SIOCGMIIREG",
1189 *miimon = !!(data.val_out & BMSR_LSTATUS);
1191 VLOG_WARN_RL(&rl, "%s: failed to query MII", name);
1194 struct ethtool_cmd ecmd;
1196 VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
1199 COVERAGE_INC(netdev_get_ethtool);
1200 memset(&ecmd, 0, sizeof ecmd);
1201 error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
/* ETHTOOL_GLINK replies with a struct ethtool_value overlaid on ecmd. */
1204 struct ethtool_value eval;
1206 memcpy(&eval, &ecmd, sizeof eval);
1207 *miimon = !!eval.data;
1209 VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
1217 netdev_linux_set_miimon_interval(struct netdev *netdev_,
1218 long long int interval)
1220 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1222 ovs_mutex_lock(&netdev->mutex);
1223 interval = interval > 0 ? MAX(interval, 100) : 0;
1224 if (netdev->miimon_interval != interval) {
1225 netdev->miimon_interval = interval;
1226 timer_set_expired(&netdev->miimon_timer);
1228 ovs_mutex_unlock(&netdev->mutex);
/* Periodic miimon driver: polls every netdev whose miimon timer expired and
 * records carrier changes; the _wait variant registers poll-loop wakeups.
 * NOTE(review): extract is lossy -- some original lines are missing. */
1234 netdev_linux_miimon_run(void)
1236 struct shash device_shash;
1237 struct shash_node *node;
1239 shash_init(&device_shash);
1240 netdev_get_devices(&netdev_linux_class, &device_shash);
1241 SHASH_FOR_EACH (node, &device_shash) {
1242 struct netdev *netdev = node->data;
1243 struct netdev_linux *dev = netdev_linux_cast(netdev);
1246 ovs_mutex_lock(&dev->mutex);
1247 if (dev->miimon_interval > 0 && timer_expired(&dev->miimon_timer)) {
1248 netdev_linux_get_miimon(dev->up.name, &miimon);
1249 if (miimon != dev->miimon) {
/* Link state changed: record it and bump the device's change sequence. */
1250 dev->miimon = miimon;
1251 netdev_linux_changed(dev, dev->ifi_flags, 0);
1254 timer_set_duration(&dev->miimon_timer, dev->miimon_interval);
1256 ovs_mutex_unlock(&dev->mutex);
/* netdev_get_devices() took a reference on each device; release it. */
1257 netdev_close(netdev);
1260 shash_destroy(&device_shash);
/* Arranges for the poll loop to wake up when any miimon timer expires. */
1264 netdev_linux_miimon_wait(void)
1266 struct shash device_shash;
1267 struct shash_node *node;
1269 shash_init(&device_shash);
1270 netdev_get_devices(&netdev_linux_class, &device_shash);
1271 SHASH_FOR_EACH (node, &device_shash) {
1272 struct netdev *netdev = node->data;
1273 struct netdev_linux *dev = netdev_linux_cast(netdev);
1275 ovs_mutex_lock(&dev->mutex);
1276 if (dev->miimon_interval > 0) {
1277 timer_wait(&dev->miimon_timer);
1279 ovs_mutex_unlock(&dev->mutex);
1280 netdev_close(netdev);
1282 shash_destroy(&device_shash);
/* Stats-source probing.
 * NOTE(review): extract is lossy -- some original lines are missing. */
1285 /* Check whether we can we use RTM_GETLINK to get network device statistics.
1286 * In pre-2.6.19 kernels, this was only available if wireless extensions were
1289 check_for_working_netlink_stats(void)
1291 /* Decide on the netdev_get_stats() implementation to use. Netlink is
1292 * preferable, so if that works, we'll use it. */
/* Probe using the loopback device, which should always exist. */
1293 int ifindex = do_get_ifindex("lo");
1295 VLOG_WARN("failed to get ifindex for lo, "
1296 "obtaining netdev stats from proc");
1299 struct netdev_stats stats;
1300 int error = get_stats_via_netlink(ifindex, &stats);
1302 VLOG_DBG("obtaining netdev stats via rtnetlink");
1305 VLOG_INFO("RTM_GETLINK failed (%s), obtaining netdev stats "
1306 "via proc (you are probably running a pre-2.6.19 "
1307 "kernel)", ovs_strerror(error));
/* Exchanges the values of '*a' and '*b'.  (Body not visible in this extract.) */
1314 swap_uint64(uint64_t *a, uint64_t *b)
1321 /* Copies 'src' into 'dst', performing format conversion in the process.
1323 * 'src' is allowed to be misaligned. */
1325 netdev_stats_from_ovs_vport_stats(struct netdev_stats *dst,
1326 const struct ovs_vport_stats *src)
/* get_unaligned_u64() is used because 'src' may not be 8-byte aligned
 * (it comes out of a netlink attribute). */
1328 dst->rx_packets = get_unaligned_u64(&src->rx_packets);
1329 dst->tx_packets = get_unaligned_u64(&src->tx_packets);
1330 dst->rx_bytes = get_unaligned_u64(&src->rx_bytes);
1331 dst->tx_bytes = get_unaligned_u64(&src->tx_bytes);
1332 dst->rx_errors = get_unaligned_u64(&src->rx_errors);
1333 dst->tx_errors = get_unaligned_u64(&src->tx_errors);
1334 dst->rx_dropped = get_unaligned_u64(&src->rx_dropped);
1335 dst->tx_dropped = get_unaligned_u64(&src->tx_dropped);
/* ovs_vport_stats carries no detailed error breakdown, so zero the rest. */
1337 dst->collisions = 0;
1338 dst->rx_length_errors = 0;
1339 dst->rx_over_errors = 0;
1340 dst->rx_crc_errors = 0;
1341 dst->rx_frame_errors = 0;
1342 dst->rx_fifo_errors = 0;
1343 dst->rx_missed_errors = 0;
1344 dst->tx_aborted_errors = 0;
1345 dst->tx_carrier_errors = 0;
1346 dst->tx_fifo_errors = 0;
1347 dst->tx_heartbeat_errors = 0;
1348 dst->tx_window_errors = 0;
/* Vport-layer and kernel-interface statistics retrieval.
 * NOTE(review): extract is lossy -- some original lines are missing. */
/* Fetches vport stats for 'netdev' from the datapath into '*stats'. */
1352 get_stats_via_vport__(const struct netdev *netdev, struct netdev_stats *stats)
1354 struct dpif_linux_vport reply;
1358 error = dpif_linux_vport_get(netdev_get_name(netdev), &reply, &buf);
1361 } else if (!reply.stats) {
1366 netdev_stats_from_ovs_vport_stats(stats, reply.stats);
/* Wrapper that caches the vport-stats error status in the netdev
 * (VALID_VPORT_STAT_ERROR) so repeated failures are not re-queried. */
1374 get_stats_via_vport(const struct netdev *netdev_,
1375 struct netdev_stats *stats)
1377 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1379 if (!netdev->vport_stats_error ||
1380 !(netdev->cache_valid & VALID_VPORT_STAT_ERROR)) {
1383 error = get_stats_via_vport__(netdev_, stats);
/* ENOENT just means the device is not attached to the datapath. */
1384 if (error && error != ENOENT) {
1385 VLOG_WARN_RL(&rl, "%s: obtaining netdev stats via vport failed "
1387 netdev_get_name(netdev_), ovs_strerror(error));
1389 netdev->vport_stats_error = error;
1390 netdev->cache_valid |= VALID_VPORT_STAT_ERROR;
/* Gets stats from the kernel, preferring rtnetlink over /proc; the choice
 * is probed once process-wide via ovsthread_once. */
1395 netdev_linux_sys_get_stats(const struct netdev *netdev_,
1396 struct netdev_stats *stats)
1398 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
1399 static int use_netlink_stats;
1402 if (ovsthread_once_start(&once)) {
1403 use_netlink_stats = check_for_working_netlink_stats();
1404 ovsthread_once_done(&once);
1407 if (use_netlink_stats) {
1410 error = get_ifindex(netdev_, &ifindex);
1412 error = get_stats_via_netlink(ifindex, stats);
1415 error = get_stats_via_proc(netdev_get_name(netdev_), stats);
1419 VLOG_WARN_RL(&rl, "%s: linux-sys get stats failed %d",
1420 netdev_get_name(netdev_), error);
1426 /* Retrieves current device stats for 'netdev-linux'. */
/* Combines vport-layer stats (preferred) with kernel ioctl/netlink stats.
 * When vport stats are available, kernel error counters are merged in on top.
 * NOTE(review): extract is lossy -- some original lines are missing. */
1428 netdev_linux_get_stats(const struct netdev *netdev_,
1429 struct netdev_stats *stats)
1431 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1432 struct netdev_stats dev_stats;
1435 ovs_mutex_lock(&netdev->mutex);
1436 get_stats_via_vport(netdev_, stats);
1437 error = netdev_linux_sys_get_stats(netdev_, &dev_stats);
1439 if (!netdev->vport_stats_error) {
1442 } else if (netdev->vport_stats_error) {
1443 /* stats not available from OVS then use ioctl stats. */
/* Merge the kernel's error/drop counters into the vport byte/packet
 * counts already in '*stats'. */
1446 stats->rx_errors += dev_stats.rx_errors;
1447 stats->tx_errors += dev_stats.tx_errors;
1448 stats->rx_dropped += dev_stats.rx_dropped;
1449 stats->tx_dropped += dev_stats.tx_dropped;
1450 stats->multicast += dev_stats.multicast;
1451 stats->collisions += dev_stats.collisions;
1452 stats->rx_length_errors += dev_stats.rx_length_errors;
1453 stats->rx_over_errors += dev_stats.rx_over_errors;
1454 stats->rx_crc_errors += dev_stats.rx_crc_errors;
1455 stats->rx_frame_errors += dev_stats.rx_frame_errors;
1456 stats->rx_fifo_errors += dev_stats.rx_fifo_errors;
1457 stats->rx_missed_errors += dev_stats.rx_missed_errors;
1458 stats->tx_aborted_errors += dev_stats.tx_aborted_errors;
1459 stats->tx_carrier_errors += dev_stats.tx_carrier_errors;
1460 stats->tx_fifo_errors += dev_stats.tx_fifo_errors;
1461 stats->tx_heartbeat_errors += dev_stats.tx_heartbeat_errors;
1462 stats->tx_window_errors += dev_stats.tx_window_errors;
1464 ovs_mutex_unlock(&netdev->mutex);
1469 /* Retrieves current device stats for 'netdev-tap' netdev or
1470 * netdev-internal. */
/* NOTE(review): extract is lossy -- some original lines are missing. */
1472 netdev_tap_get_stats(const struct netdev *netdev_, struct netdev_stats *stats)
1474 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1475 struct netdev_stats dev_stats;
1478 ovs_mutex_lock(&netdev->mutex);
1479 get_stats_via_vport(netdev_, stats);
1480 error = netdev_linux_sys_get_stats(netdev_, &dev_stats);
1482 if (!netdev->vport_stats_error) {
1485 } else if (netdev->vport_stats_error) {
1486 /* Transmit and receive stats will appear to be swapped relative to the
1487 * other ports since we are the one sending the data, not a remote
1488 * computer. For consistency, we swap them back here. This does not
1489 * apply if we are getting stats from the vport layer because it always
1490 * tracks stats from the perspective of the switch. */
1493 swap_uint64(&stats->rx_packets, &stats->tx_packets);
1494 swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
1495 swap_uint64(&stats->rx_errors, &stats->tx_errors);
1496 swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
/* Detailed error counters are meaningless for a tap device; clear them. */
1497 stats->rx_length_errors = 0;
1498 stats->rx_over_errors = 0;
1499 stats->rx_crc_errors = 0;
1500 stats->rx_frame_errors = 0;
1501 stats->rx_fifo_errors = 0;
1502 stats->rx_missed_errors = 0;
1503 stats->tx_aborted_errors = 0;
1504 stats->tx_carrier_errors = 0;
1505 stats->tx_fifo_errors = 0;
1506 stats->tx_heartbeat_errors = 0;
1507 stats->tx_window_errors = 0;
/* Merge kernel counters, swapping rx/tx for the same reason as above. */
1509 stats->rx_dropped += dev_stats.tx_dropped;
1510 stats->tx_dropped += dev_stats.rx_dropped;
1512 stats->rx_errors += dev_stats.tx_errors;
1513 stats->tx_errors += dev_stats.rx_errors;
1515 stats->multicast += dev_stats.multicast;
1516 stats->collisions += dev_stats.collisions;
1518 ovs_mutex_unlock(&netdev->mutex);
/* Internal-device stats: read from the vport layer only; write by pushing
 * counters down via OVS_VPORT_CMD_SET.
 * NOTE(review): extract is lossy -- some original lines are missing. */
1524 netdev_internal_get_stats(const struct netdev *netdev_,
1525 struct netdev_stats *stats)
1527 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1530 ovs_mutex_lock(&netdev->mutex);
1531 get_stats_via_vport(netdev_, stats);
1532 error = netdev->vport_stats_error;
1533 ovs_mutex_unlock(&netdev->mutex);
/* Pushes 'stats' down to the datapath for an internal device. */
1539 netdev_internal_set_stats(struct netdev *netdev,
1540 const struct netdev_stats *stats)
1542 struct ovs_vport_stats vport_stats;
1543 struct dpif_linux_vport vport;
1546 vport_stats.rx_packets = stats->rx_packets;
1547 vport_stats.tx_packets = stats->tx_packets;
1548 vport_stats.rx_bytes = stats->rx_bytes;
1549 vport_stats.tx_bytes = stats->tx_bytes;
1550 vport_stats.rx_errors = stats->rx_errors;
1551 vport_stats.tx_errors = stats->tx_errors;
1552 vport_stats.rx_dropped = stats->rx_dropped;
1553 vport_stats.tx_dropped = stats->tx_dropped;
1555 dpif_linux_vport_init(&vport);
1556 vport.cmd = OVS_VPORT_CMD_SET;
1557 vport.name = netdev_get_name(netdev);
1558 vport.stats = &vport_stats;
1560 err = dpif_linux_vport_transact(&vport, NULL, NULL);
1562 /* If the vport layer doesn't know about the device, that doesn't mean it
1563 * doesn't exist (after all were able to open it when netdev_open() was
1564 * called), it just means that it isn't attached and we'll be getting
1565 * stats a different way. */
1566 if (err == ENODEV) {
/* Queries the device's link features via ETHTOOL_GSET and caches the result
 * in netdev->supported / advertised / current (VALID_FEATURES).
 * NOTE(review): extract is lossy -- some original lines (e.g. the 'speed'
 * computation around line 1671) are missing between the numbered lines. */
1574 netdev_linux_read_features(struct netdev_linux *netdev)
1576 struct ethtool_cmd ecmd;
/* Short-circuit if features were already read for this device. */
1580 if (netdev->cache_valid & VALID_FEATURES) {
1584 COVERAGE_INC(netdev_get_ethtool);
1585 memset(&ecmd, 0, sizeof ecmd);
1586 error = netdev_linux_do_ethtool(netdev->up.name, &ecmd,
1587 ETHTOOL_GSET, "ETHTOOL_GSET");
1592 /* Supported features. */
/* Translate each SUPPORTED_* ethtool bit to the matching NETDEV_F_* bit. */
1593 netdev->supported = 0;
1594 if (ecmd.supported & SUPPORTED_10baseT_Half) {
1595 netdev->supported |= NETDEV_F_10MB_HD;
1597 if (ecmd.supported & SUPPORTED_10baseT_Full) {
1598 netdev->supported |= NETDEV_F_10MB_FD;
1600 if (ecmd.supported & SUPPORTED_100baseT_Half) {
1601 netdev->supported |= NETDEV_F_100MB_HD;
1603 if (ecmd.supported & SUPPORTED_100baseT_Full) {
1604 netdev->supported |= NETDEV_F_100MB_FD;
1606 if (ecmd.supported & SUPPORTED_1000baseT_Half) {
1607 netdev->supported |= NETDEV_F_1GB_HD;
1609 if (ecmd.supported & SUPPORTED_1000baseT_Full) {
1610 netdev->supported |= NETDEV_F_1GB_FD;
1612 if (ecmd.supported & SUPPORTED_10000baseT_Full) {
1613 netdev->supported |= NETDEV_F_10GB_FD;
1615 if (ecmd.supported & SUPPORTED_TP) {
1616 netdev->supported |= NETDEV_F_COPPER;
1618 if (ecmd.supported & SUPPORTED_FIBRE) {
1619 netdev->supported |= NETDEV_F_FIBER;
1621 if (ecmd.supported & SUPPORTED_Autoneg) {
1622 netdev->supported |= NETDEV_F_AUTONEG;
1624 if (ecmd.supported & SUPPORTED_Pause) {
1625 netdev->supported |= NETDEV_F_PAUSE;
1627 if (ecmd.supported & SUPPORTED_Asym_Pause) {
1628 netdev->supported |= NETDEV_F_PAUSE_ASYM;
1631 /* Advertised features. */
/* Same mapping for the ADVERTISED_* bits. */
1632 netdev->advertised = 0;
1633 if (ecmd.advertising & ADVERTISED_10baseT_Half) {
1634 netdev->advertised |= NETDEV_F_10MB_HD;
1636 if (ecmd.advertising & ADVERTISED_10baseT_Full) {
1637 netdev->advertised |= NETDEV_F_10MB_FD;
1639 if (ecmd.advertising & ADVERTISED_100baseT_Half) {
1640 netdev->advertised |= NETDEV_F_100MB_HD;
1642 if (ecmd.advertising & ADVERTISED_100baseT_Full) {
1643 netdev->advertised |= NETDEV_F_100MB_FD;
1645 if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
1646 netdev->advertised |= NETDEV_F_1GB_HD;
1648 if (ecmd.advertising & ADVERTISED_1000baseT_Full) {
1649 netdev->advertised |= NETDEV_F_1GB_FD;
1651 if (ecmd.advertising & ADVERTISED_10000baseT_Full) {
1652 netdev->advertised |= NETDEV_F_10GB_FD;
1654 if (ecmd.advertising & ADVERTISED_TP) {
1655 netdev->advertised |= NETDEV_F_COPPER;
1657 if (ecmd.advertising & ADVERTISED_FIBRE) {
1658 netdev->advertised |= NETDEV_F_FIBER;
1660 if (ecmd.advertising & ADVERTISED_Autoneg) {
1661 netdev->advertised |= NETDEV_F_AUTONEG;
1663 if (ecmd.advertising & ADVERTISED_Pause) {
1664 netdev->advertised |= NETDEV_F_PAUSE;
1666 if (ecmd.advertising & ADVERTISED_Asym_Pause) {
1667 netdev->advertised |= NETDEV_F_PAUSE_ASYM;
1670 /* Current settings. */
/* 'speed' is presumably derived from ecmd just above -- the line that
 * computes it is not visible in this extract; TODO confirm against the
 * original file (ethtool_cmd_speed()). */
1672 if (speed == SPEED_10) {
1673 netdev->current = ecmd.duplex ? NETDEV_F_10MB_FD : NETDEV_F_10MB_HD;
1674 } else if (speed == SPEED_100) {
1675 netdev->current = ecmd.duplex ? NETDEV_F_100MB_FD : NETDEV_F_100MB_HD;
1676 } else if (speed == SPEED_1000) {
1677 netdev->current = ecmd.duplex ? NETDEV_F_1GB_FD : NETDEV_F_1GB_HD;
1678 } else if (speed == SPEED_10000) {
1679 netdev->current = NETDEV_F_10GB_FD;
/* 40G/100G/1T speeds are matched by raw Mbps value (no SPEED_* macro
 * available when this was written). */
1680 } else if (speed == 40000) {
1681 netdev->current = NETDEV_F_40GB_FD;
1682 } else if (speed == 100000) {
1683 netdev->current = NETDEV_F_100GB_FD;
1684 } else if (speed == 1000000) {
1685 netdev->current = NETDEV_F_1TB_FD;
1687 netdev->current = 0;
1690 if (ecmd.port == PORT_TP) {
1691 netdev->current |= NETDEV_F_COPPER;
1692 } else if (ecmd.port == PORT_FIBRE) {
1693 netdev->current |= NETDEV_F_FIBER;
1697 netdev->current |= NETDEV_F_AUTONEG;
/* Cache the result (including any error) so later calls are cheap. */
1701 netdev->cache_valid |= VALID_FEATURES;
1702 netdev->get_features_error = error;
1705 /* Stores the features supported by 'netdev' into of '*current', '*advertised',
1706 * '*supported', and '*peer'. Each value is a bitmap of NETDEV_* bits.
1707 * Returns 0 if successful, otherwise a positive errno value. */
1709 netdev_linux_get_features(const struct netdev *netdev_,
1710 enum netdev_features *current,
1711 enum netdev_features *advertised,
1712 enum netdev_features *supported,
1713 enum netdev_features *peer)
1715 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1718 ovs_mutex_lock(&netdev->mutex);
/* Populates (or reuses) the cached feature bitmaps. */
1719 netdev_linux_read_features(netdev);
1720 if (!netdev->get_features_error) {
1721 *current = netdev->current;
1722 *advertised = netdev->advertised;
1723 *supported = netdev->supported;
/* Peer features are not obtainable from ethtool. */
1724 *peer = 0; /* XXX */
1726 error = netdev->get_features_error;
1727 ovs_mutex_unlock(&netdev->mutex);
1732 /* Set the features advertised by 'netdev' to 'advertise'. */
/* Reads the current ethtool settings (ETHTOOL_GSET), rewrites only the
 * advertising mask from the NETDEV_F_* bits, then writes back (ETHTOOL_SSET).
 * NOTE(review): extract is lossy -- some original lines are missing. */
1734 netdev_linux_set_advertisements(struct netdev *netdev_,
1735 enum netdev_features advertise)
1737 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1738 struct ethtool_cmd ecmd;
1741 ovs_mutex_lock(&netdev->mutex);
1743 COVERAGE_INC(netdev_get_ethtool);
1744 memset(&ecmd, 0, sizeof ecmd);
1745 error = netdev_linux_do_ethtool(netdev_get_name(netdev_), &ecmd,
1746 ETHTOOL_GSET, "ETHTOOL_GSET");
1751 ecmd.advertising = 0;
1752 if (advertise & NETDEV_F_10MB_HD) {
1753 ecmd.advertising |= ADVERTISED_10baseT_Half;
1755 if (advertise & NETDEV_F_10MB_FD) {
1756 ecmd.advertising |= ADVERTISED_10baseT_Full;
1758 if (advertise & NETDEV_F_100MB_HD) {
1759 ecmd.advertising |= ADVERTISED_100baseT_Half;
1761 if (advertise & NETDEV_F_100MB_FD) {
1762 ecmd.advertising |= ADVERTISED_100baseT_Full;
1764 if (advertise & NETDEV_F_1GB_HD) {
1765 ecmd.advertising |= ADVERTISED_1000baseT_Half;
1767 if (advertise & NETDEV_F_1GB_FD) {
1768 ecmd.advertising |= ADVERTISED_1000baseT_Full;
1770 if (advertise & NETDEV_F_10GB_FD) {
1771 ecmd.advertising |= ADVERTISED_10000baseT_Full;
1773 if (advertise & NETDEV_F_COPPER) {
1774 ecmd.advertising |= ADVERTISED_TP;
1776 if (advertise & NETDEV_F_FIBER) {
1777 ecmd.advertising |= ADVERTISED_FIBRE;
1779 if (advertise & NETDEV_F_AUTONEG) {
1780 ecmd.advertising |= ADVERTISED_Autoneg;
1782 if (advertise & NETDEV_F_PAUSE) {
1783 ecmd.advertising |= ADVERTISED_Pause;
1785 if (advertise & NETDEV_F_PAUSE_ASYM) {
1786 ecmd.advertising |= ADVERTISED_Asym_Pause;
1788 COVERAGE_INC(netdev_set_ethtool);
1789 error = netdev_linux_do_ethtool(netdev_get_name(netdev_), &ecmd,
1790 ETHTOOL_SSET, "ETHTOOL_SSET");
1793 ovs_mutex_unlock(&netdev->mutex);
1797 /* Attempts to set input rate limiting (policing) policy. Returns 0 if
1798 * successful, otherwise a positive errno value. */
/* Implemented with tc: the existing ingress qdisc is removed, then (if a rate
 * is requested) a new ingress qdisc and policer are installed.
 * NOTE(review): extract is lossy -- some original lines are missing. */
1800 netdev_linux_set_policing(struct netdev *netdev_,
1801 uint32_t kbits_rate, uint32_t kbits_burst)
1803 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1804 const char *netdev_name = netdev_get_name(netdev_);
1807 kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
1808 : !kbits_burst ? 1000 /* Default to 1000 kbits if 0. */
1809 : kbits_burst); /* Stick with user-specified value. */
1811 ovs_mutex_lock(&netdev->mutex);
1812 if (netdev->cache_valid & VALID_POLICING) {
1813 error = netdev->netdev_policing_error;
1814 if (error || (netdev->kbits_rate == kbits_rate &&
1815 netdev->kbits_burst == kbits_burst)) {
1816 /* Assume that settings haven't changed since we last set them. */
1819 netdev->cache_valid &= ~VALID_POLICING;
1822 COVERAGE_INC(netdev_set_policing);
1823 /* Remove any existing ingress qdisc. */
1824 error = tc_add_del_ingress_qdisc(netdev_, false);
1826 VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
1827 netdev_name, ovs_strerror(error));
1832 error = tc_add_del_ingress_qdisc(netdev_, true);
1834 VLOG_WARN_RL(&rl, "%s: adding policing qdisc failed: %s",
1835 netdev_name, ovs_strerror(error));
1839 error = tc_add_policer(netdev_, kbits_rate, kbits_burst);
1841 VLOG_WARN_RL(&rl, "%s: adding policing action failed: %s",
1842 netdev_name, ovs_strerror(error));
1847 netdev->kbits_rate = kbits_rate;
1848 netdev->kbits_burst = kbits_burst;
/* Cache success or ENODEV so repeated calls with unchanged settings and
 * vanished devices do not re-run tc. */
1851 if (!error || error == ENODEV) {
1852 netdev->netdev_policing_error = error;
1853 netdev->cache_valid |= VALID_POLICING;
1855 ovs_mutex_unlock(&netdev->mutex);
/* QoS/tc registry helpers: enumerate installable qdisc types, look them up by
 * OVS or Linux name, and find a queue by id in a netdev's queue hash.
 * NOTE(review): extract is lossy -- some original lines are missing. */
1860 netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
/* Adds the OVS name of every installable tc implementation to 'types'. */
1863 const struct tc_ops *const *opsp;
1865 for (opsp = tcs; *opsp != NULL; opsp++) {
1866 const struct tc_ops *ops = *opsp;
1867 if (ops->tc_install && ops->ovs_name[0] != '\0') {
1868 sset_add(types, ops->ovs_name);
/* Returns the tc_ops with OVS name 'name', or (presumably) NULL if none --
 * the return statements are not visible in this extract. */
1874 static const struct tc_ops *
1875 tc_lookup_ovs_name(const char *name)
1877 const struct tc_ops *const *opsp;
1879 for (opsp = tcs; *opsp != NULL; opsp++) {
1880 const struct tc_ops *ops = *opsp;
1881 if (!strcmp(name, ops->ovs_name)) {
/* Same lookup keyed by the Linux qdisc name instead. */
1888 static const struct tc_ops *
1889 tc_lookup_linux_name(const char *name)
1891 const struct tc_ops *const *opsp;
1893 for (opsp = tcs; *opsp != NULL; opsp++) {
1894 const struct tc_ops *ops = *opsp;
1895 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
/* Finds the queue with 'queue_id' in 'netdev_''s queue hash ('hash' is the
 * precomputed bucket hash). */
1902 static struct tc_queue *
1903 tc_find_queue__(const struct netdev *netdev_, unsigned int queue_id,
1906 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1907 struct tc_queue *queue;
1909 HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev->tc->queues) {
1910 if (queue->queue_id == queue_id) {
/* Convenience wrapper that hashes 'queue_id' itself. */
1917 static struct tc_queue *
1918 tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
1920 return tc_find_queue__(netdev, queue_id, hash_int(queue_id, 0));
/* Reports the capabilities (queue count) of QoS 'type'. */
1924 netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
1926 struct netdev_qos_capabilities *caps)
1928 const struct tc_ops *ops = tc_lookup_ovs_name(type);
1932 caps->n_queues = ops->n_queues;
/* Queries and configures the qdisc attached to 'netdev_'.
 * NOTE(review): extract is lossy -- some original lines are missing. */
/* Reports the current QoS type and its configuration details. */
1937 netdev_linux_get_qos(const struct netdev *netdev_,
1938 const char **typep, struct smap *details)
1940 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1943 ovs_mutex_lock(&netdev->mutex);
1944 error = tc_query_qdisc(netdev_);
1946 *typep = netdev->tc->ops->ovs_name;
1947 error = (netdev->tc->ops->qdisc_get
1948 ? netdev->tc->ops->qdisc_get(netdev_, details)
1951 ovs_mutex_unlock(&netdev->mutex);
/* Switches 'netdev_' to QoS 'type' configured from 'details'.  If the type is
 * unchanged the existing qdisc is reconfigured in place; otherwise the old
 * qdisc is deleted and a new one installed. */
1957 netdev_linux_set_qos(struct netdev *netdev_,
1958 const char *type, const struct smap *details)
1960 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1961 const struct tc_ops *new_ops;
1964 new_ops = tc_lookup_ovs_name(type);
1965 if (!new_ops || !new_ops->tc_install) {
1969 ovs_mutex_lock(&netdev->mutex);
1970 error = tc_query_qdisc(netdev_);
1975 if (new_ops == netdev->tc->ops) {
1976 error = new_ops->qdisc_set ? new_ops->qdisc_set(netdev_, details) : 0;
1978 /* Delete existing qdisc. */
1979 error = tc_del_qdisc(netdev_);
1983 ovs_assert(netdev->tc == NULL);
1985 /* Install new qdisc. */
1986 error = new_ops->tc_install(netdev_, details);
1987 ovs_assert((error == 0) == (netdev->tc != NULL));
1991 ovs_mutex_unlock(&netdev->mutex);
/* Per-queue operations, all delegating to the active tc implementation's
 * class_* callbacks after refreshing the qdisc via tc_query_qdisc().
 * NOTE(review): extract is lossy -- some original lines are missing. */
/* Retrieves configuration details of queue 'queue_id' into 'details'. */
1996 netdev_linux_get_queue(const struct netdev *netdev_,
1997 unsigned int queue_id, struct smap *details)
1999 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2002 ovs_mutex_lock(&netdev->mutex);
2003 error = tc_query_qdisc(netdev_);
2005 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2007 ? netdev->tc->ops->class_get(netdev_, queue, details)
2010 ovs_mutex_unlock(&netdev->mutex);
/* Creates or modifies queue 'queue_id' from 'details'; rejects ids beyond the
 * implementation's n_queues. */
2016 netdev_linux_set_queue(struct netdev *netdev_,
2017 unsigned int queue_id, const struct smap *details)
2019 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2022 ovs_mutex_lock(&netdev->mutex);
2023 error = tc_query_qdisc(netdev_);
2025 error = (queue_id < netdev->tc->ops->n_queues
2026 && netdev->tc->ops->class_set
2027 ? netdev->tc->ops->class_set(netdev_, queue_id, details)
2030 ovs_mutex_unlock(&netdev->mutex);
/* Deletes queue 'queue_id' if the tc implementation supports deletion. */
2036 netdev_linux_delete_queue(struct netdev *netdev_, unsigned int queue_id)
2038 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2041 ovs_mutex_lock(&netdev->mutex);
2042 error = tc_query_qdisc(netdev_);
2044 if (netdev->tc->ops->class_delete) {
2045 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2047 ? netdev->tc->ops->class_delete(netdev_, queue)
2053 ovs_mutex_unlock(&netdev->mutex);
/* Retrieves statistics for queue 'queue_id' into '*stats'. */
2059 netdev_linux_get_queue_stats(const struct netdev *netdev_,
2060 unsigned int queue_id,
2061 struct netdev_queue_stats *stats)
2063 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2066 ovs_mutex_lock(&netdev->mutex);
2067 error = tc_query_qdisc(netdev_);
2069 if (netdev->tc->ops->class_get_stats) {
2070 const struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2072 stats->created = queue->created;
2073 error = netdev->tc->ops->class_get_stats(netdev_, queue,
2082 ovs_mutex_unlock(&netdev->mutex);
/* Queue enumeration: dump configured queues from the in-memory hash, and
 * dump per-queue statistics via an RTM_GETTCLASS netlink dump.
 * NOTE(review): extract is lossy -- some original lines are missing. */
/* Starts an rtnetlink dump of 'netdev''s traffic classes into 'dump'. */
2088 start_queue_dump(const struct netdev *netdev, struct nl_dump *dump)
2090 struct ofpbuf request;
2091 struct tcmsg *tcmsg;
2093 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
2097 tcmsg->tcm_parent = 0;
2098 nl_dump_start(dump, NETLINK_ROUTE, &request);
2099 ofpbuf_uninit(&request);
/* Invokes 'cb' once per queue with each queue's id and details. */
2104 netdev_linux_dump_queues(const struct netdev *netdev_,
2105 netdev_dump_queues_cb *cb, void *aux)
2107 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2110 ovs_mutex_lock(&netdev->mutex);
2111 error = tc_query_qdisc(netdev_);
2113 if (netdev->tc->ops->class_get) {
2114 struct tc_queue *queue, *next_queue;
2115 struct smap details;
2117 smap_init(&details);
/* _SAFE iteration: the callback may delete the current queue. */
2118 HMAP_FOR_EACH_SAFE (queue, next_queue, hmap_node,
2119 &netdev->tc->queues) {
2122 smap_clear(&details);
2124 retval = netdev->tc->ops->class_get(netdev_, queue, &details);
2126 (*cb)(queue->queue_id, &details, aux);
2131 smap_destroy(&details);
2136 ovs_mutex_unlock(&netdev->mutex);
/* Invokes 'cb' once per queue with each queue's statistics, parsed from a
 * netlink class dump. */
2142 netdev_linux_dump_queue_stats(const struct netdev *netdev_,
2143 netdev_dump_queue_stats_cb *cb, void *aux)
2145 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2148 ovs_mutex_lock(&netdev->mutex);
2149 error = tc_query_qdisc(netdev_);
2151 struct nl_dump dump;
2153 if (!netdev->tc->ops->class_dump_stats) {
2155 } else if (!start_queue_dump(netdev_, &dump)) {
2161 while (nl_dump_next(&dump, &msg)) {
2162 retval = netdev->tc->ops->class_dump_stats(netdev_, &msg,
2169 retval = nl_dump_done(&dump);
2175 ovs_mutex_unlock(&netdev->mutex);
/* IPv4/IPv6 address handling.
 * NOTE(review): extract is lossy -- some original lines are missing. */
/* Returns the device's IPv4 address and netmask, caching them under
 * VALID_IN4.  EADDRNOTAVAIL when no address is assigned. */
2181 netdev_linux_get_in4(const struct netdev *netdev_,
2182 struct in_addr *address, struct in_addr *netmask)
2184 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2187 ovs_mutex_lock(&netdev->mutex);
2188 if (!(netdev->cache_valid & VALID_IN4)) {
2189 error = netdev_linux_get_ipv4(netdev_, &netdev->address,
2190 SIOCGIFADDR, "SIOCGIFADDR");
2192 error = netdev_linux_get_ipv4(netdev_, &netdev->netmask,
2193 SIOCGIFNETMASK, "SIOCGIFNETMASK");
2195 netdev->cache_valid |= VALID_IN4;
2203 if (netdev->address.s_addr != INADDR_ANY) {
2204 *address = netdev->address;
2205 *netmask = netdev->netmask;
2207 error = EADDRNOTAVAIL;
2210 ovs_mutex_unlock(&netdev->mutex);
/* Assigns 'address'/'netmask' to the device via SIOCSIFADDR/SIOCSIFNETMASK
 * and updates the cache.  The netmask is only set for a non-ANY address. */
2216 netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
2217 struct in_addr netmask)
2219 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2222 ovs_mutex_lock(&netdev->mutex);
2223 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
2225 netdev->cache_valid |= VALID_IN4;
2226 netdev->address = address;
2227 netdev->netmask = netmask;
2228 if (address.s_addr != INADDR_ANY) {
2229 error = do_set_addr(netdev_, SIOCSIFNETMASK,
2230 "SIOCSIFNETMASK", netmask);
2233 ovs_mutex_unlock(&netdev->mutex);
/* Parses one line of /proc/net/if_inet6 into a 16-byte IPv6 address and the
 * interface name.  (The sscanf call itself is not fully visible here.) */
2239 parse_if_inet6_line(const char *line,
2240 struct in6_addr *in6, char ifname[16 + 1])
2242 uint8_t *s6 = in6->s6_addr;
2243 #define X8 "%2"SCNx8
2245 " "X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8
2246 "%*x %*x %*x %*x %16s\n",
2247 &s6[0], &s6[1], &s6[2], &s6[3],
2248 &s6[4], &s6[5], &s6[6], &s6[7],
2249 &s6[8], &s6[9], &s6[10], &s6[11],
2250 &s6[12], &s6[13], &s6[14], &s6[15],
2254 /* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address (if
2255 * 'in6' is non-null) and returns true. Otherwise, returns false. */
2257 netdev_linux_get_in6(const struct netdev *netdev_, struct in6_addr *in6)
2259 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2261 ovs_mutex_lock(&netdev->mutex);
2262 if (!(netdev->cache_valid & VALID_IN6)) {
/* Scan /proc/net/if_inet6 for a line matching this device's name. */
2266 netdev->in6 = in6addr_any;
2268 file = fopen("/proc/net/if_inet6", "r");
2270 const char *name = netdev_get_name(netdev_);
2271 while (fgets(line, sizeof line, file)) {
2272 struct in6_addr in6_tmp;
2273 char ifname[16 + 1];
2274 if (parse_if_inet6_line(line, &in6_tmp, ifname)
2275 && !strcmp(name, ifname))
2277 netdev->in6 = in6_tmp;
2283 netdev->cache_valid |= VALID_IN6;
2286 ovs_mutex_unlock(&netdev->mutex);
/* Small helpers around AF_INET ioctls.
 * NOTE(review): extract is lossy -- some original lines are missing. */
/* Writes 'addr' into '*sa' as a struct sockaddr_in. */
2292 make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
2294 struct sockaddr_in sin;
2295 memset(&sin, 0, sizeof sin);
2296 sin.sin_family = AF_INET;
2297 sin.sin_addr = addr;
/* Copy through memset+memcpy to avoid aliasing a sockaddr_in onto sa. */
2300 memset(sa, 0, sizeof *sa);
2301 memcpy(sa, &sin, sizeof sin);
/* Runs address-setting ioctl 'ioctl_nr' ('ioctl_name' for logging) with
 * 'addr' as the argument. */
2305 do_set_addr(struct netdev *netdev,
2306 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
2310 make_in4_sockaddr(&ifr.ifr_addr, addr);
2311 return af_inet_ifreq_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
2315 /* Adds 'router' as a default IP gateway. */
2317 netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
2319 struct in_addr any = { INADDR_ANY };
2323 memset(&rt, 0, sizeof rt);
/* Destination 0.0.0.0/0 via 'router' == default route. */
2324 make_in4_sockaddr(&rt.rt_dst, any);
2325 make_in4_sockaddr(&rt.rt_gateway, router);
2326 make_in4_sockaddr(&rt.rt_genmask, any);
2327 rt.rt_flags = RTF_UP | RTF_GATEWAY;
2328 error = af_inet_ioctl(SIOCADDRT, &rt);
2330 VLOG_WARN("ioctl(SIOCADDRT): %s", ovs_strerror(error));
/* Determines the next hop and outgoing device for 'host' by scanning the
 * kernel routing table in /proc/net/route.
 * NOTE(review): extract is lossy -- some original lines are missing. */
2336 netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
2339 static const char fn[] = "/proc/net/route";
2344 *netdev_name = NULL;
2345 stream = fopen(fn, "r");
2346 if (stream == NULL) {
2347 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, ovs_strerror(errno));
2352 while (fgets(line, sizeof line, stream)) {
2355 ovs_be32 dest, gateway, mask;
2356 int refcnt, metric, mtu;
2357 unsigned int flags, use, window, irtt;
/* Each /proc/net/route line: Iface Dest Gateway Flags RefCnt Use
 * Metric Mask MTU Window IRTT (hex for addresses and flags). */
2360 "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
2362 iface, &dest, &gateway, &flags, &refcnt,
2363 &use, &metric, &mask, &mtu, &window, &irtt) != 11) {
2365 VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
2369 if (!(flags & RTF_UP)) {
2370 /* Skip routes that aren't up. */
2374 /* The output of 'dest', 'mask', and 'gateway' were given in
2375 * network byte order, so we don't need need any endian
2376 * conversions here. */
2377 if ((dest & mask) == (host->s_addr & mask)) {
2379 /* The host is directly reachable. */
2380 next_hop->s_addr = 0;
2382 /* To reach the host, we must go through a gateway. */
2383 next_hop->s_addr = gateway;
2385 *netdev_name = xstrdup(iface);
/* Status reporting and ARP lookup.
 * NOTE(review): extract is lossy -- some original lines are missing. */
/* Fills 'smap' with driver name/version/firmware from ETHTOOL_GDRVINFO,
 * caching the result under VALID_DRVINFO. */
2397 netdev_linux_get_status(const struct netdev *netdev_, struct smap *smap)
2399 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2402 ovs_mutex_lock(&netdev->mutex);
2403 if (!(netdev->cache_valid & VALID_DRVINFO)) {
/* drvinfo is passed through the generic ethtool_cmd-taking helper. */
2404 struct ethtool_cmd *cmd = (struct ethtool_cmd *) &netdev->drvinfo;
2406 COVERAGE_INC(netdev_get_ethtool);
2407 memset(&netdev->drvinfo, 0, sizeof netdev->drvinfo);
2408 error = netdev_linux_do_ethtool(netdev->up.name,
2411 "ETHTOOL_GDRVINFO");
2413 netdev->cache_valid |= VALID_DRVINFO;
2418 smap_add(smap, "driver_name", netdev->drvinfo.driver);
2419 smap_add(smap, "driver_version", netdev->drvinfo.version);
2420 smap_add(smap, "firmware_version", netdev->drvinfo.fw_version);
2422 ovs_mutex_unlock(&netdev->mutex);
/* Internal devices have no real driver; report a fixed name. */
2428 netdev_internal_get_status(const struct netdev *netdev OVS_UNUSED,
2431 smap_add(smap, "driver_name", "openvswitch");
2435 /* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
2436 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
2437 * returns 0. Otherwise, it returns a positive errno value; in particular,
2438 * ENXIO indicates that there is not ARP table entry for 'ip' on 'netdev'. */
2440 netdev_linux_arp_lookup(const struct netdev *netdev,
2441 ovs_be32 ip, uint8_t mac[ETH_ADDR_LEN])
2444 struct sockaddr_in sin;
2447 memset(&r, 0, sizeof r);
2448 memset(&sin, 0, sizeof sin);
2449 sin.sin_family = AF_INET;
2450 sin.sin_addr.s_addr = ip;
2452 memcpy(&r.arp_pa, &sin, sizeof sin);
2453 r.arp_ha.sa_family = ARPHRD_ETHER;
2455 ovs_strzcpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
2456 COVERAGE_INC(netdev_arp_lookup);
2457 retval = af_inet_ioctl(SIOCGARP, &r);
2459 memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
2460 } else if (retval != ENXIO) {
2461 VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
2462 netdev_get_name(netdev), IP_ARGS(ip),
2463 ovs_strerror(retval));
/* Conversion between OVS netdev flags (NETDEV_UP, NETDEV_PROMISC) and the
 * kernel's IFF_* interface flags, plus flag update entry points.
 * NOTE(review): extract is lossy -- some original lines are missing. */
2469 nd_to_iff_flags(enum netdev_flags nd)
2472 if (nd & NETDEV_UP) {
2475 if (nd & NETDEV_PROMISC) {
/* Inverse conversion: IFF_* -> NETDEV_*. */
2482 iff_to_nd_flags(int iff)
2484 enum netdev_flags nd = 0;
2488 if (iff & IFF_PROMISC) {
2489 nd |= NETDEV_PROMISC;
/* Clears 'off' flags and sets 'on' flags on 'netdev', returning the previous
 * flags through '*old_flagsp'.  Re-reads the kernel flags after setting. */
2495 update_flags(struct netdev_linux *netdev, enum netdev_flags off,
2496 enum netdev_flags on, enum netdev_flags *old_flagsp)
2497 OVS_REQUIRES(netdev->mutex)
2499 int old_flags, new_flags;
2502 old_flags = netdev->ifi_flags;
2503 *old_flagsp = iff_to_nd_flags(old_flags);
2504 new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
2505 if (new_flags != old_flags) {
2506 error = set_flags(netdev_get_name(&netdev->up), new_flags);
2507 get_flags(&netdev->up, &netdev->ifi_flags);
/* Public entry point: locks the netdev and delegates to update_flags(). */
2514 netdev_linux_update_flags(struct netdev *netdev_, enum netdev_flags off,
2515 enum netdev_flags on, enum netdev_flags *old_flagsp)
2517 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2520 ovs_mutex_lock(&netdev->mutex);
2521 error = update_flags(netdev, off, on, old_flagsp);
2522 ovs_mutex_unlock(&netdev->mutex);
/* Returns the device's change sequence number (bumped on state changes). */
2528 netdev_linux_change_seq(const struct netdev *netdev_)
2530 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2531 unsigned int change_seq;
2533 ovs_mutex_lock(&netdev->mutex);
2534 change_seq = netdev->change_seq;
2535 ovs_mutex_unlock(&netdev->mutex);
/* Expands to a struct netdev_class initializer shared by the "system",
 * "tap", and "internal" netdev classes below; the parameters supply the
 * per-class name, constructor, stats, features, and status callbacks.
 * NOTE(review): lossy extract -- several backslash-continued member lines
 * are missing here; no comments are inserted inside the macro body because
 * a bare comment line would end the backslash continuation. */
2540 #define NETDEV_LINUX_CLASS(NAME, CONSTRUCT, GET_STATS, SET_STATS,  \
2541 GET_FEATURES, GET_STATUS) \
2547 netdev_linux_wait, \
2549 netdev_linux_alloc, \
2551 netdev_linux_destruct, \
2552 netdev_linux_dealloc, \
2553 NULL, /* get_config */ \
2554 NULL, /* set_config */ \
2555 NULL, /* get_tunnel_config */ \
2557 netdev_linux_send, \
2558 netdev_linux_send_wait, \
2560 netdev_linux_set_etheraddr, \
2561 netdev_linux_get_etheraddr, \
2562 netdev_linux_get_mtu, \
2563 netdev_linux_set_mtu, \
2564 netdev_linux_get_ifindex, \
2565 netdev_linux_get_carrier, \
2566 netdev_linux_get_carrier_resets, \
2567 netdev_linux_set_miimon_interval, \
2572 netdev_linux_set_advertisements, \
2574 netdev_linux_set_policing, \
2575 netdev_linux_get_qos_types, \
2576 netdev_linux_get_qos_capabilities, \
2577 netdev_linux_get_qos, \
2578 netdev_linux_set_qos, \
2579 netdev_linux_get_queue, \
2580 netdev_linux_set_queue, \
2581 netdev_linux_delete_queue, \
2582 netdev_linux_get_queue_stats, \
2583 netdev_linux_dump_queues, \
2584 netdev_linux_dump_queue_stats, \
2586 netdev_linux_get_in4, \
2587 netdev_linux_set_in4, \
2588 netdev_linux_get_in6, \
2589 netdev_linux_add_router, \
2590 netdev_linux_get_next_hop, \
2592 netdev_linux_arp_lookup, \
2594 netdev_linux_update_flags, \
2596 netdev_linux_change_seq, \
2598 netdev_linux_rx_alloc, \
2599 netdev_linux_rx_construct, \
2600 netdev_linux_rx_destruct, \
2601 netdev_linux_rx_dealloc, \
2602 netdev_linux_rx_recv, \
2603 netdev_linux_rx_wait, \
2604 netdev_linux_rx_drain, \
/* "system" class: ordinary Linux network devices. */
2607 const struct netdev_class netdev_linux_class =
2610 netdev_linux_construct,
2611 netdev_linux_get_stats,
2612 NULL, /* set_stats */
2613 netdev_linux_get_features,
2614 netdev_linux_get_status);
/* "tap" class: userspace TAP devices; stats come from the tap side. */
2616 const struct netdev_class netdev_tap_class =
2619 netdev_linux_construct_tap,
2620 netdev_tap_get_stats,
2621 NULL, /* set_stats */
2622 netdev_linux_get_features,
2623 netdev_linux_get_status);
/* "internal" class: datapath-internal ports; supports setting stats but
 * reports no link features. */
2625 const struct netdev_class netdev_internal_class =
2628 netdev_linux_construct,
2629 netdev_internal_get_stats,
2630 netdev_internal_set_stats,
2631 NULL, /* get_features */
2632 netdev_internal_get_status);
2634 /* HTB traffic control class. */
2636 #define HTB_N_QUEUES 0xf000
/* Qdisc-level state: only the configured ceiling rate is tracked.
 * NOTE(review): the 'struct htb { struct tc tc; ... }' header lines are not
 * visible in this lossy extract. */
2640 unsigned int max_rate; /* In bytes/s. */
/* Per-queue (HTB class) parameters, embedded in the generic tc_queue. */
2644 struct tc_queue tc_queue;
2645 unsigned int min_rate; /* In bytes/s. */
2646 unsigned int max_rate; /* In bytes/s. */
2647 unsigned int burst; /* In bytes. */
2648 unsigned int priority; /* Lower values are higher priorities. */
/* Returns the struct htb embedded in 'netdev_''s generic tc pointer. */
2652 htb_get__(const struct netdev *netdev_)
2654 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2655 return CONTAINER_OF(netdev->tc, struct htb, tc);
/* Allocates and installs a fresh struct htb with ceiling 'max_rate' as
 * 'netdev_''s tc implementation.  Ownership passes to the netdev. */
2659 htb_install__(struct netdev *netdev_, uint64_t max_rate)
2661 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2664 htb = xmalloc(sizeof *htb);
2665 tc_init(&htb->tc, &tc_ops_htb);
2666 htb->max_rate = max_rate;
2668 netdev->tc = &htb->tc;
2671 /* Create an HTB qdisc.
2673 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
2675 htb_setup_qdisc__(struct netdev *netdev)
2678 struct tc_htb_glob opt;
2679 struct ofpbuf request;
2680 struct tcmsg *tcmsg;
/* Remove any existing root qdisc first so NLM_F_EXCL|CREATE succeeds. */
2682 tc_del_qdisc(netdev);
2684 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2685 NLM_F_EXCL | NLM_F_CREATE, &request);
2689 tcmsg->tcm_handle = tc_make_handle(1, 0);
2690 tcmsg->tcm_parent = TC_H_ROOT;
2692 nl_msg_put_string(&request, TCA_KIND, "htb");
2694 memset(&opt, 0, sizeof opt);
/* rate2quantum controls how HTB derives quantum from rate; 10 matches
 * the tc(8) default. */
2695 opt.rate2quantum = 10;
2699 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2700 nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
2701 nl_msg_end_nested(&request, opt_offset);
2703 return tc_transact(&request, NULL);
2706 /* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
2707 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
2709 htb_setup_class__(struct netdev *netdev, unsigned int handle,
2710 unsigned int parent, struct htb_class *class)
2713 struct tc_htb_opt opt;
2714 struct ofpbuf request;
2715 struct tcmsg *tcmsg;
/* The MTU is needed to size the rate tables; without it HTB cannot be
 * configured. */
2719 error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
2721 VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU",
2722 netdev_get_name(netdev));
2726 memset(&opt, 0, sizeof opt);
2727 tc_fill_rate(&opt.rate, class->min_rate, mtu);
2728 tc_fill_rate(&opt.ceil, class->max_rate, mtu);
/* buffer/cbuffer are expressed in kernel ticks, derived from rate+burst. */
2729 opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
2730 opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
2731 opt.prio = class->priority;
2733 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
2737 tcmsg->tcm_handle = handle;
2738 tcmsg->tcm_parent = parent;
2740 nl_msg_put_string(&request, TCA_KIND, "htb");
2741 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2742 nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
2743 tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
2744 tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
2745 nl_msg_end_nested(&request, opt_offset);
2747 error = tc_transact(&request, NULL);
2749 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
2750 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
2751 netdev_get_name(netdev),
2752 tc_get_major(handle), tc_get_minor(handle),
2753 tc_get_major(parent), tc_get_minor(parent),
2754 class->min_rate, class->max_rate,
2755 class->burst, class->priority, ovs_strerror(error));
2760 /* Parses Netlink attributes in 'options' for HTB parameters and stores a
2761 * description of them into 'details'. The description complies with the
2762 * specification given in the vswitch database documentation for linux-htb
/* Extracts min_rate/max_rate/burst/priority from a TCA_HTB_PARMS
 * attribute nested inside 'nl_options'. */
2765 htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
2767 static const struct nl_policy tca_htb_policy[] = {
2768 [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
2769 .min_len = sizeof(struct tc_htb_opt) },
2772 struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
2773 const struct tc_htb_opt *htb;
2775 if (!nl_parse_nested(nl_options, tca_htb_policy,
2776 attrs, ARRAY_SIZE(tca_htb_policy))) {
2777 VLOG_WARN_RL(&rl, "failed to parse HTB class options");
2781 htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
2782 class->min_rate = htb->rate.rate;
2783 class->max_rate = htb->ceil.rate;
/* The kernel reports 'buffer' in ticks; convert back to bytes. */
2784 class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
2785 class->priority = htb->prio;
/* Parses a kernel tcmsg describing an HTB class: maps the tc handle to an
 * OVS queue_id (1:minor -> minor-1) and, on request, decodes the class
 * options and stats. */
2790 htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2791 struct htb_class *options,
2792 struct netdev_queue_stats *stats)
2794 struct nlattr *nl_options;
2795 unsigned int handle;
2798 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2799 if (!error && queue_id) {
2800 unsigned int major = tc_get_major(handle);
2801 unsigned int minor = tc_get_minor(handle);
/* Only handles of the form 1:1 .. 1:HTB_N_QUEUES are OVS queues. */
2802 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2803 *queue_id = minor - 1;
2808 if (!error && options) {
2809 error = htb_parse_tca_options__(nl_options, options);
/* Fills 'hc' from qdisc-level 'details': "max-rate" is given in bits/s and
 * stored in bytes/s; when absent or zero, falls back to the link speed
 * reported by the kernel (default 100 Mbps when unknown). */
2815 htb_parse_qdisc_details__(struct netdev *netdev_,
2816 const struct smap *details, struct htb_class *hc)
2818 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2819 const char *max_rate_s;
2821 max_rate_s = smap_get(details, "max-rate");
2822 hc->max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
2823 if (!hc->max_rate) {
2824 enum netdev_features current;
2826 netdev_linux_read_features(netdev);
2827 current = !netdev->get_features_error ? netdev->current : 0;
2828 hc->max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
/* The default class is guaranteed its full rate. */
2830 hc->min_rate = hc->max_rate;
/* Fills 'hc' from per-queue 'details' (all rates in bits/s in the smap,
 * bytes/s internally), clamping values to sane and kernel-acceptable
 * ranges relative to the qdisc ceiling and the device MTU. */
2836 htb_parse_class_details__(struct netdev *netdev,
2837 const struct smap *details, struct htb_class *hc)
2839 const struct htb *htb = htb_get__(netdev);
2840 const char *min_rate_s = smap_get(details, "min-rate");
2841 const char *max_rate_s = smap_get(details, "max-rate");
2842 const char *burst_s = smap_get(details, "burst");
2843 const char *priority_s = smap_get(details, "priority");
2846 error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
2848 VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU",
2849 netdev_get_name(netdev));
2853 /* HTB requires at least an mtu sized min-rate to send any traffic even
2854 * on uncongested links. */
2855 hc->min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
2856 hc->min_rate = MAX(hc->min_rate, mtu);
2857 hc->min_rate = MIN(hc->min_rate, htb->max_rate);
/* max-rate defaults to the qdisc ceiling and is kept within
 * [min_rate, qdisc max_rate]. */
2860 hc->max_rate = (max_rate_s
2861 ? strtoull(max_rate_s, NULL, 10) / 8
2863 hc->max_rate = MAX(hc->max_rate, hc->min_rate);
2864 hc->max_rate = MIN(hc->max_rate, htb->max_rate);
2868 * According to hints in the documentation that I've read, it is important
2869 * that 'burst' be at least as big as the largest frame that might be
2870 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
2871 * but having it a bit too small is a problem. Since netdev_get_mtu()
2872 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
2873 * the MTU. We actually add 64, instead of 14, as a guard against
2874 * additional headers get tacked on somewhere that we're not aware of. */
2875 hc->burst = burst_s ? strtoull(burst_s, NULL, 10) / 8 : 0;
2876 hc->burst = MAX(hc->burst, mtu + 64);
2879 hc->priority = priority_s ? strtoul(priority_s, NULL, 10) : 0;
/* Queries the kernel for HTB class 'handle' under 'parent' on 'netdev' and
 * decodes the reply into 'options' and/or 'stats' (either may be NULL). */
2885 htb_query_class__(const struct netdev *netdev, unsigned int handle,
2886 unsigned int parent, struct htb_class *options,
2887 struct netdev_queue_stats *stats)
2889 struct ofpbuf *reply;
2892 error = tc_query_class(netdev, handle, parent, &reply);
2894 error = htb_parse_tcmsg__(reply, NULL, options, stats);
2895 ofpbuf_delete(reply);
/* tc_ops "tc_install" callback: creates the root HTB qdisc plus the
 * default class 1:fffe, then records the htb state on the netdev. */
2901 htb_tc_install(struct netdev *netdev, const struct smap *details)
2905 error = htb_setup_qdisc__(netdev);
2907 struct htb_class hc;
2909 htb_parse_qdisc_details__(netdev, details, &hc);
2910 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2911 tc_make_handle(1, 0), &hc);
2913 htb_install__(netdev, hc.max_rate);
/* Downcast from the generic tc_queue to its enclosing htb_class. */
2919 static struct htb_class *
2920 htb_class_cast__(const struct tc_queue *queue)
2922 return CONTAINER_OF(queue, struct htb_class, tc_queue);
/* Records 'hc' as the configuration of queue 'queue_id' in the in-memory
 * queue map, creating the entry if it does not yet exist. */
2926 htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
2927 const struct htb_class *hc)
2929 struct htb *htb = htb_get__(netdev);
2930 size_t hash = hash_int(queue_id, 0);
2931 struct tc_queue *queue;
2932 struct htb_class *hcp;
2934 queue = tc_find_queue__(netdev, queue_id, hash);
2936 hcp = htb_class_cast__(queue);
/* Not found: allocate and insert a new queue record. */
2938 hcp = xmalloc(sizeof *hcp);
2939 queue = &hcp->tc_queue;
2940 queue->queue_id = queue_id;
2941 queue->created = time_msec();
2942 hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
2945 hcp->min_rate = hc->min_rate;
2946 hcp->max_rate = hc->max_rate;
2947 hcp->burst = hc->burst;
2948 hcp->priority = hc->priority;
/* tc_ops "tc_load" callback: adopts an HTB qdisc that already exists in
 * the kernel by reading back the default class and dumping every class
 * into the in-memory queue map. */
2952 htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
2955 struct nl_dump dump;
2956 struct htb_class hc;
2958 /* Get qdisc options. */
2960 htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
2961 htb_install__(netdev, hc.max_rate);
/* Walk all kernel classes, keeping only well-formed queue handles. */
2964 if (!start_queue_dump(netdev, &dump)) {
2967 while (nl_dump_next(&dump, &msg)) {
2968 unsigned int queue_id;
2970 if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
2971 htb_update_queue__(netdev, queue_id, &hc);
2974 nl_dump_done(&dump);
/* tc_ops "tc_destroy": frees every queue record and the htb itself.
 * NOTE(review): the free() of each 'hc' and of 'htb' is not visible in
 * this lossy extract. */
2980 htb_tc_destroy(struct tc *tc)
2982 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
2983 struct htb_class *hc, *next;
2985 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &htb->tc.queues) {
2986 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
/* qdisc_get: reports max-rate in bits/s (stored internally in bytes/s). */
2994 htb_qdisc_get(const struct netdev *netdev, struct smap *details)
2996 const struct htb *htb = htb_get__(netdev);
2997 smap_add_format(details, "max-rate", "%llu", 8ULL * htb->max_rate);
/* qdisc_set: reconfigures the default class 1:fffe and updates the cached
 * ceiling on success. */
3002 htb_qdisc_set(struct netdev *netdev, const struct smap *details)
3004 struct htb_class hc;
3007 htb_parse_qdisc_details__(netdev, details, &hc);
3008 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3009 tc_make_handle(1, 0), &hc);
3011 htb_get__(netdev)->max_rate = hc.max_rate;
/* class_get: emits the queue's settings, in bits/s, omitting max-rate when
 * it equals min-rate (i.e. when it was defaulted). */
3017 htb_class_get(const struct netdev *netdev OVS_UNUSED,
3018 const struct tc_queue *queue, struct smap *details)
3020 const struct htb_class *hc = htb_class_cast__(queue);
3022 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
3023 if (hc->min_rate != hc->max_rate) {
3024 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
3026 smap_add_format(details, "burst", "%llu", 8ULL * hc->burst);
3028 smap_add_format(details, "priority", "%u", hc->priority);
/* class_set: validates 'details', pushes the class to the kernel (handle
 * 1:queue_id+1 under parent 1:fffe), then mirrors it in memory. */
3034 htb_class_set(struct netdev *netdev, unsigned int queue_id,
3035 const struct smap *details)
3037 struct htb_class hc;
3040 error = htb_parse_class_details__(netdev, details, &hc);
3045 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3046 tc_make_handle(1, 0xfffe), &hc);
3051 htb_update_queue__(netdev, queue_id, &hc);
/* class_delete: removes the kernel class and, on success, the in-memory
 * record.  NOTE(review): the free(hc) is not visible in this extract. */
3056 htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
3058 struct htb_class *hc = htb_class_cast__(queue);
3059 struct htb *htb = htb_get__(netdev);
3062 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3064 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
/* class_get_stats: queries the kernel for just this queue's counters. */
3071 htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3072 struct netdev_queue_stats *stats)
3074 return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3075 tc_make_handle(1, 0xfffe), NULL, stats);
/* class_dump_stats: invoked per netlink message during a dump; forwards
 * stats for valid queue handles (1:1..1:HTB_N_QUEUES) to 'cb'. */
3079 htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3080 const struct ofpbuf *nlmsg,
3081 netdev_dump_queue_stats_cb *cb, void *aux)
3083 struct netdev_queue_stats stats;
3084 unsigned int handle, major, minor;
3087 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3092 major = tc_get_major(handle);
3093 minor = tc_get_minor(handle);
3094 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
3095 (*cb)(minor - 1, &stats, aux);
/* Virtual table wiring the HTB implementation into the generic tc layer. */
3100 static const struct tc_ops tc_ops_htb = {
3101 "htb", /* linux_name */
3102 "linux-htb", /* ovs_name */
3103 HTB_N_QUEUES, /* n_queues */
3112 htb_class_get_stats,
3113 htb_class_dump_stats
3116 /* "linux-hfsc" traffic control class. */
3118 #define HFSC_N_QUEUES 0xf000
/* Per-queue HFSC state embedded in the generic tc_queue.
 * NOTE(review): the 'struct hfsc' / 'struct hfsc_class' header lines and
 * rate fields are partly missing from this lossy extract. */
3126 struct tc_queue tc_queue;
/* Returns the struct hfsc embedded in 'netdev_''s generic tc pointer. */
3131 static struct hfsc *
3132 hfsc_get__(const struct netdev *netdev_)
3134 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3135 return CONTAINER_OF(netdev->tc, struct hfsc, tc);
/* Downcast from the generic tc_queue to its enclosing hfsc_class. */
3138 static struct hfsc_class *
3139 hfsc_class_cast__(const struct tc_queue *queue)
3141 return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
/* Allocates and installs fresh hfsc state with ceiling 'max_rate'. */
3145 hfsc_install__(struct netdev *netdev_, uint32_t max_rate)
3147 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3150 hfsc = xmalloc(sizeof *hfsc);
3151 tc_init(&hfsc->tc, &tc_ops_hfsc);
3152 hfsc->max_rate = max_rate;
3153 netdev->tc = &hfsc->tc;
/* Records 'hc' as queue 'queue_id' in the in-memory map, creating the
 * entry on first sight (same pattern as htb_update_queue__). */
3157 hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
3158 const struct hfsc_class *hc)
3162 struct hfsc_class *hcp;
3163 struct tc_queue *queue;
3165 hfsc = hfsc_get__(netdev);
3166 hash = hash_int(queue_id, 0);
3168 queue = tc_find_queue__(netdev, queue_id, hash);
3170 hcp = hfsc_class_cast__(queue);
3172 hcp = xmalloc(sizeof *hcp);
3173 queue = &hcp->tc_queue;
3174 queue->queue_id = queue_id;
3175 queue->created = time_msec();
3176 hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
3179 hcp->min_rate = hc->min_rate;
3180 hcp->max_rate = hc->max_rate;
/* Decodes the RSC/FSC/USC service-curve attributes of an HFSC class into
 * min_rate/max_rate.  Only the linear configurations that OVS itself
 * writes (m1 == 0, d == 0, rsc == fsc, rsc <= usc) are accepted; anything
 * else is rejected with a rate-limited warning. */
3184 hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
3186 const struct tc_service_curve *rsc, *fsc, *usc;
3187 static const struct nl_policy tca_hfsc_policy[] = {
3189 .type = NL_A_UNSPEC,
3191 .min_len = sizeof(struct tc_service_curve),
3194 .type = NL_A_UNSPEC,
3196 .min_len = sizeof(struct tc_service_curve),
3199 .type = NL_A_UNSPEC,
3201 .min_len = sizeof(struct tc_service_curve),
3204 struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];
3206 if (!nl_parse_nested(nl_options, tca_hfsc_policy,
3207 attrs, ARRAY_SIZE(tca_hfsc_policy))) {
3208 VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
3212 rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
3213 fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
3214 usc = nl_attr_get(attrs[TCA_HFSC_USC]);
3216 if (rsc->m1 != 0 || rsc->d != 0 ||
3217 fsc->m1 != 0 || fsc->d != 0 ||
3218 usc->m1 != 0 || usc->d != 0) {
3219 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3220 "Non-linear service curves are not supported.");
3224 if (rsc->m2 != fsc->m2) {
3225 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3226 "Real-time service curves are not supported ");
3230 if (rsc->m2 > usc->m2) {
3231 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3232 "Min-rate service curve is greater than "
3233 "the max-rate service curve.");
/* Steady-state slopes: fsc gives the guaranteed (min) rate, usc the cap. */
3237 class->min_rate = fsc->m2;
3238 class->max_rate = usc->m2;
/* Parses a kernel tcmsg describing an HFSC class: maps the handle to an
 * OVS queue_id and optionally decodes options and stats. */
3243 hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
3244 struct hfsc_class *options,
3245 struct netdev_queue_stats *stats)
3248 unsigned int handle;
3249 struct nlattr *nl_options;
3251 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
3257 unsigned int major, minor;
3259 major = tc_get_major(handle);
3260 minor = tc_get_minor(handle);
3261 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3262 *queue_id = minor - 1;
3269 error = hfsc_parse_tca_options__(nl_options, options);
/* Queries the kernel for one HFSC class and decodes the reply into
 * 'options' and/or 'stats' (either may be NULL). */
3276 hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
3277 unsigned int parent, struct hfsc_class *options,
3278 struct netdev_queue_stats *stats)
3281 struct ofpbuf *reply;
3283 error = tc_query_class(netdev, handle, parent, &reply);
3288 error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
3289 ofpbuf_delete(reply);
/* Fills 'class' from qdisc-level 'details': "max-rate" in bits/s, stored in
 * bytes/s; defaults to the detected link speed (100 Mbps when unknown).
 * The default class is guaranteed the full rate (min == max). */
3294 hfsc_parse_qdisc_details__(struct netdev *netdev_, const struct smap *details,
3295 struct hfsc_class *class)
3297 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3299 const char *max_rate_s;
3301 max_rate_s = smap_get(details, "max-rate");
3302 max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
3305 enum netdev_features current;
3307 netdev_linux_read_features(netdev);
3308 current = !netdev->get_features_error ? netdev->current : 0;
3309 max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
3312 class->min_rate = max_rate;
3313 class->max_rate = max_rate;
/* Fills 'class' from per-queue 'details', clamping min-rate to at least
 * 1 byte/s and both rates to the qdisc ceiling. */
3317 hfsc_parse_class_details__(struct netdev *netdev,
3318 const struct smap *details,
3319 struct hfsc_class * class)
3321 const struct hfsc *hfsc;
3322 uint32_t min_rate, max_rate;
3323 const char *min_rate_s, *max_rate_s;
3325 hfsc = hfsc_get__(netdev);
3326 min_rate_s = smap_get(details, "min-rate");
3327 max_rate_s = smap_get(details, "max-rate");
3329 min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
3330 min_rate = MAX(min_rate, 1);
3331 min_rate = MIN(min_rate, hfsc->max_rate);
3333 max_rate = (max_rate_s
3334 ? strtoull(max_rate_s, NULL, 10) / 8
3336 max_rate = MAX(max_rate, min_rate);
3337 max_rate = MIN(max_rate, hfsc->max_rate);
3339 class->min_rate = min_rate;
3340 class->max_rate = max_rate;
3345 /* Create an HFSC qdisc.
3347 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
3349 hfsc_setup_qdisc__(struct netdev * netdev)
3351 struct tcmsg *tcmsg;
3352 struct ofpbuf request;
3353 struct tc_hfsc_qopt opt;
/* Remove any existing root qdisc so the exclusive create succeeds. */
3355 tc_del_qdisc(netdev);
3357 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3358 NLM_F_EXCL | NLM_F_CREATE, &request);
3364 tcmsg->tcm_handle = tc_make_handle(1, 0);
3365 tcmsg->tcm_parent = TC_H_ROOT;
3367 memset(&opt, 0, sizeof opt);
3370 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3371 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
3373 return tc_transact(&request, NULL);
3376 /* Create an HFSC class.
3378 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
3379 * sc rate <min_rate> ul rate <max_rate>" */
3381 hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
3382 unsigned int parent, struct hfsc_class *class)
3386 struct tcmsg *tcmsg;
3387 struct ofpbuf request;
3388 struct tc_service_curve min, max;
3390 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
3396 tcmsg->tcm_handle = handle;
3397 tcmsg->tcm_parent = parent;
/* Linear service curves: only the steady-state slope m2 is set.
 * NOTE(review): the memset/m1/d initialization lines for 'min' and 'max'
 * are not visible in this lossy extract. */
3401 min.m2 = class->min_rate;
3405 max.m2 = class->max_rate;
3407 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3408 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
/* RSC and FSC share the min curve; USC carries the upper limit. */
3409 nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
3410 nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
3411 nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
3412 nl_msg_end_nested(&request, opt_offset);
3414 error = tc_transact(&request, NULL);
3416 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
3417 "min-rate %ubps, max-rate %ubps (%s)",
3418 netdev_get_name(netdev),
3419 tc_get_major(handle), tc_get_minor(handle),
3420 tc_get_major(parent), tc_get_minor(parent),
3421 class->min_rate, class->max_rate, ovs_strerror(error));
/* tc_ops "tc_install": creates the root HFSC qdisc and default class
 * 1:fffe, then records the hfsc state on the netdev. */
3428 hfsc_tc_install(struct netdev *netdev, const struct smap *details)
3431 struct hfsc_class class;
3433 error = hfsc_setup_qdisc__(netdev);
3439 hfsc_parse_qdisc_details__(netdev, details, &class);
3440 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3441 tc_make_handle(1, 0), &class);
3447 hfsc_install__(netdev, class.max_rate);
/* tc_ops "tc_load": adopts an existing kernel HFSC qdisc by reading back
 * the default class and dumping every class into the queue map. */
3452 hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3455 struct nl_dump dump;
3456 struct hfsc_class hc;
3459 hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
3460 hfsc_install__(netdev, hc.max_rate);
3462 if (!start_queue_dump(netdev, &dump)) {
3466 while (nl_dump_next(&dump, &msg)) {
3467 unsigned int queue_id;
3469 if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3470 hfsc_update_queue__(netdev, queue_id, &hc);
3474 nl_dump_done(&dump);
/* tc_ops "tc_destroy": frees every queue record and the hfsc itself.
 * NOTE(review): the free() calls are not visible in this extract. */
3479 hfsc_tc_destroy(struct tc *tc)
3482 struct hfsc_class *hc, *next;
3484 hfsc = CONTAINER_OF(tc, struct hfsc, tc);
3486 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
3487 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
/* qdisc_get: reports max-rate in bits/s (stored internally in bytes/s). */
3496 hfsc_qdisc_get(const struct netdev *netdev, struct smap *details)
3498 const struct hfsc *hfsc;
3499 hfsc = hfsc_get__(netdev);
3500 smap_add_format(details, "max-rate", "%llu", 8ULL * hfsc->max_rate);
/* qdisc_set: reconfigures the default class and caches the new ceiling. */
3505 hfsc_qdisc_set(struct netdev *netdev, const struct smap *details)
3508 struct hfsc_class class;
3510 hfsc_parse_qdisc_details__(netdev, details, &class);
3511 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3512 tc_make_handle(1, 0), &class);
3515 hfsc_get__(netdev)->max_rate = class.max_rate;
/* class_get: emits the queue's rates in bits/s, omitting max-rate when it
 * equals min-rate (i.e. when it was defaulted). */
3522 hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
3523 const struct tc_queue *queue, struct smap *details)
3525 const struct hfsc_class *hc;
3527 hc = hfsc_class_cast__(queue);
3528 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
3529 if (hc->min_rate != hc->max_rate) {
3530 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
/* class_set: validates 'details', pushes the class to the kernel (handle
 * 1:queue_id+1 under parent 1:fffe), then mirrors it in memory. */
3536 hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
3537 const struct smap *details)
3540 struct hfsc_class class;
3542 error = hfsc_parse_class_details__(netdev, details, &class);
3547 error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3548 tc_make_handle(1, 0xfffe), &class);
3553 hfsc_update_queue__(netdev, queue_id, &class);
/* class_delete: removes the kernel class and, on success, the in-memory
 * record.  NOTE(review): the free(hc) is not visible in this extract. */
3558 hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
3562 struct hfsc_class *hc;
3564 hc = hfsc_class_cast__(queue);
3565 hfsc = hfsc_get__(netdev);
3567 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3569 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
/* class_get_stats: queries the kernel for just this queue's counters. */
3576 hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3577 struct netdev_queue_stats *stats)
3579 return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3580 tc_make_handle(1, 0xfffe), NULL, stats);
/* class_dump_stats: per-message dump callback; forwards stats for valid
 * queue handles (1:1..1:HFSC_N_QUEUES) to 'cb'. */
3584 hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3585 const struct ofpbuf *nlmsg,
3586 netdev_dump_queue_stats_cb *cb, void *aux)
3588 struct netdev_queue_stats stats;
3589 unsigned int handle, major, minor;
3592 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3597 major = tc_get_major(handle);
3598 minor = tc_get_minor(handle);
3599 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3600 (*cb)(minor - 1, &stats, aux);
/* Virtual table wiring the HFSC implementation into the generic tc layer. */
3605 static const struct tc_ops tc_ops_hfsc = {
3606 "hfsc", /* linux_name */
3607 "linux-hfsc", /* ovs_name */
3608 HFSC_N_QUEUES, /* n_queues */
3609 hfsc_tc_install, /* tc_install */
3610 hfsc_tc_load, /* tc_load */
3611 hfsc_tc_destroy, /* tc_destroy */
3612 hfsc_qdisc_get, /* qdisc_get */
3613 hfsc_qdisc_set, /* qdisc_set */
3614 hfsc_class_get, /* class_get */
3615 hfsc_class_set, /* class_set */
3616 hfsc_class_delete, /* class_delete */
3617 hfsc_class_get_stats, /* class_get_stats */
3618 hfsc_class_dump_stats /* class_dump_stats */
3621 /* "linux-default" traffic control class.
3623 * This class represents the default, unnamed Linux qdisc. It corresponds to
3624 * the "" (empty string) QoS type in the OVS database. */
/* Points the netdev at a shared, immutable tc singleton -- there is no
 * per-device state for the default qdisc. */
3627 default_install__(struct netdev *netdev_)
3629 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3630 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_default);
3632 /* Nothing but a tc class implementation is allowed to write to a tc. This
3633 * class never does that, so we can legitimately use a const tc object. */
3634 netdev->tc = CONST_CAST(struct tc *, &tc);
/* tc_install and tc_load are identical no-ops for the default qdisc. */
3638 default_tc_install(struct netdev *netdev,
3639 const struct smap *details OVS_UNUSED)
3641 default_install__(netdev);
3646 default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3648 default_install__(netdev);
/* Vtable: no linux qdisc name, no queues, all per-queue ops NULL. */
3652 static const struct tc_ops tc_ops_default = {
3653 NULL, /* linux_name */
3658 NULL, /* tc_destroy */
3659 NULL, /* qdisc_get */
3660 NULL, /* qdisc_set */
3661 NULL, /* class_get */
3662 NULL, /* class_set */
3663 NULL, /* class_delete */
3664 NULL, /* class_get_stats */
3665 NULL /* class_dump_stats */
3668 /* "linux-other" traffic control class.
/* Catch-all for qdiscs OVS does not understand; like "linux-default" it
 * installs a shared, immutable tc singleton. */
3673 other_tc_load(struct netdev *netdev_, struct ofpbuf *nlmsg OVS_UNUSED)
3675 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3676 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_other);
3678 /* Nothing but a tc class implementation is allowed to write to a tc. This
3679 * class never does that, so we can legitimately use a const tc object. */
3680 netdev->tc = CONST_CAST(struct tc *, &tc);
/* Vtable: load-only; cannot be installed and exposes no queue ops. */
3684 static const struct tc_ops tc_ops_other = {
3685 NULL, /* linux_name */
3686 "linux-other", /* ovs_name */
3688 NULL, /* tc_install */
3690 NULL, /* tc_destroy */
3691 NULL, /* qdisc_get */
3692 NULL, /* qdisc_set */
3693 NULL, /* class_get */
3694 NULL, /* class_set */
3695 NULL, /* class_delete */
3696 NULL, /* class_get_stats */
3697 NULL /* class_dump_stats */
3700 /* Traffic control. */
3702 /* Number of kernel "tc" ticks per second. */
3703 static double ticks_per_s;
3705 /* Number of kernel "jiffies" per second. This is used for the purpose of
3706 * computing buffer sizes. Generally kernel qdiscs need to be able to buffer
3707 * one jiffy's worth of data.
3709 * There are two possibilities here:
3711 * - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
3712 * approximate range of 100 to 1024. That means that we really need to
3713 * make sure that the qdisc can buffer that much data.
3715 * - 'buffer_hz' is an absurdly large number. That means that the kernel
3716 * has finely granular timers and there's no need to fudge additional room
3717 * for buffers. (There's no extra effort needed to implement that: the
3718 * large 'buffer_hz' is used as a divisor, so practically any number will
3719 * come out as 0 in the division. Small integer results in the case of
3720 * really high dividends won't have any real effect anyhow.)
3722 static unsigned int buffer_hz;
3724 /* Returns tc handle 'major':'minor'. */
3726 tc_make_handle(unsigned int major, unsigned int minor)
3728 return TC_H_MAKE(major << 16, minor);
3731 /* Returns the major number from 'handle'. */
3733 tc_get_major(unsigned int handle)
3735 return TC_H_MAJ(handle) >> 16;
3738 /* Returns the minor number from 'handle'. */
3740 tc_get_minor(unsigned int handle)
3742 return TC_H_MIN(handle);
/* Initializes 'request' as an rtnetlink tc request of 'type' (RTM_NEW*/
/* RTM_DEL*) for 'netdev', returning a tcmsg whose handle/parent the caller
 * must fill in.  NOTE(review): the error path when get_ifindex() fails is
 * not visible in this lossy extract; presumably it returns NULL. */
3745 static struct tcmsg *
3746 tc_make_request(const struct netdev *netdev, int type, unsigned int flags,
3747 struct ofpbuf *request)
3749 struct tcmsg *tcmsg;
3753 error = get_ifindex(netdev, &ifindex);
3758 ofpbuf_init(request, 512);
3759 nl_msg_put_nlmsghdr(request, sizeof *tcmsg, type, NLM_F_REQUEST | flags);
3760 tcmsg = ofpbuf_put_zeros(request, sizeof *tcmsg);
3761 tcmsg->tcm_family = AF_UNSPEC;
3762 tcmsg->tcm_ifindex = ifindex;
3763 /* Caller should fill in tcmsg->tcm_handle. */
3764 /* Caller should fill in tcmsg->tcm_parent. */
/* Sends 'request' on the NETLINK_ROUTE channel and releases its buffer;
 * the reply (if requested via 'replyp') becomes the caller's to free. */
3770 tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
3772 int error = nl_transact(NETLINK_ROUTE, request, replyp);
3773 ofpbuf_uninit(request);
3777 /* Adds or deletes a root ingress qdisc on 'netdev'. We use this for
3778 * policing configuration.
3780 * This function is equivalent to running the following when 'add' is true:
3781 * /sbin/tc qdisc add dev <devname> handle ffff: ingress
3783 * This function is equivalent to running the following when 'add' is false:
3784 * /sbin/tc qdisc del dev <devname> handle ffff: ingress
3786 * The configuration and stats may be seen with the following command:
3787 * /sbin/tc -s qdisc show dev <devname>
3789 * Returns 0 if successful, otherwise a positive errno value.
3792 tc_add_del_ingress_qdisc(struct netdev *netdev, bool add)
3794 struct ofpbuf request;
3795 struct tcmsg *tcmsg;
3797 int type = add ? RTM_NEWQDISC : RTM_DELQDISC;
3798 int flags = add ? NLM_F_EXCL | NLM_F_CREATE : 0;
3800 tcmsg = tc_make_request(netdev, type, flags, &request);
3804 tcmsg->tcm_handle = tc_make_handle(0xffff, 0);
3805 tcmsg->tcm_parent = TC_H_INGRESS;
3806 nl_msg_put_string(&request, TCA_KIND, "ingress");
3807 nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0);
3809 error = tc_transact(&request, NULL);
3811 /* If we're deleting the qdisc, don't worry about some of the
3812 * error conditions. */
/* ENOENT/EINVAL on delete just means there was nothing to remove. */
3813 if (!add && (error == ENOENT || error == EINVAL)) {
3822 /* Adds a policer to 'netdev' with a rate of 'kbits_rate' and a burst size
3825 * This function is equivalent to running:
3826 * /sbin/tc filter add dev <devname> parent ffff: protocol all prio 49
3827 * basic police rate <kbits_rate>kbit burst <kbits_burst>k
3830 * The configuration and stats may be seen with the following command:
3831 * /sbin/tc -s filter show <devname> eth0 parent ffff:
3833 * Returns 0 if successful, otherwise a positive errno value.
3836 tc_add_policer(struct netdev *netdev, int kbits_rate, int kbits_burst)
3838 struct tc_police tc_police;
3839 struct ofpbuf request;
3840 struct tcmsg *tcmsg;
3841 size_t basic_offset;
3842 size_t police_offset;
/* NOTE(review): the declaration/initialization of 'mtu' (and of 'error')
 * is not visible in this lossy extract. */
3846 memset(&tc_police, 0, sizeof tc_police);
3847 tc_police.action = TC_POLICE_SHOT;
3848 tc_police.mtu = mtu;
/* kbits -> bytes/s for the rate table. */
3849 tc_fill_rate(&tc_police.rate, (kbits_rate * 1000)/8, mtu);
3850 tc_police.burst = tc_bytes_to_ticks(tc_police.rate.rate,
3851 kbits_burst * 1024);
3853 tcmsg = tc_make_request(netdev, RTM_NEWTFILTER,
3854 NLM_F_EXCL | NLM_F_CREATE, &request);
3858 tcmsg->tcm_parent = tc_make_handle(0xffff, 0);
/* tcm_info encodes filter priority (49) and protocol (ETH_P_ALL). */
3859 tcmsg->tcm_info = tc_make_handle(49,
3860 (OVS_FORCE uint16_t) htons(ETH_P_ALL));
3862 nl_msg_put_string(&request, TCA_KIND, "basic");
3863 basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3864 police_offset = nl_msg_start_nested(&request, TCA_BASIC_POLICE);
3865 nl_msg_put_unspec(&request, TCA_POLICE_TBF, &tc_police, sizeof tc_police);
3866 tc_put_rtab(&request, TCA_POLICE_RATE, &tc_police.rate);
3867 nl_msg_end_nested(&request, police_offset);
3868 nl_msg_end_nested(&request, basic_offset);
3870 error = tc_transact(&request, NULL);
3881 /* The values in psched are not individually very meaningful, but they are
3882 * important. The tables below show some values seen in the wild.
3886 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
3887 * (Before that, there are hints that it was 1000000000.)
3889 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
3893 * -----------------------------------
3894 * [1] 000c8000 000f4240 000f4240 00000064
3895 * [2] 000003e8 00000400 000f4240 3b9aca00
3896 * [3] 000003e8 00000400 000f4240 3b9aca00
3897 * [4] 000003e8 00000400 000f4240 00000064
3898 * [5] 000003e8 00000040 000f4240 3b9aca00
3899 * [6] 000003e8 00000040 000f4240 000000f9
3901 * a b c d ticks_per_s buffer_hz
3902 * ------- --------- ---------- ------------- ----------- -------------
3903 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
3904 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3905 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3906 * [4] 1,000 1,024 1,000,000 100 976,562 100
3907 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
3908 * [6] 1,000 64 1,000,000 249 15,625,000 249
3910 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
3911 * [2] 2.6.26-1-686-bigmem from Debian lenny
3912 * [3] 2.6.26-2-sparc64 from Debian lenny
3913 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
3914 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
3915 * [6] 2.6.34 from kernel.org on KVM
3917 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
3918 static const char fn[] = "/proc/net/psched";
3919 unsigned int a, b, c, d;
3922 if (!ovsthread_once_start(&once)) {
3929 stream = fopen(fn, "r");
3931 VLOG_WARN("%s: open failed: %s", fn, ovs_strerror(errno));
3935 if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
3936 VLOG_WARN("%s: read failed", fn);
3940 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
3944 VLOG_WARN("%s: invalid scheduler parameters", fn);
3948 ticks_per_s = (double) a * c / b;
3952 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
3955 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);
3958 ovsthread_once_done(&once);
3961 /* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
3962 * rate of 'rate' bytes per second. */
3964 tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
3967 return (rate * ticks) / ticks_per_s;
3970 /* Returns the number of ticks that it would take to transmit 'size' bytes at a
3971 * rate of 'rate' bytes per second. */
3973 tc_bytes_to_ticks(unsigned int rate, unsigned int size)
3976 return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
3979 /* Returns the number of bytes that need to be reserved for qdisc buffering at
3980 * a transmission rate of 'rate' bytes per second. */
3982 tc_buffer_per_jiffy(unsigned int rate)
3985 return rate / buffer_hz;
3988 /* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
3989 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
3990 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
3991 * stores NULL into it if it is absent.
3993 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
3996 * Returns 0 if successful, otherwise a positive errno value. */
3998 tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
3999 struct nlattr **options)
4001 static const struct nl_policy tca_policy[] = {
4002 [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
4003 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
4005 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
4007 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
4008 tca_policy, ta, ARRAY_SIZE(ta))) {
4009 VLOG_WARN_RL(&rl, "failed to parse qdisc message");
4014 *kind = nl_attr_get_string(ta[TCA_KIND]);
4018 *options = ta[TCA_OPTIONS];
4033 /* Given Netlink 'msg' that describes a class, extracts the queue ID (e.g. the
4034 * minor number of its class ID) into '*queue_id', its TCA_OPTIONS attribute
4035 * into '*options', and its queue statistics into '*stats'. Any of the output
4036 * arguments may be null.
4038 * Returns 0 if successful, otherwise a positive errno value. */
4040 tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
4041 struct nlattr **options, struct netdev_queue_stats *stats)
4043 static const struct nl_policy tca_policy[] = {
4044 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
4045 [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
4047 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
4049 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
4050 tca_policy, ta, ARRAY_SIZE(ta))) {
4051 VLOG_WARN_RL(&rl, "failed to parse class message");
4056 struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
4057 *handlep = tc->tcm_handle;
4061 *options = ta[TCA_OPTIONS];
4065 const struct gnet_stats_queue *gsq;
4066 struct gnet_stats_basic gsb;
4068 static const struct nl_policy stats_policy[] = {
4069 [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
4070 .min_len = sizeof gsb },
4071 [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
4072 .min_len = sizeof *gsq },
4074 struct nlattr *sa[ARRAY_SIZE(stats_policy)];
4076 if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
4077 sa, ARRAY_SIZE(sa))) {
4078 VLOG_WARN_RL(&rl, "failed to parse class stats");
4082 /* Alignment issues screw up the length of struct gnet_stats_basic on
4083 * some arch/bitsize combinations. Newer versions of Linux have a
4084 * struct gnet_stats_basic_packed, but we can't depend on that. The
4085 * easiest thing to do is just to make a copy. */
4086 memset(&gsb, 0, sizeof gsb);
4087 memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
4088 MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
4089 stats->tx_bytes = gsb.bytes;
4090 stats->tx_packets = gsb.packets;
4092 gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
4093 stats->tx_errors = gsq->drops;
4103 memset(stats, 0, sizeof *stats);
4108 /* Queries the kernel for class with identifier 'handle' and parent 'parent'
4111 tc_query_class(const struct netdev *netdev,
4112 unsigned int handle, unsigned int parent,
4113 struct ofpbuf **replyp)
4115 struct ofpbuf request;
4116 struct tcmsg *tcmsg;
4119 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO, &request);
4123 tcmsg->tcm_handle = handle;
4124 tcmsg->tcm_parent = parent;
4126 error = tc_transact(&request, replyp);
4128 VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
4129 netdev_get_name(netdev),
4130 tc_get_major(handle), tc_get_minor(handle),
4131 tc_get_major(parent), tc_get_minor(parent),
4132 ovs_strerror(error));
4137 /* Equivalent to "tc class del dev <name> handle <handle>". */
4139 tc_delete_class(const struct netdev *netdev, unsigned int handle)
4141 struct ofpbuf request;
4142 struct tcmsg *tcmsg;
4145 tcmsg = tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
4149 tcmsg->tcm_handle = handle;
4150 tcmsg->tcm_parent = 0;
4152 error = tc_transact(&request, NULL);
4154 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
4155 netdev_get_name(netdev),
4156 tc_get_major(handle), tc_get_minor(handle),
4157 ovs_strerror(error));
4162 /* Equivalent to "tc qdisc del dev <name> root". */
4164 tc_del_qdisc(struct netdev *netdev_)
4166 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4167 struct ofpbuf request;
4168 struct tcmsg *tcmsg;
4171 tcmsg = tc_make_request(netdev_, RTM_DELQDISC, 0, &request);
4175 tcmsg->tcm_handle = tc_make_handle(1, 0);
4176 tcmsg->tcm_parent = TC_H_ROOT;
4178 error = tc_transact(&request, NULL);
4179 if (error == EINVAL) {
4180 /* EINVAL probably means that the default qdisc was in use, in which
4181 * case we've accomplished our purpose. */
4184 if (!error && netdev->tc) {
4185 if (netdev->tc->ops->tc_destroy) {
4186 netdev->tc->ops->tc_destroy(netdev->tc);
4193 /* If 'netdev''s qdisc type and parameters are not yet known, queries the
4194 * kernel to determine what they are. Returns 0 if successful, otherwise a
4195 * positive errno value. */
4197 tc_query_qdisc(const struct netdev *netdev_)
4199 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4200 struct ofpbuf request, *qdisc;
4201 const struct tc_ops *ops;
4202 struct tcmsg *tcmsg;
4210 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
4211 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
4212 * 2.6.35 without that fix backported to it.
4214 * To avoid the OOPS, we must not make a request that would attempt to dump
4215 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
4216 * few others. There are a few ways that I can see to do this, but most of
4217 * them seem to be racy (and if you lose the race the kernel OOPSes). The
4218 * technique chosen here is to assume that any non-default qdisc that we
4219 * create will have a class with handle 1:0. The built-in qdiscs only have
4220 * a class with handle 0:0.
4222 * We could check for Linux 2.6.35+ and use a more straightforward method
4224 tcmsg = tc_make_request(netdev_, RTM_GETQDISC, NLM_F_ECHO, &request);
4228 tcmsg->tcm_handle = tc_make_handle(1, 0);
4229 tcmsg->tcm_parent = 0;
4231 /* Figure out what tc class to instantiate. */
4232 error = tc_transact(&request, &qdisc);
4236 error = tc_parse_qdisc(qdisc, &kind, NULL);
4238 ops = &tc_ops_other;
4240 ops = tc_lookup_linux_name(kind);
4242 static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
4243 VLOG_INFO_RL(&rl2, "unknown qdisc \"%s\"", kind);
4245 ops = &tc_ops_other;
4248 } else if (error == ENOENT) {
4249 /* Either it's a built-in qdisc, or it's a qdisc set up by some
4250 * other entity that doesn't have a handle 1:0. We will assume
4251 * that it's the system default qdisc. */
4252 ops = &tc_ops_default;
4255 /* Who knows? Maybe the device got deleted. */
4256 VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
4257 netdev_get_name(netdev_), ovs_strerror(error));
4258 ops = &tc_ops_other;
4261 /* Instantiate it. */
4262 load_error = ops->tc_load(CONST_CAST(struct netdev *, netdev_), qdisc);
4263 ovs_assert((load_error == 0) == (netdev->tc != NULL));
4264 ofpbuf_delete(qdisc);
4266 return error ? error : load_error;
4269 /* Linux traffic control uses tables with 256 entries ("rtab" tables) to
4270 approximate the time to transmit packets of various lengths. For an MTU of
4271 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
4272 represents two possible packet lengths; for a MTU of 513 through 1024, four
4273 possible lengths; and so on.
4275 Returns, for the specified 'mtu', the number of bits that packet lengths
4276 need to be shifted right to fit within such a 256-entry table. */
4278 tc_calc_cell_log(unsigned int mtu)
4283 mtu = ETH_PAYLOAD_MAX;
4285 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
4287 for (cell_log = 0; mtu >= 256; cell_log++) {
4294 /* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
4297 tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
4299 memset(rate, 0, sizeof *rate);
4300 rate->cell_log = tc_calc_cell_log(mtu);
4301 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
4302 /* rate->cell_align = 0; */ /* distro headers. */
4303 rate->mpu = ETH_TOTAL_MIN;
/* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
 * attribute of the specified "type".
 *
 * See tc_calc_cell_log() above for a description of "rtab"s. */
static void
tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
{
    uint32_t *rtab;
    unsigned int i;

    rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
    for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
        /* Each slot i covers packets of size (i + 1) << cell_log, clamped
         * below by the minimum packet unit. */
        unsigned packet_size = (i + 1) << rate->cell_log;
        if (packet_size < rate->mpu) {
            packet_size = rate->mpu;
        }
        rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
    }
}
/* Calculates the proper value of 'buffer' or 'cbuffer' in HTB options given a
 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
 * burst size of 'burst_bytes'.  (If no value was requested, a 'burst_bytes' of
 * 0 is fine.)  The result is never smaller than one jiffy's worth of traffic
 * plus one MTU. */
static unsigned int
tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
{
    unsigned int min_burst = tc_buffer_per_jiffy(Bps) + mtu;
    return tc_bytes_to_ticks(Bps, MAX(burst_bytes, min_burst));
}
4338 /* Linux-only functions declared in netdev-linux.h */
4340 /* Modifies the 'flag' bit in ethtool's flags field for 'netdev'. If
4341 * 'enable' is true, the bit is set. Otherwise, it is cleared. */
4343 netdev_linux_ethtool_set_flag(struct netdev *netdev, uint32_t flag,
4344 const char *flag_name, bool enable)
4346 const char *netdev_name = netdev_get_name(netdev);
4347 struct ethtool_value evalue;
4351 COVERAGE_INC(netdev_get_ethtool);
4352 memset(&evalue, 0, sizeof evalue);
4353 error = netdev_linux_do_ethtool(netdev_name,
4354 (struct ethtool_cmd *)&evalue,
4355 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
4360 COVERAGE_INC(netdev_set_ethtool);
4361 evalue.data = new_flags = (evalue.data & ~flag) | (enable ? flag : 0);
4362 error = netdev_linux_do_ethtool(netdev_name,
4363 (struct ethtool_cmd *)&evalue,
4364 ETHTOOL_SFLAGS, "ETHTOOL_SFLAGS");
4369 COVERAGE_INC(netdev_get_ethtool);
4370 memset(&evalue, 0, sizeof evalue);
4371 error = netdev_linux_do_ethtool(netdev_name,
4372 (struct ethtool_cmd *)&evalue,
4373 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
4378 if (new_flags != evalue.data) {
4379 VLOG_WARN_RL(&rl, "attempt to %s ethtool %s flag on network "
4380 "device %s failed", enable ? "enable" : "disable",
4381 flag_name, netdev_name);
4388 /* Utility functions. */
4390 /* Copies 'src' into 'dst', performing format conversion in the process. */
4392 netdev_stats_from_rtnl_link_stats(struct netdev_stats *dst,
4393 const struct rtnl_link_stats *src)
4395 dst->rx_packets = src->rx_packets;
4396 dst->tx_packets = src->tx_packets;
4397 dst->rx_bytes = src->rx_bytes;
4398 dst->tx_bytes = src->tx_bytes;
4399 dst->rx_errors = src->rx_errors;
4400 dst->tx_errors = src->tx_errors;
4401 dst->rx_dropped = src->rx_dropped;
4402 dst->tx_dropped = src->tx_dropped;
4403 dst->multicast = src->multicast;
4404 dst->collisions = src->collisions;
4405 dst->rx_length_errors = src->rx_length_errors;
4406 dst->rx_over_errors = src->rx_over_errors;
4407 dst->rx_crc_errors = src->rx_crc_errors;
4408 dst->rx_frame_errors = src->rx_frame_errors;
4409 dst->rx_fifo_errors = src->rx_fifo_errors;
4410 dst->rx_missed_errors = src->rx_missed_errors;
4411 dst->tx_aborted_errors = src->tx_aborted_errors;
4412 dst->tx_carrier_errors = src->tx_carrier_errors;
4413 dst->tx_fifo_errors = src->tx_fifo_errors;
4414 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
4415 dst->tx_window_errors = src->tx_window_errors;
4419 get_stats_via_netlink(int ifindex, struct netdev_stats *stats)
4421 /* Policy for RTNLGRP_LINK messages.
4423 * There are *many* more fields in these messages, but currently we only
4424 * care about these fields. */
4425 static const struct nl_policy rtnlgrp_link_policy[] = {
4426 [IFLA_IFNAME] = { .type = NL_A_STRING, .optional = false },
4427 [IFLA_STATS] = { .type = NL_A_UNSPEC, .optional = true,
4428 .min_len = sizeof(struct rtnl_link_stats) },
4431 struct ofpbuf request;
4432 struct ofpbuf *reply;
4433 struct ifinfomsg *ifi;
4434 struct nlattr *attrs[ARRAY_SIZE(rtnlgrp_link_policy)];
4437 ofpbuf_init(&request, 0);
4438 nl_msg_put_nlmsghdr(&request, sizeof *ifi, RTM_GETLINK, NLM_F_REQUEST);
4439 ifi = ofpbuf_put_zeros(&request, sizeof *ifi);
4440 ifi->ifi_family = PF_UNSPEC;
4441 ifi->ifi_index = ifindex;
4442 error = nl_transact(NETLINK_ROUTE, &request, &reply);
4443 ofpbuf_uninit(&request);
4448 if (!nl_policy_parse(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg),
4449 rtnlgrp_link_policy,
4450 attrs, ARRAY_SIZE(rtnlgrp_link_policy))) {
4451 ofpbuf_delete(reply);
4455 if (!attrs[IFLA_STATS]) {
4456 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
4457 ofpbuf_delete(reply);
4461 netdev_stats_from_rtnl_link_stats(stats, nl_attr_get(attrs[IFLA_STATS]));
4463 ofpbuf_delete(reply);
4469 get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats)
4471 static const char fn[] = "/proc/net/dev";
4476 stream = fopen(fn, "r");
4478 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, ovs_strerror(errno));
4483 while (fgets(line, sizeof line, stream)) {
4486 #define X64 "%"SCNu64
4489 X64 X64 X64 X64 X64 X64 X64 "%*u"
4490 X64 X64 X64 X64 X64 X64 X64 "%*u",
4496 &stats->rx_fifo_errors,
4497 &stats->rx_frame_errors,
4503 &stats->tx_fifo_errors,
4505 &stats->tx_carrier_errors) != 15) {
4506 VLOG_WARN_RL(&rl, "%s:%d: parse error", fn, ln);
4507 } else if (!strcmp(devname, netdev_name)) {
4508 stats->rx_length_errors = UINT64_MAX;
4509 stats->rx_over_errors = UINT64_MAX;
4510 stats->rx_crc_errors = UINT64_MAX;
4511 stats->rx_missed_errors = UINT64_MAX;
4512 stats->tx_aborted_errors = UINT64_MAX;
4513 stats->tx_heartbeat_errors = UINT64_MAX;
4514 stats->tx_window_errors = UINT64_MAX;
4520 VLOG_WARN_RL(&rl, "%s: no stats for %s", fn, netdev_name);
4526 get_flags(const struct netdev *dev, unsigned int *flags)
4532 error = af_inet_ifreq_ioctl(dev->name, &ifr, SIOCGIFFLAGS, "SIOCGIFFLAGS");
4534 *flags = ifr.ifr_flags;
4540 set_flags(const char *name, unsigned int flags)
4544 ifr.ifr_flags = flags;
4545 return af_inet_ifreq_ioctl(name, &ifr, SIOCSIFFLAGS, "SIOCSIFFLAGS");
4549 do_get_ifindex(const char *netdev_name)
4554 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4555 COVERAGE_INC(netdev_get_ifindex);
4557 error = af_inet_ioctl(SIOCGIFINDEX, &ifr);
4559 VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
4560 netdev_name, ovs_strerror(error));
4563 return ifr.ifr_ifindex;
4567 get_ifindex(const struct netdev *netdev_, int *ifindexp)
4569 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4571 if (!(netdev->cache_valid & VALID_IFINDEX)) {
4572 int ifindex = do_get_ifindex(netdev_get_name(netdev_));
4575 netdev->get_ifindex_error = -ifindex;
4576 netdev->ifindex = 0;
4578 netdev->get_ifindex_error = 0;
4579 netdev->ifindex = ifindex;
4581 netdev->cache_valid |= VALID_IFINDEX;
4584 *ifindexp = netdev->ifindex;
4585 return netdev->get_ifindex_error;
4589 get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN])
4595 memset(&ifr, 0, sizeof ifr);
4596 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4597 COVERAGE_INC(netdev_get_hwaddr);
4598 error = af_inet_ioctl(SIOCGIFHWADDR, &ifr);
4600 /* ENODEV probably means that a vif disappeared asynchronously and
4601 * hasn't been removed from the database yet, so reduce the log level
4602 * to INFO for that case. */
4603 VLOG(error == ENODEV ? VLL_INFO : VLL_ERR,
4604 "ioctl(SIOCGIFHWADDR) on %s device failed: %s",
4605 netdev_name, ovs_strerror(error));
4608 hwaddr_family = ifr.ifr_hwaddr.sa_family;
4609 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER) {
4610 VLOG_WARN("%s device has unknown hardware address family %d",
4611 netdev_name, hwaddr_family);
4613 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
4618 set_etheraddr(const char *netdev_name,
4619 const uint8_t mac[ETH_ADDR_LEN])
4624 memset(&ifr, 0, sizeof ifr);
4625 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4626 ifr.ifr_hwaddr.sa_family = ARPHRD_ETHER;
4627 memcpy(ifr.ifr_hwaddr.sa_data, mac, ETH_ADDR_LEN);
4628 COVERAGE_INC(netdev_set_hwaddr);
4629 error = af_inet_ioctl(SIOCSIFHWADDR, &ifr);
4631 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
4632 netdev_name, ovs_strerror(error));
4638 netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
4639 int cmd, const char *cmd_name)
4644 memset(&ifr, 0, sizeof ifr);
4645 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
4646 ifr.ifr_data = (caddr_t) ecmd;
4649 error = af_inet_ioctl(SIOCETHTOOL, &ifr);
4651 if (error != EOPNOTSUPP) {
4652 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
4653 "failed: %s", cmd_name, name, ovs_strerror(error));
4655 /* The device doesn't support this operation. That's pretty
4656 * common, so there's no point in logging anything. */
4663 netdev_linux_get_ipv4(const struct netdev *netdev, struct in_addr *ip,
4664 int cmd, const char *cmd_name)
4669 ifr.ifr_addr.sa_family = AF_INET;
4670 error = af_inet_ifreq_ioctl(netdev_get_name(netdev), &ifr, cmd, cmd_name);
4672 const struct sockaddr_in *sin = ALIGNED_CAST(struct sockaddr_in *,
4674 *ip = sin->sin_addr;
4679 /* Returns an AF_PACKET raw socket or a negative errno value. */
4681 af_packet_sock(void)
4683 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
4686 if (ovsthread_once_start(&once)) {
4687 sock = socket(AF_PACKET, SOCK_RAW, 0);
4689 int error = set_nonblocking(sock);
4696 VLOG_ERR("failed to create packet socket: %s",
4697 ovs_strerror(errno));
4699 ovsthread_once_done(&once);