2 * Copyright (c) 2009, 2010, 2011, 2012, 2013 Nicira, Inc.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
19 #include "netdev-linux.h"
23 #include <arpa/inet.h>
25 #include <linux/filter.h>
26 #include <linux/gen_stats.h>
27 #include <linux/if_ether.h>
28 #include <linux/if_tun.h>
29 #include <linux/types.h>
30 #include <linux/ethtool.h>
31 #include <linux/mii.h>
32 #include <linux/pkt_cls.h>
33 #include <linux/pkt_sched.h>
34 #include <linux/rtnetlink.h>
35 #include <linux/sockios.h>
36 #include <linux/version.h>
37 #include <sys/types.h>
38 #include <sys/ioctl.h>
39 #include <sys/socket.h>
40 #include <netpacket/packet.h>
42 #include <net/if_arp.h>
43 #include <net/if_packet.h>
44 #include <net/route.h>
45 #include <netinet/in.h>
52 #include "dpif-linux.h"
53 #include "dynamic-string.h"
54 #include "fatal-signal.h"
57 #include "netdev-provider.h"
58 #include "netdev-vport.h"
59 #include "netlink-notifier.h"
60 #include "netlink-socket.h"
63 #include "openflow/openflow.h"
65 #include "poll-loop.h"
66 #include "rtnetlink-link.h"
68 #include "socket-util.h"
71 #include "unaligned.h"
74 VLOG_DEFINE_THIS_MODULE(netdev_linux);
76 COVERAGE_DEFINE(netdev_set_policing);
77 COVERAGE_DEFINE(netdev_arp_lookup);
78 COVERAGE_DEFINE(netdev_get_ifindex);
79 COVERAGE_DEFINE(netdev_get_hwaddr);
80 COVERAGE_DEFINE(netdev_set_hwaddr);
81 COVERAGE_DEFINE(netdev_get_ethtool);
82 COVERAGE_DEFINE(netdev_set_ethtool);
85 /* These were introduced in Linux 2.6.14, so they might be missing if we have
87 #ifndef ADVERTISED_Pause
88 #define ADVERTISED_Pause (1 << 13)
90 #ifndef ADVERTISED_Asym_Pause
91 #define ADVERTISED_Asym_Pause (1 << 14)
94 /* These were introduced in Linux 2.6.24, so they might be missing if we
95 * have old headers. */
96 #ifndef ETHTOOL_GFLAGS
97 #define ETHTOOL_GFLAGS 0x00000025 /* Get flags bitmap(ethtool_value) */
99 #ifndef ETHTOOL_SFLAGS
100 #define ETHTOOL_SFLAGS 0x00000026 /* Set flags bitmap(ethtool_value) */
103 /* This was introduced in Linux 2.6.25, so it might be missing if we have old
106 #define TC_RTAB_SIZE 1024
110 VALID_IFINDEX = 1 << 0,
111 VALID_ETHERADDR = 1 << 1,
115 VALID_POLICING = 1 << 5,
116 VALID_VPORT_STAT_ERROR = 1 << 6,
117 VALID_DRVINFO = 1 << 7,
118 VALID_FEATURES = 1 << 8,
121 /* Traffic control. */
123 /* An instance of a traffic control class. Always associated with a particular
126 * Each TC implementation subclasses this with whatever additional data it
129 const struct tc_ops *ops;
130 struct hmap queues; /* Contains "struct tc_queue"s.
131 * Read by generic TC layer.
132 * Written only by TC implementation. */
135 #define TC_INITIALIZER(TC, OPS) { OPS, HMAP_INITIALIZER(&(TC)->queues) }
137 /* One traffic control queue.
139 * Each TC implementation subclasses this with whatever additional data it
142 struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
143 unsigned int queue_id; /* OpenFlow queue ID. */
144 long long int created; /* Time queue was created, in msecs. */
147 /* A particular kind of traffic control. Each implementation generally maps to
148 * one particular Linux qdisc class.
150 * The functions below return 0 if successful or a positive errno value on
151 * failure, except where otherwise noted. All of them must be provided, except
152 * where otherwise noted. */
154 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
155 * This is null for tc_ops_default and tc_ops_other, for which there are no
156 * appropriate values. */
157 const char *linux_name;
159 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
160 const char *ovs_name;
162 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
163 * queues. The queues are numbered 0 through n_queues - 1. */
164 unsigned int n_queues;
166 /* Called to install this TC class on 'netdev'. The implementation should
167 * make the Netlink calls required to set up 'netdev' with the right qdisc
168 * and configure it according to 'details'. The implementation may assume
169 * that the current qdisc is the default; that is, there is no need for it
170 * to delete the current qdisc before installing itself.
172 * The contents of 'details' should be documented as valid for 'ovs_name'
173 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
174 * (which is built as ovs-vswitchd.conf.db(8)).
176 * This function must return 0 if and only if it sets 'netdev->tc' to an
177 * initialized 'struct tc'.
179 * (This function is null for tc_ops_other, which cannot be installed. For
180 * other TC classes it should always be nonnull.) */
181 int (*tc_install)(struct netdev *netdev, const struct smap *details);
183 /* Called when the netdev code determines (through a Netlink query) that
184 * this TC class's qdisc is installed on 'netdev', but we didn't install
185 * it ourselves and so don't know any of the details.
187 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
188 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
189 * implementation should parse the other attributes of 'nlmsg' as
190 * necessary to determine its configuration. If necessary it should also
191 * use Netlink queries to determine the configuration of queues on
194 * This function must return 0 if and only if it sets 'netdev->tc' to an
195 * initialized 'struct tc'. */
196 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
198 /* Destroys the data structures allocated by the implementation as part of
199 * 'tc'. (This includes destroying 'tc->queues' by calling
202 * The implementation should not need to perform any Netlink calls. If
203 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
204 * (But it may not be desirable.)
206 * This function may be null if 'tc' is trivial. */
207 void (*tc_destroy)(struct tc *tc);
209 /* Retrieves details of 'netdev->tc' configuration into 'details'.
211 * The implementation should not need to perform any Netlink calls, because
212 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
213 * cached the configuration.
215 * The contents of 'details' should be documented as valid for 'ovs_name'
216 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
217 * (which is built as ovs-vswitchd.conf.db(8)).
219 * This function may be null if 'tc' is not configurable.
221 int (*qdisc_get)(const struct netdev *netdev, struct smap *details);
223 /* Reconfigures 'netdev->tc' according to 'details', performing any
224 * required Netlink calls to complete the reconfiguration.
226 * The contents of 'details' should be documented as valid for 'ovs_name'
227 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
228 * (which is built as ovs-vswitchd.conf.db(8)).
230 * This function may be null if 'tc' is not configurable.
232 int (*qdisc_set)(struct netdev *, const struct smap *details);
234 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
235 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
237 * The contents of 'details' should be documented as valid for 'ovs_name'
238 * in the "other_config" column in the "Queue" table in
239 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
241 * The implementation should not need to perform any Netlink calls, because
242 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
243 * cached the queue configuration.
245 * This function may be null if 'tc' does not have queues ('n_queues' is
247 int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
248 struct smap *details);
250 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
251 * 'details', perfoming any required Netlink calls to complete the
252 * reconfiguration. The caller ensures that 'queue_id' is less than
255 * The contents of 'details' should be documented as valid for 'ovs_name'
256 * in the "other_config" column in the "Queue" table in
257 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
259 * This function may be null if 'tc' does not have queues or its queues are
260 * not configurable. */
261 int (*class_set)(struct netdev *, unsigned int queue_id,
262 const struct smap *details);
264 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
265 * tc_queue's within 'netdev->tc->queues'.
267 * This function may be null if 'tc' does not have queues or its queues
268 * cannot be deleted. */
269 int (*class_delete)(struct netdev *, struct tc_queue *queue);
271 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
272 * 'struct tc_queue's within 'netdev->tc->queues'.
274 * On success, initializes '*stats'.
276 * This function may be null if 'tc' does not have queues or if it cannot
277 * report queue statistics. */
278 int (*class_get_stats)(const struct netdev *netdev,
279 const struct tc_queue *queue,
280 struct netdev_queue_stats *stats);
282 /* Extracts queue stats from 'nlmsg', which is a response to a
283 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
285 * This function may be null if 'tc' does not have queues or if it cannot
286 * report queue statistics. */
287 int (*class_dump_stats)(const struct netdev *netdev,
288 const struct ofpbuf *nlmsg,
289 netdev_dump_queue_stats_cb *cb, void *aux);
293 tc_init(struct tc *tc, const struct tc_ops *ops)
296 hmap_init(&tc->queues);
300 tc_destroy(struct tc *tc)
302 hmap_destroy(&tc->queues);
305 static const struct tc_ops tc_ops_htb;
306 static const struct tc_ops tc_ops_hfsc;
307 static const struct tc_ops tc_ops_default;
308 static const struct tc_ops tc_ops_other;
310 static const struct tc_ops *const tcs[] = {
311 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
312 &tc_ops_hfsc, /* Hierarchical fair service curve. */
313 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
314 &tc_ops_other, /* Some other qdisc. */
318 static unsigned int tc_make_handle(unsigned int major, unsigned int minor);
319 static unsigned int tc_get_major(unsigned int handle);
320 static unsigned int tc_get_minor(unsigned int handle);
322 static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
323 static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
324 static unsigned int tc_buffer_per_jiffy(unsigned int rate);
326 static struct tcmsg *tc_make_request(const struct netdev *, int type,
327 unsigned int flags, struct ofpbuf *);
328 static int tc_transact(struct ofpbuf *request, struct ofpbuf **replyp);
329 static int tc_add_del_ingress_qdisc(struct netdev *netdev, bool add);
330 static int tc_add_policer(struct netdev *netdev, int kbits_rate,
333 static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
334 struct nlattr **options);
335 static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
336 struct nlattr **options,
337 struct netdev_queue_stats *);
338 static int tc_query_class(const struct netdev *,
339 unsigned int handle, unsigned int parent,
340 struct ofpbuf **replyp);
341 static int tc_delete_class(const struct netdev *, unsigned int handle);
343 static int tc_del_qdisc(struct netdev *netdev);
344 static int tc_query_qdisc(const struct netdev *netdev);
346 static int tc_calc_cell_log(unsigned int mtu);
347 static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
348 static void tc_put_rtab(struct ofpbuf *, uint16_t type,
349 const struct tc_ratespec *rate);
350 static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
352 struct netdev_linux {
355 /* Protects all members below. */
356 struct ovs_mutex mutex;
358 unsigned int cache_valid;
359 unsigned int change_seq;
361 bool miimon; /* Link status of last poll. */
362 long long int miimon_interval; /* Miimon Poll rate. Disabled if <= 0. */
363 struct timer miimon_timer;
365 /* The following are figured out "on demand" only. They are only valid
366 * when the corresponding VALID_* bit in 'cache_valid' is set. */
368 uint8_t etheraddr[ETH_ADDR_LEN];
369 struct in_addr address, netmask;
372 unsigned int ifi_flags;
373 long long int carrier_resets;
374 uint32_t kbits_rate; /* Policing data. */
375 uint32_t kbits_burst;
376 int vport_stats_error; /* Cached error code from vport_get_stats().
377 0 or an errno value. */
378 int netdev_mtu_error; /* Cached error code from SIOCGIFMTU or SIOCSIFMTU. */
379 int ether_addr_error; /* Cached error code from set/get etheraddr. */
380 int netdev_policing_error; /* Cached error code from set policing. */
381 int get_features_error; /* Cached error code from ETHTOOL_GSET. */
382 int get_ifindex_error; /* Cached error code from SIOCGIFINDEX. */
384 enum netdev_features current; /* Cached from ETHTOOL_GSET. */
385 enum netdev_features advertised; /* Cached from ETHTOOL_GSET. */
386 enum netdev_features supported; /* Cached from ETHTOOL_GSET. */
388 struct ethtool_drvinfo drvinfo; /* Cached from ETHTOOL_GDRVINFO. */
391 /* For devices of class netdev_tap_class only. */
395 struct netdev_rx_linux {
401 /* This is set pretty low because we probably won't learn anything from the
402 * additional log messages. */
403 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
405 static void netdev_linux_run(void);
407 static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
408 int cmd, const char *cmd_name);
409 static int netdev_linux_get_ipv4(const struct netdev *, struct in_addr *,
410 int cmd, const char *cmd_name);
411 static int get_flags(const struct netdev *, unsigned int *flags);
412 static int set_flags(const char *, unsigned int flags);
413 static int update_flags(struct netdev_linux *netdev, enum netdev_flags off,
414 enum netdev_flags on, enum netdev_flags *old_flagsp)
415 OVS_REQUIRES(netdev->mutex);
416 static int do_get_ifindex(const char *netdev_name);
417 static int get_ifindex(const struct netdev *, int *ifindexp);
418 static int do_set_addr(struct netdev *netdev,
419 int ioctl_nr, const char *ioctl_name,
420 struct in_addr addr);
421 static int get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN]);
422 static int set_etheraddr(const char *netdev_name, const uint8_t[ETH_ADDR_LEN]);
423 static int get_stats_via_netlink(int ifindex, struct netdev_stats *stats);
424 static int get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats);
425 static int af_packet_sock(void);
426 static void netdev_linux_miimon_run(void);
427 static void netdev_linux_miimon_wait(void);
430 is_netdev_linux_class(const struct netdev_class *netdev_class)
432 return netdev_class->run == netdev_linux_run;
436 is_tap_netdev(const struct netdev *netdev)
438 return netdev_get_class(netdev) == &netdev_tap_class;
441 static struct netdev_linux *
442 netdev_linux_cast(const struct netdev *netdev)
444 ovs_assert(is_netdev_linux_class(netdev_get_class(netdev)));
446 return CONTAINER_OF(netdev, struct netdev_linux, up);
449 static struct netdev_rx_linux *
450 netdev_rx_linux_cast(const struct netdev_rx *rx)
452 ovs_assert(is_netdev_linux_class(netdev_get_class(rx->netdev)));
453 return CONTAINER_OF(rx, struct netdev_rx_linux, up);
456 static void netdev_linux_update(struct netdev_linux *netdev,
457 const struct rtnetlink_link_change *)
458 OVS_REQUIRES(netdev->mutex);
459 static void netdev_linux_changed(struct netdev_linux *netdev,
460 unsigned int ifi_flags, unsigned int mask)
461 OVS_REQUIRES(netdev->mutex);
463 /* Returns a NETLINK_ROUTE socket listening for RTNLGRP_LINK changes, or NULL
464 * if no such socket could be created. */
465 static struct nl_sock *
466 netdev_linux_notify_sock(void)
468 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
469 static struct nl_sock *sock;
471 if (ovsthread_once_start(&once)) {
474 error = nl_sock_create(NETLINK_ROUTE, &sock);
476 error = nl_sock_join_mcgroup(sock, RTNLGRP_LINK);
478 nl_sock_destroy(sock);
482 ovsthread_once_done(&once);
489 netdev_linux_run(void)
491 struct nl_sock *sock;
494 netdev_linux_miimon_run();
496 sock = netdev_linux_notify_sock();
502 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
503 uint64_t buf_stub[4096 / 8];
506 ofpbuf_use_stub(&buf, buf_stub, sizeof buf_stub);
507 error = nl_sock_recv(sock, &buf, false);
509 struct rtnetlink_link_change change;
511 if (rtnetlink_link_parse(&buf, &change)) {
512 struct netdev *netdev_ = netdev_from_name(change.ifname);
513 if (netdev_ && is_netdev_linux_class(netdev_->netdev_class)) {
514 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
516 ovs_mutex_lock(&netdev->mutex);
517 netdev_linux_update(netdev, &change);
518 ovs_mutex_unlock(&netdev->mutex);
520 netdev_close(netdev_);
522 } else if (error == ENOBUFS) {
523 struct shash device_shash;
524 struct shash_node *node;
528 shash_init(&device_shash);
529 netdev_get_devices(&netdev_linux_class, &device_shash);
530 SHASH_FOR_EACH (node, &device_shash) {
531 struct netdev *netdev_ = node->data;
532 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
535 ovs_mutex_lock(&netdev->mutex);
536 get_flags(netdev_, &flags);
537 netdev_linux_changed(netdev, flags, 0);
538 ovs_mutex_unlock(&netdev->mutex);
540 netdev_close(netdev_);
542 shash_destroy(&device_shash);
543 } else if (error != EAGAIN) {
544 VLOG_WARN_RL(&rl, "error reading or parsing netlink (%s)",
545 ovs_strerror(error));
552 netdev_linux_wait(void)
554 struct nl_sock *sock;
556 netdev_linux_miimon_wait();
557 sock = netdev_linux_notify_sock();
559 nl_sock_wait(sock, POLLIN);
564 netdev_linux_changed(struct netdev_linux *dev,
565 unsigned int ifi_flags, unsigned int mask)
566 OVS_REQUIRES(dev->mutex)
569 if (!dev->change_seq) {
573 if ((dev->ifi_flags ^ ifi_flags) & IFF_RUNNING) {
574 dev->carrier_resets++;
576 dev->ifi_flags = ifi_flags;
578 dev->cache_valid &= mask;
582 netdev_linux_update(struct netdev_linux *dev,
583 const struct rtnetlink_link_change *change)
584 OVS_REQUIRES(dev->mutex)
586 if (change->nlmsg_type == RTM_NEWLINK) {
588 netdev_linux_changed(dev, change->ifi_flags, VALID_DRVINFO);
590 /* Update netdev from rtnl-change msg. */
592 dev->mtu = change->mtu;
593 dev->cache_valid |= VALID_MTU;
594 dev->netdev_mtu_error = 0;
597 if (!eth_addr_is_zero(change->addr)) {
598 memcpy(dev->etheraddr, change->addr, ETH_ADDR_LEN);
599 dev->cache_valid |= VALID_ETHERADDR;
600 dev->ether_addr_error = 0;
603 dev->ifindex = change->ifi_index;
604 dev->cache_valid |= VALID_IFINDEX;
605 dev->get_ifindex_error = 0;
608 netdev_linux_changed(dev, change->ifi_flags, 0);
612 static struct netdev *
613 netdev_linux_alloc(void)
615 struct netdev_linux *netdev = xzalloc(sizeof *netdev);
620 netdev_linux_common_construct(struct netdev_linux *netdev)
622 ovs_mutex_init(&netdev->mutex);
623 netdev->change_seq = 1;
626 /* Creates system and internal devices. */
628 netdev_linux_construct(struct netdev *netdev_)
630 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
633 netdev_linux_common_construct(netdev);
635 error = get_flags(&netdev->up, &netdev->ifi_flags);
636 if (error == ENODEV) {
637 if (netdev->up.netdev_class != &netdev_internal_class) {
638 /* The device does not exist, so don't allow it to be opened. */
641 /* "Internal" netdevs have to be created as netdev objects before
642 * they exist in the kernel, because creating them in the kernel
643 * happens by passing a netdev object to dpif_port_add().
644 * Therefore, ignore the error. */
651 /* For most types of netdevs we open the device for each call of
652 * netdev_open(). However, this is not the case with tap devices,
653 * since it is only possible to open the device once. In this
654 * situation we share a single file descriptor, and consequently
655 * buffers, across all readers. Therefore once data is read it will
656 * be unavailable to other reads for tap devices. */
658 netdev_linux_construct_tap(struct netdev *netdev_)
660 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
661 static const char tap_dev[] = "/dev/net/tun";
662 const char *name = netdev_->name;
666 netdev_linux_common_construct(netdev);
668 /* Open tap device. */
669 netdev->tap_fd = open(tap_dev, O_RDWR);
670 if (netdev->tap_fd < 0) {
672 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, ovs_strerror(error));
676 /* Create tap device. */
677 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
678 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
679 if (ioctl(netdev->tap_fd, TUNSETIFF, &ifr) == -1) {
680 VLOG_WARN("%s: creating tap device failed: %s", name,
681 ovs_strerror(errno));
686 /* Make non-blocking. */
687 error = set_nonblocking(netdev->tap_fd);
695 close(netdev->tap_fd);
700 netdev_linux_destruct(struct netdev *netdev_)
702 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
704 if (netdev->tc && netdev->tc->ops->tc_destroy) {
705 netdev->tc->ops->tc_destroy(netdev->tc);
708 if (netdev_get_class(netdev_) == &netdev_tap_class
709 && netdev->tap_fd >= 0)
711 close(netdev->tap_fd);
714 ovs_mutex_destroy(&netdev->mutex);
/* 'dealloc' callback: frees the memory allocated by netdev_linux_alloc(). */
static void
netdev_linux_dealloc(struct netdev *netdev_)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);

    free(netdev);
}
724 static struct netdev_rx *
725 netdev_linux_rx_alloc(void)
727 struct netdev_rx_linux *rx = xzalloc(sizeof *rx);
732 netdev_linux_rx_construct(struct netdev_rx *rx_)
734 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
735 struct netdev *netdev_ = rx->up.netdev;
736 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
739 ovs_mutex_lock(&netdev->mutex);
740 rx->is_tap = is_tap_netdev(netdev_);
742 rx->fd = netdev->tap_fd;
744 struct sockaddr_ll sll;
746 /* Result of tcpdump -dd inbound */
747 static const struct sock_filter filt[] = {
748 { 0x28, 0, 0, 0xfffff004 }, /* ldh [0] */
749 { 0x15, 0, 1, 0x00000004 }, /* jeq #4 jt 2 jf 3 */
750 { 0x6, 0, 0, 0x00000000 }, /* ret #0 */
751 { 0x6, 0, 0, 0x0000ffff } /* ret #65535 */
753 static const struct sock_fprog fprog = {
754 ARRAY_SIZE(filt), (struct sock_filter *) filt
757 /* Create file descriptor. */
758 rx->fd = socket(PF_PACKET, SOCK_RAW, 0);
761 VLOG_ERR("failed to create raw socket (%s)", ovs_strerror(error));
765 /* Set non-blocking mode. */
766 error = set_nonblocking(rx->fd);
771 /* Get ethernet device index. */
772 error = get_ifindex(&netdev->up, &ifindex);
777 /* Bind to specific ethernet device. */
778 memset(&sll, 0, sizeof sll);
779 sll.sll_family = AF_PACKET;
780 sll.sll_ifindex = ifindex;
781 sll.sll_protocol = (OVS_FORCE unsigned short int) htons(ETH_P_ALL);
782 if (bind(rx->fd, (struct sockaddr *) &sll, sizeof sll) < 0) {
784 VLOG_ERR("%s: failed to bind raw socket (%s)",
785 netdev_get_name(netdev_), ovs_strerror(error));
789 /* Filter for only inbound packets. */
790 error = setsockopt(rx->fd, SOL_SOCKET, SO_ATTACH_FILTER, &fprog,
794 VLOG_ERR("%s: failed to attach filter (%s)",
795 netdev_get_name(netdev_), ovs_strerror(error));
799 ovs_mutex_unlock(&netdev->mutex);
807 ovs_mutex_unlock(&netdev->mutex);
812 netdev_linux_rx_destruct(struct netdev_rx *rx_)
814 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
/* 'rx_dealloc' callback: frees the rx object. */
static void
netdev_linux_rx_dealloc(struct netdev_rx *rx_)
{
    struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);

    free(rx);
}
830 netdev_linux_rx_recv(struct netdev_rx *rx_, void *data, size_t size)
832 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
837 ? read(rx->fd, data, size)
838 : recv(rx->fd, data, size, MSG_TRUNC));
839 } while (retval < 0 && errno == EINTR);
842 return retval > size ? -EMSGSIZE : retval;
844 if (errno != EAGAIN) {
845 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
846 ovs_strerror(errno), netdev_rx_get_name(rx_));
853 netdev_linux_rx_wait(struct netdev_rx *rx_)
855 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
856 poll_fd_wait(rx->fd, POLLIN);
860 netdev_linux_rx_drain(struct netdev_rx *rx_)
862 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
865 int error = af_inet_ifreq_ioctl(netdev_rx_get_name(rx_), &ifr,
866 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
870 drain_fd(rx->fd, ifr.ifr_qlen);
873 return drain_rcvbuf(rx->fd);
877 /* Sends 'buffer' on 'netdev'. Returns 0 if successful, otherwise a positive
878 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
879 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
880 * the packet is too big or too small to transmit on the device.
882 * The caller retains ownership of 'buffer' in all cases.
884 * The kernel maintains a packet transmission queue, so the caller is not
885 * expected to do additional queuing of packets. */
887 netdev_linux_send(struct netdev *netdev_, const void *data, size_t size)
892 if (!is_tap_netdev(netdev_)) {
893 /* Use our AF_PACKET socket to send to this device. */
894 struct sockaddr_ll sll;
900 sock = af_packet_sock();
905 ifindex = netdev_get_ifindex(netdev_);
910 /* We don't bother setting most fields in sockaddr_ll because the
911 * kernel ignores them for SOCK_RAW. */
912 memset(&sll, 0, sizeof sll);
913 sll.sll_family = AF_PACKET;
914 sll.sll_ifindex = ifindex;
916 iov.iov_base = CONST_CAST(void *, data);
920 msg.msg_namelen = sizeof sll;
923 msg.msg_control = NULL;
924 msg.msg_controllen = 0;
927 retval = sendmsg(sock, &msg, 0);
929 /* Use the tap fd to send to this device. This is essential for
930 * tap devices, because packets sent to a tap device with an
931 * AF_PACKET socket will loop back to be *received* again on the
932 * tap device. This doesn't occur on other interface types
933 * because we attach a socket filter to the rx socket. */
934 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
936 retval = write(netdev->tap_fd, data, size);
940 /* The Linux AF_PACKET implementation never blocks waiting for room
941 * for packets, instead returning ENOBUFS. Translate this into
942 * EAGAIN for the caller. */
943 if (errno == ENOBUFS) {
945 } else if (errno == EINTR) {
947 } else if (errno != EAGAIN) {
948 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
949 netdev_get_name(netdev_), ovs_strerror(errno));
952 } else if (retval != size) {
953 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%zd bytes of "
954 "%zu) on %s", retval, size, netdev_get_name(netdev_));
/* Registers with the poll loop to wake up from the next call to poll_block()
 * when the packet transmission queue has sufficient room to transmit a packet
 * with netdev_send().
 *
 * The kernel maintains a packet transmission queue, so the client is not
 * expected to do additional queuing of packets.  Thus, this function is
 * unlikely to ever be used.  It is included for completeness. */
static void
netdev_linux_send_wait(struct netdev *netdev)
{
    if (is_tap_netdev(netdev)) {
        /* TAP device always accepts packets. */
        poll_immediate_wake();
    }
}
978 /* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
979 * otherwise a positive errno value. */
981 netdev_linux_set_etheraddr(struct netdev *netdev_,
982 const uint8_t mac[ETH_ADDR_LEN])
984 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
985 enum netdev_flags old_flags = 0;
988 ovs_mutex_lock(&netdev->mutex);
990 if (netdev->cache_valid & VALID_ETHERADDR) {
991 error = netdev->ether_addr_error;
992 if (error || eth_addr_equals(netdev->etheraddr, mac)) {
995 netdev->cache_valid &= ~VALID_ETHERADDR;
998 /* Tap devices must be brought down before setting the address. */
999 if (is_tap_netdev(netdev_)) {
1000 update_flags(netdev, NETDEV_UP, 0, &old_flags);
1002 error = set_etheraddr(netdev_get_name(netdev_), mac);
1003 if (!error || error == ENODEV) {
1004 netdev->ether_addr_error = error;
1005 netdev->cache_valid |= VALID_ETHERADDR;
1007 memcpy(netdev->etheraddr, mac, ETH_ADDR_LEN);
1011 if (is_tap_netdev(netdev_) && old_flags & NETDEV_UP) {
1012 update_flags(netdev, 0, NETDEV_UP, &old_flags);
1016 ovs_mutex_unlock(&netdev->mutex);
1020 /* Copies 'netdev''s MAC address to 'mac' which is passed as param. */
1022 netdev_linux_get_etheraddr(const struct netdev *netdev_,
1023 uint8_t mac[ETH_ADDR_LEN])
1025 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1028 ovs_mutex_lock(&netdev->mutex);
1029 if (!(netdev->cache_valid & VALID_ETHERADDR)) {
1030 netdev->ether_addr_error = get_etheraddr(netdev_get_name(netdev_),
1032 netdev->cache_valid |= VALID_ETHERADDR;
1035 error = netdev->ether_addr_error;
1037 memcpy(mac, netdev->etheraddr, ETH_ADDR_LEN);
1039 ovs_mutex_unlock(&netdev->mutex);
1045 netdev_linux_get_mtu__(struct netdev_linux *netdev, int *mtup)
1049 if (!(netdev->cache_valid & VALID_MTU)) {
1052 netdev->netdev_mtu_error = af_inet_ifreq_ioctl(
1053 netdev_get_name(&netdev->up), &ifr, SIOCGIFMTU, "SIOCGIFMTU");
1054 netdev->mtu = ifr.ifr_mtu;
1055 netdev->cache_valid |= VALID_MTU;
1058 error = netdev->netdev_mtu_error;
1060 *mtup = netdev->mtu;
1066 /* Returns the maximum size of transmitted (and received) packets on 'netdev',
1067 * in bytes, not including the hardware header; thus, this is typically 1500
1068 * bytes for Ethernet devices. */
1070 netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
1072 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1075 ovs_mutex_lock(&netdev->mutex);
1076 error = netdev_linux_get_mtu__(netdev, mtup);
1077 ovs_mutex_unlock(&netdev->mutex);
1082 /* Sets the maximum size of transmitted (MTU) for given device using linux
1083 * networking ioctl interface.
1086 netdev_linux_set_mtu(const struct netdev *netdev_, int mtu)
1088 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1092 ovs_mutex_lock(&netdev->mutex);
1093 if (netdev->cache_valid & VALID_MTU) {
1094 error = netdev->netdev_mtu_error;
1095 if (error || netdev->mtu == mtu) {
1098 netdev->cache_valid &= ~VALID_MTU;
1101 error = af_inet_ifreq_ioctl(netdev_get_name(netdev_), &ifr,
1102 SIOCSIFMTU, "SIOCSIFMTU");
1103 if (!error || error == ENODEV) {
1104 netdev->netdev_mtu_error = error;
1105 netdev->mtu = ifr.ifr_mtu;
1106 netdev->cache_valid |= VALID_MTU;
1109 ovs_mutex_unlock(&netdev->mutex);
1113 /* Returns the ifindex of 'netdev', if successful, as a positive number.
1114 * On failure, returns a negative errno value. */
1116 netdev_linux_get_ifindex(const struct netdev *netdev_)
1118 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1121 ovs_mutex_lock(&netdev->mutex);
1122 error = get_ifindex(netdev_, &ifindex);
1123 ovs_mutex_unlock(&netdev->mutex);
1125 return error ? -error : ifindex;
1129 netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
1131 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1133 ovs_mutex_lock(&netdev->mutex);
1134 if (netdev->miimon_interval > 0) {
1135 *carrier = netdev->miimon;
1137 *carrier = (netdev->ifi_flags & IFF_RUNNING) != 0;
1139 ovs_mutex_unlock(&netdev->mutex);
1144 static long long int
1145 netdev_linux_get_carrier_resets(const struct netdev *netdev_)
1147 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1148 long long int carrier_resets;
1150 ovs_mutex_lock(&netdev->mutex);
1151 carrier_resets = netdev->carrier_resets;
1152 ovs_mutex_unlock(&netdev->mutex);
1154 return carrier_resets;
/* Issues MII ioctl 'cmd' (named 'cmd_name' for logging) on device 'name',
 * marshalling 'data' through ifr.ifr_data in both directions. */
1158 netdev_linux_do_miimon(const char *name, int cmd, const char *cmd_name,
1159 struct mii_ioctl_data *data)
1164 memset(&ifr, 0, sizeof ifr);
1165 memcpy(&ifr.ifr_data, data, sizeof *data);
1166 error = af_inet_ifreq_ioctl(name, &ifr, cmd, cmd_name);
/* Copy results back out regardless; caller checks 'error'. */
1167 memcpy(data, &ifr.ifr_data, sizeof *data);
/* Determines link status for 'name' via MII registers, falling back to
 * ETHTOOL_GLINK if the MII ioctls fail.  Sets '*miimon' on success. */
1173 netdev_linux_get_miimon(const char *name, bool *miimon)
1175 struct mii_ioctl_data data;
1180 memset(&data, 0, sizeof data);
/* First discover the PHY address, then read the basic-mode status reg. */
1181 error = netdev_linux_do_miimon(name, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
1183 /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
1184 data.reg_num = MII_BMSR;
1185 error = netdev_linux_do_miimon(name, SIOCGMIIREG, "SIOCGMIIREG",
/* BMSR_LSTATUS is the link-up bit of the MII status register. */
1189 *miimon = !!(data.val_out & BMSR_LSTATUS);
1191 VLOG_WARN_RL(&rl, "%s: failed to query MII", name);
/* Fallback path: some drivers lack MII support but answer ethtool. */
1194 struct ethtool_cmd ecmd;
1196 VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
1199 COVERAGE_INC(netdev_get_ethtool);
1200 memset(&ecmd, 0, sizeof ecmd);
1201 error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
/* ETHTOOL_GLINK replies with a struct ethtool_value overlaid on ecmd. */
1204 struct ethtool_value eval;
1206 memcpy(&eval, &ecmd, sizeof eval);
1207 *miimon = !!eval.data;
1209 VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
/* Sets the MII-monitoring polling interval for 'netdev_' (ms).  A positive
 * interval is clamped up to at least 100 ms; <= 0 disables polling. */
1217 netdev_linux_set_miimon_interval(struct netdev *netdev_,
1218 long long int interval)
1220 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1222 ovs_mutex_lock(&netdev->mutex);
1223 interval = interval > 0 ? MAX(interval, 100) : 0;
1224 if (netdev->miimon_interval != interval) {
1225 netdev->miimon_interval = interval;
/* Force an immediate poll on the next miimon_run() pass. */
1226 timer_set_expired(&netdev->miimon_timer);
1228 ovs_mutex_unlock(&netdev->mutex);
/* Polls MII link status for every open netdev-linux device whose miimon
 * timer has expired, recording changes via netdev_linux_changed(). */
1234 netdev_linux_miimon_run(void)
1236 struct shash device_shash;
1237 struct shash_node *node;
1239 shash_init(&device_shash);
/* Snapshot all devices of this class; each entry holds an open reference
 * that must be released with netdev_close() below. */
1240 netdev_get_devices(&netdev_linux_class, &device_shash);
1241 SHASH_FOR_EACH (node, &device_shash) {
1242 struct netdev *netdev = node->data;
1243 struct netdev_linux *dev = netdev_linux_cast(netdev);
1246 ovs_mutex_lock(&dev->mutex);
1247 if (dev->miimon_interval > 0 && timer_expired(&dev->miimon_timer)) {
1248 netdev_linux_get_miimon(dev->up.name, &miimon);
1249 if (miimon != dev->miimon) {
1250 dev->miimon = miimon;
/* Bump change_seq etc. so waiters notice the link transition. */
1251 netdev_linux_changed(dev, dev->ifi_flags, 0);
/* Re-arm the timer for the next polling period. */
1254 timer_set_duration(&dev->miimon_timer, dev->miimon_interval);
1256 ovs_mutex_unlock(&dev->mutex);
1257 netdev_close(netdev);
1260 shash_destroy(&device_shash);
/* Registers poll-loop wakeups for every device with miimon enabled, so the
 * main loop wakes in time for the next netdev_linux_miimon_run() pass. */
1264 netdev_linux_miimon_wait(void)
1266 struct shash device_shash;
1267 struct shash_node *node;
1269 shash_init(&device_shash);
1270 netdev_get_devices(&netdev_linux_class, &device_shash);
1271 SHASH_FOR_EACH (node, &device_shash) {
1272 struct netdev *netdev = node->data;
1273 struct netdev_linux *dev = netdev_linux_cast(netdev);
1275 ovs_mutex_lock(&dev->mutex);
1276 if (dev->miimon_interval > 0) {
1277 timer_wait(&dev->miimon_timer);
1279 ovs_mutex_unlock(&dev->mutex);
1280 netdev_close(netdev);
1282 shash_destroy(&device_shash);
1285 /* Check whether we can we use RTM_GETLINK to get network device statistics.
1286  * In pre-2.6.19 kernels, this was only available if wireless extensions were
/* Probes by fetching stats for the loopback device; returns nonzero (true)
 * if the netlink path works, zero to fall back to /proc. */
1289 check_for_working_netlink_stats(void)
1291 /* Decide on the netdev_get_stats() implementation to use.  Netlink is
1292  * preferable, so if that works, we'll use it. */
1293 int ifindex = do_get_ifindex("lo");
1295 VLOG_WARN("failed to get ifindex for lo, "
1296 "obtaining netdev stats from proc");
1299 struct netdev_stats stats;
1300 int error = get_stats_via_netlink(ifindex, &stats);
1302 VLOG_DBG("obtaining netdev stats via rtnetlink");
1305 VLOG_INFO("RTM_GETLINK failed (%s), obtaining netdev stats "
1306 "via proc (you are probably running a pre-2.6.19 "
1307 "kernel)", ovs_strerror(error));
/* Exchanges the values of '*a' and '*b' (body elided from this extract). */
1314 swap_uint64(uint64_t *a, uint64_t *b)
1321 /* Copies 'src' into 'dst', performing format conversion in the process.
1323  * 'src' is allowed to be misaligned. */
1325 netdev_stats_from_ovs_vport_stats(struct netdev_stats *dst,
1326 const struct ovs_vport_stats *src)
/* get_unaligned_u64() reads each field safely even if 'src' is not
 * 8-byte aligned (it arrives embedded in a netlink message). */
1328 dst->rx_packets = get_unaligned_u64(&src->rx_packets);
1329 dst->tx_packets = get_unaligned_u64(&src->tx_packets);
1330 dst->rx_bytes = get_unaligned_u64(&src->rx_bytes);
1331 dst->tx_bytes = get_unaligned_u64(&src->tx_bytes);
1332 dst->rx_errors = get_unaligned_u64(&src->rx_errors);
1333 dst->tx_errors = get_unaligned_u64(&src->tx_errors);
1334 dst->rx_dropped = get_unaligned_u64(&src->rx_dropped);
1335 dst->tx_dropped = get_unaligned_u64(&src->tx_dropped);
/* The vport layer does not track the detailed error counters below, so
 * they are reported as zero rather than left uninitialized. */
1337 dst->collisions = 0;
1338 dst->rx_length_errors = 0;
1339 dst->rx_over_errors = 0;
1340 dst->rx_crc_errors = 0;
1341 dst->rx_frame_errors = 0;
1342 dst->rx_fifo_errors = 0;
1343 dst->rx_missed_errors = 0;
1344 dst->tx_aborted_errors = 0;
1345 dst->tx_carrier_errors = 0;
1346 dst->tx_fifo_errors = 0;
1347 dst->tx_heartbeat_errors = 0;
1348 dst->tx_window_errors = 0;
/* Fetches 'netdev''s stats from the OVS datapath vport layer into '*stats'.
 * Returns 0 on success, a positive errno value otherwise. */
1352 get_stats_via_vport__(const struct netdev *netdev, struct netdev_stats *stats)
1354 struct dpif_linux_vport reply;
1358 error = dpif_linux_vport_get(netdev_get_name(netdev), &reply, &buf);
/* A reply without a stats attribute is treated as "no stats available". */
1361 } else if (!reply.stats) {
1366 netdev_stats_from_ovs_vport_stats(stats, reply.stats);
/* Wrapper that retries get_stats_via_vport__() and caches the resulting
 * error in netdev->vport_stats_error (VALID_VPORT_STAT_ERROR gates the
 * cache).  ENOENT (device not attached to a datapath) is not logged. */
1374 get_stats_via_vport(const struct netdev *netdev_,
1375 struct netdev_stats *stats)
1377 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1379 if (!netdev->vport_stats_error ||
1380 !(netdev->cache_valid & VALID_VPORT_STAT_ERROR)) {
1383 error = get_stats_via_vport__(netdev_, stats);
1384 if (error && error != ENOENT) {
1385 VLOG_WARN_RL(&rl, "%s: obtaining netdev stats via vport failed "
1387 netdev_get_name(netdev_), ovs_strerror(error));
1389 netdev->vport_stats_error = error;
1390 netdev->cache_valid |= VALID_VPORT_STAT_ERROR;
/* Fetches kernel-level stats for 'netdev_', choosing between rtnetlink and
 * /proc once per process (decided lazily under an ovsthread_once). */
1395 netdev_linux_sys_get_stats(const struct netdev *netdev_,
1396 struct netdev_stats *stats)
1398 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
1399 static int use_netlink_stats;
1402 if (ovsthread_once_start(&once)) {
1403 use_netlink_stats = check_for_working_netlink_stats();
1404 ovsthread_once_done(&once);
1407 if (use_netlink_stats) {
1410 error = get_ifindex(netdev_, &ifindex);
1412 error = get_stats_via_netlink(ifindex, stats);
1415 error = get_stats_via_proc(netdev_get_name(netdev_), stats);
1419 VLOG_WARN_RL(&rl, "%s: linux-sys get stats failed %d",
1420 netdev_get_name(netdev_), error);
1426 /* Retrieves current device stats for 'netdev-linux'. */
1428 netdev_linux_get_stats(const struct netdev *netdev_,
1429 struct netdev_stats *stats)
1431 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1432 struct netdev_stats dev_stats;
1435 ovs_mutex_lock(&netdev->mutex);
/* Prefer vport (datapath) stats; also read kernel stats so the detailed
 * error counters can be merged in below. */
1436 get_stats_via_vport(netdev_, stats);
1437 error = netdev_linux_sys_get_stats(netdev_, &dev_stats);
1439 if (!netdev->vport_stats_error) {
1442 } else if (netdev->vport_stats_error) {
1443 /* stats not available from OVS then use ioctl stats. */
/* Merge path: add kernel error/detail counters onto the vport totals. */
1446 stats->rx_errors += dev_stats.rx_errors;
1447 stats->tx_errors += dev_stats.tx_errors;
1448 stats->rx_dropped += dev_stats.rx_dropped;
1449 stats->tx_dropped += dev_stats.tx_dropped;
1450 stats->multicast += dev_stats.multicast;
1451 stats->collisions += dev_stats.collisions;
1452 stats->rx_length_errors += dev_stats.rx_length_errors;
1453 stats->rx_over_errors += dev_stats.rx_over_errors;
1454 stats->rx_crc_errors += dev_stats.rx_crc_errors;
1455 stats->rx_frame_errors += dev_stats.rx_frame_errors;
1456 stats->rx_fifo_errors += dev_stats.rx_fifo_errors;
1457 stats->rx_missed_errors += dev_stats.rx_missed_errors;
1458 stats->tx_aborted_errors += dev_stats.tx_aborted_errors;
1459 stats->tx_carrier_errors += dev_stats.tx_carrier_errors;
1460 stats->tx_fifo_errors += dev_stats.tx_fifo_errors;
1461 stats->tx_heartbeat_errors += dev_stats.tx_heartbeat_errors;
1462 stats->tx_window_errors += dev_stats.tx_window_errors;
1464 ovs_mutex_unlock(&netdev->mutex);
1469 /* Retrieves current device stats for 'netdev-tap' netdev or
1470  * netdev-internal. */
1472 netdev_tap_get_stats(const struct netdev *netdev_, struct netdev_stats *stats)
1474 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1475 struct netdev_stats dev_stats;
1478 ovs_mutex_lock(&netdev->mutex);
1479 get_stats_via_vport(netdev_, stats);
1480 error = netdev_linux_sys_get_stats(netdev_, &dev_stats);
1482 if (!netdev->vport_stats_error) {
1485 } else if (netdev->vport_stats_error) {
1486 /* Transmit and receive stats will appear to be swapped relative to the
1487  * other ports since we are the one sending the data, not a remote
1488  * computer.  For consistency, we swap them back here. This does not
1489  * apply if we are getting stats from the vport layer because it always
1490  * tracks stats from the perspective of the switch. */
1493 swap_uint64(&stats->rx_packets, &stats->tx_packets);
1494 swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
1495 swap_uint64(&stats->rx_errors, &stats->tx_errors);
1496 swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
/* Detailed error counters are meaningless for a tap from the switch's
 * viewpoint, so they are zeroed rather than swapped. */
1497 stats->rx_length_errors = 0;
1498 stats->rx_over_errors = 0;
1499 stats->rx_crc_errors = 0;
1500 stats->rx_frame_errors = 0;
1501 stats->rx_fifo_errors = 0;
1502 stats->rx_missed_errors = 0;
1503 stats->tx_aborted_errors = 0;
1504 stats->tx_carrier_errors = 0;
1505 stats->tx_fifo_errors = 0;
1506 stats->tx_heartbeat_errors = 0;
1507 stats->tx_window_errors = 0;
/* Merge path when vport stats exist: note rx/tx are crossed on purpose,
 * for the same perspective reason documented above. */
1509 stats->rx_dropped += dev_stats.tx_dropped;
1510 stats->tx_dropped += dev_stats.rx_dropped;
1512 stats->rx_errors += dev_stats.tx_errors;
1513 stats->tx_errors += dev_stats.rx_errors;
1515 stats->multicast += dev_stats.multicast;
1516 stats->collisions += dev_stats.collisions;
1518 ovs_mutex_unlock(&netdev->mutex);
/* Retrieves stats for an internal device purely from the vport layer;
 * returns the cached vport_stats_error as the result. */
1524 netdev_internal_get_stats(const struct netdev *netdev_,
1525 struct netdev_stats *stats)
1527 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1530 ovs_mutex_lock(&netdev->mutex);
1531 get_stats_via_vport(netdev_, stats);
1532 error = netdev->vport_stats_error;
1533 ovs_mutex_unlock(&netdev->mutex);
/* Pushes 'stats' down into the datapath vport via an OVS_VPORT_CMD_SET
 * netlink transaction.  ENODEV (vport not attached) is not an error. */
1539 netdev_internal_set_stats(struct netdev *netdev,
1540 const struct netdev_stats *stats)
1542 struct ovs_vport_stats vport_stats;
1543 struct dpif_linux_vport vport;
/* Translate the generic netdev_stats fields to the vport wire format. */
1546 vport_stats.rx_packets = stats->rx_packets;
1547 vport_stats.tx_packets = stats->tx_packets;
1548 vport_stats.rx_bytes = stats->rx_bytes;
1549 vport_stats.tx_bytes = stats->tx_bytes;
1550 vport_stats.rx_errors = stats->rx_errors;
1551 vport_stats.tx_errors = stats->tx_errors;
1552 vport_stats.rx_dropped = stats->rx_dropped;
1553 vport_stats.tx_dropped = stats->tx_dropped;
1555 dpif_linux_vport_init(&vport);
1556 vport.cmd = OVS_VPORT_CMD_SET;
1557 vport.name = netdev_get_name(netdev);
1558 vport.stats = &vport_stats;
1560 err = dpif_linux_vport_transact(&vport, NULL, NULL);
1562 /* If the vport layer doesn't know about the device, that doesn't mean it
1563  * doesn't exist (after all were able to open it when netdev_open() was
1564  * called), it just means that it isn't attached and we'll be getting
1565  * stats a different way. */
1566 if (err == ENODEV) {
/* Reads link features for 'netdev' via ETHTOOL_GSET and caches them in
 * netdev->supported / ->advertised / ->current, translating each ethtool
 * bit to the corresponding NETDEV_F_* flag.  Results (and any error) are
 * cached under VALID_FEATURES so the ioctl runs at most once per cache
 * invalidation. */
1574 netdev_linux_read_features(struct netdev_linux *netdev)
1576 struct ethtool_cmd ecmd;
/* Fast path: features already cached. */
1580 if (netdev->cache_valid & VALID_FEATURES) {
1584 COVERAGE_INC(netdev_get_ethtool);
1585 memset(&ecmd, 0, sizeof ecmd);
1586 error = netdev_linux_do_ethtool(netdev->up.name, &ecmd,
1587 ETHTOOL_GSET, "ETHTOOL_GSET");
1592 /* Supported features. */
1593 netdev->supported = 0;
1594 if (ecmd.supported & SUPPORTED_10baseT_Half) {
1595 netdev->supported |= NETDEV_F_10MB_HD;
1597 if (ecmd.supported & SUPPORTED_10baseT_Full) {
1598 netdev->supported |= NETDEV_F_10MB_FD;
1600 if (ecmd.supported & SUPPORTED_100baseT_Half) {
1601 netdev->supported |= NETDEV_F_100MB_HD;
1603 if (ecmd.supported & SUPPORTED_100baseT_Full) {
1604 netdev->supported |= NETDEV_F_100MB_FD;
1606 if (ecmd.supported & SUPPORTED_1000baseT_Half) {
1607 netdev->supported |= NETDEV_F_1GB_HD;
1609 if (ecmd.supported & SUPPORTED_1000baseT_Full) {
1610 netdev->supported |= NETDEV_F_1GB_FD;
1612 if (ecmd.supported & SUPPORTED_10000baseT_Full) {
1613 netdev->supported |= NETDEV_F_10GB_FD;
1615 if (ecmd.supported & SUPPORTED_TP) {
1616 netdev->supported |= NETDEV_F_COPPER;
1618 if (ecmd.supported & SUPPORTED_FIBRE) {
1619 netdev->supported |= NETDEV_F_FIBER;
1621 if (ecmd.supported & SUPPORTED_Autoneg) {
1622 netdev->supported |= NETDEV_F_AUTONEG;
1624 if (ecmd.supported & SUPPORTED_Pause) {
1625 netdev->supported |= NETDEV_F_PAUSE;
1627 if (ecmd.supported & SUPPORTED_Asym_Pause) {
1628 netdev->supported |= NETDEV_F_PAUSE_ASYM;
1631 /* Advertised features. */
1632 netdev->advertised = 0;
1633 if (ecmd.advertising & ADVERTISED_10baseT_Half) {
1634 netdev->advertised |= NETDEV_F_10MB_HD;
1636 if (ecmd.advertising & ADVERTISED_10baseT_Full) {
1637 netdev->advertised |= NETDEV_F_10MB_FD;
1639 if (ecmd.advertising & ADVERTISED_100baseT_Half) {
1640 netdev->advertised |= NETDEV_F_100MB_HD;
1642 if (ecmd.advertising & ADVERTISED_100baseT_Full) {
1643 netdev->advertised |= NETDEV_F_100MB_FD;
1645 if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
1646 netdev->advertised |= NETDEV_F_1GB_HD;
1648 if (ecmd.advertising & ADVERTISED_1000baseT_Full) {
1649 netdev->advertised |= NETDEV_F_1GB_FD;
1651 if (ecmd.advertising & ADVERTISED_10000baseT_Full) {
1652 netdev->advertised |= NETDEV_F_10GB_FD;
1654 if (ecmd.advertising & ADVERTISED_TP) {
1655 netdev->advertised |= NETDEV_F_COPPER;
1657 if (ecmd.advertising & ADVERTISED_FIBRE) {
1658 netdev->advertised |= NETDEV_F_FIBER;
1660 if (ecmd.advertising & ADVERTISED_Autoneg) {
1661 netdev->advertised |= NETDEV_F_AUTONEG;
1663 if (ecmd.advertising & ADVERTISED_Pause) {
1664 netdev->advertised |= NETDEV_F_PAUSE;
1666 if (ecmd.advertising & ADVERTISED_Asym_Pause) {
1667 netdev->advertised |= NETDEV_F_PAUSE_ASYM;
1670 /* Current settings. */
/* Speeds above 10G have no SPEED_* macro on older kernels, hence the
 * raw 40000/100000/1000000 literals (Mb/s). */
1672 if (speed == SPEED_10) {
1673 netdev->current = ecmd.duplex ? NETDEV_F_10MB_FD : NETDEV_F_10MB_HD;
1674 } else if (speed == SPEED_100) {
1675 netdev->current = ecmd.duplex ? NETDEV_F_100MB_FD : NETDEV_F_100MB_HD;
1676 } else if (speed == SPEED_1000) {
1677 netdev->current = ecmd.duplex ? NETDEV_F_1GB_FD : NETDEV_F_1GB_HD;
1678 } else if (speed == SPEED_10000) {
1679 netdev->current = NETDEV_F_10GB_FD;
1680 } else if (speed == 40000) {
1681 netdev->current = NETDEV_F_40GB_FD;
1682 } else if (speed == 100000) {
1683 netdev->current = NETDEV_F_100GB_FD;
1684 } else if (speed == 1000000) {
1685 netdev->current = NETDEV_F_1TB_FD;
1687 netdev->current = 0;
1690 if (ecmd.port == PORT_TP) {
1691 netdev->current |= NETDEV_F_COPPER;
1692 } else if (ecmd.port == PORT_FIBRE) {
1693 netdev->current |= NETDEV_F_FIBER;
1697 netdev->current |= NETDEV_F_AUTONEG;
/* Cache outcome (including error) so callers read it consistently. */
1701 netdev->cache_valid |= VALID_FEATURES;
1702 netdev->get_features_error = error;
1705 /* Stores the features supported by 'netdev' into of '*current', '*advertised',
1706  * '*supported', and '*peer'.  Each value is a bitmap of NETDEV_* bits.
1707  * Returns 0 if successful, otherwise a positive errno value. */
1709 netdev_linux_get_features(const struct netdev *netdev_,
1710 enum netdev_features *current,
1711 enum netdev_features *advertised,
1712 enum netdev_features *supported,
1713 enum netdev_features *peer)
1715 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1718 ovs_mutex_lock(&netdev->mutex);
/* Populate (or reuse) the cached feature bitmaps. */
1719 netdev_linux_read_features(netdev);
1720 if (!netdev->get_features_error) {
1721 *current = netdev->current;
1722 *advertised = netdev->advertised;
1723 *supported = netdev->supported;
/* Peer advertisements are not read from ethtool here. */
1724 *peer = 0;              /* XXX */
1726 error = netdev->get_features_error;
1727 ovs_mutex_unlock(&netdev->mutex);
1732 /* Set the features advertised by 'netdev' to 'advertise'. */
1734 netdev_linux_set_advertisements(struct netdev *netdev_,
1735 enum netdev_features advertise)
1737 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1738 struct ethtool_cmd ecmd;
1741 ovs_mutex_lock(&netdev->mutex);
/* Read-modify-write: fetch current settings, replace only the
 * advertising mask, then push the result back with ETHTOOL_SSET. */
1743 COVERAGE_INC(netdev_get_ethtool);
1744 memset(&ecmd, 0, sizeof ecmd);
1745 error = netdev_linux_do_ethtool(netdev_get_name(netdev_), &ecmd,
1746 ETHTOOL_GSET, "ETHTOOL_GSET");
1751 ecmd.advertising = 0;
1752 if (advertise & NETDEV_F_10MB_HD) {
1753 ecmd.advertising |= ADVERTISED_10baseT_Half;
1755 if (advertise & NETDEV_F_10MB_FD) {
1756 ecmd.advertising |= ADVERTISED_10baseT_Full;
1758 if (advertise & NETDEV_F_100MB_HD) {
1759 ecmd.advertising |= ADVERTISED_100baseT_Half;
1761 if (advertise & NETDEV_F_100MB_FD) {
1762 ecmd.advertising |= ADVERTISED_100baseT_Full;
1764 if (advertise & NETDEV_F_1GB_HD) {
1765 ecmd.advertising |= ADVERTISED_1000baseT_Half;
1767 if (advertise & NETDEV_F_1GB_FD) {
1768 ecmd.advertising |= ADVERTISED_1000baseT_Full;
1770 if (advertise & NETDEV_F_10GB_FD) {
1771 ecmd.advertising |= ADVERTISED_10000baseT_Full;
1773 if (advertise & NETDEV_F_COPPER) {
1774 ecmd.advertising |= ADVERTISED_TP;
1776 if (advertise & NETDEV_F_FIBER) {
1777 ecmd.advertising |= ADVERTISED_FIBRE;
1779 if (advertise & NETDEV_F_AUTONEG) {
1780 ecmd.advertising |= ADVERTISED_Autoneg;
1782 if (advertise & NETDEV_F_PAUSE) {
1783 ecmd.advertising |= ADVERTISED_Pause;
1785 if (advertise & NETDEV_F_PAUSE_ASYM) {
1786 ecmd.advertising |= ADVERTISED_Asym_Pause;
1788 COVERAGE_INC(netdev_set_ethtool);
1789 error = netdev_linux_do_ethtool(netdev_get_name(netdev_), &ecmd,
1790 ETHTOOL_SSET, "ETHTOOL_SSET");
1793 ovs_mutex_unlock(&netdev->mutex);
1797 /* Attempts to set input rate limiting (policing) policy.  Returns 0 if
1798  * successful, otherwise a positive errno value. */
1800 netdev_linux_set_policing(struct netdev *netdev_,
1801 uint32_t kbits_rate, uint32_t kbits_burst)
1803 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1804 const char *netdev_name = netdev_get_name(netdev_);
/* Normalize the burst: 0 when policing is off, a 1000-kbit default when
 * the caller gave a rate but no burst, otherwise as specified. */
1807 kbits_burst = (!kbits_rate ? 0       /* Force to 0 if no rate specified. */
1808 : !kbits_burst ? 1000 /* Default to 1000 kbits if 0. */
1809 : kbits_burst);       /* Stick with user-specified value. */
1811 ovs_mutex_lock(&netdev->mutex);
/* Short-circuit when the cached policing config already matches. */
1812 if (netdev->cache_valid & VALID_POLICING) {
1813 error = netdev->netdev_policing_error;
1814 if (error || (netdev->kbits_rate == kbits_rate &&
1815 netdev->kbits_burst == kbits_burst)) {
1816 /* Assume that settings haven't changed since we last set them. */
1819 netdev->cache_valid &= ~VALID_POLICING;
1822 COVERAGE_INC(netdev_set_policing);
1823 /* Remove any existing ingress qdisc. */
1824 error = tc_add_del_ingress_qdisc(netdev_, false);
1826 VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
1827 netdev_name, ovs_strerror(error));
/* With a nonzero rate: install a fresh ingress qdisc, then attach the
 * policer action to it. */
1832 error = tc_add_del_ingress_qdisc(netdev_, true);
1834 VLOG_WARN_RL(&rl, "%s: adding policing qdisc failed: %s",
1835 netdev_name, ovs_strerror(error));
1839 error = tc_add_policer(netdev_, kbits_rate, kbits_burst);
1841 VLOG_WARN_RL(&rl, "%s: adding policing action failed: %s",
1842 netdev_name, ovs_strerror(error));
1847 netdev->kbits_rate = kbits_rate;
1848 netdev->kbits_burst = kbits_burst;
/* Cache the outcome; ENODEV is cached too since it is persistent. */
1851 if (!error || error == ENODEV) {
1852 netdev->netdev_policing_error = error;
1853 netdev->cache_valid |= VALID_POLICING;
1855 ovs_mutex_unlock(&netdev->mutex);
/* Adds to 'types' the OVS name of every installable traffic-control
 * implementation in the global 'tcs' table. */
1860 netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
1863 const struct tc_ops *const *opsp;
1865 for (opsp = tcs; *opsp != NULL; opsp++) {
1866 const struct tc_ops *ops = *opsp;
/* Skip internal-only entries (empty ovs_name or no installer). */
1867 if (ops->tc_install && ops->ovs_name[0] != '\0') {
1868 sset_add(types, ops->ovs_name);
/* Finds the tc_ops whose OVS-facing name is 'name', or NULL (the NULL
 * return path is in lines elided from this extract). */
1874 static const struct tc_ops *
1875 tc_lookup_ovs_name(const char *name)
1877 const struct tc_ops *const *opsp;
1879 for (opsp = tcs; *opsp != NULL; opsp++) {
1880 const struct tc_ops *ops = *opsp;
1881 if (!strcmp(name, ops->ovs_name)) {
/* Finds the tc_ops whose Linux qdisc name is 'name'; entries with a NULL
 * linux_name are skipped. */
1888 static const struct tc_ops *
1889 tc_lookup_linux_name(const char *name)
1891 const struct tc_ops *const *opsp;
1893 for (opsp = tcs; *opsp != NULL; opsp++) {
1894 const struct tc_ops *ops = *opsp;
1895 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
/* Looks up queue 'queue_id' in 'netdev_''s queue hash using precomputed
 * 'hash'; returns the queue or NULL. */
1902 static struct tc_queue *
1903 tc_find_queue__(const struct netdev *netdev_, unsigned int queue_id,
1906 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1907 struct tc_queue *queue;
1909 HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev->tc->queues) {
1910 if (queue->queue_id == queue_id) {
/* Convenience wrapper that hashes 'queue_id' itself. */
1917 static struct tc_queue *
1918 tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
1920 return tc_find_queue__(netdev, queue_id, hash_int(queue_id, 0));
/* Reports the queue capabilities of QoS type 'type' (looked up by OVS
 * name) into '*caps'. */
1924 netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
1926 struct netdev_qos_capabilities *caps)
1928 const struct tc_ops *ops = tc_lookup_ovs_name(type);
1932 caps->n_queues = ops->n_queues;
/* Reports the currently-installed QoS type name in '*typep' and its
 * configuration in 'details', querying the kernel qdisc first. */
1937 netdev_linux_get_qos(const struct netdev *netdev_,
1938 const char **typep, struct smap *details)
1940 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1943 ovs_mutex_lock(&netdev->mutex);
/* tc_query_qdisc() populates netdev->tc from the kernel if needed. */
1944 error = tc_query_qdisc(netdev_);
1946 *typep = netdev->tc->ops->ovs_name;
1947 error = (netdev->tc->ops->qdisc_get
1948 ? netdev->tc->ops->qdisc_get(netdev_, details)
1951 ovs_mutex_unlock(&netdev->mutex);
/* Replaces 'netdev_''s QoS configuration with type 'type' configured from
 * 'details'.  Reconfigures in place when the type is unchanged, otherwise
 * deletes the old qdisc and installs the new one. */
1957 netdev_linux_set_qos(struct netdev *netdev_,
1958 const char *type, const struct smap *details)
1960 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1961 const struct tc_ops *new_ops;
1964 new_ops = tc_lookup_ovs_name(type);
1965 if (!new_ops || !new_ops->tc_install) {
1969 ovs_mutex_lock(&netdev->mutex);
1970 error = tc_query_qdisc(netdev_);
1975 if (new_ops == netdev->tc->ops) {
/* Same implementation: just update its parameters. */
1976 error = new_ops->qdisc_set ? new_ops->qdisc_set(netdev_, details) : 0;
1978 /* Delete existing qdisc. */
1979 error = tc_del_qdisc(netdev_);
1983 ovs_assert(netdev->tc == NULL);
1985 /* Install new qdisc. */
1986 error = new_ops->tc_install(netdev_, details);
/* tc_install must leave netdev->tc set iff it succeeded. */
1987 ovs_assert((error == 0) == (netdev->tc != NULL));
1991 ovs_mutex_unlock(&netdev->mutex);
/* Fetches configuration of queue 'queue_id' into 'details'; fails when the
 * queue does not exist or the QoS type has no class_get. */
1996 netdev_linux_get_queue(const struct netdev *netdev_,
1997 unsigned int queue_id, struct smap *details)
1999 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2002 ovs_mutex_lock(&netdev->mutex);
2003 error = tc_query_qdisc(netdev_);
2005 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2007 ? netdev->tc->ops->class_get(netdev_, queue, details)
2010 ovs_mutex_unlock(&netdev->mutex);
/* Creates or modifies queue 'queue_id' from 'details'; the id must be
 * below the implementation's n_queues limit. */
2016 netdev_linux_set_queue(struct netdev *netdev_,
2017 unsigned int queue_id, const struct smap *details)
2019 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2022 ovs_mutex_lock(&netdev->mutex);
2023 error = tc_query_qdisc(netdev_);
2025 error = (queue_id < netdev->tc->ops->n_queues
2026 && netdev->tc->ops->class_set
2027 ? netdev->tc->ops->class_set(netdev_, queue_id, details)
2030 ovs_mutex_unlock(&netdev->mutex);
/* Deletes queue 'queue_id' if the QoS implementation supports deletion and
 * the queue exists. */
2036 netdev_linux_delete_queue(struct netdev *netdev_, unsigned int queue_id)
2038 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2041 ovs_mutex_lock(&netdev->mutex);
2042 error = tc_query_qdisc(netdev_);
2044 if (netdev->tc->ops->class_delete) {
2045 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2047 ? netdev->tc->ops->class_delete(netdev_, queue)
2053 ovs_mutex_unlock(&netdev->mutex);
/* Retrieves statistics for queue 'queue_id' into '*stats', including the
 * queue creation time. */
2059 netdev_linux_get_queue_stats(const struct netdev *netdev_,
2060 unsigned int queue_id,
2061 struct netdev_queue_stats *stats)
2063 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2066 ovs_mutex_lock(&netdev->mutex);
2067 error = tc_query_qdisc(netdev_);
2069 if (netdev->tc->ops->class_get_stats) {
2070 const struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2072 stats->created = queue->created;
2073 error = netdev->tc->ops->class_get_stats(netdev_, queue,
2082 ovs_mutex_unlock(&netdev->mutex);
/* Begins an rtnetlink RTM_GETTCLASS dump of 'netdev''s traffic classes
 * into 'dump'. */
2088 start_queue_dump(const struct netdev *netdev, struct nl_dump *dump)
2090 struct ofpbuf request;
2091 struct tcmsg *tcmsg;
2093 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
/* Parent 0: dump classes of the root qdisc. */
2097 tcmsg->tcm_parent = 0;
2098 nl_dump_start(dump, NETLINK_ROUTE, &request);
2099 ofpbuf_uninit(&request);
/* Iterator state for queue dumps: a snapshot of queue ids taken at
 * dump_start time, walked by dump_next. */
2103 struct netdev_linux_queue_state {
2104 unsigned int *queues;
/* Snapshots all queue ids of 'netdev_' into a freshly allocated state
 * object stored in '*statep'. */
2110 netdev_linux_queue_dump_start(const struct netdev *netdev_, void **statep)
2112 const struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2115 ovs_mutex_lock(&netdev->mutex);
2116 error = tc_query_qdisc(netdev_);
2118 if (netdev->tc->ops->class_get) {
2119 struct netdev_linux_queue_state *state;
2120 struct tc_queue *queue;
2123 *statep = state = xmalloc(sizeof *state);
2124 state->n_queues = hmap_count(&netdev->tc->queues);
2125 state->cur_queue = 0;
2126 state->queues = xmalloc(state->n_queues * sizeof *state->queues);
2129 HMAP_FOR_EACH (queue, hmap_node, &netdev->tc->queues) {
2130 state->queues[i++] = queue->queue_id;
2136 ovs_mutex_unlock(&netdev->mutex);
/* Advances the dump: returns the next queue that still exists, storing its
 * id in '*queue_idp' and its configuration in 'details'.  Queues deleted
 * since dump_start are silently skipped. */
2142 netdev_linux_queue_dump_next(const struct netdev *netdev_, void *state_,
2143 unsigned int *queue_idp, struct smap *details)
2145 const struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2146 struct netdev_linux_queue_state *state = state_;
2149 ovs_mutex_lock(&netdev->mutex);
2150 while (state->cur_queue < state->n_queues) {
2151 unsigned int queue_id = state->queues[state->cur_queue++];
2152 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2155 *queue_idp = queue_id;
2156 error = netdev->tc->ops->class_get(netdev_, queue, details);
2160 ovs_mutex_unlock(&netdev->mutex);
/* Releases the dump state allocated by dump_start. */
2166 netdev_linux_queue_dump_done(const struct netdev *netdev OVS_UNUSED,
2169 struct netdev_linux_queue_state *state = state_;
2171 free(state->queues);
/* Invokes 'cb' once per queue with per-queue statistics parsed from an
 * RTM_GETTCLASS netlink dump. */
2177 netdev_linux_dump_queue_stats(const struct netdev *netdev_,
2178 netdev_dump_queue_stats_cb *cb, void *aux)
2180 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2183 ovs_mutex_lock(&netdev->mutex);
2184 error = tc_query_qdisc(netdev_);
2186 struct nl_dump dump;
2188 if (!netdev->tc->ops->class_dump_stats) {
2190 } else if (!start_queue_dump(netdev_, &dump)) {
2196 while (nl_dump_next(&dump, &msg)) {
2197 retval = netdev->tc->ops->class_dump_stats(netdev_, &msg,
2204 retval = nl_dump_done(&dump);
2210 ovs_mutex_unlock(&netdev->mutex);
/* Reports 'netdev_''s IPv4 address and netmask, caching them under
 * VALID_IN4 via SIOCGIFADDR / SIOCGIFNETMASK.  Returns EADDRNOTAVAIL when
 * no address is assigned. */
2216 netdev_linux_get_in4(const struct netdev *netdev_,
2217 struct in_addr *address, struct in_addr *netmask)
2219 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2222 ovs_mutex_lock(&netdev->mutex);
2223 if (!(netdev->cache_valid & VALID_IN4)) {
2224 error = netdev_linux_get_ipv4(netdev_, &netdev->address,
2225 SIOCGIFADDR, "SIOCGIFADDR");
2227 error = netdev_linux_get_ipv4(netdev_, &netdev->netmask,
2228 SIOCGIFNETMASK, "SIOCGIFNETMASK");
2230 netdev->cache_valid |= VALID_IN4;
/* INADDR_ANY in the cache means "no address assigned". */
2238 if (netdev->address.s_addr != INADDR_ANY) {
2239 *address = netdev->address;
2240 *netmask = netdev->netmask;
2242 error = EADDRNOTAVAIL;
2245 ovs_mutex_unlock(&netdev->mutex);
/* Assigns 'address'/'netmask' to 'netdev_' with SIOCSIFADDR and
 * SIOCSIFNETMASK, updating the in4 cache on success.  The netmask is only
 * set when the address is not INADDR_ANY. */
2251 netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
2252 struct in_addr netmask)
2254 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2257 ovs_mutex_lock(&netdev->mutex);
2258 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
2260 netdev->cache_valid |= VALID_IN4;
2261 netdev->address = address;
2262 netdev->netmask = netmask;
2263 if (address.s_addr != INADDR_ANY) {
2264 error = do_set_addr(netdev_, SIOCSIFNETMASK,
2265 "SIOCSIFNETMASK", netmask);
2268 ovs_mutex_unlock(&netdev->mutex);
/* Parses one line of /proc/net/if_inet6 into an IPv6 address and the
 * owning interface name (at most 16 chars plus NUL). */
2274 parse_if_inet6_line(const char *line,
2275 struct in6_addr *in6, char ifname[16 + 1])
2277 uint8_t *s6 = in6->s6_addr;
2278 #define X8 "%2"SCNx8
/* The address is 32 hex digits scanned as 16 bytes; the four %*x fields
 * (index, prefix len, scope, flags) are discarded. */
2280 " "X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8
2281 "%*x %*x %*x %*x %16s\n",
2282 &s6[0], &s6[1], &s6[2], &s6[3],
2283 &s6[4], &s6[5], &s6[6], &s6[7],
2284 &s6[8], &s6[9], &s6[10], &s6[11],
2285 &s6[12], &s6[13], &s6[14], &s6[15],
2289 /* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address (if
2290  * 'in6' is non-null) and returns true.  Otherwise, returns false. */
2292 netdev_linux_get_in6(const struct netdev *netdev_, struct in6_addr *in6)
2294 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2296 ovs_mutex_lock(&netdev->mutex);
2297 if (!(netdev->cache_valid & VALID_IN6)) {
/* Default to the unspecified address, then scan /proc for a match. */
2301 netdev->in6 = in6addr_any;
2303 file = fopen("/proc/net/if_inet6", "r");
2305 const char *name = netdev_get_name(netdev_);
2306 while (fgets(line, sizeof line, file)) {
2307 struct in6_addr in6_tmp;
2308 char ifname[16 + 1];
2309 if (parse_if_inet6_line(line, &in6_tmp, ifname)
2310 && !strcmp(name, ifname))
2312 netdev->in6 = in6_tmp;
2318 netdev->cache_valid |= VALID_IN6;
2321 ovs_mutex_unlock(&netdev->mutex);
/* Fills '*sa' with an AF_INET sockaddr for 'addr' (port elided in this
 * extract), zeroing any trailing bytes of the generic sockaddr. */
2327 make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
2329 struct sockaddr_in sin;
2330 memset(&sin, 0, sizeof sin);
2331 sin.sin_family = AF_INET;
2332 sin.sin_addr = addr;
2335 memset(sa, 0, sizeof *sa);
2336 memcpy(sa, &sin, sizeof sin);
/* Issues address-setting ioctl 'ioctl_nr' (named 'ioctl_name' for error
 * reporting) on 'netdev' with 'addr' marshalled into ifr.ifr_addr. */
2340 do_set_addr(struct netdev *netdev,
2341 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
2345 make_in4_sockaddr(&ifr.ifr_addr, addr);
2346 return af_inet_ifreq_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
2350 /* Adds 'router' as a default IP gateway. */
2352 netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
2354 struct in_addr any = { INADDR_ANY };
2358 memset(&rt, 0, sizeof rt);
/* Destination and mask of 0.0.0.0 make this the default route. */
2359 make_in4_sockaddr(&rt.rt_dst, any);
2360 make_in4_sockaddr(&rt.rt_gateway, router);
2361 make_in4_sockaddr(&rt.rt_genmask, any);
2362 rt.rt_flags = RTF_UP | RTF_GATEWAY;
2363 error = af_inet_ioctl(SIOCADDRT, &rt);
2365 VLOG_WARN("ioctl(SIOCADDRT): %s", ovs_strerror(error));
/* Determines the next hop toward 'host' by scanning /proc/net/route.
 * On a match, stores the gateway (or 0 for a directly-reachable host) in
 * '*next_hop' and the egress interface name, xstrdup'd, in '*netdev_name'
 * (caller frees). */
2371 netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
2374 static const char fn[] = "/proc/net/route";
2379 *netdev_name = NULL;
2380 stream = fopen(fn, "r");
2381 if (stream == NULL) {
2382 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, ovs_strerror(errno));
2387 while (fgets(line, sizeof line, stream)) {
2390 ovs_be32 dest, gateway, mask;
2391 int refcnt, metric, mtu;
2392 unsigned int flags, use, window, irtt;
/* Each /proc/net/route line has 11 whitespace-separated fields. */
2395 "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
2397 iface, &dest, &gateway, &flags, &refcnt,
2398 &use, &metric, &mask, &mtu, &window, &irtt) != 11) {
2400 VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
2404 if (!(flags & RTF_UP)) {
2405 /* Skip routes that aren't up. */
2409 /* The output of 'dest', 'mask', and 'gateway' were given in
2410  * network byte order, so we don't need need any endian
2411  * conversions here. */
2412 if ((dest & mask) == (host->s_addr & mask)) {
2414 /* The host is directly reachable. */
2415 next_hop->s_addr = 0;
2417 /* To reach the host, we must go through a gateway. */
2418 next_hop->s_addr = gateway;
2420 *netdev_name = xstrdup(iface);
/* Fills 'smap' with driver name/version and firmware version obtained via
 * ETHTOOL_GDRVINFO, caching the result under VALID_DRVINFO. */
2432 netdev_linux_get_status(const struct netdev *netdev_, struct smap *smap)
2434 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2437 ovs_mutex_lock(&netdev->mutex);
2438 if (!(netdev->cache_valid & VALID_DRVINFO)) {
/* The ethtool helper takes a struct ethtool_cmd *; drvinfo is passed
 * through this cast (GDRVINFO replies with struct ethtool_drvinfo). */
2439 struct ethtool_cmd *cmd = (struct ethtool_cmd *) &netdev->drvinfo;
2441 COVERAGE_INC(netdev_get_ethtool);
2442 memset(&netdev->drvinfo, 0, sizeof netdev->drvinfo);
2443 error = netdev_linux_do_ethtool(netdev->up.name,
2446 "ETHTOOL_GDRVINFO");
2448 netdev->cache_valid |= VALID_DRVINFO;
2453 smap_add(smap, "driver_name", netdev->drvinfo.driver);
2454 smap_add(smap, "driver_version", netdev->drvinfo.version);
2455 smap_add(smap, "firmware_version", netdev->drvinfo.fw_version);
2457 ovs_mutex_unlock(&netdev->mutex);
/* Status for internal devices: a fixed driver name, no hardware info. */
2463 netdev_internal_get_status(const struct netdev *netdev OVS_UNUSED,
2466 smap_add(smap, "driver_name", "openvswitch");
2470 /* Looks up the ARP table entry for 'ip' on 'netdev'.  If one exists and can be
2471  * successfully retrieved, it stores the corresponding MAC address in 'mac' and
2472  * returns 0.  Otherwise, it returns a positive errno value; in particular,
2473  * ENXIO indicates that there is not ARP table entry for 'ip' on 'netdev'. */
2475 netdev_linux_arp_lookup(const struct netdev *netdev,
2476 ovs_be32 ip, uint8_t mac[ETH_ADDR_LEN])
2479 struct sockaddr_in sin;
2482 memset(&r, 0, sizeof r);
2483 memset(&sin, 0, sizeof sin);
2484 sin.sin_family = AF_INET;
2485 sin.sin_addr.s_addr = ip;
2487 memcpy(&r.arp_pa, &sin, sizeof sin);
2488 r.arp_ha.sa_family = ARPHRD_ETHER;
2490 ovs_strzcpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
2491 COVERAGE_INC(netdev_arp_lookup);
2492 retval = af_inet_ioctl(SIOCGARP, &r);
2494 memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
/* ENXIO just means "no entry"; anything else is worth a warning. */
2495 } else if (retval != ENXIO) {
2496 VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
2497 netdev_get_name(netdev), IP_ARGS(ip),
2498 ovs_strerror(retval));
/* Translates NETDEV_* flag bits to the kernel's IFF_* equivalents. */
2504 nd_to_iff_flags(enum netdev_flags nd)
2507 if (nd & NETDEV_UP) {
2510 if (nd & NETDEV_PROMISC) {
2513 if (nd & NETDEV_LOOPBACK) {
2514 iff |= IFF_LOOPBACK;
/* Inverse translation: kernel IFF_* bits to NETDEV_* flags. */
2520 iff_to_nd_flags(int iff)
2522 enum netdev_flags nd = 0;
2526 if (iff & IFF_PROMISC) {
2527 nd |= NETDEV_PROMISC;
2529 if (iff & IFF_LOOPBACK) {
2530 nd |= NETDEV_LOOPBACK;
/* Clears 'off' and sets 'on' in 'netdev''s kernel interface flags,
 * reporting the previous flags in '*old_flagsp'.  Caller must hold
 * netdev->mutex (enforced by OVS_REQUIRES). */
2536 update_flags(struct netdev_linux *netdev, enum netdev_flags off,
2537 enum netdev_flags on, enum netdev_flags *old_flagsp)
2538 OVS_REQUIRES(netdev->mutex)
2540 int old_flags, new_flags;
2543 old_flags = netdev->ifi_flags;
2544 *old_flagsp = iff_to_nd_flags(old_flags);
2545 new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
/* Only touch the kernel when the flags actually change; re-read the
 * flags afterward so the cache reflects what the kernel accepted. */
2546 if (new_flags != old_flags) {
2547 error = set_flags(netdev_get_name(&netdev->up), new_flags);
2548 get_flags(&netdev->up, &netdev->ifi_flags);
/* Public wrapper for update_flags() that takes the device mutex. */
2555 netdev_linux_update_flags(struct netdev *netdev_, enum netdev_flags off,
2556 enum netdev_flags on, enum netdev_flags *old_flagsp)
2558 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2561 ovs_mutex_lock(&netdev->mutex);
2562 error = update_flags(netdev, off, on, old_flagsp);
2563 ovs_mutex_unlock(&netdev->mutex);
/* Returns the device's change sequence number, read under the mutex. */
2569 netdev_linux_change_seq(const struct netdev *netdev_)
2571 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2572 unsigned int change_seq;
2574 ovs_mutex_lock(&netdev->mutex);
2575 change_seq = netdev->change_seq;
2576 ovs_mutex_unlock(&netdev->mutex);
/* NOTE(review): this span is a sampled excerpt of the NETDEV_LINUX_CLASS
 * macro -- many continuation lines are not visible here, and the member
 * ordering must exactly match "struct netdev_class" in netdev-provider.h,
 * so only this comment is added.  The macro expands to a netdev_class
 * initializer parameterized by the construct, get/set-stats, get-features,
 * and get-status hooks, which the three class tables below fill in. */
2581 #define NETDEV_LINUX_CLASS(NAME, CONSTRUCT, GET_STATS, SET_STATS, \
2582 GET_FEATURES, GET_STATUS) \
2588 netdev_linux_wait, \
2590 netdev_linux_alloc, \
2592 netdev_linux_destruct, \
2593 netdev_linux_dealloc, \
2594 NULL, /* get_config */ \
2595 NULL, /* set_config */ \
2596 NULL, /* get_tunnel_config */ \
2598 netdev_linux_send, \
2599 netdev_linux_send_wait, \
2601 netdev_linux_set_etheraddr, \
2602 netdev_linux_get_etheraddr, \
2603 netdev_linux_get_mtu, \
2604 netdev_linux_set_mtu, \
2605 netdev_linux_get_ifindex, \
2606 netdev_linux_get_carrier, \
2607 netdev_linux_get_carrier_resets, \
2608 netdev_linux_set_miimon_interval, \
2613 netdev_linux_set_advertisements, \
2615 netdev_linux_set_policing, \
2616 netdev_linux_get_qos_types, \
2617 netdev_linux_get_qos_capabilities, \
2618 netdev_linux_get_qos, \
2619 netdev_linux_set_qos, \
2620 netdev_linux_get_queue, \
2621 netdev_linux_set_queue, \
2622 netdev_linux_delete_queue, \
2623 netdev_linux_get_queue_stats, \
2624 netdev_linux_queue_dump_start, \
2625 netdev_linux_queue_dump_next, \
2626 netdev_linux_queue_dump_done, \
2627 netdev_linux_dump_queue_stats, \
2629 netdev_linux_get_in4, \
2630 netdev_linux_set_in4, \
2631 netdev_linux_get_in6, \
2632 netdev_linux_add_router, \
2633 netdev_linux_get_next_hop, \
2635 netdev_linux_arp_lookup, \
2637 netdev_linux_update_flags, \
2639 netdev_linux_change_seq, \
2641 netdev_linux_rx_alloc, \
2642 netdev_linux_rx_construct, \
2643 netdev_linux_rx_destruct, \
2644 netdev_linux_rx_dealloc, \
2645 netdev_linux_rx_recv, \
2646 netdev_linux_rx_wait, \
2647 netdev_linux_rx_drain, \
2650 const struct netdev_class netdev_linux_class =
2653 netdev_linux_construct,
2654 netdev_linux_get_stats,
2655 NULL, /* set_stats */
2656 netdev_linux_get_features,
2657 netdev_linux_get_status);
2659 const struct netdev_class netdev_tap_class =
2662 netdev_linux_construct_tap,
2663 netdev_tap_get_stats,
2664 NULL, /* set_stats */
2665 netdev_linux_get_features,
2666 netdev_linux_get_status);
2668 const struct netdev_class netdev_internal_class =
2671 netdev_linux_construct,
2672 netdev_internal_get_stats,
2673 netdev_internal_set_stats,
2674 NULL, /* get_features */
2675 netdev_internal_get_status);
2677 /* HTB traffic control class. */
2679 #define HTB_N_QUEUES 0xf000
2683 unsigned int max_rate; /* In bytes/s. */
2687 struct tc_queue tc_queue;
2688 unsigned int min_rate; /* In bytes/s. */
2689 unsigned int max_rate; /* In bytes/s. */
2690 unsigned int burst; /* In bytes. */
2691 unsigned int priority; /* Lower values are higher priorities. */
2695 htb_get__(const struct netdev *netdev_)
2697 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2698 return CONTAINER_OF(netdev->tc, struct htb, tc);
2702 htb_install__(struct netdev *netdev_, uint64_t max_rate)
2704 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2707 htb = xmalloc(sizeof *htb);
2708 tc_init(&htb->tc, &tc_ops_htb);
2709 htb->max_rate = max_rate;
2711 netdev->tc = &htb->tc;
2714 /* Create an HTB qdisc.
2716 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
2718 htb_setup_qdisc__(struct netdev *netdev)
2721 struct tc_htb_glob opt;
2722 struct ofpbuf request;
2723 struct tcmsg *tcmsg;
2725 tc_del_qdisc(netdev);
2727 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2728 NLM_F_EXCL | NLM_F_CREATE, &request);
2732 tcmsg->tcm_handle = tc_make_handle(1, 0);
2733 tcmsg->tcm_parent = TC_H_ROOT;
2735 nl_msg_put_string(&request, TCA_KIND, "htb");
2737 memset(&opt, 0, sizeof opt);
2738 opt.rate2quantum = 10;
2742 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2743 nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
2744 nl_msg_end_nested(&request, opt_offset);
2746 return tc_transact(&request, NULL);
2749 /* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
2750 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
2752 htb_setup_class__(struct netdev *netdev, unsigned int handle,
2753 unsigned int parent, struct htb_class *class)
2756 struct tc_htb_opt opt;
2757 struct ofpbuf request;
2758 struct tcmsg *tcmsg;
2762 error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
2764 VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU",
2765 netdev_get_name(netdev));
2769 memset(&opt, 0, sizeof opt);
2770 tc_fill_rate(&opt.rate, class->min_rate, mtu);
2771 tc_fill_rate(&opt.ceil, class->max_rate, mtu);
2772 opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
2773 opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
2774 opt.prio = class->priority;
2776 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
2780 tcmsg->tcm_handle = handle;
2781 tcmsg->tcm_parent = parent;
2783 nl_msg_put_string(&request, TCA_KIND, "htb");
2784 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2785 nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
2786 tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
2787 tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
2788 nl_msg_end_nested(&request, opt_offset);
2790 error = tc_transact(&request, NULL);
2792 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
2793 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
2794 netdev_get_name(netdev),
2795 tc_get_major(handle), tc_get_minor(handle),
2796 tc_get_major(parent), tc_get_minor(parent),
2797 class->min_rate, class->max_rate,
2798 class->burst, class->priority, ovs_strerror(error));
2803 /* Parses Netlink attributes in 'options' for HTB parameters and stores a
2804 * description of them into 'details'. The description complies with the
2805 * specification given in the vswitch database documentation for linux-htb
2808 htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
2810 static const struct nl_policy tca_htb_policy[] = {
2811 [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
2812 .min_len = sizeof(struct tc_htb_opt) },
2815 struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
2816 const struct tc_htb_opt *htb;
2818 if (!nl_parse_nested(nl_options, tca_htb_policy,
2819 attrs, ARRAY_SIZE(tca_htb_policy))) {
2820 VLOG_WARN_RL(&rl, "failed to parse HTB class options");
2824 htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
2825 class->min_rate = htb->rate.rate;
2826 class->max_rate = htb->ceil.rate;
2827 class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
2828 class->priority = htb->prio;
2833 htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2834 struct htb_class *options,
2835 struct netdev_queue_stats *stats)
2837 struct nlattr *nl_options;
2838 unsigned int handle;
2841 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2842 if (!error && queue_id) {
2843 unsigned int major = tc_get_major(handle);
2844 unsigned int minor = tc_get_minor(handle);
2845 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2846 *queue_id = minor - 1;
2851 if (!error && options) {
2852 error = htb_parse_tca_options__(nl_options, options);
2858 htb_parse_qdisc_details__(struct netdev *netdev_,
2859 const struct smap *details, struct htb_class *hc)
2861 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2862 const char *max_rate_s;
2864 max_rate_s = smap_get(details, "max-rate");
2865 hc->max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
2866 if (!hc->max_rate) {
2867 enum netdev_features current;
2869 netdev_linux_read_features(netdev);
2870 current = !netdev->get_features_error ? netdev->current : 0;
2871 hc->max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
2873 hc->min_rate = hc->max_rate;
2879 htb_parse_class_details__(struct netdev *netdev,
2880 const struct smap *details, struct htb_class *hc)
2882 const struct htb *htb = htb_get__(netdev);
2883 const char *min_rate_s = smap_get(details, "min-rate");
2884 const char *max_rate_s = smap_get(details, "max-rate");
2885 const char *burst_s = smap_get(details, "burst");
2886 const char *priority_s = smap_get(details, "priority");
2889 error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
2891 VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU",
2892 netdev_get_name(netdev));
2896 /* HTB requires at least an mtu sized min-rate to send any traffic even
2897 * on uncongested links. */
2898 hc->min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
2899 hc->min_rate = MAX(hc->min_rate, mtu);
2900 hc->min_rate = MIN(hc->min_rate, htb->max_rate);
2903 hc->max_rate = (max_rate_s
2904 ? strtoull(max_rate_s, NULL, 10) / 8
2906 hc->max_rate = MAX(hc->max_rate, hc->min_rate);
2907 hc->max_rate = MIN(hc->max_rate, htb->max_rate);
2911 * According to hints in the documentation that I've read, it is important
2912 * that 'burst' be at least as big as the largest frame that might be
2913 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
2914 * but having it a bit too small is a problem. Since netdev_get_mtu()
2915 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
2916 * the MTU. We actually add 64, instead of 14, as a guard against
2917 * additional headers get tacked on somewhere that we're not aware of. */
2918 hc->burst = burst_s ? strtoull(burst_s, NULL, 10) / 8 : 0;
2919 hc->burst = MAX(hc->burst, mtu + 64);
2922 hc->priority = priority_s ? strtoul(priority_s, NULL, 10) : 0;
2928 htb_query_class__(const struct netdev *netdev, unsigned int handle,
2929 unsigned int parent, struct htb_class *options,
2930 struct netdev_queue_stats *stats)
2932 struct ofpbuf *reply;
2935 error = tc_query_class(netdev, handle, parent, &reply);
2937 error = htb_parse_tcmsg__(reply, NULL, options, stats);
2938 ofpbuf_delete(reply);
2944 htb_tc_install(struct netdev *netdev, const struct smap *details)
2948 error = htb_setup_qdisc__(netdev);
2950 struct htb_class hc;
2952 htb_parse_qdisc_details__(netdev, details, &hc);
2953 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2954 tc_make_handle(1, 0), &hc);
2956 htb_install__(netdev, hc.max_rate);
2962 static struct htb_class *
2963 htb_class_cast__(const struct tc_queue *queue)
2965 return CONTAINER_OF(queue, struct htb_class, tc_queue);
2969 htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
2970 const struct htb_class *hc)
2972 struct htb *htb = htb_get__(netdev);
2973 size_t hash = hash_int(queue_id, 0);
2974 struct tc_queue *queue;
2975 struct htb_class *hcp;
2977 queue = tc_find_queue__(netdev, queue_id, hash);
2979 hcp = htb_class_cast__(queue);
2981 hcp = xmalloc(sizeof *hcp);
2982 queue = &hcp->tc_queue;
2983 queue->queue_id = queue_id;
2984 queue->created = time_msec();
2985 hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
2988 hcp->min_rate = hc->min_rate;
2989 hcp->max_rate = hc->max_rate;
2990 hcp->burst = hc->burst;
2991 hcp->priority = hc->priority;
2995 htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
2998 struct nl_dump dump;
2999 struct htb_class hc;
3001 /* Get qdisc options. */
3003 htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
3004 htb_install__(netdev, hc.max_rate);
3007 if (!start_queue_dump(netdev, &dump)) {
3010 while (nl_dump_next(&dump, &msg)) {
3011 unsigned int queue_id;
3013 if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3014 htb_update_queue__(netdev, queue_id, &hc);
3017 nl_dump_done(&dump);
3023 htb_tc_destroy(struct tc *tc)
3025 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
3026 struct htb_class *hc, *next;
3028 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &htb->tc.queues) {
3029 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
3037 htb_qdisc_get(const struct netdev *netdev, struct smap *details)
3039 const struct htb *htb = htb_get__(netdev);
3040 smap_add_format(details, "max-rate", "%llu", 8ULL * htb->max_rate);
3045 htb_qdisc_set(struct netdev *netdev, const struct smap *details)
3047 struct htb_class hc;
3050 htb_parse_qdisc_details__(netdev, details, &hc);
3051 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3052 tc_make_handle(1, 0), &hc);
3054 htb_get__(netdev)->max_rate = hc.max_rate;
3060 htb_class_get(const struct netdev *netdev OVS_UNUSED,
3061 const struct tc_queue *queue, struct smap *details)
3063 const struct htb_class *hc = htb_class_cast__(queue);
3065 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
3066 if (hc->min_rate != hc->max_rate) {
3067 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
3069 smap_add_format(details, "burst", "%llu", 8ULL * hc->burst);
3071 smap_add_format(details, "priority", "%u", hc->priority);
3077 htb_class_set(struct netdev *netdev, unsigned int queue_id,
3078 const struct smap *details)
3080 struct htb_class hc;
3083 error = htb_parse_class_details__(netdev, details, &hc);
3088 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3089 tc_make_handle(1, 0xfffe), &hc);
3094 htb_update_queue__(netdev, queue_id, &hc);
3099 htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
3101 struct htb_class *hc = htb_class_cast__(queue);
3102 struct htb *htb = htb_get__(netdev);
3105 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3107 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
3114 htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3115 struct netdev_queue_stats *stats)
3117 return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3118 tc_make_handle(1, 0xfffe), NULL, stats);
3122 htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3123 const struct ofpbuf *nlmsg,
3124 netdev_dump_queue_stats_cb *cb, void *aux)
3126 struct netdev_queue_stats stats;
3127 unsigned int handle, major, minor;
3130 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3135 major = tc_get_major(handle);
3136 minor = tc_get_minor(handle);
3137 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
3138 (*cb)(minor - 1, &stats, aux);
3143 static const struct tc_ops tc_ops_htb = {
3144 "htb", /* linux_name */
3145 "linux-htb", /* ovs_name */
3146 HTB_N_QUEUES, /* n_queues */
3155 htb_class_get_stats,
3156 htb_class_dump_stats
3159 /* "linux-hfsc" traffic control class. */
3161 #define HFSC_N_QUEUES 0xf000
3169 struct tc_queue tc_queue;
3174 static struct hfsc *
3175 hfsc_get__(const struct netdev *netdev_)
3177 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3178 return CONTAINER_OF(netdev->tc, struct hfsc, tc);
3181 static struct hfsc_class *
3182 hfsc_class_cast__(const struct tc_queue *queue)
3184 return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
3188 hfsc_install__(struct netdev *netdev_, uint32_t max_rate)
3190 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3193 hfsc = xmalloc(sizeof *hfsc);
3194 tc_init(&hfsc->tc, &tc_ops_hfsc);
3195 hfsc->max_rate = max_rate;
3196 netdev->tc = &hfsc->tc;
3200 hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
3201 const struct hfsc_class *hc)
3205 struct hfsc_class *hcp;
3206 struct tc_queue *queue;
3208 hfsc = hfsc_get__(netdev);
3209 hash = hash_int(queue_id, 0);
3211 queue = tc_find_queue__(netdev, queue_id, hash);
3213 hcp = hfsc_class_cast__(queue);
3215 hcp = xmalloc(sizeof *hcp);
3216 queue = &hcp->tc_queue;
3217 queue->queue_id = queue_id;
3218 queue->created = time_msec();
3219 hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
3222 hcp->min_rate = hc->min_rate;
3223 hcp->max_rate = hc->max_rate;
3227 hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
3229 const struct tc_service_curve *rsc, *fsc, *usc;
3230 static const struct nl_policy tca_hfsc_policy[] = {
3232 .type = NL_A_UNSPEC,
3234 .min_len = sizeof(struct tc_service_curve),
3237 .type = NL_A_UNSPEC,
3239 .min_len = sizeof(struct tc_service_curve),
3242 .type = NL_A_UNSPEC,
3244 .min_len = sizeof(struct tc_service_curve),
3247 struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];
3249 if (!nl_parse_nested(nl_options, tca_hfsc_policy,
3250 attrs, ARRAY_SIZE(tca_hfsc_policy))) {
3251 VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
3255 rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
3256 fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
3257 usc = nl_attr_get(attrs[TCA_HFSC_USC]);
3259 if (rsc->m1 != 0 || rsc->d != 0 ||
3260 fsc->m1 != 0 || fsc->d != 0 ||
3261 usc->m1 != 0 || usc->d != 0) {
3262 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3263 "Non-linear service curves are not supported.");
3267 if (rsc->m2 != fsc->m2) {
3268 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3269 "Real-time service curves are not supported ");
3273 if (rsc->m2 > usc->m2) {
3274 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3275 "Min-rate service curve is greater than "
3276 "the max-rate service curve.");
3280 class->min_rate = fsc->m2;
3281 class->max_rate = usc->m2;
3286 hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
3287 struct hfsc_class *options,
3288 struct netdev_queue_stats *stats)
3291 unsigned int handle;
3292 struct nlattr *nl_options;
3294 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
3300 unsigned int major, minor;
3302 major = tc_get_major(handle);
3303 minor = tc_get_minor(handle);
3304 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3305 *queue_id = minor - 1;
3312 error = hfsc_parse_tca_options__(nl_options, options);
3319 hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
3320 unsigned int parent, struct hfsc_class *options,
3321 struct netdev_queue_stats *stats)
3324 struct ofpbuf *reply;
3326 error = tc_query_class(netdev, handle, parent, &reply);
3331 error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
3332 ofpbuf_delete(reply);
3337 hfsc_parse_qdisc_details__(struct netdev *netdev_, const struct smap *details,
3338 struct hfsc_class *class)
3340 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3342 const char *max_rate_s;
3344 max_rate_s = smap_get(details, "max-rate");
3345 max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
3348 enum netdev_features current;
3350 netdev_linux_read_features(netdev);
3351 current = !netdev->get_features_error ? netdev->current : 0;
3352 max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
3355 class->min_rate = max_rate;
3356 class->max_rate = max_rate;
3360 hfsc_parse_class_details__(struct netdev *netdev,
3361 const struct smap *details,
3362 struct hfsc_class * class)
3364 const struct hfsc *hfsc;
3365 uint32_t min_rate, max_rate;
3366 const char *min_rate_s, *max_rate_s;
3368 hfsc = hfsc_get__(netdev);
3369 min_rate_s = smap_get(details, "min-rate");
3370 max_rate_s = smap_get(details, "max-rate");
3372 min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
3373 min_rate = MAX(min_rate, 1);
3374 min_rate = MIN(min_rate, hfsc->max_rate);
3376 max_rate = (max_rate_s
3377 ? strtoull(max_rate_s, NULL, 10) / 8
3379 max_rate = MAX(max_rate, min_rate);
3380 max_rate = MIN(max_rate, hfsc->max_rate);
3382 class->min_rate = min_rate;
3383 class->max_rate = max_rate;
3388 /* Create an HFSC qdisc.
3390 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
3392 hfsc_setup_qdisc__(struct netdev * netdev)
3394 struct tcmsg *tcmsg;
3395 struct ofpbuf request;
3396 struct tc_hfsc_qopt opt;
3398 tc_del_qdisc(netdev);
3400 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3401 NLM_F_EXCL | NLM_F_CREATE, &request);
3407 tcmsg->tcm_handle = tc_make_handle(1, 0);
3408 tcmsg->tcm_parent = TC_H_ROOT;
3410 memset(&opt, 0, sizeof opt);
3413 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3414 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
3416 return tc_transact(&request, NULL);
3419 /* Create an HFSC class.
3421 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
3422 * sc rate <min_rate> ul rate <max_rate>" */
3424 hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
3425 unsigned int parent, struct hfsc_class *class)
3429 struct tcmsg *tcmsg;
3430 struct ofpbuf request;
3431 struct tc_service_curve min, max;
3433 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
3439 tcmsg->tcm_handle = handle;
3440 tcmsg->tcm_parent = parent;
3444 min.m2 = class->min_rate;
3448 max.m2 = class->max_rate;
3450 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3451 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3452 nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
3453 nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
3454 nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
3455 nl_msg_end_nested(&request, opt_offset);
3457 error = tc_transact(&request, NULL);
3459 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
3460 "min-rate %ubps, max-rate %ubps (%s)",
3461 netdev_get_name(netdev),
3462 tc_get_major(handle), tc_get_minor(handle),
3463 tc_get_major(parent), tc_get_minor(parent),
3464 class->min_rate, class->max_rate, ovs_strerror(error));
3471 hfsc_tc_install(struct netdev *netdev, const struct smap *details)
3474 struct hfsc_class class;
3476 error = hfsc_setup_qdisc__(netdev);
3482 hfsc_parse_qdisc_details__(netdev, details, &class);
3483 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3484 tc_make_handle(1, 0), &class);
3490 hfsc_install__(netdev, class.max_rate);
3495 hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3498 struct nl_dump dump;
3499 struct hfsc_class hc;
3502 hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
3503 hfsc_install__(netdev, hc.max_rate);
3505 if (!start_queue_dump(netdev, &dump)) {
3509 while (nl_dump_next(&dump, &msg)) {
3510 unsigned int queue_id;
3512 if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3513 hfsc_update_queue__(netdev, queue_id, &hc);
3517 nl_dump_done(&dump);
3522 hfsc_tc_destroy(struct tc *tc)
3525 struct hfsc_class *hc, *next;
3527 hfsc = CONTAINER_OF(tc, struct hfsc, tc);
3529 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
3530 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
3539 hfsc_qdisc_get(const struct netdev *netdev, struct smap *details)
3541 const struct hfsc *hfsc;
3542 hfsc = hfsc_get__(netdev);
3543 smap_add_format(details, "max-rate", "%llu", 8ULL * hfsc->max_rate);
3548 hfsc_qdisc_set(struct netdev *netdev, const struct smap *details)
3551 struct hfsc_class class;
3553 hfsc_parse_qdisc_details__(netdev, details, &class);
3554 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3555 tc_make_handle(1, 0), &class);
3558 hfsc_get__(netdev)->max_rate = class.max_rate;
3565 hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
3566 const struct tc_queue *queue, struct smap *details)
3568 const struct hfsc_class *hc;
3570 hc = hfsc_class_cast__(queue);
3571 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
3572 if (hc->min_rate != hc->max_rate) {
3573 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
3579 hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
3580 const struct smap *details)
3583 struct hfsc_class class;
3585 error = hfsc_parse_class_details__(netdev, details, &class);
3590 error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3591 tc_make_handle(1, 0xfffe), &class);
3596 hfsc_update_queue__(netdev, queue_id, &class);
3601 hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
3605 struct hfsc_class *hc;
3607 hc = hfsc_class_cast__(queue);
3608 hfsc = hfsc_get__(netdev);
3610 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3612 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
3619 hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3620 struct netdev_queue_stats *stats)
3622 return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3623 tc_make_handle(1, 0xfffe), NULL, stats);
3627 hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3628 const struct ofpbuf *nlmsg,
3629 netdev_dump_queue_stats_cb *cb, void *aux)
3631 struct netdev_queue_stats stats;
3632 unsigned int handle, major, minor;
3635 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3640 major = tc_get_major(handle);
3641 minor = tc_get_minor(handle);
3642 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3643 (*cb)(minor - 1, &stats, aux);
3648 static const struct tc_ops tc_ops_hfsc = {
3649 "hfsc", /* linux_name */
3650 "linux-hfsc", /* ovs_name */
3651 HFSC_N_QUEUES, /* n_queues */
3652 hfsc_tc_install, /* tc_install */
3653 hfsc_tc_load, /* tc_load */
3654 hfsc_tc_destroy, /* tc_destroy */
3655 hfsc_qdisc_get, /* qdisc_get */
3656 hfsc_qdisc_set, /* qdisc_set */
3657 hfsc_class_get, /* class_get */
3658 hfsc_class_set, /* class_set */
3659 hfsc_class_delete, /* class_delete */
3660 hfsc_class_get_stats, /* class_get_stats */
3661 hfsc_class_dump_stats /* class_dump_stats */
3664 /* "linux-default" traffic control class.
3666 * This class represents the default, unnamed Linux qdisc. It corresponds to
3667 * the "" (empty string) QoS type in the OVS database. */
3670 default_install__(struct netdev *netdev_)
3672 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3673 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_default);
3675 /* Nothing but a tc class implementation is allowed to write to a tc. This
3676 * class never does that, so we can legitimately use a const tc object. */
3677 netdev->tc = CONST_CAST(struct tc *, &tc);
3681 default_tc_install(struct netdev *netdev,
3682 const struct smap *details OVS_UNUSED)
3684 default_install__(netdev);
3689 default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3691 default_install__(netdev);
3695 static const struct tc_ops tc_ops_default = {
3696 NULL, /* linux_name */
3701 NULL, /* tc_destroy */
3702 NULL, /* qdisc_get */
3703 NULL, /* qdisc_set */
3704 NULL, /* class_get */
3705 NULL, /* class_set */
3706 NULL, /* class_delete */
3707 NULL, /* class_get_stats */
3708 NULL /* class_dump_stats */
3711 /* "linux-other" traffic control class.
3716 other_tc_load(struct netdev *netdev_, struct ofpbuf *nlmsg OVS_UNUSED)
3718 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3719 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_other);
3721 /* Nothing but a tc class implementation is allowed to write to a tc. This
3722 * class never does that, so we can legitimately use a const tc object. */
3723 netdev->tc = CONST_CAST(struct tc *, &tc);
3727 static const struct tc_ops tc_ops_other = {
3728 NULL, /* linux_name */
3729 "linux-other", /* ovs_name */
3731 NULL, /* tc_install */
3733 NULL, /* tc_destroy */
3734 NULL, /* qdisc_get */
3735 NULL, /* qdisc_set */
3736 NULL, /* class_get */
3737 NULL, /* class_set */
3738 NULL, /* class_delete */
3739 NULL, /* class_get_stats */
3740 NULL /* class_dump_stats */
/* Traffic control. */

/* Number of kernel "tc" ticks per second. */
static double ticks_per_s;

/* Number of kernel "jiffies" per second.  This is used for the purpose of
 * computing buffer sizes.  Generally kernel qdiscs need to be able to buffer
 * one jiffy's worth of data.
 *
 * There are two possibilities here:
 *
 *    - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
 *      approximate range of 100 to 1024.  That means that we really need to
 *      make sure that the qdisc can buffer that much data.
 *
 *    - 'buffer_hz' is an absurdly large number.  That means that the kernel
 *      has finely granular timers and there's no need to fudge additional room
 *      for buffers.  (There's no extra effort needed to implement that: the
 *      large 'buffer_hz' is used as a divisor, so practically any number will
 *      come out as 0 in the division.  Small integer results in the case of
 *      really high dividends won't have any real effect anyhow.)
 */
static unsigned int buffer_hz;
/* Returns tc handle 'major':'minor'. */
static unsigned int
tc_make_handle(unsigned int major, unsigned int minor)
{
    /* TC_H_MAKE expects the major number pre-shifted into the high 16 bits. */
    return TC_H_MAKE(major << 16, minor);
}
/* Returns the major number from 'handle'. */
static unsigned int
tc_get_major(unsigned int handle)
{
    return TC_H_MAJ(handle) >> 16;
}
/* Returns the minor number from 'handle'. */
static unsigned int
tc_get_minor(unsigned int handle)
{
    return TC_H_MIN(handle);
}
3788 static struct tcmsg *
3789 tc_make_request(const struct netdev *netdev, int type, unsigned int flags,
3790 struct ofpbuf *request)
3792 struct tcmsg *tcmsg;
3796 error = get_ifindex(netdev, &ifindex);
3801 ofpbuf_init(request, 512);
3802 nl_msg_put_nlmsghdr(request, sizeof *tcmsg, type, NLM_F_REQUEST | flags);
3803 tcmsg = ofpbuf_put_zeros(request, sizeof *tcmsg);
3804 tcmsg->tcm_family = AF_UNSPEC;
3805 tcmsg->tcm_ifindex = ifindex;
3806 /* Caller should fill in tcmsg->tcm_handle. */
3807 /* Caller should fill in tcmsg->tcm_parent. */
/* Sends 'request' on the rtnetlink socket, freeing it afterward, and stores
 * any reply in '*replyp' (if nonnull).  Returns 0 or a positive errno. */
static int
tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
{
    int error = nl_transact(NETLINK_ROUTE, request, replyp);
    ofpbuf_uninit(request);
    return error;
}
3820 /* Adds or deletes a root ingress qdisc on 'netdev'. We use this for
3821 * policing configuration.
3823 * This function is equivalent to running the following when 'add' is true:
3824 * /sbin/tc qdisc add dev <devname> handle ffff: ingress
3826 * This function is equivalent to running the following when 'add' is false:
3827 * /sbin/tc qdisc del dev <devname> handle ffff: ingress
3829 * The configuration and stats may be seen with the following command:
3830 * /sbin/tc -s qdisc show dev <devname>
3832 * Returns 0 if successful, otherwise a positive errno value.
3835 tc_add_del_ingress_qdisc(struct netdev *netdev, bool add)
3837 struct ofpbuf request;
3838 struct tcmsg *tcmsg;
3840 int type = add ? RTM_NEWQDISC : RTM_DELQDISC;
3841 int flags = add ? NLM_F_EXCL | NLM_F_CREATE : 0;
3843 tcmsg = tc_make_request(netdev, type, flags, &request);
3847 tcmsg->tcm_handle = tc_make_handle(0xffff, 0);
3848 tcmsg->tcm_parent = TC_H_INGRESS;
3849 nl_msg_put_string(&request, TCA_KIND, "ingress");
3850 nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0);
3852 error = tc_transact(&request, NULL);
3854 /* If we're deleting the qdisc, don't worry about some of the
3855 * error conditions. */
3856 if (!add && (error == ENOENT || error == EINVAL)) {
3865 /* Adds a policer to 'netdev' with a rate of 'kbits_rate' and a burst size
3868 * This function is equivalent to running:
3869 * /sbin/tc filter add dev <devname> parent ffff: protocol all prio 49
3870 * basic police rate <kbits_rate>kbit burst <kbits_burst>k
3873 * The configuration and stats may be seen with the following command:
3874 * /sbin/tc -s filter show <devname> eth0 parent ffff:
3876 * Returns 0 if successful, otherwise a positive errno value.
/* NOTE(review): extract is gappy — declarations of 'error' and 'mtu', the
 * !tcmsg guard, and the tail of the function are missing from this view. */
3879 tc_add_policer(struct netdev *netdev, int kbits_rate, int kbits_burst)
3881 struct tc_police tc_police;
3882 struct ofpbuf request;
3883 struct tcmsg *tcmsg;
3884 size_t basic_offset;
3885 size_t police_offset;
/* TC_POLICE_SHOT: drop packets that exceed the configured rate. */
3889 memset(&tc_police, 0, sizeof tc_police);
3890 tc_police.action = TC_POLICE_SHOT;
3891 tc_police.mtu = mtu;
/* kbits_rate is in kilobits/s; tc_fill_rate() wants bytes/s. */
3892 tc_fill_rate(&tc_police.rate, (kbits_rate * 1000)/8, mtu);
3893 tc_police.burst = tc_bytes_to_ticks(tc_police.rate.rate,
3894 kbits_burst * 1024);
3896 tcmsg = tc_make_request(netdev, RTM_NEWTFILTER,
3897 NLM_F_EXCL | NLM_F_CREATE, &request);
/* Attach under the ffff: ingress qdisc at priority 49, matching all
 * protocols (ETH_P_ALL). */
3901 tcmsg->tcm_parent = tc_make_handle(0xffff, 0);
3902 tcmsg->tcm_info = tc_make_handle(49,
3903 (OVS_FORCE uint16_t) htons(ETH_P_ALL));
/* Nested attributes: TCA_OPTIONS > TCA_BASIC_POLICE > {TBF params, rtab}. */
3905 nl_msg_put_string(&request, TCA_KIND, "basic");
3906 basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3907 police_offset = nl_msg_start_nested(&request, TCA_BASIC_POLICE);
3908 nl_msg_put_unspec(&request, TCA_POLICE_TBF, &tc_police, sizeof tc_police);
3909 tc_put_rtab(&request, TCA_POLICE_RATE, &tc_police.rate);
3910 nl_msg_end_nested(&request, police_offset);
3911 nl_msg_end_nested(&request, basic_offset);
3913 error = tc_transact(&request, NULL);
3924 /* The values in psched are not individually very meaningful, but they are
3925 * important. The tables below show some values seen in the wild.
3929 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
3930 * (Before that, there are hints that it was 1000000000.)
3932 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
3936 * -----------------------------------
3937 * [1] 000c8000 000f4240 000f4240 00000064
3938 * [2] 000003e8 00000400 000f4240 3b9aca00
3939 * [3] 000003e8 00000400 000f4240 3b9aca00
3940 * [4] 000003e8 00000400 000f4240 00000064
3941 * [5] 000003e8 00000040 000f4240 3b9aca00
3942 * [6] 000003e8 00000040 000f4240 000000f9
3944 * a b c d ticks_per_s buffer_hz
3945 * ------- --------- ---------- ------------- ----------- -------------
3946 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
3947 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3948 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3949 * [4] 1,000 1,024 1,000,000 100 976,562 100
3950 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
3951 * [6] 1,000 64 1,000,000 249 15,625,000 249
3953 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
3954 * [2] 2.6.26-1-686-bigmem from Debian lenny
3955 * [3] 2.6.26-2-sparc64 from Debian lenny
3956 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
3957 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
3958 * [6] 2.6.34 from kernel.org on KVM
/* NOTE(review): the function signature is missing from this extract; the
 * body below parses /proc/net/psched once per process and derives the
 * file-global 'ticks_per_s' (and, presumably, 'buffer_hz' — the lines that
 * would set it are not visible here; confirm against the full file). */
3960 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
3961 static const char fn[] = "/proc/net/psched";
3962 unsigned int a, b, c, d;
/* Already initialized by an earlier call: nothing to do. */
3965 if (!ovsthread_once_start(&once)) {
3972 stream = fopen(fn, "r");
3974 VLOG_WARN("%s: open failed: %s", fn, ovs_strerror(errno));
/* The file holds four hex words, as documented in the table above. */
3978 if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
3979 VLOG_WARN("%s: read failed", fn);
3983 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
3987 VLOG_WARN("%s: invalid scheduler parameters", fn);
/* ticks_per_s = a * c / b, per the observed kernel values tabulated above. */
3991 ticks_per_s = (double) a * c / b;
3995 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
3998 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);
4001 ovsthread_once_done(&once);
4004 /* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
4005 * rate of 'rate' bytes per second. */
/* Depends on the file-global 'ticks_per_s' computed from /proc/net/psched. */
4007 tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
4010 return (rate * ticks) / ticks_per_s;
4013 /* Returns the number of ticks that it would take to transmit 'size' bytes at a
4014 * rate of 'rate' bytes per second. */
/* Widens to unsigned long long before multiplying to avoid 32-bit overflow;
 * a zero 'rate' yields 0 rather than dividing by zero. */
4016 tc_bytes_to_ticks(unsigned int rate, unsigned int size)
4019 return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
4022 /* Returns the number of bytes that need to be reserved for qdisc buffering at
4023 * a transmission rate of 'rate' bytes per second. */
/* 'buffer_hz' is the file-global timer frequency read from /proc/net/psched. */
4025 tc_buffer_per_jiffy(unsigned int rate)
4028 return rate / buffer_hz;
4031 /* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
4032 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
4033 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
4034 * stores NULL into it if it is absent.
4036 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
4039 * Returns 0 if successful, otherwise a positive errno value. */
4041 tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
4042 struct nlattr **options)
/* TCA_KIND is mandatory; TCA_OPTIONS may legitimately be absent. */
4044 static const struct nl_policy tca_policy[] = {
4045 [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
4046 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
4048 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
/* Attributes start after the netlink header plus the fixed tcmsg. */
4050 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
4051 tca_policy, ta, ARRAY_SIZE(ta))) {
4052 VLOG_WARN_RL(&rl, "failed to parse qdisc message");
4057 *kind = nl_attr_get_string(ta[TCA_KIND]);
4061 *options = ta[TCA_OPTIONS];
4076 /* Given Netlink 'msg' that describes a class, extracts the queue ID (e.g. the
4077 * minor number of its class ID) into '*queue_id', its TCA_OPTIONS attribute
4078 * into '*options', and its queue statistics into '*stats'. Any of the output
4079 * arguments may be null.
4081 * Returns 0 if successful, otherwise a positive errno value. */
4083 tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
4084 struct nlattr **options, struct netdev_queue_stats *stats)
4086 static const struct nl_policy tca_policy[] = {
4087 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
4088 [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
4090 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
4092 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
4093 tca_policy, ta, ARRAY_SIZE(ta))) {
4094 VLOG_WARN_RL(&rl, "failed to parse class message");
/* The class handle comes from the fixed tcmsg header, not an attribute. */
4099 struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
4100 *handlep = tc->tcm_handle;
4104 *options = ta[TCA_OPTIONS];
4108 const struct gnet_stats_queue *gsq;
4109 struct gnet_stats_basic gsb;
4111 static const struct nl_policy stats_policy[] = {
4112 [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
4113 .min_len = sizeof gsb },
4114 [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
4115 .min_len = sizeof *gsq },
4117 struct nlattr *sa[ARRAY_SIZE(stats_policy)];
/* TCA_STATS2 is itself nested: parse its inner attributes. */
4119 if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
4120 sa, ARRAY_SIZE(sa))) {
4121 VLOG_WARN_RL(&rl, "failed to parse class stats");
4125 /* Alignment issues screw up the length of struct gnet_stats_basic on
4126 * some arch/bitsize combinations. Newer versions of Linux have a
4127 * struct gnet_stats_basic_packed, but we can't depend on that. The
4128 * easiest thing to do is just to make a copy. */
4129 memset(&gsb, 0, sizeof gsb);
4130 memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
4131 MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
4132 stats->tx_bytes = gsb.bytes;
4133 stats->tx_packets = gsb.packets;
/* Queue drops are reported to callers as tx errors. */
4135 gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
4136 stats->tx_errors = gsq->drops;
4146 memset(stats, 0, sizeof *stats);
4151 /* Queries the kernel for class with identifier 'handle' and parent 'parent'
/* Sends RTM_GETTCLASS with NLM_F_ECHO so the kernel's reply (stored in
 * '*replyp') describes the class. */
4154 tc_query_class(const struct netdev *netdev,
4155 unsigned int handle, unsigned int parent,
4156 struct ofpbuf **replyp)
4158 struct ofpbuf request;
4159 struct tcmsg *tcmsg;
4162 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO, &request);
4166 tcmsg->tcm_handle = handle;
4167 tcmsg->tcm_parent = parent;
4169 error = tc_transact(&request, replyp);
/* Log failures in "major:minor" notation to match tc(8) output. */
4171 VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
4172 netdev_get_name(netdev),
4173 tc_get_major(handle), tc_get_minor(handle),
4174 tc_get_major(parent), tc_get_minor(parent),
4175 ovs_strerror(error));
4180 /* Equivalent to "tc class del dev <name> handle <handle>". */
4182 tc_delete_class(const struct netdev *netdev, unsigned int handle)
4184 struct ofpbuf request;
4185 struct tcmsg *tcmsg;
4188 tcmsg = tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
/* Parent 0: identify the class by handle alone. */
4192 tcmsg->tcm_handle = handle;
4193 tcmsg->tcm_parent = 0;
4195 error = tc_transact(&request, NULL);
4197 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
4198 netdev_get_name(netdev),
4199 tc_get_major(handle), tc_get_minor(handle),
4200 ovs_strerror(error));
4205 /* Equivalent to "tc qdisc del dev <name> root". */
4207 tc_del_qdisc(struct netdev *netdev_)
4209 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4210 struct ofpbuf request;
4211 struct tcmsg *tcmsg;
4214 tcmsg = tc_make_request(netdev_, RTM_DELQDISC, 0, &request);
/* 1:0 is the handle this module assigns to qdiscs it creates (see
 * tc_query_qdisc below); TC_H_ROOT addresses the root qdisc. */
4218 tcmsg->tcm_handle = tc_make_handle(1, 0);
4219 tcmsg->tcm_parent = TC_H_ROOT;
4221 error = tc_transact(&request, NULL);
4222 if (error == EINVAL) {
4223 /* EINVAL probably means that the default qdisc was in use, in which
4224 * case we've accomplished our purpose. */
/* On success, also tear down any cached tc state for this netdev. */
4227 if (!error && netdev->tc) {
4228 if (netdev->tc->ops->tc_destroy) {
4229 netdev->tc->ops->tc_destroy(netdev->tc);
4236 /* If 'netdev''s qdisc type and parameters are not yet known, queries the
4237 * kernel to determine what they are. Returns 0 if successful, otherwise a
4238 * positive errno value. */
4240 tc_query_qdisc(const struct netdev *netdev_)
4242 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4243 struct ofpbuf request, *qdisc;
4244 const struct tc_ops *ops;
4245 struct tcmsg *tcmsg;
4253 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
4254 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
4255 * 2.6.35 without that fix backported to it.
4257 * To avoid the OOPS, we must not make a request that would attempt to dump
4258 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
4259 * few others. There are a few ways that I can see to do this, but most of
4260 * them seem to be racy (and if you lose the race the kernel OOPSes). The
4261 * technique chosen here is to assume that any non-default qdisc that we
4262 * create will have a class with handle 1:0. The built-in qdiscs only have
4263 * a class with handle 0:0.
4265 * We could check for Linux 2.6.35+ and use a more straightforward method
4267 tcmsg = tc_make_request(netdev_, RTM_GETQDISC, NLM_F_ECHO, &request);
4271 tcmsg->tcm_handle = tc_make_handle(1, 0);
4272 tcmsg->tcm_parent = 0;
4274 /* Figure out what tc class to instantiate. */
4275 error = tc_transact(&request, &qdisc);
/* Map the kernel's qdisc kind string to one of our tc_ops
 * implementations; unknown kinds fall back to tc_ops_other. */
4279 error = tc_parse_qdisc(qdisc, &kind, NULL);
4281 ops = &tc_ops_other;
4283 ops = tc_lookup_linux_name(kind);
4285 static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
4286 VLOG_INFO_RL(&rl2, "unknown qdisc \"%s\"", kind);
4288 ops = &tc_ops_other;
4291 } else if (error == ENOENT) {
4292 /* Either it's a built-in qdisc, or it's a qdisc set up by some
4293 * other entity that doesn't have a handle 1:0. We will assume
4294 * that it's the system default qdisc. */
4295 ops = &tc_ops_default;
4298 /* Who knows? Maybe the device got deleted. */
4299 VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
4300 netdev_get_name(netdev_), ovs_strerror(error));
4301 ops = &tc_ops_other;
4304 /* Instantiate it. */
/* tc_load must set netdev->tc exactly when it succeeds; assert that. */
4305 load_error = ops->tc_load(CONST_CAST(struct netdev *, netdev_), qdisc);
4306 ovs_assert((load_error == 0) == (netdev->tc != NULL));
4307 ofpbuf_delete(qdisc);
/* Report the query error first; otherwise any load error. */
4309 return error ? error : load_error;
4312 /* Linux traffic control uses tables with 256 entries ("rtab" tables) to
4313 approximate the time to transmit packets of various lengths. For an MTU of
4314 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
4315 represents two possible packet lengths; for a MTU of 513 through 1024, four
4316 possible lengths; and so on.
4318 Returns, for the specified 'mtu', the number of bits that packet lengths
4319 need to be shifted right to fit within such a 256-entry table. */
4321 tc_calc_cell_log(unsigned int mtu)
/* An mtu of 0 (unknown) defaults to the Ethernet payload maximum. */
4326 mtu = ETH_PAYLOAD_MAX;
/* Account for L2 framing: Ethernet header plus a VLAN tag. */
4328 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
/* Smallest cell_log such that (mtu >> cell_log) < 256. */
4330 for (cell_log = 0; mtu >= 256; cell_log++) {
4337 /* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
4340 tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
4342 memset(rate, 0, sizeof *rate);
4343 rate->cell_log = tc_calc_cell_log(mtu);
4344 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
4345 /* rate->cell_align = 0; */ /* distro headers. */
/* Minimum policed unit: no packet is billed as smaller than a minimum
 * Ethernet frame. */
4346 rate->mpu = ETH_TOTAL_MIN;
4350 /* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
4351 * attribute of the specified "type".
4353 * See tc_calc_cell_log() above for a description of "rtab"s. */
4355 tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
4360 rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
/* Entry i covers packet sizes up to (i + 1) << cell_log bytes; each entry
 * holds the transmit time in ticks for that size. */
4361 for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
4362 unsigned packet_size = (i + 1) << rate->cell_log;
/* Clamp to the minimum policed unit, matching tc_fill_rate(). */
4363 if (packet_size < rate->mpu) {
4364 packet_size = rate->mpu;
4366 rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
4370 /* Calculates the proper value of 'buffer' or 'cbuffer' in HTB options given a
4371 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
4372 * burst size of 'burst_bytes'. (If no value was requested, a 'burst_bytes' of
4375 tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
/* The burst must cover at least one jiffy of traffic plus one full packet. */
4377 unsigned int min_burst = tc_buffer_per_jiffy(Bps) + mtu;
4378 return tc_bytes_to_ticks(Bps, MAX(burst_bytes, min_burst));
4381 /* Linux-only functions declared in netdev-linux.h */
4383 /* Modifies the 'flag' bit in ethtool's flags field for 'netdev'. If
4384 * 'enable' is true, the bit is set. Otherwise, it is cleared. */
4386 netdev_linux_ethtool_set_flag(struct netdev *netdev, uint32_t flag,
4387 const char *flag_name, bool enable)
4389 const char *netdev_name = netdev_get_name(netdev);
4390 struct ethtool_value evalue;
/* Read-modify-write: fetch the current flags word first. */
4394 COVERAGE_INC(netdev_get_ethtool);
4395 memset(&evalue, 0, sizeof evalue);
4396 error = netdev_linux_do_ethtool(netdev_name,
4397 (struct ethtool_cmd *)&evalue,
4398 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
/* Set or clear just the requested bit and write the word back. */
4403 COVERAGE_INC(netdev_set_ethtool);
4404 evalue.data = new_flags = (evalue.data & ~flag) | (enable ? flag : 0);
4405 error = netdev_linux_do_ethtool(netdev_name,
4406 (struct ethtool_cmd *)&evalue,
4407 ETHTOOL_SFLAGS, "ETHTOOL_SFLAGS");
/* Re-read to verify the driver actually accepted the change — some
 * drivers silently ignore ETHTOOL_SFLAGS. */
4412 COVERAGE_INC(netdev_get_ethtool);
4413 memset(&evalue, 0, sizeof evalue);
4414 error = netdev_linux_do_ethtool(netdev_name,
4415 (struct ethtool_cmd *)&evalue,
4416 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
4421 if (new_flags != evalue.data) {
4422 VLOG_WARN_RL(&rl, "attempt to %s ethtool %s flag on network "
4423 "device %s failed", enable ? "enable" : "disable",
4424 flag_name, netdev_name);
4431 /* Utility functions. */
4433 /* Copies 'src' into 'dst', performing format conversion in the process. */
/* Field-for-field widening copy from the kernel's 32-bit rtnl_link_stats
 * into OVS's 64-bit netdev_stats; no field is transformed. */
4435 netdev_stats_from_rtnl_link_stats(struct netdev_stats *dst,
4436 const struct rtnl_link_stats *src)
4438 dst->rx_packets = src->rx_packets;
4439 dst->tx_packets = src->tx_packets;
4440 dst->rx_bytes = src->rx_bytes;
4441 dst->tx_bytes = src->tx_bytes;
4442 dst->rx_errors = src->rx_errors;
4443 dst->tx_errors = src->tx_errors;
4444 dst->rx_dropped = src->rx_dropped;
4445 dst->tx_dropped = src->tx_dropped;
4446 dst->multicast = src->multicast;
4447 dst->collisions = src->collisions;
4448 dst->rx_length_errors = src->rx_length_errors;
4449 dst->rx_over_errors = src->rx_over_errors;
4450 dst->rx_crc_errors = src->rx_crc_errors;
4451 dst->rx_frame_errors = src->rx_frame_errors;
4452 dst->rx_fifo_errors = src->rx_fifo_errors;
4453 dst->rx_missed_errors = src->rx_missed_errors;
4454 dst->tx_aborted_errors = src->tx_aborted_errors;
4455 dst->tx_carrier_errors = src->tx_carrier_errors;
4456 dst->tx_fifo_errors = src->tx_fifo_errors;
4457 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
4458 dst->tx_window_errors = src->tx_window_errors;
/* Retrieves interface statistics for 'ifindex' by sending an RTM_GETLINK
 * request over a NETLINK_ROUTE socket and converting the IFLA_STATS
 * attribute of the reply into '*stats'. */
4462 get_stats_via_netlink(int ifindex, struct netdev_stats *stats)
4464 /* Policy for RTNLGRP_LINK messages.
4466 * There are *many* more fields in these messages, but currently we only
4467 * care about these fields. */
4468 static const struct nl_policy rtnlgrp_link_policy[] = {
4469 [IFLA_IFNAME] = { .type = NL_A_STRING, .optional = false },
4470 [IFLA_STATS] = { .type = NL_A_UNSPEC, .optional = true,
4471 .min_len = sizeof(struct rtnl_link_stats) },
4474 struct ofpbuf request;
4475 struct ofpbuf *reply;
4476 struct ifinfomsg *ifi;
4477 struct nlattr *attrs[ARRAY_SIZE(rtnlgrp_link_policy)];
/* Build and send the RTM_GETLINK request for just this ifindex. */
4480 ofpbuf_init(&request, 0);
4481 nl_msg_put_nlmsghdr(&request, sizeof *ifi, RTM_GETLINK, NLM_F_REQUEST);
4482 ifi = ofpbuf_put_zeros(&request, sizeof *ifi);
4483 ifi->ifi_family = PF_UNSPEC;
4484 ifi->ifi_index = ifindex;
4485 error = nl_transact(NETLINK_ROUTE, &request, &reply);
4486 ofpbuf_uninit(&request);
4491 if (!nl_policy_parse(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg),
4492 rtnlgrp_link_policy,
4493 attrs, ARRAY_SIZE(rtnlgrp_link_policy))) {
4494 ofpbuf_delete(reply);
/* IFLA_STATS is optional in the policy, so its absence must be handled. */
4498 if (!attrs[IFLA_STATS]) {
4499 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
4500 ofpbuf_delete(reply);
4504 netdev_stats_from_rtnl_link_stats(stats, nl_attr_get(attrs[IFLA_STATS]));
4506 ofpbuf_delete(reply);
/* Fallback statistics reader: scans /proc/net/dev line by line for
 * 'netdev_name' and parses its 15 counter columns into '*stats'. */
4512 get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats)
4514 static const char fn[] = "/proc/net/dev";
4519 stream = fopen(fn, "r");
4521 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, ovs_strerror(errno));
4526 while (fgets(line, sizeof line, stream)) {
4529 #define X64 "%"SCNu64
/* Columns marked "%*u" are present in the file but not kept. */
4532 X64 X64 X64 X64 X64 X64 X64 "%*u"
4533 X64 X64 X64 X64 X64 X64 X64 "%*u",
4539 &stats->rx_fifo_errors,
4540 &stats->rx_frame_errors,
4546 &stats->tx_fifo_errors,
4548 &stats->tx_carrier_errors) != 15) {
4549 VLOG_WARN_RL(&rl, "%s:%d: parse error", fn, ln);
4550 } else if (!strcmp(devname, netdev_name)) {
/* /proc/net/dev does not break out these counters; mark them
 * unavailable. */
4551 stats->rx_length_errors = UINT64_MAX;
4552 stats->rx_over_errors = UINT64_MAX;
4553 stats->rx_crc_errors = UINT64_MAX;
4554 stats->rx_missed_errors = UINT64_MAX;
4555 stats->tx_aborted_errors = UINT64_MAX;
4556 stats->tx_heartbeat_errors = UINT64_MAX;
4557 stats->tx_window_errors = UINT64_MAX;
4563 VLOG_WARN_RL(&rl, "%s: no stats for %s", fn, netdev_name);
/* Retrieves 'dev''s interface flags via the SIOCGIFFLAGS ioctl, storing them
 * in '*flags'. */
4569 get_flags(const struct netdev *dev, unsigned int *flags)
4575 error = af_inet_ifreq_ioctl(dev->name, &ifr, SIOCGIFFLAGS, "SIOCGIFFLAGS");
4577 *flags = ifr.ifr_flags;
/* Sets the interface flags of device 'name' to 'flags' via the SIOCSIFFLAGS
 * ioctl, returning the ioctl's result. */
4583 set_flags(const char *name, unsigned int flags)
4587 ifr.ifr_flags = flags;
4588 return af_inet_ifreq_ioctl(name, &ifr, SIOCSIFFLAGS, "SIOCSIFFLAGS");
/* Looks up the kernel ifindex for 'netdev_name' via SIOCGIFINDEX.
 * Returns the ifindex on success; on failure, logs a rate-limited warning
 * (the error path's return is outside this extract). */
4592 do_get_ifindex(const char *netdev_name)
4597 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4598 COVERAGE_INC(netdev_get_ifindex);
4600 error = af_inet_ioctl(SIOCGIFINDEX, &ifr);
4602 VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
4603 netdev_name, ovs_strerror(error));
4606 return ifr.ifr_ifindex;
/* Returns (via '*ifindexp') the cached ifindex of 'netdev_', querying the
 * kernel on first use. Both the ifindex and any lookup error are cached
 * under the VALID_IFINDEX bit, so repeated failures are not retried. */
4610 get_ifindex(const struct netdev *netdev_, int *ifindexp)
4612 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4614 if (!(netdev->cache_valid & VALID_IFINDEX)) {
4615 int ifindex = do_get_ifindex(netdev_get_name(netdev_));
/* do_get_ifindex() signals failure with a negative errno value. */
4618 netdev->get_ifindex_error = -ifindex;
4619 netdev->ifindex = 0;
4621 netdev->get_ifindex_error = 0;
4622 netdev->ifindex = ifindex;
4624 netdev->cache_valid |= VALID_IFINDEX;
4627 *ifindexp = netdev->ifindex;
4628 return netdev->get_ifindex_error;
/* Reads the hardware (Ethernet) address of device 'netdev_name' into 'ea'
 * via the SIOCGIFHWADDR ioctl. */
4632 get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN])
4638 memset(&ifr, 0, sizeof ifr);
4639 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4640 COVERAGE_INC(netdev_get_hwaddr);
4641 error = af_inet_ioctl(SIOCGIFHWADDR, &ifr);
4643 /* ENODEV probably means that a vif disappeared asynchronously and
4644 * hasn't been removed from the database yet, so reduce the log level
4645 * to INFO for that case. */
4646 VLOG(error == ENODEV ? VLL_INFO : VLL_ERR,
4647 "ioctl(SIOCGIFHWADDR) on %s device failed: %s",
4648 netdev_name, ovs_strerror(error));
/* Accept only Ethernet (or unspecified) address families; anything else
 * is warned about. */
4651 hwaddr_family = ifr.ifr_hwaddr.sa_family;
4652 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER) {
4653 VLOG_WARN("%s device has unknown hardware address family %d",
4654 netdev_name, hwaddr_family);
4656 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
/* Sets the hardware (Ethernet) address of device 'netdev_name' to 'mac'
 * via the SIOCSIFHWADDR ioctl. */
4661 set_etheraddr(const char *netdev_name,
4662 const uint8_t mac[ETH_ADDR_LEN])
4667 memset(&ifr, 0, sizeof ifr);
4668 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4669 ifr.ifr_hwaddr.sa_family = ARPHRD_ETHER;
4670 memcpy(ifr.ifr_hwaddr.sa_data, mac, ETH_ADDR_LEN);
4671 COVERAGE_INC(netdev_set_hwaddr);
4672 error = af_inet_ioctl(SIOCSIFHWADDR, &ifr);
4674 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
4675 netdev_name, ovs_strerror(error));
/* Issues ethtool command 'cmd' (named 'cmd_name' for logging) on device
 * 'name', with 'ecmd' as the in/out command buffer, via SIOCETHTOOL. */
4681 netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
4682 int cmd, const char *cmd_name)
4687 memset(&ifr, 0, sizeof ifr);
4688 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
/* ethtool passes its command struct through ifr_data. */
4689 ifr.ifr_data = (caddr_t) ecmd;
4692 error = af_inet_ioctl(SIOCETHTOOL, &ifr);
4694 if (error != EOPNOTSUPP) {
4695 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
4696 "failed: %s", cmd_name, name, ovs_strerror(error));
4698 /* The device doesn't support this operation. That's pretty
4699 * common, so there's no point in logging anything. */
/* Runs IPv4 address ioctl 'cmd' (named 'cmd_name' for logging) on 'netdev'
 * and, on success, stores the resulting address in '*ip'. */
4706 netdev_linux_get_ipv4(const struct netdev *netdev, struct in_addr *ip,
4707 int cmd, const char *cmd_name)
4712 ifr.ifr_addr.sa_family = AF_INET;
4713 error = af_inet_ifreq_ioctl(netdev_get_name(netdev), &ifr, cmd, cmd_name);
/* ifr_addr is a generic sockaddr; reinterpret it as sockaddr_in.
 * ALIGNED_CAST documents that the alignment is known to be OK. */
4715 const struct sockaddr_in *sin = ALIGNED_CAST(struct sockaddr_in *,
4717 *ip = sin->sin_addr;
4722 /* Returns an AF_PACKET raw socket or a negative errno value. */
4724 af_packet_sock(void)
4726 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
4729 if (ovsthread_once_start(&once)) {
4730 sock = socket(AF_PACKET, SOCK_RAW, 0);
4732 int error = set_nonblocking(sock);
4739 VLOG_ERR("failed to create packet socket: %s",
4740 ovs_strerror(errno));
4742 ovsthread_once_done(&once);