/*
 * Copyright (c) 2009, 2010, 2011, 2012, 2013 Nicira, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
19 #include "netdev-linux.h"
23 #include <arpa/inet.h>
25 #include <linux/filter.h>
26 #include <linux/gen_stats.h>
27 #include <linux/if_ether.h>
28 #include <linux/if_tun.h>
29 #include <linux/types.h>
30 #include <linux/ethtool.h>
31 #include <linux/mii.h>
32 #include <linux/pkt_cls.h>
33 #include <linux/pkt_sched.h>
34 #include <linux/rtnetlink.h>
35 #include <linux/sockios.h>
36 #include <linux/version.h>
37 #include <sys/types.h>
38 #include <sys/ioctl.h>
39 #include <sys/socket.h>
40 #include <netpacket/packet.h>
42 #include <net/if_arp.h>
43 #include <net/if_packet.h>
44 #include <net/route.h>
45 #include <netinet/in.h>
52 #include "dpif-linux.h"
53 #include "dynamic-string.h"
54 #include "fatal-signal.h"
57 #include "netdev-provider.h"
58 #include "netdev-vport.h"
59 #include "netlink-notifier.h"
60 #include "netlink-socket.h"
63 #include "openflow/openflow.h"
65 #include "poll-loop.h"
66 #include "rtnetlink-link.h"
68 #include "socket-util.h"
71 #include "unaligned.h"
74 VLOG_DEFINE_THIS_MODULE(netdev_linux);
76 COVERAGE_DEFINE(netdev_set_policing);
77 COVERAGE_DEFINE(netdev_arp_lookup);
78 COVERAGE_DEFINE(netdev_get_ifindex);
79 COVERAGE_DEFINE(netdev_get_hwaddr);
80 COVERAGE_DEFINE(netdev_set_hwaddr);
81 COVERAGE_DEFINE(netdev_get_ethtool);
82 COVERAGE_DEFINE(netdev_set_ethtool);
85 /* These were introduced in Linux 2.6.14, so they might be missing if we have
87 #ifndef ADVERTISED_Pause
88 #define ADVERTISED_Pause (1 << 13)
90 #ifndef ADVERTISED_Asym_Pause
91 #define ADVERTISED_Asym_Pause (1 << 14)
94 /* These were introduced in Linux 2.6.24, so they might be missing if we
95 * have old headers. */
96 #ifndef ETHTOOL_GFLAGS
97 #define ETHTOOL_GFLAGS 0x00000025 /* Get flags bitmap(ethtool_value) */
99 #ifndef ETHTOOL_SFLAGS
100 #define ETHTOOL_SFLAGS 0x00000026 /* Set flags bitmap(ethtool_value) */
103 /* This was introduced in Linux 2.6.25, so it might be missing if we have old
106 #define TC_RTAB_SIZE 1024
110 VALID_IFINDEX = 1 << 0,
111 VALID_ETHERADDR = 1 << 1,
115 VALID_POLICING = 1 << 5,
116 VALID_VPORT_STAT_ERROR = 1 << 6,
117 VALID_DRVINFO = 1 << 7,
118 VALID_FEATURES = 1 << 8,
121 /* Traffic control. */
123 /* An instance of a traffic control class. Always associated with a particular
126 * Each TC implementation subclasses this with whatever additional data it
129 const struct tc_ops *ops;
130 struct hmap queues; /* Contains "struct tc_queue"s.
131 * Read by generic TC layer.
132 * Written only by TC implementation. */
135 #define TC_INITIALIZER(TC, OPS) { OPS, HMAP_INITIALIZER(&(TC)->queues) }
137 /* One traffic control queue.
139 * Each TC implementation subclasses this with whatever additional data it
142 struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
143 unsigned int queue_id; /* OpenFlow queue ID. */
144 long long int created; /* Time queue was created, in msecs. */
147 /* A particular kind of traffic control. Each implementation generally maps to
148 * one particular Linux qdisc class.
150 * The functions below return 0 if successful or a positive errno value on
151 * failure, except where otherwise noted. All of them must be provided, except
152 * where otherwise noted. */
154 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
155 * This is null for tc_ops_default and tc_ops_other, for which there are no
156 * appropriate values. */
157 const char *linux_name;
159 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
160 const char *ovs_name;
162 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
163 * queues. The queues are numbered 0 through n_queues - 1. */
164 unsigned int n_queues;
166 /* Called to install this TC class on 'netdev'. The implementation should
167 * make the Netlink calls required to set up 'netdev' with the right qdisc
168 * and configure it according to 'details'. The implementation may assume
169 * that the current qdisc is the default; that is, there is no need for it
170 * to delete the current qdisc before installing itself.
172 * The contents of 'details' should be documented as valid for 'ovs_name'
173 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
174 * (which is built as ovs-vswitchd.conf.db(8)).
176 * This function must return 0 if and only if it sets 'netdev->tc' to an
177 * initialized 'struct tc'.
179 * (This function is null for tc_ops_other, which cannot be installed. For
180 * other TC classes it should always be nonnull.) */
181 int (*tc_install)(struct netdev *netdev, const struct smap *details);
183 /* Called when the netdev code determines (through a Netlink query) that
184 * this TC class's qdisc is installed on 'netdev', but we didn't install
185 * it ourselves and so don't know any of the details.
187 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
188 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
189 * implementation should parse the other attributes of 'nlmsg' as
190 * necessary to determine its configuration. If necessary it should also
191 * use Netlink queries to determine the configuration of queues on
194 * This function must return 0 if and only if it sets 'netdev->tc' to an
195 * initialized 'struct tc'. */
196 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
198 /* Destroys the data structures allocated by the implementation as part of
199 * 'tc'. (This includes destroying 'tc->queues' by calling
202 * The implementation should not need to perform any Netlink calls. If
203 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
204 * (But it may not be desirable.)
206 * This function may be null if 'tc' is trivial. */
207 void (*tc_destroy)(struct tc *tc);
209 /* Retrieves details of 'netdev->tc' configuration into 'details'.
211 * The implementation should not need to perform any Netlink calls, because
212 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
213 * cached the configuration.
215 * The contents of 'details' should be documented as valid for 'ovs_name'
216 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
217 * (which is built as ovs-vswitchd.conf.db(8)).
219 * This function may be null if 'tc' is not configurable.
221 int (*qdisc_get)(const struct netdev *netdev, struct smap *details);
223 /* Reconfigures 'netdev->tc' according to 'details', performing any
224 * required Netlink calls to complete the reconfiguration.
226 * The contents of 'details' should be documented as valid for 'ovs_name'
227 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
228 * (which is built as ovs-vswitchd.conf.db(8)).
230 * This function may be null if 'tc' is not configurable.
232 int (*qdisc_set)(struct netdev *, const struct smap *details);
234 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
235 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
237 * The contents of 'details' should be documented as valid for 'ovs_name'
238 * in the "other_config" column in the "Queue" table in
239 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
241 * The implementation should not need to perform any Netlink calls, because
242 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
243 * cached the queue configuration.
245 * This function may be null if 'tc' does not have queues ('n_queues' is
247 int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
248 struct smap *details);
250 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
251 * 'details', perfoming any required Netlink calls to complete the
252 * reconfiguration. The caller ensures that 'queue_id' is less than
255 * The contents of 'details' should be documented as valid for 'ovs_name'
256 * in the "other_config" column in the "Queue" table in
257 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
259 * This function may be null if 'tc' does not have queues or its queues are
260 * not configurable. */
261 int (*class_set)(struct netdev *, unsigned int queue_id,
262 const struct smap *details);
264 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
265 * tc_queue's within 'netdev->tc->queues'.
267 * This function may be null if 'tc' does not have queues or its queues
268 * cannot be deleted. */
269 int (*class_delete)(struct netdev *, struct tc_queue *queue);
271 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
272 * 'struct tc_queue's within 'netdev->tc->queues'.
274 * On success, initializes '*stats'.
276 * This function may be null if 'tc' does not have queues or if it cannot
277 * report queue statistics. */
278 int (*class_get_stats)(const struct netdev *netdev,
279 const struct tc_queue *queue,
280 struct netdev_queue_stats *stats);
282 /* Extracts queue stats from 'nlmsg', which is a response to a
283 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
285 * This function may be null if 'tc' does not have queues or if it cannot
286 * report queue statistics. */
287 int (*class_dump_stats)(const struct netdev *netdev,
288 const struct ofpbuf *nlmsg,
289 netdev_dump_queue_stats_cb *cb, void *aux);
293 tc_init(struct tc *tc, const struct tc_ops *ops)
296 hmap_init(&tc->queues);
300 tc_destroy(struct tc *tc)
302 hmap_destroy(&tc->queues);
305 static const struct tc_ops tc_ops_htb;
306 static const struct tc_ops tc_ops_hfsc;
307 static const struct tc_ops tc_ops_default;
308 static const struct tc_ops tc_ops_other;
310 static const struct tc_ops *const tcs[] = {
311 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
312 &tc_ops_hfsc, /* Hierarchical fair service curve. */
313 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
314 &tc_ops_other, /* Some other qdisc. */
318 static unsigned int tc_make_handle(unsigned int major, unsigned int minor);
319 static unsigned int tc_get_major(unsigned int handle);
320 static unsigned int tc_get_minor(unsigned int handle);
322 static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
323 static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
324 static unsigned int tc_buffer_per_jiffy(unsigned int rate);
326 static struct tcmsg *tc_make_request(const struct netdev *, int type,
327 unsigned int flags, struct ofpbuf *);
328 static int tc_transact(struct ofpbuf *request, struct ofpbuf **replyp);
329 static int tc_add_del_ingress_qdisc(struct netdev *netdev, bool add);
330 static int tc_add_policer(struct netdev *netdev, int kbits_rate,
333 static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
334 struct nlattr **options);
335 static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
336 struct nlattr **options,
337 struct netdev_queue_stats *);
338 static int tc_query_class(const struct netdev *,
339 unsigned int handle, unsigned int parent,
340 struct ofpbuf **replyp);
341 static int tc_delete_class(const struct netdev *, unsigned int handle);
343 static int tc_del_qdisc(struct netdev *netdev);
344 static int tc_query_qdisc(const struct netdev *netdev);
346 static int tc_calc_cell_log(unsigned int mtu);
347 static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
348 static void tc_put_rtab(struct ofpbuf *, uint16_t type,
349 const struct tc_ratespec *rate);
350 static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
352 struct netdev_linux {
355 /* Protects all members below. */
356 struct ovs_mutex mutex;
358 unsigned int cache_valid;
359 unsigned int change_seq;
361 bool miimon; /* Link status of last poll. */
362 long long int miimon_interval; /* Miimon Poll rate. Disabled if <= 0. */
363 struct timer miimon_timer;
365 /* The following are figured out "on demand" only. They are only valid
366 * when the corresponding VALID_* bit in 'cache_valid' is set. */
368 uint8_t etheraddr[ETH_ADDR_LEN];
369 struct in_addr address, netmask;
372 unsigned int ifi_flags;
373 long long int carrier_resets;
374 uint32_t kbits_rate; /* Policing data. */
375 uint32_t kbits_burst;
376 int vport_stats_error; /* Cached error code from vport_get_stats().
377 0 or an errno value. */
378 int netdev_mtu_error; /* Cached error code from SIOCGIFMTU or SIOCSIFMTU. */
379 int ether_addr_error; /* Cached error code from set/get etheraddr. */
380 int netdev_policing_error; /* Cached error code from set policing. */
381 int get_features_error; /* Cached error code from ETHTOOL_GSET. */
382 int get_ifindex_error; /* Cached error code from SIOCGIFINDEX. */
384 enum netdev_features current; /* Cached from ETHTOOL_GSET. */
385 enum netdev_features advertised; /* Cached from ETHTOOL_GSET. */
386 enum netdev_features supported; /* Cached from ETHTOOL_GSET. */
388 struct ethtool_drvinfo drvinfo; /* Cached from ETHTOOL_GDRVINFO. */
391 /* For devices of class netdev_tap_class only. */
395 struct netdev_rx_linux {
401 /* This is set pretty low because we probably won't learn anything from the
402 * additional log messages. */
403 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
405 static void netdev_linux_run(void);
407 static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
408 int cmd, const char *cmd_name);
409 static int netdev_linux_get_ipv4(const struct netdev *, struct in_addr *,
410 int cmd, const char *cmd_name);
411 static int get_flags(const struct netdev *, unsigned int *flags);
412 static int set_flags(const char *, unsigned int flags);
413 static int update_flags(struct netdev_linux *netdev, enum netdev_flags off,
414 enum netdev_flags on, enum netdev_flags *old_flagsp)
415 OVS_REQUIRES(netdev->mutex);
416 static int do_get_ifindex(const char *netdev_name);
417 static int get_ifindex(const struct netdev *, int *ifindexp);
418 static int do_set_addr(struct netdev *netdev,
419 int ioctl_nr, const char *ioctl_name,
420 struct in_addr addr);
421 static int get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN]);
422 static int set_etheraddr(const char *netdev_name, const uint8_t[ETH_ADDR_LEN]);
423 static int get_stats_via_netlink(int ifindex, struct netdev_stats *stats);
424 static int get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats);
425 static int af_packet_sock(void);
426 static void netdev_linux_miimon_run(void);
427 static void netdev_linux_miimon_wait(void);
430 is_netdev_linux_class(const struct netdev_class *netdev_class)
432 return netdev_class->run == netdev_linux_run;
436 is_tap_netdev(const struct netdev *netdev)
438 return netdev_get_class(netdev) == &netdev_tap_class;
441 static struct netdev_linux *
442 netdev_linux_cast(const struct netdev *netdev)
444 ovs_assert(is_netdev_linux_class(netdev_get_class(netdev)));
446 return CONTAINER_OF(netdev, struct netdev_linux, up);
449 static struct netdev_rx_linux *
450 netdev_rx_linux_cast(const struct netdev_rx *rx)
452 ovs_assert(is_netdev_linux_class(netdev_get_class(rx->netdev)));
453 return CONTAINER_OF(rx, struct netdev_rx_linux, up);
456 static void netdev_linux_update(struct netdev_linux *netdev,
457 const struct rtnetlink_link_change *)
458 OVS_REQUIRES(netdev->mutex);
459 static void netdev_linux_changed(struct netdev_linux *netdev,
460 unsigned int ifi_flags, unsigned int mask)
461 OVS_REQUIRES(netdev->mutex);
463 /* Returns a NETLINK_ROUTE socket listening for RTNLGRP_LINK changes, or NULL
464 * if no such socket could be created. */
465 static struct nl_sock *
466 netdev_linux_notify_sock(void)
468 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
469 static struct nl_sock *sock;
471 if (ovsthread_once_start(&once)) {
474 error = nl_sock_create(NETLINK_ROUTE, &sock);
476 error = nl_sock_join_mcgroup(sock, RTNLGRP_LINK);
478 nl_sock_destroy(sock);
482 ovsthread_once_done(&once);
489 netdev_linux_run(void)
491 struct nl_sock *sock;
494 netdev_linux_miimon_run();
496 sock = netdev_linux_notify_sock();
502 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
503 uint64_t buf_stub[4096 / 8];
506 ofpbuf_use_stub(&buf, buf_stub, sizeof buf_stub);
507 error = nl_sock_recv(sock, &buf, false);
509 struct rtnetlink_link_change change;
511 if (rtnetlink_link_parse(&buf, &change)) {
512 struct netdev *netdev_ = netdev_from_name(change.ifname);
513 if (netdev_ && is_netdev_linux_class(netdev_->netdev_class)) {
514 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
516 ovs_mutex_lock(&netdev->mutex);
517 netdev_linux_update(netdev, &change);
518 ovs_mutex_unlock(&netdev->mutex);
520 netdev_close(netdev_);
522 } else if (error == ENOBUFS) {
523 struct shash device_shash;
524 struct shash_node *node;
528 shash_init(&device_shash);
529 netdev_get_devices(&netdev_linux_class, &device_shash);
530 SHASH_FOR_EACH (node, &device_shash) {
531 struct netdev *netdev_ = node->data;
532 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
535 ovs_mutex_lock(&netdev->mutex);
536 get_flags(netdev_, &flags);
537 netdev_linux_changed(netdev, flags, 0);
538 ovs_mutex_unlock(&netdev->mutex);
540 netdev_close(netdev_);
542 shash_destroy(&device_shash);
543 } else if (error != EAGAIN) {
544 VLOG_WARN_RL(&rl, "error reading or parsing netlink (%s)",
545 ovs_strerror(error));
/* 'wait' callback: arms the poll loop to wake when miimon timers expire or
 * the rtnetlink notification socket becomes readable. */
static void
netdev_linux_wait(void)
{
    struct nl_sock *sock;

    netdev_linux_miimon_wait();
    sock = netdev_linux_notify_sock();
    if (sock) {
        nl_sock_wait(sock, POLLIN);
    }
}
564 netdev_linux_changed(struct netdev_linux *dev,
565 unsigned int ifi_flags, unsigned int mask)
566 OVS_REQUIRES(dev->mutex)
569 if (!dev->change_seq) {
573 if ((dev->ifi_flags ^ ifi_flags) & IFF_RUNNING) {
574 dev->carrier_resets++;
576 dev->ifi_flags = ifi_flags;
578 dev->cache_valid &= mask;
582 netdev_linux_update(struct netdev_linux *dev,
583 const struct rtnetlink_link_change *change)
584 OVS_REQUIRES(dev->mutex)
586 if (change->nlmsg_type == RTM_NEWLINK) {
588 netdev_linux_changed(dev, change->ifi_flags, VALID_DRVINFO);
590 /* Update netdev from rtnl-change msg. */
592 dev->mtu = change->mtu;
593 dev->cache_valid |= VALID_MTU;
594 dev->netdev_mtu_error = 0;
597 if (!eth_addr_is_zero(change->addr)) {
598 memcpy(dev->etheraddr, change->addr, ETH_ADDR_LEN);
599 dev->cache_valid |= VALID_ETHERADDR;
600 dev->ether_addr_error = 0;
603 dev->ifindex = change->ifi_index;
604 dev->cache_valid |= VALID_IFINDEX;
605 dev->get_ifindex_error = 0;
608 netdev_linux_changed(dev, change->ifi_flags, 0);
612 static struct netdev *
613 netdev_linux_alloc(void)
615 struct netdev_linux *netdev = xzalloc(sizeof *netdev);
620 netdev_linux_common_construct(struct netdev_linux *netdev)
622 ovs_mutex_init(&netdev->mutex);
623 netdev->change_seq = 1;
626 /* Creates system and internal devices. */
628 netdev_linux_construct(struct netdev *netdev_)
630 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
633 netdev_linux_common_construct(netdev);
635 error = get_flags(&netdev->up, &netdev->ifi_flags);
636 if (error == ENODEV) {
637 if (netdev->up.netdev_class != &netdev_internal_class) {
638 /* The device does not exist, so don't allow it to be opened. */
641 /* "Internal" netdevs have to be created as netdev objects before
642 * they exist in the kernel, because creating them in the kernel
643 * happens by passing a netdev object to dpif_port_add().
644 * Therefore, ignore the error. */
651 /* For most types of netdevs we open the device for each call of
652 * netdev_open(). However, this is not the case with tap devices,
653 * since it is only possible to open the device once. In this
654 * situation we share a single file descriptor, and consequently
655 * buffers, across all readers. Therefore once data is read it will
656 * be unavailable to other reads for tap devices. */
658 netdev_linux_construct_tap(struct netdev *netdev_)
660 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
661 static const char tap_dev[] = "/dev/net/tun";
662 const char *name = netdev_->name;
666 netdev_linux_common_construct(netdev);
668 /* Open tap device. */
669 netdev->tap_fd = open(tap_dev, O_RDWR);
670 if (netdev->tap_fd < 0) {
672 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, ovs_strerror(error));
676 /* Create tap device. */
677 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
678 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
679 if (ioctl(netdev->tap_fd, TUNSETIFF, &ifr) == -1) {
680 VLOG_WARN("%s: creating tap device failed: %s", name,
681 ovs_strerror(errno));
686 /* Make non-blocking. */
687 error = set_nonblocking(netdev->tap_fd);
695 close(netdev->tap_fd);
700 netdev_linux_destruct(struct netdev *netdev_)
702 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
704 if (netdev->tc && netdev->tc->ops->tc_destroy) {
705 netdev->tc->ops->tc_destroy(netdev->tc);
708 if (netdev_get_class(netdev_) == &netdev_tap_class
709 && netdev->tap_fd >= 0)
711 close(netdev->tap_fd);
714 ovs_mutex_destroy(&netdev->mutex);
/* 'dealloc' callback: frees the storage allocated by netdev_linux_alloc(). */
static void
netdev_linux_dealloc(struct netdev *netdev_)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    free(netdev);
}
724 static struct netdev_rx *
725 netdev_linux_rx_alloc(void)
727 struct netdev_rx_linux *rx = xzalloc(sizeof *rx);
732 netdev_linux_rx_construct(struct netdev_rx *rx_)
734 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
735 struct netdev *netdev_ = rx->up.netdev;
736 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
739 ovs_mutex_lock(&netdev->mutex);
740 rx->is_tap = is_tap_netdev(netdev_);
742 rx->fd = netdev->tap_fd;
744 struct sockaddr_ll sll;
746 /* Result of tcpdump -dd inbound */
747 static const struct sock_filter filt[] = {
748 { 0x28, 0, 0, 0xfffff004 }, /* ldh [0] */
749 { 0x15, 0, 1, 0x00000004 }, /* jeq #4 jt 2 jf 3 */
750 { 0x6, 0, 0, 0x00000000 }, /* ret #0 */
751 { 0x6, 0, 0, 0x0000ffff } /* ret #65535 */
753 static const struct sock_fprog fprog = {
754 ARRAY_SIZE(filt), (struct sock_filter *) filt
757 /* Create file descriptor. */
758 rx->fd = socket(PF_PACKET, SOCK_RAW, 0);
761 VLOG_ERR("failed to create raw socket (%s)", ovs_strerror(error));
765 /* Set non-blocking mode. */
766 error = set_nonblocking(rx->fd);
771 /* Get ethernet device index. */
772 error = get_ifindex(&netdev->up, &ifindex);
777 /* Bind to specific ethernet device. */
778 memset(&sll, 0, sizeof sll);
779 sll.sll_family = AF_PACKET;
780 sll.sll_ifindex = ifindex;
781 sll.sll_protocol = (OVS_FORCE unsigned short int) htons(ETH_P_ALL);
782 if (bind(rx->fd, (struct sockaddr *) &sll, sizeof sll) < 0) {
784 VLOG_ERR("%s: failed to bind raw socket (%s)",
785 netdev_get_name(netdev_), ovs_strerror(error));
789 /* Filter for only inbound packets. */
790 error = setsockopt(rx->fd, SOL_SOCKET, SO_ATTACH_FILTER, &fprog,
794 VLOG_ERR("%s: failed to attach filter (%s)",
795 netdev_get_name(netdev_), ovs_strerror(error));
799 ovs_mutex_unlock(&netdev->mutex);
807 ovs_mutex_unlock(&netdev->mutex);
812 netdev_linux_rx_destruct(struct netdev_rx *rx_)
814 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
/* 'rx_dealloc' callback: frees the storage from netdev_linux_rx_alloc(). */
static void
netdev_linux_rx_dealloc(struct netdev_rx *rx_)
{
    struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);

    free(rx);
}
830 netdev_linux_rx_recv(struct netdev_rx *rx_, void *data, size_t size)
832 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
837 ? read(rx->fd, data, size)
838 : recv(rx->fd, data, size, MSG_TRUNC));
839 } while (retval < 0 && errno == EINTR);
842 return retval > size ? -EMSGSIZE : retval;
844 if (errno != EAGAIN) {
845 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
846 ovs_strerror(errno), netdev_rx_get_name(rx_));
853 netdev_linux_rx_wait(struct netdev_rx *rx_)
855 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
856 poll_fd_wait(rx->fd, POLLIN);
860 netdev_linux_rx_drain(struct netdev_rx *rx_)
862 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
865 int error = af_inet_ifreq_ioctl(netdev_rx_get_name(rx_), &ifr,
866 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
870 drain_fd(rx->fd, ifr.ifr_qlen);
873 return drain_rcvbuf(rx->fd);
877 /* Sends 'buffer' on 'netdev'. Returns 0 if successful, otherwise a positive
878 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
879 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
880 * the packet is too big or too small to transmit on the device.
882 * The caller retains ownership of 'buffer' in all cases.
884 * The kernel maintains a packet transmission queue, so the caller is not
885 * expected to do additional queuing of packets. */
887 netdev_linux_send(struct netdev *netdev_, const void *data, size_t size)
892 if (!is_tap_netdev(netdev_)) {
893 /* Use our AF_PACKET socket to send to this device. */
894 struct sockaddr_ll sll;
900 sock = af_packet_sock();
905 ifindex = netdev_get_ifindex(netdev_);
910 /* We don't bother setting most fields in sockaddr_ll because the
911 * kernel ignores them for SOCK_RAW. */
912 memset(&sll, 0, sizeof sll);
913 sll.sll_family = AF_PACKET;
914 sll.sll_ifindex = ifindex;
916 iov.iov_base = CONST_CAST(void *, data);
920 msg.msg_namelen = sizeof sll;
923 msg.msg_control = NULL;
924 msg.msg_controllen = 0;
927 retval = sendmsg(sock, &msg, 0);
929 /* Use the tap fd to send to this device. This is essential for
930 * tap devices, because packets sent to a tap device with an
931 * AF_PACKET socket will loop back to be *received* again on the
932 * tap device. This doesn't occur on other interface types
933 * because we attach a socket filter to the rx socket. */
934 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
936 retval = write(netdev->tap_fd, data, size);
940 /* The Linux AF_PACKET implementation never blocks waiting for room
941 * for packets, instead returning ENOBUFS. Translate this into
942 * EAGAIN for the caller. */
943 if (errno == ENOBUFS) {
945 } else if (errno == EINTR) {
947 } else if (errno != EAGAIN) {
948 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
949 netdev_get_name(netdev_), ovs_strerror(errno));
952 } else if (retval != size) {
953 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%zd bytes of "
954 "%zu) on %s", retval, size, netdev_get_name(netdev_));
/* Registers with the poll loop to wake up from the next call to poll_block()
 * when the packet transmission queue has sufficient room to transmit a packet
 * with netdev_send().
 *
 * The kernel maintains a packet transmission queue, so the client is not
 * expected to do additional queuing of packets.  Thus, this function is
 * unlikely to ever be used.  It is included for completeness. */
static void
netdev_linux_send_wait(struct netdev *netdev)
{
    if (is_tap_netdev(netdev)) {
        /* TAP device always accepts packets.*/
        poll_immediate_wake();
    }
}
978 /* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
979 * otherwise a positive errno value. */
981 netdev_linux_set_etheraddr(struct netdev *netdev_,
982 const uint8_t mac[ETH_ADDR_LEN])
984 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
985 enum netdev_flags old_flags = 0;
988 ovs_mutex_lock(&netdev->mutex);
990 if (netdev->cache_valid & VALID_ETHERADDR) {
991 error = netdev->ether_addr_error;
992 if (error || eth_addr_equals(netdev->etheraddr, mac)) {
995 netdev->cache_valid &= ~VALID_ETHERADDR;
998 /* Tap devices must be brought down before setting the address. */
999 if (is_tap_netdev(netdev_)) {
1000 update_flags(netdev, NETDEV_UP, 0, &old_flags);
1002 error = set_etheraddr(netdev_get_name(netdev_), mac);
1003 if (!error || error == ENODEV) {
1004 netdev->ether_addr_error = error;
1005 netdev->cache_valid |= VALID_ETHERADDR;
1007 memcpy(netdev->etheraddr, mac, ETH_ADDR_LEN);
1011 if (is_tap_netdev(netdev_) && old_flags & NETDEV_UP) {
1012 update_flags(netdev, 0, NETDEV_UP, &old_flags);
1016 ovs_mutex_unlock(&netdev->mutex);
1020 /* Copies 'netdev''s MAC address to 'mac' which is passed as param. */
1022 netdev_linux_get_etheraddr(const struct netdev *netdev_,
1023 uint8_t mac[ETH_ADDR_LEN])
1025 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1028 ovs_mutex_lock(&netdev->mutex);
1029 if (!(netdev->cache_valid & VALID_ETHERADDR)) {
1030 netdev->ether_addr_error = get_etheraddr(netdev_get_name(netdev_),
1032 netdev->cache_valid |= VALID_ETHERADDR;
1035 error = netdev->ether_addr_error;
1037 memcpy(mac, netdev->etheraddr, ETH_ADDR_LEN);
1039 ovs_mutex_unlock(&netdev->mutex);
1045 netdev_linux_get_mtu__(struct netdev_linux *netdev, int *mtup)
1049 if (!(netdev->cache_valid & VALID_MTU)) {
1052 netdev->netdev_mtu_error = af_inet_ifreq_ioctl(
1053 netdev_get_name(&netdev->up), &ifr, SIOCGIFMTU, "SIOCGIFMTU");
1054 netdev->mtu = ifr.ifr_mtu;
1055 netdev->cache_valid |= VALID_MTU;
1058 error = netdev->netdev_mtu_error;
1060 *mtup = netdev->mtu;
1066 /* Returns the maximum size of transmitted (and received) packets on 'netdev',
1067 * in bytes, not including the hardware header; thus, this is typically 1500
1068 * bytes for Ethernet devices. */
1070 netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
1072 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1075 ovs_mutex_lock(&netdev->mutex);
1076 error = netdev_linux_get_mtu__(netdev, mtup);
1077 ovs_mutex_unlock(&netdev->mutex);
1082 /* Sets the maximum size of transmitted (MTU) for given device using linux
1083 * networking ioctl interface.
1086 netdev_linux_set_mtu(const struct netdev *netdev_, int mtu)
1088 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1092 ovs_mutex_lock(&netdev->mutex);
1093 if (netdev->cache_valid & VALID_MTU) {
1094 error = netdev->netdev_mtu_error;
1095 if (error || netdev->mtu == mtu) {
1098 netdev->cache_valid &= ~VALID_MTU;
1101 error = af_inet_ifreq_ioctl(netdev_get_name(netdev_), &ifr,
1102 SIOCSIFMTU, "SIOCSIFMTU");
1103 if (!error || error == ENODEV) {
1104 netdev->netdev_mtu_error = error;
1105 netdev->mtu = ifr.ifr_mtu;
1106 netdev->cache_valid |= VALID_MTU;
1109 ovs_mutex_unlock(&netdev->mutex);
1113 /* Returns the ifindex of 'netdev', if successful, as a positive number.
1114 * On failure, returns a negative errno value. */
1116 netdev_linux_get_ifindex(const struct netdev *netdev_)
1118 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1121 ovs_mutex_lock(&netdev->mutex);
1122 error = get_ifindex(netdev_, &ifindex);
1123 ovs_mutex_unlock(&netdev->mutex);
1125 return error ? -error : ifindex;
1129 netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
1131 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1133 ovs_mutex_lock(&netdev->mutex);
1134 if (netdev->miimon_interval > 0) {
1135 *carrier = netdev->miimon;
1137 *carrier = (netdev->ifi_flags & IFF_RUNNING) != 0;
1139 ovs_mutex_unlock(&netdev->mutex);
1144 static long long int
1145 netdev_linux_get_carrier_resets(const struct netdev *netdev_)
1147 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1148 long long int carrier_resets;
1150 ovs_mutex_lock(&netdev->mutex);
1151 carrier_resets = netdev->carrier_resets;
1152 ovs_mutex_unlock(&netdev->mutex);
1154 return carrier_resets;
1158 netdev_linux_do_miimon(const char *name, int cmd, const char *cmd_name,
1159 struct mii_ioctl_data *data)
1164 memset(&ifr, 0, sizeof ifr);
1165 memcpy(&ifr.ifr_data, data, sizeof *data);
1166 error = af_inet_ifreq_ioctl(name, &ifr, cmd, cmd_name);
1167 memcpy(data, &ifr.ifr_data, sizeof *data);
/* Queries the link state of device 'name' into '*miimon': first via the MII
 * management registers (SIOCGMIIPHY then SIOCGMIIREG reading the BMSR
 * link-status bit), then via ETHTOOL_GLINK if the MII ioctls fail. */
1173 netdev_linux_get_miimon(const char *name, bool *miimon)
1175     struct mii_ioctl_data data;
1180     memset(&data, 0, sizeof data);
1181     error = netdev_linux_do_miimon(name, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
1183         /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
1184         data.reg_num = MII_BMSR;
1185         error = netdev_linux_do_miimon(name, SIOCGMIIREG, "SIOCGMIIREG",
1189             *miimon = !!(data.val_out & BMSR_LSTATUS);
1191             VLOG_WARN_RL(&rl, "%s: failed to query MII", name);
     /* MII unavailable on this device: fall back to the ethtool link query. */
1194         struct ethtool_cmd ecmd;
1196         VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
1199         COVERAGE_INC(netdev_get_ethtool);
1200         memset(&ecmd, 0, sizeof ecmd);
1201         error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
     /* ETHTOOL_GLINK replies with a struct ethtool_value in the same buffer. */
1204             struct ethtool_value eval;
1206             memcpy(&eval, &ecmd, sizeof eval);
1207             *miimon = !!eval.data;
1209             VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
1217 netdev_linux_set_miimon_interval(struct netdev *netdev_,
1218 long long int interval)
1220 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1222 ovs_mutex_lock(&netdev->mutex);
1223 interval = interval > 0 ? MAX(interval, 100) : 0;
1224 if (netdev->miimon_interval != interval) {
1225 netdev->miimon_interval = interval;
1226 timer_set_expired(&netdev->miimon_timer);
1228 ovs_mutex_unlock(&netdev->mutex);
/* Periodic poller: for every netdev-linux device with miimon enabled whose
 * timer has expired, re-reads the link state and signals a change
 * notification when it differs from the cached value. */
1234 netdev_linux_miimon_run(void)
1236     struct shash device_shash;
1237     struct shash_node *node;
1239     shash_init(&device_shash);
1240     netdev_get_devices(&netdev_linux_class, &device_shash);
1241     SHASH_FOR_EACH (node, &device_shash) {
1242         struct netdev *netdev = node->data;
1243         struct netdev_linux *dev = netdev_linux_cast(netdev);
1246         ovs_mutex_lock(&dev->mutex);
1247         if (dev->miimon_interval > 0 && timer_expired(&dev->miimon_timer)) {
1248             netdev_linux_get_miimon(dev->up.name, &miimon);
1249             if (miimon != dev->miimon) {
1250                 dev->miimon = miimon;
1251                 netdev_linux_changed(dev, dev->ifi_flags, 0);
     /* Re-arm the timer for the next polling round. */
1254             timer_set_duration(&dev->miimon_timer, dev->miimon_interval);
1256         ovs_mutex_unlock(&dev->mutex);
     /* Drop the reference taken by netdev_get_devices(). */
1257         netdev_close(netdev);
1260     shash_destroy(&device_shash);
1264 netdev_linux_miimon_wait(void)
1266 struct shash device_shash;
1267 struct shash_node *node;
1269 shash_init(&device_shash);
1270 netdev_get_devices(&netdev_linux_class, &device_shash);
1271 SHASH_FOR_EACH (node, &device_shash) {
1272 struct netdev *netdev = node->data;
1273 struct netdev_linux *dev = netdev_linux_cast(netdev);
1275 ovs_mutex_lock(&dev->mutex);
1276 if (dev->miimon_interval > 0) {
1277 timer_wait(&dev->miimon_timer);
1279 ovs_mutex_unlock(&dev->mutex);
1280 netdev_close(netdev);
1282 shash_destroy(&device_shash);
1285 /* Check whether we can we use RTM_GETLINK to get network device statistics.
1286  * In pre-2.6.19 kernels, this was only available if wireless extensions were
 * built in.  Probes the loopback device once and returns nonzero iff
 * netlink stats work on this kernel. */
1289 check_for_working_netlink_stats(void)
1291     /* Decide on the netdev_get_stats() implementation to use. Netlink is
1292      * preferable, so if that works, we'll use it. */
1293     int ifindex = do_get_ifindex("lo");
1295         VLOG_WARN("failed to get ifindex for lo, "
1296                   "obtaining netdev stats from proc");
1299         struct netdev_stats stats;
1300         int error = get_stats_via_netlink(ifindex, &stats);
1302             VLOG_DBG("obtaining netdev stats via rtnetlink");
1305             VLOG_INFO("RTM_GETLINK failed (%s), obtaining netdev stats "
1306                       "via proc (you are probably running a pre-2.6.19 "
1307                       "kernel)", ovs_strerror(error));
/* Exchanges the values of '*a' and '*b'. */
static void
swap_uint64(uint64_t *a, uint64_t *b)
{
    uint64_t tmp = *a;
    *a = *b;
    *b = tmp;
}
1321 /* Copies 'src' into 'dst', performing format conversion in the process.
1323  * 'src' is allowed to be misaligned. */
1325 netdev_stats_from_ovs_vport_stats(struct netdev_stats *dst,
1326                                   const struct ovs_vport_stats *src)
     /* get_unaligned_u64() tolerates the packed/misaligned netlink payload. */
1328     dst->rx_packets = get_unaligned_u64(&src->rx_packets);
1329     dst->tx_packets = get_unaligned_u64(&src->tx_packets);
1330     dst->rx_bytes = get_unaligned_u64(&src->rx_bytes);
1331     dst->tx_bytes = get_unaligned_u64(&src->tx_bytes);
1332     dst->rx_errors = get_unaligned_u64(&src->rx_errors);
1333     dst->tx_errors = get_unaligned_u64(&src->tx_errors);
1334     dst->rx_dropped = get_unaligned_u64(&src->rx_dropped);
1335     dst->tx_dropped = get_unaligned_u64(&src->tx_dropped);
     /* The vport layer does not track these fine-grained error counters;
      * report them as zero rather than leaving them uninitialized. */
1337     dst->collisions = 0;
1338     dst->rx_length_errors = 0;
1339     dst->rx_over_errors = 0;
1340     dst->rx_crc_errors = 0;
1341     dst->rx_frame_errors = 0;
1342     dst->rx_fifo_errors = 0;
1343     dst->rx_missed_errors = 0;
1344     dst->tx_aborted_errors = 0;
1345     dst->tx_carrier_errors = 0;
1346     dst->tx_fifo_errors = 0;
1347     dst->tx_heartbeat_errors = 0;
1348     dst->tx_window_errors = 0;
/* Fetches vport-layer stats for 'netdev' into '*stats' via the Linux
 * datapath.  Returns 0 on success or a positive errno value. */
1352 get_stats_via_vport__(const struct netdev *netdev, struct netdev_stats *stats)
1354     struct dpif_linux_vport reply;
1358     error = dpif_linux_vport_get(netdev_get_name(netdev), &reply, &buf);
     /* A reply without a stats attribute means the vport has none to report. */
1361     } else if (!reply.stats) {
1366     netdev_stats_from_ovs_vport_stats(stats, reply.stats);
/* Best-effort refresh of 'netdev_''s vport stats: stores results in '*stats'
 * and records success/failure in netdev->vport_stats_error so callers can
 * decide whether to fall back to kernel interface stats. */
1374 get_stats_via_vport(const struct netdev *netdev_,
1375                     struct netdev_stats *stats)
1377     struct netdev_linux *netdev = netdev_linux_cast(netdev_);
     /* Query again only if the last attempt succeeded or we never tried. */
1379     if (!netdev->vport_stats_error ||
1380         !(netdev->cache_valid & VALID_VPORT_STAT_ERROR)) {
1383         error = get_stats_via_vport__(netdev_, stats);
     /* ENOENT just means the device is not an OVS vport; stay quiet then. */
1384         if (error && error != ENOENT) {
1385             VLOG_WARN_RL(&rl, "%s: obtaining netdev stats via vport failed "
1387                          netdev_get_name(netdev_), ovs_strerror(error));
1389         netdev->vport_stats_error = error;
1390         netdev->cache_valid |= VALID_VPORT_STAT_ERROR;
/* Retrieves kernel interface stats for 'netdev_' into '*stats', using
 * rtnetlink when available on this kernel and /proc otherwise.  Returns 0
 * on success or a positive errno value. */
1395 netdev_linux_sys_get_stats(const struct netdev *netdev_,
1396                            struct netdev_stats *stats)
1398     static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
1399     static int use_netlink_stats;
     /* Probe for working RTM_GETLINK stats exactly once, thread-safely. */
1402     if (ovsthread_once_start(&once)) {
1403         use_netlink_stats = check_for_working_netlink_stats();
1404         ovsthread_once_done(&once);
1407     if (use_netlink_stats) {
1410         error = get_ifindex(netdev_, &ifindex);
1412             error = get_stats_via_netlink(ifindex, stats);
1415         error = get_stats_via_proc(netdev_get_name(netdev_), stats);
1419         VLOG_WARN_RL(&rl, "%s: linux-sys get stats failed %d",
1420                      netdev_get_name(netdev_), error);
1426 /* Retrieves current device stats for 'netdev-linux'. */
1428 netdev_linux_get_stats(const struct netdev *netdev_,
1429                        struct netdev_stats *stats)
1431     struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1432     struct netdev_stats dev_stats;
1435     ovs_mutex_lock(&netdev->mutex);
     /* Try the vport layer first, then fetch kernel interface stats too. */
1436     get_stats_via_vport(netdev_, stats);
1437     error = netdev_linux_sys_get_stats(netdev_, &dev_stats);
1439         if (!netdev->vport_stats_error) {
1442     } else if (netdev->vport_stats_error) {
1443         /* stats not available from OVS then use ioctl stats. */
     /* Both sources available: fold the error counters that only the
      * kernel tracks into the vport-provided numbers. */
1446         stats->rx_errors += dev_stats.rx_errors;
1447         stats->tx_errors += dev_stats.tx_errors;
1448         stats->rx_dropped += dev_stats.rx_dropped;
1449         stats->tx_dropped += dev_stats.tx_dropped;
1450         stats->multicast += dev_stats.multicast;
1451         stats->collisions += dev_stats.collisions;
1452         stats->rx_length_errors += dev_stats.rx_length_errors;
1453         stats->rx_over_errors += dev_stats.rx_over_errors;
1454         stats->rx_crc_errors += dev_stats.rx_crc_errors;
1455         stats->rx_frame_errors += dev_stats.rx_frame_errors;
1456         stats->rx_fifo_errors += dev_stats.rx_fifo_errors;
1457         stats->rx_missed_errors += dev_stats.rx_missed_errors;
1458         stats->tx_aborted_errors += dev_stats.tx_aborted_errors;
1459         stats->tx_carrier_errors += dev_stats.tx_carrier_errors;
1460         stats->tx_fifo_errors += dev_stats.tx_fifo_errors;
1461         stats->tx_heartbeat_errors += dev_stats.tx_heartbeat_errors;
1462         stats->tx_window_errors += dev_stats.tx_window_errors;
1464     ovs_mutex_unlock(&netdev->mutex);
1469 /* Retrieves current device stats for 'netdev-tap' netdev or
1470  * netdev-internal. */
1472 netdev_tap_get_stats(const struct netdev *netdev_, struct netdev_stats *stats)
1474     struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1475     struct netdev_stats dev_stats;
1478     ovs_mutex_lock(&netdev->mutex);
     /* Try the vport layer first, then fetch kernel interface stats too. */
1479     get_stats_via_vport(netdev_, stats);
1480     error = netdev_linux_sys_get_stats(netdev_, &dev_stats);
1482         if (!netdev->vport_stats_error) {
1485     } else if (netdev->vport_stats_error) {
1486         /* Transmit and receive stats will appear to be swapped relative to the
1487          * other ports since we are the one sending the data, not a remote
1488          * computer.  For consistency, we swap them back here. This does not
1489          * apply if we are getting stats from the vport layer because it always
1490          * tracks stats from the perspective of the switch. */
1493         swap_uint64(&stats->rx_packets, &stats->tx_packets);
1494         swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
1495         swap_uint64(&stats->rx_errors, &stats->tx_errors);
1496         swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
     /* The tap device does not meaningfully track these; zero them. */
1497         stats->rx_length_errors = 0;
1498         stats->rx_over_errors = 0;
1499         stats->rx_crc_errors = 0;
1500         stats->rx_frame_errors = 0;
1501         stats->rx_fifo_errors = 0;
1502         stats->rx_missed_errors = 0;
1503         stats->tx_aborted_errors = 0;
1504         stats->tx_carrier_errors = 0;
1505         stats->tx_fifo_errors = 0;
1506         stats->tx_heartbeat_errors = 0;
1507         stats->tx_window_errors = 0;
     /* Note rx/tx are crossed on purpose here, for the same swap reason. */
1509         stats->rx_dropped += dev_stats.tx_dropped;
1510         stats->tx_dropped += dev_stats.rx_dropped;
1512         stats->rx_errors += dev_stats.tx_errors;
1513         stats->tx_errors += dev_stats.rx_errors;
1515         stats->multicast += dev_stats.multicast;
1516         stats->collisions += dev_stats.collisions;
1518     ovs_mutex_unlock(&netdev->mutex);
1524 netdev_internal_get_stats(const struct netdev *netdev_,
1525 struct netdev_stats *stats)
1527 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1530 ovs_mutex_lock(&netdev->mutex);
1531 get_stats_via_vport(netdev_, stats);
1532 error = netdev->vport_stats_error;
1533 ovs_mutex_unlock(&netdev->mutex);
/* Pushes 'stats' down to the vport layer for 'netdev' via OVS_VPORT_CMD_SET.
 * ENODEV from the vport layer is tolerated (see comment below). */
1539 netdev_internal_set_stats(struct netdev *netdev,
1540                           const struct netdev_stats *stats)
1542     struct ovs_vport_stats vport_stats;
1543     struct dpif_linux_vport vport;
     /* Convert to the wire-format stats structure the datapath expects. */
1546     vport_stats.rx_packets = stats->rx_packets;
1547     vport_stats.tx_packets = stats->tx_packets;
1548     vport_stats.rx_bytes = stats->rx_bytes;
1549     vport_stats.tx_bytes = stats->tx_bytes;
1550     vport_stats.rx_errors = stats->rx_errors;
1551     vport_stats.tx_errors = stats->tx_errors;
1552     vport_stats.rx_dropped = stats->rx_dropped;
1553     vport_stats.tx_dropped = stats->tx_dropped;
1555     dpif_linux_vport_init(&vport);
1556     vport.cmd = OVS_VPORT_CMD_SET;
1557     vport.name = netdev_get_name(netdev);
1558     vport.stats = &vport_stats;
1560     err = dpif_linux_vport_transact(&vport, NULL, NULL);
1562     /* If the vport layer doesn't know about the device, that doesn't mean it
1563      * doesn't exist (after all were able to open it when netdev_open() was
1564      * called), it just means that it isn't attached and we'll be getting
1565      * stats a different way. */
1566     if (err == ENODEV) {
/* Reads link feature information for 'netdev' via ETHTOOL_GSET and caches
 * the result (supported, advertised, and current NETDEV_F_* bitmaps plus
 * get_features_error) under the VALID_FEATURES cache flag. */
1574 netdev_linux_read_features(struct netdev_linux *netdev)
1576     struct ethtool_cmd ecmd;
     /* Use the cached result when we already queried this device. */
1580     if (netdev->cache_valid & VALID_FEATURES) {
1584     COVERAGE_INC(netdev_get_ethtool);
1585     memset(&ecmd, 0, sizeof ecmd);
1586     error = netdev_linux_do_ethtool(netdev->up.name, &ecmd,
1587                                     ETHTOOL_GSET, "ETHTOOL_GSET");
1592     /* Supported features. */
1593     netdev->supported = 0;
1594     if (ecmd.supported & SUPPORTED_10baseT_Half) {
1595         netdev->supported |= NETDEV_F_10MB_HD;
1597     if (ecmd.supported & SUPPORTED_10baseT_Full) {
1598         netdev->supported |= NETDEV_F_10MB_FD;
1600     if (ecmd.supported & SUPPORTED_100baseT_Half) {
1601         netdev->supported |= NETDEV_F_100MB_HD;
1603     if (ecmd.supported & SUPPORTED_100baseT_Full) {
1604         netdev->supported |= NETDEV_F_100MB_FD;
1606     if (ecmd.supported & SUPPORTED_1000baseT_Half) {
1607         netdev->supported |= NETDEV_F_1GB_HD;
1609     if (ecmd.supported & SUPPORTED_1000baseT_Full) {
1610         netdev->supported |= NETDEV_F_1GB_FD;
1612     if (ecmd.supported & SUPPORTED_10000baseT_Full) {
1613         netdev->supported |= NETDEV_F_10GB_FD;
1615     if (ecmd.supported & SUPPORTED_TP) {
1616         netdev->supported |= NETDEV_F_COPPER;
1618     if (ecmd.supported & SUPPORTED_FIBRE) {
1619         netdev->supported |= NETDEV_F_FIBER;
1621     if (ecmd.supported & SUPPORTED_Autoneg) {
1622         netdev->supported |= NETDEV_F_AUTONEG;
1624     if (ecmd.supported & SUPPORTED_Pause) {
1625         netdev->supported |= NETDEV_F_PAUSE;
1627     if (ecmd.supported & SUPPORTED_Asym_Pause) {
1628         netdev->supported |= NETDEV_F_PAUSE_ASYM;
1631     /* Advertised features. */
1632     netdev->advertised = 0;
1633     if (ecmd.advertising & ADVERTISED_10baseT_Half) {
1634         netdev->advertised |= NETDEV_F_10MB_HD;
1636     if (ecmd.advertising & ADVERTISED_10baseT_Full) {
1637         netdev->advertised |= NETDEV_F_10MB_FD;
1639     if (ecmd.advertising & ADVERTISED_100baseT_Half) {
1640         netdev->advertised |= NETDEV_F_100MB_HD;
1642     if (ecmd.advertising & ADVERTISED_100baseT_Full) {
1643         netdev->advertised |= NETDEV_F_100MB_FD;
1645     if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
1646         netdev->advertised |= NETDEV_F_1GB_HD;
1648     if (ecmd.advertising & ADVERTISED_1000baseT_Full) {
1649         netdev->advertised |= NETDEV_F_1GB_FD;
1651     if (ecmd.advertising & ADVERTISED_10000baseT_Full) {
1652         netdev->advertised |= NETDEV_F_10GB_FD;
1654     if (ecmd.advertising & ADVERTISED_TP) {
1655         netdev->advertised |= NETDEV_F_COPPER;
1657     if (ecmd.advertising & ADVERTISED_FIBRE) {
1658         netdev->advertised |= NETDEV_F_FIBER;
1660     if (ecmd.advertising & ADVERTISED_Autoneg) {
1661         netdev->advertised |= NETDEV_F_AUTONEG;
1663     if (ecmd.advertising & ADVERTISED_Pause) {
1664         netdev->advertised |= NETDEV_F_PAUSE;
1666     if (ecmd.advertising & ADVERTISED_Asym_Pause) {
1667         netdev->advertised |= NETDEV_F_PAUSE_ASYM;
1670     /* Current settings. */
     /* 40G/100G/1T speeds below use raw numbers; the corresponding SPEED_*
      * macros were not available in older kernel headers. */
1672     if (speed == SPEED_10) {
1673         netdev->current = ecmd.duplex ? NETDEV_F_10MB_FD : NETDEV_F_10MB_HD;
1674     } else if (speed == SPEED_100) {
1675         netdev->current = ecmd.duplex ? NETDEV_F_100MB_FD : NETDEV_F_100MB_HD;
1676     } else if (speed == SPEED_1000) {
1677         netdev->current = ecmd.duplex ? NETDEV_F_1GB_FD : NETDEV_F_1GB_HD;
1678     } else if (speed == SPEED_10000) {
1679         netdev->current = NETDEV_F_10GB_FD;
1680     } else if (speed == 40000) {
1681         netdev->current = NETDEV_F_40GB_FD;
1682     } else if (speed == 100000) {
1683         netdev->current = NETDEV_F_100GB_FD;
1684     } else if (speed == 1000000) {
1685         netdev->current = NETDEV_F_1TB_FD;
1687         netdev->current = 0;
1690     if (ecmd.port == PORT_TP) {
1691         netdev->current |= NETDEV_F_COPPER;
1692     } else if (ecmd.port == PORT_FIBRE) {
1693         netdev->current |= NETDEV_F_FIBER;
1697         netdev->current |= NETDEV_F_AUTONEG;
1701     netdev->cache_valid |= VALID_FEATURES;
1702     netdev->get_features_error = error;
1705 /* Stores the features supported by 'netdev' into of '*current', '*advertised',
1706  * '*supported', and '*peer'.  Each value is a bitmap of NETDEV_* bits.
1707  * Returns 0 if successful, otherwise a positive errno value. */
1709 netdev_linux_get_features(const struct netdev *netdev_,
1710                           enum netdev_features *current,
1711                           enum netdev_features *advertised,
1712                           enum netdev_features *supported,
1713                           enum netdev_features *peer)
1715     struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1718     ovs_mutex_lock(&netdev->mutex);
     /* Populates (or reuses) the VALID_FEATURES cache on 'netdev'. */
1719     netdev_linux_read_features(netdev);
1720     if (!netdev->get_features_error) {
1721         *current = netdev->current;
1722         *advertised = netdev->advertised;
1723         *supported = netdev->supported;
     /* Peer features are not obtainable through ethtool. */
1724         *peer = 0; /* XXX */
1726     error = netdev->get_features_error;
1727     ovs_mutex_unlock(&netdev->mutex);
1732 /* Set the features advertised by 'netdev' to 'advertise'. */
1734 netdev_linux_set_advertisements(struct netdev *netdev_,
1735                                 enum netdev_features advertise)
1737     struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1738     struct ethtool_cmd ecmd;
1741     ovs_mutex_lock(&netdev->mutex);
     /* Read-modify-write: fetch the current ethtool settings, replace only
      * the advertising mask, then write the whole structure back. */
1743     COVERAGE_INC(netdev_get_ethtool);
1744     memset(&ecmd, 0, sizeof ecmd);
1745     error = netdev_linux_do_ethtool(netdev_get_name(netdev_), &ecmd,
1746                                     ETHTOOL_GSET, "ETHTOOL_GSET");
1751     ecmd.advertising = 0;
1752     if (advertise & NETDEV_F_10MB_HD) {
1753         ecmd.advertising |= ADVERTISED_10baseT_Half;
1755     if (advertise & NETDEV_F_10MB_FD) {
1756         ecmd.advertising |= ADVERTISED_10baseT_Full;
1758     if (advertise & NETDEV_F_100MB_HD) {
1759         ecmd.advertising |= ADVERTISED_100baseT_Half;
1761     if (advertise & NETDEV_F_100MB_FD) {
1762         ecmd.advertising |= ADVERTISED_100baseT_Full;
1764     if (advertise & NETDEV_F_1GB_HD) {
1765         ecmd.advertising |= ADVERTISED_1000baseT_Half;
1767     if (advertise & NETDEV_F_1GB_FD) {
1768         ecmd.advertising |= ADVERTISED_1000baseT_Full;
1770     if (advertise & NETDEV_F_10GB_FD) {
1771         ecmd.advertising |= ADVERTISED_10000baseT_Full;
1773     if (advertise & NETDEV_F_COPPER) {
1774         ecmd.advertising |= ADVERTISED_TP;
1776     if (advertise & NETDEV_F_FIBER) {
1777         ecmd.advertising |= ADVERTISED_FIBRE;
1779     if (advertise & NETDEV_F_AUTONEG) {
1780         ecmd.advertising |= ADVERTISED_Autoneg;
1782     if (advertise & NETDEV_F_PAUSE) {
1783         ecmd.advertising |= ADVERTISED_Pause;
1785     if (advertise & NETDEV_F_PAUSE_ASYM) {
1786         ecmd.advertising |= ADVERTISED_Asym_Pause;
1788     COVERAGE_INC(netdev_set_ethtool);
1789     error = netdev_linux_do_ethtool(netdev_get_name(netdev_), &ecmd,
1790                                     ETHTOOL_SSET, "ETHTOOL_SSET");
1793     ovs_mutex_unlock(&netdev->mutex);
1797 /* Attempts to set input rate limiting (policing) policy.  Returns 0 if
1798  * successful, otherwise a positive errno value. */
1800 netdev_linux_set_policing(struct netdev *netdev_,
1801                           uint32_t kbits_rate, uint32_t kbits_burst)
1803     struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1804     const char *netdev_name = netdev_get_name(netdev_);
1807     kbits_burst = (!kbits_rate ? 0       /* Force to 0 if no rate specified. */
1808                    : !kbits_burst ? 1000 /* Default to 1000 kbits if 0. */
1809                    : kbits_burst);       /* Stick with user-specified value. */
1811     ovs_mutex_lock(&netdev->mutex);
1812     if (netdev->cache_valid & VALID_POLICING) {
1813         error = netdev->netdev_policing_error;
1814         if (error || (netdev->kbits_rate == kbits_rate &&
1815                       netdev->kbits_burst == kbits_burst)) {
1816             /* Assume that settings haven't changed since we last set them. */
1819         netdev->cache_valid &= ~VALID_POLICING;
1822     COVERAGE_INC(netdev_set_policing);
1823     /* Remove any existing ingress qdisc. */
1824     error = tc_add_del_ingress_qdisc(netdev_, false);
1826         VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
1827                      netdev_name, ovs_strerror(error));
     /* Install a fresh ingress qdisc and attach a policer to it. */
1832         error = tc_add_del_ingress_qdisc(netdev_, true);
1834             VLOG_WARN_RL(&rl, "%s: adding policing qdisc failed: %s",
1835                          netdev_name, ovs_strerror(error));
1839         error = tc_add_policer(netdev_, kbits_rate, kbits_burst);
1841             VLOG_WARN_RL(&rl, "%s: adding policing action failed: %s",
1842                          netdev_name, ovs_strerror(error));
1847     netdev->kbits_rate = kbits_rate;
1848     netdev->kbits_burst = kbits_burst;
     /* Cache the outcome; ENODEV is cached too so we do not retry a
      * device that has disappeared. */
1851     if (!error || error == ENODEV) {
1852         netdev->netdev_policing_error = error;
1853         netdev->cache_valid |= VALID_POLICING;
1855     ovs_mutex_unlock(&netdev->mutex);
1860 netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
1863 const struct tc_ops *const *opsp;
1865 for (opsp = tcs; *opsp != NULL; opsp++) {
1866 const struct tc_ops *ops = *opsp;
1867 if (ops->tc_install && ops->ovs_name[0] != '\0') {
1868 sset_add(types, ops->ovs_name);
1874 static const struct tc_ops *
1875 tc_lookup_ovs_name(const char *name)
1877 const struct tc_ops *const *opsp;
1879 for (opsp = tcs; *opsp != NULL; opsp++) {
1880 const struct tc_ops *ops = *opsp;
1881 if (!strcmp(name, ops->ovs_name)) {
1888 static const struct tc_ops *
1889 tc_lookup_linux_name(const char *name)
1891 const struct tc_ops *const *opsp;
1893 for (opsp = tcs; *opsp != NULL; opsp++) {
1894 const struct tc_ops *ops = *opsp;
1895 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
1902 static struct tc_queue *
1903 tc_find_queue__(const struct netdev *netdev_, unsigned int queue_id,
1906 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1907 struct tc_queue *queue;
1909 HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev->tc->queues) {
1910 if (queue->queue_id == queue_id) {
/* Convenience wrapper around tc_find_queue__() that computes the hash. */
static struct tc_queue *
tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
{
    return tc_find_queue__(netdev, queue_id, hash_int(queue_id, 0));
}
1924 netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
1926 struct netdev_qos_capabilities *caps)
1928 const struct tc_ops *ops = tc_lookup_ovs_name(type);
1932 caps->n_queues = ops->n_queues;
1937 netdev_linux_get_qos(const struct netdev *netdev_,
1938 const char **typep, struct smap *details)
1940 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1943 ovs_mutex_lock(&netdev->mutex);
1944 error = tc_query_qdisc(netdev_);
1946 *typep = netdev->tc->ops->ovs_name;
1947 error = (netdev->tc->ops->qdisc_get
1948 ? netdev->tc->ops->qdisc_get(netdev_, details)
1951 ovs_mutex_unlock(&netdev->mutex);
/* Replaces or updates the QoS configuration of 'netdev_' with type 'type'
 * and parameters 'details'.  When the installed discipline already matches
 * 'type' it is reconfigured in place; otherwise the old qdisc is deleted
 * and a new one installed. */
1957 netdev_linux_set_qos(struct netdev *netdev_,
1958                      const char *type, const struct smap *details)
1960     struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1961     const struct tc_ops *new_ops;
1964     new_ops = tc_lookup_ovs_name(type);
1965     if (!new_ops || !new_ops->tc_install) {
1969     ovs_mutex_lock(&netdev->mutex);
1970     error = tc_query_qdisc(netdev_);
1975     if (new_ops == netdev->tc->ops) {
1976         error = new_ops->qdisc_set ? new_ops->qdisc_set(netdev_, details) : 0;
1978         /* Delete existing qdisc. */
1979         error = tc_del_qdisc(netdev_);
1983         ovs_assert(netdev->tc == NULL);
1985         /* Install new qdisc. */
1986         error = new_ops->tc_install(netdev_, details);
1987         ovs_assert((error == 0) == (netdev->tc != NULL));
1991     ovs_mutex_unlock(&netdev->mutex);
1996 netdev_linux_get_queue(const struct netdev *netdev_,
1997 unsigned int queue_id, struct smap *details)
1999 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2002 ovs_mutex_lock(&netdev->mutex);
2003 error = tc_query_qdisc(netdev_);
2005 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2007 ? netdev->tc->ops->class_get(netdev_, queue, details)
2010 ovs_mutex_unlock(&netdev->mutex);
2016 netdev_linux_set_queue(struct netdev *netdev_,
2017 unsigned int queue_id, const struct smap *details)
2019 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2022 ovs_mutex_lock(&netdev->mutex);
2023 error = tc_query_qdisc(netdev_);
2025 error = (queue_id < netdev->tc->ops->n_queues
2026 && netdev->tc->ops->class_set
2027 ? netdev->tc->ops->class_set(netdev_, queue_id, details)
2030 ovs_mutex_unlock(&netdev->mutex);
2036 netdev_linux_delete_queue(struct netdev *netdev_, unsigned int queue_id)
2038 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2041 ovs_mutex_lock(&netdev->mutex);
2042 error = tc_query_qdisc(netdev_);
2044 if (netdev->tc->ops->class_delete) {
2045 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2047 ? netdev->tc->ops->class_delete(netdev_, queue)
2053 ovs_mutex_unlock(&netdev->mutex);
/* Retrieves statistics for queue 'queue_id' on 'netdev_' into '*stats'. */
2059 netdev_linux_get_queue_stats(const struct netdev *netdev_,
2060                              unsigned int queue_id,
2061                              struct netdev_queue_stats *stats)
2063     struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2066     ovs_mutex_lock(&netdev->mutex);
2067     error = tc_query_qdisc(netdev_);
2069         if (netdev->tc->ops->class_get_stats) {
2070             const struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
     /* Creation time comes from our bookkeeping, the rest from tc. */
2072                 stats->created = queue->created;
2073                 error = netdev->tc->ops->class_get_stats(netdev_, queue,
2082     ovs_mutex_unlock(&netdev->mutex);
2088 start_queue_dump(const struct netdev *netdev, struct nl_dump *dump)
2090 struct ofpbuf request;
2091 struct tcmsg *tcmsg;
2093 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
2097 tcmsg->tcm_parent = 0;
2098 nl_dump_start(dump, NETLINK_ROUTE, &request);
2099 ofpbuf_uninit(&request);
2103 struct netdev_linux_queue_state {
2104 unsigned int *queues;
2110 netdev_linux_queue_dump_start(const struct netdev *netdev_, void **statep)
2112 const struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2115 ovs_mutex_lock(&netdev->mutex);
2116 error = tc_query_qdisc(netdev_);
2118 if (netdev->tc->ops->class_get) {
2119 struct netdev_linux_queue_state *state;
2120 struct tc_queue *queue;
2123 *statep = state = xmalloc(sizeof *state);
2124 state->n_queues = hmap_count(&netdev->tc->queues);
2125 state->cur_queue = 0;
2126 state->queues = xmalloc(state->n_queues * sizeof *state->queues);
2129 HMAP_FOR_EACH (queue, hmap_node, &netdev->tc->queues) {
2130 state->queues[i++] = queue->queue_id;
2136 ovs_mutex_unlock(&netdev->mutex);
2142 netdev_linux_queue_dump_next(const struct netdev *netdev_, void *state_,
2143 unsigned int *queue_idp, struct smap *details)
2145 const struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2146 struct netdev_linux_queue_state *state = state_;
2149 ovs_mutex_lock(&netdev->mutex);
2150 while (state->cur_queue < state->n_queues) {
2151 unsigned int queue_id = state->queues[state->cur_queue++];
2152 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2155 *queue_idp = queue_id;
2156 error = netdev->tc->ops->class_get(netdev_, queue, details);
2160 ovs_mutex_unlock(&netdev->mutex);
2166 netdev_linux_queue_dump_done(const struct netdev *netdev OVS_UNUSED,
2169 struct netdev_linux_queue_state *state = state_;
2171 free(state->queues);
/* Invokes 'cb' with per-queue statistics for every queue of 'netdev_',
 * using an rtnetlink class dump as the data source. */
2177 netdev_linux_dump_queue_stats(const struct netdev *netdev_,
2178                               netdev_dump_queue_stats_cb *cb, void *aux)
2180     struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2183     ovs_mutex_lock(&netdev->mutex);
2184     error = tc_query_qdisc(netdev_);
2186         struct nl_dump dump;
2188         if (!netdev->tc->ops->class_dump_stats) {
2190         } else if (!start_queue_dump(netdev_, &dump)) {
     /* Feed each dumped class message to the discipline's stats parser. */
2196             while (nl_dump_next(&dump, &msg)) {
2197                 retval = netdev->tc->ops->class_dump_stats(netdev_, &msg,
2204             retval = nl_dump_done(&dump);
2210     ovs_mutex_unlock(&netdev->mutex);
/* Reports the configured IPv4 address and netmask of 'netdev_', caching
 * the result under VALID_IN4.  Returns EADDRNOTAVAIL when no address is
 * assigned. */
2216 netdev_linux_get_in4(const struct netdev *netdev_,
2217                      struct in_addr *address, struct in_addr *netmask)
2219     struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2222     ovs_mutex_lock(&netdev->mutex);
2223     if (!(netdev->cache_valid & VALID_IN4)) {
2224         error = netdev_linux_get_ipv4(netdev_, &netdev->address,
2225                                       SIOCGIFADDR, "SIOCGIFADDR");
2227             error = netdev_linux_get_ipv4(netdev_, &netdev->netmask,
2228                                           SIOCGIFNETMASK, "SIOCGIFNETMASK");
2230         netdev->cache_valid |= VALID_IN4;
     /* INADDR_ANY in the cache means "no address configured". */
2238         if (netdev->address.s_addr != INADDR_ANY) {
2239             *address = netdev->address;
2240             *netmask = netdev->netmask;
2242             error = EADDRNOTAVAIL;
2245     ovs_mutex_unlock(&netdev->mutex);
2251 netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
2252 struct in_addr netmask)
2254 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2257 ovs_mutex_lock(&netdev->mutex);
2258 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
2260 netdev->cache_valid |= VALID_IN4;
2261 netdev->address = address;
2262 netdev->netmask = netmask;
2263 if (address.s_addr != INADDR_ANY) {
2264 error = do_set_addr(netdev_, SIOCSIFNETMASK,
2265 "SIOCSIFNETMASK", netmask);
2268 ovs_mutex_unlock(&netdev->mutex);
/* Parses one line in /proc/net/if_inet6 format: a 32-digit hexadecimal IPv6
 * address followed by interface index, prefix length, scope, flags, and the
 * device name.  On success stores the address in '*in6' and the name in
 * 'ifname' (at most 16 bytes plus a terminator) and returns true; returns
 * false on a malformed line.
 *
 * Fix: the helper macro X8 is now #undef'd after use so it no longer leaks
 * into the remainder of the translation unit. */
static bool
parse_if_inet6_line(const char *line,
                    struct in6_addr *in6, char ifname[16 + 1])
{
    uint8_t *s6 = in6->s6_addr;
#define X8 "%2"SCNx8
    return sscanf(line,
                  " "X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8
                  "%*x %*x %*x %*x %16s\n",
                  &s6[0], &s6[1], &s6[2], &s6[3],
                  &s6[4], &s6[5], &s6[6], &s6[7],
                  &s6[8], &s6[9], &s6[10], &s6[11],
                  &s6[12], &s6[13], &s6[14], &s6[15],
                  ifname) == 17;
#undef X8
}
2289 /* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address (if
2290  * 'in6' is non-null) and returns true.  Otherwise, returns false. */
2292 netdev_linux_get_in6(const struct netdev *netdev_, struct in6_addr *in6)
2294     struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2296     ovs_mutex_lock(&netdev->mutex);
2297     if (!(netdev->cache_valid & VALID_IN6)) {
     /* Scan /proc/net/if_inet6 for a line matching this device name and
      * cache the first address found (in6addr_any when there is none). */
2301         netdev->in6 = in6addr_any;
2303         file = fopen("/proc/net/if_inet6", "r");
2305             const char *name = netdev_get_name(netdev_);
2306             while (fgets(line, sizeof line, file)) {
2307                 struct in6_addr in6_tmp;
2308                 char ifname[16 + 1];
2309                 if (parse_if_inet6_line(line, &in6_tmp, ifname)
2310                     && !strcmp(name, ifname))
2312                     netdev->in6 = in6_tmp;
2318         netdev->cache_valid |= VALID_IN6;
2321     ovs_mutex_unlock(&netdev->mutex);
/* Fills '*sa' with an AF_INET sockaddr carrying 'addr' and port 0. */
static void
make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
{
    struct sockaddr_in sin = {
        .sin_family = AF_INET,
        .sin_addr = addr,
        .sin_port = 0,
    };

    memset(sa, 0, sizeof *sa);
    memcpy(sa, &sin, sizeof sin);
}
2340 do_set_addr(struct netdev *netdev,
2341 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
2345 make_in4_sockaddr(&ifr.ifr_addr, addr);
2346 return af_inet_ifreq_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
2350 /* Adds 'router' as a default IP gateway. */
2352 netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
2354 struct in_addr any = { INADDR_ANY };
2358 memset(&rt, 0, sizeof rt);
2359 make_in4_sockaddr(&rt.rt_dst, any);
2360 make_in4_sockaddr(&rt.rt_gateway, router);
2361 make_in4_sockaddr(&rt.rt_genmask, any);
2362 rt.rt_flags = RTF_UP | RTF_GATEWAY;
2363 error = af_inet_ioctl(SIOCADDRT, &rt);
2365 VLOG_WARN("ioctl(SIOCADDRT): %s", ovs_strerror(error));
/* Determines the next hop toward 'host' by parsing /proc/net/route: stores
 * the gateway address (or 0 for a directly reachable host) in '*next_hop'
 * and the outgoing device name, which the caller must free, in
 * '*netdev_name'. */
2371 netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
2374     static const char fn[] = "/proc/net/route";
2379     *netdev_name = NULL;
2380     stream = fopen(fn, "r");
2381     if (stream == NULL) {
2382         VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, ovs_strerror(errno));
2387     while (fgets(line, sizeof line, stream)) {
2390         ovs_be32 dest, gateway, mask;
2391         int refcnt, metric, mtu;
2392         unsigned int flags, use, window, irtt;
2395                    "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
2397                    iface, &dest, &gateway, &flags, &refcnt,
2398                    &use, &metric, &mask, &mtu, &window, &irtt) != 11) {
2400             VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
2404         if (!(flags & RTF_UP)) {
2405             /* Skip routes that aren't up. */
2409         /* The output of 'dest', 'mask', and 'gateway' were given in
2410          * network byte order, so we don't need need any endian
2411          * conversions here. */
2412         if ((dest & mask) == (host->s_addr & mask)) {
2414                 /* The host is directly reachable. */
2415                 next_hop->s_addr = 0;
2417                 /* To reach the host, we must go through a gateway. */
2418                 next_hop->s_addr = gateway;
2420             *netdev_name = xstrdup(iface);
/* Adds driver name/version and firmware version of 'netdev_' to 'smap',
 * caching the ETHTOOL_GDRVINFO result under VALID_DRVINFO. */
2432 netdev_linux_get_status(const struct netdev *netdev_, struct smap *smap)
2434     struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2437     ovs_mutex_lock(&netdev->mutex);
2438     if (!(netdev->cache_valid & VALID_DRVINFO)) {
     /* GDRVINFO replies through the same ioctl buffer layout as GSET. */
2439         struct ethtool_cmd *cmd = (struct ethtool_cmd *) &netdev->drvinfo;
2441         COVERAGE_INC(netdev_get_ethtool);
2442         memset(&netdev->drvinfo, 0, sizeof netdev->drvinfo);
2443         error = netdev_linux_do_ethtool(netdev->up.name,
2446                                         "ETHTOOL_GDRVINFO");
2448             netdev->cache_valid |= VALID_DRVINFO;
2453         smap_add(smap, "driver_name", netdev->drvinfo.driver);
2454         smap_add(smap, "driver_version", netdev->drvinfo.version);
2455         smap_add(smap, "firmware_version", netdev->drvinfo.fw_version);
2457     ovs_mutex_unlock(&netdev->mutex);
2463 netdev_internal_get_status(const struct netdev *netdev OVS_UNUSED,
2466 smap_add(smap, "driver_name", "openvswitch");
2470 /* Looks up the ARP table entry for 'ip' on 'netdev'.  If one exists and can be
2471  * successfully retrieved, it stores the corresponding MAC address in 'mac' and
2472  * returns 0.  Otherwise, it returns a positive errno value; in particular,
2473  * ENXIO indicates that there is not ARP table entry for 'ip' on 'netdev'. */
2475 netdev_linux_arp_lookup(const struct netdev *netdev,
2476                         ovs_be32 ip, uint8_t mac[ETH_ADDR_LEN])
2479     struct sockaddr_in sin;
     /* Build the protocol-address side of the SIOCGARP request. */
2482     memset(&r, 0, sizeof r);
2483     memset(&sin, 0, sizeof sin);
2484     sin.sin_family = AF_INET;
2485     sin.sin_addr.s_addr = ip;
2487     memcpy(&r.arp_pa, &sin, sizeof sin);
2488     r.arp_ha.sa_family = ARPHRD_ETHER;
2490     ovs_strzcpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
2491     COVERAGE_INC(netdev_arp_lookup);
2492     retval = af_inet_ioctl(SIOCGARP, &r);
2494         memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
     /* ENXIO (no such entry) is an expected outcome; only log other errors. */
2495     } else if (retval != ENXIO) {
2496         VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
2497                      netdev_get_name(netdev), IP_ARGS(ip),
2498                      ovs_strerror(retval));
2504 nd_to_iff_flags(enum netdev_flags nd)
2507 if (nd & NETDEV_UP) {
2510 if (nd & NETDEV_PROMISC) {
2517 iff_to_nd_flags(int iff)
2519 enum netdev_flags nd = 0;
2523 if (iff & IFF_PROMISC) {
2524 nd |= NETDEV_PROMISC;
/* Clears the flags in 'off' and then sets those in 'on' on the kernel
 * interface of 'netdev', storing the previous flags (translated to
 * NETDEV_* form) in '*old_flagsp'.  Caller must hold netdev->mutex. */
2530 update_flags(struct netdev_linux *netdev, enum netdev_flags off,
2531              enum netdev_flags on, enum netdev_flags *old_flagsp)
2532     OVS_REQUIRES(netdev->mutex)
2534     int old_flags, new_flags;
2537     old_flags = netdev->ifi_flags;
2538     *old_flagsp = iff_to_nd_flags(old_flags);
2539     new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
     /* Only touch the kernel when something actually changes; re-read the
      * flags afterward so the cache reflects what the kernel accepted. */
2540     if (new_flags != old_flags) {
2541         error = set_flags(netdev_get_name(&netdev->up), new_flags);
2542         get_flags(&netdev->up, &netdev->ifi_flags);
2549 netdev_linux_update_flags(struct netdev *netdev_, enum netdev_flags off,
2550 enum netdev_flags on, enum netdev_flags *old_flagsp)
2552 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
/* Thin locking wrapper: update_flags() requires netdev->mutex to be held. */
2555 ovs_mutex_lock(&netdev->mutex);
2556 error = update_flags(netdev, off, on, old_flagsp);
2557 ovs_mutex_unlock(&netdev->mutex);
2563 netdev_linux_change_seq(const struct netdev *netdev_)
2565 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2566 unsigned int change_seq;
/* Take the mutex so we read a consistent snapshot of the sequence number
 * used to detect device configuration changes. */
2568 ovs_mutex_lock(&netdev->mutex);
2569 change_seq = netdev->change_seq;
2570 ovs_mutex_unlock(&netdev->mutex);
2575 #define NETDEV_LINUX_CLASS(NAME, CONSTRUCT, GET_STATS, SET_STATS, \
2576 GET_FEATURES, GET_STATUS) \
2582 netdev_linux_wait, \
2584 netdev_linux_alloc, \
2586 netdev_linux_destruct, \
2587 netdev_linux_dealloc, \
2588 NULL, /* get_config */ \
2589 NULL, /* set_config */ \
2590 NULL, /* get_tunnel_config */ \
2592 netdev_linux_send, \
2593 netdev_linux_send_wait, \
2595 netdev_linux_set_etheraddr, \
2596 netdev_linux_get_etheraddr, \
2597 netdev_linux_get_mtu, \
2598 netdev_linux_set_mtu, \
2599 netdev_linux_get_ifindex, \
2600 netdev_linux_get_carrier, \
2601 netdev_linux_get_carrier_resets, \
2602 netdev_linux_set_miimon_interval, \
2607 netdev_linux_set_advertisements, \
2609 netdev_linux_set_policing, \
2610 netdev_linux_get_qos_types, \
2611 netdev_linux_get_qos_capabilities, \
2612 netdev_linux_get_qos, \
2613 netdev_linux_set_qos, \
2614 netdev_linux_get_queue, \
2615 netdev_linux_set_queue, \
2616 netdev_linux_delete_queue, \
2617 netdev_linux_get_queue_stats, \
2618 netdev_linux_queue_dump_start, \
2619 netdev_linux_queue_dump_next, \
2620 netdev_linux_queue_dump_done, \
2621 netdev_linux_dump_queue_stats, \
2623 netdev_linux_get_in4, \
2624 netdev_linux_set_in4, \
2625 netdev_linux_get_in6, \
2626 netdev_linux_add_router, \
2627 netdev_linux_get_next_hop, \
2629 netdev_linux_arp_lookup, \
2631 netdev_linux_update_flags, \
2633 netdev_linux_change_seq, \
2635 netdev_linux_rx_alloc, \
2636 netdev_linux_rx_construct, \
2637 netdev_linux_rx_destruct, \
2638 netdev_linux_rx_dealloc, \
2639 netdev_linux_rx_recv, \
2640 netdev_linux_rx_wait, \
2641 netdev_linux_rx_drain, \
2644 const struct netdev_class netdev_linux_class =
2647 netdev_linux_construct,
2648 netdev_linux_get_stats,
2649 NULL, /* set_stats */
2650 netdev_linux_get_features,
2651 netdev_linux_get_status);
2653 const struct netdev_class netdev_tap_class =
2656 netdev_linux_construct_tap,
2657 netdev_tap_get_stats,
2658 NULL, /* set_stats */
2659 netdev_linux_get_features,
2660 netdev_linux_get_status);
2662 const struct netdev_class netdev_internal_class =
2665 netdev_linux_construct,
2666 netdev_internal_get_stats,
2667 netdev_internal_set_stats,
2668 NULL, /* get_features */
2669 netdev_internal_get_status);
2671 /* HTB traffic control class. */
2673 #define HTB_N_QUEUES 0xf000
2677 unsigned int max_rate; /* In bytes/s. */
2681 struct tc_queue tc_queue;
2682 unsigned int min_rate; /* In bytes/s. */
2683 unsigned int max_rate; /* In bytes/s. */
2684 unsigned int burst; /* In bytes. */
2685 unsigned int priority; /* Lower values are higher priorities. */
2689 htb_get__(const struct netdev *netdev_)
2691 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2692 return CONTAINER_OF(netdev->tc, struct htb, tc);
2696 htb_install__(struct netdev *netdev_, uint64_t max_rate)
2698 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2701 htb = xmalloc(sizeof *htb);
2702 tc_init(&htb->tc, &tc_ops_htb);
2703 htb->max_rate = max_rate;
2705 netdev->tc = &htb->tc;
2708 /* Create an HTB qdisc.
2710 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
2712 htb_setup_qdisc__(struct netdev *netdev)
2715 struct tc_htb_glob opt;
2716 struct ofpbuf request;
2717 struct tcmsg *tcmsg;
2719 tc_del_qdisc(netdev);
2721 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2722 NLM_F_EXCL | NLM_F_CREATE, &request);
2726 tcmsg->tcm_handle = tc_make_handle(1, 0);
2727 tcmsg->tcm_parent = TC_H_ROOT;
2729 nl_msg_put_string(&request, TCA_KIND, "htb");
2731 memset(&opt, 0, sizeof opt);
2732 opt.rate2quantum = 10;
2736 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2737 nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
2738 nl_msg_end_nested(&request, opt_offset);
2740 return tc_transact(&request, NULL);
2743 /* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
2744 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
2746 htb_setup_class__(struct netdev *netdev, unsigned int handle,
2747 unsigned int parent, struct htb_class *class)
2750 struct tc_htb_opt opt;
2751 struct ofpbuf request;
2752 struct tcmsg *tcmsg;
2756 error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
2758 VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU",
2759 netdev_get_name(netdev));
2763 memset(&opt, 0, sizeof opt);
2764 tc_fill_rate(&opt.rate, class->min_rate, mtu);
2765 tc_fill_rate(&opt.ceil, class->max_rate, mtu);
2766 opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
2767 opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
2768 opt.prio = class->priority;
2770 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
2774 tcmsg->tcm_handle = handle;
2775 tcmsg->tcm_parent = parent;
2777 nl_msg_put_string(&request, TCA_KIND, "htb");
2778 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2779 nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
2780 tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
2781 tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
2782 nl_msg_end_nested(&request, opt_offset);
2784 error = tc_transact(&request, NULL);
2786 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
2787 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
2788 netdev_get_name(netdev),
2789 tc_get_major(handle), tc_get_minor(handle),
2790 tc_get_major(parent), tc_get_minor(parent),
2791 class->min_rate, class->max_rate,
2792 class->burst, class->priority, ovs_strerror(error));
2797 /* Parses Netlink attributes in 'options' for HTB parameters and stores a
2798 * description of them into 'details'. The description complies with the
2799 * specification given in the vswitch database documentation for linux-htb
2802 htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
2804 static const struct nl_policy tca_htb_policy[] = {
2805 [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
2806 .min_len = sizeof(struct tc_htb_opt) },
2809 struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
2810 const struct tc_htb_opt *htb;
2812 if (!nl_parse_nested(nl_options, tca_htb_policy,
2813 attrs, ARRAY_SIZE(tca_htb_policy))) {
2814 VLOG_WARN_RL(&rl, "failed to parse HTB class options");
2818 htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
2819 class->min_rate = htb->rate.rate;
2820 class->max_rate = htb->ceil.rate;
2821 class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
2822 class->priority = htb->prio;
2827 htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2828 struct htb_class *options,
2829 struct netdev_queue_stats *stats)
2831 struct nlattr *nl_options;
2832 unsigned int handle;
2835 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2836 if (!error && queue_id) {
2837 unsigned int major = tc_get_major(handle);
2838 unsigned int minor = tc_get_minor(handle);
2839 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2840 *queue_id = minor - 1;
2845 if (!error && options) {
2846 error = htb_parse_tca_options__(nl_options, options);
2852 htb_parse_qdisc_details__(struct netdev *netdev_,
2853 const struct smap *details, struct htb_class *hc)
2855 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2856 const char *max_rate_s;
/* "max-rate" is configured in bits/s; HTB works in bytes/s, hence "/ 8". */
2858 max_rate_s = smap_get(details, "max-rate");
2859 hc->max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
2860 if (!hc->max_rate) {
2861 enum netdev_features current;
/* No explicit max-rate: fall back to the link speed, or to 100 Mbps if the
 * link speed cannot be determined. */
2863 netdev_linux_read_features(netdev);
2864 current = !netdev->get_features_error ? netdev->current : 0;
2865 hc->max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
/* The qdisc's default class gets min_rate == max_rate. */
2867 hc->min_rate = hc->max_rate;
2873 htb_parse_class_details__(struct netdev *netdev,
2874 const struct smap *details, struct htb_class *hc)
2876 const struct htb *htb = htb_get__(netdev);
2877 const char *min_rate_s = smap_get(details, "min-rate");
2878 const char *max_rate_s = smap_get(details, "max-rate");
2879 const char *burst_s = smap_get(details, "burst");
2880 const char *priority_s = smap_get(details, "priority");
2883 error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
2885 VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU",
2886 netdev_get_name(netdev));
2890 /* HTB requires at least an mtu sized min-rate to send any traffic even
2891 * on uncongested links. */
2892 hc->min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
2893 hc->min_rate = MAX(hc->min_rate, mtu);
2894 hc->min_rate = MIN(hc->min_rate, htb->max_rate);
2897 hc->max_rate = (max_rate_s
2898 ? strtoull(max_rate_s, NULL, 10) / 8
2900 hc->max_rate = MAX(hc->max_rate, hc->min_rate);
2901 hc->max_rate = MIN(hc->max_rate, htb->max_rate);
2905 * According to hints in the documentation that I've read, it is important
2906 * that 'burst' be at least as big as the largest frame that might be
2907 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
2908 * but having it a bit too small is a problem. Since netdev_get_mtu()
2909 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
2910 * the MTU. We actually add 64, instead of 14, as a guard against
2911 * additional headers get tacked on somewhere that we're not aware of. */
2912 hc->burst = burst_s ? strtoull(burst_s, NULL, 10) / 8 : 0;
2913 hc->burst = MAX(hc->burst, mtu + 64);
2916 hc->priority = priority_s ? strtoul(priority_s, NULL, 10) : 0;
2922 htb_query_class__(const struct netdev *netdev, unsigned int handle,
2923 unsigned int parent, struct htb_class *options,
2924 struct netdev_queue_stats *stats)
2926 struct ofpbuf *reply;
2929 error = tc_query_class(netdev, handle, parent, &reply);
2931 error = htb_parse_tcmsg__(reply, NULL, options, stats);
2932 ofpbuf_delete(reply);
2938 htb_tc_install(struct netdev *netdev, const struct smap *details)
2942 error = htb_setup_qdisc__(netdev);
2944 struct htb_class hc;
2946 htb_parse_qdisc_details__(netdev, details, &hc);
2947 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2948 tc_make_handle(1, 0), &hc);
2950 htb_install__(netdev, hc.max_rate);
2956 static struct htb_class *
2957 htb_class_cast__(const struct tc_queue *queue)
2959 return CONTAINER_OF(queue, struct htb_class, tc_queue);
2963 htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
2964 const struct htb_class *hc)
2966 struct htb *htb = htb_get__(netdev);
2967 size_t hash = hash_int(queue_id, 0);
2968 struct tc_queue *queue;
2969 struct htb_class *hcp;
2971 queue = tc_find_queue__(netdev, queue_id, hash);
2973 hcp = htb_class_cast__(queue);
2975 hcp = xmalloc(sizeof *hcp);
2976 queue = &hcp->tc_queue;
2977 queue->queue_id = queue_id;
2978 queue->created = time_msec();
2979 hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
2982 hcp->min_rate = hc->min_rate;
2983 hcp->max_rate = hc->max_rate;
2984 hcp->burst = hc->burst;
2985 hcp->priority = hc->priority;
2989 htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
2992 struct nl_dump dump;
2993 struct htb_class hc;
2995 /* Get qdisc options. */
2997 htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
2998 htb_install__(netdev, hc.max_rate);
3001 if (!start_queue_dump(netdev, &dump)) {
3004 while (nl_dump_next(&dump, &msg)) {
3005 unsigned int queue_id;
3007 if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3008 htb_update_queue__(netdev, queue_id, &hc);
3011 nl_dump_done(&dump);
3017 htb_tc_destroy(struct tc *tc)
3019 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
3020 struct htb_class *hc, *next;
3022 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &htb->tc.queues) {
3023 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
3031 htb_qdisc_get(const struct netdev *netdev, struct smap *details)
3033 const struct htb *htb = htb_get__(netdev);
3034 smap_add_format(details, "max-rate", "%llu", 8ULL * htb->max_rate);
3039 htb_qdisc_set(struct netdev *netdev, const struct smap *details)
3041 struct htb_class hc;
3044 htb_parse_qdisc_details__(netdev, details, &hc);
3045 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3046 tc_make_handle(1, 0), &hc);
3048 htb_get__(netdev)->max_rate = hc.max_rate;
3054 htb_class_get(const struct netdev *netdev OVS_UNUSED,
3055 const struct tc_queue *queue, struct smap *details)
3057 const struct htb_class *hc = htb_class_cast__(queue);
3059 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
3060 if (hc->min_rate != hc->max_rate) {
3061 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
3063 smap_add_format(details, "burst", "%llu", 8ULL * hc->burst);
3065 smap_add_format(details, "priority", "%u", hc->priority);
3071 htb_class_set(struct netdev *netdev, unsigned int queue_id,
3072 const struct smap *details)
3074 struct htb_class hc;
3077 error = htb_parse_class_details__(netdev, details, &hc);
3082 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3083 tc_make_handle(1, 0xfffe), &hc);
3088 htb_update_queue__(netdev, queue_id, &hc);
3093 htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
3095 struct htb_class *hc = htb_class_cast__(queue);
3096 struct htb *htb = htb_get__(netdev);
3099 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3101 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
3108 htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3109 struct netdev_queue_stats *stats)
3111 return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3112 tc_make_handle(1, 0xfffe), NULL, stats);
3116 htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3117 const struct ofpbuf *nlmsg,
3118 netdev_dump_queue_stats_cb *cb, void *aux)
3120 struct netdev_queue_stats stats;
3121 unsigned int handle, major, minor;
3124 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3129 major = tc_get_major(handle);
3130 minor = tc_get_minor(handle);
3131 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
3132 (*cb)(minor - 1, &stats, aux);
3137 static const struct tc_ops tc_ops_htb = {
3138 "htb", /* linux_name */
3139 "linux-htb", /* ovs_name */
3140 HTB_N_QUEUES, /* n_queues */
3149 htb_class_get_stats,
3150 htb_class_dump_stats
3153 /* "linux-hfsc" traffic control class. */
3155 #define HFSC_N_QUEUES 0xf000
3163 struct tc_queue tc_queue;
3168 static struct hfsc *
3169 hfsc_get__(const struct netdev *netdev_)
3171 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3172 return CONTAINER_OF(netdev->tc, struct hfsc, tc);
3175 static struct hfsc_class *
3176 hfsc_class_cast__(const struct tc_queue *queue)
3178 return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
3182 hfsc_install__(struct netdev *netdev_, uint32_t max_rate)
3184 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3187 hfsc = xmalloc(sizeof *hfsc);
3188 tc_init(&hfsc->tc, &tc_ops_hfsc);
3189 hfsc->max_rate = max_rate;
3190 netdev->tc = &hfsc->tc;
3194 hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
3195 const struct hfsc_class *hc)
3199 struct hfsc_class *hcp;
3200 struct tc_queue *queue;
3202 hfsc = hfsc_get__(netdev);
3203 hash = hash_int(queue_id, 0);
3205 queue = tc_find_queue__(netdev, queue_id, hash);
3207 hcp = hfsc_class_cast__(queue);
3209 hcp = xmalloc(sizeof *hcp);
3210 queue = &hcp->tc_queue;
3211 queue->queue_id = queue_id;
3212 queue->created = time_msec();
3213 hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
3216 hcp->min_rate = hc->min_rate;
3217 hcp->max_rate = hc->max_rate;
3221 hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
3223 const struct tc_service_curve *rsc, *fsc, *usc;
3224 static const struct nl_policy tca_hfsc_policy[] = {
3226 .type = NL_A_UNSPEC,
3228 .min_len = sizeof(struct tc_service_curve),
3231 .type = NL_A_UNSPEC,
3233 .min_len = sizeof(struct tc_service_curve),
3236 .type = NL_A_UNSPEC,
3238 .min_len = sizeof(struct tc_service_curve),
3241 struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];
3243 if (!nl_parse_nested(nl_options, tca_hfsc_policy,
3244 attrs, ARRAY_SIZE(tca_hfsc_policy))) {
3245 VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
3249 rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
3250 fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
3251 usc = nl_attr_get(attrs[TCA_HFSC_USC]);
3253 if (rsc->m1 != 0 || rsc->d != 0 ||
3254 fsc->m1 != 0 || fsc->d != 0 ||
3255 usc->m1 != 0 || usc->d != 0) {
3256 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3257 "Non-linear service curves are not supported.");
3261 if (rsc->m2 != fsc->m2) {
3262 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3263 "Real-time service curves are not supported ");
3267 if (rsc->m2 > usc->m2) {
3268 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3269 "Min-rate service curve is greater than "
3270 "the max-rate service curve.");
3274 class->min_rate = fsc->m2;
3275 class->max_rate = usc->m2;
3280 hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
3281 struct hfsc_class *options,
3282 struct netdev_queue_stats *stats)
3285 unsigned int handle;
3286 struct nlattr *nl_options;
3288 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
3294 unsigned int major, minor;
3296 major = tc_get_major(handle);
3297 minor = tc_get_minor(handle);
3298 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3299 *queue_id = minor - 1;
3306 error = hfsc_parse_tca_options__(nl_options, options);
3313 hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
3314 unsigned int parent, struct hfsc_class *options,
3315 struct netdev_queue_stats *stats)
3318 struct ofpbuf *reply;
3320 error = tc_query_class(netdev, handle, parent, &reply);
3325 error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
3326 ofpbuf_delete(reply);
3331 hfsc_parse_qdisc_details__(struct netdev *netdev_, const struct smap *details,
3332 struct hfsc_class *class)
3334 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3336 const char *max_rate_s;
3338 max_rate_s = smap_get(details, "max-rate");
3339 max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
3342 enum netdev_features current;
3344 netdev_linux_read_features(netdev);
3345 current = !netdev->get_features_error ? netdev->current : 0;
3346 max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
3349 class->min_rate = max_rate;
3350 class->max_rate = max_rate;
3354 hfsc_parse_class_details__(struct netdev *netdev,
3355 const struct smap *details,
3356 struct hfsc_class * class)
3358 const struct hfsc *hfsc;
3359 uint32_t min_rate, max_rate;
3360 const char *min_rate_s, *max_rate_s;
3362 hfsc = hfsc_get__(netdev);
3363 min_rate_s = smap_get(details, "min-rate");
3364 max_rate_s = smap_get(details, "max-rate");
3366 min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
3367 min_rate = MAX(min_rate, 1);
3368 min_rate = MIN(min_rate, hfsc->max_rate);
3370 max_rate = (max_rate_s
3371 ? strtoull(max_rate_s, NULL, 10) / 8
3373 max_rate = MAX(max_rate, min_rate);
3374 max_rate = MIN(max_rate, hfsc->max_rate);
3376 class->min_rate = min_rate;
3377 class->max_rate = max_rate;
3382 /* Create an HFSC qdisc.
3384 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
3386 hfsc_setup_qdisc__(struct netdev * netdev)
3388 struct tcmsg *tcmsg;
3389 struct ofpbuf request;
3390 struct tc_hfsc_qopt opt;
3392 tc_del_qdisc(netdev);
3394 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3395 NLM_F_EXCL | NLM_F_CREATE, &request);
3401 tcmsg->tcm_handle = tc_make_handle(1, 0);
3402 tcmsg->tcm_parent = TC_H_ROOT;
3404 memset(&opt, 0, sizeof opt);
3407 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3408 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
3410 return tc_transact(&request, NULL);
3413 /* Create an HFSC class.
3415 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
3416 * sc rate <min_rate> ul rate <max_rate>" */
3418 hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
3419 unsigned int parent, struct hfsc_class *class)
3423 struct tcmsg *tcmsg;
3424 struct ofpbuf request;
3425 struct tc_service_curve min, max;
3427 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
3433 tcmsg->tcm_handle = handle;
3434 tcmsg->tcm_parent = parent;
3438 min.m2 = class->min_rate;
3442 max.m2 = class->max_rate;
3444 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3445 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3446 nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
3447 nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
3448 nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
3449 nl_msg_end_nested(&request, opt_offset);
3451 error = tc_transact(&request, NULL);
3453 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
3454 "min-rate %ubps, max-rate %ubps (%s)",
3455 netdev_get_name(netdev),
3456 tc_get_major(handle), tc_get_minor(handle),
3457 tc_get_major(parent), tc_get_minor(parent),
3458 class->min_rate, class->max_rate, ovs_strerror(error));
3465 hfsc_tc_install(struct netdev *netdev, const struct smap *details)
3468 struct hfsc_class class;
3470 error = hfsc_setup_qdisc__(netdev);
3476 hfsc_parse_qdisc_details__(netdev, details, &class);
3477 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3478 tc_make_handle(1, 0), &class);
3484 hfsc_install__(netdev, class.max_rate);
3489 hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3492 struct nl_dump dump;
3493 struct hfsc_class hc;
3496 hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
3497 hfsc_install__(netdev, hc.max_rate);
3499 if (!start_queue_dump(netdev, &dump)) {
3503 while (nl_dump_next(&dump, &msg)) {
3504 unsigned int queue_id;
3506 if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3507 hfsc_update_queue__(netdev, queue_id, &hc);
3511 nl_dump_done(&dump);
3516 hfsc_tc_destroy(struct tc *tc)
3519 struct hfsc_class *hc, *next;
3521 hfsc = CONTAINER_OF(tc, struct hfsc, tc);
3523 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
3524 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
3533 hfsc_qdisc_get(const struct netdev *netdev, struct smap *details)
3535 const struct hfsc *hfsc;
3536 hfsc = hfsc_get__(netdev);
3537 smap_add_format(details, "max-rate", "%llu", 8ULL * hfsc->max_rate);
3542 hfsc_qdisc_set(struct netdev *netdev, const struct smap *details)
3545 struct hfsc_class class;
3547 hfsc_parse_qdisc_details__(netdev, details, &class);
3548 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3549 tc_make_handle(1, 0), &class);
3552 hfsc_get__(netdev)->max_rate = class.max_rate;
3559 hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
3560 const struct tc_queue *queue, struct smap *details)
3562 const struct hfsc_class *hc;
3564 hc = hfsc_class_cast__(queue);
3565 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
3566 if (hc->min_rate != hc->max_rate) {
3567 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
3573 hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
3574 const struct smap *details)
3577 struct hfsc_class class;
3579 error = hfsc_parse_class_details__(netdev, details, &class);
3584 error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3585 tc_make_handle(1, 0xfffe), &class);
3590 hfsc_update_queue__(netdev, queue_id, &class);
3595 hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
3599 struct hfsc_class *hc;
3601 hc = hfsc_class_cast__(queue);
3602 hfsc = hfsc_get__(netdev);
3604 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3606 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
3613 hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3614 struct netdev_queue_stats *stats)
3616 return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3617 tc_make_handle(1, 0xfffe), NULL, stats);
3621 hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3622 const struct ofpbuf *nlmsg,
3623 netdev_dump_queue_stats_cb *cb, void *aux)
3625 struct netdev_queue_stats stats;
3626 unsigned int handle, major, minor;
3629 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3634 major = tc_get_major(handle);
3635 minor = tc_get_minor(handle);
3636 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3637 (*cb)(minor - 1, &stats, aux);
3642 static const struct tc_ops tc_ops_hfsc = {
3643 "hfsc", /* linux_name */
3644 "linux-hfsc", /* ovs_name */
3645 HFSC_N_QUEUES, /* n_queues */
3646 hfsc_tc_install, /* tc_install */
3647 hfsc_tc_load, /* tc_load */
3648 hfsc_tc_destroy, /* tc_destroy */
3649 hfsc_qdisc_get, /* qdisc_get */
3650 hfsc_qdisc_set, /* qdisc_set */
3651 hfsc_class_get, /* class_get */
3652 hfsc_class_set, /* class_set */
3653 hfsc_class_delete, /* class_delete */
3654 hfsc_class_get_stats, /* class_get_stats */
3655 hfsc_class_dump_stats /* class_dump_stats */
3658 /* "linux-default" traffic control class.
3660 * This class represents the default, unnamed Linux qdisc. It corresponds to
3661 * the "" (empty string) QoS type in the OVS database. */
3664 default_install__(struct netdev *netdev_)
3666 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3667 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_default);
3669 /* Nothing but a tc class implementation is allowed to write to a tc. This
3670 * class never does that, so we can legitimately use a const tc object. */
3671 netdev->tc = CONST_CAST(struct tc *, &tc);
3675 default_tc_install(struct netdev *netdev,
3676 const struct smap *details OVS_UNUSED)
3678 default_install__(netdev);
3683 default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3685 default_install__(netdev);
3689 static const struct tc_ops tc_ops_default = {
3690 NULL, /* linux_name */
3695 NULL, /* tc_destroy */
3696 NULL, /* qdisc_get */
3697 NULL, /* qdisc_set */
3698 NULL, /* class_get */
3699 NULL, /* class_set */
3700 NULL, /* class_delete */
3701 NULL, /* class_get_stats */
3702 NULL /* class_dump_stats */
3705 /* "linux-other" traffic control class.
3710 other_tc_load(struct netdev *netdev_, struct ofpbuf *nlmsg OVS_UNUSED)
3712 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3713 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_other);
3715 /* Nothing but a tc class implementation is allowed to write to a tc. This
3716 * class never does that, so we can legitimately use a const tc object. */
3717 netdev->tc = CONST_CAST(struct tc *, &tc);
3721 static const struct tc_ops tc_ops_other = {
3722 NULL, /* linux_name */
3723 "linux-other", /* ovs_name */
3725 NULL, /* tc_install */
3727 NULL, /* tc_destroy */
3728 NULL, /* qdisc_get */
3729 NULL, /* qdisc_set */
3730 NULL, /* class_get */
3731 NULL, /* class_set */
3732 NULL, /* class_delete */
3733 NULL, /* class_get_stats */
3734 NULL /* class_dump_stats */
3737 /* Traffic control. */
3739 /* Number of kernel "tc" ticks per second. */
3740 static double ticks_per_s;
3742 /* Number of kernel "jiffies" per second. This is used for the purpose of
3743 * computing buffer sizes. Generally kernel qdiscs need to be able to buffer
3744 * one jiffy's worth of data.
3746 * There are two possibilities here:
3748 * - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
3749 * approximate range of 100 to 1024. That means that we really need to
3750 * make sure that the qdisc can buffer that much data.
3752 * - 'buffer_hz' is an absurdly large number. That means that the kernel
3753 * has finely granular timers and there's no need to fudge additional room
3754 * for buffers. (There's no extra effort needed to implement that: the
3755 * large 'buffer_hz' is used as a divisor, so practically any number will
3756 * come out as 0 in the division. Small integer results in the case of
3757 * really high dividends won't have any real effect anyhow.)
3759 static unsigned int buffer_hz;
/* Composes and returns the tc handle 'major':'minor'. */
static unsigned int
tc_make_handle(unsigned int major, unsigned int minor)
{
    unsigned int handle = TC_H_MAKE(major << 16, minor);

    return handle;
}
/* Extracts and returns the major number from tc handle 'handle'. */
static unsigned int
tc_get_major(unsigned int handle)
{
    unsigned int major = TC_H_MAJ(handle) >> 16;

    return major;
}
/* Extracts and returns the minor number from tc handle 'handle'. */
static unsigned int
tc_get_minor(unsigned int handle)
{
    unsigned int minor = TC_H_MIN(handle);

    return minor;
}
3782 static struct tcmsg *
3783 tc_make_request(const struct netdev *netdev, int type, unsigned int flags,
3784 struct ofpbuf *request)
3786 struct tcmsg *tcmsg;
/* tc netlink requests are addressed by ifindex, so resolve it first. */
3790 error = get_ifindex(netdev, &ifindex);
/* Build an rtnetlink header of 'type' (RTM_NEWQDISC, RTM_NEWTCLASS, etc.)
 * followed by a zeroed struct tcmsg payload. */
3795 ofpbuf_init(request, 512);
3796 nl_msg_put_nlmsghdr(request, sizeof *tcmsg, type, NLM_F_REQUEST | flags);
3797 tcmsg = ofpbuf_put_zeros(request, sizeof *tcmsg);
3798 tcmsg->tcm_family = AF_UNSPEC;
3799 tcmsg->tcm_ifindex = ifindex;
3800 /* Caller should fill in tcmsg->tcm_handle. */
3801 /* Caller should fill in tcmsg->tcm_parent. */
3807 tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
/* Sends 'request' over the rtnetlink socket and, if 'replyp' is nonnull,
 * stores the kernel's reply in '*replyp'.  Always releases the memory that
 * backs 'request', whether the transaction succeeds or fails. */
3809 int error = nl_transact(NETLINK_ROUTE, request, replyp);
3810 ofpbuf_uninit(request);
3814 /* Adds or deletes a root ingress qdisc on 'netdev'. We use this for
3815 * policing configuration.
3817 * This function is equivalent to running the following when 'add' is true:
3818 * /sbin/tc qdisc add dev <devname> handle ffff: ingress
3820 * This function is equivalent to running the following when 'add' is false:
3821 * /sbin/tc qdisc del dev <devname> handle ffff: ingress
3823 * The configuration and stats may be seen with the following command:
3824 * /sbin/tc -s qdisc show dev <devname>
3826 * Returns 0 if successful, otherwise a positive errno value.
3829 tc_add_del_ingress_qdisc(struct netdev *netdev, bool add)
3831 struct ofpbuf request;
3832 struct tcmsg *tcmsg;
/* RTM_NEWQDISC with EXCL|CREATE to add; RTM_DELQDISC with no flags to
 * delete. */
3834 int type = add ? RTM_NEWQDISC : RTM_DELQDISC;
3835 int flags = add ? NLM_F_EXCL | NLM_F_CREATE : 0;
3837 tcmsg = tc_make_request(netdev, type, flags, &request);
/* The ingress qdisc always lives at handle ffff: under TC_H_INGRESS. */
3841 tcmsg->tcm_handle = tc_make_handle(0xffff, 0);
3842 tcmsg->tcm_parent = TC_H_INGRESS;
3843 nl_msg_put_string(&request, TCA_KIND, "ingress");
3844 nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0);
3846 error = tc_transact(&request, NULL);
3848 /* If we're deleting the qdisc, don't worry about some of the
3849 * error conditions. */
/* ENOENT: there was no ingress qdisc to delete; EINVAL is also tolerated
 * here when deleting. */
3850 if (!add && (error == ENOENT || error == EINVAL)) {
3859 /* Adds a policer to 'netdev' with a rate of 'kbits_rate' and a burst size
3862 * This function is equivalent to running:
3863 * /sbin/tc filter add dev <devname> parent ffff: protocol all prio 49
3864 * basic police rate <kbits_rate>kbit burst <kbits_burst>k
3867 * The configuration and stats may be seen with the following command:
3868 * /sbin/tc -s filter show dev <devname> parent ffff:
3870 * Returns 0 if successful, otherwise a positive errno value.
3873 tc_add_policer(struct netdev *netdev, int kbits_rate, int kbits_burst)
3875 struct tc_police tc_police;
3876 struct ofpbuf request;
3877 struct tcmsg *tcmsg;
3878 size_t basic_offset;
3879 size_t police_offset;
/* Packets exceeding the policer's rate are dropped (TC_POLICE_SHOT).
 * NOTE(review): 'mtu' is declared/assigned on a line missing from this
 * listing — confirm its origin in the full file. */
3883 memset(&tc_police, 0, sizeof tc_police);
3884 tc_police.action = TC_POLICE_SHOT;
3885 tc_police.mtu = mtu;
/* kbits/s -> bytes/s: * 1000 (kilo) / 8 (bits per byte). */
3886 tc_fill_rate(&tc_police.rate, (kbits_rate * 1000)/8, mtu);
/* NOTE(review): burst is scaled by 1024, i.e. treated as kilobytes rather
 * than kilobits — verify against the intended tc semantics. */
3887 tc_police.burst = tc_bytes_to_ticks(tc_police.rate.rate,
3888 kbits_burst * 1024);
3890 tcmsg = tc_make_request(netdev, RTM_NEWTFILTER,
3891 NLM_F_EXCL | NLM_F_CREATE, &request);
/* Attach to the ingress qdisc (parent ffff:) at priority 49, all protocols. */
3895 tcmsg->tcm_parent = tc_make_handle(0xffff, 0);
3896 tcmsg->tcm_info = tc_make_handle(49,
3897 (OVS_FORCE uint16_t) htons(ETH_P_ALL));
/* Nest TCA_OPTIONS > TCA_BASIC_POLICE around the policer parameters. */
3899 nl_msg_put_string(&request, TCA_KIND, "basic");
3900 basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3901 police_offset = nl_msg_start_nested(&request, TCA_BASIC_POLICE);
3902 nl_msg_put_unspec(&request, TCA_POLICE_TBF, &tc_police, sizeof tc_police);
3903 tc_put_rtab(&request, TCA_POLICE_RATE, &tc_police.rate);
3904 nl_msg_end_nested(&request, police_offset);
3905 nl_msg_end_nested(&request, basic_offset);
3907 error = tc_transact(&request, NULL);
/* Reads /proc/net/psched once and derives 'ticks_per_s' (and, per the debug
 * log below, 'buffer_hz') used by the tc tick-conversion helpers.
 * NOTE(review): the function's signature line is not visible in this
 * listing; upstream names it read_psched() — confirm. */
3918 /* The values in psched are not individually very meaningful, but they are
3919 * important. The tables below show some values seen in the wild.
3923 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
3924 * (Before that, there are hints that it was 1000000000.)
3926 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
3930 * -----------------------------------
3931 * [1] 000c8000 000f4240 000f4240 00000064
3932 * [2] 000003e8 00000400 000f4240 3b9aca00
3933 * [3] 000003e8 00000400 000f4240 3b9aca00
3934 * [4] 000003e8 00000400 000f4240 00000064
3935 * [5] 000003e8 00000040 000f4240 3b9aca00
3936 * [6] 000003e8 00000040 000f4240 000000f9
3938 * a b c d ticks_per_s buffer_hz
3939 * ------- --------- ---------- ------------- ----------- -------------
3940 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
3941 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3942 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3943 * [4] 1,000 1,024 1,000,000 100 976,562 100
3944 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
3945 * [6] 1,000 64 1,000,000 249 15,625,000 249
3947 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
3948 * [2] 2.6.26-1-686-bigmem from Debian lenny
3949 * [3] 2.6.26-2-sparc64 from Debian lenny
3950 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
3951 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
3952 * [6] 2.6.34 from kernel.org on KVM
/* One-time initialization: only the first thread through does the read. */
3954 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
3955 static const char fn[] = "/proc/net/psched";
3956 unsigned int a, b, c, d;
3959 if (!ovsthread_once_start(&once)) {
3966 stream = fopen(fn, "r");
3968 VLOG_WARN("%s: open failed: %s", fn, ovs_strerror(errno));
/* The file holds four hex words; anything else is a parse failure. */
3972 if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
3973 VLOG_WARN("%s: read failed", fn);
3977 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
3981 VLOG_WARN("%s: invalid scheduler parameters", fn);
/* a/b is the ticks-per-microsecond ratio; multiplying by c scales it to
 * ticks per second. Done in double to avoid integer overflow/truncation. */
3985 ticks_per_s = (double) a * c / b;
3989 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
3992 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);
3995 ovsthread_once_done(&once);
3998 /* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
3999 * rate of 'rate' bytes per second. */
4001 tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
/* NOTE(review): 'rate * ticks' could overflow if both operands are computed
 * in unsigned int before the division — confirm ticks_per_s's type forces
 * floating-point arithmetic here. */
4004 return (rate * ticks) / ticks_per_s;
4007 /* Returns the number of ticks that it would take to transmit 'size' bytes at a
4008 * rate of 'rate' bytes per second. */
4010 tc_bytes_to_ticks(unsigned int rate, unsigned int size)
/* Guard against division by zero: a zero rate maps to zero ticks.
 * The 64-bit cast keeps ticks_per_s * size from overflowing. */
4013 return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
4016 /* Returns the number of bytes that need to be reserved for qdisc buffering at
4017 * a transmission rate of 'rate' bytes per second. */
4019 tc_buffer_per_jiffy(unsigned int rate)
/* bytes/sec divided by timer interrupts/sec = bytes sent per jiffy. */
4022 return rate / buffer_hz;
4025 /* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
4026 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
4027 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
4028 * stores NULL into it if it is absent.
4030 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
4033 * Returns 0 if successful, otherwise a positive errno value. */
4035 tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
4036 struct nlattr **options)
/* TCA_KIND is mandatory; TCA_OPTIONS may legitimately be absent. */
4038 static const struct nl_policy tca_policy[] = {
4039 [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
4040 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
4042 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
/* Attributes start after the netlink header plus the fixed tcmsg. */
4044 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
4045 tca_policy, ta, ARRAY_SIZE(ta))) {
4046 VLOG_WARN_RL(&rl, "failed to parse qdisc message");
4051 *kind = nl_attr_get_string(ta[TCA_KIND]);
4055 *options = ta[TCA_OPTIONS];
4070 /* Given Netlink 'msg' that describes a class, extracts the queue ID (e.g. the
4071 * minor number of its class ID) into '*queue_id', its TCA_OPTIONS attribute
4072 * into '*options', and its queue statistics into '*stats'. Any of the output
4073 * arguments may be null.
4075 * Returns 0 if successful, otherwise a positive errno value. */
4077 tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
4078 struct nlattr **options, struct netdev_queue_stats *stats)
/* Both the options and the TCA_STATS2 nest are required in class replies. */
4080 static const struct nl_policy tca_policy[] = {
4081 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
4082 [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
4084 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
4086 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
4087 tca_policy, ta, ARRAY_SIZE(ta))) {
4088 VLOG_WARN_RL(&rl, "failed to parse class message");
/* The class handle lives in the fixed tcmsg header, not an attribute. */
4093 struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
4094 *handlep = tc->tcm_handle;
4098 *options = ta[TCA_OPTIONS];
4102 const struct gnet_stats_queue *gsq;
4103 struct gnet_stats_basic gsb;
4105 static const struct nl_policy stats_policy[] = {
4106 [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
4107 .min_len = sizeof gsb },
4108 [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
4109 .min_len = sizeof *gsq },
4111 struct nlattr *sa[ARRAY_SIZE(stats_policy)];
4113 if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
4114 sa, ARRAY_SIZE(sa))) {
4115 VLOG_WARN_RL(&rl, "failed to parse class stats");
4119 /* Alignment issues screw up the length of struct gnet_stats_basic on
4120 * some arch/bitsize combinations. Newer versions of Linux have a
4121 * struct gnet_stats_basic_packed, but we can't depend on that. The
4122 * easiest thing to do is just to make a copy. */
4123 memset(&gsb, 0, sizeof gsb);
4124 memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
4125 MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
4126 stats->tx_bytes = gsb.bytes;
4127 stats->tx_packets = gsb.packets;
/* Queue drops are the closest available analog of tx errors. */
4129 gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
4130 stats->tx_errors = gsq->drops;
/* Error path (per the surrounding gaps): zero the caller's stats so they
 * are at least deterministic. */
4140 memset(stats, 0, sizeof *stats);
4145 /* Queries the kernel for class with identifier 'handle' and parent 'parent'
4148 tc_query_class(const struct netdev *netdev,
4149 unsigned int handle, unsigned int parent,
4150 struct ofpbuf **replyp)
4152 struct ofpbuf request;
4153 struct tcmsg *tcmsg;
/* NLM_F_ECHO asks the kernel to send the class definition back in the
 * reply, which tc_transact() stores in '*replyp'. */
4156 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO, &request);
4160 tcmsg->tcm_handle = handle;
4161 tcmsg->tcm_parent = parent;
4163 error = tc_transact(&request, replyp);
/* Log major:minor of both handle and parent for easier tc debugging. */
4165 VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
4166 netdev_get_name(netdev),
4167 tc_get_major(handle), tc_get_minor(handle),
4168 tc_get_major(parent), tc_get_minor(parent),
4169 ovs_strerror(error));
4174 /* Equivalent to "tc class del dev <name> handle <handle>". */
4176 tc_delete_class(const struct netdev *netdev, unsigned int handle)
4178 struct ofpbuf request;
4179 struct tcmsg *tcmsg;
4182 tcmsg = tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
/* Parent 0 lets the kernel locate the class by its handle alone. */
4186 tcmsg->tcm_handle = handle;
4187 tcmsg->tcm_parent = 0;
4189 error = tc_transact(&request, NULL);
4191 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
4192 netdev_get_name(netdev),
4193 tc_get_major(handle), tc_get_minor(handle),
4194 ovs_strerror(error));
4199 /* Equivalent to "tc qdisc del dev <name> root". */
4201 tc_del_qdisc(struct netdev *netdev_)
4203 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4204 struct ofpbuf request;
4205 struct tcmsg *tcmsg;
4208 tcmsg = tc_make_request(netdev_, RTM_DELQDISC, 0, &request);
/* 1:0 is the handle OVS assigns to qdiscs it creates (see tc_query_qdisc). */
4212 tcmsg->tcm_handle = tc_make_handle(1, 0);
4213 tcmsg->tcm_parent = TC_H_ROOT;
4215 error = tc_transact(&request, NULL);
4216 if (error == EINVAL) {
4217 /* EINVAL probably means that the default qdisc was in use, in which
4218 * case we've accomplished our purpose. */
/* On success, also drop our cached tc state for this netdev. */
4221 if (!error && netdev->tc) {
4222 if (netdev->tc->ops->tc_destroy) {
4223 netdev->tc->ops->tc_destroy(netdev->tc);
4230 /* If 'netdev''s qdisc type and parameters are not yet known, queries the
4231 * kernel to determine what they are. Returns 0 if successful, otherwise a
4232 * positive errno value. */
4234 tc_query_qdisc(const struct netdev *netdev_)
4236 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4237 struct ofpbuf request, *qdisc;
4238 const struct tc_ops *ops;
4239 struct tcmsg *tcmsg;
4247 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
4248 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
4249 * 2.6.35 without that fix backported to it.
4251 * To avoid the OOPS, we must not make a request that would attempt to dump
4252 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
4253 * few others. There are a few ways that I can see to do this, but most of
4254 * them seem to be racy (and if you lose the race the kernel OOPSes). The
4255 * technique chosen here is to assume that any non-default qdisc that we
4256 * create will have a class with handle 1:0. The built-in qdiscs only have
4257 * a class with handle 0:0.
4259 * We could check for Linux 2.6.35+ and use a more straightforward method
4261 tcmsg = tc_make_request(netdev_, RTM_GETQDISC, NLM_F_ECHO, &request);
4265 tcmsg->tcm_handle = tc_make_handle(1, 0);
4266 tcmsg->tcm_parent = 0;
4268 /* Figure out what tc class to instantiate. */
4269 error = tc_transact(&request, &qdisc);
4273 error = tc_parse_qdisc(qdisc, &kind, NULL);
/* Unparseable reply: fall back to the opaque "other" implementation. */
4275 ops = &tc_ops_other;
4277 ops = tc_lookup_linux_name(kind);
/* Separate rate limiter so unknown-qdisc messages don't starve 'rl'. */
4279 static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
4280 VLOG_INFO_RL(&rl2, "unknown qdisc \"%s\"", kind);
4282 ops = &tc_ops_other;
4285 } else if (error == ENOENT) {
4286 /* Either it's a built-in qdisc, or it's a qdisc set up by some
4287 * other entity that doesn't have a handle 1:0. We will assume
4288 * that it's the system default qdisc. */
4289 ops = &tc_ops_default;
4292 /* Who knows? Maybe the device got deleted. */
4293 VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
4294 netdev_get_name(netdev_), ovs_strerror(error));
4295 ops = &tc_ops_other;
4298 /* Instantiate it. */
/* tc_load() must set netdev->tc exactly when it succeeds; assert that
 * invariant before freeing the reply buffer. */
4299 load_error = ops->tc_load(CONST_CAST(struct netdev *, netdev_), qdisc);
4300 ovs_assert((load_error == 0) == (netdev->tc != NULL));
4301 ofpbuf_delete(qdisc);
4303 return error ? error : load_error;
4306 /* Linux traffic control uses tables with 256 entries ("rtab" tables) to
4307 approximate the time to transmit packets of various lengths. For an MTU of
4308 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
4309 represents two possible packet lengths; for a MTU of 513 through 1024, four
4310 possible lengths; and so on.
4312 Returns, for the specified 'mtu', the number of bits that packet lengths
4313 need to be shifted right to fit within such a 256-entry table. */
4315 tc_calc_cell_log(unsigned int mtu)
/* A zero MTU falls back to the standard Ethernet payload maximum. */
4320 mtu = ETH_PAYLOAD_MAX;
/* Account for L2 framing (and a VLAN tag) on top of the payload MTU. */
4322 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
/* Find the smallest shift that maps any length <= mtu into 256 buckets. */
4324 for (cell_log = 0; mtu >= 256; cell_log++) {
4331 /* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
4334 tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
4336 memset(rate, 0, sizeof *rate);
4337 rate->cell_log = tc_calc_cell_log(mtu);
4338 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
4339 /* rate->cell_align = 0; */ /* distro headers. */
/* No packet is ever billed below the minimum Ethernet frame size. */
4340 rate->mpu = ETH_TOTAL_MIN;
4344 /* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
4345 * attribute of the specified "type".
4347 * See tc_calc_cell_log() above for a description of "rtab"s. */
4349 tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
4354 rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
/* Entry i covers packet lengths up to (i + 1) << cell_log; store the
 * transmit time in ticks for that (mpu-clamped) length. */
4355 for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
4356 unsigned packet_size = (i + 1) << rate->cell_log;
4357 if (packet_size < rate->mpu) {
4358 packet_size = rate->mpu;
4360 rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
4364 /* Calculates the proper value of 'buffer' or 'cbuffer' in HTB options given a
4365 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
4366 * burst size of 'burst_bytes'. (If no value was requested, a 'burst_bytes' of
4369 tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
/* The buffer must cover at least one jiffy of traffic plus one full MTU,
 * otherwise the shaper cannot sustain 'Bps'. */
4371 unsigned int min_burst = tc_buffer_per_jiffy(Bps) + mtu;
4372 return tc_bytes_to_ticks(Bps, MAX(burst_bytes, min_burst));
4375 /* Linux-only functions declared in netdev-linux.h */
4377 /* Modifies the 'flag' bit in ethtool's flags field for 'netdev'. If
4378 * 'enable' is true, the bit is set. Otherwise, it is cleared. */
4380 netdev_linux_ethtool_set_flag(struct netdev *netdev, uint32_t flag,
4381 const char *flag_name, bool enable)
4383 const char *netdev_name = netdev_get_name(netdev);
4384 struct ethtool_value evalue;
/* Step 1: read the current flags word. */
4388 COVERAGE_INC(netdev_get_ethtool);
4389 memset(&evalue, 0, sizeof evalue);
4390 error = netdev_linux_do_ethtool(netdev_name,
4391 (struct ethtool_cmd *)&evalue,
4392 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
/* Step 2: write it back with 'flag' set or cleared as requested. */
4397 COVERAGE_INC(netdev_set_ethtool);
4398 evalue.data = new_flags = (evalue.data & ~flag) | (enable ? flag : 0);
4399 error = netdev_linux_do_ethtool(netdev_name,
4400 (struct ethtool_cmd *)&evalue,
4401 ETHTOOL_SFLAGS, "ETHTOOL_SFLAGS");
/* Step 3: re-read and verify, since drivers may silently ignore SFLAGS. */
4406 COVERAGE_INC(netdev_get_ethtool);
4407 memset(&evalue, 0, sizeof evalue);
4408 error = netdev_linux_do_ethtool(netdev_name,
4409 (struct ethtool_cmd *)&evalue,
4410 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
4415 if (new_flags != evalue.data) {
4416 VLOG_WARN_RL(&rl, "attempt to %s ethtool %s flag on network "
4417 "device %s failed", enable ? "enable" : "disable",
4418 flag_name, netdev_name);
4425 /* Utility functions. */
4427 /* Copies 'src' into 'dst', performing format conversion in the process. */
/* Field-by-field copy widens the kernel's 32-bit rtnl_link_stats counters
 * into netdev_stats; the field sets correspond one-to-one. */
4429 netdev_stats_from_rtnl_link_stats(struct netdev_stats *dst,
4430 const struct rtnl_link_stats *src)
4432 dst->rx_packets = src->rx_packets;
4433 dst->tx_packets = src->tx_packets;
4434 dst->rx_bytes = src->rx_bytes;
4435 dst->tx_bytes = src->tx_bytes;
4436 dst->rx_errors = src->rx_errors;
4437 dst->tx_errors = src->tx_errors;
4438 dst->rx_dropped = src->rx_dropped;
4439 dst->tx_dropped = src->tx_dropped;
4440 dst->multicast = src->multicast;
4441 dst->collisions = src->collisions;
4442 dst->rx_length_errors = src->rx_length_errors;
4443 dst->rx_over_errors = src->rx_over_errors;
4444 dst->rx_crc_errors = src->rx_crc_errors;
4445 dst->rx_frame_errors = src->rx_frame_errors;
4446 dst->rx_fifo_errors = src->rx_fifo_errors;
4447 dst->rx_missed_errors = src->rx_missed_errors;
4448 dst->tx_aborted_errors = src->tx_aborted_errors;
4449 dst->tx_carrier_errors = src->tx_carrier_errors;
4450 dst->tx_fifo_errors = src->tx_fifo_errors;
4451 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
4452 dst->tx_window_errors = src->tx_window_errors;
/* Fetches interface statistics for 'ifindex' via an RTM_GETLINK request and
 * converts them into '*stats'. Returns 0 on success, else a positive errno. */
4456 get_stats_via_netlink(int ifindex, struct netdev_stats *stats)
4458 /* Policy for RTNLGRP_LINK messages.
4460 * There are *many* more fields in these messages, but currently we only
4461 * care about these fields. */
4462 static const struct nl_policy rtnlgrp_link_policy[] = {
4463 [IFLA_IFNAME] = { .type = NL_A_STRING, .optional = false },
4464 [IFLA_STATS] = { .type = NL_A_UNSPEC, .optional = true,
4465 .min_len = sizeof(struct rtnl_link_stats) },
4468 struct ofpbuf request;
4469 struct ofpbuf *reply;
4470 struct ifinfomsg *ifi;
4471 struct nlattr *attrs[ARRAY_SIZE(rtnlgrp_link_policy)];
/* Build and send the RTM_GETLINK request for this single ifindex. */
4474 ofpbuf_init(&request, 0);
4475 nl_msg_put_nlmsghdr(&request, sizeof *ifi, RTM_GETLINK, NLM_F_REQUEST);
4476 ifi = ofpbuf_put_zeros(&request, sizeof *ifi);
4477 ifi->ifi_family = PF_UNSPEC;
4478 ifi->ifi_index = ifindex;
4479 error = nl_transact(NETLINK_ROUTE, &request, &reply);
4480 ofpbuf_uninit(&request);
4485 if (!nl_policy_parse(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg),
4486 rtnlgrp_link_policy,
4487 attrs, ARRAY_SIZE(rtnlgrp_link_policy))) {
4488 ofpbuf_delete(reply);
/* IFLA_STATS is optional in the policy, so check it explicitly. */
4492 if (!attrs[IFLA_STATS]) {
4493 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
4494 ofpbuf_delete(reply);
4498 netdev_stats_from_rtnl_link_stats(stats, nl_attr_get(attrs[IFLA_STATS]));
4500 ofpbuf_delete(reply);
/* Fallback statistics reader: scans /proc/net/dev for 'netdev_name' and
 * fills '*stats' from the 15 counters it exposes; counters that /proc does
 * not provide are set to UINT64_MAX ("unavailable"). */
4506 get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats)
4508 static const char fn[] = "/proc/net/dev";
4513 stream = fopen(fn, "r");
4515 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, ovs_strerror(errno));
4520 while (fgets(line, sizeof line, stream)) {
4523 #define X64 "%"SCNu64
/* /proc/net/dev lines carry 8 rx + 8 tx columns; the "%*u" fields are
 * columns we do not keep (see the 15-field count below). */
4526 X64 X64 X64 X64 X64 X64 X64 "%*u"
4527 X64 X64 X64 X64 X64 X64 X64 "%*u",
4533 &stats->rx_fifo_errors,
4534 &stats->rx_frame_errors,
4540 &stats->tx_fifo_errors,
4542 &stats->tx_carrier_errors) != 15) {
4543 VLOG_WARN_RL(&rl, "%s:%d: parse error", fn, ln);
4544 } else if (!strcmp(devname, netdev_name)) {
/* These counters are not present in /proc/net/dev; mark unavailable. */
4545 stats->rx_length_errors = UINT64_MAX;
4546 stats->rx_over_errors = UINT64_MAX;
4547 stats->rx_crc_errors = UINT64_MAX;
4548 stats->rx_missed_errors = UINT64_MAX;
4549 stats->tx_aborted_errors = UINT64_MAX;
4550 stats->tx_heartbeat_errors = UINT64_MAX;
4551 stats->tx_window_errors = UINT64_MAX;
4557 VLOG_WARN_RL(&rl, "%s: no stats for %s", fn, netdev_name);
/* Reads the interface flags (IFF_*) of 'dev' into '*flags' via the
 * SIOCGIFFLAGS ioctl. Returns 0 on success, else a positive errno. */
4563 get_flags(const struct netdev *dev, unsigned int *flags)
4569 error = af_inet_ifreq_ioctl(dev->name, &ifr, SIOCGIFFLAGS, "SIOCGIFFLAGS");
4571 *flags = ifr.ifr_flags;
/* Sets the interface flags of device 'name' to 'flags' via SIOCSIFFLAGS.
 * Returns the ioctl helper's result (0 or a positive errno). */
4577 set_flags(const char *name, unsigned int flags)
4581 ifr.ifr_flags = flags;
4582 return af_inet_ifreq_ioctl(name, &ifr, SIOCSIFFLAGS, "SIOCSIFFLAGS");
/* Looks up the kernel ifindex for 'netdev_name' via SIOCGIFINDEX.
 * Returns the (positive) ifindex on success; on failure logs a warning and
 * (per the callers' '-ifindex' handling) returns a negative errno. */
4586 do_get_ifindex(const char *netdev_name)
4591 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4592 COVERAGE_INC(netdev_get_ifindex);
4594 error = af_inet_ioctl(SIOCGIFINDEX, &ifr);
4596 VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
4597 netdev_name, ovs_strerror(error));
4600 return ifr.ifr_ifindex;
/* Returns 'netdev_''s ifindex in '*ifindexp', caching both the value and
 * any lookup error so the ioctl runs at most once per cache validation. */
4604 get_ifindex(const struct netdev *netdev_, int *ifindexp)
4606 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4608 if (!(netdev->cache_valid & VALID_IFINDEX)) {
4609 int ifindex = do_get_ifindex(netdev_get_name(netdev_));
/* Negative result encodes an errno; cache it and report ifindex 0. */
4612 netdev->get_ifindex_error = -ifindex;
4613 netdev->ifindex = 0;
4615 netdev->get_ifindex_error = 0;
4616 netdev->ifindex = ifindex;
4618 netdev->cache_valid |= VALID_IFINDEX;
4621 *ifindexp = netdev->ifindex;
4622 return netdev->get_ifindex_error;
/* Retrieves the Ethernet address of 'netdev_name' into 'ea' via
 * SIOCGIFHWADDR. Returns 0 on success, else a positive errno. */
4626 get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN])
4632 memset(&ifr, 0, sizeof ifr);
4633 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4634 COVERAGE_INC(netdev_get_hwaddr);
4635 error = af_inet_ioctl(SIOCGIFHWADDR, &ifr);
4637 /* ENODEV probably means that a vif disappeared asynchronously and
4638 * hasn't been removed from the database yet, so reduce the log level
4639 * to INFO for that case. */
4640 VLOG(error == ENODEV ? VLL_INFO : VLL_ERR,
4641 "ioctl(SIOCGIFHWADDR) on %s device failed: %s",
4642 netdev_name, ovs_strerror(error));
/* Accept AF_UNSPEC as well as ARPHRD_ETHER; warn on anything else. */
4645 hwaddr_family = ifr.ifr_hwaddr.sa_family;
4646 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER) {
4647 VLOG_WARN("%s device has unknown hardware address family %d",
4648 netdev_name, hwaddr_family);
4650 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
/* Sets the Ethernet address of 'netdev_name' to 'mac' via SIOCSIFHWADDR.
 * Returns 0 on success, else a positive errno (also logged). */
4655 set_etheraddr(const char *netdev_name,
4656 const uint8_t mac[ETH_ADDR_LEN])
4661 memset(&ifr, 0, sizeof ifr);
4662 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4663 ifr.ifr_hwaddr.sa_family = ARPHRD_ETHER;
4664 memcpy(ifr.ifr_hwaddr.sa_data, mac, ETH_ADDR_LEN);
4665 COVERAGE_INC(netdev_set_hwaddr);
4666 error = af_inet_ioctl(SIOCSIFHWADDR, &ifr);
4668 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
4669 netdev_name, ovs_strerror(error));
/* Issues ethtool command 'cmd' (named 'cmd_name' for logging) on device
 * 'name', using 'ecmd' as the in/out command buffer. Returns 0 on success,
 * else a positive errno; EOPNOTSUPP is expected and not logged. */
4675 netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
4676 int cmd, const char *cmd_name)
4681 memset(&ifr, 0, sizeof ifr);
4682 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
/* The ethtool ABI passes the command struct via ifr_data. */
4683 ifr.ifr_data = (caddr_t) ecmd;
4686 error = af_inet_ioctl(SIOCETHTOOL, &ifr);
4688 if (error != EOPNOTSUPP) {
4689 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
4690 "failed: %s", cmd_name, name, ovs_strerror(error));
4692 /* The device doesn't support this operation. That's pretty
4693 * common, so there's no point in logging anything. */
/* Runs the address-fetching ioctl 'cmd' (named 'cmd_name' for logging) on
 * 'netdev' and stores the resulting IPv4 address in '*ip'. Returns the
 * ioctl helper's result (0 or a positive errno). */
4700 netdev_linux_get_ipv4(const struct netdev *netdev, struct in_addr *ip,
4701 int cmd, const char *cmd_name)
4706 ifr.ifr_addr.sa_family = AF_INET;
4707 error = af_inet_ifreq_ioctl(netdev_get_name(netdev), &ifr, cmd, cmd_name);
/* ALIGNED_CAST avoids the strict-alignment issue of casting the embedded
 * sockaddr directly to sockaddr_in. */
4709 const struct sockaddr_in *sin = ALIGNED_CAST(struct sockaddr_in *,
4711 *ip = sin->sin_addr;
4716 /* Returns an AF_PACKET raw socket or a negative errno value. */
4718 af_packet_sock(void)
4720 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
4723 if (ovsthread_once_start(&once)) {
4724 sock = socket(AF_PACKET, SOCK_RAW, 0);
4726 int error = set_nonblocking(sock);
4733 VLOG_ERR("failed to create packet socket: %s",
4734 ovs_strerror(errno));
4736 ovsthread_once_done(&once);