2 * Copyright (c) 2009, 2010, 2011, 2012, 2013 Nicira, Inc.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
19 #include "netdev-linux.h"
23 #include <arpa/inet.h>
25 #include <linux/filter.h>
26 #include <linux/gen_stats.h>
27 #include <linux/if_ether.h>
28 #include <linux/if_tun.h>
29 #include <linux/types.h>
30 #include <linux/ethtool.h>
31 #include <linux/mii.h>
32 #include <linux/pkt_cls.h>
33 #include <linux/pkt_sched.h>
34 #include <linux/rtnetlink.h>
35 #include <linux/sockios.h>
36 #include <linux/version.h>
37 #include <sys/types.h>
38 #include <sys/ioctl.h>
39 #include <sys/socket.h>
40 #include <netpacket/packet.h>
42 #include <net/if_arp.h>
43 #include <net/if_packet.h>
44 #include <net/route.h>
45 #include <netinet/in.h>
52 #include "dpif-linux.h"
53 #include "dynamic-string.h"
54 #include "fatal-signal.h"
57 #include "netdev-provider.h"
58 #include "netdev-vport.h"
59 #include "netlink-notifier.h"
60 #include "netlink-socket.h"
63 #include "openflow/openflow.h"
65 #include "poll-loop.h"
66 #include "rtnetlink-link.h"
68 #include "socket-util.h"
71 #include "unaligned.h"
74 VLOG_DEFINE_THIS_MODULE(netdev_linux);
76 COVERAGE_DEFINE(netdev_set_policing);
77 COVERAGE_DEFINE(netdev_arp_lookup);
78 COVERAGE_DEFINE(netdev_get_ifindex);
79 COVERAGE_DEFINE(netdev_get_hwaddr);
80 COVERAGE_DEFINE(netdev_set_hwaddr);
81 COVERAGE_DEFINE(netdev_get_ethtool);
82 COVERAGE_DEFINE(netdev_set_ethtool);
85 /* These were introduced in Linux 2.6.14, so they might be missing if we have
 * old headers. */
87 #ifndef ADVERTISED_Pause
88 #define ADVERTISED_Pause (1 << 13)
90 #ifndef ADVERTISED_Asym_Pause
91 #define ADVERTISED_Asym_Pause (1 << 14)
94 /* These were introduced in Linux 2.6.24, so they might be missing if we
95 * have old headers. */
96 #ifndef ETHTOOL_GFLAGS
97 #define ETHTOOL_GFLAGS 0x00000025 /* Get flags bitmap(ethtool_value) */
99 #ifndef ETHTOOL_SFLAGS
100 #define ETHTOOL_SFLAGS 0x00000026 /* Set flags bitmap(ethtool_value) */
103 /* This was introduced in Linux 2.6.25, so it might be missing if we have old
 * headers. */
106 #define TC_RTAB_SIZE 1024
110 VALID_IFINDEX = 1 << 0,
111 VALID_ETHERADDR = 1 << 1,
115 VALID_POLICING = 1 << 5,
116 VALID_VPORT_STAT_ERROR = 1 << 6,
117 VALID_DRVINFO = 1 << 7,
118 VALID_FEATURES = 1 << 8,
121 /* Traffic control. */
123 /* An instance of a traffic control class. Always associated with a particular
126 * Each TC implementation subclasses this with whatever additional data it
129 const struct tc_ops *ops;
130 struct hmap queues; /* Contains "struct tc_queue"s.
131 * Read by generic TC layer.
132 * Written only by TC implementation. */
135 #define TC_INITIALIZER(TC, OPS) { OPS, HMAP_INITIALIZER(&(TC)->queues) }
137 /* One traffic control queue.
139 * Each TC implementation subclasses this with whatever additional data it
142 struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
143 unsigned int queue_id; /* OpenFlow queue ID. */
144 long long int created; /* Time queue was created, in msecs. */
147 /* A particular kind of traffic control. Each implementation generally maps to
148 * one particular Linux qdisc class.
150 * The functions below return 0 if successful or a positive errno value on
151 * failure, except where otherwise noted. All of them must be provided, except
152 * where otherwise noted. */
154 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
155 * This is null for tc_ops_default and tc_ops_other, for which there are no
156 * appropriate values. */
157 const char *linux_name;
159 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
160 const char *ovs_name;
162 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
163 * queues. The queues are numbered 0 through n_queues - 1. */
164 unsigned int n_queues;
166 /* Called to install this TC class on 'netdev'. The implementation should
167 * make the Netlink calls required to set up 'netdev' with the right qdisc
168 * and configure it according to 'details'. The implementation may assume
169 * that the current qdisc is the default; that is, there is no need for it
170 * to delete the current qdisc before installing itself.
172 * The contents of 'details' should be documented as valid for 'ovs_name'
173 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
174 * (which is built as ovs-vswitchd.conf.db(8)).
176 * This function must return 0 if and only if it sets 'netdev->tc' to an
177 * initialized 'struct tc'.
179 * (This function is null for tc_ops_other, which cannot be installed. For
180 * other TC classes it should always be nonnull.) */
181 int (*tc_install)(struct netdev *netdev, const struct smap *details);
183 /* Called when the netdev code determines (through a Netlink query) that
184 * this TC class's qdisc is installed on 'netdev', but we didn't install
185 * it ourselves and so don't know any of the details.
187 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
188 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
189 * implementation should parse the other attributes of 'nlmsg' as
190 * necessary to determine its configuration. If necessary it should also
191 * use Netlink queries to determine the configuration of queues on
 * 'netdev'.
194 * This function must return 0 if and only if it sets 'netdev->tc' to an
195 * initialized 'struct tc'. */
196 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
198 /* Destroys the data structures allocated by the implementation as part of
199 * 'tc'. (This includes destroying 'tc->queues' by calling
202 * The implementation should not need to perform any Netlink calls. If
203 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
204 * (But it may not be desirable.)
206 * This function may be null if 'tc' is trivial. */
207 void (*tc_destroy)(struct tc *tc);
209 /* Retrieves details of 'netdev->tc' configuration into 'details'.
211 * The implementation should not need to perform any Netlink calls, because
212 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
213 * cached the configuration.
215 * The contents of 'details' should be documented as valid for 'ovs_name'
216 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
217 * (which is built as ovs-vswitchd.conf.db(8)).
219 * This function may be null if 'tc' is not configurable.
221 int (*qdisc_get)(const struct netdev *netdev, struct smap *details);
223 /* Reconfigures 'netdev->tc' according to 'details', performing any
224 * required Netlink calls to complete the reconfiguration.
226 * The contents of 'details' should be documented as valid for 'ovs_name'
227 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
228 * (which is built as ovs-vswitchd.conf.db(8)).
230 * This function may be null if 'tc' is not configurable.
232 int (*qdisc_set)(struct netdev *, const struct smap *details);
234 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
235 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
237 * The contents of 'details' should be documented as valid for 'ovs_name'
238 * in the "other_config" column in the "Queue" table in
239 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
241 * The implementation should not need to perform any Netlink calls, because
242 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
243 * cached the queue configuration.
245 * This function may be null if 'tc' does not have queues ('n_queues' is
 * 0). */
247 int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
248 struct smap *details);
250 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
251 * 'details', performing any required Netlink calls to complete the
252 * reconfiguration. The caller ensures that 'queue_id' is less than
 * 'n_queues'.
255 * The contents of 'details' should be documented as valid for 'ovs_name'
256 * in the "other_config" column in the "Queue" table in
257 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
259 * This function may be null if 'tc' does not have queues or its queues are
260 * not configurable. */
261 int (*class_set)(struct netdev *, unsigned int queue_id,
262 const struct smap *details);
264 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
265 * tc_queue's within 'netdev->tc->queues'.
267 * This function may be null if 'tc' does not have queues or its queues
268 * cannot be deleted. */
269 int (*class_delete)(struct netdev *, struct tc_queue *queue);
271 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
272 * 'struct tc_queue's within 'netdev->tc->queues'.
274 * On success, initializes '*stats'.
276 * This function may be null if 'tc' does not have queues or if it cannot
277 * report queue statistics. */
278 int (*class_get_stats)(const struct netdev *netdev,
279 const struct tc_queue *queue,
280 struct netdev_queue_stats *stats);
282 /* Extracts queue stats from 'nlmsg', which is a response to a
283 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
285 * This function may be null if 'tc' does not have queues or if it cannot
286 * report queue statistics. */
287 int (*class_dump_stats)(const struct netdev *netdev,
288 const struct ofpbuf *nlmsg,
289 netdev_dump_queue_stats_cb *cb, void *aux);
293 tc_init(struct tc *tc, const struct tc_ops *ops)
296 hmap_init(&tc->queues);
300 tc_destroy(struct tc *tc)
302 hmap_destroy(&tc->queues);
305 static const struct tc_ops tc_ops_htb;
306 static const struct tc_ops tc_ops_hfsc;
307 static const struct tc_ops tc_ops_default;
308 static const struct tc_ops tc_ops_other;
310 static const struct tc_ops *const tcs[] = {
311 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
312 &tc_ops_hfsc, /* Hierarchical fair service curve. */
313 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
314 &tc_ops_other, /* Some other qdisc. */
318 static unsigned int tc_make_handle(unsigned int major, unsigned int minor);
319 static unsigned int tc_get_major(unsigned int handle);
320 static unsigned int tc_get_minor(unsigned int handle);
322 static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
323 static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
324 static unsigned int tc_buffer_per_jiffy(unsigned int rate);
326 static struct tcmsg *tc_make_request(const struct netdev *, int type,
327 unsigned int flags, struct ofpbuf *);
328 static int tc_transact(struct ofpbuf *request, struct ofpbuf **replyp);
329 static int tc_add_del_ingress_qdisc(struct netdev *netdev, bool add);
330 static int tc_add_policer(struct netdev *netdev, int kbits_rate,
333 static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
334 struct nlattr **options);
335 static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
336 struct nlattr **options,
337 struct netdev_queue_stats *);
338 static int tc_query_class(const struct netdev *,
339 unsigned int handle, unsigned int parent,
340 struct ofpbuf **replyp);
341 static int tc_delete_class(const struct netdev *, unsigned int handle);
343 static int tc_del_qdisc(struct netdev *netdev);
344 static int tc_query_qdisc(const struct netdev *netdev);
346 static int tc_calc_cell_log(unsigned int mtu);
347 static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
348 static void tc_put_rtab(struct ofpbuf *, uint16_t type,
349 const struct tc_ratespec *rate);
350 static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
352 struct netdev_linux {
355 unsigned int cache_valid;
356 unsigned int change_seq;
358 bool miimon; /* Link status of last poll. */
359 long long int miimon_interval; /* Miimon Poll rate. Disabled if <= 0. */
360 struct timer miimon_timer;
362 /* The following are figured out "on demand" only. They are only valid
363 * when the corresponding VALID_* bit in 'cache_valid' is set. */
365 uint8_t etheraddr[ETH_ADDR_LEN];
366 struct in_addr address, netmask;
369 unsigned int ifi_flags;
370 long long int carrier_resets;
371 uint32_t kbits_rate; /* Policing data. */
372 uint32_t kbits_burst;
373 int vport_stats_error; /* Cached error code from vport_get_stats().
374 0 or an errno value. */
375 int netdev_mtu_error; /* Cached error code from SIOCGIFMTU or SIOCSIFMTU. */
376 int ether_addr_error; /* Cached error code from set/get etheraddr. */
377 int netdev_policing_error; /* Cached error code from set policing. */
378 int get_features_error; /* Cached error code from ETHTOOL_GSET. */
379 int get_ifindex_error; /* Cached error code from SIOCGIFINDEX. */
381 enum netdev_features current; /* Cached from ETHTOOL_GSET. */
382 enum netdev_features advertised; /* Cached from ETHTOOL_GSET. */
383 enum netdev_features supported; /* Cached from ETHTOOL_GSET. */
385 struct ethtool_drvinfo drvinfo; /* Cached from ETHTOOL_GDRVINFO. */
388 /* For devices of class netdev_tap_class only. */
392 struct netdev_rx_linux {
398 /* This is set pretty low because we probably won't learn anything from the
399 * additional log messages. */
400 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
402 static void netdev_linux_run(void);
404 static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
405 int cmd, const char *cmd_name);
406 static int netdev_linux_get_ipv4(const struct netdev *, struct in_addr *,
407 int cmd, const char *cmd_name);
408 static int get_flags(const struct netdev *, unsigned int *flags);
409 static int set_flags(const char *, unsigned int flags);
410 static int do_get_ifindex(const char *netdev_name);
411 static int get_ifindex(const struct netdev *, int *ifindexp);
412 static int do_set_addr(struct netdev *netdev,
413 int ioctl_nr, const char *ioctl_name,
414 struct in_addr addr);
415 static int get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN]);
416 static int set_etheraddr(const char *netdev_name, const uint8_t[ETH_ADDR_LEN]);
417 static int get_stats_via_netlink(int ifindex, struct netdev_stats *stats);
418 static int get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats);
419 static int af_packet_sock(void);
420 static void netdev_linux_miimon_run(void);
421 static void netdev_linux_miimon_wait(void);
424 is_netdev_linux_class(const struct netdev_class *netdev_class)
426 return netdev_class->run == netdev_linux_run;
430 is_tap_netdev(const struct netdev *netdev)
432 return netdev_get_class(netdev) == &netdev_tap_class;
435 static struct netdev_linux *
436 netdev_linux_cast(const struct netdev *netdev)
438 ovs_assert(is_netdev_linux_class(netdev_get_class(netdev)));
440 return CONTAINER_OF(netdev, struct netdev_linux, up);
443 static struct netdev_rx_linux *
444 netdev_rx_linux_cast(const struct netdev_rx *rx)
446 ovs_assert(is_netdev_linux_class(netdev_get_class(rx->netdev)));
447 return CONTAINER_OF(rx, struct netdev_rx_linux, up);
450 static void netdev_linux_update(struct netdev_linux *netdev,
451 const struct rtnetlink_link_change *);
452 static void netdev_linux_changed(struct netdev_linux *netdev,
453 unsigned int ifi_flags, unsigned int mask);
455 /* Returns a NETLINK_ROUTE socket listening for RTNLGRP_LINK changes, or NULL
456 * if no such socket could be created. */
457 static struct nl_sock *
458 netdev_linux_notify_sock(void)
460 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
461 static struct nl_sock *sock;
463 if (ovsthread_once_start(&once)) {
466 error = nl_sock_create(NETLINK_ROUTE, &sock);
468 error = nl_sock_join_mcgroup(sock, RTNLGRP_LINK);
470 nl_sock_destroy(sock);
474 ovsthread_once_done(&once);
481 netdev_linux_run(void)
483 struct nl_sock *sock;
486 netdev_linux_miimon_run();
488 sock = netdev_linux_notify_sock();
494 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
495 uint64_t buf_stub[4096 / 8];
498 ofpbuf_use_stub(&buf, buf_stub, sizeof buf_stub);
499 error = nl_sock_recv(sock, &buf, false);
501 struct rtnetlink_link_change change;
503 if (rtnetlink_link_parse(&buf, &change)) {
504 struct netdev *netdev_ = netdev_from_name(change.ifname);
505 if (netdev_ && is_netdev_linux_class(netdev_->netdev_class)) {
506 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
507 netdev_linux_update(netdev, &change);
508 netdev_close(netdev_);
511 } else if (error == ENOBUFS) {
512 struct shash device_shash;
513 struct shash_node *node;
517 shash_init(&device_shash);
518 netdev_get_devices(&netdev_linux_class, &device_shash);
519 SHASH_FOR_EACH (node, &device_shash) {
520 struct netdev *netdev_ = node->data;
521 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
524 get_flags(netdev_, &flags);
525 netdev_linux_changed(netdev, flags, 0);
526 netdev_close(netdev_);
528 shash_destroy(&device_shash);
529 } else if (error != EAGAIN) {
530 VLOG_WARN_RL(&rl, "error reading or parsing netlink (%s)",
531 ovs_strerror(error));
538 netdev_linux_wait(void)
540 struct nl_sock *sock;
542 netdev_linux_miimon_wait();
543 sock = netdev_linux_notify_sock();
545 nl_sock_wait(sock, POLLIN);
550 netdev_linux_changed(struct netdev_linux *dev,
551 unsigned int ifi_flags, unsigned int mask)
554 if (!dev->change_seq) {
558 if ((dev->ifi_flags ^ ifi_flags) & IFF_RUNNING) {
559 dev->carrier_resets++;
561 dev->ifi_flags = ifi_flags;
563 dev->cache_valid &= mask;
567 netdev_linux_update(struct netdev_linux *dev,
568 const struct rtnetlink_link_change *change)
570 if (change->nlmsg_type == RTM_NEWLINK) {
572 netdev_linux_changed(dev, change->ifi_flags, VALID_DRVINFO);
574 /* Update netdev from rtnl-change msg. */
576 dev->mtu = change->mtu;
577 dev->cache_valid |= VALID_MTU;
578 dev->netdev_mtu_error = 0;
581 if (!eth_addr_is_zero(change->addr)) {
582 memcpy(dev->etheraddr, change->addr, ETH_ADDR_LEN);
583 dev->cache_valid |= VALID_ETHERADDR;
584 dev->ether_addr_error = 0;
587 dev->ifindex = change->ifi_index;
588 dev->cache_valid |= VALID_IFINDEX;
589 dev->get_ifindex_error = 0;
592 netdev_linux_changed(dev, change->ifi_flags, 0);
596 static struct netdev *
597 netdev_linux_alloc(void)
599 struct netdev_linux *netdev = xzalloc(sizeof *netdev);
604 netdev_linux_common_construct(struct netdev_linux *netdev)
606 netdev->change_seq = 1;
609 /* Creates system and internal devices. */
611 netdev_linux_construct(struct netdev *netdev_)
613 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
616 netdev_linux_common_construct(netdev);
618 error = get_flags(&netdev->up, &netdev->ifi_flags);
619 if (error == ENODEV) {
620 if (netdev->up.netdev_class != &netdev_internal_class) {
621 /* The device does not exist, so don't allow it to be opened. */
624 /* "Internal" netdevs have to be created as netdev objects before
625 * they exist in the kernel, because creating them in the kernel
626 * happens by passing a netdev object to dpif_port_add().
627 * Therefore, ignore the error. */
634 /* For most types of netdevs we open the device for each call of
635 * netdev_open(). However, this is not the case with tap devices,
636 * since it is only possible to open the device once. In this
637 * situation we share a single file descriptor, and consequently
638 * buffers, across all readers. Therefore once data is read it will
639 * be unavailable to other reads for tap devices. */
641 netdev_linux_construct_tap(struct netdev *netdev_)
643 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
644 static const char tap_dev[] = "/dev/net/tun";
645 const char *name = netdev_->name;
649 netdev_linux_common_construct(netdev);
651 /* Open tap device. */
652 netdev->tap_fd = open(tap_dev, O_RDWR);
653 if (netdev->tap_fd < 0) {
655 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, ovs_strerror(error));
659 /* Create tap device. */
660 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
661 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
662 if (ioctl(netdev->tap_fd, TUNSETIFF, &ifr) == -1) {
663 VLOG_WARN("%s: creating tap device failed: %s", name,
664 ovs_strerror(errno));
669 /* Make non-blocking. */
670 error = set_nonblocking(netdev->tap_fd);
678 close(netdev->tap_fd);
683 netdev_linux_destruct(struct netdev *netdev_)
685 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
687 if (netdev->tc && netdev->tc->ops->tc_destroy) {
688 netdev->tc->ops->tc_destroy(netdev->tc);
691 if (netdev_get_class(netdev_) == &netdev_tap_class
692 && netdev->tap_fd >= 0)
694 close(netdev->tap_fd);
699 netdev_linux_dealloc(struct netdev *netdev_)
701 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
705 static struct netdev_rx *
706 netdev_linux_rx_alloc(void)
708 struct netdev_rx_linux *rx = xzalloc(sizeof *rx);
713 netdev_linux_rx_construct(struct netdev_rx *rx_)
715 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
716 struct netdev *netdev_ = rx->up.netdev;
717 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
720 rx->is_tap = is_tap_netdev(netdev_);
722 rx->fd = netdev->tap_fd;
724 struct sockaddr_ll sll;
726 /* Result of tcpdump -dd inbound */
727 static const struct sock_filter filt[] = {
728 { 0x28, 0, 0, 0xfffff004 }, /* ldh [0] */
729 { 0x15, 0, 1, 0x00000004 }, /* jeq #4 jt 2 jf 3 */
730 { 0x6, 0, 0, 0x00000000 }, /* ret #0 */
731 { 0x6, 0, 0, 0x0000ffff } /* ret #65535 */
733 static const struct sock_fprog fprog = {
734 ARRAY_SIZE(filt), (struct sock_filter *) filt
737 /* Create file descriptor. */
738 rx->fd = socket(PF_PACKET, SOCK_RAW, 0);
741 VLOG_ERR("failed to create raw socket (%s)", ovs_strerror(error));
745 /* Set non-blocking mode. */
746 error = set_nonblocking(rx->fd);
751 /* Get ethernet device index. */
752 error = get_ifindex(&netdev->up, &ifindex);
757 /* Bind to specific ethernet device. */
758 memset(&sll, 0, sizeof sll);
759 sll.sll_family = AF_PACKET;
760 sll.sll_ifindex = ifindex;
761 sll.sll_protocol = (OVS_FORCE unsigned short int) htons(ETH_P_ALL);
762 if (bind(rx->fd, (struct sockaddr *) &sll, sizeof sll) < 0) {
764 VLOG_ERR("%s: failed to bind raw socket (%s)",
765 netdev_get_name(netdev_), ovs_strerror(error));
769 /* Filter for only inbound packets. */
770 error = setsockopt(rx->fd, SOL_SOCKET, SO_ATTACH_FILTER, &fprog,
774 VLOG_ERR("%s: failed to attach filter (%s)",
775 netdev_get_name(netdev_), ovs_strerror(error));
790 netdev_linux_rx_destruct(struct netdev_rx *rx_)
792 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
800 netdev_linux_rx_dealloc(struct netdev_rx *rx_)
802 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
808 netdev_linux_rx_recv(struct netdev_rx *rx_, void *data, size_t size)
810 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
815 ? read(rx->fd, data, size)
816 : recv(rx->fd, data, size, MSG_TRUNC));
817 } while (retval < 0 && errno == EINTR);
820 return retval > size ? -EMSGSIZE : retval;
822 if (errno != EAGAIN) {
823 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
824 ovs_strerror(errno), netdev_rx_get_name(rx_));
831 netdev_linux_rx_wait(struct netdev_rx *rx_)
833 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
834 poll_fd_wait(rx->fd, POLLIN);
838 netdev_linux_rx_drain(struct netdev_rx *rx_)
840 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
843 int error = af_inet_ifreq_ioctl(netdev_rx_get_name(rx_), &ifr,
844 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
848 drain_fd(rx->fd, ifr.ifr_qlen);
851 return drain_rcvbuf(rx->fd);
855 /* Sends 'buffer' on 'netdev'. Returns 0 if successful, otherwise a positive
856 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
857 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
858 * the packet is too big or too small to transmit on the device.
860 * The caller retains ownership of 'buffer' in all cases.
862 * The kernel maintains a packet transmission queue, so the caller is not
863 * expected to do additional queuing of packets. */
865 netdev_linux_send(struct netdev *netdev_, const void *data, size_t size)
870 if (!is_tap_netdev(netdev_)) {
871 /* Use our AF_PACKET socket to send to this device. */
872 struct sockaddr_ll sll;
879 sock = af_packet_sock();
884 error = get_ifindex(netdev_, &ifindex);
889 /* We don't bother setting most fields in sockaddr_ll because the
890 * kernel ignores them for SOCK_RAW. */
891 memset(&sll, 0, sizeof sll);
892 sll.sll_family = AF_PACKET;
893 sll.sll_ifindex = ifindex;
895 iov.iov_base = CONST_CAST(void *, data);
899 msg.msg_namelen = sizeof sll;
902 msg.msg_control = NULL;
903 msg.msg_controllen = 0;
906 retval = sendmsg(sock, &msg, 0);
908 /* Use the tap fd to send to this device. This is essential for
909 * tap devices, because packets sent to a tap device with an
910 * AF_PACKET socket will loop back to be *received* again on the
911 * tap device. This doesn't occur on other interface types
912 * because we attach a socket filter to the rx socket. */
913 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
915 retval = write(netdev->tap_fd, data, size);
919 /* The Linux AF_PACKET implementation never blocks waiting for room
920 * for packets, instead returning ENOBUFS. Translate this into
921 * EAGAIN for the caller. */
922 if (errno == ENOBUFS) {
924 } else if (errno == EINTR) {
926 } else if (errno != EAGAIN) {
927 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
928 netdev_get_name(netdev_), ovs_strerror(errno));
931 } else if (retval != size) {
932 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%zd bytes of "
933 "%zu) on %s", retval, size, netdev_get_name(netdev_));
941 /* Registers with the poll loop to wake up from the next call to poll_block()
942 * when the packet transmission queue has sufficient room to transmit a packet
943 * with netdev_send().
945 * The kernel maintains a packet transmission queue, so the client is not
946 * expected to do additional queuing of packets. Thus, this function is
947 * unlikely to ever be used. It is included for completeness. */
949 netdev_linux_send_wait(struct netdev *netdev)
951 if (is_tap_netdev(netdev)) {
952 /* TAP device always accepts packets. */
953 poll_immediate_wake();
957 /* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
958 * otherwise a positive errno value. */
960 netdev_linux_set_etheraddr(struct netdev *netdev_,
961 const uint8_t mac[ETH_ADDR_LEN])
963 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
964 struct netdev_saved_flags *sf = NULL;
967 if (netdev->cache_valid & VALID_ETHERADDR) {
968 if (netdev->ether_addr_error) {
969 return netdev->ether_addr_error;
971 if (eth_addr_equals(netdev->etheraddr, mac)) {
974 netdev->cache_valid &= ~VALID_ETHERADDR;
977 /* Tap devices must be brought down before setting the address. */
978 if (is_tap_netdev(netdev_)) {
979 netdev_turn_flags_off(netdev_, NETDEV_UP, &sf);
981 error = set_etheraddr(netdev_get_name(netdev_), mac);
982 if (!error || error == ENODEV) {
983 netdev->ether_addr_error = error;
984 netdev->cache_valid |= VALID_ETHERADDR;
986 memcpy(netdev->etheraddr, mac, ETH_ADDR_LEN);
990 netdev_restore_flags(sf);
995 /* Copies 'netdev''s MAC address to 'mac' which is passed as param. */
997 netdev_linux_get_etheraddr(const struct netdev *netdev_,
998 uint8_t mac[ETH_ADDR_LEN])
1000 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1002 if (!(netdev->cache_valid & VALID_ETHERADDR)) {
1003 int error = get_etheraddr(netdev_get_name(netdev_),
1006 netdev->ether_addr_error = error;
1007 netdev->cache_valid |= VALID_ETHERADDR;
1010 if (!netdev->ether_addr_error) {
1011 memcpy(mac, netdev->etheraddr, ETH_ADDR_LEN);
1014 return netdev->ether_addr_error;
1017 /* Returns the maximum size of transmitted (and received) packets on 'netdev',
1018 * in bytes, not including the hardware header; thus, this is typically 1500
1019 * bytes for Ethernet devices. */
1021 netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
1023 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1024 if (!(netdev->cache_valid & VALID_MTU)) {
1028 error = af_inet_ifreq_ioctl(netdev_get_name(netdev_), &ifr,
1029 SIOCGIFMTU, "SIOCGIFMTU");
1031 netdev->netdev_mtu_error = error;
1032 netdev->mtu = ifr.ifr_mtu;
1033 netdev->cache_valid |= VALID_MTU;
1036 if (!netdev->netdev_mtu_error) {
1037 *mtup = netdev->mtu;
1039 return netdev->netdev_mtu_error;
1042 /* Sets the maximum transmission unit (MTU) of the given device using the Linux
1043 * networking ioctl interface. */
1046 netdev_linux_set_mtu(const struct netdev *netdev_, int mtu)
1048 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1052 if (netdev->cache_valid & VALID_MTU) {
1053 if (netdev->netdev_mtu_error) {
1054 return netdev->netdev_mtu_error;
1056 if (netdev->mtu == mtu) {
1059 netdev->cache_valid &= ~VALID_MTU;
1062 error = af_inet_ifreq_ioctl(netdev_get_name(netdev_), &ifr,
1063 SIOCSIFMTU, "SIOCSIFMTU");
1064 if (!error || error == ENODEV) {
1065 netdev->netdev_mtu_error = error;
1066 netdev->mtu = ifr.ifr_mtu;
1067 netdev->cache_valid |= VALID_MTU;
1072 /* Returns the ifindex of 'netdev', if successful, as a positive number.
1073 * On failure, returns a negative errno value. */
1075 netdev_linux_get_ifindex(const struct netdev *netdev)
1079 error = get_ifindex(netdev, &ifindex);
1080 return error ? -error : ifindex;
1084 netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
1086 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1088 if (netdev->miimon_interval > 0) {
1089 *carrier = netdev->miimon;
1091 *carrier = (netdev->ifi_flags & IFF_RUNNING) != 0;
1097 static long long int
1098 netdev_linux_get_carrier_resets(const struct netdev *netdev)
1100 return netdev_linux_cast(netdev)->carrier_resets;
1104 netdev_linux_do_miimon(const char *name, int cmd, const char *cmd_name,
1105 struct mii_ioctl_data *data)
1110 memset(&ifr, 0, sizeof ifr);
1111 memcpy(&ifr.ifr_data, data, sizeof *data);
1112 error = af_inet_ifreq_ioctl(name, &ifr, cmd, cmd_name);
1113 memcpy(data, &ifr.ifr_data, sizeof *data);
1119 netdev_linux_get_miimon(const char *name, bool *miimon)
1121 struct mii_ioctl_data data;
1126 memset(&data, 0, sizeof data);
1127 error = netdev_linux_do_miimon(name, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
1129 /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
1130 data.reg_num = MII_BMSR;
1131 error = netdev_linux_do_miimon(name, SIOCGMIIREG, "SIOCGMIIREG",
1135 *miimon = !!(data.val_out & BMSR_LSTATUS);
1137 VLOG_WARN_RL(&rl, "%s: failed to query MII", name);
1140 struct ethtool_cmd ecmd;
1142 VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
1145 COVERAGE_INC(netdev_get_ethtool);
1146 memset(&ecmd, 0, sizeof ecmd);
1147 error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
1150 struct ethtool_value eval;
1152 memcpy(&eval, &ecmd, sizeof eval);
1153 *miimon = !!eval.data;
1155 VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
1163 netdev_linux_set_miimon_interval(struct netdev *netdev_,
1164 long long int interval)
1166 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1168 interval = interval > 0 ? MAX(interval, 100) : 0;
1169 if (netdev->miimon_interval != interval) {
1170 netdev->miimon_interval = interval;
1171 timer_set_expired(&netdev->miimon_timer);
/* Polls link status for every netdev_linux_class device whose miimon interval
 * is positive and whose miimon timer has expired.  A change in status is
 * stored in dev->miimon and reported through netdev_linux_changed(); the
 * timer is then re-armed with the device's interval. */
1178 netdev_linux_miimon_run(void)
1180 struct shash device_shash;
1181 struct shash_node *node;
1183 shash_init(&device_shash);
1184 netdev_get_devices(&netdev_linux_class, &device_shash);
1185 SHASH_FOR_EACH (node, &device_shash) {
1186 struct netdev *netdev = node->data;
1187 struct netdev_linux *dev = netdev_linux_cast(netdev);
/* Devices with polling disabled, or not yet due, are skipped. */
1190 if (dev->miimon_interval <= 0 || !timer_expired(&dev->miimon_timer)) {
/* Every device handed out by netdev_get_devices() is closed again, both on
 * this skip path and at the bottom of the loop.  NOTE(review): presumably
 * netdev_get_devices() takes a reference on each device -- confirm. */
1191 netdev_close(netdev);
1195 netdev_linux_get_miimon(dev->up.name, &miimon);
1196 if (miimon != dev->miimon) {
1197 dev->miimon = miimon;
1198 netdev_linux_changed(dev, dev->ifi_flags, 0);
1201 timer_set_duration(&dev->miimon_timer, dev->miimon_interval);
1202 netdev_close(netdev);
1205 shash_destroy(&device_shash);
/* Calls timer_wait() on the miimon timer of every netdev_linux_class device
 * whose miimon interval is positive.  Each device obtained from
 * netdev_get_devices() is closed again before the loop moves on. */
1209 netdev_linux_miimon_wait(void)
1211 struct shash device_shash;
1212 struct shash_node *node;
1214 shash_init(&device_shash);
1215 netdev_get_devices(&netdev_linux_class, &device_shash);
1216 SHASH_FOR_EACH (node, &device_shash) {
1217 struct netdev *netdev = node->data;
1218 struct netdev_linux *dev = netdev_linux_cast(netdev);
1220 if (dev->miimon_interval > 0) {
1221 timer_wait(&dev->miimon_timer);
1223 netdev_close(netdev);
1225 shash_destroy(&device_shash);
1228 /* Check whether we can use RTM_GETLINK to get network device statistics.
1229 * In pre-2.6.19 kernels, this was only available if wireless extensions were
1232 check_for_working_netlink_stats(void)
1234 /* Decide on the netdev_get_stats() implementation to use. Netlink is
1235 * preferable, so if that works, we'll use it. */
/* The loopback device "lo" serves as the probe target. */
1236 int ifindex = do_get_ifindex("lo");
1238 VLOG_WARN("failed to get ifindex for lo, "
1239 "obtaining netdev stats from proc");
1242 struct netdev_stats stats;
/* 'stats' is only a scratch buffer here; what matters is whether the
 * request itself succeeds. */
1243 int error = get_stats_via_netlink(ifindex, &stats);
1245 VLOG_DBG("obtaining netdev stats via rtnetlink");
1248 VLOG_INFO("RTM_GETLINK failed (%s), obtaining netdev stats "
1249 "via proc (you are probably running a pre-2.6.19 "
1250 "kernel)", ovs_strerror(error));
1257 swap_uint64(uint64_t *a, uint64_t *b)
1264 /* Copies 'src' into 'dst', performing format conversion in the process.
1266 * 'src' is allowed to be misaligned. */
1268 netdev_stats_from_ovs_vport_stats(struct netdev_stats *dst,
1269 const struct ovs_vport_stats *src)
1271 dst->rx_packets = get_unaligned_u64(&src->rx_packets);
1272 dst->tx_packets = get_unaligned_u64(&src->tx_packets);
1273 dst->rx_bytes = get_unaligned_u64(&src->rx_bytes);
1274 dst->tx_bytes = get_unaligned_u64(&src->tx_bytes);
1275 dst->rx_errors = get_unaligned_u64(&src->rx_errors);
1276 dst->tx_errors = get_unaligned_u64(&src->tx_errors);
1277 dst->rx_dropped = get_unaligned_u64(&src->rx_dropped);
1278 dst->tx_dropped = get_unaligned_u64(&src->tx_dropped);
1280 dst->collisions = 0;
1281 dst->rx_length_errors = 0;
1282 dst->rx_over_errors = 0;
1283 dst->rx_crc_errors = 0;
1284 dst->rx_frame_errors = 0;
1285 dst->rx_fifo_errors = 0;
1286 dst->rx_missed_errors = 0;
1287 dst->tx_aborted_errors = 0;
1288 dst->tx_carrier_errors = 0;
1289 dst->tx_fifo_errors = 0;
1290 dst->tx_heartbeat_errors = 0;
1291 dst->tx_window_errors = 0;
1295 get_stats_via_vport__(const struct netdev *netdev, struct netdev_stats *stats)
1297 struct dpif_linux_vport reply;
1301 error = dpif_linux_vport_get(netdev_get_name(netdev), &reply, &buf);
1304 } else if (!reply.stats) {
1309 netdev_stats_from_ovs_vport_stats(stats, reply.stats);
/* Fetches stats for 'netdev_' from the vport layer into '*stats' and caches
 * the outcome in netdev->vport_stats_error.  The query is skipped when a
 * failure is already cached, that is, when VALID_VPORT_STAT_ERROR is set and
 * vport_stats_error is nonzero. */
1317 get_stats_via_vport(const struct netdev *netdev_,
1318 struct netdev_stats *stats)
1320 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1322 if (!netdev->vport_stats_error ||
1323 !(netdev->cache_valid & VALID_VPORT_STAT_ERROR)) {
1326 error = get_stats_via_vport__(netdev_, stats);
/* ENOENT is not worth a warning; every other failure is logged. */
1327 if (error && error != ENOENT) {
1328 VLOG_WARN_RL(&rl, "%s: obtaining netdev stats via vport failed "
1330 netdev_get_name(netdev_), ovs_strerror(error));
1332 netdev->vport_stats_error = error;
1333 netdev->cache_valid |= VALID_VPORT_STAT_ERROR;
/* Reads stats for 'netdev_' from the kernel into '*stats': through
 * get_stats_via_netlink() when check_for_working_netlink_stats() reported
 * that to be usable, through get_stats_via_proc() otherwise. */
1338 netdev_linux_sys_get_stats(const struct netdev *netdev_,
1339 struct netdev_stats *stats)
1341 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
1342 static int use_netlink_stats;
/* The backend is chosen on the first call only and remembered in the
 * function-local static above. */
1345 if (ovsthread_once_start(&once)) {
1346 use_netlink_stats = check_for_working_netlink_stats();
1347 ovsthread_once_done(&once);
1350 if (use_netlink_stats) {
1353 error = get_ifindex(netdev_, &ifindex);
1355 error = get_stats_via_netlink(ifindex, stats);
1358 error = get_stats_via_proc(netdev_get_name(netdev_), stats);
1362 VLOG_WARN_RL(&rl, "%s: linux-sys get stats failed %d",
1363 netdev_get_name(netdev_), error);
1369 /* Retrieves current device stats for 'netdev-linux'. */
1371 netdev_linux_get_stats(const struct netdev *netdev_,
1372 struct netdev_stats *stats)
1374 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1375 struct netdev_stats dev_stats;
1378 get_stats_via_vport(netdev_, stats);
1380 error = netdev_linux_sys_get_stats(netdev_, &dev_stats);
1383 if (netdev->vport_stats_error) {
1390 if (netdev->vport_stats_error) {
1391 /* stats not available from OVS then use ioctl stats. */
1394 stats->rx_errors += dev_stats.rx_errors;
1395 stats->tx_errors += dev_stats.tx_errors;
1396 stats->rx_dropped += dev_stats.rx_dropped;
1397 stats->tx_dropped += dev_stats.tx_dropped;
1398 stats->multicast += dev_stats.multicast;
1399 stats->collisions += dev_stats.collisions;
1400 stats->rx_length_errors += dev_stats.rx_length_errors;
1401 stats->rx_over_errors += dev_stats.rx_over_errors;
1402 stats->rx_crc_errors += dev_stats.rx_crc_errors;
1403 stats->rx_frame_errors += dev_stats.rx_frame_errors;
1404 stats->rx_fifo_errors += dev_stats.rx_fifo_errors;
1405 stats->rx_missed_errors += dev_stats.rx_missed_errors;
1406 stats->tx_aborted_errors += dev_stats.tx_aborted_errors;
1407 stats->tx_carrier_errors += dev_stats.tx_carrier_errors;
1408 stats->tx_fifo_errors += dev_stats.tx_fifo_errors;
1409 stats->tx_heartbeat_errors += dev_stats.tx_heartbeat_errors;
1410 stats->tx_window_errors += dev_stats.tx_window_errors;
1415 /* Retrieves current device stats for 'netdev-tap' netdev or
1416 * netdev-internal. */
1418 netdev_tap_get_stats(const struct netdev *netdev_, struct netdev_stats *stats)
1420 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1421 struct netdev_stats dev_stats;
1424 get_stats_via_vport(netdev_, stats);
1426 error = netdev_linux_sys_get_stats(netdev_, &dev_stats);
1428 if (netdev->vport_stats_error) {
1435 /* If this port is an internal port then the transmit and receive stats
1436 * will appear to be swapped relative to the other ports since we are the
1437 * one sending the data, not a remote computer. For consistency, we swap
1438 * them back here. This does not apply if we are getting stats from the
1439 * vport layer because it always tracks stats from the perspective of the
1441 if (netdev->vport_stats_error) {
1443 swap_uint64(&stats->rx_packets, &stats->tx_packets);
1444 swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
1445 swap_uint64(&stats->rx_errors, &stats->tx_errors);
1446 swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
1447 stats->rx_length_errors = 0;
1448 stats->rx_over_errors = 0;
1449 stats->rx_crc_errors = 0;
1450 stats->rx_frame_errors = 0;
1451 stats->rx_fifo_errors = 0;
1452 stats->rx_missed_errors = 0;
1453 stats->tx_aborted_errors = 0;
1454 stats->tx_carrier_errors = 0;
1455 stats->tx_fifo_errors = 0;
1456 stats->tx_heartbeat_errors = 0;
1457 stats->tx_window_errors = 0;
1459 stats->rx_dropped += dev_stats.tx_dropped;
1460 stats->tx_dropped += dev_stats.rx_dropped;
1462 stats->rx_errors += dev_stats.tx_errors;
1463 stats->tx_errors += dev_stats.rx_errors;
1465 stats->multicast += dev_stats.multicast;
1466 stats->collisions += dev_stats.collisions;
/* Retrieves stats for 'netdev_' from the vport layer only.  The return value
 * is whatever get_stats_via_vport() left in netdev->vport_stats_error. */
1472 netdev_internal_get_stats(const struct netdev *netdev_,
1473 struct netdev_stats *stats)
1475 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1477 get_stats_via_vport(netdev_, stats);
1478 return netdev->vport_stats_error;
/* Pushes the eight packet/byte/error/drop counters in '*stats' to the vport
 * named after 'netdev' with an OVS_VPORT_CMD_SET request.  The remaining
 * fields of '*stats' are not sent. */
1482 netdev_internal_set_stats(struct netdev *netdev,
1483 const struct netdev_stats *stats)
1485 struct ovs_vport_stats vport_stats;
1486 struct dpif_linux_vport vport;
1489 vport_stats.rx_packets = stats->rx_packets;
1490 vport_stats.tx_packets = stats->tx_packets;
1491 vport_stats.rx_bytes = stats->rx_bytes;
1492 vport_stats.tx_bytes = stats->tx_bytes;
1493 vport_stats.rx_errors = stats->rx_errors;
1494 vport_stats.tx_errors = stats->tx_errors;
1495 vport_stats.rx_dropped = stats->rx_dropped;
1496 vport_stats.tx_dropped = stats->tx_dropped;
1498 dpif_linux_vport_init(&vport);
1499 vport.cmd = OVS_VPORT_CMD_SET;
1500 vport.name = netdev_get_name(netdev);
1501 vport.stats = &vport_stats;
/* No reply is requested from the transaction. */
1503 err = dpif_linux_vport_transact(&vport, NULL, NULL);
1505 /* If the vport layer doesn't know about the device, that doesn't mean it
1506 * doesn't exist (after all, we were able to open it when netdev_open() was
1507 * called), it just means that it isn't attached and we'll be getting
1508 * stats a different way. */
1509 if (err == ENODEV) {
/* Queries ETHTOOL_GSET for 'netdev' and translates the result into
 * NETDEV_F_* bits stored in netdev->supported, netdev->advertised and
 * netdev->current.  Nothing is queried when VALID_FEATURES is already set.
 * The ethtool result code ends up in netdev->get_features_error. */
1517 netdev_linux_read_features(struct netdev_linux *netdev)
1519 struct ethtool_cmd ecmd;
1523 if (netdev->cache_valid & VALID_FEATURES) {
1527 COVERAGE_INC(netdev_get_ethtool);
1528 memset(&ecmd, 0, sizeof ecmd);
1529 error = netdev_linux_do_ethtool(netdev->up.name, &ecmd,
1530 ETHTOOL_GSET, "ETHTOOL_GSET");
1535 /* Supported features. */
1536 netdev->supported = 0;
1537 if (ecmd.supported & SUPPORTED_10baseT_Half) {
1538 netdev->supported |= NETDEV_F_10MB_HD;
1540 if (ecmd.supported & SUPPORTED_10baseT_Full) {
1541 netdev->supported |= NETDEV_F_10MB_FD;
1543 if (ecmd.supported & SUPPORTED_100baseT_Half) {
1544 netdev->supported |= NETDEV_F_100MB_HD;
1546 if (ecmd.supported & SUPPORTED_100baseT_Full) {
1547 netdev->supported |= NETDEV_F_100MB_FD;
1549 if (ecmd.supported & SUPPORTED_1000baseT_Half) {
1550 netdev->supported |= NETDEV_F_1GB_HD;
1552 if (ecmd.supported & SUPPORTED_1000baseT_Full) {
1553 netdev->supported |= NETDEV_F_1GB_FD;
1555 if (ecmd.supported & SUPPORTED_10000baseT_Full) {
1556 netdev->supported |= NETDEV_F_10GB_FD;
1558 if (ecmd.supported & SUPPORTED_TP) {
1559 netdev->supported |= NETDEV_F_COPPER;
1561 if (ecmd.supported & SUPPORTED_FIBRE) {
1562 netdev->supported |= NETDEV_F_FIBER;
1564 if (ecmd.supported & SUPPORTED_Autoneg) {
1565 netdev->supported |= NETDEV_F_AUTONEG;
1567 if (ecmd.supported & SUPPORTED_Pause) {
1568 netdev->supported |= NETDEV_F_PAUSE;
1570 if (ecmd.supported & SUPPORTED_Asym_Pause) {
1571 netdev->supported |= NETDEV_F_PAUSE_ASYM;
1574 /* Advertised features. */
1575 netdev->advertised = 0;
1576 if (ecmd.advertising & ADVERTISED_10baseT_Half) {
1577 netdev->advertised |= NETDEV_F_10MB_HD;
1579 if (ecmd.advertising & ADVERTISED_10baseT_Full) {
1580 netdev->advertised |= NETDEV_F_10MB_FD;
1582 if (ecmd.advertising & ADVERTISED_100baseT_Half) {
1583 netdev->advertised |= NETDEV_F_100MB_HD;
1585 if (ecmd.advertising & ADVERTISED_100baseT_Full) {
1586 netdev->advertised |= NETDEV_F_100MB_FD;
1588 if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
1589 netdev->advertised |= NETDEV_F_1GB_HD;
1591 if (ecmd.advertising & ADVERTISED_1000baseT_Full) {
1592 netdev->advertised |= NETDEV_F_1GB_FD;
1594 if (ecmd.advertising & ADVERTISED_10000baseT_Full) {
1595 netdev->advertised |= NETDEV_F_10GB_FD;
1597 if (ecmd.advertising & ADVERTISED_TP) {
1598 netdev->advertised |= NETDEV_F_COPPER;
1600 if (ecmd.advertising & ADVERTISED_FIBRE) {
1601 netdev->advertised |= NETDEV_F_FIBER;
1603 if (ecmd.advertising & ADVERTISED_Autoneg) {
1604 netdev->advertised |= NETDEV_F_AUTONEG;
1606 if (ecmd.advertising & ADVERTISED_Pause) {
1607 netdev->advertised |= NETDEV_F_PAUSE;
1609 if (ecmd.advertising & ADVERTISED_Asym_Pause) {
1610 netdev->advertised |= NETDEV_F_PAUSE_ASYM;
1613 /* Current settings. */
/* Up to 1G, ecmd.duplex selects between the full- and half-duplex bit. */
1615 if (speed == SPEED_10) {
1616 netdev->current = ecmd.duplex ? NETDEV_F_10MB_FD : NETDEV_F_10MB_HD;
1617 } else if (speed == SPEED_100) {
1618 netdev->current = ecmd.duplex ? NETDEV_F_100MB_FD : NETDEV_F_100MB_HD;
1619 } else if (speed == SPEED_1000) {
1620 netdev->current = ecmd.duplex ? NETDEV_F_1GB_FD : NETDEV_F_1GB_HD;
/* From 10G upward duplex is not consulted: only full-duplex bits are used.
 * The speeds above 10G are matched against literal numbers, not SPEED_*
 * macros. */
1621 } else if (speed == SPEED_10000) {
1622 netdev->current = NETDEV_F_10GB_FD;
1623 } else if (speed == 40000) {
1624 netdev->current = NETDEV_F_40GB_FD;
1625 } else if (speed == 100000) {
1626 netdev->current = NETDEV_F_100GB_FD;
1627 } else if (speed == 1000000) {
1628 netdev->current = NETDEV_F_1TB_FD;
1630 netdev->current = 0;
1633 if (ecmd.port == PORT_TP) {
1634 netdev->current |= NETDEV_F_COPPER;
1635 } else if (ecmd.port == PORT_FIBRE) {
1636 netdev->current |= NETDEV_F_FIBER;
1640 netdev->current |= NETDEV_F_AUTONEG;
/* The result code is cached together with the features.  NOTE(review):
 * presumably this is reached on the failure path too, so that a failed query
 * is not repeated -- confirm. */
1644 netdev->cache_valid |= VALID_FEATURES;
1645 netdev->get_features_error = error;
1648 /* Stores the features supported by 'netdev' into each of '*current', '*advertised',
1649 * '*supported', and '*peer'. Each value is a bitmap of NETDEV_* bits.
1650 * Returns 0 if successful, otherwise a positive errno value. */
1652 netdev_linux_get_features(const struct netdev *netdev_,
1653 enum netdev_features *current,
1654 enum netdev_features *advertised,
1655 enum netdev_features *supported,
1656 enum netdev_features *peer)
1658 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1660 netdev_linux_read_features(netdev);
1662 if (!netdev->get_features_error) {
1663 *current = netdev->current;
1664 *advertised = netdev->advertised;
1665 *supported = netdev->supported;
1666 *peer = 0; /* XXX */
1668 return netdev->get_features_error;
1671 /* Set the features advertised by 'netdev' to 'advertise'. */
1673 netdev_linux_set_advertisements(struct netdev *netdev,
1674 enum netdev_features advertise)
1676 struct ethtool_cmd ecmd;
1679 COVERAGE_INC(netdev_get_ethtool);
1680 memset(&ecmd, 0, sizeof ecmd);
1681 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1682 ETHTOOL_GSET, "ETHTOOL_GSET");
1687 ecmd.advertising = 0;
1688 if (advertise & NETDEV_F_10MB_HD) {
1689 ecmd.advertising |= ADVERTISED_10baseT_Half;
1691 if (advertise & NETDEV_F_10MB_FD) {
1692 ecmd.advertising |= ADVERTISED_10baseT_Full;
1694 if (advertise & NETDEV_F_100MB_HD) {
1695 ecmd.advertising |= ADVERTISED_100baseT_Half;
1697 if (advertise & NETDEV_F_100MB_FD) {
1698 ecmd.advertising |= ADVERTISED_100baseT_Full;
1700 if (advertise & NETDEV_F_1GB_HD) {
1701 ecmd.advertising |= ADVERTISED_1000baseT_Half;
1703 if (advertise & NETDEV_F_1GB_FD) {
1704 ecmd.advertising |= ADVERTISED_1000baseT_Full;
1706 if (advertise & NETDEV_F_10GB_FD) {
1707 ecmd.advertising |= ADVERTISED_10000baseT_Full;
1709 if (advertise & NETDEV_F_COPPER) {
1710 ecmd.advertising |= ADVERTISED_TP;
1712 if (advertise & NETDEV_F_FIBER) {
1713 ecmd.advertising |= ADVERTISED_FIBRE;
1715 if (advertise & NETDEV_F_AUTONEG) {
1716 ecmd.advertising |= ADVERTISED_Autoneg;
1718 if (advertise & NETDEV_F_PAUSE) {
1719 ecmd.advertising |= ADVERTISED_Pause;
1721 if (advertise & NETDEV_F_PAUSE_ASYM) {
1722 ecmd.advertising |= ADVERTISED_Asym_Pause;
1724 COVERAGE_INC(netdev_set_ethtool);
1725 return netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1726 ETHTOOL_SSET, "ETHTOOL_SSET");
1729 /* Attempts to set input rate limiting (policing) policy. Returns 0 if
1730 * successful, otherwise a positive errno value. */
1732 netdev_linux_set_policing(struct netdev *netdev_,
1733 uint32_t kbits_rate, uint32_t kbits_burst)
1735 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1736 const char *netdev_name = netdev_get_name(netdev_);
1740 kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
1741 : !kbits_burst ? 1000 /* Default to 1000 kbits if 0. */
1742 : kbits_burst); /* Stick with user-specified value. */
1744 if (netdev->cache_valid & VALID_POLICING) {
/* An error cached by an earlier attempt is returned as-is, without touching
 * the kernel again. */
1745 if (netdev->netdev_policing_error) {
1746 return netdev->netdev_policing_error;
1749 if (netdev->kbits_rate == kbits_rate &&
1750 netdev->kbits_burst == kbits_burst) {
1751 /* Assume that settings haven't changed since we last set them. */
1754 netdev->cache_valid &= ~VALID_POLICING;
1757 COVERAGE_INC(netdev_set_policing);
1758 /* Remove any existing ingress qdisc. */
1759 error = tc_add_del_ingress_qdisc(netdev_, false);
1761 VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
1762 netdev_name, ovs_strerror(error));
/* Re-create the ingress qdisc and attach the policer to it. */
1767 error = tc_add_del_ingress_qdisc(netdev_, true);
1769 VLOG_WARN_RL(&rl, "%s: adding policing qdisc failed: %s",
1770 netdev_name, ovs_strerror(error));
1774 error = tc_add_policer(netdev_, kbits_rate, kbits_burst);
1776 VLOG_WARN_RL(&rl, "%s: adding policing action failed: %s",
1777 netdev_name, ovs_strerror(error));
1782 netdev->kbits_rate = kbits_rate;
1783 netdev->kbits_burst = kbits_burst;
/* Only success and ENODEV are cached; any other error is not remembered. */
1786 if (!error || error == ENODEV) {
1787 netdev->netdev_policing_error = error;
1788 netdev->cache_valid |= VALID_POLICING;
1794 netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
1797 const struct tc_ops *const *opsp;
1799 for (opsp = tcs; *opsp != NULL; opsp++) {
1800 const struct tc_ops *ops = *opsp;
1801 if (ops->tc_install && ops->ovs_name[0] != '\0') {
1802 sset_add(types, ops->ovs_name);
1808 static const struct tc_ops *
1809 tc_lookup_ovs_name(const char *name)
1811 const struct tc_ops *const *opsp;
1813 for (opsp = tcs; *opsp != NULL; opsp++) {
1814 const struct tc_ops *ops = *opsp;
1815 if (!strcmp(name, ops->ovs_name)) {
1822 static const struct tc_ops *
1823 tc_lookup_linux_name(const char *name)
1825 const struct tc_ops *const *opsp;
1827 for (opsp = tcs; *opsp != NULL; opsp++) {
1828 const struct tc_ops *ops = *opsp;
1829 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
1836 static struct tc_queue *
1837 tc_find_queue__(const struct netdev *netdev_, unsigned int queue_id,
1840 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1841 struct tc_queue *queue;
1843 HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev->tc->queues) {
1844 if (queue->queue_id == queue_id) {
1851 static struct tc_queue *
1852 tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
1854 return tc_find_queue__(netdev, queue_id, hash_int(queue_id, 0));
1858 netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
1860 struct netdev_qos_capabilities *caps)
1862 const struct tc_ops *ops = tc_lookup_ovs_name(type);
1866 caps->n_queues = ops->n_queues;
1871 netdev_linux_get_qos(const struct netdev *netdev_,
1872 const char **typep, struct smap *details)
1874 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1877 error = tc_query_qdisc(netdev_);
1882 *typep = netdev->tc->ops->ovs_name;
1883 return (netdev->tc->ops->qdisc_get
1884 ? netdev->tc->ops->qdisc_get(netdev_, details)
/* Switches 'netdev_' to the QoS type named 'type', configured by 'details'.
 * If the device already uses that type, only the type's qdisc_set hook (when
 * it has one) is invoked.  Otherwise the existing qdisc is deleted and the
 * new type's tc_install hook is run. */
1889 netdev_linux_set_qos(struct netdev *netdev_,
1890 const char *type, const struct smap *details)
1892 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1893 const struct tc_ops *new_ops;
1896 new_ops = tc_lookup_ovs_name(type);
/* Unknown types and types that cannot be installed are both rejected. */
1897 if (!new_ops || !new_ops->tc_install) {
1901 error = tc_query_qdisc(netdev_);
1906 if (new_ops == netdev->tc->ops) {
1907 return new_ops->qdisc_set ? new_ops->qdisc_set(netdev_, details) : 0;
1909 /* Delete existing qdisc. */
1910 error = tc_del_qdisc(netdev_);
1914 ovs_assert(netdev->tc == NULL);
1916 /* Install new qdisc. */
1917 error = new_ops->tc_install(netdev_, details);
/* tc_install must leave netdev->tc set exactly when it succeeds. */
1918 ovs_assert((error == 0) == (netdev->tc != NULL));
1925 netdev_linux_get_queue(const struct netdev *netdev_,
1926 unsigned int queue_id, struct smap *details)
1928 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1931 error = tc_query_qdisc(netdev_);
1935 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
1937 ? netdev->tc->ops->class_get(netdev_, queue, details)
1943 netdev_linux_set_queue(struct netdev *netdev_,
1944 unsigned int queue_id, const struct smap *details)
1946 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1949 error = tc_query_qdisc(netdev_);
1952 } else if (queue_id >= netdev->tc->ops->n_queues
1953 || !netdev->tc->ops->class_set) {
1957 return netdev->tc->ops->class_set(netdev_, queue_id, details);
1961 netdev_linux_delete_queue(struct netdev *netdev_, unsigned int queue_id)
1963 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1966 error = tc_query_qdisc(netdev_);
1969 } else if (!netdev->tc->ops->class_delete) {
1972 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
1974 ? netdev->tc->ops->class_delete(netdev_, queue)
1980 netdev_linux_get_queue_stats(const struct netdev *netdev_,
1981 unsigned int queue_id,
1982 struct netdev_queue_stats *stats)
1984 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1987 error = tc_query_qdisc(netdev_);
1990 } else if (!netdev->tc->ops->class_get_stats) {
1993 const struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
1997 stats->created = queue->created;
1998 return netdev->tc->ops->class_get_stats(netdev_, queue, stats);
2003 start_queue_dump(const struct netdev *netdev, struct nl_dump *dump)
2005 struct ofpbuf request;
2006 struct tcmsg *tcmsg;
2008 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
2012 tcmsg->tcm_parent = 0;
2013 nl_dump_start(dump, NETLINK_ROUTE, &request);
2014 ofpbuf_uninit(&request);
2019 netdev_linux_dump_queues(const struct netdev *netdev_,
2020 netdev_dump_queues_cb *cb, void *aux)
2022 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2023 struct tc_queue *queue, *next_queue;
2024 struct smap details;
2028 error = tc_query_qdisc(netdev_);
2031 } else if (!netdev->tc->ops->class_get) {
2036 smap_init(&details);
2037 HMAP_FOR_EACH_SAFE (queue, next_queue, hmap_node,
2038 &netdev->tc->queues) {
2039 smap_clear(&details);
2041 error = netdev->tc->ops->class_get(netdev_, queue, &details);
2043 (*cb)(queue->queue_id, &details, aux);
2048 smap_destroy(&details);
2054 netdev_linux_dump_queue_stats(const struct netdev *netdev_,
2055 netdev_dump_queue_stats_cb *cb, void *aux)
2057 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2058 struct nl_dump dump;
2063 error = tc_query_qdisc(netdev_);
2066 } else if (!netdev->tc->ops->class_dump_stats) {
2071 if (!start_queue_dump(netdev_, &dump)) {
2074 while (nl_dump_next(&dump, &msg)) {
2075 error = netdev->tc->ops->class_dump_stats(netdev_, &msg, cb, aux);
2081 error = nl_dump_done(&dump);
2082 return error ? error : last_error;
/* Stores the IPv4 address and netmask of 'netdev_' in '*address' and
 * '*netmask'.  The kernel is queried (SIOCGIFADDR, SIOCGIFNETMASK) only while
 * the VALID_IN4 cache bit is clear; afterwards the cached values are used.
 * The final return is EADDRNOTAVAIL when the address is INADDR_ANY and 0
 * otherwise. */
2086 netdev_linux_get_in4(const struct netdev *netdev_,
2087 struct in_addr *address, struct in_addr *netmask)
2089 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2091 if (!(netdev->cache_valid & VALID_IN4)) {
2094 error = netdev_linux_get_ipv4(netdev_, &netdev->address,
2095 SIOCGIFADDR, "SIOCGIFADDR");
2100 error = netdev_linux_get_ipv4(netdev_, &netdev->netmask,
2101 SIOCGIFNETMASK, "SIOCGIFNETMASK");
2106 netdev->cache_valid |= VALID_IN4;
2108 *address = netdev->address;
2109 *netmask = netdev->netmask;
2110 return address->s_addr == INADDR_ANY ? EADDRNOTAVAIL : 0;
/* Assigns 'address' to 'netdev_' with SIOCSIFADDR and records 'address' and
 * 'netmask' in the device's VALID_IN4 cache.  'netmask' is pushed to the
 * kernel with SIOCSIFNETMASK only when 'address' is not INADDR_ANY. */
2114 netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
2115 struct in_addr netmask)
2117 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2120 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
2122 netdev->cache_valid |= VALID_IN4;
2123 netdev->address = address;
2124 netdev->netmask = netmask;
/* No netmask is set for INADDR_ANY. */
2125 if (address.s_addr != INADDR_ANY) {
2126 error = do_set_addr(netdev_, SIOCSIFNETMASK,
2127 "SIOCSIFNETMASK", netmask);
/* Parses one text line in the layout that netdev_linux_get_in6() reads from
 * /proc/net/if_inet6: sixteen two-digit hex bytes, stored into 'in6', then
 * four hex fields that are skipped, then an interface name of at most 16
 * characters, stored into 'ifname'. */
2134 parse_if_inet6_line(const char *line,
2135 struct in6_addr *in6, char ifname[16 + 1])
2137 uint8_t *s6 = in6->s6_addr;
/* X8 is the conversion for one address byte: exactly two hex digits. */
2138 #define X8 "%2"SCNx8
2140 " "X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8
2141 "%*x %*x %*x %*x %16s\n",
2142 &s6[0], &s6[1], &s6[2], &s6[3],
2143 &s6[4], &s6[5], &s6[6], &s6[7],
2144 &s6[8], &s6[9], &s6[10], &s6[11],
2145 &s6[12], &s6[13], &s6[14], &s6[15],
2149 /* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address (if
2150 * 'in6' is non-null) and returns true. Otherwise, returns false. */
2152 netdev_linux_get_in6(const struct netdev *netdev_, struct in6_addr *in6)
2154 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2155 if (!(netdev->cache_valid & VALID_IN6)) {
2159 netdev->in6 = in6addr_any;
2161 file = fopen("/proc/net/if_inet6", "r");
2163 const char *name = netdev_get_name(netdev_);
2164 while (fgets(line, sizeof line, file)) {
2165 struct in6_addr in6_tmp;
2166 char ifname[16 + 1];
2167 if (parse_if_inet6_line(line, &in6_tmp, ifname)
2168 && !strcmp(name, ifname))
2170 netdev->in6 = in6_tmp;
2176 netdev->cache_valid |= VALID_IN6;
/* Fills '*sa' with an AF_INET socket address for 'addr'.  The address is
 * assembled in a local sockaddr_in and then copied byte-wise into the
 * zeroed '*sa'. */
2183 make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
2185 struct sockaddr_in sin;
2186 memset(&sin, 0, sizeof sin);
2187 sin.sin_family = AF_INET;
2188 sin.sin_addr = addr;
2191 memset(sa, 0, sizeof *sa);
2192 memcpy(sa, &sin, sizeof sin);
2196 do_set_addr(struct netdev *netdev,
2197 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
2201 make_in4_sockaddr(&ifr.ifr_addr, addr);
2202 return af_inet_ifreq_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
2206 /* Adds 'router' as a default IP gateway. */
2208 netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
2210 struct in_addr any = { INADDR_ANY };
2214 memset(&rt, 0, sizeof rt);
2215 make_in4_sockaddr(&rt.rt_dst, any);
2216 make_in4_sockaddr(&rt.rt_gateway, router);
2217 make_in4_sockaddr(&rt.rt_genmask, any);
2218 rt.rt_flags = RTF_UP | RTF_GATEWAY;
2219 error = af_inet_ioctl(SIOCADDRT, &rt);
2221 VLOG_WARN("ioctl(SIOCADDRT): %s", ovs_strerror(error));
/* Searches /proc/net/route for a route that is up and whose masked
 * destination matches 'host'.  On a match, '*next_hop' receives either 0
 * (host directly reachable) or the route's gateway, and '*netdev_name'
 * receives an xstrdup()'d copy of the interface name.  '*netdev_name' is set
 * to NULL before the search starts.
 *
 * NOTE(review): the copied name is presumably the caller's to free --
 * confirm. */
2227 netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
2230 static const char fn[] = "/proc/net/route";
2235 *netdev_name = NULL;
2236 stream = fopen(fn, "r");
2237 if (stream == NULL) {
2238 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, ovs_strerror(errno));
2243 while (fgets(line, sizeof line, stream)) {
2246 ovs_be32 dest, gateway, mask;
2247 int refcnt, metric, mtu;
2248 unsigned int flags, use, window, irtt;
/* A route line must yield all eleven fields to be considered. */
2251 "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
2253 iface, &dest, &gateway, &flags, &refcnt,
2254 &use, &metric, &mask, &mtu, &window, &irtt) != 11) {
2256 VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
2260 if (!(flags & RTF_UP)) {
2261 /* Skip routes that aren't up. */
2265 /* The output of 'dest', 'mask', and 'gateway' were given in
2266 * network byte order, so we don't need any endian
2267 * conversions here. */
2268 if ((dest & mask) == (host->s_addr & mask)) {
2270 /* The host is directly reachable. */
2271 next_hop->s_addr = 0;
2273 /* To reach the host, we must go through a gateway. */
2274 next_hop->s_addr = gateway;
2276 *netdev_name = xstrdup(iface);
/* Adds "driver_name", "driver_version" and "firmware_version" entries for
 * 'netdev_' to 'smap'.  The values come from netdev->drvinfo, which is filled
 * by an ETHTOOL_GDRVINFO query while the VALID_DRVINFO cache bit is clear. */
2288 netdev_linux_get_status(const struct netdev *netdev_, struct smap *smap)
2290 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2293 if (!(netdev->cache_valid & VALID_DRVINFO)) {
/* The drvinfo buffer is handed to netdev_linux_do_ethtool() through a
 * struct ethtool_cmd pointer, the type the other ethtool calls in this file
 * pass. */
2294 struct ethtool_cmd *cmd = (struct ethtool_cmd *) &netdev->drvinfo;
2296 COVERAGE_INC(netdev_get_ethtool);
2297 memset(&netdev->drvinfo, 0, sizeof netdev->drvinfo);
2298 error = netdev_linux_do_ethtool(netdev->up.name,
2301 "ETHTOOL_GDRVINFO");
2303 netdev->cache_valid |= VALID_DRVINFO;
2308 smap_add(smap, "driver_name", netdev->drvinfo.driver);
2309 smap_add(smap, "driver_version", netdev->drvinfo.version);
2310 smap_add(smap, "firmware_version", netdev->drvinfo.fw_version);
2316 netdev_internal_get_status(const struct netdev *netdev OVS_UNUSED,
2319 smap_add(smap, "driver_name", "openvswitch");
2323 /* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
2324 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
2325 * returns 0. Otherwise, it returns a positive errno value; in particular,
2326 * ENXIO indicates that there is no ARP table entry for 'ip' on 'netdev'. */
2328 netdev_linux_arp_lookup(const struct netdev *netdev,
2329 ovs_be32 ip, uint8_t mac[ETH_ADDR_LEN])
2332 struct sockaddr_in sin;
2335 memset(&r, 0, sizeof r);
2336 memset(&sin, 0, sizeof sin);
2337 sin.sin_family = AF_INET;
2338 sin.sin_addr.s_addr = ip;
2340 memcpy(&r.arp_pa, &sin, sizeof sin);
2341 r.arp_ha.sa_family = ARPHRD_ETHER;
2343 ovs_strzcpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
2344 COVERAGE_INC(netdev_arp_lookup);
2345 retval = af_inet_ioctl(SIOCGARP, &r);
2347 memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
2348 } else if (retval != ENXIO) {
2349 VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
2350 netdev_get_name(netdev), IP_ARGS(ip),
2351 ovs_strerror(retval));
2357 nd_to_iff_flags(enum netdev_flags nd)
2360 if (nd & NETDEV_UP) {
2363 if (nd & NETDEV_PROMISC) {
2370 iff_to_nd_flags(int iff)
2372 enum netdev_flags nd = 0;
2376 if (iff & IFF_PROMISC) {
2377 nd |= NETDEV_PROMISC;
/* Clears the 'off' flags and sets the 'on' flags on 'netdev_', storing the
 * previous flags, converted to netdev_flags, in '*old_flagsp'.  The kernel is
 * only touched when the resulting IFF_* mask differs from the cached
 * netdev->ifi_flags. */
2383 netdev_linux_update_flags(struct netdev *netdev_, enum netdev_flags off,
2384 enum netdev_flags on, enum netdev_flags *old_flagsp)
2386 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2387 int old_flags, new_flags;
2390 old_flags = netdev->ifi_flags;
2391 *old_flagsp = iff_to_nd_flags(old_flags);
2392 new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
2393 if (new_flags != old_flags) {
2394 error = set_flags(netdev_get_name(netdev_), new_flags);
/* The cached flags are re-read right after the attempt, whatever
 * set_flags() returned. */
2395 get_flags(netdev_, &netdev->ifi_flags);
2401 netdev_linux_change_seq(const struct netdev *netdev)
2403 return netdev_linux_cast(netdev)->change_seq;
2406 #define NETDEV_LINUX_CLASS(NAME, CONSTRUCT, GET_STATS, SET_STATS, \
2407 GET_FEATURES, GET_STATUS) \
2413 netdev_linux_wait, \
2415 netdev_linux_alloc, \
2417 netdev_linux_destruct, \
2418 netdev_linux_dealloc, \
2419 NULL, /* get_config */ \
2420 NULL, /* set_config */ \
2421 NULL, /* get_tunnel_config */ \
2423 netdev_linux_send, \
2424 netdev_linux_send_wait, \
2426 netdev_linux_set_etheraddr, \
2427 netdev_linux_get_etheraddr, \
2428 netdev_linux_get_mtu, \
2429 netdev_linux_set_mtu, \
2430 netdev_linux_get_ifindex, \
2431 netdev_linux_get_carrier, \
2432 netdev_linux_get_carrier_resets, \
2433 netdev_linux_set_miimon_interval, \
2438 netdev_linux_set_advertisements, \
2440 netdev_linux_set_policing, \
2441 netdev_linux_get_qos_types, \
2442 netdev_linux_get_qos_capabilities, \
2443 netdev_linux_get_qos, \
2444 netdev_linux_set_qos, \
2445 netdev_linux_get_queue, \
2446 netdev_linux_set_queue, \
2447 netdev_linux_delete_queue, \
2448 netdev_linux_get_queue_stats, \
2449 netdev_linux_dump_queues, \
2450 netdev_linux_dump_queue_stats, \
2452 netdev_linux_get_in4, \
2453 netdev_linux_set_in4, \
2454 netdev_linux_get_in6, \
2455 netdev_linux_add_router, \
2456 netdev_linux_get_next_hop, \
2458 netdev_linux_arp_lookup, \
2460 netdev_linux_update_flags, \
2462 netdev_linux_change_seq, \
2464 netdev_linux_rx_alloc, \
2465 netdev_linux_rx_construct, \
2466 netdev_linux_rx_destruct, \
2467 netdev_linux_rx_dealloc, \
2468 netdev_linux_rx_recv, \
2469 netdev_linux_rx_wait, \
2470 netdev_linux_rx_drain, \
2473 const struct netdev_class netdev_linux_class =
2476 netdev_linux_construct,
2477 netdev_linux_get_stats,
2478 NULL, /* set_stats */
2479 netdev_linux_get_features,
2480 netdev_linux_get_status);
2482 const struct netdev_class netdev_tap_class =
2485 netdev_linux_construct_tap,
2486 netdev_tap_get_stats,
2487 NULL, /* set_stats */
2488 netdev_linux_get_features,
2489 netdev_linux_get_status);
2491 const struct netdev_class netdev_internal_class =
2494 netdev_linux_construct,
2495 netdev_internal_get_stats,
2496 netdev_internal_set_stats,
2497 NULL, /* get_features */
2498 netdev_internal_get_status);
2500 /* HTB traffic control class. */
2502 #define HTB_N_QUEUES 0xf000
2506 unsigned int max_rate; /* In bytes/s. */
2510 struct tc_queue tc_queue;
2511 unsigned int min_rate; /* In bytes/s. */
2512 unsigned int max_rate; /* In bytes/s. */
2513 unsigned int burst; /* In bytes. */
2514 unsigned int priority; /* Lower values are higher priorities. */
2518 htb_get__(const struct netdev *netdev_)
2520 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2521 return CONTAINER_OF(netdev->tc, struct htb, tc);
/* Allocates a struct htb carrying 'max_rate', initializes its embedded tc
 * with tc_ops_htb, and installs it as netdev->tc. */
2525 htb_install__(struct netdev *netdev_, uint64_t max_rate)
2527 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2530 htb = xmalloc(sizeof *htb);
2531 tc_init(&htb->tc, &tc_ops_htb);
/* NOTE(review): 'max_rate' arrives as uint64_t, while the HTB structs above
 * declare their max_rate fields as unsigned int; if htb->max_rate is one of
 * those, values that do not fit are narrowed here -- confirm intended. */
2532 htb->max_rate = max_rate;
2534 netdev->tc = &htb->tc;
2537 /* Create an HTB qdisc.
2539 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
2541 htb_setup_qdisc__(struct netdev *netdev)
2544 struct tc_htb_glob opt;
2545 struct ofpbuf request;
2546 struct tcmsg *tcmsg;
/* Any existing qdisc is removed first; the result of the removal is not
 * checked. */
2548 tc_del_qdisc(netdev);
2550 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2551 NLM_F_EXCL | NLM_F_CREATE, &request);
/* Handle 1:0 attached at the root. */
2555 tcmsg->tcm_handle = tc_make_handle(1, 0);
2556 tcmsg->tcm_parent = TC_H_ROOT;
2558 nl_msg_put_string(&request, TCA_KIND, "htb");
2560 memset(&opt, 0, sizeof opt);
2561 opt.rate2quantum = 10;
/* The HTB global options travel as TCA_HTB_INIT nested inside TCA_OPTIONS. */
2565 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2566 nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
2567 nl_msg_end_nested(&request, opt_offset);
2569 return tc_transact(&request, NULL);
2572 /* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
2573 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
2575 htb_setup_class__(struct netdev *netdev, unsigned int handle,
2576 unsigned int parent, struct htb_class *class)
2579 struct tc_htb_opt opt;
2580 struct ofpbuf request;
2581 struct tcmsg *tcmsg;
2585 error = netdev_get_mtu(netdev, &mtu);
2587 VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU",
2588 netdev_get_name(netdev));
2592 memset(&opt, 0, sizeof opt);
2593 tc_fill_rate(&opt.rate, class->min_rate, mtu);
2594 tc_fill_rate(&opt.ceil, class->max_rate, mtu);
2595 opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
2596 opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
2597 opt.prio = class->priority;
2599 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
2603 tcmsg->tcm_handle = handle;
2604 tcmsg->tcm_parent = parent;
2606 nl_msg_put_string(&request, TCA_KIND, "htb");
2607 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2608 nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
2609 tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
2610 tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
2611 nl_msg_end_nested(&request, opt_offset);
2613 error = tc_transact(&request, NULL);
2615 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
2616 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
2617 netdev_get_name(netdev),
2618 tc_get_major(handle), tc_get_minor(handle),
2619 tc_get_major(parent), tc_get_minor(parent),
2620 class->min_rate, class->max_rate,
2621 class->burst, class->priority, ovs_strerror(error));
2626 /* Parses Netlink attributes in 'options' for HTB parameters and stores a
2627 * description of them into 'details'. The description complies with the
2628 * specification given in the vswitch database documentation for linux-htb
2631 htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
2633 static const struct nl_policy tca_htb_policy[] = {
2634 [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
2635 .min_len = sizeof(struct tc_htb_opt) },
2638 struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
2639 const struct tc_htb_opt *htb;
2641 if (!nl_parse_nested(nl_options, tca_htb_policy,
2642 attrs, ARRAY_SIZE(tca_htb_policy))) {
2643 VLOG_WARN_RL(&rl, "failed to parse HTB class options");
2647 htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
2648 class->min_rate = htb->rate.rate;
2649 class->max_rate = htb->ceil.rate;
2650 class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
2651 class->priority = htb->prio;
2656 htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2657 struct htb_class *options,
2658 struct netdev_queue_stats *stats)
2660 struct nlattr *nl_options;
2661 unsigned int handle;
2664 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2665 if (!error && queue_id) {
2666 unsigned int major = tc_get_major(handle);
2667 unsigned int minor = tc_get_minor(handle);
2668 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2669 *queue_id = minor - 1;
2674 if (!error && options) {
2675 error = htb_parse_tca_options__(nl_options, options);
2681 htb_parse_qdisc_details__(struct netdev *netdev,
2682 const struct smap *details, struct htb_class *hc)
2684 const char *max_rate_s;
2686 max_rate_s = smap_get(details, "max-rate");
2687 hc->max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
2688 if (!hc->max_rate) {
2689 enum netdev_features current;
2691 netdev_get_features(netdev, ¤t, NULL, NULL, NULL);
2692 hc->max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
2694 hc->min_rate = hc->max_rate;
2700 htb_parse_class_details__(struct netdev *netdev,
2701 const struct smap *details, struct htb_class *hc)
2703 const struct htb *htb = htb_get__(netdev);
2704 const char *min_rate_s = smap_get(details, "min-rate");
2705 const char *max_rate_s = smap_get(details, "max-rate");
2706 const char *burst_s = smap_get(details, "burst");
2707 const char *priority_s = smap_get(details, "priority");
2710 error = netdev_get_mtu(netdev, &mtu);
2712 VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU",
2713 netdev_get_name(netdev));
2717 /* HTB requires at least an mtu sized min-rate to send any traffic even
2718 * on uncongested links. */
2719 hc->min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
2720 hc->min_rate = MAX(hc->min_rate, mtu);
2721 hc->min_rate = MIN(hc->min_rate, htb->max_rate);
2724 hc->max_rate = (max_rate_s
2725 ? strtoull(max_rate_s, NULL, 10) / 8
2727 hc->max_rate = MAX(hc->max_rate, hc->min_rate);
2728 hc->max_rate = MIN(hc->max_rate, htb->max_rate);
2732 * According to hints in the documentation that I've read, it is important
2733 * that 'burst' be at least as big as the largest frame that might be
2734 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
2735 * but having it a bit too small is a problem. Since netdev_get_mtu()
2736 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
2737 * the MTU. We actually add 64, instead of 14, as a guard against
2738 * additional headers get tacked on somewhere that we're not aware of. */
2739 hc->burst = burst_s ? strtoull(burst_s, NULL, 10) / 8 : 0;
2740 hc->burst = MAX(hc->burst, mtu + 64);
2743 hc->priority = priority_s ? strtoul(priority_s, NULL, 10) : 0;
/* Queries the kernel for class 'handle' with parent 'parent' on 'netdev' and
 * parses the reply into 'options' and 'stats' (see htb_parse_tcmsg__()).
 * The reply buffer is freed before returning.
 *
 * Returns 0 if successful, otherwise a positive errno value. */
static int
htb_query_class__(const struct netdev *netdev, unsigned int handle,
                  unsigned int parent, struct htb_class *options,
                  struct netdev_queue_stats *stats)
{
    struct ofpbuf *reply;
    int error;

    error = tc_query_class(netdev, handle, parent, &reply);
    if (!error) {
        error = htb_parse_tcmsg__(reply, NULL, options, stats);
        ofpbuf_delete(reply);
    }
    return error;
}
2765 htb_tc_install(struct netdev *netdev, const struct smap *details)
2769 error = htb_setup_qdisc__(netdev);
2771 struct htb_class hc;
2773 htb_parse_qdisc_details__(netdev, details, &hc);
2774 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2775 tc_make_handle(1, 0), &hc);
2777 htb_install__(netdev, hc.max_rate);
2783 static struct htb_class *
2784 htb_class_cast__(const struct tc_queue *queue)
2786 return CONTAINER_OF(queue, struct htb_class, tc_queue);
2790 htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
2791 const struct htb_class *hc)
2793 struct htb *htb = htb_get__(netdev);
2794 size_t hash = hash_int(queue_id, 0);
2795 struct tc_queue *queue;
2796 struct htb_class *hcp;
2798 queue = tc_find_queue__(netdev, queue_id, hash);
2800 hcp = htb_class_cast__(queue);
2802 hcp = xmalloc(sizeof *hcp);
2803 queue = &hcp->tc_queue;
2804 queue->queue_id = queue_id;
2805 queue->created = time_msec();
2806 hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
2809 hcp->min_rate = hc->min_rate;
2810 hcp->max_rate = hc->max_rate;
2811 hcp->burst = hc->burst;
2812 hcp->priority = hc->priority;
2816 htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
2819 struct nl_dump dump;
2820 struct htb_class hc;
2822 /* Get qdisc options. */
2824 htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
2825 htb_install__(netdev, hc.max_rate);
2828 if (!start_queue_dump(netdev, &dump)) {
2831 while (nl_dump_next(&dump, &msg)) {
2832 unsigned int queue_id;
2834 if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
2835 htb_update_queue__(netdev, queue_id, &hc);
2838 nl_dump_done(&dump);
2844 htb_tc_destroy(struct tc *tc)
2846 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
2847 struct htb_class *hc, *next;
2849 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &htb->tc.queues) {
2850 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
2858 htb_qdisc_get(const struct netdev *netdev, struct smap *details)
2860 const struct htb *htb = htb_get__(netdev);
2861 smap_add_format(details, "max-rate", "%llu", 8ULL * htb->max_rate);
2866 htb_qdisc_set(struct netdev *netdev, const struct smap *details)
2868 struct htb_class hc;
2871 htb_parse_qdisc_details__(netdev, details, &hc);
2872 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2873 tc_make_handle(1, 0), &hc);
2875 htb_get__(netdev)->max_rate = hc.max_rate;
2881 htb_class_get(const struct netdev *netdev OVS_UNUSED,
2882 const struct tc_queue *queue, struct smap *details)
2884 const struct htb_class *hc = htb_class_cast__(queue);
2886 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
2887 if (hc->min_rate != hc->max_rate) {
2888 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
2890 smap_add_format(details, "burst", "%llu", 8ULL * hc->burst);
2892 smap_add_format(details, "priority", "%u", hc->priority);
2898 htb_class_set(struct netdev *netdev, unsigned int queue_id,
2899 const struct smap *details)
2901 struct htb_class hc;
2904 error = htb_parse_class_details__(netdev, details, &hc);
2909 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
2910 tc_make_handle(1, 0xfffe), &hc);
2915 htb_update_queue__(netdev, queue_id, &hc);
2920 htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
2922 struct htb_class *hc = htb_class_cast__(queue);
2923 struct htb *htb = htb_get__(netdev);
2926 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
2928 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
2935 htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
2936 struct netdev_queue_stats *stats)
2938 return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
2939 tc_make_handle(1, 0xfffe), NULL, stats);
2943 htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
2944 const struct ofpbuf *nlmsg,
2945 netdev_dump_queue_stats_cb *cb, void *aux)
2947 struct netdev_queue_stats stats;
2948 unsigned int handle, major, minor;
2951 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
2956 major = tc_get_major(handle);
2957 minor = tc_get_minor(handle);
2958 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2959 (*cb)(minor - 1, &stats, aux);
2964 static const struct tc_ops tc_ops_htb = {
2965 "htb", /* linux_name */
2966 "linux-htb", /* ovs_name */
2967 HTB_N_QUEUES, /* n_queues */
2976 htb_class_get_stats,
2977 htb_class_dump_stats
2980 /* "linux-hfsc" traffic control class. */
2982 #define HFSC_N_QUEUES 0xf000
2990 struct tc_queue tc_queue;
2995 static struct hfsc *
2996 hfsc_get__(const struct netdev *netdev_)
2998 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2999 return CONTAINER_OF(netdev->tc, struct hfsc, tc);
3002 static struct hfsc_class *
3003 hfsc_class_cast__(const struct tc_queue *queue)
3005 return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
3009 hfsc_install__(struct netdev *netdev_, uint32_t max_rate)
3011 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3014 hfsc = xmalloc(sizeof *hfsc);
3015 tc_init(&hfsc->tc, &tc_ops_hfsc);
3016 hfsc->max_rate = max_rate;
3017 netdev->tc = &hfsc->tc;
3021 hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
3022 const struct hfsc_class *hc)
3026 struct hfsc_class *hcp;
3027 struct tc_queue *queue;
3029 hfsc = hfsc_get__(netdev);
3030 hash = hash_int(queue_id, 0);
3032 queue = tc_find_queue__(netdev, queue_id, hash);
3034 hcp = hfsc_class_cast__(queue);
3036 hcp = xmalloc(sizeof *hcp);
3037 queue = &hcp->tc_queue;
3038 queue->queue_id = queue_id;
3039 queue->created = time_msec();
3040 hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
3043 hcp->min_rate = hc->min_rate;
3044 hcp->max_rate = hc->max_rate;
3048 hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
3050 const struct tc_service_curve *rsc, *fsc, *usc;
3051 static const struct nl_policy tca_hfsc_policy[] = {
3053 .type = NL_A_UNSPEC,
3055 .min_len = sizeof(struct tc_service_curve),
3058 .type = NL_A_UNSPEC,
3060 .min_len = sizeof(struct tc_service_curve),
3063 .type = NL_A_UNSPEC,
3065 .min_len = sizeof(struct tc_service_curve),
3068 struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];
3070 if (!nl_parse_nested(nl_options, tca_hfsc_policy,
3071 attrs, ARRAY_SIZE(tca_hfsc_policy))) {
3072 VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
3076 rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
3077 fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
3078 usc = nl_attr_get(attrs[TCA_HFSC_USC]);
3080 if (rsc->m1 != 0 || rsc->d != 0 ||
3081 fsc->m1 != 0 || fsc->d != 0 ||
3082 usc->m1 != 0 || usc->d != 0) {
3083 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3084 "Non-linear service curves are not supported.");
3088 if (rsc->m2 != fsc->m2) {
3089 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3090 "Real-time service curves are not supported ");
3094 if (rsc->m2 > usc->m2) {
3095 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3096 "Min-rate service curve is greater than "
3097 "the max-rate service curve.");
3101 class->min_rate = fsc->m2;
3102 class->max_rate = usc->m2;
3107 hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
3108 struct hfsc_class *options,
3109 struct netdev_queue_stats *stats)
3112 unsigned int handle;
3113 struct nlattr *nl_options;
3115 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
3121 unsigned int major, minor;
3123 major = tc_get_major(handle);
3124 minor = tc_get_minor(handle);
3125 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3126 *queue_id = minor - 1;
3133 error = hfsc_parse_tca_options__(nl_options, options);
/* Queries the kernel for class 'handle' with parent 'parent' on 'netdev' and
 * parses the reply into 'options' and 'stats' (see hfsc_parse_tcmsg__()).
 * The reply buffer is freed before returning.
 *
 * Returns 0 if successful, otherwise a positive errno value. */
static int
hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
                   unsigned int parent, struct hfsc_class *options,
                   struct netdev_queue_stats *stats)
{
    int error;
    struct ofpbuf *reply;

    error = tc_query_class(netdev, handle, parent, &reply);
    if (error) {
        return error;
    }

    error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
    ofpbuf_delete(reply);
    return error;
}
3158 hfsc_parse_qdisc_details__(struct netdev *netdev, const struct smap *details,
3159 struct hfsc_class *class)
3162 const char *max_rate_s;
3164 max_rate_s = smap_get(details, "max-rate");
3165 max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
3168 enum netdev_features current;
3170 netdev_get_features(netdev, ¤t, NULL, NULL, NULL);
3171 max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
3174 class->min_rate = max_rate;
3175 class->max_rate = max_rate;
3179 hfsc_parse_class_details__(struct netdev *netdev,
3180 const struct smap *details,
3181 struct hfsc_class * class)
3183 const struct hfsc *hfsc;
3184 uint32_t min_rate, max_rate;
3185 const char *min_rate_s, *max_rate_s;
3187 hfsc = hfsc_get__(netdev);
3188 min_rate_s = smap_get(details, "min-rate");
3189 max_rate_s = smap_get(details, "max-rate");
3191 min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
3192 min_rate = MAX(min_rate, 1);
3193 min_rate = MIN(min_rate, hfsc->max_rate);
3195 max_rate = (max_rate_s
3196 ? strtoull(max_rate_s, NULL, 10) / 8
3198 max_rate = MAX(max_rate, min_rate);
3199 max_rate = MIN(max_rate, hfsc->max_rate);
3201 class->min_rate = min_rate;
3202 class->max_rate = max_rate;
3207 /* Create an HFSC qdisc.
3209 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
3211 hfsc_setup_qdisc__(struct netdev * netdev)
3213 struct tcmsg *tcmsg;
3214 struct ofpbuf request;
3215 struct tc_hfsc_qopt opt;
3217 tc_del_qdisc(netdev);
3219 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3220 NLM_F_EXCL | NLM_F_CREATE, &request);
3226 tcmsg->tcm_handle = tc_make_handle(1, 0);
3227 tcmsg->tcm_parent = TC_H_ROOT;
3229 memset(&opt, 0, sizeof opt);
3232 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3233 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
3235 return tc_transact(&request, NULL);
3238 /* Create an HFSC class.
3240 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
3241 * sc rate <min_rate> ul rate <max_rate>" */
3243 hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
3244 unsigned int parent, struct hfsc_class *class)
3248 struct tcmsg *tcmsg;
3249 struct ofpbuf request;
3250 struct tc_service_curve min, max;
3252 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
3258 tcmsg->tcm_handle = handle;
3259 tcmsg->tcm_parent = parent;
3263 min.m2 = class->min_rate;
3267 max.m2 = class->max_rate;
3269 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3270 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3271 nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
3272 nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
3273 nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
3274 nl_msg_end_nested(&request, opt_offset);
3276 error = tc_transact(&request, NULL);
3278 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
3279 "min-rate %ubps, max-rate %ubps (%s)",
3280 netdev_get_name(netdev),
3281 tc_get_major(handle), tc_get_minor(handle),
3282 tc_get_major(parent), tc_get_minor(parent),
3283 class->min_rate, class->max_rate, ovs_strerror(error));
3290 hfsc_tc_install(struct netdev *netdev, const struct smap *details)
3293 struct hfsc_class class;
3295 error = hfsc_setup_qdisc__(netdev);
3301 hfsc_parse_qdisc_details__(netdev, details, &class);
3302 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3303 tc_make_handle(1, 0), &class);
3309 hfsc_install__(netdev, class.max_rate);
3314 hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3317 struct nl_dump dump;
3318 struct hfsc_class hc;
3321 hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
3322 hfsc_install__(netdev, hc.max_rate);
3324 if (!start_queue_dump(netdev, &dump)) {
3328 while (nl_dump_next(&dump, &msg)) {
3329 unsigned int queue_id;
3331 if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3332 hfsc_update_queue__(netdev, queue_id, &hc);
3336 nl_dump_done(&dump);
3341 hfsc_tc_destroy(struct tc *tc)
3344 struct hfsc_class *hc, *next;
3346 hfsc = CONTAINER_OF(tc, struct hfsc, tc);
3348 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
3349 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
3358 hfsc_qdisc_get(const struct netdev *netdev, struct smap *details)
3360 const struct hfsc *hfsc;
3361 hfsc = hfsc_get__(netdev);
3362 smap_add_format(details, "max-rate", "%llu", 8ULL * hfsc->max_rate);
3367 hfsc_qdisc_set(struct netdev *netdev, const struct smap *details)
3370 struct hfsc_class class;
3372 hfsc_parse_qdisc_details__(netdev, details, &class);
3373 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3374 tc_make_handle(1, 0), &class);
3377 hfsc_get__(netdev)->max_rate = class.max_rate;
3384 hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
3385 const struct tc_queue *queue, struct smap *details)
3387 const struct hfsc_class *hc;
3389 hc = hfsc_class_cast__(queue);
3390 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
3391 if (hc->min_rate != hc->max_rate) {
3392 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
3398 hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
3399 const struct smap *details)
3402 struct hfsc_class class;
3404 error = hfsc_parse_class_details__(netdev, details, &class);
3409 error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3410 tc_make_handle(1, 0xfffe), &class);
3415 hfsc_update_queue__(netdev, queue_id, &class);
3420 hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
3424 struct hfsc_class *hc;
3426 hc = hfsc_class_cast__(queue);
3427 hfsc = hfsc_get__(netdev);
3429 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3431 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
3438 hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3439 struct netdev_queue_stats *stats)
3441 return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3442 tc_make_handle(1, 0xfffe), NULL, stats);
3446 hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3447 const struct ofpbuf *nlmsg,
3448 netdev_dump_queue_stats_cb *cb, void *aux)
3450 struct netdev_queue_stats stats;
3451 unsigned int handle, major, minor;
3454 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3459 major = tc_get_major(handle);
3460 minor = tc_get_minor(handle);
3461 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3462 (*cb)(minor - 1, &stats, aux);
3467 static const struct tc_ops tc_ops_hfsc = {
3468 "hfsc", /* linux_name */
3469 "linux-hfsc", /* ovs_name */
3470 HFSC_N_QUEUES, /* n_queues */
3471 hfsc_tc_install, /* tc_install */
3472 hfsc_tc_load, /* tc_load */
3473 hfsc_tc_destroy, /* tc_destroy */
3474 hfsc_qdisc_get, /* qdisc_get */
3475 hfsc_qdisc_set, /* qdisc_set */
3476 hfsc_class_get, /* class_get */
3477 hfsc_class_set, /* class_set */
3478 hfsc_class_delete, /* class_delete */
3479 hfsc_class_get_stats, /* class_get_stats */
3480 hfsc_class_dump_stats /* class_dump_stats */
3483 /* "linux-default" traffic control class.
3485 * This class represents the default, unnamed Linux qdisc. It corresponds to
3486 * the "" (empty string) QoS type in the OVS database. */
3489 default_install__(struct netdev *netdev_)
3491 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3492 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_default);
3494 /* Nothing but a tc class implementation is allowed to write to a tc. This
3495 * class never does that, so we can legitimately use a const tc object. */
3496 netdev->tc = CONST_CAST(struct tc *, &tc);
3500 default_tc_install(struct netdev *netdev,
3501 const struct smap *details OVS_UNUSED)
3503 default_install__(netdev);
3508 default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3510 default_install__(netdev);
3514 static const struct tc_ops tc_ops_default = {
3515 NULL, /* linux_name */
3520 NULL, /* tc_destroy */
3521 NULL, /* qdisc_get */
3522 NULL, /* qdisc_set */
3523 NULL, /* class_get */
3524 NULL, /* class_set */
3525 NULL, /* class_delete */
3526 NULL, /* class_get_stats */
3527 NULL /* class_dump_stats */
3530 /* "linux-other" traffic control class.
3535 other_tc_load(struct netdev *netdev_, struct ofpbuf *nlmsg OVS_UNUSED)
3537 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3538 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_other);
3540 /* Nothing but a tc class implementation is allowed to write to a tc. This
3541 * class never does that, so we can legitimately use a const tc object. */
3542 netdev->tc = CONST_CAST(struct tc *, &tc);
3546 static const struct tc_ops tc_ops_other = {
3547 NULL, /* linux_name */
3548 "linux-other", /* ovs_name */
3550 NULL, /* tc_install */
3552 NULL, /* tc_destroy */
3553 NULL, /* qdisc_get */
3554 NULL, /* qdisc_set */
3555 NULL, /* class_get */
3556 NULL, /* class_set */
3557 NULL, /* class_delete */
3558 NULL, /* class_get_stats */
3559 NULL /* class_dump_stats */
/* Traffic control. */

/* Number of kernel "tc" ticks per second. */
static double ticks_per_s;

/* Number of kernel "jiffies" per second.  This is used for the purpose of
 * computing buffer sizes.  Generally kernel qdiscs need to be able to buffer
 * one jiffy's worth of data.
 *
 * There are two possibilities here:
 *
 *    - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
 *      approximate range of 100 to 1024.  That means that we really need to
 *      make sure that the qdisc can buffer that much data.
 *
 *    - 'buffer_hz' is an absurdly large number.  That means that the kernel
 *      has finely granular timers and there's no need to fudge additional room
 *      for buffers.  (There's no extra effort needed to implement that: the
 *      large 'buffer_hz' is used as a divisor, so practically any number will
 *      come out as 0 in the division.  Small integer results in the case of
 *      really high dividends won't have any real effect anyhow.)
 */
static unsigned int buffer_hz;
/* Returns tc handle 'major':'minor'.  'major' occupies the upper 16 bits of
 * the handle and 'minor' the lower 16 bits. */
static unsigned int
tc_make_handle(unsigned int major, unsigned int minor)
{
    return TC_H_MAKE(major << 16, minor);
}
/* Returns the major number from 'handle', that is, its upper 16 bits. */
static unsigned int
tc_get_major(unsigned int handle)
{
    return TC_H_MAJ(handle) >> 16;
}
/* Returns the minor number from 'handle', that is, its lower 16 bits. */
static unsigned int
tc_get_minor(unsigned int handle)
{
    return TC_H_MIN(handle);
}
3607 static struct tcmsg *
3608 tc_make_request(const struct netdev *netdev, int type, unsigned int flags,
3609 struct ofpbuf *request)
3611 struct tcmsg *tcmsg;
3615 error = get_ifindex(netdev, &ifindex);
3620 ofpbuf_init(request, 512);
3621 nl_msg_put_nlmsghdr(request, sizeof *tcmsg, type, NLM_F_REQUEST | flags);
3622 tcmsg = ofpbuf_put_zeros(request, sizeof *tcmsg);
3623 tcmsg->tcm_family = AF_UNSPEC;
3624 tcmsg->tcm_ifindex = ifindex;
3625 /* Caller should fill in tcmsg->tcm_handle. */
3626 /* Caller should fill in tcmsg->tcm_parent. */
3632 tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
3634 int error = nl_transact(NETLINK_ROUTE, request, replyp);
3635 ofpbuf_uninit(request);
3639 /* Adds or deletes a root ingress qdisc on 'netdev'. We use this for
3640 * policing configuration.
3642 * This function is equivalent to running the following when 'add' is true:
3643 * /sbin/tc qdisc add dev <devname> handle ffff: ingress
3645 * This function is equivalent to running the following when 'add' is false:
3646 * /sbin/tc qdisc del dev <devname> handle ffff: ingress
3648 * The configuration and stats may be seen with the following command:
3649 * /sbin/tc -s qdisc show dev <devname>
3651 * Returns 0 if successful, otherwise a positive errno value.
3654 tc_add_del_ingress_qdisc(struct netdev *netdev, bool add)
3656 struct ofpbuf request;
3657 struct tcmsg *tcmsg;
3659 int type = add ? RTM_NEWQDISC : RTM_DELQDISC;
3660 int flags = add ? NLM_F_EXCL | NLM_F_CREATE : 0;
3662 tcmsg = tc_make_request(netdev, type, flags, &request);
3666 tcmsg->tcm_handle = tc_make_handle(0xffff, 0);
3667 tcmsg->tcm_parent = TC_H_INGRESS;
3668 nl_msg_put_string(&request, TCA_KIND, "ingress");
3669 nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0);
3671 error = tc_transact(&request, NULL);
3673 /* If we're deleting the qdisc, don't worry about some of the
3674 * error conditions. */
3675 if (!add && (error == ENOENT || error == EINVAL)) {
3684 /* Adds a policer to 'netdev' with a rate of 'kbits_rate' and a burst size
3687 * This function is equivalent to running:
3688 * /sbin/tc filter add dev <devname> parent ffff: protocol all prio 49
3689 * basic police rate <kbits_rate>kbit burst <kbits_burst>k
3692 * The configuration and stats may be seen with the following command:
3693 * /sbin/tc -s filter show <devname> eth0 parent ffff:
3695 * Returns 0 if successful, otherwise a positive errno value.
3698 tc_add_policer(struct netdev *netdev, int kbits_rate, int kbits_burst)
3700 struct tc_police tc_police;
3701 struct ofpbuf request;
3702 struct tcmsg *tcmsg;
3703 size_t basic_offset;
3704 size_t police_offset;
3708 memset(&tc_police, 0, sizeof tc_police);
3709 tc_police.action = TC_POLICE_SHOT;
3710 tc_police.mtu = mtu;
3711 tc_fill_rate(&tc_police.rate, (kbits_rate * 1000)/8, mtu);
3712 tc_police.burst = tc_bytes_to_ticks(tc_police.rate.rate,
3713 kbits_burst * 1024);
3715 tcmsg = tc_make_request(netdev, RTM_NEWTFILTER,
3716 NLM_F_EXCL | NLM_F_CREATE, &request);
3720 tcmsg->tcm_parent = tc_make_handle(0xffff, 0);
3721 tcmsg->tcm_info = tc_make_handle(49,
3722 (OVS_FORCE uint16_t) htons(ETH_P_ALL));
3724 nl_msg_put_string(&request, TCA_KIND, "basic");
3725 basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3726 police_offset = nl_msg_start_nested(&request, TCA_BASIC_POLICE);
3727 nl_msg_put_unspec(&request, TCA_POLICE_TBF, &tc_police, sizeof tc_police);
3728 tc_put_rtab(&request, TCA_POLICE_RATE, &tc_police.rate);
3729 nl_msg_end_nested(&request, police_offset);
3730 nl_msg_end_nested(&request, basic_offset);
3732 error = tc_transact(&request, NULL);
3743 /* The values in psched are not individually very meaningful, but they are
3744 * important. The tables below show some values seen in the wild.
3748 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
3749 * (Before that, there are hints that it was 1000000000.)
3751 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
3755 * -----------------------------------
3756 * [1] 000c8000 000f4240 000f4240 00000064
3757 * [2] 000003e8 00000400 000f4240 3b9aca00
3758 * [3] 000003e8 00000400 000f4240 3b9aca00
3759 * [4] 000003e8 00000400 000f4240 00000064
3760 * [5] 000003e8 00000040 000f4240 3b9aca00
3761 * [6] 000003e8 00000040 000f4240 000000f9
3763 * a b c d ticks_per_s buffer_hz
3764 * ------- --------- ---------- ------------- ----------- -------------
3765 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
3766 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3767 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3768 * [4] 1,000 1,024 1,000,000 100 976,562 100
3769 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
3770 * [6] 1,000 64 1,000,000 249 15,625,000 249
3772 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
3773 * [2] 2.6.26-1-686-bigmem from Debian lenny
3774 * [3] 2.6.26-2-sparc64 from Debian lenny
3775 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
3776 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
3777 * [6] 2.6.34 from kernel.org on KVM
3779 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
3780 static const char fn[] = "/proc/net/psched";
3781 unsigned int a, b, c, d;
3784 if (!ovsthread_once_start(&once)) {
3791 stream = fopen(fn, "r");
3793 VLOG_WARN("%s: open failed: %s", fn, ovs_strerror(errno));
3797 if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
3798 VLOG_WARN("%s: read failed", fn);
3802 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
3806 VLOG_WARN("%s: invalid scheduler parameters", fn);
3810 ticks_per_s = (double) a * c / b;
3814 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
3817 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);
3820 ovsthread_once_done(&once);
3823 /* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
3824 * rate of 'rate' bytes per second. */
3826 tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
3829 return (rate * ticks) / ticks_per_s;
3832 /* Returns the number of ticks that it would take to transmit 'size' bytes at a
3833 * rate of 'rate' bytes per second. */
3835 tc_bytes_to_ticks(unsigned int rate, unsigned int size)
3838 return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
3841 /* Returns the number of bytes that need to be reserved for qdisc buffering at
3842 * a transmission rate of 'rate' bytes per second. */
3844 tc_buffer_per_jiffy(unsigned int rate)
3847 return rate / buffer_hz;
3850 /* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
3851 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
3852 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
3853 * stores NULL into it if it is absent.
3855 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
3858 * Returns 0 if successful, otherwise a positive errno value. */
3860 tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
3861 struct nlattr **options)
3863 static const struct nl_policy tca_policy[] = {
3864 [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
3865 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
3867 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3869 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3870 tca_policy, ta, ARRAY_SIZE(ta))) {
3871 VLOG_WARN_RL(&rl, "failed to parse qdisc message");
3876 *kind = nl_attr_get_string(ta[TCA_KIND]);
3880 *options = ta[TCA_OPTIONS];
3895 /* Given Netlink 'msg' that describes a class, extracts the queue ID (e.g. the
3896 * minor number of its class ID) into '*queue_id', its TCA_OPTIONS attribute
3897 * into '*options', and its queue statistics into '*stats'. Any of the output
3898 * arguments may be null.
3900 * Returns 0 if successful, otherwise a positive errno value. */
3902 tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
3903 struct nlattr **options, struct netdev_queue_stats *stats)
3905 static const struct nl_policy tca_policy[] = {
3906 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
3907 [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
3909 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3911 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3912 tca_policy, ta, ARRAY_SIZE(ta))) {
3913 VLOG_WARN_RL(&rl, "failed to parse class message");
3918 struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
3919 *handlep = tc->tcm_handle;
3923 *options = ta[TCA_OPTIONS];
3927 const struct gnet_stats_queue *gsq;
3928 struct gnet_stats_basic gsb;
3930 static const struct nl_policy stats_policy[] = {
3931 [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
3932 .min_len = sizeof gsb },
3933 [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
3934 .min_len = sizeof *gsq },
3936 struct nlattr *sa[ARRAY_SIZE(stats_policy)];
3938 if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
3939 sa, ARRAY_SIZE(sa))) {
3940 VLOG_WARN_RL(&rl, "failed to parse class stats");
3944 /* Alignment issues screw up the length of struct gnet_stats_basic on
3945 * some arch/bitsize combinations. Newer versions of Linux have a
3946 * struct gnet_stats_basic_packed, but we can't depend on that. The
3947 * easiest thing to do is just to make a copy. */
3948 memset(&gsb, 0, sizeof gsb);
3949 memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
3950 MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
3951 stats->tx_bytes = gsb.bytes;
3952 stats->tx_packets = gsb.packets;
3954 gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
3955 stats->tx_errors = gsq->drops;
3965 memset(stats, 0, sizeof *stats);
3970 /* Queries the kernel for class with identifier 'handle' and parent 'parent'
3973 tc_query_class(const struct netdev *netdev,
3974 unsigned int handle, unsigned int parent,
3975 struct ofpbuf **replyp)
3977 struct ofpbuf request;
3978 struct tcmsg *tcmsg;
3981 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO, &request);
3985 tcmsg->tcm_handle = handle;
3986 tcmsg->tcm_parent = parent;
3988 error = tc_transact(&request, replyp);
3990 VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
3991 netdev_get_name(netdev),
3992 tc_get_major(handle), tc_get_minor(handle),
3993 tc_get_major(parent), tc_get_minor(parent),
3994 ovs_strerror(error));
3999 /* Equivalent to "tc class del dev <name> handle <handle>". */
4001 tc_delete_class(const struct netdev *netdev, unsigned int handle)
4003 struct ofpbuf request;
4004 struct tcmsg *tcmsg;
4007 tcmsg = tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
4011 tcmsg->tcm_handle = handle;
4012 tcmsg->tcm_parent = 0;
4014 error = tc_transact(&request, NULL);
4016 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
4017 netdev_get_name(netdev),
4018 tc_get_major(handle), tc_get_minor(handle),
4019 ovs_strerror(error));
4024 /* Equivalent to "tc qdisc del dev <name> root". */
4026 tc_del_qdisc(struct netdev *netdev_)
4028 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4029 struct ofpbuf request;
4030 struct tcmsg *tcmsg;
4033 tcmsg = tc_make_request(netdev_, RTM_DELQDISC, 0, &request);
4037 tcmsg->tcm_handle = tc_make_handle(1, 0);
4038 tcmsg->tcm_parent = TC_H_ROOT;
4040 error = tc_transact(&request, NULL);
4041 if (error == EINVAL) {
4042 /* EINVAL probably means that the default qdisc was in use, in which
4043 * case we've accomplished our purpose. */
4046 if (!error && netdev->tc) {
4047 if (netdev->tc->ops->tc_destroy) {
4048 netdev->tc->ops->tc_destroy(netdev->tc);
4055 /* If 'netdev''s qdisc type and parameters are not yet known, queries the
4056 * kernel to determine what they are. Returns 0 if successful, otherwise a
4057 * positive errno value. */
4059 tc_query_qdisc(const struct netdev *netdev_)
4061 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4062 struct ofpbuf request, *qdisc;
4063 const struct tc_ops *ops;
4064 struct tcmsg *tcmsg;
4072 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
4073 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
4074 * 2.6.35 without that fix backported to it.
4076 * To avoid the OOPS, we must not make a request that would attempt to dump
4077 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
4078 * few others. There are a few ways that I can see to do this, but most of
4079 * them seem to be racy (and if you lose the race the kernel OOPSes). The
4080 * technique chosen here is to assume that any non-default qdisc that we
4081 * create will have a class with handle 1:0. The built-in qdiscs only have
4082 * a class with handle 0:0.
4084 * We could check for Linux 2.6.35+ and use a more straightforward method
4086 tcmsg = tc_make_request(netdev_, RTM_GETQDISC, NLM_F_ECHO, &request);
4090 tcmsg->tcm_handle = tc_make_handle(1, 0);
4091 tcmsg->tcm_parent = 0;
4093 /* Figure out what tc class to instantiate. */
4094 error = tc_transact(&request, &qdisc);
4098 error = tc_parse_qdisc(qdisc, &kind, NULL);
4100 ops = &tc_ops_other;
4102 ops = tc_lookup_linux_name(kind);
4104 static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
4105 VLOG_INFO_RL(&rl2, "unknown qdisc \"%s\"", kind);
4107 ops = &tc_ops_other;
4110 } else if (error == ENOENT) {
4111 /* Either it's a built-in qdisc, or it's a qdisc set up by some
4112 * other entity that doesn't have a handle 1:0. We will assume
4113 * that it's the system default qdisc. */
4114 ops = &tc_ops_default;
4117 /* Who knows? Maybe the device got deleted. */
4118 VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
4119 netdev_get_name(netdev_), ovs_strerror(error));
4120 ops = &tc_ops_other;
4123 /* Instantiate it. */
4124 load_error = ops->tc_load(CONST_CAST(struct netdev *, netdev_), qdisc);
4125 ovs_assert((load_error == 0) == (netdev->tc != NULL));
4126 ofpbuf_delete(qdisc);
4128 return error ? error : load_error;
4131 /* Linux traffic control uses tables with 256 entries ("rtab" tables) to
4132 approximate the time to transmit packets of various lengths. For an MTU of
4133 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
4134 represents two possible packet lengths; for a MTU of 513 through 1024, four
4135 possible lengths; and so on.
4137 Returns, for the specified 'mtu', the number of bits that packet lengths
4138 need to be shifted right to fit within such a 256-entry table. */
4140 tc_calc_cell_log(unsigned int mtu)
4145 mtu = ETH_PAYLOAD_MAX;
4147 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
4149 for (cell_log = 0; mtu >= 256; cell_log++) {
4156 /* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
4159 tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
4161 memset(rate, 0, sizeof *rate);
4162 rate->cell_log = tc_calc_cell_log(mtu);
4163 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
4164 /* rate->cell_align = 0; */ /* distro headers. */
4165 rate->mpu = ETH_TOTAL_MIN;
4169 /* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
4170 * attribute of the specified "type".
4172 * See tc_calc_cell_log() above for a description of "rtab"s. */
4174 tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
4179 rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
4180 for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
4181 unsigned packet_size = (i + 1) << rate->cell_log;
4182 if (packet_size < rate->mpu) {
4183 packet_size = rate->mpu;
4185 rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
/* Calculates the proper value of 'buffer' or 'cbuffer' in HTB options given a
 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
 * burst size of 'burst_bytes'.  (If no value was requested, a 'burst_bytes' of
 * 0 is fine.)
 *
 * The burst is never allowed to drop below tc_buffer_per_jiffy(Bps) plus one
 * MTU; the result is that many bytes expressed in ticks at 'Bps'. */
static int
tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
{
    unsigned int floor_bytes = tc_buffer_per_jiffy(Bps) + mtu;
    uint64_t burst = burst_bytes;

    if (burst < floor_bytes) {
        burst = floor_bytes;
    }
    return tc_bytes_to_ticks(Bps, burst);
}
4200 /* Linux-only functions declared in netdev-linux.h */
4202 /* Modifies the 'flag' bit in ethtool's flags field for 'netdev'. If
4203 * 'enable' is true, the bit is set. Otherwise, it is cleared. */
4205 netdev_linux_ethtool_set_flag(struct netdev *netdev, uint32_t flag,
4206 const char *flag_name, bool enable)
4208 const char *netdev_name = netdev_get_name(netdev);
4209 struct ethtool_value evalue;
4213 COVERAGE_INC(netdev_get_ethtool);
4214 memset(&evalue, 0, sizeof evalue);
4215 error = netdev_linux_do_ethtool(netdev_name,
4216 (struct ethtool_cmd *)&evalue,
4217 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
4222 COVERAGE_INC(netdev_set_ethtool);
4223 evalue.data = new_flags = (evalue.data & ~flag) | (enable ? flag : 0);
4224 error = netdev_linux_do_ethtool(netdev_name,
4225 (struct ethtool_cmd *)&evalue,
4226 ETHTOOL_SFLAGS, "ETHTOOL_SFLAGS");
4231 COVERAGE_INC(netdev_get_ethtool);
4232 memset(&evalue, 0, sizeof evalue);
4233 error = netdev_linux_do_ethtool(netdev_name,
4234 (struct ethtool_cmd *)&evalue,
4235 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
4240 if (new_flags != evalue.data) {
4241 VLOG_WARN_RL(&rl, "attempt to %s ethtool %s flag on network "
4242 "device %s failed", enable ? "enable" : "disable",
4243 flag_name, netdev_name);
4250 /* Utility functions. */
4252 /* Copies 'src' into 'dst', performing format conversion in the process. */
4254 netdev_stats_from_rtnl_link_stats(struct netdev_stats *dst,
4255 const struct rtnl_link_stats *src)
4257 dst->rx_packets = src->rx_packets;
4258 dst->tx_packets = src->tx_packets;
4259 dst->rx_bytes = src->rx_bytes;
4260 dst->tx_bytes = src->tx_bytes;
4261 dst->rx_errors = src->rx_errors;
4262 dst->tx_errors = src->tx_errors;
4263 dst->rx_dropped = src->rx_dropped;
4264 dst->tx_dropped = src->tx_dropped;
4265 dst->multicast = src->multicast;
4266 dst->collisions = src->collisions;
4267 dst->rx_length_errors = src->rx_length_errors;
4268 dst->rx_over_errors = src->rx_over_errors;
4269 dst->rx_crc_errors = src->rx_crc_errors;
4270 dst->rx_frame_errors = src->rx_frame_errors;
4271 dst->rx_fifo_errors = src->rx_fifo_errors;
4272 dst->rx_missed_errors = src->rx_missed_errors;
4273 dst->tx_aborted_errors = src->tx_aborted_errors;
4274 dst->tx_carrier_errors = src->tx_carrier_errors;
4275 dst->tx_fifo_errors = src->tx_fifo_errors;
4276 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
4277 dst->tx_window_errors = src->tx_window_errors;
4281 get_stats_via_netlink(int ifindex, struct netdev_stats *stats)
4283 /* Policy for RTNLGRP_LINK messages.
4285 * There are *many* more fields in these messages, but currently we only
4286 * care about these fields. */
4287 static const struct nl_policy rtnlgrp_link_policy[] = {
4288 [IFLA_IFNAME] = { .type = NL_A_STRING, .optional = false },
4289 [IFLA_STATS] = { .type = NL_A_UNSPEC, .optional = true,
4290 .min_len = sizeof(struct rtnl_link_stats) },
4293 struct ofpbuf request;
4294 struct ofpbuf *reply;
4295 struct ifinfomsg *ifi;
4296 struct nlattr *attrs[ARRAY_SIZE(rtnlgrp_link_policy)];
4299 ofpbuf_init(&request, 0);
4300 nl_msg_put_nlmsghdr(&request, sizeof *ifi, RTM_GETLINK, NLM_F_REQUEST);
4301 ifi = ofpbuf_put_zeros(&request, sizeof *ifi);
4302 ifi->ifi_family = PF_UNSPEC;
4303 ifi->ifi_index = ifindex;
4304 error = nl_transact(NETLINK_ROUTE, &request, &reply);
4305 ofpbuf_uninit(&request);
4310 if (!nl_policy_parse(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg),
4311 rtnlgrp_link_policy,
4312 attrs, ARRAY_SIZE(rtnlgrp_link_policy))) {
4313 ofpbuf_delete(reply);
4317 if (!attrs[IFLA_STATS]) {
4318 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
4319 ofpbuf_delete(reply);
4323 netdev_stats_from_rtnl_link_stats(stats, nl_attr_get(attrs[IFLA_STATS]));
4325 ofpbuf_delete(reply);
4331 get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats)
4333 static const char fn[] = "/proc/net/dev";
4338 stream = fopen(fn, "r");
4340 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, ovs_strerror(errno));
4345 while (fgets(line, sizeof line, stream)) {
4348 #define X64 "%"SCNu64
4351 X64 X64 X64 X64 X64 X64 X64 "%*u"
4352 X64 X64 X64 X64 X64 X64 X64 "%*u",
4358 &stats->rx_fifo_errors,
4359 &stats->rx_frame_errors,
4365 &stats->tx_fifo_errors,
4367 &stats->tx_carrier_errors) != 15) {
4368 VLOG_WARN_RL(&rl, "%s:%d: parse error", fn, ln);
4369 } else if (!strcmp(devname, netdev_name)) {
4370 stats->rx_length_errors = UINT64_MAX;
4371 stats->rx_over_errors = UINT64_MAX;
4372 stats->rx_crc_errors = UINT64_MAX;
4373 stats->rx_missed_errors = UINT64_MAX;
4374 stats->tx_aborted_errors = UINT64_MAX;
4375 stats->tx_heartbeat_errors = UINT64_MAX;
4376 stats->tx_window_errors = UINT64_MAX;
4382 VLOG_WARN_RL(&rl, "%s: no stats for %s", fn, netdev_name);
4388 get_flags(const struct netdev *dev, unsigned int *flags)
4394 error = af_inet_ifreq_ioctl(dev->name, &ifr, SIOCGIFFLAGS, "SIOCGIFFLAGS");
4396 *flags = ifr.ifr_flags;
4402 set_flags(const char *name, unsigned int flags)
4406 ifr.ifr_flags = flags;
4407 return af_inet_ifreq_ioctl(name, &ifr, SIOCSIFFLAGS, "SIOCSIFFLAGS");
4411 do_get_ifindex(const char *netdev_name)
4416 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4417 COVERAGE_INC(netdev_get_ifindex);
4419 error = af_inet_ioctl(SIOCGIFINDEX, &ifr);
4421 VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
4422 netdev_name, ovs_strerror(error));
4425 return ifr.ifr_ifindex;
4429 get_ifindex(const struct netdev *netdev_, int *ifindexp)
4431 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4433 if (!(netdev->cache_valid & VALID_IFINDEX)) {
4434 int ifindex = do_get_ifindex(netdev_get_name(netdev_));
4437 netdev->get_ifindex_error = -ifindex;
4438 netdev->ifindex = 0;
4440 netdev->get_ifindex_error = 0;
4441 netdev->ifindex = ifindex;
4443 netdev->cache_valid |= VALID_IFINDEX;
4446 *ifindexp = netdev->ifindex;
4447 return netdev->get_ifindex_error;
4451 get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN])
4457 memset(&ifr, 0, sizeof ifr);
4458 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4459 COVERAGE_INC(netdev_get_hwaddr);
4460 error = af_inet_ioctl(SIOCGIFHWADDR, &ifr);
4462 /* ENODEV probably means that a vif disappeared asynchronously and
4463 * hasn't been removed from the database yet, so reduce the log level
4464 * to INFO for that case. */
4465 VLOG(error == ENODEV ? VLL_INFO : VLL_ERR,
4466 "ioctl(SIOCGIFHWADDR) on %s device failed: %s",
4467 netdev_name, ovs_strerror(error));
4470 hwaddr_family = ifr.ifr_hwaddr.sa_family;
4471 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER) {
4472 VLOG_WARN("%s device has unknown hardware address family %d",
4473 netdev_name, hwaddr_family);
4475 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
4480 set_etheraddr(const char *netdev_name,
4481 const uint8_t mac[ETH_ADDR_LEN])
4486 memset(&ifr, 0, sizeof ifr);
4487 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4488 ifr.ifr_hwaddr.sa_family = ARPHRD_ETHER;
4489 memcpy(ifr.ifr_hwaddr.sa_data, mac, ETH_ADDR_LEN);
4490 COVERAGE_INC(netdev_set_hwaddr);
4491 error = af_inet_ioctl(SIOCSIFHWADDR, &ifr);
4493 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
4494 netdev_name, ovs_strerror(error));
4500 netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
4501 int cmd, const char *cmd_name)
4506 memset(&ifr, 0, sizeof ifr);
4507 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
4508 ifr.ifr_data = (caddr_t) ecmd;
4511 error = af_inet_ioctl(SIOCETHTOOL, &ifr);
4513 if (error != EOPNOTSUPP) {
4514 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
4515 "failed: %s", cmd_name, name, ovs_strerror(error));
4517 /* The device doesn't support this operation. That's pretty
4518 * common, so there's no point in logging anything. */
4525 netdev_linux_get_ipv4(const struct netdev *netdev, struct in_addr *ip,
4526 int cmd, const char *cmd_name)
4531 ifr.ifr_addr.sa_family = AF_INET;
4532 error = af_inet_ifreq_ioctl(netdev_get_name(netdev), &ifr, cmd, cmd_name);
4534 const struct sockaddr_in *sin = ALIGNED_CAST(struct sockaddr_in *,
4536 *ip = sin->sin_addr;
4541 /* Returns an AF_PACKET raw socket or a negative errno value. */
4543 af_packet_sock(void)
4545 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
4548 if (ovsthread_once_start(&once)) {
4549 sock = socket(AF_PACKET, SOCK_RAW, 0);
4551 int error = set_nonblocking(sock);
4558 VLOG_ERR("failed to create packet socket: %s",
4559 ovs_strerror(errno));
4561 ovsthread_once_done(&once);