2 * Copyright (c) 2009, 2010, 2011, 2012, 2013 Nicira, Inc.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
19 #include "netdev-linux.h"
23 #include <arpa/inet.h>
25 #include <linux/filter.h>
26 #include <linux/gen_stats.h>
27 #include <linux/if_ether.h>
28 #include <linux/if_tun.h>
29 #include <linux/types.h>
30 #include <linux/ethtool.h>
31 #include <linux/mii.h>
32 #include <linux/pkt_cls.h>
33 #include <linux/pkt_sched.h>
34 #include <linux/rtnetlink.h>
35 #include <linux/sockios.h>
36 #include <linux/version.h>
37 #include <sys/types.h>
38 #include <sys/ioctl.h>
39 #include <sys/socket.h>
40 #include <netpacket/packet.h>
42 #include <net/if_arp.h>
43 #include <net/if_packet.h>
44 #include <net/route.h>
45 #include <netinet/in.h>
52 #include "dpif-linux.h"
53 #include "dynamic-string.h"
54 #include "fatal-signal.h"
57 #include "netdev-provider.h"
58 #include "netdev-vport.h"
59 #include "netlink-notifier.h"
60 #include "netlink-socket.h"
63 #include "openflow/openflow.h"
65 #include "poll-loop.h"
66 #include "rtnetlink-link.h"
68 #include "socket-util.h"
71 #include "unaligned.h"
74 VLOG_DEFINE_THIS_MODULE(netdev_linux);
76 COVERAGE_DEFINE(netdev_set_policing);
77 COVERAGE_DEFINE(netdev_arp_lookup);
78 COVERAGE_DEFINE(netdev_get_ifindex);
79 COVERAGE_DEFINE(netdev_get_hwaddr);
80 COVERAGE_DEFINE(netdev_set_hwaddr);
81 COVERAGE_DEFINE(netdev_get_ethtool);
82 COVERAGE_DEFINE(netdev_set_ethtool);
85 /* These were introduced in Linux 2.6.14, so they might be missing if we have
87 #ifndef ADVERTISED_Pause
88 #define ADVERTISED_Pause (1 << 13)
90 #ifndef ADVERTISED_Asym_Pause
91 #define ADVERTISED_Asym_Pause (1 << 14)
94 /* These were introduced in Linux 2.6.24, so they might be missing if we
95 * have old headers. */
96 #ifndef ETHTOOL_GFLAGS
97 #define ETHTOOL_GFLAGS 0x00000025 /* Get flags bitmap(ethtool_value) */
99 #ifndef ETHTOOL_SFLAGS
100 #define ETHTOOL_SFLAGS 0x00000026 /* Set flags bitmap(ethtool_value) */
103 /* This was introduced in Linux 2.6.25, so it might be missing if we have old
106 #define TC_RTAB_SIZE 1024
109 static struct nln_notifier *netdev_linux_cache_notifier = NULL;
110 static int cache_notifier_refcount;
113 VALID_IFINDEX = 1 << 0,
114 VALID_ETHERADDR = 1 << 1,
118 VALID_POLICING = 1 << 5,
119 VALID_VPORT_STAT_ERROR = 1 << 6,
120 VALID_DRVINFO = 1 << 7,
121 VALID_FEATURES = 1 << 8,
128 /* Traffic control. */
130 /* An instance of a traffic control class. Always associated with a particular
133 * Each TC implementation subclasses this with whatever additional data it
136 const struct tc_ops *ops;
137 struct hmap queues; /* Contains "struct tc_queue"s.
138 * Read by generic TC layer.
139 * Written only by TC implementation. */
142 #define TC_INITIALIZER(TC, OPS) { OPS, HMAP_INITIALIZER(&(TC)->queues) }
144 /* One traffic control queue.
146 * Each TC implementation subclasses this with whatever additional data it
149 struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
150 unsigned int queue_id; /* OpenFlow queue ID. */
153 /* A particular kind of traffic control. Each implementation generally maps to
154 * one particular Linux qdisc class.
156 * The functions below return 0 if successful or a positive errno value on
157 * failure, except where otherwise noted. All of them must be provided, except
158 * where otherwise noted. */
160 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
161 * This is null for tc_ops_default and tc_ops_other, for which there are no
162 * appropriate values. */
163 const char *linux_name;
165 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
166 const char *ovs_name;
168 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
169 * queues. The queues are numbered 0 through n_queues - 1. */
170 unsigned int n_queues;
172 /* Called to install this TC class on 'netdev'. The implementation should
173 * make the Netlink calls required to set up 'netdev' with the right qdisc
174 * and configure it according to 'details'. The implementation may assume
175 * that the current qdisc is the default; that is, there is no need for it
176 * to delete the current qdisc before installing itself.
178 * The contents of 'details' should be documented as valid for 'ovs_name'
179 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
180 * (which is built as ovs-vswitchd.conf.db(8)).
182 * This function must return 0 if and only if it sets 'netdev->tc' to an
183 * initialized 'struct tc'.
185 * (This function is null for tc_ops_other, which cannot be installed. For
186 * other TC classes it should always be nonnull.) */
187 int (*tc_install)(struct netdev *netdev, const struct smap *details);
189 /* Called when the netdev code determines (through a Netlink query) that
190 * this TC class's qdisc is installed on 'netdev', but we didn't install
191 * it ourselves and so don't know any of the details.
193 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
194 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
195 * implementation should parse the other attributes of 'nlmsg' as
196 * necessary to determine its configuration. If necessary it should also
197 * use Netlink queries to determine the configuration of queues on
200 * This function must return 0 if and only if it sets 'netdev->tc' to an
201 * initialized 'struct tc'. */
202 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
204 /* Destroys the data structures allocated by the implementation as part of
205 * 'tc'. (This includes destroying 'tc->queues' by calling
208 * The implementation should not need to perform any Netlink calls. If
209 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
210 * (But it may not be desirable.)
212 * This function may be null if 'tc' is trivial. */
213 void (*tc_destroy)(struct tc *tc);
215 /* Retrieves details of 'netdev->tc' configuration into 'details'.
217 * The implementation should not need to perform any Netlink calls, because
218 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
219 * cached the configuration.
221 * The contents of 'details' should be documented as valid for 'ovs_name'
222 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
223 * (which is built as ovs-vswitchd.conf.db(8)).
225 * This function may be null if 'tc' is not configurable.
227 int (*qdisc_get)(const struct netdev *netdev, struct smap *details);
229 /* Reconfigures 'netdev->tc' according to 'details', performing any
230 * required Netlink calls to complete the reconfiguration.
232 * The contents of 'details' should be documented as valid for 'ovs_name'
233 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
234 * (which is built as ovs-vswitchd.conf.db(8)).
236 * This function may be null if 'tc' is not configurable.
238 int (*qdisc_set)(struct netdev *, const struct smap *details);
240 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
241 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
243 * The contents of 'details' should be documented as valid for 'ovs_name'
244 * in the "other_config" column in the "Queue" table in
245 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
247 * The implementation should not need to perform any Netlink calls, because
248 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
249 * cached the queue configuration.
251 * This function may be null if 'tc' does not have queues ('n_queues' is
253 int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
254 struct smap *details);
256 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
257 * 'details', performing any required Netlink calls to complete the
258 * reconfiguration. The caller ensures that 'queue_id' is less than
261 * The contents of 'details' should be documented as valid for 'ovs_name'
262 * in the "other_config" column in the "Queue" table in
263 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
265 * This function may be null if 'tc' does not have queues or its queues are
266 * not configurable. */
267 int (*class_set)(struct netdev *, unsigned int queue_id,
268 const struct smap *details);
270 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
271 * tc_queue's within 'netdev->tc->queues'.
273 * This function may be null if 'tc' does not have queues or its queues
274 * cannot be deleted. */
275 int (*class_delete)(struct netdev *, struct tc_queue *queue);
277 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
278 * 'struct tc_queue's within 'netdev->tc->queues'.
280 * On success, initializes '*stats'.
282 * This function may be null if 'tc' does not have queues or if it cannot
283 * report queue statistics. */
284 int (*class_get_stats)(const struct netdev *netdev,
285 const struct tc_queue *queue,
286 struct netdev_queue_stats *stats);
288 /* Extracts queue stats from 'nlmsg', which is a response to a
289 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
291 * This function may be null if 'tc' does not have queues or if it cannot
292 * report queue statistics. */
293 int (*class_dump_stats)(const struct netdev *netdev,
294 const struct ofpbuf *nlmsg,
295 netdev_dump_queue_stats_cb *cb, void *aux);
299 tc_init(struct tc *tc, const struct tc_ops *ops)
302 hmap_init(&tc->queues);
306 tc_destroy(struct tc *tc)
308 hmap_destroy(&tc->queues);
311 static const struct tc_ops tc_ops_htb;
312 static const struct tc_ops tc_ops_hfsc;
313 static const struct tc_ops tc_ops_default;
314 static const struct tc_ops tc_ops_other;
316 static const struct tc_ops *const tcs[] = {
317 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
318 &tc_ops_hfsc, /* Hierarchical fair service curve. */
319 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
320 &tc_ops_other, /* Some other qdisc. */
324 static unsigned int tc_make_handle(unsigned int major, unsigned int minor);
325 static unsigned int tc_get_major(unsigned int handle);
326 static unsigned int tc_get_minor(unsigned int handle);
328 static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
329 static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
330 static unsigned int tc_buffer_per_jiffy(unsigned int rate);
332 static struct tcmsg *tc_make_request(const struct netdev *, int type,
333 unsigned int flags, struct ofpbuf *);
334 static int tc_transact(struct ofpbuf *request, struct ofpbuf **replyp);
335 static int tc_add_del_ingress_qdisc(struct netdev *netdev, bool add);
336 static int tc_add_policer(struct netdev *netdev, int kbits_rate,
339 static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
340 struct nlattr **options);
341 static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
342 struct nlattr **options,
343 struct netdev_queue_stats *);
344 static int tc_query_class(const struct netdev *,
345 unsigned int handle, unsigned int parent,
346 struct ofpbuf **replyp);
347 static int tc_delete_class(const struct netdev *, unsigned int handle);
349 static int tc_del_qdisc(struct netdev *netdev);
350 static int tc_query_qdisc(const struct netdev *netdev);
352 static int tc_calc_cell_log(unsigned int mtu);
353 static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
354 static void tc_put_rtab(struct ofpbuf *, uint16_t type,
355 const struct tc_ratespec *rate);
356 static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
358 struct netdev_linux {
361 struct shash_node *shash_node;
362 unsigned int cache_valid;
363 unsigned int change_seq;
365 bool miimon; /* Link status of last poll. */
366 long long int miimon_interval; /* Miimon Poll rate. Disabled if <= 0. */
367 struct timer miimon_timer;
369 /* The following are figured out "on demand" only. They are only valid
370 * when the corresponding VALID_* bit in 'cache_valid' is set. */
372 uint8_t etheraddr[ETH_ADDR_LEN];
373 struct in_addr address, netmask;
376 unsigned int ifi_flags;
377 long long int carrier_resets;
378 uint32_t kbits_rate; /* Policing data. */
379 uint32_t kbits_burst;
380 int vport_stats_error; /* Cached error code from vport_get_stats().
381 0 or an errno value. */
382 int netdev_mtu_error; /* Cached error code from SIOCGIFMTU or SIOCSIFMTU. */
383 int ether_addr_error; /* Cached error code from set/get etheraddr. */
384 int netdev_policing_error; /* Cached error code from set policing. */
385 int get_features_error; /* Cached error code from ETHTOOL_GSET. */
386 int get_ifindex_error; /* Cached error code from SIOCGIFINDEX. */
388 enum netdev_features current; /* Cached from ETHTOOL_GSET. */
389 enum netdev_features advertised; /* Cached from ETHTOOL_GSET. */
390 enum netdev_features supported; /* Cached from ETHTOOL_GSET. */
391 enum netdev_features peer; /* Cached from ETHTOOL_GSET. */
393 struct ethtool_drvinfo drvinfo; /* Cached from ETHTOOL_GDRVINFO. */
397 struct tap_state tap;
401 struct netdev_rx_linux {
407 static const struct netdev_rx_class netdev_rx_linux_class;
409 /* Sockets used for ioctl operations. */
410 static int af_inet_sock = -1; /* AF_INET, SOCK_DGRAM. */
412 /* A Netlink routing socket that is not subscribed to any multicast groups. */
413 static struct nl_sock *rtnl_sock;
415 /* This is set pretty low because we probably won't learn anything from the
416 * additional log messages. */
417 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
419 static int netdev_linux_init(void);
421 static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
422 int cmd, const char *cmd_name);
423 static int netdev_linux_do_ioctl(const char *name, struct ifreq *, int cmd,
424 const char *cmd_name);
425 static int netdev_linux_get_ipv4(const struct netdev *, struct in_addr *,
426 int cmd, const char *cmd_name);
427 static int get_flags(const struct netdev *, unsigned int *flags);
428 static int set_flags(const char *, unsigned int flags);
429 static int do_get_ifindex(const char *netdev_name);
430 static int get_ifindex(const struct netdev *, int *ifindexp);
431 static int do_set_addr(struct netdev *netdev,
432 int ioctl_nr, const char *ioctl_name,
433 struct in_addr addr);
434 static int get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN]);
435 static int set_etheraddr(const char *netdev_name, const uint8_t[ETH_ADDR_LEN]);
436 static int get_stats_via_netlink(int ifindex, struct netdev_stats *stats);
437 static int get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats);
438 static int af_packet_sock(void);
439 static void netdev_linux_miimon_run(void);
440 static void netdev_linux_miimon_wait(void);
443 is_netdev_linux_class(const struct netdev_class *netdev_class)
445 return netdev_class->init == netdev_linux_init;
449 is_tap_netdev(const struct netdev *netdev)
451 return netdev_get_class(netdev) == &netdev_tap_class;
454 static struct netdev_linux *
455 netdev_linux_cast(const struct netdev *netdev)
457 ovs_assert(is_netdev_linux_class(netdev_get_class(netdev)));
459 return CONTAINER_OF(netdev, struct netdev_linux, up);
462 static struct netdev_rx_linux *
463 netdev_rx_linux_cast(const struct netdev_rx *rx)
465 netdev_rx_assert_class(rx, &netdev_rx_linux_class);
466 return CONTAINER_OF(rx, struct netdev_rx_linux, up);
/* Module setup: creates the AF_INET datagram socket kept in 'af_inet_sock'
 * (the file-level socket used for ioctl operations) and the rtnetlink socket
 * kept in 'rtnl_sock'.
 *
 * 'status' is static, so the value computed here outlives the call.  It is 0
 * or the errno-style code of the step that failed.
 *
 * NOTE(review): the conditions guarding each step and the return statement
 * are on lines not visible in this listing. */
470 netdev_linux_init(void)
472 static int status = -1;
474 /* Create AF_INET socket. */
475 af_inet_sock = socket(AF_INET, SOCK_DGRAM, 0);
/* Capture errno right away, before any other call can overwrite it. */
476 status = af_inet_sock >= 0 ? 0 : errno;
478 VLOG_ERR("failed to create inet socket: %s", ovs_strerror(status));
481 /* Create rtnetlink socket. */
483 status = nl_sock_create(NETLINK_ROUTE, &rtnl_sock);
485 VLOG_ERR_RL(&rl, "failed to create rtnetlink socket: %s",
486 ovs_strerror(status));
494 netdev_linux_run(void)
496 rtnetlink_link_run();
497 netdev_linux_miimon_run();
501 netdev_linux_wait(void)
503 rtnetlink_link_wait();
504 netdev_linux_miimon_wait();
/* Records a state change on 'dev'.
 *
 * 'ifi_flags' is the device's new interface-flag word.  'mask' is the set of
 * VALID_* bits in 'dev->cache_valid' that survive the change; every other
 * cached item is invalidated. */
508 netdev_linux_changed(struct netdev_linux *dev,
509 unsigned int ifi_flags, unsigned int mask)
/* A 'change_seq' of zero is special-cased; the handling itself is on lines
 * not shown in this listing. */
512 if (!dev->change_seq) {
/* XOR isolates the flag bits that differ; a flip of IFF_RUNNING in either
 * direction is counted as one carrier reset. */
516 if ((dev->ifi_flags ^ ifi_flags) & IFF_RUNNING) {
517 dev->carrier_resets++;
519 dev->ifi_flags = ifi_flags;
/* Keep only the cache bits the caller asked to preserve. */
521 dev->cache_valid &= mask;
/* Applies the rtnetlink link notification 'change' to 'dev'.
 *
 * For RTM_NEWLINK the cache is invalidated except for driver info, and then
 * MTU, Ethernet address and ifindex are re-seeded from the message itself,
 * clearing the matching cached error codes.  The final call drops the whole
 * cache.
 * NOTE(review): the branch structure around that final call (presumably the
 * non-RTM_NEWLINK case) and any guard on the MTU update are on lines not
 * visible here. */
525 netdev_linux_update(struct netdev_linux *dev,
526 const struct rtnetlink_link_change *change)
528 if (change->nlmsg_type == RTM_NEWLINK) {
/* VALID_DRVINFO is the only cache bit preserved across a link change. */
530 netdev_linux_changed(dev, change->ifi_flags, VALID_DRVINFO);
532 /* Update netdev from rtnl-change msg. */
534 dev->mtu = change->mtu;
535 dev->cache_valid |= VALID_MTU;
536 dev->netdev_mtu_error = 0;
/* An all-zero address in the message is not cached. */
539 if (!eth_addr_is_zero(change->addr)) {
540 memcpy(dev->etheraddr, change->addr, ETH_ADDR_LEN);
541 dev->cache_valid |= VALID_ETHERADDR;
542 dev->ether_addr_error = 0;
545 dev->ifindex = change->ifi_index;
546 dev->cache_valid |= VALID_IFINDEX;
547 dev->get_ifindex_error = 0;
/* A mask of 0 invalidates every cached item. */
550 netdev_linux_changed(dev, change->ifi_flags, 0);
/* Callback handed to rtnetlink_link_notifier_create() by
 * cache_notifier_ref().  'aux' is unused.
 *
 * Two paths are visible.  The first updates only the device named in
 * 'change'.  The second sweeps every netdev_linux_class device, re-reads its
 * flags and invalidates its whole cache.
 * NOTE(review): the condition choosing between the two paths is on a line
 * not visible here (presumably whether 'change' is null). */
555 netdev_linux_cache_cb(const struct rtnetlink_link_change *change,
556 void *aux OVS_UNUSED)
558 struct netdev_linux *dev;
/* Only devices known to the netdev layer that belong to this provider are
 * updated. */
560 struct netdev *base_dev = netdev_from_name(change->ifname);
561 if (base_dev && is_netdev_linux_class(netdev_get_class(base_dev))) {
562 netdev_linux_update(netdev_linux_cast(base_dev), change);
565 struct shash device_shash;
566 struct shash_node *node;
568 shash_init(&device_shash);
569 netdev_get_devices(&netdev_linux_class, &device_shash);
570 SHASH_FOR_EACH (node, &device_shash) {
/* NOTE(review): the return value of get_flags() is not checked, so 'flags'
 * is used even if the query failed. */
575 get_flags(&dev->up, &flags);
576 netdev_linux_changed(dev, flags, 0);
578 shash_destroy(&device_shash);
/* Takes one reference on the shared rtnetlink link notifier.  The first
 * reference creates it, with netdev_linux_cache_cb() as its callback.
 *
 * NOTE(review): the handling of a failed creation and the return value are
 * on lines not visible here. */
583 cache_notifier_ref(void)
585 if (!cache_notifier_refcount) {
/* A zero refcount must mean no notifier exists yet. */
586 ovs_assert(!netdev_linux_cache_notifier);
588 netdev_linux_cache_notifier =
589 rtnetlink_link_notifier_create(netdev_linux_cache_cb, NULL);
591 if (!netdev_linux_cache_notifier) {
595 cache_notifier_refcount++;
/* Drops one reference on the shared rtnetlink link notifier.  When the last
 * reference goes away the notifier is destroyed and the global pointer is
 * reset to NULL, so a later cache_notifier_ref() can create it again. */
601 cache_notifier_unref(void)
/* Calling this without a matching cache_notifier_ref() is a caller bug. */
603 ovs_assert(cache_notifier_refcount > 0);
604 if (!--cache_notifier_refcount) {
605 ovs_assert(netdev_linux_cache_notifier);
606 rtnetlink_link_notifier_destroy(netdev_linux_cache_notifier);
607 netdev_linux_cache_notifier = NULL;
611 /* Creates system and internal devices.
 *
 * Takes a reference on the shared cache notifier, allocates a zeroed
 * 'struct netdev_linux', initializes it with 'name' and 'class', and primes
 * 'ifi_flags' with get_flags().  On the success path the new device is
 * stored in '*netdevp'.
 *
 * NOTE(review): the handling of a cache_notifier_ref() failure, of
 * get_flags() errors other than ENODEV, and the return statements are on
 * lines not visible in this listing. */
613 netdev_linux_create(const struct netdev_class *class, const char *name,
614 struct netdev **netdevp)
616 struct netdev_linux *netdev;
619 error = cache_notifier_ref();
624 netdev = xzalloc(sizeof *netdev);
/* Start at 1: netdev_linux_changed() special-cases a 'change_seq' of 0. */
625 netdev->change_seq = 1;
626 netdev_init(&netdev->up, name, class);
627 error = get_flags(&netdev->up, &netdev->ifi_flags);
628 if (error == ENODEV) {
629 if (class != &netdev_internal_class) {
630 /* The device does not exist, so don't allow it to be opened. */
/* Undo both the netdev_init() and the notifier reference taken above. */
631 netdev_uninit(&netdev->up, false);
632 cache_notifier_unref();
636 /* "Internal" netdevs have to be created as netdev objects before
637 * they exist in the kernel, because creating them in the kernel
638 * happens by passing a netdev object to dpif_port_add().
639 * Therefore, ignore the error. */
643 *netdevp = &netdev->up;
647 /* For most types of netdevs we open the device for each call of
648 * netdev_open(). However, this is not the case with tap devices,
649 * since it is only possible to open the device once. In this
650 * situation we share a single file descriptor, and consequently
651 * buffers, across all readers. Therefore once data is read it will
652 * be unavailable to other reads for tap devices. */
/* Creates a tap device named 'name': opens /dev/net/tun, issues TUNSETIFF
 * with IFF_TAP | IFF_NO_PI, makes the fd non-blocking, and on success stores
 * the new netdev (always of class netdev_tap_class; 'class' is unused) in
 * '*netdevp'. */
654 netdev_linux_create_tap(const struct netdev_class *class OVS_UNUSED,
655 const char *name, struct netdev **netdevp)
657 struct netdev_linux *netdev;
658 struct tap_state *state;
659 static const char tap_dev[] = "/dev/net/tun";
663 netdev = xzalloc(sizeof *netdev);
664 state = &netdev->state.tap;
666 error = cache_notifier_ref();
671 /* Open tap device. */
672 state->fd = open(tap_dev, O_RDWR);
675 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, ovs_strerror(error));
676 goto error_unref_notifier;
679 /* Create tap device. */
680 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
681 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
682 if (ioctl(state->fd, TUNSETIFF, &ifr) == -1) {
683 VLOG_WARN("%s: creating tap device failed: %s", name,
684 ovs_strerror(errno));
/* NOTE(review): errno is consumed by the VLOG_WARN above.  If 'error' is
 * assigned from errno only after the log call (the assignment is not visible
 * here), logging may already have overwritten it -- confirm that it is saved
 * first. */
686 goto error_unref_notifier;
689 /* Make non-blocking. */
690 error = set_nonblocking(state->fd);
692 goto error_unref_notifier;
695 netdev_init(&netdev->up, name, &netdev_tap_class);
696 *netdevp = &netdev->up;
/* NOTE(review): the visible cleanup only drops the notifier reference.
 * Confirm that 'state->fd' is closed when TUNSETIFF or set_nonblocking()
 * fails after a successful open(); otherwise the descriptor leaks. */
699 error_unref_notifier:
700 cache_notifier_unref();
707 destroy_tap(struct netdev_linux *netdev)
709 struct tap_state *state = &netdev->state.tap;
711 if (state->fd >= 0) {
716 /* Destroys the netdev device 'netdev_'. */
718 netdev_linux_destroy(struct netdev *netdev_)
720 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
722 if (netdev->tc && netdev->tc->ops->tc_destroy) {
723 netdev->tc->ops->tc_destroy(netdev->tc);
726 if (netdev_get_class(netdev_) == &netdev_tap_class) {
731 cache_notifier_unref();
/* Opens a receive handle on 'netdev_'.
 *
 * Tap devices reuse the descriptor opened when the device was created.  Other
 * devices get a new non-blocking PF_PACKET/SOCK_RAW socket bound to the
 * device's ifindex, with a socket filter attached so that only inbound
 * packets are delivered.
 *
 * NOTE(review): the error-exit paths, the initialization of the remaining
 * 'rx' members and the store to '*rxp' are on lines not visible here. */
735 netdev_linux_rx_open(struct netdev *netdev_, struct netdev_rx **rxp)
737 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
738 bool is_tap = is_tap_netdev(netdev_);
739 struct netdev_rx_linux *rx;
/* Tap: share the single fd owned by the netdev. */
744 fd = netdev->state.tap.fd;
746 struct sockaddr_ll sll;
748 /* Result of tcpdump -dd inbound */
/* The load offset 0xfffff004 is SKF_AD_OFF + SKF_AD_PKTTYPE, i.e. the
 * kernel's ancillary "packet type" value rather than packet bytes.  A value
 * of 4 is PACKET_OUTGOING, so our own transmissions are dropped (ret #0) and
 * everything else is accepted (ret #65535). */
749 static struct sock_filter filt[] = {
750 { 0x28, 0, 0, 0xfffff004 }, /* ldh [0] */
751 { 0x15, 0, 1, 0x00000004 }, /* jeq #4 jt 2 jf 3 */
752 { 0x6, 0, 0, 0x00000000 }, /* ret #0 */
753 { 0x6, 0, 0, 0x0000ffff } /* ret #65535 */
755 static struct sock_fprog fprog = { ARRAY_SIZE(filt), filt };
757 /* Create file descriptor. */
758 fd = socket(PF_PACKET, SOCK_RAW, 0);
761 VLOG_ERR("failed to create raw socket (%s)", ovs_strerror(error));
765 /* Set non-blocking mode. */
766 error = set_nonblocking(fd);
771 /* Get ethernet device index. */
772 error = get_ifindex(&netdev->up, &ifindex);
777 /* Bind to specific ethernet device. */
778 memset(&sll, 0, sizeof sll);
779 sll.sll_family = AF_PACKET;
780 sll.sll_ifindex = ifindex;
/* ETH_P_ALL: receive every protocol, not just one EtherType. */
781 sll.sll_protocol = (OVS_FORCE unsigned short int) htons(ETH_P_ALL);
782 if (bind(fd, (struct sockaddr *) &sll, sizeof sll) < 0) {
784 VLOG_ERR("%s: failed to bind raw socket (%s)",
785 netdev_get_name(netdev_), ovs_strerror(error));
789 /* Filter for only inbound packets. */
790 error = setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER, &fprog,
794 VLOG_ERR("%s: failed attach filter (%s)",
795 netdev_get_name(netdev_), ovs_strerror(error));
800 rx = xmalloc(sizeof *rx);
801 netdev_rx_init(&rx->up, netdev_, &netdev_rx_linux_class);
816 netdev_rx_linux_destroy(struct netdev_rx *rx_)
818 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
/* Receives one packet from 'rx_' into the 'size'-byte buffer at 'data'.
 *
 * The packet is obtained with read() or with recv(..., MSG_TRUNC), depending
 * on a condition that is not visible in this listing (presumably whether the
 * receiver is a tap device).  The call is retried for as long as it fails
 * with EINTR.  Failures other than EAGAIN are logged, rate-limited, with the
 * device name and the error text. */
827 netdev_rx_linux_recv(struct netdev_rx *rx_, void *data, size_t size)
829 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
834 ? read(rx->fd, data, size)
835 : recv(rx->fd, data, size, MSG_TRUNC));
836 } while (retval < 0 && errno == EINTR);
840 } else if (retval >= 0) {
/* EAGAIN only means "nothing to read right now", so it is not logged. */
843 if (errno != EAGAIN) {
/* Argument order must follow the format string: device name first, then the
 * error description. */
844 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
845 netdev_rx_get_name(rx_), ovs_strerror(errno));
852 netdev_rx_linux_wait(struct netdev_rx *rx_)
854 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
855 poll_fd_wait(rx->fd, POLLIN);
859 netdev_rx_linux_drain(struct netdev_rx *rx_)
861 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
864 int error = netdev_linux_do_ioctl(netdev_rx_get_name(rx_), &ifr,
865 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
869 drain_fd(rx->fd, ifr.ifr_qlen);
872 return drain_rcvbuf(rx->fd);
876 /* Sends 'buffer' on 'netdev'. Returns 0 if successful, otherwise a positive
877 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
878 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
879 * the packet is too big or too small to transmit on the device.
881 * The caller retains ownership of 'buffer' in all cases.
883 * The kernel maintains a packet transmission queue, so the caller is not
884 * expected to do additional queuing of packets. */
886 netdev_linux_send(struct netdev *netdev_, const void *data, size_t size)
891 if (!is_tap_netdev(netdev_)) {
892 /* Use our AF_PACKET socket to send to this device. */
893 struct sockaddr_ll sll;
900 sock = af_packet_sock();
905 error = get_ifindex(netdev_, &ifindex);
910 /* We don't bother setting most fields in sockaddr_ll because the
911 * kernel ignores them for SOCK_RAW. */
912 memset(&sll, 0, sizeof sll);
913 sll.sll_family = AF_PACKET;
914 sll.sll_ifindex = ifindex;
916 iov.iov_base = CONST_CAST(void *, data);
920 msg.msg_namelen = sizeof sll;
923 msg.msg_control = NULL;
924 msg.msg_controllen = 0;
927 retval = sendmsg(sock, &msg, 0);
929 /* Use the tap fd to send to this device. This is essential for
930 * tap devices, because packets sent to a tap device with an
931 * AF_PACKET socket will loop back to be *received* again on the
932 * tap device. This doesn't occur on other interface types
933 * because we attach a socket filter to the rx socket. */
934 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
936 retval = write(netdev->state.tap.fd, data, size);
940 /* The Linux AF_PACKET implementation never blocks waiting for room
941 * for packets, instead returning ENOBUFS. Translate this into
942 * EAGAIN for the caller. */
943 if (errno == ENOBUFS) {
945 } else if (errno == EINTR) {
947 } else if (errno != EAGAIN) {
948 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
949 netdev_get_name(netdev_), ovs_strerror(errno));
952 } else if (retval != size) {
953 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%zd bytes of "
954 "%zu) on %s", retval, size, netdev_get_name(netdev_));
962 /* Registers with the poll loop to wake up from the next call to poll_block()
963 * when the packet transmission queue has sufficient room to transmit a packet
964 * with netdev_send().
966 * The kernel maintains a packet transmission queue, so the client is not
967 * expected to do additional queuing of packets. Thus, this function is
968 * unlikely to ever be used. It is included for completeness. */
970 netdev_linux_send_wait(struct netdev *netdev)
972 if (is_tap_netdev(netdev)) {
973 /* TAP device always accepts packets.*/
974 poll_immediate_wake();
978 /* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
979 * otherwise a positive errno value. */
981 netdev_linux_set_etheraddr(struct netdev *netdev_,
982 const uint8_t mac[ETH_ADDR_LEN])
984 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
985 struct netdev_saved_flags *sf = NULL;
/* A valid cache entry is consulted before touching the device: a cached
 * error is returned as-is, and the cached address is compared with 'mac'
 * (what happens on a match is on a line not shown here).  Otherwise the
 * entry is invalidated ahead of the update. */
988 if (netdev->cache_valid & VALID_ETHERADDR) {
989 if (netdev->ether_addr_error) {
990 return netdev->ether_addr_error;
992 if (eth_addr_equals(netdev->etheraddr, mac)) {
995 netdev->cache_valid &= ~VALID_ETHERADDR;
998 /* Tap devices must be brought down before setting the address. */
999 if (is_tap_netdev(netdev_)) {
1000 netdev_turn_flags_off(netdev_, NETDEV_UP, &sf);
1002 error = set_etheraddr(netdev_get_name(netdev_), mac);
/* Only success and ENODEV are cached.  Any other error leaves the cache
 * entry invalid, so a later call queries the device again. */
1003 if (!error || error == ENODEV) {
1004 netdev->ether_addr_error = error;
1005 netdev->cache_valid |= VALID_ETHERADDR;
1007 memcpy(netdev->etheraddr, mac, ETH_ADDR_LEN);
/* 'sf' is still NULL unless the tap branch above saved the flags. */
1011 netdev_restore_flags(sf);
1016 /* Copies 'netdev''s MAC address to 'mac' which is passed as param. */
1018 netdev_linux_get_etheraddr(const struct netdev *netdev_,
1019 uint8_t mac[ETH_ADDR_LEN])
1021 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1023 if (!(netdev->cache_valid & VALID_ETHERADDR)) {
1024 int error = get_etheraddr(netdev_get_name(netdev_),
1027 netdev->ether_addr_error = error;
1028 netdev->cache_valid |= VALID_ETHERADDR;
1031 if (!netdev->ether_addr_error) {
1032 memcpy(mac, netdev->etheraddr, ETH_ADDR_LEN);
1035 return netdev->ether_addr_error;
1038 /* Returns the maximum size of transmitted (and received) packets on 'netdev',
1039 * in bytes, not including the hardware header; thus, this is typically 1500
1040 * bytes for Ethernet devices. */
1042 netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
1044 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1045 if (!(netdev->cache_valid & VALID_MTU)) {
1049 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
1050 SIOCGIFMTU, "SIOCGIFMTU");
1052 netdev->netdev_mtu_error = error;
1053 netdev->mtu = ifr.ifr_mtu;
1054 netdev->cache_valid |= VALID_MTU;
1057 if (!netdev->netdev_mtu_error) {
1058 *mtup = netdev->mtu;
1060 return netdev->netdev_mtu_error;
1063 /* Sets the maximum size of transmitted (MTU) for given device using linux
1064 * networking ioctl interface.
1067 netdev_linux_set_mtu(const struct netdev *netdev_, int mtu)
1069 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1073 if (netdev->cache_valid & VALID_MTU) {
1074 if (netdev->netdev_mtu_error) {
1075 return netdev->netdev_mtu_error;
1077 if (netdev->mtu == mtu) {
1080 netdev->cache_valid &= ~VALID_MTU;
1083 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
1084 SIOCSIFMTU, "SIOCSIFMTU");
1085 if (!error || error == ENODEV) {
1086 netdev->netdev_mtu_error = error;
1087 netdev->mtu = ifr.ifr_mtu;
1088 netdev->cache_valid |= VALID_MTU;
1093 /* Returns the ifindex of 'netdev', if successful, as a positive number.
1094 * On failure, returns a negative errno value. */
1096 netdev_linux_get_ifindex(const struct netdev *netdev)
1100 error = get_ifindex(netdev, &ifindex);
1101 return error ? -error : ifindex;
1105 netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
1107 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1109 if (netdev->miimon_interval > 0) {
1110 *carrier = netdev->miimon;
1112 *carrier = (netdev->ifi_flags & IFF_RUNNING) != 0;
1118 static long long int
1119 netdev_linux_get_carrier_resets(const struct netdev *netdev)
1121 return netdev_linux_cast(netdev)->carrier_resets;
1125 netdev_linux_do_miimon(const char *name, int cmd, const char *cmd_name,
1126 struct mii_ioctl_data *data)
1131 memset(&ifr, 0, sizeof ifr);
1132 memcpy(&ifr.ifr_data, data, sizeof *data);
1133 error = netdev_linux_do_ioctl(name, &ifr, cmd, cmd_name);
1134 memcpy(data, &ifr.ifr_data, sizeof *data);
1140 netdev_linux_get_miimon(const char *name, bool *miimon)
1142 struct mii_ioctl_data data;
1147 memset(&data, 0, sizeof data);
1148 error = netdev_linux_do_miimon(name, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
1150 /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
1151 data.reg_num = MII_BMSR;
1152 error = netdev_linux_do_miimon(name, SIOCGMIIREG, "SIOCGMIIREG",
1156 *miimon = !!(data.val_out & BMSR_LSTATUS);
1158 VLOG_WARN_RL(&rl, "%s: failed to query MII", name);
1161 struct ethtool_cmd ecmd;
1163 VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
1166 COVERAGE_INC(netdev_get_ethtool);
1167 memset(&ecmd, 0, sizeof ecmd);
1168 error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
1171 struct ethtool_value eval;
1173 memcpy(&eval, &ecmd, sizeof eval);
1174 *miimon = !!eval.data;
1176 VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
1184 netdev_linux_set_miimon_interval(struct netdev *netdev_,
1185 long long int interval)
1187 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1189 interval = interval > 0 ? MAX(interval, 100) : 0;
1190 if (netdev->miimon_interval != interval) {
1191 netdev->miimon_interval = interval;
1192 timer_set_expired(&netdev->miimon_timer);
/* Polls MII link status for every netdev_linux_class device that has miimon
 * enabled and whose poll timer has expired.  When the polled status differs
 * from the cached one, the new value is stored and the device is marked as
 * changed with its whole cache invalidated.  The timer is then re-armed for
 * another 'miimon_interval'. */
1199 netdev_linux_miimon_run(void)
1201 struct shash device_shash;
1202 struct shash_node *node;
1204 shash_init(&device_shash);
1205 netdev_get_devices(&netdev_linux_class, &device_shash);
1206 SHASH_FOR_EACH (node, &device_shash) {
1207 struct netdev_linux *dev = node->data;
/* Devices with miimon disabled (interval <= 0) or not yet due are skipped;
 * the skip statement itself is on a line not shown here. */
1210 if (dev->miimon_interval <= 0 || !timer_expired(&dev->miimon_timer)) {
1214 netdev_linux_get_miimon(dev->up.name, &miimon);
1215 if (miimon != dev->miimon) {
1216 dev->miimon = miimon;
/* The flags are unchanged; passing them back only bumps the change state
 * and, with a mask of 0, invalidates every cached item. */
1217 netdev_linux_changed(dev, dev->ifi_flags, 0);
1220 timer_set_duration(&dev->miimon_timer, dev->miimon_interval);
1223 shash_destroy(&device_shash);
/* For every device of netdev_linux_class that has a positive miimon
 * interval, calls timer_wait() on its miimon timer. */
1227 netdev_linux_miimon_wait(void)
1229 struct shash device_shash;
1230 struct shash_node *node;
1232 shash_init(&device_shash);
1233 netdev_get_devices(&netdev_linux_class, &device_shash);
1234 SHASH_FOR_EACH (node, &device_shash) {
1235 struct netdev_linux *dev = node->data;
1237 if (dev->miimon_interval > 0) {
1238 timer_wait(&dev->miimon_timer);
1241 shash_destroy(&device_shash);
1244 /* Check whether we can use RTM_GETLINK to get network device statistics.
1245 * In pre-2.6.19 kernels, this was only available if wireless extensions were
1248 check_for_working_netlink_stats(void)
1250 /* Decide on the netdev_get_stats() implementation to use. Netlink is
1251 * preferable, so if that works, we'll use it. */
/* The probe is run against the loopback device "lo". */
1252 int ifindex = do_get_ifindex("lo");
1254 VLOG_WARN("failed to get ifindex for lo, "
1255 "obtaining netdev stats from proc");
/* 'stats' only receives the probe result; its contents are not used. */
1258 struct netdev_stats stats;
1259 int error = get_stats_via_netlink(ifindex, &stats);
1261 VLOG_DBG("obtaining netdev stats via rtnetlink");
1264 VLOG_INFO("RTM_GETLINK failed (%s), obtaining netdev stats "
1265 "via proc (you are probably running a pre-2.6.19 "
1266 "kernel)", ovs_strerror(error));
/* Exchanges the 64-bit values stored at 'a' and 'b'. */
static void
swap_uint64(uint64_t *a, uint64_t *b)
{
    const uint64_t old_a = *a;
    const uint64_t old_b = *b;

    *a = old_b;
    *b = old_a;
}
1280 /* Copies 'src' into 'dst', performing format conversion in the process.
1282 * 'src' is allowed to be misaligned. */
1284 netdev_stats_from_ovs_vport_stats(struct netdev_stats *dst,
1285 const struct ovs_vport_stats *src)
/* Counters taken from 'src', read with unaligned-safe loads. */
1287 dst->rx_packets = get_unaligned_u64(&src->rx_packets);
1288 dst->tx_packets = get_unaligned_u64(&src->tx_packets);
1289 dst->rx_bytes = get_unaligned_u64(&src->rx_bytes);
1290 dst->tx_bytes = get_unaligned_u64(&src->tx_bytes);
1291 dst->rx_errors = get_unaligned_u64(&src->rx_errors);
1292 dst->tx_errors = get_unaligned_u64(&src->tx_errors);
1293 dst->rx_dropped = get_unaligned_u64(&src->rx_dropped);
1294 dst->tx_dropped = get_unaligned_u64(&src->tx_dropped);
/* Counters that are not read from 'src' and are reported as zero. */
1296 dst->collisions = 0;
1297 dst->rx_length_errors = 0;
1298 dst->rx_over_errors = 0;
1299 dst->rx_crc_errors = 0;
1300 dst->rx_frame_errors = 0;
1301 dst->rx_fifo_errors = 0;
1302 dst->rx_missed_errors = 0;
1303 dst->tx_aborted_errors = 0;
1304 dst->tx_carrier_errors = 0;
1305 dst->tx_fifo_errors = 0;
1306 dst->tx_heartbeat_errors = 0;
1307 dst->tx_window_errors = 0;
/* Looks up the vport named after 'netdev' with dpif_linux_vport_get() and
 * converts the stats in the reply into '*stats'.  A reply that carries no
 * stats is handled by its own branch, separately from a failed lookup. */
1311 get_stats_via_vport__(const struct netdev *netdev, struct netdev_stats *stats)
1313 struct dpif_linux_vport reply;
1317 error = dpif_linux_vport_get(netdev_get_name(netdev), &reply, &buf);
1320 } else if (!reply.stats) {
1325 netdev_stats_from_ovs_vport_stats(stats, reply.stats);
/* Refreshes '*stats' from the vport layer and records the outcome in
 * netdev->vport_stats_error, setting VALID_VPORT_STAT_ERROR in cache_valid.
 * The query is skipped only when a nonzero vport_stats_error is already
 * cached as valid.  Failures other than ENOENT are logged (rate-limited). */
1333 get_stats_via_vport(const struct netdev *netdev_,
1334 struct netdev_stats *stats)
1336 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1338 if (!netdev->vport_stats_error ||
1339 !(netdev->cache_valid & VALID_VPORT_STAT_ERROR)) {
1342 error = get_stats_via_vport__(netdev_, stats);
1343 if (error && error != ENOENT) {
1344 VLOG_WARN_RL(&rl, "%s: obtaining netdev stats via vport failed "
1346 netdev_get_name(netdev_), ovs_strerror(error));
1348 netdev->vport_stats_error = error;
1349 netdev->cache_valid |= VALID_VPORT_STAT_ERROR;
/* Fetches the kernel's statistics for 'netdev_' into '*stats'.  Whether
 * netlink can be used is decided once by check_for_working_netlink_stats()
 * and remembered in a function-local static (-1 means "not decided yet").
 * The netlink path resolves the ifindex first; get_stats_via_proc() is the
 * alternative source.  A failure is logged with its numeric error code. */
1354 netdev_linux_sys_get_stats(const struct netdev *netdev_,
1355 struct netdev_stats *stats)
1357 static int use_netlink_stats = -1;
1360 if (use_netlink_stats < 0) {
1361 use_netlink_stats = check_for_working_netlink_stats();
1364 if (use_netlink_stats) {
1367 error = get_ifindex(netdev_, &ifindex);
1369 error = get_stats_via_netlink(ifindex, stats);
1372 error = get_stats_via_proc(netdev_get_name(netdev_), stats);
1376 VLOG_WARN_RL(&rl, "%s: linux-sys get stats failed %d",
1377 netdev_get_name(netdev_), error);
1383 /* Retrieves current device stats for 'netdev-linux'. */
1385 netdev_linux_get_stats(const struct netdev *netdev_,
1386 struct netdev_stats *stats)
1388 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1389 struct netdev_stats dev_stats;
/* Vport-layer stats go straight into '*stats'; the kernel's own counters
 * are collected separately in 'dev_stats'. */
1392 get_stats_via_vport(netdev_, stats);
1394 error = netdev_linux_sys_get_stats(netdev_, &dev_stats);
1397 if (netdev->vport_stats_error) {
1404 if (netdev->vport_stats_error) {
1405 /* stats not available from OVS then use ioctl stats. */
/* The kernel's error-type counters are added on top of the values already
 * held in '*stats'. */
1408 stats->rx_errors += dev_stats.rx_errors;
1409 stats->tx_errors += dev_stats.tx_errors;
1410 stats->rx_dropped += dev_stats.rx_dropped;
1411 stats->tx_dropped += dev_stats.tx_dropped;
1412 stats->multicast += dev_stats.multicast;
1413 stats->collisions += dev_stats.collisions;
1414 stats->rx_length_errors += dev_stats.rx_length_errors;
1415 stats->rx_over_errors += dev_stats.rx_over_errors;
1416 stats->rx_crc_errors += dev_stats.rx_crc_errors;
1417 stats->rx_frame_errors += dev_stats.rx_frame_errors;
1418 stats->rx_fifo_errors += dev_stats.rx_fifo_errors;
1419 stats->rx_missed_errors += dev_stats.rx_missed_errors;
1420 stats->tx_aborted_errors += dev_stats.tx_aborted_errors;
1421 stats->tx_carrier_errors += dev_stats.tx_carrier_errors;
1422 stats->tx_fifo_errors += dev_stats.tx_fifo_errors;
1423 stats->tx_heartbeat_errors += dev_stats.tx_heartbeat_errors;
1424 stats->tx_window_errors += dev_stats.tx_window_errors;
1429 /* Retrieves current device stats for 'netdev-tap' netdev or
1430 * netdev-internal. */
1432 netdev_tap_get_stats(const struct netdev *netdev_,
1433 struct netdev_stats *stats)
1435 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1436 struct netdev_stats dev_stats;
1439 get_stats_via_vport(netdev_, stats);
1441 error = netdev_linux_sys_get_stats(netdev_, &dev_stats);
1443 if (netdev->vport_stats_error) {
1450 /* If this port is an internal port then the transmit and receive stats
1451 * will appear to be swapped relative to the other ports since we are the
1452 * one sending the data, not a remote computer. For consistency, we swap
1453 * them back here. This does not apply if we are getting stats from the
1454 * vport layer because it always tracks stats from the perspective of the
1456 if (netdev->vport_stats_error) {
1458 swap_uint64(&stats->rx_packets, &stats->tx_packets);
1459 swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
1460 swap_uint64(&stats->rx_errors, &stats->tx_errors);
1461 swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
1462 stats->rx_length_errors = 0;
1463 stats->rx_over_errors = 0;
1464 stats->rx_crc_errors = 0;
1465 stats->rx_frame_errors = 0;
1466 stats->rx_fifo_errors = 0;
1467 stats->rx_missed_errors = 0;
1468 stats->tx_aborted_errors = 0;
1469 stats->tx_carrier_errors = 0;
1470 stats->tx_fifo_errors = 0;
1471 stats->tx_heartbeat_errors = 0;
1472 stats->tx_window_errors = 0;
/* Kernel drop and error counters are added with rx and tx exchanged;
 * multicast and collisions are added as they are. */
1474 stats->rx_dropped += dev_stats.tx_dropped;
1475 stats->tx_dropped += dev_stats.rx_dropped;
1477 stats->rx_errors += dev_stats.tx_errors;
1478 stats->tx_errors += dev_stats.rx_errors;
1480 stats->multicast += dev_stats.multicast;
1481 stats->collisions += dev_stats.collisions;
/* Retrieves stats for 'netdev_' from the vport layer only and returns the
 * value left in netdev->vport_stats_error afterwards. */
1487 netdev_internal_get_stats(const struct netdev *netdev_,
1488 struct netdev_stats *stats)
1490 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1492 get_stats_via_vport(netdev_, stats);
1493 return netdev->vport_stats_error;
/* Pushes the packet, byte, error and drop counters from 'stats' (rx and tx
 * each) to the vport named after 'netdev', using an OVS_VPORT_CMD_SET
 * transaction.  Other members of 'stats' are not transmitted. */
1497 netdev_internal_set_stats(struct netdev *netdev,
1498 const struct netdev_stats *stats)
1500 struct ovs_vport_stats vport_stats;
1501 struct dpif_linux_vport vport;
1504 vport_stats.rx_packets = stats->rx_packets;
1505 vport_stats.tx_packets = stats->tx_packets;
1506 vport_stats.rx_bytes = stats->rx_bytes;
1507 vport_stats.tx_bytes = stats->tx_bytes;
1508 vport_stats.rx_errors = stats->rx_errors;
1509 vport_stats.tx_errors = stats->tx_errors;
1510 vport_stats.rx_dropped = stats->rx_dropped;
1511 vport_stats.tx_dropped = stats->tx_dropped;
1513 dpif_linux_vport_init(&vport);
1514 vport.cmd = OVS_VPORT_CMD_SET;
1515 vport.name = netdev_get_name(netdev);
1516 vport.stats = &vport_stats;
1518 err = dpif_linux_vport_transact(&vport, NULL, NULL);
1520 /* If the vport layer doesn't know about the device, that doesn't mean it
1521 * doesn't exist (after all we were able to open it when netdev_open() was
1522 * called), it just means that it isn't attached and we'll be getting
1523 * stats a different way. */
1524 if (err == ENODEV) {
1532 netdev_linux_read_features(struct netdev_linux *netdev)
1534 struct ethtool_cmd ecmd;
1538 if (netdev->cache_valid & VALID_FEATURES) {
1542 COVERAGE_INC(netdev_get_ethtool);
1543 memset(&ecmd, 0, sizeof ecmd);
1544 error = netdev_linux_do_ethtool(netdev->up.name, &ecmd,
1545 ETHTOOL_GSET, "ETHTOOL_GSET");
1550 /* Supported features. */
1551 netdev->supported = 0;
1552 if (ecmd.supported & SUPPORTED_10baseT_Half) {
1553 netdev->supported |= NETDEV_F_10MB_HD;
1555 if (ecmd.supported & SUPPORTED_10baseT_Full) {
1556 netdev->supported |= NETDEV_F_10MB_FD;
1558 if (ecmd.supported & SUPPORTED_100baseT_Half) {
1559 netdev->supported |= NETDEV_F_100MB_HD;
1561 if (ecmd.supported & SUPPORTED_100baseT_Full) {
1562 netdev->supported |= NETDEV_F_100MB_FD;
1564 if (ecmd.supported & SUPPORTED_1000baseT_Half) {
1565 netdev->supported |= NETDEV_F_1GB_HD;
1567 if (ecmd.supported & SUPPORTED_1000baseT_Full) {
1568 netdev->supported |= NETDEV_F_1GB_FD;
1570 if (ecmd.supported & SUPPORTED_10000baseT_Full) {
1571 netdev->supported |= NETDEV_F_10GB_FD;
1573 if (ecmd.supported & SUPPORTED_TP) {
1574 netdev->supported |= NETDEV_F_COPPER;
1576 if (ecmd.supported & SUPPORTED_FIBRE) {
1577 netdev->supported |= NETDEV_F_FIBER;
1579 if (ecmd.supported & SUPPORTED_Autoneg) {
1580 netdev->supported |= NETDEV_F_AUTONEG;
1582 if (ecmd.supported & SUPPORTED_Pause) {
1583 netdev->supported |= NETDEV_F_PAUSE;
1585 if (ecmd.supported & SUPPORTED_Asym_Pause) {
1586 netdev->supported |= NETDEV_F_PAUSE_ASYM;
1589 /* Advertised features. */
1590 netdev->advertised = 0;
1591 if (ecmd.advertising & ADVERTISED_10baseT_Half) {
1592 netdev->advertised |= NETDEV_F_10MB_HD;
1594 if (ecmd.advertising & ADVERTISED_10baseT_Full) {
1595 netdev->advertised |= NETDEV_F_10MB_FD;
1597 if (ecmd.advertising & ADVERTISED_100baseT_Half) {
1598 netdev->advertised |= NETDEV_F_100MB_HD;
1600 if (ecmd.advertising & ADVERTISED_100baseT_Full) {
1601 netdev->advertised |= NETDEV_F_100MB_FD;
1603 if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
1604 netdev->advertised |= NETDEV_F_1GB_HD;
1606 if (ecmd.advertising & ADVERTISED_1000baseT_Full) {
1607 netdev->advertised |= NETDEV_F_1GB_FD;
1609 if (ecmd.advertising & ADVERTISED_10000baseT_Full) {
1610 netdev->advertised |= NETDEV_F_10GB_FD;
1612 if (ecmd.advertising & ADVERTISED_TP) {
1613 netdev->advertised |= NETDEV_F_COPPER;
1615 if (ecmd.advertising & ADVERTISED_FIBRE) {
1616 netdev->advertised |= NETDEV_F_FIBER;
1618 if (ecmd.advertising & ADVERTISED_Autoneg) {
1619 netdev->advertised |= NETDEV_F_AUTONEG;
1621 if (ecmd.advertising & ADVERTISED_Pause) {
1622 netdev->advertised |= NETDEV_F_PAUSE;
1624 if (ecmd.advertising & ADVERTISED_Asym_Pause) {
1625 netdev->advertised |= NETDEV_F_PAUSE_ASYM;
1628 /* Current settings. */
1630 if (speed == SPEED_10) {
1631 netdev->current = ecmd.duplex ? NETDEV_F_10MB_FD : NETDEV_F_10MB_HD;
1632 } else if (speed == SPEED_100) {
1633 netdev->current = ecmd.duplex ? NETDEV_F_100MB_FD : NETDEV_F_100MB_HD;
1634 } else if (speed == SPEED_1000) {
1635 netdev->current = ecmd.duplex ? NETDEV_F_1GB_FD : NETDEV_F_1GB_HD;
1636 } else if (speed == SPEED_10000) {
1637 netdev->current = NETDEV_F_10GB_FD;
1638 } else if (speed == 40000) {
1639 netdev->current = NETDEV_F_40GB_FD;
1640 } else if (speed == 100000) {
1641 netdev->current = NETDEV_F_100GB_FD;
1642 } else if (speed == 1000000) {
1643 netdev->current = NETDEV_F_1TB_FD;
1645 netdev->current = 0;
1648 if (ecmd.port == PORT_TP) {
1649 netdev->current |= NETDEV_F_COPPER;
1650 } else if (ecmd.port == PORT_FIBRE) {
1651 netdev->current |= NETDEV_F_FIBER;
1655 netdev->current |= NETDEV_F_AUTONEG;
1658 /* Peer advertisements. */
1659 netdev->peer = 0; /* XXX */
1662 netdev->cache_valid |= VALID_FEATURES;
1663 netdev->get_features_error = error;
1666 /* Stores the features supported by 'netdev' into each of '*current',
1667 * '*advertised', '*supported', and '*peer' that are non-null. Each value is a
1668 * bitmap of NETDEV_* bits. Returns 0 if successful, otherwise a positive
1671 netdev_linux_get_features(const struct netdev *netdev_,
1672 enum netdev_features *current,
1673 enum netdev_features *advertised,
1674 enum netdev_features *supported,
1675 enum netdev_features *peer)
1677 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1679 netdev_linux_read_features(netdev);
1681 if (!netdev->get_features_error) {
1682 *current = netdev->current;
1683 *advertised = netdev->advertised;
1684 *supported = netdev->supported;
1685 *peer = netdev->peer;
1687 return netdev->get_features_error;
1690 /* Set the features advertised by 'netdev' to 'advertise'. */
1692 netdev_linux_set_advertisements(struct netdev *netdev,
1693 enum netdev_features advertise)
1695 struct ethtool_cmd ecmd;
/* The current settings are read first; the same structure, with only
 * 'advertising' rebuilt from 'advertise', is written back at the end. */
1698 COVERAGE_INC(netdev_get_ethtool);
1699 memset(&ecmd, 0, sizeof ecmd);
1700 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1701 ETHTOOL_GSET, "ETHTOOL_GSET");
1706 ecmd.advertising = 0;
1707 if (advertise & NETDEV_F_10MB_HD) {
1708 ecmd.advertising |= ADVERTISED_10baseT_Half;
1710 if (advertise & NETDEV_F_10MB_FD) {
1711 ecmd.advertising |= ADVERTISED_10baseT_Full;
1713 if (advertise & NETDEV_F_100MB_HD) {
1714 ecmd.advertising |= ADVERTISED_100baseT_Half;
1716 if (advertise & NETDEV_F_100MB_FD) {
1717 ecmd.advertising |= ADVERTISED_100baseT_Full;
1719 if (advertise & NETDEV_F_1GB_HD) {
1720 ecmd.advertising |= ADVERTISED_1000baseT_Half;
1722 if (advertise & NETDEV_F_1GB_FD) {
1723 ecmd.advertising |= ADVERTISED_1000baseT_Full;
1725 if (advertise & NETDEV_F_10GB_FD) {
1726 ecmd.advertising |= ADVERTISED_10000baseT_Full;
1728 if (advertise & NETDEV_F_COPPER) {
1729 ecmd.advertising |= ADVERTISED_TP;
1731 if (advertise & NETDEV_F_FIBER) {
1732 ecmd.advertising |= ADVERTISED_FIBRE;
1734 if (advertise & NETDEV_F_AUTONEG) {
1735 ecmd.advertising |= ADVERTISED_Autoneg;
1737 if (advertise & NETDEV_F_PAUSE) {
1738 ecmd.advertising |= ADVERTISED_Pause;
1740 if (advertise & NETDEV_F_PAUSE_ASYM) {
1741 ecmd.advertising |= ADVERTISED_Asym_Pause;
1743 COVERAGE_INC(netdev_set_ethtool);
1744 return netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1745 ETHTOOL_SSET, "ETHTOOL_SSET");
1748 /* Attempts to set input rate limiting (policing) policy. Returns 0 if
1749 * successful, otherwise a positive errno value. */
1751 netdev_linux_set_policing(struct netdev *netdev_,
1752 uint32_t kbits_rate, uint32_t kbits_burst)
1754 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1755 const char *netdev_name = netdev_get_name(netdev_);
1759 kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
1760 : !kbits_burst ? 1000 /* Default to 1000 kbits if 0. */
1761 : kbits_burst); /* Stick with user-specified value. */
/* With a valid cache entry, a cached error is returned as-is and an
 * unchanged rate/burst pair needs no work; otherwise the entry is
 * invalidated before reprogramming. */
1763 if (netdev->cache_valid & VALID_POLICING) {
1764 if (netdev->netdev_policing_error) {
1765 return netdev->netdev_policing_error;
1768 if (netdev->kbits_rate == kbits_rate &&
1769 netdev->kbits_burst == kbits_burst) {
1770 /* Assume that settings haven't changed since we last set them. */
1773 netdev->cache_valid &= ~VALID_POLICING;
1776 COVERAGE_INC(netdev_set_policing);
1777 /* Remove any existing ingress qdisc. */
1778 error = tc_add_del_ingress_qdisc(netdev_, false);
1780 VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
1781 netdev_name, ovs_strerror(error));
1786 error = tc_add_del_ingress_qdisc(netdev_, true);
1788 VLOG_WARN_RL(&rl, "%s: adding policing qdisc failed: %s",
1789 netdev_name, ovs_strerror(error));
1793 error = tc_add_policer(netdev_, kbits_rate, kbits_burst);
1795 VLOG_WARN_RL(&rl, "%s: adding policing action failed: %s",
1796 netdev_name, ovs_strerror(error));
1801 netdev->kbits_rate = kbits_rate;
1802 netdev->kbits_burst = kbits_burst;
/* Only success and ENODEV are remembered in the cache. */
1805 if (!error || error == ENODEV) {
1806 netdev->netdev_policing_error = error;
1807 netdev->cache_valid |= VALID_POLICING;
/* Adds to 'types' the OVS name of every entry of the NULL-terminated 'tcs'
 * array that has a tc_install hook and a non-empty ovs_name. */
1813 netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
1816 const struct tc_ops *const *opsp;
1818 for (opsp = tcs; *opsp != NULL; opsp++) {
1819 const struct tc_ops *ops = *opsp;
1820 if (ops->tc_install && ops->ovs_name[0] != '\0') {
1821 sset_add(types, ops->ovs_name);
/* Scans the NULL-terminated 'tcs' array for the entry whose ovs_name is
 * equal to 'name'. */
1827 static const struct tc_ops *
1828 tc_lookup_ovs_name(const char *name)
1830 const struct tc_ops *const *opsp;
1832 for (opsp = tcs; *opsp != NULL; opsp++) {
1833 const struct tc_ops *ops = *opsp;
1834 if (!strcmp(name, ops->ovs_name)) {
/* Scans the NULL-terminated 'tcs' array for the entry whose linux_name is
 * equal to 'name'.  Entries with a null linux_name never match. */
1841 static const struct tc_ops *
1842 tc_lookup_linux_name(const char *name)
1844 const struct tc_ops *const *opsp;
1846 for (opsp = tcs; *opsp != NULL; opsp++) {
1847 const struct tc_ops *ops = *opsp;
1848 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
/* Searches the 'hash' bucket of netdev->tc->queues for a queue whose
 * queue_id equals 'queue_id'. */
1855 static struct tc_queue *
1856 tc_find_queue__(const struct netdev *netdev_, unsigned int queue_id,
1859 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1860 struct tc_queue *queue;
1862 HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev->tc->queues) {
1863 if (queue->queue_id == queue_id) {
/* Looks up 'queue_id' on 'netdev', using hash_int(queue_id, 0) as the hash
 * passed to tc_find_queue__(). */
1870 static struct tc_queue *
1871 tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
1873 return tc_find_queue__(netdev, queue_id, hash_int(queue_id, 0));
/* Fills caps->n_queues from the tc_ops that tc_lookup_ovs_name() finds for
 * 'type'.  'netdev' itself is not consulted. */
1877 netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
1879 struct netdev_qos_capabilities *caps)
1881 const struct tc_ops *ops = tc_lookup_ovs_name(type);
1885 caps->n_queues = ops->n_queues;
/* After tc_query_qdisc(), stores the OVS name of the device's current qdisc
 * in '*typep' and, when that qdisc provides a qdisc_get hook, returns the
 * result of calling it to fill in 'details'. */
1890 netdev_linux_get_qos(const struct netdev *netdev_,
1891 const char **typep, struct smap *details)
1893 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1896 error = tc_query_qdisc(netdev_);
1901 *typep = netdev->tc->ops->ovs_name;
1902 return (netdev->tc->ops->qdisc_get
1903 ? netdev->tc->ops->qdisc_get(netdev_, details)
/* Switches 'netdev_' to QoS type 'type' configured by 'details'.
 *
 * A type with no tc_ops entry or no tc_install hook is rejected up front.
 * When 'type' is already the active qdisc it is reconfigured through
 * qdisc_set, if that hook exists (the result is 0 otherwise).  For a
 * different type the existing qdisc is deleted and the new one installed;
 * the assertions check that netdev->tc is null after the deletion and is
 * non-null exactly when the installation succeeded. */
1908 netdev_linux_set_qos(struct netdev *netdev_,
1909 const char *type, const struct smap *details)
1911 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1912 const struct tc_ops *new_ops;
1915 new_ops = tc_lookup_ovs_name(type);
1916 if (!new_ops || !new_ops->tc_install) {
1920 error = tc_query_qdisc(netdev_);
1925 if (new_ops == netdev->tc->ops) {
1926 return new_ops->qdisc_set ? new_ops->qdisc_set(netdev_, details) : 0;
1928 /* Delete existing qdisc. */
1929 error = tc_del_qdisc(netdev_);
1933 ovs_assert(netdev->tc == NULL);
1935 /* Install new qdisc. */
1936 error = new_ops->tc_install(netdev_, details);
1937 ovs_assert((error == 0) == (netdev->tc != NULL));
/* After tc_query_qdisc(), looks up queue 'queue_id' and, when it is found,
 * asks the qdisc's class_get hook to describe it in 'details'. */
1944 netdev_linux_get_queue(const struct netdev *netdev_,
1945 unsigned int queue_id, struct smap *details)
1947 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1950 error = tc_query_qdisc(netdev_);
1954 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
1956 ? netdev->tc->ops->class_get(netdev_, queue, details)
/* Configures queue 'queue_id' from 'details' through the qdisc's class_set
 * hook.  The request is rejected when 'queue_id' is not below the qdisc's
 * n_queues or when the qdisc has no class_set hook. */
1962 netdev_linux_set_queue(struct netdev *netdev_,
1963 unsigned int queue_id, const struct smap *details)
1965 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1968 error = tc_query_qdisc(netdev_);
1971 } else if (queue_id >= netdev->tc->ops->n_queues
1972 || !netdev->tc->ops->class_set) {
1976 return netdev->tc->ops->class_set(netdev_, queue_id, details);
/* Deletes queue 'queue_id' through the qdisc's class_delete hook.  A qdisc
 * without that hook is handled by a separate branch; otherwise the queue is
 * looked up first and the hook is called only when it exists. */
1980 netdev_linux_delete_queue(struct netdev *netdev_, unsigned int queue_id)
1982 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1985 error = tc_query_qdisc(netdev_);
1988 } else if (!netdev->tc->ops->class_delete) {
1991 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
1993 ? netdev->tc->ops->class_delete(netdev_, queue)
/* Retrieves statistics for queue 'queue_id' into 'stats' through the qdisc's
 * class_get_stats hook.  A qdisc without that hook is handled by a separate
 * branch; otherwise the queue is looked up first and the hook is called only
 * when it exists. */
1999 netdev_linux_get_queue_stats(const struct netdev *netdev_,
2000 unsigned int queue_id,
2001 struct netdev_queue_stats *stats)
2003 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2006 error = tc_query_qdisc(netdev_);
2009 } else if (!netdev->tc->ops->class_get_stats) {
2012 const struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2014 ? netdev->tc->ops->class_get_stats(netdev_, queue, stats)
/* Builds an RTM_GETTCLASS request for 'netdev' with tcm_parent set to 0 and
 * starts a netlink dump of it on rtnl_sock, tracked by 'dump'.  The request
 * buffer is released once the dump has been started. */
2020 start_queue_dump(const struct netdev *netdev, struct nl_dump *dump)
2022 struct ofpbuf request;
2023 struct tcmsg *tcmsg;
2025 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
2029 tcmsg->tcm_parent = 0;
2030 nl_dump_start(dump, rtnl_sock, &request);
2031 ofpbuf_uninit(&request);
/* Iterates over every queue in netdev->tc->queues, obtains its details with
 * the qdisc's class_get hook and passes them, with the queue id and 'aux',
 * to 'cb'.  A single smap is reused for all queues and cleared before each
 * one.  A qdisc without a class_get hook is handled by a separate branch. */
2036 netdev_linux_dump_queues(const struct netdev *netdev_,
2037 netdev_dump_queues_cb *cb, void *aux)
2039 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2040 struct tc_queue *queue, *next_queue;
2041 struct smap details;
2045 error = tc_query_qdisc(netdev_);
2048 } else if (!netdev->tc->ops->class_get) {
2053 smap_init(&details);
2054 HMAP_FOR_EACH_SAFE (queue, next_queue, hmap_node,
2055 &netdev->tc->queues) {
2056 smap_clear(&details);
2058 error = netdev->tc->ops->class_get(netdev_, queue, &details);
2060 (*cb)(queue->queue_id, &details, aux);
2065 smap_destroy(&details);
/* Dumps the traffic classes of 'netdev_' over netlink and feeds each reply
 * message to the qdisc's class_dump_stats hook together with 'cb' and 'aux'.
 * The value returned is the error from nl_dump_done() if there is one,
 * otherwise 'last_error'.  A qdisc without a class_dump_stats hook is
 * handled by a separate branch. */
2071 netdev_linux_dump_queue_stats(const struct netdev *netdev_,
2072 netdev_dump_queue_stats_cb *cb, void *aux)
2074 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2075 struct nl_dump dump;
2080 error = tc_query_qdisc(netdev_);
2083 } else if (!netdev->tc->ops->class_dump_stats) {
2088 if (!start_queue_dump(netdev_, &dump)) {
2091 while (nl_dump_next(&dump, &msg)) {
2092 error = netdev->tc->ops->class_dump_stats(netdev_, &msg, cb, aux);
2098 error = nl_dump_done(&dump);
2099 return error ? error : last_error;
/* Stores the IPv4 address and netmask of 'netdev_' in '*address' and
 * '*netmask'.  When VALID_IN4 is not set in the cache, both are first
 * fetched with SIOCGIFADDR and SIOCGIFNETMASK and the cache is marked valid.
 * Returns EADDRNOTAVAIL when the address is INADDR_ANY, otherwise 0 (on the
 * path shown at the end of the function). */
2103 netdev_linux_get_in4(const struct netdev *netdev_,
2104 struct in_addr *address, struct in_addr *netmask)
2106 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2108 if (!(netdev->cache_valid & VALID_IN4)) {
2111 error = netdev_linux_get_ipv4(netdev_, &netdev->address,
2112 SIOCGIFADDR, "SIOCGIFADDR");
2117 error = netdev_linux_get_ipv4(netdev_, &netdev->netmask,
2118 SIOCGIFNETMASK, "SIOCGIFNETMASK");
2123 netdev->cache_valid |= VALID_IN4;
2125 *address = netdev->address;
2126 *netmask = netdev->netmask;
2127 return address->s_addr == INADDR_ANY ? EADDRNOTAVAIL : 0;
/* Assigns IPv4 'address' to 'netdev_' with SIOCSIFADDR, records 'address'
 * and 'netmask' in the cache (setting VALID_IN4) and, unless 'address' is
 * INADDR_ANY, applies 'netmask' with SIOCSIFNETMASK.
 * NOTE(review): 'netmask' is cached before SIOCSIFNETMASK is issued, so the
 * cached value is kept whatever that call returns -- confirm this is
 * intended. */
2131 netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
2132 struct in_addr netmask)
2134 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2137 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
2139 netdev->cache_valid |= VALID_IN4;
2140 netdev->address = address;
2141 netdev->netmask = netmask;
2142 if (address.s_addr != INADDR_ANY) {
2143 error = do_set_addr(netdev_, SIOCSIFNETMASK,
2144 "SIOCSIFNETMASK", netmask);
/* Parses 'line' as an IPv6 address written as 16 two-digit hex bytes,
 * followed by four hex fields that are skipped and an interface name of at
 * most 16 characters.  The address bytes go into 'in6' and the name into
 * 'ifname'.  Returns true only if all 17 stored fields were converted. */
static bool
parse_if_inet6_line(const char *line,
                    struct in6_addr *in6, char ifname[16 + 1])
{
#define X8 "%2"SCNx8
    uint8_t *octet = in6->s6_addr;
    int n_fields;

    n_fields = sscanf(line,
                      " "X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8
                      "%*x %*x %*x %*x %16s\n",
                      octet + 0, octet + 1, octet + 2, octet + 3,
                      octet + 4, octet + 5, octet + 6, octet + 7,
                      octet + 8, octet + 9, octet + 10, octet + 11,
                      octet + 12, octet + 13, octet + 14, octet + 15,
                      ifname);
    return n_fields == 17;
}
2166 /* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address (if
2167 * 'in6' is non-null) and returns true. Otherwise, returns false. */
/* NOTE(review): the "non-null" and true/false wording above should be
 * checked against the function's actual return statements and its use of
 * 'in6'.
 *
 * On a cache miss the cached address starts out as in6addr_any and is
 * replaced by the first /proc/net/if_inet6 entry whose interface name equals
 * this device's name; VALID_IN6 is then set. */
2169 netdev_linux_get_in6(const struct netdev *netdev_, struct in6_addr *in6)
2171 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2172 if (!(netdev->cache_valid & VALID_IN6)) {
2176 netdev->in6 = in6addr_any;
2178 file = fopen("/proc/net/if_inet6", "r");
2180 const char *name = netdev_get_name(netdev_);
2181 while (fgets(line, sizeof line, file)) {
2182 struct in6_addr in6_tmp;
2183 char ifname[16 + 1];
2184 if (parse_if_inet6_line(line, &in6_tmp, ifname)
2185 && !strcmp(name, ifname))
2187 netdev->in6 = in6_tmp;
2193 netdev->cache_valid |= VALID_IN6;
/* Initializes '*sa' as an AF_INET socket address holding 'addr', with port 0
 * and every remaining byte zeroed. */
static void
make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
{
    struct sockaddr_in in4;

    memset(&in4, 0, sizeof in4);
    in4.sin_port = 0;
    in4.sin_addr = addr;
    in4.sin_family = AF_INET;

    memset(sa, 0, sizeof *sa);
    memcpy(sa, &in4, sizeof in4);
}
/* Issues address-setting ioctl 'ioctl_nr' (named 'ioctl_name') for 'netdev'
 * with 'addr' as its argument: the device name and an AF_INET sockaddr built
 * from 'addr' are placed in an ifreq that is passed to
 * netdev_linux_do_ioctl(). */
2213 do_set_addr(struct netdev *netdev,
2214 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
2217 ovs_strzcpy(ifr.ifr_name, netdev_get_name(netdev), sizeof ifr.ifr_name);
2218 make_in4_sockaddr(&ifr.ifr_addr, addr);
2220 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
2224 /* Adds 'router' as a default IP gateway. */
2226 netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
2228 struct in_addr any = { INADDR_ANY };
2232 memset(&rt, 0, sizeof rt);
/* Destination and mask are both INADDR_ANY, with 'router' as the gateway. */
2233 make_in4_sockaddr(&rt.rt_dst, any);
2234 make_in4_sockaddr(&rt.rt_gateway, router);
2235 make_in4_sockaddr(&rt.rt_genmask, any);
2236 rt.rt_flags = RTF_UP | RTF_GATEWAY;
2237 error = ioctl(af_inet_sock, SIOCADDRT, &rt) < 0 ? errno : 0;
2239 VLOG_WARN("ioctl(SIOCADDRT): %s", ovs_strerror(error));
2245 netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
2248 static const char fn[] = "/proc/net/route";
2253 *netdev_name = NULL;
2254 stream = fopen(fn, "r");
2255 if (stream == NULL) {
2256 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, ovs_strerror(errno));
2261 while (fgets(line, sizeof line, stream)) {
2264 ovs_be32 dest, gateway, mask;
2265 int refcnt, metric, mtu;
2266 unsigned int flags, use, window, irtt;
2269 "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
2271 iface, &dest, &gateway, &flags, &refcnt,
2272 &use, &metric, &mask, &mtu, &window, &irtt) != 11) {
2274 VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
2278 if (!(flags & RTF_UP)) {
2279 /* Skip routes that aren't up. */
2283 /* The output of 'dest', 'mask', and 'gateway' were given in
2284 * network byte order, so we don't need need any endian
2285 * conversions here. */
2286 if ((dest & mask) == (host->s_addr & mask)) {
2288 /* The host is directly reachable. */
2289 next_hop->s_addr = 0;
2291 /* To reach the host, we must go through a gateway. */
2292 next_hop->s_addr = gateway;
2294 *netdev_name = xstrdup(iface);
/* Adds "driver_name", "driver_version" and "firmware_version" to 'smap' from
 * the ethtool driver information cached in netdev->drvinfo.  When
 * VALID_DRVINFO is not yet set, the information is first fetched with
 * ETHTOOL_GDRVINFO, using the drvinfo buffer itself (viewed as a struct
 * ethtool_cmd) as the ioctl argument. */
2306 netdev_linux_get_status(const struct netdev *netdev_, struct smap *smap)
2308 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2311 if (!(netdev->cache_valid & VALID_DRVINFO)) {
2312 struct ethtool_cmd *cmd = (struct ethtool_cmd *) &netdev->drvinfo;
2314 COVERAGE_INC(netdev_get_ethtool);
2315 memset(&netdev->drvinfo, 0, sizeof netdev->drvinfo);
2316 error = netdev_linux_do_ethtool(netdev->up.name,
2319 "ETHTOOL_GDRVINFO");
2321 netdev->cache_valid |= VALID_DRVINFO;
2326 smap_add(smap, "driver_name", netdev->drvinfo.driver);
2327 smap_add(smap, "driver_version", netdev->drvinfo.version);
2328 smap_add(smap, "firmware_version", netdev->drvinfo.fw_version);
/* Reports the fixed driver name "openvswitch" in 'smap'; 'netdev' is not
 * consulted. */
2334 netdev_internal_get_status(const struct netdev *netdev OVS_UNUSED,
2337 smap_add(smap, "driver_name", "openvswitch");
2341 /* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
2342 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
2343 * returns 0. Otherwise, it returns a positive errno value; in particular,
2344 * ENXIO indicates that there is no ARP table entry for 'ip' on 'netdev'. */
2346 netdev_linux_arp_lookup(const struct netdev *netdev,
2347 ovs_be32 ip, uint8_t mac[ETH_ADDR_LEN])
2350 struct sockaddr_in sin;
2353 memset(&r, 0, sizeof r);
2354 memset(&sin, 0, sizeof sin);
2355 sin.sin_family = AF_INET;
2356 sin.sin_addr.s_addr = ip;
/* The request names the protocol address, an Ethernet hardware type and
 * the device to search. */
2358 memcpy(&r.arp_pa, &sin, sizeof sin);
2359 r.arp_ha.sa_family = ARPHRD_ETHER;
2361 ovs_strzcpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
2362 COVERAGE_INC(netdev_arp_lookup);
2363 retval = ioctl(af_inet_sock, SIOCGARP, &r) < 0 ? errno : 0;
2365 memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
/* ENXIO is excluded from the warning below. */
2366 } else if (retval != ENXIO) {
2367 VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
2368 netdev_get_name(netdev), IP_ARGS(ip),
2369 ovs_strerror(retval));
/* Converts the netdev_flags in 'nd' to interface-flag form, as the name
 * indicates.  NETDEV_UP and NETDEV_PROMISC are the bits examined. */
2375 nd_to_iff_flags(enum netdev_flags nd)
2378 if (nd & NETDEV_UP) {
2381 if (nd & NETDEV_PROMISC) {
/* Builds a netdev_flags value from the interface flags in 'iff', starting
 * from 0.  IFF_PROMISC is reported as NETDEV_PROMISC. */
2388 iff_to_nd_flags(int iff)
2390 enum netdev_flags nd = 0;
2394 if (iff & IFF_PROMISC) {
2395 nd |= NETDEV_PROMISC;
/* Clears the flags in 'off' and sets the flags in 'on' for 'netdev_'.  The
 * flags in effect beforehand, converted from netdev->ifi_flags, are stored
 * in '*old_flagsp'.  set_flags() is called only when the resulting flag word
 * differs from the old one, after which ifi_flags is re-read with
 * get_flags(). */
2401 netdev_linux_update_flags(struct netdev *netdev_, enum netdev_flags off,
2402 enum netdev_flags on, enum netdev_flags *old_flagsp)
2404 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2405 int old_flags, new_flags;
2408 old_flags = netdev->ifi_flags;
2409 *old_flagsp = iff_to_nd_flags(old_flags);
2410 new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
2411 if (new_flags != old_flags) {
2412 error = set_flags(netdev_get_name(netdev_), new_flags);
2413 get_flags(netdev_, &netdev->ifi_flags);
/* Returns the change_seq member of the netdev_linux that backs 'netdev'. */
2419 netdev_linux_change_seq(const struct netdev *netdev)
2421 return netdev_linux_cast(netdev)->change_seq;
/* Expands to the member list used to initialize a struct netdev_class for a
 * Linux-backed device type.  The six macro parameters are the per-class
 * entries; every other slot is shared by all classes built from this macro.
 * The entries are positional (no designators), so their order has to match
 * the member order of struct netdev_class. */
2424 #define NETDEV_LINUX_CLASS(NAME, CREATE, GET_STATS, SET_STATS, \
2425 GET_FEATURES, GET_STATUS) \
2429 netdev_linux_init, \
2431 netdev_linux_wait, \
2434 netdev_linux_destroy, \
2435 NULL, /* get_config */ \
2436 NULL, /* set_config */ \
2437 NULL, /* get_tunnel_config */ \
2439 netdev_linux_rx_open, \
2441 netdev_linux_send, \
2442 netdev_linux_send_wait, \
2444 netdev_linux_set_etheraddr, \
2445 netdev_linux_get_etheraddr, \
2446 netdev_linux_get_mtu, \
2447 netdev_linux_set_mtu, \
2448 netdev_linux_get_ifindex, \
2449 netdev_linux_get_carrier, \
2450 netdev_linux_get_carrier_resets, \
2451 netdev_linux_set_miimon_interval, \
2456 netdev_linux_set_advertisements, \
2458 netdev_linux_set_policing, \
2459 netdev_linux_get_qos_types, \
2460 netdev_linux_get_qos_capabilities, \
2461 netdev_linux_get_qos, \
2462 netdev_linux_set_qos, \
2463 netdev_linux_get_queue, \
2464 netdev_linux_set_queue, \
2465 netdev_linux_delete_queue, \
2466 netdev_linux_get_queue_stats, \
2467 netdev_linux_dump_queues, \
2468 netdev_linux_dump_queue_stats, \
2470 netdev_linux_get_in4, \
2471 netdev_linux_set_in4, \
2472 netdev_linux_get_in6, \
2473 netdev_linux_add_router, \
2474 netdev_linux_get_next_hop, \
2476 netdev_linux_arp_lookup, \
2478 netdev_linux_update_flags, \
2480 netdev_linux_change_seq \
/* Device class created by netdev_linux_create(): kernel/vport merged stats,
 * no set_stats hook, ethtool-based features and status. */
2483 const struct netdev_class netdev_linux_class =
2486 netdev_linux_create,
2487 netdev_linux_get_stats,
2488 NULL, /* set_stats */
2489 netdev_linux_get_features,
2490 netdev_linux_get_status);
/* Device class created by netdev_linux_create_tap(): uses the tap-specific
 * stats function; otherwise the same hooks as netdev_linux_class. */
2492 const struct netdev_class netdev_tap_class =
2495 netdev_linux_create_tap,
2496 netdev_tap_get_stats,
2497 NULL, /* set_stats */
2498 netdev_linux_get_features,
2499 netdev_linux_get_status);
/* Internal device class: stats are read from and written to the vport
 * layer, and there is no get_features hook. */
2501 const struct netdev_class netdev_internal_class =
2504 netdev_linux_create,
2505 netdev_internal_get_stats,
2506 netdev_internal_set_stats,
2507 NULL, /* get_features */
2508 netdev_internal_get_status);
/* Receive-side hooks for Linux netdevs: destroy, recv, wait and drain, in
 * that positional order. */
2510 static const struct netdev_rx_class netdev_rx_linux_class = {
2511 netdev_rx_linux_destroy,
2512 netdev_rx_linux_recv,
2513 netdev_rx_linux_wait,
2514 netdev_rx_linux_drain,
2517 /* HTB traffic control class. */
/* Queue-count constant for HTB (0xf000 == 61440). */
2519 #define HTB_N_QUEUES 0xf000
2523 unsigned int max_rate; /* In bytes/s. */
/* Per-queue HTB parameters; 'tc_queue' is the embedded generic queue. */
2527 struct tc_queue tc_queue;
2528 unsigned int min_rate; /* In bytes/s. */
2529 unsigned int max_rate; /* In bytes/s. */
2530 unsigned int burst; /* In bytes. */
2531 unsigned int priority; /* Lower values are higher priorities. */
/* Returns the struct htb that contains netdev_'s 'tc' as its 'tc' member.
 * NOTE(review): this presumes netdev->tc really is embedded in a struct htb;
 * nothing here checks it. */
2535 htb_get__(const struct netdev *netdev_)
2537 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2538 return CONTAINER_OF(netdev->tc, struct htb, tc);
/* Allocates a struct htb, initializes its embedded tc with tc_ops_htb,
 * records 'max_rate' in it and installs it as netdev->tc.  No kernel
 * request is made here. */
2542 htb_install__(struct netdev *netdev_, uint64_t max_rate)
2544 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2547 htb = xmalloc(sizeof *htb);
2548 tc_init(&htb->tc, &tc_ops_htb);
2549 htb->max_rate = max_rate;
2551 netdev->tc = &htb->tc;
2554 /* Create an HTB qdisc.
2556 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
2558 htb_setup_qdisc__(struct netdev *netdev)
2561 struct tc_htb_glob opt;
2562 struct ofpbuf request;
2563 struct tcmsg *tcmsg;
/* Any qdisc already present is removed first; the result of the removal is
 * not examined. */
2565 tc_del_qdisc(netdev);
2567 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2568 NLM_F_EXCL | NLM_F_CREATE, &request);
2572 tcmsg->tcm_handle = tc_make_handle(1, 0);
2573 tcmsg->tcm_parent = TC_H_ROOT;
2575 nl_msg_put_string(&request, TCA_KIND, "htb");
2577 memset(&opt, 0, sizeof opt);
2578 opt.rate2quantum = 10;
/* The HTB global options travel as TCA_HTB_INIT nested in TCA_OPTIONS. */
2582 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2583 nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
2584 nl_msg_end_nested(&request, opt_offset);
2586 return tc_transact(&request, NULL);
2589 /* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
2590 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
2592 htb_setup_class__(struct netdev *netdev, unsigned int handle,
2593 unsigned int parent, struct htb_class *class)
2596 struct tc_htb_opt opt;
2597 struct ofpbuf request;
2598 struct tcmsg *tcmsg;
2602 error = netdev_get_mtu(netdev, &mtu);
2604 VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU",
2605 netdev_get_name(netdev));
2609 memset(&opt, 0, sizeof opt);
2610 tc_fill_rate(&opt.rate, class->min_rate, mtu);
2611 tc_fill_rate(&opt.ceil, class->max_rate, mtu);
2612 opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
2613 opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
2614 opt.prio = class->priority;
2616 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
2620 tcmsg->tcm_handle = handle;
2621 tcmsg->tcm_parent = parent;
2623 nl_msg_put_string(&request, TCA_KIND, "htb");
2624 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2625 nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
2626 tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
2627 tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
2628 nl_msg_end_nested(&request, opt_offset);
2630 error = tc_transact(&request, NULL);
2632 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
2633 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
2634 netdev_get_name(netdev),
2635 tc_get_major(handle), tc_get_minor(handle),
2636 tc_get_major(parent), tc_get_minor(parent),
2637 class->min_rate, class->max_rate,
2638 class->burst, class->priority, ovs_strerror(error));
2643 /* Parses Netlink attributes in 'options' for HTB parameters and stores a
2644 * description of them into 'details'. The description complies with the
2645 * specification given in the vswitch database documentation for linux-htb
2648 htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
2650 static const struct nl_policy tca_htb_policy[] = {
2651 [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
2652 .min_len = sizeof(struct tc_htb_opt) },
2655 struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
2656 const struct tc_htb_opt *htb;
2658 if (!nl_parse_nested(nl_options, tca_htb_policy,
2659 attrs, ARRAY_SIZE(tca_htb_policy))) {
2660 VLOG_WARN_RL(&rl, "failed to parse HTB class options");
2664 htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
2665 class->min_rate = htb->rate.rate;
2666 class->max_rate = htb->ceil.rate;
2667 class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
2668 class->priority = htb->prio;
2673 htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2674 struct htb_class *options,
2675 struct netdev_queue_stats *stats)
2677 struct nlattr *nl_options;
2678 unsigned int handle;
2681 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2682 if (!error && queue_id) {
2683 unsigned int major = tc_get_major(handle);
2684 unsigned int minor = tc_get_minor(handle);
2685 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2686 *queue_id = minor - 1;
2691 if (!error && options) {
2692 error = htb_parse_tca_options__(nl_options, options);
2698 htb_parse_qdisc_details__(struct netdev *netdev,
2699 const struct smap *details, struct htb_class *hc)
2701 const char *max_rate_s;
2703 max_rate_s = smap_get(details, "max-rate");
2704 hc->max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
2705 if (!hc->max_rate) {
2706 enum netdev_features current;
2708 netdev_get_features(netdev, ¤t, NULL, NULL, NULL);
2709 hc->max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
2711 hc->min_rate = hc->max_rate;
2717 htb_parse_class_details__(struct netdev *netdev,
2718 const struct smap *details, struct htb_class *hc)
2720 const struct htb *htb = htb_get__(netdev);
2721 const char *min_rate_s = smap_get(details, "min-rate");
2722 const char *max_rate_s = smap_get(details, "max-rate");
2723 const char *burst_s = smap_get(details, "burst");
2724 const char *priority_s = smap_get(details, "priority");
2727 error = netdev_get_mtu(netdev, &mtu);
2729 VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU",
2730 netdev_get_name(netdev));
2734 /* HTB requires at least an mtu sized min-rate to send any traffic even
2735 * on uncongested links. */
2736 hc->min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
2737 hc->min_rate = MAX(hc->min_rate, mtu);
2738 hc->min_rate = MIN(hc->min_rate, htb->max_rate);
2741 hc->max_rate = (max_rate_s
2742 ? strtoull(max_rate_s, NULL, 10) / 8
2744 hc->max_rate = MAX(hc->max_rate, hc->min_rate);
2745 hc->max_rate = MIN(hc->max_rate, htb->max_rate);
2749 * According to hints in the documentation that I've read, it is important
2750 * that 'burst' be at least as big as the largest frame that might be
2751 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
2752 * but having it a bit too small is a problem. Since netdev_get_mtu()
2753 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
2754 * the MTU. We actually add 64, instead of 14, as a guard against
2755 * additional headers get tacked on somewhere that we're not aware of. */
2756 hc->burst = burst_s ? strtoull(burst_s, NULL, 10) / 8 : 0;
2757 hc->burst = MAX(hc->burst, mtu + 64);
2760 hc->priority = priority_s ? strtoul(priority_s, NULL, 10) : 0;
/* Asks the kernel about class 'handle' (child of 'parent') on 'netdev' and
 * decodes the reply into 'options' and 'stats', either of which may be null.
 *
 * Returns 0 on success, otherwise a positive errno value. */
static int
htb_query_class__(const struct netdev *netdev, unsigned int handle,
                  unsigned int parent, struct htb_class *options,
                  struct netdev_queue_stats *stats)
{
    struct ofpbuf *reply;
    int error = tc_query_class(netdev, handle, parent, &reply);

    if (error) {
        return error;
    }

    error = htb_parse_tcmsg__(reply, NULL, options, stats);
    ofpbuf_delete(reply);
    return error;
}
2782 htb_tc_install(struct netdev *netdev, const struct smap *details)
2786 error = htb_setup_qdisc__(netdev);
2788 struct htb_class hc;
2790 htb_parse_qdisc_details__(netdev, details, &hc);
2791 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2792 tc_make_handle(1, 0), &hc);
2794 htb_install__(netdev, hc.max_rate);
2800 static struct htb_class *
2801 htb_class_cast__(const struct tc_queue *queue)
2803 return CONTAINER_OF(queue, struct htb_class, tc_queue);
2807 htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
2808 const struct htb_class *hc)
2810 struct htb *htb = htb_get__(netdev);
2811 size_t hash = hash_int(queue_id, 0);
2812 struct tc_queue *queue;
2813 struct htb_class *hcp;
2815 queue = tc_find_queue__(netdev, queue_id, hash);
2817 hcp = htb_class_cast__(queue);
2819 hcp = xmalloc(sizeof *hcp);
2820 queue = &hcp->tc_queue;
2821 queue->queue_id = queue_id;
2822 hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
2825 hcp->min_rate = hc->min_rate;
2826 hcp->max_rate = hc->max_rate;
2827 hcp->burst = hc->burst;
2828 hcp->priority = hc->priority;
2832 htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
2835 struct nl_dump dump;
2836 struct htb_class hc;
2838 /* Get qdisc options. */
2840 htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
2841 htb_install__(netdev, hc.max_rate);
2844 if (!start_queue_dump(netdev, &dump)) {
2847 while (nl_dump_next(&dump, &msg)) {
2848 unsigned int queue_id;
2850 if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
2851 htb_update_queue__(netdev, queue_id, &hc);
2854 nl_dump_done(&dump);
2860 htb_tc_destroy(struct tc *tc)
2862 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
2863 struct htb_class *hc, *next;
2865 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &htb->tc.queues) {
2866 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
2874 htb_qdisc_get(const struct netdev *netdev, struct smap *details)
2876 const struct htb *htb = htb_get__(netdev);
2877 smap_add_format(details, "max-rate", "%llu", 8ULL * htb->max_rate);
2882 htb_qdisc_set(struct netdev *netdev, const struct smap *details)
2884 struct htb_class hc;
2887 htb_parse_qdisc_details__(netdev, details, &hc);
2888 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2889 tc_make_handle(1, 0), &hc);
2891 htb_get__(netdev)->max_rate = hc.max_rate;
2897 htb_class_get(const struct netdev *netdev OVS_UNUSED,
2898 const struct tc_queue *queue, struct smap *details)
2900 const struct htb_class *hc = htb_class_cast__(queue);
2902 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
2903 if (hc->min_rate != hc->max_rate) {
2904 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
2906 smap_add_format(details, "burst", "%llu", 8ULL * hc->burst);
2908 smap_add_format(details, "priority", "%u", hc->priority);
2914 htb_class_set(struct netdev *netdev, unsigned int queue_id,
2915 const struct smap *details)
2917 struct htb_class hc;
2920 error = htb_parse_class_details__(netdev, details, &hc);
2925 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
2926 tc_make_handle(1, 0xfffe), &hc);
2931 htb_update_queue__(netdev, queue_id, &hc);
2936 htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
2938 struct htb_class *hc = htb_class_cast__(queue);
2939 struct htb *htb = htb_get__(netdev);
2942 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
2944 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
2951 htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
2952 struct netdev_queue_stats *stats)
2954 return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
2955 tc_make_handle(1, 0xfffe), NULL, stats);
2959 htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
2960 const struct ofpbuf *nlmsg,
2961 netdev_dump_queue_stats_cb *cb, void *aux)
2963 struct netdev_queue_stats stats;
2964 unsigned int handle, major, minor;
2967 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
2972 major = tc_get_major(handle);
2973 minor = tc_get_minor(handle);
2974 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2975 (*cb)(minor - 1, &stats, aux);
2980 static const struct tc_ops tc_ops_htb = {
2981 "htb", /* linux_name */
2982 "linux-htb", /* ovs_name */
2983 HTB_N_QUEUES, /* n_queues */
2992 htb_class_get_stats,
2993 htb_class_dump_stats
2996 /* "linux-hfsc" traffic control class. */
/* Largest class minor number that is treated as an HFSC queue.  Class minor
 * 'm' (1 <= m <= HFSC_N_QUEUES) corresponds to queue ID m - 1. */
2998 #define HFSC_N_QUEUES 0xf000
/* Embedded queue node; hfsc_class_cast__() recovers the enclosing hfsc_class
 * from a pointer to this member. */
3006 struct tc_queue tc_queue;
3011 static struct hfsc *
3012 hfsc_get__(const struct netdev *netdev_)
3014 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3015 return CONTAINER_OF(netdev->tc, struct hfsc, tc);
3018 static struct hfsc_class *
3019 hfsc_class_cast__(const struct tc_queue *queue)
3021 return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
3025 hfsc_install__(struct netdev *netdev_, uint32_t max_rate)
3027 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3030 hfsc = xmalloc(sizeof *hfsc);
3031 tc_init(&hfsc->tc, &tc_ops_hfsc);
3032 hfsc->max_rate = max_rate;
3033 netdev->tc = &hfsc->tc;
3037 hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
3038 const struct hfsc_class *hc)
3042 struct hfsc_class *hcp;
3043 struct tc_queue *queue;
3045 hfsc = hfsc_get__(netdev);
3046 hash = hash_int(queue_id, 0);
3048 queue = tc_find_queue__(netdev, queue_id, hash);
3050 hcp = hfsc_class_cast__(queue);
3052 hcp = xmalloc(sizeof *hcp);
3053 queue = &hcp->tc_queue;
3054 queue->queue_id = queue_id;
3055 hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
3058 hcp->min_rate = hc->min_rate;
3059 hcp->max_rate = hc->max_rate;
3063 hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
3065 const struct tc_service_curve *rsc, *fsc, *usc;
3066 static const struct nl_policy tca_hfsc_policy[] = {
3068 .type = NL_A_UNSPEC,
3070 .min_len = sizeof(struct tc_service_curve),
3073 .type = NL_A_UNSPEC,
3075 .min_len = sizeof(struct tc_service_curve),
3078 .type = NL_A_UNSPEC,
3080 .min_len = sizeof(struct tc_service_curve),
3083 struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];
3085 if (!nl_parse_nested(nl_options, tca_hfsc_policy,
3086 attrs, ARRAY_SIZE(tca_hfsc_policy))) {
3087 VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
3091 rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
3092 fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
3093 usc = nl_attr_get(attrs[TCA_HFSC_USC]);
3095 if (rsc->m1 != 0 || rsc->d != 0 ||
3096 fsc->m1 != 0 || fsc->d != 0 ||
3097 usc->m1 != 0 || usc->d != 0) {
3098 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3099 "Non-linear service curves are not supported.");
3103 if (rsc->m2 != fsc->m2) {
3104 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3105 "Real-time service curves are not supported ");
3109 if (rsc->m2 > usc->m2) {
3110 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3111 "Min-rate service curve is greater than "
3112 "the max-rate service curve.");
3116 class->min_rate = fsc->m2;
3117 class->max_rate = usc->m2;
3122 hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
3123 struct hfsc_class *options,
3124 struct netdev_queue_stats *stats)
3127 unsigned int handle;
3128 struct nlattr *nl_options;
3130 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
3136 unsigned int major, minor;
3138 major = tc_get_major(handle);
3139 minor = tc_get_minor(handle);
3140 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3141 *queue_id = minor - 1;
3148 error = hfsc_parse_tca_options__(nl_options, options);
/* Asks the kernel about class 'handle' (child of 'parent') on 'netdev' and
 * decodes the reply into 'options' and 'stats', either of which may be null.
 *
 * Returns 0 on success, otherwise a positive errno value. */
static int
hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
                   unsigned int parent, struct hfsc_class *options,
                   struct netdev_queue_stats *stats)
{
    struct ofpbuf *reply;
    int error = tc_query_class(netdev, handle, parent, &reply);

    if (!error) {
        error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
        ofpbuf_delete(reply);
    }
    return error;
}
3173 hfsc_parse_qdisc_details__(struct netdev *netdev, const struct smap *details,
3174 struct hfsc_class *class)
3177 const char *max_rate_s;
3179 max_rate_s = smap_get(details, "max-rate");
3180 max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
3183 enum netdev_features current;
3185 netdev_get_features(netdev, ¤t, NULL, NULL, NULL);
3186 max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
3189 class->min_rate = max_rate;
3190 class->max_rate = max_rate;
3194 hfsc_parse_class_details__(struct netdev *netdev,
3195 const struct smap *details,
3196 struct hfsc_class * class)
3198 const struct hfsc *hfsc;
3199 uint32_t min_rate, max_rate;
3200 const char *min_rate_s, *max_rate_s;
3202 hfsc = hfsc_get__(netdev);
3203 min_rate_s = smap_get(details, "min-rate");
3204 max_rate_s = smap_get(details, "max-rate");
3206 min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
3207 min_rate = MAX(min_rate, 1);
3208 min_rate = MIN(min_rate, hfsc->max_rate);
3210 max_rate = (max_rate_s
3211 ? strtoull(max_rate_s, NULL, 10) / 8
3213 max_rate = MAX(max_rate, min_rate);
3214 max_rate = MIN(max_rate, hfsc->max_rate);
3216 class->min_rate = min_rate;
3217 class->max_rate = max_rate;
3222 /* Create an HFSC qdisc.
3224 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
3226 hfsc_setup_qdisc__(struct netdev * netdev)
3228 struct tcmsg *tcmsg;
3229 struct ofpbuf request;
3230 struct tc_hfsc_qopt opt;
3232 tc_del_qdisc(netdev);
3234 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3235 NLM_F_EXCL | NLM_F_CREATE, &request);
3241 tcmsg->tcm_handle = tc_make_handle(1, 0);
3242 tcmsg->tcm_parent = TC_H_ROOT;
3244 memset(&opt, 0, sizeof opt);
3247 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3248 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
3250 return tc_transact(&request, NULL);
3253 /* Create an HFSC class.
3255 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
3256 * sc rate <min_rate> ul rate <max_rate>" */
3258 hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
3259 unsigned int parent, struct hfsc_class *class)
3263 struct tcmsg *tcmsg;
3264 struct ofpbuf request;
3265 struct tc_service_curve min, max;
3267 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
3273 tcmsg->tcm_handle = handle;
3274 tcmsg->tcm_parent = parent;
3278 min.m2 = class->min_rate;
3282 max.m2 = class->max_rate;
3284 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3285 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3286 nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
3287 nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
3288 nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
3289 nl_msg_end_nested(&request, opt_offset);
3291 error = tc_transact(&request, NULL);
3293 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
3294 "min-rate %ubps, max-rate %ubps (%s)",
3295 netdev_get_name(netdev),
3296 tc_get_major(handle), tc_get_minor(handle),
3297 tc_get_major(parent), tc_get_minor(parent),
3298 class->min_rate, class->max_rate, ovs_strerror(error));
3305 hfsc_tc_install(struct netdev *netdev, const struct smap *details)
3308 struct hfsc_class class;
3310 error = hfsc_setup_qdisc__(netdev);
3316 hfsc_parse_qdisc_details__(netdev, details, &class);
3317 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3318 tc_make_handle(1, 0), &class);
3324 hfsc_install__(netdev, class.max_rate);
3329 hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3332 struct nl_dump dump;
3333 struct hfsc_class hc;
3336 hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
3337 hfsc_install__(netdev, hc.max_rate);
3339 if (!start_queue_dump(netdev, &dump)) {
3343 while (nl_dump_next(&dump, &msg)) {
3344 unsigned int queue_id;
3346 if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3347 hfsc_update_queue__(netdev, queue_id, &hc);
3351 nl_dump_done(&dump);
3356 hfsc_tc_destroy(struct tc *tc)
3359 struct hfsc_class *hc, *next;
3361 hfsc = CONTAINER_OF(tc, struct hfsc, tc);
3363 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
3364 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
3373 hfsc_qdisc_get(const struct netdev *netdev, struct smap *details)
3375 const struct hfsc *hfsc;
3376 hfsc = hfsc_get__(netdev);
3377 smap_add_format(details, "max-rate", "%llu", 8ULL * hfsc->max_rate);
3382 hfsc_qdisc_set(struct netdev *netdev, const struct smap *details)
3385 struct hfsc_class class;
3387 hfsc_parse_qdisc_details__(netdev, details, &class);
3388 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3389 tc_make_handle(1, 0), &class);
3392 hfsc_get__(netdev)->max_rate = class.max_rate;
3399 hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
3400 const struct tc_queue *queue, struct smap *details)
3402 const struct hfsc_class *hc;
3404 hc = hfsc_class_cast__(queue);
3405 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
3406 if (hc->min_rate != hc->max_rate) {
3407 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
3413 hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
3414 const struct smap *details)
3417 struct hfsc_class class;
3419 error = hfsc_parse_class_details__(netdev, details, &class);
3424 error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3425 tc_make_handle(1, 0xfffe), &class);
3430 hfsc_update_queue__(netdev, queue_id, &class);
3435 hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
3439 struct hfsc_class *hc;
3441 hc = hfsc_class_cast__(queue);
3442 hfsc = hfsc_get__(netdev);
3444 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3446 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
3453 hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3454 struct netdev_queue_stats *stats)
3456 return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3457 tc_make_handle(1, 0xfffe), NULL, stats);
3461 hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3462 const struct ofpbuf *nlmsg,
3463 netdev_dump_queue_stats_cb *cb, void *aux)
3465 struct netdev_queue_stats stats;
3466 unsigned int handle, major, minor;
3469 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3474 major = tc_get_major(handle);
3475 minor = tc_get_minor(handle);
3476 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3477 (*cb)(minor - 1, &stats, aux);
3482 static const struct tc_ops tc_ops_hfsc = {
3483 "hfsc", /* linux_name */
3484 "linux-hfsc", /* ovs_name */
3485 HFSC_N_QUEUES, /* n_queues */
3486 hfsc_tc_install, /* tc_install */
3487 hfsc_tc_load, /* tc_load */
3488 hfsc_tc_destroy, /* tc_destroy */
3489 hfsc_qdisc_get, /* qdisc_get */
3490 hfsc_qdisc_set, /* qdisc_set */
3491 hfsc_class_get, /* class_get */
3492 hfsc_class_set, /* class_set */
3493 hfsc_class_delete, /* class_delete */
3494 hfsc_class_get_stats, /* class_get_stats */
3495 hfsc_class_dump_stats /* class_dump_stats */
3498 /* "linux-default" traffic control class.
3500 * This class represents the default, unnamed Linux qdisc. It corresponds to
3501 * the "" (empty string) QoS type in the OVS database. */
3504 default_install__(struct netdev *netdev_)
3506 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3507 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_default);
3509 /* Nothing but a tc class implementation is allowed to write to a tc. This
3510 * class never does that, so we can legitimately use a const tc object. */
3511 netdev->tc = CONST_CAST(struct tc *, &tc);
3515 default_tc_install(struct netdev *netdev,
3516 const struct smap *details OVS_UNUSED)
3518 default_install__(netdev);
3523 default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3525 default_install__(netdev);
3529 static const struct tc_ops tc_ops_default = {
3530 NULL, /* linux_name */
3535 NULL, /* tc_destroy */
3536 NULL, /* qdisc_get */
3537 NULL, /* qdisc_set */
3538 NULL, /* class_get */
3539 NULL, /* class_set */
3540 NULL, /* class_delete */
3541 NULL, /* class_get_stats */
3542 NULL /* class_dump_stats */
3545 /* "linux-other" traffic control class.
3550 other_tc_load(struct netdev *netdev_, struct ofpbuf *nlmsg OVS_UNUSED)
3552 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3553 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_other);
3555 /* Nothing but a tc class implementation is allowed to write to a tc. This
3556 * class never does that, so we can legitimately use a const tc object. */
3557 netdev->tc = CONST_CAST(struct tc *, &tc);
3561 static const struct tc_ops tc_ops_other = {
3562 NULL, /* linux_name */
3563 "linux-other", /* ovs_name */
3565 NULL, /* tc_install */
3567 NULL, /* tc_destroy */
3568 NULL, /* qdisc_get */
3569 NULL, /* qdisc_set */
3570 NULL, /* class_get */
3571 NULL, /* class_set */
3572 NULL, /* class_delete */
3573 NULL, /* class_get_stats */
3574 NULL /* class_dump_stats */
3577 /* Traffic control. */
3579 /* Number of kernel "tc" ticks per second. */
3580 static double ticks_per_s;
3582 /* Number of kernel "jiffies" per second. This is used for the purpose of
3583 * computing buffer sizes. Generally kernel qdiscs need to be able to buffer
3584 * one jiffy's worth of data.
3586 * There are two possibilities here:
3588 * - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
3589 * approximate range of 100 to 1024. That means that we really need to
3590 * make sure that the qdisc can buffer that much data.
3592 * - 'buffer_hz' is an absurdly large number. That means that the kernel
3593 * has finely granular timers and there's no need to fudge additional room
3594 * for buffers. (There's no extra effort needed to implement that: the
3595 * large 'buffer_hz' is used as a divisor, so practically any number will
3596 * come out as 0 in the division. Small integer results in the case of
3597 * really high dividends won't have any real effect anyhow.)
3599 static unsigned int buffer_hz;
/* Combines 'major' and 'minor' into the 32-bit tc handle 'major':'minor'
 * (major in the upper 16 bits, minor in the lower 16 bits). */
static unsigned int
tc_make_handle(unsigned int major, unsigned int minor)
{
    unsigned int major_bits = major << 16;

    return TC_H_MAKE(major_bits, minor);
}
/* Extracts the major number (upper 16 bits) of tc handle 'handle'. */
static unsigned int
tc_get_major(unsigned int handle)
{
    unsigned int major_bits = TC_H_MAJ(handle);

    return major_bits >> 16;
}
/* Extracts the minor number (lower 16 bits) of tc handle 'handle'. */
static unsigned int
tc_get_minor(unsigned int handle)
{
    unsigned int minor_bits = TC_H_MIN(handle);

    return minor_bits;
}
3622 static struct tcmsg *
3623 tc_make_request(const struct netdev *netdev, int type, unsigned int flags,
3624 struct ofpbuf *request)
3626 struct tcmsg *tcmsg;
3630 error = get_ifindex(netdev, &ifindex);
3635 ofpbuf_init(request, 512);
3636 nl_msg_put_nlmsghdr(request, sizeof *tcmsg, type, NLM_F_REQUEST | flags);
3637 tcmsg = ofpbuf_put_zeros(request, sizeof *tcmsg);
3638 tcmsg->tcm_family = AF_UNSPEC;
3639 tcmsg->tcm_ifindex = ifindex;
3640 /* Caller should fill in tcmsg->tcm_handle. */
3641 /* Caller should fill in tcmsg->tcm_parent. */
3647 tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
3649 int error = nl_sock_transact(rtnl_sock, request, replyp);
3650 ofpbuf_uninit(request);
3654 /* Adds or deletes a root ingress qdisc on 'netdev'. We use this for
3655 * policing configuration.
3657 * This function is equivalent to running the following when 'add' is true:
3658 * /sbin/tc qdisc add dev <devname> handle ffff: ingress
3660 * This function is equivalent to running the following when 'add' is false:
3661 * /sbin/tc qdisc del dev <devname> handle ffff: ingress
3663 * The configuration and stats may be seen with the following command:
3664 * /sbin/tc -s qdisc show dev <devname>
3666 * Returns 0 if successful, otherwise a positive errno value.
3669 tc_add_del_ingress_qdisc(struct netdev *netdev, bool add)
3671 struct ofpbuf request;
3672 struct tcmsg *tcmsg;
3674 int type = add ? RTM_NEWQDISC : RTM_DELQDISC;
3675 int flags = add ? NLM_F_EXCL | NLM_F_CREATE : 0;
3677 tcmsg = tc_make_request(netdev, type, flags, &request);
3681 tcmsg->tcm_handle = tc_make_handle(0xffff, 0);
3682 tcmsg->tcm_parent = TC_H_INGRESS;
3683 nl_msg_put_string(&request, TCA_KIND, "ingress");
3684 nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0);
3686 error = tc_transact(&request, NULL);
3688 /* If we're deleting the qdisc, don't worry about some of the
3689 * error conditions. */
3690 if (!add && (error == ENOENT || error == EINVAL)) {
3699 /* Adds a policer to 'netdev' with a rate of 'kbits_rate' and a burst size
3702 * This function is equivalent to running:
3703 * /sbin/tc filter add dev <devname> parent ffff: protocol all prio 49
3704 * basic police rate <kbits_rate>kbit burst <kbits_burst>k
3707 * The configuration and stats may be seen with the following command:
3708 * /sbin/tc -s filter show <devname> eth0 parent ffff:
3710 * Returns 0 if successful, otherwise a positive errno value.
3713 tc_add_policer(struct netdev *netdev, int kbits_rate, int kbits_burst)
3715 struct tc_police tc_police;
3716 struct ofpbuf request;
3717 struct tcmsg *tcmsg;
3718 size_t basic_offset;
3719 size_t police_offset;
3723 memset(&tc_police, 0, sizeof tc_police);
3724 tc_police.action = TC_POLICE_SHOT;
3725 tc_police.mtu = mtu;
3726 tc_fill_rate(&tc_police.rate, (kbits_rate * 1000)/8, mtu);
3727 tc_police.burst = tc_bytes_to_ticks(tc_police.rate.rate,
3728 kbits_burst * 1024);
3730 tcmsg = tc_make_request(netdev, RTM_NEWTFILTER,
3731 NLM_F_EXCL | NLM_F_CREATE, &request);
3735 tcmsg->tcm_parent = tc_make_handle(0xffff, 0);
3736 tcmsg->tcm_info = tc_make_handle(49,
3737 (OVS_FORCE uint16_t) htons(ETH_P_ALL));
3739 nl_msg_put_string(&request, TCA_KIND, "basic");
3740 basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3741 police_offset = nl_msg_start_nested(&request, TCA_BASIC_POLICE);
3742 nl_msg_put_unspec(&request, TCA_POLICE_TBF, &tc_police, sizeof tc_police);
3743 tc_put_rtab(&request, TCA_POLICE_RATE, &tc_police.rate);
3744 nl_msg_end_nested(&request, police_offset);
3745 nl_msg_end_nested(&request, basic_offset);
3747 error = tc_transact(&request, NULL);
3758 /* The values in psched are not individually very meaningful, but they are
3759 * important. The tables below show some values seen in the wild.
3763 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
3764 * (Before that, there are hints that it was 1000000000.)
3766 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
3770 * -----------------------------------
3771 * [1] 000c8000 000f4240 000f4240 00000064
3772 * [2] 000003e8 00000400 000f4240 3b9aca00
3773 * [3] 000003e8 00000400 000f4240 3b9aca00
3774 * [4] 000003e8 00000400 000f4240 00000064
3775 * [5] 000003e8 00000040 000f4240 3b9aca00
3776 * [6] 000003e8 00000040 000f4240 000000f9
3778 * a b c d ticks_per_s buffer_hz
3779 * ------- --------- ---------- ------------- ----------- -------------
3780 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
3781 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3782 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3783 * [4] 1,000 1,024 1,000,000 100 976,562 100
3784 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
3785 * [6] 1,000 64 1,000,000 249 15,625,000 249
3787 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
3788 * [2] 2.6.26-1-686-bigmem from Debian lenny
3789 * [3] 2.6.26-2-sparc64 from Debian lenny
3790 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
3791 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
3792 * [6] 2.6.34 from kernel.org on KVM
3794 static const char fn[] = "/proc/net/psched";
3795 unsigned int a, b, c, d;
3801 stream = fopen(fn, "r");
3803 VLOG_WARN("%s: open failed: %s", fn, ovs_strerror(errno));
3807 if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
3808 VLOG_WARN("%s: read failed", fn);
3812 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
3816 VLOG_WARN("%s: invalid scheduler parameters", fn);
3820 ticks_per_s = (double) a * c / b;
3824 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
3827 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);
3830 /* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
3831 * rate of 'rate' bytes per second. */
3833 tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
3838 return (rate * ticks) / ticks_per_s;
3841 /* Returns the number of ticks that it would take to transmit 'size' bytes at a
3842 * rate of 'rate' bytes per second. */
3844 tc_bytes_to_ticks(unsigned int rate, unsigned int size)
3849 return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
3852 /* Returns the number of bytes that need to be reserved for qdisc buffering at
3853 * a transmission rate of 'rate' bytes per second. */
3855 tc_buffer_per_jiffy(unsigned int rate)
3860 return rate / buffer_hz;
3863 /* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
3864 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
3865 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
3866 * stores NULL into it if it is absent.
3868 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
3871 * Returns 0 if successful, otherwise a positive errno value. */
3873 tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
3874 struct nlattr **options)
3876 static const struct nl_policy tca_policy[] = {
3877 [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
3878 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
3880 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3882 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3883 tca_policy, ta, ARRAY_SIZE(ta))) {
3884 VLOG_WARN_RL(&rl, "failed to parse qdisc message");
3889 *kind = nl_attr_get_string(ta[TCA_KIND]);
3893 *options = ta[TCA_OPTIONS];
3908 /* Given Netlink 'msg' that describes a class, extracts its class handle (the
3909 * 'tcm_handle' field of its struct tcmsg) into '*handlep', its TCA_OPTIONS
3910 * attribute into '*options', and its queue statistics into '*stats'. Any of
3911 * the output arguments may be null.
3913 * Returns 0 if successful, otherwise a positive errno value. */
3915 tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
3916 struct nlattr **options, struct netdev_queue_stats *stats)
/* Both TCA_OPTIONS and TCA_STATS2 are mandatory nested attributes, so a
 * message lacking either one fails the parse below. */
3918 static const struct nl_policy tca_policy[] = {
3919 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
3920 [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
3922 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3924 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3925 tca_policy, ta, ARRAY_SIZE(ta))) {
3926 VLOG_WARN_RL(&rl, "failed to parse class message");
/* The struct tcmsg is read from the offset right after the Netlink header. */
3931 struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
3932 *handlep = tc->tcm_handle;
3936 *options = ta[TCA_OPTIONS];
3940 const struct gnet_stats_queue *gsq;
3941 struct gnet_stats_basic gsb;
/* Inside TCA_STATS2, both the basic and the queue statistics are mandatory
 * and must be at least as large as the structures they are read into. */
3943 static const struct nl_policy stats_policy[] = {
3944 [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
3945 .min_len = sizeof gsb },
3946 [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
3947 .min_len = sizeof *gsq },
3949 struct nlattr *sa[ARRAY_SIZE(stats_policy)];
3951 if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
3952 sa, ARRAY_SIZE(sa))) {
3953 VLOG_WARN_RL(&rl, "failed to parse class stats");
3957 /* Alignment issues screw up the length of struct gnet_stats_basic on
3958 * some arch/bitsize combinations. Newer versions of Linux have a
3959 * struct gnet_stats_basic_packed, but we can't depend on that. The
3960 * easiest thing to do is just to make a copy. */
/* The copy is bounded by the smaller of the attribute size and sizeof gsb;
 * the preceding memset zeroes 'gsb' first. */
3961 memset(&gsb, 0, sizeof gsb);
3962 memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
3963 MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
3964 stats->tx_bytes = gsb.bytes;
3965 stats->tx_packets = gsb.packets;
/* The queue's drop counter is what gets reported as 'tx_errors'. */
3967 gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
3968 stats->tx_errors = gsq->drops;
/* NOTE(review): presumably part of the failure path, so that '*stats' is
 * zeroed rather than left partially filled -- confirm against the full
 * function. */
3978 memset(stats, 0, sizeof *stats);
/* tc_query_class(): builds an RTM_GETTCLASS request (flag NLM_F_ECHO) for
 * 'netdev', fills in 'handle' and 'parent', and hands it to tc_transact()
 * together with 'replyp' for the reply.  The failure message is a rate-limited
 * warning that shows the major:minor parts of both handles and the error
 * string. */
3983 /* Queries the kernel for class with identifier 'handle' and parent 'parent'
3986 tc_query_class(const struct netdev *netdev,
3987 unsigned int handle, unsigned int parent,
3988 struct ofpbuf **replyp)
3990 struct ofpbuf request;
3991 struct tcmsg *tcmsg;
3994 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO, &request);
3998 tcmsg->tcm_handle = handle;
3999 tcmsg->tcm_parent = parent;
4001 error = tc_transact(&request, replyp);
4003 VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
4004 netdev_get_name(netdev),
4005 tc_get_major(handle), tc_get_minor(handle),
4006 tc_get_major(parent), tc_get_minor(parent),
4007 ovs_strerror(error));
4012 /* Equivalent to "tc class del dev <name> handle <handle>". */
4014 tc_delete_class(const struct netdev *netdev, unsigned int handle)
4016 struct ofpbuf request;
4017 struct tcmsg *tcmsg;
/* The RTM_DELTCLASS request names the class by 'handle' and leaves the parent
 * at zero. */
4020 tcmsg = tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
4024 tcmsg->tcm_handle = handle;
4025 tcmsg->tcm_parent = 0;
/* No reply buffer is requested (NULL); only the error code is of interest. */
4027 error = tc_transact(&request, NULL);
4029 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
4030 netdev_get_name(netdev),
4031 tc_get_major(handle), tc_get_minor(handle),
4032 ovs_strerror(error));
4037 /* Equivalent to "tc qdisc del dev <name> root". */
4039 tc_del_qdisc(struct netdev *netdev_)
4041 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4042 struct ofpbuf request;
4043 struct tcmsg *tcmsg;
/* The RTM_DELQDISC request targets the handle built by tc_make_handle(1, 0)
 * with TC_H_ROOT as its parent. */
4046 tcmsg = tc_make_request(netdev_, RTM_DELQDISC, 0, &request);
4050 tcmsg->tcm_handle = tc_make_handle(1, 0);
4051 tcmsg->tcm_parent = TC_H_ROOT;
4053 error = tc_transact(&request, NULL);
4054 if (error == EINVAL) {
4055 /* EINVAL probably means that the default qdisc was in use, in which
4056 * case we've accomplished our purpose. */
/* On success, if the netdev has a traffic-control object ('netdev->tc') whose
 * ops provide the optional tc_destroy hook, that hook is called on it. */
4059 if (!error && netdev->tc) {
4060 if (netdev->tc->ops->tc_destroy) {
4061 netdev->tc->ops->tc_destroy(netdev->tc);
4068 /* If 'netdev''s qdisc type and parameters are not yet known, queries the
4069 * kernel to determine what they are. Returns 0 if successful, otherwise a
4070 * positive errno value. */
4072 tc_query_qdisc(const struct netdev *netdev_)
4074 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4075 struct ofpbuf request, *qdisc;
4076 const struct tc_ops *ops;
4077 struct tcmsg *tcmsg;
4085 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
4086 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
4087 * 2.6.35 without that fix backported to it.
4089 * To avoid the OOPS, we must not make a request that would attempt to dump
4090 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
4091 * few others. There are a few ways that I can see to do this, but most of
4092 * them seem to be racy (and if you lose the race the kernel OOPSes). The
4093 * technique chosen here is to assume that any non-default qdisc that we
4094 * create will have a class with handle 1:0. The built-in qdiscs only have
4095 * a class with handle 0:0.
4097 * We could check for Linux 2.6.35+ and use a more straightforward method
4099 tcmsg = tc_make_request(netdev_, RTM_GETQDISC, NLM_F_ECHO, &request);
4103 tcmsg->tcm_handle = tc_make_handle(1, 0);
4104 tcmsg->tcm_parent = 0;
4106 /* Figure out what tc class to instantiate. */
4107 error = tc_transact(&request, &qdisc);
/* The reply's qdisc name ('kind') selects the tc_ops implementation; the
 * generic 'tc_ops_other' is the fallback used in the assignments below. */
4111 error = tc_parse_qdisc(qdisc, &kind, NULL);
4113 ops = &tc_ops_other;
4115 ops = tc_lookup_linux_name(kind);
/* Unknown qdisc name: logged at INFO level through a dedicated rate limiter
 * ('rl2'), then handled by 'tc_ops_other'. */
4117 static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
4118 VLOG_INFO_RL(&rl2, "unknown qdisc \"%s\"", kind);
4120 ops = &tc_ops_other;
4123 } else if (error == ENOENT) {
4124 /* Either it's a built-in qdisc, or it's a qdisc set up by some
4125 * other entity that doesn't have a handle 1:0. We will assume
4126 * that it's the system default qdisc. */
4127 ops = &tc_ops_default;
4130 /* Who knows? Maybe the device got deleted. */
4131 VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
4132 netdev_get_name(netdev_), ovs_strerror(error));
4133 ops = &tc_ops_other;
4136 /* Instantiate it. */
4137 load_error = ops->tc_load(CONST_CAST(struct netdev *, netdev_), qdisc);
/* Invariant: tc_load() returns 0 exactly when it leaves 'netdev->tc' set. */
4138 ovs_assert((load_error == 0) == (netdev->tc != NULL));
4139 ofpbuf_delete(qdisc);
/* A query error, if any, takes precedence over the load error. */
4141 return error ? error : load_error;
4144 /* Linux traffic control uses tables with 256 entries ("rtab" tables) to
4145 approximate the time to transmit packets of various lengths. For an MTU of
4146 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
4147 represents two possible packet lengths; for a MTU of 513 through 1024, four
4148 possible lengths; and so on.
4150 Returns, for the specified 'mtu', the number of bits that packet lengths
4151 need to be shifted right to fit within such a 256-entry table. */
4153 tc_calc_cell_log(unsigned int mtu)
/* NOTE(review): presumably a fallback for an unspecified 'mtu'; the guarding
 * condition is not part of this listing -- confirm. */
4158 mtu = ETH_PAYLOAD_MAX;
/* The Ethernet and VLAN header lengths are added on top of 'mtu'. */
4160 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
/* 'cell_log' counts the iterations for which 'mtu' is still 256 or more. */
4162 for (cell_log = 0; mtu >= 256; cell_log++) {
/* tc_fill_rate(): zeroes '*rate', derives 'cell_log' from 'mtu' with
 * tc_calc_cell_log(), and sets 'mpu' to ETH_TOTAL_MIN.  The 'overhead' and
 * 'cell_align' assignments are kept only as commented-out reminders because,
 * per the inline remarks, those fields are missing from some distro
 * headers. */
4169 /* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
4172 tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
4174 memset(rate, 0, sizeof *rate);
4175 rate->cell_log = tc_calc_cell_log(mtu);
4176 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
4177 /* rate->cell_align = 0; */ /* distro headers. */
4178 rate->mpu = ETH_TOTAL_MIN;
4182 /* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
4183 * attribute of the specified "type".
4185 * See tc_calc_cell_log() above for a description of "rtab"s. */
4187 tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
/* Reserves TC_RTAB_SIZE bytes of attribute payload; the loop below fills in
 * all TC_RTAB_SIZE / sizeof *rtab entries. */
4192 rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
4193 for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
/* Entry 'i' stands for a packet of (i + 1) << cell_log bytes, raised to at
 * least 'rate->mpu'; its value is the time in ticks needed to send that many
 * bytes at 'rate->rate'. */
4194 unsigned packet_size = (i + 1) << rate->cell_log;
4195 if (packet_size < rate->mpu) {
4196 packet_size = rate->mpu;
4198 rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
/* tc_calc_buffer(): the burst used is never smaller than
 * tc_buffer_per_jiffy(Bps) + mtu; the larger of that minimum and 'burst_bytes'
 * is converted to ticks with tc_bytes_to_ticks().
 *
 * NOTE(review): 'burst_bytes' is uint64_t, but tc_bytes_to_ticks() takes an
 * unsigned int size, so a burst above UINT_MAX is truncated at that call and
 * can end up smaller than the minimum. */
4202 /* Calculates the proper value of 'buffer' or 'cbuffer' in HTB options given a
4203 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
4204 * burst size of 'burst_bytes'. (If no value was requested, a 'burst_bytes' of
4207 tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
4209 unsigned int min_burst = tc_buffer_per_jiffy(Bps) + mtu;
4210 return tc_bytes_to_ticks(Bps, MAX(burst_bytes, min_burst));
4213 /* Linux-only functions declared in netdev-linux.h */
4215 /* Returns a fd for an AF_INET socket or a negative errno value. */
4217 netdev_linux_get_af_inet_sock(void)
/* netdev_linux_init() is run first; a nonzero result is returned negated,
 * otherwise the value of 'af_inet_sock' is returned. */
4219 int error = netdev_linux_init();
4220 return error ? -error : af_inet_sock;
4223 /* Modifies the 'flag' bit in ethtool's flags field for 'netdev'. If
4224 * 'enable' is true, the bit is set. Otherwise, it is cleared. */
4226 netdev_linux_ethtool_set_flag(struct netdev *netdev, uint32_t flag,
4227 const char *flag_name, bool enable)
4229 const char *netdev_name = netdev_get_name(netdev);
4230 struct ethtool_value evalue;
/* Step 1: read the current flags word with ETHTOOL_GFLAGS.  The
 * struct ethtool_value is passed through the struct ethtool_cmd pointer
 * parameter of netdev_linux_do_ethtool(), hence the cast. */
4234 COVERAGE_INC(netdev_get_ethtool);
4235 memset(&evalue, 0, sizeof evalue);
4236 error = netdev_linux_do_ethtool(netdev_name,
4237 (struct ethtool_cmd *)&evalue,
4238 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
/* Step 2: clear 'flag', set it again only if 'enable', and write the result
 * back with ETHTOOL_SFLAGS.  'new_flags' remembers the value requested. */
4243 COVERAGE_INC(netdev_set_ethtool);
4244 evalue.data = new_flags = (evalue.data & ~flag) | (enable ? flag : 0);
4245 error = netdev_linux_do_ethtool(netdev_name,
4246 (struct ethtool_cmd *)&evalue,
4247 ETHTOOL_SFLAGS, "ETHTOOL_SFLAGS");
/* Step 3: read the flags back to check the result. */
4252 COVERAGE_INC(netdev_get_ethtool);
4253 memset(&evalue, 0, sizeof evalue);
4254 error = netdev_linux_do_ethtool(netdev_name,
4255 (struct ethtool_cmd *)&evalue,
4256 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
/* A mismatch between the requested and the re-read flags is reported as a
 * rate-limited warning. */
4261 if (new_flags != evalue.data) {
4262 VLOG_WARN_RL(&rl, "attempt to %s ethtool %s flag on network "
4263 "device %s failed", enable ? "enable" : "disable",
4264 flag_name, netdev_name);
4271 /* Utility functions. */
4273 /* Copies 'src' into 'dst', performing format conversion in the process. */
4275 netdev_stats_from_rtnl_link_stats(struct netdev_stats *dst,
4276 const struct rtnl_link_stats *src)
/* Member-by-member copy: each 'dst' field below is assigned from the 'src'
 * field of the same name. */
4278 dst->rx_packets = src->rx_packets;
4279 dst->tx_packets = src->tx_packets;
4280 dst->rx_bytes = src->rx_bytes;
4281 dst->tx_bytes = src->tx_bytes;
4282 dst->rx_errors = src->rx_errors;
4283 dst->tx_errors = src->tx_errors;
4284 dst->rx_dropped = src->rx_dropped;
4285 dst->tx_dropped = src->tx_dropped;
4286 dst->multicast = src->multicast;
4287 dst->collisions = src->collisions;
4288 dst->rx_length_errors = src->rx_length_errors;
4289 dst->rx_over_errors = src->rx_over_errors;
4290 dst->rx_crc_errors = src->rx_crc_errors;
4291 dst->rx_frame_errors = src->rx_frame_errors;
4292 dst->rx_fifo_errors = src->rx_fifo_errors;
4293 dst->rx_missed_errors = src->rx_missed_errors;
4294 dst->tx_aborted_errors = src->tx_aborted_errors;
4295 dst->tx_carrier_errors = src->tx_carrier_errors;
4296 dst->tx_fifo_errors = src->tx_fifo_errors;
4297 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
4298 dst->tx_window_errors = src->tx_window_errors;
/* Fetches link statistics for the interface with index 'ifindex' by sending an
 * RTM_GETLINK request on 'rtnl_sock', then converts the reply's IFLA_STATS
 * attribute into '*stats' with netdev_stats_from_rtnl_link_stats(). */
4302 get_stats_via_netlink(int ifindex, struct netdev_stats *stats)
4304 /* Policy for RTNLGRP_LINK messages.
4306 * There are *many* more fields in these messages, but currently we only
4307 * care about these fields. */
4308 static const struct nl_policy rtnlgrp_link_policy[] = {
4309 [IFLA_IFNAME] = { .type = NL_A_STRING, .optional = false },
4310 [IFLA_STATS] = { .type = NL_A_UNSPEC, .optional = true,
4311 .min_len = sizeof(struct rtnl_link_stats) },
4314 struct ofpbuf request;
4315 struct ofpbuf *reply;
4316 struct ifinfomsg *ifi;
4317 struct nlattr *attrs[ARRAY_SIZE(rtnlgrp_link_policy)];
/* The request is a Netlink header followed by a zeroed struct ifinfomsg that
 * carries only the address family and the interface index.  The request
 * buffer is released as soon as the transaction returns. */
4320 ofpbuf_init(&request, 0);
4321 nl_msg_put_nlmsghdr(&request, sizeof *ifi, RTM_GETLINK, NLM_F_REQUEST);
4322 ifi = ofpbuf_put_zeros(&request, sizeof *ifi);
4323 ifi->ifi_family = PF_UNSPEC;
4324 ifi->ifi_index = ifindex;
4325 error = nl_sock_transact(rtnl_sock, &request, &reply);
4326 ofpbuf_uninit(&request);
/* The reply is deleted on the parse-failure path, on the missing-stats path,
 * and after a successful conversion. */
4331 if (!nl_policy_parse(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg),
4332 rtnlgrp_link_policy,
4333 attrs, ARRAY_SIZE(rtnlgrp_link_policy))) {
4334 ofpbuf_delete(reply);
/* IFLA_STATS is optional in the policy, so its presence is checked here. */
4338 if (!attrs[IFLA_STATS]) {
4339 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
4340 ofpbuf_delete(reply);
4344 netdev_stats_from_rtnl_link_stats(stats, nl_attr_get(attrs[IFLA_STATS]));
4346 ofpbuf_delete(reply);
/* Reads statistics for 'netdev_name' from /proc/net/dev: the file is read line
 * by line with fgets(), each line's counters are scanned into '*stats', and
 * the device name on the line is compared with 'netdev_name'.  Open failures,
 * unparsable lines and a missing device are each reported as rate-limited
 * warnings. */
4352 get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats)
4354 static const char fn[] = "/proc/net/dev";
4359 stream = fopen(fn, "r");
4361 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, ovs_strerror(errno));
4366 while (fgets(line, sizeof line, stream)) {
/* X64 scans one uint64_t counter; "%*u" reads a number and discards it.  A
 * line counts as parsed only when exactly 15 items were converted. */
4369 #define X64 "%"SCNu64
4372 X64 X64 X64 X64 X64 X64 X64 "%*u"
4373 X64 X64 X64 X64 X64 X64 X64 "%*u",
4379 &stats->rx_fifo_errors,
4380 &stats->rx_frame_errors,
4386 &stats->tx_fifo_errors,
4388 &stats->tx_carrier_errors) != 15) {
4389 VLOG_WARN_RL(&rl, "%s:%d: parse error", fn, ln);
4390 } else if (!strcmp(devname, netdev_name)) {
/* These counters are not among the scanned fields and are set to UINT64_MAX
 * for the matching device (presumably meaning "not available" -- confirm). */
4391 stats->rx_length_errors = UINT64_MAX;
4392 stats->rx_over_errors = UINT64_MAX;
4393 stats->rx_crc_errors = UINT64_MAX;
4394 stats->rx_missed_errors = UINT64_MAX;
4395 stats->tx_aborted_errors = UINT64_MAX;
4396 stats->tx_heartbeat_errors = UINT64_MAX;
4397 stats->tx_window_errors = UINT64_MAX;
4403 VLOG_WARN_RL(&rl, "%s: no stats for %s", fn, netdev_name);
/* Queries the interface flags of 'dev' with the SIOCGIFFLAGS ioctl (issued
 * through netdev_linux_do_ioctl() on 'dev->name') and stores the resulting
 * 'ifr_flags' value in '*flags'. */
4409 get_flags(const struct netdev *dev, unsigned int *flags)
4415 error = netdev_linux_do_ioctl(dev->name, &ifr, SIOCGIFFLAGS,
4418 *flags = ifr.ifr_flags;
/* Sets the interface flags of the device called 'name' to 'flags' with the
 * SIOCSIFFLAGS ioctl.  Returns whatever netdev_linux_do_ioctl() returns. */
4424 set_flags(const char *name, unsigned int flags)
4428 ifr.ifr_flags = flags;
4429 return netdev_linux_do_ioctl(name, &ifr, SIOCSIFFLAGS, "SIOCSIFFLAGS");
/* Looks up the interface index of 'netdev_name' with the SIOCGIFINDEX ioctl on
 * 'af_inet_sock' and returns 'ifr.ifr_ifindex'.  An ioctl failure is logged
 * as a rate-limited warning.
 *
 * NOTE(review): the value returned on failure is not visible in this listing;
 * get_ifindex() negates a returned value to obtain its error code, which
 * suggests a negative errno -- confirm. */
4433 do_get_ifindex(const char *netdev_name)
4437 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4438 COVERAGE_INC(netdev_get_ifindex);
4439 if (ioctl(af_inet_sock, SIOCGIFINDEX, &ifr) < 0) {
4440 VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
4441 netdev_name, ovs_strerror(errno));
4444 return ifr.ifr_ifindex;
/* Stores the interface index of 'netdev_' in '*ifindexp' and returns the
 * recorded lookup error (0 when the lookup succeeded).  The lookup is done
 * only while VALID_IFINDEX is not set in 'cache_valid'; afterwards the stored
 * index and error are reused. */
4448 get_ifindex(const struct netdev *netdev_, int *ifindexp)
4450 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4452 if (!(netdev->cache_valid & VALID_IFINDEX)) {
4453 int ifindex = do_get_ifindex(netdev_get_name(netdev_));
/* Failure case: remember the error (the negated return value) and an index
 * of 0. */
4456 netdev->get_ifindex_error = -ifindex;
4457 netdev->ifindex = 0;
/* Success case: remember the index and clear the error. */
4459 netdev->get_ifindex_error = 0;
4460 netdev->ifindex = ifindex;
/* Both outcomes mark the cache entry valid, so a failed lookup is cached as
 * well. */
4462 netdev->cache_valid |= VALID_IFINDEX;
4465 *ifindexp = netdev->ifindex;
4466 return netdev->get_ifindex_error;
/* Reads the hardware address of device 'netdev_name' with the SIOCGIFHWADDR
 * ioctl on 'af_inet_sock' and copies ETH_ADDR_LEN bytes of it into 'ea'. */
4470 get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN])
4475 memset(&ifr, 0, sizeof ifr);
4476 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4477 COVERAGE_INC(netdev_get_hwaddr);
4478 if (ioctl(af_inet_sock, SIOCGIFHWADDR, &ifr) < 0) {
4479 /* ENODEV probably means that a vif disappeared asynchronously and
4480 * hasn't been removed from the database yet, so reduce the log level
4481 * to INFO for that case. */
4482 VLOG(errno == ENODEV ? VLL_INFO : VLL_ERR,
4483 "ioctl(SIOCGIFHWADDR) on %s device failed: %s",
4484 netdev_name, ovs_strerror(errno));
/* Address families other than AF_UNSPEC and ARPHRD_ETHER only trigger a
 * warning here. */
4487 hwaddr_family = ifr.ifr_hwaddr.sa_family;
4488 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER) {
4489 VLOG_WARN("%s device has unknown hardware address family %d",
4490 netdev_name, hwaddr_family);
4492 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
/* Sets the hardware address of device 'netdev_name' to 'mac' with the
 * SIOCSIFHWADDR ioctl on 'af_inet_sock'.  An ioctl failure is logged at error
 * level. */
4497 set_etheraddr(const char *netdev_name,
4498 const uint8_t mac[ETH_ADDR_LEN])
/* The request carries the device name, the ARPHRD_ETHER address family and
 * the ETH_ADDR_LEN address bytes; everything else in 'ifr' stays zeroed. */
4502 memset(&ifr, 0, sizeof ifr);
4503 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4504 ifr.ifr_hwaddr.sa_family = ARPHRD_ETHER;
4505 memcpy(ifr.ifr_hwaddr.sa_data, mac, ETH_ADDR_LEN);
4506 COVERAGE_INC(netdev_set_hwaddr);
4507 if (ioctl(af_inet_sock, SIOCSIFHWADDR, &ifr) < 0) {
4508 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
4509 netdev_name, ovs_strerror(errno));
/* Issues the SIOCETHTOOL ioctl on 'af_inet_sock' for the device called 'name',
 * with 'ecmd' attached as the request's data pointer.  'cmd_name' is used
 * only in the log message. */
4516 netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
4517 int cmd, const char *cmd_name)
4521 memset(&ifr, 0, sizeof ifr);
4522 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
4523 ifr.ifr_data = (caddr_t) ecmd;
4526 if (ioctl(af_inet_sock, SIOCETHTOOL, &ifr) == 0) {
/* Every errno except EOPNOTSUPP produces a rate-limited warning. */
4529 if (errno != EOPNOTSUPP) {
4530 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
4531 "failed: %s", cmd_name, name, ovs_strerror(errno));
4533 /* The device doesn't support this operation. That's pretty
4534 * common, so there's no point in logging anything. */
/* Copies 'name' into 'ifr->ifr_name' and issues ioctl 'cmd' with 'ifr' on
 * 'af_inet_sock'.  A failure (ioctl returning -1) is logged at debug level,
 * rate-limited, using 'cmd_name' to identify the request. */
4541 netdev_linux_do_ioctl(const char *name, struct ifreq *ifr, int cmd,
4542 const char *cmd_name)
4544 ovs_strzcpy(ifr->ifr_name, name, sizeof ifr->ifr_name);
4545 if (ioctl(af_inet_sock, cmd, ifr) == -1) {
4546 VLOG_DBG_RL(&rl, "%s: ioctl(%s) failed: %s", name, cmd_name,
4547 ovs_strerror(errno));
/* Retrieves an IPv4 address of 'netdev' with ioctl 'cmd' (named 'cmd_name' for
 * logging) and stores it in '*ip'. */
4554 netdev_linux_get_ipv4(const struct netdev *netdev, struct in_addr *ip,
4555 int cmd, const char *cmd_name)
/* The request is marked as AF_INET before it is issued. */
4560 ifr.ifr_addr.sa_family = AF_INET;
4561 error = netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, cmd, cmd_name);
/* 'ifr_addr' is reinterpreted as a struct sockaddr_in to reach 'sin_addr'. */
4563 const struct sockaddr_in *sin = (struct sockaddr_in *) &ifr.ifr_addr;
4564 *ip = sin->sin_addr;
4569 /* Returns an AF_PACKET raw socket or a negative errno value. */
4571 af_packet_sock(void)
4573 static int sock = INT_MIN;
4575 if (sock == INT_MIN) {
4576 sock = socket(AF_PACKET, SOCK_RAW, 0);
4578 int error = set_nonblocking(sock);
4585 VLOG_ERR("failed to create packet socket: %s",
4586 ovs_strerror(errno));