2 * Copyright (c) 2009, 2010, 2011, 2012, 2013 Nicira, Inc.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
19 #include "netdev-linux.h"
23 #include <arpa/inet.h>
25 #include <linux/filter.h>
26 #include <linux/gen_stats.h>
27 #include <linux/if_ether.h>
28 #include <linux/if_tun.h>
29 #include <linux/types.h>
30 #include <linux/ethtool.h>
31 #include <linux/mii.h>
32 #include <linux/pkt_cls.h>
33 #include <linux/pkt_sched.h>
34 #include <linux/rtnetlink.h>
35 #include <linux/sockios.h>
36 #include <linux/version.h>
37 #include <sys/types.h>
38 #include <sys/ioctl.h>
39 #include <sys/socket.h>
40 #include <netpacket/packet.h>
42 #include <net/if_arp.h>
43 #include <net/if_packet.h>
44 #include <net/route.h>
45 #include <netinet/in.h>
52 #include "dpif-linux.h"
53 #include "dynamic-string.h"
54 #include "fatal-signal.h"
57 #include "netdev-provider.h"
58 #include "netdev-vport.h"
59 #include "netlink-notifier.h"
60 #include "netlink-socket.h"
63 #include "openflow/openflow.h"
65 #include "poll-loop.h"
66 #include "rtnetlink-link.h"
68 #include "socket-util.h"
71 #include "unaligned.h"
74 VLOG_DEFINE_THIS_MODULE(netdev_linux);
76 COVERAGE_DEFINE(netdev_set_policing);
77 COVERAGE_DEFINE(netdev_arp_lookup);
78 COVERAGE_DEFINE(netdev_get_ifindex);
79 COVERAGE_DEFINE(netdev_get_hwaddr);
80 COVERAGE_DEFINE(netdev_set_hwaddr);
81 COVERAGE_DEFINE(netdev_get_ethtool);
82 COVERAGE_DEFINE(netdev_set_ethtool);
85 /* These were introduced in Linux 2.6.14, so they might be missing if we have
87 #ifndef ADVERTISED_Pause
88 #define ADVERTISED_Pause (1 << 13)
90 #ifndef ADVERTISED_Asym_Pause
91 #define ADVERTISED_Asym_Pause (1 << 14)
94 /* These were introduced in Linux 2.6.24, so they might be missing if we
95 * have old headers. */
96 #ifndef ETHTOOL_GFLAGS
97 #define ETHTOOL_GFLAGS 0x00000025 /* Get flags bitmap(ethtool_value) */
99 #ifndef ETHTOOL_SFLAGS
100 #define ETHTOOL_SFLAGS 0x00000026 /* Set flags bitmap(ethtool_value) */
103 /* This was introduced in Linux 2.6.25, so it might be missing if we have old
106 #define TC_RTAB_SIZE 1024
/* Notifier that delivers rtnetlink link changes to netdev_linux_cache_cb(),
 * and the number of references currently held on it.  cache_notifier_ref()
 * creates the notifier when the count is zero; cache_notifier_unref()
 * destroys it when the count drops back to zero. */
109 static struct nln_notifier *netdev_linux_cache_notifier = NULL;
110 static int cache_notifier_refcount;
/* Bits for 'cache_valid' in struct netdev_linux.  A set bit means that the
 * corresponding cached value (or cached error code) may be used without
 * asking the kernel again. */
113 VALID_IFINDEX = 1 << 0,
114 VALID_ETHERADDR = 1 << 1,
118 VALID_POLICING = 1 << 5,
119 VALID_VPORT_STAT_ERROR = 1 << 6,
120 VALID_DRVINFO = 1 << 7,
121 VALID_FEATURES = 1 << 8,
128 /* Traffic control. */
130 /* An instance of a traffic control class. Always associated with a particular
133 * Each TC implementation subclasses this with whatever additional data it
136 const struct tc_ops *ops;
137 struct hmap queues; /* Contains "struct tc_queue"s.
138 * Read by generic TC layer.
139 * Written only by TC implementation. */
142 #define TC_INITIALIZER(TC, OPS) { OPS, HMAP_INITIALIZER(&(TC)->queues) }
144 /* One traffic control queue.
146 * Each TC implementation subclasses this with whatever additional data it
149 struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
150 unsigned int queue_id; /* OpenFlow queue ID. */
153 /* A particular kind of traffic control. Each implementation generally maps to
154 * one particular Linux qdisc class.
156 * The functions below return 0 if successful or a positive errno value on
157 * failure, except where otherwise noted. All of them must be provided, except
158 * where otherwise noted. */
160 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
161 * This is null for tc_ops_default and tc_ops_other, for which there are no
162 * appropriate values. */
163 const char *linux_name;
165 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
166 const char *ovs_name;
168 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
169 * queues. The queues are numbered 0 through n_queues - 1. */
170 unsigned int n_queues;
172 /* Called to install this TC class on 'netdev'. The implementation should
173 * make the Netlink calls required to set up 'netdev' with the right qdisc
174 * and configure it according to 'details'. The implementation may assume
175 * that the current qdisc is the default; that is, there is no need for it
176 * to delete the current qdisc before installing itself.
178 * The contents of 'details' should be documented as valid for 'ovs_name'
179 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
180 * (which is built as ovs-vswitchd.conf.db(8)).
182 * This function must return 0 if and only if it sets 'netdev->tc' to an
183 * initialized 'struct tc'.
185 * (This function is null for tc_ops_other, which cannot be installed. For
186 * other TC classes it should always be nonnull.) */
187 int (*tc_install)(struct netdev *netdev, const struct smap *details);
189 /* Called when the netdev code determines (through a Netlink query) that
190 * this TC class's qdisc is installed on 'netdev', but we didn't install
191 * it ourselves and so don't know any of the details.
193 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
194 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
195 * implementation should parse the other attributes of 'nlmsg' as
196 * necessary to determine its configuration. If necessary it should also
197 * use Netlink queries to determine the configuration of queues on
200 * This function must return 0 if and only if it sets 'netdev->tc' to an
201 * initialized 'struct tc'. */
202 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
204 /* Destroys the data structures allocated by the implementation as part of
205 * 'tc'. (This includes destroying 'tc->queues' by calling
208 * The implementation should not need to perform any Netlink calls. If
209 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
210 * (But it may not be desirable.)
212 * This function may be null if 'tc' is trivial. */
213 void (*tc_destroy)(struct tc *tc);
215 /* Retrieves details of 'netdev->tc' configuration into 'details'.
217 * The implementation should not need to perform any Netlink calls, because
218 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
219 * cached the configuration.
221 * The contents of 'details' should be documented as valid for 'ovs_name'
222 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
223 * (which is built as ovs-vswitchd.conf.db(8)).
225 * This function may be null if 'tc' is not configurable.
227 int (*qdisc_get)(const struct netdev *netdev, struct smap *details);
229 /* Reconfigures 'netdev->tc' according to 'details', performing any
230 * required Netlink calls to complete the reconfiguration.
232 * The contents of 'details' should be documented as valid for 'ovs_name'
233 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
234 * (which is built as ovs-vswitchd.conf.db(8)).
236 * This function may be null if 'tc' is not configurable.
238 int (*qdisc_set)(struct netdev *, const struct smap *details);
240 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
241 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
243 * The contents of 'details' should be documented as valid for 'ovs_name'
244 * in the "other_config" column in the "Queue" table in
245 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
247 * The implementation should not need to perform any Netlink calls, because
248 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
249 * cached the queue configuration.
251 * This function may be null if 'tc' does not have queues ('n_queues' is
253 int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
254 struct smap *details);
256 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
257 * 'details', performing any required Netlink calls to complete the
258 * reconfiguration. The caller ensures that 'queue_id' is less than
261 * The contents of 'details' should be documented as valid for 'ovs_name'
262 * in the "other_config" column in the "Queue" table in
263 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
265 * This function may be null if 'tc' does not have queues or its queues are
266 * not configurable. */
267 int (*class_set)(struct netdev *, unsigned int queue_id,
268 const struct smap *details);
270 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
271 * tc_queue's within 'netdev->tc->queues'.
273 * This function may be null if 'tc' does not have queues or its queues
274 * cannot be deleted. */
275 int (*class_delete)(struct netdev *, struct tc_queue *queue);
277 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
278 * 'struct tc_queue's within 'netdev->tc->queues'.
280 * On success, initializes '*stats'.
282 * This function may be null if 'tc' does not have queues or if it cannot
283 * report queue statistics. */
284 int (*class_get_stats)(const struct netdev *netdev,
285 const struct tc_queue *queue,
286 struct netdev_queue_stats *stats);
288 /* Extracts queue stats from 'nlmsg', which is a response to a
289 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
291 * This function may be null if 'tc' does not have queues or if it cannot
292 * report queue statistics. */
293 int (*class_dump_stats)(const struct netdev *netdev,
294 const struct ofpbuf *nlmsg,
295 netdev_dump_queue_stats_cb *cb, void *aux);
/* Initializes the generic 'struct tc' state in 'tc' for an implementation
 * described by 'ops'.  In particular, 'tc->queues' starts out as an empty
 * hmap. */
299 tc_init(struct tc *tc, const struct tc_ops *ops)
302 hmap_init(&tc->queues);
/* Releases the generic 'struct tc' state in 'tc' by destroying the
 * 'tc->queues' hmap. */
306 tc_destroy(struct tc *tc)
308 hmap_destroy(&tc->queues);
/* Declarations of the TC implementations, made ahead of 'tcs' so that the
 * table below can reference them. */
311 static const struct tc_ops tc_ops_htb;
312 static const struct tc_ops tc_ops_hfsc;
313 static const struct tc_ops tc_ops_default;
314 static const struct tc_ops tc_ops_other;
/* Table of the TC implementations known to this file. */
316 static const struct tc_ops *const tcs[] = {
317 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
318 &tc_ops_hfsc, /* Hierarchical fair service curve. */
319 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
320 &tc_ops_other, /* Some other qdisc. */
324 static unsigned int tc_make_handle(unsigned int major, unsigned int minor);
325 static unsigned int tc_get_major(unsigned int handle);
326 static unsigned int tc_get_minor(unsigned int handle);
328 static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
329 static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
330 static unsigned int tc_buffer_per_jiffy(unsigned int rate);
332 static struct tcmsg *tc_make_request(const struct netdev *, int type,
333 unsigned int flags, struct ofpbuf *);
334 static int tc_transact(struct ofpbuf *request, struct ofpbuf **replyp);
335 static int tc_add_del_ingress_qdisc(struct netdev *netdev, bool add);
336 static int tc_add_policer(struct netdev *netdev, int kbits_rate,
339 static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
340 struct nlattr **options);
341 static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
342 struct nlattr **options,
343 struct netdev_queue_stats *);
344 static int tc_query_class(const struct netdev *,
345 unsigned int handle, unsigned int parent,
346 struct ofpbuf **replyp);
347 static int tc_delete_class(const struct netdev *, unsigned int handle);
349 static int tc_del_qdisc(struct netdev *netdev);
350 static int tc_query_qdisc(const struct netdev *netdev);
352 static int tc_calc_cell_log(unsigned int mtu);
353 static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
354 static void tc_put_rtab(struct ofpbuf *, uint16_t type,
355 const struct tc_ratespec *rate);
356 static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
/* Linux-specific netdev state: cache bookkeeping, MII link-monitoring state,
 * values cached from the kernel together with their cached error codes, and
 * tap-device state. */
358 struct netdev_linux {
361 struct shash_node *shash_node;
/* Bitmap of VALID_* bits; see the "on demand" comment below. */
362 unsigned int cache_valid;
363 unsigned int change_seq;
365 bool miimon; /* Link status of last poll. */
366 long long int miimon_interval; /* Miimon Poll rate. Disabled if <= 0. */
/* Expires when the next MII poll is due (see netdev_linux_miimon_run()). */
367 struct timer miimon_timer;
369 /* The following are figured out "on demand" only. They are only valid
370 * when the corresponding VALID_* bit in 'cache_valid' is set. */
372 uint8_t etheraddr[ETH_ADDR_LEN];
373 struct in_addr address, netmask;
/* Most recently recorded interface flags.  IFF_RUNNING is reported as the
 * carrier state when miimon is disabled. */
376 unsigned int ifi_flags;
/* Incremented by netdev_linux_changed() whenever IFF_RUNNING toggles. */
377 long long int carrier_resets;
378 uint32_t kbits_rate; /* Policing data. */
379 uint32_t kbits_burst;
380 int vport_stats_error; /* Cached error code from vport_get_stats().
381 0 or an errno value. */
382 int netdev_mtu_error; /* Cached error code from SIOCGIFMTU or SIOCSIFMTU. */
383 int ether_addr_error; /* Cached error code from set/get etheraddr. */
384 int netdev_policing_error; /* Cached error code from set policing. */
385 int get_features_error; /* Cached error code from ETHTOOL_GSET. */
386 int get_ifindex_error; /* Cached error code from SIOCGIFINDEX. */
388 enum netdev_features current; /* Cached from ETHTOOL_GSET. */
389 enum netdev_features advertised; /* Cached from ETHTOOL_GSET. */
390 enum netdev_features supported; /* Cached from ETHTOOL_GSET. */
391 enum netdev_features peer; /* Cached from ETHTOOL_GSET. */
393 struct ethtool_drvinfo drvinfo; /* Cached from ETHTOOL_GDRVINFO. */
/* Tap-specific state; its 'fd' is the descriptor used to read from and
 * write to a tap device. */
397 struct tap_state tap;
/* Receive handle for the netdevs in this file.  The receive functions below
 * reach the generic handle through 'up' and perform their I/O on 'fd'. */
401 struct netdev_rx_linux {
/* Receive class used for every handle created by netdev_linux_rx_open(). */
407 static const struct netdev_rx_class netdev_rx_linux_class;
409 /* Sockets used for ioctl operations. */
/* Created by netdev_linux_init(); -1 until then. */
410 static int af_inet_sock = -1; /* AF_INET, SOCK_DGRAM. */
412 /* A Netlink routing socket that is not subscribed to any multicast groups. */
/* Created by netdev_linux_init() with nl_sock_create(NETLINK_ROUTE, ...). */
413 static struct nl_sock *rtnl_sock;
415 /* This is set pretty low because we probably won't learn anything from the
416 * additional log messages. */
/* Shared by the rate-limited (*_RL) log calls in this file. */
417 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
419 static int netdev_linux_init(void);
421 static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
422 int cmd, const char *cmd_name);
423 static int netdev_linux_do_ioctl(const char *name, struct ifreq *, int cmd,
424 const char *cmd_name);
425 static int netdev_linux_get_ipv4(const struct netdev *, struct in_addr *,
426 int cmd, const char *cmd_name);
427 static int get_flags(const struct netdev *, unsigned int *flags);
428 static int set_flags(const char *, unsigned int flags);
429 static int do_get_ifindex(const char *netdev_name);
430 static int get_ifindex(const struct netdev *, int *ifindexp);
431 static int do_set_addr(struct netdev *netdev,
432 int ioctl_nr, const char *ioctl_name,
433 struct in_addr addr);
434 static int get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN]);
435 static int set_etheraddr(const char *netdev_name, const uint8_t[ETH_ADDR_LEN]);
436 static int get_stats_via_netlink(int ifindex, struct netdev_stats *stats);
437 static int get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats);
438 static int af_packet_sock(void);
439 static void netdev_linux_miimon_run(void);
440 static void netdev_linux_miimon_wait(void);
/* Returns true if 'netdev_class' uses netdev_linux_init() as its 'init'
 * function.  This is the test netdev_linux_cast() relies on before treating a
 * netdev as a struct netdev_linux. */
443 is_netdev_linux_class(const struct netdev_class *netdev_class)
445 return netdev_class->init == netdev_linux_init;
/* Returns true if the class of 'netdev' is netdev_tap_class. */
449 is_tap_netdev(const struct netdev *netdev)
451 return netdev_get_class(netdev) == &netdev_tap_class;
/* Converts generic 'netdev' into the struct netdev_linux that contains it as
 * member 'up'.  Asserts that the class of 'netdev' passes
 * is_netdev_linux_class(). */
454 static struct netdev_linux *
455 netdev_linux_cast(const struct netdev *netdev)
457 ovs_assert(is_netdev_linux_class(netdev_get_class(netdev)));
459 return CONTAINER_OF(netdev, struct netdev_linux, up);
/* Converts generic 'rx' into the struct netdev_rx_linux that contains it as
 * member 'up', after checking that 'rx' belongs to netdev_rx_linux_class. */
462 static struct netdev_rx_linux *
463 netdev_rx_linux_cast(const struct netdev_rx *rx)
465 netdev_rx_assert_class(rx, &netdev_rx_linux_class);
466 return CONTAINER_OF(rx, struct netdev_rx_linux, up);
/* Creates the sockets shared by the netdevs in this file: 'af_inet_sock' (an
 * AF_INET datagram socket) and 'rtnl_sock' (a NETLINK_ROUTE socket).  The
 * outcome of each step is kept in the function-static 'status', which starts
 * out as -1, and each failure is logged.
 *
 * NOTE(review): presumably the setup runs only once and the function returns
 * 'status' (0 or a positive errno value) -- confirm against the full body. */
470 netdev_linux_init(void)
472 static int status = -1;
474 /* Create AF_INET socket. */
475 af_inet_sock = socket(AF_INET, SOCK_DGRAM, 0);
476 status = af_inet_sock >= 0 ? 0 : errno;
478 VLOG_ERR("failed to create inet socket: %s", ovs_strerror(status));
481 /* Create rtnetlink socket. */
483 status = nl_sock_create(NETLINK_ROUTE, &rtnl_sock);
485 VLOG_ERR_RL(&rl, "failed to create rtnetlink socket: %s",
486 ovs_strerror(status));
/* Periodic work for this file: runs the rtnetlink link machinery, then polls
 * MII link status for the devices that have miimon enabled. */
494 netdev_linux_run(void)
496 rtnetlink_link_run();
497 netdev_linux_miimon_run();
/* Counterpart of netdev_linux_run(): registers the wakeups of the rtnetlink
 * link machinery and of the per-device miimon timers. */
501 netdev_linux_wait(void)
503 rtnetlink_link_wait();
504 netdev_linux_miimon_wait();
/* Records a change to 'dev'.
 *
 * 'ifi_flags' becomes the device's current interface flags; if its
 * IFF_RUNNING bit differs from the previously recorded one, the carrier reset
 * counter is incremented.  'mask' selects the VALID_* bits of
 * 'dev->cache_valid' that survive the change; every other bit is cleared, so
 * a mask of 0 invalidates the whole cache. */
508 netdev_linux_changed(struct netdev_linux *dev,
509 unsigned int ifi_flags, unsigned int mask)
512 if (!dev->change_seq) {
/* XOR exposes the flag bits that differ between old and new state. */
516 if ((dev->ifi_flags ^ ifi_flags) & IFF_RUNNING) {
517 dev->carrier_resets++;
519 dev->ifi_flags = ifi_flags;
521 dev->cache_valid &= mask;
/* Applies rtnetlink notification 'change' to 'dev'.
 *
 * For RTM_NEWLINK, every cached value except the one guarded by VALID_DRVINFO
 * is invalidated first.  Then the MTU, the Ethernet address (when it is
 * nonzero) and the ifindex carried by 'change' are stored as fresh cache
 * entries with their cached error codes cleared.
 *
 * The last netdev_linux_changed() call, with a mask of 0, drops the whole
 * cache.  NOTE(review): presumably that is the branch taken for other
 * message types -- confirm. */
525 netdev_linux_update(struct netdev_linux *dev,
526 const struct rtnetlink_link_change *change)
528 if (change->nlmsg_type == RTM_NEWLINK) {
530 netdev_linux_changed(dev, change->ifi_flags, VALID_DRVINFO);
532 /* Update netdev from rtnl-change msg. */
534 dev->mtu = change->mtu;
535 dev->cache_valid |= VALID_MTU;
536 dev->netdev_mtu_error = 0;
539 if (!eth_addr_is_zero(change->addr)) {
540 memcpy(dev->etheraddr, change->addr, ETH_ADDR_LEN);
541 dev->cache_valid |= VALID_ETHERADDR;
542 dev->ether_addr_error = 0;
545 dev->ifindex = change->ifi_index;
546 dev->cache_valid |= VALID_IFINDEX;
547 dev->get_ifindex_error = 0;
550 netdev_linux_changed(dev, change->ifi_flags, 0);
/* Callback for the rtnetlink link notifier created by cache_notifier_ref().
 * 'aux' is unused.
 *
 * The first part handles a change for a named interface: if a netdev with
 * that name exists and belongs to this file, it is updated from 'change'.
 * The second part walks every netdev_linux_class device, re-reads its flags
 * and invalidates its entire cache (mask 0).
 *
 * NOTE(review): presumably the second part runs when 'change' is null, i.e.
 * when no specific change is available -- confirm. */
555 netdev_linux_cache_cb(const struct rtnetlink_link_change *change,
556 void *aux OVS_UNUSED)
558 struct netdev_linux *dev;
560 struct netdev *base_dev = netdev_from_name(change->ifname);
561 if (base_dev && is_netdev_linux_class(netdev_get_class(base_dev))) {
562 netdev_linux_update(netdev_linux_cast(base_dev), change);
565 struct shash device_shash;
566 struct shash_node *node;
568 shash_init(&device_shash);
569 netdev_get_devices(&netdev_linux_class, &device_shash);
570 SHASH_FOR_EACH (node, &device_shash) {
/* The result of get_flags() is not checked here. */
575 get_flags(&dev->up, &flags);
576 netdev_linux_changed(dev, flags, 0);
578 shash_destroy(&device_shash);
/* Takes a reference on the shared link notifier.  If no references exist yet,
 * the notifier is created first, with netdev_linux_cache_cb() as its
 * callback.  Callers store the return value as an error code. */
583 cache_notifier_ref(void)
585 if (!cache_notifier_refcount) {
586 ovs_assert(!netdev_linux_cache_notifier);
588 netdev_linux_cache_notifier =
589 rtnetlink_link_notifier_create(netdev_linux_cache_cb, NULL);
591 if (!netdev_linux_cache_notifier) {
595 cache_notifier_refcount++;
/* Drops one reference on the shared link notifier; at least one must be
 * held.  When the count reaches zero the notifier is destroyed and its
 * pointer is reset to NULL so that a later cache_notifier_ref() starts
 * afresh. */
601 cache_notifier_unref(void)
603 ovs_assert(cache_notifier_refcount > 0);
604 if (!--cache_notifier_refcount) {
605 ovs_assert(netdev_linux_cache_notifier);
606 rtnetlink_link_notifier_destroy(netdev_linux_cache_notifier);
607 netdev_linux_cache_notifier = NULL;
611 /* Creates system and internal devices. */
/* On success, '*netdevp' points at the generic part of a newly allocated,
 * zero-filled struct netdev_linux named 'name' of class 'class'.  A reference
 * on the shared link notifier is taken for the new netdev. */
613 netdev_linux_create(const struct netdev_class *class, const char *name,
614 struct netdev **netdevp)
616 struct netdev_linux *netdev;
619 error = cache_notifier_ref();
624 netdev = xzalloc(sizeof *netdev);
625 netdev->change_seq = 1;
626 netdev_init(&netdev->up, name, class);
/* Reading the flags doubles as an existence check for the device. */
627 error = get_flags(&netdev->up, &netdev->ifi_flags);
628 if (error == ENODEV) {
629 if (class != &netdev_internal_class) {
630 /* The device does not exist, so don't allow it to be opened. */
631 netdev_uninit(&netdev->up, false);
/* Give back the notifier reference taken above. */
632 cache_notifier_unref();
636 /* "Internal" netdevs have to be created as netdev objects before
637 * they exist in the kernel, because creating them in the kernel
638 * happens by passing a netdev object to dpif_port_add().
639 * Therefore, ignore the error. */
643 *netdevp = &netdev->up;
647 /* For most types of netdevs we open the device for each call of
648 * netdev_open(). However, this is not the case with tap devices,
649 * since it is only possible to open the device once. In this
650 * situation we share a single file descriptor, and consequently
651 * buffers, across all readers. Therefore once data is read it will
652 * be unavailable to other reads for tap devices. */
/* Creates tap device 'name': opens /dev/net/tun, attaches the descriptor to
 * a tap interface with that name via TUNSETIFF (IFF_TAP | IFF_NO_PI), and
 * makes the descriptor non-blocking.  On success, '*netdevp' points at the
 * new netdev, which always has class netdev_tap_class ('class' is unused).
 *
 * NOTE(review): the failure paths after open() succeeds jump to
 * 'error_unref_notifier'; confirm that 'state->fd' is closed on those paths,
 * otherwise the descriptor leaks. */
654 netdev_linux_create_tap(const struct netdev_class *class OVS_UNUSED,
655 const char *name, struct netdev **netdevp)
657 struct netdev_linux *netdev;
658 struct tap_state *state;
659 static const char tap_dev[] = "/dev/net/tun";
663 netdev = xzalloc(sizeof *netdev);
664 state = &netdev->state.tap;
666 error = cache_notifier_ref();
671 /* Open tap device. */
672 state->fd = open(tap_dev, O_RDWR);
675 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, ovs_strerror(error));
676 goto error_unref_notifier;
679 /* Create tap device. */
680 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
681 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
682 if (ioctl(state->fd, TUNSETIFF, &ifr) == -1) {
683 VLOG_WARN("%s: creating tap device failed: %s", name,
684 ovs_strerror(errno));
686 goto error_unref_notifier;
689 /* Make non-blocking. */
690 error = set_nonblocking(state->fd);
692 goto error_unref_notifier;
695 netdev_init(&netdev->up, name, &netdev_tap_class);
696 *netdevp = &netdev->up;
/* Error exit: gives back the notifier reference taken above. */
699 error_unref_notifier:
700 cache_notifier_unref();
/* Tears down the tap-specific state of 'netdev'.  Acts only when the tap
 * descriptor is valid (>= 0).  NOTE(review): presumably closes that
 * descriptor -- confirm. */
707 destroy_tap(struct netdev_linux *netdev)
709 struct tap_state *state = &netdev->state.tap;
711 if (state->fd >= 0) {
716 /* Destroys the netdev device 'netdev_'. */
718 netdev_linux_destroy(struct netdev *netdev_)
720 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
/* 'tc_destroy' is optional for TC implementations, hence the null check. */
722 if (netdev->tc && netdev->tc->ops->tc_destroy) {
723 netdev->tc->ops->tc_destroy(netdev->tc);
/* Tap devices carry extra state that needs its own teardown. */
726 if (netdev_get_class(netdev_) == &netdev_tap_class) {
/* Gives back the notifier reference taken when the netdev was created. */
731 cache_notifier_unref();
/* Opens a receive handle for 'netdev_'.
 *
 * Tap devices use the descriptor already held in the netdev's tap state.
 * Other devices get a new PF_PACKET raw socket that is made non-blocking,
 * bound to the device's ifindex for all protocols (ETH_P_ALL), and given a
 * socket filter so that only inbound packets are delivered.
 *
 * The handle is a newly allocated struct netdev_rx_linux initialized with
 * netdev_rx_linux_class. */
735 netdev_linux_rx_open(struct netdev *netdev_, struct netdev_rx **rxp)
737 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
738 bool is_tap = is_tap_netdev(netdev_);
739 struct netdev_rx_linux *rx;
744 fd = netdev->state.tap.fd;
746 struct sockaddr_ll sll;
748 /* Result of tcpdump -dd inbound */
/* NOTE(review): 0xfffff004 looks like SKF_AD_OFF + SKF_AD_PKTTYPE (the
 * packet type) and 4 like PACKET_OUTGOING, so outgoing packets are dropped
 * ("ret #0") and everything else is accepted -- confirm against the Linux
 * socket filter documentation. */
749 static struct sock_filter filt[] = {
750 { 0x28, 0, 0, 0xfffff004 }, /* ldh [0] */
751 { 0x15, 0, 1, 0x00000004 }, /* jeq #4 jt 2 jf 3 */
752 { 0x6, 0, 0, 0x00000000 }, /* ret #0 */
753 { 0x6, 0, 0, 0x0000ffff } /* ret #65535 */
755 static struct sock_fprog fprog = { ARRAY_SIZE(filt), filt };
757 /* Create file descriptor. */
758 fd = socket(PF_PACKET, SOCK_RAW, 0);
761 VLOG_ERR("failed to create raw socket (%s)", ovs_strerror(error));
765 /* Set non-blocking mode. */
766 error = set_nonblocking(fd);
771 /* Get ethernet device index. */
772 error = get_ifindex(&netdev->up, &ifindex);
777 /* Bind to specific ethernet device. */
778 memset(&sll, 0, sizeof sll);
779 sll.sll_family = AF_PACKET;
780 sll.sll_ifindex = ifindex;
781 sll.sll_protocol = (OVS_FORCE unsigned short int) htons(ETH_P_ALL);
782 if (bind(fd, (struct sockaddr *) &sll, sizeof sll) < 0) {
784 VLOG_ERR("%s: failed to bind raw socket (%s)",
785 netdev_get_name(netdev_), ovs_strerror(error));
789 /* Filter for only inbound packets. */
790 error = setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER, &fprog,
794 VLOG_ERR("%s: failed attach filter (%s)",
795 netdev_get_name(netdev_), ovs_strerror(error));
800 rx = xmalloc(sizeof *rx);
801 netdev_rx_init(&rx->up, netdev_, &netdev_rx_linux_class);
/* Destroys receive handle 'rx_', which must belong to
 * netdev_rx_linux_class (checked by the cast). */
816 netdev_rx_linux_destroy(struct netdev_rx *rx_)
818 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
/* Receives one packet from 'rx_' into the 'size'-byte buffer at 'data',
 * using read() or recv() with MSG_TRUNC on the handle's descriptor and
 * retrying for as long as the call is interrupted (EINTR).
 *
 * On success, returns the number of bytes received, or -EMSGSIZE if that
 * number exceeds 'size'.  Failures other than EAGAIN are logged as
 * rate-limited warnings. */
827 netdev_rx_linux_recv(struct netdev_rx *rx_, void *data, size_t size)
829 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
834 ? read(rx->fd, data, size)
835 : recv(rx->fd, data, size, MSG_TRUNC));
836 } while (retval < 0 && errno == EINTR);
839 return retval > size ? -EMSGSIZE : retval;
841 if (errno != EAGAIN) {
/* The format names the device first and the error second, so the arguments
 * must be passed in that order. */
842 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
843 netdev_rx_get_name(rx_), ovs_strerror(errno));
/* Registers with the poll loop to wake up when the descriptor of 'rx_' has
 * input pending (POLLIN). */
850 netdev_rx_linux_wait(struct netdev_rx *rx_)
852 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
853 poll_fd_wait(rx->fd, POLLIN);
/* Discards the packets waiting on 'rx_'.
 *
 * One path asks the kernel for the device's transmit queue length
 * (SIOCGIFTXQLEN) and drains that many packets from the descriptor; the other
 * drains the descriptor's receive buffer.  NOTE(review): presumably the first
 * path is for tap handles and the second for packet sockets -- confirm. */
857 netdev_rx_linux_drain(struct netdev_rx *rx_)
859 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
862 int error = netdev_linux_do_ioctl(netdev_rx_get_name(rx_), &ifr,
863 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
867 drain_fd(rx->fd, ifr.ifr_qlen);
870 return drain_rcvbuf(rx->fd);
874 /* Sends 'buffer' on 'netdev'. Returns 0 if successful, otherwise a positive
875 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
876 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
877 * the packet is too big or too small to transmit on the device.
879 * The caller retains ownership of 'buffer' in all cases.
881 * The kernel maintains a packet transmission queue, so the caller is not
882 * expected to do additional queuing of packets. */
/* The packet is the 'size' bytes at 'data'.  Devices other than taps are
 * sent to through the shared AF_PACKET socket, addressed by ifindex; tap
 * devices are written to directly through their tap descriptor. */
884 netdev_linux_send(struct netdev *netdev_, const void *data, size_t size)
889 if (!is_tap_netdev(netdev_)) {
890 /* Use our AF_PACKET socket to send to this device. */
891 struct sockaddr_ll sll;
898 sock = af_packet_sock();
903 error = get_ifindex(netdev_, &ifindex);
908 /* We don't bother setting most fields in sockaddr_ll because the
909 * kernel ignores them for SOCK_RAW. */
910 memset(&sll, 0, sizeof sll);
911 sll.sll_family = AF_PACKET;
912 sll.sll_ifindex = ifindex;
/* 'data' is const for callers, but the iovec member is not, hence the
 * cast. */
914 iov.iov_base = CONST_CAST(void *, data);
918 msg.msg_namelen = sizeof sll;
921 msg.msg_control = NULL;
922 msg.msg_controllen = 0;
925 retval = sendmsg(sock, &msg, 0);
927 /* Use the tap fd to send to this device. This is essential for
928 * tap devices, because packets sent to a tap device with an
929 * AF_PACKET socket will loop back to be *received* again on the
930 * tap device. This doesn't occur on other interface types
931 * because we attach a socket filter to the rx socket. */
932 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
934 retval = write(netdev->state.tap.fd, data, size);
938 /* The Linux AF_PACKET implementation never blocks waiting for room
939 * for packets, instead returning ENOBUFS. Translate this into
940 * EAGAIN for the caller. */
941 if (errno == ENOBUFS) {
943 } else if (errno == EINTR) {
/* EAGAIN is an expected outcome and is not logged. */
945 } else if (errno != EAGAIN) {
946 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
947 netdev_get_name(netdev_), ovs_strerror(errno));
/* A short write counts as a partial packet. */
950 } else if (retval != size) {
951 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%zd bytes of "
952 "%zu) on %s", retval, size, netdev_get_name(netdev_));
960 /* Registers with the poll loop to wake up from the next call to poll_block()
961 * when the packet transmission queue has sufficient room to transmit a packet
962 * with netdev_send().
964 * The kernel maintains a packet transmission queue, so the client is not
965 * expected to do additional queuing of packets. Thus, this function is
966 * unlikely to ever be used. It is included for completeness. */
968 netdev_linux_send_wait(struct netdev *netdev)
970 if (is_tap_netdev(netdev)) {
971 /* TAP device always accepts packets, so wake up right away. */
972 poll_immediate_wake();
976 /* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
977 * otherwise a positive errno value. */
979 netdev_linux_set_etheraddr(struct netdev *netdev_,
980 const uint8_t mac[ETH_ADDR_LEN])
982 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
983 struct netdev_saved_flags *sf = NULL;
/* With a valid cache entry, a cached error is returned as is and an
 * unchanged address needs no kernel call.  Otherwise the entry is
 * invalidated before the address is changed. */
986 if (netdev->cache_valid & VALID_ETHERADDR) {
987 if (netdev->ether_addr_error) {
988 return netdev->ether_addr_error;
990 if (eth_addr_equals(netdev->etheraddr, mac)) {
993 netdev->cache_valid &= ~VALID_ETHERADDR;
996 /* Tap devices must be brought down before setting the address. */
997 if (is_tap_netdev(netdev_)) {
998 netdev_turn_flags_off(netdev_, NETDEV_UP, &sf);
1000 error = set_etheraddr(netdev_get_name(netdev_), mac);
/* Only success and ENODEV are cached; other errors leave the cache entry
 * invalid. */
1001 if (!error || error == ENODEV) {
1002 netdev->ether_addr_error = error;
1003 netdev->cache_valid |= VALID_ETHERADDR;
1005 memcpy(netdev->etheraddr, mac, ETH_ADDR_LEN);
/* Puts back the flags saved in 'sf' when the tap device was brought down. */
1009 netdev_restore_flags(sf);
1014 /* Copies 'netdev''s MAC address to 'mac'.  Returns 0 if successful, otherwise the cached errno value, in which case 'mac' is left untouched. */
1016 netdev_linux_get_etheraddr(const struct netdev *netdev_,
1017 uint8_t mac[ETH_ADDR_LEN])
1019 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
/* Query the kernel only when no cached result exists.  The error code is
 * cached along with the address. */
1021 if (!(netdev->cache_valid & VALID_ETHERADDR)) {
1022 int error = get_etheraddr(netdev_get_name(netdev_),
1025 netdev->ether_addr_error = error;
1026 netdev->cache_valid |= VALID_ETHERADDR;
1029 if (!netdev->ether_addr_error) {
1030 memcpy(mac, netdev->etheraddr, ETH_ADDR_LEN);
1033 return netdev->ether_addr_error;
1036 /* Stores in '*mtup' the maximum size of transmitted (and received) packets on
1037 * 'netdev', in bytes, not including the hardware header; thus, this is
1038 * typically 1500 bytes for Ethernet devices.  Returns 0 if successful, otherwise the cached errno value, in which case '*mtup' is left untouched. */
1040 netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
1042 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
/* Query the kernel (SIOCGIFMTU) only when no cached result exists.  The
 * error code is cached along with the value. */
1043 if (!(netdev->cache_valid & VALID_MTU)) {
1047 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
1048 SIOCGIFMTU, "SIOCGIFMTU");
1050 netdev->netdev_mtu_error = error;
1051 netdev->mtu = ifr.ifr_mtu;
1052 netdev->cache_valid |= VALID_MTU;
1055 if (!netdev->netdev_mtu_error) {
1056 *mtup = netdev->mtu;
1058 return netdev->netdev_mtu_error;
1061 /* Sets the maximum transmission unit (MTU) of the given device using the
1062 * Linux networking ioctl interface.
1065 netdev_linux_set_mtu(const struct netdev *netdev_, int mtu)
1067 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
/* With a valid cache entry, a cached error is returned as is and an
 * unchanged MTU needs no kernel call.  Otherwise the entry is invalidated
 * before the MTU is changed. */
1071 if (netdev->cache_valid & VALID_MTU) {
1072 if (netdev->netdev_mtu_error) {
1073 return netdev->netdev_mtu_error;
1075 if (netdev->mtu == mtu) {
1078 netdev->cache_valid &= ~VALID_MTU;
1081 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
1082 SIOCSIFMTU, "SIOCSIFMTU");
/* Only success and ENODEV are cached; other errors leave the cache entry
 * invalid. */
1083 if (!error || error == ENODEV) {
1084 netdev->netdev_mtu_error = error;
1085 netdev->mtu = ifr.ifr_mtu;
1086 netdev->cache_valid |= VALID_MTU;
1091 /* Returns the ifindex of 'netdev', if successful, as a positive number.
1092 * On failure, returns a negative errno value. */
1094 netdev_linux_get_ifindex(const struct netdev *netdev)
1098 error = get_ifindex(netdev, &ifindex);
/* Negate the error code to match this function's negative-errno contract. */
1099 return error ? -error : ifindex;
/* Stores the carrier state of 'netdev_' in '*carrier'.  With miimon enabled
 * (interval > 0) this is the link status from the last MII poll; the other
 * assignment derives it from IFF_RUNNING in the recorded interface flags. */
1103 netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
1105 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1107 if (netdev->miimon_interval > 0) {
1108 *carrier = netdev->miimon;
1110 *carrier = (netdev->ifi_flags & IFF_RUNNING) != 0;
/* Returns the carrier reset counter of 'netdev', which
 * netdev_linux_changed() increments each time IFF_RUNNING toggles. */
1116 static long long int
1117 netdev_linux_get_carrier_resets(const struct netdev *netdev)
1119 return netdev_linux_cast(netdev)->carrier_resets;
/* Issues MII ioctl 'cmd' (named 'cmd_name') on device 'name'.
 *
 * '*data' is both input and output: it is copied into a zeroed ifreq,
 * starting at the storage of the 'ifr_data' member, before the ioctl and
 * copied back from there afterward. */
1123 netdev_linux_do_miimon(const char *name, int cmd, const char *cmd_name,
1124 struct mii_ioctl_data *data)
1129 memset(&ifr, 0, sizeof ifr);
1130 memcpy(&ifr.ifr_data, data, sizeof *data);
1131 error = netdev_linux_do_ioctl(name, &ifr, cmd, cmd_name);
1132 memcpy(data, &ifr.ifr_data, sizeof *data);
/* Determines the link status of device 'name' and stores it in '*miimon'.
 *
 * MII is tried first: SIOCGMIIPHY identifies the PHY, then SIOCGMIIREG reads
 * its BMSR register, whose BMSR_LSTATUS bit gives the link status.  The
 * fallback is ethtool's ETHTOOL_GLINK, whose reply is reinterpreted as a
 * struct ethtool_value with a nonzero 'data' meaning that the link is up. */
1138 netdev_linux_get_miimon(const char *name, bool *miimon)
1140 struct mii_ioctl_data data;
1145 memset(&data, 0, sizeof data);
1146 error = netdev_linux_do_miimon(name, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
1148 /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
1149 data.reg_num = MII_BMSR;
1150 error = netdev_linux_do_miimon(name, SIOCGMIIREG, "SIOCGMIIREG",
1154 *miimon = !!(data.val_out & BMSR_LSTATUS);
1156 VLOG_WARN_RL(&rl, "%s: failed to query MII", name);
1159 struct ethtool_cmd ecmd;
1161 VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
1164 COVERAGE_INC(netdev_get_ethtool);
1165 memset(&ecmd, 0, sizeof ecmd);
1166 error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
1169 struct ethtool_value eval;
/* The reply was written into 'ecmd'; copy out its leading bytes to read it
 * as an ethtool_value. */
1171 memcpy(&eval, &ecmd, sizeof eval);
1172 *miimon = !!eval.data;
1174 VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
/* Sets the miimon polling interval of 'netdev_' to 'interval'.  Positive
 * values are raised to at least 100; zero and negative values become 0, which
 * disables miimon.  When the interval changes, the miimon timer is marked
 * expired so that the next netdev_linux_miimon_run() polls right away. */
1182 netdev_linux_set_miimon_interval(struct netdev *netdev_,
1183 long long int interval)
1185 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1187 interval = interval > 0 ? MAX(interval, 100) : 0;
1188 if (netdev->miimon_interval != interval) {
1189 netdev->miimon_interval = interval;
1190 timer_set_expired(&netdev->miimon_timer);
/* Polls MII link status for every netdev_linux_class device whose miimon is
 * enabled and whose timer has expired.  A device whose status changed is
 * marked as changed with its whole cache invalidated (mask 0).  Each polled
 * device has its timer re-armed with its own interval. */
1197 netdev_linux_miimon_run(void)
1199 struct shash device_shash;
1200 struct shash_node *node;
1202 shash_init(&device_shash);
1203 netdev_get_devices(&netdev_linux_class, &device_shash);
1204 SHASH_FOR_EACH (node, &device_shash) {
1205 struct netdev_linux *dev = node->data;
/* Applies to devices with miimon disabled or not yet due for a poll. */
1208 if (dev->miimon_interval <= 0 || !timer_expired(&dev->miimon_timer)) {
/* The return value of netdev_linux_get_miimon() is not checked here. */
1212 netdev_linux_get_miimon(dev->up.name, &miimon);
1213 if (miimon != dev->miimon) {
1214 dev->miimon = miimon;
1215 netdev_linux_changed(dev, dev->ifi_flags, 0);
1218 timer_set_duration(&dev->miimon_timer, dev->miimon_interval);
1221 shash_destroy(&device_shash);
/* Calls timer_wait() on the miimon timer of every netdev_linux_class device
 * whose 'miimon_interval' is positive. */
1225 netdev_linux_miimon_wait(void)
1227 struct shash device_shash;
1228 struct shash_node *node;
1230 shash_init(&device_shash);
1231 netdev_get_devices(&netdev_linux_class, &device_shash);
1232 SHASH_FOR_EACH (node, &device_shash) {
1233 struct netdev_linux *dev = node->data;
1235 if (dev->miimon_interval > 0) {
1236 timer_wait(&dev->miimon_timer);
1239 shash_destroy(&device_shash);
1242 /* Check whether we can use RTM_GETLINK to get network device statistics.
1243 * In pre-2.6.19 kernels, this was only available if wireless extensions were
1246 check_for_working_netlink_stats(void)
1248 /* Decide on the netdev_get_stats() implementation to use. Netlink is
1249 * preferable, so if that works, we'll use it. */
1250 int ifindex = do_get_ifindex("lo");
1252 VLOG_WARN("failed to get ifindex for lo, "
1253 "obtaining netdev stats from proc");
1256 struct netdev_stats stats;
1257 int error = get_stats_via_netlink(ifindex, &stats);
1259 VLOG_DBG("obtaining netdev stats via rtnetlink");
1262 VLOG_INFO("RTM_GETLINK failed (%s), obtaining netdev stats "
1263 "via proc (you are probably running a pre-2.6.19 "
1264 "kernel)", ovs_strerror(error));
/* Helper taking two uint64_t pointers; netdev_tap_get_stats() calls it on
 * paired rx and tx counters.
 * NOTE(review): presumably exchanges '*a' and '*b' -- confirm. */
1271 swap_uint64(uint64_t *a, uint64_t *b)
1278 /* Copies 'src' into 'dst', performing format conversion in the process.
1280 * 'src' is allowed to be misaligned. */
1282 netdev_stats_from_ovs_vport_stats(struct netdev_stats *dst,
1283 const struct ovs_vport_stats *src)
/* Every counter is read through get_unaligned_u64() because 'src' may be
 * misaligned. */
1285 dst->rx_packets = get_unaligned_u64(&src->rx_packets);
1286 dst->tx_packets = get_unaligned_u64(&src->tx_packets);
1287 dst->rx_bytes = get_unaligned_u64(&src->rx_bytes);
1288 dst->tx_bytes = get_unaligned_u64(&src->tx_bytes);
1289 dst->rx_errors = get_unaligned_u64(&src->rx_errors);
1290 dst->tx_errors = get_unaligned_u64(&src->tx_errors);
1291 dst->rx_dropped = get_unaligned_u64(&src->rx_dropped);
1292 dst->tx_dropped = get_unaligned_u64(&src->tx_dropped);
/* The detail counters below are not taken from 'src'; they are cleared. */
1294 dst->collisions = 0;
1295 dst->rx_length_errors = 0;
1296 dst->rx_over_errors = 0;
1297 dst->rx_crc_errors = 0;
1298 dst->rx_frame_errors = 0;
1299 dst->rx_fifo_errors = 0;
1300 dst->rx_missed_errors = 0;
1301 dst->tx_aborted_errors = 0;
1302 dst->tx_carrier_errors = 0;
1303 dst->tx_fifo_errors = 0;
1304 dst->tx_heartbeat_errors = 0;
1305 dst->tx_window_errors = 0;
/* Looks up the vport named after 'netdev' with dpif_linux_vport_get() and,
 * when the reply carries statistics, converts them into '*stats' with
 * netdev_stats_from_ovs_vport_stats().  A reply whose 'stats' pointer is null
 * is handled by a separate branch. */
1309 get_stats_via_vport__(const struct netdev *netdev, struct netdev_stats *stats)
1311 struct dpif_linux_vport reply;
1315 error = dpif_linux_vport_get(netdev_get_name(netdev), &reply, &buf);
1318 } else if (!reply.stats) {
1323 netdev_stats_from_ovs_vport_stats(stats, reply.stats);
/* Fetches statistics for 'netdev_' from the vport layer into '*stats' and
 * records the outcome in netdev->vport_stats_error.
 *
 * The query runs when no error is recorded or when the recorded error is not
 * marked valid by VALID_VPORT_STAT_ERROR.  Failures other than ENOENT are
 * logged with rate limiting. */
1331 get_stats_via_vport(const struct netdev *netdev_,
1332 struct netdev_stats *stats)
1334 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1336 if (!netdev->vport_stats_error ||
1337 !(netdev->cache_valid & VALID_VPORT_STAT_ERROR)) {
1340 error = get_stats_via_vport__(netdev_, stats);
1341 if (error && error != ENOENT) {
1342 VLOG_WARN_RL(&rl, "%s: obtaining netdev stats via vport failed "
1344 netdev_get_name(netdev_), ovs_strerror(error));
1346 netdev->vport_stats_error = error;
1347 netdev->cache_valid |= VALID_VPORT_STAT_ERROR;
/* Reads the statistics that the kernel keeps for 'netdev_' into '*stats'.
 *
 * The first call runs check_for_working_netlink_stats() and caches the answer
 * in a function-local static.  When netlink statistics are usable, the
 * ifindex of the device is resolved and get_stats_via_netlink() is called;
 * there is also a get_stats_via_proc() path keyed on the device name.  A
 * failure is logged (rate-limited) together with the numeric error. */
1352 netdev_linux_sys_get_stats(const struct netdev *netdev_,
1353 struct netdev_stats *stats)
1355 static int use_netlink_stats = -1;
1358 if (use_netlink_stats < 0) {
1359 use_netlink_stats = check_for_working_netlink_stats();
1362 if (use_netlink_stats) {
1365 error = get_ifindex(netdev_, &ifindex);
1367 error = get_stats_via_netlink(ifindex, stats);
1370 error = get_stats_via_proc(netdev_get_name(netdev_), stats);
1374 VLOG_WARN_RL(&rl, "%s: linux-sys get stats failed %d",
1375 netdev_get_name(netdev_), error);
1381 /* Retrieves current device stats for 'netdev-linux'. */
1383 netdev_linux_get_stats(const struct netdev *netdev_,
1384 struct netdev_stats *stats)
1386 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1387 struct netdev_stats dev_stats;
/* Two sources are consulted: the vport layer fills '*stats' directly and the
 * kernel counters are read into the local 'dev_stats'. */
1390 get_stats_via_vport(netdev_, stats);
1392 error = netdev_linux_sys_get_stats(netdev_, &dev_stats);
1395 if (netdev->vport_stats_error) {
1402 if (netdev->vport_stats_error) {
1403 /* stats not available from OVS then use ioctl stats. */
/* The additions below fold the error and drop counters from 'dev_stats' into
 * the values already present in '*stats'. */
1406 stats->rx_errors += dev_stats.rx_errors;
1407 stats->tx_errors += dev_stats.tx_errors;
1408 stats->rx_dropped += dev_stats.rx_dropped;
1409 stats->tx_dropped += dev_stats.tx_dropped;
1410 stats->multicast += dev_stats.multicast;
1411 stats->collisions += dev_stats.collisions;
1412 stats->rx_length_errors += dev_stats.rx_length_errors;
1413 stats->rx_over_errors += dev_stats.rx_over_errors;
1414 stats->rx_crc_errors += dev_stats.rx_crc_errors;
1415 stats->rx_frame_errors += dev_stats.rx_frame_errors;
1416 stats->rx_fifo_errors += dev_stats.rx_fifo_errors;
1417 stats->rx_missed_errors += dev_stats.rx_missed_errors;
1418 stats->tx_aborted_errors += dev_stats.tx_aborted_errors;
1419 stats->tx_carrier_errors += dev_stats.tx_carrier_errors;
1420 stats->tx_fifo_errors += dev_stats.tx_fifo_errors;
1421 stats->tx_heartbeat_errors += dev_stats.tx_heartbeat_errors;
1422 stats->tx_window_errors += dev_stats.tx_window_errors;
1427 /* Retrieves current device stats for 'netdev-tap' netdev or
1428 * netdev-internal. */
1430 netdev_tap_get_stats(const struct netdev *netdev_,
1431 struct netdev_stats *stats)
1433 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1434 struct netdev_stats dev_stats;
/* As in netdev_linux_get_stats(): the vport layer fills '*stats' and the
 * kernel counters go into 'dev_stats'.  Further down, the kernel rx and tx
 * error/drop counters are added crosswise (tx into rx and rx into tx). */
1437 get_stats_via_vport(netdev_, stats);
1439 error = netdev_linux_sys_get_stats(netdev_, &dev_stats);
1441 if (netdev->vport_stats_error) {
1448 /* If this port is an internal port then the transmit and receive stats
1449 * will appear to be swapped relative to the other ports since we are the
1450 * one sending the data, not a remote computer. For consistency, we swap
1451 * them back here. This does not apply if we are getting stats from the
1452 * vport layer because it always tracks stats from the perspective of the
1454 if (netdev->vport_stats_error) {
1456 swap_uint64(&stats->rx_packets, &stats->tx_packets);
1457 swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
1458 swap_uint64(&stats->rx_errors, &stats->tx_errors);
1459 swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
1460 stats->rx_length_errors = 0;
1461 stats->rx_over_errors = 0;
1462 stats->rx_crc_errors = 0;
1463 stats->rx_frame_errors = 0;
1464 stats->rx_fifo_errors = 0;
1465 stats->rx_missed_errors = 0;
1466 stats->tx_aborted_errors = 0;
1467 stats->tx_carrier_errors = 0;
1468 stats->tx_fifo_errors = 0;
1469 stats->tx_heartbeat_errors = 0;
1470 stats->tx_window_errors = 0;
1472 stats->rx_dropped += dev_stats.tx_dropped;
1473 stats->tx_dropped += dev_stats.rx_dropped;
1475 stats->rx_errors += dev_stats.tx_errors;
1476 stats->tx_errors += dev_stats.rx_errors;
1478 stats->multicast += dev_stats.multicast;
1479 stats->collisions += dev_stats.collisions;
/* Retrieves statistics for an internal device from the vport layer only.
 * Returns the 'vport_stats_error' value recorded by get_stats_via_vport(). */
1485 netdev_internal_get_stats(const struct netdev *netdev_,
1486 struct netdev_stats *stats)
1488 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1490 get_stats_via_vport(netdev_, stats);
1491 return netdev->vport_stats_error;
/* Pushes the packet, byte, error and drop counters from '*stats' to the
 * datapath: they are copied into a struct ovs_vport_stats and sent in an
 * OVS_VPORT_CMD_SET transaction addressed to the vport named after 'netdev'.
 * The remaining members of '*stats' are not transmitted. */
1495 netdev_internal_set_stats(struct netdev *netdev,
1496 const struct netdev_stats *stats)
1498 struct ovs_vport_stats vport_stats;
1499 struct dpif_linux_vport vport;
1502 vport_stats.rx_packets = stats->rx_packets;
1503 vport_stats.tx_packets = stats->tx_packets;
1504 vport_stats.rx_bytes = stats->rx_bytes;
1505 vport_stats.tx_bytes = stats->tx_bytes;
1506 vport_stats.rx_errors = stats->rx_errors;
1507 vport_stats.tx_errors = stats->tx_errors;
1508 vport_stats.rx_dropped = stats->rx_dropped;
1509 vport_stats.tx_dropped = stats->tx_dropped;
1511 dpif_linux_vport_init(&vport);
1512 vport.cmd = OVS_VPORT_CMD_SET;
1513 vport.name = netdev_get_name(netdev);
1514 vport.stats = &vport_stats;
1516 err = dpif_linux_vport_transact(&vport, NULL, NULL);
1518 /* If the vport layer doesn't know about the device, that doesn't mean it
1519 * doesn't exist (after all we were able to open it when netdev_open() was
1520 * called), it just means that it isn't attached and we'll be getting
1521 * stats a different way. */
1522 if (err == ENODEV) {
/* Refreshes the cached feature bitmaps of 'netdev' from an ETHTOOL_GSET
 * query, unless the cache is already flagged VALID_FEATURES.
 *
 * The ethtool 'supported' and 'advertising' masks and the current speed,
 * duplex, and port are translated into NETDEV_F_ bits stored in
 * netdev->supported, netdev->advertised and netdev->current; netdev->peer is
 * left at 0.  At the end the cache is marked valid and the outcome is stored
 * in netdev->get_features_error. */
1530 netdev_linux_read_features(struct netdev_linux *netdev)
1532 struct ethtool_cmd ecmd;
1536 if (netdev->cache_valid & VALID_FEATURES) {
1540 COVERAGE_INC(netdev_get_ethtool);
1541 memset(&ecmd, 0, sizeof ecmd);
1542 error = netdev_linux_do_ethtool(netdev->up.name, &ecmd,
1543 ETHTOOL_GSET, "ETHTOOL_GSET");
1548 /* Supported features. */
1549 netdev->supported = 0;
1550 if (ecmd.supported & SUPPORTED_10baseT_Half) {
1551 netdev->supported |= NETDEV_F_10MB_HD;
1553 if (ecmd.supported & SUPPORTED_10baseT_Full) {
1554 netdev->supported |= NETDEV_F_10MB_FD;
1556 if (ecmd.supported & SUPPORTED_100baseT_Half) {
1557 netdev->supported |= NETDEV_F_100MB_HD;
1559 if (ecmd.supported & SUPPORTED_100baseT_Full) {
1560 netdev->supported |= NETDEV_F_100MB_FD;
1562 if (ecmd.supported & SUPPORTED_1000baseT_Half) {
1563 netdev->supported |= NETDEV_F_1GB_HD;
1565 if (ecmd.supported & SUPPORTED_1000baseT_Full) {
1566 netdev->supported |= NETDEV_F_1GB_FD;
1568 if (ecmd.supported & SUPPORTED_10000baseT_Full) {
1569 netdev->supported |= NETDEV_F_10GB_FD;
1571 if (ecmd.supported & SUPPORTED_TP) {
1572 netdev->supported |= NETDEV_F_COPPER;
1574 if (ecmd.supported & SUPPORTED_FIBRE) {
1575 netdev->supported |= NETDEV_F_FIBER;
1577 if (ecmd.supported & SUPPORTED_Autoneg) {
1578 netdev->supported |= NETDEV_F_AUTONEG;
1580 if (ecmd.supported & SUPPORTED_Pause) {
1581 netdev->supported |= NETDEV_F_PAUSE;
1583 if (ecmd.supported & SUPPORTED_Asym_Pause) {
1584 netdev->supported |= NETDEV_F_PAUSE_ASYM;
1587 /* Advertised features. */
1588 netdev->advertised = 0;
1589 if (ecmd.advertising & ADVERTISED_10baseT_Half) {
1590 netdev->advertised |= NETDEV_F_10MB_HD;
1592 if (ecmd.advertising & ADVERTISED_10baseT_Full) {
1593 netdev->advertised |= NETDEV_F_10MB_FD;
1595 if (ecmd.advertising & ADVERTISED_100baseT_Half) {
1596 netdev->advertised |= NETDEV_F_100MB_HD;
1598 if (ecmd.advertising & ADVERTISED_100baseT_Full) {
1599 netdev->advertised |= NETDEV_F_100MB_FD;
1601 if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
1602 netdev->advertised |= NETDEV_F_1GB_HD;
1604 if (ecmd.advertising & ADVERTISED_1000baseT_Full) {
1605 netdev->advertised |= NETDEV_F_1GB_FD;
1607 if (ecmd.advertising & ADVERTISED_10000baseT_Full) {
1608 netdev->advertised |= NETDEV_F_10GB_FD;
1610 if (ecmd.advertising & ADVERTISED_TP) {
1611 netdev->advertised |= NETDEV_F_COPPER;
1613 if (ecmd.advertising & ADVERTISED_FIBRE) {
1614 netdev->advertised |= NETDEV_F_FIBER;
1616 if (ecmd.advertising & ADVERTISED_Autoneg) {
1617 netdev->advertised |= NETDEV_F_AUTONEG;
1619 if (ecmd.advertising & ADVERTISED_Pause) {
1620 netdev->advertised |= NETDEV_F_PAUSE;
1622 if (ecmd.advertising & ADVERTISED_Asym_Pause) {
1623 netdev->advertised |= NETDEV_F_PAUSE_ASYM;
1626 /* Current settings. */
/* Speeds up to 10G distinguish full from half duplex via ecmd.duplex (except
 * 10G, which is always reported as full duplex).  40G, 100G and 1T are
 * matched against literal numbers rather than SPEED_ macros -- presumably
 * because the kernel headers in use may not define them (TODO confirm).  Any
 * other speed yields no speed bit. */
1628 if (speed == SPEED_10) {
1629 netdev->current = ecmd.duplex ? NETDEV_F_10MB_FD : NETDEV_F_10MB_HD;
1630 } else if (speed == SPEED_100) {
1631 netdev->current = ecmd.duplex ? NETDEV_F_100MB_FD : NETDEV_F_100MB_HD;
1632 } else if (speed == SPEED_1000) {
1633 netdev->current = ecmd.duplex ? NETDEV_F_1GB_FD : NETDEV_F_1GB_HD;
1634 } else if (speed == SPEED_10000) {
1635 netdev->current = NETDEV_F_10GB_FD;
1636 } else if (speed == 40000) {
1637 netdev->current = NETDEV_F_40GB_FD;
1638 } else if (speed == 100000) {
1639 netdev->current = NETDEV_F_100GB_FD;
1640 } else if (speed == 1000000) {
1641 netdev->current = NETDEV_F_1TB_FD;
1643 netdev->current = 0;
1646 if (ecmd.port == PORT_TP) {
1647 netdev->current |= NETDEV_F_COPPER;
1648 } else if (ecmd.port == PORT_FIBRE) {
1649 netdev->current |= NETDEV_F_FIBER;
1653 netdev->current |= NETDEV_F_AUTONEG;
1656 /* Peer advertisements. */
1657 netdev->peer = 0; /* XXX */
1660 netdev->cache_valid |= VALID_FEATURES;
1661 netdev->get_features_error = error;
/* NOTE(review): the assignments in the body write through all four output
 * pointers whenever get_features_error is zero and no null checks are
 * visible, although the description below speaks of pointers that are
 * non-null.  Confirm that every caller passes four valid pointers. */
1664 /* Stores the features supported by 'netdev' into each of '*current',
1665 * '*advertised', '*supported', and '*peer' that are non-null. Each value is a
1666 * bitmap of NETDEV_* bits. Returns 0 if successful, otherwise a positive
1669 netdev_linux_get_features(const struct netdev *netdev_,
1670 enum netdev_features *current,
1671 enum netdev_features *advertised,
1672 enum netdev_features *supported,
1673 enum netdev_features *peer)
1675 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1677 netdev_linux_read_features(netdev);
1679 if (!netdev->get_features_error) {
1680 *current = netdev->current;
1681 *advertised = netdev->advertised;
1682 *supported = netdev->supported;
1683 *peer = netdev->peer;
1685 return netdev->get_features_error;
1688 /* Set the features advertised by 'netdev' to 'advertise'. */
1690 netdev_linux_set_advertisements(struct netdev *netdev,
1691 enum netdev_features advertise)
1693 struct ethtool_cmd ecmd;
/* Read the current settings first so that only the advertising mask is
 * changed when they are written back. */
1696 COVERAGE_INC(netdev_get_ethtool);
1697 memset(&ecmd, 0, sizeof ecmd);
1698 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1699 ETHTOOL_GSET, "ETHTOOL_GSET");
/* Rebuild the advertising mask from scratch out of the NETDEV_F_ bits set in
 * 'advertise'. */
1704 ecmd.advertising = 0;
1705 if (advertise & NETDEV_F_10MB_HD) {
1706 ecmd.advertising |= ADVERTISED_10baseT_Half;
1708 if (advertise & NETDEV_F_10MB_FD) {
1709 ecmd.advertising |= ADVERTISED_10baseT_Full;
1711 if (advertise & NETDEV_F_100MB_HD) {
1712 ecmd.advertising |= ADVERTISED_100baseT_Half;
1714 if (advertise & NETDEV_F_100MB_FD) {
1715 ecmd.advertising |= ADVERTISED_100baseT_Full;
1717 if (advertise & NETDEV_F_1GB_HD) {
1718 ecmd.advertising |= ADVERTISED_1000baseT_Half;
1720 if (advertise & NETDEV_F_1GB_FD) {
1721 ecmd.advertising |= ADVERTISED_1000baseT_Full;
1723 if (advertise & NETDEV_F_10GB_FD) {
1724 ecmd.advertising |= ADVERTISED_10000baseT_Full;
1726 if (advertise & NETDEV_F_COPPER) {
1727 ecmd.advertising |= ADVERTISED_TP;
1729 if (advertise & NETDEV_F_FIBER) {
1730 ecmd.advertising |= ADVERTISED_FIBRE;
1732 if (advertise & NETDEV_F_AUTONEG) {
1733 ecmd.advertising |= ADVERTISED_Autoneg;
1735 if (advertise & NETDEV_F_PAUSE) {
1736 ecmd.advertising |= ADVERTISED_Pause;
1738 if (advertise & NETDEV_F_PAUSE_ASYM) {
1739 ecmd.advertising |= ADVERTISED_Asym_Pause;
/* Write the modified settings back; the ETHTOOL_SSET result is returned. */
1741 COVERAGE_INC(netdev_set_ethtool);
1742 return netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1743 ETHTOOL_SSET, "ETHTOOL_SSET");
1746 /* Attempts to set input rate limiting (policing) policy. Returns 0 if
1747 * successful, otherwise a positive errno value. */
1749 netdev_linux_set_policing(struct netdev *netdev_,
1750 uint32_t kbits_rate, uint32_t kbits_burst)
1752 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1753 const char *netdev_name = netdev_get_name(netdev_);
1757 kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
1758 : !kbits_burst ? 1000 /* Default to 1000 kbits if 0. */
1759 : kbits_burst); /* Stick with user-specified value. */
/* With a valid cache: a cached error is returned as is, and an unchanged
 * rate/burst pair needs no work.  Otherwise the cache is invalidated before
 * the device is reprogrammed. */
1761 if (netdev->cache_valid & VALID_POLICING) {
1762 if (netdev->netdev_policing_error) {
1763 return netdev->netdev_policing_error;
1766 if (netdev->kbits_rate == kbits_rate &&
1767 netdev->kbits_burst == kbits_burst) {
1768 /* Assume that settings haven't changed since we last set them. */
1771 netdev->cache_valid &= ~VALID_POLICING;
1774 COVERAGE_INC(netdev_set_policing);
1775 /* Remove any existing ingress qdisc. */
1776 error = tc_add_del_ingress_qdisc(netdev_, false);
1778 VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
1779 netdev_name, ovs_strerror(error));
1784 error = tc_add_del_ingress_qdisc(netdev_, true);
1786 VLOG_WARN_RL(&rl, "%s: adding policing qdisc failed: %s",
1787 netdev_name, ovs_strerror(error));
1791 error = tc_add_policer(netdev_, kbits_rate, kbits_burst);
1793 VLOG_WARN_RL(&rl, "%s: adding policing action failed: %s",
1794 netdev_name, ovs_strerror(error));
1799 netdev->kbits_rate = kbits_rate;
1800 netdev->kbits_burst = kbits_burst;
/* Only success and ENODEV are remembered as the cached policing result. */
1803 if (!error || error == ENODEV) {
1804 netdev->netdev_policing_error = error;
1805 netdev->cache_valid |= VALID_POLICING;
/* Adds to 'types' the OVS name of every entry in the null-terminated 'tcs'
 * array that provides a tc_install callback and has a non-empty ovs_name.
 * 'netdev' is unused. */
1811 netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
1814 const struct tc_ops *const *opsp;
1816 for (opsp = tcs; *opsp != NULL; opsp++) {
1817 const struct tc_ops *ops = *opsp;
1818 if (ops->tc_install && ops->ovs_name[0] != '\0') {
1819 sset_add(types, ops->ovs_name);
/* Searches the null-terminated 'tcs' array for the tc_ops whose ovs_name
 * equals 'name'. */
1825 static const struct tc_ops *
1826 tc_lookup_ovs_name(const char *name)
1828 const struct tc_ops *const *opsp;
1830 for (opsp = tcs; *opsp != NULL; opsp++) {
1831 const struct tc_ops *ops = *opsp;
1832 if (!strcmp(name, ops->ovs_name)) {
/* Searches the null-terminated 'tcs' array for the tc_ops whose linux_name
 * equals 'name'.  Entries with a null linux_name are never compared. */
1839 static const struct tc_ops *
1840 tc_lookup_linux_name(const char *name)
1842 const struct tc_ops *const *opsp;
1844 for (opsp = tcs; *opsp != NULL; opsp++) {
1845 const struct tc_ops *ops = *opsp;
1846 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
/* Scans the bucket selected by 'hash' in netdev->tc->queues for a queue whose
 * queue_id equals 'queue_id'.  netdev->tc is dereferenced without a null
 * check. */
1853 static struct tc_queue *
1854 tc_find_queue__(const struct netdev *netdev_, unsigned int queue_id,
1857 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1858 struct tc_queue *queue;
1860 HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev->tc->queues) {
1861 if (queue->queue_id == queue_id) {
/* Convenience wrapper around tc_find_queue__() that derives the hash from
 * 'queue_id' with hash_int(queue_id, 0). */
1868 static struct tc_queue *
1869 tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
1871 return tc_find_queue__(netdev, queue_id, hash_int(queue_id, 0));
/* Looks up the tc_ops registered under the OVS name 'type' and copies its
 * n_queues into 'caps'.  'netdev' is unused. */
1875 netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
1877 struct netdev_qos_capabilities *caps)
1879 const struct tc_ops *ops = tc_lookup_ovs_name(type);
1883 caps->n_queues = ops->n_queues;
/* Reports the QoS configuration of 'netdev_': tc_query_qdisc() is called
 * first, then '*typep' receives the OVS name of the current tc_ops and, when
 * those ops provide a qdisc_get callback, it is asked to fill 'details'. */
1888 netdev_linux_get_qos(const struct netdev *netdev_,
1889 const char **typep, struct smap *details)
1891 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1894 error = tc_query_qdisc(netdev_);
1899 *typep = netdev->tc->ops->ovs_name;
1900 return (netdev->tc->ops->qdisc_get
1901 ? netdev->tc->ops->qdisc_get(netdev_, details)
/* Configures QoS of OVS type 'type' with 'details' on 'netdev_'.
 *
 * The type must resolve to a tc_ops that has a tc_install callback.  If it is
 * the same ops as the currently installed one, the qdisc is reconfigured in
 * place through qdisc_set (or 0 is returned when there is no such callback).
 * Otherwise the existing qdisc is deleted and the new one installed. */
1906 netdev_linux_set_qos(struct netdev *netdev_,
1907 const char *type, const struct smap *details)
1909 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1910 const struct tc_ops *new_ops;
1913 new_ops = tc_lookup_ovs_name(type);
1914 if (!new_ops || !new_ops->tc_install) {
1918 error = tc_query_qdisc(netdev_);
1923 if (new_ops == netdev->tc->ops) {
1924 return new_ops->qdisc_set ? new_ops->qdisc_set(netdev_, details) : 0;
1926 /* Delete existing qdisc. */
1927 error = tc_del_qdisc(netdev_);
/* Deleting the qdisc is expected to have cleared netdev->tc. */
1931 ovs_assert(netdev->tc == NULL);
1933 /* Install new qdisc. */
1934 error = new_ops->tc_install(netdev_, details);
/* tc_install must set netdev->tc exactly when it succeeds. */
1935 ovs_assert((error == 0) == (netdev->tc != NULL));
/* Retrieves the configuration of queue 'queue_id' on 'netdev_' into
 * 'details': after tc_query_qdisc(), the queue is looked up with
 * tc_find_queue() and handed to the class_get callback of the current ops. */
1942 netdev_linux_get_queue(const struct netdev *netdev_,
1943 unsigned int queue_id, struct smap *details)
1945 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1948 error = tc_query_qdisc(netdev_);
1952 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
1954 ? netdev->tc->ops->class_get(netdev_, queue, details)
/* Configures queue 'queue_id' on 'netdev_' from 'details' by delegating to
 * the class_set callback of the current ops.  A 'queue_id' that is not below
 * the n_queues of the ops, or ops without class_set, take the error branch
 * instead. */
1960 netdev_linux_set_queue(struct netdev *netdev_,
1961 unsigned int queue_id, const struct smap *details)
1963 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1966 error = tc_query_qdisc(netdev_);
1969 } else if (queue_id >= netdev->tc->ops->n_queues
1970 || !netdev->tc->ops->class_set) {
1974 return netdev->tc->ops->class_set(netdev_, queue_id, details);
/* Deletes queue 'queue_id' from 'netdev_': after tc_query_qdisc(), and
 * provided the current ops have a class_delete callback, the queue is looked
 * up with tc_find_queue() and passed to that callback. */
1978 netdev_linux_delete_queue(struct netdev *netdev_, unsigned int queue_id)
1980 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1983 error = tc_query_qdisc(netdev_);
1986 } else if (!netdev->tc->ops->class_delete) {
1989 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
1991 ? netdev->tc->ops->class_delete(netdev_, queue)
/* Retrieves statistics for queue 'queue_id' on 'netdev_' into '*stats': after
 * tc_query_qdisc(), and provided the current ops have a class_get_stats
 * callback, the queue is looked up with tc_find_queue() and passed to it. */
1997 netdev_linux_get_queue_stats(const struct netdev *netdev_,
1998 unsigned int queue_id,
1999 struct netdev_queue_stats *stats)
2001 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2004 error = tc_query_qdisc(netdev_);
2007 } else if (!netdev->tc->ops->class_get_stats) {
2010 const struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2012 ? netdev->tc->ops->class_get_stats(netdev_, queue, stats)
/* Begins a netlink dump of the traffic classes of 'netdev': builds an
 * RTM_GETTCLASS request with tcm_parent 0, starts the dump on 'rtnl_sock'
 * into 'dump', and releases the request buffer once the dump has started. */
2018 start_queue_dump(const struct netdev *netdev, struct nl_dump *dump)
2020 struct ofpbuf request;
2021 struct tcmsg *tcmsg;
2023 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
2027 tcmsg->tcm_parent = 0;
2028 nl_dump_start(dump, rtnl_sock, &request);
2029 ofpbuf_uninit(&request);
/* Iterates over the queues recorded in netdev->tc->queues and reports them
 * through 'cb': for each queue the class_get callback of the current ops
 * fills a scratch smap, and 'cb' receives the queue id, those details, and
 * 'aux'.  Requires ops that provide class_get. */
2034 netdev_linux_dump_queues(const struct netdev *netdev_,
2035 netdev_dump_queues_cb *cb, void *aux)
2037 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2038 struct tc_queue *queue, *next_queue;
2039 struct smap details;
2043 error = tc_query_qdisc(netdev_);
2046 } else if (!netdev->tc->ops->class_get) {
2051 smap_init(&details);
2052 HMAP_FOR_EACH_SAFE (queue, next_queue, hmap_node,
2053 &netdev->tc->queues) {
/* Reuse one smap for every queue; clear it before each class_get call. */
2054 smap_clear(&details);
2056 error = netdev->tc->ops->class_get(netdev_, queue, &details);
2058 (*cb)(queue->queue_id, &details, aux);
2063 smap_destroy(&details);
/* Dumps per-queue statistics of 'netdev_' through 'cb': a netlink class dump
 * is started with start_queue_dump() and every message it yields is passed,
 * together with 'cb' and 'aux', to the class_dump_stats callback of the
 * current ops.
 *
 * The return value prefers the error from nl_dump_done(); when that is zero,
 * 'last_error' is returned instead. */
2069 netdev_linux_dump_queue_stats(const struct netdev *netdev_,
2070 netdev_dump_queue_stats_cb *cb, void *aux)
2072 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2073 struct nl_dump dump;
2078 error = tc_query_qdisc(netdev_);
2081 } else if (!netdev->tc->ops->class_dump_stats) {
2086 if (!start_queue_dump(netdev_, &dump)) {
2089 while (nl_dump_next(&dump, &msg)) {
2090 error = netdev->tc->ops->class_dump_stats(netdev_, &msg, cb, aux);
2096 error = nl_dump_done(&dump);
2097 return error ? error : last_error;
/* Stores the IPv4 address and netmask of 'netdev_' in '*address' and
 * '*netmask'.
 *
 * The values come from the cache in the netdev_linux; when VALID_IN4 is not
 * set they are first read with SIOCGIFADDR and SIOCGIFNETMASK and the cache
 * is marked valid.  Returns EADDRNOTAVAIL when the address is INADDR_ANY,
 * otherwise 0 on this path. */
2101 netdev_linux_get_in4(const struct netdev *netdev_,
2102 struct in_addr *address, struct in_addr *netmask)
2104 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2106 if (!(netdev->cache_valid & VALID_IN4)) {
2109 error = netdev_linux_get_ipv4(netdev_, &netdev->address,
2110 SIOCGIFADDR, "SIOCGIFADDR");
2115 error = netdev_linux_get_ipv4(netdev_, &netdev->netmask,
2116 SIOCGIFNETMASK, "SIOCGIFNETMASK");
2121 netdev->cache_valid |= VALID_IN4;
2123 *address = netdev->address;
2124 *netmask = netdev->netmask;
2125 return address->s_addr == INADDR_ANY ? EADDRNOTAVAIL : 0;
/* Assigns IPv4 'address' and 'netmask' to 'netdev_'.
 *
 * The address is set with SIOCSIFADDR and the cached address and netmask are
 * updated and marked valid.  The netmask is pushed to the kernel with
 * SIOCSIFNETMASK only when 'address' is not INADDR_ANY. */
2129 netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
2130 struct in_addr netmask)
2132 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2135 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
2137 netdev->cache_valid |= VALID_IN4;
2138 netdev->address = address;
2139 netdev->netmask = netmask;
2140 if (address.s_addr != INADDR_ANY) {
2141 error = do_set_addr(netdev_, SIOCSIFNETMASK,
2142 "SIOCSIFNETMASK", netmask);
/* Parses 'line', one record in the layout that netdev_linux_get_in6() reads
 * from /proc/net/if_inet6.
 *
 * The format consists of sixteen two-digit hexadecimal bytes, which are
 * stored into in6->s6_addr in order, four further hexadecimal fields that are
 * skipped, and an interface name of at most 16 characters for 'ifname'. */
2149 parse_if_inet6_line(const char *line,
2150 struct in6_addr *in6, char ifname[16 + 1])
2152 uint8_t *s6 = in6->s6_addr;
2153 #define X8 "%2"SCNx8
2155 " "X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8
2156 "%*x %*x %*x %*x %16s\n",
2157 &s6[0], &s6[1], &s6[2], &s6[3],
2158 &s6[4], &s6[5], &s6[6], &s6[7],
2159 &s6[8], &s6[9], &s6[10], &s6[11],
2160 &s6[12], &s6[13], &s6[14], &s6[15],
2164 /* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address (if
2165 * 'in6' is non-null) and returns true. Otherwise, returns false. */
2167 netdev_linux_get_in6(const struct netdev *netdev_, struct in6_addr *in6)
2169 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2170 if (!(netdev->cache_valid & VALID_IN6)) {
/* Start from the unspecified address; it is replaced below when a line of
 * /proc/net/if_inet6 names this device. */
2174 netdev->in6 = in6addr_any;
2176 file = fopen("/proc/net/if_inet6", "r");
2178 const char *name = netdev_get_name(netdev_);
2179 while (fgets(line, sizeof line, file)) {
2180 struct in6_addr in6_tmp;
2181 char ifname[16 + 1];
2182 if (parse_if_inet6_line(line, &in6_tmp, ifname)
2183 && !strcmp(name, ifname))
2185 netdev->in6 = in6_tmp;
2191 netdev->cache_valid |= VALID_IN6;
/* Fills '*sa' with an AF_INET socket address for 'addr'.  The address is
 * assembled in a zeroed local sockaddr_in; '*sa' is cleared completely before
 * the sockaddr_in bytes are copied over it. */
2198 make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
2200 struct sockaddr_in sin;
2201 memset(&sin, 0, sizeof sin);
2202 sin.sin_family = AF_INET;
2203 sin.sin_addr = addr;
2206 memset(sa, 0, sizeof *sa);
2207 memcpy(sa, &sin, sizeof sin);
/* Runs the address-setting ioctl 'ioctl_nr' (named 'ioctl_name' for logging)
 * on 'netdev' with 'addr' as the IPv4 address in ifr_addr.  The ioctl itself
 * is issued through netdev_linux_do_ioctl(). */
2211 do_set_addr(struct netdev *netdev,
2212 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
2215 ovs_strzcpy(ifr.ifr_name, netdev_get_name(netdev), sizeof ifr.ifr_name);
2216 make_in4_sockaddr(&ifr.ifr_addr, addr);
2218 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
2222 /* Adds 'router' as a default IP gateway. */
2224 netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
2226 struct in_addr any = { INADDR_ANY };
/* A default route is expressed as destination INADDR_ANY with genmask
 * INADDR_ANY and the RTF_GATEWAY flag; it is added with SIOCADDRT on
 * 'af_inet_sock'. */
2230 memset(&rt, 0, sizeof rt);
2231 make_in4_sockaddr(&rt.rt_dst, any);
2232 make_in4_sockaddr(&rt.rt_gateway, router);
2233 make_in4_sockaddr(&rt.rt_genmask, any);
2234 rt.rt_flags = RTF_UP | RTF_GATEWAY;
2235 error = ioctl(af_inet_sock, SIOCADDRT, &rt) < 0 ? errno : 0;
2237 VLOG_WARN("ioctl(SIOCADDRT): %s", ovs_strerror(error));
/* Looks for a route to 'host' in /proc/net/route.
 *
 * '*netdev_name' is cleared first.  Each line of the file is parsed into 11
 * fields; lines that do not parse are logged, and routes without RTF_UP are
 * ignored.  For a route whose masked destination matches 'host', the gateway
 * (or 0 when the host is directly reachable) goes into '*next_hop' and
 * '*netdev_name' receives a copy of the interface name made with xstrdup(),
 * which the caller presumably has to free (TODO confirm). */
2243 netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
2246 static const char fn[] = "/proc/net/route";
2251 *netdev_name = NULL;
2252 stream = fopen(fn, "r");
2253 if (stream == NULL) {
2254 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, ovs_strerror(errno));
2259 while (fgets(line, sizeof line, stream)) {
2262 ovs_be32 dest, gateway, mask;
2263 int refcnt, metric, mtu;
2264 unsigned int flags, use, window, irtt;
2267 "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
2269 iface, &dest, &gateway, &flags, &refcnt,
2270 &use, &metric, &mask, &mtu, &window, &irtt) != 11) {
2272 VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
2276 if (!(flags & RTF_UP)) {
2277 /* Skip routes that aren't up. */
2281 /* The output of 'dest', 'mask', and 'gateway' were given in
2282 * network byte order, so we don't need any endian
2283 * conversions here. */
2284 if ((dest & mask) == (host->s_addr & mask)) {
2286 /* The host is directly reachable. */
2287 next_hop->s_addr = 0;
2289 /* To reach the host, we must go through a gateway. */
2290 next_hop->s_addr = gateway;
2292 *netdev_name = xstrdup(iface);
/* Adds driver information about 'netdev_' to 'smap' under the keys
 * "driver_name", "driver_version", and "firmware_version".
 *
 * The data comes from netdev->drvinfo, which is filled by an
 * ETHTOOL_GDRVINFO query the first time and reused while VALID_DRVINFO is
 * set. */
2304 netdev_linux_get_status(const struct netdev *netdev_, struct smap *smap)
2306 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2309 if (!(netdev->cache_valid & VALID_DRVINFO)) {
/* netdev_linux_do_ethtool() takes a struct ethtool_cmd pointer, so the
 * drvinfo buffer is passed through a cast. */
2310 struct ethtool_cmd *cmd = (struct ethtool_cmd *) &netdev->drvinfo;
2312 COVERAGE_INC(netdev_get_ethtool);
2313 memset(&netdev->drvinfo, 0, sizeof netdev->drvinfo);
2314 error = netdev_linux_do_ethtool(netdev->up.name,
2317 "ETHTOOL_GDRVINFO");
2319 netdev->cache_valid |= VALID_DRVINFO;
2324 smap_add(smap, "driver_name", netdev->drvinfo.driver);
2325 smap_add(smap, "driver_version", netdev->drvinfo.version);
2326 smap_add(smap, "firmware_version", netdev->drvinfo.fw_version);
/* Status callback for internal devices: reports the fixed driver name
 * "openvswitch" in 'smap'.  'netdev' is unused. */
2332 netdev_internal_get_status(const struct netdev *netdev OVS_UNUSED,
2335 smap_add(smap, "driver_name", "openvswitch");
2339 /* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
2340 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
2341 * returns 0. Otherwise, it returns a positive errno value; in particular,
2342 * ENXIO indicates that there is no ARP table entry for 'ip' on 'netdev'. */
2344 netdev_linux_arp_lookup(const struct netdev *netdev,
2345 ovs_be32 ip, uint8_t mac[ETH_ADDR_LEN])
2348 struct sockaddr_in sin;
/* Build the SIOCGARP request: protocol address 'ip', hardware type
 * ARPHRD_ETHER, restricted to the device name of 'netdev'. */
2351 memset(&r, 0, sizeof r);
2352 memset(&sin, 0, sizeof sin);
2353 sin.sin_family = AF_INET;
2354 sin.sin_addr.s_addr = ip;
2356 memcpy(&r.arp_pa, &sin, sizeof sin);
2357 r.arp_ha.sa_family = ARPHRD_ETHER;
2359 ovs_strzcpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
2360 COVERAGE_INC(netdev_arp_lookup);
2361 retval = ioctl(af_inet_sock, SIOCGARP, &r) < 0 ? errno : 0;
2363 memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
/* ENXIO only means that no entry exists, so it is not logged. */
2364 } else if (retval != ENXIO) {
2365 VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
2366 netdev_get_name(netdev), IP_ARGS(ip),
2367 ovs_strerror(retval));
/* Derives interface flags from the netdev_flags in 'nd', testing NETDEV_UP
 * and NETDEV_PROMISC individually.  netdev_linux_update_flags() uses the
 * result as a mask on ifi_flags. */
2373 nd_to_iff_flags(enum netdev_flags nd)
2376 if (nd & NETDEV_UP) {
2379 if (nd & NETDEV_PROMISC) {
/* Converts the interface flags in 'iff' to netdev_flags, starting from 0;
 * IFF_PROMISC maps to NETDEV_PROMISC. */
2386 iff_to_nd_flags(int iff)
2388 enum netdev_flags nd = 0;
2392 if (iff & IFF_PROMISC) {
2393 nd |= NETDEV_PROMISC;
/* Clears the flags in 'off' and sets the flags in 'on' on 'netdev_'.
 *
 * '*old_flagsp' receives the previous flags, taken from the cached
 * ifi_flags.  The kernel is touched only when the resulting flag word differs
 * from the cached one; in that case the cached ifi_flags is re-read with
 * get_flags() after the set_flags() call. */
2399 netdev_linux_update_flags(struct netdev *netdev_, enum netdev_flags off,
2400 enum netdev_flags on, enum netdev_flags *old_flagsp)
2402 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2403 int old_flags, new_flags;
2406 old_flags = netdev->ifi_flags;
2407 *old_flagsp = iff_to_nd_flags(old_flags);
2408 new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
2409 if (new_flags != old_flags) {
2410 error = set_flags(netdev_get_name(netdev_), new_flags);
2411 get_flags(netdev_, &netdev->ifi_flags);
/* Returns the 'change_seq' value stored in the netdev_linux that backs
 * 'netdev'. */
2417 netdev_linux_change_seq(const struct netdev *netdev)
2419 return netdev_linux_cast(netdev)->change_seq;
/* Expands to the initializer shared by the netdev classes defined after this
 * macro.  The callbacks listed in the body are common to all of them; the
 * macro parameters (NAME, CREATE, GET_STATS, SET_STATS, GET_FEATURES,
 * GET_STATUS) are the per-class differences. */
2422 #define NETDEV_LINUX_CLASS(NAME, CREATE, GET_STATS, SET_STATS, \
2423 GET_FEATURES, GET_STATUS) \
2427 netdev_linux_init, \
2429 netdev_linux_wait, \
2432 netdev_linux_destroy, \
2433 NULL, /* get_config */ \
2434 NULL, /* set_config */ \
2435 NULL, /* get_tunnel_config */ \
2437 netdev_linux_rx_open, \
2439 netdev_linux_send, \
2440 netdev_linux_send_wait, \
2442 netdev_linux_set_etheraddr, \
2443 netdev_linux_get_etheraddr, \
2444 netdev_linux_get_mtu, \
2445 netdev_linux_set_mtu, \
2446 netdev_linux_get_ifindex, \
2447 netdev_linux_get_carrier, \
2448 netdev_linux_get_carrier_resets, \
2449 netdev_linux_set_miimon_interval, \
2454 netdev_linux_set_advertisements, \
2456 netdev_linux_set_policing, \
2457 netdev_linux_get_qos_types, \
2458 netdev_linux_get_qos_capabilities, \
2459 netdev_linux_get_qos, \
2460 netdev_linux_set_qos, \
2461 netdev_linux_get_queue, \
2462 netdev_linux_set_queue, \
2463 netdev_linux_delete_queue, \
2464 netdev_linux_get_queue_stats, \
2465 netdev_linux_dump_queues, \
2466 netdev_linux_dump_queue_stats, \
2468 netdev_linux_get_in4, \
2469 netdev_linux_set_in4, \
2470 netdev_linux_get_in6, \
2471 netdev_linux_add_router, \
2472 netdev_linux_get_next_hop, \
2474 netdev_linux_arp_lookup, \
2476 netdev_linux_update_flags, \
2478 netdev_linux_change_seq \
/* netdev class created by netdev_linux_create(): statistics come from
 * netdev_linux_get_stats(), there is no set_stats, and features and status
 * are read through the ethtool-backed netdev_linux_ callbacks. */
2481 const struct netdev_class netdev_linux_class =
2484 netdev_linux_create,
2485 netdev_linux_get_stats,
2486 NULL, /* set_stats */
2487 netdev_linux_get_features,
2488 netdev_linux_get_status);
/* netdev class created by netdev_linux_create_tap(): like netdev_linux_class
 * except that statistics come from netdev_tap_get_stats(). */
2490 const struct netdev_class netdev_tap_class =
2493 netdev_linux_create_tap,
2494 netdev_tap_get_stats,
2495 NULL, /* set_stats */
2496 netdev_linux_get_features,
2497 netdev_linux_get_status);
/* netdev class for internal devices: statistics are read and written through
 * the vport-based netdev_internal_ callbacks, there is no get_features, and
 * the status callback is netdev_internal_get_status(). */
2499 const struct netdev_class netdev_internal_class =
2502 netdev_linux_create,
2503 netdev_internal_get_stats,
2504 netdev_internal_set_stats,
2505 NULL, /* get_features */
2506 netdev_internal_get_status);
/* Receive-side callback table for this provider: destroy, recv, wait, and
 * drain, in that order. */
2508 static const struct netdev_rx_class netdev_rx_linux_class = {
2509 netdev_rx_linux_destroy,
2510 netdev_rx_linux_recv,
2511 netdev_rx_linux_wait,
2512 netdev_rx_linux_drain,
2515 /* HTB traffic control class. */
2517 #define HTB_N_QUEUES 0xf000
/* Qdisc-wide setting; htb_install__() stores its 'max_rate' argument here. */
2521 unsigned int max_rate; /* In bytes/s. */
/* Per-class settings; htb_setup_class__() turns them into a tc_htb_opt.  The
 * generic tc_queue is embedded as a member. */
2525 struct tc_queue tc_queue;
2526 unsigned int min_rate; /* In bytes/s. */
2527 unsigned int max_rate; /* In bytes/s. */
2528 unsigned int burst; /* In bytes. */
2529 unsigned int priority; /* Lower values are higher priorities. */
/* Returns the struct htb that embeds the 'tc' currently attached to
 * 'netdev_', recovered from netdev->tc with CONTAINER_OF.  No check is made
 * here that netdev->tc really belongs to a struct htb. */
2533 htb_get__(const struct netdev *netdev_)
2535 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2536 return CONTAINER_OF(netdev->tc, struct htb, tc);
/* Allocates a struct htb, initializes its embedded tc for 'tc_ops_htb',
 * records 'max_rate', and attaches it to 'netdev_' as netdev->tc.  Only the
 * in-memory state is set up here; no netlink request is sent. */
2540 htb_install__(struct netdev *netdev_, uint64_t max_rate)
2542 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2545 htb = xmalloc(sizeof *htb);
2546 tc_init(&htb->tc, &tc_ops_htb);
/* 'max_rate' is a uint64_t assigned to an unsigned int member, so larger
 * values are truncated by the conversion. */
2547 htb->max_rate = max_rate;
2549 netdev->tc = &htb->tc;
2552 /* Create an HTB qdisc.
2554 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
2556 htb_setup_qdisc__(struct netdev *netdev)
2559 struct tc_htb_glob opt;
2560 struct ofpbuf request;
2561 struct tcmsg *tcmsg;
/* Remove whatever qdisc is installed; the result of the deletion is not
 * checked. */
2563 tc_del_qdisc(netdev);
2565 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2566 NLM_F_EXCL | NLM_F_CREATE, &request);
/* Root qdisc with handle 1:0. */
2570 tcmsg->tcm_handle = tc_make_handle(1, 0);
2571 tcmsg->tcm_parent = TC_H_ROOT;
2573 nl_msg_put_string(&request, TCA_KIND, "htb");
2575 memset(&opt, 0, sizeof opt);
2576 opt.rate2quantum = 10;
/* The global HTB options travel as TCA_HTB_INIT nested inside TCA_OPTIONS. */
2580 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2581 nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
2582 nl_msg_end_nested(&request, opt_offset);
2584 return tc_transact(&request, NULL);
2587 /* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
2588 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
2590 htb_setup_class__(struct netdev *netdev, unsigned int handle,
2591 unsigned int parent, struct htb_class *class)
2594 struct tc_htb_opt opt;
2595 struct ofpbuf request;
2596 struct tcmsg *tcmsg;
2600 error = netdev_get_mtu(netdev, &mtu);
2602 VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU",
2603 netdev_get_name(netdev));
2607 memset(&opt, 0, sizeof opt);
2608 tc_fill_rate(&opt.rate, class->min_rate, mtu);
2609 tc_fill_rate(&opt.ceil, class->max_rate, mtu);
2610 opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
2611 opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
2612 opt.prio = class->priority;
2614 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
2618 tcmsg->tcm_handle = handle;
2619 tcmsg->tcm_parent = parent;
2621 nl_msg_put_string(&request, TCA_KIND, "htb");
2622 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2623 nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
2624 tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
2625 tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
2626 nl_msg_end_nested(&request, opt_offset);
2628 error = tc_transact(&request, NULL);
2630 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
2631 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
2632 netdev_get_name(netdev),
2633 tc_get_major(handle), tc_get_minor(handle),
2634 tc_get_major(parent), tc_get_minor(parent),
2635 class->min_rate, class->max_rate,
2636 class->burst, class->priority, ovs_strerror(error));
2641 /* Parses Netlink attributes in 'options' for HTB parameters and stores a
2642 * description of them into 'details'. The description complies with the
2643 * specification given in the vswitch database documentation for linux-htb
2646 htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
2648 static const struct nl_policy tca_htb_policy[] = {
2649 [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
2650 .min_len = sizeof(struct tc_htb_opt) },
2653 struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
2654 const struct tc_htb_opt *htb;
2656 if (!nl_parse_nested(nl_options, tca_htb_policy,
2657 attrs, ARRAY_SIZE(tca_htb_policy))) {
2658 VLOG_WARN_RL(&rl, "failed to parse HTB class options");
2662 htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
2663 class->min_rate = htb->rate.rate;
2664 class->max_rate = htb->ceil.rate;
2665 class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
2666 class->priority = htb->prio;
2671 htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2672 struct htb_class *options,
2673 struct netdev_queue_stats *stats)
2675 struct nlattr *nl_options;
2676 unsigned int handle;
2679 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2680 if (!error && queue_id) {
2681 unsigned int major = tc_get_major(handle);
2682 unsigned int minor = tc_get_minor(handle);
2683 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2684 *queue_id = minor - 1;
2689 if (!error && options) {
2690 error = htb_parse_tca_options__(nl_options, options);
2696 htb_parse_qdisc_details__(struct netdev *netdev,
2697 const struct smap *details, struct htb_class *hc)
2699 const char *max_rate_s;
2701 max_rate_s = smap_get(details, "max-rate");
2702 hc->max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
2703 if (!hc->max_rate) {
2704 enum netdev_features current;
2706 netdev_get_features(netdev, ¤t, NULL, NULL, NULL);
2707 hc->max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
2709 hc->min_rate = hc->max_rate;
2715 htb_parse_class_details__(struct netdev *netdev,
2716 const struct smap *details, struct htb_class *hc)
2718 const struct htb *htb = htb_get__(netdev);
2719 const char *min_rate_s = smap_get(details, "min-rate");
2720 const char *max_rate_s = smap_get(details, "max-rate");
2721 const char *burst_s = smap_get(details, "burst");
2722 const char *priority_s = smap_get(details, "priority");
2725 error = netdev_get_mtu(netdev, &mtu);
2727 VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU",
2728 netdev_get_name(netdev));
2732 /* HTB requires at least an mtu sized min-rate to send any traffic even
2733 * on uncongested links. */
2734 hc->min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
2735 hc->min_rate = MAX(hc->min_rate, mtu);
2736 hc->min_rate = MIN(hc->min_rate, htb->max_rate);
2739 hc->max_rate = (max_rate_s
2740 ? strtoull(max_rate_s, NULL, 10) / 8
2742 hc->max_rate = MAX(hc->max_rate, hc->min_rate);
2743 hc->max_rate = MIN(hc->max_rate, htb->max_rate);
2747 * According to hints in the documentation that I've read, it is important
2748 * that 'burst' be at least as big as the largest frame that might be
2749 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
2750 * but having it a bit too small is a problem. Since netdev_get_mtu()
2751 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
2752 * the MTU. We actually add 64, instead of 14, as a guard against
2753 * additional headers get tacked on somewhere that we're not aware of. */
2754 hc->burst = burst_s ? strtoull(burst_s, NULL, 10) / 8 : 0;
2755 hc->burst = MAX(hc->burst, mtu + 64);
2758 hc->priority = priority_s ? strtoul(priority_s, NULL, 10) : 0;
2764 htb_query_class__(const struct netdev *netdev, unsigned int handle,
2765 unsigned int parent, struct htb_class *options,
2766 struct netdev_queue_stats *stats)
2768 struct ofpbuf *reply;
2771 error = tc_query_class(netdev, handle, parent, &reply);
2773 error = htb_parse_tcmsg__(reply, NULL, options, stats);
2774 ofpbuf_delete(reply);
2780 htb_tc_install(struct netdev *netdev, const struct smap *details)
2784 error = htb_setup_qdisc__(netdev);
2786 struct htb_class hc;
2788 htb_parse_qdisc_details__(netdev, details, &hc);
2789 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2790 tc_make_handle(1, 0), &hc);
2792 htb_install__(netdev, hc.max_rate);
2798 static struct htb_class *
2799 htb_class_cast__(const struct tc_queue *queue)
2801 return CONTAINER_OF(queue, struct htb_class, tc_queue);
2805 htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
2806 const struct htb_class *hc)
2808 struct htb *htb = htb_get__(netdev);
2809 size_t hash = hash_int(queue_id, 0);
2810 struct tc_queue *queue;
2811 struct htb_class *hcp;
2813 queue = tc_find_queue__(netdev, queue_id, hash);
2815 hcp = htb_class_cast__(queue);
2817 hcp = xmalloc(sizeof *hcp);
2818 queue = &hcp->tc_queue;
2819 queue->queue_id = queue_id;
2820 hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
2823 hcp->min_rate = hc->min_rate;
2824 hcp->max_rate = hc->max_rate;
2825 hcp->burst = hc->burst;
2826 hcp->priority = hc->priority;
2830 htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
2833 struct nl_dump dump;
2834 struct htb_class hc;
2836 /* Get qdisc options. */
2838 htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
2839 htb_install__(netdev, hc.max_rate);
2842 if (!start_queue_dump(netdev, &dump)) {
2845 while (nl_dump_next(&dump, &msg)) {
2846 unsigned int queue_id;
2848 if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
2849 htb_update_queue__(netdev, queue_id, &hc);
2852 nl_dump_done(&dump);
2858 htb_tc_destroy(struct tc *tc)
2860 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
2861 struct htb_class *hc, *next;
2863 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &htb->tc.queues) {
2864 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
2872 htb_qdisc_get(const struct netdev *netdev, struct smap *details)
2874 const struct htb *htb = htb_get__(netdev);
2875 smap_add_format(details, "max-rate", "%llu", 8ULL * htb->max_rate);
2880 htb_qdisc_set(struct netdev *netdev, const struct smap *details)
2882 struct htb_class hc;
2885 htb_parse_qdisc_details__(netdev, details, &hc);
2886 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2887 tc_make_handle(1, 0), &hc);
2889 htb_get__(netdev)->max_rate = hc.max_rate;
2895 htb_class_get(const struct netdev *netdev OVS_UNUSED,
2896 const struct tc_queue *queue, struct smap *details)
2898 const struct htb_class *hc = htb_class_cast__(queue);
2900 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
2901 if (hc->min_rate != hc->max_rate) {
2902 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
2904 smap_add_format(details, "burst", "%llu", 8ULL * hc->burst);
2906 smap_add_format(details, "priority", "%u", hc->priority);
2912 htb_class_set(struct netdev *netdev, unsigned int queue_id,
2913 const struct smap *details)
2915 struct htb_class hc;
2918 error = htb_parse_class_details__(netdev, details, &hc);
2923 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
2924 tc_make_handle(1, 0xfffe), &hc);
2929 htb_update_queue__(netdev, queue_id, &hc);
2934 htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
2936 struct htb_class *hc = htb_class_cast__(queue);
2937 struct htb *htb = htb_get__(netdev);
2940 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
2942 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
2949 htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
2950 struct netdev_queue_stats *stats)
2952 return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
2953 tc_make_handle(1, 0xfffe), NULL, stats);
2957 htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
2958 const struct ofpbuf *nlmsg,
2959 netdev_dump_queue_stats_cb *cb, void *aux)
2961 struct netdev_queue_stats stats;
2962 unsigned int handle, major, minor;
2965 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
2970 major = tc_get_major(handle);
2971 minor = tc_get_minor(handle);
2972 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2973 (*cb)(minor - 1, &stats, aux);
2978 static const struct tc_ops tc_ops_htb = {
2979 "htb", /* linux_name */
2980 "linux-htb", /* ovs_name */
2981 HTB_N_QUEUES, /* n_queues */
2990 htb_class_get_stats,
2991 htb_class_dump_stats
2994 /* "linux-hfsc" traffic control class. */
2996 #define HFSC_N_QUEUES 0xf000
3004 struct tc_queue tc_queue;
3009 static struct hfsc *
3010 hfsc_get__(const struct netdev *netdev_)
3012 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3013 return CONTAINER_OF(netdev->tc, struct hfsc, tc);
3016 static struct hfsc_class *
3017 hfsc_class_cast__(const struct tc_queue *queue)
3019 return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
3023 hfsc_install__(struct netdev *netdev_, uint32_t max_rate)
3025 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3028 hfsc = xmalloc(sizeof *hfsc);
3029 tc_init(&hfsc->tc, &tc_ops_hfsc);
3030 hfsc->max_rate = max_rate;
3031 netdev->tc = &hfsc->tc;
3035 hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
3036 const struct hfsc_class *hc)
3040 struct hfsc_class *hcp;
3041 struct tc_queue *queue;
3043 hfsc = hfsc_get__(netdev);
3044 hash = hash_int(queue_id, 0);
3046 queue = tc_find_queue__(netdev, queue_id, hash);
3048 hcp = hfsc_class_cast__(queue);
3050 hcp = xmalloc(sizeof *hcp);
3051 queue = &hcp->tc_queue;
3052 queue->queue_id = queue_id;
3053 hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
3056 hcp->min_rate = hc->min_rate;
3057 hcp->max_rate = hc->max_rate;
3061 hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
3063 const struct tc_service_curve *rsc, *fsc, *usc;
3064 static const struct nl_policy tca_hfsc_policy[] = {
3066 .type = NL_A_UNSPEC,
3068 .min_len = sizeof(struct tc_service_curve),
3071 .type = NL_A_UNSPEC,
3073 .min_len = sizeof(struct tc_service_curve),
3076 .type = NL_A_UNSPEC,
3078 .min_len = sizeof(struct tc_service_curve),
3081 struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];
3083 if (!nl_parse_nested(nl_options, tca_hfsc_policy,
3084 attrs, ARRAY_SIZE(tca_hfsc_policy))) {
3085 VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
3089 rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
3090 fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
3091 usc = nl_attr_get(attrs[TCA_HFSC_USC]);
3093 if (rsc->m1 != 0 || rsc->d != 0 ||
3094 fsc->m1 != 0 || fsc->d != 0 ||
3095 usc->m1 != 0 || usc->d != 0) {
3096 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3097 "Non-linear service curves are not supported.");
3101 if (rsc->m2 != fsc->m2) {
3102 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3103 "Real-time service curves are not supported ");
3107 if (rsc->m2 > usc->m2) {
3108 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3109 "Min-rate service curve is greater than "
3110 "the max-rate service curve.");
3114 class->min_rate = fsc->m2;
3115 class->max_rate = usc->m2;
3120 hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
3121 struct hfsc_class *options,
3122 struct netdev_queue_stats *stats)
3125 unsigned int handle;
3126 struct nlattr *nl_options;
3128 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
3134 unsigned int major, minor;
3136 major = tc_get_major(handle);
3137 minor = tc_get_minor(handle);
3138 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3139 *queue_id = minor - 1;
3146 error = hfsc_parse_tca_options__(nl_options, options);
3153 hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
3154 unsigned int parent, struct hfsc_class *options,
3155 struct netdev_queue_stats *stats)
3158 struct ofpbuf *reply;
3160 error = tc_query_class(netdev, handle, parent, &reply);
3165 error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
3166 ofpbuf_delete(reply);
3171 hfsc_parse_qdisc_details__(struct netdev *netdev, const struct smap *details,
3172 struct hfsc_class *class)
3175 const char *max_rate_s;
3177 max_rate_s = smap_get(details, "max-rate");
3178 max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
3181 enum netdev_features current;
3183 netdev_get_features(netdev, ¤t, NULL, NULL, NULL);
3184 max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
3187 class->min_rate = max_rate;
3188 class->max_rate = max_rate;
3192 hfsc_parse_class_details__(struct netdev *netdev,
3193 const struct smap *details,
3194 struct hfsc_class * class)
3196 const struct hfsc *hfsc;
3197 uint32_t min_rate, max_rate;
3198 const char *min_rate_s, *max_rate_s;
3200 hfsc = hfsc_get__(netdev);
3201 min_rate_s = smap_get(details, "min-rate");
3202 max_rate_s = smap_get(details, "max-rate");
3204 min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
3205 min_rate = MAX(min_rate, 1);
3206 min_rate = MIN(min_rate, hfsc->max_rate);
3208 max_rate = (max_rate_s
3209 ? strtoull(max_rate_s, NULL, 10) / 8
3211 max_rate = MAX(max_rate, min_rate);
3212 max_rate = MIN(max_rate, hfsc->max_rate);
3214 class->min_rate = min_rate;
3215 class->max_rate = max_rate;
3220 /* Create an HFSC qdisc.
3222 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
3224 hfsc_setup_qdisc__(struct netdev * netdev)
3226 struct tcmsg *tcmsg;
3227 struct ofpbuf request;
3228 struct tc_hfsc_qopt opt;
3230 tc_del_qdisc(netdev);
3232 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3233 NLM_F_EXCL | NLM_F_CREATE, &request);
3239 tcmsg->tcm_handle = tc_make_handle(1, 0);
3240 tcmsg->tcm_parent = TC_H_ROOT;
3242 memset(&opt, 0, sizeof opt);
3245 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3246 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
3248 return tc_transact(&request, NULL);
3251 /* Create an HFSC class.
3253 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
3254 * sc rate <min_rate> ul rate <max_rate>" */
3256 hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
3257 unsigned int parent, struct hfsc_class *class)
3261 struct tcmsg *tcmsg;
3262 struct ofpbuf request;
3263 struct tc_service_curve min, max;
3265 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
3271 tcmsg->tcm_handle = handle;
3272 tcmsg->tcm_parent = parent;
3276 min.m2 = class->min_rate;
3280 max.m2 = class->max_rate;
3282 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3283 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3284 nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
3285 nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
3286 nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
3287 nl_msg_end_nested(&request, opt_offset);
3289 error = tc_transact(&request, NULL);
3291 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
3292 "min-rate %ubps, max-rate %ubps (%s)",
3293 netdev_get_name(netdev),
3294 tc_get_major(handle), tc_get_minor(handle),
3295 tc_get_major(parent), tc_get_minor(parent),
3296 class->min_rate, class->max_rate, ovs_strerror(error));
3303 hfsc_tc_install(struct netdev *netdev, const struct smap *details)
3306 struct hfsc_class class;
3308 error = hfsc_setup_qdisc__(netdev);
3314 hfsc_parse_qdisc_details__(netdev, details, &class);
3315 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3316 tc_make_handle(1, 0), &class);
3322 hfsc_install__(netdev, class.max_rate);
3327 hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3330 struct nl_dump dump;
3331 struct hfsc_class hc;
3334 hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
3335 hfsc_install__(netdev, hc.max_rate);
3337 if (!start_queue_dump(netdev, &dump)) {
3341 while (nl_dump_next(&dump, &msg)) {
3342 unsigned int queue_id;
3344 if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3345 hfsc_update_queue__(netdev, queue_id, &hc);
3349 nl_dump_done(&dump);
3354 hfsc_tc_destroy(struct tc *tc)
3357 struct hfsc_class *hc, *next;
3359 hfsc = CONTAINER_OF(tc, struct hfsc, tc);
3361 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
3362 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
3371 hfsc_qdisc_get(const struct netdev *netdev, struct smap *details)
3373 const struct hfsc *hfsc;
3374 hfsc = hfsc_get__(netdev);
3375 smap_add_format(details, "max-rate", "%llu", 8ULL * hfsc->max_rate);
3380 hfsc_qdisc_set(struct netdev *netdev, const struct smap *details)
3383 struct hfsc_class class;
3385 hfsc_parse_qdisc_details__(netdev, details, &class);
3386 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3387 tc_make_handle(1, 0), &class);
3390 hfsc_get__(netdev)->max_rate = class.max_rate;
3397 hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
3398 const struct tc_queue *queue, struct smap *details)
3400 const struct hfsc_class *hc;
3402 hc = hfsc_class_cast__(queue);
3403 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
3404 if (hc->min_rate != hc->max_rate) {
3405 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
3411 hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
3412 const struct smap *details)
3415 struct hfsc_class class;
3417 error = hfsc_parse_class_details__(netdev, details, &class);
3422 error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3423 tc_make_handle(1, 0xfffe), &class);
3428 hfsc_update_queue__(netdev, queue_id, &class);
3433 hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
3437 struct hfsc_class *hc;
3439 hc = hfsc_class_cast__(queue);
3440 hfsc = hfsc_get__(netdev);
3442 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3444 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
3451 hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3452 struct netdev_queue_stats *stats)
3454 return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3455 tc_make_handle(1, 0xfffe), NULL, stats);
3459 hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3460 const struct ofpbuf *nlmsg,
3461 netdev_dump_queue_stats_cb *cb, void *aux)
3463 struct netdev_queue_stats stats;
3464 unsigned int handle, major, minor;
3467 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3472 major = tc_get_major(handle);
3473 minor = tc_get_minor(handle);
3474 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3475 (*cb)(minor - 1, &stats, aux);
3480 static const struct tc_ops tc_ops_hfsc = {
3481 "hfsc", /* linux_name */
3482 "linux-hfsc", /* ovs_name */
3483 HFSC_N_QUEUES, /* n_queues */
3484 hfsc_tc_install, /* tc_install */
3485 hfsc_tc_load, /* tc_load */
3486 hfsc_tc_destroy, /* tc_destroy */
3487 hfsc_qdisc_get, /* qdisc_get */
3488 hfsc_qdisc_set, /* qdisc_set */
3489 hfsc_class_get, /* class_get */
3490 hfsc_class_set, /* class_set */
3491 hfsc_class_delete, /* class_delete */
3492 hfsc_class_get_stats, /* class_get_stats */
3493 hfsc_class_dump_stats /* class_dump_stats */
3496 /* "linux-default" traffic control class.
3498 * This class represents the default, unnamed Linux qdisc. It corresponds to
3499 * the "" (empty string) QoS type in the OVS database. */
3502 default_install__(struct netdev *netdev_)
3504 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3505 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_default);
3507 /* Nothing but a tc class implementation is allowed to write to a tc. This
3508 * class never does that, so we can legitimately use a const tc object. */
3509 netdev->tc = CONST_CAST(struct tc *, &tc);
3513 default_tc_install(struct netdev *netdev,
3514 const struct smap *details OVS_UNUSED)
3516 default_install__(netdev);
3521 default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3523 default_install__(netdev);
3527 static const struct tc_ops tc_ops_default = {
3528 NULL, /* linux_name */
3533 NULL, /* tc_destroy */
3534 NULL, /* qdisc_get */
3535 NULL, /* qdisc_set */
3536 NULL, /* class_get */
3537 NULL, /* class_set */
3538 NULL, /* class_delete */
3539 NULL, /* class_get_stats */
3540 NULL /* class_dump_stats */
3543 /* "linux-other" traffic control class.
3548 other_tc_load(struct netdev *netdev_, struct ofpbuf *nlmsg OVS_UNUSED)
3550 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3551 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_other);
3553 /* Nothing but a tc class implementation is allowed to write to a tc. This
3554 * class never does that, so we can legitimately use a const tc object. */
3555 netdev->tc = CONST_CAST(struct tc *, &tc);
3559 static const struct tc_ops tc_ops_other = {
3560 NULL, /* linux_name */
3561 "linux-other", /* ovs_name */
3563 NULL, /* tc_install */
3565 NULL, /* tc_destroy */
3566 NULL, /* qdisc_get */
3567 NULL, /* qdisc_set */
3568 NULL, /* class_get */
3569 NULL, /* class_set */
3570 NULL, /* class_delete */
3571 NULL, /* class_get_stats */
3572 NULL /* class_dump_stats */
3575 /* Traffic control. */
3577 /* Number of kernel "tc" ticks per second. */
3578 static double ticks_per_s;
3580 /* Number of kernel "jiffies" per second. This is used for the purpose of
3581 * computing buffer sizes. Generally kernel qdiscs need to be able to buffer
3582 * one jiffy's worth of data.
3584 * There are two possibilities here:
3586 * - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
3587 * approximate range of 100 to 1024. That means that we really need to
3588 * make sure that the qdisc can buffer that much data.
3590 * - 'buffer_hz' is an absurdly large number. That means that the kernel
3591 * has finely granular timers and there's no need to fudge additional room
3592 * for buffers. (There's no extra effort needed to implement that: the
3593 * large 'buffer_hz' is used as a divisor, so practically any number will
3594 * come out as 0 in the division. Small integer results in the case of
3595 * really high dividends won't have any real effect anyhow.)
3597 static unsigned int buffer_hz;
/* Returns the tc handle 'major':'minor', with 'major' placed in the upper 16
 * bits as TC_H_MAKE() expects. */
static unsigned int
tc_make_handle(unsigned int major, unsigned int minor)
{
    unsigned int shifted_major = major << 16;

    return TC_H_MAKE(shifted_major, minor);
}
/* Returns the major number from 'handle', shifted down from the upper 16
 * bits. */
static unsigned int
tc_get_major(unsigned int handle)
{
    unsigned int major_bits = TC_H_MAJ(handle);

    return major_bits >> 16;
}
/* Returns the minor number from 'handle', as extracted by TC_H_MIN(). */
static unsigned int
tc_get_minor(unsigned int handle)
{
    unsigned int minor = TC_H_MIN(handle);

    return minor;
}
3620 static struct tcmsg *
3621 tc_make_request(const struct netdev *netdev, int type, unsigned int flags,
3622 struct ofpbuf *request)
3624 struct tcmsg *tcmsg;
3628 error = get_ifindex(netdev, &ifindex);
3633 ofpbuf_init(request, 512);
3634 nl_msg_put_nlmsghdr(request, sizeof *tcmsg, type, NLM_F_REQUEST | flags);
3635 tcmsg = ofpbuf_put_zeros(request, sizeof *tcmsg);
3636 tcmsg->tcm_family = AF_UNSPEC;
3637 tcmsg->tcm_ifindex = ifindex;
3638 /* Caller should fill in tcmsg->tcm_handle. */
3639 /* Caller should fill in tcmsg->tcm_parent. */
3645 tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
3647 int error = nl_sock_transact(rtnl_sock, request, replyp);
3648 ofpbuf_uninit(request);
3652 /* Adds or deletes a root ingress qdisc on 'netdev'. We use this for
3653 * policing configuration.
3655 * This function is equivalent to running the following when 'add' is true:
3656 * /sbin/tc qdisc add dev <devname> handle ffff: ingress
3658 * This function is equivalent to running the following when 'add' is false:
3659 * /sbin/tc qdisc del dev <devname> handle ffff: ingress
3661 * The configuration and stats may be seen with the following command:
3662 * /sbin/tc -s qdisc show dev <devname>
3664 * Returns 0 if successful, otherwise a positive errno value.
3667 tc_add_del_ingress_qdisc(struct netdev *netdev, bool add)
3669 struct ofpbuf request;
3670 struct tcmsg *tcmsg;
3672 int type = add ? RTM_NEWQDISC : RTM_DELQDISC;
3673 int flags = add ? NLM_F_EXCL | NLM_F_CREATE : 0;
3675 tcmsg = tc_make_request(netdev, type, flags, &request);
3679 tcmsg->tcm_handle = tc_make_handle(0xffff, 0);
3680 tcmsg->tcm_parent = TC_H_INGRESS;
3681 nl_msg_put_string(&request, TCA_KIND, "ingress");
3682 nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0);
3684 error = tc_transact(&request, NULL);
3686 /* If we're deleting the qdisc, don't worry about some of the
3687 * error conditions. */
3688 if (!add && (error == ENOENT || error == EINVAL)) {
3697 /* Adds a policer to 'netdev' with a rate of 'kbits_rate' and a burst size
3700 * This function is equivalent to running:
3701 * /sbin/tc filter add dev <devname> parent ffff: protocol all prio 49
3702 * basic police rate <kbits_rate>kbit burst <kbits_burst>k
3705 * The configuration and stats may be seen with the following command:
3706 * /sbin/tc -s filter show <devname> eth0 parent ffff:
3708 * Returns 0 if successful, otherwise a positive errno value.
3711 tc_add_policer(struct netdev *netdev, int kbits_rate, int kbits_burst)
3713 struct tc_police tc_police;
3714 struct ofpbuf request;
3715 struct tcmsg *tcmsg;
3716 size_t basic_offset;
3717 size_t police_offset;
3721 memset(&tc_police, 0, sizeof tc_police);
3722 tc_police.action = TC_POLICE_SHOT;
3723 tc_police.mtu = mtu;
3724 tc_fill_rate(&tc_police.rate, (kbits_rate * 1000)/8, mtu);
3725 tc_police.burst = tc_bytes_to_ticks(tc_police.rate.rate,
3726 kbits_burst * 1024);
3728 tcmsg = tc_make_request(netdev, RTM_NEWTFILTER,
3729 NLM_F_EXCL | NLM_F_CREATE, &request);
3733 tcmsg->tcm_parent = tc_make_handle(0xffff, 0);
3734 tcmsg->tcm_info = tc_make_handle(49,
3735 (OVS_FORCE uint16_t) htons(ETH_P_ALL));
3737 nl_msg_put_string(&request, TCA_KIND, "basic");
3738 basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3739 police_offset = nl_msg_start_nested(&request, TCA_BASIC_POLICE);
3740 nl_msg_put_unspec(&request, TCA_POLICE_TBF, &tc_police, sizeof tc_police);
3741 tc_put_rtab(&request, TCA_POLICE_RATE, &tc_police.rate);
3742 nl_msg_end_nested(&request, police_offset);
3743 nl_msg_end_nested(&request, basic_offset);
3745 error = tc_transact(&request, NULL);
3756 /* The values in psched are not individually very meaningful, but they are
3757 * important. The tables below show some values seen in the wild.
3761 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
3762 * (Before that, there are hints that it was 1000000000.)
3764 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
3768 * -----------------------------------
3769 * [1] 000c8000 000f4240 000f4240 00000064
3770 * [2] 000003e8 00000400 000f4240 3b9aca00
3771 * [3] 000003e8 00000400 000f4240 3b9aca00
3772 * [4] 000003e8 00000400 000f4240 00000064
3773 * [5] 000003e8 00000040 000f4240 3b9aca00
3774 * [6] 000003e8 00000040 000f4240 000000f9
3776 * a b c d ticks_per_s buffer_hz
3777 * ------- --------- ---------- ------------- ----------- -------------
3778 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
3779 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3780 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3781 * [4] 1,000 1,024 1,000,000 100 976,562 100
3782 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
3783 * [6] 1,000 64 1,000,000 249 15,625,000 249
3785 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
3786 * [2] 2.6.26-1-686-bigmem from Debian lenny
3787 * [3] 2.6.26-2-sparc64 from Debian lenny
3788 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
3789 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
3790 * [6] 2.6.34 from kernel.org on KVM
3792 static const char fn[] = "/proc/net/psched";
3793 unsigned int a, b, c, d;
3799 stream = fopen(fn, "r");
3801 VLOG_WARN("%s: open failed: %s", fn, ovs_strerror(errno));
3805 if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
3806 VLOG_WARN("%s: read failed", fn);
3810 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
3814 VLOG_WARN("%s: invalid scheduler parameters", fn);
3818 ticks_per_s = (double) a * c / b;
3822 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
3825 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);
3828 /* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
3829 * rate of 'rate' bytes per second. */
3831 tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
3836 return (rate * ticks) / ticks_per_s;
3839 /* Returns the number of ticks that it would take to transmit 'size' bytes at a
3840 * rate of 'rate' bytes per second. */
3842 tc_bytes_to_ticks(unsigned int rate, unsigned int size)
3847 return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
3850 /* Returns the number of bytes that need to be reserved for qdisc buffering at
3851 * a transmission rate of 'rate' bytes per second. */
3853 tc_buffer_per_jiffy(unsigned int rate)
3858 return rate / buffer_hz;
3861 /* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
3862 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
3863 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
3864 * stores NULL into it if it is absent.
3866 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
3869 * Returns 0 if successful, otherwise a positive errno value. */
3871 tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
3872 struct nlattr **options)
3874 static const struct nl_policy tca_policy[] = {
3875 [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
3876 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
3878 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3880 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3881 tca_policy, ta, ARRAY_SIZE(ta))) {
3882 VLOG_WARN_RL(&rl, "failed to parse qdisc message");
3887 *kind = nl_attr_get_string(ta[TCA_KIND]);
3891 *options = ta[TCA_OPTIONS];
3906 /* Given Netlink 'msg' that describes a class, extracts the queue ID (e.g. the
3907 * minor number of its class ID) into '*queue_id', its TCA_OPTIONS attribute
3908 * into '*options', and its queue statistics into '*stats'. Any of the output
3909 * arguments may be null.
3911 * Returns 0 if successful, otherwise a positive errno value. */
3913 tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
3914 struct nlattr **options, struct netdev_queue_stats *stats)
3916 static const struct nl_policy tca_policy[] = {
3917 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
3918 [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
3920 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3922 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3923 tca_policy, ta, ARRAY_SIZE(ta))) {
3924 VLOG_WARN_RL(&rl, "failed to parse class message");
3929 struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
3930 *handlep = tc->tcm_handle;
3934 *options = ta[TCA_OPTIONS];
3938 const struct gnet_stats_queue *gsq;
3939 struct gnet_stats_basic gsb;
3941 static const struct nl_policy stats_policy[] = {
3942 [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
3943 .min_len = sizeof gsb },
3944 [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
3945 .min_len = sizeof *gsq },
3947 struct nlattr *sa[ARRAY_SIZE(stats_policy)];
3949 if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
3950 sa, ARRAY_SIZE(sa))) {
3951 VLOG_WARN_RL(&rl, "failed to parse class stats");
3955 /* Alignment issues screw up the length of struct gnet_stats_basic on
3956 * some arch/bitsize combinations. Newer versions of Linux have a
3957 * struct gnet_stats_basic_packed, but we can't depend on that. The
3958 * easiest thing to do is just to make a copy. */
3959 memset(&gsb, 0, sizeof gsb);
3960 memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
3961 MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
3962 stats->tx_bytes = gsb.bytes;
3963 stats->tx_packets = gsb.packets;
3965 gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
3966 stats->tx_errors = gsq->drops;
3976 memset(stats, 0, sizeof *stats);
3981 /* Queries the kernel for class with identifier 'handle' and parent 'parent'
3984 tc_query_class(const struct netdev *netdev,
3985 unsigned int handle, unsigned int parent,
3986 struct ofpbuf **replyp)
3988 struct ofpbuf request;
3989 struct tcmsg *tcmsg;
3992 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO, &request);
3996 tcmsg->tcm_handle = handle;
3997 tcmsg->tcm_parent = parent;
3999 error = tc_transact(&request, replyp);
4001 VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
4002 netdev_get_name(netdev),
4003 tc_get_major(handle), tc_get_minor(handle),
4004 tc_get_major(parent), tc_get_minor(parent),
4005 ovs_strerror(error));
4010 /* Equivalent to "tc class del dev <name> handle <handle>". */
4012 tc_delete_class(const struct netdev *netdev, unsigned int handle)
4014 struct ofpbuf request;
4015 struct tcmsg *tcmsg;
4018 tcmsg = tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
4022 tcmsg->tcm_handle = handle;
4023 tcmsg->tcm_parent = 0;
4025 error = tc_transact(&request, NULL);
4027 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
4028 netdev_get_name(netdev),
4029 tc_get_major(handle), tc_get_minor(handle),
4030 ovs_strerror(error));
4035 /* Equivalent to "tc qdisc del dev <name> root". */
4037 tc_del_qdisc(struct netdev *netdev_)
4039 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4040 struct ofpbuf request;
4041 struct tcmsg *tcmsg;
4044 tcmsg = tc_make_request(netdev_, RTM_DELQDISC, 0, &request);
4048 tcmsg->tcm_handle = tc_make_handle(1, 0);
4049 tcmsg->tcm_parent = TC_H_ROOT;
4051 error = tc_transact(&request, NULL);
4052 if (error == EINVAL) {
4053 /* EINVAL probably means that the default qdisc was in use, in which
4054 * case we've accomplished our purpose. */
4057 if (!error && netdev->tc) {
4058 if (netdev->tc->ops->tc_destroy) {
4059 netdev->tc->ops->tc_destroy(netdev->tc);
4066 /* If 'netdev''s qdisc type and parameters are not yet known, queries the
4067 * kernel to determine what they are. Returns 0 if successful, otherwise a
4068 * positive errno value. */
4070 tc_query_qdisc(const struct netdev *netdev_)
4072 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4073 struct ofpbuf request, *qdisc;
4074 const struct tc_ops *ops;
4075 struct tcmsg *tcmsg;
4083 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
4084 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
4085 * 2.6.35 without that fix backported to it.
4087 * To avoid the OOPS, we must not make a request that would attempt to dump
4088 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
4089 * few others. There are a few ways that I can see to do this, but most of
4090 * them seem to be racy (and if you lose the race the kernel OOPSes). The
4091 * technique chosen here is to assume that any non-default qdisc that we
4092 * create will have a class with handle 1:0. The built-in qdiscs only have
4093 * a class with handle 0:0.
4095 * We could check for Linux 2.6.35+ and use a more straightforward method
4097 tcmsg = tc_make_request(netdev_, RTM_GETQDISC, NLM_F_ECHO, &request);
4101 tcmsg->tcm_handle = tc_make_handle(1, 0);
4102 tcmsg->tcm_parent = 0;
4104 /* Figure out what tc class to instantiate. */
4105 error = tc_transact(&request, &qdisc);
4109 error = tc_parse_qdisc(qdisc, &kind, NULL);
4111 ops = &tc_ops_other;
4113 ops = tc_lookup_linux_name(kind);
4115 static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
4116 VLOG_INFO_RL(&rl2, "unknown qdisc \"%s\"", kind);
4118 ops = &tc_ops_other;
4121 } else if (error == ENOENT) {
4122 /* Either it's a built-in qdisc, or it's a qdisc set up by some
4123 * other entity that doesn't have a handle 1:0. We will assume
4124 * that it's the system default qdisc. */
4125 ops = &tc_ops_default;
4128 /* Who knows? Maybe the device got deleted. */
4129 VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
4130 netdev_get_name(netdev_), ovs_strerror(error));
4131 ops = &tc_ops_other;
4134 /* Instantiate it. */
4135 load_error = ops->tc_load(CONST_CAST(struct netdev *, netdev_), qdisc);
4136 ovs_assert((load_error == 0) == (netdev->tc != NULL));
4137 ofpbuf_delete(qdisc);
4139 return error ? error : load_error;
4142 /* Linux traffic control uses tables with 256 entries ("rtab" tables) to
4143 approximate the time to transmit packets of various lengths. For an MTU of
4144 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
4145 represents two possible packet lengths; for a MTU of 513 through 1024, four
4146 possible lengths; and so on.
4148 Returns, for the specified 'mtu', the number of bits that packet lengths
4149 need to be shifted right to fit within such a 256-entry table. */
4151 tc_calc_cell_log(unsigned int mtu)
4156 mtu = ETH_PAYLOAD_MAX;
4158 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
4160 for (cell_log = 0; mtu >= 256; cell_log++) {
4167 /* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
4170 tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
4172 memset(rate, 0, sizeof *rate);
4173 rate->cell_log = tc_calc_cell_log(mtu);
4174 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
4175 /* rate->cell_align = 0; */ /* distro headers. */
4176 rate->mpu = ETH_TOTAL_MIN;
4180 /* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
4181 * attribute of the specified "type".
4183 * See tc_calc_cell_log() above for a description of "rtab"s. */
4185 tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
4190 rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
4191 for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
4192 unsigned packet_size = (i + 1) << rate->cell_log;
4193 if (packet_size < rate->mpu) {
4194 packet_size = rate->mpu;
4196 rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
/* Computes the value to use for 'buffer' or 'cbuffer' in HTB options, given a
 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
 * burst size of 'burst_bytes'.  (A 'burst_bytes' of 0 is fine when the user
 * requested no particular value.)
 *
 * The burst actually used is never smaller than one jiffy's worth of data at
 * 'Bps' plus one MTU. */
static int
tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
{
    unsigned int floor_bytes = tc_buffer_per_jiffy(Bps) + mtu;
    uint64_t burst;

    burst = burst_bytes > floor_bytes ? burst_bytes : floor_bytes;
    return tc_bytes_to_ticks(Bps, burst);
}
4211 /* Linux-only functions declared in netdev-linux.h */
4213 /* Returns a fd for an AF_INET socket or a negative errno value. */
4215 netdev_linux_get_af_inet_sock(void)
4217 int error = netdev_linux_init();
4218 return error ? -error : af_inet_sock;
4221 /* Modifies the 'flag' bit in ethtool's flags field for 'netdev'. If
4222 * 'enable' is true, the bit is set. Otherwise, it is cleared. */
4224 netdev_linux_ethtool_set_flag(struct netdev *netdev, uint32_t flag,
4225 const char *flag_name, bool enable)
4227 const char *netdev_name = netdev_get_name(netdev);
4228 struct ethtool_value evalue;
4232 COVERAGE_INC(netdev_get_ethtool);
4233 memset(&evalue, 0, sizeof evalue);
4234 error = netdev_linux_do_ethtool(netdev_name,
4235 (struct ethtool_cmd *)&evalue,
4236 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
4241 COVERAGE_INC(netdev_set_ethtool);
4242 evalue.data = new_flags = (evalue.data & ~flag) | (enable ? flag : 0);
4243 error = netdev_linux_do_ethtool(netdev_name,
4244 (struct ethtool_cmd *)&evalue,
4245 ETHTOOL_SFLAGS, "ETHTOOL_SFLAGS");
4250 COVERAGE_INC(netdev_get_ethtool);
4251 memset(&evalue, 0, sizeof evalue);
4252 error = netdev_linux_do_ethtool(netdev_name,
4253 (struct ethtool_cmd *)&evalue,
4254 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
4259 if (new_flags != evalue.data) {
4260 VLOG_WARN_RL(&rl, "attempt to %s ethtool %s flag on network "
4261 "device %s failed", enable ? "enable" : "disable",
4262 flag_name, netdev_name);
4269 /* Utility functions. */
4271 /* Copies 'src' into 'dst', performing format conversion in the process. */
4273 netdev_stats_from_rtnl_link_stats(struct netdev_stats *dst,
4274 const struct rtnl_link_stats *src)
4276 dst->rx_packets = src->rx_packets;
4277 dst->tx_packets = src->tx_packets;
4278 dst->rx_bytes = src->rx_bytes;
4279 dst->tx_bytes = src->tx_bytes;
4280 dst->rx_errors = src->rx_errors;
4281 dst->tx_errors = src->tx_errors;
4282 dst->rx_dropped = src->rx_dropped;
4283 dst->tx_dropped = src->tx_dropped;
4284 dst->multicast = src->multicast;
4285 dst->collisions = src->collisions;
4286 dst->rx_length_errors = src->rx_length_errors;
4287 dst->rx_over_errors = src->rx_over_errors;
4288 dst->rx_crc_errors = src->rx_crc_errors;
4289 dst->rx_frame_errors = src->rx_frame_errors;
4290 dst->rx_fifo_errors = src->rx_fifo_errors;
4291 dst->rx_missed_errors = src->rx_missed_errors;
4292 dst->tx_aborted_errors = src->tx_aborted_errors;
4293 dst->tx_carrier_errors = src->tx_carrier_errors;
4294 dst->tx_fifo_errors = src->tx_fifo_errors;
4295 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
4296 dst->tx_window_errors = src->tx_window_errors;
4300 get_stats_via_netlink(int ifindex, struct netdev_stats *stats)
4302 /* Policy for RTNLGRP_LINK messages.
4304 * There are *many* more fields in these messages, but currently we only
4305 * care about these fields. */
4306 static const struct nl_policy rtnlgrp_link_policy[] = {
4307 [IFLA_IFNAME] = { .type = NL_A_STRING, .optional = false },
4308 [IFLA_STATS] = { .type = NL_A_UNSPEC, .optional = true,
4309 .min_len = sizeof(struct rtnl_link_stats) },
4312 struct ofpbuf request;
4313 struct ofpbuf *reply;
4314 struct ifinfomsg *ifi;
4315 struct nlattr *attrs[ARRAY_SIZE(rtnlgrp_link_policy)];
4318 ofpbuf_init(&request, 0);
4319 nl_msg_put_nlmsghdr(&request, sizeof *ifi, RTM_GETLINK, NLM_F_REQUEST);
4320 ifi = ofpbuf_put_zeros(&request, sizeof *ifi);
4321 ifi->ifi_family = PF_UNSPEC;
4322 ifi->ifi_index = ifindex;
4323 error = nl_sock_transact(rtnl_sock, &request, &reply);
4324 ofpbuf_uninit(&request);
4329 if (!nl_policy_parse(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg),
4330 rtnlgrp_link_policy,
4331 attrs, ARRAY_SIZE(rtnlgrp_link_policy))) {
4332 ofpbuf_delete(reply);
4336 if (!attrs[IFLA_STATS]) {
4337 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
4338 ofpbuf_delete(reply);
4342 netdev_stats_from_rtnl_link_stats(stats, nl_attr_get(attrs[IFLA_STATS]));
4344 ofpbuf_delete(reply);
4350 get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats)
4352 static const char fn[] = "/proc/net/dev";
4357 stream = fopen(fn, "r");
4359 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, ovs_strerror(errno));
4364 while (fgets(line, sizeof line, stream)) {
4367 #define X64 "%"SCNu64
4370 X64 X64 X64 X64 X64 X64 X64 "%*u"
4371 X64 X64 X64 X64 X64 X64 X64 "%*u",
4377 &stats->rx_fifo_errors,
4378 &stats->rx_frame_errors,
4384 &stats->tx_fifo_errors,
4386 &stats->tx_carrier_errors) != 15) {
4387 VLOG_WARN_RL(&rl, "%s:%d: parse error", fn, ln);
4388 } else if (!strcmp(devname, netdev_name)) {
4389 stats->rx_length_errors = UINT64_MAX;
4390 stats->rx_over_errors = UINT64_MAX;
4391 stats->rx_crc_errors = UINT64_MAX;
4392 stats->rx_missed_errors = UINT64_MAX;
4393 stats->tx_aborted_errors = UINT64_MAX;
4394 stats->tx_heartbeat_errors = UINT64_MAX;
4395 stats->tx_window_errors = UINT64_MAX;
4401 VLOG_WARN_RL(&rl, "%s: no stats for %s", fn, netdev_name);
4407 get_flags(const struct netdev *dev, unsigned int *flags)
4413 error = netdev_linux_do_ioctl(dev->name, &ifr, SIOCGIFFLAGS,
4416 *flags = ifr.ifr_flags;
4422 set_flags(const char *name, unsigned int flags)
4426 ifr.ifr_flags = flags;
4427 return netdev_linux_do_ioctl(name, &ifr, SIOCSIFFLAGS, "SIOCSIFFLAGS");
4431 do_get_ifindex(const char *netdev_name)
4435 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4436 COVERAGE_INC(netdev_get_ifindex);
4437 if (ioctl(af_inet_sock, SIOCGIFINDEX, &ifr) < 0) {
4438 VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
4439 netdev_name, ovs_strerror(errno));
4442 return ifr.ifr_ifindex;
4446 get_ifindex(const struct netdev *netdev_, int *ifindexp)
4448 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4450 if (!(netdev->cache_valid & VALID_IFINDEX)) {
4451 int ifindex = do_get_ifindex(netdev_get_name(netdev_));
4454 netdev->get_ifindex_error = -ifindex;
4455 netdev->ifindex = 0;
4457 netdev->get_ifindex_error = 0;
4458 netdev->ifindex = ifindex;
4460 netdev->cache_valid |= VALID_IFINDEX;
4463 *ifindexp = netdev->ifindex;
4464 return netdev->get_ifindex_error;
4468 get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN])
4473 memset(&ifr, 0, sizeof ifr);
4474 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4475 COVERAGE_INC(netdev_get_hwaddr);
4476 if (ioctl(af_inet_sock, SIOCGIFHWADDR, &ifr) < 0) {
4477 /* ENODEV probably means that a vif disappeared asynchronously and
4478 * hasn't been removed from the database yet, so reduce the log level
4479 * to INFO for that case. */
4480 VLOG(errno == ENODEV ? VLL_INFO : VLL_ERR,
4481 "ioctl(SIOCGIFHWADDR) on %s device failed: %s",
4482 netdev_name, ovs_strerror(errno));
4485 hwaddr_family = ifr.ifr_hwaddr.sa_family;
4486 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER) {
4487 VLOG_WARN("%s device has unknown hardware address family %d",
4488 netdev_name, hwaddr_family);
4490 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
4495 set_etheraddr(const char *netdev_name,
4496 const uint8_t mac[ETH_ADDR_LEN])
4500 memset(&ifr, 0, sizeof ifr);
4501 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4502 ifr.ifr_hwaddr.sa_family = ARPHRD_ETHER;
4503 memcpy(ifr.ifr_hwaddr.sa_data, mac, ETH_ADDR_LEN);
4504 COVERAGE_INC(netdev_set_hwaddr);
4505 if (ioctl(af_inet_sock, SIOCSIFHWADDR, &ifr) < 0) {
4506 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
4507 netdev_name, ovs_strerror(errno));
4514 netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
4515 int cmd, const char *cmd_name)
4519 memset(&ifr, 0, sizeof ifr);
4520 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
4521 ifr.ifr_data = (caddr_t) ecmd;
4524 if (ioctl(af_inet_sock, SIOCETHTOOL, &ifr) == 0) {
4527 if (errno != EOPNOTSUPP) {
4528 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
4529 "failed: %s", cmd_name, name, ovs_strerror(errno));
4531 /* The device doesn't support this operation. That's pretty
4532 * common, so there's no point in logging anything. */
4539 netdev_linux_do_ioctl(const char *name, struct ifreq *ifr, int cmd,
4540 const char *cmd_name)
4542 ovs_strzcpy(ifr->ifr_name, name, sizeof ifr->ifr_name);
4543 if (ioctl(af_inet_sock, cmd, ifr) == -1) {
4544 VLOG_DBG_RL(&rl, "%s: ioctl(%s) failed: %s", name, cmd_name,
4545 ovs_strerror(errno));
4552 netdev_linux_get_ipv4(const struct netdev *netdev, struct in_addr *ip,
4553 int cmd, const char *cmd_name)
4558 ifr.ifr_addr.sa_family = AF_INET;
4559 error = netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, cmd, cmd_name);
4561 const struct sockaddr_in *sin = (struct sockaddr_in *) &ifr.ifr_addr;
4562 *ip = sin->sin_addr;
4567 /* Returns an AF_PACKET raw socket or a negative errno value. */
4569 af_packet_sock(void)
4571 static int sock = INT_MIN;
4573 if (sock == INT_MIN) {
4574 sock = socket(AF_PACKET, SOCK_RAW, 0);
4576 int error = set_nonblocking(sock);
4583 VLOG_ERR("failed to create packet socket: %s",
4584 ovs_strerror(errno));