2 * Copyright (c) 2009, 2010, 2011, 2012, 2013 Nicira, Inc.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
19 #include "netdev-linux.h"
23 #include <arpa/inet.h>
25 #include <linux/filter.h>
26 #include <linux/gen_stats.h>
27 #include <linux/if_ether.h>
28 #include <linux/if_tun.h>
29 #include <linux/types.h>
30 #include <linux/ethtool.h>
31 #include <linux/mii.h>
32 #include <linux/pkt_cls.h>
33 #include <linux/pkt_sched.h>
34 #include <linux/rtnetlink.h>
35 #include <linux/sockios.h>
36 #include <linux/version.h>
37 #include <sys/types.h>
38 #include <sys/ioctl.h>
39 #include <sys/socket.h>
40 #include <netpacket/packet.h>
42 #include <net/if_arp.h>
43 #include <net/if_packet.h>
44 #include <net/route.h>
45 #include <netinet/in.h>
52 #include "dpif-linux.h"
53 #include "dynamic-string.h"
54 #include "fatal-signal.h"
57 #include "netdev-provider.h"
58 #include "netdev-vport.h"
59 #include "netlink-notifier.h"
60 #include "netlink-socket.h"
63 #include "openflow/openflow.h"
65 #include "poll-loop.h"
66 #include "rtnetlink-link.h"
68 #include "socket-util.h"
71 #include "unaligned.h"
74 VLOG_DEFINE_THIS_MODULE(netdev_linux);
76 COVERAGE_DEFINE(netdev_set_policing);
77 COVERAGE_DEFINE(netdev_arp_lookup);
78 COVERAGE_DEFINE(netdev_get_ifindex);
79 COVERAGE_DEFINE(netdev_get_hwaddr);
80 COVERAGE_DEFINE(netdev_set_hwaddr);
81 COVERAGE_DEFINE(netdev_get_ethtool);
82 COVERAGE_DEFINE(netdev_set_ethtool);
85 /* These were introduced in Linux 2.6.14, so they might be missing if we have
87 #ifndef ADVERTISED_Pause
88 #define ADVERTISED_Pause (1 << 13)
90 #ifndef ADVERTISED_Asym_Pause
91 #define ADVERTISED_Asym_Pause (1 << 14)
94 /* These were introduced in Linux 2.6.24, so they might be missing if we
95 * have old headers. */
96 #ifndef ETHTOOL_GFLAGS
97 #define ETHTOOL_GFLAGS 0x00000025 /* Get flags bitmap(ethtool_value) */
99 #ifndef ETHTOOL_SFLAGS
100 #define ETHTOOL_SFLAGS 0x00000026 /* Set flags bitmap(ethtool_value) */
103 /* This was introduced in Linux 2.6.25, so it might be missing if we have old
106 #define TC_RTAB_SIZE 1024
109 static struct nln_notifier *netdev_linux_cache_notifier = NULL;
110 static int cache_notifier_refcount;
113 VALID_IFINDEX = 1 << 0,
114 VALID_ETHERADDR = 1 << 1,
118 VALID_POLICING = 1 << 5,
119 VALID_VPORT_STAT_ERROR = 1 << 6,
120 VALID_DRVINFO = 1 << 7,
121 VALID_FEATURES = 1 << 8,
128 /* Traffic control. */
130 /* An instance of a traffic control class. Always associated with a particular
133 * Each TC implementation subclasses this with whatever additional data it
136 const struct tc_ops *ops;
137 struct hmap queues; /* Contains "struct tc_queue"s.
138 * Read by generic TC layer.
139 * Written only by TC implementation. */
142 #define TC_INITIALIZER(TC, OPS) { OPS, HMAP_INITIALIZER(&(TC)->queues) }
144 /* One traffic control queue.
146 * Each TC implementation subclasses this with whatever additional data it
149 struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
150 unsigned int queue_id; /* OpenFlow queue ID. */
151 long long int created; /* Time queue was created, in msecs. */
154 /* A particular kind of traffic control. Each implementation generally maps to
155 * one particular Linux qdisc class.
157 * The functions below return 0 if successful or a positive errno value on
158 * failure, except where otherwise noted. All of them must be provided, except
159 * where otherwise noted. */
161 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
162 * This is null for tc_ops_default and tc_ops_other, for which there are no
163 * appropriate values. */
164 const char *linux_name;
166 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
167 const char *ovs_name;
169 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
170 * queues. The queues are numbered 0 through n_queues - 1. */
171 unsigned int n_queues;
173 /* Called to install this TC class on 'netdev'. The implementation should
174 * make the Netlink calls required to set up 'netdev' with the right qdisc
175 * and configure it according to 'details'. The implementation may assume
176 * that the current qdisc is the default; that is, there is no need for it
177 * to delete the current qdisc before installing itself.
179 * The contents of 'details' should be documented as valid for 'ovs_name'
180 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
181 * (which is built as ovs-vswitchd.conf.db(8)).
183 * This function must return 0 if and only if it sets 'netdev->tc' to an
184 * initialized 'struct tc'.
186 * (This function is null for tc_ops_other, which cannot be installed. For
187 * other TC classes it should always be nonnull.) */
188 int (*tc_install)(struct netdev *netdev, const struct smap *details);
190 /* Called when the netdev code determines (through a Netlink query) that
191 * this TC class's qdisc is installed on 'netdev', but we didn't install
192 * it ourselves and so don't know any of the details.
194 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
195 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
196 * implementation should parse the other attributes of 'nlmsg' as
197 * necessary to determine its configuration. If necessary it should also
198 * use Netlink queries to determine the configuration of queues on
201 * This function must return 0 if and only if it sets 'netdev->tc' to an
202 * initialized 'struct tc'. */
203 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
205 /* Destroys the data structures allocated by the implementation as part of
206 * 'tc'. (This includes destroying 'tc->queues' by calling
209 * The implementation should not need to perform any Netlink calls. If
210 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
211 * (But it may not be desirable.)
213 * This function may be null if 'tc' is trivial. */
214 void (*tc_destroy)(struct tc *tc);
216 /* Retrieves details of 'netdev->tc' configuration into 'details'.
218 * The implementation should not need to perform any Netlink calls, because
219 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
220 * cached the configuration.
222 * The contents of 'details' should be documented as valid for 'ovs_name'
223 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
224 * (which is built as ovs-vswitchd.conf.db(8)).
226 * This function may be null if 'tc' is not configurable.
228 int (*qdisc_get)(const struct netdev *netdev, struct smap *details);
230 /* Reconfigures 'netdev->tc' according to 'details', performing any
231 * required Netlink calls to complete the reconfiguration.
233 * The contents of 'details' should be documented as valid for 'ovs_name'
234 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
235 * (which is built as ovs-vswitchd.conf.db(8)).
237 * This function may be null if 'tc' is not configurable.
239 int (*qdisc_set)(struct netdev *, const struct smap *details);
241 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
242 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
244 * The contents of 'details' should be documented as valid for 'ovs_name'
245 * in the "other_config" column in the "Queue" table in
246 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
248 * The implementation should not need to perform any Netlink calls, because
249 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
250 * cached the queue configuration.
252 * This function may be null if 'tc' does not have queues ('n_queues' is
254 int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
255 struct smap *details);
257 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
258 * 'details', performing any required Netlink calls to complete the
259 * reconfiguration. The caller ensures that 'queue_id' is less than
262 * The contents of 'details' should be documented as valid for 'ovs_name'
263 * in the "other_config" column in the "Queue" table in
264 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
266 * This function may be null if 'tc' does not have queues or its queues are
267 * not configurable. */
268 int (*class_set)(struct netdev *, unsigned int queue_id,
269 const struct smap *details);
271 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
272 * tc_queue's within 'netdev->tc->queues'.
274 * This function may be null if 'tc' does not have queues or its queues
275 * cannot be deleted. */
276 int (*class_delete)(struct netdev *, struct tc_queue *queue);
278 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
279 * 'struct tc_queue's within 'netdev->tc->queues'.
281 * On success, initializes '*stats'.
283 * This function may be null if 'tc' does not have queues or if it cannot
284 * report queue statistics. */
285 int (*class_get_stats)(const struct netdev *netdev,
286 const struct tc_queue *queue,
287 struct netdev_queue_stats *stats);
289 /* Extracts queue stats from 'nlmsg', which is a response to a
290 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
292 * This function may be null if 'tc' does not have queues or if it cannot
293 * report queue statistics. */
294 int (*class_dump_stats)(const struct netdev *netdev,
295 const struct ofpbuf *nlmsg,
296 netdev_dump_queue_stats_cb *cb, void *aux);
300 tc_init(struct tc *tc, const struct tc_ops *ops)
303 hmap_init(&tc->queues);
307 tc_destroy(struct tc *tc)
309 hmap_destroy(&tc->queues);
312 static const struct tc_ops tc_ops_htb;
313 static const struct tc_ops tc_ops_hfsc;
314 static const struct tc_ops tc_ops_default;
315 static const struct tc_ops tc_ops_other;
317 static const struct tc_ops *const tcs[] = {
318 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
319 &tc_ops_hfsc, /* Hierarchical fair service curve. */
320 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
321 &tc_ops_other, /* Some other qdisc. */
325 static unsigned int tc_make_handle(unsigned int major, unsigned int minor);
326 static unsigned int tc_get_major(unsigned int handle);
327 static unsigned int tc_get_minor(unsigned int handle);
329 static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
330 static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
331 static unsigned int tc_buffer_per_jiffy(unsigned int rate);
333 static struct tcmsg *tc_make_request(const struct netdev *, int type,
334 unsigned int flags, struct ofpbuf *);
335 static int tc_transact(struct ofpbuf *request, struct ofpbuf **replyp);
336 static int tc_add_del_ingress_qdisc(struct netdev *netdev, bool add);
337 static int tc_add_policer(struct netdev *netdev, int kbits_rate,
340 static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
341 struct nlattr **options);
342 static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
343 struct nlattr **options,
344 struct netdev_queue_stats *);
345 static int tc_query_class(const struct netdev *,
346 unsigned int handle, unsigned int parent,
347 struct ofpbuf **replyp);
348 static int tc_delete_class(const struct netdev *, unsigned int handle);
350 static int tc_del_qdisc(struct netdev *netdev);
351 static int tc_query_qdisc(const struct netdev *netdev);
353 static int tc_calc_cell_log(unsigned int mtu);
354 static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
355 static void tc_put_rtab(struct ofpbuf *, uint16_t type,
356 const struct tc_ratespec *rate);
357 static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
359 struct netdev_linux {
362 struct shash_node *shash_node;
363 unsigned int cache_valid;
364 unsigned int change_seq;
366 bool miimon; /* Link status of last poll. */
367 long long int miimon_interval; /* Miimon Poll rate. Disabled if <= 0. */
368 struct timer miimon_timer;
370 /* The following are figured out "on demand" only. They are only valid
371 * when the corresponding VALID_* bit in 'cache_valid' is set. */
373 uint8_t etheraddr[ETH_ADDR_LEN];
374 struct in_addr address, netmask;
377 unsigned int ifi_flags;
378 long long int carrier_resets;
379 uint32_t kbits_rate; /* Policing data. */
380 uint32_t kbits_burst;
381 int vport_stats_error; /* Cached error code from vport_get_stats().
382 0 or an errno value. */
383 int netdev_mtu_error; /* Cached error code from SIOCGIFMTU or SIOCSIFMTU. */
384 int ether_addr_error; /* Cached error code from set/get etheraddr. */
385 int netdev_policing_error; /* Cached error code from set policing. */
386 int get_features_error; /* Cached error code from ETHTOOL_GSET. */
387 int get_ifindex_error; /* Cached error code from SIOCGIFINDEX. */
389 enum netdev_features current; /* Cached from ETHTOOL_GSET. */
390 enum netdev_features advertised; /* Cached from ETHTOOL_GSET. */
391 enum netdev_features supported; /* Cached from ETHTOOL_GSET. */
392 enum netdev_features peer; /* Cached from ETHTOOL_GSET. */
394 struct ethtool_drvinfo drvinfo; /* Cached from ETHTOOL_GDRVINFO. */
398 struct tap_state tap;
402 struct netdev_rx_linux {
408 static const struct netdev_rx_class netdev_rx_linux_class;
410 /* Sockets used for ioctl operations. */
411 static int af_inet_sock = -1; /* AF_INET, SOCK_DGRAM. */
413 /* This is set pretty low because we probably won't learn anything from the
414 * additional log messages. */
415 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
417 static int netdev_linux_init(void);
419 static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
420 int cmd, const char *cmd_name);
421 static int netdev_linux_do_ioctl(const char *name, struct ifreq *, int cmd,
422 const char *cmd_name);
423 static int netdev_linux_get_ipv4(const struct netdev *, struct in_addr *,
424 int cmd, const char *cmd_name);
425 static int get_flags(const struct netdev *, unsigned int *flags);
426 static int set_flags(const char *, unsigned int flags);
427 static int do_get_ifindex(const char *netdev_name);
428 static int get_ifindex(const struct netdev *, int *ifindexp);
429 static int do_set_addr(struct netdev *netdev,
430 int ioctl_nr, const char *ioctl_name,
431 struct in_addr addr);
432 static int get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN]);
433 static int set_etheraddr(const char *netdev_name, const uint8_t[ETH_ADDR_LEN]);
434 static int get_stats_via_netlink(int ifindex, struct netdev_stats *stats);
435 static int get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats);
436 static int af_packet_sock(void);
437 static void netdev_linux_miimon_run(void);
438 static void netdev_linux_miimon_wait(void);
441 is_netdev_linux_class(const struct netdev_class *netdev_class)
443 return netdev_class->init == netdev_linux_init;
447 is_tap_netdev(const struct netdev *netdev)
449 return netdev_get_class(netdev) == &netdev_tap_class;
452 static struct netdev_linux *
453 netdev_linux_cast(const struct netdev *netdev)
455 ovs_assert(is_netdev_linux_class(netdev_get_class(netdev)));
457 return CONTAINER_OF(netdev, struct netdev_linux, up);
460 static struct netdev_rx_linux *
461 netdev_rx_linux_cast(const struct netdev_rx *rx)
463 netdev_rx_assert_class(rx, &netdev_rx_linux_class);
464 return CONTAINER_OF(rx, struct netdev_rx_linux, up);
468 netdev_linux_init(void)
470 static int status = -1;
472 /* Create AF_INET socket. */
473 af_inet_sock = socket(AF_INET, SOCK_DGRAM, 0);
474 status = af_inet_sock >= 0 ? 0 : errno;
476 VLOG_ERR("failed to create inet socket: %s", ovs_strerror(status));
483 netdev_linux_run(void)
485 rtnetlink_link_run();
486 netdev_linux_miimon_run();
490 netdev_linux_wait(void)
492 rtnetlink_link_wait();
493 netdev_linux_miimon_wait();
497 netdev_linux_changed(struct netdev_linux *dev,
498 unsigned int ifi_flags, unsigned int mask)
501 if (!dev->change_seq) {
505 if ((dev->ifi_flags ^ ifi_flags) & IFF_RUNNING) {
506 dev->carrier_resets++;
508 dev->ifi_flags = ifi_flags;
510 dev->cache_valid &= mask;
514 netdev_linux_update(struct netdev_linux *dev,
515 const struct rtnetlink_link_change *change)
517 if (change->nlmsg_type == RTM_NEWLINK) {
519 netdev_linux_changed(dev, change->ifi_flags, VALID_DRVINFO);
521 /* Update netdev from rtnl-change msg. */
523 dev->mtu = change->mtu;
524 dev->cache_valid |= VALID_MTU;
525 dev->netdev_mtu_error = 0;
528 if (!eth_addr_is_zero(change->addr)) {
529 memcpy(dev->etheraddr, change->addr, ETH_ADDR_LEN);
530 dev->cache_valid |= VALID_ETHERADDR;
531 dev->ether_addr_error = 0;
534 dev->ifindex = change->ifi_index;
535 dev->cache_valid |= VALID_IFINDEX;
536 dev->get_ifindex_error = 0;
539 netdev_linux_changed(dev, change->ifi_flags, 0);
544 netdev_linux_cache_cb(const struct rtnetlink_link_change *change,
545 void *aux OVS_UNUSED)
547 struct netdev_linux *dev;
549 struct netdev *base_dev = netdev_from_name(change->ifname);
550 if (base_dev && is_netdev_linux_class(netdev_get_class(base_dev))) {
551 netdev_linux_update(netdev_linux_cast(base_dev), change);
554 struct shash device_shash;
555 struct shash_node *node;
557 shash_init(&device_shash);
558 netdev_get_devices(&netdev_linux_class, &device_shash);
559 SHASH_FOR_EACH (node, &device_shash) {
564 get_flags(&dev->up, &flags);
565 netdev_linux_changed(dev, flags, 0);
567 shash_destroy(&device_shash);
572 cache_notifier_ref(void)
574 if (!cache_notifier_refcount) {
575 ovs_assert(!netdev_linux_cache_notifier);
577 netdev_linux_cache_notifier =
578 rtnetlink_link_notifier_create(netdev_linux_cache_cb, NULL);
580 if (!netdev_linux_cache_notifier) {
584 cache_notifier_refcount++;
590 cache_notifier_unref(void)
592 ovs_assert(cache_notifier_refcount > 0);
593 if (!--cache_notifier_refcount) {
594 ovs_assert(netdev_linux_cache_notifier);
595 rtnetlink_link_notifier_destroy(netdev_linux_cache_notifier);
596 netdev_linux_cache_notifier = NULL;
600 /* Creates system and internal devices. */
602 netdev_linux_create(const struct netdev_class *class, const char *name,
603 struct netdev **netdevp)
605 struct netdev_linux *netdev;
608 error = cache_notifier_ref();
613 netdev = xzalloc(sizeof *netdev);
614 netdev->change_seq = 1;
615 netdev_init(&netdev->up, name, class);
616 error = get_flags(&netdev->up, &netdev->ifi_flags);
617 if (error == ENODEV) {
618 if (class != &netdev_internal_class) {
619 /* The device does not exist, so don't allow it to be opened. */
620 netdev_uninit(&netdev->up, false);
621 cache_notifier_unref();
625 /* "Internal" netdevs have to be created as netdev objects before
626 * they exist in the kernel, because creating them in the kernel
627 * happens by passing a netdev object to dpif_port_add().
628 * Therefore, ignore the error. */
632 *netdevp = &netdev->up;
636 /* For most types of netdevs we open the device for each call of
637 * netdev_open(). However, this is not the case with tap devices,
638 * since it is only possible to open the device once. In this
639 * situation we share a single file descriptor, and consequently
640 * buffers, across all readers. Therefore once data is read it will
641 * be unavailable to other reads for tap devices. */
643 netdev_linux_create_tap(const struct netdev_class *class OVS_UNUSED,
644 const char *name, struct netdev **netdevp)
646 struct netdev_linux *netdev;
647 struct tap_state *state;
648 static const char tap_dev[] = "/dev/net/tun";
652 netdev = xzalloc(sizeof *netdev);
653 netdev->change_seq = 1;
654 state = &netdev->state.tap;
656 error = cache_notifier_ref();
661 /* Open tap device. */
662 state->fd = open(tap_dev, O_RDWR);
665 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, ovs_strerror(error));
666 goto error_unref_notifier;
669 /* Create tap device. */
670 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
671 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
672 if (ioctl(state->fd, TUNSETIFF, &ifr) == -1) {
673 VLOG_WARN("%s: creating tap device failed: %s", name,
674 ovs_strerror(errno));
679 /* Make non-blocking. */
680 error = set_nonblocking(state->fd);
685 netdev_init(&netdev->up, name, &netdev_tap_class);
686 *netdevp = &netdev->up;
691 error_unref_notifier:
692 cache_notifier_unref();
699 destroy_tap(struct netdev_linux *netdev)
701 struct tap_state *state = &netdev->state.tap;
703 if (state->fd >= 0) {
708 /* Destroys the netdev device 'netdev_'. */
710 netdev_linux_destroy(struct netdev *netdev_)
712 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
714 if (netdev->tc && netdev->tc->ops->tc_destroy) {
715 netdev->tc->ops->tc_destroy(netdev->tc);
718 if (netdev_get_class(netdev_) == &netdev_tap_class) {
723 cache_notifier_unref();
727 netdev_linux_rx_open(struct netdev *netdev_, struct netdev_rx **rxp)
729 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
730 bool is_tap = is_tap_netdev(netdev_);
731 struct netdev_rx_linux *rx;
736 fd = netdev->state.tap.fd;
738 struct sockaddr_ll sll;
740 /* Result of tcpdump -dd inbound */
741 static struct sock_filter filt[] = {
742 { 0x28, 0, 0, 0xfffff004 }, /* ldh [0] */
743 { 0x15, 0, 1, 0x00000004 }, /* jeq #4 jt 2 jf 3 */
744 { 0x6, 0, 0, 0x00000000 }, /* ret #0 */
745 { 0x6, 0, 0, 0x0000ffff } /* ret #65535 */
747 static struct sock_fprog fprog = { ARRAY_SIZE(filt), filt };
749 /* Create file descriptor. */
750 fd = socket(PF_PACKET, SOCK_RAW, 0);
753 VLOG_ERR("failed to create raw socket (%s)", ovs_strerror(error));
757 /* Set non-blocking mode. */
758 error = set_nonblocking(fd);
763 /* Get ethernet device index. */
764 error = get_ifindex(&netdev->up, &ifindex);
769 /* Bind to specific ethernet device. */
770 memset(&sll, 0, sizeof sll);
771 sll.sll_family = AF_PACKET;
772 sll.sll_ifindex = ifindex;
773 sll.sll_protocol = (OVS_FORCE unsigned short int) htons(ETH_P_ALL);
774 if (bind(fd, (struct sockaddr *) &sll, sizeof sll) < 0) {
776 VLOG_ERR("%s: failed to bind raw socket (%s)",
777 netdev_get_name(netdev_), ovs_strerror(error));
781 /* Filter for only inbound packets. */
782 error = setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER, &fprog,
786 VLOG_ERR("%s: failed attach filter (%s)",
787 netdev_get_name(netdev_), ovs_strerror(error));
792 rx = xmalloc(sizeof *rx);
793 netdev_rx_init(&rx->up, netdev_, &netdev_rx_linux_class);
808 netdev_rx_linux_destroy(struct netdev_rx *rx_)
810 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
819 netdev_rx_linux_recv(struct netdev_rx *rx_, void *data, size_t size)
821 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
826 ? read(rx->fd, data, size)
827 : recv(rx->fd, data, size, MSG_TRUNC));
828 } while (retval < 0 && errno == EINTR);
831 return retval > size ? -EMSGSIZE : retval;
833 if (errno != EAGAIN) {
834 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
835 ovs_strerror(errno), netdev_rx_get_name(rx_));
842 netdev_rx_linux_wait(struct netdev_rx *rx_)
844 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
845 poll_fd_wait(rx->fd, POLLIN);
849 netdev_rx_linux_drain(struct netdev_rx *rx_)
851 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
854 int error = netdev_linux_do_ioctl(netdev_rx_get_name(rx_), &ifr,
855 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
859 drain_fd(rx->fd, ifr.ifr_qlen);
862 return drain_rcvbuf(rx->fd);
866 /* Sends 'buffer' on 'netdev'. Returns 0 if successful, otherwise a positive
867 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
868 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
869 * the packet is too big or too small to transmit on the device.
871 * The caller retains ownership of 'buffer' in all cases.
873 * The kernel maintains a packet transmission queue, so the caller is not
874 * expected to do additional queuing of packets. */
876 netdev_linux_send(struct netdev *netdev_, const void *data, size_t size)
881 if (!is_tap_netdev(netdev_)) {
882 /* Use our AF_PACKET socket to send to this device. */
883 struct sockaddr_ll sll;
890 sock = af_packet_sock();
895 error = get_ifindex(netdev_, &ifindex);
900 /* We don't bother setting most fields in sockaddr_ll because the
901 * kernel ignores them for SOCK_RAW. */
902 memset(&sll, 0, sizeof sll);
903 sll.sll_family = AF_PACKET;
904 sll.sll_ifindex = ifindex;
906 iov.iov_base = CONST_CAST(void *, data);
910 msg.msg_namelen = sizeof sll;
913 msg.msg_control = NULL;
914 msg.msg_controllen = 0;
917 retval = sendmsg(sock, &msg, 0);
919 /* Use the tap fd to send to this device. This is essential for
920 * tap devices, because packets sent to a tap device with an
921 * AF_PACKET socket will loop back to be *received* again on the
922 * tap device. This doesn't occur on other interface types
923 * because we attach a socket filter to the rx socket. */
924 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
926 retval = write(netdev->state.tap.fd, data, size);
930 /* The Linux AF_PACKET implementation never blocks waiting for room
931 * for packets, instead returning ENOBUFS. Translate this into
932 * EAGAIN for the caller. */
933 if (errno == ENOBUFS) {
935 } else if (errno == EINTR) {
937 } else if (errno != EAGAIN) {
938 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
939 netdev_get_name(netdev_), ovs_strerror(errno));
942 } else if (retval != size) {
943 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%zd bytes of "
944 "%zu) on %s", retval, size, netdev_get_name(netdev_));
952 /* Registers with the poll loop to wake up from the next call to poll_block()
953 * when the packet transmission queue has sufficient room to transmit a packet
954 * with netdev_send().
956 * The kernel maintains a packet transmission queue, so the client is not
957 * expected to do additional queuing of packets. Thus, this function is
958 * unlikely to ever be used. It is included for completeness. */
960 netdev_linux_send_wait(struct netdev *netdev)
962 if (is_tap_netdev(netdev)) {
963 /* TAP device always accepts packets.*/
964 poll_immediate_wake();
968 /* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
969 * otherwise a positive errno value. */
971 netdev_linux_set_etheraddr(struct netdev *netdev_,
972 const uint8_t mac[ETH_ADDR_LEN])
974 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
975 struct netdev_saved_flags *sf = NULL;
978 if (netdev->cache_valid & VALID_ETHERADDR) {
979 if (netdev->ether_addr_error) {
980 return netdev->ether_addr_error;
982 if (eth_addr_equals(netdev->etheraddr, mac)) {
985 netdev->cache_valid &= ~VALID_ETHERADDR;
988 /* Tap devices must be brought down before setting the address. */
989 if (is_tap_netdev(netdev_)) {
990 netdev_turn_flags_off(netdev_, NETDEV_UP, &sf);
992 error = set_etheraddr(netdev_get_name(netdev_), mac);
993 if (!error || error == ENODEV) {
994 netdev->ether_addr_error = error;
995 netdev->cache_valid |= VALID_ETHERADDR;
997 memcpy(netdev->etheraddr, mac, ETH_ADDR_LEN);
1001 netdev_restore_flags(sf);
1006 /* Copies 'netdev''s MAC address to 'mac' which is passed as param. */
1008 netdev_linux_get_etheraddr(const struct netdev *netdev_,
1009 uint8_t mac[ETH_ADDR_LEN])
1011 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1013 if (!(netdev->cache_valid & VALID_ETHERADDR)) {
1014 int error = get_etheraddr(netdev_get_name(netdev_),
1017 netdev->ether_addr_error = error;
1018 netdev->cache_valid |= VALID_ETHERADDR;
1021 if (!netdev->ether_addr_error) {
1022 memcpy(mac, netdev->etheraddr, ETH_ADDR_LEN);
1025 return netdev->ether_addr_error;
1028 /* Returns the maximum size of transmitted (and received) packets on 'netdev',
1029 * in bytes, not including the hardware header; thus, this is typically 1500
1030 * bytes for Ethernet devices. */
1032 netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
1034 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1035 if (!(netdev->cache_valid & VALID_MTU)) {
1039 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
1040 SIOCGIFMTU, "SIOCGIFMTU");
1042 netdev->netdev_mtu_error = error;
1043 netdev->mtu = ifr.ifr_mtu;
1044 netdev->cache_valid |= VALID_MTU;
1047 if (!netdev->netdev_mtu_error) {
1048 *mtup = netdev->mtu;
1050 return netdev->netdev_mtu_error;
1053 /* Sets the maximum transmission unit (MTU) of the given device using the Linux
1054 * networking ioctl interface.
1057 netdev_linux_set_mtu(const struct netdev *netdev_, int mtu)
1059 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1063 if (netdev->cache_valid & VALID_MTU) {
1064 if (netdev->netdev_mtu_error) {
1065 return netdev->netdev_mtu_error;
1067 if (netdev->mtu == mtu) {
1070 netdev->cache_valid &= ~VALID_MTU;
1073 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
1074 SIOCSIFMTU, "SIOCSIFMTU");
1075 if (!error || error == ENODEV) {
1076 netdev->netdev_mtu_error = error;
1077 netdev->mtu = ifr.ifr_mtu;
1078 netdev->cache_valid |= VALID_MTU;
1083 /* Returns the ifindex of 'netdev', if successful, as a positive number.
1084 * On failure, returns a negative errno value. */
1086 netdev_linux_get_ifindex(const struct netdev *netdev)
1090 error = get_ifindex(netdev, &ifindex);
1091 return error ? -error : ifindex;
1095 netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
1097 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1099 if (netdev->miimon_interval > 0) {
1100 *carrier = netdev->miimon;
1102 *carrier = (netdev->ifi_flags & IFF_RUNNING) != 0;
1108 static long long int
1109 netdev_linux_get_carrier_resets(const struct netdev *netdev)
1111 return netdev_linux_cast(netdev)->carrier_resets;
1115 netdev_linux_do_miimon(const char *name, int cmd, const char *cmd_name,
1116 struct mii_ioctl_data *data)
1121 memset(&ifr, 0, sizeof ifr);
1122 memcpy(&ifr.ifr_data, data, sizeof *data);
1123 error = netdev_linux_do_ioctl(name, &ifr, cmd, cmd_name);
1124 memcpy(data, &ifr.ifr_data, sizeof *data);
1130 netdev_linux_get_miimon(const char *name, bool *miimon)
1132 struct mii_ioctl_data data;
1137 memset(&data, 0, sizeof data);
1138 error = netdev_linux_do_miimon(name, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
1140 /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
1141 data.reg_num = MII_BMSR;
1142 error = netdev_linux_do_miimon(name, SIOCGMIIREG, "SIOCGMIIREG",
1146 *miimon = !!(data.val_out & BMSR_LSTATUS);
1148 VLOG_WARN_RL(&rl, "%s: failed to query MII", name);
1151 struct ethtool_cmd ecmd;
1153 VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
1156 COVERAGE_INC(netdev_get_ethtool);
1157 memset(&ecmd, 0, sizeof ecmd);
1158 error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
1161 struct ethtool_value eval;
1163 memcpy(&eval, &ecmd, sizeof eval);
1164 *miimon = !!eval.data;
1166 VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
1174 netdev_linux_set_miimon_interval(struct netdev *netdev_,
1175 long long int interval)
1177 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1179 interval = interval > 0 ? MAX(interval, 100) : 0;
1180 if (netdev->miimon_interval != interval) {
1181 netdev->miimon_interval = interval;
1182 timer_set_expired(&netdev->miimon_timer);
/* Polls MII link status for the devices of 'netdev_linux_class'.
 *
 * The devices are collected into a temporary shash.  Each one is first tested
 * for a non-positive 'miimon_interval' or an unexpired 'miimon_timer'
 * (presumably such devices are skipped).  Otherwise the link is queried with
 * netdev_linux_get_miimon(); a changed result is stored in 'dev->miimon' and
 * reported through netdev_linux_changed().  The timer is then set for another
 * 'miimon_interval'. */
1189 netdev_linux_miimon_run(void)
1191 struct shash device_shash;
1192 struct shash_node *node;
1194 shash_init(&device_shash);
1195 netdev_get_devices(&netdev_linux_class, &device_shash);
1196 SHASH_FOR_EACH (node, &device_shash) {
1197 struct netdev_linux *dev = node->data;
1200 if (dev->miimon_interval <= 0 || !timer_expired(&dev->miimon_timer)) {
1204 netdev_linux_get_miimon(dev->up.name, &miimon);
1205 if (miimon != dev->miimon) {
1206 dev->miimon = miimon;
1207 netdev_linux_changed(dev, dev->ifi_flags, 0);
1210 timer_set_duration(&dev->miimon_timer, dev->miimon_interval);
1213 shash_destroy(&device_shash);
/* Registers a wait on 'miimon_timer' (via timer_wait()) for every device of
 * 'netdev_linux_class' whose 'miimon_interval' is positive.  The devices are
 * gathered into a temporary shash that is destroyed before returning. */
1217 netdev_linux_miimon_wait(void)
1219 struct shash device_shash;
1220 struct shash_node *node;
1222 shash_init(&device_shash);
1223 netdev_get_devices(&netdev_linux_class, &device_shash);
1224 SHASH_FOR_EACH (node, &device_shash) {
1225 struct netdev_linux *dev = node->data;
1227 if (dev->miimon_interval > 0) {
1228 timer_wait(&dev->miimon_timer);
1231 shash_destroy(&device_shash);
1234 /* Check whether we can use RTM_GETLINK to get network device statistics.
1235 * In pre-2.6.19 kernels, this was only available if wireless extensions were
1238 check_for_working_netlink_stats(void)
1240 /* Decide on the netdev_get_stats() implementation to use. Netlink is
1241 * preferable, so if that works, we'll use it. */
/* The probe uses the loopback device "lo": its ifindex is looked up first,
 * and a trial get_stats_via_netlink() call is made with it.  Each outcome is
 * logged, naming proc as the alternative source when netlink is unusable. */
1242 int ifindex = do_get_ifindex("lo");
1244 VLOG_WARN("failed to get ifindex for lo, "
1245 "obtaining netdev stats from proc");
1248 struct netdev_stats stats;
1249 int error = get_stats_via_netlink(ifindex, &stats);
1251 VLOG_DBG("obtaining netdev stats via rtnetlink");
1254 VLOG_INFO("RTM_GETLINK failed (%s), obtaining netdev stats "
1255 "via proc (you are probably running a pre-2.6.19 "
1256 "kernel)", ovs_strerror(error));
/* Presumably exchanges the values of '*a' and '*b' (the name and the rx/tx
 * call sites in netdev_tap_get_stats() suggest so) -- TODO confirm. */
1263 swap_uint64(uint64_t *a, uint64_t *b)
1270 /* Copies 'src' into 'dst', performing format conversion in the process.
1272 * 'src' is allowed to be misaligned. */
1274 netdev_stats_from_ovs_vport_stats(struct netdev_stats *dst,
1275 const struct ovs_vport_stats *src)
/* Packet, byte, error and drop counters are read with get_unaligned_u64()
 * because 'src' may be misaligned. */
1277 dst->rx_packets = get_unaligned_u64(&src->rx_packets);
1278 dst->tx_packets = get_unaligned_u64(&src->tx_packets);
1279 dst->rx_bytes = get_unaligned_u64(&src->rx_bytes);
1280 dst->tx_bytes = get_unaligned_u64(&src->tx_bytes);
1281 dst->rx_errors = get_unaligned_u64(&src->rx_errors);
1282 dst->tx_errors = get_unaligned_u64(&src->tx_errors);
1283 dst->rx_dropped = get_unaligned_u64(&src->rx_dropped);
1284 dst->tx_dropped = get_unaligned_u64(&src->tx_dropped);
/* The remaining detailed counters are not read from 'src'; they are reset to
 * zero. */
1286 dst->collisions = 0;
1287 dst->rx_length_errors = 0;
1288 dst->rx_over_errors = 0;
1289 dst->rx_crc_errors = 0;
1290 dst->rx_frame_errors = 0;
1291 dst->rx_fifo_errors = 0;
1292 dst->rx_missed_errors = 0;
1293 dst->tx_aborted_errors = 0;
1294 dst->tx_carrier_errors = 0;
1295 dst->tx_fifo_errors = 0;
1296 dst->tx_heartbeat_errors = 0;
1297 dst->tx_window_errors = 0;
/* Looks up the vport named after 'netdev' with dpif_linux_vport_get() and
 * converts the reply's stats into '*stats' with
 * netdev_stats_from_ovs_vport_stats().  A reply whose 'stats' pointer is null
 * is handled as a separate case before the conversion. */
1301 get_stats_via_vport__(const struct netdev *netdev, struct netdev_stats *stats)
1303 struct dpif_linux_vport reply;
1307 error = dpif_linux_vport_get(netdev_get_name(netdev), &reply, &buf);
1310 } else if (!reply.stats) {
1315 netdev_stats_from_ovs_vport_stats(stats, reply.stats);
/* Refreshes '*stats' from the vport layer and caches the outcome.
 *
 * The query is attempted when no vport error is recorded or when the
 * VALID_VPORT_STAT_ERROR bit is clear in 'cache_valid'.  Errors other than
 * ENOENT are logged (rate-limited).  The result is stored in
 * 'vport_stats_error' and the cache bit is set. */
1323 get_stats_via_vport(const struct netdev *netdev_,
1324 struct netdev_stats *stats)
1326 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1328 if (!netdev->vport_stats_error ||
1329 !(netdev->cache_valid & VALID_VPORT_STAT_ERROR)) {
1332 error = get_stats_via_vport__(netdev_, stats);
1333 if (error && error != ENOENT) {
1334 VLOG_WARN_RL(&rl, "%s: obtaining netdev stats via vport failed "
1336 netdev_get_name(netdev_), ovs_strerror(error));
1338 netdev->vport_stats_error = error;
1339 netdev->cache_valid |= VALID_VPORT_STAT_ERROR;
/* Obtains '*stats' for 'netdev_' from the kernel's device counters.
 *
 * check_for_working_netlink_stats() is run a single time under an
 * ovsthread_once guard and its answer is kept in a static.  With netlink, the
 * device's ifindex is resolved and get_stats_via_netlink() is used; the other
 * visible source is get_stats_via_proc(), keyed by device name.  A failure is
 * logged (rate-limited) with the numeric error. */
1344 netdev_linux_sys_get_stats(const struct netdev *netdev_,
1345 struct netdev_stats *stats)
1347 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
1348 static int use_netlink_stats;
1351 if (ovsthread_once_start(&once)) {
1352 use_netlink_stats = check_for_working_netlink_stats();
1353 ovsthread_once_done(&once);
1356 if (use_netlink_stats) {
1359 error = get_ifindex(netdev_, &ifindex);
1361 error = get_stats_via_netlink(ifindex, stats);
1364 error = get_stats_via_proc(netdev_get_name(netdev_), stats);
1368 VLOG_WARN_RL(&rl, "%s: linux-sys get stats failed %d",
1369 netdev_get_name(netdev_), error);
1375 /* Retrieves current device stats for 'netdev-linux'. */
/* Two sources are consulted: vport counters go straight into '*stats' via
 * get_stats_via_vport(), and kernel counters go into a local 'dev_stats' via
 * netdev_linux_sys_get_stats().  'vport_stats_error' then decides how they are
 * combined: per the comment below, the kernel counters are used when vport
 * stats are unavailable.  The trailing lines add the kernel's error and detail
 * counters onto '*stats' (presumably in the case where vport stats were
 * obtained -- the branch structure is worth confirming). */
1377 netdev_linux_get_stats(const struct netdev *netdev_,
1378 struct netdev_stats *stats)
1380 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1381 struct netdev_stats dev_stats;
1384 get_stats_via_vport(netdev_, stats);
1386 error = netdev_linux_sys_get_stats(netdev_, &dev_stats);
1389 if (netdev->vport_stats_error) {
1396 if (netdev->vport_stats_error) {
1397 /* stats not available from OVS then use ioctl stats. */
1400 stats->rx_errors += dev_stats.rx_errors;
1401 stats->tx_errors += dev_stats.tx_errors;
1402 stats->rx_dropped += dev_stats.rx_dropped;
1403 stats->tx_dropped += dev_stats.tx_dropped;
1404 stats->multicast += dev_stats.multicast;
1405 stats->collisions += dev_stats.collisions;
1406 stats->rx_length_errors += dev_stats.rx_length_errors;
1407 stats->rx_over_errors += dev_stats.rx_over_errors;
1408 stats->rx_crc_errors += dev_stats.rx_crc_errors;
1409 stats->rx_frame_errors += dev_stats.rx_frame_errors;
1410 stats->rx_fifo_errors += dev_stats.rx_fifo_errors;
1411 stats->rx_missed_errors += dev_stats.rx_missed_errors;
1412 stats->tx_aborted_errors += dev_stats.tx_aborted_errors;
1413 stats->tx_carrier_errors += dev_stats.tx_carrier_errors;
1414 stats->tx_fifo_errors += dev_stats.tx_fifo_errors;
1415 stats->tx_heartbeat_errors += dev_stats.tx_heartbeat_errors;
1416 stats->tx_window_errors += dev_stats.tx_window_errors;
1421 /* Retrieves current device stats for 'netdev-tap' netdev or
1422 * netdev-internal. */
/* Vport counters are fetched into '*stats' with get_stats_via_vport() and
 * kernel counters into a local 'dev_stats' with netdev_linux_sys_get_stats().
 *
 * Where 'vport_stats_error' is set, the visible code exchanges the rx and tx
 * packet, byte, error and drop counters with swap_uint64() and clears the
 * detailed rx and tx error counters.  The final lines add the kernel's drop
 * and error counters crosswise (kernel tx onto rx and kernel rx onto tx),
 * plus multicast and collisions. */
1424 netdev_tap_get_stats(const struct netdev *netdev_,
1425 struct netdev_stats *stats)
1427 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1428 struct netdev_stats dev_stats;
1431 get_stats_via_vport(netdev_, stats);
1433 error = netdev_linux_sys_get_stats(netdev_, &dev_stats);
1435 if (netdev->vport_stats_error) {
1442 /* If this port is an internal port then the transmit and receive stats
1443 * will appear to be swapped relative to the other ports since we are the
1444 * one sending the data, not a remote computer. For consistency, we swap
1445 * them back here. This does not apply if we are getting stats from the
1446 * vport layer because it always tracks stats from the perspective of the
1448 if (netdev->vport_stats_error) {
1450 swap_uint64(&stats->rx_packets, &stats->tx_packets);
1451 swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
1452 swap_uint64(&stats->rx_errors, &stats->tx_errors);
1453 swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
1454 stats->rx_length_errors = 0;
1455 stats->rx_over_errors = 0;
1456 stats->rx_crc_errors = 0;
1457 stats->rx_frame_errors = 0;
1458 stats->rx_fifo_errors = 0;
1459 stats->rx_missed_errors = 0;
1460 stats->tx_aborted_errors = 0;
1461 stats->tx_carrier_errors = 0;
1462 stats->tx_fifo_errors = 0;
1463 stats->tx_heartbeat_errors = 0;
1464 stats->tx_window_errors = 0;
1466 stats->rx_dropped += dev_stats.tx_dropped;
1467 stats->tx_dropped += dev_stats.rx_dropped;
1469 stats->rx_errors += dev_stats.tx_errors;
1470 stats->tx_errors += dev_stats.rx_errors;
1472 stats->multicast += dev_stats.multicast;
1473 stats->collisions += dev_stats.collisions;
/* Retrieves stats for an internal device from the vport layer only.
 *
 * Calls get_stats_via_vport() to fill '*stats' and returns the error that call
 * recorded in 'vport_stats_error'. */
1479 netdev_internal_get_stats(const struct netdev *netdev_,
1480 struct netdev_stats *stats)
1482 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1484 get_stats_via_vport(netdev_, stats);
1485 return netdev->vport_stats_error;
/* Pushes the packet, byte, error and drop counters from '*stats' to the vport
 * named after 'netdev'.
 *
 * The eight counters are copied into an ovs_vport_stats, which is attached to
 * an OVS_VPORT_CMD_SET request and sent with dpif_linux_vport_transact().  An
 * ENODEV result is special-cased, for the reason given in the comment
 * below. */
1489 netdev_internal_set_stats(struct netdev *netdev,
1490 const struct netdev_stats *stats)
1492 struct ovs_vport_stats vport_stats;
1493 struct dpif_linux_vport vport;
1496 vport_stats.rx_packets = stats->rx_packets;
1497 vport_stats.tx_packets = stats->tx_packets;
1498 vport_stats.rx_bytes = stats->rx_bytes;
1499 vport_stats.tx_bytes = stats->tx_bytes;
1500 vport_stats.rx_errors = stats->rx_errors;
1501 vport_stats.tx_errors = stats->tx_errors;
1502 vport_stats.rx_dropped = stats->rx_dropped;
1503 vport_stats.tx_dropped = stats->tx_dropped;
1505 dpif_linux_vport_init(&vport);
1506 vport.cmd = OVS_VPORT_CMD_SET;
1507 vport.name = netdev_get_name(netdev);
1508 vport.stats = &vport_stats;
1510 err = dpif_linux_vport_transact(&vport, NULL, NULL);
1512 /* If the vport layer doesn't know about the device, that doesn't mean it
1513 * doesn't exist (after all we were able to open it when netdev_open() was
1514 * called), it just means that it isn't attached and we'll be getting
1515 * stats a different way. */
1516 if (err == ENODEV) {
/* Fills the cached feature bitmaps of 'netdev' ('supported', 'advertised',
 * 'current' and 'peer') from an ETHTOOL_GSET query.
 *
 * The VALID_FEATURES bit of 'cache_valid' is tested first, so a cached result
 * can be reused.  Each SUPPORTED_* and ADVERTISED_* bit reported by ethtool is
 * translated to the matching NETDEV_F_* bit.  At the end VALID_FEATURES is set
 * and the ethtool result is stored in 'get_features_error'. */
1524 netdev_linux_read_features(struct netdev_linux *netdev)
1526 struct ethtool_cmd ecmd;
1530 if (netdev->cache_valid & VALID_FEATURES) {
1534 COVERAGE_INC(netdev_get_ethtool);
1535 memset(&ecmd, 0, sizeof ecmd);
1536 error = netdev_linux_do_ethtool(netdev->up.name, &ecmd,
1537 ETHTOOL_GSET, "ETHTOOL_GSET");
1542 /* Supported features. */
1543 netdev->supported = 0;
1544 if (ecmd.supported & SUPPORTED_10baseT_Half) {
1545 netdev->supported |= NETDEV_F_10MB_HD;
1547 if (ecmd.supported & SUPPORTED_10baseT_Full) {
1548 netdev->supported |= NETDEV_F_10MB_FD;
1550 if (ecmd.supported & SUPPORTED_100baseT_Half) {
1551 netdev->supported |= NETDEV_F_100MB_HD;
1553 if (ecmd.supported & SUPPORTED_100baseT_Full) {
1554 netdev->supported |= NETDEV_F_100MB_FD;
1556 if (ecmd.supported & SUPPORTED_1000baseT_Half) {
1557 netdev->supported |= NETDEV_F_1GB_HD;
1559 if (ecmd.supported & SUPPORTED_1000baseT_Full) {
1560 netdev->supported |= NETDEV_F_1GB_FD;
1562 if (ecmd.supported & SUPPORTED_10000baseT_Full) {
1563 netdev->supported |= NETDEV_F_10GB_FD;
1565 if (ecmd.supported & SUPPORTED_TP) {
1566 netdev->supported |= NETDEV_F_COPPER;
1568 if (ecmd.supported & SUPPORTED_FIBRE) {
1569 netdev->supported |= NETDEV_F_FIBER;
1571 if (ecmd.supported & SUPPORTED_Autoneg) {
1572 netdev->supported |= NETDEV_F_AUTONEG;
1574 if (ecmd.supported & SUPPORTED_Pause) {
1575 netdev->supported |= NETDEV_F_PAUSE;
1577 if (ecmd.supported & SUPPORTED_Asym_Pause) {
1578 netdev->supported |= NETDEV_F_PAUSE_ASYM;
1581 /* Advertised features. */
1582 netdev->advertised = 0;
1583 if (ecmd.advertising & ADVERTISED_10baseT_Half) {
1584 netdev->advertised |= NETDEV_F_10MB_HD;
1586 if (ecmd.advertising & ADVERTISED_10baseT_Full) {
1587 netdev->advertised |= NETDEV_F_10MB_FD;
1589 if (ecmd.advertising & ADVERTISED_100baseT_Half) {
1590 netdev->advertised |= NETDEV_F_100MB_HD;
1592 if (ecmd.advertising & ADVERTISED_100baseT_Full) {
1593 netdev->advertised |= NETDEV_F_100MB_FD;
1595 if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
1596 netdev->advertised |= NETDEV_F_1GB_HD;
1598 if (ecmd.advertising & ADVERTISED_1000baseT_Full) {
1599 netdev->advertised |= NETDEV_F_1GB_FD;
1601 if (ecmd.advertising & ADVERTISED_10000baseT_Full) {
1602 netdev->advertised |= NETDEV_F_10GB_FD;
1604 if (ecmd.advertising & ADVERTISED_TP) {
1605 netdev->advertised |= NETDEV_F_COPPER;
1607 if (ecmd.advertising & ADVERTISED_FIBRE) {
1608 netdev->advertised |= NETDEV_F_FIBER;
1610 if (ecmd.advertising & ADVERTISED_Autoneg) {
1611 netdev->advertised |= NETDEV_F_AUTONEG;
1613 if (ecmd.advertising & ADVERTISED_Pause) {
1614 netdev->advertised |= NETDEV_F_PAUSE;
1616 if (ecmd.advertising & ADVERTISED_Asym_Pause) {
1617 netdev->advertised |= NETDEV_F_PAUSE_ASYM;
1620 /* Current settings. */
/* Up to 1 Gb/s the duplex flag selects between the _FD and _HD bit; from
 * 10 Gb/s upward only full duplex is reported.  The three highest speeds
 * are compared as literal numbers rather than SPEED_* constants, and an
 * unrecognized speed yields 0. */
1622 if (speed == SPEED_10) {
1623 netdev->current = ecmd.duplex ? NETDEV_F_10MB_FD : NETDEV_F_10MB_HD;
1624 } else if (speed == SPEED_100) {
1625 netdev->current = ecmd.duplex ? NETDEV_F_100MB_FD : NETDEV_F_100MB_HD;
1626 } else if (speed == SPEED_1000) {
1627 netdev->current = ecmd.duplex ? NETDEV_F_1GB_FD : NETDEV_F_1GB_HD;
1628 } else if (speed == SPEED_10000) {
1629 netdev->current = NETDEV_F_10GB_FD;
1630 } else if (speed == 40000) {
1631 netdev->current = NETDEV_F_40GB_FD;
1632 } else if (speed == 100000) {
1633 netdev->current = NETDEV_F_100GB_FD;
1634 } else if (speed == 1000000) {
1635 netdev->current = NETDEV_F_1TB_FD;
1637 netdev->current = 0;
1640 if (ecmd.port == PORT_TP) {
1641 netdev->current |= NETDEV_F_COPPER;
1642 } else if (ecmd.port == PORT_FIBRE) {
1643 netdev->current |= NETDEV_F_FIBER;
1647 netdev->current |= NETDEV_F_AUTONEG;
1650 /* Peer advertisements. */
1651 netdev->peer = 0; /* XXX */
1654 netdev->cache_valid |= VALID_FEATURES;
1655 netdev->get_features_error = error;
/* NOTE(review): the body below writes through 'current', 'advertised',
 * 'supported' and 'peer' unconditionally whenever no error is cached, so all
 * four must be non-null here despite the "that are non-null" wording of the
 * description that follows -- confirm that every caller passes valid
 * pointers.  The values and the returned error come from the cache filled by
 * netdev_linux_read_features(). */
1658 /* Stores the features supported by 'netdev' into each of '*current',
1659 * '*advertised', '*supported', and '*peer' that are non-null. Each value is a
1660 * bitmap of NETDEV_* bits. Returns 0 if successful, otherwise a positive
1663 netdev_linux_get_features(const struct netdev *netdev_,
1664 enum netdev_features *current,
1665 enum netdev_features *advertised,
1666 enum netdev_features *supported,
1667 enum netdev_features *peer)
1669 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1671 netdev_linux_read_features(netdev);
1673 if (!netdev->get_features_error) {
1674 *current = netdev->current;
1675 *advertised = netdev->advertised;
1676 *supported = netdev->supported;
1677 *peer = netdev->peer;
1679 return netdev->get_features_error;
1682 /* Set the features advertised by 'netdev' to 'advertise'. */
1684 netdev_linux_set_advertisements(struct netdev *netdev,
1685 enum netdev_features advertise)
1687 struct ethtool_cmd ecmd;
/* Read the current settings first: the same 'ecmd' is written back with
 * ETHTOOL_SSET below after only its 'advertising' field has been rebuilt. */
1690 COVERAGE_INC(netdev_get_ethtool);
1691 memset(&ecmd, 0, sizeof ecmd);
1692 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1693 ETHTOOL_GSET, "ETHTOOL_GSET");
/* Translate each requested NETDEV_F_* bit to its ADVERTISED_* bit. */
1698 ecmd.advertising = 0;
1699 if (advertise & NETDEV_F_10MB_HD) {
1700 ecmd.advertising |= ADVERTISED_10baseT_Half;
1702 if (advertise & NETDEV_F_10MB_FD) {
1703 ecmd.advertising |= ADVERTISED_10baseT_Full;
1705 if (advertise & NETDEV_F_100MB_HD) {
1706 ecmd.advertising |= ADVERTISED_100baseT_Half;
1708 if (advertise & NETDEV_F_100MB_FD) {
1709 ecmd.advertising |= ADVERTISED_100baseT_Full;
1711 if (advertise & NETDEV_F_1GB_HD) {
1712 ecmd.advertising |= ADVERTISED_1000baseT_Half;
1714 if (advertise & NETDEV_F_1GB_FD) {
1715 ecmd.advertising |= ADVERTISED_1000baseT_Full;
1717 if (advertise & NETDEV_F_10GB_FD) {
1718 ecmd.advertising |= ADVERTISED_10000baseT_Full;
1720 if (advertise & NETDEV_F_COPPER) {
1721 ecmd.advertising |= ADVERTISED_TP;
1723 if (advertise & NETDEV_F_FIBER) {
1724 ecmd.advertising |= ADVERTISED_FIBRE;
1726 if (advertise & NETDEV_F_AUTONEG) {
1727 ecmd.advertising |= ADVERTISED_Autoneg;
1729 if (advertise & NETDEV_F_PAUSE) {
1730 ecmd.advertising |= ADVERTISED_Pause;
1732 if (advertise & NETDEV_F_PAUSE_ASYM) {
1733 ecmd.advertising |= ADVERTISED_Asym_Pause;
1735 COVERAGE_INC(netdev_set_ethtool);
1736 return netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1737 ETHTOOL_SSET, "ETHTOOL_SSET");
1740 /* Attempts to set input rate limiting (policing) policy. Returns 0 if
1741 * successful, otherwise a positive errno value. */
1743 netdev_linux_set_policing(struct netdev *netdev_,
1744 uint32_t kbits_rate, uint32_t kbits_burst)
1746 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1747 const char *netdev_name = netdev_get_name(netdev_);
1751 kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
1752 : !kbits_burst ? 1000 /* Default to 1000 kbits if 0. */
1753 : kbits_burst); /* Stick with user-specified value. */
/* With a valid cache, a previously recorded error is returned as is, and an
 * unchanged rate/burst pair needs no work.  Otherwise the cache bit is
 * cleared before the settings are reapplied. */
1755 if (netdev->cache_valid & VALID_POLICING) {
1756 if (netdev->netdev_policing_error) {
1757 return netdev->netdev_policing_error;
1760 if (netdev->kbits_rate == kbits_rate &&
1761 netdev->kbits_burst == kbits_burst) {
1762 /* Assume that settings haven't changed since we last set them. */
1765 netdev->cache_valid &= ~VALID_POLICING;
1768 COVERAGE_INC(netdev_set_policing);
1769 /* Remove any existing ingress qdisc. */
1770 error = tc_add_del_ingress_qdisc(netdev_, false);
1772 VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
1773 netdev_name, ovs_strerror(error));
/* Re-create the ingress qdisc and attach the policer to it. */
1778 error = tc_add_del_ingress_qdisc(netdev_, true);
1780 VLOG_WARN_RL(&rl, "%s: adding policing qdisc failed: %s",
1781 netdev_name, ovs_strerror(error));
1785 error = tc_add_policer(netdev_, kbits_rate, kbits_burst);
1787 VLOG_WARN_RL(&rl, "%s: adding policing action failed: %s",
1788 netdev_name, ovs_strerror(error));
1793 netdev->kbits_rate = kbits_rate;
1794 netdev->kbits_burst = kbits_burst;
/* Only success and ENODEV are cached; other errors leave the cache
 * invalid. */
1797 if (!error || error == ENODEV) {
1798 netdev->netdev_policing_error = error;
1799 netdev->cache_valid |= VALID_POLICING;
/* Adds to 'types' the OVS name of every entry in the null-terminated 'tcs'
 * table that has a 'tc_install' callback and a non-empty 'ovs_name'. */
1805 netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
1808 const struct tc_ops *const *opsp;
1810 for (opsp = tcs; *opsp != NULL; opsp++) {
1811 const struct tc_ops *ops = *opsp;
1812 if (ops->tc_install && ops->ovs_name[0] != '\0') {
1813 sset_add(types, ops->ovs_name);
/* Searches the null-terminated 'tcs' table for the entry whose 'ovs_name'
 * equals 'name' (exact strcmp() match). */
1819 static const struct tc_ops *
1820 tc_lookup_ovs_name(const char *name)
1822 const struct tc_ops *const *opsp;
1824 for (opsp = tcs; *opsp != NULL; opsp++) {
1825 const struct tc_ops *ops = *opsp;
1826 if (!strcmp(name, ops->ovs_name)) {
/* Searches the null-terminated 'tcs' table for the entry whose 'linux_name'
 * equals 'name'.  Entries with a null 'linux_name' never match. */
1833 static const struct tc_ops *
1834 tc_lookup_linux_name(const char *name)
1836 const struct tc_ops *const *opsp;
1838 for (opsp = tcs; *opsp != NULL; opsp++) {
1839 const struct tc_ops *ops = *opsp;
1840 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
/* Searches the hash bucket for 'hash' in 'netdev_''s 'tc->queues' map for the
 * queue whose 'queue_id' equals 'queue_id'. */
1847 static struct tc_queue *
1848 tc_find_queue__(const struct netdev *netdev_, unsigned int queue_id,
1851 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1852 struct tc_queue *queue;
1854 HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev->tc->queues) {
1855 if (queue->queue_id == queue_id) {
/* Convenience wrapper around tc_find_queue__() that computes the hash as
 * hash_int(queue_id, 0). */
1862 static struct tc_queue *
1863 tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
1865 return tc_find_queue__(netdev, queue_id, hash_int(queue_id, 0));
/* Reports the capabilities of QoS type 'type': the tc_ops entry is looked up
 * by OVS name and its 'n_queues' is stored in 'caps->n_queues'. */
1869 netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
1871 struct netdev_qos_capabilities *caps)
1873 const struct tc_ops *ops = tc_lookup_ovs_name(type);
1877 caps->n_queues = ops->n_queues;
/* Reports the QoS configuration of 'netdev_'.
 *
 * After tc_query_qdisc(), '*typep' is set to the OVS name of the device's
 * current tc_ops, and 'details' is filled by that type's 'qdisc_get' callback
 * when it has one. */
1882 netdev_linux_get_qos(const struct netdev *netdev_,
1883 const char **typep, struct smap *details)
1885 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1888 error = tc_query_qdisc(netdev_);
1893 *typep = netdev->tc->ops->ovs_name;
1894 return (netdev->tc->ops->qdisc_get
1895 ? netdev->tc->ops->qdisc_get(netdev_, details)
/* Configures QoS type 'type' with 'details' on 'netdev_'.
 *
 * 'type' must name a tc_ops entry that has a 'tc_install' callback.  If the
 * device already uses that type, only its 'qdisc_set' callback (if any) is
 * invoked.  Otherwise the existing qdisc is deleted and the new type is
 * installed; the assertions check that 'netdev->tc' is null after deletion and
 * non-null exactly when installation succeeded. */
1900 netdev_linux_set_qos(struct netdev *netdev_,
1901 const char *type, const struct smap *details)
1903 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1904 const struct tc_ops *new_ops;
1907 new_ops = tc_lookup_ovs_name(type);
1908 if (!new_ops || !new_ops->tc_install) {
1912 error = tc_query_qdisc(netdev_);
1917 if (new_ops == netdev->tc->ops) {
1918 return new_ops->qdisc_set ? new_ops->qdisc_set(netdev_, details) : 0;
1920 /* Delete existing qdisc. */
1921 error = tc_del_qdisc(netdev_);
1925 ovs_assert(netdev->tc == NULL);
1927 /* Install new qdisc. */
1928 error = new_ops->tc_install(netdev_, details);
1929 ovs_assert((error == 0) == (netdev->tc != NULL));
/* Retrieves the configuration of queue 'queue_id' on 'netdev_' into
 * 'details': after tc_query_qdisc(), the queue is looked up with
 * tc_find_queue() and passed to the current type's 'class_get' callback. */
1936 netdev_linux_get_queue(const struct netdev *netdev_,
1937 unsigned int queue_id, struct smap *details)
1939 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1942 error = tc_query_qdisc(netdev_);
1946 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
1948 ? netdev->tc->ops->class_get(netdev_, queue, details)
/* Configures queue 'queue_id' on 'netdev_' with 'details' through the current
 * type's 'class_set' callback.  A 'queue_id' at or beyond the type's
 * 'n_queues', or a type without 'class_set', is rejected before the call. */
1954 netdev_linux_set_queue(struct netdev *netdev_,
1955 unsigned int queue_id, const struct smap *details)
1957 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1960 error = tc_query_qdisc(netdev_);
1963 } else if (queue_id >= netdev->tc->ops->n_queues
1964 || !netdev->tc->ops->class_set) {
1968 return netdev->tc->ops->class_set(netdev_, queue_id, details);
/* Deletes queue 'queue_id' from 'netdev_': after tc_query_qdisc(), and
 * provided the current type has a 'class_delete' callback, the queue is looked
 * up with tc_find_queue() and handed to that callback. */
1972 netdev_linux_delete_queue(struct netdev *netdev_, unsigned int queue_id)
1974 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1977 error = tc_query_qdisc(netdev_);
1980 } else if (!netdev->tc->ops->class_delete) {
1983 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
1985 ? netdev->tc->ops->class_delete(netdev_, queue)
/* Retrieves statistics for queue 'queue_id' on 'netdev_' into '*stats'.
 *
 * Requires the current type to provide 'class_get_stats'.  'stats->created' is
 * taken from the queue record itself; the remaining fields are left to the
 * callback. */
1991 netdev_linux_get_queue_stats(const struct netdev *netdev_,
1992 unsigned int queue_id,
1993 struct netdev_queue_stats *stats)
1995 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1998 error = tc_query_qdisc(netdev_);
2001 } else if (!netdev->tc->ops->class_get_stats) {
2004 const struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2008 stats->created = queue->created;
2009 return netdev->tc->ops->class_get_stats(netdev_, queue, stats);
/* Begins a Netlink dump of the traffic classes on 'netdev': builds an
 * RTM_GETTCLASS request with 'tcm_parent' 0, starts 'dump' with it on
 * NETLINK_ROUTE, and releases the request buffer. */
2014 start_queue_dump(const struct netdev *netdev, struct nl_dump *dump)
2016 struct ofpbuf request;
2017 struct tcmsg *tcmsg;
2019 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
2023 tcmsg->tcm_parent = 0;
2024 nl_dump_start(dump, NETLINK_ROUTE, &request);
2025 ofpbuf_uninit(&request);
/* Invokes 'cb' with the id and configuration of the queues on 'netdev_'.
 *
 * Requires the current type to provide 'class_get'.  The queues in
 * 'netdev->tc->queues' are walked with the _SAFE iterator; for each one a
 * reused smap is cleared, filled by 'class_get', and passed to
 * '(*cb)(queue_id, &details, aux)'. */
2030 netdev_linux_dump_queues(const struct netdev *netdev_,
2031 netdev_dump_queues_cb *cb, void *aux)
2033 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2034 struct tc_queue *queue, *next_queue;
2035 struct smap details;
2039 error = tc_query_qdisc(netdev_);
2042 } else if (!netdev->tc->ops->class_get) {
2047 smap_init(&details);
2048 HMAP_FOR_EACH_SAFE (queue, next_queue, hmap_node,
2049 &netdev->tc->queues) {
2050 smap_clear(&details);
2052 error = netdev->tc->ops->class_get(netdev_, queue, &details);
2054 (*cb)(queue->queue_id, &details, aux);
2059 smap_destroy(&details);
/* Reports per-queue statistics for 'netdev_' through 'cb'.
 *
 * Requires the current type to provide 'class_dump_stats'.  A class dump is
 * started with start_queue_dump() and every message it yields is handed to
 * 'class_dump_stats' together with 'cb' and 'aux'.  The error from finishing
 * the dump takes precedence over 'last_error' in the return value. */
2065 netdev_linux_dump_queue_stats(const struct netdev *netdev_,
2066 netdev_dump_queue_stats_cb *cb, void *aux)
2068 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2069 struct nl_dump dump;
2074 error = tc_query_qdisc(netdev_);
2077 } else if (!netdev->tc->ops->class_dump_stats) {
2082 if (!start_queue_dump(netdev_, &dump)) {
2085 while (nl_dump_next(&dump, &msg)) {
2086 error = netdev->tc->ops->class_dump_stats(netdev_, &msg, cb, aux);
2092 error = nl_dump_done(&dump);
2093 return error ? error : last_error;
/* Stores the IPv4 address and netmask of 'netdev_' in '*address' and
 * '*netmask'.
 *
 * Unless VALID_IN4 is set in 'cache_valid', both values are fetched with
 * SIOCGIFADDR and SIOCGIFNETMASK into the device's cache, which is then marked
 * valid.  Returns EADDRNOTAVAIL when the address is INADDR_ANY, otherwise 0 on
 * this path. */
2097 netdev_linux_get_in4(const struct netdev *netdev_,
2098 struct in_addr *address, struct in_addr *netmask)
2100 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2102 if (!(netdev->cache_valid & VALID_IN4)) {
2105 error = netdev_linux_get_ipv4(netdev_, &netdev->address,
2106 SIOCGIFADDR, "SIOCGIFADDR");
2111 error = netdev_linux_get_ipv4(netdev_, &netdev->netmask,
2112 SIOCGIFNETMASK, "SIOCGIFNETMASK");
2117 netdev->cache_valid |= VALID_IN4;
2119 *address = netdev->address;
2120 *netmask = netdev->netmask;
2121 return address->s_addr == INADDR_ANY ? EADDRNOTAVAIL : 0;
/* Assigns IPv4 'address' and 'netmask' to 'netdev_'.
 *
 * The address is set with SIOCSIFADDR.  The cache is then updated with both
 * values and marked VALID_IN4, and the netmask is applied with SIOCSIFNETMASK
 * only when 'address' is not INADDR_ANY. */
2125 netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
2126 struct in_addr netmask)
2128 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2131 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
2133 netdev->cache_valid |= VALID_IN4;
2134 netdev->address = address;
2135 netdev->netmask = netmask;
2136 if (address.s_addr != INADDR_ANY) {
2137 error = do_set_addr(netdev_, SIOCSIFNETMASK,
2138 "SIOCSIFNETMASK", netmask);
/* Parses one 'line' of the kind netdev_linux_get_in6() reads from
 * /proc/net/if_inet6.
 *
 * The format consumes sixteen two-digit hex bytes into 'in6->s6_addr', skips
 * four further hex fields, and reads an interface name of at most 16
 * characters into 'ifname' (hence the 16 + 1 byte buffer). */
2145 parse_if_inet6_line(const char *line,
2146 struct in6_addr *in6, char ifname[16 + 1])
2148 uint8_t *s6 = in6->s6_addr;
2149 #define X8 "%2"SCNx8
2151 " "X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8
2152 "%*x %*x %*x %*x %16s\n",
2153 &s6[0], &s6[1], &s6[2], &s6[3],
2154 &s6[4], &s6[5], &s6[6], &s6[7],
2155 &s6[8], &s6[9], &s6[10], &s6[11],
2156 &s6[12], &s6[13], &s6[14], &s6[15],
2160 /* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address (if
2161 * 'in6' is non-null) and returns true. Otherwise, returns false. */
2163 netdev_linux_get_in6(const struct netdev *netdev_, struct in6_addr *in6)
2165 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
/* On a cache miss the cached address starts out as in6addr_any and
 * /proc/net/if_inet6 is scanned line by line for an entry whose interface
 * name matches this device.  The cache is marked VALID_IN6 afterward. */
2166 if (!(netdev->cache_valid & VALID_IN6)) {
2170 netdev->in6 = in6addr_any;
2172 file = fopen("/proc/net/if_inet6", "r");
2174 const char *name = netdev_get_name(netdev_);
2175 while (fgets(line, sizeof line, file)) {
2176 struct in6_addr in6_tmp;
2177 char ifname[16 + 1];
2178 if (parse_if_inet6_line(line, &in6_tmp, ifname)
2179 && !strcmp(name, ifname))
2181 netdev->in6 = in6_tmp;
2187 netdev->cache_valid |= VALID_IN6;
/* Initializes '*sa' as an AF_INET socket address for 'addr'.
 *
 * A zeroed sockaddr_in is filled in locally and then copied over a zeroed
 * '*sa', so the generic sockaddr ends up holding the IPv4 form. */
2194 make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
2196 struct sockaddr_in sin;
2197 memset(&sin, 0, sizeof sin);
2198 sin.sin_family = AF_INET;
2199 sin.sin_addr = addr;
2202 memset(sa, 0, sizeof *sa);
2203 memcpy(sa, &sin, sizeof sin);
/* Applies IPv4 address 'addr' to 'netdev' with the ioctl 'ioctl_nr'.  The
 * ifreq carries the device name and 'addr' as an AF_INET sockaddr, and is
 * submitted through netdev_linux_do_ioctl(). */
2207 do_set_addr(struct netdev *netdev,
2208 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
2211 ovs_strzcpy(ifr.ifr_name, netdev_get_name(netdev), sizeof ifr.ifr_name);
2212 make_in4_sockaddr(&ifr.ifr_addr, addr);
2214 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
2218 /* Adds 'router' as a default IP gateway. */
2220 netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
2222 struct in_addr any = { INADDR_ANY };
/* The route entry has destination and genmask INADDR_ANY, 'router' as the
 * gateway, and flags RTF_UP | RTF_GATEWAY.  It is installed with SIOCADDRT
 * on 'af_inet_sock'; 'netdev' itself is not consulted. */
2226 memset(&rt, 0, sizeof rt);
2227 make_in4_sockaddr(&rt.rt_dst, any);
2228 make_in4_sockaddr(&rt.rt_gateway, router);
2229 make_in4_sockaddr(&rt.rt_genmask, any);
2230 rt.rt_flags = RTF_UP | RTF_GATEWAY;
2231 error = ioctl(af_inet_sock, SIOCADDRT, &rt) < 0 ? errno : 0;
2233 VLOG_WARN("ioctl(SIOCADDRT): %s", ovs_strerror(error));
/* Finds the next hop toward 'host' by scanning /proc/net/route.
 *
 * '*netdev_name' starts out NULL.  Each line is parsed into 11 fields; lines
 * that fail to parse are logged, and routes without RTF_UP are skipped.  For
 * the route whose masked destination matches 'host', '*next_hop' is set to 0
 * (directly reachable) or to the route's gateway, and '*netdev_name' receives
 * an xstrdup()'d copy of the interface name. */
2239 netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
2242 static const char fn[] = "/proc/net/route";
2247 *netdev_name = NULL;
2248 stream = fopen(fn, "r");
2249 if (stream == NULL) {
2250 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, ovs_strerror(errno));
2255 while (fgets(line, sizeof line, stream)) {
2258 ovs_be32 dest, gateway, mask;
2259 int refcnt, metric, mtu;
2260 unsigned int flags, use, window, irtt;
2263 "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
2265 iface, &dest, &gateway, &flags, &refcnt,
2266 &use, &metric, &mask, &mtu, &window, &irtt) != 11) {
2268 VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
2272 if (!(flags & RTF_UP)) {
2273 /* Skip routes that aren't up. */
2277 /* The output of 'dest', 'mask', and 'gateway' were given in
2278 * network byte order, so we don't need any endian
2279 * conversions here. */
2280 if ((dest & mask) == (host->s_addr & mask)) {
2282 /* The host is directly reachable. */
2283 next_hop->s_addr = 0;
2285 /* To reach the host, we must go through a gateway. */
2286 next_hop->s_addr = gateway;
2288 *netdev_name = xstrdup(iface);
/* Adds driver information for 'netdev_' to 'smap' under the keys
 * "driver_name", "driver_version" and "firmware_version".
 *
 * The data comes from 'netdev->drvinfo', which is filled by an
 * ETHTOOL_GDRVINFO query unless VALID_DRVINFO is already set in
 * 'cache_valid'. */
2300 netdev_linux_get_status(const struct netdev *netdev_, struct smap *smap)
2302 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2305 if (!(netdev->cache_valid & VALID_DRVINFO)) {
/* netdev_linux_do_ethtool() takes an ethtool_cmd pointer (see its other
 * callers), so the drvinfo buffer is passed through a cast. */
2306 struct ethtool_cmd *cmd = (struct ethtool_cmd *) &netdev->drvinfo;
2308 COVERAGE_INC(netdev_get_ethtool);
2309 memset(&netdev->drvinfo, 0, sizeof netdev->drvinfo);
2310 error = netdev_linux_do_ethtool(netdev->up.name,
2313 "ETHTOOL_GDRVINFO");
2315 netdev->cache_valid |= VALID_DRVINFO;
2320 smap_add(smap, "driver_name", netdev->drvinfo.driver);
2321 smap_add(smap, "driver_version", netdev->drvinfo.version);
2322 smap_add(smap, "firmware_version", netdev->drvinfo.fw_version);
/* Status for internal devices: adds the fixed pair
 * "driver_name" = "openvswitch" to 'smap'.  'netdev' is unused. */
2328 netdev_internal_get_status(const struct netdev *netdev OVS_UNUSED,
2331 smap_add(smap, "driver_name", "openvswitch");
2335 /* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
2336 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
2337 * returns 0. Otherwise, it returns a positive errno value; in particular,
2338 * ENXIO indicates that there is no ARP table entry for 'ip' on 'netdev'. */
2340 netdev_linux_arp_lookup(const struct netdev *netdev,
2341 ovs_be32 ip, uint8_t mac[ETH_ADDR_LEN])
2344 struct sockaddr_in sin;
/* Build the SIOCGARP request: protocol address 'ip' as an AF_INET sockaddr,
 * hardware type ARPHRD_ETHER, and the device name. */
2347 memset(&r, 0, sizeof r);
2348 memset(&sin, 0, sizeof sin);
2349 sin.sin_family = AF_INET;
2350 sin.sin_addr.s_addr = ip;
2352 memcpy(&r.arp_pa, &sin, sizeof sin);
2353 r.arp_ha.sa_family = ARPHRD_ETHER;
2355 ovs_strzcpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
2356 COVERAGE_INC(netdev_arp_lookup);
2357 retval = ioctl(af_inet_sock, SIOCGARP, &r) < 0 ? errno : 0;
2359 memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
/* ENXIO (no entry) is an expected outcome and is not logged. */
2360 } else if (retval != ENXIO) {
2361 VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
2362 netdev_get_name(netdev), IP_ARGS(ip),
2363 ovs_strerror(retval));
/* Converts the NETDEV_* flag set 'nd' to interface flags.  NETDEV_UP and
 * NETDEV_PROMISC are the bits examined; presumably they map to IFF_UP and
 * IFF_PROMISC, mirroring iff_to_nd_flags() -- TODO confirm. */
2369 nd_to_iff_flags(enum netdev_flags nd)
2372 if (nd & NETDEV_UP) {
2375 if (nd & NETDEV_PROMISC) {
/* Converts the interface flags 'iff' to a NETDEV_* flag set, starting from an
 * empty set.  IFF_PROMISC yields NETDEV_PROMISC. */
2382 iff_to_nd_flags(int iff)
2384 enum netdev_flags nd = 0;
2388 if (iff & IFF_PROMISC) {
2389 nd |= NETDEV_PROMISC;
/* Clears the flags in 'off' and sets those in 'on' for 'netdev_', reporting
 * the previous flags in '*old_flagsp'.
 *
 * The previous state is taken from the cached 'ifi_flags'.  The kernel is only
 * touched when the computed flags differ: set_flags() applies them, and
 * get_flags() then refreshes the cache. */
2395 netdev_linux_update_flags(struct netdev *netdev_, enum netdev_flags off,
2396 enum netdev_flags on, enum netdev_flags *old_flagsp)
2398 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2399 int old_flags, new_flags;
2402 old_flags = netdev->ifi_flags;
2403 *old_flagsp = iff_to_nd_flags(old_flags);
2404 new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
2405 if (new_flags != old_flags) {
2406 error = set_flags(netdev_get_name(netdev_), new_flags);
2407 get_flags(netdev_, &netdev->ifi_flags);
/* Returns the 'change_seq' member of the netdev_linux behind 'netdev'. */
2413 netdev_linux_change_seq(const struct netdev *netdev)
2415 return netdev_linux_cast(netdev)->change_seq;
/* Expands to a netdev_class initializer shared by the Linux-based device
 * classes defined after it.  The entries that vary between classes are taken
 * from the macro parameters (NAME, CREATE, GET_STATS, SET_STATS,
 * GET_FEATURES, GET_STATUS); the other callbacks listed here are the common
 * netdev_linux_* implementations, with NULL for the config hooks. */
2418 #define NETDEV_LINUX_CLASS(NAME, CREATE, GET_STATS, SET_STATS, \
2419 GET_FEATURES, GET_STATUS) \
2423 netdev_linux_init, \
2425 netdev_linux_wait, \
2428 netdev_linux_destroy, \
2429 NULL, /* get_config */ \
2430 NULL, /* set_config */ \
2431 NULL, /* get_tunnel_config */ \
2433 netdev_linux_rx_open, \
2435 netdev_linux_send, \
2436 netdev_linux_send_wait, \
2438 netdev_linux_set_etheraddr, \
2439 netdev_linux_get_etheraddr, \
2440 netdev_linux_get_mtu, \
2441 netdev_linux_set_mtu, \
2442 netdev_linux_get_ifindex, \
2443 netdev_linux_get_carrier, \
2444 netdev_linux_get_carrier_resets, \
2445 netdev_linux_set_miimon_interval, \
2450 netdev_linux_set_advertisements, \
2452 netdev_linux_set_policing, \
2453 netdev_linux_get_qos_types, \
2454 netdev_linux_get_qos_capabilities, \
2455 netdev_linux_get_qos, \
2456 netdev_linux_set_qos, \
2457 netdev_linux_get_queue, \
2458 netdev_linux_set_queue, \
2459 netdev_linux_delete_queue, \
2460 netdev_linux_get_queue_stats, \
2461 netdev_linux_dump_queues, \
2462 netdev_linux_dump_queue_stats, \
2464 netdev_linux_get_in4, \
2465 netdev_linux_set_in4, \
2466 netdev_linux_get_in6, \
2467 netdev_linux_add_router, \
2468 netdev_linux_get_next_hop, \
2470 netdev_linux_arp_lookup, \
2472 netdev_linux_update_flags, \
2474 netdev_linux_change_seq \
/* Class for ordinary Linux network devices: created by netdev_linux_create(),
 * stats from netdev_linux_get_stats(), no set_stats, and ethtool-backed
 * features and status. */
2477 const struct netdev_class netdev_linux_class =
2480 netdev_linux_create,
2481 netdev_linux_get_stats,
2482 NULL, /* set_stats */
2483 netdev_linux_get_features,
2484 netdev_linux_get_status);
/* Class for tap devices: created by netdev_linux_create_tap() and using
 * netdev_tap_get_stats(); it has no set_stats and shares the features and
 * status callbacks of netdev_linux_class. */
2486 const struct netdev_class netdev_tap_class =
2489 netdev_linux_create_tap,
2490 netdev_tap_get_stats,
2491 NULL, /* set_stats */
2492 netdev_linux_get_features,
2493 netdev_linux_get_status);
/* Class for internal devices: stats are both read and set through the vport
 * layer (netdev_internal_get_stats() / netdev_internal_set_stats()), there is
 * no get_features callback, and status is netdev_internal_get_status(). */
2495 const struct netdev_class netdev_internal_class =
2498 netdev_linux_create,
2499 netdev_internal_get_stats,
2500 netdev_internal_set_stats,
2501 NULL, /* get_features */
2502 netdev_internal_get_status);
/* Receive-side callback table for Linux devices: destroy, recv, wait and
 * drain, in that order. */
2504 static const struct netdev_rx_class netdev_rx_linux_class = {
2505 netdev_rx_linux_destroy,
2506 netdev_rx_linux_recv,
2507 netdev_rx_linux_wait,
2508 netdev_rx_linux_drain,
2511 /* HTB traffic control class. */
/* Numeric limit associated with HTB queues; presumably the number of queue
 * ids HTB supports -- TODO confirm against its use. */
2513 #define HTB_N_QUEUES 0xf000
/* Qdisc-level ceiling. */
2517 unsigned int max_rate; /* In bytes/s. */
/* Per-queue (class) parameters.  The embedded 'tc_queue' is the generic
 * queue record; the rest are the HTB-specific settings that
 * htb_setup_class__() hands to the kernel. */
2521 struct tc_queue tc_queue;
2522 unsigned int min_rate; /* In bytes/s. */
2523 unsigned int max_rate; /* In bytes/s. */
2524 unsigned int burst; /* In bytes. */
2525 unsigned int priority; /* Lower values are higher priorities. */
/* Returns the 'struct htb' that embeds 'netdev_''s 'tc' member, recovered with
 * CONTAINER_OF.  Only meaningful while the device's 'tc' actually belongs to
 * an htb. */
2529 htb_get__(const struct netdev *netdev_)
2531 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2532 return CONTAINER_OF(netdev->tc, struct htb, tc);
/* Allocates a new htb, initializes its 'tc' with 'tc_ops_htb', records
 * 'max_rate', and installs it as 'netdev_''s 'tc'.
 *
 * NOTE(review): 'max_rate' arrives as uint64_t; if the destination field is
 * the 'unsigned int max_rate' declared in this file, larger values would be
 * truncated by the assignment -- confirm. */
2536 htb_install__(struct netdev *netdev_, uint64_t max_rate)
2538 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2541 htb = xmalloc(sizeof *htb);
2542 tc_init(&htb->tc, &tc_ops_htb);
2543 htb->max_rate = max_rate;
2545 netdev->tc = &htb->tc;
2548 /* Create an HTB qdisc.
2550 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
2552 htb_setup_qdisc__(struct netdev *netdev)
2555 struct tc_htb_glob opt;
2556 struct ofpbuf request;
2557 struct tcmsg *tcmsg;
/* Any existing qdisc is removed first; the result of that removal is not
 * checked here. */
2559 tc_del_qdisc(netdev);
2561 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2562 NLM_F_EXCL | NLM_F_CREATE, &request);
/* Handle 1:0 attached at the root. */
2566 tcmsg->tcm_handle = tc_make_handle(1, 0);
2567 tcmsg->tcm_parent = TC_H_ROOT;
2569 nl_msg_put_string(&request, TCA_KIND, "htb");
2571 memset(&opt, 0, sizeof opt);
2572 opt.rate2quantum = 10;
/* The global HTB options travel as TCA_HTB_INIT nested inside
 * TCA_OPTIONS. */
2576 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2577 nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
2578 nl_msg_end_nested(&request, opt_offset);
2580 return tc_transact(&request, NULL);
2583 /* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
2584 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
/* Needs the device MTU (a rate-limited warning is logged when it cannot be
 * obtained).  'class' supplies the parameters: min_rate becomes the HTB rate,
 * max_rate the ceiling, burst sizes both buffers, and priority the prio.  The
 * request is RTM_NEWTCLASS with NLM_F_CREATE and carries TCA_HTB_PARMS plus
 * rate tables for rate and ceil.  A failed transaction is logged with the
 * class and parent handles and all four parameters. */
2586 htb_setup_class__(struct netdev *netdev, unsigned int handle,
2587 unsigned int parent, struct htb_class *class)
2590 struct tc_htb_opt opt;
2591 struct ofpbuf request;
2592 struct tcmsg *tcmsg;
2596 error = netdev_get_mtu(netdev, &mtu);
2598 VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU",
2599 netdev_get_name(netdev));
2603 memset(&opt, 0, sizeof opt);
2604 tc_fill_rate(&opt.rate, class->min_rate, mtu);
2605 tc_fill_rate(&opt.ceil, class->max_rate, mtu);
2606 opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
2607 opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
2608 opt.prio = class->priority;
2610 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
2614 tcmsg->tcm_handle = handle;
2615 tcmsg->tcm_parent = parent;
2617 nl_msg_put_string(&request, TCA_KIND, "htb");
2618 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2619 nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
2620 tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
2621 tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
2622 nl_msg_end_nested(&request, opt_offset);
2624 error = tc_transact(&request, NULL);
2626 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
2627 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
2628 netdev_get_name(netdev),
2629 tc_get_major(handle), tc_get_minor(handle),
2630 tc_get_major(parent), tc_get_minor(parent),
2631 class->min_rate, class->max_rate,
2632 class->burst, class->priority, ovs_strerror(error));
/* htb_parse_tca_options__(): the parse policy requires a TCA_HTB_PARMS
 * attribute at least sizeof(struct tc_htb_opt) long, and a rate-limited
 * warning is logged when 'nl_options' does not satisfy it.  From the parsed
 * tc_htb_opt, 'class' receives min_rate = rate.rate, max_rate = ceil.rate,
 * priority = prio, and burst = the buffer converted from ticks to bytes at
 * the class rate. */
2637 /* Parses Netlink attributes in 'options' for HTB parameters and stores a
2638 * description of them into 'details'. The description complies with the
2639 * specification given in the vswitch database documentation for linux-htb
2642 htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
2644 static const struct nl_policy tca_htb_policy[] = {
2645 [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
2646 .min_len = sizeof(struct tc_htb_opt) },
2649 struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
2650 const struct tc_htb_opt *htb;
2652 if (!nl_parse_nested(nl_options, tca_htb_policy,
2653 attrs, ARRAY_SIZE(tca_htb_policy))) {
2654 VLOG_WARN_RL(&rl, "failed to parse HTB class options");
2658 htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
2659 class->min_rate = htb->rate.rate;
2660 class->max_rate = htb->ceil.rate;
2661 class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
2662 class->priority = htb->prio;
/* Parses class message 'tcmsg' with tc_parse_class(), passing 'stats'
 * through.  If that succeeds and 'queue_id' is nonnull, a handle with major 1
 * and minor in 1...HTB_N_QUEUES yields *queue_id = minor - 1.  If there is
 * still no error and 'options' is nonnull, the HTB options are parsed into
 * it. */
2667 htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2668 struct htb_class *options,
2669 struct netdev_queue_stats *stats)
2671 struct nlattr *nl_options;
2672 unsigned int handle;
2675 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2676 if (!error && queue_id) {
2677 unsigned int major = tc_get_major(handle);
2678 unsigned int minor = tc_get_minor(handle);
2679 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2680 *queue_id = minor - 1;
2685 if (!error && options) {
2686 error = htb_parse_tca_options__(nl_options, options);
/* Derives the parameters of the root HTB class from the qdisc 'details'.
 *
 * 'max-rate' is read as a decimal number of bits per second and stored in
 * 'hc->max_rate' in bytes per second.  When the key is absent or the result is
 * zero, the rate is taken from the current features of 'netdev' instead,
 * passing 100 * 1000 * 1000 as the second argument of
 * netdev_features_to_bps() (presumably the default bit rate -- confirm).
 * 'hc->min_rate' is set equal to 'hc->max_rate'. */
2692 htb_parse_qdisc_details__(struct netdev *netdev,
2693 const struct smap *details, struct htb_class *hc)
2695 const char *max_rate_s;
2697 max_rate_s = smap_get(details, "max-rate");
2698 hc->max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
2699 if (!hc->max_rate) {
2700 enum netdev_features current;
/* The address of 'current' is the output argument; this line previously held a
 * mis-encoded character in place of "&curren" + "t". */
2702 netdev_get_features(netdev, &current, NULL, NULL, NULL);
2703 hc->max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
2705 hc->min_rate = hc->max_rate;
/* Converts the per-queue 'details' ("min-rate", "max-rate", "burst",
 * "priority") into 'hc'.  Rates and burst are parsed as decimal numbers and
 * divided by 8 before being stored.  The device MTU is needed (a rate-limited
 * warning is logged when it cannot be obtained).
 *
 * Clamping visible below: min_rate is raised to at least the MTU and capped at
 * the qdisc's max_rate; max_rate is raised to at least min_rate and capped at
 * the qdisc's max_rate; burst is raised to at least mtu + 64; a missing
 * "priority" becomes 0. */
2711 htb_parse_class_details__(struct netdev *netdev,
2712 const struct smap *details, struct htb_class *hc)
2714 const struct htb *htb = htb_get__(netdev);
2715 const char *min_rate_s = smap_get(details, "min-rate");
2716 const char *max_rate_s = smap_get(details, "max-rate");
2717 const char *burst_s = smap_get(details, "burst");
2718 const char *priority_s = smap_get(details, "priority");
2721 error = netdev_get_mtu(netdev, &mtu);
2723 VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU",
2724 netdev_get_name(netdev));
2728 /* HTB requires at least an mtu sized min-rate to send any traffic even
2729 * on uncongested links. */
2730 hc->min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
2731 hc->min_rate = MAX(hc->min_rate, mtu);
2732 hc->min_rate = MIN(hc->min_rate, htb->max_rate);
2735 hc->max_rate = (max_rate_s
2736 ? strtoull(max_rate_s, NULL, 10) / 8
2738 hc->max_rate = MAX(hc->max_rate, hc->min_rate);
2739 hc->max_rate = MIN(hc->max_rate, htb->max_rate);
2743 * According to hints in the documentation that I've read, it is important
2744 * that 'burst' be at least as big as the largest frame that might be
2745 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
2746 * but having it a bit too small is a problem. Since netdev_get_mtu()
2747 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
2748 * the MTU. We actually add 64, instead of 14, as a guard against
2749 * additional headers get tacked on somewhere that we're not aware of. */
2750 hc->burst = burst_s ? strtoull(burst_s, NULL, 10) / 8 : 0;
2751 hc->burst = MAX(hc->burst, mtu + 64);
2754 hc->priority = priority_s ? strtoul(priority_s, NULL, 10) : 0;
/* Queries the kernel for class 'handle' under 'parent' on 'netdev' and parses
 * the reply into 'options' and 'stats'.  No queue ID is requested from the
 * parser (NULL is passed).  The reply buffer is deleted after parsing. */
2760 htb_query_class__(const struct netdev *netdev, unsigned int handle,
2761 unsigned int parent, struct htb_class *options,
2762 struct netdev_queue_stats *stats)
2764 struct ofpbuf *reply;
2767 error = tc_query_class(netdev, handle, parent, &reply);
2769 error = htb_parse_tcmsg__(reply, NULL, options, stats);
2770 ofpbuf_delete(reply);
/* Installs HTB on 'netdev': creates the qdisc, derives the root class
 * parameters from 'details', sets up class 1:fffe under 1:0 with them, and
 * records the resulting max_rate in newly installed in-memory HTB state. */
2776 htb_tc_install(struct netdev *netdev, const struct smap *details)
2780 error = htb_setup_qdisc__(netdev);
2782 struct htb_class hc;
2784 htb_parse_qdisc_details__(netdev, details, &hc);
2785 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2786 tc_make_handle(1, 0), &hc);
2788 htb_install__(netdev, hc.max_rate);
/* Returns the 'struct htb_class' that embeds 'queue' as its 'tc_queue'
 * member. */
2794 static struct htb_class *
2795 htb_class_cast__(const struct tc_queue *queue)
2797 return CONTAINER_OF(queue, struct htb_class, tc_queue);
/* Records the parameters in 'hc' for queue 'queue_id' in the in-memory HTB
 * state of 'netdev'.  The queue is looked up by ID and hash; the creation
 * path allocates a new htb_class, stamps it with the ID and the current
 * time_msec(), and inserts it into the tc's queue map.  Finally min_rate,
 * max_rate, burst and priority are copied from 'hc'. */
2801 htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
2802 const struct htb_class *hc)
2804 struct htb *htb = htb_get__(netdev);
2805 size_t hash = hash_int(queue_id, 0);
2806 struct tc_queue *queue;
2807 struct htb_class *hcp;
2809 queue = tc_find_queue__(netdev, queue_id, hash);
2811 hcp = htb_class_cast__(queue);
2813 hcp = xmalloc(sizeof *hcp);
2814 queue = &hcp->tc_queue;
2815 queue->queue_id = queue_id;
2816 queue->created = time_msec();
2817 hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
2820 hcp->min_rate = hc->min_rate;
2821 hcp->max_rate = hc->max_rate;
2822 hcp->burst = hc->burst;
2823 hcp->priority = hc->priority;
/* Rebuilds in-memory HTB state from the kernel.  Class 1:fffe is queried for
 * the qdisc-level max_rate (the query's return value is not used), the state
 * is installed with it, and then every class returned by the queue dump that
 * parses successfully is recorded under its queue ID.  'nlmsg' is unused. */
2827 htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
2830 struct nl_dump dump;
2831 struct htb_class hc;
2833 /* Get qdisc options. */
2835 htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
2836 htb_install__(netdev, hc.max_rate);
2839 if (!start_queue_dump(netdev, &dump)) {
2842 while (nl_dump_next(&dump, &msg)) {
2843 unsigned int queue_id;
2845 if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
2846 htb_update_queue__(netdev, queue_id, &hc);
2849 nl_dump_done(&dump);
/* Tears down the HTB state that embeds 'tc'.  The safe iteration form is used
 * because each htb_class is removed from the queue map while iterating. */
2855 htb_tc_destroy(struct tc *tc)
2857 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
2858 struct htb_class *hc, *next;
2860 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &htb->tc.queues) {
2861 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
/* Reports the qdisc configuration in 'details': "max-rate" is the stored
 * max_rate multiplied by 8 (computed in unsigned long long). */
2869 htb_qdisc_get(const struct netdev *netdev, struct smap *details)
2871 const struct htb *htb = htb_get__(netdev);
2872 smap_add_format(details, "max-rate", "%llu", 8ULL * htb->max_rate);
/* Applies new qdisc 'details': re-derives the root class parameters, sets up
 * class 1:fffe under 1:0 with them, and stores the new max_rate in the
 * in-memory HTB state. */
2877 htb_qdisc_set(struct netdev *netdev, const struct smap *details)
2879 struct htb_class hc;
2882 htb_parse_qdisc_details__(netdev, details, &hc);
2883 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2884 tc_make_handle(1, 0), &hc);
2886 htb_get__(netdev)->max_rate = hc.max_rate;
/* Describes 'queue' in 'details'.  Rates and burst are multiplied by 8 on the
 * way out; "max-rate" is emitted only when it differs from "min-rate".
 * 'netdev' is unused. */
2892 htb_class_get(const struct netdev *netdev OVS_UNUSED,
2893 const struct tc_queue *queue, struct smap *details)
2895 const struct htb_class *hc = htb_class_cast__(queue);
2897 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
2898 if (hc->min_rate != hc->max_rate) {
2899 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
2901 smap_add_format(details, "burst", "%llu", 8ULL * hc->burst);
2903 smap_add_format(details, "priority", "%u", hc->priority);
/* Configures queue 'queue_id': parses 'details', sets up kernel class
 * 1:(queue_id + 1) under parent 1:fffe, and mirrors the parameters into the
 * in-memory queue record. */
2909 htb_class_set(struct netdev *netdev, unsigned int queue_id,
2910 const struct smap *details)
2912 struct htb_class hc;
2915 error = htb_parse_class_details__(netdev, details, &hc);
2920 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
2921 tc_make_handle(1, 0xfffe), &hc);
2926 htb_update_queue__(netdev, queue_id, &hc);
/* Deletes kernel class 1:(queue_id + 1) for 'queue' and removes the queue's
 * node from the in-memory queue map. */
2931 htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
2933 struct htb_class *hc = htb_class_cast__(queue);
2934 struct htb *htb = htb_get__(netdev);
2937 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
2939 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
/* Fetches statistics for 'queue' into 'stats' by querying class
 * 1:(queue_id + 1) under parent 1:fffe; class options are not requested. */
2946 htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
2947 struct netdev_queue_stats *stats)
2949 return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
2950 tc_make_handle(1, 0xfffe), NULL, stats);
/* Handles one class message from a statistics dump: parses the handle and
 * statistics out of 'nlmsg' (options are not requested) and, when the handle
 * is 1:<minor> with minor in 1...HTB_N_QUEUES, invokes 'cb' with queue ID
 * minor - 1, the statistics, and 'aux'.  'netdev' is unused. */
2954 htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
2955 const struct ofpbuf *nlmsg,
2956 netdev_dump_queue_stats_cb *cb, void *aux)
2958 struct netdev_queue_stats stats;
2959 unsigned int handle, major, minor;
2962 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
2967 major = tc_get_major(handle);
2968 minor = tc_get_minor(handle);
2969 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2970 (*cb)(minor - 1, &stats, aux);
/* Operations table for HTB: kernel qdisc name "htb", OVS QoS type
 * "linux-htb", HTB_N_QUEUES queues; the last two entries shown are the
 * per-class statistics callbacks. */
2975 static const struct tc_ops tc_ops_htb = {
2976 "htb", /* linux_name */
2977 "linux-htb", /* ovs_name */
2978 HTB_N_QUEUES, /* n_queues */
2987 htb_class_get_stats,
2988 htb_class_dump_stats
2991 /* "linux-hfsc" traffic control class. */
/* Largest class minor number treated as a queue: class 1:<minor> with
 * 1 <= minor <= HFSC_N_QUEUES is reported as queue ID minor - 1.  Also used
 * as the n_queues entry of tc_ops_hfsc. */
2993 #define HFSC_N_QUEUES 0xf000
/* Embedded generic queue record; hfsc_class_cast__() converts a pointer to
 * this member back to the enclosing 'struct hfsc_class'. */
3001 struct tc_queue tc_queue;
/* Returns the 'struct hfsc' that embeds the tc currently attached to
 * 'netdev_', recovered from netdev->tc with CONTAINER_OF. */
3006 static struct hfsc *
3007 hfsc_get__(const struct netdev *netdev_)
3009 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3010 return CONTAINER_OF(netdev->tc, struct hfsc, tc);
/* Returns the 'struct hfsc_class' that embeds 'queue' as its 'tc_queue'
 * member. */
3013 static struct hfsc_class *
3014 hfsc_class_cast__(const struct tc_queue *queue)
3016 return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
/* Allocates a new 'struct hfsc' with xmalloc(), initializes its embedded tc
 * with tc_ops_hfsc, records 'max_rate', and makes it the tc of 'netdev_'. */
3020 hfsc_install__(struct netdev *netdev_, uint32_t max_rate)
3022 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3025 hfsc = xmalloc(sizeof *hfsc);
3026 tc_init(&hfsc->tc, &tc_ops_hfsc);
3027 hfsc->max_rate = max_rate;
3028 netdev->tc = &hfsc->tc;
/* Records the rates in 'hc' for queue 'queue_id' in the in-memory HFSC state
 * of 'netdev'.  The queue is looked up by ID and hash; the creation path
 * allocates a new hfsc_class, stamps it with the ID and the current
 * time_msec(), and inserts it into the tc's queue map.  Finally min_rate and
 * max_rate are copied from 'hc'. */
3032 hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
3033 const struct hfsc_class *hc)
3037 struct hfsc_class *hcp;
3038 struct tc_queue *queue;
3040 hfsc = hfsc_get__(netdev);
3041 hash = hash_int(queue_id, 0);
3043 queue = tc_find_queue__(netdev, queue_id, hash);
3045 hcp = hfsc_class_cast__(queue);
3047 hcp = xmalloc(sizeof *hcp);
3048 queue = &hcp->tc_queue;
3049 queue->queue_id = queue_id;
3050 queue->created = time_msec();
3051 hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
3054 hcp->min_rate = hc->min_rate;
3055 hcp->max_rate = hc->max_rate;
/* Extracts min/max rates from the HFSC attributes in 'nl_options'.
 *
 * The real-time (RSC), fair-share (FSC) and upper-limit (USC) curves are each
 * read as a 'struct tc_service_curve'.  Configurations are rejected, with a
 * rate-limited warning, when any curve has a nonzero 'm1' or 'd', when the
 * RSC and FSC 'm2' values differ, or when the RSC 'm2' exceeds the USC 'm2'.
 * Otherwise class->min_rate is the FSC 'm2' and class->max_rate is the USC
 * 'm2'. */
3059 hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
3061 const struct tc_service_curve *rsc, *fsc, *usc;
3062 static const struct nl_policy tca_hfsc_policy[] = {
3064 .type = NL_A_UNSPEC,
3066 .min_len = sizeof(struct tc_service_curve),
3069 .type = NL_A_UNSPEC,
3071 .min_len = sizeof(struct tc_service_curve),
3074 .type = NL_A_UNSPEC,
3076 .min_len = sizeof(struct tc_service_curve),
3079 struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];
3081 if (!nl_parse_nested(nl_options, tca_hfsc_policy,
3082 attrs, ARRAY_SIZE(tca_hfsc_policy))) {
3083 VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
3087 rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
3088 fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
3089 usc = nl_attr_get(attrs[TCA_HFSC_USC]);
3091 if (rsc->m1 != 0 || rsc->d != 0 ||
3092 fsc->m1 != 0 || fsc->d != 0 ||
3093 usc->m1 != 0 || usc->d != 0) {
3094 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3095 "Non-linear service curves are not supported.");
3099 if (rsc->m2 != fsc->m2) {
3100 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3101 "Real-time service curves are not supported ");
3105 if (rsc->m2 > usc->m2) {
3106 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3107 "Min-rate service curve is greater than "
3108 "the max-rate service curve.");
3112 class->min_rate = fsc->m2;
3113 class->max_rate = usc->m2;
/* Parses class message 'tcmsg' with tc_parse_class(), passing 'stats'
 * through.  A handle of the form 1:<minor> with minor in 1...HFSC_N_QUEUES
 * yields *queue_id = minor - 1, and the HFSC options are parsed into
 * 'options'. */
3118 hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
3119 struct hfsc_class *options,
3120 struct netdev_queue_stats *stats)
3123 unsigned int handle;
3124 struct nlattr *nl_options;
3126 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
3132 unsigned int major, minor;
3134 major = tc_get_major(handle);
3135 minor = tc_get_minor(handle);
3136 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3137 *queue_id = minor - 1;
3144 error = hfsc_parse_tca_options__(nl_options, options);
/* Queries the kernel for class 'handle' under 'parent' on 'netdev' and parses
 * the reply into 'options' and 'stats'.  No queue ID is requested from the
 * parser (NULL is passed).  The reply buffer is deleted after parsing. */
3151 hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
3152 unsigned int parent, struct hfsc_class *options,
3153 struct netdev_queue_stats *stats)
3156 struct ofpbuf *reply;
3158 error = tc_query_class(netdev, handle, parent, &reply);
3163 error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
3164 ofpbuf_delete(reply);
/* Derives the parameters of the root HFSC class from the qdisc 'details'.
 *
 * 'max-rate' is read as a decimal number of bits per second and converted to
 * bytes per second.  The fallback shown below takes the rate from the current
 * features of 'netdev', passing 100 * 1000 * 1000 as the second argument of
 * netdev_features_to_bps() (presumably the default bit rate -- confirm).
 * Both class->min_rate and class->max_rate are set to the result. */
3169 hfsc_parse_qdisc_details__(struct netdev *netdev, const struct smap *details,
3170 struct hfsc_class *class)
3173 const char *max_rate_s;
3175 max_rate_s = smap_get(details, "max-rate");
3176 max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
3179 enum netdev_features current;
/* The address of 'current' is the output argument; this line previously held a
 * mis-encoded character in place of "&curren" + "t". */
3181 netdev_get_features(netdev, &current, NULL, NULL, NULL);
3182 max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
3185 class->min_rate = max_rate;
3186 class->max_rate = max_rate;
/* Converts the per-queue 'details' ("min-rate", "max-rate") into 'class'.
 * Both are parsed as decimal numbers and divided by 8.  min_rate is raised to
 * at least 1 and capped at the qdisc's max_rate; max_rate is raised to at
 * least min_rate and capped at the qdisc's max_rate. */
3190 hfsc_parse_class_details__(struct netdev *netdev,
3191 const struct smap *details,
3192 struct hfsc_class * class)
3194 const struct hfsc *hfsc;
3195 uint32_t min_rate, max_rate;
3196 const char *min_rate_s, *max_rate_s;
3198 hfsc = hfsc_get__(netdev);
3199 min_rate_s = smap_get(details, "min-rate");
3200 max_rate_s = smap_get(details, "max-rate");
3202 min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
3203 min_rate = MAX(min_rate, 1);
3204 min_rate = MIN(min_rate, hfsc->max_rate);
3206 max_rate = (max_rate_s
3207 ? strtoull(max_rate_s, NULL, 10) / 8
3209 max_rate = MAX(max_rate, min_rate);
3210 max_rate = MIN(max_rate, hfsc->max_rate);
3212 class->min_rate = min_rate;
3213 class->max_rate = max_rate;
3218 /* Create an HFSC qdisc.
3220 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
/* Steps visible here: delete any existing qdisc on 'netdev', build an
 * RTM_NEWQDISC request (NLM_F_EXCL | NLM_F_CREATE) for handle 1:0 at
 * TC_H_ROOT, zero a tc_hfsc_qopt, attach kind "hfsc" and the options as a
 * plain (non-nested) TCA_OPTIONS attribute, and return the result of
 * tc_transact(). */
3222 hfsc_setup_qdisc__(struct netdev * netdev)
3224 struct tcmsg *tcmsg;
3225 struct ofpbuf request;
3226 struct tc_hfsc_qopt opt;
3228 tc_del_qdisc(netdev);
3230 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3231 NLM_F_EXCL | NLM_F_CREATE, &request);
3237 tcmsg->tcm_handle = tc_make_handle(1, 0);
3238 tcmsg->tcm_parent = TC_H_ROOT;
3240 memset(&opt, 0, sizeof opt);
3243 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3244 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
3246 return tc_transact(&request, NULL);
3249 /* Create an HFSC class.
3251 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
3252 * sc rate <min_rate> ul rate <max_rate>" */
/* The request is RTM_NEWTCLASS with NLM_F_CREATE (no NLM_F_EXCL).  The same
 * 'min' curve, whose m2 is class->min_rate, is sent as both TCA_HFSC_RSC and
 * TCA_HFSC_FSC; 'max', whose m2 is class->max_rate, is sent as TCA_HFSC_USC.
 * A failed transaction is logged with the handles and both rates. */
3254 hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
3255 unsigned int parent, struct hfsc_class *class)
3259 struct tcmsg *tcmsg;
3260 struct ofpbuf request;
3261 struct tc_service_curve min, max;
3263 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
3269 tcmsg->tcm_handle = handle;
3270 tcmsg->tcm_parent = parent;
3274 min.m2 = class->min_rate;
3278 max.m2 = class->max_rate;
3280 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3281 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3282 nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
3283 nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
3284 nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
3285 nl_msg_end_nested(&request, opt_offset);
3287 error = tc_transact(&request, NULL);
3289 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
3290 "min-rate %ubps, max-rate %ubps (%s)",
3291 netdev_get_name(netdev),
3292 tc_get_major(handle), tc_get_minor(handle),
3293 tc_get_major(parent), tc_get_minor(parent),
3294 class->min_rate, class->max_rate, ovs_strerror(error));
/* Installs HFSC on 'netdev': creates the qdisc, derives the root class
 * parameters from 'details', sets up class 1:fffe under 1:0 with them, and
 * records the resulting max_rate in newly installed in-memory HFSC state. */
3301 hfsc_tc_install(struct netdev *netdev, const struct smap *details)
3304 struct hfsc_class class;
3306 error = hfsc_setup_qdisc__(netdev);
3312 hfsc_parse_qdisc_details__(netdev, details, &class);
3313 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3314 tc_make_handle(1, 0), &class);
3320 hfsc_install__(netdev, class.max_rate);
/* Rebuilds in-memory HFSC state from the kernel.  Class 1:fffe is queried for
 * the qdisc-level max_rate (the query's return value is not used), the state
 * is installed with it, and then every class returned by the queue dump that
 * parses successfully is recorded under its queue ID.  'nlmsg' is unused. */
3325 hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3328 struct nl_dump dump;
3329 struct hfsc_class hc;
3332 hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
3333 hfsc_install__(netdev, hc.max_rate);
3335 if (!start_queue_dump(netdev, &dump)) {
3339 while (nl_dump_next(&dump, &msg)) {
3340 unsigned int queue_id;
3342 if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3343 hfsc_update_queue__(netdev, queue_id, &hc);
3347 nl_dump_done(&dump);
/* Tears down the HFSC state that embeds 'tc'.  The safe iteration form is
 * used because each hfsc_class is removed from the queue map while
 * iterating. */
3352 hfsc_tc_destroy(struct tc *tc)
3355 struct hfsc_class *hc, *next;
3357 hfsc = CONTAINER_OF(tc, struct hfsc, tc);
3359 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
3360 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
/* Reports the qdisc configuration in 'details': "max-rate" is the stored
 * max_rate multiplied by 8 (computed in unsigned long long). */
3369 hfsc_qdisc_get(const struct netdev *netdev, struct smap *details)
3371 const struct hfsc *hfsc;
3372 hfsc = hfsc_get__(netdev);
3373 smap_add_format(details, "max-rate", "%llu", 8ULL * hfsc->max_rate);
/* Applies new qdisc 'details': re-derives the root class parameters, sets up
 * class 1:fffe under 1:0 with them, and stores the new max_rate in the
 * in-memory HFSC state. */
3378 hfsc_qdisc_set(struct netdev *netdev, const struct smap *details)
3381 struct hfsc_class class;
3383 hfsc_parse_qdisc_details__(netdev, details, &class);
3384 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3385 tc_make_handle(1, 0), &class);
3388 hfsc_get__(netdev)->max_rate = class.max_rate;
/* Describes 'queue' in 'details'.  Rates are multiplied by 8 on the way out;
 * "max-rate" is emitted only when it differs from "min-rate".  'netdev' is
 * unused. */
3395 hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
3396 const struct tc_queue *queue, struct smap *details)
3398 const struct hfsc_class *hc;
3400 hc = hfsc_class_cast__(queue);
3401 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
3402 if (hc->min_rate != hc->max_rate) {
3403 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
/* Configures queue 'queue_id': parses 'details', sets up kernel class
 * 1:(queue_id + 1) under parent 1:fffe, and mirrors the rates into the
 * in-memory queue record. */
3409 hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
3410 const struct smap *details)
3413 struct hfsc_class class;
3415 error = hfsc_parse_class_details__(netdev, details, &class);
3420 error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3421 tc_make_handle(1, 0xfffe), &class);
3426 hfsc_update_queue__(netdev, queue_id, &class);
/* Deletes kernel class 1:(queue_id + 1) for 'queue' and removes the queue's
 * node from the in-memory queue map. */
3431 hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
3435 struct hfsc_class *hc;
3437 hc = hfsc_class_cast__(queue);
3438 hfsc = hfsc_get__(netdev);
3440 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3442 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
/* Fetches statistics for 'queue' into 'stats' by querying class
 * 1:(queue_id + 1) under parent 1:fffe; class options are not requested. */
3449 hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3450 struct netdev_queue_stats *stats)
3452 return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3453 tc_make_handle(1, 0xfffe), NULL, stats);
/* Handles one class message from a statistics dump: parses the handle and
 * statistics out of 'nlmsg' (options are not requested) and, when the handle
 * is 1:<minor> with minor in 1...HFSC_N_QUEUES, invokes 'cb' with queue ID
 * minor - 1, the statistics, and 'aux'.  'netdev' is unused. */
3457 hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3458 const struct ofpbuf *nlmsg,
3459 netdev_dump_queue_stats_cb *cb, void *aux)
3461 struct netdev_queue_stats stats;
3462 unsigned int handle, major, minor;
3465 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3470 major = tc_get_major(handle);
3471 minor = tc_get_minor(handle);
3472 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3473 (*cb)(minor - 1, &stats, aux);
/* Operations table for HFSC: kernel qdisc name "hfsc", OVS QoS type
 * "linux-hfsc", HFSC_N_QUEUES queues, and an hfsc_* implementation for every
 * callback slot. */
3478 static const struct tc_ops tc_ops_hfsc = {
3479 "hfsc", /* linux_name */
3480 "linux-hfsc", /* ovs_name */
3481 HFSC_N_QUEUES, /* n_queues */
3482 hfsc_tc_install, /* tc_install */
3483 hfsc_tc_load, /* tc_load */
3484 hfsc_tc_destroy, /* tc_destroy */
3485 hfsc_qdisc_get, /* qdisc_get */
3486 hfsc_qdisc_set, /* qdisc_set */
3487 hfsc_class_get, /* class_get */
3488 hfsc_class_set, /* class_set */
3489 hfsc_class_delete, /* class_delete */
3490 hfsc_class_get_stats, /* class_get_stats */
3491 hfsc_class_dump_stats /* class_dump_stats */
3494 /* "linux-default" traffic control class.
3496 * This class represents the default, unnamed Linux qdisc. It corresponds to
3497 * the "" (empty string) QoS type in the OVS database. */
/* Points the tc of 'netdev_' at a single function-local static const 'struct
 * tc' initialized for tc_ops_default; no per-device state is allocated. */
3500 default_install__(struct netdev *netdev_)
3502 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3503 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_default);
3505 /* Nothing but a tc class implementation is allowed to write to a tc. This
3506 * class never does that, so we can legitimately use a const tc object. */
3507 netdev->tc = CONST_CAST(struct tc *, &tc);
/* tc_install callback for the default class: ignores 'details' and attaches
 * the shared default tc to 'netdev'. */
3511 default_tc_install(struct netdev *netdev,
3512 const struct smap *details OVS_UNUSED)
3514 default_install__(netdev);
/* tc_load callback for the default class: ignores 'nlmsg' and attaches the
 * shared default tc to 'netdev'. */
3519 default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3521 default_install__(netdev);
/* Operations table for the default class.  The entries shown have no kernel
 * qdisc name and leave tc_destroy and all qdisc/class callbacks NULL. */
3525 static const struct tc_ops tc_ops_default = {
3526 NULL, /* linux_name */
3531 NULL, /* tc_destroy */
3532 NULL, /* qdisc_get */
3533 NULL, /* qdisc_set */
3534 NULL, /* class_get */
3535 NULL, /* class_set */
3536 NULL, /* class_delete */
3537 NULL, /* class_get_stats */
3538 NULL /* class_dump_stats */
/* other_tc_load(): ignores 'nlmsg' and points the tc of 'netdev_' at a single
 * function-local static const 'struct tc' initialized for tc_ops_other; no
 * per-device state is allocated. */
3541 /* "linux-other" traffic control class.
3546 other_tc_load(struct netdev *netdev_, struct ofpbuf *nlmsg OVS_UNUSED)
3548 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3549 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_other);
3551 /* Nothing but a tc class implementation is allowed to write to a tc. This
3552 * class never does that, so we can legitimately use a const tc object. */
3553 netdev->tc = CONST_CAST(struct tc *, &tc);
/* Operations table for the "linux-other" class.  It has no kernel qdisc name,
 * cannot be installed (tc_install is NULL), and leaves tc_destroy and all
 * qdisc/class callbacks NULL. */
3557 static const struct tc_ops tc_ops_other = {
3558 NULL, /* linux_name */
3559 "linux-other", /* ovs_name */
3561 NULL, /* tc_install */
3563 NULL, /* tc_destroy */
3564 NULL, /* qdisc_get */
3565 NULL, /* qdisc_set */
3566 NULL, /* class_get */
3567 NULL, /* class_set */
3568 NULL, /* class_delete */
3569 NULL, /* class_get_stats */
3570 NULL /* class_dump_stats */
3573 /* Traffic control. */
/* 'ticks_per_s' and 'buffer_hz' are file-scope state used by the tick/byte
 * conversion helpers; 'ticks_per_s' is computed from the contents of
 * /proc/net/psched. */
3575 /* Number of kernel "tc" ticks per second. */
3576 static double ticks_per_s;
3578 /* Number of kernel "jiffies" per second. This is used for the purpose of
3579 * computing buffer sizes. Generally kernel qdiscs need to be able to buffer
3580 * one jiffy's worth of data.
3582 * There are two possibilities here:
3584 * - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
3585 * approximate range of 100 to 1024. That means that we really need to
3586 * make sure that the qdisc can buffer that much data.
3588 * - 'buffer_hz' is an absurdly large number. That means that the kernel
3589 * has finely granular timers and there's no need to fudge additional room
3590 * for buffers. (There's no extra effort needed to implement that: the
3591 * large 'buffer_hz' is used as a divisor, so practically any number will
3592 * come out as 0 in the division. Small integer results in the case of
3593 * really high dividends won't have any real effect anyhow.)
3595 static unsigned int buffer_hz;
3597 /* Returns tc handle 'major':'minor'. */
/* 'major' is shifted into the upper 16 bits before being combined with
 * 'minor' by TC_H_MAKE. */
3599 tc_make_handle(unsigned int major, unsigned int minor)
3601 return TC_H_MAKE(major << 16, minor);
3604 /* Returns the major number from 'handle'. */
/* TC_H_MAJ leaves the major part in place, so it is shifted down by 16 to
 * undo the shift applied by tc_make_handle(). */
3606 tc_get_major(unsigned int handle)
3608 return TC_H_MAJ(handle) >> 16;
3611 /* Returns the minor number from 'handle'. */
/* Thin wrapper around TC_H_MIN; no shift is needed for the minor part. */
3613 tc_get_minor(unsigned int handle)
3615 return TC_H_MIN(handle);
/* Starts a traffic-control Netlink request for 'netdev' in 'request'.
 *
 * Looks up the interface index of 'netdev', initializes 'request' with an
 * initial size of 512 bytes, appends a Netlink header of message type 'type'
 * with NLM_F_REQUEST | 'flags', and appends a zeroed 'struct tcmsg' whose
 * family is AF_UNSPEC and whose ifindex is the device's.  The caller fills in
 * the handle and parent, as the trailing comments note. */
3618 static struct tcmsg *
3619 tc_make_request(const struct netdev *netdev, int type, unsigned int flags,
3620 struct ofpbuf *request)
3622 struct tcmsg *tcmsg;
3626 error = get_ifindex(netdev, &ifindex);
3631 ofpbuf_init(request, 512);
3632 nl_msg_put_nlmsghdr(request, sizeof *tcmsg, type, NLM_F_REQUEST | flags);
3633 tcmsg = ofpbuf_put_zeros(request, sizeof *tcmsg);
3634 tcmsg->tcm_family = AF_UNSPEC;
3635 tcmsg->tcm_ifindex = ifindex;
3636 /* Caller should fill in tcmsg->tcm_handle. */
3637 /* Caller should fill in tcmsg->tcm_parent. */
/* Sends 'request' over NETLINK_ROUTE with nl_transact(), storing any reply
 * via 'replyp', and then uninitializes 'request' regardless of the outcome,
 * so callers must not reuse the buffer afterwards. */
3643 tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
3645 int error = nl_transact(NETLINK_ROUTE, request, replyp);
3646 ofpbuf_uninit(request);
/* tc_add_del_ingress_qdisc(): the request type is RTM_NEWQDISC with
 * NLM_F_EXCL | NLM_F_CREATE when 'add' is true and RTM_DELQDISC with no extra
 * flags otherwise.  It targets handle ffff:0 under TC_H_INGRESS with kind
 * "ingress" and an empty TCA_OPTIONS attribute.  When deleting, ENOENT and
 * EINVAL results are singled out by the final check shown. */
3650 /* Adds or deletes a root ingress qdisc on 'netdev'. We use this for
3651 * policing configuration.
3653 * This function is equivalent to running the following when 'add' is true:
3654 * /sbin/tc qdisc add dev <devname> handle ffff: ingress
3656 * This function is equivalent to running the following when 'add' is false:
3657 * /sbin/tc qdisc del dev <devname> handle ffff: ingress
3659 * The configuration and stats may be seen with the following command:
3660 * /sbin/tc -s qdisc show dev <devname>
3662 * Returns 0 if successful, otherwise a positive errno value.
3665 tc_add_del_ingress_qdisc(struct netdev *netdev, bool add)
3667 struct ofpbuf request;
3668 struct tcmsg *tcmsg;
3670 int type = add ? RTM_NEWQDISC : RTM_DELQDISC;
3671 int flags = add ? NLM_F_EXCL | NLM_F_CREATE : 0;
3673 tcmsg = tc_make_request(netdev, type, flags, &request);
3677 tcmsg->tcm_handle = tc_make_handle(0xffff, 0);
3678 tcmsg->tcm_parent = TC_H_INGRESS;
3679 nl_msg_put_string(&request, TCA_KIND, "ingress");
3680 nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0);
3682 error = tc_transact(&request, NULL);
3684 /* If we're deleting the qdisc, don't worry about some of the
3685 * error conditions. */
3686 if (!add && (error == ENOENT || error == EINVAL)) {
3695 /* Adds a policer to 'netdev' with a rate of 'kbits_rate' and a burst size
3698 * This function is equivalent to running:
3699 * /sbin/tc filter add dev <devname> parent ffff: protocol all prio 49
3700 * basic police rate <kbits_rate>kbit burst <kbits_burst>k
3703 * The configuration and stats may be seen with the following command:
3704 * /sbin/tc -s filter show <devname> eth0 parent ffff:
3706 * Returns 0 if successful, otherwise a positive errno value.
3709 tc_add_policer(struct netdev *netdev, int kbits_rate, int kbits_burst)
3711 struct tc_police tc_police;
3712 struct ofpbuf request;
3713 struct tcmsg *tcmsg;
3714 size_t basic_offset;
3715 size_t police_offset;
3719 memset(&tc_police, 0, sizeof tc_police);
3720 tc_police.action = TC_POLICE_SHOT;
3721 tc_police.mtu = mtu;
3722 tc_fill_rate(&tc_police.rate, (kbits_rate * 1000)/8, mtu);
3723 tc_police.burst = tc_bytes_to_ticks(tc_police.rate.rate,
3724 kbits_burst * 1024);
3726 tcmsg = tc_make_request(netdev, RTM_NEWTFILTER,
3727 NLM_F_EXCL | NLM_F_CREATE, &request);
3731 tcmsg->tcm_parent = tc_make_handle(0xffff, 0);
3732 tcmsg->tcm_info = tc_make_handle(49,
3733 (OVS_FORCE uint16_t) htons(ETH_P_ALL));
3735 nl_msg_put_string(&request, TCA_KIND, "basic");
3736 basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3737 police_offset = nl_msg_start_nested(&request, TCA_BASIC_POLICE);
3738 nl_msg_put_unspec(&request, TCA_POLICE_TBF, &tc_police, sizeof tc_police);
3739 tc_put_rtab(&request, TCA_POLICE_RATE, &tc_police.rate);
3740 nl_msg_end_nested(&request, police_offset);
3741 nl_msg_end_nested(&request, basic_offset);
3743 error = tc_transact(&request, NULL);
/* Body of the one-time /proc/net/psched reader.  Guarded by an ovsthread_once
 * so the work runs a single time.  It opens the file, reads four hexadecimal
 * values a, b, c and d, and logs a warning when the open or read fails or the
 * parameters are rejected.  'ticks_per_s' is computed as a * c / b in floating
 * point, and the resulting ticks_per_s and buffer_hz are logged at debug
 * level. */
3754 /* The values in psched are not individually very meaningful, but they are
3755 * important. The tables below show some values seen in the wild.
3759 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
3760 * (Before that, there are hints that it was 1000000000.)
3762 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
3766 * -----------------------------------
3767 * [1] 000c8000 000f4240 000f4240 00000064
3768 * [2] 000003e8 00000400 000f4240 3b9aca00
3769 * [3] 000003e8 00000400 000f4240 3b9aca00
3770 * [4] 000003e8 00000400 000f4240 00000064
3771 * [5] 000003e8 00000040 000f4240 3b9aca00
3772 * [6] 000003e8 00000040 000f4240 000000f9
3774 * a b c d ticks_per_s buffer_hz
3775 * ------- --------- ---------- ------------- ----------- -------------
3776 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
3777 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3778 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3779 * [4] 1,000 1,024 1,000,000 100 976,562 100
3780 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
3781 * [6] 1,000 64 1,000,000 249 15,625,000 249
3783 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
3784 * [2] 2.6.26-1-686-bigmem from Debian lenny
3785 * [3] 2.6.26-2-sparc64 from Debian lenny
3786 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
3787 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
3788 * [6] 2.6.34 from kernel.org on KVM
3790 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
3791 static const char fn[] = "/proc/net/psched";
3792 unsigned int a, b, c, d;
3795 if (!ovsthread_once_start(&once)) {
3802 stream = fopen(fn, "r");
3804 VLOG_WARN("%s: open failed: %s", fn, ovs_strerror(errno));
3808 if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
3809 VLOG_WARN("%s: read failed", fn);
3813 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
3817 VLOG_WARN("%s: invalid scheduler parameters", fn);
3821 ticks_per_s = (double) a * c / b;
3825 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
3828 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);
3831 ovsthread_once_done(&once);
3834 /* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
3835 * rate of 'rate' bytes per second. */
/* The product is formed in double precision.  Multiplying the two unsigned
 * int operands directly wraps at 2**32, which is reachable with the larger
 * 'ticks_per_s' values listed for /proc/net/psched (a product in the region
 * of ticks_per_s * burst), and would silently shrink the result.  For
 * products that fit in 32 bits the value is unchanged. */
3837 tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
3840 return ((double) rate * ticks) / ticks_per_s;
3843 /* Returns the number of ticks that it would take to transmit 'size' bytes at a
3844 * rate of 'rate' bytes per second. */
/* Returns 0 when 'rate' is 0 rather than dividing by zero.  'ticks_per_s' is
 * converted to unsigned long long before the multiplication, so the product is
 * computed in integer arithmetic and any fractional part of 'ticks_per_s' is
 * dropped. */
3846 tc_bytes_to_ticks(unsigned int rate, unsigned int size)
3849 return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
3852 /* Returns the number of bytes that need to be reserved for qdisc buffering at
3853 * a transmission rate of 'rate' bytes per second. */
/* Integer division of 'rate' by 'buffer_hz'.
 *
 * NOTE(review): this relies on 'buffer_hz' being nonzero by the time it is
 * used -- confirm it is always initialized to a nonzero value first. */
3855 tc_buffer_per_jiffy(unsigned int rate)
3858 return rate / buffer_hz;
3861 /* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
3862 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
3863 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
3864 * stores NULL into it if it is absent.
3866 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
3869 * Returns 0 if successful, otherwise a positive errno value. */
/* Attributes are parsed starting after the Netlink header and the 'struct
 * tcmsg'.  The policy makes TCA_KIND (a string) mandatory and TCA_OPTIONS (a
 * nested attribute) optional; a rate-limited warning is logged when 'msg'
 * does not satisfy it. */
3871 tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
3872 struct nlattr **options)
3874 static const struct nl_policy tca_policy[] = {
3875 [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
3876 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
3878 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3880 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3881 tca_policy, ta, ARRAY_SIZE(ta))) {
3882 VLOG_WARN_RL(&rl, "failed to parse qdisc message");
3887 *kind = nl_attr_get_string(ta[TCA_KIND]);
3891 *options = ta[TCA_OPTIONS];
3906 /* Given Netlink 'msg' that describes a class, extracts the queue ID (e.g. the
3907 * minor number of its class ID) into '*queue_id', its TCA_OPTIONS attribute
3908 * into '*options', and its queue statistics into '*stats'. Any of the output
3909 * arguments may be null.
3911 * Returns 0 if successful, otherwise a positive errno value. */
/* NOTE(review): the comment above refers to a queue ID stored in
 * '*queue_id', but the parameter is named 'handlep' and the visible code
 * stores the whole 'tcm_handle' field of the message's 'struct tcmsg' in it.
 *
 * Both TCA_OPTIONS and TCA_STATS2 are mandatory in 'tca_policy'.  TCA_STATS2
 * is parsed as a nested attribute that must itself contain TCA_STATS_BASIC
 * and TCA_STATS_QUEUE; 'tx_bytes' and 'tx_packets' are taken from the former
 * and 'tx_errors' from the drop count in the latter. */
3913 tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
3914 struct nlattr **options, struct netdev_queue_stats *stats)
3916 static const struct nl_policy tca_policy[] = {
3917 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
3918 [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
3920 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3922 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3923 tca_policy, ta, ARRAY_SIZE(ta))) {
3924 VLOG_WARN_RL(&rl, "failed to parse class message");
/* The handle lives in the fixed 'struct tcmsg' that directly follows the
 * Netlink header, not in an attribute. */
3929 struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
3930 *handlep = tc->tcm_handle;
3934 *options = ta[TCA_OPTIONS];
3938 const struct gnet_stats_queue *gsq;
3939 struct gnet_stats_basic gsb;
3941 static const struct nl_policy stats_policy[] = {
3942 [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
3943 .min_len = sizeof gsb },
3944 [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
3945 .min_len = sizeof *gsq },
3947 struct nlattr *sa[ARRAY_SIZE(stats_policy)];
3949 if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
3950 sa, ARRAY_SIZE(sa))) {
3951 VLOG_WARN_RL(&rl, "failed to parse class stats");
3955 /* Alignment issues screw up the length of struct gnet_stats_basic on
3956 * some arch/bitsize combinations. Newer versions of Linux have a
3957 * struct gnet_stats_basic_packed, but we can't depend on that. The
3958 * easiest thing to do is just to make a copy. */
/* The copy is bounded by the smaller of the attribute's payload size and
 * 'sizeof gsb'; 'gsb' is zeroed first so any bytes not copied read as 0. */
3959 memset(&gsb, 0, sizeof gsb);
3960 memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
3961 MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
3962 stats->tx_bytes = gsb.bytes;
3963 stats->tx_packets = gsb.packets;
3965 gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
3966 stats->tx_errors = gsq->drops;
3976 memset(stats, 0, sizeof *stats);
/* Builds an RTM_GETTCLASS request (flagged NLM_F_ECHO) for 'netdev', fills in
 * 'handle' and 'parent', and sends it with tc_transact(), passing 'replyp'
 * through to receive the reply.  A failure is reported with a rate-limited
 * warning that prints the major:minor parts of both 'handle' and 'parent'. */
3981 /* Queries the kernel for class with identifier 'handle' and parent 'parent'
3984 tc_query_class(const struct netdev *netdev,
3985 unsigned int handle, unsigned int parent,
3986 struct ofpbuf **replyp)
3988 struct ofpbuf request;
3989 struct tcmsg *tcmsg;
3992 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO, &request);
3996 tcmsg->tcm_handle = handle;
3997 tcmsg->tcm_parent = parent;
3999 error = tc_transact(&request, replyp);
4001 VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
4002 netdev_get_name(netdev),
4003 tc_get_major(handle), tc_get_minor(handle),
4004 tc_get_major(parent), tc_get_minor(parent),
4005 ovs_strerror(error));
4010 /* Equivalent to "tc class del dev <name> handle <handle>". */
/* Sends an RTM_DELTCLASS request for 'handle' with 'tcm_parent' set to 0.
 * No reply buffer is requested from tc_transact().  A failure is reported
 * with a rate-limited warning naming the device and the class's
 * major:minor handle. */
4012 tc_delete_class(const struct netdev *netdev, unsigned int handle)
4014 struct ofpbuf request;
4015 struct tcmsg *tcmsg;
4018 tcmsg = tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
4022 tcmsg->tcm_handle = handle;
4023 tcmsg->tcm_parent = 0;
4025 error = tc_transact(&request, NULL);
4027 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
4028 netdev_get_name(netdev),
4029 tc_get_major(handle), tc_get_minor(handle),
4030 ovs_strerror(error));
4035 /* Equivalent to "tc qdisc del dev <name> root". */
/* The RTM_DELQDISC request targets handle 1:0 with parent TC_H_ROOT and asks
 * for no reply.  Once the transaction has finished without error, and if
 * 'netdev' has a 'tc' object, that object's 'tc_destroy' callback is invoked
 * when its ops provide one. */
4037 tc_del_qdisc(struct netdev *netdev_)
4039 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4040 struct ofpbuf request;
4041 struct tcmsg *tcmsg;
4044 tcmsg = tc_make_request(netdev_, RTM_DELQDISC, 0, &request);
4048 tcmsg->tcm_handle = tc_make_handle(1, 0);
4049 tcmsg->tcm_parent = TC_H_ROOT;
4051 error = tc_transact(&request, NULL);
4052 if (error == EINVAL) {
4053 /* EINVAL probably means that the default qdisc was in use, in which
4054 * case we've accomplished our purpose. */
4057 if (!error && netdev->tc) {
4058 if (netdev->tc->ops->tc_destroy) {
4059 netdev->tc->ops->tc_destroy(netdev->tc);
4066 /* If 'netdev''s qdisc type and parameters are not yet known, queries the
4067 * kernel to determine what they are. Returns 0 if successful, otherwise a
4068 * positive errno value. */
/* Outline of the visible steps: send an RTM_GETQDISC request for handle 1:0,
 * choose a 'struct tc_ops' from the outcome (looked up by the qdisc's kind,
 * with 'tc_ops_default' and 'tc_ops_other' as the fallbacks seen below), then
 * run that ops' 'tc_load' callback.  The query error takes precedence over
 * the load error in the return value. */
4070 tc_query_qdisc(const struct netdev *netdev_)
4072 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4073 struct ofpbuf request, *qdisc;
4074 const struct tc_ops *ops;
4075 struct tcmsg *tcmsg;
4083 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
4084 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
4085 * 2.6.35 without that fix backported to it.
4087 * To avoid the OOPS, we must not make a request that would attempt to dump
4088 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
4089 * few others. There are a few ways that I can see to do this, but most of
4090 * them seem to be racy (and if you lose the race the kernel OOPSes). The
4091 * technique chosen here is to assume that any non-default qdisc that we
4092 * create will have a class with handle 1:0. The built-in qdiscs only have
4093 * a class with handle 0:0.
4095 * We could check for Linux 2.6.35+ and use a more straightforward method
4097 tcmsg = tc_make_request(netdev_, RTM_GETQDISC, NLM_F_ECHO, &request);
4101 tcmsg->tcm_handle = tc_make_handle(1, 0);
4102 tcmsg->tcm_parent = 0;
4104 /* Figure out what tc class to instantiate. */
4105 error = tc_transact(&request, &qdisc);
4109 error = tc_parse_qdisc(qdisc, &kind, NULL);
4111 ops = &tc_ops_other;
4113 ops = tc_lookup_linux_name(kind);
4115 static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
4116 VLOG_INFO_RL(&rl2, "unknown qdisc \"%s\"", kind);
4118 ops = &tc_ops_other;
4121 } else if (error == ENOENT) {
4122 /* Either it's a built-in qdisc, or it's a qdisc set up by some
4123 * other entity that doesn't have a handle 1:0. We will assume
4124 * that it's the system default qdisc. */
4125 ops = &tc_ops_default;
4128 /* Who knows? Maybe the device got deleted. */
4129 VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
4130 netdev_get_name(netdev_), ovs_strerror(error));
4131 ops = &tc_ops_other;
4134 /* Instantiate it. */
4135 load_error = ops->tc_load(CONST_CAST(struct netdev *, netdev_), qdisc);
/* Contract check on tc_load(): 'netdev->tc' must be non-null exactly when the
 * callback reports success. */
4136 ovs_assert((load_error == 0) == (netdev->tc != NULL));
4137 ofpbuf_delete(qdisc);
4139 return error ? error : load_error;
4142 /* Linux traffic control uses tables with 256 entries ("rtab" tables) to
4143 approximate the time to transmit packets of various lengths. For an MTU of
4144 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
4145 represents two possible packet lengths; for a MTU of 513 through 1024, four
4146 possible lengths; and so on.
4148 Returns, for the specified 'mtu', the number of bits that packet lengths
4149 need to be shifted right to fit within such a 256-entry table. */
/* In the visible code 'mtu' may be replaced by ETH_PAYLOAD_MAX, and the
 * Ethernet and VLAN header lengths are always added to it, so the result is
 * based on the full frame size rather than the payload size.  The loop then
 * increments 'cell_log' for as long as 'mtu' is still 256 or more. */
4151 tc_calc_cell_log(unsigned int mtu)
4156 mtu = ETH_PAYLOAD_MAX;
4158 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
4160 for (cell_log = 0; mtu >= 256; cell_log++) {
/* Zeroes '*rate' and then fills in 'cell_log', computed by
 * tc_calc_cell_log() from 'mtu', and 'mpu', set to ETH_TOTAL_MIN.  The
 * 'overhead' and 'cell_align' assignments are intentionally left commented
 * out, so those fields are only covered by the initial memset(). */
4167 /* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
4170 tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
4172 memset(rate, 0, sizeof *rate);
4173 rate->cell_log = tc_calc_cell_log(mtu);
4174 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
4175 /* rate->cell_align = 0; */ /* distro headers. */
4176 rate->mpu = ETH_TOTAL_MIN;
4180 /* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
4181 * attribute of the specified "type".
4183 * See tc_calc_cell_log() above for a description of "rtab"s. */
/* The attribute payload is TC_RTAB_SIZE bytes, reserved uninitialized in
 * 'msg' and then filled in place.  Entry 'i' holds the number of ticks
 * needed, at 'rate->rate', to send a packet of '(i + 1) << rate->cell_log'
 * bytes; sizes smaller than 'rate->mpu' are raised to 'rate->mpu' first. */
4185 tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
4190 rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
4191 for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
4192 unsigned packet_size = (i + 1) << rate->cell_log;
4193 if (packet_size < rate->mpu) {
4194 packet_size = rate->mpu;
4196 rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
/* The burst used is never smaller than 'tc_buffer_per_jiffy(Bps) + mtu'; the
 * larger of that floor and 'burst_bytes' is converted to ticks at 'Bps'.
 * NOTE(review): 'burst_bytes' is a uint64_t but tc_bytes_to_ticks() takes an
 * unsigned int size, so a burst too large for unsigned int would be narrowed
 * on the call -- confirm whether callers can request such values. */
4200 /* Calculates the proper value of 'buffer' or 'cbuffer' in HTB options given a
4201 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
4202 * burst size of 'burst_bytes'. (If no value was requested, a 'burst_bytes' of
4205 tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
4207 unsigned int min_burst = tc_buffer_per_jiffy(Bps) + mtu;
4208 return tc_bytes_to_ticks(Bps, MAX(burst_bytes, min_burst));
4211 /* Linux-only functions declared in netdev-linux.h */
4213 /* Returns a fd for an AF_INET socket or a negative errno value. */
/* netdev_linux_init() is run first; if it reports an error, that error is
 * returned negated, otherwise the file-level 'af_inet_sock' descriptor is
 * returned. */
4215 netdev_linux_get_af_inet_sock(void)
4217 int error = netdev_linux_init();
4218 return error ? -error : af_inet_sock;
4221 /* Modifies the 'flag' bit in ethtool's flags field for 'netdev'. If
4222 * 'enable' is true, the bit is set. Otherwise, it is cleared. */
/* Works in three ethtool calls: read the current flags (ETHTOOL_GFLAGS),
 * write them back with 'flag' set or cleared (ETHTOOL_SFLAGS), then read them
 * again to check that the change took effect.  If the flags read back differ
 * from the value that was written, a rate-limited warning is logged;
 * 'flag_name' appears only in that message. */
4224 netdev_linux_ethtool_set_flag(struct netdev *netdev, uint32_t flag,
4225 const char *flag_name, bool enable)
4227 const char *netdev_name = netdev_get_name(netdev);
4228 struct ethtool_value evalue;
4232 COVERAGE_INC(netdev_get_ethtool);
4233 memset(&evalue, 0, sizeof evalue);
4234 error = netdev_linux_do_ethtool(netdev_name,
4235 (struct ethtool_cmd *)&evalue,
4236 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
4241 COVERAGE_INC(netdev_set_ethtool);
/* Clear 'flag' unconditionally, then OR it back in only when enabling;
 * 'new_flags' remembers the value written for the read-back check. */
4242 evalue.data = new_flags = (evalue.data & ~flag) | (enable ? flag : 0);
4243 error = netdev_linux_do_ethtool(netdev_name,
4244 (struct ethtool_cmd *)&evalue,
4245 ETHTOOL_SFLAGS, "ETHTOOL_SFLAGS");
4250 COVERAGE_INC(netdev_get_ethtool);
4251 memset(&evalue, 0, sizeof evalue);
4252 error = netdev_linux_do_ethtool(netdev_name,
4253 (struct ethtool_cmd *)&evalue,
4254 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
4259 if (new_flags != evalue.data) {
4260 VLOG_WARN_RL(&rl, "attempt to %s ethtool %s flag on network "
4261 "device %s failed", enable ? "enable" : "disable",
4262 flag_name, netdev_name);
4269 /* Utility functions. */
4271 /* Copies 'src' into 'dst', performing format conversion in the process. */
/* Each counter listed below is assigned from the member of the same name in
 * 'src'; the only conversion is whatever the plain assignment between the two
 * structures' field types implies. */
4273 netdev_stats_from_rtnl_link_stats(struct netdev_stats *dst,
4274 const struct rtnl_link_stats *src)
4276 dst->rx_packets = src->rx_packets;
4277 dst->tx_packets = src->tx_packets;
4278 dst->rx_bytes = src->rx_bytes;
4279 dst->tx_bytes = src->tx_bytes;
4280 dst->rx_errors = src->rx_errors;
4281 dst->tx_errors = src->tx_errors;
4282 dst->rx_dropped = src->rx_dropped;
4283 dst->tx_dropped = src->tx_dropped;
4284 dst->multicast = src->multicast;
4285 dst->collisions = src->collisions;
4286 dst->rx_length_errors = src->rx_length_errors;
4287 dst->rx_over_errors = src->rx_over_errors;
4288 dst->rx_crc_errors = src->rx_crc_errors;
4289 dst->rx_frame_errors = src->rx_frame_errors;
4290 dst->rx_fifo_errors = src->rx_fifo_errors;
4291 dst->rx_missed_errors = src->rx_missed_errors;
4292 dst->tx_aborted_errors = src->tx_aborted_errors;
4293 dst->tx_carrier_errors = src->tx_carrier_errors;
4294 dst->tx_fifo_errors = src->tx_fifo_errors;
4295 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
4296 dst->tx_window_errors = src->tx_window_errors;
/* Fetches the statistics of the interface with index 'ifindex' by sending an
 * RTM_GETLINK request over NETLINK_ROUTE, then converts the reply's
 * IFLA_STATS attribute into '*stats' with
 * netdev_stats_from_rtnl_link_stats().
 *
 * The reply is freed on every path visible here.  A reply that parses but
 * carries no IFLA_STATS attribute is reported with a rate-limited warning. */
4300 get_stats_via_netlink(int ifindex, struct netdev_stats *stats)
4302 /* Policy for RTNLGRP_LINK messages.
4304 * There are *many* more fields in these messages, but currently we only
4305 * care about these fields. */
4306 static const struct nl_policy rtnlgrp_link_policy[] = {
4307 [IFLA_IFNAME] = { .type = NL_A_STRING, .optional = false },
4308 [IFLA_STATS] = { .type = NL_A_UNSPEC, .optional = true,
4309 .min_len = sizeof(struct rtnl_link_stats) },
4312 struct ofpbuf request;
4313 struct ofpbuf *reply;
4314 struct ifinfomsg *ifi;
4315 struct nlattr *attrs[ARRAY_SIZE(rtnlgrp_link_policy)];
4318 ofpbuf_init(&request, 0);
4319 nl_msg_put_nlmsghdr(&request, sizeof *ifi, RTM_GETLINK, NLM_F_REQUEST);
4320 ifi = ofpbuf_put_zeros(&request, sizeof *ifi);
4321 ifi->ifi_family = PF_UNSPEC;
4322 ifi->ifi_index = ifindex;
4323 error = nl_transact(NETLINK_ROUTE, &request, &reply);
/* The request buffer is released right after the transaction, before the
 * result is examined. */
4324 ofpbuf_uninit(&request);
4329 if (!nl_policy_parse(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg),
4330 rtnlgrp_link_policy,
4331 attrs, ARRAY_SIZE(rtnlgrp_link_policy))) {
4332 ofpbuf_delete(reply);
/* IFLA_STATS is optional in the policy, so its presence has to be checked
 * separately before it is used. */
4336 if (!attrs[IFLA_STATS]) {
4337 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
4338 ofpbuf_delete(reply);
4342 netdev_stats_from_rtnl_link_stats(stats, nl_attr_get(attrs[IFLA_STATS]));
4344 ofpbuf_delete(reply);
/* Reads statistics for the device named 'netdev_name' from /proc/net/dev,
 * scanning the file one line at a time.  A scanned line is expected to yield
 * 15 converted items; anything else is logged as a parse error with the line
 * number.  For the line whose device name matches, the counters listed in
 * the 'else if' branch below are set to UINT64_MAX (presumably because
 * /proc/net/dev does not report them -- confirm).
 *
 * Failure to open the file, and reaching the "no stats" message for the
 * device, are both reported with rate-limited warnings. */
4350 get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats)
4352 static const char fn[] = "/proc/net/dev";
4357 stream = fopen(fn, "r");
4359 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, ovs_strerror(errno));
4364 while (fgets(line, sizeof line, stream)) {
/* X64 is shorthand for the scanf conversion of one uint64_t counter; the
 * "%*u" items in the format skip a field without storing it. */
4367 #define X64 "%"SCNu64
4370 X64 X64 X64 X64 X64 X64 X64 "%*u"
4371 X64 X64 X64 X64 X64 X64 X64 "%*u",
4377 &stats->rx_fifo_errors,
4378 &stats->rx_frame_errors,
4384 &stats->tx_fifo_errors,
4386 &stats->tx_carrier_errors) != 15) {
4387 VLOG_WARN_RL(&rl, "%s:%d: parse error", fn, ln);
4388 } else if (!strcmp(devname, netdev_name)) {
4389 stats->rx_length_errors = UINT64_MAX;
4390 stats->rx_over_errors = UINT64_MAX;
4391 stats->rx_crc_errors = UINT64_MAX;
4392 stats->rx_missed_errors = UINT64_MAX;
4393 stats->tx_aborted_errors = UINT64_MAX;
4394 stats->tx_heartbeat_errors = UINT64_MAX;
4395 stats->tx_window_errors = UINT64_MAX;
4401 VLOG_WARN_RL(&rl, "%s: no stats for %s", fn, netdev_name);
/* Fetches the interface flags of 'dev' with the SIOCGIFFLAGS ioctl, issued
 * through netdev_linux_do_ioctl() using the device's name, and stores the
 * resulting 'ifr_flags' value in '*flags'. */
4407 get_flags(const struct netdev *dev, unsigned int *flags)
4413 error = netdev_linux_do_ioctl(dev->name, &ifr, SIOCGIFFLAGS,
4416 *flags = ifr.ifr_flags;
/* Sets the interface flags of the device called 'name' to 'flags' with the
 * SIOCSIFFLAGS ioctl and returns whatever netdev_linux_do_ioctl() returns. */
4422 set_flags(const char *name, unsigned int flags)
4426 ifr.ifr_flags = flags;
4427 return netdev_linux_do_ioctl(name, &ifr, SIOCSIFFLAGS, "SIOCSIFFLAGS");
/* Looks up the interface index of 'netdev_name' with the SIOCGIFINDEX ioctl
 * on 'af_inet_sock' and returns it.  The name is copied into the request with
 * ovs_strzcpy(), bounded by the size of 'ifr_name'.  A failed ioctl is
 * reported with a rate-limited warning.
 * NOTE(review): the caller below treats a negated return value as an errno
 * code, so the failure path presumably returns a negative errno -- confirm. */
4431 do_get_ifindex(const char *netdev_name)
4435 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4436 COVERAGE_INC(netdev_get_ifindex);
4437 if (ioctl(af_inet_sock, SIOCGIFINDEX, &ifr) < 0) {
4438 VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
4439 netdev_name, ovs_strerror(errno));
4442 return ifr.ifr_ifindex;
/* Cached front end for do_get_ifindex().  When VALID_IFINDEX is not set in
 * 'cache_valid', the index is looked up once and the outcome is remembered as
 * a pair: either an error code with an index of 0, or no error with the real
 * index.  Both outcomes mark the cache valid, so a failure is cached as well.
 *
 * Stores the cached index in '*ifindexp' and returns the cached error. */
4446 get_ifindex(const struct netdev *netdev_, int *ifindexp)
4448 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4450 if (!(netdev->cache_valid & VALID_IFINDEX)) {
4451 int ifindex = do_get_ifindex(netdev_get_name(netdev_));
4454 netdev->get_ifindex_error = -ifindex;
4455 netdev->ifindex = 0;
4457 netdev->get_ifindex_error = 0;
4458 netdev->ifindex = ifindex;
4460 netdev->cache_valid |= VALID_IFINDEX;
4463 *ifindexp = netdev->ifindex;
4464 return netdev->get_ifindex_error;
/* Reads the hardware address of 'netdev_name' with the SIOCGIFHWADDR ioctl on
 * 'af_inet_sock' and copies ETH_ADDR_LEN bytes of it into 'ea'.
 *
 * A failed ioctl is logged at error level, or at info level when errno is
 * ENODEV.  An address family other than AF_UNSPEC or ARPHRD_ETHER is
 * reported with a warning. */
4468 get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN])
4473 memset(&ifr, 0, sizeof ifr);
4474 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4475 COVERAGE_INC(netdev_get_hwaddr);
4476 if (ioctl(af_inet_sock, SIOCGIFHWADDR, &ifr) < 0) {
4477 /* ENODEV probably means that a vif disappeared asynchronously and
4478 * hasn't been removed from the database yet, so reduce the log level
4479 * to INFO for that case. */
4480 VLOG(errno == ENODEV ? VLL_INFO : VLL_ERR,
4481 "ioctl(SIOCGIFHWADDR) on %s device failed: %s",
4482 netdev_name, ovs_strerror(errno));
4485 hwaddr_family = ifr.ifr_hwaddr.sa_family;
4486 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER) {
4487 VLOG_WARN("%s device has unknown hardware address family %d",
4488 netdev_name, hwaddr_family);
4490 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
/* Sets the hardware address of 'netdev_name' to the ETH_ADDR_LEN bytes in
 * 'mac' with the SIOCSIFHWADDR ioctl on 'af_inet_sock'.  The request is
 * zeroed first and its address family is set to ARPHRD_ETHER.  A failed
 * ioctl is logged at error level. */
4495 set_etheraddr(const char *netdev_name,
4496 const uint8_t mac[ETH_ADDR_LEN])
4500 memset(&ifr, 0, sizeof ifr);
4501 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4502 ifr.ifr_hwaddr.sa_family = ARPHRD_ETHER;
4503 memcpy(ifr.ifr_hwaddr.sa_data, mac, ETH_ADDR_LEN);
4504 COVERAGE_INC(netdev_set_hwaddr);
4505 if (ioctl(af_inet_sock, SIOCSIFHWADDR, &ifr) < 0) {
4506 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
4507 netdev_name, ovs_strerror(errno));
/* Issues an ethtool request on the device called 'name' through the
 * SIOCETHTOOL ioctl on 'af_inet_sock', with 'ecmd' attached as the request's
 * data pointer.  'cmd_name' is a human-readable name for the command and is
 * used in the log message.
 *
 * Failures are logged with a rate-limited warning, except EOPNOTSUPP, which
 * is deliberately not logged. */
4514 netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
4515 int cmd, const char *cmd_name)
4519 memset(&ifr, 0, sizeof ifr);
4520 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
4521 ifr.ifr_data = (caddr_t) ecmd;
4524 if (ioctl(af_inet_sock, SIOCETHTOOL, &ifr) == 0) {
4527 if (errno != EOPNOTSUPP) {
4528 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
4529 "failed: %s", cmd_name, name, ovs_strerror(errno));
4531 /* The device doesn't support this operation. That's pretty
4532 * common, so there's no point in logging anything. */
/* Copies 'name' into 'ifr->ifr_name' (bounded by that field's size) and
 * issues ioctl 'cmd' on 'af_inet_sock' with 'ifr' as its argument.  The
 * caller is expected to have filled in any other request fields.  A failed
 * ioctl is logged at debug level, rate-limited, using 'cmd_name' to identify
 * the command. */
4539 netdev_linux_do_ioctl(const char *name, struct ifreq *ifr, int cmd,
4540 const char *cmd_name)
4542 ovs_strzcpy(ifr->ifr_name, name, sizeof ifr->ifr_name);
4543 if (ioctl(af_inet_sock, cmd, ifr) == -1) {
4544 VLOG_DBG_RL(&rl, "%s: ioctl(%s) failed: %s", name, cmd_name,
4545 ovs_strerror(errno));
/* Retrieves an IPv4 address of 'netdev' with ioctl 'cmd' (named 'cmd_name'
 * for logging) and stores it in '*ip'.  The request's address family is set
 * to AF_INET before the call, and the address is read back by viewing the
 * result as a 'struct sockaddr_in' and taking its 'sin_addr'. */
4552 netdev_linux_get_ipv4(const struct netdev *netdev, struct in_addr *ip,
4553 int cmd, const char *cmd_name)
4558 ifr.ifr_addr.sa_family = AF_INET;
4559 error = netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, cmd, cmd_name);
4561 const struct sockaddr_in *sin = ALIGNED_CAST(struct sockaddr_in *,
4563 *ip = sin->sin_addr;
4568 /* Returns an AF_PACKET raw socket or a negative errno value. */
/* The socket is created inside an ovsthread_once guard, as an AF_PACKET /
 * SOCK_RAW socket with protocol 0, and set_nonblocking() is then applied to
 * it.  A failure to create the socket is logged at error level. */
4570 af_packet_sock(void)
4572 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
4575 if (ovsthread_once_start(&once)) {
4576 sock = socket(AF_PACKET, SOCK_RAW, 0);
4578 int error = set_nonblocking(sock);
4585 VLOG_ERR("failed to create packet socket: %s",
4586 ovs_strerror(errno));
4588 ovsthread_once_done(&once);