2 * Copyright (c) 2009, 2010, 2011, 2012, 2013 Nicira, Inc.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
19 #include "netdev-linux.h"
23 #include <arpa/inet.h>
25 #include <linux/filter.h>
26 #include <linux/gen_stats.h>
27 #include <linux/if_ether.h>
28 #include <linux/if_tun.h>
29 #include <linux/types.h>
30 #include <linux/ethtool.h>
31 #include <linux/mii.h>
32 #include <linux/pkt_cls.h>
33 #include <linux/pkt_sched.h>
34 #include <linux/rtnetlink.h>
35 #include <linux/sockios.h>
36 #include <linux/version.h>
37 #include <sys/types.h>
38 #include <sys/ioctl.h>
39 #include <sys/socket.h>
40 #include <netpacket/packet.h>
42 #include <net/if_arp.h>
43 #include <net/if_packet.h>
44 #include <net/route.h>
45 #include <netinet/in.h>
52 #include "dpif-linux.h"
53 #include "dynamic-string.h"
54 #include "fatal-signal.h"
57 #include "netdev-provider.h"
58 #include "netdev-vport.h"
59 #include "netlink-notifier.h"
60 #include "netlink-socket.h"
63 #include "openflow/openflow.h"
65 #include "poll-loop.h"
66 #include "rtnetlink-link.h"
68 #include "socket-util.h"
71 #include "unaligned.h"
74 VLOG_DEFINE_THIS_MODULE(netdev_linux);
76 COVERAGE_DEFINE(netdev_set_policing);
77 COVERAGE_DEFINE(netdev_arp_lookup);
78 COVERAGE_DEFINE(netdev_get_ifindex);
79 COVERAGE_DEFINE(netdev_get_hwaddr);
80 COVERAGE_DEFINE(netdev_set_hwaddr);
81 COVERAGE_DEFINE(netdev_get_ethtool);
82 COVERAGE_DEFINE(netdev_set_ethtool);
85 /* These were introduced in Linux 2.6.14, so they might be missing if we have
87 #ifndef ADVERTISED_Pause
88 #define ADVERTISED_Pause (1 << 13)
90 #ifndef ADVERTISED_Asym_Pause
91 #define ADVERTISED_Asym_Pause (1 << 14)
94 /* These were introduced in Linux 2.6.24, so they might be missing if we
95 * have old headers. */
96 #ifndef ETHTOOL_GFLAGS
97 #define ETHTOOL_GFLAGS 0x00000025 /* Get flags bitmap(ethtool_value) */
99 #ifndef ETHTOOL_SFLAGS
100 #define ETHTOOL_SFLAGS 0x00000026 /* Set flags bitmap(ethtool_value) */
103 /* This was introduced in Linux 2.6.25, so it might be missing if we have old
106 #define TC_RTAB_SIZE 1024
109 static struct nln_notifier *netdev_linux_cache_notifier = NULL;
110 static int cache_notifier_refcount;
113 VALID_IFINDEX = 1 << 0,
114 VALID_ETHERADDR = 1 << 1,
118 VALID_POLICING = 1 << 5,
119 VALID_VPORT_STAT_ERROR = 1 << 6,
120 VALID_DRVINFO = 1 << 7,
121 VALID_FEATURES = 1 << 8,
128 /* Traffic control. */
130 /* An instance of a traffic control class. Always associated with a particular
133 * Each TC implementation subclasses this with whatever additional data it
136 const struct tc_ops *ops;
137 struct hmap queues; /* Contains "struct tc_queue"s.
138 * Read by generic TC layer.
139 * Written only by TC implementation. */
142 #define TC_INITIALIZER(TC, OPS) { OPS, HMAP_INITIALIZER(&(TC)->queues) }
144 /* One traffic control queue.
146 * Each TC implementation subclasses this with whatever additional data it
149 struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
150 unsigned int queue_id; /* OpenFlow queue ID. */
151 long long int created; /* Time queue was created, in msecs. */
154 /* A particular kind of traffic control. Each implementation generally maps to
155 * one particular Linux qdisc class.
157 * The functions below return 0 if successful or a positive errno value on
158 * failure, except where otherwise noted. All of them must be provided, except
159 * where otherwise noted. */
161 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
162 * This is null for tc_ops_default and tc_ops_other, for which there are no
163 * appropriate values. */
164 const char *linux_name;
166 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
167 const char *ovs_name;
169 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
170 * queues. The queues are numbered 0 through n_queues - 1. */
171 unsigned int n_queues;
173 /* Called to install this TC class on 'netdev'. The implementation should
174 * make the Netlink calls required to set up 'netdev' with the right qdisc
175 * and configure it according to 'details'. The implementation may assume
176 * that the current qdisc is the default; that is, there is no need for it
177 * to delete the current qdisc before installing itself.
179 * The contents of 'details' should be documented as valid for 'ovs_name'
180 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
181 * (which is built as ovs-vswitchd.conf.db(8)).
183 * This function must return 0 if and only if it sets 'netdev->tc' to an
184 * initialized 'struct tc'.
186 * (This function is null for tc_ops_other, which cannot be installed. For
187 * other TC classes it should always be nonnull.) */
188 int (*tc_install)(struct netdev *netdev, const struct smap *details);
190 /* Called when the netdev code determines (through a Netlink query) that
191 * this TC class's qdisc is installed on 'netdev', but we didn't install
192 * it ourselves and so don't know any of the details.
194 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
195 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
196 * implementation should parse the other attributes of 'nlmsg' as
197 * necessary to determine its configuration. If necessary it should also
198 * use Netlink queries to determine the configuration of queues on
201 * This function must return 0 if and only if it sets 'netdev->tc' to an
202 * initialized 'struct tc'. */
203 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
205 /* Destroys the data structures allocated by the implementation as part of
206 * 'tc'. (This includes destroying 'tc->queues' by calling
209 * The implementation should not need to perform any Netlink calls. If
210 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
211 * (But it may not be desirable.)
213 * This function may be null if 'tc' is trivial. */
214 void (*tc_destroy)(struct tc *tc);
216 /* Retrieves details of 'netdev->tc' configuration into 'details'.
218 * The implementation should not need to perform any Netlink calls, because
219 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
220 * cached the configuration.
222 * The contents of 'details' should be documented as valid for 'ovs_name'
223 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
224 * (which is built as ovs-vswitchd.conf.db(8)).
226 * This function may be null if 'tc' is not configurable.
228 int (*qdisc_get)(const struct netdev *netdev, struct smap *details);
230 /* Reconfigures 'netdev->tc' according to 'details', performing any
231 * required Netlink calls to complete the reconfiguration.
233 * The contents of 'details' should be documented as valid for 'ovs_name'
234 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
235 * (which is built as ovs-vswitchd.conf.db(8)).
237 * This function may be null if 'tc' is not configurable.
239 int (*qdisc_set)(struct netdev *, const struct smap *details);
241 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
242 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
244 * The contents of 'details' should be documented as valid for 'ovs_name'
245 * in the "other_config" column in the "Queue" table in
246 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
248 * The implementation should not need to perform any Netlink calls, because
249 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
250 * cached the queue configuration.
252 * This function may be null if 'tc' does not have queues ('n_queues' is
254 int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
255 struct smap *details);
257 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
258 * 'details', performing any required Netlink calls to complete the
259 * reconfiguration. The caller ensures that 'queue_id' is less than
262 * The contents of 'details' should be documented as valid for 'ovs_name'
263 * in the "other_config" column in the "Queue" table in
264 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
266 * This function may be null if 'tc' does not have queues or its queues are
267 * not configurable. */
268 int (*class_set)(struct netdev *, unsigned int queue_id,
269 const struct smap *details);
271 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
272 * tc_queue's within 'netdev->tc->queues'.
274 * This function may be null if 'tc' does not have queues or its queues
275 * cannot be deleted. */
276 int (*class_delete)(struct netdev *, struct tc_queue *queue);
278 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
279 * 'struct tc_queue's within 'netdev->tc->queues'.
281 * On success, initializes '*stats'.
283 * This function may be null if 'tc' does not have queues or if it cannot
284 * report queue statistics. */
285 int (*class_get_stats)(const struct netdev *netdev,
286 const struct tc_queue *queue,
287 struct netdev_queue_stats *stats);
289 /* Extracts queue stats from 'nlmsg', which is a response to a
290 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
292 * This function may be null if 'tc' does not have queues or if it cannot
293 * report queue statistics. */
294 int (*class_dump_stats)(const struct netdev *netdev,
295 const struct ofpbuf *nlmsg,
296 netdev_dump_queue_stats_cb *cb, void *aux);
300 tc_init(struct tc *tc, const struct tc_ops *ops)
303 hmap_init(&tc->queues);
307 tc_destroy(struct tc *tc)
309 hmap_destroy(&tc->queues);
312 static const struct tc_ops tc_ops_htb;
313 static const struct tc_ops tc_ops_hfsc;
314 static const struct tc_ops tc_ops_default;
315 static const struct tc_ops tc_ops_other;
317 static const struct tc_ops *const tcs[] = {
318 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
319 &tc_ops_hfsc, /* Hierarchical fair service curve. */
320 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
321 &tc_ops_other, /* Some other qdisc. */
325 static unsigned int tc_make_handle(unsigned int major, unsigned int minor);
326 static unsigned int tc_get_major(unsigned int handle);
327 static unsigned int tc_get_minor(unsigned int handle);
329 static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
330 static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
331 static unsigned int tc_buffer_per_jiffy(unsigned int rate);
333 static struct tcmsg *tc_make_request(const struct netdev *, int type,
334 unsigned int flags, struct ofpbuf *);
335 static int tc_transact(struct ofpbuf *request, struct ofpbuf **replyp);
336 static int tc_add_del_ingress_qdisc(struct netdev *netdev, bool add);
337 static int tc_add_policer(struct netdev *netdev, int kbits_rate,
340 static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
341 struct nlattr **options);
342 static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
343 struct nlattr **options,
344 struct netdev_queue_stats *);
345 static int tc_query_class(const struct netdev *,
346 unsigned int handle, unsigned int parent,
347 struct ofpbuf **replyp);
348 static int tc_delete_class(const struct netdev *, unsigned int handle);
350 static int tc_del_qdisc(struct netdev *netdev);
351 static int tc_query_qdisc(const struct netdev *netdev);
353 static int tc_calc_cell_log(unsigned int mtu);
354 static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
355 static void tc_put_rtab(struct ofpbuf *, uint16_t type,
356 const struct tc_ratespec *rate);
357 static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
359 struct netdev_linux {
362 struct shash_node *shash_node;
363 unsigned int cache_valid;
364 unsigned int change_seq;
366 bool miimon; /* Link status of last poll. */
367 long long int miimon_interval; /* Miimon Poll rate. Disabled if <= 0. */
368 struct timer miimon_timer;
370 /* The following are figured out "on demand" only. They are only valid
371 * when the corresponding VALID_* bit in 'cache_valid' is set. */
373 uint8_t etheraddr[ETH_ADDR_LEN];
374 struct in_addr address, netmask;
377 unsigned int ifi_flags;
378 long long int carrier_resets;
379 uint32_t kbits_rate; /* Policing data. */
380 uint32_t kbits_burst;
381 int vport_stats_error; /* Cached error code from vport_get_stats().
382 0 or an errno value. */
383 int netdev_mtu_error; /* Cached error code from SIOCGIFMTU or SIOCSIFMTU. */
384 int ether_addr_error; /* Cached error code from set/get etheraddr. */
385 int netdev_policing_error; /* Cached error code from set policing. */
386 int get_features_error; /* Cached error code from ETHTOOL_GSET. */
387 int get_ifindex_error; /* Cached error code from SIOCGIFINDEX. */
389 enum netdev_features current; /* Cached from ETHTOOL_GSET. */
390 enum netdev_features advertised; /* Cached from ETHTOOL_GSET. */
391 enum netdev_features supported; /* Cached from ETHTOOL_GSET. */
392 enum netdev_features peer; /* Cached from ETHTOOL_GSET. */
394 struct ethtool_drvinfo drvinfo; /* Cached from ETHTOOL_GDRVINFO. */
398 struct tap_state tap;
402 struct netdev_rx_linux {
408 static const struct netdev_rx_class netdev_rx_linux_class;
410 /* Sockets used for ioctl operations. */
411 static int af_inet_sock = -1; /* AF_INET, SOCK_DGRAM. */
413 /* This is set pretty low because we probably won't learn anything from the
414 * additional log messages. */
415 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
417 static int netdev_linux_init(void);
419 static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
420 int cmd, const char *cmd_name);
421 static int netdev_linux_do_ioctl(const char *name, struct ifreq *, int cmd,
422 const char *cmd_name);
423 static int netdev_linux_get_ipv4(const struct netdev *, struct in_addr *,
424 int cmd, const char *cmd_name);
425 static int get_flags(const struct netdev *, unsigned int *flags);
426 static int set_flags(const char *, unsigned int flags);
427 static int do_get_ifindex(const char *netdev_name);
428 static int get_ifindex(const struct netdev *, int *ifindexp);
429 static int do_set_addr(struct netdev *netdev,
430 int ioctl_nr, const char *ioctl_name,
431 struct in_addr addr);
432 static int get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN]);
433 static int set_etheraddr(const char *netdev_name, const uint8_t[ETH_ADDR_LEN]);
434 static int get_stats_via_netlink(int ifindex, struct netdev_stats *stats);
435 static int get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats);
436 static int af_packet_sock(void);
437 static void netdev_linux_miimon_run(void);
438 static void netdev_linux_miimon_wait(void);
441 is_netdev_linux_class(const struct netdev_class *netdev_class)
443 return netdev_class->init == netdev_linux_init;
447 is_tap_netdev(const struct netdev *netdev)
449 return netdev_get_class(netdev) == &netdev_tap_class;
452 static struct netdev_linux *
453 netdev_linux_cast(const struct netdev *netdev)
455 ovs_assert(is_netdev_linux_class(netdev_get_class(netdev)));
457 return CONTAINER_OF(netdev, struct netdev_linux, up);
460 static struct netdev_rx_linux *
461 netdev_rx_linux_cast(const struct netdev_rx *rx)
463 netdev_rx_assert_class(rx, &netdev_rx_linux_class);
464 return CONTAINER_OF(rx, struct netdev_rx_linux, up);
468 netdev_linux_init(void)
470 static int status = -1;
472 /* Create AF_INET socket. */
473 af_inet_sock = socket(AF_INET, SOCK_DGRAM, 0);
474 status = af_inet_sock >= 0 ? 0 : errno;
476 VLOG_ERR("failed to create inet socket: %s", ovs_strerror(status));
483 netdev_linux_run(void)
485 rtnetlink_link_run();
486 netdev_linux_miimon_run();
490 netdev_linux_wait(void)
492 rtnetlink_link_wait();
493 netdev_linux_miimon_wait();
497 netdev_linux_changed(struct netdev_linux *dev,
498 unsigned int ifi_flags, unsigned int mask)
501 if (!dev->change_seq) {
505 if ((dev->ifi_flags ^ ifi_flags) & IFF_RUNNING) {
506 dev->carrier_resets++;
508 dev->ifi_flags = ifi_flags;
510 dev->cache_valid &= mask;
514 netdev_linux_update(struct netdev_linux *dev,
515 const struct rtnetlink_link_change *change)
517 if (change->nlmsg_type == RTM_NEWLINK) {
519 netdev_linux_changed(dev, change->ifi_flags, VALID_DRVINFO);
521 /* Update netdev from rtnl-change msg. */
523 dev->mtu = change->mtu;
524 dev->cache_valid |= VALID_MTU;
525 dev->netdev_mtu_error = 0;
528 if (!eth_addr_is_zero(change->addr)) {
529 memcpy(dev->etheraddr, change->addr, ETH_ADDR_LEN);
530 dev->cache_valid |= VALID_ETHERADDR;
531 dev->ether_addr_error = 0;
534 dev->ifindex = change->ifi_index;
535 dev->cache_valid |= VALID_IFINDEX;
536 dev->get_ifindex_error = 0;
539 netdev_linux_changed(dev, change->ifi_flags, 0);
544 netdev_linux_cache_cb(const struct rtnetlink_link_change *change,
545 void *aux OVS_UNUSED)
547 struct netdev_linux *dev;
549 struct netdev *base_dev = netdev_from_name(change->ifname);
550 if (base_dev && is_netdev_linux_class(netdev_get_class(base_dev))) {
551 netdev_linux_update(netdev_linux_cast(base_dev), change);
554 struct shash device_shash;
555 struct shash_node *node;
557 shash_init(&device_shash);
558 netdev_get_devices(&netdev_linux_class, &device_shash);
559 SHASH_FOR_EACH (node, &device_shash) {
560 struct netdev *netdev = node->data;
563 dev = netdev_linux_cast(netdev);
565 get_flags(&dev->up, &flags);
566 netdev_linux_changed(dev, flags, 0);
568 shash_destroy(&device_shash);
573 cache_notifier_ref(void)
575 if (!cache_notifier_refcount) {
576 ovs_assert(!netdev_linux_cache_notifier);
578 netdev_linux_cache_notifier =
579 rtnetlink_link_notifier_create(netdev_linux_cache_cb, NULL);
581 if (!netdev_linux_cache_notifier) {
585 cache_notifier_refcount++;
591 cache_notifier_unref(void)
593 ovs_assert(cache_notifier_refcount > 0);
594 if (!--cache_notifier_refcount) {
595 ovs_assert(netdev_linux_cache_notifier);
596 rtnetlink_link_notifier_destroy(netdev_linux_cache_notifier);
597 netdev_linux_cache_notifier = NULL;
601 /* Creates system and internal devices. */
603 netdev_linux_create(const struct netdev_class *class, const char *name,
604 struct netdev **netdevp)
606 struct netdev_linux *netdev;
609 error = cache_notifier_ref();
614 netdev = xzalloc(sizeof *netdev);
615 netdev->change_seq = 1;
616 netdev_init(&netdev->up, name, class);
617 error = get_flags(&netdev->up, &netdev->ifi_flags);
618 if (error == ENODEV) {
619 if (class != &netdev_internal_class) {
620 /* The device does not exist, so don't allow it to be opened. */
621 netdev_uninit(&netdev->up, false);
622 cache_notifier_unref();
626 /* "Internal" netdevs have to be created as netdev objects before
627 * they exist in the kernel, because creating them in the kernel
628 * happens by passing a netdev object to dpif_port_add().
629 * Therefore, ignore the error. */
633 *netdevp = &netdev->up;
637 /* For most types of netdevs we open the device for each call of
638 * netdev_open(). However, this is not the case with tap devices,
639 * since it is only possible to open the device once. In this
640 * situation we share a single file descriptor, and consequently
641 * buffers, across all readers. Therefore once data is read it will
642 * be unavailable to other reads for tap devices. */
644 netdev_linux_create_tap(const struct netdev_class *class OVS_UNUSED,
645 const char *name, struct netdev **netdevp)
647 struct netdev_linux *netdev;
648 struct tap_state *state;
649 static const char tap_dev[] = "/dev/net/tun";
653 netdev = xzalloc(sizeof *netdev);
654 netdev->change_seq = 1;
655 state = &netdev->state.tap;
657 error = cache_notifier_ref();
662 /* Open tap device. */
663 state->fd = open(tap_dev, O_RDWR);
666 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, ovs_strerror(error));
667 goto error_unref_notifier;
670 /* Create tap device. */
671 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
672 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
673 if (ioctl(state->fd, TUNSETIFF, &ifr) == -1) {
674 VLOG_WARN("%s: creating tap device failed: %s", name,
675 ovs_strerror(errno));
680 /* Make non-blocking. */
681 error = set_nonblocking(state->fd);
686 netdev_init(&netdev->up, name, &netdev_tap_class);
687 *netdevp = &netdev->up;
692 error_unref_notifier:
693 cache_notifier_unref();
700 destroy_tap(struct netdev_linux *netdev)
702 struct tap_state *state = &netdev->state.tap;
704 if (state->fd >= 0) {
709 /* Destroys the netdev device 'netdev_'. */
711 netdev_linux_destroy(struct netdev *netdev_)
713 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
715 if (netdev->tc && netdev->tc->ops->tc_destroy) {
716 netdev->tc->ops->tc_destroy(netdev->tc);
719 if (netdev_get_class(netdev_) == &netdev_tap_class) {
724 cache_notifier_unref();
728 netdev_linux_rx_open(struct netdev *netdev_, struct netdev_rx **rxp)
730 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
731 bool is_tap = is_tap_netdev(netdev_);
732 struct netdev_rx_linux *rx;
737 fd = netdev->state.tap.fd;
739 struct sockaddr_ll sll;
741 /* Result of tcpdump -dd inbound */
742 static struct sock_filter filt[] = {
743 { 0x28, 0, 0, 0xfffff004 }, /* ldh [0] */
744 { 0x15, 0, 1, 0x00000004 }, /* jeq #4 jt 2 jf 3 */
745 { 0x6, 0, 0, 0x00000000 }, /* ret #0 */
746 { 0x6, 0, 0, 0x0000ffff } /* ret #65535 */
748 static struct sock_fprog fprog = { ARRAY_SIZE(filt), filt };
750 /* Create file descriptor. */
751 fd = socket(PF_PACKET, SOCK_RAW, 0);
754 VLOG_ERR("failed to create raw socket (%s)", ovs_strerror(error));
758 /* Set non-blocking mode. */
759 error = set_nonblocking(fd);
764 /* Get ethernet device index. */
765 error = get_ifindex(&netdev->up, &ifindex);
770 /* Bind to specific ethernet device. */
771 memset(&sll, 0, sizeof sll);
772 sll.sll_family = AF_PACKET;
773 sll.sll_ifindex = ifindex;
774 sll.sll_protocol = (OVS_FORCE unsigned short int) htons(ETH_P_ALL);
775 if (bind(fd, (struct sockaddr *) &sll, sizeof sll) < 0) {
777 VLOG_ERR("%s: failed to bind raw socket (%s)",
778 netdev_get_name(netdev_), ovs_strerror(error));
782 /* Filter for only inbound packets. */
783 error = setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER, &fprog,
787 VLOG_ERR("%s: failed attach filter (%s)",
788 netdev_get_name(netdev_), ovs_strerror(error));
793 rx = xmalloc(sizeof *rx);
794 netdev_rx_init(&rx->up, netdev_, &netdev_rx_linux_class);
809 netdev_rx_linux_destroy(struct netdev_rx *rx_)
811 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
820 netdev_rx_linux_recv(struct netdev_rx *rx_, void *data, size_t size)
822 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
827 ? read(rx->fd, data, size)
828 : recv(rx->fd, data, size, MSG_TRUNC));
829 } while (retval < 0 && errno == EINTR);
832 return retval > size ? -EMSGSIZE : retval;
834 if (errno != EAGAIN) {
/* Argument order must follow the format string: device name first,
 * then the error text (same order as the send-path warning). */
835 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
836 netdev_rx_get_name(rx_), ovs_strerror(errno));
843 netdev_rx_linux_wait(struct netdev_rx *rx_)
845 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
846 poll_fd_wait(rx->fd, POLLIN);
850 netdev_rx_linux_drain(struct netdev_rx *rx_)
852 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
855 int error = netdev_linux_do_ioctl(netdev_rx_get_name(rx_), &ifr,
856 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
860 drain_fd(rx->fd, ifr.ifr_qlen);
863 return drain_rcvbuf(rx->fd);
867 /* Sends 'buffer' on 'netdev'. Returns 0 if successful, otherwise a positive
868 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
869 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
870 * the packet is too big or too small to transmit on the device.
872 * The caller retains ownership of 'buffer' in all cases.
874 * The kernel maintains a packet transmission queue, so the caller is not
875 * expected to do additional queuing of packets. */
877 netdev_linux_send(struct netdev *netdev_, const void *data, size_t size)
882 if (!is_tap_netdev(netdev_)) {
883 /* Use our AF_PACKET socket to send to this device. */
884 struct sockaddr_ll sll;
891 sock = af_packet_sock();
896 error = get_ifindex(netdev_, &ifindex);
901 /* We don't bother setting most fields in sockaddr_ll because the
902 * kernel ignores them for SOCK_RAW. */
903 memset(&sll, 0, sizeof sll);
904 sll.sll_family = AF_PACKET;
905 sll.sll_ifindex = ifindex;
907 iov.iov_base = CONST_CAST(void *, data);
911 msg.msg_namelen = sizeof sll;
914 msg.msg_control = NULL;
915 msg.msg_controllen = 0;
918 retval = sendmsg(sock, &msg, 0);
920 /* Use the tap fd to send to this device. This is essential for
921 * tap devices, because packets sent to a tap device with an
922 * AF_PACKET socket will loop back to be *received* again on the
923 * tap device. This doesn't occur on other interface types
924 * because we attach a socket filter to the rx socket. */
925 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
927 retval = write(netdev->state.tap.fd, data, size);
931 /* The Linux AF_PACKET implementation never blocks waiting for room
932 * for packets, instead returning ENOBUFS. Translate this into
933 * EAGAIN for the caller. */
934 if (errno == ENOBUFS) {
936 } else if (errno == EINTR) {
938 } else if (errno != EAGAIN) {
939 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
940 netdev_get_name(netdev_), ovs_strerror(errno));
943 } else if (retval != size) {
944 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%zd bytes of "
945 "%zu) on %s", retval, size, netdev_get_name(netdev_));
953 /* Registers with the poll loop to wake up from the next call to poll_block()
954 * when the packet transmission queue has sufficient room to transmit a packet
955 * with netdev_send().
957 * The kernel maintains a packet transmission queue, so the client is not
958 * expected to do additional queuing of packets. Thus, this function is
959 * unlikely to ever be used. It is included for completeness. */
961 netdev_linux_send_wait(struct netdev *netdev)
963 if (is_tap_netdev(netdev)) {
964 /* TAP device always accepts packets.*/
965 poll_immediate_wake();
969 /* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
970 * otherwise a positive errno value. */
972 netdev_linux_set_etheraddr(struct netdev *netdev_,
973 const uint8_t mac[ETH_ADDR_LEN])
975 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
976 struct netdev_saved_flags *sf = NULL;
979 if (netdev->cache_valid & VALID_ETHERADDR) {
980 if (netdev->ether_addr_error) {
981 return netdev->ether_addr_error;
983 if (eth_addr_equals(netdev->etheraddr, mac)) {
986 netdev->cache_valid &= ~VALID_ETHERADDR;
989 /* Tap devices must be brought down before setting the address. */
990 if (is_tap_netdev(netdev_)) {
991 netdev_turn_flags_off(netdev_, NETDEV_UP, &sf);
993 error = set_etheraddr(netdev_get_name(netdev_), mac);
994 if (!error || error == ENODEV) {
995 netdev->ether_addr_error = error;
996 netdev->cache_valid |= VALID_ETHERADDR;
998 memcpy(netdev->etheraddr, mac, ETH_ADDR_LEN);
1002 netdev_restore_flags(sf);
1007 /* Copies 'netdev''s MAC address to 'mac' which is passed as param. */
1009 netdev_linux_get_etheraddr(const struct netdev *netdev_,
1010 uint8_t mac[ETH_ADDR_LEN])
1012 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1014 if (!(netdev->cache_valid & VALID_ETHERADDR)) {
1015 int error = get_etheraddr(netdev_get_name(netdev_),
1018 netdev->ether_addr_error = error;
1019 netdev->cache_valid |= VALID_ETHERADDR;
1022 if (!netdev->ether_addr_error) {
1023 memcpy(mac, netdev->etheraddr, ETH_ADDR_LEN);
1026 return netdev->ether_addr_error;
1029 /* Returns the maximum size of transmitted (and received) packets on 'netdev',
1030 * in bytes, not including the hardware header; thus, this is typically 1500
1031 * bytes for Ethernet devices. */
1033 netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
1035 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1036 if (!(netdev->cache_valid & VALID_MTU)) {
1040 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
1041 SIOCGIFMTU, "SIOCGIFMTU");
1043 netdev->netdev_mtu_error = error;
1044 netdev->mtu = ifr.ifr_mtu;
1045 netdev->cache_valid |= VALID_MTU;
1048 if (!netdev->netdev_mtu_error) {
1049 *mtup = netdev->mtu;
1051 return netdev->netdev_mtu_error;
1054 /* Sets the maximum size of transmitted packets (MTU) for given device using linux
1055 * networking ioctl interface.
1058 netdev_linux_set_mtu(const struct netdev *netdev_, int mtu)
1060 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1064 if (netdev->cache_valid & VALID_MTU) {
1065 if (netdev->netdev_mtu_error) {
1066 return netdev->netdev_mtu_error;
1068 if (netdev->mtu == mtu) {
1071 netdev->cache_valid &= ~VALID_MTU;
1074 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
1075 SIOCSIFMTU, "SIOCSIFMTU");
1076 if (!error || error == ENODEV) {
1077 netdev->netdev_mtu_error = error;
1078 netdev->mtu = ifr.ifr_mtu;
1079 netdev->cache_valid |= VALID_MTU;
1084 /* Returns the ifindex of 'netdev', if successful, as a positive number.
1085 * On failure, returns a negative errno value. */
1087 netdev_linux_get_ifindex(const struct netdev *netdev)
1091 error = get_ifindex(netdev, &ifindex);
1092 return error ? -error : ifindex;
1096 netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
1098 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1100 if (netdev->miimon_interval > 0) {
1101 *carrier = netdev->miimon;
1103 *carrier = (netdev->ifi_flags & IFF_RUNNING) != 0;
1109 static long long int
1110 netdev_linux_get_carrier_resets(const struct netdev *netdev)
1112 return netdev_linux_cast(netdev)->carrier_resets;
1116 netdev_linux_do_miimon(const char *name, int cmd, const char *cmd_name,
1117 struct mii_ioctl_data *data)
1122 memset(&ifr, 0, sizeof ifr);
1123 memcpy(&ifr.ifr_data, data, sizeof *data);
1124 error = netdev_linux_do_ioctl(name, &ifr, cmd, cmd_name);
1125 memcpy(data, &ifr.ifr_data, sizeof *data);
1131 netdev_linux_get_miimon(const char *name, bool *miimon)
1133 struct mii_ioctl_data data;
1138 memset(&data, 0, sizeof data);
1139 error = netdev_linux_do_miimon(name, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
1141 /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
1142 data.reg_num = MII_BMSR;
1143 error = netdev_linux_do_miimon(name, SIOCGMIIREG, "SIOCGMIIREG",
1147 *miimon = !!(data.val_out & BMSR_LSTATUS);
1149 VLOG_WARN_RL(&rl, "%s: failed to query MII", name);
1152 struct ethtool_cmd ecmd;
1154 VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
1157 COVERAGE_INC(netdev_get_ethtool);
1158 memset(&ecmd, 0, sizeof ecmd);
1159 error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
1162 struct ethtool_value eval;
1164 memcpy(&eval, &ecmd, sizeof eval);
1165 *miimon = !!eval.data;
1167 VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
1175 netdev_linux_set_miimon_interval(struct netdev *netdev_,
1176 long long int interval)
1178 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1180 interval = interval > 0 ? MAX(interval, 100) : 0;
1181 if (netdev->miimon_interval != interval) {
1182 netdev->miimon_interval = interval;
1183 timer_set_expired(&netdev->miimon_timer);
/* Walks every device of class netdev_linux_class and, for those that pass
 * the interval/timer check, re-reads the link status with
 * netdev_linux_get_miimon().  When the new status differs from the cached
 * 'dev->miimon', the cache is updated and netdev_linux_changed() is called.
 * The miimon timer is then re-armed for 'miimon_interval'. */
1190 netdev_linux_miimon_run(void)
1192 struct shash device_shash;
1193 struct shash_node *node;
1195 shash_init(&device_shash);
1196 netdev_get_devices(&netdev_linux_class, &device_shash);
1197 SHASH_FOR_EACH (node, &device_shash) {
1198 struct netdev *netdev = node->data;
1199 struct netdev_linux *dev = netdev_linux_cast(netdev);
/* True when polling is disabled for this device or its timer has not yet
 * expired.  The body of this branch is not visible here (presumably it
 * skips the device). */
1202 if (dev->miimon_interval <= 0 || !timer_expired(&dev->miimon_timer)) {
1206 netdev_linux_get_miimon(dev->up.name, &miimon);
1207 if (miimon != dev->miimon) {
1208 dev->miimon = miimon;
1209 netdev_linux_changed(dev, dev->ifi_flags, 0);
1212 timer_set_duration(&dev->miimon_timer, dev->miimon_interval);
1215 shash_destroy(&device_shash);
/* Calls timer_wait() on the miimon timer of every netdev_linux_class device
 * whose 'miimon_interval' is positive. */
1219 netdev_linux_miimon_wait(void)
1221 struct shash device_shash;
1222 struct shash_node *node;
1224 shash_init(&device_shash);
1225 netdev_get_devices(&netdev_linux_class, &device_shash);
1226 SHASH_FOR_EACH (node, &device_shash) {
1227 struct netdev *netdev = node->data;
1228 struct netdev_linux *dev = netdev_linux_cast(netdev);
1230 if (dev->miimon_interval > 0) {
1231 timer_wait(&dev->miimon_timer);
1234 shash_destroy(&device_shash);
1237 /* Check whether we can use RTM_GETLINK to get network device statistics.
1238 * In pre-2.6.19 kernels, this was only available if wireless extensions were
 * enabled. */
/* Probes the "lo" device: looks up its ifindex and passes it to
 * get_stats_via_netlink().  The log messages show the outcomes: statistics
 * come from rtnetlink when the probe works and from proc when either the
 * ifindex lookup or RTM_GETLINK fails.  The conditions and return statements
 * are not visible in this extract. */
1241 check_for_working_netlink_stats(void)
1243 /* Decide on the netdev_get_stats() implementation to use. Netlink is
1244 * preferable, so if that works, we'll use it. */
1245 int ifindex = do_get_ifindex("lo");
1247 VLOG_WARN("failed to get ifindex for lo, "
1248 "obtaining netdev stats from proc");
1251 struct netdev_stats stats;
1252 int error = get_stats_via_netlink(ifindex, &stats);
1254 VLOG_DBG("obtaining netdev stats via rtnetlink");
1257 VLOG_INFO("RTM_GETLINK failed (%s), obtaining netdev stats "
1258 "via proc (you are probably running a pre-2.6.19 "
1259 "kernel)", ovs_strerror(error));
/* NOTE(review): the body is not visible in this extract.  Judging by the
 * name and by the calls in netdev_tap_get_stats(), this presumably exchanges
 * '*a' and '*b' -- confirm against the full source. */
1266 swap_uint64(uint64_t *a, uint64_t *b)
1273 /* Copies 'src' into 'dst', performing format conversion in the process.
1275 * 'src' is allowed to be misaligned. */
/* Only the eight packet/byte/error/drop counters are taken from 'src', each
 * read with get_unaligned_u64().  The other 'dst' members assigned below are
 * set to 0 because 'src' has no corresponding field read here. */
1277 netdev_stats_from_ovs_vport_stats(struct netdev_stats *dst,
1278 const struct ovs_vport_stats *src)
1280 dst->rx_packets = get_unaligned_u64(&src->rx_packets);
1281 dst->tx_packets = get_unaligned_u64(&src->tx_packets);
1282 dst->rx_bytes = get_unaligned_u64(&src->rx_bytes);
1283 dst->tx_bytes = get_unaligned_u64(&src->tx_bytes);
1284 dst->rx_errors = get_unaligned_u64(&src->rx_errors);
1285 dst->tx_errors = get_unaligned_u64(&src->tx_errors);
1286 dst->rx_dropped = get_unaligned_u64(&src->rx_dropped);
1287 dst->tx_dropped = get_unaligned_u64(&src->tx_dropped);
1289 dst->collisions = 0;
1290 dst->rx_length_errors = 0;
1291 dst->rx_over_errors = 0;
1292 dst->rx_crc_errors = 0;
1293 dst->rx_frame_errors = 0;
1294 dst->rx_fifo_errors = 0;
1295 dst->rx_missed_errors = 0;
1296 dst->tx_aborted_errors = 0;
1297 dst->tx_carrier_errors = 0;
1298 dst->tx_fifo_errors = 0;
1299 dst->tx_heartbeat_errors = 0;
1300 dst->tx_window_errors = 0;
/* Looks up the vport named after 'netdev' with dpif_linux_vport_get() and
 * converts the statistics in the reply into '*stats'.
 *
 * A reply that carries no statistics ('reply.stats' is null) is treated as a
 * separate case; the value returned for it, and the release of 'buf', are
 * not visible in this extract. */
1304 get_stats_via_vport__(const struct netdev *netdev, struct netdev_stats *stats)
1306 struct dpif_linux_vport reply;
1310 error = dpif_linux_vport_get(netdev_get_name(netdev), &reply, &buf);
1313 } else if (!reply.stats) {
1318 netdev_stats_from_ovs_vport_stats(stats, reply.stats);
/* Refreshes '*stats' from the vport layer and records the outcome in
 * 'netdev->vport_stats_error' (0 on success), marking it valid with
 * VALID_VPORT_STAT_ERROR.
 *
 * The query is skipped only when a nonzero error is already cached and still
 * marked valid.  Failures other than ENOENT are logged (rate-limited). */
1326 get_stats_via_vport(const struct netdev *netdev_,
1327 struct netdev_stats *stats)
1329 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1331 if (!netdev->vport_stats_error ||
1332 !(netdev->cache_valid & VALID_VPORT_STAT_ERROR)) {
1335 error = get_stats_via_vport__(netdev_, stats);
1336 if (error && error != ENOENT) {
1337 VLOG_WARN_RL(&rl, "%s: obtaining netdev stats via vport failed "
1339 netdev_get_name(netdev_), ovs_strerror(error));
1341 netdev->vport_stats_error = error;
1342 netdev->cache_valid |= VALID_VPORT_STAT_ERROR;
/* Reads the statistics of 'netdev_' from the kernel into '*stats'.
 *
 * The first call decides, once per process, whether rtnetlink statistics
 * work (check_for_working_netlink_stats()).  If they do, the device's
 * ifindex is looked up and passed to get_stats_via_netlink(); otherwise
 * get_stats_via_proc() is used with the device name.  A failure is logged
 * with its numeric error code. */
1347 netdev_linux_sys_get_stats(const struct netdev *netdev_,
1348 struct netdev_stats *stats)
1350 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
1351 static int use_netlink_stats;
1354 if (ovsthread_once_start(&once)) {
1355 use_netlink_stats = check_for_working_netlink_stats();
1356 ovsthread_once_done(&once);
1359 if (use_netlink_stats) {
1362 error = get_ifindex(netdev_, &ifindex);
1364 error = get_stats_via_netlink(ifindex, stats);
1367 error = get_stats_via_proc(netdev_get_name(netdev_), stats);
1371 VLOG_WARN_RL(&rl, "%s: linux-sys get stats failed %d",
1372 netdev_get_name(netdev_), error);
1378 /* Retrieves current device stats for 'netdev-linux'. */
/* Two sources are combined: the vport layer (written straight into '*stats')
 * and the kernel device counters (read into the local 'dev_stats').  Which
 * source wins depends on 'netdev->vport_stats_error'.  The additions below
 * fold the kernel's error, drop, multicast and collision counters into
 * '*stats'; presumably that happens only when vport stats were obtained,
 * but the enclosing else is not visible in this extract. */
1380 netdev_linux_get_stats(const struct netdev *netdev_,
1381 struct netdev_stats *stats)
1383 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1384 struct netdev_stats dev_stats;
1387 get_stats_via_vport(netdev_, stats);
1389 error = netdev_linux_sys_get_stats(netdev_, &dev_stats);
1392 if (netdev->vport_stats_error) {
1399 if (netdev->vport_stats_error) {
1400 /* stats not available from OVS then use ioctl stats. */
1403 stats->rx_errors += dev_stats.rx_errors;
1404 stats->tx_errors += dev_stats.tx_errors;
1405 stats->rx_dropped += dev_stats.rx_dropped;
1406 stats->tx_dropped += dev_stats.tx_dropped;
1407 stats->multicast += dev_stats.multicast;
1408 stats->collisions += dev_stats.collisions;
1409 stats->rx_length_errors += dev_stats.rx_length_errors;
1410 stats->rx_over_errors += dev_stats.rx_over_errors;
1411 stats->rx_crc_errors += dev_stats.rx_crc_errors;
1412 stats->rx_frame_errors += dev_stats.rx_frame_errors;
1413 stats->rx_fifo_errors += dev_stats.rx_fifo_errors;
1414 stats->rx_missed_errors += dev_stats.rx_missed_errors;
1415 stats->tx_aborted_errors += dev_stats.tx_aborted_errors;
1416 stats->tx_carrier_errors += dev_stats.tx_carrier_errors;
1417 stats->tx_fifo_errors += dev_stats.tx_fifo_errors;
1418 stats->tx_heartbeat_errors += dev_stats.tx_heartbeat_errors;
1419 stats->tx_window_errors += dev_stats.tx_window_errors;
1424 /* Retrieves current device stats for 'netdev-tap' netdev or
1425 * netdev-internal. */
/* As in netdev_linux_get_stats(), vport statistics go straight into '*stats'
 * and kernel counters into the local 'dev_stats'.  The difference is
 * direction: kernel-side rx and tx are exchanged before they are reported,
 * either by swapping the fields of '*stats' or by adding 'dev_stats'
 * counters crosswise. */
1427 netdev_tap_get_stats(const struct netdev *netdev_,
1428 struct netdev_stats *stats)
1430 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1431 struct netdev_stats dev_stats;
1434 get_stats_via_vport(netdev_, stats);
1436 error = netdev_linux_sys_get_stats(netdev_, &dev_stats);
1438 if (netdev->vport_stats_error) {
1445 /* If this port is an internal port then the transmit and receive stats
1446 * will appear to be swapped relative to the other ports since we are the
1447 * one sending the data, not a remote computer. For consistency, we swap
1448 * them back here. This does not apply if we are getting stats from the
1449 * vport layer because it always tracks stats from the perspective of the
 * switch. */
1451 if (netdev->vport_stats_error) {
1453 swap_uint64(&stats->rx_packets, &stats->tx_packets);
1454 swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
1455 swap_uint64(&stats->rx_errors, &stats->tx_errors);
1456 swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
/* The detailed error counters have no swapped counterpart, so they are
 * cleared rather than reported in the wrong direction. */
1457 stats->rx_length_errors = 0;
1458 stats->rx_over_errors = 0;
1459 stats->rx_crc_errors = 0;
1460 stats->rx_frame_errors = 0;
1461 stats->rx_fifo_errors = 0;
1462 stats->rx_missed_errors = 0;
1463 stats->tx_aborted_errors = 0;
1464 stats->tx_carrier_errors = 0;
1465 stats->tx_fifo_errors = 0;
1466 stats->tx_heartbeat_errors = 0;
1467 stats->tx_window_errors = 0;
/* Kernel drop and error counters are added crosswise: rx gets the kernel's
 * tx value and vice versa. */
1469 stats->rx_dropped += dev_stats.tx_dropped;
1470 stats->tx_dropped += dev_stats.rx_dropped;
1472 stats->rx_errors += dev_stats.tx_errors;
1473 stats->tx_errors += dev_stats.rx_errors;
1475 stats->multicast += dev_stats.multicast;
1476 stats->collisions += dev_stats.collisions;
/* Retrieves statistics for an internal device from the vport layer only.
 * Returns 'netdev->vport_stats_error' as left by get_stats_via_vport(), so
 * 0 means '*stats' was filled in from the vport layer. */
1482 netdev_internal_get_stats(const struct netdev *netdev_,
1483 struct netdev_stats *stats)
1485 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1487 get_stats_via_vport(netdev_, stats);
1488 return netdev->vport_stats_error;
/* Sends the eight packet/byte/error/drop counters from 'stats' to the vport
 * layer in an OVS_VPORT_CMD_SET request addressed by the name of 'netdev'.
 * No other member of 'stats' is transmitted.  ENODEV from the transaction
 * is special-cased (see the comment below); the value returned in that case
 * is not visible in this extract. */
1492 netdev_internal_set_stats(struct netdev *netdev,
1493 const struct netdev_stats *stats)
1495 struct ovs_vport_stats vport_stats;
1496 struct dpif_linux_vport vport;
1499 vport_stats.rx_packets = stats->rx_packets;
1500 vport_stats.tx_packets = stats->tx_packets;
1501 vport_stats.rx_bytes = stats->rx_bytes;
1502 vport_stats.tx_bytes = stats->tx_bytes;
1503 vport_stats.rx_errors = stats->rx_errors;
1504 vport_stats.tx_errors = stats->tx_errors;
1505 vport_stats.rx_dropped = stats->rx_dropped;
1506 vport_stats.tx_dropped = stats->tx_dropped;
1508 dpif_linux_vport_init(&vport);
1509 vport.cmd = OVS_VPORT_CMD_SET;
1510 vport.name = netdev_get_name(netdev);
1511 vport.stats = &vport_stats;
1513 err = dpif_linux_vport_transact(&vport, NULL, NULL);
1515 /* If the vport layer doesn't know about the device, that doesn't mean it
1516 * doesn't exist (after all, we were able to open it when netdev_open() was
1517 * called), it just means that it isn't attached and we'll be getting
1518 * stats a different way. */
1519 if (err == ENODEV) {
/* Fills the cached feature bitmaps of 'netdev' ('supported', 'advertised',
 * 'current', 'peer') from an ETHTOOL_GSET query and records the query's
 * result in 'netdev->get_features_error'.
 *
 * The work is done at most once per cache lifetime: when VALID_FEATURES is
 * already set the function takes the early branch at the top, and the flag
 * is set again at the bottom together with the stored error. */
1527 netdev_linux_read_features(struct netdev_linux *netdev)
1529 struct ethtool_cmd ecmd;
1533 if (netdev->cache_valid & VALID_FEATURES) {
1537 COVERAGE_INC(netdev_get_ethtool);
1538 memset(&ecmd, 0, sizeof ecmd);
1539 error = netdev_linux_do_ethtool(netdev->up.name, &ecmd,
1540 ETHTOOL_GSET, "ETHTOOL_GSET");
1545 /* Supported features. */
1546 netdev->supported = 0;
1547 if (ecmd.supported & SUPPORTED_10baseT_Half) {
1548 netdev->supported |= NETDEV_F_10MB_HD;
1550 if (ecmd.supported & SUPPORTED_10baseT_Full) {
1551 netdev->supported |= NETDEV_F_10MB_FD;
1553 if (ecmd.supported & SUPPORTED_100baseT_Half) {
1554 netdev->supported |= NETDEV_F_100MB_HD;
1556 if (ecmd.supported & SUPPORTED_100baseT_Full) {
1557 netdev->supported |= NETDEV_F_100MB_FD;
1559 if (ecmd.supported & SUPPORTED_1000baseT_Half) {
1560 netdev->supported |= NETDEV_F_1GB_HD;
1562 if (ecmd.supported & SUPPORTED_1000baseT_Full) {
1563 netdev->supported |= NETDEV_F_1GB_FD;
1565 if (ecmd.supported & SUPPORTED_10000baseT_Full) {
1566 netdev->supported |= NETDEV_F_10GB_FD;
1568 if (ecmd.supported & SUPPORTED_TP) {
1569 netdev->supported |= NETDEV_F_COPPER;
1571 if (ecmd.supported & SUPPORTED_FIBRE) {
1572 netdev->supported |= NETDEV_F_FIBER;
1574 if (ecmd.supported & SUPPORTED_Autoneg) {
1575 netdev->supported |= NETDEV_F_AUTONEG;
1577 if (ecmd.supported & SUPPORTED_Pause) {
1578 netdev->supported |= NETDEV_F_PAUSE;
1580 if (ecmd.supported & SUPPORTED_Asym_Pause) {
1581 netdev->supported |= NETDEV_F_PAUSE_ASYM;
1584 /* Advertised features. */
1585 netdev->advertised = 0;
1586 if (ecmd.advertising & ADVERTISED_10baseT_Half) {
1587 netdev->advertised |= NETDEV_F_10MB_HD;
1589 if (ecmd.advertising & ADVERTISED_10baseT_Full) {
1590 netdev->advertised |= NETDEV_F_10MB_FD;
1592 if (ecmd.advertising & ADVERTISED_100baseT_Half) {
1593 netdev->advertised |= NETDEV_F_100MB_HD;
1595 if (ecmd.advertising & ADVERTISED_100baseT_Full) {
1596 netdev->advertised |= NETDEV_F_100MB_FD;
1598 if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
1599 netdev->advertised |= NETDEV_F_1GB_HD;
1601 if (ecmd.advertising & ADVERTISED_1000baseT_Full) {
1602 netdev->advertised |= NETDEV_F_1GB_FD;
1604 if (ecmd.advertising & ADVERTISED_10000baseT_Full) {
1605 netdev->advertised |= NETDEV_F_10GB_FD;
1607 if (ecmd.advertising & ADVERTISED_TP) {
1608 netdev->advertised |= NETDEV_F_COPPER;
1610 if (ecmd.advertising & ADVERTISED_FIBRE) {
1611 netdev->advertised |= NETDEV_F_FIBER;
1613 if (ecmd.advertising & ADVERTISED_Autoneg) {
1614 netdev->advertised |= NETDEV_F_AUTONEG;
1616 if (ecmd.advertising & ADVERTISED_Pause) {
1617 netdev->advertised |= NETDEV_F_PAUSE;
1619 if (ecmd.advertising & ADVERTISED_Asym_Pause) {
1620 netdev->advertised |= NETDEV_F_PAUSE_ASYM;
1623 /* Current settings. */
/* 'speed' is assigned on a line that is not visible in this extract.  Duplex
 * selects between the half- and full-duplex bit up to 1 Gb/s; from 10 Gb/s
 * upward only a full-duplex bit is used.  The speeds above 10 Gb/s are
 * compared as numeric literals rather than SPEED_* macros. */
1625 if (speed == SPEED_10) {
1626 netdev->current = ecmd.duplex ? NETDEV_F_10MB_FD : NETDEV_F_10MB_HD;
1627 } else if (speed == SPEED_100) {
1628 netdev->current = ecmd.duplex ? NETDEV_F_100MB_FD : NETDEV_F_100MB_HD;
1629 } else if (speed == SPEED_1000) {
1630 netdev->current = ecmd.duplex ? NETDEV_F_1GB_FD : NETDEV_F_1GB_HD;
1631 } else if (speed == SPEED_10000) {
1632 netdev->current = NETDEV_F_10GB_FD;
1633 } else if (speed == 40000) {
1634 netdev->current = NETDEV_F_40GB_FD;
1635 } else if (speed == 100000) {
1636 netdev->current = NETDEV_F_100GB_FD;
1637 } else if (speed == 1000000) {
1638 netdev->current = NETDEV_F_1TB_FD;
1640 netdev->current = 0;
1643 if (ecmd.port == PORT_TP) {
1644 netdev->current |= NETDEV_F_COPPER;
1645 } else if (ecmd.port == PORT_FIBRE) {
1646 netdev->current |= NETDEV_F_FIBER;
1650 netdev->current |= NETDEV_F_AUTONEG;
1653 /* Peer advertisements. */
1654 netdev->peer = 0; /* XXX */
/* The error is cached along with the data, so a failed query is not retried
 * while VALID_FEATURES stays set. */
1657 netdev->cache_valid |= VALID_FEATURES;
1658 netdev->get_features_error = error;
1661 /* Stores the features supported by 'netdev' into '*current',
1662 * '*advertised', '*supported', and '*peer'. All four are written
1663 * unconditionally on success, so none of them may be null. Each value is a
 * bitmap of NETDEV_* bits. Returns 0 if successful, otherwise a positive
 * errno value. */
1666 netdev_linux_get_features(const struct netdev *netdev_,
1667 enum netdev_features *current,
1668 enum netdev_features *advertised,
1669 enum netdev_features *supported,
1670 enum netdev_features *peer)
1672 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
/* Populates (or reuses) the cached feature bitmaps and the cached error. */
1674 netdev_linux_read_features(netdev);
/* The outputs are written only on success, and all four are dereferenced
 * without a null check. */
1676 if (!netdev->get_features_error) {
1677 *current = netdev->current;
1678 *advertised = netdev->advertised;
1679 *supported = netdev->supported;
1680 *peer = netdev->peer;
1682 return netdev->get_features_error;
1685 /* Set the features advertised by 'netdev' to 'advertise'. */
/* This is a read-modify-write: the current settings are fetched with
 * ETHTOOL_GSET, the 'advertising' mask is rebuilt from scratch out of the
 * NETDEV_F_* bits in 'advertise', and the result is written back with
 * ETHTOOL_SSET, whose result is returned.  NETDEV_F_* bits with no mapping
 * below (for example the speeds above 10 Gb/s) are ignored. */
1687 netdev_linux_set_advertisements(struct netdev *netdev,
1688 enum netdev_features advertise)
1690 struct ethtool_cmd ecmd;
1693 COVERAGE_INC(netdev_get_ethtool);
1694 memset(&ecmd, 0, sizeof ecmd);
1695 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1696 ETHTOOL_GSET, "ETHTOOL_GSET");
1701 ecmd.advertising = 0;
1702 if (advertise & NETDEV_F_10MB_HD) {
1703 ecmd.advertising |= ADVERTISED_10baseT_Half;
1705 if (advertise & NETDEV_F_10MB_FD) {
1706 ecmd.advertising |= ADVERTISED_10baseT_Full;
1708 if (advertise & NETDEV_F_100MB_HD) {
1709 ecmd.advertising |= ADVERTISED_100baseT_Half;
1711 if (advertise & NETDEV_F_100MB_FD) {
1712 ecmd.advertising |= ADVERTISED_100baseT_Full;
1714 if (advertise & NETDEV_F_1GB_HD) {
1715 ecmd.advertising |= ADVERTISED_1000baseT_Half;
1717 if (advertise & NETDEV_F_1GB_FD) {
1718 ecmd.advertising |= ADVERTISED_1000baseT_Full;
1720 if (advertise & NETDEV_F_10GB_FD) {
1721 ecmd.advertising |= ADVERTISED_10000baseT_Full;
1723 if (advertise & NETDEV_F_COPPER) {
1724 ecmd.advertising |= ADVERTISED_TP;
1726 if (advertise & NETDEV_F_FIBER) {
1727 ecmd.advertising |= ADVERTISED_FIBRE;
1729 if (advertise & NETDEV_F_AUTONEG) {
1730 ecmd.advertising |= ADVERTISED_Autoneg;
1732 if (advertise & NETDEV_F_PAUSE) {
1733 ecmd.advertising |= ADVERTISED_Pause;
1735 if (advertise & NETDEV_F_PAUSE_ASYM) {
1736 ecmd.advertising |= ADVERTISED_Asym_Pause;
1738 COVERAGE_INC(netdev_set_ethtool);
1739 return netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1740 ETHTOOL_SSET, "ETHTOOL_SSET");
1743 /* Attempts to set input rate limiting (policing) policy. Returns 0 if
1744 * successful, otherwise a positive errno value. */
/* 'kbits_rate' and 'kbits_burst' are the requested rate and burst; the burst
 * is normalized first (0 when no rate is given, 1000 when unspecified).
 *
 * The last applied settings are cached under VALID_POLICING: a cached error
 * is returned as is, and a request equal to the cached rate and burst is
 * assumed to be in effect already.  Otherwise the ingress qdisc is removed
 * and then, as the log messages show, a new qdisc and a policer action are
 * added.  The conditions guarding those steps are not visible in this
 * extract. */
1746 netdev_linux_set_policing(struct netdev *netdev_,
1747 uint32_t kbits_rate, uint32_t kbits_burst)
1749 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1750 const char *netdev_name = netdev_get_name(netdev_);
1754 kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
1755 : !kbits_burst ? 1000 /* Default to 1000 kbits if 0. */
1756 : kbits_burst); /* Stick with user-specified value. */
1758 if (netdev->cache_valid & VALID_POLICING) {
1759 if (netdev->netdev_policing_error) {
1760 return netdev->netdev_policing_error;
1763 if (netdev->kbits_rate == kbits_rate &&
1764 netdev->kbits_burst == kbits_burst) {
1765 /* Assume that settings haven't changed since we last set them. */
1768 netdev->cache_valid &= ~VALID_POLICING;
1771 COVERAGE_INC(netdev_set_policing);
1772 /* Remove any existing ingress qdisc. */
1773 error = tc_add_del_ingress_qdisc(netdev_, false);
1775 VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
1776 netdev_name, ovs_strerror(error));
1781 error = tc_add_del_ingress_qdisc(netdev_, true);
1783 VLOG_WARN_RL(&rl, "%s: adding policing qdisc failed: %s",
1784 netdev_name, ovs_strerror(error));
1788 error = tc_add_policer(netdev_, kbits_rate, kbits_burst);
1790 VLOG_WARN_RL(&rl, "%s: adding policing action failed: %s",
1791 netdev_name, ovs_strerror(error));
1796 netdev->kbits_rate = kbits_rate;
1797 netdev->kbits_burst = kbits_burst;
/* Only success and ENODEV are cached; any other error leaves the cache
 * invalid, so the next call tries again. */
1800 if (!error || error == ENODEV) {
1801 netdev->netdev_policing_error = error;
1802 netdev->cache_valid |= VALID_POLICING;
/* Adds to 'types' the 'ovs_name' of every entry in the null-terminated
 * 'tcs' table that has a 'tc_install' callback and a non-empty name.
 * 'netdev' is unused. */
1808 netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
1811 const struct tc_ops *const *opsp;
1813 for (opsp = tcs; *opsp != NULL; opsp++) {
1814 const struct tc_ops *ops = *opsp;
1815 if (ops->tc_install && ops->ovs_name[0] != '\0') {
1816 sset_add(types, ops->ovs_name);
/* Searches the null-terminated 'tcs' table for the entry whose 'ovs_name'
 * equals 'name'.  The return statements are not visible in this extract;
 * callers test the result against null. */
1822 static const struct tc_ops *
1823 tc_lookup_ovs_name(const char *name)
1825 const struct tc_ops *const *opsp;
1827 for (opsp = tcs; *opsp != NULL; opsp++) {
1828 const struct tc_ops *ops = *opsp;
1829 if (!strcmp(name, ops->ovs_name)) {
/* Searches the null-terminated 'tcs' table for the entry whose 'linux_name'
 * equals 'name'.  Entries with a null 'linux_name' are skipped (unlike
 * 'ovs_name', which tc_lookup_ovs_name() compares without a null check).
 * The return statements are not visible in this extract. */
1836 static const struct tc_ops *
1837 tc_lookup_linux_name(const char *name)
1839 const struct tc_ops *const *opsp;
1841 for (opsp = tcs; *opsp != NULL; opsp++) {
1842 const struct tc_ops *ops = *opsp;
1843 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
/* Searches the hash bucket selected by 'hash' in 'netdev->tc->queues' for
 * the queue whose 'queue_id' equals 'queue_id'.  ('hash' is a parameter
 * declared on a line not visible in this extract.)  'netdev->tc' is
 * dereferenced without a null check. */
1850 static struct tc_queue *
1851 tc_find_queue__(const struct netdev *netdev_, unsigned int queue_id,
1854 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1855 struct tc_queue *queue;
1857 HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev->tc->queues) {
1858 if (queue->queue_id == queue_id) {
/* Convenience wrapper around tc_find_queue__() that derives the hash from
 * 'queue_id' with hash_int(queue_id, 0). */
1865 static struct tc_queue *
1866 tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
1868 return tc_find_queue__(netdev, queue_id, hash_int(queue_id, 0));
/* Looks up the QoS implementation named 'type' and reports its queue count
 * in 'caps->n_queues'.  'netdev' is unused.  ('type' is a parameter declared
 * on a line not visible in this extract, as is the handling of an unknown
 * type.) */
1872 netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
1874 struct netdev_qos_capabilities *caps)
1876 const struct tc_ops *ops = tc_lookup_ovs_name(type);
1880 caps->n_queues = ops->n_queues;
/* Reports the QoS type currently installed on 'netdev_'.
 *
 * After tc_query_qdisc(), '*typep' is pointed at the 'ovs_name' of the
 * device's tc implementation (not copied), and the implementation's
 * optional 'qdisc_get' callback is asked to fill 'details'.  The value
 * returned when there is no such callback is not visible in this extract. */
1885 netdev_linux_get_qos(const struct netdev *netdev_,
1886 const char **typep, struct smap *details)
1888 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1891 error = tc_query_qdisc(netdev_);
1896 *typep = netdev->tc->ops->ovs_name;
1897 return (netdev->tc->ops->qdisc_get
1898 ? netdev->tc->ops->qdisc_get(netdev_, details)
/* Configures QoS type 'type' with 'details' on 'netdev_'.
 *
 * 'type' must name an implementation that has a 'tc_install' callback.  If
 * that implementation is already the one in use, only its optional
 * 'qdisc_set' callback runs (0 is returned when there is none).  Otherwise
 * the existing qdisc is deleted and the new one installed.
 *
 * The assertions document the contract with the tc layer: tc_del_qdisc()
 * must leave 'netdev->tc' null on this path, and 'tc_install' must set
 * 'netdev->tc' exactly when it succeeds. */
1903 netdev_linux_set_qos(struct netdev *netdev_,
1904 const char *type, const struct smap *details)
1906 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1907 const struct tc_ops *new_ops;
1910 new_ops = tc_lookup_ovs_name(type);
1911 if (!new_ops || !new_ops->tc_install) {
1915 error = tc_query_qdisc(netdev_);
1920 if (new_ops == netdev->tc->ops) {
1921 return new_ops->qdisc_set ? new_ops->qdisc_set(netdev_, details) : 0;
1923 /* Delete existing qdisc. */
1924 error = tc_del_qdisc(netdev_);
1928 ovs_assert(netdev->tc == NULL);
1930 /* Install new qdisc. */
1931 error = new_ops->tc_install(netdev_, details);
1932 ovs_assert((error == 0) == (netdev->tc != NULL));
/* Retrieves the configuration of queue 'queue_id' on 'netdev_' into
 * 'details': after tc_query_qdisc(), the queue is looked up with
 * tc_find_queue() and handed to the tc implementation's 'class_get'
 * callback.  The condition selecting that call and the alternative return
 * value are not visible in this extract. */
1939 netdev_linux_get_queue(const struct netdev *netdev_,
1940 unsigned int queue_id, struct smap *details)
1942 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1945 error = tc_query_qdisc(netdev_);
1949 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
1951 ? netdev->tc->ops->class_get(netdev_, queue, details)
/* Configures queue 'queue_id' on 'netdev_' from 'details' through the tc
 * implementation's 'class_set' callback.
 *
 * The request is rejected (branch body not visible here) when 'queue_id' is
 * not below the implementation's 'n_queues' or when the implementation has
 * no 'class_set' callback. */
1957 netdev_linux_set_queue(struct netdev *netdev_,
1958 unsigned int queue_id, const struct smap *details)
1960 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1963 error = tc_query_qdisc(netdev_);
1966 } else if (queue_id >= netdev->tc->ops->n_queues
1967 || !netdev->tc->ops->class_set) {
1971 return netdev->tc->ops->class_set(netdev_, queue_id, details);
/* Deletes queue 'queue_id' from 'netdev_' through the tc implementation's
 * 'class_delete' callback.  An implementation without that callback is
 * handled as a separate case.  The condition selecting the callback and the
 * alternative return values are not visible in this extract. */
1975 netdev_linux_delete_queue(struct netdev *netdev_, unsigned int queue_id)
1977 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1980 error = tc_query_qdisc(netdev_);
1983 } else if (!netdev->tc->ops->class_delete) {
1986 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
1988 ? netdev->tc->ops->class_delete(netdev_, queue)
/* Retrieves statistics for queue 'queue_id' on 'netdev_' into '*stats'.
 *
 * 'stats->created' is taken from the cached queue record; the remaining
 * members are left to the tc implementation's 'class_get_stats' callback.
 * An implementation without that callback is handled as a separate case.
 * The handling of an unknown queue is not visible in this extract. */
1994 netdev_linux_get_queue_stats(const struct netdev *netdev_,
1995 unsigned int queue_id,
1996 struct netdev_queue_stats *stats)
1998 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2001 error = tc_query_qdisc(netdev_);
2004 } else if (!netdev->tc->ops->class_get_stats) {
2007 const struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2011 stats->created = queue->created;
2012 return netdev->tc->ops->class_get_stats(netdev_, queue, stats);
/* Starts a NETLINK_ROUTE dump of the traffic classes of 'netdev' in 'dump':
 * builds an RTM_GETTCLASS request with 'tcm_parent' 0, hands it to
 * nl_dump_start(), and releases the request buffer.  The return values are
 * not visible in this extract; the caller treats a false result as
 * failure. */
2017 start_queue_dump(const struct netdev *netdev, struct nl_dump *dump)
2019 struct ofpbuf request;
2020 struct tcmsg *tcmsg;
2022 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
2026 tcmsg->tcm_parent = 0;
2027 nl_dump_start(dump, NETLINK_ROUTE, &request);
2028 ofpbuf_uninit(&request);
/* Calls '*cb' with (queue_id, details, 'aux') for the queues cached in
 * 'netdev->tc->queues', obtaining each queue's details from the tc
 * implementation's 'class_get' callback.
 *
 * A single 'details' map is reused and cleared before every queue, so the
 * callback must not keep a reference to it.  Iteration uses the _SAFE
 * variant of the hmap loop.  The condition under which 'cb' is invoked and
 * the function's return value are not visible in this extract. */
2033 netdev_linux_dump_queues(const struct netdev *netdev_,
2034 netdev_dump_queues_cb *cb, void *aux)
2036 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2037 struct tc_queue *queue, *next_queue;
2038 struct smap details;
2042 error = tc_query_qdisc(netdev_);
2045 } else if (!netdev->tc->ops->class_get) {
2050 smap_init(&details);
2051 HMAP_FOR_EACH_SAFE (queue, next_queue, hmap_node,
2052 &netdev->tc->queues) {
2053 smap_clear(&details);
2055 error = netdev->tc->ops->class_get(netdev_, queue, &details);
2057 (*cb)(queue->queue_id, &details, aux);
2062 smap_destroy(&details);
/* Dumps per-queue statistics for 'netdev_': starts a traffic-class dump
 * with start_queue_dump() and passes every reply message, together with
 * 'cb' and 'aux', to the tc implementation's 'class_dump_stats' callback.
 *
 * An error from finishing the dump takes precedence over 'last_error';
 * how 'last_error' is updated inside the loop is not visible in this
 * extract. */
2068 netdev_linux_dump_queue_stats(const struct netdev *netdev_,
2069 netdev_dump_queue_stats_cb *cb, void *aux)
2071 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2072 struct nl_dump dump;
2077 error = tc_query_qdisc(netdev_);
2080 } else if (!netdev->tc->ops->class_dump_stats) {
2085 if (!start_queue_dump(netdev_, &dump)) {
2088 while (nl_dump_next(&dump, &msg)) {
2089 error = netdev->tc->ops->class_dump_stats(netdev_, &msg, cb, aux);
2095 error = nl_dump_done(&dump);
2096 return error ? error : last_error;
/* Stores the IPv4 address and netmask of 'netdev_' in '*address' and
 * '*netmask'.
 *
 * The values are cached under VALID_IN4; on a cache miss they are fetched
 * with SIOCGIFADDR and SIOCGIFNETMASK.  Returns EADDRNOTAVAIL when the
 * resulting address is INADDR_ANY, otherwise 0 on this path (the error
 * returns for failed ioctls are not visible in this extract). */
2100 netdev_linux_get_in4(const struct netdev *netdev_,
2101 struct in_addr *address, struct in_addr *netmask)
2103 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2105 if (!(netdev->cache_valid & VALID_IN4)) {
2108 error = netdev_linux_get_ipv4(netdev_, &netdev->address,
2109 SIOCGIFADDR, "SIOCGIFADDR");
2114 error = netdev_linux_get_ipv4(netdev_, &netdev->netmask,
2115 SIOCGIFNETMASK, "SIOCGIFNETMASK");
2120 netdev->cache_valid |= VALID_IN4;
2122 *address = netdev->address;
2123 *netmask = netdev->netmask;
2124 return address->s_addr == INADDR_ANY ? EADDRNOTAVAIL : 0;
/* Assigns IPv4 'address' and 'netmask' to 'netdev_' with SIOCSIFADDR and,
 * unless 'address' is INADDR_ANY, SIOCSIFNETMASK.
 *
 * NOTE(review): the cache (VALID_IN4, address, netmask) is updated before
 * the netmask ioctl is issued, so a failing SIOCSIFNETMASK appears to leave
 * the requested netmask cached -- confirm against the full source. */
2128 netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
2129 struct in_addr netmask)
2131 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2134 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
2136 netdev->cache_valid |= VALID_IN4;
2137 netdev->address = address;
2138 netdev->netmask = netmask;
2139 if (address.s_addr != INADDR_ANY) {
2140 error = do_set_addr(netdev_, SIOCSIFNETMASK,
2141 "SIOCSIFNETMASK", netmask);
/* Parses one line in the format read from /proc/net/if_inet6 by
 * netdev_linux_get_in6(): sixteen two-digit hex bytes of address go into
 * 'in6', four hex fields are skipped ("%*x"), and an interface name of at
 * most 16 characters goes into 'ifname' (hence the 16 + 1 byte buffer).
 * The scanning call and the return expression are not visible in this
 * extract. */
2148 parse_if_inet6_line(const char *line,
2149 struct in6_addr *in6, char ifname[16 + 1])
2151 uint8_t *s6 = in6->s6_addr;
2152 #define X8 "%2"SCNx8
2154 " "X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8
2155 "%*x %*x %*x %*x %16s\n",
2156 &s6[0], &s6[1], &s6[2], &s6[3],
2157 &s6[4], &s6[5], &s6[6], &s6[7],
2158 &s6[8], &s6[9], &s6[10], &s6[11],
2159 &s6[12], &s6[13], &s6[14], &s6[15],
2163 /* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address (if
2164 * 'in6' is non-null) and returns true. Otherwise, returns false. */
/* The address is cached under VALID_IN6.  On a cache miss it defaults to
 * in6addr_any and is replaced by the address from a line of
 * /proc/net/if_inet6 whose interface name equals this device's name.  The
 * cache is marked valid whether or not such a line was found.  The return
 * statements are not visible in this extract. */
2166 netdev_linux_get_in6(const struct netdev *netdev_, struct in6_addr *in6)
2168 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2169 if (!(netdev->cache_valid & VALID_IN6)) {
2173 netdev->in6 = in6addr_any;
2175 file = fopen("/proc/net/if_inet6", "r");
2177 const char *name = netdev_get_name(netdev_);
2178 while (fgets(line, sizeof line, file)) {
2179 struct in6_addr in6_tmp;
2180 char ifname[16 + 1];
2181 if (parse_if_inet6_line(line, &in6_tmp, ifname)
2182 && !strcmp(name, ifname))
2184 netdev->in6 = in6_tmp;
2190 netdev->cache_valid |= VALID_IN6;
/* Fills '*sa' with an AF_INET socket address for 'addr'.  The address is
 * built in a zeroed local sockaddr_in; '*sa' is cleared and then overwritten
 * with sizeof(struct sockaddr_in) bytes of it. */
2197 make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
2199 struct sockaddr_in sin;
2200 memset(&sin, 0, sizeof sin);
2201 sin.sin_family = AF_INET;
2202 sin.sin_addr = addr;
2205 memset(sa, 0, sizeof *sa);
2206 memcpy(sa, &sin, sizeof sin);
/* Issues address-setting ioctl 'ioctl_nr' on 'netdev' with 'addr' as the
 * IPv4 address: the device name and an AF_INET sockaddr for 'addr' are
 * placed in an ifreq, which is handed to netdev_linux_do_ioctl().  Returns
 * that function's result. */
2210 do_set_addr(struct netdev *netdev,
2211 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
2214 ovs_strzcpy(ifr.ifr_name, netdev_get_name(netdev), sizeof ifr.ifr_name);
2215 make_in4_sockaddr(&ifr.ifr_addr, addr);
2217 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
2221 /* Adds 'router' as a default IP gateway. */
/* The route is "default" because both destination and genmask are
 * INADDR_ANY.  It is installed with SIOCADDRT on 'af_inet_sock' with flags
 * RTF_UP | RTF_GATEWAY; a failure is logged with the errno text.  'netdev'
 * is unused: no interface is named in the request. */
2223 netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
2225 struct in_addr any = { INADDR_ANY };
2229 memset(&rt, 0, sizeof rt);
2230 make_in4_sockaddr(&rt.rt_dst, any);
2231 make_in4_sockaddr(&rt.rt_gateway, router);
2232 make_in4_sockaddr(&rt.rt_genmask, any);
2233 rt.rt_flags = RTF_UP | RTF_GATEWAY;
2234 error = ioctl(af_inet_sock, SIOCADDRT, &rt) < 0 ? errno : 0;
2236 VLOG_WARN("ioctl(SIOCADDRT): %s", ovs_strerror(error));
/* Finds a route to 'host' by scanning /proc/net/route.
 *
 * For a route that is up and whose destination matches 'host' under the
 * route's mask, '*next_hop' receives the gateway (or 0 when the host is
 * directly reachable) and '*netdev_name' receives an xstrdup()'d copy of
 * the interface name.  '*netdev_name' is set to NULL up front, so it stays
 * NULL when nothing matches or the file cannot be opened.  Lines that do
 * not yield all 11 fields are logged.  The loop exit and return statements
 * are not visible in this extract. */
2242 netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
2245 static const char fn[] = "/proc/net/route";
2250 *netdev_name = NULL;
2251 stream = fopen(fn, "r");
2252 if (stream == NULL) {
2253 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, ovs_strerror(errno));
2258 while (fgets(line, sizeof line, stream)) {
2261 ovs_be32 dest, gateway, mask;
2262 int refcnt, metric, mtu;
2263 unsigned int flags, use, window, irtt;
2266 "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
2268 iface, &dest, &gateway, &flags, &refcnt,
2269 &use, &metric, &mask, &mtu, &window, &irtt) != 11) {
2271 VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
2275 if (!(flags & RTF_UP)) {
2276 /* Skip routes that aren't up. */
2280 /* The output of 'dest', 'mask', and 'gateway' were given in
2281 * network byte order, so we don't need any endian
2282 * conversions here. */
2283 if ((dest & mask) == (host->s_addr & mask)) {
2285 /* The host is directly reachable. */
2286 next_hop->s_addr = 0;
2288 /* To reach the host, we must go through a gateway. */
2289 next_hop->s_addr = gateway;
2291 *netdev_name = xstrdup(iface);
/* Adds "driver_name", "driver_version" and "firmware_version" to 'smap'
 * from the device's ethtool driver information.
 *
 * The information is fetched once with ETHTOOL_GDRVINFO and cached in
 * 'netdev->drvinfo' under VALID_DRVINFO.  The conditions guarding the cache
 * update and the smap_add() calls are not visible in this extract. */
2303 netdev_linux_get_status(const struct netdev *netdev_, struct smap *smap)
2305 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2308 if (!(netdev->cache_valid & VALID_DRVINFO)) {
/* netdev_linux_do_ethtool() takes a struct ethtool_cmd pointer, so the
 * drvinfo buffer is passed through a cast. */
2309 struct ethtool_cmd *cmd = (struct ethtool_cmd *) &netdev->drvinfo;
2311 COVERAGE_INC(netdev_get_ethtool);
2312 memset(&netdev->drvinfo, 0, sizeof netdev->drvinfo);
2313 error = netdev_linux_do_ethtool(netdev->up.name,
2316 "ETHTOOL_GDRVINFO");
2318 netdev->cache_valid |= VALID_DRVINFO;
2323 smap_add(smap, "driver_name", netdev->drvinfo.driver);
2324 smap_add(smap, "driver_version", netdev->drvinfo.version);
2325 smap_add(smap, "firmware_version", netdev->drvinfo.fw_version);
/* Status for internal devices: reports the fixed driver name "openvswitch"
 * in 'smap'.  'netdev' is unused. */
2331 netdev_internal_get_status(const struct netdev *netdev OVS_UNUSED,
2334 smap_add(smap, "driver_name", "openvswitch");
2338 /* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
2339 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
2340 * returns 0. Otherwise, it returns a positive errno value; in particular,
2341 * ENXIO indicates that there is no ARP table entry for 'ip' on 'netdev'. */
/* The lookup is a SIOCGARP ioctl on 'af_inet_sock'.  ENXIO is the one
 * failure that is not logged. */
2343 netdev_linux_arp_lookup(const struct netdev *netdev,
2344 ovs_be32 ip, uint8_t mac[ETH_ADDR_LEN])
2347 struct sockaddr_in sin;
2350 memset(&r, 0, sizeof r);
2351 memset(&sin, 0, sizeof sin);
2352 sin.sin_family = AF_INET;
2353 sin.sin_addr.s_addr = ip;
2355 memcpy(&r.arp_pa, &sin, sizeof sin);
2356 r.arp_ha.sa_family = ARPHRD_ETHER;
2358 ovs_strzcpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
2359 COVERAGE_INC(netdev_arp_lookup);
2360 retval = ioctl(af_inet_sock, SIOCGARP, &r) < 0 ? errno : 0;
2362 memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
2363 } else if (retval != ENXIO) {
2364 VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
2365 netdev_get_name(netdev), IP_ARGS(ip),
2366 ovs_strerror(retval));
/* Translates the NETDEV_UP and NETDEV_PROMISC bits of 'nd' into another
 * flag set.  The branch bodies are not visible in this extract; going by
 * the function name and its use in netdev_linux_update_flags(), the result
 * is presumably the matching IFF_* interface flags -- to be confirmed. */
2372 nd_to_iff_flags(enum netdev_flags nd)
2375 if (nd & NETDEV_UP) {
2378 if (nd & NETDEV_PROMISC) {
/* Translates interface flags 'iff' into NETDEV_* flags.  IFF_PROMISC maps
 * to NETDEV_PROMISC; any other mapping is on lines not visible in this
 * extract. */
2385 iff_to_nd_flags(int iff)
2387 enum netdev_flags nd = 0;
2391 if (iff & IFF_PROMISC) {
2392 nd |= NETDEV_PROMISC;
/* Clears the flags in 'off' and sets the flags in 'on' on 'netdev_',
 * storing the previous flags (as NETDEV_* bits) in '*old_flagsp'.
 *
 * The starting point is the cached 'netdev->ifi_flags'.  The kernel is
 * touched only when the computed flags differ from the cached ones, and
 * the cache is then re-read with get_flags() whatever set_flags() returned.
 * Bits set in both 'off' and 'on' end up set, because 'on' is applied
 * last. */
2398 netdev_linux_update_flags(struct netdev *netdev_, enum netdev_flags off,
2399 enum netdev_flags on, enum netdev_flags *old_flagsp)
2401 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2402 int old_flags, new_flags;
2405 old_flags = netdev->ifi_flags;
2406 *old_flagsp = iff_to_nd_flags(old_flags);
2407 new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
2408 if (new_flags != old_flags) {
2409 error = set_flags(netdev_get_name(netdev_), new_flags);
2410 get_flags(netdev_, &netdev->ifi_flags);
/* Returns the 'change_seq' member of the netdev_linux behind 'netdev'. */
2416 netdev_linux_change_seq(const struct netdev *netdev)
2418 return netdev_linux_cast(netdev)->change_seq;
/* Expands to an initializer list of netdev provider callbacks shared by the
 * Linux device classes defined after it.  Only the six macro parameters
 * (NAME, CREATE, GET_STATS, SET_STATS, GET_FEATURES, GET_STATUS) vary
 * between classes; every other slot visible below is a fixed
 * netdev_linux_* function or NULL.
 *
 * The slots are positional, so their order has to match the member order of
 * the structure being initialized.  Several slots (including those that use
 * the parameters) are on lines not visible in this extract. */
2421 #define NETDEV_LINUX_CLASS(NAME, CREATE, GET_STATS, SET_STATS, \
2422 GET_FEATURES, GET_STATUS) \
2426 netdev_linux_init, \
2428 netdev_linux_wait, \
2431 netdev_linux_destroy, \
2432 NULL, /* get_config */ \
2433 NULL, /* set_config */ \
2434 NULL, /* get_tunnel_config */ \
2436 netdev_linux_rx_open, \
2438 netdev_linux_send, \
2439 netdev_linux_send_wait, \
2441 netdev_linux_set_etheraddr, \
2442 netdev_linux_get_etheraddr, \
2443 netdev_linux_get_mtu, \
2444 netdev_linux_set_mtu, \
2445 netdev_linux_get_ifindex, \
2446 netdev_linux_get_carrier, \
2447 netdev_linux_get_carrier_resets, \
2448 netdev_linux_set_miimon_interval, \
2453 netdev_linux_set_advertisements, \
2455 netdev_linux_set_policing, \
2456 netdev_linux_get_qos_types, \
2457 netdev_linux_get_qos_capabilities, \
2458 netdev_linux_get_qos, \
2459 netdev_linux_set_qos, \
2460 netdev_linux_get_queue, \
2461 netdev_linux_set_queue, \
2462 netdev_linux_delete_queue, \
2463 netdev_linux_get_queue_stats, \
2464 netdev_linux_dump_queues, \
2465 netdev_linux_dump_queue_stats, \
2467 netdev_linux_get_in4, \
2468 netdev_linux_set_in4, \
2469 netdev_linux_get_in6, \
2470 netdev_linux_add_router, \
2471 netdev_linux_get_next_hop, \
2473 netdev_linux_arp_lookup, \
2475 netdev_linux_update_flags, \
2477 netdev_linux_change_seq \
/* Device class that pairs netdev_linux_create() with the combined
 * vport-plus-kernel statistics of netdev_linux_get_stats() and the ethtool
 * based feature and status queries; it has no set_stats.  The trailing ");"
 * and the argument order suggest a NETDEV_LINUX_CLASS invocation, but the
 * invocation line and the class name string are not visible in this
 * extract. */
2480 const struct netdev_class netdev_linux_class =
2483 netdev_linux_create,
2484 netdev_linux_get_stats,
2485 NULL, /* set_stats */
2486 netdev_linux_get_features,
2487 netdev_linux_get_status);
/* Device class for tap devices: created with netdev_linux_create_tap() and
 * reporting statistics through netdev_tap_get_stats(), which exchanges rx
 * and tx.  Features and status use the same ethtool-based functions as
 * netdev_linux_class.  The invocation line and the class name string are
 * not visible in this extract. */
2489 const struct netdev_class netdev_tap_class =
2492 netdev_linux_create_tap,
2493 netdev_tap_get_stats,
2494 NULL, /* set_stats */
2495 netdev_linux_get_features,
2496 netdev_linux_get_status);
/* Device class for internal devices: statistics are read from and written
 * to the vport layer only (netdev_internal_get_stats() and
 * netdev_internal_set_stats()), there is no get_features, and status
 * reports the fixed "openvswitch" driver name.  The invocation line and the
 * class name string are not visible in this extract. */
2498 const struct netdev_class netdev_internal_class =
2501 netdev_linux_create,
2502 netdev_internal_get_stats,
2503 netdev_internal_set_stats,
2504 NULL, /* get_features */
2505 netdev_internal_get_status);
/* Receive-side callback table for Linux devices, initialized positionally
 * with the destroy, recv, wait and drain implementations. */
2507 static const struct netdev_rx_class netdev_rx_linux_class = {
2508 netdev_rx_linux_destroy,
2509 netdev_rx_linux_recv,
2510 netdev_rx_linux_wait,
2511 netdev_rx_linux_drain,
2514 /* HTB traffic control class. */
/* HTB_N_QUEUES is the queue-count constant for the HTB implementation.
 *
 * The members below belong to two structures whose opening lines are not
 * visible in this extract.  Judging by htb_install__() and the struct
 * htb_class parameter used further down, the first 'max_rate' is presumably
 * a member of struct htb (the qdisc-wide settings) and the group starting
 * at 'tc_queue' is struct htb_class (per-queue settings) -- to be
 * confirmed. */
2516 #define HTB_N_QUEUES 0xf000
2520 unsigned int max_rate; /* In bytes/s. */
2524 struct tc_queue tc_queue;
2525 unsigned int min_rate; /* In bytes/s. */
2526 unsigned int max_rate; /* In bytes/s. */
2527 unsigned int burst; /* In bytes. */
2528 unsigned int priority; /* Lower values are higher priorities. */
/* Returns the struct htb that embeds 'netdev->tc' as its 'tc' member.  The
 * conversion is unconditional, so it is valid only while the tc installed
 * on 'netdev_' really is an HTB one. */
2532 htb_get__(const struct netdev *netdev_)
2534 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2535 return CONTAINER_OF(netdev->tc, struct htb, tc);
/* Allocates a struct htb, initializes its embedded tc with 'tc_ops_htb',
 * records 'max_rate' and installs it as 'netdev->tc'.  Any previous value
 * of 'netdev->tc' is overwritten without being freed here.
 *
 * NOTE(review): the parameter is uint64_t while the 'max_rate' members
 * visible just above this function are unsigned int, so the assignment may
 * narrow the value -- confirm against the struct htb definition. */
2539 htb_install__(struct netdev *netdev_, uint64_t max_rate)
2541 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2544 htb = xmalloc(sizeof *htb);
2545 tc_init(&htb->tc, &tc_ops_htb);
2546 htb->max_rate = max_rate;
2548 netdev->tc = &htb->tc;
2551 /* Create an HTB qdisc.
2553 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
/* Any existing qdisc is deleted first, and the result of that deletion is
 * not checked.  The new qdisc is requested with RTM_NEWQDISC
 * (NLM_F_EXCL | NLM_F_CREATE), handle 1:0 under TC_H_ROOT and kind "htb";
 * its options travel as a TCA_HTB_INIT attribute nested in TCA_OPTIONS.
 * Returns the result of the netlink transaction.  Apart from
 * 'rate2quantum', the option fields are set on lines not visible in this
 * extract. */
2555 htb_setup_qdisc__(struct netdev *netdev)
2558 struct tc_htb_glob opt;
2559 struct ofpbuf request;
2560 struct tcmsg *tcmsg;
2562 tc_del_qdisc(netdev);
2564 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2565 NLM_F_EXCL | NLM_F_CREATE, &request);
2569 tcmsg->tcm_handle = tc_make_handle(1, 0);
2570 tcmsg->tcm_parent = TC_H_ROOT;
2572 nl_msg_put_string(&request, TCA_KIND, "htb");
2574 memset(&opt, 0, sizeof opt);
2575 opt.rate2quantum = 10;
2579 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2580 nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
2581 nl_msg_end_nested(&request, opt_offset);
2583 return tc_transact(&request, NULL);
2586 /* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
2587 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
/* The device MTU is needed to fill in the rate specifications and buffer
 * sizes; a rate-limited warning is logged if it cannot be obtained, and
 * another if the kernel transaction fails. */
2589 htb_setup_class__(struct netdev *netdev, unsigned int handle,
2590 unsigned int parent, struct htb_class *class)
2593 struct tc_htb_opt opt;
2594 struct ofpbuf request;
2595 struct tcmsg *tcmsg;
2599 error = netdev_get_mtu(netdev, &mtu);
2601 VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU",
2602 netdev_get_name(netdev));
/* 'rate' comes from min_rate and 'ceil' from max_rate; both buffers are
 * derived from the same 'burst' value. */
2606 memset(&opt, 0, sizeof opt);
2607 tc_fill_rate(&opt.rate, class->min_rate, mtu);
2608 tc_fill_rate(&opt.ceil, class->max_rate, mtu);
2609 opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
2610 opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
2611 opt.prio = class->priority;
/* NLM_F_CREATE without NLM_F_EXCL. */
2613 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
2617 tcmsg->tcm_handle = handle;
2618 tcmsg->tcm_parent = parent;
2620 nl_msg_put_string(&request, TCA_KIND, "htb");
/* Parameters plus the rate and ceil tables, nested in TCA_OPTIONS. */
2621 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2622 nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
2623 tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
2624 tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
2625 nl_msg_end_nested(&request, opt_offset);
2627 error = tc_transact(&request, NULL);
2629 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
2630 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
2631 netdev_get_name(netdev),
2632 tc_get_major(handle), tc_get_minor(handle),
2633 tc_get_major(parent), tc_get_minor(parent),
2634 class->min_rate, class->max_rate,
2635 class->burst, class->priority, ovs_strerror(error));
/* Summary: requires a TCA_HTB_PARMS attribute of at least
 * sizeof(struct tc_htb_opt) bytes inside 'nl_options' and copies its rate,
 * ceil, burst (converted from ticks to bytes) and priority into 'class'.
 * A rate-limited warning is logged when parsing fails. */
2640 /* Parses Netlink attributes in 'nl_options' for HTB parameters and stores a
2641 * description of them into 'class'. The description complies with the
2642 * specification given in the vswitch database documentation for linux-htb
2645 htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
2647 static const struct nl_policy tca_htb_policy[] = {
2648 [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
2649 .min_len = sizeof(struct tc_htb_opt) },
2652 struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
2653 const struct tc_htb_opt *htb;
2655 if (!nl_parse_nested(nl_options, tca_htb_policy,
2656 attrs, ARRAY_SIZE(tca_htb_policy))) {
2657 VLOG_WARN_RL(&rl, "failed to parse HTB class options");
2661 htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
2662 class->min_rate = htb->rate.rate;
2663 class->max_rate = htb->ceil.rate;
2664 class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
2665 class->priority = htb->prio;
/* Parses the class message 'tcmsg' with tc_parse_class(), filling in 'stats'.
 *
 * If 'queue_id' is nonnull and the class handle is 1:<minor> with
 * 1 <= minor <= HTB_N_QUEUES, stores minor - 1 into '*queue_id'.  If
 * 'options' is nonnull, the HTB options are parsed into it with
 * htb_parse_tca_options__().
 *
 * NOTE(review): what happens for handles outside that range is not visible
 * in this listing. */
2670 htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2671 struct htb_class *options,
2672 struct netdev_queue_stats *stats)
2674 struct nlattr *nl_options;
2675 unsigned int handle;
2678 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2679 if (!error && queue_id) {
2680 unsigned int major = tc_get_major(handle);
2681 unsigned int minor = tc_get_minor(handle);
/* Queue IDs are zero-based; class minor numbers are one-based. */
2682 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2683 *queue_id = minor - 1;
2688 if (!error && options) {
2689 error = htb_parse_tca_options__(nl_options, options);
/* Fills in 'hc' with the qdisc-level HTB rates taken from 'details'.
 *
 * The "max-rate" value in 'details' is divided by 8 before being stored (the
 * rates in 'hc' are in bytes/s).  If the key is missing or the result is 0,
 * the rate is derived from the current features of 'netdev' via
 * netdev_features_to_bps(), which is passed 100 * 1000 * 1000 as its second
 * argument (presumably the fallback bit rate -- confirm).  min_rate is set
 * equal to max_rate. */
2695 htb_parse_qdisc_details__(struct netdev *netdev,
2696 const struct smap *details, struct htb_class *hc)
2698 const char *max_rate_s;
2700 max_rate_s = smap_get(details, "max-rate");
2701 hc->max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
2702 if (!hc->max_rate) {
2703 enum netdev_features current;
/* Only the "current" features output is requested. */
2705 netdev_get_features(netdev, &current, NULL, NULL, NULL);
2706 hc->max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
2708 hc->min_rate = hc->max_rate;
/* Fills in 'hc' from the "min-rate", "max-rate", "burst" and "priority" keys
 * of 'details' for a queue on 'netdev'.
 *
 * The three rate/size values are divided by 8 before being stored, so the
 * keys are presumably expressed in bits (the struct fields are in bytes).
 * Rates are clamped against the qdisc-wide htb->max_rate.  The device MTU is
 * required; a rate-limited warning is logged if it cannot be obtained. */
2714 htb_parse_class_details__(struct netdev *netdev,
2715 const struct smap *details, struct htb_class *hc)
2717 const struct htb *htb = htb_get__(netdev);
2718 const char *min_rate_s = smap_get(details, "min-rate");
2719 const char *max_rate_s = smap_get(details, "max-rate");
2720 const char *burst_s = smap_get(details, "burst");
2721 const char *priority_s = smap_get(details, "priority");
2724 error = netdev_get_mtu(netdev, &mtu);
2726 VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU",
2727 netdev_get_name(netdev));
2731 /* HTB requires at least an mtu sized min-rate to send any traffic even
2732 * on uncongested links. */
2733 hc->min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
2734 hc->min_rate = MAX(hc->min_rate, mtu);
2735 hc->min_rate = MIN(hc->min_rate, htb->max_rate);
/* max_rate ends up no lower than min_rate and no higher than the
 * qdisc-wide maximum. */
2738 hc->max_rate = (max_rate_s
2739 ? strtoull(max_rate_s, NULL, 10) / 8
2741 hc->max_rate = MAX(hc->max_rate, hc->min_rate);
2742 hc->max_rate = MIN(hc->max_rate, htb->max_rate);
2746 * According to hints in the documentation that I've read, it is important
2747 * that 'burst' be at least as big as the largest frame that might be
2748 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
2749 * but having it a bit too small is a problem. Since netdev_get_mtu()
2750 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
2751 * the MTU. We actually add 64, instead of 14, as a guard against
2752 * additional headers get tacked on somewhere that we're not aware of. */
2753 hc->burst = burst_s ? strtoull(burst_s, NULL, 10) / 8 : 0;
2754 hc->burst = MAX(hc->burst, mtu + 64);
2757 hc->priority = priority_s ? strtoul(priority_s, NULL, 10) : 0;
/* Asks the kernel about class 'handle' (under 'parent') on 'netdev' with
 * tc_query_class() and parses the reply with htb_parse_tcmsg__() into
 * 'options' and 'stats'.  No queue ID is extracted (NULL is passed).  The
 * reply buffer is deleted after parsing. */
2763 htb_query_class__(const struct netdev *netdev, unsigned int handle,
2764 unsigned int parent, struct htb_class *options,
2765 struct netdev_queue_stats *stats)
2767 struct ofpbuf *reply;
2770 error = tc_query_class(netdev, handle, parent, &reply);
2772 error = htb_parse_tcmsg__(reply, NULL, options, stats);
2773 ofpbuf_delete(reply);
/* Installs HTB on 'netdev': creates the qdisc, sets up class 1:fffe under
 * 1:0 with the rates parsed from 'details', and then records the resulting
 * max_rate in a new struct htb via htb_install__(). */
2779 htb_tc_install(struct netdev *netdev, const struct smap *details)
2783 error = htb_setup_qdisc__(netdev);
2785 struct htb_class hc;
2787 htb_parse_qdisc_details__(netdev, details, &hc);
2788 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2789 tc_make_handle(1, 0), &hc);
2791 htb_install__(netdev, hc.max_rate);
/* Returns the struct htb_class in which 'queue' is embedded as the
 * 'tc_queue' member. */
2797 static struct htb_class *
2798 htb_class_cast__(const struct tc_queue *queue)
2800 return CONTAINER_OF(queue, struct htb_class, tc_queue);
/* Records the parameters in 'hc' for queue 'queue_id' in the local queue map
 * of 'netdev''s HTB state.
 *
 * The queue is looked up with tc_find_queue__(); the code below either
 * reuses the existing htb_class or allocates a new one and inserts it into
 * htb->tc.queues (the branch condition is on lines not shown in this
 * listing -- presumably "queue found" vs. "not found"). */
2804 htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
2805 const struct htb_class *hc)
2807 struct htb *htb = htb_get__(netdev);
2808 size_t hash = hash_int(queue_id, 0);
2809 struct tc_queue *queue;
2810 struct htb_class *hcp;
2812 queue = tc_find_queue__(netdev, queue_id, hash);
2814 hcp = htb_class_cast__(queue);
2816 hcp = xmalloc(sizeof *hcp);
2817 queue = &hcp->tc_queue;
2818 queue->queue_id = queue_id;
2819 queue->created = time_msec();
2820 hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
/* Copy the caller's parameters into the stored class. */
2823 hcp->min_rate = hc->min_rate;
2824 hcp->max_rate = hc->max_rate;
2825 hcp->burst = hc->burst;
2826 hcp->priority = hc->priority;
/* Rebuilds the local HTB state for 'netdev' from what the kernel reports:
 * class 1:fffe supplies the qdisc-wide max_rate, then every class in the
 * queue dump that parses successfully is recorded with htb_update_queue__().
 * 'nlmsg' is unused.
 *
 * NOTE(review): the return value of htb_query_class__() is not checked on
 * the visible lines. */
2830 htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
2833 struct nl_dump dump;
2834 struct htb_class hc;
2836 /* Get qdisc options. */
2838 htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
2839 htb_install__(netdev, hc.max_rate);
2842 if (!start_queue_dump(netdev, &dump)) {
2845 while (nl_dump_next(&dump, &msg)) {
2846 unsigned int queue_id;
/* Classes that fail to parse are skipped. */
2848 if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
2849 htb_update_queue__(netdev, queue_id, &hc);
2852 nl_dump_done(&dump);
/* Tears down the HTB state that contains 'tc': every htb_class is removed
 * from htb->tc.queues, using the _SAFE iterator because entries are removed
 * during iteration.
 *
 * NOTE(review): freeing of the classes and of 'htb' itself presumably happens
 * on lines not shown in this listing -- confirm. */
2858 htb_tc_destroy(struct tc *tc)
2860 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
2861 struct htb_class *hc, *next;
2863 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &htb->tc.queues) {
2864 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
/* Adds "max-rate" to 'details', reported as 8 times the stored max_rate
 * (the stored value is in bytes/s). */
2872 htb_qdisc_get(const struct netdev *netdev, struct smap *details)
2874 const struct htb *htb = htb_get__(netdev);
2875 smap_add_format(details, "max-rate", "%llu", 8ULL * htb->max_rate);
/* Reconfigures the qdisc-level class 1:fffe (parent 1:0) of 'netdev' from
 * 'details' and stores the new max_rate in the local HTB state.
 *
 * NOTE(review): whether the max_rate update is conditional on success is
 * decided on lines not shown in this listing. */
2880 htb_qdisc_set(struct netdev *netdev, const struct smap *details)
2882 struct htb_class hc;
2885 htb_parse_qdisc_details__(netdev, details, &hc);
2886 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2887 tc_make_handle(1, 0), &hc);
2889 htb_get__(netdev)->max_rate = hc.max_rate;
/* Describes 'queue' in 'details'.  Rates and burst are reported as 8 times
 * the stored byte-based values.  "max-rate" is only added when it differs
 * from "min-rate".  'netdev' is unused. */
2895 htb_class_get(const struct netdev *netdev OVS_UNUSED,
2896 const struct tc_queue *queue, struct smap *details)
2898 const struct htb_class *hc = htb_class_cast__(queue);
2900 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
2901 if (hc->min_rate != hc->max_rate) {
2902 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
2904 smap_add_format(details, "burst", "%llu", 8ULL * hc->burst);
2906 smap_add_format(details, "priority", "%u", hc->priority);
/* Configures queue 'queue_id' on 'netdev' from 'details': parses the details,
 * sets up kernel class 1:<queue_id + 1> under parent 1:fffe, and records the
 * parameters locally with htb_update_queue__(). */
2912 htb_class_set(struct netdev *netdev, unsigned int queue_id,
2913 const struct smap *details)
2915 struct htb_class hc;
2918 error = htb_parse_class_details__(netdev, details, &hc);
2923 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
2924 tc_make_handle(1, 0xfffe), &hc);
2929 htb_update_queue__(netdev, queue_id, &hc);
/* Deletes kernel class 1:<queue_id + 1> for 'queue' on 'netdev' and removes
 * the queue from the local map htb->tc.queues.
 *
 * NOTE(review): the condition guarding the removal (and any free) is on
 * lines not shown in this listing. */
2934 htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
2936 struct htb_class *hc = htb_class_cast__(queue);
2937 struct htb *htb = htb_get__(netdev);
2940 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
2942 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
/* Fetches statistics for 'queue' into 'stats' by querying kernel class
 * 1:<queue_id + 1> under parent 1:fffe.  No class options are requested.
 * Returns the result of htb_query_class__(). */
2949 htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
2950 struct netdev_queue_stats *stats)
2952 return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
2953 tc_make_handle(1, 0xfffe), NULL, stats);
/* Parses the class described by 'nlmsg' and, if its handle is 1:<minor> with
 * 1 <= minor <= HTB_N_QUEUES, invokes 'cb' with queue ID minor - 1, the
 * parsed statistics, and 'aux'.  'netdev' is unused. */
2957 htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
2958 const struct ofpbuf *nlmsg,
2959 netdev_dump_queue_stats_cb *cb, void *aux)
2961 struct netdev_queue_stats stats;
2962 unsigned int handle, major, minor;
2965 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
2970 major = tc_get_major(handle);
2971 minor = tc_get_minor(handle);
2972 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2973 (*cb)(minor - 1, &stats, aux);
/* Operations table for the HTB traffic-control class ("htb" in the kernel,
 * "linux-htb" in OVS). */
2978 static const struct tc_ops tc_ops_htb = {
2979 "htb", /* linux_name */
2980 "linux-htb", /* ovs_name */
2981 HTB_N_QUEUES, /* n_queues */
2990 htb_class_get_stats, /* class_get_stats */
2991 htb_class_dump_stats /* class_dump_stats */
2994 /* "linux-hfsc" traffic control class. */
/* Largest class minor number treated as a queue: minor numbers
 * 1...HFSC_N_QUEUES of major 1 map to queue IDs 0...HFSC_N_QUEUES - 1. */
2996 #define HFSC_N_QUEUES 0xf000
/* Embedded generic queue; hfsc_class_cast__() maps it back to the containing
 * struct hfsc_class. */
3004 struct tc_queue tc_queue;
/* Returns the struct hfsc that contains the tc currently attached to
 * 'netdev_' (netdev->tc).
 *
 * NOTE(review): only meaningful if netdev->tc is embedded in a struct hfsc;
 * presumably callers guarantee that -- confirm. */
3009 static struct hfsc *
3010 hfsc_get__(const struct netdev *netdev_)
3012 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3013 return CONTAINER_OF(netdev->tc, struct hfsc, tc);
/* Returns the struct hfsc_class in which 'queue' is embedded as the
 * 'tc_queue' member. */
3016 static struct hfsc_class *
3017 hfsc_class_cast__(const struct tc_queue *queue)
3019 return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
/* Allocates a new struct hfsc, initializes its embedded tc with
 * 'tc_ops_hfsc', records 'max_rate' in it, and attaches it to 'netdev_' as
 * netdev->tc. */
3023 hfsc_install__(struct netdev *netdev_, uint32_t max_rate)
3025 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3028 hfsc = xmalloc(sizeof *hfsc);
3029 tc_init(&hfsc->tc, &tc_ops_hfsc);
3030 hfsc->max_rate = max_rate;
3031 netdev->tc = &hfsc->tc;
/* Records the rates in 'hc' for queue 'queue_id' in the local queue map of
 * 'netdev''s HFSC state.
 *
 * The queue is looked up with tc_find_queue__(); the code below either
 * reuses the existing hfsc_class or allocates a new one and inserts it into
 * hfsc->tc.queues (the branch condition is on lines not shown in this
 * listing -- presumably "queue found" vs. "not found"). */
3035 hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
3036 const struct hfsc_class *hc)
3040 struct hfsc_class *hcp;
3041 struct tc_queue *queue;
3043 hfsc = hfsc_get__(netdev);
3044 hash = hash_int(queue_id, 0);
3046 queue = tc_find_queue__(netdev, queue_id, hash);
3048 hcp = hfsc_class_cast__(queue);
3050 hcp = xmalloc(sizeof *hcp);
3051 queue = &hcp->tc_queue;
3052 queue->queue_id = queue_id;
3053 queue->created = time_msec();
3054 hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
/* Copy the caller's rates into the stored class. */
3057 hcp->min_rate = hc->min_rate;
3058 hcp->max_rate = hc->max_rate;
/* Extracts the real-time (RSC), fair (FSC) and upper-limit (USC) service
 * curves from 'nl_options' and converts them into 'class'.
 *
 * Only a restricted form is accepted: every curve must have m1 == 0 and
 * d == 0, RSC and FSC must have the same m2, and that m2 must not exceed the
 * USC m2.  Each rejected case logs a rate-limited warning.  On acceptance,
 * min_rate is the FSC m2 and max_rate is the USC m2. */
3062 hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
3064 const struct tc_service_curve *rsc, *fsc, *usc;
3065 static const struct nl_policy tca_hfsc_policy[] = {
3067 .type = NL_A_UNSPEC,
3069 .min_len = sizeof(struct tc_service_curve),
3072 .type = NL_A_UNSPEC,
3074 .min_len = sizeof(struct tc_service_curve),
3077 .type = NL_A_UNSPEC,
3079 .min_len = sizeof(struct tc_service_curve),
3082 struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];
3084 if (!nl_parse_nested(nl_options, tca_hfsc_policy,
3085 attrs, ARRAY_SIZE(tca_hfsc_policy))) {
3086 VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
3090 rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
3091 fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
3092 usc = nl_attr_get(attrs[TCA_HFSC_USC]);
/* Any nonzero m1 or d is rejected as a non-linear curve. */
3094 if (rsc->m1 != 0 || rsc->d != 0 ||
3095 fsc->m1 != 0 || fsc->d != 0 ||
3096 usc->m1 != 0 || usc->d != 0) {
3097 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3098 "Non-linear service curves are not supported.");
/* A real-time curve that differs from the fair curve is rejected. */
3102 if (rsc->m2 != fsc->m2) {
3103 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3104 "Real-time service curves are not supported ");
/* The minimum rate may not exceed the upper limit. */
3108 if (rsc->m2 > usc->m2) {
3109 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3110 "Min-rate service curve is greater than "
3111 "the max-rate service curve.");
3115 class->min_rate = fsc->m2;
3116 class->max_rate = usc->m2;
/* Parses the class message 'tcmsg' with tc_parse_class(), filling in 'stats'.
 *
 * For a handle 1:<minor> with 1 <= minor <= HFSC_N_QUEUES, minor - 1 is
 * stored into '*queue_id'; HFSC options are parsed into 'options' with
 * hfsc_parse_tca_options__().
 *
 * NOTE(review): the null checks on 'queue_id'/'options' and the handling of
 * out-of-range handles are on lines not shown in this listing. */
3121 hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
3122 struct hfsc_class *options,
3123 struct netdev_queue_stats *stats)
3126 unsigned int handle;
3127 struct nlattr *nl_options;
3129 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
3135 unsigned int major, minor;
3137 major = tc_get_major(handle);
3138 minor = tc_get_minor(handle);
/* Queue IDs are zero-based; class minor numbers are one-based. */
3139 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3140 *queue_id = minor - 1;
3147 error = hfsc_parse_tca_options__(nl_options, options);
/* Asks the kernel about class 'handle' (under 'parent') on 'netdev' with
 * tc_query_class() and parses the reply with hfsc_parse_tcmsg__() into
 * 'options' and 'stats'.  No queue ID is extracted (NULL is passed).  The
 * reply buffer is deleted after parsing. */
3154 hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
3155 unsigned int parent, struct hfsc_class *options,
3156 struct netdev_queue_stats *stats)
3159 struct ofpbuf *reply;
3161 error = tc_query_class(netdev, handle, parent, &reply);
3166 error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
3167 ofpbuf_delete(reply);
/* Fills in 'class' with the qdisc-level HFSC rates taken from 'details'.
 *
 * The "max-rate" value in 'details' is divided by 8 before use (the rates in
 * 'class' are byte-based).  When no usable value is present, the rate is
 * derived from the current features of 'netdev' via
 * netdev_features_to_bps(), which is passed 100 * 1000 * 1000 as its second
 * argument (presumably the fallback bit rate -- confirm).  Both min_rate and
 * max_rate are set to the resulting value. */
3172 hfsc_parse_qdisc_details__(struct netdev *netdev, const struct smap *details,
3173 struct hfsc_class *class)
3176 const char *max_rate_s;
3178 max_rate_s = smap_get(details, "max-rate");
3179 max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
3182 enum netdev_features current;
/* Only the "current" features output is requested. */
3184 netdev_get_features(netdev, &current, NULL, NULL, NULL);
3185 max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
3188 class->min_rate = max_rate;
3189 class->max_rate = max_rate;
/* Fills in 'class' from the "min-rate" and "max-rate" keys of 'details' for a
 * queue on 'netdev'.
 *
 * Both values are divided by 8 before use.  min_rate is forced to at least 1
 * and at most the qdisc-wide hfsc->max_rate; max_rate is forced to at least
 * min_rate and at most hfsc->max_rate. */
3193 hfsc_parse_class_details__(struct netdev *netdev,
3194 const struct smap *details,
3195 struct hfsc_class * class)
3197 const struct hfsc *hfsc;
3198 uint32_t min_rate, max_rate;
3199 const char *min_rate_s, *max_rate_s;
3201 hfsc = hfsc_get__(netdev);
3202 min_rate_s = smap_get(details, "min-rate");
3203 max_rate_s = smap_get(details, "max-rate");
3205 min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
3206 min_rate = MAX(min_rate, 1);
3207 min_rate = MIN(min_rate, hfsc->max_rate);
3209 max_rate = (max_rate_s
3210 ? strtoull(max_rate_s, NULL, 10) / 8
3212 max_rate = MAX(max_rate, min_rate);
3213 max_rate = MIN(max_rate, hfsc->max_rate);
3215 class->min_rate = min_rate;
3216 class->max_rate = max_rate;
3221 /* Create an HFSC qdisc.
3223 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
/* Any qdisc already on 'netdev' is deleted first with tc_del_qdisc().  The
 * new qdisc gets handle 1:0 with the root as its parent.  Unlike the HTB
 * variant, the options struct is placed directly in TCA_OPTIONS rather than
 * in a nested attribute.  Returns the result of tc_transact(). */
3225 hfsc_setup_qdisc__(struct netdev * netdev)
3227 struct tcmsg *tcmsg;
3228 struct ofpbuf request;
3229 struct tc_hfsc_qopt opt;
3231 tc_del_qdisc(netdev);
3233 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3234 NLM_F_EXCL | NLM_F_CREATE, &request);
3240 tcmsg->tcm_handle = tc_make_handle(1, 0);
3241 tcmsg->tcm_parent = TC_H_ROOT;
3243 memset(&opt, 0, sizeof opt);
3246 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3247 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
3249 return tc_transact(&request, NULL);
3252 /* Create an HFSC class.
3254 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
3255 * sc rate <min_rate> ul rate <max_rate>" */
/* The same 'min' curve is sent as both the real-time (RSC) and fair (FSC)
 * curve; 'max' is sent as the upper-limit (USC) curve.  A rate-limited
 * warning is logged if the kernel transaction fails. */
3257 hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
3258 unsigned int parent, struct hfsc_class *class)
3262 struct tcmsg *tcmsg;
3263 struct ofpbuf request;
3264 struct tc_service_curve min, max;
3266 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
3272 tcmsg->tcm_handle = handle;
3273 tcmsg->tcm_parent = parent;
/* m2 carries the rate of each curve. */
3277 min.m2 = class->min_rate;
3281 max.m2 = class->max_rate;
3283 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3284 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3285 nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
3286 nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
3287 nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
3288 nl_msg_end_nested(&request, opt_offset);
3290 error = tc_transact(&request, NULL);
3292 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
3293 "min-rate %ubps, max-rate %ubps (%s)",
3294 netdev_get_name(netdev),
3295 tc_get_major(handle), tc_get_minor(handle),
3296 tc_get_major(parent), tc_get_minor(parent),
3297 class->min_rate, class->max_rate, ovs_strerror(error));
/* Installs HFSC on 'netdev': creates the qdisc, sets up class 1:fffe under
 * 1:0 with the rates parsed from 'details', and then records the resulting
 * max_rate in a new struct hfsc via hfsc_install__(). */
3304 hfsc_tc_install(struct netdev *netdev, const struct smap *details)
3307 struct hfsc_class class;
3309 error = hfsc_setup_qdisc__(netdev);
3315 hfsc_parse_qdisc_details__(netdev, details, &class);
3316 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3317 tc_make_handle(1, 0), &class);
3323 hfsc_install__(netdev, class.max_rate);
/* Rebuilds the local HFSC state for 'netdev' from what the kernel reports:
 * class 1:fffe supplies the qdisc-wide max_rate, then every class in the
 * queue dump that parses successfully is recorded with
 * hfsc_update_queue__().  'nlmsg' is unused.
 *
 * NOTE(review): the return value of hfsc_query_class__() is not checked on
 * the visible lines. */
3328 hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3331 struct nl_dump dump;
3332 struct hfsc_class hc;
3335 hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
3336 hfsc_install__(netdev, hc.max_rate);
3338 if (!start_queue_dump(netdev, &dump)) {
3342 while (nl_dump_next(&dump, &msg)) {
3343 unsigned int queue_id;
/* Classes that fail to parse are skipped. */
3345 if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3346 hfsc_update_queue__(netdev, queue_id, &hc);
3350 nl_dump_done(&dump);
/* Tears down the HFSC state that contains 'tc': every hfsc_class is removed
 * from hfsc->tc.queues, using the _SAFE iterator because entries are removed
 * during iteration.
 *
 * NOTE(review): freeing of the classes and of 'hfsc' itself presumably
 * happens on lines not shown in this listing -- confirm. */
3355 hfsc_tc_destroy(struct tc *tc)
3358 struct hfsc_class *hc, *next;
3360 hfsc = CONTAINER_OF(tc, struct hfsc, tc);
3362 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
3363 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
/* Adds "max-rate" to 'details', reported as 8 times the stored qdisc-wide
 * max_rate. */
3372 hfsc_qdisc_get(const struct netdev *netdev, struct smap *details)
3374 const struct hfsc *hfsc;
3375 hfsc = hfsc_get__(netdev);
3376 smap_add_format(details, "max-rate", "%llu", 8ULL * hfsc->max_rate);
/* Reconfigures the qdisc-level class 1:fffe (parent 1:0) of 'netdev' from
 * 'details' and stores the new max_rate in the local HFSC state.
 *
 * NOTE(review): whether the max_rate update is conditional on success is
 * decided on lines not shown in this listing. */
3381 hfsc_qdisc_set(struct netdev *netdev, const struct smap *details)
3384 struct hfsc_class class;
3386 hfsc_parse_qdisc_details__(netdev, details, &class);
3387 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3388 tc_make_handle(1, 0), &class);
3391 hfsc_get__(netdev)->max_rate = class.max_rate;
/* Describes 'queue' in 'details'.  Rates are reported as 8 times the stored
 * values, and "max-rate" is only added when it differs from "min-rate".
 * 'netdev' is unused. */
3398 hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
3399 const struct tc_queue *queue, struct smap *details)
3401 const struct hfsc_class *hc;
3403 hc = hfsc_class_cast__(queue);
3404 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
3405 if (hc->min_rate != hc->max_rate) {
3406 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
/* Configures queue 'queue_id' on 'netdev' from 'details': parses the details,
 * sets up kernel class 1:<queue_id + 1> under parent 1:fffe, and records the
 * rates locally with hfsc_update_queue__(). */
3412 hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
3413 const struct smap *details)
3416 struct hfsc_class class;
3418 error = hfsc_parse_class_details__(netdev, details, &class);
3423 error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3424 tc_make_handle(1, 0xfffe), &class);
3429 hfsc_update_queue__(netdev, queue_id, &class);
/* Deletes kernel class 1:<queue_id + 1> for 'queue' on 'netdev' and removes
 * the queue from the local map hfsc->tc.queues.
 *
 * NOTE(review): the condition guarding the removal (and any free) is on
 * lines not shown in this listing. */
3434 hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
3438 struct hfsc_class *hc;
3440 hc = hfsc_class_cast__(queue);
3441 hfsc = hfsc_get__(netdev);
3443 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3445 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
/* Fetches statistics for 'queue' into 'stats' by querying kernel class
 * 1:<queue_id + 1> under parent 1:fffe.  No class options are requested.
 * Returns the result of hfsc_query_class__(). */
3452 hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3453 struct netdev_queue_stats *stats)
3455 return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3456 tc_make_handle(1, 0xfffe), NULL, stats);
/* Parses the class described by 'nlmsg' and, if its handle is 1:<minor> with
 * 1 <= minor <= HFSC_N_QUEUES, invokes 'cb' with queue ID minor - 1, the
 * parsed statistics, and 'aux'.  'netdev' is unused. */
3460 hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3461 const struct ofpbuf *nlmsg,
3462 netdev_dump_queue_stats_cb *cb, void *aux)
3464 struct netdev_queue_stats stats;
3465 unsigned int handle, major, minor;
3468 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3473 major = tc_get_major(handle);
3474 minor = tc_get_minor(handle);
3475 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3476 (*cb)(minor - 1, &stats, aux);
/* Operations table for the HFSC traffic-control class ("hfsc" in the kernel,
 * "linux-hfsc" in OVS); every hook is implemented. */
3481 static const struct tc_ops tc_ops_hfsc = {
3482 "hfsc", /* linux_name */
3483 "linux-hfsc", /* ovs_name */
3484 HFSC_N_QUEUES, /* n_queues */
3485 hfsc_tc_install, /* tc_install */
3486 hfsc_tc_load, /* tc_load */
3487 hfsc_tc_destroy, /* tc_destroy */
3488 hfsc_qdisc_get, /* qdisc_get */
3489 hfsc_qdisc_set, /* qdisc_set */
3490 hfsc_class_get, /* class_get */
3491 hfsc_class_set, /* class_set */
3492 hfsc_class_delete, /* class_delete */
3493 hfsc_class_get_stats, /* class_get_stats */
3494 hfsc_class_dump_stats /* class_dump_stats */
3497 /* "linux-default" traffic control class.
3499 * This class represents the default, unnamed Linux qdisc. It corresponds to
3500 * the "" (empty string) QoS type in the OVS database. */
/* Points 'netdev_''s tc at a single function-local static tc object bound to
 * 'tc_ops_default'; nothing is allocated per netdev. */
3503 default_install__(struct netdev *netdev_)
3505 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3506 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_default);
3508 /* Nothing but a tc class implementation is allowed to write to a tc. This
3509 * class never does that, so we can legitimately use a const tc object. */
3510 netdev->tc = CONST_CAST(struct tc *, &tc);
/* Install hook for the default class: attaches the shared default tc to
 * 'netdev'.  'details' is unused. */
3514 default_tc_install(struct netdev *netdev,
3515 const struct smap *details OVS_UNUSED)
3517 default_install__(netdev);
/* Load hook for the default class: attaches the shared default tc to
 * 'netdev'.  'nlmsg' is unused. */
3522 default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3524 default_install__(netdev);
/* Operations table for the default class.  It has no kernel qdisc name and
 * leaves the destroy, qdisc and per-class hooks listed below unset. */
3528 static const struct tc_ops tc_ops_default = {
3529 NULL, /* linux_name */
3534 NULL, /* tc_destroy */
3535 NULL, /* qdisc_get */
3536 NULL, /* qdisc_set */
3537 NULL, /* class_get */
3538 NULL, /* class_set */
3539 NULL, /* class_delete */
3540 NULL, /* class_get_stats */
3541 NULL /* class_dump_stats */
3544 /* "linux-other" traffic control class.
/* Load hook for the "linux-other" class: points 'netdev_''s tc at a single
 * function-local static tc object bound to 'tc_ops_other'.  'nlmsg' is
 * unused. */
3549 other_tc_load(struct netdev *netdev_, struct ofpbuf *nlmsg OVS_UNUSED)
3551 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3552 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_other);
3554 /* Nothing but a tc class implementation is allowed to write to a tc. This
3555 * class never does that, so we can legitimately use a const tc object. */
3556 netdev->tc = CONST_CAST(struct tc *, &tc);
/* Operations table for the "linux-other" class.  It has no kernel qdisc name
 * and no install hook, and leaves the destroy, qdisc and per-class hooks
 * listed below unset. */
3560 static const struct tc_ops tc_ops_other = {
3561 NULL, /* linux_name */
3562 "linux-other", /* ovs_name */
3564 NULL, /* tc_install */
3566 NULL, /* tc_destroy */
3567 NULL, /* qdisc_get */
3568 NULL, /* qdisc_set */
3569 NULL, /* class_get */
3570 NULL, /* class_set */
3571 NULL, /* class_delete */
3572 NULL, /* class_get_stats */
3573 NULL /* class_dump_stats */
3576 /* Traffic control. */
/* The two values below are derived from /proc/net/psched by code further
 * down in this file; ticks_per_s is computed there as a * c / b. */
3578 /* Number of kernel "tc" ticks per second. */
3579 static double ticks_per_s;
3581 /* Number of kernel "jiffies" per second. This is used for the purpose of
3582 * computing buffer sizes. Generally kernel qdiscs need to be able to buffer
3583 * one jiffy's worth of data.
3585 * There are two possibilities here:
3587 * - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
3588 * approximate range of 100 to 1024. That means that we really need to
3589 * make sure that the qdisc can buffer that much data.
3591 * - 'buffer_hz' is an absurdly large number. That means that the kernel
3592 * has finely granular timers and there's no need to fudge additional room
3593 * for buffers. (There's no extra effort needed to implement that: the
3594 * large 'buffer_hz' is used as a divisor, so practically any number will
3595 * come out as 0 in the division. Small integer results in the case of
3596 * really high dividends won't have any real effect anyhow.)
3598 static unsigned int buffer_hz;
3600 /* Returns tc handle 'major':'minor'. */
/* 'major' is shifted into the upper 16 bits before being combined with
 * 'minor' by TC_H_MAKE. */
3602 tc_make_handle(unsigned int major, unsigned int minor)
3604 return TC_H_MAKE(major << 16, minor);
3607 /* Returns the major number from 'handle'. */
/* TC_H_MAJ leaves the major part in place, so it is shifted down 16 bits to
 * undo the shift applied by tc_make_handle(). */
3609 tc_get_major(unsigned int handle)
3611 return TC_H_MAJ(handle) >> 16;
3614 /* Returns the minor number from 'handle'. */
/* No shift is needed, unlike tc_get_major(). */
3616 tc_get_minor(unsigned int handle)
3618 return TC_H_MIN(handle);
/* Starts a traffic-control Netlink request for 'netdev' in 'request'.
 *
 * 'request' is initialized with a Netlink header of the given 'type' and
 * NLM_F_REQUEST | 'flags', followed by a zeroed struct tcmsg whose family is
 * AF_UNSPEC and whose ifindex is that of 'netdev'.  Returns a pointer to the
 * tcmsg so the caller can fill in the handle and parent.
 *
 * NOTE(review): the behavior when get_ifindex() fails is on lines not shown
 * in this listing (presumably a null return -- confirm). */
3621 static struct tcmsg *
3622 tc_make_request(const struct netdev *netdev, int type, unsigned int flags,
3623 struct ofpbuf *request)
3625 struct tcmsg *tcmsg;
3629 error = get_ifindex(netdev, &ifindex);
3634 ofpbuf_init(request, 512);
3635 nl_msg_put_nlmsghdr(request, sizeof *tcmsg, type, NLM_F_REQUEST | flags);
3636 tcmsg = ofpbuf_put_zeros(request, sizeof *tcmsg);
3637 tcmsg->tcm_family = AF_UNSPEC;
3638 tcmsg->tcm_ifindex = ifindex;
3639 /* Caller should fill in tcmsg->tcm_handle. */
3640 /* Caller should fill in tcmsg->tcm_parent. */
/* Sends 'request' over NETLINK_ROUTE with nl_transact(), storing any reply
 * through 'replyp', and then uninitializes 'request' whether or not the
 * transaction succeeded. */
3646 tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
3648 int error = nl_transact(NETLINK_ROUTE, request, replyp);
3649 ofpbuf_uninit(request);
/* Ingress qdisc management.  The qdisc uses handle ffff:0 with parent
 * TC_H_INGRESS, kind "ingress", and an empty TCA_OPTIONS attribute; adding
 * uses NLM_F_EXCL | NLM_F_CREATE, deleting uses no extra flags. */
3653 /* Adds or deletes a root ingress qdisc on 'netdev'. We use this for
3654 * policing configuration.
3656 * This function is equivalent to running the following when 'add' is true:
3657 * /sbin/tc qdisc add dev <devname> handle ffff: ingress
3659 * This function is equivalent to running the following when 'add' is false:
3660 * /sbin/tc qdisc del dev <devname> handle ffff: ingress
3662 * The configuration and stats may be seen with the following command:
3663 * /sbin/tc -s qdisc show dev <devname>
3665 * Returns 0 if successful, otherwise a positive errno value.
3668 tc_add_del_ingress_qdisc(struct netdev *netdev, bool add)
3670 struct ofpbuf request;
3671 struct tcmsg *tcmsg;
3673 int type = add ? RTM_NEWQDISC : RTM_DELQDISC;
3674 int flags = add ? NLM_F_EXCL | NLM_F_CREATE : 0;
3676 tcmsg = tc_make_request(netdev, type, flags, &request);
3680 tcmsg->tcm_handle = tc_make_handle(0xffff, 0);
3681 tcmsg->tcm_parent = TC_H_INGRESS;
3682 nl_msg_put_string(&request, TCA_KIND, "ingress");
3683 nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0);
3685 error = tc_transact(&request, NULL);
3687 /* If we're deleting the qdisc, don't worry about some of the
3688 * error conditions. */
3689 if (!add && (error == ENOENT || error == EINVAL)) {
/* Policer setup.  The byte rate handed to tc_fill_rate() is computed in 64
 * bits: 'kbits_rate' * 1000 does not fit in an int once the rate exceeds
 * roughly 2.1 Gbps, and overflowing a signed int is undefined behavior.
 * The filter is attached to parent ffff: at priority 49 for all protocols
 * (ETH_P_ALL), and any matching traffic over the rate is dropped
 * (TC_POLICE_SHOT). */
3698 /* Adds a policer to 'netdev' with a rate of 'kbits_rate' and a burst size
3701 * This function is equivalent to running:
3702 * /sbin/tc filter add dev <devname> parent ffff: protocol all prio 49
3703 * basic police rate <kbits_rate>kbit burst <kbits_burst>k
3706 * The configuration and stats may be seen with the following command:
3707 * /sbin/tc -s filter show dev <devname> parent ffff:
3709 * Returns 0 if successful, otherwise a positive errno value.
3712 tc_add_policer(struct netdev *netdev, int kbits_rate, int kbits_burst)
3714 struct tc_police tc_police;
3715 struct ofpbuf request;
3716 struct tcmsg *tcmsg;
3717 size_t basic_offset;
3718 size_t police_offset;
3722 memset(&tc_police, 0, sizeof tc_police);
3723 tc_police.action = TC_POLICE_SHOT;
3724 tc_police.mtu = mtu;
3725 tc_fill_rate(&tc_police.rate, ((uint64_t) kbits_rate * 1000)/8, mtu);
3726 tc_police.burst = tc_bytes_to_ticks(tc_police.rate.rate,
3727 kbits_burst * 1024);
3729 tcmsg = tc_make_request(netdev, RTM_NEWTFILTER,
3730 NLM_F_EXCL | NLM_F_CREATE, &request);
3734 tcmsg->tcm_parent = tc_make_handle(0xffff, 0);
3735 tcmsg->tcm_info = tc_make_handle(49,
3736 (OVS_FORCE uint16_t) htons(ETH_P_ALL));
3738 nl_msg_put_string(&request, TCA_KIND, "basic");
3739 basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3740 police_offset = nl_msg_start_nested(&request, TCA_BASIC_POLICE);
3741 nl_msg_put_unspec(&request, TCA_POLICE_TBF, &tc_police, sizeof tc_police);
3742 tc_put_rtab(&request, TCA_POLICE_RATE, &tc_police.rate);
3743 nl_msg_end_nested(&request, police_offset);
3744 nl_msg_end_nested(&request, basic_offset);
3746 error = tc_transact(&request, NULL);
/* Reads the four hexadecimal fields a, b, c, d from /proc/net/psched, at
 * most once (guarded by an ovsthread_once), and derives ticks_per_s as
 * a * c / b.  Warnings are logged if the file cannot be opened or read, or
 * if the parameters are invalid or unexpected; the parameters and resulting
 * ticks_per_s/buffer_hz are logged at debug level. */
3757 /* The values in psched are not individually very meaningful, but they are
3758 * important. The tables below show some values seen in the wild.
3762 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
3763 * (Before that, there are hints that it was 1000000000.)
3765 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
3769 * -----------------------------------
3770 * [1] 000c8000 000f4240 000f4240 00000064
3771 * [2] 000003e8 00000400 000f4240 3b9aca00
3772 * [3] 000003e8 00000400 000f4240 3b9aca00
3773 * [4] 000003e8 00000400 000f4240 00000064
3774 * [5] 000003e8 00000040 000f4240 3b9aca00
3775 * [6] 000003e8 00000040 000f4240 000000f9
3777 * a b c d ticks_per_s buffer_hz
3778 * ------- --------- ---------- ------------- ----------- -------------
3779 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
3780 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3781 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3782 * [4] 1,000 1,024 1,000,000 100 976,562 100
3783 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
3784 * [6] 1,000 64 1,000,000 249 15,625,000 249
3786 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
3787 * [2] 2.6.26-1-686-bigmem from Debian lenny
3788 * [3] 2.6.26-2-sparc64 from Debian lenny
3789 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
3790 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
3791 * [6] 2.6.34 from kernel.org on KVM
3793 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
3794 static const char fn[] = "/proc/net/psched";
3795 unsigned int a, b, c, d;
3798 if (!ovsthread_once_start(&once)) {
3805 stream = fopen(fn, "r");
3807 VLOG_WARN("%s: open failed: %s", fn, ovs_strerror(errno));
3811 if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
3812 VLOG_WARN("%s: read failed", fn);
3816 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
3820 VLOG_WARN("%s: invalid scheduler parameters", fn);
3824 ticks_per_s = (double) a * c / b;
3828 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
3831 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);
3834 ovsthread_once_done(&once);
3837 /* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
3838 * rate of 'rate' bytes per second. */
/* The product is formed in unsigned long long, as tc_bytes_to_ticks() does:
 * 'rate' * 'ticks' easily exceeds 32 bits (e.g. 125,000,000 bytes/s times a
 * few dozen ticks) and would otherwise wrap before the division. */
3840 tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
3843 return ((unsigned long long int) rate * ticks) / ticks_per_s;
3846 /* Returns the number of ticks that it would take to transmit 'size' bytes at a
3847 * rate of 'rate' bytes per second. */
/* ticks_per_s is truncated to an unsigned long long before the multiply, so
 * the product is an integer computed in 64 bits.  A zero 'rate' yields 0
 * instead of dividing by zero. */
3849 tc_bytes_to_ticks(unsigned int rate, unsigned int size)
3852 return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
3855 /* Returns the number of bytes that need to be reserved for qdisc buffering at
3856 * a transmission rate of 'rate' bytes per second. */
/* Integer division: a 'buffer_hz' larger than 'rate' gives 0.
 *
 * NOTE(review): relies on 'buffer_hz' being nonzero by the time this runs;
 * where that is guaranteed is not visible in this listing. */
3858 tc_buffer_per_jiffy(unsigned int rate)
3861 return rate / buffer_hz;
3864 /* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
3865 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
3866 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
3867 * stores NULL into it if it is absent.
3869 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
3872 * Returns 0 if successful, otherwise a positive errno value. */
/* In the parse policy TCA_KIND is a required string and TCA_OPTIONS an
 * optional nested attribute.  Attributes are parsed starting after the
 * Netlink header and the struct tcmsg; a parse failure is logged with a
 * rate-limited warning. */
3874 tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
3875 struct nlattr **options)
3877 static const struct nl_policy tca_policy[] = {
3878 [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
3879 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
3881 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3883 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3884 tca_policy, ta, ARRAY_SIZE(ta))) {
3885 VLOG_WARN_RL(&rl, "failed to parse qdisc message");
3890 *kind = nl_attr_get_string(ta[TCA_KIND]);
3894 *options = ta[TCA_OPTIONS];
/* Parses a traffic-control class message.  TCA_OPTIONS and TCA_STATS2 are
 * both required (non-optional) nested attributes.  The nested TCA_STATS2 must
 * in turn carry TCA_STATS_BASIC and TCA_STATS_QUEUE, each with a minimum
 * length equal to the size of the structure it is copied into or read
 * through.
 *
 * NOTE(review): the comment below speaks of '*queue_id', but the parameter is
 * 'handlep' and the visible code stores the whole 'tcm_handle' in it, not
 * only a minor number.
 * NOTE(review): the guards around each output argument and the error path
 * are on lines not present in this listing; the memset() of '*stats' at the
 * end presumably belongs to that error path -- confirm. */
3909 /* Given Netlink 'msg' that describes a class, extracts the queue ID (e.g. the
3910 * minor number of its class ID) into '*queue_id', its TCA_OPTIONS attribute
3911 * into '*options', and its queue statistics into '*stats'. Any of the output
3912 * arguments may be null.
3914 * Returns 0 if successful, otherwise a positive errno value. */
3916 tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
3917 struct nlattr **options, struct netdev_queue_stats *stats)
3919 static const struct nl_policy tca_policy[] = {
3920 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
3921 [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
3923 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3925 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3926 tca_policy, ta, ARRAY_SIZE(ta))) {
3927 VLOG_WARN_RL(&rl, "failed to parse class message");
/* The tcmsg is read at offset NLMSG_HDRLEN, i.e. right after the Netlink
 * header. */
3932 struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
3933 *handlep = tc->tcm_handle;
3937 *options = ta[TCA_OPTIONS];
3941 const struct gnet_stats_queue *gsq;
3942 struct gnet_stats_basic gsb;
3944 static const struct nl_policy stats_policy[] = {
3945 [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
3946 .min_len = sizeof gsb },
3947 [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
3948 .min_len = sizeof *gsq },
3950 struct nlattr *sa[ARRAY_SIZE(stats_policy)];
3952 if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
3953 sa, ARRAY_SIZE(sa))) {
3954 VLOG_WARN_RL(&rl, "failed to parse class stats");
3958 /* Alignment issues screw up the length of struct gnet_stats_basic on
3959 * some arch/bitsize combinations. Newer versions of Linux have a
3960 * struct gnet_stats_basic_packed, but we can't depend on that. The
3961 * easiest thing to do is just to make a copy. */
3962 memset(&gsb, 0, sizeof gsb);
3963 memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
3964 MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
/* Byte and packet counters come from the zero-filled local copy. */
3965 stats->tx_bytes = gsb.bytes;
3966 stats->tx_packets = gsb.packets;
/* The queue's drop count is reported as transmit errors. */
3968 gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
3969 stats->tx_errors = gsq->drops;
3979 memset(stats, 0, sizeof *stats);
/* Builds an RTM_GETTCLASS request (with flag NLM_F_ECHO) for 'netdev', fills
 * in 'handle' and 'parent', and sends it with tc_transact(), passing 'replyp'
 * to receive the reply.  A failure is logged, rate-limited, with the handle
 * and the parent each printed as a major:minor pair.
 *
 * NOTE(review): the end of the original comment, the return type, the check
 * on tc_make_request()'s result and the return statement are on lines not
 * present in this listing. */
3984 /* Queries the kernel for class with identifier 'handle' and parent 'parent'
3987 tc_query_class(const struct netdev *netdev,
3988 unsigned int handle, unsigned int parent,
3989 struct ofpbuf **replyp)
3991 struct ofpbuf request;
3992 struct tcmsg *tcmsg;
3995 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO, &request);
3999 tcmsg->tcm_handle = handle;
4000 tcmsg->tcm_parent = parent;
4002 error = tc_transact(&request, replyp);
4004 VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
4005 netdev_get_name(netdev),
4006 tc_get_major(handle), tc_get_minor(handle),
4007 tc_get_major(parent), tc_get_minor(parent),
4008 ovs_strerror(error));
/* Builds an RTM_DELTCLASS request for 'netdev' that names class 'handle'
 * (the parent field is set to 0) and sends it with tc_transact(), passing
 * NULL as the reply pointer.  A failure is logged, rate-limited, with the
 * handle printed as major:minor.
 *
 * NOTE(review): the return type, the check on tc_make_request()'s result and
 * the return statement are on lines not present in this listing. */
4013 /* Equivalent to "tc class del dev <name> handle <handle>". */
4015 tc_delete_class(const struct netdev *netdev, unsigned int handle)
4017 struct ofpbuf request;
4018 struct tcmsg *tcmsg;
4021 tcmsg = tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
4025 tcmsg->tcm_handle = handle;
4026 tcmsg->tcm_parent = 0;
4028 error = tc_transact(&request, NULL);
4030 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
4031 netdev_get_name(netdev),
4032 tc_get_major(handle), tc_get_minor(handle),
4033 ovs_strerror(error));
/* Asks the kernel, via RTM_DELQDISC, to delete the qdisc with handle 1:0
 * whose parent is TC_H_ROOT on 'netdev_'.  If the transaction ends without
 * error and a tc object is attached to the netdev, that object's tc_destroy
 * callback is invoked when its ops table provides one.
 *
 * NOTE(review): what is done inside the EINVAL branch, whether 'netdev->tc'
 * is cleared afterwards, and the return statement are on lines not present
 * in this listing. */
4038 /* Equivalent to "tc qdisc del dev <name> root". */
4040 tc_del_qdisc(struct netdev *netdev_)
4042 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4043 struct ofpbuf request;
4044 struct tcmsg *tcmsg;
4047 tcmsg = tc_make_request(netdev_, RTM_DELQDISC, 0, &request);
4051 tcmsg->tcm_handle = tc_make_handle(1, 0);
4052 tcmsg->tcm_parent = TC_H_ROOT;
4054 error = tc_transact(&request, NULL);
4055 if (error == EINVAL) {
4056 /* EINVAL probably means that the default qdisc was in use, in which
4057 * case we've accomplished our purpose. */
4060 if (!error && netdev->tc) {
/* tc_destroy is optional: it is called only when the pointer is set. */
4061 if (netdev->tc->ops->tc_destroy) {
4062 netdev->tc->ops->tc_destroy(netdev->tc);
/* Sends an RTM_GETQDISC request (flag NLM_F_ECHO) for the qdisc with handle
 * 1:0 and chooses a 'struct tc_ops' from the outcome:
 *
 *   - reply received: the qdisc kind is parsed with tc_parse_qdisc() and
 *     looked up with tc_lookup_linux_name(); tc_ops_other is the fallback
 *     (an unknown kind is also logged at INFO level);
 *   - ENOENT: tc_ops_default;
 *   - any other error: logged, then tc_ops_other.
 *
 * The chosen ops' tc_load() is then run on the netdev with the reply, the
 * reply buffer is freed, and the query/parse error is returned in preference
 * to the load error.
 *
 * NOTE(review): the conditions that separate the assignments to 'ops' after
 * tc_parse_qdisc(), and the early exit for an already-known qdisc that the
 * comment below implies, are on lines not present in this listing. */
4069 /* If 'netdev''s qdisc type and parameters are not yet known, queries the
4070 * kernel to determine what they are. Returns 0 if successful, otherwise a
4071 * positive errno value. */
4073 tc_query_qdisc(const struct netdev *netdev_)
4075 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4076 struct ofpbuf request, *qdisc;
4077 const struct tc_ops *ops;
4078 struct tcmsg *tcmsg;
4086 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
4087 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
4088 * 2.6.35 without that fix backported to it.
4090 * To avoid the OOPS, we must not make a request that would attempt to dump
4091 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
4092 * few others. There are a few ways that I can see to do this, but most of
4093 * them seem to be racy (and if you lose the race the kernel OOPSes). The
4094 * technique chosen here is to assume that any non-default qdisc that we
4095 * create will have a class with handle 1:0. The built-in qdiscs only have
4096 * a class with handle 0:0.
4098 * We could check for Linux 2.6.35+ and use a more straightforward method
4100 tcmsg = tc_make_request(netdev_, RTM_GETQDISC, NLM_F_ECHO, &request);
4104 tcmsg->tcm_handle = tc_make_handle(1, 0);
4105 tcmsg->tcm_parent = 0;
4107 /* Figure out what tc class to instantiate. */
4108 error = tc_transact(&request, &qdisc);
4112 error = tc_parse_qdisc(qdisc, &kind, NULL);
4114 ops = &tc_ops_other;
4116 ops = tc_lookup_linux_name(kind);
4118 static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
4119 VLOG_INFO_RL(&rl2, "unknown qdisc \"%s\"", kind);
4121 ops = &tc_ops_other;
4124 } else if (error == ENOENT) {
4125 /* Either it's a built-in qdisc, or it's a qdisc set up by some
4126 * other entity that doesn't have a handle 1:0. We will assume
4127 * that it's the system default qdisc. */
4128 ops = &tc_ops_default;
4131 /* Who knows? Maybe the device got deleted. */
4132 VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
4133 netdev_get_name(netdev_), ovs_strerror(error));
4134 ops = &tc_ops_other;
4137 /* Instantiate it. */
4138 load_error = ops->tc_load(CONST_CAST(struct netdev *, netdev_), qdisc);
/* Invariant: tc_load() leaves netdev->tc set exactly when it reports
 * success. */
4139 ovs_assert((load_error == 0) == (netdev->tc != NULL));
4140 ofpbuf_delete(qdisc);
/* An error from the query or parse step takes precedence over a load
 * error. */
4142 return error ? error : load_error;
/* Computes the "cell log" used to index a 256-entry rate table.
 *
 * The Ethernet and VLAN header lengths are added to 'mtu', and 'cell_log'
 * counts how many loop iterations it takes for the adjusted value to fall
 * below 256.
 *
 * NOTE(review): the loop body and the condition under which 'mtu' is
 * replaced by ETH_PAYLOAD_MAX are on lines not present in this listing;
 * presumably the body shifts 'mtu' right by one bit and the replacement
 * applies when 'mtu' is 0 -- confirm. */
4145 /* Linux traffic control uses tables with 256 entries ("rtab" tables) to
4146 approximate the time to transmit packets of various lengths. For an MTU of
4147 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
4148 represents two possible packet lengths; for a MTU of 513 through 1024, four
4149 possible lengths; and so on.
4151 Returns, for the specified 'mtu', the number of bits that packet lengths
4152 need to be shifted right to fit within such a 256-entry table. */
4154 tc_calc_cell_log(unsigned int mtu)
4159 mtu = ETH_PAYLOAD_MAX;
/* Account for the Ethernet and VLAN headers on top of the MTU. */
4161 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
4163 for (cell_log = 0; mtu >= 256; cell_log++) {
/* Zeroes '*rate', then sets its 'cell_log' from tc_calc_cell_log(mtu) and its
 * 'mpu' to ETH_TOTAL_MIN.  The 'overhead' and 'cell_align' members are
 * deliberately not assigned by name (see the commented-out lines); the
 * memset() already leaves every byte of '*rate' zero.
 *
 * NOTE(review): the end of the original comment, the return type and the
 * statement that stores 'Bps' into '*rate' are on lines not present in this
 * listing. */
4170 /* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
4173 tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
4175 memset(rate, 0, sizeof *rate);
4176 rate->cell_log = tc_calc_cell_log(mtu);
4177 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
4178 /* rate->cell_align = 0; */ /* distro headers. */
4179 rate->mpu = ETH_TOTAL_MIN;
/* Reserves TC_RTAB_SIZE bytes of uninitialized attribute payload of type
 * 'type' in 'msg' and fills every table element.  Element 'i' holds the
 * number of ticks, at 'rate->rate', needed for a packet of
 * (i + 1) << rate->cell_log bytes, with sizes below 'rate->mpu' raised to
 * 'rate->mpu' first.
 *
 * NOTE(review): the declarations of 'rtab' and 'i' and the return type are
 * on lines not present in this listing. */
4183 /* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
4184 * attribute of the specified "type".
4186 * See tc_calc_cell_log() above for a description of "rtab"s. */
4188 tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
4193 rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
4194 for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
/* Packet size that table slot 'i' stands for. */
4195 unsigned packet_size = (i + 1) << rate->cell_log;
/* Never compute a cost for less than the minimum packet unit. */
4196 if (packet_size < rate->mpu) {
4197 packet_size = rate->mpu;
4199 rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
/* Picks the burst size to use -- the larger of 'burst_bytes' and a floor of
 * tc_buffer_per_jiffy(Bps) plus 'mtu' -- and converts it to ticks at rate
 * 'Bps' with tc_bytes_to_ticks().
 *
 * NOTE(review): tc_bytes_to_ticks() takes its size as 'unsigned int', while
 * 'burst_bytes' is 64 bits wide; a burst that does not fit in 'unsigned int'
 * would presumably be truncated on that call -- confirm MAX()'s result type
 * and whether callers can pass such values.
 * NOTE(review): the end of the original comment and the return type are on
 * lines not present in this listing. */
4203 /* Calculates the proper value of 'buffer' or 'cbuffer' in HTB options given a
4204 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
4205 * burst size of 'burst_bytes'. (If no value was requested, a 'burst_bytes' of
4208 tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
4210 unsigned int min_burst = tc_buffer_per_jiffy(Bps) + mtu;
4211 return tc_bytes_to_ticks(Bps, MAX(burst_bytes, min_burst));
4214 /* Linux-only functions declared in netdev-linux.h */
/* Runs netdev_linux_init() first.  If that reports an error, the error is
 * returned negated; otherwise the file-scope 'af_inet_sock' is returned. */
4216 /* Returns a fd for an AF_INET socket or a negative errno value. */
4218 netdev_linux_get_af_inet_sock(void)
4220 int error = netdev_linux_init();
4221 return error ? -error : af_inet_sock;
/* Read-modify-write of a device's ethtool flags in three ethtool calls:
 * ETHTOOL_GFLAGS to fetch the current flags, ETHTOOL_SFLAGS to store the
 * flags with 'flag' set or cleared, and ETHTOOL_GFLAGS again to read them
 * back.  If the value read back differs from the value written, a
 * rate-limited warning naming 'flag_name' is logged.
 *
 * NOTE(review): the error checks after each call and the return statements
 * are on lines not present in this listing. */
4224 /* Modifies the 'flag' bit in ethtool's flags field for 'netdev'. If
4225 * 'enable' is true, the bit is set. Otherwise, it is cleared. */
4227 netdev_linux_ethtool_set_flag(struct netdev *netdev, uint32_t flag,
4228 const char *flag_name, bool enable)
4230 const char *netdev_name = netdev_get_name(netdev);
4231 struct ethtool_value evalue;
/* Step 1: fetch the current flags. */
4235 COVERAGE_INC(netdev_get_ethtool);
4236 memset(&evalue, 0, sizeof evalue);
4237 error = netdev_linux_do_ethtool(netdev_name,
4238 (struct ethtool_cmd *)&evalue,
4239 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
/* Step 2: clear 'flag', set it again if 'enable', and write the result. */
4244 COVERAGE_INC(netdev_set_ethtool);
4245 evalue.data = new_flags = (evalue.data & ~flag) | (enable ? flag : 0);
4246 error = netdev_linux_do_ethtool(netdev_name,
4247 (struct ethtool_cmd *)&evalue,
4248 ETHTOOL_SFLAGS, "ETHTOOL_SFLAGS");
/* Step 3: read the flags back to see whether the write took effect. */
4253 COVERAGE_INC(netdev_get_ethtool);
4254 memset(&evalue, 0, sizeof evalue);
4255 error = netdev_linux_do_ethtool(netdev_name,
4256 (struct ethtool_cmd *)&evalue,
4257 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
4262 if (new_flags != evalue.data) {
4263 VLOG_WARN_RL(&rl, "attempt to %s ethtool %s flag on network "
4264 "device %s failed", enable ? "enable" : "disable",
4265 flag_name, netdev_name);
4272 /* Utility functions. */
/* Assigns each of 21 counters in '*src' to the member of the same name in
 * '*dst': packet, byte, error and drop totals for both directions, multicast
 * and collision counts, and the detailed receive and transmit error
 * counters.  Only these members of '*dst' are written here.
 *
 * NOTE(review): the "format conversion" is whatever implicit conversion the
 * assignments perform; the member types of the two structures are not
 * visible in this listing. */
4274 /* Copies 'src' into 'dst', performing format conversion in the process. */
4276 netdev_stats_from_rtnl_link_stats(struct netdev_stats *dst,
4277 const struct rtnl_link_stats *src)
4279 dst->rx_packets = src->rx_packets;
4280 dst->tx_packets = src->tx_packets;
4281 dst->rx_bytes = src->rx_bytes;
4282 dst->tx_bytes = src->tx_bytes;
4283 dst->rx_errors = src->rx_errors;
4284 dst->tx_errors = src->tx_errors;
4285 dst->rx_dropped = src->rx_dropped;
4286 dst->tx_dropped = src->tx_dropped;
4287 dst->multicast = src->multicast;
4288 dst->collisions = src->collisions;
4289 dst->rx_length_errors = src->rx_length_errors;
4290 dst->rx_over_errors = src->rx_over_errors;
4291 dst->rx_crc_errors = src->rx_crc_errors;
4292 dst->rx_frame_errors = src->rx_frame_errors;
4293 dst->rx_fifo_errors = src->rx_fifo_errors;
4294 dst->rx_missed_errors = src->rx_missed_errors;
4295 dst->tx_aborted_errors = src->tx_aborted_errors;
4296 dst->tx_carrier_errors = src->tx_carrier_errors;
4297 dst->tx_fifo_errors = src->tx_fifo_errors;
4298 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
4299 dst->tx_window_errors = src->tx_window_errors;
/* Fetches link statistics for the interface with index 'ifindex' over
 * NETLINK_ROUTE.  An RTM_GETLINK request carrying a zeroed 'struct ifinfomsg'
 * (family PF_UNSPEC, index 'ifindex') is sent with nl_transact().  The reply
 * is parsed for IFLA_IFNAME (required) and IFLA_STATS (optional, at least
 * sizeof(struct rtnl_link_stats) bytes).  When IFLA_STATS is present it is
 * converted into '*stats'; when it is absent a rate-limited warning is
 * logged.  The reply buffer is freed on each visible path.
 *
 * NOTE(review): the return type and the values returned on each path are on
 * lines not present in this listing. */
4303 get_stats_via_netlink(int ifindex, struct netdev_stats *stats)
4305 /* Policy for RTNLGRP_LINK messages.
4307 * There are *many* more fields in these messages, but currently we only
4308 * care about these fields. */
4309 static const struct nl_policy rtnlgrp_link_policy[] = {
4310 [IFLA_IFNAME] = { .type = NL_A_STRING, .optional = false },
4311 [IFLA_STATS] = { .type = NL_A_UNSPEC, .optional = true,
4312 .min_len = sizeof(struct rtnl_link_stats) },
4315 struct ofpbuf request;
4316 struct ofpbuf *reply;
4317 struct ifinfomsg *ifi;
4318 struct nlattr *attrs[ARRAY_SIZE(rtnlgrp_link_policy)];
/* Build the request: Netlink header followed by a zeroed ifinfomsg. */
4321 ofpbuf_init(&request, 0);
4322 nl_msg_put_nlmsghdr(&request, sizeof *ifi, RTM_GETLINK, NLM_F_REQUEST);
4323 ifi = ofpbuf_put_zeros(&request, sizeof *ifi);
4324 ifi->ifi_family = PF_UNSPEC;
4325 ifi->ifi_index = ifindex;
4326 error = nl_transact(NETLINK_ROUTE, &request, &reply);
/* The request buffer is released as soon as the transaction returns. */
4327 ofpbuf_uninit(&request);
4332 if (!nl_policy_parse(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg),
4333 rtnlgrp_link_policy,
4334 attrs, ARRAY_SIZE(rtnlgrp_link_policy))) {
4335 ofpbuf_delete(reply);
/* IFLA_STATS is optional in the policy, so its absence must be checked. */
4339 if (!attrs[IFLA_STATS]) {
4340 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
4341 ofpbuf_delete(reply);
4345 netdev_stats_from_rtnl_link_stats(stats, nl_attr_get(attrs[IFLA_STATS]));
4347 ofpbuf_delete(reply);
/* Reads statistics for 'netdev_name' from "/proc/net/dev".  The file is read
 * line by line with fgets(); each parsed line is expected to yield 15
 * converted fields, otherwise a rate-limited parse-error warning with the
 * line number is logged.  For the line whose device name equals
 * 'netdev_name', the seven counters listed at the end of the block are set
 * to UINT64_MAX.  A warning is logged when the open fails, and a "no stats"
 * warning exists for the device.
 *
 * NOTE(review): the local declarations, most of the parsed fields, the
 * stream close, and all return statements are on lines not present in this
 * listing.  The UINT64_MAX members presumably mark counters this file does
 * not report -- confirm. */
4353 get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats)
4355 static const char fn[] = "/proc/net/dev";
4360 stream = fopen(fn, "r");
4362 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, ovs_strerror(errno));
4367 while (fgets(line, sizeof line, stream)) {
4370 #define X64 "%"SCNu64
4373 X64 X64 X64 X64 X64 X64 X64 "%*u"
4374 X64 X64 X64 X64 X64 X64 X64 "%*u",
4380 &stats->rx_fifo_errors,
4381 &stats->rx_frame_errors,
4387 &stats->tx_fifo_errors,
4389 &stats->tx_carrier_errors) != 15) {
4390 VLOG_WARN_RL(&rl, "%s:%d: parse error", fn, ln);
4391 } else if (!strcmp(devname, netdev_name)) {
4392 stats->rx_length_errors = UINT64_MAX;
4393 stats->rx_over_errors = UINT64_MAX;
4394 stats->rx_crc_errors = UINT64_MAX;
4395 stats->rx_missed_errors = UINT64_MAX;
4396 stats->tx_aborted_errors = UINT64_MAX;
4397 stats->tx_heartbeat_errors = UINT64_MAX;
4398 stats->tx_window_errors = UINT64_MAX;
4404 VLOG_WARN_RL(&rl, "%s: no stats for %s", fn, netdev_name);
/* Fetches the interface flags of 'dev' with the SIOCGIFFLAGS ioctl, issued
 * through netdev_linux_do_ioctl() on the device's name, and stores
 * 'ifr.ifr_flags' into '*flags'.
 *
 * NOTE(review): the declaration of 'ifr', the check on 'error' that
 * presumably guards the store, and the return statement are on lines not
 * present in this listing. */
4410 get_flags(const struct netdev *dev, unsigned int *flags)
4416 error = netdev_linux_do_ioctl(dev->name, &ifr, SIOCGIFFLAGS,
4419 *flags = ifr.ifr_flags;
/* Sets the interface flags of the device called 'name' to 'flags' with the
 * SIOCSIFFLAGS ioctl and returns netdev_linux_do_ioctl()'s result.
 *
 * NOTE(review): 'flags' is 'unsigned int' and is assigned to 'ifr.ifr_flags'
 * by implicit conversion; the declaration of 'ifr' is on a line not present
 * in this listing. */
4425 set_flags(const char *name, unsigned int flags)
4429 ifr.ifr_flags = flags;
4430 return netdev_linux_do_ioctl(name, &ifr, SIOCSIFFLAGS, "SIOCSIFFLAGS");
/* Looks up the interface index of 'netdev_name' with the SIOCGIFINDEX ioctl
 * on 'af_inet_sock'.  On success returns 'ifr.ifr_ifindex'.  On failure a
 * rate-limited warning with the errno text is logged.
 *
 * NOTE(review): the value returned on failure is on a line not present in
 * this listing; get_ifindex() negates a failing result to obtain an errno,
 * which suggests a negative errno value -- confirm. */
4434 do_get_ifindex(const char *netdev_name)
4438 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4439 COVERAGE_INC(netdev_get_ifindex);
4440 if (ioctl(af_inet_sock, SIOCGIFINDEX, &ifr) < 0) {
4441 VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
4442 netdev_name, ovs_strerror(errno));
4445 return ifr.ifr_ifindex;
/* Returns the cached interface index of 'netdev_' through '*ifindexp',
 * together with the cached lookup error as the return value.
 *
 * When VALID_IFINDEX is not yet set in 'cache_valid', do_get_ifindex() is
 * called once.  The visible assignments record either an error (the negated
 * result, with 'ifindex' stored as 0) or a success (error 0 and the index),
 * and VALID_IFINDEX is then set.
 *
 * NOTE(review): the condition that chooses between the two outcomes is on a
 * line not present in this listing; presumably it tests for a negative
 * result -- confirm. */
4449 get_ifindex(const struct netdev *netdev_, int *ifindexp)
4451 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4453 if (!(netdev->cache_valid & VALID_IFINDEX)) {
4454 int ifindex = do_get_ifindex(netdev_get_name(netdev_));
4457 netdev->get_ifindex_error = -ifindex;
4458 netdev->ifindex = 0;
4460 netdev->get_ifindex_error = 0;
4461 netdev->ifindex = ifindex;
4463 netdev->cache_valid |= VALID_IFINDEX;
4466 *ifindexp = netdev->ifindex;
4467 return netdev->get_ifindex_error;
/* Reads the hardware address of 'netdev_name' with the SIOCGIFHWADDR ioctl
 * on 'af_inet_sock' and copies its first ETH_ADDR_LEN bytes into 'ea'.
 *
 * An ioctl failure is logged at INFO level for ENODEV and at ERR level
 * otherwise.  An address family other than AF_UNSPEC or ARPHRD_ETHER is
 * logged as a warning.
 *
 * NOTE(review): the return values, and whether the copy into 'ea' is skipped
 * after the family warning, are on lines not present in this listing. */
4471 get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN])
4476 memset(&ifr, 0, sizeof ifr);
4477 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4478 COVERAGE_INC(netdev_get_hwaddr);
4479 if (ioctl(af_inet_sock, SIOCGIFHWADDR, &ifr) < 0) {
4480 /* ENODEV probably means that a vif disappeared asynchronously and
4481 * hasn't been removed from the database yet, so reduce the log level
4482 * to INFO for that case. */
4483 VLOG(errno == ENODEV ? VLL_INFO : VLL_ERR,
4484 "ioctl(SIOCGIFHWADDR) on %s device failed: %s",
4485 netdev_name, ovs_strerror(errno));
/* The kernel reports the address family alongside the address bytes. */
4488 hwaddr_family = ifr.ifr_hwaddr.sa_family;
4489 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER) {
4490 VLOG_WARN("%s device has unknown hardware address family %d",
4491 netdev_name, hwaddr_family);
4493 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
/* Sets the hardware address of 'netdev_name' to 'mac' with the SIOCSIFHWADDR
 * ioctl on 'af_inet_sock'.  The request is zeroed, given the device name,
 * tagged with family ARPHRD_ETHER, and loaded with the ETH_ADDR_LEN bytes of
 * 'mac'.  A failure is logged at ERR level with the errno text.
 *
 * NOTE(review): the return type and return values are on lines not present
 * in this listing. */
4498 set_etheraddr(const char *netdev_name,
4499 const uint8_t mac[ETH_ADDR_LEN])
4503 memset(&ifr, 0, sizeof ifr);
4504 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4505 ifr.ifr_hwaddr.sa_family = ARPHRD_ETHER;
4506 memcpy(ifr.ifr_hwaddr.sa_data, mac, ETH_ADDR_LEN);
4507 COVERAGE_INC(netdev_set_hwaddr);
4508 if (ioctl(af_inet_sock, SIOCSIFHWADDR, &ifr) < 0) {
4509 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
4510 netdev_name, ovs_strerror(errno));
/* Issues an ethtool request for the device called 'name' through the
 * SIOCETHTOOL ioctl on 'af_inet_sock', with 'ecmd' attached as the request
 * payload ('ifr.ifr_data').  'cmd_name' is used only in the log message.
 *
 * Failures other than EOPNOTSUPP are logged as rate-limited warnings;
 * EOPNOTSUPP is deliberately silent.
 *
 * NOTE(review): where 'cmd' is stored into '*ecmd', and the return values,
 * are on lines not present in this listing. */
4517 netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
4518 int cmd, const char *cmd_name)
4522 memset(&ifr, 0, sizeof ifr);
4523 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
4524 ifr.ifr_data = (caddr_t) ecmd;
4527 if (ioctl(af_inet_sock, SIOCETHTOOL, &ifr) == 0) {
4530 if (errno != EOPNOTSUPP) {
4531 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
4532 "failed: %s", cmd_name, name, ovs_strerror(errno));
4534 /* The device doesn't support this operation. That's pretty
4535 * common, so there's no point in logging anything. */
/* Copies 'name' into 'ifr->ifr_name' and issues ioctl 'cmd' with 'ifr' on
 * 'af_inet_sock'.  The other members of '*ifr' are left as the caller set
 * them.  A failure (ioctl() returning -1) is logged at debug level,
 * rate-limited, using 'cmd_name' and the errno text.
 *
 * NOTE(review): the return type and return values are on lines not present
 * in this listing. */
4542 netdev_linux_do_ioctl(const char *name, struct ifreq *ifr, int cmd,
4543 const char *cmd_name)
4545 ovs_strzcpy(ifr->ifr_name, name, sizeof ifr->ifr_name);
4546 if (ioctl(af_inet_sock, cmd, ifr) == -1) {
4547 VLOG_DBG_RL(&rl, "%s: ioctl(%s) failed: %s", name, cmd_name,
4548 ovs_strerror(errno));
/* Runs the address-returning ioctl 'cmd' on 'netdev' with the request's
 * address family preset to AF_INET, and stores the 'sin_addr' of the
 * resulting 'struct sockaddr_in' into '*ip'.  'cmd_name' is passed on to
 * netdev_linux_do_ioctl() for logging.
 *
 * NOTE(review): the declaration of 'ifr', the expression cast to
 * 'struct sockaddr_in *', the check on 'error' that presumably guards the
 * store, and the return statement are on lines not present in this
 * listing. */
4555 netdev_linux_get_ipv4(const struct netdev *netdev, struct in_addr *ip,
4556 int cmd, const char *cmd_name)
4561 ifr.ifr_addr.sa_family = AF_INET;
4562 error = netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, cmd, cmd_name);
4564 const struct sockaddr_in *sin = ALIGNED_CAST(struct sockaddr_in *,
4566 *ip = sin->sin_addr;
4571 /* Returns an AF_PACKET raw socket or a negative errno value. */
4573 af_packet_sock(void)
4575 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
4578 if (ovsthread_once_start(&once)) {
4579 sock = socket(AF_PACKET, SOCK_RAW, 0);
4581 int error = set_nonblocking(sock);
4588 VLOG_ERR("failed to create packet socket: %s",
4589 ovs_strerror(errno));
4591 ovsthread_once_done(&once);