2 * Copyright (c) 2009, 2010, 2011, 2012, 2013 Nicira, Inc.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
19 #include "netdev-linux.h"
23 #include <arpa/inet.h>
25 #include <linux/filter.h>
26 #include <linux/gen_stats.h>
27 #include <linux/if_ether.h>
28 #include <linux/if_tun.h>
29 #include <linux/types.h>
30 #include <linux/ethtool.h>
31 #include <linux/mii.h>
32 #include <linux/pkt_cls.h>
33 #include <linux/pkt_sched.h>
34 #include <linux/rtnetlink.h>
35 #include <linux/sockios.h>
36 #include <linux/version.h>
37 #include <sys/types.h>
38 #include <sys/ioctl.h>
39 #include <sys/socket.h>
40 #include <netpacket/packet.h>
42 #include <net/if_arp.h>
43 #include <net/if_packet.h>
44 #include <net/route.h>
45 #include <netinet/in.h>
52 #include "dpif-linux.h"
53 #include "dynamic-string.h"
54 #include "fatal-signal.h"
57 #include "netdev-provider.h"
58 #include "netdev-vport.h"
59 #include "netlink-notifier.h"
60 #include "netlink-socket.h"
63 #include "openflow/openflow.h"
65 #include "poll-loop.h"
66 #include "rtnetlink-link.h"
68 #include "socket-util.h"
71 #include "unaligned.h"
74 VLOG_DEFINE_THIS_MODULE(netdev_linux);
76 COVERAGE_DEFINE(netdev_set_policing);
77 COVERAGE_DEFINE(netdev_arp_lookup);
78 COVERAGE_DEFINE(netdev_get_ifindex);
79 COVERAGE_DEFINE(netdev_get_hwaddr);
80 COVERAGE_DEFINE(netdev_set_hwaddr);
81 COVERAGE_DEFINE(netdev_get_ethtool);
82 COVERAGE_DEFINE(netdev_set_ethtool);
85 /* These were introduced in Linux 2.6.14, so they might be missing if we have
87 #ifndef ADVERTISED_Pause
88 #define ADVERTISED_Pause (1 << 13)
90 #ifndef ADVERTISED_Asym_Pause
91 #define ADVERTISED_Asym_Pause (1 << 14)
94 /* These were introduced in Linux 2.6.24, so they might be missing if we
95 * have old headers. */
96 #ifndef ETHTOOL_GFLAGS
97 #define ETHTOOL_GFLAGS 0x00000025 /* Get flags bitmap(ethtool_value) */
99 #ifndef ETHTOOL_SFLAGS
100 #define ETHTOOL_SFLAGS 0x00000026 /* Set flags bitmap(ethtool_value) */
103 /* This was introduced in Linux 2.6.25, so it might be missing if we have old
106 #define TC_RTAB_SIZE 1024
109 static struct nln_notifier *netdev_linux_cache_notifier = NULL;
110 static int cache_notifier_refcount;
113 VALID_IFINDEX = 1 << 0,
114 VALID_ETHERADDR = 1 << 1,
118 VALID_POLICING = 1 << 5,
119 VALID_VPORT_STAT_ERROR = 1 << 6,
120 VALID_DRVINFO = 1 << 7,
121 VALID_FEATURES = 1 << 8,
128 /* Traffic control. */
130 /* An instance of a traffic control class. Always associated with a particular
133 * Each TC implementation subclasses this with whatever additional data it
136 const struct tc_ops *ops;
137 struct hmap queues; /* Contains "struct tc_queue"s.
138 * Read by generic TC layer.
139 * Written only by TC implementation. */
142 #define TC_INITIALIZER(TC, OPS) { OPS, HMAP_INITIALIZER(&(TC)->queues) }
144 /* One traffic control queue.
146 * Each TC implementation subclasses this with whatever additional data it
149 struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
150 unsigned int queue_id; /* OpenFlow queue ID. */
151 long long int created; /* Time queue was created, in msecs. */
154 /* A particular kind of traffic control. Each implementation generally maps to
155 * one particular Linux qdisc class.
157 * The functions below return 0 if successful or a positive errno value on
158 * failure, except where otherwise noted. All of them must be provided, except
159 * where otherwise noted. */
161 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
162 * This is null for tc_ops_default and tc_ops_other, for which there are no
163 * appropriate values. */
164 const char *linux_name;
166 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
167 const char *ovs_name;
169 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
170 * queues. The queues are numbered 0 through n_queues - 1. */
171 unsigned int n_queues;
173 /* Called to install this TC class on 'netdev'. The implementation should
174 * make the Netlink calls required to set up 'netdev' with the right qdisc
175 * and configure it according to 'details'. The implementation may assume
176 * that the current qdisc is the default; that is, there is no need for it
177 * to delete the current qdisc before installing itself.
179 * The contents of 'details' should be documented as valid for 'ovs_name'
180 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
181 * (which is built as ovs-vswitchd.conf.db(8)).
183 * This function must return 0 if and only if it sets 'netdev->tc' to an
184 * initialized 'struct tc'.
186 * (This function is null for tc_ops_other, which cannot be installed. For
187 * other TC classes it should always be nonnull.) */
188 int (*tc_install)(struct netdev *netdev, const struct smap *details);
190 /* Called when the netdev code determines (through a Netlink query) that
191 * this TC class's qdisc is installed on 'netdev', but we didn't install
192 * it ourselves and so don't know any of the details.
194 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
195 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
196 * implementation should parse the other attributes of 'nlmsg' as
197 * necessary to determine its configuration. If necessary it should also
198 * use Netlink queries to determine the configuration of queues on
201 * This function must return 0 if and only if it sets 'netdev->tc' to an
202 * initialized 'struct tc'. */
203 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
205 /* Destroys the data structures allocated by the implementation as part of
206 * 'tc'. (This includes destroying 'tc->queues' by calling
209 * The implementation should not need to perform any Netlink calls. If
210 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
211 * (But it may not be desirable.)
213 * This function may be null if 'tc' is trivial. */
214 void (*tc_destroy)(struct tc *tc);
216 /* Retrieves details of 'netdev->tc' configuration into 'details'.
218 * The implementation should not need to perform any Netlink calls, because
219 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
220 * cached the configuration.
222 * The contents of 'details' should be documented as valid for 'ovs_name'
223 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
224 * (which is built as ovs-vswitchd.conf.db(8)).
226 * This function may be null if 'tc' is not configurable.
228 int (*qdisc_get)(const struct netdev *netdev, struct smap *details);
230 /* Reconfigures 'netdev->tc' according to 'details', performing any
231 * required Netlink calls to complete the reconfiguration.
233 * The contents of 'details' should be documented as valid for 'ovs_name'
234 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
235 * (which is built as ovs-vswitchd.conf.db(8)).
237 * This function may be null if 'tc' is not configurable.
239 int (*qdisc_set)(struct netdev *, const struct smap *details);
241 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
242 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
244 * The contents of 'details' should be documented as valid for 'ovs_name'
245 * in the "other_config" column in the "Queue" table in
246 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
248 * The implementation should not need to perform any Netlink calls, because
249 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
250 * cached the queue configuration.
252 * This function may be null if 'tc' does not have queues ('n_queues' is
254 int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
255 struct smap *details);
257 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
258 * 'details', performing any required Netlink calls to complete the
259 * reconfiguration. The caller ensures that 'queue_id' is less than
262 * The contents of 'details' should be documented as valid for 'ovs_name'
263 * in the "other_config" column in the "Queue" table in
264 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
266 * This function may be null if 'tc' does not have queues or its queues are
267 * not configurable. */
268 int (*class_set)(struct netdev *, unsigned int queue_id,
269 const struct smap *details);
271 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
272 * tc_queue's within 'netdev->tc->queues'.
274 * This function may be null if 'tc' does not have queues or its queues
275 * cannot be deleted. */
276 int (*class_delete)(struct netdev *, struct tc_queue *queue);
278 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
279 * 'struct tc_queue's within 'netdev->tc->queues'.
281 * On success, initializes '*stats'.
283 * This function may be null if 'tc' does not have queues or if it cannot
284 * report queue statistics. */
285 int (*class_get_stats)(const struct netdev *netdev,
286 const struct tc_queue *queue,
287 struct netdev_queue_stats *stats);
289 /* Extracts queue stats from 'nlmsg', which is a response to a
290 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
292 * This function may be null if 'tc' does not have queues or if it cannot
293 * report queue statistics. */
294 int (*class_dump_stats)(const struct netdev *netdev,
295 const struct ofpbuf *nlmsg,
296 netdev_dump_queue_stats_cb *cb, void *aux);
300 tc_init(struct tc *tc, const struct tc_ops *ops)
303 hmap_init(&tc->queues);
307 tc_destroy(struct tc *tc)
309 hmap_destroy(&tc->queues);
312 static const struct tc_ops tc_ops_htb;
313 static const struct tc_ops tc_ops_hfsc;
314 static const struct tc_ops tc_ops_default;
315 static const struct tc_ops tc_ops_other;
317 static const struct tc_ops *const tcs[] = {
318 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
319 &tc_ops_hfsc, /* Hierarchical fair service curve. */
320 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
321 &tc_ops_other, /* Some other qdisc. */
325 static unsigned int tc_make_handle(unsigned int major, unsigned int minor);
326 static unsigned int tc_get_major(unsigned int handle);
327 static unsigned int tc_get_minor(unsigned int handle);
329 static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
330 static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
331 static unsigned int tc_buffer_per_jiffy(unsigned int rate);
333 static struct tcmsg *tc_make_request(const struct netdev *, int type,
334 unsigned int flags, struct ofpbuf *);
335 static int tc_transact(struct ofpbuf *request, struct ofpbuf **replyp);
336 static int tc_add_del_ingress_qdisc(struct netdev *netdev, bool add);
337 static int tc_add_policer(struct netdev *netdev, int kbits_rate,
340 static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
341 struct nlattr **options);
342 static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
343 struct nlattr **options,
344 struct netdev_queue_stats *);
345 static int tc_query_class(const struct netdev *,
346 unsigned int handle, unsigned int parent,
347 struct ofpbuf **replyp);
348 static int tc_delete_class(const struct netdev *, unsigned int handle);
350 static int tc_del_qdisc(struct netdev *netdev);
351 static int tc_query_qdisc(const struct netdev *netdev);
353 static int tc_calc_cell_log(unsigned int mtu);
354 static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
355 static void tc_put_rtab(struct ofpbuf *, uint16_t type,
356 const struct tc_ratespec *rate);
357 static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
359 struct netdev_linux {
362 struct shash_node *shash_node;
363 unsigned int cache_valid;
364 unsigned int change_seq;
366 bool miimon; /* Link status of last poll. */
367 long long int miimon_interval; /* Miimon Poll rate. Disabled if <= 0. */
368 struct timer miimon_timer;
370 /* The following are figured out "on demand" only. They are only valid
371 * when the corresponding VALID_* bit in 'cache_valid' is set. */
373 uint8_t etheraddr[ETH_ADDR_LEN];
374 struct in_addr address, netmask;
377 unsigned int ifi_flags;
378 long long int carrier_resets;
379 uint32_t kbits_rate; /* Policing data. */
380 uint32_t kbits_burst;
381 int vport_stats_error; /* Cached error code from vport_get_stats().
382 0 or an errno value. */
383 int netdev_mtu_error; /* Cached error code from SIOCGIFMTU or SIOCSIFMTU. */
384 int ether_addr_error; /* Cached error code from set/get etheraddr. */
385 int netdev_policing_error; /* Cached error code from set policing. */
386 int get_features_error; /* Cached error code from ETHTOOL_GSET. */
387 int get_ifindex_error; /* Cached error code from SIOCGIFINDEX. */
389 enum netdev_features current; /* Cached from ETHTOOL_GSET. */
390 enum netdev_features advertised; /* Cached from ETHTOOL_GSET. */
391 enum netdev_features supported; /* Cached from ETHTOOL_GSET. */
392 enum netdev_features peer; /* Cached from ETHTOOL_GSET. */
394 struct ethtool_drvinfo drvinfo; /* Cached from ETHTOOL_GDRVINFO. */
398 struct tap_state tap;
402 struct netdev_rx_linux {
408 static const struct netdev_rx_class netdev_rx_linux_class;
410 /* Sockets used for ioctl operations. */
411 static int af_inet_sock = -1; /* AF_INET, SOCK_DGRAM. */
413 /* This is set pretty low because we probably won't learn anything from the
414 * additional log messages. */
415 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
417 static int netdev_linux_init(void);
419 static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
420 int cmd, const char *cmd_name);
421 static int netdev_linux_do_ioctl(const char *name, struct ifreq *, int cmd,
422 const char *cmd_name);
423 static int netdev_linux_get_ipv4(const struct netdev *, struct in_addr *,
424 int cmd, const char *cmd_name);
425 static int get_flags(const struct netdev *, unsigned int *flags);
426 static int set_flags(const char *, unsigned int flags);
427 static int do_get_ifindex(const char *netdev_name);
428 static int get_ifindex(const struct netdev *, int *ifindexp);
429 static int do_set_addr(struct netdev *netdev,
430 int ioctl_nr, const char *ioctl_name,
431 struct in_addr addr);
432 static int get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN]);
433 static int set_etheraddr(const char *netdev_name, const uint8_t[ETH_ADDR_LEN]);
434 static int get_stats_via_netlink(int ifindex, struct netdev_stats *stats);
435 static int get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats);
436 static int af_packet_sock(void);
437 static void netdev_linux_miimon_run(void);
438 static void netdev_linux_miimon_wait(void);
441 is_netdev_linux_class(const struct netdev_class *netdev_class)
443 return netdev_class->init == netdev_linux_init;
447 is_tap_netdev(const struct netdev *netdev)
449 return netdev_get_class(netdev) == &netdev_tap_class;
452 static struct netdev_linux *
453 netdev_linux_cast(const struct netdev *netdev)
455 ovs_assert(is_netdev_linux_class(netdev_get_class(netdev)));
457 return CONTAINER_OF(netdev, struct netdev_linux, up);
460 static struct netdev_rx_linux *
461 netdev_rx_linux_cast(const struct netdev_rx *rx)
463 netdev_rx_assert_class(rx, &netdev_rx_linux_class);
464 return CONTAINER_OF(rx, struct netdev_rx_linux, up);
468 netdev_linux_init(void)
470 static int status = -1;
472 /* Create AF_INET socket. */
473 af_inet_sock = socket(AF_INET, SOCK_DGRAM, 0);
474 status = af_inet_sock >= 0 ? 0 : errno;
476 VLOG_ERR("failed to create inet socket: %s", ovs_strerror(status));
/* Performs this module's periodic work: runs the rtnetlink link machinery,
 * then the MII monitoring pass. */
static void
netdev_linux_run(void)
{
    rtnetlink_link_run();
    netdev_linux_miimon_run();
}
/* Counterpart of netdev_linux_run(): registers the wakeups needed by the
 * rtnetlink link machinery and by MII monitoring. */
static void
netdev_linux_wait(void)
{
    rtnetlink_link_wait();
    netdev_linux_miimon_wait();
}
497 netdev_linux_changed(struct netdev_linux *dev,
498 unsigned int ifi_flags, unsigned int mask)
501 if (!dev->change_seq) {
505 if ((dev->ifi_flags ^ ifi_flags) & IFF_RUNNING) {
506 dev->carrier_resets++;
508 dev->ifi_flags = ifi_flags;
510 dev->cache_valid &= mask;
514 netdev_linux_update(struct netdev_linux *dev,
515 const struct rtnetlink_link_change *change)
517 if (change->nlmsg_type == RTM_NEWLINK) {
519 netdev_linux_changed(dev, change->ifi_flags, VALID_DRVINFO);
521 /* Update netdev from rtnl-change msg. */
523 dev->mtu = change->mtu;
524 dev->cache_valid |= VALID_MTU;
525 dev->netdev_mtu_error = 0;
528 if (!eth_addr_is_zero(change->addr)) {
529 memcpy(dev->etheraddr, change->addr, ETH_ADDR_LEN);
530 dev->cache_valid |= VALID_ETHERADDR;
531 dev->ether_addr_error = 0;
534 dev->ifindex = change->ifi_index;
535 dev->cache_valid |= VALID_IFINDEX;
536 dev->get_ifindex_error = 0;
539 netdev_linux_changed(dev, change->ifi_flags, 0);
544 netdev_linux_cache_cb(const struct rtnetlink_link_change *change,
545 void *aux OVS_UNUSED)
547 struct netdev_linux *dev;
549 struct netdev *base_dev = netdev_from_name(change->ifname);
550 if (base_dev && is_netdev_linux_class(netdev_get_class(base_dev))) {
551 netdev_linux_update(netdev_linux_cast(base_dev), change);
554 struct shash device_shash;
555 struct shash_node *node;
557 shash_init(&device_shash);
558 netdev_get_devices(&netdev_linux_class, &device_shash);
559 SHASH_FOR_EACH (node, &device_shash) {
560 struct netdev *netdev = node->data;
563 dev = netdev_linux_cast(netdev);
565 get_flags(&dev->up, &flags);
566 netdev_linux_changed(dev, flags, 0);
568 shash_destroy(&device_shash);
573 cache_notifier_ref(void)
575 if (!cache_notifier_refcount) {
576 ovs_assert(!netdev_linux_cache_notifier);
578 netdev_linux_cache_notifier =
579 rtnetlink_link_notifier_create(netdev_linux_cache_cb, NULL);
581 if (!netdev_linux_cache_notifier) {
585 cache_notifier_refcount++;
591 cache_notifier_unref(void)
593 ovs_assert(cache_notifier_refcount > 0);
594 if (!--cache_notifier_refcount) {
595 ovs_assert(netdev_linux_cache_notifier);
596 rtnetlink_link_notifier_destroy(netdev_linux_cache_notifier);
597 netdev_linux_cache_notifier = NULL;
601 /* Creates system and internal devices. */
603 netdev_linux_create(const struct netdev_class *class, const char *name,
604 struct netdev **netdevp)
606 struct netdev_linux *netdev;
609 error = cache_notifier_ref();
614 netdev = xzalloc(sizeof *netdev);
615 netdev->change_seq = 1;
616 netdev_init(&netdev->up, name, class);
617 error = get_flags(&netdev->up, &netdev->ifi_flags);
618 if (error == ENODEV) {
619 if (class != &netdev_internal_class) {
620 /* The device does not exist, so don't allow it to be opened. */
621 netdev_uninit(&netdev->up, false);
622 cache_notifier_unref();
626 /* "Internal" netdevs have to be created as netdev objects before
627 * they exist in the kernel, because creating them in the kernel
628 * happens by passing a netdev object to dpif_port_add().
629 * Therefore, ignore the error. */
633 *netdevp = &netdev->up;
637 /* For most types of netdevs we open the device for each call of
638 * netdev_open(). However, this is not the case with tap devices,
639 * since it is only possible to open the device once. In this
640 * situation we share a single file descriptor, and consequently
641 * buffers, across all readers. Therefore once data is read it will
642 * be unavailable to other reads for tap devices. */
644 netdev_linux_create_tap(const struct netdev_class *class OVS_UNUSED,
645 const char *name, struct netdev **netdevp)
647 struct netdev_linux *netdev;
648 struct tap_state *state;
649 static const char tap_dev[] = "/dev/net/tun";
653 netdev = xzalloc(sizeof *netdev);
654 netdev->change_seq = 1;
655 state = &netdev->state.tap;
657 error = cache_notifier_ref();
662 /* Open tap device. */
663 state->fd = open(tap_dev, O_RDWR);
666 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, ovs_strerror(error));
667 goto error_unref_notifier;
670 /* Create tap device. */
671 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
672 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
673 if (ioctl(state->fd, TUNSETIFF, &ifr) == -1) {
674 VLOG_WARN("%s: creating tap device failed: %s", name,
675 ovs_strerror(errno));
680 /* Make non-blocking. */
681 error = set_nonblocking(state->fd);
686 netdev_init(&netdev->up, name, &netdev_tap_class);
687 *netdevp = &netdev->up;
692 error_unref_notifier:
693 cache_notifier_unref();
700 destroy_tap(struct netdev_linux *netdev)
702 struct tap_state *state = &netdev->state.tap;
704 if (state->fd >= 0) {
709 /* Destroys the netdev device 'netdev_'. */
711 netdev_linux_destroy(struct netdev *netdev_)
713 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
715 if (netdev->tc && netdev->tc->ops->tc_destroy) {
716 netdev->tc->ops->tc_destroy(netdev->tc);
719 if (netdev_get_class(netdev_) == &netdev_tap_class) {
724 cache_notifier_unref();
728 netdev_linux_rx_open(struct netdev *netdev_, struct netdev_rx **rxp)
730 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
731 bool is_tap = is_tap_netdev(netdev_);
732 struct netdev_rx_linux *rx;
737 fd = netdev->state.tap.fd;
739 struct sockaddr_ll sll;
741 /* Result of tcpdump -dd inbound */
742 static struct sock_filter filt[] = {
743 { 0x28, 0, 0, 0xfffff004 }, /* ldh [0] */
744 { 0x15, 0, 1, 0x00000004 }, /* jeq #4 jt 2 jf 3 */
745 { 0x6, 0, 0, 0x00000000 }, /* ret #0 */
746 { 0x6, 0, 0, 0x0000ffff } /* ret #65535 */
748 static struct sock_fprog fprog = { ARRAY_SIZE(filt), filt };
750 /* Create file descriptor. */
751 fd = socket(PF_PACKET, SOCK_RAW, 0);
754 VLOG_ERR("failed to create raw socket (%s)", ovs_strerror(error));
758 /* Set non-blocking mode. */
759 error = set_nonblocking(fd);
764 /* Get ethernet device index. */
765 error = get_ifindex(&netdev->up, &ifindex);
770 /* Bind to specific ethernet device. */
771 memset(&sll, 0, sizeof sll);
772 sll.sll_family = AF_PACKET;
773 sll.sll_ifindex = ifindex;
774 sll.sll_protocol = (OVS_FORCE unsigned short int) htons(ETH_P_ALL);
775 if (bind(fd, (struct sockaddr *) &sll, sizeof sll) < 0) {
777 VLOG_ERR("%s: failed to bind raw socket (%s)",
778 netdev_get_name(netdev_), ovs_strerror(error));
782 /* Filter for only inbound packets. */
783 error = setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER, &fprog,
787 VLOG_ERR("%s: failed attach filter (%s)",
788 netdev_get_name(netdev_), ovs_strerror(error));
793 rx = xmalloc(sizeof *rx);
794 netdev_rx_init(&rx->up, netdev_, &netdev_rx_linux_class);
809 netdev_rx_linux_destroy(struct netdev_rx *rx_)
811 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
820 netdev_rx_linux_recv(struct netdev_rx *rx_, void *data, size_t size)
822 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
827 ? read(rx->fd, data, size)
828 : recv(rx->fd, data, size, MSG_TRUNC));
829 } while (retval < 0 && errno == EINTR);
832 return retval > size ? -EMSGSIZE : retval;
834 if (errno != EAGAIN) {
835 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
836 ovs_strerror(errno), netdev_rx_get_name(rx_));
843 netdev_rx_linux_wait(struct netdev_rx *rx_)
845 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
846 poll_fd_wait(rx->fd, POLLIN);
850 netdev_rx_linux_drain(struct netdev_rx *rx_)
852 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
855 int error = netdev_linux_do_ioctl(netdev_rx_get_name(rx_), &ifr,
856 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
860 drain_fd(rx->fd, ifr.ifr_qlen);
863 return drain_rcvbuf(rx->fd);
867 /* Sends 'buffer' on 'netdev'. Returns 0 if successful, otherwise a positive
868 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
869 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
870 * the packet is too big or too small to transmit on the device.
872 * The caller retains ownership of 'buffer' in all cases.
874 * The kernel maintains a packet transmission queue, so the caller is not
875 * expected to do additional queuing of packets. */
877 netdev_linux_send(struct netdev *netdev_, const void *data, size_t size)
882 if (!is_tap_netdev(netdev_)) {
883 /* Use our AF_PACKET socket to send to this device. */
884 struct sockaddr_ll sll;
891 sock = af_packet_sock();
896 error = get_ifindex(netdev_, &ifindex);
901 /* We don't bother setting most fields in sockaddr_ll because the
902 * kernel ignores them for SOCK_RAW. */
903 memset(&sll, 0, sizeof sll);
904 sll.sll_family = AF_PACKET;
905 sll.sll_ifindex = ifindex;
907 iov.iov_base = CONST_CAST(void *, data);
911 msg.msg_namelen = sizeof sll;
914 msg.msg_control = NULL;
915 msg.msg_controllen = 0;
918 retval = sendmsg(sock, &msg, 0);
920 /* Use the tap fd to send to this device. This is essential for
921 * tap devices, because packets sent to a tap device with an
922 * AF_PACKET socket will loop back to be *received* again on the
923 * tap device. This doesn't occur on other interface types
924 * because we attach a socket filter to the rx socket. */
925 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
927 retval = write(netdev->state.tap.fd, data, size);
931 /* The Linux AF_PACKET implementation never blocks waiting for room
932 * for packets, instead returning ENOBUFS. Translate this into
933 * EAGAIN for the caller. */
934 if (errno == ENOBUFS) {
936 } else if (errno == EINTR) {
938 } else if (errno != EAGAIN) {
939 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
940 netdev_get_name(netdev_), ovs_strerror(errno));
943 } else if (retval != size) {
944 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%zd bytes of "
945 "%zu) on %s", retval, size, netdev_get_name(netdev_));
/* Registers with the poll loop to wake up from the next call to poll_block()
 * when the packet transmission queue has sufficient room to transmit a packet
 * with netdev_send().
 *
 * The kernel maintains a packet transmission queue, so the client is not
 * expected to do additional queuing of packets.  Thus, this function is
 * unlikely to ever be used.  It is included for completeness.
 *
 * Only tap devices cause a wakeup here; for every other device this function
 * does nothing. */
static void
netdev_linux_send_wait(struct netdev *netdev)
{
    if (is_tap_netdev(netdev)) {
        /* TAP device always accepts packets. */
        poll_immediate_wake();
    }
}
969 /* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
970 * otherwise a positive errno value. */
972 netdev_linux_set_etheraddr(struct netdev *netdev_,
973 const uint8_t mac[ETH_ADDR_LEN])
975 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
976 struct netdev_saved_flags *sf = NULL;
979 if (netdev->cache_valid & VALID_ETHERADDR) {
980 if (netdev->ether_addr_error) {
981 return netdev->ether_addr_error;
983 if (eth_addr_equals(netdev->etheraddr, mac)) {
986 netdev->cache_valid &= ~VALID_ETHERADDR;
989 /* Tap devices must be brought down before setting the address. */
990 if (is_tap_netdev(netdev_)) {
991 netdev_turn_flags_off(netdev_, NETDEV_UP, &sf);
993 error = set_etheraddr(netdev_get_name(netdev_), mac);
994 if (!error || error == ENODEV) {
995 netdev->ether_addr_error = error;
996 netdev->cache_valid |= VALID_ETHERADDR;
998 memcpy(netdev->etheraddr, mac, ETH_ADDR_LEN);
1002 netdev_restore_flags(sf);
1007 /* Copies 'netdev''s MAC address to 'mac' which is passed as param. */
1009 netdev_linux_get_etheraddr(const struct netdev *netdev_,
1010 uint8_t mac[ETH_ADDR_LEN])
1012 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1014 if (!(netdev->cache_valid & VALID_ETHERADDR)) {
1015 int error = get_etheraddr(netdev_get_name(netdev_),
1018 netdev->ether_addr_error = error;
1019 netdev->cache_valid |= VALID_ETHERADDR;
1022 if (!netdev->ether_addr_error) {
1023 memcpy(mac, netdev->etheraddr, ETH_ADDR_LEN);
1026 return netdev->ether_addr_error;
1029 /* Returns the maximum size of transmitted (and received) packets on 'netdev',
1030 * in bytes, not including the hardware header; thus, this is typically 1500
1031 * bytes for Ethernet devices. */
1033 netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
1035 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1036 if (!(netdev->cache_valid & VALID_MTU)) {
1040 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
1041 SIOCGIFMTU, "SIOCGIFMTU");
1043 netdev->netdev_mtu_error = error;
1044 netdev->mtu = ifr.ifr_mtu;
1045 netdev->cache_valid |= VALID_MTU;
1048 if (!netdev->netdev_mtu_error) {
1049 *mtup = netdev->mtu;
1051 return netdev->netdev_mtu_error;
1054 /* Sets the maximum transmission unit (MTU) of the given device using the
1055 * Linux networking ioctl interface.
1058 netdev_linux_set_mtu(const struct netdev *netdev_, int mtu)
1060 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1064 if (netdev->cache_valid & VALID_MTU) {
1065 if (netdev->netdev_mtu_error) {
1066 return netdev->netdev_mtu_error;
1068 if (netdev->mtu == mtu) {
1071 netdev->cache_valid &= ~VALID_MTU;
1074 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
1075 SIOCSIFMTU, "SIOCSIFMTU");
1076 if (!error || error == ENODEV) {
1077 netdev->netdev_mtu_error = error;
1078 netdev->mtu = ifr.ifr_mtu;
1079 netdev->cache_valid |= VALID_MTU;
/* Returns the ifindex of 'netdev', if successful, as a positive number.
 * On failure, returns a negative errno value. */
static int
netdev_linux_get_ifindex(const struct netdev *netdev)
{
    int ifindex, error;

    /* get_ifindex() reports errors as positive errno values; negate them so
     * that they cannot be mistaken for an ifindex. */
    error = get_ifindex(netdev, &ifindex);
    return error ? -error : ifindex;
}
1096 netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
1098 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1100 if (netdev->miimon_interval > 0) {
1101 *carrier = netdev->miimon;
1103 *carrier = (netdev->ifi_flags & IFF_RUNNING) != 0;
1109 static long long int
1110 netdev_linux_get_carrier_resets(const struct netdev *netdev)
1112 return netdev_linux_cast(netdev)->carrier_resets;
1116 netdev_linux_do_miimon(const char *name, int cmd, const char *cmd_name,
1117 struct mii_ioctl_data *data)
1122 memset(&ifr, 0, sizeof ifr);
1123 memcpy(&ifr.ifr_data, data, sizeof *data);
1124 error = netdev_linux_do_ioctl(name, &ifr, cmd, cmd_name);
1125 memcpy(data, &ifr.ifr_data, sizeof *data);
1131 netdev_linux_get_miimon(const char *name, bool *miimon)
1133 struct mii_ioctl_data data;
1138 memset(&data, 0, sizeof data);
1139 error = netdev_linux_do_miimon(name, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
1141 /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
1142 data.reg_num = MII_BMSR;
1143 error = netdev_linux_do_miimon(name, SIOCGMIIREG, "SIOCGMIIREG",
1147 *miimon = !!(data.val_out & BMSR_LSTATUS);
1149 VLOG_WARN_RL(&rl, "%s: failed to query MII", name);
1152 struct ethtool_cmd ecmd;
1154 VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
1157 COVERAGE_INC(netdev_get_ethtool);
1158 memset(&ecmd, 0, sizeof ecmd);
1159 error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
1162 struct ethtool_value eval;
1164 memcpy(&eval, &ecmd, sizeof eval);
1165 *miimon = !!eval.data;
1167 VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
1175 netdev_linux_set_miimon_interval(struct netdev *netdev_,
1176 long long int interval)
1178 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1180 interval = interval > 0 ? MAX(interval, 100) : 0;
1181 if (netdev->miimon_interval != interval) {
1182 netdev->miimon_interval = interval;
1183 timer_set_expired(&netdev->miimon_timer);
/* Polls link status for the devices of 'netdev_linux_class'.
 *
 * The per-device test looks at whether miimon is disabled
 * (miimon_interval <= 0) or its timer has not expired yet.  A device that is
 * polled gets its status from netdev_linux_get_miimon(); if the status
 * differs from the cached 'dev->miimon' the cache is updated and
 * netdev_linux_changed() is called.  The timer is then re-armed for
 * 'miimon_interval'. */
1190 netdev_linux_miimon_run(void)
1192 struct shash device_shash;
1193 struct shash_node *node;
1195 shash_init(&device_shash);
1196 netdev_get_devices(&netdev_linux_class, &device_shash);
1197 SHASH_FOR_EACH (node, &device_shash) {
1198 struct netdev *netdev = node->data;
1199 struct netdev_linux *dev = netdev_linux_cast(netdev);
1202 if (dev->miimon_interval <= 0 || !timer_expired(&dev->miimon_timer)) {
1206 netdev_linux_get_miimon(dev->up.name, &miimon);
1207 if (miimon != dev->miimon) {
1208 dev->miimon = miimon;
1209 netdev_linux_changed(dev, dev->ifi_flags, 0);
1212 timer_set_duration(&dev->miimon_timer, dev->miimon_interval);
1215 shash_destroy(&device_shash);
/* Calls timer_wait() on the miimon timer of every 'netdev_linux_class'
 * device whose miimon_interval is positive. */
1219 netdev_linux_miimon_wait(void)
1221 struct shash device_shash;
1222 struct shash_node *node;
1224 shash_init(&device_shash);
1225 netdev_get_devices(&netdev_linux_class, &device_shash);
1226 SHASH_FOR_EACH (node, &device_shash) {
1227 struct netdev *netdev = node->data;
1228 struct netdev_linux *dev = netdev_linux_cast(netdev);
1230 if (dev->miimon_interval > 0) {
1231 timer_wait(&dev->miimon_timer);
1234 shash_destroy(&device_shash);
1237 /* Check whether we can use RTM_GETLINK to get network device statistics.
1238 * In pre-2.6.19 kernels, this was only available if wireless extensions were
1241 check_for_working_netlink_stats(void)
1243 /* Decide on the netdev_get_stats() implementation to use. Netlink is
1244 * preferable, so if that works, we'll use it. */
1245 int ifindex = do_get_ifindex("lo");
1247 VLOG_WARN("failed to get ifindex for lo, "
1248 "obtaining netdev stats from proc");
1251 struct netdev_stats stats;
1252 int error = get_stats_via_netlink(ifindex, &stats);
1254 VLOG_DBG("obtaining netdev stats via rtnetlink");
1257 VLOG_INFO("RTM_GETLINK failed (%s), obtaining netdev stats "
1258 "via proc (you are probably running a pre-2.6.19 "
1259 "kernel)", ovs_strerror(error));
/* NOTE(review): presumably exchanges the values of '*a' and '*b' (the tap
 * statistics code uses it on rx/tx counter pairs) -- confirm against the
 * body. */
1266 swap_uint64(uint64_t *a, uint64_t *b)
/* The packet, byte, error and dropped counters (rx and tx) are read from
 * 'src' with get_unaligned_u64(); the more detailed error counters listed
 * afterward have no vport counterpart here and are set to 0 in 'dst'. */
1273 /* Copies 'src' into 'dst', performing format conversion in the process.
1275 * 'src' is allowed to be misaligned. */
1277 netdev_stats_from_ovs_vport_stats(struct netdev_stats *dst,
1278 const struct ovs_vport_stats *src)
1280 dst->rx_packets = get_unaligned_u64(&src->rx_packets);
1281 dst->tx_packets = get_unaligned_u64(&src->tx_packets);
1282 dst->rx_bytes = get_unaligned_u64(&src->rx_bytes);
1283 dst->tx_bytes = get_unaligned_u64(&src->tx_bytes);
1284 dst->rx_errors = get_unaligned_u64(&src->rx_errors);
1285 dst->tx_errors = get_unaligned_u64(&src->tx_errors);
1286 dst->rx_dropped = get_unaligned_u64(&src->rx_dropped);
1287 dst->tx_dropped = get_unaligned_u64(&src->tx_dropped);
1289 dst->collisions = 0;
1290 dst->rx_length_errors = 0;
1291 dst->rx_over_errors = 0;
1292 dst->rx_crc_errors = 0;
1293 dst->rx_frame_errors = 0;
1294 dst->rx_fifo_errors = 0;
1295 dst->rx_missed_errors = 0;
1296 dst->tx_aborted_errors = 0;
1297 dst->tx_carrier_errors = 0;
1298 dst->tx_fifo_errors = 0;
1299 dst->tx_heartbeat_errors = 0;
1300 dst->tx_window_errors = 0;
/* Looks up the vport named after 'netdev' with dpif_linux_vport_get() and
 * converts the statistics carried in the reply into '*stats'.  A reply
 * without statistics ('reply.stats' null) is handled as a separate case. */
1304 get_stats_via_vport__(const struct netdev *netdev, struct netdev_stats *stats)
1306 struct dpif_linux_vport reply;
1310 error = dpif_linux_vport_get(netdev_get_name(netdev), &reply, &buf);
1313 } else if (!reply.stats) {
1318 netdev_stats_from_ovs_vport_stats(stats, reply.stats);
/* Fetches vport statistics for 'netdev_' into '*stats', remembering the
 * outcome.
 *
 * The query is made when no vport error is recorded or when the recorded
 * error is not marked valid (VALID_VPORT_STAT_ERROR).  Errors other than
 * ENOENT are logged.  The result is stored in 'netdev->vport_stats_error'
 * and VALID_VPORT_STAT_ERROR is set in the cache flags. */
1326 get_stats_via_vport(const struct netdev *netdev_,
1327 struct netdev_stats *stats)
1329 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1331 if (!netdev->vport_stats_error ||
1332 !(netdev->cache_valid & VALID_VPORT_STAT_ERROR)) {
1335 error = get_stats_via_vport__(netdev_, stats);
1336 if (error && error != ENOENT) {
1337 VLOG_WARN_RL(&rl, "%s: obtaining netdev stats via vport failed "
1339 netdev_get_name(netdev_), ovs_strerror(error));
1341 netdev->vport_stats_error = error;
1342 netdev->cache_valid |= VALID_VPORT_STAT_ERROR;
/* Retrieves statistics for 'netdev_' from the kernel into '*stats'.
 *
 * The first call runs check_for_working_netlink_stats() once (guarded by an
 * ovsthread_once) to decide between the two sources: rtnetlink, keyed by the
 * device's ifindex, or get_stats_via_proc(), keyed by its name.  A failure
 * is logged with the numeric error code. */
1347 netdev_linux_sys_get_stats(const struct netdev *netdev_,
1348 struct netdev_stats *stats)
1350 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
1351 static int use_netlink_stats;
1354 if (ovsthread_once_start(&once)) {
1355 use_netlink_stats = check_for_working_netlink_stats();
1356 ovsthread_once_done(&once);
1359 if (use_netlink_stats) {
1362 error = get_ifindex(netdev_, &ifindex);
1364 error = get_stats_via_netlink(ifindex, stats);
1367 error = get_stats_via_proc(netdev_get_name(netdev_), stats);
1371 VLOG_WARN_RL(&rl, "%s: linux-sys get stats failed %d",
1372 netdev_get_name(netdev_), error);
/* Both sources are queried: vport statistics go straight into '*stats' and
 * kernel statistics into a local 'dev_stats'.  The outcome depends on
 * 'netdev->vport_stats_error'; in the merging path the kernel's error,
 * dropped, multicast and collision counters are added on top of the vport
 * values. */
1378 /* Retrieves current device stats for 'netdev-linux'. */
1380 netdev_linux_get_stats(const struct netdev *netdev_,
1381 struct netdev_stats *stats)
1383 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1384 struct netdev_stats dev_stats;
1387 get_stats_via_vport(netdev_, stats);
1389 error = netdev_linux_sys_get_stats(netdev_, &dev_stats);
1392 if (netdev->vport_stats_error) {
1399 if (netdev->vport_stats_error) {
1400 /* stats not available from OVS then use ioctl stats. */
1403 stats->rx_errors += dev_stats.rx_errors;
1404 stats->tx_errors += dev_stats.tx_errors;
1405 stats->rx_dropped += dev_stats.rx_dropped;
1406 stats->tx_dropped += dev_stats.tx_dropped;
1407 stats->multicast += dev_stats.multicast;
1408 stats->collisions += dev_stats.collisions;
1409 stats->rx_length_errors += dev_stats.rx_length_errors;
1410 stats->rx_over_errors += dev_stats.rx_over_errors;
1411 stats->rx_crc_errors += dev_stats.rx_crc_errors;
1412 stats->rx_frame_errors += dev_stats.rx_frame_errors;
1413 stats->rx_fifo_errors += dev_stats.rx_fifo_errors;
1414 stats->rx_missed_errors += dev_stats.rx_missed_errors;
1415 stats->tx_aborted_errors += dev_stats.tx_aborted_errors;
1416 stats->tx_carrier_errors += dev_stats.tx_carrier_errors;
1417 stats->tx_fifo_errors += dev_stats.tx_fifo_errors;
1418 stats->tx_heartbeat_errors += dev_stats.tx_heartbeat_errors;
1419 stats->tx_window_errors += dev_stats.tx_window_errors;
/* Like the plain Linux variant, this queries both the vport layer (into
 * '*stats') and the kernel (into 'dev_stats').  Kernel counters are used
 * with rx and tx exchanged: either the pairs in '*stats' are swapped and the
 * detailed error counters zeroed, or the kernel's tx dropped/error counts are
 * added to the rx side of '*stats' and vice versa. */
1424 /* Retrieves current device stats for 'netdev-tap' netdev or
1425 * netdev-internal. */
1427 netdev_tap_get_stats(const struct netdev *netdev_, struct netdev_stats *stats)
1429 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1430 struct netdev_stats dev_stats;
1433 get_stats_via_vport(netdev_, stats);
1435 error = netdev_linux_sys_get_stats(netdev_, &dev_stats);
1437 if (netdev->vport_stats_error) {
1444 /* If this port is an internal port then the transmit and receive stats
1445 * will appear to be swapped relative to the other ports since we are the
1446 * one sending the data, not a remote computer. For consistency, we swap
1447 * them back here. This does not apply if we are getting stats from the
1448 * vport layer because it always tracks stats from the perspective of the
1450 if (netdev->vport_stats_error) {
1452 swap_uint64(&stats->rx_packets, &stats->tx_packets);
1453 swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
1454 swap_uint64(&stats->rx_errors, &stats->tx_errors);
1455 swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
1456 stats->rx_length_errors = 0;
1457 stats->rx_over_errors = 0;
1458 stats->rx_crc_errors = 0;
1459 stats->rx_frame_errors = 0;
1460 stats->rx_fifo_errors = 0;
1461 stats->rx_missed_errors = 0;
1462 stats->tx_aborted_errors = 0;
1463 stats->tx_carrier_errors = 0;
1464 stats->tx_fifo_errors = 0;
1465 stats->tx_heartbeat_errors = 0;
1466 stats->tx_window_errors = 0;
1468 stats->rx_dropped += dev_stats.tx_dropped;
1469 stats->tx_dropped += dev_stats.rx_dropped;
1471 stats->rx_errors += dev_stats.tx_errors;
1472 stats->tx_errors += dev_stats.rx_errors;
1474 stats->multicast += dev_stats.multicast;
1475 stats->collisions += dev_stats.collisions;
/* Retrieves statistics for an internal device purely from the vport layer.
 * Returns the error recorded by get_stats_via_vport() in
 * 'netdev->vport_stats_error'. */
1481 netdev_internal_get_stats(const struct netdev *netdev_,
1482 struct netdev_stats *stats)
1484 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1486 get_stats_via_vport(netdev_, stats);
1487 return netdev->vport_stats_error;
/* Pushes the packet, byte, error and dropped counters from '*stats' to the
 * vport named after 'netdev' using an OVS_VPORT_CMD_SET transaction.  The
 * ENODEV result of the transaction is special-cased, as explained in the
 * comment below. */
1491 netdev_internal_set_stats(struct netdev *netdev,
1492 const struct netdev_stats *stats)
1494 struct ovs_vport_stats vport_stats;
1495 struct dpif_linux_vport vport;
1498 vport_stats.rx_packets = stats->rx_packets;
1499 vport_stats.tx_packets = stats->tx_packets;
1500 vport_stats.rx_bytes = stats->rx_bytes;
1501 vport_stats.tx_bytes = stats->tx_bytes;
1502 vport_stats.rx_errors = stats->rx_errors;
1503 vport_stats.tx_errors = stats->tx_errors;
1504 vport_stats.rx_dropped = stats->rx_dropped;
1505 vport_stats.tx_dropped = stats->tx_dropped;
1507 dpif_linux_vport_init(&vport);
1508 vport.cmd = OVS_VPORT_CMD_SET;
1509 vport.name = netdev_get_name(netdev);
1510 vport.stats = &vport_stats;
1512 err = dpif_linux_vport_transact(&vport, NULL, NULL);
1514 /* If the vport layer doesn't know about the device, that doesn't mean it
1515 * doesn't exist (after all we were able to open it when netdev_open() was
1516 * called), it just means that it isn't attached and we'll be getting
1517 * stats a different way. */
1518 if (err == ENODEV) {
1526 netdev_linux_read_features(struct netdev_linux *netdev)
1528 struct ethtool_cmd ecmd;
1532 if (netdev->cache_valid & VALID_FEATURES) {
1536 COVERAGE_INC(netdev_get_ethtool);
1537 memset(&ecmd, 0, sizeof ecmd);
1538 error = netdev_linux_do_ethtool(netdev->up.name, &ecmd,
1539 ETHTOOL_GSET, "ETHTOOL_GSET");
1544 /* Supported features. */
1545 netdev->supported = 0;
1546 if (ecmd.supported & SUPPORTED_10baseT_Half) {
1547 netdev->supported |= NETDEV_F_10MB_HD;
1549 if (ecmd.supported & SUPPORTED_10baseT_Full) {
1550 netdev->supported |= NETDEV_F_10MB_FD;
1552 if (ecmd.supported & SUPPORTED_100baseT_Half) {
1553 netdev->supported |= NETDEV_F_100MB_HD;
1555 if (ecmd.supported & SUPPORTED_100baseT_Full) {
1556 netdev->supported |= NETDEV_F_100MB_FD;
1558 if (ecmd.supported & SUPPORTED_1000baseT_Half) {
1559 netdev->supported |= NETDEV_F_1GB_HD;
1561 if (ecmd.supported & SUPPORTED_1000baseT_Full) {
1562 netdev->supported |= NETDEV_F_1GB_FD;
1564 if (ecmd.supported & SUPPORTED_10000baseT_Full) {
1565 netdev->supported |= NETDEV_F_10GB_FD;
1567 if (ecmd.supported & SUPPORTED_TP) {
1568 netdev->supported |= NETDEV_F_COPPER;
1570 if (ecmd.supported & SUPPORTED_FIBRE) {
1571 netdev->supported |= NETDEV_F_FIBER;
1573 if (ecmd.supported & SUPPORTED_Autoneg) {
1574 netdev->supported |= NETDEV_F_AUTONEG;
1576 if (ecmd.supported & SUPPORTED_Pause) {
1577 netdev->supported |= NETDEV_F_PAUSE;
1579 if (ecmd.supported & SUPPORTED_Asym_Pause) {
1580 netdev->supported |= NETDEV_F_PAUSE_ASYM;
1583 /* Advertised features. */
1584 netdev->advertised = 0;
1585 if (ecmd.advertising & ADVERTISED_10baseT_Half) {
1586 netdev->advertised |= NETDEV_F_10MB_HD;
1588 if (ecmd.advertising & ADVERTISED_10baseT_Full) {
1589 netdev->advertised |= NETDEV_F_10MB_FD;
1591 if (ecmd.advertising & ADVERTISED_100baseT_Half) {
1592 netdev->advertised |= NETDEV_F_100MB_HD;
1594 if (ecmd.advertising & ADVERTISED_100baseT_Full) {
1595 netdev->advertised |= NETDEV_F_100MB_FD;
1597 if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
1598 netdev->advertised |= NETDEV_F_1GB_HD;
1600 if (ecmd.advertising & ADVERTISED_1000baseT_Full) {
1601 netdev->advertised |= NETDEV_F_1GB_FD;
1603 if (ecmd.advertising & ADVERTISED_10000baseT_Full) {
1604 netdev->advertised |= NETDEV_F_10GB_FD;
1606 if (ecmd.advertising & ADVERTISED_TP) {
1607 netdev->advertised |= NETDEV_F_COPPER;
1609 if (ecmd.advertising & ADVERTISED_FIBRE) {
1610 netdev->advertised |= NETDEV_F_FIBER;
1612 if (ecmd.advertising & ADVERTISED_Autoneg) {
1613 netdev->advertised |= NETDEV_F_AUTONEG;
1615 if (ecmd.advertising & ADVERTISED_Pause) {
1616 netdev->advertised |= NETDEV_F_PAUSE;
1618 if (ecmd.advertising & ADVERTISED_Asym_Pause) {
1619 netdev->advertised |= NETDEV_F_PAUSE_ASYM;
1622 /* Current settings. */
1624 if (speed == SPEED_10) {
1625 netdev->current = ecmd.duplex ? NETDEV_F_10MB_FD : NETDEV_F_10MB_HD;
1626 } else if (speed == SPEED_100) {
1627 netdev->current = ecmd.duplex ? NETDEV_F_100MB_FD : NETDEV_F_100MB_HD;
1628 } else if (speed == SPEED_1000) {
1629 netdev->current = ecmd.duplex ? NETDEV_F_1GB_FD : NETDEV_F_1GB_HD;
1630 } else if (speed == SPEED_10000) {
1631 netdev->current = NETDEV_F_10GB_FD;
1632 } else if (speed == 40000) {
1633 netdev->current = NETDEV_F_40GB_FD;
1634 } else if (speed == 100000) {
1635 netdev->current = NETDEV_F_100GB_FD;
1636 } else if (speed == 1000000) {
1637 netdev->current = NETDEV_F_1TB_FD;
1639 netdev->current = 0;
1642 if (ecmd.port == PORT_TP) {
1643 netdev->current |= NETDEV_F_COPPER;
1644 } else if (ecmd.port == PORT_FIBRE) {
1645 netdev->current |= NETDEV_F_FIBER;
1649 netdev->current |= NETDEV_F_AUTONEG;
1652 /* Peer advertisements. */
1653 netdev->peer = 0; /* XXX */
1656 netdev->cache_valid |= VALID_FEATURES;
1657 netdev->get_features_error = error;
1660 /* Stores the features supported by 'netdev' into each of '*current',
1661 * '*advertised', '*supported', and '*peer' that are non-null. Each value is a
1662 * bitmap of NETDEV_* bits. Returns 0 if successful, otherwise a positive
1665 netdev_linux_get_features(const struct netdev *netdev_,
1666 enum netdev_features *current,
1667 enum netdev_features *advertised,
1668 enum netdev_features *supported,
1669 enum netdev_features *peer)
1671 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1673 netdev_linux_read_features(netdev);
1675 if (!netdev->get_features_error) {
1676 *current = netdev->current;
1677 *advertised = netdev->advertised;
1678 *supported = netdev->supported;
1679 *peer = netdev->peer;
1681 return netdev->get_features_error;
/* The current settings are first read with ETHTOOL_GSET; the 'advertising'
 * mask is then rebuilt from scratch out of the NETDEV_F_* bits present in
 * 'advertise', and the modified settings are written back with ETHTOOL_SSET,
 * whose result is returned. */
1684 /* Set the features advertised by 'netdev' to 'advertise'. */
1686 netdev_linux_set_advertisements(struct netdev *netdev,
1687 enum netdev_features advertise)
1689 struct ethtool_cmd ecmd;
1692 COVERAGE_INC(netdev_get_ethtool);
1693 memset(&ecmd, 0, sizeof ecmd);
1694 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1695 ETHTOOL_GSET, "ETHTOOL_GSET");
1700 ecmd.advertising = 0;
1701 if (advertise & NETDEV_F_10MB_HD) {
1702 ecmd.advertising |= ADVERTISED_10baseT_Half;
1704 if (advertise & NETDEV_F_10MB_FD) {
1705 ecmd.advertising |= ADVERTISED_10baseT_Full;
1707 if (advertise & NETDEV_F_100MB_HD) {
1708 ecmd.advertising |= ADVERTISED_100baseT_Half;
1710 if (advertise & NETDEV_F_100MB_FD) {
1711 ecmd.advertising |= ADVERTISED_100baseT_Full;
1713 if (advertise & NETDEV_F_1GB_HD) {
1714 ecmd.advertising |= ADVERTISED_1000baseT_Half;
1716 if (advertise & NETDEV_F_1GB_FD) {
1717 ecmd.advertising |= ADVERTISED_1000baseT_Full;
1719 if (advertise & NETDEV_F_10GB_FD) {
1720 ecmd.advertising |= ADVERTISED_10000baseT_Full;
1722 if (advertise & NETDEV_F_COPPER) {
1723 ecmd.advertising |= ADVERTISED_TP;
1725 if (advertise & NETDEV_F_FIBER) {
1726 ecmd.advertising |= ADVERTISED_FIBRE;
1728 if (advertise & NETDEV_F_AUTONEG) {
1729 ecmd.advertising |= ADVERTISED_Autoneg;
1731 if (advertise & NETDEV_F_PAUSE) {
1732 ecmd.advertising |= ADVERTISED_Pause;
1734 if (advertise & NETDEV_F_PAUSE_ASYM) {
1735 ecmd.advertising |= ADVERTISED_Asym_Pause;
1737 COVERAGE_INC(netdev_set_ethtool);
1738 return netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1739 ETHTOOL_SSET, "ETHTOOL_SSET");
/* 'kbits_burst' is normalized first: 0 when no rate is given, 1000 when a
 * rate is given without a burst.  While VALID_POLICING is set, a cached
 * error is returned as-is and unchanged settings are not re-applied.
 * Otherwise the existing ingress qdisc is removed, and the code contains
 * steps to add a new ingress qdisc and a policer with the requested rate and
 * burst.  The result is cached only when it is success or ENODEV. */
1742 /* Attempts to set input rate limiting (policing) policy. Returns 0 if
1743 * successful, otherwise a positive errno value. */
1745 netdev_linux_set_policing(struct netdev *netdev_,
1746 uint32_t kbits_rate, uint32_t kbits_burst)
1748 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1749 const char *netdev_name = netdev_get_name(netdev_);
1753 kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
1754 : !kbits_burst ? 1000 /* Default to 1000 kbits if 0. */
1755 : kbits_burst); /* Stick with user-specified value. */
1757 if (netdev->cache_valid & VALID_POLICING) {
1758 if (netdev->netdev_policing_error) {
1759 return netdev->netdev_policing_error;
1762 if (netdev->kbits_rate == kbits_rate &&
1763 netdev->kbits_burst == kbits_burst) {
1764 /* Assume that settings haven't changed since we last set them. */
1767 netdev->cache_valid &= ~VALID_POLICING;
1770 COVERAGE_INC(netdev_set_policing);
1771 /* Remove any existing ingress qdisc. */
1772 error = tc_add_del_ingress_qdisc(netdev_, false);
1774 VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
1775 netdev_name, ovs_strerror(error));
1780 error = tc_add_del_ingress_qdisc(netdev_, true);
1782 VLOG_WARN_RL(&rl, "%s: adding policing qdisc failed: %s",
1783 netdev_name, ovs_strerror(error));
1787 error = tc_add_policer(netdev_, kbits_rate, kbits_burst);
1789 VLOG_WARN_RL(&rl, "%s: adding policing action failed: %s",
1790 netdev_name, ovs_strerror(error));
1795 netdev->kbits_rate = kbits_rate;
1796 netdev->kbits_burst = kbits_burst;
1799 if (!error || error == ENODEV) {
1800 netdev->netdev_policing_error = error;
1801 netdev->cache_valid |= VALID_POLICING;
/* Adds to 'types' the 'ovs_name' of every entry in the null-terminated 'tcs'
 * array that has a 'tc_install' callback and a nonempty 'ovs_name'.  The
 * 'netdev' argument is unused. */
1807 netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
1810 const struct tc_ops *const *opsp;
1812 for (opsp = tcs; *opsp != NULL; opsp++) {
1813 const struct tc_ops *ops = *opsp;
1814 if (ops->tc_install && ops->ovs_name[0] != '\0') {
1815 sset_add(types, ops->ovs_name);
/* Searches the null-terminated 'tcs' array for the entry whose 'ovs_name'
 * equals 'name'. */
1821 static const struct tc_ops *
1822 tc_lookup_ovs_name(const char *name)
1824 const struct tc_ops *const *opsp;
1826 for (opsp = tcs; *opsp != NULL; opsp++) {
1827 const struct tc_ops *ops = *opsp;
1828 if (!strcmp(name, ops->ovs_name)) {
/* Searches the null-terminated 'tcs' array for the entry whose 'linux_name'
 * equals 'name'.  Entries with a null 'linux_name' never match. */
1835 static const struct tc_ops *
1836 tc_lookup_linux_name(const char *name)
1838 const struct tc_ops *const *opsp;
1840 for (opsp = tcs; *opsp != NULL; opsp++) {
1841 const struct tc_ops *ops = *opsp;
1842 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
/* Searches the hash bucket selected by 'hash' in 'netdev_''s tc queue map
 * for the queue whose 'queue_id' matches. */
1849 static struct tc_queue *
1850 tc_find_queue__(const struct netdev *netdev_, unsigned int queue_id,
1853 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1854 struct tc_queue *queue;
1856 HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev->tc->queues) {
1857 if (queue->queue_id == queue_id) {
/* Convenience wrapper around tc_find_queue__() that derives the hash from
 * 'queue_id' with hash_int(queue_id, 0). */
1864 static struct tc_queue *
1865 tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
1867 return tc_find_queue__(netdev, queue_id, hash_int(queue_id, 0));
/* Looks up the QoS implementation named 'type' with tc_lookup_ovs_name() and
 * reports its queue count in 'caps->n_queues'.  The 'netdev' argument is
 * unused. */
1871 netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
1873 struct netdev_qos_capabilities *caps)
1875 const struct tc_ops *ops = tc_lookup_ovs_name(type);
1879 caps->n_queues = ops->n_queues;
/* Reports the QoS type configured on 'netdev_' in '*typep' (the 'ovs_name'
 * of the current tc ops) after refreshing the qdisc state with
 * tc_query_qdisc().  When the ops provide a 'qdisc_get' callback it is
 * invoked to fill 'details'. */
1884 netdev_linux_get_qos(const struct netdev *netdev_,
1885 const char **typep, struct smap *details)
1887 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1890 error = tc_query_qdisc(netdev_);
1895 *typep = netdev->tc->ops->ovs_name;
1896 return (netdev->tc->ops->qdisc_get
1897 ? netdev->tc->ops->qdisc_get(netdev_, details)
/* Configures QoS of the given 'type' with 'details' on 'netdev_'.
 *
 * 'type' must name a tc implementation that has a 'tc_install' callback.
 * If that implementation is already the active one, only its 'qdisc_set'
 * callback (if any) is invoked.  Otherwise the existing qdisc is deleted and
 * the new one installed; the assertions check that 'netdev->tc' is null after
 * deletion and non-null exactly when installation succeeded. */
1902 netdev_linux_set_qos(struct netdev *netdev_,
1903 const char *type, const struct smap *details)
1905 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1906 const struct tc_ops *new_ops;
1909 new_ops = tc_lookup_ovs_name(type);
1910 if (!new_ops || !new_ops->tc_install) {
1914 error = tc_query_qdisc(netdev_);
1919 if (new_ops == netdev->tc->ops) {
1920 return new_ops->qdisc_set ? new_ops->qdisc_set(netdev_, details) : 0;
1922 /* Delete existing qdisc. */
1923 error = tc_del_qdisc(netdev_);
1927 ovs_assert(netdev->tc == NULL);
1929 /* Install new qdisc. */
1930 error = new_ops->tc_install(netdev_, details);
1931 ovs_assert((error == 0) == (netdev->tc != NULL));
/* Retrieves the configuration of queue 'queue_id' on 'netdev_' into
 * 'details': refreshes qdisc state, locates the queue with tc_find_queue(),
 * and delegates to the active tc ops' 'class_get' callback. */
1938 netdev_linux_get_queue(const struct netdev *netdev_,
1939 unsigned int queue_id, struct smap *details)
1941 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1944 error = tc_query_qdisc(netdev_);
1948 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
1950 ? netdev->tc->ops->class_get(netdev_, queue, details)
/* Configures queue 'queue_id' on 'netdev_' from 'details' through the active
 * tc ops' 'class_set' callback.  A 'queue_id' at or beyond the ops'
 * 'n_queues', or a missing 'class_set' callback, is rejected before the
 * callback is reached. */
1956 netdev_linux_set_queue(struct netdev *netdev_,
1957 unsigned int queue_id, const struct smap *details)
1959 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1962 error = tc_query_qdisc(netdev_);
1965 } else if (queue_id >= netdev->tc->ops->n_queues
1966 || !netdev->tc->ops->class_set) {
1970 return netdev->tc->ops->class_set(netdev_, queue_id, details);
/* Deletes queue 'queue_id' from 'netdev_' through the active tc ops'
 * 'class_delete' callback, after refreshing qdisc state and locating the
 * queue with tc_find_queue().  A missing callback is handled separately. */
1974 netdev_linux_delete_queue(struct netdev *netdev_, unsigned int queue_id)
1976 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1979 error = tc_query_qdisc(netdev_);
1982 } else if (!netdev->tc->ops->class_delete) {
1985 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
1987 ? netdev->tc->ops->class_delete(netdev_, queue)
/* Retrieves statistics for queue 'queue_id' on 'netdev_' into '*stats'.
 * 'stats->created' is taken from the queue record itself; the remaining
 * fields are filled by the active tc ops' 'class_get_stats' callback, whose
 * result is returned. */
1993 netdev_linux_get_queue_stats(const struct netdev *netdev_,
1994 unsigned int queue_id,
1995 struct netdev_queue_stats *stats)
1997 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2000 error = tc_query_qdisc(netdev_);
2003 } else if (!netdev->tc->ops->class_get_stats) {
2006 const struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2010 stats->created = queue->created;
2011 return netdev->tc->ops->class_get_stats(netdev_, queue, stats);
/* Starts a NETLINK_ROUTE dump in 'dump' of the traffic classes on 'netdev':
 * builds an RTM_GETTCLASS request with 'tcm_parent' 0, passes it to
 * nl_dump_start(), and then releases the request buffer. */
2016 start_queue_dump(const struct netdev *netdev, struct nl_dump *dump)
2018 struct ofpbuf request;
2019 struct tcmsg *tcmsg;
2021 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
2025 tcmsg->tcm_parent = 0;
2026 nl_dump_start(dump, NETLINK_ROUTE, &request);
2027 ofpbuf_uninit(&request);
/* Iterates over the queues recorded for 'netdev_', using the "safe" hmap
 * iterator.  For each queue, 'details' is cleared and refilled by the active
 * tc ops' 'class_get' callback, and the code contains a call to
 * '(*cb)(queue_id, &details, aux)'.  Requires the ops to provide
 * 'class_get'. */
2032 netdev_linux_dump_queues(const struct netdev *netdev_,
2033 netdev_dump_queues_cb *cb, void *aux)
2035 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2036 struct tc_queue *queue, *next_queue;
2037 struct smap details;
2041 error = tc_query_qdisc(netdev_);
2044 } else if (!netdev->tc->ops->class_get) {
2049 smap_init(&details);
2050 HMAP_FOR_EACH_SAFE (queue, next_queue, hmap_node,
2051 &netdev->tc->queues) {
2052 smap_clear(&details);
2054 error = netdev->tc->ops->class_get(netdev_, queue, &details);
2056 (*cb)(queue->queue_id, &details, aux);
2061 smap_destroy(&details);
/* Dumps per-queue statistics for 'netdev_' by running a kernel traffic-class
 * dump and handing every reply message, together with 'cb' and 'aux', to the
 * active tc ops' 'class_dump_stats' callback.  An error from finishing the
 * dump takes precedence over 'last_error' in the return value. */
2067 netdev_linux_dump_queue_stats(const struct netdev *netdev_,
2068 netdev_dump_queue_stats_cb *cb, void *aux)
2070 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2071 struct nl_dump dump;
2076 error = tc_query_qdisc(netdev_);
2079 } else if (!netdev->tc->ops->class_dump_stats) {
2084 if (!start_queue_dump(netdev_, &dump)) {
2087 while (nl_dump_next(&dump, &msg)) {
2088 error = netdev->tc->ops->class_dump_stats(netdev_, &msg, cb, aux);
2094 error = nl_dump_done(&dump);
2095 return error ? error : last_error;
/* Stores the IPv4 address and netmask of 'netdev_' in '*address' and
 * '*netmask'.
 *
 * The values are cached: while VALID_IN4 is clear they are fetched with
 * SIOCGIFADDR and SIOCGIFNETMASK and the flag is then set.  Returns
 * EADDRNOTAVAIL when the resulting address is INADDR_ANY, otherwise 0 on
 * this path. */
2099 netdev_linux_get_in4(const struct netdev *netdev_,
2100 struct in_addr *address, struct in_addr *netmask)
2102 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2104 if (!(netdev->cache_valid & VALID_IN4)) {
2107 error = netdev_linux_get_ipv4(netdev_, &netdev->address,
2108 SIOCGIFADDR, "SIOCGIFADDR");
2113 error = netdev_linux_get_ipv4(netdev_, &netdev->netmask,
2114 SIOCGIFNETMASK, "SIOCGIFNETMASK");
2119 netdev->cache_valid |= VALID_IN4;
2121 *address = netdev->address;
2122 *netmask = netdev->netmask;
2123 return address->s_addr == INADDR_ANY ? EADDRNOTAVAIL : 0;
2127 netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
2128 struct in_addr netmask)
2130 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2133 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
2135 netdev->cache_valid |= VALID_IN4;
2136 netdev->address = address;
2137 netdev->netmask = netmask;
2138 if (address.s_addr != INADDR_ANY) {
2139 error = do_set_addr(netdev_, SIOCSIFNETMASK,
2140 "SIOCSIFNETMASK", netmask);
/* Parses one 'line' in the format of /proc/net/if_inet6.  The format string
 * reads 16 two-digit hex bytes into 'in6->s6_addr', skips four hex fields,
 * and reads an interface name of at most 16 characters (the 'ifname' buffer
 * holds 16 + 1 bytes). */
2147 parse_if_inet6_line(const char *line,
2148 struct in6_addr *in6, char ifname[16 + 1])
2150 uint8_t *s6 = in6->s6_addr;
2151 #define X8 "%2"SCNx8
2153 " "X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8
2154 "%*x %*x %*x %*x %16s\n",
2155 &s6[0], &s6[1], &s6[2], &s6[3],
2156 &s6[4], &s6[5], &s6[6], &s6[7],
2157 &s6[8], &s6[9], &s6[10], &s6[11],
2158 &s6[12], &s6[13], &s6[14], &s6[15],
/* The address is cached in 'netdev->in6' under VALID_IN6.  On a cache miss
 * it defaults to in6addr_any and /proc/net/if_inet6 is scanned line by line
 * for an entry whose interface name equals this netdev's name. */
2162 /* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address (if
2163 * 'in6' is non-null) and returns true. Otherwise, returns false. */
2165 netdev_linux_get_in6(const struct netdev *netdev_, struct in6_addr *in6)
2167 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2168 if (!(netdev->cache_valid & VALID_IN6)) {
2172 netdev->in6 = in6addr_any;
2174 file = fopen("/proc/net/if_inet6", "r");
2176 const char *name = netdev_get_name(netdev_);
2177 while (fgets(line, sizeof line, file)) {
2178 struct in6_addr in6_tmp;
2179 char ifname[16 + 1];
2180 if (parse_if_inet6_line(line, &in6_tmp, ifname)
2181 && !strcmp(name, ifname))
2183 netdev->in6 = in6_tmp;
2189 netdev->cache_valid |= VALID_IN6;
/* Fills '*sa' with an AF_INET socket address for 'addr'.  A zeroed
 * sockaddr_in is built locally and copied over a zeroed '*sa', so every byte
 * of '*sa' not covered by the sockaddr_in is 0. */
2196 make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
2198 struct sockaddr_in sin;
2199 memset(&sin, 0, sizeof sin);
2200 sin.sin_family = AF_INET;
2201 sin.sin_addr = addr;
2204 memset(sa, 0, sizeof *sa);
2205 memcpy(sa, &sin, sizeof sin);
/* Issues address-setting ioctl 'ioctl_nr' on 'netdev' with IPv4 address
 * 'addr': the ifreq carries the device name and an AF_INET sockaddr built by
 * make_in4_sockaddr().  Returns the result of netdev_linux_do_ioctl(). */
2209 do_set_addr(struct netdev *netdev,
2210 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
2213 ovs_strzcpy(ifr.ifr_name, netdev_get_name(netdev), sizeof ifr.ifr_name);
2214 make_in4_sockaddr(&ifr.ifr_addr, addr);
2216 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
/* The route is installed with SIOCADDRT on 'af_inet_sock': destination and
 * genmask are both INADDR_ANY, the gateway is 'router', and the flags are
 * RTF_UP | RTF_GATEWAY.  A failure is logged with its errno.  The 'netdev'
 * argument is unused. */
2220 /* Adds 'router' as a default IP gateway. */
2222 netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
2224 struct in_addr any = { INADDR_ANY };
2228 memset(&rt, 0, sizeof rt);
2229 make_in4_sockaddr(&rt.rt_dst, any);
2230 make_in4_sockaddr(&rt.rt_gateway, router);
2231 make_in4_sockaddr(&rt.rt_genmask, any);
2232 rt.rt_flags = RTF_UP | RTF_GATEWAY;
2233 error = ioctl(af_inet_sock, SIOCADDRT, &rt) < 0 ? errno : 0;
2235 VLOG_WARN("ioctl(SIOCADDRT): %s", ovs_strerror(error));
/* Finds the route that covers 'host' by scanning /proc/net/route.
 *
 * '*netdev_name' starts out null.  Each parsed line must yield 11 fields or
 * it is logged as unparseable; routes without RTF_UP are skipped.  For a
 * route whose masked destination matches the masked 'host' address,
 * '*next_hop' is set to 0 (directly reachable) or to the route's gateway,
 * and '*netdev_name' receives an xstrdup()'d copy of the interface name. */
2241 netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
2244 static const char fn[] = "/proc/net/route";
2249 *netdev_name = NULL;
2250 stream = fopen(fn, "r");
2251 if (stream == NULL) {
2252 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, ovs_strerror(errno));
2257 while (fgets(line, sizeof line, stream)) {
2260 ovs_be32 dest, gateway, mask;
2261 int refcnt, metric, mtu;
2262 unsigned int flags, use, window, irtt;
2265 "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
2267 iface, &dest, &gateway, &flags, &refcnt,
2268 &use, &metric, &mask, &mtu, &window, &irtt) != 11) {
2270 VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
2274 if (!(flags & RTF_UP)) {
2275 /* Skip routes that aren't up. */
2279 /* The output of 'dest', 'mask', and 'gateway' were given in
2280 * network byte order, so we don't need any endian
2281 * conversions here. */
2282 if ((dest & mask) == (host->s_addr & mask)) {
2284 /* The host is directly reachable. */
2285 next_hop->s_addr = 0;
2287 /* To reach the host, we must go through a gateway. */
2288 next_hop->s_addr = gateway;
2290 *netdev_name = xstrdup(iface);
/* Adds the driver name, driver version and firmware version of 'netdev_' to
 * 'smap'.
 *
 * The driver information is cached in 'netdev->drvinfo' under VALID_DRVINFO.
 * On a cache miss the structure is zeroed and filled by an ethtool query
 * ("ETHTOOL_GDRVINFO"); its address is cast to 'struct ethtool_cmd *' for
 * that call. */
2302 netdev_linux_get_status(const struct netdev *netdev_, struct smap *smap)
2304 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2307 if (!(netdev->cache_valid & VALID_DRVINFO)) {
2308 struct ethtool_cmd *cmd = (struct ethtool_cmd *) &netdev->drvinfo;
2310 COVERAGE_INC(netdev_get_ethtool);
2311 memset(&netdev->drvinfo, 0, sizeof netdev->drvinfo);
2312 error = netdev_linux_do_ethtool(netdev->up.name,
2315 "ETHTOOL_GDRVINFO");
2317 netdev->cache_valid |= VALID_DRVINFO;
2322 smap_add(smap, "driver_name", netdev->drvinfo.driver);
2323 smap_add(smap, "driver_version", netdev->drvinfo.version);
2324 smap_add(smap, "firmware_version", netdev->drvinfo.fw_version);
/* Status for internal devices: reports the fixed driver name "openvswitch"
 * in 'smap'.  The 'netdev' argument is unused. */
2330 netdev_internal_get_status(const struct netdev *netdev OVS_UNUSED,
2333 smap_add(smap, "driver_name", "openvswitch");
2337 /* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
2338 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
2339 * returns 0. Otherwise, it returns a positive errno value; in particular,
2340 * ENXIO indicates that there is no ARP table entry for 'ip' on 'netdev'. */
2342 netdev_linux_arp_lookup(const struct netdev *netdev,
2343 ovs_be32 ip, uint8_t mac[ETH_ADDR_LEN])
2346 struct sockaddr_in sin;
2349 memset(&r, 0, sizeof r);
2350 memset(&sin, 0, sizeof sin);
2351 sin.sin_family = AF_INET;
2352 sin.sin_addr.s_addr = ip;
2354 memcpy(&r.arp_pa, &sin, sizeof sin);
2355 r.arp_ha.sa_family = ARPHRD_ETHER;
2357 ovs_strzcpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
2358 COVERAGE_INC(netdev_arp_lookup);
2359 retval = ioctl(af_inet_sock, SIOCGARP, &r) < 0 ? errno : 0;
2361 memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
2362 } else if (retval != ENXIO) {
2363 VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
2364 netdev_get_name(netdev), IP_ARGS(ip),
2365 ovs_strerror(retval));
/* Translates the NETDEV_UP and NETDEV_PROMISC bits of 'nd' into interface
 * flag bits (the counterpart of iff_to_nd_flags()). */
2371 nd_to_iff_flags(enum netdev_flags nd)
2374 if (nd & NETDEV_UP) {
2377 if (nd & NETDEV_PROMISC) {
/* Translates interface flag bits in 'iff' into 'enum netdev_flags' bits,
 * starting from 0; IFF_PROMISC maps to NETDEV_PROMISC. */
2384 iff_to_nd_flags(int iff)
2386 enum netdev_flags nd = 0;
2390 if (iff & IFF_PROMISC) {
2391 nd |= NETDEV_PROMISC;
/* Turns the flags in 'off' off and those in 'on' on for 'netdev_', reporting
 * the previous flags in '*old_flagsp'.
 *
 * The previous flags come from the cached 'netdev->ifi_flags'.  Only when
 * the computed flags differ are they written with set_flags(), after which
 * the cache is refreshed with get_flags(). */
2397 netdev_linux_update_flags(struct netdev *netdev_, enum netdev_flags off,
2398 enum netdev_flags on, enum netdev_flags *old_flagsp)
2400 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2401 int old_flags, new_flags;
2404 old_flags = netdev->ifi_flags;
2405 *old_flagsp = iff_to_nd_flags(old_flags);
2406 new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
2407 if (new_flags != old_flags) {
2408 error = set_flags(netdev_get_name(netdev_), new_flags);
2409 get_flags(netdev_, &netdev->ifi_flags);
/* Returns the 'change_seq' member of the netdev_linux that 'netdev' is cast
 * to. */
2415 netdev_linux_change_seq(const struct netdev *netdev)
2417 return netdev_linux_cast(netdev)->change_seq;
/* Expands to the member list of a netdev class that shares the Linux
 * implementation.  The macro parameters NAME, CREATE, GET_STATS, SET_STATS,
 * GET_FEATURES and GET_STATUS are supplied per class; the other callbacks
 * listed are the common netdev_linux_*() functions.  The order of the
 * entries is significant because they are positional. */
2420 #define NETDEV_LINUX_CLASS(NAME, CREATE, GET_STATS, SET_STATS, \
2421 GET_FEATURES, GET_STATUS) \
2425 netdev_linux_init, \
2427 netdev_linux_wait, \
2430 netdev_linux_destroy, \
2431 NULL, /* get_config */ \
2432 NULL, /* set_config */ \
2433 NULL, /* get_tunnel_config */ \
2435 netdev_linux_rx_open, \
2437 netdev_linux_send, \
2438 netdev_linux_send_wait, \
2440 netdev_linux_set_etheraddr, \
2441 netdev_linux_get_etheraddr, \
2442 netdev_linux_get_mtu, \
2443 netdev_linux_set_mtu, \
2444 netdev_linux_get_ifindex, \
2445 netdev_linux_get_carrier, \
2446 netdev_linux_get_carrier_resets, \
2447 netdev_linux_set_miimon_interval, \
2452 netdev_linux_set_advertisements, \
2454 netdev_linux_set_policing, \
2455 netdev_linux_get_qos_types, \
2456 netdev_linux_get_qos_capabilities, \
2457 netdev_linux_get_qos, \
2458 netdev_linux_set_qos, \
2459 netdev_linux_get_queue, \
2460 netdev_linux_set_queue, \
2461 netdev_linux_delete_queue, \
2462 netdev_linux_get_queue_stats, \
2463 netdev_linux_dump_queues, \
2464 netdev_linux_dump_queue_stats, \
2466 netdev_linux_get_in4, \
2467 netdev_linux_set_in4, \
2468 netdev_linux_get_in6, \
2469 netdev_linux_add_router, \
2470 netdev_linux_get_next_hop, \
2472 netdev_linux_arp_lookup, \
2474 netdev_linux_update_flags, \
2476 netdev_linux_change_seq \
/* Class for ordinary Linux network devices: created by
 * netdev_linux_create(), statistics from netdev_linux_get_stats(), no
 * set_stats, ethtool-based features and status. */
2479 const struct netdev_class netdev_linux_class =
2482 netdev_linux_create,
2483 netdev_linux_get_stats,
2484 NULL, /* set_stats */
2485 netdev_linux_get_features,
2486 netdev_linux_get_status);
/* Class for tap devices: created by netdev_linux_create_tap(), statistics
 * from netdev_tap_get_stats(), no set_stats, ethtool-based features and
 * status. */
2488 const struct netdev_class netdev_tap_class =
2491 netdev_linux_create_tap,
2492 netdev_tap_get_stats,
2493 NULL, /* set_stats */
2494 netdev_linux_get_features,
2495 netdev_linux_get_status);
/* Class for internal devices: created by netdev_linux_create(), statistics
 * read and written through the vport layer, no get_features, and a fixed
 * status from netdev_internal_get_status(). */
2497 const struct netdev_class netdev_internal_class =
2500 netdev_linux_create,
2501 netdev_internal_get_stats,
2502 netdev_internal_set_stats,
2503 NULL, /* get_features */
2504 netdev_internal_get_status);
/* Receive-side class for Linux devices, listing the destroy, recv, wait and
 * drain callbacks in that order. */
2506 static const struct netdev_rx_class netdev_rx_linux_class = {
2507 netdev_rx_linux_destroy,
2508 netdev_rx_linux_recv,
2509 netdev_rx_linux_wait,
2510 netdev_rx_linux_drain,
/* Declarations for the HTB implementation: the HTB_N_QUEUES limit and the
 * per-qdisc and per-class parameter records.  Rates are kept in bytes/s and
 * the burst in bytes, as noted on each member. */
2513 /* HTB traffic control class. */
2515 #define HTB_N_QUEUES 0xf000
2519 unsigned int max_rate; /* In bytes/s. */
2523 struct tc_queue tc_queue;
2524 unsigned int min_rate; /* In bytes/s. */
2525 unsigned int max_rate; /* In bytes/s. */
2526 unsigned int burst; /* In bytes. */
2527 unsigned int priority; /* Lower values are higher priorities. */
/* Returns the 'struct htb' that embeds 'netdev_''s current 'tc' member
 * (via CONTAINER_OF), i.e. it assumes the active tc is an HTB instance. */
2531 htb_get__(const struct netdev *netdev_)
2533 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2534 return CONTAINER_OF(netdev->tc, struct htb, tc);
/* Allocates a new 'struct htb', initializes its embedded tc with
 * 'tc_ops_htb', records 'max_rate', and makes it 'netdev_''s current tc.
 *
 * NOTE(review): 'max_rate' arrives as uint64_t; if the 'max_rate' member it
 * is stored into is the 'unsigned int' declared above, larger values are
 * narrowed -- confirm callers never exceed that range. */
2538 htb_install__(struct netdev *netdev_, uint64_t max_rate)
2540 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2543 htb = xmalloc(sizeof *htb);
2544 tc_init(&htb->tc, &tc_ops_htb);
2545 htb->max_rate = max_rate;
2547 netdev->tc = &htb->tc;
/* Any existing qdisc is deleted first with tc_del_qdisc().  The request is
 * an RTM_NEWQDISC with NLM_F_EXCL | NLM_F_CREATE, handle 1:0 and parent
 * TC_H_ROOT, carrying kind "htb" and a zeroed tc_htb_glob whose
 * 'rate2quantum' is 10 as TCA_HTB_INIT inside TCA_OPTIONS.  Returns the
 * result of tc_transact(). */
2550 /* Create an HTB qdisc.
2552 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
2554 htb_setup_qdisc__(struct netdev *netdev)
2557 struct tc_htb_glob opt;
2558 struct ofpbuf request;
2559 struct tcmsg *tcmsg;
2561 tc_del_qdisc(netdev);
2563 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2564 NLM_F_EXCL | NLM_F_CREATE, &request);
2568 tcmsg->tcm_handle = tc_make_handle(1, 0);
2569 tcmsg->tcm_parent = TC_H_ROOT;
2571 nl_msg_put_string(&request, TCA_KIND, "htb");
2573 memset(&opt, 0, sizeof opt);
2574 opt.rate2quantum = 10;
2578 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2579 nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
2580 nl_msg_end_nested(&request, opt_offset);
2582 return tc_transact(&request, NULL);
2585 /* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
2586 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
/* 'handle' and 'parent' are complete tc handles (see tc_make_handle()).
 * The device MTU is fetched first because the rate tables and buffer sizes
 * are computed from it; a device without an MTU only produces a warning
 * here.  The request is sent as RTM_NEWTCLASS with NLM_F_CREATE and carries
 * TCA_HTB_PARMS plus rate tables for both 'rate' and 'ceil'.  The final
 * warning reports the class parameters together with the error string from
 * the transaction. */
2588 htb_setup_class__(struct netdev *netdev, unsigned int handle,
2589 unsigned int parent, struct htb_class *class)
2592 struct tc_htb_opt opt;
2593 struct ofpbuf request;
2594 struct tcmsg *tcmsg;
2598 error = netdev_get_mtu(netdev, &mtu);
2600 VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU",
2601 netdev_get_name(netdev));
2605 memset(&opt, 0, sizeof opt);
2606 tc_fill_rate(&opt.rate, class->min_rate, mtu);
2607 tc_fill_rate(&opt.ceil, class->max_rate, mtu);
2608 opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
2609 opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
2610 opt.prio = class->priority;
2612 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
2616 tcmsg->tcm_handle = handle;
2617 tcmsg->tcm_parent = parent;
2619 nl_msg_put_string(&request, TCA_KIND, "htb");
2620 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2621 nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
2622 tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
2623 tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
2624 nl_msg_end_nested(&request, opt_offset);
2626 error = tc_transact(&request, NULL);
2628 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
2629 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
2630 netdev_get_name(netdev),
2631 tc_get_major(handle), tc_get_minor(handle),
2632 tc_get_major(parent), tc_get_minor(parent),
2633 class->min_rate, class->max_rate,
2634 class->burst, class->priority, ovs_strerror(error));
2639 /* Parses Netlink attributes in 'options' for HTB parameters and stores a
2640 * description of them into 'details'. The description complies with the
2641 * specification given in the vswitch database documentation for linux-htb
/* Decodes the nested HTB attributes in 'nl_options' into 'class'.
 *
 * The policy requires TCA_HTB_PARMS to be present and at least
 * sizeof(struct tc_htb_opt) bytes long; a parse failure is logged.  From the
 * parsed tc_htb_opt: rate.rate becomes min_rate, ceil.rate becomes max_rate,
 * prio becomes priority, and 'buffer' (in ticks) is converted to bytes at
 * the minimum rate to give burst. */
2644 htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
2646 static const struct nl_policy tca_htb_policy[] = {
2647 [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
2648 .min_len = sizeof(struct tc_htb_opt) },
2651 struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
2652 const struct tc_htb_opt *htb;
2654 if (!nl_parse_nested(nl_options, tca_htb_policy,
2655 attrs, ARRAY_SIZE(tca_htb_policy))) {
2656 VLOG_WARN_RL(&rl, "failed to parse HTB class options");
2660 htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
2661 class->min_rate = htb->rate.rate;
2662 class->max_rate = htb->ceil.rate;
2663 class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
2664 class->priority = htb->prio;
/* Parses the class message 'tcmsg' with tc_parse_class(), which also fills
 * 'stats'.
 *
 * 'queue_id' and 'options' are optional outputs (each is tested for null
 * before use).  A queue ID is produced only for handles whose major is 1
 * and whose minor lies in 1..HTB_N_QUEUES; the ID is the minor minus one.
 * When 'options' is nonnull the HTB-specific attributes are decoded into
 * it by htb_parse_tca_options__(). */
2669 htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2670 struct htb_class *options,
2671 struct netdev_queue_stats *stats)
2673 struct nlattr *nl_options;
2674 unsigned int handle;
2677 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2678 if (!error && queue_id) {
2679 unsigned int major = tc_get_major(handle);
2680 unsigned int minor = tc_get_minor(handle);
2681 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2682 *queue_id = minor - 1;
2687 if (!error && options) {
2688 error = htb_parse_tca_options__(nl_options, options);
/* Fills 'hc' with the qdisc-level (root class) parameters from 'details'.
 *
 * "max-rate" is read as a decimal number and divided by 8 (callers report
 * it multiplied by 8, so the configured unit is presumably bits/s).  When
 * it is absent or evaluates to zero, the rate is derived from the device's
 * current features with 100 * 1000 * 1000 passed as the second argument of
 * netdev_features_to_bps().  min_rate is set equal to max_rate.
 *
 * NOTE(review): the second argument on the netdev_get_features() line looks
 * like an encoding-damaged "&current" (HTML-entity corruption) -- restore it
 * before compiling. */
2694 htb_parse_qdisc_details__(struct netdev *netdev,
2695 const struct smap *details, struct htb_class *hc)
2697 const char *max_rate_s;
2699 max_rate_s = smap_get(details, "max-rate");
2700 hc->max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
2701 if (!hc->max_rate) {
2702 enum netdev_features current;
2704 netdev_get_features(netdev, ¤t, NULL, NULL, NULL);
2705 hc->max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
2707 hc->min_rate = hc->max_rate;
/* Fills 'hc' with per-queue HTB parameters taken from 'details' ("min-rate",
 * "max-rate", "burst", "priority"), clamped against the qdisc-wide limit
 * stored in the netdev's struct htb.
 *
 * Visible rules: min_rate is raised to at least 'mtu' and capped at
 * htb->max_rate; max_rate is raised to at least min_rate and capped at
 * htb->max_rate; burst is raised to at least mtu + 64.  The rate and burst
 * strings are divided by 8 after parsing; priority is used as given.
 * NOTE(review): strtoull()/strtoul() are called without end-pointer or errno
 * checks, so malformed strings silently parse as 0 or a prefix. */
2713 htb_parse_class_details__(struct netdev *netdev,
2714 const struct smap *details, struct htb_class *hc)
2716 const struct htb *htb = htb_get__(netdev);
2717 const char *min_rate_s = smap_get(details, "min-rate");
2718 const char *max_rate_s = smap_get(details, "max-rate");
2719 const char *burst_s = smap_get(details, "burst");
2720 const char *priority_s = smap_get(details, "priority");
2723 error = netdev_get_mtu(netdev, &mtu);
2725 VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU",
2726 netdev_get_name(netdev));
2730 /* HTB requires at least an mtu sized min-rate to send any traffic even
2731 * on uncongested links. */
2732 hc->min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
2733 hc->min_rate = MAX(hc->min_rate, mtu);
2734 hc->min_rate = MIN(hc->min_rate, htb->max_rate);
2737 hc->max_rate = (max_rate_s
2738 ? strtoull(max_rate_s, NULL, 10) / 8
2740 hc->max_rate = MAX(hc->max_rate, hc->min_rate);
2741 hc->max_rate = MIN(hc->max_rate, htb->max_rate);
2745 * According to hints in the documentation that I've read, it is important
2746 * that 'burst' be at least as big as the largest frame that might be
2747 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
2748 * but having it a bit too small is a problem. Since netdev_get_mtu()
2749 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
2750 * the MTU. We actually add 64, instead of 14, as a guard against
2751 * additional headers get tacked on somewhere that we're not aware of. */
2752 hc->burst = burst_s ? strtoull(burst_s, NULL, 10) / 8 : 0;
2753 hc->burst = MAX(hc->burst, mtu + 64);
2756 hc->priority = priority_s ? strtoul(priority_s, NULL, 10) : 0;
/* Asks the kernel about class 'handle' under 'parent' on 'netdev' using
 * tc_query_class(), decodes the reply into 'options' and 'stats' via
 * htb_parse_tcmsg__() (no queue ID is requested), and releases the reply
 * buffer with ofpbuf_delete(). */
2762 htb_query_class__(const struct netdev *netdev, unsigned int handle,
2763 unsigned int parent, struct htb_class *options,
2764 struct netdev_queue_stats *stats)
2766 struct ofpbuf *reply;
2769 error = tc_query_class(netdev, handle, parent, &reply);
2771 error = htb_parse_tcmsg__(reply, NULL, options, stats);
2772 ofpbuf_delete(reply);
/* tc_install implementation for HTB: creates the root qdisc, then the root
 * class 1:0xfffe (parent 1:0) configured from 'details', and finally
 * attaches an HTB tc object to 'netdev' carrying the resulting max_rate. */
2778 htb_tc_install(struct netdev *netdev, const struct smap *details)
2782 error = htb_setup_qdisc__(netdev);
2784 struct htb_class hc;
2786 htb_parse_qdisc_details__(netdev, details, &hc);
2787 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2788 tc_make_handle(1, 0), &hc);
2790 htb_install__(netdev, hc.max_rate);
/* Converts a generic 'queue' back to the struct htb_class that embeds it as
 * member 'tc_queue'.  Only valid for queues that were created as part of an
 * htb_class. */
2796 static struct htb_class *
2797 htb_class_cast__(const struct tc_queue *queue)
2799 return CONTAINER_OF(queue, struct htb_class, tc_queue);
/* Records the parameters in 'hc' for queue 'queue_id' in the netdev's
 * in-memory HTB state.
 *
 * The queue is looked up with tc_find_queue__() using hash_int(queue_id, 0).
 * The lines below show both outcomes: an existing entry is reused through
 * htb_class_cast__(), otherwise a new htb_class is allocated, stamped with
 * the queue ID and creation time, and inserted into htb->tc.queues.  Either
 * way the four parameters are then copied from 'hc'. */
2803 htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
2804 const struct htb_class *hc)
2806 struct htb *htb = htb_get__(netdev);
2807 size_t hash = hash_int(queue_id, 0);
2808 struct tc_queue *queue;
2809 struct htb_class *hcp;
2811 queue = tc_find_queue__(netdev, queue_id, hash);
2813 hcp = htb_class_cast__(queue);
2815 hcp = xmalloc(sizeof *hcp);
2816 queue = &hcp->tc_queue;
2817 queue->queue_id = queue_id;
2818 queue->created = time_msec();
2819 hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
2822 hcp->min_rate = hc->min_rate;
2823 hcp->max_rate = hc->max_rate;
2824 hcp->burst = hc->burst;
2825 hcp->priority = hc->priority;
/* tc_load implementation for HTB: rebuilds the in-memory state from what the
 * kernel already has configured.  'nlmsg' is unused.
 *
 * First the root class 1:0xfffe is queried to learn the qdisc max_rate and
 * an HTB tc object is installed with it; then all classes are dumped and
 * each one that parses with a valid queue ID is recorded with
 * htb_update_queue__().
 *
 * NOTE(review): the result of htb_query_class__() is not examined on the
 * visible line, yet hc.max_rate is consumed right after -- confirm that 'hc'
 * is given a defined value beforehand. */
2829 htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
2832 struct nl_dump dump;
2833 struct htb_class hc;
2835 /* Get qdisc options. */
2837 htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
2838 htb_install__(netdev, hc.max_rate);
2841 if (!start_queue_dump(netdev, &dump)) {
2844 while (nl_dump_next(&dump, &msg)) {
2845 unsigned int queue_id;
2847 if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
2848 htb_update_queue__(netdev, queue_id, &hc);
2851 nl_dump_done(&dump);
/* tc_destroy implementation for HTB: recovers the struct htb from 'tc' and
 * walks its queue map with the removal-safe iterator, unlinking every
 * htb_class from htb->tc.queues. */
2857 htb_tc_destroy(struct tc *tc)
2859 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
2860 struct htb_class *hc, *next;
2862 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &htb->tc.queues) {
2863 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
/* qdisc_get implementation for HTB: adds "max-rate" to 'details', reported
 * as 8 times the stored bytes/s value (computed in unsigned long long so the
 * multiplication cannot wrap). */
2871 htb_qdisc_get(const struct netdev *netdev, struct smap *details)
2873 const struct htb *htb = htb_get__(netdev);
2874 smap_add_format(details, "max-rate", "%llu", 8ULL * htb->max_rate);
/* qdisc_set implementation for HTB: re-derives the root class parameters
 * from 'details', pushes them to class 1:0xfffe (parent 1:0), and stores the
 * new max_rate in the netdev's struct htb. */
2879 htb_qdisc_set(struct netdev *netdev, const struct smap *details)
2881 struct htb_class hc;
2884 htb_parse_qdisc_details__(netdev, details, &hc);
2885 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2886 tc_make_handle(1, 0), &hc);
2888 htb_get__(netdev)->max_rate = hc.max_rate;
/* class_get implementation for HTB: describes 'queue' in 'details'.
 * 'netdev' is unused.
 *
 * Rates and burst are reported as 8 times the stored byte values.
 * "max-rate" is emitted only when it differs from "min-rate".  Any further
 * conditions guarding the "burst" and "priority" entries are not visible in
 * this excerpt. */
2894 htb_class_get(const struct netdev *netdev OVS_UNUSED,
2895 const struct tc_queue *queue, struct smap *details)
2897 const struct htb_class *hc = htb_class_cast__(queue);
2899 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
2900 if (hc->min_rate != hc->max_rate) {
2901 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
2903 smap_add_format(details, "burst", "%llu", 8ULL * hc->burst);
2905 smap_add_format(details, "priority", "%u", hc->priority);
/* class_set implementation for HTB: parses 'details' into class parameters,
 * configures kernel class 1:(queue_id + 1) under parent 1:0xfffe, and then
 * mirrors the parameters into the in-memory queue table. */
2911 htb_class_set(struct netdev *netdev, unsigned int queue_id,
2912 const struct smap *details)
2914 struct htb_class hc;
2917 error = htb_parse_class_details__(netdev, details, &hc);
2922 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
2923 tc_make_handle(1, 0xfffe), &hc);
2928 htb_update_queue__(netdev, queue_id, &hc);
/* class_delete implementation for HTB: asks the kernel to delete class
 * 1:(queue_id + 1) and unlinks the matching htb_class from
 * htb->tc.queues. */
2933 htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
2935 struct htb_class *hc = htb_class_cast__(queue);
2936 struct htb *htb = htb_get__(netdev);
2939 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
2941 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
/* class_get_stats implementation for HTB: queries kernel class
 * 1:(queue_id + 1) under parent 1:0xfffe and fills only 'stats' (no class
 * options are requested).  Returns the result of htb_query_class__(). */
2948 htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
2949 struct netdev_queue_stats *stats)
2951 return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
2952 tc_make_handle(1, 0xfffe), NULL, stats);
/* class_dump_stats implementation for HTB: parses one dumped class message
 * 'nlmsg' for its handle and statistics and, when the handle has major 1 and
 * a minor in 1..HTB_N_QUEUES, invokes 'cb' with queue ID (minor - 1), the
 * statistics and 'aux'.  'netdev' is unused. */
2956 htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
2957 const struct ofpbuf *nlmsg,
2958 netdev_dump_queue_stats_cb *cb, void *aux)
2960 struct netdev_queue_stats stats;
2961 unsigned int handle, major, minor;
2964 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
2969 major = tc_get_major(handle);
2970 minor = tc_get_minor(handle);
2971 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2972 (*cb)(minor - 1, &stats, aux);
/* Operations table for the HTB traffic-control class: kernel qdisc name
 * "htb", OVS QoS type "linux-htb", and HTB_N_QUEUES queues.  The entries
 * between n_queues and class_get_stats are not visible in this excerpt. */
2977 static const struct tc_ops tc_ops_htb = {
2978 "htb", /* linux_name */
2979 "linux-htb", /* ovs_name */
2980 HTB_N_QUEUES, /* n_queues */
2989 htb_class_get_stats,
2990 htb_class_dump_stats
2993 /* "linux-hfsc" traffic control class. */
/* Largest HFSC class minor that is treated as a queue: minors
 * 1..HFSC_N_QUEUES map to queue IDs 0..HFSC_N_QUEUES-1.  Also used as
 * n_queues in tc_ops_hfsc. */
2995 #define HFSC_N_QUEUES 0xf000
/* Embedded generic queue; presumably a member of struct hfsc_class, since
 * hfsc_class_cast__() recovers that struct through a member of this name. */
3003 struct tc_queue tc_queue;
/* Returns the struct hfsc whose embedded 'tc' member is 'netdev_''s current
 * tc object (netdev->tc).  No check is made that the installed tc really is
 * an HFSC one. */
3008 static struct hfsc *
3009 hfsc_get__(const struct netdev *netdev_)
3011 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3012 return CONTAINER_OF(netdev->tc, struct hfsc, tc);
/* Converts a generic 'queue' back to the struct hfsc_class that embeds it as
 * member 'tc_queue'. */
3015 static struct hfsc_class *
3016 hfsc_class_cast__(const struct tc_queue *queue)
3018 return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
/* Allocates a struct hfsc with xmalloc(), initializes its embedded tc with
 * tc_ops_hfsc, records 'max_rate' and makes it 'netdev_''s tc object. */
3022 hfsc_install__(struct netdev *netdev_, uint32_t max_rate)
3024 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3027 hfsc = xmalloc(sizeof *hfsc);
3028 tc_init(&hfsc->tc, &tc_ops_hfsc);
3029 hfsc->max_rate = max_rate;
3030 netdev->tc = &hfsc->tc;
/* Records the rates in 'hc' for queue 'queue_id' in the netdev's in-memory
 * HFSC state.
 *
 * The queue is looked up with tc_find_queue__() using hash_int(queue_id, 0).
 * The lines below show both outcomes: an existing entry is reused through
 * hfsc_class_cast__(), otherwise a new hfsc_class is allocated, stamped with
 * the queue ID and creation time, and inserted into hfsc->tc.queues.  Then
 * min_rate and max_rate are copied from 'hc'. */
3034 hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
3035 const struct hfsc_class *hc)
3039 struct hfsc_class *hcp;
3040 struct tc_queue *queue;
3042 hfsc = hfsc_get__(netdev);
3043 hash = hash_int(queue_id, 0);
3045 queue = tc_find_queue__(netdev, queue_id, hash);
3047 hcp = hfsc_class_cast__(queue);
3049 hcp = xmalloc(sizeof *hcp);
3050 queue = &hcp->tc_queue;
3051 queue->queue_id = queue_id;
3052 queue->created = time_msec();
3053 hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
3056 hcp->min_rate = hc->min_rate;
3057 hcp->max_rate = hc->max_rate;
/* Decodes the nested HFSC attributes in 'nl_options' into 'class'.
 *
 * Three service curves are read (TCA_HFSC_RSC, TCA_HFSC_FSC, TCA_HFSC_USC),
 * each validated by policy as at least sizeof(struct tc_service_curve).
 * Only the configuration shape that this file itself generates is accepted;
 * a warning is logged for:
 *   - any curve with nonzero m1 or d (non-linear curves),
 *   - rsc->m2 differing from fsc->m2,
 *   - rsc->m2 exceeding usc->m2.
 * On acceptance min_rate is fsc->m2 and max_rate is usc->m2. */
3061 hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
3063 const struct tc_service_curve *rsc, *fsc, *usc;
3064 static const struct nl_policy tca_hfsc_policy[] = {
3066 .type = NL_A_UNSPEC,
3068 .min_len = sizeof(struct tc_service_curve),
3071 .type = NL_A_UNSPEC,
3073 .min_len = sizeof(struct tc_service_curve),
3076 .type = NL_A_UNSPEC,
3078 .min_len = sizeof(struct tc_service_curve),
3081 struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];
3083 if (!nl_parse_nested(nl_options, tca_hfsc_policy,
3084 attrs, ARRAY_SIZE(tca_hfsc_policy))) {
3085 VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
3089 rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
3090 fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
3091 usc = nl_attr_get(attrs[TCA_HFSC_USC]);
3093 if (rsc->m1 != 0 || rsc->d != 0 ||
3094 fsc->m1 != 0 || fsc->d != 0 ||
3095 usc->m1 != 0 || usc->d != 0) {
3096 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3097 "Non-linear service curves are not supported.");
3101 if (rsc->m2 != fsc->m2) {
3102 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3103 "Real-time service curves are not supported ");
3107 if (rsc->m2 > usc->m2) {
3108 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3109 "Min-rate service curve is greater than "
3110 "the max-rate service curve.");
3114 class->min_rate = fsc->m2;
3115 class->max_rate = usc->m2;
/* Parses the class message 'tcmsg' with tc_parse_class(), which also fills
 * 'stats'.  The visible lines store a queue ID of (minor - 1) into
 * '*queue_id' for handles with major 1 and minor in 1..HFSC_N_QUEUES, and
 * decode HFSC options into 'options' via hfsc_parse_tca_options__().  The
 * conditions guarding those two steps are not visible in this excerpt;
 * callers here pass NULL for 'queue_id' or 'options', so they are presumably
 * null-checked. */
3120 hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
3121 struct hfsc_class *options,
3122 struct netdev_queue_stats *stats)
3125 unsigned int handle;
3126 struct nlattr *nl_options;
3128 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
3134 unsigned int major, minor;
3136 major = tc_get_major(handle);
3137 minor = tc_get_minor(handle);
3138 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3139 *queue_id = minor - 1;
3146 error = hfsc_parse_tca_options__(nl_options, options);
/* Asks the kernel about class 'handle' under 'parent' on 'netdev' using
 * tc_query_class(), decodes the reply into 'options' and 'stats' via
 * hfsc_parse_tcmsg__() (no queue ID is requested), and releases the reply
 * buffer with ofpbuf_delete(). */
3153 hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
3154 unsigned int parent, struct hfsc_class *options,
3155 struct netdev_queue_stats *stats)
3158 struct ofpbuf *reply;
3160 error = tc_query_class(netdev, handle, parent, &reply);
3165 error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
3166 ofpbuf_delete(reply);
/* Fills 'class' with the qdisc-level (root class) rates from 'details'.
 *
 * "max-rate" is read as a decimal number and divided by 8.  The fallback
 * shown below derives the rate from the device's current features, with
 * 100 * 1000 * 1000 passed as the second argument of
 * netdev_features_to_bps(); the condition selecting the fallback is not
 * visible here.  Both min_rate and max_rate are set to the result.
 *
 * NOTE(review): the second argument on the netdev_get_features() line looks
 * like an encoding-damaged "&current" (HTML-entity corruption) -- restore it
 * before compiling. */
3171 hfsc_parse_qdisc_details__(struct netdev *netdev, const struct smap *details,
3172 struct hfsc_class *class)
3175 const char *max_rate_s;
3177 max_rate_s = smap_get(details, "max-rate");
3178 max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
3181 enum netdev_features current;
3183 netdev_get_features(netdev, ¤t, NULL, NULL, NULL);
3184 max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
3187 class->min_rate = max_rate;
3188 class->max_rate = max_rate;
/* Fills 'class' with per-queue HFSC rates taken from 'details' ("min-rate"
 * and "max-rate"), clamped against the qdisc-wide limit in the netdev's
 * struct hfsc.
 *
 * Visible rules: min_rate is raised to at least 1 and capped at
 * hfsc->max_rate; max_rate is raised to at least min_rate and capped at
 * hfsc->max_rate.  Both strings are divided by 8 after parsing.
 * NOTE(review): the strtoull() results are stored into uint32_t locals, so
 * very large configured rates are narrowed -- confirm this is acceptable. */
3192 hfsc_parse_class_details__(struct netdev *netdev,
3193 const struct smap *details,
3194 struct hfsc_class * class)
3196 const struct hfsc *hfsc;
3197 uint32_t min_rate, max_rate;
3198 const char *min_rate_s, *max_rate_s;
3200 hfsc = hfsc_get__(netdev);
3201 min_rate_s = smap_get(details, "min-rate");
3202 max_rate_s = smap_get(details, "max-rate");
3204 min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
3205 min_rate = MAX(min_rate, 1);
3206 min_rate = MIN(min_rate, hfsc->max_rate);
3208 max_rate = (max_rate_s
3209 ? strtoull(max_rate_s, NULL, 10) / 8
3211 max_rate = MAX(max_rate, min_rate);
3212 max_rate = MIN(max_rate, hfsc->max_rate);
3214 class->min_rate = min_rate;
3215 class->max_rate = max_rate;
3220 /* Create an HFSC qdisc.
3222 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
/* Steps visible below: remove whatever qdisc is present with tc_del_qdisc(),
 * build an RTM_NEWQDISC request (NLM_F_EXCL | NLM_F_CREATE) with handle 1:0
 * and parent TC_H_ROOT, name the kind "hfsc", and attach a zero-initialized
 * struct tc_hfsc_qopt directly as TCA_OPTIONS (not nested).  The return
 * value is whatever tc_transact() reports. */
3224 hfsc_setup_qdisc__(struct netdev * netdev)
3226 struct tcmsg *tcmsg;
3227 struct ofpbuf request;
3228 struct tc_hfsc_qopt opt;
3230 tc_del_qdisc(netdev);
3232 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3233 NLM_F_EXCL | NLM_F_CREATE, &request);
3239 tcmsg->tcm_handle = tc_make_handle(1, 0);
3240 tcmsg->tcm_parent = TC_H_ROOT;
3242 memset(&opt, 0, sizeof opt);
3245 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3246 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
3248 return tc_transact(&request, NULL);
3251 /* Create an HFSC class.
3253 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
3254 * sc rate <min_rate> ul rate <max_rate>" */
/* 'handle' and 'parent' are complete tc handles (see tc_make_handle()).
 * The same 'min' curve (m2 = class->min_rate) is sent as both the real-time
 * (TCA_HFSC_RSC) and fair-share (TCA_HFSC_FSC) curve; 'max'
 * (m2 = class->max_rate) is sent as the upper limit (TCA_HFSC_USC).
 * NOTE(review): only the m2 assignments are visible here -- confirm the
 * remaining fields of 'min' and 'max' are initialized before they are put
 * into the message.  The final warning reports the rates and the error
 * string from the transaction. */
3256 hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
3257 unsigned int parent, struct hfsc_class *class)
3261 struct tcmsg *tcmsg;
3262 struct ofpbuf request;
3263 struct tc_service_curve min, max;
3265 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
3271 tcmsg->tcm_handle = handle;
3272 tcmsg->tcm_parent = parent;
3276 min.m2 = class->min_rate;
3280 max.m2 = class->max_rate;
3282 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3283 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3284 nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
3285 nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
3286 nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
3287 nl_msg_end_nested(&request, opt_offset);
3289 error = tc_transact(&request, NULL);
3291 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
3292 "min-rate %ubps, max-rate %ubps (%s)",
3293 netdev_get_name(netdev),
3294 tc_get_major(handle), tc_get_minor(handle),
3295 tc_get_major(parent), tc_get_minor(parent),
3296 class->min_rate, class->max_rate, ovs_strerror(error));
/* tc_install implementation for HFSC: creates the root qdisc, then the root
 * class 1:0xfffe (parent 1:0) configured from 'details', and finally
 * attaches an HFSC tc object to 'netdev' carrying the resulting max_rate. */
3303 hfsc_tc_install(struct netdev *netdev, const struct smap *details)
3306 struct hfsc_class class;
3308 error = hfsc_setup_qdisc__(netdev);
3314 hfsc_parse_qdisc_details__(netdev, details, &class);
3315 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3316 tc_make_handle(1, 0), &class);
3322 hfsc_install__(netdev, class.max_rate);
/* tc_load implementation for HFSC: rebuilds the in-memory state from what
 * the kernel already has configured.  'nlmsg' is unused.
 *
 * First the root class 1:0xfffe is queried to learn the qdisc max_rate and
 * an HFSC tc object is installed with it; then all classes are dumped and
 * each one that parses with a valid queue ID is recorded with
 * hfsc_update_queue__().
 *
 * NOTE(review): the result of hfsc_query_class__() is not examined on the
 * visible line, yet hc.max_rate is consumed right after -- confirm that 'hc'
 * is given a defined value beforehand. */
3327 hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3330 struct nl_dump dump;
3331 struct hfsc_class hc;
3334 hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
3335 hfsc_install__(netdev, hc.max_rate);
3337 if (!start_queue_dump(netdev, &dump)) {
3341 while (nl_dump_next(&dump, &msg)) {
3342 unsigned int queue_id;
3344 if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3345 hfsc_update_queue__(netdev, queue_id, &hc);
3349 nl_dump_done(&dump);
/* tc_destroy implementation for HFSC: recovers the struct hfsc from 'tc' and
 * walks its queue map with the removal-safe iterator, unlinking every
 * hfsc_class from hfsc->tc.queues. */
3354 hfsc_tc_destroy(struct tc *tc)
3357 struct hfsc_class *hc, *next;
3359 hfsc = CONTAINER_OF(tc, struct hfsc, tc);
3361 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
3362 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
/* qdisc_get implementation for HFSC: adds "max-rate" to 'details', reported
 * as 8 times the stored value (computed in unsigned long long so the
 * multiplication cannot wrap). */
3371 hfsc_qdisc_get(const struct netdev *netdev, struct smap *details)
3373 const struct hfsc *hfsc;
3374 hfsc = hfsc_get__(netdev);
3375 smap_add_format(details, "max-rate", "%llu", 8ULL * hfsc->max_rate);
/* qdisc_set implementation for HFSC: re-derives the root class rates from
 * 'details', pushes them to class 1:0xfffe (parent 1:0), and stores the new
 * max_rate in the netdev's struct hfsc. */
3380 hfsc_qdisc_set(struct netdev *netdev, const struct smap *details)
3383 struct hfsc_class class;
3385 hfsc_parse_qdisc_details__(netdev, details, &class);
3386 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3387 tc_make_handle(1, 0), &class);
3390 hfsc_get__(netdev)->max_rate = class.max_rate;
/* class_get implementation for HFSC: describes 'queue' in 'details'.
 * 'netdev' is unused.  Rates are reported as 8 times the stored values, and
 * "max-rate" is emitted only when it differs from "min-rate". */
3397 hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
3398 const struct tc_queue *queue, struct smap *details)
3400 const struct hfsc_class *hc;
3402 hc = hfsc_class_cast__(queue);
3403 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
3404 if (hc->min_rate != hc->max_rate) {
3405 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
/* class_set implementation for HFSC: parses 'details' into class rates,
 * configures kernel class 1:(queue_id + 1) under parent 1:0xfffe, and then
 * mirrors the rates into the in-memory queue table. */
3411 hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
3412 const struct smap *details)
3415 struct hfsc_class class;
3417 error = hfsc_parse_class_details__(netdev, details, &class);
3422 error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3423 tc_make_handle(1, 0xfffe), &class);
3428 hfsc_update_queue__(netdev, queue_id, &class);
/* class_delete implementation for HFSC: asks the kernel to delete class
 * 1:(queue_id + 1) and unlinks the matching hfsc_class from
 * hfsc->tc.queues. */
3433 hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
3437 struct hfsc_class *hc;
3439 hc = hfsc_class_cast__(queue);
3440 hfsc = hfsc_get__(netdev);
3442 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3444 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
/* class_get_stats implementation for HFSC: queries kernel class
 * 1:(queue_id + 1) under parent 1:0xfffe and fills only 'stats' (no class
 * options are requested).  Returns the result of hfsc_query_class__(). */
3451 hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3452 struct netdev_queue_stats *stats)
3454 return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3455 tc_make_handle(1, 0xfffe), NULL, stats);
/* class_dump_stats implementation for HFSC: parses one dumped class message
 * 'nlmsg' for its handle and statistics and, when the handle has major 1 and
 * a minor in 1..HFSC_N_QUEUES, invokes 'cb' with queue ID (minor - 1), the
 * statistics and 'aux'.  'netdev' is unused. */
3459 hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3460 const struct ofpbuf *nlmsg,
3461 netdev_dump_queue_stats_cb *cb, void *aux)
3463 struct netdev_queue_stats stats;
3464 unsigned int handle, major, minor;
3467 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3472 major = tc_get_major(handle);
3473 minor = tc_get_minor(handle);
3474 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3475 (*cb)(minor - 1, &stats, aux);
/* Operations table for the HFSC traffic-control class: kernel qdisc name
 * "hfsc", OVS QoS type "linux-hfsc", HFSC_N_QUEUES queues, and a handler for
 * every operation slot. */
3480 static const struct tc_ops tc_ops_hfsc = {
3481 "hfsc", /* linux_name */
3482 "linux-hfsc", /* ovs_name */
3483 HFSC_N_QUEUES, /* n_queues */
3484 hfsc_tc_install, /* tc_install */
3485 hfsc_tc_load, /* tc_load */
3486 hfsc_tc_destroy, /* tc_destroy */
3487 hfsc_qdisc_get, /* qdisc_get */
3488 hfsc_qdisc_set, /* qdisc_set */
3489 hfsc_class_get, /* class_get */
3490 hfsc_class_set, /* class_set */
3491 hfsc_class_delete, /* class_delete */
3492 hfsc_class_get_stats, /* class_get_stats */
3493 hfsc_class_dump_stats /* class_dump_stats */
3496 /* "linux-default" traffic control class.
3498 * This class represents the default, unnamed Linux qdisc. It corresponds to
3499 * the "" (empty string) QoS type in the OVS database. */
/* Points 'netdev_''s tc at a single function-local static const tc object
 * bound to tc_ops_default.  Nothing is allocated; every netdev using the
 * default class shares the same object. */
3502 default_install__(struct netdev *netdev_)
3504 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3505 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_default);
3507 /* Nothing but a tc class implementation is allowed to write to a tc. This
3508 * class never does that, so we can legitimately use a const tc object. */
3509 netdev->tc = CONST_CAST(struct tc *, &tc);
/* tc_install implementation for the default class: ignores 'details' and
 * attaches the shared default tc object to 'netdev'. */
3513 default_tc_install(struct netdev *netdev,
3514 const struct smap *details OVS_UNUSED)
3516 default_install__(netdev);
/* tc_load implementation for the default class: ignores 'nlmsg' and attaches
 * the shared default tc object to 'netdev'. */
3521 default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3523 default_install__(netdev);
/* Operations table for the default traffic-control class.  It has no kernel
 * qdisc name (linux_name is NULL) and leaves every slot from tc_destroy
 * through class_dump_stats NULL.  The entries between linux_name and
 * tc_destroy are not visible in this excerpt. */
3527 static const struct tc_ops tc_ops_default = {
3528 NULL, /* linux_name */
3533 NULL, /* tc_destroy */
3534 NULL, /* qdisc_get */
3535 NULL, /* qdisc_set */
3536 NULL, /* class_get */
3537 NULL, /* class_set */
3538 NULL, /* class_delete */
3539 NULL, /* class_get_stats */
3540 NULL /* class_dump_stats */
3543 /* "linux-other" traffic control class.
/* tc_load implementation for the "linux-other" class: ignores 'nlmsg' and
 * points 'netdev_''s tc at a function-local static const tc object bound to
 * tc_ops_other. */
3548 other_tc_load(struct netdev *netdev_, struct ofpbuf *nlmsg OVS_UNUSED)
3550 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3551 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_other);
3553 /* Nothing but a tc class implementation is allowed to write to a tc. This
3554 * class never does that, so we can legitimately use a const tc object. */
3555 netdev->tc = CONST_CAST(struct tc *, &tc);
/* Operations table for the "linux-other" class.  It has no kernel qdisc name
 * and no tc_install handler, and every slot from tc_destroy onward is NULL.
 * The n_queues and tc_load entries are not visible in this excerpt. */
3559 static const struct tc_ops tc_ops_other = {
3560 NULL, /* linux_name */
3561 "linux-other", /* ovs_name */
3563 NULL, /* tc_install */
3565 NULL, /* tc_destroy */
3566 NULL, /* qdisc_get */
3567 NULL, /* qdisc_set */
3568 NULL, /* class_get */
3569 NULL, /* class_set */
3570 NULL, /* class_delete */
3571 NULL, /* class_get_stats */
3572 NULL /* class_dump_stats */
3575 /* Traffic control. */
3577 /* Number of kernel "tc" ticks per second. */
/* Assigned from the /proc/net/psched values as (double) a * c / b and used
 * as the conversion factor in tc_ticks_to_bytes() and
 * tc_bytes_to_ticks(). */
3578 static double ticks_per_s;
3580 /* Number of kernel "jiffies" per second. This is used for the purpose of
3581 * computing buffer sizes. Generally kernel qdiscs need to be able to buffer
3582 * one jiffy's worth of data.
3584 * There are two possibilities here:
3586 * - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
3587 * approximate range of 100 to 1024. That means that we really need to
3588 * make sure that the qdisc can buffer that much data.
3590 * - 'buffer_hz' is an absurdly large number. That means that the kernel
3591 * has finely granular timers and there's no need to fudge additional room
3592 * for buffers. (There's no extra effort needed to implement that: the
3593 * large 'buffer_hz' is used as a divisor, so practically any number will
3594 * come out as 0 in the division. Small integer results in the case of
3595 * really high dividends won't have any real effect anyhow.)
/* Divisor used by tc_buffer_per_jiffy() to turn a byte rate into a per-jiffy
 * buffer size; logged together with ticks_per_s once the psched parameters
 * have been read. */
3597 static unsigned int buffer_hz;
3599 /* Returns tc handle 'major':'minor'. */
/* 'major' is shifted into the upper 16 bits before being combined with
 * 'minor' by TC_H_MAKE. */
3601 tc_make_handle(unsigned int major, unsigned int minor)
3603 return TC_H_MAKE(major << 16, minor);
3606 /* Returns the major number from 'handle'. */
/* Inverse of the major part of tc_make_handle(): masks with TC_H_MAJ and
 * shifts the result down by 16 bits. */
3608 tc_get_major(unsigned int handle)
3610 return TC_H_MAJ(handle) >> 16;
3613 /* Returns the minor number from 'handle'. */
/* Inverse of the minor part of tc_make_handle(): masks with TC_H_MIN. */
3615 tc_get_minor(unsigned int handle)
3617 return TC_H_MIN(handle);
/* Starts a traffic-control Netlink request for 'netdev' in 'request'.
 *
 * The device's ifindex is looked up with get_ifindex().  'request' is
 * initialized with a 512-byte buffer and receives a Netlink header of the
 * given 'type' with NLM_F_REQUEST | 'flags', followed by a zero-filled
 * struct tcmsg whose family is AF_UNSPEC and whose ifindex is the device's.
 * Returns a pointer into 'request'; the handle and parent fields are left
 * for the caller to set. */
3620 static struct tcmsg *
3621 tc_make_request(const struct netdev *netdev, int type, unsigned int flags,
3622 struct ofpbuf *request)
3624 struct tcmsg *tcmsg;
3628 error = get_ifindex(netdev, &ifindex);
3633 ofpbuf_init(request, 512);
3634 nl_msg_put_nlmsghdr(request, sizeof *tcmsg, type, NLM_F_REQUEST | flags);
3635 tcmsg = ofpbuf_put_zeros(request, sizeof *tcmsg);
3636 tcmsg->tcm_family = AF_UNSPEC;
3637 tcmsg->tcm_ifindex = ifindex;
3638 /* Caller should fill in tcmsg->tcm_handle. */
3639 /* Caller should fill in tcmsg->tcm_parent. */
/* Sends 'request' over NETLINK_ROUTE with nl_transact(), optionally
 * returning the reply through 'replyp', and then releases 'request' with
 * ofpbuf_uninit() regardless of the outcome -- callers must not reuse or
 * free 'request' afterwards. */
3645 tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
3647 int error = nl_transact(NETLINK_ROUTE, request, replyp);
3648 ofpbuf_uninit(request);
3652 /* Adds or deletes a root ingress qdisc on 'netdev'. We use this for
3653 * policing configuration.
3655 * This function is equivalent to running the following when 'add' is true:
3656 * /sbin/tc qdisc add dev <devname> handle ffff: ingress
3658 * This function is equivalent to running the following when 'add' is false:
3659 * /sbin/tc qdisc del dev <devname> handle ffff: ingress
3661 * The configuration and stats may be seen with the following command:
3662 * /sbin/tc -s qdisc show dev <devname>
3664 * Returns 0 if successful, otherwise a positive errno value.
/* Builds and sends an RTM_NEWQDISC (when 'add' is true, with
 * NLM_F_EXCL | NLM_F_CREATE) or RTM_DELQDISC (when false, no extra flags)
 * request for the "ingress" qdisc with handle ffff:0 and parent
 * TC_H_INGRESS, carrying an empty TCA_OPTIONS attribute.  On deletion,
 * ENOENT and EINVAL from the kernel get special treatment. */
3667 tc_add_del_ingress_qdisc(struct netdev *netdev, bool add)
3669 struct ofpbuf request;
3670 struct tcmsg *tcmsg;
3672 int type = add ? RTM_NEWQDISC : RTM_DELQDISC;
3673 int flags = add ? NLM_F_EXCL | NLM_F_CREATE : 0;
3675 tcmsg = tc_make_request(netdev, type, flags, &request);
3679 tcmsg->tcm_handle = tc_make_handle(0xffff, 0);
3680 tcmsg->tcm_parent = TC_H_INGRESS;
3681 nl_msg_put_string(&request, TCA_KIND, "ingress");
3682 nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0);
3684 error = tc_transact(&request, NULL);
3686 /* If we're deleting the qdisc, don't worry about some of the
3687 * error conditions. */
3688 if (!add && (error == ENOENT || error == EINVAL)) {
3697 /* Adds a policer to 'netdev' with a rate of 'kbits_rate' and a burst size
3700 * This function is equivalent to running:
3701 * /sbin/tc filter add dev <devname> parent ffff: protocol all prio 49
3702 * basic police rate <kbits_rate>kbit burst <kbits_burst>k
3705 * The configuration and stats may be seen with the following command:
3706 * /sbin/tc -s filter show dev <devname> parent ffff:
3708 * Returns 0 if successful, otherwise a positive errno value.
/* Installs a "basic" filter with a police action on the ingress qdisc
 * (parent ffff:0) of 'netdev'.
 *
 * The policer drops excess traffic (TC_POLICE_SHOT).  Its rate is
 * 'kbits_rate' converted to bytes/s as (kbits_rate * 1000) / 8, and its
 * burst is 'kbits_burst' * 1024 converted to ticks at that rate.  The
 * filter is created with priority 49 for protocol ETH_P_ALL, using
 * NLM_F_EXCL | NLM_F_CREATE, and the police parameters are nested as
 * TCA_OPTIONS > TCA_BASIC_POLICE > TCA_POLICE_TBF plus a rate table.
 *
 * NOTE(review): 'kbits_rate * 1000' and 'kbits_burst * 1024' are evaluated
 * in signed int; rates above roughly 2.1 Gbit/s (or similarly large bursts)
 * would overflow -- consider widening before multiplying. */
3711 tc_add_policer(struct netdev *netdev, int kbits_rate, int kbits_burst)
3713 struct tc_police tc_police;
3714 struct ofpbuf request;
3715 struct tcmsg *tcmsg;
3716 size_t basic_offset;
3717 size_t police_offset;
3721 memset(&tc_police, 0, sizeof tc_police);
3722 tc_police.action = TC_POLICE_SHOT;
3723 tc_police.mtu = mtu;
3724 tc_fill_rate(&tc_police.rate, (kbits_rate * 1000)/8, mtu);
3725 tc_police.burst = tc_bytes_to_ticks(tc_police.rate.rate,
3726 kbits_burst * 1024);
3728 tcmsg = tc_make_request(netdev, RTM_NEWTFILTER,
3729 NLM_F_EXCL | NLM_F_CREATE, &request);
3733 tcmsg->tcm_parent = tc_make_handle(0xffff, 0);
3734 tcmsg->tcm_info = tc_make_handle(49,
3735 (OVS_FORCE uint16_t) htons(ETH_P_ALL));
3737 nl_msg_put_string(&request, TCA_KIND, "basic");
3738 basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3739 police_offset = nl_msg_start_nested(&request, TCA_BASIC_POLICE);
3740 nl_msg_put_unspec(&request, TCA_POLICE_TBF, &tc_police, sizeof tc_police);
3741 tc_put_rtab(&request, TCA_POLICE_RATE, &tc_police.rate);
3742 nl_msg_end_nested(&request, police_offset);
3743 nl_msg_end_nested(&request, basic_offset);
3745 error = tc_transact(&request, NULL);
3756 /* The values in psched are not individually very meaningful, but they are
3757 * important. The tables below show some values seen in the wild.
3761 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
3762 * (Before that, there are hints that it was 1000000000.)
3764 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
3768 * -----------------------------------
3769 * [1] 000c8000 000f4240 000f4240 00000064
3770 * [2] 000003e8 00000400 000f4240 3b9aca00
3771 * [3] 000003e8 00000400 000f4240 3b9aca00
3772 * [4] 000003e8 00000400 000f4240 00000064
3773 * [5] 000003e8 00000040 000f4240 3b9aca00
3774 * [6] 000003e8 00000040 000f4240 000000f9
3776 * a b c d ticks_per_s buffer_hz
3777 * ------- --------- ---------- ------------- ----------- -------------
3778 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
3779 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3780 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3781 * [4] 1,000 1,024 1,000,000 100 976,562 100
3782 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
3783 * [6] 1,000 64 1,000,000 249 15,625,000 249
3785 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
3786 * [2] 2.6.26-1-686-bigmem from Debian lenny
3787 * [3] 2.6.26-2-sparc64 from Debian lenny
3788 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
3789 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
3790 * [6] 2.6.34 from kernel.org on KVM
/* Body of the routine that loads the kernel packet-scheduler timing
 * parameters (its signature is not visible in this excerpt).
 *
 * An ovsthread_once guard makes the work happen a single time.  The file
 * /proc/net/psched is opened and four hexadecimal values a, b, c, d are
 * read from it; open and read failures, invalid values and unexpected
 * values are each reported with VLOG_WARN.  ticks_per_s is computed as
 * (double) a * c / b.
 * NOTE(review): the lines that validate the values, assign buffer_hz and
 * close the stream are not visible here -- confirm 'b' is checked for zero
 * before the division and that the stream is closed on every path. */
3792 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
3793 static const char fn[] = "/proc/net/psched";
3794 unsigned int a, b, c, d;
3797 if (!ovsthread_once_start(&once)) {
3804 stream = fopen(fn, "r");
3806 VLOG_WARN("%s: open failed: %s", fn, ovs_strerror(errno));
3810 if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
3811 VLOG_WARN("%s: read failed", fn);
3815 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
3819 VLOG_WARN("%s: invalid scheduler parameters", fn);
3823 ticks_per_s = (double) a * c / b;
3827 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
3830 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);
3833 ovsthread_once_done(&once);
3836 /* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
3837 * rate of 'rate' bytes per second. */
/* NOTE(review): 'rate * ticks' is multiplied in unsigned int before the
 * division by the double ticks_per_s, so the product can wrap at 2^32 for
 * high rates or long intervals (e.g. 125,000,000 bytes/s * 1,000 ticks).
 * Converting one operand to double first would avoid the wrap. */
3839 tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
3842 return (rate * ticks) / ticks_per_s;
3845 /* Returns the number of ticks that it would take to transmit 'size' bytes at a
3846 * rate of 'rate' bytes per second. */
/* Returns 0 when 'rate' is 0 rather than dividing by zero.  The cast binds
 * to ticks_per_s alone, so its fractional part is discarded before the
 * multiplication, which is then carried out in unsigned long long. */
3848 tc_bytes_to_ticks(unsigned int rate, unsigned int size)
3851 return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
3854 /* Returns the number of bytes that need to be reserved for qdisc buffering at
3855 * a transmission rate of 'rate' bytes per second. */
/* Integer division, so a buffer_hz larger than 'rate' yields 0.
 * NOTE(review): this divides by buffer_hz unconditionally -- confirm it is
 * always initialized to a nonzero value before this line runs. */
3857 tc_buffer_per_jiffy(unsigned int rate)
3860 return rate / buffer_hz;
3863 /* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
3864 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
3865 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
3866 * stores NULL into it if it is absent.
3868 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
3871 * Returns 0 if successful, otherwise a positive errno value. */
/* Attributes are parsed starting after the Netlink header and the struct
 * tcmsg.  The policy makes TCA_KIND (a string) mandatory and TCA_OPTIONS
 * (nested) optional; a message that violates it is reported with a
 * rate-limited warning. */
3873 tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
3874 struct nlattr **options)
3876 static const struct nl_policy tca_policy[] = {
3877 [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
3878 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
3880 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3882 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3883 tca_policy, ta, ARRAY_SIZE(ta))) {
3884 VLOG_WARN_RL(&rl, "failed to parse qdisc message");
3889 *kind = nl_attr_get_string(ta[TCA_KIND]);
3893 *options = ta[TCA_OPTIONS];
3908 /* Given Netlink 'msg' that describes a class, extracts the queue ID (e.g. the
3909 * minor number of its class ID) into '*queue_id', its TCA_OPTIONS attribute
3910 * into '*options', and its queue statistics into '*stats'. Any of the output
3911 * arguments may be null.
3913 * Returns 0 if successful, otherwise a positive errno value. */
/* NOTE(review): the comment above mentions '*queue_id', but the parameter is
 * named 'handlep' and it receives the whole 'tcm_handle' field of the
 * message's struct tcmsg, not only its minor number.
 *
 * Both TCA_OPTIONS and TCA_STATS2 are mandatory in the top-level policy, so a
 * class message lacking either one fails to parse.  Inside TCA_STATS2, both
 * TCA_STATS_BASIC and TCA_STATS_QUEUE are required, with minimum lengths of
 * sizeof(struct gnet_stats_basic) and sizeof(struct gnet_stats_queue)
 * respectively. */
3915 tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
3916 struct nlattr **options, struct netdev_queue_stats *stats)
3918 static const struct nl_policy tca_policy[] = {
3919 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
3920 [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
3922 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3924 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3925 tca_policy, ta, ARRAY_SIZE(ta))) {
3926 VLOG_WARN_RL(&rl, "failed to parse class message");
3931 struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
3932 *handlep = tc->tcm_handle;
3936 *options = ta[TCA_OPTIONS];
3940 const struct gnet_stats_queue *gsq;
3941 struct gnet_stats_basic gsb;
3943 static const struct nl_policy stats_policy[] = {
3944 [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
3945 .min_len = sizeof gsb },
3946 [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
3947 .min_len = sizeof *gsq },
3949 struct nlattr *sa[ARRAY_SIZE(stats_policy)];
3951 if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
3952 sa, ARRAY_SIZE(sa))) {
3953 VLOG_WARN_RL(&rl, "failed to parse class stats");
3957 /* Alignment issues screw up the length of struct gnet_stats_basic on
3958 * some arch/bitsize combinations. Newer versions of Linux have a
3959 * struct gnet_stats_basic_packed, but we can't depend on that. The
3960 * easiest thing to do is just to make a copy. */
3961 memset(&gsb, 0, sizeof gsb);
3962 memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
3963 MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
3964 stats->tx_bytes = gsb.bytes;
3965 stats->tx_packets = gsb.packets;
3967 gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
/* The queue's drop counter is what gets reported as 'tx_errors'. */
3968 stats->tx_errors = gsq->drops;
/* NOTE(review): the condition guarding this memset() is not visible here;
 * presumably it clears '*stats' when the statistics could not be parsed. */
3978 memset(stats, 0, sizeof *stats);
/* Builds an RTM_GETTCLASS request (with NLM_F_ECHO) for 'netdev' through
 * tc_make_request(), stores 'handle' and 'parent' in its tcm_handle and
 * tcm_parent fields, and sends it with tc_transact(), passing 'replyp' to
 * receive the reply.  The warning below is rate-limited and names the class
 * and its parent as major:minor pairs together with ovs_strerror(error).
 * NOTE(review): the condition guarding the warning and the return statement
 * are not visible here -- presumably the warning is failure-only. */
3983 /* Queries the kernel for class with identifier 'handle' and parent 'parent'
3986 tc_query_class(const struct netdev *netdev,
3987 unsigned int handle, unsigned int parent,
3988 struct ofpbuf **replyp)
3990 struct ofpbuf request;
3991 struct tcmsg *tcmsg;
3994 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO, &request);
3998 tcmsg->tcm_handle = handle;
3999 tcmsg->tcm_parent = parent;
4001 error = tc_transact(&request, replyp);
4003 VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
4004 netdev_get_name(netdev),
4005 tc_get_major(handle), tc_get_minor(handle),
4006 tc_get_major(parent), tc_get_minor(parent),
4007 ovs_strerror(error));
4012 /* Equivalent to "tc class del dev <name> handle <handle>". */
/* Sends an RTM_DELTCLASS request (no extra flags) for 'netdev' with
 * tcm_handle set to 'handle' and tcm_parent set to 0.  No reply buffer is
 * requested: NULL is passed to tc_transact().  The warning identifies the
 * class as major:minor and includes ovs_strerror(error).
 * NOTE(review): the condition guarding the warning and the return statement
 * are not visible here. */
4014 tc_delete_class(const struct netdev *netdev, unsigned int handle)
4016 struct ofpbuf request;
4017 struct tcmsg *tcmsg;
4020 tcmsg = tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
4024 tcmsg->tcm_handle = handle;
4025 tcmsg->tcm_parent = 0;
4027 error = tc_transact(&request, NULL);
4029 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
4030 netdev_get_name(netdev),
4031 tc_get_major(handle), tc_get_minor(handle),
4032 ovs_strerror(error));
4037 /* Equivalent to "tc qdisc del dev <name> root". */
/* Requests deletion (RTM_DELQDISC) of the qdisc with handle 1:0 whose parent
 * is TC_H_ROOT.  An EINVAL result from tc_transact() is singled out (see the
 * comment in the body).  When there is no error and 'netdev->tc' is set, the
 * tc_destroy callback of its ops table is invoked if one is provided.
 * NOTE(review): what is done with 'error' in the EINVAL branch, and with
 * 'netdev->tc' after tc_destroy runs, is not visible here. */
4039 tc_del_qdisc(struct netdev *netdev_)
4041 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4042 struct ofpbuf request;
4043 struct tcmsg *tcmsg;
4046 tcmsg = tc_make_request(netdev_, RTM_DELQDISC, 0, &request);
4050 tcmsg->tcm_handle = tc_make_handle(1, 0);
4051 tcmsg->tcm_parent = TC_H_ROOT;
4053 error = tc_transact(&request, NULL);
4054 if (error == EINVAL) {
4055 /* EINVAL probably means that the default qdisc was in use, in which
4056 * case we've accomplished our purpose. */
4059 if (!error && netdev->tc) {
4060 if (netdev->tc->ops->tc_destroy) {
4061 netdev->tc->ops->tc_destroy(netdev->tc);
4068 /* If 'netdev''s qdisc type and parameters are not yet known, queries the
4069 * kernel to determine what they are. Returns 0 if successful, otherwise a
4070 * positive errno value. */
/* Outline of the visible logic:
 *   - An RTM_GETQDISC request (with NLM_F_ECHO) is sent for handle 1:0 with
 *     parent 0.
 *   - tc_parse_qdisc() extracts the qdisc kind from the reply and
 *     tc_lookup_linux_name() maps it to a 'struct tc_ops'; 'tc_ops_other' is
 *     the fallback, and an unrecognized kind is logged at INFO level through
 *     a dedicated rate limiter.
 *   - ENOENT selects 'tc_ops_default'; any other failure logs a warning and
 *     selects 'tc_ops_other'.
 *   - The chosen ops' tc_load() is called with 'qdisc', and the assertion
 *     checks that it succeeded exactly when 'netdev->tc' is non-null.
 *   - 'qdisc' is freed with ofpbuf_delete(), and the query error takes
 *     precedence over the load error in the return value.
 * NOTE(review): the early exit for an already-known qdisc and several branch
 * conditions are not visible here. */
4072 tc_query_qdisc(const struct netdev *netdev_)
4074 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4075 struct ofpbuf request, *qdisc;
4076 const struct tc_ops *ops;
4077 struct tcmsg *tcmsg;
4085 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
4086 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
4087 * 2.6.35 without that fix backported to it.
4089 * To avoid the OOPS, we must not make a request that would attempt to dump
4090 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
4091 * few others. There are a few ways that I can see to do this, but most of
4092 * them seem to be racy (and if you lose the race the kernel OOPSes). The
4093 * technique chosen here is to assume that any non-default qdisc that we
4094 * create will have a class with handle 1:0. The built-in qdiscs only have
4095 * a class with handle 0:0.
4097 * We could check for Linux 2.6.35+ and use a more straightforward method
4099 tcmsg = tc_make_request(netdev_, RTM_GETQDISC, NLM_F_ECHO, &request);
4103 tcmsg->tcm_handle = tc_make_handle(1, 0);
4104 tcmsg->tcm_parent = 0;
4106 /* Figure out what tc class to instantiate. */
4107 error = tc_transact(&request, &qdisc);
4111 error = tc_parse_qdisc(qdisc, &kind, NULL);
4113 ops = &tc_ops_other;
4115 ops = tc_lookup_linux_name(kind);
4117 static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
4118 VLOG_INFO_RL(&rl2, "unknown qdisc \"%s\"", kind);
4120 ops = &tc_ops_other;
4123 } else if (error == ENOENT) {
4124 /* Either it's a built-in qdisc, or it's a qdisc set up by some
4125 * other entity that doesn't have a handle 1:0. We will assume
4126 * that it's the system default qdisc. */
4127 ops = &tc_ops_default;
4130 /* Who knows? Maybe the device got deleted. */
4131 VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
4132 netdev_get_name(netdev_), ovs_strerror(error));
4133 ops = &tc_ops_other;
4136 /* Instantiate it. */
4137 load_error = ops->tc_load(CONST_CAST(struct netdev *, netdev_), qdisc);
4138 ovs_assert((load_error == 0) == (netdev->tc != NULL));
4139 ofpbuf_delete(qdisc);
4141 return error ? error : load_error;
4144 /* Linux traffic control uses tables with 256 entries ("rtab" tables) to
4145 approximate the time to transmit packets of various lengths. For an MTU of
4146 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
4147 represents two possible packet lengths; for a MTU of 513 through 1024, four
4148 possible lengths; and so on.
4150 Returns, for the specified 'mtu', the number of bits that packet lengths
4151 need to be shifted right to fit within such a 256-entry table. */
/* 'mtu' may first be replaced by ETH_PAYLOAD_MAX (the guarding condition is
 * not visible here); ETH_HEADER_LEN + VLAN_HEADER_LEN is then added to it, so
 * the table is sized for the full frame rather than the payload alone.  The
 * loop increments 'cell_log' for as long as 'mtu' is at least 256.
 * NOTE(review): the loop body is not visible here -- presumably it halves
 * 'mtu' on each iteration; TODO confirm. */
4153 tc_calc_cell_log(unsigned int mtu)
4158 mtu = ETH_PAYLOAD_MAX;
4160 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
4162 for (cell_log = 0; mtu >= 256; cell_log++) {
/* Zeroes '*rate', then sets 'cell_log' from tc_calc_cell_log(mtu) and 'mpu' to
 * ETH_TOTAL_MIN.  The 'overhead' and 'cell_align' members are deliberately
 * left to the memset(), as the commented-out assignments explain.
 * NOTE(review): 'Bps' is uint64_t, but the statement that stores it into
 * '*rate' is not visible here, so any narrowing cannot be checked from this
 * view -- TODO confirm. */
4169 /* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
4172 tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
4174 memset(rate, 0, sizeof *rate);
4175 rate->cell_log = tc_calc_cell_log(mtu);
4176 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
4177 /* rate->cell_align = 0; */ /* distro headers. */
4178 rate->mpu = ETH_TOTAL_MIN;
4182 /* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
4183 * attribute of the specified "type".
4185 * See tc_calc_cell_log() above for a description of "rtab"s. */
/* Reserves TC_RTAB_SIZE bytes in 'msg' as an uninitialized attribute of
 * 'type' and then fills every slot.  Slot 'i' is computed for a packet size
 * of (i + 1) << rate->cell_log, raised to 'rate->mpu' when it would otherwise
 * be smaller, and holds the tick count that tc_bytes_to_ticks() returns for
 * that size at 'rate->rate'. */
4187 tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
4192 rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
4193 for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
4194 unsigned packet_size = (i + 1) << rate->cell_log;
4195 if (packet_size < rate->mpu) {
4196 packet_size = rate->mpu;
4198 rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
/* The burst is never allowed to fall below tc_buffer_per_jiffy(Bps) + mtu
 * bytes; the larger of that minimum and 'burst_bytes' is converted to ticks
 * at rate 'Bps'.
 * NOTE(review): 'burst_bytes' is uint64_t while tc_bytes_to_ticks() takes an
 * unsigned int size, so a very large requested burst would be narrowed when
 * it is passed along -- TODO confirm whether callers bound it. */
4202 /* Calculates the proper value of 'buffer' or 'cbuffer' in HTB options given a
4203 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
4204 * burst size of 'burst_bytes'. (If no value was requested, a 'burst_bytes' of
4207 tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
4209 unsigned int min_burst = tc_buffer_per_jiffy(Bps) + mtu;
4210 return tc_bytes_to_ticks(Bps, MAX(burst_bytes, min_burst));
4213 /* Linux-only functions declared in netdev-linux.h */
4215 /* Returns a fd for an AF_INET socket or a negative errno value. */
/* netdev_linux_init() is called first; a nonzero result from it is negated
 * and returned in place of 'af_inet_sock', which is defined outside this
 * function. */
4217 netdev_linux_get_af_inet_sock(void)
4219 int error = netdev_linux_init();
4220 return error ? -error : af_inet_sock;
4223 /* Modifies the 'flag' bit in ethtool's flags field for 'netdev'. If
4224 * 'enable' is true, the bit is set. Otherwise, it is cleared. */
/* Works in three steps, each through netdev_linux_do_ethtool():
 *   1. read the current flags (ETHTOOL_GFLAGS);
 *   2. write them back with 'flag' set or cleared (ETHTOOL_SFLAGS);
 *   3. read them again (ETHTOOL_GFLAGS) and compare with the value written.
 * A mismatch in step 3 is reported with a rate-limited warning that includes
 * 'flag_name', which is used only for that message in the visible code.
 * NOTE(review): the error checks after each step and the return values are
 * not visible here. */
4226 netdev_linux_ethtool_set_flag(struct netdev *netdev, uint32_t flag,
4227 const char *flag_name, bool enable)
4229 const char *netdev_name = netdev_get_name(netdev);
4230 struct ethtool_value evalue;
4234 COVERAGE_INC(netdev_get_ethtool);
4235 memset(&evalue, 0, sizeof evalue);
4236 error = netdev_linux_do_ethtool(netdev_name,
4237 (struct ethtool_cmd *)&evalue,
4238 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
4243 COVERAGE_INC(netdev_set_ethtool);
4244 evalue.data = new_flags = (evalue.data & ~flag) | (enable ? flag : 0);
4245 error = netdev_linux_do_ethtool(netdev_name,
4246 (struct ethtool_cmd *)&evalue,
4247 ETHTOOL_SFLAGS, "ETHTOOL_SFLAGS");
4252 COVERAGE_INC(netdev_get_ethtool);
4253 memset(&evalue, 0, sizeof evalue);
4254 error = netdev_linux_do_ethtool(netdev_name,
4255 (struct ethtool_cmd *)&evalue,
4256 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
4261 if (new_flags != evalue.data) {
4262 VLOG_WARN_RL(&rl, "attempt to %s ethtool %s flag on network "
4263 "device %s failed", enable ? "enable" : "disable",
4264 flag_name, netdev_name);
4271 /* Utility functions. */
4273 /* Copies 'src' into 'dst', performing format conversion in the process. */
/* Every counter is copied by plain assignment to the identically named member
 * of 'dst'.
 * NOTE(review): the member types of the two structs are not visible here; the
 * "format conversion" is presumably the implicit integer conversion performed
 * by these assignments -- TODO confirm. */
4275 netdev_stats_from_rtnl_link_stats(struct netdev_stats *dst,
4276 const struct rtnl_link_stats *src)
4278 dst->rx_packets = src->rx_packets;
4279 dst->tx_packets = src->tx_packets;
4280 dst->rx_bytes = src->rx_bytes;
4281 dst->tx_bytes = src->tx_bytes;
4282 dst->rx_errors = src->rx_errors;
4283 dst->tx_errors = src->tx_errors;
4284 dst->rx_dropped = src->rx_dropped;
4285 dst->tx_dropped = src->tx_dropped;
4286 dst->multicast = src->multicast;
4287 dst->collisions = src->collisions;
4288 dst->rx_length_errors = src->rx_length_errors;
4289 dst->rx_over_errors = src->rx_over_errors;
4290 dst->rx_crc_errors = src->rx_crc_errors;
4291 dst->rx_frame_errors = src->rx_frame_errors;
4292 dst->rx_fifo_errors = src->rx_fifo_errors;
4293 dst->rx_missed_errors = src->rx_missed_errors;
4294 dst->tx_aborted_errors = src->tx_aborted_errors;
4295 dst->tx_carrier_errors = src->tx_carrier_errors;
4296 dst->tx_fifo_errors = src->tx_fifo_errors;
4297 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
4298 dst->tx_window_errors = src->tx_window_errors;
/* Fetches link statistics for the interface with index 'ifindex' into
 * '*stats'.  An RTM_GETLINK request is sent over NETLINK_ROUTE, the reply is
 * parsed against a policy that requires IFLA_IFNAME and accepts an optional
 * IFLA_STATS of at least sizeof(struct rtnl_link_stats) bytes, and the
 * IFLA_STATS payload is converted with netdev_stats_from_rtnl_link_stats().
 * The reply buffer is released with ofpbuf_delete() on each visible path that
 * follows the transaction; a reply without IFLA_STATS is logged.
 * NOTE(review): the return values and the check of the nl_transact() result
 * are not visible here. */
4302 get_stats_via_netlink(int ifindex, struct netdev_stats *stats)
4304 /* Policy for RTNLGRP_LINK messages.
4306 * There are *many* more fields in these messages, but currently we only
4307 * care about these fields. */
4308 static const struct nl_policy rtnlgrp_link_policy[] = {
4309 [IFLA_IFNAME] = { .type = NL_A_STRING, .optional = false },
4310 [IFLA_STATS] = { .type = NL_A_UNSPEC, .optional = true,
4311 .min_len = sizeof(struct rtnl_link_stats) },
4314 struct ofpbuf request;
4315 struct ofpbuf *reply;
4316 struct ifinfomsg *ifi;
4317 struct nlattr *attrs[ARRAY_SIZE(rtnlgrp_link_policy)];
4320 ofpbuf_init(&request, 0);
4321 nl_msg_put_nlmsghdr(&request, sizeof *ifi, RTM_GETLINK, NLM_F_REQUEST);
4322 ifi = ofpbuf_put_zeros(&request, sizeof *ifi);
4323 ifi->ifi_family = PF_UNSPEC;
4324 ifi->ifi_index = ifindex;
4325 error = nl_transact(NETLINK_ROUTE, &request, &reply);
4326 ofpbuf_uninit(&request);
4331 if (!nl_policy_parse(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg),
4332 rtnlgrp_link_policy,
4333 attrs, ARRAY_SIZE(rtnlgrp_link_policy))) {
4334 ofpbuf_delete(reply);
4338 if (!attrs[IFLA_STATS]) {
4339 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
4340 ofpbuf_delete(reply);
4344 netdev_stats_from_rtnl_link_stats(stats, nl_attr_get(attrs[IFLA_STATS]));
4346 ofpbuf_delete(reply);
/* Reads interface counters for 'netdev_name' from /proc/net/dev into
 * '*stats'.  The file is read one line at a time with fgets(); a line that
 * does not yield all 15 expected conversions is logged as a parse error.  On
 * the line whose device name equals 'netdev_name', seven counters
 * (rx_length_errors, rx_over_errors, rx_crc_errors, rx_missed_errors,
 * tx_aborted_errors, tx_heartbeat_errors, tx_window_errors) are set to
 * UINT64_MAX -- presumably to mark them as unavailable from this source.
 * A failure to open the file is logged with ovs_strerror(errno).
 * NOTE(review): the scan call itself, the handling of header lines, the
 * fclose() call, the condition guarding the "no stats" warning and the return
 * values are not visible here. */
4352 get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats)
4354 static const char fn[] = "/proc/net/dev";
4359 stream = fopen(fn, "r");
4361 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, ovs_strerror(errno));
4366 while (fgets(line, sizeof line, stream)) {
4369 #define X64 "%"SCNu64
4372 X64 X64 X64 X64 X64 X64 X64 "%*u"
4373 X64 X64 X64 X64 X64 X64 X64 "%*u",
4379 &stats->rx_fifo_errors,
4380 &stats->rx_frame_errors,
4386 &stats->tx_fifo_errors,
4388 &stats->tx_carrier_errors) != 15) {
4389 VLOG_WARN_RL(&rl, "%s:%d: parse error", fn, ln);
4390 } else if (!strcmp(devname, netdev_name)) {
4391 stats->rx_length_errors = UINT64_MAX;
4392 stats->rx_over_errors = UINT64_MAX;
4393 stats->rx_crc_errors = UINT64_MAX;
4394 stats->rx_missed_errors = UINT64_MAX;
4395 stats->tx_aborted_errors = UINT64_MAX;
4396 stats->tx_heartbeat_errors = UINT64_MAX;
4397 stats->tx_window_errors = UINT64_MAX;
4403 VLOG_WARN_RL(&rl, "%s: no stats for %s", fn, netdev_name);
/* Queries the interface flags of 'dev' with SIOCGIFFLAGS through
 * netdev_linux_do_ioctl(), using 'dev->name' as the device name, and stores
 * 'ifr.ifr_flags' into '*flags'.
 * NOTE(review): whether the store happens only on success, and the return
 * value, are not visible here. */
4409 get_flags(const struct netdev *dev, unsigned int *flags)
4415 error = netdev_linux_do_ioctl(dev->name, &ifr, SIOCGIFFLAGS,
4418 *flags = ifr.ifr_flags;
/* Sets the interface flags of the device called 'name' to 'flags' with
 * SIOCSIFFLAGS and returns whatever netdev_linux_do_ioctl() returns.
 * NOTE(review): 'flags' is unsigned int; the type of 'ifr_flags' is not
 * visible here and may be narrower -- TODO confirm no flag bits are lost. */
4424 set_flags(const char *name, unsigned int flags)
4428 ifr.ifr_flags = flags;
4429 return netdev_linux_do_ioctl(name, &ifr, SIOCSIFFLAGS, "SIOCSIFFLAGS");
/* Looks up the interface index of 'netdev_name' with SIOCGIFINDEX on
 * 'af_inet_sock'.  The name is copied into 'ifr.ifr_name' with ovs_strzcpy(),
 * bounded by the size of that field.  An ioctl() failure is reported with a
 * rate-limited warning; otherwise 'ifr.ifr_ifindex' is returned.
 * NOTE(review): the value returned on failure is not visible here --
 * get_ifindex() negates a failing result to obtain an error code, so it is
 * presumably a negative errno value; TODO confirm. */
4433 do_get_ifindex(const char *netdev_name)
4437 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4438 COVERAGE_INC(netdev_get_ifindex);
4439 if (ioctl(af_inet_sock, SIOCGIFINDEX, &ifr) < 0) {
4440 VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
4441 netdev_name, ovs_strerror(errno));
4444 return ifr.ifr_ifindex;
/* Stores the interface index of 'netdev_' into '*ifindexp' and returns the
 * recorded lookup error, using values cached in the netdev_linux structure.
 * When VALID_IFINDEX is not yet set in 'cache_valid', do_get_ifindex() is
 * called and one of two outcomes is recorded: either the error is set to
 * '-ifindex' with the index cleared to 0, or the error is cleared and the
 * index is stored.  VALID_IFINDEX is then set, so a failed lookup is cached
 * as well.
 * NOTE(review): the condition choosing between the two outcomes is not
 * visible here -- presumably 'ifindex < 0'. */
4448 get_ifindex(const struct netdev *netdev_, int *ifindexp)
4450 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4452 if (!(netdev->cache_valid & VALID_IFINDEX)) {
4453 int ifindex = do_get_ifindex(netdev_get_name(netdev_));
4456 netdev->get_ifindex_error = -ifindex;
4457 netdev->ifindex = 0;
4459 netdev->get_ifindex_error = 0;
4460 netdev->ifindex = ifindex;
4462 netdev->cache_valid |= VALID_IFINDEX;
4465 *ifindexp = netdev->ifindex;
4466 return netdev->get_ifindex_error;
/* Reads the hardware address of 'netdev_name' with SIOCGIFHWADDR on
 * 'af_inet_sock' and copies ETH_ADDR_LEN bytes of 'ifr.ifr_hwaddr.sa_data'
 * into 'ea'.  An ioctl() failure is logged at INFO level for ENODEV and at
 * ERR level otherwise.  An address family other than AF_UNSPEC or
 * ARPHRD_ETHER produces a warning.
 * NOTE(review): the return values are not visible here, nor is whether the
 * copy into 'ea' still happens after the address-family warning. */
4470 get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN])
4475 memset(&ifr, 0, sizeof ifr);
4476 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4477 COVERAGE_INC(netdev_get_hwaddr);
4478 if (ioctl(af_inet_sock, SIOCGIFHWADDR, &ifr) < 0) {
4479 /* ENODEV probably means that a vif disappeared asynchronously and
4480 * hasn't been removed from the database yet, so reduce the log level
4481 * to INFO for that case. */
4482 VLOG(errno == ENODEV ? VLL_INFO : VLL_ERR,
4483 "ioctl(SIOCGIFHWADDR) on %s device failed: %s",
4484 netdev_name, ovs_strerror(errno));
4487 hwaddr_family = ifr.ifr_hwaddr.sa_family;
4488 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER) {
4489 VLOG_WARN("%s device has unknown hardware address family %d",
4490 netdev_name, hwaddr_family);
4492 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
/* Sets the hardware address of 'netdev_name' to 'mac' with SIOCSIFHWADDR on
 * 'af_inet_sock'.  The request is zeroed first, the address family is set to
 * ARPHRD_ETHER, and ETH_ADDR_LEN bytes of 'mac' are copied into
 * 'ifr.ifr_hwaddr.sa_data'.  An ioctl() failure is logged at ERR level.
 * NOTE(review): the return values are not visible here. */
4497 set_etheraddr(const char *netdev_name,
4498 const uint8_t mac[ETH_ADDR_LEN])
4502 memset(&ifr, 0, sizeof ifr);
4503 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4504 ifr.ifr_hwaddr.sa_family = ARPHRD_ETHER;
4505 memcpy(ifr.ifr_hwaddr.sa_data, mac, ETH_ADDR_LEN);
4506 COVERAGE_INC(netdev_set_hwaddr);
4507 if (ioctl(af_inet_sock, SIOCSIFHWADDR, &ifr) < 0) {
4508 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
4509 netdev_name, ovs_strerror(errno));
/* Issues the ethtool request held in '*ecmd' for the device called 'name'
 * using SIOCETHTOOL on 'af_inet_sock'; 'ecmd' is handed to the kernel through
 * 'ifr.ifr_data'.  When the ioctl() fails, errors other than EOPNOTSUPP are
 * logged with a rate-limited warning that includes 'cmd_name', while
 * EOPNOTSUPP is deliberately left unlogged.
 * NOTE(review): how 'cmd' is stored into '*ecmd' and the return values are
 * not visible here. */
4516 netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
4517 int cmd, const char *cmd_name)
4521 memset(&ifr, 0, sizeof ifr);
4522 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
4523 ifr.ifr_data = (caddr_t) ecmd;
4526 if (ioctl(af_inet_sock, SIOCETHTOOL, &ifr) == 0) {
4529 if (errno != EOPNOTSUPP) {
4530 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
4531 "failed: %s", cmd_name, name, ovs_strerror(errno));
4533 /* The device doesn't support this operation. That's pretty
4534 * common, so there's no point in logging anything. */
/* Copies 'name' into 'ifr->ifr_name' (bounded by the size of that field) and
 * issues ioctl 'cmd' on 'af_inet_sock' with 'ifr' as its argument.  A result
 * of -1 is logged at debug level, rate-limited, using 'cmd_name' and
 * ovs_strerror(errno).  The caller's other 'ifr' members are left as passed
 * in by the visible code.
 * NOTE(review): the return values are not visible here. */
4541 netdev_linux_do_ioctl(const char *name, struct ifreq *ifr, int cmd,
4542 const char *cmd_name)
4544 ovs_strzcpy(ifr->ifr_name, name, sizeof ifr->ifr_name);
4545 if (ioctl(af_inet_sock, cmd, ifr) == -1) {
4546 VLOG_DBG_RL(&rl, "%s: ioctl(%s) failed: %s", name, cmd_name,
4547 ovs_strerror(errno));
/* Retrieves an IPv4 address of 'netdev' into '*ip' by issuing ioctl 'cmd'
 * through netdev_linux_do_ioctl(), with 'ifr.ifr_addr.sa_family' preset to
 * AF_INET; 'cmd_name' is passed along for that helper's log message.  The
 * result is read as a struct sockaddr_in and its 'sin_addr' is stored.
 * NOTE(review): the success check around the store, the second argument of
 * ALIGNED_CAST and the return value are not visible here. */
4554 netdev_linux_get_ipv4(const struct netdev *netdev, struct in_addr *ip,
4555 int cmd, const char *cmd_name)
4560 ifr.ifr_addr.sa_family = AF_INET;
4561 error = netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, cmd, cmd_name);
4563 const struct sockaddr_in *sin = ALIGNED_CAST(struct sockaddr_in *,
4565 *ip = sin->sin_addr;
4570 /* Returns an AF_PACKET raw socket or a negative errno value. */
4572 af_packet_sock(void)
4574 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
4577 if (ovsthread_once_start(&once)) {
4578 sock = socket(AF_PACKET, SOCK_RAW, 0);
4580 int error = set_nonblocking(sock);
4587 VLOG_ERR("failed to create packet socket: %s",
4588 ovs_strerror(errno));
4590 ovsthread_once_done(&once);