2 * Copyright (c) 2009, 2010, 2011, 2012, 2013 Nicira, Inc.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
19 #include "netdev-linux.h"
23 #include <arpa/inet.h>
25 #include <linux/filter.h>
26 #include <linux/gen_stats.h>
27 #include <linux/if_ether.h>
28 #include <linux/if_tun.h>
29 #include <linux/types.h>
30 #include <linux/ethtool.h>
31 #include <linux/mii.h>
32 #include <linux/pkt_cls.h>
33 #include <linux/pkt_sched.h>
34 #include <linux/rtnetlink.h>
35 #include <linux/sockios.h>
36 #include <linux/version.h>
37 #include <sys/types.h>
38 #include <sys/ioctl.h>
39 #include <sys/socket.h>
40 #include <netpacket/packet.h>
42 #include <net/if_arp.h>
43 #include <net/if_packet.h>
44 #include <net/route.h>
45 #include <netinet/in.h>
52 #include "dpif-linux.h"
53 #include "dynamic-string.h"
54 #include "fatal-signal.h"
57 #include "netdev-provider.h"
58 #include "netdev-vport.h"
59 #include "netlink-notifier.h"
60 #include "netlink-socket.h"
63 #include "openflow/openflow.h"
65 #include "poll-loop.h"
66 #include "rtnetlink-link.h"
68 #include "socket-util.h"
71 #include "unaligned.h"
74 VLOG_DEFINE_THIS_MODULE(netdev_linux);
76 COVERAGE_DEFINE(netdev_set_policing);
77 COVERAGE_DEFINE(netdev_arp_lookup);
78 COVERAGE_DEFINE(netdev_get_ifindex);
79 COVERAGE_DEFINE(netdev_get_hwaddr);
80 COVERAGE_DEFINE(netdev_set_hwaddr);
81 COVERAGE_DEFINE(netdev_get_ethtool);
82 COVERAGE_DEFINE(netdev_set_ethtool);
85 /* These were introduced in Linux 2.6.14, so they might be missing if we have
 * old headers. */
87 #ifndef ADVERTISED_Pause
88 #define ADVERTISED_Pause (1 << 13)
90 #ifndef ADVERTISED_Asym_Pause
91 #define ADVERTISED_Asym_Pause (1 << 14)
94 /* These were introduced in Linux 2.6.24, so they might be missing if we
95 * have old headers. */
96 #ifndef ETHTOOL_GFLAGS
97 #define ETHTOOL_GFLAGS 0x00000025 /* Get flags bitmap(ethtool_value) */
99 #ifndef ETHTOOL_SFLAGS
100 #define ETHTOOL_SFLAGS 0x00000026 /* Set flags bitmap(ethtool_value) */
103 /* This was introduced in Linux 2.6.25, so it might be missing if we have old
 * headers. */
106 #define TC_RTAB_SIZE 1024
109 static struct nln_notifier *netdev_linux_cache_notifier = NULL;
110 static int cache_notifier_refcount;
113 VALID_IFINDEX = 1 << 0,
114 VALID_ETHERADDR = 1 << 1,
118 VALID_POLICING = 1 << 5,
119 VALID_VPORT_STAT_ERROR = 1 << 6,
120 VALID_DRVINFO = 1 << 7,
121 VALID_FEATURES = 1 << 8,
128 /* Traffic control. */
130 /* An instance of a traffic control class. Always associated with a particular
 * network device.
133 * Each TC implementation subclasses this with whatever additional data it
 * needs. */
136 const struct tc_ops *ops;
137 struct hmap queues; /* Contains "struct tc_queue"s.
138 * Read by generic TC layer.
139 * Written only by TC implementation. */
142 #define TC_INITIALIZER(TC, OPS) { OPS, HMAP_INITIALIZER(&(TC)->queues) }
144 /* One traffic control queue.
146 * Each TC implementation subclasses this with whatever additional data it
 * needs. */
149 struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
150 unsigned int queue_id; /* OpenFlow queue ID. */
151 long long int created; /* Time queue was created, in msecs. */
154 /* A particular kind of traffic control. Each implementation generally maps to
155 * one particular Linux qdisc class.
157 * The functions below return 0 if successful or a positive errno value on
158 * failure, except where otherwise noted. All of them must be provided, except
159 * where otherwise noted. */
161 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
162 * This is null for tc_ops_default and tc_ops_other, for which there are no
163 * appropriate values. */
164 const char *linux_name;
166 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
167 const char *ovs_name;
169 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
170 * queues. The queues are numbered 0 through n_queues - 1. */
171 unsigned int n_queues;
173 /* Called to install this TC class on 'netdev'. The implementation should
174 * make the Netlink calls required to set up 'netdev' with the right qdisc
175 * and configure it according to 'details'. The implementation may assume
176 * that the current qdisc is the default; that is, there is no need for it
177 * to delete the current qdisc before installing itself.
179 * The contents of 'details' should be documented as valid for 'ovs_name'
180 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
181 * (which is built as ovs-vswitchd.conf.db(8)).
183 * This function must return 0 if and only if it sets 'netdev->tc' to an
184 * initialized 'struct tc'.
186 * (This function is null for tc_ops_other, which cannot be installed. For
187 * other TC classes it should always be nonnull.) */
188 int (*tc_install)(struct netdev *netdev, const struct smap *details);
190 /* Called when the netdev code determines (through a Netlink query) that
191 * this TC class's qdisc is installed on 'netdev', but we didn't install
192 * it ourselves and so don't know any of the details.
194 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
195 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
196 * implementation should parse the other attributes of 'nlmsg' as
197 * necessary to determine its configuration. If necessary it should also
198 * use Netlink queries to determine the configuration of queues on
 * 'netdev'.
201 * This function must return 0 if and only if it sets 'netdev->tc' to an
202 * initialized 'struct tc'. */
203 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
205 /* Destroys the data structures allocated by the implementation as part of
206 * 'tc'. (This includes destroying 'tc->queues' by calling
 * tc_destroy(tc).)
209 * The implementation should not need to perform any Netlink calls. If
210 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
211 * (But it may not be desirable.)
213 * This function may be null if 'tc' is trivial. */
214 void (*tc_destroy)(struct tc *tc);
216 /* Retrieves details of 'netdev->tc' configuration into 'details'.
218 * The implementation should not need to perform any Netlink calls, because
219 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
220 * cached the configuration.
222 * The contents of 'details' should be documented as valid for 'ovs_name'
223 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
224 * (which is built as ovs-vswitchd.conf.db(8)).
226 * This function may be null if 'tc' is not configurable. */
228 int (*qdisc_get)(const struct netdev *netdev, struct smap *details);
230 /* Reconfigures 'netdev->tc' according to 'details', performing any
231 * required Netlink calls to complete the reconfiguration.
233 * The contents of 'details' should be documented as valid for 'ovs_name'
234 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
235 * (which is built as ovs-vswitchd.conf.db(8)).
237 * This function may be null if 'tc' is not configurable. */
239 int (*qdisc_set)(struct netdev *, const struct smap *details);
241 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
242 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
244 * The contents of 'details' should be documented as valid for 'ovs_name'
245 * in the "other_config" column in the "Queue" table in
246 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
248 * The implementation should not need to perform any Netlink calls, because
249 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
250 * cached the queue configuration.
252 * This function may be null if 'tc' does not have queues ('n_queues' is
 * 0). */
254 int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
255 struct smap *details);
257 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
258 * 'details', performing any required Netlink calls to complete the
259 * reconfiguration. The caller ensures that 'queue_id' is less than
 * 'n_queues'.
262 * The contents of 'details' should be documented as valid for 'ovs_name'
263 * in the "other_config" column in the "Queue" table in
264 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
266 * This function may be null if 'tc' does not have queues or its queues are
267 * not configurable. */
268 int (*class_set)(struct netdev *, unsigned int queue_id,
269 const struct smap *details);
271 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
272 * tc_queue's within 'netdev->tc->queues'.
274 * This function may be null if 'tc' does not have queues or its queues
275 * cannot be deleted. */
276 int (*class_delete)(struct netdev *, struct tc_queue *queue);
278 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
279 * 'struct tc_queue's within 'netdev->tc->queues'.
281 * On success, initializes '*stats'.
283 * This function may be null if 'tc' does not have queues or if it cannot
284 * report queue statistics. */
285 int (*class_get_stats)(const struct netdev *netdev,
286 const struct tc_queue *queue,
287 struct netdev_queue_stats *stats);
289 /* Extracts queue stats from 'nlmsg', which is a response to a
290 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
292 * This function may be null if 'tc' does not have queues or if it cannot
293 * report queue statistics. */
294 int (*class_dump_stats)(const struct netdev *netdev,
295 const struct ofpbuf *nlmsg,
296 netdev_dump_queue_stats_cb *cb, void *aux);
/* Prepares 'tc' for use by a traffic-control implementation whose operations
 * table is 'ops'.  The 'queues' hmap is initialized here. */
300 tc_init(struct tc *tc, const struct tc_ops *ops)
303 hmap_init(&tc->queues);
/* Destroys the 'queues' hmap embedded in 'tc'. */
307 tc_destroy(struct tc *tc)
309 hmap_destroy(&tc->queues);
312 static const struct tc_ops tc_ops_htb;
313 static const struct tc_ops tc_ops_hfsc;
314 static const struct tc_ops tc_ops_default;
315 static const struct tc_ops tc_ops_other;
317 static const struct tc_ops *const tcs[] = {
318 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
319 &tc_ops_hfsc, /* Hierarchical fair service curve. */
320 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
321 &tc_ops_other, /* Some other qdisc. */
325 static unsigned int tc_make_handle(unsigned int major, unsigned int minor);
326 static unsigned int tc_get_major(unsigned int handle);
327 static unsigned int tc_get_minor(unsigned int handle);
329 static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
330 static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
331 static unsigned int tc_buffer_per_jiffy(unsigned int rate);
333 static struct tcmsg *tc_make_request(const struct netdev *, int type,
334 unsigned int flags, struct ofpbuf *);
335 static int tc_transact(struct ofpbuf *request, struct ofpbuf **replyp);
336 static int tc_add_del_ingress_qdisc(struct netdev *netdev, bool add);
337 static int tc_add_policer(struct netdev *netdev, int kbits_rate,
340 static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
341 struct nlattr **options);
342 static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
343 struct nlattr **options,
344 struct netdev_queue_stats *);
345 static int tc_query_class(const struct netdev *,
346 unsigned int handle, unsigned int parent,
347 struct ofpbuf **replyp);
348 static int tc_delete_class(const struct netdev *, unsigned int handle);
350 static int tc_del_qdisc(struct netdev *netdev);
351 static int tc_query_qdisc(const struct netdev *netdev);
353 static int tc_calc_cell_log(unsigned int mtu);
354 static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
355 static void tc_put_rtab(struct ofpbuf *, uint16_t type,
356 const struct tc_ratespec *rate);
357 static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
359 struct netdev_linux {
362 struct shash_node *shash_node;
363 unsigned int cache_valid;
364 unsigned int change_seq;
366 bool miimon; /* Link status of last poll. */
367 long long int miimon_interval; /* Miimon Poll rate. Disabled if <= 0. */
368 struct timer miimon_timer;
370 /* The following are figured out "on demand" only. They are only valid
371 * when the corresponding VALID_* bit in 'cache_valid' is set. */
373 uint8_t etheraddr[ETH_ADDR_LEN];
374 struct in_addr address, netmask;
377 unsigned int ifi_flags;
378 long long int carrier_resets;
379 uint32_t kbits_rate; /* Policing data. */
380 uint32_t kbits_burst;
381 int vport_stats_error; /* Cached error code from vport_get_stats().
382 0 or an errno value. */
383 int netdev_mtu_error; /* Cached error code from SIOCGIFMTU or SIOCSIFMTU. */
384 int ether_addr_error; /* Cached error code from set/get etheraddr. */
385 int netdev_policing_error; /* Cached error code from set policing. */
386 int get_features_error; /* Cached error code from ETHTOOL_GSET. */
387 int get_ifindex_error; /* Cached error code from SIOCGIFINDEX. */
389 enum netdev_features current; /* Cached from ETHTOOL_GSET. */
390 enum netdev_features advertised; /* Cached from ETHTOOL_GSET. */
391 enum netdev_features supported; /* Cached from ETHTOOL_GSET. */
392 enum netdev_features peer; /* Cached from ETHTOOL_GSET. */
394 struct ethtool_drvinfo drvinfo; /* Cached from ETHTOOL_GDRVINFO. */
398 struct tap_state tap;
402 struct netdev_rx_linux {
408 static const struct netdev_rx_class netdev_rx_linux_class;
410 /* Sockets used for ioctl operations. */
411 static int af_inet_sock = -1; /* AF_INET, SOCK_DGRAM. */
413 /* This is set pretty low because we probably won't learn anything from the
414 * additional log messages. */
415 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
417 static int netdev_linux_init(void);
419 static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
420 int cmd, const char *cmd_name);
421 static int netdev_linux_do_ioctl(const char *name, struct ifreq *, int cmd,
422 const char *cmd_name);
423 static int netdev_linux_get_ipv4(const struct netdev *, struct in_addr *,
424 int cmd, const char *cmd_name);
425 static int get_flags(const struct netdev *, unsigned int *flags);
426 static int set_flags(const char *, unsigned int flags);
427 static int do_get_ifindex(const char *netdev_name);
428 static int get_ifindex(const struct netdev *, int *ifindexp);
429 static int do_set_addr(struct netdev *netdev,
430 int ioctl_nr, const char *ioctl_name,
431 struct in_addr addr);
432 static int get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN]);
433 static int set_etheraddr(const char *netdev_name, const uint8_t[ETH_ADDR_LEN]);
434 static int get_stats_via_netlink(int ifindex, struct netdev_stats *stats);
435 static int get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats);
436 static int af_packet_sock(void);
437 static void netdev_linux_miimon_run(void);
438 static void netdev_linux_miimon_wait(void);
/* Returns whether 'netdev_class' uses netdev_linux_init() as its 'init'
 * callback, which is how this file recognizes its own netdev classes. */
441 is_netdev_linux_class(const struct netdev_class *netdev_class)
443 return netdev_class->init == netdev_linux_init;
/* Returns whether the class of 'netdev' is netdev_tap_class. */
447 is_tap_netdev(const struct netdev *netdev)
449 return netdev_get_class(netdev) == &netdev_tap_class;
/* Converts 'netdev' to the 'struct netdev_linux' that contains it (through
 * its 'up' member), after asserting that its class passes
 * is_netdev_linux_class(). */
452 static struct netdev_linux *
453 netdev_linux_cast(const struct netdev *netdev)
455 ovs_assert(is_netdev_linux_class(netdev_get_class(netdev)));
457 return CONTAINER_OF(netdev, struct netdev_linux, up);
/* Converts 'rx' to the 'struct netdev_rx_linux' that contains it (through
 * its 'up' member), after asserting that it belongs to
 * netdev_rx_linux_class. */
460 static struct netdev_rx_linux *
461 netdev_rx_linux_cast(const struct netdev_rx *rx)
463 netdev_rx_assert_class(rx, &netdev_rx_linux_class);
464 return CONTAINER_OF(rx, struct netdev_rx_linux, up);
/* Creates 'af_inet_sock', the AF_INET datagram socket this file uses for
 * ioctl operations.  The static 'status' records the outcome: 0 when the
 * socket was created, otherwise the errno from socket(), which is also
 * logged. */
468 netdev_linux_init(void)
470 static int status = -1;
472 /* Create AF_INET socket. */
473 af_inet_sock = socket(AF_INET, SOCK_DGRAM, 0);
474 status = af_inet_sock >= 0 ? 0 : errno;
476 VLOG_ERR("failed to create inet socket: %s", ovs_strerror(status));
/* Runs rtnetlink link processing and then the MII monitoring pass. */
483 netdev_linux_run(void)
485 rtnetlink_link_run();
486 netdev_linux_miimon_run();
/* Counterpart of netdev_linux_run(): registers the rtnetlink link and MII
 * monitoring wait conditions. */
490 netdev_linux_wait(void)
492 rtnetlink_link_wait();
493 netdev_linux_miimon_wait();
/* Records a change to 'dev'.  A carrier reset is counted when IFF_RUNNING
 * differs between the stored flags and 'ifi_flags'; the new flags are then
 * stored.  Every cached VALID_* bit that is not set in 'mask' is cleared, so a
 * 'mask' of 0 invalidates the whole cache. */
497 netdev_linux_changed(struct netdev_linux *dev,
498 unsigned int ifi_flags, unsigned int mask)
501 if (!dev->change_seq) {
505 if ((dev->ifi_flags ^ ifi_flags) & IFF_RUNNING) {
506 dev->carrier_resets++;
508 dev->ifi_flags = ifi_flags;
510 dev->cache_valid &= mask;
/* Applies rtnetlink notification 'change' to 'dev'.
 *
 * For RTM_NEWLINK the cache is invalidated except for VALID_DRVINFO, and then
 * the MTU, the Ethernet address (skipped when it is all-zero) and the ifindex
 * are refreshed from 'change', with their cached error codes reset to 0.
 *
 * NOTE(review): the trailing netdev_linux_changed(..., 0) call appears to be
 * the path for other message types; the branch line is not visible here. */
514 netdev_linux_update(struct netdev_linux *dev,
515 const struct rtnetlink_link_change *change)
517 if (change->nlmsg_type == RTM_NEWLINK) {
519 netdev_linux_changed(dev, change->ifi_flags, VALID_DRVINFO);
521 /* Update netdev from rtnl-change msg. */
523 dev->mtu = change->mtu;
524 dev->cache_valid |= VALID_MTU;
525 dev->netdev_mtu_error = 0;
528 if (!eth_addr_is_zero(change->addr)) {
529 memcpy(dev->etheraddr, change->addr, ETH_ADDR_LEN);
530 dev->cache_valid |= VALID_ETHERADDR;
531 dev->ether_addr_error = 0;
534 dev->ifindex = change->ifi_index;
535 dev->cache_valid |= VALID_IFINDEX;
536 dev->get_ifindex_error = 0;
539 netdev_linux_changed(dev, change->ifi_flags, 0);
/* rtnetlink link-change callback, registered by cache_notifier_ref().
 *
 * Looks up the netdev named 'change->ifname' and, if it belongs to one of this
 * file's classes, updates it from 'change'.
 *
 * The second half walks every netdev_linux_class device, re-reads its flags
 * and invalidates its whole cache.  NOTE(review): the condition selecting that
 * path is not visible here -- presumably it runs when 'change' is null;
 * confirm. */
544 netdev_linux_cache_cb(const struct rtnetlink_link_change *change,
545 void *aux OVS_UNUSED)
547 struct netdev_linux *dev;
549 struct netdev *base_dev = netdev_from_name(change->ifname);
550 if (base_dev && is_netdev_linux_class(netdev_get_class(base_dev))) {
551 netdev_linux_update(netdev_linux_cast(base_dev), change);
554 struct shash device_shash;
555 struct shash_node *node;
557 shash_init(&device_shash);
558 netdev_get_devices(&netdev_linux_class, &device_shash);
559 SHASH_FOR_EACH (node, &device_shash) {
564 get_flags(&dev->up, &flags);
565 netdev_linux_changed(dev, flags, 0);
567 shash_destroy(&device_shash);
/* Takes a reference on the shared rtnetlink link notifier.  When the
 * reference count is zero the notifier is created first, with
 * netdev_linux_cache_cb() as its callback; a failed creation is checked before
 * the count is incremented. */
572 cache_notifier_ref(void)
574 if (!cache_notifier_refcount) {
575 ovs_assert(!netdev_linux_cache_notifier);
577 netdev_linux_cache_notifier =
578 rtnetlink_link_notifier_create(netdev_linux_cache_cb, NULL);
580 if (!netdev_linux_cache_notifier) {
584 cache_notifier_refcount++;
/* Drops a reference taken with cache_notifier_ref().  When the count reaches
 * zero the notifier is destroyed and the global pointer is reset to NULL. */
590 cache_notifier_unref(void)
592 ovs_assert(cache_notifier_refcount > 0);
593 if (!--cache_notifier_refcount) {
594 ovs_assert(netdev_linux_cache_notifier);
595 rtnetlink_link_notifier_destroy(netdev_linux_cache_notifier);
596 netdev_linux_cache_notifier = NULL;
600 /* Creates system and internal devices.
 *
 * Takes a reference on the shared rtnetlink notifier, allocates a zeroed
 * 'struct netdev_linux' with 'change_seq' set to 1, and queries the device's
 * flags.  ENODEV from that query is tolerated only for netdev_internal_class;
 * for any other class the netdev is uninitialized and the notifier reference
 * is dropped again.  The new netdev is stored in '*netdevp'. */
602 netdev_linux_create(const struct netdev_class *class, const char *name,
603 struct netdev **netdevp)
605 struct netdev_linux *netdev;
608 error = cache_notifier_ref();
613 netdev = xzalloc(sizeof *netdev);
614 netdev->change_seq = 1;
615 netdev_init(&netdev->up, name, class);
616 error = get_flags(&netdev->up, &netdev->ifi_flags);
617 if (error == ENODEV) {
618 if (class != &netdev_internal_class) {
619 /* The device does not exist, so don't allow it to be opened. */
620 netdev_uninit(&netdev->up, false);
621 cache_notifier_unref();
625 /* "Internal" netdevs have to be created as netdev objects before
626 * they exist in the kernel, because creating them in the kernel
627 * happens by passing a netdev object to dpif_port_add().
628 * Therefore, ignore the error. */
632 *netdevp = &netdev->up;
636 /* For most types of netdevs we open the device for each call of
637 * netdev_open(). However, this is not the case with tap devices,
638 * since it is only possible to open the device once. In this
639 * situation we share a single file descriptor, and consequently
640 * buffers, across all readers. Therefore once data is read it will
641 * be unavailable to other reads for tap devices.
 *
 * Steps: open /dev/net/tun, attach it to interface 'name' with TUNSETIFF
 * (IFF_TAP | IFF_NO_PI), make the fd non-blocking, then initialize the netdev
 * as netdev_tap_class and store it in '*netdevp'.
 *
 * NOTE(review): the failure paths after open() jump to error_unref_notifier,
 * and no close(state->fd) is visible on them -- confirm the fd is not
 * leaked. */
643 netdev_linux_create_tap(const struct netdev_class *class OVS_UNUSED,
644 const char *name, struct netdev **netdevp)
646 struct netdev_linux *netdev;
647 struct tap_state *state;
648 static const char tap_dev[] = "/dev/net/tun";
652 netdev = xzalloc(sizeof *netdev);
653 state = &netdev->state.tap;
655 error = cache_notifier_ref();
660 /* Open tap device. */
661 state->fd = open(tap_dev, O_RDWR);
664 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, ovs_strerror(error));
665 goto error_unref_notifier;
668 /* Create tap device. */
669 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
670 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
671 if (ioctl(state->fd, TUNSETIFF, &ifr) == -1) {
672 VLOG_WARN("%s: creating tap device failed: %s", name,
673 ovs_strerror(errno));
675 goto error_unref_notifier;
678 /* Make non-blocking. */
679 error = set_nonblocking(state->fd);
681 goto error_unref_notifier;
684 netdev_init(&netdev->up, name, &netdev_tap_class);
685 *netdevp = &netdev->up;
688 error_unref_notifier:
689 cache_notifier_unref();
/* Cleans up the tap state of 'netdev'.  The fd is acted on only when it is
 * non-negative. */
696 destroy_tap(struct netdev_linux *netdev)
698 struct tap_state *state = &netdev->state.tap;
700 if (state->fd >= 0) {
705 /* Destroys the netdev device 'netdev_'.
 *
 * Calls the traffic-control implementation's 'tc_destroy' hook when a 'tc' is
 * attached and the hook is nonnull, has a branch that applies only to
 * netdev_tap_class devices, and drops this netdev's reference on the
 * rtnetlink notifier. */
707 netdev_linux_destroy(struct netdev *netdev_)
709 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
711 if (netdev->tc && netdev->tc->ops->tc_destroy) {
712 netdev->tc->ops->tc_destroy(netdev->tc);
715 if (netdev_get_class(netdev_) == &netdev_tap_class) {
720 cache_notifier_unref();
/* Opens a receive handle on 'netdev_'.
 *
 * Two sources of the fd are visible: the fd stored in the netdev's tap state,
 * and a new PF_PACKET/SOCK_RAW socket.  The raw socket is made non-blocking,
 * bound to the device's ifindex for ETH_P_ALL, and given a socket filter so
 * that only inbound packets are delivered.  The handle itself is a newly
 * allocated 'struct netdev_rx_linux' initialized with
 * netdev_rx_linux_class. */
724 netdev_linux_rx_open(struct netdev *netdev_, struct netdev_rx **rxp)
726 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
727 bool is_tap = is_tap_netdev(netdev_);
728 struct netdev_rx_linux *rx;
733 fd = netdev->state.tap.fd;
735 struct sockaddr_ll sll;
737 /* Result of tcpdump -dd inbound */
738 static struct sock_filter filt[] = {
739 { 0x28, 0, 0, 0xfffff004 }, /* ldh [0] */
740 { 0x15, 0, 1, 0x00000004 }, /* jeq #4 jt 2 jf 3 */
741 { 0x6, 0, 0, 0x00000000 }, /* ret #0 */
742 { 0x6, 0, 0, 0x0000ffff } /* ret #65535 */
744 static struct sock_fprog fprog = { ARRAY_SIZE(filt), filt };
746 /* Create file descriptor. */
747 fd = socket(PF_PACKET, SOCK_RAW, 0);
750 VLOG_ERR("failed to create raw socket (%s)", ovs_strerror(error));
754 /* Set non-blocking mode. */
755 error = set_nonblocking(fd);
760 /* Get ethernet device index. */
761 error = get_ifindex(&netdev->up, &ifindex);
766 /* Bind to specific ethernet device. */
767 memset(&sll, 0, sizeof sll);
768 sll.sll_family = AF_PACKET;
769 sll.sll_ifindex = ifindex;
770 sll.sll_protocol = (OVS_FORCE unsigned short int) htons(ETH_P_ALL);
771 if (bind(fd, (struct sockaddr *) &sll, sizeof sll) < 0) {
773 VLOG_ERR("%s: failed to bind raw socket (%s)",
774 netdev_get_name(netdev_), ovs_strerror(error));
778 /* Filter for only inbound packets. */
779 error = setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER, &fprog,
783 VLOG_ERR("%s: failed attach filter (%s)",
784 netdev_get_name(netdev_), ovs_strerror(error));
789 rx = xmalloc(sizeof *rx);
790 netdev_rx_init(&rx->up, netdev_, &netdev_rx_linux_class);
/* Destroys receive handle 'rx_', which must belong to
 * netdev_rx_linux_class. */
805 netdev_rx_linux_destroy(struct netdev_rx *rx_)
807 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
/* Receives one packet from 'rx_' into the 'size'-byte buffer 'data', using
 * either read() or recv() with MSG_TRUNC (the selecting condition is not
 * visible in this view) and retrying while the call fails with EINTR.
 *
 * A received length greater than 'size' is reported as -EMSGSIZE; otherwise
 * the length is returned.  Failures other than EAGAIN are logged through the
 * rate-limited logger. */
816 netdev_rx_linux_recv(struct netdev_rx *rx_, void *data, size_t size)
818 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
823 ? read(rx->fd, data, size)
824 : recv(rx->fd, data, size, MSG_TRUNC));
825 } while (retval < 0 && errno == EINTR);
828 return retval > size ? -EMSGSIZE : retval;
830 if (errno != EAGAIN) {
/* Argument order follows the format string: device name, then error. */
831 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
832 netdev_rx_get_name(rx_), ovs_strerror(errno));
/* Registers the fd of 'rx_' with the poll loop for POLLIN. */
839 netdev_rx_linux_wait(struct netdev_rx *rx_)
841 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
842 poll_fd_wait(rx->fd, POLLIN);
/* Discards packets queued on 'rx_'.
 *
 * Two strategies are visible: query the interface's transmit queue length
 * with SIOCGIFTXQLEN and pass it to drain_fd(), or call drain_rcvbuf() on the
 * fd.  NOTE(review): the condition choosing between them is not visible here
 * -- presumably tap versus non-tap handles; confirm. */
846 netdev_rx_linux_drain(struct netdev_rx *rx_)
848 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
851 int error = netdev_linux_do_ioctl(netdev_rx_get_name(rx_), &ifr,
852 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
856 drain_fd(rx->fd, ifr.ifr_qlen);
859 return drain_rcvbuf(rx->fd);
863 /* Sends 'buffer' on 'netdev'. Returns 0 if successful, otherwise a positive
864 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
865 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
866 * the packet is too big or too small to transmit on the device.
 *
 * Non-tap devices are sent with sendmsg() on the shared AF_PACKET socket,
 * addressed by ifindex; tap devices are written through the tap fd instead.
 *
868 * The caller retains ownership of 'buffer' in all cases.
870 * The kernel maintains a packet transmission queue, so the caller is not
871 * expected to do additional queuing of packets. */
873 netdev_linux_send(struct netdev *netdev_, const void *data, size_t size)
878 if (!is_tap_netdev(netdev_)) {
879 /* Use our AF_PACKET socket to send to this device. */
880 struct sockaddr_ll sll;
887 sock = af_packet_sock();
892 error = get_ifindex(netdev_, &ifindex);
897 /* We don't bother setting most fields in sockaddr_ll because the
898 * kernel ignores them for SOCK_RAW. */
899 memset(&sll, 0, sizeof sll);
900 sll.sll_family = AF_PACKET;
901 sll.sll_ifindex = ifindex;
903 iov.iov_base = CONST_CAST(void *, data);
907 msg.msg_namelen = sizeof sll;
910 msg.msg_control = NULL;
911 msg.msg_controllen = 0;
914 retval = sendmsg(sock, &msg, 0);
916 /* Use the tap fd to send to this device. This is essential for
917 * tap devices, because packets sent to a tap device with an
918 * AF_PACKET socket will loop back to be *received* again on the
919 * tap device. This doesn't occur on other interface types
920 * because we attach a socket filter to the rx socket. */
921 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
923 retval = write(netdev->state.tap.fd, data, size);
927 /* The Linux AF_PACKET implementation never blocks waiting for room
928 * for packets, instead returning ENOBUFS. Translate this into
929 * EAGAIN for the caller. */
930 if (errno == ENOBUFS) {
932 } else if (errno == EINTR) {
934 } else if (errno != EAGAIN) {
935 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
936 netdev_get_name(netdev_), ovs_strerror(errno));
939 } else if (retval != size) {
940 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%zd bytes of "
941 "%zu) on %s", retval, size, netdev_get_name(netdev_));
949 /* Registers with the poll loop to wake up from the next call to poll_block()
950 * when the packet transmission queue has sufficient room to transmit a packet
951 * with netdev_send().
 *
 * For tap devices the poll loop is simply woken immediately.
 *
953 * The kernel maintains a packet transmission queue, so the client is not
954 * expected to do additional queuing of packets. Thus, this function is
955 * unlikely to ever be used. It is included for completeness. */
957 netdev_linux_send_wait(struct netdev *netdev)
959 if (is_tap_netdev(netdev)) {
960 /* TAP device always accepts packets.*/
961 poll_immediate_wake();
965 /* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
966 * otherwise a positive errno value.
 *
 * When the cached address is valid, a cached error is returned as-is and an
 * address equal to 'mac' needs no work; otherwise the cache entry is
 * invalidated before the kernel is asked.  Only a result of 0 or ENODEV is
 * written back to the cache.  Flags saved while bringing a tap device down
 * are restored afterwards. */
968 netdev_linux_set_etheraddr(struct netdev *netdev_,
969 const uint8_t mac[ETH_ADDR_LEN])
971 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
972 struct netdev_saved_flags *sf = NULL;
975 if (netdev->cache_valid & VALID_ETHERADDR) {
976 if (netdev->ether_addr_error) {
977 return netdev->ether_addr_error;
979 if (eth_addr_equals(netdev->etheraddr, mac)) {
982 netdev->cache_valid &= ~VALID_ETHERADDR;
985 /* Tap devices must be brought down before setting the address. */
986 if (is_tap_netdev(netdev_)) {
987 netdev_turn_flags_off(netdev_, NETDEV_UP, &sf);
989 error = set_etheraddr(netdev_get_name(netdev_), mac);
990 if (!error || error == ENODEV) {
991 netdev->ether_addr_error = error;
992 netdev->cache_valid |= VALID_ETHERADDR;
994 memcpy(netdev->etheraddr, mac, ETH_ADDR_LEN);
998 netdev_restore_flags(sf);
1003 /* Copies 'netdev''s MAC address to 'mac' which is passed as param.
 *
 * The address is served from the cache, which is filled (together with the
 * resulting error code) when VALID_ETHERADDR is not yet set.  'mac' is written
 * only when the cached error code is 0; that error code is the return
 * value. */
1005 netdev_linux_get_etheraddr(const struct netdev *netdev_,
1006 uint8_t mac[ETH_ADDR_LEN])
1008 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1010 if (!(netdev->cache_valid & VALID_ETHERADDR)) {
1011 int error = get_etheraddr(netdev_get_name(netdev_),
1014 netdev->ether_addr_error = error;
1015 netdev->cache_valid |= VALID_ETHERADDR;
1018 if (!netdev->ether_addr_error) {
1019 memcpy(mac, netdev->etheraddr, ETH_ADDR_LEN);
1022 return netdev->ether_addr_error;
1025 /* Returns the maximum size of transmitted (and received) packets on 'netdev',
1026 * in bytes, not including the hardware header; thus, this is typically 1500
1027 * bytes for Ethernet devices.
 *
 * The value is obtained with SIOCGIFMTU when VALID_MTU is not set and cached
 * together with the ioctl's error code.  '*mtup' is written only when the
 * cached error code is 0; that error code is the return value. */
1029 netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
1031 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1032 if (!(netdev->cache_valid & VALID_MTU)) {
1036 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
1037 SIOCGIFMTU, "SIOCGIFMTU");
1039 netdev->netdev_mtu_error = error;
1040 netdev->mtu = ifr.ifr_mtu;
1041 netdev->cache_valid |= VALID_MTU;
1044 if (!netdev->netdev_mtu_error) {
1045 *mtup = netdev->mtu;
1047 return netdev->netdev_mtu_error;
1050 /* Sets the maximum size of transmitted packets (MTU) for the given device
1051 * using the Linux networking ioctl interface. */
1054 netdev_linux_set_mtu(const struct netdev *netdev_, int mtu)
1056 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
/* A valid cache entry short-circuits the call: a cached error is returned
 * as-is, and an MTU already equal to 'mtu' needs no ioctl. */
1060 if (netdev->cache_valid & VALID_MTU) {
1061 if (netdev->netdev_mtu_error) {
1062 return netdev->netdev_mtu_error;
1064 if (netdev->mtu == mtu) {
1067 netdev->cache_valid &= ~VALID_MTU;
1070 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
1071 SIOCSIFMTU, "SIOCSIFMTU");
/* Only success and ENODEV are cached; any other error leaves VALID_MTU
 * cleared. */
1072 if (!error || error == ENODEV) {
1073 netdev->netdev_mtu_error = error;
1074 netdev->mtu = ifr.ifr_mtu;
1075 netdev->cache_valid |= VALID_MTU;
1080 /* Returns the ifindex of 'netdev', if successful, as a positive number.
1081 * On failure, returns a negative errno value. */
1083 netdev_linux_get_ifindex(const struct netdev *netdev)
1087 error = get_ifindex(netdev, &ifindex);
/* A nonzero 'error' from get_ifindex() is negated for the caller. */
1088 return error ? -error : ifindex;
/* Reports the link status of 'netdev_' in '*carrier'.  With MII monitoring
 * enabled ('miimon_interval' > 0) the last polled MII status is used; the
 * other visible source is the IFF_RUNNING bit of the cached interface
 * flags. */
1092 netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
1094 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1096 if (netdev->miimon_interval > 0) {
1097 *carrier = netdev->miimon;
1099 *carrier = (netdev->ifi_flags & IFF_RUNNING) != 0;
/* Returns the 'carrier_resets' counter of 'netdev', which
 * netdev_linux_changed() increments whenever IFF_RUNNING toggles. */
1105 static long long int
1106 netdev_linux_get_carrier_resets(const struct netdev *netdev)
1108 return netdev_linux_cast(netdev)->carrier_resets;
/* Issues MII ioctl 'cmd' (named 'cmd_name' for logging) on device 'name'.
 * '*data' is copied into the zeroed ifreq at 'ifr_data' before the call and
 * copied back out afterwards, so the caller sees whatever the ioctl wrote. */
1112 netdev_linux_do_miimon(const char *name, int cmd, const char *cmd_name,
1113 struct mii_ioctl_data *data)
1118 memset(&ifr, 0, sizeof ifr);
1119 memcpy(&ifr.ifr_data, data, sizeof *data);
1120 error = netdev_linux_do_ioctl(name, &ifr, cmd, cmd_name);
1121 memcpy(data, &ifr.ifr_data, sizeof *data);
/* Determines the link status of device 'name' and stores it in '*miimon'.
 *
 * MII is tried first: SIOCGMIIPHY, then SIOCGMIIREG on the BMSR register,
 * testing BMSR_LSTATUS.  When MII cannot be queried the code falls back to
 * ethtool's ETHTOOL_GLINK, reinterpreting the start of the command buffer as
 * a 'struct ethtool_value'. */
1127 netdev_linux_get_miimon(const char *name, bool *miimon)
1129 struct mii_ioctl_data data;
1134 memset(&data, 0, sizeof data);
1135 error = netdev_linux_do_miimon(name, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
1137 /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
1138 data.reg_num = MII_BMSR;
1139 error = netdev_linux_do_miimon(name, SIOCGMIIREG, "SIOCGMIIREG",
1143 *miimon = !!(data.val_out & BMSR_LSTATUS);
1145 VLOG_WARN_RL(&rl, "%s: failed to query MII", name);
1148 struct ethtool_cmd ecmd;
1150 VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
1153 COVERAGE_INC(netdev_get_ethtool);
1154 memset(&ecmd, 0, sizeof ecmd);
1155 error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
1158 struct ethtool_value eval;
1160 memcpy(&eval, &ecmd, sizeof eval);
1161 *miimon = !!eval.data;
1163 VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
/* Sets the MII polling interval of 'netdev_'.  A positive 'interval' is
 * raised to at least 100; any other value becomes 0, which disables polling.
 * When the stored interval changes, the miimon timer is marked expired. */
1171 netdev_linux_set_miimon_interval(struct netdev *netdev_,
1172 long long int interval)
1174 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1176 interval = interval > 0 ? MAX(interval, 100) : 0;
1177 if (netdev->miimon_interval != interval) {
1178 netdev->miimon_interval = interval;
1179 timer_set_expired(&netdev->miimon_timer);
/* Polls MII link status for the netdev_linux_class devices.  The check on
 * 'miimon_interval' and the timer filters out devices with monitoring
 * disabled or whose timer has not yet expired.  A status that differs from
 * the stored one is saved and reported through netdev_linux_changed() with
 * mask 0, which clears every cached VALID_* bit; the timer is rearmed for
 * 'miimon_interval'.
 *
 * NOTE(review): the result of netdev_linux_get_miimon() is not checked on the
 * visible line. */
1186 netdev_linux_miimon_run(void)
1188 struct shash device_shash;
1189 struct shash_node *node;
1191 shash_init(&device_shash);
1192 netdev_get_devices(&netdev_linux_class, &device_shash);
1193 SHASH_FOR_EACH (node, &device_shash) {
1194 struct netdev_linux *dev = node->data;
1197 if (dev->miimon_interval <= 0 || !timer_expired(&dev->miimon_timer)) {
1201 netdev_linux_get_miimon(dev->up.name, &miimon);
1202 if (miimon != dev->miimon) {
1203 dev->miimon = miimon;
1204 netdev_linux_changed(dev, dev->ifi_flags, 0);
1207 timer_set_duration(&dev->miimon_timer, dev->miimon_interval);
1210 shash_destroy(&device_shash);
/* Registers a wait on the 'miimon_timer' of every netdev_linux_class device
 * whose 'miimon_interval' is positive. */
1214 netdev_linux_miimon_wait(void)
1216 struct shash device_shash;
1217 struct shash_node *node;
1219 shash_init(&device_shash);
1220 netdev_get_devices(&netdev_linux_class, &device_shash);
1221 SHASH_FOR_EACH (node, &device_shash) {
1222 struct netdev_linux *dev = node->data;
1224 if (dev->miimon_interval > 0) {
1225 timer_wait(&dev->miimon_timer);
1228 shash_destroy(&device_shash);
1231 /* Check whether we can use RTM_GETLINK to get network device statistics.
1232 * In pre-2.6.19 kernels, this was only available if wireless extensions were
1235 check_for_working_netlink_stats(void)
1237 /* Decide on the netdev_get_stats() implementation to use. Netlink is
1238 * preferable, so if that works, we'll use it. */
/* The probe runs against the loopback device "lo".  Judging by the log
 * messages, failing to resolve its ifindex or to fetch its stats over netlink
 * selects the /proc-based path.  The value returned to the caller is on lines
 * missing from this listing. */
1239 int ifindex = do_get_ifindex("lo");
1241 VLOG_WARN("failed to get ifindex for lo, "
1242 "obtaining netdev stats from proc");
1245 struct netdev_stats stats;
1246 int error = get_stats_via_netlink(ifindex, &stats);
1248 VLOG_DBG("obtaining netdev stats via rtnetlink");
1251 VLOG_INFO("RTM_GETLINK failed (%s), obtaining netdev stats "
1252 "via proc (you are probably running a pre-2.6.19 "
1253 "kernel)", ovs_strerror(error));
/* Helper taking two uint64_t pointers; netdev_tap_get_stats() calls it on
 * rx/tx counter pairs.  NOTE(review): the body is not visible in this
 * listing -- presumably it exchanges '*a' and '*b'. */
1260 swap_uint64(uint64_t *a, uint64_t *b)
1267 /* Copies 'src' into 'dst', performing format conversion in the process.
1269 * 'src' is allowed to be misaligned. */
1271 netdev_stats_from_ovs_vport_stats(struct netdev_stats *dst,
1272 const struct ovs_vport_stats *src)
/* Each counter is read with get_unaligned_u64() because 'src' may be
 * misaligned. */
1274 dst->rx_packets = get_unaligned_u64(&src->rx_packets);
1275 dst->tx_packets = get_unaligned_u64(&src->tx_packets);
1276 dst->rx_bytes = get_unaligned_u64(&src->rx_bytes);
1277 dst->tx_bytes = get_unaligned_u64(&src->tx_bytes);
1278 dst->rx_errors = get_unaligned_u64(&src->rx_errors);
1279 dst->tx_errors = get_unaligned_u64(&src->tx_errors);
1280 dst->rx_dropped = get_unaligned_u64(&src->rx_dropped);
1281 dst->tx_dropped = get_unaligned_u64(&src->tx_dropped);
/* Counters that are not copied from 'src' are reported as zero. */
1283 dst->collisions = 0;
1284 dst->rx_length_errors = 0;
1285 dst->rx_over_errors = 0;
1286 dst->rx_crc_errors = 0;
1287 dst->rx_frame_errors = 0;
1288 dst->rx_fifo_errors = 0;
1289 dst->rx_missed_errors = 0;
1290 dst->tx_aborted_errors = 0;
1291 dst->tx_carrier_errors = 0;
1292 dst->tx_fifo_errors = 0;
1293 dst->tx_heartbeat_errors = 0;
1294 dst->tx_window_errors = 0;
/* Looks up the vport named after 'netdev' with dpif_linux_vport_get() and
 * converts its statistics into '*stats'.  A reply that carries no statistics
 * ('reply.stats' null) is handled as a case of its own.
 * NOTE(review): the error codes returned and the release of 'buf' are on
 * lines missing from this listing. */
1298 get_stats_via_vport__(const struct netdev *netdev, struct netdev_stats *stats)
1300 struct dpif_linux_vport reply;
1304 error = dpif_linux_vport_get(netdev_get_name(netdev), &reply, &buf);
1307 } else if (!reply.stats) {
1312 netdev_stats_from_ovs_vport_stats(stats, reply.stats);
/* Refreshes '*stats' from the vport layer and records the outcome in
 * 'netdev->vport_stats_error' (0 on success).  Callers in this file read that
 * field afterwards to learn whether '*stats' was filled in. */
1320 get_stats_via_vport(const struct netdev *netdev_,
1321 struct netdev_stats *stats)
1323 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
/* The query is skipped only when an earlier attempt failed and that failure
 * is still cached (VALID_VPORT_STAT_ERROR set in 'cache_valid'). */
1325 if (!netdev->vport_stats_error ||
1326 !(netdev->cache_valid & VALID_VPORT_STAT_ERROR)) {
1329 error = get_stats_via_vport__(netdev_, stats);
/* ENOENT is not logged. */
1330 if (error && error != ENOENT) {
1331 VLOG_WARN_RL(&rl, "%s: obtaining netdev stats via vport failed "
1333 netdev_get_name(netdev_), ovs_strerror(error));
1335 netdev->vport_stats_error = error;
1336 netdev->cache_valid |= VALID_VPORT_STAT_ERROR;
/* Reads the kernel's statistics for 'netdev_' into '*stats', through either
 * get_stats_via_netlink() or get_stats_via_proc().  Failures are logged with
 * the numeric error.
 * NOTE(review): the return statement is not visible here; presumably 'error'
 * is returned. */
1341 netdev_linux_sys_get_stats(const struct netdev *netdev_,
1342 struct netdev_stats *stats)
1344 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
1345 static int use_netlink_stats;
/* The netlink-versus-proc decision is made a single time, guarded by 'once',
 * and kept in a static for later calls. */
1348 if (ovsthread_once_start(&once)) {
1349 use_netlink_stats = check_for_working_netlink_stats();
1350 ovsthread_once_done(&once);
1353 if (use_netlink_stats) {
1356 error = get_ifindex(netdev_, &ifindex);
1358 error = get_stats_via_netlink(ifindex, stats);
1361 error = get_stats_via_proc(netdev_get_name(netdev_), stats);
1365 VLOG_WARN_RL(&rl, "%s: linux-sys get stats failed %d",
1366 netdev_get_name(netdev_), error);
1372 /* Retrieves current device stats for 'netdev-linux'.
 *
 * Two sources are consulted: the vport layer, which writes straight into
 * '*stats', and the kernel device, read into 'dev_stats'.
 * 'netdev->vport_stats_error' tells which of them is usable.
 * NOTE(review): the return statements and the branch that copies 'dev_stats'
 * wholesale are on lines missing from this listing. */
1374 netdev_linux_get_stats(const struct netdev *netdev_,
1375 struct netdev_stats *stats)
1377 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1378 struct netdev_stats dev_stats;
1381 get_stats_via_vport(netdev_, stats);
1383 error = netdev_linux_sys_get_stats(netdev_, &dev_stats);
1386 if (netdev->vport_stats_error) {
1393 if (netdev->vport_stats_error) {
1394 /* stats not available from OVS then use ioctl stats. */
/* The error and drop counters below are added on top of the values already
 * in '*stats'. */
1397 stats->rx_errors += dev_stats.rx_errors;
1398 stats->tx_errors += dev_stats.tx_errors;
1399 stats->rx_dropped += dev_stats.rx_dropped;
1400 stats->tx_dropped += dev_stats.tx_dropped;
1401 stats->multicast += dev_stats.multicast;
1402 stats->collisions += dev_stats.collisions;
1403 stats->rx_length_errors += dev_stats.rx_length_errors;
1404 stats->rx_over_errors += dev_stats.rx_over_errors;
1405 stats->rx_crc_errors += dev_stats.rx_crc_errors;
1406 stats->rx_frame_errors += dev_stats.rx_frame_errors;
1407 stats->rx_fifo_errors += dev_stats.rx_fifo_errors;
1408 stats->rx_missed_errors += dev_stats.rx_missed_errors;
1409 stats->tx_aborted_errors += dev_stats.tx_aborted_errors;
1410 stats->tx_carrier_errors += dev_stats.tx_carrier_errors;
1411 stats->tx_fifo_errors += dev_stats.tx_fifo_errors;
1412 stats->tx_heartbeat_errors += dev_stats.tx_heartbeat_errors;
1413 stats->tx_window_errors += dev_stats.tx_window_errors;
1418 /* Retrieves current device stats for 'netdev-tap' netdev or
1419 * netdev-internal.
 *
 * Stats are requested from the vport layer first (into '*stats') and then
 * from the kernel device (into 'dev_stats').  When the vport query failed,
 * the rx and tx counters in '*stats' are exchanged, for the reason given in
 * the comment further down.  Where 'dev_stats' is added onto '*stats', its
 * tx counters go to rx and vice versa.
 * NOTE(review): several branch and return lines are missing from this
 * listing; confirm the control flow in the full file. */
1421 netdev_tap_get_stats(const struct netdev *netdev_,
1422 struct netdev_stats *stats)
1424 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1425 struct netdev_stats dev_stats;
1428 get_stats_via_vport(netdev_, stats);
1430 error = netdev_linux_sys_get_stats(netdev_, &dev_stats);
1432 if (netdev->vport_stats_error) {
1439 /* If this port is an internal port then the transmit and receive stats
1440 * will appear to be swapped relative to the other ports since we are the
1441 * one sending the data, not a remote computer. For consistency, we swap
1442 * them back here. This does not apply if we are getting stats from the
1443 * vport layer because it always tracks stats from the perspective of the
1445 if (netdev->vport_stats_error) {
1447 swap_uint64(&stats->rx_packets, &stats->tx_packets);
1448 swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
1449 swap_uint64(&stats->rx_errors, &stats->tx_errors);
1450 swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
1451 stats->rx_length_errors = 0;
1452 stats->rx_over_errors = 0;
1453 stats->rx_crc_errors = 0;
1454 stats->rx_frame_errors = 0;
1455 stats->rx_fifo_errors = 0;
1456 stats->rx_missed_errors = 0;
1457 stats->tx_aborted_errors = 0;
1458 stats->tx_carrier_errors = 0;
1459 stats->tx_fifo_errors = 0;
1460 stats->tx_heartbeat_errors = 0;
1461 stats->tx_window_errors = 0;
1463 stats->rx_dropped += dev_stats.tx_dropped;
1464 stats->tx_dropped += dev_stats.rx_dropped;
1466 stats->rx_errors += dev_stats.tx_errors;
1467 stats->tx_errors += dev_stats.rx_errors;
1469 stats->multicast += dev_stats.multicast;
1470 stats->collisions += dev_stats.collisions;
/* Retrieves stats for an internal device from the vport layer only.  Returns
 * the error that get_stats_via_vport() recorded in
 * 'netdev->vport_stats_error', which is 0 on success. */
1476 netdev_internal_get_stats(const struct netdev *netdev_,
1477 struct netdev_stats *stats)
1479 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1481 get_stats_via_vport(netdev_, stats);
1482 return netdev->vport_stats_error;
/* Pushes the packet, byte, error and drop counters from '*stats' to the vport
 * named after 'netdev', using an OVS_VPORT_CMD_SET transaction.  The other
 * fields of '*stats' are not sent.
 * NOTE(review): the handling of ENODEV and the return statement are on lines
 * missing from this listing. */
1486 netdev_internal_set_stats(struct netdev *netdev,
1487 const struct netdev_stats *stats)
1489 struct ovs_vport_stats vport_stats;
1490 struct dpif_linux_vport vport;
1493 vport_stats.rx_packets = stats->rx_packets;
1494 vport_stats.tx_packets = stats->tx_packets;
1495 vport_stats.rx_bytes = stats->rx_bytes;
1496 vport_stats.tx_bytes = stats->tx_bytes;
1497 vport_stats.rx_errors = stats->rx_errors;
1498 vport_stats.tx_errors = stats->tx_errors;
1499 vport_stats.rx_dropped = stats->rx_dropped;
1500 vport_stats.tx_dropped = stats->tx_dropped;
/* The vport is addressed by name. */
1502 dpif_linux_vport_init(&vport);
1503 vport.cmd = OVS_VPORT_CMD_SET;
1504 vport.name = netdev_get_name(netdev);
1505 vport.stats = &vport_stats;
1507 err = dpif_linux_vport_transact(&vport, NULL, NULL);
1509 /* If the vport layer doesn't know about the device, that doesn't mean it
1510 * doesn't exist (after all we were able to open it when netdev_open() was
1511 * called), it just means that it isn't attached and we'll be getting
1512 * stats a different way. */
1513 if (err == ENODEV) {
/* Fills the cached feature bitmaps of 'netdev' ('supported', 'advertised',
 * 'current', 'peer') from an ETHTOOL_GSET query, translating ethtool
 * SUPPORTED_* / ADVERTISED_* bits and the reported speed, duplex and port into
 * NETDEV_F_* bits.  The outcome is cached: VALID_FEATURES is set in
 * 'cache_valid' and the error code is kept in 'get_features_error'.
 * NOTE(review): what happens when VALID_FEATURES is already set, and on query
 * failure, is on lines missing from this listing -- presumably an early
 * exit. */
1521 netdev_linux_read_features(struct netdev_linux *netdev)
1523 struct ethtool_cmd ecmd;
1527 if (netdev->cache_valid & VALID_FEATURES) {
1531 COVERAGE_INC(netdev_get_ethtool);
1532 memset(&ecmd, 0, sizeof ecmd);
1533 error = netdev_linux_do_ethtool(netdev->up.name, &ecmd,
1534 ETHTOOL_GSET, "ETHTOOL_GSET");
1539 /* Supported features. */
1540 netdev->supported = 0;
1541 if (ecmd.supported & SUPPORTED_10baseT_Half) {
1542 netdev->supported |= NETDEV_F_10MB_HD;
1544 if (ecmd.supported & SUPPORTED_10baseT_Full) {
1545 netdev->supported |= NETDEV_F_10MB_FD;
1547 if (ecmd.supported & SUPPORTED_100baseT_Half) {
1548 netdev->supported |= NETDEV_F_100MB_HD;
1550 if (ecmd.supported & SUPPORTED_100baseT_Full) {
1551 netdev->supported |= NETDEV_F_100MB_FD;
1553 if (ecmd.supported & SUPPORTED_1000baseT_Half) {
1554 netdev->supported |= NETDEV_F_1GB_HD;
1556 if (ecmd.supported & SUPPORTED_1000baseT_Full) {
1557 netdev->supported |= NETDEV_F_1GB_FD;
1559 if (ecmd.supported & SUPPORTED_10000baseT_Full) {
1560 netdev->supported |= NETDEV_F_10GB_FD;
1562 if (ecmd.supported & SUPPORTED_TP) {
1563 netdev->supported |= NETDEV_F_COPPER;
1565 if (ecmd.supported & SUPPORTED_FIBRE) {
1566 netdev->supported |= NETDEV_F_FIBER;
1568 if (ecmd.supported & SUPPORTED_Autoneg) {
1569 netdev->supported |= NETDEV_F_AUTONEG;
1571 if (ecmd.supported & SUPPORTED_Pause) {
1572 netdev->supported |= NETDEV_F_PAUSE;
1574 if (ecmd.supported & SUPPORTED_Asym_Pause) {
1575 netdev->supported |= NETDEV_F_PAUSE_ASYM;
1578 /* Advertised features. */
1579 netdev->advertised = 0;
1580 if (ecmd.advertising & ADVERTISED_10baseT_Half) {
1581 netdev->advertised |= NETDEV_F_10MB_HD;
1583 if (ecmd.advertising & ADVERTISED_10baseT_Full) {
1584 netdev->advertised |= NETDEV_F_10MB_FD;
1586 if (ecmd.advertising & ADVERTISED_100baseT_Half) {
1587 netdev->advertised |= NETDEV_F_100MB_HD;
1589 if (ecmd.advertising & ADVERTISED_100baseT_Full) {
1590 netdev->advertised |= NETDEV_F_100MB_FD;
1592 if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
1593 netdev->advertised |= NETDEV_F_1GB_HD;
1595 if (ecmd.advertising & ADVERTISED_1000baseT_Full) {
1596 netdev->advertised |= NETDEV_F_1GB_FD;
1598 if (ecmd.advertising & ADVERTISED_10000baseT_Full) {
1599 netdev->advertised |= NETDEV_F_10GB_FD;
1601 if (ecmd.advertising & ADVERTISED_TP) {
1602 netdev->advertised |= NETDEV_F_COPPER;
1604 if (ecmd.advertising & ADVERTISED_FIBRE) {
1605 netdev->advertised |= NETDEV_F_FIBER;
1607 if (ecmd.advertising & ADVERTISED_Autoneg) {
1608 netdev->advertised |= NETDEV_F_AUTONEG;
1610 if (ecmd.advertising & ADVERTISED_Pause) {
1611 netdev->advertised |= NETDEV_F_PAUSE;
1613 if (ecmd.advertising & ADVERTISED_Asym_Pause) {
1614 netdev->advertised |= NETDEV_F_PAUSE_ASYM;
1617 /* Current settings. */
/* NOTE(review): the assignment to 'speed' is on a line missing from this
 * listing.  The kernel's struct ethtool_cmd keeps the speed in a 16-bit
 * 'speed' field plus a separate 'speed_hi', so the 100000 and 1000000
 * comparisons below can only match if 'speed' combines both halves --
 * confirm. */
1619 if (speed == SPEED_10) {
1620 netdev->current = ecmd.duplex ? NETDEV_F_10MB_FD : NETDEV_F_10MB_HD;
1621 } else if (speed == SPEED_100) {
1622 netdev->current = ecmd.duplex ? NETDEV_F_100MB_FD : NETDEV_F_100MB_HD;
1623 } else if (speed == SPEED_1000) {
1624 netdev->current = ecmd.duplex ? NETDEV_F_1GB_FD : NETDEV_F_1GB_HD;
1625 } else if (speed == SPEED_10000) {
1626 netdev->current = NETDEV_F_10GB_FD;
1627 } else if (speed == 40000) {
1628 netdev->current = NETDEV_F_40GB_FD;
1629 } else if (speed == 100000) {
1630 netdev->current = NETDEV_F_100GB_FD;
1631 } else if (speed == 1000000) {
1632 netdev->current = NETDEV_F_1TB_FD;
1634 netdev->current = 0;
1637 if (ecmd.port == PORT_TP) {
1638 netdev->current |= NETDEV_F_COPPER;
1639 } else if (ecmd.port == PORT_FIBRE) {
1640 netdev->current |= NETDEV_F_FIBER;
1644 netdev->current |= NETDEV_F_AUTONEG;
1647 /* Peer advertisements. */
1648 netdev->peer = 0; /* XXX */
/* Both the cache-valid bit and the error code are stored, so a failed query
 * is remembered as well as a successful one. */
1651 netdev->cache_valid |= VALID_FEATURES;
1652 netdev->get_features_error = error;
/* Implementation notes for netdev_linux_get_features(), described below: the
 * values come from the cache filled by netdev_linux_read_features(), and the
 * function returns the cached 'get_features_error'.
 * NOTE(review): when that error is 0, all four output pointers are
 * dereferenced without a null check, although the description says "that are
 * non-null" -- presumably the caller always supplies valid pointers;
 * confirm. */
1655 /* Stores the features supported by 'netdev' into each of '*current',
1656 * '*advertised', '*supported', and '*peer' that are non-null. Each value is a
1657 * bitmap of NETDEV_* bits. Returns 0 if successful, otherwise a positive
1660 netdev_linux_get_features(const struct netdev *netdev_,
1661 enum netdev_features *current,
1662 enum netdev_features *advertised,
1663 enum netdev_features *supported,
1664 enum netdev_features *peer)
1666 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1668 netdev_linux_read_features(netdev);
1670 if (!netdev->get_features_error) {
1671 *current = netdev->current;
1672 *advertised = netdev->advertised;
1673 *supported = netdev->supported;
1674 *peer = netdev->peer;
1676 return netdev->get_features_error;
1679 /* Set the features advertised by 'netdev' to 'advertise'.
 *
 * The current settings are read with ETHTOOL_GSET, the 'advertising' mask is
 * rebuilt from the NETDEV_F_* bits in 'advertise', and the result is written
 * back with ETHTOOL_SSET, whose result is returned.
 * NOTE(review): the handling of a failed ETHTOOL_GSET is on lines missing
 * from this listing. */
1681 netdev_linux_set_advertisements(struct netdev *netdev,
1682 enum netdev_features advertise)
1684 struct ethtool_cmd ecmd;
1687 COVERAGE_INC(netdev_get_ethtool);
1688 memset(&ecmd, 0, sizeof ecmd);
1689 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1690 ETHTOOL_GSET, "ETHTOOL_GSET");
/* Only 'advertising' is rebuilt; the other fields of 'ecmd' keep the values
 * just read from the device. */
1695 ecmd.advertising = 0;
1696 if (advertise & NETDEV_F_10MB_HD) {
1697 ecmd.advertising |= ADVERTISED_10baseT_Half;
1699 if (advertise & NETDEV_F_10MB_FD) {
1700 ecmd.advertising |= ADVERTISED_10baseT_Full;
1702 if (advertise & NETDEV_F_100MB_HD) {
1703 ecmd.advertising |= ADVERTISED_100baseT_Half;
1705 if (advertise & NETDEV_F_100MB_FD) {
1706 ecmd.advertising |= ADVERTISED_100baseT_Full;
1708 if (advertise & NETDEV_F_1GB_HD) {
1709 ecmd.advertising |= ADVERTISED_1000baseT_Half;
1711 if (advertise & NETDEV_F_1GB_FD) {
1712 ecmd.advertising |= ADVERTISED_1000baseT_Full;
1714 if (advertise & NETDEV_F_10GB_FD) {
1715 ecmd.advertising |= ADVERTISED_10000baseT_Full;
1717 if (advertise & NETDEV_F_COPPER) {
1718 ecmd.advertising |= ADVERTISED_TP;
1720 if (advertise & NETDEV_F_FIBER) {
1721 ecmd.advertising |= ADVERTISED_FIBRE;
1723 if (advertise & NETDEV_F_AUTONEG) {
1724 ecmd.advertising |= ADVERTISED_Autoneg;
1726 if (advertise & NETDEV_F_PAUSE) {
1727 ecmd.advertising |= ADVERTISED_Pause;
1729 if (advertise & NETDEV_F_PAUSE_ASYM) {
1730 ecmd.advertising |= ADVERTISED_Asym_Pause;
1732 COVERAGE_INC(netdev_set_ethtool);
1733 return netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1734 ETHTOOL_SSET, "ETHTOOL_SSET");
1737 /* Attempts to set input rate limiting (policing) policy. Returns 0 if
1738 * successful, otherwise a positive errno value.
 *
 * The last applied settings and their result are cached under
 * VALID_POLICING.  Applying a policy removes any existing ingress qdisc and
 * then, judging by the calls below, adds a fresh ingress qdisc and a policer.
 * NOTE(review): the conditions guarding each step (for example whether the
 * qdisc is re-added when 'kbits_rate' is 0) are on lines missing from this
 * listing. */
1740 netdev_linux_set_policing(struct netdev *netdev_,
1741 uint32_t kbits_rate, uint32_t kbits_burst)
1743 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1744 const char *netdev_name = netdev_get_name(netdev_);
1748 kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
1749 : !kbits_burst ? 1000 /* Default to 1000 kbits if 0. */
1750 : kbits_burst); /* Stick with user-specified value. */
1752 if (netdev->cache_valid & VALID_POLICING) {
/* A cached failure is returned again without retrying. */
1753 if (netdev->netdev_policing_error) {
1754 return netdev->netdev_policing_error;
1757 if (netdev->kbits_rate == kbits_rate &&
1758 netdev->kbits_burst == kbits_burst) {
1759 /* Assume that settings haven't changed since we last set them. */
1762 netdev->cache_valid &= ~VALID_POLICING;
1765 COVERAGE_INC(netdev_set_policing);
1766 /* Remove any existing ingress qdisc. */
1767 error = tc_add_del_ingress_qdisc(netdev_, false);
1769 VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
1770 netdev_name, ovs_strerror(error));
1775 error = tc_add_del_ingress_qdisc(netdev_, true);
1777 VLOG_WARN_RL(&rl, "%s: adding policing qdisc failed: %s",
1778 netdev_name, ovs_strerror(error));
1782 error = tc_add_policer(netdev_, kbits_rate, kbits_burst);
1784 VLOG_WARN_RL(&rl, "%s: adding policing action failed: %s",
1785 netdev_name, ovs_strerror(error));
1790 netdev->kbits_rate = kbits_rate;
1791 netdev->kbits_burst = kbits_burst;
/* Only success and ENODEV are cached; other errors leave VALID_POLICING
 * clear. */
1794 if (!error || error == ENODEV) {
1795 netdev->netdev_policing_error = error;
1796 netdev->cache_valid |= VALID_POLICING;
/* Adds to 'types' the OVS name of every entry in the 'tcs' table that has a
 * 'tc_install' callback and a non-empty 'ovs_name'.  'netdev' is unused.
 * ('types' is declared on a parameter line missing from this listing.) */
1802 netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
1805 const struct tc_ops *const *opsp;
/* 'tcs' is walked until its NULL terminator. */
1807 for (opsp = tcs; *opsp != NULL; opsp++) {
1808 const struct tc_ops *ops = *opsp;
1809 if (ops->tc_install && ops->ovs_name[0] != '\0') {
1810 sset_add(types, ops->ovs_name);
/* Searches the NULL-terminated 'tcs' table for the entry whose 'ovs_name'
 * equals 'name'.  NOTE(review): the return statements are not visible here;
 * callers test the result against NULL, so presumably NULL means "not
 * found". */
1816 static const struct tc_ops *
1817 tc_lookup_ovs_name(const char *name)
1819 const struct tc_ops *const *opsp;
1821 for (opsp = tcs; *opsp != NULL; opsp++) {
1822 const struct tc_ops *ops = *opsp;
1823 if (!strcmp(name, ops->ovs_name)) {
/* Searches the NULL-terminated 'tcs' table for the entry whose 'linux_name'
 * equals 'name'.  Entries with a null 'linux_name' are skipped.
 * NOTE(review): the return statements are not visible in this listing. */
1830 static const struct tc_ops *
1831 tc_lookup_linux_name(const char *name)
1833 const struct tc_ops *const *opsp;
1835 for (opsp = tcs; *opsp != NULL; opsp++) {
1836 const struct tc_ops *ops = *opsp;
1837 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
/* Searches the hash bucket selected by 'hash' in 'netdev_->tc->queues' for
 * the queue whose 'queue_id' matches.  ('hash' is declared on a parameter
 * line missing from this listing, as are the return statements.) */
1844 static struct tc_queue *
1845 tc_find_queue__(const struct netdev *netdev_, unsigned int queue_id,
1848 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1849 struct tc_queue *queue;
1851 HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev->tc->queues) {
1852 if (queue->queue_id == queue_id) {
/* Looks up queue 'queue_id' on 'netdev', hashing the id with
 * hash_int(queue_id, 0) and delegating to tc_find_queue__(). */
1859 static struct tc_queue *
1860 tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
1862 return tc_find_queue__(netdev, queue_id, hash_int(queue_id, 0));
/* Reports the capabilities of QoS type 'type' in '*caps'; 'n_queues' is
 * copied from the matching tc_ops entry.  'netdev' is unused.
 * NOTE(review): the handling of an unknown 'type' and the return value are
 * on lines missing from this listing. */
1866 netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
1868 struct netdev_qos_capabilities *caps)
1870 const struct tc_ops *ops = tc_lookup_ovs_name(type);
1874 caps->n_queues = ops->n_queues;
/* Reports the QoS type currently installed on 'netdev_' in '*typep' and, when
 * that type provides a 'qdisc_get' callback, asks it to fill in 'details'.
 * tc_query_qdisc() is called first.
 * NOTE(review): the error path and the value returned when there is no
 * 'qdisc_get' are on lines missing from this listing. */
1879 netdev_linux_get_qos(const struct netdev *netdev_,
1880 const char **typep, struct smap *details)
1882 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1885 error = tc_query_qdisc(netdev_);
/* '*typep' points at the ops table's name string; nothing is copied. */
1890 *typep = netdev->tc->ops->ovs_name;
1891 return (netdev->tc->ops->qdisc_get
1892 ? netdev->tc->ops->qdisc_get(netdev_, details)
/* Configures QoS type 'type' with 'details' on 'netdev_'.  If 'type' is the
 * one already installed, its 'qdisc_set' callback (if any) reconfigures it in
 * place; otherwise the existing qdisc is deleted and the new type's
 * 'tc_install' is run.
 * NOTE(review): the error returns are on lines missing from this listing. */
1897 netdev_linux_set_qos(struct netdev *netdev_,
1898 const char *type, const struct smap *details)
1900 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1901 const struct tc_ops *new_ops;
1904 new_ops = tc_lookup_ovs_name(type);
/* Unknown types and types that cannot be installed are rejected. */
1905 if (!new_ops || !new_ops->tc_install) {
1909 error = tc_query_qdisc(netdev_);
1914 if (new_ops == netdev->tc->ops) {
1915 return new_ops->qdisc_set ? new_ops->qdisc_set(netdev_, details) : 0;
1917 /* Delete existing qdisc. */
1918 error = tc_del_qdisc(netdev_);
/* After deletion 'netdev->tc' must have been cleared. */
1922 ovs_assert(netdev->tc == NULL);
1924 /* Install new qdisc. */
1925 error = new_ops->tc_install(netdev_, details);
/* tc_install must set 'netdev->tc' exactly when it succeeds. */
1926 ovs_assert((error == 0) == (netdev->tc != NULL));
/* Retrieves the configuration of queue 'queue_id' on 'netdev_' into
 * 'details', using the installed QoS type's 'class_get' callback on the queue
 * found by tc_find_queue().
 * NOTE(review): the error path and the result for a missing queue are on
 * lines missing from this listing. */
1933 netdev_linux_get_queue(const struct netdev *netdev_,
1934 unsigned int queue_id, struct smap *details)
1936 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1939 error = tc_query_qdisc(netdev_);
1943 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
1945 ? netdev->tc->ops->class_get(netdev_, queue, details)
/* Configures queue 'queue_id' on 'netdev_' from 'details' using the installed
 * QoS type's 'class_set' callback.
 * NOTE(review): the error codes for the rejected cases are on lines missing
 * from this listing. */
1951 netdev_linux_set_queue(struct netdev *netdev_,
1952 unsigned int queue_id, const struct smap *details)
1954 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1957 error = tc_query_qdisc(netdev_);
/* Rejected when the id is outside the type's 'n_queues' range or the type
 * has no 'class_set' callback. */
1960 } else if (queue_id >= netdev->tc->ops->n_queues
1961 || !netdev->tc->ops->class_set) {
1965 return netdev->tc->ops->class_set(netdev_, queue_id, details);
/* Deletes queue 'queue_id' from 'netdev_' using the installed QoS type's
 * 'class_delete' callback on the queue found by tc_find_queue().
 * NOTE(review): the error codes for a missing callback or a missing queue
 * are on lines missing from this listing. */
1969 netdev_linux_delete_queue(struct netdev *netdev_, unsigned int queue_id)
1971 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1974 error = tc_query_qdisc(netdev_);
1977 } else if (!netdev->tc->ops->class_delete) {
1980 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
1982 ? netdev->tc->ops->class_delete(netdev_, queue)
/* Retrieves statistics for queue 'queue_id' on 'netdev_' into '*stats' using
 * the installed QoS type's 'class_get_stats' callback.
 * NOTE(review): the error codes for a missing callback or a missing queue
 * are on lines missing from this listing. */
1988 netdev_linux_get_queue_stats(const struct netdev *netdev_,
1989 unsigned int queue_id,
1990 struct netdev_queue_stats *stats)
1992 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1995 error = tc_query_qdisc(netdev_);
1998 } else if (!netdev->tc->ops->class_get_stats) {
2001 const struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
/* 'created' is taken from the queue record before the callback fills in the
 * remaining fields. */
2005 stats->created = queue->created;
2006 return netdev->tc->ops->class_get_stats(netdev_, queue, stats);
/* Starts a Netlink dump of the traffic classes on 'netdev': builds an
 * RTM_GETTCLASS request with 'tcm_parent' 0, starts 'dump' on NETLINK_ROUTE
 * and releases the request buffer.
 * NOTE(review): the return values are not visible; callers treat a false
 * result as failure. */
2011 start_queue_dump(const struct netdev *netdev, struct nl_dump *dump)
2013 struct ofpbuf request;
2014 struct tcmsg *tcmsg;
2016 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
2020 tcmsg->tcm_parent = 0;
2021 nl_dump_start(dump, NETLINK_ROUTE, &request);
2022 ofpbuf_uninit(&request);
/* Calls 'cb' with the id and configuration of each queue on 'netdev_',
 * passing 'aux' through.  Configuration is fetched per queue with the
 * installed QoS type's 'class_get' callback.
 * NOTE(review): error accumulation and the return value are on lines missing
 * from this listing. */
2027 netdev_linux_dump_queues(const struct netdev *netdev_,
2028 netdev_dump_queues_cb *cb, void *aux)
2030 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2031 struct tc_queue *queue, *next_queue;
2032 struct smap details;
2036 error = tc_query_qdisc(netdev_);
2039 } else if (!netdev->tc->ops->class_get) {
2044 smap_init(&details);
/* The _SAFE iterator keeps a 'next_queue' cursor -- presumably so that 'cb'
 * may delete the queue it is handed; confirm. */
2045 HMAP_FOR_EACH_SAFE (queue, next_queue, hmap_node,
2046 &netdev->tc->queues) {
/* One smap is reused for every queue, so clear it each time. */
2047 smap_clear(&details);
2049 error = netdev->tc->ops->class_get(netdev_, queue, &details);
2051 (*cb)(queue->queue_id, &details, aux);
2056 smap_destroy(&details);
/* Dumps per-queue statistics for 'netdev_' by walking a Netlink traffic-class
 * dump and handing each message to the installed QoS type's
 * 'class_dump_stats' callback with 'cb' and 'aux'.  The result prefers an
 * error from nl_dump_done() over 'last_error'.
 * NOTE(review): how 'last_error' is updated inside the loop is on lines
 * missing from this listing. */
2062 netdev_linux_dump_queue_stats(const struct netdev *netdev_,
2063 netdev_dump_queue_stats_cb *cb, void *aux)
2065 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2066 struct nl_dump dump;
2071 error = tc_query_qdisc(netdev_);
2074 } else if (!netdev->tc->ops->class_dump_stats) {
2079 if (!start_queue_dump(netdev_, &dump)) {
2082 while (nl_dump_next(&dump, &msg)) {
2083 error = netdev->tc->ops->class_dump_stats(netdev_, &msg, cb, aux);
2089 error = nl_dump_done(&dump);
2090 return error ? error : last_error;
/* Stores the IPv4 address and netmask of 'netdev_' in '*address' and
 * '*netmask'.  The values are fetched with SIOCGIFADDR and SIOCGIFNETMASK
 * the first time and cached under VALID_IN4 afterwards.  Returns
 * EADDRNOTAVAIL when the address is INADDR_ANY, otherwise 0.
 * NOTE(review): the handling of ioctl failures is on lines missing from this
 * listing. */
2094 netdev_linux_get_in4(const struct netdev *netdev_,
2095 struct in_addr *address, struct in_addr *netmask)
2097 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2099 if (!(netdev->cache_valid & VALID_IN4)) {
2102 error = netdev_linux_get_ipv4(netdev_, &netdev->address,
2103 SIOCGIFADDR, "SIOCGIFADDR");
2108 error = netdev_linux_get_ipv4(netdev_, &netdev->netmask,
2109 SIOCGIFNETMASK, "SIOCGIFNETMASK");
2114 netdev->cache_valid |= VALID_IN4;
2116 *address = netdev->address;
2117 *netmask = netdev->netmask;
2118 return address->s_addr == INADDR_ANY ? EADDRNOTAVAIL : 0;
/* Assigns IPv4 'address' and 'netmask' to 'netdev_' with SIOCSIFADDR and
 * SIOCSIFNETMASK, and updates the cached copies.
 * NOTE(review): the condition guarding the cache update and the return
 * statement are on lines missing from this listing. */
2122 netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
2123 struct in_addr netmask)
2125 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2128 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
2130 netdev->cache_valid |= VALID_IN4;
2131 netdev->address = address;
2132 netdev->netmask = netmask;
/* The netmask ioctl is issued only for a real address, not INADDR_ANY. */
2133 if (address.s_addr != INADDR_ANY) {
2134 error = do_set_addr(netdev_, SIOCSIFNETMASK,
2135 "SIOCSIFNETMASK", netmask);
/* Parses 'line' as a scanf pattern of sixteen two-digit hex bytes, stored
 * into 'in6', followed by four skipped hex fields and an interface name of up
 * to 16 characters, stored into 'ifname'.  The caller feeds it lines from
 * /proc/net/if_inet6.
 * NOTE(review): the scanning call and the success test are on lines missing
 * from this listing; the caller uses the result as a boolean. */
2142 parse_if_inet6_line(const char *line,
2143 struct in6_addr *in6, char ifname[16 + 1])
2145 uint8_t *s6 = in6->s6_addr;
/* X8 converts exactly two hex digits into one uint8_t. */
2146 #define X8 "%2"SCNx8
2148 " "X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8
2149 "%*x %*x %*x %*x %16s\n",
2150 &s6[0], &s6[1], &s6[2], &s6[3],
2151 &s6[4], &s6[5], &s6[6], &s6[7],
2152 &s6[8], &s6[9], &s6[10], &s6[11],
2153 &s6[12], &s6[13], &s6[14], &s6[15],
2157 /* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address (if
2158 * 'in6' is non-null) and returns true. Otherwise, returns false.
 *
 * The address is looked up by scanning /proc/net/if_inet6 for a line whose
 * interface name matches, and is cached under VALID_IN6; the cache starts
 * from 'in6addr_any'. */
2160 netdev_linux_get_in6(const struct netdev *netdev_, struct in6_addr *in6)
2162 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2163 if (!(netdev->cache_valid & VALID_IN6)) {
2167 netdev->in6 = in6addr_any;
2169 file = fopen("/proc/net/if_inet6", "r");
2171 const char *name = netdev_get_name(netdev_);
2172 while (fgets(line, sizeof line, file)) {
2173 struct in6_addr in6_tmp;
2174 char ifname[16 + 1];
2175 if (parse_if_inet6_line(line, &in6_tmp, ifname)
2176 && !strcmp(name, ifname))
2178 netdev->in6 = in6_tmp;
2184 netdev->cache_valid |= VALID_IN6;
/* Initializes '*sa' as an AF_INET socket address holding 'addr'.  A zeroed
 * sockaddr_in is filled in locally, then copied over a zeroed '*sa'. */
2191 make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
2193 struct sockaddr_in sin;
2194 memset(&sin, 0, sizeof sin);
2195 sin.sin_family = AF_INET;
2196 sin.sin_addr = addr;
2199 memset(sa, 0, sizeof *sa);
2200 memcpy(sa, &sin, sizeof sin);
/* Issues address-setting ioctl 'ioctl_nr' (named 'ioctl_name' for the ioctl
 * helper) on 'netdev' with IPv4 address 'addr', and returns the result of
 * netdev_linux_do_ioctl(). */
2204 do_set_addr(struct netdev *netdev,
2205 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
2208 ovs_strzcpy(ifr.ifr_name, netdev_get_name(netdev), sizeof ifr.ifr_name);
2209 make_in4_sockaddr(&ifr.ifr_addr, addr);
2211 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
2215 /* Adds 'router' as a default IP gateway.
 *
 * The route is added with SIOCADDRT on 'af_inet_sock': destination and
 * genmask are INADDR_ANY and the flags are RTF_UP | RTF_GATEWAY.  'netdev'
 * is unused.  A failure is logged.
 * NOTE(review): the return statement is not visible; presumably 'error' is
 * returned. */
2217 netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
2219 struct in_addr any = { INADDR_ANY };
2223 memset(&rt, 0, sizeof rt);
2224 make_in4_sockaddr(&rt.rt_dst, any);
2225 make_in4_sockaddr(&rt.rt_gateway, router);
2226 make_in4_sockaddr(&rt.rt_genmask, any);
2227 rt.rt_flags = RTF_UP | RTF_GATEWAY;
2228 error = ioctl(af_inet_sock, SIOCADDRT, &rt) < 0 ? errno : 0;
2230 VLOG_WARN("ioctl(SIOCADDRT): %s", ovs_strerror(error));
/* Finds a route to 'host' by scanning /proc/net/route line by line.  For a
 * route that is up and whose masked destination matches 'host', '*next_hop'
 * is set (0 when directly reachable, otherwise the gateway) and
 * '*netdev_name' receives an xstrdup()'d copy of the interface name.
 * '*netdev_name' is NULL if no route is found.
 * NOTE(review): the return values, the test that separates direct routes from
 * gateway routes, and the closing of 'stream' are on lines missing from this
 * listing. */
2236 netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
2239 static const char fn[] = "/proc/net/route";
2244 *netdev_name = NULL;
2245 stream = fopen(fn, "r");
2246 if (stream == NULL) {
2247 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, ovs_strerror(errno));
2252 while (fgets(line, sizeof line, stream)) {
2255 ovs_be32 dest, gateway, mask;
2256 int refcnt, metric, mtu;
2257 unsigned int flags, use, window, irtt;
/* A line counts as parsed only when all 11 fields convert. */
2260 "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
2262 iface, &dest, &gateway, &flags, &refcnt,
2263 &use, &metric, &mask, &mtu, &window, &irtt) != 11) {
2265 VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
2269 if (!(flags & RTF_UP)) {
2270 /* Skip routes that aren't up. */
2274 /* The output of 'dest', 'mask', and 'gateway' were given in
2275 * network byte order, so we don't need any endian
2276 * conversions here. */
2277 if ((dest & mask) == (host->s_addr & mask)) {
2279 /* The host is directly reachable. */
2280 next_hop->s_addr = 0;
2282 /* To reach the host, we must go through a gateway. */
2283 next_hop->s_addr = gateway;
2285 *netdev_name = xstrdup(iface);
/* Adds the driver name, driver version and firmware version of 'netdev_' to
 * 'smap'.  The data comes from an ETHTOOL_GDRVINFO query whose result is kept
 * in 'netdev->drvinfo' and cached under VALID_DRVINFO.
 * NOTE(review): the handling of a failed query and the return value are on
 * lines missing from this listing. */
2297 netdev_linux_get_status(const struct netdev *netdev_, struct smap *smap)
2299 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2302 if (!(netdev->cache_valid & VALID_DRVINFO)) {
/* 'drvinfo' is passed to the ethtool helper through an ethtool_cmd
 * pointer. */
2303 struct ethtool_cmd *cmd = (struct ethtool_cmd *) &netdev->drvinfo;
2305 COVERAGE_INC(netdev_get_ethtool);
2306 memset(&netdev->drvinfo, 0, sizeof netdev->drvinfo);
2307 error = netdev_linux_do_ethtool(netdev->up.name,
2310 "ETHTOOL_GDRVINFO");
2312 netdev->cache_valid |= VALID_DRVINFO;
2317 smap_add(smap, "driver_name", netdev->drvinfo.driver);
2318 smap_add(smap, "driver_version", netdev->drvinfo.version);
2319 smap_add(smap, "firmware_version", netdev->drvinfo.fw_version);
/* Status for internal devices: reports the fixed driver name "openvswitch"
 * in 'smap'.  'netdev' is unused. */
2325 netdev_internal_get_status(const struct netdev *netdev OVS_UNUSED,
2328 smap_add(smap, "driver_name", "openvswitch");
2332 /* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
2333 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
2334 * returns 0. Otherwise, it returns a positive errno value; in particular,
2335 * ENXIO indicates that there is no ARP table entry for 'ip' on 'netdev'.
 *
 * The lookup is a SIOCGARP ioctl on 'af_inet_sock'.  ENXIO is not logged;
 * other failures are. */
2337 netdev_linux_arp_lookup(const struct netdev *netdev,
2338 ovs_be32 ip, uint8_t mac[ETH_ADDR_LEN])
2341 struct sockaddr_in sin;
2344 memset(&r, 0, sizeof r);
2345 memset(&sin, 0, sizeof sin);
2346 sin.sin_family = AF_INET;
2347 sin.sin_addr.s_addr = ip;
/* The protocol address is copied in as a sockaddr_in. */
2349 memcpy(&r.arp_pa, &sin, sizeof sin);
2350 r.arp_ha.sa_family = ARPHRD_ETHER;
2352 ovs_strzcpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
2353 COVERAGE_INC(netdev_arp_lookup);
2354 retval = ioctl(af_inet_sock, SIOCGARP, &r) < 0 ? errno : 0;
2356 memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
2357 } else if (retval != ENXIO) {
2358 VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
2359 netdev_get_name(netdev), IP_ARGS(ip),
2360 ovs_strerror(retval));
/* Translates the NETDEV_UP and NETDEV_PROMISC bits of 'nd' into interface
 * flags.  NOTE(review): the assignments and the return are on lines missing
 * from this listing; presumably this is the inverse of iff_to_nd_flags(). */
2366 nd_to_iff_flags(enum netdev_flags nd)
2369 if (nd & NETDEV_UP) {
2372 if (nd & NETDEV_PROMISC) {
/* Translates interface flags 'iff' into netdev flags; IFF_PROMISC maps to
 * NETDEV_PROMISC.  NOTE(review): the remaining mappings and the return are
 * on lines missing from this listing. */
2379 iff_to_nd_flags(int iff)
2381 enum netdev_flags nd = 0;
2385 if (iff & IFF_PROMISC) {
2386 nd |= NETDEV_PROMISC;
/* Clears the 'off' flags and sets the 'on' flags on 'netdev_', starting from
 * the cached 'ifi_flags'.  The flags as they were before the change are
 * stored in '*old_flagsp'.  The kernel is touched only when the resulting
 * flags differ from the cached ones.
 * NOTE(review): the return statement is not visible in this listing. */
2392 netdev_linux_update_flags(struct netdev *netdev_, enum netdev_flags off,
2393 enum netdev_flags on, enum netdev_flags *old_flagsp)
2395 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2396 int old_flags, new_flags;
2399 old_flags = netdev->ifi_flags;
2400 *old_flagsp = iff_to_nd_flags(old_flags);
2401 new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
2402 if (new_flags != old_flags) {
2403 error = set_flags(netdev_get_name(netdev_), new_flags);
/* The cache is refreshed from the device after the set. */
2404 get_flags(netdev_, &netdev->ifi_flags);
/* Returns the 'change_seq' field of 'netdev'. */
2410 netdev_linux_change_seq(const struct netdev *netdev)
2412 return netdev_linux_cast(netdev)->change_seq;
/* Initializer list of netdev callbacks shared by the class definitions in
 * this file (presumably for struct netdev_class).  Only NAME, CREATE,
 * GET_STATS, SET_STATS, GET_FEATURES and GET_STATUS vary per class; every
 * other slot is fixed to a netdev_linux_* function or NULL.  Entries are
 * positional, so their order must match the target struct's member order. */
2415 #define NETDEV_LINUX_CLASS(NAME, CREATE, GET_STATS, SET_STATS, \
2416 GET_FEATURES, GET_STATUS) \
2420 netdev_linux_init, \
2422 netdev_linux_wait, \
2425 netdev_linux_destroy, \
2426 NULL, /* get_config */ \
2427 NULL, /* set_config */ \
2428 NULL, /* get_tunnel_config */ \
2430 netdev_linux_rx_open, \
2432 netdev_linux_send, \
2433 netdev_linux_send_wait, \
2435 netdev_linux_set_etheraddr, \
2436 netdev_linux_get_etheraddr, \
2437 netdev_linux_get_mtu, \
2438 netdev_linux_set_mtu, \
2439 netdev_linux_get_ifindex, \
2440 netdev_linux_get_carrier, \
2441 netdev_linux_get_carrier_resets, \
2442 netdev_linux_set_miimon_interval, \
2447 netdev_linux_set_advertisements, \
2449 netdev_linux_set_policing, \
2450 netdev_linux_get_qos_types, \
2451 netdev_linux_get_qos_capabilities, \
2452 netdev_linux_get_qos, \
2453 netdev_linux_set_qos, \
2454 netdev_linux_get_queue, \
2455 netdev_linux_set_queue, \
2456 netdev_linux_delete_queue, \
2457 netdev_linux_get_queue_stats, \
2458 netdev_linux_dump_queues, \
2459 netdev_linux_dump_queue_stats, \
2461 netdev_linux_get_in4, \
2462 netdev_linux_set_in4, \
2463 netdev_linux_get_in6, \
2464 netdev_linux_add_router, \
2465 netdev_linux_get_next_hop, \
2467 netdev_linux_arp_lookup, \
2469 netdev_linux_update_flags, \
2471 netdev_linux_change_seq \
/* Class definition using netdev_linux_create and netdev_linux_get_stats, with
 * no set_stats and with ethtool-based features and status.  (The class name
 * string is on a line missing from this listing.) */
2474 const struct netdev_class netdev_linux_class =
2477 netdev_linux_create,
2478 netdev_linux_get_stats,
2479 NULL, /* set_stats */
2480 netdev_linux_get_features,
2481 netdev_linux_get_status);
/* Class definition for tap devices: created with netdev_linux_create_tap and
 * reporting stats through netdev_tap_get_stats; otherwise the same callbacks
 * as netdev_linux_class. */
2483 const struct netdev_class netdev_tap_class =
2486 netdev_linux_create_tap,
2487 netdev_tap_get_stats,
2488 NULL, /* set_stats */
2489 netdev_linux_get_features,
2490 netdev_linux_get_status);
/* Class definition for internal devices: stats are read from and written to
 * the vport layer, features are not supported, and the status callback
 * reports the "openvswitch" driver name. */
2492 const struct netdev_class netdev_internal_class =
2495 netdev_linux_create,
2496 netdev_internal_get_stats,
2497 netdev_internal_set_stats,
2498 NULL, /* get_features */
2499 netdev_internal_get_status);
/* Receive-side callbacks, listed positionally: destroy, recv, wait, drain. */
2501 static const struct netdev_rx_class netdev_rx_linux_class = {
2502 netdev_rx_linux_destroy,
2503 netdev_rx_linux_recv,
2504 netdev_rx_linux_wait,
2505 netdev_rx_linux_drain,
2508 /* HTB traffic control class. */
/* NOTE(review): presumably the upper bound on HTB queue ids -- confirm where
 * it is used. */
2510 #define HTB_N_QUEUES 0xf000
/* NOTE(review): the struct headers are on lines missing from this listing.
 * Judging by htb_install__() and htb_setup_class__(), the first 'max_rate'
 * belongs to 'struct htb' and the fields from 'tc_queue' onward to
 * 'struct htb_class'. */
2514 unsigned int max_rate; /* In bytes/s. */
2518 struct tc_queue tc_queue;
2519 unsigned int min_rate; /* In bytes/s. */
2520 unsigned int max_rate; /* In bytes/s. */
2521 unsigned int burst; /* In bytes. */
2522 unsigned int priority; /* Lower values are higher priorities. */
/* Returns the 'struct htb' that embeds 'netdev_->tc' as its 'tc' member.
 * Only meaningful while the installed tc really is an HTB one. */
2526 htb_get__(const struct netdev *netdev_)
2528 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2529 return CONTAINER_OF(netdev->tc, struct htb, tc);
/* Allocates a 'struct htb', initializes its embedded tc with 'tc_ops_htb',
 * records 'max_rate' and installs it as 'netdev_->tc'.
 * NOTE(review): 'max_rate' arrives as uint64_t; if the 'max_rate' field is
 * the 'unsigned int' declared earlier in this file, larger values are
 * truncated on assignment -- confirm. */
2533 htb_install__(struct netdev *netdev_, uint64_t max_rate)
2535 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2538 htb = xmalloc(sizeof *htb);
2539 tc_init(&htb->tc, &tc_ops_htb);
2540 htb->max_rate = max_rate;
2542 netdev->tc = &htb->tc;
2545 /* Create an HTB qdisc.
2547 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1".
 *
 * Any existing qdisc is deleted first.  Returns the result of the Netlink
 * transaction. */
2549 htb_setup_qdisc__(struct netdev *netdev)
2552 struct tc_htb_glob opt;
2553 struct ofpbuf request;
2554 struct tcmsg *tcmsg;
2556 tc_del_qdisc(netdev);
/* NLM_F_EXCL | NLM_F_CREATE: create a new qdisc, do not replace one. */
2558 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2559 NLM_F_EXCL | NLM_F_CREATE, &request);
2563 tcmsg->tcm_handle = tc_make_handle(1, 0);
2564 tcmsg->tcm_parent = TC_H_ROOT;
2566 nl_msg_put_string(&request, TCA_KIND, "htb");
2568 memset(&opt, 0, sizeof opt);
2569 opt.rate2quantum = 10;
/* The HTB global options travel as TCA_HTB_INIT nested inside
 * TCA_OPTIONS. */
2573 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2574 nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
2575 nl_msg_end_nested(&request, opt_offset);
2577 return tc_transact(&request, NULL);
/* Creates or updates HTB class 'handle' under 'parent' on 'netdev' from the
 * rates, burst and priority in '*class'.
 *
 * The kernel rate specs and tick-based buffers are derived from the device
 * MTU obtained with netdev_get_mtu(); a device without an MTU is reported
 * with a rate-limited warning.  A failed netlink transaction is logged too,
 * together with every requested parameter. */
2580 /* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
2581 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
2583 htb_setup_class__(struct netdev *netdev, unsigned int handle,
2584 unsigned int parent, struct htb_class *class)
2587 struct tc_htb_opt opt;
2588 struct ofpbuf request;
2589 struct tcmsg *tcmsg;
2593 error = netdev_get_mtu(netdev, &mtu);
2595 VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU",
2596 netdev_get_name(netdev));
/* 'rate' carries min_rate and 'ceil' carries max_rate; the same 'burst' sizes
 * both the rate buffer and the ceiling buffer. */
2600 memset(&opt, 0, sizeof opt);
2601 tc_fill_rate(&opt.rate, class->min_rate, mtu);
2602 tc_fill_rate(&opt.ceil, class->max_rate, mtu);
2603 opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
2604 opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
2605 opt.prio = class->priority;
2607 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
2611 tcmsg->tcm_handle = handle;
2612 tcmsg->tcm_parent = parent;
/* The parameters and both rate tables travel nested inside TCA_OPTIONS. */
2614 nl_msg_put_string(&request, TCA_KIND, "htb");
2615 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2616 nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
2617 tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
2618 tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
2619 nl_msg_end_nested(&request, opt_offset);
2621 error = tc_transact(&request, NULL);
2623 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
2624 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
2625 netdev_get_name(netdev),
2626 tc_get_major(handle), tc_get_minor(handle),
2627 tc_get_major(parent), tc_get_minor(parent),
2628 class->min_rate, class->max_rate,
2629 class->burst, class->priority, ovs_strerror(error));
2634 /* Parses Netlink attributes in 'nl_options' for HTB parameters and stores a
2635 * description of them into '*class'. The description complies with the
2636 * specification given in the vswitch database documentation for linux-htb
 * queues. */
/* Extracts the mandatory TCA_HTB_PARMS attribute (at least a whole 'struct
 * tc_htb_opt') from the nested 'nl_options' and copies it into '*class':
 * 'rate' becomes min_rate, 'ceil' becomes max_rate, 'prio' becomes priority,
 * and the tick-based 'buffer' is converted to a burst in bytes with
 * tc_ticks_to_bytes().  A parse failure is logged at warning level. */
2639 htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
2641 static const struct nl_policy tca_htb_policy[] = {
2642 [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
2643 .min_len = sizeof(struct tc_htb_opt) },
2646 struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
2647 const struct tc_htb_opt *htb;
2649 if (!nl_parse_nested(nl_options, tca_htb_policy,
2650 attrs, ARRAY_SIZE(tca_htb_policy))) {
2651 VLOG_WARN_RL(&rl, "failed to parse HTB class options");
2655 htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
2656 class->min_rate = htb->rate.rate;
2657 class->max_rate = htb->ceil.rate;
2658 class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
2659 class->priority = htb->prio;
/* Parses the class message 'tcmsg' with tc_parse_class(), which also fills in
 * '*stats'.
 *
 * If 'queue_id' is nonnull and the class handle has major 1 and a minor in
 * 1..HTB_N_QUEUES, the queue ID stored is that minor minus one.  If 'options'
 * is nonnull, the HTB parameters are decoded into it with
 * htb_parse_tca_options__(). */
2664 htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2665 struct htb_class *options,
2666 struct netdev_queue_stats *stats)
2668 struct nlattr *nl_options;
2669 unsigned int handle;
2672 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2673 if (!error && queue_id) {
2674 unsigned int major = tc_get_major(handle);
2675 unsigned int minor = tc_get_minor(handle);
2676 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2677 *queue_id = minor - 1;
2682 if (!error && options) {
2683 error = htb_parse_tca_options__(nl_options, options);
2689 htb_parse_qdisc_details__(struct netdev *netdev,
2690 const struct smap *details, struct htb_class *hc)
2692 const char *max_rate_s;
2694 max_rate_s = smap_get(details, "max-rate");
2695 hc->max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
2696 if (!hc->max_rate) {
2697 enum netdev_features current;
2699 netdev_get_features(netdev, ¤t, NULL, NULL, NULL);
2700 hc->max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
2702 hc->min_rate = hc->max_rate;
/* Converts the per-queue configuration in 'details' into '*hc'.
 *
 * "min-rate", "max-rate" and "burst" are divided by 8 (bits to bytes);
 * "priority" is used as given.  The values are then clamped: min_rate to at
 * least the MTU and at most the qdisc-wide max_rate, max_rate to the range
 * [min_rate, qdisc-wide max_rate], and burst to at least MTU + 64.  A device
 * whose MTU cannot be read is reported with a rate-limited warning. */
2708 htb_parse_class_details__(struct netdev *netdev,
2709 const struct smap *details, struct htb_class *hc)
2711 const struct htb *htb = htb_get__(netdev);
2712 const char *min_rate_s = smap_get(details, "min-rate");
2713 const char *max_rate_s = smap_get(details, "max-rate");
2714 const char *burst_s = smap_get(details, "burst");
2715 const char *priority_s = smap_get(details, "priority");
2718 error = netdev_get_mtu(netdev, &mtu);
2720 VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU",
2721 netdev_get_name(netdev));
2725 /* HTB requires at least an mtu sized min-rate to send any traffic even
2726 * on uncongested links. */
2727 hc->min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
2728 hc->min_rate = MAX(hc->min_rate, mtu);
2729 hc->min_rate = MIN(hc->min_rate, htb->max_rate);
2732 hc->max_rate = (max_rate_s
2733 ? strtoull(max_rate_s, NULL, 10) / 8
2735 hc->max_rate = MAX(hc->max_rate, hc->min_rate);
2736 hc->max_rate = MIN(hc->max_rate, htb->max_rate);
2740 * According to hints in the documentation that I've read, it is important
2741 * that 'burst' be at least as big as the largest frame that might be
2742 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
2743 * but having it a bit too small is a problem. Since netdev_get_mtu()
2744 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
2745 * the MTU. We actually add 64, instead of 14, as a guard against
2746 * additional headers get tacked on somewhere that we're not aware of. */
2747 hc->burst = burst_s ? strtoull(burst_s, NULL, 10) / 8 : 0;
2748 hc->burst = MAX(hc->burst, mtu + 64);
2751 hc->priority = priority_s ? strtoul(priority_s, NULL, 10) : 0;
/* Asks the kernel about class 'handle' (child of 'parent') on 'netdev' with
 * tc_query_class() and decodes the reply into '*options' and '*stats' through
 * htb_parse_tcmsg__(); no queue ID is requested.  The reply buffer is released
 * with ofpbuf_delete(). */
2757 htb_query_class__(const struct netdev *netdev, unsigned int handle,
2758 unsigned int parent, struct htb_class *options,
2759 struct netdev_queue_stats *stats)
2761 struct ofpbuf *reply;
2764 error = tc_query_class(netdev, handle, parent, &reply);
2766 error = htb_parse_tcmsg__(reply, NULL, options, stats);
2767 ofpbuf_delete(reply);
/* Installs HTB on 'netdev': sets up the qdisc, creates the root class 1:fffe
 * (parent 1:0) from the qdisc-level 'details', and records the resulting
 * max_rate in a new 'struct htb' via htb_install__(). */
2773 htb_tc_install(struct netdev *netdev, const struct smap *details)
2777 error = htb_setup_qdisc__(netdev);
2779 struct htb_class hc;
2781 htb_parse_qdisc_details__(netdev, details, &hc);
2782 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2783 tc_make_handle(1, 0), &hc);
2785 htb_install__(netdev, hc.max_rate);
/* Converts 'queue', which has to be the 'tc_queue' member of a 'struct
 * htb_class', into a pointer to that containing structure. */
2791 static struct htb_class *
2792 htb_class_cast__(const struct tc_queue *queue)
2794 return CONTAINER_OF(queue, struct htb_class, tc_queue);
/* Records the parameters in '*hc' for 'queue_id' in the queue map of the HTB
 * state of 'netdev'.
 *
 * The queue is looked up with tc_find_queue__() under hash_int(queue_id, 0).
 * Two paths are visible below: one casts the queue found to its 'struct
 * htb_class', the other allocates a fresh entry, stamps it with time_msec()
 * and inserts it into the map.  In both cases the four parameters are then
 * copied from '*hc'. */
2798 htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
2799 const struct htb_class *hc)
2801 struct htb *htb = htb_get__(netdev);
2802 size_t hash = hash_int(queue_id, 0);
2803 struct tc_queue *queue;
2804 struct htb_class *hcp;
2806 queue = tc_find_queue__(netdev, queue_id, hash);
2808 hcp = htb_class_cast__(queue);
2810 hcp = xmalloc(sizeof *hcp);
2811 queue = &hcp->tc_queue;
2812 queue->queue_id = queue_id;
2813 queue->created = time_msec();
2814 hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
2817 hcp->min_rate = hc->min_rate;
2818 hcp->max_rate = hc->max_rate;
2819 hcp->burst = hc->burst;
2820 hcp->priority = hc->priority;
/* Rebuilds the in-memory HTB state of 'netdev' from the kernel: queries root
 * class 1:fffe for the qdisc-wide max_rate, installs a 'struct htb' with it,
 * and then walks a queue dump, recording every class that
 * htb_parse_tcmsg__() accepts through htb_update_queue__().  'nlmsg' is not
 * used.
 *
 * NOTE(review): the result of htb_query_class__() is not checked on the line
 * shown here -- confirm that 'hc.max_rate' has a defined value if the query
 * fails. */
2824 htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
2827 struct nl_dump dump;
2828 struct htb_class hc;
2830 /* Get qdisc options. */
2832 htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
2833 htb_install__(netdev, hc.max_rate);
2836 if (!start_queue_dump(netdev, &dump)) {
2839 while (nl_dump_next(&dump, &msg)) {
2840 unsigned int queue_id;
2842 if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
2843 htb_update_queue__(netdev, queue_id, &hc);
2846 nl_dump_done(&dump);
/* Tears down the HTB state that embeds 'tc', removing every 'struct
 * htb_class' from its queue map.  The _SAFE iterator is used because entries
 * are removed from the map while it is being walked. */
2852 htb_tc_destroy(struct tc *tc)
2854 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
2855 struct htb_class *hc, *next;
2857 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &htb->tc.queues) {
2858 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
/* Reports the qdisc-wide max_rate of 'netdev' in 'details' as "max-rate",
 * multiplied by 8 to convert the stored bytes/s into bits/s. */
2866 htb_qdisc_get(const struct netdev *netdev, struct smap *details)
2868 const struct htb *htb = htb_get__(netdev);
2869 smap_add_format(details, "max-rate", "%llu", 8ULL * htb->max_rate);
/* Reconfigures the root class 1:fffe (parent 1:0) of 'netdev' from the
 * qdisc-level 'details' and stores the new max_rate in the HTB state. */
2874 htb_qdisc_set(struct netdev *netdev, const struct smap *details)
2876 struct htb_class hc;
2879 htb_parse_qdisc_details__(netdev, details, &hc);
2880 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2881 tc_make_handle(1, 0), &hc);
2883 htb_get__(netdev)->max_rate = hc.max_rate;
/* Describes 'queue' in 'details'.  "min-rate" is always added; "max-rate" is
 * added only when it differs from the minimum.  Rates and "burst" are
 * multiplied by 8 (bytes to bits); "priority" is reported unchanged.
 * 'netdev' is not used. */
2889 htb_class_get(const struct netdev *netdev OVS_UNUSED,
2890 const struct tc_queue *queue, struct smap *details)
2892 const struct htb_class *hc = htb_class_cast__(queue);
2894 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
2895 if (hc->min_rate != hc->max_rate) {
2896 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
2898 smap_add_format(details, "burst", "%llu", 8ULL * hc->burst);
2900 smap_add_format(details, "priority", "%u", hc->priority);
/* Configures queue 'queue_id' of 'netdev' from 'details': the settings are
 * parsed into a 'struct htb_class', applied to kernel class
 * 1:(queue_id + 1) under parent 1:fffe, and then cached with
 * htb_update_queue__(). */
2906 htb_class_set(struct netdev *netdev, unsigned int queue_id,
2907 const struct smap *details)
2909 struct htb_class hc;
2912 error = htb_parse_class_details__(netdev, details, &hc);
2917 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
2918 tc_make_handle(1, 0xfffe), &hc);
2923 htb_update_queue__(netdev, queue_id, &hc);
/* Deletes kernel class 1:(queue_id + 1) that backs 'queue' on 'netdev' and
 * removes the queue from the HTB queue map. */
2928 htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
2930 struct htb_class *hc = htb_class_cast__(queue);
2931 struct htb *htb = htb_get__(netdev);
2934 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
2936 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
/* Fetches the statistics of 'queue' into '*stats' by querying kernel class
 * 1:(queue_id + 1) under parent 1:fffe.  No class options are requested
 * (NULL is passed for them). */
2943 htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
2944 struct netdev_queue_stats *stats)
2946 return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
2947 tc_make_handle(1, 0xfffe), NULL, stats);
/* Handles one class message 'nlmsg' from a statistics dump.  The handle and
 * statistics are parsed with tc_parse_class(); when the handle has major 1
 * and a minor in 1..HTB_N_QUEUES, 'cb' is invoked with queue ID (minor - 1),
 * the statistics and 'aux'.  'netdev' is not used. */
2951 htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
2952 const struct ofpbuf *nlmsg,
2953 netdev_dump_queue_stats_cb *cb, void *aux)
2955 struct netdev_queue_stats stats;
2956 unsigned int handle, major, minor;
2959 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
2964 major = tc_get_major(handle);
2965 minor = tc_get_minor(handle);
2966 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2967 (*cb)(minor - 1, &stats, aux);
/* Operations table that binds the kernel qdisc "htb" to the OVS QoS type
 * "linux-htb", advertising HTB_N_QUEUES queues. */
2972 static const struct tc_ops tc_ops_htb = {
2973 "htb", /* linux_name */
2974 "linux-htb", /* ovs_name */
2975 HTB_N_QUEUES, /* n_queues */
2984 htb_class_get_stats,
2985 htb_class_dump_stats
2988 /* "linux-hfsc" traffic control class. */
/* Largest class minor treated as a queue: minors 1..HFSC_N_QUEUES map to
 * queue IDs 0..HFSC_N_QUEUES-1 (see hfsc_parse_tcmsg__()). */
2990 #define HFSC_N_QUEUES 0xf000
/* Generic queue embedded in the per-queue HFSC state so that
 * hfsc_class_cast__() can recover the container with CONTAINER_OF. */
2998 struct tc_queue tc_queue;
/* Returns the HFSC state of 'netdev_', i.e. the 'struct hfsc' whose 'tc'
 * member is what the device's 'tc' pointer refers to.
 * NOTE(review): nothing visible here verifies that the installed tc really is
 * an HFSC instance -- presumably callers guarantee it. */
3003 static struct hfsc *
3004 hfsc_get__(const struct netdev *netdev_)
3006 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3007 return CONTAINER_OF(netdev->tc, struct hfsc, tc);
/* Converts 'queue', which has to be the 'tc_queue' member of a 'struct
 * hfsc_class', into a pointer to that containing structure. */
3010 static struct hfsc_class *
3011 hfsc_class_cast__(const struct tc_queue *queue)
3013 return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
/* Allocates a new 'struct hfsc' with 'max_rate' as its qdisc-wide ceiling,
 * initializes its embedded tc with tc_ops_hfsc, and installs it as the tc of
 * 'netdev_'. */
3017 hfsc_install__(struct netdev *netdev_, uint32_t max_rate)
3019 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3022 hfsc = xmalloc(sizeof *hfsc);
3023 tc_init(&hfsc->tc, &tc_ops_hfsc);
3024 hfsc->max_rate = max_rate;
3025 netdev->tc = &hfsc->tc;
/* Records the rates in '*hc' for 'queue_id' in the queue map of the HFSC
 * state of 'netdev'.
 *
 * The queue is looked up with tc_find_queue__() under hash_int(queue_id, 0).
 * Two paths are visible below: one casts the queue found to its 'struct
 * hfsc_class', the other allocates a fresh entry, stamps it with time_msec()
 * and inserts it into the map.  In both cases min_rate and max_rate are then
 * copied from '*hc'. */
3029 hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
3030 const struct hfsc_class *hc)
3034 struct hfsc_class *hcp;
3035 struct tc_queue *queue;
3037 hfsc = hfsc_get__(netdev);
3038 hash = hash_int(queue_id, 0);
3040 queue = tc_find_queue__(netdev, queue_id, hash);
3042 hcp = hfsc_class_cast__(queue);
3044 hcp = xmalloc(sizeof *hcp);
3045 queue = &hcp->tc_queue;
3046 queue->queue_id = queue_id;
3047 queue->created = time_msec();
3048 hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
3051 hcp->min_rate = hc->min_rate;
3052 hcp->max_rate = hc->max_rate;
/* Decodes the real-time (RSC), fair (FSC) and upper-limit (USC) service
 * curves from the nested 'nl_options' into '*class'.
 *
 * Only the configuration that this file itself generates is accepted: all
 * three curves must be linear (m1 and d zero), the RSC and FSC slopes must be
 * equal, and that slope must not exceed the USC slope.  Each rejection is
 * logged with a rate-limited warning.  On acceptance min_rate is the FSC
 * slope and max_rate the USC slope. */
3056 hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
3058 const struct tc_service_curve *rsc, *fsc, *usc;
3059 static const struct nl_policy tca_hfsc_policy[] = {
3061 .type = NL_A_UNSPEC,
3063 .min_len = sizeof(struct tc_service_curve),
3066 .type = NL_A_UNSPEC,
3068 .min_len = sizeof(struct tc_service_curve),
3071 .type = NL_A_UNSPEC,
3073 .min_len = sizeof(struct tc_service_curve),
3076 struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];
3078 if (!nl_parse_nested(nl_options, tca_hfsc_policy,
3079 attrs, ARRAY_SIZE(tca_hfsc_policy))) {
3080 VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
3084 rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
3085 fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
3086 usc = nl_attr_get(attrs[TCA_HFSC_USC]);
3088 if (rsc->m1 != 0 || rsc->d != 0 ||
3089 fsc->m1 != 0 || fsc->d != 0 ||
3090 usc->m1 != 0 || usc->d != 0) {
3091 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3092 "Non-linear service curves are not supported.");
3096 if (rsc->m2 != fsc->m2) {
3097 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3098 "Real-time service curves are not supported ");
3102 if (rsc->m2 > usc->m2) {
3103 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3104 "Min-rate service curve is greater than "
3105 "the max-rate service curve.");
3109 class->min_rate = fsc->m2;
3110 class->max_rate = usc->m2;
/* Parses the class message 'tcmsg' with tc_parse_class(), which also fills in
 * '*stats'.  When the class handle has major 1 and a minor in
 * 1..HFSC_N_QUEUES, the queue ID stored in '*queue_id' is that minor minus
 * one.  The HFSC parameters are decoded into '*options' with
 * hfsc_parse_tca_options__(). */
3115 hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
3116 struct hfsc_class *options,
3117 struct netdev_queue_stats *stats)
3120 unsigned int handle;
3121 struct nlattr *nl_options;
3123 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
3129 unsigned int major, minor;
3131 major = tc_get_major(handle);
3132 minor = tc_get_minor(handle);
3133 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3134 *queue_id = minor - 1;
3141 error = hfsc_parse_tca_options__(nl_options, options);
/* Asks the kernel about class 'handle' (child of 'parent') on 'netdev' with
 * tc_query_class() and decodes the reply into '*options' and '*stats' through
 * hfsc_parse_tcmsg__(); no queue ID is requested.  The reply buffer is
 * released with ofpbuf_delete(). */
3148 hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
3149 unsigned int parent, struct hfsc_class *options,
3150 struct netdev_queue_stats *stats)
3153 struct ofpbuf *reply;
3155 error = tc_query_class(netdev, handle, parent, &reply);
3160 error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
3161 ofpbuf_delete(reply);
3166 hfsc_parse_qdisc_details__(struct netdev *netdev, const struct smap *details,
3167 struct hfsc_class *class)
3170 const char *max_rate_s;
3172 max_rate_s = smap_get(details, "max-rate");
3173 max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
3176 enum netdev_features current;
3178 netdev_get_features(netdev, ¤t, NULL, NULL, NULL);
3179 max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
3182 class->min_rate = max_rate;
3183 class->max_rate = max_rate;
/* Converts the per-queue configuration in 'details' into '*class'.
 *
 * "min-rate" and "max-rate" are divided by 8 (bits/s to bytes/s) and then
 * clamped: min_rate to at least 1 and at most the qdisc-wide max_rate,
 * max_rate to the range [min_rate, qdisc-wide max_rate]. */
3187 hfsc_parse_class_details__(struct netdev *netdev,
3188 const struct smap *details,
3189 struct hfsc_class * class)
3191 const struct hfsc *hfsc;
3192 uint32_t min_rate, max_rate;
3193 const char *min_rate_s, *max_rate_s;
3195 hfsc = hfsc_get__(netdev);
3196 min_rate_s = smap_get(details, "min-rate");
3197 max_rate_s = smap_get(details, "max-rate");
3199 min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
3200 min_rate = MAX(min_rate, 1);
3201 min_rate = MIN(min_rate, hfsc->max_rate);
3203 max_rate = (max_rate_s
3204 ? strtoull(max_rate_s, NULL, 10) / 8
3206 max_rate = MAX(max_rate, min_rate);
3207 max_rate = MIN(max_rate, hfsc->max_rate);
3209 class->min_rate = min_rate;
3210 class->max_rate = max_rate;
3215 /* Create an HFSC qdisc.
 *
 * tc_del_qdisc() is called first; the new qdisc is then requested with
 * RTM_NEWQDISC (exclusive create) as the root qdisc with handle 1:0 and kind
 * hfsc, carrying a 'struct tc_hfsc_qopt' directly as TCA_OPTIONS.  The
 * result of tc_transact() is returned.
 *
3217 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
3219 hfsc_setup_qdisc__(struct netdev * netdev)
3221 struct tcmsg *tcmsg;
3222 struct ofpbuf request;
3223 struct tc_hfsc_qopt opt;
3225 tc_del_qdisc(netdev);
3227 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3228 NLM_F_EXCL | NLM_F_CREATE, &request);
3234 tcmsg->tcm_handle = tc_make_handle(1, 0);
3235 tcmsg->tcm_parent = TC_H_ROOT;
3237 memset(&opt, 0, sizeof opt);
3240 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3241 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
3243 return tc_transact(&request, NULL);
3246 /* Create an HFSC class.
 *
 * Class 'handle' is requested under 'parent' with RTM_NEWTCLASS.  Two service
 * curves are built from '*class': 'min' with slope min_rate, sent as both the
 * real-time (RSC) and the fair (FSC) curve, and 'max' with slope max_rate,
 * sent as the upper-limit (USC) curve.  A failed netlink transaction is
 * logged with a rate-limited warning.
 *
3248 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
3249 * sc rate <min_rate> ul rate <max_rate>" */
3251 hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
3252 unsigned int parent, struct hfsc_class *class)
3256 struct tcmsg *tcmsg;
3257 struct ofpbuf request;
3258 struct tc_service_curve min, max;
3260 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
3266 tcmsg->tcm_handle = handle;
3267 tcmsg->tcm_parent = parent;
3271 min.m2 = class->min_rate;
3275 max.m2 = class->max_rate;
3277 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3278 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3279 nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
3280 nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
3281 nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
3282 nl_msg_end_nested(&request, opt_offset);
3284 error = tc_transact(&request, NULL);
3286 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
3287 "min-rate %ubps, max-rate %ubps (%s)",
3288 netdev_get_name(netdev),
3289 tc_get_major(handle), tc_get_minor(handle),
3290 tc_get_major(parent), tc_get_minor(parent),
3291 class->min_rate, class->max_rate, ovs_strerror(error));
/* Installs HFSC on 'netdev': sets up the qdisc, creates the root class 1:fffe
 * (parent 1:0) from the qdisc-level 'details', and records the resulting
 * max_rate in a new 'struct hfsc' via hfsc_install__(). */
3298 hfsc_tc_install(struct netdev *netdev, const struct smap *details)
3301 struct hfsc_class class;
3303 error = hfsc_setup_qdisc__(netdev);
3309 hfsc_parse_qdisc_details__(netdev, details, &class);
3310 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3311 tc_make_handle(1, 0), &class);
3317 hfsc_install__(netdev, class.max_rate);
/* Rebuilds the in-memory HFSC state of 'netdev' from the kernel: queries root
 * class 1:fffe for the qdisc-wide max_rate, installs a 'struct hfsc' with it,
 * and then walks a queue dump, recording every class that
 * hfsc_parse_tcmsg__() accepts through hfsc_update_queue__().  'nlmsg' is not
 * used.
 *
 * NOTE(review): the result of hfsc_query_class__() is not checked on the line
 * shown here -- confirm that 'hc.max_rate' has a defined value if the query
 * fails. */
3322 hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3325 struct nl_dump dump;
3326 struct hfsc_class hc;
3329 hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
3330 hfsc_install__(netdev, hc.max_rate);
3332 if (!start_queue_dump(netdev, &dump)) {
3336 while (nl_dump_next(&dump, &msg)) {
3337 unsigned int queue_id;
3339 if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3340 hfsc_update_queue__(netdev, queue_id, &hc);
3344 nl_dump_done(&dump);
/* Tears down the HFSC state that embeds 'tc', removing every 'struct
 * hfsc_class' from its queue map.  The _SAFE iterator is used because entries
 * are removed from the map while it is being walked. */
3349 hfsc_tc_destroy(struct tc *tc)
3352 struct hfsc_class *hc, *next;
3354 hfsc = CONTAINER_OF(tc, struct hfsc, tc);
3356 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
3357 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
/* Reports the qdisc-wide max_rate of 'netdev' in 'details' as "max-rate",
 * multiplied by 8 to convert the stored bytes/s into bits/s. */
3366 hfsc_qdisc_get(const struct netdev *netdev, struct smap *details)
3368 const struct hfsc *hfsc;
3369 hfsc = hfsc_get__(netdev);
3370 smap_add_format(details, "max-rate", "%llu", 8ULL * hfsc->max_rate);
/* Reconfigures the root class 1:fffe (parent 1:0) of 'netdev' from the
 * qdisc-level 'details' and stores the new max_rate in the HFSC state. */
3375 hfsc_qdisc_set(struct netdev *netdev, const struct smap *details)
3378 struct hfsc_class class;
3380 hfsc_parse_qdisc_details__(netdev, details, &class);
3381 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3382 tc_make_handle(1, 0), &class);
3385 hfsc_get__(netdev)->max_rate = class.max_rate;
/* Describes 'queue' in 'details'.  "min-rate" is always added; "max-rate" is
 * added only when it differs from the minimum.  Both are multiplied by 8
 * (bytes/s to bits/s).  'netdev' is not used. */
3392 hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
3393 const struct tc_queue *queue, struct smap *details)
3395 const struct hfsc_class *hc;
3397 hc = hfsc_class_cast__(queue);
3398 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
3399 if (hc->min_rate != hc->max_rate) {
3400 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
/* Configures queue 'queue_id' of 'netdev' from 'details': the settings are
 * parsed into a 'struct hfsc_class', applied to kernel class
 * 1:(queue_id + 1) under parent 1:fffe, and then cached with
 * hfsc_update_queue__(). */
3406 hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
3407 const struct smap *details)
3410 struct hfsc_class class;
3412 error = hfsc_parse_class_details__(netdev, details, &class);
3417 error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3418 tc_make_handle(1, 0xfffe), &class);
3423 hfsc_update_queue__(netdev, queue_id, &class);
/* Deletes kernel class 1:(queue_id + 1) that backs 'queue' on 'netdev' and
 * removes the queue from the HFSC queue map. */
3428 hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
3432 struct hfsc_class *hc;
3434 hc = hfsc_class_cast__(queue);
3435 hfsc = hfsc_get__(netdev);
3437 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3439 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
/* Fetches the statistics of 'queue' into '*stats' by querying kernel class
 * 1:(queue_id + 1) under parent 1:fffe.  No class options are requested
 * (NULL is passed for them). */
3446 hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3447 struct netdev_queue_stats *stats)
3449 return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3450 tc_make_handle(1, 0xfffe), NULL, stats);
/* Handles one class message 'nlmsg' from a statistics dump.  The handle and
 * statistics are parsed with tc_parse_class(); when the handle has major 1
 * and a minor in 1..HFSC_N_QUEUES, 'cb' is invoked with queue ID (minor - 1),
 * the statistics and 'aux'.  'netdev' is not used. */
3454 hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3455 const struct ofpbuf *nlmsg,
3456 netdev_dump_queue_stats_cb *cb, void *aux)
3458 struct netdev_queue_stats stats;
3459 unsigned int handle, major, minor;
3462 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3467 major = tc_get_major(handle);
3468 minor = tc_get_minor(handle);
3469 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3470 (*cb)(minor - 1, &stats, aux);
/* Operations table that binds the kernel qdisc "hfsc" to the OVS QoS type
 * "linux-hfsc", advertising HFSC_N_QUEUES queues.  Every slot is implemented
 * by the hfsc_* function named after it. */
3475 static const struct tc_ops tc_ops_hfsc = {
3476 "hfsc", /* linux_name */
3477 "linux-hfsc", /* ovs_name */
3478 HFSC_N_QUEUES, /* n_queues */
3479 hfsc_tc_install, /* tc_install */
3480 hfsc_tc_load, /* tc_load */
3481 hfsc_tc_destroy, /* tc_destroy */
3482 hfsc_qdisc_get, /* qdisc_get */
3483 hfsc_qdisc_set, /* qdisc_set */
3484 hfsc_class_get, /* class_get */
3485 hfsc_class_set, /* class_set */
3486 hfsc_class_delete, /* class_delete */
3487 hfsc_class_get_stats, /* class_get_stats */
3488 hfsc_class_dump_stats /* class_dump_stats */
3491 /* "linux-default" traffic control class.
3493 * This class represents the default, unnamed Linux qdisc. It corresponds to
3494 * the "" (empty string) QoS type in the OVS database. */
/* Points the tc of 'netdev_' at a single function-local static 'struct tc'
 * initialized for tc_ops_default.  Nothing is allocated per device; every
 * device that uses this class shares that one object. */
3497 default_install__(struct netdev *netdev_)
3499 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3500 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_default);
3502 /* Nothing but a tc class implementation is allowed to write to a tc. This
3503 * class never does that, so we can legitimately use a const tc object. */
3504 netdev->tc = CONST_CAST(struct tc *, &tc);
/* tc_install implementation for the default class: installs the shared
 * default tc on 'netdev'.  'details' is not used. */
3508 default_tc_install(struct netdev *netdev,
3509 const struct smap *details OVS_UNUSED)
3511 default_install__(netdev);
/* tc_load implementation for the default class: installs the shared default
 * tc on 'netdev'.  'nlmsg' is not used. */
3516 default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3518 default_install__(netdev);
/* Operations table for the default class.  It has no Linux qdisc name, and
 * none of the destroy, qdisc or per-class operations listed below is
 * implemented (all NULL). */
3522 static const struct tc_ops tc_ops_default = {
3523 NULL, /* linux_name */
3528 NULL, /* tc_destroy */
3529 NULL, /* qdisc_get */
3530 NULL, /* qdisc_set */
3531 NULL, /* class_get */
3532 NULL, /* class_set */
3533 NULL, /* class_delete */
3534 NULL, /* class_get_stats */
3535 NULL /* class_dump_stats */
3538 /* "linux-other" traffic control class. */
/* tc_load implementation for the "linux-other" class: points the tc of
 * 'netdev_' at a single function-local static 'struct tc' initialized for
 * tc_ops_other.  'nlmsg' is not used. */
3543 other_tc_load(struct netdev *netdev_, struct ofpbuf *nlmsg OVS_UNUSED)
3545 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3546 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_other);
3548 /* Nothing but a tc class implementation is allowed to write to a tc. This
3549 * class never does that, so we can legitimately use a const tc object. */
3550 netdev->tc = CONST_CAST(struct tc *, &tc);
/* Operations table for the "linux-other" QoS type.  It has no Linux qdisc
 * name and cannot be installed (tc_install is NULL); none of the destroy,
 * qdisc or per-class operations is implemented either. */
3554 static const struct tc_ops tc_ops_other = {
3555 NULL, /* linux_name */
3556 "linux-other", /* ovs_name */
3558 NULL, /* tc_install */
3560 NULL, /* tc_destroy */
3561 NULL, /* qdisc_get */
3562 NULL, /* qdisc_set */
3563 NULL, /* class_get */
3564 NULL, /* class_set */
3565 NULL, /* class_delete */
3566 NULL, /* class_get_stats */
3567 NULL /* class_dump_stats */
3570 /* Traffic control. */
3572 /* Number of kernel "tc" ticks per second. */
/* Computed as a * c / b from the first three values read from
 * /proc/net/psched. */
3573 static double ticks_per_s;
3575 /* Number of kernel "jiffies" per second. This is used for the purpose of
3576 * computing buffer sizes. Generally kernel qdiscs need to be able to buffer
3577 * one jiffy's worth of data.
3579 * There are two possibilities here:
3581 * - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
3582 * approximate range of 100 to 1024. That means that we really need to
3583 * make sure that the qdisc can buffer that much data.
3585 * - 'buffer_hz' is an absurdly large number. That means that the kernel
3586 * has finely granular timers and there's no need to fudge additional room
3587 * for buffers. (There's no extra effort needed to implement that: the
3588 * large 'buffer_hz' is used as a divisor, so practically any number will
3589 * come out as 0 in the division. Small integer results in the case of
3590 * really high dividends won't have any real effect anyhow.) */
3592 static unsigned int buffer_hz;
3594 /* Returns tc handle 'major':'minor'. */
/* 'major' is shifted into the upper 16 bits before being combined with
 * 'minor' by TC_H_MAKE. */
3596 tc_make_handle(unsigned int major, unsigned int minor)
3598 return TC_H_MAKE(major << 16, minor);
3601 /* Returns the major number from 'handle'. */
/* TC_H_MAJ leaves the major in place, so it is shifted down by 16 bits to
 * undo the shift applied by tc_make_handle(). */
3603 tc_get_major(unsigned int handle)
3605 return TC_H_MAJ(handle) >> 16;
3608 /* Returns the minor number from 'handle'. */
/* The value extracted by TC_H_MIN is returned without any shift. */
3610 tc_get_minor(unsigned int handle)
3612 return TC_H_MIN(handle);
/* Starts a traffic-control netlink request for 'netdev' in 'request'.
 *
 * 'request' is initialized with a 512-byte buffer and receives a netlink
 * header of the given 'type' with NLM_F_REQUEST | 'flags', followed by a
 * zeroed 'struct tcmsg' whose family is AF_UNSPEC and whose ifindex comes
 * from get_ifindex().  The tcmsg is what callers receive back so that they
 * can fill in tcm_handle and tcm_parent themselves. */
3615 static struct tcmsg *
3616 tc_make_request(const struct netdev *netdev, int type, unsigned int flags,
3617 struct ofpbuf *request)
3619 struct tcmsg *tcmsg;
3623 error = get_ifindex(netdev, &ifindex);
3628 ofpbuf_init(request, 512);
3629 nl_msg_put_nlmsghdr(request, sizeof *tcmsg, type, NLM_F_REQUEST | flags);
3630 tcmsg = ofpbuf_put_zeros(request, sizeof *tcmsg);
3631 tcmsg->tcm_family = AF_UNSPEC;
3632 tcmsg->tcm_ifindex = ifindex;
3633 /* Caller should fill in tcmsg->tcm_handle. */
3634 /* Caller should fill in tcmsg->tcm_parent. */
/* Sends 'request' over NETLINK_ROUTE with nl_transact(), passing 'replyp'
 * through for the reply.  'request' is uninitialized afterwards whether or
 * not the transaction succeeded, so callers must not reuse or free it. */
3640 tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
3642 int error = nl_transact(NETLINK_ROUTE, request, replyp);
3643 ofpbuf_uninit(request);
3647 /* Adds or deletes a root ingress qdisc on 'netdev'. We use this for
3648 * policing configuration.
3650 * This function is equivalent to running the following when 'add' is true:
3651 * /sbin/tc qdisc add dev <devname> handle ffff: ingress
3653 * This function is equivalent to running the following when 'add' is false:
3654 * /sbin/tc qdisc del dev <devname> handle ffff: ingress
3656 * The configuration and stats may be seen with the following command:
3657 * /sbin/tc -s qdisc show dev <devname>
3659 * Returns 0 if successful, otherwise a positive errno value. */
/* Builds and sends the request for the ingress qdisc of 'netdev'.
 *
 * When 'add' is true the message is RTM_NEWQDISC with exclusive-create flags;
 * otherwise it is RTM_DELQDISC with no extra flags.  Either way it names
 * handle ffff:0, parent TC_H_INGRESS and kind "ingress", with an empty
 * TCA_OPTIONS attribute.  On deletion, ENOENT and EINVAL from the kernel are
 * singled out for lenient treatment. */
3662 tc_add_del_ingress_qdisc(struct netdev *netdev, bool add)
3664 struct ofpbuf request;
3665 struct tcmsg *tcmsg;
3667 int type = add ? RTM_NEWQDISC : RTM_DELQDISC;
3668 int flags = add ? NLM_F_EXCL | NLM_F_CREATE : 0;
3670 tcmsg = tc_make_request(netdev, type, flags, &request);
3674 tcmsg->tcm_handle = tc_make_handle(0xffff, 0);
3675 tcmsg->tcm_parent = TC_H_INGRESS;
3676 nl_msg_put_string(&request, TCA_KIND, "ingress");
3677 nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0);
3679 error = tc_transact(&request, NULL);
3681 /* If we're deleting the qdisc, don't worry about some of the
3682 * error conditions. */
3683 if (!add && (error == ENOENT || error == EINVAL)) {
3692 /* Adds a policer to 'netdev' with a rate of 'kbits_rate' and a burst size
 * of 'kbits_burst'.
3695 * This function is equivalent to running:
3696 * /sbin/tc filter add dev <devname> parent ffff: protocol all prio 49
3697 * basic police rate <kbits_rate>kbit burst <kbits_burst>k
3700 * The configuration and stats may be seen with the following command:
3701 * /sbin/tc -s filter show dev <devname> parent ffff:
3703 * Returns 0 if successful, otherwise a positive errno value. */
3706 tc_add_policer(struct netdev *netdev, int kbits_rate, int kbits_burst)
3708 struct tc_police tc_police;
3709 struct ofpbuf request;
3710 struct tcmsg *tcmsg;
3711 size_t basic_offset;
3712 size_t police_offset;
3716 memset(&tc_police, 0, sizeof tc_police);
3717 tc_police.action = TC_POLICE_SHOT;
3718 tc_police.mtu = mtu;
3719 tc_fill_rate(&tc_police.rate, (kbits_rate * 1000)/8, mtu);
3720 tc_police.burst = tc_bytes_to_ticks(tc_police.rate.rate,
3721 kbits_burst * 1024);
3723 tcmsg = tc_make_request(netdev, RTM_NEWTFILTER,
3724 NLM_F_EXCL | NLM_F_CREATE, &request);
3728 tcmsg->tcm_parent = tc_make_handle(0xffff, 0);
3729 tcmsg->tcm_info = tc_make_handle(49,
3730 (OVS_FORCE uint16_t) htons(ETH_P_ALL));
3732 nl_msg_put_string(&request, TCA_KIND, "basic");
3733 basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3734 police_offset = nl_msg_start_nested(&request, TCA_BASIC_POLICE);
3735 nl_msg_put_unspec(&request, TCA_POLICE_TBF, &tc_police, sizeof tc_police);
3736 tc_put_rtab(&request, TCA_POLICE_RATE, &tc_police.rate);
3737 nl_msg_end_nested(&request, police_offset);
3738 nl_msg_end_nested(&request, basic_offset);
3740 error = tc_transact(&request, NULL);
3751 /* The values in psched are not individually very meaningful, but they are
3752 * important. The tables below show some values seen in the wild.
3756 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
3757 * (Before that, there are hints that it was 1000000000.)
3759 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
3763 * -----------------------------------
3764 * [1] 000c8000 000f4240 000f4240 00000064
3765 * [2] 000003e8 00000400 000f4240 3b9aca00
3766 * [3] 000003e8 00000400 000f4240 3b9aca00
3767 * [4] 000003e8 00000400 000f4240 00000064
3768 * [5] 000003e8 00000040 000f4240 3b9aca00
3769 * [6] 000003e8 00000040 000f4240 000000f9
3771 * a b c d ticks_per_s buffer_hz
3772 * ------- --------- ---------- ------------- ----------- -------------
3773 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
3774 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3775 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3776 * [4] 1,000 1,024 1,000,000 100 976,562 100
3777 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
3778 * [6] 1,000 64 1,000,000 249 15,625,000 249
3780 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
3781 * [2] 2.6.26-1-686-bigmem from Debian lenny
3782 * [3] 2.6.26-2-sparc64 from Debian lenny
3783 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
3784 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
3785 * [6] 2.6.34 from kernel.org on KVM */
/* One-time reader for /proc/net/psched.  The ovsthread_once guard makes the
 * work below run only once.  Four hexadecimal values a, b, c, d are read
 * with fscanf(); 'ticks_per_s' is computed as a * c / b.  Failure to open or
 * read the file, or implausible values, is reported with a warning, and the
 * parameters as well as the derived 'ticks_per_s' and 'buffer_hz' are logged
 * at debug level. */
3787 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
3788 static const char fn[] = "/proc/net/psched";
3789 unsigned int a, b, c, d;
3792 if (!ovsthread_once_start(&once)) {
3799 stream = fopen(fn, "r");
3801 VLOG_WARN("%s: open failed: %s", fn, ovs_strerror(errno));
3805 if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
3806 VLOG_WARN("%s: read failed", fn);
3810 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
3814 VLOG_WARN("%s: invalid scheduler parameters", fn);
3818 ticks_per_s = (double) a * c / b;
3822 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
3825 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);
3828 ovsthread_once_done(&once);
3831 /* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
3832 * rate of 'rate' bytes per second. */
3834 tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
3837 return (rate * ticks) / ticks_per_s;
3840 /* Returns the number of ticks that it would take to transmit 'size' bytes at a
3841 * rate of 'rate' bytes per second. */
/* A 'rate' of zero yields 0 instead of dividing by zero.  'ticks_per_s' is
 * converted to 'unsigned long long' (dropping any fraction) before it is
 * multiplied by 'size', so the product is computed in 64 bits. */
3843 tc_bytes_to_ticks(unsigned int rate, unsigned int size)
3846 return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
3849 /* Returns the number of bytes that need to be reserved for qdisc buffering at
3850 * a transmission rate of 'rate' bytes per second. */
/* This is one jiffy's worth of data: 'rate' divided by 'buffer_hz'.
 * NOTE(review): the division assumes 'buffer_hz' is nonzero -- confirm that
 * it is always set before this point. */
3852 tc_buffer_per_jiffy(unsigned int rate)
3855 return rate / buffer_hz;
3858 /* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
3859 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
3860 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
3861 * stores NULL into it if it is absent.
3863 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
 * 'msg'.
3866 * Returns 0 if successful, otherwise a positive errno value. */
/* The attributes are parsed starting after the netlink header and the
 * 'struct tcmsg'.  TCA_KIND is a mandatory string and TCA_OPTIONS an optional
 * nested attribute; a message that does not satisfy this policy is reported
 * with a rate-limited warning. */
3868 tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
3869 struct nlattr **options)
3871 static const struct nl_policy tca_policy[] = {
3872 [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
3873 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
3875 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3877 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3878 tca_policy, ta, ARRAY_SIZE(ta))) {
3879 VLOG_WARN_RL(&rl, "failed to parse qdisc message");
3884 *kind = nl_attr_get_string(ta[TCA_KIND]);
3888 *options = ta[TCA_OPTIONS];
3903 /* Given Netlink 'msg' that describes a class, extracts its class handle
3904 * (the 'tcm_handle' of the message) into '*handlep', its TCA_OPTIONS attribute
3905 * into '*options', and its queue statistics into '*stats'. Any of the output
3906 * arguments may be null.
3908 * Returns 0 if successful, otherwise a positive errno value. */
3910 tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
3911 struct nlattr **options, struct netdev_queue_stats *stats)
3913 static const struct nl_policy tca_policy[] = {
3914 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
3915 [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
3917 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3919 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3920 tca_policy, ta, ARRAY_SIZE(ta))) {
3921 VLOG_WARN_RL(&rl, "failed to parse class message");
3926 struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
3927 *handlep = tc->tcm_handle;
3931 *options = ta[TCA_OPTIONS];
3935 const struct gnet_stats_queue *gsq;
3936 struct gnet_stats_basic gsb;
3938 static const struct nl_policy stats_policy[] = {
3939 [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
3940 .min_len = sizeof gsb },
3941 [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
3942 .min_len = sizeof *gsq },
3944 struct nlattr *sa[ARRAY_SIZE(stats_policy)];
3946 if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
3947 sa, ARRAY_SIZE(sa))) {
3948 VLOG_WARN_RL(&rl, "failed to parse class stats");
3952 /* Alignment issues screw up the length of struct gnet_stats_basic on
3953 * some arch/bitsize combinations. Newer versions of Linux have a
3954 * struct gnet_stats_basic_packed, but we can't depend on that. The
3955 * easiest thing to do is just to make a copy. */
3956 memset(&gsb, 0, sizeof gsb);
3957 memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
3958 MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
3959 stats->tx_bytes = gsb.bytes;
3960 stats->tx_packets = gsb.packets;
3962 gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
3963 stats->tx_errors = gsq->drops;
3973 memset(stats, 0, sizeof *stats);
3978 /* Queries the kernel for class with identifier 'handle' and parent 'parent'
3981 tc_query_class(const struct netdev *netdev,
3982 unsigned int handle, unsigned int parent,
3983 struct ofpbuf **replyp)
3985 struct ofpbuf request;
3986 struct tcmsg *tcmsg;
3989 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO, &request);
3993 tcmsg->tcm_handle = handle;
3994 tcmsg->tcm_parent = parent;
3996 error = tc_transact(&request, replyp);
3998 VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
3999 netdev_get_name(netdev),
4000 tc_get_major(handle), tc_get_minor(handle),
4001 tc_get_major(parent), tc_get_minor(parent),
4002 ovs_strerror(error));
4007 /* Equivalent to "tc class del dev <name> handle <handle>". */
4009 tc_delete_class(const struct netdev *netdev, unsigned int handle)
4011 struct ofpbuf request;
4012 struct tcmsg *tcmsg;
4015 tcmsg = tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
4019 tcmsg->tcm_handle = handle;
4020 tcmsg->tcm_parent = 0;
4022 error = tc_transact(&request, NULL);
4024 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
4025 netdev_get_name(netdev),
4026 tc_get_major(handle), tc_get_minor(handle),
4027 ovs_strerror(error));
4032 /* Equivalent to "tc qdisc del dev <name> root". */
4034 tc_del_qdisc(struct netdev *netdev_)
4036 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4037 struct ofpbuf request;
4038 struct tcmsg *tcmsg;
4041 tcmsg = tc_make_request(netdev_, RTM_DELQDISC, 0, &request);
4045 tcmsg->tcm_handle = tc_make_handle(1, 0);
4046 tcmsg->tcm_parent = TC_H_ROOT;
4048 error = tc_transact(&request, NULL);
4049 if (error == EINVAL) {
4050 /* EINVAL probably means that the default qdisc was in use, in which
4051 * case we've accomplished our purpose. */
4054 if (!error && netdev->tc) {
4055 if (netdev->tc->ops->tc_destroy) {
4056 netdev->tc->ops->tc_destroy(netdev->tc);
4063 /* If 'netdev''s qdisc type and parameters are not yet known, queries the
4064 * kernel to determine what they are. Returns 0 if successful, otherwise a
4065 * positive errno value. */
4067 tc_query_qdisc(const struct netdev *netdev_)
4069 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4070 struct ofpbuf request, *qdisc;
4071 const struct tc_ops *ops;
4072 struct tcmsg *tcmsg;
4080 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
4081 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
4082 * 2.6.35 without that fix backported to it.
4084 * To avoid the OOPS, we must not make a request that would attempt to dump
4085 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
4086 * few others. There are a few ways that I can see to do this, but most of
4087 * them seem to be racy (and if you lose the race the kernel OOPSes). The
4088 * technique chosen here is to assume that any non-default qdisc that we
4089 * create will have a class with handle 1:0. The built-in qdiscs only have
4090 * a class with handle 0:0.
4092 * We could check for Linux 2.6.35+ and use a more straightforward method
4094 tcmsg = tc_make_request(netdev_, RTM_GETQDISC, NLM_F_ECHO, &request);
4098 tcmsg->tcm_handle = tc_make_handle(1, 0);
4099 tcmsg->tcm_parent = 0;
4101 /* Figure out what tc class to instantiate. */
4102 error = tc_transact(&request, &qdisc);
4106 error = tc_parse_qdisc(qdisc, &kind, NULL);
4108 ops = &tc_ops_other;
4110 ops = tc_lookup_linux_name(kind);
4112 static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
4113 VLOG_INFO_RL(&rl2, "unknown qdisc \"%s\"", kind);
4115 ops = &tc_ops_other;
4118 } else if (error == ENOENT) {
4119 /* Either it's a built-in qdisc, or it's a qdisc set up by some
4120 * other entity that doesn't have a handle 1:0. We will assume
4121 * that it's the system default qdisc. */
4122 ops = &tc_ops_default;
4125 /* Who knows? Maybe the device got deleted. */
4126 VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
4127 netdev_get_name(netdev_), ovs_strerror(error));
4128 ops = &tc_ops_other;
4131 /* Instantiate it. */
4132 load_error = ops->tc_load(CONST_CAST(struct netdev *, netdev_), qdisc);
4133 ovs_assert((load_error == 0) == (netdev->tc != NULL));
4134 ofpbuf_delete(qdisc);
4136 return error ? error : load_error;
4139 /* Linux traffic control uses tables with 256 entries ("rtab" tables) to
4140 approximate the time to transmit packets of various lengths. For an MTU of
4141 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
4142 represents two possible packet lengths; for a MTU of 513 through 1024, four
4143 possible lengths; and so on.
4145 Returns, for the specified 'mtu', the number of bits that packet lengths
4146 need to be shifted right to fit within such a 256-entry table. */
4148 tc_calc_cell_log(unsigned int mtu)
4153 mtu = ETH_PAYLOAD_MAX;
4155 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
4157 for (cell_log = 0; mtu >= 256; cell_log++) {
4164 /* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
4167 tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
4169 memset(rate, 0, sizeof *rate);
4170 rate->cell_log = tc_calc_cell_log(mtu);
4171 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
4172 /* rate->cell_align = 0; */ /* distro headers. */
4173 rate->mpu = ETH_TOTAL_MIN;
4177 /* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
4178 * attribute of the specified "type".
4180 * See tc_calc_cell_log() above for a description of "rtab"s. */
4182 tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
4187 rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
4188 for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
4189 unsigned packet_size = (i + 1) << rate->cell_log;
4190 if (packet_size < rate->mpu) {
4191 packet_size = rate->mpu;
4193 rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
/* Calculates the proper value of 'buffer' or 'cbuffer' in HTB options given a
 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
 * burst size of 'burst_bytes'.  (If no value was requested, a 'burst_bytes' of
 * 0 is fine.)
 *
 * The burst actually used is never smaller than one jiffy's worth of data at
 * 'Bps' plus one 'mtu'.  The result is expressed in ticks. */
static int
tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
{
    unsigned int floor_bytes = tc_buffer_per_jiffy(Bps) + mtu;
    uint64_t burst = burst_bytes > floor_bytes ? burst_bytes : floor_bytes;

    return tc_bytes_to_ticks(Bps, burst);
}
4208 /* Linux-only functions declared in netdev-linux.h */
4210 /* Returns a fd for an AF_INET socket or a negative errno value. */
4212 netdev_linux_get_af_inet_sock(void)
4214 int error = netdev_linux_init();
4215 return error ? -error : af_inet_sock;
4218 /* Modifies the 'flag' bit in ethtool's flags field for 'netdev'. If
4219 * 'enable' is true, the bit is set. Otherwise, it is cleared. */
4221 netdev_linux_ethtool_set_flag(struct netdev *netdev, uint32_t flag,
4222 const char *flag_name, bool enable)
4224 const char *netdev_name = netdev_get_name(netdev);
4225 struct ethtool_value evalue;
4229 COVERAGE_INC(netdev_get_ethtool);
4230 memset(&evalue, 0, sizeof evalue);
4231 error = netdev_linux_do_ethtool(netdev_name,
4232 (struct ethtool_cmd *)&evalue,
4233 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
4238 COVERAGE_INC(netdev_set_ethtool);
4239 evalue.data = new_flags = (evalue.data & ~flag) | (enable ? flag : 0);
4240 error = netdev_linux_do_ethtool(netdev_name,
4241 (struct ethtool_cmd *)&evalue,
4242 ETHTOOL_SFLAGS, "ETHTOOL_SFLAGS");
4247 COVERAGE_INC(netdev_get_ethtool);
4248 memset(&evalue, 0, sizeof evalue);
4249 error = netdev_linux_do_ethtool(netdev_name,
4250 (struct ethtool_cmd *)&evalue,
4251 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
4256 if (new_flags != evalue.data) {
4257 VLOG_WARN_RL(&rl, "attempt to %s ethtool %s flag on network "
4258 "device %s failed", enable ? "enable" : "disable",
4259 flag_name, netdev_name);
4266 /* Utility functions. */
4268 /* Copies 'src' into 'dst', performing format conversion in the process. */
4270 netdev_stats_from_rtnl_link_stats(struct netdev_stats *dst,
4271 const struct rtnl_link_stats *src)
4273 dst->rx_packets = src->rx_packets;
4274 dst->tx_packets = src->tx_packets;
4275 dst->rx_bytes = src->rx_bytes;
4276 dst->tx_bytes = src->tx_bytes;
4277 dst->rx_errors = src->rx_errors;
4278 dst->tx_errors = src->tx_errors;
4279 dst->rx_dropped = src->rx_dropped;
4280 dst->tx_dropped = src->tx_dropped;
4281 dst->multicast = src->multicast;
4282 dst->collisions = src->collisions;
4283 dst->rx_length_errors = src->rx_length_errors;
4284 dst->rx_over_errors = src->rx_over_errors;
4285 dst->rx_crc_errors = src->rx_crc_errors;
4286 dst->rx_frame_errors = src->rx_frame_errors;
4287 dst->rx_fifo_errors = src->rx_fifo_errors;
4288 dst->rx_missed_errors = src->rx_missed_errors;
4289 dst->tx_aborted_errors = src->tx_aborted_errors;
4290 dst->tx_carrier_errors = src->tx_carrier_errors;
4291 dst->tx_fifo_errors = src->tx_fifo_errors;
4292 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
4293 dst->tx_window_errors = src->tx_window_errors;
4297 get_stats_via_netlink(int ifindex, struct netdev_stats *stats)
4299 /* Policy for RTNLGRP_LINK messages.
4301 * There are *many* more fields in these messages, but currently we only
4302 * care about these fields. */
4303 static const struct nl_policy rtnlgrp_link_policy[] = {
4304 [IFLA_IFNAME] = { .type = NL_A_STRING, .optional = false },
4305 [IFLA_STATS] = { .type = NL_A_UNSPEC, .optional = true,
4306 .min_len = sizeof(struct rtnl_link_stats) },
4309 struct ofpbuf request;
4310 struct ofpbuf *reply;
4311 struct ifinfomsg *ifi;
4312 struct nlattr *attrs[ARRAY_SIZE(rtnlgrp_link_policy)];
4315 ofpbuf_init(&request, 0);
4316 nl_msg_put_nlmsghdr(&request, sizeof *ifi, RTM_GETLINK, NLM_F_REQUEST);
4317 ifi = ofpbuf_put_zeros(&request, sizeof *ifi);
4318 ifi->ifi_family = PF_UNSPEC;
4319 ifi->ifi_index = ifindex;
4320 error = nl_transact(NETLINK_ROUTE, &request, &reply);
4321 ofpbuf_uninit(&request);
4326 if (!nl_policy_parse(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg),
4327 rtnlgrp_link_policy,
4328 attrs, ARRAY_SIZE(rtnlgrp_link_policy))) {
4329 ofpbuf_delete(reply);
4333 if (!attrs[IFLA_STATS]) {
4334 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
4335 ofpbuf_delete(reply);
4339 netdev_stats_from_rtnl_link_stats(stats, nl_attr_get(attrs[IFLA_STATS]));
4341 ofpbuf_delete(reply);
4347 get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats)
4349 static const char fn[] = "/proc/net/dev";
4354 stream = fopen(fn, "r");
4356 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, ovs_strerror(errno));
4361 while (fgets(line, sizeof line, stream)) {
4364 #define X64 "%"SCNu64
4367 X64 X64 X64 X64 X64 X64 X64 "%*u"
4368 X64 X64 X64 X64 X64 X64 X64 "%*u",
4374 &stats->rx_fifo_errors,
4375 &stats->rx_frame_errors,
4381 &stats->tx_fifo_errors,
4383 &stats->tx_carrier_errors) != 15) {
4384 VLOG_WARN_RL(&rl, "%s:%d: parse error", fn, ln);
4385 } else if (!strcmp(devname, netdev_name)) {
4386 stats->rx_length_errors = UINT64_MAX;
4387 stats->rx_over_errors = UINT64_MAX;
4388 stats->rx_crc_errors = UINT64_MAX;
4389 stats->rx_missed_errors = UINT64_MAX;
4390 stats->tx_aborted_errors = UINT64_MAX;
4391 stats->tx_heartbeat_errors = UINT64_MAX;
4392 stats->tx_window_errors = UINT64_MAX;
4398 VLOG_WARN_RL(&rl, "%s: no stats for %s", fn, netdev_name);
4404 get_flags(const struct netdev *dev, unsigned int *flags)
4410 error = netdev_linux_do_ioctl(dev->name, &ifr, SIOCGIFFLAGS,
4413 *flags = ifr.ifr_flags;
4419 set_flags(const char *name, unsigned int flags)
4423 ifr.ifr_flags = flags;
4424 return netdev_linux_do_ioctl(name, &ifr, SIOCSIFFLAGS, "SIOCSIFFLAGS");
4428 do_get_ifindex(const char *netdev_name)
4432 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4433 COVERAGE_INC(netdev_get_ifindex);
4434 if (ioctl(af_inet_sock, SIOCGIFINDEX, &ifr) < 0) {
4435 VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
4436 netdev_name, ovs_strerror(errno));
4439 return ifr.ifr_ifindex;
4443 get_ifindex(const struct netdev *netdev_, int *ifindexp)
4445 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4447 if (!(netdev->cache_valid & VALID_IFINDEX)) {
4448 int ifindex = do_get_ifindex(netdev_get_name(netdev_));
4451 netdev->get_ifindex_error = -ifindex;
4452 netdev->ifindex = 0;
4454 netdev->get_ifindex_error = 0;
4455 netdev->ifindex = ifindex;
4457 netdev->cache_valid |= VALID_IFINDEX;
4460 *ifindexp = netdev->ifindex;
4461 return netdev->get_ifindex_error;
4465 get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN])
4470 memset(&ifr, 0, sizeof ifr);
4471 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4472 COVERAGE_INC(netdev_get_hwaddr);
4473 if (ioctl(af_inet_sock, SIOCGIFHWADDR, &ifr) < 0) {
4474 /* ENODEV probably means that a vif disappeared asynchronously and
4475 * hasn't been removed from the database yet, so reduce the log level
4476 * to INFO for that case. */
4477 VLOG(errno == ENODEV ? VLL_INFO : VLL_ERR,
4478 "ioctl(SIOCGIFHWADDR) on %s device failed: %s",
4479 netdev_name, ovs_strerror(errno));
4482 hwaddr_family = ifr.ifr_hwaddr.sa_family;
4483 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER) {
4484 VLOG_WARN("%s device has unknown hardware address family %d",
4485 netdev_name, hwaddr_family);
4487 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
4492 set_etheraddr(const char *netdev_name,
4493 const uint8_t mac[ETH_ADDR_LEN])
4497 memset(&ifr, 0, sizeof ifr);
4498 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4499 ifr.ifr_hwaddr.sa_family = ARPHRD_ETHER;
4500 memcpy(ifr.ifr_hwaddr.sa_data, mac, ETH_ADDR_LEN);
4501 COVERAGE_INC(netdev_set_hwaddr);
4502 if (ioctl(af_inet_sock, SIOCSIFHWADDR, &ifr) < 0) {
4503 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
4504 netdev_name, ovs_strerror(errno));
4511 netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
4512 int cmd, const char *cmd_name)
4516 memset(&ifr, 0, sizeof ifr);
4517 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
4518 ifr.ifr_data = (caddr_t) ecmd;
4521 if (ioctl(af_inet_sock, SIOCETHTOOL, &ifr) == 0) {
4524 if (errno != EOPNOTSUPP) {
4525 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
4526 "failed: %s", cmd_name, name, ovs_strerror(errno));
4528 /* The device doesn't support this operation. That's pretty
4529 * common, so there's no point in logging anything. */
4536 netdev_linux_do_ioctl(const char *name, struct ifreq *ifr, int cmd,
4537 const char *cmd_name)
4539 ovs_strzcpy(ifr->ifr_name, name, sizeof ifr->ifr_name);
4540 if (ioctl(af_inet_sock, cmd, ifr) == -1) {
4541 VLOG_DBG_RL(&rl, "%s: ioctl(%s) failed: %s", name, cmd_name,
4542 ovs_strerror(errno));
4549 netdev_linux_get_ipv4(const struct netdev *netdev, struct in_addr *ip,
4550 int cmd, const char *cmd_name)
4555 ifr.ifr_addr.sa_family = AF_INET;
4556 error = netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, cmd, cmd_name);
4558 const struct sockaddr_in *sin = ALIGNED_CAST(struct sockaddr_in *,
4560 *ip = sin->sin_addr;
4565 /* Returns an AF_PACKET raw socket or a negative errno value. */
4567 af_packet_sock(void)
4569 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
4572 if (ovsthread_once_start(&once)) {
4573 sock = socket(AF_PACKET, SOCK_RAW, 0);
4575 int error = set_nonblocking(sock);
4582 VLOG_ERR("failed to create packet socket: %s",
4583 ovs_strerror(errno));
4585 ovsthread_once_done(&once);