2 * Copyright (c) 2009, 2010, 2011, 2012 Nicira Networks.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
19 #include "netdev-linux.h"
24 #include <arpa/inet.h>
26 #include <linux/gen_stats.h>
27 #include <linux/if_ether.h>
28 #include <linux/if_tun.h>
30 #include <linux/types.h>
31 #include <linux/ethtool.h>
32 #include <linux/mii.h>
33 #include <linux/pkt_cls.h>
34 #include <linux/pkt_sched.h>
35 #include <linux/rtnetlink.h>
36 #include <linux/sockios.h>
37 #include <linux/version.h>
38 #include <sys/types.h>
39 #include <sys/ioctl.h>
40 #include <sys/socket.h>
41 #include <netpacket/packet.h>
43 #include <net/if_arp.h>
44 #include <net/if_packet.h>
45 #include <net/route.h>
46 #include <netinet/in.h>
53 #include "dpif-linux.h"
54 #include "dynamic-string.h"
55 #include "fatal-signal.h"
58 #include "netdev-provider.h"
59 #include "netdev-vport.h"
61 #include "netlink-notifier.h"
62 #include "netlink-socket.h"
64 #include "openflow/openflow.h"
66 #include "poll-loop.h"
67 #include "rtnetlink-link.h"
68 #include "socket-util.h"
74 VLOG_DEFINE_THIS_MODULE(netdev_linux);
76 COVERAGE_DEFINE(netdev_set_policing);
77 COVERAGE_DEFINE(netdev_arp_lookup);
78 COVERAGE_DEFINE(netdev_get_ifindex);
79 COVERAGE_DEFINE(netdev_get_hwaddr);
80 COVERAGE_DEFINE(netdev_set_hwaddr);
81 COVERAGE_DEFINE(netdev_ethtool);
84 /* These were introduced in Linux 2.6.14, so they might be missing if we have
86 #ifndef ADVERTISED_Pause
87 #define ADVERTISED_Pause (1 << 13)
89 #ifndef ADVERTISED_Asym_Pause
90 #define ADVERTISED_Asym_Pause (1 << 14)
93 /* These were introduced in Linux 2.6.24, so they might be missing if we
94 * have old headers. */
95 #ifndef ETHTOOL_GFLAGS
96 #define ETHTOOL_GFLAGS 0x00000025 /* Get flags bitmap(ethtool_value) */
98 #ifndef ETHTOOL_SFLAGS
99 #define ETHTOOL_SFLAGS 0x00000026 /* Set flags bitmap(ethtool_value) */
102 /* This was introduced in Linux 2.6.25, so it might be missing if we have old
105 #define TC_RTAB_SIZE 1024
108 static struct nln_notifier *netdev_linux_cache_notifier = NULL;
109 static int cache_notifier_refcount;
112 VALID_IFINDEX = 1 << 0,
113 VALID_ETHERADDR = 1 << 1,
117 VALID_POLICING = 1 << 5,
118 VALID_VPORT_STAT_ERROR = 1 << 6,
119 VALID_DRVINFO = 1 << 7,
127 /* Traffic control. */
129 /* An instance of a traffic control class. Always associated with a particular
132 * Each TC implementation subclasses this with whatever additional data it
135 const struct tc_ops *ops;
136 struct hmap queues; /* Contains "struct tc_queue"s.
137 * Read by generic TC layer.
138 * Written only by TC implementation. */
141 /* One traffic control queue.
143 * Each TC implementation subclasses this with whatever additional data it
146 struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
147 unsigned int queue_id; /* OpenFlow queue ID. */
150 /* A particular kind of traffic control. Each implementation generally maps to
151 * one particular Linux qdisc class.
153 * The functions below return 0 if successful or a positive errno value on
154 * failure, except where otherwise noted. All of them must be provided, except
155 * where otherwise noted. */
157 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
158 * This is null for tc_ops_default and tc_ops_other, for which there are no
159 * appropriate values. */
160 const char *linux_name;
162 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
163 const char *ovs_name;
165 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
166 * queues. The queues are numbered 0 through n_queues - 1. */
167 unsigned int n_queues;
169 /* Called to install this TC class on 'netdev'. The implementation should
170 * make the Netlink calls required to set up 'netdev' with the right qdisc
171 * and configure it according to 'details'. The implementation may assume
172 * that the current qdisc is the default; that is, there is no need for it
173 * to delete the current qdisc before installing itself.
175 * The contents of 'details' should be documented as valid for 'ovs_name'
176 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
177 * (which is built as ovs-vswitchd.conf.db(8)).
179 * This function must return 0 if and only if it sets 'netdev->tc' to an
180 * initialized 'struct tc'.
182 * (This function is null for tc_ops_other, which cannot be installed. For
183 * other TC classes it should always be nonnull.) */
184 int (*tc_install)(struct netdev *netdev, const struct shash *details);
186 /* Called when the netdev code determines (through a Netlink query) that
187 * this TC class's qdisc is installed on 'netdev', but we didn't install
188 * it ourselves and so don't know any of the details.
190 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
191 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
192 * implementation should parse the other attributes of 'nlmsg' as
193 * necessary to determine its configuration. If necessary it should also
194 * use Netlink queries to determine the configuration of queues on
197 * This function must return 0 if and only if it sets 'netdev->tc' to an
198 * initialized 'struct tc'. */
199 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
201 /* Destroys the data structures allocated by the implementation as part of
202 * 'tc'. (This includes destroying 'tc->queues' by calling
205 * The implementation should not need to perform any Netlink calls. If
206 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
207 * (But it may not be desirable.)
209 * This function may be null if 'tc' is trivial. */
210 void (*tc_destroy)(struct tc *tc);
212 /* Retrieves details of 'netdev->tc' configuration into 'details'.
214 * The implementation should not need to perform any Netlink calls, because
215 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
216 * cached the configuration.
218 * The contents of 'details' should be documented as valid for 'ovs_name'
219 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
220 * (which is built as ovs-vswitchd.conf.db(8)).
222 * This function may be null if 'tc' is not configurable.
224 int (*qdisc_get)(const struct netdev *netdev, struct shash *details);
226 /* Reconfigures 'netdev->tc' according to 'details', performing any
227 * required Netlink calls to complete the reconfiguration.
229 * The contents of 'details' should be documented as valid for 'ovs_name'
230 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
231 * (which is built as ovs-vswitchd.conf.db(8)).
233 * This function may be null if 'tc' is not configurable.
235 int (*qdisc_set)(struct netdev *, const struct shash *details);
237 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
238 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
240 * The contents of 'details' should be documented as valid for 'ovs_name'
241 * in the "other_config" column in the "Queue" table in
242 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
244 * The implementation should not need to perform any Netlink calls, because
245 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
246 * cached the queue configuration.
248 * This function may be null if 'tc' does not have queues ('n_queues' is
250 int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
251 struct shash *details);
253 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
254 * 'details', performing any required Netlink calls to complete the
255 * reconfiguration. The caller ensures that 'queue_id' is less than
258 * The contents of 'details' should be documented as valid for 'ovs_name'
259 * in the "other_config" column in the "Queue" table in
260 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
262 * This function may be null if 'tc' does not have queues or its queues are
263 * not configurable. */
264 int (*class_set)(struct netdev *, unsigned int queue_id,
265 const struct shash *details);
267 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
268 * tc_queue's within 'netdev->tc->queues'.
270 * This function may be null if 'tc' does not have queues or its queues
271 * cannot be deleted. */
272 int (*class_delete)(struct netdev *, struct tc_queue *queue);
274 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
275 * 'struct tc_queue's within 'netdev->tc->queues'.
277 * On success, initializes '*stats'.
279 * This function may be null if 'tc' does not have queues or if it cannot
280 * report queue statistics. */
281 int (*class_get_stats)(const struct netdev *netdev,
282 const struct tc_queue *queue,
283 struct netdev_queue_stats *stats);
285 /* Extracts queue stats from 'nlmsg', which is a response to a
286 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
288 * This function may be null if 'tc' does not have queues or if it cannot
289 * report queue statistics. */
290 int (*class_dump_stats)(const struct netdev *netdev,
291 const struct ofpbuf *nlmsg,
292 netdev_dump_queue_stats_cb *cb, void *aux);
296 tc_init(struct tc *tc, const struct tc_ops *ops)
299 hmap_init(&tc->queues);
303 tc_destroy(struct tc *tc)
305 hmap_destroy(&tc->queues);
308 static const struct tc_ops tc_ops_htb;
309 static const struct tc_ops tc_ops_hfsc;
310 static const struct tc_ops tc_ops_default;
311 static const struct tc_ops tc_ops_other;
313 static const struct tc_ops *tcs[] = {
314 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
315 &tc_ops_hfsc, /* Hierarchical fair service curve. */
316 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
317 &tc_ops_other, /* Some other qdisc. */
321 static unsigned int tc_make_handle(unsigned int major, unsigned int minor);
322 static unsigned int tc_get_major(unsigned int handle);
323 static unsigned int tc_get_minor(unsigned int handle);
325 static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
326 static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
327 static unsigned int tc_buffer_per_jiffy(unsigned int rate);
329 static struct tcmsg *tc_make_request(const struct netdev *, int type,
330 unsigned int flags, struct ofpbuf *);
331 static int tc_transact(struct ofpbuf *request, struct ofpbuf **replyp);
332 static int tc_add_del_ingress_qdisc(struct netdev *netdev, bool add);
333 static int tc_add_policer(struct netdev *netdev, int kbits_rate,
336 static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
337 struct nlattr **options);
338 static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
339 struct nlattr **options,
340 struct netdev_queue_stats *);
341 static int tc_query_class(const struct netdev *,
342 unsigned int handle, unsigned int parent,
343 struct ofpbuf **replyp);
344 static int tc_delete_class(const struct netdev *, unsigned int handle);
346 static int tc_del_qdisc(struct netdev *netdev);
347 static int tc_query_qdisc(const struct netdev *netdev);
349 static int tc_calc_cell_log(unsigned int mtu);
350 static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
351 static void tc_put_rtab(struct ofpbuf *, uint16_t type,
352 const struct tc_ratespec *rate);
353 static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
355 struct netdev_dev_linux {
356 struct netdev_dev netdev_dev;
358 struct shash_node *shash_node;
359 unsigned int cache_valid;
360 unsigned int change_seq;
362 bool miimon; /* Link status of last poll. */
363 long long int miimon_interval; /* Miimon Poll rate. Disabled if <= 0. */
364 struct timer miimon_timer;
366 /* The following are figured out "on demand" only. They are only valid
367 * when the corresponding VALID_* bit in 'cache_valid' is set. */
369 uint8_t etheraddr[ETH_ADDR_LEN];
370 struct in_addr address, netmask;
373 unsigned int ifi_flags;
374 long long int carrier_resets;
375 uint32_t kbits_rate; /* Policing data. */
376 uint32_t kbits_burst;
377 int vport_stats_error; /* Cached error code from vport_get_stats().
378 0 or an errno value. */
379 int netdev_mtu_error; /* Cached error code from SIOCGIFMTU or SIOCSIFMTU. */
381 struct ethtool_drvinfo drvinfo; /* Cached from ETHTOOL_GDRVINFO. */
385 struct tap_state tap;
389 struct netdev_linux {
390 struct netdev netdev;
394 /* Sockets used for ioctl operations. */
395 static int af_inet_sock = -1; /* AF_INET, SOCK_DGRAM. */
397 /* A Netlink routing socket that is not subscribed to any multicast groups. */
398 static struct nl_sock *rtnl_sock;
400 /* This is set pretty low because we probably won't learn anything from the
401 * additional log messages. */
402 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
404 static int netdev_linux_init(void);
406 static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
407 int cmd, const char *cmd_name);
408 static int netdev_linux_do_ioctl(const char *name, struct ifreq *, int cmd,
409 const char *cmd_name);
410 static int netdev_linux_get_ipv4(const struct netdev *, struct in_addr *,
411 int cmd, const char *cmd_name);
412 static int get_flags(const struct netdev_dev *, unsigned int *flags);
413 static int set_flags(struct netdev *, unsigned int flags);
414 static int do_get_ifindex(const char *netdev_name);
415 static int get_ifindex(const struct netdev *, int *ifindexp);
416 static int do_set_addr(struct netdev *netdev,
417 int ioctl_nr, const char *ioctl_name,
418 struct in_addr addr);
419 static int get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN]);
420 static int set_etheraddr(const char *netdev_name, int hwaddr_family,
421 const uint8_t[ETH_ADDR_LEN]);
422 static int get_stats_via_netlink(int ifindex, struct netdev_stats *stats);
423 static int get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats);
424 static int af_packet_sock(void);
425 static void netdev_linux_miimon_run(void);
426 static void netdev_linux_miimon_wait(void);
/* Reports whether 'netdev_class' uses netdev_linux_init() as its 'init'
 * callback.  The cast helpers in this file assert on this test before
 * downcasting a generic device or handle. */
429 is_netdev_linux_class(const struct netdev_class *netdev_class)
431 return netdev_class->init == netdev_linux_init;
/* Returns the 'struct netdev_dev_linux' that embeds 'netdev_dev'.  Asserts
 * that the device's class passes is_netdev_linux_class(). */
434 static struct netdev_dev_linux *
435 netdev_dev_linux_cast(const struct netdev_dev *netdev_dev)
437 const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
438 assert(is_netdev_linux_class(netdev_class));
440 return CONTAINER_OF(netdev_dev, struct netdev_dev_linux, netdev_dev);
/* Returns the 'struct netdev_linux' that embeds 'netdev'.  Asserts that the
 * class of the handle's underlying device passes is_netdev_linux_class(). */
443 static struct netdev_linux *
444 netdev_linux_cast(const struct netdev *netdev)
446 struct netdev_dev *netdev_dev = netdev_get_dev(netdev);
447 const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
448 assert(is_netdev_linux_class(netdev_class));
450 return CONTAINER_OF(netdev, struct netdev_linux, netdev);
/* Creates the sockets shared by this file: 'af_inet_sock' (AF_INET datagram
 * socket used for ioctls) and 'rtnl_sock' (NETLINK_ROUTE socket).  'status' is
 * static, so its value persists from one call to the next. */
454 netdev_linux_init(void)
456 static int status = -1;
458 /* Create AF_INET socket. */
459 af_inet_sock = socket(AF_INET, SOCK_DGRAM, 0);
/* Capture errno immediately, before any later call can overwrite it. */
460 status = af_inet_sock >= 0 ? 0 : errno;
462 VLOG_ERR("failed to create inet socket: %s", strerror(status));
465 /* Create rtnetlink socket. */
467 status = nl_sock_create(NETLINK_ROUTE, &rtnl_sock);
469 VLOG_ERR_RL(&rl, "failed to create rtnetlink socket: %s",
/* Performs this provider's periodic work: runs the rtnetlink link monitor and
 * then the MII link monitor. */
478 netdev_linux_run(void)
480 rtnetlink_link_run();
481 netdev_linux_miimon_run();
/* Calls the wait counterparts of the two monitors driven by
 * netdev_linux_run(): the rtnetlink link monitor and the MII link monitor. */
485 netdev_linux_wait(void)
487 rtnetlink_link_wait();
488 netdev_linux_miimon_wait();
/* Makes sure 'netdev_dev->drvinfo' holds the device's driver information.
 * The cached copy is used when VALID_DRVINFO is set in 'cache_valid';
 * otherwise the structure is zeroed and refilled through
 * netdev_linux_do_ethtool(), and VALID_DRVINFO is set. */
492 netdev_linux_get_drvinfo(struct netdev_dev_linux *netdev_dev)
497 if (netdev_dev->cache_valid & VALID_DRVINFO) {
501 memset(&netdev_dev->drvinfo, 0, sizeof netdev_dev->drvinfo);
/* netdev_linux_do_ethtool() takes a 'struct ethtool_cmd *' whatever the
 * command is, hence the cast of the drvinfo buffer. */
502 error = netdev_linux_do_ethtool(netdev_dev->netdev_dev.name,
503 (struct ethtool_cmd *)&netdev_dev->drvinfo,
507 netdev_dev->cache_valid |= VALID_DRVINFO;
/* Records a state change on 'dev'.  A difference in IFF_RUNNING between the
 * stored flags and 'ifi_flags' counts as one carrier reset.  Afterwards 'dev'
 * holds 'ifi_flags', and only the 'cache_valid' bits that are also set in
 * 'mask' remain valid.
 *
 * NOTE(review): the '!dev->change_seq' test suggests that 0 is a reserved
 * value for 'change_seq' -- confirm. */
513 netdev_dev_linux_changed(struct netdev_dev_linux *dev,
514 unsigned int ifi_flags,
518 if (!dev->change_seq) {
522 if ((dev->ifi_flags ^ ifi_flags) & IFF_RUNNING) {
523 dev->carrier_resets++;
525 dev->ifi_flags = ifi_flags;
527 dev->cache_valid &= mask;
/* Applies rtnetlink notification 'change' to 'dev'.
 *
 * For RTM_NEWLINK the state is refreshed with mask VALID_DRVINFO, so cached
 * driver information is the only cached item that can survive; the MTU
 * carried by the notification can then be stored as a valid, error-free
 * cached MTU.
 *
 * The other call passes mask 0, which invalidates every cached item. */
531 netdev_dev_linux_update(struct netdev_dev_linux *dev,
532 const struct rtnetlink_link_change *change)
534 if (change->nlmsg_type == RTM_NEWLINK) {
536 netdev_dev_linux_changed(dev, change->ifi_flags, VALID_DRVINFO);
539 dev->mtu = change->mtu;
540 dev->cache_valid |= VALID_MTU;
541 dev->netdev_mtu_error = 0;
545 netdev_dev_linux_changed(dev, change->ifi_flags, 0);
/* rtnetlink link notifier callback, registered by cache_notifier_ref().
 *
 * The first part looks up the device named by 'change->ifname' and, if its
 * class passes is_netdev_linux_class(), applies 'change' to it with
 * netdev_dev_linux_update().
 *
 * The second part visits every device of 'netdev_linux_class', re-reads its
 * flags with get_flags() (ignoring the result code) and invalidates its whole
 * cache by passing mask 0.
 * NOTE(review): presumably this is the fallback taken when no specific
 * 'change' is supplied -- confirm against the branch conditions. */
550 netdev_linux_cache_cb(const struct rtnetlink_link_change *change,
551 void *aux OVS_UNUSED)
553 struct netdev_dev_linux *dev;
555 struct netdev_dev *base_dev = netdev_dev_from_name(change->ifname);
557 const struct netdev_class *netdev_class =
558 netdev_dev_get_class(base_dev);
560 if (is_netdev_linux_class(netdev_class)) {
561 dev = netdev_dev_linux_cast(base_dev);
562 netdev_dev_linux_update(dev, change);
566 struct shash device_shash;
567 struct shash_node *node;
569 shash_init(&device_shash);
570 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
571 SHASH_FOR_EACH (node, &device_shash) {
576 get_flags(&dev->netdev_dev, &flags);
577 netdev_dev_linux_changed(dev, flags, 0);
579 shash_destroy(&device_shash);
/* Takes a reference on the shared rtnetlink link notifier.  The notifier is
 * created, with netdev_linux_cache_cb() as its callback, when the reference
 * count is zero; a null result from the creation call is treated as
 * failure. */
584 cache_notifier_ref(void)
586 if (!cache_notifier_refcount) {
587 assert(!netdev_linux_cache_notifier);
589 netdev_linux_cache_notifier =
590 rtnetlink_link_notifier_create(netdev_linux_cache_cb, NULL);
592 if (!netdev_linux_cache_notifier) {
596 cache_notifier_refcount++;
/* Drops one reference on the shared rtnetlink link notifier.  When the count
 * reaches zero the notifier is destroyed and its pointer reset to NULL, so
 * that a later cache_notifier_ref() creates a fresh one. */
602 cache_notifier_unref(void)
604 assert(cache_notifier_refcount > 0);
605 if (!--cache_notifier_refcount) {
606 assert(netdev_linux_cache_notifier);
607 rtnetlink_link_notifier_destroy(netdev_linux_cache_notifier);
608 netdev_linux_cache_notifier = NULL;
612 /* Creates system and internal devices. */
/* On completion '*netdev_devp' points to the generic part of a newly
 * allocated, zero-filled 'struct netdev_dev_linux' named 'name'. */
614 netdev_linux_create(const struct netdev_class *class, const char *name,
615 struct netdev_dev **netdev_devp)
617 struct netdev_dev_linux *netdev_dev;
/* Each device holds a notifier reference; netdev_linux_destroy() drops it. */
620 error = cache_notifier_ref();
625 netdev_dev = xzalloc(sizeof *netdev_dev);
/* Start at 1: netdev_dev_linux_changed() special-cases a zero change_seq. */
626 netdev_dev->change_seq = 1;
627 netdev_dev_init(&netdev_dev->netdev_dev, name, class);
/* Prime the cached interface flags; the result code is ignored here. */
628 get_flags(&netdev_dev->netdev_dev, &netdev_dev->ifi_flags);
630 *netdev_devp = &netdev_dev->netdev_dev;
634 /* For most types of netdevs we open the device for each call of
635 * netdev_open(). However, this is not the case with tap devices,
636 * since it is only possible to open the device once. In this
637 * situation we share a single file descriptor, and consequently
638 * buffers, across all readers. Therefore once data is read it will
639 * be unavailable to other reads for tap devices. */
/* Creates tap device 'name' through "/dev/net/tun" and stores the new device
 * in '*netdev_devp'.  'class' is unused: the device is always initialized
 * with 'netdev_tap_class'. */
641 netdev_linux_create_tap(const struct netdev_class *class OVS_UNUSED,
642 const char *name, struct netdev_dev **netdev_devp)
644 struct netdev_dev_linux *netdev_dev;
645 struct tap_state *state;
646 static const char tap_dev[] = "/dev/net/tun";
650 netdev_dev = xzalloc(sizeof *netdev_dev);
651 state = &netdev_dev->state.tap;
653 error = cache_notifier_ref();
658 /* Open tap device. */
659 state->fd = open(tap_dev, O_RDWR);
662 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, strerror(error));
663 goto error_unref_notifier;
666 /* Create tap device. */
/* IFF_TAP asks for an Ethernet-level device; IFF_NO_PI omits the extra
 * packet-information header. */
667 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
668 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
/* NOTE(review): from here on 'state->fd' is open when control jumps to
 * error_unref_notifier; confirm that the cleanup path closes it. */
669 if (ioctl(state->fd, TUNSETIFF, &ifr) == -1) {
670 VLOG_WARN("%s: creating tap device failed: %s", name,
673 goto error_unref_notifier;
676 /* Make non-blocking. */
677 error = set_nonblocking(state->fd);
679 goto error_unref_notifier;
682 netdev_dev_init(&netdev_dev->netdev_dev, name, &netdev_tap_class);
683 *netdev_devp = &netdev_dev->netdev_dev;
/* Failure cleanup: drop the reference taken by cache_notifier_ref() above. */
686 error_unref_notifier:
687 cache_notifier_unref();
/* Releases the tap-specific state of 'netdev_dev'.  Acts only when
 * 'state->fd' is a valid (non-negative) descriptor. */
694 destroy_tap(struct netdev_dev_linux *netdev_dev)
696 struct tap_state *state = &netdev_dev->state.tap;
698 if (state->fd >= 0) {
703 /* Destroys the netdev device 'netdev_dev_'. */
705 netdev_linux_destroy(struct netdev_dev *netdev_dev_)
707 struct netdev_dev_linux *netdev_dev = netdev_dev_linux_cast(netdev_dev_);
708 const struct netdev_class *class = netdev_dev_get_class(netdev_dev_);
/* 'tc_destroy' is optional: tc_ops documents that it may be null when 'tc'
 * is trivial. */
710 if (netdev_dev->tc && netdev_dev->tc->ops->tc_destroy) {
711 netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
/* Only tap devices carry tap state that needs releasing. */
714 if (class == &netdev_tap_class) {
715 destroy_tap(netdev_dev);
/* Balances the cache_notifier_ref() made when the device was created. */
719 cache_notifier_unref();
/* Opens a new handle on 'netdev_dev_': allocates a zero-filled 'struct
 * netdev_linux', initializes its generic part and stores it in '*netdevp'.
 * The first handle opened on a tap device is given the device's tap
 * descriptor. */
723 netdev_linux_open(struct netdev_dev *netdev_dev_, struct netdev **netdevp)
725 struct netdev_dev_linux *netdev_dev = netdev_dev_linux_cast(netdev_dev_);
726 struct netdev_linux *netdev;
727 enum netdev_flags flags;
730 /* Allocate network device. */
731 netdev = xzalloc(sizeof *netdev);
733 netdev_init(&netdev->netdev, netdev_dev_);
735 /* Verify that the device really exists, by attempting to read its flags.
736 * (The flags might be cached, in which case this won't actually do an
739 * Don't do this for "internal" netdevs, though, because those have to be
740 * created as netdev objects before they exist in the kernel, because
741 * creating them in the kernel happens by passing a netdev object to
742 * dpif_port_add(). */
743 if (netdev_dev_get_class(netdev_dev_) != &netdev_internal_class) {
744 error = netdev_get_flags(&netdev->netdev, &flags);
745 if (error == ENODEV) {
750 if (!strcmp(netdev_dev_get_type(netdev_dev_), "tap") &&
751 !netdev_dev->state.tap.opened) {
753 /* We assume that the first user of the tap device is the primary user
754 * and give them the tap FD. Subsequent users probably just expect
755 * this to be a system device so open it normally to avoid send/receive
756 * directions appearing to be reversed. */
757 netdev->fd = netdev_dev->state.tap.fd;
758 netdev_dev->state.tap.opened = true;
761 *netdevp = &netdev->netdev;
/* Cleanup for the failure case: tears down what netdev_init() set up. */
765 netdev_uninit(&netdev->netdev, true);
769 /* Closes and destroys 'netdev'. */
771 netdev_linux_close(struct netdev *netdev_)
773 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
/* This branch is for non-tap handles only: a tap handle's 'fd' is the
 * device's shared 'state.tap.fd' (see netdev_linux_open()).  Every other
 * check in this file treats a negative 'fd' as "no descriptor", so 0 has to
 * count as a valid descriptor here too. */
775 if (netdev->fd >= 0 && strcmp(netdev_get_type(netdev_), "tap")) {
/* Prepares 'netdev_' for receiving packets: creates a non-blocking PF_PACKET
 * raw socket and binds it to the device's ifindex for all protocols. */
782 netdev_linux_listen(struct netdev *netdev_)
784 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
785 struct sockaddr_ll sll;
/* A valid descriptor is already present (netdev_linux_open() hands one to
 * the first handle on a tap device, for instance). */
790 if (netdev->fd >= 0) {
794 /* Create file descriptor. */
795 fd = socket(PF_PACKET, SOCK_RAW, 0);
798 VLOG_ERR("failed to create raw socket (%s)", strerror(error));
802 /* Set non-blocking mode. */
803 error = set_nonblocking(fd);
808 /* Get ethernet device index. */
809 error = get_ifindex(&netdev->netdev, &ifindex);
814 /* Bind to specific ethernet device. */
815 memset(&sll, 0, sizeof sll);
816 sll.sll_family = AF_PACKET;
817 sll.sll_ifindex = ifindex;
/* ETH_P_ALL selects every protocol; the field is in network byte order,
 * hence htons(). */
818 sll.sll_protocol = (OVS_FORCE unsigned short int) htons(ETH_P_ALL);
819 if (bind(fd, (struct sockaddr *) &sll, sizeof sll) < 0) {
821 VLOG_ERR("%s: failed to bind raw socket (%s)",
822 netdev_get_name(netdev_), strerror(error));
/* Receives one packet from 'netdev_' into 'data', which has room for 'size'
 * bytes.  On success returns the number of bytes received; a packet longer
 * than 'size' yields -EMSGSIZE instead.
 *
 * Tap devices are read with read(); every other device uses recv() with
 * MSG_TRUNC, so that the return value is the packet's real length even when
 * it did not fit.  That is what makes the length check below meaningful.
 *
 * Failures other than EINTR end the call; of those, EAGAIN is the only one
 * that is not logged. */
837 netdev_linux_recv(struct netdev *netdev_, void *data, size_t size)
839 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
841 if (netdev->fd < 0) {
842 /* Device is not listening. */
849 retval = (netdev_->netdev_dev->netdev_class == &netdev_tap_class
850 ? read(netdev->fd, data, size)
851 : recv(netdev->fd, data, size, MSG_TRUNC));
853 return retval <= size ? retval : -EMSGSIZE;
854 } else if (errno != EINTR) {
855 if (errno != EAGAIN) {
/* Arguments follow the format string: device name first, then the error
 * text (the same order netdev_linux_send() uses). */
856 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
857 netdev_get_name(netdev_), strerror(errno));
864 /* Registers with the poll loop to wake up from the next call to poll_block()
865 * when a packet is ready to be received with netdev_recv() on 'netdev'. */
867 netdev_linux_recv_wait(struct netdev *netdev_)
869 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
/* Without a valid descriptor there is nothing to wait on, so no wakeup is
 * registered. */
870 if (netdev->fd >= 0) {
871 poll_fd_wait(netdev->fd, POLLIN);
875 /* Discards all packets waiting to be received from 'netdev'. */
877 netdev_linux_drain(struct netdev *netdev_)
879 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
880 if (netdev->fd < 0) {
882 } else if (!strcmp(netdev_get_type(netdev_), "tap")) {
/* Tap device: drain_fd() is given the interface's transmit queue length,
 * queried with SIOCGIFTXQLEN, as its count. */
884 int error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
885 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
889 drain_fd(netdev->fd, ifr.ifr_qlen);
/* Any other device: the descriptor is drained with drain_rcvbuf(). */
892 return drain_rcvbuf(netdev->fd);
896 /* Sends 'size' bytes of 'data' on 'netdev'. Returns 0 if successful, else a
897 * positive errno value. Returns EAGAIN without blocking if the packet cannot
898 * be queued immediately. Returns EMSGSIZE if a partial packet was transmitted
899 * or if the packet is too big or too small to transmit on the device.
901 * The caller retains ownership of 'data' in all cases.
903 * The kernel maintains a packet transmission queue, so the caller is not
904 * expected to do additional queuing of packets. */
906 netdev_linux_send(struct netdev *netdev_, const void *data, size_t size)
908 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
912 if (netdev->fd < 0) {
913 /* Use our AF_PACKET socket to send to this device. */
914 struct sockaddr_ll sll;
921 sock = af_packet_sock();
926 error = get_ifindex(netdev_, &ifindex);
931 /* We don't bother setting most fields in sockaddr_ll because the
932 * kernel ignores them for SOCK_RAW. */
933 memset(&sll, 0, sizeof sll);
934 sll.sll_family = AF_PACKET;
935 sll.sll_ifindex = ifindex;
/* 'iov_base' is not const-qualified, so the const on 'data' has to be cast
 * away to fill in the iovec. */
937 iov.iov_base = (void *) data;
941 msg.msg_namelen = sizeof sll;
944 msg.msg_control = NULL;
945 msg.msg_controllen = 0;
948 retval = sendmsg(sock, &msg, 0);
950 /* Use the netdev's own fd to send to this device. This is
951 * essential for tap devices, because packets sent to a tap device
952 * with an AF_PACKET socket will loop back to be *received* again
953 * on the tap device. */
954 retval = write(netdev->fd, data, size);
958 /* The Linux AF_PACKET implementation never blocks waiting for room
959 * for packets, instead returning ENOBUFS. Translate this into
960 * EAGAIN for the caller. */
961 if (errno == ENOBUFS) {
963 } else if (errno == EINTR) {
965 } else if (errno != EAGAIN) {
966 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
967 netdev_get_name(netdev_), strerror(errno));
/* Short send: the byte count reported differs from the count requested. */
970 } else if (retval != size) {
971 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%zd bytes of "
972 "%zu) on %s", retval, size, netdev_get_name(netdev_));
980 /* Registers with the poll loop to wake up from the next call to poll_block()
981 * when the packet transmission queue has sufficient room to transmit a packet
982 * with netdev_send().
984 * The kernel maintains a packet transmission queue, so the client is not
985 * expected to do additional queuing of packets. Thus, this function is
986 * unlikely to ever be used. It is included for completeness. */
988 netdev_linux_send_wait(struct netdev *netdev_)
990 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
991 if (netdev->fd < 0) {
/* strcmp() is nonzero here, i.e. the device type is something other than
 * "tap": wait until the descriptor is writable. */
993 } else if (strcmp(netdev_get_type(netdev_), "tap")) {
994 poll_fd_wait(netdev->fd, POLLOUT);
996 /* TAP device always accepts packets.*/
997 poll_immediate_wake();
1001 /* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
1002 * otherwise a positive errno value. */
1004 netdev_linux_set_etheraddr(struct netdev *netdev_,
1005 const uint8_t mac[ETH_ADDR_LEN])
1007 struct netdev_dev_linux *netdev_dev =
1008 netdev_dev_linux_cast(netdev_get_dev(netdev_));
/* The address is written only when the cache cannot prove that the device
 * already has it: no cached address, or a cached address other than 'mac'. */
1011 if (!(netdev_dev->cache_valid & VALID_ETHERADDR)
1012 || !eth_addr_equals(netdev_dev->etheraddr, mac)) {
1013 error = set_etheraddr(netdev_get_name(netdev_), ARPHRD_ETHER, mac);
/* Remember the new address as the cached one. */
1015 netdev_dev->cache_valid |= VALID_ETHERADDR;
1016 memcpy(netdev_dev->etheraddr, mac, ETH_ADDR_LEN);
1024 /* Copies 'netdev''s MAC address into 'mac', a buffer of ETH_ADDR_LEN bytes
1025 * supplied by the caller. */
1027 netdev_linux_get_etheraddr(const struct netdev *netdev_,
1028 uint8_t mac[ETH_ADDR_LEN])
1030 struct netdev_dev_linux *netdev_dev =
1031 netdev_dev_linux_cast(netdev_get_dev(netdev_));
/* Fetch the address into the per-device cache unless a cached copy is
 * already marked valid. */
1032 if (!(netdev_dev->cache_valid & VALID_ETHERADDR)) {
1033 int error = get_etheraddr(netdev_get_name(netdev_),
1034 netdev_dev->etheraddr);
1038 netdev_dev->cache_valid |= VALID_ETHERADDR;
1040 memcpy(mac, netdev_dev->etheraddr, ETH_ADDR_LEN);
1044 /* Stores in '*mtup' the maximum size of transmitted (and received) packets on
1045 * 'netdev', in bytes, not including the hardware header (typically 1500 for
1046 * Ethernet). Returns 0 if '*mtup' was set, otherwise the cached error code. */
1048 netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
1050 struct netdev_dev_linux *netdev_dev =
1051 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1052 if (!(netdev_dev->cache_valid & VALID_MTU)) {
1056 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
1057 SIOCGIFMTU, "SIOCGIFMTU");
/* The outcome is cached together with the value, so a failed query is
 * remembered too. */
1059 netdev_dev->netdev_mtu_error = error;
1060 netdev_dev->mtu = ifr.ifr_mtu;
1061 netdev_dev->cache_valid |= VALID_MTU;
/* '*mtup' is written only when the cached query succeeded. */
1064 if (!netdev_dev->netdev_mtu_error) {
1065 *mtup = netdev_dev->mtu;
1067 return netdev_dev->netdev_mtu_error;
1070 /* Sets the maximum transmission unit (MTU) of the given device using the Linux
1071 * networking ioctl interface.
1074 netdev_linux_set_mtu(const struct netdev *netdev_, int mtu)
1076 struct netdev_dev_linux *netdev_dev =
1077 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1081 if (netdev_dev->cache_valid & VALID_MTU) {
/* A cached MTU error is reported as is, without issuing the ioctl. */
1082 if (netdev_dev->netdev_mtu_error) {
1083 return netdev_dev->netdev_mtu_error;
/* The cache already holds the requested MTU. */
1085 if (netdev_dev->mtu == mtu) {
/* The cached MTU is about to go stale. */
1088 netdev_dev->cache_valid &= ~VALID_MTU;
1091 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
1092 SIOCSIFMTU, "SIOCSIFMTU");
/* Success and ENODEV are both cached, along with the MTU left in 'ifr'. */
1093 if (!error || error == ENODEV) {
1094 netdev_dev->netdev_mtu_error = error;
1095 netdev_dev->mtu = ifr.ifr_mtu;
1096 netdev_dev->cache_valid |= VALID_MTU;
1101 /* Returns the ifindex of 'netdev', if successful, as a positive number.
1102 * On failure, returns a negative errno value. */
1104 netdev_linux_get_ifindex(const struct netdev *netdev)
1108 error = get_ifindex(netdev, &ifindex);
/* get_ifindex() reports failure as a nonzero error code, which is negated
 * here to match this function's return convention. */
1109 return error ? -error : ifindex;
/* Stores the carrier status of 'netdev_' in '*carrier'.  When MII monitoring
 * is enabled ('miimon_interval' > 0) this is the link status from the last
 * MII poll; otherwise it is whether IFF_RUNNING is set in the cached
 * interface flags. */
1113 netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
1115 struct netdev_dev_linux *netdev_dev =
1116 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1118 if (netdev_dev->miimon_interval > 0) {
1119 *carrier = netdev_dev->miimon;
1121 *carrier = (netdev_dev->ifi_flags & IFF_RUNNING) != 0;
/* Returns the device's 'carrier_resets' counter, which
 * netdev_dev_linux_changed() increments each time it sees IFF_RUNNING
 * toggle. */
1127 static long long int
1128 netdev_linux_get_carrier_resets(const struct netdev *netdev)
1130 return netdev_dev_linux_cast(netdev_get_dev(netdev))->carrier_resets;
/* Issues MII ioctl 'cmd' (named 'cmd_name') on device 'name'.  '*data' is
 * copied into the 'ifr_data' area of a zeroed ifreq before the ioctl and
 * copied back out afterwards, unconditionally. */
1134 netdev_linux_do_miimon(const char *name, int cmd, const char *cmd_name,
1135 struct mii_ioctl_data *data)
1140 memset(&ifr, 0, sizeof ifr);
1141 memcpy(&ifr.ifr_data, data, sizeof *data);
1142 error = netdev_linux_do_ioctl(name, &ifr, cmd, cmd_name);
1143 memcpy(data, &ifr.ifr_data, sizeof *data);
/* Determines the link status of device 'name' and stores it in '*miimon'.
 *
 * The MII registers are tried first: SIOCGMIIPHY, then SIOCGMIIREG on
 * MII_BMSR, whose BMSR_LSTATUS bit gives the link status.  The alternative
 * is ethtool's ETHTOOL_GLINK command. */
1149 netdev_linux_get_miimon(const char *name, bool *miimon)
1151 struct mii_ioctl_data data;
1156 memset(&data, 0, sizeof data);
1157 error = netdev_linux_do_miimon(name, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
1159 /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
1160 data.reg_num = MII_BMSR;
1161 error = netdev_linux_do_miimon(name, SIOCGMIIREG, "SIOCGMIIREG",
1165 *miimon = !!(data.val_out & BMSR_LSTATUS);
1167 VLOG_WARN_RL(&rl, "%s: failed to query MII", name);
1170 struct ethtool_cmd ecmd;
1172 VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
1175 memset(&ecmd, 0, sizeof ecmd);
1176 error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
1179 struct ethtool_value eval;
/* The reply is read as a 'struct ethtool_value'; copy it out of the
 * ethtool_cmd-typed buffer rather than aliasing the two types. */
1181 memcpy(&eval, &ecmd, sizeof eval);
1182 *miimon = !!eval.data;
1184 VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
/* Sets the MII polling interval of 'netdev_' to 'interval'.  Positive values
 * are raised to at least 100; anything else becomes 0, which disables
 * polling (see 'miimon_interval' in struct netdev_dev_linux).  When the
 * stored interval changes, timer_set_expired() is called on the device's
 * poll timer. */
1192 netdev_linux_set_miimon_interval(struct netdev *netdev_,
1193 long long int interval)
1195 struct netdev_dev_linux *netdev_dev;
1197 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev_));
1199 interval = interval > 0 ? MAX(interval, 100) : 0;
1200 if (netdev_dev->miimon_interval != interval) {
1201 netdev_dev->miimon_interval = interval;
1202 timer_set_expired(&netdev_dev->miimon_timer);
1209 netdev_linux_miimon_run(void)
1211 struct shash device_shash;
1212 struct shash_node *node;
1214 shash_init(&device_shash);
1215 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
1216 SHASH_FOR_EACH (node, &device_shash) {
1217 struct netdev_dev_linux *dev = node->data;
1220 if (dev->miimon_interval <= 0 || !timer_expired(&dev->miimon_timer)) {
1224 netdev_linux_get_miimon(dev->netdev_dev.name, &miimon);
1225 if (miimon != dev->miimon) {
1226 dev->miimon = miimon;
1227 netdev_dev_linux_changed(dev, dev->ifi_flags, 0);
1230 timer_set_duration(&dev->miimon_timer, dev->miimon_interval);
1233 shash_destroy(&device_shash);
1237 netdev_linux_miimon_wait(void)
1239 struct shash device_shash;
1240 struct shash_node *node;
1242 shash_init(&device_shash);
1243 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
1244 SHASH_FOR_EACH (node, &device_shash) {
1245 struct netdev_dev_linux *dev = node->data;
1247 if (dev->miimon_interval > 0) {
1248 timer_wait(&dev->miimon_timer);
1251 shash_destroy(&device_shash);
1254 /* Check whether we can we use RTM_GETLINK to get network device statistics.
1255 * In pre-2.6.19 kernels, this was only available if wireless extensions were
1258 check_for_working_netlink_stats(void)
1260 /* Decide on the netdev_get_stats() implementation to use. Netlink is
1261 * preferable, so if that works, we'll use it. */
1262 int ifindex = do_get_ifindex("lo");
1264 VLOG_WARN("failed to get ifindex for lo, "
1265 "obtaining netdev stats from proc");
1268 struct netdev_stats stats;
1269 int error = get_stats_via_netlink(ifindex, &stats);
1271 VLOG_DBG("obtaining netdev stats via rtnetlink");
1274 VLOG_INFO("RTM_GETLINK failed (%s), obtaining netdev stats "
1275 "via proc (you are probably running a pre-2.6.19 "
1276 "kernel)", strerror(error));
/* Exchanges the values of '*a' and '*b'.  Passing the same pointer for both
 * arguments leaves the value unchanged. */
static void
swap_uint64(uint64_t *a, uint64_t *b)
{
    uint64_t tmp = *a;

    *a = *b;
    *b = tmp;
}
1291 get_stats_via_vport(const struct netdev *netdev_,
1292 struct netdev_stats *stats)
1294 struct netdev_dev_linux *netdev_dev =
1295 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1297 if (!netdev_dev->vport_stats_error ||
1298 !(netdev_dev->cache_valid & VALID_VPORT_STAT_ERROR)) {
1301 error = netdev_vport_get_stats(netdev_, stats);
1303 VLOG_WARN_RL(&rl, "%s: obtaining netdev stats via vport failed "
1304 "(%s)", netdev_get_name(netdev_), strerror(error));
1306 netdev_dev->vport_stats_error = error;
1307 netdev_dev->cache_valid |= VALID_VPORT_STAT_ERROR;
1312 netdev_linux_sys_get_stats(const struct netdev *netdev_,
1313 struct netdev_stats *stats)
1315 static int use_netlink_stats = -1;
1318 if (use_netlink_stats < 0) {
1319 use_netlink_stats = check_for_working_netlink_stats();
1322 if (use_netlink_stats) {
1325 error = get_ifindex(netdev_, &ifindex);
1327 error = get_stats_via_netlink(ifindex, stats);
1330 error = get_stats_via_proc(netdev_get_name(netdev_), stats);
1334 VLOG_WARN_RL(&rl, "%s: linux-sys get stats failed %d",
1335 netdev_get_name(netdev_), error);
1341 /* Retrieves current device stats for 'netdev-linux'. */
1343 netdev_linux_get_stats(const struct netdev *netdev_,
1344 struct netdev_stats *stats)
1346 struct netdev_dev_linux *netdev_dev =
1347 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1348 struct netdev_stats dev_stats;
1351 get_stats_via_vport(netdev_, stats);
1353 error = netdev_linux_sys_get_stats(netdev_, &dev_stats);
1356 if (netdev_dev->vport_stats_error) {
1363 if (netdev_dev->vport_stats_error) {
1364 /* stats not available from OVS then use ioctl stats. */
1367 stats->rx_errors += dev_stats.rx_errors;
1368 stats->tx_errors += dev_stats.tx_errors;
1369 stats->rx_dropped += dev_stats.rx_dropped;
1370 stats->tx_dropped += dev_stats.tx_dropped;
1371 stats->multicast += dev_stats.multicast;
1372 stats->collisions += dev_stats.collisions;
1373 stats->rx_length_errors += dev_stats.rx_length_errors;
1374 stats->rx_over_errors += dev_stats.rx_over_errors;
1375 stats->rx_crc_errors += dev_stats.rx_crc_errors;
1376 stats->rx_frame_errors += dev_stats.rx_frame_errors;
1377 stats->rx_fifo_errors += dev_stats.rx_fifo_errors;
1378 stats->rx_missed_errors += dev_stats.rx_missed_errors;
1379 stats->tx_aborted_errors += dev_stats.tx_aborted_errors;
1380 stats->tx_carrier_errors += dev_stats.tx_carrier_errors;
1381 stats->tx_fifo_errors += dev_stats.tx_fifo_errors;
1382 stats->tx_heartbeat_errors += dev_stats.tx_heartbeat_errors;
1383 stats->tx_window_errors += dev_stats.tx_window_errors;
1388 /* Retrieves current device stats for 'netdev-tap' netdev or
1389 * netdev-internal. */
1391 netdev_tap_get_stats(const struct netdev *netdev_,
1392 struct netdev_stats *stats)
1394 struct netdev_dev_linux *netdev_dev =
1395 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1396 struct netdev_stats dev_stats;
1399 get_stats_via_vport(netdev_, stats);
1401 error = netdev_linux_sys_get_stats(netdev_, &dev_stats);
1403 if (netdev_dev->vport_stats_error) {
1410 /* If this port is an internal port then the transmit and receive stats
1411 * will appear to be swapped relative to the other ports since we are the
1412 * one sending the data, not a remote computer. For consistency, we swap
1413 * them back here. This does not apply if we are getting stats from the
1414 * vport layer because it always tracks stats from the perspective of the
1416 if (netdev_dev->vport_stats_error) {
1418 swap_uint64(&stats->rx_packets, &stats->tx_packets);
1419 swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
1420 swap_uint64(&stats->rx_errors, &stats->tx_errors);
1421 swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
1422 stats->rx_length_errors = 0;
1423 stats->rx_over_errors = 0;
1424 stats->rx_crc_errors = 0;
1425 stats->rx_frame_errors = 0;
1426 stats->rx_fifo_errors = 0;
1427 stats->rx_missed_errors = 0;
1428 stats->tx_aborted_errors = 0;
1429 stats->tx_carrier_errors = 0;
1430 stats->tx_fifo_errors = 0;
1431 stats->tx_heartbeat_errors = 0;
1432 stats->tx_window_errors = 0;
1434 stats->rx_dropped += dev_stats.tx_dropped;
1435 stats->tx_dropped += dev_stats.rx_dropped;
1437 stats->rx_errors += dev_stats.tx_errors;
1438 stats->tx_errors += dev_stats.rx_errors;
1440 stats->multicast += dev_stats.multicast;
1441 stats->collisions += dev_stats.collisions;
1447 netdev_internal_get_stats(const struct netdev *netdev_,
1448 struct netdev_stats *stats)
1450 struct netdev_dev_linux *netdev_dev =
1451 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1453 get_stats_via_vport(netdev_, stats);
1454 return netdev_dev->vport_stats_error;
1457 /* Stores the features supported by 'netdev' into each of '*current',
1458 * '*advertised', '*supported', and '*peer' that are non-null. Each value is a
1459 * bitmap of NETDEV_* bits. Returns 0 if successful, otherwise a positive
1462 netdev_linux_get_features(const struct netdev *netdev,
1463 enum netdev_features *current,
1464 enum netdev_features *advertised,
1465 enum netdev_features *supported,
1466 enum netdev_features *peer)
1468 struct ethtool_cmd ecmd;
1472 memset(&ecmd, 0, sizeof ecmd);
1473 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1474 ETHTOOL_GSET, "ETHTOOL_GSET");
1479 /* Supported features. */
1481 if (ecmd.supported & SUPPORTED_10baseT_Half) {
1482 *supported |= NETDEV_F_10MB_HD;
1484 if (ecmd.supported & SUPPORTED_10baseT_Full) {
1485 *supported |= NETDEV_F_10MB_FD;
1487 if (ecmd.supported & SUPPORTED_100baseT_Half) {
1488 *supported |= NETDEV_F_100MB_HD;
1490 if (ecmd.supported & SUPPORTED_100baseT_Full) {
1491 *supported |= NETDEV_F_100MB_FD;
1493 if (ecmd.supported & SUPPORTED_1000baseT_Half) {
1494 *supported |= NETDEV_F_1GB_HD;
1496 if (ecmd.supported & SUPPORTED_1000baseT_Full) {
1497 *supported |= NETDEV_F_1GB_FD;
1499 if (ecmd.supported & SUPPORTED_10000baseT_Full) {
1500 *supported |= NETDEV_F_10GB_FD;
1502 if (ecmd.supported & SUPPORTED_TP) {
1503 *supported |= NETDEV_F_COPPER;
1505 if (ecmd.supported & SUPPORTED_FIBRE) {
1506 *supported |= NETDEV_F_FIBER;
1508 if (ecmd.supported & SUPPORTED_Autoneg) {
1509 *supported |= NETDEV_F_AUTONEG;
1511 if (ecmd.supported & SUPPORTED_Pause) {
1512 *supported |= NETDEV_F_PAUSE;
1514 if (ecmd.supported & SUPPORTED_Asym_Pause) {
1515 *supported |= NETDEV_F_PAUSE_ASYM;
1518 /* Advertised features. */
1520 if (ecmd.advertising & ADVERTISED_10baseT_Half) {
1521 *advertised |= NETDEV_F_10MB_HD;
1523 if (ecmd.advertising & ADVERTISED_10baseT_Full) {
1524 *advertised |= NETDEV_F_10MB_FD;
1526 if (ecmd.advertising & ADVERTISED_100baseT_Half) {
1527 *advertised |= NETDEV_F_100MB_HD;
1529 if (ecmd.advertising & ADVERTISED_100baseT_Full) {
1530 *advertised |= NETDEV_F_100MB_FD;
1532 if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
1533 *advertised |= NETDEV_F_1GB_HD;
1535 if (ecmd.advertising & ADVERTISED_1000baseT_Full) {
1536 *advertised |= NETDEV_F_1GB_FD;
1538 if (ecmd.advertising & ADVERTISED_10000baseT_Full) {
1539 *advertised |= NETDEV_F_10GB_FD;
1541 if (ecmd.advertising & ADVERTISED_TP) {
1542 *advertised |= NETDEV_F_COPPER;
1544 if (ecmd.advertising & ADVERTISED_FIBRE) {
1545 *advertised |= NETDEV_F_FIBER;
1547 if (ecmd.advertising & ADVERTISED_Autoneg) {
1548 *advertised |= NETDEV_F_AUTONEG;
1550 if (ecmd.advertising & ADVERTISED_Pause) {
1551 *advertised |= NETDEV_F_PAUSE;
1553 if (ecmd.advertising & ADVERTISED_Asym_Pause) {
1554 *advertised |= NETDEV_F_PAUSE_ASYM;
1557 /* Current settings. */
1559 if (speed == SPEED_10) {
1560 *current = ecmd.duplex ? NETDEV_F_10MB_FD : NETDEV_F_10MB_HD;
1561 } else if (speed == SPEED_100) {
1562 *current = ecmd.duplex ? NETDEV_F_100MB_FD : NETDEV_F_100MB_HD;
1563 } else if (speed == SPEED_1000) {
1564 *current = ecmd.duplex ? NETDEV_F_1GB_FD : NETDEV_F_1GB_HD;
1565 } else if (speed == SPEED_10000) {
1566 *current = NETDEV_F_10GB_FD;
1567 } else if (speed == 40000) {
1568 *current = NETDEV_F_40GB_FD;
1569 } else if (speed == 100000) {
1570 *current = NETDEV_F_100GB_FD;
1571 } else if (speed == 1000000) {
1572 *current = NETDEV_F_1TB_FD;
1577 if (ecmd.port == PORT_TP) {
1578 *current |= NETDEV_F_COPPER;
1579 } else if (ecmd.port == PORT_FIBRE) {
1580 *current |= NETDEV_F_FIBER;
1584 *current |= NETDEV_F_AUTONEG;
1587 /* Peer advertisements. */
1588 *peer = 0; /* XXX */
1593 /* Set the features advertised by 'netdev' to 'advertise'. */
1595 netdev_linux_set_advertisements(struct netdev *netdev,
1596 enum netdev_features advertise)
1598 struct ethtool_cmd ecmd;
1601 memset(&ecmd, 0, sizeof ecmd);
1602 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1603 ETHTOOL_GSET, "ETHTOOL_GSET");
1608 ecmd.advertising = 0;
1609 if (advertise & NETDEV_F_10MB_HD) {
1610 ecmd.advertising |= ADVERTISED_10baseT_Half;
1612 if (advertise & NETDEV_F_10MB_FD) {
1613 ecmd.advertising |= ADVERTISED_10baseT_Full;
1615 if (advertise & NETDEV_F_100MB_HD) {
1616 ecmd.advertising |= ADVERTISED_100baseT_Half;
1618 if (advertise & NETDEV_F_100MB_FD) {
1619 ecmd.advertising |= ADVERTISED_100baseT_Full;
1621 if (advertise & NETDEV_F_1GB_HD) {
1622 ecmd.advertising |= ADVERTISED_1000baseT_Half;
1624 if (advertise & NETDEV_F_1GB_FD) {
1625 ecmd.advertising |= ADVERTISED_1000baseT_Full;
1627 if (advertise & NETDEV_F_10GB_FD) {
1628 ecmd.advertising |= ADVERTISED_10000baseT_Full;
1630 if (advertise & NETDEV_F_COPPER) {
1631 ecmd.advertising |= ADVERTISED_TP;
1633 if (advertise & NETDEV_F_FIBER) {
1634 ecmd.advertising |= ADVERTISED_FIBRE;
1636 if (advertise & NETDEV_F_AUTONEG) {
1637 ecmd.advertising |= ADVERTISED_Autoneg;
1639 if (advertise & NETDEV_F_PAUSE) {
1640 ecmd.advertising |= ADVERTISED_Pause;
1642 if (advertise & NETDEV_F_PAUSE_ASYM) {
1643 ecmd.advertising |= ADVERTISED_Asym_Pause;
1645 return netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1646 ETHTOOL_SSET, "ETHTOOL_SSET");
1649 /* Attempts to set input rate limiting (policing) policy. Returns 0 if
1650 * successful, otherwise a positive errno value. */
1652 netdev_linux_set_policing(struct netdev *netdev,
1653 uint32_t kbits_rate, uint32_t kbits_burst)
1655 struct netdev_dev_linux *netdev_dev =
1656 netdev_dev_linux_cast(netdev_get_dev(netdev));
1657 const char *netdev_name = netdev_get_name(netdev);
1661 kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
1662 : !kbits_burst ? 1000 /* Default to 1000 kbits if 0. */
1663 : kbits_burst); /* Stick with user-specified value. */
1665 if (netdev_dev->cache_valid & VALID_POLICING
1666 && netdev_dev->kbits_rate == kbits_rate
1667 && netdev_dev->kbits_burst == kbits_burst) {
1668 /* Assume that settings haven't changed since we last set them. */
1672 COVERAGE_INC(netdev_set_policing);
1673 /* Remove any existing ingress qdisc. */
1674 error = tc_add_del_ingress_qdisc(netdev, false);
1676 VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
1677 netdev_name, strerror(error));
1682 error = tc_add_del_ingress_qdisc(netdev, true);
1684 VLOG_WARN_RL(&rl, "%s: adding policing qdisc failed: %s",
1685 netdev_name, strerror(error));
1689 error = tc_add_policer(netdev, kbits_rate, kbits_burst);
1691 VLOG_WARN_RL(&rl, "%s: adding policing action failed: %s",
1692 netdev_name, strerror(error));
1697 netdev_dev->kbits_rate = kbits_rate;
1698 netdev_dev->kbits_burst = kbits_burst;
1699 netdev_dev->cache_valid |= VALID_POLICING;
1705 netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
1708 const struct tc_ops **opsp;
1710 for (opsp = tcs; *opsp != NULL; opsp++) {
1711 const struct tc_ops *ops = *opsp;
1712 if (ops->tc_install && ops->ovs_name[0] != '\0') {
1713 sset_add(types, ops->ovs_name);
1719 static const struct tc_ops *
1720 tc_lookup_ovs_name(const char *name)
1722 const struct tc_ops **opsp;
1724 for (opsp = tcs; *opsp != NULL; opsp++) {
1725 const struct tc_ops *ops = *opsp;
1726 if (!strcmp(name, ops->ovs_name)) {
1733 static const struct tc_ops *
1734 tc_lookup_linux_name(const char *name)
1736 const struct tc_ops **opsp;
1738 for (opsp = tcs; *opsp != NULL; opsp++) {
1739 const struct tc_ops *ops = *opsp;
1740 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
1747 static struct tc_queue *
1748 tc_find_queue__(const struct netdev *netdev, unsigned int queue_id,
1751 struct netdev_dev_linux *netdev_dev =
1752 netdev_dev_linux_cast(netdev_get_dev(netdev));
1753 struct tc_queue *queue;
1755 HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev_dev->tc->queues) {
1756 if (queue->queue_id == queue_id) {
/* Returns the queue of 'netdev' identified by 'queue_id', or NULL if there is
 * none.  The hash is computed as hash_int(queue_id, 0). */
static struct tc_queue *
tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
{
    size_t hash = hash_int(queue_id, 0);

    return tc_find_queue__(netdev, queue_id, hash);
}
1770 netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
1772 struct netdev_qos_capabilities *caps)
1774 const struct tc_ops *ops = tc_lookup_ovs_name(type);
1778 caps->n_queues = ops->n_queues;
1783 netdev_linux_get_qos(const struct netdev *netdev,
1784 const char **typep, struct shash *details)
1786 struct netdev_dev_linux *netdev_dev =
1787 netdev_dev_linux_cast(netdev_get_dev(netdev));
1790 error = tc_query_qdisc(netdev);
1795 *typep = netdev_dev->tc->ops->ovs_name;
1796 return (netdev_dev->tc->ops->qdisc_get
1797 ? netdev_dev->tc->ops->qdisc_get(netdev, details)
1802 netdev_linux_set_qos(struct netdev *netdev,
1803 const char *type, const struct shash *details)
1805 struct netdev_dev_linux *netdev_dev =
1806 netdev_dev_linux_cast(netdev_get_dev(netdev));
1807 const struct tc_ops *new_ops;
1810 new_ops = tc_lookup_ovs_name(type);
1811 if (!new_ops || !new_ops->tc_install) {
1815 error = tc_query_qdisc(netdev);
1820 if (new_ops == netdev_dev->tc->ops) {
1821 return new_ops->qdisc_set ? new_ops->qdisc_set(netdev, details) : 0;
1823 /* Delete existing qdisc. */
1824 error = tc_del_qdisc(netdev);
1828 assert(netdev_dev->tc == NULL);
1830 /* Install new qdisc. */
1831 error = new_ops->tc_install(netdev, details);
1832 assert((error == 0) == (netdev_dev->tc != NULL));
1839 netdev_linux_get_queue(const struct netdev *netdev,
1840 unsigned int queue_id, struct shash *details)
1842 struct netdev_dev_linux *netdev_dev =
1843 netdev_dev_linux_cast(netdev_get_dev(netdev));
1846 error = tc_query_qdisc(netdev);
1850 struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1852 ? netdev_dev->tc->ops->class_get(netdev, queue, details)
1858 netdev_linux_set_queue(struct netdev *netdev,
1859 unsigned int queue_id, const struct shash *details)
1861 struct netdev_dev_linux *netdev_dev =
1862 netdev_dev_linux_cast(netdev_get_dev(netdev));
1865 error = tc_query_qdisc(netdev);
1868 } else if (queue_id >= netdev_dev->tc->ops->n_queues
1869 || !netdev_dev->tc->ops->class_set) {
1873 return netdev_dev->tc->ops->class_set(netdev, queue_id, details);
1877 netdev_linux_delete_queue(struct netdev *netdev, unsigned int queue_id)
1879 struct netdev_dev_linux *netdev_dev =
1880 netdev_dev_linux_cast(netdev_get_dev(netdev));
1883 error = tc_query_qdisc(netdev);
1886 } else if (!netdev_dev->tc->ops->class_delete) {
1889 struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1891 ? netdev_dev->tc->ops->class_delete(netdev, queue)
1897 netdev_linux_get_queue_stats(const struct netdev *netdev,
1898 unsigned int queue_id,
1899 struct netdev_queue_stats *stats)
1901 struct netdev_dev_linux *netdev_dev =
1902 netdev_dev_linux_cast(netdev_get_dev(netdev));
1905 error = tc_query_qdisc(netdev);
1908 } else if (!netdev_dev->tc->ops->class_get_stats) {
1911 const struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1913 ? netdev_dev->tc->ops->class_get_stats(netdev, queue, stats)
1919 start_queue_dump(const struct netdev *netdev, struct nl_dump *dump)
1921 struct ofpbuf request;
1922 struct tcmsg *tcmsg;
1924 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
1928 tcmsg->tcm_parent = 0;
1929 nl_dump_start(dump, rtnl_sock, &request);
1930 ofpbuf_uninit(&request);
1935 netdev_linux_dump_queues(const struct netdev *netdev,
1936 netdev_dump_queues_cb *cb, void *aux)
1938 struct netdev_dev_linux *netdev_dev =
1939 netdev_dev_linux_cast(netdev_get_dev(netdev));
1940 struct tc_queue *queue;
1941 struct shash details;
1945 error = tc_query_qdisc(netdev);
1948 } else if (!netdev_dev->tc->ops->class_get) {
1953 shash_init(&details);
1954 HMAP_FOR_EACH (queue, hmap_node, &netdev_dev->tc->queues) {
1955 shash_clear(&details);
1957 error = netdev_dev->tc->ops->class_get(netdev, queue, &details);
1959 (*cb)(queue->queue_id, &details, aux);
1964 shash_destroy(&details);
1970 netdev_linux_dump_queue_stats(const struct netdev *netdev,
1971 netdev_dump_queue_stats_cb *cb, void *aux)
1973 struct netdev_dev_linux *netdev_dev =
1974 netdev_dev_linux_cast(netdev_get_dev(netdev));
1975 struct nl_dump dump;
1980 error = tc_query_qdisc(netdev);
1983 } else if (!netdev_dev->tc->ops->class_dump_stats) {
1988 if (!start_queue_dump(netdev, &dump)) {
1991 while (nl_dump_next(&dump, &msg)) {
1992 error = netdev_dev->tc->ops->class_dump_stats(netdev, &msg, cb, aux);
1998 error = nl_dump_done(&dump);
1999 return error ? error : last_error;
2003 netdev_linux_get_in4(const struct netdev *netdev_,
2004 struct in_addr *address, struct in_addr *netmask)
2006 struct netdev_dev_linux *netdev_dev =
2007 netdev_dev_linux_cast(netdev_get_dev(netdev_));
2009 if (!(netdev_dev->cache_valid & VALID_IN4)) {
2012 error = netdev_linux_get_ipv4(netdev_, &netdev_dev->address,
2013 SIOCGIFADDR, "SIOCGIFADDR");
2018 error = netdev_linux_get_ipv4(netdev_, &netdev_dev->netmask,
2019 SIOCGIFNETMASK, "SIOCGIFNETMASK");
2024 netdev_dev->cache_valid |= VALID_IN4;
2026 *address = netdev_dev->address;
2027 *netmask = netdev_dev->netmask;
2028 return address->s_addr == INADDR_ANY ? EADDRNOTAVAIL : 0;
2032 netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
2033 struct in_addr netmask)
2035 struct netdev_dev_linux *netdev_dev =
2036 netdev_dev_linux_cast(netdev_get_dev(netdev_));
2039 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
2041 netdev_dev->cache_valid |= VALID_IN4;
2042 netdev_dev->address = address;
2043 netdev_dev->netmask = netmask;
2044 if (address.s_addr != INADDR_ANY) {
2045 error = do_set_addr(netdev_, SIOCSIFNETMASK,
2046 "SIOCSIFNETMASK", netmask);
/* Parses 'line', which should have the /proc/net/if_inet6 layout: 32
 * hexadecimal digits of IPv6 address, four hexadecimal fields that are
 * skipped, and an interface name of at most 16 characters.
 *
 * On success stores the address in '*in6' and the null-terminated interface
 * name in 'ifname' and returns true.  Returns false if 'line' does not match;
 * '*in6' and 'ifname' may then be partially written. */
static bool
parse_if_inet6_line(const char *line,
                    struct in6_addr *in6, char ifname[16 + 1])
{
    uint8_t *s6 = in6->s6_addr;
    int n_fields;

    /* Each X8 consumes exactly two hex digits into one address byte. */
#define X8 "%2"SCNx8
    n_fields = sscanf(line,
                      " "X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8
                      "%*x %*x %*x %*x %16s\n",
                      &s6[0], &s6[1], &s6[2], &s6[3],
                      &s6[4], &s6[5], &s6[6], &s6[7],
                      &s6[8], &s6[9], &s6[10], &s6[11],
                      &s6[12], &s6[13], &s6[14], &s6[15],
                      ifname);
#undef X8

    /* 16 address bytes plus the interface name. */
    return n_fields == 17;
}
2068 /* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address (if
2069 * 'in6' is non-null) and returns true. Otherwise, returns false. */
2071 netdev_linux_get_in6(const struct netdev *netdev_, struct in6_addr *in6)
2073 struct netdev_dev_linux *netdev_dev =
2074 netdev_dev_linux_cast(netdev_get_dev(netdev_));
2075 if (!(netdev_dev->cache_valid & VALID_IN6)) {
2079 netdev_dev->in6 = in6addr_any;
2081 file = fopen("/proc/net/if_inet6", "r");
2083 const char *name = netdev_get_name(netdev_);
2084 while (fgets(line, sizeof line, file)) {
2085 struct in6_addr in6_tmp;
2086 char ifname[16 + 1];
2087 if (parse_if_inet6_line(line, &in6_tmp, ifname)
2088 && !strcmp(name, ifname))
2090 netdev_dev->in6 = in6_tmp;
2096 netdev_dev->cache_valid |= VALID_IN6;
2098 *in6 = netdev_dev->in6;
/* Initializes '*sa' as an AF_INET socket address with IPv4 address 'addr'
 * and port 0.  All bytes of '*sa' not covered by the address are zeroed. */
static void
make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
{
    struct sockaddr_in sin;
    size_t n_copy;

    /* Zeroing also sets sin_port and the padding to 0. */
    memset(&sin, 0, sizeof sin);
    sin.sin_family = AF_INET;
    sin.sin_addr = addr;

    /* Never write more than the destination object holds. */
    n_copy = sizeof sin < sizeof *sa ? sizeof sin : sizeof *sa;

    memset(sa, 0, sizeof *sa);
    memcpy(sa, &sin, n_copy);
}
2116 do_set_addr(struct netdev *netdev,
2117 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
2120 ovs_strzcpy(ifr.ifr_name, netdev_get_name(netdev), sizeof ifr.ifr_name);
2121 make_in4_sockaddr(&ifr.ifr_addr, addr);
2123 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
2127 /* Adds 'router' as a default IP gateway. */
2129 netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
2131 struct in_addr any = { INADDR_ANY };
2135 memset(&rt, 0, sizeof rt);
2136 make_in4_sockaddr(&rt.rt_dst, any);
2137 make_in4_sockaddr(&rt.rt_gateway, router);
2138 make_in4_sockaddr(&rt.rt_genmask, any);
2139 rt.rt_flags = RTF_UP | RTF_GATEWAY;
2140 error = ioctl(af_inet_sock, SIOCADDRT, &rt) < 0 ? errno : 0;
2142 VLOG_WARN("ioctl(SIOCADDRT): %s", strerror(error));
2148 netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
2151 static const char fn[] = "/proc/net/route";
2156 *netdev_name = NULL;
2157 stream = fopen(fn, "r");
2158 if (stream == NULL) {
2159 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
2164 while (fgets(line, sizeof line, stream)) {
2167 ovs_be32 dest, gateway, mask;
2168 int refcnt, metric, mtu;
2169 unsigned int flags, use, window, irtt;
2172 "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
2174 iface, &dest, &gateway, &flags, &refcnt,
2175 &use, &metric, &mask, &mtu, &window, &irtt) != 11) {
2177 VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
2181 if (!(flags & RTF_UP)) {
2182 /* Skip routes that aren't up. */
2186 /* The output of 'dest', 'mask', and 'gateway' were given in
2187 * network byte order, so we don't need need any endian
2188 * conversions here. */
2189 if ((dest & mask) == (host->s_addr & mask)) {
2191 /* The host is directly reachable. */
2192 next_hop->s_addr = 0;
2194 /* To reach the host, we must go through a gateway. */
2195 next_hop->s_addr = gateway;
2197 *netdev_name = xstrdup(iface);
2209 netdev_linux_get_status(const struct netdev *netdev, struct shash *sh)
2212 struct netdev_dev_linux *netdev_dev =
2213 netdev_dev_linux_cast(netdev_get_dev(netdev));
2215 error = netdev_linux_get_drvinfo(netdev_dev);
2217 shash_add(sh, "driver_name", xstrdup(netdev_dev->drvinfo.driver));
2218 shash_add(sh, "driver_version", xstrdup(netdev_dev->drvinfo.version));
2219 shash_add(sh, "firmware_version", xstrdup(netdev_dev->drvinfo.fw_version));
2225 netdev_internal_get_status(const struct netdev *netdev OVS_UNUSED, struct shash *sh)
2227 shash_add(sh, "driver_name", xstrdup("openvswitch"));
2231 /* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
2232 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
2233 * returns 0. Otherwise, it returns a positive errno value; in particular,
2234 * ENXIO indicates that there is not ARP table entry for 'ip' on 'netdev'. */
2236 netdev_linux_arp_lookup(const struct netdev *netdev,
2237 ovs_be32 ip, uint8_t mac[ETH_ADDR_LEN])
2240 struct sockaddr_in sin;
2243 memset(&r, 0, sizeof r);
2244 memset(&sin, 0, sizeof sin);
2245 sin.sin_family = AF_INET;
2246 sin.sin_addr.s_addr = ip;
2248 memcpy(&r.arp_pa, &sin, sizeof sin);
2249 r.arp_ha.sa_family = ARPHRD_ETHER;
2251 ovs_strzcpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
2252 COVERAGE_INC(netdev_arp_lookup);
2253 retval = ioctl(af_inet_sock, SIOCGARP, &r) < 0 ? errno : 0;
2255 memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
2256 } else if (retval != ENXIO) {
2257 VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
2258 netdev_get_name(netdev), IP_ARGS(&ip), strerror(retval));
2264 nd_to_iff_flags(enum netdev_flags nd)
2267 if (nd & NETDEV_UP) {
2270 if (nd & NETDEV_PROMISC) {
2277 iff_to_nd_flags(int iff)
2279 enum netdev_flags nd = 0;
2283 if (iff & IFF_PROMISC) {
2284 nd |= NETDEV_PROMISC;
2290 netdev_linux_update_flags(struct netdev *netdev, enum netdev_flags off,
2291 enum netdev_flags on, enum netdev_flags *old_flagsp)
2293 struct netdev_dev_linux *netdev_dev;
2294 int old_flags, new_flags;
2297 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
2298 old_flags = netdev_dev->ifi_flags;
2299 *old_flagsp = iff_to_nd_flags(old_flags);
2300 new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
2301 if (new_flags != old_flags) {
2302 error = set_flags(netdev, new_flags);
2303 get_flags(&netdev_dev->netdev_dev, &netdev_dev->ifi_flags);
2309 netdev_linux_change_seq(const struct netdev *netdev)
2311 return netdev_dev_linux_cast(netdev_get_dev(netdev))->change_seq;
/* Expands to a positional initializer for a 'struct netdev_class'.
 *
 * NAME, CREATE, GET_STATS, SET_STATS and GET_STATUS fill the slots that
 * differ between the classes defined with this macro; every other slot is
 * shared.  The entries are positional, so their order must match the member
 * order of 'struct netdev_class' -- NOTE(review): that struct is declared
 * elsewhere; verify the order against it before editing this list. */
#define NETDEV_LINUX_CLASS(NAME, CREATE, GET_STATS, SET_STATS,  \
                           GET_STATUS)                          \
{                                                               \
    NAME,                                                       \
                                                                \
    netdev_linux_init,                                          \
    netdev_linux_run,                                           \
    netdev_linux_wait,                                          \
                                                                \
    CREATE,                                                     \
    netdev_linux_destroy,                                       \
    NULL,                       /* get_config */                \
    NULL,                       /* set_config */                \
                                                                \
    netdev_linux_open,                                          \
    netdev_linux_close,                                         \
                                                                \
    netdev_linux_listen,                                        \
    netdev_linux_recv,                                          \
    netdev_linux_recv_wait,                                     \
    netdev_linux_drain,                                         \
                                                                \
    netdev_linux_send,                                          \
    netdev_linux_send_wait,                                     \
                                                                \
    netdev_linux_set_etheraddr,                                 \
    netdev_linux_get_etheraddr,                                 \
    netdev_linux_get_mtu,                                       \
    netdev_linux_set_mtu,                                       \
    netdev_linux_get_ifindex,                                   \
    netdev_linux_get_carrier,                                   \
    netdev_linux_get_carrier_resets,                            \
    netdev_linux_set_miimon_interval,                           \
    GET_STATS,                                                  \
    SET_STATS,                                                  \
                                                                \
    netdev_linux_get_features,                                  \
    netdev_linux_set_advertisements,                            \
                                                                \
    netdev_linux_set_policing,                                  \
    netdev_linux_get_qos_types,                                 \
    netdev_linux_get_qos_capabilities,                          \
    netdev_linux_get_qos,                                       \
    netdev_linux_set_qos,                                       \
    netdev_linux_get_queue,                                     \
    netdev_linux_set_queue,                                     \
    netdev_linux_delete_queue,                                  \
    netdev_linux_get_queue_stats,                               \
    netdev_linux_dump_queues,                                   \
    netdev_linux_dump_queue_stats,                              \
                                                                \
    netdev_linux_get_in4,                                       \
    netdev_linux_set_in4,                                       \
    netdev_linux_get_in6,                                       \
    netdev_linux_add_router,                                    \
    netdev_linux_get_next_hop,                                  \
    GET_STATUS,                                                 \
    netdev_linux_arp_lookup,                                    \
                                                                \
    netdev_linux_update_flags,                                  \
                                                                \
    netdev_linux_change_seq                                     \
}
/* Class for ordinary system network devices: stats are a combination of
 * vport and kernel stats, and there is no set_stats. */
const struct netdev_class netdev_linux_class =
    NETDEV_LINUX_CLASS(
        "system",
        netdev_linux_create,
        netdev_linux_get_stats,
        NULL,                    /* set_stats */
        netdev_linux_get_status);

/* Class for tap devices: created with netdev_linux_create_tap() and reporting
 * stats through netdev_tap_get_stats(). */
const struct netdev_class netdev_tap_class =
    NETDEV_LINUX_CLASS(
        "tap",
        netdev_linux_create_tap,
        netdev_tap_get_stats,
        NULL,                   /* set_stats */
        netdev_linux_get_status);

/* Class for internal devices: stats are read and written through the vport
 * layer only, and status reports a fixed driver name. */
const struct netdev_class netdev_internal_class =
    NETDEV_LINUX_CLASS(
        "internal",
        netdev_linux_create,
        netdev_internal_get_stats,
        netdev_vport_set_stats,
        netdev_internal_get_status);
/* HTB traffic control class. */

/* Upper bound on HTB queue ids: htb_parse_tcmsg__() accepts class minor
 * numbers 1 through HTB_N_QUEUES and maps them to queue ids 0 through
 * HTB_N_QUEUES - 1. */
#define HTB_N_QUEUES 0xf000

/* Per-device HTB qdisc state.  'tc' is the embedded generic traffic control
 * state from which htb_get__() recovers the containing struct. */
struct htb {
    struct tc tc;
    unsigned int max_rate;      /* In bytes/s. */
};

/* Configuration of one HTB class (queue). */
struct htb_class {
    struct tc_queue tc_queue;
    unsigned int min_rate;      /* In bytes/s. */
    unsigned int max_rate;      /* In bytes/s. */
    unsigned int burst;         /* In bytes. */
    unsigned int priority;      /* Lower values are higher priorities. */
};
2420 htb_get__(const struct netdev *netdev)
2422 struct netdev_dev_linux *netdev_dev =
2423 netdev_dev_linux_cast(netdev_get_dev(netdev));
2424 return CONTAINER_OF(netdev_dev->tc, struct htb, tc);
2428 htb_install__(struct netdev *netdev, uint64_t max_rate)
2430 struct netdev_dev_linux *netdev_dev =
2431 netdev_dev_linux_cast(netdev_get_dev(netdev));
2434 htb = xmalloc(sizeof *htb);
2435 tc_init(&htb->tc, &tc_ops_htb);
2436 htb->max_rate = max_rate;
2438 netdev_dev->tc = &htb->tc;
2441 /* Create an HTB qdisc.
2443 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
2445 htb_setup_qdisc__(struct netdev *netdev)
2448 struct tc_htb_glob opt;
2449 struct ofpbuf request;
2450 struct tcmsg *tcmsg;
2452 tc_del_qdisc(netdev);
2454 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2455 NLM_F_EXCL | NLM_F_CREATE, &request);
2459 tcmsg->tcm_handle = tc_make_handle(1, 0);
2460 tcmsg->tcm_parent = TC_H_ROOT;
2462 nl_msg_put_string(&request, TCA_KIND, "htb");
2464 memset(&opt, 0, sizeof opt);
2465 opt.rate2quantum = 10;
2469 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2470 nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
2471 nl_msg_end_nested(&request, opt_offset);
2473 return tc_transact(&request, NULL);
2476 /* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
2477 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
2479 htb_setup_class__(struct netdev *netdev, unsigned int handle,
2480 unsigned int parent, struct htb_class *class)
2483 struct tc_htb_opt opt;
2484 struct ofpbuf request;
2485 struct tcmsg *tcmsg;
2489 error = netdev_get_mtu(netdev, &mtu);
2491 VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU",
2492 netdev_get_name(netdev));
2496 memset(&opt, 0, sizeof opt);
2497 tc_fill_rate(&opt.rate, class->min_rate, mtu);
2498 tc_fill_rate(&opt.ceil, class->max_rate, mtu);
2499 opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
2500 opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
2501 opt.prio = class->priority;
2503 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
2507 tcmsg->tcm_handle = handle;
2508 tcmsg->tcm_parent = parent;
2510 nl_msg_put_string(&request, TCA_KIND, "htb");
2511 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2512 nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
2513 tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
2514 tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
2515 nl_msg_end_nested(&request, opt_offset);
2517 error = tc_transact(&request, NULL);
2519 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
2520 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
2521 netdev_get_name(netdev),
2522 tc_get_major(handle), tc_get_minor(handle),
2523 tc_get_major(parent), tc_get_minor(parent),
2524 class->min_rate, class->max_rate,
2525 class->burst, class->priority, strerror(error));
2530 /* Parses Netlink attributes in 'options' for HTB parameters and stores a
2531 * description of them into 'details'. The description complies with the
2532 * specification given in the vswitch database documentation for linux-htb
2535 htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
/* Policy for the nested attributes: TCA_HTB_PARMS is mandatory and must be
 * at least as large as struct tc_htb_opt. */
2537 static const struct nl_policy tca_htb_policy[] = {
2538 [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
2539 .min_len = sizeof(struct tc_htb_opt) },
2542 struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
2543 const struct tc_htb_opt *htb;
/* Validate 'nl_options' against the policy before reading any attribute. */
2545 if (!nl_parse_nested(nl_options, tca_htb_policy,
2546 attrs, ARRAY_SIZE(tca_htb_policy))) {
2547 VLOG_WARN_RL(&rl, "failed to parse HTB class options");
/* Copy the HTB parameters into 'class'.  The burst is not copied directly:
 * the 'buffer' field is passed through tc_ticks_to_bytes() together with
 * rate.rate to obtain a byte count. */
2551 htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
2552 class->min_rate = htb->rate.rate;
2553 class->max_rate = htb->ceil.rate;
2554 class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
2555 class->priority = htb->prio;
/* Parses the class message 'tcmsg' with tc_parse_class(), handing 'stats'
 * through to it.  If 'queue_id' is nonnull, a handle of the form 1:N with
 * 1 <= N <= HTB_N_QUEUES is reported as queue N - 1.  If 'options' is
 * nonnull and no error has occurred, the HTB options of the class are
 * decoded into it with htb_parse_tca_options__().
 *
 * NOTE(review): the handling of handles outside that range is not visible in
 * this listing. */
2560 htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2561 struct htb_class *options,
2562 struct netdev_queue_stats *stats)
2564 struct nlattr *nl_options;
2565 unsigned int handle;
2568 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2569 if (!error && queue_id) {
2570 unsigned int major = tc_get_major(handle);
2571 unsigned int minor = tc_get_minor(handle);
/* Minor numbers are 1-based while queue ids are 0-based. */
2572 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2573 *queue_id = minor - 1;
2578 if (!error && options) {
2579 error = htb_parse_tca_options__(nl_options, options);
/* Fills 'hc' with the qdisc-wide rate limits described by 'details'.
 *
 * The "max-rate" value is parsed as a decimal number and divided by 8
 * (presumably converting bits/s to bytes/s).  When the key is absent or the
 * result is 0, the rate is derived from the current features of 'netdev'
 * instead.  'hc->min_rate' is set equal to 'hc->max_rate'. */
2585 htb_parse_qdisc_details__(struct netdev *netdev,
2586 const struct shash *details, struct htb_class *hc)
2588 const char *max_rate_s;
2590 max_rate_s = shash_find_data(details, "max-rate");
2591 hc->max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
2592 if (!hc->max_rate) {
/* Pass the address of 'current' so that the features can be stored in it;
 * the next line converts them to a bit rate. */
2595 netdev_get_features(netdev, &current, NULL, NULL, NULL);
2596 hc->max_rate = netdev_features_to_bps(current) / 8;
2598 hc->min_rate = hc->max_rate;
/* Fills 'hc' with the per-queue HTB parameters described by 'details'.
 *
 * The keys "min-rate", "max-rate", "burst" and "priority" are looked up in
 * 'details'.  The first three are parsed as decimal numbers and divided by 8
 * (presumably converting bits to bytes); "priority" is parsed as is and
 * defaults to 0.  The results are then clamped:
 *
 *   - min_rate is raised to at least the MTU and capped at the max_rate of
 *     the qdisc,
 *   - max_rate is raised to at least min_rate and capped at the max_rate of
 *     the qdisc,
 *   - burst is raised to at least the MTU plus 64.
 *
 * The MTU comes from netdev_get_mtu(); a warning is logged when it cannot be
 * obtained.
 *
 * NOTE(review): the strtoull()/strtoul() calls pass a null end pointer, so
 * malformed numbers are not detected here.
 * NOTE(review): the opening line of the long comment about 'burst' below is
 * not visible in this listing. */
2604 htb_parse_class_details__(struct netdev *netdev,
2605 const struct shash *details, struct htb_class *hc)
2607 const struct htb *htb = htb_get__(netdev);
2608 const char *min_rate_s = shash_find_data(details, "min-rate");
2609 const char *max_rate_s = shash_find_data(details, "max-rate");
2610 const char *burst_s = shash_find_data(details, "burst");
2611 const char *priority_s = shash_find_data(details, "priority");
2614 error = netdev_get_mtu(netdev, &mtu);
2616 VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU",
2617 netdev_get_name(netdev));
2621 /* HTB requires at least an mtu sized min-rate to send any traffic even
2622 * on uncongested links. */
2623 hc->min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
2624 hc->min_rate = MAX(hc->min_rate, mtu);
2625 hc->min_rate = MIN(hc->min_rate, htb->max_rate);
2628 hc->max_rate = (max_rate_s
2629 ? strtoull(max_rate_s, NULL, 10) / 8
2631 hc->max_rate = MAX(hc->max_rate, hc->min_rate);
2632 hc->max_rate = MIN(hc->max_rate, htb->max_rate);
2636 * According to hints in the documentation that I've read, it is important
2637 * that 'burst' be at least as big as the largest frame that might be
2638 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
2639 * but having it a bit too small is a problem. Since netdev_get_mtu()
2640 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
2641 * the MTU. We actually add 64, instead of 14, as a guard against
2642 * additional headers get tacked on somewhere that we're not aware of. */
2643 hc->burst = burst_s ? strtoull(burst_s, NULL, 10) / 8 : 0;
2644 hc->burst = MAX(hc->burst, mtu + 64);
2647 hc->priority = priority_s ? strtoul(priority_s, NULL, 10) : 0;
/* Asks the kernel about the class 'handle' under 'parent' on 'netdev' with
 * tc_query_class() and decodes the reply with htb_parse_tcmsg__(), storing
 * the HTB parameters into 'options' and the counters into 'stats'.  No queue
 * id is requested from the parser (a null pointer is passed for it). */
2653 htb_query_class__(const struct netdev *netdev, unsigned int handle,
2654 unsigned int parent, struct htb_class *options,
2655 struct netdev_queue_stats *stats)
2657 struct ofpbuf *reply;
2660 error = tc_query_class(netdev, handle, parent, &reply);
2662 error = htb_parse_tcmsg__(reply, NULL, options, stats);
/* The reply buffer is released here once it has been parsed. */
2663 ofpbuf_delete(reply);
/* Installs HTB on 'netdev' using the settings in 'details'.
 *
 * The visible steps are: set up the qdisc with htb_setup_qdisc__(), parse
 * the qdisc-wide limits from 'details', create class 1:fffe under parent 1:0
 * with those limits, and record the resulting max_rate with htb_install__().
 * The error checks between these steps are not visible in this listing. */
2669 htb_tc_install(struct netdev *netdev, const struct shash *details)
2673 error = htb_setup_qdisc__(netdev);
2675 struct htb_class hc;
2677 htb_parse_qdisc_details__(netdev, details, &hc);
2678 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2679 tc_make_handle(1, 0), &hc);
2681 htb_install__(netdev, hc.max_rate);
/* Returns the struct htb_class that embeds 'queue' as its 'tc_queue'
 * member. */
2687 static struct htb_class *
2688 htb_class_cast__(const struct tc_queue *queue)
2690 return CONTAINER_OF(queue, struct htb_class, tc_queue);
/* Records the parameters in 'hc' as the cached state of queue 'queue_id' on
 * 'netdev'.
 *
 * The queue is looked up with tc_find_queue__().  The lines below either
 * reuse the entry that was found or allocate a new struct htb_class and
 * insert it into the 'queues' map of the HTB state; the branch keywords that
 * select between the two are not visible in this listing.  Finally the four
 * parameters are copied from 'hc'. */
2694 htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
2695 const struct htb_class *hc)
2697 struct htb *htb = htb_get__(netdev);
/* The same hash is used for the lookup and for the insertion. */
2698 size_t hash = hash_int(queue_id, 0);
2699 struct tc_queue *queue;
2700 struct htb_class *hcp;
2702 queue = tc_find_queue__(netdev, queue_id, hash);
2704 hcp = htb_class_cast__(queue);
2706 hcp = xmalloc(sizeof *hcp);
2707 queue = &hcp->tc_queue;
2708 queue->queue_id = queue_id;
2709 hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
2712 hcp->min_rate = hc->min_rate;
2713 hcp->max_rate = hc->max_rate;
2714 hcp->burst = hc->burst;
2715 hcp->priority = hc->priority;
/* Rebuilds the HTB state of 'netdev' from what the kernel reports.
 *
 * Class 1:fffe is queried for the qdisc-wide max_rate, which is passed to
 * htb_install__().  Then every class returned by the queue dump that parses
 * successfully is recorded with htb_update_queue__().  'nlmsg' is unused.
 *
 * NOTE(review): the result of htb_query_class__() is discarded on the line
 * that calls it, so 'hc.max_rate' is used whether or not the query
 * succeeded. */
2719 htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
2722 struct nl_dump dump;
2723 struct htb_class hc;
2725 /* Get qdisc options. */
2727 htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
2728 htb_install__(netdev, hc.max_rate);
2731 if (!start_queue_dump(netdev, &dump)) {
2734 while (nl_dump_next(&dump, &msg)) {
2735 unsigned int queue_id;
/* A zero return from the parser means the message was understood. */
2737 if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
2738 htb_update_queue__(netdev, queue_id, &hc);
2741 nl_dump_done(&dump);
/* Tears down the HTB state that embeds 'tc': every cached queue is removed
 * from the 'queues' map.  The SAFE iteration macro is used because nodes are
 * removed while the map is being walked.  Any release of the memory is not
 * visible in this listing. */
2747 htb_tc_destroy(struct tc *tc)
2749 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
2750 struct htb_class *hc, *next;
2752 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &htb->tc.queues) {
2753 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
/* Reports the qdisc-wide configuration of 'netdev' by adding a "max-rate"
 * entry to 'details'.  The value is the cached max_rate multiplied by 8,
 * formatted as a decimal string by xasprintf(). */
2761 htb_qdisc_get(const struct netdev *netdev, struct shash *details)
2763 const struct htb *htb = htb_get__(netdev);
2764 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * htb->max_rate));
/* Applies new qdisc-wide limits from 'details' to 'netdev': the limits are
 * parsed, class 1:fffe under parent 1:0 is set up again with them, and the
 * cached max_rate of the HTB state is updated.
 *
 * NOTE(review): whether the cache update depends on the class setup
 * succeeding is not visible in this listing. */
2769 htb_qdisc_set(struct netdev *netdev, const struct shash *details)
2771 struct htb_class hc;
2774 htb_parse_qdisc_details__(netdev, details, &hc);
2775 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2776 tc_make_handle(1, 0), &hc);
2778 htb_get__(netdev)->max_rate = hc.max_rate;
/* Describes the cached parameters of 'queue' by adding entries to 'details'.
 * 'netdev' is unused.
 *
 * "min-rate" is always added.  "max-rate" is added only when it differs from
 * the min_rate.  Rates and burst are multiplied by 8 before being formatted
 * (the reverse of the division by 8 applied when they are parsed);
 * "priority" is formatted unchanged.
 *
 * NOTE(review): the condition, if any, that guards the "burst" and
 * "priority" entries is not visible in this listing. */
2784 htb_class_get(const struct netdev *netdev OVS_UNUSED,
2785 const struct tc_queue *queue, struct shash *details)
2787 const struct htb_class *hc = htb_class_cast__(queue);
2789 shash_add(details, "min-rate", xasprintf("%llu", 8ULL * hc->min_rate));
2790 if (hc->min_rate != hc->max_rate) {
2791 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hc->max_rate));
2793 shash_add(details, "burst", xasprintf("%llu", 8ULL * hc->burst));
2795 shash_add(details, "priority", xasprintf("%u", hc->priority));
/* Configures queue 'queue_id' on 'netdev' from 'details'.
 *
 * The details are parsed into a struct htb_class, the kernel class with
 * handle 1:(queue_id + 1) under parent 1:fffe is set up with them, and the
 * cached copy is refreshed with htb_update_queue__().  The error checks
 * between these steps are not visible in this listing. */
2801 htb_class_set(struct netdev *netdev, unsigned int queue_id,
2802 const struct shash *details)
2804 struct htb_class hc;
2807 error = htb_parse_class_details__(netdev, details, &hc);
2812 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
2813 tc_make_handle(1, 0xfffe), &hc);
2818 htb_update_queue__(netdev, queue_id, &hc);
/* Deletes 'queue' from 'netdev': the kernel class 1:(queue_id + 1) is
 * deleted with tc_delete_class() and the cached entry is removed from the
 * 'queues' map.
 *
 * NOTE(review): whether the removal depends on the kernel request
 * succeeding, and where the entry is released, is not visible in this
 * listing. */
2823 htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
2825 struct htb_class *hc = htb_class_cast__(queue);
2826 struct htb *htb = htb_get__(netdev);
2829 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
2831 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
/* Fetches the counters of 'queue' on 'netdev' into 'stats' by querying
 * class 1:(queue_id + 1) under parent 1:fffe.  The HTB parameters are not
 * requested (a null pointer is passed for them).  Returns whatever
 * htb_query_class__() returns. */
2838 htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
2839 struct netdev_queue_stats *stats)
2841 return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
2842 tc_make_handle(1, 0xfffe), NULL, stats);
/* Handles one class message 'nlmsg' from a statistics dump.  'netdev' is
 * unused.
 *
 * The handle and counters are extracted with tc_parse_class().  When the
 * handle has the form 1:N with 1 <= N <= HTB_N_QUEUES, the callback 'cb' is
 * invoked with queue id N - 1, the counters, and 'aux'.  Other handles do
 * not reach the callback. */
2846 htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
2847 const struct ofpbuf *nlmsg,
2848 netdev_dump_queue_stats_cb *cb, void *aux)
2850 struct netdev_queue_stats stats;
2851 unsigned int handle, major, minor;
2854 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
2859 major = tc_get_major(handle);
2860 minor = tc_get_minor(handle);
2861 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2862 (*cb)(minor - 1, &stats, aux);
/* Operations table for the HTB traffic control class.  The initializer is
 * positional; each entry is labelled with the member it fills.  The entries
 * between 'n_queues' and the two statistics callbacks are not visible in
 * this listing. */
2867 static const struct tc_ops tc_ops_htb = {
2868 "htb", /* linux_name */
2869 "linux-htb", /* ovs_name */
2870 HTB_N_QUEUES, /* n_queues */
2879 htb_class_get_stats,
2880 htb_class_dump_stats
2883 /* "linux-hfsc" traffic control class. */
/* Number of queues advertised for HFSC, and the largest class minor number
 * treated as a queue: handles 1:1 through 1:HFSC_N_QUEUES correspond to
 * queue ids 0 through HFSC_N_QUEUES - 1. */
2885 #define HFSC_N_QUEUES 0xf000
2893 struct tc_queue tc_queue;
/* Returns the struct hfsc that embeds the 'tc' object currently attached to
 * the device of 'netdev'.
 *
 * NOTE(review): presumably only valid while that 'tc' object was installed
 * by hfsc_install__() - confirm against the callers. */
2898 static struct hfsc *
2899 hfsc_get__(const struct netdev *netdev)
2901 struct netdev_dev_linux *netdev_dev;
2902 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
2903 return CONTAINER_OF(netdev_dev->tc, struct hfsc, tc);
/* Returns the struct hfsc_class that embeds 'queue' as its 'tc_queue'
 * member. */
2906 static struct hfsc_class *
2907 hfsc_class_cast__(const struct tc_queue *queue)
2909 return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
/* Allocates a fresh struct hfsc, initializes its embedded 'tc' with the
 * HFSC operations table, stores 'max_rate' in it, and attaches it to the
 * device of 'netdev' by overwriting the 'tc' pointer of the device. */
2913 hfsc_install__(struct netdev *netdev, uint32_t max_rate)
2915 struct netdev_dev_linux * netdev_dev;
2918 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
2919 hfsc = xmalloc(sizeof *hfsc);
2920 tc_init(&hfsc->tc, &tc_ops_hfsc);
2921 hfsc->max_rate = max_rate;
2922 netdev_dev->tc = &hfsc->tc;
/* Records the rates in 'hc' as the cached state of queue 'queue_id' on
 * 'netdev'.
 *
 * The queue is looked up with tc_find_queue__().  The lines below either
 * reuse the entry that was found or allocate a new struct hfsc_class and
 * insert it into the 'queues' map of the HFSC state; the branch keywords
 * that select between the two are not visible in this listing.  Finally
 * min_rate and max_rate are copied from 'hc'. */
2926 hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
2927 const struct hfsc_class *hc)
2931 struct hfsc_class *hcp;
2932 struct tc_queue *queue;
2934 hfsc = hfsc_get__(netdev);
/* The same hash is used for the lookup and for the insertion. */
2935 hash = hash_int(queue_id, 0);
2937 queue = tc_find_queue__(netdev, queue_id, hash);
2939 hcp = hfsc_class_cast__(queue);
2941 hcp = xmalloc(sizeof *hcp);
2942 queue = &hcp->tc_queue;
2943 queue->queue_id = queue_id;
2944 hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
2947 hcp->min_rate = hc->min_rate;
2948 hcp->max_rate = hc->max_rate;
/* Decodes the HFSC service curves nested in 'nl_options' into 'class'.
 *
 * Three curves are read: TCA_HFSC_RSC, TCA_HFSC_FSC and TCA_HFSC_USC.  Only
 * the configuration that this file itself creates is accepted:
 *
 *   - every curve must be linear (m1 and d both 0),
 *   - the RSC and FSC curves must have the same m2,
 *   - the m2 of RSC must not exceed the m2 of USC.
 *
 * Each rejected case logs a rate-limited warning.  On acceptance min_rate is
 * taken from the m2 of FSC and max_rate from the m2 of USC. */
2952 hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
2954 const struct tc_service_curve *rsc, *fsc, *usc;
/* Each policy entry requires at least a full struct tc_service_curve. */
2955 static const struct nl_policy tca_hfsc_policy[] = {
2957 .type = NL_A_UNSPEC,
2959 .min_len = sizeof(struct tc_service_curve),
2962 .type = NL_A_UNSPEC,
2964 .min_len = sizeof(struct tc_service_curve),
2967 .type = NL_A_UNSPEC,
2969 .min_len = sizeof(struct tc_service_curve),
2972 struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];
2974 if (!nl_parse_nested(nl_options, tca_hfsc_policy,
2975 attrs, ARRAY_SIZE(tca_hfsc_policy))) {
2976 VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
2980 rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
2981 fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
2982 usc = nl_attr_get(attrs[TCA_HFSC_USC]);
/* A nonzero m1 or d means a two-segment curve. */
2984 if (rsc->m1 != 0 || rsc->d != 0 ||
2985 fsc->m1 != 0 || fsc->d != 0 ||
2986 usc->m1 != 0 || usc->d != 0) {
2987 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
2988 "Non-linear service curves are not supported.");
2992 if (rsc->m2 != fsc->m2) {
2993 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
2994 "Real-time service curves are not supported ");
2998 if (rsc->m2 > usc->m2) {
2999 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3000 "Min-rate service curve is greater than "
3001 "the max-rate service curve.");
3005 class->min_rate = fsc->m2;
3006 class->max_rate = usc->m2;
/* Parses the class message 'tcmsg' with tc_parse_class(), handing 'stats'
 * through to it.  A handle of the form 1:N with 1 <= N <= HFSC_N_QUEUES is
 * reported through 'queue_id' as queue N - 1, and the HFSC options are
 * decoded into 'options' with hfsc_parse_tca_options__().
 *
 * NOTE(review): the conditions that guard the 'queue_id' and 'options'
 * steps, and the handling of out-of-range handles, are not visible in this
 * listing. */
3011 hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
3012 struct hfsc_class *options,
3013 struct netdev_queue_stats *stats)
3016 unsigned int handle;
3017 struct nlattr *nl_options;
3019 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
3025 unsigned int major, minor;
3027 major = tc_get_major(handle);
3028 minor = tc_get_minor(handle);
/* Minor numbers are 1-based while queue ids are 0-based. */
3029 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3030 *queue_id = minor - 1;
3037 error = hfsc_parse_tca_options__(nl_options, options);
/* Asks the kernel about the class 'handle' under 'parent' on 'netdev' with
 * tc_query_class() and decodes the reply with hfsc_parse_tcmsg__(), storing
 * the rates into 'options' and the counters into 'stats'.  No queue id is
 * requested from the parser (a null pointer is passed for it). */
3044 hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
3045 unsigned int parent, struct hfsc_class *options,
3046 struct netdev_queue_stats *stats)
3049 struct ofpbuf *reply;
3051 error = tc_query_class(netdev, handle, parent, &reply);
3056 error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
/* The reply buffer is released here once it has been parsed. */
3057 ofpbuf_delete(reply);
/* Fills 'class' with the qdisc-wide rate limits described by 'details'.
 *
 * The "max-rate" value is parsed as a decimal number and divided by 8
 * (presumably converting bits/s to bytes/s).  The lines that query the
 * device features provide a fallback rate; the condition that selects the
 * fallback is not visible in this listing.  Both 'class->min_rate' and
 * 'class->max_rate' are set to the resulting rate. */
3062 hfsc_parse_qdisc_details__(struct netdev *netdev, const struct shash *details,
3063 struct hfsc_class *class)
3066 const char *max_rate_s;
3068 max_rate_s = shash_find_data(details, "max-rate");
3069 max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
/* Pass the address of 'current' so that the features can be stored in it;
 * the next line converts them to a bit rate. */
3074 netdev_get_features(netdev, &current, NULL, NULL, NULL);
3075 max_rate = netdev_features_to_bps(current) / 8;
3078 class->min_rate = max_rate;
3079 class->max_rate = max_rate;
/* Fills 'class' with the per-queue HFSC rates described by 'details'.
 *
 * "min-rate" and "max-rate" are parsed as decimal numbers and divided by 8
 * (presumably converting bits/s to bytes/s), then clamped:
 *
 *   - min_rate is raised to at least 1 and capped at the max_rate of the
 *     qdisc,
 *   - max_rate is raised to at least min_rate and capped at the max_rate of
 *     the qdisc.
 *
 * NOTE(review): the value used for max_rate when "max-rate" is absent is on
 * a line that is not visible in this listing.
 * NOTE(review): strtoull() is called with a null end pointer, so malformed
 * numbers are not detected here. */
3083 hfsc_parse_class_details__(struct netdev *netdev,
3084 const struct shash *details,
3085 struct hfsc_class * class)
3087 const struct hfsc *hfsc;
3088 uint32_t min_rate, max_rate;
3089 const char *min_rate_s, *max_rate_s;
3091 hfsc = hfsc_get__(netdev);
3092 min_rate_s = shash_find_data(details, "min-rate");
3093 max_rate_s = shash_find_data(details, "max-rate");
3095 min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
3096 min_rate = MAX(min_rate, 1);
3097 min_rate = MIN(min_rate, hfsc->max_rate);
3099 max_rate = (max_rate_s
3100 ? strtoull(max_rate_s, NULL, 10) / 8
3102 max_rate = MAX(max_rate, min_rate);
3103 max_rate = MIN(max_rate, hfsc->max_rate);
3105 class->min_rate = min_rate;
3106 class->max_rate = max_rate;
3111 /* Create an HFSC qdisc.
3113 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1".
 *
 * Any existing root qdisc is deleted first with tc_del_qdisc().  The new
 * qdisc is requested with NLM_F_EXCL | NLM_F_CREATE, handle 1:0 and parent
 * TC_H_ROOT.  Returns the result of tc_transact(). */
3115 hfsc_setup_qdisc__(struct netdev * netdev)
3117 struct tcmsg *tcmsg;
3118 struct ofpbuf request;
3119 struct tc_hfsc_qopt opt;
3121 tc_del_qdisc(netdev);
3123 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3124 NLM_F_EXCL | NLM_F_CREATE, &request);
3130 tcmsg->tcm_handle = tc_make_handle(1, 0);
3131 tcmsg->tcm_parent = TC_H_ROOT;
/* Start from an all-zero option block. */
3133 memset(&opt, 0, sizeof opt);
3136 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3137 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
3139 return tc_transact(&request, NULL);
3142 /* Create an HFSC class.
3144 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
3145 * sc rate <min_rate> ul rate <max_rate>"
 *
 * The request is an RTM_NEWTCLASS message with NLM_F_CREATE.  The same
 * curve, whose m2 is the min_rate of 'class', is sent as both TCA_HFSC_RSC
 * and TCA_HFSC_FSC; the curve whose m2 is the max_rate of 'class' is sent
 * as TCA_HFSC_USC.  A rate-limited warning describing the class is logged
 * on the failure path. */
3147 hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
3148 unsigned int parent, struct hfsc_class *class)
3152 struct tcmsg *tcmsg;
3153 struct ofpbuf request;
3154 struct tc_service_curve min, max;
3156 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
3162 tcmsg->tcm_handle = handle;
3163 tcmsg->tcm_parent = parent;
3167 min.m2 = class->min_rate;
3171 max.m2 = class->max_rate;
3173 nl_msg_put_string(&request, TCA_KIND, "hfsc");
/* The three curves are nested inside TCA_OPTIONS. */
3174 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3175 nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
3176 nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
3177 nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
3178 nl_msg_end_nested(&request, opt_offset);
3180 error = tc_transact(&request, NULL);
3182 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
3183 "min-rate %ubps, max-rate %ubps (%s)",
3184 netdev_get_name(netdev),
3185 tc_get_major(handle), tc_get_minor(handle),
3186 tc_get_major(parent), tc_get_minor(parent),
3187 class->min_rate, class->max_rate, strerror(error));
/* Installs HFSC on 'netdev' using the settings in 'details'.
 *
 * The visible steps are: set up the qdisc with hfsc_setup_qdisc__(), parse
 * the qdisc-wide limits from 'details', create class 1:fffe under parent 1:0
 * with those limits, and record the resulting max_rate with
 * hfsc_install__().  The error checks between these steps are not visible in
 * this listing. */
3194 hfsc_tc_install(struct netdev *netdev, const struct shash *details)
3197 struct hfsc_class class;
3199 error = hfsc_setup_qdisc__(netdev);
3205 hfsc_parse_qdisc_details__(netdev, details, &class);
3206 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3207 tc_make_handle(1, 0), &class);
3213 hfsc_install__(netdev, class.max_rate);
/* Rebuilds the HFSC state of 'netdev' from what the kernel reports.
 *
 * Class 1:fffe is queried for the qdisc-wide max_rate, which is passed to
 * hfsc_install__().  Then every class returned by the queue dump that parses
 * successfully is recorded with hfsc_update_queue__().  'nlmsg' is unused.
 *
 * NOTE(review): the result of hfsc_query_class__() is discarded on the line
 * that calls it, so 'hc.max_rate' is used whether or not the query
 * succeeded. */
3218 hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3221 struct nl_dump dump;
3222 struct hfsc_class hc;
3225 hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
3226 hfsc_install__(netdev, hc.max_rate);
3228 if (!start_queue_dump(netdev, &dump)) {
3232 while (nl_dump_next(&dump, &msg)) {
3233 unsigned int queue_id;
/* A zero return from the parser means the message was understood. */
3235 if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3236 hfsc_update_queue__(netdev, queue_id, &hc);
3240 nl_dump_done(&dump);
/* Tears down the HFSC state that embeds 'tc': every cached queue is removed
 * from the 'queues' map.  The SAFE iteration macro is used because nodes are
 * removed while the map is being walked.  Any release of the memory is not
 * visible in this listing. */
3245 hfsc_tc_destroy(struct tc *tc)
3248 struct hfsc_class *hc, *next;
3250 hfsc = CONTAINER_OF(tc, struct hfsc, tc);
3252 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
3253 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
/* Reports the qdisc-wide configuration of 'netdev' by adding a "max-rate"
 * entry to 'details'.  The value is the cached max_rate multiplied by 8,
 * formatted as a decimal string by xasprintf(). */
3262 hfsc_qdisc_get(const struct netdev *netdev, struct shash *details)
3264 const struct hfsc *hfsc;
3265 hfsc = hfsc_get__(netdev);
3266 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hfsc->max_rate));
/* Applies new qdisc-wide limits from 'details' to 'netdev': the limits are
 * parsed, class 1:fffe under parent 1:0 is set up again with them, and the
 * cached max_rate of the HFSC state is updated.
 *
 * NOTE(review): whether the cache update depends on the class setup
 * succeeding is not visible in this listing. */
3271 hfsc_qdisc_set(struct netdev *netdev, const struct shash *details)
3274 struct hfsc_class class;
3276 hfsc_parse_qdisc_details__(netdev, details, &class);
3277 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3278 tc_make_handle(1, 0), &class);
3281 hfsc_get__(netdev)->max_rate = class.max_rate;
/* Describes the cached rates of 'queue' by adding entries to 'details'.
 * 'netdev' is unused.
 *
 * "min-rate" is always added.  "max-rate" is added only when it differs from
 * the min_rate.  Both are multiplied by 8 before being formatted (the
 * reverse of the division by 8 applied when they are parsed). */
3288 hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
3289 const struct tc_queue *queue, struct shash *details)
3291 const struct hfsc_class *hc;
3293 hc = hfsc_class_cast__(queue);
3294 shash_add(details, "min-rate", xasprintf("%llu", 8ULL * hc->min_rate));
3295 if (hc->min_rate != hc->max_rate) {
3296 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hc->max_rate));
/* Configures queue 'queue_id' on 'netdev' from 'details'.
 *
 * The details are parsed into a struct hfsc_class, the kernel class with
 * handle 1:(queue_id + 1) under parent 1:fffe is set up with them, and the
 * cached copy is refreshed with hfsc_update_queue__().  The error checks
 * between these steps are not visible in this listing. */
3302 hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
3303 const struct shash *details)
3306 struct hfsc_class class;
3308 error = hfsc_parse_class_details__(netdev, details, &class);
3313 error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3314 tc_make_handle(1, 0xfffe), &class);
3319 hfsc_update_queue__(netdev, queue_id, &class);
/* Deletes 'queue' from 'netdev': the kernel class 1:(queue_id + 1) is
 * deleted with tc_delete_class() and the cached entry is removed from the
 * 'queues' map.
 *
 * NOTE(review): whether the removal depends on the kernel request
 * succeeding, and where the entry is released, is not visible in this
 * listing. */
3324 hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
3328 struct hfsc_class *hc;
3330 hc = hfsc_class_cast__(queue);
3331 hfsc = hfsc_get__(netdev);
3333 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3335 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
/* Fetches the counters of 'queue' on 'netdev' into 'stats' by querying
 * class 1:(queue_id + 1) under parent 1:fffe.  The HFSC rates are not
 * requested (a null pointer is passed for them).  Returns whatever
 * hfsc_query_class__() returns. */
3342 hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3343 struct netdev_queue_stats *stats)
3345 return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3346 tc_make_handle(1, 0xfffe), NULL, stats);
/* Handles one class message 'nlmsg' from a statistics dump.  'netdev' is
 * unused.
 *
 * The handle and counters are extracted with tc_parse_class().  When the
 * handle has the form 1:N with 1 <= N <= HFSC_N_QUEUES, the callback 'cb' is
 * invoked with queue id N - 1, the counters, and 'aux'.  Other handles do
 * not reach the callback. */
3350 hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3351 const struct ofpbuf *nlmsg,
3352 netdev_dump_queue_stats_cb *cb, void *aux)
3354 struct netdev_queue_stats stats;
3355 unsigned int handle, major, minor;
3358 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3363 major = tc_get_major(handle);
3364 minor = tc_get_minor(handle);
3365 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3366 (*cb)(minor - 1, &stats, aux);
/* Operations table for the HFSC traffic control class.  The initializer is
 * positional; each entry is labelled with the member it fills. */
3371 static const struct tc_ops tc_ops_hfsc = {
3372 "hfsc", /* linux_name */
3373 "linux-hfsc", /* ovs_name */
3374 HFSC_N_QUEUES, /* n_queues */
3375 hfsc_tc_install, /* tc_install */
3376 hfsc_tc_load, /* tc_load */
3377 hfsc_tc_destroy, /* tc_destroy */
3378 hfsc_qdisc_get, /* qdisc_get */
3379 hfsc_qdisc_set, /* qdisc_set */
3380 hfsc_class_get, /* class_get */
3381 hfsc_class_set, /* class_set */
3382 hfsc_class_delete, /* class_delete */
3383 hfsc_class_get_stats, /* class_get_stats */
3384 hfsc_class_dump_stats /* class_dump_stats */
3387 /* "linux-default" traffic control class.
3389 * This class represents the default, unnamed Linux qdisc. It corresponds to
3390 * the "" (empty string) QoS type in the OVS database. */
/* Attaches the traffic control object for the default class to the device
 * of 'netdev'.
 *
 * 'tc' has static storage duration, so the pointer keeps its value between
 * calls and the object it points to is shared by every device that installs
 * this class.
 *
 * NOTE(review): the condition that guards the allocation is not visible in
 * this listing; presumably it allocates only on the first call. */
3393 default_install__(struct netdev *netdev)
3395 struct netdev_dev_linux *netdev_dev =
3396 netdev_dev_linux_cast(netdev_get_dev(netdev));
3397 static struct tc *tc;
3400 tc = xmalloc(sizeof *tc);
3401 tc_init(tc, &tc_ops_default);
3403 netdev_dev->tc = tc;
/* Installs the default class on 'netdev' by calling default_install__().
 * 'details' is unused. */
3407 default_tc_install(struct netdev *netdev,
3408 const struct shash *details OVS_UNUSED)
3410 default_install__(netdev);
/* Loads the default class for 'netdev' by calling default_install__().
 * 'nlmsg' is unused. */
3415 default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3417 default_install__(netdev);
/* Operations table for the default traffic control class.  The initializer
 * is positional.  The kernel name and every entry from 'tc_destroy' onward
 * are NULL; the entries in between are not visible in this listing. */
3421 static const struct tc_ops tc_ops_default = {
3422 NULL, /* linux_name */
3427 NULL, /* tc_destroy */
3428 NULL, /* qdisc_get */
3429 NULL, /* qdisc_set */
3430 NULL, /* class_get */
3431 NULL, /* class_set */
3432 NULL, /* class_delete */
3433 NULL, /* class_get_stats */
3434 NULL /* class_dump_stats */
3437 /* "linux-other" traffic control class. */
/* Attaches the traffic control object for the "linux-other" class to the
 * device of 'netdev'.  'nlmsg' is unused.
 *
 * 'tc' has static storage duration, so the pointer keeps its value between
 * calls and the object it points to is shared by every device that loads
 * this class.
 *
 * NOTE(review): the condition that guards the allocation is not visible in
 * this listing; presumably it allocates only on the first call. */
3442 other_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3444 struct netdev_dev_linux *netdev_dev =
3445 netdev_dev_linux_cast(netdev_get_dev(netdev));
3446 static struct tc *tc;
3449 tc = xmalloc(sizeof *tc);
3450 tc_init(tc, &tc_ops_other);
3452 netdev_dev->tc = tc;
/* Operations table for the "linux-other" traffic control class.  The
 * initializer is positional.  There is no kernel name and no install hook,
 * and every entry from 'tc_destroy' onward is NULL; the 'n_queues' and
 * 'tc_load' entries are not visible in this listing. */
3456 static const struct tc_ops tc_ops_other = {
3457 NULL, /* linux_name */
3458 "linux-other", /* ovs_name */
3460 NULL, /* tc_install */
3462 NULL, /* tc_destroy */
3463 NULL, /* qdisc_get */
3464 NULL, /* qdisc_set */
3465 NULL, /* class_get */
3466 NULL, /* class_set */
3467 NULL, /* class_delete */
3468 NULL, /* class_get_stats */
3469 NULL /* class_dump_stats */
3472 /* Traffic control. */
3474 /* Number of kernel "tc" ticks per second. Derived in this file from the
 * fields a, b and c read from /proc/net/psched, as (double) a * c / b. */
3475 static double ticks_per_s;
3477 /* Number of kernel "jiffies" per second. This is used for the purpose of
3478 * computing buffer sizes. Generally kernel qdiscs need to be able to buffer
3479 * one jiffy's worth of data.
3481 * There are two possibilities here:
3483 * - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
3484 * approximate range of 100 to 1024. That means that we really need to
3485 * make sure that the qdisc can buffer that much data.
3487 * - 'buffer_hz' is an absurdly large number. That means that the kernel
3488 * has finely granular timers and there's no need to fudge additional room
3489 * for buffers. (There's no extra effort needed to implement that: the
3490 * large 'buffer_hz' is used as a divisor, so practically any number will
3491 * come out as 0 in the division. Small integer results in the case of
3492 * really high dividends won't have any real effect anyhow.) */
3494 static unsigned int buffer_hz;
3496 /* Returns tc handle 'major':'minor'. The major number is shifted into the
 * upper 16 bits before TC_H_MAKE() combines it with 'minor'. */
3498 tc_make_handle(unsigned int major, unsigned int minor)
3500 return TC_H_MAKE(major << 16, minor);
3503 /* Returns the major number from 'handle', that is, the TC_H_MAJ() part
 * of the handle shifted down by 16 bits. */
3505 tc_get_major(unsigned int handle)
3507 return TC_H_MAJ(handle) >> 16;
3510 /* Returns the minor number from 'handle', that is, its TC_H_MIN()
 * part. */
3512 tc_get_minor(unsigned int handle)
3514 return TC_H_MIN(handle);
/* Starts a traffic control Netlink request for 'netdev' in 'request'.
 *
 * The interface index of 'netdev' is looked up with get_ifindex().
 * 'request' is initialized with room for 512 bytes, a Netlink header of
 * message type 'type' with flags NLM_F_REQUEST | 'flags' is appended, and a
 * zeroed struct tcmsg follows it with the family set to AF_UNSPEC and the
 * interface index filled in.  The caller completes the handle and parent
 * fields, as noted at the end.
 *
 * NOTE(review): the handling of a get_ifindex() failure and the return
 * statement are not visible in this listing. */
3517 static struct tcmsg *
3518 tc_make_request(const struct netdev *netdev, int type, unsigned int flags,
3519 struct ofpbuf *request)
3521 struct tcmsg *tcmsg;
3525 error = get_ifindex(netdev, &ifindex);
3530 ofpbuf_init(request, 512);
3531 nl_msg_put_nlmsghdr(request, sizeof *tcmsg, type, NLM_F_REQUEST | flags);
3532 tcmsg = ofpbuf_put_zeros(request, sizeof *tcmsg);
3533 tcmsg->tcm_family = AF_UNSPEC;
3534 tcmsg->tcm_ifindex = ifindex;
3535 /* Caller should fill in tcmsg->tcm_handle. */
3536 /* Caller should fill in tcmsg->tcm_parent. */
/* Sends 'request' on the rtnetlink socket with nl_sock_transact(), passing
 * 'replyp' through for the reply.  The request buffer is released with
 * ofpbuf_uninit() right after the transaction, so callers must not use
 * 'request' afterward. */
3542 tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
3544 int error = nl_sock_transact(rtnl_sock, request, replyp);
3545 ofpbuf_uninit(request);
3549 /* Adds or deletes a root ingress qdisc on 'netdev'. We use this for
3550 * policing configuration.
3552 * This function is equivalent to running the following when 'add' is true:
3553 * /sbin/tc qdisc add dev <devname> handle ffff: ingress
3555 * This function is equivalent to running the following when 'add' is false:
3556 * /sbin/tc qdisc del dev <devname> handle ffff: ingress
3558 * The configuration and stats may be seen with the following command:
3559 * /sbin/tc -s qdisc show dev <devname>
3561 * Returns 0 if successful, otherwise a positive errno value. */
3564 tc_add_del_ingress_qdisc(struct netdev *netdev, bool add)
3566 struct ofpbuf request;
3567 struct tcmsg *tcmsg;
/* Adding uses RTM_NEWQDISC with NLM_F_EXCL | NLM_F_CREATE; deleting uses
 * RTM_DELQDISC with no extra flags. */
3569 int type = add ? RTM_NEWQDISC : RTM_DELQDISC;
3570 int flags = add ? NLM_F_EXCL | NLM_F_CREATE : 0;
3572 tcmsg = tc_make_request(netdev, type, flags, &request);
/* The ingress qdisc uses handle ffff:0 and the TC_H_INGRESS parent. */
3576 tcmsg->tcm_handle = tc_make_handle(0xffff, 0);
3577 tcmsg->tcm_parent = TC_H_INGRESS;
3578 nl_msg_put_string(&request, TCA_KIND, "ingress");
/* TCA_OPTIONS is sent with an empty payload. */
3579 nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0);
3581 error = tc_transact(&request, NULL);
3583 /* If we're deleting the qdisc, don't worry about some of the
3584 * error conditions. */
3585 if (!add && (error == ENOENT || error == EINVAL)) {
3594 /* Adds a policer to 'netdev' with a rate of 'kbits_rate' and a burst size
 * of 'kbits_burst'.
3597 * This function is equivalent to running:
3598 * /sbin/tc filter add dev <devname> parent ffff: protocol all prio 49
3599 * basic police rate <kbits_rate>kbit burst <kbits_burst>k
3602 * The configuration and stats may be seen with the following command:
3603 * /sbin/tc -s filter show dev <devname> parent ffff:
3605 * Returns 0 if successful, otherwise a positive errno value. */
3608 tc_add_policer(struct netdev *netdev, int kbits_rate, int kbits_burst)
3610 struct tc_police tc_police;
3611 struct ofpbuf request;
3612 struct tcmsg *tcmsg;
3613 size_t basic_offset;
3614 size_t police_offset;
/* Build the policer description: packets over the limit are dropped
 * (TC_POLICE_SHOT). */
3618 memset(&tc_police, 0, sizeof tc_police);
3619 tc_police.action = TC_POLICE_SHOT;
3620 tc_police.mtu = mtu;
/* Convert kbit/s to bytes/s.  The multiplication is done first and in a
 * 64-bit type: dividing by 8 first would discard up to 7 kbit/s (turning
 * rates below 8 kbit/s into 0), and multiplying in 'int' could overflow
 * for large rates. */
3621 tc_fill_rate(&tc_police.rate,
(long long int) kbits_rate * 1000 / 8, mtu);
/* The burst is handed to the kernel as a tick count at the policed rate. */
3622 tc_police.burst = tc_bytes_to_ticks(tc_police.rate.rate,
3623 kbits_burst * 1024);
3625 tcmsg = tc_make_request(netdev, RTM_NEWTFILTER,
3626 NLM_F_EXCL | NLM_F_CREATE, &request);
/* The filter hangs off the ingress qdisc ffff:0 with priority 49 and
 * matches every protocol (ETH_P_ALL). */
3630 tcmsg->tcm_parent = tc_make_handle(0xffff, 0);
3631 tcmsg->tcm_info = tc_make_handle(49,
3632 (OVS_FORCE uint16_t) htons(ETH_P_ALL));
3634 nl_msg_put_string(&request, TCA_KIND, "basic");
/* The policer is nested as TCA_OPTIONS > TCA_BASIC_POLICE. */
3635 basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3636 police_offset = nl_msg_start_nested(&request, TCA_BASIC_POLICE);
3637 nl_msg_put_unspec(&request, TCA_POLICE_TBF, &tc_police, sizeof tc_police);
3638 tc_put_rtab(&request, TCA_POLICE_RATE, &tc_police.rate);
3639 nl_msg_end_nested(&request, police_offset);
3640 nl_msg_end_nested(&request, basic_offset);
3642 error = tc_transact(&request, NULL);
3653 /* The values in psched are not individually very meaningful, but they are
3654 * important. The tables below show some values seen in the wild.
3658 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
3659 * (Before that, there are hints that it was 1000000000.)
3661 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
3665 * -----------------------------------
3666 * [1] 000c8000 000f4240 000f4240 00000064
3667 * [2] 000003e8 00000400 000f4240 3b9aca00
3668 * [3] 000003e8 00000400 000f4240 3b9aca00
3669 * [4] 000003e8 00000400 000f4240 00000064
3670 * [5] 000003e8 00000040 000f4240 3b9aca00
3671 * [6] 000003e8 00000040 000f4240 000000f9
3673 * a b c d ticks_per_s buffer_hz
3674 * ------- --------- ---------- ------------- ----------- -------------
3675 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
3676 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3677 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3678 * [4] 1,000 1,024 1,000,000 100 976,562 100
3679 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
3680 * [6] 1,000 64 1,000,000 249 15,625,000 249
3682 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
3683 * [2] 2.6.26-1-686-bigmem from Debian lenny
3684 * [3] 2.6.26-2-sparc64 from Debian lenny
3685 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
3686 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
3687 * [6] 2.6.34 from kernel.org on KVM */
/* Path of the file that holds the packet scheduler timing parameters. */
3689 static const char fn[] = "/proc/net/psched";
3690 unsigned int a, b, c, d;
3696 stream = fopen(fn, "r");
3698 VLOG_WARN("%s: open failed: %s", fn, strerror(errno));
/* The file is expected to contain four hexadecimal fields. */
3702 if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
3703 VLOG_WARN("%s: read failed", fn);
3707 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
3711 VLOG_WARN("%s: invalid scheduler parameters", fn);
/* NOTE(review): 'b' is the divisor here.  The validation that precedes this
 * line is not visible in this listing; confirm that it rejects b == 0. */
3715 ticks_per_s = (double) a * c / b;
3719 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
3722 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);
3725 /* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
3726 * rate of 'rate' bytes per second.
 *
 * The product is formed in 64 bits before the division.  'rate' * 'ticks'
 * does not fit in 32 bits for ordinary inputs (for example 125,000,000
 * bytes/s over 1,000 ticks), and an unsigned int product would silently
 * wrap before being divided by the tick rate. */
3728 tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
3733 return ((unsigned long long int) rate * ticks) / ticks_per_s;
3736 /* Returns the number of ticks that it would take to transmit 'size' bytes at a
3737 * rate of 'rate' bytes per second.
 *
 * Returns 0 when 'rate' is 0, which avoids a division by zero.  The cast
 * applies to 'ticks_per_s' alone, so its fractional part is dropped before
 * the multiplication, which is then carried out in unsigned long long. */
3739 tc_bytes_to_ticks(unsigned int rate, unsigned int size)
3744 return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
3747 /* Returns the number of bytes that need to be reserved for qdisc buffering at
3748 * a transmission rate of 'rate' bytes per second.
 *
 * The result is 'rate' divided by 'buffer_hz' using integer division.
 *
 * NOTE(review): assumes 'buffer_hz' is nonzero when this line runs; the
 * code that initializes it is not visible in this listing - confirm. */
3750 tc_buffer_per_jiffy(unsigned int rate)
3755 return rate / buffer_hz;
3758 /* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
3759 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
3760 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
3761 * stores NULL into it if it is absent.
3763 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
 * 'msg'.
3766 * Returns 0 if successful, otherwise a positive errno value. */
3768 tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
3769 struct nlattr **options)
/* TCA_KIND is mandatory; TCA_OPTIONS may be missing. */
3771 static const struct nl_policy tca_policy[] = {
3772 [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
3773 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
3775 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
/* The attributes start right after the Netlink header and the struct
 * tcmsg. */
3777 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3778 tca_policy, ta, ARRAY_SIZE(ta))) {
3779 VLOG_WARN_RL(&rl, "failed to parse qdisc message");
3784 *kind = nl_attr_get_string(ta[TCA_KIND]);
3788 *options = ta[TCA_OPTIONS];
3803 /* Given Netlink 'msg' that describes a class, extracts its class handle
3804 * (the 'tcm_handle' field of the struct tcmsg) into '*handlep', its
3805 * TCA_OPTIONS attribute into '*options', and its queue statistics into
3806 * '*stats'. Any of the output arguments may be null.
3808 * Returns 0 if successful, otherwise a positive errno value. */
3810 tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
3811 struct nlattr **options, struct netdev_queue_stats *stats)
/* Both TCA_OPTIONS and TCA_STATS2 are mandatory for a class message. */
3813 static const struct nl_policy tca_policy[] = {
3814 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
3815 [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
3817 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3819 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3820 tca_policy, ta, ARRAY_SIZE(ta))) {
3821 VLOG_WARN_RL(&rl, "failed to parse class message");
/* The struct tcmsg sits immediately after the Netlink header. */
3826 struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
3827 *handlep = tc->tcm_handle;
3831 *options = ta[TCA_OPTIONS];
3835 const struct gnet_stats_queue *gsq;
3836 struct gnet_stats_basic gsb;
/* Within TCA_STATS2 both the basic and the queue statistics are required. */
3838 static const struct nl_policy stats_policy[] = {
3839 [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
3840 .min_len = sizeof gsb },
3841 [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
3842 .min_len = sizeof *gsq },
3844 struct nlattr *sa[ARRAY_SIZE(stats_policy)];
3846 if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
3847 sa, ARRAY_SIZE(sa))) {
3848 VLOG_WARN_RL(&rl, "failed to parse class stats");
3852 /* Alignment issues screw up the length of struct gnet_stats_basic on
3853 * some arch/bitsize combinations. Newer versions of Linux have a
3854 * struct gnet_stats_basic_packed, but we can't depend on that. The
3855 * easiest thing to do is just to make a copy. */
3856 memset(&gsb, 0, sizeof gsb);
3857 memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
3858 MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
3859 stats->tx_bytes = gsb.bytes;
3860 stats->tx_packets = gsb.packets;
/* Drops reported by the kernel are surfaced as transmit errors. */
3862 gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
3863 stats->tx_errors = gsq->drops;
3873 memset(stats, 0, sizeof *stats);
3878 /* Queries the kernel for class with identifier 'handle' and parent 'parent'
 * on 'netdev'. The reply, if any, is stored in '*replyp'. */
3881 tc_query_class(const struct netdev *netdev,
3882 unsigned int handle, unsigned int parent,
3883 struct ofpbuf **replyp)
3885 struct ofpbuf request;
3886 struct tcmsg *tcmsg;
/* An RTM_GETTCLASS request with NLM_F_ECHO, addressed by handle and
 * parent. */
3889 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO, &request);
3893 tcmsg->tcm_handle = handle;
3894 tcmsg->tcm_parent = parent;
3896 error = tc_transact(&request, replyp);
/* The failure path logs a rate-limited warning that names the device, the
 * class and its parent. */
3898 VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
3899 netdev_get_name(netdev),
3900 tc_get_major(handle), tc_get_minor(handle),
3901 tc_get_major(parent), tc_get_minor(parent),
3907 /* Equivalent to "tc class del dev <name> handle <handle>".
 *
 * Sends an RTM_DELTCLASS request for 'handle' on 'netdev' with the parent
 * field set to 0.  No reply is requested.  The failure path logs a
 * rate-limited warning that names the device and the class. */
3909 tc_delete_class(const struct netdev *netdev, unsigned int handle)
3911 struct ofpbuf request;
3912 struct tcmsg *tcmsg;
3915 tcmsg = tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
3919 tcmsg->tcm_handle = handle;
3920 tcmsg->tcm_parent = 0;
3922 error = tc_transact(&request, NULL);
3924 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
3925 netdev_get_name(netdev),
3926 tc_get_major(handle), tc_get_minor(handle),
/* tc_del_qdisc(): sends an RTM_DELQDISC request for the qdisc whose handle is
 * tc_make_handle(1, 0) and whose parent is TC_H_ROOT.  When the transaction
 * ends without error and 'netdev' has a cached traffic-control object
 * (netdev_dev->tc), that object is passed to its ops->tc_destroy callback,
 * if one is set, and the cached pointer is reset to NULL. */
3932 /* Equivalent to "tc qdisc del dev <name> root". */
3934 tc_del_qdisc(struct netdev *netdev)
3936 struct netdev_dev_linux *netdev_dev =
3937 netdev_dev_linux_cast(netdev_get_dev(netdev));
3938 struct ofpbuf request;
3939 struct tcmsg *tcmsg;
3942 tcmsg = tc_make_request(netdev, RTM_DELQDISC, 0, &request);
3946 tcmsg->tcm_handle = tc_make_handle(1, 0);
3947 tcmsg->tcm_parent = TC_H_ROOT;
/* No reply buffer is requested for the delete. */
3949 error = tc_transact(&request, NULL);
3950 if (error == EINVAL) {
3951 /* EINVAL probably means that the default qdisc was in use, in which
3952 * case we've accomplished our purpose. */
/* NOTE(review): this branch presumably clears 'error' so that the cleanup
 * below also runs for EINVAL -- confirm. */
3955 if (!error && netdev_dev->tc) {
/* tc_destroy is optional; it is only called when the ops table sets it. */
3956 if (netdev_dev->tc->ops->tc_destroy) {
3957 netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
/* Forget the cached object whether or not a destroy callback existed. */
3959 netdev_dev->tc = NULL;
/* tc_query_qdisc(): checks netdev_dev->tc first; the request below is for
 * the case where no traffic-control object is cached yet.  It asks the
 * kernel for the qdisc with handle tc_make_handle(1, 0) on 'netdev', selects
 * a 'struct tc_ops' (tc_ops_other, tc_ops_default, or whatever
 * tc_lookup_linux_name() returns for the qdisc's kind), and calls that
 * implementation's tc_load callback with the reply.  The return value is the
 * query error when that is nonzero, otherwise tc_load's result. */
3964 /* If 'netdev''s qdisc type and parameters are not yet known, queries the
3965 * kernel to determine what they are. Returns 0 if successful, otherwise a
3966 * positive errno value. */
3968 tc_query_qdisc(const struct netdev *netdev)
3970 struct netdev_dev_linux *netdev_dev =
3971 netdev_dev_linux_cast(netdev_get_dev(netdev));
3972 struct ofpbuf request, *qdisc;
3973 const struct tc_ops *ops;
3974 struct tcmsg *tcmsg;
/* NOTE(review): presumably an early return when the qdisc is already known
 * -- confirm. */
3978 if (netdev_dev->tc) {
3982 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
3983 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
3984 * 2.6.35 without that fix backported to it.
3986 * To avoid the OOPS, we must not make a request that would attempt to dump
3987 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
3988 * few others. There are a few ways that I can see to do this, but most of
3989 * them seem to be racy (and if you lose the race the kernel OOPSes). The
3990 * technique chosen here is to assume that any non-default qdisc that we
3991 * create will have a class with handle 1:0. The built-in qdiscs only have
3992 * a class with handle 0:0.
3994 * We could check for Linux 2.6.35+ and use a more straightforward method
3996 tcmsg = tc_make_request(netdev, RTM_GETQDISC, NLM_F_ECHO, &request);
4000 tcmsg->tcm_handle = tc_make_handle(1, 0);
4001 tcmsg->tcm_parent = 0;
4003 /* Figure out what tc class to instantiate. */
4004 error = tc_transact(&request, &qdisc);
/* Only the qdisc's kind is extracted here; the last argument is NULL. */
4008 error = tc_parse_qdisc(qdisc, &kind, NULL);
4010 ops = &tc_ops_other;
4012 ops = tc_lookup_linux_name(kind);
/* This message has its own rate limiter, separate from 'rl'. */
4014 static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
4015 VLOG_INFO_RL(&rl2, "unknown qdisc \"%s\"", kind);
4017 ops = &tc_ops_other;
4020 } else if (error == ENOENT) {
4021 /* Either it's a built-in qdisc, or it's a qdisc set up by some
4022 * other entity that doesn't have a handle 1:0. We will assume
4023 * that it's the system default qdisc. */
4024 ops = &tc_ops_default;
4027 /* Who knows? Maybe the device got deleted. */
4028 VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
4029 netdev_get_name(netdev), strerror(error));
4030 ops = &tc_ops_other;
4033 /* Instantiate it. */
/* The const qualifier on 'netdev' is cast away for the tc_load callback. */
4034 load_error = ops->tc_load((struct netdev *) netdev, qdisc);
/* tc_load must leave netdev_dev->tc set exactly when it reports success. */
4035 assert((load_error == 0) == (netdev_dev->tc != NULL));
4036 ofpbuf_delete(qdisc);
/* A query error takes precedence over the load result. */
4038 return error ? error : load_error;
/* tc_calc_cell_log(): widens 'mtu' by ETH_HEADER_LEN + VLAN_HEADER_LEN and
 * then counts loop iterations in 'cell_log' for as long as 'mtu' remains at
 * least 256.
 *
 * NOTE(review): ETH_PAYLOAD_MAX looks like a fallback for an unspecified
 * 'mtu', and the loop body presumably halves 'mtu' on each pass -- confirm
 * both. */
4041 /* Linux traffic control uses tables with 256 entries ("rtab" tables) to
4042 approximate the time to transmit packets of various lengths. For an MTU of
4043 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
4044 represents two possible packet lengths; for a MTU of 513 through 1024, four
4045 possible lengths; and so on.
4047 Returns, for the specified 'mtu', the number of bits that packet lengths
4048 need to be shifted right to fit within such a 256-entry table. */
4050 tc_calc_cell_log(unsigned int mtu)
4055 mtu = ETH_PAYLOAD_MAX;
/* Account for the Ethernet and VLAN headers on top of the payload size. */
4057 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
4059 for (cell_log = 0; mtu >= 256; cell_log++) {
/* tc_fill_rate(): zeroes '*rate', then sets cell_log from
 * tc_calc_cell_log(mtu) and mpu to ETH_TOTAL_MIN.  The 'overhead' and
 * 'cell_align' members are never assigned by name (those assignments are
 * commented out), so they keep the zero written by memset().
 *
 * NOTE(review): 'Bps' is presumably stored in rate->rate -- confirm. */
4066 /* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
4069 tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
4071 memset(rate, 0, sizeof *rate);
4072 rate->cell_log = tc_calc_cell_log(mtu);
4073 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
4074 /* rate->cell_align = 0; */ /* distro headers. */
4075 rate->mpu = ETH_TOTAL_MIN;
/* tc_put_rtab(): reserves TC_RTAB_SIZE bytes of attribute payload in 'msg'
 * with nl_msg_put_unspec_uninit() and fills every slot 'i' with
 * tc_bytes_to_ticks(rate->rate, size), where size is
 * (i + 1) << rate->cell_log, raised to rate->mpu when it would be smaller. */
4079 /* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
4080 * attribute of the specified "type".
4082 * See tc_calc_cell_log() above for a description of "rtab"s. */
4084 tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
/* The payload is returned uninitialized; the loop below writes every slot. */
4089 rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
4090 for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
4091 unsigned packet_size = (i + 1) << rate->cell_log;
/* Never compute a transmit time for less than the minimum packet unit. */
4092 if (packet_size < rate->mpu) {
4093 packet_size = rate->mpu;
4095 rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
/* tc_calc_buffer(): converts the larger of 'burst_bytes' and
 * tc_buffer_per_jiffy(Bps) + mtu into ticks at rate 'Bps' by calling
 * tc_bytes_to_ticks(), and returns that result. */
4099 /* Calculates the proper value of 'buffer' or 'cbuffer' in HTB options given a
4100 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
4101 * burst size of 'burst_bytes'. (If no value was requested, a 'burst_bytes' of
4104 tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
4106 unsigned int min_burst = tc_buffer_per_jiffy(Bps) + mtu;
4107 return tc_bytes_to_ticks(Bps, MAX(burst_bytes, min_burst));
4110 /* Linux-only functions declared in netdev-linux.h */
/* netdev_linux_get_af_inet_sock(): runs netdev_linux_init() first.  A
 * nonzero result from it is returned negated; otherwise the value of
 * 'af_inet_sock' is returned. */
4112 /* Returns a fd for an AF_INET socket or a negative errno value. */
4114 netdev_linux_get_af_inet_sock(void)
4116 int error = netdev_linux_init();
4117 return error ? -error : af_inet_sock;
/* netdev_linux_ethtool_set_flag(): makes three ethtool requests through
 * netdev_linux_do_ethtool(): ETHTOOL_GFLAGS to read the current flags,
 * ETHTOOL_SFLAGS to write them back with 'flag' set or cleared, and
 * ETHTOOL_GFLAGS again to read the result.  If the value read back differs
 * from the value that was written, a rate-limited warning naming
 * 'flag_name' and the device is logged.
 *
 * NOTE(review): each request's 'error' is presumably checked and returned
 * on failure -- confirm. */
4120 /* Modifies the 'flag' bit in ethtool's flags field for 'netdev'. If
4121 * 'enable' is true, the bit is set. Otherwise, it is cleared. */
4123 netdev_linux_ethtool_set_flag(struct netdev *netdev, uint32_t flag,
4124 const char *flag_name, bool enable)
4126 const char *netdev_name = netdev_get_name(netdev);
4127 struct ethtool_value evalue;
/* First request: read the current flags into 'evalue'. */
4131 memset(&evalue, 0, sizeof evalue);
4132 error = netdev_linux_do_ethtool(netdev_name,
4133 (struct ethtool_cmd *)&evalue,
4134 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
/* Replace only the 'flag' bits; every other bit keeps the value just read.
 * 'new_flags' remembers what is being written for the comparison below. */
4139 evalue.data = new_flags = (evalue.data & ~flag) | (enable ? flag : 0);
4140 error = netdev_linux_do_ethtool(netdev_name,
4141 (struct ethtool_cmd *)&evalue,
4142 ETHTOOL_SFLAGS, "ETHTOOL_SFLAGS");
/* Third request: read the flags back so they can be compared with
 * 'new_flags'. */
4147 memset(&evalue, 0, sizeof evalue);
4148 error = netdev_linux_do_ethtool(netdev_name,
4149 (struct ethtool_cmd *)&evalue,
4150 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
4155 if (new_flags != evalue.data) {
4156 VLOG_WARN_RL(&rl, "attempt to %s ethtool %s flag on network "
4157 "device %s failed", enable ? "enable" : "disable",
4158 flag_name, netdev_name);
4165 /* Utility functions. */
/* netdev_stats_from_rtnl_link_stats(): member-by-member assignment from the
 * counters in 'src' to the members of 'dst'.  Every member assigned below
 * has the same name in both structures, and 'src' is only read. */
4167 /* Copies 'src' into 'dst', performing format conversion in the process. */
4169 netdev_stats_from_rtnl_link_stats(struct netdev_stats *dst,
4170 const struct rtnl_link_stats *src)
/* Packet, byte, error and drop totals for each direction. */
4172 dst->rx_packets = src->rx_packets;
4173 dst->tx_packets = src->tx_packets;
4174 dst->rx_bytes = src->rx_bytes;
4175 dst->tx_bytes = src->tx_bytes;
4176 dst->rx_errors = src->rx_errors;
4177 dst->tx_errors = src->tx_errors;
4178 dst->rx_dropped = src->rx_dropped;
4179 dst->tx_dropped = src->tx_dropped;
4180 dst->multicast = src->multicast;
4181 dst->collisions = src->collisions;
/* Detailed receive error counters. */
4182 dst->rx_length_errors = src->rx_length_errors;
4183 dst->rx_over_errors = src->rx_over_errors;
4184 dst->rx_crc_errors = src->rx_crc_errors;
4185 dst->rx_frame_errors = src->rx_frame_errors;
4186 dst->rx_fifo_errors = src->rx_fifo_errors;
4187 dst->rx_missed_errors = src->rx_missed_errors;
/* Detailed transmit error counters. */
4188 dst->tx_aborted_errors = src->tx_aborted_errors;
4189 dst->tx_carrier_errors = src->tx_carrier_errors;
4190 dst->tx_fifo_errors = src->tx_fifo_errors;
4191 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
4192 dst->tx_window_errors = src->tx_window_errors;
/* get_stats_via_netlink(): sends an RTM_GETLINK request for interface index
 * 'ifindex' over 'rtnl_sock', parses the reply's attributes against
 * 'rtnlgrp_link_policy', and converts the IFLA_STATS attribute into '*stats'
 * with netdev_stats_from_rtnl_link_stats().  The reply buffer is deleted
 * after a parse failure, after a reply that lacks IFLA_STATS (which also
 * logs a rate-limited warning), and after a successful conversion.
 *
 * NOTE(review): the values returned on each path are presumably 0 or a
 * positive errno -- confirm. */
4196 get_stats_via_netlink(int ifindex, struct netdev_stats *stats)
4198 /* Policy for RTNLGRP_LINK messages.
4200 * There are *many* more fields in these messages, but currently we only
4201 * care about these fields. */
4202 static const struct nl_policy rtnlgrp_link_policy[] = {
4203 [IFLA_IFNAME] = { .type = NL_A_STRING, .optional = false },
4204 [IFLA_STATS] = { .type = NL_A_UNSPEC, .optional = true,
4205 .min_len = sizeof(struct rtnl_link_stats) },
4208 struct ofpbuf request;
4209 struct ofpbuf *reply;
4210 struct ifinfomsg *ifi;
4211 struct nlattr *attrs[ARRAY_SIZE(rtnlgrp_link_policy)];
/* Build the request: a Netlink header followed by a zeroed ifinfomsg that
 * identifies the interface by index. */
4214 ofpbuf_init(&request, 0);
4215 nl_msg_put_nlmsghdr(&request, sizeof *ifi, RTM_GETLINK, NLM_F_REQUEST);
4216 ifi = ofpbuf_put_zeros(&request, sizeof *ifi);
4217 ifi->ifi_family = PF_UNSPEC;
4218 ifi->ifi_index = ifindex;
4219 error = nl_sock_transact(rtnl_sock, &request, &reply);
/* The request buffer is released as soon as the transaction has returned. */
4220 ofpbuf_uninit(&request);
/* Attributes start after the Netlink header and the ifinfomsg. */
4225 if (!nl_policy_parse(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg),
4226 rtnlgrp_link_policy,
4227 attrs, ARRAY_SIZE(rtnlgrp_link_policy))) {
4228 ofpbuf_delete(reply);
/* IFLA_STATS is optional in the policy, so its presence is checked
 * separately here. */
4232 if (!attrs[IFLA_STATS]) {
4233 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
4234 ofpbuf_delete(reply);
4238 netdev_stats_from_rtnl_link_stats(stats, nl_attr_get(attrs[IFLA_STATS]));
4240 ofpbuf_delete(reply);
/* get_stats_via_proc(): reads "/proc/net/dev" line by line, scanning each
 * line's counters as 64-bit unsigned values directly into members of
 * '*stats'.  A line whose scan does not yield exactly 15 conversions is
 * reported as a parse error.  For the line whose device name equals
 * 'netdev_name', seven further counters are set to UINT64_MAX.  Warnings are
 * logged when the file cannot be opened and when no stats for 'netdev_name'
 * are found.
 *
 * NOTE(review): UINT64_MAX presumably marks counters that /proc/net/dev does
 * not supply -- confirm. */
4246 get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats)
4248 static const char fn[] = "/proc/net/dev";
4253 stream = fopen(fn, "r");
4255 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
4260 while (fgets(line, sizeof line, stream)) {
/* X64 scans one uint64_t; each "%*u" in the format consumes a field without
 * storing it. */
4263 #define X64 "%"SCNu64
4266 X64 X64 X64 X64 X64 X64 X64 "%*u"
4267 X64 X64 X64 X64 X64 X64 X64 "%*u",
4273 &stats->rx_fifo_errors,
4274 &stats->rx_frame_errors,
4280 &stats->tx_fifo_errors,
4282 &stats->tx_carrier_errors) != 15) {
4283 VLOG_WARN_RL(&rl, "%s:%d: parse error", fn, ln);
4284 } else if (!strcmp(devname, netdev_name)) {
4285 stats->rx_length_errors = UINT64_MAX;
4286 stats->rx_over_errors = UINT64_MAX;
4287 stats->rx_crc_errors = UINT64_MAX;
4288 stats->rx_missed_errors = UINT64_MAX;
4289 stats->tx_aborted_errors = UINT64_MAX;
4290 stats->tx_heartbeat_errors = UINT64_MAX;
4291 stats->tx_window_errors = UINT64_MAX;
4297 VLOG_WARN_RL(&rl, "%s: no stats for %s", fn, netdev_name);
/* get_flags(): issues SIOCGIFFLAGS for 'dev->name' through
 * netdev_linux_do_ioctl() and copies the resulting ifr_flags into '*flags'.
 *
 * NOTE(review): the return value is presumably the ioctl helper's result in
 * 'error' -- confirm. */
4303 get_flags(const struct netdev_dev *dev, unsigned int *flags)
4309 error = netdev_linux_do_ioctl(dev->name, &ifr, SIOCGIFFLAGS,
4312 *flags = ifr.ifr_flags;
/* set_flags(): stores 'flags' in an ifreq's ifr_flags member and returns the
 * result of the SIOCSIFFLAGS request made through netdev_linux_do_ioctl()
 * for the name of 'netdev'. */
4318 set_flags(struct netdev *netdev, unsigned int flags)
4322 ifr.ifr_flags = flags;
4323 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, SIOCSIFFLAGS,
/* do_get_ifindex(): asks the kernel for the interface index of 'netdev_name'
 * with ioctl(SIOCGIFINDEX) on 'af_inet_sock', after incrementing the
 * netdev_get_ifindex coverage counter.  The index found in ifr_ifindex is
 * returned; a failed ioctl logs a rate-limited warning with the errno
 * description.
 *
 * NOTE(review): the value returned after a failed ioctl is presumably a
 * negative errno -- confirm. */
4328 do_get_ifindex(const char *netdev_name)
4332 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4333 COVERAGE_INC(netdev_get_ifindex);
4334 if (ioctl(af_inet_sock, SIOCGIFINDEX, &ifr) < 0) {
4335 VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
4336 netdev_name, strerror(errno));
4339 return ifr.ifr_ifindex;
/* get_ifindex(): stores the interface index of 'netdev_' in '*ifindexp'.
 * The index cached in the netdev_dev is used when VALID_IFINDEX is set in
 * 'cache_valid'; otherwise do_get_ifindex() is called and its answer is
 * cached together with the VALID_IFINDEX bit. */
4343 get_ifindex(const struct netdev *netdev_, int *ifindexp)
4345 struct netdev_dev_linux *netdev_dev =
4346 netdev_dev_linux_cast(netdev_get_dev(netdev_));
4348 if (!(netdev_dev->cache_valid & VALID_IFINDEX)) {
4349 int ifindex = do_get_ifindex(netdev_get_name(netdev_));
/* NOTE(review): a failed lookup presumably returns before this point so that
 * only a usable index is cached -- confirm. */
4353 netdev_dev->cache_valid |= VALID_IFINDEX;
4354 netdev_dev->ifindex = ifindex;
4356 *ifindexp = netdev_dev->ifindex;
/* get_etheraddr(): fetches the hardware address of 'netdev_name' with
 * ioctl(SIOCGIFHWADDR) on 'af_inet_sock' and copies the first ETH_ADDR_LEN
 * bytes of the returned sa_data into 'ea'.  A failed ioctl is logged at INFO
 * level for ENODEV and at ERR level for any other errno.  An address family
 * other than AF_UNSPEC or ARPHRD_ETHER produces a warning.
 *
 * NOTE(review): the function presumably returns errno after a failed ioctl
 * and 0 otherwise -- confirm. */
4361 get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN])
/* The request starts out zeroed; only the interface name is filled in. */
4366 memset(&ifr, 0, sizeof ifr);
4367 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4368 COVERAGE_INC(netdev_get_hwaddr);
4369 if (ioctl(af_inet_sock, SIOCGIFHWADDR, &ifr) < 0) {
4370 /* ENODEV probably means that a vif disappeared asynchronously and
4371 * hasn't been removed from the database yet, so reduce the log level
4372 * to INFO for that case. */
4373 VLOG(errno == ENODEV ? VLL_INFO : VLL_ERR,
4374 "ioctl(SIOCGIFHWADDR) on %s device failed: %s",
4375 netdev_name, strerror(errno));
4378 hwaddr_family = ifr.ifr_hwaddr.sa_family;
4379 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER) {
4380 VLOG_WARN("%s device has unknown hardware address family %d",
4381 netdev_name, hwaddr_family);
4383 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
/* set_etheraddr(): fills a zeroed ifreq with 'netdev_name', 'hwaddr_family'
 * and the ETH_ADDR_LEN bytes of 'mac', increments the netdev_set_hwaddr
 * coverage counter, and applies the address with ioctl(SIOCSIFHWADDR) on
 * 'af_inet_sock'.  A failed ioctl is logged at ERR level with the errno
 * description.
 *
 * NOTE(review): the function presumably returns errno after a failed ioctl
 * and 0 otherwise -- confirm. */
4388 set_etheraddr(const char *netdev_name, int hwaddr_family,
4389 const uint8_t mac[ETH_ADDR_LEN])
4393 memset(&ifr, 0, sizeof ifr);
4394 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4395 ifr.ifr_hwaddr.sa_family = hwaddr_family;
4396 memcpy(ifr.ifr_hwaddr.sa_data, mac, ETH_ADDR_LEN);
4397 COVERAGE_INC(netdev_set_hwaddr);
4398 if (ioctl(af_inet_sock, SIOCSIFHWADDR, &ifr) < 0) {
4399 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
4400 netdev_name, strerror(errno));
/* netdev_linux_do_ethtool(): points a zeroed ifreq named 'name' at 'ecmd'
 * through ifr_data, increments the netdev_ethtool coverage counter, and
 * issues ioctl(SIOCETHTOOL) on 'af_inet_sock'.  A failure is logged as a
 * rate-limited warning that quotes 'cmd_name', except when errno is
 * EOPNOTSUPP, which is deliberately left unlogged.
 *
 * NOTE(review): 'cmd' is presumably written into 'ecmd' before the ioctl,
 * and the function presumably returns 0 or errno -- confirm both. */
4407 netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
4408 int cmd, const char *cmd_name)
4412 memset(&ifr, 0, sizeof ifr);
4413 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
/* The ethtool command structure is passed to the kernel by pointer. */
4414 ifr.ifr_data = (caddr_t) ecmd;
4417 COVERAGE_INC(netdev_ethtool);
4418 if (ioctl(af_inet_sock, SIOCETHTOOL, &ifr) == 0) {
4421 if (errno != EOPNOTSUPP) {
4422 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
4423 "failed: %s", cmd_name, name, strerror(errno));
4425 /* The device doesn't support this operation. That's pretty
4426 * common, so there's no point in logging anything. */
/* netdev_linux_do_ioctl(): copies 'name' into ifr->ifr_name and performs
 * ioctl 'cmd' on 'af_inet_sock' with 'ifr' as its argument.  When the ioctl
 * returns -1, a rate-limited debug message containing 'name' and 'cmd_name'
 * is logged.
 *
 * NOTE(review): the function presumably returns errno after a failed ioctl
 * and 0 otherwise -- confirm. */
4433 netdev_linux_do_ioctl(const char *name, struct ifreq *ifr, int cmd,
4434 const char *cmd_name)
/* The caller's ifreq is reused; only its name member is overwritten here. */
4436 ovs_strzcpy(ifr->ifr_name, name, sizeof ifr->ifr_name);
4437 if (ioctl(af_inet_sock, cmd, ifr) == -1) {
4438 VLOG_DBG_RL(&rl, "%s: ioctl(%s) failed: %s", name, cmd_name,
/* netdev_linux_get_ipv4(): marks an ifreq's address as AF_INET, runs the
 * ioctl 'cmd' for the name of 'netdev' through netdev_linux_do_ioctl(), and
 * copies sin_addr from the ifreq's address, viewed as a sockaddr_in, into
 * '*ip'.
 *
 * NOTE(review): the copy into '*ip' presumably happens only when the ioctl
 * succeeded, and 'error' is presumably returned -- confirm both. */
4446 netdev_linux_get_ipv4(const struct netdev *netdev, struct in_addr *ip,
4447 int cmd, const char *cmd_name)
4452 ifr.ifr_addr.sa_family = AF_INET;
4453 error = netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, cmd, cmd_name);
/* ifr_addr is a generic sockaddr; view it as the IPv4 form to reach the
 * address. */
4455 const struct sockaddr_in *sin = (struct sockaddr_in *) &ifr.ifr_addr;
4456 *ip = sin->sin_addr;
4461 /* Returns an AF_PACKET raw socket or a negative errno value. */
4463 af_packet_sock(void)
4465 static int sock = INT_MIN;
4467 if (sock == INT_MIN) {
4468 sock = socket(AF_PACKET, SOCK_RAW, 0);
4470 set_nonblocking(sock);
4473 VLOG_ERR("failed to create packet socket: %s", strerror(errno));