2 * Copyright (c) 2009, 2010, 2011, 2012 Nicira Networks.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
19 #include "netdev-linux.h"
24 #include <arpa/inet.h>
26 #include <linux/gen_stats.h>
27 #include <linux/if_ether.h>
28 #include <linux/if_tun.h>
30 #include <linux/types.h>
31 #include <linux/ethtool.h>
32 #include <linux/mii.h>
33 #include <linux/pkt_cls.h>
34 #include <linux/pkt_sched.h>
35 #include <linux/rtnetlink.h>
36 #include <linux/sockios.h>
37 #include <linux/version.h>
38 #include <sys/types.h>
39 #include <sys/ioctl.h>
40 #include <sys/socket.h>
41 #include <netpacket/packet.h>
43 #include <net/if_arp.h>
44 #include <net/if_packet.h>
45 #include <net/route.h>
46 #include <netinet/in.h>
53 #include "dpif-linux.h"
54 #include "dynamic-string.h"
55 #include "fatal-signal.h"
58 #include "netdev-provider.h"
59 #include "netdev-vport.h"
61 #include "netlink-notifier.h"
62 #include "netlink-socket.h"
64 #include "openflow/openflow.h"
66 #include "poll-loop.h"
67 #include "rtnetlink-link.h"
68 #include "socket-util.h"
74 VLOG_DEFINE_THIS_MODULE(netdev_linux);
76 COVERAGE_DEFINE(netdev_set_policing);
77 COVERAGE_DEFINE(netdev_arp_lookup);
78 COVERAGE_DEFINE(netdev_get_ifindex);
79 COVERAGE_DEFINE(netdev_get_hwaddr);
80 COVERAGE_DEFINE(netdev_set_hwaddr);
81 COVERAGE_DEFINE(netdev_ethtool);
84 /* These were introduced in Linux 2.6.14, so they might be missing if we have
 * old headers. */
86 #ifndef ADVERTISED_Pause
87 #define ADVERTISED_Pause (1 << 13)
89 #ifndef ADVERTISED_Asym_Pause
90 #define ADVERTISED_Asym_Pause (1 << 14)
93 /* These were introduced in Linux 2.6.24, so they might be missing if we
94 * have old headers. */
95 #ifndef ETHTOOL_GFLAGS
96 #define ETHTOOL_GFLAGS 0x00000025 /* Get flags bitmap(ethtool_value) */
98 #ifndef ETHTOOL_SFLAGS
99 #define ETHTOOL_SFLAGS 0x00000026 /* Set flags bitmap(ethtool_value) */
102 /* This was introduced in Linux 2.6.25, so it might be missing if we have old
 * headers. */
105 #define TC_RTAB_SIZE 1024
108 static struct nln_notifier *netdev_linux_cache_notifier = NULL;
109 static int cache_notifier_refcount;
112 VALID_IFINDEX = 1 << 0,
113 VALID_ETHERADDR = 1 << 1,
117 VALID_POLICING = 1 << 5,
118 VALID_VPORT_STAT_ERROR = 1 << 6,
119 VALID_DRVINFO = 1 << 7,
127 /* Traffic control. */
129 /* An instance of a traffic control class. Always associated with a particular
132 * Each TC implementation subclasses this with whatever additional data it
135 const struct tc_ops *ops;
136 struct hmap queues; /* Contains "struct tc_queue"s.
137 * Read by generic TC layer.
138 * Written only by TC implementation. */
141 /* One traffic control queue.
143 * Each TC implementation subclasses this with whatever additional data it
146 struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
147 unsigned int queue_id; /* OpenFlow queue ID. */
150 /* A particular kind of traffic control. Each implementation generally maps to
151 * one particular Linux qdisc class.
153 * The functions below return 0 if successful or a positive errno value on
154 * failure, except where otherwise noted. All of them must be provided, except
155 * where otherwise noted. */
157 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
158 * This is null for tc_ops_default and tc_ops_other, for which there are no
159 * appropriate values. */
160 const char *linux_name;
162 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
163 const char *ovs_name;
165 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
166 * queues. The queues are numbered 0 through n_queues - 1. */
167 unsigned int n_queues;
169 /* Called to install this TC class on 'netdev'. The implementation should
170 * make the Netlink calls required to set up 'netdev' with the right qdisc
171 * and configure it according to 'details'. The implementation may assume
172 * that the current qdisc is the default; that is, there is no need for it
173 * to delete the current qdisc before installing itself.
175 * The contents of 'details' should be documented as valid for 'ovs_name'
176 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
177 * (which is built as ovs-vswitchd.conf.db(8)).
179 * This function must return 0 if and only if it sets 'netdev->tc' to an
180 * initialized 'struct tc'.
182 * (This function is null for tc_ops_other, which cannot be installed. For
183 * other TC classes it should always be nonnull.) */
184 int (*tc_install)(struct netdev *netdev, const struct shash *details);
186 /* Called when the netdev code determines (through a Netlink query) that
187 * this TC class's qdisc is installed on 'netdev', but we didn't install
188 * it ourselves and so don't know any of the details.
190 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
191 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
192 * implementation should parse the other attributes of 'nlmsg' as
193 * necessary to determine its configuration. If necessary it should also
194 * use Netlink queries to determine the configuration of queues on
197 * This function must return 0 if and only if it sets 'netdev->tc' to an
198 * initialized 'struct tc'. */
199 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
201 /* Destroys the data structures allocated by the implementation as part of
202 * 'tc'. (This includes destroying 'tc->queues' by calling
205 * The implementation should not need to perform any Netlink calls. If
206 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
207 * (But it may not be desirable.)
209 * This function may be null if 'tc' is trivial. */
210 void (*tc_destroy)(struct tc *tc);
212 /* Retrieves details of 'netdev->tc' configuration into 'details'.
214 * The implementation should not need to perform any Netlink calls, because
215 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
216 * cached the configuration.
218 * The contents of 'details' should be documented as valid for 'ovs_name'
219 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
220 * (which is built as ovs-vswitchd.conf.db(8)).
222 * This function may be null if 'tc' is not configurable.
224 int (*qdisc_get)(const struct netdev *netdev, struct shash *details);
226 /* Reconfigures 'netdev->tc' according to 'details', performing any
227 * required Netlink calls to complete the reconfiguration.
229 * The contents of 'details' should be documented as valid for 'ovs_name'
230 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
231 * (which is built as ovs-vswitchd.conf.db(8)).
233 * This function may be null if 'tc' is not configurable.
235 int (*qdisc_set)(struct netdev *, const struct shash *details);
237 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
238 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
240 * The contents of 'details' should be documented as valid for 'ovs_name'
241 * in the "other_config" column in the "Queue" table in
242 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
244 * The implementation should not need to perform any Netlink calls, because
245 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
246 * cached the queue configuration.
248 * This function may be null if 'tc' does not have queues ('n_queues' is
250 int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
251 struct shash *details);
253 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
254 * 'details', performing any required Netlink calls to complete the
255 * reconfiguration. The caller ensures that 'queue_id' is less than
258 * The contents of 'details' should be documented as valid for 'ovs_name'
259 * in the "other_config" column in the "Queue" table in
260 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
262 * This function may be null if 'tc' does not have queues or its queues are
263 * not configurable. */
264 int (*class_set)(struct netdev *, unsigned int queue_id,
265 const struct shash *details);
267 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
268 * tc_queue's within 'netdev->tc->queues'.
270 * This function may be null if 'tc' does not have queues or its queues
271 * cannot be deleted. */
272 int (*class_delete)(struct netdev *, struct tc_queue *queue);
274 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
275 * 'struct tc_queue's within 'netdev->tc->queues'.
277 * On success, initializes '*stats'.
279 * This function may be null if 'tc' does not have queues or if it cannot
280 * report queue statistics. */
281 int (*class_get_stats)(const struct netdev *netdev,
282 const struct tc_queue *queue,
283 struct netdev_queue_stats *stats);
285 /* Extracts queue stats from 'nlmsg', which is a response to a
286 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
288 * This function may be null if 'tc' does not have queues or if it cannot
289 * report queue statistics. */
290 int (*class_dump_stats)(const struct netdev *netdev,
291 const struct ofpbuf *nlmsg,
292 netdev_dump_queue_stats_cb *cb, void *aux);
/* Prepares 'tc' for use by setting up 'tc->queues' as an empty hmap.
 *
 * NOTE(review): 'ops' is presumably stored into 'tc' on a line that is not
 * visible here -- confirm. */
296 tc_init(struct tc *tc, const struct tc_ops *ops)
299 hmap_init(&tc->queues);
/* Generic teardown for 'tc': hands the 'tc->queues' hmap to hmap_destroy(). */
303 tc_destroy(struct tc *tc)
305 hmap_destroy(&tc->queues);
308 static const struct tc_ops tc_ops_htb;
309 static const struct tc_ops tc_ops_hfsc;
310 static const struct tc_ops tc_ops_default;
311 static const struct tc_ops tc_ops_other;
313 static const struct tc_ops *tcs[] = {
314 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
315 &tc_ops_hfsc, /* Hierarchical fair service curve. */
316 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
317 &tc_ops_other, /* Some other qdisc. */
321 static unsigned int tc_make_handle(unsigned int major, unsigned int minor);
322 static unsigned int tc_get_major(unsigned int handle);
323 static unsigned int tc_get_minor(unsigned int handle);
325 static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
326 static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
327 static unsigned int tc_buffer_per_jiffy(unsigned int rate);
329 static struct tcmsg *tc_make_request(const struct netdev *, int type,
330 unsigned int flags, struct ofpbuf *);
331 static int tc_transact(struct ofpbuf *request, struct ofpbuf **replyp);
332 static int tc_add_del_ingress_qdisc(struct netdev *netdev, bool add);
333 static int tc_add_policer(struct netdev *netdev, int kbits_rate,
336 static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
337 struct nlattr **options);
338 static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
339 struct nlattr **options,
340 struct netdev_queue_stats *);
341 static int tc_query_class(const struct netdev *,
342 unsigned int handle, unsigned int parent,
343 struct ofpbuf **replyp);
344 static int tc_delete_class(const struct netdev *, unsigned int handle);
346 static int tc_del_qdisc(struct netdev *netdev);
347 static int tc_query_qdisc(const struct netdev *netdev);
349 static int tc_calc_cell_log(unsigned int mtu);
350 static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
351 static void tc_put_rtab(struct ofpbuf *, uint16_t type,
352 const struct tc_ratespec *rate);
353 static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
355 struct netdev_dev_linux {
356 struct netdev_dev netdev_dev;
358 struct shash_node *shash_node;
359 unsigned int cache_valid;
360 unsigned int change_seq;
362 bool miimon; /* Link status of last poll. */
363 long long int miimon_interval; /* Miimon Poll rate. Disabled if <= 0. */
364 struct timer miimon_timer;
366 /* The following are figured out "on demand" only. They are only valid
367 * when the corresponding VALID_* bit in 'cache_valid' is set. */
369 uint8_t etheraddr[ETH_ADDR_LEN];
370 struct in_addr address, netmask;
373 unsigned int ifi_flags;
374 long long int carrier_resets;
375 uint32_t kbits_rate; /* Policing data. */
376 uint32_t kbits_burst;
377 int vport_stats_error; /* Cached error code from vport_get_stats().
378 0 or an errno value. */
379 struct ethtool_drvinfo drvinfo; /* Cached from ETHTOOL_GDRVINFO. */
383 struct tap_state tap;
387 struct netdev_linux {
388 struct netdev netdev;
392 /* Sockets used for ioctl operations. */
393 static int af_inet_sock = -1; /* AF_INET, SOCK_DGRAM. */
395 /* A Netlink routing socket that is not subscribed to any multicast groups. */
396 static struct nl_sock *rtnl_sock;
398 /* This is set pretty low because we probably won't learn anything from the
399 * additional log messages. */
400 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
402 static int netdev_linux_init(void);
404 static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
405 int cmd, const char *cmd_name);
406 static int netdev_linux_do_ioctl(const char *name, struct ifreq *, int cmd,
407 const char *cmd_name);
408 static int netdev_linux_get_ipv4(const struct netdev *, struct in_addr *,
409 int cmd, const char *cmd_name);
410 static int get_flags(const struct netdev_dev *, unsigned int *flags);
411 static int set_flags(struct netdev *, unsigned int flags);
412 static int do_get_ifindex(const char *netdev_name);
413 static int get_ifindex(const struct netdev *, int *ifindexp);
414 static int do_set_addr(struct netdev *netdev,
415 int ioctl_nr, const char *ioctl_name,
416 struct in_addr addr);
417 static int get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN]);
418 static int set_etheraddr(const char *netdev_name, int hwaddr_family,
419 const uint8_t[ETH_ADDR_LEN]);
420 static int get_stats_via_netlink(int ifindex, struct netdev_stats *stats);
421 static int get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats);
422 static int af_packet_sock(void);
423 static void netdev_linux_miimon_run(void);
424 static void netdev_linux_miimon_wait(void);
/* Returns true if 'netdev_class' uses netdev_linux_init() as its 'init'
 * callback, which is how the classes handled by this code are recognized. */
427 is_netdev_linux_class(const struct netdev_class *netdev_class)
429 return netdev_class->init == netdev_linux_init;
/* Converts the generic 'netdev_dev' into the struct netdev_dev_linux that
 * embeds it.  The device's class is asserted to be a Linux class first, so a
 * device of another class trips the assertion instead of being
 * reinterpreted. */
432 static struct netdev_dev_linux *
433 netdev_dev_linux_cast(const struct netdev_dev *netdev_dev)
435 const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
436 assert(is_netdev_linux_class(netdev_class));
438 return CONTAINER_OF(netdev_dev, struct netdev_dev_linux, netdev_dev);
/* Converts the generic 'netdev' into the struct netdev_linux that embeds it.
 * The class of the underlying device is asserted to be a Linux class before
 * the conversion. */
441 static struct netdev_linux *
442 netdev_linux_cast(const struct netdev *netdev)
444 struct netdev_dev *netdev_dev = netdev_get_dev(netdev);
445 const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
446 assert(is_netdev_linux_class(netdev_class));
448 return CONTAINER_OF(netdev, struct netdev_linux, netdev);
/* Sets up the module-wide sockets: 'af_inet_sock' (AF_INET, SOCK_DGRAM) and
 * the rtnetlink socket 'rtnl_sock'.  Each failure is logged with the errno
 * text.
 *
 * 'status' is static, so the outcome persists across calls.
 * NOTE(review): the guard that presumably limits the work to the first call
 * (status starts at -1) is on lines not visible here -- confirm. */
452 netdev_linux_init(void)
454 static int status = -1;
456 /* Create AF_INET socket. */
457 af_inet_sock = socket(AF_INET, SOCK_DGRAM, 0);
458 status = af_inet_sock >= 0 ? 0 : errno;
460 VLOG_ERR("failed to create inet socket: %s", strerror(status));
463 /* Create rtnetlink socket. */
465 status = nl_sock_create(NETLINK_ROUTE, &rtnl_sock);
467 VLOG_ERR_RL(&rl, "failed to create rtnetlink socket: %s",
/* Periodic processing for this module: calls rtnetlink_link_run() and then
 * the MII monitor poller, netdev_linux_miimon_run(). */
476 netdev_linux_run(void)
478 rtnetlink_link_run();
479 netdev_linux_miimon_run();
/* Counterpart of netdev_linux_run(): calls rtnetlink_link_wait() and
 * netdev_linux_miimon_wait(), the wait-side partners of the two calls that
 * netdev_linux_run() makes. */
483 netdev_linux_wait(void)
485 rtnetlink_link_wait();
486 netdev_linux_miimon_wait();
/* Fills 'netdev_dev->drvinfo' through netdev_linux_do_ethtool() and caches
 * it: once VALID_DRVINFO is set in 'cache_valid' the query is not repeated.
 *
 * 'drvinfo' is zeroed before the query.  The cast to 'struct ethtool_cmd *'
 * only adapts the pointer to netdev_linux_do_ethtool()'s prototype.
 * NOTE(review): the ethtool command passed (presumably ETHTOOL_GDRVINFO) and
 * the error check are on lines not visible here. */
490 netdev_linux_get_drvinfo(struct netdev_dev_linux *netdev_dev)
495 if (netdev_dev->cache_valid & VALID_DRVINFO) {
499 memset(&netdev_dev->drvinfo, 0, sizeof netdev_dev->drvinfo);
500 error = netdev_linux_do_ethtool(netdev_dev->netdev_dev.name,
501 (struct ethtool_cmd *)&netdev_dev->drvinfo,
505 netdev_dev->cache_valid |= VALID_DRVINFO;
/* Records a change on 'dev' whose new interface flags are 'ifi_flags'.
 *
 * A difference in IFF_RUNNING between the cached and the new flags counts as
 * one carrier reset.  The new flags are then cached, and every 'cache_valid'
 * bit that is not also set in 'mask' is cleared, so 'mask' names the cached
 * items that survive the change.
 *
 * NOTE(review): 'change_seq' is presumably incremented just before the
 * zero test so that the sequence number never stays at 0 -- the relevant
 * lines are not visible here. */
511 netdev_dev_linux_changed(struct netdev_dev_linux *dev,
512 unsigned int ifi_flags,
516 if (!dev->change_seq) {
520 if ((dev->ifi_flags ^ ifi_flags) & IFF_RUNNING) {
521 dev->carrier_resets++;
523 dev->ifi_flags = ifi_flags;
525 dev->cache_valid &= mask;
/* Applies the rtnetlink notification 'change' to 'dev'.
 *
 * For RTM_NEWLINK the cached driver info is kept (mask VALID_DRVINFO); the
 * second call, with mask 0, discards every cached item.
 * NOTE(review): the second call is presumably the 'else' branch -- the
 * connecting lines are not visible here. */
529 netdev_dev_linux_update(struct netdev_dev_linux *dev,
530 const struct rtnetlink_link_change *change)
532 if (change->nlmsg_type == RTM_NEWLINK) {
534 netdev_dev_linux_changed(dev, change->ifi_flags, VALID_DRVINFO);
536 netdev_dev_linux_changed(dev, change->ifi_flags, 0);
/* Callback handed to rtnetlink_link_notifier_create() by
 * cache_notifier_ref().  'aux' is unused.
 *
 * Two paths are visible:
 *   - The device named by 'change->ifname' is looked up and, if it belongs to
 *     a Linux class, updated from 'change'.
 *   - Every device of 'netdev_linux_class' has its flags re-read and its
 *     whole cache invalidated (mask 0).
 * NOTE(review): the conditions selecting between the two paths are on lines
 * not visible here; presumably the second one handles a null 'change'.
 *
 * NOTE(review): the result of get_flags() is ignored in the second path, so
 * 'flags' may reach netdev_dev_linux_changed() without having been written
 * if the query fails -- confirm against the declaration of 'flags'. */
541 netdev_linux_cache_cb(const struct rtnetlink_link_change *change,
542 void *aux OVS_UNUSED)
544 struct netdev_dev_linux *dev;
546 struct netdev_dev *base_dev = netdev_dev_from_name(change->ifname);
548 const struct netdev_class *netdev_class =
549 netdev_dev_get_class(base_dev);
551 if (is_netdev_linux_class(netdev_class)) {
552 dev = netdev_dev_linux_cast(base_dev);
553 netdev_dev_linux_update(dev, change);
557 struct shash device_shash;
558 struct shash_node *node;
560 shash_init(&device_shash);
561 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
562 SHASH_FOR_EACH (node, &device_shash) {
567 get_flags(&dev->netdev_dev, &flags);
568 netdev_dev_linux_changed(dev, flags, 0);
570 shash_destroy(&device_shash);
/* Takes a reference on the shared rtnetlink link notifier.  The first
 * reference creates the notifier with netdev_linux_cache_cb() as its
 * callback; later references only bump 'cache_notifier_refcount'.
 *
 * NOTE(review): the value returned when notifier creation fails is on a line
 * not visible here. */
575 cache_notifier_ref(void)
577 if (!cache_notifier_refcount) {
578 assert(!netdev_linux_cache_notifier);
580 netdev_linux_cache_notifier =
581 rtnetlink_link_notifier_create(netdev_linux_cache_cb, NULL);
583 if (!netdev_linux_cache_notifier) {
587 cache_notifier_refcount++;
/* Drops one reference taken by cache_notifier_ref().  When the count reaches
 * zero the notifier is destroyed and the global pointer reset to NULL, so
 * that a later cache_notifier_ref() creates a fresh one. */
593 cache_notifier_unref(void)
595 assert(cache_notifier_refcount > 0);
596 if (!--cache_notifier_refcount) {
597 assert(netdev_linux_cache_notifier);
598 rtnetlink_link_notifier_destroy(netdev_linux_cache_notifier);
599 netdev_linux_cache_notifier = NULL;
603 /* Creates system and internal devices.
 *
 * Takes a reference on the cache notifier, allocates a zeroed
 * netdev_dev_linux named 'name' of class 'class', starts its 'change_seq' at
 * 1, and stores the new device in '*netdev_devp'.
 *
 * The result of the initial get_flags() call is ignored; because the device
 * comes from xzalloc(), 'ifi_flags' starts out as 0 before that call. */
605 netdev_linux_create(const struct netdev_class *class, const char *name,
606 struct netdev_dev **netdev_devp)
608 struct netdev_dev_linux *netdev_dev;
611 error = cache_notifier_ref();
616 netdev_dev = xzalloc(sizeof *netdev_dev);
617 netdev_dev->change_seq = 1;
618 netdev_dev_init(&netdev_dev->netdev_dev, name, class);
619 get_flags(&netdev_dev->netdev_dev, &netdev_dev->ifi_flags);
621 *netdev_devp = &netdev_dev->netdev_dev;
625 /* For most types of netdevs we open the device for each call of
626 * netdev_open(). However, this is not the case with tap devices,
627 * since it is only possible to open the device once. In this
628 * situation we share a single file descriptor, and consequently
629 * buffers, across all readers. Therefore once data is read it will
630 * be unavailable to other reads for tap devices. */
/* Creates the tap device 'name': opens "/dev/net/tun", issues TUNSETIFF with
 * IFF_TAP | IFF_NO_PI, makes the descriptor non-blocking, and stores the new
 * device, of class 'netdev_tap_class', in '*netdev_devp'.  The 'class'
 * argument is unused.
 *
 * NOTE(review): every failure after the open() jumps to
 * 'error_unref_notifier', and the visible part of that path only drops the
 * notifier reference.  If the lines not visible here do not close
 * 'state->fd', a failed TUNSETIFF or set_nonblocking() leaks the
 * descriptor -- confirm. */
632 netdev_linux_create_tap(const struct netdev_class *class OVS_UNUSED,
633 const char *name, struct netdev_dev **netdev_devp)
635 struct netdev_dev_linux *netdev_dev;
636 struct tap_state *state;
637 static const char tap_dev[] = "/dev/net/tun";
641 netdev_dev = xzalloc(sizeof *netdev_dev);
642 state = &netdev_dev->state.tap;
644 error = cache_notifier_ref();
649 /* Open tap device. */
650 state->fd = open(tap_dev, O_RDWR);
653 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, strerror(error));
654 goto error_unref_notifier;
657 /* Create tap device. */
658 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
659 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
660 if (ioctl(state->fd, TUNSETIFF, &ifr) == -1) {
661 VLOG_WARN("%s: creating tap device failed: %s", name,
664 goto error_unref_notifier;
667 /* Make non-blocking. */
668 error = set_nonblocking(state->fd);
670 goto error_unref_notifier;
673 netdev_dev_init(&netdev_dev->netdev_dev, name, &netdev_tap_class);
674 *netdev_devp = &netdev_dev->netdev_dev;
677 error_unref_notifier:
678 cache_notifier_unref();
/* Tap-specific teardown for 'netdev_dev'; acts only when the tap state holds
 * a non-negative descriptor.
 * NOTE(review): the action taken (presumably close()) is on a line not
 * visible here. */
685 destroy_tap(struct netdev_dev_linux *netdev_dev)
687 struct tap_state *state = &netdev_dev->state.tap;
689 if (state->fd >= 0) {
694 /* Destroys the netdev device 'netdev_dev_'.
 *
 * Runs the traffic-control implementation's 'tc_destroy' hook when the device
 * has a 'tc' and the hook is non-null, performs tap teardown for devices of
 * 'netdev_tap_class', and drops the cache notifier reference. */
696 netdev_linux_destroy(struct netdev_dev *netdev_dev_)
698 struct netdev_dev_linux *netdev_dev = netdev_dev_linux_cast(netdev_dev_);
699 const struct netdev_class *class = netdev_dev_get_class(netdev_dev_);
701 if (netdev_dev->tc && netdev_dev->tc->ops->tc_destroy) {
702 netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
705 if (class == &netdev_tap_class) {
706 destroy_tap(netdev_dev);
710 cache_notifier_unref();
/* Opens a new netdev handle on 'netdev_dev_' and stores it in '*netdevp'.
 *
 * Except for devices of 'netdev_internal_class', the device's existence is
 * probed by reading its flags.  The first handle opened on a "tap" device
 * receives the device's shared tap descriptor and marks the tap state as
 * opened, so later handles do not get it.  The visible failure path undoes
 * the netdev_init() with netdev_uninit().
 *
 * NOTE(review): the initial value given to 'netdev->fd' and the handling of
 * errors other than ENODEV are on lines not visible here. */
714 netdev_linux_open(struct netdev_dev *netdev_dev_, struct netdev **netdevp)
716 struct netdev_dev_linux *netdev_dev = netdev_dev_linux_cast(netdev_dev_);
717 struct netdev_linux *netdev;
718 enum netdev_flags flags;
721 /* Allocate network device. */
722 netdev = xzalloc(sizeof *netdev);
724 netdev_init(&netdev->netdev, netdev_dev_);
726 /* Verify that the device really exists, by attempting to read its flags.
727 * (The flags might be cached, in which case this won't actually do an
730 * Don't do this for "internal" netdevs, though, because those have to be
731 * created as netdev objects before they exist in the kernel, because
732 * creating them in the kernel happens by passing a netdev object to
733 * dpif_port_add(). */
734 if (netdev_dev_get_class(netdev_dev_) != &netdev_internal_class) {
735 error = netdev_get_flags(&netdev->netdev, &flags);
736 if (error == ENODEV) {
741 if (!strcmp(netdev_dev_get_type(netdev_dev_), "tap") &&
742 !netdev_dev->state.tap.opened) {
744 /* We assume that the first user of the tap device is the primary user
745 * and give them the tap FD. Subsequent users probably just expect
746 * this to be a system device so open it normally to avoid send/receive
747 * directions appearing to be reversed. */
748 netdev->fd = netdev_dev->state.tap.fd;
749 netdev_dev->state.tap.opened = true;
752 *netdevp = &netdev->netdev;
756 netdev_uninit(&netdev->netdev, true);
760 /* Closes and destroys 'netdev'.
 *
 * Only a non-tap netdev is selected by the test below: a "tap" netdev may
 * hold the descriptor shared through the device's tap state (see
 * netdev_linux_open()), which destroy_tap() checks separately.
 *
 * Descriptor 0 is a valid descriptor, and the rest of this file treats only
 * negative values as "no descriptor", so the test is 'fd >= 0'; testing
 * 'fd > 0' would skip a socket that happened to be assigned descriptor 0. */
762 netdev_linux_close(struct netdev *netdev_)
764 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
766 if (netdev->fd >= 0 && strcmp(netdev_get_type(netdev_), "tap")) {
/* Prepares 'netdev_' for receiving packets.  If the netdev already has a
 * descriptor ('fd' >= 0) nothing new is created.  Otherwise a PF_PACKET /
 * SOCK_RAW socket is created, made non-blocking, and bound to the device's
 * ifindex with protocol ETH_P_ALL.
 *
 * NOTE(review): the cleanup of 'fd' on the failure paths and the final
 * assignment to 'netdev->fd' are on lines not visible here. */
773 netdev_linux_listen(struct netdev *netdev_)
775 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
776 struct sockaddr_ll sll;
781 if (netdev->fd >= 0) {
785 /* Create file descriptor. */
786 fd = socket(PF_PACKET, SOCK_RAW, 0);
789 VLOG_ERR("failed to create raw socket (%s)", strerror(error));
793 /* Set non-blocking mode. */
794 error = set_nonblocking(fd);
799 /* Get ethernet device index. */
800 error = get_ifindex(&netdev->netdev, &ifindex);
805 /* Bind to specific ethernet device. */
806 memset(&sll, 0, sizeof sll);
807 sll.sll_family = AF_PACKET;
808 sll.sll_ifindex = ifindex;
809 sll.sll_protocol = (OVS_FORCE unsigned short int) htons(ETH_P_ALL);
810 if (bind(fd, (struct sockaddr *) &sll, sizeof sll) < 0) {
812 VLOG_ERR("%s: failed to bind raw socket (%s)",
813 netdev_get_name(netdev_), strerror(error));
/* Receives one packet from 'netdev_' into the 'size'-byte buffer 'data'.
 *
 * A device of 'netdev_tap_class' is read with read(); any other device is
 * read with recv() and MSG_TRUNC, so a packet longer than 'size' is detected
 * by the length comparison below and reported as -EMSGSIZE.  On success the
 * packet length is returned.
 *
 * A negative 'netdev->fd' means the device is not listening.  EINTR is
 * singled out from other errors, and EAGAIN is not logged. */
828 netdev_linux_recv(struct netdev *netdev_, void *data, size_t size)
830 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
832 if (netdev->fd < 0) {
833 /* Device is not listening. */
840 retval = (netdev_->netdev_dev->netdev_class == &netdev_tap_class
841 ? read(netdev->fd, data, size)
842 : recv(netdev->fd, data, size, MSG_TRUNC));
844 return retval <= size ? retval : -EMSGSIZE;
845 } else if (errno != EINTR) {
846 if (errno != EAGAIN) {
/* Argument order follows the format string: device name first, then the
 * error text. */
847 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
848 netdev_get_name(netdev_), strerror(errno));
855 /* Registers with the poll loop to wake up from the next call to poll_block()
856 * when a packet is ready to be received with netdev_recv() on 'netdev'.
 *
 * Nothing is registered when the netdev has no descriptor ('fd' < 0). */
858 netdev_linux_recv_wait(struct netdev *netdev_)
860 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
861 if (netdev->fd >= 0) {
862 poll_fd_wait(netdev->fd, POLLIN);
866 /* Discards all packets waiting to be received from 'netdev'.
 *
 * For a "tap" netdev the SIOCGIFTXQLEN ioctl is issued and its 'ifr_qlen'
 * result is passed to drain_fd() as the count.  Any other netdev that has a
 * descriptor is drained with drain_rcvbuf(). */
868 netdev_linux_drain(struct netdev *netdev_)
870 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
871 if (netdev->fd < 0) {
873 } else if (!strcmp(netdev_get_type(netdev_), "tap")) {
875 int error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
876 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
880 drain_fd(netdev->fd, ifr.ifr_qlen);
883 return drain_rcvbuf(netdev->fd);
887 /* Sends the 'size' bytes at 'data' on 'netdev'. Returns 0 if successful,
888 * otherwise a positive errno value. Returns EAGAIN without blocking if the
889 * packet cannot be queued immediately. Returns EMSGSIZE if a partial packet
890 * was transmitted or if the packet is too big or too small to transmit on
 * the device.
892 * The caller retains ownership of 'data' in all cases.
894 * The kernel maintains a packet transmission queue, so the caller is not
895 * expected to do additional queuing of packets. */
897 netdev_linux_send(struct netdev *netdev_, const void *data, size_t size)
899 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
903 if (netdev->fd < 0) {
904 /* Use our AF_PACKET socket to send to this device. */
905 struct sockaddr_ll sll;
912 sock = af_packet_sock();
917 error = get_ifindex(netdev_, &ifindex);
922 /* We don't bother setting most fields in sockaddr_ll because the
923 * kernel ignores them for SOCK_RAW. */
924 memset(&sll, 0, sizeof sll);
925 sll.sll_family = AF_PACKET;
926 sll.sll_ifindex = ifindex;
928 iov.iov_base = (void *) data; /* Cast drops 'const' to fit 'iov_base'. */
932 msg.msg_namelen = sizeof sll;
935 msg.msg_control = NULL;
936 msg.msg_controllen = 0;
939 retval = sendmsg(sock, &msg, 0);
941 /* Use the netdev's own fd to send to this device. This is
942 * essential for tap devices, because packets sent to a tap device
943 * with an AF_PACKET socket will loop back to be *received* again
944 * on the tap device. */
945 retval = write(netdev->fd, data, size);
949 /* The Linux AF_PACKET implementation never blocks waiting for room
950 * for packets, instead returning ENOBUFS. Translate this into
951 * EAGAIN for the caller. */
952 if (errno == ENOBUFS) {
954 } else if (errno == EINTR) {
956 } else if (errno != EAGAIN) {
957 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
958 netdev_get_name(netdev_), strerror(errno));
961 } else if (retval != size) {
962 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%zd bytes of "
963 "%zu) on %s", retval, size, netdev_get_name(netdev_));
971 /* Registers with the poll loop to wake up from the next call to poll_block()
972 * when the packet transmission queue has sufficient room to transmit a packet
973 * with netdev_send().
975 * The kernel maintains a packet transmission queue, so the client is not
976 * expected to do additional queuing of packets. Thus, this function is
977 * unlikely to ever be used. It is included for completeness.
 *
 * A non-tap netdev that has a descriptor waits for POLLOUT on it; the
 * remaining visible branch wakes the poll loop immediately. */
979 netdev_linux_send_wait(struct netdev *netdev_)
981 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
982 if (netdev->fd < 0) {
984 } else if (strcmp(netdev_get_type(netdev_), "tap")) {
985 poll_fd_wait(netdev->fd, POLLOUT);
987 /* TAP device always accepts packets. */
988 poll_immediate_wake();
992 /* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
993 * otherwise a positive errno value.
 *
 * The kernel is only asked to change the address when no address is cached
 * or the cached one differs from 'mac'; the cache is then updated to 'mac'.
 * NOTE(review): the cache update is presumably guarded by a success check on
 * a line not visible here -- confirm. */
995 netdev_linux_set_etheraddr(struct netdev *netdev_,
996 const uint8_t mac[ETH_ADDR_LEN])
998 struct netdev_dev_linux *netdev_dev =
999 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1002 if (!(netdev_dev->cache_valid & VALID_ETHERADDR)
1003 || !eth_addr_equals(netdev_dev->etheraddr, mac)) {
1004 error = set_etheraddr(netdev_get_name(netdev_), ARPHRD_ETHER, mac);
1006 netdev_dev->cache_valid |= VALID_ETHERADDR;
1007 memcpy(netdev_dev->etheraddr, mac, ETH_ADDR_LEN);
1015 /* Copies 'netdev''s MAC address into 'mac', querying the kernel only when the
1016 * address is not already cached. */
1018 netdev_linux_get_etheraddr(const struct netdev *netdev_,
1019 uint8_t mac[ETH_ADDR_LEN])
1021 struct netdev_dev_linux *netdev_dev =
1022 netdev_dev_linux_cast(netdev_get_dev(netdev_));
/* Query the kernel only when no address is cached yet. */
1023 if (!(netdev_dev->cache_valid & VALID_ETHERADDR)) {
1024 int error = get_etheraddr(netdev_get_name(netdev_),
1025 netdev_dev->etheraddr);
1029 netdev_dev->cache_valid |= VALID_ETHERADDR;
/* The result is delivered by copying ETH_ADDR_LEN bytes into the caller's
 * 'mac' buffer. */
1031 memcpy(mac, netdev_dev->etheraddr, ETH_ADDR_LEN);
1035 /* Returns the maximum size of transmitted (and received) packets on 'netdev',
1036 * in bytes, not including the hardware header; thus, this is typically 1500
1037 * bytes for Ethernet devices.
 *
 * The value is stored in '*mtup'.  It comes from the SIOCGIFMTU ioctl the
 * first time and from the device's cache (VALID_MTU) afterwards. */
1039 netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
1041 struct netdev_dev_linux *netdev_dev =
1042 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1043 if (!(netdev_dev->cache_valid & VALID_MTU)) {
1047 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
1048 SIOCGIFMTU, "SIOCGIFMTU");
1052 netdev_dev->mtu = ifr.ifr_mtu;
1053 netdev_dev->cache_valid |= VALID_MTU;
1055 *mtup = netdev_dev->mtu;
1059 /* Sets the maximum transmission unit (MTU) of the given device using the Linux
1060 * networking ioctl interface. */
1063 netdev_linux_set_mtu(const struct netdev *netdev_, int mtu)
1065 struct netdev_dev_linux *netdev_dev =
1066 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1070 if (netdev_dev->cache_valid & VALID_MTU &&
1071 netdev_dev->mtu == mtu) {
1075 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
1076 SIOCSIFMTU, "SIOCSIFMTU");
1081 netdev_dev->mtu = ifr.ifr_mtu;
1082 netdev_dev->cache_valid |= VALID_MTU;
1086 /* Returns the ifindex of 'netdev', if successful, as a positive number.
1087 * On failure, returns a negative errno value. */
1089 netdev_linux_get_ifindex(const struct netdev *netdev)
1093 error = get_ifindex(netdev, &ifindex);
1094 return error ? -error : ifindex;
/* Stores the carrier state of 'netdev_' in '*carrier'.
 *
 * When MII monitoring is enabled ('miimon_interval' > 0) the state is the
 * link status recorded by the last MII poll; the other visible assignment
 * derives it from the IFF_RUNNING bit of the cached interface flags. */
1098 netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
1100 struct netdev_dev_linux *netdev_dev =
1101 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1103 if (netdev_dev->miimon_interval > 0) {
1104 *carrier = netdev_dev->miimon;
1106 *carrier = (netdev_dev->ifi_flags & IFF_RUNNING) != 0;
/* Returns the 'carrier_resets' counter of the device underlying 'netdev',
 * which netdev_dev_linux_changed() increments whenever IFF_RUNNING flips. */
1112 static long long int
1113 netdev_linux_get_carrier_resets(const struct netdev *netdev)
1115 return netdev_dev_linux_cast(netdev_get_dev(netdev))->carrier_resets;
/* Issues the MII ioctl 'cmd' (named 'cmd_name' for logging) on device 'name'.
 *
 * '*data' is copied into the storage of 'ifr.ifr_data' before the ioctl and
 * copied back out afterwards, so 'data' serves as both input and output. */
1119 netdev_linux_do_miimon(const char *name, int cmd, const char *cmd_name,
1120 struct mii_ioctl_data *data)
1125 memset(&ifr, 0, sizeof ifr);
1126 memcpy(&ifr.ifr_data, data, sizeof *data);
1127 error = netdev_linux_do_ioctl(name, &ifr, cmd, cmd_name);
1128 memcpy(data, &ifr.ifr_data, sizeof *data);
/* Determines the link status of device 'name' and stores it in '*miimon'.
 *
 * The MII path issues SIOCGMIIPHY and then SIOCGMIIREG for the MII_BMSR
 * register, reporting the BMSR_LSTATUS bit.  The ethtool path issues
 * ETHTOOL_GLINK and reports whether the returned 'ethtool_value' data is
 * nonzero; per the debug message it is the fallback when the MII query
 * fails.
 *
 * NOTE(review): any initial value given to '*miimon' before the queries is
 * on a line not visible here. */
1134 netdev_linux_get_miimon(const char *name, bool *miimon)
1136 struct mii_ioctl_data data;
1141 memset(&data, 0, sizeof data);
1142 error = netdev_linux_do_miimon(name, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
1144 /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
1145 data.reg_num = MII_BMSR;
1146 error = netdev_linux_do_miimon(name, SIOCGMIIREG, "SIOCGMIIREG",
1150 *miimon = !!(data.val_out & BMSR_LSTATUS);
1152 VLOG_WARN_RL(&rl, "%s: failed to query MII", name);
1155 struct ethtool_cmd ecmd;
1157 VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
1160 memset(&ecmd, 0, sizeof ecmd);
1161 error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
1164 struct ethtool_value eval;
1166 memcpy(&eval, &ecmd, sizeof eval);
1167 *miimon = !!eval.data;
1169 VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
/* Sets the MII polling interval of 'netdev_' to 'interval'.
 *
 * A positive interval is raised to at least 100; a non-positive one becomes
 * 0, which netdev_linux_miimon_run() treats as disabled.  When the effective
 * value changes, the device's miimon timer is marked expired. */
1177 netdev_linux_set_miimon_interval(struct netdev *netdev_,
1178 long long int interval)
1180 struct netdev_dev_linux *netdev_dev;
1182 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev_));
1184 interval = interval > 0 ? MAX(interval, 100) : 0;
1185 if (netdev_dev->miimon_interval != interval) {
1186 netdev_dev->miimon_interval = interval;
1187 timer_set_expired(&netdev_dev->miimon_timer);
/* Polls link status for every device of 'netdev_linux_class' whose MII
 * monitoring is enabled and whose miimon timer has expired.
 *
 * A status that differs from the recorded one is stored and reported through
 * netdev_dev_linux_changed() with mask 0, which invalidates the device's
 * cache.  The timer is then re-armed for 'miimon_interval'.
 *
 * The result of netdev_linux_get_miimon() is ignored here. */
1194 netdev_linux_miimon_run(void)
1196 struct shash device_shash;
1197 struct shash_node *node;
1199 shash_init(&device_shash);
1200 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
1201 SHASH_FOR_EACH (node, &device_shash) {
1202 struct netdev_dev_linux *dev = node->data;
1205 if (dev->miimon_interval <= 0 || !timer_expired(&dev->miimon_timer)) {
1209 netdev_linux_get_miimon(dev->netdev_dev.name, &miimon);
1210 if (miimon != dev->miimon) {
1211 dev->miimon = miimon;
1212 netdev_dev_linux_changed(dev, dev->ifi_flags, 0);
1215 timer_set_duration(&dev->miimon_timer, dev->miimon_interval);
1218 shash_destroy(&device_shash);
/* Counterpart of netdev_linux_miimon_run() for the wait phase: calls
 * timer_wait() on the miimon_timer of every netdev_linux_class device whose
 * miimon_interval is positive.  The temporary shash of devices is destroyed
 * before returning. */
1222 netdev_linux_miimon_wait(void)
1224 struct shash device_shash;
1225 struct shash_node *node;
1227 shash_init(&device_shash);
1228 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
1229 SHASH_FOR_EACH (node, &device_shash) {
1230 struct netdev_dev_linux *dev = node->data;
1232 if (dev->miimon_interval > 0) {
1233 timer_wait(&dev->miimon_timer);
1236 shash_destroy(&device_shash);
/* Probe used by netdev_linux_sys_get_stats(), which treats a nonzero result
 * as "use netlink".  The probe looks up the ifindex of "lo" and then tries
 * get_stats_via_netlink() on it; the log messages show that proc is used
 * when either step fails.  NOTE(review): the return statements are not
 * visible in this excerpt. */
1239 /* Check whether we can use RTM_GETLINK to get network device statistics.
1240 * In pre-2.6.19 kernels, this was only available if wireless extensions were
1243 check_for_working_netlink_stats(void)
1245 /* Decide on the netdev_get_stats() implementation to use. Netlink is
1246 * preferable, so if that works, we'll use it. */
1247 int ifindex = do_get_ifindex("lo");
1249 VLOG_WARN("failed to get ifindex for lo, "
1250 "obtaining netdev stats from proc");
1253 struct netdev_stats stats;
1254 int error = get_stats_via_netlink(ifindex, &stats);
1256 VLOG_DBG("obtaining netdev stats via rtnetlink");
1259 VLOG_INFO("RTM_GETLINK failed (%s), obtaining netdev stats "
1260 "via proc (you are probably running a pre-2.6.19 "
1261 "kernel)", strerror(error));
/* NOTE(review): only the signature is visible in this excerpt.  Judging by the
 * name and by its use on rx/tx counter pairs in netdev_tap_get_stats(), this
 * presumably exchanges '*a' and '*b' -- confirm against the body. */
1268 swap_uint64(uint64_t *a, uint64_t *b)
/* Tries to fill '*stats' for 'netdev_' from netdev_vport_get_stats().
 *
 * The query is attempted when no error is currently recorded in
 * netdev_dev->vport_stats_error, or when the recorded value is not marked
 * valid by VALID_VPORT_STAT_ERROR in cache_valid.  The outcome is stored in
 * vport_stats_error and VALID_VPORT_STAT_ERROR is set; callers read
 * vport_stats_error afterwards to learn whether vport stats were obtained.
 * A rate-limited warning is associated with failure. */
1276 get_stats_via_vport(const struct netdev *netdev_,
1277 struct netdev_stats *stats)
1279 struct netdev_dev_linux *netdev_dev =
1280 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1282 if (!netdev_dev->vport_stats_error ||
1283 !(netdev_dev->cache_valid & VALID_VPORT_STAT_ERROR)) {
1286 error = netdev_vport_get_stats(netdev_, stats);
1288 VLOG_WARN_RL(&rl, "%s: obtaining netdev stats via vport failed "
1289 "(%s)", netdev_get_name(netdev_), strerror(error));
1291 netdev_dev->vport_stats_error = error;
1292 netdev_dev->cache_valid |= VALID_VPORT_STAT_ERROR;
/* Retrieves kernel statistics for 'netdev_' into '*stats'.
 *
 * The static 'use_netlink_stats' starts at -1 and is initialized on first use
 * from check_for_working_netlink_stats().  When it is nonzero the device's
 * ifindex is looked up and get_stats_via_netlink() is called; the
 * get_stats_via_proc() call, keyed by device name, is the other path.  A
 * rate-limited warning that includes the numeric error code is logged for
 * failures.
 *
 * NOTE(review): the return statement is not visible in this excerpt; callers
 * in this file store the result in a variable named 'error'. */
1297 netdev_linux_sys_get_stats(const struct netdev *netdev_,
1298 struct netdev_stats *stats)
1300 static int use_netlink_stats = -1;
1303 if (use_netlink_stats < 0) {
1304 use_netlink_stats = check_for_working_netlink_stats();
1307 if (use_netlink_stats) {
1310 error = get_ifindex(netdev_, &ifindex);
1312 error = get_stats_via_netlink(ifindex, stats);
1315 error = get_stats_via_proc(netdev_get_name(netdev_), stats);
1319 VLOG_WARN_RL(&rl, "%s: linux-sys get stats failed %d",
1320 netdev_get_name(netdev_), error);
/* Combines two sources: get_stats_via_vport() writes into '*stats', and
 * netdev_linux_sys_get_stats() writes into the local 'dev_stats'.
 * netdev_dev->vport_stats_error is then consulted to decide how to merge
 * them.  The block of "+=" statements adds the kernel's error, drop,
 * multicast and collision counters from 'dev_stats' onto '*stats'.
 *
 * NOTE(review): the bodies of the 'if' statements and the 'else' that
 * presumably guards the "+=" block are not visible in this excerpt. */
1326 /* Retrieves current device stats for 'netdev-linux'. */
1328 netdev_linux_get_stats(const struct netdev *netdev_,
1329 struct netdev_stats *stats)
1331 struct netdev_dev_linux *netdev_dev =
1332 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1333 struct netdev_stats dev_stats;
1336 get_stats_via_vport(netdev_, stats);
1338 error = netdev_linux_sys_get_stats(netdev_, &dev_stats);
1341 if (netdev_dev->vport_stats_error) {
1348 if (netdev_dev->vport_stats_error) {
1349 /* stats not available from OVS then use ioctl stats. */
1352 stats->rx_errors += dev_stats.rx_errors;
1353 stats->tx_errors += dev_stats.tx_errors;
1354 stats->rx_dropped += dev_stats.rx_dropped;
1355 stats->tx_dropped += dev_stats.tx_dropped;
1356 stats->multicast += dev_stats.multicast;
1357 stats->collisions += dev_stats.collisions;
1358 stats->rx_length_errors += dev_stats.rx_length_errors;
1359 stats->rx_over_errors += dev_stats.rx_over_errors;
1360 stats->rx_crc_errors += dev_stats.rx_crc_errors;
1361 stats->rx_frame_errors += dev_stats.rx_frame_errors;
1362 stats->rx_fifo_errors += dev_stats.rx_fifo_errors;
1363 stats->rx_missed_errors += dev_stats.rx_missed_errors;
1364 stats->tx_aborted_errors += dev_stats.tx_aborted_errors;
1365 stats->tx_carrier_errors += dev_stats.tx_carrier_errors;
1366 stats->tx_fifo_errors += dev_stats.tx_fifo_errors;
1367 stats->tx_heartbeat_errors += dev_stats.tx_heartbeat_errors;
1368 stats->tx_window_errors += dev_stats.tx_window_errors;
/* Same two-source scheme as netdev_linux_get_stats(): get_stats_via_vport()
 * fills '*stats', netdev_linux_sys_get_stats() fills the local 'dev_stats',
 * and netdev_dev->vport_stats_error selects how they are merged.
 *
 * Two groups of statements are visible.  One swaps the rx/tx packet, byte,
 * error and drop counters in '*stats' and zeroes the detailed error counters.
 * The other adds 'dev_stats' drop and error counters onto '*stats' with rx
 * and tx crossed (tx_dropped onto rx_dropped, and so on) and adds multicast
 * and collisions directly.
 *
 * NOTE(review): which group belongs to which branch is not visible in this
 * excerpt; the in-line comment says the swap does not apply to stats that
 * came from the vport layer. */
1373 /* Retrieves current device stats for 'netdev-tap' netdev or
1374 * netdev-internal. */
1376 netdev_tap_get_stats(const struct netdev *netdev_,
1377 struct netdev_stats *stats)
1379 struct netdev_dev_linux *netdev_dev =
1380 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1381 struct netdev_stats dev_stats;
1384 get_stats_via_vport(netdev_, stats);
1386 error = netdev_linux_sys_get_stats(netdev_, &dev_stats);
1388 if (netdev_dev->vport_stats_error) {
1395 /* If this port is an internal port then the transmit and receive stats
1396 * will appear to be swapped relative to the other ports since we are the
1397 * one sending the data, not a remote computer. For consistency, we swap
1398 * them back here. This does not apply if we are getting stats from the
1399 * vport layer because it always tracks stats from the perspective of the
1401 if (netdev_dev->vport_stats_error) {
1403 swap_uint64(&stats->rx_packets, &stats->tx_packets);
1404 swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
1405 swap_uint64(&stats->rx_errors, &stats->tx_errors);
1406 swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
1407 stats->rx_length_errors = 0;
1408 stats->rx_over_errors = 0;
1409 stats->rx_crc_errors = 0;
1410 stats->rx_frame_errors = 0;
1411 stats->rx_fifo_errors = 0;
1412 stats->rx_missed_errors = 0;
1413 stats->tx_aborted_errors = 0;
1414 stats->tx_carrier_errors = 0;
1415 stats->tx_fifo_errors = 0;
1416 stats->tx_heartbeat_errors = 0;
1417 stats->tx_window_errors = 0;
1419 stats->rx_dropped += dev_stats.tx_dropped;
1420 stats->tx_dropped += dev_stats.rx_dropped;
1422 stats->rx_errors += dev_stats.tx_errors;
1423 stats->tx_errors += dev_stats.rx_errors;
1425 stats->multicast += dev_stats.multicast;
1426 stats->collisions += dev_stats.collisions;
/* Stats getter that relies on the vport layer only: calls
 * get_stats_via_vport() to fill '*stats' and returns the error code that call
 * left in netdev_dev->vport_stats_error (0 when nothing went wrong). */
1432 netdev_internal_get_stats(const struct netdev *netdev_,
1433 struct netdev_stats *stats)
1435 struct netdev_dev_linux *netdev_dev =
1436 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1438 get_stats_via_vport(netdev_, stats);
1439 return netdev_dev->vport_stats_error;
/* Implementation notes:
 *
 *   - All data comes from one ETHTOOL_GSET query into 'ecmd'.
 *   - ecmd.supported (SUPPORTED_*) and ecmd.advertising (ADVERTISED_*) are
 *     translated bit by bit into OFPPF_* bits.
 *   - '*current' is derived from ecmd.speed and ecmd.duplex (10 Gb/s is always
 *     reported as full duplex), then ecmd.port adds OFPPF_COPPER or
 *     OFPPF_FIBER, and OFPPF_AUTONEG may be added.
 *   - '*peer' is always set to 0.
 *
 * NOTE(review): the visible statements dereference 'supported', 'advertised',
 * 'current' and 'peer' without a visible null check, although the comment
 * below speaks of the ones "that are non-null" -- confirm that the caller
 * never passes NULL. */
1442 /* Stores the features supported by 'netdev' into each of '*current',
1443 * '*advertised', '*supported', and '*peer' that are non-null. Each value is a
1444 * bitmap of "enum ofp_port_features" bits, in host byte order. Returns 0 if
1445 * successful, otherwise a positive errno value. */
1447 netdev_linux_get_features(const struct netdev *netdev,
1448 uint32_t *current, uint32_t *advertised,
1449 uint32_t *supported, uint32_t *peer)
1451 struct ethtool_cmd ecmd;
1454 memset(&ecmd, 0, sizeof ecmd);
1455 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1456 ETHTOOL_GSET, "ETHTOOL_GSET");
1461 /* Supported features. */
1463 if (ecmd.supported & SUPPORTED_10baseT_Half) {
1464 *supported |= OFPPF_10MB_HD;
1466 if (ecmd.supported & SUPPORTED_10baseT_Full) {
1467 *supported |= OFPPF_10MB_FD;
1469 if (ecmd.supported & SUPPORTED_100baseT_Half) {
1470 *supported |= OFPPF_100MB_HD;
1472 if (ecmd.supported & SUPPORTED_100baseT_Full) {
1473 *supported |= OFPPF_100MB_FD;
1475 if (ecmd.supported & SUPPORTED_1000baseT_Half) {
1476 *supported |= OFPPF_1GB_HD;
1478 if (ecmd.supported & SUPPORTED_1000baseT_Full) {
1479 *supported |= OFPPF_1GB_FD;
1481 if (ecmd.supported & SUPPORTED_10000baseT_Full) {
1482 *supported |= OFPPF_10GB_FD;
1484 if (ecmd.supported & SUPPORTED_TP) {
1485 *supported |= OFPPF_COPPER;
1487 if (ecmd.supported & SUPPORTED_FIBRE) {
1488 *supported |= OFPPF_FIBER;
1490 if (ecmd.supported & SUPPORTED_Autoneg) {
1491 *supported |= OFPPF_AUTONEG;
1493 if (ecmd.supported & SUPPORTED_Pause) {
1494 *supported |= OFPPF_PAUSE;
1496 if (ecmd.supported & SUPPORTED_Asym_Pause) {
1497 *supported |= OFPPF_PAUSE_ASYM;
1500 /* Advertised features. */
1502 if (ecmd.advertising & ADVERTISED_10baseT_Half) {
1503 *advertised |= OFPPF_10MB_HD;
1505 if (ecmd.advertising & ADVERTISED_10baseT_Full) {
1506 *advertised |= OFPPF_10MB_FD;
1508 if (ecmd.advertising & ADVERTISED_100baseT_Half) {
1509 *advertised |= OFPPF_100MB_HD;
1511 if (ecmd.advertising & ADVERTISED_100baseT_Full) {
1512 *advertised |= OFPPF_100MB_FD;
1514 if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
1515 *advertised |= OFPPF_1GB_HD;
1517 if (ecmd.advertising & ADVERTISED_1000baseT_Full) {
1518 *advertised |= OFPPF_1GB_FD;
1520 if (ecmd.advertising & ADVERTISED_10000baseT_Full) {
1521 *advertised |= OFPPF_10GB_FD;
1523 if (ecmd.advertising & ADVERTISED_TP) {
1524 *advertised |= OFPPF_COPPER;
1526 if (ecmd.advertising & ADVERTISED_FIBRE) {
1527 *advertised |= OFPPF_FIBER;
1529 if (ecmd.advertising & ADVERTISED_Autoneg) {
1530 *advertised |= OFPPF_AUTONEG;
1532 if (ecmd.advertising & ADVERTISED_Pause) {
1533 *advertised |= OFPPF_PAUSE;
1535 if (ecmd.advertising & ADVERTISED_Asym_Pause) {
1536 *advertised |= OFPPF_PAUSE_ASYM;
1539 /* Current settings. */
1540 if (ecmd.speed == SPEED_10) {
1541 *current = ecmd.duplex ? OFPPF_10MB_FD : OFPPF_10MB_HD;
1542 } else if (ecmd.speed == SPEED_100) {
1543 *current = ecmd.duplex ? OFPPF_100MB_FD : OFPPF_100MB_HD;
1544 } else if (ecmd.speed == SPEED_1000) {
1545 *current = ecmd.duplex ? OFPPF_1GB_FD : OFPPF_1GB_HD;
1546 } else if (ecmd.speed == SPEED_10000) {
1547 *current = OFPPF_10GB_FD;
1552 if (ecmd.port == PORT_TP) {
1553 *current |= OFPPF_COPPER;
1554 } else if (ecmd.port == PORT_FIBRE) {
1555 *current |= OFPPF_FIBER;
1559 *current |= OFPPF_AUTONEG;
1562 /* Peer advertisements. */
1563 *peer = 0; /* XXX */
/* Read-modify-write of the ethtool settings: ETHTOOL_GSET fetches the current
 * struct ethtool_cmd, its 'advertising' member is rebuilt from scratch out of
 * the OFPPF_* bits in 'advertise' (the inverse of the mapping used by
 * netdev_linux_get_features()), and the result is written back with
 * ETHTOOL_SSET, whose status is returned.
 *
 * NOTE(review): the handling of a failed ETHTOOL_GSET is not visible in this
 * excerpt. */
1568 /* Set the features advertised by 'netdev' to 'advertise'. */
1570 netdev_linux_set_advertisements(struct netdev *netdev, uint32_t advertise)
1572 struct ethtool_cmd ecmd;
1575 memset(&ecmd, 0, sizeof ecmd);
1576 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1577 ETHTOOL_GSET, "ETHTOOL_GSET");
1582 ecmd.advertising = 0;
1583 if (advertise & OFPPF_10MB_HD) {
1584 ecmd.advertising |= ADVERTISED_10baseT_Half;
1586 if (advertise & OFPPF_10MB_FD) {
1587 ecmd.advertising |= ADVERTISED_10baseT_Full;
1589 if (advertise & OFPPF_100MB_HD) {
1590 ecmd.advertising |= ADVERTISED_100baseT_Half;
1592 if (advertise & OFPPF_100MB_FD) {
1593 ecmd.advertising |= ADVERTISED_100baseT_Full;
1595 if (advertise & OFPPF_1GB_HD) {
1596 ecmd.advertising |= ADVERTISED_1000baseT_Half;
1598 if (advertise & OFPPF_1GB_FD) {
1599 ecmd.advertising |= ADVERTISED_1000baseT_Full;
1601 if (advertise & OFPPF_10GB_FD) {
1602 ecmd.advertising |= ADVERTISED_10000baseT_Full;
1604 if (advertise & OFPPF_COPPER) {
1605 ecmd.advertising |= ADVERTISED_TP;
1607 if (advertise & OFPPF_FIBER) {
1608 ecmd.advertising |= ADVERTISED_FIBRE;
1610 if (advertise & OFPPF_AUTONEG) {
1611 ecmd.advertising |= ADVERTISED_Autoneg;
1613 if (advertise & OFPPF_PAUSE) {
1614 ecmd.advertising |= ADVERTISED_Pause;
1616 if (advertise & OFPPF_PAUSE_ASYM) {
1617 ecmd.advertising |= ADVERTISED_Asym_Pause;
1619 return netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1620 ETHTOOL_SSET, "ETHTOOL_SSET");
/* Implementation notes:
 *
 *   - 'kbits_burst' is normalized first: forced to 0 when 'kbits_rate' is 0,
 *     defaulted to 1000 when a rate is given without a burst.
 *   - If VALID_POLICING is set in cache_valid and the cached rate and burst
 *     equal the requested ones, the kernel is assumed to be configured
 *     already.
 *   - Otherwise the visible sequence is: remove the ingress qdisc, add the
 *     ingress qdisc, add the policer with tc_add_policer(), each with its own
 *     warning on failure; finally the new values are cached and
 *     VALID_POLICING is set.
 *
 * NOTE(review): the conditions guarding the individual steps and the return
 * statements are not visible in this excerpt. */
1623 /* Attempts to set input rate limiting (policing) policy. Returns 0 if
1624 * successful, otherwise a positive errno value. */
1626 netdev_linux_set_policing(struct netdev *netdev,
1627 uint32_t kbits_rate, uint32_t kbits_burst)
1629 struct netdev_dev_linux *netdev_dev =
1630 netdev_dev_linux_cast(netdev_get_dev(netdev));
1631 const char *netdev_name = netdev_get_name(netdev);
1635 kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
1636 : !kbits_burst ? 1000 /* Default to 1000 kbits if 0. */
1637 : kbits_burst); /* Stick with user-specified value. */
1639 if (netdev_dev->cache_valid & VALID_POLICING
1640 && netdev_dev->kbits_rate == kbits_rate
1641 && netdev_dev->kbits_burst == kbits_burst) {
1642 /* Assume that settings haven't changed since we last set them. */
1646 COVERAGE_INC(netdev_set_policing);
1647 /* Remove any existing ingress qdisc. */
1648 error = tc_add_del_ingress_qdisc(netdev, false);
1650 VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
1651 netdev_name, strerror(error));
1656 error = tc_add_del_ingress_qdisc(netdev, true);
1658 VLOG_WARN_RL(&rl, "%s: adding policing qdisc failed: %s",
1659 netdev_name, strerror(error));
1663 error = tc_add_policer(netdev, kbits_rate, kbits_burst);
1665 VLOG_WARN_RL(&rl, "%s: adding policing action failed: %s",
1666 netdev_name, strerror(error));
1671 netdev_dev->kbits_rate = kbits_rate;
1672 netdev_dev->kbits_burst = kbits_burst;
1673 netdev_dev->cache_valid |= VALID_POLICING;
/* Adds to the sset 'types' the ovs_name of every entry in the NULL-terminated
 * 'tcs' table that has a tc_install callback and a non-empty ovs_name.
 * 'netdev' is not used.  NOTE(review): the declaration of the 'types'
 * parameter is on a line not visible in this excerpt. */
1679 netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
1682 const struct tc_ops **opsp;
1684 for (opsp = tcs; *opsp != NULL; opsp++) {
1685 const struct tc_ops *ops = *opsp;
1686 if (ops->tc_install && ops->ovs_name[0] != '\0') {
1687 sset_add(types, ops->ovs_name);
/* Scans the NULL-terminated 'tcs' table for an entry whose ovs_name equals
 * 'name'.  Callers in this file treat a null result as "no such type".
 * NOTE(review): the return statements are not visible in this excerpt. */
1693 static const struct tc_ops *
1694 tc_lookup_ovs_name(const char *name)
1696 const struct tc_ops **opsp;
1698 for (opsp = tcs; *opsp != NULL; opsp++) {
1699 const struct tc_ops *ops = *opsp;
1700 if (!strcmp(name, ops->ovs_name)) {
/* Scans the NULL-terminated 'tcs' table for an entry whose linux_name equals
 * 'name'.  Entries with a null linux_name are never matched, because the
 * pointer is tested before strcmp() is called.  NOTE(review): the return
 * statements are not visible in this excerpt. */
1707 static const struct tc_ops *
1708 tc_lookup_linux_name(const char *name)
1710 const struct tc_ops **opsp;
1712 for (opsp = tcs; *opsp != NULL; opsp++) {
1713 const struct tc_ops *ops = *opsp;
1714 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
/* Looks in hash bucket 'hash' of the tc->queues hmap of 'netdev' for the
 * tc_queue whose queue_id equals 'queue_id'.  NOTE(review): the declaration of
 * 'hash' and the return statements are on lines not visible in this excerpt;
 * callers in this file test the result for null. */
1721 static struct tc_queue *
1722 tc_find_queue__(const struct netdev *netdev, unsigned int queue_id,
1725 struct netdev_dev_linux *netdev_dev =
1726 netdev_dev_linux_cast(netdev_get_dev(netdev));
1727 struct tc_queue *queue;
1729 HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev_dev->tc->queues) {
1730 if (queue->queue_id == queue_id) {
/* Convenience wrapper around tc_find_queue__() that computes the bucket hash
 * itself as hash_int(queue_id, 0). */
1737 static struct tc_queue *
1738 tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
1740 return tc_find_queue__(netdev, queue_id, hash_int(queue_id, 0));
/* Looks up the tc_ops for QoS type 'type' with tc_lookup_ovs_name() and copies
 * its n_queues into caps->n_queues.  'netdev' is not used.  NOTE(review): the
 * declaration of 'type', the handling of an unknown type and the return
 * statement are not visible in this excerpt. */
1744 netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
1746 struct netdev_qos_capabilities *caps)
1748 const struct tc_ops *ops = tc_lookup_ovs_name(type);
1752 caps->n_queues = ops->n_queues;
/* Reports the QoS configuration of 'netdev': after tc_query_qdisc(), '*typep'
 * is set to the ovs_name of the device's tc ops, and 'details' is filled by
 * the ops' qdisc_get callback when one exists.  NOTE(review): the handling of
 * a tc_query_qdisc() error and the value returned when qdisc_get is null are
 * not visible in this excerpt. */
1757 netdev_linux_get_qos(const struct netdev *netdev,
1758 const char **typep, struct shash *details)
1760 struct netdev_dev_linux *netdev_dev =
1761 netdev_dev_linux_cast(netdev_get_dev(netdev));
1764 error = tc_query_qdisc(netdev);
1769 *typep = netdev_dev->tc->ops->ovs_name;
1770 return (netdev_dev->tc->ops->qdisc_get
1771 ? netdev_dev->tc->ops->qdisc_get(netdev, details)
/* Configures QoS type 'type' with 'details' on 'netdev'.
 *
 * 'type' is resolved with tc_lookup_ovs_name(); an unknown type or one without
 * a tc_install callback is singled out up front.  After tc_query_qdisc():
 *
 *   - If the requested ops are the ones already installed, only the ops'
 *     qdisc_set callback is invoked (0 is returned when there is none).
 *   - Otherwise the existing qdisc is deleted with tc_del_qdisc(), after which
 *     netdev_dev->tc must be NULL, and the new ops' tc_install is called.  The
 *     final assertion checks that tc_install left netdev_dev->tc non-null
 *     exactly when it reported success.
 *
 * NOTE(review): the error returns between the steps are not visible in this
 * excerpt. */
1776 netdev_linux_set_qos(struct netdev *netdev,
1777 const char *type, const struct shash *details)
1779 struct netdev_dev_linux *netdev_dev =
1780 netdev_dev_linux_cast(netdev_get_dev(netdev));
1781 const struct tc_ops *new_ops;
1784 new_ops = tc_lookup_ovs_name(type);
1785 if (!new_ops || !new_ops->tc_install) {
1789 error = tc_query_qdisc(netdev);
1794 if (new_ops == netdev_dev->tc->ops) {
1795 return new_ops->qdisc_set ? new_ops->qdisc_set(netdev, details) : 0;
1797 /* Delete existing qdisc. */
1798 error = tc_del_qdisc(netdev);
1802 assert(netdev_dev->tc == NULL);
1804 /* Install new qdisc. */
1805 error = new_ops->tc_install(netdev, details);
1806 assert((error == 0) == (netdev_dev->tc != NULL));
/* Retrieves the configuration of queue 'queue_id' on 'netdev' into 'details':
 * after tc_query_qdisc(), the queue is located with tc_find_queue() and
 * handed to the tc ops' class_get callback.  NOTE(review): the error handling
 * and the value returned when the queue does not exist are not visible in
 * this excerpt. */
1813 netdev_linux_get_queue(const struct netdev *netdev,
1814 unsigned int queue_id, struct shash *details)
1816 struct netdev_dev_linux *netdev_dev =
1817 netdev_dev_linux_cast(netdev_get_dev(netdev));
1820 error = tc_query_qdisc(netdev);
1824 struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1826 ? netdev_dev->tc->ops->class_get(netdev, queue, details)
/* Configures queue 'queue_id' on 'netdev' from 'details' by calling the tc
 * ops' class_set callback.  Before that, tc_query_qdisc() is run, and a
 * 'queue_id' at or beyond the ops' n_queues or a missing class_set callback is
 * singled out.  NOTE(review): the values returned on those paths are not
 * visible in this excerpt. */
1832 netdev_linux_set_queue(struct netdev *netdev,
1833 unsigned int queue_id, const struct shash *details)
1835 struct netdev_dev_linux *netdev_dev =
1836 netdev_dev_linux_cast(netdev_get_dev(netdev));
1839 error = tc_query_qdisc(netdev);
1842 } else if (queue_id >= netdev_dev->tc->ops->n_queues
1843 || !netdev_dev->tc->ops->class_set) {
1847 return netdev_dev->tc->ops->class_set(netdev, queue_id, details);
/* Deletes queue 'queue_id' from 'netdev': after tc_query_qdisc(), and provided
 * the tc ops have a class_delete callback, the queue is located with
 * tc_find_queue() and passed to class_delete.  NOTE(review): the values
 * returned on the error paths and for a nonexistent queue are not visible in
 * this excerpt. */
1851 netdev_linux_delete_queue(struct netdev *netdev, unsigned int queue_id)
1853 struct netdev_dev_linux *netdev_dev =
1854 netdev_dev_linux_cast(netdev_get_dev(netdev));
1857 error = tc_query_qdisc(netdev);
1860 } else if (!netdev_dev->tc->ops->class_delete) {
1863 struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1865 ? netdev_dev->tc->ops->class_delete(netdev, queue)
/* Retrieves statistics for queue 'queue_id' on 'netdev' into '*stats': after
 * tc_query_qdisc(), and provided the tc ops have a class_get_stats callback,
 * the queue is located with tc_find_queue() and passed to class_get_stats.
 * NOTE(review): the values returned on the error paths and for a nonexistent
 * queue are not visible in this excerpt. */
1871 netdev_linux_get_queue_stats(const struct netdev *netdev,
1872 unsigned int queue_id,
1873 struct netdev_queue_stats *stats)
1875 struct netdev_dev_linux *netdev_dev =
1876 netdev_dev_linux_cast(netdev_get_dev(netdev));
1879 error = tc_query_qdisc(netdev);
1882 } else if (!netdev_dev->tc->ops->class_get_stats) {
1885 const struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1887 ? netdev_dev->tc->ops->class_get_stats(netdev, queue, stats)
/* Starts a Netlink dump of the traffic classes of 'netdev' in '*dump': builds
 * an RTM_GETTCLASS request with tcm_parent 0, starts the dump on 'rtnl_sock',
 * and releases the request buffer.  netdev_linux_dump_queue_stats() uses the
 * result as a boolean.  NOTE(review): the handling of a failed
 * tc_make_request() and the return statements are not visible in this
 * excerpt. */
1893 start_queue_dump(const struct netdev *netdev, struct nl_dump *dump)
1895 struct ofpbuf request;
1896 struct tcmsg *tcmsg;
1898 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
1902 tcmsg->tcm_parent = 0;
1903 nl_dump_start(dump, rtnl_sock, &request);
1904 ofpbuf_uninit(&request);
/* Invokes 'cb' for the queues of 'netdev'.
 *
 * After tc_query_qdisc(), and provided the tc ops have a class_get callback,
 * every tc_queue in netdev_dev->tc->queues is visited.  One 'details' shash is
 * reused across iterations: it is cleared, filled by class_get, and passed to
 * (*cb)(queue_id, &details, aux).  It is destroyed after the loop.
 *
 * NOTE(review): how class_get errors affect the callback and the return value
 * is not visible in this excerpt. */
1909 netdev_linux_dump_queues(const struct netdev *netdev,
1910 netdev_dump_queues_cb *cb, void *aux)
1912 struct netdev_dev_linux *netdev_dev =
1913 netdev_dev_linux_cast(netdev_get_dev(netdev));
1914 struct tc_queue *queue;
1915 struct shash details;
1919 error = tc_query_qdisc(netdev);
1922 } else if (!netdev_dev->tc->ops->class_get) {
1927 shash_init(&details);
1928 HMAP_FOR_EACH (queue, hmap_node, &netdev_dev->tc->queues) {
1929 shash_clear(&details);
1931 error = netdev_dev->tc->ops->class_get(netdev, queue, &details);
1933 (*cb)(queue->queue_id, &details, aux);
1938 shash_destroy(&details);
/* Dumps per-queue statistics of 'netdev' through 'cb'.
 *
 * After tc_query_qdisc(), and provided the tc ops have a class_dump_stats
 * callback, a Netlink class dump is started with start_queue_dump() and every
 * message is handed to class_dump_stats together with 'cb' and 'aux'.  The
 * result of nl_dump_done() takes precedence in the return value; otherwise
 * 'last_error' is returned.
 *
 * NOTE(review): the bookkeeping of 'last_error' inside the loop is not visible
 * in this excerpt. */
1944 netdev_linux_dump_queue_stats(const struct netdev *netdev,
1945 netdev_dump_queue_stats_cb *cb, void *aux)
1947 struct netdev_dev_linux *netdev_dev =
1948 netdev_dev_linux_cast(netdev_get_dev(netdev));
1949 struct nl_dump dump;
1954 error = tc_query_qdisc(netdev);
1957 } else if (!netdev_dev->tc->ops->class_dump_stats) {
1962 if (!start_queue_dump(netdev, &dump)) {
1965 while (nl_dump_next(&dump, &msg)) {
1966 error = netdev_dev->tc->ops->class_dump_stats(netdev, &msg, cb, aux);
1972 error = nl_dump_done(&dump);
1973 return error ? error : last_error;
/* Stores the IPv4 address and netmask of 'netdev_' in '*address' and
 * '*netmask'.
 *
 * The values are cached in the netdev_dev: when VALID_IN4 is not set in
 * cache_valid they are fetched with SIOCGIFADDR and SIOCGIFNETMASK and the
 * flag is set.  Returns EADDRNOTAVAIL when the resulting address is
 * INADDR_ANY, otherwise 0.
 *
 * NOTE(review): the handling of ioctl errors during the fetch is not visible
 * in this excerpt. */
1977 netdev_linux_get_in4(const struct netdev *netdev_,
1978 struct in_addr *address, struct in_addr *netmask)
1980 struct netdev_dev_linux *netdev_dev =
1981 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1983 if (!(netdev_dev->cache_valid & VALID_IN4)) {
1986 error = netdev_linux_get_ipv4(netdev_, &netdev_dev->address,
1987 SIOCGIFADDR, "SIOCGIFADDR");
1992 error = netdev_linux_get_ipv4(netdev_, &netdev_dev->netmask,
1993 SIOCGIFNETMASK, "SIOCGIFNETMASK");
1998 netdev_dev->cache_valid |= VALID_IN4;
2000 *address = netdev_dev->address;
2001 *netmask = netdev_dev->netmask;
2002 return address->s_addr == INADDR_ANY ? EADDRNOTAVAIL : 0;
/* Assigns IPv4 'address' and 'netmask' to 'netdev_'.
 *
 * The address is set with SIOCSIFADDR.  The cache (VALID_IN4, address,
 * netmask) is updated before the netmask ioctl is issued, and SIOCSIFNETMASK
 * is only issued when 'address' is not INADDR_ANY.
 *
 * NOTE(review): because the cache is written before SIOCSIFNETMASK runs, a
 * failing netmask ioctl would presumably leave a netmask cached that the
 * kernel did not accept -- confirm against the hidden control flow. */
2006 netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
2007 struct in_addr netmask)
2009 struct netdev_dev_linux *netdev_dev =
2010 netdev_dev_linux_cast(netdev_get_dev(netdev_));
2013 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
2015 netdev_dev->cache_valid |= VALID_IN4;
2016 netdev_dev->address = address;
2017 netdev_dev->netmask = netmask;
2018 if (address.s_addr != INADDR_ANY) {
2019 error = do_set_addr(netdev_, SIOCSIFNETMASK,
2020 "SIOCSIFNETMASK", netmask);
/* Parses one text line (the caller feeds it lines of /proc/net/if_inet6).
 *
 * The scan format reads sixteen two-digit hexadecimal bytes into
 * in6->s6_addr[0..15], skips four further hexadecimal fields, and reads an
 * interface name of at most 16 characters into 'ifname', which is why the
 * buffer is declared with 16 + 1 elements.  The caller uses the result as a
 * boolean.
 *
 * NOTE(review): the scanning call itself and the comparison of its result are
 * on lines not visible in this excerpt. */
2027 parse_if_inet6_line(const char *line,
2028 struct in6_addr *in6, char ifname[16 + 1])
2030 uint8_t *s6 = in6->s6_addr;
2031 #define X8 "%2"SCNx8
2033 " "X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8
2034 "%*x %*x %*x %*x %16s\n",
2035 &s6[0], &s6[1], &s6[2], &s6[3],
2036 &s6[4], &s6[5], &s6[6], &s6[7],
2037 &s6[8], &s6[9], &s6[10], &s6[11],
2038 &s6[12], &s6[13], &s6[14], &s6[15],
/* Implementation notes: the address is cached in netdev_dev->in6 under
 * VALID_IN6.  On a cache miss the cached value is reset to in6addr_any and
 * /proc/net/if_inet6 is scanned line by line with parse_if_inet6_line(); a
 * line whose interface name equals this device's name supplies the address.
 *
 * NOTE(review): the final assignment to '*in6' has no visible null check,
 * although the comment below allows a null 'in6', and the comment speaks of
 * true/false while the return statement is not visible -- confirm both. */
2042 /* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address (if
2043 * 'in6' is non-null) and returns true. Otherwise, returns false. */
2045 netdev_linux_get_in6(const struct netdev *netdev_, struct in6_addr *in6)
2047 struct netdev_dev_linux *netdev_dev =
2048 netdev_dev_linux_cast(netdev_get_dev(netdev_));
2049 if (!(netdev_dev->cache_valid & VALID_IN6)) {
2053 netdev_dev->in6 = in6addr_any;
2055 file = fopen("/proc/net/if_inet6", "r");
2057 const char *name = netdev_get_name(netdev_);
2058 while (fgets(line, sizeof line, file)) {
2059 struct in6_addr in6_tmp;
2060 char ifname[16 + 1];
2061 if (parse_if_inet6_line(line, &in6_tmp, ifname)
2062 && !strcmp(name, ifname))
2064 netdev_dev->in6 = in6_tmp;
2070 netdev_dev->cache_valid |= VALID_IN6;
2072 *in6 = netdev_dev->in6;
/* Initializes the generic socket address '*sa' as an AF_INET address holding
 * 'addr'.  A zeroed struct sockaddr_in is filled in locally, then '*sa' is
 * zeroed and the sockaddr_in is copied over its beginning. */
2077 make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
2079 struct sockaddr_in sin;
2080 memset(&sin, 0, sizeof sin);
2081 sin.sin_family = AF_INET;
2082 sin.sin_addr = addr;
2085 memset(sa, 0, sizeof *sa);
2086 memcpy(sa, &sin, sizeof sin);
/* Issues the address-setting ioctl 'ioctl_nr' on 'netdev' with IPv4 address
 * 'addr': the ifreq is given the device name and an AF_INET sockaddr built by
 * make_in4_sockaddr(), and the request goes through netdev_linux_do_ioctl(),
 * whose status is returned.  'ioctl_name' is presumably forwarded as the last
 * argument of that call, which is on a line not visible here. */
2090 do_set_addr(struct netdev *netdev,
2091 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
2094 ovs_strzcpy(ifr.ifr_name, netdev_get_name(netdev), sizeof ifr.ifr_name);
2095 make_in4_sockaddr(&ifr.ifr_addr, addr);
2097 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
/* Implementation notes: the route entry has destination and genmask
 * INADDR_ANY, gateway 'router', and flags RTF_UP | RTF_GATEWAY, and it is
 * installed with SIOCADDRT on 'af_inet_sock'.  'netdev' is not used.  A
 * warning is associated with failure.  NOTE(review): the return statement is
 * not visible in this excerpt. */
2101 /* Adds 'router' as a default IP gateway. */
2103 netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
2105 struct in_addr any = { INADDR_ANY };
2109 memset(&rt, 0, sizeof rt);
2110 make_in4_sockaddr(&rt.rt_dst, any);
2111 make_in4_sockaddr(&rt.rt_gateway, router);
2112 make_in4_sockaddr(&rt.rt_genmask, any);
2113 rt.rt_flags = RTF_UP | RTF_GATEWAY;
2114 error = ioctl(af_inet_sock, SIOCADDRT, &rt) < 0 ? errno : 0;
2116 VLOG_WARN("ioctl(SIOCADDRT): %s", strerror(error));
/* Finds the route that covers '*host' by scanning /proc/net/route.
 *
 * '*netdev_name' starts out NULL.  Each line is parsed into interface name,
 * destination, gateway, flags, refcnt, use, metric, mask, mtu, window and
 * irtt; lines that do not yield all 11 fields produce a warning.  Routes
 * without RTF_UP are singled out as not usable.  For a route whose masked
 * destination matches the masked host address, '*next_hop' is set to 0 (host
 * directly reachable) or to the gateway, and '*netdev_name' receives an
 * xstrdup()'d copy of the interface name, so the caller owns that string.
 *
 * NOTE(review): the declaration of the 'netdev_name' parameter, the test that
 * selects between the two next-hop cases, and the return statements are not
 * visible in this excerpt. */
2122 netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
2125 static const char fn[] = "/proc/net/route";
2130 *netdev_name = NULL;
2131 stream = fopen(fn, "r");
2132 if (stream == NULL) {
2133 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
2138 while (fgets(line, sizeof line, stream)) {
2141 ovs_be32 dest, gateway, mask;
2142 int refcnt, metric, mtu;
2143 unsigned int flags, use, window, irtt;
2146 "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
2148 iface, &dest, &gateway, &flags, &refcnt,
2149 &use, &metric, &mask, &mtu, &window, &irtt) != 11) {
2151 VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
2155 if (!(flags & RTF_UP)) {
2156 /* Skip routes that aren't up. */
2160 /* The output of 'dest', 'mask', and 'gateway' were given in
2161 * network byte order, so we don't need any endian
2162 * conversions here. */
2163 if ((dest & mask) == (host->s_addr & mask)) {
2165 /* The host is directly reachable. */
2166 next_hop->s_addr = 0;
2168 /* To reach the host, we must go through a gateway. */
2169 next_hop->s_addr = gateway;
2171 *netdev_name = xstrdup(iface);
/* Adds driver information about 'netdev' to the shash 'sh': after
 * netdev_linux_get_drvinfo(), the keys "driver_name", "driver_version" and
 * "firmware_version" are added with xstrdup()'d copies of the corresponding
 * drvinfo strings.  NOTE(review): the check of 'error' and the return
 * statement are not visible in this excerpt. */
2183 netdev_linux_get_status(const struct netdev *netdev, struct shash *sh)
2186 struct netdev_dev_linux *netdev_dev =
2187 netdev_dev_linux_cast(netdev_get_dev(netdev));
2189 error = netdev_linux_get_drvinfo(netdev_dev);
2191 shash_add(sh, "driver_name", xstrdup(netdev_dev->drvinfo.driver));
2192 shash_add(sh, "driver_version", xstrdup(netdev_dev->drvinfo.version));
2193 shash_add(sh, "firmware_version", xstrdup(netdev_dev->drvinfo.fw_version));
/* Status callback for internal devices: adds the fixed entry
 * "driver_name" = "openvswitch" (an xstrdup()'d copy) to 'sh'.  'netdev' is
 * not used. */
2199 netdev_internal_get_status(const struct netdev *netdev OVS_UNUSED, struct shash *sh)
2201 shash_add(sh, "driver_name", xstrdup("openvswitch"));
/* Implementation notes: a zeroed struct arpreq is filled with an AF_INET
 * protocol address holding 'ip', hardware family ARPHRD_ETHER and the device
 * name, then passed to SIOCGARP on 'af_inet_sock'.  On the success path
 * ETH_ADDR_LEN bytes of arp_ha.sa_data are copied into 'mac'.  Errors other
 * than ENXIO produce a rate-limited warning; ENXIO is not logged. */
2205 /* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
2206 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
2207 * returns 0. Otherwise, it returns a positive errno value; in particular,
2208 * ENXIO indicates that there is no ARP table entry for 'ip' on 'netdev'. */
2210 netdev_linux_arp_lookup(const struct netdev *netdev,
2211 ovs_be32 ip, uint8_t mac[ETH_ADDR_LEN])
2214 struct sockaddr_in sin;
2217 memset(&r, 0, sizeof r);
2218 memset(&sin, 0, sizeof sin);
2219 sin.sin_family = AF_INET;
2220 sin.sin_addr.s_addr = ip;
2222 memcpy(&r.arp_pa, &sin, sizeof sin);
2223 r.arp_ha.sa_family = ARPHRD_ETHER;
2225 ovs_strzcpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
2226 COVERAGE_INC(netdev_arp_lookup);
2227 retval = ioctl(af_inet_sock, SIOCGARP, &r) < 0 ? errno : 0;
2229 memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
2230 } else if (retval != ENXIO) {
2231 VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
2232 netdev_get_name(netdev), IP_ARGS(&ip), strerror(retval));
/* Translates the NETDEV_UP and NETDEV_PROMISC bits of 'nd' into interface
 * flag bits; netdev_linux_update_flags() uses the result to mask and set
 * ifi_flags.  NOTE(review): the statements executed for each tested bit are
 * not visible in this excerpt; presumably they set IFF_UP and IFF_PROMISC,
 * mirroring iff_to_nd_flags() -- confirm. */
2238 nd_to_iff_flags(enum netdev_flags nd)
2241 if (nd & NETDEV_UP) {
2244 if (nd & NETDEV_PROMISC) {
/* Translates interface flag bits 'iff' into enum netdev_flags, starting from
 * 0: IFF_PROMISC maps to NETDEV_PROMISC.  NOTE(review): further mappings
 * (presumably IFF_UP to NETDEV_UP) and the return statement are on lines not
 * visible in this excerpt. */
2251 iff_to_nd_flags(int iff)
2253 enum netdev_flags nd = 0;
2257 if (iff & IFF_PROMISC) {
2258 nd |= NETDEV_PROMISC;
/* Clears the flags in 'off' and sets the flags in 'on' for 'netdev'.
 *
 * The previous flags are taken from the cached netdev_dev->ifi_flags and
 * reported through '*old_flagsp' in enum netdev_flags form.  The new value is
 * the old one with the 'off' bits removed and the 'on' bits added, so a flag
 * present in both ends up set.  set_flags() is only called when the value
 * actually changes, after which ifi_flags is refreshed with get_flags().
 *
 * NOTE(review): the initial value of 'error' and the return statement are not
 * visible in this excerpt. */
2264 netdev_linux_update_flags(struct netdev *netdev, enum netdev_flags off,
2265 enum netdev_flags on, enum netdev_flags *old_flagsp)
2267 struct netdev_dev_linux *netdev_dev;
2268 int old_flags, new_flags;
2271 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
2272 old_flags = netdev_dev->ifi_flags;
2273 *old_flagsp = iff_to_nd_flags(old_flags);
2274 new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
2275 if (new_flags != old_flags) {
2276 error = set_flags(netdev, new_flags);
2277 get_flags(&netdev_dev->netdev_dev, &netdev_dev->ifi_flags);
/* Returns the 'change_seq' member of the netdev_dev_linux that backs
 * 'netdev'. */
2283 netdev_linux_change_seq(const struct netdev *netdev)
2285 return netdev_dev_linux_cast(netdev_get_dev(netdev))->change_seq;
/* Initializer template for the struct netdev_class instances defined right
 * after this macro.  Most callbacks are the shared netdev_linux_* functions
 * listed here; get_config and set_config are NULL.  The macro parameters
 * (NAME, CREATE, GET_STATS, SET_STATS and any further ones on the
 * continuation line) are the slots that differ per class.  NOTE(review):
 * several lines of the macro, including those that expand the parameters, are
 * not visible in this excerpt. */
2288 #define NETDEV_LINUX_CLASS(NAME, CREATE, GET_STATS, SET_STATS, \
2293 netdev_linux_init, \
2295 netdev_linux_wait, \
2298 netdev_linux_destroy, \
2299 NULL, /* get_config */ \
2300 NULL, /* set_config */ \
2302 netdev_linux_open, \
2303 netdev_linux_close, \
2305 netdev_linux_listen, \
2306 netdev_linux_recv, \
2307 netdev_linux_recv_wait, \
2308 netdev_linux_drain, \
2310 netdev_linux_send, \
2311 netdev_linux_send_wait, \
2313 netdev_linux_set_etheraddr, \
2314 netdev_linux_get_etheraddr, \
2315 netdev_linux_get_mtu, \
2316 netdev_linux_set_mtu, \
2317 netdev_linux_get_ifindex, \
2318 netdev_linux_get_carrier, \
2319 netdev_linux_get_carrier_resets, \
2320 netdev_linux_set_miimon_interval, \
2324 netdev_linux_get_features, \
2325 netdev_linux_set_advertisements, \
2327 netdev_linux_set_policing, \
2328 netdev_linux_get_qos_types, \
2329 netdev_linux_get_qos_capabilities, \
2330 netdev_linux_get_qos, \
2331 netdev_linux_set_qos, \
2332 netdev_linux_get_queue, \
2333 netdev_linux_set_queue, \
2334 netdev_linux_delete_queue, \
2335 netdev_linux_get_queue_stats, \
2336 netdev_linux_dump_queues, \
2337 netdev_linux_dump_queue_stats, \
2339 netdev_linux_get_in4, \
2340 netdev_linux_set_in4, \
2341 netdev_linux_get_in6, \
2342 netdev_linux_add_router, \
2343 netdev_linux_get_next_hop, \
2345 netdev_linux_arp_lookup, \
2347 netdev_linux_update_flags, \
2349 netdev_linux_change_seq \
/* The three netdev classes built from NETDEV_LINUX_CLASS.  They differ in the
 * create, get_stats, set_stats and get_status callbacks:
 *
 *   netdev_linux_class:    netdev_linux_create, netdev_linux_get_stats,
 *                          no set_stats, netdev_linux_get_status.
 *   netdev_tap_class:      netdev_linux_create_tap, netdev_tap_get_stats,
 *                          no set_stats, netdev_linux_get_status.
 *   netdev_internal_class: netdev_linux_create, netdev_internal_get_stats,
 *                          netdev_vport_set_stats, netdev_internal_get_status.
 *
 * NOTE(review): the macro invocation lines carrying each class's name string
 * are not visible in this excerpt. */
2352 const struct netdev_class netdev_linux_class =
2355 netdev_linux_create,
2356 netdev_linux_get_stats,
2357 NULL, /* set_stats */
2358 netdev_linux_get_status);
2360 const struct netdev_class netdev_tap_class =
2363 netdev_linux_create_tap,
2364 netdev_tap_get_stats,
2365 NULL, /* set_stats */
2366 netdev_linux_get_status);
2368 const struct netdev_class netdev_internal_class =
2371 netdev_linux_create,
2372 netdev_internal_get_stats,
2373 netdev_vport_set_stats,
2374 netdev_internal_get_status);
/* Data definitions for the HTB implementation.
 *
 * HTB_N_QUEUES is the largest class minor number accepted by
 * htb_parse_tcmsg__(), which maps minor numbers 1...HTB_N_QUEUES to queue ids
 * 0...HTB_N_QUEUES - 1.
 *
 * The first 'max_rate' member below precedes the struct that embeds a
 * tc_queue, so it presumably belongs to struct htb (the per-qdisc state that
 * htb_get__() recovers with CONTAINER_OF); the remaining members describe one
 * HTB class/queue.  NOTE(review): the struct opening and closing lines are not
 * visible in this excerpt. */
2376 /* HTB traffic control class. */
2378 #define HTB_N_QUEUES 0xf000
2382 unsigned int max_rate; /* In bytes/s. */
2386 struct tc_queue tc_queue;
2387 unsigned int min_rate; /* In bytes/s. */
2388 unsigned int max_rate; /* In bytes/s. */
2389 unsigned int burst; /* In bytes. */
2390 unsigned int priority; /* Lower values are higher priorities. */
/* Returns the struct htb that contains the 'tc' pointed to by the
 * netdev_dev->tc of 'netdev', using CONTAINER_OF on struct htb's 'tc' member.
 * This is only meaningful when the installed tc really is embedded in a
 * struct htb, as arranged by htb_install__(). */
2394 htb_get__(const struct netdev *netdev)
2396 struct netdev_dev_linux *netdev_dev =
2397 netdev_dev_linux_cast(netdev_get_dev(netdev));
2398 return CONTAINER_OF(netdev_dev->tc, struct htb, tc);
/* Allocates a struct htb with xmalloc(), initializes its embedded tc with
 * tc_ops_htb, records 'max_rate', and makes netdev_dev->tc of 'netdev' point
 * at the embedded tc.  This only sets up the in-memory state; no kernel
 * request is issued here.
 *
 * NOTE(review): 'max_rate' is a uint64_t here, while the 'max_rate' members
 * visible in the HTB struct definitions are unsigned int, so large values are
 * presumably narrowed on assignment -- confirm. */
2402 htb_install__(struct netdev *netdev, uint64_t max_rate)
2404 struct netdev_dev_linux *netdev_dev =
2405 netdev_dev_linux_cast(netdev_get_dev(netdev));
2408 htb = xmalloc(sizeof *htb);
2409 tc_init(&htb->tc, &tc_ops_htb);
2410 htb->max_rate = max_rate;
2412 netdev_dev->tc = &htb->tc;
/* Implementation notes: any existing qdisc is deleted first with
 * tc_del_qdisc(), whose result is not checked in the visible code.  The
 * RTM_NEWQDISC request uses NLM_F_EXCL | NLM_F_CREATE, handle 1:0 and parent
 * TC_H_ROOT, kind "htb", and a nested TCA_OPTIONS attribute carrying a struct
 * tc_htb_glob as TCA_HTB_INIT with rate2quantum set to 10.  The result of
 * tc_transact() is returned.  NOTE(review): other tc_htb_glob members may be
 * set on lines not visible in this excerpt. */
2415 /* Create an HTB qdisc.
2417 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
2419 htb_setup_qdisc__(struct netdev *netdev)
2422 struct tc_htb_glob opt;
2423 struct ofpbuf request;
2424 struct tcmsg *tcmsg;
2426 tc_del_qdisc(netdev);
2428 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2429 NLM_F_EXCL | NLM_F_CREATE, &request);
2433 tcmsg->tcm_handle = tc_make_handle(1, 0);
2434 tcmsg->tcm_parent = TC_H_ROOT;
2436 nl_msg_put_string(&request, TCA_KIND, "htb");
2438 memset(&opt, 0, sizeof opt);
2439 opt.rate2quantum = 10;
2443 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2444 nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
2445 nl_msg_end_nested(&request, opt_offset);
2447 return tc_transact(&request, NULL);
/* Creates or replaces HTB class 'handle' under 'parent' on 'netdev' using the
 * parameters in '*class'.
 *
 * The device MTU is required: it feeds tc_fill_rate() for the rate and ceil
 * specifications and tc_calc_buffer() for 'buffer' and 'cbuffer'.  The
 * RTM_NEWTCLASS request (NLM_F_CREATE) carries kind "htb" and a nested
 * TCA_OPTIONS attribute with TCA_HTB_PARMS plus the rate tables TCA_HTB_RTAB
 * and TCA_HTB_CTAB.  A rate-limited warning describing the class is
 * associated with a failed transaction.
 *
 * NOTE(review): the error returns and the final return statement are not
 * visible in this excerpt. */
2450 /* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
2451 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
2453 htb_setup_class__(struct netdev *netdev, unsigned int handle,
2454 unsigned int parent, struct htb_class *class)
2457 struct tc_htb_opt opt;
2458 struct ofpbuf request;
2459 struct tcmsg *tcmsg;
2463 error = netdev_get_mtu(netdev, &mtu);
2465 VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU",
2466 netdev_get_name(netdev));
2470 memset(&opt, 0, sizeof opt);
2471 tc_fill_rate(&opt.rate, class->min_rate, mtu);
2472 tc_fill_rate(&opt.ceil, class->max_rate, mtu);
2473 opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
2474 opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
2475 opt.prio = class->priority;
2477 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
2481 tcmsg->tcm_handle = handle;
2482 tcmsg->tcm_parent = parent;
2484 nl_msg_put_string(&request, TCA_KIND, "htb");
2485 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2486 nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
2487 tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
2488 tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
2489 nl_msg_end_nested(&request, opt_offset);
2491 error = tc_transact(&request, NULL);
2493 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
2494 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
2495 netdev_get_name(netdev),
2496 tc_get_major(handle), tc_get_minor(handle),
2497 tc_get_major(parent), tc_get_minor(parent),
2498 class->min_rate, class->max_rate,
2499 class->burst, class->priority, strerror(error));
/* Implementation notes: the nested attributes in 'nl_options' are validated
 * against a policy that requires TCA_HTB_PARMS with at least
 * sizeof(struct tc_htb_opt) bytes; a parse failure produces a rate-limited
 * warning.  The output goes to '*class' (the comment below calls it
 * 'details'): min_rate from rate.rate, max_rate from ceil.rate, burst
 * converted from ticks with tc_ticks_to_bytes(), and priority from prio.
 * NOTE(review): the return statements are not visible in this excerpt. */
2504 /* Parses Netlink attributes in 'options' for HTB parameters and stores a
2505 * description of them into 'details'. The description complies with the
2506 * specification given in the vswitch database documentation for linux-htb
2509 htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
2511 static const struct nl_policy tca_htb_policy[] = {
2512 [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
2513 .min_len = sizeof(struct tc_htb_opt) },
2516 struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
2517 const struct tc_htb_opt *htb;
2519 if (!nl_parse_nested(nl_options, tca_htb_policy,
2520 attrs, ARRAY_SIZE(tca_htb_policy))) {
2521 VLOG_WARN_RL(&rl, "failed to parse HTB class options");
2525 htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
2526 class->min_rate = htb->rate.rate;
2527 class->max_rate = htb->ceil.rate;
2528 class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
2529 class->priority = htb->prio;
/* Parses the traffic-class message 'tcmsg' with tc_parse_class(), which also
 * fills '*stats'.
 *
 * 'queue_id' and 'options' are optional outputs, each used only when non-null
 * and no error has occurred so far.  A class handle with major number 1 and
 * minor number in 1...HTB_N_QUEUES yields queue id "minor - 1".  The nested
 * options are decoded into '*options' by htb_parse_tca_options__().
 *
 * NOTE(review): the handling of handles outside that range and the return
 * statement are not visible in this excerpt. */
2534 htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2535 struct htb_class *options,
2536 struct netdev_queue_stats *stats)
2538 struct nlattr *nl_options;
2539 unsigned int handle;
2542 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2543 if (!error && queue_id) {
2544 unsigned int major = tc_get_major(handle);
2545 unsigned int minor = tc_get_minor(handle);
2546 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2547 *queue_id = minor - 1;
2552 if (!error && options) {
2553 error = htb_parse_tca_options__(nl_options, options);
/* Derives the qdisc-level HTB parameters for 'netdev' from 'details' and
 * stores them in '*hc'.
 *
 * The "max-rate" detail, when present, is parsed as a decimal number and
 * divided by 8 before being stored in hc->max_rate.  When it is absent or
 * works out to 0, the rate falls back to the device's current link features:
 * netdev_get_features() stores them in 'current', and
 * netdev_features_to_bps(current) / 8 becomes the maximum rate.  hc->min_rate
 * is set equal to hc->max_rate.
 *
 * NOTE(review): the division by 8 presumably converts bits/s to bytes/s, and
 * the declaration of 'current' plus any further assignments to '*hc' are on
 * lines not visible in this excerpt -- confirm. */
2559 htb_parse_qdisc_details__(struct netdev *netdev,
2560 const struct shash *details, struct htb_class *hc)
2562 const char *max_rate_s;
2564 max_rate_s = shash_find_data(details, "max-rate");
2565 hc->max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
2566 if (!hc->max_rate) {
2569 netdev_get_features(netdev, &current, NULL, NULL, NULL);
2570 hc->max_rate = netdev_features_to_bps(current) / 8;
2572 hc->min_rate = hc->max_rate;
2578 htb_parse_class_details__(struct netdev *netdev,
2579 const struct shash *details, struct htb_class *hc)
2581 const struct htb *htb = htb_get__(netdev);
2582 const char *min_rate_s = shash_find_data(details, "min-rate");
2583 const char *max_rate_s = shash_find_data(details, "max-rate");
2584 const char *burst_s = shash_find_data(details, "burst");
2585 const char *priority_s = shash_find_data(details, "priority");
2588 error = netdev_get_mtu(netdev, &mtu);
2590 VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU",
2591 netdev_get_name(netdev));
2595 /* HTB requires at least an mtu sized min-rate to send any traffic even
2596 * on uncongested links. */
2597 hc->min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
2598 hc->min_rate = MAX(hc->min_rate, mtu);
2599 hc->min_rate = MIN(hc->min_rate, htb->max_rate);
2602 hc->max_rate = (max_rate_s
2603 ? strtoull(max_rate_s, NULL, 10) / 8
2605 hc->max_rate = MAX(hc->max_rate, hc->min_rate);
2606 hc->max_rate = MIN(hc->max_rate, htb->max_rate);
2610 * According to hints in the documentation that I've read, it is important
2611 * that 'burst' be at least as big as the largest frame that might be
2612 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
2613 * but having it a bit too small is a problem. Since netdev_get_mtu()
2614 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
2615 * the MTU. We actually add 64, instead of 14, as a guard against
2616 * additional headers get tacked on somewhere that we're not aware of. */
2617 hc->burst = burst_s ? strtoull(burst_s, NULL, 10) / 8 : 0;
2618 hc->burst = MAX(hc->burst, mtu + 64);
2621 hc->priority = priority_s ? strtoul(priority_s, NULL, 10) : 0;
2627 htb_query_class__(const struct netdev *netdev, unsigned int handle,
2628 unsigned int parent, struct htb_class *options,
2629 struct netdev_queue_stats *stats)
2631 struct ofpbuf *reply;
2634 error = tc_query_class(netdev, handle, parent, &reply);
2636 error = htb_parse_tcmsg__(reply, NULL, options, stats);
2637 ofpbuf_delete(reply);
2643 htb_tc_install(struct netdev *netdev, const struct shash *details)
2647 error = htb_setup_qdisc__(netdev);
2649 struct htb_class hc;
2651 htb_parse_qdisc_details__(netdev, details, &hc);
2652 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2653 tc_make_handle(1, 0), &hc);
2655 htb_install__(netdev, hc.max_rate);
2661 static struct htb_class *
2662 htb_class_cast__(const struct tc_queue *queue)
2664 return CONTAINER_OF(queue, struct htb_class, tc_queue);
2668 htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
2669 const struct htb_class *hc)
2671 struct htb *htb = htb_get__(netdev);
2672 size_t hash = hash_int(queue_id, 0);
2673 struct tc_queue *queue;
2674 struct htb_class *hcp;
2676 queue = tc_find_queue__(netdev, queue_id, hash);
2678 hcp = htb_class_cast__(queue);
2680 hcp = xmalloc(sizeof *hcp);
2681 queue = &hcp->tc_queue;
2682 queue->queue_id = queue_id;
2683 hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
2686 hcp->min_rate = hc->min_rate;
2687 hcp->max_rate = hc->max_rate;
2688 hcp->burst = hc->burst;
2689 hcp->priority = hc->priority;
2693 htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
2696 struct nl_dump dump;
2697 struct htb_class hc;
2699 /* Get qdisc options. */
2701 htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
2702 htb_install__(netdev, hc.max_rate);
2705 if (!start_queue_dump(netdev, &dump)) {
2708 while (nl_dump_next(&dump, &msg)) {
2709 unsigned int queue_id;
2711 if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
2712 htb_update_queue__(netdev, queue_id, &hc);
2715 nl_dump_done(&dump);
2721 htb_tc_destroy(struct tc *tc)
2723 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
2724 struct htb_class *hc, *next;
2726 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &htb->tc.queues) {
2727 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
2735 htb_qdisc_get(const struct netdev *netdev, struct shash *details)
2737 const struct htb *htb = htb_get__(netdev);
2738 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * htb->max_rate));
2743 htb_qdisc_set(struct netdev *netdev, const struct shash *details)
2745 struct htb_class hc;
2748 htb_parse_qdisc_details__(netdev, details, &hc);
2749 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2750 tc_make_handle(1, 0), &hc);
2752 htb_get__(netdev)->max_rate = hc.max_rate;
2758 htb_class_get(const struct netdev *netdev OVS_UNUSED,
2759 const struct tc_queue *queue, struct shash *details)
2761 const struct htb_class *hc = htb_class_cast__(queue);
2763 shash_add(details, "min-rate", xasprintf("%llu", 8ULL * hc->min_rate));
2764 if (hc->min_rate != hc->max_rate) {
2765 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hc->max_rate));
2767 shash_add(details, "burst", xasprintf("%llu", 8ULL * hc->burst));
2769 shash_add(details, "priority", xasprintf("%u", hc->priority));
2775 htb_class_set(struct netdev *netdev, unsigned int queue_id,
2776 const struct shash *details)
2778 struct htb_class hc;
2781 error = htb_parse_class_details__(netdev, details, &hc);
2786 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
2787 tc_make_handle(1, 0xfffe), &hc);
2792 htb_update_queue__(netdev, queue_id, &hc);
2797 htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
2799 struct htb_class *hc = htb_class_cast__(queue);
2800 struct htb *htb = htb_get__(netdev);
2803 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
2805 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
2812 htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
2813 struct netdev_queue_stats *stats)
2815 return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
2816 tc_make_handle(1, 0xfffe), NULL, stats);
2820 htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
2821 const struct ofpbuf *nlmsg,
2822 netdev_dump_queue_stats_cb *cb, void *aux)
2824 struct netdev_queue_stats stats;
2825 unsigned int handle, major, minor;
2828 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
2833 major = tc_get_major(handle);
2834 minor = tc_get_minor(handle);
2835 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2836 (*cb)(minor - 1, &stats, aux);
2841 static const struct tc_ops tc_ops_htb = {
2842 "htb", /* linux_name */
2843 "linux-htb", /* ovs_name */
2844 HTB_N_QUEUES, /* n_queues */
2853 htb_class_get_stats,
2854 htb_class_dump_stats
2857 /* "linux-hfsc" traffic control class. */
2859 #define HFSC_N_QUEUES 0xf000
2867 struct tc_queue tc_queue;
2872 static struct hfsc *
2873 hfsc_get__(const struct netdev *netdev)
2875 struct netdev_dev_linux *netdev_dev;
2876 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
2877 return CONTAINER_OF(netdev_dev->tc, struct hfsc, tc);
2880 static struct hfsc_class *
2881 hfsc_class_cast__(const struct tc_queue *queue)
2883 return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
2887 hfsc_install__(struct netdev *netdev, uint32_t max_rate)
2889 struct netdev_dev_linux * netdev_dev;
2892 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
2893 hfsc = xmalloc(sizeof *hfsc);
2894 tc_init(&hfsc->tc, &tc_ops_hfsc);
2895 hfsc->max_rate = max_rate;
2896 netdev_dev->tc = &hfsc->tc;
2900 hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
2901 const struct hfsc_class *hc)
2905 struct hfsc_class *hcp;
2906 struct tc_queue *queue;
2908 hfsc = hfsc_get__(netdev);
2909 hash = hash_int(queue_id, 0);
2911 queue = tc_find_queue__(netdev, queue_id, hash);
2913 hcp = hfsc_class_cast__(queue);
2915 hcp = xmalloc(sizeof *hcp);
2916 queue = &hcp->tc_queue;
2917 queue->queue_id = queue_id;
2918 hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
2921 hcp->min_rate = hc->min_rate;
2922 hcp->max_rate = hc->max_rate;
2926 hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
2928 const struct tc_service_curve *rsc, *fsc, *usc;
2929 static const struct nl_policy tca_hfsc_policy[] = {
2931 .type = NL_A_UNSPEC,
2933 .min_len = sizeof(struct tc_service_curve),
2936 .type = NL_A_UNSPEC,
2938 .min_len = sizeof(struct tc_service_curve),
2941 .type = NL_A_UNSPEC,
2943 .min_len = sizeof(struct tc_service_curve),
2946 struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];
2948 if (!nl_parse_nested(nl_options, tca_hfsc_policy,
2949 attrs, ARRAY_SIZE(tca_hfsc_policy))) {
2950 VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
2954 rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
2955 fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
2956 usc = nl_attr_get(attrs[TCA_HFSC_USC]);
2958 if (rsc->m1 != 0 || rsc->d != 0 ||
2959 fsc->m1 != 0 || fsc->d != 0 ||
2960 usc->m1 != 0 || usc->d != 0) {
2961 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
2962 "Non-linear service curves are not supported.");
2966 if (rsc->m2 != fsc->m2) {
2967 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
2968 "Real-time service curves are not supported ");
2972 if (rsc->m2 > usc->m2) {
2973 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
2974 "Min-rate service curve is greater than "
2975 "the max-rate service curve.");
2979 class->min_rate = fsc->m2;
2980 class->max_rate = usc->m2;
2985 hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2986 struct hfsc_class *options,
2987 struct netdev_queue_stats *stats)
2990 unsigned int handle;
2991 struct nlattr *nl_options;
2993 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2999 unsigned int major, minor;
3001 major = tc_get_major(handle);
3002 minor = tc_get_minor(handle);
3003 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3004 *queue_id = minor - 1;
3011 error = hfsc_parse_tca_options__(nl_options, options);
/* Asks the kernel, via tc_query_class(), for the class 'handle' under
 * 'parent' on 'netdev' and parses the reply with hfsc_parse_tcmsg__() into
 * 'options' and 'stats'.  The reply buffer is released before returning.
 *
 * Returns 0 if successful, otherwise a positive errno value. */
static int
hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
                   unsigned int parent, struct hfsc_class *options,
                   struct netdev_queue_stats *stats)
{
    struct ofpbuf *reply;
    int error;

    error = tc_query_class(netdev, handle, parent, &reply);
    if (!error) {
        error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
        ofpbuf_delete(reply);
    }
    return error;
}
3036 hfsc_parse_qdisc_details__(struct netdev *netdev, const struct shash *details,
3037 struct hfsc_class *class)
3040 const char *max_rate_s;
3042 max_rate_s = shash_find_data(details, "max-rate");
3043 max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
3048 netdev_get_features(netdev, ¤t, NULL, NULL, NULL);
3049 max_rate = netdev_features_to_bps(current) / 8;
3052 class->min_rate = max_rate;
3053 class->max_rate = max_rate;
3057 hfsc_parse_class_details__(struct netdev *netdev,
3058 const struct shash *details,
3059 struct hfsc_class * class)
3061 const struct hfsc *hfsc;
3062 uint32_t min_rate, max_rate;
3063 const char *min_rate_s, *max_rate_s;
3065 hfsc = hfsc_get__(netdev);
3066 min_rate_s = shash_find_data(details, "min-rate");
3067 max_rate_s = shash_find_data(details, "max-rate");
3069 min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
3070 min_rate = MAX(min_rate, 1);
3071 min_rate = MIN(min_rate, hfsc->max_rate);
3073 max_rate = (max_rate_s
3074 ? strtoull(max_rate_s, NULL, 10) / 8
3076 max_rate = MAX(max_rate, min_rate);
3077 max_rate = MIN(max_rate, hfsc->max_rate);
3079 class->min_rate = min_rate;
3080 class->max_rate = max_rate;
3085 /* Create an HFSC qdisc.
3087 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
3089 hfsc_setup_qdisc__(struct netdev * netdev)
3091 struct tcmsg *tcmsg;
3092 struct ofpbuf request;
3093 struct tc_hfsc_qopt opt;
3095 tc_del_qdisc(netdev);
3097 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3098 NLM_F_EXCL | NLM_F_CREATE, &request);
3104 tcmsg->tcm_handle = tc_make_handle(1, 0);
3105 tcmsg->tcm_parent = TC_H_ROOT;
3107 memset(&opt, 0, sizeof opt);
3110 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3111 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
3113 return tc_transact(&request, NULL);
3116 /* Create an HFSC class.
3118 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
3119 * sc rate <min_rate> ul rate <max_rate>" */
3121 hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
3122 unsigned int parent, struct hfsc_class *class)
3126 struct tcmsg *tcmsg;
3127 struct ofpbuf request;
3128 struct tc_service_curve min, max;
3130 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
3136 tcmsg->tcm_handle = handle;
3137 tcmsg->tcm_parent = parent;
3141 min.m2 = class->min_rate;
3145 max.m2 = class->max_rate;
3147 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3148 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3149 nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
3150 nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
3151 nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
3152 nl_msg_end_nested(&request, opt_offset);
3154 error = tc_transact(&request, NULL);
3156 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
3157 "min-rate %ubps, max-rate %ubps (%s)",
3158 netdev_get_name(netdev),
3159 tc_get_major(handle), tc_get_minor(handle),
3160 tc_get_major(parent), tc_get_minor(parent),
3161 class->min_rate, class->max_rate, strerror(error));
3168 hfsc_tc_install(struct netdev *netdev, const struct shash *details)
3171 struct hfsc_class class;
3173 error = hfsc_setup_qdisc__(netdev);
3179 hfsc_parse_qdisc_details__(netdev, details, &class);
3180 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3181 tc_make_handle(1, 0), &class);
3187 hfsc_install__(netdev, class.max_rate);
3192 hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3195 struct nl_dump dump;
3196 struct hfsc_class hc;
3199 hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
3200 hfsc_install__(netdev, hc.max_rate);
3202 if (!start_queue_dump(netdev, &dump)) {
3206 while (nl_dump_next(&dump, &msg)) {
3207 unsigned int queue_id;
3209 if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3210 hfsc_update_queue__(netdev, queue_id, &hc);
3214 nl_dump_done(&dump);
3219 hfsc_tc_destroy(struct tc *tc)
3222 struct hfsc_class *hc, *next;
3224 hfsc = CONTAINER_OF(tc, struct hfsc, tc);
3226 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
3227 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
3236 hfsc_qdisc_get(const struct netdev *netdev, struct shash *details)
3238 const struct hfsc *hfsc;
3239 hfsc = hfsc_get__(netdev);
3240 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hfsc->max_rate));
3245 hfsc_qdisc_set(struct netdev *netdev, const struct shash *details)
3248 struct hfsc_class class;
3250 hfsc_parse_qdisc_details__(netdev, details, &class);
3251 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3252 tc_make_handle(1, 0), &class);
3255 hfsc_get__(netdev)->max_rate = class.max_rate;
3262 hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
3263 const struct tc_queue *queue, struct shash *details)
3265 const struct hfsc_class *hc;
3267 hc = hfsc_class_cast__(queue);
3268 shash_add(details, "min-rate", xasprintf("%llu", 8ULL * hc->min_rate));
3269 if (hc->min_rate != hc->max_rate) {
3270 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hc->max_rate));
3276 hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
3277 const struct shash *details)
3280 struct hfsc_class class;
3282 error = hfsc_parse_class_details__(netdev, details, &class);
3287 error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3288 tc_make_handle(1, 0xfffe), &class);
3293 hfsc_update_queue__(netdev, queue_id, &class);
3298 hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
3302 struct hfsc_class *hc;
3304 hc = hfsc_class_cast__(queue);
3305 hfsc = hfsc_get__(netdev);
3307 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3309 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
3316 hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3317 struct netdev_queue_stats *stats)
3319 return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3320 tc_make_handle(1, 0xfffe), NULL, stats);
3324 hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3325 const struct ofpbuf *nlmsg,
3326 netdev_dump_queue_stats_cb *cb, void *aux)
3328 struct netdev_queue_stats stats;
3329 unsigned int handle, major, minor;
3332 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3337 major = tc_get_major(handle);
3338 minor = tc_get_minor(handle);
3339 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3340 (*cb)(minor - 1, &stats, aux);
3345 static const struct tc_ops tc_ops_hfsc = {
3346 "hfsc", /* linux_name */
3347 "linux-hfsc", /* ovs_name */
3348 HFSC_N_QUEUES, /* n_queues */
3349 hfsc_tc_install, /* tc_install */
3350 hfsc_tc_load, /* tc_load */
3351 hfsc_tc_destroy, /* tc_destroy */
3352 hfsc_qdisc_get, /* qdisc_get */
3353 hfsc_qdisc_set, /* qdisc_set */
3354 hfsc_class_get, /* class_get */
3355 hfsc_class_set, /* class_set */
3356 hfsc_class_delete, /* class_delete */
3357 hfsc_class_get_stats, /* class_get_stats */
3358 hfsc_class_dump_stats /* class_dump_stats */
3361 /* "linux-default" traffic control class.
3363 * This class represents the default, unnamed Linux qdisc. It corresponds to
3364 * the "" (empty string) QoS type in the OVS database. */
3367 default_install__(struct netdev *netdev)
3369 struct netdev_dev_linux *netdev_dev =
3370 netdev_dev_linux_cast(netdev_get_dev(netdev));
3371 static struct tc *tc;
3374 tc = xmalloc(sizeof *tc);
3375 tc_init(tc, &tc_ops_default);
3377 netdev_dev->tc = tc;
3381 default_tc_install(struct netdev *netdev,
3382 const struct shash *details OVS_UNUSED)
3384 default_install__(netdev);
3389 default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3391 default_install__(netdev);
3395 static const struct tc_ops tc_ops_default = {
3396 NULL, /* linux_name */
3401 NULL, /* tc_destroy */
3402 NULL, /* qdisc_get */
3403 NULL, /* qdisc_set */
3404 NULL, /* class_get */
3405 NULL, /* class_set */
3406 NULL, /* class_delete */
3407 NULL, /* class_get_stats */
3408 NULL /* class_dump_stats */
3411 /* "linux-other" traffic control class.
3416 other_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3418 struct netdev_dev_linux *netdev_dev =
3419 netdev_dev_linux_cast(netdev_get_dev(netdev));
3420 static struct tc *tc;
3423 tc = xmalloc(sizeof *tc);
3424 tc_init(tc, &tc_ops_other);
3426 netdev_dev->tc = tc;
3430 static const struct tc_ops tc_ops_other = {
3431 NULL, /* linux_name */
3432 "linux-other", /* ovs_name */
3434 NULL, /* tc_install */
3436 NULL, /* tc_destroy */
3437 NULL, /* qdisc_get */
3438 NULL, /* qdisc_set */
3439 NULL, /* class_get */
3440 NULL, /* class_set */
3441 NULL, /* class_delete */
3442 NULL, /* class_get_stats */
3443 NULL /* class_dump_stats */
3446 /* Traffic control. */
3448 /* Number of kernel "tc" ticks per second. */
3449 static double ticks_per_s;
3451 /* Number of kernel "jiffies" per second. This is used for the purpose of
3452 * computing buffer sizes. Generally kernel qdiscs need to be able to buffer
3453 * one jiffy's worth of data.
3455 * There are two possibilities here:
3457 * - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
3458 * approximate range of 100 to 1024. That means that we really need to
3459 * make sure that the qdisc can buffer that much data.
3461 * - 'buffer_hz' is an absurdly large number. That means that the kernel
3462 * has finely granular timers and there's no need to fudge additional room
3463 * for buffers. (There's no extra effort needed to implement that: the
3464 * large 'buffer_hz' is used as a divisor, so practically any number will
3465 * come out as 0 in the division. Small integer results in the case of
3466 * really high dividends won't have any real effect anyhow.)
3468 static unsigned int buffer_hz;
/* Builds the tc handle 'major':'minor': 'major' is shifted into the upper
 * 16 bits and combined with 'minor' by TC_H_MAKE. */
static unsigned int
tc_make_handle(unsigned int major, unsigned int minor)
{
    const unsigned int major_bits = major << 16;

    return TC_H_MAKE(major_bits, minor);
}
/* Extracts the major number, i.e. the upper 16 bits, from tc 'handle'. */
static unsigned int
tc_get_major(unsigned int handle)
{
    const unsigned int major_bits = TC_H_MAJ(handle);

    return major_bits >> 16;
}
/* Extracts the minor number, i.e. the bits selected by TC_H_MIN, from tc
 * 'handle'. */
static unsigned int
tc_get_minor(unsigned int handle)
{
    const unsigned int minor = TC_H_MIN(handle);

    return minor;
}
3491 static struct tcmsg *
3492 tc_make_request(const struct netdev *netdev, int type, unsigned int flags,
3493 struct ofpbuf *request)
3495 struct tcmsg *tcmsg;
3499 error = get_ifindex(netdev, &ifindex);
3504 ofpbuf_init(request, 512);
3505 nl_msg_put_nlmsghdr(request, sizeof *tcmsg, type, NLM_F_REQUEST | flags);
3506 tcmsg = ofpbuf_put_zeros(request, sizeof *tcmsg);
3507 tcmsg->tcm_family = AF_UNSPEC;
3508 tcmsg->tcm_ifindex = ifindex;
3509 /* Caller should fill in tcmsg->tcm_handle. */
3510 /* Caller should fill in tcmsg->tcm_parent. */
3516 tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
3518 int error = nl_sock_transact(rtnl_sock, request, replyp);
3519 ofpbuf_uninit(request);
3523 /* Adds or deletes a root ingress qdisc on 'netdev'. We use this for
3524 * policing configuration.
3526 * This function is equivalent to running the following when 'add' is true:
3527 * /sbin/tc qdisc add dev <devname> handle ffff: ingress
3529 * This function is equivalent to running the following when 'add' is false:
3530 * /sbin/tc qdisc del dev <devname> handle ffff: ingress
3532 * The configuration and stats may be seen with the following command:
3533 * /sbin/tc -s qdisc show dev <devname>
3535 * Returns 0 if successful, otherwise a positive errno value.
3538 tc_add_del_ingress_qdisc(struct netdev *netdev, bool add)
3540 struct ofpbuf request;
3541 struct tcmsg *tcmsg;
3543 int type = add ? RTM_NEWQDISC : RTM_DELQDISC;
3544 int flags = add ? NLM_F_EXCL | NLM_F_CREATE : 0;
3546 tcmsg = tc_make_request(netdev, type, flags, &request);
3550 tcmsg->tcm_handle = tc_make_handle(0xffff, 0);
3551 tcmsg->tcm_parent = TC_H_INGRESS;
3552 nl_msg_put_string(&request, TCA_KIND, "ingress");
3553 nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0);
3555 error = tc_transact(&request, NULL);
3557 /* If we're deleting the qdisc, don't worry about some of the
3558 * error conditions. */
3559 if (!add && (error == ENOENT || error == EINVAL)) {
3568 /* Adds a policer to 'netdev' with a rate of 'kbits_rate' and a burst size
3571 * This function is equivalent to running:
3572 * /sbin/tc filter add dev <devname> parent ffff: protocol all prio 49
3573 * basic police rate <kbits_rate>kbit burst <kbits_burst>k
3576 * The configuration and stats may be seen with the following command:
3577 * /sbin/tc -s filter show <devname> eth0 parent ffff:
3579 * Returns 0 if successful, otherwise a positive errno value.
3582 tc_add_policer(struct netdev *netdev, int kbits_rate, int kbits_burst)
3584 struct tc_police tc_police;
3585 struct ofpbuf request;
3586 struct tcmsg *tcmsg;
3587 size_t basic_offset;
3588 size_t police_offset;
3592 memset(&tc_police, 0, sizeof tc_police);
3593 tc_police.action = TC_POLICE_SHOT;
3594 tc_police.mtu = mtu;
3595 tc_fill_rate(&tc_police.rate, kbits_rate/8 * 1000, mtu);
3596 tc_police.burst = tc_bytes_to_ticks(tc_police.rate.rate,
3597 kbits_burst * 1024);
3599 tcmsg = tc_make_request(netdev, RTM_NEWTFILTER,
3600 NLM_F_EXCL | NLM_F_CREATE, &request);
3604 tcmsg->tcm_parent = tc_make_handle(0xffff, 0);
3605 tcmsg->tcm_info = tc_make_handle(49,
3606 (OVS_FORCE uint16_t) htons(ETH_P_ALL));
3608 nl_msg_put_string(&request, TCA_KIND, "basic");
3609 basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3610 police_offset = nl_msg_start_nested(&request, TCA_BASIC_POLICE);
3611 nl_msg_put_unspec(&request, TCA_POLICE_TBF, &tc_police, sizeof tc_police);
3612 tc_put_rtab(&request, TCA_POLICE_RATE, &tc_police.rate);
3613 nl_msg_end_nested(&request, police_offset);
3614 nl_msg_end_nested(&request, basic_offset);
3616 error = tc_transact(&request, NULL);
3627 /* The values in psched are not individually very meaningful, but they are
3628 * important. The tables below show some values seen in the wild.
3632 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
3633 * (Before that, there are hints that it was 1000000000.)
3635 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
3639 * -----------------------------------
3640 * [1] 000c8000 000f4240 000f4240 00000064
3641 * [2] 000003e8 00000400 000f4240 3b9aca00
3642 * [3] 000003e8 00000400 000f4240 3b9aca00
3643 * [4] 000003e8 00000400 000f4240 00000064
3644 * [5] 000003e8 00000040 000f4240 3b9aca00
3645 * [6] 000003e8 00000040 000f4240 000000f9
3647 * a b c d ticks_per_s buffer_hz
3648 * ------- --------- ---------- ------------- ----------- -------------
3649 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
3650 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3651 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3652 * [4] 1,000 1,024 1,000,000 100 976,562 100
3653 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
3654 * [6] 1,000 64 1,000,000 249 15,625,000 249
3656 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
3657 * [2] 2.6.26-1-686-bigmem from Debian lenny
3658 * [3] 2.6.26-2-sparc64 from Debian lenny
3659 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
3660 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
3661 * [6] 2.6.34 from kernel.org on KVM
3663 static const char fn[] = "/proc/net/psched";
3664 unsigned int a, b, c, d;
3670 stream = fopen(fn, "r");
3672 VLOG_WARN("%s: open failed: %s", fn, strerror(errno));
3676 if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
3677 VLOG_WARN("%s: read failed", fn);
3681 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
3685 VLOG_WARN("%s: invalid scheduler parameters", fn);
3689 ticks_per_s = (double) a * c / b;
3693 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
3696 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);
3699 /* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
3700 * rate of 'rate' bytes per second. */
3702 tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
3707 return (rate * ticks) / ticks_per_s;
3710 /* Returns the number of ticks that it would take to transmit 'size' bytes at a
3711 * rate of 'rate' bytes per second. */
3713 tc_bytes_to_ticks(unsigned int rate, unsigned int size)
3718 return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
3721 /* Returns the number of bytes that need to be reserved for qdisc buffering at
3722 * a transmission rate of 'rate' bytes per second. */
3724 tc_buffer_per_jiffy(unsigned int rate)
3729 return rate / buffer_hz;
3732 /* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
3733 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
3734 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
3735 * stores NULL into it if it is absent.
3737 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
3740 * Returns 0 if successful, otherwise a positive errno value. */
3742 tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
3743 struct nlattr **options)
3745 static const struct nl_policy tca_policy[] = {
3746 [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
3747 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
3749 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3751 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3752 tca_policy, ta, ARRAY_SIZE(ta))) {
3753 VLOG_WARN_RL(&rl, "failed to parse qdisc message");
3758 *kind = nl_attr_get_string(ta[TCA_KIND]);
3762 *options = ta[TCA_OPTIONS];
/* Given Netlink 'msg' that describes a class, extracts the class's tc handle
 * (its tcm_handle, which combines the major and minor numbers) into
 * '*handlep', its TCA_OPTIONS attribute into '*options', and its queue
 * statistics into '*stats'.  Any of the output arguments may be null.
 *
 * Returns 0 if successful, otherwise a positive errno value. */
3784 tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
3785 struct nlattr **options, struct netdev_queue_stats *stats)
3787 static const struct nl_policy tca_policy[] = {
3788 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
3789 [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
3791 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3793 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3794 tca_policy, ta, ARRAY_SIZE(ta))) {
3795 VLOG_WARN_RL(&rl, "failed to parse class message");
3800 struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
3801 *handlep = tc->tcm_handle;
3805 *options = ta[TCA_OPTIONS];
3809 const struct gnet_stats_queue *gsq;
3810 struct gnet_stats_basic gsb;
3812 static const struct nl_policy stats_policy[] = {
3813 [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
3814 .min_len = sizeof gsb },
3815 [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
3816 .min_len = sizeof *gsq },
3818 struct nlattr *sa[ARRAY_SIZE(stats_policy)];
3820 if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
3821 sa, ARRAY_SIZE(sa))) {
3822 VLOG_WARN_RL(&rl, "failed to parse class stats");
3826 /* Alignment issues screw up the length of struct gnet_stats_basic on
3827 * some arch/bitsize combinations. Newer versions of Linux have a
3828 * struct gnet_stats_basic_packed, but we can't depend on that. The
3829 * easiest thing to do is just to make a copy. */
3830 memset(&gsb, 0, sizeof gsb);
3831 memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
3832 MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
3833 stats->tx_bytes = gsb.bytes;
3834 stats->tx_packets = gsb.packets;
3836 gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
3837 stats->tx_errors = gsq->drops;
3847 memset(stats, 0, sizeof *stats);
3852 /* Queries the kernel for class with identifier 'handle' and parent 'parent'
3855 tc_query_class(const struct netdev *netdev,
3856 unsigned int handle, unsigned int parent,
3857 struct ofpbuf **replyp)
3859 struct ofpbuf request;
3860 struct tcmsg *tcmsg;
3863 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO, &request);
3867 tcmsg->tcm_handle = handle;
3868 tcmsg->tcm_parent = parent;
3870 error = tc_transact(&request, replyp);
3872 VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
3873 netdev_get_name(netdev),
3874 tc_get_major(handle), tc_get_minor(handle),
3875 tc_get_major(parent), tc_get_minor(parent),
3881 /* Equivalent to "tc class del dev <name> handle <handle>". */
3883 tc_delete_class(const struct netdev *netdev, unsigned int handle)
3885 struct ofpbuf request;
3886 struct tcmsg *tcmsg;
3889 tcmsg = tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
3893 tcmsg->tcm_handle = handle;
3894 tcmsg->tcm_parent = 0;
3896 error = tc_transact(&request, NULL);
3898 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
3899 netdev_get_name(netdev),
3900 tc_get_major(handle), tc_get_minor(handle),
3906 /* Equivalent to "tc qdisc del dev <name> root". */
/* Besides issuing the RTM_DELQDISC request, the visible code also drops the
 * traffic-control state cached in the device's 'tc' member once the deletion
 * has succeeded.
 *
 * NOTE(review): the return-type line, the 'error' declaration, the check on
 * the tc_make_request() result, and the final return are not visible in this
 * excerpt. */
3908 tc_del_qdisc(struct netdev *netdev)
3910 struct netdev_dev_linux *netdev_dev =
3911 netdev_dev_linux_cast(netdev_get_dev(netdev));
3912 struct ofpbuf request;
3913 struct tcmsg *tcmsg;
3916 tcmsg = tc_make_request(netdev, RTM_DELQDISC, 0, &request);
/* Address the qdisc with handle 1:0 attached at the root. */
3920 tcmsg->tcm_handle = tc_make_handle(1, 0);
3921 tcmsg->tcm_parent = TC_H_ROOT;
3923 error = tc_transact(&request, NULL);
/* NOTE(review): the statement inside this branch is not visible; judging by
 * the comment it presumably clears 'error' -- confirm. */
3924 if (error == EINVAL) {
3925 /* EINVAL probably means that the default qdisc was in use, in which
3926 * case we've accomplished our purpose. */
/* On success, give the cached tc implementation a chance to release its state
 * through its optional tc_destroy hook, then forget the pointer. */
3929 if (!error && netdev_dev->tc) {
3930 if (netdev_dev->tc->ops->tc_destroy) {
3931 netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
3933 netdev_dev->tc = NULL;
3938 /* If 'netdev''s qdisc type and parameters are not yet known, queries the
3939 * kernel to determine what they are. Returns 0 if successful, otherwise a
3940 * positive errno value. */
/* Outline of the visible logic: probe the kernel with RTM_GETQDISC for handle
 * 1:0, choose a 'struct tc_ops' implementation from the outcome, then call
 * that implementation's tc_load hook so that the device's 'tc' member gets
 * populated.
 *
 * NOTE(review): several lines of this function (return type, some local
 * declarations, a number of 'if'/'else' lines and closing braces) are not
 * visible in this excerpt; the comments below describe only what is shown. */
3942 tc_query_qdisc(const struct netdev *netdev)
3944 struct netdev_dev_linux *netdev_dev =
3945 netdev_dev_linux_cast(netdev_get_dev(netdev));
3946 struct ofpbuf request, *qdisc;
3947 const struct tc_ops *ops;
3948 struct tcmsg *tcmsg;
/* A non-NULL 'tc' means the qdisc state is already cached.  NOTE(review): the
 * body of this branch is not visible; presumably it returns early. */
3952 if (netdev_dev->tc) {
3956 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
3957 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
3958 * 2.6.35 without that fix backported to it.
3960 * To avoid the OOPS, we must not make a request that would attempt to dump
3961 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
3962 * few others. There are a few ways that I can see to do this, but most of
3963 * them seem to be racy (and if you lose the race the kernel OOPSes). The
3964 * technique chosen here is to assume that any non-default qdisc that we
3965 * create will have a class with handle 1:0. The built-in qdiscs only have
3966 * a class with handle 0:0.
3968 * We could check for Linux 2.6.35+ and use a more straightforward method
3970 tcmsg = tc_make_request(netdev, RTM_GETQDISC, NLM_F_ECHO, &request);
3974 tcmsg->tcm_handle = tc_make_handle(1, 0);
3975 tcmsg->tcm_parent = 0;
3977 /* Figure out what tc class to instantiate. */
3978 error = tc_transact(&request, &qdisc);
/* The reply is parsed for the qdisc's kind name only; NULL is passed for the
 * remaining output argument of tc_parse_qdisc(). */
3982 error = tc_parse_qdisc(qdisc, &kind, NULL);
/* tc_ops_other is the fallback chosen on each of the paths that assign it
 * below.  NOTE(review): the conditions selecting between these assignments
 * are only partly visible. */
3984 ops = &tc_ops_other;
3986 ops = tc_lookup_linux_name(kind);
/* This message uses its own rate limiter 'rl2' rather than the file's 'rl'. */
3988 static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
3989 VLOG_INFO_RL(&rl2, "unknown qdisc \"%s\"", kind);
3991 ops = &tc_ops_other;
3994 } else if (error == ENOENT) {
3995 /* Either it's a built-in qdisc, or it's a qdisc set up by some
3996 * other entity that doesn't have a handle 1:0. We will assume
3997 * that it's the system default qdisc. */
3998 ops = &tc_ops_default;
4001 /* Who knows? Maybe the device got deleted. */
4002 VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
4003 netdev_get_name(netdev), strerror(error));
4004 ops = &tc_ops_other;
4007 /* Instantiate it. */
/* The cast drops 'const' from 'netdev' because tc_load is called with a
 * non-const netdev pointer. */
4008 load_error = ops->tc_load((struct netdev *) netdev, qdisc);
/* Invariant: tc_load succeeds exactly when it leaves 'tc' non-NULL. */
4009 assert((load_error == 0) == (netdev_dev->tc != NULL));
4010 ofpbuf_delete(qdisc);
/* An error from the query takes precedence over one from tc_load. */
4012 return error ? error : load_error;
4015 /* Linux traffic control uses tables with 256 entries ("rtab" tables) to
4016 approximate the time to transmit packets of various lengths. For an MTU of
4017 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
4018 represents two possible packet lengths; for a MTU of 513 through 1024, four
4019 possible lengths; and so on.
4021 Returns, for the specified 'mtu', the number of bits that packet lengths
4022 need to be shifted right to fit within such a 256-entry table. */
/* NOTE(review): the return-type line, the 'cell_log' declaration, the
 * condition guarding the ETH_PAYLOAD_MAX fallback, the loop body, and the
 * return statement are not visible in this excerpt. */
4024 tc_calc_cell_log(unsigned int mtu)
/* Fallback MTU; presumably applied when the caller supplied no usable value
 * -- confirm against the missing condition. */
4029 mtu = ETH_PAYLOAD_MAX;
/* Account for the Ethernet and VLAN headers on top of the payload size. */
4031 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
/* Count iterations while the size is still at least 256; the loop body is not
 * visible but presumably shrinks 'mtu' each time (e.g. by shifting right). */
4033 for (cell_log = 0; mtu >= 256; cell_log++) {
/* tc_fill_rate(): the visible lines zero '*rate', set its cell_log from
 * tc_calc_cell_log(mtu), and set its minimum packet unit ('mpu') to
 * ETH_TOTAL_MIN.
 *
 * NOTE(review): the return-type line, the rest of the header comment, and any
 * statement that stores 'Bps' into the ratespec are not visible in this
 * excerpt -- confirm against the complete file. */
4040 /* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
4043 tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
4045 memset(rate, 0, sizeof *rate);
4046 rate->cell_log = tc_calc_cell_log(mtu);
4047 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
4048 /* rate->cell_align = 0; */ /* distro headers. */
/* The two fields above stay commented out; the memset() has already zeroed
 * the whole structure. */
4049 rate->mpu = ETH_TOTAL_MIN;
4053 /* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
4054 * attribute of the specified "type".
4056 * See tc_calc_cell_log() above for a description of "rtab"s. */
/* NOTE(review): the return-type line and the declarations of 'rtab' and 'i'
 * are not visible in this excerpt. */
4058 tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
/* Reserve TC_RTAB_SIZE bytes of attribute payload; every entry is written by
 * the loop below. */
4063 rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
4064 for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
/* Entry i is computed for a packet of (i + 1) << cell_log bytes, raised to
 * the ratespec's minimum packet unit when smaller. */
4065 unsigned packet_size = (i + 1) << rate->cell_log;
4066 if (packet_size < rate->mpu) {
4067 packet_size = rate->mpu;
/* Store the tick count tc_bytes_to_ticks() gives for that size at this rate. */
4069 rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
/* tc_calc_buffer(): the visible code computes a minimum burst of
 * tc_buffer_per_jiffy(Bps) + mtu bytes and returns tc_bytes_to_ticks() for
 * the larger of that minimum and the caller's 'burst_bytes'.
 *
 * NOTE(review): the return-type line and the end of the header comment are
 * not visible in this excerpt.  'Bps' is declared 'unsigned int' here -- check
 * callers that hold a wider rate value. */
4073 /* Calculates the proper value of 'buffer' or 'cbuffer' in HTB options given a
4074 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
4075 * burst size of 'burst_bytes'. (If no value was requested, a 'burst_bytes' of
4078 tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
4080 unsigned int min_burst = tc_buffer_per_jiffy(Bps) + mtu;
4081 return tc_bytes_to_ticks(Bps, MAX(burst_bytes, min_burst));
4084 /* Linux-only functions declared in netdev-linux.h */
4086 /* Returns a fd for an AF_INET socket or a negative errno value. */
4088 netdev_linux_get_af_inet_sock(void)
/* netdev_linux_init() is called first; a nonzero result is returned negated,
 * otherwise the file-level 'af_inet_sock' descriptor is returned. */
4090 int error = netdev_linux_init();
4091 return error ? -error : af_inet_sock;
4094 /* Modifies the 'flag' bit in ethtool's flags field for 'netdev'. If
4095 * 'enable' is true, the bit is set. Otherwise, it is cleared. */
/* The visible sequence is read (ETHTOOL_GFLAGS), modify, write
 * (ETHTOOL_SFLAGS), then read back to check that the change took effect.
 * 'flag_name' is used only in the warning text.
 *
 * NOTE(review): the return-type line, the declarations of 'error' and
 * 'new_flags', the error checks after each ethtool call, and the return
 * statements are not visible in this excerpt. */
4097 netdev_linux_ethtool_set_flag(struct netdev *netdev, uint32_t flag,
4098 const char *flag_name, bool enable)
4100 const char *netdev_name = netdev_get_name(netdev);
4101 struct ethtool_value evalue;
/* Step 1: fetch the current flags.  The ethtool_value is passed through a
 * cast because netdev_linux_do_ethtool() takes a 'struct ethtool_cmd *'. */
4105 memset(&evalue, 0, sizeof evalue);
4106 error = netdev_linux_do_ethtool(netdev_name,
4107 (struct ethtool_cmd *)&evalue,
4108 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
/* Step 2: clear the bit, set it again if 'enable', and write the result back;
 * the intended value is kept in 'new_flags' for the later comparison. */
4113 evalue.data = new_flags = (evalue.data & ~flag) | (enable ? flag : 0);
4114 error = netdev_linux_do_ethtool(netdev_name,
4115 (struct ethtool_cmd *)&evalue,
4116 ETHTOOL_SFLAGS, "ETHTOOL_SFLAGS");
/* Step 3: re-read the flags into a freshly zeroed structure. */
4121 memset(&evalue, 0, sizeof evalue);
4122 error = netdev_linux_do_ethtool(netdev_name,
4123 (struct ethtool_cmd *)&evalue,
4124 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
/* A mismatch between what was written and what is read back is reported as a
 * failed attempt. */
4129 if (new_flags != evalue.data) {
4130 VLOG_WARN_RL(&rl, "attempt to %s ethtool %s flag on network "
4131 "device %s failed", enable ? "enable" : "disable",
4132 flag_name, netdev_name);
4139 /* Utility functions. */
4141 /* Copies 'src' into 'dst', performing format conversion in the process. */
/* Each visible statement assigns one 'src' counter to the 'dst' member of the
 * same name; no other 'dst' members are touched by the lines shown.
 *
 * NOTE(review): the return-type line is not visible in this excerpt. */
4143 netdev_stats_from_rtnl_link_stats(struct netdev_stats *dst,
4144 const struct rtnl_link_stats *src)
/* General packet, byte, error and drop counters. */
4146 dst->rx_packets = src->rx_packets;
4147 dst->tx_packets = src->tx_packets;
4148 dst->rx_bytes = src->rx_bytes;
4149 dst->tx_bytes = src->tx_bytes;
4150 dst->rx_errors = src->rx_errors;
4151 dst->tx_errors = src->tx_errors;
4152 dst->rx_dropped = src->rx_dropped;
4153 dst->tx_dropped = src->tx_dropped;
4154 dst->multicast = src->multicast;
4155 dst->collisions = src->collisions;
/* Detailed receive-error counters. */
4156 dst->rx_length_errors = src->rx_length_errors;
4157 dst->rx_over_errors = src->rx_over_errors;
4158 dst->rx_crc_errors = src->rx_crc_errors;
4159 dst->rx_frame_errors = src->rx_frame_errors;
4160 dst->rx_fifo_errors = src->rx_fifo_errors;
4161 dst->rx_missed_errors = src->rx_missed_errors;
/* Detailed transmit-error counters. */
4162 dst->tx_aborted_errors = src->tx_aborted_errors;
4163 dst->tx_carrier_errors = src->tx_carrier_errors;
4164 dst->tx_fifo_errors = src->tx_fifo_errors;
4165 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
4166 dst->tx_window_errors = src->tx_window_errors;
/* Fetches statistics for the interface with index 'ifindex' by sending an
 * RTM_GETLINK request on 'rtnl_sock' and converting the IFLA_STATS attribute
 * of the reply into '*stats'.
 *
 * NOTE(review): the return-type line, the 'error' declaration, the check on
 * the nl_sock_transact() result, and all return statements are not visible in
 * this excerpt. */
4170 get_stats_via_netlink(int ifindex, struct netdev_stats *stats)
4172 /* Policy for RTNLGRP_LINK messages.
4174 * There are *many* more fields in these messages, but currently we only
4175 * care about these fields. */
4176 static const struct nl_policy rtnlgrp_link_policy[] = {
4177 [IFLA_IFNAME] = { .type = NL_A_STRING, .optional = false },
/* IFLA_STATS is optional for parsing purposes, but when present it must be at
 * least as large as 'struct rtnl_link_stats'. */
4178 [IFLA_STATS] = { .type = NL_A_UNSPEC, .optional = true,
4179 .min_len = sizeof(struct rtnl_link_stats) },
4182 struct ofpbuf request;
4183 struct ofpbuf *reply;
4184 struct ifinfomsg *ifi;
4185 struct nlattr *attrs[ARRAY_SIZE(rtnlgrp_link_policy)];
/* Build the request: netlink header followed by a zeroed ifinfomsg that names
 * the interface by index. */
4188 ofpbuf_init(&request, 0);
4189 nl_msg_put_nlmsghdr(&request, sizeof *ifi, RTM_GETLINK, NLM_F_REQUEST);
4190 ifi = ofpbuf_put_zeros(&request, sizeof *ifi);
4191 ifi->ifi_family = PF_UNSPEC;
4192 ifi->ifi_index = ifindex;
/* The request buffer is released right after the transaction, whatever the
 * outcome. */
4193 error = nl_sock_transact(rtnl_sock, &request, &reply);
4194 ofpbuf_uninit(&request);
/* Attributes are parsed starting after the netlink header and the ifinfomsg;
 * the reply is freed on each failure path shown below. */
4199 if (!nl_policy_parse(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg),
4200 rtnlgrp_link_policy,
4201 attrs, ARRAY_SIZE(rtnlgrp_link_policy))) {
4202 ofpbuf_delete(reply);
4206 if (!attrs[IFLA_STATS]) {
4207 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
4208 ofpbuf_delete(reply);
4212 netdev_stats_from_rtnl_link_stats(stats, nl_attr_get(attrs[IFLA_STATS]));
4214 ofpbuf_delete(reply);
/* Fills '*stats' for the device named 'netdev_name' by scanning the lines of
 * /proc/net/dev.
 *
 * NOTE(review): much of this function is not visible in this excerpt: the
 * return-type line, the local declarations ('stream', 'line', 'devname',
 * 'ln'), the sscanf() call line with the start of its format, most of its
 * output arguments, the fclose()/return handling, and the closing braces. */
4220 get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats)
4222 static const char fn[] = "/proc/net/dev";
4227 stream = fopen(fn, "r");
4229 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
4234 while (fgets(line, sizeof line, stream)) {
/* X64 is the scan conversion for one uint64_t field. */
4237 #define X64 "%"SCNu64
/* Each visible format line scans seven 64-bit counters and skips one more
 * field with "%*u". */
4240 X64 X64 X64 X64 X64 X64 X64 "%*u"
4241 X64 X64 X64 X64 X64 X64 X64 "%*u",
4247 &stats->rx_fifo_errors,
4248 &stats->rx_frame_errors,
4254 &stats->tx_fifo_errors,
/* A line is accepted only when exactly 15 conversions succeed. */
4256 &stats->tx_carrier_errors) != 15) {
4257 VLOG_WARN_RL(&rl, "%s:%d: parse error", fn, ln);
4258 } else if (!strcmp(devname, netdev_name)) {
/* For the matching device, these counters are set to UINT64_MAX instead of
 * being scanned from the line. */
4259 stats->rx_length_errors = UINT64_MAX;
4260 stats->rx_over_errors = UINT64_MAX;
4261 stats->rx_crc_errors = UINT64_MAX;
4262 stats->rx_missed_errors = UINT64_MAX;
4263 stats->tx_aborted_errors = UINT64_MAX;
4264 stats->tx_heartbeat_errors = UINT64_MAX;
4265 stats->tx_window_errors = UINT64_MAX;
/* Logged when no line for 'netdev_name' was found (the surrounding control
 * flow is not visible). */
4271 VLOG_WARN_RL(&rl, "%s: no stats for %s", fn, netdev_name);
/* Reads the interface flags of 'dev' with a SIOCGIFFLAGS ioctl, issued through
 * netdev_linux_do_ioctl() under dev->name, and stores ifr.ifr_flags in
 * '*flags'.
 *
 * NOTE(review): the return-type line, the 'ifr' and 'error' declarations, the
 * rest of the ioctl call, any guard around the store, and the return
 * statement are not visible in this excerpt. */
4277 get_flags(const struct netdev_dev *dev, unsigned int *flags)
4283 error = netdev_linux_do_ioctl(dev->name, &ifr, SIOCGIFFLAGS,
4286 *flags = ifr.ifr_flags;
/* Writes 'flags' as the interface flags of 'netdev' with a SIOCSIFFLAGS ioctl
 * issued through netdev_linux_do_ioctl(), and returns that call's result.
 *
 * NOTE(review): the return-type line, the 'ifr' declaration, and the last
 * argument of the ioctl call are not visible in this excerpt. */
4292 set_flags(struct netdev *netdev, unsigned int flags)
4296 ifr.ifr_flags = flags;
4297 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, SIOCSIFFLAGS,
/* Looks up the interface index of the device named 'netdev_name' with a
 * SIOCGIFINDEX ioctl on 'af_inet_sock' and returns ifr.ifr_ifindex.
 *
 * NOTE(review): the return-type line, the 'ifr' declaration, and the value
 * returned on the failure path are not visible in this excerpt. */
4302 do_get_ifindex(const char *netdev_name)
4306 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
/* Each lookup is counted in the netdev_get_ifindex coverage counter. */
4307 COVERAGE_INC(netdev_get_ifindex);
4308 if (ioctl(af_inet_sock, SIOCGIFINDEX, &ifr) < 0) {
4309 VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
4310 netdev_name, strerror(errno));
4313 return ifr.ifr_ifindex;
/* Stores the interface index of 'netdev_' in '*ifindexp', using the value
 * cached in the device structure when the VALID_IFINDEX bit of 'cache_valid'
 * is set and otherwise querying it with do_get_ifindex() and caching it.
 *
 * NOTE(review): the return-type line, the handling of a failed
 * do_get_ifindex() call, and the return statements are not visible in this
 * excerpt. */
4317 get_ifindex(const struct netdev *netdev_, int *ifindexp)
4319 struct netdev_dev_linux *netdev_dev =
4320 netdev_dev_linux_cast(netdev_get_dev(netdev_));
4322 if (!(netdev_dev->cache_valid & VALID_IFINDEX)) {
4323 int ifindex = do_get_ifindex(netdev_get_name(netdev_));
/* Remember the freshly queried index and mark the cache entry valid. */
4327 netdev_dev->cache_valid |= VALID_IFINDEX;
4328 netdev_dev->ifindex = ifindex;
4330 *ifindexp = netdev_dev->ifindex;
/* Reads the hardware address of the device named 'netdev_name' with a
 * SIOCGIFHWADDR ioctl on 'af_inet_sock' and copies its first ETH_ADDR_LEN
 * bytes into 'ea'.
 *
 * NOTE(review): the return-type line, the 'ifr' and 'hwaddr_family'
 * declarations, and the return statements are not visible in this excerpt. */
4335 get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN])
4340 memset(&ifr, 0, sizeof ifr);
4341 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4342 COVERAGE_INC(netdev_get_hwaddr);
4343 if (ioctl(af_inet_sock, SIOCGIFHWADDR, &ifr) < 0) {
4344 /* ENODEV probably means that a vif disappeared asynchronously and
4345 * hasn't been removed from the database yet, so reduce the log level
4346 * to INFO for that case. */
4347 VLOG(errno == ENODEV ? VLL_INFO : VLL_ERR,
4348 "ioctl(SIOCGIFHWADDR) on %s device failed: %s",
4349 netdev_name, strerror(errno));
/* An address family other than AF_UNSPEC or ARPHRD_ETHER produces a warning;
 * the copy on the last visible line follows it in the lines shown. */
4352 hwaddr_family = ifr.ifr_hwaddr.sa_family;
4353 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER) {
4354 VLOG_WARN("%s device has unknown hardware address family %d",
4355 netdev_name, hwaddr_family);
4357 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
/* Sets the hardware address of the device named 'netdev_name' to 'mac', with
 * address family 'hwaddr_family', using a SIOCSIFHWADDR ioctl on
 * 'af_inet_sock'.  A failed ioctl is logged at error level.
 *
 * NOTE(review): the return-type line, the 'ifr' declaration, and the return
 * statements are not visible in this excerpt. */
4362 set_etheraddr(const char *netdev_name, int hwaddr_family,
4363 const uint8_t mac[ETH_ADDR_LEN])
/* Start from a zeroed ifreq, then fill in name, family and address bytes. */
4367 memset(&ifr, 0, sizeof ifr);
4368 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4369 ifr.ifr_hwaddr.sa_family = hwaddr_family;
4370 memcpy(ifr.ifr_hwaddr.sa_data, mac, ETH_ADDR_LEN);
4371 COVERAGE_INC(netdev_set_hwaddr);
4372 if (ioctl(af_inet_sock, SIOCSIFHWADDR, &ifr) < 0) {
4373 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
4374 netdev_name, strerror(errno));
/* Issues a SIOCETHTOOL ioctl on 'af_inet_sock' for the device named 'name',
 * with 'ecmd' attached as the request's data pointer.  'cmd_name' is used
 * only in the warning text.
 *
 * NOTE(review): the return-type line, the 'ifr' declaration, the statement
 * that stores 'cmd' into '*ecmd', and the return statements are not visible
 * in this excerpt. */
4381 netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
4382 int cmd, const char *cmd_name)
4386 memset(&ifr, 0, sizeof ifr);
4387 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
4388 ifr.ifr_data = (caddr_t) ecmd;
4391 COVERAGE_INC(netdev_ethtool);
4392 if (ioctl(af_inet_sock, SIOCETHTOOL, &ifr) == 0) {
/* Failures other than EOPNOTSUPP are logged (rate-limited). */
4395 if (errno != EOPNOTSUPP) {
4396 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
4397 "failed: %s", cmd_name, name, strerror(errno));
4399 /* The device doesn't support this operation. That's pretty
4400 * common, so there's no point in logging anything. */
/* Copies 'name' into ifr->ifr_name and issues ioctl 'cmd' on 'af_inet_sock'
 * with 'ifr' as its argument.  A failure (ioctl() returning -1) is logged at
 * debug level using 'cmd_name'.
 *
 * NOTE(review): the return-type line, the tail of the log call, and the
 * return statements are not visible in this excerpt. */
4407 netdev_linux_do_ioctl(const char *name, struct ifreq *ifr, int cmd,
4408 const char *cmd_name)
4410 ovs_strzcpy(ifr->ifr_name, name, sizeof ifr->ifr_name);
4411 if (ioctl(af_inet_sock, cmd, ifr) == -1) {
4412 VLOG_DBG_RL(&rl, "%s: ioctl(%s) failed: %s", name, cmd_name,
/* Runs the address-returning ioctl 'cmd' for 'netdev' through
 * netdev_linux_do_ioctl() and copies the IPv4 address found in ifr.ifr_addr
 * into '*ip'.
 *
 * NOTE(review): the return-type line, the 'ifr' and 'error' declarations, the
 * condition guarding the copy, and the return statement are not visible in
 * this excerpt. */
4420 netdev_linux_get_ipv4(const struct netdev *netdev, struct in_addr *ip,
4421 int cmd, const char *cmd_name)
4426 ifr.ifr_addr.sa_family = AF_INET;
4427 error = netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, cmd, cmd_name);
/* The generic sockaddr in the ifreq is read as a sockaddr_in to reach the
 * address field. */
4429 const struct sockaddr_in *sin = (struct sockaddr_in *) &ifr.ifr_addr;
4430 *ip = sin->sin_addr;
4435 /* Returns an AF_PACKET raw socket or a negative errno value. */
4437 af_packet_sock(void)
4439 static int sock = INT_MIN;
4441 if (sock == INT_MIN) {
4442 sock = socket(AF_PACKET, SOCK_RAW, 0);
4444 set_nonblocking(sock);
4447 VLOG_ERR("failed to create packet socket: %s", strerror(errno));