X-Git-Url: http://git.onelab.eu/?a=blobdiff_plain;f=lib%2Fnetdev-linux.c;h=5c7065fe1ab31309be55567ef9d10c5372b683ec;hb=3e912ffcbb;hp=d146ccfcfe090e8ccf097d9fd7310bb2ae50291d;hpb=38e0065b1f25615bf69cfdc000d17935f99c022b;p=sliver-openvswitch.git diff --git a/lib/netdev-linux.c b/lib/netdev-linux.c index d146ccfcf..5c7065fe1 100644 --- a/lib/netdev-linux.c +++ b/lib/netdev-linux.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2009, 2010, 2011, 2012, 2013 Nicira, Inc. + * Copyright (c) 2009, 2010, 2011, 2012, 2013, 2014 Nicira, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -33,7 +33,6 @@ #include #include #include -#include #include #include #include @@ -50,6 +49,7 @@ #include "coverage.h" #include "dpif-linux.h" +#include "dpif-netdev.h" #include "dynamic-string.h" #include "fatal-signal.h" #include "hash.h" @@ -61,6 +61,7 @@ #include "netlink.h" #include "ofpbuf.h" #include "openflow/openflow.h" +#include "ovs-atomic.h" #include "packets.h" #include "poll-loop.h" #include "rtnetlink-link.h" @@ -106,6 +107,36 @@ COVERAGE_DEFINE(netdev_set_ethtool); #define TC_RTAB_SIZE 1024 #endif +/* Linux 2.6.21 introduced struct tpacket_auxdata. + * Linux 2.6.27 added the tp_vlan_tci member. + * Linux 3.0 defined TP_STATUS_VLAN_VALID. + * Linux 3.13 repurposed a padding member for tp_vlan_tpid and defined + * TP_STATUS_VLAN_TPID_VALID. + * + * With all this churn it's easiest to unconditionally define a replacement + * structure that has everything we want. + */ +#ifndef PACKET_AUXDATA +#define PACKET_AUXDATA 8 +#endif +#ifndef TP_STATUS_VLAN_VALID +#define TP_STATUS_VLAN_VALID (1 << 4) +#endif +#ifndef TP_STATUS_VLAN_TPID_VALID +#define TP_STATUS_VLAN_TPID_VALID (1 << 6) +#endif +#undef tpacket_auxdata +#define tpacket_auxdata rpl_tpacket_auxdata +struct tpacket_auxdata { + uint32_t tp_status; + uint32_t tp_len; + uint32_t tp_snaplen; + uint16_t tp_mac; + uint16_t tp_net; + uint16_t tp_vlan_tci; + uint16_t tp_vlan_tpid; +}; + enum { VALID_IFINDEX = 1 << 0, VALID_ETHERADDR = 1 << 1, @@ -356,7 +387,6 @@ struct netdev_linux { struct ovs_mutex mutex; unsigned int cache_valid; - unsigned int change_seq; bool miimon; /* Link status of last poll. */ long long int miimon_interval; /* Miimon Poll rate. Disabled if <= 0. */ @@ -392,8 +422,8 @@ struct netdev_linux { int tap_fd; }; -struct netdev_rx_linux { - struct netdev_rx up; +struct netdev_rxq_linux { + struct netdev_rxq up; bool is_tap; int fd; }; @@ -402,6 +432,11 @@ struct netdev_rx_linux { * additional log messages. */ static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20); +/* Polling miimon status for all ports causes performance degradation when + * handling a large number of ports. If there are no devices using miimon, then + * we skip netdev_linux_miimon_run() and netdev_linux_miimon_wait(). */ +static atomic_int miimon_cnt = ATOMIC_VAR_INIT(0); + static void netdev_linux_run(void); static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *, @@ -410,6 +445,9 @@ static int netdev_linux_get_ipv4(const struct netdev *, struct in_addr *, int cmd, const char *cmd_name); static int get_flags(const struct netdev *, unsigned int *flags); static int set_flags(const char *, unsigned int flags); +static int update_flags(struct netdev_linux *netdev, enum netdev_flags off, + enum netdev_flags on, enum netdev_flags *old_flagsp) + OVS_REQUIRES(netdev->mutex); static int do_get_ifindex(const char *netdev_name); static int get_ifindex(const struct netdev *, int *ifindexp); static int do_set_addr(struct netdev *netdev, @@ -417,11 +455,12 @@ static int do_set_addr(struct netdev *netdev, struct in_addr addr); static int get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN]); static int set_etheraddr(const char *netdev_name, const uint8_t[ETH_ADDR_LEN]); -static int get_stats_via_netlink(int ifindex, struct netdev_stats *stats); -static int get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats); +static int get_stats_via_netlink(const struct netdev *, struct netdev_stats *); static int af_packet_sock(void); +static bool netdev_linux_miimon_enabled(void); static void netdev_linux_miimon_run(void); static void netdev_linux_miimon_wait(void); +static int netdev_linux_get_mtu__(struct netdev_linux *netdev, int *mtup); static bool is_netdev_linux_class(const struct netdev_class *netdev_class) @@ -443,11 +482,11 @@ netdev_linux_cast(const struct netdev *netdev) return CONTAINER_OF(netdev, struct netdev_linux, up); } -static struct netdev_rx_linux * -netdev_rx_linux_cast(const struct netdev_rx *rx) +static struct netdev_rxq_linux * +netdev_rxq_linux_cast(const struct netdev_rxq *rx) { ovs_assert(is_netdev_linux_class(netdev_get_class(rx->netdev))); - return CONTAINER_OF(rx, struct netdev_rx_linux, up); + return CONTAINER_OF(rx, struct netdev_rxq_linux, up); } static void netdev_linux_update(struct netdev_linux *netdev, @@ -482,13 +521,24 @@ netdev_linux_notify_sock(void) return sock; } +static bool +netdev_linux_miimon_enabled(void) +{ + int miimon; + + atomic_read(&miimon_cnt, &miimon); + return miimon > 0; +} + static void netdev_linux_run(void) { struct nl_sock *sock; int error; - netdev_linux_miimon_run(); + if (netdev_linux_miimon_enabled()) { + netdev_linux_miimon_run(); + } sock = netdev_linux_notify_sock(); if (!sock) { @@ -550,7 +600,9 @@ netdev_linux_wait(void) { struct nl_sock *sock; - netdev_linux_miimon_wait(); + if (netdev_linux_miimon_enabled()) { + netdev_linux_miimon_wait(); + } sock = netdev_linux_notify_sock(); if (sock) { nl_sock_wait(sock, POLLIN); @@ -562,10 +614,7 @@ netdev_linux_changed(struct netdev_linux *dev, unsigned int ifi_flags, unsigned int mask) OVS_REQUIRES(dev->mutex) { - dev->change_seq++; - if (!dev->change_seq) { - dev->change_seq++; - } + netdev_change_seq_changed(&dev->up); if ((dev->ifi_flags ^ ifi_flags) & IFF_RUNNING) { dev->carrier_resets++; @@ -616,8 +665,7 @@ netdev_linux_alloc(void) static void netdev_linux_common_construct(struct netdev_linux *netdev) { - ovs_mutex_init(&netdev->mutex, PTHREAD_MUTEX_NORMAL); - netdev->change_seq = 1; + ovs_mutex_init(&netdev->mutex); } /* Creates system and internal devices. */ @@ -708,6 +756,11 @@ netdev_linux_destruct(struct netdev *netdev_) close(netdev->tap_fd); } + if (netdev->miimon_interval > 0) { + int junk; + atomic_sub(&miimon_cnt, 1, &junk); + } + ovs_mutex_destroy(&netdev->mutex); } @@ -718,17 +771,17 @@ netdev_linux_dealloc(struct netdev *netdev_) free(netdev); } -static struct netdev_rx * -netdev_linux_rx_alloc(void) +static struct netdev_rxq * +netdev_linux_rxq_alloc(void) { - struct netdev_rx_linux *rx = xzalloc(sizeof *rx); + struct netdev_rxq_linux *rx = xzalloc(sizeof *rx); return &rx->up; } static int -netdev_linux_rx_construct(struct netdev_rx *rx_) +netdev_linux_rxq_construct(struct netdev_rxq *rxq_) { - struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_); + struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_); struct netdev *netdev_ = rx->up.netdev; struct netdev_linux *netdev = netdev_linux_cast(netdev_); int error; @@ -739,7 +792,7 @@ netdev_linux_rx_construct(struct netdev_rx *rx_) rx->fd = netdev->tap_fd; } else { struct sockaddr_ll sll; - int ifindex; + int ifindex, val; /* Result of tcpdump -dd inbound */ static const struct sock_filter filt[] = { { 0x28, 0, 0, 0xfffff004 }, /* ldh [0] */ @@ -759,6 +812,14 @@ netdev_linux_rx_construct(struct netdev_rx *rx_) goto error; } + val = 1; + if (setsockopt(rx->fd, SOL_PACKET, PACKET_AUXDATA, &val, sizeof val)) { + error = errno; + VLOG_ERR("%s: failed to mark socket for auxdata (%s)", + netdev_get_name(netdev_), ovs_strerror(error)); + goto error; + } + /* Set non-blocking mode. */ error = set_nonblocking(rx->fd); if (error) { @@ -775,7 +836,7 @@ netdev_linux_rx_construct(struct netdev_rx *rx_) memset(&sll, 0, sizeof sll); sll.sll_family = AF_PACKET; sll.sll_ifindex = ifindex; - sll.sll_protocol = (OVS_FORCE unsigned short int) htons(ETH_P_ALL); + sll.sll_protocol = htons(ETH_P_ALL); if (bind(rx->fd, (struct sockaddr *) &sll, sizeof sll) < 0) { error = errno; VLOG_ERR("%s: failed to bind raw socket (%s)", @@ -806,9 +867,9 @@ error: } static void -netdev_linux_rx_destruct(struct netdev_rx *rx_) +netdev_linux_rxq_destruct(struct netdev_rxq *rxq_) { - struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_); + struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_); if (!rx->is_tap) { close(rx->fd); @@ -816,50 +877,160 @@ netdev_linux_rx_destruct(struct netdev_rx *rx_) } static void -netdev_linux_rx_dealloc(struct netdev_rx *rx_) +netdev_linux_rxq_dealloc(struct netdev_rxq *rxq_) { - struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_); + struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_); free(rx); } +static ovs_be16 +auxdata_to_vlan_tpid(const struct tpacket_auxdata *aux) +{ + if (aux->tp_status & TP_STATUS_VLAN_TPID_VALID) { + return htons(aux->tp_vlan_tpid); + } else { + return htons(ETH_TYPE_VLAN); + } +} + +static bool +auxdata_has_vlan_tci(const struct tpacket_auxdata *aux) +{ + return aux->tp_vlan_tci || aux->tp_status & TP_STATUS_VLAN_VALID; +} + static int -netdev_linux_rx_recv(struct netdev_rx *rx_, void *data, size_t size) +netdev_linux_rxq_recv_sock(int fd, struct ofpbuf *buffer) { - struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_); + size_t size; ssize_t retval; + struct iovec iov; + struct cmsghdr *cmsg; + union { + struct cmsghdr cmsg; + char buffer[CMSG_SPACE(sizeof(struct tpacket_auxdata))]; + } cmsg_buffer; + struct msghdr msgh; + + /* Reserve headroom for a single VLAN tag */ + ofpbuf_reserve(buffer, VLAN_HEADER_LEN); + size = ofpbuf_tailroom(buffer); + + iov.iov_base = ofpbuf_data(buffer); + iov.iov_len = size; + msgh.msg_name = NULL; + msgh.msg_namelen = 0; + msgh.msg_iov = &iov; + msgh.msg_iovlen = 1; + msgh.msg_control = &cmsg_buffer; + msgh.msg_controllen = sizeof cmsg_buffer; + msgh.msg_flags = 0; do { - retval = (rx->is_tap - ? read(rx->fd, data, size) - : recv(rx->fd, data, size, MSG_TRUNC)); + retval = recvmsg(fd, &msgh, MSG_TRUNC); } while (retval < 0 && errno == EINTR); - if (retval >= 0) { - return retval > size ? -EMSGSIZE : retval; - } else { - if (errno != EAGAIN) { + if (retval < 0) { + return errno; + } else if (retval > size) { + return EMSGSIZE; + } + + ofpbuf_set_size(buffer, ofpbuf_size(buffer) + retval); + + for (cmsg = CMSG_FIRSTHDR(&msgh); cmsg; cmsg = CMSG_NXTHDR(&msgh, cmsg)) { + const struct tpacket_auxdata *aux; + + if (cmsg->cmsg_level != SOL_PACKET + || cmsg->cmsg_type != PACKET_AUXDATA + || cmsg->cmsg_len < CMSG_LEN(sizeof(struct tpacket_auxdata))) { + continue; + } + + aux = ALIGNED_CAST(struct tpacket_auxdata *, CMSG_DATA(cmsg)); + if (auxdata_has_vlan_tci(aux)) { + if (retval < ETH_HEADER_LEN) { + return EINVAL; + } + + eth_push_vlan(buffer, auxdata_to_vlan_tpid(aux), + htons(aux->tp_vlan_tci)); + break; + } + } + + return 0; +} + +static int +netdev_linux_rxq_recv_tap(int fd, struct ofpbuf *buffer) +{ + ssize_t retval; + size_t size = ofpbuf_tailroom(buffer); + + do { + retval = read(fd, ofpbuf_data(buffer), size); + } while (retval < 0 && errno == EINTR); + + if (retval < 0) { + return errno; + } else if (retval > size) { + return EMSGSIZE; + } + + ofpbuf_set_size(buffer, ofpbuf_size(buffer) + retval); + return 0; +} + +static int +netdev_linux_rxq_recv(struct netdev_rxq *rxq_, struct ofpbuf **packet, int *c) +{ + struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_); + struct netdev *netdev = rx->up.netdev; + struct ofpbuf *buffer; + ssize_t retval; + int mtu; + + if (netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu)) { + mtu = ETH_PAYLOAD_MAX; + } + + buffer = ofpbuf_new_with_headroom(VLAN_ETH_HEADER_LEN + mtu, DP_NETDEV_HEADROOM); + + retval = (rx->is_tap + ? netdev_linux_rxq_recv_tap(rx->fd, buffer) + : netdev_linux_rxq_recv_sock(rx->fd, buffer)); + + if (retval) { + if (retval != EAGAIN && retval != EMSGSIZE) { VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s", - ovs_strerror(errno), netdev_rx_get_name(rx_)); + ovs_strerror(errno), netdev_rxq_get_name(rxq_)); } - return -errno; + ofpbuf_delete(buffer); + } else { + dp_packet_pad(buffer); + packet[0] = buffer; + *c = 1; } + + return retval; } static void -netdev_linux_rx_wait(struct netdev_rx *rx_) +netdev_linux_rxq_wait(struct netdev_rxq *rxq_) { - struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_); + struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_); poll_fd_wait(rx->fd, POLLIN); } static int -netdev_linux_rx_drain(struct netdev_rx *rx_) +netdev_linux_rxq_drain(struct netdev_rxq *rxq_) { - struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_); + struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_); if (rx->is_tap) { struct ifreq ifr; - int error = af_inet_ifreq_ioctl(netdev_rx_get_name(rx_), &ifr, + int error = af_inet_ifreq_ioctl(netdev_rxq_get_name(rxq_), &ifr, SIOCGIFTXQLEN, "SIOCGIFTXQLEN"); if (error) { return error; @@ -881,8 +1052,11 @@ netdev_linux_rx_drain(struct netdev_rx *rx_) * The kernel maintains a packet transmission queue, so the caller is not * expected to do additional queuing of packets. */ static int -netdev_linux_send(struct netdev *netdev_, const void *data, size_t size) +netdev_linux_send(struct netdev *netdev_, struct ofpbuf *pkt, bool may_steal) { + const void *data = ofpbuf_data(pkt); + size_t size = ofpbuf_size(pkt); + for (;;) { ssize_t retval; @@ -933,6 +1107,10 @@ netdev_linux_send(struct netdev *netdev_, const void *data, size_t size) retval = write(netdev->tap_fd, data, size); } + if (may_steal) { + ofpbuf_delete(pkt); + } + if (retval < 0) { /* The Linux AF_PACKET implementation never blocks waiting for room * for packets, instead returning ENOBUFS. Translate this into @@ -947,8 +1125,8 @@ netdev_linux_send(struct netdev *netdev_, const void *data, size_t size) } return errno; } else if (retval != size) { - VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%zd bytes of " - "%zu) on %s", retval, size, netdev_get_name(netdev_)); + VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%"PRIuSIZE"d bytes of " + "%"PRIuSIZE") on %s", retval, size, netdev_get_name(netdev_)); return EMSGSIZE; } else { return 0; @@ -979,7 +1157,7 @@ netdev_linux_set_etheraddr(struct netdev *netdev_, const uint8_t mac[ETH_ADDR_LEN]) { struct netdev_linux *netdev = netdev_linux_cast(netdev_); - struct netdev_saved_flags *sf = NULL; + enum netdev_flags old_flags = 0; int error; ovs_mutex_lock(&netdev->mutex); @@ -994,7 +1172,7 @@ netdev_linux_set_etheraddr(struct netdev *netdev_, /* Tap devices must be brought down before setting the address. */ if (is_tap_netdev(netdev_)) { - netdev_turn_flags_off(netdev_, NETDEV_UP, &sf); + update_flags(netdev, NETDEV_UP, 0, &old_flags); } error = set_etheraddr(netdev_get_name(netdev_), mac); if (!error || error == ENODEV) { @@ -1005,7 +1183,9 @@ netdev_linux_set_etheraddr(struct netdev *netdev_, } } - netdev_restore_flags(sf); + if (is_tap_netdev(netdev_) && old_flags & NETDEV_UP) { + update_flags(netdev, 0, NETDEV_UP, &old_flags); + } exit: ovs_mutex_unlock(&netdev->mutex); @@ -1036,21 +1216,16 @@ netdev_linux_get_etheraddr(const struct netdev *netdev_, return error; } -/* Returns the maximum size of transmitted (and received) packets on 'netdev', - * in bytes, not including the hardware header; thus, this is typically 1500 - * bytes for Ethernet devices. */ static int -netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup) +netdev_linux_get_mtu__(struct netdev_linux *netdev, int *mtup) { - struct netdev_linux *netdev = netdev_linux_cast(netdev_); int error; - ovs_mutex_lock(&netdev->mutex); if (!(netdev->cache_valid & VALID_MTU)) { struct ifreq ifr; netdev->netdev_mtu_error = af_inet_ifreq_ioctl( - netdev_get_name(netdev_), &ifr, SIOCGIFMTU, "SIOCGIFMTU"); + netdev_get_name(&netdev->up), &ifr, SIOCGIFMTU, "SIOCGIFMTU"); netdev->mtu = ifr.ifr_mtu; netdev->cache_valid |= VALID_MTU; } @@ -1059,6 +1234,21 @@ netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup) if (!error) { *mtup = netdev->mtu; } + + return error; +} + +/* Returns the maximum size of transmitted (and received) packets on 'netdev', + * in bytes, not including the hardware header; thus, this is typically 1500 + * bytes for Ethernet devices. */ +static int +netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup) +{ + struct netdev_linux *netdev = netdev_linux_cast(netdev_); + int error; + + ovs_mutex_lock(&netdev->mutex); + error = netdev_linux_get_mtu__(netdev, mtup); ovs_mutex_unlock(&netdev->mutex); return error; @@ -1207,6 +1397,14 @@ netdev_linux_set_miimon_interval(struct netdev *netdev_, ovs_mutex_lock(&netdev->mutex); interval = interval > 0 ? MAX(interval, 100) : 0; if (netdev->miimon_interval != interval) { + int junk; + + if (interval && !netdev->miimon_interval) { + atomic_add(&miimon_cnt, 1, &junk); + } else if (!interval && netdev->miimon_interval) { + atomic_sub(&miimon_cnt, 1, &junk); + } + netdev->miimon_interval = interval; timer_set_expired(&netdev->miimon_timer); } @@ -1267,34 +1465,6 @@ netdev_linux_miimon_wait(void) shash_destroy(&device_shash); } -/* Check whether we can we use RTM_GETLINK to get network device statistics. - * In pre-2.6.19 kernels, this was only available if wireless extensions were - * enabled. */ -static bool -check_for_working_netlink_stats(void) -{ - /* Decide on the netdev_get_stats() implementation to use. Netlink is - * preferable, so if that works, we'll use it. */ - int ifindex = do_get_ifindex("lo"); - if (ifindex < 0) { - VLOG_WARN("failed to get ifindex for lo, " - "obtaining netdev stats from proc"); - return false; - } else { - struct netdev_stats stats; - int error = get_stats_via_netlink(ifindex, &stats); - if (!error) { - VLOG_DBG("obtaining netdev stats via rtnetlink"); - return true; - } else { - VLOG_INFO("RTM_GETLINK failed (%s), obtaining netdev stats " - "via proc (you are probably running a pre-2.6.19 " - "kernel)", ovs_strerror(error)); - return false; - } - } -} - static void swap_uint64(uint64_t *a, uint64_t *b) { @@ -1376,38 +1546,6 @@ get_stats_via_vport(const struct netdev *netdev_, } } -static int -netdev_linux_sys_get_stats(const struct netdev *netdev_, - struct netdev_stats *stats) -{ - static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER; - static int use_netlink_stats; - int error; - - if (ovsthread_once_start(&once)) { - use_netlink_stats = check_for_working_netlink_stats(); - ovsthread_once_done(&once); - } - - if (use_netlink_stats) { - int ifindex; - - error = get_ifindex(netdev_, &ifindex); - if (!error) { - error = get_stats_via_netlink(ifindex, stats); - } - } else { - error = get_stats_via_proc(netdev_get_name(netdev_), stats); - } - - if (error) { - VLOG_WARN_RL(&rl, "%s: linux-sys get stats failed %d", - netdev_get_name(netdev_), error); - } - return error; - -} - /* Retrieves current device stats for 'netdev-linux'. */ static int netdev_linux_get_stats(const struct netdev *netdev_, @@ -1419,7 +1557,7 @@ netdev_linux_get_stats(const struct netdev *netdev_, ovs_mutex_lock(&netdev->mutex); get_stats_via_vport(netdev_, stats); - error = netdev_linux_sys_get_stats(netdev_, &dev_stats); + error = get_stats_via_netlink(netdev_, &dev_stats); if (error) { if (!netdev->vport_stats_error) { error = 0; @@ -1462,7 +1600,7 @@ netdev_tap_get_stats(const struct netdev *netdev_, struct netdev_stats *stats) ovs_mutex_lock(&netdev->mutex); get_stats_via_vport(netdev_, stats); - error = netdev_linux_sys_get_stats(netdev_, &dev_stats); + error = get_stats_via_netlink(netdev_, &dev_stats); if (error) { if (!netdev->vport_stats_error) { error = 0; @@ -1557,7 +1695,6 @@ netdev_internal_set_stats(struct netdev *netdev, static void netdev_linux_read_features(struct netdev_linux *netdev) - OVS_REQUIRES(netdev->mutex) { struct ethtool_cmd ecmd; uint32_t speed; @@ -2070,8 +2207,13 @@ netdev_linux_get_queue_stats(const struct netdev *netdev_, return error; } +struct queue_dump_state { + struct nl_dump dump; + struct ofpbuf buf; +}; + static bool -start_queue_dump(const struct netdev *netdev, struct nl_dump *dump) +start_queue_dump(const struct netdev *netdev, struct queue_dump_state *state) { struct ofpbuf request; struct tcmsg *tcmsg; @@ -2081,40 +2223,49 @@ start_queue_dump(const struct netdev *netdev, struct nl_dump *dump) return false; } tcmsg->tcm_parent = 0; - nl_dump_start(dump, NETLINK_ROUTE, &request); + nl_dump_start(&state->dump, NETLINK_ROUTE, &request); ofpbuf_uninit(&request); + + ofpbuf_init(&state->buf, NL_DUMP_BUFSIZE); return true; } static int -netdev_linux_dump_queues(const struct netdev *netdev_, - netdev_dump_queues_cb *cb, void *aux) +finish_queue_dump(struct queue_dump_state *state) { - struct netdev_linux *netdev = netdev_linux_cast(netdev_); + ofpbuf_uninit(&state->buf); + return nl_dump_done(&state->dump); +} + +struct netdev_linux_queue_state { + unsigned int *queues; + size_t cur_queue; + size_t n_queues; +}; + +static int +netdev_linux_queue_dump_start(const struct netdev *netdev_, void **statep) +{ + const struct netdev_linux *netdev = netdev_linux_cast(netdev_); int error; ovs_mutex_lock(&netdev->mutex); error = tc_query_qdisc(netdev_); if (!error) { if (netdev->tc->ops->class_get) { - struct tc_queue *queue, *next_queue; - struct smap details; - - smap_init(&details); - HMAP_FOR_EACH_SAFE (queue, next_queue, hmap_node, - &netdev->tc->queues) { - int retval; - - smap_clear(&details); - - retval = netdev->tc->ops->class_get(netdev_, queue, &details); - if (!retval) { - (*cb)(queue->queue_id, &details, aux); - } else { - error = retval; - } + struct netdev_linux_queue_state *state; + struct tc_queue *queue; + size_t i; + + *statep = state = xmalloc(sizeof *state); + state->n_queues = hmap_count(&netdev->tc->queues); + state->cur_queue = 0; + state->queues = xmalloc(state->n_queues * sizeof *state->queues); + + i = 0; + HMAP_FOR_EACH (queue, hmap_node, &netdev->tc->queues) { + state->queues[i++] = queue->queue_id; } - smap_destroy(&details); } else { error = EOPNOTSUPP; } @@ -2124,6 +2275,41 @@ netdev_linux_dump_queues(const struct netdev *netdev_, return error; } +static int +netdev_linux_queue_dump_next(const struct netdev *netdev_, void *state_, + unsigned int *queue_idp, struct smap *details) +{ + const struct netdev_linux *netdev = netdev_linux_cast(netdev_); + struct netdev_linux_queue_state *state = state_; + int error = EOF; + + ovs_mutex_lock(&netdev->mutex); + while (state->cur_queue < state->n_queues) { + unsigned int queue_id = state->queues[state->cur_queue++]; + struct tc_queue *queue = tc_find_queue(netdev_, queue_id); + + if (queue) { + *queue_idp = queue_id; + error = netdev->tc->ops->class_get(netdev_, queue, details); + break; + } + } + ovs_mutex_unlock(&netdev->mutex); + + return error; +} + +static int +netdev_linux_queue_dump_done(const struct netdev *netdev OVS_UNUSED, + void *state_) +{ + struct netdev_linux_queue_state *state = state_; + + free(state->queues); + free(state); + return 0; +} + static int netdev_linux_dump_queue_stats(const struct netdev *netdev_, netdev_dump_queue_stats_cb *cb, void *aux) @@ -2134,17 +2320,17 @@ netdev_linux_dump_queue_stats(const struct netdev *netdev_, ovs_mutex_lock(&netdev->mutex); error = tc_query_qdisc(netdev_); if (!error) { - struct nl_dump dump; + struct queue_dump_state state; if (!netdev->tc->ops->class_dump_stats) { error = EOPNOTSUPP; - } else if (!start_queue_dump(netdev_, &dump)) { + } else if (!start_queue_dump(netdev_, &state)) { error = ENODEV; } else { struct ofpbuf msg; int retval; - while (nl_dump_next(&dump, &msg)) { + while (nl_dump_next(&state.dump, &msg, &state.buf)) { retval = netdev->tc->ops->class_dump_stats(netdev_, &msg, cb, aux); if (retval) { @@ -2152,7 +2338,7 @@ netdev_linux_dump_queue_stats(const struct netdev *netdev_, } } - retval = nl_dump_done(&dump); + retval = finish_queue_dump(&state); if (retval) { error = retval; } @@ -2227,14 +2413,14 @@ parse_if_inet6_line(const char *line, { uint8_t *s6 = in6->s6_addr; #define X8 "%2"SCNx8 - return sscanf(line, - " "X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 - "%*x %*x %*x %*x %16s\n", - &s6[0], &s6[1], &s6[2], &s6[3], - &s6[4], &s6[5], &s6[6], &s6[7], - &s6[8], &s6[9], &s6[10], &s6[11], - &s6[12], &s6[13], &s6[14], &s6[15], - ifname) == 17; + return ovs_scan(line, + " "X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 + "%*x %*x %*x %*x %16s\n", + &s6[0], &s6[1], &s6[2], &s6[3], + &s6[4], &s6[5], &s6[6], &s6[7], + &s6[8], &s6[9], &s6[10], &s6[11], + &s6[12], &s6[13], &s6[14], &s6[15], + ifname); } /* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address (if @@ -2342,12 +2528,11 @@ netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop, int refcnt, metric, mtu; unsigned int flags, use, window, irtt; - if (sscanf(line, - "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32 - " %d %u %u\n", - iface, &dest, &gateway, &flags, &refcnt, - &use, &metric, &mask, &mtu, &window, &irtt) != 11) { - + if (!ovs_scan(line, + "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32 + " %d %u %u\n", + iface, &dest, &gateway, &flags, &refcnt, + &use, &metric, &mask, &mtu, &window, &irtt)) { VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s", fn, ln, line); continue; @@ -2461,6 +2646,9 @@ nd_to_iff_flags(enum netdev_flags nd) if (nd & NETDEV_PROMISC) { iff |= IFF_PROMISC; } + if (nd & NETDEV_LOOPBACK) { + iff |= IFF_LOOPBACK; + } return iff; } @@ -2474,41 +2662,43 @@ iff_to_nd_flags(int iff) if (iff & IFF_PROMISC) { nd |= NETDEV_PROMISC; } + if (iff & IFF_LOOPBACK) { + nd |= NETDEV_LOOPBACK; + } return nd; } static int -netdev_linux_update_flags(struct netdev *netdev_, enum netdev_flags off, - enum netdev_flags on, enum netdev_flags *old_flagsp) +update_flags(struct netdev_linux *netdev, enum netdev_flags off, + enum netdev_flags on, enum netdev_flags *old_flagsp) + OVS_REQUIRES(netdev->mutex) { - struct netdev_linux *netdev = netdev_linux_cast(netdev_); int old_flags, new_flags; int error = 0; - ovs_mutex_lock(&netdev->mutex); old_flags = netdev->ifi_flags; *old_flagsp = iff_to_nd_flags(old_flags); new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on); if (new_flags != old_flags) { - error = set_flags(netdev_get_name(netdev_), new_flags); - get_flags(netdev_, &netdev->ifi_flags); + error = set_flags(netdev_get_name(&netdev->up), new_flags); + get_flags(&netdev->up, &netdev->ifi_flags); } - ovs_mutex_unlock(&netdev->mutex); return error; } -static unsigned int -netdev_linux_change_seq(const struct netdev *netdev_) +static int +netdev_linux_update_flags(struct netdev *netdev_, enum netdev_flags off, + enum netdev_flags on, enum netdev_flags *old_flagsp) { struct netdev_linux *netdev = netdev_linux_cast(netdev_); - unsigned int change_seq; + int error; ovs_mutex_lock(&netdev->mutex); - change_seq = netdev->change_seq; + error = update_flags(netdev, off, on, old_flagsp); ovs_mutex_unlock(&netdev->mutex); - return change_seq; + return error; } #define NETDEV_LINUX_CLASS(NAME, CONSTRUCT, GET_STATS, SET_STATS, \ @@ -2554,7 +2744,9 @@ netdev_linux_change_seq(const struct netdev *netdev_) netdev_linux_set_queue, \ netdev_linux_delete_queue, \ netdev_linux_get_queue_stats, \ - netdev_linux_dump_queues, \ + netdev_linux_queue_dump_start, \ + netdev_linux_queue_dump_next, \ + netdev_linux_queue_dump_done, \ netdev_linux_dump_queue_stats, \ \ netdev_linux_get_in4, \ @@ -2567,15 +2759,13 @@ netdev_linux_change_seq(const struct netdev *netdev_) \ netdev_linux_update_flags, \ \ - netdev_linux_change_seq, \ - \ - netdev_linux_rx_alloc, \ - netdev_linux_rx_construct, \ - netdev_linux_rx_destruct, \ - netdev_linux_rx_dealloc, \ - netdev_linux_rx_recv, \ - netdev_linux_rx_wait, \ - netdev_linux_rx_drain, \ + netdev_linux_rxq_alloc, \ + netdev_linux_rxq_construct, \ + netdev_linux_rxq_destruct, \ + netdev_linux_rxq_dealloc, \ + netdev_linux_rxq_recv, \ + netdev_linux_rxq_wait, \ + netdev_linux_rxq_drain, \ } const struct netdev_class netdev_linux_class = @@ -2690,7 +2880,7 @@ htb_setup_class__(struct netdev *netdev, unsigned int handle, int error; int mtu; - error = netdev_get_mtu(netdev, &mtu); + error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu); if (error) { VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU", netdev_get_name(netdev)); @@ -2786,9 +2976,10 @@ htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id, } static void -htb_parse_qdisc_details__(struct netdev *netdev, +htb_parse_qdisc_details__(struct netdev *netdev_, const struct smap *details, struct htb_class *hc) { + struct netdev_linux *netdev = netdev_linux_cast(netdev_); const char *max_rate_s; max_rate_s = smap_get(details, "max-rate"); @@ -2796,7 +2987,8 @@ htb_parse_qdisc_details__(struct netdev *netdev, if (!hc->max_rate) { enum netdev_features current; - netdev_get_features(netdev, ¤t, NULL, NULL, NULL); + netdev_linux_read_features(netdev); + current = !netdev->get_features_error ? netdev->current : 0; hc->max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8; } hc->min_rate = hc->max_rate; @@ -2815,7 +3007,7 @@ htb_parse_class_details__(struct netdev *netdev, const char *priority_s = smap_get(details, "priority"); int mtu, error; - error = netdev_get_mtu(netdev, &mtu); + error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu); if (error) { VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU", netdev_get_name(netdev)); @@ -2924,7 +3116,7 @@ static int htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED) { struct ofpbuf msg; - struct nl_dump dump; + struct queue_dump_state state; struct htb_class hc; /* Get qdisc options. */ @@ -2933,17 +3125,17 @@ htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED) htb_install__(netdev, hc.max_rate); /* Get queues. */ - if (!start_queue_dump(netdev, &dump)) { + if (!start_queue_dump(netdev, &state)) { return ENODEV; } - while (nl_dump_next(&dump, &msg)) { + while (nl_dump_next(&state.dump, &msg, &state.buf)) { unsigned int queue_id; if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) { htb_update_queue__(netdev, queue_id, &hc); } } - nl_dump_done(&dump); + finish_queue_dump(&state); return 0; } @@ -3263,9 +3455,10 @@ hfsc_query_class__(const struct netdev *netdev, unsigned int handle, } static void -hfsc_parse_qdisc_details__(struct netdev *netdev, const struct smap *details, +hfsc_parse_qdisc_details__(struct netdev *netdev_, const struct smap *details, struct hfsc_class *class) { + struct netdev_linux *netdev = netdev_linux_cast(netdev_); uint32_t max_rate; const char *max_rate_s; @@ -3275,7 +3468,8 @@ hfsc_parse_qdisc_details__(struct netdev *netdev, const struct smap *details, if (!max_rate) { enum netdev_features current; - netdev_get_features(netdev, ¤t, NULL, NULL, NULL); + netdev_linux_read_features(netdev); + current = !netdev->get_features_error ? netdev->current : 0; max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8; } @@ -3422,18 +3616,18 @@ static int hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED) { struct ofpbuf msg; - struct nl_dump dump; + struct queue_dump_state state; struct hfsc_class hc; hc.max_rate = 0; hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL); hfsc_install__(netdev, hc.max_rate); - if (!start_queue_dump(netdev, &dump)) { + if (!start_queue_dump(netdev, &state)) { return ENODEV; } - while (nl_dump_next(&dump, &msg)) { + while (nl_dump_next(&state.dump, &msg, &state.buf)) { unsigned int queue_id; if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) { @@ -3441,7 +3635,7 @@ hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED) } } - nl_dump_done(&dump); + finish_queue_dump(&state); return 0; } @@ -4386,110 +4580,41 @@ netdev_stats_from_rtnl_link_stats(struct netdev_stats *dst, } static int -get_stats_via_netlink(int ifindex, struct netdev_stats *stats) +get_stats_via_netlink(const struct netdev *netdev_, struct netdev_stats *stats) { - /* Policy for RTNLGRP_LINK messages. - * - * There are *many* more fields in these messages, but currently we only - * care about these fields. */ - static const struct nl_policy rtnlgrp_link_policy[] = { - [IFLA_IFNAME] = { .type = NL_A_STRING, .optional = false }, - [IFLA_STATS] = { .type = NL_A_UNSPEC, .optional = true, - .min_len = sizeof(struct rtnl_link_stats) }, - }; - struct ofpbuf request; struct ofpbuf *reply; - struct ifinfomsg *ifi; - struct nlattr *attrs[ARRAY_SIZE(rtnlgrp_link_policy)]; int error; ofpbuf_init(&request, 0); - nl_msg_put_nlmsghdr(&request, sizeof *ifi, RTM_GETLINK, NLM_F_REQUEST); - ifi = ofpbuf_put_zeros(&request, sizeof *ifi); - ifi->ifi_family = PF_UNSPEC; - ifi->ifi_index = ifindex; + nl_msg_put_nlmsghdr(&request, + sizeof(struct ifinfomsg) + NL_ATTR_SIZE(IFNAMSIZ), + RTM_GETLINK, NLM_F_REQUEST); + ofpbuf_put_zeros(&request, sizeof(struct ifinfomsg)); + nl_msg_put_string(&request, IFLA_IFNAME, netdev_get_name(netdev_)); error = nl_transact(NETLINK_ROUTE, &request, &reply); ofpbuf_uninit(&request); if (error) { return error; } - if (!nl_policy_parse(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg), - rtnlgrp_link_policy, - attrs, ARRAY_SIZE(rtnlgrp_link_policy))) { - ofpbuf_delete(reply); - return EPROTO; - } - - if (!attrs[IFLA_STATS]) { - VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats"); - ofpbuf_delete(reply); - return EPROTO; + if (ofpbuf_try_pull(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg))) { + const struct nlattr *a = nl_attr_find(reply, 0, IFLA_STATS); + if (a && nl_attr_get_size(a) >= sizeof(struct rtnl_link_stats)) { + netdev_stats_from_rtnl_link_stats(stats, nl_attr_get(a)); + error = 0; + } else { + VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats"); + error = EPROTO; + } + } else { + VLOG_WARN_RL(&rl, "short RTM_GETLINK reply"); + error = EPROTO; } - netdev_stats_from_rtnl_link_stats(stats, nl_attr_get(attrs[IFLA_STATS])); ofpbuf_delete(reply); - - return 0; -} - -static int -get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats) -{ - static const char fn[] = "/proc/net/dev"; - char line[1024]; - FILE *stream; - int ln; - - stream = fopen(fn, "r"); - if (!stream) { - VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, ovs_strerror(errno)); - return errno; - } - - ln = 0; - while (fgets(line, sizeof line, stream)) { - if (++ln >= 3) { - char devname[16]; -#define X64 "%"SCNu64 - if (sscanf(line, - " %15[^:]:" - X64 X64 X64 X64 X64 X64 X64 "%*u" - X64 X64 X64 X64 X64 X64 X64 "%*u", - devname, - &stats->rx_bytes, - &stats->rx_packets, - &stats->rx_errors, - &stats->rx_dropped, - &stats->rx_fifo_errors, - &stats->rx_frame_errors, - &stats->multicast, - &stats->tx_bytes, - &stats->tx_packets, - &stats->tx_errors, - &stats->tx_dropped, - &stats->tx_fifo_errors, - &stats->collisions, - &stats->tx_carrier_errors) != 15) { - VLOG_WARN_RL(&rl, "%s:%d: parse error", fn, ln); - } else if (!strcmp(devname, netdev_name)) { - stats->rx_length_errors = UINT64_MAX; - stats->rx_over_errors = UINT64_MAX; - stats->rx_crc_errors = UINT64_MAX; - stats->rx_missed_errors = UINT64_MAX; - stats->tx_aborted_errors = UINT64_MAX; - stats->tx_heartbeat_errors = UINT64_MAX; - stats->tx_window_errors = UINT64_MAX; - fclose(stream); - return 0; - } - } - } - VLOG_WARN_RL(&rl, "%s: no stats for %s", fn, netdev_name); - fclose(stream); - return ENODEV; + return error; } static int