X-Git-Url: http://git.onelab.eu/?a=blobdiff_plain;f=lib%2Fnetdev-linux.c;h=c1d93237eb59165d9fbd13b008d1812dcde0bce3;hb=HEAD;hp=68d476f187afcd8fef2ffcd28b34a78129855641;hpb=4b0424809b823101c969a0691fc1db0c880ae64a;p=sliver-openvswitch.git diff --git a/lib/netdev-linux.c b/lib/netdev-linux.c index 68d476f18..c1d93237e 100644 --- a/lib/netdev-linux.c +++ b/lib/netdev-linux.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2009, 2010, 2011, 2012, 2013 Nicira, Inc. + * Copyright (c) 2009, 2010, 2011, 2012, 2013, 2014 Nicira, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -33,7 +33,6 @@ #include #include #include -#include #include #include #include @@ -48,9 +47,9 @@ #include #include -#include "connectivity.h" #include "coverage.h" #include "dpif-linux.h" +#include "dpif-netdev.h" #include "dynamic-string.h" #include "fatal-signal.h" #include "hash.h" @@ -66,7 +65,6 @@ #include "packets.h" #include "poll-loop.h" #include "rtnetlink-link.h" -#include "seq.h" #include "shash.h" #include "socket-util.h" #include "sset.h" @@ -109,6 +107,36 @@ COVERAGE_DEFINE(netdev_set_ethtool); #define TC_RTAB_SIZE 1024 #endif +/* Linux 2.6.21 introduced struct tpacket_auxdata. + * Linux 2.6.27 added the tp_vlan_tci member. + * Linux 3.0 defined TP_STATUS_VLAN_VALID. + * Linux 3.13 repurposed a padding member for tp_vlan_tpid and defined + * TP_STATUS_VLAN_TPID_VALID. + * + * With all this churn it's easiest to unconditionally define a replacement + * structure that has everything we want. + */ +#ifndef PACKET_AUXDATA +#define PACKET_AUXDATA 8 +#endif +#ifndef TP_STATUS_VLAN_VALID +#define TP_STATUS_VLAN_VALID (1 << 4) +#endif +#ifndef TP_STATUS_VLAN_TPID_VALID +#define TP_STATUS_VLAN_TPID_VALID (1 << 6) +#endif +#undef tpacket_auxdata +#define tpacket_auxdata rpl_tpacket_auxdata +struct tpacket_auxdata { + uint32_t tp_status; + uint32_t tp_len; + uint32_t tp_snaplen; + uint16_t tp_mac; + uint16_t tp_net; + uint16_t tp_vlan_tci; + uint16_t tp_vlan_tpid; +}; + enum { VALID_IFINDEX = 1 << 0, VALID_ETHERADDR = 1 << 1, @@ -394,8 +422,8 @@ struct netdev_linux { int tap_fd; }; -struct netdev_rx_linux { - struct netdev_rx up; +struct netdev_rxq_linux { + struct netdev_rxq up; bool is_tap; int fd; }; @@ -432,6 +460,7 @@ static int af_packet_sock(void); static bool netdev_linux_miimon_enabled(void); static void netdev_linux_miimon_run(void); static void netdev_linux_miimon_wait(void); +static int netdev_linux_get_mtu__(struct netdev_linux *netdev, int *mtup); static bool is_netdev_linux_class(const struct netdev_class *netdev_class) @@ -453,11 +482,11 @@ netdev_linux_cast(const struct netdev *netdev) return CONTAINER_OF(netdev, struct netdev_linux, up); } -static struct netdev_rx_linux * -netdev_rx_linux_cast(const struct netdev_rx *rx) +static struct netdev_rxq_linux * +netdev_rxq_linux_cast(const struct netdev_rxq *rx) { ovs_assert(is_netdev_linux_class(netdev_get_class(rx->netdev))); - return CONTAINER_OF(rx, struct netdev_rx_linux, up); + return CONTAINER_OF(rx, struct netdev_rxq_linux, up); } static void netdev_linux_update(struct netdev_linux *netdev, @@ -585,7 +614,7 @@ netdev_linux_changed(struct netdev_linux *dev, unsigned int ifi_flags, unsigned int mask) OVS_REQUIRES(dev->mutex) { - seq_change(connectivity_seq_get()); + netdev_change_seq_changed(&dev->up); if ((dev->ifi_flags ^ ifi_flags) & IFF_RUNNING) { dev->carrier_resets++; @@ -742,17 +771,17 @@ netdev_linux_dealloc(struct netdev *netdev_) free(netdev); } -static struct netdev_rx * -netdev_linux_rx_alloc(void) +static struct netdev_rxq * +netdev_linux_rxq_alloc(void) { - struct netdev_rx_linux *rx = xzalloc(sizeof *rx); + struct netdev_rxq_linux *rx = xzalloc(sizeof *rx); return &rx->up; } static int -netdev_linux_rx_construct(struct netdev_rx *rx_) +netdev_linux_rxq_construct(struct netdev_rxq *rxq_) { - struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_); + struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_); struct netdev *netdev_ = rx->up.netdev; struct netdev_linux *netdev = netdev_linux_cast(netdev_); int error; @@ -763,7 +792,7 @@ netdev_linux_rx_construct(struct netdev_rx *rx_) rx->fd = netdev->tap_fd; } else { struct sockaddr_ll sll; - int ifindex; + int ifindex, val; /* Result of tcpdump -dd inbound */ static const struct sock_filter filt[] = { { 0x28, 0, 0, 0xfffff004 }, /* ldh [0] */ @@ -783,6 +812,14 @@ netdev_linux_rx_construct(struct netdev_rx *rx_) goto error; } + val = 1; + if (setsockopt(rx->fd, SOL_PACKET, PACKET_AUXDATA, &val, sizeof val)) { + error = errno; + VLOG_ERR("%s: failed to mark socket for auxdata (%s)", + netdev_get_name(netdev_), ovs_strerror(error)); + goto error; + } + /* Set non-blocking mode. */ error = set_nonblocking(rx->fd); if (error) { @@ -799,7 +836,7 @@ netdev_linux_rx_construct(struct netdev_rx *rx_) memset(&sll, 0, sizeof sll); sll.sll_family = AF_PACKET; sll.sll_ifindex = ifindex; - sll.sll_protocol = (OVS_FORCE unsigned short int) htons(ETH_P_ALL); + sll.sll_protocol = htons(ETH_P_ALL); if (bind(rx->fd, (struct sockaddr *) &sll, sizeof sll) < 0) { error = errno; VLOG_ERR("%s: failed to bind raw socket (%s)", @@ -830,9 +867,9 @@ error: } static void -netdev_linux_rx_destruct(struct netdev_rx *rx_) +netdev_linux_rxq_destruct(struct netdev_rxq *rxq_) { - struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_); + struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_); if (!rx->is_tap) { close(rx->fd); @@ -840,50 +877,160 @@ netdev_linux_rx_destruct(struct netdev_rx *rx_) } static void -netdev_linux_rx_dealloc(struct netdev_rx *rx_) +netdev_linux_rxq_dealloc(struct netdev_rxq *rxq_) { - struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_); + struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_); free(rx); } +static ovs_be16 +auxdata_to_vlan_tpid(const struct tpacket_auxdata *aux) +{ + if (aux->tp_status & TP_STATUS_VLAN_TPID_VALID) { + return htons(aux->tp_vlan_tpid); + } else { + return htons(ETH_TYPE_VLAN); + } +} + +static bool +auxdata_has_vlan_tci(const struct tpacket_auxdata *aux) +{ + return aux->tp_vlan_tci || aux->tp_status & TP_STATUS_VLAN_VALID; +} + static int -netdev_linux_rx_recv(struct netdev_rx *rx_, void *data, size_t size) +netdev_linux_rxq_recv_sock(int fd, struct ofpbuf *buffer) { - struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_); + size_t size; ssize_t retval; + struct iovec iov; + struct cmsghdr *cmsg; + union { + struct cmsghdr cmsg; + char buffer[CMSG_SPACE(sizeof(struct tpacket_auxdata))]; + } cmsg_buffer; + struct msghdr msgh; + + /* Reserve headroom for a single VLAN tag */ + ofpbuf_reserve(buffer, VLAN_HEADER_LEN); + size = ofpbuf_tailroom(buffer); + + iov.iov_base = ofpbuf_data(buffer); + iov.iov_len = size; + msgh.msg_name = NULL; + msgh.msg_namelen = 0; + msgh.msg_iov = &iov; + msgh.msg_iovlen = 1; + msgh.msg_control = &cmsg_buffer; + msgh.msg_controllen = sizeof cmsg_buffer; + msgh.msg_flags = 0; do { - retval = (rx->is_tap - ? read(rx->fd, data, size) - : recv(rx->fd, data, size, MSG_TRUNC)); + retval = recvmsg(fd, &msgh, MSG_TRUNC); } while (retval < 0 && errno == EINTR); - if (retval >= 0) { - return retval > size ? -EMSGSIZE : retval; - } else { - if (errno != EAGAIN) { + if (retval < 0) { + return errno; + } else if (retval > size) { + return EMSGSIZE; + } + + ofpbuf_set_size(buffer, ofpbuf_size(buffer) + retval); + + for (cmsg = CMSG_FIRSTHDR(&msgh); cmsg; cmsg = CMSG_NXTHDR(&msgh, cmsg)) { + const struct tpacket_auxdata *aux; + + if (cmsg->cmsg_level != SOL_PACKET + || cmsg->cmsg_type != PACKET_AUXDATA + || cmsg->cmsg_len < CMSG_LEN(sizeof(struct tpacket_auxdata))) { + continue; + } + + aux = ALIGNED_CAST(struct tpacket_auxdata *, CMSG_DATA(cmsg)); + if (auxdata_has_vlan_tci(aux)) { + if (retval < ETH_HEADER_LEN) { + return EINVAL; + } + + eth_push_vlan(buffer, auxdata_to_vlan_tpid(aux), + htons(aux->tp_vlan_tci)); + break; + } + } + + return 0; +} + +static int +netdev_linux_rxq_recv_tap(int fd, struct ofpbuf *buffer) +{ + ssize_t retval; + size_t size = ofpbuf_tailroom(buffer); + + do { + retval = read(fd, ofpbuf_data(buffer), size); + } while (retval < 0 && errno == EINTR); + + if (retval < 0) { + return errno; + } else if (retval > size) { + return EMSGSIZE; + } + + ofpbuf_set_size(buffer, ofpbuf_size(buffer) + retval); + return 0; +} + +static int +netdev_linux_rxq_recv(struct netdev_rxq *rxq_, struct ofpbuf **packet, int *c) +{ + struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_); + struct netdev *netdev = rx->up.netdev; + struct ofpbuf *buffer; + ssize_t retval; + int mtu; + + if (netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu)) { + mtu = ETH_PAYLOAD_MAX; + } + + buffer = ofpbuf_new_with_headroom(VLAN_ETH_HEADER_LEN + mtu, DP_NETDEV_HEADROOM); + + retval = (rx->is_tap + ? netdev_linux_rxq_recv_tap(rx->fd, buffer) + : netdev_linux_rxq_recv_sock(rx->fd, buffer)); + + if (retval) { + if (retval != EAGAIN && retval != EMSGSIZE) { VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s", - ovs_strerror(errno), netdev_rx_get_name(rx_)); + ovs_strerror(errno), netdev_rxq_get_name(rxq_)); } - return -errno; + ofpbuf_delete(buffer); + } else { + dp_packet_pad(buffer); + packet[0] = buffer; + *c = 1; } + + return retval; } static void -netdev_linux_rx_wait(struct netdev_rx *rx_) +netdev_linux_rxq_wait(struct netdev_rxq *rxq_) { - struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_); + struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_); poll_fd_wait(rx->fd, POLLIN); } static int -netdev_linux_rx_drain(struct netdev_rx *rx_) +netdev_linux_rxq_drain(struct netdev_rxq *rxq_) { - struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_); + struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_); if (rx->is_tap) { struct ifreq ifr; - int error = af_inet_ifreq_ioctl(netdev_rx_get_name(rx_), &ifr, + int error = af_inet_ifreq_ioctl(netdev_rxq_get_name(rxq_), &ifr, SIOCGIFTXQLEN, "SIOCGIFTXQLEN"); if (error) { return error; @@ -905,8 +1052,11 @@ netdev_linux_rx_drain(struct netdev_rx *rx_) * The kernel maintains a packet transmission queue, so the caller is not * expected to do additional queuing of packets. */ static int -netdev_linux_send(struct netdev *netdev_, const void *data, size_t size) +netdev_linux_send(struct netdev *netdev_, struct ofpbuf *pkt, bool may_steal) { + const void *data = ofpbuf_data(pkt); + size_t size = ofpbuf_size(pkt); + for (;;) { ssize_t retval; @@ -957,6 +1107,10 @@ netdev_linux_send(struct netdev *netdev_, const void *data, size_t size) retval = write(netdev->tap_fd, data, size); } + if (may_steal) { + ofpbuf_delete(pkt); + } + if (retval < 0) { /* The Linux AF_PACKET implementation never blocks waiting for room * for packets, instead returning ENOBUFS. Translate this into @@ -1409,9 +1563,17 @@ netdev_linux_get_stats(const struct netdev *netdev_, error = 0; } } else if (netdev->vport_stats_error) { - /* stats not available from OVS then use ioctl stats. */ + /* stats not available from OVS then use netdev stats. */ *stats = dev_stats; } else { + /* Use kernel netdev's packet and byte counts since vport's counters + * do not reflect packet counts on the wire when GSO, TSO or GRO are + * enabled. */ + stats->rx_packets = dev_stats.rx_packets; + stats->rx_bytes = dev_stats.rx_bytes; + stats->tx_packets = dev_stats.tx_packets; + stats->tx_bytes = dev_stats.tx_bytes; + stats->rx_errors += dev_stats.rx_errors; stats->tx_errors += dev_stats.tx_errors; stats->rx_dropped += dev_stats.rx_dropped; @@ -1475,6 +1637,14 @@ netdev_tap_get_stats(const struct netdev *netdev_, struct netdev_stats *stats) stats->tx_heartbeat_errors = 0; stats->tx_window_errors = 0; } else { + /* Use kernel netdev's packet and byte counts since vport counters + * do not reflect packet counts on the wire when GSO, TSO or GRO + * are enabled. */ + stats->rx_packets = dev_stats.tx_packets; + stats->rx_bytes = dev_stats.tx_bytes; + stats->tx_packets = dev_stats.rx_packets; + stats->tx_bytes = dev_stats.rx_bytes; + stats->rx_dropped += dev_stats.tx_dropped; stats->tx_dropped += dev_stats.rx_dropped; @@ -2053,8 +2223,13 @@ netdev_linux_get_queue_stats(const struct netdev *netdev_, return error; } +struct queue_dump_state { + struct nl_dump dump; + struct ofpbuf buf; +}; + static bool -start_queue_dump(const struct netdev *netdev, struct nl_dump *dump) +start_queue_dump(const struct netdev *netdev, struct queue_dump_state *state) { struct ofpbuf request; struct tcmsg *tcmsg; @@ -2064,11 +2239,20 @@ start_queue_dump(const struct netdev *netdev, struct nl_dump *dump) return false; } tcmsg->tcm_parent = 0; - nl_dump_start(dump, NETLINK_ROUTE, &request); + nl_dump_start(&state->dump, NETLINK_ROUTE, &request); ofpbuf_uninit(&request); + + ofpbuf_init(&state->buf, NL_DUMP_BUFSIZE); return true; } +static int +finish_queue_dump(struct queue_dump_state *state) +{ + ofpbuf_uninit(&state->buf); + return nl_dump_done(&state->dump); +} + struct netdev_linux_queue_state { unsigned int *queues; size_t cur_queue; @@ -2152,17 +2336,17 @@ netdev_linux_dump_queue_stats(const struct netdev *netdev_, ovs_mutex_lock(&netdev->mutex); error = tc_query_qdisc(netdev_); if (!error) { - struct nl_dump dump; + struct queue_dump_state state; if (!netdev->tc->ops->class_dump_stats) { error = EOPNOTSUPP; - } else if (!start_queue_dump(netdev_, &dump)) { + } else if (!start_queue_dump(netdev_, &state)) { error = ENODEV; } else { struct ofpbuf msg; int retval; - while (nl_dump_next(&dump, &msg)) { + while (nl_dump_next(&state.dump, &msg, &state.buf)) { retval = netdev->tc->ops->class_dump_stats(netdev_, &msg, cb, aux); if (retval) { @@ -2170,7 +2354,7 @@ netdev_linux_dump_queue_stats(const struct netdev *netdev_, } } - retval = nl_dump_done(&dump); + retval = finish_queue_dump(&state); if (retval) { error = retval; } @@ -2591,13 +2775,13 @@ netdev_linux_update_flags(struct netdev *netdev_, enum netdev_flags off, \ netdev_linux_update_flags, \ \ - netdev_linux_rx_alloc, \ - netdev_linux_rx_construct, \ - netdev_linux_rx_destruct, \ - netdev_linux_rx_dealloc, \ - netdev_linux_rx_recv, \ - netdev_linux_rx_wait, \ - netdev_linux_rx_drain, \ + netdev_linux_rxq_alloc, \ + netdev_linux_rxq_construct, \ + netdev_linux_rxq_destruct, \ + netdev_linux_rxq_dealloc, \ + netdev_linux_rxq_recv, \ + netdev_linux_rxq_wait, \ + netdev_linux_rxq_drain, \ } const struct netdev_class netdev_linux_class = @@ -2948,7 +3132,7 @@ static int htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED) { struct ofpbuf msg; - struct nl_dump dump; + struct queue_dump_state state; struct htb_class hc; /* Get qdisc options. */ @@ -2957,17 +3141,17 @@ htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED) htb_install__(netdev, hc.max_rate); /* Get queues. */ - if (!start_queue_dump(netdev, &dump)) { + if (!start_queue_dump(netdev, &state)) { return ENODEV; } - while (nl_dump_next(&dump, &msg)) { + while (nl_dump_next(&state.dump, &msg, &state.buf)) { unsigned int queue_id; if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) { htb_update_queue__(netdev, queue_id, &hc); } } - nl_dump_done(&dump); + finish_queue_dump(&state); return 0; } @@ -3448,18 +3632,18 @@ static int hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED) { struct ofpbuf msg; - struct nl_dump dump; + struct queue_dump_state state; struct hfsc_class hc; hc.max_rate = 0; hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL); hfsc_install__(netdev, hc.max_rate); - if (!start_queue_dump(netdev, &dump)) { + if (!start_queue_dump(netdev, &state)) { return ENODEV; } - while (nl_dump_next(&dump, &msg)) { + while (nl_dump_next(&state.dump, &msg, &state.buf)) { unsigned int queue_id; if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) { @@ -3467,7 +3651,7 @@ hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED) } } - nl_dump_done(&dump); + finish_queue_dump(&state); return 0; }