X-Git-Url: http://git.onelab.eu/?a=blobdiff_plain;f=lib%2Fnetlink-socket.c;h=bc462353434ddc215cc48027eafcfd0aae130dbb;hb=4cf6a09c619d2c362ecfa688c90aadae4b1be39e;hp=f4635c8221c1f11a090c4e081d39c82a58aa4eed;hpb=0428ae5f7b6b3a83d501e8b8491923650b79f854;p=sliver-openvswitch.git diff --git a/lib/netlink-socket.c b/lib/netlink-socket.c index f4635c822..bc4623534 100644 --- a/lib/netlink-socket.c +++ b/lib/netlink-socket.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include "coverage.h" #include "dynamic-string.h" @@ -32,6 +33,7 @@ #include "poll-loop.h" #include "socket-util.h" #include "stress.h" +#include "util.h" #include "vlog.h" VLOG_DEFINE_THIS_MODULE(netlink_socket); @@ -62,12 +64,20 @@ struct nl_sock int fd; uint32_t pid; int protocol; - bool any_groups; struct nl_dump *dump; + unsigned int rcvbuf; /* Receive buffer size (SO_RCVBUF). */ }; -static int alloc_pid(uint32_t *); -static void free_pid(uint32_t); +/* Compile-time limit on iovecs, so that we can allocate a maximum-size array + * of iovecs on the stack. */ +#define MAX_IOVS 128 + +/* Maximum number of iovecs that may be passed to sendmsg, capped at a + * minimum of _XOPEN_IOV_MAX (16) and a maximum of MAX_IOVS. + * + * Initialized by nl_sock_create(). */ +static int max_iovs; + static int nl_sock_cow__(struct nl_sock *); /* Creates a new netlink socket for the given netlink 'protocol' @@ -78,8 +88,26 @@ nl_sock_create(int protocol, struct nl_sock **sockp) { struct nl_sock *sock; struct sockaddr_nl local, remote; + socklen_t local_size; int retval = 0; + if (!max_iovs) { + int save_errno = errno; + errno = 0; + + max_iovs = sysconf(_SC_UIO_MAXIOV); + if (max_iovs < _XOPEN_IOV_MAX) { + if (max_iovs == -1 && errno) { + VLOG_WARN("sysconf(_SC_UIO_MAXIOV): %s", strerror(errno)); + } + max_iovs = _XOPEN_IOV_MAX; + } else if (max_iovs > MAX_IOVS) { + max_iovs = MAX_IOVS; + } + + errno = save_errno; + } + *sockp = NULL; sock = malloc(sizeof *sock); if (sock == NULL) { @@ -92,37 +120,40 @@ nl_sock_create(int protocol, struct nl_sock **sockp) goto error; } sock->protocol = protocol; - sock->any_groups = false; sock->dump = NULL; - retval = alloc_pid(&sock->pid); - if (retval) { + retval = get_socket_rcvbuf(sock->fd); + if (retval < 0) { + retval = -retval; goto error; } + sock->rcvbuf = retval; - /* Bind local address as our selected pid. */ - memset(&local, 0, sizeof local); - local.nl_family = AF_NETLINK; - local.nl_pid = sock->pid; - if (bind(sock->fd, (struct sockaddr *) &local, sizeof local) < 0) { - VLOG_ERR("bind(%"PRIu32"): %s", sock->pid, strerror(errno)); - goto error_free_pid; - } - - /* Bind remote address as the kernel (pid 0). */ + /* Connect to kernel (pid 0) as remote address. */ memset(&remote, 0, sizeof remote); remote.nl_family = AF_NETLINK; remote.nl_pid = 0; if (connect(sock->fd, (struct sockaddr *) &remote, sizeof remote) < 0) { VLOG_ERR("connect(0): %s", strerror(errno)); - goto error_free_pid; + goto error; + } + + /* Obtain pid assigned by kernel. */ + local_size = sizeof local; + if (getsockname(sock->fd, (struct sockaddr *) &local, &local_size) < 0) { + VLOG_ERR("getsockname: %s", strerror(errno)); + goto error; } + if (local_size < sizeof local || local.nl_family != AF_NETLINK) { + VLOG_ERR("getsockname returned bad Netlink name"); + retval = EINVAL; + goto error; + } + sock->pid = local.nl_pid; *sockp = sock; return 0; -error_free_pid: - free_pid(sock->pid); error: if (retval == 0) { retval = errno; @@ -155,7 +186,6 @@ nl_sock_destroy(struct nl_sock *sock) sock->dump = NULL; } else { close(sock->fd); - free_pid(sock->pid); free(sock); } } @@ -164,6 +194,10 @@ nl_sock_destroy(struct nl_sock *sock) /* Tries to add 'sock' as a listener for 'multicast_group'. Returns 0 if * successful, otherwise a positive errno value. * + * A socket that is subscribed to a multicast group that receives asynchronous + * notifications must not be used for Netlink transactions or dumps, because + * transactions and dumps can cause notifications to be lost. + * * Multicast group numbers are always positive. * * It is not an error to attempt to join a multicast group to which a socket @@ -181,7 +215,6 @@ nl_sock_join_mcgroup(struct nl_sock *sock, unsigned int multicast_group) multicast_group, strerror(errno)); return errno; } - sock->any_groups = true; return 0; } @@ -353,6 +386,202 @@ nl_sock_recv(struct nl_sock *sock, struct ofpbuf **bufp, bool wait) return nl_sock_recv__(sock, bufp, wait); } +static int +find_nl_transaction_by_seq(struct nl_transaction **transactions, size_t n, + uint32_t seq) +{ + int i; + + for (i = 0; i < n; i++) { + struct nl_transaction *t = transactions[i]; + + if (seq == nl_msg_nlmsghdr(t->request)->nlmsg_seq) { + return i; + } + } + + return -1; +} + +static void +nl_sock_record_errors__(struct nl_transaction **transactions, size_t n, + int error) +{ + size_t i; + + for (i = 0; i < n; i++) { + transactions[i]->error = error; + transactions[i]->reply = NULL; + } +} + +static int +nl_sock_transact_multiple__(struct nl_sock *sock, + struct nl_transaction **transactions, size_t n, + size_t *done) +{ + struct iovec iovs[MAX_IOVS]; + struct msghdr msg; + int error; + int i; + + *done = 0; + for (i = 0; i < n; i++) { + struct ofpbuf *request = transactions[i]->request; + struct nlmsghdr *nlmsg = nl_msg_nlmsghdr(request); + + nlmsg->nlmsg_len = request->size; + nlmsg->nlmsg_pid = sock->pid; + if (i == n - 1) { + /* Ensure that we get a reply even if the final request doesn't + * ordinarily call for one. */ + nlmsg->nlmsg_flags |= NLM_F_ACK; + } + + iovs[i].iov_base = request->data; + iovs[i].iov_len = request->size; + } + + memset(&msg, 0, sizeof msg); + msg.msg_iov = iovs; + msg.msg_iovlen = n; + do { + error = sendmsg(sock->fd, &msg, 0) < 0 ? errno : 0; + } while (error == EINTR); + + for (i = 0; i < n; i++) { + struct ofpbuf *request = transactions[i]->request; + + log_nlmsg(__func__, error, request->data, request->size, + sock->protocol); + } + if (!error) { + COVERAGE_ADD(netlink_sent, n); + } + + if (error) { + return error; + } + + while (n > 0) { + struct ofpbuf *reply; + + error = nl_sock_recv__(sock, &reply, true); + if (error) { + return error; + } + + i = find_nl_transaction_by_seq(transactions, n, + nl_msg_nlmsghdr(reply)->nlmsg_seq); + if (i < 0) { + VLOG_DBG_RL(&rl, "ignoring unexpected seq %#"PRIx32, + nl_msg_nlmsghdr(reply)->nlmsg_seq); + ofpbuf_delete(reply); + continue; + } + + nl_sock_record_errors__(transactions, i, 0); + if (nl_msg_nlmsgerr(reply, &error)) { + transactions[i]->reply = NULL; + transactions[i]->error = error; + if (error) { + VLOG_DBG_RL(&rl, "received NAK error=%d (%s)", + error, strerror(error)); + } + ofpbuf_delete(reply); + } else { + transactions[i]->reply = reply; + transactions[i]->error = 0; + } + + *done += i + 1; + transactions += i + 1; + n -= i + 1; + } + + return 0; +} + +/* Sends the 'request' member of the 'n' transactions in 'transactions' to the + * kernel, in order, and waits for responses to all of them. Fills in the + * 'error' member of each transaction with 0 if it was successful, otherwise + * with a positive errno value. 'reply' will be NULL on error or if the + * transaction was successful but had no reply beyond an indication of success. + * For a successful transaction that did have a more detailed reply, 'reply' + * will be set to the reply message. + * + * The caller is responsible for destroying each request and reply, and the + * transactions array itself. + * + * Before sending each message, this function will finalize nlmsg_len in each + * 'request' to match the ofpbuf's size, and set nlmsg_pid to 'sock''s pid. + * NLM_F_ACK will be added to some requests' nlmsg_flags. + * + * Bare Netlink is an unreliable transport protocol. This function layers + * reliable delivery and reply semantics on top of bare Netlink. See + * nl_sock_transact() for some caveats. + */ +void +nl_sock_transact_multiple(struct nl_sock *sock, + struct nl_transaction **transactions, size_t n) +{ + int max_batch_count; + int error; + + if (!n) { + return; + } + + error = nl_sock_cow__(sock); + if (error) { + nl_sock_record_errors__(transactions, n, error); + return; + } + + /* In theory, every request could have a 64 kB reply. But the default and + * maximum socket rcvbuf size with typical Dom0 memory sizes both tend to + * be a bit below 128 kB, so that would only allow a single message in a + * "batch". So we assume that replies average (at most) 4 kB, which allows + * a good deal of batching. + * + * In practice, most of the requests that we batch either have no reply at + * all or a brief reply. */ + max_batch_count = MAX(sock->rcvbuf / 4096, 1); + max_batch_count = MIN(max_batch_count, max_iovs); + + while (n > 0) { + size_t count, bytes; + size_t done; + + /* Batch up to 'max_batch_count' transactions. But cap it at about a + * page of requests total because big skbuffs are expensive to + * allocate in the kernel. */ +#if defined(PAGESIZE) + enum { MAX_BATCH_BYTES = MAX(1, PAGESIZE - 512) }; +#else + enum { MAX_BATCH_BYTES = 4096 - 512 }; +#endif + bytes = transactions[0]->request->size; + for (count = 1; count < n && count < max_batch_count; count++) { + if (bytes + transactions[count]->request->size > MAX_BATCH_BYTES) { + break; + } + bytes += transactions[count]->request->size; + } + + error = nl_sock_transact_multiple__(sock, transactions, count, &done); + transactions += done; + n -= done; + + if (error == ENOBUFS) { + VLOG_DBG_RL(&rl, "receive buffer overflow, resending request"); + } else if (error) { + VLOG_ERR_RL(&rl, "transaction error (%s)", strerror(error)); + nl_sock_record_errors__(transactions, n, error); + } + } +} + /* Sends 'request' to the kernel via 'sock' and waits for a response. If * successful, returns 0. On failure, returns a positive errno value. * @@ -394,68 +623,21 @@ nl_sock_recv(struct nl_sock *sock, struct ofpbuf **bufp, bool wait) * needs to be idempotent. */ int -nl_sock_transact(struct nl_sock *sock, - const struct ofpbuf *request, struct ofpbuf **replyp) +nl_sock_transact(struct nl_sock *sock, const struct ofpbuf *request, + struct ofpbuf **replyp) { - uint32_t seq = nl_msg_nlmsghdr(request)->nlmsg_seq; - struct nlmsghdr *nlmsghdr; - struct ofpbuf *reply; - int retval; + struct nl_transaction *transactionp; + struct nl_transaction transaction; + transaction.request = (struct ofpbuf *) request; + transactionp = &transaction; + nl_sock_transact_multiple(sock, &transactionp, 1); if (replyp) { - *replyp = NULL; - } - - /* Ensure that we get a reply even if this message doesn't ordinarily call - * for one. */ - nl_msg_nlmsghdr(request)->nlmsg_flags |= NLM_F_ACK; - -send: - retval = nl_sock_send(sock, request, true); - if (retval) { - return retval; - } - -recv: - retval = nl_sock_recv(sock, &reply, true); - if (retval) { - if (retval == ENOBUFS) { - COVERAGE_INC(netlink_overflow); - VLOG_DBG_RL(&rl, "receive buffer overflow, resending request"); - goto send; - } else { - return retval; - } - } - nlmsghdr = nl_msg_nlmsghdr(reply); - if (seq != nlmsghdr->nlmsg_seq) { - VLOG_DBG_RL(&rl, "ignoring seq %#"PRIx32" != expected %#"PRIx32, - nl_msg_nlmsghdr(reply)->nlmsg_seq, seq); - ofpbuf_delete(reply); - goto recv; - } - - /* If the reply is an error, discard the reply and return the error code. - * - * Except: if the reply is just an acknowledgement (error code of 0), and - * the caller is interested in the reply (replyp != NULL), pass the reply - * up to the caller. Otherwise the caller will get a return value of 0 - * and null '*replyp', which makes unwary callers likely to segfault. */ - if (nl_msg_nlmsgerr(reply, &retval) && (retval || !replyp)) { - ofpbuf_delete(reply); - if (retval) { - VLOG_DBG_RL(&rl, "received NAK error=%d (%s)", - retval, strerror(retval)); - } - return retval != EAGAIN ? retval : EPROTO; - } - - if (replyp) { - *replyp = reply; + *replyp = transaction.reply; } else { - ofpbuf_delete(reply); + ofpbuf_delete(transaction.reply); } - return 0; + return transaction.error; } /* Drain all the messages currently in 'sock''s receive queue. */ @@ -536,10 +718,9 @@ nl_dump_start(struct nl_dump *dump, nlmsghdr->nlmsg_flags |= NLM_F_DUMP | NLM_F_ACK; dump->seq = nlmsghdr->nlmsg_seq; dump->buffer = NULL; - if (sock->any_groups || sock->dump) { - /* 'sock' might belong to some multicast group, or it already has an - * ongoing dump. Clone the socket to avoid possibly intermixing - * multicast messages or previous dump results with our results. */ + if (sock->dump) { + /* 'sock' already has an ongoing dump. Clone the socket because + * Netlink only allows one dump at a time. */ dump->status = nl_sock_clone(sock, &dump->sock); if (dump->status) { return; @@ -667,6 +848,26 @@ nl_sock_wait(const struct nl_sock *sock, short int events) { poll_fd_wait(sock->fd, events); } + +/* Returns the underlying fd for 'sock', for use in "poll()"-like operations + * that can't use nl_sock_wait(). + * + * It's a little tricky to use the returned fd correctly, because nl_sock does + * "copy on write" to allow a single nl_sock to be used for notifications, + * transactions, and dumps. If 'sock' is used only for notifications and + * transactions (and never for dump) then the usage is safe. */ +int +nl_sock_fd(const struct nl_sock *sock) +{ + return sock->fd; +} + +/* Returns the PID associated with this socket. */ +uint32_t +nl_sock_pid(const struct nl_sock *sock) +{ + return sock->pid; +} /* Miscellaneous. */ @@ -775,9 +976,8 @@ nl_lookup_genl_mcgroup(const char *family_name, const char *group_name, unsigned int *multicast_group, unsigned int fallback) { struct nlattr *family_attrs[ARRAY_SIZE(family_policy)]; - struct ofpbuf all_mcs; + const struct nlattr *mc; struct ofpbuf *reply; - struct nlattr *mc; unsigned int left; int error; @@ -795,8 +995,7 @@ nl_lookup_genl_mcgroup(const char *family_name, const char *group_name, goto exit; } - nl_attr_get_nested(family_attrs[CTRL_ATTR_MCAST_GROUPS], &all_mcs); - NL_ATTR_FOR_EACH (mc, left, all_mcs.data, all_mcs.size) { + NL_NESTED_FOR_EACH (mc, left, family_attrs[CTRL_ATTR_MCAST_GROUPS]) { static const struct nl_policy mc_policy[] = { [CTRL_ATTR_MCAST_GRP_ID] = {.type = NL_A_U32}, [CTRL_ATTR_MCAST_GRP_NAME] = {.type = NL_A_STRING}, @@ -851,54 +1050,6 @@ nl_lookup_genl_family(const char *name, int *number) return *number > 0 ? 0 : -*number; } -/* Netlink PID. - * - * Every Netlink socket must be bound to a unique 32-bit PID. By convention, - * programs that have a single Netlink socket use their Unix process ID as PID, - * and programs with multiple Netlink sockets add a unique per-socket - * identifier in the bits above the Unix process ID. - * - * The kernel has Netlink PID 0. - */ - -/* Parameters for how many bits in the PID should come from the Unix process ID - * and how many unique per-socket. */ -#define SOCKET_BITS 10 -#define MAX_SOCKETS (1u << SOCKET_BITS) - -#define PROCESS_BITS (32 - SOCKET_BITS) -#define MAX_PROCESSES (1u << PROCESS_BITS) -#define PROCESS_MASK ((uint32_t) (MAX_PROCESSES - 1)) - -/* Bit vector of unused socket identifiers. */ -static uint32_t avail_sockets[ROUND_UP(MAX_SOCKETS, 32)]; - -/* Allocates and returns a new Netlink PID. */ -static int -alloc_pid(uint32_t *pid) -{ - int i; - - for (i = 0; i < MAX_SOCKETS; i++) { - if ((avail_sockets[i / 32] & (1u << (i % 32))) == 0) { - avail_sockets[i / 32] |= 1u << (i % 32); - *pid = (getpid() & PROCESS_MASK) | (i << PROCESS_BITS); - return 0; - } - } - VLOG_ERR("netlink pid space exhausted"); - return ENOBUFS; -} - -/* Makes the specified 'pid' available for reuse. */ -static void -free_pid(uint32_t pid) -{ - int sock = pid >> PROCESS_BITS; - assert(avail_sockets[sock / 32] & (1u << (sock % 32))); - avail_sockets[sock / 32] &= ~(1u << (sock % 32)); -} - static void nlmsghdr_to_string(const struct nlmsghdr *h, int protocol, struct ds *ds) { @@ -947,10 +1098,8 @@ nlmsghdr_to_string(const struct nlmsghdr *h, int protocol, struct ds *ds) if (flags_left) { ds_put_format(ds, "[OTHER:%"PRIx16"]", flags_left); } - ds_put_format(ds, ", seq=%"PRIx32", pid=%"PRIu32"(%d:%d))", - h->nlmsg_seq, h->nlmsg_pid, - (int) (h->nlmsg_pid & PROCESS_MASK), - (int) (h->nlmsg_pid >> PROCESS_BITS)); + ds_put_format(ds, ", seq=%"PRIx32", pid=%"PRIu32, + h->nlmsg_seq, h->nlmsg_pid); } static char *