X-Git-Url: http://git.onelab.eu/?a=blobdiff_plain;f=net%2Fnetlink%2Faf_netlink.c;h=87d8074b7757a534bd133bc84282abeefbd0b717;hb=refs%2Fheads%2Fvserver;hp=4aa05030ebae63cd589fc4f8aa98bda8b24c56e7;hpb=c7b5ebbddf7bcd3651947760f423e3783bbe6573;p=linux-2.6.git

diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index 4aa05030e..87d8074b7 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -13,15 +13,19 @@
  *	added netlink_proto_exit
  * Tue Jan 22 18:32:44 BRST 2002 Arnaldo C. de Melo <acme@conectiva.com.br>
  *	use nlk_sk, as sk->protinfo is on a diet 8)
- *
+ * Fri Jul 22 19:51:12 MEST 2005 Harald Welte <laforge@gnumonks.org>
+ *	- inc module use count of module that owns
+ *	  the kernel socket in case userspace opens
+ *	  socket of same protocol
+ *	- remove all module support, since netlink is
+ *	  mandatory if CONFIG_NET=y these days
  */
 
-#include <linux/config.h>
 #include <linux/module.h>
 
+#include <linux/capability.h>
 #include <linux/kernel.h>
 #include <linux/init.h>
-#include <linux/major.h>
 #include <linux/signal.h>
 #include <linux/sched.h>
 #include <linux/errno.h>
@@ -44,52 +48,94 @@
 #include <linux/smp_lock.h>
 #include <linux/notifier.h>
 #include <linux/security.h>
-#include <linux/vs_base.h>
+#include <linux/jhash.h>
+#include <linux/jiffies.h>
+#include <linux/random.h>
+#include <linux/bitops.h>
+#include <linux/mm.h>
+#include <linux/types.h>
+#include <linux/audit.h>
+#include <linux/selinux.h>
 #include <linux/vs_context.h>
 #include <linux/vs_network.h>
 #include <linux/vs_limit.h>
+
 #include <net/sock.h>
 #include <net/scm.h>
+#include <net/netlink.h>
 
-#define Nprintk(a...)
-
-#if defined(CONFIG_NETLINK_DEV) || defined(CONFIG_NETLINK_DEV_MODULE)
-#define NL_EMULATE_DEV
-#endif
+#define NLGRPSZ(x)	(ALIGN(x, sizeof(unsigned long) * 8) / 8)
 
-struct netlink_opt
-{
+struct netlink_sock {
+	/* struct sock has to be the first member of netlink_sock */
+	struct sock		sk;
 	u32			pid;
-	unsigned		groups;
 	u32			dst_pid;
-	unsigned		dst_groups;
+	u32			dst_group;
+	u32			flags;
+	u32			subscriptions;
+	u32			ngroups;
+	unsigned long		*groups;
 	unsigned long		state;
-	int			(*handler)(int unit, struct sk_buff *skb);
 	wait_queue_head_t	wait;
 	struct netlink_callback	*cb;
 	spinlock_t		cb_lock;
 	void			(*data_ready)(struct sock *sk, int bytes);
+	struct module		*module;
 };
 
-#define nlk_sk(__sk) ((struct netlink_opt *)(__sk)->sk_protinfo)
+#define NETLINK_KERNEL_SOCKET	0x1
+#define NETLINK_RECV_PKTINFO	0x2
 
-static struct hlist_head nl_table[MAX_LINKS];
-static DECLARE_WAIT_QUEUE_HEAD(nl_table_wait);
-static unsigned nl_nonroot[MAX_LINKS];
+static inline struct netlink_sock *nlk_sk(struct sock *sk)
+{
+	return (struct netlink_sock *)sk;
+}
 
-#ifdef NL_EMULATE_DEV
-static struct socket *netlink_kernel[MAX_LINKS];
-#endif
+struct nl_pid_hash {
+	struct hlist_head *table;
+	unsigned long rehash_time;
+
+	unsigned int mask;
+	unsigned int shift;
+
+	unsigned int entries;
+	unsigned int max_shift;
+
+	u32 rnd;
+};
+
+struct netlink_table {
+	struct nl_pid_hash hash;
+	struct hlist_head mc_list;
+	unsigned long *listeners;
+	unsigned int nl_nonroot;
+	unsigned int groups;
+	struct module *module;
+	int registered;
+};
+
+static struct netlink_table *nl_table;
+
+static DECLARE_WAIT_QUEUE_HEAD(nl_table_wait);
 
 static int netlink_dump(struct sock *sk);
 static void netlink_destroy_callback(struct netlink_callback *cb);
 
-atomic_t netlink_sock_nr;
-
-static rwlock_t nl_table_lock = RW_LOCK_UNLOCKED;
+static DEFINE_RWLOCK(nl_table_lock);
 static atomic_t nl_table_users = ATOMIC_INIT(0);
 
-static struct notifier_block *netlink_chain;
+static ATOMIC_NOTIFIER_HEAD(netlink_chain);
+
+static u32 netlink_group_mask(u32 group)
+{
+	return group ? 
1 << (group - 1) : 0; +} + +static struct hlist_head *nl_pid_hashfn(struct nl_pid_hash *hash, u32 pid) +{ + return &hash->table[jhash_1word(pid, hash->rnd) & hash->mask]; +} static void netlink_sock_destruct(struct sock *sk) { @@ -102,13 +148,7 @@ static void netlink_sock_destruct(struct sock *sk) BUG_TRAP(!atomic_read(&sk->sk_rmem_alloc)); BUG_TRAP(!atomic_read(&sk->sk_wmem_alloc)); BUG_TRAP(!nlk_sk(sk)->cb); - - kfree(nlk_sk(sk)); - - atomic_dec(&netlink_sock_nr); -#ifdef NETLINK_REFCNT_DEBUG - printk(KERN_DEBUG "NETLINK %p released, %d are still alive\n", sk, atomic_read(&netlink_sock_nr)); -#endif + BUG_TRAP(!nlk_sk(sk)->groups); } /* This lock without WQ_FLAG_EXCLUSIVE is good on UP and it is _very_ bad on SMP. @@ -119,7 +159,7 @@ static void netlink_sock_destruct(struct sock *sk) static void netlink_table_grab(void) { - write_lock_bh(&nl_table_lock); + write_lock_irq(&nl_table_lock); if (atomic_read(&nl_table_users)) { DECLARE_WAITQUEUE(wait, current); @@ -129,9 +169,9 @@ static void netlink_table_grab(void) set_current_state(TASK_UNINTERRUPTIBLE); if (atomic_read(&nl_table_users) == 0) break; - write_unlock_bh(&nl_table_lock); + write_unlock_irq(&nl_table_lock); schedule(); - write_lock_bh(&nl_table_lock); + write_lock_irq(&nl_table_lock); } __set_current_state(TASK_RUNNING); @@ -141,7 +181,7 @@ static void netlink_table_grab(void) static __inline__ void netlink_table_ungrab(void) { - write_unlock_bh(&nl_table_lock); + write_unlock_irq(&nl_table_lock); wake_up(&nl_table_wait); } @@ -164,11 +204,14 @@ netlink_unlock_table(void) static __inline__ struct sock *netlink_lookup(int protocol, u32 pid) { + struct nl_pid_hash *hash = &nl_table[protocol].hash; + struct hlist_head *head; struct sock *sk; struct hlist_node *node; read_lock(&nl_table_lock); - sk_for_each(sk, node, &nl_table[protocol]) { + head = nl_pid_hashfn(hash, pid); + sk_for_each(sk, node, head) { if (nlk_sk(sk)->pid == pid) { sock_hold(sk); goto found; @@ -180,27 +223,136 @@ found: return sk; } -static struct proto_ops netlink_ops; +static inline struct hlist_head *nl_pid_hash_alloc(size_t size) +{ + if (size <= PAGE_SIZE) + return kmalloc(size, GFP_ATOMIC); + else + return (struct hlist_head *) + __get_free_pages(GFP_ATOMIC, get_order(size)); +} + +static inline void nl_pid_hash_free(struct hlist_head *table, size_t size) +{ + if (size <= PAGE_SIZE) + kfree(table); + else + free_pages((unsigned long)table, get_order(size)); +} + +static int nl_pid_hash_rehash(struct nl_pid_hash *hash, int grow) +{ + unsigned int omask, mask, shift; + size_t osize, size; + struct hlist_head *otable, *table; + int i; + + omask = mask = hash->mask; + osize = size = (mask + 1) * sizeof(*table); + shift = hash->shift; + + if (grow) { + if (++shift > hash->max_shift) + return 0; + mask = mask * 2 + 1; + size *= 2; + } + + table = nl_pid_hash_alloc(size); + if (!table) + return 0; + + memset(table, 0, size); + otable = hash->table; + hash->table = table; + hash->mask = mask; + hash->shift = shift; + get_random_bytes(&hash->rnd, sizeof(hash->rnd)); + + for (i = 0; i <= omask; i++) { + struct sock *sk; + struct hlist_node *node, *tmp; + + sk_for_each_safe(sk, node, tmp, &otable[i]) + __sk_add_node(sk, nl_pid_hashfn(hash, nlk_sk(sk)->pid)); + } + + nl_pid_hash_free(otable, osize); + hash->rehash_time = jiffies + 10 * 60 * HZ; + return 1; +} + +static inline int nl_pid_hash_dilute(struct nl_pid_hash *hash, int len) +{ + int avg = hash->entries >> hash->shift; + + if (unlikely(avg > 1) && nl_pid_hash_rehash(hash, 1)) + return 1; + + if (unlikely(len 
> avg) && time_after(jiffies, hash->rehash_time)) { + nl_pid_hash_rehash(hash, 0); + return 1; + } + + return 0; +} + +static const struct proto_ops netlink_ops; + +static void +netlink_update_listeners(struct sock *sk) +{ + struct netlink_table *tbl = &nl_table[sk->sk_protocol]; + struct hlist_node *node; + unsigned long mask; + unsigned int i; + + for (i = 0; i < NLGRPSZ(tbl->groups)/sizeof(unsigned long); i++) { + mask = 0; + sk_for_each_bound(sk, node, &tbl->mc_list) + mask |= nlk_sk(sk)->groups[i]; + tbl->listeners[i] = mask; + } + /* this function is only called with the netlink table "grabbed", which + * makes sure updates are visible before bind or setsockopt return. */ +} static int netlink_insert(struct sock *sk, u32 pid) { + struct nl_pid_hash *hash = &nl_table[sk->sk_protocol].hash; + struct hlist_head *head; int err = -EADDRINUSE; struct sock *osk; struct hlist_node *node; + int len; netlink_table_grab(); - sk_for_each(osk, node, &nl_table[sk->sk_protocol]) { + head = nl_pid_hashfn(hash, pid); + len = 0; + sk_for_each(osk, node, head) { if (nlk_sk(osk)->pid == pid) break; + len++; } - if (!node) { - err = -EBUSY; - if (nlk_sk(sk)->pid == 0) { - nlk_sk(sk)->pid = pid; - sk_add_node(sk, &nl_table[sk->sk_protocol]); - err = 0; - } - } + if (node) + goto err; + + err = -EBUSY; + if (nlk_sk(sk)->pid) + goto err; + + err = -ENOMEM; + if (BITS_PER_LONG > 32 && unlikely(hash->entries >= UINT_MAX)) + goto err; + + if (len && nl_pid_hash_dilute(hash, len)) + head = nl_pid_hashfn(hash, pid); + hash->entries++; + nlk_sk(sk)->pid = pid; + sk_add_node(sk, head); + err = 0; + +err: netlink_table_ungrab(); return err; } @@ -208,14 +360,47 @@ static int netlink_insert(struct sock *sk, u32 pid) static void netlink_remove(struct sock *sk) { netlink_table_grab(); - sk_del_node_init(sk); + if (sk_del_node_init(sk)) + nl_table[sk->sk_protocol].hash.entries--; + if (nlk_sk(sk)->subscriptions) + __sk_del_bind_node(sk); netlink_table_ungrab(); } -static int netlink_create(struct socket *sock, int protocol) +static struct proto netlink_proto = { + .name = "NETLINK", + .owner = THIS_MODULE, + .obj_size = sizeof(struct netlink_sock), +}; + +static int __netlink_create(struct socket *sock, int protocol) { struct sock *sk; - struct netlink_opt *nlk; + struct netlink_sock *nlk; + + sock->ops = &netlink_ops; + + sk = sk_alloc(PF_NETLINK, GFP_KERNEL, &netlink_proto, 1); + if (!sk) + return -ENOMEM; + + sock_init_data(sock, sk); + + nlk = nlk_sk(sk); + spin_lock_init(&nlk->cb_lock); + init_waitqueue_head(&nlk->wait); + + sk->sk_destruct = netlink_sock_destruct; + sk->sk_protocol = protocol; + return 0; +} + +static int netlink_create(struct socket *sock, int protocol) +{ + struct module *module = NULL; + struct netlink_sock *nlk; + unsigned int groups; + int err = 0; sock->state = SS_UNCONNECTED; @@ -225,41 +410,37 @@ static int netlink_create(struct socket *sock, int protocol) if (protocol<0 || protocol >= MAX_LINKS) return -EPROTONOSUPPORT; - sock->ops = &netlink_ops; - - sk = sk_alloc(PF_NETLINK, GFP_KERNEL, 1, NULL); - if (!sk) - return -ENOMEM; - - sock_init_data(sock,sk); - sk_set_owner(sk, THIS_MODULE); - - nlk = sk->sk_protinfo = kmalloc(sizeof(*nlk), GFP_KERNEL); - if (!nlk) { - sk_free(sk); - return -ENOMEM; + netlink_lock_table(); +#ifdef CONFIG_KMOD + if (!nl_table[protocol].registered) { + netlink_unlock_table(); + request_module("net-pf-%d-proto-%d", PF_NETLINK, protocol); + netlink_lock_table(); } - memset(nlk, 0, sizeof(*nlk)); +#endif + if (nl_table[protocol].registered && + 
try_module_get(nl_table[protocol].module)) + module = nl_table[protocol].module; + groups = nl_table[protocol].groups; + netlink_unlock_table(); - spin_lock_init(&nlk->cb_lock); - init_waitqueue_head(&nlk->wait); - sk->sk_destruct = netlink_sock_destruct; - atomic_inc(&netlink_sock_nr); + if ((err = __netlink_create(sock, protocol)) < 0) + goto out_module; - set_vx_info(&sk->sk_vx_info, current->vx_info); - sk->sk_xid = vx_current_xid(); - vx_sock_inc(sk); - set_nx_info(&sk->sk_nx_info, current->nx_info); - sk->sk_nid = nx_current_nid(); + nlk = nlk_sk(sock->sk); + nlk->module = module; +out: + return err; - sk->sk_protocol = protocol; - return 0; +out_module: + module_put(module); + goto out; } static int netlink_release(struct socket *sock) { struct sock *sk = sock->sk; - struct netlink_opt *nlk; + struct netlink_sock *nlk; if (!sk) return 0; @@ -269,10 +450,10 @@ static int netlink_release(struct socket *sock) spin_lock(&nlk->cb_lock); if (nlk->cb) { - nlk->cb->done(nlk->cb); + if (nlk->cb->done) + nlk->cb->done(nlk->cb); netlink_destroy_callback(nlk->cb); nlk->cb = NULL; - __sock_put(sk); } spin_unlock(&nlk->cb_lock); @@ -285,19 +466,28 @@ static int netlink_release(struct socket *sock) skb_queue_purge(&sk->sk_write_queue); - if (nlk->pid && !nlk->groups) { + if (nlk->pid && !nlk->subscriptions) { struct netlink_notify n = { .protocol = sk->sk_protocol, .pid = nlk->pid, }; - notifier_call_chain(&netlink_chain, NETLINK_URELEASE, &n); + atomic_notifier_call_chain(&netlink_chain, + NETLINK_URELEASE, &n); } - - vx_sock_dec(sk); - clr_vx_info(&sk->sk_vx_info); - sk->sk_xid = -1; - clr_nx_info(&sk->sk_nx_info); - sk->sk_nid = -1; + + module_put(nlk->module); + + netlink_table_grab(); + if (nlk->flags & NETLINK_KERNEL_SOCKET) { + kfree(nl_table[sk->sk_protocol].listeners); + nl_table[sk->sk_protocol].module = NULL; + nl_table[sk->sk_protocol].registered = 0; + } else if (nlk->subscriptions) + netlink_update_listeners(sk); + netlink_table_ungrab(); + + kfree(nlk->groups); + nlk->groups = NULL; sock_put(sk); return 0; @@ -306,19 +496,24 @@ static int netlink_release(struct socket *sock) static int netlink_autobind(struct socket *sock) { struct sock *sk = sock->sk; + struct nl_pid_hash *hash = &nl_table[sk->sk_protocol].hash; + struct hlist_head *head; struct sock *osk; struct hlist_node *node; - s32 pid = current->pid; + s32 pid = current->tgid; int err; + static s32 rover = -4097; retry: + cond_resched(); netlink_table_grab(); - sk_for_each(osk, node, &nl_table[sk->sk_protocol]) { + head = nl_pid_hashfn(hash, pid); + sk_for_each(osk, node, head) { if (nlk_sk(osk)->pid == pid) { /* Bind collision, search negative pid values. */ - if (pid > 0) - pid = -4096; - pid--; + pid = rover--; + if (rover > -4097) + rover = -4097; netlink_table_ungrab(); goto retry; } @@ -328,20 +523,58 @@ retry: err = netlink_insert(sk, pid); if (err == -EADDRINUSE) goto retry; - nlk_sk(sk)->groups = 0; - return 0; + + /* If 2 threads race to autobind, that is fine. 
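+	   The loser simply finds the socket already bound: netlink_insert()
+	   returns -EBUSY, which is turned into success just below.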
*/ + if (err == -EBUSY) + err = 0; + + return err; } -static inline int netlink_capable(struct socket *sock, unsigned flag) +static inline int netlink_capable(struct socket *sock, unsigned int flag) { - return (nl_nonroot[sock->sk->sk_protocol] & flag) || + return (nl_table[sock->sk->sk_protocol].nl_nonroot & flag) || capable(CAP_NET_ADMIN); } +static void +netlink_update_subscriptions(struct sock *sk, unsigned int subscriptions) +{ + struct netlink_sock *nlk = nlk_sk(sk); + + if (nlk->subscriptions && !subscriptions) + __sk_del_bind_node(sk); + else if (!nlk->subscriptions && subscriptions) + sk_add_bind_node(sk, &nl_table[sk->sk_protocol].mc_list); + nlk->subscriptions = subscriptions; +} + +static int netlink_alloc_groups(struct sock *sk) +{ + struct netlink_sock *nlk = nlk_sk(sk); + unsigned int groups; + int err = 0; + + netlink_lock_table(); + groups = nl_table[sk->sk_protocol].groups; + if (!nl_table[sk->sk_protocol].registered) + err = -ENOENT; + netlink_unlock_table(); + + if (err) + return err; + + nlk->groups = kzalloc(NLGRPSZ(groups), GFP_KERNEL); + if (nlk->groups == NULL) + return -ENOMEM; + nlk->ngroups = groups; + return 0; +} + static int netlink_bind(struct socket *sock, struct sockaddr *addr, int addr_len) { struct sock *sk = sock->sk; - struct netlink_opt *nlk = nlk_sk(sk); + struct netlink_sock *nlk = nlk_sk(sk); struct sockaddr_nl *nladdr = (struct sockaddr_nl *)addr; int err; @@ -349,27 +582,39 @@ static int netlink_bind(struct socket *sock, struct sockaddr *addr, int addr_len return -EINVAL; /* Only superuser is allowed to listen multicasts */ - if (nladdr->nl_groups && !netlink_capable(sock, NL_NONROOT_RECV)) - return -EPERM; + if (nladdr->nl_groups) { + if (!netlink_capable(sock, NL_NONROOT_RECV)) + return -EPERM; + if (nlk->groups == NULL) { + err = netlink_alloc_groups(sk); + if (err) + return err; + } + } if (nlk->pid) { if (nladdr->nl_pid != nlk->pid) return -EINVAL; - nlk->groups = nladdr->nl_groups; - return 0; + } else { + err = nladdr->nl_pid ? 
+ netlink_insert(sk, nladdr->nl_pid) : + netlink_autobind(sock); + if (err) + return err; } - if (nladdr->nl_pid == 0) { - err = netlink_autobind(sock); - if (err == 0) - nlk->groups = nladdr->nl_groups; - return err; - } + if (!nladdr->nl_groups && (nlk->groups == NULL || !(u32)nlk->groups[0])) + return 0; - err = netlink_insert(sk, nladdr->nl_pid); - if (err == 0) - nlk->groups = nladdr->nl_groups; - return err; + netlink_table_grab(); + netlink_update_subscriptions(sk, nlk->subscriptions + + hweight32(nladdr->nl_groups) - + hweight32(nlk->groups[0])); + nlk->groups[0] = (nlk->groups[0] & ~0xffffffffUL) | nladdr->nl_groups; + netlink_update_listeners(sk); + netlink_table_ungrab(); + + return 0; } static int netlink_connect(struct socket *sock, struct sockaddr *addr, @@ -377,13 +622,13 @@ static int netlink_connect(struct socket *sock, struct sockaddr *addr, { int err = 0; struct sock *sk = sock->sk; - struct netlink_opt *nlk = nlk_sk(sk); + struct netlink_sock *nlk = nlk_sk(sk); struct sockaddr_nl *nladdr=(struct sockaddr_nl*)addr; if (addr->sa_family == AF_UNSPEC) { sk->sk_state = NETLINK_UNCONNECTED; nlk->dst_pid = 0; - nlk->dst_groups = 0; + nlk->dst_group = 0; return 0; } if (addr->sa_family != AF_NETLINK) @@ -399,7 +644,7 @@ static int netlink_connect(struct socket *sock, struct sockaddr *addr, if (err == 0) { sk->sk_state = NETLINK_CONNECTED; nlk->dst_pid = nladdr->nl_pid; - nlk->dst_groups = nladdr->nl_groups; + nlk->dst_group = ffs(nladdr->nl_groups); } return err; @@ -408,7 +653,7 @@ static int netlink_connect(struct socket *sock, struct sockaddr *addr, static int netlink_getname(struct socket *sock, struct sockaddr *addr, int *addr_len, int peer) { struct sock *sk = sock->sk; - struct netlink_opt *nlk = nlk_sk(sk); + struct netlink_sock *nlk = nlk_sk(sk); struct sockaddr_nl *nladdr=(struct sockaddr_nl *)addr; nladdr->nl_family = AF_NETLINK; @@ -417,10 +662,10 @@ static int netlink_getname(struct socket *sock, struct sockaddr *addr, int *addr if (peer) { nladdr->nl_pid = nlk->dst_pid; - nladdr->nl_groups = nlk->dst_groups; + nladdr->nl_groups = netlink_group_mask(nlk->dst_group); } else { nladdr->nl_pid = nlk->pid; - nladdr->nl_groups = nlk->groups; + nladdr->nl_groups = nlk->groups ? nlk->groups[0] : 0; } return 0; } @@ -433,11 +678,11 @@ static void netlink_overrun(struct sock *sk) } } -struct sock *netlink_getsockbypid(struct sock *ssk, u32 pid) +static struct sock *netlink_getsockbypid(struct sock *ssk, u32 pid) { int protocol = ssk->sk_protocol; struct sock *sock; - struct netlink_opt *nlk; + struct netlink_sock *nlk; sock = netlink_lookup(protocol, pid); if (!sock) @@ -456,14 +701,13 @@ struct sock *netlink_getsockbypid(struct sock *ssk, u32 pid) struct sock *netlink_getsockbyfilp(struct file *filp) { - struct inode *inode = filp->f_dentry->d_inode; - struct socket *socket; + struct inode *inode = filp->f_path.dentry->d_inode; struct sock *sock; - if (!inode->i_sock || !(socket = SOCKET_I(inode))) + if (!S_ISSOCK(inode->i_mode)) return ERR_PTR(-ENOTSOCK); - sock = socket->sk; + sock = SOCKET_I(inode)->sk; if (sock->sk_family != AF_NETLINK) return ERR_PTR(-EINVAL); @@ -481,21 +725,18 @@ struct sock *netlink_getsockbyfilp(struct file *filp) * 0: continue * 1: repeat lookup - reference dropped while waiting for socket memory. 
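 *    In that case the skb has not been freed; the caller only needs to
 *    repeat the socket lookup and attach again.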
*/ -int netlink_attachskb(struct sock *sk, struct sk_buff *skb, int nonblock, long timeo) +int netlink_attachskb(struct sock *sk, struct sk_buff *skb, int nonblock, + long timeo, struct sock *ssk) { - struct netlink_opt *nlk; + struct netlink_sock *nlk; nlk = nlk_sk(sk); -#ifdef NL_EMULATE_DEV - if (nlk->handler) - return 0; -#endif if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf || test_bit(0, &nlk->state)) { DECLARE_WAITQUEUE(wait, current); if (!timeo) { - if (!nlk->pid) + if (!ssk || nlk_sk(ssk)->pid == 0) netlink_overrun(sk); sock_put(sk); kfree_skb(skb); @@ -520,26 +761,14 @@ int netlink_attachskb(struct sock *sk, struct sk_buff *skb, int nonblock, long t } return 1; } - skb_orphan(skb); skb_set_owner_r(skb, sk); return 0; } int netlink_sendskb(struct sock *sk, struct sk_buff *skb, int protocol) { - struct netlink_opt *nlk; int len = skb->len; - nlk = nlk_sk(sk); -#ifdef NL_EMULATE_DEV - if (nlk->handler) { - skb_orphan(skb); - len = nlk->handler(protocol, skb); - sock_put(sk); - return len; - } -#endif - skb_queue_tail(&sk->sk_receive_queue, skb); sk->sk_data_ready(sk, len); sock_put(sk); @@ -552,21 +781,29 @@ void netlink_detachskb(struct sock *sk, struct sk_buff *skb) sock_put(sk); } -static inline void netlink_trim(struct sk_buff *skb, int allocation) +static inline struct sk_buff *netlink_trim(struct sk_buff *skb, + gfp_t allocation) { - int delta = skb->end - skb->tail; + int delta; - /* If the packet is charged to a socket, the modification - * of truesize below is illegal and will corrupt socket - * buffer accounting state. - */ - BUG_ON(skb->list != NULL); + skb_orphan(skb); + delta = skb->end - skb->tail; if (delta * 2 < skb->truesize) - return; - if (pskb_expand_head(skb, 0, -delta, allocation)) - return; - skb->truesize -= delta; + return skb; + + if (skb_shared(skb)) { + struct sk_buff *nskb = skb_clone(skb, allocation); + if (!nskb) + return skb; + kfree_skb(skb); + skb = nskb; + } + + if (!pskb_expand_head(skb, 0, -delta, allocation)) + skb->truesize -= delta; + + return skb; } int netlink_unicast(struct sock *ssk, struct sk_buff *skb, u32 pid, int nonblock) @@ -575,7 +812,7 @@ int netlink_unicast(struct sock *ssk, struct sk_buff *skb, u32 pid, int nonblock int err; long timeo; - netlink_trim(skb, gfp_any()); + skb = netlink_trim(skb, gfp_any()); timeo = sock_sndtimeo(ssk, nonblock); retry: @@ -584,7 +821,7 @@ retry: kfree_skb(skb); return PTR_ERR(sk); } - err = netlink_attachskb(sk, skb, nonblock, timeo); + err = netlink_attachskb(sk, skb, nonblock, timeo, ssk); if (err == 1) goto retry; if (err) @@ -593,117 +830,277 @@ retry: return netlink_sendskb(sk, skb, ssk->sk_protocol); } +int netlink_has_listeners(struct sock *sk, unsigned int group) +{ + int res = 0; + + BUG_ON(!(nlk_sk(sk)->flags & NETLINK_KERNEL_SOCKET)); + if (group - 1 < nl_table[sk->sk_protocol].groups) + res = test_bit(group - 1, nl_table[sk->sk_protocol].listeners); + return res; +} +EXPORT_SYMBOL_GPL(netlink_has_listeners); + static __inline__ int netlink_broadcast_deliver(struct sock *sk, struct sk_buff *skb) { - struct netlink_opt *nlk = nlk_sk(sk); -#ifdef NL_EMULATE_DEV - if (nlk->handler) { - skb_orphan(skb); - nlk->handler(sk->sk_protocol, skb); - return 0; - } else -#endif + struct netlink_sock *nlk = nlk_sk(sk); + if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf && !test_bit(0, &nlk->state)) { - skb_orphan(skb); skb_set_owner_r(skb, sk); skb_queue_tail(&sk->sk_receive_queue, skb); sk->sk_data_ready(sk, skb->len); - return 0; + return atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf; } 
return -1; } +struct netlink_broadcast_data { + struct sock *exclude_sk; + u32 pid; + u32 group; + int failure; + int congested; + int delivered; + gfp_t allocation; + struct sk_buff *skb, *skb2; +}; + +static inline int do_one_broadcast(struct sock *sk, + struct netlink_broadcast_data *p) +{ + struct netlink_sock *nlk = nlk_sk(sk); + int val; + + if (p->exclude_sk == sk) + goto out; + + if (nlk->pid == p->pid || p->group - 1 >= nlk->ngroups || + !test_bit(p->group - 1, nlk->groups)) + goto out; + + if (p->failure) { + netlink_overrun(sk); + goto out; + } + + sock_hold(sk); + if (p->skb2 == NULL) { + if (skb_shared(p->skb)) { + p->skb2 = skb_clone(p->skb, p->allocation); + } else { + p->skb2 = skb_get(p->skb); + /* + * skb ownership may have been set when + * delivered to a previous socket. + */ + skb_orphan(p->skb2); + } + } + if (p->skb2 == NULL) { + netlink_overrun(sk); + /* Clone failed. Notify ALL listeners. */ + p->failure = 1; + } else if ((val = netlink_broadcast_deliver(sk, p->skb2)) < 0) { + netlink_overrun(sk); + } else { + p->congested |= val; + p->delivered = 1; + p->skb2 = NULL; + } + sock_put(sk); + +out: + return 0; +} + int netlink_broadcast(struct sock *ssk, struct sk_buff *skb, u32 pid, - u32 group, int allocation) + u32 group, gfp_t allocation) { - struct sock *sk; + struct netlink_broadcast_data info; struct hlist_node *node; - struct sk_buff *skb2 = NULL; - int protocol = ssk->sk_protocol; - int failure = 0, delivered = 0; + struct sock *sk; + + skb = netlink_trim(skb, allocation); - netlink_trim(skb, allocation); + info.exclude_sk = ssk; + info.pid = pid; + info.group = group; + info.failure = 0; + info.congested = 0; + info.delivered = 0; + info.allocation = allocation; + info.skb = skb; + info.skb2 = NULL; /* While we sleep in clone, do not allow to change socket list */ netlink_lock_table(); - sk_for_each(sk, node, &nl_table[protocol]) { - struct netlink_opt *nlk = nlk_sk(sk); - - if (ssk == sk) - continue; - - if (nlk->pid == pid || !(nlk->groups & group)) - continue; - - if (failure) { - netlink_overrun(sk); - continue; - } + sk_for_each_bound(sk, node, &nl_table[ssk->sk_protocol].mc_list) + do_one_broadcast(sk, &info); - sock_hold(sk); - if (skb2 == NULL) { - if (atomic_read(&skb->users) != 1) { - skb2 = skb_clone(skb, allocation); - } else { - skb2 = skb; - atomic_inc(&skb->users); - } - } - if (skb2 == NULL) { - netlink_overrun(sk); - /* Clone failed. Notify ALL listeners. 
*/ - failure = 1; - } else if (netlink_broadcast_deliver(sk, skb2)) { - netlink_overrun(sk); - } else { - delivered = 1; - skb2 = NULL; - } - sock_put(sk); - } + kfree_skb(skb); netlink_unlock_table(); - if (skb2) - kfree_skb(skb2); - kfree_skb(skb); + if (info.skb2) + kfree_skb(info.skb2); - if (delivered) + if (info.delivered) { + if (info.congested && (allocation & __GFP_WAIT)) + yield(); return 0; - if (failure) + } + if (info.failure) return -ENOBUFS; return -ESRCH; } +struct netlink_set_err_data { + struct sock *exclude_sk; + u32 pid; + u32 group; + int code; +}; + +static inline int do_one_set_err(struct sock *sk, + struct netlink_set_err_data *p) +{ + struct netlink_sock *nlk = nlk_sk(sk); + + if (sk == p->exclude_sk) + goto out; + + if (nlk->pid == p->pid || p->group - 1 >= nlk->ngroups || + !test_bit(p->group - 1, nlk->groups)) + goto out; + + sk->sk_err = p->code; + sk->sk_error_report(sk); +out: + return 0; +} + void netlink_set_err(struct sock *ssk, u32 pid, u32 group, int code) { - struct sock *sk; + struct netlink_set_err_data info; struct hlist_node *node; - int protocol = ssk->sk_protocol; + struct sock *sk; + + info.exclude_sk = ssk; + info.pid = pid; + info.group = group; + info.code = code; read_lock(&nl_table_lock); - sk_for_each(sk, node, &nl_table[protocol]) { - struct netlink_opt *nlk = nlk_sk(sk); - if (ssk == sk) - continue; - if (nlk->pid == pid || !(nlk->groups & group)) - continue; + sk_for_each_bound(sk, node, &nl_table[ssk->sk_protocol].mc_list) + do_one_set_err(sk, &info); - sk->sk_err = code; - sk->sk_error_report(sk); - } read_unlock(&nl_table_lock); } +static int netlink_setsockopt(struct socket *sock, int level, int optname, + char __user *optval, int optlen) +{ + struct sock *sk = sock->sk; + struct netlink_sock *nlk = nlk_sk(sk); + int val = 0, err; + + if (level != SOL_NETLINK) + return -ENOPROTOOPT; + + if (optlen >= sizeof(int) && + get_user(val, (int __user *)optval)) + return -EFAULT; + + switch (optname) { + case NETLINK_PKTINFO: + if (val) + nlk->flags |= NETLINK_RECV_PKTINFO; + else + nlk->flags &= ~NETLINK_RECV_PKTINFO; + err = 0; + break; + case NETLINK_ADD_MEMBERSHIP: + case NETLINK_DROP_MEMBERSHIP: { + unsigned int subscriptions; + int old, new = optname == NETLINK_ADD_MEMBERSHIP ? 1 : 0; + + if (!netlink_capable(sock, NL_NONROOT_RECV)) + return -EPERM; + if (nlk->groups == NULL) { + err = netlink_alloc_groups(sk); + if (err) + return err; + } + if (!val || val - 1 >= nlk->ngroups) + return -EINVAL; + netlink_table_grab(); + old = test_bit(val - 1, nlk->groups); + subscriptions = nlk->subscriptions - old + new; + if (new) + __set_bit(val - 1, nlk->groups); + else + __clear_bit(val - 1, nlk->groups); + netlink_update_subscriptions(sk, subscriptions); + netlink_update_listeners(sk); + netlink_table_ungrab(); + err = 0; + break; + } + default: + err = -ENOPROTOOPT; + } + return err; +} + +static int netlink_getsockopt(struct socket *sock, int level, int optname, + char __user *optval, int __user *optlen) +{ + struct sock *sk = sock->sk; + struct netlink_sock *nlk = nlk_sk(sk); + int len, val, err; + + if (level != SOL_NETLINK) + return -ENOPROTOOPT; + + if (get_user(len, optlen)) + return -EFAULT; + if (len < 0) + return -EINVAL; + + switch (optname) { + case NETLINK_PKTINFO: + if (len < sizeof(int)) + return -EINVAL; + len = sizeof(int); + val = nlk->flags & NETLINK_RECV_PKTINFO ? 
1 : 0; + if (put_user(len, optlen) || + put_user(val, optval)) + return -EFAULT; + err = 0; + break; + default: + err = -ENOPROTOOPT; + } + return err; +} + +static void netlink_cmsg_recv_pktinfo(struct msghdr *msg, struct sk_buff *skb) +{ + struct nl_pktinfo info; + + info.group = NETLINK_CB(skb).dst_group; + put_cmsg(msg, SOL_NETLINK, NETLINK_PKTINFO, sizeof(info), &info); +} + static inline void netlink_rcv_wake(struct sock *sk) { - struct netlink_opt *nlk = nlk_sk(sk); + struct netlink_sock *nlk = nlk_sk(sk); - if (!skb_queue_len(&sk->sk_receive_queue)) + if (skb_queue_empty(&sk->sk_receive_queue)) clear_bit(0, &nlk->state); if (!test_bit(0, &nlk->state)) wake_up_interruptible(&nlk->wait); @@ -714,10 +1111,10 @@ static int netlink_sendmsg(struct kiocb *kiocb, struct socket *sock, { struct sock_iocb *siocb = kiocb_to_siocb(kiocb); struct sock *sk = sock->sk; - struct netlink_opt *nlk = nlk_sk(sk); + struct netlink_sock *nlk = nlk_sk(sk); struct sockaddr_nl *addr=msg->msg_name; u32 dst_pid; - u32 dst_groups; + u32 dst_group; struct sk_buff *skb; int err; struct scm_cookie scm; @@ -735,12 +1132,12 @@ static int netlink_sendmsg(struct kiocb *kiocb, struct socket *sock, if (addr->nl_family != AF_NETLINK) return -EINVAL; dst_pid = addr->nl_pid; - dst_groups = addr->nl_groups; - if (dst_groups && !netlink_capable(sock, NL_NONROOT_SEND)) + dst_group = ffs(addr->nl_groups); + if (dst_group && !netlink_capable(sock, NL_NONROOT_SEND)) return -EPERM; } else { dst_pid = nlk->dst_pid; - dst_groups = nlk->dst_groups; + dst_group = nlk->dst_group; } if (!nlk->pid) { @@ -758,9 +1155,9 @@ static int netlink_sendmsg(struct kiocb *kiocb, struct socket *sock, goto out; NETLINK_CB(skb).pid = nlk->pid; - NETLINK_CB(skb).groups = nlk->groups; - NETLINK_CB(skb).dst_pid = dst_pid; - NETLINK_CB(skb).dst_groups = dst_groups; + NETLINK_CB(skb).dst_group = dst_group; + NETLINK_CB(skb).loginuid = audit_get_loginuid(current->audit_context); + selinux_get_task_sid(current, &(NETLINK_CB(skb).sid)); memcpy(NETLINK_CREDS(skb), &siocb->scm->creds, sizeof(struct ucred)); /* What can I do? 
Netlink is asynchronous, so that @@ -781,9 +1178,9 @@ static int netlink_sendmsg(struct kiocb *kiocb, struct socket *sock, goto out; } - if (dst_groups) { + if (dst_group) { atomic_inc(&skb->users); - netlink_broadcast(sk, skb, dst_pid, dst_groups, GFP_KERNEL); + netlink_broadcast(sk, skb, dst_pid, dst_group, GFP_KERNEL); } err = netlink_unicast(sk, skb, dst_pid, msg->msg_flags&MSG_DONTWAIT); @@ -798,7 +1195,7 @@ static int netlink_recvmsg(struct kiocb *kiocb, struct socket *sock, struct sock_iocb *siocb = kiocb_to_siocb(kiocb); struct scm_cookie scm; struct sock *sk = sock->sk; - struct netlink_opt *nlk = nlk_sk(sk); + struct netlink_sock *nlk = nlk_sk(sk); int noblock = flags&MSG_DONTWAIT; size_t copied; struct sk_buff *skb; @@ -829,10 +1226,13 @@ static int netlink_recvmsg(struct kiocb *kiocb, struct socket *sock, addr->nl_family = AF_NETLINK; addr->nl_pad = 0; addr->nl_pid = NETLINK_CB(skb).pid; - addr->nl_groups = NETLINK_CB(skb).dst_groups; + addr->nl_groups = netlink_group_mask(NETLINK_CB(skb).dst_group); msg->msg_namelen = sizeof(*addr); } + if (nlk->flags & NETLINK_RECV_PKTINFO) + netlink_cmsg_recv_pktinfo(msg, skb); + if (NULL == siocb->scm) { memset(&scm, 0, sizeof(scm)); siocb->scm = &scm; @@ -852,7 +1252,7 @@ out: static void netlink_data_ready(struct sock *sk, int len) { - struct netlink_opt *nlk = nlk_sk(sk); + struct netlink_sock *nlk = nlk_sk(sk); if (nlk->data_ready) nlk->data_ready(sk, len); @@ -866,10 +1266,16 @@ static void netlink_data_ready(struct sock *sk, int len) */ struct sock * -netlink_kernel_create(int unit, void (*input)(struct sock *sk, int len)) +netlink_kernel_create(int unit, unsigned int groups, + void (*input)(struct sock *sk, int len), + struct module *module) { struct socket *sock; struct sock *sk; + struct netlink_sock *nlk; + unsigned long *listeners = NULL; + + BUG_ON(!nl_table); if (unit<0 || unit>=MAX_LINKS) return NULL; @@ -877,23 +1283,46 @@ netlink_kernel_create(int unit, void (*input)(struct sock *sk, int len)) if (sock_create_lite(PF_NETLINK, SOCK_DGRAM, unit, &sock)) return NULL; - if (netlink_create(sock, unit) < 0) { - sock_release(sock); - return NULL; - } + if (__netlink_create(sock, unit) < 0) + goto out_sock_release; + + if (groups < 32) + groups = 32; + + listeners = kzalloc(NLGRPSZ(groups), GFP_KERNEL); + if (!listeners) + goto out_sock_release; + sk = sock->sk; sk->sk_data_ready = netlink_data_ready; if (input) nlk_sk(sk)->data_ready = input; - netlink_insert(sk, 0); + if (netlink_insert(sk, 0)) + goto out_sock_release; + + nlk = nlk_sk(sk); + nlk->flags |= NETLINK_KERNEL_SOCKET; + + netlink_table_grab(); + nl_table[unit].groups = groups; + nl_table[unit].listeners = listeners; + nl_table[unit].module = module; + nl_table[unit].registered = 1; + netlink_table_ungrab(); + return sk; + +out_sock_release: + kfree(listeners); + sock_release(sock); + return NULL; } -void netlink_set_nonroot(int protocol, unsigned flags) +void netlink_set_nonroot(int protocol, unsigned int flags) { - if ((unsigned)protocol < MAX_LINKS) - nl_nonroot[protocol] = flags; + if ((unsigned int)protocol < MAX_LINKS) + nl_table[protocol].nl_nonroot = flags; } static void netlink_destroy_callback(struct netlink_callback *cb) @@ -910,23 +1339,22 @@ static void netlink_destroy_callback(struct netlink_callback *cb) static int netlink_dump(struct sock *sk) { - struct netlink_opt *nlk = nlk_sk(sk); + struct netlink_sock *nlk = nlk_sk(sk); struct netlink_callback *cb; struct sk_buff *skb; struct nlmsghdr *nlh; - int len; + int len, err = -ENOBUFS; skb = sock_rmalloc(sk, 
NLMSG_GOODSIZE, 0, GFP_KERNEL);
 	if (!skb)
-		return -ENOBUFS;
+		goto errout;
 
 	spin_lock(&nlk->cb_lock);
 
 	cb = nlk->cb;
 	if (cb == NULL) {
-		spin_unlock(&nlk->cb_lock);
-		kfree_skb(skb);
-		return -EINVAL;
+		err = -EINVAL;
+		goto errout_skb;
 	}
 
 	len = cb->dump(skb, cb);
 
@@ -938,19 +1366,28 @@
 		return 0;
 	}
 
-	nlh = __nlmsg_put(skb, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq, NLMSG_DONE, sizeof(int));
-	nlh->nlmsg_flags |= NLM_F_MULTI;
-	memcpy(NLMSG_DATA(nlh), &len, sizeof(len));
+	nlh = nlmsg_put_answer(skb, cb, NLMSG_DONE, sizeof(len), NLM_F_MULTI);
+	if (!nlh)
+		goto errout_skb;
+
+	memcpy(nlmsg_data(nlh), &len, sizeof(len));
+
 	skb_queue_tail(&sk->sk_receive_queue, skb);
 	sk->sk_data_ready(sk, skb->len);
 
-	cb->done(cb);
+	if (cb->done)
+		cb->done(cb);
 	nlk->cb = NULL;
 	spin_unlock(&nlk->cb_lock);
 
 	netlink_destroy_callback(cb);
-	sock_put(sk);
 	return 0;
+
+errout_skb:
+	spin_unlock(&nlk->cb_lock);
+	kfree_skb(skb);
+errout:
+	return err;
 }
 
 int netlink_dump_start(struct sock *ssk, struct sk_buff *skb,
@@ -960,13 +1397,12 @@
 {
 	struct netlink_callback *cb;
 	struct sock *sk;
-	struct netlink_opt *nlk;
+	struct netlink_sock *nlk;
 
-	cb = kmalloc(sizeof(*cb), GFP_KERNEL);
+	cb = kzalloc(sizeof(*cb), GFP_KERNEL);
 	if (cb == NULL)
 		return -ENOBUFS;
 
-	memset(cb, 0, sizeof(*cb));
 	cb->dump = dump;
 	cb->done = done;
 	cb->nlh = nlh;
@@ -991,6 +1427,7 @@
 	spin_unlock(&nlk->cb_lock);
 
 	netlink_dump(sk);
+	sock_put(sk);
 	return 0;
 }
 
@@ -999,14 +1436,13 @@
 	struct sk_buff *skb;
 	struct nlmsghdr *rep;
 	struct nlmsgerr *errmsg;
-	int size;
+	size_t payload = sizeof(*errmsg);
 
-	if (err == 0)
-		size = NLMSG_SPACE(sizeof(struct nlmsgerr));
-	else
-		size = NLMSG_SPACE(4 + NLMSG_ALIGN(nlh->nlmsg_len));
+	/* error messages get the original request appended */
+	if (err)
+		payload += nlmsg_len(nlh);
 
-	skb = alloc_skb(size, GFP_KERNEL);
+	skb = nlmsg_new(payload, GFP_KERNEL);
 	if (!skb) {
 		struct sock *sk;
 
@@ -1021,84 +1457,157 @@
 	}
 
 	rep = __nlmsg_put(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
-			  NLMSG_ERROR, sizeof(struct nlmsgerr));
-	errmsg = NLMSG_DATA(rep);
+			  NLMSG_ERROR, sizeof(struct nlmsgerr), 0);
+	errmsg = nlmsg_data(rep);
 	errmsg->error = err;
-	memcpy(&errmsg->msg, nlh, err ? nlh->nlmsg_len : sizeof(struct nlmsghdr));
+	memcpy(&errmsg->msg, nlh, err ? nlh->nlmsg_len : sizeof(*nlh));
 	netlink_unicast(in_skb->sk, skb, NETLINK_CB(in_skb).pid, MSG_DONTWAIT);
 }
 
+static int netlink_rcv_skb(struct sk_buff *skb, int (*cb)(struct sk_buff *,
+						     struct nlmsghdr *, int *))
+{
+	struct nlmsghdr *nlh;
+	int err;
 
-#ifdef NL_EMULATE_DEV
+	while (skb->len >= nlmsg_total_size(0)) {
+		nlh = (struct nlmsghdr *) skb->data;
 
-static rwlock_t nl_emu_lock = RW_LOCK_UNLOCKED;
+		if (nlh->nlmsg_len < NLMSG_HDRLEN || skb->len < nlh->nlmsg_len)
+			return 0;
+
+		if (cb(skb, nlh, &err) < 0) {
+			/* Not an error, but we have to interrupt processing
+			 * here. Note: that in this case we do not pull
+			 * message from skb, it will be processed later.
+			 */
+			if (err == 0)
+				return -1;
+			netlink_ack(skb, nlh, err);
+		} else if (nlh->nlmsg_flags & NLM_F_ACK)
+			netlink_ack(skb, nlh, 0);
+
+		netlink_queue_skip(nlh, skb);
+	}
 
-/*
- *	Backward compatibility.
- */ - -int netlink_attach(int unit, int (*function)(int, struct sk_buff *skb)) -{ - struct sock *sk = netlink_kernel_create(unit, NULL); - if (sk == NULL) - return -ENOBUFS; - nlk_sk(sk)->handler = function; - write_lock_bh(&nl_emu_lock); - netlink_kernel[unit] = sk->sk_socket; - write_unlock_bh(&nl_emu_lock); return 0; } -void netlink_detach(int unit) +/** + * nelink_run_queue - Process netlink receive queue. + * @sk: Netlink socket containing the queue + * @qlen: Place to store queue length upon entry + * @cb: Callback function invoked for each netlink message found + * + * Processes as much as there was in the queue upon entry and invokes + * a callback function for each netlink message found. The callback + * function may refuse a message by returning a negative error code + * but setting the error pointer to 0 in which case this function + * returns with a qlen != 0. + * + * qlen must be initialized to 0 before the initial entry, afterwards + * the function may be called repeatedly until qlen reaches 0. + */ +void netlink_run_queue(struct sock *sk, unsigned int *qlen, + int (*cb)(struct sk_buff *, struct nlmsghdr *, int *)) { - struct socket *sock; + struct sk_buff *skb; - write_lock_bh(&nl_emu_lock); - sock = netlink_kernel[unit]; - netlink_kernel[unit] = NULL; - write_unlock_bh(&nl_emu_lock); + if (!*qlen || *qlen > skb_queue_len(&sk->sk_receive_queue)) + *qlen = skb_queue_len(&sk->sk_receive_queue); + + for (; *qlen; (*qlen)--) { + skb = skb_dequeue(&sk->sk_receive_queue); + if (netlink_rcv_skb(skb, cb)) { + if (skb->len) + skb_queue_head(&sk->sk_receive_queue, skb); + else { + kfree_skb(skb); + (*qlen)--; + } + break; + } - sock_release(sock); + kfree_skb(skb); + } } -int netlink_post(int unit, struct sk_buff *skb) +/** + * netlink_queue_skip - Skip netlink message while processing queue. + * @nlh: Netlink message to be skipped + * @skb: Socket buffer containing the netlink messages. + * + * Pulls the given netlink message off the socket buffer so the next + * call to netlink_queue_run() will not reconsider the message. 
+ */
+void netlink_queue_skip(struct nlmsghdr *nlh, struct sk_buff *skb)
 {
-	struct socket *sock;
+	int msglen = NLMSG_ALIGN(nlh->nlmsg_len);
 
-	read_lock(&nl_emu_lock);
-	sock = netlink_kernel[unit];
-	if (sock) {
-		struct sock *sk = sock->sk;
-		memset(skb->cb, 0, sizeof(skb->cb));
-		sock_hold(sk);
-		read_unlock(&nl_emu_lock);
+	if (msglen > skb->len)
+		msglen = skb->len;
 
-		netlink_broadcast(sk, skb, 0, ~0, GFP_ATOMIC);
+	skb_pull(skb, msglen);
+}
 
-		sock_put(sk);
-		return 0;
+/**
+ * nlmsg_notify - send a notification netlink message
+ * @sk: netlink socket to use
+ * @skb: notification message
+ * @pid: destination netlink pid for reports or 0
+ * @group: destination multicast group or 0
+ * @report: 1 to report back, 0 to disable
+ * @flags: allocation flags
+ */
+int nlmsg_notify(struct sock *sk, struct sk_buff *skb, u32 pid,
+		 unsigned int group, int report, gfp_t flags)
+{
+	int err = 0;
+
+	if (group) {
+		int exclude_pid = 0;
+
+		if (report) {
+			atomic_inc(&skb->users);
+			exclude_pid = pid;
+		}
+
+		/* errors reported via destination sk->sk_err */
+		nlmsg_multicast(sk, skb, exclude_pid, group, flags);
 	}
-	read_unlock(&nl_emu_lock);
-	return -EUNATCH;
-}
-#endif
+
+	if (report)
+		err = nlmsg_unicast(sk, skb, pid);
+
+	return err;
+}
 
 #ifdef CONFIG_PROC_FS
+struct nl_seq_iter {
+	int link;
+	int hash_idx;
+};
+
 static struct sock *netlink_seq_socket_idx(struct seq_file *seq, loff_t pos)
 {
-	long i;
+	struct nl_seq_iter *iter = seq->private;
+	int i, j;
 	struct sock *s;
 	struct hlist_node *node;
 	loff_t off = 0;
 
 	for (i=0; i<MAX_LINKS; i++) {
-		sk_for_each(s, node, &nl_table[i]) {
-			if (off == pos) {
-				seq->private = (void *) i;
-				return s;
+		struct nl_pid_hash *hash = &nl_table[i].hash;
+
+		for (j = 0; j <= hash->mask; j++) {
+			sk_for_each(s, node, &hash->table[j]) {
+				if (off == pos) {
+					iter->link = i;
+					iter->hash_idx = j;
+					return s;
+				}
+				++off;
 			}
-			++off;
 		}
 	}
 	return NULL;
 }
 
@@ -1113,6 +1622,8 @@ static void *netlink_seq_start(struct seq_file *seq, loff_t *pos)
 static void *netlink_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 {
 	struct sock *s;
+	struct nl_seq_iter *iter;
+	int i, j;
 
 	++*pos;
 
@@ -1120,18 +1631,29 @@ static void *netlink_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 		return netlink_seq_socket_idx(seq, 0);
 
 	s = sk_next(v);
-	if (!s) {
-		long i = (long)seq->private;
+	if (s)
+		return s;
+
+	iter = seq->private;
+	i = iter->link;
+	j = iter->hash_idx + 1;
 
-		while (++i < MAX_LINKS) {
-			s = sk_head(&nl_table[i]);
+	do {
+		struct nl_pid_hash *hash = &nl_table[i].hash;
+
+		for (; j <= hash->mask; j++) {
+			s = sk_head(&hash->table[j]);
 			if (s) {
-				seq->private = (void *) i;
-				break;
+				iter->link = i;
+				iter->hash_idx = j;
+				return s;
 			}
 		}
-	}
-	return s;
+
+		j = 0;
+	} while (++i < MAX_LINKS);
+
+	return NULL;
 }
 
 static void netlink_seq_stop(struct seq_file *seq, void *v)
@@ -1148,13 +1670,13 @@
 			 "Rmem     Wmem     Dump     Locks\n");
 	else {
 		struct sock *s = v;
-		struct netlink_opt *nlk = nlk_sk(s);
+		struct netlink_sock *nlk = nlk_sk(s);
 
 		seq_printf(seq, "%p %-3d %-6d %08x %-8d %-8d %p %d\n",
 			   s,
 			   s->sk_protocol,
 			   nlk->pid,
-			   nlk->groups,
+			   nlk->groups ? 
(u32)nlk->groups[0] : 0, atomic_read(&s->sk_rmem_alloc), atomic_read(&s->sk_wmem_alloc), nlk->cb, @@ -1175,7 +1697,23 @@ static struct seq_operations netlink_seq_ops = { static int netlink_seq_open(struct inode *inode, struct file *file) { - return seq_open(file, &netlink_seq_ops); + struct seq_file *seq; + struct nl_seq_iter *iter; + int err; + + iter = kzalloc(sizeof(*iter), GFP_KERNEL); + if (!iter) + return -ENOMEM; + + err = seq_open(file, &netlink_seq_ops); + if (err) { + kfree(iter); + return err; + } + + seq = file->private_data; + seq->private = iter; + return 0; } static struct file_operations netlink_seq_fops = { @@ -1183,22 +1721,22 @@ static struct file_operations netlink_seq_fops = { .open = netlink_seq_open, .read = seq_read, .llseek = seq_lseek, - .release = seq_release, + .release = seq_release_private, }; #endif int netlink_register_notifier(struct notifier_block *nb) { - return notifier_chain_register(&netlink_chain, nb); + return atomic_notifier_chain_register(&netlink_chain, nb); } int netlink_unregister_notifier(struct notifier_block *nb) { - return notifier_chain_unregister(&netlink_chain, nb); + return atomic_notifier_chain_unregister(&netlink_chain, nb); } -static struct proto_ops netlink_ops = { +static const struct proto_ops netlink_ops = { .family = PF_NETLINK, .owner = THIS_MODULE, .release = netlink_release, @@ -1211,8 +1749,8 @@ static struct proto_ops netlink_ops = { .ioctl = sock_no_ioctl, .listen = sock_no_listen, .shutdown = sock_no_shutdown, - .setsockopt = sock_no_setsockopt, - .getsockopt = sock_no_getsockopt, + .setsockopt = netlink_setsockopt, + .getsockopt = netlink_getsockopt, .sendmsg = netlink_sendmsg, .recvmsg = netlink_recvmsg, .mmap = sock_no_mmap, @@ -1228,34 +1766,64 @@ static struct net_proto_family netlink_family_ops = { static int __init netlink_proto_init(void) { struct sk_buff *dummy_skb; + int i; + unsigned long max; + unsigned int order; + int err = proto_register(&netlink_proto, 0); - if (sizeof(struct netlink_skb_parms) > sizeof(dummy_skb->cb)) { - printk(KERN_CRIT "netlink_init: panic\n"); - return -1; + if (err != 0) + goto out; + + BUILD_BUG_ON(sizeof(struct netlink_skb_parms) > sizeof(dummy_skb->cb)); + + nl_table = kcalloc(MAX_LINKS, sizeof(*nl_table), GFP_KERNEL); + if (!nl_table) + goto panic; + + if (num_physpages >= (128 * 1024)) + max = num_physpages >> (21 - PAGE_SHIFT); + else + max = num_physpages >> (23 - PAGE_SHIFT); + + order = get_bitmask_order(max) - 1 + PAGE_SHIFT; + max = (1UL << order) / sizeof(struct hlist_head); + order = get_bitmask_order(max > UINT_MAX ? UINT_MAX : max) - 1; + + for (i = 0; i < MAX_LINKS; i++) { + struct nl_pid_hash *hash = &nl_table[i].hash; + + hash->table = nl_pid_hash_alloc(1 * sizeof(*hash->table)); + if (!hash->table) { + while (i-- > 0) + nl_pid_hash_free(nl_table[i].hash.table, + 1 * sizeof(*hash->table)); + kfree(nl_table); + goto panic; + } + memset(hash->table, 0, 1 * sizeof(*hash->table)); + hash->max_shift = order; + hash->shift = 0; + hash->mask = 0; + hash->rehash_time = jiffies; } + sock_register(&netlink_family_ops); #ifdef CONFIG_PROC_FS proc_net_fops_create("netlink", 0, &netlink_seq_fops); #endif /* The netlink device handler may be needed early. 
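 	 * rtnetlink_init() is therefore called from this core_initcall
 	 * instead of from a later initcall of its own.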
*/ rtnetlink_init(); - return 0; -} - -static void __exit netlink_proto_exit(void) -{ - sock_unregister(PF_NETLINK); - proc_net_remove("netlink"); +out: + return err; +panic: + panic("netlink_init: Cannot allocate nl_table\n"); } core_initcall(netlink_proto_init); -module_exit(netlink_proto_exit); - -MODULE_LICENSE("GPL"); - -MODULE_ALIAS_NETPROTO(PF_NETLINK); EXPORT_SYMBOL(netlink_ack); +EXPORT_SYMBOL(netlink_run_queue); +EXPORT_SYMBOL(netlink_queue_skip); EXPORT_SYMBOL(netlink_broadcast); EXPORT_SYMBOL(netlink_dump_start); EXPORT_SYMBOL(netlink_kernel_create); @@ -1264,9 +1832,4 @@ EXPORT_SYMBOL(netlink_set_err); EXPORT_SYMBOL(netlink_set_nonroot); EXPORT_SYMBOL(netlink_unicast); EXPORT_SYMBOL(netlink_unregister_notifier); - -#if defined(CONFIG_NETLINK_DEV) || defined(CONFIG_NETLINK_DEV_MODULE) -EXPORT_SYMBOL(netlink_attach); -EXPORT_SYMBOL(netlink_detach); -EXPORT_SYMBOL(netlink_post); -#endif +EXPORT_SYMBOL(nlmsg_notify);
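
For reference, the two SOL_NETLINK socket options wired up above
(netlink_setsockopt()/netlink_getsockopt()) replace the old 32-bit
nl_groups bitmask as the way to manage multicast membership. The sketch
below is illustrative only and not part of the patch: a minimal
userspace program, assuming the constants this patch handles
(SOL_NETLINK, NETLINK_ADD_MEMBERSHIP, NETLINK_PKTINFO and struct
nl_pktinfo from <linux/netlink.h>) and picking NETLINK_ROUTE's
RTNLGRP_LINK group purely as an example.

/*
 * Join one netlink multicast group by number (not by mask bit, which is
 * what lets netlink grow past 32 groups), enable NETLINK_PKTINFO, and
 * read the destination group back from the control message that
 * netlink_cmsg_recv_pktinfo() above attaches.
 */
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <linux/netlink.h>
#include <linux/rtnetlink.h>

#ifndef SOL_NETLINK
#define SOL_NETLINK 270		/* not exposed by older libc headers */
#endif

int main(void)
{
	struct sockaddr_nl sa = { .nl_family = AF_NETLINK };
	int group = RTNLGRP_LINK;	/* group *number*, not 1 << (n - 1) */
	int on = 1;
	char buf[4096], ctrl[CMSG_SPACE(sizeof(struct nl_pktinfo))];
	struct iovec iov = { buf, sizeof(buf) };
	struct msghdr msg = { 0 };
	struct cmsghdr *cm;
	int fd;

	fd = socket(AF_NETLINK, SOCK_DGRAM, NETLINK_ROUTE);
	if (fd < 0 || bind(fd, (struct sockaddr *)&sa, sizeof(sa)) < 0)
		return 1;

	/* both options are handled by netlink_setsockopt() in the patch */
	if (setsockopt(fd, SOL_NETLINK, NETLINK_ADD_MEMBERSHIP,
		       &group, sizeof(group)) < 0)
		return 1;
	if (setsockopt(fd, SOL_NETLINK, NETLINK_PKTINFO,
		       &on, sizeof(on)) < 0)
		return 1;

	msg.msg_iov = &iov;
	msg.msg_iovlen = 1;
	msg.msg_control = ctrl;
	msg.msg_controllen = sizeof(ctrl);

	if (recvmsg(fd, &msg, 0) < 0)
		return 1;

	for (cm = CMSG_FIRSTHDR(&msg); cm; cm = CMSG_NXTHDR(&msg, cm))
		if (cm->cmsg_level == SOL_NETLINK &&
		    cm->cmsg_type == NETLINK_PKTINFO) {
			struct nl_pktinfo pi;

			memcpy(&pi, CMSG_DATA(cm), sizeof(pi));
			printf("message delivered to group %u\n", pi.group);
		}
	return 0;
}

Passing the group number rather than a mask bit is what allows
NETLINK_ADD_MEMBERSHIP to address groups beyond 32; on the kernel side
the membership lands in the nlk->groups bitmap allocated by
netlink_alloc_groups() above.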