*/
#include <linux/module.h>
-#include <linux/config.h>
#include <linux/kernel.h>
-#include <linux/major.h>
#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <net/sock.h>
-#include <linux/tcp.h>
+#include <net/tcp_states.h>
#include <net/af_unix.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <net/checksum.h>
#include <linux/security.h>
#include <linux/vs_context.h>
-#include <linux/vs_network.h>
#include <linux/vs_limit.h>
-int sysctl_unix_max_dgram_qlen = 10;
-
-kmem_cache_t *unix_sk_cachep;
+int sysctl_unix_max_dgram_qlen __read_mostly = 10;
struct hlist_head unix_socket_table[UNIX_HASH_SIZE + 1];
-rwlock_t unix_table_lock = RW_LOCK_UNLOCKED;
+DEFINE_SPINLOCK(unix_table_lock);
static atomic_t unix_nr_socks = ATOMIC_INIT(0);
#define unix_sockets_unbound (&unix_socket_table[UNIX_HASH_SIZE])
#define UNIX_ABSTRACT(sk) (unix_sk(sk)->addr->hash != UNIX_HASH_SIZE)
+#ifdef CONFIG_SECURITY_NETWORK
+static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
+{
+ memcpy(UNIXSID(skb), &scm->secid, sizeof(u32));
+}
+
+static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
+{
+ scm->secid = *UNIXSID(skb);
+}
+#else
+static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
+{ }
+
+static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
+{ }
+#endif /* CONFIG_SECURITY_NETWORK */
+
/*
* SMP locking strategy:
- * hash table is protected with rwlock unix_table_lock
+ * hash table is protected with spinlock unix_table_lock
* each socket state is protected by separate rwlock.
*/
-static inline unsigned unix_hash_fold(unsigned hash)
+static inline unsigned unix_hash_fold(__wsum n)
{
+ unsigned hash = (__force unsigned)n;
hash ^= hash>>16;
hash ^= hash>>8;
return hash&(UNIX_HASH_SIZE-1);
if (!sunaddr || sunaddr->sun_family != AF_UNIX)
return -EINVAL;
if (sunaddr->sun_path[0]) {
+ /*
+ * This may look like an off by one error but it is a bit more
+ * subtle. 108 is the longest valid AF_UNIX path for a binding.
+ * sun_path[108] doesnt as such exist. However in kernel space
+ * we are guaranteed that it is a valid memory location in our
+ * kernel address buffer.
+ */
((char *)sunaddr)[len]=0;
len = strlen(sunaddr->sun_path)+1+sizeof(short);
return len;
static inline void unix_remove_socket(struct sock *sk)
{
- write_lock(&unix_table_lock);
+ spin_lock(&unix_table_lock);
__unix_remove_socket(sk);
- write_unlock(&unix_table_lock);
+ spin_unlock(&unix_table_lock);
}
static inline void unix_insert_socket(struct hlist_head *list, struct sock *sk)
{
- write_lock(&unix_table_lock);
+ spin_lock(&unix_table_lock);
__unix_insert_socket(list, sk);
- write_unlock(&unix_table_lock);
+ spin_unlock(&unix_table_lock);
}
static struct sock *__unix_find_socket_byname(struct sockaddr_un *sunname,
sk_for_each(s, node, &unix_socket_table[hash ^ type]) {
struct unix_sock *u = unix_sk(s);
+ if (!nx_check(s->sk_nid, VS_WATCH_P|VS_IDENT))
+ continue;
if (u->addr->len == len &&
!memcmp(u->addr->name, sunname, len))
goto found;
{
struct sock *s;
- read_lock(&unix_table_lock);
+ spin_lock(&unix_table_lock);
s = __unix_find_socket_byname(sunname, len, type, hash);
if (s)
sock_hold(s);
- read_unlock(&unix_table_lock);
+ spin_unlock(&unix_table_lock);
return s;
}
struct sock *s;
struct hlist_node *node;
- read_lock(&unix_table_lock);
+ spin_lock(&unix_table_lock);
sk_for_each(s, node,
&unix_socket_table[i->i_ino & (UNIX_HASH_SIZE - 1)]) {
struct dentry *dentry = unix_sk(s)->dentry;
}
s = NULL;
found:
- read_unlock(&unix_table_lock);
+ spin_unlock(&unix_table_lock);
return s;
}
* may receive messages only from that peer. */
static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
{
- if (skb_queue_len(&sk->sk_receive_queue)) {
+ if (!skb_queue_empty(&sk->sk_receive_queue)) {
skb_queue_purge(&sk->sk_receive_queue);
wake_up_interruptible_all(&unix_sk(sk)->peer_wait);
mntput(mnt);
}
- vx_sock_dec(sk);
- clr_vx_info(&sk->sk_vx_info);
- clr_nx_info(&sk->sk_nx_info);
sock_put(sk);
/* ---- Socket is dead now and most probably destroyed ---- */
static int unix_seqpacket_sendmsg(struct kiocb *, struct socket *,
struct msghdr *, size_t);
-static struct proto_ops unix_stream_ops = {
+static const struct proto_ops unix_stream_ops = {
.family = PF_UNIX,
.owner = THIS_MODULE,
.release = unix_release,
.sendpage = sock_no_sendpage,
};
-static struct proto_ops unix_dgram_ops = {
+static const struct proto_ops unix_dgram_ops = {
.family = PF_UNIX,
.owner = THIS_MODULE,
.release = unix_release,
.sendpage = sock_no_sendpage,
};
-static struct proto_ops unix_seqpacket_ops = {
+static const struct proto_ops unix_seqpacket_ops = {
.family = PF_UNIX,
.owner = THIS_MODULE,
.release = unix_release,
.sendpage = sock_no_sendpage,
};
+static struct proto unix_proto = {
+ .name = "UNIX",
+ .owner = THIS_MODULE,
+ .obj_size = sizeof(struct unix_sock),
+};
+
+/*
+ * AF_UNIX sockets do not interact with hardware, hence they
+ * dont trigger interrupts - so it's safe for them to have
+ * bh-unsafe locking for their sk_receive_queue.lock. Split off
+ * this special lock-class by reinitializing the spinlock key:
+ */
+static struct lock_class_key af_unix_sk_receive_queue_lock_key;
+
static struct sock * unix_create1(struct socket *sock)
{
struct sock *sk = NULL;
struct unix_sock *u;
- if (atomic_read(&unix_nr_socks) >= 2*files_stat.max_files)
+ if (atomic_read(&unix_nr_socks) >= 2*get_max_files())
goto out;
- sk = sk_alloc(PF_UNIX, GFP_KERNEL, sizeof(struct unix_sock),
- unix_sk_cachep);
+ sk = sk_alloc(PF_UNIX, GFP_KERNEL, &unix_proto, 1);
if (!sk)
goto out;
atomic_inc(&unix_nr_socks);
sock_init_data(sock,sk);
- sk_set_owner(sk, THIS_MODULE);
-
- set_vx_info(&sk->sk_vx_info, current->vx_info);
- sk->sk_xid = vx_current_xid();
- vx_sock_inc(sk);
- set_nx_info(&sk->sk_nx_info, current->nx_info);
+ lockdep_set_class(&sk->sk_receive_queue.lock,
+ &af_unix_sk_receive_queue_lock_key);
sk->sk_write_space = unix_write_space;
sk->sk_max_ack_backlog = sysctl_unix_max_dgram_qlen;
u = unix_sk(sk);
u->dentry = NULL;
u->mnt = NULL;
- rwlock_init(&u->lock);
+ spin_lock_init(&u->lock);
atomic_set(&u->inflight, sock ? 0 : -1);
- init_MUTEX(&u->readsem); /* single task reading lock */
+ mutex_init(&u->readlock); /* single task reading lock */
init_waitqueue_head(&u->peer_wait);
unix_insert_socket(unix_sockets_unbound, sk);
out:
struct unix_address * addr;
int err;
- down(&u->readsem);
+ mutex_lock(&u->readlock);
err = 0;
if (u->addr)
goto out;
err = -ENOMEM;
- addr = kmalloc(sizeof(*addr) + sizeof(short) + 16, GFP_KERNEL);
+ addr = kzalloc(sizeof(*addr) + sizeof(short) + 16, GFP_KERNEL);
if (!addr)
goto out;
- memset(addr, 0, sizeof(*addr) + sizeof(short) + 16);
addr->name->sun_family = AF_UNIX;
atomic_set(&addr->refcnt, 1);
addr->len = sprintf(addr->name->sun_path+1, "%05x", ordernum) + 1 + sizeof(short);
addr->hash = unix_hash_fold(csum_partial((void*)addr->name, addr->len, 0));
- write_lock(&unix_table_lock);
+ spin_lock(&unix_table_lock);
ordernum = (ordernum+1)&0xFFFFF;
if (__unix_find_socket_byname(addr->name, addr->len, sock->type,
addr->hash)) {
- write_unlock(&unix_table_lock);
+ spin_unlock(&unix_table_lock);
/* Sanity yield. It is unusual case, but yet... */
if (!(ordernum&0xFF))
yield();
__unix_remove_socket(sk);
u->addr = addr;
__unix_insert_socket(&unix_socket_table[addr->hash], sk);
- write_unlock(&unix_table_lock);
+ spin_unlock(&unix_table_lock);
err = 0;
-out: up(&u->readsem);
+out: mutex_unlock(&u->readlock);
return err;
}
err = path_lookup(sunname->sun_path, LOOKUP_FOLLOW, &nd);
if (err)
goto fail;
- err = permission(nd.dentry->d_inode,MAY_WRITE, &nd);
+ err = vfs_permission(&nd, MAY_WRITE);
if (err)
goto put_fail;
goto out;
addr_len = err;
- down(&u->readsem);
+ mutex_lock(&u->readlock);
err = -EINVAL;
if (u->addr)
err = path_lookup(sunaddr->sun_path, LOOKUP_PARENT, &nd);
if (err)
goto out_mknod_parent;
- /*
- * Yucky last component or no last component at all?
- * (foo/., foo/.., /////)
- */
- err = -EEXIST;
- if (nd.last_type != LAST_NORM)
- goto out_mknod;
- /*
- * Lock the directory.
- */
- down(&nd.dentry->d_inode->i_sem);
- /*
- * Do the final lookup.
- */
- dentry = lookup_hash(&nd.last, nd.dentry);
+
+ dentry = lookup_create(&nd, 0);
err = PTR_ERR(dentry);
if (IS_ERR(dentry))
goto out_mknod_unlock;
- err = -ENOENT;
- /*
- * Special case - lookup gave negative, but... we had foo/bar/
- * From the vfs_mknod() POV we just have a negative dentry -
- * all is fine. Let's be bastards - you had / on the end, you've
- * been asking for (non-existent) directory. -ENOENT for you.
- */
- if (nd.last.name[nd.last.len] && !dentry->d_inode)
- goto out_mknod_dput;
+
/*
* All right, let's create it.
*/
mode = S_IFSOCK |
(SOCK_INODE(sock)->i_mode & ~current->fs->umask);
- err = vfs_mknod(nd.dentry->d_inode, dentry, mode, 0);
+ err = vfs_mknod(nd.dentry->d_inode, dentry, mode, 0, NULL);
if (err)
goto out_mknod_dput;
- up(&nd.dentry->d_inode->i_sem);
+ mutex_unlock(&nd.dentry->d_inode->i_mutex);
dput(nd.dentry);
nd.dentry = dentry;
addr->hash = UNIX_HASH_SIZE;
}
- write_lock(&unix_table_lock);
+ spin_lock(&unix_table_lock);
if (!sunaddr->sun_path[0]) {
err = -EADDRINUSE;
__unix_insert_socket(list, sk);
out_unlock:
- write_unlock(&unix_table_lock);
+ spin_unlock(&unix_table_lock);
out_up:
- up(&u->readsem);
+ mutex_unlock(&u->readlock);
out:
return err;
out_mknod_dput:
dput(dentry);
out_mknod_unlock:
- up(&nd.dentry->d_inode->i_sem);
-out_mknod:
+ mutex_unlock(&nd.dentry->d_inode->i_mutex);
path_release(&nd);
out_mknod_parent:
if (err==-EEXIST)
goto out;
alen = err;
- if (test_bit(SOCK_PASS_CRED, &sock->flags) && !unix_sk(sk)->addr &&
- (err = unix_autobind(sock)) != 0)
+ if (test_bit(SOCK_PASSCRED, &sock->flags) &&
+ !unix_sk(sk)->addr && (err = unix_autobind(sock)) != 0)
goto out;
other=unix_find_other(sunaddr, alen, sock->type, hash, &err);
goto out;
addr_len = err;
- if (test_bit(SOCK_PASS_CRED, &sock->flags)
+ if (test_bit(SOCK_PASSCRED, &sock->flags)
&& !u->addr && (err = unix_autobind(sock)) != 0)
goto out;
goto out_unlock;
}
- unix_state_wlock(sk);
+ unix_state_wlock_nested(sk);
if (sk->sk_state != st) {
unix_state_wunlock(sk);
/* Set credentials */
sk->sk_peercred = other->sk_peercred;
- sock_hold(newsk);
- unix_peer(sk) = newsk;
sock->state = SS_CONNECTED;
sk->sk_state = TCP_ESTABLISHED;
+ sock_hold(newsk);
+
+ smp_mb__after_atomic_inc(); /* sock_hold() does an atomic_inc() */
+ unix_peer(sk) = newsk;
unix_state_wunlock(sk);
goto out;
}
- if (test_bit(SOCK_PASS_CRED, &sock->flags)
+ if (test_bit(SOCK_PASSCRED, &sock->flags)
&& !u->addr && (err = unix_autobind(sock)) != 0)
goto out;
memcpy(UNIXCREDS(skb), &siocb->scm->creds, sizeof(struct ucred));
if (siocb->scm->fp)
unix_attach_fds(siocb->scm, skb);
+ unix_get_secdata(siocb->scm, skb);
skb->h.raw = skb->data;
err = memcpy_fromiovec(skb_put(skb,len), msg->msg_iov, len);
} else {
sunaddr = NULL;
err = -ENOTCONN;
- other = unix_peer_get(sk);
+ other = unix_peer(sk);
if (!other)
goto out_err;
}
while(sent < len)
{
/*
- * Optimisation for the fact that under 0.01% of X messages typically
- * need breaking up.
+ * Optimisation for the fact that under 0.01% of X
+ * messages typically need breaking up.
*/
- size=len-sent;
+ size = len-sent;
/* Keep two messages in the pipe so it schedules better */
- if (size > sk->sk_sndbuf / 2 - 64)
- size = sk->sk_sndbuf / 2 - 64;
+ if (size > ((sk->sk_sndbuf >> 1) - 64))
+ size = (sk->sk_sndbuf >> 1) - 64;
if (size > SKB_MAX_ALLOC)
size = SKB_MAX_ALLOC;
other->sk_data_ready(other, size);
sent+=size;
}
- sock_put(other);
scm_destroy(siocb->scm);
siocb->scm = NULL;
send_sig(SIGPIPE,current,0);
err = -EPIPE;
out_err:
- if (other)
- sock_put(other);
scm_destroy(siocb->scm);
siocb->scm = NULL;
return sent ? : err;
msg->msg_namelen = 0;
- down(&u->readsem);
+ mutex_lock(&u->readlock);
skb = skb_recv_datagram(sk, flags, noblock, &err);
if (!skb)
memset(&tmp_scm, 0, sizeof(tmp_scm));
}
siocb->scm->creds = *UNIXCREDS(skb);
+ unix_set_secdata(siocb->scm, skb);
if (!(flags & MSG_PEEK))
{
out_free:
skb_free_datagram(sk,skb);
out_unlock:
- up(&u->readsem);
+ mutex_unlock(&u->readlock);
out:
return err;
}
for (;;) {
prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
- if (skb_queue_len(&sk->sk_receive_queue) ||
+ if (!skb_queue_empty(&sk->sk_receive_queue) ||
sk->sk_err ||
(sk->sk_shutdown & RCV_SHUTDOWN) ||
signal_pending(current) ||
memset(&tmp_scm, 0, sizeof(tmp_scm));
}
- down(&u->readsem);
+ mutex_lock(&u->readlock);
do
{
err = -EAGAIN;
if (!timeo)
break;
- up(&u->readsem);
+ mutex_unlock(&u->readlock);
timeo = unix_stream_data_wait(sk, timeo);
err = sock_intr_errno(timeo);
goto out;
}
- down(&u->readsem);
+ mutex_lock(&u->readlock);
continue;
}
}
} while (size);
- up(&u->readsem);
+ mutex_unlock(&u->readlock);
scm_recv(sock, msg, siocb->scm, flags);
out:
return copied ? : err;
case SIOCINQ:
{
struct sk_buff *skb;
+
if (sk->sk_state == TCP_LISTEN) {
err = -EINVAL;
break;
}
spin_lock(&sk->sk_receive_queue.lock);
- skb = skb_peek(&sk->sk_receive_queue);
- if (skb)
- amount=skb->len;
+ if (sk->sk_type == SOCK_STREAM ||
+ sk->sk_type == SOCK_SEQPACKET) {
+ skb_queue_walk(&sk->sk_receive_queue, skb)
+ amount += skb->len;
+ } else {
+ skb = skb_peek(&sk->sk_receive_queue);
+ if (skb)
+ amount=skb->len;
+ }
spin_unlock(&sk->sk_receive_queue.lock);
err = put_user(amount, (int __user *)arg);
break;
}
default:
- err = dev_ioctl(cmd, (void __user *)arg);
+ err = -ENOIOCTLCMD;
break;
}
return err;
mask |= POLLERR;
if (sk->sk_shutdown == SHUTDOWN_MASK)
mask |= POLLHUP;
+ if (sk->sk_shutdown & RCV_SHUTDOWN)
+ mask |= POLLRDHUP;
/* readable? */
if (!skb_queue_empty(&sk->sk_receive_queue) ||
static void *unix_seq_start(struct seq_file *seq, loff_t *pos)
{
- read_lock(&unix_table_lock);
+ spin_lock(&unix_table_lock);
return *pos ? unix_seq_idx(seq->private, *pos - 1) : ((void *) 1);
}
static void unix_seq_stop(struct seq_file *seq, void *v)
{
- read_unlock(&unix_table_lock);
+ spin_unlock(&unix_table_lock);
}
static int unix_seq_show(struct seq_file *seq, void *v)
.owner = THIS_MODULE,
};
-#ifdef CONFIG_SYSCTL
-extern void unix_sysctl_register(void);
-extern void unix_sysctl_unregister(void);
-#else
-static inline void unix_sysctl_register(void) {}
-static inline void unix_sysctl_unregister(void) {}
-#endif
-
static int __init af_unix_init(void)
{
+ int rc = -1;
struct sk_buff *dummy_skb;
- if (sizeof(struct unix_skb_parms) > sizeof(dummy_skb->cb)) {
- printk(KERN_CRIT "%s: panic\n", __FUNCTION__);
- return -1;
+ BUILD_BUG_ON(sizeof(struct unix_skb_parms) > sizeof(dummy_skb->cb));
+
+ rc = proto_register(&unix_proto, 1);
+ if (rc != 0) {
+ printk(KERN_CRIT "%s: Cannot create unix_sock SLAB cache!\n",
+ __FUNCTION__);
+ goto out;
}
- /* allocate our sock slab cache */
- unix_sk_cachep = kmem_cache_create("unix_sock",
- sizeof(struct unix_sock), 0,
- SLAB_HWCACHE_ALIGN, NULL, NULL);
- if (!unix_sk_cachep)
- printk(KERN_CRIT
- "af_unix_init: Cannot create unix_sock SLAB cache!\n");
sock_register(&unix_family_ops);
#ifdef CONFIG_PROC_FS
proc_net_fops_create("unix", 0, &unix_seq_fops);
#endif
unix_sysctl_register();
- return 0;
+out:
+ return rc;
}
static void __exit af_unix_exit(void)
sock_unregister(PF_UNIX);
unix_sysctl_unregister();
proc_net_remove("unix");
- kmem_cache_destroy(unix_sk_cachep);
+ proto_unregister(&unix_proto);
}
module_init(af_unix_init);