vserver 2.0 rc7
[linux-2.6.git] / net / unix / af_unix.c
1 /*
2  * NET4:        Implementation of BSD Unix domain sockets.
3  *
4  * Authors:     Alan Cox, <alan.cox@linux.org>
5  *
6  *              This program is free software; you can redistribute it and/or
7  *              modify it under the terms of the GNU General Public License
8  *              as published by the Free Software Foundation; either version
9  *              2 of the License, or (at your option) any later version.
10  *
11  * Version:     $Id: af_unix.c,v 1.133 2002/02/08 03:57:19 davem Exp $
12  *
13  * Fixes:
14  *              Linus Torvalds  :       Assorted bug cures.
15  *              Niibe Yutaka    :       async I/O support.
16  *              Carsten Paeth   :       PF_UNIX check, address fixes.
17  *              Alan Cox        :       Limit size of allocated blocks.
18  *              Alan Cox        :       Fixed the stupid socketpair bug.
19  *              Alan Cox        :       BSD compatibility fine tuning.
20  *              Alan Cox        :       Fixed a bug in connect when interrupted.
21  *              Alan Cox        :       Sorted out a proper draft version of
22  *                                      file descriptor passing hacked up from
23  *                                      Mike Shaver's work.
24  *              Marty Leisner   :       Fixes to fd passing
25  *              Nick Nevin      :       recvmsg bugfix.
26  *              Alan Cox        :       Started proper garbage collector
27  *              Heiko EiBfeldt  :       Missing verify_area check
28  *              Alan Cox        :       Started POSIXisms
29  *              Andreas Schwab  :       Replace inode by dentry for proper
30  *                                      reference counting
31  *              Kirk Petersen   :       Made this a module
32  *          Christoph Rohland   :       Elegant non-blocking accept/connect algorithm.
33  *                                      Lots of bug fixes.
34  *           Alexey Kuznetsov   :       Repaired (I hope) bugs introduced
35  *                                      by above two patches.
36  *           Andrea Arcangeli   :       If possible we block in connect(2)
37  *                                      if the max backlog of the listen socket
38  *                                      has been reached. This won't break
39  *                                      old apps and it will avoid a huge number
40  *                                      of socks hashed (this for unix_gc()
41  *                                      performance reasons).
42  *                                      Security fix that limits the max
43  *                                      number of socks to 2*max_files and
44  *                                      the number of skb queueable in the
45  *                                      dgram receiver.
46  *              Artur Skawina   :       Hash function optimizations
47  *           Alexey Kuznetsov   :       Full scale SMP. Lot of bugs are introduced 8)
48  *            Malcolm Beattie   :       Set peercred for socketpair
49  *           Michal Ostrowski   :       Module initialization cleanup.
50  *           Arnaldo C. Melo    :       Remove MOD_{INC,DEC}_USE_COUNT,
51  *                                      the core infrastructure is doing that
52  *                                      for all net proto families now (2.5.69+)
53  *
54  *
55  * Known differences from reference BSD that was tested:
56  *
57  *      [TO FIX]
58  *      ECONNREFUSED is not returned from one end of a connected() socket to the
59  *              other the moment one end closes.
60  *      fstat() doesn't return st_dev=0, and give the blksize as high water mark
61  *              and a fake inode identifier (nor the BSD first socket fstat twice bug).
62  *      [NOT TO FIX]
63  *      accept() returns a path name even if the connecting socket has closed
64  *              in the meantime (BSD loses the path and gives up).
65  *      accept() returns 0 length path for an unbound connector. BSD returns 16
66  *              and a null first byte in the path (but not for gethost/peername - BSD bug ??)
67  *      socketpair(...SOCK_RAW..) doesn't panic the kernel.
68  *      BSD af_unix apparently has connect forgetting to block properly.
69  *              (need to check this with the POSIX spec in detail)
70  *
71  * Differences from 2.0.0-11-... (ANK)
72  *      Bug fixes and improvements.
73  *              - client shutdown killed server socket.
74  *              - removed all useless cli/sti pairs.
75  *
76  *      Semantic changes/extensions.
77  *              - generic control message passing.
78  *              - SCM_CREDENTIALS control message.
79  *              - "Abstract" (not FS based) socket bindings.
80  *                Abstract names are sequences of bytes (not zero terminated)
81  *                started by 0, so that this name space does not intersect
82  *                with BSD names.
83  */
84
85 #include <linux/module.h>
86 #include <linux/config.h>
87 #include <linux/kernel.h>
88 #include <linux/signal.h>
89 #include <linux/sched.h>
90 #include <linux/errno.h>
91 #include <linux/string.h>
92 #include <linux/stat.h>
93 #include <linux/dcache.h>
94 #include <linux/namei.h>
95 #include <linux/socket.h>
96 #include <linux/un.h>
97 #include <linux/fcntl.h>
98 #include <linux/termios.h>
99 #include <linux/sockios.h>
100 #include <linux/net.h>
101 #include <linux/in.h>
102 #include <linux/fs.h>
103 #include <linux/slab.h>
104 #include <asm/uaccess.h>
105 #include <linux/skbuff.h>
106 #include <linux/netdevice.h>
107 #include <net/sock.h>
108 #include <linux/tcp.h>
109 #include <net/af_unix.h>
110 #include <linux/proc_fs.h>
111 #include <linux/seq_file.h>
112 #include <net/scm.h>
113 #include <linux/init.h>
114 #include <linux/poll.h>
115 #include <linux/smp_lock.h>
116 #include <linux/rtnetlink.h>
117 #include <linux/mount.h>
118 #include <net/checksum.h>
119 #include <linux/security.h>
120 #include <linux/vs_context.h>
121 #include <linux/vs_network.h>
122 #include <linux/vs_limit.h>
123
/* Copied into sk_max_ack_backlog of every socket made by unix_create1(). */
124 int sysctl_unix_max_dgram_qlen = 10;
125
/*
 * Socket hash table.  Slots 0..UNIX_HASH_SIZE-1 hold bound sockets; the one
 * extra slot at index UNIX_HASH_SIZE is the chain of sockets that have no
 * name yet (see unix_sockets_unbound below).
 */
126 struct hlist_head unix_socket_table[UNIX_HASH_SIZE + 1];
/* Guards every chain of unix_socket_table. */
127 DEFINE_RWLOCK(unix_table_lock);
/*
 * Number of live unix socks: incremented in unix_create1(), decremented in
 * unix_sock_destructor(), and capped at 2*files_stat.max_files on creation.
 */
128 static atomic_t unix_nr_socks = ATOMIC_INIT(0);
129
/* Chain on which unix_create1() places a freshly created, unbound socket. */
130 #define unix_sockets_unbound    (&unix_socket_table[UNIX_HASH_SIZE])
131
/*
 * Non-zero when sk is bound to an abstract name.  unix_bind() stores
 * UNIX_HASH_SIZE in addr->hash for filesystem names, so any other value
 * means "abstract".  Dereferences addr, so sk must already be bound.
 */
132 #define UNIX_ABSTRACT(sk)       (unix_sk(sk)->addr->hash != UNIX_HASH_SIZE)
133
134 /*
135  *  SMP locking strategy:
136  *    hash table is protected with rwlock unix_table_lock
137  *    each socket state is protected by separate rwlock.
138  */
139
140 static inline unsigned unix_hash_fold(unsigned hash)
141 {
142         hash ^= hash>>16;
143         hash ^= hash>>8;
144         return hash&(UNIX_HASH_SIZE-1);
145 }
146
147 #define unix_peer(sk) (unix_sk(sk)->peer)
148
149 static inline int unix_our_peer(struct sock *sk, struct sock *osk)
150 {
151         return unix_peer(osk) == sk;
152 }
153
154 static inline int unix_may_send(struct sock *sk, struct sock *osk)
155 {
156         return (unix_peer(osk) == NULL || unix_our_peer(sk, osk));
157 }
158
159 static struct sock *unix_peer_get(struct sock *s)
160 {
161         struct sock *peer;
162
163         unix_state_rlock(s);
164         peer = unix_peer(s);
165         if (peer)
166                 sock_hold(peer);
167         unix_state_runlock(s);
168         return peer;
169 }
170
171 static inline void unix_release_addr(struct unix_address *addr)
172 {
173         if (atomic_dec_and_test(&addr->refcnt))
174                 kfree(addr);
175 }
176
177 /*
178  *      Check unix socket name:
179  *              - should be not zero length.
180  *              - if started by not zero, should be NULL terminated (FS object)
181  *              - if started by zero, it is abstract name.
182  */
183  
184 static int unix_mkname(struct sockaddr_un * sunaddr, int len, unsigned *hashp)
185 {
186         if (len <= sizeof(short) || len > sizeof(*sunaddr))
187                 return -EINVAL;
188         if (!sunaddr || sunaddr->sun_family != AF_UNIX)
189                 return -EINVAL;
190         if (sunaddr->sun_path[0]) {
191                 /*
192                  * This may look like an off by one error but it is a bit more
193                  * subtle. 108 is the longest valid AF_UNIX path for a binding.
194                  * sun_path[108] doesnt as such exist.  However in kernel space
195                  * we are guaranteed that it is a valid memory location in our
196                  * kernel address buffer.
197                  */
198                 ((char *)sunaddr)[len]=0;
199                 len = strlen(sunaddr->sun_path)+1+sizeof(short);
200                 return len;
201         }
202
203         *hashp = unix_hash_fold(csum_partial((char*)sunaddr, len, 0));
204         return len;
205 }
206
/*
 * Take sk off its hash chain via sk_del_node_init().  Does no locking of its
 * own: the callers in this part of the file (unix_remove_socket(),
 * unix_autobind(), unix_bind()) all hold unix_table_lock for writing.
 */
207 static void __unix_remove_socket(struct sock *sk)
208 {
209         sk_del_node_init(sk);
210 }
211
/*
 * Add sk to the hash chain 'list'.  Does no locking of its own: the callers
 * in this part of the file hold unix_table_lock for writing.
 */
212 static void __unix_insert_socket(struct hlist_head *list, struct sock *sk)
213 {
            /* Trap if sk is still linked on a chain. */
214         BUG_TRAP(sk_unhashed(sk));
215         sk_add_node(sk, list);
216 }
217
/* Locked variant of __unix_remove_socket(): takes unix_table_lock itself. */
218 static inline void unix_remove_socket(struct sock *sk)
219 {
220         write_lock(&unix_table_lock);
221         __unix_remove_socket(sk);
222         write_unlock(&unix_table_lock);
223 }
224
/* Locked variant of __unix_insert_socket(): takes unix_table_lock itself. */
225 static inline void unix_insert_socket(struct hlist_head *list, struct sock *sk)
226 {
227         write_lock(&unix_table_lock);
228         __unix_insert_socket(list, sk);
229         write_unlock(&unix_table_lock);
230 }
231
232 static struct sock *__unix_find_socket_byname(struct sockaddr_un *sunname,
233                                               int len, int type, unsigned hash)
234 {
235         struct sock *s;
236         struct hlist_node *node;
237
238         sk_for_each(s, node, &unix_socket_table[hash ^ type]) {
239                 struct unix_sock *u = unix_sk(s);
240
241                 if (u->addr->len == len &&
242                     !memcmp(u->addr->name, sunname, len))
243                         goto found;
244         }
245         s = NULL;
246 found:
247         return s;
248 }
249
250 static inline struct sock *unix_find_socket_byname(struct sockaddr_un *sunname,
251                                                    int len, int type,
252                                                    unsigned hash)
253 {
254         struct sock *s;
255
256         read_lock(&unix_table_lock);
257         s = __unix_find_socket_byname(sunname, len, type, hash);
258         if (s)
259                 sock_hold(s);
260         read_unlock(&unix_table_lock);
261         return s;
262 }
263
264 static struct sock *unix_find_socket_byinode(struct inode *i)
265 {
266         struct sock *s;
267         struct hlist_node *node;
268
269         read_lock(&unix_table_lock);
270         sk_for_each(s, node,
271                     &unix_socket_table[i->i_ino & (UNIX_HASH_SIZE - 1)]) {
272                 struct dentry *dentry = unix_sk(s)->dentry;
273
274                 if(dentry && dentry->d_inode == i)
275                 {
276                         sock_hold(s);
277                         goto found;
278                 }
279         }
280         s = NULL;
281 found:
282         read_unlock(&unix_table_lock);
283         return s;
284 }
285
286 static inline int unix_writable(struct sock *sk)
287 {
288         return (atomic_read(&sk->sk_wmem_alloc) << 2) <= sk->sk_sndbuf;
289 }
290
291 static void unix_write_space(struct sock *sk)
292 {
293         read_lock(&sk->sk_callback_lock);
294         if (unix_writable(sk)) {
295                 if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
296                         wake_up_interruptible(sk->sk_sleep);
297                 sk_wake_async(sk, 2, POLL_OUT);
298         }
299         read_unlock(&sk->sk_callback_lock);
300 }
301
302 /* When dgram socket disconnects (or changes its peer), we clear its receive
303  * queue of packets arrived from previous peer. First, it allows to do
304  * flow control based only on wmem_alloc; second, sk connected to peer
305  * may receive messages only from that peer. */
306 static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
307 {
308         if (skb_queue_len(&sk->sk_receive_queue)) {
309                 skb_queue_purge(&sk->sk_receive_queue);
310                 wake_up_interruptible_all(&unix_sk(sk)->peer_wait);
311
312                 /* If one link of bidirectional dgram pipe is disconnected,
313                  * we signal error. Messages are lost. Do not make this,
314                  * when peer was not connected to us.
315                  */
316                 if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) {
317                         other->sk_err = ECONNRESET;
318                         other->sk_error_report(other);
319                 }
320         }
321 }
322
323 static void unix_sock_destructor(struct sock *sk)
324 {
325         struct unix_sock *u = unix_sk(sk);
326
327         skb_queue_purge(&sk->sk_receive_queue);
328
329         BUG_TRAP(!atomic_read(&sk->sk_wmem_alloc));
330         BUG_TRAP(sk_unhashed(sk));
331         BUG_TRAP(!sk->sk_socket);
332         if (!sock_flag(sk, SOCK_DEAD)) {
333                 printk("Attempt to release alive unix socket: %p\n", sk);
334                 return;
335         }
336
337         if (u->addr)
338                 unix_release_addr(u->addr);
339
340         atomic_dec(&unix_nr_socks);
341 #ifdef UNIX_REFCNT_DEBUG
342         printk(KERN_DEBUG "UNIX %p is destroyed, %d are still alive.\n", sk, atomic_read(&unix_nr_socks));
343 #endif
344 }
345
/*
 * Tear down a unix sock.
 *
 * sk:      the sock being released.
 * embrion: non-zero when sk is a queued, never-accepted connection that is
 *          being discarded together with its listening socket (see the
 *          recursive call below); it makes the peer see ECONNRESET.
 *
 * Always returns 0.
 */
346 static int unix_release_sock (struct sock *sk, int embrion)
347 {
348         struct unix_sock *u = unix_sk(sk);
349         struct dentry *dentry;
350         struct vfsmount *mnt;
351         struct sock *skpair;
352         struct sk_buff *skb;
353         int state;
354
            /* Unhash first so that no new lookup can find this sock. */
355         unix_remove_socket(sk);
356
357         /* Clear state; remember the old state and the fs references so
             * they can be acted on after the lock is dropped. */
358         unix_state_wlock(sk);
359         sock_orphan(sk);
360         sk->sk_shutdown = SHUTDOWN_MASK;
361         dentry       = u->dentry;
362         u->dentry    = NULL;
363         mnt          = u->mnt;
364         u->mnt       = NULL;
365         state = sk->sk_state;
366         sk->sk_state = TCP_CLOSE;
367         unix_state_wunlock(sk);
368
            /* Wake everybody sleeping on our peer_wait queue. */
369         wake_up_interruptible_all(&u->peer_wait);
370
371         skpair=unix_peer(sk);
372
373         if (skpair!=NULL) {
                    /* Only connection-oriented types shut the peer down. */
374                 if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) {
375                         unix_state_wlock(skpair);
376                         /* No more writes */
377                         skpair->sk_shutdown = SHUTDOWN_MASK;
                            /* Unread data or a never-accepted connection
                             * is reported to the peer as a reset. */
378                         if (!skb_queue_empty(&sk->sk_receive_queue) || embrion)
379                                 skpair->sk_err = ECONNRESET;
380                         unix_state_wunlock(skpair);
381                         skpair->sk_state_change(skpair);
382                         read_lock(&skpair->sk_callback_lock);
383                         sk_wake_async(skpair,1,POLL_HUP);
384                         read_unlock(&skpair->sk_callback_lock);
385                 }
386                 sock_put(skpair); /* It may now die */
387                 unix_peer(sk) = NULL;
388         }
389
390         /* Try to flush out this socket. Throw out buffers at least */
391
392         while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
                    /* On a listening socket each queued skb stands for a
                     * pending connection; release its sock as an embryo. */
393                 if (state==TCP_LISTEN)
394                         unix_release_sock(skb->sk, 1);
395                 /* passed fds are erased in the kfree_skb hook        */
396                 kfree_skb(skb);
397         }
398
            /* Drop the filesystem references taken over above. */
399         if (dentry) {
400                 dput(dentry);
401                 mntput(mnt);
402         }
403
404         sock_put(sk);
405
406         /* ---- Socket is dead now and most probably destroyed ---- */
407
408         /*
409          * Fixme: BSD difference: In BSD all sockets connected to us get
410          *        ECONNRESET and we die on the spot. In Linux we behave
411          *        like files and pipes do and wait for the last
412          *        dereference.
413          *
414          * Can't we simply set sock->err?
415          *
416          *        What is the above comment talking about? --ANK(980817)
417          */
418
419         if (atomic_read(&unix_tot_inflight))
420                 unix_gc();              /* Garbage collect fds */       
421
422         return 0;
423 }
424
425 static int unix_listen(struct socket *sock, int backlog)
426 {
427         int err;
428         struct sock *sk = sock->sk;
429         struct unix_sock *u = unix_sk(sk);
430
431         err = -EOPNOTSUPP;
432         if (sock->type!=SOCK_STREAM && sock->type!=SOCK_SEQPACKET)
433                 goto out;                       /* Only stream/seqpacket sockets accept */
434         err = -EINVAL;
435         if (!u->addr)
436                 goto out;                       /* No listens on an unbound socket */
437         unix_state_wlock(sk);
438         if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN)
439                 goto out_unlock;
440         if (backlog > sk->sk_max_ack_backlog)
441                 wake_up_interruptible_all(&u->peer_wait);
442         sk->sk_max_ack_backlog  = backlog;
443         sk->sk_state            = TCP_LISTEN;
444         /* set credentials so connect can copy them */
445         sk->sk_peercred.pid     = current->tgid;
446         sk->sk_peercred.uid     = current->euid;
447         sk->sk_peercred.gid     = current->egid;
448         err = 0;
449
450 out_unlock:
451         unix_state_wunlock(sk);
452 out:
453         return err;
454 }
455
456 static int unix_release(struct socket *);
457 static int unix_bind(struct socket *, struct sockaddr *, int);
458 static int unix_stream_connect(struct socket *, struct sockaddr *,
459                                int addr_len, int flags);
460 static int unix_socketpair(struct socket *, struct socket *);
461 static int unix_accept(struct socket *, struct socket *, int);
462 static int unix_getname(struct socket *, struct sockaddr *, int *, int);
463 static unsigned int unix_poll(struct file *, struct socket *, poll_table *);
464 static int unix_ioctl(struct socket *, unsigned int, unsigned long);
465 static int unix_shutdown(struct socket *, int);
466 static int unix_stream_sendmsg(struct kiocb *, struct socket *,
467                                struct msghdr *, size_t);
468 static int unix_stream_recvmsg(struct kiocb *, struct socket *,
469                                struct msghdr *, size_t, int);
470 static int unix_dgram_sendmsg(struct kiocb *, struct socket *,
471                               struct msghdr *, size_t);
472 static int unix_dgram_recvmsg(struct kiocb *, struct socket *,
473                               struct msghdr *, size_t, int);
474 static int unix_dgram_connect(struct socket *, struct sockaddr *,
475                               int, int);
476 static int unix_seqpacket_sendmsg(struct kiocb *, struct socket *,
477                                   struct msghdr *, size_t);
478
479 static struct proto_ops unix_stream_ops = {
480         .family =       PF_UNIX,
481         .owner =        THIS_MODULE,
482         .release =      unix_release,
483         .bind =         unix_bind,
484         .connect =      unix_stream_connect,
485         .socketpair =   unix_socketpair,
486         .accept =       unix_accept,
487         .getname =      unix_getname,
488         .poll =         unix_poll,
489         .ioctl =        unix_ioctl,
490         .listen =       unix_listen,
491         .shutdown =     unix_shutdown,
492         .setsockopt =   sock_no_setsockopt,
493         .getsockopt =   sock_no_getsockopt,
494         .sendmsg =      unix_stream_sendmsg,
495         .recvmsg =      unix_stream_recvmsg,
496         .mmap =         sock_no_mmap,
497         .sendpage =     sock_no_sendpage,
498 };
499
500 static struct proto_ops unix_dgram_ops = {
501         .family =       PF_UNIX,
502         .owner =        THIS_MODULE,
503         .release =      unix_release,
504         .bind =         unix_bind,
505         .connect =      unix_dgram_connect,
506         .socketpair =   unix_socketpair,
507         .accept =       sock_no_accept,
508         .getname =      unix_getname,
509         .poll =         datagram_poll,
510         .ioctl =        unix_ioctl,
511         .listen =       sock_no_listen,
512         .shutdown =     unix_shutdown,
513         .setsockopt =   sock_no_setsockopt,
514         .getsockopt =   sock_no_getsockopt,
515         .sendmsg =      unix_dgram_sendmsg,
516         .recvmsg =      unix_dgram_recvmsg,
517         .mmap =         sock_no_mmap,
518         .sendpage =     sock_no_sendpage,
519 };
520
521 static struct proto_ops unix_seqpacket_ops = {
522         .family =       PF_UNIX,
523         .owner =        THIS_MODULE,
524         .release =      unix_release,
525         .bind =         unix_bind,
526         .connect =      unix_stream_connect,
527         .socketpair =   unix_socketpair,
528         .accept =       unix_accept,
529         .getname =      unix_getname,
530         .poll =         datagram_poll,
531         .ioctl =        unix_ioctl,
532         .listen =       unix_listen,
533         .shutdown =     unix_shutdown,
534         .setsockopt =   sock_no_setsockopt,
535         .getsockopt =   sock_no_getsockopt,
536         .sendmsg =      unix_seqpacket_sendmsg,
537         .recvmsg =      unix_dgram_recvmsg,
538         .mmap =         sock_no_mmap,
539         .sendpage =     sock_no_sendpage,
540 };
541
542 static struct proto unix_proto = {
543         .name     = "UNIX",
544         .owner    = THIS_MODULE,
545         .obj_size = sizeof(struct unix_sock),
546 };
547
548 static struct sock * unix_create1(struct socket *sock)
549 {
550         struct sock *sk = NULL;
551         struct unix_sock *u;
552
553         if (atomic_read(&unix_nr_socks) >= 2*files_stat.max_files)
554                 goto out;
555
556         sk = sk_alloc(PF_UNIX, GFP_KERNEL, &unix_proto, 1);
557         if (!sk)
558                 goto out;
559
560         atomic_inc(&unix_nr_socks);
561
562         sock_init_data(sock,sk);
563
564         sk->sk_write_space      = unix_write_space;
565         sk->sk_max_ack_backlog  = sysctl_unix_max_dgram_qlen;
566         sk->sk_destruct         = unix_sock_destructor;
567         u         = unix_sk(sk);
568         u->dentry = NULL;
569         u->mnt    = NULL;
570         rwlock_init(&u->lock);
571         atomic_set(&u->inflight, sock ? 0 : -1);
572         init_MUTEX(&u->readsem); /* single task reading lock */
573         init_waitqueue_head(&u->peer_wait);
574         unix_insert_socket(unix_sockets_unbound, sk);
575 out:
576         return sk;
577 }
578
579 static int unix_create(struct socket *sock, int protocol)
580 {
581         if (protocol && protocol != PF_UNIX)
582                 return -EPROTONOSUPPORT;
583
584         sock->state = SS_UNCONNECTED;
585
586         switch (sock->type) {
587         case SOCK_STREAM:
588                 sock->ops = &unix_stream_ops;
589                 break;
590                 /*
591                  *      Believe it or not BSD has AF_UNIX, SOCK_RAW though
592                  *      nothing uses it.
593                  */
594         case SOCK_RAW:
595                 sock->type=SOCK_DGRAM;
596         case SOCK_DGRAM:
597                 sock->ops = &unix_dgram_ops;
598                 break;
599         case SOCK_SEQPACKET:
600                 sock->ops = &unix_seqpacket_ops;
601                 break;
602         default:
603                 return -ESOCKTNOSUPPORT;
604         }
605
606         return unix_create1(sock) ? 0 : -ENOMEM;
607 }
608
609 static int unix_release(struct socket *sock)
610 {
611         struct sock *sk = sock->sk;
612
613         if (!sk)
614                 return 0;
615
616         sock->sk = NULL;
617
618         return unix_release_sock (sk, 0);
619 }
620
621 static int unix_autobind(struct socket *sock)
622 {
623         struct sock *sk = sock->sk;
624         struct unix_sock *u = unix_sk(sk);
625         static u32 ordernum = 1;
626         struct unix_address * addr;
627         int err;
628
629         down(&u->readsem);
630
631         err = 0;
632         if (u->addr)
633                 goto out;
634
635         err = -ENOMEM;
636         addr = kmalloc(sizeof(*addr) + sizeof(short) + 16, GFP_KERNEL);
637         if (!addr)
638                 goto out;
639
640         memset(addr, 0, sizeof(*addr) + sizeof(short) + 16);
641         addr->name->sun_family = AF_UNIX;
642         atomic_set(&addr->refcnt, 1);
643
644 retry:
645         addr->len = sprintf(addr->name->sun_path+1, "%05x", ordernum) + 1 + sizeof(short);
646         addr->hash = unix_hash_fold(csum_partial((void*)addr->name, addr->len, 0));
647
648         write_lock(&unix_table_lock);
649         ordernum = (ordernum+1)&0xFFFFF;
650
651         if (__unix_find_socket_byname(addr->name, addr->len, sock->type,
652                                       addr->hash)) {
653                 write_unlock(&unix_table_lock);
654                 /* Sanity yield. It is unusual case, but yet... */
655                 if (!(ordernum&0xFF))
656                         yield();
657                 goto retry;
658         }
659         addr->hash ^= sk->sk_type;
660
661         __unix_remove_socket(sk);
662         u->addr = addr;
663         __unix_insert_socket(&unix_socket_table[addr->hash], sk);
664         write_unlock(&unix_table_lock);
665         err = 0;
666
667 out:    up(&u->readsem);
668         return err;
669 }
670
/*
 * Resolve a unix socket name to the socket bound to it.
 *
 * sunname/len: the name, already checked by unix_mkname().
 * type:        socket type the caller expects the target to have.
 * hash:        name hash; only used for abstract names.
 * error:       receives a negative errno when NULL is returned; it is not
 *              written on success.
 *
 * Returns the target socket with a reference held (taken by the
 * unix_find_socket_by*() helpers), or NULL on failure.
 */
671 static struct sock *unix_find_other(struct sockaddr_un *sunname, int len,
672                                     int type, unsigned hash, int *error)
673 {
674         struct sock *u;
675         struct nameidata nd;
676         int err = 0;
677         
            /* Filesystem name: walk the path and look the inode up. */
678         if (sunname->sun_path[0]) {
679                 err = path_lookup(sunname->sun_path, LOOKUP_FOLLOW, &nd);
680                 if (err)
681                         goto fail;
                    /* The caller needs write permission on the socket node. */
682                 err = permission(nd.dentry->d_inode,MAY_WRITE, &nd);
683                 if (err)
684                         goto put_fail;
685
                    /* Not a socket node, or no socket bound to it. */
686                 err = -ECONNREFUSED;
687                 if (!S_ISSOCK(nd.dentry->d_inode->i_mode))
688                         goto put_fail;
689                 u=unix_find_socket_byinode(nd.dentry->d_inode);
690                 if (!u)
691                         goto put_fail;
692
                    /* Access time is only touched when the lookup will succeed. */
693                 if (u->sk_type == type)
694                         touch_atime(nd.mnt, nd.dentry);
695
696                 path_release(&nd);
697
                    /* Wrong socket type: give back the reference just taken. */
698                 err=-EPROTOTYPE;
699                 if (u->sk_type != type) {
700                         sock_put(u);
701                         goto fail;
702                 }
703         } else {
                    /* Abstract name: straight hash table lookup. */
704                 err = -ECONNREFUSED;
705                 u=unix_find_socket_byname(sunname, len, type, hash);
706                 if (u) {
707                         struct dentry *dentry;
708                         dentry = unix_sk(u)->dentry;
709                         if (dentry)
710                                 touch_atime(unix_sk(u)->mnt, dentry);
711                 } else
712                         goto fail;
713         }
714         return u;
715
716 put_fail:
717         path_release(&nd);
718 fail:
719         *error=err;
720         return NULL;
721 }
722
723
/*
 * bind() hook for unix sockets.
 *
 * An address consisting of the family field only (addr_len == sizeof(short))
 * requests autobind.  Otherwise the name is either a filesystem path, for
 * which a socket node is created, or an abstract name, which must not be
 * in use by another socket of the same type.
 *
 * Returns 0 on success or a negative errno:
 *   -EINVAL      wrong family, name rejected by unix_mkname(), or the
 *                socket is already bound
 *   -ENOMEM      the address could not be allocated
 *   -EADDRINUSE  the abstract name is taken, or creating the node
 *                reported -EEXIST
 *   other        whatever path_lookup(), lookup_create() or vfs_mknod()
 *                returned
 */
724 static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
725 {
726         struct sock *sk = sock->sk;
727         struct unix_sock *u = unix_sk(sk);
728         struct sockaddr_un *sunaddr=(struct sockaddr_un *)uaddr;
729         struct dentry * dentry = NULL;
730         struct nameidata nd;
731         int err;
732         unsigned hash;
733         struct unix_address *addr;
734         struct hlist_head *list;
735
736         err = -EINVAL;
737         if (sunaddr->sun_family != AF_UNIX)
738                 goto out;
739
            /* Family field only: let the kernel pick an abstract name. */
740         if (addr_len==sizeof(short)) {
741                 err = unix_autobind(sock);
742                 goto out;
743         }
744
            /* On success unix_mkname() returns the canonical name length. */
745         err = unix_mkname(sunaddr, addr_len, &hash);
746         if (err < 0)
747                 goto out;
748         addr_len = err;
749
            /* readsem is held for the rest of the bind. */
750         down(&u->readsem);
751
            /* A socket can only be bound once. */
752         err = -EINVAL;
753         if (u->addr)
754                 goto out_up;
755
756         err = -ENOMEM;
757         addr = kmalloc(sizeof(*addr)+addr_len, GFP_KERNEL);
758         if (!addr)
759                 goto out_up;
760
761         memcpy(addr->name, sunaddr, addr_len);
762         addr->len = addr_len;
            /* Chain index for abstract names; replaced below for fs names. */
763         addr->hash = hash ^ sk->sk_type;
764         atomic_set(&addr->refcnt, 1);
765
            /* Filesystem name: create the socket node on disk. */
766         if (sunaddr->sun_path[0]) {
767                 unsigned int mode;
768                 err = 0;
769                 /*
770                  * Get the parent directory, calculate the hash for last
771                  * component.
772                  */
773                 err = path_lookup(sunaddr->sun_path, LOOKUP_PARENT, &nd);
774                 if (err)
775                         goto out_mknod_parent;
776
777                 dentry = lookup_create(&nd, 0);
778                 err = PTR_ERR(dentry);
779                 if (IS_ERR(dentry))
780                         goto out_mknod_unlock;
781
782                 /*
783                  * All right, let's create it.  The node gets the socket
                     * inode's mode bits filtered through the caller's umask.
784                  */
785                 mode = S_IFSOCK |
786                        (SOCK_INODE(sock)->i_mode & ~current->fs->umask);
787                 err = vfs_mknod(nd.dentry->d_inode, dentry, mode, 0);
788                 if (err)
789                         goto out_mknod_dput;
                    /* Release the parent directory's i_sem (presumably taken
                     * by lookup_create() - confirm), drop the parent dentry
                     * and keep the new node's dentry in nd instead. */
790                 up(&nd.dentry->d_inode->i_sem);
791                 dput(nd.dentry);
792                 nd.dentry = dentry;
793
                    /* UNIX_HASH_SIZE marks the address as a filesystem name
                     * (see UNIX_ABSTRACT()). */
794                 addr->hash = UNIX_HASH_SIZE;
795         }
796
797         write_lock(&unix_table_lock);
798
799         if (!sunaddr->sun_path[0]) {
                    /* Abstract names must be unique per socket type. */
800                 err = -EADDRINUSE;
801                 if (__unix_find_socket_byname(sunaddr, addr_len,
802                                               sk->sk_type, hash)) {
803                         unix_release_addr(addr);
804                         goto out_unlock;
805                 }
806
807                 list = &unix_socket_table[addr->hash];
808         } else {
                    /* Filesystem sockets are chained by inode number, which is
                     * how unix_find_socket_byinode() looks them up. */
809                 list = &unix_socket_table[dentry->d_inode->i_ino & (UNIX_HASH_SIZE-1)];
810                 u->dentry = nd.dentry;
811                 u->mnt    = nd.mnt;
812         }
813
            /* Move the socket from its current chain to the chosen one. */
814         err = 0;
815         __unix_remove_socket(sk);
816         u->addr = addr;
817         __unix_insert_socket(list, sk);
818
819 out_unlock:
820         write_unlock(&unix_table_lock);
821 out_up:
822         up(&u->readsem);
823 out:
824         return err;
825
            /* Error unwinding for the filesystem branch, innermost first. */
826 out_mknod_dput:
827         dput(dentry);
828 out_mknod_unlock:
829         up(&nd.dentry->d_inode->i_sem);
830         path_release(&nd);
831 out_mknod_parent:
            /* An existing node means the address is in use. */
832         if (err==-EEXIST)
833                 err=-EADDRINUSE;
834         unix_release_addr(addr);
835         goto out_up;
836 }
837
/*
 * connect() hook for datagram unix sockets: set, change or clear the
 * default peer of the socket.
 *
 * An address with family AF_UNSPEC dissolves the current association.
 * Any other address is resolved with unix_find_other(); the reference
 * returned by that lookup becomes the reference stored in unix_peer(sk).
 *
 * Returns 0 on success or a negative errno: whatever unix_mkname(),
 * unix_autobind(), unix_find_other() or security_unix_may_send() reported,
 * or -EPERM when the target already has a peer other than this socket.
 */
838 static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
839                               int alen, int flags)
840 {
841         struct sock *sk = sock->sk;
842         struct sockaddr_un *sunaddr=(struct sockaddr_un*)addr;
843         struct sock *other;
844         unsigned hash;
845         int err;
846
847         if (addr->sa_family != AF_UNSPEC) {
848                 err = unix_mkname(sunaddr, alen, &hash);
849                 if (err < 0)
850                         goto out;
851                 alen = err;
852
                    /* With SOCK_PASSCRED set, an unbound socket is
                     * autobound before it connects. */
853                 if (test_bit(SOCK_PASSCRED, &sock->flags) &&
854                     !unix_sk(sk)->addr && (err = unix_autobind(sock)) != 0)
855                         goto out;
856
                    /* On success 'other' carries a reference we now own. */
857                 other=unix_find_other(sunaddr, alen, sock->type, hash, &err);
858                 if (!other)
859                         goto out;
860
861                 unix_state_wlock(sk);
862
                    /* Refuse a target that is connected to somebody else. */
863                 err = -EPERM;
864                 if (!unix_may_send(sk, other))
865                         goto out_unlock;
866
867                 err = security_unix_may_send(sk->sk_socket, other->sk_socket);
868                 if (err)
869                         goto out_unlock;
870
871         } else {
872                 /*
873                  *      1003.1g breaking connected state with AF_UNSPEC
874                  */
875                 other = NULL;
876                 unix_state_wlock(sk);
877         }
878
879         /*
880          * If it was connected, reconnect.
881          */
882         if (unix_peer(sk)) {
883                 struct sock *old_peer = unix_peer(sk);
884                 unix_peer(sk)=other;
885                 unix_state_wunlock(sk);
886
                    /* Packets queued from the previous peer are dropped,
                     * then the reference held on it is released. */
887                 if (other != old_peer)
888                         unix_dgram_disconnected(sk, old_peer);
889                 sock_put(old_peer);
890         } else {
891                 unix_peer(sk)=other;
892                 unix_state_wunlock(sk);
893         }
894         return 0;
895
            /* Failure after the lookup: unlock and drop the lookup reference. */
896 out_unlock:
897         unix_state_wunlock(sk);
898         sock_put(other);
899 out:
900         return err;
901 }
902
903 static long unix_wait_for_peer(struct sock *other, long timeo)
904 {
905         struct unix_sock *u = unix_sk(other);
906         int sched;
907         DEFINE_WAIT(wait);
908
909         prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE);
910
911         sched = !sock_flag(other, SOCK_DEAD) &&
912                 !(other->sk_shutdown & RCV_SHUTDOWN) &&
913                 (skb_queue_len(&other->sk_receive_queue) >
914                  other->sk_max_ack_backlog);
915
916         unix_state_runlock(other);
917
918         if (sched)
919                 timeo = schedule_timeout(timeo);
920
921         finish_wait(&u->peer_wait, &wait);
922         return timeo;
923 }
924
/*
 * Connect a connection-oriented AF_UNIX socket to a listening peer.
 *
 * A fresh sock ("newsk") is created to act as the far end of the
 * connection and a small skb allocated against it is queued on the
 * listener's receive queue; unix_accept() later retrieves the new sock
 * through that skb.
 *
 * Returns 0 on success or a negative errno.
 */
static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
			       int addr_len, int flags)
{
	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
	struct sock *sk = sock->sk;
	struct unix_sock *u = unix_sk(sk);
	struct unix_sock *new_u, *listener_u;
	struct sock *newsk = NULL;
	struct sock *other = NULL;
	struct sk_buff *skb = NULL;
	unsigned hash;
	int own_state;
	int err;
	long timeo;

	err = unix_mkname(sunaddr, addr_len, &hash);
	if (err < 0)
		goto out;
	addr_len = err;

	/* With credential passing enabled an unbound socket is autobound. */
	if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr) {
		err = unix_autobind(sock);
		if (err != 0)
			goto out;
	}

	timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);

	/*
	 * Allocate everything before any state lock is taken.  Doing it
	 * afterwards would force us to recheck all the state anyway.
	 */
	err = -ENOMEM;

	/* The sock that will represent the completed connection. */
	newsk = unix_create1(NULL);
	if (!newsk)
		goto out;

	/* The skb that announces newsk to the listening sock. */
	skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
	if (!skb)
		goto out;

restart:
	/* Look up the listening sock; success gives us a reference. */
	other = unix_find_other(sunaddr, addr_len, sk->sk_type, hash, &err);
	if (!other)
		goto out;

	/* Latch the peer's state. */
	unix_state_rlock(other);

	/* The lookup raced with the socket dying: try again. */
	if (sock_flag(other, SOCK_DEAD)) {
		unix_state_runlock(other);
		sock_put(other);
		goto restart;
	}

	err = -ECONNREFUSED;
	if (other->sk_state != TCP_LISTEN)
		goto out_unlock;

	if (skb_queue_len(&other->sk_receive_queue) >
	    other->sk_max_ack_backlog) {
		err = -EAGAIN;
		if (!timeo)
			goto out_unlock;

		/* unix_wait_for_peer() drops the read lock on "other". */
		timeo = unix_wait_for_peer(other, timeo);

		err = sock_intr_errno(timeo);
		if (signal_pending(current))
			goto out;
		sock_put(other);
		goto restart;
	}

	/*
	 * Latch our own state.
	 *
	 * This is the delicate part: we need our write lock while still
	 * holding the peer's read lock, which could deadlock.  Connecting
	 * to ourselves and simultaneous connects are ruled out by looking
	 * at the socket state: "other" is TCP_LISTEN, so if sk were
	 * TCP_LISTEN too we bail out before trying to take the lock.
	 *
	 * The state is rechecked once the lock is held.
	 */
	own_state = sk->sk_state;

	if (own_state == TCP_ESTABLISHED) {
		/* Socket is already connected */
		err = -EISCONN;
		goto out_unlock;
	}
	if (own_state != TCP_CLOSE) {
		err = -EINVAL;
		goto out_unlock;
	}
	/* TCP_CLOSE: fine, carry on with the connect. */

	unix_state_wlock(sk);

	if (sk->sk_state != own_state) {
		/* State changed under us; start over. */
		unix_state_wunlock(sk);
		unix_state_runlock(other);
		sock_put(other);
		goto restart;
	}

	err = security_unix_stream_connect(sock, other->sk_socket, newsk);
	if (err) {
		unix_state_wunlock(sk);
		goto out_unlock;
	}

	/* The way is open: quickly fill in all the necessary fields. */

	sock_hold(sk);
	unix_peer(newsk)	= sk;
	newsk->sk_state		= TCP_ESTABLISHED;
	newsk->sk_type		= sk->sk_type;
	newsk->sk_peercred.pid	= current->tgid;
	newsk->sk_peercred.uid	= current->euid;
	newsk->sk_peercred.gid	= current->egid;
	new_u = unix_sk(newsk);
	newsk->sk_sleep		= &new_u->peer_wait;
	listener_u = unix_sk(other);

	/* The new sock shares the listener's address information. */
	if (listener_u->addr) {
		atomic_inc(&listener_u->addr->refcnt);
		new_u->addr = listener_u->addr;
	}
	if (listener_u->dentry) {
		new_u->dentry	= dget(listener_u->dentry);
		new_u->mnt	= mntget(listener_u->mnt);
	}

	/* Our view of the peer's credentials comes from the listener. */
	sk->sk_peercred = other->sk_peercred;

	sock_hold(newsk);
	unix_peer(sk)	= newsk;
	sock->state	= SS_CONNECTED;
	sk->sk_state	= TCP_ESTABLISHED;

	unix_state_wunlock(sk);

	/* Hand the announcement skb to the listening sock. */
	spin_lock(&other->sk_receive_queue.lock);
	__skb_queue_tail(&other->sk_receive_queue, skb);
	/* Undo the artificially decreased inflight count now that the
	 * embryo is installed on the listening socket. */
	atomic_inc(&new_u->inflight);
	spin_unlock(&other->sk_receive_queue.lock);
	unix_state_runlock(other);
	other->sk_data_ready(other, 0);
	sock_put(other);
	return 0;

out_unlock:
	if (other)
		unix_state_runlock(other);

out:
	if (skb)
		kfree_skb(skb);
	if (newsk)
		unix_release_sock(newsk, 0);
	if (other)
		sock_put(other);
	return err;
}
1101
1102 static int unix_socketpair(struct socket *socka, struct socket *sockb)
1103 {
1104         struct sock *ska=socka->sk, *skb = sockb->sk;
1105
1106         /* Join our sockets back to back */
1107         sock_hold(ska);
1108         sock_hold(skb);
1109         unix_peer(ska)=skb;
1110         unix_peer(skb)=ska;
1111         ska->sk_peercred.pid = skb->sk_peercred.pid = current->tgid;
1112         ska->sk_peercred.uid = skb->sk_peercred.uid = current->euid;
1113         ska->sk_peercred.gid = skb->sk_peercred.gid = current->egid;
1114
1115         if (ska->sk_type != SOCK_DGRAM) {
1116                 ska->sk_state = TCP_ESTABLISHED;
1117                 skb->sk_state = TCP_ESTABLISHED;
1118                 socka->state  = SS_CONNECTED;
1119                 sockb->state  = SS_CONNECTED;
1120         }
1121         return 0;
1122 }
1123
1124 static int unix_accept(struct socket *sock, struct socket *newsock, int flags)
1125 {
1126         struct sock *sk = sock->sk;
1127         struct sock *tsk;
1128         struct sk_buff *skb;
1129         int err;
1130
1131         err = -EOPNOTSUPP;
1132         if (sock->type!=SOCK_STREAM && sock->type!=SOCK_SEQPACKET)
1133                 goto out;
1134
1135         err = -EINVAL;
1136         if (sk->sk_state != TCP_LISTEN)
1137                 goto out;
1138
1139         /* If socket state is TCP_LISTEN it cannot change (for now...),
1140          * so that no locks are necessary.
1141          */
1142
1143         skb = skb_recv_datagram(sk, 0, flags&O_NONBLOCK, &err);
1144         if (!skb) {
1145                 /* This means receive shutdown. */
1146                 if (err == 0)
1147                         err = -EINVAL;
1148                 goto out;
1149         }
1150
1151         tsk = skb->sk;
1152         skb_free_datagram(sk, skb);
1153         wake_up_interruptible(&unix_sk(sk)->peer_wait);
1154
1155         /* attach accepted sock to socket */
1156         unix_state_wlock(tsk);
1157         newsock->state = SS_CONNECTED;
1158         sock_graft(tsk, newsock);
1159         unix_state_wunlock(tsk);
1160         return 0;
1161
1162 out:
1163         return err;
1164 }
1165
1166
1167 static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int *uaddr_len, int peer)
1168 {
1169         struct sock *sk = sock->sk;
1170         struct unix_sock *u;
1171         struct sockaddr_un *sunaddr=(struct sockaddr_un *)uaddr;
1172         int err = 0;
1173
1174         if (peer) {
1175                 sk = unix_peer_get(sk);
1176
1177                 err = -ENOTCONN;
1178                 if (!sk)
1179                         goto out;
1180                 err = 0;
1181         } else {
1182                 sock_hold(sk);
1183         }
1184
1185         u = unix_sk(sk);
1186         unix_state_rlock(sk);
1187         if (!u->addr) {
1188                 sunaddr->sun_family = AF_UNIX;
1189                 sunaddr->sun_path[0] = 0;
1190                 *uaddr_len = sizeof(short);
1191         } else {
1192                 struct unix_address *addr = u->addr;
1193
1194                 *uaddr_len = addr->len;
1195                 memcpy(sunaddr, addr->name, *uaddr_len);
1196         }
1197         unix_state_runlock(sk);
1198         sock_put(sk);
1199 out:
1200         return err;
1201 }
1202
1203 static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb)
1204 {
1205         int i;
1206
1207         scm->fp = UNIXCB(skb).fp;
1208         skb->destructor = sock_wfree;
1209         UNIXCB(skb).fp = NULL;
1210
1211         for (i=scm->fp->count-1; i>=0; i--)
1212                 unix_notinflight(scm->fp->fp[i]);
1213 }
1214
1215 static void unix_destruct_fds(struct sk_buff *skb)
1216 {
1217         struct scm_cookie scm;
1218         memset(&scm, 0, sizeof(scm));
1219         unix_detach_fds(&scm, skb);
1220
1221         /* Alas, it calls VFS */
1222         /* So fscking what? fput() had been SMP-safe since the last Summer */
1223         scm_destroy(&scm);
1224         sock_wfree(skb);
1225 }
1226
1227 static void unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
1228 {
1229         int i;
1230         for (i=scm->fp->count-1; i>=0; i--)
1231                 unix_inflight(scm->fp->fp[i]);
1232         UNIXCB(skb).fp = scm->fp;
1233         skb->destructor = unix_destruct_fds;
1234         scm->fp = NULL;
1235 }
1236
1237 /*
1238  *      Send AF_UNIX data.
1239  */
1240
1241 static int unix_dgram_sendmsg(struct kiocb *kiocb, struct socket *sock,
1242                               struct msghdr *msg, size_t len)
1243 {
1244         struct sock_iocb *siocb = kiocb_to_siocb(kiocb);
1245         struct sock *sk = sock->sk;
1246         struct unix_sock *u = unix_sk(sk);
1247         struct sockaddr_un *sunaddr=msg->msg_name;
1248         struct sock *other = NULL;
1249         int namelen = 0; /* fake GCC */
1250         int err;
1251         unsigned hash;
1252         struct sk_buff *skb;
1253         long timeo;
1254         struct scm_cookie tmp_scm;
1255
1256         if (NULL == siocb->scm)
1257                 siocb->scm = &tmp_scm;
1258         err = scm_send(sock, msg, siocb->scm);
1259         if (err < 0)
1260                 return err;
1261
1262         err = -EOPNOTSUPP;
1263         if (msg->msg_flags&MSG_OOB)
1264                 goto out;
1265
1266         if (msg->msg_namelen) {
1267                 err = unix_mkname(sunaddr, msg->msg_namelen, &hash);
1268                 if (err < 0)
1269                         goto out;
1270                 namelen = err;
1271         } else {
1272                 sunaddr = NULL;
1273                 err = -ENOTCONN;
1274                 other = unix_peer_get(sk);
1275                 if (!other)
1276                         goto out;
1277         }
1278
1279         if (test_bit(SOCK_PASSCRED, &sock->flags)
1280                 && !u->addr && (err = unix_autobind(sock)) != 0)
1281                 goto out;
1282
1283         err = -EMSGSIZE;
1284         if (len > sk->sk_sndbuf - 32)
1285                 goto out;
1286
1287         skb = sock_alloc_send_skb(sk, len, msg->msg_flags&MSG_DONTWAIT, &err);
1288         if (skb==NULL)
1289                 goto out;
1290
1291         memcpy(UNIXCREDS(skb), &siocb->scm->creds, sizeof(struct ucred));
1292         if (siocb->scm->fp)
1293                 unix_attach_fds(siocb->scm, skb);
1294
1295         skb->h.raw = skb->data;
1296         err = memcpy_fromiovec(skb_put(skb,len), msg->msg_iov, len);
1297         if (err)
1298                 goto out_free;
1299
1300         timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
1301
1302 restart:
1303         if (!other) {
1304                 err = -ECONNRESET;
1305                 if (sunaddr == NULL)
1306                         goto out_free;
1307
1308                 other = unix_find_other(sunaddr, namelen, sk->sk_type,
1309                                         hash, &err);
1310                 if (other==NULL)
1311                         goto out_free;
1312         }
1313
1314         unix_state_rlock(other);
1315         err = -EPERM;
1316         if (!unix_may_send(sk, other))
1317                 goto out_unlock;
1318
1319         if (sock_flag(other, SOCK_DEAD)) {
1320                 /*
1321                  *      Check with 1003.1g - what should
1322                  *      datagram error
1323                  */
1324                 unix_state_runlock(other);
1325                 sock_put(other);
1326
1327                 err = 0;
1328                 unix_state_wlock(sk);
1329                 if (unix_peer(sk) == other) {
1330                         unix_peer(sk)=NULL;
1331                         unix_state_wunlock(sk);
1332
1333                         unix_dgram_disconnected(sk, other);
1334                         sock_put(other);
1335                         err = -ECONNREFUSED;
1336                 } else {
1337                         unix_state_wunlock(sk);
1338                 }
1339
1340                 other = NULL;
1341                 if (err)
1342                         goto out_free;
1343                 goto restart;
1344         }
1345
1346         err = -EPIPE;
1347         if (other->sk_shutdown & RCV_SHUTDOWN)
1348                 goto out_unlock;
1349
1350         if (sk->sk_type != SOCK_SEQPACKET) {
1351                 err = security_unix_may_send(sk->sk_socket, other->sk_socket);
1352                 if (err)
1353                         goto out_unlock;
1354         }
1355
1356         if (unix_peer(other) != sk &&
1357             (skb_queue_len(&other->sk_receive_queue) >
1358              other->sk_max_ack_backlog)) {
1359                 if (!timeo) {
1360                         err = -EAGAIN;
1361                         goto out_unlock;
1362                 }
1363
1364                 timeo = unix_wait_for_peer(other, timeo);
1365
1366                 err = sock_intr_errno(timeo);
1367                 if (signal_pending(current))
1368                         goto out_free;
1369
1370                 goto restart;
1371         }
1372
1373         skb_queue_tail(&other->sk_receive_queue, skb);
1374         unix_state_runlock(other);
1375         other->sk_data_ready(other, len);
1376         sock_put(other);
1377         scm_destroy(siocb->scm);
1378         return len;
1379
1380 out_unlock:
1381         unix_state_runlock(other);
1382 out_free:
1383         kfree_skb(skb);
1384 out:
1385         if (other)
1386                 sock_put(other);
1387         scm_destroy(siocb->scm);
1388         return err;
1389 }
1390
1391                 
1392 static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock,
1393                                struct msghdr *msg, size_t len)
1394 {
1395         struct sock_iocb *siocb = kiocb_to_siocb(kiocb);
1396         struct sock *sk = sock->sk;
1397         struct sock *other = NULL;
1398         struct sockaddr_un *sunaddr=msg->msg_name;
1399         int err,size;
1400         struct sk_buff *skb;
1401         int sent=0;
1402         struct scm_cookie tmp_scm;
1403
1404         if (NULL == siocb->scm)
1405                 siocb->scm = &tmp_scm;
1406         err = scm_send(sock, msg, siocb->scm);
1407         if (err < 0)
1408                 return err;
1409
1410         err = -EOPNOTSUPP;
1411         if (msg->msg_flags&MSG_OOB)
1412                 goto out_err;
1413
1414         if (msg->msg_namelen) {
1415                 err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
1416                 goto out_err;
1417         } else {
1418                 sunaddr = NULL;
1419                 err = -ENOTCONN;
1420                 other = unix_peer_get(sk);
1421                 if (!other)
1422                         goto out_err;
1423         }
1424
1425         if (sk->sk_shutdown & SEND_SHUTDOWN)
1426                 goto pipe_err;
1427
1428         while(sent < len)
1429         {
1430                 /*
1431                  *      Optimisation for the fact that under 0.01% of X messages typically
1432                  *      need breaking up.
1433                  */
1434
1435                 size=len-sent;
1436
1437                 /* Keep two messages in the pipe so it schedules better */
1438                 if (size > sk->sk_sndbuf / 2 - 64)
1439                         size = sk->sk_sndbuf / 2 - 64;
1440
1441                 if (size > SKB_MAX_ALLOC)
1442                         size = SKB_MAX_ALLOC;
1443                         
1444                 /*
1445                  *      Grab a buffer
1446                  */
1447                  
1448                 skb=sock_alloc_send_skb(sk,size,msg->msg_flags&MSG_DONTWAIT, &err);
1449
1450                 if (skb==NULL)
1451                         goto out_err;
1452
1453                 /*
1454                  *      If you pass two values to the sock_alloc_send_skb
1455                  *      it tries to grab the large buffer with GFP_NOFS
1456                  *      (which can fail easily), and if it fails grab the
1457                  *      fallback size buffer which is under a page and will
1458                  *      succeed. [Alan]
1459                  */
1460                 size = min_t(int, size, skb_tailroom(skb));
1461
1462                 memcpy(UNIXCREDS(skb), &siocb->scm->creds, sizeof(struct ucred));
1463                 if (siocb->scm->fp)
1464                         unix_attach_fds(siocb->scm, skb);
1465
1466                 if ((err = memcpy_fromiovec(skb_put(skb,size), msg->msg_iov, size)) != 0) {
1467                         kfree_skb(skb);
1468                         goto out_err;
1469                 }
1470
1471                 unix_state_rlock(other);
1472
1473                 if (sock_flag(other, SOCK_DEAD) ||
1474                     (other->sk_shutdown & RCV_SHUTDOWN))
1475                         goto pipe_err_free;
1476
1477                 skb_queue_tail(&other->sk_receive_queue, skb);
1478                 unix_state_runlock(other);
1479                 other->sk_data_ready(other, size);
1480                 sent+=size;
1481         }
1482         sock_put(other);
1483
1484         scm_destroy(siocb->scm);
1485         siocb->scm = NULL;
1486
1487         return sent;
1488
1489 pipe_err_free:
1490         unix_state_runlock(other);
1491         kfree_skb(skb);
1492 pipe_err:
1493         if (sent==0 && !(msg->msg_flags&MSG_NOSIGNAL))
1494                 send_sig(SIGPIPE,current,0);
1495         err = -EPIPE;
1496 out_err:
1497         if (other)
1498                 sock_put(other);
1499         scm_destroy(siocb->scm);
1500         siocb->scm = NULL;
1501         return sent ? : err;
1502 }
1503
1504 static int unix_seqpacket_sendmsg(struct kiocb *kiocb, struct socket *sock,
1505                                   struct msghdr *msg, size_t len)
1506 {
1507         int err;
1508         struct sock *sk = sock->sk;
1509         
1510         err = sock_error(sk);
1511         if (err)
1512                 return err;
1513
1514         if (sk->sk_state != TCP_ESTABLISHED)
1515                 return -ENOTCONN;
1516
1517         if (msg->msg_namelen)
1518                 msg->msg_namelen = 0;
1519
1520         return unix_dgram_sendmsg(kiocb, sock, msg, len);
1521 }
1522                                                                                             
1523 static void unix_copy_addr(struct msghdr *msg, struct sock *sk)
1524 {
1525         struct unix_sock *u = unix_sk(sk);
1526
1527         msg->msg_namelen = 0;
1528         if (u->addr) {
1529                 msg->msg_namelen = u->addr->len;
1530                 memcpy(msg->msg_name, u->addr->name, u->addr->len);
1531         }
1532 }
1533
1534 static int unix_dgram_recvmsg(struct kiocb *iocb, struct socket *sock,
1535                               struct msghdr *msg, size_t size,
1536                               int flags)
1537 {
1538         struct sock_iocb *siocb = kiocb_to_siocb(iocb);
1539         struct scm_cookie tmp_scm;
1540         struct sock *sk = sock->sk;
1541         struct unix_sock *u = unix_sk(sk);
1542         int noblock = flags & MSG_DONTWAIT;
1543         struct sk_buff *skb;
1544         int err;
1545
1546         err = -EOPNOTSUPP;
1547         if (flags&MSG_OOB)
1548                 goto out;
1549
1550         msg->msg_namelen = 0;
1551
1552         down(&u->readsem);
1553
1554         skb = skb_recv_datagram(sk, flags, noblock, &err);
1555         if (!skb)
1556                 goto out_unlock;
1557
1558         wake_up_interruptible(&u->peer_wait);
1559
1560         if (msg->msg_name)
1561                 unix_copy_addr(msg, skb->sk);
1562
1563         if (size > skb->len)
1564                 size = skb->len;
1565         else if (size < skb->len)
1566                 msg->msg_flags |= MSG_TRUNC;
1567
1568         err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, size);
1569         if (err)
1570                 goto out_free;
1571
1572         if (!siocb->scm) {
1573                 siocb->scm = &tmp_scm;
1574                 memset(&tmp_scm, 0, sizeof(tmp_scm));
1575         }
1576         siocb->scm->creds = *UNIXCREDS(skb);
1577
1578         if (!(flags & MSG_PEEK))
1579         {
1580                 if (UNIXCB(skb).fp)
1581                         unix_detach_fds(siocb->scm, skb);
1582         }
1583         else 
1584         {
1585                 /* It is questionable: on PEEK we could:
1586                    - do not return fds - good, but too simple 8)
1587                    - return fds, and do not return them on read (old strategy,
1588                      apparently wrong)
1589                    - clone fds (I chose it for now, it is the most universal
1590                      solution)
1591                 
1592                    POSIX 1003.1g does not actually define this clearly
1593                    at all. POSIX 1003.1g doesn't define a lot of things
1594                    clearly however!                  
1595                    
1596                 */
1597                 if (UNIXCB(skb).fp)
1598                         siocb->scm->fp = scm_fp_dup(UNIXCB(skb).fp);
1599         }
1600         err = size;
1601
1602         scm_recv(sock, msg, siocb->scm, flags);
1603
1604 out_free:
1605         skb_free_datagram(sk,skb);
1606 out_unlock:
1607         up(&u->readsem);
1608 out:
1609         return err;
1610 }
1611
1612 /*
1613  *      Sleep until data has arrive. But check for races..
1614  */
1615  
1616 static long unix_stream_data_wait(struct sock * sk, long timeo)
1617 {
1618         DEFINE_WAIT(wait);
1619
1620         unix_state_rlock(sk);
1621
1622         for (;;) {
1623                 prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
1624
1625                 if (skb_queue_len(&sk->sk_receive_queue) ||
1626                     sk->sk_err ||
1627                     (sk->sk_shutdown & RCV_SHUTDOWN) ||
1628                     signal_pending(current) ||
1629                     !timeo)
1630                         break;
1631
1632                 set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1633                 unix_state_runlock(sk);
1634                 timeo = schedule_timeout(timeo);
1635                 unix_state_rlock(sk);
1636                 clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1637         }
1638
1639         finish_wait(sk->sk_sleep, &wait);
1640         unix_state_runlock(sk);
1641         return timeo;
1642 }
1643
1644
1645
1646 static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock,
1647                                struct msghdr *msg, size_t size,
1648                                int flags)
1649 {
1650         struct sock_iocb *siocb = kiocb_to_siocb(iocb);
1651         struct scm_cookie tmp_scm;
1652         struct sock *sk = sock->sk;
1653         struct unix_sock *u = unix_sk(sk);
1654         struct sockaddr_un *sunaddr=msg->msg_name;
1655         int copied = 0;
1656         int check_creds = 0;
1657         int target;
1658         int err = 0;
1659         long timeo;
1660
1661         err = -EINVAL;
1662         if (sk->sk_state != TCP_ESTABLISHED)
1663                 goto out;
1664
1665         err = -EOPNOTSUPP;
1666         if (flags&MSG_OOB)
1667                 goto out;
1668
1669         target = sock_rcvlowat(sk, flags&MSG_WAITALL, size);
1670         timeo = sock_rcvtimeo(sk, flags&MSG_DONTWAIT);
1671
1672         msg->msg_namelen = 0;
1673
1674         /* Lock the socket to prevent queue disordering
1675          * while sleeps in memcpy_tomsg
1676          */
1677
1678         if (!siocb->scm) {
1679                 siocb->scm = &tmp_scm;
1680                 memset(&tmp_scm, 0, sizeof(tmp_scm));
1681         }
1682
1683         down(&u->readsem);
1684
1685         do
1686         {
1687                 int chunk;
1688                 struct sk_buff *skb;
1689
1690                 skb = skb_dequeue(&sk->sk_receive_queue);
1691                 if (skb==NULL)
1692                 {
1693                         if (copied >= target)
1694                                 break;
1695
1696                         /*
1697                          *      POSIX 1003.1g mandates this order.
1698                          */
1699                          
1700                         if ((err = sock_error(sk)) != 0)
1701                                 break;
1702                         if (sk->sk_shutdown & RCV_SHUTDOWN)
1703                                 break;
1704                         err = -EAGAIN;
1705                         if (!timeo)
1706                                 break;
1707                         up(&u->readsem);
1708
1709                         timeo = unix_stream_data_wait(sk, timeo);
1710
1711                         if (signal_pending(current)) {
1712                                 err = sock_intr_errno(timeo);
1713                                 goto out;
1714                         }
1715                         down(&u->readsem);
1716                         continue;
1717                 }
1718
1719                 if (check_creds) {
1720                         /* Never glue messages from different writers */
1721                         if (memcmp(UNIXCREDS(skb), &siocb->scm->creds, sizeof(siocb->scm->creds)) != 0) {
1722                                 skb_queue_head(&sk->sk_receive_queue, skb);
1723                                 break;
1724                         }
1725                 } else {
1726                         /* Copy credentials */
1727                         siocb->scm->creds = *UNIXCREDS(skb);
1728                         check_creds = 1;
1729                 }
1730
1731                 /* Copy address just once */
1732                 if (sunaddr)
1733                 {
1734                         unix_copy_addr(msg, skb->sk);
1735                         sunaddr = NULL;
1736                 }
1737
1738                 chunk = min_t(unsigned int, skb->len, size);
1739                 if (memcpy_toiovec(msg->msg_iov, skb->data, chunk)) {
1740                         skb_queue_head(&sk->sk_receive_queue, skb);
1741                         if (copied == 0)
1742                                 copied = -EFAULT;
1743                         break;
1744                 }
1745                 copied += chunk;
1746                 size -= chunk;
1747
1748                 /* Mark read part of skb as used */
1749                 if (!(flags & MSG_PEEK))
1750                 {
1751                         skb_pull(skb, chunk);
1752
1753                         if (UNIXCB(skb).fp)
1754                                 unix_detach_fds(siocb->scm, skb);
1755
1756                         /* put the skb back if we didn't use it up.. */
1757                         if (skb->len)
1758                         {
1759                                 skb_queue_head(&sk->sk_receive_queue, skb);
1760                                 break;
1761                         }
1762
1763                         kfree_skb(skb);
1764
1765                         if (siocb->scm->fp)
1766                                 break;
1767                 }
1768                 else
1769                 {
1770                         /* It is questionable, see note in unix_dgram_recvmsg.
1771                          */
1772                         if (UNIXCB(skb).fp)
1773                                 siocb->scm->fp = scm_fp_dup(UNIXCB(skb).fp);
1774
1775                         /* put message back and return */
1776                         skb_queue_head(&sk->sk_receive_queue, skb);
1777                         break;
1778                 }
1779         } while (size);
1780
1781         up(&u->readsem);
1782         scm_recv(sock, msg, siocb->scm, flags);
1783 out:
1784         return copied ? : err;
1785 }
1786
1787 static int unix_shutdown(struct socket *sock, int mode)
1788 {
1789         struct sock *sk = sock->sk;
1790         struct sock *other;
1791
1792         mode = (mode+1)&(RCV_SHUTDOWN|SEND_SHUTDOWN);
1793
1794         if (mode) {
1795                 unix_state_wlock(sk);
1796                 sk->sk_shutdown |= mode;
1797                 other=unix_peer(sk);
1798                 if (other)
1799                         sock_hold(other);
1800                 unix_state_wunlock(sk);
1801                 sk->sk_state_change(sk);
1802
1803                 if (other &&
1804                         (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) {
1805
1806                         int peer_mode = 0;
1807
1808                         if (mode&RCV_SHUTDOWN)
1809                                 peer_mode |= SEND_SHUTDOWN;
1810                         if (mode&SEND_SHUTDOWN)
1811                                 peer_mode |= RCV_SHUTDOWN;
1812                         unix_state_wlock(other);
1813                         other->sk_shutdown |= peer_mode;
1814                         unix_state_wunlock(other);
1815                         other->sk_state_change(other);
1816                         read_lock(&other->sk_callback_lock);
1817                         if (peer_mode == SHUTDOWN_MASK)
1818                                 sk_wake_async(other,1,POLL_HUP);
1819                         else if (peer_mode & RCV_SHUTDOWN)
1820                                 sk_wake_async(other,1,POLL_IN);
1821                         read_unlock(&other->sk_callback_lock);
1822                 }
1823                 if (other)
1824                         sock_put(other);
1825         }
1826         return 0;
1827 }
1828
1829 static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
1830 {
1831         struct sock *sk = sock->sk;
1832         long amount=0;
1833         int err;
1834
1835         switch(cmd)
1836         {
1837                 case SIOCOUTQ:
1838                         amount = atomic_read(&sk->sk_wmem_alloc);
1839                         err = put_user(amount, (int __user *)arg);
1840                         break;
1841                 case SIOCINQ:
1842                 {
1843                         struct sk_buff *skb;
1844
1845                         if (sk->sk_state == TCP_LISTEN) {
1846                                 err = -EINVAL;
1847                                 break;
1848                         }
1849
1850                         spin_lock(&sk->sk_receive_queue.lock);
1851                         if (sk->sk_type == SOCK_STREAM ||
1852                             sk->sk_type == SOCK_SEQPACKET) {
1853                                 skb_queue_walk(&sk->sk_receive_queue, skb)
1854                                         amount += skb->len;
1855                         } else {
1856                                 skb = skb_peek(&sk->sk_receive_queue);
1857                                 if (skb)
1858                                         amount=skb->len;
1859                         }
1860                         spin_unlock(&sk->sk_receive_queue.lock);
1861                         err = put_user(amount, (int __user *)arg);
1862                         break;
1863                 }
1864
1865                 default:
1866                         err = dev_ioctl(cmd, (void __user *)arg);
1867                         break;
1868         }
1869         return err;
1870 }
1871
1872 static unsigned int unix_poll(struct file * file, struct socket *sock, poll_table *wait)
1873 {
1874         struct sock *sk = sock->sk;
1875         unsigned int mask;
1876
1877         poll_wait(file, sk->sk_sleep, wait);
1878         mask = 0;
1879
1880         /* exceptional events? */
1881         if (sk->sk_err)
1882                 mask |= POLLERR;
1883         if (sk->sk_shutdown == SHUTDOWN_MASK)
1884                 mask |= POLLHUP;
1885
1886         /* readable? */
1887         if (!skb_queue_empty(&sk->sk_receive_queue) ||
1888             (sk->sk_shutdown & RCV_SHUTDOWN))
1889                 mask |= POLLIN | POLLRDNORM;
1890
1891         /* Connection-based need to check for termination and startup */
1892         if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) && sk->sk_state == TCP_CLOSE)
1893                 mask |= POLLHUP;
1894
1895         /*
1896          * we set writable also when the other side has shut down the
1897          * connection. This prevents stuck sockets.
1898          */
1899         if (unix_writable(sk))
1900                 mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
1901
1902         return mask;
1903 }
1904
1905
1906 #ifdef CONFIG_PROC_FS
1907 static struct sock *unix_seq_idx(int *iter, loff_t pos)
1908 {
1909         loff_t off = 0;
1910         struct sock *s;
1911
1912         for (s = first_unix_socket(iter); s; s = next_unix_socket(iter, s)) {
1913                 if (off == pos) 
1914                         return s;
1915                 ++off;
1916         }
1917         return NULL;
1918 }
1919
1920
1921 static void *unix_seq_start(struct seq_file *seq, loff_t *pos)
1922 {
1923         read_lock(&unix_table_lock);
1924         return *pos ? unix_seq_idx(seq->private, *pos - 1) : ((void *) 1);
1925 }
1926
1927 static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1928 {
1929         ++*pos;
1930
1931         if (v == (void *)1) 
1932                 return first_unix_socket(seq->private);
1933         return next_unix_socket(seq->private, v);
1934 }
1935
1936 static void unix_seq_stop(struct seq_file *seq, void *v)
1937 {
1938         read_unlock(&unix_table_lock);
1939 }
1940
1941 static int unix_seq_show(struct seq_file *seq, void *v)
1942 {
1943         
1944         if (v == (void *)1)
1945                 seq_puts(seq, "Num       RefCount Protocol Flags    Type St "
1946                          "Inode Path\n");
1947         else {
1948                 struct sock *s = v;
1949                 struct unix_sock *u = unix_sk(s);
1950                 unix_state_rlock(s);
1951
1952                 seq_printf(seq, "%p: %08X %08X %08X %04X %02X %5lu",
1953                         s,
1954                         atomic_read(&s->sk_refcnt),
1955                         0,
1956                         s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0,
1957                         s->sk_type,
1958                         s->sk_socket ?
1959                         (s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) :
1960                         (s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING),
1961                         sock_i_ino(s));
1962
1963                 if (u->addr) {
1964                         int i, len;
1965                         seq_putc(seq, ' ');
1966
1967                         i = 0;
1968                         len = u->addr->len - sizeof(short);
1969                         if (!UNIX_ABSTRACT(s))
1970                                 len--;
1971                         else {
1972                                 seq_putc(seq, '@');
1973                                 i++;
1974                         }
1975                         for ( ; i < len; i++)
1976                                 seq_putc(seq, u->addr->name->sun_path[i]);
1977                 }
1978                 unix_state_runlock(s);
1979                 seq_putc(seq, '\n');
1980         }
1981
1982         return 0;
1983 }
1984
1985 static struct seq_operations unix_seq_ops = {
1986         .start  = unix_seq_start,
1987         .next   = unix_seq_next,
1988         .stop   = unix_seq_stop,
1989         .show   = unix_seq_show,
1990 };
1991
1992
1993 static int unix_seq_open(struct inode *inode, struct file *file)
1994 {
1995         struct seq_file *seq;
1996         int rc = -ENOMEM;
1997         int *iter = kmalloc(sizeof(int), GFP_KERNEL);
1998
1999         if (!iter)
2000                 goto out;
2001
2002         rc = seq_open(file, &unix_seq_ops);
2003         if (rc)
2004                 goto out_kfree;
2005
2006         seq          = file->private_data;
2007         seq->private = iter;
2008         *iter = 0;
2009 out:
2010         return rc;
2011 out_kfree:
2012         kfree(iter);
2013         goto out;
2014 }
2015
2016 static struct file_operations unix_seq_fops = {
2017         .owner          = THIS_MODULE,
2018         .open           = unix_seq_open,
2019         .read           = seq_read,
2020         .llseek         = seq_lseek,
2021         .release        = seq_release_private,
2022 };
2023
2024 #endif
2025
2026 static struct net_proto_family unix_family_ops = {
2027         .family = PF_UNIX,
2028         .create = unix_create,
2029         .owner  = THIS_MODULE,
2030 };
2031
/*
 * sysctl hooks: defined elsewhere when CONFIG_SYSCTL is set, otherwise
 * empty inline stubs so that callers need no #ifdef of their own.
 */
#ifdef CONFIG_SYSCTL
extern void unix_sysctl_register(void);
extern void unix_sysctl_unregister(void);
#else
static inline void unix_sysctl_register(void) {}
static inline void unix_sysctl_unregister(void) {}
#endif
2039
2040 static int __init af_unix_init(void)
2041 {
2042         int rc = -1;
2043         struct sk_buff *dummy_skb;
2044
2045         if (sizeof(struct unix_skb_parms) > sizeof(dummy_skb->cb)) {
2046                 printk(KERN_CRIT "%s: panic\n", __FUNCTION__);
2047                 goto out;
2048         }
2049
2050         rc = proto_register(&unix_proto, 1);
2051         if (rc != 0) {
2052                 printk(KERN_CRIT "%s: Cannot create unix_sock SLAB cache!\n",
2053                        __FUNCTION__);
2054                 goto out;
2055         }
2056
2057         sock_register(&unix_family_ops);
2058 #ifdef CONFIG_PROC_FS
2059         proc_net_fops_create("unix", 0, &unix_seq_fops);
2060 #endif
2061         unix_sysctl_register();
2062 out:
2063         return rc;
2064 }
2065
2066 static void __exit af_unix_exit(void)
2067 {
2068         sock_unregister(PF_UNIX);
2069         unix_sysctl_unregister();
2070         proc_net_remove("unix");
2071         proto_unregister(&unix_proto);
2072 }
2073
2074 module_init(af_unix_init);
2075 module_exit(af_unix_exit);
2076
2077 MODULE_LICENSE("GPL");
2078 MODULE_ALIAS_NETPROTO(PF_UNIX);