Merge to Fedora kernel-2.6.7-1.492
[linux-2.6.git] / net / unix / af_unix.c
1 /*
2  * NET4:        Implementation of BSD Unix domain sockets.
3  *
4  * Authors:     Alan Cox, <alan.cox@linux.org>
5  *
6  *              This program is free software; you can redistribute it and/or
7  *              modify it under the terms of the GNU General Public License
8  *              as published by the Free Software Foundation; either version
9  *              2 of the License, or (at your option) any later version.
10  *
11  * Version:     $Id: af_unix.c,v 1.133 2002/02/08 03:57:19 davem Exp $
12  *
13  * Fixes:
14  *              Linus Torvalds  :       Assorted bug cures.
15  *              Niibe Yutaka    :       async I/O support.
16  *              Carsten Paeth   :       PF_UNIX check, address fixes.
17  *              Alan Cox        :       Limit size of allocated blocks.
18  *              Alan Cox        :       Fixed the stupid socketpair bug.
19  *              Alan Cox        :       BSD compatibility fine tuning.
20  *              Alan Cox        :       Fixed a bug in connect when interrupted.
21  *              Alan Cox        :       Sorted out a proper draft version of
22  *                                      file descriptor passing hacked up from
23  *                                      Mike Shaver's work.
24  *              Marty Leisner   :       Fixes to fd passing
25  *              Nick Nevin      :       recvmsg bugfix.
26  *              Alan Cox        :       Started proper garbage collector
27  *              Heiko EiBfeldt  :       Missing verify_area check
28  *              Alan Cox        :       Started POSIXisms
29  *              Andreas Schwab  :       Replace inode by dentry for proper
30  *                                      reference counting
31  *              Kirk Petersen   :       Made this a module
32  *          Christoph Rohland   :       Elegant non-blocking accept/connect algorithm.
33  *                                      Lots of bug fixes.
34  *           Alexey Kuznetsov   :       Repaired (I hope) bugs introduced
35  *                                      by above two patches.
36  *           Andrea Arcangeli   :       If possible we block in connect(2)
37  *                                      if the max backlog of the listen socket
38  *                                      has been reached. This won't break
39  *                                      old apps and it will avoid a huge number
40  *                                      of socks hashed (this for unix_gc()
41  *                                      performance reasons).
42  *                                      Security fix that limits the max
43  *                                      number of socks to 2*max_files and
44  *                                      the number of skb queueable in the
45  *                                      dgram receiver.
46  *              Artur Skawina   :       Hash function optimizations
47  *           Alexey Kuznetsov   :       Full scale SMP. Lot of bugs are introduced 8)
48  *            Malcolm Beattie   :       Set peercred for socketpair
49  *           Michal Ostrowski   :       Module initialization cleanup.
50  *           Arnaldo C. Melo    :       Remove MOD_{INC,DEC}_USE_COUNT,
51  *                                      the core infrastructure is doing that
52  *                                      for all net proto families now (2.5.69+)
53  *
54  *
55  * Known differences from reference BSD that was tested:
56  *
57  *      [TO FIX]
58  *      ECONNREFUSED is not returned from one end of a connected() socket to the
59  *              other the moment one end closes.
60  *      fstat() doesn't return st_dev=0, and give the blksize as high water mark
61  *              and a fake inode identifier (nor the BSD first socket fstat twice bug).
62  *      [NOT TO FIX]
63  *      accept() returns a path name even if the connecting socket has closed
64  *              in the meantime (BSD loses the path and gives up).
65  *      accept() returns 0 length path for an unbound connector. BSD returns 16
66  *              and a null first byte in the path (but not for gethost/peername - BSD bug ??)
67  *      socketpair(...SOCK_RAW..) doesn't panic the kernel.
68  *      BSD af_unix apparently has connect forgetting to block properly.
69  *              (need to check this with the POSIX spec in detail)
70  *
71  * Differences from 2.0.0-11-... (ANK)
72  *      Bug fixes and improvements.
73  *              - client shutdown killed server socket.
74  *              - removed all useless cli/sti pairs.
75  *
76  *      Semantic changes/extensions.
77  *              - generic control message passing.
78  *              - SCM_CREDENTIALS control message.
79  *              - "Abstract" (not FS based) socket bindings.
80  *                Abstract names are sequences of bytes (not zero terminated)
81  *                started by 0, so that this name space does not intersect
82  *                with BSD names.
83  */
84
85 #include <linux/module.h>
86 #include <linux/config.h>
87 #include <linux/kernel.h>
88 #include <linux/major.h>
89 #include <linux/signal.h>
90 #include <linux/sched.h>
91 #include <linux/errno.h>
92 #include <linux/string.h>
93 #include <linux/stat.h>
94 #include <linux/dcache.h>
95 #include <linux/namei.h>
96 #include <linux/socket.h>
97 #include <linux/un.h>
98 #include <linux/fcntl.h>
99 #include <linux/termios.h>
100 #include <linux/sockios.h>
101 #include <linux/net.h>
102 #include <linux/in.h>
103 #include <linux/fs.h>
104 #include <linux/slab.h>
105 #include <asm/uaccess.h>
106 #include <linux/skbuff.h>
107 #include <linux/netdevice.h>
108 #include <net/sock.h>
109 #include <linux/tcp.h>
110 #include <net/af_unix.h>
111 #include <linux/proc_fs.h>
112 #include <linux/seq_file.h>
113 #include <net/scm.h>
114 #include <linux/init.h>
115 #include <linux/poll.h>
116 #include <linux/smp_lock.h>
117 #include <linux/rtnetlink.h>
118 #include <linux/mount.h>
119 #include <net/checksum.h>
120 #include <linux/security.h>
121
122 #include <linux/vs_context.h>
123 #include <linux/vs_network.h>
124
125
126 int sysctl_unix_max_dgram_qlen = 10;
127
128 kmem_cache_t *unix_sk_cachep;
129
130 struct hlist_head unix_socket_table[UNIX_HASH_SIZE + 1];
131 rwlock_t unix_table_lock = RW_LOCK_UNLOCKED;
132 static atomic_t unix_nr_socks = ATOMIC_INIT(0);
133
134 #define unix_sockets_unbound    (&unix_socket_table[UNIX_HASH_SIZE])
135
136 #define UNIX_ABSTRACT(sk)       (unix_sk(sk)->addr->hash != UNIX_HASH_SIZE)
137
138 /*
139  *  SMP locking strategy:
140  *    hash table is protected with rwlock unix_table_lock
141  *    each socket state is protected by separate rwlock.
142  */
143
144 static inline unsigned unix_hash_fold(unsigned hash)
145 {
146         hash ^= hash>>16;
147         hash ^= hash>>8;
148         return hash&(UNIX_HASH_SIZE-1);
149 }
150
151 #define unix_peer(sk) ((sk)->sk_pair)
152
153 static inline int unix_our_peer(struct sock *sk, struct sock *osk)
154 {
155         return unix_peer(osk) == sk;
156 }
157
158 static inline int unix_may_send(struct sock *sk, struct sock *osk)
159 {
160         return (unix_peer(osk) == NULL || unix_our_peer(sk, osk));
161 }
162
163 static struct sock *unix_peer_get(struct sock *s)
164 {
165         struct sock *peer;
166
167         unix_state_rlock(s);
168         peer = unix_peer(s);
169         if (peer)
170                 sock_hold(peer);
171         unix_state_runlock(s);
172         return peer;
173 }
174
175 static inline void unix_release_addr(struct unix_address *addr)
176 {
177         if (atomic_dec_and_test(&addr->refcnt))
178                 kfree(addr);
179 }
180
181 /*
182  *      Check unix socket name:
183  *              - should be not zero length.
184  *              - if started by not zero, should be NULL terminated (FS object)
185  *              - if started by zero, it is abstract name.
186  */
187  
188 static int unix_mkname(struct sockaddr_un * sunaddr, int len, unsigned *hashp)
189 {
190         if (len <= sizeof(short) || len > sizeof(*sunaddr))
191                 return -EINVAL;
192         if (!sunaddr || sunaddr->sun_family != AF_UNIX)
193                 return -EINVAL;
194         if (sunaddr->sun_path[0])
195         {
196                 /*
197                  *      This may look like an off by one error but it is
198                  *      a bit more subtle. 108 is the longest valid AF_UNIX
199                  *      path for a binding. sun_path[108] doesn't as such
200                  *      exist. However in kernel space we are guaranteed that
201                  *      it is a valid memory location in our kernel
202                  *      address buffer.
203                  */
204                 if (len > sizeof(*sunaddr))
205                         len = sizeof(*sunaddr);
206                 ((char *)sunaddr)[len]=0;
207                 len = strlen(sunaddr->sun_path)+1+sizeof(short);
208                 return len;
209         }
210
211         *hashp = unix_hash_fold(csum_partial((char*)sunaddr, len, 0));
212         return len;
213 }
214
/*
 * Take sk off the socket list it is currently linked on.
 * Locking is left to the caller; in-file callers hold unix_table_lock
 * for writing.
 */
static void __unix_remove_socket(struct sock *sk)
{
	sk_del_node_init(sk);
}
219
/*
 * Link sk onto the given list head. The socket is expected to be
 * unhashed on entry (checked with BUG_TRAP). Locking is left to the
 * caller; in-file callers hold unix_table_lock for writing.
 */
static void __unix_insert_socket(struct hlist_head *list, struct sock *sk)
{
	BUG_TRAP(sk_unhashed(sk));

	sk_add_node(sk, list);
}
225
226 static inline void unix_remove_socket(struct sock *sk)
227 {
228         write_lock(&unix_table_lock);
229         __unix_remove_socket(sk);
230         write_unlock(&unix_table_lock);
231 }
232
233 static inline void unix_insert_socket(struct hlist_head *list, struct sock *sk)
234 {
235         write_lock(&unix_table_lock);
236         __unix_insert_socket(list, sk);
237         write_unlock(&unix_table_lock);
238 }
239
240 static struct sock *__unix_find_socket_byname(struct sockaddr_un *sunname,
241                                               int len, int type, unsigned hash)
242 {
243         struct sock *s;
244         struct hlist_node *node;
245
246         sk_for_each(s, node, &unix_socket_table[hash ^ type]) {
247                 struct unix_sock *u = unix_sk(s);
248
249                 if (u->addr->len == len &&
250                     !memcmp(u->addr->name, sunname, len))
251                         goto found;
252         }
253         s = NULL;
254 found:
255         return s;
256 }
257
258 static inline struct sock *unix_find_socket_byname(struct sockaddr_un *sunname,
259                                                    int len, int type,
260                                                    unsigned hash)
261 {
262         struct sock *s;
263
264         read_lock(&unix_table_lock);
265         s = __unix_find_socket_byname(sunname, len, type, hash);
266         if (s)
267                 sock_hold(s);
268         read_unlock(&unix_table_lock);
269         return s;
270 }
271
272 static struct sock *unix_find_socket_byinode(struct inode *i)
273 {
274         struct sock *s;
275         struct hlist_node *node;
276
277         read_lock(&unix_table_lock);
278         sk_for_each(s, node,
279                     &unix_socket_table[i->i_ino & (UNIX_HASH_SIZE - 1)]) {
280                 struct dentry *dentry = unix_sk(s)->dentry;
281
282                 if(dentry && dentry->d_inode == i)
283                 {
284                         sock_hold(s);
285                         goto found;
286                 }
287         }
288         s = NULL;
289 found:
290         read_unlock(&unix_table_lock);
291         return s;
292 }
293
294 static inline int unix_writable(struct sock *sk)
295 {
296         return (atomic_read(&sk->sk_wmem_alloc) << 2) <= sk->sk_sndbuf;
297 }
298
299 static void unix_write_space(struct sock *sk)
300 {
301         read_lock(&sk->sk_callback_lock);
302         if (unix_writable(sk)) {
303                 if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
304                         wake_up_interruptible(sk->sk_sleep);
305                 sk_wake_async(sk, 2, POLL_OUT);
306         }
307         read_unlock(&sk->sk_callback_lock);
308 }
309
310 /* When dgram socket disconnects (or changes its peer), we clear its receive
311  * queue of packets arrived from previous peer. First, it allows to do
312  * flow control based only on wmem_alloc; second, sk connected to peer
313  * may receive messages only from that peer. */
314 static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
315 {
316         if (skb_queue_len(&sk->sk_receive_queue)) {
317                 skb_queue_purge(&sk->sk_receive_queue);
318                 wake_up_interruptible_all(&unix_sk(sk)->peer_wait);
319
320                 /* If one link of bidirectional dgram pipe is disconnected,
321                  * we signal error. Messages are lost. Do not make this,
322                  * when peer was not connected to us.
323                  */
324                 if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) {
325                         other->sk_err = ECONNRESET;
326                         other->sk_error_report(other);
327                 }
328         }
329 }
330
331 static void unix_sock_destructor(struct sock *sk)
332 {
333         struct unix_sock *u = unix_sk(sk);
334
335         skb_queue_purge(&sk->sk_receive_queue);
336
337         BUG_TRAP(!atomic_read(&sk->sk_wmem_alloc));
338         BUG_TRAP(sk_unhashed(sk));
339         BUG_TRAP(!sk->sk_socket);
340         if (!sock_flag(sk, SOCK_DEAD)) {
341                 printk("Attempt to release alive unix socket: %p\n", sk);
342                 return;
343         }
344
345         if (u->addr)
346                 unix_release_addr(u->addr);
347
348         atomic_dec(&unix_nr_socks);
349 #ifdef UNIX_REFCNT_DEBUG
350         printk(KERN_DEBUG "UNIX %p is destroyed, %d are still alive.\n", sk, atomic_read(&unix_nr_socks));
351 #endif
352 }
353
354 static int unix_release_sock (struct sock *sk, int embrion)
355 {
356         struct unix_sock *u = unix_sk(sk);
357         struct dentry *dentry;
358         struct vfsmount *mnt;
359         struct sock *skpair;
360         struct sk_buff *skb;
361         int state;
362
363         unix_remove_socket(sk);
364
365         /* Clear state */
366         unix_state_wlock(sk);
367         sock_orphan(sk);
368         sk->sk_shutdown = SHUTDOWN_MASK;
369         dentry       = u->dentry;
370         u->dentry    = NULL;
371         mnt          = u->mnt;
372         u->mnt       = NULL;
373         state = sk->sk_state;
374         sk->sk_state = TCP_CLOSE;
375         unix_state_wunlock(sk);
376
377         wake_up_interruptible_all(&u->peer_wait);
378
379         skpair=unix_peer(sk);
380
381         if (skpair!=NULL) {
382                 if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) {
383                         unix_state_wlock(skpair);
384                         /* No more writes */
385                         skpair->sk_shutdown = SHUTDOWN_MASK;
386                         if (!skb_queue_empty(&sk->sk_receive_queue) || embrion)
387                                 skpair->sk_err = ECONNRESET;
388                         unix_state_wunlock(skpair);
389                         skpair->sk_state_change(skpair);
390                         read_lock(&skpair->sk_callback_lock);
391                         sk_wake_async(skpair,1,POLL_HUP);
392                         read_unlock(&skpair->sk_callback_lock);
393                 }
394                 sock_put(skpair); /* It may now die */
395                 unix_peer(sk) = NULL;
396         }
397
398         /* Try to flush out this socket. Throw out buffers at least */
399
400         while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
401                 if (state==TCP_LISTEN)
402                         unix_release_sock(skb->sk, 1);
403                 /* passed fds are erased in the kfree_skb hook        */
404                 kfree_skb(skb);
405         }
406
407         if (dentry) {
408                 dput(dentry);
409                 mntput(mnt);
410         }
411
412         clr_vx_info(&sk->sk_vx_info);
413         clr_nx_info(&sk->sk_nx_info);
414         sock_put(sk);
415
416         /* ---- Socket is dead now and most probably destroyed ---- */
417
418         /*
419          * Fixme: BSD difference: In BSD all sockets connected to use get
420          *        ECONNRESET and we die on the spot. In Linux we behave
421          *        like files and pipes do and wait for the last
422          *        dereference.
423          *
424          * Can't we simply set sock->err?
425          *
426          *        What the above comment does talk about? --ANK(980817)
427          */
428
429         if (atomic_read(&unix_tot_inflight))
430                 unix_gc();              /* Garbage collect fds */       
431
432         return 0;
433 }
434
435 static int unix_listen(struct socket *sock, int backlog)
436 {
437         int err;
438         struct sock *sk = sock->sk;
439         struct unix_sock *u = unix_sk(sk);
440
441         err = -EOPNOTSUPP;
442         if (sock->type!=SOCK_STREAM && sock->type!=SOCK_SEQPACKET)
443                 goto out;                       /* Only stream/seqpacket sockets accept */
444         err = -EINVAL;
445         if (!u->addr)
446                 goto out;                       /* No listens on an unbound socket */
447         unix_state_wlock(sk);
448         if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN)
449                 goto out_unlock;
450         if (backlog > sk->sk_max_ack_backlog)
451                 wake_up_interruptible_all(&u->peer_wait);
452         sk->sk_max_ack_backlog  = backlog;
453         sk->sk_state            = TCP_LISTEN;
454         /* set credentials so connect can copy them */
455         sk->sk_peercred.pid     = current->tgid;
456         sk->sk_peercred.uid     = current->euid;
457         sk->sk_peercred.gid     = current->egid;
458         err = 0;
459
460 out_unlock:
461         unix_state_wunlock(sk);
462 out:
463         return err;
464 }
465
466 static int unix_release(struct socket *);
467 static int unix_bind(struct socket *, struct sockaddr *, int);
468 static int unix_stream_connect(struct socket *, struct sockaddr *,
469                                int addr_len, int flags);
470 static int unix_socketpair(struct socket *, struct socket *);
471 static int unix_accept(struct socket *, struct socket *, int);
472 static int unix_getname(struct socket *, struct sockaddr *, int *, int);
473 static unsigned int unix_poll(struct file *, struct socket *, poll_table *);
474 static int unix_ioctl(struct socket *, unsigned int, unsigned long);
475 static int unix_shutdown(struct socket *, int);
476 static int unix_stream_sendmsg(struct kiocb *, struct socket *,
477                                struct msghdr *, size_t);
478 static int unix_stream_recvmsg(struct kiocb *, struct socket *,
479                                struct msghdr *, size_t, int);
480 static int unix_dgram_sendmsg(struct kiocb *, struct socket *,
481                               struct msghdr *, size_t);
482 static int unix_dgram_recvmsg(struct kiocb *, struct socket *,
483                               struct msghdr *, size_t, int);
484 static int unix_dgram_connect(struct socket *, struct sockaddr *,
485                               int, int);
486
487 static struct proto_ops unix_stream_ops = {
488         .family =       PF_UNIX,
489         .owner =        THIS_MODULE,
490         .release =      unix_release,
491         .bind =         unix_bind,
492         .connect =      unix_stream_connect,
493         .socketpair =   unix_socketpair,
494         .accept =       unix_accept,
495         .getname =      unix_getname,
496         .poll =         unix_poll,
497         .ioctl =        unix_ioctl,
498         .listen =       unix_listen,
499         .shutdown =     unix_shutdown,
500         .setsockopt =   sock_no_setsockopt,
501         .getsockopt =   sock_no_getsockopt,
502         .sendmsg =      unix_stream_sendmsg,
503         .recvmsg =      unix_stream_recvmsg,
504         .mmap =         sock_no_mmap,
505         .sendpage =     sock_no_sendpage,
506 };
507
508 static struct proto_ops unix_dgram_ops = {
509         .family =       PF_UNIX,
510         .owner =        THIS_MODULE,
511         .release =      unix_release,
512         .bind =         unix_bind,
513         .connect =      unix_dgram_connect,
514         .socketpair =   unix_socketpair,
515         .accept =       sock_no_accept,
516         .getname =      unix_getname,
517         .poll =         datagram_poll,
518         .ioctl =        unix_ioctl,
519         .listen =       sock_no_listen,
520         .shutdown =     unix_shutdown,
521         .setsockopt =   sock_no_setsockopt,
522         .getsockopt =   sock_no_getsockopt,
523         .sendmsg =      unix_dgram_sendmsg,
524         .recvmsg =      unix_dgram_recvmsg,
525         .mmap =         sock_no_mmap,
526         .sendpage =     sock_no_sendpage,
527 };
528
529 static struct proto_ops unix_seqpacket_ops = {
530         .family =       PF_UNIX,
531         .owner =        THIS_MODULE,
532         .release =      unix_release,
533         .bind =         unix_bind,
534         .connect =      unix_stream_connect,
535         .socketpair =   unix_socketpair,
536         .accept =       unix_accept,
537         .getname =      unix_getname,
538         .poll =         datagram_poll,
539         .ioctl =        unix_ioctl,
540         .listen =       unix_listen,
541         .shutdown =     unix_shutdown,
542         .setsockopt =   sock_no_setsockopt,
543         .getsockopt =   sock_no_getsockopt,
544         .sendmsg =      unix_dgram_sendmsg,
545         .recvmsg =      unix_dgram_recvmsg,
546         .mmap =         sock_no_mmap,
547         .sendpage =     sock_no_sendpage,
548 };
549
550 static struct sock * unix_create1(struct socket *sock)
551 {
552         struct sock *sk = NULL;
553         struct unix_sock *u;
554
555         if (atomic_read(&unix_nr_socks) >= 2*files_stat.max_files)
556                 goto out;
557
558         sk = sk_alloc(PF_UNIX, GFP_KERNEL, sizeof(struct unix_sock),
559                       unix_sk_cachep);
560         if (!sk)
561                 goto out;
562
563         atomic_inc(&unix_nr_socks);
564
565         sock_init_data(sock,sk);
566         sk_set_owner(sk, THIS_MODULE);
567
568         set_vx_info(&sk->sk_vx_info, current->vx_info);
569         set_nx_info(&sk->sk_nx_info, current->nx_info);
570         sk->sk_xid = vx_current_xid();
571
572         sk->sk_write_space      = unix_write_space;
573         sk->sk_max_ack_backlog  = sysctl_unix_max_dgram_qlen;
574         sk->sk_destruct         = unix_sock_destructor;
575         u         = unix_sk(sk);
576         u->dentry = NULL;
577         u->mnt    = NULL;
578         rwlock_init(&u->lock);
579         atomic_set(&u->inflight, sock ? 0 : -1);
580         init_MUTEX(&u->readsem); /* single task reading lock */
581         init_waitqueue_head(&u->peer_wait);
582         unix_insert_socket(unix_sockets_unbound, sk);
583 out:
584         return sk;
585 }
586
587 static int unix_create(struct socket *sock, int protocol)
588 {
589         if (protocol && protocol != PF_UNIX)
590                 return -EPROTONOSUPPORT;
591
592         sock->state = SS_UNCONNECTED;
593
594         switch (sock->type) {
595         case SOCK_STREAM:
596                 sock->ops = &unix_stream_ops;
597                 break;
598                 /*
599                  *      Believe it or not BSD has AF_UNIX, SOCK_RAW though
600                  *      nothing uses it.
601                  */
602         case SOCK_RAW:
603                 sock->type=SOCK_DGRAM;
604         case SOCK_DGRAM:
605                 sock->ops = &unix_dgram_ops;
606                 break;
607         case SOCK_SEQPACKET:
608                 sock->ops = &unix_seqpacket_ops;
609                 break;
610         default:
611                 return -ESOCKTNOSUPPORT;
612         }
613
614         return unix_create1(sock) ? 0 : -ENOMEM;
615 }
616
617 static int unix_release(struct socket *sock)
618 {
619         struct sock *sk = sock->sk;
620
621         if (!sk)
622                 return 0;
623
624         sock->sk = NULL;
625
626         return unix_release_sock (sk, 0);
627 }
628
629 static int unix_autobind(struct socket *sock)
630 {
631         struct sock *sk = sock->sk;
632         struct unix_sock *u = unix_sk(sk);
633         static u32 ordernum = 1;
634         struct unix_address * addr;
635         int err;
636
637         down(&u->readsem);
638
639         err = 0;
640         if (u->addr)
641                 goto out;
642
643         err = -ENOMEM;
644         addr = kmalloc(sizeof(*addr) + sizeof(short) + 16, GFP_KERNEL);
645         if (!addr)
646                 goto out;
647
648         memset(addr, 0, sizeof(*addr) + sizeof(short) + 16);
649         addr->name->sun_family = AF_UNIX;
650         atomic_set(&addr->refcnt, 1);
651
652 retry:
653         addr->len = sprintf(addr->name->sun_path+1, "%05x", ordernum) + 1 + sizeof(short);
654         addr->hash = unix_hash_fold(csum_partial((void*)addr->name, addr->len, 0));
655
656         write_lock(&unix_table_lock);
657         ordernum = (ordernum+1)&0xFFFFF;
658
659         if (__unix_find_socket_byname(addr->name, addr->len, sock->type,
660                                       addr->hash)) {
661                 write_unlock(&unix_table_lock);
662                 /* Sanity yield. It is unusual case, but yet... */
663                 if (!(ordernum&0xFF))
664                         yield();
665                 goto retry;
666         }
667         addr->hash ^= sk->sk_type;
668
669         __unix_remove_socket(sk);
670         u->addr = addr;
671         __unix_insert_socket(&unix_socket_table[addr->hash], sk);
672         write_unlock(&unix_table_lock);
673         err = 0;
674
675 out:    up(&u->readsem);
676         return err;
677 }
678
679 static struct sock *unix_find_other(struct sockaddr_un *sunname, int len,
680                                     int type, unsigned hash, int *error)
681 {
682         struct sock *u;
683         struct nameidata nd;
684         int err = 0;
685         
686         if (sunname->sun_path[0]) {
687                 err = path_lookup(sunname->sun_path, LOOKUP_FOLLOW, &nd);
688                 if (err)
689                         goto fail;
690                 err = permission(nd.dentry->d_inode,MAY_WRITE, &nd);
691                 if (err)
692                         goto put_fail;
693
694                 err = -ECONNREFUSED;
695                 if (!S_ISSOCK(nd.dentry->d_inode->i_mode))
696                         goto put_fail;
697                 u=unix_find_socket_byinode(nd.dentry->d_inode);
698                 if (!u)
699                         goto put_fail;
700
701                 if (u->sk_type == type)
702                         touch_atime(nd.mnt, nd.dentry);
703
704                 path_release(&nd);
705
706                 err=-EPROTOTYPE;
707                 if (u->sk_type != type) {
708                         sock_put(u);
709                         goto fail;
710                 }
711         } else {
712                 err = -ECONNREFUSED;
713                 u=unix_find_socket_byname(sunname, len, type, hash);
714                 if (u) {
715                         struct dentry *dentry;
716                         dentry = unix_sk(u)->dentry;
717                         if (dentry)
718                                 touch_atime(unix_sk(u)->mnt, dentry);
719                 } else
720                         goto fail;
721         }
722         return u;
723
724 put_fail:
725         path_release(&nd);
726 fail:
727         *error=err;
728         return NULL;
729 }
730
731
732 static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
733 {
734         struct sock *sk = sock->sk;
735         struct unix_sock *u = unix_sk(sk);
736         struct sockaddr_un *sunaddr=(struct sockaddr_un *)uaddr;
737         struct dentry * dentry = NULL;
738         struct nameidata nd;
739         int err;
740         unsigned hash;
741         struct unix_address *addr;
742         struct hlist_head *list;
743
744         err = -EINVAL;
745         if (sunaddr->sun_family != AF_UNIX)
746                 goto out;
747
748         if (addr_len==sizeof(short)) {
749                 err = unix_autobind(sock);
750                 goto out;
751         }
752
753         err = unix_mkname(sunaddr, addr_len, &hash);
754         if (err < 0)
755                 goto out;
756         addr_len = err;
757
758         down(&u->readsem);
759
760         err = -EINVAL;
761         if (u->addr)
762                 goto out_up;
763
764         err = -ENOMEM;
765         addr = kmalloc(sizeof(*addr)+addr_len, GFP_KERNEL);
766         if (!addr)
767                 goto out_up;
768
769         memcpy(addr->name, sunaddr, addr_len);
770         addr->len = addr_len;
771         addr->hash = hash ^ sk->sk_type;
772         atomic_set(&addr->refcnt, 1);
773
774         if (sunaddr->sun_path[0]) {
775                 unsigned int mode;
776                 err = 0;
777                 /*
778                  * Get the parent directory, calculate the hash for last
779                  * component.
780                  */
781                 err = path_lookup(sunaddr->sun_path, LOOKUP_PARENT, &nd);
782                 if (err)
783                         goto out_mknod_parent;
784                 /*
785                  * Yucky last component or no last component at all?
786                  * (foo/., foo/.., /////)
787                  */
788                 err = -EEXIST;
789                 if (nd.last_type != LAST_NORM)
790                         goto out_mknod;
791                 /*
792                  * Lock the directory.
793                  */
794                 down(&nd.dentry->d_inode->i_sem);
795                 /*
796                  * Do the final lookup.
797                  */
798                 dentry = lookup_hash(&nd.last, nd.dentry);
799                 err = PTR_ERR(dentry);
800                 if (IS_ERR(dentry))
801                         goto out_mknod_unlock;
802                 err = -ENOENT;
803                 /*
804                  * Special case - lookup gave negative, but... we had foo/bar/
805                  * From the vfs_mknod() POV we just have a negative dentry -
806                  * all is fine. Let's be bastards - you had / on the end, you've
807                  * been asking for (non-existent) directory. -ENOENT for you.
808                  */
809                 if (nd.last.name[nd.last.len] && !dentry->d_inode)
810                         goto out_mknod_dput;
811                 /*
812                  * All right, let's create it.
813                  */
814                 mode = S_IFSOCK |
815                        (SOCK_INODE(sock)->i_mode & ~current->fs->umask);
816                 err = vfs_mknod(nd.dentry->d_inode, dentry, mode, 0);
817                 if (err)
818                         goto out_mknod_dput;
819                 up(&nd.dentry->d_inode->i_sem);
820                 dput(nd.dentry);
821                 nd.dentry = dentry;
822
823                 addr->hash = UNIX_HASH_SIZE;
824         }
825
826         write_lock(&unix_table_lock);
827
828         if (!sunaddr->sun_path[0]) {
829                 err = -EADDRINUSE;
830                 if (__unix_find_socket_byname(sunaddr, addr_len,
831                                               sk->sk_type, hash)) {
832                         unix_release_addr(addr);
833                         goto out_unlock;
834                 }
835
836                 list = &unix_socket_table[addr->hash];
837         } else {
838                 list = &unix_socket_table[dentry->d_inode->i_ino & (UNIX_HASH_SIZE-1)];
839                 u->dentry = nd.dentry;
840                 u->mnt    = nd.mnt;
841         }
842
843         err = 0;
844         __unix_remove_socket(sk);
845         u->addr = addr;
846         __unix_insert_socket(list, sk);
847
848 out_unlock:
849         write_unlock(&unix_table_lock);
850 out_up:
851         up(&u->readsem);
852 out:
853         return err;
854
855 out_mknod_dput:
856         dput(dentry);
857 out_mknod_unlock:
858         up(&nd.dentry->d_inode->i_sem);
859 out_mknod:
860         path_release(&nd);
861 out_mknod_parent:
862         if (err==-EEXIST)
863                 err=-EADDRINUSE;
864         unix_release_addr(addr);
865         goto out_up;
866 }
867
868 static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
869                               int alen, int flags)
870 {
871         struct sock *sk = sock->sk;
872         struct sockaddr_un *sunaddr=(struct sockaddr_un*)addr;
873         struct sock *other;
874         unsigned hash;
875         int err;
876
877         if (addr->sa_family != AF_UNSPEC) {
878                 err = unix_mkname(sunaddr, alen, &hash);
879                 if (err < 0)
880                         goto out;
881                 alen = err;
882
883                 if (test_bit(SOCK_PASS_CRED, &sock->flags) && !unix_sk(sk)->addr &&
884                     (err = unix_autobind(sock)) != 0)
885                         goto out;
886
887                 other=unix_find_other(sunaddr, alen, sock->type, hash, &err);
888                 if (!other)
889                         goto out;
890
891                 unix_state_wlock(sk);
892
893                 err = -EPERM;
894                 if (!unix_may_send(sk, other))
895                         goto out_unlock;
896
897                 err = security_unix_may_send(sk->sk_socket, other->sk_socket);
898                 if (err)
899                         goto out_unlock;
900
901         } else {
902                 /*
903                  *      1003.1g breaking connected state with AF_UNSPEC
904                  */
905                 other = NULL;
906                 unix_state_wlock(sk);
907         }
908
909         /*
910          * If it was connected, reconnect.
911          */
912         if (unix_peer(sk)) {
913                 struct sock *old_peer = unix_peer(sk);
914                 unix_peer(sk)=other;
915                 unix_state_wunlock(sk);
916
917                 if (other != old_peer)
918                         unix_dgram_disconnected(sk, old_peer);
919                 sock_put(old_peer);
920         } else {
921                 unix_peer(sk)=other;
922                 unix_state_wunlock(sk);
923         }
924         return 0;
925
926 out_unlock:
927         unix_state_wunlock(sk);
928         sock_put(other);
929 out:
930         return err;
931 }
932
933 static long unix_wait_for_peer(struct sock *other, long timeo)
934 {
935         struct unix_sock *u = unix_sk(other);
936         int sched;
937         DEFINE_WAIT(wait);
938
939         prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE);
940
941         sched = !sock_flag(other, SOCK_DEAD) &&
942                 !(other->sk_shutdown & RCV_SHUTDOWN) &&
943                 (skb_queue_len(&other->sk_receive_queue) >
944                  other->sk_max_ack_backlog);
945
946         unix_state_runlock(other);
947
948         if (sched)
949                 timeo = schedule_timeout(timeo);
950
951         finish_wait(&u->peer_wait, &wait);
952         return timeo;
953 }
954
/*
 *      Connect a stream socket to a listening socket named by uaddr.
 *
 *      A new sock (newsk) is created up front to act as the far end of
 *      the connection; it is handed to the listener by queueing a
 *      one-byte skb, owned by newsk, on the listener's receive queue.
 *
 *      Returns 0 on success or a negative error.
 */
static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
                               int addr_len, int flags)
{
        struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
        struct sock *sk = sock->sk;
        struct unix_sock *u = unix_sk(sk);
        struct unix_sock *newu;
        struct unix_sock *otheru;
        struct sock *newsk = NULL;
        struct sock *other = NULL;
        struct sk_buff *skb = NULL;
        unsigned hash;
        int st;
        int err;
        long timeo;

        err = unix_mkname(sunaddr, addr_len, &hash);
        if (err < 0)
                goto out;
        addr_len = err;

        /* Autobind only when credentials are passed and we are unbound. */
        if (test_bit(SOCK_PASS_CRED, &sock->flags) && !u->addr) {
                err = unix_autobind(sock);
                if (err != 0)
                        goto out;
        }

        timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);

        /*
         * Allocate everything first.  Doing it after the state is locked
         * would force us to recheck everything again anyway.
         */
        err = -ENOMEM;

        /* The sock that will become the accepted end of the connection. */
        newsk = unix_create1(NULL);
        if (!newsk)
                goto out;

        /* The skb that carries newsk to the listening sock. */
        skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
        if (!skb)
                goto out;

restart:
        /* Find the listening sock. */
        other = unix_find_other(sunaddr, addr_len, sk->sk_type, hash, &err);
        if (!other)
                goto out;

        /* Latch the state of the peer. */
        unix_state_rlock(other);

        /* Apparently the VFS overslept the socket's death.  Retry. */
        if (sock_flag(other, SOCK_DEAD)) {
                unix_state_runlock(other);
                sock_put(other);
                goto restart;
        }

        err = -ECONNREFUSED;
        if (other->sk_state != TCP_LISTEN)
                goto out_unlock;

        if (skb_queue_len(&other->sk_receive_queue) >
            other->sk_max_ack_backlog) {
                err = -EAGAIN;
                if (!timeo)
                        goto out_unlock;

                /* unix_wait_for_peer() drops the read lock on other. */
                timeo = unix_wait_for_peer(other, timeo);

                err = sock_intr_errno(timeo);
                if (signal_pending(current))
                        goto out;
                sock_put(other);
                goto restart;
        }

        /*
         * Latch our own state.
         *
         * This is a tricky place: we need the write lock on sk while
         * still holding the lock on the peer, which invites deadlock.
         * Connect-to-self and simultaneous connect attempts are ruled
         * out by looking at the socket state: other is TCP_LISTEN, and
         * if sk were TCP_LISTEN we bail out before taking the lock.
         *
         * The state has to be rechecked once the socket is locked.
         */
        st = sk->sk_state;

        if (st == TCP_ESTABLISHED) {
                /* Socket is already connected. */
                err = -EISCONN;
                goto out_unlock;
        }
        if (st != TCP_CLOSE) {
                err = -EINVAL;
                goto out_unlock;
        }
        /* TCP_CLOSE: fine, carry on with the connect. */

        unix_state_wlock(sk);

        if (sk->sk_state != st) {
                unix_state_wunlock(sk);
                unix_state_runlock(other);
                sock_put(other);
                goto restart;
        }

        err = security_unix_stream_connect(sock, other->sk_socket, newsk);
        if (err) {
                unix_state_wunlock(sk);
                goto out_unlock;
        }

        /* The way is open.  Quickly fill in all the necessary fields. */

        sock_hold(sk);
        unix_peer(newsk) = sk;
        newsk->sk_state = TCP_ESTABLISHED;
        newsk->sk_type = sk->sk_type;
        newsk->sk_peercred.pid = current->tgid;
        newsk->sk_peercred.uid = current->euid;
        newsk->sk_peercred.gid = current->egid;
        newu = unix_sk(newsk);
        newsk->sk_sleep = &newu->peer_wait;
        otheru = unix_sk(other);

        /* Copy address information from the listening sock to the new one. */
        if (otheru->addr) {
                atomic_inc(&otheru->addr->refcnt);
                newu->addr = otheru->addr;
        }
        if (otheru->dentry) {
                newu->dentry = dget(otheru->dentry);
                newu->mnt = mntget(otheru->mnt);
        }

        /* Set credentials. */
        sk->sk_peercred = other->sk_peercred;

        sock_hold(newsk);
        unix_peer(sk) = newsk;
        sock->state = SS_CONNECTED;
        sk->sk_state = TCP_ESTABLISHED;

        unix_state_wunlock(sk);

        /* Queue the skb on the listening sock to announce the connection. */
        spin_lock(&other->sk_receive_queue.lock);
        __skb_queue_tail(&other->sk_receive_queue, skb);
        /*
         * Undo the artificially decreased inflight count now that the
         * embryo is installed on the listening socket.
         */
        atomic_inc(&newu->inflight);
        spin_unlock(&other->sk_receive_queue.lock);
        unix_state_runlock(other);
        other->sk_data_ready(other, 0);
        sock_put(other);
        return 0;

out_unlock:
        if (other)
                unix_state_runlock(other);

out:
        if (skb)
                kfree_skb(skb);
        if (newsk)
                unix_release_sock(newsk, 0);
        if (other)
                sock_put(other);
        return err;
}
1131
1132 static int unix_socketpair(struct socket *socka, struct socket *sockb)
1133 {
1134         struct sock *ska=socka->sk, *skb = sockb->sk;
1135
1136         /* Join our sockets back to back */
1137         sock_hold(ska);
1138         sock_hold(skb);
1139         unix_peer(ska)=skb;
1140         unix_peer(skb)=ska;
1141         ska->sk_peercred.pid = skb->sk_peercred.pid = current->tgid;
1142         ska->sk_peercred.uid = skb->sk_peercred.uid = current->euid;
1143         ska->sk_peercred.gid = skb->sk_peercred.gid = current->egid;
1144
1145         if (ska->sk_type != SOCK_DGRAM) {
1146                 ska->sk_state = TCP_ESTABLISHED;
1147                 skb->sk_state = TCP_ESTABLISHED;
1148                 socka->state  = SS_CONNECTED;
1149                 sockb->state  = SS_CONNECTED;
1150         }
1151         return 0;
1152 }
1153
1154 static int unix_accept(struct socket *sock, struct socket *newsock, int flags)
1155 {
1156         struct sock *sk = sock->sk;
1157         struct sock *tsk;
1158         struct sk_buff *skb;
1159         int err;
1160
1161         err = -EOPNOTSUPP;
1162         if (sock->type!=SOCK_STREAM && sock->type!=SOCK_SEQPACKET)
1163                 goto out;
1164
1165         err = -EINVAL;
1166         if (sk->sk_state != TCP_LISTEN)
1167                 goto out;
1168
1169         /* If socket state is TCP_LISTEN it cannot change (for now...),
1170          * so that no locks are necessary.
1171          */
1172
1173         skb = skb_recv_datagram(sk, 0, flags&O_NONBLOCK, &err);
1174         if (!skb) {
1175                 /* This means receive shutdown. */
1176                 if (err == 0)
1177                         err = -EINVAL;
1178                 goto out;
1179         }
1180
1181         tsk = skb->sk;
1182         skb_free_datagram(sk, skb);
1183         wake_up_interruptible(&unix_sk(sk)->peer_wait);
1184
1185         /* attach accepted sock to socket */
1186         unix_state_wlock(tsk);
1187         newsock->state = SS_CONNECTED;
1188         sock_graft(tsk, newsock);
1189         unix_state_wunlock(tsk);
1190         return 0;
1191
1192 out:
1193         return err;
1194 }
1195
1196
1197 static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int *uaddr_len, int peer)
1198 {
1199         struct sock *sk = sock->sk;
1200         struct unix_sock *u;
1201         struct sockaddr_un *sunaddr=(struct sockaddr_un *)uaddr;
1202         int err = 0;
1203
1204         if (peer) {
1205                 sk = unix_peer_get(sk);
1206
1207                 err = -ENOTCONN;
1208                 if (!sk)
1209                         goto out;
1210                 err = 0;
1211         } else {
1212                 sock_hold(sk);
1213         }
1214
1215         u = unix_sk(sk);
1216         unix_state_rlock(sk);
1217         if (!u->addr) {
1218                 sunaddr->sun_family = AF_UNIX;
1219                 sunaddr->sun_path[0] = 0;
1220                 *uaddr_len = sizeof(short);
1221         } else {
1222                 struct unix_address *addr = u->addr;
1223
1224                 *uaddr_len = addr->len;
1225                 memcpy(sunaddr, addr->name, *uaddr_len);
1226         }
1227         unix_state_runlock(sk);
1228         sock_put(sk);
1229 out:
1230         return err;
1231 }
1232
1233 static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb)
1234 {
1235         int i;
1236
1237         scm->fp = UNIXCB(skb).fp;
1238         skb->destructor = sock_wfree;
1239         UNIXCB(skb).fp = NULL;
1240
1241         for (i=scm->fp->count-1; i>=0; i--)
1242                 unix_notinflight(scm->fp->fp[i]);
1243 }
1244
1245 static void unix_destruct_fds(struct sk_buff *skb)
1246 {
1247         struct scm_cookie scm;
1248         memset(&scm, 0, sizeof(scm));
1249         unix_detach_fds(&scm, skb);
1250
1251         /* Alas, it calls VFS */
1252         /* So fscking what? fput() had been SMP-safe since the last Summer */
1253         scm_destroy(&scm);
1254         sock_wfree(skb);
1255 }
1256
1257 static void unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
1258 {
1259         int i;
1260         for (i=scm->fp->count-1; i>=0; i--)
1261                 unix_inflight(scm->fp->fp[i]);
1262         UNIXCB(skb).fp = scm->fp;
1263         skb->destructor = unix_destruct_fds;
1264         scm->fp = NULL;
1265 }
1266
1267 /*
1268  *      Send AF_UNIX data.
1269  */
1270
1271 static int unix_dgram_sendmsg(struct kiocb *kiocb, struct socket *sock,
1272                               struct msghdr *msg, size_t len)
1273 {
1274         struct sock_iocb *siocb = kiocb_to_siocb(kiocb);
1275         struct sock *sk = sock->sk;
1276         struct unix_sock *u = unix_sk(sk);
1277         struct sockaddr_un *sunaddr=msg->msg_name;
1278         struct sock *other = NULL;
1279         int namelen = 0; /* fake GCC */
1280         int err;
1281         unsigned hash;
1282         struct sk_buff *skb;
1283         long timeo;
1284         struct scm_cookie tmp_scm;
1285
1286         if (NULL == siocb->scm)
1287                 siocb->scm = &tmp_scm;
1288         err = scm_send(sock, msg, siocb->scm);
1289         if (err < 0)
1290                 return err;
1291
1292         err = -EOPNOTSUPP;
1293         if (msg->msg_flags&MSG_OOB)
1294                 goto out;
1295
1296         if (msg->msg_namelen) {
1297                 err = unix_mkname(sunaddr, msg->msg_namelen, &hash);
1298                 if (err < 0)
1299                         goto out;
1300                 namelen = err;
1301         } else {
1302                 sunaddr = NULL;
1303                 err = -ENOTCONN;
1304                 other = unix_peer_get(sk);
1305                 if (!other)
1306                         goto out;
1307         }
1308
1309         if (test_bit(SOCK_PASS_CRED, &sock->flags)
1310                 && !u->addr && (err = unix_autobind(sock)) != 0)
1311                 goto out;
1312
1313         err = -EMSGSIZE;
1314         if (len > sk->sk_sndbuf - 32)
1315                 goto out;
1316
1317         skb = sock_alloc_send_skb(sk, len, msg->msg_flags&MSG_DONTWAIT, &err);
1318         if (skb==NULL)
1319                 goto out;
1320
1321         memcpy(UNIXCREDS(skb), &siocb->scm->creds, sizeof(struct ucred));
1322         if (siocb->scm->fp)
1323                 unix_attach_fds(siocb->scm, skb);
1324
1325         skb->h.raw = skb->data;
1326         err = memcpy_fromiovec(skb_put(skb,len), msg->msg_iov, len);
1327         if (err)
1328                 goto out_free;
1329
1330         timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
1331
1332 restart:
1333         if (!other) {
1334                 err = -ECONNRESET;
1335                 if (sunaddr == NULL)
1336                         goto out_free;
1337
1338                 other = unix_find_other(sunaddr, namelen, sk->sk_type,
1339                                         hash, &err);
1340                 if (other==NULL)
1341                         goto out_free;
1342         }
1343
1344         unix_state_rlock(other);
1345         err = -EPERM;
1346         if (!unix_may_send(sk, other))
1347                 goto out_unlock;
1348
1349         if (sock_flag(other, SOCK_DEAD)) {
1350                 /*
1351                  *      Check with 1003.1g - what should
1352                  *      datagram error
1353                  */
1354                 unix_state_runlock(other);
1355                 sock_put(other);
1356
1357                 err = 0;
1358                 unix_state_wlock(sk);
1359                 if (unix_peer(sk) == other) {
1360                         unix_peer(sk)=NULL;
1361                         unix_state_wunlock(sk);
1362
1363                         unix_dgram_disconnected(sk, other);
1364                         sock_put(other);
1365                         err = -ECONNREFUSED;
1366                 } else {
1367                         unix_state_wunlock(sk);
1368                 }
1369
1370                 other = NULL;
1371                 if (err)
1372                         goto out_free;
1373                 goto restart;
1374         }
1375
1376         err = -EPIPE;
1377         if (other->sk_shutdown & RCV_SHUTDOWN)
1378                 goto out_unlock;
1379
1380         err = security_unix_may_send(sk->sk_socket, other->sk_socket);
1381         if (err)
1382                 goto out_unlock;
1383
1384         if (unix_peer(other) != sk &&
1385             (skb_queue_len(&other->sk_receive_queue) >
1386              other->sk_max_ack_backlog)) {
1387                 if (!timeo) {
1388                         err = -EAGAIN;
1389                         goto out_unlock;
1390                 }
1391
1392                 timeo = unix_wait_for_peer(other, timeo);
1393
1394                 err = sock_intr_errno(timeo);
1395                 if (signal_pending(current))
1396                         goto out_free;
1397
1398                 goto restart;
1399         }
1400
1401         skb_queue_tail(&other->sk_receive_queue, skb);
1402         unix_state_runlock(other);
1403         other->sk_data_ready(other, len);
1404         sock_put(other);
1405         scm_destroy(siocb->scm);
1406         return len;
1407
1408 out_unlock:
1409         unix_state_runlock(other);
1410 out_free:
1411         kfree_skb(skb);
1412 out:
1413         if (other)
1414                 sock_put(other);
1415         scm_destroy(siocb->scm);
1416         return err;
1417 }
1418
1419                 
1420 static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock,
1421                                struct msghdr *msg, size_t len)
1422 {
1423         struct sock_iocb *siocb = kiocb_to_siocb(kiocb);
1424         struct sock *sk = sock->sk;
1425         struct sock *other = NULL;
1426         struct sockaddr_un *sunaddr=msg->msg_name;
1427         int err,size;
1428         struct sk_buff *skb;
1429         int sent=0;
1430         struct scm_cookie tmp_scm;
1431
1432         if (NULL == siocb->scm)
1433                 siocb->scm = &tmp_scm;
1434         err = scm_send(sock, msg, siocb->scm);
1435         if (err < 0)
1436                 return err;
1437
1438         err = -EOPNOTSUPP;
1439         if (msg->msg_flags&MSG_OOB)
1440                 goto out_err;
1441
1442         if (msg->msg_namelen) {
1443                 err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
1444                 goto out_err;
1445         } else {
1446                 sunaddr = NULL;
1447                 err = -ENOTCONN;
1448                 other = unix_peer_get(sk);
1449                 if (!other)
1450                         goto out_err;
1451         }
1452
1453         if (sk->sk_shutdown & SEND_SHUTDOWN)
1454                 goto pipe_err;
1455
1456         while(sent < len)
1457         {
1458                 /*
1459                  *      Optimisation for the fact that under 0.01% of X messages typically
1460                  *      need breaking up.
1461                  */
1462
1463                 size=len-sent;
1464
1465                 /* Keep two messages in the pipe so it schedules better */
1466                 if (size > sk->sk_sndbuf / 2 - 64)
1467                         size = sk->sk_sndbuf / 2 - 64;
1468
1469                 if (size > SKB_MAX_ALLOC)
1470                         size = SKB_MAX_ALLOC;
1471                         
1472                 /*
1473                  *      Grab a buffer
1474                  */
1475                  
1476                 skb=sock_alloc_send_skb(sk,size,msg->msg_flags&MSG_DONTWAIT, &err);
1477
1478                 if (skb==NULL)
1479                         goto out_err;
1480
1481                 /*
1482                  *      If you pass two values to the sock_alloc_send_skb
1483                  *      it tries to grab the large buffer with GFP_NOFS
1484                  *      (which can fail easily), and if it fails grab the
1485                  *      fallback size buffer which is under a page and will
1486                  *      succeed. [Alan]
1487                  */
1488                 size = min_t(int, size, skb_tailroom(skb));
1489
1490                 memcpy(UNIXCREDS(skb), &siocb->scm->creds, sizeof(struct ucred));
1491                 if (siocb->scm->fp)
1492                         unix_attach_fds(siocb->scm, skb);
1493
1494                 if ((err = memcpy_fromiovec(skb_put(skb,size), msg->msg_iov, size)) != 0) {
1495                         kfree_skb(skb);
1496                         goto out_err;
1497                 }
1498
1499                 unix_state_rlock(other);
1500
1501                 if (sock_flag(other, SOCK_DEAD) ||
1502                     (other->sk_shutdown & RCV_SHUTDOWN))
1503                         goto pipe_err_free;
1504
1505                 skb_queue_tail(&other->sk_receive_queue, skb);
1506                 unix_state_runlock(other);
1507                 other->sk_data_ready(other, size);
1508                 sent+=size;
1509         }
1510         sock_put(other);
1511
1512         scm_destroy(siocb->scm);
1513         siocb->scm = NULL;
1514
1515         return sent;
1516
1517 pipe_err_free:
1518         unix_state_runlock(other);
1519         kfree_skb(skb);
1520 pipe_err:
1521         if (sent==0 && !(msg->msg_flags&MSG_NOSIGNAL))
1522                 send_sig(SIGPIPE,current,0);
1523         err = -EPIPE;
1524 out_err:
1525         if (other)
1526                 sock_put(other);
1527         scm_destroy(siocb->scm);
1528         siocb->scm = NULL;
1529         return sent ? : err;
1530 }
1531
1532 static void unix_copy_addr(struct msghdr *msg, struct sock *sk)
1533 {
1534         struct unix_sock *u = unix_sk(sk);
1535
1536         msg->msg_namelen = 0;
1537         if (u->addr) {
1538                 msg->msg_namelen = u->addr->len;
1539                 memcpy(msg->msg_name, u->addr->name, u->addr->len);
1540         }
1541 }
1542
1543 static int unix_dgram_recvmsg(struct kiocb *iocb, struct socket *sock,
1544                               struct msghdr *msg, size_t size,
1545                               int flags)
1546 {
1547         struct sock_iocb *siocb = kiocb_to_siocb(iocb);
1548         struct scm_cookie tmp_scm;
1549         struct sock *sk = sock->sk;
1550         struct unix_sock *u = unix_sk(sk);
1551         int noblock = flags & MSG_DONTWAIT;
1552         struct sk_buff *skb;
1553         int err;
1554
1555         err = -EOPNOTSUPP;
1556         if (flags&MSG_OOB)
1557                 goto out;
1558
1559         msg->msg_namelen = 0;
1560
1561         skb = skb_recv_datagram(sk, flags, noblock, &err);
1562         if (!skb)
1563                 goto out;
1564
1565         wake_up_interruptible(&u->peer_wait);
1566
1567         if (msg->msg_name)
1568                 unix_copy_addr(msg, skb->sk);
1569
1570         if (size > skb->len)
1571                 size = skb->len;
1572         else if (size < skb->len)
1573                 msg->msg_flags |= MSG_TRUNC;
1574
1575         err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, size);
1576         if (err)
1577                 goto out_free;
1578
1579         if (!siocb->scm) {
1580                 siocb->scm = &tmp_scm;
1581                 memset(&tmp_scm, 0, sizeof(tmp_scm));
1582         }
1583         siocb->scm->creds = *UNIXCREDS(skb);
1584
1585         if (!(flags & MSG_PEEK))
1586         {
1587                 if (UNIXCB(skb).fp)
1588                         unix_detach_fds(siocb->scm, skb);
1589         }
1590         else 
1591         {
1592                 /* It is questionable: on PEEK we could:
1593                    - do not return fds - good, but too simple 8)
1594                    - return fds, and do not return them on read (old strategy,
1595                      apparently wrong)
1596                    - clone fds (I chose it for now, it is the most universal
1597                      solution)
1598                 
1599                    POSIX 1003.1g does not actually define this clearly
1600                    at all. POSIX 1003.1g doesn't define a lot of things
1601                    clearly however!                  
1602                    
1603                 */
1604                 if (UNIXCB(skb).fp)
1605                         siocb->scm->fp = scm_fp_dup(UNIXCB(skb).fp);
1606         }
1607         err = size;
1608
1609         scm_recv(sock, msg, siocb->scm, flags);
1610
1611 out_free:
1612         skb_free_datagram(sk,skb);
1613 out:
1614         return err;
1615 }
1616
1617 /*
1618  *      Sleep until data has arrived, but check for races.
1619  */
1620  
1621 static long unix_stream_data_wait(struct sock * sk, long timeo)
1622 {
1623         DEFINE_WAIT(wait);
1624
1625         unix_state_rlock(sk);
1626
1627         for (;;) {
1628                 prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
1629
1630                 if (skb_queue_len(&sk->sk_receive_queue) ||
1631                     sk->sk_err ||
1632                     (sk->sk_shutdown & RCV_SHUTDOWN) ||
1633                     signal_pending(current) ||
1634                     !timeo)
1635                         break;
1636
1637                 set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1638                 unix_state_runlock(sk);
1639                 timeo = schedule_timeout(timeo);
1640                 unix_state_rlock(sk);
1641                 clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1642         }
1643
1644         finish_wait(sk->sk_sleep, &wait);
1645         unix_state_runlock(sk);
1646         return timeo;
1647 }
1648
1649
1650
1651 static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock,
1652                                struct msghdr *msg, size_t size,
1653                                int flags)
1654 {
1655         struct sock_iocb *siocb = kiocb_to_siocb(iocb);
1656         struct scm_cookie tmp_scm;
1657         struct sock *sk = sock->sk;
1658         struct unix_sock *u = unix_sk(sk);
1659         struct sockaddr_un *sunaddr=msg->msg_name;
1660         int copied = 0;
1661         int check_creds = 0;
1662         int target;
1663         int err = 0;
1664         long timeo;
1665
1666         err = -EINVAL;
1667         if (sk->sk_state != TCP_ESTABLISHED)
1668                 goto out;
1669
1670         err = -EOPNOTSUPP;
1671         if (flags&MSG_OOB)
1672                 goto out;
1673
1674         target = sock_rcvlowat(sk, flags&MSG_WAITALL, size);
1675         timeo = sock_rcvtimeo(sk, flags&MSG_DONTWAIT);
1676
1677         msg->msg_namelen = 0;
1678
1679         /* Lock the socket to prevent queue disordering
1680          * while sleeps in memcpy_tomsg
1681          */
1682
1683         if (!siocb->scm) {
1684                 siocb->scm = &tmp_scm;
1685                 memset(&tmp_scm, 0, sizeof(tmp_scm));
1686         }
1687
1688         down(&u->readsem);
1689
1690         do
1691         {
1692                 int chunk;
1693                 struct sk_buff *skb;
1694
1695                 skb = skb_dequeue(&sk->sk_receive_queue);
1696                 if (skb==NULL)
1697                 {
1698                         if (copied >= target)
1699                                 break;
1700
1701                         /*
1702                          *      POSIX 1003.1g mandates this order.
1703                          */
1704                          
1705                         if ((err = sock_error(sk)) != 0)
1706                                 break;
1707                         if (sk->sk_shutdown & RCV_SHUTDOWN)
1708                                 break;
1709                         err = -EAGAIN;
1710                         if (!timeo)
1711                                 break;
1712                         up(&u->readsem);
1713
1714                         timeo = unix_stream_data_wait(sk, timeo);
1715
1716                         if (signal_pending(current)) {
1717                                 err = sock_intr_errno(timeo);
1718                                 goto out;
1719                         }
1720                         down(&u->readsem);
1721                         continue;
1722                 }
1723
1724                 if (check_creds) {
1725                         /* Never glue messages from different writers */
1726                         if (memcmp(UNIXCREDS(skb), &siocb->scm->creds, sizeof(siocb->scm->creds)) != 0) {
1727                                 skb_queue_head(&sk->sk_receive_queue, skb);
1728                                 break;
1729                         }
1730                 } else {
1731                         /* Copy credentials */
1732                         siocb->scm->creds = *UNIXCREDS(skb);
1733                         check_creds = 1;
1734                 }
1735
1736                 /* Copy address just once */
1737                 if (sunaddr)
1738                 {
1739                         unix_copy_addr(msg, skb->sk);
1740                         sunaddr = NULL;
1741                 }
1742
1743                 chunk = min_t(unsigned int, skb->len, size);
1744                 if (memcpy_toiovec(msg->msg_iov, skb->data, chunk)) {
1745                         skb_queue_head(&sk->sk_receive_queue, skb);
1746                         if (copied == 0)
1747                                 copied = -EFAULT;
1748                         break;
1749                 }
1750                 copied += chunk;
1751                 size -= chunk;
1752
1753                 /* Mark read part of skb as used */
1754                 if (!(flags & MSG_PEEK))
1755                 {
1756                         skb_pull(skb, chunk);
1757
1758                         if (UNIXCB(skb).fp)
1759                                 unix_detach_fds(siocb->scm, skb);
1760
1761                         /* put the skb back if we didn't use it up.. */
1762                         if (skb->len)
1763                         {
1764                                 skb_queue_head(&sk->sk_receive_queue, skb);
1765                                 break;
1766                         }
1767
1768                         kfree_skb(skb);
1769
1770                         if (siocb->scm->fp)
1771                                 break;
1772                 }
1773                 else
1774                 {
1775                         /* It is questionable, see note in unix_dgram_recvmsg.
1776                          */
1777                         if (UNIXCB(skb).fp)
1778                                 siocb->scm->fp = scm_fp_dup(UNIXCB(skb).fp);
1779
1780                         /* put message back and return */
1781                         skb_queue_head(&sk->sk_receive_queue, skb);
1782                         break;
1783                 }
1784         } while (size);
1785
1786         up(&u->readsem);
1787         scm_recv(sock, msg, siocb->scm, flags);
1788 out:
1789         return copied ? : err;
1790 }
1791
1792 static int unix_shutdown(struct socket *sock, int mode)
1793 {
1794         struct sock *sk = sock->sk;
1795         struct sock *other;
1796
1797         mode = (mode+1)&(RCV_SHUTDOWN|SEND_SHUTDOWN);
1798
1799         if (mode) {
1800                 unix_state_wlock(sk);
1801                 sk->sk_shutdown |= mode;
1802                 other=unix_peer(sk);
1803                 if (other)
1804                         sock_hold(other);
1805                 unix_state_wunlock(sk);
1806                 sk->sk_state_change(sk);
1807
1808                 if (other &&
1809                         (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) {
1810
1811                         int peer_mode = 0;
1812
1813                         if (mode&RCV_SHUTDOWN)
1814                                 peer_mode |= SEND_SHUTDOWN;
1815                         if (mode&SEND_SHUTDOWN)
1816                                 peer_mode |= RCV_SHUTDOWN;
1817                         unix_state_wlock(other);
1818                         other->sk_shutdown |= peer_mode;
1819                         unix_state_wunlock(other);
1820                         other->sk_state_change(other);
1821                         read_lock(&other->sk_callback_lock);
1822                         if (peer_mode == SHUTDOWN_MASK)
1823                                 sk_wake_async(other,1,POLL_HUP);
1824                         else if (peer_mode & RCV_SHUTDOWN)
1825                                 sk_wake_async(other,1,POLL_IN);
1826                         read_unlock(&other->sk_callback_lock);
1827                 }
1828                 if (other)
1829                         sock_put(other);
1830         }
1831         return 0;
1832 }
1833
1834 static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
1835 {
1836         struct sock *sk = sock->sk;
1837         long amount=0;
1838         int err;
1839
1840         switch(cmd)
1841         {
1842                 case SIOCOUTQ:
1843                         amount = atomic_read(&sk->sk_wmem_alloc);
1844                         err = put_user(amount, (int __user *)arg);
1845                         break;
1846                 case SIOCINQ:
1847                 {
1848                         struct sk_buff *skb;
1849                         if (sk->sk_state == TCP_LISTEN) {
1850                                 err = -EINVAL;
1851                                 break;
1852                         }
1853
1854                         spin_lock(&sk->sk_receive_queue.lock);
1855                         skb = skb_peek(&sk->sk_receive_queue);
1856                         if (skb)
1857                                 amount=skb->len;
1858                         spin_unlock(&sk->sk_receive_queue.lock);
1859                         err = put_user(amount, (int __user *)arg);
1860                         break;
1861                 }
1862
1863                 default:
1864                         err = dev_ioctl(cmd, (void __user *)arg);
1865                         break;
1866         }
1867         return err;
1868 }
1869
1870 static unsigned int unix_poll(struct file * file, struct socket *sock, poll_table *wait)
1871 {
1872         struct sock *sk = sock->sk;
1873         unsigned int mask;
1874
1875         poll_wait(file, sk->sk_sleep, wait);
1876         mask = 0;
1877
1878         /* exceptional events? */
1879         if (sk->sk_err)
1880                 mask |= POLLERR;
1881         if (sk->sk_shutdown == SHUTDOWN_MASK)
1882                 mask |= POLLHUP;
1883
1884         /* readable? */
1885         if (!skb_queue_empty(&sk->sk_receive_queue) ||
1886             (sk->sk_shutdown & RCV_SHUTDOWN))
1887                 mask |= POLLIN | POLLRDNORM;
1888
1889         /* Connection-based need to check for termination and startup */
1890         if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) && sk->sk_state == TCP_CLOSE)
1891                 mask |= POLLHUP;
1892
1893         /*
1894          * we set writable also when the other side has shut down the
1895          * connection. This prevents stuck sockets.
1896          */
1897         if (unix_writable(sk))
1898                 mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
1899
1900         return mask;
1901 }
1902
1903
1904 #ifdef CONFIG_PROC_FS
1905 static struct sock *unix_seq_idx(int *iter, loff_t pos)
1906 {
1907         loff_t off = 0;
1908         struct sock *s;
1909
1910         for (s = first_unix_socket(iter); s; s = next_unix_socket(iter, s)) {
1911                 if (off == pos) 
1912                         return s;
1913                 ++off;
1914         }
1915         return NULL;
1916 }
1917
1918
1919 static void *unix_seq_start(struct seq_file *seq, loff_t *pos)
1920 {
1921         read_lock(&unix_table_lock);
1922         return *pos ? unix_seq_idx(seq->private, *pos - 1) : ((void *) 1);
1923 }
1924
1925 static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1926 {
1927         ++*pos;
1928
1929         if (v == (void *)1) 
1930                 return first_unix_socket(seq->private);
1931         return next_unix_socket(seq->private, v);
1932 }
1933
1934 static void unix_seq_stop(struct seq_file *seq, void *v)
1935 {
1936         read_unlock(&unix_table_lock);
1937 }
1938
1939 static int unix_seq_show(struct seq_file *seq, void *v)
1940 {
1941         
1942         if (v == (void *)1)
1943                 seq_puts(seq, "Num       RefCount Protocol Flags    Type St "
1944                          "Inode Path\n");
1945         else {
1946                 struct sock *s = v;
1947                 struct unix_sock *u = unix_sk(s);
1948                 unix_state_rlock(s);
1949
1950                 seq_printf(seq, "%p: %08X %08X %08X %04X %02X %5lu",
1951                         s,
1952                         atomic_read(&s->sk_refcnt),
1953                         0,
1954                         s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0,
1955                         s->sk_type,
1956                         s->sk_socket ?
1957                         (s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) :
1958                         (s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING),
1959                         sock_i_ino(s));
1960
1961                 if (u->addr) {
1962                         int i, len;
1963                         seq_putc(seq, ' ');
1964
1965                         i = 0;
1966                         len = u->addr->len - sizeof(short);
1967                         if (!UNIX_ABSTRACT(s))
1968                                 len--;
1969                         else {
1970                                 seq_putc(seq, '@');
1971                                 i++;
1972                         }
1973                         for ( ; i < len; i++)
1974                                 seq_putc(seq, u->addr->name->sun_path[i]);
1975                 }
1976                 unix_state_runlock(s);
1977                 seq_putc(seq, '\n');
1978         }
1979
1980         return 0;
1981 }
1982
1983 static struct seq_operations unix_seq_ops = {
1984         .start  = unix_seq_start,
1985         .next   = unix_seq_next,
1986         .stop   = unix_seq_stop,
1987         .show   = unix_seq_show,
1988 };
1989
1990
1991 static int unix_seq_open(struct inode *inode, struct file *file)
1992 {
1993         struct seq_file *seq;
1994         int rc = -ENOMEM;
1995         int *iter = kmalloc(sizeof(int), GFP_KERNEL);
1996
1997         if (!iter)
1998                 goto out;
1999
2000         rc = seq_open(file, &unix_seq_ops);
2001         if (rc)
2002                 goto out_kfree;
2003
2004         seq          = file->private_data;
2005         seq->private = iter;
2006         *iter = 0;
2007 out:
2008         return rc;
2009 out_kfree:
2010         kfree(iter);
2011         goto out;
2012 }
2013
2014 static struct file_operations unix_seq_fops = {
2015         .owner          = THIS_MODULE,
2016         .open           = unix_seq_open,
2017         .read           = seq_read,
2018         .llseek         = seq_lseek,
2019         .release        = seq_release_private,
2020 };
2021
2022 #endif
2023
2024 static struct net_proto_family unix_family_ops = {
2025         .family = PF_UNIX,
2026         .create = unix_create,
2027         .owner  = THIS_MODULE,
2028 };
2029
/*
 * sysctl hooks: real implementations are declared extern when
 * CONFIG_SYSCTL is set; otherwise they are empty inline stubs so that
 * callers need no #ifdef of their own.
 */
#ifndef CONFIG_SYSCTL
static inline void unix_sysctl_register(void)
{
}

static inline void unix_sysctl_unregister(void)
{
}
#else
extern void unix_sysctl_register(void);
extern void unix_sysctl_unregister(void);
#endif
2037
2038 static int __init af_unix_init(void)
2039 {
2040         struct sk_buff *dummy_skb;
2041
2042         if (sizeof(struct unix_skb_parms) > sizeof(dummy_skb->cb)) {
2043                 printk(KERN_CRIT "%s: panic\n", __FUNCTION__);
2044                 return -1;
2045         }
2046         /* allocate our sock slab cache */
2047         unix_sk_cachep = kmem_cache_create("unix_sock",
2048                                            sizeof(struct unix_sock), 0,
2049                                            SLAB_HWCACHE_ALIGN, NULL, NULL);
2050         if (!unix_sk_cachep)
2051                 printk(KERN_CRIT
2052                         "af_unix_init: Cannot create unix_sock SLAB cache!\n");
2053
2054         sock_register(&unix_family_ops);
2055 #ifdef CONFIG_PROC_FS
2056         proc_net_fops_create("unix", 0, &unix_seq_fops);
2057 #endif
2058         unix_sysctl_register();
2059         return 0;
2060 }
2061
2062 static void __exit af_unix_exit(void)
2063 {
2064         sock_unregister(PF_UNIX);
2065         unix_sysctl_unregister();
2066         proc_net_remove("unix");
2067         kmem_cache_destroy(unix_sk_cachep);
2068 }
2069
2070 module_init(af_unix_init);
2071 module_exit(af_unix_exit);
2072
2073 MODULE_LICENSE("GPL");
2074 MODULE_ALIAS_NETPROTO(PF_UNIX);