89fb7eb9e0868c3a00ca94275dbaf1710e817b30
[linux-2.6.git] / net / unix / af_unix.c
1 /*
2  * NET4:        Implementation of BSD Unix domain sockets.
3  *
4  * Authors:     Alan Cox, <alan.cox@linux.org>
5  *
6  *              This program is free software; you can redistribute it and/or
7  *              modify it under the terms of the GNU General Public License
8  *              as published by the Free Software Foundation; either version
9  *              2 of the License, or (at your option) any later version.
10  *
11  * Version:     $Id: af_unix.c,v 1.133 2002/02/08 03:57:19 davem Exp $
12  *
13  * Fixes:
14  *              Linus Torvalds  :       Assorted bug cures.
15  *              Niibe Yutaka    :       async I/O support.
16  *              Carsten Paeth   :       PF_UNIX check, address fixes.
17  *              Alan Cox        :       Limit size of allocated blocks.
18  *              Alan Cox        :       Fixed the stupid socketpair bug.
19  *              Alan Cox        :       BSD compatibility fine tuning.
20  *              Alan Cox        :       Fixed a bug in connect when interrupted.
21  *              Alan Cox        :       Sorted out a proper draft version of
22  *                                      file descriptor passing hacked up from
23  *                                      Mike Shaver's work.
24  *              Marty Leisner   :       Fixes to fd passing
25  *              Nick Nevin      :       recvmsg bugfix.
26  *              Alan Cox        :       Started proper garbage collector
27  *              Heiko EiBfeldt  :       Missing verify_area check
28  *              Alan Cox        :       Started POSIXisms
29  *              Andreas Schwab  :       Replace inode by dentry for proper
30  *                                      reference counting
31  *              Kirk Petersen   :       Made this a module
32  *          Christoph Rohland   :       Elegant non-blocking accept/connect algorithm.
33  *                                      Lots of bug fixes.
34  *           Alexey Kuznetsov   :       Repaired (I hope) bugs introduced
35  *                                      by above two patches.
36  *           Andrea Arcangeli   :       If possible we block in connect(2)
37  *                                      if the max backlog of the listen socket
38  *                                      has been reached. This won't break
39  *                                      old apps and it will avoid huge amount
40  *                                      of socks hashed (this is for unix_gc()
41  *                                      performance reasons).
42  *                                      Security fix that limits the max
43  *                                      number of socks to 2*max_files and
44  *                                      the number of skb queueable in the
45  *                                      dgram receiver.
46  *              Artur Skawina   :       Hash function optimizations
47  *           Alexey Kuznetsov   :       Full scale SMP. Lot of bugs are introduced 8)
48  *            Malcolm Beattie   :       Set peercred for socketpair
49  *           Michal Ostrowski   :       Module initialization cleanup.
50  *           Arnaldo C. Melo    :       Remove MOD_{INC,DEC}_USE_COUNT,
51  *                                      the core infrastructure is doing that
52  *                                      for all net proto families now (2.5.69+)
53  *
54  *
55  * Known differences from reference BSD that was tested:
56  *
57  *      [TO FIX]
58  *      ECONNREFUSED is not returned from one end of a connected() socket to the
59  *              other the moment one end closes.
60  *      fstat() doesn't return st_dev=0, and give the blksize as high water mark
61  *              and a fake inode identifier (nor the BSD first socket fstat twice bug).
62  *      [NOT TO FIX]
63  *      accept() returns a path name even if the connecting socket has closed
64  *              in the meantime (BSD loses the path and gives up).
65  *      accept() returns 0 length path for an unbound connector. BSD returns 16
66  *              and a null first byte in the path (but not for gethost/peername - BSD bug ??)
67  *      socketpair(...SOCK_RAW..) doesn't panic the kernel.
68  *      BSD af_unix apparently has connect forgetting to block properly.
69  *              (need to check this with the POSIX spec in detail)
70  *
71  * Differences from 2.0.0-11-... (ANK)
72  *      Bug fixes and improvements.
73  *              - client shutdown killed server socket.
74  *              - removed all useless cli/sti pairs.
75  *
76  *      Semantic changes/extensions.
77  *              - generic control message passing.
78  *              - SCM_CREDENTIALS control message.
79  *              - "Abstract" (not FS based) socket bindings.
80  *                Abstract names are sequences of bytes (not zero terminated)
81  *                started by 0, so that this name space does not intersect
82  *                with BSD names.
83  */
84
85 #include <linux/module.h>
86 #include <linux/config.h>
87 #include <linux/kernel.h>
88 #include <linux/major.h>
89 #include <linux/signal.h>
90 #include <linux/sched.h>
91 #include <linux/errno.h>
92 #include <linux/string.h>
93 #include <linux/stat.h>
94 #include <linux/dcache.h>
95 #include <linux/namei.h>
96 #include <linux/socket.h>
97 #include <linux/un.h>
98 #include <linux/fcntl.h>
99 #include <linux/termios.h>
100 #include <linux/sockios.h>
101 #include <linux/net.h>
102 #include <linux/in.h>
103 #include <linux/fs.h>
104 #include <linux/slab.h>
105 #include <asm/uaccess.h>
106 #include <linux/skbuff.h>
107 #include <linux/netdevice.h>
108 #include <net/sock.h>
109 #include <linux/tcp.h>
110 #include <net/af_unix.h>
111 #include <linux/proc_fs.h>
112 #include <linux/seq_file.h>
113 #include <net/scm.h>
114 #include <linux/init.h>
115 #include <linux/poll.h>
116 #include <linux/smp_lock.h>
117 #include <linux/rtnetlink.h>
118 #include <linux/mount.h>
119 #include <net/checksum.h>
120 #include <linux/security.h>
121
122 int sysctl_unix_max_dgram_qlen = 10;
123
124 kmem_cache_t *unix_sk_cachep;
125
126 struct hlist_head unix_socket_table[UNIX_HASH_SIZE + 1];
127 rwlock_t unix_table_lock = RW_LOCK_UNLOCKED;
128 static atomic_t unix_nr_socks = ATOMIC_INIT(0);
129
130 #define unix_sockets_unbound    (&unix_socket_table[UNIX_HASH_SIZE])
131
132 #define UNIX_ABSTRACT(sk)       (unix_sk(sk)->addr->hash != UNIX_HASH_SIZE)
133
134 /*
135  *  SMP locking strategy:
136  *    hash table is protected with rwlock unix_table_lock
137  *    each socket state is protected by separate rwlock.
138  */
139
140 static inline unsigned unix_hash_fold(unsigned hash)
141 {
142         hash ^= hash>>16;
143         hash ^= hash>>8;
144         return hash&(UNIX_HASH_SIZE-1);
145 }
146
147 #define unix_peer(sk) ((sk)->sk_pair)
148
149 static inline int unix_our_peer(struct sock *sk, struct sock *osk)
150 {
151         return unix_peer(osk) == sk;
152 }
153
154 static inline int unix_may_send(struct sock *sk, struct sock *osk)
155 {
156         return (unix_peer(osk) == NULL || unix_our_peer(sk, osk));
157 }
158
159 static struct sock *unix_peer_get(struct sock *s)
160 {
161         struct sock *peer;
162
163         unix_state_rlock(s);
164         peer = unix_peer(s);
165         if (peer)
166                 sock_hold(peer);
167         unix_state_runlock(s);
168         return peer;
169 }
170
171 static inline void unix_release_addr(struct unix_address *addr)
172 {
173         if (atomic_dec_and_test(&addr->refcnt))
174                 kfree(addr);
175 }
176
177 /*
178  *      Check unix socket name:
179  *              - should be not zero length.
180  *              - if started by not zero, should be NULL terminated (FS object)
181  *              - if started by zero, it is abstract name.
182  */
183  
184 static int unix_mkname(struct sockaddr_un * sunaddr, int len, unsigned *hashp)
185 {
186         if (len <= sizeof(short) || len > sizeof(*sunaddr))
187                 return -EINVAL;
188         if (!sunaddr || sunaddr->sun_family != AF_UNIX)
189                 return -EINVAL;
190         if (sunaddr->sun_path[0])
191         {
192                 /*
193                  *      This may look like an off by one error but it is
194                  *      a bit more subtle. 108 is the longest valid AF_UNIX
195                  *      path for a binding. sun_path[108] doesn't as such
196                  *      exist. However in kernel space we are guaranteed that
197                  *      it is a valid memory location in our kernel
198                  *      address buffer.
199                  */
200                 if (len > sizeof(*sunaddr))
201                         len = sizeof(*sunaddr);
202                 ((char *)sunaddr)[len]=0;
203                 len = strlen(sunaddr->sun_path)+1+sizeof(short);
204                 return len;
205         }
206
207         *hashp = unix_hash_fold(csum_partial((char*)sunaddr, len, 0));
208         return len;
209 }
210
/*
 * Take @sk off whatever hash chain it is on.  Lock-free variant: the
 * callers in this file hold unix_table_lock for writing.
 */
static void __unix_remove_socket(struct sock *sk)
{
        sk_del_node_init(sk);
}
215
/*
 * Link @sk onto the chain @list.  The socket is expected to be
 * unhashed already (checked with BUG_TRAP).  Lock-free variant: the
 * callers in this file hold unix_table_lock for writing.
 */
static void __unix_insert_socket(struct hlist_head *list, struct sock *sk)
{
        BUG_TRAP(sk_unhashed(sk));

        sk_add_node(sk, list);
}
221
222 static inline void unix_remove_socket(struct sock *sk)
223 {
224         write_lock(&unix_table_lock);
225         __unix_remove_socket(sk);
226         write_unlock(&unix_table_lock);
227 }
228
229 static inline void unix_insert_socket(struct hlist_head *list, struct sock *sk)
230 {
231         write_lock(&unix_table_lock);
232         __unix_insert_socket(list, sk);
233         write_unlock(&unix_table_lock);
234 }
235
236 static struct sock *__unix_find_socket_byname(struct sockaddr_un *sunname,
237                                               int len, int type, unsigned hash)
238 {
239         struct sock *s;
240         struct hlist_node *node;
241
242         sk_for_each(s, node, &unix_socket_table[hash ^ type]) {
243                 struct unix_sock *u = unix_sk(s);
244
245                 if (u->addr->len == len &&
246                     !memcmp(u->addr->name, sunname, len))
247                         goto found;
248         }
249         s = NULL;
250 found:
251         return s;
252 }
253
254 static inline struct sock *unix_find_socket_byname(struct sockaddr_un *sunname,
255                                                    int len, int type,
256                                                    unsigned hash)
257 {
258         struct sock *s;
259
260         read_lock(&unix_table_lock);
261         s = __unix_find_socket_byname(sunname, len, type, hash);
262         if (s)
263                 sock_hold(s);
264         read_unlock(&unix_table_lock);
265         return s;
266 }
267
268 static struct sock *unix_find_socket_byinode(struct inode *i)
269 {
270         struct sock *s;
271         struct hlist_node *node;
272
273         read_lock(&unix_table_lock);
274         sk_for_each(s, node,
275                     &unix_socket_table[i->i_ino & (UNIX_HASH_SIZE - 1)]) {
276                 struct dentry *dentry = unix_sk(s)->dentry;
277
278                 if(dentry && dentry->d_inode == i)
279                 {
280                         sock_hold(s);
281                         goto found;
282                 }
283         }
284         s = NULL;
285 found:
286         read_unlock(&unix_table_lock);
287         return s;
288 }
289
290 static inline int unix_writable(struct sock *sk)
291 {
292         return (atomic_read(&sk->sk_wmem_alloc) << 2) <= sk->sk_sndbuf;
293 }
294
295 static void unix_write_space(struct sock *sk)
296 {
297         read_lock(&sk->sk_callback_lock);
298         if (unix_writable(sk)) {
299                 if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
300                         wake_up_interruptible(sk->sk_sleep);
301                 sk_wake_async(sk, 2, POLL_OUT);
302         }
303         read_unlock(&sk->sk_callback_lock);
304 }
305
306 /* When dgram socket disconnects (or changes its peer), we clear its receive
307  * queue of packets arrived from previous peer. First, it allows to do
308  * flow control based only on wmem_alloc; second, sk connected to peer
309  * may receive messages only from that peer. */
310 static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
311 {
312         if (skb_queue_len(&sk->sk_receive_queue)) {
313                 skb_queue_purge(&sk->sk_receive_queue);
314                 wake_up_interruptible_all(&unix_sk(sk)->peer_wait);
315
316                 /* If one link of bidirectional dgram pipe is disconnected,
317                  * we signal error. Messages are lost. Do not make this,
318                  * when peer was not connected to us.
319                  */
320                 if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) {
321                         other->sk_err = ECONNRESET;
322                         other->sk_error_report(other);
323                 }
324         }
325 }
326
327 static void unix_sock_destructor(struct sock *sk)
328 {
329         struct unix_sock *u = unix_sk(sk);
330
331         skb_queue_purge(&sk->sk_receive_queue);
332
333         BUG_TRAP(!atomic_read(&sk->sk_wmem_alloc));
334         BUG_TRAP(sk_unhashed(sk));
335         BUG_TRAP(!sk->sk_socket);
336         if (!sock_flag(sk, SOCK_DEAD)) {
337                 printk("Attempt to release alive unix socket: %p\n", sk);
338                 return;
339         }
340
341         if (u->addr)
342                 unix_release_addr(u->addr);
343
344         atomic_dec(&unix_nr_socks);
345 #ifdef UNIX_REFCNT_DEBUG
346         printk(KERN_DEBUG "UNIX %p is destroyed, %d are still alive.\n", sk, atomic_read(&unix_nr_socks));
347 #endif
348 }
349
350 static int unix_release_sock (struct sock *sk, int embrion)
351 {
352         struct unix_sock *u = unix_sk(sk);
353         struct dentry *dentry;
354         struct vfsmount *mnt;
355         struct sock *skpair;
356         struct sk_buff *skb;
357         int state;
358
359         unix_remove_socket(sk);
360
361         /* Clear state */
362         unix_state_wlock(sk);
363         sock_orphan(sk);
364         sk->sk_shutdown = SHUTDOWN_MASK;
365         dentry       = u->dentry;
366         u->dentry    = NULL;
367         mnt          = u->mnt;
368         u->mnt       = NULL;
369         state = sk->sk_state;
370         sk->sk_state = TCP_CLOSE;
371         unix_state_wunlock(sk);
372
373         wake_up_interruptible_all(&u->peer_wait);
374
375         skpair=unix_peer(sk);
376
377         if (skpair!=NULL) {
378                 if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) {
379                         unix_state_wlock(skpair);
380                         /* No more writes */
381                         skpair->sk_shutdown = SHUTDOWN_MASK;
382                         if (!skb_queue_empty(&sk->sk_receive_queue) || embrion)
383                                 skpair->sk_err = ECONNRESET;
384                         unix_state_wunlock(skpair);
385                         skpair->sk_state_change(skpair);
386                         read_lock(&skpair->sk_callback_lock);
387                         sk_wake_async(skpair,1,POLL_HUP);
388                         read_unlock(&skpair->sk_callback_lock);
389                 }
390                 sock_put(skpair); /* It may now die */
391                 unix_peer(sk) = NULL;
392         }
393
394         /* Try to flush out this socket. Throw out buffers at least */
395
396         while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
397                 if (state==TCP_LISTEN)
398                         unix_release_sock(skb->sk, 1);
399                 /* passed fds are erased in the kfree_skb hook        */
400                 kfree_skb(skb);
401         }
402
403         if (dentry) {
404                 dput(dentry);
405                 mntput(mnt);
406         }
407
408         sock_put(sk);
409
410         /* ---- Socket is dead now and most probably destroyed ---- */
411
412         /*
413          * Fixme: BSD difference: In BSD all sockets connected to use get
414          *        ECONNRESET and we die on the spot. In Linux we behave
415          *        like files and pipes do and wait for the last
416          *        dereference.
417          *
418          * Can't we simply set sock->err?
419          *
420          *        What the above comment does talk about? --ANK(980817)
421          */
422
423         if (atomic_read(&unix_tot_inflight))
424                 unix_gc();              /* Garbage collect fds */       
425
426         return 0;
427 }
428
429 static int unix_listen(struct socket *sock, int backlog)
430 {
431         int err;
432         struct sock *sk = sock->sk;
433         struct unix_sock *u = unix_sk(sk);
434
435         err = -EOPNOTSUPP;
436         if (sock->type!=SOCK_STREAM && sock->type!=SOCK_SEQPACKET)
437                 goto out;                       /* Only stream/seqpacket sockets accept */
438         err = -EINVAL;
439         if (!u->addr)
440                 goto out;                       /* No listens on an unbound socket */
441         unix_state_wlock(sk);
442         if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN)
443                 goto out_unlock;
444         if (backlog > sk->sk_max_ack_backlog)
445                 wake_up_interruptible_all(&u->peer_wait);
446         sk->sk_max_ack_backlog  = backlog;
447         sk->sk_state            = TCP_LISTEN;
448         /* set credentials so connect can copy them */
449         sk->sk_peercred.pid     = current->tgid;
450         sk->sk_peercred.uid     = current->euid;
451         sk->sk_peercred.gid     = current->egid;
452         err = 0;
453
454 out_unlock:
455         unix_state_wunlock(sk);
456 out:
457         return err;
458 }
459
460 static int unix_release(struct socket *);
461 static int unix_bind(struct socket *, struct sockaddr *, int);
462 static int unix_stream_connect(struct socket *, struct sockaddr *,
463                                int addr_len, int flags);
464 static int unix_socketpair(struct socket *, struct socket *);
465 static int unix_accept(struct socket *, struct socket *, int);
466 static int unix_getname(struct socket *, struct sockaddr *, int *, int);
467 static unsigned int unix_poll(struct file *, struct socket *, poll_table *);
468 static int unix_ioctl(struct socket *, unsigned int, unsigned long);
469 static int unix_shutdown(struct socket *, int);
470 static int unix_stream_sendmsg(struct kiocb *, struct socket *,
471                                struct msghdr *, size_t);
472 static int unix_stream_recvmsg(struct kiocb *, struct socket *,
473                                struct msghdr *, size_t, int);
474 static int unix_dgram_sendmsg(struct kiocb *, struct socket *,
475                               struct msghdr *, size_t);
476 static int unix_dgram_recvmsg(struct kiocb *, struct socket *,
477                               struct msghdr *, size_t, int);
478 static int unix_dgram_connect(struct socket *, struct sockaddr *,
479                               int, int);
480
481 static struct proto_ops unix_stream_ops = {
482         .family =       PF_UNIX,
483         .owner =        THIS_MODULE,
484         .release =      unix_release,
485         .bind =         unix_bind,
486         .connect =      unix_stream_connect,
487         .socketpair =   unix_socketpair,
488         .accept =       unix_accept,
489         .getname =      unix_getname,
490         .poll =         unix_poll,
491         .ioctl =        unix_ioctl,
492         .listen =       unix_listen,
493         .shutdown =     unix_shutdown,
494         .setsockopt =   sock_no_setsockopt,
495         .getsockopt =   sock_no_getsockopt,
496         .sendmsg =      unix_stream_sendmsg,
497         .recvmsg =      unix_stream_recvmsg,
498         .mmap =         sock_no_mmap,
499         .sendpage =     sock_no_sendpage,
500 };
501
502 static struct proto_ops unix_dgram_ops = {
503         .family =       PF_UNIX,
504         .owner =        THIS_MODULE,
505         .release =      unix_release,
506         .bind =         unix_bind,
507         .connect =      unix_dgram_connect,
508         .socketpair =   unix_socketpair,
509         .accept =       sock_no_accept,
510         .getname =      unix_getname,
511         .poll =         datagram_poll,
512         .ioctl =        unix_ioctl,
513         .listen =       sock_no_listen,
514         .shutdown =     unix_shutdown,
515         .setsockopt =   sock_no_setsockopt,
516         .getsockopt =   sock_no_getsockopt,
517         .sendmsg =      unix_dgram_sendmsg,
518         .recvmsg =      unix_dgram_recvmsg,
519         .mmap =         sock_no_mmap,
520         .sendpage =     sock_no_sendpage,
521 };
522
523 static struct proto_ops unix_seqpacket_ops = {
524         .family =       PF_UNIX,
525         .owner =        THIS_MODULE,
526         .release =      unix_release,
527         .bind =         unix_bind,
528         .connect =      unix_stream_connect,
529         .socketpair =   unix_socketpair,
530         .accept =       unix_accept,
531         .getname =      unix_getname,
532         .poll =         datagram_poll,
533         .ioctl =        unix_ioctl,
534         .listen =       unix_listen,
535         .shutdown =     unix_shutdown,
536         .setsockopt =   sock_no_setsockopt,
537         .getsockopt =   sock_no_getsockopt,
538         .sendmsg =      unix_dgram_sendmsg,
539         .recvmsg =      unix_dgram_recvmsg,
540         .mmap =         sock_no_mmap,
541         .sendpage =     sock_no_sendpage,
542 };
543
544 static struct sock * unix_create1(struct socket *sock)
545 {
546         struct sock *sk = NULL;
547         struct unix_sock *u;
548
549         if (atomic_read(&unix_nr_socks) >= 2*files_stat.max_files)
550                 goto out;
551
552         sk = sk_alloc(PF_UNIX, GFP_KERNEL, sizeof(struct unix_sock),
553                       unix_sk_cachep);
554         if (!sk)
555                 goto out;
556
557         atomic_inc(&unix_nr_socks);
558
559         sock_init_data(sock,sk);
560         sk_set_owner(sk, THIS_MODULE);
561
562         sk->sk_write_space      = unix_write_space;
563         sk->sk_max_ack_backlog  = sysctl_unix_max_dgram_qlen;
564         sk->sk_destruct         = unix_sock_destructor;
565         u         = unix_sk(sk);
566         u->dentry = NULL;
567         u->mnt    = NULL;
568         rwlock_init(&u->lock);
569         atomic_set(&u->inflight, sock ? 0 : -1);
570         init_MUTEX(&u->readsem); /* single task reading lock */
571         init_waitqueue_head(&u->peer_wait);
572         unix_insert_socket(unix_sockets_unbound, sk);
573 out:
574         return sk;
575 }
576
577 static int unix_create(struct socket *sock, int protocol)
578 {
579         if (protocol && protocol != PF_UNIX)
580                 return -EPROTONOSUPPORT;
581
582         sock->state = SS_UNCONNECTED;
583
584         switch (sock->type) {
585         case SOCK_STREAM:
586                 sock->ops = &unix_stream_ops;
587                 break;
588                 /*
589                  *      Believe it or not BSD has AF_UNIX, SOCK_RAW though
590                  *      nothing uses it.
591                  */
592         case SOCK_RAW:
593                 sock->type=SOCK_DGRAM;
594         case SOCK_DGRAM:
595                 sock->ops = &unix_dgram_ops;
596                 break;
597         case SOCK_SEQPACKET:
598                 sock->ops = &unix_seqpacket_ops;
599                 break;
600         default:
601                 return -ESOCKTNOSUPPORT;
602         }
603
604         return unix_create1(sock) ? 0 : -ENOMEM;
605 }
606
607 static int unix_release(struct socket *sock)
608 {
609         struct sock *sk = sock->sk;
610
611         if (!sk)
612                 return 0;
613
614         sock->sk = NULL;
615
616         return unix_release_sock (sk, 0);
617 }
618
619 static int unix_autobind(struct socket *sock)
620 {
621         struct sock *sk = sock->sk;
622         struct unix_sock *u = unix_sk(sk);
623         static u32 ordernum = 1;
624         struct unix_address * addr;
625         int err;
626
627         down(&u->readsem);
628
629         err = 0;
630         if (u->addr)
631                 goto out;
632
633         err = -ENOMEM;
634         addr = kmalloc(sizeof(*addr) + sizeof(short) + 16, GFP_KERNEL);
635         if (!addr)
636                 goto out;
637
638         memset(addr, 0, sizeof(*addr) + sizeof(short) + 16);
639         addr->name->sun_family = AF_UNIX;
640         atomic_set(&addr->refcnt, 1);
641
642 retry:
643         addr->len = sprintf(addr->name->sun_path+1, "%05x", ordernum) + 1 + sizeof(short);
644         addr->hash = unix_hash_fold(csum_partial((void*)addr->name, addr->len, 0));
645
646         write_lock(&unix_table_lock);
647         ordernum = (ordernum+1)&0xFFFFF;
648
649         if (__unix_find_socket_byname(addr->name, addr->len, sock->type,
650                                       addr->hash)) {
651                 write_unlock(&unix_table_lock);
652                 /* Sanity yield. It is unusual case, but yet... */
653                 if (!(ordernum&0xFF))
654                         yield();
655                 goto retry;
656         }
657         addr->hash ^= sk->sk_type;
658
659         __unix_remove_socket(sk);
660         u->addr = addr;
661         __unix_insert_socket(&unix_socket_table[addr->hash], sk);
662         write_unlock(&unix_table_lock);
663         err = 0;
664
665 out:    up(&u->readsem);
666         return err;
667 }
668
/*
 * Resolve the address @sunname (already validated, length @len) to
 * the socket of type @type that is bound to it.
 *
 * Abstract names are looked up in the hash table using @hash.
 * Filesystem names are resolved through the VFS: the caller needs
 * write permission on the inode, and the inode must be a socket.
 *
 * Returns the socket with a reference held.  On failure returns NULL
 * and stores the error in *@error: the path lookup or permission
 * error, -ECONNREFUSED when nothing is bound there, or -EPROTOTYPE
 * when a filesystem socket exists but has a different type.
 */
static struct sock *unix_find_other(struct sockaddr_un *sunname, int len,
                                    int type, unsigned hash, int *error)
{
        struct sock *peer;
        struct nameidata nd;
        int err;

        if (!sunname->sun_path[0]) {
                /* Abstract name: plain hash table lookup. */
                struct dentry *bound;

                peer = unix_find_socket_byname(sunname, len, type, hash);
                if (peer == NULL) {
                        err = -ECONNREFUSED;
                        goto fail;
                }

                bound = unix_sk(peer)->dentry;
                if (bound != NULL)
                        touch_atime(unix_sk(peer)->mnt, bound);
                return peer;
        }

        /* Filesystem name: walk the path and check the inode. */
        err = path_lookup(sunname->sun_path, LOOKUP_FOLLOW, &nd);
        if (err)
                goto fail;

        err = permission(nd.dentry->d_inode,MAY_WRITE, &nd);
        if (err)
                goto put_fail;

        err = -ECONNREFUSED;
        if (!S_ISSOCK(nd.dentry->d_inode->i_mode))
                goto put_fail;

        peer = unix_find_socket_byinode(nd.dentry->d_inode);
        if (peer == NULL)
                goto put_fail;

        if (peer->sk_type != type) {
                /* Bound socket found, but of the wrong type. */
                path_release(&nd);
                sock_put(peer);
                err = -EPROTOTYPE;
                goto fail;
        }

        touch_atime(nd.mnt, nd.dentry);
        path_release(&nd);
        return peer;

put_fail:
        path_release(&nd);
fail:
        *error = err;
        return NULL;
}
720
721
722 static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
723 {
724         struct sock *sk = sock->sk;
725         struct unix_sock *u = unix_sk(sk);
726         struct sockaddr_un *sunaddr=(struct sockaddr_un *)uaddr;
727         struct dentry * dentry = NULL;
728         struct nameidata nd;
729         int err;
730         unsigned hash;
731         struct unix_address *addr;
732         struct hlist_head *list;
733
734         err = -EINVAL;
735         if (sunaddr->sun_family != AF_UNIX)
736                 goto out;
737
738         if (addr_len==sizeof(short)) {
739                 err = unix_autobind(sock);
740                 goto out;
741         }
742
743         err = unix_mkname(sunaddr, addr_len, &hash);
744         if (err < 0)
745                 goto out;
746         addr_len = err;
747
748         down(&u->readsem);
749
750         err = -EINVAL;
751         if (u->addr)
752                 goto out_up;
753
754         err = -ENOMEM;
755         addr = kmalloc(sizeof(*addr)+addr_len, GFP_KERNEL);
756         if (!addr)
757                 goto out_up;
758
759         memcpy(addr->name, sunaddr, addr_len);
760         addr->len = addr_len;
761         addr->hash = hash ^ sk->sk_type;
762         atomic_set(&addr->refcnt, 1);
763
764         if (sunaddr->sun_path[0]) {
765                 unsigned int mode;
766                 err = 0;
767                 /*
768                  * Get the parent directory, calculate the hash for last
769                  * component.
770                  */
771                 err = path_lookup(sunaddr->sun_path, LOOKUP_PARENT, &nd);
772                 if (err)
773                         goto out_mknod_parent;
774                 /*
775                  * Yucky last component or no last component at all?
776                  * (foo/., foo/.., /////)
777                  */
778                 err = -EEXIST;
779                 if (nd.last_type != LAST_NORM)
780                         goto out_mknod;
781                 /*
782                  * Lock the directory.
783                  */
784                 down(&nd.dentry->d_inode->i_sem);
785                 /*
786                  * Do the final lookup.
787                  */
788                 dentry = lookup_hash(&nd.last, nd.dentry);
789                 err = PTR_ERR(dentry);
790                 if (IS_ERR(dentry))
791                         goto out_mknod_unlock;
792                 err = -ENOENT;
793                 /*
794                  * Special case - lookup gave negative, but... we had foo/bar/
795                  * From the vfs_mknod() POV we just have a negative dentry -
796                  * all is fine. Let's be bastards - you had / on the end, you've
797                  * been asking for (non-existent) directory. -ENOENT for you.
798                  */
799                 if (nd.last.name[nd.last.len] && !dentry->d_inode)
800                         goto out_mknod_dput;
801                 /*
802                  * All right, let's create it.
803                  */
804                 mode = S_IFSOCK |
805                        (SOCK_INODE(sock)->i_mode & ~current->fs->umask);
806                 err = vfs_mknod(nd.dentry->d_inode, dentry, mode, 0);
807                 if (err)
808                         goto out_mknod_dput;
809                 up(&nd.dentry->d_inode->i_sem);
810                 dput(nd.dentry);
811                 nd.dentry = dentry;
812
813                 addr->hash = UNIX_HASH_SIZE;
814         }
815
816         write_lock(&unix_table_lock);
817
818         if (!sunaddr->sun_path[0]) {
819                 err = -EADDRINUSE;
820                 if (__unix_find_socket_byname(sunaddr, addr_len,
821                                               sk->sk_type, hash)) {
822                         unix_release_addr(addr);
823                         goto out_unlock;
824                 }
825
826                 list = &unix_socket_table[addr->hash];
827         } else {
828                 list = &unix_socket_table[dentry->d_inode->i_ino & (UNIX_HASH_SIZE-1)];
829                 u->dentry = nd.dentry;
830                 u->mnt    = nd.mnt;
831         }
832
833         err = 0;
834         __unix_remove_socket(sk);
835         u->addr = addr;
836         __unix_insert_socket(list, sk);
837
838 out_unlock:
839         write_unlock(&unix_table_lock);
840 out_up:
841         up(&u->readsem);
842 out:
843         return err;
844
845 out_mknod_dput:
846         dput(dentry);
847 out_mknod_unlock:
848         up(&nd.dentry->d_inode->i_sem);
849 out_mknod:
850         path_release(&nd);
851 out_mknod_parent:
852         if (err==-EEXIST)
853                 err=-EADDRINUSE;
854         unix_release_addr(addr);
855         goto out_up;
856 }
857
858 static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
859                               int alen, int flags)
860 {
861         struct sock *sk = sock->sk;
862         struct sockaddr_un *sunaddr=(struct sockaddr_un*)addr;
863         struct sock *other;
864         unsigned hash;
865         int err;
866
867         if (addr->sa_family != AF_UNSPEC) {
868                 err = unix_mkname(sunaddr, alen, &hash);
869                 if (err < 0)
870                         goto out;
871                 alen = err;
872
873                 if (sock->passcred && !unix_sk(sk)->addr &&
874                     (err = unix_autobind(sock)) != 0)
875                         goto out;
876
877                 other=unix_find_other(sunaddr, alen, sock->type, hash, &err);
878                 if (!other)
879                         goto out;
880
881                 unix_state_wlock(sk);
882
883                 err = -EPERM;
884                 if (!unix_may_send(sk, other))
885                         goto out_unlock;
886
887                 err = security_unix_may_send(sk->sk_socket, other->sk_socket);
888                 if (err)
889                         goto out_unlock;
890
891         } else {
892                 /*
893                  *      1003.1g breaking connected state with AF_UNSPEC
894                  */
895                 other = NULL;
896                 unix_state_wlock(sk);
897         }
898
899         /*
900          * If it was connected, reconnect.
901          */
902         if (unix_peer(sk)) {
903                 struct sock *old_peer = unix_peer(sk);
904                 unix_peer(sk)=other;
905                 unix_state_wunlock(sk);
906
907                 if (other != old_peer)
908                         unix_dgram_disconnected(sk, old_peer);
909                 sock_put(old_peer);
910         } else {
911                 unix_peer(sk)=other;
912                 unix_state_wunlock(sk);
913         }
914         return 0;
915
916 out_unlock:
917         unix_state_wunlock(sk);
918         sock_put(other);
919 out:
920         return err;
921 }
922
923 static long unix_wait_for_peer(struct sock *other, long timeo)
924 {
925         struct unix_sock *u = unix_sk(other);
926         int sched;
927         DEFINE_WAIT(wait);
928
929         prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE);
930
931         sched = !sock_flag(other, SOCK_DEAD) &&
932                 !(other->sk_shutdown & RCV_SHUTDOWN) &&
933                 (skb_queue_len(&other->sk_receive_queue) >
934                  other->sk_max_ack_backlog);
935
936         unix_state_runlock(other);
937
938         if (sched)
939                 timeo = schedule_timeout(timeo);
940
941         finish_wait(&u->peer_wait, &wait);
942         return timeo;
943 }
944
/*
 * unix_stream_connect - connect @sock to the listening socket named by
 * @uaddr.
 *
 * A fresh sock (newsk) and a one-byte skb are allocated before any state
 * lock is taken.  On success newsk and @sock become each other's peers
 * and the skb is appended to the listener's receive queue.  unix_accept()
 * takes the new sock from skb->sk; presumably sock_wmalloc() is what
 * ties the skb to newsk.
 *
 * Returns 0 on success or a negative errno.  The common exit at "out"
 * frees the skb, releases newsk and drops the reference on the listener,
 * each only if it was actually obtained.
 */
945 static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
946                                int addr_len, int flags)
947 {
948         struct sockaddr_un *sunaddr=(struct sockaddr_un *)uaddr;
949         struct sock *sk = sock->sk;
950         struct unix_sock *u = unix_sk(sk), *newu, *otheru;
951         struct sock *newsk = NULL;
952         struct sock *other = NULL;
953         struct sk_buff *skb = NULL;
954         unsigned hash;
955         int st;
956         int err;
957         long timeo;
958
959         err = unix_mkname(sunaddr, addr_len, &hash);
960         if (err < 0)
961                 goto out;
962         addr_len = err;
963
964         if (sock->passcred && !u->addr && (err = unix_autobind(sock)) != 0)
965                 goto out;
966
967         timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);
968
969         /* First of all allocate resources.
970            If we will make it after state is locked,
971            we will have to recheck all again in any case.
972          */
973
974         err = -ENOMEM;
975
976         /* create new sock for complete connection */
977         newsk = unix_create1(NULL);
978         if (newsk == NULL)
979                 goto out;
980
981         /* Allocate skb for sending to listening sock */
982         skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
983         if (skb == NULL)
984                 goto out;
985
986 restart:
987         /*  Find listening sock. */
988         other = unix_find_other(sunaddr, addr_len, sk->sk_type, hash, &err);
989         if (!other)
990                 goto out;
991
992         /* Latch state of peer */
993         unix_state_rlock(other);
994
995         /* Apparently VFS overslept socket death. Retry. */
996         if (sock_flag(other, SOCK_DEAD)) {
997                 unix_state_runlock(other);
998                 sock_put(other);
999                 goto restart;
1000         }
1001
1002         err = -ECONNREFUSED;
1003         if (other->sk_state != TCP_LISTEN)
1004                 goto out_unlock;
1005
             /* Listener's queue is over its backlog: fail at once if we may not wait. */
1006         if (skb_queue_len(&other->sk_receive_queue) >
1007             other->sk_max_ack_backlog) {
1008                 err = -EAGAIN;
1009                 if (!timeo)
1010                         goto out_unlock;
1011
                     /*
                      * unix_wait_for_peer() releases the read lock on other,
                      * so the exits below go to "out" (not "out_unlock") or
                      * drop the reference and look the listener up again.
                      */
1012                 timeo = unix_wait_for_peer(other, timeo);
1013
1014                 err = sock_intr_errno(timeo);
1015                 if (signal_pending(current))
1016                         goto out;
1017                 sock_put(other);
1018                 goto restart;
1019         }
1020
1021         /* Latch our state.
1022
1023            It is tricky place. We need to grab write lock and cannot
1024            drop lock on peer. It is dangerous because deadlock is
1025            possible. Connect to self case and simultaneous
1026            attempt to connect are eliminated by checking socket
1027            state. other is TCP_LISTEN, if sk is TCP_LISTEN we
1028            check this before attempt to grab lock.
1029
1030            Well, and we have to recheck the state after socket locked.
1031          */
1032         st = sk->sk_state;
1033
1034         switch (st) {
1035         case TCP_CLOSE:
1036                 /* This is ok... continue with connect */
1037                 break;
1038         case TCP_ESTABLISHED:
1039                 /* Socket is already connected */
1040                 err = -EISCONN;
1041                 goto out_unlock;
1042         default:
1043                 err = -EINVAL;
1044                 goto out_unlock;
1045         }
1046
1047         unix_state_wlock(sk);
1048
             /* State changed between the unlocked sample and taking the lock: start over. */
1049         if (sk->sk_state != st) {
1050                 unix_state_wunlock(sk);
1051                 unix_state_runlock(other);
1052                 sock_put(other);
1053                 goto restart;
1054         }
1055
1056         err = security_unix_stream_connect(sock, other->sk_socket, newsk);
1057         if (err) {
1058                 unix_state_wunlock(sk);
1059                 goto out_unlock;
1060         }
1061
1062         /* The way is open! Fastly set all the necessary fields... */
1063
             /* newsk's peer pointer to sk is backed by its own reference. */
1064         sock_hold(sk);
1065         unix_peer(newsk)        = sk;
1066         newsk->sk_state         = TCP_ESTABLISHED;
1067         newsk->sk_type          = sk->sk_type;
1068         newsk->sk_peercred.pid  = current->tgid;
1069         newsk->sk_peercred.uid  = current->euid;
1070         newsk->sk_peercred.gid  = current->egid;
1071         newu = unix_sk(newsk);
1072         newsk->sk_sleep         = &newu->peer_wait;
1073         otheru = unix_sk(other);
1074
1075         /* copy address information from listening to new sock*/
1076         if (otheru->addr) {
1077                 atomic_inc(&otheru->addr->refcnt);
1078                 newu->addr = otheru->addr;
1079         }
1080         if (otheru->dentry) {
1081                 newu->dentry    = dget(otheru->dentry);
1082                 newu->mnt       = mntget(otheru->mnt);
1083         }
1084
1085         /* Set credentials */
1086         sk->sk_peercred = other->sk_peercred;
1087
             /* Likewise sk's peer pointer to newsk gets its own reference. */
1088         sock_hold(newsk);
1089         unix_peer(sk)   = newsk;
1090         sock->state     = SS_CONNECTED;
1091         sk->sk_state    = TCP_ESTABLISHED;
1092
1093         unix_state_wunlock(sk);
1094
1095         /* take ten and send info to listening sock */
1096         spin_lock(&other->sk_receive_queue.lock);
1097         __skb_queue_tail(&other->sk_receive_queue, skb);
1098         /* Undo artificially decreased inflight after embryo
1099          * is installed to listening socket. */
1100         atomic_inc(&newu->inflight);
1101         spin_unlock(&other->sk_receive_queue.lock);
1102         unix_state_runlock(other);
1103         other->sk_data_ready(other, 0);
1104         sock_put(other);
1105         return 0;
1106
     /* Error exit while the read lock on other is still held. */
1107 out_unlock:
1108         if (other)
1109                 unix_state_runlock(other);
1110
     /* Common error exit: release only what was actually obtained. */
1111 out:
1112         if (skb)
1113                 kfree_skb(skb);
1114         if (newsk)
1115                 unix_release_sock(newsk, 0);
1116         if (other)
1117                 sock_put(other);
1118         return err;
1119 }
1120
1121 static int unix_socketpair(struct socket *socka, struct socket *sockb)
1122 {
1123         struct sock *ska=socka->sk, *skb = sockb->sk;
1124
1125         /* Join our sockets back to back */
1126         sock_hold(ska);
1127         sock_hold(skb);
1128         unix_peer(ska)=skb;
1129         unix_peer(skb)=ska;
1130         ska->sk_peercred.pid = skb->sk_peercred.pid = current->tgid;
1131         ska->sk_peercred.uid = skb->sk_peercred.uid = current->euid;
1132         ska->sk_peercred.gid = skb->sk_peercred.gid = current->egid;
1133
1134         if (ska->sk_type != SOCK_DGRAM) {
1135                 ska->sk_state = TCP_ESTABLISHED;
1136                 skb->sk_state = TCP_ESTABLISHED;
1137                 socka->state  = SS_CONNECTED;
1138                 sockb->state  = SS_CONNECTED;
1139         }
1140         return 0;
1141 }
1142
1143 static int unix_accept(struct socket *sock, struct socket *newsock, int flags)
1144 {
1145         struct sock *sk = sock->sk;
1146         struct sock *tsk;
1147         struct sk_buff *skb;
1148         int err;
1149
1150         err = -EOPNOTSUPP;
1151         if (sock->type!=SOCK_STREAM && sock->type!=SOCK_SEQPACKET)
1152                 goto out;
1153
1154         err = -EINVAL;
1155         if (sk->sk_state != TCP_LISTEN)
1156                 goto out;
1157
1158         /* If socket state is TCP_LISTEN it cannot change (for now...),
1159          * so that no locks are necessary.
1160          */
1161
1162         skb = skb_recv_datagram(sk, 0, flags&O_NONBLOCK, &err);
1163         if (!skb) {
1164                 /* This means receive shutdown. */
1165                 if (err == 0)
1166                         err = -EINVAL;
1167                 goto out;
1168         }
1169
1170         tsk = skb->sk;
1171         skb_free_datagram(sk, skb);
1172         wake_up_interruptible(&unix_sk(sk)->peer_wait);
1173
1174         /* attach accepted sock to socket */
1175         unix_state_wlock(tsk);
1176         newsock->state = SS_CONNECTED;
1177         sock_graft(tsk, newsock);
1178         unix_state_wunlock(tsk);
1179         return 0;
1180
1181 out:
1182         return err;
1183 }
1184
1185
1186 static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int *uaddr_len, int peer)
1187 {
1188         struct sock *sk = sock->sk;
1189         struct unix_sock *u;
1190         struct sockaddr_un *sunaddr=(struct sockaddr_un *)uaddr;
1191         int err = 0;
1192
1193         if (peer) {
1194                 sk = unix_peer_get(sk);
1195
1196                 err = -ENOTCONN;
1197                 if (!sk)
1198                         goto out;
1199                 err = 0;
1200         } else {
1201                 sock_hold(sk);
1202         }
1203
1204         u = unix_sk(sk);
1205         unix_state_rlock(sk);
1206         if (!u->addr) {
1207                 sunaddr->sun_family = AF_UNIX;
1208                 sunaddr->sun_path[0] = 0;
1209                 *uaddr_len = sizeof(short);
1210         } else {
1211                 struct unix_address *addr = u->addr;
1212
1213                 *uaddr_len = addr->len;
1214                 memcpy(sunaddr, addr->name, *uaddr_len);
1215         }
1216         unix_state_runlock(sk);
1217         sock_put(sk);
1218 out:
1219         return err;
1220 }
1221
1222 static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb)
1223 {
1224         int i;
1225
1226         scm->fp = UNIXCB(skb).fp;
1227         skb->destructor = sock_wfree;
1228         UNIXCB(skb).fp = NULL;
1229
1230         for (i=scm->fp->count-1; i>=0; i--)
1231                 unix_notinflight(scm->fp->fp[i]);
1232 }
1233
1234 static void unix_destruct_fds(struct sk_buff *skb)
1235 {
1236         struct scm_cookie scm;
1237         memset(&scm, 0, sizeof(scm));
1238         unix_detach_fds(&scm, skb);
1239
1240         /* Alas, it calls VFS */
1241         /* So fscking what? fput() had been SMP-safe since the last Summer */
1242         scm_destroy(&scm);
1243         sock_wfree(skb);
1244 }
1245
1246 static void unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
1247 {
1248         int i;
1249         for (i=scm->fp->count-1; i>=0; i--)
1250                 unix_inflight(scm->fp->fp[i]);
1251         UNIXCB(skb).fp = scm->fp;
1252         skb->destructor = unix_destruct_fds;
1253         scm->fp = NULL;
1254 }
1255
1256 /*
1257  *      Send AF_UNIX data.
 *
 *      Datagram send.  The destination is msg->msg_name when
 *      msg->msg_namelen is non-zero, otherwise the connected peer.
 *      The whole message goes into a single skb which is queued on
 *      the receiver.
 *
 *      Returns len on success or a negative errno.  Once scm_send()
 *      has succeeded, every exit runs scm_destroy() on the cookie.
1258  */
1259
1260 static int unix_dgram_sendmsg(struct kiocb *kiocb, struct socket *sock,
1261                               struct msghdr *msg, size_t len)
1262 {
1263         struct sock_iocb *siocb = kiocb_to_siocb(kiocb);
1264         struct sock *sk = sock->sk;
1265         struct unix_sock *u = unix_sk(sk);
1266         struct sockaddr_un *sunaddr=msg->msg_name;
1267         struct sock *other = NULL;
1268         int namelen = 0; /* fake GCC */
1269         int err;
1270         unsigned hash;
1271         struct sk_buff *skb;
1272         long timeo;
1273         struct scm_cookie tmp_scm;
1274
              /* Fall back to an on-stack cookie when the iocb carries none. */
1275         if (NULL == siocb->scm)
1276                 siocb->scm = &tmp_scm;
1277         err = scm_send(sock, msg, siocb->scm);
1278         if (err < 0)
1279                 return err;
1280
1281         err = -EOPNOTSUPP;
1282         if (msg->msg_flags&MSG_OOB)
1283                 goto out;
1284
1285         if (msg->msg_namelen) {
1286                 err = unix_mkname(sunaddr, msg->msg_namelen, &hash);
1287                 if (err < 0)
1288                         goto out;
1289                 namelen = err;
1290         } else {
                      /* No explicit destination: use the connected peer (reference held). */
1291                 sunaddr = NULL;
1292                 err = -ENOTCONN;
1293                 other = unix_peer_get(sk);
1294                 if (!other)
1295                         goto out;
1296         }
1297
1298         if (sock->passcred && !u->addr && (err = unix_autobind(sock)) != 0)
1299                 goto out;
1300
1301         err = -EMSGSIZE;
1302         if (len > sk->sk_sndbuf - 32)
1303                 goto out;
1304
1305         skb = sock_alloc_send_skb(sk, len, msg->msg_flags&MSG_DONTWAIT, &err);
1306         if (skb==NULL)
1307                 goto out;
1308
1309         memcpy(UNIXCREDS(skb), &siocb->scm->creds, sizeof(struct ucred));
1310         if (siocb->scm->fp)
1311                 unix_attach_fds(siocb->scm, skb);
1312
1313         skb->h.raw = skb->data;
1314         err = memcpy_fromiovec(skb_put(skb,len), msg->msg_iov, len);
1315         if (err)
1316                 goto out_free;
1317
1318         timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
1319
1320 restart:
              /* other is NULL on first entry with an explicit address, or after a dead target was dropped. */
1321         if (!other) {
1322                 err = -ECONNRESET;
1323                 if (sunaddr == NULL)
1324                         goto out_free;
1325
1326                 other = unix_find_other(sunaddr, namelen, sk->sk_type,
1327                                         hash, &err);
1328                 if (other==NULL)
1329                         goto out_free;
1330         }
1331
1332         unix_state_rlock(other);
1333         err = -EPERM;
1334         if (!unix_may_send(sk, other))
1335                 goto out_unlock;
1336
1337         if (sock_flag(other, SOCK_DEAD)) {
1338                 /*
1339                  *      Check with 1003.1g - what should
1340                  *      datagram error
1341                  */
                      /* Drop the lock and the reference we obtained for this attempt. */
1342                 unix_state_runlock(other);
1343                 sock_put(other);
1344
1345                 err = 0;
1346                 unix_state_wlock(sk);
1347                 if (unix_peer(sk) == other) {
                              /*
                               * The dead socket was our connected peer: clear
                               * the association.  The second sock_put() below
                               * presumably releases the reference that the
                               * peer pointer itself was holding.
                               */
1348                         unix_peer(sk)=NULL;
1349                         unix_state_wunlock(sk);
1350
1351                         unix_dgram_disconnected(sk, other);
1352                         sock_put(other);
1353                         err = -ECONNREFUSED;
1354                 } else {
1355                         unix_state_wunlock(sk);
1356                 }
1357
                      /* Not our peer: look the address up again (err == 0 here). */
1358                 other = NULL;
1359                 if (err)
1360                         goto out_free;
1361                 goto restart;
1362         }
1363
1364         err = -EPIPE;
1365         if (other->sk_shutdown & RCV_SHUTDOWN)
1366                 goto out_unlock;
1367
1368         err = security_unix_may_send(sk->sk_socket, other->sk_socket);
1369         if (err)
1370                 goto out_unlock;
1371
              /* The queue limit is not applied when the receiver is connected back to us. */
1372         if (unix_peer(other) != sk &&
1373             (skb_queue_len(&other->sk_receive_queue) >
1374              other->sk_max_ack_backlog)) {
1375                 if (!timeo) {
1376                         err = -EAGAIN;
1377                         goto out_unlock;
1378                 }
1379
                      /*
                       * unix_wait_for_peer() releases the read lock on other,
                       * hence "out_free" rather than "out_unlock" below; the
                       * reference on other is kept across the restart.
                       */
1380                 timeo = unix_wait_for_peer(other, timeo);
1381
1382                 err = sock_intr_errno(timeo);
1383                 if (signal_pending(current))
1384                         goto out_free;
1385
1386                 goto restart;
1387         }
1388
1389         skb_queue_tail(&other->sk_receive_queue, skb);
1390         unix_state_runlock(other);
1391         other->sk_data_ready(other, len);
1392         sock_put(other);
1393         scm_destroy(siocb->scm);
1394         return len;
1395
1396 out_unlock:
1397         unix_state_runlock(other);
1398 out_free:
1399         kfree_skb(skb);
1400 out:
1401         if (other)
1402                 sock_put(other);
1403         scm_destroy(siocb->scm);
1404         return err;
1405 }
1406
1407                 
1408 static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock,
1409                                struct msghdr *msg, size_t len)
1410 {
1411         struct sock_iocb *siocb = kiocb_to_siocb(kiocb);
1412         struct sock *sk = sock->sk;
1413         struct sock *other = NULL;
1414         struct sockaddr_un *sunaddr=msg->msg_name;
1415         int err,size;
1416         struct sk_buff *skb;
1417         int sent=0;
1418         struct scm_cookie tmp_scm;
1419
1420         if (NULL == siocb->scm)
1421                 siocb->scm = &tmp_scm;
1422         err = scm_send(sock, msg, siocb->scm);
1423         if (err < 0)
1424                 return err;
1425
1426         err = -EOPNOTSUPP;
1427         if (msg->msg_flags&MSG_OOB)
1428                 goto out_err;
1429
1430         if (msg->msg_namelen) {
1431                 err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
1432                 goto out_err;
1433         } else {
1434                 sunaddr = NULL;
1435                 err = -ENOTCONN;
1436                 other = unix_peer_get(sk);
1437                 if (!other)
1438                         goto out_err;
1439         }
1440
1441         if (sk->sk_shutdown & SEND_SHUTDOWN)
1442                 goto pipe_err;
1443
1444         while(sent < len)
1445         {
1446                 /*
1447                  *      Optimisation for the fact that under 0.01% of X messages typically
1448                  *      need breaking up.
1449                  */
1450
1451                 size=len-sent;
1452
1453                 /* Keep two messages in the pipe so it schedules better */
1454                 if (size > sk->sk_sndbuf / 2 - 64)
1455                         size = sk->sk_sndbuf / 2 - 64;
1456
1457                 if (size > SKB_MAX_ALLOC)
1458                         size = SKB_MAX_ALLOC;
1459                         
1460                 /*
1461                  *      Grab a buffer
1462                  */
1463                  
1464                 skb=sock_alloc_send_skb(sk,size,msg->msg_flags&MSG_DONTWAIT, &err);
1465
1466                 if (skb==NULL)
1467                         goto out_err;
1468
1469                 /*
1470                  *      If you pass two values to the sock_alloc_send_skb
1471                  *      it tries to grab the large buffer with GFP_NOFS
1472                  *      (which can fail easily), and if it fails grab the
1473                  *      fallback size buffer which is under a page and will
1474                  *      succeed. [Alan]
1475                  */
1476                 size = min_t(int, size, skb_tailroom(skb));
1477
1478                 memcpy(UNIXCREDS(skb), &siocb->scm->creds, sizeof(struct ucred));
1479                 if (siocb->scm->fp)
1480                         unix_attach_fds(siocb->scm, skb);
1481
1482                 if ((err = memcpy_fromiovec(skb_put(skb,size), msg->msg_iov, size)) != 0) {
1483                         kfree_skb(skb);
1484                         goto out_err;
1485                 }
1486
1487                 unix_state_rlock(other);
1488
1489                 if (sock_flag(other, SOCK_DEAD) ||
1490                     (other->sk_shutdown & RCV_SHUTDOWN))
1491                         goto pipe_err_free;
1492
1493                 skb_queue_tail(&other->sk_receive_queue, skb);
1494                 unix_state_runlock(other);
1495                 other->sk_data_ready(other, size);
1496                 sent+=size;
1497         }
1498         sock_put(other);
1499
1500         scm_destroy(siocb->scm);
1501         siocb->scm = NULL;
1502
1503         return sent;
1504
1505 pipe_err_free:
1506         unix_state_runlock(other);
1507         kfree_skb(skb);
1508 pipe_err:
1509         if (sent==0 && !(msg->msg_flags&MSG_NOSIGNAL))
1510                 send_sig(SIGPIPE,current,0);
1511         err = -EPIPE;
1512 out_err:
1513         if (other)
1514                 sock_put(other);
1515         scm_destroy(siocb->scm);
1516         siocb->scm = NULL;
1517         return sent ? : err;
1518 }
1519
1520 static void unix_copy_addr(struct msghdr *msg, struct sock *sk)
1521 {
1522         struct unix_sock *u = unix_sk(sk);
1523
1524         msg->msg_namelen = 0;
1525         if (u->addr) {
1526                 msg->msg_namelen = u->addr->len;
1527                 memcpy(msg->msg_name, u->addr->name, u->addr->len);
1528         }
1529 }
1530
1531 static int unix_dgram_recvmsg(struct kiocb *iocb, struct socket *sock,
1532                               struct msghdr *msg, size_t size,
1533                               int flags)
1534 {
1535         struct sock_iocb *siocb = kiocb_to_siocb(iocb);
1536         struct scm_cookie tmp_scm;
1537         struct sock *sk = sock->sk;
1538         struct unix_sock *u = unix_sk(sk);
1539         int noblock = flags & MSG_DONTWAIT;
1540         struct sk_buff *skb;
1541         int err;
1542
1543         err = -EOPNOTSUPP;
1544         if (flags&MSG_OOB)
1545                 goto out;
1546
1547         msg->msg_namelen = 0;
1548
1549         skb = skb_recv_datagram(sk, flags, noblock, &err);
1550         if (!skb)
1551                 goto out;
1552
1553         wake_up_interruptible(&u->peer_wait);
1554
1555         if (msg->msg_name)
1556                 unix_copy_addr(msg, skb->sk);
1557
1558         if (size > skb->len)
1559                 size = skb->len;
1560         else if (size < skb->len)
1561                 msg->msg_flags |= MSG_TRUNC;
1562
1563         err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, size);
1564         if (err)
1565                 goto out_free;
1566
1567         if (!siocb->scm) {
1568                 siocb->scm = &tmp_scm;
1569                 memset(&tmp_scm, 0, sizeof(tmp_scm));
1570         }
1571         siocb->scm->creds = *UNIXCREDS(skb);
1572
1573         if (!(flags & MSG_PEEK))
1574         {
1575                 if (UNIXCB(skb).fp)
1576                         unix_detach_fds(siocb->scm, skb);
1577         }
1578         else 
1579         {
1580                 /* It is questionable: on PEEK we could:
1581                    - do not return fds - good, but too simple 8)
1582                    - return fds, and do not return them on read (old strategy,
1583                      apparently wrong)
1584                    - clone fds (I chose it for now, it is the most universal
1585                      solution)
1586                 
1587                    POSIX 1003.1g does not actually define this clearly
1588                    at all. POSIX 1003.1g doesn't define a lot of things
1589                    clearly however!                  
1590                    
1591                 */
1592                 if (UNIXCB(skb).fp)
1593                         siocb->scm->fp = scm_fp_dup(UNIXCB(skb).fp);
1594         }
1595         err = size;
1596
1597         scm_recv(sock, msg, siocb->scm, flags);
1598
1599 out_free:
1600         skb_free_datagram(sk,skb);
1601 out:
1602         return err;
1603 }
1604
1605 /*
1606  *      Sleep until data has arrive. But check for races..
1607  */
1608  
1609 static long unix_stream_data_wait(struct sock * sk, long timeo)
1610 {
1611         DEFINE_WAIT(wait);
1612
1613         unix_state_rlock(sk);
1614
1615         for (;;) {
1616                 prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
1617
1618                 if (skb_queue_len(&sk->sk_receive_queue) ||
1619                     sk->sk_err ||
1620                     (sk->sk_shutdown & RCV_SHUTDOWN) ||
1621                     signal_pending(current) ||
1622                     !timeo)
1623                         break;
1624
1625                 set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1626                 unix_state_runlock(sk);
1627                 timeo = schedule_timeout(timeo);
1628                 unix_state_rlock(sk);
1629                 clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1630         }
1631
1632         finish_wait(sk->sk_sleep, &wait);
1633         unix_state_runlock(sk);
1634         return timeo;
1635 }
1636
1637
1638
/*
 * unix_stream_recvmsg - receive from a connected stream socket.
 *
 * Dequeues skbs from the receive queue under u->readsem and copies up
 * to @size bytes into msg->msg_iov.  Data from skbs with differing
 * credentials is never merged into one read, and a partly consumed skb
 * is put back at the head of the queue.
 *
 * Returns the number of bytes copied if non-zero (this slot holds
 * -EFAULT when the very first copy faulted), otherwise err:
 * -EINVAL if the socket is not TCP_ESTABLISHED, -EOPNOTSUPP for MSG_OOB,
 * -EAGAIN when no data arrived and the timeout is exhausted, or the
 * pending socket / signal error.
 */
1639 static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock,
1640                                struct msghdr *msg, size_t size,
1641                                int flags)
1642 {
1643         struct sock_iocb *siocb = kiocb_to_siocb(iocb);
1644         struct scm_cookie tmp_scm;
1645         struct sock *sk = sock->sk;
1646         struct unix_sock *u = unix_sk(sk);
1647         struct sockaddr_un *sunaddr=msg->msg_name;
1648         int copied = 0;
1649         int check_creds = 0;
1650         int target;
1651         int err = 0;
1652         long timeo;
1653
1654         err = -EINVAL;
1655         if (sk->sk_state != TCP_ESTABLISHED)
1656                 goto out;
1657
1658         err = -EOPNOTSUPP;
1659         if (flags&MSG_OOB)
1660                 goto out;
1661
1662         target = sock_rcvlowat(sk, flags&MSG_WAITALL, size);
1663         timeo = sock_rcvtimeo(sk, flags&MSG_DONTWAIT);
1664
1665         msg->msg_namelen = 0;
1666
1667         /* Lock the socket to prevent queue disordering
1668          * while sleeps in memcpy_tomsg
1669          */
1670
1671         if (!siocb->scm) {
1672                 siocb->scm = &tmp_scm;
1673                 memset(&tmp_scm, 0, sizeof(tmp_scm));
1674         }
1675
1676         down(&u->readsem);
1677
1678         do
1679         {
1680                 int chunk;
1681                 struct sk_buff *skb;
1682
1683                 skb = skb_dequeue(&sk->sk_receive_queue);
1684                 if (skb==NULL)
1685                 {
                              /* Queue empty: done if enough has been copied already. */
1686                         if (copied >= target)
1687                                 break;
1688
1689                         /*
1690                          *      POSIX 1003.1g mandates this order.
1691                          */
1692                          
1693                         if ((err = sock_error(sk)) != 0)
1694                                 break;
1695                         if (sk->sk_shutdown & RCV_SHUTDOWN)
1696                                 break;
1697                         err = -EAGAIN;
1698                         if (!timeo)
1699                                 break;
                              /* readsem is not held while sleeping for data. */
1700                         up(&u->readsem);
1701
1702                         timeo = unix_stream_data_wait(sk, timeo);
1703
                              /*
                               * readsem was released above, so leave through
                               * "out" directly; this bypasses both the up()
                               * and the scm_recv() after the loop.
                               */
1704                         if (signal_pending(current)) {
1705                                 err = sock_intr_errno(timeo);
1706                                 goto out;
1707                         }
1708                         down(&u->readsem);
1709                         continue;
1710                 }
1711
1712                 if (check_creds) {
1713                         /* Never glue messages from different writers */
1714                         if (memcmp(UNIXCREDS(skb), &siocb->scm->creds, sizeof(siocb->scm->creds)) != 0) {
1715                                 skb_queue_head(&sk->sk_receive_queue, skb);
1716                                 break;
1717                         }
1718                 } else {
1719                         /* Copy credentials */
1720                         siocb->scm->creds = *UNIXCREDS(skb);
1721                         check_creds = 1;
1722                 }
1723
1724                 /* Copy address just once */
1725                 if (sunaddr)
1726                 {
1727                         unix_copy_addr(msg, skb->sk);
1728                         sunaddr = NULL;
1729                 }
1730
1731                 chunk = min_t(unsigned int, skb->len, size);
1732                 if (memcpy_toiovec(msg->msg_iov, skb->data, chunk)) {
                              /* Copy faulted: requeue the skb untouched; report -EFAULT only if nothing was copied before. */
1733                         skb_queue_head(&sk->sk_receive_queue, skb);
1734                         if (copied == 0)
1735                                 copied = -EFAULT;
1736                         break;
1737                 }
1738                 copied += chunk;
1739                 size -= chunk;
1740
1741                 /* Mark read part of skb as used */
1742                 if (!(flags & MSG_PEEK))
1743                 {
1744                         skb_pull(skb, chunk);
1745
1746                         if (UNIXCB(skb).fp)
1747                                 unix_detach_fds(siocb->scm, skb);
1748
1749                         /* put the skb back if we didn't use it up.. */
1750                         if (skb->len)
1751                         {
1752                                 skb_queue_head(&sk->sk_receive_queue, skb);
1753                                 break;
1754                         }
1755
1756                         kfree_skb(skb);
1757
                              /* Descriptors were detached into the cookie: stop here instead of merging further skbs. */
1758                         if (siocb->scm->fp)
1759                                 break;
1760                 }
1761                 else
1762                 {
1763                         /* It is questionable, see note in unix_dgram_recvmsg.
1764                          */
1765                         if (UNIXCB(skb).fp)
1766                                 siocb->scm->fp = scm_fp_dup(UNIXCB(skb).fp);
1767
1768                         /* put message back and return */
1769                         skb_queue_head(&sk->sk_receive_queue, skb);
1770                         break;
1771                 }
1772         } while (size);
1773
1774         up(&u->readsem);
1775         scm_recv(sock, msg, siocb->scm, flags);
1776 out:
1777         return copied ? : err;
1778 }
1779
1780 static int unix_shutdown(struct socket *sock, int mode)
1781 {
1782         struct sock *sk = sock->sk;
1783         struct sock *other;
1784
1785         mode = (mode+1)&(RCV_SHUTDOWN|SEND_SHUTDOWN);
1786
1787         if (mode) {
1788                 unix_state_wlock(sk);
1789                 sk->sk_shutdown |= mode;
1790                 other=unix_peer(sk);
1791                 if (other)
1792                         sock_hold(other);
1793                 unix_state_wunlock(sk);
1794                 sk->sk_state_change(sk);
1795
1796                 if (other &&
1797                         (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) {
1798
1799                         int peer_mode = 0;
1800
1801                         if (mode&RCV_SHUTDOWN)
1802                                 peer_mode |= SEND_SHUTDOWN;
1803                         if (mode&SEND_SHUTDOWN)
1804                                 peer_mode |= RCV_SHUTDOWN;
1805                         unix_state_wlock(other);
1806                         other->sk_shutdown |= peer_mode;
1807                         unix_state_wunlock(other);
1808                         other->sk_state_change(other);
1809                         read_lock(&other->sk_callback_lock);
1810                         if (peer_mode == SHUTDOWN_MASK)
1811                                 sk_wake_async(other,1,POLL_HUP);
1812                         else if (peer_mode & RCV_SHUTDOWN)
1813                                 sk_wake_async(other,1,POLL_IN);
1814                         read_unlock(&other->sk_callback_lock);
1815                 }
1816                 if (other)
1817                         sock_put(other);
1818         }
1819         return 0;
1820 }
1821
1822 static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
1823 {
1824         struct sock *sk = sock->sk;
1825         long amount=0;
1826         int err;
1827
1828         switch(cmd)
1829         {
1830                 case SIOCOUTQ:
1831                         amount = atomic_read(&sk->sk_wmem_alloc);
1832                         err = put_user(amount, (int *)arg);
1833                         break;
1834                 case SIOCINQ:
1835                 {
1836                         struct sk_buff *skb;
1837                         if (sk->sk_state == TCP_LISTEN) {
1838                                 err = -EINVAL;
1839                                 break;
1840                         }
1841
1842                         spin_lock(&sk->sk_receive_queue.lock);
1843                         skb = skb_peek(&sk->sk_receive_queue);
1844                         if (skb)
1845                                 amount=skb->len;
1846                         spin_unlock(&sk->sk_receive_queue.lock);
1847                         err = put_user(amount, (int *)arg);
1848                         break;
1849                 }
1850
1851                 default:
1852                         err = dev_ioctl(cmd, (void *)arg);
1853                         break;
1854         }
1855         return err;
1856 }
1857
1858 static unsigned int unix_poll(struct file * file, struct socket *sock, poll_table *wait)
1859 {
1860         struct sock *sk = sock->sk;
1861         unsigned int mask;
1862
1863         poll_wait(file, sk->sk_sleep, wait);
1864         mask = 0;
1865
1866         /* exceptional events? */
1867         if (sk->sk_err)
1868                 mask |= POLLERR;
1869         if (sk->sk_shutdown == SHUTDOWN_MASK)
1870                 mask |= POLLHUP;
1871
1872         /* readable? */
1873         if (!skb_queue_empty(&sk->sk_receive_queue) ||
1874             (sk->sk_shutdown & RCV_SHUTDOWN))
1875                 mask |= POLLIN | POLLRDNORM;
1876
1877         /* Connection-based need to check for termination and startup */
1878         if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) && sk->sk_state == TCP_CLOSE)
1879                 mask |= POLLHUP;
1880
1881         /*
1882          * we set writable also when the other side has shut down the
1883          * connection. This prevents stuck sockets.
1884          */
1885         if (unix_writable(sk))
1886                 mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
1887
1888         return mask;
1889 }
1890
1891
1892 #ifdef CONFIG_PROC_FS
1893 static struct sock *unix_seq_idx(int *iter, loff_t pos)
1894 {
1895         loff_t off = 0;
1896         struct sock *s;
1897
1898         for (s = first_unix_socket(iter); s; s = next_unix_socket(iter, s)) {
1899                 if (off == pos) 
1900                         return s;
1901                 ++off;
1902         }
1903         return NULL;
1904 }
1905
1906
1907 static void *unix_seq_start(struct seq_file *seq, loff_t *pos)
1908 {
1909         read_lock(&unix_table_lock);
1910         return *pos ? unix_seq_idx(seq->private, *pos - 1) : ((void *) 1);
1911 }
1912
1913 static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1914 {
1915         ++*pos;
1916
1917         if (v == (void *)1) 
1918                 return first_unix_socket(seq->private);
1919         return next_unix_socket(seq->private, v);
1920 }
1921
1922 static void unix_seq_stop(struct seq_file *seq, void *v)
1923 {
1924         read_unlock(&unix_table_lock);
1925 }
1926
1927 static int unix_seq_show(struct seq_file *seq, void *v)
1928 {
1929         
1930         if (v == (void *)1)
1931                 seq_puts(seq, "Num       RefCount Protocol Flags    Type St "
1932                          "Inode Path\n");
1933         else {
1934                 struct sock *s = v;
1935                 struct unix_sock *u = unix_sk(s);
1936                 unix_state_rlock(s);
1937
1938                 seq_printf(seq, "%p: %08X %08X %08X %04X %02X %5lu",
1939                         s,
1940                         atomic_read(&s->sk_refcnt),
1941                         0,
1942                         s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0,
1943                         s->sk_type,
1944                         s->sk_socket ?
1945                         (s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) :
1946                         (s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING),
1947                         sock_i_ino(s));
1948
1949                 if (u->addr) {
1950                         int i, len;
1951                         seq_putc(seq, ' ');
1952
1953                         i = 0;
1954                         len = u->addr->len - sizeof(short);
1955                         if (!UNIX_ABSTRACT(s))
1956                                 len--;
1957                         else {
1958                                 seq_putc(seq, '@');
1959                                 i++;
1960                         }
1961                         for ( ; i < len; i++)
1962                                 seq_putc(seq, u->addr->name->sun_path[i]);
1963                 }
1964                 unix_state_runlock(s);
1965                 seq_putc(seq, '\n');
1966         }
1967
1968         return 0;
1969 }
1970
1971 static struct seq_operations unix_seq_ops = {
1972         .start  = unix_seq_start,
1973         .next   = unix_seq_next,
1974         .stop   = unix_seq_stop,
1975         .show   = unix_seq_show,
1976 };
1977
1978
1979 static int unix_seq_open(struct inode *inode, struct file *file)
1980 {
1981         struct seq_file *seq;
1982         int rc = -ENOMEM;
1983         int *iter = kmalloc(sizeof(int), GFP_KERNEL);
1984
1985         if (!iter)
1986                 goto out;
1987
1988         rc = seq_open(file, &unix_seq_ops);
1989         if (rc)
1990                 goto out_kfree;
1991
1992         seq          = file->private_data;
1993         seq->private = iter;
1994         *iter = 0;
1995 out:
1996         return rc;
1997 out_kfree:
1998         kfree(iter);
1999         goto out;
2000 }
2001
2002 static struct file_operations unix_seq_fops = {
2003         .owner          = THIS_MODULE,
2004         .open           = unix_seq_open,
2005         .read           = seq_read,
2006         .llseek         = seq_lseek,
2007         .release        = seq_release_private,
2008 };
2009
2010 #endif
2011
2012 static struct net_proto_family unix_family_ops = {
2013         .family = PF_UNIX,
2014         .create = unix_create,
2015         .owner  = THIS_MODULE,
2016 };
2017
/*
 * Sysctl hooks.  With CONFIG_SYSCTL the implementations are external to
 * this file; without it they collapse to empty inline stubs so callers
 * need no #ifdef of their own.
 */
#ifdef CONFIG_SYSCTL
extern void unix_sysctl_register(void);
extern void unix_sysctl_unregister(void);
#else
static inline void unix_sysctl_register(void)
{
}

static inline void unix_sysctl_unregister(void)
{
}
#endif
2025
2026 static int __init af_unix_init(void)
2027 {
2028         struct sk_buff *dummy_skb;
2029
2030         if (sizeof(struct unix_skb_parms) > sizeof(dummy_skb->cb)) {
2031                 printk(KERN_CRIT "%s: panic\n", __FUNCTION__);
2032                 return -1;
2033         }
2034         /* allocate our sock slab cache */
2035         unix_sk_cachep = kmem_cache_create("unix_sock",
2036                                            sizeof(struct unix_sock), 0,
2037                                            SLAB_HWCACHE_ALIGN, 0, 0);
2038         if (!unix_sk_cachep)
2039                 printk(KERN_CRIT
2040                         "af_unix_init: Cannot create unix_sock SLAB cache!\n");
2041
2042         sock_register(&unix_family_ops);
2043 #ifdef CONFIG_PROC_FS
2044         proc_net_fops_create("unix", 0, &unix_seq_fops);
2045 #endif
2046         unix_sysctl_register();
2047         return 0;
2048 }
2049
2050 static void __exit af_unix_exit(void)
2051 {
2052         sock_unregister(PF_UNIX);
2053         unix_sysctl_unregister();
2054         proc_net_remove("unix");
2055         kmem_cache_destroy(unix_sk_cachep);
2056 }
2057
2058 module_init(af_unix_init);
2059 module_exit(af_unix_exit);
2060
2061 MODULE_LICENSE("GPL");
2062 MODULE_ALIAS_NETPROTO(PF_UNIX);