This commit was manufactured by cvs2svn to create tag
[linux-2.6.git] / net / unix / af_unix.c
1 /*
2  * NET4:        Implementation of BSD Unix domain sockets.
3  *
4  * Authors:     Alan Cox, <alan.cox@linux.org>
5  *
6  *              This program is free software; you can redistribute it and/or
7  *              modify it under the terms of the GNU General Public License
8  *              as published by the Free Software Foundation; either version
9  *              2 of the License, or (at your option) any later version.
10  *
11  * Version:     $Id: af_unix.c,v 1.133 2002/02/08 03:57:19 davem Exp $
12  *
13  * Fixes:
14  *              Linus Torvalds  :       Assorted bug cures.
15  *              Niibe Yutaka    :       async I/O support.
16  *              Carsten Paeth   :       PF_UNIX check, address fixes.
17  *              Alan Cox        :       Limit size of allocated blocks.
18  *              Alan Cox        :       Fixed the stupid socketpair bug.
19  *              Alan Cox        :       BSD compatibility fine tuning.
20  *              Alan Cox        :       Fixed a bug in connect when interrupted.
21  *              Alan Cox        :       Sorted out a proper draft version of
22  *                                      file descriptor passing hacked up from
23  *                                      Mike Shaver's work.
24  *              Marty Leisner   :       Fixes to fd passing
25  *              Nick Nevin      :       recvmsg bugfix.
26  *              Alan Cox        :       Started proper garbage collector
27  *              Heiko EiBfeldt  :       Missing verify_area check
28  *              Alan Cox        :       Started POSIXisms
29  *              Andreas Schwab  :       Replace inode by dentry for proper
30  *                                      reference counting
31  *              Kirk Petersen   :       Made this a module
32  *          Christoph Rohland   :       Elegant non-blocking accept/connect algorithm.
33  *                                      Lots of bug fixes.
34  *           Alexey Kuznetsov   :       Repaired (I hope) bugs introduced
35  *                                      by above two patches.
36  *           Andrea Arcangeli   :       If possible we block in connect(2)
37  *                                      if the max backlog of the listen socket
38  *                                      has been reached. This won't break
39  *                                      old apps and it will avoid a huge number
40  *                                      of socks hashed (this is for unix_gc()
41  *                                      performance reasons).
42  *                                      Security fix that limits the max
43  *                                      number of socks to 2*max_files and
44  *                                      the number of skb queueable in the
45  *                                      dgram receiver.
46  *              Artur Skawina   :       Hash function optimizations
47  *           Alexey Kuznetsov   :       Full scale SMP. Lot of bugs are introduced 8)
48  *            Malcolm Beattie   :       Set peercred for socketpair
49  *           Michal Ostrowski   :       Module initialization cleanup.
50  *           Arnaldo C. Melo    :       Remove MOD_{INC,DEC}_USE_COUNT,
51  *                                      the core infrastructure is doing that
52  *                                      for all net proto families now (2.5.69+)
53  *
54  *
55  * Known differences from reference BSD that was tested:
56  *
57  *      [TO FIX]
58  *      ECONNREFUSED is not returned from one end of a connected() socket to the
59  *              other the moment one end closes.
60  *      fstat() doesn't return st_dev=0, and give the blksize as high water mark
61  *              and a fake inode identifier (nor the BSD first socket fstat twice bug).
62  *      [NOT TO FIX]
63  *      accept() returns a path name even if the connecting socket has closed
64  *              in the meantime (BSD loses the path and gives up).
65  *      accept() returns 0 length path for an unbound connector. BSD returns 16
66  *              and a null first byte in the path (but not for gethost/peername - BSD bug ??)
67  *      socketpair(...SOCK_RAW..) doesn't panic the kernel.
68  *      BSD af_unix apparently has connect forgetting to block properly.
69  *              (need to check this with the POSIX spec in detail)
70  *
71  * Differences from 2.0.0-11-... (ANK)
72  *      Bug fixes and improvements.
73  *              - client shutdown killed server socket.
74  *              - removed all useless cli/sti pairs.
75  *
76  *      Semantic changes/extensions.
77  *              - generic control message passing.
78  *              - SCM_CREDENTIALS control message.
79  *              - "Abstract" (not FS based) socket bindings.
80  *                Abstract names are sequences of bytes (not zero terminated)
81  *                started by 0, so that this name space does not intersect
82  *                with BSD names.
83  */
84
85 #include <linux/module.h>
86 #include <linux/config.h>
87 #include <linux/kernel.h>
88 #include <linux/major.h>
89 #include <linux/signal.h>
90 #include <linux/sched.h>
91 #include <linux/errno.h>
92 #include <linux/string.h>
93 #include <linux/stat.h>
94 #include <linux/dcache.h>
95 #include <linux/namei.h>
96 #include <linux/socket.h>
97 #include <linux/un.h>
98 #include <linux/fcntl.h>
99 #include <linux/termios.h>
100 #include <linux/sockios.h>
101 #include <linux/net.h>
102 #include <linux/in.h>
103 #include <linux/fs.h>
104 #include <linux/slab.h>
105 #include <asm/uaccess.h>
106 #include <linux/skbuff.h>
107 #include <linux/netdevice.h>
108 #include <net/sock.h>
109 #include <linux/tcp.h>
110 #include <net/af_unix.h>
111 #include <linux/proc_fs.h>
112 #include <linux/seq_file.h>
113 #include <net/scm.h>
114 #include <linux/init.h>
115 #include <linux/poll.h>
116 #include <linux/smp_lock.h>
117 #include <linux/rtnetlink.h>
118 #include <linux/mount.h>
119 #include <net/checksum.h>
120 #include <linux/security.h>
121 #include <linux/vs_context.h>
122 #include <linux/vs_network.h>
123
124 int sysctl_unix_max_dgram_qlen = 10;
125
126 kmem_cache_t *unix_sk_cachep;
127
128 struct hlist_head unix_socket_table[UNIX_HASH_SIZE + 1];
129 rwlock_t unix_table_lock = RW_LOCK_UNLOCKED;
130 static atomic_t unix_nr_socks = ATOMIC_INIT(0);
131
132 #define unix_sockets_unbound    (&unix_socket_table[UNIX_HASH_SIZE])
133
134 #define UNIX_ABSTRACT(sk)       (unix_sk(sk)->addr->hash != UNIX_HASH_SIZE)
135
136 /*
137  *  SMP locking strategy:
138  *    hash table is protected with rwlock unix_table_lock
139  *    each socket state is protected by separate rwlock.
140  */
141
142 static inline unsigned unix_hash_fold(unsigned hash)
143 {
144         hash ^= hash>>16;
145         hash ^= hash>>8;
146         return hash&(UNIX_HASH_SIZE-1);
147 }
148
/* Peer of socket @s.  Kept as a macro because it is also assigned to. */
#define unix_peer(s) ((s)->sk_pair)
150
/* Non-zero when @osk has @sk recorded as its peer. */
static inline int unix_our_peer(struct sock *sk, struct sock *osk)
{
	return sk == unix_peer(osk);
}
155
/*
 * @sk may send to @osk when @osk has no peer at all, or when its
 * peer is @sk itself.
 */
static inline int unix_may_send(struct sock *sk, struct sock *osk)
{
	if (!unix_peer(osk))
		return 1;
	return unix_our_peer(sk, osk);
}
160
161 static struct sock *unix_peer_get(struct sock *s)
162 {
163         struct sock *peer;
164
165         unix_state_rlock(s);
166         peer = unix_peer(s);
167         if (peer)
168                 sock_hold(peer);
169         unix_state_runlock(s);
170         return peer;
171 }
172
173 static inline void unix_release_addr(struct unix_address *addr)
174 {
175         if (atomic_dec_and_test(&addr->refcnt))
176                 kfree(addr);
177 }
178
179 /*
180  *      Check unix socket name:
181  *              - should be not zero length.
182  *              - if started by not zero, should be NULL terminated (FS object)
183  *              - if started by zero, it is abstract name.
184  */
185  
186 static int unix_mkname(struct sockaddr_un * sunaddr, int len, unsigned *hashp)
187 {
188         if (len <= sizeof(short) || len > sizeof(*sunaddr))
189                 return -EINVAL;
190         if (!sunaddr || sunaddr->sun_family != AF_UNIX)
191                 return -EINVAL;
192         if (sunaddr->sun_path[0])
193         {
194                 /*
195                  *      This may look like an off by one error but it is
196                  *      a bit more subtle. 108 is the longest valid AF_UNIX
197                  *      path for a binding. sun_path[108] doesn't as such
198                  *      exist. However in kernel space we are guaranteed that
199                  *      it is a valid memory location in our kernel
200                  *      address buffer.
201                  */
202                 if (len > sizeof(*sunaddr))
203                         len = sizeof(*sunaddr);
204                 ((char *)sunaddr)[len]=0;
205                 len = strlen(sunaddr->sun_path)+1+sizeof(short);
206                 return len;
207         }
208
209         *hashp = unix_hash_fold(csum_partial((char*)sunaddr, len, 0));
210         return len;
211 }
212
/*
 * Unlink @sk from whatever hash chain it is on.  No locking is done
 * here; the locked variant is unix_remove_socket().
 */
static void __unix_remove_socket(struct sock *sk)
{
	sk_del_node_init(sk);
}
217
/*
 * Link @sk onto @list.  The socket must not be hashed already.  No
 * locking is done here; the locked variant is unix_insert_socket().
 */
static void __unix_insert_socket(struct hlist_head *list, struct sock *sk)
{
	BUG_TRAP(sk_unhashed(sk));
	sk_add_node(sk, list);
}
223
224 static inline void unix_remove_socket(struct sock *sk)
225 {
226         write_lock(&unix_table_lock);
227         __unix_remove_socket(sk);
228         write_unlock(&unix_table_lock);
229 }
230
231 static inline void unix_insert_socket(struct hlist_head *list, struct sock *sk)
232 {
233         write_lock(&unix_table_lock);
234         __unix_insert_socket(list, sk);
235         write_unlock(&unix_table_lock);
236 }
237
238 static struct sock *__unix_find_socket_byname(struct sockaddr_un *sunname,
239                                               int len, int type, unsigned hash)
240 {
241         struct sock *s;
242         struct hlist_node *node;
243
244         sk_for_each(s, node, &unix_socket_table[hash ^ type]) {
245                 struct unix_sock *u = unix_sk(s);
246
247                 if (u->addr->len == len &&
248                     !memcmp(u->addr->name, sunname, len))
249                         goto found;
250         }
251         s = NULL;
252 found:
253         return s;
254 }
255
256 static inline struct sock *unix_find_socket_byname(struct sockaddr_un *sunname,
257                                                    int len, int type,
258                                                    unsigned hash)
259 {
260         struct sock *s;
261
262         read_lock(&unix_table_lock);
263         s = __unix_find_socket_byname(sunname, len, type, hash);
264         if (s)
265                 sock_hold(s);
266         read_unlock(&unix_table_lock);
267         return s;
268 }
269
270 static struct sock *unix_find_socket_byinode(struct inode *i)
271 {
272         struct sock *s;
273         struct hlist_node *node;
274
275         read_lock(&unix_table_lock);
276         sk_for_each(s, node,
277                     &unix_socket_table[i->i_ino & (UNIX_HASH_SIZE - 1)]) {
278                 struct dentry *dentry = unix_sk(s)->dentry;
279
280                 if(dentry && dentry->d_inode == i)
281                 {
282                         sock_hold(s);
283                         goto found;
284                 }
285         }
286         s = NULL;
287 found:
288         read_unlock(&unix_table_lock);
289         return s;
290 }
291
292 static inline int unix_writable(struct sock *sk)
293 {
294         return (atomic_read(&sk->sk_wmem_alloc) << 2) <= sk->sk_sndbuf;
295 }
296
297 static void unix_write_space(struct sock *sk)
298 {
299         read_lock(&sk->sk_callback_lock);
300         if (unix_writable(sk)) {
301                 if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
302                         wake_up_interruptible(sk->sk_sleep);
303                 sk_wake_async(sk, 2, POLL_OUT);
304         }
305         read_unlock(&sk->sk_callback_lock);
306 }
307
308 /* When dgram socket disconnects (or changes its peer), we clear its receive
309  * queue of packets arrived from previous peer. First, it allows to do
310  * flow control based only on wmem_alloc; second, sk connected to peer
311  * may receive messages only from that peer. */
312 static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
313 {
314         if (skb_queue_len(&sk->sk_receive_queue)) {
315                 skb_queue_purge(&sk->sk_receive_queue);
316                 wake_up_interruptible_all(&unix_sk(sk)->peer_wait);
317
318                 /* If one link of bidirectional dgram pipe is disconnected,
319                  * we signal error. Messages are lost. Do not make this,
320                  * when peer was not connected to us.
321                  */
322                 if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) {
323                         other->sk_err = ECONNRESET;
324                         other->sk_error_report(other);
325                 }
326         }
327 }
328
329 static void unix_sock_destructor(struct sock *sk)
330 {
331         struct unix_sock *u = unix_sk(sk);
332
333         skb_queue_purge(&sk->sk_receive_queue);
334
335         BUG_TRAP(!atomic_read(&sk->sk_wmem_alloc));
336         BUG_TRAP(sk_unhashed(sk));
337         BUG_TRAP(!sk->sk_socket);
338         if (!sock_flag(sk, SOCK_DEAD)) {
339                 printk("Attempt to release alive unix socket: %p\n", sk);
340                 return;
341         }
342
343         if (u->addr)
344                 unix_release_addr(u->addr);
345
346         atomic_dec(&unix_nr_socks);
347 #ifdef UNIX_REFCNT_DEBUG
348         printk(KERN_DEBUG "UNIX %p is destroyed, %d are still alive.\n", sk, atomic_read(&unix_nr_socks));
349 #endif
350 }
351
352 static int unix_release_sock (struct sock *sk, int embrion)
353 {
354         struct unix_sock *u = unix_sk(sk);
355         struct dentry *dentry;
356         struct vfsmount *mnt;
357         struct sock *skpair;
358         struct sk_buff *skb;
359         int state;
360
361         unix_remove_socket(sk);
362
363         /* Clear state */
364         unix_state_wlock(sk);
365         sock_orphan(sk);
366         sk->sk_shutdown = SHUTDOWN_MASK;
367         dentry       = u->dentry;
368         u->dentry    = NULL;
369         mnt          = u->mnt;
370         u->mnt       = NULL;
371         state = sk->sk_state;
372         sk->sk_state = TCP_CLOSE;
373         unix_state_wunlock(sk);
374
375         wake_up_interruptible_all(&u->peer_wait);
376
377         skpair=unix_peer(sk);
378
379         if (skpair!=NULL) {
380                 if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) {
381                         unix_state_wlock(skpair);
382                         /* No more writes */
383                         skpair->sk_shutdown = SHUTDOWN_MASK;
384                         if (!skb_queue_empty(&sk->sk_receive_queue) || embrion)
385                                 skpair->sk_err = ECONNRESET;
386                         unix_state_wunlock(skpair);
387                         skpair->sk_state_change(skpair);
388                         read_lock(&skpair->sk_callback_lock);
389                         sk_wake_async(skpair,1,POLL_HUP);
390                         read_unlock(&skpair->sk_callback_lock);
391                 }
392                 sock_put(skpair); /* It may now die */
393                 unix_peer(sk) = NULL;
394         }
395
396         /* Try to flush out this socket. Throw out buffers at least */
397
398         while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
399                 if (state==TCP_LISTEN)
400                         unix_release_sock(skb->sk, 1);
401                 /* passed fds are erased in the kfree_skb hook        */
402                 kfree_skb(skb);
403         }
404
405         if (dentry) {
406                 dput(dentry);
407                 mntput(mnt);
408         }
409
410         clr_vx_info(&sk->sk_vx_info);
411         clr_nx_info(&sk->sk_nx_info);
412         sock_put(sk);
413
414         /* ---- Socket is dead now and most probably destroyed ---- */
415
416         /*
417          * Fixme: BSD difference: In BSD all sockets connected to use get
418          *        ECONNRESET and we die on the spot. In Linux we behave
419          *        like files and pipes do and wait for the last
420          *        dereference.
421          *
422          * Can't we simply set sock->err?
423          *
424          *        What the above comment does talk about? --ANK(980817)
425          */
426
427         if (atomic_read(&unix_tot_inflight))
428                 unix_gc();              /* Garbage collect fds */       
429
430         return 0;
431 }
432
433 static int unix_listen(struct socket *sock, int backlog)
434 {
435         int err;
436         struct sock *sk = sock->sk;
437         struct unix_sock *u = unix_sk(sk);
438
439         err = -EOPNOTSUPP;
440         if (sock->type!=SOCK_STREAM && sock->type!=SOCK_SEQPACKET)
441                 goto out;                       /* Only stream/seqpacket sockets accept */
442         err = -EINVAL;
443         if (!u->addr)
444                 goto out;                       /* No listens on an unbound socket */
445         unix_state_wlock(sk);
446         if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN)
447                 goto out_unlock;
448         if (backlog > sk->sk_max_ack_backlog)
449                 wake_up_interruptible_all(&u->peer_wait);
450         sk->sk_max_ack_backlog  = backlog;
451         sk->sk_state            = TCP_LISTEN;
452         /* set credentials so connect can copy them */
453         sk->sk_peercred.pid     = current->tgid;
454         sk->sk_peercred.uid     = current->euid;
455         sk->sk_peercred.gid     = current->egid;
456         err = 0;
457
458 out_unlock:
459         unix_state_wunlock(sk);
460 out:
461         return err;
462 }
463
464 static int unix_release(struct socket *);
465 static int unix_bind(struct socket *, struct sockaddr *, int);
466 static int unix_stream_connect(struct socket *, struct sockaddr *,
467                                int addr_len, int flags);
468 static int unix_socketpair(struct socket *, struct socket *);
469 static int unix_accept(struct socket *, struct socket *, int);
470 static int unix_getname(struct socket *, struct sockaddr *, int *, int);
471 static unsigned int unix_poll(struct file *, struct socket *, poll_table *);
472 static int unix_ioctl(struct socket *, unsigned int, unsigned long);
473 static int unix_shutdown(struct socket *, int);
474 static int unix_stream_sendmsg(struct kiocb *, struct socket *,
475                                struct msghdr *, size_t);
476 static int unix_stream_recvmsg(struct kiocb *, struct socket *,
477                                struct msghdr *, size_t, int);
478 static int unix_dgram_sendmsg(struct kiocb *, struct socket *,
479                               struct msghdr *, size_t);
480 static int unix_dgram_recvmsg(struct kiocb *, struct socket *,
481                               struct msghdr *, size_t, int);
482 static int unix_dgram_connect(struct socket *, struct sockaddr *,
483                               int, int);
484
485 static struct proto_ops unix_stream_ops = {
486         .family =       PF_UNIX,
487         .owner =        THIS_MODULE,
488         .release =      unix_release,
489         .bind =         unix_bind,
490         .connect =      unix_stream_connect,
491         .socketpair =   unix_socketpair,
492         .accept =       unix_accept,
493         .getname =      unix_getname,
494         .poll =         unix_poll,
495         .ioctl =        unix_ioctl,
496         .listen =       unix_listen,
497         .shutdown =     unix_shutdown,
498         .setsockopt =   sock_no_setsockopt,
499         .getsockopt =   sock_no_getsockopt,
500         .sendmsg =      unix_stream_sendmsg,
501         .recvmsg =      unix_stream_recvmsg,
502         .mmap =         sock_no_mmap,
503         .sendpage =     sock_no_sendpage,
504 };
505
506 static struct proto_ops unix_dgram_ops = {
507         .family =       PF_UNIX,
508         .owner =        THIS_MODULE,
509         .release =      unix_release,
510         .bind =         unix_bind,
511         .connect =      unix_dgram_connect,
512         .socketpair =   unix_socketpair,
513         .accept =       sock_no_accept,
514         .getname =      unix_getname,
515         .poll =         datagram_poll,
516         .ioctl =        unix_ioctl,
517         .listen =       sock_no_listen,
518         .shutdown =     unix_shutdown,
519         .setsockopt =   sock_no_setsockopt,
520         .getsockopt =   sock_no_getsockopt,
521         .sendmsg =      unix_dgram_sendmsg,
522         .recvmsg =      unix_dgram_recvmsg,
523         .mmap =         sock_no_mmap,
524         .sendpage =     sock_no_sendpage,
525 };
526
527 static struct proto_ops unix_seqpacket_ops = {
528         .family =       PF_UNIX,
529         .owner =        THIS_MODULE,
530         .release =      unix_release,
531         .bind =         unix_bind,
532         .connect =      unix_stream_connect,
533         .socketpair =   unix_socketpair,
534         .accept =       unix_accept,
535         .getname =      unix_getname,
536         .poll =         datagram_poll,
537         .ioctl =        unix_ioctl,
538         .listen =       unix_listen,
539         .shutdown =     unix_shutdown,
540         .setsockopt =   sock_no_setsockopt,
541         .getsockopt =   sock_no_getsockopt,
542         .sendmsg =      unix_dgram_sendmsg,
543         .recvmsg =      unix_dgram_recvmsg,
544         .mmap =         sock_no_mmap,
545         .sendpage =     sock_no_sendpage,
546 };
547
548 static struct sock * unix_create1(struct socket *sock)
549 {
550         struct sock *sk = NULL;
551         struct unix_sock *u;
552
553         if (atomic_read(&unix_nr_socks) >= 2*files_stat.max_files)
554                 goto out;
555
556         sk = sk_alloc(PF_UNIX, GFP_KERNEL, sizeof(struct unix_sock),
557                       unix_sk_cachep);
558         if (!sk)
559                 goto out;
560
561         atomic_inc(&unix_nr_socks);
562
563         sock_init_data(sock,sk);
564         sk_set_owner(sk, THIS_MODULE);
565
566         set_vx_info(&sk->sk_vx_info, current->vx_info);
567         set_nx_info(&sk->sk_nx_info, current->nx_info);
568         sk->sk_xid = vx_current_xid();
569
570         sk->sk_write_space      = unix_write_space;
571         sk->sk_max_ack_backlog  = sysctl_unix_max_dgram_qlen;
572         sk->sk_destruct         = unix_sock_destructor;
573         u         = unix_sk(sk);
574         u->dentry = NULL;
575         u->mnt    = NULL;
576         rwlock_init(&u->lock);
577         atomic_set(&u->inflight, sock ? 0 : -1);
578         init_MUTEX(&u->readsem); /* single task reading lock */
579         init_waitqueue_head(&u->peer_wait);
580         unix_insert_socket(unix_sockets_unbound, sk);
581 out:
582         return sk;
583 }
584
585 static int unix_create(struct socket *sock, int protocol)
586 {
587         if (protocol && protocol != PF_UNIX)
588                 return -EPROTONOSUPPORT;
589
590         sock->state = SS_UNCONNECTED;
591
592         switch (sock->type) {
593         case SOCK_STREAM:
594                 sock->ops = &unix_stream_ops;
595                 break;
596                 /*
597                  *      Believe it or not BSD has AF_UNIX, SOCK_RAW though
598                  *      nothing uses it.
599                  */
600         case SOCK_RAW:
601                 sock->type=SOCK_DGRAM;
602         case SOCK_DGRAM:
603                 sock->ops = &unix_dgram_ops;
604                 break;
605         case SOCK_SEQPACKET:
606                 sock->ops = &unix_seqpacket_ops;
607                 break;
608         default:
609                 return -ESOCKTNOSUPPORT;
610         }
611
612         return unix_create1(sock) ? 0 : -ENOMEM;
613 }
614
615 static int unix_release(struct socket *sock)
616 {
617         struct sock *sk = sock->sk;
618
619         if (!sk)
620                 return 0;
621
622         sock->sk = NULL;
623
624         return unix_release_sock (sk, 0);
625 }
626
627 static int unix_autobind(struct socket *sock)
628 {
629         struct sock *sk = sock->sk;
630         struct unix_sock *u = unix_sk(sk);
631         static u32 ordernum = 1;
632         struct unix_address * addr;
633         int err;
634
635         down(&u->readsem);
636
637         err = 0;
638         if (u->addr)
639                 goto out;
640
641         err = -ENOMEM;
642         addr = kmalloc(sizeof(*addr) + sizeof(short) + 16, GFP_KERNEL);
643         if (!addr)
644                 goto out;
645
646         memset(addr, 0, sizeof(*addr) + sizeof(short) + 16);
647         addr->name->sun_family = AF_UNIX;
648         atomic_set(&addr->refcnt, 1);
649
650 retry:
651         addr->len = sprintf(addr->name->sun_path+1, "%05x", ordernum) + 1 + sizeof(short);
652         addr->hash = unix_hash_fold(csum_partial((void*)addr->name, addr->len, 0));
653
654         write_lock(&unix_table_lock);
655         ordernum = (ordernum+1)&0xFFFFF;
656
657         if (__unix_find_socket_byname(addr->name, addr->len, sock->type,
658                                       addr->hash)) {
659                 write_unlock(&unix_table_lock);
660                 /* Sanity yield. It is unusual case, but yet... */
661                 if (!(ordernum&0xFF))
662                         yield();
663                 goto retry;
664         }
665         addr->hash ^= sk->sk_type;
666
667         __unix_remove_socket(sk);
668         u->addr = addr;
669         __unix_insert_socket(&unix_socket_table[addr->hash], sk);
670         write_unlock(&unix_table_lock);
671         err = 0;
672
673 out:    up(&u->readsem);
674         return err;
675 }
676
/*
 * Locate the socket addressed by @sunname.
 *
 * Filesystem names are resolved through the VFS: the caller needs write
 * permission on the inode, which must be a socket with a unix socket
 * bound to it.  Abstract names are looked up in the name hash.
 *
 * Returns the socket with a reference held.  On failure NULL is
 * returned and *error is set: the path lookup or permission error,
 * -ECONNREFUSED when nothing is bound there, or -EPROTOTYPE when the
 * socket found through the filesystem has a different type than @type.
 * *error is not written on success.
 */
static struct sock *unix_find_other(struct sockaddr_un *sunname, int len,
				    int type, unsigned hash, int *error)
{
	struct sock *u;
	struct nameidata nd;
	int err = 0;

	if (sunname->sun_path[0]) {
		err = path_lookup(sunname->sun_path, LOOKUP_FOLLOW, &nd);
		if (err)
			goto fail;

		err = permission(nd.dentry->d_inode, MAY_WRITE, &nd);
		if (err)
			goto put_fail;

		err = -ECONNREFUSED;
		if (!S_ISSOCK(nd.dentry->d_inode->i_mode))
			goto put_fail;

		u = unix_find_socket_byinode(nd.dentry->d_inode);
		if (u == NULL)
			goto put_fail;

		if (u->sk_type != type) {
			path_release(&nd);
			sock_put(u);
			err = -EPROTOTYPE;
			goto fail;
		}

		touch_atime(nd.mnt, nd.dentry);
		path_release(&nd);
	} else {
		struct dentry *dentry;

		err = -ECONNREFUSED;
		u = unix_find_socket_byname(sunname, len, type, hash);
		if (u == NULL)
			goto fail;

		dentry = unix_sk(u)->dentry;
		if (dentry)
			touch_atime(unix_sk(u)->mnt, dentry);
	}
	return u;

put_fail:
	path_release(&nd);
fail:
	*error = err;
	return NULL;
}
728
729
730 static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
731 {
732         struct sock *sk = sock->sk;
733         struct unix_sock *u = unix_sk(sk);
734         struct sockaddr_un *sunaddr=(struct sockaddr_un *)uaddr;
735         struct dentry * dentry = NULL;
736         struct nameidata nd;
737         int err;
738         unsigned hash;
739         struct unix_address *addr;
740         struct hlist_head *list;
741
742         err = -EINVAL;
743         if (sunaddr->sun_family != AF_UNIX)
744                 goto out;
745
746         if (addr_len==sizeof(short)) {
747                 err = unix_autobind(sock);
748                 goto out;
749         }
750
751         err = unix_mkname(sunaddr, addr_len, &hash);
752         if (err < 0)
753                 goto out;
754         addr_len = err;
755
756         down(&u->readsem);
757
758         err = -EINVAL;
759         if (u->addr)
760                 goto out_up;
761
762         err = -ENOMEM;
763         addr = kmalloc(sizeof(*addr)+addr_len, GFP_KERNEL);
764         if (!addr)
765                 goto out_up;
766
767         memcpy(addr->name, sunaddr, addr_len);
768         addr->len = addr_len;
769         addr->hash = hash ^ sk->sk_type;
770         atomic_set(&addr->refcnt, 1);
771
772         if (sunaddr->sun_path[0]) {
773                 unsigned int mode;
774                 err = 0;
775                 /*
776                  * Get the parent directory, calculate the hash for last
777                  * component.
778                  */
779                 err = path_lookup(sunaddr->sun_path, LOOKUP_PARENT, &nd);
780                 if (err)
781                         goto out_mknod_parent;
782                 /*
783                  * Yucky last component or no last component at all?
784                  * (foo/., foo/.., /////)
785                  */
786                 err = -EEXIST;
787                 if (nd.last_type != LAST_NORM)
788                         goto out_mknod;
789                 /*
790                  * Lock the directory.
791                  */
792                 down(&nd.dentry->d_inode->i_sem);
793                 /*
794                  * Do the final lookup.
795                  */
796                 dentry = lookup_hash(&nd.last, nd.dentry);
797                 err = PTR_ERR(dentry);
798                 if (IS_ERR(dentry))
799                         goto out_mknod_unlock;
800                 err = -ENOENT;
801                 /*
802                  * Special case - lookup gave negative, but... we had foo/bar/
803                  * From the vfs_mknod() POV we just have a negative dentry -
804                  * all is fine. Let's be bastards - you had / on the end, you've
805                  * been asking for (non-existent) directory. -ENOENT for you.
806                  */
807                 if (nd.last.name[nd.last.len] && !dentry->d_inode)
808                         goto out_mknod_dput;
809                 /*
810                  * All right, let's create it.
811                  */
812                 mode = S_IFSOCK |
813                        (SOCK_INODE(sock)->i_mode & ~current->fs->umask);
814                 err = vfs_mknod(nd.dentry->d_inode, dentry, mode, 0);
815                 if (err)
816                         goto out_mknod_dput;
817                 up(&nd.dentry->d_inode->i_sem);
818                 dput(nd.dentry);
819                 nd.dentry = dentry;
820
821                 addr->hash = UNIX_HASH_SIZE;
822         }
823
824         write_lock(&unix_table_lock);
825
826         if (!sunaddr->sun_path[0]) {
827                 err = -EADDRINUSE;
828                 if (__unix_find_socket_byname(sunaddr, addr_len,
829                                               sk->sk_type, hash)) {
830                         unix_release_addr(addr);
831                         goto out_unlock;
832                 }
833
834                 list = &unix_socket_table[addr->hash];
835         } else {
836                 list = &unix_socket_table[dentry->d_inode->i_ino & (UNIX_HASH_SIZE-1)];
837                 u->dentry = nd.dentry;
838                 u->mnt    = nd.mnt;
839         }
840
841         err = 0;
842         __unix_remove_socket(sk);
843         u->addr = addr;
844         __unix_insert_socket(list, sk);
845
846 out_unlock:
847         write_unlock(&unix_table_lock);
848 out_up:
849         up(&u->readsem);
850 out:
851         return err;
852
853 out_mknod_dput:
854         dput(dentry);
855 out_mknod_unlock:
856         up(&nd.dentry->d_inode->i_sem);
857 out_mknod:
858         path_release(&nd);
859 out_mknod_parent:
860         if (err==-EEXIST)
861                 err=-EADDRINUSE;
862         unix_release_addr(addr);
863         goto out_up;
864 }
865
866 static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
867                               int alen, int flags)
868 {
869         struct sock *sk = sock->sk;
870         struct sockaddr_un *sunaddr=(struct sockaddr_un*)addr;
871         struct sock *other;
872         unsigned hash;
873         int err;
874
875         if (addr->sa_family != AF_UNSPEC) {
876                 err = unix_mkname(sunaddr, alen, &hash);
877                 if (err < 0)
878                         goto out;
879                 alen = err;
880
881                 if (test_bit(SOCK_PASS_CRED, &sock->flags) && !unix_sk(sk)->addr &&
882                     (err = unix_autobind(sock)) != 0)
883                         goto out;
884
885                 other=unix_find_other(sunaddr, alen, sock->type, hash, &err);
886                 if (!other)
887                         goto out;
888
889                 unix_state_wlock(sk);
890
891                 err = -EPERM;
892                 if (!unix_may_send(sk, other))
893                         goto out_unlock;
894
895                 err = security_unix_may_send(sk->sk_socket, other->sk_socket);
896                 if (err)
897                         goto out_unlock;
898
899         } else {
900                 /*
901                  *      1003.1g breaking connected state with AF_UNSPEC
902                  */
903                 other = NULL;
904                 unix_state_wlock(sk);
905         }
906
907         /*
908          * If it was connected, reconnect.
909          */
910         if (unix_peer(sk)) {
911                 struct sock *old_peer = unix_peer(sk);
912                 unix_peer(sk)=other;
913                 unix_state_wunlock(sk);
914
915                 if (other != old_peer)
916                         unix_dgram_disconnected(sk, old_peer);
917                 sock_put(old_peer);
918         } else {
919                 unix_peer(sk)=other;
920                 unix_state_wunlock(sk);
921         }
922         return 0;
923
924 out_unlock:
925         unix_state_wunlock(sk);
926         sock_put(other);
927 out:
928         return err;
929 }
930
931 static long unix_wait_for_peer(struct sock *other, long timeo)
932 {
933         struct unix_sock *u = unix_sk(other);
934         int sched;
935         DEFINE_WAIT(wait);
936
937         prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE);
938
939         sched = !sock_flag(other, SOCK_DEAD) &&
940                 !(other->sk_shutdown & RCV_SHUTDOWN) &&
941                 (skb_queue_len(&other->sk_receive_queue) >
942                  other->sk_max_ack_backlog);
943
944         unix_state_runlock(other);
945
946         if (sched)
947                 timeo = schedule_timeout(timeo);
948
949         finish_wait(&u->peer_wait, &wait);
950         return timeo;
951 }
952
/*
 *	Connect a stream socket to the listening socket named by @uaddr.
 *
 *	A fresh sock (newsk) is created to act as the far end of the
 *	connection and is handed to the listener by queueing an skb owned by
 *	newsk on the listener's receive queue.
 *
 *	Returns 0 on success or a negative errno; on failure the preallocated
 *	skb and newsk are released again.
 */
static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
			       int addr_len, int flags)
{
	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
	struct sock *sk = sock->sk;
	struct unix_sock *u = unix_sk(sk);
	struct unix_sock *new_u, *listener_u;
	struct sock *newsk = NULL;
	struct sock *listener = NULL;
	struct sk_buff *skb = NULL;
	unsigned hash;
	int state;
	int err;
	long timeo;

	err = unix_mkname(sunaddr, addr_len, &hash);
	if (err < 0)
		goto out;
	addr_len = err;

	/* SOCK_PASS_CRED set and no address yet: autobind first. */
	if (test_bit(SOCK_PASS_CRED, &sock->flags) && !u->addr) {
		err = unix_autobind(sock);
		if (err != 0)
			goto out;
	}

	timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);

	/*
	 * Allocate all resources up front.  If we did it after the state
	 * is locked we would have to recheck everything again anyway.
	 */
	err = -ENOMEM;

	/* The new sock for the completed connection. */
	newsk = unix_create1(NULL);
	if (!newsk)
		goto out;

	/* The skb that gets sent to the listening sock. */
	skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
	if (!skb)
		goto out;

restart:
	/* Find the listening sock. */
	listener = unix_find_other(sunaddr, addr_len, sk->sk_type, hash, &err);
	if (!listener)
		goto out;

	/* Latch the state of the peer. */
	unix_state_rlock(listener);

	/* Apparently the VFS overslept the socket's death.  Retry. */
	if (sock_flag(listener, SOCK_DEAD)) {
		unix_state_runlock(listener);
		sock_put(listener);
		goto restart;
	}

	err = -ECONNREFUSED;
	if (listener->sk_state != TCP_LISTEN)
		goto out_unlock;

	if (skb_queue_len(&listener->sk_receive_queue) >
	    listener->sk_max_ack_backlog) {
		err = -EAGAIN;
		if (!timeo)
			goto out_unlock;

		/* This releases the read lock taken on the listener above. */
		timeo = unix_wait_for_peer(listener, timeo);

		err = sock_intr_errno(timeo);
		if (signal_pending(current))
			goto out;
		sock_put(listener);
		goto restart;
	}

	/*
	 * Latch our own state.
	 *
	 * This is a tricky place.  We need to grab the write lock on sk and
	 * cannot drop the lock on the peer, which is dangerous because a
	 * deadlock is possible.  The connect-to-self case and simultaneous
	 * connect attempts are eliminated by checking the socket state:
	 * the peer is TCP_LISTEN, and if sk is TCP_LISTEN too we notice it
	 * here, before attempting to grab the lock.
	 *
	 * The state has to be rechecked once the socket is locked.
	 */
	state = sk->sk_state;
	if (state != TCP_CLOSE) {
		/* Already connected, or in some other unusable state. */
		err = (state == TCP_ESTABLISHED) ? -EISCONN : -EINVAL;
		goto out_unlock;
	}

	unix_state_wlock(sk);

	if (sk->sk_state != state) {
		unix_state_wunlock(sk);
		unix_state_runlock(listener);
		sock_put(listener);
		goto restart;
	}

	err = security_unix_stream_connect(sock, listener->sk_socket, newsk);
	if (err) {
		unix_state_wunlock(sk);
		goto out_unlock;
	}

	/* The way is open!  Quickly set up all the necessary fields... */

	sock_hold(sk);
	unix_peer(newsk)	= sk;
	newsk->sk_state		= TCP_ESTABLISHED;
	newsk->sk_type		= sk->sk_type;
	newsk->sk_peercred.pid	= current->tgid;
	newsk->sk_peercred.uid	= current->euid;
	newsk->sk_peercred.gid	= current->egid;
	new_u = unix_sk(newsk);
	newsk->sk_sleep		= &new_u->peer_wait;
	listener_u = unix_sk(listener);

	/* Copy the address information from the listening to the new sock. */
	if (listener_u->addr) {
		atomic_inc(&listener_u->addr->refcnt);
		new_u->addr = listener_u->addr;
	}
	if (listener_u->dentry) {
		new_u->dentry	= dget(listener_u->dentry);
		new_u->mnt	= mntget(listener_u->mnt);
	}

	/* Set credentials. */
	sk->sk_peercred = listener->sk_peercred;

	sock_hold(newsk);
	unix_peer(sk)	= newsk;
	sock->state	= SS_CONNECTED;
	sk->sk_state	= TCP_ESTABLISHED;

	unix_state_wunlock(sk);

	/* Take the queue lock and send the info to the listening sock. */
	spin_lock(&listener->sk_receive_queue.lock);
	__skb_queue_tail(&listener->sk_receive_queue, skb);
	/*
	 * Undo the artificially decreased inflight count now that the
	 * embryo is installed on the listening socket.
	 */
	atomic_inc(&new_u->inflight);
	spin_unlock(&listener->sk_receive_queue.lock);
	unix_state_runlock(listener);
	listener->sk_data_ready(listener, 0);
	sock_put(listener);
	return 0;

out_unlock:
	if (listener)
		unix_state_runlock(listener);

out:
	if (skb)
		kfree_skb(skb);
	if (newsk)
		unix_release_sock(newsk, 0);
	if (listener)
		sock_put(listener);
	return err;
}
1129
1130 static int unix_socketpair(struct socket *socka, struct socket *sockb)
1131 {
1132         struct sock *ska=socka->sk, *skb = sockb->sk;
1133
1134         /* Join our sockets back to back */
1135         sock_hold(ska);
1136         sock_hold(skb);
1137         unix_peer(ska)=skb;
1138         unix_peer(skb)=ska;
1139         ska->sk_peercred.pid = skb->sk_peercred.pid = current->tgid;
1140         ska->sk_peercred.uid = skb->sk_peercred.uid = current->euid;
1141         ska->sk_peercred.gid = skb->sk_peercred.gid = current->egid;
1142
1143         if (ska->sk_type != SOCK_DGRAM) {
1144                 ska->sk_state = TCP_ESTABLISHED;
1145                 skb->sk_state = TCP_ESTABLISHED;
1146                 socka->state  = SS_CONNECTED;
1147                 sockb->state  = SS_CONNECTED;
1148         }
1149         return 0;
1150 }
1151
1152 static int unix_accept(struct socket *sock, struct socket *newsock, int flags)
1153 {
1154         struct sock *sk = sock->sk;
1155         struct sock *tsk;
1156         struct sk_buff *skb;
1157         int err;
1158
1159         err = -EOPNOTSUPP;
1160         if (sock->type!=SOCK_STREAM && sock->type!=SOCK_SEQPACKET)
1161                 goto out;
1162
1163         err = -EINVAL;
1164         if (sk->sk_state != TCP_LISTEN)
1165                 goto out;
1166
1167         /* If socket state is TCP_LISTEN it cannot change (for now...),
1168          * so that no locks are necessary.
1169          */
1170
1171         skb = skb_recv_datagram(sk, 0, flags&O_NONBLOCK, &err);
1172         if (!skb) {
1173                 /* This means receive shutdown. */
1174                 if (err == 0)
1175                         err = -EINVAL;
1176                 goto out;
1177         }
1178
1179         tsk = skb->sk;
1180         skb_free_datagram(sk, skb);
1181         wake_up_interruptible(&unix_sk(sk)->peer_wait);
1182
1183         /* attach accepted sock to socket */
1184         unix_state_wlock(tsk);
1185         newsock->state = SS_CONNECTED;
1186         sock_graft(tsk, newsock);
1187         unix_state_wunlock(tsk);
1188         return 0;
1189
1190 out:
1191         return err;
1192 }
1193
1194
1195 static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int *uaddr_len, int peer)
1196 {
1197         struct sock *sk = sock->sk;
1198         struct unix_sock *u;
1199         struct sockaddr_un *sunaddr=(struct sockaddr_un *)uaddr;
1200         int err = 0;
1201
1202         if (peer) {
1203                 sk = unix_peer_get(sk);
1204
1205                 err = -ENOTCONN;
1206                 if (!sk)
1207                         goto out;
1208                 err = 0;
1209         } else {
1210                 sock_hold(sk);
1211         }
1212
1213         u = unix_sk(sk);
1214         unix_state_rlock(sk);
1215         if (!u->addr) {
1216                 sunaddr->sun_family = AF_UNIX;
1217                 sunaddr->sun_path[0] = 0;
1218                 *uaddr_len = sizeof(short);
1219         } else {
1220                 struct unix_address *addr = u->addr;
1221
1222                 *uaddr_len = addr->len;
1223                 memcpy(sunaddr, addr->name, *uaddr_len);
1224         }
1225         unix_state_runlock(sk);
1226         sock_put(sk);
1227 out:
1228         return err;
1229 }
1230
1231 static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb)
1232 {
1233         int i;
1234
1235         scm->fp = UNIXCB(skb).fp;
1236         skb->destructor = sock_wfree;
1237         UNIXCB(skb).fp = NULL;
1238
1239         for (i=scm->fp->count-1; i>=0; i--)
1240                 unix_notinflight(scm->fp->fp[i]);
1241 }
1242
1243 static void unix_destruct_fds(struct sk_buff *skb)
1244 {
1245         struct scm_cookie scm;
1246         memset(&scm, 0, sizeof(scm));
1247         unix_detach_fds(&scm, skb);
1248
1249         /* Alas, it calls VFS */
1250         /* So fscking what? fput() had been SMP-safe since the last Summer */
1251         scm_destroy(&scm);
1252         sock_wfree(skb);
1253 }
1254
1255 static void unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
1256 {
1257         int i;
1258         for (i=scm->fp->count-1; i>=0; i--)
1259                 unix_inflight(scm->fp->fp[i]);
1260         UNIXCB(skb).fp = scm->fp;
1261         skb->destructor = unix_destruct_fds;
1262         scm->fp = NULL;
1263 }
1264
1265 /*
1266  *      Send AF_UNIX data.
1267  */
1268
1269 static int unix_dgram_sendmsg(struct kiocb *kiocb, struct socket *sock,
1270                               struct msghdr *msg, size_t len)
1271 {
1272         struct sock_iocb *siocb = kiocb_to_siocb(kiocb);
1273         struct sock *sk = sock->sk;
1274         struct unix_sock *u = unix_sk(sk);
1275         struct sockaddr_un *sunaddr=msg->msg_name;
1276         struct sock *other = NULL;
1277         int namelen = 0; /* fake GCC */
1278         int err;
1279         unsigned hash;
1280         struct sk_buff *skb;
1281         long timeo;
1282         struct scm_cookie tmp_scm;
1283
1284         if (NULL == siocb->scm)
1285                 siocb->scm = &tmp_scm;
1286         err = scm_send(sock, msg, siocb->scm);
1287         if (err < 0)
1288                 return err;
1289
1290         err = -EOPNOTSUPP;
1291         if (msg->msg_flags&MSG_OOB)
1292                 goto out;
1293
1294         if (msg->msg_namelen) {
1295                 err = unix_mkname(sunaddr, msg->msg_namelen, &hash);
1296                 if (err < 0)
1297                         goto out;
1298                 namelen = err;
1299         } else {
1300                 sunaddr = NULL;
1301                 err = -ENOTCONN;
1302                 other = unix_peer_get(sk);
1303                 if (!other)
1304                         goto out;
1305         }
1306
1307         if (test_bit(SOCK_PASS_CRED, &sock->flags)
1308                 && !u->addr && (err = unix_autobind(sock)) != 0)
1309                 goto out;
1310
1311         err = -EMSGSIZE;
1312         if (len > sk->sk_sndbuf - 32)
1313                 goto out;
1314
1315         skb = sock_alloc_send_skb(sk, len, msg->msg_flags&MSG_DONTWAIT, &err);
1316         if (skb==NULL)
1317                 goto out;
1318
1319         memcpy(UNIXCREDS(skb), &siocb->scm->creds, sizeof(struct ucred));
1320         if (siocb->scm->fp)
1321                 unix_attach_fds(siocb->scm, skb);
1322
1323         skb->h.raw = skb->data;
1324         err = memcpy_fromiovec(skb_put(skb,len), msg->msg_iov, len);
1325         if (err)
1326                 goto out_free;
1327
1328         timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
1329
1330 restart:
1331         if (!other) {
1332                 err = -ECONNRESET;
1333                 if (sunaddr == NULL)
1334                         goto out_free;
1335
1336                 other = unix_find_other(sunaddr, namelen, sk->sk_type,
1337                                         hash, &err);
1338                 if (other==NULL)
1339                         goto out_free;
1340         }
1341
1342         unix_state_rlock(other);
1343         err = -EPERM;
1344         if (!unix_may_send(sk, other))
1345                 goto out_unlock;
1346
1347         if (sock_flag(other, SOCK_DEAD)) {
1348                 /*
1349                  *      Check with 1003.1g - what should
1350                  *      datagram error
1351                  */
1352                 unix_state_runlock(other);
1353                 sock_put(other);
1354
1355                 err = 0;
1356                 unix_state_wlock(sk);
1357                 if (unix_peer(sk) == other) {
1358                         unix_peer(sk)=NULL;
1359                         unix_state_wunlock(sk);
1360
1361                         unix_dgram_disconnected(sk, other);
1362                         sock_put(other);
1363                         err = -ECONNREFUSED;
1364                 } else {
1365                         unix_state_wunlock(sk);
1366                 }
1367
1368                 other = NULL;
1369                 if (err)
1370                         goto out_free;
1371                 goto restart;
1372         }
1373
1374         err = -EPIPE;
1375         if (other->sk_shutdown & RCV_SHUTDOWN)
1376                 goto out_unlock;
1377
1378         err = security_unix_may_send(sk->sk_socket, other->sk_socket);
1379         if (err)
1380                 goto out_unlock;
1381
1382         if (unix_peer(other) != sk &&
1383             (skb_queue_len(&other->sk_receive_queue) >
1384              other->sk_max_ack_backlog)) {
1385                 if (!timeo) {
1386                         err = -EAGAIN;
1387                         goto out_unlock;
1388                 }
1389
1390                 timeo = unix_wait_for_peer(other, timeo);
1391
1392                 err = sock_intr_errno(timeo);
1393                 if (signal_pending(current))
1394                         goto out_free;
1395
1396                 goto restart;
1397         }
1398
1399         skb_queue_tail(&other->sk_receive_queue, skb);
1400         unix_state_runlock(other);
1401         other->sk_data_ready(other, len);
1402         sock_put(other);
1403         scm_destroy(siocb->scm);
1404         return len;
1405
1406 out_unlock:
1407         unix_state_runlock(other);
1408 out_free:
1409         kfree_skb(skb);
1410 out:
1411         if (other)
1412                 sock_put(other);
1413         scm_destroy(siocb->scm);
1414         return err;
1415 }
1416
1417                 
1418 static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock,
1419                                struct msghdr *msg, size_t len)
1420 {
1421         struct sock_iocb *siocb = kiocb_to_siocb(kiocb);
1422         struct sock *sk = sock->sk;
1423         struct sock *other = NULL;
1424         struct sockaddr_un *sunaddr=msg->msg_name;
1425         int err,size;
1426         struct sk_buff *skb;
1427         int sent=0;
1428         struct scm_cookie tmp_scm;
1429
1430         if (NULL == siocb->scm)
1431                 siocb->scm = &tmp_scm;
1432         err = scm_send(sock, msg, siocb->scm);
1433         if (err < 0)
1434                 return err;
1435
1436         err = -EOPNOTSUPP;
1437         if (msg->msg_flags&MSG_OOB)
1438                 goto out_err;
1439
1440         if (msg->msg_namelen) {
1441                 err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
1442                 goto out_err;
1443         } else {
1444                 sunaddr = NULL;
1445                 err = -ENOTCONN;
1446                 other = unix_peer_get(sk);
1447                 if (!other)
1448                         goto out_err;
1449         }
1450
1451         if (sk->sk_shutdown & SEND_SHUTDOWN)
1452                 goto pipe_err;
1453
1454         while(sent < len)
1455         {
1456                 /*
1457                  *      Optimisation for the fact that under 0.01% of X messages typically
1458                  *      need breaking up.
1459                  */
1460
1461                 size=len-sent;
1462
1463                 /* Keep two messages in the pipe so it schedules better */
1464                 if (size > sk->sk_sndbuf / 2 - 64)
1465                         size = sk->sk_sndbuf / 2 - 64;
1466
1467                 if (size > SKB_MAX_ALLOC)
1468                         size = SKB_MAX_ALLOC;
1469                         
1470                 /*
1471                  *      Grab a buffer
1472                  */
1473                  
1474                 skb=sock_alloc_send_skb(sk,size,msg->msg_flags&MSG_DONTWAIT, &err);
1475
1476                 if (skb==NULL)
1477                         goto out_err;
1478
1479                 /*
1480                  *      If you pass two values to the sock_alloc_send_skb
1481                  *      it tries to grab the large buffer with GFP_NOFS
1482                  *      (which can fail easily), and if it fails grab the
1483                  *      fallback size buffer which is under a page and will
1484                  *      succeed. [Alan]
1485                  */
1486                 size = min_t(int, size, skb_tailroom(skb));
1487
1488                 memcpy(UNIXCREDS(skb), &siocb->scm->creds, sizeof(struct ucred));
1489                 if (siocb->scm->fp)
1490                         unix_attach_fds(siocb->scm, skb);
1491
1492                 if ((err = memcpy_fromiovec(skb_put(skb,size), msg->msg_iov, size)) != 0) {
1493                         kfree_skb(skb);
1494                         goto out_err;
1495                 }
1496
1497                 unix_state_rlock(other);
1498
1499                 if (sock_flag(other, SOCK_DEAD) ||
1500                     (other->sk_shutdown & RCV_SHUTDOWN))
1501                         goto pipe_err_free;
1502
1503                 skb_queue_tail(&other->sk_receive_queue, skb);
1504                 unix_state_runlock(other);
1505                 other->sk_data_ready(other, size);
1506                 sent+=size;
1507         }
1508         sock_put(other);
1509
1510         scm_destroy(siocb->scm);
1511         siocb->scm = NULL;
1512
1513         return sent;
1514
1515 pipe_err_free:
1516         unix_state_runlock(other);
1517         kfree_skb(skb);
1518 pipe_err:
1519         if (sent==0 && !(msg->msg_flags&MSG_NOSIGNAL))
1520                 send_sig(SIGPIPE,current,0);
1521         err = -EPIPE;
1522 out_err:
1523         if (other)
1524                 sock_put(other);
1525         scm_destroy(siocb->scm);
1526         siocb->scm = NULL;
1527         return sent ? : err;
1528 }
1529
1530 static void unix_copy_addr(struct msghdr *msg, struct sock *sk)
1531 {
1532         struct unix_sock *u = unix_sk(sk);
1533
1534         msg->msg_namelen = 0;
1535         if (u->addr) {
1536                 msg->msg_namelen = u->addr->len;
1537                 memcpy(msg->msg_name, u->addr->name, u->addr->len);
1538         }
1539 }
1540
1541 static int unix_dgram_recvmsg(struct kiocb *iocb, struct socket *sock,
1542                               struct msghdr *msg, size_t size,
1543                               int flags)
1544 {
1545         struct sock_iocb *siocb = kiocb_to_siocb(iocb);
1546         struct scm_cookie tmp_scm;
1547         struct sock *sk = sock->sk;
1548         struct unix_sock *u = unix_sk(sk);
1549         int noblock = flags & MSG_DONTWAIT;
1550         struct sk_buff *skb;
1551         int err;
1552
1553         err = -EOPNOTSUPP;
1554         if (flags&MSG_OOB)
1555                 goto out;
1556
1557         msg->msg_namelen = 0;
1558
1559         skb = skb_recv_datagram(sk, flags, noblock, &err);
1560         if (!skb)
1561                 goto out;
1562
1563         wake_up_interruptible(&u->peer_wait);
1564
1565         if (msg->msg_name)
1566                 unix_copy_addr(msg, skb->sk);
1567
1568         if (size > skb->len)
1569                 size = skb->len;
1570         else if (size < skb->len)
1571                 msg->msg_flags |= MSG_TRUNC;
1572
1573         err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, size);
1574         if (err)
1575                 goto out_free;
1576
1577         if (!siocb->scm) {
1578                 siocb->scm = &tmp_scm;
1579                 memset(&tmp_scm, 0, sizeof(tmp_scm));
1580         }
1581         siocb->scm->creds = *UNIXCREDS(skb);
1582
1583         if (!(flags & MSG_PEEK))
1584         {
1585                 if (UNIXCB(skb).fp)
1586                         unix_detach_fds(siocb->scm, skb);
1587         }
1588         else 
1589         {
1590                 /* It is questionable: on PEEK we could:
1591                    - do not return fds - good, but too simple 8)
1592                    - return fds, and do not return them on read (old strategy,
1593                      apparently wrong)
1594                    - clone fds (I chose it for now, it is the most universal
1595                      solution)
1596                 
1597                    POSIX 1003.1g does not actually define this clearly
1598                    at all. POSIX 1003.1g doesn't define a lot of things
1599                    clearly however!                  
1600                    
1601                 */
1602                 if (UNIXCB(skb).fp)
1603                         siocb->scm->fp = scm_fp_dup(UNIXCB(skb).fp);
1604         }
1605         err = size;
1606
1607         scm_recv(sock, msg, siocb->scm, flags);
1608
1609 out_free:
1610         skb_free_datagram(sk,skb);
1611 out:
1612         return err;
1613 }
1614
/*
 *	Sleep until data has arrived, but check for races.
 */
1618  
1619 static long unix_stream_data_wait(struct sock * sk, long timeo)
1620 {
1621         DEFINE_WAIT(wait);
1622
1623         unix_state_rlock(sk);
1624
1625         for (;;) {
1626                 prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
1627
1628                 if (skb_queue_len(&sk->sk_receive_queue) ||
1629                     sk->sk_err ||
1630                     (sk->sk_shutdown & RCV_SHUTDOWN) ||
1631                     signal_pending(current) ||
1632                     !timeo)
1633                         break;
1634
1635                 set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1636                 unix_state_runlock(sk);
1637                 timeo = schedule_timeout(timeo);
1638                 unix_state_rlock(sk);
1639                 clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1640         }
1641
1642         finish_wait(sk->sk_sleep, &wait);
1643         unix_state_runlock(sk);
1644         return timeo;
1645 }
1646
1647
1648
1649 static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock,
1650                                struct msghdr *msg, size_t size,
1651                                int flags)
1652 {
1653         struct sock_iocb *siocb = kiocb_to_siocb(iocb);
1654         struct scm_cookie tmp_scm;
1655         struct sock *sk = sock->sk;
1656         struct unix_sock *u = unix_sk(sk);
1657         struct sockaddr_un *sunaddr=msg->msg_name;
1658         int copied = 0;
1659         int check_creds = 0;
1660         int target;
1661         int err = 0;
1662         long timeo;
1663
1664         err = -EINVAL;
1665         if (sk->sk_state != TCP_ESTABLISHED)
1666                 goto out;
1667
1668         err = -EOPNOTSUPP;
1669         if (flags&MSG_OOB)
1670                 goto out;
1671
1672         target = sock_rcvlowat(sk, flags&MSG_WAITALL, size);
1673         timeo = sock_rcvtimeo(sk, flags&MSG_DONTWAIT);
1674
1675         msg->msg_namelen = 0;
1676
1677         /* Lock the socket to prevent queue disordering
1678          * while sleeps in memcpy_tomsg
1679          */
1680
1681         if (!siocb->scm) {
1682                 siocb->scm = &tmp_scm;
1683                 memset(&tmp_scm, 0, sizeof(tmp_scm));
1684         }
1685
1686         down(&u->readsem);
1687
1688         do
1689         {
1690                 int chunk;
1691                 struct sk_buff *skb;
1692
1693                 skb = skb_dequeue(&sk->sk_receive_queue);
1694                 if (skb==NULL)
1695                 {
1696                         if (copied >= target)
1697                                 break;
1698
1699                         /*
1700                          *      POSIX 1003.1g mandates this order.
1701                          */
1702                          
1703                         if ((err = sock_error(sk)) != 0)
1704                                 break;
1705                         if (sk->sk_shutdown & RCV_SHUTDOWN)
1706                                 break;
1707                         err = -EAGAIN;
1708                         if (!timeo)
1709                                 break;
1710                         up(&u->readsem);
1711
1712                         timeo = unix_stream_data_wait(sk, timeo);
1713
1714                         if (signal_pending(current)) {
1715                                 err = sock_intr_errno(timeo);
1716                                 goto out;
1717                         }
1718                         down(&u->readsem);
1719                         continue;
1720                 }
1721
1722                 if (check_creds) {
1723                         /* Never glue messages from different writers */
1724                         if (memcmp(UNIXCREDS(skb), &siocb->scm->creds, sizeof(siocb->scm->creds)) != 0) {
1725                                 skb_queue_head(&sk->sk_receive_queue, skb);
1726                                 break;
1727                         }
1728                 } else {
1729                         /* Copy credentials */
1730                         siocb->scm->creds = *UNIXCREDS(skb);
1731                         check_creds = 1;
1732                 }
1733
1734                 /* Copy address just once */
1735                 if (sunaddr)
1736                 {
1737                         unix_copy_addr(msg, skb->sk);
1738                         sunaddr = NULL;
1739                 }
1740
1741                 chunk = min_t(unsigned int, skb->len, size);
1742                 if (memcpy_toiovec(msg->msg_iov, skb->data, chunk)) {
1743                         skb_queue_head(&sk->sk_receive_queue, skb);
1744                         if (copied == 0)
1745                                 copied = -EFAULT;
1746                         break;
1747                 }
1748                 copied += chunk;
1749                 size -= chunk;
1750
1751                 /* Mark read part of skb as used */
1752                 if (!(flags & MSG_PEEK))
1753                 {
1754                         skb_pull(skb, chunk);
1755
1756                         if (UNIXCB(skb).fp)
1757                                 unix_detach_fds(siocb->scm, skb);
1758
1759                         /* put the skb back if we didn't use it up.. */
1760                         if (skb->len)
1761                         {
1762                                 skb_queue_head(&sk->sk_receive_queue, skb);
1763                                 break;
1764                         }
1765
1766                         kfree_skb(skb);
1767
1768                         if (siocb->scm->fp)
1769                                 break;
1770                 }
1771                 else
1772                 {
1773                         /* It is questionable, see note in unix_dgram_recvmsg.
1774                          */
1775                         if (UNIXCB(skb).fp)
1776                                 siocb->scm->fp = scm_fp_dup(UNIXCB(skb).fp);
1777
1778                         /* put message back and return */
1779                         skb_queue_head(&sk->sk_receive_queue, skb);
1780                         break;
1781                 }
1782         } while (size);
1783
1784         up(&u->readsem);
1785         scm_recv(sock, msg, siocb->scm, flags);
1786 out:
1787         return copied ? : err;
1788 }
1789
1790 static int unix_shutdown(struct socket *sock, int mode)
1791 {
1792         struct sock *sk = sock->sk;
1793         struct sock *other;
1794
1795         mode = (mode+1)&(RCV_SHUTDOWN|SEND_SHUTDOWN);
1796
1797         if (mode) {
1798                 unix_state_wlock(sk);
1799                 sk->sk_shutdown |= mode;
1800                 other=unix_peer(sk);
1801                 if (other)
1802                         sock_hold(other);
1803                 unix_state_wunlock(sk);
1804                 sk->sk_state_change(sk);
1805
1806                 if (other &&
1807                         (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) {
1808
1809                         int peer_mode = 0;
1810
1811                         if (mode&RCV_SHUTDOWN)
1812                                 peer_mode |= SEND_SHUTDOWN;
1813                         if (mode&SEND_SHUTDOWN)
1814                                 peer_mode |= RCV_SHUTDOWN;
1815                         unix_state_wlock(other);
1816                         other->sk_shutdown |= peer_mode;
1817                         unix_state_wunlock(other);
1818                         other->sk_state_change(other);
1819                         read_lock(&other->sk_callback_lock);
1820                         if (peer_mode == SHUTDOWN_MASK)
1821                                 sk_wake_async(other,1,POLL_HUP);
1822                         else if (peer_mode & RCV_SHUTDOWN)
1823                                 sk_wake_async(other,1,POLL_IN);
1824                         read_unlock(&other->sk_callback_lock);
1825                 }
1826                 if (other)
1827                         sock_put(other);
1828         }
1829         return 0;
1830 }
1831
1832 static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
1833 {
1834         struct sock *sk = sock->sk;
1835         long amount=0;
1836         int err;
1837
1838         switch(cmd)
1839         {
1840                 case SIOCOUTQ:
1841                         amount = atomic_read(&sk->sk_wmem_alloc);
1842                         err = put_user(amount, (int __user *)arg);
1843                         break;
1844                 case SIOCINQ:
1845                 {
1846                         struct sk_buff *skb;
1847                         if (sk->sk_state == TCP_LISTEN) {
1848                                 err = -EINVAL;
1849                                 break;
1850                         }
1851
1852                         spin_lock(&sk->sk_receive_queue.lock);
1853                         skb = skb_peek(&sk->sk_receive_queue);
1854                         if (skb)
1855                                 amount=skb->len;
1856                         spin_unlock(&sk->sk_receive_queue.lock);
1857                         err = put_user(amount, (int __user *)arg);
1858                         break;
1859                 }
1860
1861                 default:
1862                         err = dev_ioctl(cmd, (void __user *)arg);
1863                         break;
1864         }
1865         return err;
1866 }
1867
1868 static unsigned int unix_poll(struct file * file, struct socket *sock, poll_table *wait)
1869 {
1870         struct sock *sk = sock->sk;
1871         unsigned int mask;
1872
1873         poll_wait(file, sk->sk_sleep, wait);
1874         mask = 0;
1875
1876         /* exceptional events? */
1877         if (sk->sk_err)
1878                 mask |= POLLERR;
1879         if (sk->sk_shutdown == SHUTDOWN_MASK)
1880                 mask |= POLLHUP;
1881
1882         /* readable? */
1883         if (!skb_queue_empty(&sk->sk_receive_queue) ||
1884             (sk->sk_shutdown & RCV_SHUTDOWN))
1885                 mask |= POLLIN | POLLRDNORM;
1886
1887         /* Connection-based need to check for termination and startup */
1888         if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) && sk->sk_state == TCP_CLOSE)
1889                 mask |= POLLHUP;
1890
1891         /*
1892          * we set writable also when the other side has shut down the
1893          * connection. This prevents stuck sockets.
1894          */
1895         if (unix_writable(sk))
1896                 mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
1897
1898         return mask;
1899 }
1900
1901
1902 #ifdef CONFIG_PROC_FS
1903 static struct sock *unix_seq_idx(int *iter, loff_t pos)
1904 {
1905         loff_t off = 0;
1906         struct sock *s;
1907
1908         for (s = first_unix_socket(iter); s; s = next_unix_socket(iter, s)) {
1909                 if (off == pos) 
1910                         return s;
1911                 ++off;
1912         }
1913         return NULL;
1914 }
1915
1916
1917 static void *unix_seq_start(struct seq_file *seq, loff_t *pos)
1918 {
1919         read_lock(&unix_table_lock);
1920         return *pos ? unix_seq_idx(seq->private, *pos - 1) : ((void *) 1);
1921 }
1922
1923 static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1924 {
1925         ++*pos;
1926
1927         if (v == (void *)1) 
1928                 return first_unix_socket(seq->private);
1929         return next_unix_socket(seq->private, v);
1930 }
1931
1932 static void unix_seq_stop(struct seq_file *seq, void *v)
1933 {
1934         read_unlock(&unix_table_lock);
1935 }
1936
1937 static int unix_seq_show(struct seq_file *seq, void *v)
1938 {
1939         
1940         if (v == (void *)1)
1941                 seq_puts(seq, "Num       RefCount Protocol Flags    Type St "
1942                          "Inode Path\n");
1943         else {
1944                 struct sock *s = v;
1945                 struct unix_sock *u = unix_sk(s);
1946                 unix_state_rlock(s);
1947
1948                 seq_printf(seq, "%p: %08X %08X %08X %04X %02X %5lu",
1949                         s,
1950                         atomic_read(&s->sk_refcnt),
1951                         0,
1952                         s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0,
1953                         s->sk_type,
1954                         s->sk_socket ?
1955                         (s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) :
1956                         (s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING),
1957                         sock_i_ino(s));
1958
1959                 if (u->addr) {
1960                         int i, len;
1961                         seq_putc(seq, ' ');
1962
1963                         i = 0;
1964                         len = u->addr->len - sizeof(short);
1965                         if (!UNIX_ABSTRACT(s))
1966                                 len--;
1967                         else {
1968                                 seq_putc(seq, '@');
1969                                 i++;
1970                         }
1971                         for ( ; i < len; i++)
1972                                 seq_putc(seq, u->addr->name->sun_path[i]);
1973                 }
1974                 unix_state_runlock(s);
1975                 seq_putc(seq, '\n');
1976         }
1977
1978         return 0;
1979 }
1980
1981 static struct seq_operations unix_seq_ops = {
1982         .start  = unix_seq_start,
1983         .next   = unix_seq_next,
1984         .stop   = unix_seq_stop,
1985         .show   = unix_seq_show,
1986 };
1987
1988
1989 static int unix_seq_open(struct inode *inode, struct file *file)
1990 {
1991         struct seq_file *seq;
1992         int rc = -ENOMEM;
1993         int *iter = kmalloc(sizeof(int), GFP_KERNEL);
1994
1995         if (!iter)
1996                 goto out;
1997
1998         rc = seq_open(file, &unix_seq_ops);
1999         if (rc)
2000                 goto out_kfree;
2001
2002         seq          = file->private_data;
2003         seq->private = iter;
2004         *iter = 0;
2005 out:
2006         return rc;
2007 out_kfree:
2008         kfree(iter);
2009         goto out;
2010 }
2011
2012 static struct file_operations unix_seq_fops = {
2013         .owner          = THIS_MODULE,
2014         .open           = unix_seq_open,
2015         .read           = seq_read,
2016         .llseek         = seq_lseek,
2017         .release        = seq_release_private,
2018 };
2019
2020 #endif
2021
2022 static struct net_proto_family unix_family_ops = {
2023         .family = PF_UNIX,
2024         .create = unix_create,
2025         .owner  = THIS_MODULE,
2026 };
2027
/*
 * sysctl hooks: empty inline stubs when CONFIG_SYSCTL is not set,
 * otherwise declarations of functions defined outside this file.
 */
#ifndef CONFIG_SYSCTL
static inline void unix_sysctl_register(void) {}
static inline void unix_sysctl_unregister(void) {}
#else
extern void unix_sysctl_register(void);
extern void unix_sysctl_unregister(void);
#endif
2035
2036 static int __init af_unix_init(void)
2037 {
2038         struct sk_buff *dummy_skb;
2039
2040         if (sizeof(struct unix_skb_parms) > sizeof(dummy_skb->cb)) {
2041                 printk(KERN_CRIT "%s: panic\n", __FUNCTION__);
2042                 return -1;
2043         }
2044         /* allocate our sock slab cache */
2045         unix_sk_cachep = kmem_cache_create("unix_sock",
2046                                            sizeof(struct unix_sock), 0,
2047                                            SLAB_HWCACHE_ALIGN, NULL, NULL);
2048         if (!unix_sk_cachep)
2049                 printk(KERN_CRIT
2050                         "af_unix_init: Cannot create unix_sock SLAB cache!\n");
2051
2052         sock_register(&unix_family_ops);
2053 #ifdef CONFIG_PROC_FS
2054         proc_net_fops_create("unix", 0, &unix_seq_fops);
2055 #endif
2056         unix_sysctl_register();
2057         return 0;
2058 }
2059
2060 static void __exit af_unix_exit(void)
2061 {
2062         sock_unregister(PF_UNIX);
2063         unix_sysctl_unregister();
2064         proc_net_remove("unix");
2065         kmem_cache_destroy(unix_sk_cachep);
2066 }
2067
2068 module_init(af_unix_init);
2069 module_exit(af_unix_exit);
2070
2071 MODULE_LICENSE("GPL");
2072 MODULE_ALIAS_NETPROTO(PF_UNIX);