VServer 1.9.2 (patch-2.6.8.1-vs1.9.2.diff)
[linux-2.6.git] / net / unix / af_unix.c
1 /*
2  * NET4:        Implementation of BSD Unix domain sockets.
3  *
4  * Authors:     Alan Cox, <alan.cox@linux.org>
5  *
6  *              This program is free software; you can redistribute it and/or
7  *              modify it under the terms of the GNU General Public License
8  *              as published by the Free Software Foundation; either version
9  *              2 of the License, or (at your option) any later version.
10  *
11  * Version:     $Id: af_unix.c,v 1.133 2002/02/08 03:57:19 davem Exp $
12  *
13  * Fixes:
14  *              Linus Torvalds  :       Assorted bug cures.
15  *              Niibe Yutaka    :       async I/O support.
16  *              Carsten Paeth   :       PF_UNIX check, address fixes.
17  *              Alan Cox        :       Limit size of allocated blocks.
18  *              Alan Cox        :       Fixed the stupid socketpair bug.
19  *              Alan Cox        :       BSD compatibility fine tuning.
20  *              Alan Cox        :       Fixed a bug in connect when interrupted.
21  *              Alan Cox        :       Sorted out a proper draft version of
22  *                                      file descriptor passing hacked up from
23  *                                      Mike Shaver's work.
24  *              Marty Leisner   :       Fixes to fd passing
25  *              Nick Nevin      :       recvmsg bugfix.
26  *              Alan Cox        :       Started proper garbage collector
27  *              Heiko EiBfeldt  :       Missing verify_area check
28  *              Alan Cox        :       Started POSIXisms
29  *              Andreas Schwab  :       Replace inode by dentry for proper
30  *                                      reference counting
31  *              Kirk Petersen   :       Made this a module
32  *          Christoph Rohland   :       Elegant non-blocking accept/connect algorithm.
33  *                                      Lots of bug fixes.
34  *           Alexey Kuznetsov   :       Repaired (I hope) bugs introduced
35  *                                      by above two patches.
36  *           Andrea Arcangeli   :       If possible we block in connect(2)
37  *                                      if the max backlog of the listen socket
38  *                                      has been reached. This won't break
39  *                                      old apps and it will avoid a huge number
40  *                                      of socks hashed (for unix_gc()
41  *                                      performance reasons).
42  *                                      Security fix that limits the max
43  *                                      number of socks to 2*max_files and
44  *                                      the number of skb queueable in the
45  *                                      dgram receiver.
46  *              Artur Skawina   :       Hash function optimizations
47  *           Alexey Kuznetsov   :       Full scale SMP. Lot of bugs are introduced 8)
48  *            Malcolm Beattie   :       Set peercred for socketpair
49  *           Michal Ostrowski   :       Module initialization cleanup.
50  *           Arnaldo C. Melo    :       Remove MOD_{INC,DEC}_USE_COUNT,
51  *                                      the core infrastructure is doing that
52  *                                      for all net proto families now (2.5.69+)
53  *
54  *
55  * Known differences from reference BSD that was tested:
56  *
57  *      [TO FIX]
58  *      ECONNREFUSED is not returned from one end of a connected() socket to the
59  *              other the moment one end closes.
60  *      fstat() doesn't return st_dev=0, and give the blksize as high water mark
61  *              and a fake inode identifier (nor the BSD first socket fstat twice bug).
62  *      [NOT TO FIX]
63  *      accept() returns a path name even if the connecting socket has closed
64  *              in the meantime (BSD loses the path and gives up).
65  *      accept() returns 0 length path for an unbound connector. BSD returns 16
66  *              and a null first byte in the path (but not for gethost/peername - BSD bug ??)
67  *      socketpair(...SOCK_RAW..) doesn't panic the kernel.
68  *      BSD af_unix apparently has connect forgetting to block properly.
69  *              (need to check this with the POSIX spec in detail)
70  *
71  * Differences from 2.0.0-11-... (ANK)
72  *      Bug fixes and improvements.
73  *              - client shutdown killed server socket.
74  *              - removed all useless cli/sti pairs.
75  *
76  *      Semantic changes/extensions.
77  *              - generic control message passing.
78  *              - SCM_CREDENTIALS control message.
79  *              - "Abstract" (not FS based) socket bindings.
80  *                Abstract names are sequences of bytes (not zero terminated)
81  *                started by 0, so that this name space does not intersect
82  *                with BSD names.
83  */
84
85 #include <linux/module.h>
86 #include <linux/config.h>
87 #include <linux/kernel.h>
88 #include <linux/major.h>
89 #include <linux/signal.h>
90 #include <linux/sched.h>
91 #include <linux/errno.h>
92 #include <linux/string.h>
93 #include <linux/stat.h>
94 #include <linux/dcache.h>
95 #include <linux/namei.h>
96 #include <linux/socket.h>
97 #include <linux/un.h>
98 #include <linux/fcntl.h>
99 #include <linux/termios.h>
100 #include <linux/sockios.h>
101 #include <linux/net.h>
102 #include <linux/in.h>
103 #include <linux/fs.h>
104 #include <linux/slab.h>
105 #include <asm/uaccess.h>
106 #include <linux/skbuff.h>
107 #include <linux/netdevice.h>
108 #include <net/sock.h>
109 #include <linux/tcp.h>
110 #include <net/af_unix.h>
111 #include <linux/proc_fs.h>
112 #include <linux/seq_file.h>
113 #include <net/scm.h>
114 #include <linux/init.h>
115 #include <linux/poll.h>
116 #include <linux/smp_lock.h>
117 #include <linux/rtnetlink.h>
118 #include <linux/mount.h>
119 #include <net/checksum.h>
120 #include <linux/security.h>
121 #include <linux/vs_context.h>
122 #include <linux/vs_network.h>
123 #include <linux/vs_limit.h>
124
125 int sysctl_unix_max_dgram_qlen = 10;
126
127 kmem_cache_t *unix_sk_cachep;
128
129 struct hlist_head unix_socket_table[UNIX_HASH_SIZE + 1];
130 rwlock_t unix_table_lock = RW_LOCK_UNLOCKED;
131 static atomic_t unix_nr_socks = ATOMIC_INIT(0);
132
133 #define unix_sockets_unbound    (&unix_socket_table[UNIX_HASH_SIZE])
134
135 #define UNIX_ABSTRACT(sk)       (unix_sk(sk)->addr->hash != UNIX_HASH_SIZE)
136
137 /*
138  *  SMP locking strategy:
139  *    hash table is protected with rwlock unix_table_lock
140  *    each socket state is protected by separate rwlock.
141  */
142
143 static inline unsigned unix_hash_fold(unsigned hash)
144 {
145         hash ^= hash>>16;
146         hash ^= hash>>8;
147         return hash&(UNIX_HASH_SIZE-1);
148 }
149
150 #define unix_peer(sk) ((sk)->sk_pair)
151
152 static inline int unix_our_peer(struct sock *sk, struct sock *osk)
153 {
154         return unix_peer(osk) == sk;
155 }
156
157 static inline int unix_may_send(struct sock *sk, struct sock *osk)
158 {
159         return (unix_peer(osk) == NULL || unix_our_peer(sk, osk));
160 }
161
162 static struct sock *unix_peer_get(struct sock *s)
163 {
164         struct sock *peer;
165
166         unix_state_rlock(s);
167         peer = unix_peer(s);
168         if (peer)
169                 sock_hold(peer);
170         unix_state_runlock(s);
171         return peer;
172 }
173
174 static inline void unix_release_addr(struct unix_address *addr)
175 {
176         if (atomic_dec_and_test(&addr->refcnt))
177                 kfree(addr);
178 }
179
180 /*
181  *      Check unix socket name:
182  *              - should be not zero length.
183  *              - if started by not zero, should be NULL terminated (FS object)
184  *              - if started by zero, it is abstract name.
185  */
186  
187 static int unix_mkname(struct sockaddr_un * sunaddr, int len, unsigned *hashp)
188 {
189         if (len <= sizeof(short) || len > sizeof(*sunaddr))
190                 return -EINVAL;
191         if (!sunaddr || sunaddr->sun_family != AF_UNIX)
192                 return -EINVAL;
193         if (sunaddr->sun_path[0])
194         {
195                 /*
196                  *      This may look like an off by one error but it is
197                  *      a bit more subtle. 108 is the longest valid AF_UNIX
198                  *      path for a binding. sun_path[108] doesn't as such
199                  *      exist. However in kernel space we are guaranteed that
200                  *      it is a valid memory location in our kernel
201                  *      address buffer.
202                  */
203                 if (len > sizeof(*sunaddr))
204                         len = sizeof(*sunaddr);
205                 ((char *)sunaddr)[len]=0;
206                 len = strlen(sunaddr->sun_path)+1+sizeof(short);
207                 return len;
208         }
209
210         *hashp = unix_hash_fold(csum_partial((char*)sunaddr, len, 0));
211         return len;
212 }
213
214 static void __unix_remove_socket(struct sock *sk)
215 {
216         sk_del_node_init(sk);
217 }
218
219 static void __unix_insert_socket(struct hlist_head *list, struct sock *sk)
220 {
221         BUG_TRAP(sk_unhashed(sk));
222         sk_add_node(sk, list);
223 }
224
225 static inline void unix_remove_socket(struct sock *sk)
226 {
227         write_lock(&unix_table_lock);
228         __unix_remove_socket(sk);
229         write_unlock(&unix_table_lock);
230 }
231
232 static inline void unix_insert_socket(struct hlist_head *list, struct sock *sk)
233 {
234         write_lock(&unix_table_lock);
235         __unix_insert_socket(list, sk);
236         write_unlock(&unix_table_lock);
237 }
238
239 static struct sock *__unix_find_socket_byname(struct sockaddr_un *sunname,
240                                               int len, int type, unsigned hash)
241 {
242         struct sock *s;
243         struct hlist_node *node;
244
245         sk_for_each(s, node, &unix_socket_table[hash ^ type]) {
246                 struct unix_sock *u = unix_sk(s);
247
248                 if (u->addr->len == len &&
249                     !memcmp(u->addr->name, sunname, len))
250                         goto found;
251         }
252         s = NULL;
253 found:
254         return s;
255 }
256
257 static inline struct sock *unix_find_socket_byname(struct sockaddr_un *sunname,
258                                                    int len, int type,
259                                                    unsigned hash)
260 {
261         struct sock *s;
262
263         read_lock(&unix_table_lock);
264         s = __unix_find_socket_byname(sunname, len, type, hash);
265         if (s)
266                 sock_hold(s);
267         read_unlock(&unix_table_lock);
268         return s;
269 }
270
271 static struct sock *unix_find_socket_byinode(struct inode *i)
272 {
273         struct sock *s;
274         struct hlist_node *node;
275
276         read_lock(&unix_table_lock);
277         sk_for_each(s, node,
278                     &unix_socket_table[i->i_ino & (UNIX_HASH_SIZE - 1)]) {
279                 struct dentry *dentry = unix_sk(s)->dentry;
280
281                 if(dentry && dentry->d_inode == i)
282                 {
283                         sock_hold(s);
284                         goto found;
285                 }
286         }
287         s = NULL;
288 found:
289         read_unlock(&unix_table_lock);
290         return s;
291 }
292
293 static inline int unix_writable(struct sock *sk)
294 {
295         return (atomic_read(&sk->sk_wmem_alloc) << 2) <= sk->sk_sndbuf;
296 }
297
298 static void unix_write_space(struct sock *sk)
299 {
300         read_lock(&sk->sk_callback_lock);
301         if (unix_writable(sk)) {
302                 if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
303                         wake_up_interruptible(sk->sk_sleep);
304                 sk_wake_async(sk, 2, POLL_OUT);
305         }
306         read_unlock(&sk->sk_callback_lock);
307 }
308
309 /* When dgram socket disconnects (or changes its peer), we clear its receive
310  * queue of packets arrived from previous peer. First, it allows to do
311  * flow control based only on wmem_alloc; second, sk connected to peer
312  * may receive messages only from that peer. */
313 static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
314 {
315         if (skb_queue_len(&sk->sk_receive_queue)) {
316                 skb_queue_purge(&sk->sk_receive_queue);
317                 wake_up_interruptible_all(&unix_sk(sk)->peer_wait);
318
319                 /* If one link of bidirectional dgram pipe is disconnected,
320                  * we signal error. Messages are lost. Do not make this,
321                  * when peer was not connected to us.
322                  */
323                 if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) {
324                         other->sk_err = ECONNRESET;
325                         other->sk_error_report(other);
326                 }
327         }
328 }
329
330 static void unix_sock_destructor(struct sock *sk)
331 {
332         struct unix_sock *u = unix_sk(sk);
333
334         skb_queue_purge(&sk->sk_receive_queue);
335
336         BUG_TRAP(!atomic_read(&sk->sk_wmem_alloc));
337         BUG_TRAP(sk_unhashed(sk));
338         BUG_TRAP(!sk->sk_socket);
339         if (!sock_flag(sk, SOCK_DEAD)) {
340                 printk("Attempt to release alive unix socket: %p\n", sk);
341                 return;
342         }
343
344         if (u->addr)
345                 unix_release_addr(u->addr);
346
347         atomic_dec(&unix_nr_socks);
348 #ifdef UNIX_REFCNT_DEBUG
349         printk(KERN_DEBUG "UNIX %p is destroyed, %d are still alive.\n", sk, atomic_read(&unix_nr_socks));
350 #endif
351 }
352
353 static int unix_release_sock (struct sock *sk, int embrion)
354 {
355         struct unix_sock *u = unix_sk(sk);
356         struct dentry *dentry;
357         struct vfsmount *mnt;
358         struct sock *skpair;
359         struct sk_buff *skb;
360         int state;
361
362         unix_remove_socket(sk);
363
364         /* Clear state */
365         unix_state_wlock(sk);
366         sock_orphan(sk);
367         sk->sk_shutdown = SHUTDOWN_MASK;
368         dentry       = u->dentry;
369         u->dentry    = NULL;
370         mnt          = u->mnt;
371         u->mnt       = NULL;
372         state = sk->sk_state;
373         sk->sk_state = TCP_CLOSE;
374         unix_state_wunlock(sk);
375
376         wake_up_interruptible_all(&u->peer_wait);
377
378         skpair=unix_peer(sk);
379
380         if (skpair!=NULL) {
381                 if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) {
382                         unix_state_wlock(skpair);
383                         /* No more writes */
384                         skpair->sk_shutdown = SHUTDOWN_MASK;
385                         if (!skb_queue_empty(&sk->sk_receive_queue) || embrion)
386                                 skpair->sk_err = ECONNRESET;
387                         unix_state_wunlock(skpair);
388                         skpair->sk_state_change(skpair);
389                         read_lock(&skpair->sk_callback_lock);
390                         sk_wake_async(skpair,1,POLL_HUP);
391                         read_unlock(&skpair->sk_callback_lock);
392                 }
393                 sock_put(skpair); /* It may now die */
394                 unix_peer(sk) = NULL;
395         }
396
397         /* Try to flush out this socket. Throw out buffers at least */
398
399         while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
400                 if (state==TCP_LISTEN)
401                         unix_release_sock(skb->sk, 1);
402                 /* passed fds are erased in the kfree_skb hook        */
403                 kfree_skb(skb);
404         }
405
406         if (dentry) {
407                 dput(dentry);
408                 mntput(mnt);
409         }
410
411         vx_sock_dec(sk);
412         clr_vx_info(&sk->sk_vx_info);
413         clr_nx_info(&sk->sk_nx_info);
414         sock_put(sk);
415
416         /* ---- Socket is dead now and most probably destroyed ---- */
417
418         /*
419          * Fixme: BSD difference: In BSD all sockets connected to use get
420          *        ECONNRESET and we die on the spot. In Linux we behave
421          *        like files and pipes do and wait for the last
422          *        dereference.
423          *
424          * Can't we simply set sock->err?
425          *
426          *        What the above comment does talk about? --ANK(980817)
427          */
428
429         if (atomic_read(&unix_tot_inflight))
430                 unix_gc();              /* Garbage collect fds */       
431
432         return 0;
433 }
434
435 static int unix_listen(struct socket *sock, int backlog)
436 {
437         int err;
438         struct sock *sk = sock->sk;
439         struct unix_sock *u = unix_sk(sk);
440
441         err = -EOPNOTSUPP;
442         if (sock->type!=SOCK_STREAM && sock->type!=SOCK_SEQPACKET)
443                 goto out;                       /* Only stream/seqpacket sockets accept */
444         err = -EINVAL;
445         if (!u->addr)
446                 goto out;                       /* No listens on an unbound socket */
447         unix_state_wlock(sk);
448         if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN)
449                 goto out_unlock;
450         if (backlog > sk->sk_max_ack_backlog)
451                 wake_up_interruptible_all(&u->peer_wait);
452         sk->sk_max_ack_backlog  = backlog;
453         sk->sk_state            = TCP_LISTEN;
454         /* set credentials so connect can copy them */
455         sk->sk_peercred.pid     = current->tgid;
456         sk->sk_peercred.uid     = current->euid;
457         sk->sk_peercred.gid     = current->egid;
458         err = 0;
459
460 out_unlock:
461         unix_state_wunlock(sk);
462 out:
463         return err;
464 }
465
466 static int unix_release(struct socket *);
467 static int unix_bind(struct socket *, struct sockaddr *, int);
468 static int unix_stream_connect(struct socket *, struct sockaddr *,
469                                int addr_len, int flags);
470 static int unix_socketpair(struct socket *, struct socket *);
471 static int unix_accept(struct socket *, struct socket *, int);
472 static int unix_getname(struct socket *, struct sockaddr *, int *, int);
473 static unsigned int unix_poll(struct file *, struct socket *, poll_table *);
474 static int unix_ioctl(struct socket *, unsigned int, unsigned long);
475 static int unix_shutdown(struct socket *, int);
476 static int unix_stream_sendmsg(struct kiocb *, struct socket *,
477                                struct msghdr *, size_t);
478 static int unix_stream_recvmsg(struct kiocb *, struct socket *,
479                                struct msghdr *, size_t, int);
480 static int unix_dgram_sendmsg(struct kiocb *, struct socket *,
481                               struct msghdr *, size_t);
482 static int unix_dgram_recvmsg(struct kiocb *, struct socket *,
483                               struct msghdr *, size_t, int);
484 static int unix_dgram_connect(struct socket *, struct sockaddr *,
485                               int, int);
486
487 static struct proto_ops unix_stream_ops = {
488         .family =       PF_UNIX,
489         .owner =        THIS_MODULE,
490         .release =      unix_release,
491         .bind =         unix_bind,
492         .connect =      unix_stream_connect,
493         .socketpair =   unix_socketpair,
494         .accept =       unix_accept,
495         .getname =      unix_getname,
496         .poll =         unix_poll,
497         .ioctl =        unix_ioctl,
498         .listen =       unix_listen,
499         .shutdown =     unix_shutdown,
500         .setsockopt =   sock_no_setsockopt,
501         .getsockopt =   sock_no_getsockopt,
502         .sendmsg =      unix_stream_sendmsg,
503         .recvmsg =      unix_stream_recvmsg,
504         .mmap =         sock_no_mmap,
505         .sendpage =     sock_no_sendpage,
506 };
507
508 static struct proto_ops unix_dgram_ops = {
509         .family =       PF_UNIX,
510         .owner =        THIS_MODULE,
511         .release =      unix_release,
512         .bind =         unix_bind,
513         .connect =      unix_dgram_connect,
514         .socketpair =   unix_socketpair,
515         .accept =       sock_no_accept,
516         .getname =      unix_getname,
517         .poll =         datagram_poll,
518         .ioctl =        unix_ioctl,
519         .listen =       sock_no_listen,
520         .shutdown =     unix_shutdown,
521         .setsockopt =   sock_no_setsockopt,
522         .getsockopt =   sock_no_getsockopt,
523         .sendmsg =      unix_dgram_sendmsg,
524         .recvmsg =      unix_dgram_recvmsg,
525         .mmap =         sock_no_mmap,
526         .sendpage =     sock_no_sendpage,
527 };
528
529 static struct proto_ops unix_seqpacket_ops = {
530         .family =       PF_UNIX,
531         .owner =        THIS_MODULE,
532         .release =      unix_release,
533         .bind =         unix_bind,
534         .connect =      unix_stream_connect,
535         .socketpair =   unix_socketpair,
536         .accept =       unix_accept,
537         .getname =      unix_getname,
538         .poll =         datagram_poll,
539         .ioctl =        unix_ioctl,
540         .listen =       unix_listen,
541         .shutdown =     unix_shutdown,
542         .setsockopt =   sock_no_setsockopt,
543         .getsockopt =   sock_no_getsockopt,
544         .sendmsg =      unix_dgram_sendmsg,
545         .recvmsg =      unix_dgram_recvmsg,
546         .mmap =         sock_no_mmap,
547         .sendpage =     sock_no_sendpage,
548 };
549
550 static struct sock * unix_create1(struct socket *sock)
551 {
552         struct sock *sk = NULL;
553         struct unix_sock *u;
554
555         if (atomic_read(&unix_nr_socks) >= 2*files_stat.max_files)
556                 goto out;
557
558         sk = sk_alloc(PF_UNIX, GFP_KERNEL, sizeof(struct unix_sock),
559                       unix_sk_cachep);
560         if (!sk)
561                 goto out;
562
563         atomic_inc(&unix_nr_socks);
564
565         sock_init_data(sock,sk);
566         sk_set_owner(sk, THIS_MODULE);
567
568         set_vx_info(&sk->sk_vx_info, current->vx_info);
569         sk->sk_xid = vx_current_xid();
570         vx_sock_inc(sk);
571         set_nx_info(&sk->sk_nx_info, current->nx_info);
572
573         sk->sk_write_space      = unix_write_space;
574         sk->sk_max_ack_backlog  = sysctl_unix_max_dgram_qlen;
575         sk->sk_destruct         = unix_sock_destructor;
576         u         = unix_sk(sk);
577         u->dentry = NULL;
578         u->mnt    = NULL;
579         rwlock_init(&u->lock);
580         atomic_set(&u->inflight, sock ? 0 : -1);
581         init_MUTEX(&u->readsem); /* single task reading lock */
582         init_waitqueue_head(&u->peer_wait);
583         unix_insert_socket(unix_sockets_unbound, sk);
584 out:
585         return sk;
586 }
587
588 static int unix_create(struct socket *sock, int protocol)
589 {
590         if (protocol && protocol != PF_UNIX)
591                 return -EPROTONOSUPPORT;
592
593         sock->state = SS_UNCONNECTED;
594
595         switch (sock->type) {
596         case SOCK_STREAM:
597                 sock->ops = &unix_stream_ops;
598                 break;
599                 /*
600                  *      Believe it or not BSD has AF_UNIX, SOCK_RAW though
601                  *      nothing uses it.
602                  */
603         case SOCK_RAW:
604                 sock->type=SOCK_DGRAM;
605         case SOCK_DGRAM:
606                 sock->ops = &unix_dgram_ops;
607                 break;
608         case SOCK_SEQPACKET:
609                 sock->ops = &unix_seqpacket_ops;
610                 break;
611         default:
612                 return -ESOCKTNOSUPPORT;
613         }
614
615         return unix_create1(sock) ? 0 : -ENOMEM;
616 }
617
618 static int unix_release(struct socket *sock)
619 {
620         struct sock *sk = sock->sk;
621
622         if (!sk)
623                 return 0;
624
625         sock->sk = NULL;
626
627         return unix_release_sock (sk, 0);
628 }
629
630 static int unix_autobind(struct socket *sock)
631 {
632         struct sock *sk = sock->sk;
633         struct unix_sock *u = unix_sk(sk);
634         static u32 ordernum = 1;
635         struct unix_address * addr;
636         int err;
637
638         down(&u->readsem);
639
640         err = 0;
641         if (u->addr)
642                 goto out;
643
644         err = -ENOMEM;
645         addr = kmalloc(sizeof(*addr) + sizeof(short) + 16, GFP_KERNEL);
646         if (!addr)
647                 goto out;
648
649         memset(addr, 0, sizeof(*addr) + sizeof(short) + 16);
650         addr->name->sun_family = AF_UNIX;
651         atomic_set(&addr->refcnt, 1);
652
653 retry:
654         addr->len = sprintf(addr->name->sun_path+1, "%05x", ordernum) + 1 + sizeof(short);
655         addr->hash = unix_hash_fold(csum_partial((void*)addr->name, addr->len, 0));
656
657         write_lock(&unix_table_lock);
658         ordernum = (ordernum+1)&0xFFFFF;
659
660         if (__unix_find_socket_byname(addr->name, addr->len, sock->type,
661                                       addr->hash)) {
662                 write_unlock(&unix_table_lock);
663                 /* Sanity yield. It is unusual case, but yet... */
664                 if (!(ordernum&0xFF))
665                         yield();
666                 goto retry;
667         }
668         addr->hash ^= sk->sk_type;
669
670         __unix_remove_socket(sk);
671         u->addr = addr;
672         __unix_insert_socket(&unix_socket_table[addr->hash], sk);
673         write_unlock(&unix_table_lock);
674         err = 0;
675
676 out:    up(&u->readsem);
677         return err;
678 }
679
680 static struct sock *unix_find_other(struct sockaddr_un *sunname, int len,
681                                     int type, unsigned hash, int *error)
682 {
683         struct sock *u;
684         struct nameidata nd;
685         int err = 0;
686         
687         if (sunname->sun_path[0]) {
688                 err = path_lookup(sunname->sun_path, LOOKUP_FOLLOW, &nd);
689                 if (err)
690                         goto fail;
691                 err = permission(nd.dentry->d_inode,MAY_WRITE, &nd);
692                 if (err)
693                         goto put_fail;
694
695                 err = -ECONNREFUSED;
696                 if (!S_ISSOCK(nd.dentry->d_inode->i_mode))
697                         goto put_fail;
698                 u=unix_find_socket_byinode(nd.dentry->d_inode);
699                 if (!u)
700                         goto put_fail;
701
702                 if (u->sk_type == type)
703                         touch_atime(nd.mnt, nd.dentry);
704
705                 path_release(&nd);
706
707                 err=-EPROTOTYPE;
708                 if (u->sk_type != type) {
709                         sock_put(u);
710                         goto fail;
711                 }
712         } else {
713                 err = -ECONNREFUSED;
714                 u=unix_find_socket_byname(sunname, len, type, hash);
715                 if (u) {
716                         struct dentry *dentry;
717                         dentry = unix_sk(u)->dentry;
718                         if (dentry)
719                                 touch_atime(unix_sk(u)->mnt, dentry);
720                 } else
721                         goto fail;
722         }
723         return u;
724
725 put_fail:
726         path_release(&nd);
727 fail:
728         *error=err;
729         return NULL;
730 }
731
732
733 static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
734 {
735         struct sock *sk = sock->sk;
736         struct unix_sock *u = unix_sk(sk);
737         struct sockaddr_un *sunaddr=(struct sockaddr_un *)uaddr;
738         struct dentry * dentry = NULL;
739         struct nameidata nd;
740         int err;
741         unsigned hash;
742         struct unix_address *addr;
743         struct hlist_head *list;
744
745         err = -EINVAL;
746         if (sunaddr->sun_family != AF_UNIX)
747                 goto out;
748
749         if (addr_len==sizeof(short)) {
750                 err = unix_autobind(sock);
751                 goto out;
752         }
753
754         err = unix_mkname(sunaddr, addr_len, &hash);
755         if (err < 0)
756                 goto out;
757         addr_len = err;
758
759         down(&u->readsem);
760
761         err = -EINVAL;
762         if (u->addr)
763                 goto out_up;
764
765         err = -ENOMEM;
766         addr = kmalloc(sizeof(*addr)+addr_len, GFP_KERNEL);
767         if (!addr)
768                 goto out_up;
769
770         memcpy(addr->name, sunaddr, addr_len);
771         addr->len = addr_len;
772         addr->hash = hash ^ sk->sk_type;
773         atomic_set(&addr->refcnt, 1);
774
775         if (sunaddr->sun_path[0]) {
776                 unsigned int mode;
777                 err = 0;
778                 /*
779                  * Get the parent directory, calculate the hash for last
780                  * component.
781                  */
782                 err = path_lookup(sunaddr->sun_path, LOOKUP_PARENT, &nd);
783                 if (err)
784                         goto out_mknod_parent;
785                 /*
786                  * Yucky last component or no last component at all?
787                  * (foo/., foo/.., /////)
788                  */
789                 err = -EEXIST;
790                 if (nd.last_type != LAST_NORM)
791                         goto out_mknod;
792                 /*
793                  * Lock the directory.
794                  */
795                 down(&nd.dentry->d_inode->i_sem);
796                 /*
797                  * Do the final lookup.
798                  */
799                 dentry = lookup_hash(&nd.last, nd.dentry);
800                 err = PTR_ERR(dentry);
801                 if (IS_ERR(dentry))
802                         goto out_mknod_unlock;
803                 err = -ENOENT;
804                 /*
805                  * Special case - lookup gave negative, but... we had foo/bar/
806                  * From the vfs_mknod() POV we just have a negative dentry -
807                  * all is fine. Let's be bastards - you had / on the end, you've
808                  * been asking for (non-existent) directory. -ENOENT for you.
809                  */
810                 if (nd.last.name[nd.last.len] && !dentry->d_inode)
811                         goto out_mknod_dput;
812                 /*
813                  * All right, let's create it.
814                  */
815                 mode = S_IFSOCK |
816                        (SOCK_INODE(sock)->i_mode & ~current->fs->umask);
817                 err = vfs_mknod(nd.dentry->d_inode, dentry, mode, 0);
818                 if (err)
819                         goto out_mknod_dput;
820                 up(&nd.dentry->d_inode->i_sem);
821                 dput(nd.dentry);
822                 nd.dentry = dentry;
823
824                 addr->hash = UNIX_HASH_SIZE;
825         }
826
827         write_lock(&unix_table_lock);
828
829         if (!sunaddr->sun_path[0]) {
830                 err = -EADDRINUSE;
831                 if (__unix_find_socket_byname(sunaddr, addr_len,
832                                               sk->sk_type, hash)) {
833                         unix_release_addr(addr);
834                         goto out_unlock;
835                 }
836
837                 list = &unix_socket_table[addr->hash];
838         } else {
839                 list = &unix_socket_table[dentry->d_inode->i_ino & (UNIX_HASH_SIZE-1)];
840                 u->dentry = nd.dentry;
841                 u->mnt    = nd.mnt;
842         }
843
844         err = 0;
845         __unix_remove_socket(sk);
846         u->addr = addr;
847         __unix_insert_socket(list, sk);
848
849 out_unlock:
850         write_unlock(&unix_table_lock);
851 out_up:
852         up(&u->readsem);
853 out:
854         return err;
855
856 out_mknod_dput:
857         dput(dentry);
858 out_mknod_unlock:
859         up(&nd.dentry->d_inode->i_sem);
860 out_mknod:
861         path_release(&nd);
862 out_mknod_parent:
863         if (err==-EEXIST)
864                 err=-EADDRINUSE;
865         unix_release_addr(addr);
866         goto out_up;
867 }
868
869 static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
870                               int alen, int flags)
871 {
872         struct sock *sk = sock->sk;
873         struct sockaddr_un *sunaddr=(struct sockaddr_un*)addr;
874         struct sock *other;
875         unsigned hash;
876         int err;
877
878         if (addr->sa_family != AF_UNSPEC) {
879                 err = unix_mkname(sunaddr, alen, &hash);
880                 if (err < 0)
881                         goto out;
882                 alen = err;
883
884                 if (test_bit(SOCK_PASS_CRED, &sock->flags) && !unix_sk(sk)->addr &&
885                     (err = unix_autobind(sock)) != 0)
886                         goto out;
887
888                 other=unix_find_other(sunaddr, alen, sock->type, hash, &err);
889                 if (!other)
890                         goto out;
891
892                 unix_state_wlock(sk);
893
894                 err = -EPERM;
895                 if (!unix_may_send(sk, other))
896                         goto out_unlock;
897
898                 err = security_unix_may_send(sk->sk_socket, other->sk_socket);
899                 if (err)
900                         goto out_unlock;
901
902         } else {
903                 /*
904                  *      1003.1g breaking connected state with AF_UNSPEC
905                  */
906                 other = NULL;
907                 unix_state_wlock(sk);
908         }
909
910         /*
911          * If it was connected, reconnect.
912          */
913         if (unix_peer(sk)) {
914                 struct sock *old_peer = unix_peer(sk);
915                 unix_peer(sk)=other;
916                 unix_state_wunlock(sk);
917
918                 if (other != old_peer)
919                         unix_dgram_disconnected(sk, old_peer);
920                 sock_put(old_peer);
921         } else {
922                 unix_peer(sk)=other;
923                 unix_state_wunlock(sk);
924         }
925         return 0;
926
927 out_unlock:
928         unix_state_wunlock(sk);
929         sock_put(other);
930 out:
931         return err;
932 }
933
934 static long unix_wait_for_peer(struct sock *other, long timeo)
935 {
936         struct unix_sock *u = unix_sk(other);
937         int sched;
938         DEFINE_WAIT(wait);
939
940         prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE);
941
942         sched = !sock_flag(other, SOCK_DEAD) &&
943                 !(other->sk_shutdown & RCV_SHUTDOWN) &&
944                 (skb_queue_len(&other->sk_receive_queue) >
945                  other->sk_max_ack_backlog);
946
947         unix_state_runlock(other);
948
949         if (sched)
950                 timeo = schedule_timeout(timeo);
951
952         finish_wait(&u->peer_wait, &wait);
953         return timeo;
954 }
955
/*
 * unix_stream_connect - connect sk to the listening socket named by uaddr.
 *
 * The address is normalised with unix_mkname() and the caller is autobound
 * when SOCK_PASS_CRED is set and it has no address.  The sock for the far
 * end of the connection (newsk) and a small skb charged to newsk are
 * allocated up front, before any state lock is taken.  On success newsk is
 * paired with sk, inherits the listener's address/dentry/mnt, and the skb is
 * appended to the listener's receive queue, from where unix_accept() picks
 * newsk up through skb->sk.
 *
 * Returns 0 on success or a negative errno.  Set directly here: -ENOMEM
 * (allocation failure), -ECONNREFUSED (target not in TCP_LISTEN), -EAGAIN
 * (backlog exceeded and no time left to wait), -EISCONN (sk already
 * TCP_ESTABLISHED), -EINVAL (sk in any other state than TCP_CLOSE).  Other
 * values come from unix_mkname(), unix_autobind(), unix_find_other() (via
 * &err), sock_intr_errno() and security_unix_stream_connect().
 */
956 static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
957                                int addr_len, int flags)
958 {
959         struct sockaddr_un *sunaddr=(struct sockaddr_un *)uaddr;
960         struct sock *sk = sock->sk;
961         struct unix_sock *u = unix_sk(sk), *newu, *otheru;
962         struct sock *newsk = NULL;
963         struct sock *other = NULL;
964         struct sk_buff *skb = NULL;
965         unsigned hash;
966         int st;
967         int err;
968         long timeo;
969
970         err = unix_mkname(sunaddr, addr_len, &hash);
971         if (err < 0)
972                 goto out;
973         addr_len = err;
974
975         if (test_bit(SOCK_PASS_CRED, &sock->flags)
976                 && !u->addr && (err = unix_autobind(sock)) != 0)
977                 goto out;
978
979         timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);
980
981         /* First of all allocate resources.
982            If we will make it after state is locked,
983            we will have to recheck all again in any case.
984          */
985
986         err = -ENOMEM;
987
988         /* create new sock for complete connection */
989         newsk = unix_create1(NULL);
990         if (newsk == NULL)
991                 goto out;
992
993         /* Allocate skb for sending to listening sock */
994         skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
995         if (skb == NULL)
996                 goto out;
997
998 restart:
999         /*  Find listening sock. */
1000         other = unix_find_other(sunaddr, addr_len, sk->sk_type, hash, &err);
1001         if (!other)
1002                 goto out;
1003
1004         /* Latch state of peer */
1005         unix_state_rlock(other);
1006
1007         /* Apparently VFS overslept socket death. Retry. */
1008         if (sock_flag(other, SOCK_DEAD)) {
1009                 unix_state_runlock(other);
1010                 sock_put(other);
1011                 goto restart;
1012         }
1013
1014         err = -ECONNREFUSED;
1015         if (other->sk_state != TCP_LISTEN)
1016                 goto out_unlock;
1017
1018         if (skb_queue_len(&other->sk_receive_queue) >
1019             other->sk_max_ack_backlog) {
1020                 err = -EAGAIN;
1021                 if (!timeo)
1022                         goto out_unlock;
1023
                     /*
                      * unix_wait_for_peer() releases the read lock on other,
                      * so the exits below go to out/restart, not out_unlock.
                      */
1024                 timeo = unix_wait_for_peer(other, timeo);
1025
1026                 err = sock_intr_errno(timeo);
1027                 if (signal_pending(current))
1028                         goto out;
1029                 sock_put(other);
1030                 goto restart;
1031         }
1032
1033         /* Latch our state.
1034
1035            It is tricky place. We need to grab write lock and cannot
1036            drop lock on peer. It is dangerous because deadlock is
1037            possible. Connect to self case and simultaneous
1038            attempt to connect are eliminated by checking socket
1039            state. other is TCP_LISTEN, if sk is TCP_LISTEN we
1040            check this before attempt to grab lock.
1041
1042            Well, and we have to recheck the state after socket locked.
1043          */
1044         st = sk->sk_state;
1045
1046         switch (st) {
1047         case TCP_CLOSE:
1048                 /* This is ok... continue with connect */
1049                 break;
1050         case TCP_ESTABLISHED:
1051                 /* Socket is already connected */
1052                 err = -EISCONN;
1053                 goto out_unlock;
1054         default:
1055                 err = -EINVAL;
1056                 goto out_unlock;
1057         }
1058
1059         unix_state_wlock(sk);
1060
             /* State changed between the unlocked read and the lock: start over. */
1061         if (sk->sk_state != st) {
1062                 unix_state_wunlock(sk);
1063                 unix_state_runlock(other);
1064                 sock_put(other);
1065                 goto restart;
1066         }
1067
1068         err = security_unix_stream_connect(sock, other->sk_socket, newsk);
1069         if (err) {
1070                 unix_state_wunlock(sk);
1071                 goto out_unlock;
1072         }
1073
1074         /* The way is open! Quickly set all the necessary fields... */
1075
             /* newsk's peer pointer holds a reference on sk. */
1076         sock_hold(sk);
1077         unix_peer(newsk)        = sk;
1078         newsk->sk_state         = TCP_ESTABLISHED;
1079         newsk->sk_type          = sk->sk_type;
1080         newsk->sk_peercred.pid  = current->tgid;
1081         newsk->sk_peercred.uid  = current->euid;
1082         newsk->sk_peercred.gid  = current->egid;
1083         newu = unix_sk(newsk);
1084         newsk->sk_sleep         = &newu->peer_wait;
1085         otheru = unix_sk(other);
1086
1087         /* copy address information from listening to new sock*/
1088         if (otheru->addr) {
1089                 atomic_inc(&otheru->addr->refcnt);
1090                 newu->addr = otheru->addr;
1091         }
1092         if (otheru->dentry) {
1093                 newu->dentry    = dget(otheru->dentry);
1094                 newu->mnt       = mntget(otheru->mnt);
1095         }
1096
1097         /* Set credentials */
1098         sk->sk_peercred = other->sk_peercred;
1099
             /* ...and sk's peer pointer holds a reference on newsk. */
1100         sock_hold(newsk);
1101         unix_peer(sk)   = newsk;
1102         sock->state     = SS_CONNECTED;
1103         sk->sk_state    = TCP_ESTABLISHED;
1104
1105         unix_state_wunlock(sk);
1106
1107         /* take ten and send info to listening sock */
1108         spin_lock(&other->sk_receive_queue.lock);
1109         __skb_queue_tail(&other->sk_receive_queue, skb);
1110         /* Undo artificially decreased inflight after embryo
1111          * is installed to listening socket. */
1112         atomic_inc(&newu->inflight);
1113         spin_unlock(&other->sk_receive_queue.lock);
1114         unix_state_runlock(other);
1115         other->sk_data_ready(other, 0);
1116         sock_put(other);
1117         return 0;
1118
     /* Error exits: out_unlock additionally drops the read lock on other. */
1119 out_unlock:
1120         if (other)
1121                 unix_state_runlock(other);
1122
1123 out:
1124         if (skb)
1125                 kfree_skb(skb);
1126         if (newsk)
1127                 unix_release_sock(newsk, 0);
1128         if (other)
1129                 sock_put(other);
1130         return err;
1131 }
1132
1133 static int unix_socketpair(struct socket *socka, struct socket *sockb)
1134 {
1135         struct sock *ska=socka->sk, *skb = sockb->sk;
1136
1137         /* Join our sockets back to back */
1138         sock_hold(ska);
1139         sock_hold(skb);
1140         unix_peer(ska)=skb;
1141         unix_peer(skb)=ska;
1142         ska->sk_peercred.pid = skb->sk_peercred.pid = current->tgid;
1143         ska->sk_peercred.uid = skb->sk_peercred.uid = current->euid;
1144         ska->sk_peercred.gid = skb->sk_peercred.gid = current->egid;
1145
1146         if (ska->sk_type != SOCK_DGRAM) {
1147                 ska->sk_state = TCP_ESTABLISHED;
1148                 skb->sk_state = TCP_ESTABLISHED;
1149                 socka->state  = SS_CONNECTED;
1150                 sockb->state  = SS_CONNECTED;
1151         }
1152         return 0;
1153 }
1154
1155 static int unix_accept(struct socket *sock, struct socket *newsock, int flags)
1156 {
1157         struct sock *sk = sock->sk;
1158         struct sock *tsk;
1159         struct sk_buff *skb;
1160         int err;
1161
1162         err = -EOPNOTSUPP;
1163         if (sock->type!=SOCK_STREAM && sock->type!=SOCK_SEQPACKET)
1164                 goto out;
1165
1166         err = -EINVAL;
1167         if (sk->sk_state != TCP_LISTEN)
1168                 goto out;
1169
1170         /* If socket state is TCP_LISTEN it cannot change (for now...),
1171          * so that no locks are necessary.
1172          */
1173
1174         skb = skb_recv_datagram(sk, 0, flags&O_NONBLOCK, &err);
1175         if (!skb) {
1176                 /* This means receive shutdown. */
1177                 if (err == 0)
1178                         err = -EINVAL;
1179                 goto out;
1180         }
1181
1182         tsk = skb->sk;
1183         skb_free_datagram(sk, skb);
1184         wake_up_interruptible(&unix_sk(sk)->peer_wait);
1185
1186         /* attach accepted sock to socket */
1187         unix_state_wlock(tsk);
1188         newsock->state = SS_CONNECTED;
1189         sock_graft(tsk, newsock);
1190         unix_state_wunlock(tsk);
1191         return 0;
1192
1193 out:
1194         return err;
1195 }
1196
1197
1198 static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int *uaddr_len, int peer)
1199 {
1200         struct sock *sk = sock->sk;
1201         struct unix_sock *u;
1202         struct sockaddr_un *sunaddr=(struct sockaddr_un *)uaddr;
1203         int err = 0;
1204
1205         if (peer) {
1206                 sk = unix_peer_get(sk);
1207
1208                 err = -ENOTCONN;
1209                 if (!sk)
1210                         goto out;
1211                 err = 0;
1212         } else {
1213                 sock_hold(sk);
1214         }
1215
1216         u = unix_sk(sk);
1217         unix_state_rlock(sk);
1218         if (!u->addr) {
1219                 sunaddr->sun_family = AF_UNIX;
1220                 sunaddr->sun_path[0] = 0;
1221                 *uaddr_len = sizeof(short);
1222         } else {
1223                 struct unix_address *addr = u->addr;
1224
1225                 *uaddr_len = addr->len;
1226                 memcpy(sunaddr, addr->name, *uaddr_len);
1227         }
1228         unix_state_runlock(sk);
1229         sock_put(sk);
1230 out:
1231         return err;
1232 }
1233
1234 static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb)
1235 {
1236         int i;
1237
1238         scm->fp = UNIXCB(skb).fp;
1239         skb->destructor = sock_wfree;
1240         UNIXCB(skb).fp = NULL;
1241
1242         for (i=scm->fp->count-1; i>=0; i--)
1243                 unix_notinflight(scm->fp->fp[i]);
1244 }
1245
1246 static void unix_destruct_fds(struct sk_buff *skb)
1247 {
1248         struct scm_cookie scm;
1249         memset(&scm, 0, sizeof(scm));
1250         unix_detach_fds(&scm, skb);
1251
1252         /* Alas, it calls VFS */
1253         /* So fscking what? fput() had been SMP-safe since the last Summer */
1254         scm_destroy(&scm);
1255         sock_wfree(skb);
1256 }
1257
1258 static void unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
1259 {
1260         int i;
1261         for (i=scm->fp->count-1; i>=0; i--)
1262                 unix_inflight(scm->fp->fp[i]);
1263         UNIXCB(skb).fp = scm->fp;
1264         skb->destructor = unix_destruct_fds;
1265         scm->fp = NULL;
1266 }
1267
1268 /*
1269  *      Send AF_UNIX data.
1270  */
1271
/*
 * unix_dgram_sendmsg - queue one datagram on the destination's receive queue.
 *
 * The destination is msg->msg_name when msg->msg_namelen is non-zero
 * (resolved later with unix_find_other()), otherwise the connected peer.
 * Control data is parsed by scm_send() into siocb->scm (a stack cookie is
 * substituted when the iocb carries none) and released with scm_destroy()
 * on every exit taken after scm_send() succeeded.
 *
 * Returns len on success, otherwise a negative errno.  Set directly here:
 * -EOPNOTSUPP (MSG_OOB), -ENOTCONN (no name given and no peer), -EMSGSIZE
 * (len > sk_sndbuf - 32), -ECONNRESET (no destination sock and no name to
 * look one up by), -EPERM (unix_may_send() refused), -ECONNREFUSED (the dead
 * destination was our connected peer), -EPIPE (destination has
 * RCV_SHUTDOWN) and -EAGAIN (destination queue over its backlog and no time
 * left to wait).  Other values come from the helpers called.
 */
1272 static int unix_dgram_sendmsg(struct kiocb *kiocb, struct socket *sock,
1273                               struct msghdr *msg, size_t len)
1274 {
1275         struct sock_iocb *siocb = kiocb_to_siocb(kiocb);
1276         struct sock *sk = sock->sk;
1277         struct unix_sock *u = unix_sk(sk);
1278         struct sockaddr_un *sunaddr=msg->msg_name;
1279         struct sock *other = NULL;
1280         int namelen = 0; /* fake GCC */
1281         int err;
1282         unsigned hash;
1283         struct sk_buff *skb;
1284         long timeo;
1285         struct scm_cookie tmp_scm;
1286
1287         if (NULL == siocb->scm)
1288                 siocb->scm = &tmp_scm;
1289         err = scm_send(sock, msg, siocb->scm);
1290         if (err < 0)
1291                 return err;
1292
1293         err = -EOPNOTSUPP;
1294         if (msg->msg_flags&MSG_OOB)
1295                 goto out;
1296
1297         if (msg->msg_namelen) {
1298                 err = unix_mkname(sunaddr, msg->msg_namelen, &hash);
1299                 if (err < 0)
1300                         goto out;
1301                 namelen = err;
1302         } else {
1303                 sunaddr = NULL;
1304                 err = -ENOTCONN;
1305                 other = unix_peer_get(sk);
1306                 if (!other)
1307                         goto out;
1308         }
1309
1310         if (test_bit(SOCK_PASS_CRED, &sock->flags)
1311                 && !u->addr && (err = unix_autobind(sock)) != 0)
1312                 goto out;
1313
1314         err = -EMSGSIZE;
1315         if (len > sk->sk_sndbuf - 32)
1316                 goto out;
1317
1318         skb = sock_alloc_send_skb(sk, len, msg->msg_flags&MSG_DONTWAIT, &err);
1319         if (skb==NULL)
1320                 goto out;
1321
             /* Stamp the sender's credentials and any passed files onto the skb. */
1322         memcpy(UNIXCREDS(skb), &siocb->scm->creds, sizeof(struct ucred));
1323         if (siocb->scm->fp)
1324                 unix_attach_fds(siocb->scm, skb);
1325
1326         skb->h.raw = skb->data;
1327         err = memcpy_fromiovec(skb_put(skb,len), msg->msg_iov, len);
1328         if (err)
1329                 goto out_free;
1330
1331         timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
1332
     /* Re-entered after waiting for queue space or after dropping a dead destination. */
1333 restart:
1334         if (!other) {
1335                 err = -ECONNRESET;
1336                 if (sunaddr == NULL)
1337                         goto out_free;
1338
1339                 other = unix_find_other(sunaddr, namelen, sk->sk_type,
1340                                         hash, &err);
1341                 if (other==NULL)
1342                         goto out_free;
1343         }
1344
1345         unix_state_rlock(other);
1346         err = -EPERM;
1347         if (!unix_may_send(sk, other))
1348                 goto out_unlock;
1349
1350         if (sock_flag(other, SOCK_DEAD)) {
1351                 /*
1352                  *      Check with 1003.1g - what should
1353                  *      datagram error
1354                  */
1355                 unix_state_runlock(other);
                     /* Drops the reference taken for this send attempt. */
1356                 sock_put(other);
1357
1358                 err = 0;
1359                 unix_state_wlock(sk);
1360                 if (unix_peer(sk) == other) {
1361                         unix_peer(sk)=NULL;
1362                         unix_state_wunlock(sk);
1363
1364                         unix_dgram_disconnected(sk, other);
                             /* NOTE(review): presumably the reference the peer pointer held - confirm. */
1365                         sock_put(other);
1366                         err = -ECONNREFUSED;
1367                 } else {
1368                         unix_state_wunlock(sk);
1369                 }
1370
1371                 other = NULL;
1372                 if (err)
1373                         goto out_free;
1374                 goto restart;
1375         }
1376
1377         err = -EPIPE;
1378         if (other->sk_shutdown & RCV_SHUTDOWN)
1379                 goto out_unlock;
1380
1381         err = security_unix_may_send(sk->sk_socket, other->sk_socket);
1382         if (err)
1383                 goto out_unlock;
1384
             /* The backlog limit is not applied when other's peer is this socket. */
1385         if (unix_peer(other) != sk &&
1386             (skb_queue_len(&other->sk_receive_queue) >
1387              other->sk_max_ack_backlog)) {
1388                 if (!timeo) {
1389                         err = -EAGAIN;
1390                         goto out_unlock;
1391                 }
1392
                     /* unix_wait_for_peer() releases the read lock on other. */
1393                 timeo = unix_wait_for_peer(other, timeo);
1394
1395                 err = sock_intr_errno(timeo);
1396                 if (signal_pending(current))
1397                         goto out_free;
1398
1399                 goto restart;
1400         }
1401
1402         skb_queue_tail(&other->sk_receive_queue, skb);
1403         unix_state_runlock(other);
1404         other->sk_data_ready(other, len);
1405         sock_put(other);
1406         scm_destroy(siocb->scm);
1407         return len;
1408
1409 out_unlock:
1410         unix_state_runlock(other);
1411 out_free:
1412         kfree_skb(skb);
1413 out:
1414         if (other)
1415                 sock_put(other);
1416         scm_destroy(siocb->scm);
1417         return err;
1418 }
1419
1420                 
/*
 * unix_stream_sendmsg - send len bytes to the connected peer.
 *
 * The payload is split into chunks of at most sk_sndbuf / 2 - 64 bytes,
 * further capped by SKB_MAX_ALLOC and by the tailroom of the skb actually
 * obtained.  Each chunk is stamped with the sender's credentials and queued
 * on the peer's receive queue, followed by the peer's sk_data_ready().
 *
 * Supplying a destination address is rejected: -EISCONN when the socket is
 * TCP_ESTABLISHED, -EOPNOTSUPP otherwise.  Other errors set here are
 * -EOPNOTSUPP (MSG_OOB), -ENOTCONN (no peer) and -EPIPE (we are shut down
 * for sending, or the peer is dead / shut down for receiving); SIGPIPE is
 * raised for the latter only if nothing was sent and MSG_NOSIGNAL is clear.
 *
 * Returns the number of bytes queued if that is non-zero, otherwise err.
 *
 * NOTE(review): sunaddr is assigned but never read in this function.
 */
1421 static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock,
1422                                struct msghdr *msg, size_t len)
1423 {
1424         struct sock_iocb *siocb = kiocb_to_siocb(kiocb);
1425         struct sock *sk = sock->sk;
1426         struct sock *other = NULL;
1427         struct sockaddr_un *sunaddr=msg->msg_name;
1428         int err,size;
1429         struct sk_buff *skb;
1430         int sent=0;
1431         struct scm_cookie tmp_scm;
1432
1433         if (NULL == siocb->scm)
1434                 siocb->scm = &tmp_scm;
1435         err = scm_send(sock, msg, siocb->scm);
1436         if (err < 0)
1437                 return err;
1438
1439         err = -EOPNOTSUPP;
1440         if (msg->msg_flags&MSG_OOB)
1441                 goto out_err;
1442
1443         if (msg->msg_namelen) {
1444                 err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
1445                 goto out_err;
1446         } else {
1447                 sunaddr = NULL;
1448                 err = -ENOTCONN;
1449                 other = unix_peer_get(sk);
1450                 if (!other)
1451                         goto out_err;
1452         }
1453
1454         if (sk->sk_shutdown & SEND_SHUTDOWN)
1455                 goto pipe_err;
1456
1457         while(sent < len)
1458         {
1459                 /*
1460                  *      Optimisation for the fact that under 0.01% of X messages typically
1461                  *      need breaking up.
1462                  */
1463
1464                 size=len-sent;
1465
1466                 /* Keep two messages in the pipe so it schedules better */
1467                 if (size > sk->sk_sndbuf / 2 - 64)
1468                         size = sk->sk_sndbuf / 2 - 64;
1469
1470                 if (size > SKB_MAX_ALLOC)
1471                         size = SKB_MAX_ALLOC;
1472                         
1473                 /*
1474                  *      Grab a buffer
1475                  */
1476                  
1477                 skb=sock_alloc_send_skb(sk,size,msg->msg_flags&MSG_DONTWAIT, &err);
1478
1479                 if (skb==NULL)
1480                         goto out_err;
1481
1482                 /*
1483                  *      If you pass two values to the sock_alloc_send_skb
1484                  *      it tries to grab the large buffer with GFP_NOFS
1485                  *      (which can fail easily), and if it fails grab the
1486                  *      fallback size buffer which is under a page and will
1487                  *      succeed. [Alan]
1488                  */
1489                 size = min_t(int, size, skb_tailroom(skb));
1490
1491                 memcpy(UNIXCREDS(skb), &siocb->scm->creds, sizeof(struct ucred));
                     /* unix_attach_fds() clears scm->fp, so only the first chunk carries files. */
1492                 if (siocb->scm->fp)
1493                         unix_attach_fds(siocb->scm, skb);
1494
1495                 if ((err = memcpy_fromiovec(skb_put(skb,size), msg->msg_iov, size)) != 0) {
1496                         kfree_skb(skb);
1497                         goto out_err;
1498                 }
1499
1500                 unix_state_rlock(other);
1501
1502                 if (sock_flag(other, SOCK_DEAD) ||
1503                     (other->sk_shutdown & RCV_SHUTDOWN))
1504                         goto pipe_err_free;
1505
1506                 skb_queue_tail(&other->sk_receive_queue, skb);
1507                 unix_state_runlock(other);
1508                 other->sk_data_ready(other, size);
1509                 sent+=size;
1510         }
1511         sock_put(other);
1512
1513         scm_destroy(siocb->scm);
1514         siocb->scm = NULL;
1515
1516         return sent;
1517
     /* Error exits; each label falls through into the ones below it. */
1518 pipe_err_free:
1519         unix_state_runlock(other);
1520         kfree_skb(skb);
1521 pipe_err:
1522         if (sent==0 && !(msg->msg_flags&MSG_NOSIGNAL))
1523                 send_sig(SIGPIPE,current,0);
1524         err = -EPIPE;
1525 out_err:
1526         if (other)
1527                 sock_put(other);
1528         scm_destroy(siocb->scm);
1529         siocb->scm = NULL;
             /* A partial send reports the byte count rather than the error. */
1530         return sent ? : err;
1531 }
1532
1533 static void unix_copy_addr(struct msghdr *msg, struct sock *sk)
1534 {
1535         struct unix_sock *u = unix_sk(sk);
1536
1537         msg->msg_namelen = 0;
1538         if (u->addr) {
1539                 msg->msg_namelen = u->addr->len;
1540                 memcpy(msg->msg_name, u->addr->name, u->addr->len);
1541         }
1542 }
1543
/*
 * unix_dgram_recvmsg - receive one datagram.
 *
 * Takes one skb via skb_recv_datagram(), wakes anybody sleeping on this
 * socket's peer_wait queue, optionally copies the sender's address, and
 * copies at most size bytes of payload; MSG_TRUNC is set in msg->msg_flags
 * when the datagram is longer than size.  Sender credentials are placed in
 * siocb->scm (a zeroed stack cookie is used when the iocb has none).
 * Passed files are detached from the skb on a normal read and duplicated
 * with scm_fp_dup() under MSG_PEEK; scm_recv() then delivers the control
 * data.
 *
 * Returns the number of bytes copied, -EOPNOTSUPP for MSG_OOB, or the error
 * from skb_recv_datagram() / skb_copy_datagram_iovec().
 */
1544 static int unix_dgram_recvmsg(struct kiocb *iocb, struct socket *sock,
1545                               struct msghdr *msg, size_t size,
1546                               int flags)
1547 {
1548         struct sock_iocb *siocb = kiocb_to_siocb(iocb);
1549         struct scm_cookie tmp_scm;
1550         struct sock *sk = sock->sk;
1551         struct unix_sock *u = unix_sk(sk);
1552         int noblock = flags & MSG_DONTWAIT;
1553         struct sk_buff *skb;
1554         int err;
1555
1556         err = -EOPNOTSUPP;
1557         if (flags&MSG_OOB)
1558                 goto out;
1559
1560         msg->msg_namelen = 0;
1561
1562         skb = skb_recv_datagram(sk, flags, noblock, &err);
1563         if (!skb)
1564                 goto out;
1565
             /* Wake tasks sleeping on our peer_wait queue (see unix_wait_for_peer()). */
1566         wake_up_interruptible(&u->peer_wait);
1567
1568         if (msg->msg_name)
1569                 unix_copy_addr(msg, skb->sk);
1570
1571         if (size > skb->len)
1572                 size = skb->len;
1573         else if (size < skb->len)
1574                 msg->msg_flags |= MSG_TRUNC;
1575
1576         err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, size);
1577         if (err)
1578                 goto out_free;
1579
1580         if (!siocb->scm) {
1581                 siocb->scm = &tmp_scm;
1582                 memset(&tmp_scm, 0, sizeof(tmp_scm));
1583         }
1584         siocb->scm->creds = *UNIXCREDS(skb);
1585
1586         if (!(flags & MSG_PEEK))
1587         {
1588                 if (UNIXCB(skb).fp)
1589                         unix_detach_fds(siocb->scm, skb);
1590         }
1591         else 
1592         {
1593                 /* It is questionable: on PEEK we could:
1594                    - do not return fds - good, but too simple 8)
1595                    - return fds, and do not return them on read (old strategy,
1596                      apparently wrong)
1597                    - clone fds (I chose it for now, it is the most universal
1598                      solution)
1599                 
1600                    POSIX 1003.1g does not actually define this clearly
1601                    at all. POSIX 1003.1g doesn't define a lot of things
1602                    clearly however!                  
1603                    
1604                 */
1605                 if (UNIXCB(skb).fp)
1606                         siocb->scm->fp = scm_fp_dup(UNIXCB(skb).fp);
1607         }
1608         err = size;
1609
1610         scm_recv(sock, msg, siocb->scm, flags);
1611
1612 out_free:
1613         skb_free_datagram(sk,skb);
1614 out:
1615         return err;
1616 }
1617
1618 /*
1619  *      Sleep until data has arrived. But check for races..
1620  */
1621  
1622 static long unix_stream_data_wait(struct sock * sk, long timeo)
1623 {
1624         DEFINE_WAIT(wait);
1625
1626         unix_state_rlock(sk);
1627
1628         for (;;) {
1629                 prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
1630
1631                 if (skb_queue_len(&sk->sk_receive_queue) ||
1632                     sk->sk_err ||
1633                     (sk->sk_shutdown & RCV_SHUTDOWN) ||
1634                     signal_pending(current) ||
1635                     !timeo)
1636                         break;
1637
1638                 set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1639                 unix_state_runlock(sk);
1640                 timeo = schedule_timeout(timeo);
1641                 unix_state_rlock(sk);
1642                 clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1643         }
1644
1645         finish_wait(sk->sk_sleep, &wait);
1646         unix_state_runlock(sk);
1647         return timeo;
1648 }
1649
1650
1651
/*
 * unix_stream_recvmsg - read up to size bytes from a connected stream socket.
 *
 * skbs are dequeued one at a time under u->readsem.  The loop stops when
 * size bytes were copied, when the queue is empty and at least `target`
 * bytes (from sock_rcvlowat()) were copied, when the next skb carries
 * different credentials from the first one, after an skb that carried
 * passed files, or after the first skb under MSG_PEEK.  A partially
 * consumed or unconsumed skb is put back at the head of the queue.
 *
 * When the queue is empty and more data is needed, readsem is released
 * around unix_stream_data_wait(); if a signal is pending afterwards the
 * function leaves through `out` with readsem not held and without calling
 * scm_recv().
 *
 * Returns the number of bytes copied if non-zero (or -EFAULT stored in
 * `copied` when the very first copy to user space fails), otherwise err:
 * -EINVAL (socket not TCP_ESTABLISHED), -EOPNOTSUPP (MSG_OOB), the pending
 * socket error, -EAGAIN (no data and no time left), or the value of
 * sock_intr_errno().
 */
1652 static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock,
1653                                struct msghdr *msg, size_t size,
1654                                int flags)
1655 {
1656         struct sock_iocb *siocb = kiocb_to_siocb(iocb);
1657         struct scm_cookie tmp_scm;
1658         struct sock *sk = sock->sk;
1659         struct unix_sock *u = unix_sk(sk);
1660         struct sockaddr_un *sunaddr=msg->msg_name;
1661         int copied = 0;
1662         int check_creds = 0;
1663         int target;
1664         int err = 0;
1665         long timeo;
1666
1667         err = -EINVAL;
1668         if (sk->sk_state != TCP_ESTABLISHED)
1669                 goto out;
1670
1671         err = -EOPNOTSUPP;
1672         if (flags&MSG_OOB)
1673                 goto out;
1674
1675         target = sock_rcvlowat(sk, flags&MSG_WAITALL, size);
1676         timeo = sock_rcvtimeo(sk, flags&MSG_DONTWAIT);
1677
1678         msg->msg_namelen = 0;
1679
1680         /* Take readsem to prevent queue disordering
1681          * while we sleep copying data out to the caller
1682          */
1683
1684         if (!siocb->scm) {
1685                 siocb->scm = &tmp_scm;
1686                 memset(&tmp_scm, 0, sizeof(tmp_scm));
1687         }
1688
1689         down(&u->readsem);
1690
1691         do
1692         {
1693                 int chunk;
1694                 struct sk_buff *skb;
1695
1696                 skb = skb_dequeue(&sk->sk_receive_queue);
1697                 if (skb==NULL)
1698                 {
1699                         if (copied >= target)
1700                                 break;
1701
1702                         /*
1703                          *      POSIX 1003.1g mandates this order.
1704                          */
1705                          
1706                         if ((err = sock_error(sk)) != 0)
1707                                 break;
1708                         if (sk->sk_shutdown & RCV_SHUTDOWN)
1709                                 break;
1710                         err = -EAGAIN;
1711                         if (!timeo)
1712                                 break;
                             /* Do not hold readsem across the sleep. */
1713                         up(&u->readsem);
1714
1715                         timeo = unix_stream_data_wait(sk, timeo);
1716
1717                         if (signal_pending(current)) {
1718                                 err = sock_intr_errno(timeo);
                                     /* readsem is not held here, hence `out` rather than a break. */
1719                                 goto out;
1720                         }
1721                         down(&u->readsem);
1722                         continue;
1723                 }
1724
1725                 if (check_creds) {
1726                         /* Never glue messages from different writers */
1727                         if (memcmp(UNIXCREDS(skb), &siocb->scm->creds, sizeof(siocb->scm->creds)) != 0) {
1728                                 skb_queue_head(&sk->sk_receive_queue, skb);
1729                                 break;
1730                         }
1731                 } else {
1732                         /* Copy credentials */
1733                         siocb->scm->creds = *UNIXCREDS(skb);
1734                         check_creds = 1;
1735                 }
1736
1737                 /* Copy address just once */
1738                 if (sunaddr)
1739                 {
1740                         unix_copy_addr(msg, skb->sk);
1741                         sunaddr = NULL;
1742                 }
1743
1744                 chunk = min_t(unsigned int, skb->len, size);
1745                 if (memcpy_toiovec(msg->msg_iov, skb->data, chunk)) {
                             /* Copy failed: requeue the skb untouched. */
1746                         skb_queue_head(&sk->sk_receive_queue, skb);
1747                         if (copied == 0)
1748                                 copied = -EFAULT;
1749                         break;
1750                 }
1751                 copied += chunk;
1752                 size -= chunk;
1753
1754                 /* Mark read part of skb as used */
1755                 if (!(flags & MSG_PEEK))
1756                 {
1757                         skb_pull(skb, chunk);
1758
1759                         if (UNIXCB(skb).fp)
1760                                 unix_detach_fds(siocb->scm, skb);
1761
1762                         /* put the skb back if we didn't use it up.. */
1763                         if (skb->len)
1764                         {
1765                                 skb_queue_head(&sk->sk_receive_queue, skb);
1766                                 break;
1767                         }
1768
1769                         kfree_skb(skb);
1770
                             /* Stop after an skb that delivered passed files. */
1771                         if (siocb->scm->fp)
1772                                 break;
1773                 }
1774                 else
1775                 {
1776                         /* It is questionable, see note in unix_dgram_recvmsg.
1777                          */
1778                         if (UNIXCB(skb).fp)
1779                                 siocb->scm->fp = scm_fp_dup(UNIXCB(skb).fp);
1780
1781                         /* put message back and return */
1782                         skb_queue_head(&sk->sk_receive_queue, skb);
1783                         break;
1784                 }
1785         } while (size);
1786
1787         up(&u->readsem);
1788         scm_recv(sock, msg, siocb->scm, flags);
1789 out:
             /* A non-zero `copied` (byte count or -EFAULT) takes precedence over err. */
1790         return copied ? : err;
1791 }
1792
1793 static int unix_shutdown(struct socket *sock, int mode)
1794 {
1795         struct sock *sk = sock->sk;
1796         struct sock *other;
1797
1798         mode = (mode+1)&(RCV_SHUTDOWN|SEND_SHUTDOWN);
1799
1800         if (mode) {
1801                 unix_state_wlock(sk);
1802                 sk->sk_shutdown |= mode;
1803                 other=unix_peer(sk);
1804                 if (other)
1805                         sock_hold(other);
1806                 unix_state_wunlock(sk);
1807                 sk->sk_state_change(sk);
1808
1809                 if (other &&
1810                         (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) {
1811
1812                         int peer_mode = 0;
1813
1814                         if (mode&RCV_SHUTDOWN)
1815                                 peer_mode |= SEND_SHUTDOWN;
1816                         if (mode&SEND_SHUTDOWN)
1817                                 peer_mode |= RCV_SHUTDOWN;
1818                         unix_state_wlock(other);
1819                         other->sk_shutdown |= peer_mode;
1820                         unix_state_wunlock(other);
1821                         other->sk_state_change(other);
1822                         read_lock(&other->sk_callback_lock);
1823                         if (peer_mode == SHUTDOWN_MASK)
1824                                 sk_wake_async(other,1,POLL_HUP);
1825                         else if (peer_mode & RCV_SHUTDOWN)
1826                                 sk_wake_async(other,1,POLL_IN);
1827                         read_unlock(&other->sk_callback_lock);
1828                 }
1829                 if (other)
1830                         sock_put(other);
1831         }
1832         return 0;
1833 }
1834
1835 static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
1836 {
1837         struct sock *sk = sock->sk;
1838         long amount=0;
1839         int err;
1840
1841         switch(cmd)
1842         {
1843                 case SIOCOUTQ:
1844                         amount = atomic_read(&sk->sk_wmem_alloc);
1845                         err = put_user(amount, (int __user *)arg);
1846                         break;
1847                 case SIOCINQ:
1848                 {
1849                         struct sk_buff *skb;
1850                         if (sk->sk_state == TCP_LISTEN) {
1851                                 err = -EINVAL;
1852                                 break;
1853                         }
1854
1855                         spin_lock(&sk->sk_receive_queue.lock);
1856                         skb = skb_peek(&sk->sk_receive_queue);
1857                         if (skb)
1858                                 amount=skb->len;
1859                         spin_unlock(&sk->sk_receive_queue.lock);
1860                         err = put_user(amount, (int __user *)arg);
1861                         break;
1862                 }
1863
1864                 default:
1865                         err = dev_ioctl(cmd, (void __user *)arg);
1866                         break;
1867         }
1868         return err;
1869 }
1870
1871 static unsigned int unix_poll(struct file * file, struct socket *sock, poll_table *wait)
1872 {
1873         struct sock *sk = sock->sk;
1874         unsigned int mask;
1875
1876         poll_wait(file, sk->sk_sleep, wait);
1877         mask = 0;
1878
1879         /* exceptional events? */
1880         if (sk->sk_err)
1881                 mask |= POLLERR;
1882         if (sk->sk_shutdown == SHUTDOWN_MASK)
1883                 mask |= POLLHUP;
1884
1885         /* readable? */
1886         if (!skb_queue_empty(&sk->sk_receive_queue) ||
1887             (sk->sk_shutdown & RCV_SHUTDOWN))
1888                 mask |= POLLIN | POLLRDNORM;
1889
1890         /* Connection-based need to check for termination and startup */
1891         if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) && sk->sk_state == TCP_CLOSE)
1892                 mask |= POLLHUP;
1893
1894         /*
1895          * we set writable also when the other side has shut down the
1896          * connection. This prevents stuck sockets.
1897          */
1898         if (unix_writable(sk))
1899                 mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
1900
1901         return mask;
1902 }
1903
1904
1905 #ifdef CONFIG_PROC_FS
1906 static struct sock *unix_seq_idx(int *iter, loff_t pos)
1907 {
1908         loff_t off = 0;
1909         struct sock *s;
1910
1911         for (s = first_unix_socket(iter); s; s = next_unix_socket(iter, s)) {
1912                 if (off == pos) 
1913                         return s;
1914                 ++off;
1915         }
1916         return NULL;
1917 }
1918
1919
1920 static void *unix_seq_start(struct seq_file *seq, loff_t *pos)
1921 {
1922         read_lock(&unix_table_lock);
1923         return *pos ? unix_seq_idx(seq->private, *pos - 1) : ((void *) 1);
1924 }
1925
1926 static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1927 {
1928         ++*pos;
1929
1930         if (v == (void *)1) 
1931                 return first_unix_socket(seq->private);
1932         return next_unix_socket(seq->private, v);
1933 }
1934
1935 static void unix_seq_stop(struct seq_file *seq, void *v)
1936 {
1937         read_unlock(&unix_table_lock);
1938 }
1939
1940 static int unix_seq_show(struct seq_file *seq, void *v)
1941 {
1942         
1943         if (v == (void *)1)
1944                 seq_puts(seq, "Num       RefCount Protocol Flags    Type St "
1945                          "Inode Path\n");
1946         else {
1947                 struct sock *s = v;
1948                 struct unix_sock *u = unix_sk(s);
1949                 unix_state_rlock(s);
1950
1951                 seq_printf(seq, "%p: %08X %08X %08X %04X %02X %5lu",
1952                         s,
1953                         atomic_read(&s->sk_refcnt),
1954                         0,
1955                         s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0,
1956                         s->sk_type,
1957                         s->sk_socket ?
1958                         (s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) :
1959                         (s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING),
1960                         sock_i_ino(s));
1961
1962                 if (u->addr) {
1963                         int i, len;
1964                         seq_putc(seq, ' ');
1965
1966                         i = 0;
1967                         len = u->addr->len - sizeof(short);
1968                         if (!UNIX_ABSTRACT(s))
1969                                 len--;
1970                         else {
1971                                 seq_putc(seq, '@');
1972                                 i++;
1973                         }
1974                         for ( ; i < len; i++)
1975                                 seq_putc(seq, u->addr->name->sun_path[i]);
1976                 }
1977                 unix_state_runlock(s);
1978                 seq_putc(seq, '\n');
1979         }
1980
1981         return 0;
1982 }
1983
1984 static struct seq_operations unix_seq_ops = {
1985         .start  = unix_seq_start,
1986         .next   = unix_seq_next,
1987         .stop   = unix_seq_stop,
1988         .show   = unix_seq_show,
1989 };
1990
1991
1992 static int unix_seq_open(struct inode *inode, struct file *file)
1993 {
1994         struct seq_file *seq;
1995         int rc = -ENOMEM;
1996         int *iter = kmalloc(sizeof(int), GFP_KERNEL);
1997
1998         if (!iter)
1999                 goto out;
2000
2001         rc = seq_open(file, &unix_seq_ops);
2002         if (rc)
2003                 goto out_kfree;
2004
2005         seq          = file->private_data;
2006         seq->private = iter;
2007         *iter = 0;
2008 out:
2009         return rc;
2010 out_kfree:
2011         kfree(iter);
2012         goto out;
2013 }
2014
2015 static struct file_operations unix_seq_fops = {
2016         .owner          = THIS_MODULE,
2017         .open           = unix_seq_open,
2018         .read           = seq_read,
2019         .llseek         = seq_lseek,
2020         .release        = seq_release_private,
2021 };
2022
2023 #endif
2024
2025 static struct net_proto_family unix_family_ops = {
2026         .family = PF_UNIX,
2027         .create = unix_create,
2028         .owner  = THIS_MODULE,
2029 };
2030
/*
 * sysctl hooks: defined elsewhere when CONFIG_SYSCTL is set, empty
 * inline stubs otherwise, so callers need no #ifdef of their own.
 */
#ifdef CONFIG_SYSCTL
extern void unix_sysctl_register(void);
extern void unix_sysctl_unregister(void);
#else
static inline void unix_sysctl_register(void)
{
}

static inline void unix_sysctl_unregister(void)
{
}
#endif
2038
2039 static int __init af_unix_init(void)
2040 {
2041         struct sk_buff *dummy_skb;
2042
2043         if (sizeof(struct unix_skb_parms) > sizeof(dummy_skb->cb)) {
2044                 printk(KERN_CRIT "%s: panic\n", __FUNCTION__);
2045                 return -1;
2046         }
2047         /* allocate our sock slab cache */
2048         unix_sk_cachep = kmem_cache_create("unix_sock",
2049                                            sizeof(struct unix_sock), 0,
2050                                            SLAB_HWCACHE_ALIGN, NULL, NULL);
2051         if (!unix_sk_cachep)
2052                 printk(KERN_CRIT
2053                         "af_unix_init: Cannot create unix_sock SLAB cache!\n");
2054
2055         sock_register(&unix_family_ops);
2056 #ifdef CONFIG_PROC_FS
2057         proc_net_fops_create("unix", 0, &unix_seq_fops);
2058 #endif
2059         unix_sysctl_register();
2060         return 0;
2061 }
2062
2063 static void __exit af_unix_exit(void)
2064 {
2065         sock_unregister(PF_UNIX);
2066         unix_sysctl_unregister();
2067         proc_net_remove("unix");
2068         kmem_cache_destroy(unix_sk_cachep);
2069 }
2070
2071 module_init(af_unix_init);
2072 module_exit(af_unix_exit);
2073
2074 MODULE_LICENSE("GPL");
2075 MODULE_ALIAS_NETPROTO(PF_UNIX);