This commit was manufactured by cvs2svn to create tag 'before-xenU'.
[linux-2.6.git] / net / unix / af_unix.c
1 /*
2  * NET4:        Implementation of BSD Unix domain sockets.
3  *
4  * Authors:     Alan Cox, <alan.cox@linux.org>
5  *
6  *              This program is free software; you can redistribute it and/or
7  *              modify it under the terms of the GNU General Public License
8  *              as published by the Free Software Foundation; either version
9  *              2 of the License, or (at your option) any later version.
10  *
11  * Version:     $Id: af_unix.c,v 1.133 2002/02/08 03:57:19 davem Exp $
12  *
13  * Fixes:
14  *              Linus Torvalds  :       Assorted bug cures.
15  *              Niibe Yutaka    :       async I/O support.
16  *              Carsten Paeth   :       PF_UNIX check, address fixes.
17  *              Alan Cox        :       Limit size of allocated blocks.
18  *              Alan Cox        :       Fixed the stupid socketpair bug.
19  *              Alan Cox        :       BSD compatibility fine tuning.
20  *              Alan Cox        :       Fixed a bug in connect when interrupted.
21  *              Alan Cox        :       Sorted out a proper draft version of
22  *                                      file descriptor passing hacked up from
23  *                                      Mike Shaver's work.
24  *              Marty Leisner   :       Fixes to fd passing
25  *              Nick Nevin      :       recvmsg bugfix.
26  *              Alan Cox        :       Started proper garbage collector
27  *              Heiko EiBfeldt  :       Missing verify_area check
28  *              Alan Cox        :       Started POSIXisms
29  *              Andreas Schwab  :       Replace inode by dentry for proper
30  *                                      reference counting
31  *              Kirk Petersen   :       Made this a module
32  *          Christoph Rohland   :       Elegant non-blocking accept/connect algorithm.
33  *                                      Lots of bug fixes.
34  *           Alexey Kuznetsov   :       Repaired (I hope) bugs introduced
35  *                                      by above two patches.
36  *           Andrea Arcangeli   :       If possible we block in connect(2)
37  *                                      if the max backlog of the listen socket
38  *                                      has been reached. This won't break
39  *                                      old apps and it will avoid a huge number
40  *                                      of hashed socks (this is for unix_gc()
41  *                                      performance reasons).
42  *                                      Security fix that limits the max
43  *                                      number of socks to 2*max_files and
44  *                                      the number of skb queueable in the
45  *                                      dgram receiver.
46  *              Artur Skawina   :       Hash function optimizations
47  *           Alexey Kuznetsov   :       Full scale SMP. Lot of bugs are introduced 8)
48  *            Malcolm Beattie   :       Set peercred for socketpair
49  *           Michal Ostrowski   :       Module initialization cleanup.
50  *           Arnaldo C. Melo    :       Remove MOD_{INC,DEC}_USE_COUNT,
51  *                                      the core infrastructure is doing that
52  *                                      for all net proto families now (2.5.69+)
53  *
54  *
55  * Known differences from reference BSD that was tested:
56  *
57  *      [TO FIX]
58  *      ECONNREFUSED is not returned from one end of a connected() socket to the
59  *              other the moment one end closes.
60  *      fstat() doesn't return st_dev=0, and give the blksize as high water mark
61  *              and a fake inode identifier (nor the BSD first socket fstat twice bug).
62  *      [NOT TO FIX]
63  *      accept() returns a path name even if the connecting socket has closed
64  *              in the meantime (BSD loses the path and gives up).
65  *      accept() returns 0 length path for an unbound connector. BSD returns 16
66  *              and a null first byte in the path (but not for gethost/peername - BSD bug ??)
67  *      socketpair(...SOCK_RAW..) doesn't panic the kernel.
68  *      BSD af_unix apparently has connect forgetting to block properly.
69  *              (need to check this with the POSIX spec in detail)
70  *
71  * Differences from 2.0.0-11-... (ANK)
72  *      Bug fixes and improvements.
73  *              - client shutdown killed server socket.
74  *              - removed all useless cli/sti pairs.
75  *
76  *      Semantic changes/extensions.
77  *              - generic control message passing.
78  *              - SCM_CREDENTIALS control message.
79  *              - "Abstract" (not FS based) socket bindings.
80  *                Abstract names are sequences of bytes (not zero terminated)
81  *                started by 0, so that this name space does not intersect
82  *                with BSD names.
83  */
84
85 #include <linux/module.h>
86 #include <linux/config.h>
87 #include <linux/kernel.h>
88 #include <linux/signal.h>
89 #include <linux/sched.h>
90 #include <linux/errno.h>
91 #include <linux/string.h>
92 #include <linux/stat.h>
93 #include <linux/dcache.h>
94 #include <linux/namei.h>
95 #include <linux/socket.h>
96 #include <linux/un.h>
97 #include <linux/fcntl.h>
98 #include <linux/termios.h>
99 #include <linux/sockios.h>
100 #include <linux/net.h>
101 #include <linux/in.h>
102 #include <linux/fs.h>
103 #include <linux/slab.h>
104 #include <asm/uaccess.h>
105 #include <linux/skbuff.h>
106 #include <linux/netdevice.h>
107 #include <net/sock.h>
108 #include <linux/tcp.h>
109 #include <net/af_unix.h>
110 #include <linux/proc_fs.h>
111 #include <linux/seq_file.h>
112 #include <net/scm.h>
113 #include <linux/init.h>
114 #include <linux/poll.h>
115 #include <linux/smp_lock.h>
116 #include <linux/rtnetlink.h>
117 #include <linux/mount.h>
118 #include <net/checksum.h>
119 #include <linux/security.h>
120 #include <linux/vs_context.h>
121 #include <linux/vs_network.h>
122 #include <linux/vs_limit.h>
123
124 int sysctl_unix_max_dgram_qlen = 10;
125
126 struct hlist_head unix_socket_table[UNIX_HASH_SIZE + 1];
127 DEFINE_RWLOCK(unix_table_lock);
128 static atomic_t unix_nr_socks = ATOMIC_INIT(0);
129
130 #define unix_sockets_unbound    (&unix_socket_table[UNIX_HASH_SIZE])
131
132 #define UNIX_ABSTRACT(sk)       (unix_sk(sk)->addr->hash != UNIX_HASH_SIZE)
133
134 /*
135  *  SMP locking strategy:
136  *    hash table is protected with rwlock unix_table_lock
137  *    each socket state is protected by separate rwlock.
138  */
139
140 static inline unsigned unix_hash_fold(unsigned hash)
141 {
142         hash ^= hash>>16;
143         hash ^= hash>>8;
144         return hash&(UNIX_HASH_SIZE-1);
145 }
146
147 #define unix_peer(sk) (unix_sk(sk)->peer)
148
149 static inline int unix_our_peer(struct sock *sk, struct sock *osk)
150 {
151         return unix_peer(osk) == sk;
152 }
153
/*
 * sk may send to osk when osk is not connected to anybody, or when
 * osk is connected to sk itself.
 */
static inline int unix_may_send(struct sock *sk, struct sock *osk)
{
        if (!unix_peer(osk))
                return 1;
        return unix_our_peer(sk, osk);
}
158
159 static struct sock *unix_peer_get(struct sock *s)
160 {
161         struct sock *peer;
162
163         unix_state_rlock(s);
164         peer = unix_peer(s);
165         if (peer)
166                 sock_hold(peer);
167         unix_state_runlock(s);
168         return peer;
169 }
170
171 static inline void unix_release_addr(struct unix_address *addr)
172 {
173         if (atomic_dec_and_test(&addr->refcnt))
174                 kfree(addr);
175 }
176
177 /*
178  *      Check unix socket name:
179  *              - should be not zero length.
180  *              - if started by not zero, should be NULL terminated (FS object)
181  *              - if started by zero, it is abstract name.
182  */
183  
184 static int unix_mkname(struct sockaddr_un * sunaddr, int len, unsigned *hashp)
185 {
186         if (len <= sizeof(short) || len > sizeof(*sunaddr))
187                 return -EINVAL;
188         if (!sunaddr || sunaddr->sun_family != AF_UNIX)
189                 return -EINVAL;
190         if (sunaddr->sun_path[0]) {
191                 /*
192                  * This may look like an off by one error but it is a bit more
193                  * subtle. 108 is the longest valid AF_UNIX path for a binding.
194                  * sun_path[108] doesnt as such exist.  However in kernel space
195                  * we are guaranteed that it is a valid memory location in our
196                  * kernel address buffer.
197                  */
198                 ((char *)sunaddr)[len]=0;
199                 len = strlen(sunaddr->sun_path)+1+sizeof(short);
200                 return len;
201         }
202
203         *hashp = unix_hash_fold(csum_partial((char*)sunaddr, len, 0));
204         return len;
205 }
206
/*
 * Unlink sk from the hash chain it currently sits on.  No locking is
 * done here; the callers in this file hold unix_table_lock for writing.
 */
static void __unix_remove_socket(struct sock *sk)
{
        sk_del_node_init(sk);
}
211
/*
 * Link sk onto the given hash chain.  sk must not be hashed already
 * (checked with BUG_TRAP).  No locking is done here; the callers in
 * this file hold unix_table_lock for writing.
 */
static void __unix_insert_socket(struct hlist_head *list, struct sock *sk)
{
        BUG_TRAP(sk_unhashed(sk));

        sk_add_node(sk, list);
}
217
218 static inline void unix_remove_socket(struct sock *sk)
219 {
220         write_lock(&unix_table_lock);
221         __unix_remove_socket(sk);
222         write_unlock(&unix_table_lock);
223 }
224
225 static inline void unix_insert_socket(struct hlist_head *list, struct sock *sk)
226 {
227         write_lock(&unix_table_lock);
228         __unix_insert_socket(list, sk);
229         write_unlock(&unix_table_lock);
230 }
231
232 static struct sock *__unix_find_socket_byname(struct sockaddr_un *sunname,
233                                               int len, int type, unsigned hash)
234 {
235         struct sock *s;
236         struct hlist_node *node;
237
238         sk_for_each(s, node, &unix_socket_table[hash ^ type]) {
239                 struct unix_sock *u = unix_sk(s);
240
241                 if (u->addr->len == len &&
242                     !memcmp(u->addr->name, sunname, len))
243                         goto found;
244         }
245         s = NULL;
246 found:
247         return s;
248 }
249
250 static inline struct sock *unix_find_socket_byname(struct sockaddr_un *sunname,
251                                                    int len, int type,
252                                                    unsigned hash)
253 {
254         struct sock *s;
255
256         read_lock(&unix_table_lock);
257         s = __unix_find_socket_byname(sunname, len, type, hash);
258         if (s)
259                 sock_hold(s);
260         read_unlock(&unix_table_lock);
261         return s;
262 }
263
264 static struct sock *unix_find_socket_byinode(struct inode *i)
265 {
266         struct sock *s;
267         struct hlist_node *node;
268
269         read_lock(&unix_table_lock);
270         sk_for_each(s, node,
271                     &unix_socket_table[i->i_ino & (UNIX_HASH_SIZE - 1)]) {
272                 struct dentry *dentry = unix_sk(s)->dentry;
273
274                 if(dentry && dentry->d_inode == i)
275                 {
276                         sock_hold(s);
277                         goto found;
278                 }
279         }
280         s = NULL;
281 found:
282         read_unlock(&unix_table_lock);
283         return s;
284 }
285
286 static inline int unix_writable(struct sock *sk)
287 {
288         return (atomic_read(&sk->sk_wmem_alloc) << 2) <= sk->sk_sndbuf;
289 }
290
291 static void unix_write_space(struct sock *sk)
292 {
293         read_lock(&sk->sk_callback_lock);
294         if (unix_writable(sk)) {
295                 if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
296                         wake_up_interruptible(sk->sk_sleep);
297                 sk_wake_async(sk, 2, POLL_OUT);
298         }
299         read_unlock(&sk->sk_callback_lock);
300 }
301
302 /* When dgram socket disconnects (or changes its peer), we clear its receive
303  * queue of packets arrived from previous peer. First, it allows to do
304  * flow control based only on wmem_alloc; second, sk connected to peer
305  * may receive messages only from that peer. */
306 static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
307 {
308         if (skb_queue_len(&sk->sk_receive_queue)) {
309                 skb_queue_purge(&sk->sk_receive_queue);
310                 wake_up_interruptible_all(&unix_sk(sk)->peer_wait);
311
312                 /* If one link of bidirectional dgram pipe is disconnected,
313                  * we signal error. Messages are lost. Do not make this,
314                  * when peer was not connected to us.
315                  */
316                 if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) {
317                         other->sk_err = ECONNRESET;
318                         other->sk_error_report(other);
319                 }
320         }
321 }
322
323 static void unix_sock_destructor(struct sock *sk)
324 {
325         struct unix_sock *u = unix_sk(sk);
326
327         skb_queue_purge(&sk->sk_receive_queue);
328
329         BUG_TRAP(!atomic_read(&sk->sk_wmem_alloc));
330         BUG_TRAP(sk_unhashed(sk));
331         BUG_TRAP(!sk->sk_socket);
332         if (!sock_flag(sk, SOCK_DEAD)) {
333                 printk("Attempt to release alive unix socket: %p\n", sk);
334                 return;
335         }
336
337         if (u->addr)
338                 unix_release_addr(u->addr);
339
340         atomic_dec(&unix_nr_socks);
341 #ifdef UNIX_REFCNT_DEBUG
342         printk(KERN_DEBUG "UNIX %p is destroyed, %d are still alive.\n", sk, atomic_read(&unix_nr_socks));
343 #endif
344 }
345
346 static int unix_release_sock (struct sock *sk, int embrion)
347 {
348         struct unix_sock *u = unix_sk(sk);
349         struct dentry *dentry;
350         struct vfsmount *mnt;
351         struct sock *skpair;
352         struct sk_buff *skb;
353         int state;
354
355         unix_remove_socket(sk);
356
357         /* Clear state */
358         unix_state_wlock(sk);
359         sock_orphan(sk);
360         sk->sk_shutdown = SHUTDOWN_MASK;
361         dentry       = u->dentry;
362         u->dentry    = NULL;
363         mnt          = u->mnt;
364         u->mnt       = NULL;
365         state = sk->sk_state;
366         sk->sk_state = TCP_CLOSE;
367         unix_state_wunlock(sk);
368
369         wake_up_interruptible_all(&u->peer_wait);
370
371         skpair=unix_peer(sk);
372
373         if (skpair!=NULL) {
374                 if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) {
375                         unix_state_wlock(skpair);
376                         /* No more writes */
377                         skpair->sk_shutdown = SHUTDOWN_MASK;
378                         if (!skb_queue_empty(&sk->sk_receive_queue) || embrion)
379                                 skpair->sk_err = ECONNRESET;
380                         unix_state_wunlock(skpair);
381                         skpair->sk_state_change(skpair);
382                         read_lock(&skpair->sk_callback_lock);
383                         sk_wake_async(skpair,1,POLL_HUP);
384                         read_unlock(&skpair->sk_callback_lock);
385                 }
386                 sock_put(skpair); /* It may now die */
387                 unix_peer(sk) = NULL;
388         }
389
390         /* Try to flush out this socket. Throw out buffers at least */
391
392         while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
393                 if (state==TCP_LISTEN)
394                         unix_release_sock(skb->sk, 1);
395                 /* passed fds are erased in the kfree_skb hook        */
396                 kfree_skb(skb);
397         }
398
399         if (dentry) {
400                 dput(dentry);
401                 mntput(mnt);
402         }
403
404         vx_sock_dec(sk);
405         clr_vx_info(&sk->sk_vx_info);
406         clr_nx_info(&sk->sk_nx_info);
407         sock_put(sk);
408
409         /* ---- Socket is dead now and most probably destroyed ---- */
410
411         /*
412          * Fixme: BSD difference: In BSD all sockets connected to use get
413          *        ECONNRESET and we die on the spot. In Linux we behave
414          *        like files and pipes do and wait for the last
415          *        dereference.
416          *
417          * Can't we simply set sock->err?
418          *
419          *        What the above comment does talk about? --ANK(980817)
420          */
421
422         if (atomic_read(&unix_tot_inflight))
423                 unix_gc();              /* Garbage collect fds */       
424
425         return 0;
426 }
427
428 static int unix_listen(struct socket *sock, int backlog)
429 {
430         int err;
431         struct sock *sk = sock->sk;
432         struct unix_sock *u = unix_sk(sk);
433
434         err = -EOPNOTSUPP;
435         if (sock->type!=SOCK_STREAM && sock->type!=SOCK_SEQPACKET)
436                 goto out;                       /* Only stream/seqpacket sockets accept */
437         err = -EINVAL;
438         if (!u->addr)
439                 goto out;                       /* No listens on an unbound socket */
440         unix_state_wlock(sk);
441         if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN)
442                 goto out_unlock;
443         if (backlog > sk->sk_max_ack_backlog)
444                 wake_up_interruptible_all(&u->peer_wait);
445         sk->sk_max_ack_backlog  = backlog;
446         sk->sk_state            = TCP_LISTEN;
447         /* set credentials so connect can copy them */
448         sk->sk_peercred.pid     = current->tgid;
449         sk->sk_peercred.uid     = current->euid;
450         sk->sk_peercred.gid     = current->egid;
451         err = 0;
452
453 out_unlock:
454         unix_state_wunlock(sk);
455 out:
456         return err;
457 }
458
459 static int unix_release(struct socket *);
460 static int unix_bind(struct socket *, struct sockaddr *, int);
461 static int unix_stream_connect(struct socket *, struct sockaddr *,
462                                int addr_len, int flags);
463 static int unix_socketpair(struct socket *, struct socket *);
464 static int unix_accept(struct socket *, struct socket *, int);
465 static int unix_getname(struct socket *, struct sockaddr *, int *, int);
466 static unsigned int unix_poll(struct file *, struct socket *, poll_table *);
467 static int unix_ioctl(struct socket *, unsigned int, unsigned long);
468 static int unix_shutdown(struct socket *, int);
469 static int unix_stream_sendmsg(struct kiocb *, struct socket *,
470                                struct msghdr *, size_t);
471 static int unix_stream_recvmsg(struct kiocb *, struct socket *,
472                                struct msghdr *, size_t, int);
473 static int unix_dgram_sendmsg(struct kiocb *, struct socket *,
474                               struct msghdr *, size_t);
475 static int unix_dgram_recvmsg(struct kiocb *, struct socket *,
476                               struct msghdr *, size_t, int);
477 static int unix_dgram_connect(struct socket *, struct sockaddr *,
478                               int, int);
479 static int unix_seqpacket_sendmsg(struct kiocb *, struct socket *,
480                                   struct msghdr *, size_t);
481
482 static struct proto_ops unix_stream_ops = {
483         .family =       PF_UNIX,
484         .owner =        THIS_MODULE,
485         .release =      unix_release,
486         .bind =         unix_bind,
487         .connect =      unix_stream_connect,
488         .socketpair =   unix_socketpair,
489         .accept =       unix_accept,
490         .getname =      unix_getname,
491         .poll =         unix_poll,
492         .ioctl =        unix_ioctl,
493         .listen =       unix_listen,
494         .shutdown =     unix_shutdown,
495         .setsockopt =   sock_no_setsockopt,
496         .getsockopt =   sock_no_getsockopt,
497         .sendmsg =      unix_stream_sendmsg,
498         .recvmsg =      unix_stream_recvmsg,
499         .mmap =         sock_no_mmap,
500         .sendpage =     sock_no_sendpage,
501 };
502
503 static struct proto_ops unix_dgram_ops = {
504         .family =       PF_UNIX,
505         .owner =        THIS_MODULE,
506         .release =      unix_release,
507         .bind =         unix_bind,
508         .connect =      unix_dgram_connect,
509         .socketpair =   unix_socketpair,
510         .accept =       sock_no_accept,
511         .getname =      unix_getname,
512         .poll =         datagram_poll,
513         .ioctl =        unix_ioctl,
514         .listen =       sock_no_listen,
515         .shutdown =     unix_shutdown,
516         .setsockopt =   sock_no_setsockopt,
517         .getsockopt =   sock_no_getsockopt,
518         .sendmsg =      unix_dgram_sendmsg,
519         .recvmsg =      unix_dgram_recvmsg,
520         .mmap =         sock_no_mmap,
521         .sendpage =     sock_no_sendpage,
522 };
523
524 static struct proto_ops unix_seqpacket_ops = {
525         .family =       PF_UNIX,
526         .owner =        THIS_MODULE,
527         .release =      unix_release,
528         .bind =         unix_bind,
529         .connect =      unix_stream_connect,
530         .socketpair =   unix_socketpair,
531         .accept =       unix_accept,
532         .getname =      unix_getname,
533         .poll =         datagram_poll,
534         .ioctl =        unix_ioctl,
535         .listen =       unix_listen,
536         .shutdown =     unix_shutdown,
537         .setsockopt =   sock_no_setsockopt,
538         .getsockopt =   sock_no_getsockopt,
539         .sendmsg =      unix_seqpacket_sendmsg,
540         .recvmsg =      unix_dgram_recvmsg,
541         .mmap =         sock_no_mmap,
542         .sendpage =     sock_no_sendpage,
543 };
544
545 static struct proto unix_proto = {
546         .name     = "UNIX",
547         .owner    = THIS_MODULE,
548         .obj_size = sizeof(struct unix_sock),
549 };
550
551 static struct sock * unix_create1(struct socket *sock)
552 {
553         struct sock *sk = NULL;
554         struct unix_sock *u;
555
556         if (atomic_read(&unix_nr_socks) >= 2*files_stat.max_files)
557                 goto out;
558
559         sk = sk_alloc(PF_UNIX, GFP_KERNEL, &unix_proto, 1);
560         if (!sk)
561                 goto out;
562
563         atomic_inc(&unix_nr_socks);
564
565         sock_init_data(sock,sk);
566
567         set_vx_info(&sk->sk_vx_info, current->vx_info);
568         sk->sk_xid = vx_current_xid();
569         vx_sock_inc(sk);
570         set_nx_info(&sk->sk_nx_info, current->nx_info);
571
572         sk->sk_write_space      = unix_write_space;
573         sk->sk_max_ack_backlog  = sysctl_unix_max_dgram_qlen;
574         sk->sk_destruct         = unix_sock_destructor;
575         u         = unix_sk(sk);
576         u->dentry = NULL;
577         u->mnt    = NULL;
578         rwlock_init(&u->lock);
579         atomic_set(&u->inflight, sock ? 0 : -1);
580         init_MUTEX(&u->readsem); /* single task reading lock */
581         init_waitqueue_head(&u->peer_wait);
582         unix_insert_socket(unix_sockets_unbound, sk);
583 out:
584         return sk;
585 }
586
587 static int unix_create(struct socket *sock, int protocol)
588 {
589         if (protocol && protocol != PF_UNIX)
590                 return -EPROTONOSUPPORT;
591
592         sock->state = SS_UNCONNECTED;
593
594         switch (sock->type) {
595         case SOCK_STREAM:
596                 sock->ops = &unix_stream_ops;
597                 break;
598                 /*
599                  *      Believe it or not BSD has AF_UNIX, SOCK_RAW though
600                  *      nothing uses it.
601                  */
602         case SOCK_RAW:
603                 sock->type=SOCK_DGRAM;
604         case SOCK_DGRAM:
605                 sock->ops = &unix_dgram_ops;
606                 break;
607         case SOCK_SEQPACKET:
608                 sock->ops = &unix_seqpacket_ops;
609                 break;
610         default:
611                 return -ESOCKTNOSUPPORT;
612         }
613
614         return unix_create1(sock) ? 0 : -ENOMEM;
615 }
616
617 static int unix_release(struct socket *sock)
618 {
619         struct sock *sk = sock->sk;
620
621         if (!sk)
622                 return 0;
623
624         sock->sk = NULL;
625
626         return unix_release_sock (sk, 0);
627 }
628
629 static int unix_autobind(struct socket *sock)
630 {
631         struct sock *sk = sock->sk;
632         struct unix_sock *u = unix_sk(sk);
633         static u32 ordernum = 1;
634         struct unix_address * addr;
635         int err;
636
637         down(&u->readsem);
638
639         err = 0;
640         if (u->addr)
641                 goto out;
642
643         err = -ENOMEM;
644         addr = kmalloc(sizeof(*addr) + sizeof(short) + 16, GFP_KERNEL);
645         if (!addr)
646                 goto out;
647
648         memset(addr, 0, sizeof(*addr) + sizeof(short) + 16);
649         addr->name->sun_family = AF_UNIX;
650         atomic_set(&addr->refcnt, 1);
651
652 retry:
653         addr->len = sprintf(addr->name->sun_path+1, "%05x", ordernum) + 1 + sizeof(short);
654         addr->hash = unix_hash_fold(csum_partial((void*)addr->name, addr->len, 0));
655
656         write_lock(&unix_table_lock);
657         ordernum = (ordernum+1)&0xFFFFF;
658
659         if (__unix_find_socket_byname(addr->name, addr->len, sock->type,
660                                       addr->hash)) {
661                 write_unlock(&unix_table_lock);
662                 /* Sanity yield. It is unusual case, but yet... */
663                 if (!(ordernum&0xFF))
664                         yield();
665                 goto retry;
666         }
667         addr->hash ^= sk->sk_type;
668
669         __unix_remove_socket(sk);
670         u->addr = addr;
671         __unix_insert_socket(&unix_socket_table[addr->hash], sk);
672         write_unlock(&unix_table_lock);
673         err = 0;
674
675 out:    up(&u->readsem);
676         return err;
677 }
678
679 static struct sock *unix_find_other(struct sockaddr_un *sunname, int len,
680                                     int type, unsigned hash, int *error)
681 {
682         struct sock *u;
683         struct nameidata nd;
684         int err = 0;
685         
686         if (sunname->sun_path[0]) {
687                 err = path_lookup(sunname->sun_path, LOOKUP_FOLLOW, &nd);
688                 if (err)
689                         goto fail;
690                 err = permission(nd.dentry->d_inode,MAY_WRITE, &nd);
691                 if (err)
692                         goto put_fail;
693
694                 err = -ECONNREFUSED;
695                 if (!S_ISSOCK(nd.dentry->d_inode->i_mode))
696                         goto put_fail;
697                 u=unix_find_socket_byinode(nd.dentry->d_inode);
698                 if (!u)
699                         goto put_fail;
700
701                 if (u->sk_type == type)
702                         touch_atime(nd.mnt, nd.dentry);
703
704                 path_release(&nd);
705
706                 err=-EPROTOTYPE;
707                 if (u->sk_type != type) {
708                         sock_put(u);
709                         goto fail;
710                 }
711         } else {
712                 err = -ECONNREFUSED;
713                 u=unix_find_socket_byname(sunname, len, type, hash);
714                 if (u) {
715                         struct dentry *dentry;
716                         dentry = unix_sk(u)->dentry;
717                         if (dentry)
718                                 touch_atime(unix_sk(u)->mnt, dentry);
719                 } else
720                         goto fail;
721         }
722         return u;
723
724 put_fail:
725         path_release(&nd);
726 fail:
727         *error=err;
728         return NULL;
729 }
730
731
732 static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
733 {
734         struct sock *sk = sock->sk;
735         struct unix_sock *u = unix_sk(sk);
736         struct sockaddr_un *sunaddr=(struct sockaddr_un *)uaddr;
737         struct dentry * dentry = NULL;
738         struct nameidata nd;
739         int err;
740         unsigned hash;
741         struct unix_address *addr;
742         struct hlist_head *list;
743
744         err = -EINVAL;
745         if (sunaddr->sun_family != AF_UNIX)
746                 goto out;
747
748         if (addr_len==sizeof(short)) {
749                 err = unix_autobind(sock);
750                 goto out;
751         }
752
753         err = unix_mkname(sunaddr, addr_len, &hash);
754         if (err < 0)
755                 goto out;
756         addr_len = err;
757
758         down(&u->readsem);
759
760         err = -EINVAL;
761         if (u->addr)
762                 goto out_up;
763
764         err = -ENOMEM;
765         addr = kmalloc(sizeof(*addr)+addr_len, GFP_KERNEL);
766         if (!addr)
767                 goto out_up;
768
769         memcpy(addr->name, sunaddr, addr_len);
770         addr->len = addr_len;
771         addr->hash = hash ^ sk->sk_type;
772         atomic_set(&addr->refcnt, 1);
773
774         if (sunaddr->sun_path[0]) {
775                 unsigned int mode;
776                 err = 0;
777                 /*
778                  * Get the parent directory, calculate the hash for last
779                  * component.
780                  */
781                 err = path_lookup(sunaddr->sun_path, LOOKUP_PARENT, &nd);
782                 if (err)
783                         goto out_mknod_parent;
784
785                 dentry = lookup_create(&nd, 0);
786                 err = PTR_ERR(dentry);
787                 if (IS_ERR(dentry))
788                         goto out_mknod_unlock;
789
790                 /*
791                  * All right, let's create it.
792                  */
793                 mode = S_IFSOCK |
794                        (SOCK_INODE(sock)->i_mode & ~current->fs->umask);
795                 err = vfs_mknod(nd.dentry->d_inode, dentry, mode, 0);
796                 if (err)
797                         goto out_mknod_dput;
798                 up(&nd.dentry->d_inode->i_sem);
799                 dput(nd.dentry);
800                 nd.dentry = dentry;
801
802                 addr->hash = UNIX_HASH_SIZE;
803         }
804
805         write_lock(&unix_table_lock);
806
807         if (!sunaddr->sun_path[0]) {
808                 err = -EADDRINUSE;
809                 if (__unix_find_socket_byname(sunaddr, addr_len,
810                                               sk->sk_type, hash)) {
811                         unix_release_addr(addr);
812                         goto out_unlock;
813                 }
814
815                 list = &unix_socket_table[addr->hash];
816         } else {
817                 list = &unix_socket_table[dentry->d_inode->i_ino & (UNIX_HASH_SIZE-1)];
818                 u->dentry = nd.dentry;
819                 u->mnt    = nd.mnt;
820         }
821
822         err = 0;
823         __unix_remove_socket(sk);
824         u->addr = addr;
825         __unix_insert_socket(list, sk);
826
827 out_unlock:
828         write_unlock(&unix_table_lock);
829 out_up:
830         up(&u->readsem);
831 out:
832         return err;
833
834 out_mknod_dput:
835         dput(dentry);
836 out_mknod_unlock:
837         up(&nd.dentry->d_inode->i_sem);
838         path_release(&nd);
839 out_mknod_parent:
840         if (err==-EEXIST)
841                 err=-EADDRINUSE;
842         unix_release_addr(addr);
843         goto out_up;
844 }
845
846 static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
847                               int alen, int flags)
848 {
849         struct sock *sk = sock->sk;
850         struct sockaddr_un *sunaddr=(struct sockaddr_un*)addr;
851         struct sock *other;
852         unsigned hash;
853         int err;
854
855         if (addr->sa_family != AF_UNSPEC) {
856                 err = unix_mkname(sunaddr, alen, &hash);
857                 if (err < 0)
858                         goto out;
859                 alen = err;
860
861                 if (test_bit(SOCK_PASSCRED, &sock->flags) &&
862                     !unix_sk(sk)->addr && (err = unix_autobind(sock)) != 0)
863                         goto out;
864
865                 other=unix_find_other(sunaddr, alen, sock->type, hash, &err);
866                 if (!other)
867                         goto out;
868
869                 unix_state_wlock(sk);
870
871                 err = -EPERM;
872                 if (!unix_may_send(sk, other))
873                         goto out_unlock;
874
875                 err = security_unix_may_send(sk->sk_socket, other->sk_socket);
876                 if (err)
877                         goto out_unlock;
878
879         } else {
880                 /*
881                  *      1003.1g breaking connected state with AF_UNSPEC
882                  */
883                 other = NULL;
884                 unix_state_wlock(sk);
885         }
886
887         /*
888          * If it was connected, reconnect.
889          */
890         if (unix_peer(sk)) {
891                 struct sock *old_peer = unix_peer(sk);
892                 unix_peer(sk)=other;
893                 unix_state_wunlock(sk);
894
895                 if (other != old_peer)
896                         unix_dgram_disconnected(sk, old_peer);
897                 sock_put(old_peer);
898         } else {
899                 unix_peer(sk)=other;
900                 unix_state_wunlock(sk);
901         }
902         return 0;
903
904 out_unlock:
905         unix_state_wunlock(sk);
906         sock_put(other);
907 out:
908         return err;
909 }
910
911 static long unix_wait_for_peer(struct sock *other, long timeo)
912 {
913         struct unix_sock *u = unix_sk(other);
914         int sched;
915         DEFINE_WAIT(wait);
916
917         prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE);
918
919         sched = !sock_flag(other, SOCK_DEAD) &&
920                 !(other->sk_shutdown & RCV_SHUTDOWN) &&
921                 (skb_queue_len(&other->sk_receive_queue) >
922                  other->sk_max_ack_backlog);
923
924         unix_state_runlock(other);
925
926         if (sched)
927                 timeo = schedule_timeout(timeo);
928
929         finish_wait(&u->peer_wait, &wait);
930         return timeo;
931 }
932
/*
 *      Connect sock to the listening socket named by uaddr.
 *
 *      A fresh sock (newsk) and a one-byte skb are allocated up front,
 *      before any state lock is taken.  Once the listener has been found
 *      and both it and sk are locked, newsk is set up as the established
 *      peer of sk and the skb is queued on the listener's receive queue.
 *      unix_accept() later dequeues that skb and takes the sock to graft
 *      from skb->sk (presumably newsk, as set by sock_wmalloc() - confirm).
 *
 *      Returns 0 on success, otherwise a negative errno: whatever
 *      unix_mkname(), unix_autobind(), unix_find_other() or
 *      security_unix_stream_connect() report, -ENOMEM when newsk or the
 *      skb cannot be allocated, -ECONNREFUSED when the target is not in
 *      TCP_LISTEN, -EAGAIN when its queue is over the backlog and timeo is
 *      zero, the sock_intr_errno() value when a signal interrupts the
 *      wait, -EISCONN when sk is already TCP_ESTABLISHED and -EINVAL for
 *      any other state of sk.
 */
933 static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
934                                int addr_len, int flags)
935 {
936         struct sockaddr_un *sunaddr=(struct sockaddr_un *)uaddr;
937         struct sock *sk = sock->sk;
938         struct unix_sock *u = unix_sk(sk), *newu, *otheru;
939         struct sock *newsk = NULL;
940         struct sock *other = NULL;
941         struct sk_buff *skb = NULL;
942         unsigned hash;
943         int st;
944         int err;
945         long timeo;
946
947         err = unix_mkname(sunaddr, addr_len, &hash);
948         if (err < 0)
949                 goto out;
            /* On success unix_mkname() hands back the length used from here on. */
950         addr_len = err;
951
952         if (test_bit(SOCK_PASSCRED, &sock->flags)
953                 && !u->addr && (err = unix_autobind(sock)) != 0)
954                 goto out;
955
956         timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);
957
958         /* First of all allocate resources.
959            If we will make it after state is locked,
960            we will have to recheck all again in any case.
961          */
962
963         err = -ENOMEM;
964
965         /* create new sock for complete connection */
966         newsk = unix_create1(NULL);
967         if (newsk == NULL)
968                 goto out;
969
970         /* Allocate skb for sending to listening sock */
971         skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
972         if (skb == NULL)
973                 goto out;
974
975 restart:
976         /*  Find listening sock. */
            /* Every jump back here has already dropped its reference on the
             * previous other; a failed lookup leaves other NULL, so the
             * cleanup at out does not put it a second time. */
977         other = unix_find_other(sunaddr, addr_len, sk->sk_type, hash, &err);
978         if (!other)
979                 goto out;
980
981         /* Latch state of peer */
982         unix_state_rlock(other);
983
984         /* Apparently VFS overslept socket death. Retry. */
985         if (sock_flag(other, SOCK_DEAD)) {
986                 unix_state_runlock(other);
987                 sock_put(other);
988                 goto restart;
989         }
990
991         err = -ECONNREFUSED;
992         if (other->sk_state != TCP_LISTEN)
993                 goto out_unlock;
994
            /* Listener's queue is over its backlog: fail at once when we may
             * not wait, otherwise sleep and start over. */
995         if (skb_queue_len(&other->sk_receive_queue) >
996             other->sk_max_ack_backlog) {
997                 err = -EAGAIN;
998                 if (!timeo)
999                         goto out_unlock;
1000
                     /* unix_wait_for_peer() releases the read lock on other,
                      * so from here on only the reference is left to drop. */
1001                 timeo = unix_wait_for_peer(other, timeo);
1002
1003                 err = sock_intr_errno(timeo);
1004                 if (signal_pending(current))
1005                         goto out;
1006                 sock_put(other);
1007                 goto restart;
1008         }
1009
1010         /* Latch our state.
1011
1012            It is tricky place. We need to grab write lock and cannot
1013            drop lock on peer. It is dangerous because deadlock is
1014            possible. Connect to self case and simultaneous
1015            attempt to connect are eliminated by checking socket
1016            state. other is TCP_LISTEN, if sk is TCP_LISTEN we
1017            check this before attempt to grab lock.
1018
1019            Well, and we have to recheck the state after socket locked.
1020          */
1021         st = sk->sk_state;
1022
1023         switch (st) {
1024         case TCP_CLOSE:
1025                 /* This is ok... continue with connect */
1026                 break;
1027         case TCP_ESTABLISHED:
1028                 /* Socket is already connected */
1029                 err = -EISCONN;
1030                 goto out_unlock;
1031         default:
1032                 err = -EINVAL;
1033                 goto out_unlock;
1034         }
1035
1036         unix_state_wlock(sk);
1037
             /* sk changed state between the unlocked read above and taking
              * the lock: release everything and start over. */
1038         if (sk->sk_state != st) {
1039                 unix_state_wunlock(sk);
1040                 unix_state_runlock(other);
1041                 sock_put(other);
1042                 goto restart;
1043         }
1044
1045         err = security_unix_stream_connect(sock, other->sk_socket, newsk);
1046         if (err) {
1047                 unix_state_wunlock(sk);
1048                 goto out_unlock;
1049         }
1050
1051         /* The way is open! Fastly set all the necessary fields... */
1052
             /* newsk records sk as its peer and holds a reference on it. */
1053         sock_hold(sk);
1054         unix_peer(newsk)        = sk;
1055         newsk->sk_state         = TCP_ESTABLISHED;
1056         newsk->sk_type          = sk->sk_type;
1057         newsk->sk_peercred.pid  = current->tgid;
1058         newsk->sk_peercred.uid  = current->euid;
1059         newsk->sk_peercred.gid  = current->egid;
1060         newu = unix_sk(newsk);
1061         newsk->sk_sleep         = &newu->peer_wait;
1062         otheru = unix_sk(other);
1063
1064         /* copy address information from listening to new sock*/
1065         if (otheru->addr) {
1066                 atomic_inc(&otheru->addr->refcnt);
1067                 newu->addr = otheru->addr;
1068         }
1069         if (otheru->dentry) {
1070                 newu->dentry    = dget(otheru->dentry);
1071                 newu->mnt       = mntget(otheru->mnt);
1072         }
1073
1074         /* Set credentials */
1075         sk->sk_peercred = other->sk_peercred;
1076
             /* ...and sk records newsk as its peer, again with a reference. */
1077         sock_hold(newsk);
1078         unix_peer(sk)   = newsk;
1079         sock->state     = SS_CONNECTED;
1080         sk->sk_state    = TCP_ESTABLISHED;
1081
1082         unix_state_wunlock(sk);
1083
1084         /* take ten and send info to listening sock */
1085         spin_lock(&other->sk_receive_queue.lock);
1086         __skb_queue_tail(&other->sk_receive_queue, skb);
1087         /* Undo artificially decreased inflight after embryo
1088          * is installed to listening socket. */
1089         atomic_inc(&newu->inflight);
1090         spin_unlock(&other->sk_receive_queue.lock);
1091         unix_state_runlock(other);
1092         other->sk_data_ready(other, 0);
1093         sock_put(other);
1094         return 0;
1095
     /* Error exit while the read lock on other is still held. */
1096 out_unlock:
1097         if (other)
1098                 unix_state_runlock(other);
1099
     /* Common error exit: release whatever was allocated or referenced. */
1100 out:
1101         if (skb)
1102                 kfree_skb(skb);
1103         if (newsk)
1104                 unix_release_sock(newsk, 0);
1105         if (other)
1106                 sock_put(other);
1107         return err;
1108 }
1109
1110 static int unix_socketpair(struct socket *socka, struct socket *sockb)
1111 {
1112         struct sock *ska=socka->sk, *skb = sockb->sk;
1113
1114         /* Join our sockets back to back */
1115         sock_hold(ska);
1116         sock_hold(skb);
1117         unix_peer(ska)=skb;
1118         unix_peer(skb)=ska;
1119         ska->sk_peercred.pid = skb->sk_peercred.pid = current->tgid;
1120         ska->sk_peercred.uid = skb->sk_peercred.uid = current->euid;
1121         ska->sk_peercred.gid = skb->sk_peercred.gid = current->egid;
1122
1123         if (ska->sk_type != SOCK_DGRAM) {
1124                 ska->sk_state = TCP_ESTABLISHED;
1125                 skb->sk_state = TCP_ESTABLISHED;
1126                 socka->state  = SS_CONNECTED;
1127                 sockb->state  = SS_CONNECTED;
1128         }
1129         return 0;
1130 }
1131
1132 static int unix_accept(struct socket *sock, struct socket *newsock, int flags)
1133 {
1134         struct sock *sk = sock->sk;
1135         struct sock *tsk;
1136         struct sk_buff *skb;
1137         int err;
1138
1139         err = -EOPNOTSUPP;
1140         if (sock->type!=SOCK_STREAM && sock->type!=SOCK_SEQPACKET)
1141                 goto out;
1142
1143         err = -EINVAL;
1144         if (sk->sk_state != TCP_LISTEN)
1145                 goto out;
1146
1147         /* If socket state is TCP_LISTEN it cannot change (for now...),
1148          * so that no locks are necessary.
1149          */
1150
1151         skb = skb_recv_datagram(sk, 0, flags&O_NONBLOCK, &err);
1152         if (!skb) {
1153                 /* This means receive shutdown. */
1154                 if (err == 0)
1155                         err = -EINVAL;
1156                 goto out;
1157         }
1158
1159         tsk = skb->sk;
1160         skb_free_datagram(sk, skb);
1161         wake_up_interruptible(&unix_sk(sk)->peer_wait);
1162
1163         /* attach accepted sock to socket */
1164         unix_state_wlock(tsk);
1165         newsock->state = SS_CONNECTED;
1166         sock_graft(tsk, newsock);
1167         unix_state_wunlock(tsk);
1168         return 0;
1169
1170 out:
1171         return err;
1172 }
1173
1174
1175 static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int *uaddr_len, int peer)
1176 {
1177         struct sock *sk = sock->sk;
1178         struct unix_sock *u;
1179         struct sockaddr_un *sunaddr=(struct sockaddr_un *)uaddr;
1180         int err = 0;
1181
1182         if (peer) {
1183                 sk = unix_peer_get(sk);
1184
1185                 err = -ENOTCONN;
1186                 if (!sk)
1187                         goto out;
1188                 err = 0;
1189         } else {
1190                 sock_hold(sk);
1191         }
1192
1193         u = unix_sk(sk);
1194         unix_state_rlock(sk);
1195         if (!u->addr) {
1196                 sunaddr->sun_family = AF_UNIX;
1197                 sunaddr->sun_path[0] = 0;
1198                 *uaddr_len = sizeof(short);
1199         } else {
1200                 struct unix_address *addr = u->addr;
1201
1202                 *uaddr_len = addr->len;
1203                 memcpy(sunaddr, addr->name, *uaddr_len);
1204         }
1205         unix_state_runlock(sk);
1206         sock_put(sk);
1207 out:
1208         return err;
1209 }
1210
1211 static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb)
1212 {
1213         int i;
1214
1215         scm->fp = UNIXCB(skb).fp;
1216         skb->destructor = sock_wfree;
1217         UNIXCB(skb).fp = NULL;
1218
1219         for (i=scm->fp->count-1; i>=0; i--)
1220                 unix_notinflight(scm->fp->fp[i]);
1221 }
1222
1223 static void unix_destruct_fds(struct sk_buff *skb)
1224 {
1225         struct scm_cookie scm;
1226         memset(&scm, 0, sizeof(scm));
1227         unix_detach_fds(&scm, skb);
1228
1229         /* Alas, it calls VFS */
1230         /* So fscking what? fput() had been SMP-safe since the last Summer */
1231         scm_destroy(&scm);
1232         sock_wfree(skb);
1233 }
1234
1235 static void unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
1236 {
1237         int i;
1238         for (i=scm->fp->count-1; i>=0; i--)
1239                 unix_inflight(scm->fp->fp[i]);
1240         UNIXCB(skb).fp = scm->fp;
1241         skb->destructor = unix_destruct_fds;
1242         scm->fp = NULL;
1243 }
1244
1245 /*
1246  *      Send AF_UNIX data.
1247  */
1248
/*
 *      Send one datagram of len bytes.
 *
 *      The destination is the address in msg->msg_name when msg_namelen is
 *      non-zero, otherwise the connected peer.  The skb is built first
 *      (credentials, passed files, payload) and only then is the receiver
 *      looked up, checked and the skb queued on its receive queue.
 *      unix_seqpacket_sendmsg() also ends up here after clearing
 *      msg_namelen.
 *
 *      Returns len on success, otherwise a negative errno: the scm_send()
 *      error, -EOPNOTSUPP for MSG_OOB, -ENOTCONN with neither address nor
 *      peer, -EMSGSIZE when len exceeds sk_sndbuf - 32, -ECONNRESET when
 *      no receiver is left and no address was given, -EPERM when
 *      unix_may_send() refuses, -ECONNREFUSED when the connected peer has
 *      died, -EPIPE when the receiver has RCV_SHUTDOWN set, -EAGAIN when
 *      its queue is full and timeo is zero, or whatever the helpers
 *      called here report.
 */
1249 static int unix_dgram_sendmsg(struct kiocb *kiocb, struct socket *sock,
1250                               struct msghdr *msg, size_t len)
1251 {
1252         struct sock_iocb *siocb = kiocb_to_siocb(kiocb);
1253         struct sock *sk = sock->sk;
1254         struct unix_sock *u = unix_sk(sk);
1255         struct sockaddr_un *sunaddr=msg->msg_name;
1256         struct sock *other = NULL;
1257         int namelen = 0; /* fake GCC */
1258         int err;
1259         unsigned hash;
1260         struct sk_buff *skb;
1261         long timeo;
1262         struct scm_cookie tmp_scm;
1263
             /* Fall back to an on-stack cookie when the caller supplied none. */
1264         if (NULL == siocb->scm)
1265                 siocb->scm = &tmp_scm;
1266         err = scm_send(sock, msg, siocb->scm);
1267         if (err < 0)
1268                 return err;
1269
1270         err = -EOPNOTSUPP;
1271         if (msg->msg_flags&MSG_OOB)
1272                 goto out;
1273
1274         if (msg->msg_namelen) {
1275                 err = unix_mkname(sunaddr, msg->msg_namelen, &hash);
1276                 if (err < 0)
1277                         goto out;
1278                 namelen = err;
1279         } else {
                     /* No address given: use the connected peer, if any. */
1280                 sunaddr = NULL;
1281                 err = -ENOTCONN;
1282                 other = unix_peer_get(sk);
1283                 if (!other)
1284                         goto out;
1285         }
1286
1287         if (test_bit(SOCK_PASSCRED, &sock->flags)
1288                 && !u->addr && (err = unix_autobind(sock)) != 0)
1289                 goto out;
1290
1291         err = -EMSGSIZE;
1292         if (len > sk->sk_sndbuf - 32)
1293                 goto out;
1294
1295         skb = sock_alloc_send_skb(sk, len, msg->msg_flags&MSG_DONTWAIT, &err);
1296         if (skb==NULL)
1297                 goto out;
1298
1299         memcpy(UNIXCREDS(skb), &siocb->scm->creds, sizeof(struct ucred));
1300         if (siocb->scm->fp)
1301                 unix_attach_fds(siocb->scm, skb);
1302
1303         skb->h.raw = skb->data;
1304         err = memcpy_fromiovec(skb_put(skb,len), msg->msg_iov, len);
1305         if (err)
1306                 goto out_free;
1307
1308         timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
1309
1310 restart:
             /* other is NULL on the first pass of an addressed send, and
              * again after a receiver was found dead below. */
1311         if (!other) {
1312                 err = -ECONNRESET;
1313                 if (sunaddr == NULL)
1314                         goto out_free;
1315
1316                 other = unix_find_other(sunaddr, namelen, sk->sk_type,
1317                                         hash, &err);
1318                 if (other==NULL)
1319                         goto out_free;
1320         }
1321
1322         unix_state_rlock(other);
1323         err = -EPERM;
1324         if (!unix_may_send(sk, other))
1325                 goto out_unlock;
1326
1327         if (sock_flag(other, SOCK_DEAD)) {
1328                 /*
1329                  *      Check with 1003.1g - what should
1330                  *      datagram error
1331                  */
                     /* Give up our lookup reference first.  If the dead sock
                      * was our connected peer, disconnect from it (dropping
                      * the peer reference as well) and fail with
                      * -ECONNREFUSED; otherwise err stays 0 and the address
                      * is looked up again. */
1332                 unix_state_runlock(other);
1333                 sock_put(other);
1334
1335                 err = 0;
1336                 unix_state_wlock(sk);
1337                 if (unix_peer(sk) == other) {
1338                         unix_peer(sk)=NULL;
1339                         unix_state_wunlock(sk);
1340
1341                         unix_dgram_disconnected(sk, other);
1342                         sock_put(other);
1343                         err = -ECONNREFUSED;
1344                 } else {
1345                         unix_state_wunlock(sk);
1346                 }
1347
1348                 other = NULL;
1349                 if (err)
1350                         goto out_free;
1351                 goto restart;
1352         }
1353
1354         err = -EPIPE;
1355         if (other->sk_shutdown & RCV_SHUTDOWN)
1356                 goto out_unlock;
1357
1358         if (sk->sk_type != SOCK_SEQPACKET) {
1359                 err = security_unix_may_send(sk->sk_socket, other->sk_socket);
1360                 if (err)
1361                         goto out_unlock;
1362         }
1363
             /* Receiver's queue is over its limit and it is not connected
              * back to us: fail or wait, depending on timeo. */
1364         if (unix_peer(other) != sk &&
1365             (skb_queue_len(&other->sk_receive_queue) >
1366              other->sk_max_ack_backlog)) {
1367                 if (!timeo) {
1368                         err = -EAGAIN;
1369                         goto out_unlock;
1370                 }
1371
                     /* unix_wait_for_peer() releases the read lock on other;
                      * the reference is kept, so restart re-checks the same
                      * sock without a new lookup. */
1372                 timeo = unix_wait_for_peer(other, timeo);
1373
1374                 err = sock_intr_errno(timeo);
1375                 if (signal_pending(current))
1376                         goto out_free;
1377
1378                 goto restart;
1379         }
1380
1381         skb_queue_tail(&other->sk_receive_queue, skb);
1382         unix_state_runlock(other);
1383         other->sk_data_ready(other, len);
1384         sock_put(other);
1385         scm_destroy(siocb->scm);
1386         return len;
1387
     /* Error exits, from most to least state held: read lock on other,
      * the unsent skb, then the reference on other and the scm cookie. */
1388 out_unlock:
1389         unix_state_runlock(other);
1390 out_free:
1391         kfree_skb(skb);
1392 out:
1393         if (other)
1394                 sock_put(other);
1395         scm_destroy(siocb->scm);
1396         return err;
1397 }
1398
1399                 
/*
 *      Send len bytes on a connected stream socket.
 *
 *      The data is cut into skbs and each one is queued on the peer's
 *      receive queue in turn.  A destination address is never accepted
 *      here.
 *
 *      Returns the number of bytes queued if that is non-zero.  Otherwise
 *      a negative errno: the scm_send() error, -EOPNOTSUPP for MSG_OOB,
 *      -EISCONN or -EOPNOTSUPP when an address was supplied, -ENOTCONN
 *      without a peer, -EPIPE when sending is shut down or the peer is
 *      dead or no longer receiving, or the error from skb allocation or
 *      the copy from the iovec.
 */
1400 static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock,
1401                                struct msghdr *msg, size_t len)
1402 {
1403         struct sock_iocb *siocb = kiocb_to_siocb(kiocb);
1404         struct sock *sk = sock->sk;
1405         struct sock *other = NULL;
1406         struct sockaddr_un *sunaddr=msg->msg_name;
1407         int err,size;
1408         struct sk_buff *skb;
1409         int sent=0;
1410         struct scm_cookie tmp_scm;
1411
             /* Fall back to an on-stack cookie when the caller supplied none. */
1412         if (NULL == siocb->scm)
1413                 siocb->scm = &tmp_scm;
1414         err = scm_send(sock, msg, siocb->scm);
1415         if (err < 0)
1416                 return err;
1417
1418         err = -EOPNOTSUPP;
1419         if (msg->msg_flags&MSG_OOB)
1420                 goto out_err;
1421
1422         if (msg->msg_namelen) {
                     /* An explicit destination is always an error here. */
1423                 err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
1424                 goto out_err;
1425         } else {
1426                 sunaddr = NULL;
1427                 err = -ENOTCONN;
1428                 other = unix_peer_get(sk);
1429                 if (!other)
1430                         goto out_err;
1431         }
1432
1433         if (sk->sk_shutdown & SEND_SHUTDOWN)
1434                 goto pipe_err;
1435
1436         while(sent < len)
1437         {
1438                 /*
1439                  *      Optimisation for the fact that under 0.01% of X messages typically
1440                  *      need breaking up.
1441                  */
1442
1443                 size=len-sent;
1444
1445                 /* Keep two messages in the pipe so it schedules better */
1446                 if (size > sk->sk_sndbuf / 2 - 64)
1447                         size = sk->sk_sndbuf / 2 - 64;
1448
1449                 if (size > SKB_MAX_ALLOC)
1450                         size = SKB_MAX_ALLOC;
1451                         
1452                 /*
1453                  *      Grab a buffer
1454                  */
1455                  
1456                 skb=sock_alloc_send_skb(sk,size,msg->msg_flags&MSG_DONTWAIT, &err);
1457
1458                 if (skb==NULL)
1459                         goto out_err;
1460
1461                 /*
1462                  *      If you pass two values to the sock_alloc_send_skb
1463                  *      it tries to grab the large buffer with GFP_NOFS
1464                  *      (which can fail easily), and if it fails grab the
1465                  *      fallback size buffer which is under a page and will
1466                  *      succeed. [Alan]
1467                  */
1468                 size = min_t(int, size, skb_tailroom(skb));
1469
                     /* Every skb carries the sender's credentials.  Passed
                      * files go with the first skb only: unix_attach_fds()
                      * clears scm->fp once they are attached. */
1470                 memcpy(UNIXCREDS(skb), &siocb->scm->creds, sizeof(struct ucred));
1471                 if (siocb->scm->fp)
1472                         unix_attach_fds(siocb->scm, skb);
1473
1474                 if ((err = memcpy_fromiovec(skb_put(skb,size), msg->msg_iov, size)) != 0) {
1475                         kfree_skb(skb);
1476                         goto out_err;
1477                 }
1478
1479                 unix_state_rlock(other);
1480
1481                 if (sock_flag(other, SOCK_DEAD) ||
1482                     (other->sk_shutdown & RCV_SHUTDOWN))
1483                         goto pipe_err_free;
1484
1485                 skb_queue_tail(&other->sk_receive_queue, skb);
1486                 unix_state_runlock(other);
1487                 other->sk_data_ready(other, size);
1488                 sent+=size;
1489         }
1490         sock_put(other);
1491
1492         scm_destroy(siocb->scm);
1493         siocb->scm = NULL;
1494
1495         return sent;
1496
     /* Peer went away while we held its read lock and an unsent skb. */
1497 pipe_err_free:
1498         unix_state_runlock(other);
1499         kfree_skb(skb);
     /* SIGPIPE is raised only if nothing was sent and MSG_NOSIGNAL is clear. */
1500 pipe_err:
1501         if (sent==0 && !(msg->msg_flags&MSG_NOSIGNAL))
1502                 send_sig(SIGPIPE,current,0);
1503         err = -EPIPE;
1504 out_err:
1505         if (other)
1506                 sock_put(other);
1507         scm_destroy(siocb->scm);
1508         siocb->scm = NULL;
             /* A partial send reports the byte count rather than the error. */
1509         return sent ? : err;
1510 }
1511
1512 static int unix_seqpacket_sendmsg(struct kiocb *kiocb, struct socket *sock,
1513                                   struct msghdr *msg, size_t len)
1514 {
1515         int err;
1516         struct sock *sk = sock->sk;
1517         
1518         err = sock_error(sk);
1519         if (err)
1520                 return err;
1521
1522         if (sk->sk_state != TCP_ESTABLISHED)
1523                 return -ENOTCONN;
1524
1525         if (msg->msg_namelen)
1526                 msg->msg_namelen = 0;
1527
1528         return unix_dgram_sendmsg(kiocb, sock, msg, len);
1529 }
1530                                                                                             
1531 static void unix_copy_addr(struct msghdr *msg, struct sock *sk)
1532 {
1533         struct unix_sock *u = unix_sk(sk);
1534
1535         msg->msg_namelen = 0;
1536         if (u->addr) {
1537                 msg->msg_namelen = u->addr->len;
1538                 memcpy(msg->msg_name, u->addr->name, u->addr->len);
1539         }
1540 }
1541
1542 static int unix_dgram_recvmsg(struct kiocb *iocb, struct socket *sock,
1543                               struct msghdr *msg, size_t size,
1544                               int flags)
1545 {
1546         struct sock_iocb *siocb = kiocb_to_siocb(iocb);
1547         struct scm_cookie tmp_scm;
1548         struct sock *sk = sock->sk;
1549         struct unix_sock *u = unix_sk(sk);
1550         int noblock = flags & MSG_DONTWAIT;
1551         struct sk_buff *skb;
1552         int err;
1553
1554         err = -EOPNOTSUPP;
1555         if (flags&MSG_OOB)
1556                 goto out;
1557
1558         msg->msg_namelen = 0;
1559
1560         down(&u->readsem);
1561
1562         skb = skb_recv_datagram(sk, flags, noblock, &err);
1563         if (!skb)
1564                 goto out_unlock;
1565
1566         wake_up_interruptible(&u->peer_wait);
1567
1568         if (msg->msg_name)
1569                 unix_copy_addr(msg, skb->sk);
1570
1571         if (size > skb->len)
1572                 size = skb->len;
1573         else if (size < skb->len)
1574                 msg->msg_flags |= MSG_TRUNC;
1575
1576         err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, size);
1577         if (err)
1578                 goto out_free;
1579
1580         if (!siocb->scm) {
1581                 siocb->scm = &tmp_scm;
1582                 memset(&tmp_scm, 0, sizeof(tmp_scm));
1583         }
1584         siocb->scm->creds = *UNIXCREDS(skb);
1585
1586         if (!(flags & MSG_PEEK))
1587         {
1588                 if (UNIXCB(skb).fp)
1589                         unix_detach_fds(siocb->scm, skb);
1590         }
1591         else 
1592         {
1593                 /* It is questionable: on PEEK we could:
1594                    - do not return fds - good, but too simple 8)
1595                    - return fds, and do not return them on read (old strategy,
1596                      apparently wrong)
1597                    - clone fds (I chose it for now, it is the most universal
1598                      solution)
1599                 
1600                    POSIX 1003.1g does not actually define this clearly
1601                    at all. POSIX 1003.1g doesn't define a lot of things
1602                    clearly however!                  
1603                    
1604                 */
1605                 if (UNIXCB(skb).fp)
1606                         siocb->scm->fp = scm_fp_dup(UNIXCB(skb).fp);
1607         }
1608         err = size;
1609
1610         scm_recv(sock, msg, siocb->scm, flags);
1611
1612 out_free:
1613         skb_free_datagram(sk,skb);
1614 out_unlock:
1615         up(&u->readsem);
1616 out:
1617         return err;
1618 }
1619
1620 /*
1621  *      Sleep until data has arrived. But check for races..
1622  */
1623  
1624 static long unix_stream_data_wait(struct sock * sk, long timeo)
1625 {
1626         DEFINE_WAIT(wait);
1627
1628         unix_state_rlock(sk);
1629
1630         for (;;) {
1631                 prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
1632
1633                 if (skb_queue_len(&sk->sk_receive_queue) ||
1634                     sk->sk_err ||
1635                     (sk->sk_shutdown & RCV_SHUTDOWN) ||
1636                     signal_pending(current) ||
1637                     !timeo)
1638                         break;
1639
1640                 set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1641                 unix_state_runlock(sk);
1642                 timeo = schedule_timeout(timeo);
1643                 unix_state_rlock(sk);
1644                 clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1645         }
1646
1647         finish_wait(sk->sk_sleep, &wait);
1648         unix_state_runlock(sk);
1649         return timeo;
1650 }
1651
1652
1653
/*
 *      Receive up to size bytes from a connected stream socket.
 *
 *      skbs are taken off the receive queue one at a time under
 *      u->readsem.  Data from skbs with differing credentials is never
 *      merged into one read, and a read stops after an skb that carried
 *      passed files.  With MSG_PEEK the skb is put back untouched after
 *      the first copy.
 *
 *      Returns the number of bytes copied if non-zero (-EFAULT when the
 *      very first copy to user space fails).  Otherwise the value left in
 *      err: -EINVAL when the socket is not TCP_ESTABLISHED, -EOPNOTSUPP
 *      for MSG_OOB, the pending socket error, -EAGAIN when no data is
 *      queued and timeo is zero, or the sock_intr_errno() value when a
 *      signal interrupts the wait.
 */
1654 static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock,
1655                                struct msghdr *msg, size_t size,
1656                                int flags)
1657 {
1658         struct sock_iocb *siocb = kiocb_to_siocb(iocb);
1659         struct scm_cookie tmp_scm;
1660         struct sock *sk = sock->sk;
1661         struct unix_sock *u = unix_sk(sk);
1662         struct sockaddr_un *sunaddr=msg->msg_name;
1663         int copied = 0;
1664         int check_creds = 0;
1665         int target;
1666         int err = 0;
1667         long timeo;
1668
1669         err = -EINVAL;
1670         if (sk->sk_state != TCP_ESTABLISHED)
1671                 goto out;
1672
1673         err = -EOPNOTSUPP;
1674         if (flags&MSG_OOB)
1675                 goto out;
1676
1677         target = sock_rcvlowat(sk, flags&MSG_WAITALL, size);
1678         timeo = sock_rcvtimeo(sk, flags&MSG_DONTWAIT);
1679
1680         msg->msg_namelen = 0;
1681
1682         /* Lock the socket to prevent queue disordering
1683          * while sleeps in memcpy_tomsg
1684          */
1685
             /* Fall back to a zeroed on-stack cookie when the caller supplied
              * none. */
1686         if (!siocb->scm) {
1687                 siocb->scm = &tmp_scm;
1688                 memset(&tmp_scm, 0, sizeof(tmp_scm));
1689         }
1690
1691         down(&u->readsem);
1692
1693         do
1694         {
1695                 int chunk;
1696                 struct sk_buff *skb;
1697
1698                 skb = skb_dequeue(&sk->sk_receive_queue);
1699                 if (skb==NULL)
1700                 {
                             /* Queue is empty: stop once the target amount
                              * has been copied. */
1701                         if (copied >= target)
1702                                 break;
1703
1704                         /*
1705                          *      POSIX 1003.1g mandates this order.
1706                          */
1707                          
1708                         if ((err = sock_error(sk)) != 0)
1709                                 break;
1710                         if (sk->sk_shutdown & RCV_SHUTDOWN)
1711                                 break;
1712                         err = -EAGAIN;
1713                         if (!timeo)
1714                                 break;
                             /* readsem is not held across the sleep. */
1715                         up(&u->readsem);
1716
1717                         timeo = unix_stream_data_wait(sk, timeo);
1718
                             /* readsem is already released at this point, so
                              * this exit jumps past both the up() and the
                              * scm_recv() at the end of the function. */
1719                         if (signal_pending(current)) {
1720                                 err = sock_intr_errno(timeo);
1721                                 goto out;
1722                         }
1723                         down(&u->readsem);
1724                         continue;
1725                 }
1726
                     /* The first skb fixes the credentials for this read;
                      * later skbs have to match them. */
1727                 if (check_creds) {
1728                         /* Never glue messages from different writers */
1729                         if (memcmp(UNIXCREDS(skb), &siocb->scm->creds, sizeof(siocb->scm->creds)) != 0) {
1730                                 skb_queue_head(&sk->sk_receive_queue, skb);
1731                                 break;
1732                         }
1733                 } else {
1734                         /* Copy credentials */
1735                         siocb->scm->creds = *UNIXCREDS(skb);
1736                         check_creds = 1;
1737                 }
1738
1739                 /* Copy address just once */
1740                 if (sunaddr)
1741                 {
1742                         unix_copy_addr(msg, skb->sk);
1743                         sunaddr = NULL;
1744                 }
1745
1746                 chunk = min_t(unsigned int, skb->len, size);
                     /* If the copy fails the skb goes back to the head of the
                      * queue; -EFAULT is reported only if nothing was copied
                      * before. */
1747                 if (memcpy_toiovec(msg->msg_iov, skb->data, chunk)) {
1748                         skb_queue_head(&sk->sk_receive_queue, skb);
1749                         if (copied == 0)
1750                                 copied = -EFAULT;
1751                         break;
1752                 }
1753                 copied += chunk;
1754                 size -= chunk;
1755
1756                 /* Mark read part of skb as used */
1757                 if (!(flags & MSG_PEEK))
1758                 {
1759                         skb_pull(skb, chunk);
1760
1761                         if (UNIXCB(skb).fp)
1762                                 unix_detach_fds(siocb->scm, skb);
1763
1764                         /* put the skb back if we didn't use it up.. */
1765                         if (skb->len)
1766                         {
1767                                 skb_queue_head(&sk->sk_receive_queue, skb);
1768                                 break;
1769                         }
1770
1771                         kfree_skb(skb);
1772
                             /* Stop after an skb that carried passed files. */
1773                         if (siocb->scm->fp)
1774                                 break;
1775                 }
1776                 else
1777                 {
1778                         /* It is questionable, see note in unix_dgram_recvmsg.
1779                          */
1780                         if (UNIXCB(skb).fp)
1781                                 siocb->scm->fp = scm_fp_dup(UNIXCB(skb).fp);
1782
1783                         /* put message back and return */
1784                         skb_queue_head(&sk->sk_receive_queue, skb);
1785                         break;
1786                 }
1787         } while (size);
1788
1789         up(&u->readsem);
1790         scm_recv(sock, msg, siocb->scm, flags);
1791 out:
             /* A non-zero copied (byte count or -EFAULT) wins over err. */
1792         return copied ? : err;
1793 }
1794
1795 static int unix_shutdown(struct socket *sock, int mode)
1796 {
1797         struct sock *sk = sock->sk;
1798         struct sock *other;
1799
1800         mode = (mode+1)&(RCV_SHUTDOWN|SEND_SHUTDOWN);
1801
1802         if (mode) {
1803                 unix_state_wlock(sk);
1804                 sk->sk_shutdown |= mode;
1805                 other=unix_peer(sk);
1806                 if (other)
1807                         sock_hold(other);
1808                 unix_state_wunlock(sk);
1809                 sk->sk_state_change(sk);
1810
1811                 if (other &&
1812                         (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) {
1813
1814                         int peer_mode = 0;
1815
1816                         if (mode&RCV_SHUTDOWN)
1817                                 peer_mode |= SEND_SHUTDOWN;
1818                         if (mode&SEND_SHUTDOWN)
1819                                 peer_mode |= RCV_SHUTDOWN;
1820                         unix_state_wlock(other);
1821                         other->sk_shutdown |= peer_mode;
1822                         unix_state_wunlock(other);
1823                         other->sk_state_change(other);
1824                         read_lock(&other->sk_callback_lock);
1825                         if (peer_mode == SHUTDOWN_MASK)
1826                                 sk_wake_async(other,1,POLL_HUP);
1827                         else if (peer_mode & RCV_SHUTDOWN)
1828                                 sk_wake_async(other,1,POLL_IN);
1829                         read_unlock(&other->sk_callback_lock);
1830                 }
1831                 if (other)
1832                         sock_put(other);
1833         }
1834         return 0;
1835 }
1836
1837 static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
1838 {
1839         struct sock *sk = sock->sk;
1840         long amount=0;
1841         int err;
1842
1843         switch(cmd)
1844         {
1845                 case SIOCOUTQ:
1846                         amount = atomic_read(&sk->sk_wmem_alloc);
1847                         err = put_user(amount, (int __user *)arg);
1848                         break;
1849                 case SIOCINQ:
1850                 {
1851                         struct sk_buff *skb;
1852
1853                         if (sk->sk_state == TCP_LISTEN) {
1854                                 err = -EINVAL;
1855                                 break;
1856                         }
1857
1858                         spin_lock(&sk->sk_receive_queue.lock);
1859                         if (sk->sk_type == SOCK_STREAM ||
1860                             sk->sk_type == SOCK_SEQPACKET) {
1861                                 skb_queue_walk(&sk->sk_receive_queue, skb)
1862                                         amount += skb->len;
1863                         } else {
1864                                 skb = skb_peek(&sk->sk_receive_queue);
1865                                 if (skb)
1866                                         amount=skb->len;
1867                         }
1868                         spin_unlock(&sk->sk_receive_queue.lock);
1869                         err = put_user(amount, (int __user *)arg);
1870                         break;
1871                 }
1872
1873                 default:
1874                         err = dev_ioctl(cmd, (void __user *)arg);
1875                         break;
1876         }
1877         return err;
1878 }
1879
1880 static unsigned int unix_poll(struct file * file, struct socket *sock, poll_table *wait)
1881 {
1882         struct sock *sk = sock->sk;
1883         unsigned int mask;
1884
1885         poll_wait(file, sk->sk_sleep, wait);
1886         mask = 0;
1887
1888         /* exceptional events? */
1889         if (sk->sk_err)
1890                 mask |= POLLERR;
1891         if (sk->sk_shutdown == SHUTDOWN_MASK)
1892                 mask |= POLLHUP;
1893
1894         /* readable? */
1895         if (!skb_queue_empty(&sk->sk_receive_queue) ||
1896             (sk->sk_shutdown & RCV_SHUTDOWN))
1897                 mask |= POLLIN | POLLRDNORM;
1898
1899         /* Connection-based need to check for termination and startup */
1900         if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) && sk->sk_state == TCP_CLOSE)
1901                 mask |= POLLHUP;
1902
1903         /*
1904          * we set writable also when the other side has shut down the
1905          * connection. This prevents stuck sockets.
1906          */
1907         if (unix_writable(sk))
1908                 mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
1909
1910         return mask;
1911 }
1912
1913
1914 #ifdef CONFIG_PROC_FS
1915 static struct sock *unix_seq_idx(int *iter, loff_t pos)
1916 {
1917         loff_t off = 0;
1918         struct sock *s;
1919
1920         for (s = first_unix_socket(iter); s; s = next_unix_socket(iter, s)) {
1921                 if (off == pos) 
1922                         return s;
1923                 ++off;
1924         }
1925         return NULL;
1926 }
1927
1928
1929 static void *unix_seq_start(struct seq_file *seq, loff_t *pos)
1930 {
1931         read_lock(&unix_table_lock);
1932         return *pos ? unix_seq_idx(seq->private, *pos - 1) : ((void *) 1);
1933 }
1934
1935 static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1936 {
1937         ++*pos;
1938
1939         if (v == (void *)1) 
1940                 return first_unix_socket(seq->private);
1941         return next_unix_socket(seq->private, v);
1942 }
1943
1944 static void unix_seq_stop(struct seq_file *seq, void *v)
1945 {
1946         read_unlock(&unix_table_lock);
1947 }
1948
1949 static int unix_seq_show(struct seq_file *seq, void *v)
1950 {
1951         
1952         if (v == (void *)1)
1953                 seq_puts(seq, "Num       RefCount Protocol Flags    Type St "
1954                          "Inode Path\n");
1955         else {
1956                 struct sock *s = v;
1957                 struct unix_sock *u = unix_sk(s);
1958                 unix_state_rlock(s);
1959
1960                 seq_printf(seq, "%p: %08X %08X %08X %04X %02X %5lu",
1961                         s,
1962                         atomic_read(&s->sk_refcnt),
1963                         0,
1964                         s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0,
1965                         s->sk_type,
1966                         s->sk_socket ?
1967                         (s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) :
1968                         (s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING),
1969                         sock_i_ino(s));
1970
1971                 if (u->addr) {
1972                         int i, len;
1973                         seq_putc(seq, ' ');
1974
1975                         i = 0;
1976                         len = u->addr->len - sizeof(short);
1977                         if (!UNIX_ABSTRACT(s))
1978                                 len--;
1979                         else {
1980                                 seq_putc(seq, '@');
1981                                 i++;
1982                         }
1983                         for ( ; i < len; i++)
1984                                 seq_putc(seq, u->addr->name->sun_path[i]);
1985                 }
1986                 unix_state_runlock(s);
1987                 seq_putc(seq, '\n');
1988         }
1989
1990         return 0;
1991 }
1992
1993 static struct seq_operations unix_seq_ops = {
1994         .start  = unix_seq_start,
1995         .next   = unix_seq_next,
1996         .stop   = unix_seq_stop,
1997         .show   = unix_seq_show,
1998 };
1999
2000
2001 static int unix_seq_open(struct inode *inode, struct file *file)
2002 {
2003         struct seq_file *seq;
2004         int rc = -ENOMEM;
2005         int *iter = kmalloc(sizeof(int), GFP_KERNEL);
2006
2007         if (!iter)
2008                 goto out;
2009
2010         rc = seq_open(file, &unix_seq_ops);
2011         if (rc)
2012                 goto out_kfree;
2013
2014         seq          = file->private_data;
2015         seq->private = iter;
2016         *iter = 0;
2017 out:
2018         return rc;
2019 out_kfree:
2020         kfree(iter);
2021         goto out;
2022 }
2023
2024 static struct file_operations unix_seq_fops = {
2025         .owner          = THIS_MODULE,
2026         .open           = unix_seq_open,
2027         .read           = seq_read,
2028         .llseek         = seq_lseek,
2029         .release        = seq_release_private,
2030 };
2031
2032 #endif
2033
2034 static struct net_proto_family unix_family_ops = {
2035         .family = PF_UNIX,
2036         .create = unix_create,
2037         .owner  = THIS_MODULE,
2038 };
2039
/*
 * Sysctl hooks.  Without CONFIG_SYSCTL they are empty inline stubs, so the
 * init/exit code can call them unconditionally; with it they are external
 * functions defined outside this file.
 */
#ifndef CONFIG_SYSCTL
static inline void unix_sysctl_register(void)
{
}

static inline void unix_sysctl_unregister(void)
{
}
#else
extern void unix_sysctl_register(void);
extern void unix_sysctl_unregister(void);
#endif
2047
2048 static int __init af_unix_init(void)
2049 {
2050         int rc = -1;
2051         struct sk_buff *dummy_skb;
2052
2053         if (sizeof(struct unix_skb_parms) > sizeof(dummy_skb->cb)) {
2054                 printk(KERN_CRIT "%s: panic\n", __FUNCTION__);
2055                 goto out;
2056         }
2057
2058         rc = proto_register(&unix_proto, 1);
2059         if (rc != 0) {
2060                 printk(KERN_CRIT "%s: Cannot create unix_sock SLAB cache!\n",
2061                        __FUNCTION__);
2062                 goto out;
2063         }
2064
2065         sock_register(&unix_family_ops);
2066 #ifdef CONFIG_PROC_FS
2067         proc_net_fops_create("unix", 0, &unix_seq_fops);
2068 #endif
2069         unix_sysctl_register();
2070 out:
2071         return rc;
2072 }
2073
2074 static void __exit af_unix_exit(void)
2075 {
2076         sock_unregister(PF_UNIX);
2077         unix_sysctl_unregister();
2078         proc_net_remove("unix");
2079         proto_unregister(&unix_proto);
2080 }
2081
2082 module_init(af_unix_init);
2083 module_exit(af_unix_exit);
2084
2085 MODULE_LICENSE("GPL");
2086 MODULE_ALIAS_NETPROTO(PF_UNIX);