vserver 1.9.5.x5
[linux-2.6.git] / net / unix / af_unix.c
1 /*
2  * NET4:        Implementation of BSD Unix domain sockets.
3  *
4  * Authors:     Alan Cox, <alan.cox@linux.org>
5  *
6  *              This program is free software; you can redistribute it and/or
7  *              modify it under the terms of the GNU General Public License
8  *              as published by the Free Software Foundation; either version
9  *              2 of the License, or (at your option) any later version.
10  *
11  * Version:     $Id: af_unix.c,v 1.133 2002/02/08 03:57:19 davem Exp $
12  *
13  * Fixes:
14  *              Linus Torvalds  :       Assorted bug cures.
15  *              Niibe Yutaka    :       async I/O support.
16  *              Carsten Paeth   :       PF_UNIX check, address fixes.
17  *              Alan Cox        :       Limit size of allocated blocks.
18  *              Alan Cox        :       Fixed the stupid socketpair bug.
19  *              Alan Cox        :       BSD compatibility fine tuning.
20  *              Alan Cox        :       Fixed a bug in connect when interrupted.
21  *              Alan Cox        :       Sorted out a proper draft version of
22  *                                      file descriptor passing hacked up from
23  *                                      Mike Shaver's work.
24  *              Marty Leisner   :       Fixes to fd passing
25  *              Nick Nevin      :       recvmsg bugfix.
26  *              Alan Cox        :       Started proper garbage collector
27  *              Heiko EiBfeldt  :       Missing verify_area check
28  *              Alan Cox        :       Started POSIXisms
29  *              Andreas Schwab  :       Replace inode by dentry for proper
30  *                                      reference counting
31  *              Kirk Petersen   :       Made this a module
32  *          Christoph Rohland   :       Elegant non-blocking accept/connect algorithm.
33  *                                      Lots of bug fixes.
34  *           Alexey Kuznetsov   :       Repaired (I hope) bugs introduced
35  *                                      by above two patches.
36  *           Andrea Arcangeli   :       If possible we block in connect(2)
37  *                                      if the max backlog of the listen socket
38  *                                      has been reached. This won't break
39  *                                      old apps and it will avoid a huge number
40  *                                      of socks hashed (this is for unix_gc()
41  *                                      performance reasons).
42  *                                      Security fix that limits the max
43  *                                      number of socks to 2*max_files and
44  *                                      the number of skb queueable in the
45  *                                      dgram receiver.
46  *              Artur Skawina   :       Hash function optimizations
47  *           Alexey Kuznetsov   :       Full scale SMP. Lot of bugs are introduced 8)
48  *            Malcolm Beattie   :       Set peercred for socketpair
49  *           Michal Ostrowski   :       Module initialization cleanup.
50  *           Arnaldo C. Melo    :       Remove MOD_{INC,DEC}_USE_COUNT,
51  *                                      the core infrastructure is doing that
52  *                                      for all net proto families now (2.5.69+)
53  *
54  *
55  * Known differences from reference BSD that was tested:
56  *
57  *      [TO FIX]
58  *      ECONNREFUSED is not returned from one end of a connected() socket to the
59  *              other the moment one end closes.
60  *      fstat() doesn't return st_dev=0, and give the blksize as high water mark
61  *              and a fake inode identifier (nor the BSD first socket fstat twice bug).
62  *      [NOT TO FIX]
63  *      accept() returns a path name even if the connecting socket has closed
64  *              in the meantime (BSD loses the path and gives up).
65  *      accept() returns 0 length path for an unbound connector. BSD returns 16
66  *              and a null first byte in the path (but not for gethost/peername - BSD bug ??)
67  *      socketpair(...SOCK_RAW..) doesn't panic the kernel.
68  *      BSD af_unix apparently has connect forgetting to block properly.
69  *              (need to check this with the POSIX spec in detail)
70  *
71  * Differences from 2.0.0-11-... (ANK)
72  *      Bug fixes and improvements.
73  *              - client shutdown killed server socket.
74  *              - removed all useless cli/sti pairs.
75  *
76  *      Semantic changes/extensions.
77  *              - generic control message passing.
78  *              - SCM_CREDENTIALS control message.
79  *              - "Abstract" (not FS based) socket bindings.
80  *                Abstract names are sequences of bytes (not zero terminated)
81  *                started by 0, so that this name space does not intersect
82  *                with BSD names.
83  */
84
85 #include <linux/module.h>
86 #include <linux/config.h>
87 #include <linux/kernel.h>
88 #include <linux/major.h>
89 #include <linux/signal.h>
90 #include <linux/sched.h>
91 #include <linux/errno.h>
92 #include <linux/string.h>
93 #include <linux/stat.h>
94 #include <linux/dcache.h>
95 #include <linux/namei.h>
96 #include <linux/socket.h>
97 #include <linux/un.h>
98 #include <linux/fcntl.h>
99 #include <linux/termios.h>
100 #include <linux/sockios.h>
101 #include <linux/net.h>
102 #include <linux/in.h>
103 #include <linux/fs.h>
104 #include <linux/slab.h>
105 #include <asm/uaccess.h>
106 #include <linux/skbuff.h>
107 #include <linux/netdevice.h>
108 #include <net/sock.h>
109 #include <linux/tcp.h>
110 #include <net/af_unix.h>
111 #include <linux/proc_fs.h>
112 #include <linux/seq_file.h>
113 #include <net/scm.h>
114 #include <linux/init.h>
115 #include <linux/poll.h>
116 #include <linux/smp_lock.h>
117 #include <linux/rtnetlink.h>
118 #include <linux/mount.h>
119 #include <net/checksum.h>
120 #include <linux/security.h>
121 #include <linux/vs_context.h>
122 #include <linux/vs_network.h>
123 #include <linux/vs_limit.h>
124
/* Default queue limit; unix_create1() copies it into sk_max_ack_backlog
 * of every newly created socket. */
125 int sysctl_unix_max_dgram_qlen = 10;
126
/* Slab cache passed to sk_alloc() when a unix socket is allocated. */
127 static kmem_cache_t *unix_sk_cachep;
128
/* Socket hash table. The extra slot at index UNIX_HASH_SIZE is the list of
 * sockets that are not bound yet (see unix_sockets_unbound below). */
129 struct hlist_head unix_socket_table[UNIX_HASH_SIZE + 1];
/* Protects unix_socket_table. */
130 DEFINE_RWLOCK(unix_table_lock);
/* Number of allocated unix sockets: incremented in unix_create1(),
 * decremented in unix_sock_destructor(), and compared against
 * 2*files_stat.max_files before a new socket is allocated. */
131 static atomic_t unix_nr_socks = ATOMIC_INIT(0);
132
/* List head for sockets that have no address yet. */
133 #define unix_sockets_unbound    (&unix_socket_table[UNIX_HASH_SIZE])
134
/* True when the bound address is an abstract name. unix_bind() stores
 * UNIX_HASH_SIZE as the hash of filesystem names. Dereferences addr, so
 * the socket must already be bound. */
135 #define UNIX_ABSTRACT(sk)       (unix_sk(sk)->addr->hash != UNIX_HASH_SIZE)
136
137 /*
138  *  SMP locking strategy:
139  *    hash table is protected with rwlock unix_table_lock
140  *    each socket state is protected by separate rwlock.
141  */
142
143 static inline unsigned unix_hash_fold(unsigned hash)
144 {
145         hash ^= hash>>16;
146         hash ^= hash>>8;
147         return hash&(UNIX_HASH_SIZE-1);
148 }
149
150 #define unix_peer(sk) (unix_sk(sk)->peer)
151
152 static inline int unix_our_peer(struct sock *sk, struct sock *osk)
153 {
154         return unix_peer(osk) == sk;
155 }
156
157 static inline int unix_may_send(struct sock *sk, struct sock *osk)
158 {
159         return (unix_peer(osk) == NULL || unix_our_peer(sk, osk));
160 }
161
162 static struct sock *unix_peer_get(struct sock *s)
163 {
164         struct sock *peer;
165
166         unix_state_rlock(s);
167         peer = unix_peer(s);
168         if (peer)
169                 sock_hold(peer);
170         unix_state_runlock(s);
171         return peer;
172 }
173
174 static inline void unix_release_addr(struct unix_address *addr)
175 {
176         if (atomic_dec_and_test(&addr->refcnt))
177                 kfree(addr);
178 }
179
/*
 *	Validate a unix socket name and work out its effective length:
 *		- len must be larger than sizeof(short) and must not exceed
 *		  sizeof(struct sockaddr_un);
 *		- the family must be AF_UNIX;
 *		- if the first path byte is non-zero the name is a FS object:
 *		  it is zero terminated in place at offset len and the length
 *		  is recomputed from the resulting string;
 *		- if the first path byte is zero it is an abstract name and
 *		  *hashp receives the folded checksum of the whole address.
 *
 *	Returns the length to use, or -EINVAL. *hashp is not written for
 *	FS names.
 */
static int unix_mkname(struct sockaddr_un * sunaddr, int len, unsigned *hashp)
{
	if (len <= sizeof(short) || len > sizeof(*sunaddr) ||
	    !sunaddr || sunaddr->sun_family != AF_UNIX)
		return -EINVAL;

	if (!sunaddr->sun_path[0]) {
		/* Abstract name: hash every byte of the address. */
		*hashp = unix_hash_fold(csum_partial((char*)sunaddr, len, 0));
		return len;
	}

	/* FS name: force termination, then measure the string. */
	((char *)sunaddr)[len] = 0;
	return strlen(sunaddr->sun_path) + 1 + sizeof(short);
}
202
/*
 * Unlink sk from whichever hash list it is on. Takes no lock itself;
 * the callers in this file hold unix_table_lock for writing.
 */
static void __unix_remove_socket(struct sock *sk)
{
	sk_del_node_init(sk);
}
207
/*
 * Link sk onto list. sk is expected to be unhashed (BUG_TRAP otherwise).
 * Takes no lock itself; the callers in this file hold unix_table_lock
 * for writing.
 */
static void __unix_insert_socket(struct hlist_head *list, struct sock *sk)
{
	BUG_TRAP(sk_unhashed(sk));

	sk_add_node(sk, list);
}
213
214 static inline void unix_remove_socket(struct sock *sk)
215 {
216         write_lock(&unix_table_lock);
217         __unix_remove_socket(sk);
218         write_unlock(&unix_table_lock);
219 }
220
221 static inline void unix_insert_socket(struct hlist_head *list, struct sock *sk)
222 {
223         write_lock(&unix_table_lock);
224         __unix_insert_socket(list, sk);
225         write_unlock(&unix_table_lock);
226 }
227
228 static struct sock *__unix_find_socket_byname(struct sockaddr_un *sunname,
229                                               int len, int type, unsigned hash)
230 {
231         struct sock *s;
232         struct hlist_node *node;
233
234         sk_for_each(s, node, &unix_socket_table[hash ^ type]) {
235                 struct unix_sock *u = unix_sk(s);
236
237                 if (u->addr->len == len &&
238                     !memcmp(u->addr->name, sunname, len))
239                         goto found;
240         }
241         s = NULL;
242 found:
243         return s;
244 }
245
246 static inline struct sock *unix_find_socket_byname(struct sockaddr_un *sunname,
247                                                    int len, int type,
248                                                    unsigned hash)
249 {
250         struct sock *s;
251
252         read_lock(&unix_table_lock);
253         s = __unix_find_socket_byname(sunname, len, type, hash);
254         if (s)
255                 sock_hold(s);
256         read_unlock(&unix_table_lock);
257         return s;
258 }
259
260 static struct sock *unix_find_socket_byinode(struct inode *i)
261 {
262         struct sock *s;
263         struct hlist_node *node;
264
265         read_lock(&unix_table_lock);
266         sk_for_each(s, node,
267                     &unix_socket_table[i->i_ino & (UNIX_HASH_SIZE - 1)]) {
268                 struct dentry *dentry = unix_sk(s)->dentry;
269
270                 if(dentry && dentry->d_inode == i)
271                 {
272                         sock_hold(s);
273                         goto found;
274                 }
275         }
276         s = NULL;
277 found:
278         read_unlock(&unix_table_lock);
279         return s;
280 }
281
282 static inline int unix_writable(struct sock *sk)
283 {
284         return (atomic_read(&sk->sk_wmem_alloc) << 2) <= sk->sk_sndbuf;
285 }
286
287 static void unix_write_space(struct sock *sk)
288 {
289         read_lock(&sk->sk_callback_lock);
290         if (unix_writable(sk)) {
291                 if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
292                         wake_up_interruptible(sk->sk_sleep);
293                 sk_wake_async(sk, 2, POLL_OUT);
294         }
295         read_unlock(&sk->sk_callback_lock);
296 }
297
298 /* When dgram socket disconnects (or changes its peer), we clear its receive
299  * queue of packets arrived from previous peer. First, it allows to do
300  * flow control based only on wmem_alloc; second, sk connected to peer
301  * may receive messages only from that peer. */
302 static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
303 {
304         if (skb_queue_len(&sk->sk_receive_queue)) {
305                 skb_queue_purge(&sk->sk_receive_queue);
306                 wake_up_interruptible_all(&unix_sk(sk)->peer_wait);
307
308                 /* If one link of bidirectional dgram pipe is disconnected,
309                  * we signal error. Messages are lost. Do not make this,
310                  * when peer was not connected to us.
311                  */
312                 if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) {
313                         other->sk_err = ECONNRESET;
314                         other->sk_error_report(other);
315                 }
316         }
317 }
318
319 static void unix_sock_destructor(struct sock *sk)
320 {
321         struct unix_sock *u = unix_sk(sk);
322
323         skb_queue_purge(&sk->sk_receive_queue);
324
325         BUG_TRAP(!atomic_read(&sk->sk_wmem_alloc));
326         BUG_TRAP(sk_unhashed(sk));
327         BUG_TRAP(!sk->sk_socket);
328         if (!sock_flag(sk, SOCK_DEAD)) {
329                 printk("Attempt to release alive unix socket: %p\n", sk);
330                 return;
331         }
332
333         if (u->addr)
334                 unix_release_addr(u->addr);
335
336         atomic_dec(&unix_nr_socks);
337 #ifdef UNIX_REFCNT_DEBUG
338         printk(KERN_DEBUG "UNIX %p is destroyed, %d are still alive.\n", sk, atomic_read(&unix_nr_socks));
339 #endif
340 }
341
342 static int unix_release_sock (struct sock *sk, int embrion)
343 {
344         struct unix_sock *u = unix_sk(sk);
345         struct dentry *dentry;
346         struct vfsmount *mnt;
347         struct sock *skpair;
348         struct sk_buff *skb;
349         int state;
350
351         unix_remove_socket(sk);
352
353         /* Clear state */
354         unix_state_wlock(sk);
355         sock_orphan(sk);
356         sk->sk_shutdown = SHUTDOWN_MASK;
357         dentry       = u->dentry;
358         u->dentry    = NULL;
359         mnt          = u->mnt;
360         u->mnt       = NULL;
361         state = sk->sk_state;
362         sk->sk_state = TCP_CLOSE;
363         unix_state_wunlock(sk);
364
365         wake_up_interruptible_all(&u->peer_wait);
366
367         skpair=unix_peer(sk);
368
369         if (skpair!=NULL) {
370                 if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) {
371                         unix_state_wlock(skpair);
372                         /* No more writes */
373                         skpair->sk_shutdown = SHUTDOWN_MASK;
374                         if (!skb_queue_empty(&sk->sk_receive_queue) || embrion)
375                                 skpair->sk_err = ECONNRESET;
376                         unix_state_wunlock(skpair);
377                         skpair->sk_state_change(skpair);
378                         read_lock(&skpair->sk_callback_lock);
379                         sk_wake_async(skpair,1,POLL_HUP);
380                         read_unlock(&skpair->sk_callback_lock);
381                 }
382                 sock_put(skpair); /* It may now die */
383                 unix_peer(sk) = NULL;
384         }
385
386         /* Try to flush out this socket. Throw out buffers at least */
387
388         while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
389                 if (state==TCP_LISTEN)
390                         unix_release_sock(skb->sk, 1);
391                 /* passed fds are erased in the kfree_skb hook        */
392                 kfree_skb(skb);
393         }
394
395         if (dentry) {
396                 dput(dentry);
397                 mntput(mnt);
398         }
399
400         sock_put(sk);
401
402         /* ---- Socket is dead now and most probably destroyed ---- */
403
404         /*
405          * Fixme: BSD difference: In BSD all sockets connected to use get
406          *        ECONNRESET and we die on the spot. In Linux we behave
407          *        like files and pipes do and wait for the last
408          *        dereference.
409          *
410          * Can't we simply set sock->err?
411          *
412          *        What the above comment does talk about? --ANK(980817)
413          */
414
415         if (atomic_read(&unix_tot_inflight))
416                 unix_gc();              /* Garbage collect fds */       
417
418         return 0;
419 }
420
421 static int unix_listen(struct socket *sock, int backlog)
422 {
423         int err;
424         struct sock *sk = sock->sk;
425         struct unix_sock *u = unix_sk(sk);
426
427         err = -EOPNOTSUPP;
428         if (sock->type!=SOCK_STREAM && sock->type!=SOCK_SEQPACKET)
429                 goto out;                       /* Only stream/seqpacket sockets accept */
430         err = -EINVAL;
431         if (!u->addr)
432                 goto out;                       /* No listens on an unbound socket */
433         unix_state_wlock(sk);
434         if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN)
435                 goto out_unlock;
436         if (backlog > sk->sk_max_ack_backlog)
437                 wake_up_interruptible_all(&u->peer_wait);
438         sk->sk_max_ack_backlog  = backlog;
439         sk->sk_state            = TCP_LISTEN;
440         /* set credentials so connect can copy them */
441         sk->sk_peercred.pid     = current->tgid;
442         sk->sk_peercred.uid     = current->euid;
443         sk->sk_peercred.gid     = current->egid;
444         err = 0;
445
446 out_unlock:
447         unix_state_wunlock(sk);
448 out:
449         return err;
450 }
451
452 static int unix_release(struct socket *);
453 static int unix_bind(struct socket *, struct sockaddr *, int);
454 static int unix_stream_connect(struct socket *, struct sockaddr *,
455                                int addr_len, int flags);
456 static int unix_socketpair(struct socket *, struct socket *);
457 static int unix_accept(struct socket *, struct socket *, int);
458 static int unix_getname(struct socket *, struct sockaddr *, int *, int);
459 static unsigned int unix_poll(struct file *, struct socket *, poll_table *);
460 static int unix_ioctl(struct socket *, unsigned int, unsigned long);
461 static int unix_shutdown(struct socket *, int);
462 static int unix_stream_sendmsg(struct kiocb *, struct socket *,
463                                struct msghdr *, size_t);
464 static int unix_stream_recvmsg(struct kiocb *, struct socket *,
465                                struct msghdr *, size_t, int);
466 static int unix_dgram_sendmsg(struct kiocb *, struct socket *,
467                               struct msghdr *, size_t);
468 static int unix_dgram_recvmsg(struct kiocb *, struct socket *,
469                               struct msghdr *, size_t, int);
470 static int unix_dgram_connect(struct socket *, struct sockaddr *,
471                               int, int);
472 static int unix_seqpacket_sendmsg(struct kiocb *, struct socket *,
473                                   struct msghdr *, size_t);
474
475 static struct proto_ops unix_stream_ops = {
476         .family =       PF_UNIX,
477         .owner =        THIS_MODULE,
478         .release =      unix_release,
479         .bind =         unix_bind,
480         .connect =      unix_stream_connect,
481         .socketpair =   unix_socketpair,
482         .accept =       unix_accept,
483         .getname =      unix_getname,
484         .poll =         unix_poll,
485         .ioctl =        unix_ioctl,
486         .listen =       unix_listen,
487         .shutdown =     unix_shutdown,
488         .setsockopt =   sock_no_setsockopt,
489         .getsockopt =   sock_no_getsockopt,
490         .sendmsg =      unix_stream_sendmsg,
491         .recvmsg =      unix_stream_recvmsg,
492         .mmap =         sock_no_mmap,
493         .sendpage =     sock_no_sendpage,
494 };
495
496 static struct proto_ops unix_dgram_ops = {
497         .family =       PF_UNIX,
498         .owner =        THIS_MODULE,
499         .release =      unix_release,
500         .bind =         unix_bind,
501         .connect =      unix_dgram_connect,
502         .socketpair =   unix_socketpair,
503         .accept =       sock_no_accept,
504         .getname =      unix_getname,
505         .poll =         datagram_poll,
506         .ioctl =        unix_ioctl,
507         .listen =       sock_no_listen,
508         .shutdown =     unix_shutdown,
509         .setsockopt =   sock_no_setsockopt,
510         .getsockopt =   sock_no_getsockopt,
511         .sendmsg =      unix_dgram_sendmsg,
512         .recvmsg =      unix_dgram_recvmsg,
513         .mmap =         sock_no_mmap,
514         .sendpage =     sock_no_sendpage,
515 };
516
517 static struct proto_ops unix_seqpacket_ops = {
518         .family =       PF_UNIX,
519         .owner =        THIS_MODULE,
520         .release =      unix_release,
521         .bind =         unix_bind,
522         .connect =      unix_stream_connect,
523         .socketpair =   unix_socketpair,
524         .accept =       unix_accept,
525         .getname =      unix_getname,
526         .poll =         datagram_poll,
527         .ioctl =        unix_ioctl,
528         .listen =       unix_listen,
529         .shutdown =     unix_shutdown,
530         .setsockopt =   sock_no_setsockopt,
531         .getsockopt =   sock_no_getsockopt,
532         .sendmsg =      unix_seqpacket_sendmsg,
533         .recvmsg =      unix_dgram_recvmsg,
534         .mmap =         sock_no_mmap,
535         .sendpage =     sock_no_sendpage,
536 };
537
538 static struct sock * unix_create1(struct socket *sock)
539 {
540         struct sock *sk = NULL;
541         struct unix_sock *u;
542
543         if (atomic_read(&unix_nr_socks) >= 2*files_stat.max_files)
544                 goto out;
545
546         sk = sk_alloc(PF_UNIX, GFP_KERNEL, sizeof(struct unix_sock),
547                       unix_sk_cachep);
548         if (!sk)
549                 goto out;
550
551         atomic_inc(&unix_nr_socks);
552
553         sock_init_data(sock,sk);
554         sk_set_owner(sk, THIS_MODULE);
555
556         sk->sk_write_space      = unix_write_space;
557         sk->sk_max_ack_backlog  = sysctl_unix_max_dgram_qlen;
558         sk->sk_destruct         = unix_sock_destructor;
559         u         = unix_sk(sk);
560         u->dentry = NULL;
561         u->mnt    = NULL;
562         rwlock_init(&u->lock);
563         atomic_set(&u->inflight, sock ? 0 : -1);
564         init_MUTEX(&u->readsem); /* single task reading lock */
565         init_waitqueue_head(&u->peer_wait);
566         unix_insert_socket(unix_sockets_unbound, sk);
567 out:
568         return sk;
569 }
570
571 static int unix_create(struct socket *sock, int protocol)
572 {
573         if (protocol && protocol != PF_UNIX)
574                 return -EPROTONOSUPPORT;
575
576         sock->state = SS_UNCONNECTED;
577
578         switch (sock->type) {
579         case SOCK_STREAM:
580                 sock->ops = &unix_stream_ops;
581                 break;
582                 /*
583                  *      Believe it or not BSD has AF_UNIX, SOCK_RAW though
584                  *      nothing uses it.
585                  */
586         case SOCK_RAW:
587                 sock->type=SOCK_DGRAM;
588         case SOCK_DGRAM:
589                 sock->ops = &unix_dgram_ops;
590                 break;
591         case SOCK_SEQPACKET:
592                 sock->ops = &unix_seqpacket_ops;
593                 break;
594         default:
595                 return -ESOCKTNOSUPPORT;
596         }
597
598         return unix_create1(sock) ? 0 : -ENOMEM;
599 }
600
601 static int unix_release(struct socket *sock)
602 {
603         struct sock *sk = sock->sk;
604
605         if (!sk)
606                 return 0;
607
608         sock->sk = NULL;
609
610         return unix_release_sock (sk, 0);
611 }
612
613 static int unix_autobind(struct socket *sock)
614 {
615         struct sock *sk = sock->sk;
616         struct unix_sock *u = unix_sk(sk);
617         static u32 ordernum = 1;
618         struct unix_address * addr;
619         int err;
620
621         down(&u->readsem);
622
623         err = 0;
624         if (u->addr)
625                 goto out;
626
627         err = -ENOMEM;
628         addr = kmalloc(sizeof(*addr) + sizeof(short) + 16, GFP_KERNEL);
629         if (!addr)
630                 goto out;
631
632         memset(addr, 0, sizeof(*addr) + sizeof(short) + 16);
633         addr->name->sun_family = AF_UNIX;
634         atomic_set(&addr->refcnt, 1);
635
636 retry:
637         addr->len = sprintf(addr->name->sun_path+1, "%05x", ordernum) + 1 + sizeof(short);
638         addr->hash = unix_hash_fold(csum_partial((void*)addr->name, addr->len, 0));
639
640         write_lock(&unix_table_lock);
641         ordernum = (ordernum+1)&0xFFFFF;
642
643         if (__unix_find_socket_byname(addr->name, addr->len, sock->type,
644                                       addr->hash)) {
645                 write_unlock(&unix_table_lock);
646                 /* Sanity yield. It is unusual case, but yet... */
647                 if (!(ordernum&0xFF))
648                         yield();
649                 goto retry;
650         }
651         addr->hash ^= sk->sk_type;
652
653         __unix_remove_socket(sk);
654         u->addr = addr;
655         __unix_insert_socket(&unix_socket_table[addr->hash], sk);
656         write_unlock(&unix_table_lock);
657         err = 0;
658
659 out:    up(&u->readsem);
660         return err;
661 }
662
/*
 * Look up the socket addressed by sunname.
 *
 * Abstract names (first path byte zero) are resolved through the name
 * hash. Filesystem names are resolved through a path lookup; the caller
 * needs write permission on the inode, the inode must be a socket, and
 * the socket bound to it must be of the requested type.
 *
 * Returns the socket with a reference held. On failure returns NULL and
 * stores the error in *error: the lookup or permission error,
 * -ECONNREFUSED when nothing is bound there, or -EPROTOTYPE on a type
 * mismatch. *error is not written on success.
 */
static struct sock *unix_find_other(struct sockaddr_un *sunname, int len,
				    int type, unsigned hash, int *error)
{
	struct sock *other;
	struct nameidata nd;
	int err;

	if (!sunname->sun_path[0]) {
		struct dentry *dentry;

		other = unix_find_socket_byname(sunname, len, type, hash);
		if (!other) {
			err = -ECONNREFUSED;
			goto fail;
		}
		dentry = unix_sk(other)->dentry;
		if (dentry)
			touch_atime(unix_sk(other)->mnt, dentry);
		return other;
	}

	err = path_lookup(sunname->sun_path, LOOKUP_FOLLOW, &nd);
	if (err)
		goto fail;

	err = permission(nd.dentry->d_inode, MAY_WRITE, &nd);
	if (err)
		goto put_fail;

	err = -ECONNREFUSED;
	if (!S_ISSOCK(nd.dentry->d_inode->i_mode))
		goto put_fail;

	other = unix_find_socket_byinode(nd.dentry->d_inode);
	if (!other)
		goto put_fail;

	if (other->sk_type != type) {
		/* Wrong kind of socket: drop the path, then the socket. */
		path_release(&nd);
		sock_put(other);
		err = -EPROTOTYPE;
		goto fail;
	}

	touch_atime(nd.mnt, nd.dentry);
	path_release(&nd);
	return other;

put_fail:
	path_release(&nd);
fail:
	*error = err;
	return NULL;
}
714
715
/*
 * bind(2) for unix sockets.
 *
 * An address that consists of the family only (addr_len == sizeof(short))
 * is handed to unix_autobind(). Otherwise the name is validated by
 * unix_mkname() and, under u->readsem:
 *   - an already bound socket fails with -EINVAL;
 *   - a filesystem name (first path byte non-zero) gets a socket inode
 *     created with vfs_mknod() in the parent directory; the socket then
 *     keeps the new dentry and the mount, and is hashed by inode number;
 *   - an abstract name is checked for duplicates under unix_table_lock
 *     (-EADDRINUSE) and hashed by name.
 * Finally the socket is moved from its current list to the new one.
 *
 * Returns 0 on success or a negative errno. -EEXIST from the filesystem
 * part is reported as -EADDRINUSE.
 */
716 static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
717 {
718         struct sock *sk = sock->sk;
719         struct unix_sock *u = unix_sk(sk);
720         struct sockaddr_un *sunaddr=(struct sockaddr_un *)uaddr;
721         struct dentry * dentry = NULL;
722         struct nameidata nd;
723         int err;
724         unsigned hash;
725         struct unix_address *addr;
726         struct hlist_head *list;
727
728         err = -EINVAL;
729         if (sunaddr->sun_family != AF_UNIX)
730                 goto out;
731
/* Family only, no name: let the kernel pick an abstract name. */
732         if (addr_len==sizeof(short)) {
733                 err = unix_autobind(sock);
734                 goto out;
735         }
736
/* On success unix_mkname() returns the effective address length. */
737         err = unix_mkname(sunaddr, addr_len, &hash);
738         if (err < 0)
739                 goto out;
740         addr_len = err;
741
742         down(&u->readsem);
743
/* A socket can be bound only once. */
744         err = -EINVAL;
745         if (u->addr)
746                 goto out_up;
747
748         err = -ENOMEM;
749         addr = kmalloc(sizeof(*addr)+addr_len, GFP_KERNEL);
750         if (!addr)
751                 goto out_up;
752
753         memcpy(addr->name, sunaddr, addr_len);
754         addr->len = addr_len;
/* NOTE(review): unix_mkname() writes hash only for abstract names; for
 * filesystem names this value is replaced by UNIX_HASH_SIZE below. */
755         addr->hash = hash ^ sk->sk_type;
756         atomic_set(&addr->refcnt, 1);
757
758         if (sunaddr->sun_path[0]) {
759                 unsigned int mode;
760                 err = 0;
761                 /*
762                  * Get the parent directory, calculate the hash for last
763                  * component.
764                  */
765                 err = path_lookup(sunaddr->sun_path, LOOKUP_PARENT, &nd);
766                 if (err)
767                         goto out_mknod_parent;
768                 /*
769                  * Yucky last component or no last component at all?
770                  * (foo/., foo/.., /////)
771                  */
772                 err = -EEXIST;
773                 if (nd.last_type != LAST_NORM)
774                         goto out_mknod;
775                 /*
776                  * Lock the directory.
777                  */
778                 down(&nd.dentry->d_inode->i_sem);
779                 /*
780                  * Do the final lookup.
781                  */
782                 dentry = lookup_hash(&nd.last, nd.dentry);
783                 err = PTR_ERR(dentry);
784                 if (IS_ERR(dentry))
785                         goto out_mknod_unlock;
786                 err = -ENOENT;
787                 /*
788                  * Special case - lookup gave negative, but... we had foo/bar/
789                  * From the vfs_mknod() POV we just have a negative dentry -
790                  * all is fine. Let's be bastards - you had / on the end, you've
791                  * been asking for (non-existent) directory. -ENOENT for you.
792                  */
793                 if (nd.last.name[nd.last.len] && !dentry->d_inode)
794                         goto out_mknod_dput;
795                 /*
796                  * All right, let's create it.
797                  */
798                 mode = S_IFSOCK |
799                        (SOCK_INODE(sock)->i_mode & ~current->fs->umask);
800                 err = vfs_mknod(nd.dentry->d_inode, dentry, mode, 0);
801                 if (err)
802                         goto out_mknod_dput;
803                 up(&nd.dentry->d_inode->i_sem);
/* Swap the parent dentry held in nd for the new socket dentry, so that
 * nd.dentry/nd.mnt are what gets stored in the socket below. */
804                 dput(nd.dentry);
805                 nd.dentry = dentry;
806
/* UNIX_HASH_SIZE as hash marks a filesystem name (see UNIX_ABSTRACT). */
807                 addr->hash = UNIX_HASH_SIZE;
808         }
809
810         write_lock(&unix_table_lock);
811
812         if (!sunaddr->sun_path[0]) {
/* Abstract name: it must not be in use by a socket of the same type. */
813                 err = -EADDRINUSE;
814                 if (__unix_find_socket_byname(sunaddr, addr_len,
815                                               sk->sk_type, hash)) {
816                         unix_release_addr(addr);
817                         goto out_unlock;
818                 }
819
820                 list = &unix_socket_table[addr->hash];
821         } else {
/* Filesystem name: same bucket choice as unix_find_socket_byinode(). */
822                 list = &unix_socket_table[dentry->d_inode->i_ino & (UNIX_HASH_SIZE-1)];
823                 u->dentry = nd.dentry;
824                 u->mnt    = nd.mnt;
825         }
826
/* Move the socket from its current list to the chosen one. */
827         err = 0;
828         __unix_remove_socket(sk);
829         u->addr = addr;
830         __unix_insert_socket(list, sk);
831
832 out_unlock:
833         write_unlock(&unix_table_lock);
834 out_up:
835         up(&u->readsem);
836 out:
837         return err;
838
/* Error unwinding for the filesystem part: each label undoes one step
 * and falls through to the next; the address is released at the end. */
839 out_mknod_dput:
840         dput(dentry);
841 out_mknod_unlock:
842         up(&nd.dentry->d_inode->i_sem);
843 out_mknod:
844         path_release(&nd);
845 out_mknod_parent:
846         if (err==-EEXIST)
847                 err=-EADDRINUSE;
848         unix_release_addr(addr);
849         goto out_up;
850 }
851
852 static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
853                               int alen, int flags)
854 {
855         struct sock *sk = sock->sk;
856         struct sockaddr_un *sunaddr=(struct sockaddr_un*)addr;
857         struct sock *other;
858         unsigned hash;
859         int err;
860
861         if (addr->sa_family != AF_UNSPEC) {
862                 err = unix_mkname(sunaddr, alen, &hash);
863                 if (err < 0)
864                         goto out;
865                 alen = err;
866
867                 if (test_bit(SOCK_PASS_CRED, &sock->flags) && !unix_sk(sk)->addr &&
868                     (err = unix_autobind(sock)) != 0)
869                         goto out;
870
871                 other=unix_find_other(sunaddr, alen, sock->type, hash, &err);
872                 if (!other)
873                         goto out;
874
875                 unix_state_wlock(sk);
876
877                 err = -EPERM;
878                 if (!unix_may_send(sk, other))
879                         goto out_unlock;
880
881                 err = security_unix_may_send(sk->sk_socket, other->sk_socket);
882                 if (err)
883                         goto out_unlock;
884
885         } else {
886                 /*
887                  *      1003.1g breaking connected state with AF_UNSPEC
888                  */
889                 other = NULL;
890                 unix_state_wlock(sk);
891         }
892
893         /*
894          * If it was connected, reconnect.
895          */
896         if (unix_peer(sk)) {
897                 struct sock *old_peer = unix_peer(sk);
898                 unix_peer(sk)=other;
899                 unix_state_wunlock(sk);
900
901                 if (other != old_peer)
902                         unix_dgram_disconnected(sk, old_peer);
903                 sock_put(old_peer);
904         } else {
905                 unix_peer(sk)=other;
906                 unix_state_wunlock(sk);
907         }
908         return 0;
909
910 out_unlock:
911         unix_state_wunlock(sk);
912         sock_put(other);
913 out:
914         return err;
915 }
916
917 static long unix_wait_for_peer(struct sock *other, long timeo)
918 {
919         struct unix_sock *u = unix_sk(other);
920         int sched;
921         DEFINE_WAIT(wait);
922
923         prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE);
924
925         sched = !sock_flag(other, SOCK_DEAD) &&
926                 !(other->sk_shutdown & RCV_SHUTDOWN) &&
927                 (skb_queue_len(&other->sk_receive_queue) >
928                  other->sk_max_ack_backlog);
929
930         unix_state_runlock(other);
931
932         if (sched)
933                 timeo = schedule_timeout(timeo);
934
935         finish_wait(&u->peer_wait, &wait);
936         return timeo;
937 }
938
/*
 *      unix_stream_connect - connect @sock to a listening AF_UNIX socket.
 *
 *      @sock:     socket to connect
 *      @uaddr:    destination address, interpreted as a struct sockaddr_un
 *      @addr_len: caller-supplied length of @uaddr
 *      @flags:    only O_NONBLOCK is looked at; it is handed to
 *                 sock_sndtimeo() to obtain the wait timeout
 *
 *      The new server-side sock and a one-byte skb are allocated up front,
 *      before any state lock is taken. The listener is then looked up and
 *      read-locked, our own state is checked and write-locked, the two
 *      socks are cross-linked as peers and the skb is queued on the
 *      listener's receive queue (unix_accept() takes skb->sk from it).
 *
 *      Returns 0 on success or a negative errno: whatever unix_mkname(),
 *      unix_autobind(), unix_find_other() or the security hook reported,
 *      -ENOMEM, -ECONNREFUSED, -EAGAIN, -EISCONN, -EINVAL, or
 *      sock_intr_errno() when a signal interrupts the backlog wait.
 */
939 static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
940                                int addr_len, int flags)
941 {
942         struct sockaddr_un *sunaddr=(struct sockaddr_un *)uaddr;
943         struct sock *sk = sock->sk;
944         struct unix_sock *u = unix_sk(sk), *newu, *otheru;
945         struct sock *newsk = NULL;
946         struct sock *other = NULL;
947         struct sk_buff *skb = NULL;
948         unsigned hash;
949         int st;
950         int err;
951         long timeo;
952
953         err = unix_mkname(sunaddr, addr_len, &hash);
954         if (err < 0)
955                 goto out;
956         addr_len = err;
957
958         if (test_bit(SOCK_PASS_CRED, &sock->flags)
959                 && !u->addr && (err = unix_autobind(sock)) != 0)
960                 goto out;
961
962         timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);
963
964         /* First of all allocate resources.
965            If we will make it after state is locked,
966            we will have to recheck all again in any case.
967          */
968
969         err = -ENOMEM;
970
971         /* create new sock for complete connection */
972         newsk = unix_create1(NULL);
973         if (newsk == NULL)
974                 goto out;
975
976         /* Allocate skb for sending to listening sock */
977         skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
978         if (skb == NULL)
979                 goto out;
980
981 restart:
982         /*  Find listening sock. */
983         other = unix_find_other(sunaddr, addr_len, sk->sk_type, hash, &err);
984         if (!other)
985                 goto out;
986
987         /* Latch state of peer */
988         unix_state_rlock(other);
989
990         /* Apparently VFS overslept socket death. Retry. */
991         if (sock_flag(other, SOCK_DEAD)) {
992                 unix_state_runlock(other);
993                 sock_put(other);
994                 goto restart;
995         }
996
997         err = -ECONNREFUSED;
998         if (other->sk_state != TCP_LISTEN)
999                 goto out_unlock;
1000
1001         if (skb_queue_len(&other->sk_receive_queue) >
1002             other->sk_max_ack_backlog) {
1003                 err = -EAGAIN;
1004                 if (!timeo)
1005                         goto out_unlock;
1006
                /* unix_wait_for_peer() drops the read lock on other, so
                 * the exits below go to "out" / "restart", not "out_unlock".
                 */
1007                 timeo = unix_wait_for_peer(other, timeo);
1008
1009                 err = sock_intr_errno(timeo);
1010                 if (signal_pending(current))
1011                         goto out;
1012                 sock_put(other);
1013                 goto restart;
1014         }
1015
1016         /* Latch our state.
1017
1018            It is a tricky place. We need to grab the write lock and cannot
1019            drop the lock on the peer. It is dangerous because deadlock is
1020            possible. The connect-to-self case and simultaneous
1021            attempts to connect are eliminated by checking socket
1022            state. other is TCP_LISTEN, if sk is TCP_LISTEN we
1023            check this before attempting to grab the lock.
1024
1025            Well, and we have to recheck the state after the socket is locked.
1026          */
1027         st = sk->sk_state;
1028
1029         switch (st) {
1030         case TCP_CLOSE:
1031                 /* This is ok... continue with connect */
1032                 break;
1033         case TCP_ESTABLISHED:
1034                 /* Socket is already connected */
1035                 err = -EISCONN;
1036                 goto out_unlock;
1037         default:
1038                 err = -EINVAL;
1039                 goto out_unlock;
1040         }
1041
1042         unix_state_wlock(sk);
1043
        /* State changed between the unlocked check and taking the lock:
         * drop everything and start over.
         */
1044         if (sk->sk_state != st) {
1045                 unix_state_wunlock(sk);
1046                 unix_state_runlock(other);
1047                 sock_put(other);
1048                 goto restart;
1049         }
1050
1051         err = security_unix_stream_connect(sock, other->sk_socket, newsk);
1052         if (err) {
1053                 unix_state_wunlock(sk);
1054                 goto out_unlock;
1055         }
1056
1057         /* The way is open! Quickly set all the necessary fields... */
1058
1059         sock_hold(sk);
1060         unix_peer(newsk)        = sk;
1061         newsk->sk_state         = TCP_ESTABLISHED;
1062         newsk->sk_type          = sk->sk_type;
1063         newsk->sk_peercred.pid  = current->tgid;
1064         newsk->sk_peercred.uid  = current->euid;
1065         newsk->sk_peercred.gid  = current->egid;
1066         newu = unix_sk(newsk);
1067         newsk->sk_sleep         = &newu->peer_wait;
1068         otheru = unix_sk(other);
1069
1070         /* copy address information from listening to new sock*/
1071         if (otheru->addr) {
1072                 atomic_inc(&otheru->addr->refcnt);
1073                 newu->addr = otheru->addr;
1074         }
1075         if (otheru->dentry) {
1076                 newu->dentry    = dget(otheru->dentry);
1077                 newu->mnt       = mntget(otheru->mnt);
1078         }
1079
1080         /* Set credentials */
1081         sk->sk_peercred = other->sk_peercred;
1082
1083         sock_hold(newsk);
1084         unix_peer(sk)   = newsk;
1085         sock->state     = SS_CONNECTED;
1086         sk->sk_state    = TCP_ESTABLISHED;
1087
1088         unix_state_wunlock(sk);
1089
1090         /* take ten and send info to listening sock */
1091         spin_lock(&other->sk_receive_queue.lock);
1092         __skb_queue_tail(&other->sk_receive_queue, skb);
1093         /* Undo artificially decreased inflight after embryo
1094          * is installed to listening socket. */
1095         atomic_inc(&newu->inflight);
1096         spin_unlock(&other->sk_receive_queue.lock);
1097         unix_state_runlock(other);
1098         other->sk_data_ready(other, 0);
1099         sock_put(other);
1100         return 0;
1101
        /* out_unlock: read lock on other is still held.
         * out: no lock held; free whatever was allocated or referenced.
         */
1102 out_unlock:
1103         if (other)
1104                 unix_state_runlock(other);
1105
1106 out:
1107         if (skb)
1108                 kfree_skb(skb);
1109         if (newsk)
1110                 unix_release_sock(newsk, 0);
1111         if (other)
1112                 sock_put(other);
1113         return err;
1114 }
1115
1116 static int unix_socketpair(struct socket *socka, struct socket *sockb)
1117 {
1118         struct sock *ska=socka->sk, *skb = sockb->sk;
1119
1120         /* Join our sockets back to back */
1121         sock_hold(ska);
1122         sock_hold(skb);
1123         unix_peer(ska)=skb;
1124         unix_peer(skb)=ska;
1125         ska->sk_peercred.pid = skb->sk_peercred.pid = current->tgid;
1126         ska->sk_peercred.uid = skb->sk_peercred.uid = current->euid;
1127         ska->sk_peercred.gid = skb->sk_peercred.gid = current->egid;
1128
1129         if (ska->sk_type != SOCK_DGRAM) {
1130                 ska->sk_state = TCP_ESTABLISHED;
1131                 skb->sk_state = TCP_ESTABLISHED;
1132                 socka->state  = SS_CONNECTED;
1133                 sockb->state  = SS_CONNECTED;
1134         }
1135         return 0;
1136 }
1137
1138 static int unix_accept(struct socket *sock, struct socket *newsock, int flags)
1139 {
1140         struct sock *sk = sock->sk;
1141         struct sock *tsk;
1142         struct sk_buff *skb;
1143         int err;
1144
1145         err = -EOPNOTSUPP;
1146         if (sock->type!=SOCK_STREAM && sock->type!=SOCK_SEQPACKET)
1147                 goto out;
1148
1149         err = -EINVAL;
1150         if (sk->sk_state != TCP_LISTEN)
1151                 goto out;
1152
1153         /* If socket state is TCP_LISTEN it cannot change (for now...),
1154          * so that no locks are necessary.
1155          */
1156
1157         skb = skb_recv_datagram(sk, 0, flags&O_NONBLOCK, &err);
1158         if (!skb) {
1159                 /* This means receive shutdown. */
1160                 if (err == 0)
1161                         err = -EINVAL;
1162                 goto out;
1163         }
1164
1165         tsk = skb->sk;
1166         skb_free_datagram(sk, skb);
1167         wake_up_interruptible(&unix_sk(sk)->peer_wait);
1168
1169         /* attach accepted sock to socket */
1170         unix_state_wlock(tsk);
1171         newsock->state = SS_CONNECTED;
1172         sock_graft(tsk, newsock);
1173         unix_state_wunlock(tsk);
1174         return 0;
1175
1176 out:
1177         return err;
1178 }
1179
1180
1181 static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int *uaddr_len, int peer)
1182 {
1183         struct sock *sk = sock->sk;
1184         struct unix_sock *u;
1185         struct sockaddr_un *sunaddr=(struct sockaddr_un *)uaddr;
1186         int err = 0;
1187
1188         if (peer) {
1189                 sk = unix_peer_get(sk);
1190
1191                 err = -ENOTCONN;
1192                 if (!sk)
1193                         goto out;
1194                 err = 0;
1195         } else {
1196                 sock_hold(sk);
1197         }
1198
1199         u = unix_sk(sk);
1200         unix_state_rlock(sk);
1201         if (!u->addr) {
1202                 sunaddr->sun_family = AF_UNIX;
1203                 sunaddr->sun_path[0] = 0;
1204                 *uaddr_len = sizeof(short);
1205         } else {
1206                 struct unix_address *addr = u->addr;
1207
1208                 *uaddr_len = addr->len;
1209                 memcpy(sunaddr, addr->name, *uaddr_len);
1210         }
1211         unix_state_runlock(sk);
1212         sock_put(sk);
1213 out:
1214         return err;
1215 }
1216
1217 static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb)
1218 {
1219         int i;
1220
1221         scm->fp = UNIXCB(skb).fp;
1222         skb->destructor = sock_wfree;
1223         UNIXCB(skb).fp = NULL;
1224
1225         for (i=scm->fp->count-1; i>=0; i--)
1226                 unix_notinflight(scm->fp->fp[i]);
1227 }
1228
1229 static void unix_destruct_fds(struct sk_buff *skb)
1230 {
1231         struct scm_cookie scm;
1232         memset(&scm, 0, sizeof(scm));
1233         unix_detach_fds(&scm, skb);
1234
1235         /* Alas, it calls VFS */
1236         /* So fscking what? fput() had been SMP-safe since the last Summer */
1237         scm_destroy(&scm);
1238         sock_wfree(skb);
1239 }
1240
1241 static void unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
1242 {
1243         int i;
1244         for (i=scm->fp->count-1; i>=0; i--)
1245                 unix_inflight(scm->fp->fp[i]);
1246         UNIXCB(skb).fp = scm->fp;
1247         skb->destructor = unix_destruct_fds;
1248         scm->fp = NULL;
1249 }
1250
1251 /*
1252  *      Send AF_UNIX data.
1253  */
1254
/*
 *      unix_dgram_sendmsg - queue one datagram on the destination socket.
 *
 *      @kiocb: I/O control block; its sock_iocb supplies the scm cookie
 *              (a stack cookie is substituted when none is set)
 *      @sock:  sending socket
 *      @msg:   message header: optional destination name, iovec, flags
 *      @len:   number of payload bytes to send
 *
 *      With msg->msg_namelen set the destination is looked up by name,
 *      otherwise the connected peer is used. The whole payload is copied
 *      into a single skb, together with the sender's credentials and any
 *      passed files, before the destination is locked.
 *
 *      Returns @len on success or a negative errno: -EOPNOTSUPP for
 *      MSG_OOB, -ENOTCONN without a name or peer, -EMSGSIZE when @len
 *      exceeds sk_sndbuf - 32, -EPERM, -ECONNREFUSED / -ECONNRESET when
 *      the peer died, -EPIPE when the receiver shut down reading, -EAGAIN
 *      when the receive queue is full and no timeout is left,
 *      sock_intr_errno() when interrupted, or an error reported by
 *      scm_send(), unix_mkname(), unix_autobind(), sock_alloc_send_skb(),
 *      memcpy_fromiovec(), unix_find_other() or the security hook.
 */
1255 static int unix_dgram_sendmsg(struct kiocb *kiocb, struct socket *sock,
1256                               struct msghdr *msg, size_t len)
1257 {
1258         struct sock_iocb *siocb = kiocb_to_siocb(kiocb);
1259         struct sock *sk = sock->sk;
1260         struct unix_sock *u = unix_sk(sk);
1261         struct sockaddr_un *sunaddr=msg->msg_name;
1262         struct sock *other = NULL;
1263         int namelen = 0; /* fake GCC */
1264         int err;
1265         unsigned hash;
1266         struct sk_buff *skb;
1267         long timeo;
1268         struct scm_cookie tmp_scm;
1269
1270         if (NULL == siocb->scm)
1271                 siocb->scm = &tmp_scm;
1272         err = scm_send(sock, msg, siocb->scm);
1273         if (err < 0)
1274                 return err;
1275
1276         err = -EOPNOTSUPP;
1277         if (msg->msg_flags&MSG_OOB)
1278                 goto out;
1279
1280         if (msg->msg_namelen) {
1281                 err = unix_mkname(sunaddr, msg->msg_namelen, &hash);
1282                 if (err < 0)
1283                         goto out;
1284                 namelen = err;
1285         } else {
                /* No explicit destination: use the connected peer.
                 * sunaddr == NULL later marks this case.
                 */
1286                 sunaddr = NULL;
1287                 err = -ENOTCONN;
1288                 other = unix_peer_get(sk);
1289                 if (!other)
1290                         goto out;
1291         }
1292
1293         if (test_bit(SOCK_PASS_CRED, &sock->flags)
1294                 && !u->addr && (err = unix_autobind(sock)) != 0)
1295                 goto out;
1296
1297         err = -EMSGSIZE;
1298         if (len > sk->sk_sndbuf - 32)
1299                 goto out;
1300
1301         skb = sock_alloc_send_skb(sk, len, msg->msg_flags&MSG_DONTWAIT, &err);
1302         if (skb==NULL)
1303                 goto out;
1304
1305         memcpy(UNIXCREDS(skb), &siocb->scm->creds, sizeof(struct ucred));
1306         if (siocb->scm->fp)
1307                 unix_attach_fds(siocb->scm, skb);
1308
1309         skb->h.raw = skb->data;
1310         err = memcpy_fromiovec(skb_put(skb,len), msg->msg_iov, len);
1311         if (err)
1312                 goto out_free;
1313
1314         timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
1315
1316 restart:
        /* other is NULL here on the first pass of a named send, or after
         * a dead destination was dropped below.
         */
1317         if (!other) {
1318                 err = -ECONNRESET;
1319                 if (sunaddr == NULL)
1320                         goto out_free;
1321
1322                 other = unix_find_other(sunaddr, namelen, sk->sk_type,
1323                                         hash, &err);
1324                 if (other==NULL)
1325                         goto out_free;
1326         }
1327
1328         unix_state_rlock(other);
1329         err = -EPERM;
1330         if (!unix_may_send(sk, other))
1331                 goto out_unlock;
1332
1333         if (sock_flag(other, SOCK_DEAD)) {
1334                 /*
1335                  *      Check with 1003.1g - what should
1336                  *      datagram error
1337                  */
1338                 unix_state_runlock(other);
1339                 sock_put(other);
1340
                /* If the dead sock was our connected peer, disconnect from
                 * it and fail with -ECONNREFUSED; otherwise look the name
                 * up again.
                 */
1341                 err = 0;
1342                 unix_state_wlock(sk);
1343                 if (unix_peer(sk) == other) {
1344                         unix_peer(sk)=NULL;
1345                         unix_state_wunlock(sk);
1346
1347                         unix_dgram_disconnected(sk, other);
1348                         sock_put(other);
1349                         err = -ECONNREFUSED;
1350                 } else {
1351                         unix_state_wunlock(sk);
1352                 }
1353
1354                 other = NULL;
1355                 if (err)
1356                         goto out_free;
1357                 goto restart;
1358         }
1359
1360         err = -EPIPE;
1361         if (other->sk_shutdown & RCV_SHUTDOWN)
1362                 goto out_unlock;
1363
1364         if (sk->sk_type != SOCK_SEQPACKET) {
1365                 err = security_unix_may_send(sk->sk_socket, other->sk_socket);
1366                 if (err)
1367                         goto out_unlock;
1368         }
1369
        /* The backlog limit is not applied when the receiver is connected
         * back to us.
         */
1370         if (unix_peer(other) != sk &&
1371             (skb_queue_len(&other->sk_receive_queue) >
1372              other->sk_max_ack_backlog)) {
1373                 if (!timeo) {
1374                         err = -EAGAIN;
1375                         goto out_unlock;
1376                 }
1377
                /* unix_wait_for_peer() drops the read lock on other */
1378                 timeo = unix_wait_for_peer(other, timeo);
1379
1380                 err = sock_intr_errno(timeo);
1381                 if (signal_pending(current))
1382                         goto out_free;
1383
1384                 goto restart;
1385         }
1386
1387         skb_queue_tail(&other->sk_receive_queue, skb);
1388         unix_state_runlock(other);
1389         other->sk_data_ready(other, len);
1390         sock_put(other);
1391         scm_destroy(siocb->scm);
1392         return len;
1393
1394 out_unlock:
1395         unix_state_runlock(other);
1396 out_free:
1397         kfree_skb(skb);
1398 out:
1399         if (other)
1400                 sock_put(other);
1401         scm_destroy(siocb->scm);
1402         return err;
1403 }
1404
1405                 
/*
 *      unix_stream_sendmsg - send data on a connected stream socket.
 *
 *      @kiocb: I/O control block; its sock_iocb supplies the scm cookie
 *              (a stack cookie is substituted when none is set)
 *      @sock:  sending socket
 *      @msg:   message header; a destination name is rejected
 *      @len:   number of payload bytes to send
 *
 *      The payload is cut into skbs of at most sk_sndbuf / 2 - 64 bytes
 *      (and at most SKB_MAX_ALLOC), each queued on the peer's receive
 *      queue in turn. Every skb carries the sender's credentials; passed
 *      files are attached once, since unix_attach_fds() clears scm->fp.
 *
 *      Returns the number of bytes queued if that is non-zero, otherwise
 *      a negative errno: -EOPNOTSUPP for MSG_OOB, -EISCONN or -EOPNOTSUPP
 *      when a name is supplied, -ENOTCONN without a peer, -EPIPE when
 *      either side has shut the direction down or the peer is dead
 *      (SIGPIPE is raised if nothing was sent and MSG_NOSIGNAL is clear),
 *      or an error from scm_send(), sock_alloc_send_skb() or
 *      memcpy_fromiovec().
 */
1406 static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock,
1407                                struct msghdr *msg, size_t len)
1408 {
1409         struct sock_iocb *siocb = kiocb_to_siocb(kiocb);
1410         struct sock *sk = sock->sk;
1411         struct sock *other = NULL;
1412         struct sockaddr_un *sunaddr=msg->msg_name;
1413         int err,size;
1414         struct sk_buff *skb;
1415         int sent=0;
1416         struct scm_cookie tmp_scm;
1417
1418         if (NULL == siocb->scm)
1419                 siocb->scm = &tmp_scm;
1420         err = scm_send(sock, msg, siocb->scm);
1421         if (err < 0)
1422                 return err;
1423
1424         err = -EOPNOTSUPP;
1425         if (msg->msg_flags&MSG_OOB)
1426                 goto out_err;
1427
1428         if (msg->msg_namelen) {
1429                 err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
1430                 goto out_err;
1431         } else {
1432                 sunaddr = NULL;
1433                 err = -ENOTCONN;
1434                 other = unix_peer_get(sk);
1435                 if (!other)
1436                         goto out_err;
1437         }
1438
1439         if (sk->sk_shutdown & SEND_SHUTDOWN)
1440                 goto pipe_err;
1441
1442         while(sent < len)
1443         {
1444                 /*
1445                  *      Optimisation for the fact that under 0.01% of X messages typically
1446                  *      need breaking up.
1447                  */
1448
1449                 size=len-sent;
1450
1451                 /* Keep two messages in the pipe so it schedules better */
1452                 if (size > sk->sk_sndbuf / 2 - 64)
1453                         size = sk->sk_sndbuf / 2 - 64;
1454
1455                 if (size > SKB_MAX_ALLOC)
1456                         size = SKB_MAX_ALLOC;
1457                         
1458                 /*
1459                  *      Grab a buffer
1460                  */
1461                  
1462                 skb=sock_alloc_send_skb(sk,size,msg->msg_flags&MSG_DONTWAIT, &err);
1463
1464                 if (skb==NULL)
1465                         goto out_err;
1466
1467                 /*
1468                  *      If you pass two values to the sock_alloc_send_skb
1469                  *      it tries to grab the large buffer with GFP_NOFS
1470                  *      (which can fail easily), and if it fails grab the
1471                  *      fallback size buffer which is under a page and will
1472                  *      succeed. [Alan]
1473                  */
1474                 size = min_t(int, size, skb_tailroom(skb));
1475
1476                 memcpy(UNIXCREDS(skb), &siocb->scm->creds, sizeof(struct ucred));
1477                 if (siocb->scm->fp)
1478                         unix_attach_fds(siocb->scm, skb);
1479
1480                 if ((err = memcpy_fromiovec(skb_put(skb,size), msg->msg_iov, size)) != 0) {
1481                         kfree_skb(skb);
1482                         goto out_err;
1483                 }
1484
                /* The peer is only locked around the liveness check and
                 * the enqueue itself.
                 */
1485                 unix_state_rlock(other);
1486
1487                 if (sock_flag(other, SOCK_DEAD) ||
1488                     (other->sk_shutdown & RCV_SHUTDOWN))
1489                         goto pipe_err_free;
1490
1491                 skb_queue_tail(&other->sk_receive_queue, skb);
1492                 unix_state_runlock(other);
1493                 other->sk_data_ready(other, size);
1494                 sent+=size;
1495         }
1496         sock_put(other);
1497
1498         scm_destroy(siocb->scm);
1499         siocb->scm = NULL;
1500
1501         return sent;
1502
        /* The error labels fall through: pipe_err_free -> pipe_err -> out_err. */
1503 pipe_err_free:
1504         unix_state_runlock(other);
1505         kfree_skb(skb);
1506 pipe_err:
1507         if (sent==0 && !(msg->msg_flags&MSG_NOSIGNAL))
1508                 send_sig(SIGPIPE,current,0);
1509         err = -EPIPE;
1510 out_err:
1511         if (other)
1512                 sock_put(other);
1513         scm_destroy(siocb->scm);
1514         siocb->scm = NULL;
        /* A partial send reports the byte count rather than the error. */
1515         return sent ? : err;
1516 }
1517
1518 static int unix_seqpacket_sendmsg(struct kiocb *kiocb, struct socket *sock,
1519                                   struct msghdr *msg, size_t len)
1520 {
1521         int err;
1522         struct sock *sk = sock->sk;
1523         
1524         err = sock_error(sk);
1525         if (err)
1526                 return err;
1527
1528         if (sk->sk_state != TCP_ESTABLISHED)
1529                 return -ENOTCONN;
1530
1531         if (msg->msg_namelen)
1532                 msg->msg_namelen = 0;
1533
1534         return unix_dgram_sendmsg(kiocb, sock, msg, len);
1535 }
1536                                                                                             
1537 static void unix_copy_addr(struct msghdr *msg, struct sock *sk)
1538 {
1539         struct unix_sock *u = unix_sk(sk);
1540
1541         msg->msg_namelen = 0;
1542         if (u->addr) {
1543                 msg->msg_namelen = u->addr->len;
1544                 memcpy(msg->msg_name, u->addr->name, u->addr->len);
1545         }
1546 }
1547
1548 static int unix_dgram_recvmsg(struct kiocb *iocb, struct socket *sock,
1549                               struct msghdr *msg, size_t size,
1550                               int flags)
1551 {
1552         struct sock_iocb *siocb = kiocb_to_siocb(iocb);
1553         struct scm_cookie tmp_scm;
1554         struct sock *sk = sock->sk;
1555         struct unix_sock *u = unix_sk(sk);
1556         int noblock = flags & MSG_DONTWAIT;
1557         struct sk_buff *skb;
1558         int err;
1559
1560         err = -EOPNOTSUPP;
1561         if (flags&MSG_OOB)
1562                 goto out;
1563
1564         msg->msg_namelen = 0;
1565
1566         down(&u->readsem);
1567
1568         skb = skb_recv_datagram(sk, flags, noblock, &err);
1569         if (!skb)
1570                 goto out_unlock;
1571
1572         wake_up_interruptible(&u->peer_wait);
1573
1574         if (msg->msg_name)
1575                 unix_copy_addr(msg, skb->sk);
1576
1577         if (size > skb->len)
1578                 size = skb->len;
1579         else if (size < skb->len)
1580                 msg->msg_flags |= MSG_TRUNC;
1581
1582         err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, size);
1583         if (err)
1584                 goto out_free;
1585
1586         if (!siocb->scm) {
1587                 siocb->scm = &tmp_scm;
1588                 memset(&tmp_scm, 0, sizeof(tmp_scm));
1589         }
1590         siocb->scm->creds = *UNIXCREDS(skb);
1591
1592         if (!(flags & MSG_PEEK))
1593         {
1594                 if (UNIXCB(skb).fp)
1595                         unix_detach_fds(siocb->scm, skb);
1596         }
1597         else 
1598         {
1599                 /* It is questionable: on PEEK we could:
1600                    - do not return fds - good, but too simple 8)
1601                    - return fds, and do not return them on read (old strategy,
1602                      apparently wrong)
1603                    - clone fds (I chose it for now, it is the most universal
1604                      solution)
1605                 
1606                    POSIX 1003.1g does not actually define this clearly
1607                    at all. POSIX 1003.1g doesn't define a lot of things
1608                    clearly however!                  
1609                    
1610                 */
1611                 if (UNIXCB(skb).fp)
1612                         siocb->scm->fp = scm_fp_dup(UNIXCB(skb).fp);
1613         }
1614         err = size;
1615
1616         scm_recv(sock, msg, siocb->scm, flags);
1617
1618 out_free:
1619         skb_free_datagram(sk,skb);
1620 out_unlock:
1621         up(&u->readsem);
1622 out:
1623         return err;
1624 }
1625
1626 /*
1627  *      Sleep until data has arrive. But check for races..
1628  */
1629  
1630 static long unix_stream_data_wait(struct sock * sk, long timeo)
1631 {
1632         DEFINE_WAIT(wait);
1633
1634         unix_state_rlock(sk);
1635
1636         for (;;) {
1637                 prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
1638
1639                 if (skb_queue_len(&sk->sk_receive_queue) ||
1640                     sk->sk_err ||
1641                     (sk->sk_shutdown & RCV_SHUTDOWN) ||
1642                     signal_pending(current) ||
1643                     !timeo)
1644                         break;
1645
1646                 set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1647                 unix_state_runlock(sk);
1648                 timeo = schedule_timeout(timeo);
1649                 unix_state_rlock(sk);
1650                 clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1651         }
1652
1653         finish_wait(sk->sk_sleep, &wait);
1654         unix_state_runlock(sk);
1655         return timeo;
1656 }
1657
1658
1659
/*
 *      unix_stream_recvmsg - receive data from a connected stream socket.
 *
 *      @iocb:  I/O control block; its sock_iocb supplies the scm cookie
 *              (a zeroed stack cookie is substituted when none is set)
 *      @sock:  receiving socket
 *      @msg:   message header to fill (name, iovec, control data)
 *      @size:  capacity of the caller's buffer
 *      @flags: MSG_* flags; MSG_OOB is refused, MSG_PEEK, MSG_WAITALL and
 *              MSG_DONTWAIT are honoured
 *
 *      skbs are dequeued one at a time under the socket's readsem and
 *      copied out until @size is used up, the queue is empty with at
 *      least sock_rcvlowat() bytes copied, an skb from a writer with
 *      different credentials is met, or passed files have been picked up.
 *      A partly consumed skb is put back at the head of the queue.
 *
 *      Returns the number of bytes copied if that is non-zero (this is
 *      -EFAULT when the very first copy to user space fails), otherwise
 *      err: -EINVAL when not TCP_ESTABLISHED, -EOPNOTSUPP for MSG_OOB,
 *      the pending socket error, -EAGAIN when no timeout is left,
 *      sock_intr_errno() when interrupted, or 0 after receive shutdown.
 */
1660 static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock,
1661                                struct msghdr *msg, size_t size,
1662                                int flags)
1663 {
1664         struct sock_iocb *siocb = kiocb_to_siocb(iocb);
1665         struct scm_cookie tmp_scm;
1666         struct sock *sk = sock->sk;
1667         struct unix_sock *u = unix_sk(sk);
1668         struct sockaddr_un *sunaddr=msg->msg_name;
1669         int copied = 0;
1670         int check_creds = 0;
1671         int target;
1672         int err = 0;
1673         long timeo;
1674
1675         err = -EINVAL;
1676         if (sk->sk_state != TCP_ESTABLISHED)
1677                 goto out;
1678
1679         err = -EOPNOTSUPP;
1680         if (flags&MSG_OOB)
1681                 goto out;
1682
1683         target = sock_rcvlowat(sk, flags&MSG_WAITALL, size);
1684         timeo = sock_rcvtimeo(sk, flags&MSG_DONTWAIT);
1685
1686         msg->msg_namelen = 0;
1687
1688         /* Lock the socket to prevent queue disordering
1689          * while sleeps in memcpy_tomsg
1690          */
1691
1692         if (!siocb->scm) {
1693                 siocb->scm = &tmp_scm;
1694                 memset(&tmp_scm, 0, sizeof(tmp_scm));
1695         }
1696
1697         down(&u->readsem);
1698
1699         do
1700         {
1701                 int chunk;
1702                 struct sk_buff *skb;
1703
1704                 skb = skb_dequeue(&sk->sk_receive_queue);
1705                 if (skb==NULL)
1706                 {
1707                         if (copied >= target)
1708                                 break;
1709
1710                         /*
1711                          *      POSIX 1003.1g mandates this order.
1712                          */
1713                          
1714                         if ((err = sock_error(sk)) != 0)
1715                                 break;
1716                         if (sk->sk_shutdown & RCV_SHUTDOWN)
1717                                 break;
1718                         err = -EAGAIN;
1719                         if (!timeo)
1720                                 break;
                        /* readsem is not held while sleeping; the signal
                         * exit below leaves with it released and skips
                         * scm_recv().
                         */
1721                         up(&u->readsem);
1722
1723                         timeo = unix_stream_data_wait(sk, timeo);
1724
1725                         if (signal_pending(current)) {
1726                                 err = sock_intr_errno(timeo);
1727                                 goto out;
1728                         }
1729                         down(&u->readsem);
1730                         continue;
1731                 }
1732
1733                 if (check_creds) {
1734                         /* Never glue messages from different writers */
1735                         if (memcmp(UNIXCREDS(skb), &siocb->scm->creds, sizeof(siocb->scm->creds)) != 0) {
1736                                 skb_queue_head(&sk->sk_receive_queue, skb);
1737                                 break;
1738                         }
1739                 } else {
1740                         /* Copy credentials */
1741                         siocb->scm->creds = *UNIXCREDS(skb);
1742                         check_creds = 1;
1743                 }
1744
1745                 /* Copy address just once */
1746                 if (sunaddr)
1747                 {
1748                         unix_copy_addr(msg, skb->sk);
1749                         sunaddr = NULL;
1750                 }
1751
1752                 chunk = min_t(unsigned int, skb->len, size);
1753                 if (memcpy_toiovec(msg->msg_iov, skb->data, chunk)) {
                        /* Copy failed: requeue the untouched skb; only
                         * report -EFAULT if nothing was delivered yet.
                         */
1754                         skb_queue_head(&sk->sk_receive_queue, skb);
1755                         if (copied == 0)
1756                                 copied = -EFAULT;
1757                         break;
1758                 }
1759                 copied += chunk;
1760                 size -= chunk;
1761
1762                 /* Mark read part of skb as used */
1763                 if (!(flags & MSG_PEEK))
1764                 {
1765                         skb_pull(skb, chunk);
1766
1767                         if (UNIXCB(skb).fp)
1768                                 unix_detach_fds(siocb->scm, skb);
1769
1770                         /* put the skb back if we didn't use it up.. */
1771                         if (skb->len)
1772                         {
1773                                 skb_queue_head(&sk->sk_receive_queue, skb);
1774                                 break;
1775                         }
1776
1777                         kfree_skb(skb);
1778
                        /* Stop once passed files have been collected. */
1779                         if (siocb->scm->fp)
1780                                 break;
1781                 }
1782                 else
1783                 {
1784                         /* It is questionable, see note in unix_dgram_recvmsg.
1785                          */
1786                         if (UNIXCB(skb).fp)
1787                                 siocb->scm->fp = scm_fp_dup(UNIXCB(skb).fp);
1788
1789                         /* put message back and return */
1790                         skb_queue_head(&sk->sk_receive_queue, skb);
1791                         break;
1792                 }
1793         } while (size);
1794
1795         up(&u->readsem);
1796         scm_recv(sock, msg, siocb->scm, flags);
1797 out:
1798         return copied ? : err;
1799 }
1800
1801 static int unix_shutdown(struct socket *sock, int mode)
1802 {
1803         struct sock *sk = sock->sk;
1804         struct sock *other;
1805
1806         mode = (mode+1)&(RCV_SHUTDOWN|SEND_SHUTDOWN);
1807
1808         if (mode) {
1809                 unix_state_wlock(sk);
1810                 sk->sk_shutdown |= mode;
1811                 other=unix_peer(sk);
1812                 if (other)
1813                         sock_hold(other);
1814                 unix_state_wunlock(sk);
1815                 sk->sk_state_change(sk);
1816
1817                 if (other &&
1818                         (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) {
1819
1820                         int peer_mode = 0;
1821
1822                         if (mode&RCV_SHUTDOWN)
1823                                 peer_mode |= SEND_SHUTDOWN;
1824                         if (mode&SEND_SHUTDOWN)
1825                                 peer_mode |= RCV_SHUTDOWN;
1826                         unix_state_wlock(other);
1827                         other->sk_shutdown |= peer_mode;
1828                         unix_state_wunlock(other);
1829                         other->sk_state_change(other);
1830                         read_lock(&other->sk_callback_lock);
1831                         if (peer_mode == SHUTDOWN_MASK)
1832                                 sk_wake_async(other,1,POLL_HUP);
1833                         else if (peer_mode & RCV_SHUTDOWN)
1834                                 sk_wake_async(other,1,POLL_IN);
1835                         read_unlock(&other->sk_callback_lock);
1836                 }
1837                 if (other)
1838                         sock_put(other);
1839         }
1840         return 0;
1841 }
1842
1843 static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
1844 {
1845         struct sock *sk = sock->sk;
1846         long amount=0;
1847         int err;
1848
1849         switch(cmd)
1850         {
1851                 case SIOCOUTQ:
1852                         amount = atomic_read(&sk->sk_wmem_alloc);
1853                         err = put_user(amount, (int __user *)arg);
1854                         break;
1855                 case SIOCINQ:
1856                 {
1857                         struct sk_buff *skb;
1858
1859                         if (sk->sk_state == TCP_LISTEN) {
1860                                 err = -EINVAL;
1861                                 break;
1862                         }
1863
1864                         spin_lock(&sk->sk_receive_queue.lock);
1865                         if (sk->sk_type == SOCK_STREAM ||
1866                             sk->sk_type == SOCK_SEQPACKET) {
1867                                 skb_queue_walk(&sk->sk_receive_queue, skb)
1868                                         amount += skb->len;
1869                         } else {
1870                                 skb = skb_peek(&sk->sk_receive_queue);
1871                                 if (skb)
1872                                         amount=skb->len;
1873                         }
1874                         spin_unlock(&sk->sk_receive_queue.lock);
1875                         err = put_user(amount, (int __user *)arg);
1876                         break;
1877                 }
1878
1879                 default:
1880                         err = dev_ioctl(cmd, (void __user *)arg);
1881                         break;
1882         }
1883         return err;
1884 }
1885
1886 static unsigned int unix_poll(struct file * file, struct socket *sock, poll_table *wait)
1887 {
1888         struct sock *sk = sock->sk;
1889         unsigned int mask;
1890
1891         poll_wait(file, sk->sk_sleep, wait);
1892         mask = 0;
1893
1894         /* exceptional events? */
1895         if (sk->sk_err)
1896                 mask |= POLLERR;
1897         if (sk->sk_shutdown == SHUTDOWN_MASK)
1898                 mask |= POLLHUP;
1899
1900         /* readable? */
1901         if (!skb_queue_empty(&sk->sk_receive_queue) ||
1902             (sk->sk_shutdown & RCV_SHUTDOWN))
1903                 mask |= POLLIN | POLLRDNORM;
1904
1905         /* Connection-based need to check for termination and startup */
1906         if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) && sk->sk_state == TCP_CLOSE)
1907                 mask |= POLLHUP;
1908
1909         /*
1910          * we set writable also when the other side has shut down the
1911          * connection. This prevents stuck sockets.
1912          */
1913         if (unix_writable(sk))
1914                 mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
1915
1916         return mask;
1917 }
1918
1919
1920 #ifdef CONFIG_PROC_FS
1921 static struct sock *unix_seq_idx(int *iter, loff_t pos)
1922 {
1923         loff_t off = 0;
1924         struct sock *s;
1925
1926         for (s = first_unix_socket(iter); s; s = next_unix_socket(iter, s)) {
1927                 if (off == pos) 
1928                         return s;
1929                 ++off;
1930         }
1931         return NULL;
1932 }
1933
1934
1935 static void *unix_seq_start(struct seq_file *seq, loff_t *pos)
1936 {
1937         read_lock(&unix_table_lock);
1938         return *pos ? unix_seq_idx(seq->private, *pos - 1) : ((void *) 1);
1939 }
1940
1941 static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1942 {
1943         ++*pos;
1944
1945         if (v == (void *)1) 
1946                 return first_unix_socket(seq->private);
1947         return next_unix_socket(seq->private, v);
1948 }
1949
1950 static void unix_seq_stop(struct seq_file *seq, void *v)
1951 {
1952         read_unlock(&unix_table_lock);
1953 }
1954
1955 static int unix_seq_show(struct seq_file *seq, void *v)
1956 {
1957         
1958         if (v == (void *)1)
1959                 seq_puts(seq, "Num       RefCount Protocol Flags    Type St "
1960                          "Inode Path\n");
1961         else {
1962                 struct sock *s = v;
1963                 struct unix_sock *u = unix_sk(s);
1964                 unix_state_rlock(s);
1965
1966                 seq_printf(seq, "%p: %08X %08X %08X %04X %02X %5lu",
1967                         s,
1968                         atomic_read(&s->sk_refcnt),
1969                         0,
1970                         s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0,
1971                         s->sk_type,
1972                         s->sk_socket ?
1973                         (s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) :
1974                         (s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING),
1975                         sock_i_ino(s));
1976
1977                 if (u->addr) {
1978                         int i, len;
1979                         seq_putc(seq, ' ');
1980
1981                         i = 0;
1982                         len = u->addr->len - sizeof(short);
1983                         if (!UNIX_ABSTRACT(s))
1984                                 len--;
1985                         else {
1986                                 seq_putc(seq, '@');
1987                                 i++;
1988                         }
1989                         for ( ; i < len; i++)
1990                                 seq_putc(seq, u->addr->name->sun_path[i]);
1991                 }
1992                 unix_state_runlock(s);
1993                 seq_putc(seq, '\n');
1994         }
1995
1996         return 0;
1997 }
1998
1999 static struct seq_operations unix_seq_ops = {
2000         .start  = unix_seq_start,
2001         .next   = unix_seq_next,
2002         .stop   = unix_seq_stop,
2003         .show   = unix_seq_show,
2004 };
2005
2006
2007 static int unix_seq_open(struct inode *inode, struct file *file)
2008 {
2009         struct seq_file *seq;
2010         int rc = -ENOMEM;
2011         int *iter = kmalloc(sizeof(int), GFP_KERNEL);
2012
2013         if (!iter)
2014                 goto out;
2015
2016         rc = seq_open(file, &unix_seq_ops);
2017         if (rc)
2018                 goto out_kfree;
2019
2020         seq          = file->private_data;
2021         seq->private = iter;
2022         *iter = 0;
2023 out:
2024         return rc;
2025 out_kfree:
2026         kfree(iter);
2027         goto out;
2028 }
2029
2030 static struct file_operations unix_seq_fops = {
2031         .owner          = THIS_MODULE,
2032         .open           = unix_seq_open,
2033         .read           = seq_read,
2034         .llseek         = seq_lseek,
2035         .release        = seq_release_private,
2036 };
2037
2038 #endif
2039
2040 static struct net_proto_family unix_family_ops = {
2041         .family = PF_UNIX,
2042         .create = unix_create,
2043         .owner  = THIS_MODULE,
2044 };
2045
/*
 * sysctl hooks: empty stubs when CONFIG_SYSCTL is not set, otherwise
 * provided by another translation unit.
 */
#ifndef CONFIG_SYSCTL
static inline void unix_sysctl_register(void) {}
static inline void unix_sysctl_unregister(void) {}
#else
extern void unix_sysctl_register(void);
extern void unix_sysctl_unregister(void);
#endif
2053
2054 static int __init af_unix_init(void)
2055 {
2056         struct sk_buff *dummy_skb;
2057
2058         if (sizeof(struct unix_skb_parms) > sizeof(dummy_skb->cb)) {
2059                 printk(KERN_CRIT "%s: panic\n", __FUNCTION__);
2060                 return -1;
2061         }
2062         /* allocate our sock slab cache */
2063         unix_sk_cachep = kmem_cache_create("unix_sock",
2064                                            sizeof(struct unix_sock), 0,
2065                                            SLAB_HWCACHE_ALIGN, NULL, NULL);
2066         if (!unix_sk_cachep)
2067                 printk(KERN_CRIT
2068                         "af_unix_init: Cannot create unix_sock SLAB cache!\n");
2069
2070         sock_register(&unix_family_ops);
2071 #ifdef CONFIG_PROC_FS
2072         proc_net_fops_create("unix", 0, &unix_seq_fops);
2073 #endif
2074         unix_sysctl_register();
2075         return 0;
2076 }
2077
2078 static void __exit af_unix_exit(void)
2079 {
2080         sock_unregister(PF_UNIX);
2081         unix_sysctl_unregister();
2082         proc_net_remove("unix");
2083         kmem_cache_destroy(unix_sk_cachep);
2084 }
2085
2086 module_init(af_unix_init);
2087 module_exit(af_unix_exit);
2088
2089 MODULE_LICENSE("GPL");
2090 MODULE_ALIAS_NETPROTO(PF_UNIX);