This commit was manufactured by cvs2svn to create tag
[linux-2.6.git] / net / unix / af_unix.c
1 /*
2  * NET4:        Implementation of BSD Unix domain sockets.
3  *
4  * Authors:     Alan Cox, <alan.cox@linux.org>
5  *
6  *              This program is free software; you can redistribute it and/or
7  *              modify it under the terms of the GNU General Public License
8  *              as published by the Free Software Foundation; either version
9  *              2 of the License, or (at your option) any later version.
10  *
11  * Version:     $Id: af_unix.c,v 1.133 2002/02/08 03:57:19 davem Exp $
12  *
13  * Fixes:
14  *              Linus Torvalds  :       Assorted bug cures.
15  *              Niibe Yutaka    :       async I/O support.
16  *              Carsten Paeth   :       PF_UNIX check, address fixes.
17  *              Alan Cox        :       Limit size of allocated blocks.
18  *              Alan Cox        :       Fixed the stupid socketpair bug.
19  *              Alan Cox        :       BSD compatibility fine tuning.
20  *              Alan Cox        :       Fixed a bug in connect when interrupted.
21  *              Alan Cox        :       Sorted out a proper draft version of
22  *                                      file descriptor passing hacked up from
23  *                                      Mike Shaver's work.
24  *              Marty Leisner   :       Fixes to fd passing
25  *              Nick Nevin      :       recvmsg bugfix.
26  *              Alan Cox        :       Started proper garbage collector
27  *              Heiko EiBfeldt  :       Missing verify_area check
28  *              Alan Cox        :       Started POSIXisms
29  *              Andreas Schwab  :       Replace inode by dentry for proper
30  *                                      reference counting
31  *              Kirk Petersen   :       Made this a module
32  *          Christoph Rohland   :       Elegant non-blocking accept/connect algorithm.
33  *                                      Lots of bug fixes.
34  *           Alexey Kuznetsov   :       Repaired (I hope) bugs introduced
35  *                                      by above two patches.
36  *           Andrea Arcangeli   :       If possible we block in connect(2)
37  *                                      if the max backlog of the listen socket
38  *                                      has been reached. This won't break
39  *                                      old apps and it will avoid a huge amount
40  *                                      of socks hashed (this is for unix_gc()
41  *                                      performance reasons).
42  *                                      Security fix that limits the max
43  *                                      number of socks to 2*max_files and
44  *                                      the number of skb queueable in the
45  *                                      dgram receiver.
46  *              Artur Skawina   :       Hash function optimizations
47  *           Alexey Kuznetsov   :       Full scale SMP. Lot of bugs are introduced 8)
48  *            Malcolm Beattie   :       Set peercred for socketpair
49  *           Michal Ostrowski   :       Module initialization cleanup.
50  *           Arnaldo C. Melo    :       Remove MOD_{INC,DEC}_USE_COUNT,
51  *                                      the core infrastructure is doing that
52  *                                      for all net proto families now (2.5.69+)
53  *
54  *
55  * Known differences from reference BSD that was tested:
56  *
57  *      [TO FIX]
58  *      ECONNREFUSED is not returned from one end of a connected() socket to the
59  *              other the moment one end closes.
60  *      fstat() doesn't return st_dev=0, and give the blksize as high water mark
61  *              and a fake inode identifier (nor the BSD first socket fstat twice bug).
62  *      [NOT TO FIX]
63  *      accept() returns a path name even if the connecting socket has closed
64  *              in the meantime (BSD loses the path and gives up).
65  *      accept() returns 0 length path for an unbound connector. BSD returns 16
66  *              and a null first byte in the path (but not for gethost/peername - BSD bug ??)
67  *      socketpair(...SOCK_RAW..) doesn't panic the kernel.
68  *      BSD af_unix apparently has connect forgetting to block properly.
69  *              (need to check this with the POSIX spec in detail)
70  *
71  * Differences from 2.0.0-11-... (ANK)
72  *      Bug fixes and improvements.
73  *              - client shutdown killed server socket.
74  *              - removed all useless cli/sti pairs.
75  *
76  *      Semantic changes/extensions.
77  *              - generic control message passing.
78  *              - SCM_CREDENTIALS control message.
79  *              - "Abstract" (not FS based) socket bindings.
80  *                Abstract names are sequences of bytes (not zero terminated)
81  *                started by 0, so that this name space does not intersect
82  *                with BSD names.
83  */
84
85 #include <linux/module.h>
86 #include <linux/config.h>
87 #include <linux/kernel.h>
88 #include <linux/major.h>
89 #include <linux/signal.h>
90 #include <linux/sched.h>
91 #include <linux/errno.h>
92 #include <linux/string.h>
93 #include <linux/stat.h>
94 #include <linux/dcache.h>
95 #include <linux/namei.h>
96 #include <linux/socket.h>
97 #include <linux/un.h>
98 #include <linux/fcntl.h>
99 #include <linux/termios.h>
100 #include <linux/sockios.h>
101 #include <linux/net.h>
102 #include <linux/in.h>
103 #include <linux/fs.h>
104 #include <linux/slab.h>
105 #include <asm/uaccess.h>
106 #include <linux/skbuff.h>
107 #include <linux/netdevice.h>
108 #include <net/sock.h>
109 #include <linux/tcp.h>
110 #include <net/af_unix.h>
111 #include <linux/proc_fs.h>
112 #include <linux/seq_file.h>
113 #include <net/scm.h>
114 #include <linux/init.h>
115 #include <linux/poll.h>
116 #include <linux/smp_lock.h>
117 #include <linux/rtnetlink.h>
118 #include <linux/mount.h>
119 #include <net/checksum.h>
120 #include <linux/security.h>
121 #include <linux/vs_context.h>
122 #include <linux/vs_network.h>
123 #include <linux/vs_limit.h>
124
/* Default backlog limit; unix_create1() copies it into sk_max_ack_backlog
 * of every new socket. */
int sysctl_unix_max_dgram_qlen = 10;

/* Slab cache passed to sk_alloc() when a unix socket is allocated. */
kmem_cache_t *unix_sk_cachep;

/* Socket hash table.  The extra slot at index UNIX_HASH_SIZE is the list
 * of sockets that are not bound yet (see unix_sockets_unbound). */
struct hlist_head unix_socket_table[UNIX_HASH_SIZE + 1];
/* Guards unix_socket_table. */
rwlock_t unix_table_lock = RW_LOCK_UNLOCKED;
/* Count of live unix sockets: incremented in unix_create1(), decremented
 * in unix_sock_destructor(). */
static atomic_t unix_nr_socks = ATOMIC_INIT(0);

/* List head for sockets that have been created but not bound. */
#define unix_sockets_unbound    (&unix_socket_table[UNIX_HASH_SIZE])

/* Nonzero for an abstract name; unix_bind() stores UNIX_HASH_SIZE in
 * addr->hash for filesystem names. */
#define UNIX_ABSTRACT(sk)       (unix_sk(sk)->addr->hash != UNIX_HASH_SIZE)
136
137 /*
138  *  SMP locking strategy:
139  *    hash table is protected with rwlock unix_table_lock
140  *    each socket state is protected by separate rwlock.
141  */
142
143 static inline unsigned unix_hash_fold(unsigned hash)
144 {
145         hash ^= hash>>16;
146         hash ^= hash>>8;
147         return hash&(UNIX_HASH_SIZE-1);
148 }
149
150 #define unix_peer(sk) (unix_sk(sk)->peer)
151
152 static inline int unix_our_peer(struct sock *sk, struct sock *osk)
153 {
154         return unix_peer(osk) == sk;
155 }
156
157 static inline int unix_may_send(struct sock *sk, struct sock *osk)
158 {
159         return (unix_peer(osk) == NULL || unix_our_peer(sk, osk));
160 }
161
162 static struct sock *unix_peer_get(struct sock *s)
163 {
164         struct sock *peer;
165
166         unix_state_rlock(s);
167         peer = unix_peer(s);
168         if (peer)
169                 sock_hold(peer);
170         unix_state_runlock(s);
171         return peer;
172 }
173
174 static inline void unix_release_addr(struct unix_address *addr)
175 {
176         if (atomic_dec_and_test(&addr->refcnt))
177                 kfree(addr);
178 }
179
180 /*
181  *      Check unix socket name:
182  *              - should be not zero length.
183  *              - if started by not zero, should be NULL terminated (FS object)
184  *              - if started by zero, it is abstract name.
185  */
186  
187 static int unix_mkname(struct sockaddr_un * sunaddr, int len, unsigned *hashp)
188 {
189         if (len <= sizeof(short) || len > sizeof(*sunaddr))
190                 return -EINVAL;
191         if (!sunaddr || sunaddr->sun_family != AF_UNIX)
192                 return -EINVAL;
193         if (sunaddr->sun_path[0])
194         {
195                 /*
196                  *      This may look like an off by one error but it is
197                  *      a bit more subtle. 108 is the longest valid AF_UNIX
198                  *      path for a binding. sun_path[108] doesn't as such
199                  *      exist. However in kernel space we are guaranteed that
200                  *      it is a valid memory location in our kernel
201                  *      address buffer.
202                  */
203                 if (len > sizeof(*sunaddr))
204                         len = sizeof(*sunaddr);
205                 ((char *)sunaddr)[len]=0;
206                 len = strlen(sunaddr->sun_path)+1+sizeof(short);
207                 return len;
208         }
209
210         *hashp = unix_hash_fold(csum_partial((char*)sunaddr, len, 0));
211         return len;
212 }
213
/*
 * Unlink @sk from whichever hash list it is on.  Takes no lock itself;
 * the callers in this file hold unix_table_lock for writing.
 */
static void __unix_remove_socket(struct sock *sk)
{
        sk_del_node_init(sk);
}
218
/*
 * Add @sk to the hash list @list.  @sk must not be hashed already
 * (checked with BUG_TRAP).  Takes no lock itself; the callers in this
 * file hold unix_table_lock for writing.
 */
static void __unix_insert_socket(struct hlist_head *list, struct sock *sk)
{
        BUG_TRAP(sk_unhashed(sk));
        sk_add_node(sk, list);
}
224
225 static inline void unix_remove_socket(struct sock *sk)
226 {
227         write_lock(&unix_table_lock);
228         __unix_remove_socket(sk);
229         write_unlock(&unix_table_lock);
230 }
231
232 static inline void unix_insert_socket(struct hlist_head *list, struct sock *sk)
233 {
234         write_lock(&unix_table_lock);
235         __unix_insert_socket(list, sk);
236         write_unlock(&unix_table_lock);
237 }
238
239 static struct sock *__unix_find_socket_byname(struct sockaddr_un *sunname,
240                                               int len, int type, unsigned hash)
241 {
242         struct sock *s;
243         struct hlist_node *node;
244
245         sk_for_each(s, node, &unix_socket_table[hash ^ type]) {
246                 struct unix_sock *u = unix_sk(s);
247
248                 if (u->addr->len == len &&
249                     !memcmp(u->addr->name, sunname, len))
250                         goto found;
251         }
252         s = NULL;
253 found:
254         return s;
255 }
256
257 static inline struct sock *unix_find_socket_byname(struct sockaddr_un *sunname,
258                                                    int len, int type,
259                                                    unsigned hash)
260 {
261         struct sock *s;
262
263         read_lock(&unix_table_lock);
264         s = __unix_find_socket_byname(sunname, len, type, hash);
265         if (s)
266                 sock_hold(s);
267         read_unlock(&unix_table_lock);
268         return s;
269 }
270
271 static struct sock *unix_find_socket_byinode(struct inode *i)
272 {
273         struct sock *s;
274         struct hlist_node *node;
275
276         read_lock(&unix_table_lock);
277         sk_for_each(s, node,
278                     &unix_socket_table[i->i_ino & (UNIX_HASH_SIZE - 1)]) {
279                 struct dentry *dentry = unix_sk(s)->dentry;
280
281                 if(dentry && dentry->d_inode == i)
282                 {
283                         sock_hold(s);
284                         goto found;
285                 }
286         }
287         s = NULL;
288 found:
289         read_unlock(&unix_table_lock);
290         return s;
291 }
292
293 static inline int unix_writable(struct sock *sk)
294 {
295         return (atomic_read(&sk->sk_wmem_alloc) << 2) <= sk->sk_sndbuf;
296 }
297
298 static void unix_write_space(struct sock *sk)
299 {
300         read_lock(&sk->sk_callback_lock);
301         if (unix_writable(sk)) {
302                 if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
303                         wake_up_interruptible(sk->sk_sleep);
304                 sk_wake_async(sk, 2, POLL_OUT);
305         }
306         read_unlock(&sk->sk_callback_lock);
307 }
308
309 /* When dgram socket disconnects (or changes its peer), we clear its receive
310  * queue of packets arrived from previous peer. First, it allows to do
311  * flow control based only on wmem_alloc; second, sk connected to peer
312  * may receive messages only from that peer. */
313 static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
314 {
315         if (skb_queue_len(&sk->sk_receive_queue)) {
316                 skb_queue_purge(&sk->sk_receive_queue);
317                 wake_up_interruptible_all(&unix_sk(sk)->peer_wait);
318
319                 /* If one link of bidirectional dgram pipe is disconnected,
320                  * we signal error. Messages are lost. Do not make this,
321                  * when peer was not connected to us.
322                  */
323                 if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) {
324                         other->sk_err = ECONNRESET;
325                         other->sk_error_report(other);
326                 }
327         }
328 }
329
330 static void unix_sock_destructor(struct sock *sk)
331 {
332         struct unix_sock *u = unix_sk(sk);
333
334         skb_queue_purge(&sk->sk_receive_queue);
335
336         BUG_TRAP(!atomic_read(&sk->sk_wmem_alloc));
337         BUG_TRAP(sk_unhashed(sk));
338         BUG_TRAP(!sk->sk_socket);
339         if (!sock_flag(sk, SOCK_DEAD)) {
340                 printk("Attempt to release alive unix socket: %p\n", sk);
341                 return;
342         }
343
344         if (u->addr)
345                 unix_release_addr(u->addr);
346
347         atomic_dec(&unix_nr_socks);
348 #ifdef UNIX_REFCNT_DEBUG
349         printk(KERN_DEBUG "UNIX %p is destroyed, %d are still alive.\n", sk, atomic_read(&unix_nr_socks));
350 #endif
351 }
352
/*
 * Tear down a unix socket.
 *
 * @sk:      socket to release.
 * @embrion: nonzero when called recursively (below) for a connection that
 *           was still queued on a listening socket; unix_release() passes 0.
 *
 * Unhashes the socket, marks it closed, notifies and drops the peer,
 * flushes the receive queue, releases the bound filesystem object and
 * finally drops the caller's reference.  Always returns 0.
 */
static int unix_release_sock (struct sock *sk, int embrion)
{
        struct unix_sock *u = unix_sk(sk);
        struct dentry *dentry;
        struct vfsmount *mnt;
        struct sock *skpair;
        struct sk_buff *skb;
        int state;

        unix_remove_socket(sk);

        /* Clear state */
        unix_state_wlock(sk);
        sock_orphan(sk);
        sk->sk_shutdown = SHUTDOWN_MASK;
        /* Detach dentry/mnt under the lock; they are released further down. */
        dentry       = u->dentry;
        u->dentry    = NULL;
        mnt          = u->mnt;
        u->mnt       = NULL;
        /* Remember the old state: a listener needs its queue handled specially. */
        state = sk->sk_state;
        sk->sk_state = TCP_CLOSE;
        unix_state_wunlock(sk);

        wake_up_interruptible_all(&u->peer_wait);

        skpair=unix_peer(sk);

        if (skpair!=NULL) {
                if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) {
                        unix_state_wlock(skpair);
                        /* No more writes */
                        skpair->sk_shutdown = SHUTDOWN_MASK;
                        /* Unread data or a never-accepted connection: report a reset. */
                        if (!skb_queue_empty(&sk->sk_receive_queue) || embrion)
                                skpair->sk_err = ECONNRESET;
                        unix_state_wunlock(skpair);
                        skpair->sk_state_change(skpair);
                        read_lock(&skpair->sk_callback_lock);
                        sk_wake_async(skpair,1,POLL_HUP);
                        read_unlock(&skpair->sk_callback_lock);
                }
                sock_put(skpair); /* It may now die */
                unix_peer(sk) = NULL;
        }

        /* Try to flush out this socket. Throw out buffers at least */

        while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
                /* On a listener each queued skb belongs to a pending
                 * connection; release that socket as an embryo. */
                if (state==TCP_LISTEN)
                        unix_release_sock(skb->sk, 1);
                /* passed fds are erased in the kfree_skb hook        */
                kfree_skb(skb);
        }

        if (dentry) {
                dput(dentry);
                mntput(mnt);
        }

        vx_sock_dec(sk);
        clr_vx_info(&sk->sk_vx_info);
        clr_nx_info(&sk->sk_nx_info);
        sock_put(sk);

        /* ---- Socket is dead now and most probably destroyed ---- */

        /*
         * Fixme: BSD difference: In BSD all sockets connected to us get
         *        ECONNRESET and we die on the spot. In Linux we behave
         *        like files and pipes do and wait for the last
         *        dereference.
         *
         * Can't we simply set sock->err?
         *
         *        What is the above comment talking about? --ANK(980817)
         */

        if (atomic_read(&unix_tot_inflight))
                unix_gc();              /* Garbage collect fds */

        return 0;
}
434
435 static int unix_listen(struct socket *sock, int backlog)
436 {
437         int err;
438         struct sock *sk = sock->sk;
439         struct unix_sock *u = unix_sk(sk);
440
441         err = -EOPNOTSUPP;
442         if (sock->type!=SOCK_STREAM && sock->type!=SOCK_SEQPACKET)
443                 goto out;                       /* Only stream/seqpacket sockets accept */
444         err = -EINVAL;
445         if (!u->addr)
446                 goto out;                       /* No listens on an unbound socket */
447         unix_state_wlock(sk);
448         if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN)
449                 goto out_unlock;
450         if (backlog > sk->sk_max_ack_backlog)
451                 wake_up_interruptible_all(&u->peer_wait);
452         sk->sk_max_ack_backlog  = backlog;
453         sk->sk_state            = TCP_LISTEN;
454         /* set credentials so connect can copy them */
455         sk->sk_peercred.pid     = current->tgid;
456         sk->sk_peercred.uid     = current->euid;
457         sk->sk_peercred.gid     = current->egid;
458         err = 0;
459
460 out_unlock:
461         unix_state_wunlock(sk);
462 out:
463         return err;
464 }
465
/*
 * Forward declarations of the socket operation handlers referenced by the
 * proto_ops tables below.
 */
static int unix_release(struct socket *);
static int unix_bind(struct socket *, struct sockaddr *, int);
static int unix_stream_connect(struct socket *, struct sockaddr *,
                               int addr_len, int flags);
static int unix_socketpair(struct socket *, struct socket *);
static int unix_accept(struct socket *, struct socket *, int);
static int unix_getname(struct socket *, struct sockaddr *, int *, int);
static unsigned int unix_poll(struct file *, struct socket *, poll_table *);
static int unix_ioctl(struct socket *, unsigned int, unsigned long);
static int unix_shutdown(struct socket *, int);
static int unix_stream_sendmsg(struct kiocb *, struct socket *,
                               struct msghdr *, size_t);
static int unix_stream_recvmsg(struct kiocb *, struct socket *,
                               struct msghdr *, size_t, int);
static int unix_dgram_sendmsg(struct kiocb *, struct socket *,
                              struct msghdr *, size_t);
static int unix_dgram_recvmsg(struct kiocb *, struct socket *,
                              struct msghdr *, size_t, int);
static int unix_dgram_connect(struct socket *, struct sockaddr *,
                              int, int);
static int unix_seqpacket_sendmsg(struct kiocb *, struct socket *,
                                  struct msghdr *, size_t);
488
/*
 * Operations for SOCK_STREAM sockets (selected in unix_create()):
 * connection oriented, with its own poll and stream send/receive handlers.
 * Socket options, mmap and sendpage use the generic "not supported" stubs.
 */
static struct proto_ops unix_stream_ops = {
        .family =       PF_UNIX,
        .owner =        THIS_MODULE,
        .release =      unix_release,
        .bind =         unix_bind,
        .connect =      unix_stream_connect,
        .socketpair =   unix_socketpair,
        .accept =       unix_accept,
        .getname =      unix_getname,
        .poll =         unix_poll,
        .ioctl =        unix_ioctl,
        .listen =       unix_listen,
        .shutdown =     unix_shutdown,
        .setsockopt =   sock_no_setsockopt,
        .getsockopt =   sock_no_getsockopt,
        .sendmsg =      unix_stream_sendmsg,
        .recvmsg =      unix_stream_recvmsg,
        .mmap =         sock_no_mmap,
        .sendpage =     sock_no_sendpage,
};
509
/*
 * Operations for SOCK_DGRAM sockets (also used for SOCK_RAW, which
 * unix_create() maps to SOCK_DGRAM).  accept and listen are the generic
 * "not supported" stubs; poll is the generic datagram_poll.
 */
static struct proto_ops unix_dgram_ops = {
        .family =       PF_UNIX,
        .owner =        THIS_MODULE,
        .release =      unix_release,
        .bind =         unix_bind,
        .connect =      unix_dgram_connect,
        .socketpair =   unix_socketpair,
        .accept =       sock_no_accept,
        .getname =      unix_getname,
        .poll =         datagram_poll,
        .ioctl =        unix_ioctl,
        .listen =       sock_no_listen,
        .shutdown =     unix_shutdown,
        .setsockopt =   sock_no_setsockopt,
        .getsockopt =   sock_no_getsockopt,
        .sendmsg =      unix_dgram_sendmsg,
        .recvmsg =      unix_dgram_recvmsg,
        .mmap =         sock_no_mmap,
        .sendpage =     sock_no_sendpage,
};
530
/*
 * Operations for SOCK_SEQPACKET sockets: connection handling (connect,
 * accept, listen) is shared with the stream table, while poll and recvmsg
 * are shared with the datagram table; sendmsg has a dedicated handler.
 */
static struct proto_ops unix_seqpacket_ops = {
        .family =       PF_UNIX,
        .owner =        THIS_MODULE,
        .release =      unix_release,
        .bind =         unix_bind,
        .connect =      unix_stream_connect,
        .socketpair =   unix_socketpair,
        .accept =       unix_accept,
        .getname =      unix_getname,
        .poll =         datagram_poll,
        .ioctl =        unix_ioctl,
        .listen =       unix_listen,
        .shutdown =     unix_shutdown,
        .setsockopt =   sock_no_setsockopt,
        .getsockopt =   sock_no_getsockopt,
        .sendmsg =      unix_seqpacket_sendmsg,
        .recvmsg =      unix_dgram_recvmsg,
        .mmap =         sock_no_mmap,
        .sendpage =     sock_no_sendpage,
};
551
552 static struct sock * unix_create1(struct socket *sock)
553 {
554         struct sock *sk = NULL;
555         struct unix_sock *u;
556
557         if (atomic_read(&unix_nr_socks) >= 2*files_stat.max_files)
558                 goto out;
559
560         sk = sk_alloc(PF_UNIX, GFP_KERNEL, sizeof(struct unix_sock),
561                       unix_sk_cachep);
562         if (!sk)
563                 goto out;
564
565         atomic_inc(&unix_nr_socks);
566
567         sock_init_data(sock,sk);
568         sk_set_owner(sk, THIS_MODULE);
569
570         set_vx_info(&sk->sk_vx_info, current->vx_info);
571         sk->sk_xid = vx_current_xid();
572         vx_sock_inc(sk);
573         set_nx_info(&sk->sk_nx_info, current->nx_info);
574
575         sk->sk_write_space      = unix_write_space;
576         sk->sk_max_ack_backlog  = sysctl_unix_max_dgram_qlen;
577         sk->sk_destruct         = unix_sock_destructor;
578         u         = unix_sk(sk);
579         u->dentry = NULL;
580         u->mnt    = NULL;
581         rwlock_init(&u->lock);
582         atomic_set(&u->inflight, sock ? 0 : -1);
583         init_MUTEX(&u->readsem); /* single task reading lock */
584         init_waitqueue_head(&u->peer_wait);
585         unix_insert_socket(unix_sockets_unbound, sk);
586 out:
587         return sk;
588 }
589
590 static int unix_create(struct socket *sock, int protocol)
591 {
592         if (protocol && protocol != PF_UNIX)
593                 return -EPROTONOSUPPORT;
594
595         sock->state = SS_UNCONNECTED;
596
597         switch (sock->type) {
598         case SOCK_STREAM:
599                 sock->ops = &unix_stream_ops;
600                 break;
601                 /*
602                  *      Believe it or not BSD has AF_UNIX, SOCK_RAW though
603                  *      nothing uses it.
604                  */
605         case SOCK_RAW:
606                 sock->type=SOCK_DGRAM;
607         case SOCK_DGRAM:
608                 sock->ops = &unix_dgram_ops;
609                 break;
610         case SOCK_SEQPACKET:
611                 sock->ops = &unix_seqpacket_ops;
612                 break;
613         default:
614                 return -ESOCKTNOSUPPORT;
615         }
616
617         return unix_create1(sock) ? 0 : -ENOMEM;
618 }
619
620 static int unix_release(struct socket *sock)
621 {
622         struct sock *sk = sock->sk;
623
624         if (!sk)
625                 return 0;
626
627         sock->sk = NULL;
628
629         return unix_release_sock (sk, 0);
630 }
631
632 static int unix_autobind(struct socket *sock)
633 {
634         struct sock *sk = sock->sk;
635         struct unix_sock *u = unix_sk(sk);
636         static u32 ordernum = 1;
637         struct unix_address * addr;
638         int err;
639
640         down(&u->readsem);
641
642         err = 0;
643         if (u->addr)
644                 goto out;
645
646         err = -ENOMEM;
647         addr = kmalloc(sizeof(*addr) + sizeof(short) + 16, GFP_KERNEL);
648         if (!addr)
649                 goto out;
650
651         memset(addr, 0, sizeof(*addr) + sizeof(short) + 16);
652         addr->name->sun_family = AF_UNIX;
653         atomic_set(&addr->refcnt, 1);
654
655 retry:
656         addr->len = sprintf(addr->name->sun_path+1, "%05x", ordernum) + 1 + sizeof(short);
657         addr->hash = unix_hash_fold(csum_partial((void*)addr->name, addr->len, 0));
658
659         write_lock(&unix_table_lock);
660         ordernum = (ordernum+1)&0xFFFFF;
661
662         if (__unix_find_socket_byname(addr->name, addr->len, sock->type,
663                                       addr->hash)) {
664                 write_unlock(&unix_table_lock);
665                 /* Sanity yield. It is unusual case, but yet... */
666                 if (!(ordernum&0xFF))
667                         yield();
668                 goto retry;
669         }
670         addr->hash ^= sk->sk_type;
671
672         __unix_remove_socket(sk);
673         u->addr = addr;
674         __unix_insert_socket(&unix_socket_table[addr->hash], sk);
675         write_unlock(&unix_table_lock);
676         err = 0;
677
678 out:    up(&u->readsem);
679         return err;
680 }
681
/*
 * Look up the socket named by @sunname.
 *
 * Filesystem names (first path byte non-zero) are resolved through the VFS
 * and matched by inode; abstract names are looked up in the hash table.
 * On success the socket is returned with a reference held.  On failure
 * NULL is returned and *error is set: the path lookup or permission error,
 * -ECONNREFUSED when no socket is bound there, or -EPROTOTYPE when a
 * filesystem socket has a different type than @type.
 */
static struct sock *unix_find_other(struct sockaddr_un *sunname, int len,
                                    int type, unsigned hash, int *error)
{
        struct sock *u;
        struct nameidata nd;
        int err = 0;

        if (sunname->sun_path[0]) {
                err = path_lookup(sunname->sun_path, LOOKUP_FOLLOW, &nd);
                if (err)
                        goto fail;
                /* The caller needs write permission on the socket inode. */
                err = permission(nd.dentry->d_inode,MAY_WRITE, &nd);
                if (err)
                        goto put_fail;

                err = -ECONNREFUSED;
                if (!S_ISSOCK(nd.dentry->d_inode->i_mode))
                        goto put_fail;
                u=unix_find_socket_byinode(nd.dentry->d_inode);
                if (!u)
                        goto put_fail;

                /* Only touch atime when the lookup is going to succeed. */
                if (u->sk_type == type)
                        touch_atime(nd.mnt, nd.dentry);

                path_release(&nd);

                err=-EPROTOTYPE;
                if (u->sk_type != type) {
                        /* Drop the reference taken by the inode lookup. */
                        sock_put(u);
                        goto fail;
                }
        } else {
                err = -ECONNREFUSED;
                u=unix_find_socket_byname(sunname, len, type, hash);
                if (u) {
                        struct dentry *dentry;
                        dentry = unix_sk(u)->dentry;
                        if (dentry)
                                touch_atime(unix_sk(u)->mnt, dentry);
                } else
                        goto fail;
        }
        return u;

put_fail:
        path_release(&nd);
fail:
        *error=err;
        return NULL;
}
733
734
/*
 * bind() handler.
 *
 * An address consisting of only the family (addr_len == sizeof(short))
 * requests an autobind.  Otherwise the name is validated by unix_mkname().
 * A filesystem name gets a socket inode created with vfs_mknod() and the
 * socket is hashed by inode number; an abstract name must be unique for
 * the socket type and is hashed by name.
 *
 * Returns 0 on success or a negative errno: -EINVAL for a wrong family or
 * an already bound socket, -ENOMEM, -EADDRINUSE when the name is taken
 * (-EEXIST from the VFS path is mapped to it), -ENOENT for a trailing
 * slash on a non-existent last component, or whatever unix_autobind(),
 * unix_mkname(), path_lookup(), lookup_hash() or vfs_mknod() report.
 */
static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
{
        struct sock *sk = sock->sk;
        struct unix_sock *u = unix_sk(sk);
        struct sockaddr_un *sunaddr=(struct sockaddr_un *)uaddr;
        struct dentry * dentry = NULL;
        struct nameidata nd;
        int err;
        unsigned hash;
        struct unix_address *addr;
        struct hlist_head *list;

        err = -EINVAL;
        if (sunaddr->sun_family != AF_UNIX)
                goto out;

        if (addr_len==sizeof(short)) {
                err = unix_autobind(sock);
                goto out;
        }

        err = unix_mkname(sunaddr, addr_len, &hash);
        if (err < 0)
                goto out;
        addr_len = err;

        /* readsem serialises binding of this socket */
        down(&u->readsem);

        err = -EINVAL;
        if (u->addr)
                goto out_up;

        err = -ENOMEM;
        addr = kmalloc(sizeof(*addr)+addr_len, GFP_KERNEL);
        if (!addr)
                goto out_up;

        memcpy(addr->name, sunaddr, addr_len);
        addr->len = addr_len;
        /* Only meaningful for abstract names; for filesystem names
         * addr->hash is overwritten with UNIX_HASH_SIZE below. */
        addr->hash = hash ^ sk->sk_type;
        atomic_set(&addr->refcnt, 1);

        if (sunaddr->sun_path[0]) {
                unsigned int mode;
                err = 0;
                /*
                 * Get the parent directory, calculate the hash for last
                 * component.
                 */
                err = path_lookup(sunaddr->sun_path, LOOKUP_PARENT, &nd);
                if (err)
                        goto out_mknod_parent;
                /*
                 * Yucky last component or no last component at all?
                 * (foo/., foo/.., /////)
                 */
                err = -EEXIST;
                if (nd.last_type != LAST_NORM)
                        goto out_mknod;
                /*
                 * Lock the directory.
                 */
                down(&nd.dentry->d_inode->i_sem);
                /*
                 * Do the final lookup.
                 */
                dentry = lookup_hash(&nd.last, nd.dentry);
                err = PTR_ERR(dentry);
                if (IS_ERR(dentry))
                        goto out_mknod_unlock;
                err = -ENOENT;
                /*
                 * Special case - lookup gave negative, but... we had foo/bar/
                 * From the vfs_mknod() POV we just have a negative dentry -
                 * all is fine. Let's be bastards - you had / on the end, you've
                 * been asking for (non-existent) directory. -ENOENT for you.
                 */
                if (nd.last.name[nd.last.len] && !dentry->d_inode)
                        goto out_mknod_dput;
                /*
                 * All right, let's create it.
                 */
                mode = S_IFSOCK |
                       (SOCK_INODE(sock)->i_mode & ~current->fs->umask);
                err = vfs_mknod(nd.dentry->d_inode, dentry, mode, 0);
                if (err)
                        goto out_mknod_dput;
                up(&nd.dentry->d_inode->i_sem);
                /* Swap the parent dentry in nd for the new socket dentry;
                 * nd.dentry and nd.mnt are stored in the socket below. */
                dput(nd.dentry);
                nd.dentry = dentry;

                /* Marks the address as a filesystem name (see UNIX_ABSTRACT). */
                addr->hash = UNIX_HASH_SIZE;
        }

        write_lock(&unix_table_lock);

        if (!sunaddr->sun_path[0]) {
                /* Abstract name: it must not be in use for this socket type. */
                err = -EADDRINUSE;
                if (__unix_find_socket_byname(sunaddr, addr_len,
                                              sk->sk_type, hash)) {
                        unix_release_addr(addr);
                        goto out_unlock;
                }

                list = &unix_socket_table[addr->hash];
        } else {
                /* Filesystem name: bucket chosen by inode number, as in
                 * unix_find_socket_byinode(). */
                list = &unix_socket_table[dentry->d_inode->i_ino & (UNIX_HASH_SIZE-1)];
                u->dentry = nd.dentry;
                u->mnt    = nd.mnt;
        }

        err = 0;
        /* Move the socket from the unbound list to its hash bucket. */
        __unix_remove_socket(sk);
        u->addr = addr;
        __unix_insert_socket(list, sk);

out_unlock:
        write_unlock(&unix_table_lock);
out_up:
        up(&u->readsem);
out:
        return err;

        /* Error unwinding for the filesystem-name path, in reverse order
         * of acquisition. */
out_mknod_dput:
        dput(dentry);
out_mknod_unlock:
        up(&nd.dentry->d_inode->i_sem);
out_mknod:
        path_release(&nd);
out_mknod_parent:
        if (err==-EEXIST)
                err=-EADDRINUSE;
        unix_release_addr(addr);
        goto out_up;
}
870
/*
 * Connect a datagram socket to a peer, or drop the current peer when the
 * supplied address family is AF_UNSPEC.
 *
 * Returns 0 on success.  On failure returns the negative value produced by
 * unix_mkname(), unix_autobind(), unix_find_other() (through its err
 * argument) or the security hook, or -EPERM when unix_may_send() refuses
 * the peer.
 */
871 static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
872                               int alen, int flags)
873 {
874         struct sock *sk = sock->sk;
875         struct sockaddr_un *sunaddr=(struct sockaddr_un*)addr;
876         struct sock *other;
877         unsigned hash;
878         int err;
879
880         if (addr->sa_family != AF_UNSPEC) {
881                 err = unix_mkname(sunaddr, alen, &hash);
882                 if (err < 0)
883                         goto out;
                    /* A non-negative unix_mkname() result is used as the length. */
884                 alen = err;
885
                    /* Credential-passing sockets without an address are autobound. */
886                 if (test_bit(SOCK_PASS_CRED, &sock->flags) && !unix_sk(sk)->addr &&
887                     (err = unix_autobind(sock)) != 0)
888                         goto out;
889
890                 other=unix_find_other(sunaddr, alen, sock->type, hash, &err);
891                 if (!other)
892                         goto out;
893
                    /* From here on 'other' is released with sock_put() on failure. */
894                 unix_state_wlock(sk);
895
896                 err = -EPERM;
897                 if (!unix_may_send(sk, other))
898                         goto out_unlock;
899
900                 err = security_unix_may_send(sk->sk_socket, other->sk_socket);
901                 if (err)
902                         goto out_unlock;
903
904         } else {
905                 /*
906                  *      1003.1g breaking connected state with AF_UNSPEC
907                  */
908                 other = NULL;
909                 unix_state_wlock(sk);
910         }
911
912         /*
913          * If it was connected, reconnect.
914          */
915         if (unix_peer(sk)) {
916                 struct sock *old_peer = unix_peer(sk);
917                 unix_peer(sk)=other;
918                 unix_state_wunlock(sk);
919
                    /* Only report a disconnect when the peer actually changed. */
920                 if (other != old_peer)
921                         unix_dgram_disconnected(sk, old_peer);
                    /* Drop a reference on the previous peer. */
922                 sock_put(old_peer);
923         } else {
924                 unix_peer(sk)=other;
925                 unix_state_wunlock(sk);
926         }
927         return 0;
928
929 out_unlock:
930         unix_state_wunlock(sk);
931         sock_put(other);
932 out:
933         return err;
934 }
935
/*
 * Wait on other's peer_wait queue for room in its receive queue.
 *
 * The function releases the read state lock on 'other' without taking it;
 * the callers in this file acquire it with unix_state_rlock() beforehand.
 * The task only sleeps if 'other' is not dead, has not shut down receiving
 * and its receive queue is still longer than sk_max_ack_backlog.
 *
 * Returns the value of schedule_timeout() if the task slept, otherwise
 * 'timeo' unchanged.
 */
936 static long unix_wait_for_peer(struct sock *other, long timeo)
937 {
938         struct unix_sock *u = unix_sk(other);
939         int sched;
940         DEFINE_WAIT(wait);
941
            /* Queue ourselves before testing the condition and dropping the lock. */
942         prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE);
943
944         sched = !sock_flag(other, SOCK_DEAD) &&
945                 !(other->sk_shutdown & RCV_SHUTDOWN) &&
946                 (skb_queue_len(&other->sk_receive_queue) >
947                  other->sk_max_ack_backlog);
948
949         unix_state_runlock(other);
950
951         if (sched)
952                 timeo = schedule_timeout(timeo);
953
954         finish_wait(&u->peer_wait, &wait);
955         return timeo;
956 }
957
/*
 * Connect a stream socket to a listening socket.
 *
 * A new sock (newsk) and a one-byte skb are allocated up front.  Once the
 * listener has been found and validated, newsk becomes the peer of sk (and
 * sk the peer of newsk), takes over the listener's address and dentry/mnt,
 * and the skb is queued on the listener's receive queue.  unix_accept()
 * later takes the new sock from skb->sk (presumably set to newsk by
 * sock_wmalloc() - confirm).
 *
 * Returns 0 on success or a negative errno.  On failure the skb, newsk and
 * any reference held on the listener are released.
 */
958 static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
959                                int addr_len, int flags)
960 {
961         struct sockaddr_un *sunaddr=(struct sockaddr_un *)uaddr;
962         struct sock *sk = sock->sk;
963         struct unix_sock *u = unix_sk(sk), *newu, *otheru;
964         struct sock *newsk = NULL;
965         struct sock *other = NULL;
966         struct sk_buff *skb = NULL;
967         unsigned hash;
968         int st;
969         int err;
970         long timeo;
971
972         err = unix_mkname(sunaddr, addr_len, &hash);
973         if (err < 0)
974                 goto out;
975         addr_len = err;
976
            /* Credential-passing sockets without an address are autobound. */
977         if (test_bit(SOCK_PASS_CRED, &sock->flags)
978                 && !u->addr && (err = unix_autobind(sock)) != 0)
979                 goto out;
980
981         timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);
982
983         /* First of all allocate resources.
984            If we will make it after state is locked,
985            we will have to recheck all again in any case.
986          */
987
988         err = -ENOMEM;
989
990         /* create new sock for complete connection */
991         newsk = unix_create1(NULL);
992         if (newsk == NULL)
993                 goto out;
994
995         /* Allocate skb for sending to listening sock */
996         skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
997         if (skb == NULL)
998                 goto out;
999
1000 restart:
1001         /*  Find listening sock. */
1002         other = unix_find_other(sunaddr, addr_len, sk->sk_type, hash, &err);
1003         if (!other)
1004                 goto out;
1005
1006         /* Latch state of peer */
1007         unix_state_rlock(other);
1008
1009         /* Apparently VFS overslept socket death. Retry. */
1010         if (sock_flag(other, SOCK_DEAD)) {
1011                 unix_state_runlock(other);
1012                 sock_put(other);
1013                 goto restart;
1014         }
1015
1016         err = -ECONNREFUSED;
1017         if (other->sk_state != TCP_LISTEN)
1018                 goto out_unlock;
1019
             /* Listener's queue is over its backlog: fail at once or wait. */
1020         if (skb_queue_len(&other->sk_receive_queue) >
1021             other->sk_max_ack_backlog) {
1022                 err = -EAGAIN;
1023                 if (!timeo)
1024                         goto out_unlock;
1025
                     /* unix_wait_for_peer() drops the read lock on 'other'. */
1026                 timeo = unix_wait_for_peer(other, timeo);
1027
1028                 err = sock_intr_errno(timeo);
1029                 if (signal_pending(current))
1030                         goto out;
1031                 sock_put(other);
1032                 goto restart;
1033         }
1034
1035         /* Latch our state.
1036
1037            It is tricky place. We need to grab write lock and cannot
1038            drop lock on peer. It is dangerous because deadlock is
1039            possible. Connect to self case and simultaneous
1040            attempt to connect are eliminated by checking socket
1041            state. other is TCP_LISTEN, if sk is TCP_LISTEN we
1042            check this before attempt to grab lock.
1043
1044            Well, and we have to recheck the state after socket locked.
1045          */
1046         st = sk->sk_state;
1047
1048         switch (st) {
1049         case TCP_CLOSE:
1050                 /* This is ok... continue with connect */
1051                 break;
1052         case TCP_ESTABLISHED:
1053                 /* Socket is already connected */
1054                 err = -EISCONN;
1055                 goto out_unlock;
1056         default:
1057                 err = -EINVAL;
1058                 goto out_unlock;
1059         }
1060
1061         unix_state_wlock(sk);
1062
             /* State changed while we were unlocked: start over. */
1063         if (sk->sk_state != st) {
1064                 unix_state_wunlock(sk);
1065                 unix_state_runlock(other);
1066                 sock_put(other);
1067                 goto restart;
1068         }
1069
1070         err = security_unix_stream_connect(sock, other->sk_socket, newsk);
1071         if (err) {
1072                 unix_state_wunlock(sk);
1073                 goto out_unlock;
1074         }
1075
1076         /* The way is open! Fastly set all the necessary fields... */
1077
1078         sock_hold(sk);
1079         unix_peer(newsk)        = sk;
1080         newsk->sk_state         = TCP_ESTABLISHED;
1081         newsk->sk_type          = sk->sk_type;
1082         newsk->sk_peercred.pid  = current->tgid;
1083         newsk->sk_peercred.uid  = current->euid;
1084         newsk->sk_peercred.gid  = current->egid;
1085         newu = unix_sk(newsk);
1086         newsk->sk_sleep         = &newu->peer_wait;
1087         otheru = unix_sk(other);
1088
1089         /* copy address information from listening to new sock*/
1090         if (otheru->addr) {
1091                 atomic_inc(&otheru->addr->refcnt);
1092                 newu->addr = otheru->addr;
1093         }
1094         if (otheru->dentry) {
1095                 newu->dentry    = dget(otheru->dentry);
1096                 newu->mnt       = mntget(otheru->mnt);
1097         }
1098
1099         /* Set credentials */
1100         sk->sk_peercred = other->sk_peercred;
1101
1102         sock_hold(newsk);
1103         unix_peer(sk)   = newsk;
1104         sock->state     = SS_CONNECTED;
1105         sk->sk_state    = TCP_ESTABLISHED;
1106
1107         unix_state_wunlock(sk);
1108
1109         /* take ten and send info to listening sock */
1110         spin_lock(&other->sk_receive_queue.lock);
1111         __skb_queue_tail(&other->sk_receive_queue, skb);
1112         /* Undo artificially decreased inflight after embryo
1113          * is installed to listening socket. */
1114         atomic_inc(&newu->inflight);
1115         spin_unlock(&other->sk_receive_queue.lock);
1116         unix_state_runlock(other);
1117         other->sk_data_ready(other, 0);
1118         sock_put(other);
1119         return 0;
1120
1121 out_unlock:
1122         if (other)
1123                 unix_state_runlock(other);
1124
1125 out:
1126         if (skb)
1127                 kfree_skb(skb);
1128         if (newsk)
1129                 unix_release_sock(newsk, 0);
1130         if (other)
1131                 sock_put(other);
1132         return err;
1133 }
1134
1135 static int unix_socketpair(struct socket *socka, struct socket *sockb)
1136 {
1137         struct sock *ska=socka->sk, *skb = sockb->sk;
1138
1139         /* Join our sockets back to back */
1140         sock_hold(ska);
1141         sock_hold(skb);
1142         unix_peer(ska)=skb;
1143         unix_peer(skb)=ska;
1144         ska->sk_peercred.pid = skb->sk_peercred.pid = current->tgid;
1145         ska->sk_peercred.uid = skb->sk_peercred.uid = current->euid;
1146         ska->sk_peercred.gid = skb->sk_peercred.gid = current->egid;
1147
1148         if (ska->sk_type != SOCK_DGRAM) {
1149                 ska->sk_state = TCP_ESTABLISHED;
1150                 skb->sk_state = TCP_ESTABLISHED;
1151                 socka->state  = SS_CONNECTED;
1152                 sockb->state  = SS_CONNECTED;
1153         }
1154         return 0;
1155 }
1156
1157 static int unix_accept(struct socket *sock, struct socket *newsock, int flags)
1158 {
1159         struct sock *sk = sock->sk;
1160         struct sock *tsk;
1161         struct sk_buff *skb;
1162         int err;
1163
1164         err = -EOPNOTSUPP;
1165         if (sock->type!=SOCK_STREAM && sock->type!=SOCK_SEQPACKET)
1166                 goto out;
1167
1168         err = -EINVAL;
1169         if (sk->sk_state != TCP_LISTEN)
1170                 goto out;
1171
1172         /* If socket state is TCP_LISTEN it cannot change (for now...),
1173          * so that no locks are necessary.
1174          */
1175
1176         skb = skb_recv_datagram(sk, 0, flags&O_NONBLOCK, &err);
1177         if (!skb) {
1178                 /* This means receive shutdown. */
1179                 if (err == 0)
1180                         err = -EINVAL;
1181                 goto out;
1182         }
1183
1184         tsk = skb->sk;
1185         skb_free_datagram(sk, skb);
1186         wake_up_interruptible(&unix_sk(sk)->peer_wait);
1187
1188         /* attach accepted sock to socket */
1189         unix_state_wlock(tsk);
1190         newsock->state = SS_CONNECTED;
1191         sock_graft(tsk, newsock);
1192         unix_state_wunlock(tsk);
1193         return 0;
1194
1195 out:
1196         return err;
1197 }
1198
1199
1200 static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int *uaddr_len, int peer)
1201 {
1202         struct sock *sk = sock->sk;
1203         struct unix_sock *u;
1204         struct sockaddr_un *sunaddr=(struct sockaddr_un *)uaddr;
1205         int err = 0;
1206
1207         if (peer) {
1208                 sk = unix_peer_get(sk);
1209
1210                 err = -ENOTCONN;
1211                 if (!sk)
1212                         goto out;
1213                 err = 0;
1214         } else {
1215                 sock_hold(sk);
1216         }
1217
1218         u = unix_sk(sk);
1219         unix_state_rlock(sk);
1220         if (!u->addr) {
1221                 sunaddr->sun_family = AF_UNIX;
1222                 sunaddr->sun_path[0] = 0;
1223                 *uaddr_len = sizeof(short);
1224         } else {
1225                 struct unix_address *addr = u->addr;
1226
1227                 *uaddr_len = addr->len;
1228                 memcpy(sunaddr, addr->name, *uaddr_len);
1229         }
1230         unix_state_runlock(sk);
1231         sock_put(sk);
1232 out:
1233         return err;
1234 }
1235
1236 static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb)
1237 {
1238         int i;
1239
1240         scm->fp = UNIXCB(skb).fp;
1241         skb->destructor = sock_wfree;
1242         UNIXCB(skb).fp = NULL;
1243
1244         for (i=scm->fp->count-1; i>=0; i--)
1245                 unix_notinflight(scm->fp->fp[i]);
1246 }
1247
1248 static void unix_destruct_fds(struct sk_buff *skb)
1249 {
1250         struct scm_cookie scm;
1251         memset(&scm, 0, sizeof(scm));
1252         unix_detach_fds(&scm, skb);
1253
1254         /* Alas, it calls VFS */
1255         /* So fscking what? fput() had been SMP-safe since the last Summer */
1256         scm_destroy(&scm);
1257         sock_wfree(skb);
1258 }
1259
1260 static void unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
1261 {
1262         int i;
1263         for (i=scm->fp->count-1; i>=0; i--)
1264                 unix_inflight(scm->fp->fp[i]);
1265         UNIXCB(skb).fp = scm->fp;
1266         skb->destructor = unix_destruct_fds;
1267         scm->fp = NULL;
1268 }
1269
1270 /*
1271  *      Send AF_UNIX data.
1272  */
1273
/*
 * Send one datagram.
 *
 * The destination is msg_name when msg_namelen is non-zero, otherwise the
 * connected peer.  The whole message is copied into a single skb which is
 * appended to the receiver's queue.  When that queue is over its backlog
 * limit and the receiver is not connected back to us, the sender either
 * fails with -EAGAIN (zero send timeout) or waits and retries.
 *
 * Returns len on success or a negative errno.  Once scm_send() has
 * succeeded the scm cookie is destroyed on every exit path.
 */
1274 static int unix_dgram_sendmsg(struct kiocb *kiocb, struct socket *sock,
1275                               struct msghdr *msg, size_t len)
1276 {
1277         struct sock_iocb *siocb = kiocb_to_siocb(kiocb);
1278         struct sock *sk = sock->sk;
1279         struct unix_sock *u = unix_sk(sk);
1280         struct sockaddr_un *sunaddr=msg->msg_name;
1281         struct sock *other = NULL;
1282         int namelen = 0; /* fake GCC */
1283         int err;
1284         unsigned hash;
1285         struct sk_buff *skb;
1286         long timeo;
1287         struct scm_cookie tmp_scm;
1288
             /* Fall back to an on-stack cookie when the caller supplied none. */
1289         if (NULL == siocb->scm)
1290                 siocb->scm = &tmp_scm;
1291         err = scm_send(sock, msg, siocb->scm);
1292         if (err < 0)
1293                 return err;
1294
1295         err = -EOPNOTSUPP;
1296         if (msg->msg_flags&MSG_OOB)
1297                 goto out;
1298
1299         if (msg->msg_namelen) {
1300                 err = unix_mkname(sunaddr, msg->msg_namelen, &hash);
1301                 if (err < 0)
1302                         goto out;
1303                 namelen = err;
1304         } else {
                     /* No explicit destination: use the connected peer. */
1305                 sunaddr = NULL;
1306                 err = -ENOTCONN;
1307                 other = unix_peer_get(sk);
1308                 if (!other)
1309                         goto out;
1310         }
1311
1312         if (test_bit(SOCK_PASS_CRED, &sock->flags)
1313                 && !u->addr && (err = unix_autobind(sock)) != 0)
1314                 goto out;
1315
             /* The datagram must fit in the send buffer with 32 bytes to spare. */
1316         err = -EMSGSIZE;
1317         if (len > sk->sk_sndbuf - 32)
1318                 goto out;
1319
1320         skb = sock_alloc_send_skb(sk, len, msg->msg_flags&MSG_DONTWAIT, &err);
1321         if (skb==NULL)
1322                 goto out;
1323
1324         memcpy(UNIXCREDS(skb), &siocb->scm->creds, sizeof(struct ucred));
1325         if (siocb->scm->fp)
1326                 unix_attach_fds(siocb->scm, skb);
1327
1328         skb->h.raw = skb->data;
1329         err = memcpy_fromiovec(skb_put(skb,len), msg->msg_iov, len);
1330         if (err)
1331                 goto out_free;
1332
1333         timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
1334
1335 restart:
             /* Look the destination up by name unless we already hold one. */
1336         if (!other) {
1337                 err = -ECONNRESET;
1338                 if (sunaddr == NULL)
1339                         goto out_free;
1340
1341                 other = unix_find_other(sunaddr, namelen, sk->sk_type,
1342                                         hash, &err);
1343                 if (other==NULL)
1344                         goto out_free;
1345         }
1346
1347         unix_state_rlock(other);
1348         err = -EPERM;
1349         if (!unix_may_send(sk, other))
1350                 goto out_unlock;
1351
1352         if (sock_flag(other, SOCK_DEAD)) {
1353                 /*
1354                  *      Check with 1003.1g - what should
1355                  *      datagram error
1356                  */
1357                 unix_state_runlock(other);
1358                 sock_put(other);
1359
                     /*
                      * If the dead sock was our connected peer, forget it and
                      * fail with -ECONNREFUSED; otherwise retry the lookup.
                      */
1360                 err = 0;
1361                 unix_state_wlock(sk);
1362                 if (unix_peer(sk) == other) {
1363                         unix_peer(sk)=NULL;
1364                         unix_state_wunlock(sk);
1365
1366                         unix_dgram_disconnected(sk, other);
1367                         sock_put(other);
1368                         err = -ECONNREFUSED;
1369                 } else {
1370                         unix_state_wunlock(sk);
1371                 }
1372
1373                 other = NULL;
1374                 if (err)
1375                         goto out_free;
1376                 goto restart;
1377         }
1378
1379         err = -EPIPE;
1380         if (other->sk_shutdown & RCV_SHUTDOWN)
1381                 goto out_unlock;
1382
1383         if (sk->sk_type != SOCK_SEQPACKET) {
1384                 err = security_unix_may_send(sk->sk_socket, other->sk_socket);
1385                 if (err)
1386                         goto out_unlock;
1387         }
1388
             /* Receiver is over its backlog and is not connected back to us. */
1389         if (unix_peer(other) != sk &&
1390             (skb_queue_len(&other->sk_receive_queue) >
1391              other->sk_max_ack_backlog)) {
1392                 if (!timeo) {
1393                         err = -EAGAIN;
1394                         goto out_unlock;
1395                 }
1396
                     /* unix_wait_for_peer() drops the read lock on 'other'. */
1397                 timeo = unix_wait_for_peer(other, timeo);
1398
1399                 err = sock_intr_errno(timeo);
1400                 if (signal_pending(current))
1401                         goto out_free;
1402
                     /* 'other' is still set, so the retry skips the lookup. */
1403                 goto restart;
1404         }
1405
1406         skb_queue_tail(&other->sk_receive_queue, skb);
1407         unix_state_runlock(other);
1408         other->sk_data_ready(other, len);
1409         sock_put(other);
1410         scm_destroy(siocb->scm);
1411         return len;
1412
1413 out_unlock:
1414         unix_state_runlock(other);
1415 out_free:
1416         kfree_skb(skb);
1417 out:
1418         if (other)
1419                 sock_put(other);
1420         scm_destroy(siocb->scm);
1421         return err;
1422 }
1423
1424                 
1425 static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock,
1426                                struct msghdr *msg, size_t len)
1427 {
1428         struct sock_iocb *siocb = kiocb_to_siocb(kiocb);
1429         struct sock *sk = sock->sk;
1430         struct sock *other = NULL;
1431         struct sockaddr_un *sunaddr=msg->msg_name;
1432         int err,size;
1433         struct sk_buff *skb;
1434         int sent=0;
1435         struct scm_cookie tmp_scm;
1436
1437         if (NULL == siocb->scm)
1438                 siocb->scm = &tmp_scm;
1439         err = scm_send(sock, msg, siocb->scm);
1440         if (err < 0)
1441                 return err;
1442
1443         err = -EOPNOTSUPP;
1444         if (msg->msg_flags&MSG_OOB)
1445                 goto out_err;
1446
1447         if (msg->msg_namelen) {
1448                 err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
1449                 goto out_err;
1450         } else {
1451                 sunaddr = NULL;
1452                 err = -ENOTCONN;
1453                 other = unix_peer_get(sk);
1454                 if (!other)
1455                         goto out_err;
1456         }
1457
1458         if (sk->sk_shutdown & SEND_SHUTDOWN)
1459                 goto pipe_err;
1460
1461         while(sent < len)
1462         {
1463                 /*
1464                  *      Optimisation for the fact that under 0.01% of X messages typically
1465                  *      need breaking up.
1466                  */
1467
1468                 size=len-sent;
1469
1470                 /* Keep two messages in the pipe so it schedules better */
1471                 if (size > sk->sk_sndbuf / 2 - 64)
1472                         size = sk->sk_sndbuf / 2 - 64;
1473
1474                 if (size > SKB_MAX_ALLOC)
1475                         size = SKB_MAX_ALLOC;
1476                         
1477                 /*
1478                  *      Grab a buffer
1479                  */
1480                  
1481                 skb=sock_alloc_send_skb(sk,size,msg->msg_flags&MSG_DONTWAIT, &err);
1482
1483                 if (skb==NULL)
1484                         goto out_err;
1485
1486                 /*
1487                  *      If you pass two values to the sock_alloc_send_skb
1488                  *      it tries to grab the large buffer with GFP_NOFS
1489                  *      (which can fail easily), and if it fails grab the
1490                  *      fallback size buffer which is under a page and will
1491                  *      succeed. [Alan]
1492                  */
1493                 size = min_t(int, size, skb_tailroom(skb));
1494
1495                 memcpy(UNIXCREDS(skb), &siocb->scm->creds, sizeof(struct ucred));
1496                 if (siocb->scm->fp)
1497                         unix_attach_fds(siocb->scm, skb);
1498
1499                 if ((err = memcpy_fromiovec(skb_put(skb,size), msg->msg_iov, size)) != 0) {
1500                         kfree_skb(skb);
1501                         goto out_err;
1502                 }
1503
1504                 unix_state_rlock(other);
1505
1506                 if (sock_flag(other, SOCK_DEAD) ||
1507                     (other->sk_shutdown & RCV_SHUTDOWN))
1508                         goto pipe_err_free;
1509
1510                 skb_queue_tail(&other->sk_receive_queue, skb);
1511                 unix_state_runlock(other);
1512                 other->sk_data_ready(other, size);
1513                 sent+=size;
1514         }
1515         sock_put(other);
1516
1517         scm_destroy(siocb->scm);
1518         siocb->scm = NULL;
1519
1520         return sent;
1521
1522 pipe_err_free:
1523         unix_state_runlock(other);
1524         kfree_skb(skb);
1525 pipe_err:
1526         if (sent==0 && !(msg->msg_flags&MSG_NOSIGNAL))
1527                 send_sig(SIGPIPE,current,0);
1528         err = -EPIPE;
1529 out_err:
1530         if (other)
1531                 sock_put(other);
1532         scm_destroy(siocb->scm);
1533         siocb->scm = NULL;
1534         return sent ? : err;
1535 }
1536
1537 static int unix_seqpacket_sendmsg(struct kiocb *kiocb, struct socket *sock,
1538                                   struct msghdr *msg, size_t len)
1539 {
1540         int err;
1541         struct sock *sk = sock->sk;
1542         
1543         err = sock_error(sk);
1544         if (err)
1545                 return err;
1546
1547         if (sk->sk_state != TCP_ESTABLISHED)
1548                 return -ENOTCONN;
1549
1550         if (msg->msg_namelen)
1551                 msg->msg_namelen = 0;
1552
1553         return unix_dgram_sendmsg(kiocb, sock, msg, len);
1554 }
1555                                                                                             
1556 static void unix_copy_addr(struct msghdr *msg, struct sock *sk)
1557 {
1558         struct unix_sock *u = unix_sk(sk);
1559
1560         msg->msg_namelen = 0;
1561         if (u->addr) {
1562                 msg->msg_namelen = u->addr->len;
1563                 memcpy(msg->msg_name, u->addr->name, u->addr->len);
1564         }
1565 }
1566
/*
 * Receive one datagram.
 *
 * MSG_OOB is rejected with -EOPNOTSUPP.  u->readsem is held from before
 * the skb is taken until it has been released.  The sender's address is
 * copied out when msg_name is set.  The payload is truncated to the
 * caller's buffer and MSG_TRUNC is set when the datagram was longer.
 * Credentials are always copied into the scm cookie.  Passed files are
 * detached from the skb, or duplicated when MSG_PEEK is set.
 *
 * Returns the number of bytes copied, or a negative errno.
 */
1567 static int unix_dgram_recvmsg(struct kiocb *iocb, struct socket *sock,
1568                               struct msghdr *msg, size_t size,
1569                               int flags)
1570 {
1571         struct sock_iocb *siocb = kiocb_to_siocb(iocb);
1572         struct scm_cookie tmp_scm;
1573         struct sock *sk = sock->sk;
1574         struct unix_sock *u = unix_sk(sk);
1575         int noblock = flags & MSG_DONTWAIT;
1576         struct sk_buff *skb;
1577         int err;
1578
1579         err = -EOPNOTSUPP;
1580         if (flags&MSG_OOB)
1581                 goto out;
1582
1583         msg->msg_namelen = 0;
1584
1585         down(&u->readsem);
1586
1587         skb = skb_recv_datagram(sk, flags, noblock, &err);
1588         if (!skb)
1589                 goto out_unlock;
1590
             /* Wake anyone sleeping on our peer_wait (see unix_wait_for_peer()). */
1591         wake_up_interruptible(&u->peer_wait);
1592
1593         if (msg->msg_name)
1594                 unix_copy_addr(msg, skb->sk);
1595
             /* Clamp to the datagram length; flag truncation if it is longer. */
1596         if (size > skb->len)
1597                 size = skb->len;
1598         else if (size < skb->len)
1599                 msg->msg_flags |= MSG_TRUNC;
1600
1601         err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, size);
1602         if (err)
1603                 goto out_free;
1604
             /* Fall back to a zeroed on-stack cookie when none was supplied. */
1605         if (!siocb->scm) {
1606                 siocb->scm = &tmp_scm;
1607                 memset(&tmp_scm, 0, sizeof(tmp_scm));
1608         }
1609         siocb->scm->creds = *UNIXCREDS(skb);
1610
1611         if (!(flags & MSG_PEEK))
1612         {
1613                 if (UNIXCB(skb).fp)
1614                         unix_detach_fds(siocb->scm, skb);
1615         }
1616         else 
1617         {
1618                 /* It is questionable: on PEEK we could:
1619                    - do not return fds - good, but too simple 8)
1620                    - return fds, and do not return them on read (old strategy,
1621                      apparently wrong)
1622                    - clone fds (I chose it for now, it is the most universal
1623                      solution)
1624                 
1625                    POSIX 1003.1g does not actually define this clearly
1626                    at all. POSIX 1003.1g doesn't define a lot of things
1627                    clearly however!                  
1628                    
1629                 */
1630                 if (UNIXCB(skb).fp)
1631                         siocb->scm->fp = scm_fp_dup(UNIXCB(skb).fp);
1632         }
1633         err = size;
1634
1635         scm_recv(sock, msg, siocb->scm, flags);
1636
1637 out_free:
1638         skb_free_datagram(sk,skb);
1639 out_unlock:
1640         up(&u->readsem);
1641 out:
1642         return err;
1643 }
1644
1645 /*
1646  *      Sleep until data has arrived, but check for races.
1647  */
1648  
/*
 * Wait on sk->sk_sleep until one of these holds: the receive queue is
 * non-empty, sk_err is set, receiving has been shut down, a signal is
 * pending, or the timeout has run out.
 *
 * The read state lock on sk is held while the conditions are tested and
 * released around schedule_timeout().  SOCK_ASYNC_WAITDATA is set on the
 * socket for the duration of each sleep.
 *
 * Returns the remaining timeout as last returned by schedule_timeout(), or
 * 'timeo' unchanged if no sleep was needed.
 */
1649 static long unix_stream_data_wait(struct sock * sk, long timeo)
1650 {
1651         DEFINE_WAIT(wait);
1652
1653         unix_state_rlock(sk);
1654
1655         for (;;) {
                     /* Register on the wait queue before testing the conditions. */
1656                 prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
1657
1658                 if (skb_queue_len(&sk->sk_receive_queue) ||
1659                     sk->sk_err ||
1660                     (sk->sk_shutdown & RCV_SHUTDOWN) ||
1661                     signal_pending(current) ||
1662                     !timeo)
1663                         break;
1664
1665                 set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1666                 unix_state_runlock(sk);
1667                 timeo = schedule_timeout(timeo);
1668                 unix_state_rlock(sk);
1669                 clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1670         }
1671
1672         finish_wait(sk->sk_sleep, &wait);
1673         unix_state_runlock(sk);
1674         return timeo;
1675 }
1676
1677
1678
/*
 * Receive data from a connected stream socket.
 *
 * The socket must be in TCP_ESTABLISHED (-EINVAL otherwise) and MSG_OOB is
 * rejected with -EOPNOTSUPP.  skbs are taken from the receive queue under
 * u->readsem until the caller's buffer is full or one of these stops the
 * loop:
 *   - the next skb carries different credentials from the first one;
 *   - an skb was only partly consumed (it is put back at the queue head);
 *   - passed file descriptors were picked up;
 *   - MSG_PEEK is set (the first skb is copied and put back);
 *   - the queue is empty and either enough data was copied, an error or
 *     receive shutdown is pending, or the timeout is zero.
 * The sender's address is copied out at most once.
 *
 * Returns the number of bytes copied if non-zero, otherwise err (which may
 * be 0 on receive shutdown).
 */
1679 static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock,
1680                                struct msghdr *msg, size_t size,
1681                                int flags)
1682 {
1683         struct sock_iocb *siocb = kiocb_to_siocb(iocb);
1684         struct scm_cookie tmp_scm;
1685         struct sock *sk = sock->sk;
1686         struct unix_sock *u = unix_sk(sk);
1687         struct sockaddr_un *sunaddr=msg->msg_name;
1688         int copied = 0;
1689         int check_creds = 0;
1690         int target;
1691         int err = 0;
1692         long timeo;
1693
1694         err = -EINVAL;
1695         if (sk->sk_state != TCP_ESTABLISHED)
1696                 goto out;
1697
1698         err = -EOPNOTSUPP;
1699         if (flags&MSG_OOB)
1700                 goto out;
1701
1702         target = sock_rcvlowat(sk, flags&MSG_WAITALL, size);
1703         timeo = sock_rcvtimeo(sk, flags&MSG_DONTWAIT);
1704
1705         msg->msg_namelen = 0;
1706
1707         /* Lock the socket to prevent queue disordering
1708          * while sleeps in memcpy_tomsg
1709          */
1710
             /* Fall back to a zeroed on-stack cookie when none was supplied. */
1711         if (!siocb->scm) {
1712                 siocb->scm = &tmp_scm;
1713                 memset(&tmp_scm, 0, sizeof(tmp_scm));
1714         }
1715
1716         down(&u->readsem);
1717
1718         do
1719         {
1720                 int chunk;
1721                 struct sk_buff *skb;
1722
1723                 skb = skb_dequeue(&sk->sk_receive_queue);
1724                 if (skb==NULL)
1725                 {
1726                         if (copied >= target)
1727                                 break;
1728
1729                         /*
1730                          *      POSIX 1003.1g mandates this order.
1731                          */
1732                          
1733                         if ((err = sock_error(sk)) != 0)
1734                                 break;
1735                         if (sk->sk_shutdown & RCV_SHUTDOWN)
1736                                 break;
1737                         err = -EAGAIN;
1738                         if (!timeo)
1739                                 break;
                             /* Do not hold readsem while sleeping. */
1740                         up(&u->readsem);
1741
1742                         timeo = unix_stream_data_wait(sk, timeo);
1743
                             /* readsem is not held here; this exit skips scm_recv(). */
1744                         if (signal_pending(current)) {
1745                                 err = sock_intr_errno(timeo);
1746                                 goto out;
1747                         }
1748                         down(&u->readsem);
1749                         continue;
1750                 }
1751
1752                 if (check_creds) {
1753                         /* Never glue messages from different writers */
1754                         if (memcmp(UNIXCREDS(skb), &siocb->scm->creds, sizeof(siocb->scm->creds)) != 0) {
1755                                 skb_queue_head(&sk->sk_receive_queue, skb);
1756                                 break;
1757                         }
1758                 } else {
1759                         /* Copy credentials */
1760                         siocb->scm->creds = *UNIXCREDS(skb);
1761                         check_creds = 1;
1762                 }
1763
1764                 /* Copy address just once */
1765                 if (sunaddr)
1766                 {
1767                         unix_copy_addr(msg, skb->sk);
1768                         sunaddr = NULL;
1769                 }
1770
1771                 chunk = min_t(unsigned int, skb->len, size);
1772                 if (memcpy_toiovec(msg->msg_iov, skb->data, chunk)) {
                             /* Copy failed: requeue; -EFAULT only if nothing was copied. */
1773                         skb_queue_head(&sk->sk_receive_queue, skb);
1774                         if (copied == 0)
1775                                 copied = -EFAULT;
1776                         break;
1777                 }
1778                 copied += chunk;
1779                 size -= chunk;
1780
1781                 /* Mark read part of skb as used */
1782                 if (!(flags & MSG_PEEK))
1783                 {
1784                         skb_pull(skb, chunk);
1785
1786                         if (UNIXCB(skb).fp)
1787                                 unix_detach_fds(siocb->scm, skb);
1788
1789                         /* put the skb back if we didn't use it up.. */
1790                         if (skb->len)
1791                         {
1792                                 skb_queue_head(&sk->sk_receive_queue, skb);
1793                                 break;
1794                         }
1795
1796                         kfree_skb(skb);
1797
                             /* Stop once file descriptors have been collected. */
1798                         if (siocb->scm->fp)
1799                                 break;
1800                 }
1801                 else
1802                 {
1803                         /* It is questionable, see note in unix_dgram_recvmsg.
1804                          */
1805                         if (UNIXCB(skb).fp)
1806                                 siocb->scm->fp = scm_fp_dup(UNIXCB(skb).fp);
1807
1808                         /* put message back and return */
1809                         skb_queue_head(&sk->sk_receive_queue, skb);
1810                         break;
1811                 }
1812         } while (size);
1813
1814         up(&u->readsem);
1815         scm_recv(sock, msg, siocb->scm, flags);
1816 out:
1817         return copied ? : err;
1818 }
1819
1820 static int unix_shutdown(struct socket *sock, int mode)
1821 {
1822         struct sock *sk = sock->sk;
1823         struct sock *other;
1824
1825         mode = (mode+1)&(RCV_SHUTDOWN|SEND_SHUTDOWN);
1826
1827         if (mode) {
1828                 unix_state_wlock(sk);
1829                 sk->sk_shutdown |= mode;
1830                 other=unix_peer(sk);
1831                 if (other)
1832                         sock_hold(other);
1833                 unix_state_wunlock(sk);
1834                 sk->sk_state_change(sk);
1835
1836                 if (other &&
1837                         (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) {
1838
1839                         int peer_mode = 0;
1840
1841                         if (mode&RCV_SHUTDOWN)
1842                                 peer_mode |= SEND_SHUTDOWN;
1843                         if (mode&SEND_SHUTDOWN)
1844                                 peer_mode |= RCV_SHUTDOWN;
1845                         unix_state_wlock(other);
1846                         other->sk_shutdown |= peer_mode;
1847                         unix_state_wunlock(other);
1848                         other->sk_state_change(other);
1849                         read_lock(&other->sk_callback_lock);
1850                         if (peer_mode == SHUTDOWN_MASK)
1851                                 sk_wake_async(other,1,POLL_HUP);
1852                         else if (peer_mode & RCV_SHUTDOWN)
1853                                 sk_wake_async(other,1,POLL_IN);
1854                         read_unlock(&other->sk_callback_lock);
1855                 }
1856                 if (other)
1857                         sock_put(other);
1858         }
1859         return 0;
1860 }
1861
1862 static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
1863 {
1864         struct sock *sk = sock->sk;
1865         long amount=0;
1866         int err;
1867
1868         switch(cmd)
1869         {
1870                 case SIOCOUTQ:
1871                         amount = atomic_read(&sk->sk_wmem_alloc);
1872                         err = put_user(amount, (int __user *)arg);
1873                         break;
1874                 case SIOCINQ:
1875                 {
1876                         struct sk_buff *skb;
1877                         if (sk->sk_state == TCP_LISTEN) {
1878                                 err = -EINVAL;
1879                                 break;
1880                         }
1881
1882                         spin_lock(&sk->sk_receive_queue.lock);
1883                         skb = skb_peek(&sk->sk_receive_queue);
1884                         if (skb)
1885                                 amount=skb->len;
1886                         spin_unlock(&sk->sk_receive_queue.lock);
1887                         err = put_user(amount, (int __user *)arg);
1888                         break;
1889                 }
1890
1891                 default:
1892                         err = dev_ioctl(cmd, (void __user *)arg);
1893                         break;
1894         }
1895         return err;
1896 }
1897
1898 static unsigned int unix_poll(struct file * file, struct socket *sock, poll_table *wait)
1899 {
1900         struct sock *sk = sock->sk;
1901         unsigned int mask;
1902
1903         poll_wait(file, sk->sk_sleep, wait);
1904         mask = 0;
1905
1906         /* exceptional events? */
1907         if (sk->sk_err)
1908                 mask |= POLLERR;
1909         if (sk->sk_shutdown == SHUTDOWN_MASK)
1910                 mask |= POLLHUP;
1911
1912         /* readable? */
1913         if (!skb_queue_empty(&sk->sk_receive_queue) ||
1914             (sk->sk_shutdown & RCV_SHUTDOWN))
1915                 mask |= POLLIN | POLLRDNORM;
1916
1917         /* Connection-based need to check for termination and startup */
1918         if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) && sk->sk_state == TCP_CLOSE)
1919                 mask |= POLLHUP;
1920
1921         /*
1922          * we set writable also when the other side has shut down the
1923          * connection. This prevents stuck sockets.
1924          */
1925         if (unix_writable(sk))
1926                 mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
1927
1928         return mask;
1929 }
1930
1931
1932 #ifdef CONFIG_PROC_FS
1933 static struct sock *unix_seq_idx(int *iter, loff_t pos)
1934 {
1935         loff_t off = 0;
1936         struct sock *s;
1937
1938         for (s = first_unix_socket(iter); s; s = next_unix_socket(iter, s)) {
1939                 if (off == pos) 
1940                         return s;
1941                 ++off;
1942         }
1943         return NULL;
1944 }
1945
1946
1947 static void *unix_seq_start(struct seq_file *seq, loff_t *pos)
1948 {
1949         read_lock(&unix_table_lock);
1950         return *pos ? unix_seq_idx(seq->private, *pos - 1) : ((void *) 1);
1951 }
1952
1953 static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1954 {
1955         ++*pos;
1956
1957         if (v == (void *)1) 
1958                 return first_unix_socket(seq->private);
1959         return next_unix_socket(seq->private, v);
1960 }
1961
/*
 *      seq_file stop: release the read lock on unix_table_lock that
 *      unix_seq_start() acquired.
 */
1962 static void unix_seq_stop(struct seq_file *seq, void *v)
1963 {
1964         read_unlock(&unix_table_lock);
1965 }
1966
1967 static int unix_seq_show(struct seq_file *seq, void *v)
1968 {
1969         
1970         if (v == (void *)1)
1971                 seq_puts(seq, "Num       RefCount Protocol Flags    Type St "
1972                          "Inode Path\n");
1973         else {
1974                 struct sock *s = v;
1975                 struct unix_sock *u = unix_sk(s);
1976                 unix_state_rlock(s);
1977
1978                 seq_printf(seq, "%p: %08X %08X %08X %04X %02X %5lu",
1979                         s,
1980                         atomic_read(&s->sk_refcnt),
1981                         0,
1982                         s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0,
1983                         s->sk_type,
1984                         s->sk_socket ?
1985                         (s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) :
1986                         (s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING),
1987                         sock_i_ino(s));
1988
1989                 if (u->addr) {
1990                         int i, len;
1991                         seq_putc(seq, ' ');
1992
1993                         i = 0;
1994                         len = u->addr->len - sizeof(short);
1995                         if (!UNIX_ABSTRACT(s))
1996                                 len--;
1997                         else {
1998                                 seq_putc(seq, '@');
1999                                 i++;
2000                         }
2001                         for ( ; i < len; i++)
2002                                 seq_putc(seq, u->addr->name->sun_path[i]);
2003                 }
2004                 unix_state_runlock(s);
2005                 seq_putc(seq, '\n');
2006         }
2007
2008         return 0;
2009 }
2010
/*
 *      seq_file iterator callbacks for the unix socket listing; handed to
 *      seq_open() by unix_seq_open().
 */
2011 static struct seq_operations unix_seq_ops = {
2012         .start  = unix_seq_start,
2013         .next   = unix_seq_next,
2014         .stop   = unix_seq_stop,
2015         .show   = unix_seq_show,
2016 };
2017
2018
2019 static int unix_seq_open(struct inode *inode, struct file *file)
2020 {
2021         struct seq_file *seq;
2022         int rc = -ENOMEM;
2023         int *iter = kmalloc(sizeof(int), GFP_KERNEL);
2024
2025         if (!iter)
2026                 goto out;
2027
2028         rc = seq_open(file, &unix_seq_ops);
2029         if (rc)
2030                 goto out_kfree;
2031
2032         seq          = file->private_data;
2033         seq->private = iter;
2034         *iter = 0;
2035 out:
2036         return rc;
2037 out_kfree:
2038         kfree(iter);
2039         goto out;
2040 }
2041
/*
 *      File operations for the "unix" proc entry created in af_unix_init().
 *      NOTE(review): seq_release_private presumably frees the seq->private
 *      state allocated by unix_seq_open() -- confirm against seq_file.
 */
2042 static struct file_operations unix_seq_fops = {
2043         .owner          = THIS_MODULE,
2044         .open           = unix_seq_open,
2045         .read           = seq_read,
2046         .llseek         = seq_lseek,
2047         .release        = seq_release_private,
2048 };
2049
2050 #endif
2051
/*
 *      Protocol family descriptor for PF_UNIX, passed to sock_register()
 *      in af_unix_init(); unix_create is its socket creation hook.
 */
2052 static struct net_proto_family unix_family_ops = {
2053         .family = PF_UNIX,
2054         .create = unix_create,
2055         .owner  = THIS_MODULE,
2056 };
2057
/*
 *      sysctl hooks: defined elsewhere when CONFIG_SYSCTL is set, otherwise
 *      empty inline stubs so the init/exit code can call them without
 *      its own #ifdef.
 */
2058 #ifdef CONFIG_SYSCTL
2059 extern void unix_sysctl_register(void);
2060 extern void unix_sysctl_unregister(void);
2061 #else
2062 static inline void unix_sysctl_register(void) {}
2063 static inline void unix_sysctl_unregister(void) {}
2064 #endif
2065
2066 static int __init af_unix_init(void)
2067 {
2068         struct sk_buff *dummy_skb;
2069
2070         if (sizeof(struct unix_skb_parms) > sizeof(dummy_skb->cb)) {
2071                 printk(KERN_CRIT "%s: panic\n", __FUNCTION__);
2072                 return -1;
2073         }
2074         /* allocate our sock slab cache */
2075         unix_sk_cachep = kmem_cache_create("unix_sock",
2076                                            sizeof(struct unix_sock), 0,
2077                                            SLAB_HWCACHE_ALIGN, NULL, NULL);
2078         if (!unix_sk_cachep)
2079                 printk(KERN_CRIT
2080                         "af_unix_init: Cannot create unix_sock SLAB cache!\n");
2081
2082         sock_register(&unix_family_ops);
2083 #ifdef CONFIG_PROC_FS
2084         proc_net_fops_create("unix", 0, &unix_seq_fops);
2085 #endif
2086         unix_sysctl_register();
2087         return 0;
2088 }
2089
/*
 *      Module unload: unregister the PF_UNIX family and the sysctl hooks,
 *      remove the "unix" proc entry and destroy the unix_sock slab cache.
 *      NOTE(review): proc_net_remove() is called without the CONFIG_PROC_FS
 *      guard used around proc_net_fops_create() in af_unix_init() --
 *      presumably it is a no-op stub without procfs; confirm.
 */
2090 static void __exit af_unix_exit(void)
2091 {
2092         sock_unregister(PF_UNIX);
2093         unix_sysctl_unregister();
2094         proc_net_remove("unix");
2095         kmem_cache_destroy(unix_sk_cachep);
2096 }
2097
2098 module_init(af_unix_init);
2099 module_exit(af_unix_exit);
2100
2101 MODULE_LICENSE("GPL");
2102 MODULE_ALIAS_NETPROTO(PF_UNIX);