patch-2.6.6-vs1.9.0
[linux-2.6.git] / net / unix / af_unix.c
1 /*
2  * NET4:        Implementation of BSD Unix domain sockets.
3  *
4  * Authors:     Alan Cox, <alan.cox@linux.org>
5  *
6  *              This program is free software; you can redistribute it and/or
7  *              modify it under the terms of the GNU General Public License
8  *              as published by the Free Software Foundation; either version
9  *              2 of the License, or (at your option) any later version.
10  *
11  * Version:     $Id: af_unix.c,v 1.133 2002/02/08 03:57:19 davem Exp $
12  *
13  * Fixes:
14  *              Linus Torvalds  :       Assorted bug cures.
15  *              Niibe Yutaka    :       async I/O support.
16  *              Carsten Paeth   :       PF_UNIX check, address fixes.
17  *              Alan Cox        :       Limit size of allocated blocks.
18  *              Alan Cox        :       Fixed the stupid socketpair bug.
19  *              Alan Cox        :       BSD compatibility fine tuning.
20  *              Alan Cox        :       Fixed a bug in connect when interrupted.
21  *              Alan Cox        :       Sorted out a proper draft version of
22  *                                      file descriptor passing hacked up from
23  *                                      Mike Shaver's work.
24  *              Marty Leisner   :       Fixes to fd passing
25  *              Nick Nevin      :       recvmsg bugfix.
26  *              Alan Cox        :       Started proper garbage collector
27  *              Heiko EiBfeldt  :       Missing verify_area check
28  *              Alan Cox        :       Started POSIXisms
29  *              Andreas Schwab  :       Replace inode by dentry for proper
30  *                                      reference counting
31  *              Kirk Petersen   :       Made this a module
32  *          Christoph Rohland   :       Elegant non-blocking accept/connect algorithm.
33  *                                      Lots of bug fixes.
34  *           Alexey Kuznetsov   :       Repaired (I hope) bugs introduced
35  *                                      by the above two patches.
36  *           Andrea Arcangeli   :       If possible we block in connect(2)
37  *                                      if the max backlog of the listen socket
38  *                                      has been reached. This won't break
39  *                                      old apps and it will avoid a huge number
40  *                                      of socks hashed (this is for unix_gc()
41  *                                      performance reasons).
42  *                                      Security fix that limits the max
43  *                                      number of socks to 2*max_files and
44  *                                      the number of skb queueable in the
45  *                                      dgram receiver.
46  *              Artur Skawina   :       Hash function optimizations
47  *           Alexey Kuznetsov   :       Full scale SMP. Lot of bugs are introduced 8)
48  *            Malcolm Beattie   :       Set peercred for socketpair
49  *           Michal Ostrowski   :       Module initialization cleanup.
50  *           Arnaldo C. Melo    :       Remove MOD_{INC,DEC}_USE_COUNT,
51  *                                      the core infrastructure is doing that
52  *                                      for all net proto families now (2.5.69+)
53  *
54  *
55  * Known differences from reference BSD that was tested:
56  *
57  *      [TO FIX]
58  *      ECONNREFUSED is not returned from one end of a connected() socket to the
59  *              other the moment one end closes.
60  *      fstat() doesn't return st_dev=0, and give the blksize as high water mark
61  *              and a fake inode identifier (nor the BSD first socket fstat twice bug).
62  *      [NOT TO FIX]
63  *      accept() returns a path name even if the connecting socket has closed
64  *              in the meantime (BSD loses the path and gives up).
65  *      accept() returns 0 length path for an unbound connector. BSD returns 16
66  *              and a null first byte in the path (but not for gethost/peername - BSD bug ??)
67  *      socketpair(...SOCK_RAW..) doesn't panic the kernel.
68  *      BSD af_unix apparently has connect forgetting to block properly.
69  *              (need to check this with the POSIX spec in detail)
70  *
71  * Differences from 2.0.0-11-... (ANK)
72  *      Bug fixes and improvements.
73  *              - client shutdown killed server socket.
74  *              - removed all useless cli/sti pairs.
75  *
76  *      Semantic changes/extensions.
77  *              - generic control message passing.
78  *              - SCM_CREDENTIALS control message.
79  *              - "Abstract" (not FS based) socket bindings.
80  *                Abstract names are sequences of bytes (not zero terminated)
81  *                started by 0, so that this name space does not intersect
82  *                with BSD names.
83  */
84
85 #include <linux/module.h>
86 #include <linux/config.h>
87 #include <linux/kernel.h>
88 #include <linux/major.h>
89 #include <linux/signal.h>
90 #include <linux/sched.h>
91 #include <linux/errno.h>
92 #include <linux/string.h>
93 #include <linux/stat.h>
94 #include <linux/dcache.h>
95 #include <linux/namei.h>
96 #include <linux/socket.h>
97 #include <linux/un.h>
98 #include <linux/fcntl.h>
99 #include <linux/termios.h>
100 #include <linux/sockios.h>
101 #include <linux/net.h>
102 #include <linux/in.h>
103 #include <linux/fs.h>
104 #include <linux/slab.h>
105 #include <asm/uaccess.h>
106 #include <linux/skbuff.h>
107 #include <linux/netdevice.h>
108 #include <net/sock.h>
109 #include <linux/tcp.h>
110 #include <net/af_unix.h>
111 #include <linux/proc_fs.h>
112 #include <linux/seq_file.h>
113 #include <net/scm.h>
114 #include <linux/init.h>
115 #include <linux/poll.h>
116 #include <linux/smp_lock.h>
117 #include <linux/rtnetlink.h>
118 #include <linux/mount.h>
119 #include <net/checksum.h>
120 #include <linux/security.h>
121
/* Value copied into sk_max_ack_backlog of every socket made by unix_create1(). */
122 int sysctl_unix_max_dgram_qlen = 10;
123
/* Slab cache passed to sk_alloc() when a unix socket is created. */
124 kmem_cache_t *unix_sk_cachep;
125
/*
 * Hash table of all unix sockets.  The extra bucket at index UNIX_HASH_SIZE
 * is the list of unbound sockets (see unix_sockets_unbound below).
 */
126 struct hlist_head unix_socket_table[UNIX_HASH_SIZE + 1];
/* Guards unix_socket_table. */
127 rwlock_t unix_table_lock = RW_LOCK_UNLOCKED;
/* Number of live unix sockets; unix_create1() refuses new ones above 2*max_files. */
128 static atomic_t unix_nr_socks = ATOMIC_INIT(0);
129
/* Bucket that holds sockets which have no address yet. */
130 #define unix_sockets_unbound    (&unix_socket_table[UNIX_HASH_SIZE])
131
/*
 * True for an abstract-namespace binding: unix_bind() stores UNIX_HASH_SIZE
 * in addr->hash for filesystem names.  Dereferences addr, so the socket must
 * already be bound.
 */
132 #define UNIX_ABSTRACT(sk)       (unix_sk(sk)->addr->hash != UNIX_HASH_SIZE)
133
134 /*
135  *  SMP locking strategy:
136  *    hash table is protected with rwlock unix_table_lock
137  *    each socket state is protected by separate rwlock.
138  */
139
140 static inline unsigned unix_hash_fold(unsigned hash)
141 {
142         hash ^= hash>>16;
143         hash ^= hash>>8;
144         return hash&(UNIX_HASH_SIZE-1);
145 }
146
/*
 * The socket paired with sk (sk_pair).  Expands to an lvalue: code in this
 * file both reads it and assigns through it.
 */
147 #define unix_peer(sk) ((sk)->sk_pair)
148
/* Non-zero when osk's peer pointer refers back to sk. */
static inline int unix_our_peer(struct sock *sk, struct sock *osk)
{
        return sk == unix_peer(osk);
}
153
154 static inline int unix_may_send(struct sock *sk, struct sock *osk)
155 {
156         return (unix_peer(osk) == NULL || unix_our_peer(sk, osk));
157 }
158
159 static struct sock *unix_peer_get(struct sock *s)
160 {
161         struct sock *peer;
162
163         unix_state_rlock(s);
164         peer = unix_peer(s);
165         if (peer)
166                 sock_hold(peer);
167         unix_state_runlock(s);
168         return peer;
169 }
170
171 static inline void unix_release_addr(struct unix_address *addr)
172 {
173         if (atomic_dec_and_test(&addr->refcnt))
174                 kfree(addr);
175 }
176
177 /*
178  *      Check unix socket name:
179  *              - should be not zero length.
180  *              - if started by not zero, should be NULL terminated (FS object)
181  *              - if started by zero, it is abstract name.
182  */
183  
184 static int unix_mkname(struct sockaddr_un * sunaddr, int len, unsigned *hashp)
185 {
186         if (len <= sizeof(short) || len > sizeof(*sunaddr))
187                 return -EINVAL;
188         if (!sunaddr || sunaddr->sun_family != AF_UNIX)
189                 return -EINVAL;
190         if (sunaddr->sun_path[0])
191         {
192                 /*
193                  *      This may look like an off by one error but it is
194                  *      a bit more subtle. 108 is the longest valid AF_UNIX
195                  *      path for a binding. sun_path[108] doesn't as such
196                  *      exist. However in kernel space we are guaranteed that
197                  *      it is a valid memory location in our kernel
198                  *      address buffer.
199                  */
200                 if (len > sizeof(*sunaddr))
201                         len = sizeof(*sunaddr);
202                 ((char *)sunaddr)[len]=0;
203                 len = strlen(sunaddr->sun_path)+1+sizeof(short);
204                 return len;
205         }
206
207         *hashp = unix_hash_fold(csum_partial((char*)sunaddr, len, 0));
208         return len;
209 }
210
/*
 * Unlink sk from whatever hash bucket it is on.  Takes no lock itself;
 * callers in this file hold unix_table_lock for writing.
 */
211 static void __unix_remove_socket(struct sock *sk)
212 {
213         sk_del_node_init(sk);
214 }
215
/*
 * Link sk onto the given bucket.  sk is expected to be unhashed (BUG_TRAP
 * complains otherwise).  Takes no lock itself; callers in this file hold
 * unix_table_lock for writing.
 */
216 static void __unix_insert_socket(struct hlist_head *list, struct sock *sk)
217 {
218         BUG_TRAP(sk_unhashed(sk));
219         sk_add_node(sk, list);
220 }
221
222 static inline void unix_remove_socket(struct sock *sk)
223 {
224         write_lock(&unix_table_lock);
225         __unix_remove_socket(sk);
226         write_unlock(&unix_table_lock);
227 }
228
229 static inline void unix_insert_socket(struct hlist_head *list, struct sock *sk)
230 {
231         write_lock(&unix_table_lock);
232         __unix_insert_socket(list, sk);
233         write_unlock(&unix_table_lock);
234 }
235
236 static struct sock *__unix_find_socket_byname(struct sockaddr_un *sunname,
237                                               int len, int type, unsigned hash)
238 {
239         struct sock *s;
240         struct hlist_node *node;
241
242         sk_for_each(s, node, &unix_socket_table[hash ^ type]) {
243                 struct unix_sock *u = unix_sk(s);
244
245                 if (u->addr->len == len &&
246                     !memcmp(u->addr->name, sunname, len))
247                         goto found;
248         }
249         s = NULL;
250 found:
251         return s;
252 }
253
254 static inline struct sock *unix_find_socket_byname(struct sockaddr_un *sunname,
255                                                    int len, int type,
256                                                    unsigned hash)
257 {
258         struct sock *s;
259
260         read_lock(&unix_table_lock);
261         s = __unix_find_socket_byname(sunname, len, type, hash);
262         if (s)
263                 sock_hold(s);
264         read_unlock(&unix_table_lock);
265         return s;
266 }
267
268 static struct sock *unix_find_socket_byinode(struct inode *i)
269 {
270         struct sock *s;
271         struct hlist_node *node;
272
273         read_lock(&unix_table_lock);
274         sk_for_each(s, node,
275                     &unix_socket_table[i->i_ino & (UNIX_HASH_SIZE - 1)]) {
276                 struct dentry *dentry = unix_sk(s)->dentry;
277
278                 if(dentry && dentry->d_inode == i)
279                 {
280                         sock_hold(s);
281                         goto found;
282                 }
283         }
284         s = NULL;
285 found:
286         read_unlock(&unix_table_lock);
287         return s;
288 }
289
290 static inline int unix_writable(struct sock *sk)
291 {
292         return (atomic_read(&sk->sk_wmem_alloc) << 2) <= sk->sk_sndbuf;
293 }
294
295 static void unix_write_space(struct sock *sk)
296 {
297         read_lock(&sk->sk_callback_lock);
298         if (unix_writable(sk)) {
299                 if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
300                         wake_up_interruptible(sk->sk_sleep);
301                 sk_wake_async(sk, 2, POLL_OUT);
302         }
303         read_unlock(&sk->sk_callback_lock);
304 }
305
306 /* When dgram socket disconnects (or changes its peer), we clear its receive
307  * queue of packets arrived from previous peer. First, it allows to do
308  * flow control based only on wmem_alloc; second, sk connected to peer
309  * may receive messages only from that peer. */
310 static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
311 {
312         if (skb_queue_len(&sk->sk_receive_queue)) {
313                 skb_queue_purge(&sk->sk_receive_queue);
314                 wake_up_interruptible_all(&unix_sk(sk)->peer_wait);
315
316                 /* If one link of bidirectional dgram pipe is disconnected,
317                  * we signal error. Messages are lost. Do not make this,
318                  * when peer was not connected to us.
319                  */
320                 if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) {
321                         other->sk_err = ECONNRESET;
322                         other->sk_error_report(other);
323                 }
324         }
325 }
326
327 static void unix_sock_destructor(struct sock *sk)
328 {
329         struct unix_sock *u = unix_sk(sk);
330
331         skb_queue_purge(&sk->sk_receive_queue);
332
333         BUG_TRAP(!atomic_read(&sk->sk_wmem_alloc));
334         BUG_TRAP(sk_unhashed(sk));
335         BUG_TRAP(!sk->sk_socket);
336         if (!sock_flag(sk, SOCK_DEAD)) {
337                 printk("Attempt to release alive unix socket: %p\n", sk);
338                 return;
339         }
340
341         if (u->addr)
342                 unix_release_addr(u->addr);
343
344         atomic_dec(&unix_nr_socks);
345 #ifdef UNIX_REFCNT_DEBUG
346         printk(KERN_DEBUG "UNIX %p is destroyed, %d are still alive.\n", sk, atomic_read(&unix_nr_socks));
347 #endif
348 }
349
/*
 * Tear down a unix socket.
 *
 * @sk:      socket to release
 * @embrion: non-zero when sk is the socket attached to an skb that was still
 *           queued on a listening socket (the recursive call below passes 1);
 *           it forces ECONNRESET on the peer.
 *
 * Unhashes the socket, marks it orphaned/shut down/closed, notifies a
 * stream or seqpacket peer, flushes the receive queue, drops the filesystem
 * references and one socket reference, and finally runs the fd garbage
 * collector when any fds are in flight.  Always returns 0.
 */
350 static int unix_release_sock (struct sock *sk, int embrion)
351 {
352         struct unix_sock *u = unix_sk(sk);
353         struct dentry *dentry;
354         struct vfsmount *mnt;
355         struct sock *skpair;
356         struct sk_buff *skb;
357         int state;
358
            /* Take the socket out of the hash table first. */
359         unix_remove_socket(sk);
360
361         /* Clear state */
            /*
             * Under the state write lock: detach the dentry/mnt (released
             * further down, outside the lock) and remember the old state so
             * the queue flush can tell whether this was a listener.
             */
362         unix_state_wlock(sk);
363         sock_orphan(sk);
364         sk->sk_shutdown = SHUTDOWN_MASK;
365         dentry       = u->dentry;
366         u->dentry    = NULL;
367         mnt          = u->mnt;
368         u->mnt       = NULL;
369         state = sk->sk_state;
370         sk->sk_state = TCP_CLOSE;
371         unix_state_wunlock(sk);
372
            /* Wake everything sleeping on this socket's peer_wait queue. */
373         wake_up_interruptible_all(&u->peer_wait);
374
375         skpair=unix_peer(sk);
376
377         if (skpair!=NULL) {
                    /* Only connection-oriented peers are told about the close. */
378                 if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) {
379                         unix_state_wlock(skpair);
380                         /* No more writes */
381                         skpair->sk_shutdown = SHUTDOWN_MASK;
                            /* Unread data (or an embrion socket) means a reset for the peer. */
382                         if (!skb_queue_empty(&sk->sk_receive_queue) || embrion)
383                                 skpair->sk_err = ECONNRESET;
384                         unix_state_wunlock(skpair);
385                         skpair->sk_state_change(skpair);
386                         read_lock(&skpair->sk_callback_lock);
387                         sk_wake_async(skpair,1,POLL_HUP);
388                         read_unlock(&skpair->sk_callback_lock);
389                 }
390                 sock_put(skpair); /* It may now die */
391                 unix_peer(sk) = NULL;
392         }
393
394         /* Try to flush out this socket. Throw out buffers at least */
395
396         while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
                    /* On a listener each queued skb carries a socket that must be released too. */
397                 if (state==TCP_LISTEN)
398                         unix_release_sock(skb->sk, 1);
399                 /* passed fds are erased in the kfree_skb hook        */
400                 kfree_skb(skb);
401         }
402
            /* Drop the filesystem references taken at bind time. */
403         if (dentry) {
404                 dput(dentry);
405                 mntput(mnt);
406         }
407
            /* Release the vx/nx context info attached in unix_create1(). */
408         clr_vx_info(&sk->sk_vx_info);
409         clr_nx_info(&sk->sk_nx_info);
410         sock_put(sk);
411
412         /* ---- Socket is dead now and most probably destroyed ---- */
413
414         /*
415          * Fixme: BSD difference: In BSD all sockets connected to use get
416          *        ECONNRESET and we die on the spot. In Linux we behave
417          *        like files and pipes do and wait for the last
418          *        dereference.
419          *
420          * Can't we simply set sock->err?
421          *
422          *        What the above comment does talk about? --ANK(980817)
423          */
424
425         if (atomic_read(&unix_tot_inflight))
426                 unix_gc();              /* Garbage collect fds */       
427
428         return 0;
429 }
430
431 static int unix_listen(struct socket *sock, int backlog)
432 {
433         int err;
434         struct sock *sk = sock->sk;
435         struct unix_sock *u = unix_sk(sk);
436
437         err = -EOPNOTSUPP;
438         if (sock->type!=SOCK_STREAM && sock->type!=SOCK_SEQPACKET)
439                 goto out;                       /* Only stream/seqpacket sockets accept */
440         err = -EINVAL;
441         if (!u->addr)
442                 goto out;                       /* No listens on an unbound socket */
443         unix_state_wlock(sk);
444         if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN)
445                 goto out_unlock;
446         if (backlog > sk->sk_max_ack_backlog)
447                 wake_up_interruptible_all(&u->peer_wait);
448         sk->sk_max_ack_backlog  = backlog;
449         sk->sk_state            = TCP_LISTEN;
450         /* set credentials so connect can copy them */
451         sk->sk_peercred.pid     = current->tgid;
452         sk->sk_peercred.uid     = current->euid;
453         sk->sk_peercred.gid     = current->egid;
454         err = 0;
455
456 out_unlock:
457         unix_state_wunlock(sk);
458 out:
459         return err;
460 }
461
/*
 * Forward declarations for the handlers referenced by the proto_ops tables
 * that follow (unix_stream_ops, unix_dgram_ops, unix_seqpacket_ops).
 */
462 static int unix_release(struct socket *);
463 static int unix_bind(struct socket *, struct sockaddr *, int);
464 static int unix_stream_connect(struct socket *, struct sockaddr *,
465                                int addr_len, int flags);
466 static int unix_socketpair(struct socket *, struct socket *);
467 static int unix_accept(struct socket *, struct socket *, int);
468 static int unix_getname(struct socket *, struct sockaddr *, int *, int);
469 static unsigned int unix_poll(struct file *, struct socket *, poll_table *);
470 static int unix_ioctl(struct socket *, unsigned int, unsigned long);
471 static int unix_shutdown(struct socket *, int);
472 static int unix_stream_sendmsg(struct kiocb *, struct socket *,
473                                struct msghdr *, size_t);
474 static int unix_stream_recvmsg(struct kiocb *, struct socket *,
475                                struct msghdr *, size_t, int);
476 static int unix_dgram_sendmsg(struct kiocb *, struct socket *,
477                               struct msghdr *, size_t);
478 static int unix_dgram_recvmsg(struct kiocb *, struct socket *,
479                               struct msghdr *, size_t, int);
480 static int unix_dgram_connect(struct socket *, struct sockaddr *,
481                               int, int);
482
483 static struct proto_ops unix_stream_ops = {
484         .family =       PF_UNIX,
485         .owner =        THIS_MODULE,
486         .release =      unix_release,
487         .bind =         unix_bind,
488         .connect =      unix_stream_connect,
489         .socketpair =   unix_socketpair,
490         .accept =       unix_accept,
491         .getname =      unix_getname,
492         .poll =         unix_poll,
493         .ioctl =        unix_ioctl,
494         .listen =       unix_listen,
495         .shutdown =     unix_shutdown,
496         .setsockopt =   sock_no_setsockopt,
497         .getsockopt =   sock_no_getsockopt,
498         .sendmsg =      unix_stream_sendmsg,
499         .recvmsg =      unix_stream_recvmsg,
500         .mmap =         sock_no_mmap,
501         .sendpage =     sock_no_sendpage,
502 };
503
504 static struct proto_ops unix_dgram_ops = {
505         .family =       PF_UNIX,
506         .owner =        THIS_MODULE,
507         .release =      unix_release,
508         .bind =         unix_bind,
509         .connect =      unix_dgram_connect,
510         .socketpair =   unix_socketpair,
511         .accept =       sock_no_accept,
512         .getname =      unix_getname,
513         .poll =         datagram_poll,
514         .ioctl =        unix_ioctl,
515         .listen =       sock_no_listen,
516         .shutdown =     unix_shutdown,
517         .setsockopt =   sock_no_setsockopt,
518         .getsockopt =   sock_no_getsockopt,
519         .sendmsg =      unix_dgram_sendmsg,
520         .recvmsg =      unix_dgram_recvmsg,
521         .mmap =         sock_no_mmap,
522         .sendpage =     sock_no_sendpage,
523 };
524
525 static struct proto_ops unix_seqpacket_ops = {
526         .family =       PF_UNIX,
527         .owner =        THIS_MODULE,
528         .release =      unix_release,
529         .bind =         unix_bind,
530         .connect =      unix_stream_connect,
531         .socketpair =   unix_socketpair,
532         .accept =       unix_accept,
533         .getname =      unix_getname,
534         .poll =         datagram_poll,
535         .ioctl =        unix_ioctl,
536         .listen =       unix_listen,
537         .shutdown =     unix_shutdown,
538         .setsockopt =   sock_no_setsockopt,
539         .getsockopt =   sock_no_getsockopt,
540         .sendmsg =      unix_dgram_sendmsg,
541         .recvmsg =      unix_dgram_recvmsg,
542         .mmap =         sock_no_mmap,
543         .sendpage =     sock_no_sendpage,
544 };
545
546 static struct sock * unix_create1(struct socket *sock)
547 {
548         struct sock *sk = NULL;
549         struct unix_sock *u;
550
551         if (atomic_read(&unix_nr_socks) >= 2*files_stat.max_files)
552                 goto out;
553
554         sk = sk_alloc(PF_UNIX, GFP_KERNEL, sizeof(struct unix_sock),
555                       unix_sk_cachep);
556         if (!sk)
557                 goto out;
558
559         atomic_inc(&unix_nr_socks);
560
561         sock_init_data(sock,sk);
562         sk_set_owner(sk, THIS_MODULE);
563
564         set_vx_info(&sk->sk_vx_info, current->vx_info);
565         set_nx_info(&sk->sk_nx_info, current->nx_info);
566         sk->sk_xid = vx_current_xid();
567
568         sk->sk_write_space      = unix_write_space;
569         sk->sk_max_ack_backlog  = sysctl_unix_max_dgram_qlen;
570         sk->sk_destruct         = unix_sock_destructor;
571         u         = unix_sk(sk);
572         u->dentry = NULL;
573         u->mnt    = NULL;
574         rwlock_init(&u->lock);
575         atomic_set(&u->inflight, sock ? 0 : -1);
576         init_MUTEX(&u->readsem); /* single task reading lock */
577         init_waitqueue_head(&u->peer_wait);
578         unix_insert_socket(unix_sockets_unbound, sk);
579 out:
580         return sk;
581 }
582
583 static int unix_create(struct socket *sock, int protocol)
584 {
585         if (protocol && protocol != PF_UNIX)
586                 return -EPROTONOSUPPORT;
587
588         sock->state = SS_UNCONNECTED;
589
590         switch (sock->type) {
591         case SOCK_STREAM:
592                 sock->ops = &unix_stream_ops;
593                 break;
594                 /*
595                  *      Believe it or not BSD has AF_UNIX, SOCK_RAW though
596                  *      nothing uses it.
597                  */
598         case SOCK_RAW:
599                 sock->type=SOCK_DGRAM;
600         case SOCK_DGRAM:
601                 sock->ops = &unix_dgram_ops;
602                 break;
603         case SOCK_SEQPACKET:
604                 sock->ops = &unix_seqpacket_ops;
605                 break;
606         default:
607                 return -ESOCKTNOSUPPORT;
608         }
609
610         return unix_create1(sock) ? 0 : -ENOMEM;
611 }
612
613 static int unix_release(struct socket *sock)
614 {
615         struct sock *sk = sock->sk;
616
617         if (!sk)
618                 return 0;
619
620         sock->sk = NULL;
621
622         return unix_release_sock (sk, 0);
623 }
624
625 static int unix_autobind(struct socket *sock)
626 {
627         struct sock *sk = sock->sk;
628         struct unix_sock *u = unix_sk(sk);
629         static u32 ordernum = 1;
630         struct unix_address * addr;
631         int err;
632
633         down(&u->readsem);
634
635         err = 0;
636         if (u->addr)
637                 goto out;
638
639         err = -ENOMEM;
640         addr = kmalloc(sizeof(*addr) + sizeof(short) + 16, GFP_KERNEL);
641         if (!addr)
642                 goto out;
643
644         memset(addr, 0, sizeof(*addr) + sizeof(short) + 16);
645         addr->name->sun_family = AF_UNIX;
646         atomic_set(&addr->refcnt, 1);
647
648 retry:
649         addr->len = sprintf(addr->name->sun_path+1, "%05x", ordernum) + 1 + sizeof(short);
650         addr->hash = unix_hash_fold(csum_partial((void*)addr->name, addr->len, 0));
651
652         write_lock(&unix_table_lock);
653         ordernum = (ordernum+1)&0xFFFFF;
654
655         if (__unix_find_socket_byname(addr->name, addr->len, sock->type,
656                                       addr->hash)) {
657                 write_unlock(&unix_table_lock);
658                 /* Sanity yield. It is unusual case, but yet... */
659                 if (!(ordernum&0xFF))
660                         yield();
661                 goto retry;
662         }
663         addr->hash ^= sk->sk_type;
664
665         __unix_remove_socket(sk);
666         u->addr = addr;
667         __unix_insert_socket(&unix_socket_table[addr->hash], sk);
668         write_unlock(&unix_table_lock);
669         err = 0;
670
671 out:    up(&u->readsem);
672         return err;
673 }
674
/*
 * Resolve a destination address to a socket.
 *
 * A name whose first path byte is zero is looked up by name in the hash
 * table.  Any other name is resolved through the filesystem: the caller
 * needs write permission on the inode, which must be a socket inode with a
 * unix socket of the requested type bound to it.
 *
 * Returns the socket with a reference held.  On failure returns NULL and
 * stores a negative errno in *error (-ECONNREFUSED, -EPROTOTYPE, or the
 * error from path lookup / permission check); *error is left untouched on
 * success.
 */
static struct sock *unix_find_other(struct sockaddr_un *sunname, int len,
                                    int type, unsigned hash, int *error)
{
        struct nameidata nd;
        struct inode *inode;
        struct sock *peer;
        int err;

        if (!sunname->sun_path[0]) {
                struct dentry *bound;

                peer = unix_find_socket_byname(sunname, len, type, hash);
                if (!peer) {
                        err = -ECONNREFUSED;
                        goto fail;
                }

                bound = unix_sk(peer)->dentry;
                if (bound)
                        touch_atime(unix_sk(peer)->mnt, bound);
                return peer;
        }

        err = path_lookup(sunname->sun_path, LOOKUP_FOLLOW, &nd);
        if (err)
                goto fail;

        inode = nd.dentry->d_inode;
        err = permission(inode, MAY_WRITE, &nd);
        if (err)
                goto put_fail;

        err = -ECONNREFUSED;
        if (!S_ISSOCK(inode->i_mode))
                goto put_fail;
        peer = unix_find_socket_byinode(inode);
        if (!peer)
                goto put_fail;

        if (peer->sk_type != type) {
                /* Right inode, wrong kind of socket. */
                path_release(&nd);
                sock_put(peer);
                err = -EPROTOTYPE;
                goto fail;
        }

        touch_atime(nd.mnt, nd.dentry);
        path_release(&nd);
        return peer;

put_fail:
        path_release(&nd);
fail:
        *error = err;
        return NULL;
}
726
727
/*
 * Bind a unix socket to an address.
 *
 * An addr_len of exactly sizeof(short) (family only) requests an
 * automatically chosen name via unix_autobind().  A name whose first
 * sun_path byte is non-zero is created in the filesystem as a socket inode
 * with vfs_mknod(); a name starting with a zero byte is an abstract name
 * that only has to be unique in the hash table.
 *
 * Returns 0 on success or a negative errno: -EINVAL (wrong family, bad
 * length, or socket already bound), -ENOMEM, -EADDRINUSE (abstract name
 * taken, or -EEXIST from the filesystem path), -ENOENT (trailing slash on a
 * non-existent last component), or whatever path_lookup(), lookup_hash() or
 * vfs_mknod() report.
 */
728 static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
729 {
730         struct sock *sk = sock->sk;
731         struct unix_sock *u = unix_sk(sk);
732         struct sockaddr_un *sunaddr=(struct sockaddr_un *)uaddr;
733         struct dentry * dentry = NULL;
734         struct nameidata nd;
735         int err;
736         unsigned hash;
737         struct unix_address *addr;
738         struct hlist_head *list;
739
740         err = -EINVAL;
741         if (sunaddr->sun_family != AF_UNIX)
742                 goto out;
743
            /* Family only, no name: let the kernel pick one. */
744         if (addr_len==sizeof(short)) {
745                 err = unix_autobind(sock);
746                 goto out;
747         }
748
            /* Validate the name; on success err is the normalised length. */
749         err = unix_mkname(sunaddr, addr_len, &hash);
750         if (err < 0)
751                 goto out;
752         addr_len = err;
753
            /* readsem is held for the rest of the bind (released at out_up). */
754         down(&u->readsem);
755
            /* A socket can only be bound once. */
756         err = -EINVAL;
757         if (u->addr)
758                 goto out_up;
759
760         err = -ENOMEM;
761         addr = kmalloc(sizeof(*addr)+addr_len, GFP_KERNEL);
762         if (!addr)
763                 goto out_up;
764
765         memcpy(addr->name, sunaddr, addr_len);
766         addr->len = addr_len;
            /* Bucket for abstract names; overwritten below for filesystem names. */
767         addr->hash = hash ^ sk->sk_type;
768         atomic_set(&addr->refcnt, 1);
769
770         if (sunaddr->sun_path[0]) {
771                 unsigned int mode;
772                 err = 0;
773                 /*
774                  * Get the parent directory, calculate the hash for last
775                  * component.
776                  */
777                 err = path_lookup(sunaddr->sun_path, LOOKUP_PARENT, &nd);
778                 if (err)
779                         goto out_mknod_parent;
780                 /*
781                  * Yucky last component or no last component at all?
782                  * (foo/., foo/.., /////)
783                  */
784                 err = -EEXIST;
785                 if (nd.last_type != LAST_NORM)
786                         goto out_mknod;
787                 /*
788                  * Lock the directory.
789                  */
790                 down(&nd.dentry->d_inode->i_sem);
791                 /*
792                  * Do the final lookup.
793                  */
794                 dentry = lookup_hash(&nd.last, nd.dentry);
795                 err = PTR_ERR(dentry);
796                 if (IS_ERR(dentry))
797                         goto out_mknod_unlock;
798                 err = -ENOENT;
799                 /*
800                  * Special case - lookup gave negative, but... we had foo/bar/
801                  * From the vfs_mknod() POV we just have a negative dentry -
802                  * all is fine. Let's be bastards - you had / on the end, you've
803                  * been asking for (non-existent) directory. -ENOENT for you.
804                  */
805                 if (nd.last.name[nd.last.len] && !dentry->d_inode)
806                         goto out_mknod_dput;
807                 /*
808                  * All right, let's create it.
809                  */
810                 mode = S_IFSOCK |
811                        (SOCK_INODE(sock)->i_mode & ~current->fs->umask);
812                 err = vfs_mknod(nd.dentry->d_inode, dentry, mode, 0);
813                 if (err)
814                         goto out_mknod_dput;
                    /*
                     * Node created: unlock the parent, drop the parent dentry
                     * and keep the new socket dentry in nd instead (it is
                     * stored in u->dentry below, together with nd.mnt).
                     */
815                 up(&nd.dentry->d_inode->i_sem);
816                 dput(nd.dentry);
817                 nd.dentry = dentry;
818
                    /* Marks the address as a filesystem (non-abstract) binding. */
819                 addr->hash = UNIX_HASH_SIZE;
820         }
821
822         write_lock(&unix_table_lock);
823
824         if (!sunaddr->sun_path[0]) {
                    /* Abstract name: must not already be present in the table. */
825                 err = -EADDRINUSE;
826                 if (__unix_find_socket_byname(sunaddr, addr_len,
827                                               sk->sk_type, hash)) {
828                         unix_release_addr(addr);
829                         goto out_unlock;
830                 }
831
832                 list = &unix_socket_table[addr->hash];
833         } else {
                    /* Filesystem name: bucket chosen by inode number, as unix_find_socket_byinode() expects. */
834                 list = &unix_socket_table[dentry->d_inode->i_ino & (UNIX_HASH_SIZE-1)];
835                 u->dentry = nd.dentry;
836                 u->mnt    = nd.mnt;
837         }
838
            /* Publish the address and rehash the socket under the table lock. */
839         err = 0;
840         __unix_remove_socket(sk);
841         u->addr = addr;
842         __unix_insert_socket(list, sk);
843
844 out_unlock:
845         write_unlock(&unix_table_lock);
846 out_up:
847         up(&u->readsem);
848 out:
849         return err;
850
            /* Unwind for the filesystem path, from the innermost failure outward. */
851 out_mknod_dput:
852         dput(dentry);
853 out_mknod_unlock:
854         up(&nd.dentry->d_inode->i_sem);
855 out_mknod:
856         path_release(&nd);
857 out_mknod_parent:
            /* An existing filesystem object means the address is taken. */
858         if (err==-EEXIST)
859                 err=-EADDRINUSE;
860         unix_release_addr(addr);
861         goto out_up;
862 }
863
/*
 * Connect a datagram socket to a default peer, or break an existing
 * association when the supplied address family is AF_UNSPEC.
 *
 * For any other family the address is run through unix_mkname(), the socket
 * is autobound first if SOCK_PASS_CRED is set and it has no address yet, and
 * the target is looked up with unix_find_other().  The new peer is installed
 * only if both unix_may_send() and security_unix_may_send() allow it.
 *
 * Returns 0 on success or a negative errno.
 */
864 static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
865                               int alen, int flags)
866 {
867         struct sock *sk = sock->sk;
868         struct sockaddr_un *sunaddr=(struct sockaddr_un*)addr;
869         struct sock *other;
870         unsigned hash;
871         int err;
872
873         if (addr->sa_family != AF_UNSPEC) {
874                 err = unix_mkname(sunaddr, alen, &hash);
875                 if (err < 0)
876                         goto out;
                    /* a non-negative result replaces the caller-supplied length */
877                 alen = err;
878
879                 if (test_bit(SOCK_PASS_CRED, &sock->flags) && !unix_sk(sk)->addr &&
880                     (err = unix_autobind(sock)) != 0)
881                         goto out;
882
                    /* NOTE(review): presumably unix_find_other() fills in err when it returns NULL - confirm */
883                 other=unix_find_other(sunaddr, alen, sock->type, hash, &err);
884                 if (!other)
885                         goto out;
886
887                 unix_state_wlock(sk);
888
889                 err = -EPERM;
890                 if (!unix_may_send(sk, other))
891                         goto out_unlock;
892
893                 err = security_unix_may_send(sk->sk_socket, other->sk_socket);
894                 if (err)
895                         goto out_unlock;
896
897         } else {
898                 /*
899                  *      1003.1g breaking connected state with AF_UNSPEC
900                  */
901                 other = NULL;
902                 unix_state_wlock(sk);
903         }
904
905         /*
906          * If it was connected, reconnect.
907          */
            /*
             * Both branches store other (possibly NULL) as the new peer while
             * the state write lock is held.  There is no sock_put(other) on
             * this path, so the reference obtained by the lookup above stays
             * with unix_peer(sk).
             */
908         if (unix_peer(sk)) {
909                 struct sock *old_peer = unix_peer(sk);
910                 unix_peer(sk)=other;
911                 unix_state_wunlock(sk);
912
                    /* reconnecting to the same peer is not treated as a disconnect */
913                 if (other != old_peer)
914                         unix_dgram_disconnected(sk, old_peer);
                    /* release the reference that unix_peer(sk) was holding on the old peer */
915                 sock_put(old_peer);
916         } else {
917                 unix_peer(sk)=other;
918                 unix_state_wunlock(sk);
919         }
920         return 0;
921
    /* only reached with a looked-up peer that was not installed: unlock and drop it */
922 out_unlock:
923         unix_state_wunlock(sk);
924         sock_put(other);
925 out:
926         return err;
927 }
928
/*
 * Wait, for at most timeo, on the peer_wait queue of other.
 *
 * The function does not take the state read lock on other but releases it
 * with unix_state_runlock(); the callers visible in this file enter with
 * that lock held.  The task actually sleeps only if other is not
 * SOCK_DEAD, has not shut down its receive side and its receive queue is
 * longer than sk_max_ack_backlog.
 *
 * Returns the value handed back by schedule_timeout() when it slept, or
 * timeo unchanged when it did not.
 */
929 static long unix_wait_for_peer(struct sock *other, long timeo)
930 {
931         struct unix_sock *u = unix_sk(other);
932         int sched;
933         DEFINE_WAIT(wait);
934
            /* get onto the wait queue before the condition is sampled below */
935         prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE);
936
937         sched = !sock_flag(other, SOCK_DEAD) &&
938                 !(other->sk_shutdown & RCV_SHUTDOWN) &&
939                 (skb_queue_len(&other->sk_receive_queue) >
940                  other->sk_max_ack_backlog);
941
            /* the condition was evaluated under the lock; drop it before sleeping */
942         unix_state_runlock(other);
943
944         if (sched)
945                 timeo = schedule_timeout(timeo);
946
947         finish_wait(&u->peer_wait, &wait);
948         return timeo;
949 }
950
/*
 * Connect a stream-type socket to the listening socket named by uaddr.
 *
 * A new sock (newsk) and a one-byte skb charged to it are allocated before
 * any lock is taken.  On success newsk becomes the peer of sk and the skb
 * is appended to the listener's receive queue; unix_accept() in this file
 * later takes the new sock from skb->sk (presumably set by sock_wmalloc()).
 * If the listener's queue is longer than its sk_max_ack_backlog the caller
 * sleeps in unix_wait_for_peer(), or fails with -EAGAIN when the send
 * timeout is zero, and the lookup is restarted.
 *
 * Returns 0 on success or a negative errno.  On failure the skb, newsk and
 * any reference still held on the listener are released.
 */
951 static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
952                                int addr_len, int flags)
953 {
954         struct sockaddr_un *sunaddr=(struct sockaddr_un *)uaddr;
955         struct sock *sk = sock->sk;
956         struct unix_sock *u = unix_sk(sk), *newu, *otheru;
957         struct sock *newsk = NULL;
958         struct sock *other = NULL;
959         struct sk_buff *skb = NULL;
960         unsigned hash;
961         int st;
962         int err;
963         long timeo;
964
965         err = unix_mkname(sunaddr, addr_len, &hash);
966         if (err < 0)
967                 goto out;
            /* a non-negative result replaces the caller-supplied length */
968         addr_len = err;
969
970         if (test_bit(SOCK_PASS_CRED, &sock->flags)
971                 && !u->addr && (err = unix_autobind(sock)) != 0)
972                 goto out;
973
974         timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);
975
976         /* First of all allocate resources.
977            If we will make it after state is locked,
978            we will have to recheck all again in any case.
979          */
980
981         err = -ENOMEM;
982
983         /* create new sock for complete connection */
984         newsk = unix_create1(NULL);
985         if (newsk == NULL)
986                 goto out;
987
988         /* Allocate skb for sending to listening sock */
989         skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
990         if (skb == NULL)
991                 goto out;
992
993 restart:
994         /*  Find listening sock. */
995         other = unix_find_other(sunaddr, addr_len, sk->sk_type, hash, &err);
996         if (!other)
997                 goto out;
998
999         /* Latch state of peer */
1000         unix_state_rlock(other);
1001
1002         /* Apparently VFS overslept socket death. Retry. */
1003         if (sock_flag(other, SOCK_DEAD)) {
1004                 unix_state_runlock(other);
1005                 sock_put(other);
1006                 goto restart;
1007         }
1008
1009         err = -ECONNREFUSED;
1010         if (other->sk_state != TCP_LISTEN)
1011                 goto out_unlock;
1012
1013         if (skb_queue_len(&other->sk_receive_queue) >
1014             other->sk_max_ack_backlog) {
1015                 err = -EAGAIN;
1016                 if (!timeo)
1017                         goto out_unlock;
1018
                     /* returns with the read lock on other already released */
1019                 timeo = unix_wait_for_peer(other, timeo);
1020
1021                 err = sock_intr_errno(timeo);
                     /* the reference on other is still held here; the out label drops it */
1022                 if (signal_pending(current))
1023                         goto out;
1024                 sock_put(other);
1025                 goto restart;
1026         }
1027
1028         /* Latch our state.
1029
1030            It is tricky place. We need to grab write lock and cannot
1031            drop lock on peer. It is dangerous because deadlock is
1032            possible. Connect to self case and simultaneous
1033            attempt to connect are eliminated by checking socket
1034            state. other is TCP_LISTEN, if sk is TCP_LISTEN we
1035            check this before attempt to grab lock.
1036
1037            Well, and we have to recheck the state after socket locked.
1038          */
1039         st = sk->sk_state;
1040
1041         switch (st) {
1042         case TCP_CLOSE:
1043                 /* This is ok... continue with connect */
1044                 break;
1045         case TCP_ESTABLISHED:
1046                 /* Socket is already connected */
1047                 err = -EISCONN;
1048                 goto out_unlock;
1049         default:
1050                 err = -EINVAL;
1051                 goto out_unlock;
1052         }
1053
1054         unix_state_wlock(sk);
1055
             /* our state changed between the unlocked sample and the lock: start over */
1056         if (sk->sk_state != st) {
1057                 unix_state_wunlock(sk);
1058                 unix_state_runlock(other);
1059                 sock_put(other);
1060                 goto restart;
1061         }
1062
1063         err = security_unix_stream_connect(sock, other->sk_socket, newsk);
1064         if (err) {
1065                 unix_state_wunlock(sk);
1066                 goto out_unlock;
1067         }
1068
1069         /* The way is open! Fastly set all the necessary fields... */
1070
             /* newsk's peer pointer keeps a reference on sk */
1071         sock_hold(sk);
1072         unix_peer(newsk)        = sk;
1073         newsk->sk_state         = TCP_ESTABLISHED;
1074         newsk->sk_type          = sk->sk_type;
             /* the accepting side will see the connecting task's credentials */
1075         newsk->sk_peercred.pid  = current->tgid;
1076         newsk->sk_peercred.uid  = current->euid;
1077         newsk->sk_peercred.gid  = current->egid;
1078         newu = unix_sk(newsk);
1079         newsk->sk_sleep         = &newu->peer_wait;
1080         otheru = unix_sk(other);
1081
1082         /* copy address information from listening to new sock*/
1083         if (otheru->addr) {
1084                 atomic_inc(&otheru->addr->refcnt);
1085                 newu->addr = otheru->addr;
1086         }
1087         if (otheru->dentry) {
1088                 newu->dentry    = dget(otheru->dentry);
1089                 newu->mnt       = mntget(otheru->mnt);
1090         }
1091
1092         /* Set credentials */
1093         sk->sk_peercred = other->sk_peercred;
1094
             /* sk's peer pointer keeps a reference on newsk */
1095         sock_hold(newsk);
1096         unix_peer(sk)   = newsk;
1097         sock->state     = SS_CONNECTED;
1098         sk->sk_state    = TCP_ESTABLISHED;
1099
1100         unix_state_wunlock(sk);
1101
1102         /* take ten and send info to listening sock */
1103         spin_lock(&other->sk_receive_queue.lock);
1104         __skb_queue_tail(&other->sk_receive_queue, skb);
1105         /* Undo artificially decreased inflight after embryo
1106          * is installed to listening socket. */
1107         atomic_inc(&newu->inflight);
1108         spin_unlock(&other->sk_receive_queue.lock);
1109         unix_state_runlock(other);
1110         other->sk_data_ready(other, 0);
1111         sock_put(other);
1112         return 0;
1113
     /* out_unlock: the read lock on other is still held; out: it is not */
1114 out_unlock:
1115         if (other)
1116                 unix_state_runlock(other);
1117
1118 out:
1119         if (skb)
1120                 kfree_skb(skb);
1121         if (newsk)
1122                 unix_release_sock(newsk, 0);
1123         if (other)
1124                 sock_put(other);
1125         return err;
1126 }
1127
/*
 * Wire two sockets together as each other's peer.
 *
 * Each sock takes a reference on the other, both record the calling task's
 * tgid/euid/egid as peer credentials, and for every socket type except
 * SOCK_DGRAM both ends are moved straight to the connected state.
 * Always returns 0.
 */
1128 static int unix_socketpair(struct socket *socka, struct socket *sockb)
1129 {
1130         struct sock *first = socka->sk, *second = sockb->sk;
1131
1132         /* cross-link the two ends, one reference per peer pointer */
1133         sock_hold(first);
1134         sock_hold(second);
1135         unix_peer(first) = second;
1136         unix_peer(second) = first;
1137         first->sk_peercred.pid = second->sk_peercred.pid = current->tgid;
1138         first->sk_peercred.uid = second->sk_peercred.uid = current->euid;
1139         first->sk_peercred.gid = second->sk_peercred.gid = current->egid;
1140
1141         if (first->sk_type == SOCK_DGRAM)
1142                 return 0;
1143         first->sk_state = TCP_ESTABLISHED;
1144         second->sk_state = TCP_ESTABLISHED;
1145         socka->state = SS_CONNECTED;
1146         sockb->state = SS_CONNECTED;
1147         return 0;
1148 }
1149
/*
 * Accept one pending connection on a listening socket.
 *
 * Takes the next skb from the listener's receive queue, uses the sock it
 * refers to (skb->sk) as the accepted connection and grafts that sock onto
 * newsock.
 *
 * Returns 0 on success, -EOPNOTSUPP for socket types other than
 * SOCK_STREAM/SOCK_SEQPACKET, -EINVAL if the socket is not listening or no
 * skb was returned without an error code, otherwise the error reported by
 * skb_recv_datagram().
 */
1150 static int unix_accept(struct socket *sock, struct socket *newsock, int flags)
1151 {
1152         struct sock *listener = sock->sk;
1153         struct sock *accepted;
1154         struct sk_buff *req;
1155         int err = -EINVAL;
1156
1157         /* only the connection-oriented socket types can accept */
1158         if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
1159                 return -EOPNOTSUPP;
1160
1161         /*
1162          * Once a socket is in TCP_LISTEN its state cannot change (for
1163          * now...), so this test needs no lock.
1164          */
1165         if (listener->sk_state != TCP_LISTEN)
1166                 return err;
1167
1168         /* each pending connection is represented by one queued skb */
1169         req = skb_recv_datagram(listener, 0, flags & O_NONBLOCK, &err);
1170         if (req == NULL) {
1171                 /* no skb and no error code means receive shutdown */
1172                 if (!err)
1173                         err = -EINVAL;
1174                 return err;
1175         }
1176
1177         /* only skb->sk is needed from the skb, release it straight away */
1178         accepted = req->sk;
1179         skb_free_datagram(listener, req);
1180         /* wake anybody sleeping on the listener's peer_wait queue */
1181         wake_up_interruptible(&unix_sk(listener)->peer_wait);
1182
1183         /* attach the accepted sock to the socket supplied by the caller */
1184         unix_state_wlock(accepted);
1185         newsock->state = SS_CONNECTED;
1186         sock_graft(accepted, newsock);
1187         unix_state_wunlock(accepted);
1188
1189         return 0;
1190 }
1191
1192
/*
 * Report the address of this socket, or of its peer when peer is non-zero.
 *
 * The stored name and length are copied out verbatim when the reported
 * sock has an address; otherwise the result is AF_UNIX with an empty path
 * and a length of sizeof(short).
 *
 * Returns 0 on success or -ENOTCONN when the peer was requested but
 * unix_peer_get() returned none.
 */
1193 static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int *uaddr_len, int peer)
1194 {
1195         struct sockaddr_un *out = (struct sockaddr_un *)uaddr;
1196         struct sock *target = sock->sk;
1197         struct unix_sock *u;
1198
1199         /* pick the sock to report on; it is released with sock_put() below */
1200         if (!peer) {
1201                 sock_hold(target);
1202         } else {
1203                 target = unix_peer_get(target);
1204                 if (target == NULL)
1205                         return -ENOTCONN;
1206         }
1207
1208         u = unix_sk(target);
1209         unix_state_rlock(target);
1210         if (u->addr) {
1211                 struct unix_address *bound = u->addr;
1212
1213                 /* bound socket: hand back the stored name verbatim */
1214                 *uaddr_len = bound->len;
1215                 memcpy(out, bound->name, *uaddr_len);
1216         } else {
1217                 /* unbound socket: family plus an empty path */
1218                 out->sun_family = AF_UNIX;
1219                 out->sun_path[0] = 0;
1220                 *uaddr_len = sizeof(short);
1221         }
1222         unix_state_runlock(target);
1223
1224         /* pairs with sock_hold() (or, presumably, unix_peer_get()) above */
1225         sock_put(target);
1226         return 0;
1227 }
1228
/*
 * Move the passed-descriptor list from the skb's control block into scm.
 *
 * The skb destructor is switched to sock_wfree and unix_notinflight() is
 * called for every entry of the list, last entry first.  The skb must carry
 * a list: the count is read without a NULL check.
 */
1229 static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb)
1230 {
1231         int idx;
1232
1233         /* the list now belongs to scm; the skb no longer refers to it */
1234         scm->fp = UNIXCB(skb).fp;
1235         UNIXCB(skb).fp = NULL;
1236         skb->destructor = sock_wfree;
1237         for (idx = scm->fp->count; idx-- > 0; )
1238                 unix_notinflight(scm->fp->fp[idx]);
1239 }
1240
/*
 * skb destructor installed by unix_attach_fds(): detaches the descriptor
 * list into a zeroed scratch cookie, destroys that cookie and then runs
 * sock_wfree() on the skb.
 */
1241 static void unix_destruct_fds(struct sk_buff *skb)
1242 {
1243         struct scm_cookie cookie;
1244
1245         /* pull the descriptors into a scratch cookie so they can be dropped */
1246         memset(&cookie, 0, sizeof(cookie));
1247         unix_detach_fds(&cookie, skb);
1248         /* this ends up calling into the VFS; fput() is noted to be SMP-safe */
1249         scm_destroy(&cookie);
1250         sock_wfree(skb);
1251 }
1252
/*
 * Hand the descriptor list held in scm over to the skb.
 *
 * unix_inflight() is called for every entry, last entry first, the list is
 * stored in the skb's control block, unix_destruct_fds becomes the skb
 * destructor and scm->fp is cleared.  scm->fp must be non-NULL on entry.
 */
1253 static void unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
1254 {
1255         int idx = scm->fp->count;
1256         while (idx-- > 0)
1257                 unix_inflight(scm->fp->fp[idx]);
1258         skb->destructor = unix_destruct_fds;
1259         UNIXCB(skb).fp = scm->fp;
1260         scm->fp = NULL;
1261 }
1262
1263 /*
1264  *      Send AF_UNIX data.
1265  */
1266
/*
 * Send one datagram, either to the address in msg->msg_name (when
 * msg->msg_namelen is non-zero) or to the connected peer.
 *
 * The whole payload is copied into a single skb before the receiver is
 * looked up and locked.  The skb is then appended to the receiver's
 * receive queue, after waiting in unix_wait_for_peer() if that queue is
 * over its backlog limit and the receiver is not connected back to us.
 *
 * Returns len on success or a negative errno.  The scm cookie is destroyed
 * on every path that is taken after scm_send() succeeded.
 */
1267 static int unix_dgram_sendmsg(struct kiocb *kiocb, struct socket *sock,
1268                               struct msghdr *msg, size_t len)
1269 {
1270         struct sock_iocb *siocb = kiocb_to_siocb(kiocb);
1271         struct sock *sk = sock->sk;
1272         struct unix_sock *u = unix_sk(sk);
1273         struct sockaddr_un *sunaddr=msg->msg_name;
1274         struct sock *other = NULL;
1275         int namelen = 0; /* fake GCC */
1276         int err;
1277         unsigned hash;
1278         struct sk_buff *skb;
1279         long timeo;
1280         struct scm_cookie tmp_scm;
1281
             /* fall back to an on-stack cookie when the iocb did not bring one */
1282         if (NULL == siocb->scm)
1283                 siocb->scm = &tmp_scm;
1284         err = scm_send(sock, msg, siocb->scm);
1285         if (err < 0)
1286                 return err;
1287
1288         err = -EOPNOTSUPP;
1289         if (msg->msg_flags&MSG_OOB)
1290                 goto out;
1291
1292         if (msg->msg_namelen) {
1293                 err = unix_mkname(sunaddr, msg->msg_namelen, &hash);
1294                 if (err < 0)
1295                         goto out;
1296                 namelen = err;
1297         } else {
                     /* no explicit destination: a connected peer is required */
1298                 sunaddr = NULL;
1299                 err = -ENOTCONN;
1300                 other = unix_peer_get(sk);
1301                 if (!other)
1302                         goto out;
1303         }
1304
1305         if (test_bit(SOCK_PASS_CRED, &sock->flags)
1306                 && !u->addr && (err = unix_autobind(sock)) != 0)
1307                 goto out;
1308
1309         err = -EMSGSIZE;
1310         if (len > sk->sk_sndbuf - 32)
1311                 goto out;
1312
1313         skb = sock_alloc_send_skb(sk, len, msg->msg_flags&MSG_DONTWAIT, &err);
1314         if (skb==NULL)
1315                 goto out;
1316
             /* stamp the sender's credentials and any passed descriptors on the skb */
1317         memcpy(UNIXCREDS(skb), &siocb->scm->creds, sizeof(struct ucred));
1318         if (siocb->scm->fp)
1319                 unix_attach_fds(siocb->scm, skb);
1320
1321         skb->h.raw = skb->data;
1322         err = memcpy_fromiovec(skb_put(skb,len), msg->msg_iov, len);
1323         if (err)
1324                 goto out_free;
1325
1326         timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
1327
1328 restart:
             /* other is NULL on the first pass with an explicit address, or after a dead receiver */
1329         if (!other) {
1330                 err = -ECONNRESET;
1331                 if (sunaddr == NULL)
1332                         goto out_free;
1333
1334                 other = unix_find_other(sunaddr, namelen, sk->sk_type,
1335                                         hash, &err);
1336                 if (other==NULL)
1337                         goto out_free;
1338         }
1339
1340         unix_state_rlock(other);
1341         err = -EPERM;
1342         if (!unix_may_send(sk, other))
1343                 goto out_unlock;
1344
1345         if (sock_flag(other, SOCK_DEAD)) {
1346                 /*
1347                  *      Check with 1003.1g - what should
1348                  *      datagram error
1349                  */
                     /* drop the reference this call was holding on the dead receiver */
1350                 unix_state_runlock(other);
1351                 sock_put(other);
1352
1353                 err = 0;
1354                 unix_state_wlock(sk);
1355                 if (unix_peer(sk) == other) {
                             /*
                              * The dead sock was our connected peer: disconnect
                              * and fail with -ECONNREFUSED.  The second
                              * sock_put() presumably releases the reference
                              * that unix_peer(sk) was holding.
                              */
1356                         unix_peer(sk)=NULL;
1357                         unix_state_wunlock(sk);
1358
1359                         unix_dgram_disconnected(sk, other);
1360                         sock_put(other);
1361                         err = -ECONNREFUSED;
1362                 } else {
1363                         unix_state_wunlock(sk);
1364                 }
1365
                     /* other is cleared so the out path does not sock_put() it again */
1366                 other = NULL;
1367                 if (err)
1368                         goto out_free;
1369                 goto restart;
1370         }
1371
1372         err = -EPIPE;
1373         if (other->sk_shutdown & RCV_SHUTDOWN)
1374                 goto out_unlock;
1375
1376         err = security_unix_may_send(sk->sk_socket, other->sk_socket);
1377         if (err)
1378                 goto out_unlock;
1379
             /* the backlog limit is not applied when the receiver is connected back to us */
1380         if (unix_peer(other) != sk &&
1381             (skb_queue_len(&other->sk_receive_queue) >
1382              other->sk_max_ack_backlog)) {
1383                 if (!timeo) {
1384                         err = -EAGAIN;
1385                         goto out_unlock;
1386                 }
1387
                     /* returns with the read lock on other released; the reference is kept */
1388                 timeo = unix_wait_for_peer(other, timeo);
1389
1390                 err = sock_intr_errno(timeo);
1391                 if (signal_pending(current))
1392                         goto out_free;
1393
1394                 goto restart;
1395         }
1396
1397         skb_queue_tail(&other->sk_receive_queue, skb);
1398         unix_state_runlock(other);
1399         other->sk_data_ready(other, len);
1400         sock_put(other);
1401         scm_destroy(siocb->scm);
1402         return len;
1403
1404 out_unlock:
1405         unix_state_runlock(other);
1406 out_free:
1407         kfree_skb(skb);
1408 out:
1409         if (other)
1410                 sock_put(other);
1411         scm_destroy(siocb->scm);
1412         return err;
1413 }
1414
1415                 
/*
 * Send data on a connected stream-type socket.
 *
 * An explicit destination address is rejected (-EISCONN when the socket is
 * established, -EOPNOTSUPP otherwise).  The payload is cut into skbs of at
 * most sk_sndbuf / 2 - 64 bytes, capped at SKB_MAX_ALLOC and at the tailroom
 * of the skb actually obtained, and each skb is appended to the peer's
 * receive queue in turn.
 *
 * Returns the number of bytes queued if that is non-zero, otherwise a
 * negative errno.  When the pipe is broken and nothing has been sent,
 * SIGPIPE is raised unless MSG_NOSIGNAL was given.
 */
1416 static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock,
1417                                struct msghdr *msg, size_t len)
1418 {
1419         struct sock_iocb *siocb = kiocb_to_siocb(kiocb);
1420         struct sock *sk = sock->sk;
1421         struct sock *other = NULL;
1422         struct sockaddr_un *sunaddr=msg->msg_name;
1423         int err,size;
1424         struct sk_buff *skb;
1425         int sent=0;
1426         struct scm_cookie tmp_scm;
1427
             /* fall back to an on-stack cookie when the iocb did not bring one */
1428         if (NULL == siocb->scm)
1429                 siocb->scm = &tmp_scm;
1430         err = scm_send(sock, msg, siocb->scm);
1431         if (err < 0)
1432                 return err;
1433
1434         err = -EOPNOTSUPP;
1435         if (msg->msg_flags&MSG_OOB)
1436                 goto out_err;
1437
1438         if (msg->msg_namelen) {
1439                 err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
1440                 goto out_err;
1441         } else {
1442                 sunaddr = NULL;
1443                 err = -ENOTCONN;
1444                 other = unix_peer_get(sk);
1445                 if (!other)
1446                         goto out_err;
1447         }
1448
1449         if (sk->sk_shutdown & SEND_SHUTDOWN)
1450                 goto pipe_err;
1451
1452         while(sent < len)
1453         {
1454                 /*
1455                  *      Optimisation for the fact that under 0.01% of X messages typically
1456                  *      need breaking up.
1457                  */
1458
1459                 size=len-sent;
1460
1461                 /* Keep two messages in the pipe so it schedules better */
1462                 if (size > sk->sk_sndbuf / 2 - 64)
1463                         size = sk->sk_sndbuf / 2 - 64;
1464
1465                 if (size > SKB_MAX_ALLOC)
1466                         size = SKB_MAX_ALLOC;
1467
1468                 /*
1469                  *      Grab a buffer
1470                  */
1471
1472                 skb=sock_alloc_send_skb(sk,size,msg->msg_flags&MSG_DONTWAIT, &err);
1473
1474                 if (skb==NULL)
1475                         goto out_err;
1476
1477                 /*
1478                  *      If you pass two values to the sock_alloc_send_skb
1479                  *      it tries to grab the large buffer with GFP_NOFS
1480                  *      (which can fail easily), and if it fails grab the
1481                  *      fallback size buffer which is under a page and will
1482                  *      succeed. [Alan]
1483                  */
1484                 size = min_t(int, size, skb_tailroom(skb));
1485
1486                 memcpy(UNIXCREDS(skb), &siocb->scm->creds, sizeof(struct ucred));
                     /* unix_attach_fds() clears scm->fp, so only one skb carries the descriptors */
1487                 if (siocb->scm->fp)
1488                         unix_attach_fds(siocb->scm, skb);
1489
1490                 if ((err = memcpy_fromiovec(skb_put(skb,size), msg->msg_iov, size)) != 0) {
1491                         kfree_skb(skb);
1492                         goto out_err;
1493                 }
1494
1495                 unix_state_rlock(other);
1496
1497                 if (sock_flag(other, SOCK_DEAD) ||
1498                     (other->sk_shutdown & RCV_SHUTDOWN))
1499                         goto pipe_err_free;
1500
1501                 skb_queue_tail(&other->sk_receive_queue, skb);
1502                 unix_state_runlock(other);
1503                 other->sk_data_ready(other, size);
1504                 sent+=size;
1505         }
1506         sock_put(other);
1507
1508         scm_destroy(siocb->scm);
1509         siocb->scm = NULL;
1510
1511         return sent;
1512
     /* pipe_err_free: the read lock on other is held and the current skb is not queued yet */
1513 pipe_err_free:
1514         unix_state_runlock(other);
1515         kfree_skb(skb);
1516 pipe_err:
1517         if (sent==0 && !(msg->msg_flags&MSG_NOSIGNAL))
1518                 send_sig(SIGPIPE,current,0);
1519         err = -EPIPE;
1520 out_err:
1521         if (other)
1522                 sock_put(other);
1523         scm_destroy(siocb->scm);
1524         siocb->scm = NULL;
             /* a partial send is reported as a byte count, not as the error */
1525         return sent ? : err;
1526 }
1527
/*
 * Fill msg->msg_name and msg->msg_namelen from the address of sk.
 * A sock without an address yields a zero length and leaves msg->msg_name
 * untouched.  msg->msg_name must point to a buffer large enough for the
 * stored name; no size check is made here.
 */
1528 static void unix_copy_addr(struct msghdr *msg, struct sock *sk)
1529 {
1530         struct unix_sock *owner = unix_sk(sk);
1531
1532         /* an unbound sender is reported as a zero-length name */
1533         msg->msg_namelen = owner->addr ? owner->addr->len : 0;
1534
1535         if (owner->addr)
1536                 memcpy(msg->msg_name, owner->addr->name, owner->addr->len);
1537 }
1538
/*
 * Receive one datagram.
 *
 * At most size bytes of the skb returned by skb_recv_datagram() are copied
 * into msg->msg_iov; MSG_TRUNC is set in msg->msg_flags when the datagram
 * was longer than size.  The sender's address is filled in if
 * msg->msg_name is set, and the credentials stored with the skb are copied
 * into the scm cookie.  Passed file descriptors are detached from the skb
 * on a normal read and duplicated on MSG_PEEK.
 *
 * Returns the number of bytes copied or a negative errno.
 */
1539 static int unix_dgram_recvmsg(struct kiocb *iocb, struct socket *sock,
1540                               struct msghdr *msg, size_t size,
1541                               int flags)
1542 {
1543         struct sock_iocb *siocb = kiocb_to_siocb(iocb);
1544         struct scm_cookie tmp_scm;
1545         struct sock *sk = sock->sk;
1546         struct unix_sock *u = unix_sk(sk);
1547         int noblock = flags & MSG_DONTWAIT;
1548         struct sk_buff *skb;
1549         int err;
1550
1551         err = -EOPNOTSUPP;
1552         if (flags&MSG_OOB)
1553                 goto out;
1554
1555         msg->msg_namelen = 0;
1556
1557         skb = skb_recv_datagram(sk, flags, noblock, &err);
1558         if (!skb)
1559                 goto out;
1560
             /* wake anybody sleeping on our peer_wait queue (see unix_wait_for_peer()) */
1561         wake_up_interruptible(&u->peer_wait);
1562
1563         if (msg->msg_name)
1564                 unix_copy_addr(msg, skb->sk);
1565
1566         if (size > skb->len)
1567                 size = skb->len;
1568         else if (size < skb->len)
1569                 msg->msg_flags |= MSG_TRUNC;
1570
1571         err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, size);
1572         if (err)
1573                 goto out_free;
1574
             /* fall back to a zeroed on-stack cookie when the iocb did not bring one */
1575         if (!siocb->scm) {
1576                 siocb->scm = &tmp_scm;
1577                 memset(&tmp_scm, 0, sizeof(tmp_scm));
1578         }
1579         siocb->scm->creds = *UNIXCREDS(skb);
1580
1581         if (!(flags & MSG_PEEK))
1582         {
1583                 if (UNIXCB(skb).fp)
1584                         unix_detach_fds(siocb->scm, skb);
1585         }
1586         else 
1587         {
1588                 /* It is questionable: on PEEK we could:
1589                    - do not return fds - good, but too simple 8)
1590                    - return fds, and do not return them on read (old strategy,
1591                      apparently wrong)
1592                    - clone fds (I chose it for now, it is the most universal
1593                      solution)
1594
1595                    POSIX 1003.1g does not actually define this clearly
1596                    at all. POSIX 1003.1g doesn't define a lot of things
1597                    clearly however!
1598
1599                 */
1600                 if (UNIXCB(skb).fp)
1601                         siocb->scm->fp = scm_fp_dup(UNIXCB(skb).fp);
1602         }
1603         err = size;
1604
1605         scm_recv(sock, msg, siocb->scm, flags);
1606
     /* reached on success as well: the skb is handed to skb_free_datagram() either way */
1607 out_free:
1608         skb_free_datagram(sk,skb);
1609 out:
1610         return err;
1611 }
1612
1613 /*
1614  *      Sleep until data has arrived. But check for races..
1615  */
1616  
1617 static long unix_stream_data_wait(struct sock * sk, long timeo)
1618 {
1619         DEFINE_WAIT(wait);
1620
             /*
              * Takes and releases the state read lock on sk itself.  Returns
              * the timeout value left after the last schedule_timeout() call,
              * or timeo unchanged if the task never slept.
              */
1621         unix_state_rlock(sk);
1622
1623         for (;;) {
                     /* queue ourselves first so a wakeup between the test and the sleep is not lost */
1624                 prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
1625
                     /* stop on queued data, a socket error, receive shutdown, a signal or an expired timeout */
1626                 if (skb_queue_len(&sk->sk_receive_queue) ||
1627                     sk->sk_err ||
1628                     (sk->sk_shutdown & RCV_SHUTDOWN) ||
1629                     signal_pending(current) ||
1630                     !timeo)
1631                         break;
1632
                     /* SOCK_ASYNC_WAITDATA is set only for the duration of the sleep; the lock is dropped across it */
1633                 set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1634                 unix_state_runlock(sk);
1635                 timeo = schedule_timeout(timeo);
1636                 unix_state_rlock(sk);
1637                 clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1638         }
1639
1640         finish_wait(sk->sk_sleep, &wait);
1641         unix_state_runlock(sk);
1642         return timeo;
1643 }
1644
1645
1646
/*
 * Receive data from a connected stream-type socket.
 *
 * skbs are taken off the receive queue one at a time under u->readsem and
 * copied into msg->msg_iov until size bytes have been delivered, or until
 * the queue is empty and the target from sock_rcvlowat() has been met.
 * Data from skbs with different credentials is never merged into one call,
 * and the loop also stops after an skb that carried file descriptors.  A
 * partly consumed skb is put back at the head of the queue.  With MSG_PEEK
 * a single skb is examined and put back.
 *
 * Returns the number of bytes copied if that is non-zero (-EFAULT if the
 * very first copy faulted), otherwise err: a negative errno, or 0 when the
 * loop ended without data and without an error (for example on receive
 * shutdown).
 */
1647 static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock,
1648                                struct msghdr *msg, size_t size,
1649                                int flags)
1650 {
1651         struct sock_iocb *siocb = kiocb_to_siocb(iocb);
1652         struct scm_cookie tmp_scm;
1653         struct sock *sk = sock->sk;
1654         struct unix_sock *u = unix_sk(sk);
1655         struct sockaddr_un *sunaddr=msg->msg_name;
1656         int copied = 0;
1657         int check_creds = 0;
1658         int target;
1659         int err = 0;
1660         long timeo;
1661
1662         err = -EINVAL;
1663         if (sk->sk_state != TCP_ESTABLISHED)
1664                 goto out;
1665
1666         err = -EOPNOTSUPP;
1667         if (flags&MSG_OOB)
1668                 goto out;
1669
1670         target = sock_rcvlowat(sk, flags&MSG_WAITALL, size);
1671         timeo = sock_rcvtimeo(sk, flags&MSG_DONTWAIT);
1672
1673         msg->msg_namelen = 0;
1674
1675         /* Lock the socket to prevent queue disordering
1676          * while sleeps in memcpy_tomsg
1677          */
1678
1679         if (!siocb->scm) {
1680                 siocb->scm = &tmp_scm;
1681                 memset(&tmp_scm, 0, sizeof(tmp_scm));
1682         }
1683
1684         down(&u->readsem);
1685
1686         do
1687         {
1688                 int chunk;
1689                 struct sk_buff *skb;
1690
1691                 skb = skb_dequeue(&sk->sk_receive_queue);
1692                 if (skb==NULL)
1693                 {
                             /* queue is empty: finish if enough was copied, else report or wait */
1694                         if (copied >= target)
1695                                 break;
1696
1697                         /*
1698                          *      POSIX 1003.1g mandates this order.
1699                          */
1700
1701                         if ((err = sock_error(sk)) != 0)
1702                                 break;
1703                         if (sk->sk_shutdown & RCV_SHUTDOWN)
1704                                 break;
1705                         err = -EAGAIN;
1706                         if (!timeo)
1707                                 break;
                             /* readsem is not held while sleeping */
1708                         up(&u->readsem);
1709
1710                         timeo = unix_stream_data_wait(sk, timeo);
1711
                             /* readsem was released above, so this exit skips the up() after the loop (and scm_recv()) */
1712                         if (signal_pending(current)) {
1713                                 err = sock_intr_errno(timeo);
1714                                 goto out;
1715                         }
1716                         down(&u->readsem);
1717                         continue;
1718                 }
1719
1720                 if (check_creds) {
1721                         /* Never glue messages from different writers */
1722                         if (memcmp(UNIXCREDS(skb), &siocb->scm->creds, sizeof(siocb->scm->creds)) != 0) {
1723                                 skb_queue_head(&sk->sk_receive_queue, skb);
1724                                 break;
1725                         }
1726                 } else {
1727                         /* Copy credentials */
1728                         siocb->scm->creds = *UNIXCREDS(skb);
1729                         check_creds = 1;
1730                 }
1731
1732                 /* Copy address just once */
1733                 if (sunaddr)
1734                 {
1735                         unix_copy_addr(msg, skb->sk);
1736                         sunaddr = NULL;
1737                 }
1738
1739                 chunk = min_t(unsigned int, skb->len, size);
1740                 if (memcpy_toiovec(msg->msg_iov, skb->data, chunk)) {
                             /* keep the data queued; the fault is reported only if nothing was copied before */
1741                         skb_queue_head(&sk->sk_receive_queue, skb);
1742                         if (copied == 0)
1743                                 copied = -EFAULT;
1744                         break;
1745                 }
1746                 copied += chunk;
1747                 size -= chunk;
1748
1749                 /* Mark read part of skb as used */
1750                 if (!(flags & MSG_PEEK))
1751                 {
1752                         skb_pull(skb, chunk);
1753
1754                         if (UNIXCB(skb).fp)
1755                                 unix_detach_fds(siocb->scm, skb);
1756
1757                         /* put the skb back if we didn't use it up.. */
1758                         if (skb->len)
1759                         {
1760                                 skb_queue_head(&sk->sk_receive_queue, skb);
1761                                 break;
1762                         }
1763
1764                         kfree_skb(skb);
1765
                             /* stop after an skb that carried descriptors */
1766                         if (siocb->scm->fp)
1767                                 break;
1768                 }
1769                 else
1770                 {
1771                         /* It is questionable, see note in unix_dgram_recvmsg.
1772                          */
1773                         if (UNIXCB(skb).fp)
1774                                 siocb->scm->fp = scm_fp_dup(UNIXCB(skb).fp);
1775
1776                         /* put message back and return */
1777                         skb_queue_head(&sk->sk_receive_queue, skb);
1778                         break;
1779                 }
1780         } while (size);
1781
1782         up(&u->readsem);
1783         scm_recv(sock, msg, siocb->scm, flags);
1784 out:
             /* GNU "?:" shorthand: copied when it is non-zero, err otherwise */
1785         return copied ? : err;
1786 }
1787
1788 static int unix_shutdown(struct socket *sock, int mode)
1789 {
1790         struct sock *sk = sock->sk;
1791         struct sock *other;
1792
1793         mode = (mode+1)&(RCV_SHUTDOWN|SEND_SHUTDOWN);
1794
1795         if (mode) {
1796                 unix_state_wlock(sk);
1797                 sk->sk_shutdown |= mode;
1798                 other=unix_peer(sk);
1799                 if (other)
1800                         sock_hold(other);
1801                 unix_state_wunlock(sk);
1802                 sk->sk_state_change(sk);
1803
1804                 if (other &&
1805                         (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) {
1806
1807                         int peer_mode = 0;
1808
1809                         if (mode&RCV_SHUTDOWN)
1810                                 peer_mode |= SEND_SHUTDOWN;
1811                         if (mode&SEND_SHUTDOWN)
1812                                 peer_mode |= RCV_SHUTDOWN;
1813                         unix_state_wlock(other);
1814                         other->sk_shutdown |= peer_mode;
1815                         unix_state_wunlock(other);
1816                         other->sk_state_change(other);
1817                         read_lock(&other->sk_callback_lock);
1818                         if (peer_mode == SHUTDOWN_MASK)
1819                                 sk_wake_async(other,1,POLL_HUP);
1820                         else if (peer_mode & RCV_SHUTDOWN)
1821                                 sk_wake_async(other,1,POLL_IN);
1822                         read_unlock(&other->sk_callback_lock);
1823                 }
1824                 if (other)
1825                         sock_put(other);
1826         }
1827         return 0;
1828 }
1829
1830 static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
1831 {
1832         struct sock *sk = sock->sk;
1833         long amount=0;
1834         int err;
1835
1836         switch(cmd)
1837         {
1838                 case SIOCOUTQ:
1839                         amount = atomic_read(&sk->sk_wmem_alloc);
1840                         err = put_user(amount, (int *)arg);
1841                         break;
1842                 case SIOCINQ:
1843                 {
1844                         struct sk_buff *skb;
1845                         if (sk->sk_state == TCP_LISTEN) {
1846                                 err = -EINVAL;
1847                                 break;
1848                         }
1849
1850                         spin_lock(&sk->sk_receive_queue.lock);
1851                         skb = skb_peek(&sk->sk_receive_queue);
1852                         if (skb)
1853                                 amount=skb->len;
1854                         spin_unlock(&sk->sk_receive_queue.lock);
1855                         err = put_user(amount, (int *)arg);
1856                         break;
1857                 }
1858
1859                 default:
1860                         err = dev_ioctl(cmd, (void *)arg);
1861                         break;
1862         }
1863         return err;
1864 }
1865
1866 static unsigned int unix_poll(struct file * file, struct socket *sock, poll_table *wait)
1867 {
1868         struct sock *sk = sock->sk;
1869         unsigned int mask;
1870
1871         poll_wait(file, sk->sk_sleep, wait);
1872         mask = 0;
1873
1874         /* exceptional events? */
1875         if (sk->sk_err)
1876                 mask |= POLLERR;
1877         if (sk->sk_shutdown == SHUTDOWN_MASK)
1878                 mask |= POLLHUP;
1879
1880         /* readable? */
1881         if (!skb_queue_empty(&sk->sk_receive_queue) ||
1882             (sk->sk_shutdown & RCV_SHUTDOWN))
1883                 mask |= POLLIN | POLLRDNORM;
1884
1885         /* Connection-based need to check for termination and startup */
1886         if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) && sk->sk_state == TCP_CLOSE)
1887                 mask |= POLLHUP;
1888
1889         /*
1890          * we set writable also when the other side has shut down the
1891          * connection. This prevents stuck sockets.
1892          */
1893         if (unix_writable(sk))
1894                 mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
1895
1896         return mask;
1897 }
1898
1899
1900 #ifdef CONFIG_PROC_FS
1901 static struct sock *unix_seq_idx(int *iter, loff_t pos)
1902 {
1903         loff_t off = 0;
1904         struct sock *s;
1905
1906         for (s = first_unix_socket(iter); s; s = next_unix_socket(iter, s)) {
1907                 if (off == pos) 
1908                         return s;
1909                 ++off;
1910         }
1911         return NULL;
1912 }
1913
1914
1915 static void *unix_seq_start(struct seq_file *seq, loff_t *pos)
1916 {
1917         read_lock(&unix_table_lock);
1918         return *pos ? unix_seq_idx(seq->private, *pos - 1) : ((void *) 1);
1919 }
1920
1921 static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1922 {
1923         ++*pos;
1924
1925         if (v == (void *)1) 
1926                 return first_unix_socket(seq->private);
1927         return next_unix_socket(seq->private, v);
1928 }
1929
1930 static void unix_seq_stop(struct seq_file *seq, void *v)
1931 {
1932         read_unlock(&unix_table_lock);
1933 }
1934
1935 static int unix_seq_show(struct seq_file *seq, void *v)
1936 {
1937         
1938         if (v == (void *)1)
1939                 seq_puts(seq, "Num       RefCount Protocol Flags    Type St "
1940                          "Inode Path\n");
1941         else {
1942                 struct sock *s = v;
1943                 struct unix_sock *u = unix_sk(s);
1944                 unix_state_rlock(s);
1945
1946                 seq_printf(seq, "%p: %08X %08X %08X %04X %02X %5lu",
1947                         s,
1948                         atomic_read(&s->sk_refcnt),
1949                         0,
1950                         s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0,
1951                         s->sk_type,
1952                         s->sk_socket ?
1953                         (s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) :
1954                         (s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING),
1955                         sock_i_ino(s));
1956
1957                 if (u->addr) {
1958                         int i, len;
1959                         seq_putc(seq, ' ');
1960
1961                         i = 0;
1962                         len = u->addr->len - sizeof(short);
1963                         if (!UNIX_ABSTRACT(s))
1964                                 len--;
1965                         else {
1966                                 seq_putc(seq, '@');
1967                                 i++;
1968                         }
1969                         for ( ; i < len; i++)
1970                                 seq_putc(seq, u->addr->name->sun_path[i]);
1971                 }
1972                 unix_state_runlock(s);
1973                 seq_putc(seq, '\n');
1974         }
1975
1976         return 0;
1977 }
1978
1979 static struct seq_operations unix_seq_ops = {
1980         .start  = unix_seq_start,
1981         .next   = unix_seq_next,
1982         .stop   = unix_seq_stop,
1983         .show   = unix_seq_show,
1984 };
1985
1986
1987 static int unix_seq_open(struct inode *inode, struct file *file)
1988 {
1989         struct seq_file *seq;
1990         int rc = -ENOMEM;
1991         int *iter = kmalloc(sizeof(int), GFP_KERNEL);
1992
1993         if (!iter)
1994                 goto out;
1995
1996         rc = seq_open(file, &unix_seq_ops);
1997         if (rc)
1998                 goto out_kfree;
1999
2000         seq          = file->private_data;
2001         seq->private = iter;
2002         *iter = 0;
2003 out:
2004         return rc;
2005 out_kfree:
2006         kfree(iter);
2007         goto out;
2008 }
2009
2010 static struct file_operations unix_seq_fops = {
2011         .owner          = THIS_MODULE,
2012         .open           = unix_seq_open,
2013         .read           = seq_read,
2014         .llseek         = seq_lseek,
2015         .release        = seq_release_private,
2016 };
2017
2018 #endif
2019
2020 static struct net_proto_family unix_family_ops = {
2021         .family = PF_UNIX,
2022         .create = unix_create,
2023         .owner  = THIS_MODULE,
2024 };
2025
#ifdef CONFIG_SYSCTL
/* Sysctl hooks; only declared here, not defined in this part of the file. */
void unix_sysctl_register(void);
void unix_sysctl_unregister(void);
#else
/* Without CONFIG_SYSCTL the hooks are empty stubs. */
static inline void unix_sysctl_register(void)
{
}

static inline void unix_sysctl_unregister(void)
{
}
#endif
2033
2034 static int __init af_unix_init(void)
2035 {
2036         struct sk_buff *dummy_skb;
2037
2038         if (sizeof(struct unix_skb_parms) > sizeof(dummy_skb->cb)) {
2039                 printk(KERN_CRIT "%s: panic\n", __FUNCTION__);
2040                 return -1;
2041         }
2042         /* allocate our sock slab cache */
2043         unix_sk_cachep = kmem_cache_create("unix_sock",
2044                                            sizeof(struct unix_sock), 0,
2045                                            SLAB_HWCACHE_ALIGN, 0, 0);
2046         if (!unix_sk_cachep)
2047                 printk(KERN_CRIT
2048                         "af_unix_init: Cannot create unix_sock SLAB cache!\n");
2049
2050         sock_register(&unix_family_ops);
2051 #ifdef CONFIG_PROC_FS
2052         proc_net_fops_create("unix", 0, &unix_seq_fops);
2053 #endif
2054         unix_sysctl_register();
2055         return 0;
2056 }
2057
2058 static void __exit af_unix_exit(void)
2059 {
2060         sock_unregister(PF_UNIX);
2061         unix_sysctl_unregister();
2062         proc_net_remove("unix");
2063         kmem_cache_destroy(unix_sk_cachep);
2064 }
2065
2066 module_init(af_unix_init);
2067 module_exit(af_unix_exit);
2068
2069 MODULE_LICENSE("GPL");
2070 MODULE_ALIAS_NETPROTO(PF_UNIX);