ftp://ftp.kernel.org/pub/linux/kernel/v2.6/linux-2.6.6.tar.bz2
[linux-2.6.git] / fs / pipe.c
/*
 *  linux/fs/pipe.c
 *
 *  Copyright (C) 1991, 1992, 1999  Linus Torvalds
 */
6
7 #include <linux/mm.h>
8 #include <linux/file.h>
9 #include <linux/poll.h>
10 #include <linux/slab.h>
11 #include <linux/module.h>
12 #include <linux/init.h>
13 #include <linux/fs.h>
14 #include <linux/mount.h>
15 #include <linux/pipe_fs_i.h>
16 #include <linux/uio.h>
17 #include <asm/uaccess.h>
18 #include <asm/ioctls.h>
19
/*
 * We use a start+len construction, which provides full use of the
 * allocated memory.
 * -- Florian Coosmann (FGC)
 *
 * Reads with count = 0 should always return 0.
 * -- Julian Bradfield 1999-06-07.
 *
 * FIFOs and Pipes now generate SIGIO for both readers and writers.
 * -- Jeremy Elson <jelson@circlemud.org> 2001-08-16
 *
 * pipe_read & write cleanup
 * -- Manfred Spraul <manfred@colorfullife.com> 2002-05-09
 */
34
35 /* Drop the inode semaphore and wait for a pipe event, atomically */
36 void pipe_wait(struct inode * inode)
37 {
38         DEFINE_WAIT(wait);
39
40         prepare_to_wait(PIPE_WAIT(*inode), &wait, TASK_INTERRUPTIBLE);
41         up(PIPE_SEM(*inode));
42         schedule();
43         finish_wait(PIPE_WAIT(*inode), &wait);
44         down(PIPE_SEM(*inode));
45 }
46
47 static inline int
48 pipe_iov_copy_from_user(void *to, struct iovec *iov, unsigned long len)
49 {
50         unsigned long copy;
51
52         while (len > 0) {
53                 while (!iov->iov_len)
54                         iov++;
55                 copy = min_t(unsigned long, len, iov->iov_len);
56
57                 if (copy_from_user(to, iov->iov_base, copy))
58                         return -EFAULT;
59                 to += copy;
60                 len -= copy;
61                 iov->iov_base += copy;
62                 iov->iov_len -= copy;
63         }
64         return 0;
65 }
66
67 static inline int
68 pipe_iov_copy_to_user(struct iovec *iov, const void *from, unsigned long len)
69 {
70         unsigned long copy;
71
72         while (len > 0) {
73                 while (!iov->iov_len)
74                         iov++;
75                 copy = min_t(unsigned long, len, iov->iov_len);
76
77                 if (copy_to_user(iov->iov_base, from, copy))
78                         return -EFAULT;
79                 from += copy;
80                 len -= copy;
81                 iov->iov_base += copy;
82                 iov->iov_len -= copy;
83         }
84         return 0;
85 }
86
87 static ssize_t
88 pipe_readv(struct file *filp, const struct iovec *_iov,
89            unsigned long nr_segs, loff_t *ppos)
90 {
91         struct inode *inode = filp->f_dentry->d_inode;
92         int do_wakeup;
93         ssize_t ret;
94         struct iovec *iov = (struct iovec *)_iov;
95         size_t total_len;
96
97         /* pread is not allowed on pipes. */
98         if (unlikely(ppos != &filp->f_pos))
99                 return -ESPIPE;
100
101         total_len = iov_length(iov, nr_segs);
102         /* Null read succeeds. */
103         if (unlikely(total_len == 0))
104                 return 0;
105
106         do_wakeup = 0;
107         ret = 0;
108         down(PIPE_SEM(*inode));
109         for (;;) {
110                 int size = PIPE_LEN(*inode);
111                 if (size) {
112                         char *pipebuf = PIPE_BASE(*inode) + PIPE_START(*inode);
113                         ssize_t chars = PIPE_MAX_RCHUNK(*inode);
114
115                         if (chars > total_len)
116                                 chars = total_len;
117                         if (chars > size)
118                                 chars = size;
119
120                         if (pipe_iov_copy_to_user(iov, pipebuf, chars)) {
121                                 if (!ret) ret = -EFAULT;
122                                 break;
123                         }
124                         ret += chars;
125
126                         PIPE_START(*inode) += chars;
127                         PIPE_START(*inode) &= (PIPE_SIZE - 1);
128                         PIPE_LEN(*inode) -= chars;
129                         total_len -= chars;
130                         do_wakeup = 1;
131                         if (!total_len)
132                                 break;  /* common path: read succeeded */
133                 }
134                 if (PIPE_LEN(*inode)) /* test for cyclic buffers */
135                         continue;
136                 if (!PIPE_WRITERS(*inode))
137                         break;
138                 if (!PIPE_WAITING_WRITERS(*inode)) {
139                         /* syscall merging: Usually we must not sleep
140                          * if O_NONBLOCK is set, or if we got some data.
141                          * But if a writer sleeps in kernel space, then
142                          * we can wait for that data without violating POSIX.
143                          */
144                         if (ret)
145                                 break;
146                         if (filp->f_flags & O_NONBLOCK) {
147                                 ret = -EAGAIN;
148                                 break;
149                         }
150                 }
151                 if (signal_pending(current)) {
152                         if (!ret) ret = -ERESTARTSYS;
153                         break;
154                 }
155                 if (do_wakeup) {
156                         wake_up_interruptible_sync(PIPE_WAIT(*inode));
157                         kill_fasync(PIPE_FASYNC_WRITERS(*inode), SIGIO, POLL_OUT);
158                 }
159                 pipe_wait(inode);
160         }
161         up(PIPE_SEM(*inode));
162         /* Signal writers asynchronously that there is more room.  */
163         if (do_wakeup) {
164                 wake_up_interruptible(PIPE_WAIT(*inode));
165                 kill_fasync(PIPE_FASYNC_WRITERS(*inode), SIGIO, POLL_OUT);
166         }
167         if (ret > 0)
168                 file_accessed(filp);
169         return ret;
170 }
171
172 static ssize_t
173 pipe_read(struct file *filp, char __user *buf, size_t count, loff_t *ppos)
174 {
175         struct iovec iov = { .iov_base = buf, .iov_len = count };
176         return pipe_readv(filp, &iov, 1, ppos);
177 }
178
179 static ssize_t
180 pipe_writev(struct file *filp, const struct iovec *_iov,
181             unsigned long nr_segs, loff_t *ppos)
182 {
183         struct inode *inode = filp->f_dentry->d_inode;
184         ssize_t ret;
185         size_t min;
186         int do_wakeup;
187         struct iovec *iov = (struct iovec *)_iov;
188         size_t total_len;
189
190         /* pwrite is not allowed on pipes. */
191         if (unlikely(ppos != &filp->f_pos))
192                 return -ESPIPE;
193
194         total_len = iov_length(iov, nr_segs);
195         /* Null write succeeds. */
196         if (unlikely(total_len == 0))
197                 return 0;
198
199         do_wakeup = 0;
200         ret = 0;
201         min = total_len;
202         if (min > PIPE_BUF)
203                 min = 1;
204         down(PIPE_SEM(*inode));
205         for (;;) {
206                 int free;
207                 if (!PIPE_READERS(*inode)) {
208                         send_sig(SIGPIPE, current, 0);
209                         if (!ret) ret = -EPIPE;
210                         break;
211                 }
212                 free = PIPE_FREE(*inode);
213                 if (free >= min) {
214                         /* transfer data */
215                         ssize_t chars = PIPE_MAX_WCHUNK(*inode);
216                         char *pipebuf = PIPE_BASE(*inode) + PIPE_END(*inode);
217                         /* Always wakeup, even if the copy fails. Otherwise
218                          * we lock up (O_NONBLOCK-)readers that sleep due to
219                          * syscall merging.
220                          */
221                         do_wakeup = 1;
222                         if (chars > total_len)
223                                 chars = total_len;
224                         if (chars > free)
225                                 chars = free;
226
227                         if (pipe_iov_copy_from_user(pipebuf, iov, chars)) {
228                                 if (!ret) ret = -EFAULT;
229                                 break;
230                         }
231                         ret += chars;
232
233                         PIPE_LEN(*inode) += chars;
234                         total_len -= chars;
235                         if (!total_len)
236                                 break;
237                 }
238                 if (PIPE_FREE(*inode) && ret) {
239                         /* handle cyclic data buffers */
240                         min = 1;
241                         continue;
242                 }
243                 if (filp->f_flags & O_NONBLOCK) {
244                         if (!ret) ret = -EAGAIN;
245                         break;
246                 }
247                 if (signal_pending(current)) {
248                         if (!ret) ret = -ERESTARTSYS;
249                         break;
250                 }
251                 if (do_wakeup) {
252                         wake_up_interruptible_sync(PIPE_WAIT(*inode));
253                         kill_fasync(PIPE_FASYNC_READERS(*inode), SIGIO, POLL_IN);
254                         do_wakeup = 0;
255                 }
256                 PIPE_WAITING_WRITERS(*inode)++;
257                 pipe_wait(inode);
258                 PIPE_WAITING_WRITERS(*inode)--;
259         }
260         up(PIPE_SEM(*inode));
261         if (do_wakeup) {
262                 wake_up_interruptible(PIPE_WAIT(*inode));
263                 kill_fasync(PIPE_FASYNC_READERS(*inode), SIGIO, POLL_IN);
264         }
265         if (ret > 0)
266                 inode_update_time(inode, 1);    /* mtime and ctime */
267         return ret;
268 }
269
270 static ssize_t
271 pipe_write(struct file *filp, const char __user *buf,
272            size_t count, loff_t *ppos)
273 {
274         struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = count };
275         return pipe_writev(filp, &iov, 1, ppos);
276 }
277
278 static ssize_t
279 bad_pipe_r(struct file *filp, char __user *buf, size_t count, loff_t *ppos)
280 {
281         return -EBADF;
282 }
283
284 static ssize_t
285 bad_pipe_w(struct file *filp, const char __user *buf, size_t count, loff_t *ppos)
286 {
287         return -EBADF;
288 }
289
290 static int
291 pipe_ioctl(struct inode *pino, struct file *filp,
292            unsigned int cmd, unsigned long arg)
293 {
294         switch (cmd) {
295                 case FIONREAD:
296                         return put_user(PIPE_LEN(*pino), (int __user *)arg);
297                 default:
298                         return -EINVAL;
299         }
300 }
301
302 /* No kernel lock held - fine */
303 static unsigned int
304 pipe_poll(struct file *filp, poll_table *wait)
305 {
306         unsigned int mask;
307         struct inode *inode = filp->f_dentry->d_inode;
308
309         poll_wait(filp, PIPE_WAIT(*inode), wait);
310
311         /* Reading only -- no need for acquiring the semaphore.  */
312         mask = POLLIN | POLLRDNORM;
313         if (PIPE_EMPTY(*inode))
314                 mask = POLLOUT | POLLWRNORM;
315         if (!PIPE_WRITERS(*inode) && filp->f_version != PIPE_WCOUNTER(*inode))
316                 mask |= POLLHUP;
317         if (!PIPE_READERS(*inode))
318                 mask |= POLLERR;
319
320         return mask;
321 }
322
/*
 * FIFOs share the pipe poll handler.
 * FIXME: most Unices do not set POLLERR for fifos, whereas pipe_poll()
 * reports it whenever there are no readers.
 */
#define fifo_poll pipe_poll
325
326 static int
327 pipe_release(struct inode *inode, int decr, int decw)
328 {
329         down(PIPE_SEM(*inode));
330         PIPE_READERS(*inode) -= decr;
331         PIPE_WRITERS(*inode) -= decw;
332         if (!PIPE_READERS(*inode) && !PIPE_WRITERS(*inode)) {
333                 struct pipe_inode_info *info = inode->i_pipe;
334                 inode->i_pipe = NULL;
335                 free_page((unsigned long) info->base);
336                 kfree(info);
337         } else {
338                 wake_up_interruptible(PIPE_WAIT(*inode));
339                 kill_fasync(PIPE_FASYNC_READERS(*inode), SIGIO, POLL_IN);
340                 kill_fasync(PIPE_FASYNC_WRITERS(*inode), SIGIO, POLL_OUT);
341         }
342         up(PIPE_SEM(*inode));
343
344         return 0;
345 }
346
347 static int
348 pipe_read_fasync(int fd, struct file *filp, int on)
349 {
350         struct inode *inode = filp->f_dentry->d_inode;
351         int retval;
352
353         down(PIPE_SEM(*inode));
354         retval = fasync_helper(fd, filp, on, PIPE_FASYNC_READERS(*inode));
355         up(PIPE_SEM(*inode));
356
357         if (retval < 0)
358                 return retval;
359
360         return 0;
361 }
362
363
364 static int
365 pipe_write_fasync(int fd, struct file *filp, int on)
366 {
367         struct inode *inode = filp->f_dentry->d_inode;
368         int retval;
369
370         down(PIPE_SEM(*inode));
371         retval = fasync_helper(fd, filp, on, PIPE_FASYNC_WRITERS(*inode));
372         up(PIPE_SEM(*inode));
373
374         if (retval < 0)
375                 return retval;
376
377         return 0;
378 }
379
380
381 static int
382 pipe_rdwr_fasync(int fd, struct file *filp, int on)
383 {
384         struct inode *inode = filp->f_dentry->d_inode;
385         int retval;
386
387         down(PIPE_SEM(*inode));
388
389         retval = fasync_helper(fd, filp, on, PIPE_FASYNC_READERS(*inode));
390
391         if (retval >= 0)
392                 retval = fasync_helper(fd, filp, on, PIPE_FASYNC_WRITERS(*inode));
393
394         up(PIPE_SEM(*inode));
395
396         if (retval < 0)
397                 return retval;
398
399         return 0;
400 }
401
402
/*
 * pipe_read_release - .release handler for read-only pipe/FIFO files.
 *
 * Removes the file from the reader fasync list (result ignored), then
 * drops one reader reference.  Returns what pipe_release() returns.
 */
static int
pipe_read_release(struct inode *inode, struct file *filp)
{
	int ret;

	pipe_read_fasync(-1, filp, 0);
	ret = pipe_release(inode, 1, 0);

	return ret;
}
409
/*
 * pipe_write_release - .release handler for write-only pipe/FIFO files.
 *
 * Removes the file from the writer fasync list (result ignored), then
 * drops one writer reference.  Returns what pipe_release() returns.
 */
static int
pipe_write_release(struct inode *inode, struct file *filp)
{
	int ret;

	pipe_write_fasync(-1, filp, 0);
	ret = pipe_release(inode, 0, 1);

	return ret;
}
416
417 static int
418 pipe_rdwr_release(struct inode *inode, struct file *filp)
419 {
420         int decr, decw;
421
422         pipe_rdwr_fasync(-1, filp, 0);
423         decr = (filp->f_mode & FMODE_READ) != 0;
424         decw = (filp->f_mode & FMODE_WRITE) != 0;
425         return pipe_release(inode, decr, decw);
426 }
427
428 static int
429 pipe_read_open(struct inode *inode, struct file *filp)
430 {
431         /* We could have perhaps used atomic_t, but this and friends
432            below are the only places.  So it doesn't seem worthwhile.  */
433         down(PIPE_SEM(*inode));
434         PIPE_READERS(*inode)++;
435         up(PIPE_SEM(*inode));
436
437         return 0;
438 }
439
440 static int
441 pipe_write_open(struct inode *inode, struct file *filp)
442 {
443         down(PIPE_SEM(*inode));
444         PIPE_WRITERS(*inode)++;
445         up(PIPE_SEM(*inode));
446
447         return 0;
448 }
449
450 static int
451 pipe_rdwr_open(struct inode *inode, struct file *filp)
452 {
453         down(PIPE_SEM(*inode));
454         if (filp->f_mode & FMODE_READ)
455                 PIPE_READERS(*inode)++;
456         if (filp->f_mode & FMODE_WRITE)
457                 PIPE_WRITERS(*inode)++;
458         up(PIPE_SEM(*inode));
459
460         return 0;
461 }
462
463 /*
464  * The file_operations structs are not static because they
465  * are also used in linux/fs/fifo.c to do operations on FIFOs.
466  */
467 struct file_operations read_fifo_fops = {
468         .llseek         = no_llseek,
469         .read           = pipe_read,
470         .readv          = pipe_readv,
471         .write          = bad_pipe_w,
472         .poll           = fifo_poll,
473         .ioctl          = pipe_ioctl,
474         .open           = pipe_read_open,
475         .release        = pipe_read_release,
476         .fasync         = pipe_read_fasync,
477 };
478
479 struct file_operations write_fifo_fops = {
480         .llseek         = no_llseek,
481         .read           = bad_pipe_r,
482         .write          = pipe_write,
483         .writev         = pipe_writev,
484         .poll           = fifo_poll,
485         .ioctl          = pipe_ioctl,
486         .open           = pipe_write_open,
487         .release        = pipe_write_release,
488         .fasync         = pipe_write_fasync,
489 };
490
491 struct file_operations rdwr_fifo_fops = {
492         .llseek         = no_llseek,
493         .read           = pipe_read,
494         .readv          = pipe_readv,
495         .write          = pipe_write,
496         .writev         = pipe_writev,
497         .poll           = fifo_poll,
498         .ioctl          = pipe_ioctl,
499         .open           = pipe_rdwr_open,
500         .release        = pipe_rdwr_release,
501         .fasync         = pipe_rdwr_fasync,
502 };
503
504 struct file_operations read_pipe_fops = {
505         .llseek         = no_llseek,
506         .read           = pipe_read,
507         .readv          = pipe_readv,
508         .write          = bad_pipe_w,
509         .poll           = pipe_poll,
510         .ioctl          = pipe_ioctl,
511         .open           = pipe_read_open,
512         .release        = pipe_read_release,
513         .fasync         = pipe_read_fasync,
514 };
515
516 struct file_operations write_pipe_fops = {
517         .llseek         = no_llseek,
518         .read           = bad_pipe_r,
519         .write          = pipe_write,
520         .writev         = pipe_writev,
521         .poll           = pipe_poll,
522         .ioctl          = pipe_ioctl,
523         .open           = pipe_write_open,
524         .release        = pipe_write_release,
525         .fasync         = pipe_write_fasync,
526 };
527
528 struct file_operations rdwr_pipe_fops = {
529         .llseek         = no_llseek,
530         .read           = pipe_read,
531         .readv          = pipe_readv,
532         .write          = pipe_write,
533         .writev         = pipe_writev,
534         .poll           = pipe_poll,
535         .ioctl          = pipe_ioctl,
536         .open           = pipe_rdwr_open,
537         .release        = pipe_rdwr_release,
538         .fasync         = pipe_rdwr_fasync,
539 };
540
541 struct inode* pipe_new(struct inode* inode)
542 {
543         unsigned long page;
544
545         page = __get_free_page(GFP_USER);
546         if (!page)
547                 return NULL;
548
549         inode->i_pipe = kmalloc(sizeof(struct pipe_inode_info), GFP_KERNEL);
550         if (!inode->i_pipe)
551                 goto fail_page;
552
553         init_waitqueue_head(PIPE_WAIT(*inode));
554         PIPE_BASE(*inode) = (char*) page;
555         PIPE_START(*inode) = PIPE_LEN(*inode) = 0;
556         PIPE_READERS(*inode) = PIPE_WRITERS(*inode) = 0;
557         PIPE_WAITING_WRITERS(*inode) = 0;
558         PIPE_RCOUNTER(*inode) = PIPE_WCOUNTER(*inode) = 1;
559         *PIPE_FASYNC_READERS(*inode) = *PIPE_FASYNC_WRITERS(*inode) = NULL;
560
561         return inode;
562 fail_page:
563         free_page(page);
564         return NULL;
565 }
566
/*
 * Internal pipefs mount: set by init_pipe_fs() through kern_mount() and
 * used by get_pipe_inode() and do_pipe() to reach the pipefs superblock.
 */
static struct vfsmount *pipe_mnt;
/*
 * pipefs_delete_dentry - d_delete hook for pipefs dentries.
 *
 * Ignores @dentry and unconditionally returns 1.
 */
static int pipefs_delete_dentry(struct dentry *dentry)
{
	return 1;
}
572 static struct dentry_operations pipefs_dentry_operations = {
573         .d_delete       = pipefs_delete_dentry,
574 };
575
576 static struct inode * get_pipe_inode(void)
577 {
578         struct inode *inode = new_inode(pipe_mnt->mnt_sb);
579
580         if (!inode)
581                 goto fail_inode;
582
583         if(!pipe_new(inode))
584                 goto fail_iput;
585         PIPE_READERS(*inode) = PIPE_WRITERS(*inode) = 1;
586         inode->i_fop = &rdwr_pipe_fops;
587
588         /*
589          * Mark the inode dirty from the very beginning,
590          * that way it will never be moved to the dirty
591          * list because "mark_inode_dirty()" will think
592          * that it already _is_ on the dirty list.
593          */
594         inode->i_state = I_DIRTY;
595         inode->i_mode = S_IFIFO | S_IRUSR | S_IWUSR;
596         inode->i_uid = current->fsuid;
597         inode->i_gid = current->fsgid;
598         inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
599         inode->i_blksize = PAGE_SIZE;
600         return inode;
601
602 fail_iput:
603         iput(inode);
604 fail_inode:
605         return NULL;
606 }
607
608 int do_pipe(int *fd)
609 {
610         struct qstr this;
611         char name[32];
612         struct dentry *dentry;
613         struct inode * inode;
614         struct file *f1, *f2;
615         int error;
616         int i,j;
617
618         error = -ENFILE;
619         f1 = get_empty_filp();
620         if (!f1)
621                 goto no_files;
622
623         f2 = get_empty_filp();
624         if (!f2)
625                 goto close_f1;
626
627         inode = get_pipe_inode();
628         if (!inode)
629                 goto close_f12;
630
631         error = get_unused_fd();
632         if (error < 0)
633                 goto close_f12_inode;
634         i = error;
635
636         error = get_unused_fd();
637         if (error < 0)
638                 goto close_f12_inode_i;
639         j = error;
640
641         error = -ENOMEM;
642         sprintf(name, "[%lu]", inode->i_ino);
643         this.name = name;
644         this.len = strlen(name);
645         this.hash = inode->i_ino; /* will go */
646         dentry = d_alloc(pipe_mnt->mnt_sb->s_root, &this);
647         if (!dentry)
648                 goto close_f12_inode_i_j;
649         dentry->d_op = &pipefs_dentry_operations;
650         d_add(dentry, inode);
651         f1->f_vfsmnt = f2->f_vfsmnt = mntget(mntget(pipe_mnt));
652         f1->f_dentry = f2->f_dentry = dget(dentry);
653         f1->f_mapping = f2->f_mapping = inode->i_mapping;
654
655         /* read file */
656         f1->f_pos = f2->f_pos = 0;
657         f1->f_flags = O_RDONLY;
658         f1->f_op = &read_pipe_fops;
659         f1->f_mode = 1;
660         f1->f_version = 0;
661
662         /* write file */
663         f2->f_flags = O_WRONLY;
664         f2->f_op = &write_pipe_fops;
665         f2->f_mode = 2;
666         f2->f_version = 0;
667
668         fd_install(i, f1);
669         fd_install(j, f2);
670         fd[0] = i;
671         fd[1] = j;
672         return 0;
673
674 close_f12_inode_i_j:
675         put_unused_fd(j);
676 close_f12_inode_i:
677         put_unused_fd(i);
678 close_f12_inode:
679         free_page((unsigned long) PIPE_BASE(*inode));
680         kfree(inode->i_pipe);
681         inode->i_pipe = NULL;
682         iput(inode);
683 close_f12:
684         put_filp(f2);
685 close_f1:
686         put_filp(f1);
687 no_files:
688         return error;   
689 }
690
691 /*
692  * pipefs should _never_ be mounted by userland - too much of security hassle,
693  * no real gain from having the whole whorehouse mounted. So we don't need
694  * any operations on the root directory. However, we need a non-trivial
695  * d_name - pipe: will go nicely and kill the special-casing in procfs.
696  */
697
698 static struct super_block *pipefs_get_sb(struct file_system_type *fs_type,
699         int flags, const char *dev_name, void *data)
700 {
701         return get_sb_pseudo(fs_type, "pipe:", NULL, PIPEFS_MAGIC);
702 }
703
704 static struct file_system_type pipe_fs_type = {
705         .name           = "pipefs",
706         .get_sb         = pipefs_get_sb,
707         .kill_sb        = kill_anon_super,
708 };
709
710 static int __init init_pipe_fs(void)
711 {
712         int err = register_filesystem(&pipe_fs_type);
713         if (!err) {
714                 pipe_mnt = kern_mount(&pipe_fs_type);
715                 if (IS_ERR(pipe_mnt)) {
716                         err = PTR_ERR(pipe_mnt);
717                         unregister_filesystem(&pipe_fs_type);
718                 }
719         }
720         return err;
721 }
722
723 static void __exit exit_pipe_fs(void)
724 {
725         unregister_filesystem(&pipe_fs_type);
726         mntput(pipe_mnt);
727 }
728
729 module_init(init_pipe_fs)
730 module_exit(exit_pipe_fs)