VServer 1.9.2 (patch-2.6.8.1-vs1.9.2.diff)
[linux-2.6.git] / fs / pipe.c
1 /*
2  *  linux/fs/pipe.c
3  *
4  *  Copyright (C) 1991, 1992, 1999  Linus Torvalds
5  */
6
7 #include <linux/mm.h>
8 #include <linux/file.h>
9 #include <linux/poll.h>
10 #include <linux/slab.h>
11 #include <linux/module.h>
12 #include <linux/init.h>
13 #include <linux/fs.h>
14 #include <linux/mount.h>
15 #include <linux/pipe_fs_i.h>
16 #include <linux/uio.h>
17 #include <asm/uaccess.h>
18 #include <asm/ioctls.h>
19
20 /*
21  * We use a start+len construction, which provides full use of the 
22  * allocated memory.
23  * -- Florian Coosmann (FGC)
24  * 
25  * Reads with count = 0 should always return 0.
26  * -- Julian Bradfield 1999-06-07.
27  *
28  * FIFOs and Pipes now generate SIGIO for both readers and writers.
29  * -- Jeremy Elson <jelson@circlemud.org> 2001-08-16
30  *
31  * pipe_read & write cleanup
32  * -- Manfred Spraul <manfred@colorfullife.com> 2002-05-09
33  */
34
35 /*
    * Drop the inode semaphore and wait for a pipe event, atomically.
    *
    * The task is queued on PIPE_WAIT(*inode) in TASK_INTERRUPTIBLE state
    * before PIPE_SEM(*inode) is released, and only then calls schedule().
    * The semaphore is taken again before returning, so callers enter and
    * leave this function with PIPE_SEM(*inode) held.
    *
    * The function does not report why it woke up; callers re-check the
    * pipe state (and pending signals) in their own loops.
    */
36 void pipe_wait(struct inode * inode)
37 {
38         DEFINE_WAIT(wait);
39
           /* Queue ourselves first so the up() below cannot race past us. */
40         prepare_to_wait(PIPE_WAIT(*inode), &wait, TASK_INTERRUPTIBLE);
41         up(PIPE_SEM(*inode));
42         schedule();
43         finish_wait(PIPE_WAIT(*inode), &wait);
44         down(PIPE_SEM(*inode));
45 }
46
47 static inline int
48 pipe_iov_copy_from_user(void *to, struct iovec *iov, unsigned long len)
49 {
50         unsigned long copy;
51
52         while (len > 0) {
53                 while (!iov->iov_len)
54                         iov++;
55                 copy = min_t(unsigned long, len, iov->iov_len);
56
57                 if (copy_from_user(to, iov->iov_base, copy))
58                         return -EFAULT;
59                 to += copy;
60                 len -= copy;
61                 iov->iov_base += copy;
62                 iov->iov_len -= copy;
63         }
64         return 0;
65 }
66
67 static inline int
68 pipe_iov_copy_to_user(struct iovec *iov, const void *from, unsigned long len)
69 {
70         unsigned long copy;
71
72         while (len > 0) {
73                 while (!iov->iov_len)
74                         iov++;
75                 copy = min_t(unsigned long, len, iov->iov_len);
76
77                 if (copy_to_user(iov->iov_base, from, copy))
78                         return -EFAULT;
79                 from += copy;
80                 len -= copy;
81                 iov->iov_base += copy;
82                 iov->iov_len -= copy;
83         }
84         return 0;
85 }
86
/*
 * pipe_readv - read from a pipe into a vector of user buffers.
 *
 * Returns the number of bytes copied. A zero-length request returns 0
 * without taking the semaphore, and an empty pipe with no writers also
 * returns 0. If nothing has been copied yet the call may instead return
 * -EFAULT (copy to user space faulted), -EAGAIN (pipe empty, O_NONBLOCK
 * set and no writer sleeping in the kernel) or -ERESTARTSYS (signal
 * pending). Once some bytes were copied, those conditions end the read
 * and the byte count is returned.
 *
 * The iovec array is accessed through a non-const alias and its entries
 * are advanced by pipe_iov_copy_to_user() as data is handed out.
 * @ppos is not used. file_accessed() is called when data was returned.
 */
87 static ssize_t
88 pipe_readv(struct file *filp, const struct iovec *_iov,
89            unsigned long nr_segs, loff_t *ppos)
90 {
91         struct inode *inode = filp->f_dentry->d_inode;
92         int do_wakeup;
93         ssize_t ret;
94         struct iovec *iov = (struct iovec *)_iov;
95         size_t total_len;
96
97         total_len = iov_length(iov, nr_segs);
98         /* Null read succeeds. */
99         if (unlikely(total_len == 0))
100                 return 0;
101
102         do_wakeup = 0;
103         ret = 0;
104         down(PIPE_SEM(*inode));
105         for (;;) {
106                 int size = PIPE_LEN(*inode);
107                 if (size) {
108                         char *pipebuf = PIPE_BASE(*inode) + PIPE_START(*inode);
109                         ssize_t chars = PIPE_MAX_RCHUNK(*inode);
110
                            /* Clamp the chunk to what was asked for and what is buffered. */
111                         if (chars > total_len)
112                                 chars = total_len;
113                         if (chars > size)
114                                 chars = size;
115
116                         if (pipe_iov_copy_to_user(iov, pipebuf, chars)) {
117                                 if (!ret) ret = -EFAULT;
118                                 break;
119                         }
120                         ret += chars;
121
                            /* Advance the read position, wrapping at PIPE_SIZE. */
122                         PIPE_START(*inode) += chars;
123                         PIPE_START(*inode) &= (PIPE_SIZE - 1);
124                         PIPE_LEN(*inode) -= chars;
125                         total_len -= chars;
126                         do_wakeup = 1;
127                         if (!total_len)
128                                 break;  /* common path: read succeeded */
129                 }
130                 if (PIPE_LEN(*inode)) /* test for cyclic buffers */
131                         continue;
132                 if (!PIPE_WRITERS(*inode))
133                         break;
134                 if (!PIPE_WAITING_WRITERS(*inode)) {
135                         /* syscall merging: Usually we must not sleep
136                          * if O_NONBLOCK is set, or if we got some data.
137                          * But if a writer sleeps in kernel space, then
138                          * we can wait for that data without violating POSIX.
139                          */
140                         if (ret)
141                                 break;
142                         if (filp->f_flags & O_NONBLOCK) {
143                                 ret = -EAGAIN;
144                                 break;
145                         }
146                 }
147                 if (signal_pending(current)) {
148                         if (!ret) ret = -ERESTARTSYS;
149                         break;
150                 }
                    /* Tell writers about the room freed so far before sleeping. */
151                 if (do_wakeup) {
152                         wake_up_interruptible_sync(PIPE_WAIT(*inode));
153                         kill_fasync(PIPE_FASYNC_WRITERS(*inode), SIGIO, POLL_OUT);
154                 }
155                 pipe_wait(inode);
156         }
157         up(PIPE_SEM(*inode));
158         /* Signal writers asynchronously that there is more room.  */
159         if (do_wakeup) {
160                 wake_up_interruptible(PIPE_WAIT(*inode));
161                 kill_fasync(PIPE_FASYNC_WRITERS(*inode), SIGIO, POLL_OUT);
162         }
163         if (ret > 0)
164                 file_accessed(filp);
165         return ret;
166 }
167
168 static ssize_t
169 pipe_read(struct file *filp, char __user *buf, size_t count, loff_t *ppos)
170 {
171         struct iovec iov = { .iov_base = buf, .iov_len = count };
172         return pipe_readv(filp, &iov, 1, ppos);
173 }
174
/*
 * pipe_writev - write a vector of user buffers into a pipe.
 *
 * Returns the number of bytes written; a zero-length request returns 0
 * without taking the semaphore. If nothing has been written yet the call
 * may instead return -EPIPE (no readers), -EFAULT (copy from user space
 * faulted), -EAGAIN (O_NONBLOCK set and not enough room) or -ERESTARTSYS
 * (signal pending). SIGPIPE is sent to the current task whenever the loop
 * finds the pipe without readers, even if some data was already written.
 *
 * "min" is the free space required before the first transfer: the whole
 * request when it is at most PIPE_BUF bytes, otherwise a single byte. After
 * any data has gone in, the remainder only needs one free byte.
 *
 * PIPE_WAITING_WRITERS is raised around each sleep; pipe_readv() consults
 * it to decide whether it may keep waiting for more data.
 *
 * The iovec array is accessed through a non-const alias and its entries
 * are advanced by pipe_iov_copy_from_user(). @ppos is not used.
 * inode_update_time() is called when data was written.
 */
175 static ssize_t
176 pipe_writev(struct file *filp, const struct iovec *_iov,
177             unsigned long nr_segs, loff_t *ppos)
178 {
179         struct inode *inode = filp->f_dentry->d_inode;
180         ssize_t ret;
181         size_t min;
182         int do_wakeup;
183         struct iovec *iov = (struct iovec *)_iov;
184         size_t total_len;
185
186         total_len = iov_length(iov, nr_segs);
187         /* Null write succeeds. */
188         if (unlikely(total_len == 0))
189                 return 0;
190
191         do_wakeup = 0;
192         ret = 0;
193         min = total_len;
194         if (min > PIPE_BUF)
195                 min = 1;
196         down(PIPE_SEM(*inode));
197         for (;;) {
198                 int free;
199                 if (!PIPE_READERS(*inode)) {
200                         send_sig(SIGPIPE, current, 0);
201                         if (!ret) ret = -EPIPE;
202                         break;
203                 }
204                 free = PIPE_FREE(*inode);
205                 if (free >= min) {
206                         /* transfer data */
207                         ssize_t chars = PIPE_MAX_WCHUNK(*inode);
208                         char *pipebuf = PIPE_BASE(*inode) + PIPE_END(*inode);
209                         /* Always wakeup, even if the copy fails. Otherwise
210                          * we lock up (O_NONBLOCK-)readers that sleep due to
211                          * syscall merging.
212                          */
213                         do_wakeup = 1;
214                         if (chars > total_len)
215                                 chars = total_len;
216                         if (chars > free)
217                                 chars = free;
218
219                         if (pipe_iov_copy_from_user(pipebuf, iov, chars)) {
220                                 if (!ret) ret = -EFAULT;
221                                 break;
222                         }
223                         ret += chars;
224
225                         PIPE_LEN(*inode) += chars;
226                         total_len -= chars;
227                         if (!total_len)
228                                 break;
229                 }
230                 if (PIPE_FREE(*inode) && ret) {
231                         /* handle cyclic data buffers */
232                         min = 1;
233                         continue;
234                 }
235                 if (filp->f_flags & O_NONBLOCK) {
236                         if (!ret) ret = -EAGAIN;
237                         break;
238                 }
239                 if (signal_pending(current)) {
240                         if (!ret) ret = -ERESTARTSYS;
241                         break;
242                 }
                    /* Tell readers about the data queued so far before sleeping. */
243                 if (do_wakeup) {
244                         wake_up_interruptible_sync(PIPE_WAIT(*inode));
245                         kill_fasync(PIPE_FASYNC_READERS(*inode), SIGIO, POLL_IN);
246                         do_wakeup = 0;
247                 }
248                 PIPE_WAITING_WRITERS(*inode)++;
249                 pipe_wait(inode);
250                 PIPE_WAITING_WRITERS(*inode)--;
251         }
252         up(PIPE_SEM(*inode));
253         if (do_wakeup) {
254                 wake_up_interruptible(PIPE_WAIT(*inode));
255                 kill_fasync(PIPE_FASYNC_READERS(*inode), SIGIO, POLL_IN);
256         }
257         if (ret > 0)
258                 inode_update_time(inode, 1);    /* mtime and ctime */
259         return ret;
260 }
261
262 static ssize_t
263 pipe_write(struct file *filp, const char __user *buf,
264            size_t count, loff_t *ppos)
265 {
266         struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = count };
267         return pipe_writev(filp, &iov, 1, ppos);
268 }
269
270 static ssize_t
271 bad_pipe_r(struct file *filp, char __user *buf, size_t count, loff_t *ppos)
272 {
273         return -EBADF;
274 }
275
276 static ssize_t
277 bad_pipe_w(struct file *filp, const char __user *buf, size_t count, loff_t *ppos)
278 {
279         return -EBADF;
280 }
281
282 static int
283 pipe_ioctl(struct inode *pino, struct file *filp,
284            unsigned int cmd, unsigned long arg)
285 {
286         switch (cmd) {
287                 case FIONREAD:
288                         return put_user(PIPE_LEN(*pino), (int __user *)arg);
289                 default:
290                         return -EINVAL;
291         }
292 }
293
294 /* No kernel lock held - fine */
295 static unsigned int
296 pipe_poll(struct file *filp, poll_table *wait)
297 {
298         unsigned int mask;
299         struct inode *inode = filp->f_dentry->d_inode;
300
301         poll_wait(filp, PIPE_WAIT(*inode), wait);
302
303         /* Reading only -- no need for acquiring the semaphore.  */
304         mask = POLLIN | POLLRDNORM;
305         if (PIPE_EMPTY(*inode))
306                 mask = POLLOUT | POLLWRNORM;
307         if (!PIPE_WRITERS(*inode) && filp->f_version != PIPE_WCOUNTER(*inode))
308                 mask |= POLLHUP;
309         if (!PIPE_READERS(*inode))
310                 mask |= POLLERR;
311
312         return mask;
313 }
314
315 /*
     * FIXME: most Unices do not set POLLERR for fifos.
     * For now FIFOs simply reuse pipe_poll(), POLLERR reporting included.
     */
316 #define fifo_poll pipe_poll
317
318 static int
319 pipe_release(struct inode *inode, int decr, int decw)
320 {
321         down(PIPE_SEM(*inode));
322         PIPE_READERS(*inode) -= decr;
323         PIPE_WRITERS(*inode) -= decw;
324         if (!PIPE_READERS(*inode) && !PIPE_WRITERS(*inode)) {
325                 struct pipe_inode_info *info = inode->i_pipe;
326                 inode->i_pipe = NULL;
327                 free_page((unsigned long) info->base);
328                 kfree(info);
329         } else {
330                 wake_up_interruptible(PIPE_WAIT(*inode));
331                 kill_fasync(PIPE_FASYNC_READERS(*inode), SIGIO, POLL_IN);
332                 kill_fasync(PIPE_FASYNC_WRITERS(*inode), SIGIO, POLL_OUT);
333         }
334         up(PIPE_SEM(*inode));
335
336         return 0;
337 }
338
339 static int
340 pipe_read_fasync(int fd, struct file *filp, int on)
341 {
342         struct inode *inode = filp->f_dentry->d_inode;
343         int retval;
344
345         down(PIPE_SEM(*inode));
346         retval = fasync_helper(fd, filp, on, PIPE_FASYNC_READERS(*inode));
347         up(PIPE_SEM(*inode));
348
349         if (retval < 0)
350                 return retval;
351
352         return 0;
353 }
354
355
356 static int
357 pipe_write_fasync(int fd, struct file *filp, int on)
358 {
359         struct inode *inode = filp->f_dentry->d_inode;
360         int retval;
361
362         down(PIPE_SEM(*inode));
363         retval = fasync_helper(fd, filp, on, PIPE_FASYNC_WRITERS(*inode));
364         up(PIPE_SEM(*inode));
365
366         if (retval < 0)
367                 return retval;
368
369         return 0;
370 }
371
372
373 static int
374 pipe_rdwr_fasync(int fd, struct file *filp, int on)
375 {
376         struct inode *inode = filp->f_dentry->d_inode;
377         int retval;
378
379         down(PIPE_SEM(*inode));
380
381         retval = fasync_helper(fd, filp, on, PIPE_FASYNC_READERS(*inode));
382
383         if (retval >= 0)
384                 retval = fasync_helper(fd, filp, on, PIPE_FASYNC_WRITERS(*inode));
385
386         up(PIPE_SEM(*inode));
387
388         if (retval < 0)
389                 return retval;
390
391         return 0;
392 }
393
394
/*
 * release handler for read-only ends: removes the file from the readers'
 * fasync list, then drops one reader via pipe_release().
 */
static int
pipe_read_release(struct inode *inode, struct file *filp)
{
        /* The fasync result is not checked. */
        (void) pipe_read_fasync(-1, filp, 0);

        return pipe_release(inode, 1, 0);
}
401
/*
 * release handler for write-only ends: removes the file from the writers'
 * fasync list, then drops one writer via pipe_release().
 */
static int
pipe_write_release(struct inode *inode, struct file *filp)
{
        /* The fasync result is not checked. */
        (void) pipe_write_fasync(-1, filp, 0);

        return pipe_release(inode, 0, 1);
}
408
409 static int
410 pipe_rdwr_release(struct inode *inode, struct file *filp)
411 {
412         int decr, decw;
413
414         pipe_rdwr_fasync(-1, filp, 0);
415         decr = (filp->f_mode & FMODE_READ) != 0;
416         decw = (filp->f_mode & FMODE_WRITE) != 0;
417         return pipe_release(inode, decr, decw);
418 }
419
420 static int
421 pipe_read_open(struct inode *inode, struct file *filp)
422 {
423         /* We could have perhaps used atomic_t, but this and friends
424            below are the only places.  So it doesn't seem worthwhile.  */
425         down(PIPE_SEM(*inode));
426         PIPE_READERS(*inode)++;
427         up(PIPE_SEM(*inode));
428
429         return 0;
430 }
431
432 static int
433 pipe_write_open(struct inode *inode, struct file *filp)
434 {
435         down(PIPE_SEM(*inode));
436         PIPE_WRITERS(*inode)++;
437         up(PIPE_SEM(*inode));
438
439         return 0;
440 }
441
442 static int
443 pipe_rdwr_open(struct inode *inode, struct file *filp)
444 {
445         down(PIPE_SEM(*inode));
446         if (filp->f_mode & FMODE_READ)
447                 PIPE_READERS(*inode)++;
448         if (filp->f_mode & FMODE_WRITE)
449                 PIPE_WRITERS(*inode)++;
450         up(PIPE_SEM(*inode));
451
452         return 0;
453 }
454
455 /*
456  * The file_operations structs are not static because they
457  * are also used in linux/fs/fifo.c to do operations on FIFOs.
458  */
/*
 * Read-only FIFO operations: reads go to pipe_read()/pipe_readv(), writes
 * are rejected by bad_pipe_w(), and seeking is refused via no_llseek.
 */
459 struct file_operations read_fifo_fops = {
460         .llseek         = no_llseek,
461         .read           = pipe_read,
462         .readv          = pipe_readv,
463         .write          = bad_pipe_w,
464         .poll           = fifo_poll,
465         .ioctl          = pipe_ioctl,
466         .open           = pipe_read_open,
467         .release        = pipe_read_release,
468         .fasync         = pipe_read_fasync,
469 };
470
/*
 * Write-only FIFO operations: writes go to pipe_write()/pipe_writev(),
 * reads are rejected by bad_pipe_r(), and seeking is refused via no_llseek.
 * Not static: per the note above read_fifo_fops these tables are also used
 * from linux/fs/fifo.c.
 */
471 struct file_operations write_fifo_fops = {
472         .llseek         = no_llseek,
473         .read           = bad_pipe_r,
474         .write          = pipe_write,
475         .writev         = pipe_writev,
476         .poll           = fifo_poll,
477         .ioctl          = pipe_ioctl,
478         .open           = pipe_write_open,
479         .release        = pipe_write_release,
480         .fasync         = pipe_write_fasync,
481 };
482
/*
 * Read/write FIFO operations: both directions are wired to the real pipe
 * handlers, and the open/release/fasync hooks are the rdwr variants that
 * look at the file mode or update both fasync lists.
 */
483 struct file_operations rdwr_fifo_fops = {
484         .llseek         = no_llseek,
485         .read           = pipe_read,
486         .readv          = pipe_readv,
487         .write          = pipe_write,
488         .writev         = pipe_writev,
489         .poll           = fifo_poll,
490         .ioctl          = pipe_ioctl,
491         .open           = pipe_rdwr_open,
492         .release        = pipe_rdwr_release,
493         .fasync         = pipe_rdwr_fasync,
494 };
495
/*
 * Operations for the read end of a pipe (installed on f1 by do_pipe()):
 * reads go to pipe_read()/pipe_readv(), writes are rejected by
 * bad_pipe_w(), and seeking is refused via no_llseek.
 */
496 struct file_operations read_pipe_fops = {
497         .llseek         = no_llseek,
498         .read           = pipe_read,
499         .readv          = pipe_readv,
500         .write          = bad_pipe_w,
501         .poll           = pipe_poll,
502         .ioctl          = pipe_ioctl,
503         .open           = pipe_read_open,
504         .release        = pipe_read_release,
505         .fasync         = pipe_read_fasync,
506 };
507
/*
 * Operations for the write end of a pipe (installed on f2 by do_pipe()):
 * writes go to pipe_write()/pipe_writev(), reads are rejected by
 * bad_pipe_r(), and seeking is refused via no_llseek.
 */
508 struct file_operations write_pipe_fops = {
509         .llseek         = no_llseek,
510         .read           = bad_pipe_r,
511         .write          = pipe_write,
512         .writev         = pipe_writev,
513         .poll           = pipe_poll,
514         .ioctl          = pipe_ioctl,
515         .open           = pipe_write_open,
516         .release        = pipe_write_release,
517         .fasync         = pipe_write_fasync,
518 };
519
/*
 * Read/write pipe operations; get_pipe_inode() installs this table as the
 * i_fop of every new pipe inode. Both directions are wired to the real
 * pipe handlers and the open/release/fasync hooks are the rdwr variants.
 */
520 struct file_operations rdwr_pipe_fops = {
521         .llseek         = no_llseek,
522         .read           = pipe_read,
523         .readv          = pipe_readv,
524         .write          = pipe_write,
525         .writev         = pipe_writev,
526         .poll           = pipe_poll,
527         .ioctl          = pipe_ioctl,
528         .open           = pipe_rdwr_open,
529         .release        = pipe_rdwr_release,
530         .fasync         = pipe_rdwr_fasync,
531 };
532
533 struct inode* pipe_new(struct inode* inode)
534 {
535         unsigned long page;
536
537         page = __get_free_page(GFP_USER);
538         if (!page)
539                 return NULL;
540
541         inode->i_pipe = kmalloc(sizeof(struct pipe_inode_info), GFP_KERNEL);
542         if (!inode->i_pipe)
543                 goto fail_page;
544
545         init_waitqueue_head(PIPE_WAIT(*inode));
546         PIPE_BASE(*inode) = (char*) page;
547         PIPE_START(*inode) = PIPE_LEN(*inode) = 0;
548         PIPE_READERS(*inode) = PIPE_WRITERS(*inode) = 0;
549         PIPE_WAITING_WRITERS(*inode) = 0;
550         PIPE_RCOUNTER(*inode) = PIPE_WCOUNTER(*inode) = 1;
551         *PIPE_FASYNC_READERS(*inode) = *PIPE_FASYNC_WRITERS(*inode) = NULL;
552
553         return inode;
554 fail_page:
555         free_page(page);
556         return NULL;
557 }
558
/* Internal pipefs mount; set by init_pipe_fs() and used for every pipe inode/dentry. */
559 static struct vfsmount *pipe_mnt;
/* d_delete hook for pipefs dentries: unconditionally returns 1. */
560 static int pipefs_delete_dentry(struct dentry *dentry)
561 {
562         return 1;
563 }
/* Dentry operations attached by do_pipe() to each pipe's dentry. */
564 static struct dentry_operations pipefs_dentry_operations = {
565         .d_delete       = pipefs_delete_dentry,
566 };
567
568 static struct inode * get_pipe_inode(void)
569 {
570         struct inode *inode = new_inode(pipe_mnt->mnt_sb);
571
572         if (!inode)
573                 goto fail_inode;
574
575         if(!pipe_new(inode))
576                 goto fail_iput;
577         PIPE_READERS(*inode) = PIPE_WRITERS(*inode) = 1;
578         inode->i_fop = &rdwr_pipe_fops;
579
580         /*
581          * Mark the inode dirty from the very beginning,
582          * that way it will never be moved to the dirty
583          * list because "mark_inode_dirty()" will think
584          * that it already _is_ on the dirty list.
585          */
586         inode->i_state = I_DIRTY;
587         inode->i_mode = S_IFIFO | S_IRUSR | S_IWUSR;
588         inode->i_uid = current->fsuid;
589         inode->i_gid = current->fsgid;
590         inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
591         inode->i_blksize = PAGE_SIZE;
592         return inode;
593
594 fail_iput:
595         iput(inode);
596 fail_inode:
597         return NULL;
598 }
599
/*
 * do_pipe - create a pipe and install descriptors for its two ends.
 *
 * On success returns 0 with fd[0] holding the descriptor of the read end
 * (f1, read_pipe_fops) and fd[1] the descriptor of the write end
 * (f2, write_pipe_fops). Both struct files share one pipefs inode and one
 * dentry named "[<inode number>]".
 *
 * On failure returns -ENFILE when a struct file or the pipe inode cannot
 * be obtained, the negative value from get_unused_fd() when a descriptor
 * cannot be reserved, or -ENOMEM when d_alloc() fails. The labels at the
 * bottom undo the completed steps in reverse order of acquisition.
 */
600 int do_pipe(int *fd)
601 {
602         struct qstr this;
603         char name[32];
604         struct dentry *dentry;
605         struct inode * inode;
606         struct file *f1, *f2;
607         int error;
608         int i,j;
609
610         error = -ENFILE;
611         f1 = get_empty_filp();
612         if (!f1)
613                 goto no_files;
614
615         f2 = get_empty_filp();
616         if (!f2)
617                 goto close_f1;
618
            /* error is still -ENFILE if the inode cannot be set up. */
619         inode = get_pipe_inode();
620         if (!inode)
621                 goto close_f12;
622
623         error = get_unused_fd();
624         if (error < 0)
625                 goto close_f12_inode;
626         i = error;
627
628         error = get_unused_fd();
629         if (error < 0)
630                 goto close_f12_inode_i;
631         j = error;
632
633         error = -ENOMEM;
634         sprintf(name, "[%lu]", inode->i_ino);
635         this.name = name;
636         this.len = strlen(name);
637         this.hash = inode->i_ino; /* will go */
638         dentry = d_alloc(pipe_mnt->mnt_sb->s_root, &this);
639         if (!dentry)
640                 goto close_f12_inode_i_j;
641         dentry->d_op = &pipefs_dentry_operations;
642         d_add(dentry, inode);
            /* Two mntget() calls: presumably one mount reference per struct file. */
643         f1->f_vfsmnt = f2->f_vfsmnt = mntget(mntget(pipe_mnt));
644         f1->f_dentry = f2->f_dentry = dget(dentry);
645         f1->f_mapping = f2->f_mapping = inode->i_mapping;
646
647         /* read file */
648         f1->f_pos = f2->f_pos = 0;
649         f1->f_flags = O_RDONLY;
650         f1->f_op = &read_pipe_fops;
651         f1->f_mode = FMODE_READ;
652         f1->f_version = 0;
653
654         /* write file */
655         f2->f_flags = O_WRONLY;
656         f2->f_op = &write_pipe_fops;
657         f2->f_mode = FMODE_WRITE;
658         f2->f_version = 0;
659
660         fd_install(i, f1);
661         fd_install(j, f2);
662         fd[0] = i;
663         fd[1] = j;
664         return 0;
665
    /* Error unwinding: each label releases one more resource, newest first. */
666 close_f12_inode_i_j:
667         put_unused_fd(j);
668 close_f12_inode_i:
669         put_unused_fd(i);
670 close_f12_inode:
            /* Tear down the pipe state by hand before dropping the inode. */
671         free_page((unsigned long) PIPE_BASE(*inode));
672         kfree(inode->i_pipe);
673         inode->i_pipe = NULL;
674         iput(inode);
675 close_f12:
676         put_filp(f2);
677 close_f1:
678         put_filp(f1);
679 no_files:
680         return error;   
681 }
682
683 /*
684  * pipefs should _never_ be mounted by userland - too much of security hassle,
685  * no real gain from having the whole whorehouse mounted. So we don't need
686  * any operations on the root directory. However, we need a non-trivial
687  * d_name - pipe: will go nicely and kill the special-casing in procfs.
688  */
689
690 static struct super_block *pipefs_get_sb(struct file_system_type *fs_type,
691         int flags, const char *dev_name, void *data)
692 {
693         return get_sb_pseudo(fs_type, "pipe:", NULL, PIPEFS_MAGIC);
694 }
695
/*
 * Filesystem type descriptor for "pipefs"; registered and kern_mount()ed
 * by init_pipe_fs().
 */
696 static struct file_system_type pipe_fs_type = {
697         .name           = "pipefs",
698         .get_sb         = pipefs_get_sb,
699         .kill_sb        = kill_anon_super,
700 };
701
702 static int __init init_pipe_fs(void)
703 {
704         int err = register_filesystem(&pipe_fs_type);
705         if (!err) {
706                 pipe_mnt = kern_mount(&pipe_fs_type);
707                 if (IS_ERR(pipe_mnt)) {
708                         err = PTR_ERR(pipe_mnt);
709                         unregister_filesystem(&pipe_fs_type);
710                 }
711         }
712         return err;
713 }
714
715 static void __exit exit_pipe_fs(void)
716 {
717         unregister_filesystem(&pipe_fs_type);
718         mntput(pipe_mnt);
719 }
720
/* Hook pipefs setup and teardown into the init/exit machinery. */
721 module_init(init_pipe_fs)
722 module_exit(exit_pipe_fs)