Initial revision
[linux-2.6.git] / fs / pipe.c
index 9ae8d83..8aada8e 100644 (file)
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -14,6 +14,8 @@
 #include <linux/mount.h>
 #include <linux/pipe_fs_i.h>
 #include <linux/uio.h>
+#include <linux/highmem.h>
+
 #include <asm/uaccess.h>
 #include <asm/ioctls.h>
 
@@ -37,14 +39,18 @@ void pipe_wait(struct inode * inode)
 {
        DEFINE_WAIT(wait);
 
-       prepare_to_wait(PIPE_WAIT(*inode), &wait, TASK_INTERRUPTIBLE);
-       up(PIPE_SEM(*inode));
+       /*
+        * Pipes are system-local resources, so sleeping on them
+        * is considered a noninteractive wait:
+        */
+       prepare_to_wait(PIPE_WAIT(*inode), &wait, TASK_INTERRUPTIBLE|TASK_NONINTERACTIVE);
+       mutex_unlock(PIPE_MUTEX(*inode));
        schedule();
        finish_wait(PIPE_WAIT(*inode), &wait);
-       down(PIPE_SEM(*inode));
+       mutex_lock(PIPE_MUTEX(*inode));
 }
 
-static inline int
+static int
 pipe_iov_copy_from_user(void *to, struct iovec *iov, unsigned long len)
 {
        unsigned long copy;
@@ -64,7 +70,7 @@ pipe_iov_copy_from_user(void *to, struct iovec *iov, unsigned long len)
        return 0;
 }
 
-static inline int
+static int
 pipe_iov_copy_to_user(struct iovec *iov, const void *from, unsigned long len)
 {
        unsigned long copy;
@@ -84,20 +90,45 @@ pipe_iov_copy_to_user(struct iovec *iov, const void *from, unsigned long len)
        return 0;
 }
 
+static void anon_pipe_buf_release(struct pipe_inode_info *info, struct pipe_buffer *buf) /* release op for anonymous pipe buffers: cache the page on the pipe or free it */
+{
+       struct page *page = buf->page;
+
+       if (info->tmp_page) { /* a spare page is already cached on this pipe */
+               __free_page(page); /* so this one goes back to the page allocator */
+               return;
+       }
+       info->tmp_page = page; /* keep it as the spare; pipe_writev uses tmp_page before calling alloc_page */
+}
+
+static void *anon_pipe_buf_map(struct file *file, struct pipe_inode_info *info, struct pipe_buffer *buf) /* map op: returns a kernel address for the buffer's page; file and info are unused */
+{
+       return kmap(buf->page); /* pipe_writev allocates these pages with GFP_HIGHUSER, so access goes through kmap */
+}
+
+static void anon_pipe_buf_unmap(struct pipe_inode_info *info, struct pipe_buffer *buf) /* unmap op: drops the mapping taken by anon_pipe_buf_map; info is unused */
+{
+       kunmap(buf->page);
+}
+
+static struct pipe_buf_operations anon_pipe_buf_ops = { /* ops installed by pipe_writev on the buffers it fills from freshly allocated pages */
+       .can_merge = 1, /* pipe_writev checks this before appending a small write to the last buffer */
+       .map = anon_pipe_buf_map,
+       .unmap = anon_pipe_buf_unmap,
+       .release = anon_pipe_buf_release,
+};
+
 static ssize_t
 pipe_readv(struct file *filp, const struct iovec *_iov,
           unsigned long nr_segs, loff_t *ppos)
 {
        struct inode *inode = filp->f_dentry->d_inode;
+       struct pipe_inode_info *info;
        int do_wakeup;
        ssize_t ret;
        struct iovec *iov = (struct iovec *)_iov;
        size_t total_len;
 
-       /* pread is not allowed on pipes. */
-       if (unlikely(ppos != &filp->f_pos))
-               return -ESPIPE;
-
        total_len = iov_length(iov, nr_segs);
        /* Null read succeeds. */
        if (unlikely(total_len == 0))
@@ -105,33 +136,44 @@ pipe_readv(struct file *filp, const struct iovec *_iov,
 
        do_wakeup = 0;
        ret = 0;
-       down(PIPE_SEM(*inode));
+       mutex_lock(PIPE_MUTEX(*inode));
+       info = inode->i_pipe;
        for (;;) {
-               int size = PIPE_LEN(*inode);
-               if (size) {
-                       char *pipebuf = PIPE_BASE(*inode) + PIPE_START(*inode);
-                       ssize_t chars = PIPE_MAX_RCHUNK(*inode);
+               int bufs = info->nrbufs;
+               if (bufs) {
+                       int curbuf = info->curbuf;
+                       struct pipe_buffer *buf = info->bufs + curbuf;
+                       struct pipe_buf_operations *ops = buf->ops;
+                       void *addr;
+                       size_t chars = buf->len;
+                       int error;
 
                        if (chars > total_len)
                                chars = total_len;
-                       if (chars > size)
-                               chars = size;
 
-                       if (pipe_iov_copy_to_user(iov, pipebuf, chars)) {
+                       addr = ops->map(filp, info, buf);
+                       error = pipe_iov_copy_to_user(iov, addr + buf->offset, chars);
+                       ops->unmap(info, buf);
+                       if (unlikely(error)) {
                                if (!ret) ret = -EFAULT;
                                break;
                        }
                        ret += chars;
-
-                       PIPE_START(*inode) += chars;
-                       PIPE_START(*inode) &= (PIPE_SIZE - 1);
-                       PIPE_LEN(*inode) -= chars;
+                       buf->offset += chars;
+                       buf->len -= chars;
+                       if (!buf->len) {
+                               buf->ops = NULL;
+                               ops->release(info, buf);
+                               curbuf = (curbuf + 1) & (PIPE_BUFFERS-1);
+                               info->curbuf = curbuf;
+                               info->nrbufs = --bufs;
+                               do_wakeup = 1;
+                       }
                        total_len -= chars;
-                       do_wakeup = 1;
                        if (!total_len)
                                break;  /* common path: read succeeded */
                }
-               if (PIPE_LEN(*inode)) /* test for cyclic buffers */
+               if (bufs)       /* More to do? */
                        continue;
                if (!PIPE_WRITERS(*inode))
                        break;
@@ -158,7 +200,7 @@ pipe_readv(struct file *filp, const struct iovec *_iov,
                }
                pipe_wait(inode);
        }
-       up(PIPE_SEM(*inode));
+       mutex_unlock(PIPE_MUTEX(*inode));
        /* Signal writers asynchronously that there is more room.  */
        if (do_wakeup) {
                wake_up_interruptible(PIPE_WAIT(*inode));
@@ -181,15 +223,12 @@ pipe_writev(struct file *filp, const struct iovec *_iov,
            unsigned long nr_segs, loff_t *ppos)
 {
        struct inode *inode = filp->f_dentry->d_inode;
+       struct pipe_inode_info *info;
        ssize_t ret;
-       size_t min;
        int do_wakeup;
        struct iovec *iov = (struct iovec *)_iov;
        size_t total_len;
-
-       /* pwrite is not allowed on pipes. */
-       if (unlikely(ppos != &filp->f_pos))
-               return -ESPIPE;
+       ssize_t chars;
 
        total_len = iov_length(iov, nr_segs);
        /* Null write succeeds. */
@@ -198,48 +237,92 @@ pipe_writev(struct file *filp, const struct iovec *_iov,
 
        do_wakeup = 0;
        ret = 0;
-       min = total_len;
-       if (min > PIPE_BUF)
-               min = 1;
-       down(PIPE_SEM(*inode));
+       mutex_lock(PIPE_MUTEX(*inode));
+       info = inode->i_pipe;
+
+       if (!PIPE_READERS(*inode)) {
+               send_sig(SIGPIPE, current, 0);
+               ret = -EPIPE;
+               goto out;
+       }
+
+       /* We try to merge small writes */
+       chars = total_len & (PAGE_SIZE-1); /* size of the last buffer */
+       if (info->nrbufs && chars != 0) {
+               int lastbuf = (info->curbuf + info->nrbufs - 1) & (PIPE_BUFFERS-1);
+               struct pipe_buffer *buf = info->bufs + lastbuf;
+               struct pipe_buf_operations *ops = buf->ops;
+               int offset = buf->offset + buf->len;
+               if (ops->can_merge && offset + chars <= PAGE_SIZE) {
+                       void *addr = ops->map(filp, info, buf);
+                       int error = pipe_iov_copy_from_user(offset + addr, iov, chars);
+                       ops->unmap(info, buf);
+                       ret = error;
+                       do_wakeup = 1;
+                       if (error)
+                               goto out;
+                       buf->len += chars;
+                       total_len -= chars;
+                       ret = chars;
+                       if (!total_len)
+                               goto out;
+               }
+       }
+
        for (;;) {
-               int free;
+               int bufs;
                if (!PIPE_READERS(*inode)) {
                        send_sig(SIGPIPE, current, 0);
                        if (!ret) ret = -EPIPE;
                        break;
                }
-               free = PIPE_FREE(*inode);
-               if (free >= min) {
-                       /* transfer data */
-                       ssize_t chars = PIPE_MAX_WCHUNK(*inode);
-                       char *pipebuf = PIPE_BASE(*inode) + PIPE_END(*inode);
+               bufs = info->nrbufs;
+               if (bufs < PIPE_BUFFERS) {
+                       int newbuf = (info->curbuf + bufs) & (PIPE_BUFFERS-1);
+                       struct pipe_buffer *buf = info->bufs + newbuf;
+                       struct page *page = info->tmp_page;
+                       int error;
+
+                       if (!page) {
+                               page = alloc_page(GFP_HIGHUSER);
+                               if (unlikely(!page)) {
+                                       ret = ret ? : -ENOMEM;
+                                       break;
+                               }
+                               info->tmp_page = page;
+                       }
                        /* Always wakeup, even if the copy fails. Otherwise
                         * we lock up (O_NONBLOCK-)readers that sleep due to
                         * syscall merging.
+                        * FIXME! Is this really true?
                         */
                        do_wakeup = 1;
+                       chars = PAGE_SIZE;
                        if (chars > total_len)
                                chars = total_len;
-                       if (chars > free)
-                               chars = free;
 
-                       if (pipe_iov_copy_from_user(pipebuf, iov, chars)) {
+                       error = pipe_iov_copy_from_user(kmap(page), iov, chars);
+                       kunmap(page);
+                       if (unlikely(error)) {
                                if (!ret) ret = -EFAULT;
                                break;
                        }
                        ret += chars;
 
-                       PIPE_LEN(*inode) += chars;
+                       /* Insert it into the buffer array */
+                       buf->page = page;
+                       buf->ops = &anon_pipe_buf_ops;
+                       buf->offset = 0;
+                       buf->len = chars;
+                       info->nrbufs = ++bufs;
+                       info->tmp_page = NULL;
+
                        total_len -= chars;
                        if (!total_len)
                                break;
                }
-               if (PIPE_FREE(*inode) && ret) {
-                       /* handle cyclic data buffers */
-                       min = 1;
+               if (bufs < PIPE_BUFFERS)
                        continue;
-               }
                if (filp->f_flags & O_NONBLOCK) {
                        if (!ret) ret = -EAGAIN;
                        break;
@@ -257,13 +340,14 @@ pipe_writev(struct file *filp, const struct iovec *_iov,
                pipe_wait(inode);
                PIPE_WAITING_WRITERS(*inode)--;
        }
-       up(PIPE_SEM(*inode));
+out:
+       mutex_unlock(PIPE_MUTEX(*inode));
        if (do_wakeup) {
                wake_up_interruptible(PIPE_WAIT(*inode));
                kill_fasync(PIPE_FASYNC_READERS(*inode), SIGIO, POLL_IN);
        }
        if (ret > 0)
-               inode_update_time(inode, 1);    /* mtime and ctime */
+               file_update_time(filp);
        return ret;
 }
 
@@ -291,9 +375,23 @@ static int
 pipe_ioctl(struct inode *pino, struct file *filp,
           unsigned int cmd, unsigned long arg)
 {
+       struct inode *inode = filp->f_dentry->d_inode;
+       struct pipe_inode_info *info;
+       int count, buf, nrbufs;
+
        switch (cmd) {
                case FIONREAD:
-                       return put_user(PIPE_LEN(*pino), (int __user *)arg);
+                       mutex_lock(PIPE_MUTEX(*inode));
+                       info =  inode->i_pipe;
+                       count = 0;
+                       buf = info->curbuf;
+                       nrbufs = info->nrbufs;
+                       while (--nrbufs >= 0) {
+                               count += info->bufs[buf].len;
+                               buf = (buf+1) & (PIPE_BUFFERS-1);
+                       }
+                       mutex_unlock(PIPE_MUTEX(*inode));
+                       return put_user(count, (int __user *)arg);
                default:
                        return -EINVAL;
        }
@@ -305,41 +403,47 @@ pipe_poll(struct file *filp, poll_table *wait)
 {
        unsigned int mask;
        struct inode *inode = filp->f_dentry->d_inode;
+       struct pipe_inode_info *info = inode->i_pipe;
+       int nrbufs;
 
        poll_wait(filp, PIPE_WAIT(*inode), wait);
 
        /* Reading only -- no need for acquiring the semaphore.  */
-       mask = POLLIN | POLLRDNORM;
-       if (PIPE_EMPTY(*inode))
-               mask = POLLOUT | POLLWRNORM;
-       if (!PIPE_WRITERS(*inode) && filp->f_version != PIPE_WCOUNTER(*inode))
-               mask |= POLLHUP;
-       if (!PIPE_READERS(*inode))
-               mask |= POLLERR;
+       nrbufs = info->nrbufs;
+       mask = 0;
+       if (filp->f_mode & FMODE_READ) {
+               mask = (nrbufs > 0) ? POLLIN | POLLRDNORM : 0;
+               if (!PIPE_WRITERS(*inode) && filp->f_version != PIPE_WCOUNTER(*inode))
+                       mask |= POLLHUP;
+       }
+
+       if (filp->f_mode & FMODE_WRITE) {
+               mask |= (nrbufs < PIPE_BUFFERS) ? POLLOUT | POLLWRNORM : 0;
+               /*
+                * Most Unices do not set POLLERR for FIFOs but on Linux they
+                * behave exactly like pipes for poll().
+                */
+               if (!PIPE_READERS(*inode))
+                       mask |= POLLERR;
+       }
 
        return mask;
 }
 
-/* FIXME: most Unices do not set POLLERR for fifos */
-#define fifo_poll pipe_poll
-
 static int
 pipe_release(struct inode *inode, int decr, int decw)
 {
-       down(PIPE_SEM(*inode));
+       mutex_lock(PIPE_MUTEX(*inode)); /* hold the pipe mutex while the reader/writer counts are adjusted */
        PIPE_READERS(*inode) -= decr;
        PIPE_WRITERS(*inode) -= decw;
        if (!PIPE_READERS(*inode) && !PIPE_WRITERS(*inode)) {
-               struct pipe_inode_info *info = inode->i_pipe;
-               inode->i_pipe = NULL;
-               free_page((unsigned long) info->base);
-               kfree(info);
+               free_pipe_info(inode); /* no readers or writers left: free the buffer pages and the pipe_inode_info */
        } else {
                wake_up_interruptible(PIPE_WAIT(*inode));
                kill_fasync(PIPE_FASYNC_READERS(*inode), SIGIO, POLL_IN);
                kill_fasync(PIPE_FASYNC_WRITERS(*inode), SIGIO, POLL_OUT);
        }
-       up(PIPE_SEM(*inode));
+       mutex_unlock(PIPE_MUTEX(*inode));
 
        return 0;
 }
@@ -350,9 +454,9 @@ pipe_read_fasync(int fd, struct file *filp, int on)
        struct inode *inode = filp->f_dentry->d_inode;
        int retval;
 
-       down(PIPE_SEM(*inode));
+       mutex_lock(PIPE_MUTEX(*inode));
        retval = fasync_helper(fd, filp, on, PIPE_FASYNC_READERS(*inode));
-       up(PIPE_SEM(*inode));
+       mutex_unlock(PIPE_MUTEX(*inode));
 
        if (retval < 0)
                return retval;
@@ -367,9 +471,9 @@ pipe_write_fasync(int fd, struct file *filp, int on)
        struct inode *inode = filp->f_dentry->d_inode;
        int retval;
 
-       down(PIPE_SEM(*inode));
+       mutex_lock(PIPE_MUTEX(*inode));
        retval = fasync_helper(fd, filp, on, PIPE_FASYNC_WRITERS(*inode));
-       up(PIPE_SEM(*inode));
+       mutex_unlock(PIPE_MUTEX(*inode));
 
        if (retval < 0)
                return retval;
@@ -384,14 +488,14 @@ pipe_rdwr_fasync(int fd, struct file *filp, int on)
        struct inode *inode = filp->f_dentry->d_inode;
        int retval;
 
-       down(PIPE_SEM(*inode));
+       mutex_lock(PIPE_MUTEX(*inode));
 
        retval = fasync_helper(fd, filp, on, PIPE_FASYNC_READERS(*inode));
 
        if (retval >= 0)
                retval = fasync_helper(fd, filp, on, PIPE_FASYNC_WRITERS(*inode));
 
-       up(PIPE_SEM(*inode));
+       mutex_unlock(PIPE_MUTEX(*inode));
 
        if (retval < 0)
                return retval;
@@ -430,9 +534,9 @@ pipe_read_open(struct inode *inode, struct file *filp)
 {
        /* We could have perhaps used atomic_t, but this and friends
           below are the only places.  So it doesn't seem worthwhile.  */
-       down(PIPE_SEM(*inode));
+       mutex_lock(PIPE_MUTEX(*inode));
        PIPE_READERS(*inode)++;
-       up(PIPE_SEM(*inode));
+       mutex_unlock(PIPE_MUTEX(*inode));
 
        return 0;
 }
@@ -440,9 +544,9 @@ pipe_read_open(struct inode *inode, struct file *filp)
 static int
 pipe_write_open(struct inode *inode, struct file *filp)
 {
-       down(PIPE_SEM(*inode));
+       mutex_lock(PIPE_MUTEX(*inode)); /* hold the pipe mutex across the writer-count increment */
        PIPE_WRITERS(*inode)++;
-       up(PIPE_SEM(*inode));
+       mutex_unlock(PIPE_MUTEX(*inode));
 
        return 0;
 }
@@ -450,12 +554,12 @@ pipe_write_open(struct inode *inode, struct file *filp)
 static int
 pipe_rdwr_open(struct inode *inode, struct file *filp)
 {
-       down(PIPE_SEM(*inode));
+       mutex_lock(PIPE_MUTEX(*inode)); /* hold the pipe mutex while bumping whichever counts f_mode selects */
        if (filp->f_mode & FMODE_READ)
                PIPE_READERS(*inode)++;
        if (filp->f_mode & FMODE_WRITE)
                PIPE_WRITERS(*inode)++;
-       up(PIPE_SEM(*inode));
+       mutex_unlock(PIPE_MUTEX(*inode));
 
        return 0;
 }
@@ -469,7 +573,7 @@ struct file_operations read_fifo_fops = {
        .read           = pipe_read,
        .readv          = pipe_readv,
        .write          = bad_pipe_w,
-       .poll           = fifo_poll,
+       .poll           = pipe_poll,
        .ioctl          = pipe_ioctl,
        .open           = pipe_read_open,
        .release        = pipe_read_release,
@@ -481,7 +585,7 @@ struct file_operations write_fifo_fops = {
        .read           = bad_pipe_r,
        .write          = pipe_write,
        .writev         = pipe_writev,
-       .poll           = fifo_poll,
+       .poll           = pipe_poll,
        .ioctl          = pipe_ioctl,
        .open           = pipe_write_open,
        .release        = pipe_write_release,
@@ -494,14 +598,14 @@ struct file_operations rdwr_fifo_fops = {
        .readv          = pipe_readv,
        .write          = pipe_write,
        .writev         = pipe_writev,
-       .poll           = fifo_poll,
+       .poll           = pipe_poll,
        .ioctl          = pipe_ioctl,
        .open           = pipe_rdwr_open,
        .release        = pipe_rdwr_release,
        .fasync         = pipe_rdwr_fasync,
 };
 
-struct file_operations read_pipe_fops = {
+static struct file_operations read_pipe_fops = {
        .llseek         = no_llseek,
        .read           = pipe_read,
        .readv          = pipe_readv,
@@ -513,7 +617,7 @@ struct file_operations read_pipe_fops = {
        .fasync         = pipe_read_fasync,
 };
 
-struct file_operations write_pipe_fops = {
+static struct file_operations write_pipe_fops = {
        .llseek         = no_llseek,
        .read           = bad_pipe_r,
        .write          = pipe_write,
@@ -525,7 +629,7 @@ struct file_operations write_pipe_fops = {
        .fasync         = pipe_write_fasync,
 };
 
-struct file_operations rdwr_pipe_fops = {
+static struct file_operations rdwr_pipe_fops = {
        .llseek         = no_llseek,
        .read           = pipe_read,
        .readv          = pipe_readv,
@@ -538,29 +642,37 @@ struct file_operations rdwr_pipe_fops = {
        .fasync         = pipe_rdwr_fasync,
 };
 
-struct inode* pipe_new(struct inode* inode)
+void free_pipe_info(struct inode *inode) /* detach inode->i_pipe and free it together with any pages it still holds */
 {
-       unsigned long page;
+       int i;
+       struct pipe_inode_info *info = inode->i_pipe;
 
-       page = __get_free_page(GFP_USER);
-       if (!page)
-               return NULL;
+       inode->i_pipe = NULL; /* clear the inode's pointer before tearing the structure down */
+       for (i = 0; i < PIPE_BUFFERS; i++) { /* visit every slot rather than only the nrbufs in use */
+               struct pipe_buffer *buf = info->bufs + i;
+               if (buf->ops) /* pipe_readv sets ops to NULL when it drains a slot, so non-NULL means still populated */
+                       buf->ops->release(info, buf);
+       }
+       if (info->tmp_page) /* spare page cached on the pipe (see anon_pipe_buf_release / pipe_writev) */
+               __free_page(info->tmp_page);
+       kfree(info);
+}
+
+struct inode* pipe_new(struct inode* inode) /* allocate a zeroed pipe_inode_info and attach it to inode; returns inode, or NULL if kmalloc fails */
+{
+       struct pipe_inode_info *info;
 
-       inode->i_pipe = kmalloc(sizeof(struct pipe_inode_info), GFP_KERNEL);
-       if (!inode->i_pipe)
+       info = kmalloc(sizeof(struct pipe_inode_info), GFP_KERNEL);
+       if (!info)
                goto fail_page;
+       memset(info, 0, sizeof(*info)); /* every field of the new pipe_inode_info starts out zeroed */
+       inode->i_pipe = info; /* attach first: the PIPE_*(*inode) accessors below take the inode */
 
        init_waitqueue_head(PIPE_WAIT(*inode));
-       PIPE_BASE(*inode) = (char*) page;
-       PIPE_START(*inode) = PIPE_LEN(*inode) = 0;
-       PIPE_READERS(*inode) = PIPE_WRITERS(*inode) = 0;
-       PIPE_WAITING_WRITERS(*inode) = 0;
        PIPE_RCOUNTER(*inode) = PIPE_WCOUNTER(*inode) = 1;
-       *PIPE_FASYNC_READERS(*inode) = *PIPE_FASYNC_WRITERS(*inode) = NULL;
 
        return inode;
 fail_page:
-       free_page(page);
        return NULL;
 }
 
@@ -656,13 +768,13 @@ int do_pipe(int *fd)
        f1->f_pos = f2->f_pos = 0;
        f1->f_flags = O_RDONLY;
        f1->f_op = &read_pipe_fops;
-       f1->f_mode = 1;
+       f1->f_mode = FMODE_READ;
        f1->f_version = 0;
 
        /* write file */
        f2->f_flags = O_WRONLY;
        f2->f_op = &write_pipe_fops;
-       f2->f_mode = 2;
+       f2->f_mode = FMODE_WRITE;
        f2->f_version = 0;
 
        fd_install(i, f1);
@@ -676,9 +788,7 @@ close_f12_inode_i_j:
 close_f12_inode_i:
        put_unused_fd(i);
 close_f12_inode:
-       free_page((unsigned long) PIPE_BASE(*inode));
-       kfree(inode->i_pipe);
-       inode->i_pipe = NULL;
+       free_pipe_info(inode);
        iput(inode);
 close_f12:
        put_filp(f2);
@@ -688,8 +798,6 @@ no_files:
        return error;   
 }
 
-EXPORT_SYMBOL_GPL(do_pipe);
-
 /*
  * pipefs should _never_ be mounted by userland - too much of security hassle,
  * no real gain from having the whole whorehouse mounted. So we don't need
@@ -728,5 +836,5 @@ static void __exit exit_pipe_fs(void)
        mntput(pipe_mnt);
 }
 
-module_init(init_pipe_fs)
-module_exit(exit_pipe_fs)
+fs_initcall(init_pipe_fs);
+module_exit(exit_pipe_fs);