vserver 1.9.5.x5
[linux-2.6.git] / fs / pipe.c
index 2b42a25..13db326 100644 (file)
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -14,6 +14,8 @@
 #include <linux/mount.h>
 #include <linux/pipe_fs_i.h>
 #include <linux/uio.h>
+#include <linux/highmem.h>
+
 #include <asm/uaccess.h>
 #include <asm/ioctls.h>
 
@@ -84,11 +86,40 @@ pipe_iov_copy_to_user(struct iovec *iov, const void *from, unsigned long len)
        return 0;
 }
 
+static void anon_pipe_buf_release(struct pipe_inode_info *info, struct pipe_buffer *buf)
+{
+       struct page *page = buf->page;
+
+       if (info->tmp_page) {
+               __free_page(page);
+               return;
+       }
+       info->tmp_page = page;
+}
+
+static void *anon_pipe_buf_map(struct file *file, struct pipe_inode_info *info, struct pipe_buffer *buf)
+{
+       return kmap(buf->page);
+}
+
+static void anon_pipe_buf_unmap(struct pipe_inode_info *info, struct pipe_buffer *buf)
+{
+       kunmap(buf->page);
+}
+
+static struct pipe_buf_operations anon_pipe_buf_ops = {
+       .can_merge = 1,
+       .map = anon_pipe_buf_map,
+       .unmap = anon_pipe_buf_unmap,
+       .release = anon_pipe_buf_release,
+};
+
 static ssize_t
 pipe_readv(struct file *filp, const struct iovec *_iov,
           unsigned long nr_segs, loff_t *ppos)
 {
        struct inode *inode = filp->f_dentry->d_inode;
+       struct pipe_inode_info *info;
        int do_wakeup;
        ssize_t ret;
        struct iovec *iov = (struct iovec *)_iov;
@@ -102,32 +133,43 @@ pipe_readv(struct file *filp, const struct iovec *_iov,
        do_wakeup = 0;
        ret = 0;
        down(PIPE_SEM(*inode));
+       info = inode->i_pipe;
        for (;;) {
-               int size = PIPE_LEN(*inode);
-               if (size) {
-                       char *pipebuf = PIPE_BASE(*inode) + PIPE_START(*inode);
-                       ssize_t chars = PIPE_MAX_RCHUNK(*inode);
+               int bufs = info->nrbufs;
+               if (bufs) {
+                       int curbuf = info->curbuf;
+                       struct pipe_buffer *buf = info->bufs + curbuf;
+                       struct pipe_buf_operations *ops = buf->ops;
+                       void *addr;
+                       size_t chars = buf->len;
+                       int error;
 
                        if (chars > total_len)
                                chars = total_len;
-                       if (chars > size)
-                               chars = size;
 
-                       if (pipe_iov_copy_to_user(iov, pipebuf, chars)) {
+                       addr = ops->map(filp, info, buf);
+                       error = pipe_iov_copy_to_user(iov, addr + buf->offset, chars);
+                       ops->unmap(info, buf);
+                       if (unlikely(error)) {
                                if (!ret) ret = -EFAULT;
                                break;
                        }
                        ret += chars;
-
-                       PIPE_START(*inode) += chars;
-                       PIPE_START(*inode) &= (PIPE_SIZE - 1);
-                       PIPE_LEN(*inode) -= chars;
+                       buf->offset += chars;
+                       buf->len -= chars;
+                       if (!buf->len) {
+                               buf->ops = NULL;
+                               ops->release(info, buf);
+                               curbuf = (curbuf + 1) & (PIPE_BUFFERS-1);
+                               info->curbuf = curbuf;
+                               info->nrbufs = --bufs;
+                               do_wakeup = 1;
+                       }
                        total_len -= chars;
-                       do_wakeup = 1;
                        if (!total_len)
                                break;  /* common path: read succeeded */
                }
-               if (PIPE_LEN(*inode)) /* test for cyclic buffers */
+               if (bufs)       /* More to do? */
                        continue;
                if (!PIPE_WRITERS(*inode))
                        break;
@@ -177,8 +219,8 @@ pipe_writev(struct file *filp, const struct iovec *_iov,
            unsigned long nr_segs, loff_t *ppos)
 {
        struct inode *inode = filp->f_dentry->d_inode;
+       struct pipe_inode_info *info;
        ssize_t ret;
-       size_t min;
        int do_wakeup;
        struct iovec *iov = (struct iovec *)_iov;
        size_t total_len;
@@ -190,48 +232,91 @@ pipe_writev(struct file *filp, const struct iovec *_iov,
 
        do_wakeup = 0;
        ret = 0;
-       min = total_len;
-       if (min > PIPE_BUF)
-               min = 1;
        down(PIPE_SEM(*inode));
+       info = inode->i_pipe;
+
+       if (!PIPE_READERS(*inode)) {
+               send_sig(SIGPIPE, current, 0);
+               ret = -EPIPE;
+               goto out;
+       }
+
+       /* We try to merge small writes */
+       if (info->nrbufs && total_len < PAGE_SIZE) {
+               int lastbuf = (info->curbuf + info->nrbufs - 1) & (PIPE_BUFFERS-1);
+               struct pipe_buffer *buf = info->bufs + lastbuf;
+               struct pipe_buf_operations *ops = buf->ops;
+               int offset = buf->offset + buf->len;
+               if (ops->can_merge && offset + total_len <= PAGE_SIZE) {
+                       void *addr = ops->map(filp, info, buf);
+                       int error = pipe_iov_copy_from_user(offset + addr, iov, total_len);
+                       ops->unmap(info, buf);
+                       ret = error;
+                       do_wakeup = 1;
+                       if (error)
+                               goto out;
+                       buf->len += total_len;
+                       ret = total_len;
+                       goto out;
+               }
+                       
+       }
+
        for (;;) {
-               int free;
+               int bufs;
                if (!PIPE_READERS(*inode)) {
                        send_sig(SIGPIPE, current, 0);
                        if (!ret) ret = -EPIPE;
                        break;
                }
-               free = PIPE_FREE(*inode);
-               if (free >= min) {
-                       /* transfer data */
-                       ssize_t chars = PIPE_MAX_WCHUNK(*inode);
-                       char *pipebuf = PIPE_BASE(*inode) + PIPE_END(*inode);
+               bufs = info->nrbufs;
+               if (bufs < PIPE_BUFFERS) {
+                       ssize_t chars;
+                       int newbuf = (info->curbuf + bufs) & (PIPE_BUFFERS-1);
+                       struct pipe_buffer *buf = info->bufs + newbuf;
+                       struct page *page = info->tmp_page;
+                       int error;
+
+                       if (!page) {
+                               page = alloc_page(GFP_HIGHUSER);
+                               if (unlikely(!page)) {
+                                       ret = ret ? : -ENOMEM;
+                                       break;
+                               }
+                               info->tmp_page = page;
+                       }
                        /* Always wakeup, even if the copy fails. Otherwise
                         * we lock up (O_NONBLOCK-)readers that sleep due to
                         * syscall merging.
+                        * FIXME! Is this really true?
                         */
                        do_wakeup = 1;
+                       chars = PAGE_SIZE;
                        if (chars > total_len)
                                chars = total_len;
-                       if (chars > free)
-                               chars = free;
 
-                       if (pipe_iov_copy_from_user(pipebuf, iov, chars)) {
+                       error = pipe_iov_copy_from_user(kmap(page), iov, chars);
+                       kunmap(page);
+                       if (unlikely(error)) {
                                if (!ret) ret = -EFAULT;
                                break;
                        }
                        ret += chars;
 
-                       PIPE_LEN(*inode) += chars;
+                       /* Insert it into the buffer array */
+                       buf->page = page;
+                       buf->ops = &anon_pipe_buf_ops;
+                       buf->offset = 0;
+                       buf->len = chars;
+                       info->nrbufs = ++bufs;
+                       info->tmp_page = NULL;
+
                        total_len -= chars;
                        if (!total_len)
                                break;
                }
-               if (PIPE_FREE(*inode) && ret) {
-                       /* handle cyclic data buffers */
-                       min = 1;
+               if (bufs < PIPE_BUFFERS)
                        continue;
-               }
                if (filp->f_flags & O_NONBLOCK) {
                        if (!ret) ret = -EAGAIN;
                        break;
@@ -249,6 +334,7 @@ pipe_writev(struct file *filp, const struct iovec *_iov,
                pipe_wait(inode);
                PIPE_WAITING_WRITERS(*inode)--;
        }
+out:
        up(PIPE_SEM(*inode));
        if (do_wakeup) {
                wake_up_interruptible(PIPE_WAIT(*inode));
@@ -283,9 +369,23 @@ static int
 pipe_ioctl(struct inode *pino, struct file *filp,
           unsigned int cmd, unsigned long arg)
 {
+       struct inode *inode = filp->f_dentry->d_inode;
+       struct pipe_inode_info *info;
+       int count, buf, nrbufs;
+
        switch (cmd) {
                case FIONREAD:
-                       return put_user(PIPE_LEN(*pino), (int __user *)arg);
+                       down(PIPE_SEM(*inode));
+                       info =  inode->i_pipe;
+                       count = 0;
+                       buf = info->curbuf;
+                       nrbufs = info->nrbufs;
+                       while (--nrbufs >= 0) {
+                               count += info->bufs[buf].len;
+                               buf = (buf+1) & (PIPE_BUFFERS-1);
+                       }
+                       up(PIPE_SEM(*inode));
+                       return put_user(count, (int __user *)arg);
                default:
                        return -EINVAL;
        }
@@ -297,17 +397,25 @@ pipe_poll(struct file *filp, poll_table *wait)
 {
        unsigned int mask;
        struct inode *inode = filp->f_dentry->d_inode;
+       struct pipe_inode_info *info = inode->i_pipe;
+       int nrbufs;
 
        poll_wait(filp, PIPE_WAIT(*inode), wait);
 
        /* Reading only -- no need for acquiring the semaphore.  */
-       mask = POLLIN | POLLRDNORM;
-       if (PIPE_EMPTY(*inode))
-               mask = POLLOUT | POLLWRNORM;
-       if (!PIPE_WRITERS(*inode) && filp->f_version != PIPE_WCOUNTER(*inode))
-               mask |= POLLHUP;
-       if (!PIPE_READERS(*inode))
-               mask |= POLLERR;
+       nrbufs = info->nrbufs;
+       mask = 0;
+       if (filp->f_mode & FMODE_READ) {
+               mask = (nrbufs > 0) ? POLLIN | POLLRDNORM : 0;
+               if (!PIPE_WRITERS(*inode) && filp->f_version != PIPE_WCOUNTER(*inode))
+                       mask |= POLLHUP;
+       }
+
+       if (filp->f_mode & FMODE_WRITE) {
+               mask |= (nrbufs < PIPE_BUFFERS) ? POLLOUT | POLLWRNORM : 0;
+               if (!PIPE_READERS(*inode))
+                       mask |= POLLERR;
+       }
 
        return mask;
 }
@@ -322,10 +430,7 @@ pipe_release(struct inode *inode, int decr, int decw)
        PIPE_READERS(*inode) -= decr;
        PIPE_WRITERS(*inode) -= decw;
        if (!PIPE_READERS(*inode) && !PIPE_WRITERS(*inode)) {
-               struct pipe_inode_info *info = inode->i_pipe;
-               inode->i_pipe = NULL;
-               free_page((unsigned long) info->base);
-               kfree(info);
+               free_pipe_info(inode);
        } else {
                wake_up_interruptible(PIPE_WAIT(*inode));
                kill_fasync(PIPE_FASYNC_READERS(*inode), SIGIO, POLL_IN);
@@ -530,29 +635,37 @@ struct file_operations rdwr_pipe_fops = {
        .fasync         = pipe_rdwr_fasync,
 };
 
-struct inode* pipe_new(struct inode* inode)
+void free_pipe_info(struct inode *inode)
 {
-       unsigned long page;
+       int i;
+       struct pipe_inode_info *info = inode->i_pipe;
+
+       inode->i_pipe = NULL;
+       for (i = 0; i < PIPE_BUFFERS; i++) {
+               struct pipe_buffer *buf = info->bufs + i;
+               if (buf->ops)
+                       buf->ops->release(info, buf);
+       }
+       if (info->tmp_page)
+               __free_page(info->tmp_page);
+       kfree(info);
+}
 
-       page = __get_free_page(GFP_USER);
-       if (!page)
-               return NULL;
+struct inode* pipe_new(struct inode* inode)
+{
+       struct pipe_inode_info *info;
 
-       inode->i_pipe = kmalloc(sizeof(struct pipe_inode_info), GFP_KERNEL);
-       if (!inode->i_pipe)
+       info = kmalloc(sizeof(struct pipe_inode_info), GFP_KERNEL);
+       if (!info)
                goto fail_page;
+       memset(info, 0, sizeof(*info));
+       inode->i_pipe = info;
 
        init_waitqueue_head(PIPE_WAIT(*inode));
-       PIPE_BASE(*inode) = (char*) page;
-       PIPE_START(*inode) = PIPE_LEN(*inode) = 0;
-       PIPE_READERS(*inode) = PIPE_WRITERS(*inode) = 0;
-       PIPE_WAITING_WRITERS(*inode) = 0;
        PIPE_RCOUNTER(*inode) = PIPE_WCOUNTER(*inode) = 1;
-       *PIPE_FASYNC_READERS(*inode) = *PIPE_FASYNC_WRITERS(*inode) = NULL;
 
        return inode;
 fail_page:
-       free_page(page);
        return NULL;
 }
 
@@ -668,9 +781,7 @@ close_f12_inode_i_j:
 close_f12_inode_i:
        put_unused_fd(i);
 close_f12_inode:
-       free_page((unsigned long) PIPE_BASE(*inode));
-       kfree(inode->i_pipe);
-       inode->i_pipe = NULL;
+       free_pipe_info(inode);
        iput(inode);
 close_f12:
        put_filp(f2);
@@ -718,5 +829,5 @@ static void __exit exit_pipe_fs(void)
        mntput(pipe_mnt);
 }
 
-module_init(init_pipe_fs)
-module_exit(exit_pipe_fs)
+fs_initcall(init_pipe_fs);
+module_exit(exit_pipe_fs);