X-Git-Url: http://git.onelab.eu/?a=blobdiff_plain;f=fs%2Fsplice.c;h=2fca6ebf4cc2d17ceb05ccb6cbe7f93ab3571a87;hb=refs%2Fheads%2Fvserver;hp=a285fd746dc0d34aa97762a6e138416acae292e7;hpb=16cf0ec7408f389279d413869e94c1a351392f97;p=linux-2.6.git diff --git a/fs/splice.c b/fs/splice.c index a285fd746..2fca6ebf4 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -12,7 +12,7 @@ * Jens to support splicing to files, network, direct splicing, etc and * fixing lots of bugs. * - * Copyright (C) 2005-2006 Jens Axboe + * Copyright (C) 2005-2006 Jens Axboe * Copyright (C) 2005-2006 Linus Torvalds * Copyright (C) 2006 Ingo Molnar * @@ -42,7 +42,7 @@ struct splice_pipe_desc { struct partial_page *partial; /* pages[] may not be contig */ int nr_pages; /* number of pages in map */ unsigned int flags; /* splice flags */ - struct pipe_buf_operations *ops;/* ops associated with output pipe */ + const struct pipe_buf_operations *ops;/* ops associated with output pipe */ }; /* @@ -55,31 +55,43 @@ static int page_cache_pipe_buf_steal(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { struct page *page = buf->page; - struct address_space *mapping = page_mapping(page); + struct address_space *mapping; lock_page(page); - WARN_ON(!PageUptodate(page)); + mapping = page_mapping(page); + if (mapping) { + WARN_ON(!PageUptodate(page)); - /* - * At least for ext2 with nobh option, we need to wait on writeback - * completing on this page, since we'll remove it from the pagecache. - * Otherwise truncate wont wait on the page, allowing the disk - * blocks to be reused by someone else before we actually wrote our - * data to them. fs corruption ensues. - */ - wait_on_page_writeback(page); + /* + * At least for ext2 with nobh option, we need to wait on + * writeback completing on this page, since we'll remove it + * from the pagecache. Otherwise truncate wont wait on the + * page, allowing the disk blocks to be reused by someone else + * before we actually wrote our data to them. fs corruption + * ensues. + */ + wait_on_page_writeback(page); - if (PagePrivate(page)) - try_to_release_page(page, mapping_gfp_mask(mapping)); + if (PagePrivate(page)) + try_to_release_page(page, GFP_KERNEL); - if (!remove_mapping(mapping, page)) { - unlock_page(page); - return 1; + /* + * If we succeeded in removing the mapping, set LRU flag + * and return good. + */ + if (remove_mapping(mapping, page)) { + buf->flags |= PIPE_BUF_FLAG_LRU; + return 0; + } } - buf->flags |= PIPE_BUF_FLAG_LRU; - return 0; + /* + * Raced with truncate or failed to remove page from current + * address space, unlock and return failure. + */ + unlock_page(page); + return 1; } static void page_cache_pipe_buf_release(struct pipe_inode_info *pipe, @@ -127,7 +139,7 @@ error: return err; } -static struct pipe_buf_operations page_cache_pipe_buf_ops = { +static const struct pipe_buf_operations page_cache_pipe_buf_ops = { .can_merge = 0, .map = generic_pipe_buf_map, .unmap = generic_pipe_buf_unmap, @@ -147,7 +159,7 @@ static int user_page_pipe_buf_steal(struct pipe_inode_info *pipe, return generic_pipe_buf_steal(pipe, buf); } -static struct pipe_buf_operations user_page_pipe_buf_ops = { +static const struct pipe_buf_operations user_page_pipe_buf_ops = { .can_merge = 0, .map = generic_pipe_buf_map, .unmap = generic_pipe_buf_unmap, @@ -321,7 +333,7 @@ __generic_file_splice_read(struct file *in, loff_t *ppos, break; error = add_to_page_cache_lru(page, mapping, index, - mapping_gfp_mask(mapping)); + GFP_KERNEL); if (unlikely(error)) { page_cache_release(page); if (error == -EEXIST) @@ -545,7 +557,6 @@ static int pipe_to_file(struct pipe_inode_info *pipe, struct pipe_buffer *buf, { struct file *file = sd->file; struct address_space *mapping = file->f_mapping; - gfp_t gfp_mask = mapping_gfp_mask(mapping); unsigned int offset, this_len; struct page *page; pgoff_t index; @@ -579,7 +590,7 @@ static int pipe_to_file(struct pipe_inode_info *pipe, struct pipe_buffer *buf, goto find_page; page = buf->page; - if (add_to_page_cache(page, mapping, index, gfp_mask)) { + if (add_to_page_cache(page, mapping, index, GFP_KERNEL)) { unlock_page(page); goto find_page; } @@ -595,13 +606,13 @@ find_page: ret = -ENOMEM; page = page_cache_alloc_cold(mapping); if (unlikely(!page)) - goto out_nomem; + goto out_ret; /* * This will also lock the page */ ret = add_to_page_cache_lru(page, mapping, index, - gfp_mask); + GFP_KERNEL); if (unlikely(ret)) goto out; } @@ -654,7 +665,7 @@ find_page: if (sd->pos + this_len > isize) vmtruncate(mapping->host, isize); - goto out; + goto out_ret; } if (buf->page != page) { @@ -686,7 +697,7 @@ find_page: out: page_cache_release(page); unlock_page(page); -out_nomem: +out_ret: return ret; } @@ -695,9 +706,9 @@ out_nomem: * key here is the 'actor' worker passed in that actually moves the data * to the wanted destination. See pipe_to_file/pipe_to_sendpage above. */ -ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out, - loff_t *ppos, size_t len, unsigned int flags, - splice_actor *actor) +static ssize_t __splice_from_pipe(struct pipe_inode_info *pipe, + struct file *out, loff_t *ppos, size_t len, + unsigned int flags, splice_actor *actor) { int ret, do_wakeup, err; struct splice_desc sd; @@ -710,13 +721,10 @@ ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out, sd.file = out; sd.pos = *ppos; - if (pipe->inode) - mutex_lock(&pipe->inode->i_mutex); - for (;;) { if (pipe->nrbufs) { struct pipe_buffer *buf = pipe->bufs + pipe->curbuf; - struct pipe_buf_operations *ops = buf->ops; + const struct pipe_buf_operations *ops = buf->ops; sd.len = buf->len; if (sd.len > sd.total_len) @@ -785,9 +793,6 @@ ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out, pipe_wait(pipe); } - if (pipe->inode) - mutex_unlock(&pipe->inode->i_mutex); - if (do_wakeup) { smp_mb(); if (waitqueue_active(&pipe->wait)) @@ -798,6 +803,73 @@ ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out, return ret; } +ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out, + loff_t *ppos, size_t len, unsigned int flags, + splice_actor *actor) +{ + ssize_t ret; + struct inode *inode = out->f_mapping->host; + + /* + * The actor worker might be calling ->prepare_write and + * ->commit_write. Most of the time, these expect i_mutex to + * be held. Since this may result in an ABBA deadlock with + * pipe->inode, we have to order lock acquiry here. + */ + inode_double_lock(inode, pipe->inode); + ret = __splice_from_pipe(pipe, out, ppos, len, flags, actor); + inode_double_unlock(inode, pipe->inode); + + return ret; +} + +/** + * generic_file_splice_write_nolock - generic_file_splice_write without mutexes + * @pipe: pipe info + * @out: file to write to + * @len: number of bytes to splice + * @flags: splice modifier flags + * + * Will either move or copy pages (determined by @flags options) from + * the given pipe inode to the given file. The caller is responsible + * for acquiring i_mutex on both inodes. + * + */ +ssize_t +generic_file_splice_write_nolock(struct pipe_inode_info *pipe, struct file *out, + loff_t *ppos, size_t len, unsigned int flags) +{ + struct address_space *mapping = out->f_mapping; + struct inode *inode = mapping->host; + ssize_t ret; + int err; + + err = remove_suid(out->f_path.dentry); + if (unlikely(err)) + return err; + + ret = __splice_from_pipe(pipe, out, ppos, len, flags, pipe_to_file); + if (ret > 0) { + *ppos += ret; + + /* + * If file or inode is SYNC and we actually wrote some data, + * sync it. + */ + if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) { + err = generic_osync_inode(inode, mapping, + OSYNC_METADATA|OSYNC_DATA); + + if (err) + ret = err; + } + } + + return ret; +} + +EXPORT_SYMBOL(generic_file_splice_write_nolock); + /** * generic_file_splice_write - splice data from a pipe to a file * @pipe: pipe info @@ -814,12 +886,21 @@ generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out, loff_t *ppos, size_t len, unsigned int flags) { struct address_space *mapping = out->f_mapping; + struct inode *inode = mapping->host; ssize_t ret; + int err; + + err = should_remove_suid(out->f_path.dentry); + if (unlikely(err)) { + mutex_lock(&inode->i_mutex); + err = __remove_suid(out->f_path.dentry, err); + mutex_unlock(&inode->i_mutex); + if (err) + return err; + } ret = splice_from_pipe(pipe, out, ppos, len, flags, pipe_to_file); if (ret > 0) { - struct inode *inode = mapping->host; - *ppos += ret; /* @@ -827,8 +908,6 @@ generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out, * sync it. */ if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) { - int err; - mutex_lock(&inode->i_mutex); err = generic_osync_inode(inode, mapping, OSYNC_METADATA|OSYNC_DATA); @@ -929,7 +1008,7 @@ long do_splice_direct(struct file *in, loff_t *ppos, struct file *out, * randomly drop data for eg socket -> socket splicing. Use the * piped splicing for that! */ - i_mode = in->f_dentry->d_inode->i_mode; + i_mode = in->f_path.dentry->d_inode->i_mode; if (unlikely(!S_ISREG(i_mode) && !S_ISBLK(i_mode))) return -EINVAL; @@ -1029,6 +1108,19 @@ out_release: EXPORT_SYMBOL(do_splice_direct); +/* + * After the inode slimming patch, i_pipe/i_bdev/i_cdev share the same + * location, so checking ->i_pipe is not enough to verify that this is a + * pipe. + */ +static inline struct pipe_inode_info *pipe_info(struct inode *inode) +{ + if (S_ISFIFO(inode->i_mode)) + return inode->i_pipe; + + return NULL; +} + /* * Determine where to splice to/from. */ @@ -1040,7 +1132,7 @@ static long do_splice(struct file *in, loff_t __user *off_in, loff_t offset, *off; long ret; - pipe = in->f_dentry->d_inode->i_pipe; + pipe = pipe_info(in->f_path.dentry->d_inode); if (pipe) { if (off_in) return -ESPIPE; @@ -1061,7 +1153,7 @@ static long do_splice(struct file *in, loff_t __user *off_in, return ret; } - pipe = out->f_dentry->d_inode->i_pipe; + pipe = pipe_info(out->f_path.dentry->d_inode); if (pipe) { if (off_out) return -ESPIPE; @@ -1219,7 +1311,7 @@ static int get_iovec_page_array(const struct iovec __user *iov, static long do_vmsplice(struct file *file, const struct iovec __user *iov, unsigned long nr_segs, unsigned int flags) { - struct pipe_inode_info *pipe = file->f_dentry->d_inode->i_pipe; + struct pipe_inode_info *pipe; struct page *pages[PIPE_BUFFERS]; struct partial_page partial[PIPE_BUFFERS]; struct splice_pipe_desc spd = { @@ -1229,7 +1321,8 @@ static long do_vmsplice(struct file *file, const struct iovec __user *iov, .ops = &user_page_pipe_buf_ops, }; - if (unlikely(!pipe)) + pipe = pipe_info(file->f_path.dentry->d_inode); + if (!pipe) return -EBADF; if (unlikely(nr_segs > UIO_MAXIOV)) return -EINVAL; @@ -1294,6 +1387,85 @@ asmlinkage long sys_splice(int fd_in, loff_t __user *off_in, return error; } +/* + * Make sure there's data to read. Wait for input if we can, otherwise + * return an appropriate error. + */ +static int link_ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags) +{ + int ret; + + /* + * Check ->nrbufs without the inode lock first. This function + * is speculative anyways, so missing one is ok. + */ + if (pipe->nrbufs) + return 0; + + ret = 0; + mutex_lock(&pipe->inode->i_mutex); + + while (!pipe->nrbufs) { + if (signal_pending(current)) { + ret = -ERESTARTSYS; + break; + } + if (!pipe->writers) + break; + if (!pipe->waiting_writers) { + if (flags & SPLICE_F_NONBLOCK) { + ret = -EAGAIN; + break; + } + } + pipe_wait(pipe); + } + + mutex_unlock(&pipe->inode->i_mutex); + return ret; +} + +/* + * Make sure there's writeable room. Wait for room if we can, otherwise + * return an appropriate error. + */ +static int link_opipe_prep(struct pipe_inode_info *pipe, unsigned int flags) +{ + int ret; + + /* + * Check ->nrbufs without the inode lock first. This function + * is speculative anyways, so missing one is ok. + */ + if (pipe->nrbufs < PIPE_BUFFERS) + return 0; + + ret = 0; + mutex_lock(&pipe->inode->i_mutex); + + while (pipe->nrbufs >= PIPE_BUFFERS) { + if (!pipe->readers) { + send_sig(SIGPIPE, current, 0); + ret = -EPIPE; + break; + } + if (flags & SPLICE_F_NONBLOCK) { + ret = -EAGAIN; + break; + } + if (signal_pending(current)) { + ret = -ERESTARTSYS; + break; + } + pipe->waiting_writers++; + pipe_wait(pipe); + pipe->waiting_writers--; + } + + mutex_unlock(&pipe->inode->i_mutex); + return ret; +} + /* * Link contents of ipipe to opipe. */ @@ -1302,136 +1474,63 @@ static int link_pipe(struct pipe_inode_info *ipipe, size_t len, unsigned int flags) { struct pipe_buffer *ibuf, *obuf; - int ret, do_wakeup, i, ipipe_first; - - ret = do_wakeup = ipipe_first = 0; + int ret = 0, i = 0, nbuf; /* * Potential ABBA deadlock, work around it by ordering lock * grabbing by inode address. Otherwise two different processes * could deadlock (one doing tee from A -> B, the other from B -> A). */ - if (ipipe->inode < opipe->inode) { - ipipe_first = 1; - mutex_lock(&ipipe->inode->i_mutex); - mutex_lock(&opipe->inode->i_mutex); - } else { - mutex_lock(&opipe->inode->i_mutex); - mutex_lock(&ipipe->inode->i_mutex); - } + inode_double_lock(ipipe->inode, opipe->inode); - for (i = 0;; i++) { + do { if (!opipe->readers) { send_sig(SIGPIPE, current, 0); if (!ret) ret = -EPIPE; break; } - if (ipipe->nrbufs - i) { - ibuf = ipipe->bufs + ((ipipe->curbuf + i) & (PIPE_BUFFERS - 1)); - - /* - * If we have room, fill this buffer - */ - if (opipe->nrbufs < PIPE_BUFFERS) { - int nbuf = (opipe->curbuf + opipe->nrbufs) & (PIPE_BUFFERS - 1); - - /* - * Get a reference to this pipe buffer, - * so we can copy the contents over. - */ - ibuf->ops->get(ipipe, ibuf); - - obuf = opipe->bufs + nbuf; - *obuf = *ibuf; - - /* - * Don't inherit the gift flag, we need to - * prevent multiple steals of this page. - */ - obuf->flags &= ~PIPE_BUF_FLAG_GIFT; - if (obuf->len > len) - obuf->len = len; - - opipe->nrbufs++; - do_wakeup = 1; - ret += obuf->len; - len -= obuf->len; - - if (!len) - break; - if (opipe->nrbufs < PIPE_BUFFERS) - continue; - } - - /* - * We have input available, but no output room. - * If we already copied data, return that. If we - * need to drop the opipe lock, it must be ordered - * last to avoid deadlocks. - */ - if ((flags & SPLICE_F_NONBLOCK) || !ipipe_first) { - if (!ret) - ret = -EAGAIN; - break; - } - if (signal_pending(current)) { - if (!ret) - ret = -ERESTARTSYS; - break; - } - if (do_wakeup) { - smp_mb(); - if (waitqueue_active(&opipe->wait)) - wake_up_interruptible(&opipe->wait); - kill_fasync(&opipe->fasync_readers, SIGIO, POLL_IN); - do_wakeup = 0; - } + /* + * If we have iterated all input buffers or ran out of + * output room, break. + */ + if (i >= ipipe->nrbufs || opipe->nrbufs >= PIPE_BUFFERS) + break; - opipe->waiting_writers++; - pipe_wait(opipe); - opipe->waiting_writers--; - continue; - } + ibuf = ipipe->bufs + ((ipipe->curbuf + i) & (PIPE_BUFFERS - 1)); + nbuf = (opipe->curbuf + opipe->nrbufs) & (PIPE_BUFFERS - 1); /* - * No input buffers, do the usual checks for available - * writers and blocking and wait if necessary + * Get a reference to this pipe buffer, + * so we can copy the contents over. */ - if (!ipipe->writers) - break; - if (!ipipe->waiting_writers) { - if (ret) - break; - } + ibuf->ops->get(ipipe, ibuf); + + obuf = opipe->bufs + nbuf; + *obuf = *ibuf; + /* - * pipe_wait() drops the ipipe mutex. To avoid deadlocks - * with another process, we can only safely do that if - * the ipipe lock is ordered last. + * Don't inherit the gift flag, we need to + * prevent multiple steals of this page. */ - if ((flags & SPLICE_F_NONBLOCK) || ipipe_first) { - if (!ret) - ret = -EAGAIN; - break; - } - if (signal_pending(current)) { - if (!ret) - ret = -ERESTARTSYS; - break; - } + obuf->flags &= ~PIPE_BUF_FLAG_GIFT; - if (waitqueue_active(&ipipe->wait)) - wake_up_interruptible_sync(&ipipe->wait); - kill_fasync(&ipipe->fasync_writers, SIGIO, POLL_OUT); + if (obuf->len > len) + obuf->len = len; - pipe_wait(ipipe); - } + opipe->nrbufs++; + ret += obuf->len; + len -= obuf->len; + i++; + } while (len); - mutex_unlock(&ipipe->inode->i_mutex); - mutex_unlock(&opipe->inode->i_mutex); + inode_double_unlock(ipipe->inode, opipe->inode); - if (do_wakeup) { + /* + * If we put data in the output pipe, wakeup any potential readers. + */ + if (ret > 0) { smp_mb(); if (waitqueue_active(&opipe->wait)) wake_up_interruptible(&opipe->wait); @@ -1450,16 +1549,31 @@ static int link_pipe(struct pipe_inode_info *ipipe, static long do_tee(struct file *in, struct file *out, size_t len, unsigned int flags) { - struct pipe_inode_info *ipipe = in->f_dentry->d_inode->i_pipe; - struct pipe_inode_info *opipe = out->f_dentry->d_inode->i_pipe; + struct pipe_inode_info *ipipe = pipe_info(in->f_path.dentry->d_inode); + struct pipe_inode_info *opipe = pipe_info(out->f_path.dentry->d_inode); + int ret = -EINVAL; /* - * Link ipipe to the two output pipes, consuming as we go along. + * Duplicate the contents of ipipe to opipe without actually + * copying the data. */ - if (ipipe && opipe) - return link_pipe(ipipe, opipe, len, flags); + if (ipipe && opipe && ipipe != opipe) { + /* + * Keep going, unless we encounter an error. The ipipe/opipe + * ordering doesn't really matter. + */ + ret = link_ipipe_prep(ipipe, flags); + if (!ret) { + ret = link_opipe_prep(opipe, flags); + if (!ret) { + ret = link_pipe(ipipe, opipe, len, flags); + if (!ret && (flags & SPLICE_F_NONBLOCK)) + ret = -EAGAIN; + } + } + } - return -EINVAL; + return ret; } asmlinkage long sys_tee(int fdin, int fdout, size_t len, unsigned int flags)