X-Git-Url: http://git.onelab.eu/?a=blobdiff_plain;f=fs%2Fdirect-io.c;h=b05d1b218776946f4448ba2d0fd51cee477730cf;hb=43bc926fffd92024b46cafaf7350d669ba9ca884;hp=37f593c42e56654cf5e6b31d9dc9d1079d17f81a;hpb=9bf4aaab3e101692164d49b7ca357651eb691cb6;p=linux-2.6.git

diff --git a/fs/direct-io.c b/fs/direct-io.c
index 37f593c42..b05d1b218 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -53,9 +53,12 @@
 * If blkfactor is zero then the user's request was aligned to the filesystem's
 * blocksize.
 *
- * needs_locking is set for regular files on direct-IO-naive filesystems.  It
- * determines whether we need to do the fancy locking which prevents direct-IO
- * from being able to read uninitialised disk blocks.
+ * lock_type is DIO_LOCKING for regular files on direct-IO-naive filesystems.
+ * This determines whether we need to do the fancy locking which prevents
+ * direct-IO from being able to read uninitialised disk blocks.  If it is zero
+ * (blockdev) this locking is not done, and if it is DIO_OWN_LOCKING i_mutex is
+ * not held for the entire direct write (it is taken briefly at the start of a
+ * direct read, but it is never held for the duration of the direct-IO).
 */

 struct dio {
@@ -63,7 +66,8 @@ struct dio {
 	struct bio *bio;		/* bio under assembly */
 	struct inode *inode;
 	int rw;
-	int needs_locking;		/* doesn't change */
+	loff_t i_size;			/* i_size when submitted */
+	int lock_type;			/* doesn't change */
 	unsigned blkbits;		/* doesn't change */
 	unsigned blkfactor;		/* When we're using an alignment which
 					   is finer than the filesystem's soft
@@ -82,12 +86,12 @@ struct dio {
 	unsigned first_block_in_page;	/* doesn't change, Used only once */
 	int boundary;			/* prev block is at a boundary */
 	int reap_counter;		/* rate limit reaping */
-	get_blocks_t *get_blocks;	/* block mapping function */
+	get_block_t *get_block;		/* block mapping function */
 	dio_iodone_t *end_io;		/* IO completion function */
 	sector_t final_block_in_bio;	/* current final block in bio + 1 */
 	sector_t next_block_for_io;	/* next block to be put under IO,
 					   in dio_blocks units */
-	struct buffer_head map_bh;	/* last get_blocks() result */
+	struct buffer_head map_bh;	/* last get_block() result */

 	/*
 	 * Deferred addition of a page to the dio.  These variables are
@@ -125,6 +129,7 @@ struct dio {
 	/* AIO related stuff */
 	struct kiocb *iocb;		/* kiocb */
 	int is_async;			/* is IO async ? */
+	int io_error;			/* IO error in completion path */
 	ssize_t result;			/* IO result */
 };

@@ -158,6 +163,7 @@ static int dio_refill_pages(struct dio *dio)
 	up_read(&current->mm->mmap_sem);

 	if (ret < 0 && dio->blocks_available && (dio->rw == WRITE)) {
+		struct page *page = ZERO_PAGE(dio->curr_user_address);
 		/*
 		 * A memory fault, but the filesystem has some outstanding
 		 * mapped blocks.  We need to use those blocks up to avoid
@@ -165,7 +171,8 @@ static int dio_refill_pages(struct dio *dio)
 		 */
 		if (dio->page_errors == 0)
 			dio->page_errors = ret;
-		dio->pages[0] = ZERO_PAGE(dio->curr_user_address);
+		page_cache_get(page);
+		dio->pages[0] = page;
 		dio->head = 0;
 		dio->tail = 1;
 		ret = 0;
@@ -204,15 +211,15 @@ static struct page *dio_get_page(struct dio *dio)

 /*
 * Called when all DIO BIO I/O has been completed - let the filesystem
- * know, if it registered an interest earlier via get_blocks.  Pass the
+ * know, if it registered an interest earlier via get_block.  Pass the
 * private field of the map buffer_head so that filesystems can use it
- * to hold additional state between get_blocks calls and dio_complete.
+ * to hold additional state between get_block calls and dio_complete.
*/ static void dio_complete(struct dio *dio, loff_t offset, ssize_t bytes) { if (dio->end_io && dio->result) - dio->end_io(dio->inode, offset, bytes, dio->map_bh.b_private); - if (dio->needs_locking) + dio->end_io(dio->iocb, offset, bytes, dio->map_bh.b_private); + if (dio->lock_type == DIO_LOCKING) up_read(&dio->inode->i_alloc_sem); } @@ -227,16 +234,33 @@ static void finished_one_bio(struct dio *dio) spin_lock_irqsave(&dio->bio_lock, flags); if (dio->bio_count == 1) { if (dio->is_async) { + ssize_t transferred; + loff_t offset; + /* * Last reference to the dio is going away. * Drop spinlock and complete the DIO. */ spin_unlock_irqrestore(&dio->bio_lock, flags); - dio_complete(dio, dio->block_in_file << dio->blkbits, - dio->result); + + /* Check for short read case */ + transferred = dio->result; + offset = dio->iocb->ki_pos; + + if ((dio->rw == READ) && + ((offset + transferred) > dio->i_size)) + transferred = dio->i_size - offset; + + /* check for error in completion path */ + if (dio->io_error) + transferred = dio->io_error; + + dio_complete(dio, offset, transferred); + /* Complete AIO later if falling back to buffered i/o */ - if (dio->result == dio->size || dio->rw == READ) { - aio_complete(dio->iocb, dio->result, 0); + if (dio->result == dio->size || + ((dio->rw == READ) && dio->result)) { + aio_complete(dio->iocb, transferred, 0); kfree(dio); return; } else { @@ -387,7 +411,7 @@ static int dio_bio_complete(struct dio *dio, struct bio *bio) int page_no; if (!uptodate) - dio->result = -EIO; + dio->io_error = -EIO; if (dio->is_async && dio->rw == READ) { bio_check_pages_dirty(bio); /* transfers ownership */ @@ -469,7 +493,7 @@ static int dio_bio_reap(struct dio *dio) * The fs is allowed to map lots of blocks at once. If it wants to do that, * it uses the passed inode-relative block number as the file offset, as usual. * - * get_blocks() is passed the number of i_blkbits-sized blocks which direct_io + * get_block() is passed the number of i_blkbits-sized blocks which direct_io * has remaining to do. The fs should not map more than this number of blocks. * * If the fs has mapped a lot of blocks, it should populate bh->b_size to @@ -482,7 +506,7 @@ static int dio_bio_reap(struct dio *dio) * In the case of filesystem holes: the fs may return an arbitrarily-large * hole by returning an appropriate value in b_size and by clearing * buffer_mapped(). However the direct-io code will only process holes one - * block at a time - it will repeatedly call get_blocks() as it walks the hole. + * block at a time - it will repeatedly call get_block() as it walks the hole. 
 */
static int get_more_blocks(struct dio *dio)
{
@@ -492,7 +516,7 @@ static int get_more_blocks(struct dio *dio)
 	unsigned long fs_count;	/* Number of filesystem-sized blocks */
 	unsigned long dio_count;/* Number of dio_block-sized blocks */
 	unsigned long blkmask;
-	int beyond_eof = 0;
+	int create;

 	/*
 	 * If there was a memory error and we've overwritten all the
@@ -500,8 +524,6 @@ static int get_more_blocks(struct dio *dio)
 	 */
 	ret = dio->page_errors;
 	if (ret == 0) {
-		map_bh->b_state = 0;
-		map_bh->b_size = 0;
 		BUG_ON(dio->block_in_file >= dio->final_block_in_request);
 		fs_startblk = dio->block_in_file >> dio->blkfactor;
 		dio_count = dio->final_block_in_request - dio->block_in_file;
@@ -510,19 +532,26 @@ static int get_more_blocks(struct dio *dio)
 		if (dio_count & blkmask)
 			fs_count++;

-		if (dio->needs_locking) {
-			if (dio->block_in_file >= (i_size_read(dio->inode) >>
+		map_bh->b_state = 0;
+		map_bh->b_size = fs_count << dio->inode->i_blkbits;
+
+		create = dio->rw == WRITE;
+		if (dio->lock_type == DIO_LOCKING) {
+			if (dio->block_in_file < (i_size_read(dio->inode) >>
 					dio->blkbits))
-				beyond_eof = 1;
+				create = 0;
+		} else if (dio->lock_type == DIO_NO_LOCKING) {
+			create = 0;
 		}
+
 		/*
 		 * For writes inside i_size we forbid block creations: only
 		 * overwrites are permitted. We fall back to buffered writes
 		 * at a higher level for inside-i_size block-instantiating
 		 * writes.
 		 */
-		ret = (*dio->get_blocks)(dio->inode, fs_startblk, fs_count,
-						map_bh, (dio->rw == WRITE) && beyond_eof);
+		ret = (*dio->get_block)(dio->inode, fs_startblk,
+						map_bh, create);
 	}
 	return ret;
}
@@ -561,7 +590,11 @@ static int dio_bio_add_page(struct dio *dio)
 	ret = bio_add_page(dio->bio, dio->cur_page,
 			dio->cur_page_len, dio->cur_page_offset);
 	if (ret == dio->cur_page_len) {
-		dio->pages_in_io--;
+		/*
+		 * Decrement the count only if we are done with this page
+		 */
+		if ((dio->cur_page_len + dio->cur_page_offset) == PAGE_SIZE)
+			dio->pages_in_io--;
 		page_cache_get(dio->cur_page);
 		dio->final_block_in_bio = dio->cur_page_block +
 			(dio->cur_page_len >> dio->blkbits);
@@ -752,11 +785,11 @@ static void dio_zero_block(struct dio *dio, int end)
 * happily perform page-sized but 512-byte aligned IOs.  It is important that
 * blockdev IO be able to have fine alignment and large sizes.
 *
- * So what we do is to permit the ->get_blocks function to populate bh.b_size
+ * So what we do is to permit the ->get_block function to populate bh.b_size
 * with the size of IO which is permitted at this offset and this i_blkbits.
 *
 * For best results, the blockdev should be set up with 512-byte i_blkbits and
- * it should set b_size to PAGE_SIZE or more inside get_blocks().  This gives
+ * it should set b_size to PAGE_SIZE or more inside get_block().  This gives
 * fine alignment but still allows this function to work in PAGE_SIZE units.
 */
static int do_direct_IO(struct dio *dio)
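To make the b_size contract above concrete, here is a minimal sketch of a get_block callback for a hypothetical "myfs" with a flat on-disk layout (the filesystem and MYFS_DATA_START are illustrative assumptions, not part of this patch). direct-io primes bh_result->b_size with the most it is prepared to have mapped in one call; the callback may map less, never more, and signals a hole by leaving the buffer_head unmapped.

#include <linux/fs.h>
#include <linux/buffer_head.h>

#define MYFS_DATA_START	128	/* hypothetical: file data starts at this disk block */

static int myfs_get_block(struct inode *inode, sector_t iblock,
			  struct buffer_head *bh_result, int create)
{
	unsigned blkbits = inode->i_blkbits;
	/* round i_size up so a partial last block is still mapped */
	sector_t nr_blocks = (i_size_read(inode) + (1 << blkbits) - 1)
				>> blkbits;
	unsigned long max_blocks = bh_result->b_size >> blkbits;

	/*
	 * Never instantiate blocks here: leaving bh_result unmapped makes
	 * do_direct_IO() return -ENOTBLK for a direct write into a hole,
	 * and the caller then falls back to buffered IO.
	 */
	if (create || iblock >= nr_blocks)
		return 0;

	/* hypothetical flat layout: file block i lives at MYFS_DATA_START + i */
	map_bh(bh_result, inode->i_sb, MYFS_DATA_START + iblock);

	/* map as much as was asked for, clipped at EOF; never grow b_size */
	if (nr_blocks - iblock < max_blocks)
		max_blocks = nr_blocks - iblock;
	bh_result->b_size = max_blocks << blkbits;
	return 0;
}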
@@ -831,13 +864,22 @@ do_holes:
 			/* Handle holes */
 			if (!buffer_mapped(map_bh)) {
 				char *kaddr;
+				loff_t i_size_aligned;

 				/* AKPM: eargh, -ENOTBLK is a hack */
-				if (dio->rw == WRITE)
+				if (dio->rw == WRITE) {
+					page_cache_release(page);
 					return -ENOTBLK;
+				}

+				/*
+				 * Be sure to account for a partial block as the
+				 * last block in the file
+				 */
+				i_size_aligned = ALIGN(i_size_read(dio->inode),
+							1 << blkbits);
 				if (dio->block_in_file >=
-					i_size_read(dio->inode)>>blkbits) {
+						i_size_aligned >> blkbits) {
 					/* We hit eof */
 					page_cache_release(page);
 					goto out;
@@ -887,8 +929,7 @@ do_holes:
 			block_in_page += this_chunk_blocks;
 			dio->blocks_available -= this_chunk_blocks;
next_block:
-			if (dio->block_in_file > dio->final_block_in_request)
-				BUG();
+			BUG_ON(dio->block_in_file > dio->final_block_in_request);
 			if (dio->block_in_file == dio->final_block_in_request)
 				break;
 		}
@@ -902,12 +943,12 @@ out:
 }

 /*
- * Releases both i_sem and i_alloc_sem
+ * Releases both i_mutex and i_alloc_sem
 */
static ssize_t
direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
	const struct iovec *iov, loff_t offset, unsigned long nr_segs,
-	unsigned blkbits, get_blocks_t get_blocks, dio_iodone_t end_io,
+	unsigned blkbits, get_block_t get_block, dio_iodone_t end_io,
	struct dio *dio)
{
	unsigned long user_addr;
@@ -929,15 +970,17 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
 	dio->boundary = 0;
 	dio->reap_counter = 0;
-	dio->get_blocks = get_blocks;
+	dio->get_block = get_block;
 	dio->end_io = end_io;
 	dio->map_bh.b_private = NULL;
 	dio->final_block_in_bio = -1;
 	dio->next_block_for_io = -1;

 	dio->page_errors = 0;
+	dio->io_error = 0;
 	dio->result = 0;
 	dio->iocb = iocb;
+	dio->i_size = i_size_read(inode);

 	/*
 	 * BIO completion state.
@@ -953,9 +996,21 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
 	dio->bio_list = NULL;
 	dio->waiter = NULL;

-	dio->pages_in_io = 0;
-	for (seg = 0; seg < nr_segs; seg++)
-		dio->pages_in_io += (iov[seg].iov_len >> blkbits) + 2;
+	/*
+	 * In case of non-aligned buffers, we may need 2 more
+	 * pages since we need to zero out the first and last block.
+	 */
+	if (unlikely(dio->blkfactor))
+		dio->pages_in_io = 2;
+	else
+		dio->pages_in_io = 0;
+
+	for (seg = 0; seg < nr_segs; seg++) {
+		user_addr = (unsigned long)iov[seg].iov_base;
+		dio->pages_in_io +=
+			((user_addr+iov[seg].iov_len +PAGE_SIZE-1)/PAGE_SIZE
+				- user_addr/PAGE_SIZE);
+	}

 	for (seg = 0; seg < nr_segs; seg++) {
 		user_addr = (unsigned long)iov[seg].iov_base;
@@ -1021,11 +1076,11 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,

 	/*
 	 * All block lookups have been performed. For READ requests
-	 * we can let i_sem go now that its achieved its purpose
+	 * we can let i_mutex go now that it has achieved its purpose
 	 * of protecting us from looking up uninitialized blocks.
 	 */
-	if ((rw == READ) && dio->needs_locking)
-		up(&dio->inode->i_sem);
+	if ((rw == READ) && (dio->lock_type == DIO_LOCKING))
+		mutex_unlock(&dio->inode->i_mutex);

 	/*
 	 * OK, all BIOs are submitted, so we can decrement bio_count to truly
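The pages_in_io accounting above replaces the old per-segment guess of (iov_len >> blkbits) + 2 with an exact page-span count, plus a single reservation of two pages for zeroing partial first/last blocks when blkfactor is non-zero. A quick check of the arithmetic with an illustrative helper (not in the kernel):

#include <linux/mm.h>

/* pages spanned by a user buffer, per the formula introduced above */
static unsigned long pages_spanned(unsigned long user_addr, size_t len)
{
	return (user_addr + len + PAGE_SIZE - 1) / PAGE_SIZE
		- user_addr / PAGE_SIZE;
}

For a 5000-byte buffer at user address 8176 with 4096-byte pages, this gives 17271/4096 - 8176/4096 = 4 - 1 = 3 pages (the buffer touches pages 1, 2 and 3), where the old 512-byte-block estimate would have reserved 11.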
@@ -1101,17 +1156,30 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,

 /*
 * This is a library function for use by filesystem drivers.
+ * The locking rules are governed by the dio_lock_type parameter.
+ *
+ * DIO_NO_LOCKING (no locking, for raw block device access)
+ * For writes, i_mutex is not held on entry; it is never taken.
 *
- * For writes to S_ISREG files, we are called under i_sem and return with i_sem
- * held, even though it is internally dropped.
+ * DIO_LOCKING (simple locking for regular files)
+ * For writes we are called under i_mutex and return with i_mutex held, even
+ * though it is internally dropped.
+ * For reads, i_mutex is not held on entry, but it is taken and dropped before
+ * returning.
 *
- * For writes to S_ISBLK files, i_sem is not held on entry; it is never taken.
+ * DIO_OWN_LOCKING (filesystem provides synchronisation and handling of
+ * uninitialised data, allowing parallel direct readers and writers)
+ * For writes we are called without i_mutex, return without it, and never touch it.
+ * For reads we are called under i_mutex and return with i_mutex held, even
+ * though it may be internally dropped.
+ *
+ * Additional i_alloc_sem locking requirements are described inline below.
 */
ssize_t
__blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
	struct block_device *bdev, const struct iovec *iov, loff_t offset,
-	unsigned long nr_segs, get_blocks_t get_blocks, dio_iodone_t end_io,
-	int needs_special_locking)
+	unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io,
+	int dio_lock_type)
{
	int seg;
	size_t size;
@@ -1122,7 +1190,11 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
 	ssize_t retval = -EINVAL;
 	loff_t end = offset;
 	struct dio *dio;
-	int needs_locking;
+	int release_i_mutex = 0;
+	int acquire_i_mutex = 0;
+
+	if (rw & WRITE)
+		current->flags |= PF_SYNCWRITE;

 	if (bdev)
 		bdev_blkbits = blksize_bits(bdev_hardsect_size(bdev));
@@ -1155,28 +1227,43 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
 		goto out;

 	/*
-	 * For regular files,
-	 *	readers need to grab i_sem and i_alloc_sem
-	 *	writers need to grab i_alloc_sem only (i_sem is already held)
+	 * For block device access DIO_NO_LOCKING is used,
+	 *	neither readers nor writers do any locking at all
+	 * For regular files using DIO_LOCKING,
+	 *	readers need to grab i_mutex and i_alloc_sem
+	 *	writers need to grab i_alloc_sem only (i_mutex is already held)
+	 * For regular files using DIO_OWN_LOCKING,
+	 *	neither readers nor writers take any locks here
 	 */
-	needs_locking = 0;
-	if (S_ISREG(inode->i_mode) && needs_special_locking) {
-		needs_locking = 1;
-		if (rw == READ) {
+	dio->lock_type = dio_lock_type;
+	if (dio_lock_type != DIO_NO_LOCKING) {
+		/* watch out for a 0 len io from a tricksy fs */
+		if (rw == READ && end > offset) {
 			struct address_space *mapping;

 			mapping = iocb->ki_filp->f_mapping;
-			down(&inode->i_sem);
-			retval = filemap_write_and_wait(mapping);
+			if (dio_lock_type != DIO_OWN_LOCKING) {
+				mutex_lock(&inode->i_mutex);
+				release_i_mutex = 1;
+			}
+
+			retval = filemap_write_and_wait_range(mapping, offset,
+							      end - 1);
 			if (retval) {
-				up(&inode->i_sem);
 				kfree(dio);
 				goto out;
 			}
+
+			if (dio_lock_type == DIO_OWN_LOCKING) {
+				mutex_unlock(&inode->i_mutex);
+				acquire_i_mutex = 1;
+			}
 		}
-		down_read(&inode->i_alloc_sem);
+
+		if (dio_lock_type == DIO_LOCKING)
+			down_read(&inode->i_alloc_sem);
 	}
-	dio->needs_locking = needs_locking;
+
 	/*
 	 * For file extending writes updating i_size before data
 	 * writeouts complete can expose uninitialized blocks.  So
@@ -1187,8 +1274,18 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
 		(end > i_size_read(inode)));

 	retval = direct_io_worker(rw, iocb, inode, iov, offset,
-				nr_segs, blkbits, get_blocks, end_io, dio);
+				nr_segs, blkbits, get_block, end_io, dio);
+
+	if (rw == READ && dio_lock_type == DIO_LOCKING)
+		release_i_mutex = 0;
+
out:
+	if (release_i_mutex)
+		mutex_unlock(&inode->i_mutex);
+	else if (acquire_i_mutex)
+		mutex_lock(&inode->i_mutex);
+	if (rw & WRITE)
+		current->flags &= ~PF_SYNCWRITE;
 	return retval;
}
EXPORT_SYMBOL(__blockdev_direct_IO);
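Putting the pieces together, a filesystem selects one of the three locking flavours through the wrapper it calls. A minimal sketch of wiring up the hypothetical myfs from the earlier example, assuming the usual wrappers of this kernel generation in <linux/fs.h> (blockdev_direct_IO() passes DIO_LOCKING down to __blockdev_direct_IO()):

#include <linux/fs.h>
#include <linux/uio.h>

/* ->direct_IO method: DIO_LOCKING, i.e. the simple rules documented above */
static ssize_t myfs_direct_IO(int rw, struct kiocb *iocb,
			      const struct iovec *iov, loff_t offset,
			      unsigned long nr_segs)
{
	struct inode *inode = iocb->ki_filp->f_mapping->host;

	/* NULL end_io is fine: dio_complete() checks it before calling */
	return blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev,
				  iov, offset, nr_segs,
				  myfs_get_block, NULL);
}

static struct address_space_operations myfs_aops = {
	.direct_IO	= myfs_direct_IO,
	/* .readpage, .writepage, etc. omitted from this sketch */
};

Raw block device access goes through the no-locking variant (DIO_NO_LOCKING), while filesystems that serialise direct-IO themselves, such as XFS, use the DIO_OWN_LOCKING variant.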