X-Git-Url: http://git.onelab.eu/?a=blobdiff_plain;f=fs%2Fxfs%2Flinux-2.6%2Fxfs_buf.c;h=690cc7799a7c5b045384f276ca381c97a292719e;hb=6a77f38946aaee1cd85eeec6cf4229b204c15071;hp=4c0a72c305c531a8ab59dee51e88f3ef52eb40df;hpb=87fc8d1bb10cd459024a742c6a10961fefcef18f;p=linux-2.6.git diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index 4c0a72c30..690cc7799 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -51,15 +51,12 @@ #include #include #include -#include #include +#include +#include #include "xfs_linux.h" -#ifndef GFP_READAHEAD -#define GFP_READAHEAD (__GFP_NOWARN|__GFP_NORETRY) -#endif - /* * File wide globals */ @@ -118,8 +115,8 @@ ktrace_t *pagebuf_trace_buf; */ #define pb_to_gfp(flags) \ - (((flags) & PBF_READ_AHEAD) ? GFP_READAHEAD : \ - ((flags) & PBF_DONT_BLOCK) ? GFP_NOFS : GFP_KERNEL) + ((((flags) & PBF_READ_AHEAD) ? __GFP_NORETRY : \ + ((flags) & PBF_DONT_BLOCK) ? GFP_NOFS : GFP_KERNEL) | __GFP_NOWARN) #define pb_to_km(flags) \ (((flags) & PBF_DONT_BLOCK) ? KM_NOFS : KM_SLEEP) @@ -131,34 +128,71 @@ ktrace_t *pagebuf_trace_buf; kmem_zone_free(pagebuf_cache, (pb)); /* - * Pagebuf hashing + * Page Region interfaces. + * + * For pages in filesystems where the blocksize is smaller than the + * pagesize, we use the page->private field (long) to hold a bitmap + * of uptodate regions within the page. + * + * Each such region is "bytes per page / bits per long" bytes long. + * + * NBPPR == number-of-bytes-per-page-region + * BTOPR == bytes-to-page-region (rounded up) + * BTOPRT == bytes-to-page-region-truncated (rounded down) */ +#if (BITS_PER_LONG == 32) +#define PRSHIFT (PAGE_CACHE_SHIFT - 5) /* (32 == 1<<5) */ +#elif (BITS_PER_LONG == 64) +#define PRSHIFT (PAGE_CACHE_SHIFT - 6) /* (64 == 1<<6) */ +#else +#error BITS_PER_LONG must be 32 or 64 +#endif +#define NBPPR (PAGE_CACHE_SIZE/BITS_PER_LONG) +#define BTOPR(b) (((unsigned int)(b) + (NBPPR - 1)) >> PRSHIFT) +#define BTOPRT(b) (((unsigned int)(b) >> PRSHIFT)) + +STATIC unsigned long +page_region_mask( + size_t offset, + size_t length) +{ + unsigned long mask; + int first, final; -#define NBITS 8 -#define NHASH (1<>= BITS_PER_LONG - (final); -STATIC pb_hash_t pbhash[NHASH]; -#define pb_hash(pb) &pbhash[pb->pb_hash_index] + ASSERT(offset + length <= PAGE_CACHE_SIZE); + ASSERT((final - first) < BITS_PER_LONG && (final - first) >= 0); -STATIC int -_bhash( - struct block_device *bdev, - loff_t base) + return mask; +} + +STATIC inline void +set_page_region( + struct page *page, + size_t offset, + size_t length) { - int bit, hval; + page->private |= page_region_mask(offset, length); + if (page->private == ~0UL) + SetPageUptodate(page); +} - base >>= 9; - base ^= (unsigned long)bdev / L1_CACHE_BYTES; - for (bit = hval = 0; base && bit < sizeof(base) * 8; bit += NBITS) { - hval ^= (int)base & (NHASH-1); - base >>= NBITS; - } - return hval; +STATIC inline int +test_page_region( + struct page *page, + size_t offset, + size_t length) +{ + unsigned long mask = page_region_mask(offset, length); + + return (mask && (page->private & mask) == mask); } /* @@ -172,7 +206,7 @@ typedef struct a_list { STATIC a_list_t *as_free_head; STATIC int as_list_len; -STATIC spinlock_t as_lock = SPIN_LOCK_UNLOCKED; +STATIC DEFINE_SPINLOCK(as_lock); /* * Try to batch vunmaps because they are costly. @@ -183,8 +217,8 @@ free_address( { a_list_t *aentry; - aentry = kmalloc(sizeof(a_list_t), GFP_ATOMIC); - if (aentry) { + aentry = kmalloc(sizeof(a_list_t), GFP_ATOMIC & ~__GFP_HIGH); + if (likely(aentry)) { spin_lock(&as_lock); aentry->next = as_free_head; aentry->vm_addr = addr; @@ -344,7 +378,6 @@ _pagebuf_lookup_pages( uint flags) { struct address_space *mapping = bp->pb_target->pbr_mapping; - unsigned int sectorshift = bp->pb_target->pbr_sshift; size_t blocksize = bp->pb_target->pbr_bsize; size_t size = bp->pb_count_desired; size_t nbytes, offset; @@ -387,13 +420,13 @@ _pagebuf_lookup_pages( */ if (!(++retries % 100)) printk(KERN_ERR - "possible deadlock in %s (mode:0x%x)\n", + "XFS: possible memory allocation " + "deadlock in %s (mode:0x%x)\n", __FUNCTION__, gfp_mask); XFS_STATS_INC(pb_page_retries); pagebuf_daemon_wakeup(0, gfp_mask); - set_current_state(TASK_UNINTERRUPTIBLE); - schedule_timeout(10); + blk_congestion_wait(WRITE, HZ/50); goto retry; } @@ -404,22 +437,11 @@ _pagebuf_lookup_pages( if (!PageUptodate(page)) { page_count--; - if (blocksize == PAGE_CACHE_SIZE) { + if (blocksize >= PAGE_CACHE_SIZE) { if (flags & PBF_READ) bp->pb_locked = 1; } else if (!PagePrivate(page)) { - unsigned long j, range; - - /* - * In this case page->private holds a bitmap - * of uptodate sectors within the page - */ - ASSERT(blocksize < PAGE_CACHE_SIZE); - range = (offset + nbytes) >> sectorshift; - for (j = offset >> sectorshift; j < range; j++) - if (!test_bit(j, &page->private)) - break; - if (j == range) + if (test_page_region(page, offset, nbytes)) page_count++; } } @@ -486,9 +508,9 @@ _pagebuf_map_pages( * which may imply that this call will block until those buffers * are unlocked. No I/O is implied by this call. */ -STATIC xfs_buf_t * -_pagebuf_find( /* find buffer for block */ - xfs_buftarg_t *target,/* target for block */ +xfs_buf_t * +_pagebuf_find( + xfs_buftarg_t *btp, /* block device target */ loff_t ioff, /* starting offset of range */ size_t isize, /* length of range */ page_buf_flags_t flags, /* PBF_TRYLOCK */ @@ -496,59 +518,55 @@ _pagebuf_find( /* find buffer for block */ { loff_t range_base; size_t range_length; - int hval; - pb_hash_t *h; + xfs_bufhash_t *hash; xfs_buf_t *pb, *n; - int not_locked; range_base = (ioff << BBSHIFT); range_length = (isize << BBSHIFT); - /* Ensure we never do IOs smaller than the sector size */ - BUG_ON(range_length < (1 << target->pbr_sshift)); + /* Check for IOs smaller than the sector size / not sector aligned */ + ASSERT(!(range_length < (1 << btp->pbr_sshift))); + ASSERT(!(range_base & (loff_t)btp->pbr_smask)); - /* Ensure we never do IOs that are not sector aligned */ - BUG_ON(range_base & (loff_t)target->pbr_smask); + hash = &btp->bt_hash[hash_long((unsigned long)ioff, btp->bt_hashshift)]; - hval = _bhash(target->pbr_bdev, range_base); - h = &pbhash[hval]; + spin_lock(&hash->bh_lock); - spin_lock(&h->pb_hash_lock); - list_for_each_entry_safe(pb, n, &h->pb_hash, pb_hash_list) { - if (pb->pb_target == target && - pb->pb_file_offset == range_base && + list_for_each_entry_safe(pb, n, &hash->bh_list, pb_hash_list) { + ASSERT(btp == pb->pb_target); + if (pb->pb_file_offset == range_base && pb->pb_buffer_length == range_length) { - /* If we look at something bring it to the - * front of the list for next time + /* + * If we look at something bring it to the + * front of the list for next time. */ atomic_inc(&pb->pb_hold); - list_move(&pb->pb_hash_list, &h->pb_hash); + list_move(&pb->pb_hash_list, &hash->bh_list); goto found; } } /* No match found */ if (new_pb) { - _pagebuf_initialize(new_pb, target, range_base, + _pagebuf_initialize(new_pb, btp, range_base, range_length, flags); - new_pb->pb_hash_index = hval; - list_add(&new_pb->pb_hash_list, &h->pb_hash); + new_pb->pb_hash = hash; + list_add(&new_pb->pb_hash_list, &hash->bh_list); } else { XFS_STATS_INC(pb_miss_locked); } - spin_unlock(&h->pb_hash_lock); - return (new_pb); + spin_unlock(&hash->bh_lock); + return new_pb; found: - spin_unlock(&h->pb_hash_lock); + spin_unlock(&hash->bh_lock); /* Attempt to get the semaphore without sleeping, * if this does not work then we need to drop the * spinlock and do a hard attempt on the semaphore. */ - not_locked = down_trylock(&pb->pb_sema); - if (not_locked) { + if (down_trylock(&pb->pb_sema)) { if (!(flags & PBF_TRYLOCK)) { /* wait for buffer ownership */ PB_TRACE(pb, "get_lock", 0); @@ -578,39 +596,14 @@ found: return (pb); } - -/* - * pagebuf_find - * - * pagebuf_find returns a buffer matching the specified range of - * data for the specified target, if any of the relevant blocks - * are in memory. The buffer may have unallocated holes, if - * some, but not all, of the blocks are in memory. Even where - * pages are present in the buffer, not all of every page may be - * valid. - */ -xfs_buf_t * -pagebuf_find( /* find buffer for block */ - /* if the block is in memory */ - xfs_buftarg_t *target,/* target for block */ - loff_t ioff, /* starting offset of range */ - size_t isize, /* length of range */ - page_buf_flags_t flags) /* PBF_TRYLOCK */ -{ - return _pagebuf_find(target, ioff, isize, flags, NULL); -} - /* - * pagebuf_get + * xfs_buf_get_flags assembles a buffer covering the specified range. * - * pagebuf_get assembles a buffer covering the specified range. - * Some or all of the blocks in the range may be valid. Storage - * in memory for all portions of the buffer will be allocated, - * although backing storage may not be. If PBF_READ is set in - * flags, pagebuf_iostart is called also. + * Storage in memory for all portions of the buffer will be allocated, + * although backing storage may not be. */ xfs_buf_t * -pagebuf_get( /* allocate a buffer */ +xfs_buf_get_flags( /* allocate a buffer */ xfs_buftarg_t *target,/* target for buffer */ loff_t ioff, /* starting offset of range */ size_t isize, /* length of range */ @@ -640,8 +633,8 @@ pagebuf_get( /* allocate a buffer */ if (!(pb->pb_flags & PBF_MAPPED)) { error = _pagebuf_map_pages(pb, flags); if (unlikely(error)) { - printk(KERN_WARNING - "pagebuf_get: failed to map pages\n"); + printk(KERN_WARNING "%s: failed to map pages\n", + __FUNCTION__); goto no_buffer; } } @@ -655,30 +648,50 @@ pagebuf_get( /* allocate a buffer */ pb->pb_bn = ioff; pb->pb_count_desired = pb->pb_buffer_length; - if (flags & PBF_READ) { + PB_TRACE(pb, "get", (unsigned long)flags); + return pb; + + no_buffer: + if (flags & (PBF_LOCK | PBF_TRYLOCK)) + pagebuf_unlock(pb); + pagebuf_rele(pb); + return NULL; +} + +xfs_buf_t * +xfs_buf_read_flags( + xfs_buftarg_t *target, + loff_t ioff, + size_t isize, + page_buf_flags_t flags) +{ + xfs_buf_t *pb; + + flags |= PBF_READ; + + pb = xfs_buf_get_flags(target, ioff, isize, flags); + if (pb) { if (PBF_NOT_DONE(pb)) { - PB_TRACE(pb, "get_read", (unsigned long)flags); + PB_TRACE(pb, "read", (unsigned long)flags); XFS_STATS_INC(pb_get_read); pagebuf_iostart(pb, flags); } else if (flags & PBF_ASYNC) { - PB_TRACE(pb, "get_read_async", (unsigned long)flags); + PB_TRACE(pb, "read_async", (unsigned long)flags); /* * Read ahead call which is already satisfied, * drop the buffer */ goto no_buffer; } else { - PB_TRACE(pb, "get_read_done", (unsigned long)flags); + PB_TRACE(pb, "read_done", (unsigned long)flags); /* We do not want read in the flags */ pb->pb_flags &= ~PBF_READ; } - } else { - PB_TRACE(pb, "get_write", (unsigned long)flags); } return pb; -no_buffer: + no_buffer: if (flags & (PBF_LOCK | PBF_TRYLOCK)) pagebuf_unlock(pb); pagebuf_rele(pb); @@ -720,11 +733,9 @@ pagebuf_readahead( bdi = target->pbr_mapping->backing_dev_info; if (bdi_read_congested(bdi)) return; - if (bdi_write_congested(bdi)) - return; - flags |= (PBF_TRYLOCK|PBF_READ|PBF_ASYNC|PBF_READ_AHEAD); - pagebuf_get(target, ioff, isize, flags); + flags |= (PBF_TRYLOCK|PBF_ASYNC|PBF_READ_AHEAD); + xfs_buf_read_flags(target, ioff, isize, flags); } xfs_buf_t * @@ -875,18 +886,29 @@ void pagebuf_rele( xfs_buf_t *pb) { - pb_hash_t *hash = pb_hash(pb); + xfs_bufhash_t *hash = pb->pb_hash; PB_TRACE(pb, "rele", pb->pb_relse); - if (atomic_dec_and_lock(&pb->pb_hold, &hash->pb_hash_lock)) { + /* + * pagebuf_lookup buffers are not hashed, not delayed write, + * and don't have their own release routines. Special case. + */ + if (unlikely(!hash)) { + ASSERT(!pb->pb_relse); + if (atomic_dec_and_test(&pb->pb_hold)) + xfs_buf_free(pb); + return; + } + + if (atomic_dec_and_lock(&pb->pb_hold, &hash->bh_lock)) { int do_free = 1; if (pb->pb_relse) { atomic_inc(&pb->pb_hold); - spin_unlock(&hash->pb_hash_lock); + spin_unlock(&hash->bh_lock); (*(pb->pb_relse)) (pb); - spin_lock(&hash->pb_hash_lock); + spin_lock(&hash->bh_lock); do_free = 0; } @@ -901,10 +923,10 @@ pagebuf_rele( if (do_free) { list_del_init(&pb->pb_hash_list); - spin_unlock(&hash->pb_hash_lock); + spin_unlock(&hash->bh_lock); pagebuf_free(pb); } else { - spin_unlock(&hash->pb_hash_lock); + spin_unlock(&hash->bh_lock); } } } @@ -944,6 +966,7 @@ pagebuf_cond_lock( /* lock buffer, if not locked */ return(locked ? 0 : -EBUSY); } +#ifdef DEBUG /* * pagebuf_lock_value * @@ -955,6 +978,7 @@ pagebuf_lock_value( { return(atomic_read(&pb->pb_sema.count)); } +#endif /* * pagebuf_lock @@ -1084,7 +1108,7 @@ _pagebuf_wait_unpin( * done with respect to that I/O. The pb_iodone routine, if * present, will be called as a side-effect. */ -void +STATIC void pagebuf_iodone_work( void *v) { @@ -1225,7 +1249,6 @@ bio_end_io_pagebuf( { xfs_buf_t *pb = (xfs_buf_t *)bio->bi_private; unsigned int i, blocksize = pb->pb_target->pbr_bsize; - unsigned int sectorshift = pb->pb_target->pbr_sshift; struct bio_vec *bvec = bio->bi_io_vec; if (bio->bi_size) @@ -1243,14 +1266,7 @@ bio_end_io_pagebuf( SetPageUptodate(page); } else if (!PagePrivate(page) && (pb->pb_flags & _PBF_PAGE_CACHE)) { - unsigned long j, range; - - ASSERT(blocksize < PAGE_CACHE_SIZE); - range = (bvec->bv_offset + bvec->bv_len) >> sectorshift; - for (j = bvec->bv_offset >> sectorshift; j < range; j++) - set_bit(j, &page->private); - if (page->private == (unsigned long)(PAGE_CACHE_SIZE-1)) - SetPageUptodate(page); + set_page_region(page, bvec->bv_offset, bvec->bv_len); } if (_pagebuf_iolocked(pb)) { @@ -1263,7 +1279,7 @@ bio_end_io_pagebuf( return 0; } -void +STATIC void _pagebuf_ioapply( xfs_buf_t *pb) { @@ -1473,6 +1489,65 @@ pagebuf_iomove( * Handling of buftargs. */ +/* + * Wait for any bufs with callbacks that have been submitted but + * have not yet returned... walk the hash list for the target. + */ +void +xfs_wait_buftarg( + xfs_buftarg_t *btp) +{ + xfs_buf_t *bp, *n; + xfs_bufhash_t *hash; + uint i; + + for (i = 0; i < (1 << btp->bt_hashshift); i++) { + hash = &btp->bt_hash[i]; +again: + spin_lock(&hash->bh_lock); + list_for_each_entry_safe(bp, n, &hash->bh_list, pb_hash_list) { + ASSERT(btp == bp->pb_target); + if (!(bp->pb_flags & PBF_FS_MANAGED)) { + spin_unlock(&hash->bh_lock); + delay(100); + goto again; + } + } + spin_unlock(&hash->bh_lock); + } +} + +/* + * Allocate buffer hash table for a given target. + * For devices containing metadata (i.e. not the log/realtime devices) + * we need to allocate a much larger hash table. + */ +STATIC void +xfs_alloc_bufhash( + xfs_buftarg_t *btp, + int external) +{ + unsigned int i; + + btp->bt_hashshift = external ? 3 : 8; /* 8 or 256 buckets */ + btp->bt_hashmask = (1 << btp->bt_hashshift) - 1; + btp->bt_hash = kmem_zalloc((1 << btp->bt_hashshift) * + sizeof(xfs_bufhash_t), KM_SLEEP); + for (i = 0; i < (1 << btp->bt_hashshift); i++) { + spin_lock_init(&btp->bt_hash[i].bh_lock); + INIT_LIST_HEAD(&btp->bt_hash[i].bh_list); + } +} + +STATIC void +xfs_free_bufhash( + xfs_buftarg_t *btp) +{ + kmem_free(btp->bt_hash, + (1 << btp->bt_hashshift) * sizeof(xfs_bufhash_t)); + btp->bt_hash = NULL; +} + void xfs_free_buftarg( xfs_buftarg_t *btp, @@ -1481,6 +1556,7 @@ xfs_free_buftarg( xfs_flush_buftarg(btp, 1); if (external) xfs_blkdev_put(btp->pbr_bdev); + xfs_free_bufhash(btp); iput(btp->pbr_mapping->host); kmem_free(btp, sizeof(*btp)); } @@ -1495,11 +1571,12 @@ xfs_incore_relse( truncate_inode_pages(btp->pbr_mapping, 0LL); } -int -xfs_setsize_buftarg( +STATIC int +xfs_setsize_buftarg_flags( xfs_buftarg_t *btp, unsigned int blocksize, - unsigned int sectorsize) + unsigned int sectorsize, + int verbose) { btp->pbr_bsize = blocksize; btp->pbr_sshift = ffs(sectorsize) - 1; @@ -1511,9 +1588,42 @@ xfs_setsize_buftarg( sectorsize, XFS_BUFTARG_NAME(btp)); return EINVAL; } + + if (verbose && + (PAGE_CACHE_SIZE / BITS_PER_LONG) > sectorsize) { + printk(KERN_WARNING + "XFS: %u byte sectors in use on device %s. " + "This is suboptimal; %u or greater is ideal.\n", + sectorsize, XFS_BUFTARG_NAME(btp), + (unsigned int)PAGE_CACHE_SIZE / BITS_PER_LONG); + } + return 0; } +/* +* When allocating the initial buffer target we have not yet +* read in the superblock, so don't know what sized sectors +* are being used is at this early stage. Play safe. +*/ +STATIC int +xfs_setsize_buftarg_early( + xfs_buftarg_t *btp, + struct block_device *bdev) +{ + return xfs_setsize_buftarg_flags(btp, + PAGE_CACHE_SIZE, bdev_hardsect_size(bdev), 0); +} + +int +xfs_setsize_buftarg( + xfs_buftarg_t *btp, + unsigned int blocksize, + unsigned int sectorsize) +{ + return xfs_setsize_buftarg_flags(btp, blocksize, sectorsize, 1); +} + STATIC int xfs_mapping_buftarg( xfs_buftarg_t *btp, @@ -1542,14 +1652,15 @@ xfs_mapping_buftarg( mapping = &inode->i_data; mapping->a_ops = &mapping_aops; mapping->backing_dev_info = bdi; - mapping_set_gfp_mask(mapping, GFP_KERNEL); + mapping_set_gfp_mask(mapping, GFP_NOFS); btp->pbr_mapping = mapping; return 0; } xfs_buftarg_t * xfs_alloc_buftarg( - struct block_device *bdev) + struct block_device *bdev, + int external) { xfs_buftarg_t *btp; @@ -1557,10 +1668,11 @@ xfs_alloc_buftarg( btp->pbr_dev = bdev->bd_dev; btp->pbr_bdev = bdev; - if (xfs_setsize_buftarg(btp, PAGE_CACHE_SIZE, bdev_hardsect_size(bdev))) + if (xfs_setsize_buftarg_early(btp, bdev)) goto error; if (xfs_mapping_buftarg(btp, bdev)) goto error; + xfs_alloc_bufhash(btp, external); return btp; error: @@ -1574,7 +1686,7 @@ error: */ STATIC LIST_HEAD(pbd_delwrite_queue); -STATIC spinlock_t pbd_delwrite_lock = SPIN_LOCK_UNLOCKED; +STATIC DEFINE_SPINLOCK(pbd_delwrite_lock); STATIC void pagebuf_delwri_queue( @@ -1665,9 +1777,7 @@ pagebuf_daemon( INIT_LIST_HEAD(&tmp); do { - /* swsusp */ - if (current->flags & PF_FREEZE) - refrigerator(PF_FREEZE); + try_to_freeze(PF_FREEZE); set_current_state(TASK_INTERRUPTIBLE); schedule_timeout((xfs_buf_timer_centisecs * HZ) / 100); @@ -1826,8 +1936,6 @@ pagebuf_daemon_stop(void) int __init pagebuf_init(void) { - int i; - pagebuf_cache = kmem_cache_create("xfs_buf_t", sizeof(xfs_buf_t), 0, SLAB_HWCACHE_ALIGN, NULL, NULL); if (pagebuf_cache == NULL) { @@ -1848,11 +1956,6 @@ pagebuf_init(void) return -ENOMEM; } - for (i = 0; i < NHASH; i++) { - spin_lock_init(&pbhash[i].pb_hash_lock); - INIT_LIST_HEAD(&pbhash[i].pb_hash); - } - return 0; }