diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index d1f54a9ad..470e17e60 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -21,7 +21,7 @@
 #include <linux/slab.h>
 #include <linux/raid/raid5.h>
 #include <linux/highmem.h>
-#include <asm/bitops.h>
+#include <linux/bitops.h>
 #include <asm/atomic.h>
 
 /*
@@ -49,14 +49,14 @@
  * This macro is used to determine the 'next' bio in the list, given the sector
  * of the current stripe+device
  */
-#define r5_next_bio(bio, sect) ( ( bio->bi_sector + (bio->bi_size>>9) < sect + STRIPE_SECTORS) ? bio->bi_next : NULL)
+#define r5_next_bio(bio, sect) ( ( (bio)->bi_sector + ((bio)->bi_size>>9) < sect + STRIPE_SECTORS) ? (bio)->bi_next : NULL)
 /*
  * The following can be used to debug the driver
  */
 #define RAID5_DEBUG	0
 #define RAID5_PARANOIA	1
 #if RAID5_PARANOIA && defined(CONFIG_SMP)
-# define CHECK_DEVLOCK() if (!spin_is_locked(&conf->device_lock)) BUG()
+# define CHECK_DEVLOCK() assert_spin_locked(&conf->device_lock)
 #else
 # define CHECK_DEVLOCK()
 #endif
@@ -232,6 +232,7 @@ static struct stripe_head *__find_stripe(raid5_conf_t *conf, sector_t sector)
 }
 
 static void unplug_slaves(mddev_t *mddev);
+static void raid5_unplug_device(request_queue_t *q);
 
 static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector,
					     int pd_idx, int noblock)
@@ -302,7 +303,7 @@ static int grow_stripes(raid5_conf_t *conf, int num)
 			return 1;
 		memset(sh, 0, sizeof(*sh) + (devs-1)*sizeof(struct r5dev));
 		sh->raid_conf = conf;
-		sh->lock = SPIN_LOCK_UNLOCKED;
+		spin_lock_init(&sh->lock);
 
 		if (grow_buffers(sh, conf->raid_disks)) {
 			shrink_buffers(sh, conf->raid_disks);
@@ -457,6 +458,7 @@ static void raid5_build_block (struct stripe_head *sh, int i)
 	bio_init(&dev->req);
 	dev->req.bi_io_vec = &dev->vec;
 	dev->req.bi_vcnt++;
+	dev->req.bi_max_vecs++;
 	dev->vec.bv_page = dev->page;
 	dev->vec.bv_len = STRIPE_SIZE;
 	dev->vec.bv_offset = 0;
@@ -612,11 +614,10 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i)
 
 
 /*
- * Copy data between a page in the stripe cache, and one or more bion
- * The page could align with the middle of the bio, or there could be
- * several bion, each with several bio_vecs, which cover part of the page
- * Multiple bion are linked together on bi_next. There may be extras
- * at the end of this list. We ignore them.
+ * Copy data between a page in the stripe cache, and a bio.
+ * There are no alignment or size guarantees between the page or the
+ * bio except that there is some overlap.
+ * All iovecs in the bio must be considered.
 */
 static void copy_data(int frombio, struct bio *bio,
@@ -625,41 +626,38 @@ static void copy_data(int frombio, struct bio *bio,
 	char *pa = page_address(page);
 	struct bio_vec *bvl;
 	int i;
+	int page_offset;
 
-	for (;bio && bio->bi_sector < sector+STRIPE_SECTORS;
-	      bio = r5_next_bio(bio, sector) ) {
-		int page_offset;
-		if (bio->bi_sector >= sector)
-			page_offset = (signed)(bio->bi_sector - sector) * 512;
-		else
-			page_offset = (signed)(sector - bio->bi_sector) * -512;
-		bio_for_each_segment(bvl, bio, i) {
-			int len = bio_iovec_idx(bio,i)->bv_len;
-			int clen;
-			int b_offset = 0;
-
-			if (page_offset < 0) {
-				b_offset = -page_offset;
-				page_offset += b_offset;
-				len -= b_offset;
-			}
+	if (bio->bi_sector >= sector)
+		page_offset = (signed)(bio->bi_sector - sector) * 512;
+	else
+		page_offset = (signed)(sector - bio->bi_sector) * -512;
+	bio_for_each_segment(bvl, bio, i) {
+		int len = bio_iovec_idx(bio,i)->bv_len;
+		int clen;
+		int b_offset = 0;
+
+		if (page_offset < 0) {
+			b_offset = -page_offset;
+			page_offset += b_offset;
+			len -= b_offset;
+		}
 
-			if (len > 0 && page_offset + len > STRIPE_SIZE)
-				clen = STRIPE_SIZE - page_offset;
-			else clen = len;
+		if (len > 0 && page_offset + len > STRIPE_SIZE)
+			clen = STRIPE_SIZE - page_offset;
+		else clen = len;
 
-			if (clen > 0) {
-				char *ba = __bio_kmap_atomic(bio, i, KM_USER0);
-				if (frombio)
-					memcpy(pa+page_offset, ba+b_offset, clen);
-				else
-					memcpy(ba+b_offset, pa+page_offset, clen);
-				__bio_kunmap_atomic(ba, KM_USER0);
-			}
-			if (clen < len) /* hit end of page */
-				break;
-			page_offset += len;
+		if (clen > 0) {
+			char *ba = __bio_kmap_atomic(bio, i, KM_USER0);
+			if (frombio)
+				memcpy(pa+page_offset, ba+b_offset, clen);
+			else
+				memcpy(ba+b_offset, pa+page_offset, clen);
+			__bio_kunmap_atomic(ba, KM_USER0);
 		}
+		if (clen < len) /* hit end of page */
+			break;
+		page_offset += len;
 	}
 }
 
@@ -725,6 +723,10 @@ static void compute_parity(struct stripe_head *sh, int method)
 				ptr[count++] = page_address(sh->dev[i].page);
 				chosen = sh->dev[i].towrite;
 				sh->dev[i].towrite = NULL;
+
+				if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
+					wake_up(&conf->wait_for_overlap);
+
 				if (sh->dev[i].written) BUG();
 				sh->dev[i].written = chosen;
 				check_xor();
@@ -737,6 +739,10 @@
 		if (i!=pd_idx && sh->dev[i].towrite) {
 			chosen = sh->dev[i].towrite;
 			sh->dev[i].towrite = NULL;
+
+			if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
+				wake_up(&conf->wait_for_overlap);
+
 			if (sh->dev[i].written) BUG();
 			sh->dev[i].written = chosen;
 		}
@@ -793,7 +799,7 @@
  * toread/towrite point to the first in a chain.
  * The bi_next chain must be in order.
 */
-static void add_stripe_bio (struct stripe_head *sh, struct bio *bi, int dd_idx, int forwrite)
+static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, int forwrite)
 {
 	struct bio **bip;
 	raid5_conf_t *conf = sh->raid_conf;
@@ -810,10 +816,13 @@ static void add_stripe_bio (struct stripe_head *sh, struct bio *bi, int dd_idx,
 	else
 		bip = &sh->dev[dd_idx].toread;
 	while (*bip && (*bip)->bi_sector < bi->bi_sector) {
-		BUG_ON((*bip)->bi_sector + ((*bip)->bi_size >> 9) > bi->bi_sector);
+		if ((*bip)->bi_sector + ((*bip)->bi_size >> 9) > bi->bi_sector)
+			goto overlap;
 		bip = & (*bip)->bi_next;
 	}
-/* FIXME do I need to worry about overlapping bion */
+	if (*bip && (*bip)->bi_sector < bi->bi_sector + ((bi->bi_size)>>9))
+		goto overlap;
+
 	if (*bip && bi->bi_next && (*bip) != bi->bi_next)
 		BUG();
 	if (*bip)
@@ -828,7 +837,7 @@ static void add_stripe_bio (struct stripe_head *sh, struct bio *bi, int dd_idx,
 		(unsigned long long)sh->sector, dd_idx);
 
 	if (forwrite) {
-		/* check if page is coverred */
+		/* check if page is covered */
 		sector_t sector = sh->dev[dd_idx].sector;
 		for (bi=sh->dev[dd_idx].towrite;
 		     sector < sh->dev[dd_idx].sector + STRIPE_SECTORS &&
@@ -840,6 +849,13 @@
 		if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS)
 			set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags);
 	}
+	return 1;
+
+ overlap:
+	set_bit(R5_Overlap, &sh->dev[dd_idx].flags);
+	spin_unlock_irq(&conf->device_lock);
+	spin_unlock(&sh->lock);
+	return 0;
 }
 
@@ -900,6 +916,8 @@ static void handle_stripe(struct stripe_head *sh)
 			spin_lock_irq(&conf->device_lock);
 			rbi = dev->toread;
 			dev->toread = NULL;
+			if (test_and_clear_bit(R5_Overlap, &dev->flags))
+				wake_up(&conf->wait_for_overlap);
 			spin_unlock_irq(&conf->device_lock);
 			while (rbi && rbi->bi_sector < dev->sector + STRIPE_SECTORS) {
 				copy_data(0, rbi, dev->page, dev->sector);
@@ -947,6 +965,9 @@
 			sh->dev[i].towrite = NULL;
 			if (bi) to_write--;
+			if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
+				wake_up(&conf->wait_for_overlap);
+
 			while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS){
 				struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector);
 				clear_bit(BIO_UPTODATE, &bi->bi_flags);
@@ -975,6 +996,8 @@
 		if (!test_bit(R5_Insync, &sh->dev[i].flags)) {
 			bi = sh->dev[i].toread;
 			sh->dev[i].toread = NULL;
+			if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
+				wake_up(&conf->wait_for_overlap);
 			if (bi) to_read--;
 			while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS){
 				struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector);
@@ -1247,13 +1270,13 @@
 			else
 				bi->bi_end_io = raid5_end_read_request;
 
-			spin_lock_irq(&conf->device_lock);
+			rcu_read_lock();
 			rdev = conf->disks[i].rdev;
 			if (rdev && rdev->faulty)
 				rdev = NULL;
 			if (rdev)
 				atomic_inc(&rdev->nr_pending);
-			spin_unlock_irq(&conf->device_lock);
+			rcu_read_unlock();
 
 			if (rdev) {
 				if (test_bit(R5_Syncio, &sh->dev[i].flags))
@@ -1266,6 +1289,7 @@
 				bi->bi_sector = sh->sector + rdev->data_offset;
 				bi->bi_flags = 1 << BIO_UPTODATE;
 				bi->bi_vcnt = 1;
+				bi->bi_max_vecs = 1;
 				bi->bi_idx = 0;
 				bi->bi_io_vec = &sh->dev[i].vec;
 				bi->bi_io_vec[0].bv_len = STRIPE_SIZE;
@@ -1302,25 +1326,24 @@
 static void unplug_slaves(mddev_t *mddev)
 {
 	raid5_conf_t *conf = mddev_to_conf(mddev);
 	int i;
-	unsigned long flags;
 
-	spin_lock_irqsave(&conf->device_lock, flags);
+	rcu_read_lock();
 	for (i=0; i<conf->raid_disks; i++) {
 		mdk_rdev_t *rdev = conf->disks[i].rdev;
-		if (rdev && atomic_read(&rdev->nr_pending)) {
+		if (rdev && !rdev->faulty && atomic_read(&rdev->nr_pending)) {
 			request_queue_t *r_queue = bdev_get_queue(rdev->bdev);
 
 			atomic_inc(&rdev->nr_pending);
-			spin_unlock_irqrestore(&conf->device_lock, flags);
+			rcu_read_unlock();
 
-			if (r_queue && r_queue->unplug_fn)
+			if (r_queue->unplug_fn)
 				r_queue->unplug_fn(r_queue);
 
-			spin_lock_irqsave(&conf->device_lock, flags);
-			atomic_dec(&rdev->nr_pending);
+			rdev_dec_pending(rdev, mddev);
+			rcu_read_lock();
 		}
 	}
-	spin_unlock_irqrestore(&conf->device_lock, flags);
+	rcu_read_unlock();
 }
 
 static void raid5_unplug_device(request_queue_t *q)
@@ -1347,29 +1370,26 @@ static int raid5_issue_flush(request_queue_t *q, struct gendisk *disk,
 	raid5_conf_t *conf = mddev_to_conf(mddev);
 	int i, ret = 0;
 
-	for (i=0; i<conf->raid_disks; i++) {
+	rcu_read_lock();
+	for (i=0; i<conf->raid_disks && ret == 0; i++) {
 		mdk_rdev_t *rdev = conf->disks[i].rdev;
 		if (rdev && !rdev->faulty) {
 			struct block_device *bdev = rdev->bdev;
-			request_queue_t *r_queue;
-
-			if (!bdev)
-				continue;
+			request_queue_t *r_queue = bdev_get_queue(bdev);
 
-			r_queue = bdev_get_queue(bdev);
-			if (!r_queue)
-				continue;
-
-			if (!r_queue->issue_flush_fn) {
+			if (!r_queue->issue_flush_fn)
 				ret = -EOPNOTSUPP;
-				break;
+			else {
+				atomic_inc(&rdev->nr_pending);
+				rcu_read_unlock();
+				ret = r_queue->issue_flush_fn(r_queue, bdev->bd_disk,
+							      error_sector);
+				rdev_dec_pending(rdev, mddev);
+				rcu_read_lock();
 			}
-
-			ret = r_queue->issue_flush_fn(r_queue, bdev->bd_disk, error_sector);
-			if (ret)
-				break;
 		}
 	}
+	rcu_read_unlock();
 	return ret;
 }
 
@@ -1406,25 +1426,37 @@ static int make_request (request_queue_t *q, struct bio * bi)
 	if ( bio_data_dir(bi) == WRITE )
 		md_write_start(mddev);
 	for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) {
+		DEFINE_WAIT(w);
 
 		new_sector = raid5_compute_sector(logical_sector,
 						  raid_disks, data_disks, &dd_idx, &pd_idx, conf);
 
-		PRINTK("raid5: make_request, sector %Lu logical %Lu\n",
+		PRINTK("raid5: make_request, sector %llu logical %llu\n",
 			(unsigned long long)new_sector,
 			(unsigned long long)logical_sector);
 
+	retry:
+		prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE);
 		sh = get_active_stripe(conf, new_sector, pd_idx, (bi->bi_rw&RWA_MASK));
 		if (sh) {
-
-			add_stripe_bio(sh, bi, dd_idx, (bi->bi_rw&RW_MASK));
-
+			if (!add_stripe_bio(sh, bi, dd_idx, (bi->bi_rw&RW_MASK))) {
+				/* Add failed due to overlap. Flush everything
+				 * and wait a while
+				 */
+				raid5_unplug_device(mddev->queue);
+				release_stripe(sh);
+				schedule();
+				goto retry;
+			}
+			finish_wait(&conf->wait_for_overlap, &w);
 			raid5_plug_device(conf);
 			handle_stripe(sh);
 			release_stripe(sh);
+
 		} else {
 			/* cannot get stripe for read-ahead, just give-up */
 			clear_bit(BIO_UPTODATE, &bi->bi_flags);
+			finish_wait(&conf->wait_for_overlap, &w);
 			break;
 		}
@@ -1570,8 +1602,9 @@ static int run (mddev_t *mddev)
 		goto abort;
 	memset(conf->stripe_hashtbl, 0, HASH_PAGES * PAGE_SIZE);
 
-	conf->device_lock = SPIN_LOCK_UNLOCKED;
+	spin_lock_init(&conf->device_lock);
 	init_waitqueue_head(&conf->wait_for_stripe);
+	init_waitqueue_head(&conf->wait_for_overlap);
 	INIT_LIST_HEAD(&conf->handle_list);
 	INIT_LIST_HEAD(&conf->delayed_list);
 	INIT_LIST_HEAD(&conf->inactive_list);
@@ -1711,6 +1744,7 @@ static int stop (mddev_t *mddev)
 	mddev->thread = NULL;
 	shrink_stripes(conf);
 	free_pages((unsigned long) conf->stripe_hashtbl, HASH_PAGES_ORDER);
+	blk_sync_queue(mddev->queue); /* the unplug fn references 'conf' */
 	kfree(conf);
 	mddev->private = NULL;
 	return 0;
@@ -1799,7 +1833,6 @@ static int raid5_spare_active(mddev_t *mddev)
 	raid5_conf_t *conf = mddev->private;
 	struct disk_info *tmp;
 
-	spin_lock_irq(&conf->device_lock);
 	for (i = 0; i < conf->raid_disks; i++) {
 		tmp = conf->disks + i;
 		if (tmp->rdev
@@ -1811,7 +1844,6 @@
 			tmp->rdev->in_sync = 1;
 		}
 	}
-	spin_unlock_irq(&conf->device_lock);
 	print_raid5_conf(conf);
 	return 0;
 }
@@ -1819,25 +1851,28 @@
 static int raid5_remove_disk(mddev_t *mddev, int number)
 {
 	raid5_conf_t *conf = mddev->private;
-	int err = 1;
+	int err = 0;
+	mdk_rdev_t *rdev;
 	struct disk_info *p = conf->disks + number;
 
 	print_raid5_conf(conf);
-	spin_lock_irq(&conf->device_lock);
-
-	if (p->rdev) {
-		if (p->rdev->in_sync ||
-		    atomic_read(&p->rdev->nr_pending)) {
+	rdev = p->rdev;
+	if (rdev) {
+		if (rdev->in_sync ||
+		    atomic_read(&rdev->nr_pending)) {
 			err = -EBUSY;
 			goto abort;
 		}
 		p->rdev = NULL;
-		err = 0;
+		synchronize_kernel();
+		if (atomic_read(&rdev->nr_pending)) {
+			/* lost the race, try later */
+			err = -EBUSY;
+			p->rdev = rdev;
+		}
 	}
-	if (err)
-		MD_BUG();
 abort:
-	spin_unlock_irq(&conf->device_lock);
+
 	print_raid5_conf(conf);
 	return err;
 }
@@ -1849,19 +1884,17 @@ static int raid5_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
 	int disk;
 	struct disk_info *p;
 
-	spin_lock_irq(&conf->device_lock);
 	/*
	 * find the disk ...
	 */
 	for (disk=0; disk < mddev->raid_disks; disk++)
 		if ((p=conf->disks + disk)->rdev == NULL) {
-			p->rdev = rdev;
 			rdev->in_sync = 0;
 			rdev->raid_disk = disk;
 			found = 1;
+			p->rdev = rdev;
 			break;
 		}
-	spin_unlock_irq(&conf->device_lock);
 	print_raid5_conf(conf);
 	return found;
 }
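
Note: the heart of this patch is the new overlap handling. add_stripe_bio() now returns 0 instead of tripping BUG_ON() when a bio overlaps one already queued on the same stripe device; make_request() then sets R5_Overlap, unplugs the queue, releases the stripe, sleeps on conf->wait_for_overlap, and retries after the conflicting request retires (the wake_up() calls added to compute_parity() and handle_stripe()). Below is a minimal userspace sketch of that sleep-and-retry idiom using a pthread condition variable; every name in it is a hypothetical stand-in, and the kernel's prepare_to_wait()/schedule()/finish_wait() sequence is approximated by pthread_cond_wait().

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

/* Hypothetical stand-ins for conf->device_lock, conf->wait_for_overlap and
 * an R5_Overlap conflict: one "region" that at most one request may own. */
static pthread_mutex_t device_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t wait_for_overlap = PTHREAD_COND_INITIALIZER;
static bool region_busy;

/* Analogue of the make_request() retry loop: try to queue the bio; if it
 * overlaps an in-flight request, sleep until a completion wakes us, retry. */
static void submit_request(int id)
{
	pthread_mutex_lock(&device_lock);
	while (region_busy)		/* add_stripe_bio() returned 0 */
		pthread_cond_wait(&wait_for_overlap, &device_lock);
	region_busy = true;		/* add_stripe_bio() returned 1 */
	pthread_mutex_unlock(&device_lock);
	printf("request %d queued\n", id);
}

/* Analogue of the wake_up(&conf->wait_for_overlap) calls this patch adds
 * to compute_parity() and handle_stripe() when a queued bio retires. */
static void complete_request(int id)
{
	pthread_mutex_lock(&device_lock);
	region_busy = false;
	pthread_cond_broadcast(&wait_for_overlap);
	pthread_mutex_unlock(&device_lock);
	printf("request %d completed\n", id);
}

int main(void)
{
	submit_request(1);
	complete_request(1);
	submit_request(2);	/* would have slept had request 1 still been queued */
	complete_request(2);
	return 0;
}

In the kernel the waiter is registered *before* add_stripe_bio() is attempted (prepare_to_wait() comes first) so a wake-up racing with the failed add cannot be lost; pthread_cond_wait() gets the same guarantee by atomically releasing the mutex it was called with.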
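Note: the other recurring change replaces device_lock protection of conf->disks[i].rdev with RCU. Readers such as unplug_slaves() and raid5_issue_flush() locate the rdev inside rcu_read_lock(), pin it by bumping nr_pending before dropping the read-side lock, and release it with rdev_dec_pending(); raid5_remove_disk() clears the pointer, waits a grace period with synchronize_kernel() (the 2.6.11-era name for what later became synchronize_rcu()), then re-checks nr_pending to catch a racing reader. Below is a minimal userspace sketch of the same unpublish-then-recheck pattern, assuming the liburcu library (compile with -lurcu); every name in it is hypothetical:

#include <urcu.h>		/* liburcu: rcu_read_lock(), synchronize_rcu(), ... */
#include <stdatomic.h>
#include <stdlib.h>

struct rdev {
	atomic_int nr_pending;	/* stands in for rdev->nr_pending */
};

static struct rdev *slot;	/* stands in for conf->disks[i].rdev */

/* Reader side, as in unplug_slaves(): find the device under the RCU read
 * lock, pin it before doing anything that may block, unpin when done. */
static void use_device(void)
{
	struct rdev *r;

	rcu_read_lock();
	r = rcu_dereference(slot);
	if (r)
		atomic_fetch_add(&r->nr_pending, 1);	/* pin: device stays valid */
	rcu_read_unlock();

	if (r) {
		/* ... issue I/O without any lock held ... */
		atomic_fetch_sub(&r->nr_pending, 1);	/* rdev_dec_pending() analogue */
	}
}

/* Remover side, as in raid5_remove_disk(): unpublish the pointer, wait a
 * grace period, then re-check the pin count for a reader that raced us. */
static int remove_device(void)
{
	struct rdev *r = slot;

	if (!r || atomic_load(&r->nr_pending))
		return -1;			/* -EBUSY */
	rcu_assign_pointer(slot, NULL);
	synchronize_rcu();			/* all pre-existing readers are done */
	if (atomic_load(&r->nr_pending)) {
		rcu_assign_pointer(slot, r);	/* lost the race, try later */
		return -1;
	}
	free(r);
	return 0;
}

int main(void)
{
	rcu_register_thread();			/* liburcu readers must register */
	slot = malloc(sizeof(*slot));
	atomic_init(&slot->nr_pending, 0);
	use_device();
	remove_device();
	rcu_unregister_thread();
	return 0;
}

The second nr_pending check after the grace period is what lets the remover restore the pointer and return -EBUSY instead of freeing a device a reader pinned just before the pointer was cleared.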