X-Git-Url: http://git.onelab.eu/?a=blobdiff_plain;f=drivers%2Fmd%2Fraid1.c;h=060c6a28c86c3ae5acb9325653886b63d73eae24;hb=6a77f38946aaee1cd85eeec6cf4229b204c15071;hp=1b80c5b68f54fcff58194253abb8d237bbcee658;hpb=9213980e6a70d8473e0ffd4b39ab5b6caaba9ff5;p=linux-2.6.git

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 1b80c5b68..060c6a28c 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -24,18 +24,12 @@
 #include <linux/raid/raid1.h>
 
-#define MAJOR_NR MD_MAJOR
-#define MD_DRIVER
-#define MD_PERSONALITY
-
 /*
  * Number of guaranteed r1bios in case of extreme VM load:
  */
 #define NR_RAID1_BIOS 256
 
 static mdk_personality_t raid1_personality;
-static spinlock_t retry_list_lock = SPIN_LOCK_UNLOCKED;
-static LIST_HEAD(retry_list_head);
 
 static void unplug_slaves(mddev_t *mddev);
 
@@ -44,13 +38,12 @@ static void * r1bio_pool_alloc(int gfp_flags, void *data)
 {
 	struct pool_info *pi = data;
 	r1bio_t *r1_bio;
+	int size = offsetof(r1bio_t, bios[pi->raid_disks]);
 
 	/* allocate a r1bio with room for raid_disks entries in the bios array */
-	r1_bio = kmalloc(sizeof(r1bio_t) + sizeof(struct bio*)*pi->raid_disks,
-			 gfp_flags);
+	r1_bio = kmalloc(size, gfp_flags);
 	if (r1_bio)
-		memset(r1_bio, 0, sizeof(*r1_bio) +
-		       sizeof(struct bio*) * pi->raid_disks);
+		memset(r1_bio, 0, size);
 	else
 		unplug_slaves(pi->mddev);
 
@@ -104,7 +97,7 @@ static void * r1buf_pool_alloc(int gfp_flags, void *data)
 		bio->bi_io_vec[i].bv_page = page;
 	}
 
-	r1_bio->master_bio = bio;
+	r1_bio->master_bio = NULL;
 
 	return r1_bio;
 
@@ -189,40 +182,15 @@ static inline void put_buf(r1bio_t *r1_bio)
 	spin_unlock_irqrestore(&conf->resync_lock, flags);
 }
 
-static int map(mddev_t *mddev, mdk_rdev_t **rdevp)
-{
-	conf_t *conf = mddev_to_conf(mddev);
-	int i, disks = conf->raid_disks;
-
-	/*
-	 * Later we do read balancing on the read side
-	 * now we use the first available disk.
-	 */
-
-	spin_lock_irq(&conf->device_lock);
-	for (i = 0; i < disks; i++) {
-		mdk_rdev_t *rdev = conf->mirrors[i].rdev;
-		if (rdev && rdev->in_sync) {
-			*rdevp = rdev;
-			atomic_inc(&rdev->nr_pending);
-			spin_unlock_irq(&conf->device_lock);
-			return 0;
-		}
-	}
-	spin_unlock_irq(&conf->device_lock);
-
-	printk(KERN_ERR "raid1_map(): huh, no more operational devices?\n");
-	return -1;
-}
-
 static void reschedule_retry(r1bio_t *r1_bio)
 {
 	unsigned long flags;
 	mddev_t *mddev = r1_bio->mddev;
+	conf_t *conf = mddev_to_conf(mddev);
 
-	spin_lock_irqsave(&retry_list_lock, flags);
-	list_add(&r1_bio->retry_list, &retry_list_head);
-	spin_unlock_irqrestore(&retry_list_lock, flags);
+	spin_lock_irqsave(&conf->device_lock, flags);
+	list_add(&r1_bio->retry_list, &conf->retry_list);
+	spin_unlock_irqrestore(&conf->device_lock, flags);
 
 	md_wakeup_thread(mddev->thread);
 }
@@ -292,8 +260,9 @@ static int raid1_end_read_request(struct bio *bio, unsigned int bytes_done, int
 		 * oops, read error:
 		 */
 		char b[BDEVNAME_SIZE];
-		printk(KERN_ERR "raid1: %s: rescheduling sector %llu\n",
-		       bdevname(conf->mirrors[mirror].rdev->bdev,b), (unsigned long long)r1_bio->sector);
+		if (printk_ratelimit())
+			printk(KERN_ERR "raid1: %s: rescheduling sector %llu\n",
+			       bdevname(conf->mirrors[mirror].rdev->bdev,b), (unsigned long long)r1_bio->sector);
 		reschedule_retry(r1_bio);
 	}
 
@@ -363,28 +332,29 @@ static int raid1_end_write_request(struct bio *bio, unsigned int bytes_done, int
  *
  * The rdev for the device selected will have nr_pending incremented.
  */
-static int read_balance(conf_t *conf, struct bio *bio, r1bio_t *r1_bio)
+static int read_balance(conf_t *conf, r1bio_t *r1_bio)
 {
 	const unsigned long this_sector = r1_bio->sector;
 	int new_disk = conf->last_used, disk = new_disk;
-	const int sectors = bio->bi_size >> 9;
+	const int sectors = r1_bio->sectors;
 	sector_t new_distance, current_distance;
 
-	spin_lock_irq(&conf->device_lock);
+	rcu_read_lock();
 	/*
 	 * Check if we can balance. We can balance on the whole
 	 * device if no resync is going on, or below the resync window.
 	 * We take the first readable disk when above the resync window.
 	 */
-	if (!conf->mddev->in_sync && (this_sector + sectors >= conf->next_resync)) {
-		/* make sure that disk is operational */
+	if (conf->mddev->recovery_cp < MaxSector &&
+	    (this_sector + sectors >= conf->next_resync)) {
+		/* Choose the first operational device, for consistency */
 		new_disk = 0;
 		while (!conf->mirrors[new_disk].rdev ||
 		       !conf->mirrors[new_disk].rdev->in_sync) {
 			new_disk++;
 			if (new_disk == conf->raid_disks) {
-				new_disk = 0;
+				new_disk = -1;
 				break;
 			}
 		}
@@ -399,7 +369,7 @@ static int read_balance(conf_t *conf, struct bio *bio, r1bio_t *r1_bio)
 			new_disk = conf->raid_disks;
 		new_disk--;
 		if (new_disk == disk) {
-			new_disk = conf->last_used;
+			new_disk = -1;
 			goto rb_out;
 		}
 	}
@@ -439,14 +409,14 @@ static int read_balance(conf_t *conf, struct bio *bio, r1bio_t *r1_bio)
 	} while (disk != conf->last_used);
 
 rb_out:
-	r1_bio->read_disk = new_disk;
-	conf->next_seq_sect = this_sector + sectors;
-	conf->last_used = new_disk;
-	if (conf->mirrors[new_disk].rdev)
+	if (new_disk >= 0) {
+		conf->next_seq_sect = this_sector + sectors;
+		conf->last_used = new_disk;
 		atomic_inc(&conf->mirrors[new_disk].rdev->nr_pending);
-	spin_unlock_irq(&conf->device_lock);
+	}
+	rcu_read_unlock();
 
 	return new_disk;
 }
@@ -455,31 +425,61 @@ static void unplug_slaves(mddev_t *mddev)
 {
 	conf_t *conf = mddev_to_conf(mddev);
 	int i;
-	unsigned long flags;
 
-	spin_lock_irqsave(&conf->device_lock, flags);
+	rcu_read_lock();
 	for (i=0; i<conf->raid_disks; i++) {
 		mdk_rdev_t *rdev = conf->mirrors[i].rdev;
-		if (rdev && atomic_read(&rdev->nr_pending)) {
+		if (rdev && !rdev->faulty && atomic_read(&rdev->nr_pending)) {
 			request_queue_t *r_queue = bdev_get_queue(rdev->bdev);
 
 			atomic_inc(&rdev->nr_pending);
-			spin_unlock_irqrestore(&conf->device_lock, flags);
+			rcu_read_unlock();
 
 			if (r_queue->unplug_fn)
 				r_queue->unplug_fn(r_queue);
 
-			spin_lock_irqsave(&conf->device_lock, flags);
-			atomic_dec(&rdev->nr_pending);
+			rdev_dec_pending(rdev, mddev);
+			rcu_read_lock();
 		}
 	}
-	spin_unlock_irqrestore(&conf->device_lock, flags);
+	rcu_read_unlock();
 }
+
 static void raid1_unplug(request_queue_t *q)
 {
 	unplug_slaves(q->queuedata);
 }
 
+static int raid1_issue_flush(request_queue_t *q, struct gendisk *disk,
+			     sector_t *error_sector)
+{
+	mddev_t *mddev = q->queuedata;
+	conf_t *conf = mddev_to_conf(mddev);
+	int i, ret = 0;
+
+	rcu_read_lock();
+	for (i=0; i<conf->raid_disks && ret == 0; i++) {
+		mdk_rdev_t *rdev = conf->mirrors[i].rdev;
+		if (rdev && !rdev->faulty) {
+			struct block_device *bdev = rdev->bdev;
+			request_queue_t *r_queue = bdev_get_queue(bdev);
+
+			if (!r_queue->issue_flush_fn)
+				ret = -EOPNOTSUPP;
+			else {
+				atomic_inc(&rdev->nr_pending);
+				rcu_read_unlock();
+				ret = r_queue->issue_flush_fn(r_queue, bdev->bd_disk,
+							      error_sector);
+				rdev_dec_pending(rdev, mddev);
+				rcu_read_lock();
+			}
+		}
+	}
+	rcu_read_unlock();
+	return ret;
+}
+
 /*
  * Throttle resync depth, so that we can both get proper overlapping of
  * requests, but are still able to handle normal requests quickly.
@@ -544,15 +544,26 @@ static int make_request(request_queue_t *q, struct bio * bio)
 
 	r1_bio->mddev = mddev;
 	r1_bio->sector = bio->bi_sector;
+	r1_bio->state = 0;
+
 	if (bio_data_dir(bio) == READ) {
 		/*
 		 * read balancing logic:
 		 */
-		mirror = conf->mirrors + read_balance(conf, bio, r1_bio);
+		int rdisk = read_balance(conf, r1_bio);
+
+		if (rdisk < 0) {
+			/* couldn't find anywhere to read from */
+			raid_end_bio_io(r1_bio);
+			return 0;
+		}
+		mirror = conf->mirrors + rdisk;
+
+		r1_bio->read_disk = rdisk;
 
 		read_bio = bio_clone(bio, GFP_NOIO);
 
-		r1_bio->bios[r1_bio->read_disk] = read_bio;
+		r1_bio->bios[rdisk] = read_bio;
 
 		read_bio->bi_sector = r1_bio->sector + mirror->rdev->data_offset;
 		read_bio->bi_bdev = mirror->rdev->bdev;
@@ -572,7 +583,7 @@ static int make_request(request_queue_t *q, struct bio * bio)
 	 * bios[x] to bio
 	 */
 	disks = conf->raid_disks;
-	spin_lock_irq(&conf->device_lock);
+	rcu_read_lock();
 	for (i = 0; i < disks; i++) {
 		if (conf->mirrors[i].rdev &&
 		    !conf->mirrors[i].rdev->faulty) {
@@ -581,7 +592,7 @@ static int make_request(request_queue_t *q, struct bio * bio)
 		} else
 			r1_bio->bios[i] = NULL;
 	}
-	spin_unlock_irq(&conf->device_lock);
+	rcu_read_unlock();
 
 	atomic_set(&r1_bio->remaining, 1);
 	md_write_start(mddev);
@@ -703,7 +714,6 @@ static int raid1_spare_active(mddev_t *mddev)
 	conf_t *conf = mddev->private;
 	mirror_info_t *tmp;
 
-	spin_lock_irq(&conf->device_lock);
 	/*
 	 * Find all failed disks within the RAID1 configuration
 	 * and mark them readable
@@ -718,7 +728,6 @@ static int raid1_spare_active(mddev_t *mddev)
 			tmp->rdev->in_sync = 1;
 		}
 	}
-	spin_unlock_irq(&conf->device_lock);
 
 	print_conf(conf);
 	return 0;
@@ -732,10 +741,8 @@ static int raid1_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
 	int mirror;
 	mirror_info_t *p;
 
-	spin_lock_irq(&conf->device_lock);
 	for (mirror=0; mirror < mddev->raid_disks; mirror++)
 		if ( !(p=conf->mirrors+mirror)->rdev) {
-			p->rdev = rdev;
 
 			blk_queue_stack_limits(mddev->queue,
 					       rdev->bdev->bd_disk->queue);
@@ -745,14 +752,14 @@ static int raid1_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
 			 */
 			if (rdev->bdev->bd_disk->queue->merge_bvec_fn &&
 			    mddev->queue->max_sectors > (PAGE_SIZE>>9))
-				mddev->queue->max_sectors = (PAGE_SIZE>>9);
+				blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9);
 
 			p->head_position = 0;
 			rdev->raid_disk = mirror;
 			found = 1;
+			p->rdev = rdev;
 			break;
 		}
-	spin_unlock_irq(&conf->device_lock);
 
 	print_conf(conf);
 	return found;
@@ -761,24 +768,27 @@ static int raid1_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
 static int raid1_remove_disk(mddev_t *mddev, int number)
 {
 	conf_t *conf = mddev->private;
-	int err = 1;
+	int err = 0;
+	mdk_rdev_t *rdev;
 	mirror_info_t *p = conf->mirrors+ number;
 
 	print_conf(conf);
-	spin_lock_irq(&conf->device_lock);
-	if (p->rdev) {
-		if (p->rdev->in_sync ||
-		    atomic_read(&p->rdev->nr_pending)) {
+	rdev = p->rdev;
+	if (rdev) {
+		if (rdev->in_sync ||
+		    atomic_read(&rdev->nr_pending)) {
 			err = -EBUSY;
 			goto abort;
 		}
 		p->rdev = NULL;
-		err = 0;
+		synchronize_kernel();
+		if (atomic_read(&rdev->nr_pending)) {
+			/* lost the race, try later */
+			err = -EBUSY;
+			p->rdev = rdev;
+		}
 	}
-	if (err)
-		MD_BUG();
 abort:
-	spin_unlock_irq(&conf->device_lock);
 
 	print_conf(conf);
 	return err;
@@ -876,7 +886,7 @@ static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio)
 		atomic_inc(&conf->mirrors[i].rdev->nr_pending);
 		atomic_inc(&r1_bio->remaining);
-		md_sync_acct(conf->mirrors[i].rdev, wbio->bi_size >> 9);
+		md_sync_acct(conf->mirrors[i].rdev->bdev, wbio->bi_size >> 9);
 
 		generic_make_request(wbio);
 	}
@@ -896,11 +906,11 @@ static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio)
 static void raid1d(mddev_t *mddev)
 {
-	struct list_head *head = &retry_list_head;
 	r1bio_t *r1_bio;
 	struct bio *bio;
 	unsigned long flags;
 	conf_t *conf = mddev_to_conf(mddev);
+	struct list_head *head = &conf->retry_list;
 	int unplug=0;
 	mdk_rdev_t *rdev;
 
@@ -909,40 +919,50 @@ static void raid1d(mddev_t *mddev)
 	for (;;) {
 		char b[BDEVNAME_SIZE];
-		spin_lock_irqsave(&retry_list_lock, flags);
+		spin_lock_irqsave(&conf->device_lock, flags);
 		if (list_empty(head))
 			break;
 		r1_bio = list_entry(head->prev, r1bio_t, retry_list);
 		list_del(head->prev);
-		spin_unlock_irqrestore(&retry_list_lock, flags);
+		spin_unlock_irqrestore(&conf->device_lock, flags);
 
 		mddev = r1_bio->mddev;
 		conf = mddev_to_conf(mddev);
-		bio = r1_bio->master_bio;
 		if (test_bit(R1BIO_IsSync, &r1_bio->state)) {
 			sync_request_write(mddev, r1_bio);
 			unplug = 1;
 		} else {
-			if (map(mddev, &rdev) == -1) {
+			int disk;
+			bio = r1_bio->bios[r1_bio->read_disk];
+			if ((disk=read_balance(conf, r1_bio)) == -1) {
 				printk(KERN_ALERT "raid1: %s: unrecoverable I/O"
 				       " read error for block %llu\n",
 				       bdevname(bio->bi_bdev,b),
 				       (unsigned long long)r1_bio->sector);
 				raid_end_bio_io(r1_bio);
 			} else {
-				printk(KERN_ERR "raid1: %s: redirecting sector %llu to"
-				       " another mirror\n",
-				       bdevname(rdev->bdev,b),
-				       (unsigned long long)r1_bio->sector);
-				bio->bi_bdev = rdev->bdev;
+				r1_bio->bios[r1_bio->read_disk] = NULL;
+				r1_bio->read_disk = disk;
+				bio_put(bio);
+				bio = bio_clone(r1_bio->master_bio, GFP_NOIO);
+				r1_bio->bios[r1_bio->read_disk] = bio;
+				rdev = conf->mirrors[disk].rdev;
+				if (printk_ratelimit())
+					printk(KERN_ERR "raid1: %s: redirecting sector %llu to"
+					       " another mirror\n",
+					       bdevname(rdev->bdev,b),
+					       (unsigned long long)r1_bio->sector);
 				bio->bi_sector = r1_bio->sector + rdev->data_offset;
+				bio->bi_bdev = rdev->bdev;
+				bio->bi_end_io = raid1_end_read_request;
 				bio->bi_rw = READ;
+				bio->bi_private = r1_bio;
 				unplug = 1;
 				generic_make_request(bio);
 			}
 		}
 	}
-	spin_unlock_irqrestore(&retry_list_lock, flags);
+	spin_unlock_irqrestore(&conf->device_lock, flags);
 	if (unplug)
 		unplug_slaves(mddev);
 }
@@ -999,7 +1019,7 @@ static int sync_request(mddev_t *mddev, sector_t sector_nr, int go_faster)
 	 * put in a delay to throttle resync.
 	 */
 	if (!go_faster && waitqueue_active(&conf->wait_resume))
-		schedule_timeout(HZ);
+		msleep_interruptible(1000);
 	device_barrier(conf, sector_nr + RESYNC_SECTORS);
 
 	/*
@@ -1009,7 +1029,7 @@ static int sync_request(mddev_t *mddev, sector_t sector_nr, int go_faster)
 	 */
 	disk = conf->last_used;
 	/* make sure disk is operational */
-	spin_lock_irq(&conf->device_lock);
+
 	while (conf->mirrors[disk].rdev == NULL ||
 	       !conf->mirrors[disk].rdev->in_sync) {
 		if (disk <= 0)
@@ -1020,7 +1040,7 @@ static int sync_request(mddev_t *mddev, sector_t sector_nr, int go_faster)
 	}
 	conf->last_used = disk;
 	atomic_inc(&conf->mirrors[disk].rdev->nr_pending);
-	spin_unlock_irq(&conf->device_lock);
+
 
 	mirror = conf->mirrors + disk;
 
@@ -1073,7 +1093,7 @@ static int sync_request(mddev_t *mddev, sector_t sector_nr, int go_faster)
 		int rv = max_sector - sector_nr;
 		md_done_sync(mddev, rv, 1);
 		put_buf(r1_bio);
-		atomic_dec(&conf->mirrors[disk].rdev->nr_pending);
+		rdev_dec_pending(conf->mirrors[disk].rdev, mddev);
 		return rv;
 	}
 
@@ -1112,7 +1132,7 @@ static int sync_request(mddev_t *mddev, sector_t sector_nr, int go_faster)
 	bio = r1_bio->bios[disk];
 	r1_bio->sectors = nr_sectors;
 
-	md_sync_acct(mirror->rdev, nr_sectors);
+	md_sync_acct(mirror->rdev->bdev, nr_sectors);
 
 	generic_make_request(bio);
 
@@ -1163,6 +1183,7 @@ static int run(mddev_t *mddev)
 
 	mddev->queue->unplug_fn = raid1_unplug;
+	mddev->queue->issue_flush_fn = raid1_issue_flush;
 
 	ITERATE_RDEV(mddev, rdev, tmp) {
 		disk_idx = rdev->raid_disk;
@@ -1181,7 +1202,7 @@ static int run(mddev_t *mddev)
 		 */
 		if (rdev->bdev->bd_disk->queue->merge_bvec_fn &&
 		    mddev->queue->max_sectors > (PAGE_SIZE>>9))
-			mddev->queue->max_sectors = (PAGE_SIZE>>9);
+			blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9);
 
 		disk->head_position = 0;
 		if (!rdev->faulty && rdev->in_sync)
@@ -1189,11 +1210,12 @@ static int run(mddev_t *mddev)
 	}
 	conf->raid_disks = mddev->raid_disks;
 	conf->mddev = mddev;
-	conf->device_lock = SPIN_LOCK_UNLOCKED;
+	spin_lock_init(&conf->device_lock);
+	INIT_LIST_HEAD(&conf->retry_list);
 
 	if (conf->working_disks == 1)
 		mddev->recovery_cp = MaxSector;
 
-	conf->resync_lock = SPIN_LOCK_UNLOCKED;
+	spin_lock_init(&conf->resync_lock);
 	init_waitqueue_head(&conf->wait_idle);
 	init_waitqueue_head(&conf->wait_resume);
 
@@ -1271,6 +1293,7 @@ static int stop(mddev_t *mddev)
 
 	md_unregister_thread(mddev->thread);
 	mddev->thread = NULL;
+	blk_sync_queue(mddev->queue); /* the unplug fn references 'conf' */
 	if (conf->r1bio_pool)
 		mempool_destroy(conf->r1bio_pool);
 	if (conf->mirrors)
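
Three short sketches follow, illustrating the techniques this patch applies. None of this code is from the patch itself; all names in the sketches are invented for illustration.

The r1bio_pool_alloc() hunk sizes the buffer with offsetof(r1bio_t, bios[pi->raid_disks]) instead of hand-adding sizeof(struct bio*) * raid_disks: the r1bio ends in a trailing array of bio pointers, so offsetof over the n-th element yields the header plus exactly n slots in one expression. A minimal standalone C sketch of the same idiom, with "struct item" standing in for r1bio_t:

/* Trailing-array allocation sized via offsetof(), as in the new
 * r1bio_pool_alloc().  struct item is a made-up stand-in. */
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct item {                /* stand-in for r1bio_t */
	long sector;
	int  nslots;
	void *slots[];       /* stand-in for the trailing bios[] array */
};

static struct item *item_alloc(int nslots)
{
	/* header + nslots pointers, mirroring
	 * offsetof(r1bio_t, bios[pi->raid_disks]) in the patch */
	size_t size = offsetof(struct item, slots[nslots]);
	struct item *it = malloc(size);

	if (it) {
		memset(it, 0, size);
		it->nslots = nslots;
	}
	return it;
}

int main(void)
{
	struct item *it = item_alloc(4);

	if (!it)
		return 1;
	printf("allocated %zu bytes for 4 slots\n",
	       offsetof(struct item, slots[4]));
	free(it);
	return 0;
}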
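
Several hunks replace spin_lock_irq(&conf->device_lock) around rdev access with rcu_read_lock(), pinning the device via nr_pending before dropping the lock for blocking work, and raid1_remove_disk() now clears p->rdev, waits a grace period (synchronize_kernel() in this kernel, later renamed synchronize_rcu()), and re-checks nr_pending, restoring the pointer if a reader slipped in. Below is a hedged userspace analogue of that removal race, assuming liburcu for the RCU primitives and C11 atomics for the pending count; remove_dev() and friends are illustrative, not kernel API. Build with something like: gcc -o rm_race rm_race.c -lurcu

/* Userspace analogue of the rdev reference pattern after this patch. */
#include <urcu.h>          /* liburcu: rcu_read_lock(), synchronize_rcu(), ... */
#include <stdatomic.h>
#include <stdio.h>

struct dev {
	atomic_int nr_pending;   /* analogue of rdev->nr_pending */
};

static struct dev *slot;         /* analogue of p->rdev */

/* reader side, like unplug_slaves() after the patch: pin, unlock, work */
static void reader(void)
{
	struct dev *d;

	rcu_read_lock();
	d = rcu_dereference(slot);
	if (d) {
		atomic_fetch_add(&d->nr_pending, 1); /* pin across blocking work */
		rcu_read_unlock();
		/* ... submit I/O, may sleep ... */
		atomic_fetch_sub(&d->nr_pending, 1); /* rdev_dec_pending() analogue */
		return;
	}
	rcu_read_unlock();
}

/* remover side, like raid1_remove_disk() after the patch */
static int remove_dev(struct dev *d)
{
	if (atomic_load(&d->nr_pending))
		return -1;               /* -EBUSY */
	rcu_assign_pointer(slot, NULL);
	synchronize_rcu();               /* patch uses synchronize_kernel() */
	if (atomic_load(&d->nr_pending)) {
		/* lost the race, try later: undo, as the patch does */
		rcu_assign_pointer(slot, d);
		return -1;               /* -EBUSY */
	}
	return 0;
}

int main(void)
{
	static struct dev d;

	rcu_register_thread();
	slot = &d;
	reader();
	printf("remove: %d\n", remove_dev(&d));
	rcu_unregister_thread();
	return 0;
}

The ordering is the point: clear the pointer first, wait out the grace period, and only then trust the refcount re-check; that is what lets readers run without taking device_lock at all.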
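
Finally, the retry machinery moves from the file-scope retry_list_head/retry_list_lock pair into the per-array conf, reusing conf->device_lock, so two RAID1 arrays no longer contend on one global lock and raid1d() drains only its own array's list. A compact pthread sketch of that producer/consumer shape, with a hand-rolled singly linked list standing in for <linux/list.h> and all names invented:

/* Per-array retry list guarded by the array's own lock, as in the
 * reschedule_retry()/raid1d() hunks.  Userspace sketch only. */
#include <pthread.h>
#include <stdio.h>

struct node {
	struct node *next;
	long sector;
};

struct conf {                          /* stand-in for conf_t */
	pthread_mutex_t device_lock;   /* now also guards retry_list */
	struct node *retry_list;
};

/* reschedule_retry() analogue: queue under the per-array lock */
static void reschedule_retry(struct conf *conf, struct node *n)
{
	pthread_mutex_lock(&conf->device_lock);
	n->next = conf->retry_list;
	conf->retry_list = n;
	pthread_mutex_unlock(&conf->device_lock);
	/* md_wakeup_thread(mddev->thread) would go here in the kernel */
}

/* raid1d() analogue: pop one entry at a time, handle it outside the
 * lock so a retry can block or requeue itself freely */
static void raid1d(struct conf *conf)
{
	for (;;) {
		struct node *n;

		pthread_mutex_lock(&conf->device_lock);
		n = conf->retry_list;
		if (!n) {
			pthread_mutex_unlock(&conf->device_lock);
			break;
		}
		conf->retry_list = n->next;
		pthread_mutex_unlock(&conf->device_lock);

		printf("retrying sector %ld\n", n->sector);
	}
}

int main(void)
{
	struct conf conf = { PTHREAD_MUTEX_INITIALIZER, NULL };
	struct node n1 = { NULL, 8 }, n2 = { NULL, 16 };

	reschedule_retry(&conf, &n1);
	reschedule_retry(&conf, &n2);
	raid1d(&conf);
	return 0;
}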