X-Git-Url: http://git.onelab.eu/?a=blobdiff_plain;f=drivers%2Fmd%2Fmd.c;h=68669ce1026895fd89d1449848b7a9c860b629c9;hb=6a77f38946aaee1cd85eeec6cf4229b204c15071;hp=c451c8dba0b8a4f052efdcd49caa935182feb9ca;hpb=5273a3df6485dc2ad6aa7ddd441b9a21970f003b;p=linux-2.6.git diff --git a/drivers/md/md.c b/drivers/md/md.c index c451c8dba..68669ce10 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -61,7 +61,7 @@ static void autostart_arrays (int part); #endif static mdk_personality_t *pers[MAX_PERSONALITY]; -static spinlock_t pers_lock = SPIN_LOCK_UNLOCKED; +static DEFINE_SPINLOCK(pers_lock); /* * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit' @@ -129,7 +129,7 @@ static struct block_device_operations md_fops; * all_mddevs_lock protects this list. */ static LIST_HEAD(all_mddevs); -static spinlock_t all_mddevs_lock = SPIN_LOCK_UNLOCKED; +static DEFINE_SPINLOCK(all_mddevs_lock); /* @@ -154,6 +154,7 @@ static spinlock_t all_mddevs_lock = SPIN_LOCK_UNLOCKED; tmp = tmp->next;}) \ ) + static int md_fail_request (request_queue_t *q, struct bio *bio) { bio_io_error(bio, bio->bi_size); @@ -331,29 +332,24 @@ static int bi_complete(struct bio *bio, unsigned int bytes_done, int error) static int sync_page_io(struct block_device *bdev, sector_t sector, int size, struct page *page, int rw) { - struct bio bio; - struct bio_vec vec; + struct bio *bio = bio_alloc(GFP_KERNEL, 1); struct completion event; + int ret; rw |= (1 << BIO_RW_SYNC); - bio_init(&bio); - bio.bi_io_vec = &vec; - vec.bv_page = page; - vec.bv_len = size; - vec.bv_offset = 0; - bio.bi_vcnt = 1; - bio.bi_idx = 0; - bio.bi_size = size; - bio.bi_bdev = bdev; - bio.bi_sector = sector; + bio->bi_bdev = bdev; + bio->bi_sector = sector; + bio_add_page(bio, page, size, 0); init_completion(&event); - bio.bi_private = &event; - bio.bi_end_io = bi_complete; - submit_bio(rw, &bio); + bio->bi_private = &event; + bio->bi_end_io = bi_complete; + submit_bio(rw, bio); wait_for_completion(&event); - return 
test_bit(BIO_UPTODATE, &bio.bi_flags); + ret = test_bit(BIO_UPTODATE, &bio->bi_flags); + bio_put(bio); + return ret; } static int read_disk_sb(mdk_rdev_t * rdev) @@ -373,7 +369,7 @@ static int read_disk_sb(mdk_rdev_t * rdev) return 0; fail: - printk(KERN_ERR "md: disabled device %s, could not read superblock.\n", + printk(KERN_WARNING "md: disabled device %s, could not read superblock.\n", bdevname(rdev->bdev,b)); return -EINVAL; } @@ -439,6 +435,7 @@ static unsigned int calc_sb_csum(mdp_super_t * sb) return csum; } + /* * Handle superblock details. * We want to be able to handle multiple superblock formats @@ -521,7 +518,7 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version if (sb->raid_disks <= 0) goto abort; - if (calc_sb_csum(sb) != sb->sb_csum) { + if (csum_fold(calc_sb_csum(sb)) != csum_fold(sb->sb_csum)) { printk(KERN_WARNING "md: invalid superblock checksum on %s\n", b); goto abort; @@ -745,13 +742,23 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev) static unsigned int calc_sb_1_csum(struct mdp_superblock_1 * sb) { unsigned int disk_csum, csum; - int size = 256 + sb->max_dev*2; + unsigned long long newcsum; + int size = 256 + le32_to_cpu(sb->max_dev)*2; + unsigned int *isuper = (unsigned int*)sb; + int i; disk_csum = sb->sb_csum; sb->sb_csum = 0; - csum = csum_partial((void *)sb, size, 0); + newcsum = 0; + for (i=0; size>=4; size -= 4 ) + newcsum += le32_to_cpu(*isuper++); + + if (size == 2) + newcsum += le16_to_cpu(*(unsigned short*) isuper); + + csum = (newcsum & 0xffffffff) + (newcsum >> 32); sb->sb_csum = disk_csum; - return csum; + return cpu_to_le32(csum); } static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) @@ -773,7 +780,7 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) case 0: sb_offset = rdev->bdev->bd_inode->i_size >> 9; sb_offset -= 8*2; - sb_offset &= ~(4*2); + sb_offset &= ~(4*2-1); /* convert from sectors to K */ sb_offset /= 2; 
break; @@ -806,6 +813,11 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) bdevname(rdev->bdev,b)); return -EINVAL; } + if (le64_to_cpu(sb->data_size) < 10) { + printk("md: data_size too small on %s\n", + bdevname(rdev->bdev,b)); + return -EINVAL; + } rdev->preferred_minor = 0xffff; rdev->data_offset = le64_to_cpu(sb->data_offset); @@ -850,7 +862,6 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev) if (mddev->raid_disks == 0) { mddev->major_version = 1; - mddev->minor_version = 0; mddev->patch_version = 0; mddev->persistent = 1; mddev->chunk_size = le32_to_cpu(sb->chunksize) << 9; @@ -859,7 +870,7 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev) mddev->level = le32_to_cpu(sb->level); mddev->layout = le32_to_cpu(sb->layout); mddev->raid_disks = le32_to_cpu(sb->raid_disks); - mddev->size = (u32)le64_to_cpu(sb->size); + mddev->size = le64_to_cpu(sb->size)/2; mddev->events = le64_to_cpu(sb->events); mddev->recovery_cp = le64_to_cpu(sb->resync_offset); @@ -924,10 +935,10 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev) max_dev = 0; ITERATE_RDEV(mddev,rdev2,tmp) - if (rdev2->desc_nr > max_dev) - max_dev = rdev2->desc_nr; + if (rdev2->desc_nr+1 > max_dev) + max_dev = rdev2->desc_nr+1; - sb->max_dev = max_dev; + sb->max_dev = cpu_to_le32(max_dev); for (i=0; idev_roles[max_dev] = cpu_to_le16(0xfffe); @@ -942,6 +953,7 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev) } sb->recovery_offset = cpu_to_le64(0); /* not supported yet */ + sb->sb_csum = calc_sb_1_csum(sb); } @@ -1042,20 +1054,24 @@ static void unbind_rdev_from_array(mdk_rdev_t * rdev) /* * prevent the device from being mounted, repartitioned or * otherwise reused by a RAID array (or any other kernel - * subsystem), by opening the device. [simply getting an - * inode is not enough, the SCSI module usage code needs - * an explicit open() on the device] + * subsystem), by bd_claiming the device. 
*/ static int lock_rdev(mdk_rdev_t *rdev, dev_t dev) { int err = 0; struct block_device *bdev; + char b[BDEVNAME_SIZE]; bdev = open_by_devnum(dev, FMODE_READ|FMODE_WRITE); - if (IS_ERR(bdev)) + if (IS_ERR(bdev)) { + printk(KERN_ERR "md: could not open %s.\n", + __bdevname(dev, b)); return PTR_ERR(bdev); + } err = bd_claim(bdev, rdev); if (err) { + printk(KERN_ERR "md: could not bd_claim %s.\n", + bdevname(bdev, b)); blkdev_put(bdev); return err; } @@ -1117,10 +1133,7 @@ static void export_array(mddev_t *mddev) static void print_desc(mdp_disk_t *desc) { - char b[BDEVNAME_SIZE]; - - printk(" DISK\n", desc->number, - __bdevname(MKDEV(desc->major, desc->minor), b), + printk(" DISK\n", desc->number, desc->major,desc->minor,desc->raid_disk,desc->state); } @@ -1312,8 +1325,7 @@ static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_mi rdev = (mdk_rdev_t *) kmalloc(sizeof(*rdev), GFP_KERNEL); if (!rdev) { - printk(KERN_ERR "md: could not alloc mem for %s!\n", - __bdevname(newdev, b)); + printk(KERN_ERR "md: could not alloc mem for new device!\n"); return ERR_PTR(-ENOMEM); } memset(rdev, 0, sizeof(*rdev)); @@ -1322,11 +1334,9 @@ static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_mi goto abort_free; err = lock_rdev(rdev, newdev); - if (err) { - printk(KERN_ERR "md: could not lock %s.\n", - __bdevname(newdev, b)); + if (err) goto abort_free; - } + rdev->desc_nr = -1; rdev->faulty = 0; rdev->in_sync = 0; @@ -1424,17 +1434,6 @@ static int analyze_sbs(mddev_t * mddev) } - /* - * Check if we can support this RAID array - */ - if (mddev->major_version != MD_MAJOR_VERSION || - mddev->minor_version > MD_MINOR_VERSION) { - printk(KERN_ALERT - "md: %s: unsupported raid array version %d.%d.%d\n", - mdname(mddev), mddev->major_version, - mddev->minor_version, mddev->patch_version); - goto abort; - } if ((mddev->recovery_cp != MaxSector) && ((mddev->level == 1) || @@ -1444,8 +1443,6 @@ static int analyze_sbs(mddev_t * mddev) 
mdname(mddev)); return 0; -abort: - return 1; } int mdp_major = 0; @@ -1476,10 +1473,13 @@ static struct kobject *md_probe(dev_t dev, int *part, void *data) } disk->major = MAJOR(dev); disk->first_minor = unit << shift; - if (partitioned) + if (partitioned) { sprintf(disk->disk_name, "md_d%d", unit); - else + sprintf(disk->devfs_name, "md/d%d", unit); + } else { sprintf(disk->disk_name, "md%d", unit); + sprintf(disk->devfs_name, "md/%d", unit); + } disk->fops = &md_fops; disk->private_data = mddev; disk->queue = mddev->queue; @@ -1607,7 +1607,7 @@ static int do_md_run(mddev_t * mddev) spin_lock(&pers_lock); if (!pers[pnum] || !try_module_get(pers[pnum]->owner)) { spin_unlock(&pers_lock); - printk(KERN_ERR "md: personality %d is not loaded!\n", + printk(KERN_WARNING "md: personality %d is not loaded!\n", pnum); return -EINVAL; } @@ -1615,6 +1615,8 @@ static int do_md_run(mddev_t * mddev) mddev->pers = pers[pnum]; spin_unlock(&pers_lock); + mddev->resync_max_sectors = mddev->size << 1; /* may be over-ridden by personality */ + err = mddev->pers->run(mddev); if (err) { printk(KERN_ERR "md: pers->run() failed ...\n"); @@ -1776,7 +1778,7 @@ static void autorun_array(mddev_t *mddev) err = do_md_run (mddev); if (err) { - printk(KERN_WARNING "md :do_md_run() returned %d\n", err); + printk(KERN_WARNING "md: do_md_run() returned %d\n", err); do_md_stop (mddev, 0); } } @@ -1881,11 +1883,9 @@ static int autostart_array(dev_t startdev) mdk_rdev_t *start_rdev = NULL, *rdev; start_rdev = md_import_device(startdev, 0, 0); - if (IS_ERR(start_rdev)) { - printk(KERN_WARNING "md: could not import %s!\n", - __bdevname(startdev, b)); + if (IS_ERR(start_rdev)) return err; - } + /* NOTE: this can only work for 0.90.0 superblocks */ sb = (mdp_super_t*)page_address(start_rdev->sb_page); @@ -1916,12 +1916,9 @@ static int autostart_array(dev_t startdev) if (MAJOR(dev) != desc->major || MINOR(dev) != desc->minor) continue; rdev = md_import_device(dev, 0, 0); - if (IS_ERR(rdev)) { - 
printk(KERN_WARNING "md: could not import %s," - " trying to run array nevertheless.\n", - __bdevname(dev, b)); + if (IS_ERR(rdev)) continue; - } + list_add(&rdev->same_set, &pending_raid_disks); } @@ -1934,7 +1931,7 @@ static int autostart_array(dev_t startdev) } -static int get_version(void * arg) +static int get_version(void __user * arg) { mdu_version_t ver; @@ -1948,7 +1945,7 @@ static int get_version(void * arg) return 0; } -static int get_array_info(mddev_t * mddev, void * arg) +static int get_array_info(mddev_t * mddev, void __user * arg) { mdu_array_info_t info; int nr,working,active,failed,spare; @@ -1971,7 +1968,7 @@ static int get_array_info(mddev_t * mddev, void * arg) info.major_version = mddev->major_version; info.minor_version = mddev->minor_version; - info.patch_version = 1; + info.patch_version = MD_PATCHLEVEL_VERSION; info.ctime = mddev->ctime; info.level = mddev->level; info.size = mddev->size; @@ -1998,7 +1995,7 @@ static int get_array_info(mddev_t * mddev, void * arg) return 0; } -static int get_disk_info(mddev_t * mddev, void * arg) +static int get_disk_info(mddev_t * mddev, void __user * arg) { mdu_disk_info_t info; unsigned int nr; @@ -2153,42 +2150,6 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info) return 0; } -static int hot_generate_error(mddev_t * mddev, dev_t dev) -{ - char b[BDEVNAME_SIZE]; - struct request_queue *q; - mdk_rdev_t *rdev; - - if (!mddev->pers) - return -ENODEV; - - printk(KERN_INFO "md: trying to generate %s error in %s ... 
\n", - __bdevname(dev, b), mdname(mddev)); - - rdev = find_rdev(mddev, dev); - if (!rdev) { - /* MD_BUG(); */ /* like hell - it's not a driver bug */ - return -ENXIO; - } - - if (rdev->desc_nr == -1) { - MD_BUG(); - return -EINVAL; - } - if (!rdev->in_sync) - return -ENODEV; - - q = bdev_get_queue(rdev->bdev); - if (!q) { - MD_BUG(); - return -ENODEV; - } - printk(KERN_INFO "md: okay, generating error!\n"); -// q->oneshot_error = 1; // disabled for now - - return 0; -} - static int hot_remove_disk(mddev_t * mddev, dev_t dev) { char b[BDEVNAME_SIZE]; @@ -2197,9 +2158,6 @@ static int hot_remove_disk(mddev_t * mddev, dev_t dev) if (!mddev->pers) return -ENODEV; - printk(KERN_INFO "md: trying to remove %s from %s ... \n", - __bdevname(dev, b), mdname(mddev)); - rdev = find_rdev(mddev, dev); if (!rdev) return -ENXIO; @@ -2227,9 +2185,6 @@ static int hot_add_disk(mddev_t * mddev, dev_t dev) if (!mddev->pers) return -ENODEV; - printk(KERN_INFO "md: trying to hot-add %s to %s ... \n", - __bdevname(dev, b), mdname(mddev)); - if (mddev->major_version != 0) { printk(KERN_WARNING "%s: HOT_ADD may only be used with" " version-0 superblocks.\n", @@ -2251,7 +2206,12 @@ static int hot_add_disk(mddev_t * mddev, dev_t dev) return -EINVAL; } - rdev->sb_offset = calc_dev_sboffset(rdev->bdev); + if (mddev->persistent) + rdev->sb_offset = calc_dev_sboffset(rdev->bdev); + else + rdev->sb_offset = + rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS; + size = calc_dev_size(rdev, mddev->chunk_size); rdev->size = size; @@ -2372,10 +2332,121 @@ static int set_array_info(mddev_t * mddev, mdu_array_info_t *info) return 0; } +/* + * update_array_info is used to change the configuration of an + * on-line array. + * The version, ctime,level,size,raid_disks,not_persistent, layout,chunk_size + * fields in the info are checked against the array. + * Any differences that cannot be handled will cause an error. + * Normally, only one change can be managed at a time. 
+ */ +static int update_array_info(mddev_t *mddev, mdu_array_info_t *info) +{ + int rv = 0; + int cnt = 0; + + if (mddev->major_version != info->major_version || + mddev->minor_version != info->minor_version || +/* mddev->patch_version != info->patch_version || */ + mddev->ctime != info->ctime || + mddev->level != info->level || +/* mddev->layout != info->layout || */ + !mddev->persistent != info->not_persistent|| + mddev->chunk_size != info->chunk_size ) + return -EINVAL; + /* Check there is only one change */ + if (mddev->size != info->size) cnt++; + if (mddev->raid_disks != info->raid_disks) cnt++; + if (mddev->layout != info->layout) cnt++; + if (cnt == 0) return 0; + if (cnt > 1) return -EINVAL; + + if (mddev->layout != info->layout) { + /* Change layout + * we don't need to do anything at the md level, the + * personality will take care of it all. + */ + if (mddev->pers->reconfig == NULL) + return -EINVAL; + else + return mddev->pers->reconfig(mddev, info->layout, -1); + } + if (mddev->size != info->size) { + mdk_rdev_t * rdev; + struct list_head *tmp; + if (mddev->pers->resize == NULL) + return -EINVAL; + /* The "size" is the amount of each device that is used. + * This can only make sense for arrays with redundancy. + * linear and raid0 always use whatever space is available + * We can only consider changing the size if no resync + * or reconstruction is happening, and if the new size + * is acceptable. 
It must fit before the sb_offset or, + * if that is < data_offset, it must fit before the + * size of each device. + * If size is zero, we find the largest size that fits. + */ + if (mddev->sync_thread) + return -EBUSY; + ITERATE_RDEV(mddev,rdev,tmp) { + sector_t avail; + int fit = (info->size == 0); + if (rdev->sb_offset > rdev->data_offset) + avail = (rdev->sb_offset*2) - rdev->data_offset; + else + avail = get_capacity(rdev->bdev->bd_disk) + - rdev->data_offset; + if (fit && (info->size == 0 || info->size > avail/2)) + info->size = avail/2; + if (avail < ((sector_t)info->size << 1)) + return -ENOSPC; + } + rv = mddev->pers->resize(mddev, (sector_t)info->size *2); + if (!rv) { + struct block_device *bdev; + + bdev = bdget_disk(mddev->gendisk, 0); + if (bdev) { + down(&bdev->bd_inode->i_sem); + i_size_write(bdev->bd_inode, mddev->array_size << 10); + up(&bdev->bd_inode->i_sem); + bdput(bdev); + } + } + } + if (mddev->raid_disks != info->raid_disks) { + /* change the number of raid disks */ + if (mddev->pers->reshape == NULL) + return -EINVAL; + if (info->raid_disks <= 0 || + info->raid_disks >= mddev->max_disks) + return -EINVAL; + if (mddev->sync_thread) + return -EBUSY; + rv = mddev->pers->reshape(mddev, info->raid_disks); + if (!rv) { + struct block_device *bdev; + + bdev = bdget_disk(mddev->gendisk, 0); + if (bdev) { + down(&bdev->bd_inode->i_sem); + i_size_write(bdev->bd_inode, mddev->array_size << 10); + up(&bdev->bd_inode->i_sem); + bdput(bdev); + } + } + } + md_update_sb(mddev); + return rv; +} + static int set_disk_faulty(mddev_t *mddev, dev_t dev) { mdk_rdev_t *rdev; + if (mddev->pers == NULL) + return -ENODEV; + rdev = find_rdev(mddev, dev); if (!rdev) return -ENODEV; @@ -2387,9 +2458,9 @@ static int set_disk_faulty(mddev_t *mddev, dev_t dev) static int md_ioctl(struct inode *inode, struct file *file, unsigned int cmd, unsigned long arg) { - char b[BDEVNAME_SIZE]; int err = 0; - struct hd_geometry *loc = (struct hd_geometry *) arg; + void __user *argp = (void __user *)arg; + struct hd_geometry __user *loc = argp; mddev_t *mddev = NULL; if (!capable(CAP_SYS_ADMIN)) @@ -2402,7 +2473,7 @@
static int md_ioctl(struct inode *inode, struct file *file, switch (cmd) { case RAID_VERSION: - err = get_version((void *)arg); + err = get_version(argp); goto done; case PRINT_RAID_DEBUG: @@ -2445,8 +2516,7 @@ static int md_ioctl(struct inode *inode, struct file *file, } err = autostart_array(new_decode_dev(arg)); if (err) { - printk(KERN_WARNING "md: autostart %s failed!\n", - __bdevname(arg, b)); + printk(KERN_WARNING "md: autostart failed!\n"); goto abort; } goto done; @@ -2463,33 +2533,41 @@ static int md_ioctl(struct inode *inode, struct file *file, switch (cmd) { case SET_ARRAY_INFO: - - if (!list_empty(&mddev->disks)) { - printk(KERN_WARNING - "md: array %s already has disks!\n", - mdname(mddev)); - err = -EBUSY; - goto abort_unlock; - } - if (mddev->raid_disks) { - printk(KERN_WARNING - "md: array %s already initialised!\n", - mdname(mddev)); - err = -EBUSY; - goto abort_unlock; - } { mdu_array_info_t info; if (!arg) memset(&info, 0, sizeof(info)); - else if (copy_from_user(&info, (void*)arg, sizeof(info))) { + else if (copy_from_user(&info, argp, sizeof(info))) { err = -EFAULT; goto abort_unlock; } + if (mddev->pers) { + err = update_array_info(mddev, &info); + if (err) { + printk(KERN_WARNING "md: couldn't update" + " array info. %d\n", err); + goto abort_unlock; + } + goto done_unlock; + } + if (!list_empty(&mddev->disks)) { + printk(KERN_WARNING + "md: array %s already has disks!\n", + mdname(mddev)); + err = -EBUSY; + goto abort_unlock; + } + if (mddev->raid_disks) { + printk(KERN_WARNING + "md: array %s already initialised!\n", + mdname(mddev)); + err = -EBUSY; + goto abort_unlock; + } err = set_array_info(mddev, &info); if (err) { printk(KERN_WARNING "md: couldn't set" - " array info. %d\n", err); + " array info. 
%d\n", err); goto abort_unlock; } } @@ -2513,11 +2591,11 @@ static int md_ioctl(struct inode *inode, struct file *file, switch (cmd) { case GET_ARRAY_INFO: - err = get_array_info(mddev, (void *)arg); + err = get_array_info(mddev, argp); goto done_unlock; case GET_DISK_INFO: - err = get_disk_info(mddev, (void *)arg); + err = get_disk_info(mddev, argp); goto done_unlock; case RESTART_ARRAY_RW: @@ -2543,18 +2621,18 @@ static int md_ioctl(struct inode *inode, struct file *file, err = -EINVAL; goto abort_unlock; } - err = put_user (2, (char *) &loc->heads); + err = put_user (2, (char __user *) &loc->heads); if (err) goto abort_unlock; - err = put_user (4, (char *) &loc->sectors); + err = put_user (4, (char __user *) &loc->sectors); if (err) goto abort_unlock; err = put_user(get_capacity(mddev->gendisk)/8, - (short *) &loc->cylinders); + (short __user *) &loc->cylinders); if (err) goto abort_unlock; err = put_user (get_start_sect(inode->i_bdev), - (long *) &loc->start); + (long __user *) &loc->start); goto done_unlock; } @@ -2573,15 +2651,13 @@ static int md_ioctl(struct inode *inode, struct file *file, case ADD_NEW_DISK: { mdu_disk_info_t info; - if (copy_from_user(&info, (void*)arg, sizeof(info))) + if (copy_from_user(&info, argp, sizeof(info))) err = -EFAULT; else err = add_new_disk(mddev, &info); goto done_unlock; } - case HOT_GENERATE_ERROR: - err = hot_generate_error(mddev, new_decode_dev(arg)); - goto done_unlock; + case HOT_REMOVE_DISK: err = hot_remove_disk(mddev, new_decode_dev(arg)); goto done_unlock; @@ -2765,7 +2841,7 @@ mdk_thread_t *md_register_thread(void (*run) (mddev_t *), mddev_t *mddev, return thread; } -void md_interrupt_thread(mdk_thread_t *thread) +static void md_interrupt_thread(mdk_thread_t *thread) { if (!thread->tsk) { MD_BUG(); @@ -2808,6 +2884,7 @@ void md_error(mddev_t *mddev, mdk_rdev_t *rdev) if (!mddev->pers->error_handler) return; mddev->pers->error_handler(mddev,rdev); + set_bit(MD_RECOVERY_INTR, &mddev->recovery); 
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); md_wakeup_thread(mddev->thread); } @@ -2840,7 +2917,11 @@ static void status_resync(struct seq_file *seq, mddev_t * mddev) unsigned long max_blocks, resync, res, dt, db, rt; resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active))/2; - max_blocks = mddev->size; + + if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) + max_blocks = mddev->resync_max_sectors >> 1; + else + max_blocks = mddev->size; /* * Should not happen. @@ -3076,11 +3157,6 @@ int unregister_md_personality(int pnum) return 0; } -void md_sync_acct(mdk_rdev_t *rdev, unsigned long nr_sectors) -{ - rdev->bdev->bd_contains->bd_disk->sync_io += nr_sectors; -} - static int is_mddev_idle(mddev_t *mddev) { mdk_rdev_t * rdev; @@ -3093,8 +3169,12 @@ static int is_mddev_idle(mddev_t *mddev) struct gendisk *disk = rdev->bdev->bd_contains->bd_disk; curr_events = disk_stat_read(disk, read_sectors) + disk_stat_read(disk, write_sectors) - - disk->sync_io; - if ((curr_events - rdev->last_events) > 32) { + atomic_read(&disk->sync_io); + /* Allow some slack between value of curr_events and last_events, + * as there are some uninteresting races. + * Note: the following is an unsigned comparison. + */ + if ((curr_events - rdev->last_events + 32) > 64) { rdev->last_events = curr_events; idle = 0; } @@ -3197,38 +3277,66 @@ static void md_do_sync(mddev_t *mddev) * 1 == like 2, but have yielded to allow conflicting resync to * commense * other == active in resync - this many blocks + * + * Before starting a resync we must have set curr_resync to + * 2, and then checked that every "conflicting" array has curr_resync + * less than ours. When we find one that is the same or higher + * we wait on resync_wait. To avoid deadlock, we reduce curr_resync + * to 1 if we choose to yield (based arbitrarily on address of mddev structure). + * This will mean we have to start checking from the beginning again.
+ * */ + do { mddev->curr_resync = 2; + try_again: + if (signal_pending(current)) { + flush_signals(current); + goto skip; + } ITERATE_MDDEV(mddev2,tmp) { + printk("."); if (mddev2 == mddev) continue; if (mddev2->curr_resync && match_mddev_units(mddev,mddev2)) { - printk(KERN_INFO "md: delaying resync of %s" - " until %s has finished resync (they" - " share one or more physical units)\n", - mdname(mddev), mdname(mddev2)); - if (mddev < mddev2) {/* arbitrarily yield */ + DEFINE_WAIT(wq); + if (mddev < mddev2 && mddev->curr_resync == 2) { + /* arbitrarily yield */ mddev->curr_resync = 1; wake_up(&resync_wait); } - if (wait_event_interruptible(resync_wait, - mddev2->curr_resync < mddev->curr_resync)) { - flush_signals(current); + if (mddev > mddev2 && mddev->curr_resync == 1) + /* no need to wait here, we can wait the next + * time 'round when curr_resync == 2 + */ + continue; + prepare_to_wait(&resync_wait, &wq, TASK_INTERRUPTIBLE); + if (!signal_pending(current) + && mddev2->curr_resync >= mddev->curr_resync) { + printk(KERN_INFO "md: delaying resync of %s" + " until %s has finished resync (they" + " share one or more physical units)\n", + mdname(mddev), mdname(mddev2)); mddev_put(mddev2); - goto skip; + schedule(); + finish_wait(&resync_wait, &wq); + goto try_again; } - } - if (mddev->curr_resync == 1) { - mddev_put(mddev2); - break; + finish_wait(&resync_wait, &wq); } } } while (mddev->curr_resync < 2); - max_sectors = mddev->size << 1; + if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) + /* resync follows the size requested by the personality, + * which default to physical size, but can be virtual size + */ + max_sectors = mddev->resync_max_sectors; + else + /* recovery follows the physical size of devices */ + max_sectors = mddev->size << 1; printk(KERN_INFO "md: syncing RAID array %s\n", mdname(mddev)); printk(KERN_INFO "md: minimum _guaranteed_ reconstruction speed:" @@ -3254,17 +3362,19 @@ static void md_do_sync(mddev_t *mddev) * Tune reconstruction: */ 
window = 32*(PAGE_SIZE/512); - printk(KERN_INFO "md: using %dk window, over a total of %Lu blocks.\n", + printk(KERN_INFO "md: using %dk window, over a total of %llu blocks.\n", window/2,(unsigned long long) max_sectors/2); atomic_set(&mddev->recovery_active, 0); init_waitqueue_head(&mddev->recovery_wait); last_check = 0; - if (j) + if (j>2) { printk(KERN_INFO "md: resuming recovery of %s from checkpoint.\n", mdname(mddev)); + mddev->curr_resync = j; + } while (j < max_sectors) { int sectors; @@ -3278,7 +3388,7 @@ static void md_do_sync(mddev_t *mddev) j += sectors; if (j>1) mddev->curr_resync = j; - if (last_check + window > j) + if (last_check + window > j || j == max_sectors) continue; last_check = j; @@ -3288,7 +3398,7 @@ static void md_do_sync(mddev_t *mddev) break; repeat: - if (jiffies >= mark[last_mark] + SYNC_MARK_STEP ) { + if (time_after_eq(jiffies, mark[last_mark] + SYNC_MARK_STEP )) { /* step marks */ int next = (last_mark+1) % SYNC_MARKS; @@ -3327,8 +3437,7 @@ static void md_do_sync(mddev_t *mddev) if (currspeed > sysctl_speed_limit_min) { if ((currspeed > sysctl_speed_limit_max) || !is_mddev_idle(mddev)) { - current->state = TASK_INTERRUPTIBLE; - schedule_timeout(HZ/4); + msleep_interruptible(250); goto repeat; } } @@ -3347,7 +3456,7 @@ static void md_do_sync(mddev_t *mddev) if (!test_bit(MD_RECOVERY_ERR, &mddev->recovery) && mddev->curr_resync > 2 && - mddev->curr_resync > mddev->recovery_cp) { + mddev->curr_resync >= mddev->recovery_cp) { if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { printk(KERN_INFO "md: checkpointing recovery of %s.\n", @@ -3360,6 +3469,7 @@ static void md_do_sync(mddev_t *mddev) md_enter_safemode(mddev); skip: mddev->curr_resync = 0; + wake_up(&resync_wait); set_bit(MD_RECOVERY_DONE, &mddev->recovery); md_wakeup_thread(mddev->thread); } @@ -3427,14 +3537,11 @@ void md_check_recovery(mddev_t *mddev) mddev->recovery = 0; /* flag recovery needed just to double check */ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); - 
wake_up(&resync_wait); goto unlock; } - if (mddev->recovery) { + if (mddev->recovery) /* probably just the RECOVERY_NEEDED flag */ mddev->recovery = 0; - wake_up(&resync_wait); - } /* no recovery is running. * remove any failed drives, then @@ -3444,8 +3551,8 @@ void md_check_recovery(mddev_t *mddev) if (rdev->raid_disk >= 0 && rdev->faulty && atomic_read(&rdev->nr_pending)==0) { - mddev->pers->hot_remove_disk(mddev, rdev->raid_disk); - rdev->raid_disk = -1; + if (mddev->pers->hot_remove_disk(mddev, rdev->raid_disk)==0) + rdev->raid_disk = -1; } if (!rdev->faulty && rdev->raid_disk >= 0 && !rdev->in_sync) spares++; @@ -3557,7 +3664,7 @@ int __init md_init(void) for (minor=0; minor < MAX_MD_DEVS; ++minor) devfs_mk_bdev(MKDEV(mdp_major, minor<faulty) { MD_BUG(); continue; @@ -3651,7 +3755,6 @@ module_exit(md_exit) EXPORT_SYMBOL(register_md_personality); EXPORT_SYMBOL(unregister_md_personality); EXPORT_SYMBOL(md_error); -EXPORT_SYMBOL(md_sync_acct); EXPORT_SYMBOL(md_done_sync); EXPORT_SYMBOL(md_write_start); EXPORT_SYMBOL(md_write_end); @@ -3660,6 +3763,5 @@ EXPORT_SYMBOL(md_register_thread); EXPORT_SYMBOL(md_unregister_thread); EXPORT_SYMBOL(md_wakeup_thread); EXPORT_SYMBOL(md_print_devices); -EXPORT_SYMBOL(md_interrupt_thread); EXPORT_SYMBOL(md_check_recovery); MODULE_LICENSE("GPL");