X-Git-Url: http://git.onelab.eu/?a=blobdiff_plain;f=drivers%2Fmd%2Fmd.c;h=68669ce1026895fd89d1449848b7a9c860b629c9;hb=6a77f38946aaee1cd85eeec6cf4229b204c15071;hp=745f7e7bd3ee445f0782285355d811023ae9ca70;hpb=87fc8d1bb10cd459024a742c6a10961fefcef18f;p=linux-2.6.git diff --git a/drivers/md/md.c b/drivers/md/md.c index 745f7e7bd..68669ce10 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -61,7 +61,7 @@ static void autostart_arrays (int part); #endif static mdk_personality_t *pers[MAX_PERSONALITY]; -static spinlock_t pers_lock = SPIN_LOCK_UNLOCKED; +static DEFINE_SPINLOCK(pers_lock); /* * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit' @@ -129,7 +129,7 @@ static struct block_device_operations md_fops; * all_mddevs_lock protects this list. */ static LIST_HEAD(all_mddevs); -static spinlock_t all_mddevs_lock = SPIN_LOCK_UNLOCKED; +static DEFINE_SPINLOCK(all_mddevs_lock); /* @@ -154,38 +154,6 @@ static spinlock_t all_mddevs_lock = SPIN_LOCK_UNLOCKED; tmp = tmp->next;}) \ ) -int md_flush_mddev(mddev_t *mddev, sector_t *error_sector) -{ - struct list_head *tmp; - mdk_rdev_t *rdev; - int ret = 0; - - /* - * this list iteration is done without any locking in md?! - */ - ITERATE_RDEV(mddev, rdev, tmp) { - request_queue_t *r_queue = bdev_get_queue(rdev->bdev); - int err; - - if (!r_queue->issue_flush_fn) - err = -EOPNOTSUPP; - else - err = r_queue->issue_flush_fn(r_queue, rdev->bdev->bd_disk, error_sector); - - if (!ret) - ret = err; - } - - return ret; -} - -static int md_flush_all(request_queue_t *q, struct gendisk *disk, - sector_t *error_sector) -{ - mddev_t *mddev = q->queuedata; - - return md_flush_mddev(mddev, error_sector); -} static int md_fail_request (request_queue_t *q, struct bio *bio) { @@ -364,29 +332,24 @@ static int bi_complete(struct bio *bio, unsigned int bytes_done, int error) static int sync_page_io(struct block_device *bdev, sector_t sector, int size, struct page *page, int rw) { - struct bio bio; - struct bio_vec vec; + struct bio *bio = bio_alloc(GFP_KERNEL, 1); struct completion event; + int ret; rw |= (1 << BIO_RW_SYNC); - bio_init(&bio); - bio.bi_io_vec = &vec; - vec.bv_page = page; - vec.bv_len = size; - vec.bv_offset = 0; - bio.bi_vcnt = 1; - bio.bi_idx = 0; - bio.bi_size = size; - bio.bi_bdev = bdev; - bio.bi_sector = sector; + bio->bi_bdev = bdev; + bio->bi_sector = sector; + bio_add_page(bio, page, size, 0); init_completion(&event); - bio.bi_private = &event; - bio.bi_end_io = bi_complete; - submit_bio(rw, &bio); + bio->bi_private = &event; + bio->bi_end_io = bi_complete; + submit_bio(rw, bio); wait_for_completion(&event); - return test_bit(BIO_UPTODATE, &bio.bi_flags); + ret = test_bit(BIO_UPTODATE, &bio->bi_flags); + bio_put(bio); + return ret; } static int read_disk_sb(mdk_rdev_t * rdev) @@ -472,30 +435,6 @@ static unsigned int calc_sb_csum(mdp_super_t * sb) return csum; } -/* csum_partial is not consistent between different architectures. - * Some (i386) do a 32bit csum. Some (alpha) do 16 bit. - * This makes it hard for user-space to know what to do. - * So we use calc_sb_csum to set the checksum to allow working - * with older kernels, but allow calc_sb_csum_common to - * be used when checking if a checksum is correct, to - * make life easier for user-space tools that might write - * a superblock. 
- */
-static unsigned int calc_sb_csum_common(mdp_super_t *super)
-{
-	unsigned int disk_csum = super->sb_csum;
-	unsigned long long newcsum = 0;
-	unsigned int csum;
-	int i;
-	unsigned int *superc = (int*) super;
-	super->sb_csum = 0;
-
-	for (i=0; i<MD_SB_BYTES/4; i++)
-		newcsum += superc[i];
-	csum = (newcsum & 0xffffffff) + (newcsum>>32);
-	super->sb_csum = disk_csum;
-	return csum;
-}
 
 /*
  * Handle superblock details.
@@ -579,8 +518,7 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version
 	if (sb->raid_disks <= 0)
 		goto abort;
 
-	if (calc_sb_csum(sb) != sb->sb_csum &&
-	    calc_sb_csum_common(sb) != sb->sb_csum) {
+	if (csum_fold(calc_sb_csum(sb)) != csum_fold(sb->sb_csum)) {
 		printk(KERN_WARNING "md: invalid superblock checksum on %s\n",
 			b);
 		goto abort;
@@ -805,7 +743,7 @@ static unsigned int calc_sb_1_csum(struct mdp_superblock_1 * sb)
 {
 	unsigned int disk_csum, csum;
 	unsigned long long newcsum;
-	int size = 256 + sb->max_dev*2;
+	int size = 256 + le32_to_cpu(sb->max_dev)*2;
 	unsigned int *isuper = (unsigned int*)sb;
 	int i;
 
@@ -820,7 +758,7 @@ static unsigned int calc_sb_1_csum(struct mdp_superblock_1 * sb)
 	csum = (newcsum & 0xffffffff) + (newcsum >> 32);
 
 	sb->sb_csum = disk_csum;
-	return csum;
+	return cpu_to_le32(csum);
 }
 
 static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
@@ -842,7 +780,7 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
 	case 0:
 		sb_offset = rdev->bdev->bd_inode->i_size >> 9;
 		sb_offset -= 8*2;
-		sb_offset &= ~(4*2);
+		sb_offset &= ~(4*2-1);
 		/* convert from sectors to K */
 		sb_offset /= 2;
 		break;
@@ -875,6 +813,11 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
 			bdevname(rdev->bdev,b));
 		return -EINVAL;
 	}
+	if (le64_to_cpu(sb->data_size) < 10) {
+		printk("md: data_size too small on %s\n",
+		       bdevname(rdev->bdev,b));
+		return -EINVAL;
+	}
 
 	rdev->preferred_minor = 0xffff;
 	rdev->data_offset = le64_to_cpu(sb->data_offset);
@@ -919,7 +862,6 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
 
 	if (mddev->raid_disks == 0) {
 		mddev->major_version = 1;
-		mddev->minor_version = 0;
 		mddev->patch_version = 0;
 		mddev->persistent = 1;
 		mddev->chunk_size = le32_to_cpu(sb->chunksize) << 9;
@@ -928,7 +870,7 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
 		mddev->level = le32_to_cpu(sb->level);
 		mddev->layout = le32_to_cpu(sb->layout);
 		mddev->raid_disks = le32_to_cpu(sb->raid_disks);
-		mddev->size = (u32)le64_to_cpu(sb->size);
+		mddev->size = le64_to_cpu(sb->size)/2;
 		mddev->events = le64_to_cpu(sb->events);
 
 		mddev->recovery_cp = le64_to_cpu(sb->resync_offset);
@@ -993,10 +935,10 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
 
 	max_dev = 0;
 	ITERATE_RDEV(mddev,rdev2,tmp)
-		if (rdev2->desc_nr > max_dev)
-			max_dev = rdev2->desc_nr;
+		if (rdev2->desc_nr+1 > max_dev)
+			max_dev = rdev2->desc_nr+1;
 
-	sb->max_dev = max_dev;
+	sb->max_dev = cpu_to_le32(max_dev);
 	for (i=0; i<max_dev;i++)
 		sb->dev_roles[max_dev] = cpu_to_le16(0xfffe);
 
@@ -1011,6 +953,7 @@
 	}
 
 	sb->recovery_offset = cpu_to_le64(0); /* not supported yet */
+	sb->sb_csum = calc_sb_1_csum(sb);
 }
 
@@ -1491,17 +1434,6 @@ static int analyze_sbs(mddev_t * mddev)
 	}
 
 
-	/*
-	 * Check if we can support this RAID array
-	 */
-	if (mddev->major_version != MD_MAJOR_VERSION ||
-	    mddev->minor_version > MD_MINOR_VERSION) {
-		printk(KERN_ALERT
-			"md: %s: unsupported raid array version %d.%d.%d\n",
-			mdname(mddev), mddev->major_version,
-			mddev->minor_version, mddev->patch_version);
-		goto abort;
-	}
 
 	if ((mddev->recovery_cp != MaxSector) &&
 	    ((mddev->level 
== 1) || @@ -1511,8 +1443,6 @@ static int analyze_sbs(mddev_t * mddev) mdname(mddev)); return 0; -abort: - return 1; } int mdp_major = 0; @@ -1543,10 +1473,13 @@ static struct kobject *md_probe(dev_t dev, int *part, void *data) } disk->major = MAJOR(dev); disk->first_minor = unit << shift; - if (partitioned) + if (partitioned) { sprintf(disk->disk_name, "md_d%d", unit); - else + sprintf(disk->devfs_name, "md/d%d", unit); + } else { sprintf(disk->disk_name, "md%d", unit); + sprintf(disk->devfs_name, "md/%d", unit); + } disk->fops = &md_fops; disk->private_data = mddev; disk->queue = mddev->queue; @@ -1714,7 +1647,6 @@ static int do_md_run(mddev_t * mddev) */ mddev->queue->queuedata = mddev; mddev->queue->make_request_fn = mddev->pers->make_request; - mddev->queue->issue_flush_fn = md_flush_all; mddev->changed = 1; return 0; @@ -1846,7 +1778,7 @@ static void autorun_array(mddev_t *mddev) err = do_md_run (mddev); if (err) { - printk(KERN_WARNING "md :do_md_run() returned %d\n", err); + printk(KERN_WARNING "md: do_md_run() returned %d\n", err); do_md_stop (mddev, 0); } } @@ -2036,7 +1968,7 @@ static int get_array_info(mddev_t * mddev, void __user * arg) info.major_version = mddev->major_version; info.minor_version = mddev->minor_version; - info.patch_version = 1; + info.patch_version = MD_PATCHLEVEL_VERSION; info.ctime = mddev->ctime; info.level = mddev->level; info.size = mddev->size; @@ -2418,16 +2350,27 @@ static int update_array_info(mddev_t *mddev, mdu_array_info_t *info) /* mddev->patch_version != info->patch_version || */ mddev->ctime != info->ctime || mddev->level != info->level || - mddev->layout != info->layout || +/* mddev->layout != info->layout || */ !mddev->persistent != info->not_persistent|| mddev->chunk_size != info->chunk_size ) return -EINVAL; /* Check there is only one change */ if (mddev->size != info->size) cnt++; if (mddev->raid_disks != info->raid_disks) cnt++; + if (mddev->layout != info->layout) cnt++; if (cnt == 0) return 0; if (cnt > 1) return -EINVAL; + if (mddev->layout != info->layout) { + /* Change layout + * we don't need to do anything at the md level, the + * personality will take care of it all. + */ + if (mddev->pers->reconfig == NULL) + return -EINVAL; + else + return mddev->pers->reconfig(mddev, info->layout, -1); + } if (mddev->size != info->size) { mdk_rdev_t * rdev; struct list_head *tmp; @@ -2436,7 +2379,7 @@ static int update_array_info(mddev_t *mddev, mdu_array_info_t *info) /* The "size" is the amount of each device that is used. * This can only make sense for arrays with redundancy. * linear and raid0 always use whatever space is available - * We can only consider changing the size of no resync + * We can only consider changing the size if no resync * or reconstruction is happening, and if the new size * is acceptable. It must fit before the sb_offset or, * if that is pers == NULL) + return -ENODEV; + rdev = find_rdev(mddev, dev); if (!rdev) return -ENODEV; @@ -3331,33 +3277,54 @@ static void md_do_sync(mddev_t *mddev) * 1 == like 2, but have yielded to allow conflicting resync to * commense * other == active in resync - this many blocks + * + * Before starting a resync we must have set curr_resync to + * 2, and then checked that every "conflicting" array has curr_resync + * less than ours. When we find one that is the same or higher + * we wait on resync_wait. To avoid deadlock, we reduce curr_resync + * to 1 if we choose to yield (based arbitrarily on address of mddev structure). 
+ * This will mean we have to start checking from the beginning again. + * */ + do { mddev->curr_resync = 2; + try_again: + if (signal_pending(current)) { + flush_signals(current); + goto skip; + } ITERATE_MDDEV(mddev2,tmp) { + printk("."); if (mddev2 == mddev) continue; if (mddev2->curr_resync && match_mddev_units(mddev,mddev2)) { - printk(KERN_INFO "md: delaying resync of %s" - " until %s has finished resync (they" - " share one or more physical units)\n", - mdname(mddev), mdname(mddev2)); - if (mddev < mddev2) {/* arbitrarily yield */ + DEFINE_WAIT(wq); + if (mddev < mddev2 && mddev->curr_resync == 2) { + /* arbitrarily yield */ mddev->curr_resync = 1; wake_up(&resync_wait); } - if (wait_event_interruptible(resync_wait, - mddev2->curr_resync < mddev->curr_resync)) { - flush_signals(current); + if (mddev > mddev2 && mddev->curr_resync == 1) + /* no need to wait here, we can wait the next + * time 'round when curr_resync == 2 + */ + continue; + prepare_to_wait(&resync_wait, &wq, TASK_INTERRUPTIBLE); + if (!signal_pending(current) + && mddev2->curr_resync >= mddev->curr_resync) { + printk(KERN_INFO "md: delaying resync of %s" + " until %s has finished resync (they" + " share one or more physical units)\n", + mdname(mddev), mdname(mddev2)); mddev_put(mddev2); - goto skip; + schedule(); + finish_wait(&resync_wait, &wq); + goto try_again; } - } - if (mddev->curr_resync == 1) { - mddev_put(mddev2); - break; + finish_wait(&resync_wait, &wq); } } } while (mddev->curr_resync < 2); @@ -3395,17 +3362,19 @@ static void md_do_sync(mddev_t *mddev) * Tune reconstruction: */ window = 32*(PAGE_SIZE/512); - printk(KERN_INFO "md: using %dk window, over a total of %Lu blocks.\n", + printk(KERN_INFO "md: using %dk window, over a total of %llu blocks.\n", window/2,(unsigned long long) max_sectors/2); atomic_set(&mddev->recovery_active, 0); init_waitqueue_head(&mddev->recovery_wait); last_check = 0; - if (j) + if (j>2) { printk(KERN_INFO "md: resuming recovery of %s from checkpoint.\n", mdname(mddev)); + mddev->curr_resync = j; + } while (j < max_sectors) { int sectors; @@ -3429,7 +3398,7 @@ static void md_do_sync(mddev_t *mddev) break; repeat: - if (jiffies >= mark[last_mark] + SYNC_MARK_STEP ) { + if (time_after_eq(jiffies, mark[last_mark] + SYNC_MARK_STEP )) { /* step marks */ int next = (last_mark+1) % SYNC_MARKS; @@ -3468,8 +3437,7 @@ static void md_do_sync(mddev_t *mddev) if (currspeed > sysctl_speed_limit_min) { if ((currspeed > sysctl_speed_limit_max) || !is_mddev_idle(mddev)) { - current->state = TASK_INTERRUPTIBLE; - schedule_timeout(HZ/4); + msleep_interruptible(250); goto repeat; } } @@ -3488,7 +3456,7 @@ static void md_do_sync(mddev_t *mddev) if (!test_bit(MD_RECOVERY_ERR, &mddev->recovery) && mddev->curr_resync > 2 && - mddev->curr_resync > mddev->recovery_cp) { + mddev->curr_resync >= mddev->recovery_cp) { if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { printk(KERN_INFO "md: checkpointing recovery of %s.\n", @@ -3501,6 +3469,7 @@ static void md_do_sync(mddev_t *mddev) md_enter_safemode(mddev); skip: mddev->curr_resync = 0; + wake_up(&resync_wait); set_bit(MD_RECOVERY_DONE, &mddev->recovery); md_wakeup_thread(mddev->thread); } @@ -3568,14 +3537,11 @@ void md_check_recovery(mddev_t *mddev) mddev->recovery = 0; /* flag recovery needed just to double check */ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); - wake_up(&resync_wait); goto unlock; } - if (mddev->recovery) { + if (mddev->recovery) /* probably just the RECOVERY_NEEDED flag */ mddev->recovery = 0; - wake_up(&resync_wait); - } /* no 
recovery is running. * remove any failed drives, then @@ -3698,7 +3664,7 @@ int __init md_init(void) for (minor=0; minor < MAX_MD_DEVS; ++minor) devfs_mk_bdev(MKDEV(mdp_major, minor<
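
A minimal sketch (not part of the patch above) of the wait-queue idiom that the reworked md_do_sync() uses in place of wait_event_interruptible(): register on the queue with prepare_to_wait(), re-test the condition, sleep via schedule(), and detach with finish_wait(), so the condition can be re-evaluated after every wakeup without losing a wake_up() issued in between. The names demo_wq and demo_condition are hypothetical; md.c itself waits on resync_wait and re-checks curr_resync of any conflicting array.

#include <linux/wait.h>
#include <linux/sched.h>

static DECLARE_WAIT_QUEUE_HEAD(demo_wq);

/* Sleep until demo_condition() is true or a signal is pending. */
static void demo_wait_for(int (*demo_condition)(void))
{
	DEFINE_WAIT(wq);

	for (;;) {
		/* Queue ourselves before testing the condition so that a
		 * wake_up() arriving between the test and the sleep is
		 * not lost. */
		prepare_to_wait(&demo_wq, &wq, TASK_INTERRUPTIBLE);
		if (demo_condition() || signal_pending(current))
			break;
		schedule();
	}
	finish_wait(&demo_wq, &wq);
}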
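
Likewise, under the same caveat, a consolidated sketch of the synchronous single-page I/O pattern the rewritten sync_page_io() follows: allocate a bio with bio_alloc(), attach the page with bio_add_page(), submit it, and block on a completion that the bi_end_io callback signals. demo_end_io() and demo_sync_page_io() are hypothetical names used only for illustration.

#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/completion.h>

/* bi_end_io callback: signal the waiter once the whole bio has completed. */
static int demo_end_io(struct bio *bio, unsigned int bytes_done, int error)
{
	if (bio->bi_size)
		return 1;	/* more of the bio still in flight */
	complete((struct completion *)bio->bi_private);
	return 0;
}

static int demo_sync_page_io(struct block_device *bdev, sector_t sector,
			     int size, struct page *page, int rw)
{
	struct bio *bio = bio_alloc(GFP_KERNEL, 1);
	struct completion event;
	int ret;

	rw |= (1 << BIO_RW_SYNC);	/* ask for the queue to be unplugged promptly */
	bio->bi_bdev = bdev;
	bio->bi_sector = sector;
	bio_add_page(bio, page, size, 0);
	init_completion(&event);
	bio->bi_private = &event;
	bio->bi_end_io = demo_end_io;
	submit_bio(rw, bio);

	wait_for_completion(&event);
	ret = test_bit(BIO_UPTODATE, &bio->bi_flags);
	bio_put(bio);			/* drop the reference taken by bio_alloc() */
	return ret;
}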