X-Git-Url: http://git.onelab.eu/?a=blobdiff_plain;f=drivers%2Fmd%2Fmd.c;h=af1f87678981eaeb4404f473f1a905507e131f8f;hb=97bf2856c6014879bd04983a3e9dfcdac1e7fe85;hp=2fd2cd4470a678a657a8ca1e7f578c3e966692e5;hpb=76828883507a47dae78837ab5dec5a5b4513c667;p=linux-2.6.git

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 2fd2cd447..af1f87678 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -33,16 +33,16 @@
  */

 #include <linux/module.h>
-#include <linux/config.h>
 #include <linux/kthread.h>
 #include <linux/linkage.h>
 #include <linux/raid/md.h>
 #include <linux/raid/bitmap.h>
 #include <linux/sysctl.h>
-#include <linux/devfs_fs_kernel.h>
 #include <linux/buffer_head.h> /* for invalidate_bdev */
-#include <linux/suspend.h>
 #include <linux/poll.h>
+#include <linux/mutex.h>
+#include <linux/ctype.h>
+#include <linux/freezer.h>

 #include <linux/init.h>
@@ -71,6 +71,10 @@ static void autostart_arrays (int part);
 static LIST_HEAD(pers_list);
 static DEFINE_SPINLOCK(pers_lock);

+static void md_print_devices(void);
+
+#define MD_BUG(x...) { printk("md: bug in file %s, line %d\n", __FILE__, __LINE__); md_print_devices(); }
+
 /*
  * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit'
  * is 1000 KB/sec, so the extra system load does not show up that much.
@@ -106,7 +110,7 @@ static ctl_table raid_table[] = {
 		.procname	= "speed_limit_min",
 		.data		= &sysctl_speed_limit_min,
 		.maxlen		= sizeof(int),
-		.mode		= 0644,
+		.mode		= S_IRUGO|S_IWUSR,
 		.proc_handler	= &proc_dointvec,
 	},
 	{
@@ -114,7 +118,7 @@ static ctl_table raid_table[] = {
 		.procname	= "speed_limit_max",
 		.data		= &sysctl_speed_limit_max,
 		.maxlen		= sizeof(int),
-		.mode		= 0644,
+		.mode		= S_IRUGO|S_IWUSR,
 		.proc_handler	= &proc_dointvec,
 	},
 	{ .ctl_name = 0 }
@@ -125,7 +129,7 @@ static ctl_table raid_dir_table[] = {
 		.ctl_name	= DEV_RAID,
 		.procname	= "raid",
 		.maxlen		= 0,
-		.mode		= 0555,
+		.mode		= S_IRUGO|S_IXUGO,
 		.child		= raid_table,
 	},
 	{ .ctl_name = 0 }
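The three sysctl hunks above trade the magic octal modes 0644 and 0555 for the symbolic forms S_IRUGO|S_IWUSR and S_IRUGO|S_IXUGO. Those *UGO shorthands are kernel-internal macros, but the equivalence is easy to check from ordinary user-space C with the per-class POSIX bits; a minimal sketch, not kernel code:

    #include <assert.h>
    #include <sys/stat.h>

    int main(void)
    {
            /* S_IRUGO and S_IXUGO are kernel shorthands; user space
             * spells them out one permission class at a time. */
            unsigned int irugo = S_IRUSR | S_IRGRP | S_IROTH; /* 0444 */
            unsigned int ixugo = S_IXUSR | S_IXGRP | S_IXOTH; /* 0111 */

            assert((irugo | S_IWUSR) == 0644); /* speed_limit_{min,max} */
            assert((irugo | ixugo) == 0555);   /* the "raid" directory  */
            return 0;
    }

The value is unchanged either way; the symbolic spelling just makes the owner-write versus world-read split visible at a glance.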
@@ -158,7 +162,18 @@ static int start_readonly;
  */
 static DECLARE_WAIT_QUEUE_HEAD(md_event_waiters);
 static atomic_t md_event_count;
-static void md_new_event(mddev_t *mddev)
+void md_new_event(mddev_t *mddev)
+{
+	atomic_inc(&md_event_count);
+	wake_up(&md_event_waiters);
+	sysfs_notify(&mddev->kobj, NULL, "sync_action");
+}
+EXPORT_SYMBOL_GPL(md_new_event);
+
+/* Alternate version that can be called from interrupts
+ * when calling sysfs_notify isn't needed.
+ */
+static void md_new_event_inintr(mddev_t *mddev)
 {
 	atomic_inc(&md_event_count);
 	wake_up(&md_event_waiters);
@@ -213,10 +228,11 @@ static void mddev_put(mddev_t *mddev)
 		return;
 	if (!mddev->raid_disks && list_empty(&mddev->disks)) {
 		list_del(&mddev->all_mddevs);
-		blk_put_queue(mddev->queue);
+		spin_unlock(&all_mddevs_lock);
+		blk_cleanup_queue(mddev->queue);
 		kobject_unregister(&mddev->kobj);
-	}
-	spin_unlock(&all_mddevs_lock);
+	} else
+		spin_unlock(&all_mddevs_lock);
 }

 static mddev_t * mddev_find(dev_t unit)
@@ -250,7 +266,7 @@ static mddev_t * mddev_find(dev_t unit)
 	else
 		new->md_minor = MINOR(unit) >> MdpMinorShift;

-	init_MUTEX(&new->reconfig_sem);
+	mutex_init(&new->reconfig_mutex);
 	INIT_LIST_HEAD(&new->disks);
 	INIT_LIST_HEAD(&new->all_mddevs);
 	init_timer(&new->safemode_timer);
@@ -263,6 +279,7 @@ static mddev_t * mddev_find(dev_t unit)
 		kfree(new);
 		return NULL;
 	}
+	set_bit(QUEUE_FLAG_CLUSTER, &new->queue->queue_flags);

 	blk_queue_make_request(new->queue, md_fail_request);

@@ -271,22 +288,17 @@ static inline int mddev_lock(mddev_t * mddev)
 {
-	return down_interruptible(&mddev->reconfig_sem);
-}
-
-static inline void mddev_lock_uninterruptible(mddev_t * mddev)
-{
-	down(&mddev->reconfig_sem);
+	return mutex_lock_interruptible(&mddev->reconfig_mutex);
 }

 static inline int mddev_trylock(mddev_t * mddev)
 {
-	return down_trylock(&mddev->reconfig_sem);
+	return mutex_trylock(&mddev->reconfig_mutex);
 }

 static inline void mddev_unlock(mddev_t * mddev)
 {
-	up(&mddev->reconfig_sem);
+	mutex_unlock(&mddev->reconfig_mutex);

 	md_wakeup_thread(mddev->thread);
 }
@@ -377,8 +389,12 @@ static int super_written(struct bio *bio, unsigned int bytes_done, int error)
 	if (bio->bi_size)
 		return 1;

-	if (error || !test_bit(BIO_UPTODATE, &bio->bi_flags))
+	if (error || !test_bit(BIO_UPTODATE, &bio->bi_flags)) {
+		printk("md: super_written gets error=%d, uptodate=%d\n",
+		       error, test_bit(BIO_UPTODATE, &bio->bi_flags));
+		WARN_ON(test_bit(BIO_UPTODATE, &bio->bi_flags));
 		md_error(mddev, rdev);
+	}

 	if (atomic_dec_and_test(&mddev->pending_writes))
 		wake_up(&mddev->sb_wait);
@@ -657,7 +673,8 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version
 	}

 	if (sb->major_version != 0 ||
-	    sb->minor_version != 90) {
+	    sb->minor_version < 90 ||
+	    sb->minor_version > 91) {
 		printk(KERN_WARNING "Bad version number %d.%d on %s\n",
 			sb->major_version, sb->minor_version,
 			b);
@@ -722,6 +739,7 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
 {
 	mdp_disk_t *desc;
 	mdp_super_t *sb = (mdp_super_t *)page_address(rdev->sb_page);
+	__u64 ev1 = md_event(sb);

 	rdev->raid_disk = -1;
 	rdev->flags = 0;
@@ -738,10 +756,24 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
 		mddev->layout = sb->layout;
 		mddev->raid_disks = sb->raid_disks;
 		mddev->size = sb->size;
-		mddev->events = md_event(sb);
+		mddev->events = ev1;
 		mddev->bitmap_offset = 0;
 		mddev->default_bitmap_offset = MD_SB_BYTES >> 9;

+		if (mddev->minor_version >= 91) {
+			mddev->reshape_position = sb->reshape_position;
+			mddev->delta_disks = sb->delta_disks;
+			mddev->new_level = sb->new_level;
+			mddev->new_layout = sb->new_layout;
+			mddev->new_chunk = sb->new_chunk;
+		} else {
+			mddev->reshape_position = MaxSector;
+			mddev->delta_disks = 0;
+			mddev->new_level = mddev->level;
+			mddev->new_layout = mddev->layout;
+			mddev->new_chunk = mddev->chunk_size;
+		}
+
 		if (sb->state & (1<<MD_SB_CLEAN))
 			mddev->recovery_cp = MaxSector;
 		else {
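The superblock changes above overload the 0.90 format's minor version: loading now accepts both 90 and 91, where 91 simply means "a reshape was in flight, the reshape_position/delta_disks/new_* fields are valid", and (further below) super_90_sync writes 91 only while a reshape is pending. A standalone model of that convention, with deliberately simplified types and our own helper name:

    #include <stdio.h>

    #define MaxSector (~0ULL)   /* stand-in for the kernel's sentinel */

    struct sb90 { int minor_version; unsigned long long reshape_position; };

    /* hypothetical mirror of the validate logic above */
    static unsigned long long reshape_from_sb(const struct sb90 *sb)
    {
            if (sb->minor_version >= 91)   /* 0.91: reshape fields valid */
                    return sb->reshape_position;
            return MaxSector;              /* plain 0.90: no reshape     */
    }

    int main(void)
    {
            struct sb90 plain = { 90, 0 }, mid_reshape = { 91, 12345 };
            printf("%llu %llu\n", reshape_from_sb(&plain),
                   reshape_from_sb(&mid_reshape));
            return 0;
    }

Writing 90 back on cleanly stopped arrays preserves compatibility: an old kernel can still assemble anything that is not mid-reshape.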
@@ -761,7 +793,8 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
 		if (sb->state & (1<<MD_SB_BITMAP_PRESENT) &&
 		    mddev->bitmap_file == NULL) {
-			if (mddev->level != 1 && mddev->level != 5 && mddev->level != 6
+			if (mddev->level != 1 && mddev->level != 4
+			    && mddev->level != 5 && mddev->level != 6
 			    && mddev->level != 10) {
 				/* FIXME use a better test */
 				printk(KERN_WARNING "md: bitmaps not supported for this level.\n");
@@ -772,7 +805,6 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
 	} else if (mddev->pers == NULL) {
 		/* Insist on good event counter while assembling */
-		__u64 ev1 = md_event(sb);
 		++ev1;
 		if (ev1 < mddev->events)
 			return -EINVAL;
@@ -780,19 +812,21 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
 		/* if adding to array with a bitmap, then we can accept an
 		 * older device ... but not too old.
 		 */
-		__u64 ev1 = md_event(sb);
 		if (ev1 < mddev->bitmap->events_cleared)
 			return 0;
-	} else /* just a hot-add of a new device, leave raid_disk at -1 */
-		return 0;
+	} else {
+		if (ev1 < mddev->events)
+			/* just a hot-add of a new device, leave raid_disk at -1 */
+			return 0;
+	}

 	if (mddev->level != LEVEL_MULTIPATH) {
 		desc = sb->disks + rdev->desc_nr;

 		if (desc->state & (1<<MD_DISK_FAULTY))
 			set_bit(Faulty, &rdev->flags);
-		else if (desc->state & (1<<MD_DISK_SYNC) &&
-			    desc->raid_disk < mddev->raid_disks) {
+		else if (desc->state & (1<<MD_DISK_SYNC) /* &&
+			    desc->raid_disk < mddev->raid_disks */) {
 			set_bit(In_sync, &rdev->flags);
 			rdev->raid_disk = desc->raid_disk;
 		}
@@ -835,7 +869,6 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev)

 	sb->md_magic = MD_SB_MAGIC;
 	sb->major_version = mddev->major_version;
-	sb->minor_version = mddev->minor_version;
 	sb->patch_version = mddev->patch_version;
 	sb->gvalid_words = 0; /* ignored */
 	memcpy(&sb->set_uuid0, mddev->uuid+0, 4);
@@ -854,6 +887,17 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev)
 	sb->events_hi = (mddev->events>>32);
 	sb->events_lo = (u32)mddev->events;

+	if (mddev->reshape_position == MaxSector)
+		sb->minor_version = 90;
+	else {
+		sb->minor_version = 91;
+		sb->reshape_position = mddev->reshape_position;
+		sb->new_level = mddev->new_level;
+		sb->delta_disks = mddev->delta_disks;
+		sb->new_layout = mddev->new_layout;
+		sb->new_chunk = mddev->new_chunk;
+	}
+	mddev->minor_version = sb->minor_version;
 	if (mddev->in_sync) {
 		sb->recovery_cp = mddev->recovery_cp;
@@ -890,10 +934,9 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev)
 			d->raid_disk = rdev2->raid_disk;
 		else
 			d->raid_disk = rdev2->desc_nr; /* compatibility */
-		if (test_bit(Faulty, &rdev2->flags)) {
+		if (test_bit(Faulty, &rdev2->flags))
 			d->state = (1<<MD_DISK_FAULTY);
-		} else if (test_bit(In_sync, &rdev2->flags)) {
+		else if (test_bit(In_sync, &rdev2->flags)) {
 			d->state = (1<<MD_DISK_ACTIVE);
 			d->state |= (1<<MD_DISK_SYNC);
@@ ... @@ static unsigned int calc_sb_1_csum(struct mdp_superblock_1 * sb)
 	int size = 256 + le32_to_cpu(sb->max_dev)*2;
-	unsigned int *isuper = (unsigned int*)sb;
+	__le32 *isuper = (__le32*)sb;
 	int i;

 	disk_csum = sb->sb_csum;
@@ -946,7 +990,7 @@ static unsigned int calc_sb_1_csum(struct mdp_superblock_1 * sb)
 		newcsum += le32_to_cpu(*isuper++);

 	if (size == 2)
-		newcsum += le16_to_cpu(*(unsigned short*) isuper);
+		newcsum += le16_to_cpu(*(__le16*) isuper);

 	csum = (newcsum & 0xffffffff) + (newcsum >> 32);
 	sb->sb_csum = disk_csum;
@@ -1023,6 +1067,11 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
 	if (rdev->sb_size & bmask)
 		rdev-> sb_size = (rdev->sb_size | bmask)+1;

+	if (sb->level == cpu_to_le32(LEVEL_MULTIPATH))
+		rdev->desc_nr = -1;
+	else
+		rdev->desc_nr = le32_to_cpu(sb->dev_number);
+
 	if (refdev == 0)
 		ret = 1;
 	else {
@@ -1058,7 +1107,7 @@
 	if (le32_to_cpu(sb->chunksize))
 		rdev->size &= ~((sector_t)le32_to_cpu(sb->chunksize)/2 - 1);

-	if (le32_to_cpu(sb->size) > rdev->size*2)
+	if
(le64_to_cpu(sb->size) > rdev->size*2) return -EINVAL; return ret; } @@ -1066,6 +1115,7 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev) { struct mdp_superblock_1 *sb = (struct mdp_superblock_1*)page_address(rdev->sb_page); + __u64 ev1 = le64_to_cpu(sb->events); rdev->raid_disk = -1; rdev->flags = 0; @@ -1081,7 +1131,7 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev) mddev->layout = le32_to_cpu(sb->layout); mddev->raid_disks = le32_to_cpu(sb->raid_disks); mddev->size = le64_to_cpu(sb->size)/2; - mddev->events = le64_to_cpu(sb->events); + mddev->events = ev1; mddev->bitmap_offset = 0; mddev->default_bitmap_offset = 1024 >> 9; @@ -1099,9 +1149,22 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev) } mddev->bitmap_offset = (__s32)le32_to_cpu(sb->bitmap_offset); } + if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) { + mddev->reshape_position = le64_to_cpu(sb->reshape_position); + mddev->delta_disks = le32_to_cpu(sb->delta_disks); + mddev->new_level = le32_to_cpu(sb->new_level); + mddev->new_layout = le32_to_cpu(sb->new_layout); + mddev->new_chunk = le32_to_cpu(sb->new_chunk)<<9; + } else { + mddev->reshape_position = MaxSector; + mddev->delta_disks = 0; + mddev->new_level = mddev->level; + mddev->new_layout = mddev->layout; + mddev->new_chunk = mddev->chunk_size; + } + } else if (mddev->pers == NULL) { /* Insist of good event counter while assembling */ - __u64 ev1 = le64_to_cpu(sb->events); ++ev1; if (ev1 < mddev->events) return -EINVAL; @@ -1109,15 +1172,15 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev) /* If adding to array with a bitmap, then we can accept an * older device, but not too old. */ - __u64 ev1 = le64_to_cpu(sb->events); if (ev1 < mddev->bitmap->events_cleared) return 0; - } else /* just a hot-add of a new device, leave raid_disk at -1 */ - return 0; - + } else { + if (ev1 < mddev->events) + /* just a hot-add of a new device, leave raid_disk at -1 */ + return 0; + } if (mddev->level != LEVEL_MULTIPATH) { int role; - rdev->desc_nr = le32_to_cpu(sb->dev_number); role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]); switch(role) { case 0xffff: /* spare */ @@ -1126,7 +1189,11 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev) set_bit(Faulty, &rdev->flags); break; default: - set_bit(In_sync, &rdev->flags); + if ((le32_to_cpu(sb->feature_map) & + MD_FEATURE_RECOVERY_OFFSET)) + rdev->recovery_offset = le64_to_cpu(sb->recovery_offset); + else + set_bit(In_sync, &rdev->flags); rdev->raid_disk = role; break; } @@ -1150,6 +1217,7 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev) sb->feature_map = 0; sb->pad0 = 0; + sb->recovery_offset = cpu_to_le64(0); memset(sb->pad1, 0, sizeof(sb->pad1)); memset(sb->pad2, 0, sizeof(sb->pad2)); memset(sb->pad3, 0, sizeof(sb->pad3)); @@ -1161,7 +1229,7 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev) else sb->resync_offset = cpu_to_le64(0); - sb->cnt_corrected_read = atomic_read(&rdev->corrected_errors); + sb->cnt_corrected_read = cpu_to_le32(atomic_read(&rdev->corrected_errors)); sb->raid_disks = cpu_to_le32(mddev->raid_disks); sb->size = cpu_to_le64(mddev->size<<1); @@ -1171,6 +1239,22 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev) sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET); } + if (rdev->raid_disk >= 0 && + !test_bit(In_sync, &rdev->flags) && + rdev->recovery_offset > 0) { + sb->feature_map |= cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET); 
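This hunk ORs a cpu_to_le32()-converted flag into a feature map that is kept in little-endian byte order on disk (the stored offset itself follows right after this note). Doing the OR in on-disk order is correct on either endianness because byte swapping distributes over bitwise OR; a self-contained check, where bswap32 stands in for the kernel's conversion helpers and the flag values are illustrative only:

    #include <assert.h>
    #include <stdint.h>

    static uint32_t bswap32(uint32_t x)
    {
            return (x >> 24) | ((x >> 8) & 0xff00) |
                   ((x & 0xff00) << 8) | (x << 24);
    }

    int main(void)
    {
            uint32_t feature_map = bswap32(0x1); /* already in disk order */
            uint32_t flag = 0x2;                 /* illustrative bit only */

            feature_map |= bswap32(flag);        /* mirrors the hunk above */
            assert(feature_map == bswap32(0x1 | 0x2));
            return 0;
    }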
+ sb->recovery_offset = cpu_to_le64(rdev->recovery_offset); + } + + if (mddev->reshape_position != MaxSector) { + sb->feature_map |= cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE); + sb->reshape_position = cpu_to_le64(mddev->reshape_position); + sb->new_layout = cpu_to_le32(mddev->new_layout); + sb->delta_disks = cpu_to_le32(mddev->delta_disks); + sb->new_level = cpu_to_le32(mddev->new_level); + sb->new_chunk = cpu_to_le32(mddev->new_chunk>>9); + } + max_dev = 0; ITERATE_RDEV(mddev,rdev2,tmp) if (rdev2->desc_nr+1 > max_dev) @@ -1186,11 +1270,12 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev) sb->dev_roles[i] = cpu_to_le16(0xfffe); else if (test_bit(In_sync, &rdev2->flags)) sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk); + else if (rdev2->raid_disk >= 0 && rdev2->recovery_offset > 0) + sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk); else sb->dev_roles[i] = cpu_to_le16(0xffff); } - sb->recovery_offset = cpu_to_le64(0); /* not supported yet */ sb->sb_csum = calc_sb_1_csum(sb); } @@ -1298,6 +1383,7 @@ static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev) else ko = &rdev->bdev->bd_disk->kobj; sysfs_create_link(&rdev->kobj, ko, "block"); + bd_claim_by_disk(rdev->bdev, rdev, mddev->gendisk); return 0; } @@ -1308,6 +1394,7 @@ static void unbind_rdev_from_array(mdk_rdev_t * rdev) MD_BUG(); return; } + bd_release_from_disk(rdev->bdev, rdev->mddev->gendisk); list_del_init(&rdev->same_set); printk(KERN_INFO "md: unbind<%s>\n", bdevname(rdev->bdev,b)); rdev->mddev = NULL; @@ -1449,7 +1536,7 @@ static void print_rdev(mdk_rdev_t *rdev) printk(KERN_INFO "md: no rdev superblock!\n"); } -void md_print_devices(void) +static void md_print_devices(void) { struct list_head *tmp, *tmp2; mdk_rdev_t *rdev; @@ -1478,30 +1565,94 @@ void md_print_devices(void) } -static void sync_sbs(mddev_t * mddev) +static void sync_sbs(mddev_t * mddev, int nospares) { + /* Update each superblock (in-memory image), but + * if we are allowed to, skip spares which already + * have the right event counter, or have one earlier + * (which would mean they aren't being marked as dirty + * with the rest of the array) + */ mdk_rdev_t *rdev; struct list_head *tmp; ITERATE_RDEV(mddev,rdev,tmp) { - super_types[mddev->major_version]. - sync_super(mddev, rdev); - rdev->sb_loaded = 1; + if (rdev->sb_events == mddev->events || + (nospares && + rdev->raid_disk < 0 && + (rdev->sb_events&1)==0 && + rdev->sb_events+1 == mddev->events)) { + /* Don't update this superblock */ + rdev->sb_loaded = 2; + } else { + super_types[mddev->major_version]. + sync_super(mddev, rdev); + rdev->sb_loaded = 1; + } } } -static void md_update_sb(mddev_t * mddev) +static void md_update_sb(mddev_t * mddev, int force_change) { int err; struct list_head *tmp; mdk_rdev_t *rdev; int sync_req; + int nospares = 0; repeat: spin_lock_irq(&mddev->write_lock); + + set_bit(MD_CHANGE_PENDING, &mddev->flags); + if (test_and_clear_bit(MD_CHANGE_DEVS, &mddev->flags)) + force_change = 1; + if (test_and_clear_bit(MD_CHANGE_CLEAN, &mddev->flags)) + /* just a clean<-> dirty transition, possibly leave spares alone, + * though if events isn't the right even/odd, we will have to do + * spares after all + */ + nospares = 1; + if (force_change) + nospares = 0; + if (mddev->degraded) + /* If the array is degraded, then skipping spares is both + * dangerous and fairly pointless. + * Dangerous because a device that was removed from the array + * might have a event_count that still looks up-to-date, + * so it can be re-added without a resync. 
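The comment being completed here belongs to md_update_sb()'s 'nospares' logic, and the code that follows it maintains a simple invariant: an even event count means the array was clean, an odd count means dirty, so a clean->dirty->clean bounce can step the counter back one instead of rewriting the superblock on every spare. A standalone sketch of the parity rule (helper name ours, not the kernel's):

    #include <assert.h>

    /* Advance the event counter so its parity encodes the array state:
     * even == clean, odd == dirty. */
    static unsigned long long bump(unsigned long long events, int clean)
    {
            events++;                      /* always move forward first */
            if ((events & 1) == (unsigned)clean)
                    events++;              /* fix parity: clean wants even,
                                            * dirty wants odd */
            return events;
    }

    int main(void)
    {
            assert(bump(100, 0) == 101); /* clean(even) -> dirty(odd)   */
            assert(bump(101, 1) == 102); /* dirty(odd)  -> clean(even)  */
            assert(bump(100, 1) == 102); /* forced clean update stays even */
            return 0;
    }

Rolling back only works because the skipped spares still carry the previous even count; anything that really changes device state forces the counter forward, which is what the force_change path guarantees.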
+ * Pointless because if there are any spares to skip, + * then a recovery will happen and soon that array won't + * be degraded any more and the spare can go back to sleep then. + */ + nospares = 0; + sync_req = mddev->in_sync; mddev->utime = get_seconds(); - mddev->events ++; + + /* If this is just a dirty<->clean transition, and the array is clean + * and 'events' is odd, we can roll back to the previous clean state */ + if (nospares + && (mddev->in_sync && mddev->recovery_cp == MaxSector) + && (mddev->events & 1) + && mddev->events != 1) + mddev->events--; + else { + /* otherwise we have to go forward and ... */ + mddev->events ++; + if (!mddev->in_sync || mddev->recovery_cp != MaxSector) { /* not clean */ + /* .. if the array isn't clean, insist on an odd 'events' */ + if ((mddev->events&1)==0) { + mddev->events++; + nospares = 0; + } + } else { + /* otherwise insist on an even 'events' (for clean states) */ + if ((mddev->events&1)) { + mddev->events++; + nospares = 0; + } + } + } if (!mddev->events) { /* @@ -1512,15 +1663,14 @@ repeat: MD_BUG(); mddev->events --; } - mddev->sb_dirty = 2; - sync_sbs(mddev); + sync_sbs(mddev, nospares); /* * do not write anything to disk if using * nonpersistent superblocks */ if (!mddev->persistent) { - mddev->sb_dirty = 0; + clear_bit(MD_CHANGE_PENDING, &mddev->flags); spin_unlock_irq(&mddev->write_lock); wake_up(&mddev->sb_wait); return; @@ -1535,6 +1685,8 @@ repeat: ITERATE_RDEV(mddev,rdev,tmp) { char b[BDEVNAME_SIZE]; dprintk(KERN_INFO "md: "); + if (rdev->sb_loaded != 1) + continue; /* no noise on spare devices */ if (test_bit(Faulty, &rdev->flags)) dprintk("(skipping faulty "); @@ -1546,6 +1698,7 @@ repeat: dprintk(KERN_INFO "(write) %s's sb offset: %llu\n", bdevname(rdev->bdev,b), (unsigned long long)rdev->sb_offset); + rdev->sb_events = mddev->events; } else dprintk(")\n"); @@ -1554,15 +1707,16 @@ repeat: break; } md_super_wait(mddev); - /* if there was a failure, sb_dirty was set to 1, and we re-write super */ + /* if there was a failure, MD_CHANGE_DEVS was set, and we re-write super */ spin_lock_irq(&mddev->write_lock); - if (mddev->in_sync != sync_req|| mddev->sb_dirty == 1) { + if (mddev->in_sync != sync_req || + test_bit(MD_CHANGE_DEVS, &mddev->flags)) { /* have to write it out again */ spin_unlock_irq(&mddev->write_lock); goto repeat; } - mddev->sb_dirty = 0; + clear_bit(MD_CHANGE_PENDING, &mddev->flags); spin_unlock_irq(&mddev->write_lock); wake_up(&mddev->sb_wait); @@ -1608,6 +1762,10 @@ state_show(mdk_rdev_t *rdev, char *page) len += sprintf(page+len, "%sin_sync",sep); sep = ","; } + if (test_bit(WriteMostly, &rdev->flags)) { + len += sprintf(page+len, "%swrite_mostly",sep); + sep = ","; + } if (!test_bit(Faulty, &rdev->flags) && !test_bit(In_sync, &rdev->flags)) { len += sprintf(page+len, "%sspare", sep); @@ -1616,8 +1774,41 @@ state_show(mdk_rdev_t *rdev, char *page) return len+sprintf(page+len, "\n"); } -static struct rdev_sysfs_entry -rdev_state = __ATTR_RO(state); +static ssize_t +state_store(mdk_rdev_t *rdev, const char *buf, size_t len) +{ + /* can write + * faulty - simulates and error + * remove - disconnects the device + * writemostly - sets write_mostly + * -writemostly - clears write_mostly + */ + int err = -EINVAL; + if (cmd_match(buf, "faulty") && rdev->mddev->pers) { + md_error(rdev->mddev, rdev); + err = 0; + } else if (cmd_match(buf, "remove")) { + if (rdev->raid_disk >= 0) + err = -EBUSY; + else { + mddev_t *mddev = rdev->mddev; + kick_rdev_from_array(rdev); + if (mddev->pers) + md_update_sb(mddev, 1); + 
md_new_event(mddev); + err = 0; + } + } else if (cmd_match(buf, "writemostly")) { + set_bit(WriteMostly, &rdev->flags); + err = 0; + } else if (cmd_match(buf, "-writemostly")) { + clear_bit(WriteMostly, &rdev->flags); + err = 0; + } + return err ? err : len; +} +static struct rdev_sysfs_entry rdev_state = +__ATTR(state, S_IRUGO|S_IWUSR, state_show, state_store); static ssize_t super_show(mdk_rdev_t *rdev, char *page) @@ -1648,7 +1839,7 @@ errors_store(mdk_rdev_t *rdev, const char *buf, size_t len) return -EINVAL; } static struct rdev_sysfs_entry rdev_errors = -__ATTR(errors, 0644, errors_show, errors_store); +__ATTR(errors, S_IRUGO|S_IWUSR, errors_show, errors_store); static ssize_t slot_show(mdk_rdev_t *rdev, char *page) @@ -1682,7 +1873,7 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len) static struct rdev_sysfs_entry rdev_slot = -__ATTR(slot, 0644, slot_show, slot_store); +__ATTR(slot, S_IRUGO|S_IWUSR, slot_show, slot_store); static ssize_t offset_show(mdk_rdev_t *rdev, char *page) @@ -1704,7 +1895,7 @@ offset_store(mdk_rdev_t *rdev, const char *buf, size_t len) } static struct rdev_sysfs_entry rdev_offset = -__ATTR(offset, 0644, offset_show, offset_store); +__ATTR(offset, S_IRUGO|S_IWUSR, offset_show, offset_store); static ssize_t rdev_size_show(mdk_rdev_t *rdev, char *page) @@ -1728,7 +1919,7 @@ rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len) } static struct rdev_sysfs_entry rdev_size = -__ATTR(size, 0644, rdev_size_show, rdev_size_store); +__ATTR(size, S_IRUGO|S_IWUSR, rdev_size_show, rdev_size_store); static struct attribute *rdev_default_attrs[] = { &rdev_state.attr, @@ -1759,6 +1950,8 @@ rdev_attr_store(struct kobject *kobj, struct attribute *attr, if (!entry->store) return -EIO; + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; return entry->store(rdev, page, length); } @@ -1812,8 +2005,11 @@ static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_mi kobject_init(&rdev->kobj); rdev->desc_nr = -1; + rdev->saved_raid_disk = -1; + rdev->raid_disk = -1; rdev->flags = 0; rdev->data_offset = 0; + rdev->sb_events = 0; atomic_set(&rdev->nr_pending, 0); atomic_set(&rdev->read_errors, 0); atomic_set(&rdev->corrected_errors, 0); @@ -1918,6 +2114,54 @@ static void analyze_sbs(mddev_t * mddev) } +static ssize_t +safe_delay_show(mddev_t *mddev, char *page) +{ + int msec = (mddev->safemode_delay*1000)/HZ; + return sprintf(page, "%d.%03d\n", msec/1000, msec%1000); +} +static ssize_t +safe_delay_store(mddev_t *mddev, const char *cbuf, size_t len) +{ + int scale=1; + int dot=0; + int i; + unsigned long msec; + char buf[30]; + char *e; + /* remove a period, and count digits after it */ + if (len >= sizeof(buf)) + return -EINVAL; + strlcpy(buf, cbuf, len); + buf[len] = 0; + for (i=0; isafemode_delay = 0; + else { + mddev->safemode_delay = (msec*HZ)/1000; + if (mddev->safemode_delay == 0) + mddev->safemode_delay = 1; + } + return len; +} +static struct md_sysfs_entry md_safe_delay = +__ATTR(safe_mode_delay, S_IRUGO|S_IWUSR,safe_delay_show, safe_delay_store); + static ssize_t level_show(mddev_t *mddev, char *page) { @@ -1951,7 +2195,33 @@ level_store(mddev_t *mddev, const char *buf, size_t len) } static struct md_sysfs_entry md_level = -__ATTR(level, 0644, level_show, level_store); +__ATTR(level, S_IRUGO|S_IWUSR, level_show, level_store); + + +static ssize_t +layout_show(mddev_t *mddev, char *page) +{ + /* just a number, not meaningful for all levels */ + return sprintf(page, "%d\n", mddev->layout); +} + +static ssize_t +layout_store(mddev_t *mddev, const 
char *buf, size_t len) +{ + char *e; + unsigned long n = simple_strtoul(buf, &e, 10); + if (mddev->pers) + return -EBUSY; + + if (!*buf || (*e && *e != '\n')) + return -EINVAL; + + mddev->layout = n; + return len; +} +static struct md_sysfs_entry md_layout = +__ATTR(layout, S_IRUGO|S_IWUSR, layout_show, layout_store); + static ssize_t raid_disks_show(mddev_t *mddev, char *page) @@ -1966,7 +2236,6 @@ static int update_raid_disks(mddev_t *mddev, int raid_disks); static ssize_t raid_disks_store(mddev_t *mddev, const char *buf, size_t len) { - /* can only set raid_disks if array is not yet active */ char *e; int rv = 0; unsigned long n = simple_strtoul(buf, &e, 10); @@ -1981,7 +2250,7 @@ raid_disks_store(mddev_t *mddev, const char *buf, size_t len) return rv ? rv : len; } static struct md_sysfs_entry md_raid_disks = -__ATTR(raid_disks, 0644, raid_disks_show, raid_disks_store); +__ATTR(raid_disks, S_IRUGO|S_IWUSR, raid_disks_show, raid_disks_store); static ssize_t chunk_size_show(mddev_t *mddev, char *page) @@ -2005,7 +2274,202 @@ chunk_size_store(mddev_t *mddev, const char *buf, size_t len) return len; } static struct md_sysfs_entry md_chunk_size = -__ATTR(chunk_size, 0644, chunk_size_show, chunk_size_store); +__ATTR(chunk_size, S_IRUGO|S_IWUSR, chunk_size_show, chunk_size_store); + +static ssize_t +resync_start_show(mddev_t *mddev, char *page) +{ + return sprintf(page, "%llu\n", (unsigned long long)mddev->recovery_cp); +} + +static ssize_t +resync_start_store(mddev_t *mddev, const char *buf, size_t len) +{ + /* can only set chunk_size if array is not yet active */ + char *e; + unsigned long long n = simple_strtoull(buf, &e, 10); + + if (mddev->pers) + return -EBUSY; + if (!*buf || (*e && *e != '\n')) + return -EINVAL; + + mddev->recovery_cp = n; + return len; +} +static struct md_sysfs_entry md_resync_start = +__ATTR(resync_start, S_IRUGO|S_IWUSR, resync_start_show, resync_start_store); + +/* + * The array state can be: + * + * clear + * No devices, no size, no level + * Equivalent to STOP_ARRAY ioctl + * inactive + * May have some settings, but array is not active + * all IO results in error + * When written, doesn't tear down array, but just stops it + * suspended (not supported yet) + * All IO requests will block. The array can be reconfigured. + * Writing this, if accepted, will block until array is quiessent + * readonly + * no resync can happen. no superblocks get written. + * write requests fail + * read-auto + * like readonly, but behaves like 'clean' on a write request. + * + * clean - no pending writes, but otherwise active. + * When written to inactive array, starts without resync + * If a write request arrives then + * if metadata is known, mark 'dirty' and switch to 'active'. + * if not known, block and switch to write-pending + * If written to an active array that has pending writes, then fails. + * active + * fully active: IO and resync can be happening. + * When written to inactive array, starts with resync + * + * write-pending + * clean, but writes are blocked waiting for 'active' to be written. + * + * active-idle + * like active, but no writes have been seen for a while (100msec). 
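array_state_store(), introduced in this region, maps the written word onto an enum by scanning a NULL-terminated name table; the table index doubles as the enum value, and running off the end yields bad_word. A trimmed, standalone sketch of that lookup, using plain strcmp where the kernel's cmd_match also tolerates a trailing newline:

    #include <assert.h>
    #include <string.h>

    enum array_state { clear, inactive, readonly, bad_word }; /* trimmed */
    static const char *array_states[] =
            { "clear", "inactive", "readonly", NULL };

    static enum array_state match_word(const char *word)
    {
            int n;
            for (n = 0; array_states[n]; n++)
                    if (strcmp(word, array_states[n]) == 0)
                            break;
            return (enum array_state)n; /* NULL slot maps to bad_word */
    }

    int main(void)
    {
            assert(match_word("inactive") == inactive);
            assert(match_word("bogus") == bad_word);
            return 0;
    }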
+ * + */ +enum array_state { clear, inactive, suspended, readonly, read_auto, clean, active, + write_pending, active_idle, bad_word}; +static char *array_states[] = { + "clear", "inactive", "suspended", "readonly", "read-auto", "clean", "active", + "write-pending", "active-idle", NULL }; + +static int match_word(const char *word, char **list) +{ + int n; + for (n=0; list[n]; n++) + if (cmd_match(word, list[n])) + break; + return n; +} + +static ssize_t +array_state_show(mddev_t *mddev, char *page) +{ + enum array_state st = inactive; + + if (mddev->pers) + switch(mddev->ro) { + case 1: + st = readonly; + break; + case 2: + st = read_auto; + break; + case 0: + if (mddev->in_sync) + st = clean; + else if (mddev->safemode) + st = active_idle; + else + st = active; + } + else { + if (list_empty(&mddev->disks) && + mddev->raid_disks == 0 && + mddev->size == 0) + st = clear; + else + st = inactive; + } + return sprintf(page, "%s\n", array_states[st]); +} + +static int do_md_stop(mddev_t * mddev, int ro); +static int do_md_run(mddev_t * mddev); +static int restart_array(mddev_t *mddev); + +static ssize_t +array_state_store(mddev_t *mddev, const char *buf, size_t len) +{ + int err = -EINVAL; + enum array_state st = match_word(buf, array_states); + switch(st) { + case bad_word: + break; + case clear: + /* stopping an active array */ + if (mddev->pers) { + if (atomic_read(&mddev->active) > 1) + return -EBUSY; + err = do_md_stop(mddev, 0); + } + break; + case inactive: + /* stopping an active array */ + if (mddev->pers) { + if (atomic_read(&mddev->active) > 1) + return -EBUSY; + err = do_md_stop(mddev, 2); + } + break; + case suspended: + break; /* not supported yet */ + case readonly: + if (mddev->pers) + err = do_md_stop(mddev, 1); + else { + mddev->ro = 1; + err = do_md_run(mddev); + } + break; + case read_auto: + /* stopping an active array */ + if (mddev->pers) { + err = do_md_stop(mddev, 1); + if (err == 0) + mddev->ro = 2; /* FIXME mark devices writable */ + } else { + mddev->ro = 2; + err = do_md_run(mddev); + } + break; + case clean: + if (mddev->pers) { + restart_array(mddev); + spin_lock_irq(&mddev->write_lock); + if (atomic_read(&mddev->writes_pending) == 0) { + mddev->in_sync = 1; + set_bit(MD_CHANGE_CLEAN, &mddev->flags); + } + spin_unlock_irq(&mddev->write_lock); + } else { + mddev->ro = 0; + mddev->recovery_cp = MaxSector; + err = do_md_run(mddev); + } + break; + case active: + if (mddev->pers) { + restart_array(mddev); + clear_bit(MD_CHANGE_CLEAN, &mddev->flags); + wake_up(&mddev->sb_wait); + err = 0; + } else { + mddev->ro = 0; + err = do_md_run(mddev); + } + break; + case write_pending: + case active_idle: + /* these cannot be set */ + break; + } + if (err) + return err; + else + return len; +} +static struct md_sysfs_entry md_array_state = +__ATTR(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store); static ssize_t null_show(mddev_t *mddev, char *page) @@ -2065,7 +2529,37 @@ new_dev_store(mddev_t *mddev, const char *buf, size_t len) } static struct md_sysfs_entry md_new_device = -__ATTR(new_dev, 0200, null_show, new_dev_store); +__ATTR(new_dev, S_IWUSR, null_show, new_dev_store); + +static ssize_t +bitmap_store(mddev_t *mddev, const char *buf, size_t len) +{ + char *end; + unsigned long chunk, end_chunk; + + if (!mddev->bitmap) + goto out; + /* buf should be ... or - ... 
(range) */ + while (*buf) { + chunk = end_chunk = simple_strtoul(buf, &end, 0); + if (buf == end) break; + if (*end == '-') { /* range */ + buf = end + 1; + end_chunk = simple_strtoul(buf, &end, 0); + if (buf == end) break; + } + if (*end && !isspace(*end)) break; + bitmap_dirty_bits(mddev->bitmap, chunk, end_chunk); + buf = end; + while (isspace(*buf)) buf++; + } + bitmap_unplug(mddev->bitmap); /* flush the bits to disk */ +out: + return len; +} + +static struct md_sysfs_entry md_bitmap = +__ATTR(bitmap_set_bits, S_IWUSR, null_show, bitmap_store); static ssize_t size_show(mddev_t *mddev, char *page) @@ -2091,7 +2585,7 @@ size_store(mddev_t *mddev, const char *buf, size_t len) if (mddev->pers) { err = update_size(mddev, size); - md_update_sb(mddev); + md_update_sb(mddev, 1); } else { if (mddev->size == 0 || mddev->size > size) @@ -2103,7 +2597,7 @@ size_store(mddev_t *mddev, const char *buf, size_t len) } static struct md_sysfs_entry md_size = -__ATTR(component_size, 0644, size_show, size_store); +__ATTR(component_size, S_IRUGO|S_IWUSR, size_show, size_store); /* Metdata version. @@ -2139,7 +2633,7 @@ metadata_store(mddev_t *mddev, const char *buf, size_t len) return -EINVAL; buf = e+1; minor = simple_strtoul(buf, &e, 10); - if (e==buf || *e != '\n') + if (e==buf || (*e && *e != '\n') ) return -EINVAL; if (major >= sizeof(super_types)/sizeof(super_types[0]) || super_types[major].name == NULL) @@ -2151,7 +2645,7 @@ metadata_store(mddev_t *mddev, const char *buf, size_t len) } static struct md_sysfs_entry md_metadata = -__ATTR(metadata_version, 0644, metadata_show, metadata_store); +__ATTR(metadata_version, S_IRUGO|S_IWUSR, metadata_show, metadata_store); static ssize_t action_show(mddev_t *mddev, char *page) @@ -2159,7 +2653,9 @@ action_show(mddev_t *mddev, char *page) char *type = "idle"; if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || test_bit(MD_RECOVERY_NEEDED, &mddev->recovery)) { - if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { + if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) + type = "reshape"; + else if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) type = "resync"; else if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) @@ -2190,10 +2686,17 @@ action_store(mddev_t *mddev, const char *page, size_t len) return -EBUSY; else if (cmd_match(page, "resync") || cmd_match(page, "recover")) set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); - else { + else if (cmd_match(page, "reshape")) { + int err; + if (mddev->pers->start_reshape == NULL) + return -EINVAL; + err = mddev->pers->start_reshape(mddev); + if (err) + return err; + } else { if (cmd_match(page, "check")) set_bit(MD_RECOVERY_CHECK, &mddev->recovery); - else if (cmd_match(page, "repair")) + else if (!cmd_match(page, "repair")) return -EINVAL; set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); set_bit(MD_RECOVERY_SYNC, &mddev->recovery); @@ -2210,12 +2713,11 @@ mismatch_cnt_show(mddev_t *mddev, char *page) (unsigned long long) mddev->resync_mismatches); } -static struct md_sysfs_entry -md_scan_mode = __ATTR(sync_action, S_IRUGO|S_IWUSR, action_show, action_store); +static struct md_sysfs_entry md_scan_mode = +__ATTR(sync_action, S_IRUGO|S_IWUSR, action_show, action_store); -static struct md_sysfs_entry -md_mismatches = __ATTR_RO(mismatch_cnt); +static struct md_sysfs_entry md_mismatches = __ATTR_RO(mismatch_cnt); static ssize_t sync_min_show(mddev_t *mddev, char *page) @@ -2274,15 +2776,14 @@ static ssize_t sync_speed_show(mddev_t *mddev, char *page) { unsigned 
long resync, dt, db; - resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active)); + resync = (mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active)); dt = ((jiffies - mddev->resync_mark) / HZ); if (!dt) dt++; db = resync - (mddev->resync_mark_cnt); return sprintf(page, "%ld\n", db/dt/2); /* K/sec */ } -static struct md_sysfs_entry -md_sync_speed = __ATTR_RO(sync_speed); +static struct md_sysfs_entry md_sync_speed = __ATTR_RO(sync_speed); static ssize_t sync_completed_show(mddev_t *mddev, char *page) @@ -2298,16 +2799,76 @@ sync_completed_show(mddev_t *mddev, char *page) return sprintf(page, "%lu / %lu\n", resync, max_blocks); } -static struct md_sysfs_entry -md_sync_completed = __ATTR_RO(sync_completed); +static struct md_sysfs_entry md_sync_completed = __ATTR_RO(sync_completed); + +static ssize_t +suspend_lo_show(mddev_t *mddev, char *page) +{ + return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_lo); +} + +static ssize_t +suspend_lo_store(mddev_t *mddev, const char *buf, size_t len) +{ + char *e; + unsigned long long new = simple_strtoull(buf, &e, 10); + + if (mddev->pers->quiesce == NULL) + return -EINVAL; + if (buf == e || (*e && *e != '\n')) + return -EINVAL; + if (new >= mddev->suspend_hi || + (new > mddev->suspend_lo && new < mddev->suspend_hi)) { + mddev->suspend_lo = new; + mddev->pers->quiesce(mddev, 2); + return len; + } else + return -EINVAL; +} +static struct md_sysfs_entry md_suspend_lo = +__ATTR(suspend_lo, S_IRUGO|S_IWUSR, suspend_lo_show, suspend_lo_store); + + +static ssize_t +suspend_hi_show(mddev_t *mddev, char *page) +{ + return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_hi); +} + +static ssize_t +suspend_hi_store(mddev_t *mddev, const char *buf, size_t len) +{ + char *e; + unsigned long long new = simple_strtoull(buf, &e, 10); + + if (mddev->pers->quiesce == NULL) + return -EINVAL; + if (buf == e || (*e && *e != '\n')) + return -EINVAL; + if ((new <= mddev->suspend_lo && mddev->suspend_lo >= mddev->suspend_hi) || + (new > mddev->suspend_lo && new > mddev->suspend_hi)) { + mddev->suspend_hi = new; + mddev->pers->quiesce(mddev, 1); + mddev->pers->quiesce(mddev, 0); + return len; + } else + return -EINVAL; +} +static struct md_sysfs_entry md_suspend_hi = +__ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store); + static struct attribute *md_default_attrs[] = { &md_level.attr, + &md_layout.attr, &md_raid_disks.attr, &md_chunk_size.attr, &md_size.attr, + &md_resync_start.attr, &md_metadata.attr, &md_new_device.attr, + &md_safe_delay.attr, + &md_array_state.attr, NULL, }; @@ -2318,6 +2879,9 @@ static struct attribute *md_redundancy_attrs[] = { &md_sync_max.attr, &md_sync_speed.attr, &md_sync_completed.attr, + &md_suspend_lo.attr, + &md_suspend_hi.attr, + &md_bitmap.attr, NULL, }; static struct attribute_group md_redundancy_group = { @@ -2335,9 +2899,11 @@ md_attr_show(struct kobject *kobj, struct attribute *attr, char *page) if (!entry->show) return -EIO; - mddev_lock(mddev); - rv = entry->show(mddev, page); - mddev_unlock(mddev); + rv = mddev_lock(mddev); + if (!rv) { + rv = entry->show(mddev, page); + mddev_unlock(mddev); + } return rv; } @@ -2351,9 +2917,13 @@ md_attr_store(struct kobject *kobj, struct attribute *attr, if (!entry->store) return -EIO; - mddev_lock(mddev); - rv = entry->store(mddev, page, length); - mddev_unlock(mddev); + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + rv = mddev_lock(mddev); + if (!rv) { + rv = entry->store(mddev, page, length); + mddev_unlock(mddev); + } return rv; } @@ 
-2377,7 +2947,7 @@ int mdp_major = 0; static struct kobject *md_probe(dev_t dev, int *part, void *data) { - static DECLARE_MUTEX(disks_sem); + static DEFINE_MUTEX(disks_mutex); mddev_t *mddev = mddev_find(dev); struct gendisk *disk; int partitioned = (MAJOR(dev) != MD_MAJOR); @@ -2387,33 +2957,30 @@ static struct kobject *md_probe(dev_t dev, int *part, void *data) if (!mddev) return NULL; - down(&disks_sem); + mutex_lock(&disks_mutex); if (mddev->gendisk) { - up(&disks_sem); + mutex_unlock(&disks_mutex); mddev_put(mddev); return NULL; } disk = alloc_disk(1 << shift); if (!disk) { - up(&disks_sem); + mutex_unlock(&disks_mutex); mddev_put(mddev); return NULL; } disk->major = MAJOR(dev); disk->first_minor = unit << shift; - if (partitioned) { + if (partitioned) sprintf(disk->disk_name, "md_d%d", unit); - sprintf(disk->devfs_name, "md/d%d", unit); - } else { + else sprintf(disk->disk_name, "md%d", unit); - sprintf(disk->devfs_name, "md/%d", unit); - } disk->fops = &md_fops; disk->private_data = mddev; disk->queue = mddev->queue; add_disk(disk); mddev->gendisk = disk; - up(&disks_sem); + mutex_unlock(&disks_mutex); mddev->kobj.parent = &disk->kobj; mddev->kobj.k_name = NULL; snprintf(mddev->kobj.name, KOBJ_NAME_LEN, "%s", "md"); @@ -2422,8 +2989,6 @@ static struct kobject *md_probe(dev_t dev, int *part, void *data) return NULL; } -void md_wakeup_thread(mdk_thread_t *thread); - static void md_safemode_timeout(unsigned long data) { mddev_t *mddev = (mddev_t *) data; @@ -2536,6 +3101,14 @@ static int do_md_run(mddev_t * mddev) mddev->level = pers->level; strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel)); + if (mddev->reshape_position != MaxSector && + pers->start_reshape == NULL) { + /* This personality cannot handle reshaping... */ + mddev->pers = NULL; + module_put(pers->owner); + return -EINVAL; + } + mddev->recovery = 0; mddev->resync_max_sectors = mddev->size << 1; /* may be over-ridden by personality */ mddev->barriers_work = 1; @@ -2569,7 +3142,7 @@ static int do_md_run(mddev_t * mddev) mddev->safemode = 0; mddev->safemode_timer.function = md_safemode_timeout; mddev->safemode_timer.data = (unsigned long) mddev; - mddev->safemode_delay = (20 * HZ)/1000 +1; /* 20 msec delay */ + mddev->safemode_delay = (200 * HZ)/1000 +1; /* 200 msec delay */ mddev->in_sync = 1; ITERATE_RDEV(mddev,rdev,tmp) @@ -2580,10 +3153,9 @@ static int do_md_run(mddev_t * mddev) } set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); - md_wakeup_thread(mddev->thread); - if (mddev->sb_dirty) - md_update_sb(mddev); + if (mddev->flags) + md_update_sb(mddev, 0); set_capacity(disk, mddev->array_size<<1); @@ -2597,8 +3169,40 @@ static int do_md_run(mddev_t * mddev) mddev->queue->queuedata = mddev; mddev->queue->make_request_fn = mddev->pers->make_request; + /* If there is a partially-recovered drive we need to + * start recovery here. 
If we leave it to md_check_recovery, + * it will remove the drives and not do the right thing + */ + if (mddev->degraded && !mddev->sync_thread) { + struct list_head *rtmp; + int spares = 0; + ITERATE_RDEV(mddev,rdev,rtmp) + if (rdev->raid_disk >= 0 && + !test_bit(In_sync, &rdev->flags) && + !test_bit(Faulty, &rdev->flags)) + /* complete an interrupted recovery */ + spares++; + if (spares && mddev->pers->sync_request) { + mddev->recovery = 0; + set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); + mddev->sync_thread = md_register_thread(md_do_sync, + mddev, + "%s_resync"); + if (!mddev->sync_thread) { + printk(KERN_ERR "%s: could not start resync" + " thread...\n", + mdname(mddev)); + /* leave the spares where they are, it shouldn't hurt */ + mddev->recovery = 0; + } + } + } + md_wakeup_thread(mddev->thread); + md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */ + mddev->changed = 1; md_new_event(mddev); + kobject_uevent(&mddev->gendisk->kobj, KOBJ_CHANGE); return 0; } @@ -2630,18 +3234,47 @@ static int restart_array(mddev_t *mddev) */ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); md_wakeup_thread(mddev->thread); + md_wakeup_thread(mddev->sync_thread); err = 0; - } else { - printk(KERN_ERR "md: %s has no personality assigned.\n", - mdname(mddev)); + } else err = -EINVAL; - } out: return err; } -static int do_md_stop(mddev_t * mddev, int ro) +/* similar to deny_write_access, but accounts for our holding a reference + * to the file ourselves */ +static int deny_bitmap_write_access(struct file * file) +{ + struct inode *inode = file->f_mapping->host; + + spin_lock(&inode->i_lock); + if (atomic_read(&inode->i_writecount) > 1) { + spin_unlock(&inode->i_lock); + return -ETXTBSY; + } + atomic_set(&inode->i_writecount, -1); + spin_unlock(&inode->i_lock); + + return 0; +} + +static void restore_bitmap_write_access(struct file *file) +{ + struct inode *inode = file->f_mapping->host; + + spin_lock(&inode->i_lock); + atomic_set(&inode->i_writecount, 1); + spin_unlock(&inode->i_lock); +} + +/* mode: + * 0 - completely stop and dis-assemble array + * 1 - switch to readonly + * 2 - stop but do not disassemble array + */ +static int do_md_stop(mddev_t * mddev, int mode) { int err = 0; struct gendisk *disk = mddev->gendisk; @@ -2653,6 +3286,7 @@ static int do_md_stop(mddev_t * mddev, int ro) } if (mddev->sync_thread) { + set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); set_bit(MD_RECOVERY_INTR, &mddev->recovery); md_unregister_thread(mddev->sync_thread); mddev->sync_thread = NULL; @@ -2662,12 +3296,15 @@ static int do_md_stop(mddev_t * mddev, int ro) invalidate_partition(disk, 0); - if (ro) { + switch(mode) { + case 1: /* readonly */ err = -ENXIO; if (mddev->ro==1) goto out; mddev->ro = 1; - } else { + break; + case 0: /* disassemble */ + case 2: /* stop */ bitmap_flush(mddev); md_super_wait(mddev); if (mddev->ro) @@ -2679,30 +3316,35 @@ static int do_md_stop(mddev_t * mddev, int ro) module_put(mddev->pers->owner); mddev->pers = NULL; + + set_capacity(disk, 0); + mddev->changed = 1; + if (mddev->ro) mddev->ro = 0; } - if (!mddev->in_sync) { + if (!mddev->in_sync || mddev->flags) { /* mark array as shutdown cleanly */ mddev->in_sync = 1; - md_update_sb(mddev); + md_update_sb(mddev, 1); } - if (ro) + if (mode == 1) set_disk_ro(disk, 1); + clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); } /* * Free resources if final stop */ - if (!ro) { + if (mode == 0) { mdk_rdev_t *rdev; struct list_head *tmp; - struct gendisk *disk; + printk(KERN_INFO "md: %s stopped.\n", mdname(mddev)); 
bitmap_destroy(mddev); if (mddev->bitmap_file) { - atomic_set(&mddev->bitmap_file->f_dentry->d_inode->i_writecount, 1); + restore_bitmap_write_access(mddev->bitmap_file); fput(mddev->bitmap_file); mddev->bitmap_file = NULL; } @@ -2718,11 +3360,11 @@ static int do_md_stop(mddev_t * mddev, int ro) export_array(mddev); mddev->array_size = 0; - disk = mddev->gendisk; - if (disk) - set_capacity(disk, 0); - mddev->changed = 1; - } else + mddev->size = 0; + mddev->raid_disks = 0; + mddev->recovery_cp = 0; + + } else if (mddev->pers) printk(KERN_INFO "md: %s switched to read-only mode.\n", mdname(mddev)); err = 0; @@ -2731,6 +3373,7 @@ out: return err; } +#ifndef MODULE static void autorun_array(mddev_t *mddev) { mdk_rdev_t *rdev; @@ -2769,7 +3412,6 @@ static void autorun_array(mddev_t *mddev) */ static void autorun_devices(int part) { - struct list_head candidates; struct list_head *tmp; mdk_rdev_t *rdev0, *rdev; mddev_t *mddev; @@ -2777,7 +3419,9 @@ static void autorun_devices(int part) printk(KERN_INFO "md: autorun ...\n"); while (!list_empty(&pending_raid_disks)) { + int unit; dev_t dev; + LIST_HEAD(candidates); rdev0 = list_entry(pending_raid_disks.next, mdk_rdev_t, same_set); @@ -2795,16 +3439,19 @@ static void autorun_devices(int part) * mostly sane superblocks. It's time to allocate the * mddev. */ - if (rdev0->preferred_minor < 0 || rdev0->preferred_minor >= MAX_MD_DEVS) { + if (part) { + dev = MKDEV(mdp_major, + rdev0->preferred_minor << MdpMinorShift); + unit = MINOR(dev) >> MdpMinorShift; + } else { + dev = MKDEV(MD_MAJOR, rdev0->preferred_minor); + unit = MINOR(dev); + } + if (rdev0->preferred_minor != unit) { printk(KERN_INFO "md: unit number in %s is bad: %d\n", bdevname(rdev0->bdev, b), rdev0->preferred_minor); break; } - if (part) - dev = MKDEV(mdp_major, - rdev0->preferred_minor << MdpMinorShift); - else - dev = MKDEV(MD_MAJOR, rdev0->preferred_minor); md_probe(dev, NULL, NULL); mddev = mddev_find(dev); @@ -2841,6 +3488,7 @@ static void autorun_devices(int part) } printk(KERN_INFO "md: ... autorun DONE.\n"); } +#endif /* !MODULE */ /* * import RAID devices based on one partition @@ -2977,6 +3625,8 @@ static int get_bitmap_file(mddev_t * mddev, void __user * arg) char *ptr, *buf = NULL; int err = -ENOMEM; + md_allow_write(mddev); + file = kmalloc(sizeof(*file), GFP_KERNEL); if (!file) goto out; @@ -3125,9 +3775,21 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info) rdev->raid_disk = -1; err = bind_rdev_to_array(rdev, mddev); + if (!err && !mddev->pers->hot_remove_disk) { + /* If there is hot_add_disk but no hot_remove_disk + * then added disks for geometry changes, + * and should be added immediately. + */ + super_types[mddev->major_version]. 
+ validate_super(mddev, rdev); + err = mddev->pers->hot_add_disk(mddev, rdev); + if (err) + unbind_rdev_from_array(rdev); + } if (err) export_rdev(rdev); + md_update_sb(mddev, 1); set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); md_wakeup_thread(mddev->thread); return err; @@ -3199,7 +3861,7 @@ static int hot_remove_disk(mddev_t * mddev, dev_t dev) goto busy; kick_rdev_from_array(rdev); - md_update_sb(mddev); + md_update_sb(mddev, 1); md_new_event(mddev); return 0; @@ -3258,6 +3920,7 @@ static int hot_add_disk(mddev_t * mddev, dev_t dev) } clear_bit(In_sync, &rdev->flags); rdev->desc_nr = -1; + rdev->saved_raid_disk = -1; err = bind_rdev_to_array(rdev, mddev); if (err) goto abort_export; @@ -3276,7 +3939,7 @@ static int hot_add_disk(mddev_t * mddev, dev_t dev) rdev->raid_disk = -1; - md_update_sb(mddev); + md_update_sb(mddev, 1); /* * Kick recovery, maybe this spare has to be added to the @@ -3295,23 +3958,6 @@ abort_export: return err; } -/* similar to deny_write_access, but accounts for our holding a reference - * to the file ourselves */ -static int deny_bitmap_write_access(struct file * file) -{ - struct inode *inode = file->f_mapping->host; - - spin_lock(&inode->i_lock); - if (atomic_read(&inode->i_writecount) > 1) { - spin_unlock(&inode->i_lock); - return -ETXTBSY; - } - atomic_set(&inode->i_writecount, -1); - spin_unlock(&inode->i_lock); - - return 0; -} - static int set_bitmap_file(mddev_t *mddev, int fd) { int err; @@ -3352,12 +3998,17 @@ static int set_bitmap_file(mddev_t *mddev, int fd) mddev->pers->quiesce(mddev, 1); if (fd >= 0) err = bitmap_create(mddev); - if (fd < 0 || err) + if (fd < 0 || err) { bitmap_destroy(mddev); + fd = -1; /* make sure to put the file */ + } mddev->pers->quiesce(mddev, 0); - } else if (fd < 0) { - if (mddev->bitmap_file) + } + if (fd < 0) { + if (mddev->bitmap_file) { + restore_bitmap_write_access(mddev->bitmap_file); fput(mddev->bitmap_file); + } mddev->bitmap_file = NULL; } @@ -3394,6 +4045,7 @@ static int set_array_info(mddev_t * mddev, mdu_array_info_t *info) mddev->major_version = info->major_version; mddev->minor_version = info->minor_version; mddev->patch_version = info->patch_version; + mddev->persistent = !info->not_persistent; return 0; } mddev->major_version = MD_MAJOR_VERSION; @@ -3419,16 +4071,24 @@ static int set_array_info(mddev_t * mddev, mdu_array_info_t *info) mddev->max_disks = MD_SB_DISKS; - mddev->sb_dirty = 1; + mddev->flags = 0; + set_bit(MD_CHANGE_DEVS, &mddev->flags); mddev->default_bitmap_offset = MD_SB_BYTES >> 9; mddev->bitmap_offset = 0; + mddev->reshape_position = MaxSector; + /* * Generate a 128 bit UUID */ get_random_bytes(mddev->uuid, 16); + mddev->new_level = mddev->level; + mddev->new_chunk = mddev->chunk_size; + mddev->new_layout = mddev->layout; + mddev->delta_disks = 0; + return 0; } @@ -3455,11 +4115,8 @@ static int update_size(mddev_t *mddev, unsigned long size) return -EBUSY; ITERATE_RDEV(mddev,rdev,tmp) { sector_t avail; - if (rdev->sb_offset > rdev->data_offset) - avail = (rdev->sb_offset*2) - rdev->data_offset; - else - avail = get_capacity(rdev->bdev->bd_disk) - - rdev->data_offset; + avail = rdev->size * 2; + if (fit && (size == 0 || size > avail/2)) size = avail/2; if (avail < ((sector_t)size << 1)) @@ -3484,14 +4141,16 @@ static int update_raid_disks(mddev_t *mddev, int raid_disks) { int rv; /* change the number of raid disks */ - if (mddev->pers->reshape == NULL) + if (mddev->pers->check_reshape == NULL) return -EINVAL; if (raid_disks <= 0 || raid_disks >= mddev->max_disks) return -EINVAL; - if 
(mddev->sync_thread) + if (mddev->sync_thread || mddev->reshape_position != MaxSector) return -EBUSY; - rv = mddev->pers->reshape(mddev, raid_disks); + mddev->delta_disks = raid_disks - mddev->raid_disks; + + rv = mddev->pers->check_reshape(mddev); return rv; } @@ -3579,7 +4238,7 @@ static int update_array_info(mddev_t *mddev, mdu_array_info_t *info) mddev->bitmap_offset = 0; } } - md_update_sb(mddev); + md_update_sb(mddev, 1); return rv; } @@ -3662,7 +4321,7 @@ static int md_ioctl(struct inode *inode, struct file *file, if (cnt > 0 ) { printk(KERN_WARNING "md: %s(pid %d) used deprecated START_ARRAY ioctl. " - "START_ARRAY is removed in kernel 2.6.19 and above.\n", + "This will not be supported beyond July 2006\n", current->comm, current->pid); cnt--; } @@ -3732,9 +4391,10 @@ static int md_ioctl(struct inode *inode, struct file *file, * Commands querying/configuring an existing array: */ /* if we are not initialised yet, only ADD_NEW_DISK, STOP_ARRAY, - * RUN_ARRAY, and SET_BITMAP_FILE are allowed */ + * RUN_ARRAY, and GET_ and SET_BITMAP_FILE are allowed */ if (!mddev->raid_disks && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY - && cmd != RUN_ARRAY && cmd != SET_BITMAP_FILE) { + && cmd != RUN_ARRAY && cmd != SET_BITMAP_FILE + && cmd != GET_BITMAP_FILE) { err = -ENODEV; goto abort_unlock; } @@ -3829,11 +4489,6 @@ static int md_ioctl(struct inode *inode, struct file *file, goto done_unlock; default: - if (_IOC_TYPE(cmd) == MD_MAJOR) - printk(KERN_WARNING "md: %s(pid %d) used" - " obsolete MD ioctl, upgrade your" - " software to use new ictls.\n", - current->comm, current->pid); err = -EINVAL; goto abort_unlock; } @@ -3859,7 +4514,7 @@ static int md_open(struct inode *inode, struct file *file) mddev_t *mddev = inode->i_bdev->bd_disk->private_data; int err; - if ((err = mddev_lock(mddev))) + if ((err = mutex_lock_interruptible_nested(&mddev->reconfig_mutex, 1))) goto out; err = 0; @@ -3875,8 +4530,7 @@ static int md_release(struct inode *inode, struct file * file) { mddev_t *mddev = inode->i_bdev->bd_disk->private_data; - if (!mddev) - BUG(); + BUG_ON(!mddev); mddev_put(mddev); return 0; @@ -3923,6 +4577,7 @@ static int md_thread(void * arg) * many dirty RAID5 blocks. 
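Further down in this patch, status_resync() is reworked to use sector_t and to pick a shift ("scale", at least 10) so that (resync>>scale)*1000 still fits in 64 bits while the divisor fits in the u32 that sector_div() requires. A standalone sketch of that computation, assuming max_blocks stays small enough for the scaled divisor to fit in 32 bits:

    #include <assert.h>
    #include <stdint.h>

    static unsigned per_milli(uint64_t resync, uint64_t max_blocks)
    {
            int scale = 10;        /* >= 10 so res * 1000 cannot overflow */
            while (max_blocks / 2 > (1ULL << (scale + 32)))
                    scale++;       /* mirrors the loop in the patch */
            uint64_t res = (resync >> scale) * 1000;
            uint32_t div = (uint32_t)((max_blocks >> scale) + 1);
            return (unsigned)(res / div);
    }

    int main(void)
    {
            /* half-way through 2^40 sectors -> just under 500 per-mille */
            assert(per_milli(1ULL << 39, 1ULL << 40) == 499);
            return 0;
    }

Scaling both operands by the same power of two keeps the ratio (to within rounding) while bringing the division into the range the kernel's 64/32-bit divide helper can handle.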
*/ + current->flags |= PF_NOFREEZE; allow_signal(SIGKILL); while (!kthread_should_stop()) { @@ -3939,7 +4594,6 @@ static int md_thread(void * arg) test_bit(THREAD_WAKEUP, &thread->flags) || kthread_should_stop(), thread->timeout); - try_to_freeze(); clear_bit(THREAD_WAKEUP, &thread->flags); @@ -4004,13 +4658,15 @@ void md_error(mddev_t *mddev, mdk_rdev_t *rdev) __builtin_return_address(0),__builtin_return_address(1), __builtin_return_address(2),__builtin_return_address(3)); */ + if (!mddev->pers) + return; if (!mddev->pers->error_handler) return; mddev->pers->error_handler(mddev,rdev); set_bit(MD_RECOVERY_INTR, &mddev->recovery); set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); md_wakeup_thread(mddev->thread); - md_new_event(mddev); + md_new_event_inintr(mddev); } /* seq_file implementation /proc/mdstat */ @@ -4038,7 +4694,10 @@ static void status_unused(struct seq_file *seq) static void status_resync(struct seq_file *seq, mddev_t * mddev) { - unsigned long max_blocks, resync, res, dt, db, rt; + sector_t max_blocks, resync, res; + unsigned long dt, db, rt; + int scale; + unsigned int per_milli; resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active))/2; @@ -4054,9 +4713,22 @@ static void status_resync(struct seq_file *seq, mddev_t * mddev) MD_BUG(); return; } - res = (resync/1024)*1000/(max_blocks/1024 + 1); + /* Pick 'scale' such that (resync>>scale)*1000 will fit + * in a sector_t, and (max_blocks>>scale) will fit in a + * u32, as those are the requirements for sector_div. + * Thus 'scale' must be at least 10 + */ + scale = 10; + if (sizeof(sector_t) > sizeof(unsigned long)) { + while ( max_blocks/2 > (1ULL<<(scale+32))) + scale++; + } + res = (resync>>scale)*1000; + sector_div(res, (u32)((max_blocks>>scale)+1)); + + per_milli = res; { - int i, x = res/50, y = 20-x; + int i, x = per_milli/50, y = 20-x; seq_printf(seq, "["); for (i = 0; i < x; i++) seq_printf(seq, "="); @@ -4065,10 +4737,16 @@ static void status_resync(struct seq_file *seq, mddev_t * mddev) seq_printf(seq, "."); seq_printf(seq, "] "); } - seq_printf(seq, " %s =%3lu.%lu%% (%lu/%lu)", - (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ? - "resync" : "recovery"), - res/10, res % 10, resync, max_blocks); + seq_printf(seq, " %s =%3u.%u%% (%llu/%llu)", + (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)? + "reshape" : + (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)? + "check" : + (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ? + "resync" : "recovery"))), + per_milli/10, per_milli % 10, + (unsigned long long) resync, + (unsigned long long) max_blocks); /* * We do not want to overflow, so the order of operands and @@ -4081,12 +4759,13 @@ static void status_resync(struct seq_file *seq, mddev_t * mddev) */ dt = ((jiffies - mddev->resync_mark) / HZ); if (!dt) dt++; - db = resync - (mddev->resync_mark_cnt/2); - rt = (dt * ((max_blocks-resync) / (db/100+1)))/100; + db = (mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active)) + - mddev->resync_mark_cnt; + rt = (dt * ((unsigned long)(max_blocks-resync) / (db/2/100+1)))/100; seq_printf(seq, " finish=%lu.%lumin", rt / 60, (rt % 60)/6); - seq_printf(seq, " speed=%ldK/sec", db/dt); + seq_printf(seq, " speed=%ldK/sec", db/2/dt); } static void *md_seq_start(struct seq_file *seq, loff_t *pos) @@ -4181,8 +4860,9 @@ static int md_seq_show(struct seq_file *seq, void *v) return 0; } - if (mddev_lock(mddev)!=0) + if (mddev_lock(mddev) < 0) return -EINTR; + if (mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) { seq_printf(seq, "%s : %sactive", mdname(mddev), mddev->pers ? 
"" : "in"); @@ -4257,8 +4937,8 @@ static int md_seq_show(struct seq_file *seq, void *v) chunk_kb ? "KB" : "B"); if (bitmap->file) { seq_printf(seq, ", file: "); - seq_path(seq, bitmap->file->f_vfsmnt, - bitmap->file->f_dentry," \t\n"); + seq_path(seq, bitmap->file->f_path.mnt, + bitmap->file->f_path.dentry," \t\n"); } seq_printf(seq, "\n"); @@ -4323,6 +5003,7 @@ static unsigned int mdstat_poll(struct file *filp, poll_table *wait) } static struct file_operations md_seq_fops = { + .owner = THIS_MODULE, .open = md_seq_open, .read = seq_read, .llseek = seq_lseek, @@ -4417,12 +5098,12 @@ void md_write_start(mddev_t *mddev, struct bio *bi) spin_lock_irq(&mddev->write_lock); if (mddev->in_sync) { mddev->in_sync = 0; - mddev->sb_dirty = 1; + set_bit(MD_CHANGE_CLEAN, &mddev->flags); md_wakeup_thread(mddev->thread); } spin_unlock_irq(&mddev->write_lock); } - wait_event(mddev->sb_wait, mddev->sb_dirty==0); + wait_event(mddev->sb_wait, mddev->flags==0); } void md_write_end(mddev_t *mddev) @@ -4430,16 +5111,43 @@ void md_write_end(mddev_t *mddev) if (atomic_dec_and_test(&mddev->writes_pending)) { if (mddev->safemode == 2) md_wakeup_thread(mddev->thread); - else + else if (mddev->safemode_delay) mod_timer(&mddev->safemode_timer, jiffies + mddev->safemode_delay); } } +/* md_allow_write(mddev) + * Calling this ensures that the array is marked 'active' so that writes + * may proceed without blocking. It is important to call this before + * attempting a GFP_KERNEL allocation while holding the mddev lock. + * Must be called with mddev_lock held. + */ +void md_allow_write(mddev_t *mddev) +{ + if (!mddev->pers) + return; + if (mddev->ro) + return; + + spin_lock_irq(&mddev->write_lock); + if (mddev->in_sync) { + mddev->in_sync = 0; + set_bit(MD_CHANGE_CLEAN, &mddev->flags); + if (mddev->safemode_delay && + mddev->safemode == 0) + mddev->safemode = 1; + spin_unlock_irq(&mddev->write_lock); + md_update_sb(mddev, 0); + } else + spin_unlock_irq(&mddev->write_lock); +} +EXPORT_SYMBOL_GPL(md_allow_write); + static DECLARE_WAIT_QUEUE_HEAD(resync_wait); #define SYNC_MARKS 10 #define SYNC_MARK_STEP (3*HZ) -static void md_do_sync(mddev_t *mddev) +void md_do_sync(mddev_t *mddev) { mddev_t *mddev2; unsigned int currspeed = 0, @@ -4451,10 +5159,27 @@ static void md_do_sync(mddev_t *mddev) struct list_head *tmp; sector_t last_check; int skipped = 0; + struct list_head *rtmp; + mdk_rdev_t *rdev; + char *desc; /* just incase thread restarts... */ if (test_bit(MD_RECOVERY_DONE, &mddev->recovery)) return; + if (mddev->ro) /* never try to sync a read-only array */ + return; + + if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { + if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) + desc = "data-check"; + else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) + desc = "requested-resync"; + else + desc = "resync"; + } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) + desc = "reshape"; + else + desc = "recovery"; /* we overload curr_resync somewhat here. 
 static DECLARE_WAIT_QUEUE_HEAD(resync_wait);
 
 #define SYNC_MARKS	10
 #define	SYNC_MARK_STEP	(3*HZ)
-static void md_do_sync(mddev_t *mddev)
+void md_do_sync(mddev_t *mddev)
 {
 	mddev_t *mddev2;
 	unsigned int currspeed = 0,
@@ -4451,10 +5159,27 @@ static void md_do_sync(mddev_t *mddev)
 	struct list_head *tmp;
 	sector_t last_check;
 	int skipped = 0;
+	struct list_head *rtmp;
+	mdk_rdev_t *rdev;
+	char *desc;
 
 	/* just incase thread restarts... */
 	if (test_bit(MD_RECOVERY_DONE, &mddev->recovery))
 		return;
+	if (mddev->ro) /* never try to sync a read-only array */
+		return;
+
+	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
+		if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery))
+			desc = "data-check";
+		else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
+			desc = "requested-resync";
+		else
+			desc = "resync";
+	} else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
+		desc = "reshape";
+	else
+		desc = "recovery";
 
 	/* we overload curr_resync somewhat here.
 	 * 0 == not engaged in resync at all
@@ -4499,10 +5224,10 @@ static void md_do_sync(mddev_t *mddev)
 			prepare_to_wait(&resync_wait, &wq, TASK_UNINTERRUPTIBLE);
 			if (!kthread_should_stop() &&
 			    mddev2->curr_resync >= mddev->curr_resync) {
-				printk(KERN_INFO "md: delaying resync of %s"
-					" until %s has finished resync (they"
+				printk(KERN_INFO "md: delaying %s of %s"
+					" until %s has finished (they"
 					" share one or more physical units)\n",
-				       mdname(mddev), mdname(mddev2));
+				       desc, mdname(mddev), mdname(mddev2));
 				mddev_put(mddev2);
 				schedule();
 				finish_wait(&resync_wait, &wq);
@@ -4513,30 +5238,40 @@ static void md_do_sync(mddev_t *mddev)
 		}
 	} while (mddev->curr_resync < 2);
 
+	j = 0;
 	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
 		/* resync follows the size requested by the personality,
 		 * which defaults to physical size, but can be virtual size
 		 */
 		max_sectors = mddev->resync_max_sectors;
 		mddev->resync_mismatches = 0;
-	} else
+		/* we don't use the checkpoint if there's a bitmap */
+		if (!mddev->bitmap &&
+		    !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
+			j = mddev->recovery_cp;
+	} else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
+		max_sectors = mddev->size << 1;
+	else {
 		/* recovery follows the physical size of devices */
 		max_sectors = mddev->size << 1;
+		j = MaxSector;
+		ITERATE_RDEV(mddev,rdev,rtmp)
+			if (rdev->raid_disk >= 0 &&
+			    !test_bit(Faulty, &rdev->flags) &&
+			    !test_bit(In_sync, &rdev->flags) &&
+			    rdev->recovery_offset < j)
+				j = rdev->recovery_offset;
+	}
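
/*
 * [Editor's illustration -- not part of the patch above.]  Userspace
 * sketch of the resume-point selection just added for plain recovery:
 * the restart sector 'j' is the smallest recovery_offset of any device
 * that is neither faulty nor fully in sync, so no partially-rebuilt
 * device is skipped.  The struct, names and values are invented, and
 * the rdev->raid_disk >= 0 check is omitted for brevity.
 */
#include <stdint.h>
#include <stdio.h>

#define MAX_SECTOR UINT64_MAX		/* stands in for MaxSector */

struct dev { int faulty, in_sync; uint64_t recovery_offset; };

static uint64_t resume_point(const struct dev *devs, int n)
{
	uint64_t j = MAX_SECTOR;
	int i;

	for (i = 0; i < n; i++)		/* mirrors the ITERATE_RDEV scan */
		if (!devs[i].faulty && !devs[i].in_sync &&
		    devs[i].recovery_offset < j)
			j = devs[i].recovery_offset;
	return j;
}

int main(void)
{
	struct dev devs[] = {
		{ 0, 1, MAX_SECTOR },	/* healthy, fully in sync */
		{ 0, 0, 4096 },		/* rebuild reached sector 4096 */
		{ 0, 0, 1024 },		/* rebuild reached sector 1024 */
	};
	/* recovery must restart at 1024, the least-recovered device */
	printf("%llu\n", (unsigned long long)resume_point(devs, 3));
	return 0;
}
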
 
-	printk(KERN_INFO "md: syncing RAID array %s\n", mdname(mddev));
-	printk(KERN_INFO "md: minimum _guaranteed_ reconstruction speed:"
-	       " %d KB/sec/disc.\n", speed_min(mddev));
+	printk(KERN_INFO "md: %s of RAID array %s\n", desc, mdname(mddev));
+	printk(KERN_INFO "md: minimum _guaranteed_ speed:"
+	       " %d KB/sec/disk.\n", speed_min(mddev));
 	printk(KERN_INFO "md: using maximum available idle IO bandwidth "
-	       "(but not more than %d KB/sec) for reconstruction.\n",
-	       speed_max(mddev));
+	       "(but not more than %d KB/sec) for %s.\n",
+	       speed_max(mddev), desc);
 
 	is_mddev_idle(mddev); /* this also initializes IO event counters */
-	/* we don't use the checkpoint if there's a bitmap */
-	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && !mddev->bitmap
-	    && ! test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
-		j = mddev->recovery_cp;
-	else
-		j = 0;
+
 	io_sectors = 0;
 	for (m = 0; m < SYNC_MARKS; m++) {
 		mark[m] = jiffies;
@@ -4559,8 +5294,8 @@ static void md_do_sync(mddev_t *mddev)
 
 	if (j>2) {
 		printk(KERN_INFO 
-			"md: resuming recovery of %s from checkpoint.\n",
-			mdname(mddev));
+			"md: resuming %s of %s from checkpoint.\n",
+			desc, mdname(mddev));
 		mddev->curr_resync = j;
 	}
 
@@ -4582,6 +5317,7 @@ static void md_do_sync(mddev_t *mddev)
 		j += sectors;
 		if (j>1) mddev->curr_resync = j;
+		mddev->curr_mark_cnt = io_sectors;
 		if (last_check == 0)
 			/* this is the earliers that rebuilt will be
 			 * visible in /proc/mdstat
@@ -4642,7 +5378,7 @@ static void md_do_sync(mddev_t *mddev)
 			}
 		}
 	}
-	printk(KERN_INFO "md: %s: sync done.\n",mdname(mddev));
+	printk(KERN_INFO "md: %s: %s done.\n",mdname(mddev), desc);
 	/*
 	 * this also signals 'finished resyncing' to md_stop
 	 */
@@ -4655,16 +5391,30 @@ static void md_do_sync(mddev_t *mddev)
 	mddev->pers->sync_request(mddev, max_sectors, &skipped, 1);
 
 	if (!test_bit(MD_RECOVERY_ERR, &mddev->recovery) &&
-	    mddev->curr_resync > 2 &&
-	    mddev->curr_resync >= mddev->recovery_cp) {
-		if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
-			printk(KERN_INFO
-				"md: checkpointing recovery of %s.\n",
-				mdname(mddev));
-			mddev->recovery_cp = mddev->curr_resync;
-		} else
-			mddev->recovery_cp = MaxSector;
+	    !test_bit(MD_RECOVERY_CHECK, &mddev->recovery) &&
+	    mddev->curr_resync > 2) {
+		if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
+			if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
+				if (mddev->curr_resync >= mddev->recovery_cp) {
+					printk(KERN_INFO
+					       "md: checkpointing %s of %s.\n",
+					       desc, mdname(mddev));
+					mddev->recovery_cp = mddev->curr_resync;
+				}
+			} else
+				mddev->recovery_cp = MaxSector;
+		} else {
+			if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery))
+				mddev->curr_resync = MaxSector;
+			ITERATE_RDEV(mddev,rdev,rtmp)
+				if (rdev->raid_disk >= 0 &&
+				    !test_bit(Faulty, &rdev->flags) &&
+				    !test_bit(In_sync, &rdev->flags) &&
+				    rdev->recovery_offset < mddev->curr_resync)
+					rdev->recovery_offset = mddev->curr_resync;
+		}
 	}
+	set_bit(MD_CHANGE_DEVS, &mddev->flags);
 
  skip:
 	mddev->curr_resync = 0;
@@ -4672,6 +5422,7 @@ static void md_do_sync(mddev_t *mddev)
 	set_bit(MD_RECOVERY_DONE, &mddev->recovery);
 	md_wakeup_thread(mddev->thread);
 }
+EXPORT_SYMBOL_GPL(md_do_sync);
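
/*
 * [Editor's illustration -- not part of the patch above.]  Userspace
 * sketch of the checkpoint write-side just added at the end of
 * md_do_sync(): an interrupted recovery records the sector it reached
 * in each still-rebuilding device's recovery_offset (a completed run
 * records MaxSector instead), which is exactly what the resume-point
 * scan shown earlier reads back.  Struct, names and values are invented.
 */
#include <stdint.h>
#include <stdio.h>

#define MAX_SECTOR UINT64_MAX			/* stands in for MaxSector */

struct dev { int faulty, in_sync; uint64_t recovery_offset; };

static void checkpoint_recovery(struct dev *devs, int n,
				uint64_t curr_resync, int interrupted)
{
	int i;

	if (!interrupted)	/* ran to completion: mark fully recovered */
		curr_resync = MAX_SECTOR;
	for (i = 0; i < n; i++)	/* mirrors the ITERATE_RDEV update loop */
		if (!devs[i].faulty && !devs[i].in_sync &&
		    devs[i].recovery_offset < curr_resync)
			devs[i].recovery_offset = curr_resync;
}

int main(void)
{
	struct dev devs[] = { { 0, 0, 0 }, { 0, 1, MAX_SECTOR } };

	checkpoint_recovery(devs, 2, 8192, 1);	/* interrupted at 8192 */
	/* prints 8192: the spare being rebuilt remembers its progress */
	printf("%llu\n", (unsigned long long)devs[0].recovery_offset);
	return 0;
}
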
 
 
 /*
@@ -4718,7 +5469,7 @@ void md_check_recovery(mddev_t *mddev)
 	}
 
 	if ( ! (
-		mddev->sb_dirty ||
+		mddev->flags ||
 		test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) ||
 		test_bit(MD_RECOVERY_DONE, &mddev->recovery) ||
 		(mddev->safemode == 1) ||
@@ -4727,21 +5478,21 @@ void md_check_recovery(mddev_t *mddev)
 		))
 		return;
 
-	if (mddev_trylock(mddev)==0) {
+	if (mddev_trylock(mddev)) {
 		int spares =0;
 
 		spin_lock_irq(&mddev->write_lock);
 		if (mddev->safemode && !atomic_read(&mddev->writes_pending) &&
 		    !mddev->in_sync && mddev->recovery_cp == MaxSector) {
 			mddev->in_sync = 1;
-			mddev->sb_dirty = 1;
+			set_bit(MD_CHANGE_CLEAN, &mddev->flags);
 		}
 		if (mddev->safemode == 1)
 			mddev->safemode = 0;
 		spin_unlock_irq(&mddev->write_lock);
 
-		if (mddev->sb_dirty)
-			md_update_sb(mddev);
+		if (mddev->flags)
+			md_update_sb(mddev, 0);
 
 
 		if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
@@ -4760,7 +5511,7 @@ void md_check_recovery(mddev_t *mddev)
 				/* activate any spares */
 				mddev->pers->spare_active(mddev);
 			}
-			md_update_sb(mddev);
+			md_update_sb(mddev, 1);
 
 			/* if array is no-longer degraded, then any saved_raid_disk
 			 * information must be scrapped
@@ -4783,6 +5534,8 @@ void md_check_recovery(mddev_t *mddev)
 		clear_bit(MD_RECOVERY_INTR, &mddev->recovery);
 		clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
 
+		if (test_bit(MD_RECOVERY_FROZEN, &mddev->recovery))
+			goto unlock;
 		/* no recovery is running.
 		 * remove any failed drives, then
 		 * add spares if possible.
@@ -4805,6 +5558,7 @@ void md_check_recovery(mddev_t *mddev)
 		ITERATE_RDEV(mddev,rdev,rtmp)
 			if (rdev->raid_disk < 0
 			    && !test_bit(Faulty, &rdev->flags)) {
+				rdev->recovery_offset = 0;
 				if (mddev->pers->hot_add_disk(mddev,rdev)) {
 					char nm[20];
 					sprintf(nm, "rd%d", rdev->raid_disk);
@@ -4863,8 +5617,10 @@ static int md_notify_reboot(struct notifier_block *this,
 		printk(KERN_INFO "md: stopping all md devices.\n");
 
 		ITERATE_MDDEV(mddev,tmp)
-			if (mddev_trylock(mddev)==0)
+			if (mddev_trylock(mddev)) {
 				do_md_stop (mddev, 1);
+				mddev_unlock(mddev);
+			}
 		/*
 		 * certain more exotic SCSI devices are known to be
 		 * volatile wrt too early system reboots. While the
@@ -4895,37 +5651,16 @@ static void md_geninit(void)
 
 static int __init md_init(void)
 {
-	int minor;
-
-	printk(KERN_INFO "md: md driver %d.%d.%d MAX_MD_DEVS=%d,"
-			" MD_SB_DISKS=%d\n",
-			MD_MAJOR_VERSION, MD_MINOR_VERSION,
-			MD_PATCHLEVEL_VERSION, MAX_MD_DEVS, MD_SB_DISKS);
-	printk(KERN_INFO "md: bitmap version %d.%d\n", BITMAP_MAJOR_HI,
-			BITMAP_MINOR);
-
 	if (register_blkdev(MAJOR_NR, "md"))
 		return -1;
 	if ((mdp_major=register_blkdev(0, "mdp"))<=0) {
 		unregister_blkdev(MAJOR_NR, "md");
		return -1;
 	}
-	devfs_mk_dir("md");
-	blk_register_region(MKDEV(MAJOR_NR, 0), MAX_MD_DEVS, THIS_MODULE,
-				md_probe, NULL, NULL);
-	blk_register_region(MKDEV(mdp_major, 0), MAX_MD_DEVS<