tmp = tmp->next;}) \
)
+/*
+ * Flush the write cache of every component device of @mddev by
+ * invoking each member queue's issue_flush_fn.
+ * Returns 0 if all flushes succeed, otherwise the first error seen
+ * (-EOPNOTSUPP when a member queue has no issue_flush_fn).
+ */
+int md_flush_mddev(mddev_t *mddev, sector_t *error_sector)
+{
+ struct list_head *tmp;
+ mdk_rdev_t *rdev;
+ int ret = 0;
+
+ /*
+ * this list iteration is done without any locking in md?!
+ */
+ ITERATE_RDEV(mddev, rdev, tmp) {
+ request_queue_t *r_queue = bdev_get_queue(rdev->bdev);
+ int err;
+
+ if (!r_queue->issue_flush_fn)
+ err = -EOPNOTSUPP;
+ else
+ err = r_queue->issue_flush_fn(r_queue, rdev->bdev->bd_disk, error_sector);
+
+ /* remember the first failure, but keep flushing the rest */
+ if (!ret)
+ ret = err;
+ }
+
+ return ret;
+}
+
+/*
+ * issue_flush_fn hook installed on the md request queue: forwards the
+ * cache flush to every member device of the array behind @q.
+ */
+static int md_flush_all(request_queue_t *q, struct gendisk *disk,
+ sector_t *error_sector)
+{
+ mddev_t *mddev = q->queuedata;
+
+ return md_flush_mddev(mddev, error_sector);
+}
+
static int md_fail_request (request_queue_t *q, struct bio *bio)
{
bio_io_error(bio, bio->bi_size);
return 0;
fail:
- printk(KERN_ERR "md: disabled device %s, could not read superblock.\n",
+ printk(KERN_WARNING "md: disabled device %s, could not read superblock.\n",
bdevname(rdev->bdev,b));
return -EINVAL;
}
return csum;
}
+/* csum_partial is not consistent between different architectures.
+ * Some (i386) do a 32bit csum. Some (alpha) do 16 bit.
+ * This makes it hard for user-space to know what to do.
+ * So we use calc_sb_csum to set the checksum to allow working
+ * with older kernels, but allow calc_sb_csum_common to
+ * be used when checking if a checksum is correct, to
+ * make life easier for user-space tools that might write
+ * a superblock.
+ */
+static unsigned int calc_sb_csum_common(mdp_super_t *super)
+{
+ unsigned int disk_csum = super->sb_csum;
+ unsigned long long newcsum = 0;
+ unsigned int csum;
+ int i;
+ unsigned int *superc = (int*) super;
+ /* the checksum is computed with the sb_csum field itself zeroed */
+ super->sb_csum = 0;
+
+ /* sum the superblock as 32bit words into a 64bit accumulator */
+ for (i=0; i<MD_SB_BYTES/4; i++)
+ newcsum+= superc[i];
+ /* fold the carry bits back in for an end-around-carry 32bit sum */
+ csum = (newcsum& 0xffffffff) + (newcsum>>32);
+ /* restore the on-disk value so the caller's buffer is unchanged */
+ super->sb_csum = disk_csum;
+ return csum;
+}
+
/*
* Handle superblock details.
* We want to be able to handle multiple superblock formats
if (sb->raid_disks <= 0)
goto abort;
- if (calc_sb_csum(sb) != sb->sb_csum) {
+ if (calc_sb_csum(sb) != sb->sb_csum &&
+ calc_sb_csum_common(sb) != sb->sb_csum) {
printk(KERN_WARNING "md: invalid superblock checksum on %s\n",
b);
goto abort;
static unsigned int calc_sb_1_csum(struct mdp_superblock_1 * sb)
{
unsigned int disk_csum, csum;
+ unsigned long long newcsum;
int size = 256 + sb->max_dev*2;
+ unsigned int *isuper = (unsigned int*)sb;
+ int i;
disk_csum = sb->sb_csum;
sb->sb_csum = 0;
- csum = csum_partial((void *)sb, size, 0);
+ /* Sum the superblock as little-endian 32bit words (plus a final
+ * 16bit word when size is not a multiple of 4) so the result is
+ * identical on all architectures, unlike csum_partial.
+ */
+ newcsum = 0;
+ for (i=0; size>=4; size -= 4 )
+ newcsum += le32_to_cpu(*isuper++);
+
+ if (size == 2)
+ newcsum += le16_to_cpu(*(unsigned short*) isuper);
+
+ /* fold the carry bits back in for an end-around-carry 32bit sum */
+ csum = (newcsum & 0xffffffff) + (newcsum >> 32);
sb->sb_csum = disk_csum;
return csum;
}
/*
* prevent the device from being mounted, repartitioned or
* otherwise reused by a RAID array (or any other kernel
- * subsystem), by opening the device. [simply getting an
- * inode is not enough, the SCSI module usage code needs
- * an explicit open() on the device]
+ * subsystem), by bd_claiming the device.
*/
static int lock_rdev(mdk_rdev_t *rdev, dev_t dev)
{
int err = 0;
struct block_device *bdev;
+ char b[BDEVNAME_SIZE];
bdev = open_by_devnum(dev, FMODE_READ|FMODE_WRITE);
- if (IS_ERR(bdev))
+ if (IS_ERR(bdev)) {
+ printk(KERN_ERR "md: could not open %s.\n",
+ __bdevname(dev, b));
return PTR_ERR(bdev);
+ }
err = bd_claim(bdev, rdev);
if (err) {
+ printk(KERN_ERR "md: could not bd_claim %s.\n",
+ bdevname(bdev, b));
blkdev_put(bdev);
return err;
}
static void print_desc(mdp_disk_t *desc)
{
- char b[BDEVNAME_SIZE];
-
- printk(" DISK<N:%d,%s(%d,%d),R:%d,S:%d>\n", desc->number,
- __bdevname(MKDEV(desc->major, desc->minor), b),
+ /* only the (major,minor) pair is printed now; the __bdevname
+ * lookup of the device name has been dropped.
+ */
+ printk(" DISK<N:%d,(%d,%d),R:%d,S:%d>\n", desc->number,
desc->major,desc->minor,desc->raid_disk,desc->state);
}
rdev = (mdk_rdev_t *) kmalloc(sizeof(*rdev), GFP_KERNEL);
if (!rdev) {
- printk(KERN_ERR "md: could not alloc mem for %s!\n",
- __bdevname(newdev, b));
+ printk(KERN_ERR "md: could not alloc mem for new device!\n");
return ERR_PTR(-ENOMEM);
}
memset(rdev, 0, sizeof(*rdev));
goto abort_free;
err = lock_rdev(rdev, newdev);
- if (err) {
- printk(KERN_ERR "md: could not lock %s.\n",
- __bdevname(newdev, b));
+ if (err)
goto abort_free;
- }
+
rdev->desc_nr = -1;
rdev->faulty = 0;
rdev->in_sync = 0;
spin_lock(&pers_lock);
if (!pers[pnum] || !try_module_get(pers[pnum]->owner)) {
spin_unlock(&pers_lock);
- printk(KERN_ERR "md: personality %d is not loaded!\n",
+ printk(KERN_WARNING "md: personality %d is not loaded!\n",
pnum);
return -EINVAL;
}
mddev->pers = pers[pnum];
spin_unlock(&pers_lock);
+ mddev->resync_max_sectors = mddev->size << 1; /* may be over-ridden by personality */
+
err = mddev->pers->run(mddev);
if (err) {
printk(KERN_ERR "md: pers->run() failed ...\n");
*/
mddev->queue->queuedata = mddev;
mddev->queue->make_request_fn = mddev->pers->make_request;
+ mddev->queue->issue_flush_fn = md_flush_all;
mddev->changed = 1;
return 0;
mdk_rdev_t *start_rdev = NULL, *rdev;
start_rdev = md_import_device(startdev, 0, 0);
- if (IS_ERR(start_rdev)) {
- printk(KERN_WARNING "md: could not import %s!\n",
- __bdevname(startdev, b));
+ if (IS_ERR(start_rdev))
return err;
- }
+
/* NOTE: this can only work for 0.90.0 superblocks */
sb = (mdp_super_t*)page_address(start_rdev->sb_page);
if (MAJOR(dev) != desc->major || MINOR(dev) != desc->minor)
continue;
rdev = md_import_device(dev, 0, 0);
- if (IS_ERR(rdev)) {
- printk(KERN_WARNING "md: could not import %s,"
- " trying to run array nevertheless.\n",
- __bdevname(dev, b));
+ if (IS_ERR(rdev))
continue;
- }
+
list_add(&rdev->same_set, &pending_raid_disks);
}
}
-static int get_version(void * arg)
+static int get_version(void __user * arg)
{
mdu_version_t ver;
return 0;
}
-static int get_array_info(mddev_t * mddev, void * arg)
+static int get_array_info(mddev_t * mddev, void __user * arg)
{
mdu_array_info_t info;
int nr,working,active,failed,spare;
return 0;
}
-static int get_disk_info(mddev_t * mddev, void * arg)
+static int get_disk_info(mddev_t * mddev, void __user * arg)
{
mdu_disk_info_t info;
unsigned int nr;
return 0;
}
-static int hot_generate_error(mddev_t * mddev, dev_t dev)
-{
- char b[BDEVNAME_SIZE];
- struct request_queue *q;
- mdk_rdev_t *rdev;
-
- if (!mddev->pers)
- return -ENODEV;
-
- printk(KERN_INFO "md: trying to generate %s error in %s ... \n",
- __bdevname(dev, b), mdname(mddev));
-
- rdev = find_rdev(mddev, dev);
- if (!rdev) {
- /* MD_BUG(); */ /* like hell - it's not a driver bug */
- return -ENXIO;
- }
-
- if (rdev->desc_nr == -1) {
- MD_BUG();
- return -EINVAL;
- }
- if (!rdev->in_sync)
- return -ENODEV;
-
- q = bdev_get_queue(rdev->bdev);
- if (!q) {
- MD_BUG();
- return -ENODEV;
- }
- printk(KERN_INFO "md: okay, generating error!\n");
-// q->oneshot_error = 1; // disabled for now
-
- return 0;
-}
-
static int hot_remove_disk(mddev_t * mddev, dev_t dev)
{
char b[BDEVNAME_SIZE];
if (!mddev->pers)
return -ENODEV;
- printk(KERN_INFO "md: trying to remove %s from %s ... \n",
- __bdevname(dev, b), mdname(mddev));
-
rdev = find_rdev(mddev, dev);
if (!rdev)
return -ENXIO;
if (!mddev->pers)
return -ENODEV;
- printk(KERN_INFO "md: trying to hot-add %s to %s ... \n",
- __bdevname(dev, b), mdname(mddev));
-
if (mddev->major_version != 0) {
printk(KERN_WARNING "%s: HOT_ADD may only be used with"
" version-0 superblocks.\n",
return -EINVAL;
}
- rdev->sb_offset = calc_dev_sboffset(rdev->bdev);
+ if (mddev->persistent)
+ rdev->sb_offset = calc_dev_sboffset(rdev->bdev);
+ else
+ rdev->sb_offset =
+ rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS;
+
size = calc_dev_size(rdev, mddev->chunk_size);
rdev->size = size;
return 0;
}
+/*
+ * update_array_info is used to change the configuration of an
+ * on-line array.
+ * The version, ctime,level,size,raid_disks,not_persistent, layout,chunk_size
+ * fields in the info are checked against the array.
+ * Any differences that cannot be handled will cause an error.
+ * Normally, only one change can be managed at a time.
+ * Returns 0 on success (or nothing to do), -EINVAL for an unsupported
+ * change, -EBUSY while resync/recovery runs, -ENOSPC if a device is
+ * too small, or the personality's resize/reshape error code.
+ */
+static int update_array_info(mddev_t *mddev, mdu_array_info_t *info)
+{
+ int rv = 0;
+ int cnt = 0;
+
+ /* fields that can never be changed on-line must match exactly */
+ if (mddev->major_version != info->major_version ||
+ mddev->minor_version != info->minor_version ||
+/* mddev->patch_version != info->patch_version || */
+ mddev->ctime != info->ctime ||
+ mddev->level != info->level ||
+ mddev->layout != info->layout ||
+ !mddev->persistent != info->not_persistent||
+ mddev->chunk_size != info->chunk_size )
+ return -EINVAL;
+ /* Check there is only one change */
+ if (mddev->size != info->size) cnt++;
+ if (mddev->raid_disks != info->raid_disks) cnt++;
+ if (cnt == 0) return 0;
+ if (cnt > 1) return -EINVAL;
+
+ if (mddev->size != info->size) {
+ mdk_rdev_t * rdev;
+ struct list_head *tmp;
+ if (mddev->pers->resize == NULL)
+ return -EINVAL;
+ /* The "size" is the amount of each device that is used.
+ * This can only make sense for arrays with redundancy.
+ * linear and raid0 always use whatever space is available
+ * We can only consider changing the size if no resync
+ * or reconstruction is happening, and if the new size
+ * is acceptable. It must fit before the sb_offset or,
+ * if that is <data_offset, it must fit before the
+ * size of each device.
+ * If size is zero, we find the largest size that fits.
+ */
+ if (mddev->sync_thread)
+ return -EBUSY;
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ sector_t avail;
+ int fit = (info->size == 0);
+ if (rdev->sb_offset > rdev->data_offset)
+ avail = (rdev->sb_offset*2) - rdev->data_offset;
+ else
+ avail = get_capacity(rdev->bdev->bd_disk)
+ - rdev->data_offset;
+ if (fit && (info->size == 0 || info->size > avail/2))
+ info->size = avail/2;
+ if (avail < ((sector_t)info->size << 1))
+ return -ENOSPC;
+ }
+ rv = mddev->pers->resize(mddev, (sector_t)info->size *2);
+ if (!rv) {
+ struct block_device *bdev;
+
+ /* propagate the new array size to the bdev inode size */
+ bdev = bdget_disk(mddev->gendisk, 0);
+ if (bdev) {
+ down(&bdev->bd_inode->i_sem);
+ i_size_write(bdev->bd_inode, mddev->array_size << 10);
+ up(&bdev->bd_inode->i_sem);
+ bdput(bdev);
+ }
+ }
+ }
+ if (mddev->raid_disks != info->raid_disks) {
+ /* change the number of raid disks */
+ if (mddev->pers->reshape == NULL)
+ return -EINVAL;
+ if (info->raid_disks <= 0 ||
+ info->raid_disks >= mddev->max_disks)
+ return -EINVAL;
+ if (mddev->sync_thread)
+ return -EBUSY;
+ rv = mddev->pers->reshape(mddev, info->raid_disks);
+ if (!rv) {
+ struct block_device *bdev;
+
+ /* propagate the new array size to the bdev inode size */
+ bdev = bdget_disk(mddev->gendisk, 0);
+ if (bdev) {
+ down(&bdev->bd_inode->i_sem);
+ i_size_write(bdev->bd_inode, mddev->array_size << 10);
+ up(&bdev->bd_inode->i_sem);
+ bdput(bdev);
+ }
+ }
+ }
+ /* record the accepted change in the superblocks */
+ md_update_sb(mddev);
+ return rv;
+}
+
static int set_disk_faulty(mddev_t *mddev, dev_t dev)
{
mdk_rdev_t *rdev;
static int md_ioctl(struct inode *inode, struct file *file,
unsigned int cmd, unsigned long arg)
{
- char b[BDEVNAME_SIZE];
int err = 0;
- struct hd_geometry *loc = (struct hd_geometry *) arg;
+ void __user *argp = (void __user *)arg;
+ struct hd_geometry __user *loc = argp;
mddev_t *mddev = NULL;
if (!capable(CAP_SYS_ADMIN))
switch (cmd)
{
case RAID_VERSION:
- err = get_version((void *)arg);
+ err = get_version(argp);
goto done;
case PRINT_RAID_DEBUG:
}
err = autostart_array(new_decode_dev(arg));
if (err) {
- printk(KERN_WARNING "md: autostart %s failed!\n",
- __bdevname(arg, b));
+ printk(KERN_WARNING "md: autostart failed!\n");
goto abort;
}
goto done;
switch (cmd)
{
case SET_ARRAY_INFO:
-
- if (!list_empty(&mddev->disks)) {
- printk(KERN_WARNING
- "md: array %s already has disks!\n",
- mdname(mddev));
- err = -EBUSY;
- goto abort_unlock;
- }
- if (mddev->raid_disks) {
- printk(KERN_WARNING
- "md: array %s already initialised!\n",
- mdname(mddev));
- err = -EBUSY;
- goto abort_unlock;
- }
{
mdu_array_info_t info;
if (!arg)
memset(&info, 0, sizeof(info));
- else if (copy_from_user(&info, (void*)arg, sizeof(info))) {
+ else if (copy_from_user(&info, argp, sizeof(info))) {
err = -EFAULT;
goto abort_unlock;
}
+ if (mddev->pers) {
+ err = update_array_info(mddev, &info);
+ if (err) {
+ printk(KERN_WARNING "md: couldn't update"
+ " array info. %d\n", err);
+ goto abort_unlock;
+ }
+ goto done_unlock;
+ }
+ if (!list_empty(&mddev->disks)) {
+ printk(KERN_WARNING
+ "md: array %s already has disks!\n",
+ mdname(mddev));
+ err = -EBUSY;
+ goto abort_unlock;
+ }
+ if (mddev->raid_disks) {
+ printk(KERN_WARNING
+ "md: array %s already initialised!\n",
+ mdname(mddev));
+ err = -EBUSY;
+ goto abort_unlock;
+ }
err = set_array_info(mddev, &info);
if (err) {
printk(KERN_WARNING "md: couldn't set"
- " array info. %d\n", err);
+ " array info. %d\n", err);
goto abort_unlock;
}
}
switch (cmd)
{
case GET_ARRAY_INFO:
- err = get_array_info(mddev, (void *)arg);
+ err = get_array_info(mddev, argp);
goto done_unlock;
case GET_DISK_INFO:
- err = get_disk_info(mddev, (void *)arg);
+ err = get_disk_info(mddev, argp);
goto done_unlock;
case RESTART_ARRAY_RW:
err = -EINVAL;
goto abort_unlock;
}
- err = put_user (2, (char *) &loc->heads);
+ err = put_user (2, (char __user *) &loc->heads);
if (err)
goto abort_unlock;
- err = put_user (4, (char *) &loc->sectors);
+ err = put_user (4, (char __user *) &loc->sectors);
if (err)
goto abort_unlock;
err = put_user(get_capacity(mddev->gendisk)/8,
- (short *) &loc->cylinders);
+ (short __user *) &loc->cylinders);
if (err)
goto abort_unlock;
err = put_user (get_start_sect(inode->i_bdev),
- (long *) &loc->start);
+ (long __user *) &loc->start);
goto done_unlock;
}
case ADD_NEW_DISK:
{
mdu_disk_info_t info;
- if (copy_from_user(&info, (void*)arg, sizeof(info)))
+ if (copy_from_user(&info, argp, sizeof(info)))
err = -EFAULT;
else
err = add_new_disk(mddev, &info);
goto done_unlock;
}
- case HOT_GENERATE_ERROR:
- err = hot_generate_error(mddev, new_decode_dev(arg));
- goto done_unlock;
+
case HOT_REMOVE_DISK:
err = hot_remove_disk(mddev, new_decode_dev(arg));
goto done_unlock;
return thread;
}
-void md_interrupt_thread(mdk_thread_t *thread)
+static void md_interrupt_thread(mdk_thread_t *thread)
{
if (!thread->tsk) {
MD_BUG();
if (!mddev->pers->error_handler)
return;
mddev->pers->error_handler(mddev,rdev);
+ set_bit(MD_RECOVERY_INTR, &mddev->recovery);
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
md_wakeup_thread(mddev->thread);
}
unsigned long max_blocks, resync, res, dt, db, rt;
resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active))/2;
- max_blocks = mddev->size;
+
+ if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
+ max_blocks = mddev->resync_max_sectors >> 1;
+ else
+ max_blocks = mddev->size;
/*
* Should not happen.
return 0;
}
-void md_sync_acct(mdk_rdev_t *rdev, unsigned long nr_sectors)
-{
- rdev->bdev->bd_contains->bd_disk->sync_io += nr_sectors;
-}
-
static int is_mddev_idle(mddev_t *mddev)
{
mdk_rdev_t * rdev;
struct gendisk *disk = rdev->bdev->bd_contains->bd_disk;
curr_events = disk_stat_read(disk, read_sectors) +
disk_stat_read(disk, write_sectors) -
- disk->sync_io;
- if ((curr_events - rdev->last_events) > 32) {
+ atomic_read(&disk->sync_io);
+ /* Allow some slack between value of curr_events and last_events,
+ * as there are some uninteresting races.
+ * Note: the following is an unsigned comparison.
+ */
+ if ((curr_events - rdev->last_events + 32) > 64) {
rdev->last_events = curr_events;
idle = 0;
}
}
} while (mddev->curr_resync < 2);
- max_sectors = mddev->size << 1;
+ if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
+ /* resync follows the size requested by the personality,
+ * which default to physical size, but can be virtual size
+ */
+ max_sectors = mddev->resync_max_sectors;
+ else
+ /* recovery follows the physical size of devices */
+ max_sectors = mddev->size << 1;
printk(KERN_INFO "md: syncing RAID array %s\n", mdname(mddev));
printk(KERN_INFO "md: minimum _guaranteed_ reconstruction speed:"
j += sectors;
if (j>1) mddev->curr_resync = j;
- if (last_check + window > j)
+ if (last_check + window > j || j == max_sectors)
continue;
last_check = j;
if (rdev->raid_disk >= 0 &&
rdev->faulty &&
atomic_read(&rdev->nr_pending)==0) {
- mddev->pers->hot_remove_disk(mddev, rdev->raid_disk);
- rdev->raid_disk = -1;
+ if (mddev->pers->hot_remove_disk(mddev, rdev->raid_disk)==0)
+ rdev->raid_disk = -1;
}
if (!rdev->faulty && rdev->raid_disk >= 0 && !rdev->in_sync)
spares++;
static void autostart_arrays(int part)
{
- char b[BDEVNAME_SIZE];
mdk_rdev_t *rdev;
int i;
dev_t dev = detected_devices[i];
rdev = md_import_device(dev,0, 0);
- if (IS_ERR(rdev)) {
- printk(KERN_ALERT "md: could not import %s!\n",
- __bdevname(dev, b));
+ if (IS_ERR(rdev))
continue;
- }
+
if (rdev->faulty) {
MD_BUG();
continue;
EXPORT_SYMBOL(register_md_personality);
EXPORT_SYMBOL(unregister_md_personality);
EXPORT_SYMBOL(md_error);
-EXPORT_SYMBOL(md_sync_acct);
EXPORT_SYMBOL(md_done_sync);
EXPORT_SYMBOL(md_write_start);
EXPORT_SYMBOL(md_write_end);
EXPORT_SYMBOL(md_unregister_thread);
EXPORT_SYMBOL(md_wakeup_thread);
EXPORT_SYMBOL(md_print_devices);
-EXPORT_SYMBOL(md_interrupt_thread);
EXPORT_SYMBOL(md_check_recovery);
MODULE_LICENSE("GPL");