#include <linux/raid/raid1.h>
-#define MAJOR_NR MD_MAJOR
-#define MD_DRIVER
-#define MD_PERSONALITY
-
/*
* Number of guaranteed r1bios in case of extreme VM load:
*/
static void * r1bio_pool_alloc(int gfp_flags, void *data)
{
- mddev_t *mddev = data;
+ struct pool_info *pi = data; /* pool data now supplies both the mddev and the disk count */
r1bio_t *r1_bio;
+ int size = offsetof(r1bio_t, bios[pi->raid_disks]); /* r1bio header plus raid_disks bios[] slots */
/* allocate a r1bio with room for raid_disks entries in the bios array */
- r1_bio = kmalloc(sizeof(r1bio_t) + sizeof(struct bio*)*mddev->raid_disks,
- gfp_flags);
+ r1_bio = kmalloc(size, gfp_flags);
if (r1_bio)
- memset(r1_bio, 0, sizeof(*r1_bio) + sizeof(struct bio*)*mddev->raid_disks);
+ memset(r1_bio, 0, size); /* zero exactly the bytes that were allocated */
else
- unplug_slaves(mddev);
+ unplug_slaves(pi->mddev); /* allocation failed; NULL is returned below */
return r1_bio;
}
static void * r1buf_pool_alloc(int gfp_flags, void *data)
{
- conf_t *conf = data;
+ struct pool_info *pi = data;
struct page *page;
r1bio_t *r1_bio;
struct bio *bio;
int i, j;
- r1_bio = r1bio_pool_alloc(gfp_flags, conf->mddev);
+ r1_bio = r1bio_pool_alloc(gfp_flags, pi);
if (!r1_bio) {
- unplug_slaves(conf->mddev);
+ unplug_slaves(pi->mddev);
return NULL;
}
/*
* Allocate bios : 1 for reading, n-1 for writing
*/
- for (j = conf->raid_disks ; j-- ; ) {
+ for (j = pi->raid_disks ; j-- ; ) {
bio = bio_alloc(gfp_flags, RESYNC_PAGES);
if (!bio)
goto out_free_bio;
bio->bi_io_vec[i].bv_page = page;
}
- r1_bio->master_bio = bio;
+ r1_bio->master_bio = NULL;
return r1_bio;
for ( ; i > 0 ; i--)
__free_page(bio->bi_io_vec[i-1].bv_page);
out_free_bio:
- while ( ++j < conf->raid_disks )
+ while ( ++j < pi->raid_disks )
bio_put(r1_bio->bios[j]);
- r1bio_pool_free(r1_bio, conf->mddev);
+ r1bio_pool_free(r1_bio, data);
return NULL;
}
static void r1buf_pool_free(void *__r1_bio, void *data)
{
+ struct pool_info *pi = data;
int i;
- conf_t *conf = data;
r1bio_t *r1bio = __r1_bio;
struct bio *bio = r1bio->bios[0];
__free_page(bio->bi_io_vec[i].bv_page);
bio->bi_io_vec[i].bv_page = NULL;
}
- for (i=0 ; i < conf->raid_disks; i++)
+ for (i=0 ; i < pi->raid_disks; i++)
bio_put(r1bio->bios[i]);
- r1bio_pool_free(r1bio, conf->mddev);
+ r1bio_pool_free(r1bio, data);
}
static void put_all_bios(conf_t *conf, r1bio_t *r1_bio)
spin_unlock_irqrestore(&conf->resync_lock, flags);
}
-static int map(mddev_t *mddev, mdk_rdev_t **rdevp)
-{
- conf_t *conf = mddev_to_conf(mddev);
- int i, disks = conf->raid_disks;
-
- /*
- * Later we do read balancing on the read side
- * now we use the first available disk.
- */
-
- spin_lock_irq(&conf->device_lock);
- for (i = 0; i < disks; i++) {
- mdk_rdev_t *rdev = conf->mirrors[i].rdev;
- if (rdev && rdev->in_sync) {
- *rdevp = rdev;
- atomic_inc(&rdev->nr_pending);
- spin_unlock_irq(&conf->device_lock);
- return 0;
- }
- }
- spin_unlock_irq(&conf->device_lock);
-
- printk(KERN_ERR "raid1_map(): huh, no more operational devices?\n");
- return -1;
-}
-
static void reschedule_retry(r1bio_t *r1_bio)
{
unsigned long flags;
* oops, read error:
*/
char b[BDEVNAME_SIZE];
- printk(KERN_ERR "raid1: %s: rescheduling sector %llu\n",
- bdevname(conf->mirrors[mirror].rdev->bdev,b), (unsigned long long)r1_bio->sector);
+ if (printk_ratelimit())
+ printk(KERN_ERR "raid1: %s: rescheduling sector %llu\n",
+ bdevname(conf->mirrors[mirror].rdev->bdev,b), (unsigned long long)r1_bio->sector);
reschedule_retry(r1_bio);
}
- atomic_dec(&conf->mirrors[mirror].rdev->nr_pending);
+ rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev);
return 0;
}
raid_end_bio_io(r1_bio);
}
- atomic_dec(&conf->mirrors[mirror].rdev->nr_pending);
+ rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev);
return 0;
}
*
* The rdev for the device selected will have nr_pending incremented.
*/
-static int read_balance(conf_t *conf, struct bio *bio, r1bio_t *r1_bio)
+static int read_balance(conf_t *conf, r1bio_t *r1_bio)
{
const unsigned long this_sector = r1_bio->sector;
int new_disk = conf->last_used, disk = new_disk;
- const int sectors = bio->bi_size >> 9;
+ const int sectors = r1_bio->sectors;
sector_t new_distance, current_distance;
spin_lock_irq(&conf->device_lock);
* device if no resync is going on, or below the resync window.
* We take the first readable disk when above the resync window.
*/
- if (!conf->mddev->in_sync && (this_sector + sectors >= conf->next_resync)) {
- /* make sure that disk is operational */
+ if (conf->mddev->recovery_cp < MaxSector &&
+ (this_sector + sectors >= conf->next_resync)) {
+ /* Choose the first operational device, for consistency */
new_disk = 0;
while (!conf->mirrors[new_disk].rdev ||
!conf->mirrors[new_disk].rdev->in_sync) {
new_disk++;
if (new_disk == conf->raid_disks) {
- new_disk = 0;
+ new_disk = -1;
break;
}
}
new_disk = conf->raid_disks;
new_disk--;
if (new_disk == disk) {
- new_disk = conf->last_used;
+ new_disk = -1;
goto rb_out;
}
}
} while (disk != conf->last_used);
rb_out:
- r1_bio->read_disk = new_disk;
- conf->next_seq_sect = this_sector + sectors;
- conf->last_used = new_disk;
- if (conf->mirrors[new_disk].rdev)
+ if (new_disk >= 0) {
+ conf->next_seq_sect = this_sector + sectors;
+ conf->last_used = new_disk;
atomic_inc(&conf->mirrors[new_disk].rdev->nr_pending);
+ }
spin_unlock_irq(&conf->device_lock);
return new_disk;
spin_lock_irqsave(&conf->device_lock, flags);
for (i=0; i<mddev->raid_disks; i++) {
mdk_rdev_t *rdev = conf->mirrors[i].rdev;
- if (rdev && !rdev->faulty) {
+ if (rdev && atomic_read(&rdev->nr_pending)) {
request_queue_t *r_queue = bdev_get_queue(rdev->bdev);
+ atomic_inc(&rdev->nr_pending);
+ spin_unlock_irqrestore(&conf->device_lock, flags);
+
if (r_queue->unplug_fn)
r_queue->unplug_fn(r_queue);
+
+ spin_lock_irqsave(&conf->device_lock, flags);
+ atomic_dec(&rdev->nr_pending);
}
}
spin_unlock_irqrestore(&conf->device_lock, flags);
unplug_slaves(q->queuedata);
}
+/*
+ * raid1_issue_flush - pass a flush request on to the member devices.
+ * @q:            the array's request queue; q->queuedata is the mddev
+ * @disk:         not used by this function
+ * @error_sector: handed unchanged to each member's issue_flush_fn
+ *
+ * Calls issue_flush_fn on every non-faulty member whose queue provides
+ * one, and stops at the first member that returns non-zero.
+ *
+ * device_lock protects the walk over conf->mirrors, but it is released
+ * around each call into a member's queue so that the handler is never
+ * entered with the spinlock held and interrupts disabled.  A nr_pending
+ * reference is held on the rdev across the unlocked call, which is the
+ * same pattern this driver uses around unplug_fn.
+ *
+ * Returns 0 if every handler succeeded (or no member had one), otherwise
+ * the first non-zero value returned by a member.
+ */
+static int raid1_issue_flush(request_queue_t *q, struct gendisk *disk,
+			     sector_t *error_sector)
+{
+	mddev_t *mddev = q->queuedata;
+	conf_t *conf = mddev_to_conf(mddev);
+	unsigned long flags;
+	int i, ret = 0;
+
+	spin_lock_irqsave(&conf->device_lock, flags);
+	for (i = 0; i < mddev->raid_disks && ret == 0; i++) {
+		mdk_rdev_t *rdev = conf->mirrors[i].rdev;
+		struct block_device *bdev;
+		request_queue_t *r_queue;
+
+		if (!rdev || rdev->faulty)
+			continue;
+
+		bdev = rdev->bdev;
+		r_queue = bdev_get_queue(bdev);
+		if (!r_queue->issue_flush_fn)
+			continue;
+
+		/* hold a pending reference while the lock is dropped */
+		atomic_inc(&rdev->nr_pending);
+		spin_unlock_irqrestore(&conf->device_lock, flags);
+
+		ret = r_queue->issue_flush_fn(r_queue, bdev->bd_disk,
+					      error_sector);
+
+		spin_lock_irqsave(&conf->device_lock, flags);
+		rdev_dec_pending(rdev, mddev);
+	}
+	spin_unlock_irqrestore(&conf->device_lock, flags);
+	return ret;
+}
+
/*
* Throttle resync depth, so that we can both get proper overlapping of
* requests, but are still able to handle normal requests quickly.
mirror_info_t *mirror;
r1bio_t *r1_bio;
struct bio *read_bio;
- int i, disks = conf->raid_disks;
+ int i, disks;
/*
* Register the new request and wait if the reconstruction
r1_bio->mddev = mddev;
r1_bio->sector = bio->bi_sector;
+ r1_bio->state = 0;
+
if (bio_data_dir(bio) == READ) {
/*
* read balancing logic:
*/
- mirror = conf->mirrors + read_balance(conf, bio, r1_bio);
+ int rdisk = read_balance(conf, r1_bio);
+
+ if (rdisk < 0) {
+ /* couldn't find anywhere to read from */
+ raid_end_bio_io(r1_bio);
+ return 0;
+ }
+ mirror = conf->mirrors + rdisk;
+
+ r1_bio->read_disk = rdisk;
read_bio = bio_clone(bio, GFP_NOIO);
- r1_bio->bios[r1_bio->read_disk] = read_bio;
+ r1_bio->bios[rdisk] = read_bio;
read_bio->bi_sector = r1_bio->sector + mirror->rdev->data_offset;
read_bio->bi_bdev = mirror->rdev->bdev;
* inc refcount on their rdev. Record them by setting
* bios[x] to bio
*/
+ disks = conf->raid_disks;
spin_lock_irq(&conf->device_lock);
for (i = 0; i < disks; i++) {
if (conf->mirrors[i].rdev &&
*/
if (rdev->bdev->bd_disk->queue->merge_bvec_fn &&
mddev->queue->max_sectors > (PAGE_SIZE>>9))
- mddev->queue->max_sectors = (PAGE_SIZE>>9);
+ blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9);
p->head_position = 0;
rdev->raid_disk = mirror;
conf->mirrors[r1_bio->read_disk].rdev);
else
set_bit(R1BIO_Uptodate, &r1_bio->state);
- atomic_dec(&conf->mirrors[r1_bio->read_disk].rdev->nr_pending);
+ rdev_dec_pending(conf->mirrors[r1_bio->read_disk].rdev, conf->mddev);
reschedule_retry(r1_bio);
return 0;
}
md_done_sync(mddev, r1_bio->sectors, uptodate);
put_buf(r1_bio);
}
- atomic_dec(&conf->mirrors[mirror].rdev->nr_pending);
+ rdev_dec_pending(conf->mirrors[mirror].rdev, mddev);
return 0;
}
atomic_inc(&conf->mirrors[i].rdev->nr_pending);
atomic_inc(&r1_bio->remaining);
- md_sync_acct(conf->mirrors[i].rdev, wbio->bi_size >> 9);
+ md_sync_acct(conf->mirrors[i].rdev->bdev, wbio->bi_size >> 9);
generic_make_request(wbio);
}
mddev = r1_bio->mddev;
conf = mddev_to_conf(mddev);
- bio = r1_bio->master_bio;
if (test_bit(R1BIO_IsSync, &r1_bio->state)) {
sync_request_write(mddev, r1_bio);
unplug = 1;
} else {
- if (map(mddev, &rdev) == -1) {
+ int disk;
+ bio = r1_bio->bios[r1_bio->read_disk];
+ if ((disk=read_balance(conf, r1_bio)) == -1) {
printk(KERN_ALERT "raid1: %s: unrecoverable I/O"
" read error for block %llu\n",
bdevname(bio->bi_bdev,b),
(unsigned long long)r1_bio->sector);
raid_end_bio_io(r1_bio);
} else {
- printk(KERN_ERR "raid1: %s: redirecting sector %llu to"
- " another mirror\n",
- bdevname(rdev->bdev,b),
- (unsigned long long)r1_bio->sector);
+ r1_bio->bios[r1_bio->read_disk] = NULL;
+ r1_bio->read_disk = disk;
+ r1_bio->bios[r1_bio->read_disk] = bio;
+ rdev = conf->mirrors[disk].rdev;
+ if (printk_ratelimit())
+ printk(KERN_ERR "raid1: %s: redirecting sector %llu to"
+ " another mirror\n",
+ bdevname(rdev->bdev,b),
+ (unsigned long long)r1_bio->sector);
bio->bi_bdev = rdev->bdev;
bio->bi_sector = r1_bio->sector + rdev->data_offset;
bio->bi_rw = READ;
buffs = RESYNC_WINDOW / RESYNC_BLOCK_SIZE;
if (conf->r1buf_pool)
BUG();
- conf->r1buf_pool = mempool_create(buffs, r1buf_pool_alloc, r1buf_pool_free, conf);
+ conf->r1buf_pool = mempool_create(buffs, r1buf_pool_alloc, r1buf_pool_free,
+ conf->poolinfo);
if (!conf->r1buf_pool)
return -ENOMEM;
conf->next_resync = 0;
sector_t max_sector, nr_sectors;
int disk;
int i;
+ int write_targets = 0;
if (!conf->r1buf_pool)
if (init_resync(conf))
sector_nr + RESYNC_SECTORS > mddev->recovery_cp)) {
bio->bi_rw = WRITE;
bio->bi_end_io = end_sync_write;
+ write_targets ++;
} else
continue;
bio->bi_sector = sector_nr + conf->mirrors[i].rdev->data_offset;
bio->bi_bdev = conf->mirrors[i].rdev->bdev;
bio->bi_private = r1_bio;
}
+ if (write_targets == 0) {
+ /* There is nowhere to write, so all non-sync
+ * drives must be failed - so we are finished
+ */
+ int rv = max_sector - sector_nr;
+ md_done_sync(mddev, rv, 1);
+ put_buf(r1_bio);
+ atomic_dec(&conf->mirrors[disk].rdev->nr_pending);
+ return rv;
+ }
+
nr_sectors = 0;
do {
struct page *page;
bio = r1_bio->bios[disk];
r1_bio->sectors = nr_sectors;
- md_sync_acct(mirror->rdev, nr_sectors);
+ md_sync_acct(mirror->rdev->bdev, nr_sectors);
generic_make_request(bio);
*/
conf = kmalloc(sizeof(conf_t), GFP_KERNEL);
mddev->private = conf;
- if (!conf) {
- printk(KERN_ERR "raid1: couldn't allocate memory for %s\n",
- mdname(mddev));
- goto out;
- }
+ if (!conf)
+ goto out_no_mem;
+
memset(conf, 0, sizeof(*conf));
conf->mirrors = kmalloc(sizeof(struct mirror_info)*mddev->raid_disks,
GFP_KERNEL);
- if (!conf->mirrors) {
- printk(KERN_ERR "raid1: couldn't allocate memory for %s\n",
- mdname(mddev));
- goto out_free_conf;
- }
+ if (!conf->mirrors)
+ goto out_no_mem;
+
memset(conf->mirrors, 0, sizeof(struct mirror_info)*mddev->raid_disks);
+ conf->poolinfo = kmalloc(sizeof(*conf->poolinfo), GFP_KERNEL);
+ if (!conf->poolinfo)
+ goto out_no_mem;
+ conf->poolinfo->mddev = mddev;
+ conf->poolinfo->raid_disks = mddev->raid_disks;
conf->r1bio_pool = mempool_create(NR_RAID1_BIOS, r1bio_pool_alloc,
- r1bio_pool_free, mddev);
- if (!conf->r1bio_pool) {
- printk(KERN_ERR "raid1: couldn't allocate memory for %s\n",
- mdname(mddev));
- goto out_free_conf;
- }
+ r1bio_pool_free,
+ conf->poolinfo);
+ if (!conf->r1bio_pool)
+ goto out_no_mem;
+
mddev->queue->unplug_fn = raid1_unplug;
+ mddev->queue->issue_flush_fn = raid1_issue_flush;
ITERATE_RDEV(mddev, rdev, tmp) {
disk_idx = rdev->raid_disk;
*/
if (rdev->bdev->bd_disk->queue->merge_bvec_fn &&
mddev->queue->max_sectors > (PAGE_SIZE>>9))
- mddev->queue->max_sectors = (PAGE_SIZE>>9);
+ blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9);
disk->head_position = 0;
if (!rdev->faulty && rdev->in_sync)
return 0;
+out_no_mem:
+ printk(KERN_ERR "raid1: couldn't allocate memory for %s\n",
+ mdname(mddev));
+
out_free_conf:
- if (conf->r1bio_pool)
- mempool_destroy(conf->r1bio_pool);
- if (conf->mirrors)
- kfree(conf->mirrors);
- kfree(conf);
- mddev->private = NULL;
+ if (conf) {
+ if (conf->r1bio_pool)
+ mempool_destroy(conf->r1bio_pool);
+ if (conf->mirrors)
+ kfree(conf->mirrors);
+ if (conf->poolinfo)
+ kfree(conf->poolinfo);
+ kfree(conf);
+ mddev->private = NULL;
+ }
out:
return -EIO;
}
mempool_destroy(conf->r1bio_pool);
if (conf->mirrors)
kfree(conf->mirrors);
+ if (conf->poolinfo)
+ kfree(conf->poolinfo);
kfree(conf);
mddev->private = NULL;
return 0;
}
+/*
+ * raid1_resize - record a new size for the array.
+ * @mddev:   array being resized
+ * @sectors: new size, in sectors
+ *
+ * No resync is happening, and there is enough space on all devices,
+ * so the resize can simply be applied.  Any newly exposed space must
+ * be covered by a resync.  When shrinking we should arguably wait for
+ * I/O in the removed region to finish, but it hardly seems worth it.
+ *
+ * Always returns 0.
+ */
+static int raid1_resize(mddev_t *mddev, sector_t sectors)
+{
+	/* array_size holds half the sector count; capacity is in sectors */
+	mddev->array_size = sectors >> 1;
+	set_capacity(mddev->gendisk, mddev->array_size << 1);
+	mddev->changed = 1;
+
+	if (mddev->recovery_cp == MaxSector &&
+	    mddev->array_size > mddev->size) {
+		/* array was fully in sync and has grown: resync from the old end */
+		mddev->recovery_cp = mddev->size << 1;
+		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+	}
+
+	mddev->size = mddev->array_size;
+	return 0;
+}
+
+/*
+ * raid1_reshape - change the number of mirror slots of the array.
+ * @mddev:      array being reshaped
+ * @raid_disks: requested number of slots
+ *
+ * We need to:
+ *  1/ resize the r1bio_pool
+ *  2/ resize conf->mirrors
+ *
+ * We allocate a new r1bio_pool if we can.
+ * Then raise a device barrier and wait until all IO stops.
+ * Then resize conf->mirrors and swap in the new r1bio pool.
+ *
+ * Returns 0 on success, -EBUSY if a slot that would be dropped still
+ * holds a device, or -ENOMEM if an allocation fails (in which case the
+ * array is left untouched).
+ */
+static int raid1_reshape(mddev_t *mddev, int raid_disks)
+{
+	mempool_t *newpool, *oldpool;
+	struct pool_info *newpoolinfo;
+	mirror_info_t *newmirrors;
+	conf_t *conf = mddev_to_conf(mddev);
+	int d;
+
+	/* refuse to drop a slot that still has a device attached */
+	for (d = raid_disks; d < conf->raid_disks; d++)
+		if (conf->mirrors[d].rdev)
+			return -EBUSY;
+
+	/* allocate the structure itself, not just room for a pointer to it */
+	newpoolinfo = kmalloc(sizeof(*newpoolinfo), GFP_KERNEL);
+	if (!newpoolinfo)
+		return -ENOMEM;
+	newpoolinfo->mddev = mddev;
+	newpoolinfo->raid_disks = raid_disks;
+
+	newpool = mempool_create(NR_RAID1_BIOS, r1bio_pool_alloc,
+				 r1bio_pool_free, newpoolinfo);
+	if (!newpool) {
+		kfree(newpoolinfo);
+		return -ENOMEM;
+	}
+	newmirrors = kmalloc(sizeof(struct mirror_info) * raid_disks, GFP_KERNEL);
+	if (!newmirrors) {
+		/* the pool still refers to newpoolinfo, so tear it down first */
+		mempool_destroy(newpool);
+		kfree(newpoolinfo);
+		return -ENOMEM;
+	}
+	memset(newmirrors, 0, sizeof(struct mirror_info)*raid_disks);
+
+	spin_lock_irq(&conf->resync_lock);
+	conf->barrier++;
+	wait_event_lock_irq(conf->wait_idle, !conf->nr_pending,
+			    conf->resync_lock, unplug_slaves(mddev));
+	spin_unlock_irq(&conf->resync_lock);
+
+	/* ok, everything is stopped */
+	oldpool = conf->r1bio_pool;
+	conf->r1bio_pool = newpool;
+	/* carry over the slots that exist in both the old and new layout */
+	for (d = 0; d < raid_disks && d < conf->raid_disks; d++)
+		newmirrors[d] = conf->mirrors[d];
+	kfree(conf->mirrors);
+	conf->mirrors = newmirrors;
+	kfree(conf->poolinfo);
+	conf->poolinfo = newpoolinfo;
+
+	/* added slots start out empty; removed slots were checked empty above */
+	mddev->degraded += (raid_disks - conf->raid_disks);
+	conf->raid_disks = mddev->raid_disks = raid_disks;
+
+	spin_lock_irq(&conf->resync_lock);
+	conf->barrier--;
+	spin_unlock_irq(&conf->resync_lock);
+	wake_up(&conf->wait_resume);
+	wake_up(&conf->wait_idle);
+
+
+	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+	md_wakeup_thread(mddev->thread);
+
+	mempool_destroy(oldpool);
+	return 0;
+}
+
+
static mdk_personality_t raid1_personality =
{
.name = "raid1",
.hot_remove_disk= raid1_remove_disk,
.spare_active = raid1_spare_active,
.sync_request = sync_request,
+ .resize = raid1_resize,
+ .reshape = raid1_reshape,
};
static int __init raid_init(void)