#include <linux/slab.h>
#include <linux/raid/raid5.h>
#include <linux/highmem.h>
-#include <asm/bitops.h>
+#include <linux/bitops.h>
#include <asm/atomic.h>
/*
* This macro is used to determine the 'next' bio in the list, given the sector
* of the current stripe+device
*/
-#define r5_next_bio(bio, sect) ( ( bio->bi_sector + (bio->bi_size>>9) < sect + STRIPE_SECTORS) ? bio->bi_next : NULL)
+#define r5_next_bio(bio, sect) ( ( (bio)->bi_sector + ((bio)->bi_size>>9) < (sect) + STRIPE_SECTORS) ? (bio)->bi_next : NULL)
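For reference, the per-device toread/towrite chains are walked with this macro elsewhere in the file; a condensed sketch of the pattern (assuming a struct r5dev *dev, as used below):

	struct bio *rbi;
	for (rbi = dev->toread;
	     rbi && rbi->bi_sector < dev->sector + STRIPE_SECTORS;
	     rbi = r5_next_bio(rbi, dev->sector)) {
		/* rbi overlaps this stripe+device, e.g. copy_data(0, rbi, ...) */
	}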
/*
* The following can be used to debug the driver
*/
#define RAID5_DEBUG 0
#define RAID5_PARANOIA 1
#if RAID5_PARANOIA && defined(CONFIG_SMP)
-# define CHECK_DEVLOCK() if (!spin_is_locked(&conf->device_lock)) BUG()
+# define CHECK_DEVLOCK() assert_spin_locked(&conf->device_lock)
#else
# define CHECK_DEVLOCK()
#endif
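Where the check is compiled in, assert_spin_locked() both documents and enforces a locking precondition; a helper that assumes conf->device_lock is held would open with it (hypothetical helper, for illustration only):

	static void __helper_needs_devlock(raid5_conf_t *conf)
	{
		CHECK_DEVLOCK();	/* caller must hold conf->device_lock */
		/* ... manipulate conf->handle_list, conf->inactive_list ... */
	}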
}
static void unplug_slaves(mddev_t *mddev);
+static void raid5_unplug_device(request_queue_t *q);
static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector,
int pd_idx, int noblock)
return 1;
memset(sh, 0, sizeof(*sh) + (devs-1)*sizeof(struct r5dev));
sh->raid_conf = conf;
- sh->lock = SPIN_LOCK_UNLOCKED;
+ spin_lock_init(&sh->lock);
if (grow_buffers(sh, conf->raid_disks)) {
shrink_buffers(sh, conf->raid_disks);
bio_init(&dev->req);
dev->req.bi_io_vec = &dev->vec;
dev->req.bi_vcnt++;
+ dev->req.bi_max_vecs++;
dev->vec.bv_page = dev->page;
dev->vec.bv_len = STRIPE_SIZE;
dev->vec.bv_offset = 0;
/*
- * Copy data between a page in the stripe cache, and one or more bion
- * The page could align with the middle of the bio, or there could be
- * several bion, each with several bio_vecs, which cover part of the page
- * Multiple bion are linked together on bi_next. There may be extras
- * at the end of this list. We ignore them.
+ * Copy data between a page in the stripe cache and a bio.
+ * There are no alignment or size guarantees between the page and the
+ * bio except that there is some overlap.
+ * All iovecs in the bio must be considered.
*/
static void copy_data(int frombio, struct bio *bio,
struct page *page,
char *pa = page_address(page);
struct bio_vec *bvl;
int i;
+ int page_offset;
- for (;bio && bio->bi_sector < sector+STRIPE_SECTORS;
- bio = r5_next_bio(bio, sector) ) {
- int page_offset;
- if (bio->bi_sector >= sector)
- page_offset = (signed)(bio->bi_sector - sector) * 512;
- else
- page_offset = (signed)(sector - bio->bi_sector) * -512;
- bio_for_each_segment(bvl, bio, i) {
- int len = bio_iovec_idx(bio,i)->bv_len;
- int clen;
- int b_offset = 0;
-
- if (page_offset < 0) {
- b_offset = -page_offset;
- page_offset += b_offset;
- len -= b_offset;
- }
+ if (bio->bi_sector >= sector)
+ page_offset = (signed)(bio->bi_sector - sector) * 512;
+ else
+ page_offset = (signed)(sector - bio->bi_sector) * -512;
+ bio_for_each_segment(bvl, bio, i) {
+ int len = bio_iovec_idx(bio,i)->bv_len;
+ int clen;
+ int b_offset = 0;
+
+ if (page_offset < 0) {
+ b_offset = -page_offset;
+ page_offset += b_offset;
+ len -= b_offset;
+ }
- if (len > 0 && page_offset + len > STRIPE_SIZE)
- clen = STRIPE_SIZE - page_offset;
- else clen = len;
+ if (len > 0 && page_offset + len > STRIPE_SIZE)
+ clen = STRIPE_SIZE - page_offset;
+ else clen = len;
- if (clen > 0) {
- char *ba = __bio_kmap_atomic(bio, i, KM_USER0);
- if (frombio)
- memcpy(pa+page_offset, ba+b_offset, clen);
- else
- memcpy(ba+b_offset, pa+page_offset, clen);
- __bio_kunmap_atomic(ba, KM_USER0);
- }
- if (clen < len) /* hit end of page */
- break;
- page_offset += len;
+ if (clen > 0) {
+ char *ba = __bio_kmap_atomic(bio, i, KM_USER0);
+ if (frombio)
+ memcpy(pa+page_offset, ba+b_offset, clen);
+ else
+ memcpy(ba+b_offset, pa+page_offset, clen);
+ __bio_kunmap_atomic(ba, KM_USER0);
}
+ if (clen < len) /* hit end of page */
+ break;
+ page_offset += len;
}
}
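To make the offset arithmetic above concrete, a worked example (illustrative numbers only, assuming STRIPE_SIZE == 4096, i.e. 8 sectors):

	/*
	 * A bio starting 2 sectors before the stripe page:
	 *   bio->bi_sector == sector - 2  =>  page_offset = -1024
	 * For a first 4096-byte segment:
	 *   b_offset = 1024, page_offset becomes 0, len becomes 3072,
	 *   clen = 3072: bytes 1024..4095 of the segment map onto
	 *   bytes 0..3071 of the page, and the next segment starts
	 *   at page_offset 3072.
	 */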
ptr[count++] = page_address(sh->dev[i].page);
chosen = sh->dev[i].towrite;
sh->dev[i].towrite = NULL;
+
+ if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
+ wake_up(&conf->wait_for_overlap);
+
if (sh->dev[i].written) BUG();
sh->dev[i].written = chosen;
check_xor();
if (i!=pd_idx && sh->dev[i].towrite) {
chosen = sh->dev[i].towrite;
sh->dev[i].towrite = NULL;
+
+ if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
+ wake_up(&conf->wait_for_overlap);
+
if (sh->dev[i].written) BUG();
sh->dev[i].written = chosen;
}
* toread/towrite point to the first in a chain.
* The bi_next chain must be in order.
*/
-static void add_stripe_bio (struct stripe_head *sh, struct bio *bi, int dd_idx, int forwrite)
+static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, int forwrite)
{
struct bio **bip;
raid5_conf_t *conf = sh->raid_conf;
else
bip = &sh->dev[dd_idx].toread;
while (*bip && (*bip)->bi_sector < bi->bi_sector) {
- BUG_ON((*bip)->bi_sector + ((*bip)->bi_size >> 9) > bi->bi_sector);
+ if ((*bip)->bi_sector + ((*bip)->bi_size >> 9) > bi->bi_sector)
+ goto overlap;
bip = & (*bip)->bi_next;
}
-/* FIXME do I need to worry about overlapping bion */
+ if (*bip && (*bip)->bi_sector < bi->bi_sector + ((bi->bi_size)>>9))
+ goto overlap;
+
if (*bip && bi->bi_next && (*bip) != bi->bi_next)
BUG();
if (*bip)
(unsigned long long)sh->sector, dd_idx);
if (forwrite) {
- /* check if page is coverred */
+ /* check if page is covered */
sector_t sector = sh->dev[dd_idx].sector;
for (bi=sh->dev[dd_idx].towrite;
sector < sh->dev[dd_idx].sector + STRIPE_SECTORS &&
if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS)
set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags);
}
+ return 1;
+
+ overlap:
+ set_bit(R5_Overlap, &sh->dev[dd_idx].flags);
+ spin_unlock_irq(&conf->device_lock);
+ spin_unlock(&sh->lock);
+ return 0;
}
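Note the asymmetry on the failure path: add_stripe_bio() drops both sh->lock and conf->device_lock itself before returning 0, so the caller only needs to release the stripe and retry once woken through conf->wait_for_overlap. Condensed, the caller-side pattern (make_request below does exactly this):

	if (!add_stripe_bio(sh, bi, dd_idx, forwrite)) {
		/* locks were dropped for us: back off, flush, retry */
		raid5_unplug_device(mddev->queue);
		release_stripe(sh);
		schedule();	/* ended by wake_up(&conf->wait_for_overlap) */
		goto retry;
	}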
spin_lock_irq(&conf->device_lock);
rbi = dev->toread;
dev->toread = NULL;
+ if (test_and_clear_bit(R5_Overlap, &dev->flags))
+ wake_up(&conf->wait_for_overlap);
spin_unlock_irq(&conf->device_lock);
while (rbi && rbi->bi_sector < dev->sector + STRIPE_SECTORS) {
copy_data(0, rbi, dev->page, dev->sector);
sh->dev[i].towrite = NULL;
if (bi) to_write--;
+ if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
+ wake_up(&conf->wait_for_overlap);
+
while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS){
struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector);
clear_bit(BIO_UPTODATE, &bi->bi_flags);
if (!test_bit(R5_Insync, &sh->dev[i].flags)) {
bi = sh->dev[i].toread;
sh->dev[i].toread = NULL;
+ if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
+ wake_up(&conf->wait_for_overlap);
if (bi) to_read--;
while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS){
struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector);
else
bi->bi_end_io = raid5_end_read_request;
- spin_lock_irq(&conf->device_lock);
+ rcu_read_lock();
rdev = conf->disks[i].rdev;
if (rdev && rdev->faulty)
rdev = NULL;
if (rdev)
atomic_inc(&rdev->nr_pending);
- spin_unlock_irq(&conf->device_lock);
+ rcu_read_unlock();
if (rdev) {
if (test_bit(R5_Syncio, &sh->dev[i].flags))
bi->bi_sector = sh->sector + rdev->data_offset;
bi->bi_flags = 1 << BIO_UPTODATE;
bi->bi_vcnt = 1;
+ bi->bi_max_vecs = 1;
bi->bi_idx = 0;
bi->bi_io_vec = &sh->dev[i].vec;
bi->bi_io_vec[0].bv_len = STRIPE_SIZE;
{
raid5_conf_t *conf = mddev_to_conf(mddev);
int i;
- unsigned long flags;
- spin_lock_irqsave(&conf->device_lock, flags);
+ rcu_read_lock();
for (i=0; i<mddev->raid_disks; i++) {
mdk_rdev_t *rdev = conf->disks[i].rdev;
- if (rdev && atomic_read(&rdev->nr_pending)) {
+ if (rdev && !rdev->faulty && atomic_read(&rdev->nr_pending)) {
request_queue_t *r_queue = bdev_get_queue(rdev->bdev);
atomic_inc(&rdev->nr_pending);
- spin_unlock_irqrestore(&conf->device_lock, flags);
+ rcu_read_unlock();
- if (r_queue && r_queue->unplug_fn)
+ if (r_queue->unplug_fn)
r_queue->unplug_fn(r_queue);
- spin_lock_irqsave(&conf->device_lock, flags);
- atomic_dec(&rdev->nr_pending);
+ rdev_dec_pending(rdev, mddev);
+ rcu_read_lock();
}
}
- spin_unlock_irqrestore(&conf->device_lock, flags);
+ rcu_read_unlock();
}
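unplug_slaves() above and the issue_flush path below now share one pattern for calling into a member device without holding a spinlock across the call: pin the rdev with nr_pending while inside the RCU read section, drop RCU for the (possibly sleeping) call, then unpin. Schematically:

	rcu_read_lock();
	rdev = conf->disks[i].rdev;		/* RCU-protected pointer */
	if (rdev && !rdev->faulty) {
		atomic_inc(&rdev->nr_pending);	/* pin across the call */
		rcu_read_unlock();
		/* ... call into rdev's request queue ... */
		rdev_dec_pending(rdev, mddev);
		rcu_read_lock();
	}
	rcu_read_unlock();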
static void raid5_unplug_device(request_queue_t *q)
raid5_conf_t *conf = mddev_to_conf(mddev);
int i, ret = 0;
- for (i=0; i<mddev->raid_disks; i++) {
+ rcu_read_lock();
+ for (i=0; i<mddev->raid_disks && ret == 0; i++) {
mdk_rdev_t *rdev = conf->disks[i].rdev;
if (rdev && !rdev->faulty) {
struct block_device *bdev = rdev->bdev;
- request_queue_t *r_queue;
-
- if (!bdev)
- continue;
+ request_queue_t *r_queue = bdev_get_queue(bdev);
- r_queue = bdev_get_queue(bdev);
- if (!r_queue)
- continue;
-
- if (!r_queue->issue_flush_fn) {
+ if (!r_queue->issue_flush_fn)
ret = -EOPNOTSUPP;
- break;
+ else {
+ atomic_inc(&rdev->nr_pending);
+ rcu_read_unlock();
+ ret = r_queue->issue_flush_fn(r_queue, bdev->bd_disk,
+ error_sector);
+ rdev_dec_pending(rdev, mddev);
+ rcu_read_lock();
}
-
- ret = r_queue->issue_flush_fn(r_queue, bdev->bd_disk, error_sector);
- if (ret)
- break;
}
}
+ rcu_read_unlock();
return ret;
}
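Folding `ret == 0` into the loop condition replaces the old break statements: the first flush failure still stops iteration, but control now always falls through to the single rcu_read_unlock() after the loop no matter how it terminates.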
if ( bio_data_dir(bi) == WRITE )
md_write_start(mddev);
for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) {
+ DEFINE_WAIT(w);
new_sector = raid5_compute_sector(logical_sector,
raid_disks, data_disks, &dd_idx, &pd_idx, conf);
- PRINTK("raid5: make_request, sector %Lu logical %Lu\n",
+ PRINTK("raid5: make_request, sector %llu logical %llu\n",
(unsigned long long)new_sector,
(unsigned long long)logical_sector);
+ retry:
+ prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE);
sh = get_active_stripe(conf, new_sector, pd_idx, (bi->bi_rw&RWA_MASK));
if (sh) {
-
- add_stripe_bio(sh, bi, dd_idx, (bi->bi_rw&RW_MASK));
-
+ if (!add_stripe_bio(sh, bi, dd_idx, (bi->bi_rw&RW_MASK))) {
+ /* Add failed due to overlap. Flush everything
+ * and wait a while
+ */
+ raid5_unplug_device(mddev->queue);
+ release_stripe(sh);
+ schedule();
+ goto retry;
+ }
+ finish_wait(&conf->wait_for_overlap, &w);
raid5_plug_device(conf);
handle_stripe(sh);
release_stripe(sh);
+
} else {
/* cannot get stripe for read-ahead, just give-up */
clear_bit(BIO_UPTODATE, &bi->bi_flags);
+ finish_wait(&conf->wait_for_overlap, &w);
break;
}
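The DEFINE_WAIT()/prepare_to_wait()/finish_wait() sequence is the standard wait-queue idiom: registering on conf->wait_for_overlap before attempting the add closes the window in which a wake_up() from a completing stripe could slip between a failed attempt and the sleep. Generically (hypothetical names, a sketch only):

	DEFINE_WAIT(w);
 retry:
	prepare_to_wait(&wq, &w, TASK_UNINTERRUPTIBLE);
	if (!try_operation()) {
		schedule();		/* ended by wake_up(&wq) */
		goto retry;
	}
	finish_wait(&wq, &w);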
goto abort;
memset(conf->stripe_hashtbl, 0, HASH_PAGES * PAGE_SIZE);
- conf->device_lock = SPIN_LOCK_UNLOCKED;
+ spin_lock_init(&conf->device_lock);
init_waitqueue_head(&conf->wait_for_stripe);
+ init_waitqueue_head(&conf->wait_for_overlap);
INIT_LIST_HEAD(&conf->handle_list);
INIT_LIST_HEAD(&conf->delayed_list);
INIT_LIST_HEAD(&conf->inactive_list);
mddev->thread = NULL;
shrink_stripes(conf);
free_pages((unsigned long) conf->stripe_hashtbl, HASH_PAGES_ORDER);
+ blk_sync_queue(mddev->queue); /* the unplug fn references 'conf' */
kfree(conf);
mddev->private = NULL;
return 0;
raid5_conf_t *conf = mddev->private;
struct disk_info *tmp;
- spin_lock_irq(&conf->device_lock);
for (i = 0; i < conf->raid_disks; i++) {
tmp = conf->disks + i;
if (tmp->rdev
tmp->rdev->in_sync = 1;
}
}
- spin_unlock_irq(&conf->device_lock);
print_raid5_conf(conf);
return 0;
}
static int raid5_remove_disk(mddev_t *mddev, int number)
{
raid5_conf_t *conf = mddev->private;
- int err = 1;
+ int err = 0;
+ mdk_rdev_t *rdev;
struct disk_info *p = conf->disks + number;
print_raid5_conf(conf);
- spin_lock_irq(&conf->device_lock);
-
- if (p->rdev) {
- if (p->rdev->in_sync ||
- atomic_read(&p->rdev->nr_pending)) {
+ rdev = p->rdev;
+ if (rdev) {
+ if (rdev->in_sync ||
+ atomic_read(&rdev->nr_pending)) {
err = -EBUSY;
goto abort;
}
p->rdev = NULL;
- err = 0;
+ synchronize_kernel();
+ if (atomic_read(&rdev->nr_pending)) {
+ /* lost the race, try later */
+ err = -EBUSY;
+ p->rdev = rdev;
+ }
}
- if (err)
- MD_BUG();
abort:
- spin_unlock_irq(&conf->device_lock);
+
print_raid5_conf(conf);
return err;
}
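The removal path leans on the same RCU discipline as the I/O paths: after p->rdev is cleared, synchronize_kernel() waits out every reader currently inside an rcu_read_lock() section, so a reader that fetched the old pointer has either finished with it or bumped nr_pending by the time the re-check runs; a non-zero count means the race was lost and the pointer is put back for a later retry.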
int disk;
struct disk_info *p;
- spin_lock_irq(&conf->device_lock);
/*
* find the disk ...
*/
for (disk=0; disk < mddev->raid_disks; disk++)
if ((p=conf->disks + disk)->rdev == NULL) {
- p->rdev = rdev;
rdev->in_sync = 0;
rdev->raid_disk = disk;
found = 1;
+ p->rdev = rdev;
break;
}
- spin_unlock_irq(&conf->device_lock);
print_raid5_conf(conf);
return found;
}