/*
 * raid5.c : Multiple Devices driver for Linux
 *	   Copyright (C) 1996, 1997 Ingo Molnar, Miguel de Icaza, Gadi Oxman
 *	   Copyright (C) 1999, 2000 Ingo Molnar
 *
 * RAID-5 management functions.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2, or (at your option)
 * any later version.
 *
 * You should have received a copy of the GNU General Public License
 * (for example /usr/src/linux/COPYING); if not, write to the Free
 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */
#include <linux/config.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/raid/raid5.h>
#include <linux/highmem.h>
#include <asm/bitops.h>
#include <asm/atomic.h>
/*
 * Stripe cache
 */

#define NR_STRIPES		256
#define STRIPE_SIZE		PAGE_SIZE
#define STRIPE_SHIFT		(PAGE_SHIFT - 9)
#define STRIPE_SECTORS		(STRIPE_SIZE>>9)
#define	IO_THRESHOLD		1
#define HASH_PAGES		1
#define HASH_PAGES_ORDER	0
#define NR_HASH			(HASH_PAGES * PAGE_SIZE / sizeof(struct stripe_head *))
#define HASH_MASK		(NR_HASH - 1)
#define stripe_hash(conf, sect)	((conf)->stripe_hashtbl[((sect) >> STRIPE_SHIFT) & HASH_MASK])
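/* Worked example (illustrative, assuming 4KiB pages and 8-byte pointers):
 * STRIPE_SHIFT is 12 - 9 = 3, NR_HASH is 4096/8 = 512, HASH_MASK is 511,
 * so a stripe starting at sector 80 hashes to bucket (80 >> 3) & 511 = 10.
 * All sectors of that stripe (80..87) shift down to the same bucket.
 */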
/* bio's attached to a stripe+device for I/O are linked together in bi_sector
 * order without overlap.  There may be several bio's per stripe+device, and
 * a bio could span several devices.
 * When walking this list for a particular stripe+device, we must never proceed
 * beyond a bio that extends past this device, as the next bio might no longer
 * be valid.
 * This macro is used to determine the 'next' bio in the list, given the sector
 * of the current stripe+device
 */
#define r5_next_bio(bio, sect) ( ( bio->bi_sector + (bio->bi_size>>9) < sect + STRIPE_SECTORS) ? bio->bi_next : NULL)
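/* Worked example (illustrative): with STRIPE_SECTORS == 8 and a stripe+device
 * starting at sect == 8, a bio covering sectors 8..11 ends at 12 < 16, so the
 * walk continues with bio->bi_next; a bio covering 8..15 (or beyond) ends at
 * 16, which is not below sect + STRIPE_SECTORS, so the walk stops with NULL.
 */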
/*
 * The following can be used to debug the driver
 */
#define RAID5_DEBUG	0
#define RAID5_PARANOIA	1
#if RAID5_PARANOIA && CONFIG_SMP
# define CHECK_DEVLOCK() if (!spin_is_locked(&conf->device_lock)) BUG()
#else
# define CHECK_DEVLOCK()
#endif

#define PRINTK(x...) ((void)(RAID5_DEBUG && printk(x)))
static void print_raid5_conf (raid5_conf_t *conf);
static inline void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh)
{
	if (atomic_dec_and_test(&sh->count)) {
		if (!list_empty(&sh->lru))
			BUG();
		if (atomic_read(&conf->active_stripes)==0)
			BUG();
		if (test_bit(STRIPE_HANDLE, &sh->state)) {
			if (test_bit(STRIPE_DELAYED, &sh->state))
				list_add_tail(&sh->lru, &conf->delayed_list);
			else
				list_add_tail(&sh->lru, &conf->handle_list);
			md_wakeup_thread(conf->mddev->thread);
		} else {
			if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
				atomic_dec(&conf->preread_active_stripes);
				if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD)
					md_wakeup_thread(conf->mddev->thread);
			}
			list_add_tail(&sh->lru, &conf->inactive_list);
			atomic_dec(&conf->active_stripes);
			if (!conf->inactive_blocked ||
			    atomic_read(&conf->active_stripes) < (NR_STRIPES*3/4))
				wake_up(&conf->wait_for_stripe);
		}
	}
}
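/* Descriptive note: when the last reference is dropped, a stripe that still
 * needs service (STRIPE_HANDLE) goes back on handle_list (or delayed_list if
 * its pre-reads are being deferred) and raid5d is woken; otherwise it returns
 * to inactive_list, where get_free_stripe() can reclaim it.
 */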
static void release_stripe(struct stripe_head *sh)
{
	raid5_conf_t *conf = sh->raid_conf;
	unsigned long flags;

	spin_lock_irqsave(&conf->device_lock, flags);
	__release_stripe(conf, sh);
	spin_unlock_irqrestore(&conf->device_lock, flags);
}
static void remove_hash(struct stripe_head *sh)
{
	PRINTK("remove_hash(), stripe %llu\n", (unsigned long long)sh->sector);

	if (sh->hash_pprev) {
		if (sh->hash_next)
			sh->hash_next->hash_pprev = sh->hash_pprev;
		*sh->hash_pprev = sh->hash_next;
		sh->hash_pprev = NULL;
	}
}
static __inline__ void insert_hash(raid5_conf_t *conf, struct stripe_head *sh)
{
	struct stripe_head **shp = &stripe_hash(conf, sh->sector);

	PRINTK("insert_hash(), stripe %llu\n", (unsigned long long)sh->sector);

	CHECK_DEVLOCK();
	if ((sh->hash_next = *shp) != NULL)
		(*shp)->hash_pprev = &sh->hash_next;
	*shp = sh;
	sh->hash_pprev = shp;
}
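/* Descriptive note: the chain uses the classic pprev idiom - hash_pprev
 * points at whatever pointer points at this stripe (the bucket head or the
 * previous stripe's hash_next), so remove_hash() can unlink in O(1) without
 * re-walking the bucket.
 */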
/* find an idle stripe, make sure it is unhashed, and return it. */
static struct stripe_head *get_free_stripe(raid5_conf_t *conf)
{
	struct stripe_head *sh = NULL;
	struct list_head *first;

	CHECK_DEVLOCK();
	if (list_empty(&conf->inactive_list))
		goto out;
	first = conf->inactive_list.next;
	sh = list_entry(first, struct stripe_head, lru);
	list_del_init(first);
	remove_hash(sh);
	atomic_inc(&conf->active_stripes);
out:
	return sh;
}
static void shrink_buffers(struct stripe_head *sh, int num)
{
	struct page *p;
	int i;

	for (i=0; i<num ; i++) {
		p = sh->dev[i].page;
		if (!p)
			continue;
		sh->dev[i].page = NULL;
		page_cache_release(p);
	}
}
static int grow_buffers(struct stripe_head *sh, int num)
{
	int i;

	for (i=0; i<num; i++) {
		struct page *page;

		if (!(page = alloc_page(GFP_KERNEL))) {
			return 1;
		}
		sh->dev[i].page = page;
	}
	return 0;
}
static void raid5_build_block (struct stripe_head *sh, int i);
static inline void init_stripe(struct stripe_head *sh, sector_t sector, int pd_idx)
{
	raid5_conf_t *conf = sh->raid_conf;
	int disks = conf->raid_disks, i;

	if (atomic_read(&sh->count) != 0)
		BUG();
	if (test_bit(STRIPE_HANDLE, &sh->state))
		BUG();

	CHECK_DEVLOCK();
	PRINTK("init_stripe called, stripe %llu\n",
		(unsigned long long)sh->sector);

	remove_hash(sh);

	sh->sector = sector;
	sh->pd_idx = pd_idx;
	sh->state = 0;

	for (i=disks; i--; ) {
		struct r5dev *dev = &sh->dev[i];

		if (dev->toread || dev->towrite || dev->written ||
		    test_bit(R5_LOCKED, &dev->flags)) {
			printk("sector=%llx i=%d %p %p %p %d\n",
			       (unsigned long long)sh->sector, i, dev->toread,
			       dev->towrite, dev->written,
			       test_bit(R5_LOCKED, &dev->flags));
			BUG();
		}
		dev->flags = 0;
		raid5_build_block(sh, i);
	}
	insert_hash(conf, sh);
}
static struct stripe_head *__find_stripe(raid5_conf_t *conf, sector_t sector)
{
	struct stripe_head *sh;

	CHECK_DEVLOCK();
	PRINTK("__find_stripe, sector %llu\n", (unsigned long long)sector);
	for (sh = stripe_hash(conf, sector); sh; sh = sh->hash_next)
		if (sh->sector == sector)
			return sh;
	PRINTK("__stripe %llu not in cache\n", (unsigned long long)sector);
	return NULL;
}
static void unplug_slaves(mddev_t *mddev);
static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector,
					     int pd_idx, int noblock)
{
	struct stripe_head *sh;

	PRINTK("get_stripe, sector %llu\n", (unsigned long long)sector);

	spin_lock_irq(&conf->device_lock);

	do {
		sh = __find_stripe(conf, sector);
		if (!sh) {
			if (!conf->inactive_blocked)
				sh = get_free_stripe(conf);
			if (noblock && sh == NULL)
				break;
			if (!sh) {
				conf->inactive_blocked = 1;
				wait_event_lock_irq(conf->wait_for_stripe,
						    !list_empty(&conf->inactive_list) &&
						    (atomic_read(&conf->active_stripes) < (NR_STRIPES *3/4)
						     || !conf->inactive_blocked),
						    conf->device_lock,
						    unplug_slaves(conf->mddev);
					);
				conf->inactive_blocked = 0;
			} else
				init_stripe(sh, sector, pd_idx);
		} else {
			if (atomic_read(&sh->count)) {
				if (!list_empty(&sh->lru))
					BUG();
			} else {
				if (!test_bit(STRIPE_HANDLE, &sh->state))
					atomic_inc(&conf->active_stripes);
				if (list_empty(&sh->lru))
					BUG();
				list_del_init(&sh->lru);
			}
		}
	} while (sh == NULL);

	if (sh)
		atomic_inc(&sh->count);

	spin_unlock_irq(&conf->device_lock);

	return sh;
}
static int grow_stripes(raid5_conf_t *conf, int num)
{
	struct stripe_head *sh;
	kmem_cache_t *sc;
	int devs = conf->raid_disks;

	sprintf(conf->cache_name, "raid5/%s", mdname(conf->mddev));

	sc = kmem_cache_create(conf->cache_name,
			       sizeof(struct stripe_head)+(devs-1)*sizeof(struct r5dev),
			       0, 0, NULL, NULL);
	if (!sc)
		return 1;
	conf->slab_cache = sc;
	while (num--) {
		sh = kmem_cache_alloc(sc, GFP_KERNEL);
		if (!sh)
			return 1;
		memset(sh, 0, sizeof(*sh) + (devs-1)*sizeof(struct r5dev));
		sh->raid_conf = conf;
		sh->lock = SPIN_LOCK_UNLOCKED;

		if (grow_buffers(sh, conf->raid_disks)) {
			shrink_buffers(sh, conf->raid_disks);
			kmem_cache_free(sc, sh);
			return 1;
		}
		/* we just created an active stripe so... */
		atomic_set(&sh->count, 1);
		atomic_inc(&conf->active_stripes);
		INIT_LIST_HEAD(&sh->lru);
		release_stripe(sh);
	}
	return 0;
}
static void shrink_stripes(raid5_conf_t *conf)
{
	struct stripe_head *sh;

	while (1) {
		spin_lock_irq(&conf->device_lock);
		sh = get_free_stripe(conf);
		spin_unlock_irq(&conf->device_lock);
		if (!sh)
			break;
		if (atomic_read(&sh->count))
			BUG();
		shrink_buffers(sh, conf->raid_disks);
		kmem_cache_free(conf->slab_cache, sh);
		atomic_dec(&conf->active_stripes);
	}
	kmem_cache_destroy(conf->slab_cache);
	conf->slab_cache = NULL;
}
static int raid5_end_read_request (struct bio * bi, unsigned int bytes_done,
				   int error)
{
	struct stripe_head *sh = bi->bi_private;
	raid5_conf_t *conf = sh->raid_conf;
	int disks = conf->raid_disks, i;
	int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);

	if (bi->bi_size)
		return 1;

	for (i=0 ; i<disks; i++)
		if (bi == &sh->dev[i].req)
			break;

	PRINTK("end_read_request %llu/%d, count: %d, uptodate %d.\n",
		(unsigned long long)sh->sector, i, atomic_read(&sh->count),
		uptodate);
	if (i == disks) {
		BUG();
		return 0;
	}

	if (uptodate) {
#if 0
		struct buffer_head *buffer, *bh;
		unsigned long flags;
		spin_lock_irqsave(&conf->device_lock, flags);
		/* we can return a buffer if we bypassed the cache or
		 * if the top buffer is not in highmem. If there are
		 * multiple buffers, leave the extra work to
		 * handle_stripe
		 */
		buffer = sh->bh_read[i];
		if (buffer &&
		    (!PageHighMem(buffer->b_page)
		     || buffer->b_page == bh->b_page )
			) {
			sh->bh_read[i] = buffer->b_reqnext;
			buffer->b_reqnext = NULL;
		} else
			buffer = NULL;
		spin_unlock_irqrestore(&conf->device_lock, flags);
		if (sh->bh_page[i]==bh->b_page)
			set_buffer_uptodate(bh);
		if (buffer) {
			if (buffer->b_page != bh->b_page)
				memcpy(buffer->b_data, bh->b_data, bh->b_size);
			buffer->b_end_io(buffer, 1);
		}
#endif
		set_bit(R5_UPTODATE, &sh->dev[i].flags);
	} else {
		md_error(conf->mddev, conf->disks[i].rdev);
		clear_bit(R5_UPTODATE, &sh->dev[i].flags);
	}
	atomic_dec(&conf->disks[i].rdev->nr_pending);
#if 0
	/* must restore b_page before unlocking buffer... */
	if (sh->bh_page[i] != bh->b_page) {
		bh->b_page = sh->bh_page[i];
		bh->b_data = page_address(bh->b_page);
		clear_buffer_uptodate(bh);
	}
#endif
	clear_bit(R5_LOCKED, &sh->dev[i].flags);
	set_bit(STRIPE_HANDLE, &sh->state);
	release_stripe(sh);
	return 0;
}
static int raid5_end_write_request (struct bio *bi, unsigned int bytes_done,
				    int error)
{
	struct stripe_head *sh = bi->bi_private;
	raid5_conf_t *conf = sh->raid_conf;
	int disks = conf->raid_disks, i;
	unsigned long flags;
	int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);

	if (bi->bi_size)
		return 1;

	for (i=0 ; i<disks; i++)
		if (bi == &sh->dev[i].req)
			break;

	PRINTK("end_write_request %llu/%d, count %d, uptodate: %d.\n",
		(unsigned long long)sh->sector, i, atomic_read(&sh->count),
		uptodate);
	if (i == disks) {
		BUG();
		return 0;
	}

	spin_lock_irqsave(&conf->device_lock, flags);
	if (!uptodate)
		md_error(conf->mddev, conf->disks[i].rdev);

	atomic_dec(&conf->disks[i].rdev->nr_pending);

	clear_bit(R5_LOCKED, &sh->dev[i].flags);
	set_bit(STRIPE_HANDLE, &sh->state);
	__release_stripe(conf, sh);
	spin_unlock_irqrestore(&conf->device_lock, flags);
	return 0;
}
static sector_t compute_blocknr(struct stripe_head *sh, int i);
static void raid5_build_block (struct stripe_head *sh, int i)
{
	struct r5dev *dev = &sh->dev[i];

	bio_init(&dev->req);
	dev->req.bi_io_vec = &dev->vec;
	dev->req.bi_vcnt++;
	dev->vec.bv_page = dev->page;
	dev->vec.bv_len = STRIPE_SIZE;
	dev->vec.bv_offset = 0;

	dev->req.bi_sector = sh->sector;
	dev->req.bi_private = sh;

	dev->flags = 0;
	dev->sector = compute_blocknr(sh, i);
}
static void error(mddev_t *mddev, mdk_rdev_t *rdev)
{
	char b[BDEVNAME_SIZE];
	raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
	PRINTK("raid5: error called\n");

	if (!rdev->faulty) {
		mddev->sb_dirty = 1;
		if (rdev->in_sync) {
			conf->working_disks--;
			mddev->degraded++;
			conf->failed_disks++;
			rdev->in_sync = 0;
			/*
			 * if recovery was running, make sure it aborts.
			 */
			set_bit(MD_RECOVERY_ERR, &mddev->recovery);
		}
		rdev->faulty = 1;
		printk (KERN_ALERT
			"raid5: Disk failure on %s, disabling device."
			" Operation continuing on %d devices\n",
			bdevname(rdev->bdev,b), conf->working_disks);
	}
}
/*
 * Input: a 'big' sector number,
 * Output: index of the data and parity disk, and the sector # in them.
 */
static sector_t raid5_compute_sector(sector_t r_sector, unsigned int raid_disks,
			unsigned int data_disks, unsigned int * dd_idx,
			unsigned int * pd_idx, raid5_conf_t *conf)
{
	long stripe;
	unsigned long chunk_number;
	unsigned int chunk_offset;
	sector_t new_sector;
	int sectors_per_chunk = conf->chunk_size >> 9;

	/* First compute the information on this sector */

	/*
	 * Compute the chunk number and the sector offset inside the chunk
	 */
	chunk_offset = sector_div(r_sector, sectors_per_chunk);
	chunk_number = r_sector;
	BUG_ON(r_sector != chunk_number);

	/*
	 * Compute the stripe number
	 */
	stripe = chunk_number / data_disks;

	/*
	 * Compute the data disk and parity disk indexes inside the stripe
	 */
	*dd_idx = chunk_number % data_disks;

	/*
	 * Select the parity disk based on the user selected algorithm.
	 */
	if (conf->level == 4)
		*pd_idx = data_disks;
	else switch (conf->algorithm) {
		case ALGORITHM_LEFT_ASYMMETRIC:
			*pd_idx = data_disks - stripe % raid_disks;
			if (*dd_idx >= *pd_idx)
				(*dd_idx)++;
			break;
		case ALGORITHM_RIGHT_ASYMMETRIC:
			*pd_idx = stripe % raid_disks;
			if (*dd_idx >= *pd_idx)
				(*dd_idx)++;
			break;
		case ALGORITHM_LEFT_SYMMETRIC:
			*pd_idx = data_disks - stripe % raid_disks;
			*dd_idx = (*pd_idx + 1 + *dd_idx) % raid_disks;
			break;
		case ALGORITHM_RIGHT_SYMMETRIC:
			*pd_idx = stripe % raid_disks;
			*dd_idx = (*pd_idx + 1 + *dd_idx) % raid_disks;
			break;
		default:
			printk("raid5: unsupported algorithm %d\n",
				conf->algorithm);
	}

	/*
	 * Finally, compute the new sector number
	 */
	new_sector = (sector_t)stripe * sectors_per_chunk + chunk_offset;
	return new_sector;
}
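/* Worked example (illustrative): raid_disks = 4, data_disks = 3, 64KiB
 * chunks (sectors_per_chunk = 128), r_sector = 1000.  Then chunk_offset =
 * 1000 % 128 = 104, chunk_number = 7, stripe = 7/3 = 2, dd_idx = 7%3 = 1.
 * With ALGORITHM_LEFT_ASYMMETRIC, pd_idx = 3 - 2%4 = 1; since dd_idx >=
 * pd_idx, dd_idx becomes 2.  The sector within each device is
 * new_sector = 2*128 + 104 = 360.
 */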
static sector_t compute_blocknr(struct stripe_head *sh, int i)
{
	raid5_conf_t *conf = sh->raid_conf;
	int raid_disks = conf->raid_disks, data_disks = raid_disks - 1;
	sector_t new_sector = sh->sector, check;
	int sectors_per_chunk = conf->chunk_size >> 9;
	sector_t stripe;
	int chunk_offset;
	int chunk_number, dummy1, dummy2, dd_idx = i;
	sector_t r_sector;

	chunk_offset = sector_div(new_sector, sectors_per_chunk);
	stripe = new_sector;
	BUG_ON(new_sector != stripe);

	switch (conf->algorithm) {
		case ALGORITHM_LEFT_ASYMMETRIC:
		case ALGORITHM_RIGHT_ASYMMETRIC:
			if (i > sh->pd_idx)
				i--;
			break;
		case ALGORITHM_LEFT_SYMMETRIC:
		case ALGORITHM_RIGHT_SYMMETRIC:
			if (i < sh->pd_idx)
				i += raid_disks;
			i -= (sh->pd_idx + 1);
			break;
		default:
			printk("raid5: unsupported algorithm %d\n",
				conf->algorithm);
	}

	chunk_number = stripe * data_disks + i;
	r_sector = (sector_t)chunk_number * sectors_per_chunk + chunk_offset;

	check = raid5_compute_sector (r_sector, raid_disks, data_disks, &dummy1, &dummy2, conf);
	if (check != sh->sector || dummy1 != dd_idx || dummy2 != sh->pd_idx) {
		printk("compute_blocknr: map not correct\n");
		return 0;
	}
	return r_sector;
}
/*
 * Copy data between a page in the stripe cache, and one or more bion
 * The page could align with the middle of the bio, or there could be
 * several bion, each with several bio_vecs, which cover part of the page
 * Multiple bion are linked together on bi_next.  There may be extras
 * at the end of this list.  We ignore them.
 */
static void copy_data(int frombio, struct bio *bio,
		     struct page *page,
		     sector_t sector)
{
	char *pa = page_address(page);
	struct bio_vec *bvl;
	int i;

	for (;bio && bio->bi_sector < sector+STRIPE_SECTORS;
	      bio = r5_next_bio(bio, sector) ) {
		int page_offset;
		if (bio->bi_sector >= sector)
			page_offset = (signed)(bio->bi_sector - sector) * 512;
		else
			page_offset = (signed)(sector - bio->bi_sector) * -512;
		bio_for_each_segment(bvl, bio, i) {
			int len = bio_iovec_idx(bio,i)->bv_len;
			int clen;
			int b_offset = 0;

			if (page_offset < 0) {
				b_offset = -page_offset;
				page_offset += b_offset;
				len -= b_offset;
			}

			if (len > 0 && page_offset + len > STRIPE_SIZE)
				clen = STRIPE_SIZE - page_offset;
			else clen = len;

			if (clen > 0) {
				char *ba = __bio_kmap_atomic(bio, i, KM_USER0);
				if (frombio)
					memcpy(pa+page_offset, ba+b_offset, clen);
				else
					memcpy(ba+b_offset, pa+page_offset, clen);
				__bio_kunmap_atomic(ba, KM_USER0);
			}
			if (clen < len) /* hit end of page */
				break;
			page_offset += len;
		}
	}
}
#define check_xor()	do {						\
			   if (count == MAX_XOR_BLOCKS) {		\
				xor_block(count, STRIPE_SIZE, ptr);	\
				count = 1;				\
			   }						\
			} while(0)
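/* Descriptive note: count resets to 1, not 0, because ptr[0] is the
 * destination buffer and stays in the array; each xor_block() call folds
 * up to MAX_XOR_BLOCKS-1 source pages into it.
 */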
static void compute_block(struct stripe_head *sh, int dd_idx)
{
	raid5_conf_t *conf = sh->raid_conf;
	int i, count, disks = conf->raid_disks;
	void *ptr[MAX_XOR_BLOCKS], *p;

	PRINTK("compute_block, stripe %llu, idx %d\n",
		(unsigned long long)sh->sector, dd_idx);

	ptr[0] = page_address(sh->dev[dd_idx].page);
	memset(ptr[0], 0, STRIPE_SIZE);
	count = 1;
	for (i = disks ; i--; ) {
		if (i == dd_idx)
			continue;
		p = page_address(sh->dev[i].page);
		if (test_bit(R5_UPTODATE, &sh->dev[i].flags))
			ptr[count++] = p;
		else
			printk("compute_block() %d, stripe %llu, %d"
				" not present\n", dd_idx,
				(unsigned long long)sh->sector, i);

		check_xor();
	}
	if (count != 1)
		xor_block(count, STRIPE_SIZE, ptr);
	set_bit(R5_UPTODATE, &sh->dev[dd_idx].flags);
}
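/* Worked example (illustrative): on a 4-disk array, parity P = D0^D1^D2.
 * If dd_idx holds D1, the loop XORs P, D0 and D2 into the zeroed page:
 * P^D0^D2 = (D0^D1^D2)^D0^D2 = D1, recovering the missing block.
 */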
static void compute_parity(struct stripe_head *sh, int method)
{
	raid5_conf_t *conf = sh->raid_conf;
	int i, pd_idx = sh->pd_idx, disks = conf->raid_disks, count;
	void *ptr[MAX_XOR_BLOCKS];
	struct bio *chosen;

	PRINTK("compute_parity, stripe %llu, method %d\n",
		(unsigned long long)sh->sector, method);

	count = 1;
	ptr[0] = page_address(sh->dev[pd_idx].page);
	switch(method) {
	case READ_MODIFY_WRITE:
		if (!test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags))
			BUG();
		for (i=disks ; i-- ;) {
			if (i==pd_idx)
				continue;
			if (sh->dev[i].towrite &&
			    test_bit(R5_UPTODATE, &sh->dev[i].flags)) {
				ptr[count++] = page_address(sh->dev[i].page);
				chosen = sh->dev[i].towrite;
				sh->dev[i].towrite = NULL;

				if (sh->dev[i].written) BUG();
				sh->dev[i].written = chosen;
				check_xor();
			}
		}
		break;
	case RECONSTRUCT_WRITE:
		memset(ptr[0], 0, STRIPE_SIZE);
		for (i= disks; i-- ;)
			if (i!=pd_idx && sh->dev[i].towrite) {
				chosen = sh->dev[i].towrite;
				sh->dev[i].towrite = NULL;

				if (sh->dev[i].written) BUG();
				sh->dev[i].written = chosen;
			}
		break;
	case CHECK_PARITY:
		break;
	}
	if (count>1) {
		xor_block(count, STRIPE_SIZE, ptr);
		count = 1;
	}

	for (i = disks; i--;)
		if (sh->dev[i].written) {
			sector_t sector = sh->dev[i].sector;
			struct bio *wbi = sh->dev[i].written;
			while (wbi && wbi->bi_sector < sector + STRIPE_SECTORS) {
				copy_data(1, wbi, sh->dev[i].page, sector);
				wbi = r5_next_bio(wbi, sector);
			}

			set_bit(R5_LOCKED, &sh->dev[i].flags);
			set_bit(R5_UPTODATE, &sh->dev[i].flags);
		}

	switch(method) {
	case RECONSTRUCT_WRITE:
	case CHECK_PARITY:
		for (i=disks; i--;)
			if (i != pd_idx) {
				ptr[count++] = page_address(sh->dev[i].page);
				check_xor();
			}
		break;
	case READ_MODIFY_WRITE:
		for (i = disks; i--;)
			if (sh->dev[i].written) {
				ptr[count++] = page_address(sh->dev[i].page);
				check_xor();
			}
	}
	if (count != 1)
		xor_block(count, STRIPE_SIZE, ptr);

	if (method != CHECK_PARITY) {
		set_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
		set_bit(R5_LOCKED,  &sh->dev[pd_idx].flags);
	} else
		clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
}
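/* Parity math recap (illustrative): READ_MODIFY_WRITE computes
 * P_new = P_old ^ D_old ^ D_new for each written block, so only the
 * changed data and the old parity must be up to date; RECONSTRUCT_WRITE
 * recomputes P = D0 ^ D1 ^ ... ^ Dn-1 from every data block instead.
 */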
/*
 * Each stripe/dev can have one or more bion attached.
 * toread/towrite point to the first in a chain.
 * The bi_next chain must be in order.
 */
static void add_stripe_bio (struct stripe_head *sh, struct bio *bi, int dd_idx, int forwrite)
{
	struct bio **bip;
	raid5_conf_t *conf = sh->raid_conf;

	PRINTK("adding bh b#%llu to stripe s#%llu\n",
		(unsigned long long)bi->bi_sector,
		(unsigned long long)sh->sector);

	spin_lock(&sh->lock);
	spin_lock_irq(&conf->device_lock);
	if (forwrite)
		bip = &sh->dev[dd_idx].towrite;
	else
		bip = &sh->dev[dd_idx].toread;
	while (*bip && (*bip)->bi_sector < bi->bi_sector) {
		BUG_ON((*bip)->bi_sector + ((*bip)->bi_size >> 9) > bi->bi_sector);
		bip = & (*bip)->bi_next;
	}
/* FIXME do I need to worry about overlapping bion */
	if (*bip && bi->bi_next && (*bip) != bi->bi_next)
		BUG();
	if (*bip)
		bi->bi_next = *bip;
	*bip = bi;
	bi->bi_phys_segments ++;
	spin_unlock_irq(&conf->device_lock);
	spin_unlock(&sh->lock);

	PRINTK("added bi b#%llu to stripe s#%llu, disk %d.\n",
		(unsigned long long)bi->bi_sector,
		(unsigned long long)sh->sector, dd_idx);

	if (forwrite) {
		/* check if page is covered */
		sector_t sector = sh->dev[dd_idx].sector;
		for (bi=sh->dev[dd_idx].towrite;
		     sector < sh->dev[dd_idx].sector + STRIPE_SECTORS &&
			     bi && bi->bi_sector <= sector;
		     bi = r5_next_bio(bi, sh->dev[dd_idx].sector)) {
			if (bi->bi_sector + (bi->bi_size>>9) >= sector)
				sector = bi->bi_sector + (bi->bi_size>>9);
		}
		if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS)
			set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags);
	}
}
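/* Worked example (illustrative): with STRIPE_SECTORS == 8 and a device
 * block at sector 0, queued writes covering 0..3 and 4..7 let 'sector'
 * advance 0 -> 4 -> 8; 8 >= 0 + 8, so R5_OVERWRITE is set and the old
 * block need not be read before the full-block write.
 */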
/*
 * handle_stripe - do things to a stripe.
 *
 * We lock the stripe and then examine the state of various bits
 * to see what needs to be done.
 * Possible results:
 *    return some read requests which now have data
 *    return some write requests which are safely on disc
 *    schedule a read on some buffers
 *    schedule a write of some buffers
 *    return confirmation of parity correctness
 *
 * Parity calculations are done inside the stripe lock
 * buffers are taken off read_list or write_list, and bh_cache buffers
 * get BH_Lock set before the stripe lock is released.
 *
 */
static void handle_stripe(struct stripe_head *sh)
{
	raid5_conf_t *conf = sh->raid_conf;
	int disks = conf->raid_disks;
	struct bio *return_bi= NULL;
	struct bio *bi;
	int i;
	int syncing;
	int locked=0, uptodate=0, to_read=0, to_write=0, failed=0, written=0;
	int non_overwrite = 0;
	int failed_num=0;
	struct r5dev *dev;

	PRINTK("handling stripe %llu, cnt=%d, pd_idx=%d\n",
		(unsigned long long)sh->sector, atomic_read(&sh->count),
		sh->pd_idx);

	spin_lock(&sh->lock);
	clear_bit(STRIPE_HANDLE, &sh->state);
	clear_bit(STRIPE_DELAYED, &sh->state);

	syncing = test_bit(STRIPE_SYNCING, &sh->state);
	/* Now to look around and see what can be done */
	for (i=disks; i--; ) {
		mdk_rdev_t *rdev;
		dev = &sh->dev[i];
		clear_bit(R5_Insync, &dev->flags);
		clear_bit(R5_Syncio, &dev->flags);

		PRINTK("check %d: state 0x%lx read %p write %p written %p\n",
			i, dev->flags, dev->toread, dev->towrite, dev->written);
		/* maybe we can reply to a read */
		if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread) {
			struct bio *rbi, *rbi2;
			PRINTK("Return read for disc %d\n", i);
			spin_lock_irq(&conf->device_lock);
			rbi = dev->toread;
			dev->toread = NULL;
			spin_unlock_irq(&conf->device_lock);
			while (rbi && rbi->bi_sector < dev->sector + STRIPE_SECTORS) {
				copy_data(0, rbi, dev->page, dev->sector);
				rbi2 = r5_next_bio(rbi, dev->sector);
				spin_lock_irq(&conf->device_lock);
				if (--rbi->bi_phys_segments == 0) {
					rbi->bi_next = return_bi;
					return_bi = rbi;
				}
				spin_unlock_irq(&conf->device_lock);
				rbi = rbi2;
			}
		}

		/* now count some things */
		if (test_bit(R5_LOCKED, &dev->flags)) locked++;
		if (test_bit(R5_UPTODATE, &dev->flags)) uptodate++;

		if (dev->toread) to_read++;
		if (dev->towrite) {
			to_write++;
			if (!test_bit(R5_OVERWRITE, &dev->flags))
				non_overwrite++;
		}
		if (dev->written) written++;
		rdev = conf->disks[i].rdev; /* FIXME, should I be looking rdev */
		if (!rdev || !rdev->in_sync) {
			failed++;
			failed_num = i;
		} else
			set_bit(R5_Insync, &dev->flags);
	}
	PRINTK("locked=%d uptodate=%d to_read=%d"
		" to_write=%d failed=%d failed_num=%d\n",
		locked, uptodate, to_read, to_write, failed, failed_num);
	/* check if the array has lost two devices and, if so, some requests might
	 * need to be failed
	 */
	if (failed > 1 && to_read+to_write+written) {
		spin_lock_irq(&conf->device_lock);
		for (i=disks; i--; ) {
			/* fail all writes first */
			bi = sh->dev[i].towrite;
			sh->dev[i].towrite = NULL;
			if (bi) to_write--;

			while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS){
				struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector);
				clear_bit(BIO_UPTODATE, &bi->bi_flags);
				if (--bi->bi_phys_segments == 0) {
					md_write_end(conf->mddev);
					bi->bi_next = return_bi;
					return_bi = bi;
				}
				bi = nextbi;
			}
			/* and fail all 'written' */
			bi = sh->dev[i].written;
			sh->dev[i].written = NULL;
			while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS) {
				struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector);
				clear_bit(BIO_UPTODATE, &bi->bi_flags);
				if (--bi->bi_phys_segments == 0) {
					md_write_end(conf->mddev);
					bi->bi_next = return_bi;
					return_bi = bi;
				}
				bi = bi2;
			}

			/* fail any reads if this device is non-operational */
			if (!test_bit(R5_Insync, &sh->dev[i].flags)) {
				bi = sh->dev[i].toread;
				sh->dev[i].toread = NULL;
				if (bi) to_read--;
				while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS){
					struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector);
					clear_bit(BIO_UPTODATE, &bi->bi_flags);
					if (--bi->bi_phys_segments == 0) {
						bi->bi_next = return_bi;
						return_bi = bi;
					}
					bi = nextbi;
				}
			}
		}
		spin_unlock_irq(&conf->device_lock);
	}
	if (failed > 1 && syncing) {
		md_done_sync(conf->mddev, STRIPE_SECTORS,0);
		clear_bit(STRIPE_SYNCING, &sh->state);
		syncing = 0;
	}
	/* might be able to return some write requests if the parity block
	 * is safe, or on a failed drive
	 */
	dev = &sh->dev[sh->pd_idx];
	if ( written &&
	     ( (test_bit(R5_Insync, &dev->flags) && !test_bit(R5_LOCKED, &dev->flags) &&
		test_bit(R5_UPTODATE, &dev->flags))
	       || (failed == 1 && failed_num == sh->pd_idx))
	    ) {
		/* any written block on an uptodate or failed drive can be returned.
		 * Note that if we 'wrote' to a failed drive, it will be UPTODATE, but
		 * never LOCKED, so we don't need to test 'failed' directly.
		 */
		for (i=disks; i--; )
			if (sh->dev[i].written) {
				dev = &sh->dev[i];
				if (!test_bit(R5_LOCKED, &dev->flags) &&
				    test_bit(R5_UPTODATE, &dev->flags) ) {
					/* We can return any write requests */
					struct bio *wbi, *wbi2;
					PRINTK("Return write for disc %d\n", i);
					spin_lock_irq(&conf->device_lock);
					wbi = dev->written;
					dev->written = NULL;
					while (wbi && wbi->bi_sector < dev->sector + STRIPE_SECTORS) {
						wbi2 = r5_next_bio(wbi, dev->sector);
						if (--wbi->bi_phys_segments == 0) {
							md_write_end(conf->mddev);
							wbi->bi_next = return_bi;
							return_bi = wbi;
						}
						wbi = wbi2;
					}
					spin_unlock_irq(&conf->device_lock);
				}
			}
	}
	/* Now we might consider reading some blocks, either to check/generate
	 * parity, or to satisfy requests
	 * or to load a block that is being partially written.
	 */
	if (to_read || non_overwrite || (syncing && (uptodate+failed < disks))) {
		for (i=disks; i--;) {
			dev = &sh->dev[i];
			if (!test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) &&
			    (dev->toread ||
			     (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) ||
			     syncing ||
			     (failed && (sh->dev[failed_num].toread ||
					 (sh->dev[failed_num].towrite && !test_bit(R5_OVERWRITE, &sh->dev[failed_num].flags))))
				    )
				) {
				/* we would like to get this block, possibly
				 * by computing it, but we might not be able to
				 */
				if (uptodate == disks-1) {
					PRINTK("Computing block %d\n", i);
					compute_block(sh, i);
					uptodate++;
				} else if (test_bit(R5_Insync, &dev->flags)) {
					set_bit(R5_LOCKED, &dev->flags);
					set_bit(R5_Wantread, &dev->flags);
#if 0
					/* if I am just reading this block and we don't have
					   a failed drive, or any pending writes then sidestep the cache */
					if (sh->bh_read[i] && !sh->bh_read[i]->b_reqnext &&
					    ! syncing && !failed && !to_write) {
						sh->bh_cache[i]->b_page = sh->bh_read[i]->b_page;
						sh->bh_cache[i]->b_data = sh->bh_read[i]->b_data;
					}
#endif
					locked++;
					PRINTK("Reading block %d (sync=%d)\n",
						i, syncing);
					if (syncing)
						md_sync_acct(conf->disks[i].rdev, STRIPE_SECTORS);
				}
			}
		}
		set_bit(STRIPE_HANDLE, &sh->state);
	}
	/* now to consider writing and what else, if anything should be read */
	if (to_write) {
		int rmw=0, rcw=0;
		for (i=disks ; i--;) {
			/* would I have to read this buffer for read_modify_write */
			dev = &sh->dev[i];
			if ((dev->towrite || i == sh->pd_idx) &&
			    (!test_bit(R5_LOCKED, &dev->flags)
#if 0
			     || sh->bh_page[i]!=bh->b_page
#endif
				    ) &&
			    !test_bit(R5_UPTODATE, &dev->flags)) {
				if (test_bit(R5_Insync, &dev->flags)
/*				    && !(!mddev->insync && i == sh->pd_idx) */
					)
					rmw++;
				else rmw += 2*disks; /* cannot read it */
			}
			/* Would I have to read this buffer for reconstruct_write */
			if (!test_bit(R5_OVERWRITE, &dev->flags) && i != sh->pd_idx &&
			    (!test_bit(R5_LOCKED, &dev->flags)
#if 0
			     || sh->bh_page[i] != bh->b_page
#endif
				    ) &&
			    !test_bit(R5_UPTODATE, &dev->flags)) {
				if (test_bit(R5_Insync, &dev->flags)) rcw++;
				else rcw += 2*disks;
			}
		}
		PRINTK("for sector %llu, rmw=%d rcw=%d\n",
			(unsigned long long)sh->sector, rmw, rcw);
		set_bit(STRIPE_HANDLE, &sh->state);
		if (rmw < rcw && rmw > 0)
			/* prefer read-modify-write, but need to get some data */
			for (i=disks; i--;) {
				dev = &sh->dev[i];
				if ((dev->towrite || i == sh->pd_idx) &&
				    !test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) &&
				    test_bit(R5_Insync, &dev->flags)) {
					if (test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
					{
						PRINTK("Read_old block %d for r-m-w\n", i);
						set_bit(R5_LOCKED, &dev->flags);
						set_bit(R5_Wantread, &dev->flags);
						locked++;
					} else {
						set_bit(STRIPE_DELAYED, &sh->state);
						set_bit(STRIPE_HANDLE, &sh->state);
					}
				}
			}
		if (rcw <= rmw && rcw > 0)
			/* want reconstruct write, but need to get some data */
			for (i=disks; i--;) {
				dev = &sh->dev[i];
				if (!test_bit(R5_OVERWRITE, &dev->flags) && i != sh->pd_idx &&
				    !test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) &&
				    test_bit(R5_Insync, &dev->flags)) {
					if (test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
					{
						PRINTK("Read_old block %d for Reconstruct\n", i);
						set_bit(R5_LOCKED, &dev->flags);
						set_bit(R5_Wantread, &dev->flags);
						locked++;
					} else {
						set_bit(STRIPE_DELAYED, &sh->state);
						set_bit(STRIPE_HANDLE, &sh->state);
					}
				}
			}
		/* now if nothing is locked, and if we have enough data, we can start a write request */
		if (locked == 0 && (rcw == 0 || rmw == 0)) {
			PRINTK("Computing parity...\n");
			compute_parity(sh, rcw==0 ? RECONSTRUCT_WRITE : READ_MODIFY_WRITE);
			/* now every locked buffer is ready to be written */
			for (i=disks; i--;)
				if (test_bit(R5_LOCKED, &sh->dev[i].flags)) {
					PRINTK("Writing block %d\n", i);
					locked++;
					set_bit(R5_Wantwrite, &sh->dev[i].flags);
					if (!test_bit(R5_Insync, &sh->dev[i].flags)
					    || (i==sh->pd_idx && failed == 0))
						set_bit(STRIPE_INSYNC, &sh->state);
				}
			if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
				atomic_dec(&conf->preread_active_stripes);
				if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD)
					md_wakeup_thread(conf->mddev->thread);
			}
		}
	}
	/* maybe we need to check and possibly fix the parity for this stripe
	 * Any reads will already have been scheduled, so we just see if enough data
	 * is available
	 */
	if (syncing && locked == 0 &&
	    !test_bit(STRIPE_INSYNC, &sh->state) && failed <= 1) {
		set_bit(STRIPE_HANDLE, &sh->state);
		if (failed == 0) {
			char *pagea;
			if (uptodate != disks)
				BUG();
			compute_parity(sh, CHECK_PARITY);
			uptodate--;
			pagea = page_address(sh->dev[sh->pd_idx].page);
			if ((*(u32*)pagea) == 0 &&
			    !memcmp(pagea, pagea+4, STRIPE_SIZE-4)) {
				/* parity is correct (on disc, not in buffer any more) */
				set_bit(STRIPE_INSYNC, &sh->state);
			}
		}
		if (!test_bit(STRIPE_INSYNC, &sh->state)) {
			if (failed==0)
				failed_num = sh->pd_idx;
			/* should be able to compute the missing block and write it to spare */
			if (!test_bit(R5_UPTODATE, &sh->dev[failed_num].flags)) {
				if (uptodate+1 != disks)
					BUG();
				compute_block(sh, failed_num);
				uptodate++;
			}
			if (uptodate != disks)
				BUG();
			dev = &sh->dev[failed_num];
			set_bit(R5_LOCKED, &dev->flags);
			set_bit(R5_Wantwrite, &dev->flags);
			locked++;
			set_bit(STRIPE_INSYNC, &sh->state);
			set_bit(R5_Syncio, &dev->flags);
		}
	}
	if (syncing && locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) {
		md_done_sync(conf->mddev, STRIPE_SECTORS,1);
		clear_bit(STRIPE_SYNCING, &sh->state);
	}
	spin_unlock(&sh->lock);

	while ((bi=return_bi)) {
		int bytes = bi->bi_size;

		return_bi = bi->bi_next;
		bi->bi_next = NULL;
		bi->bi_size = 0;
		bi->bi_end_io(bi, bytes, 0);
	}
	for (i=disks; i-- ;) {
		int rw;
		struct bio *bi;
		mdk_rdev_t *rdev;
		if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags))
			rw = 1;
		else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
			rw = 0;
		else
			continue;

		bi = &sh->dev[i].req;

		bi->bi_rw = rw;
		if (rw)
			bi->bi_end_io = raid5_end_write_request;
		else
			bi->bi_end_io = raid5_end_read_request;

		spin_lock_irq(&conf->device_lock);
		rdev = conf->disks[i].rdev;
		if (rdev && rdev->faulty)
			rdev = NULL;
		if (rdev)
			atomic_inc(&rdev->nr_pending);
		spin_unlock_irq(&conf->device_lock);

		if (rdev) {
			if (test_bit(R5_Syncio, &sh->dev[i].flags))
				md_sync_acct(rdev, STRIPE_SECTORS);

			bi->bi_bdev = rdev->bdev;
			PRINTK("for %llu schedule op %ld on disc %d\n",
				(unsigned long long)sh->sector, bi->bi_rw, i);
			atomic_inc(&sh->count);
			bi->bi_sector = sh->sector + rdev->data_offset;
			bi->bi_flags = 1 << BIO_UPTODATE;
			bi->bi_vcnt = 1;
			bi->bi_idx = 0;
			bi->bi_io_vec = &sh->dev[i].vec;
			bi->bi_io_vec[0].bv_len = STRIPE_SIZE;
			bi->bi_io_vec[0].bv_offset = 0;
			bi->bi_size = STRIPE_SIZE;
			bi->bi_next = NULL;
			generic_make_request(bi);
		} else {
			PRINTK("skip op %ld on disc %d for sector %llu\n",
				bi->bi_rw, i, (unsigned long long)sh->sector);
			clear_bit(R5_LOCKED, &sh->dev[i].flags);
			set_bit(STRIPE_HANDLE, &sh->state);
		}
	}
}
static inline void raid5_activate_delayed(raid5_conf_t *conf)
{
	if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) {
		while (!list_empty(&conf->delayed_list)) {
			struct list_head *l = conf->delayed_list.next;
			struct stripe_head *sh;
			sh = list_entry(l, struct stripe_head, lru);
			list_del_init(l);
			clear_bit(STRIPE_DELAYED, &sh->state);
			if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
				atomic_inc(&conf->preread_active_stripes);
			list_add_tail(&sh->lru, &conf->handle_list);
		}
	}
}
static void unplug_slaves(mddev_t *mddev)
{
	raid5_conf_t *conf = mddev_to_conf(mddev);
	int i;

	for (i=0; i<mddev->raid_disks; i++) {
		mdk_rdev_t *rdev = conf->disks[i].rdev;
		if (rdev && !rdev->faulty) {
			struct block_device *bdev = rdev->bdev;
			if (bdev) {
				request_queue_t *r_queue = bdev_get_queue(bdev);
				if (r_queue && r_queue->unplug_fn)
					r_queue->unplug_fn(r_queue);
			}
		}
	}
}
static void raid5_unplug_device(request_queue_t *q)
{
	mddev_t *mddev = q->queuedata;
	raid5_conf_t *conf = mddev_to_conf(mddev);
	unsigned long flags;

	spin_lock_irqsave(&conf->device_lock, flags);

	if (blk_remove_plug(q))
		raid5_activate_delayed(conf);
	md_wakeup_thread(mddev->thread);

	spin_unlock_irqrestore(&conf->device_lock, flags);

	unplug_slaves(mddev);
}
static inline void raid5_plug_device(raid5_conf_t *conf)
{
	spin_lock_irq(&conf->device_lock);
	blk_plug_device(conf->mddev->queue);
	spin_unlock_irq(&conf->device_lock);
}
static int make_request (request_queue_t *q, struct bio * bi)
{
	mddev_t *mddev = q->queuedata;
	raid5_conf_t *conf = mddev_to_conf(mddev);
	const unsigned int raid_disks = conf->raid_disks;
	const unsigned int data_disks = raid_disks - 1;
	unsigned int dd_idx, pd_idx;
	sector_t new_sector;
	sector_t logical_sector, last_sector;
	struct stripe_head *sh;

	if (bio_data_dir(bi)==WRITE) {
		disk_stat_inc(mddev->gendisk, writes);
		disk_stat_add(mddev->gendisk, write_sectors, bio_sectors(bi));
	} else {
		disk_stat_inc(mddev->gendisk, reads);
		disk_stat_add(mddev->gendisk, read_sectors, bio_sectors(bi));
	}

	logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
	last_sector = bi->bi_sector + (bi->bi_size>>9);
	bi->bi_next = NULL;
	bi->bi_phys_segments = 1;	/* over-loaded to count active stripes */
	if ( bio_data_dir(bi) == WRITE )
		md_write_start(mddev);
	for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) {

		new_sector = raid5_compute_sector(logical_sector,
						  raid_disks, data_disks, &dd_idx, &pd_idx, conf);

		PRINTK("raid5: make_request, sector %Lu logical %Lu\n",
			(unsigned long long)new_sector,
			(unsigned long long)logical_sector);

		sh = get_active_stripe(conf, new_sector, pd_idx, (bi->bi_rw&RWA_MASK));
		if (sh) {
			add_stripe_bio(sh, bi, dd_idx, (bi->bi_rw&RW_MASK));

			raid5_plug_device(conf);
			handle_stripe(sh);
			release_stripe(sh);
		} else {
			/* cannot get stripe for read-ahead, just give-up */
			clear_bit(BIO_UPTODATE, &bi->bi_flags);
			break;
		}
	}
	spin_lock_irq(&conf->device_lock);
	if (--bi->bi_phys_segments == 0) {
		int bytes = bi->bi_size;

		if ( bio_data_dir(bi) == WRITE )
			md_write_end(mddev);
		bi->bi_size = 0;
		bi->bi_end_io(bi, bytes, 0);
	}
	spin_unlock_irq(&conf->device_lock);
	return 0;
}
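/* Worked example (illustrative): a 16KiB write at sector 0 spans sectors
 * 0..31; with STRIPE_SECTORS == 8 the loop above runs four times, at
 * logical sectors 0, 8, 16 and 24, attaching the same bio to four stripe
 * heads while bi_phys_segments counts the outstanding attachments.
 */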
/* FIXME go_faster isn't used */
static int sync_request (mddev_t *mddev, sector_t sector_nr, int go_faster)
{
	raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
	struct stripe_head *sh;
	int sectors_per_chunk = conf->chunk_size >> 9;
	sector_t x;
	unsigned long stripe;
	int chunk_offset;
	int dd_idx, pd_idx;
	sector_t first_sector;
	int raid_disks = conf->raid_disks;
	int data_disks = raid_disks-1;

	if (sector_nr >= mddev->size <<1) {
		/* just being told to finish up .. nothing much to do */
		unplug_slaves(mddev);
		return 0;
	}

	x = sector_nr;
	chunk_offset = sector_div(x, sectors_per_chunk);
	stripe = x;
	BUG_ON(x != stripe);

	first_sector = raid5_compute_sector((sector_t)stripe*data_disks*sectors_per_chunk
		+ chunk_offset, raid_disks, data_disks, &dd_idx, &pd_idx, conf);
	sh = get_active_stripe(conf, sector_nr, pd_idx, 1);
	if (sh == NULL) {
		sh = get_active_stripe(conf, sector_nr, pd_idx, 0);
		/* make sure we don't swamp the stripe cache if someone else
		 * is trying to get access
		 */
		set_current_state(TASK_UNINTERRUPTIBLE);
		schedule_timeout(1);
	}
	spin_lock(&sh->lock);
	set_bit(STRIPE_SYNCING, &sh->state);
	clear_bit(STRIPE_INSYNC, &sh->state);
	spin_unlock(&sh->lock);

	handle_stripe(sh);
	release_stripe(sh);

	return STRIPE_SECTORS;
}
/*
 * This is our raid5 kernel thread.
 *
 * We scan the hash table for stripes which can be handled now.
 * During the scan, completed stripes are saved for us by the interrupt
 * handler, so that they will not have to wait for our next wakeup.
 */
static void raid5d (mddev_t *mddev)
{
	struct stripe_head *sh;
	raid5_conf_t *conf = mddev_to_conf(mddev);
	int handled;

	PRINTK("+++ raid5d active\n");

	md_check_recovery(mddev);
	md_handle_safemode(mddev);

	handled = 0;
	spin_lock_irq(&conf->device_lock);
	while (1) {
		struct list_head *first;

		if (list_empty(&conf->handle_list) &&
		    atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD &&
		    !blk_queue_plugged(mddev->queue) &&
		    !list_empty(&conf->delayed_list))
			raid5_activate_delayed(conf);

		if (list_empty(&conf->handle_list))
			break;

		first = conf->handle_list.next;
		sh = list_entry(first, struct stripe_head, lru);

		list_del_init(first);
		atomic_inc(&sh->count);
		if (atomic_read(&sh->count)!= 1)
			BUG();
		spin_unlock_irq(&conf->device_lock);

		handled++;
		handle_stripe(sh);
		release_stripe(sh);

		spin_lock_irq(&conf->device_lock);
	}
	PRINTK("%d stripes handled\n", handled);

	spin_unlock_irq(&conf->device_lock);

	unplug_slaves(mddev);

	PRINTK("--- raid5d inactive\n");
}
static int run (mddev_t *mddev)
{
	raid5_conf_t *conf;
	int raid_disk, memory;
	mdk_rdev_t *rdev;
	struct disk_info *disk;
	struct list_head *tmp;

	if (mddev->level != 5 && mddev->level != 4) {
		printk("raid5: %s: raid level not set to 4/5 (%d)\n", mdname(mddev), mddev->level);
		return -EIO;
	}

	mddev->private = kmalloc (sizeof (raid5_conf_t)
				  + mddev->raid_disks * sizeof(struct disk_info),
				  GFP_KERNEL);
	if ((conf = mddev->private) == NULL)
		goto abort;
	memset (conf, 0, sizeof (*conf) + mddev->raid_disks * sizeof(struct disk_info) );
	conf->mddev = mddev;

	if ((conf->stripe_hashtbl = (struct stripe_head **) __get_free_pages(GFP_ATOMIC, HASH_PAGES_ORDER)) == NULL)
		goto abort;
	memset(conf->stripe_hashtbl, 0, HASH_PAGES * PAGE_SIZE);

	conf->device_lock = SPIN_LOCK_UNLOCKED;
	init_waitqueue_head(&conf->wait_for_stripe);
	INIT_LIST_HEAD(&conf->handle_list);
	INIT_LIST_HEAD(&conf->delayed_list);
	INIT_LIST_HEAD(&conf->inactive_list);
	atomic_set(&conf->active_stripes, 0);
	atomic_set(&conf->preread_active_stripes, 0);

	mddev->queue->unplug_fn = raid5_unplug_device;

	PRINTK("raid5: run(%s) called.\n", mdname(mddev));
	ITERATE_RDEV(mddev,rdev,tmp) {
		raid_disk = rdev->raid_disk;
		if (raid_disk >= mddev->raid_disks
		    || raid_disk < 0)
			continue;
		disk = conf->disks + raid_disk;

		disk->rdev = rdev;

		if (rdev->in_sync) {
			char b[BDEVNAME_SIZE];
			printk(KERN_INFO "raid5: device %s operational as raid"
				" disk %d\n", bdevname(rdev->bdev,b),
				raid_disk);
			conf->working_disks++;
		}
	}

	conf->raid_disks = mddev->raid_disks;
	/*
	 * 0 for a fully functional array, 1 for a degraded array.
	 */
	mddev->degraded = conf->failed_disks = conf->raid_disks - conf->working_disks;
	conf->mddev = mddev;
	conf->chunk_size = mddev->chunk_size;
	conf->level = mddev->level;
	conf->algorithm = mddev->layout;
	conf->max_nr_stripes = NR_STRIPES;

	if (!conf->chunk_size || conf->chunk_size % 4) {
		printk(KERN_ERR "raid5: invalid chunk size %d for %s\n",
			conf->chunk_size, mdname(mddev));
		goto abort;
	}
	if (conf->algorithm > ALGORITHM_RIGHT_SYMMETRIC) {
		printk(KERN_ERR
			"raid5: unsupported parity algorithm %d for %s\n",
			conf->algorithm, mdname(mddev));
		goto abort;
	}
	if (mddev->degraded > 1) {
		printk(KERN_ERR "raid5: not enough operational devices for %s"
			" (%d/%d failed)\n",
			mdname(mddev), conf->failed_disks, conf->raid_disks);
		goto abort;
	}

	if (mddev->degraded == 1 &&
	    mddev->recovery_cp != MaxSector) {
		printk(KERN_ERR
			"raid5: cannot start dirty degraded array for %s\n",
			mdname(mddev));
		goto abort;
	}
	mddev->thread = md_register_thread(raid5d, mddev, "%s_raid5");
	if (!mddev->thread) {
		printk(KERN_ERR
			"raid5: couldn't allocate thread for %s\n",
			mdname(mddev));
		goto abort;
	}

	memory = conf->max_nr_stripes * (sizeof(struct stripe_head) +
		 conf->raid_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024;
	if (grow_stripes(conf, conf->max_nr_stripes)) {
		printk(KERN_ERR
			"raid5: couldn't allocate %dkB for buffers\n", memory);
		shrink_stripes(conf);
		md_unregister_thread(mddev->thread);
		goto abort;
	} else
		printk(KERN_INFO "raid5: allocated %dkB for %s\n",
			memory, mdname(mddev));

	if (mddev->degraded == 0)
		printk("raid5: raid level %d set %s active with %d out of %d"
			" devices, algorithm %d\n", conf->level, mdname(mddev),
			mddev->raid_disks-mddev->degraded, mddev->raid_disks,
			conf->algorithm);
	else
		printk(KERN_ALERT "raid5: raid level %d set %s active with %d"
			" out of %d devices, algorithm %d\n", conf->level,
			mdname(mddev), mddev->raid_disks - mddev->degraded,
			mddev->raid_disks, conf->algorithm);

	print_raid5_conf(conf);

	/* read-ahead size must cover two whole stripes, which is
	 * 2 * (n-1) * chunksize where 'n' is the number of raid devices
	 */
	{
		int stripe = (mddev->raid_disks-1) * mddev->chunk_size
			/ PAGE_CACHE_SIZE;
		if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe)
			mddev->queue->backing_dev_info.ra_pages = 2 * stripe;
	}
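/* Worked example (illustrative, assuming 4KiB pages): with 4 raid devices
 * and 64KiB chunks, stripe = 3 * 65536 / 4096 = 48 pages, so ra_pages is
 * raised to at least 96 pages (384KiB), i.e. two full stripes of data.
 */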
	/* Ok, everything is just fine now */
	mddev->array_size = mddev->size * (mddev->raid_disks - 1);
	return 0;
abort:
	if (conf) {
		print_raid5_conf(conf);
		if (conf->stripe_hashtbl)
			free_pages((unsigned long) conf->stripe_hashtbl,
							HASH_PAGES_ORDER);
		kfree(conf);
	}
	mddev->private = NULL;
	printk(KERN_ALERT "raid5: failed to run raid set %s\n", mdname(mddev));
	return -EIO;
}
static int stop (mddev_t *mddev)
{
	raid5_conf_t *conf = (raid5_conf_t *) mddev->private;

	md_unregister_thread(mddev->thread);
	mddev->thread = NULL;
	shrink_stripes(conf);
	free_pages((unsigned long) conf->stripe_hashtbl, HASH_PAGES_ORDER);
	kfree(conf);
	mddev->private = NULL;
	return 0;
}
#if RAID5_DEBUG
static void print_sh (struct stripe_head *sh)
{
	int i;

	printk("sh %llu, pd_idx %d, state %ld.\n",
		(unsigned long long)sh->sector, sh->pd_idx, sh->state);
	printk("sh %llu, count %d.\n",
		(unsigned long long)sh->sector, atomic_read(&sh->count));
	printk("sh %llu, ", (unsigned long long)sh->sector);
	for (i = 0; i < sh->raid_conf->raid_disks; i++) {
		printk("(cache%d: %p %ld) ",
			i, sh->dev[i].page, sh->dev[i].flags);
	}
	printk("\n");
}

static void printall (raid5_conf_t *conf)
{
	struct stripe_head *sh;
	int i;

	spin_lock_irq(&conf->device_lock);
	for (i = 0; i < NR_HASH; i++) {
		sh = conf->stripe_hashtbl[i];
		for (; sh; sh = sh->hash_next) {
			if (sh->raid_conf != conf)
				continue;
			print_sh(sh);
		}
	}
	spin_unlock_irq(&conf->device_lock);
}
#endif
static void status (struct seq_file *seq, mddev_t *mddev)
{
	raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
	int i;

	seq_printf (seq, " level %d, %dk chunk, algorithm %d", mddev->level, mddev->chunk_size >> 10, mddev->layout);
	seq_printf (seq, " [%d/%d] [", conf->raid_disks, conf->working_disks);
	for (i = 0; i < conf->raid_disks; i++)
		seq_printf (seq, "%s",
			conf->disks[i].rdev &&
			conf->disks[i].rdev->in_sync ? "U" : "_");
	seq_printf (seq, "]");
#if RAID5_DEBUG
#define D(x) \
	seq_printf (seq, "<"#x":%d>", atomic_read(&conf->x))
	printall(conf);
#endif
}
static void print_raid5_conf (raid5_conf_t *conf)
{
	int i;
	struct disk_info *tmp;

	printk("RAID5 conf printout:\n");
	if (!conf) {
		printk("(conf==NULL)\n");
		return;
	}
	printk(" --- rd:%d wd:%d fd:%d\n", conf->raid_disks,
		 conf->working_disks, conf->failed_disks);

	for (i = 0; i < conf->raid_disks; i++) {
		char b[BDEVNAME_SIZE];
		tmp = conf->disks + i;
		if (tmp->rdev)
			printk(" disk %d, o:%d, dev:%s\n",
				i, !tmp->rdev->faulty,
				bdevname(tmp->rdev->bdev,b));
	}
}
static int raid5_spare_active(mddev_t *mddev)
{
	int i;
	raid5_conf_t *conf = mddev->private;
	struct disk_info *tmp;

	spin_lock_irq(&conf->device_lock);
	for (i = 0; i < conf->raid_disks; i++) {
		tmp = conf->disks + i;
		if (tmp->rdev
		    && !tmp->rdev->faulty
		    && !tmp->rdev->in_sync) {
			mddev->degraded--;
			conf->failed_disks--;
			conf->working_disks++;
			tmp->rdev->in_sync = 1;
		}
	}
	spin_unlock_irq(&conf->device_lock);
	print_raid5_conf(conf);
	return 0;
}
static int raid5_remove_disk(mddev_t *mddev, int number)
{
	raid5_conf_t *conf = mddev->private;
	int err = 0;
	struct disk_info *p = conf->disks + number;

	print_raid5_conf(conf);
	spin_lock_irq(&conf->device_lock);

	if (p->rdev) {
		if (p->rdev->in_sync ||
		    atomic_read(&p->rdev->nr_pending)) {
			err = -EBUSY;
			goto abort;
		}
		p->rdev = NULL;
	}
abort:
	spin_unlock_irq(&conf->device_lock);
	print_raid5_conf(conf);
	return err;
}
static int raid5_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
{
	raid5_conf_t *conf = mddev->private;
	int found = 0;
	int disk;
	struct disk_info *p;

	spin_lock_irq(&conf->device_lock);
	/*
	 * find the disk ...
	 */
	for (disk=0; disk < mddev->raid_disks; disk++)
		if ((p=conf->disks + disk)->rdev == NULL) {
			p->rdev = rdev;
			rdev->in_sync = 0;
			rdev->raid_disk = disk;
			found = 1;
			break;
		}
	spin_unlock_irq(&conf->device_lock);
	print_raid5_conf(conf);
	return found;
}
static mdk_personality_t raid5_personality=
{
	.name		= "raid5",
	.owner		= THIS_MODULE,
	.make_request	= make_request,
	.run		= run,
	.stop		= stop,
	.status		= status,
	.error_handler	= error,
	.hot_add_disk	= raid5_add_disk,
	.hot_remove_disk= raid5_remove_disk,
	.spare_active	= raid5_spare_active,
	.sync_request	= sync_request,
};
static int __init raid5_init (void)
{
	return register_md_personality (RAID5, &raid5_personality);
}

static void raid5_exit (void)
{
	unregister_md_personality (RAID5);
}

module_init(raid5_init);
module_exit(raid5_exit);
MODULE_LICENSE("GPL");
MODULE_ALIAS("md-personality-4"); /* RAID5 */