2 md.c : Multiple Devices driver for Linux
3 Copyright (C) 1998, 1999, 2000 Ingo Molnar
5 completely rewritten, based on the MD driver code from Marc Zyngier
9 - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar
10 - RAID-6 extensions by H. Peter Anvin <hpa@zytor.com>
11 - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net>
12 - kerneld support by Boris Tobotras <boris@xtalk.msk.su>
13 - kmod support by: Cyrus Durgin
14 - RAID0 bugfixes: Mark Anthony Lisher <markal@iname.com>
15 - Devfs support by Richard Gooch <rgooch@atnf.csiro.au>
17 - lots of fixes and improvements to the RAID1/RAID5 and generic
18 RAID code (such as request based resynchronization):
20 Neil Brown <neilb@cse.unsw.edu.au>.
22 This program is free software; you can redistribute it and/or modify
23 it under the terms of the GNU General Public License as published by
24 the Free Software Foundation; either version 2, or (at your option)
27 You should have received a copy of the GNU General Public License
28 (for example /usr/src/linux/COPYING); if not, write to the Free
29 Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
32 #include <linux/module.h>
33 #include <linux/config.h>
34 #include <linux/linkage.h>
35 #include <linux/raid/md.h>
36 #include <linux/sysctl.h>
37 #include <linux/devfs_fs_kernel.h>
38 #include <linux/buffer_head.h> /* for invalidate_bdev */
39 #include <linux/suspend.h>
41 #include <linux/init.h>
44 #include <linux/kmod.h>
47 #include <asm/unaligned.h>
49 #define MAJOR_NR MD_MAJOR
52 /* 63 partitions with the alternate major number (mdp) */
53 #define MdpMinorShift 6
56 #define dprintk(x...) ((void)(DEBUG && printk(x)))
60 static void autostart_arrays (int part);
63 static mdk_personality_t *pers[MAX_PERSONALITY];
64 static spinlock_t pers_lock = SPIN_LOCK_UNLOCKED;
67 * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit'
68 * is 1000 KB/sec, so the extra system load does not show up that much.
69 * Increase it if you want to have more _guaranteed_ speed. Note that
70 * the RAID driver will use the maximum available bandwith if the IO
71 * subsystem is idle. There is also an 'absolute maximum' reconstruction
72 * speed limit - in case reconstruction slows down your system despite
75 * you can change it via /proc/sys/dev/raid/speed_limit_min and _max.
78 static int sysctl_speed_limit_min = 1000;
79 static int sysctl_speed_limit_max = 200000;
81 static struct ctl_table_header *raid_table_header;
83 static ctl_table raid_table[] = {
85 .ctl_name = DEV_RAID_SPEED_LIMIT_MIN,
86 .procname = "speed_limit_min",
87 .data = &sysctl_speed_limit_min,
88 .maxlen = sizeof(int),
90 .proc_handler = &proc_dointvec,
93 .ctl_name = DEV_RAID_SPEED_LIMIT_MAX,
94 .procname = "speed_limit_max",
95 .data = &sysctl_speed_limit_max,
96 .maxlen = sizeof(int),
98 .proc_handler = &proc_dointvec,
103 static ctl_table raid_dir_table[] = {
105 .ctl_name = DEV_RAID,
114 static ctl_table raid_root_table[] = {
120 .child = raid_dir_table,
125 static struct block_device_operations md_fops;
128 * Enables to iterate over all existing md arrays
129 * all_mddevs_lock protects this list.
131 static LIST_HEAD(all_mddevs);
132 static spinlock_t all_mddevs_lock = SPIN_LOCK_UNLOCKED;
136 * iterates through all used mddevs in the system.
137 * We take care to grab the all_mddevs_lock whenever navigating
138 * the list, and to always hold a refcount when unlocked.
139 * Any code which breaks out of this loop while own
140 * a reference to the current mddev and must mddev_put it.
142 #define ITERATE_MDDEV(mddev,tmp) \
144 for (({ spin_lock(&all_mddevs_lock); \
145 tmp = all_mddevs.next; \
147 ({ if (tmp != &all_mddevs) \
148 mddev_get(list_entry(tmp, mddev_t, all_mddevs));\
149 spin_unlock(&all_mddevs_lock); \
150 if (mddev) mddev_put(mddev); \
151 mddev = list_entry(tmp, mddev_t, all_mddevs); \
152 tmp != &all_mddevs;}); \
153 ({ spin_lock(&all_mddevs_lock); \
157 static int md_fail_request (request_queue_t *q, struct bio *bio)
159 bio_io_error(bio, bio->bi_size);
163 static inline mddev_t *mddev_get(mddev_t *mddev)
165 atomic_inc(&mddev->active);
169 static void mddev_put(mddev_t *mddev)
171 if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock))
173 if (!mddev->raid_disks && list_empty(&mddev->disks)) {
174 list_del(&mddev->all_mddevs);
175 blk_put_queue(mddev->queue);
178 spin_unlock(&all_mddevs_lock);
181 static mddev_t * mddev_find(dev_t unit)
183 mddev_t *mddev, *new = NULL;
186 spin_lock(&all_mddevs_lock);
187 list_for_each_entry(mddev, &all_mddevs, all_mddevs)
188 if (mddev->unit == unit) {
190 spin_unlock(&all_mddevs_lock);
197 list_add(&new->all_mddevs, &all_mddevs);
198 spin_unlock(&all_mddevs_lock);
201 spin_unlock(&all_mddevs_lock);
203 new = (mddev_t *) kmalloc(sizeof(*new), GFP_KERNEL);
207 memset(new, 0, sizeof(*new));
210 if (MAJOR(unit) == MD_MAJOR)
211 new->md_minor = MINOR(unit);
213 new->md_minor = MINOR(unit) >> MdpMinorShift;
215 init_MUTEX(&new->reconfig_sem);
216 INIT_LIST_HEAD(&new->disks);
217 INIT_LIST_HEAD(&new->all_mddevs);
218 init_timer(&new->safemode_timer);
219 atomic_set(&new->active, 1);
221 new->queue = blk_alloc_queue(GFP_KERNEL);
227 blk_queue_make_request(new->queue, md_fail_request);
232 static inline int mddev_lock(mddev_t * mddev)
234 return down_interruptible(&mddev->reconfig_sem);
237 static inline void mddev_lock_uninterruptible(mddev_t * mddev)
239 down(&mddev->reconfig_sem);
242 static inline int mddev_trylock(mddev_t * mddev)
244 return down_trylock(&mddev->reconfig_sem);
247 static inline void mddev_unlock(mddev_t * mddev)
249 up(&mddev->reconfig_sem);
252 md_wakeup_thread(mddev->thread);
255 mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr)
258 struct list_head *tmp;
260 ITERATE_RDEV(mddev,rdev,tmp) {
261 if (rdev->desc_nr == nr)
267 static mdk_rdev_t * find_rdev(mddev_t * mddev, dev_t dev)
269 struct list_head *tmp;
272 ITERATE_RDEV(mddev,rdev,tmp) {
273 if (rdev->bdev->bd_dev == dev)
279 inline static sector_t calc_dev_sboffset(struct block_device *bdev)
281 sector_t size = bdev->bd_inode->i_size >> BLOCK_SIZE_BITS;
282 return MD_NEW_SIZE_BLOCKS(size);
285 static sector_t calc_dev_size(mdk_rdev_t *rdev, unsigned chunk_size)
289 size = rdev->sb_offset;
292 size &= ~((sector_t)chunk_size/1024 - 1);
296 static int alloc_disk_sb(mdk_rdev_t * rdev)
301 rdev->sb_page = alloc_page(GFP_KERNEL);
302 if (!rdev->sb_page) {
303 printk(KERN_ALERT "md: out of memory.\n");
310 static void free_disk_sb(mdk_rdev_t * rdev)
313 page_cache_release(rdev->sb_page);
315 rdev->sb_page = NULL;
322 static int bi_complete(struct bio *bio, unsigned int bytes_done, int error)
327 complete((struct completion*)bio->bi_private);
331 static int sync_page_io(struct block_device *bdev, sector_t sector, int size,
332 struct page *page, int rw)
336 struct completion event;
338 rw |= (1 << BIO_RW_SYNC);
341 bio.bi_io_vec = &vec;
349 bio.bi_sector = sector;
350 init_completion(&event);
351 bio.bi_private = &event;
352 bio.bi_end_io = bi_complete;
353 submit_bio(rw, &bio);
354 wait_for_completion(&event);
356 return test_bit(BIO_UPTODATE, &bio.bi_flags);
359 static int read_disk_sb(mdk_rdev_t * rdev)
361 char b[BDEVNAME_SIZE];
362 if (!rdev->sb_page) {
370 if (!sync_page_io(rdev->bdev, rdev->sb_offset<<1, MD_SB_BYTES, rdev->sb_page, READ))
376 printk(KERN_ERR "md: disabled device %s, could not read superblock.\n",
377 bdevname(rdev->bdev,b));
381 static int uuid_equal(mdp_super_t *sb1, mdp_super_t *sb2)
383 if ( (sb1->set_uuid0 == sb2->set_uuid0) &&
384 (sb1->set_uuid1 == sb2->set_uuid1) &&
385 (sb1->set_uuid2 == sb2->set_uuid2) &&
386 (sb1->set_uuid3 == sb2->set_uuid3))
394 static int sb_equal(mdp_super_t *sb1, mdp_super_t *sb2)
397 mdp_super_t *tmp1, *tmp2;
399 tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL);
400 tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL);
402 if (!tmp1 || !tmp2) {
404 printk(KERN_INFO "md.c: sb1 is not equal to sb2!\n");
412 * nr_disks is not constant
417 if (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4))
431 static unsigned int calc_sb_csum(mdp_super_t * sb)
433 unsigned int disk_csum, csum;
435 disk_csum = sb->sb_csum;
437 csum = csum_partial((void *)sb, MD_SB_BYTES, 0);
438 sb->sb_csum = disk_csum;
443 * Handle superblock details.
444 * We want to be able to handle multiple superblock formats
445 * so we have a common interface to them all, and an array of
446 * different handlers.
447 * We rely on user-space to write the initial superblock, and support
448 * reading and updating of superblocks.
449 * Interface methods are:
450 * int load_super(mdk_rdev_t *dev, mdk_rdev_t *refdev, int minor_version)
451 * loads and validates a superblock on dev.
452 * if refdev != NULL, compare superblocks on both devices
454 * 0 - dev has a superblock that is compatible with refdev
455 * 1 - dev has a superblock that is compatible and newer than refdev
456 * so dev should be used as the refdev in future
457 * -EINVAL superblock incompatible or invalid
458 * -othererror e.g. -EIO
460 * int validate_super(mddev_t *mddev, mdk_rdev_t *dev)
461 * Verify that dev is acceptable into mddev.
462 * The first time, mddev->raid_disks will be 0, and data from
463 * dev should be merged in. Subsequent calls check that dev
464 * is new enough. Return 0 or -EINVAL
466 * void sync_super(mddev_t *mddev, mdk_rdev_t *dev)
467 * Update the superblock for rdev with data in mddev
468 * This does not write to disc.
474 struct module *owner;
475 int (*load_super)(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version);
476 int (*validate_super)(mddev_t *mddev, mdk_rdev_t *rdev);
477 void (*sync_super)(mddev_t *mddev, mdk_rdev_t *rdev);
481 * load_super for 0.90.0
483 static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
485 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
491 * Calculate the position of the superblock,
492 * it's at the end of the disk.
494 * It also happens to be a multiple of 4Kb.
496 sb_offset = calc_dev_sboffset(rdev->bdev);
497 rdev->sb_offset = sb_offset;
499 ret = read_disk_sb(rdev);
504 bdevname(rdev->bdev, b);
505 sb = (mdp_super_t*)page_address(rdev->sb_page);
507 if (sb->md_magic != MD_SB_MAGIC) {
508 printk(KERN_ERR "md: invalid raid superblock magic on %s\n",
513 if (sb->major_version != 0 ||
514 sb->minor_version != 90) {
515 printk(KERN_WARNING "Bad version number %d.%d on %s\n",
516 sb->major_version, sb->minor_version,
521 if (sb->raid_disks <= 0)
524 if (calc_sb_csum(sb) != sb->sb_csum) {
525 printk(KERN_WARNING "md: invalid superblock checksum on %s\n",
530 rdev->preferred_minor = sb->md_minor;
531 rdev->data_offset = 0;
533 if (sb->level == MULTIPATH)
536 rdev->desc_nr = sb->this_disk.number;
542 mdp_super_t *refsb = (mdp_super_t*)page_address(refdev->sb_page);
543 if (!uuid_equal(refsb, sb)) {
544 printk(KERN_WARNING "md: %s has different UUID to %s\n",
545 b, bdevname(refdev->bdev,b2));
548 if (!sb_equal(refsb, sb)) {
549 printk(KERN_WARNING "md: %s has same UUID"
550 " but different superblock to %s\n",
551 b, bdevname(refdev->bdev, b2));
555 ev2 = md_event(refsb);
561 rdev->size = calc_dev_size(rdev, sb->chunk_size);
568 * validate_super for 0.90.0
570 static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
573 mdp_super_t *sb = (mdp_super_t *)page_address(rdev->sb_page);
575 if (mddev->raid_disks == 0) {
576 mddev->major_version = 0;
577 mddev->minor_version = sb->minor_version;
578 mddev->patch_version = sb->patch_version;
579 mddev->persistent = ! sb->not_persistent;
580 mddev->chunk_size = sb->chunk_size;
581 mddev->ctime = sb->ctime;
582 mddev->utime = sb->utime;
583 mddev->level = sb->level;
584 mddev->layout = sb->layout;
585 mddev->raid_disks = sb->raid_disks;
586 mddev->size = sb->size;
587 mddev->events = md_event(sb);
589 if (sb->state & (1<<MD_SB_CLEAN))
590 mddev->recovery_cp = MaxSector;
592 if (sb->events_hi == sb->cp_events_hi &&
593 sb->events_lo == sb->cp_events_lo) {
594 mddev->recovery_cp = sb->recovery_cp;
596 mddev->recovery_cp = 0;
599 memcpy(mddev->uuid+0, &sb->set_uuid0, 4);
600 memcpy(mddev->uuid+4, &sb->set_uuid1, 4);
601 memcpy(mddev->uuid+8, &sb->set_uuid2, 4);
602 memcpy(mddev->uuid+12,&sb->set_uuid3, 4);
604 mddev->max_disks = MD_SB_DISKS;
609 if (ev1 < mddev->events)
612 if (mddev->level != LEVEL_MULTIPATH) {
613 rdev->raid_disk = -1;
614 rdev->in_sync = rdev->faulty = 0;
615 desc = sb->disks + rdev->desc_nr;
617 if (desc->state & (1<<MD_DISK_FAULTY))
619 else if (desc->state & (1<<MD_DISK_SYNC) &&
620 desc->raid_disk < mddev->raid_disks) {
622 rdev->raid_disk = desc->raid_disk;
629 * sync_super for 0.90.0
631 static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev)
634 struct list_head *tmp;
636 int next_spare = mddev->raid_disks;
638 /* make rdev->sb match mddev data..
641 * 2/ Add info for each disk, keeping track of highest desc_nr (next_spare);
642 * 3/ any empty disks < next_spare become removed
644 * disks[0] gets initialised to REMOVED because
645 * we cannot be sure from other fields if it has
646 * been initialised or not.
649 int active=0, working=0,failed=0,spare=0,nr_disks=0;
651 sb = (mdp_super_t*)page_address(rdev->sb_page);
653 memset(sb, 0, sizeof(*sb));
655 sb->md_magic = MD_SB_MAGIC;
656 sb->major_version = mddev->major_version;
657 sb->minor_version = mddev->minor_version;
658 sb->patch_version = mddev->patch_version;
659 sb->gvalid_words = 0; /* ignored */
660 memcpy(&sb->set_uuid0, mddev->uuid+0, 4);
661 memcpy(&sb->set_uuid1, mddev->uuid+4, 4);
662 memcpy(&sb->set_uuid2, mddev->uuid+8, 4);
663 memcpy(&sb->set_uuid3, mddev->uuid+12,4);
665 sb->ctime = mddev->ctime;
666 sb->level = mddev->level;
667 sb->size = mddev->size;
668 sb->raid_disks = mddev->raid_disks;
669 sb->md_minor = mddev->md_minor;
670 sb->not_persistent = !mddev->persistent;
671 sb->utime = mddev->utime;
673 sb->events_hi = (mddev->events>>32);
674 sb->events_lo = (u32)mddev->events;
678 sb->recovery_cp = mddev->recovery_cp;
679 sb->cp_events_hi = (mddev->events>>32);
680 sb->cp_events_lo = (u32)mddev->events;
681 if (mddev->recovery_cp == MaxSector)
682 sb->state = (1<< MD_SB_CLEAN);
686 sb->layout = mddev->layout;
687 sb->chunk_size = mddev->chunk_size;
689 sb->disks[0].state = (1<<MD_DISK_REMOVED);
690 ITERATE_RDEV(mddev,rdev2,tmp) {
692 if (rdev2->raid_disk >= 0 && rdev2->in_sync && !rdev2->faulty)
693 rdev2->desc_nr = rdev2->raid_disk;
695 rdev2->desc_nr = next_spare++;
696 d = &sb->disks[rdev2->desc_nr];
698 d->number = rdev2->desc_nr;
699 d->major = MAJOR(rdev2->bdev->bd_dev);
700 d->minor = MINOR(rdev2->bdev->bd_dev);
701 if (rdev2->raid_disk >= 0 && rdev->in_sync && !rdev2->faulty)
702 d->raid_disk = rdev2->raid_disk;
704 d->raid_disk = rdev2->desc_nr; /* compatibility */
706 d->state = (1<<MD_DISK_FAULTY);
708 } else if (rdev2->in_sync) {
709 d->state = (1<<MD_DISK_ACTIVE);
710 d->state |= (1<<MD_DISK_SYNC);
720 /* now set the "removed" and "faulty" bits on any missing devices */
721 for (i=0 ; i < mddev->raid_disks ; i++) {
722 mdp_disk_t *d = &sb->disks[i];
723 if (d->state == 0 && d->number == 0) {
726 d->state = (1<<MD_DISK_REMOVED);
727 d->state |= (1<<MD_DISK_FAULTY);
731 sb->nr_disks = nr_disks;
732 sb->active_disks = active;
733 sb->working_disks = working;
734 sb->failed_disks = failed;
735 sb->spare_disks = spare;
737 sb->this_disk = sb->disks[rdev->desc_nr];
738 sb->sb_csum = calc_sb_csum(sb);
742 * version 1 superblock
745 static unsigned int calc_sb_1_csum(struct mdp_superblock_1 * sb)
747 unsigned int disk_csum, csum;
748 int size = 256 + sb->max_dev*2;
750 disk_csum = sb->sb_csum;
752 csum = csum_partial((void *)sb, size, 0);
753 sb->sb_csum = disk_csum;
757 static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
759 struct mdp_superblock_1 *sb;
762 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
765 * Calculate the position of the superblock.
766 * It is always aligned to a 4K boundary and
767 * depeding on minor_version, it can be:
768 * 0: At least 8K, but less than 12K, from end of device
769 * 1: At start of device
770 * 2: 4K from start of device.
772 switch(minor_version) {
774 sb_offset = rdev->bdev->bd_inode->i_size >> 9;
777 /* convert from sectors to K */
789 rdev->sb_offset = sb_offset;
791 ret = read_disk_sb(rdev);
795 sb = (struct mdp_superblock_1*)page_address(rdev->sb_page);
797 if (sb->magic != cpu_to_le32(MD_SB_MAGIC) ||
798 sb->major_version != cpu_to_le32(1) ||
799 le32_to_cpu(sb->max_dev) > (4096-256)/2 ||
800 le64_to_cpu(sb->super_offset) != (rdev->sb_offset<<1) ||
801 sb->feature_map != 0)
804 if (calc_sb_1_csum(sb) != sb->sb_csum) {
805 printk("md: invalid superblock checksum on %s\n",
806 bdevname(rdev->bdev,b));
809 rdev->preferred_minor = 0xffff;
810 rdev->data_offset = le64_to_cpu(sb->data_offset);
816 struct mdp_superblock_1 *refsb =
817 (struct mdp_superblock_1*)page_address(refdev->sb_page);
819 if (memcmp(sb->set_uuid, refsb->set_uuid, 16) != 0 ||
820 sb->level != refsb->level ||
821 sb->layout != refsb->layout ||
822 sb->chunksize != refsb->chunksize) {
823 printk(KERN_WARNING "md: %s has strangely different"
824 " superblock to %s\n",
825 bdevname(rdev->bdev,b),
826 bdevname(refdev->bdev,b2));
829 ev1 = le64_to_cpu(sb->events);
830 ev2 = le64_to_cpu(refsb->events);
836 rdev->size = ((rdev->bdev->bd_inode->i_size>>9) - le64_to_cpu(sb->data_offset)) / 2;
838 rdev->size = rdev->sb_offset;
839 if (rdev->size < le64_to_cpu(sb->data_size)/2)
841 rdev->size = le64_to_cpu(sb->data_size)/2;
842 if (le32_to_cpu(sb->chunksize))
843 rdev->size &= ~((sector_t)le32_to_cpu(sb->chunksize)/2 - 1);
847 static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
849 struct mdp_superblock_1 *sb = (struct mdp_superblock_1*)page_address(rdev->sb_page);
851 if (mddev->raid_disks == 0) {
852 mddev->major_version = 1;
853 mddev->minor_version = 0;
854 mddev->patch_version = 0;
855 mddev->persistent = 1;
856 mddev->chunk_size = le32_to_cpu(sb->chunksize) << 9;
857 mddev->ctime = le64_to_cpu(sb->ctime) & ((1ULL << 32)-1);
858 mddev->utime = le64_to_cpu(sb->utime) & ((1ULL << 32)-1);
859 mddev->level = le32_to_cpu(sb->level);
860 mddev->layout = le32_to_cpu(sb->layout);
861 mddev->raid_disks = le32_to_cpu(sb->raid_disks);
862 mddev->size = (u32)le64_to_cpu(sb->size);
863 mddev->events = le64_to_cpu(sb->events);
865 mddev->recovery_cp = le64_to_cpu(sb->resync_offset);
866 memcpy(mddev->uuid, sb->set_uuid, 16);
868 mddev->max_disks = (4096-256)/2;
871 ev1 = le64_to_cpu(sb->events);
873 if (ev1 < mddev->events)
877 if (mddev->level != LEVEL_MULTIPATH) {
879 rdev->desc_nr = le32_to_cpu(sb->dev_number);
880 role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
882 case 0xffff: /* spare */
885 rdev->raid_disk = -1;
887 case 0xfffe: /* faulty */
890 rdev->raid_disk = -1;
895 rdev->raid_disk = role;
902 static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
904 struct mdp_superblock_1 *sb;
905 struct list_head *tmp;
908 /* make rdev->sb match mddev and rdev data. */
910 sb = (struct mdp_superblock_1*)page_address(rdev->sb_page);
914 memset(sb->pad1, 0, sizeof(sb->pad1));
915 memset(sb->pad2, 0, sizeof(sb->pad2));
916 memset(sb->pad3, 0, sizeof(sb->pad3));
918 sb->utime = cpu_to_le64((__u64)mddev->utime);
919 sb->events = cpu_to_le64(mddev->events);
921 sb->resync_offset = cpu_to_le64(mddev->recovery_cp);
923 sb->resync_offset = cpu_to_le64(0);
926 ITERATE_RDEV(mddev,rdev2,tmp)
927 if (rdev2->desc_nr > max_dev)
928 max_dev = rdev2->desc_nr;
930 sb->max_dev = max_dev;
931 for (i=0; i<max_dev;i++)
932 sb->dev_roles[max_dev] = cpu_to_le16(0xfffe);
934 ITERATE_RDEV(mddev,rdev2,tmp) {
937 sb->dev_roles[i] = cpu_to_le16(0xfffe);
938 else if (rdev2->in_sync)
939 sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
941 sb->dev_roles[i] = cpu_to_le16(0xffff);
944 sb->recovery_offset = cpu_to_le64(0); /* not supported yet */
948 struct super_type super_types[] = {
951 .owner = THIS_MODULE,
952 .load_super = super_90_load,
953 .validate_super = super_90_validate,
954 .sync_super = super_90_sync,
958 .owner = THIS_MODULE,
959 .load_super = super_1_load,
960 .validate_super = super_1_validate,
961 .sync_super = super_1_sync,
965 static mdk_rdev_t * match_dev_unit(mddev_t *mddev, mdk_rdev_t *dev)
967 struct list_head *tmp;
970 ITERATE_RDEV(mddev,rdev,tmp)
971 if (rdev->bdev->bd_contains == dev->bdev->bd_contains)
977 static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2)
979 struct list_head *tmp;
982 ITERATE_RDEV(mddev1,rdev,tmp)
983 if (match_dev_unit(mddev2, rdev))
989 static LIST_HEAD(pending_raid_disks);
991 static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev)
993 mdk_rdev_t *same_pdev;
994 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
1000 same_pdev = match_dev_unit(mddev, rdev);
1003 "%s: WARNING: %s appears to be on the same physical"
1004 " disk as %s. True\n protection against single-disk"
1005 " failure might be compromised.\n",
1006 mdname(mddev), bdevname(rdev->bdev,b),
1007 bdevname(same_pdev->bdev,b2));
1009 /* Verify rdev->desc_nr is unique.
1010 * If it is -1, assign a free number, else
1011 * check number is not in use
1013 if (rdev->desc_nr < 0) {
1015 if (mddev->pers) choice = mddev->raid_disks;
1016 while (find_rdev_nr(mddev, choice))
1018 rdev->desc_nr = choice;
1020 if (find_rdev_nr(mddev, rdev->desc_nr))
1024 list_add(&rdev->same_set, &mddev->disks);
1025 rdev->mddev = mddev;
1026 printk(KERN_INFO "md: bind<%s>\n", bdevname(rdev->bdev,b));
1030 static void unbind_rdev_from_array(mdk_rdev_t * rdev)
1032 char b[BDEVNAME_SIZE];
1037 list_del_init(&rdev->same_set);
1038 printk(KERN_INFO "md: unbind<%s>\n", bdevname(rdev->bdev,b));
1043 * prevent the device from being mounted, repartitioned or
1044 * otherwise reused by a RAID array (or any other kernel
1045 * subsystem), by opening the device. [simply getting an
1046 * inode is not enough, the SCSI module usage code needs
1047 * an explicit open() on the device]
1049 static int lock_rdev(mdk_rdev_t *rdev, dev_t dev)
1052 struct block_device *bdev;
1054 bdev = open_by_devnum(dev, FMODE_READ|FMODE_WRITE);
1056 return PTR_ERR(bdev);
1057 err = bd_claim(bdev, rdev);
1066 static void unlock_rdev(mdk_rdev_t *rdev)
1068 struct block_device *bdev = rdev->bdev;
1076 void md_autodetect_dev(dev_t dev);
1078 static void export_rdev(mdk_rdev_t * rdev)
1080 char b[BDEVNAME_SIZE];
1081 printk(KERN_INFO "md: export_rdev(%s)\n",
1082 bdevname(rdev->bdev,b));
1086 list_del_init(&rdev->same_set);
1088 md_autodetect_dev(rdev->bdev->bd_dev);
1094 static void kick_rdev_from_array(mdk_rdev_t * rdev)
1096 unbind_rdev_from_array(rdev);
1100 static void export_array(mddev_t *mddev)
1102 struct list_head *tmp;
1105 ITERATE_RDEV(mddev,rdev,tmp) {
1110 kick_rdev_from_array(rdev);
1112 if (!list_empty(&mddev->disks))
1114 mddev->raid_disks = 0;
1115 mddev->major_version = 0;
1118 static void print_desc(mdp_disk_t *desc)
1120 char b[BDEVNAME_SIZE];
1122 printk(" DISK<N:%d,%s(%d,%d),R:%d,S:%d>\n", desc->number,
1123 __bdevname(MKDEV(desc->major, desc->minor), b),
1124 desc->major,desc->minor,desc->raid_disk,desc->state);
1127 static void print_sb(mdp_super_t *sb)
1132 "md: SB: (V:%d.%d.%d) ID:<%08x.%08x.%08x.%08x> CT:%08x\n",
1133 sb->major_version, sb->minor_version, sb->patch_version,
1134 sb->set_uuid0, sb->set_uuid1, sb->set_uuid2, sb->set_uuid3,
1136 printk(KERN_INFO "md: L%d S%08d ND:%d RD:%d md%d LO:%d CS:%d\n",
1137 sb->level, sb->size, sb->nr_disks, sb->raid_disks,
1138 sb->md_minor, sb->layout, sb->chunk_size);
1139 printk(KERN_INFO "md: UT:%08x ST:%d AD:%d WD:%d"
1140 " FD:%d SD:%d CSUM:%08x E:%08lx\n",
1141 sb->utime, sb->state, sb->active_disks, sb->working_disks,
1142 sb->failed_disks, sb->spare_disks,
1143 sb->sb_csum, (unsigned long)sb->events_lo);
1146 for (i = 0; i < MD_SB_DISKS; i++) {
1149 desc = sb->disks + i;
1150 if (desc->number || desc->major || desc->minor ||
1151 desc->raid_disk || (desc->state && (desc->state != 4))) {
1152 printk(" D %2d: ", i);
1156 printk(KERN_INFO "md: THIS: ");
1157 print_desc(&sb->this_disk);
1161 static void print_rdev(mdk_rdev_t *rdev)
1163 char b[BDEVNAME_SIZE];
1164 printk(KERN_INFO "md: rdev %s, SZ:%08llu F:%d S:%d DN:%u\n",
1165 bdevname(rdev->bdev,b), (unsigned long long)rdev->size,
1166 rdev->faulty, rdev->in_sync, rdev->desc_nr);
1167 if (rdev->sb_loaded) {
1168 printk(KERN_INFO "md: rdev superblock:\n");
1169 print_sb((mdp_super_t*)page_address(rdev->sb_page));
1171 printk(KERN_INFO "md: no rdev superblock!\n");
1174 void md_print_devices(void)
1176 struct list_head *tmp, *tmp2;
1179 char b[BDEVNAME_SIZE];
1182 printk("md: **********************************\n");
1183 printk("md: * <COMPLETE RAID STATE PRINTOUT> *\n");
1184 printk("md: **********************************\n");
1185 ITERATE_MDDEV(mddev,tmp) {
1186 printk("%s: ", mdname(mddev));
1188 ITERATE_RDEV(mddev,rdev,tmp2)
1189 printk("<%s>", bdevname(rdev->bdev,b));
1192 ITERATE_RDEV(mddev,rdev,tmp2)
1195 printk("md: **********************************\n");
1200 static int write_disk_sb(mdk_rdev_t * rdev)
1202 char b[BDEVNAME_SIZE];
1203 if (!rdev->sb_loaded) {
1212 dprintk(KERN_INFO "(write) %s's sb offset: %llu\n",
1213 bdevname(rdev->bdev,b),
1214 (unsigned long long)rdev->sb_offset);
1216 if (sync_page_io(rdev->bdev, rdev->sb_offset<<1, MD_SB_BYTES, rdev->sb_page, WRITE))
1219 printk("md: write_disk_sb failed for device %s\n",
1220 bdevname(rdev->bdev,b));
1224 static void sync_sbs(mddev_t * mddev)
1227 struct list_head *tmp;
1229 ITERATE_RDEV(mddev,rdev,tmp) {
1230 super_types[mddev->major_version].
1231 sync_super(mddev, rdev);
1232 rdev->sb_loaded = 1;
1236 static void md_update_sb(mddev_t * mddev)
1238 int err, count = 100;
1239 struct list_head *tmp;
1242 mddev->sb_dirty = 0;
1244 mddev->utime = get_seconds();
1247 if (!mddev->events) {
1249 * oops, this 64-bit counter should never wrap.
1250 * Either we are in around ~1 trillion A.C., assuming
1251 * 1 reboot per second, or we have a bug:
1259 * do not write anything to disk if using
1260 * nonpersistent superblocks
1262 if (!mddev->persistent)
1266 "md: updating %s RAID superblock on device (in sync %d)\n",
1267 mdname(mddev),mddev->in_sync);
1270 ITERATE_RDEV(mddev,rdev,tmp) {
1271 char b[BDEVNAME_SIZE];
1272 dprintk(KERN_INFO "md: ");
1274 dprintk("(skipping faulty ");
1276 dprintk("%s ", bdevname(rdev->bdev,b));
1277 if (!rdev->faulty) {
1278 err += write_disk_sb(rdev);
1281 if (!err && mddev->level == LEVEL_MULTIPATH)
1282 /* only need to write one superblock... */
1287 printk(KERN_ERR "md: errors occurred during superblock"
1288 " update, repeating\n");
1292 "md: excessive errors occurred during superblock update, exiting\n");
1297 * Import a device. If 'super_format' >= 0, then sanity check the superblock
1299 * mark the device faulty if:
1301 * - the device is nonexistent (zero size)
1302 * - the device has no valid superblock
1304 * a faulty rdev _never_ has rdev->sb set.
1306 static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_minor)
1308 char b[BDEVNAME_SIZE];
1313 rdev = (mdk_rdev_t *) kmalloc(sizeof(*rdev), GFP_KERNEL);
1315 printk(KERN_ERR "md: could not alloc mem for %s!\n",
1316 __bdevname(newdev, b));
1317 return ERR_PTR(-ENOMEM);
1319 memset(rdev, 0, sizeof(*rdev));
1321 if ((err = alloc_disk_sb(rdev)))
1324 err = lock_rdev(rdev, newdev);
1326 printk(KERN_ERR "md: could not lock %s.\n",
1327 __bdevname(newdev, b));
1333 rdev->data_offset = 0;
1334 atomic_set(&rdev->nr_pending, 0);
1336 size = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS;
1339 "md: %s has zero or unknown size, marking faulty!\n",
1340 bdevname(rdev->bdev,b));
1345 if (super_format >= 0) {
1346 err = super_types[super_format].
1347 load_super(rdev, NULL, super_minor);
1348 if (err == -EINVAL) {
1350 "md: %s has invalid sb, not importing!\n",
1351 bdevname(rdev->bdev,b));
1356 "md: could not read %s's sb, not importing!\n",
1357 bdevname(rdev->bdev,b));
1361 INIT_LIST_HEAD(&rdev->same_set);
1366 if (rdev->sb_page) {
1372 return ERR_PTR(err);
1376 * Check a full RAID array for plausibility
1380 static int analyze_sbs(mddev_t * mddev)
1383 struct list_head *tmp;
1384 mdk_rdev_t *rdev, *freshest;
1385 char b[BDEVNAME_SIZE];
1388 ITERATE_RDEV(mddev,rdev,tmp)
1389 switch (super_types[mddev->major_version].
1390 load_super(rdev, freshest, mddev->minor_version)) {
1398 "md: fatal superblock inconsistency in %s"
1399 " -- removing from array\n",
1400 bdevname(rdev->bdev,b));
1401 kick_rdev_from_array(rdev);
1405 super_types[mddev->major_version].
1406 validate_super(mddev, freshest);
1409 ITERATE_RDEV(mddev,rdev,tmp) {
1410 if (rdev != freshest)
1411 if (super_types[mddev->major_version].
1412 validate_super(mddev, rdev)) {
1413 printk(KERN_WARNING "md: kicking non-fresh %s"
1415 bdevname(rdev->bdev,b));
1416 kick_rdev_from_array(rdev);
1419 if (mddev->level == LEVEL_MULTIPATH) {
1420 rdev->desc_nr = i++;
1421 rdev->raid_disk = rdev->desc_nr;
1428 * Check if we can support this RAID array
1430 if (mddev->major_version != MD_MAJOR_VERSION ||
1431 mddev->minor_version > MD_MINOR_VERSION) {
1433 "md: %s: unsupported raid array version %d.%d.%d\n",
1434 mdname(mddev), mddev->major_version,
1435 mddev->minor_version, mddev->patch_version);
1439 if ((mddev->recovery_cp != MaxSector) &&
1440 ((mddev->level == 1) ||
1441 ((mddev->level >= 4) && (mddev->level <= 6))))
1442 printk(KERN_ERR "md: %s: raid array is not clean"
1443 " -- starting background reconstruction\n",
1453 static struct kobject *md_probe(dev_t dev, int *part, void *data)
1455 static DECLARE_MUTEX(disks_sem);
1456 mddev_t *mddev = mddev_find(dev);
1457 struct gendisk *disk;
1458 int partitioned = (MAJOR(dev) != MD_MAJOR);
1459 int shift = partitioned ? MdpMinorShift : 0;
1460 int unit = MINOR(dev) >> shift;
1466 if (mddev->gendisk) {
1471 disk = alloc_disk(1 << shift);
1477 disk->major = MAJOR(dev);
1478 disk->first_minor = unit << shift;
1480 sprintf(disk->disk_name, "md_d%d", unit);
1482 sprintf(disk->disk_name, "md%d", unit);
1483 disk->fops = &md_fops;
1484 disk->private_data = mddev;
1485 disk->queue = mddev->queue;
1487 mddev->gendisk = disk;
1492 void md_wakeup_thread(mdk_thread_t *thread);
1494 static void md_safemode_timeout(unsigned long data)
1496 mddev_t *mddev = (mddev_t *) data;
1498 mddev->safemode = 1;
1499 md_wakeup_thread(mddev->thread);
1503 static int do_md_run(mddev_t * mddev)
1507 struct list_head *tmp;
1509 struct gendisk *disk;
1510 char b[BDEVNAME_SIZE];
1512 if (list_empty(&mddev->disks)) {
1521 * Analyze all RAID superblock(s)
1523 if (!mddev->raid_disks && analyze_sbs(mddev)) {
1528 chunk_size = mddev->chunk_size;
1529 pnum = level_to_pers(mddev->level);
1531 if ((pnum != MULTIPATH) && (pnum != RAID1)) {
1534 * 'default chunksize' in the old md code used to
1535 * be PAGE_SIZE, baaad.
1536 * we abort here to be on the safe side. We don't
1537 * want to continue the bad practice.
1540 "no chunksize specified, see 'man raidtab'\n");
1543 if (chunk_size > MAX_CHUNK_SIZE) {
1544 printk(KERN_ERR "too big chunk_size: %d > %d\n",
1545 chunk_size, MAX_CHUNK_SIZE);
1549 * chunk-size has to be a power of 2 and multiples of PAGE_SIZE
1551 if ( (1 << ffz(~chunk_size)) != chunk_size) {
1555 if (chunk_size < PAGE_SIZE) {
1556 printk(KERN_ERR "too small chunk_size: %d < %ld\n",
1557 chunk_size, PAGE_SIZE);
1561 /* devices must have minimum size of one chunk */
1562 ITERATE_RDEV(mddev,rdev,tmp) {
1565 if (rdev->size < chunk_size / 1024) {
1567 "md: Dev %s smaller than chunk_size:"
1569 bdevname(rdev->bdev,b),
1570 (unsigned long long)rdev->size,
1577 if (pnum >= MAX_PERSONALITY) {
1585 request_module("md-personality-%d", pnum);
1590 * Drop all container device buffers, from now on
1591 * the only valid external interface is through the md
1593 * Also find largest hardsector size
1595 ITERATE_RDEV(mddev,rdev,tmp) {
1598 sync_blockdev(rdev->bdev);
1599 invalidate_bdev(rdev->bdev, 0);
1602 md_probe(mddev->unit, NULL, NULL);
1603 disk = mddev->gendisk;
1607 spin_lock(&pers_lock);
1608 if (!pers[pnum] || !try_module_get(pers[pnum]->owner)) {
1609 spin_unlock(&pers_lock);
1610 printk(KERN_WARNING "md: personality %d is not loaded!\n",
1615 mddev->pers = pers[pnum];
1616 spin_unlock(&pers_lock);
1618 err = mddev->pers->run(mddev);
1620 printk(KERN_ERR "md: pers->run() failed ...\n");
1621 module_put(mddev->pers->owner);
1625 atomic_set(&mddev->writes_pending,0);
1626 mddev->safemode = 0;
1627 mddev->safemode_timer.function = md_safemode_timeout;
1628 mddev->safemode_timer.data = (unsigned long) mddev;
1629 mddev->safemode_delay = (20 * HZ)/1000 +1; /* 20 msec delay */
1632 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
1634 if (mddev->sb_dirty)
1635 md_update_sb(mddev);
1637 set_capacity(disk, mddev->array_size<<1);
1639 /* If we call blk_queue_make_request here, it will
1640 * re-initialise max_sectors etc which may have been
1641 * refined inside -> run. So just set the bits we need to set.
1642 * Most initialisation happended when we called
1643 * blk_queue_make_request(..., md_fail_request)
1646 mddev->queue->queuedata = mddev;
1647 mddev->queue->make_request_fn = mddev->pers->make_request;
1653 static int restart_array(mddev_t *mddev)
1655 struct gendisk *disk = mddev->gendisk;
1659 * Complain if it has no devices
1662 if (list_empty(&mddev->disks))
1670 mddev->safemode = 0;
1672 set_disk_ro(disk, 0);
1674 printk(KERN_INFO "md: %s switched to read-write mode.\n",
1677 * Kick recovery or resync if necessary
1679 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
1680 md_wakeup_thread(mddev->thread);
1683 printk(KERN_ERR "md: %s has no personality assigned.\n",
1692 static int do_md_stop(mddev_t * mddev, int ro)
1695 struct gendisk *disk = mddev->gendisk;
1698 if (atomic_read(&mddev->active)>2) {
1699 printk("md: %s still in use.\n",mdname(mddev));
1703 if (mddev->sync_thread) {
1704 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
1705 md_unregister_thread(mddev->sync_thread);
1706 mddev->sync_thread = NULL;
1709 del_timer_sync(&mddev->safemode_timer);
1711 invalidate_partition(disk, 0);
1720 set_disk_ro(disk, 0);
1721 blk_queue_make_request(mddev->queue, md_fail_request);
1722 mddev->pers->stop(mddev);
1723 module_put(mddev->pers->owner);
1728 if (!mddev->in_sync) {
1729 /* mark array as shutdown cleanly */
1731 md_update_sb(mddev);
1734 set_disk_ro(disk, 1);
1737 * Free resources if final stop
1740 struct gendisk *disk;
1741 printk(KERN_INFO "md: %s stopped.\n", mdname(mddev));
1743 export_array(mddev);
1745 mddev->array_size = 0;
1746 disk = mddev->gendisk;
1748 set_capacity(disk, 0);
1751 printk(KERN_INFO "md: %s switched to read-only mode.\n",
1758 static void autorun_array(mddev_t *mddev)
1761 struct list_head *tmp;
1764 if (list_empty(&mddev->disks)) {
1769 printk(KERN_INFO "md: running: ");
1771 ITERATE_RDEV(mddev,rdev,tmp) {
1772 char b[BDEVNAME_SIZE];
1773 printk("<%s>", bdevname(rdev->bdev,b));
1777 err = do_md_run (mddev);
1779 printk(KERN_WARNING "md :do_md_run() returned %d\n", err);
1780 do_md_stop (mddev, 0);
1785 * lets try to run arrays based on all disks that have arrived
1786 * until now. (those are in pending_raid_disks)
1788 * the method: pick the first pending disk, collect all disks with
1789 * the same UUID, remove all from the pending list and put them into
1790 * the 'same_array' list. Then order this list based on superblock
1791 * update time (freshest comes first), kick out 'old' disks and
1792 * compare superblocks. If everything's fine then run it.
1794 * If "unit" is allocated, then bump its reference count
1796 static void autorun_devices(int part)
1798 struct list_head candidates;
1799 struct list_head *tmp;
1800 mdk_rdev_t *rdev0, *rdev;
1802 char b[BDEVNAME_SIZE];
1804 printk(KERN_INFO "md: autorun ...\n");
1805 while (!list_empty(&pending_raid_disks)) {
1807 rdev0 = list_entry(pending_raid_disks.next,
1808 mdk_rdev_t, same_set);
1810 printk(KERN_INFO "md: considering %s ...\n",
1811 bdevname(rdev0->bdev,b));
1812 INIT_LIST_HEAD(&candidates);
1813 ITERATE_RDEV_PENDING(rdev,tmp)
1814 if (super_90_load(rdev, rdev0, 0) >= 0) {
1815 printk(KERN_INFO "md: adding %s ...\n",
1816 bdevname(rdev->bdev,b));
1817 list_move(&rdev->same_set, &candidates);
1820 * now we have a set of devices, with all of them having
1821 * mostly sane superblocks. It's time to allocate the
1824 if (rdev0->preferred_minor < 0 || rdev0->preferred_minor >= MAX_MD_DEVS) {
1825 printk(KERN_INFO "md: unit number in %s is bad: %d\n",
1826 bdevname(rdev0->bdev, b), rdev0->preferred_minor);
1830 dev = MKDEV(mdp_major,
1831 rdev0->preferred_minor << MdpMinorShift);
1833 dev = MKDEV(MD_MAJOR, rdev0->preferred_minor);
1835 md_probe(dev, NULL, NULL);
1836 mddev = mddev_find(dev);
1839 "md: cannot allocate memory for md drive.\n");
1842 if (mddev_lock(mddev))
1843 printk(KERN_WARNING "md: %s locked, cannot run\n",
1845 else if (mddev->raid_disks || mddev->major_version
1846 || !list_empty(&mddev->disks)) {
1848 "md: %s already running, cannot run %s\n",
1849 mdname(mddev), bdevname(rdev0->bdev,b));
1850 mddev_unlock(mddev);
1852 printk(KERN_INFO "md: created %s\n", mdname(mddev));
1853 ITERATE_RDEV_GENERIC(candidates,rdev,tmp) {
1854 list_del_init(&rdev->same_set);
1855 if (bind_rdev_to_array(rdev, mddev))
1858 autorun_array(mddev);
1859 mddev_unlock(mddev);
1861 /* on success, candidates will be empty, on error
1864 ITERATE_RDEV_GENERIC(candidates,rdev,tmp)
1868 printk(KERN_INFO "md: ... autorun DONE.\n");
1872 * import RAID devices based on one partition
1873 * if possible, the array gets run as well.
1876 static int autostart_array(dev_t startdev)
1878 char b[BDEVNAME_SIZE];
1879 int err = -EINVAL, i;
1880 mdp_super_t *sb = NULL;
1881 mdk_rdev_t *start_rdev = NULL, *rdev;
1883 start_rdev = md_import_device(startdev, 0, 0);
1884 if (IS_ERR(start_rdev)) {
1885 printk(KERN_WARNING "md: could not import %s!\n",
1886 __bdevname(startdev, b));
1890 /* NOTE: this can only work for 0.90.0 superblocks */
1891 sb = (mdp_super_t*)page_address(start_rdev->sb_page);
1892 if (sb->major_version != 0 ||
1893 sb->minor_version != 90 ) {
1894 printk(KERN_WARNING "md: can only autostart 0.90.0 arrays\n");
1895 export_rdev(start_rdev);
1899 if (start_rdev->faulty) {
1901 "md: can not autostart based on faulty %s!\n",
1902 bdevname(start_rdev->bdev,b));
1903 export_rdev(start_rdev);
1906 list_add(&start_rdev->same_set, &pending_raid_disks);
1908 for (i = 0; i < MD_SB_DISKS; i++) {
1909 mdp_disk_t *desc = sb->disks + i;
1910 dev_t dev = MKDEV(desc->major, desc->minor);
1914 if (dev == startdev)
1916 if (MAJOR(dev) != desc->major || MINOR(dev) != desc->minor)
1918 rdev = md_import_device(dev, 0, 0);
1920 printk(KERN_WARNING "md: could not import %s,"
1921 " trying to run array nevertheless.\n",
1922 __bdevname(dev, b));
1925 list_add(&rdev->same_set, &pending_raid_disks);
1929 * possibly return codes
1937 static int get_version(void * arg)
1941 ver.major = MD_MAJOR_VERSION;
1942 ver.minor = MD_MINOR_VERSION;
1943 ver.patchlevel = MD_PATCHLEVEL_VERSION;
1945 if (copy_to_user(arg, &ver, sizeof(ver)))
1951 static int get_array_info(mddev_t * mddev, void * arg)
1953 mdu_array_info_t info;
1954 int nr,working,active,failed,spare;
1956 struct list_head *tmp;
1958 nr=working=active=failed=spare=0;
1959 ITERATE_RDEV(mddev,rdev,tmp) {
1972 info.major_version = mddev->major_version;
1973 info.minor_version = mddev->minor_version;
1974 info.patch_version = 1;
1975 info.ctime = mddev->ctime;
1976 info.level = mddev->level;
1977 info.size = mddev->size;
1979 info.raid_disks = mddev->raid_disks;
1980 info.md_minor = mddev->md_minor;
1981 info.not_persistent= !mddev->persistent;
1983 info.utime = mddev->utime;
1986 info.state = (1<<MD_SB_CLEAN);
1987 info.active_disks = active;
1988 info.working_disks = working;
1989 info.failed_disks = failed;
1990 info.spare_disks = spare;
1992 info.layout = mddev->layout;
1993 info.chunk_size = mddev->chunk_size;
1995 if (copy_to_user(arg, &info, sizeof(info)))
2001 static int get_disk_info(mddev_t * mddev, void * arg)
2003 mdu_disk_info_t info;
2007 if (copy_from_user(&info, arg, sizeof(info)))
2012 rdev = find_rdev_nr(mddev, nr);
2014 info.major = MAJOR(rdev->bdev->bd_dev);
2015 info.minor = MINOR(rdev->bdev->bd_dev);
2016 info.raid_disk = rdev->raid_disk;
2019 info.state |= (1<<MD_DISK_FAULTY);
2020 else if (rdev->in_sync) {
2021 info.state |= (1<<MD_DISK_ACTIVE);
2022 info.state |= (1<<MD_DISK_SYNC);
2025 info.major = info.minor = 0;
2026 info.raid_disk = -1;
2027 info.state = (1<<MD_DISK_REMOVED);
2030 if (copy_to_user(arg, &info, sizeof(info)))
2036 static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
2038 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
2040 dev_t dev = MKDEV(info->major,info->minor);
2042 if (info->major != MAJOR(dev) || info->minor != MINOR(dev))
2045 if (!mddev->raid_disks) {
2047 /* expecting a device which has a superblock */
2048 rdev = md_import_device(dev, mddev->major_version, mddev->minor_version);
2051 "md: md_import_device returned %ld\n",
2053 return PTR_ERR(rdev);
2055 if (!list_empty(&mddev->disks)) {
2056 mdk_rdev_t *rdev0 = list_entry(mddev->disks.next,
2057 mdk_rdev_t, same_set);
2058 int err = super_types[mddev->major_version]
2059 .load_super(rdev, rdev0, mddev->minor_version);
2062 "md: %s has different UUID to %s\n",
2063 bdevname(rdev->bdev,b),
2064 bdevname(rdev0->bdev,b2));
2069 err = bind_rdev_to_array(rdev, mddev);
2076 * add_new_disk can be used once the array is assembled
2077 * to add "hot spares". They must already have a superblock
2082 if (!mddev->pers->hot_add_disk) {
2084 "%s: personality does not support diskops!\n",
2088 rdev = md_import_device(dev, mddev->major_version,
2089 mddev->minor_version);
2092 "md: md_import_device returned %ld\n",
2094 return PTR_ERR(rdev);
2096 rdev->in_sync = 0; /* just to be sure */
2097 rdev->raid_disk = -1;
2098 err = bind_rdev_to_array(rdev, mddev);
2102 md_wakeup_thread(mddev->thread);
2106 /* otherwise, add_new_disk is only allowed
2107 * for major_version==0 superblocks
2109 if (mddev->major_version != 0) {
2110 printk(KERN_WARNING "%s: ADD_NEW_DISK not supported\n",
2115 if (!(info->state & (1<<MD_DISK_FAULTY))) {
2117 rdev = md_import_device (dev, -1, 0);
2120 "md: error, md_import_device() returned %ld\n",
2122 return PTR_ERR(rdev);
2124 rdev->desc_nr = info->number;
2125 if (info->raid_disk < mddev->raid_disks)
2126 rdev->raid_disk = info->raid_disk;
2128 rdev->raid_disk = -1;
2131 if (rdev->raid_disk < mddev->raid_disks)
2132 rdev->in_sync = (info->state & (1<<MD_DISK_SYNC));
2136 err = bind_rdev_to_array(rdev, mddev);
2142 if (!mddev->persistent) {
2143 printk(KERN_INFO "md: nonpersistent superblock ...\n");
2144 rdev->sb_offset = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS;
2146 rdev->sb_offset = calc_dev_sboffset(rdev->bdev);
2147 rdev->size = calc_dev_size(rdev, mddev->chunk_size);
2149 if (!mddev->size || (mddev->size > rdev->size))
2150 mddev->size = rdev->size;
2156 static int hot_generate_error(mddev_t * mddev, dev_t dev)
2158 char b[BDEVNAME_SIZE];
2159 struct request_queue *q;
2165 printk(KERN_INFO "md: trying to generate %s error in %s ... \n",
2166 __bdevname(dev, b), mdname(mddev));
2168 rdev = find_rdev(mddev, dev);
2170 /* MD_BUG(); */ /* like hell - it's not a driver bug */
2174 if (rdev->desc_nr == -1) {
2181 q = bdev_get_queue(rdev->bdev);
2186 printk(KERN_INFO "md: okay, generating error!\n");
2187 // q->oneshot_error = 1; // disabled for now
2192 static int hot_remove_disk(mddev_t * mddev, dev_t dev)
2194 char b[BDEVNAME_SIZE];
2200 printk(KERN_INFO "md: trying to remove %s from %s ... \n",
2201 __bdevname(dev, b), mdname(mddev));
2203 rdev = find_rdev(mddev, dev);
2207 if (rdev->raid_disk >= 0)
2210 kick_rdev_from_array(rdev);
2211 md_update_sb(mddev);
2215 printk(KERN_WARNING "md: cannot remove active disk %s from %s ... \n",
2216 bdevname(rdev->bdev,b), mdname(mddev));
2220 static int hot_add_disk(mddev_t * mddev, dev_t dev)
2222 char b[BDEVNAME_SIZE];
2230 printk(KERN_INFO "md: trying to hot-add %s to %s ... \n",
2231 __bdevname(dev, b), mdname(mddev));
2233 if (mddev->major_version != 0) {
2234 printk(KERN_WARNING "%s: HOT_ADD may only be used with"
2235 " version-0 superblocks.\n",
2239 if (!mddev->pers->hot_add_disk) {
2241 "%s: personality does not support diskops!\n",
2246 rdev = md_import_device (dev, -1, 0);
2249 "md: error, md_import_device() returned %ld\n",
2254 rdev->sb_offset = calc_dev_sboffset(rdev->bdev);
2255 size = calc_dev_size(rdev, mddev->chunk_size);
2258 if (size < mddev->size) {
2260 "%s: disk size %llu blocks < array size %llu\n",
2261 mdname(mddev), (unsigned long long)size,
2262 (unsigned long long)mddev->size);
2269 "md: can not hot-add faulty %s disk to %s!\n",
2270 bdevname(rdev->bdev,b), mdname(mddev));
2276 bind_rdev_to_array(rdev, mddev);
2279 * The rest should better be atomic, we can have disk failures
2280 * noticed in interrupt contexts ...
2283 if (rdev->desc_nr == mddev->max_disks) {
2284 printk(KERN_WARNING "%s: can not hot-add to full array!\n",
2287 goto abort_unbind_export;
2290 rdev->raid_disk = -1;
2292 md_update_sb(mddev);
2295 * Kick recovery, maybe this spare has to be added to the
2296 * array immediately.
2298 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
2299 md_wakeup_thread(mddev->thread);
2303 abort_unbind_export:
2304 unbind_rdev_from_array(rdev);
2312 * set_array_info is used two different ways
2313 * The original usage is when creating a new array.
2314 * In this usage, raid_disks is > 0 and it together with
2315 * level, size, not_persistent,layout,chunksize determine the
2316 * shape of the array.
2317 * This will always create an array with a type-0.90.0 superblock.
2318 * The newer usage is when assembling an array.
2319 * In this case raid_disks will be 0, and the major_version field is
2320 * use to determine which style super-blocks are to be found on the devices.
2321 * The minor and patch _version numbers are also kept incase the
2322 * super_block handler wishes to interpret them.
2324 static int set_array_info(mddev_t * mddev, mdu_array_info_t *info)
2327 if (info->raid_disks == 0) {
2328 /* just setting version number for superblock loading */
2329 if (info->major_version < 0 ||
2330 info->major_version >= sizeof(super_types)/sizeof(super_types[0]) ||
2331 super_types[info->major_version].name == NULL) {
2332 /* maybe try to auto-load a module? */
2334 "md: superblock version %d not known\n",
2335 info->major_version);
2338 mddev->major_version = info->major_version;
2339 mddev->minor_version = info->minor_version;
2340 mddev->patch_version = info->patch_version;
2343 mddev->major_version = MD_MAJOR_VERSION;
2344 mddev->minor_version = MD_MINOR_VERSION;
2345 mddev->patch_version = MD_PATCHLEVEL_VERSION;
2346 mddev->ctime = get_seconds();
2348 mddev->level = info->level;
2349 mddev->size = info->size;
2350 mddev->raid_disks = info->raid_disks;
2351 /* don't set md_minor, it is determined by which /dev/md* was
2354 if (info->state & (1<<MD_SB_CLEAN))
2355 mddev->recovery_cp = MaxSector;
2357 mddev->recovery_cp = 0;
2358 mddev->persistent = ! info->not_persistent;
2360 mddev->layout = info->layout;
2361 mddev->chunk_size = info->chunk_size;
2363 mddev->max_disks = MD_SB_DISKS;
2365 mddev->sb_dirty = 1;
2368 * Generate a 128 bit UUID
2370 get_random_bytes(mddev->uuid, 16);
2375 static int set_disk_faulty(mddev_t *mddev, dev_t dev)
2379 rdev = find_rdev(mddev, dev);
2383 md_error(mddev, rdev);
2387 static int md_ioctl(struct inode *inode, struct file *file,
2388 unsigned int cmd, unsigned long arg)
2390 char b[BDEVNAME_SIZE];
2392 struct hd_geometry *loc = (struct hd_geometry *) arg;
2393 mddev_t *mddev = NULL;
2395 if (!capable(CAP_SYS_ADMIN))
2399 * Commands dealing with the RAID driver but not any
2405 err = get_version((void *)arg);
2408 case PRINT_RAID_DEBUG:
2416 autostart_arrays(arg);
2423 * Commands creating/starting a new array:
2426 mddev = inode->i_bdev->bd_disk->private_data;
2434 if (cmd == START_ARRAY) {
2435 /* START_ARRAY doesn't need to lock the array as autostart_array
2436 * does the locking, and it could even be a different array
2441 "md: %s(pid %d) used deprecated START_ARRAY ioctl. "
2442 "This will not be supported beyond 2.6\n",
2443 current->comm, current->pid);
2446 err = autostart_array(new_decode_dev(arg));
2448 printk(KERN_WARNING "md: autostart %s failed!\n",
2449 __bdevname(arg, b));
2455 err = mddev_lock(mddev);
2458 "md: ioctl lock interrupted, reason %d, cmd %d\n",
2465 case SET_ARRAY_INFO:
2467 if (!list_empty(&mddev->disks)) {
2469 "md: array %s already has disks!\n",
2474 if (mddev->raid_disks) {
2476 "md: array %s already initialised!\n",
2482 mdu_array_info_t info;
2484 memset(&info, 0, sizeof(info));
2485 else if (copy_from_user(&info, (void*)arg, sizeof(info))) {
2489 err = set_array_info(mddev, &info);
2491 printk(KERN_WARNING "md: couldn't set"
2492 " array info. %d\n", err);
2502 * Commands querying/configuring an existing array:
2504 /* if we are initialised yet, only ADD_NEW_DISK or STOP_ARRAY is allowed */
2505 if (!mddev->raid_disks && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY && cmd != RUN_ARRAY) {
2511 * Commands even a read-only array can execute:
2515 case GET_ARRAY_INFO:
2516 err = get_array_info(mddev, (void *)arg);
2520 err = get_disk_info(mddev, (void *)arg);
2523 case RESTART_ARRAY_RW:
2524 err = restart_array(mddev);
2528 err = do_md_stop (mddev, 0);
2532 err = do_md_stop (mddev, 1);
2536 * We have a problem here : there is no easy way to give a CHS
2537 * virtual geometry. We currently pretend that we have a 2 heads
2538 * 4 sectors (with a BIG number of cylinders...). This drives
2539 * dosfs just mad... ;-)
2546 err = put_user (2, (char *) &loc->heads);
2549 err = put_user (4, (char *) &loc->sectors);
2552 err = put_user(get_capacity(mddev->gendisk)/8,
2553 (short *) &loc->cylinders);
2556 err = put_user (get_start_sect(inode->i_bdev),
2557 (long *) &loc->start);
2562 * The remaining ioctls are changing the state of the
2563 * superblock, so we do not allow read-only arrays
2575 mdu_disk_info_t info;
2576 if (copy_from_user(&info, (void*)arg, sizeof(info)))
2579 err = add_new_disk(mddev, &info);
2582 case HOT_GENERATE_ERROR:
2583 err = hot_generate_error(mddev, new_decode_dev(arg));
2585 case HOT_REMOVE_DISK:
2586 err = hot_remove_disk(mddev, new_decode_dev(arg));
2590 err = hot_add_disk(mddev, new_decode_dev(arg));
2593 case SET_DISK_FAULTY:
2594 err = set_disk_faulty(mddev, new_decode_dev(arg));
2598 err = do_md_run (mddev);
2602 if (_IOC_TYPE(cmd) == MD_MAJOR)
2603 printk(KERN_WARNING "md: %s(pid %d) used"
2604 " obsolete MD ioctl, upgrade your"
2605 " software to use new ictls.\n",
2606 current->comm, current->pid);
2613 mddev_unlock(mddev);
2623 static int md_open(struct inode *inode, struct file *file)
2626 * Succeed if we can lock the mddev, which confirms that
2627 * it isn't being stopped right now.
2629 mddev_t *mddev = inode->i_bdev->bd_disk->private_data;
2632 if ((err = mddev_lock(mddev)))
2637 mddev_unlock(mddev);
2639 check_disk_change(inode->i_bdev);
2644 static int md_release(struct inode *inode, struct file * file)
2646 mddev_t *mddev = inode->i_bdev->bd_disk->private_data;
2655 static int md_media_changed(struct gendisk *disk)
2657 mddev_t *mddev = disk->private_data;
2659 return mddev->changed;
2662 static int md_revalidate(struct gendisk *disk)
2664 mddev_t *mddev = disk->private_data;
2669 static struct block_device_operations md_fops =
2671 .owner = THIS_MODULE,
2673 .release = md_release,
2675 .media_changed = md_media_changed,
2676 .revalidate_disk= md_revalidate,
2679 int md_thread(void * arg)
2681 mdk_thread_t *thread = arg;
2689 daemonize(thread->name, mdname(thread->mddev));
2691 current->exit_signal = SIGCHLD;
2692 allow_signal(SIGKILL);
2693 thread->tsk = current;
2696 * md_thread is a 'system-thread', it's priority should be very
2697 * high. We avoid resource deadlocks individually in each
2698 * raid personality. (RAID5 does preallocation) We also use RR and
2699 * the very same RT priority as kswapd, thus we will never get
2700 * into a priority inversion deadlock.
2702 * we definitely have to have equal or higher priority than
2703 * bdflush, otherwise bdflush will deadlock if there are too
2704 * many dirty RAID5 blocks.
2708 complete(thread->event);
2709 while (thread->run) {
2710 void (*run)(mddev_t *);
2712 wait_event_interruptible(thread->wqueue,
2713 test_bit(THREAD_WAKEUP, &thread->flags));
2714 if (current->flags & PF_FREEZE)
2715 refrigerator(PF_FREEZE);
2717 clear_bit(THREAD_WAKEUP, &thread->flags);
2723 if (signal_pending(current))
2724 flush_signals(current);
2726 complete(thread->event);
2730 void md_wakeup_thread(mdk_thread_t *thread)
2733 dprintk("md: waking up MD thread %s.\n", thread->tsk->comm);
2734 set_bit(THREAD_WAKEUP, &thread->flags);
2735 wake_up(&thread->wqueue);
2739 mdk_thread_t *md_register_thread(void (*run) (mddev_t *), mddev_t *mddev,
2742 mdk_thread_t *thread;
2744 struct completion event;
2746 thread = (mdk_thread_t *) kmalloc
2747 (sizeof(mdk_thread_t), GFP_KERNEL);
2751 memset(thread, 0, sizeof(mdk_thread_t));
2752 init_waitqueue_head(&thread->wqueue);
2754 init_completion(&event);
2755 thread->event = &event;
2757 thread->mddev = mddev;
2758 thread->name = name;
2759 ret = kernel_thread(md_thread, thread, 0);
2764 wait_for_completion(&event);
2768 void md_interrupt_thread(mdk_thread_t *thread)
2774 dprintk("interrupting MD-thread pid %d\n", thread->tsk->pid);
2775 send_sig(SIGKILL, thread->tsk, 1);
2778 void md_unregister_thread(mdk_thread_t *thread)
2780 struct completion event;
2782 init_completion(&event);
2784 thread->event = &event;
2786 thread->name = NULL;
2787 md_interrupt_thread(thread);
2788 wait_for_completion(&event);
2792 void md_error(mddev_t *mddev, mdk_rdev_t *rdev)
2799 if (!rdev || rdev->faulty)
2802 dprintk("md_error dev:%s, rdev:(%d:%d), (caller: %p,%p,%p,%p).\n",
2804 MAJOR(rdev->bdev->bd_dev), MINOR(rdev->bdev->bd_dev),
2805 __builtin_return_address(0),__builtin_return_address(1),
2806 __builtin_return_address(2),__builtin_return_address(3));
2808 if (!mddev->pers->error_handler)
2810 mddev->pers->error_handler(mddev,rdev);
2811 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
2812 md_wakeup_thread(mddev->thread);
2815 /* seq_file implementation /proc/mdstat */
2817 static void status_unused(struct seq_file *seq)
2821 struct list_head *tmp;
2823 seq_printf(seq, "unused devices: ");
2825 ITERATE_RDEV_PENDING(rdev,tmp) {
2826 char b[BDEVNAME_SIZE];
2828 seq_printf(seq, "%s ",
2829 bdevname(rdev->bdev,b));
2832 seq_printf(seq, "<none>");
2834 seq_printf(seq, "\n");
2838 static void status_resync(struct seq_file *seq, mddev_t * mddev)
2840 unsigned long max_blocks, resync, res, dt, db, rt;
2842 resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active))/2;
2843 max_blocks = mddev->size;
2846 * Should not happen.
2852 res = (resync/1024)*1000/(max_blocks/1024 + 1);
2854 int i, x = res/50, y = 20-x;
2855 seq_printf(seq, "[");
2856 for (i = 0; i < x; i++)
2857 seq_printf(seq, "=");
2858 seq_printf(seq, ">");
2859 for (i = 0; i < y; i++)
2860 seq_printf(seq, ".");
2861 seq_printf(seq, "] ");
2863 seq_printf(seq, " %s =%3lu.%lu%% (%lu/%lu)",
2864 (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ?
2865 "resync" : "recovery"),
2866 res/10, res % 10, resync, max_blocks);
2869 * We do not want to overflow, so the order of operands and
2870 * the * 100 / 100 trick are important. We do a +1 to be
2871 * safe against division by zero. We only estimate anyway.
2873 * dt: time from mark until now
2874 * db: blocks written from mark until now
2875 * rt: remaining time
2877 dt = ((jiffies - mddev->resync_mark) / HZ);
2879 db = resync - (mddev->resync_mark_cnt/2);
2880 rt = (dt * ((max_blocks-resync) / (db/100+1)))/100;
2882 seq_printf(seq, " finish=%lu.%lumin", rt / 60, (rt % 60)/6);
2884 seq_printf(seq, " speed=%ldK/sec", db/dt);
2887 static void *md_seq_start(struct seq_file *seq, loff_t *pos)
2889 struct list_head *tmp;
2899 spin_lock(&all_mddevs_lock);
2900 list_for_each(tmp,&all_mddevs)
2902 mddev = list_entry(tmp, mddev_t, all_mddevs);
2904 spin_unlock(&all_mddevs_lock);
2907 spin_unlock(&all_mddevs_lock);
2909 return (void*)2;/* tail */
2913 static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2915 struct list_head *tmp;
2916 mddev_t *next_mddev, *mddev = v;
2922 spin_lock(&all_mddevs_lock);
2924 tmp = all_mddevs.next;
2926 tmp = mddev->all_mddevs.next;
2927 if (tmp != &all_mddevs)
2928 next_mddev = mddev_get(list_entry(tmp,mddev_t,all_mddevs));
2930 next_mddev = (void*)2;
2933 spin_unlock(&all_mddevs_lock);
2941 static void md_seq_stop(struct seq_file *seq, void *v)
2945 if (mddev && v != (void*)1 && v != (void*)2)
2949 static int md_seq_show(struct seq_file *seq, void *v)
2953 struct list_head *tmp2;
2957 if (v == (void*)1) {
2958 seq_printf(seq, "Personalities : ");
2959 spin_lock(&pers_lock);
2960 for (i = 0; i < MAX_PERSONALITY; i++)
2962 seq_printf(seq, "[%s] ", pers[i]->name);
2964 spin_unlock(&pers_lock);
2965 seq_printf(seq, "\n");
2968 if (v == (void*)2) {
2973 if (mddev_lock(mddev)!=0)
2975 if (mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) {
2976 seq_printf(seq, "%s : %sactive", mdname(mddev),
2977 mddev->pers ? "" : "in");
2980 seq_printf(seq, " (read-only)");
2981 seq_printf(seq, " %s", mddev->pers->name);
2985 ITERATE_RDEV(mddev,rdev,tmp2) {
2986 char b[BDEVNAME_SIZE];
2987 seq_printf(seq, " %s[%d]",
2988 bdevname(rdev->bdev,b), rdev->desc_nr);
2990 seq_printf(seq, "(F)");
2996 if (!list_empty(&mddev->disks)) {
2998 seq_printf(seq, "\n %llu blocks",
2999 (unsigned long long)mddev->array_size);
3001 seq_printf(seq, "\n %llu blocks",
3002 (unsigned long long)size);
3006 mddev->pers->status (seq, mddev);
3007 seq_printf(seq, "\n ");
3008 if (mddev->curr_resync > 2)
3009 status_resync (seq, mddev);
3010 else if (mddev->curr_resync == 1 || mddev->curr_resync == 2)
3011 seq_printf(seq, " resync=DELAYED");
3014 seq_printf(seq, "\n");
3016 mddev_unlock(mddev);
3021 static struct seq_operations md_seq_ops = {
3022 .start = md_seq_start,
3023 .next = md_seq_next,
3024 .stop = md_seq_stop,
3025 .show = md_seq_show,
3028 static int md_seq_open(struct inode *inode, struct file *file)
3032 error = seq_open(file, &md_seq_ops);
3036 static struct file_operations md_seq_fops = {
3037 .open = md_seq_open,
3039 .llseek = seq_lseek,
3040 .release = seq_release,
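/*
 * Note on the iterator above: rather than real mddev pointers,
 * md_seq_start/next hand back two sentinel cookies - (void*)1 for the
 * "Personalities :" header line and (void*)2 for the trailing "unused
 * devices:" line - with refcounted mddevs (via mddev_get) in between,
 * and md_seq_show tests for the sentinels before touching v.  A
 * minimal sketch of the same pattern, under hypothetical my_* names:
 *
 *	static void *my_start(struct seq_file *seq, loff_t *pos)
 *	{
 *		if (*pos == 0)
 *			return (void *)1;	-- header row
 *		... find and pin the object for *pos, or
 *		return (void *)2;		-- past the end: footer row
 *	}
 *
 *	static void *my_next(struct seq_file *seq, void *v, loff_t *pos)
 *	{
 *		++*pos;
 *		... advance; return (void *)2 after the last object ...
 *	}
 */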
3043 int register_md_personality(int pnum, mdk_personality_t *p)
3045 if (pnum >= MAX_PERSONALITY) {
3047 "md: tried to install personality %s as nr %d, but max is %lu\n",
3048 p->name, pnum, (unsigned long)(MAX_PERSONALITY-1));
3052 spin_lock(&pers_lock);
3054 spin_unlock(&pers_lock);
3060 printk(KERN_INFO "md: %s personality registered as nr %d\n", p->name, pnum);
3061 spin_unlock(&pers_lock);
3065 int unregister_md_personality(int pnum)
3067 if (pnum >= MAX_PERSONALITY) {
3072 printk(KERN_INFO "md: %s personality unregistered\n", pers[pnum]->name);
3073 spin_lock(&pers_lock);
3075 spin_unlock(&pers_lock);
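/*
 * Illustrative usage (hypothetical module code): a personality
 * registers itself from its module init and unregisters on exit:
 *
 *	static int __init myraid_init(void)
 *	{
 *		return register_md_personality(MYRAID, &myraid_personality);
 *	}
 *
 *	static void __exit myraid_exit(void)
 *	{
 *		unregister_md_personality(MYRAID);
 *	}
 *
 * where MYRAID would be a personality number below MAX_PERSONALITY and
 * myraid_personality a filled-in mdk_personality_t.
 */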
3079 void md_sync_acct(mdk_rdev_t *rdev, unsigned long nr_sectors)
3081 rdev->bdev->bd_contains->bd_disk->sync_io += nr_sectors;
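/*
 * Personalities are expected to account their resync/recovery I/O
 * here as they submit it, so that is_mddev_idle() below can subtract
 * sync traffic from the gendisk statistics and only count foreign
 * I/O.  A typical (hypothetical) call site:
 *
 *	md_sync_acct(rdev, bio_sectors(bio));
 *	generic_make_request(bio);
 */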
3084 static int is_mddev_idle(mddev_t *mddev)
3087 struct list_head *tmp;
3089 unsigned long curr_events;
3092 ITERATE_RDEV(mddev,rdev,tmp) {
3093 struct gendisk *disk = rdev->bdev->bd_contains->bd_disk;
3094 curr_events = disk_stat_read(disk, read_sectors) +
3095 disk_stat_read(disk, write_sectors) -
3097 if ((curr_events - rdev->last_events) > 32) {
3098 rdev->last_events = curr_events;
3105 void md_done_sync(mddev_t *mddev, int blocks, int ok)
3107 /* another "blocks" (512-byte) blocks have been synced */
3108 atomic_sub(blocks, &mddev->recovery_active);
3109 wake_up(&mddev->recovery_wait);
3111 set_bit(MD_RECOVERY_ERR, &mddev->recovery);
3112 md_wakeup_thread(mddev->thread);
3113 /* stop recovery, signal do_sync ... */
3118 void md_write_start(mddev_t *mddev)
3120 if (!atomic_read(&mddev->writes_pending)) {
3121 mddev_lock_uninterruptible(mddev);
3122 if (mddev->in_sync) {
3124 del_timer(&mddev->safemode_timer);
3125 md_update_sb(mddev);
3127 atomic_inc(&mddev->writes_pending);
3128 mddev_unlock(mddev);
3130 atomic_inc(&mddev->writes_pending);
3133 void md_write_end(mddev_t *mddev)
3135 if (atomic_dec_and_test(&mddev->writes_pending)) {
3136 if (mddev->safemode == 2)
3137 md_wakeup_thread(mddev->thread);
3139 mod_timer(&mddev->safemode_timer, jiffies + mddev->safemode_delay);
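/*
 * Sketch of the intended pairing (hypothetical personality code):
 * every normal write is bracketed so the array is marked dirty on
 * disk before the first write lands and can be marked clean again
 * once writes drain:
 *
 *	md_write_start(mddev);	-- may sync the superblock
 *	... submit the write to the member disks ...
 *	md_write_end(mddev);	-- in the completion path; re-arms safemode
 *
 * md_write_end() itself never marks the array clean; that happens
 * later in md_enter_safemode() below.
 */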
3143 static inline void md_enter_safemode(mddev_t *mddev)
3145 if (!mddev->safemode) return;
3146 if (mddev->safemode == 2 &&
3147 (atomic_read(&mddev->writes_pending) || mddev->in_sync ||
3148 mddev->recovery_cp != MaxSector))
3149 return; /* avoid the lock */
3150 mddev_lock_uninterruptible(mddev);
3151 if (mddev->safemode && !atomic_read(&mddev->writes_pending) &&
3152 !mddev->in_sync && mddev->recovery_cp == MaxSector) {
3154 md_update_sb(mddev);
3156 mddev_unlock(mddev);
3158 if (mddev->safemode == 1)
3159 mddev->safemode = 0;
3162 void md_handle_safemode(mddev_t *mddev)
3164 if (signal_pending(current)) {
3165 printk(KERN_INFO "md: %s in immediate safe mode\n",
3167 mddev->safemode = 2;
3168 flush_signals(current);
3170 md_enter_safemode(mddev);
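/*
 * Safemode states, as used by the two functions above: 0 - off;
 * 1 - one-shot, armed from md_write_end() via the safemode timer,
 * asking for the array to be marked clean once writes drain, and
 * dropped back to 0 after md_enter_safemode() has run; 2 - immediate
 * safe mode, entered when a signal is pending, which persists and
 * re-marks the array clean whenever the last write drains.
 */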
3174 DECLARE_WAIT_QUEUE_HEAD(resync_wait);
3176 #define SYNC_MARKS 10
3177 #define SYNC_MARK_STEP (3*HZ)
3178 static void md_do_sync(mddev_t *mddev)
3181 unsigned int currspeed = 0,
3183 sector_t max_sectors,j;
3184 unsigned long mark[SYNC_MARKS];
3185 sector_t mark_cnt[SYNC_MARKS];
3187 struct list_head *tmp;
3188 sector_t last_check;
3190 /* just in case the thread restarts... */
3191 if (test_bit(MD_RECOVERY_DONE, &mddev->recovery))
3194 /* we overload curr_resync somewhat here.
3195 * 0 == not engaged in resync at all
3196 * 2 == checking that there is no conflict with another sync
3197 * 1 == like 2, but have yielded to allow conflicting resync to
3199 * other == active in resync - this many blocks
3202 mddev->curr_resync = 2;
3204 ITERATE_MDDEV(mddev2,tmp) {
3205 if (mddev2 == mddev)
3207 if (mddev2->curr_resync &&
3208 match_mddev_units(mddev,mddev2)) {
3209 printk(KERN_INFO "md: delaying resync of %s"
3210 " until %s has finished resync (they"
3211 " share one or more physical units)\n",
3212 mdname(mddev), mdname(mddev2));
3213 if (mddev < mddev2) { /* arbitrarily yield */
3214 mddev->curr_resync = 1;
3215 wake_up(&resync_wait);
3217 if (wait_event_interruptible(resync_wait,
3218 mddev2->curr_resync < mddev->curr_resync)) {
3219 flush_signals(current);
3224 if (mddev->curr_resync == 1) {
3229 } while (mddev->curr_resync < 2);
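/*
 * The "mddev < mddev2" test above is just a tie breaker: when two
 * arrays sharing a physical unit both want to resync, the one with
 * the lower address yields (curr_resync = 1) and waits, so exactly
 * one of the pair proceeds and they can neither deadlock nor thrash
 * the shared spindles.
 */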
3231 max_sectors = mddev->size << 1;
3233 printk(KERN_INFO "md: syncing RAID array %s\n", mdname(mddev));
3234 printk(KERN_INFO "md: minimum _guaranteed_ reconstruction speed:"
3235 " %d KB/sec/disc.\n", sysctl_speed_limit_min);
3236 printk(KERN_INFO "md: using maximum available idle IO bandwidth "
3237 "(but not more than %d KB/sec) for reconstruction.\n",
3238 sysctl_speed_limit_max);
3240 is_mddev_idle(mddev); /* this also initializes IO event counters */
3241 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
3242 j = mddev->recovery_cp;
3245 for (m = 0; m < SYNC_MARKS; m++) {
3250 mddev->resync_mark = mark[last_mark];
3251 mddev->resync_mark_cnt = mark_cnt[last_mark];
3254 * Tune reconstruction:
3256 window = 32*(PAGE_SIZE/512);
3257 printk(KERN_INFO "md: using %dk window, over a total of %Lu blocks.\n",
3258 window/2, (unsigned long long) max_sectors/2);
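/*
 * With 4K pages this is 32*(4096/512) = 256 sectors, so the printk
 * above reports a 128k window.
 */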
3260 atomic_set(&mddev->recovery_active, 0);
3261 init_waitqueue_head(&mddev->recovery_wait);
3266 "md: resuming recovery of %s from checkpoint.\n",
3269 while (j < max_sectors) {
3272 sectors = mddev->pers->sync_request(mddev, j, currspeed < sysctl_speed_limit_min);
3274 set_bit(MD_RECOVERY_ERR, &mddev->recovery);
3277 atomic_add(sectors, &mddev->recovery_active);
3279 if (j>1) mddev->curr_resync = j;
3281 if (last_check + window > j)
3286 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery) ||
3287 test_bit(MD_RECOVERY_ERR, &mddev->recovery))
3291 if (time_after_eq(jiffies, mark[last_mark] + SYNC_MARK_STEP)) {
3293 int next = (last_mark+1) % SYNC_MARKS;
3295 mddev->resync_mark = mark[next];
3296 mddev->resync_mark_cnt = mark_cnt[next];
3297 mark[next] = jiffies;
3298 mark_cnt[next] = j - atomic_read(&mddev->recovery_active);
3303 if (signal_pending(current)) {
3305 * got a signal, exit.
3308 "md: md_do_sync() got signal ... exiting\n");
3309 flush_signals(current);
3310 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
3315 * this loop exits only when we are either slower than
3316 * the 'hard' speed limit or the system was IO-idle for
3318 * the system might be non-idle CPU-wise, but we only care
3319 * about not overloading the IO subsystem. (things like an
3320 * e2fsck being done on the RAID array should execute fast)
3322 mddev->queue->unplug_fn(mddev->queue);
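/*
 * currspeed below is KB/sec since the last mark: sectors completed
 * since resync_mark_cnt, halved into 1K units, divided by the
 * elapsed seconds; the two "+1" terms guard against division by
 * zero and a reported speed of zero.
 */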
3325 currspeed = ((unsigned long)(j-mddev->resync_mark_cnt))/2/((jiffies-mddev->resync_mark)/HZ +1) +1;
3327 if (currspeed > sysctl_speed_limit_min) {
3328 if ((currspeed > sysctl_speed_limit_max) ||
3329 !is_mddev_idle(mddev)) {
3330 set_current_state(TASK_INTERRUPTIBLE);
3331 schedule_timeout(HZ/4);
3336 printk(KERN_INFO "md: %s: sync done.\n",mdname(mddev));
3338 * this also signals 'finished resyncing' to md_stop
3341 mddev->queue->unplug_fn(mddev->queue);
3343 wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active));
3345 /* tell personality that we are finished */
3346 mddev->pers->sync_request(mddev, max_sectors, 1);
3348 if (!test_bit(MD_RECOVERY_ERR, &mddev->recovery) &&
3349 mddev->curr_resync > 2 &&
3350 mddev->curr_resync > mddev->recovery_cp) {
3351 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
3353 "md: checkpointing recovery of %s.\n",
3355 mddev->recovery_cp = mddev->curr_resync;
3357 mddev->recovery_cp = MaxSector;
3360 md_enter_safemode(mddev);
3362 mddev->curr_resync = 0;
3363 set_bit(MD_RECOVERY_DONE, &mddev->recovery);
3364 md_wakeup_thread(mddev->thread);
3369 * This routine is regularly called by all per-raid-array threads to
3370 * deal with generic issues like resync and super-block update.
3371 * RAID personalities that don't have a thread (linear/raid0) do not
3372 * need this as they never do any recovery or update the superblock.
3374 * It does not do any resync itself, but rather "forks" off other threads
3375 * to do that as needed.
3376 * When it is determined that resync is needed, we set MD_RECOVERY_RUNNING in
3377 * "->recovery" and create a thread at ->sync_thread.
3378 * When the thread finishes it sets MD_RECOVERY_DONE (and might set MD_RECOVERY_ERR)
3379 * and wakes up this thread, which will reap it and finish up.
3380 * This thread also removes any faulty devices (with nr_pending == 0).
3382 * The overall approach is:
3383 * 1/ if the superblock needs updating, update it.
3384 * 2/ If a recovery thread is running, don't do anything else.
3385 * 3/ If recovery has finished, clean up, possibly marking spares active.
3386 * 4/ If there are any faulty devices, remove them.
3387 * 5/ If the array is degraded, try to add spare devices
3388 * 6/ If array has spares or is not in-sync, start a resync thread.
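*
* In flag terms, summarizing the code below: md_error() sets
* MD_RECOVERY_NEEDED; this function sets MD_RECOVERY_RUNNING (plus
* MD_RECOVERY_SYNC for a resync) before forking md_do_sync();
* md_do_sync() sets MD_RECOVERY_DONE when it returns, possibly with
* MD_RECOVERY_ERR or MD_RECOVERY_INTR; and this function then reaps
* the sync thread and clears ->recovery for the next round.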
3390 void md_check_recovery(mddev_t *mddev)
3393 struct list_head *rtmp;
3396 dprintk(KERN_INFO "md: recovery thread got woken up ...\n");
3402 test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) ||
3403 test_bit(MD_RECOVERY_DONE, &mddev->recovery)
3406 if (mddev_trylock(mddev)==0) {
3408 if (mddev->sb_dirty)
3409 md_update_sb(mddev);
3410 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
3411 !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) {
3412 /* resync/recovery still happening */
3413 clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
3416 if (mddev->sync_thread) {
3417 /* resync has finished, collect result */
3418 md_unregister_thread(mddev->sync_thread);
3419 mddev->sync_thread = NULL;
3420 if (!test_bit(MD_RECOVERY_ERR, &mddev->recovery) &&
3421 !test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
3423 /* activate any spares */
3424 mddev->pers->spare_active(mddev);
3426 md_update_sb(mddev);
3427 mddev->recovery = 0;
3428 /* flag recovery needed just to double check */
3429 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
3430 wake_up(&resync_wait);
3433 if (mddev->recovery) {
3434 /* probably just the RECOVERY_NEEDED flag */
3435 mddev->recovery = 0;
3436 wake_up(&resync_wait);
3439 /* no recovery is running.
3440 * remove any failed drives, then
3441 * add spares if possible
3443 ITERATE_RDEV(mddev,rdev,rtmp) {
3444 if (rdev->raid_disk >= 0 &&
3446 atomic_read(&rdev->nr_pending)==0) {
3447 mddev->pers->hot_remove_disk(mddev, rdev->raid_disk);
3448 rdev->raid_disk = -1;
3450 if (!rdev->faulty && rdev->raid_disk >= 0 && !rdev->in_sync)
3453 if (mddev->degraded) {
3454 ITERATE_RDEV(mddev,rdev,rtmp)
3455 if (rdev->raid_disk < 0
3457 if (mddev->pers->hot_add_disk(mddev,rdev))
3464 if (!spares && (mddev->recovery_cp == MaxSector)) {
3465 /* nothing we can do ... */
3468 if (mddev->pers->sync_request) {
3469 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
3471 set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
3472 mddev->sync_thread = md_register_thread(md_do_sync,
3475 if (!mddev->sync_thread) {
3476 printk(KERN_ERR "%s: could not start resync"
3479 /* leave the spares where they are, it shouldn't hurt */
3480 mddev->recovery = 0;
3482 md_wakeup_thread(mddev->sync_thread);
3486 mddev_unlock(mddev);
3490 int md_notify_reboot(struct notifier_block *this,
3491 unsigned long code, void *x)
3493 struct list_head *tmp;
3496 if ((code == SYS_DOWN) || (code == SYS_HALT) || (code == SYS_POWER_OFF)) {
3498 printk(KERN_INFO "md: stopping all md devices.\n");
3500 ITERATE_MDDEV(mddev,tmp)
3501 if (mddev_trylock(mddev)==0)
3502 do_md_stop (mddev, 1);
3504 * certain more exotic SCSI devices are known to be
3505 * volatile with respect to too-early system reboots. While
3506 * the right place to handle this issue is the individual
3507 * driver, we do want to have a safe RAID driver ...
3514 struct notifier_block md_notifier = {
3515 .notifier_call = md_notify_reboot,
3517 .priority = INT_MAX, /* before any real devices */
3520 static void md_geninit(void)
3522 struct proc_dir_entry *p;
3524 dprintk("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t));
3526 p = create_proc_entry("mdstat", S_IRUGO, NULL);
3528 p->proc_fops = &md_seq_fops;
3531 int __init md_init(void)
3535 printk(KERN_INFO "md: md driver %d.%d.%d MAX_MD_DEVS=%d,"
3536 " MD_SB_DISKS=%d\n",
3537 MD_MAJOR_VERSION, MD_MINOR_VERSION,
3538 MD_PATCHLEVEL_VERSION, MAX_MD_DEVS, MD_SB_DISKS);
3540 if (register_blkdev(MAJOR_NR, "md"))
3542 if ((mdp_major=register_blkdev(0, "mdp"))<=0) {
3543 unregister_blkdev(MAJOR_NR, "md");
3547 blk_register_region(MKDEV(MAJOR_NR, 0), MAX_MD_DEVS, THIS_MODULE,
3548 md_probe, NULL, NULL);
3549 blk_register_region(MKDEV(mdp_major, 0), MAX_MD_DEVS<<MdpMinorShift, THIS_MODULE,
3550 md_probe, NULL, NULL);
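/*
 * With MdpMinorShift == 6 the mdp region gives each unit a span of
 * 64 minors: unit n owns minors n<<6 through (n<<6)+63, i.e. the
 * whole device plus its 63 partitions.
 */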
3552 for (minor=0; minor < MAX_MD_DEVS; ++minor)
3553 devfs_mk_bdev(MKDEV(MAJOR_NR, minor),
3554 S_IFBLK|S_IRUSR|S_IWUSR,
3557 for (minor=0; minor < MAX_MD_DEVS; ++minor)
3558 devfs_mk_bdev(MKDEV(mdp_major, minor<<MdpMinorShift),
3559 S_IFBLK|S_IRUSR|S_IWUSR,
3563 register_reboot_notifier(&md_notifier);
3564 raid_table_header = register_sysctl_table(raid_root_table, 1);
3574 * Searches all registered partitions for autorun RAID arrays
3577 static dev_t detected_devices[128];
3580 void md_autodetect_dev(dev_t dev)
3582 if (dev_cnt >= 0 && dev_cnt < 128)
3583 detected_devices[dev_cnt++] = dev;
3587 static void autostart_arrays(int part)
3589 char b[BDEVNAME_SIZE];
3593 printk(KERN_INFO "md: Autodetecting RAID arrays.\n");
3595 for (i = 0; i < dev_cnt; i++) {
3596 dev_t dev = detected_devices[i];
3598 rdev = md_import_device(dev,0, 0);
3600 printk(KERN_ALERT "md: could not import %s!\n",
3601 __bdevname(dev, b));
3608 list_add(&rdev->same_set, &pending_raid_disks);
3612 autorun_devices(part);
3617 static __exit void md_exit(void)
3620 struct list_head *tmp;
3622 blk_unregister_region(MKDEV(MAJOR_NR,0), MAX_MD_DEVS);
3623 blk_unregister_region(MKDEV(mdp_major,0), MAX_MD_DEVS << MdpMinorShift);
3624 for (i=0; i < MAX_MD_DEVS; i++)
3625 devfs_remove("md/%d", i);
3626 for (i=0; i < MAX_MD_DEVS; i++)
3627 devfs_remove("md/d%d", i);
3631 unregister_blkdev(MAJOR_NR,"md");
3632 unregister_blkdev(mdp_major, "mdp");
3633 unregister_reboot_notifier(&md_notifier);
3634 unregister_sysctl_table(raid_table_header);
3635 remove_proc_entry("mdstat", NULL);
3636 ITERATE_MDDEV(mddev,tmp) {
3637 struct gendisk *disk = mddev->gendisk;
3640 export_array(mddev);
3643 mddev->gendisk = NULL;
3648 module_init(md_init)
3649 module_exit(md_exit)
3651 EXPORT_SYMBOL(register_md_personality);
3652 EXPORT_SYMBOL(unregister_md_personality);
3653 EXPORT_SYMBOL(md_error);
3654 EXPORT_SYMBOL(md_sync_acct);
3655 EXPORT_SYMBOL(md_done_sync);
3656 EXPORT_SYMBOL(md_write_start);
3657 EXPORT_SYMBOL(md_write_end);
3658 EXPORT_SYMBOL(md_handle_safemode);
3659 EXPORT_SYMBOL(md_register_thread);
3660 EXPORT_SYMBOL(md_unregister_thread);
3661 EXPORT_SYMBOL(md_wakeup_thread);
3662 EXPORT_SYMBOL(md_print_devices);
3663 EXPORT_SYMBOL(md_interrupt_thread);
3664 EXPORT_SYMBOL(md_check_recovery);
3665 MODULE_LICENSE("GPL");