Fedora kernel-2.6.17-1.2142_FC4 patched with stable patch-2.6.17.4-vs2.0.2-rc26.diff
[linux-2.6.git] / include / linux / raid / md_k.h
index c9a0d40..e2df61f 100644 (file)
 #ifndef _MD_K_H
 #define _MD_K_H
 
-#define MD_RESERVED       0UL
-#define LINEAR            1UL
-#define RAID0             2UL
-#define RAID1             3UL
-#define RAID5             4UL
-#define TRANSLUCENT       5UL
-#define HSM               6UL
-#define MULTIPATH         7UL
-#define RAID6            8UL
-#define        RAID10            9UL
-#define FAULTY           10UL
-#define MAX_PERSONALITY   11UL
+/* and dm-bio-list.h is not under include/linux because.... ??? */
+#include "../../../drivers/md/dm-bio-list.h"
 
 #define        LEVEL_MULTIPATH         (-4)
 #define        LEVEL_LINEAR            (-1)
 #define        LEVEL_FAULTY            (-5)
 
+/* we need a value for 'no level specified' and 0
+ * means 'raid0', so we need something else.  This is
+ * for internal use only
+ */
+#define        LEVEL_NONE              (-1000000)
+
 #define MaxSector (~(sector_t)0)
 #define MD_THREAD_NAME_MAX 14
 
-static inline int pers_to_level (int pers)
-{
-       switch (pers) {
-               case FAULTY:            return LEVEL_FAULTY;
-               case MULTIPATH:         return LEVEL_MULTIPATH;
-               case HSM:               return -3;
-               case TRANSLUCENT:       return -2;
-               case LINEAR:            return LEVEL_LINEAR;
-               case RAID0:             return 0;
-               case RAID1:             return 1;
-               case RAID5:             return 5;
-               case RAID6:             return 6;
-               case RAID10:            return 10;
-       }
-       BUG();
-       return MD_RESERVED;
-}
-
-static inline int level_to_pers (int level)
-{
-       switch (level) {
-               case LEVEL_FAULTY: return FAULTY;
-               case LEVEL_MULTIPATH: return MULTIPATH;
-               case -3: return HSM;
-               case -2: return TRANSLUCENT;
-               case LEVEL_LINEAR: return LINEAR;
-               case 0: return RAID0;
-               case 1: return RAID1;
-               case 4:
-               case 5: return RAID5;
-               case 6: return RAID6;
-               case 10: return RAID10;
-       }
-       return MD_RESERVED;
-}
-
 typedef struct mddev_s mddev_t;
 typedef struct mdk_rdev_s mdk_rdev_t;
 
@@ -82,70 +42,6 @@ typedef struct mdk_rdev_s mdk_rdev_t;
 
 #define MAX_CHUNK_SIZE (4096*1024)
 
-/*
- * default readahead
- */
-
-static inline int disk_faulty(mdp_disk_t * d)
-{
-       return d->state & (1 << MD_DISK_FAULTY);
-}
-
-static inline int disk_active(mdp_disk_t * d)
-{
-       return d->state & (1 << MD_DISK_ACTIVE);
-}
-
-static inline int disk_sync(mdp_disk_t * d)
-{
-       return d->state & (1 << MD_DISK_SYNC);
-}
-
-static inline int disk_spare(mdp_disk_t * d)
-{
-       return !disk_sync(d) && !disk_active(d) && !disk_faulty(d);
-}
-
-static inline int disk_removed(mdp_disk_t * d)
-{
-       return d->state & (1 << MD_DISK_REMOVED);
-}
-
-static inline void mark_disk_faulty(mdp_disk_t * d)
-{
-       d->state |= (1 << MD_DISK_FAULTY);
-}
-
-static inline void mark_disk_active(mdp_disk_t * d)
-{
-       d->state |= (1 << MD_DISK_ACTIVE);
-}
-
-static inline void mark_disk_sync(mdp_disk_t * d)
-{
-       d->state |= (1 << MD_DISK_SYNC);
-}
-
-static inline void mark_disk_spare(mdp_disk_t * d)
-{
-       d->state = 0;
-}
-
-static inline void mark_disk_removed(mdp_disk_t * d)
-{
-       d->state = (1 << MD_DISK_FAULTY) | (1 << MD_DISK_REMOVED);
-}
-
-static inline void mark_disk_inactive(mdp_disk_t * d)
-{
-       d->state &= ~(1 << MD_DISK_ACTIVE);
-}
-
-static inline void mark_disk_nonsync(mdp_disk_t * d)
-{
-       d->state &= ~(1 << MD_DISK_SYNC);
-}
-
 /*
  * MD's 'extended' device
  */
@@ -163,8 +59,11 @@ struct mdk_rdev_s
        int             sb_loaded;
        sector_t        data_offset;    /* start of data in array */
        sector_t        sb_offset;
+       int             sb_size;        /* bytes in the superblock */
        int             preferred_minor;        /* autorun support */
 
+       struct kobject  kobj;
+
        /* A device can be in one of three states based on two flags:
         * Not working:   faulty==1 in_sync==0
         * Fully working: faulty==0 in_sync==1
@@ -175,24 +74,37 @@ struct mdk_rdev_s
         * It can never have faulty==1, in_sync==1
         * This reduces the burden of testing multiple flags in many cases
         */
-       int faulty;                     /* if faulty do not issue IO requests */
-       int in_sync;                    /* device is a full member of the array */
+
+       unsigned long   flags;
+#define        Faulty          1               /* device is known to have a fault */
+#define        In_sync         2               /* device is in_sync with rest of array */
+#define        WriteMostly     4               /* Avoid reading if at all possible */
+#define        BarriersNotsupp 5               /* BIO_RW_BARRIER is not supported */
 
        int desc_nr;                    /* descriptor index in the superblock */
        int raid_disk;                  /* role of device in array */
+       int saved_raid_disk;            /* role that device used to have in the
+                                        * array and could again if we did a partial
+                                        * resync from the bitmap
+                                        */
 
        atomic_t        nr_pending;     /* number of pending requests.
                                         * only maintained for arrays that
                                         * support hot removal
                                         */
+       atomic_t        read_errors;    /* number of consecutive read errors that
+                                        * we have tried to ignore.
+                                        */
+       atomic_t        corrected_errors; /* number of corrected read errors,
+                                          * for reporting to userspace and storing
+                                          * in superblock.
+                                          */
 };
 
-typedef struct mdk_personality_s mdk_personality_t;
-
 struct mddev_s
 {
        void                            *private;
-       mdk_personality_t               *pers;
+       struct mdk_personality          *pers;
        dev_t                           unit;
        int                             md_minor;
        struct list_head                disks;
@@ -201,6 +113,8 @@ struct mddev_s
 
        struct gendisk                  *gendisk;
 
+       struct kobject                  kobj;
+
        /* Superblock information */
        int                             major_version,
                                        minor_version,
@@ -209,6 +123,7 @@ struct mddev_s
        int                             chunk_size;
        time_t                          ctime, utime;
        int                             level, layout;
+       char                            clevel[16];
        int                             raid_disks;
        int                             max_disks;
        sector_t                        size; /* used size of component devices */
@@ -217,6 +132,14 @@ struct mddev_s
 
        char                            uuid[16];
 
+       /* If the array is being reshaped, we need to record the
+        * new shape and an indication of where we are up to.
+        * This is written to the superblock.
+        * If reshape_position is MaxSector, then no reshape is happening (yet).
+        */
+       sector_t                        reshape_position;
+       int                             delta_disks, new_level, new_layout, new_chunk;
+
        struct mdk_thread_s             *thread;        /* management thread */
        struct mdk_thread_s             *sync_thread;   /* doing resync or reconstruct */
        sector_t                        curr_resync;    /* blocks scheduled */
@@ -224,6 +147,19 @@ struct mddev_s
        sector_t                        resync_mark_cnt;/* blocks written at resync_mark */
 
        sector_t                        resync_max_sectors; /* may be set by personality */
+
+       sector_t                        resync_mismatches; /* count of sectors where
+                                                           * parity/replica mismatch found
+                                                           */
+
+       /* allow user-space to request suspension of IO to regions of the array */
+       sector_t                        suspend_lo;
+       sector_t                        suspend_hi;
+       /* if zero, use the system-wide default */
+       int                             sync_speed_min;
+       int                             sync_speed_max;
+
+       int                             ok_start_degraded;
        /* recovery/resync flags 
         * NEEDED:   we might need to start a resync/recover
         * RUNNING:  a thread is running, or about to be started
@@ -231,6 +167,11 @@ struct mddev_s
         * ERR:      and IO error was detected - abort the resync/recovery
         * INTR:     someone requested a (clean) early abort.
         * DONE:     thread is done and is waiting to be reaped
+        * REQUEST:  user-space has requested a sync (used with SYNC)
+        * CHECK:    user-space request for check-only, no repair
+        * RESHAPE:  A reshape is happening
+        *
+        * If neither SYNC nor RESHAPE is set, then it is a recovery.
         */
 #define        MD_RECOVERY_RUNNING     0
 #define        MD_RECOVERY_SYNC        1
@@ -238,20 +179,35 @@ struct mddev_s
 #define        MD_RECOVERY_INTR        3
 #define        MD_RECOVERY_DONE        4
 #define        MD_RECOVERY_NEEDED      5
+#define        MD_RECOVERY_REQUESTED   6
+#define        MD_RECOVERY_CHECK       7
+#define MD_RECOVERY_RESHAPE    8
        unsigned long                   recovery;
 
        int                             in_sync;        /* know to not need resync */
-       struct semaphore                reconfig_sem;
+       struct mutex                    reconfig_mutex;
        atomic_t                        active;
 
        int                             changed;        /* true if we might need to reread partition info */
        int                             degraded;       /* whether md should consider
                                                         * adding a spare
                                                         */
+       int                             barriers_work;  /* initialised to true, cleared as soon
+                                                        * as a barrier request to slave
+                                                        * fails.  Only supported
+                                                        */
+       struct bio                      *biolist;       /* bios that need to be retried
+                                                        * because BIO_RW_BARRIER is not supported
+                                                        */
 
        atomic_t                        recovery_active; /* blocks scheduled, but not written */
        wait_queue_head_t               recovery_wait;
        sector_t                        recovery_cp;
+
+       spinlock_t                      write_lock;
+       wait_queue_head_t               sb_wait;        /* for waiting on superblock updates */
+       atomic_t                        pending_writes; /* number of active superblock writes */
+
        unsigned int                    safemode;       /* if set, update "clean" superblock
                                                         * when no writes pending.
                                                         */ 
@@ -260,13 +216,27 @@ struct mddev_s
        atomic_t                        writes_pending; 
        request_queue_t                 *queue; /* for plugging ... */
 
+       atomic_t                        write_behind; /* outstanding async IO */
+       unsigned int                    max_write_behind; /* 0 = sync */
+
+       struct bitmap                   *bitmap; /* the bitmap for the device */
+       struct file                     *bitmap_file; /* the bitmap file */
+       long                            bitmap_offset; /* offset from superblock of
+                                                       * start of bitmap. May be
+                                                       * negative, but not '0'
+                                                       */
+       long                            default_bitmap_offset; /* this is the offset to use when
+                                                               * hot-adding a bitmap.  It should
+                                                               * eventually be settable by sysfs.
+                                                               */
+
        struct list_head                all_mddevs;
 };
 
 
 static inline void rdev_dec_pending(mdk_rdev_t *rdev, mddev_t *mddev)
 {
-       int faulty = rdev->faulty;
+       int faulty = test_bit(Faulty, &rdev->flags);
        if (atomic_dec_and_test(&rdev->nr_pending) && faulty)
                set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
 }
@@ -276,9 +246,11 @@ static inline void md_sync_acct(struct block_device *bdev, unsigned long nr_sect
         atomic_add(nr_sectors, &bdev->bd_contains->bd_disk->sync_io);
 }
 
-struct mdk_personality_s
+struct mdk_personality
 {
        char *name;
+       int level;
+       struct list_head list;
        struct module *owner;
        int (*make_request)(request_queue_t *q, struct bio *bio);
        int (*run)(mddev_t *mddev);
@@ -291,10 +263,24 @@ struct mdk_personality_s
        int (*hot_add_disk) (mddev_t *mddev, mdk_rdev_t *rdev);
        int (*hot_remove_disk) (mddev_t *mddev, int number);
        int (*spare_active) (mddev_t *mddev);
-       int (*sync_request)(mddev_t *mddev, sector_t sector_nr, int go_faster);
+       sector_t (*sync_request)(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster);
        int (*resize) (mddev_t *mddev, sector_t sectors);
-       int (*reshape) (mddev_t *mddev, int raid_disks);
+       int (*check_reshape) (mddev_t *mddev);
+       int (*start_reshape) (mddev_t *mddev);
        int (*reconfig) (mddev_t *mddev, int layout, int chunk_size);
+       /* quiesce moves between quiescence states
+        * 0 - fully active
+        * 1 - no new requests allowed
+        * others - reserved
+        */
+       void (*quiesce) (mddev_t *mddev, int state);
+};
+
+
+struct md_sysfs_entry {
+       struct attribute attr;
+       ssize_t (*show)(mddev_t *, char *);
+       ssize_t (*store)(mddev_t *, const char *, size_t);
 };
 
 
@@ -303,8 +289,6 @@ static inline char * mdname (mddev_t * mddev)
        return mddev->gendisk ? mddev->gendisk->disk_name : "mdX";
 }
 
-extern mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr);
-
 /*
  * iterates through some rdev ringlist. It's safe to remove the
  * current 'rdev'. Dont touch 'tmp' though.
@@ -332,9 +316,8 @@ typedef struct mdk_thread_s {
        mddev_t                 *mddev;
        wait_queue_head_t       wqueue;
        unsigned long           flags;
-       struct completion       *event;
        struct task_struct      *tsk;
-       const char              *name;
+       unsigned long           timeout;
 } mdk_thread_t;
 
 #define THREAD_WAKEUP  0
@@ -365,5 +348,10 @@ do {                                                                       \
        __wait_event_lock_irq(wq, condition, lock, cmd);                \
 } while (0)
 
+static inline void safe_put_page(struct page *p)
+{
+       if (p) put_page(p);
+}
+
 #endif