]> git.proxmox.com Git - mirror_ubuntu-artful-kernel.git/blobdiff - drivers/md/dm-raid.c
Merge branches 'for-4.11/upstream-fixes', 'for-4.12/accutouch', 'for-4.12/cp2112...
[mirror_ubuntu-artful-kernel.git] / drivers / md / dm-raid.c
index 6d53810963f7531a7e5048dad5c55ef53e8aa914..5c9e95d66f3b64d14355a91abd936eb5f3c46440 100644 (file)
  */
 #define        MIN_FREE_RESHAPE_SPACE to_sector(4*4096)
 
+/*
+ * Minimum journal space 4 MiB in sectors.
+ */
+#define        MIN_RAID456_JOURNAL_SPACE (4*2048)
+
 static bool devices_handle_discard_safely = false;
 
 /*
@@ -73,6 +78,9 @@ struct raid_dev {
 #define __CTR_FLAG_DATA_OFFSET         13 /* 2 */ /* Only with reshapable raid4/5/6/10! */
 #define __CTR_FLAG_RAID10_USE_NEAR_SETS 14 /* 2 */ /* Only with raid10! */
 
+/* New for v1.10.0 */
+#define __CTR_FLAG_JOURNAL_DEV         15 /* 2 */ /* Only with raid4/5/6! */
+
 /*
  * Flags for rs->ctr_flags field.
  */
@@ -91,6 +99,7 @@ struct raid_dev {
 #define CTR_FLAG_DELTA_DISKS           (1 << __CTR_FLAG_DELTA_DISKS)
 #define CTR_FLAG_DATA_OFFSET           (1 << __CTR_FLAG_DATA_OFFSET)
 #define CTR_FLAG_RAID10_USE_NEAR_SETS  (1 << __CTR_FLAG_RAID10_USE_NEAR_SETS)
+#define CTR_FLAG_JOURNAL_DEV           (1 << __CTR_FLAG_JOURNAL_DEV)
 
 /*
  * Definitions of various constructor flags to
@@ -160,22 +169,22 @@ struct raid_dev {
                                 CTR_FLAG_DAEMON_SLEEP | \
                                 CTR_FLAG_MIN_RECOVERY_RATE | \
                                 CTR_FLAG_MAX_RECOVERY_RATE | \
-                                CTR_FLAG_MAX_WRITE_BEHIND | \
                                 CTR_FLAG_STRIPE_CACHE | \
                                 CTR_FLAG_REGION_SIZE | \
                                 CTR_FLAG_DELTA_DISKS | \
-                                CTR_FLAG_DATA_OFFSET)
+                                CTR_FLAG_DATA_OFFSET | \
+                                CTR_FLAG_JOURNAL_DEV)
 
 #define RAID6_VALID_FLAGS      (CTR_FLAG_SYNC | \
                                 CTR_FLAG_REBUILD | \
                                 CTR_FLAG_DAEMON_SLEEP | \
                                 CTR_FLAG_MIN_RECOVERY_RATE | \
                                 CTR_FLAG_MAX_RECOVERY_RATE | \
-                                CTR_FLAG_MAX_WRITE_BEHIND | \
                                 CTR_FLAG_STRIPE_CACHE | \
                                 CTR_FLAG_REGION_SIZE | \
                                 CTR_FLAG_DELTA_DISKS | \
-                                CTR_FLAG_DATA_OFFSET)
+                                CTR_FLAG_DATA_OFFSET | \
+                                CTR_FLAG_JOURNAL_DEV)
 /* ...valid options definitions per raid level */
 
 /*
@@ -224,6 +233,12 @@ struct raid_set {
        struct raid_type *raid_type;
        struct dm_target_callbacks callbacks;
 
+       /* Optional raid4/5/6 journal device */
+       struct journal_dev {
+               struct dm_dev *dev;
+               struct md_rdev rdev;
+       } journal_dev;
+
        struct raid_dev dev[0];
 };
 
@@ -308,6 +323,7 @@ static struct arg_name_flag {
        { CTR_FLAG_DATA_OFFSET, "data_offset"},
        { CTR_FLAG_DELTA_DISKS, "delta_disks"},
        { CTR_FLAG_RAID10_USE_NEAR_SETS, "raid10_use_near_sets"},
+       { CTR_FLAG_JOURNAL_DEV, "journal_dev" },
 };
 
 /* Return argument name string for given @flag */
@@ -372,7 +388,7 @@ static bool rs_is_reshapable(struct raid_set *rs)
 /* Return true, if raid set in @rs is recovering */
 static bool rs_is_recovering(struct raid_set *rs)
 {
-       return rs->md.recovery_cp < rs->dev[0].rdev.sectors;
+       return rs->md.recovery_cp < rs->md.dev_sectors;
 }
 
 /* Return true, if raid set in @rs is reshaping */
@@ -629,7 +645,8 @@ static void rs_set_capacity(struct raid_set *rs)
         * is unintended in case of out-of-place reshaping
         */
        rdev_for_each(rdev, mddev)
-               rdev->sectors = mddev->dev_sectors;
+               if (!test_bit(Journal, &rdev->flags))
+                       rdev->sectors = mddev->dev_sectors;
 
        set_capacity(gendisk, mddev->array_sectors);
        revalidate_disk(gendisk);
@@ -715,6 +732,11 @@ static void raid_set_free(struct raid_set *rs)
 {
        int i;
 
+       if (rs->journal_dev.dev) {
+               md_rdev_clear(&rs->journal_dev.rdev);
+               dm_put_device(rs->ti, rs->journal_dev.dev);
+       }
+
        for (i = 0; i < rs->raid_disks; i++) {
                if (rs->dev[i].meta_dev)
                        dm_put_device(rs->ti, rs->dev[i].meta_dev);
@@ -762,10 +784,11 @@ static int parse_dev_params(struct raid_set *rs, struct dm_arg_set *as)
                rs->dev[i].data_dev = NULL;
 
                /*
-                * There are no offsets, since there is a separate device
-                * for data and metadata.
+                * There are no offsets initially.
+                * Out of place reshape will set them accordingly.
                 */
                rs->dev[i].rdev.data_offset = 0;
+               rs->dev[i].rdev.new_data_offset = 0;
                rs->dev[i].rdev.mddev = &rs->md;
 
                arg = dm_shift_arg(as);
@@ -823,6 +846,9 @@ static int parse_dev_params(struct raid_set *rs, struct dm_arg_set *as)
                        rebuild++;
        }
 
+       if (rs->journal_dev.dev)
+               list_add_tail(&rs->journal_dev.rdev.same_set, &rs->md.disks);
+
        if (metadata_available) {
                rs->md.external = 0;
                rs->md.persistent = 1;
@@ -1028,6 +1054,8 @@ too_many:
  *    [max_write_behind <sectors>]     See '-write-behind=' (man mdadm)
  *    [stripe_cache <sectors>]         Stripe cache size for higher RAIDs
  *    [region_size <sectors>]          Defines granularity of bitmap
+ *    [journal_dev <dev>]              raid4/5/6 journaling device
+ *                                     (i.e. write hole closing log)
  *
  * RAID10-only options:
  *    [raid10_copies <# copies>]       Number of copies.  (Default: 2)
@@ -1135,7 +1163,7 @@ static int parse_raid_params(struct raid_set *rs, struct dm_arg_set *as,
                /*
                 * Parameters that take a string value are checked here.
                 */
-
+               /* "raid10_format {near|offset|far}" */
                if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_RAID10_FORMAT))) {
                        if (test_and_set_bit(__CTR_FLAG_RAID10_FORMAT, &rs->ctr_flags)) {
                                rs->ti->error = "Only one 'raid10_format' argument pair allowed";
@@ -1153,6 +1181,41 @@ static int parse_raid_params(struct raid_set *rs, struct dm_arg_set *as,
                        continue;
                }
 
+               /* "journal_dev dev" */
+               if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_JOURNAL_DEV))) {
+                       int r;
+                       struct md_rdev *jdev;
+
+                       if (test_and_set_bit(__CTR_FLAG_JOURNAL_DEV, &rs->ctr_flags)) {
+                               rs->ti->error = "Only one raid4/5/6 set journaling device allowed";
+                               return -EINVAL;
+                       }
+                       if (!rt_is_raid456(rt)) {
+                               rs->ti->error = "'journal_dev' is an invalid parameter for this RAID type";
+                               return -EINVAL;
+                       }
+                       r = dm_get_device(rs->ti, arg, dm_table_get_mode(rs->ti->table),
+                                         &rs->journal_dev.dev);
+                       if (r) {
+                               rs->ti->error = "raid4/5/6 journal device lookup failure";
+                               return r;
+                       }
+                       jdev = &rs->journal_dev.rdev;
+                       md_rdev_init(jdev);
+                       jdev->mddev = &rs->md;
+                       jdev->bdev = rs->journal_dev.dev->bdev;
+                       jdev->sectors = to_sector(i_size_read(jdev->bdev->bd_inode));
+                       if (jdev->sectors < MIN_RAID456_JOURNAL_SPACE) {
+                               rs->ti->error = "No space for raid4/5/6 journal";
+                               return -ENOSPC;
+                       }
+                       set_bit(Journal, &jdev->flags);
+                       continue;
+               }
+
+               /*
+                * Parameters with number values from here on.
+                */
                if (kstrtoint(arg, 10, &value) < 0) {
                        rs->ti->error = "Bad numerical argument given in raid params";
                        return -EINVAL;
@@ -1427,6 +1490,25 @@ static unsigned int rs_data_stripes(struct raid_set *rs)
        return rs->raid_disks - rs->raid_type->parity_devs;
 }
 
+/*
+ * Retrieve rdev->sectors from any valid raid device of @rs
+ * to allow userspace to pass in arbitrary "- -" device tuples.
+ */
+static sector_t __rdev_sectors(struct raid_set *rs)
+{
+       int i;
+
+       for (i = 0; i < rs->md.raid_disks; i++) {
+               struct md_rdev *rdev = &rs->dev[i].rdev;
+
+               if (!test_bit(Journal, &rdev->flags) &&
+                   rdev->bdev && rdev->sectors)
+                       return rdev->sectors;
+       }
+
+       BUG(); /* Constructor ensures we got some. */
+}
+
 /* Calculate the sectors per device and per array used for @rs */
 static int rs_set_dev_and_array_sectors(struct raid_set *rs, bool use_mddev)
 {
@@ -1470,7 +1552,8 @@ static int rs_set_dev_and_array_sectors(struct raid_set *rs, bool use_mddev)
                array_sectors = (data_stripes + delta_disks) * dev_sectors;
 
        rdev_for_each(rdev, mddev)
-               rdev->sectors = dev_sectors;
+               if (!test_bit(Journal, &rdev->flags))
+                       rdev->sectors = dev_sectors;
 
        mddev->array_sectors = array_sectors;
        mddev->dev_sectors = dev_sectors;
@@ -1512,9 +1595,9 @@ static void rs_setup_recovery(struct raid_set *rs, sector_t dev_sectors)
        else if (dev_sectors == MaxSector)
                /* Prevent recovery */
                __rs_setup_recovery(rs, MaxSector);
-       else if (rs->dev[0].rdev.sectors < dev_sectors)
+       else if (__rdev_sectors(rs) < dev_sectors)
                /* Grown raid set */
-               __rs_setup_recovery(rs, rs->dev[0].rdev.sectors);
+               __rs_setup_recovery(rs, __rdev_sectors(rs));
        else
                __rs_setup_recovery(rs, MaxSector);
 }
@@ -1853,18 +1936,21 @@ static int rs_check_reshape(struct raid_set *rs)
        return -EPERM;
 }
 
-static int read_disk_sb(struct md_rdev *rdev, int size)
+static int read_disk_sb(struct md_rdev *rdev, int size, bool force_reload)
 {
        BUG_ON(!rdev->sb_page);
 
-       if (rdev->sb_loaded)
+       if (rdev->sb_loaded && !force_reload)
                return 0;
 
+       rdev->sb_loaded = 0;
+
        if (!sync_page_io(rdev, 0, size, rdev->sb_page, REQ_OP_READ, 0, true)) {
                DMERR("Failed to read superblock of device at position %d",
                      rdev->raid_disk);
                md_error(rdev->mddev, rdev);
-               return -EINVAL;
+               set_bit(Faulty, &rdev->flags);
+               return -EIO;
        }
 
        rdev->sb_loaded = 1;
@@ -1992,7 +2078,7 @@ static int super_load(struct md_rdev *rdev, struct md_rdev *refdev)
                return -EINVAL;
        }
 
-       r = read_disk_sb(rdev, rdev->sb_size);
+       r = read_disk_sb(rdev, rdev->sb_size, false);
        if (r)
                return r;
 
@@ -2011,7 +2097,7 @@ static int super_load(struct md_rdev *rdev, struct md_rdev *refdev)
                sb->compat_features = cpu_to_le32(FEATURE_FLAG_SUPPORTS_V190);
 
                /* Force writing of superblocks to disk */
-               set_bit(MD_CHANGE_DEVS, &rdev->mddev->flags);
+               set_bit(MD_SB_CHANGE_DEVS, &rdev->mddev->sb_flags);
 
                /* Any superblock is better than none, choose that if given */
                return refdev ? 0 : 1;
@@ -2050,16 +2136,17 @@ static int super_init_validation(struct raid_set *rs, struct md_rdev *rdev)
 
        mddev->reshape_position = MaxSector;
 
+       mddev->raid_disks = le32_to_cpu(sb->num_devices);
+       mddev->level = le32_to_cpu(sb->level);
+       mddev->layout = le32_to_cpu(sb->layout);
+       mddev->chunk_sectors = le32_to_cpu(sb->stripe_sectors);
+
        /*
         * Reshaping is supported, e.g. reshape_position is valid
         * in superblock and superblock content is authoritative.
         */
        if (le32_to_cpu(sb->compat_features) & FEATURE_FLAG_SUPPORTS_V190) {
                /* Superblock is authoritative wrt given raid set layout! */
-               mddev->raid_disks = le32_to_cpu(sb->num_devices);
-               mddev->level = le32_to_cpu(sb->level);
-               mddev->layout = le32_to_cpu(sb->layout);
-               mddev->chunk_sectors = le32_to_cpu(sb->stripe_sectors);
                mddev->new_level = le32_to_cpu(sb->new_level);
                mddev->new_layout = le32_to_cpu(sb->new_layout);
                mddev->new_chunk_sectors = le32_to_cpu(sb->new_stripe_sectors);
@@ -2087,38 +2174,44 @@ static int super_init_validation(struct raid_set *rs, struct md_rdev *rdev)
                /*
                 * No takeover/reshaping, because we don't have the extended v1.9.0 metadata
                 */
-               if (le32_to_cpu(sb->level) != mddev->new_level) {
-                       DMERR("Reshaping/takeover raid sets not yet supported. (raid level/stripes/size change)");
-                       return -EINVAL;
-               }
-               if (le32_to_cpu(sb->layout) != mddev->new_layout) {
-                       DMERR("Reshaping raid sets not yet supported. (raid layout change)");
-                       DMERR("  0x%X vs 0x%X", le32_to_cpu(sb->layout), mddev->layout);
-                       DMERR("  Old layout: %s w/ %d copies",
-                             raid10_md_layout_to_format(le32_to_cpu(sb->layout)),
-                             raid10_md_layout_to_copies(le32_to_cpu(sb->layout)));
-                       DMERR("  New layout: %s w/ %d copies",
-                             raid10_md_layout_to_format(mddev->layout),
-                             raid10_md_layout_to_copies(mddev->layout));
-                       return -EINVAL;
-               }
-               if (le32_to_cpu(sb->stripe_sectors) != mddev->new_chunk_sectors) {
-                       DMERR("Reshaping raid sets not yet supported. (stripe sectors change)");
-                       return -EINVAL;
-               }
+               struct raid_type *rt_cur = get_raid_type_by_ll(mddev->level, mddev->layout);
+               struct raid_type *rt_new = get_raid_type_by_ll(mddev->new_level, mddev->new_layout);
 
-               /* We can only change the number of devices in raid1 with old (i.e. pre 1.0.7) metadata */
-               if (!rt_is_raid1(rs->raid_type) &&
-                   (le32_to_cpu(sb->num_devices) != mddev->raid_disks)) {
-                       DMERR("Reshaping raid sets not yet supported. (device count change from %u to %u)",
-                             sb->num_devices, mddev->raid_disks);
+               if (rs_takeover_requested(rs)) {
+                       if (rt_cur && rt_new)
+                               DMERR("Takeover raid sets from %s to %s not yet supported by metadata. (raid level change)",
+                                     rt_cur->name, rt_new->name);
+                       else
+                               DMERR("Takeover raid sets not yet supported by metadata. (raid level change)");
+                       return -EINVAL;
+               } else if (rs_reshape_requested(rs)) {
+                       DMERR("Reshaping raid sets not yet supported by metadata. (raid layout change keeping level)");
+                       if (mddev->layout != mddev->new_layout) {
+                               if (rt_cur && rt_new)
+                                       DMERR("  current layout %s vs new layout %s",
+                                             rt_cur->name, rt_new->name);
+                               else
+                                       DMERR("  current layout 0x%X vs new layout 0x%X",
+                                             le32_to_cpu(sb->layout), mddev->new_layout);
+                       }
+                       if (mddev->chunk_sectors != mddev->new_chunk_sectors)
+                               DMERR("  current stripe sectors %u vs new stripe sectors %u",
+                                     mddev->chunk_sectors, mddev->new_chunk_sectors);
+                       if (rs->delta_disks)
+                               DMERR("  current %u disks vs new %u disks",
+                                     mddev->raid_disks, mddev->raid_disks + rs->delta_disks);
+                       if (rs_is_raid10(rs)) {
+                               DMERR("  Old layout: %s w/ %u copies",
+                                     raid10_md_layout_to_format(mddev->layout),
+                                     raid10_md_layout_to_copies(mddev->layout));
+                               DMERR("  New layout: %s w/ %u copies",
+                                     raid10_md_layout_to_format(mddev->new_layout),
+                                     raid10_md_layout_to_copies(mddev->new_layout));
+                       }
                        return -EINVAL;
                }
 
                DMINFO("Discovered old metadata format; upgrading to extended metadata format");
-
-               /* Table line is checked vs. authoritative superblock */
-               rs_set_new(rs);
        }
 
        if (!test_bit(__CTR_FLAG_NOSYNC, &rs->ctr_flags))
@@ -2141,6 +2234,9 @@ static int super_init_validation(struct raid_set *rs, struct md_rdev *rdev)
         */
        d = 0;
        rdev_for_each(r, mddev) {
+               if (test_bit(Journal, &rdev->flags))
+                       continue;
+
                if (test_bit(FirstUse, &r->flags))
                        new_devs++;
 
@@ -2196,7 +2292,8 @@ static int super_init_validation(struct raid_set *rs, struct md_rdev *rdev)
         */
        sb_retrieve_failed_devices(sb, failed_devices);
        rdev_for_each(r, mddev) {
-               if (!r->sb_page)
+               if (test_bit(Journal, &rdev->flags) ||
+                   !r->sb_page)
                        continue;
                sb2 = page_address(r->sb_page);
                sb2->failed_devices = 0;
@@ -2211,7 +2308,7 @@ static int super_init_validation(struct raid_set *rs, struct md_rdev *rdev)
                                continue;
 
                        if (role != r->raid_disk) {
-                               if (__is_raid10_near(mddev->layout)) {
+                               if (rs_is_raid10(rs) && __is_raid10_near(mddev->layout)) {
                                        if (mddev->raid_disks % __raid10_near_copies(mddev->layout) ||
                                            rs->raid_disks % rs->raid10_copies) {
                                                rs->ti->error =
@@ -2248,7 +2345,7 @@ static int super_validate(struct raid_set *rs, struct md_rdev *rdev)
        struct mddev *mddev = &rs->md;
        struct dm_raid_superblock *sb;
 
-       if (rs_is_raid0(rs) || !rdev->sb_page)
+       if (rs_is_raid0(rs) || !rdev->sb_page || rdev->raid_disk < 0)
                return 0;
 
        sb = page_address(rdev->sb_page);
@@ -2273,7 +2370,7 @@ static int super_validate(struct raid_set *rs, struct md_rdev *rdev)
 
        /* Enable bitmap creation for RAID levels != 0 */
        mddev->bitmap_info.offset = rt_is_raid0(rs->raid_type) ? 0 : to_sector(4096);
-       rdev->mddev->bitmap_info.default_offset = mddev->bitmap_info.offset;
+       mddev->bitmap_info.default_offset = mddev->bitmap_info.offset;
 
        if (!test_and_clear_bit(FirstUse, &rdev->flags)) {
                /* Retrieve device size stored in superblock to be prepared for shrink */
@@ -2311,21 +2408,22 @@ static int super_validate(struct raid_set *rs, struct md_rdev *rdev)
 static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs)
 {
        int r;
-       struct raid_dev *dev;
-       struct md_rdev *rdev, *tmp, *freshest;
+       struct md_rdev *rdev, *freshest;
        struct mddev *mddev = &rs->md;
 
        freshest = NULL;
-       rdev_for_each_safe(rdev, tmp, mddev) {
+       rdev_for_each(rdev, mddev) {
+               if (test_bit(Journal, &rdev->flags))
+                       continue;
+
                /*
                 * Skipping super_load due to CTR_FLAG_SYNC will cause
                 * the array to undergo initialization again as
                 * though it were new.  This is the intended effect
                 * of the "sync" directive.
                 *
-                * When reshaping capability is added, we must ensure
-                * that the "sync" directive is disallowed during the
-                * reshape.
+                * With reshaping capability added, we must ensure that
+                * the "sync" directive is disallowed during the reshape.
                 */
                if (test_bit(__CTR_FLAG_SYNC, &rs->ctr_flags))
                        continue;
@@ -2342,6 +2440,7 @@ static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs)
                case 0:
                        break;
                default:
+                       /* This is a failure to read the superblock from the metadata device. */
                        /*
                         * We have to keep any raid0 data/metadata device pairs or
                         * the MD raid0 personality will fail to start the array.
@@ -2349,33 +2448,16 @@ static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs)
                        if (rs_is_raid0(rs))
                                continue;
 
-                       dev = container_of(rdev, struct raid_dev, rdev);
-                       if (dev->meta_dev)
-                               dm_put_device(ti, dev->meta_dev);
-
-                       dev->meta_dev = NULL;
-                       rdev->meta_bdev = NULL;
-
-                       if (rdev->sb_page)
-                               put_page(rdev->sb_page);
-
-                       rdev->sb_page = NULL;
-
-                       rdev->sb_loaded = 0;
-
                        /*
-                        * We might be able to salvage the data device
-                        * even though the meta device has failed.  For
-                        * now, we behave as though '- -' had been
-                        * set for this device in the table.
+                        * We keep the dm_devs to be able to emit the device tuple
+                        * properly on the table line in raid_status() (rather than
+                        * mistakenly acting as if '- -' got passed into the constructor).
+                        *
+                        * The rdev has to stay on the same_set list to allow for
+                        * the attempt to restore faulty devices on second resume.
                         */
-                       if (dev->data_dev)
-                               dm_put_device(ti, dev->data_dev);
-
-                       dev->data_dev = NULL;
-                       rdev->bdev = NULL;
-
-                       list_del(&rdev->same_set);
+                       rdev->raid_disk = rdev->saved_raid_disk = -1;
+                       break;
                }
        }
 
@@ -2396,7 +2478,9 @@ static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs)
                return -EINVAL;
 
        rdev_for_each(rdev, mddev)
-               if ((rdev != freshest) && super_validate(rs, rdev))
+               if (!test_bit(Journal, &rdev->flags) &&
+                   rdev != freshest &&
+                   super_validate(rs, rdev))
                        return -EINVAL;
        return 0;
 }
@@ -2483,10 +2567,12 @@ static int rs_adjust_data_offsets(struct raid_set *rs)
                return -ENOSPC;
        }
 out:
-       /* Adjust data offsets on all rdevs */
+       /* Adjust data offsets on all rdevs but on any raid4/5/6 journal device */
        rdev_for_each(rdev, &rs->md) {
-               rdev->data_offset = data_offset;
-               rdev->new_data_offset = new_data_offset;
+               if (!test_bit(Journal, &rdev->flags)) {
+                       rdev->data_offset = data_offset;
+                       rdev->new_data_offset = new_data_offset;
+               }
        }
 
        return 0;
@@ -2499,8 +2585,10 @@ static void __reorder_raid_disk_indexes(struct raid_set *rs)
        struct md_rdev *rdev;
 
        rdev_for_each(rdev, &rs->md) {
-               rdev->raid_disk = i++;
-               rdev->saved_raid_disk = rdev->new_raid_disk = -1;
+               if (!test_bit(Journal, &rdev->flags)) {
+                       rdev->raid_disk = i++;
+                       rdev->saved_raid_disk = rdev->new_raid_disk = -1;
+               }
        }
 }
 
@@ -2840,7 +2928,7 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv)
        if (r)
                goto bad;
 
-       calculated_dev_sectors = rs->dev[0].rdev.sectors;
+       calculated_dev_sectors = rs->md.dev_sectors;
 
        /*
         * Backup any new raid set level, layout, ...
@@ -2853,7 +2941,7 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv)
        if (r)
                goto bad;
 
-       resize = calculated_dev_sectors != rs->dev[0].rdev.sectors;
+       resize = calculated_dev_sectors != __rdev_sectors(rs);
 
        INIT_WORK(&rs->md.event_work, do_table_event);
        ti->private = rs;
@@ -2897,6 +2985,13 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv)
                        goto bad;
                }
 
+               /* We can't takeover a journaled raid4/5/6 */
+               if (test_bit(__CTR_FLAG_JOURNAL_DEV, &rs->ctr_flags)) {
+                       ti->error = "Can't takeover a journaled raid4/5/6 set";
+                       r = -EPERM;
+                       goto bad;
+               }
+
                /*
                 * If a takeover is needed, userspace sets any additional
                 * devices to rebuild and we can check for a valid request here.
@@ -2918,6 +3013,18 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv)
                rs_setup_recovery(rs, MaxSector);
                rs_set_new(rs);
        } else if (rs_reshape_requested(rs)) {
+               /*
+                * No need to check for 'ongoing' takeover here, because takeover
+                * is an instant operation as opposed to an ongoing reshape.
+                */
+
+               /* We can't reshape a journaled raid4/5/6 */
+               if (test_bit(__CTR_FLAG_JOURNAL_DEV, &rs->ctr_flags)) {
+                       ti->error = "Can't reshape a journaled raid4/5/6 set";
+                       r = -EPERM;
+                       goto bad;
+               }
+
                /*
                  * We can only prepare for a reshape here, because the
                  * raid set needs to run to provide the respective reshape
@@ -2994,6 +3101,9 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv)
                }
        }
 
+       /* Disable/enable discard support on raid set. */
+       configure_discard_support(rs);
+
        mddev_unlock(&rs->md);
        return 0;
 
@@ -3063,18 +3173,23 @@ static const char *decipher_sync_action(struct mddev *mddev)
 }
 
 /*
- * Return status string @rdev
+ * Return status string for @rdev
  *
  * Status characters:
  *
- *  'D' = Dead/Failed device
+ *  'D' = Dead/Failed raid set component or raid4/5/6 journal device
  *  'a' = Alive but not in-sync
- *  'A' = Alive and in-sync
+ *  'A' = Alive and in-sync raid set component or alive raid4/5/6 journal device
+ *  '-' = Non-existing device (i.e. userspace passed '- -' into the ctr)
  */
 static const char *__raid_dev_status(struct md_rdev *rdev, bool array_in_sync)
 {
-       if (test_bit(Faulty, &rdev->flags))
+       if (!rdev->bdev)
+               return "-";
+       else if (test_bit(Faulty, &rdev->flags))
                return "D";
+       else if (test_bit(Journal, &rdev->flags))
+               return "A";
        else if (!array_in_sync || !test_bit(In_sync, &rdev->flags))
                return "a";
        else
@@ -3143,7 +3258,8 @@ static sector_t rs_get_progress(struct raid_set *rs,
                         * being initialized.
                         */
                        rdev_for_each(rdev, mddev)
-                               if (!test_bit(In_sync, &rdev->flags))
+                               if (!test_bit(Journal, &rdev->flags) &&
+                                   !test_bit(In_sync, &rdev->flags))
                                        *array_in_sync = true;
 #if 0
                        r = 0; /* HM FIXME: TESTME: https://bugzilla.redhat.com/show_bug.cgi?id=1210637 ? */
@@ -3175,7 +3291,6 @@ static void raid_status(struct dm_target *ti, status_type_t type,
        sector_t progress, resync_max_sectors, resync_mismatches;
        const char *sync_action;
        struct raid_type *rt;
-       struct md_rdev *rdev;
 
        switch (type) {
        case STATUSTYPE_INFO:
@@ -3196,9 +3311,9 @@ static void raid_status(struct dm_target *ti, status_type_t type,
                                    atomic64_read(&mddev->resync_mismatches) : 0;
                sync_action = decipher_sync_action(&rs->md);
 
-               /* HM FIXME: do we want another state char for raid0? It shows 'D' or 'A' now */
-               rdev_for_each(rdev, mddev)
-                       DMEMIT(__raid_dev_status(rdev, array_in_sync));
+               /* HM FIXME: do we want another state char for raid0? It shows 'D'/'A'/'-' now */
+               for (i = 0; i < rs->raid_disks; i++)
+                       DMEMIT(__raid_dev_status(&rs->dev[i].rdev, array_in_sync));
 
                /*
                 * In-sync/Reshape ratio:
@@ -3244,6 +3359,12 @@ static void raid_status(struct dm_target *ti, status_type_t type,
                 * so retrieving it from the first raid disk is sufficient.
                 */
                DMEMIT(" %llu", (unsigned long long) rs->dev[0].rdev.data_offset);
+
+               /*
+                * v1.10.0+:
+                */
+               DMEMIT(" %s", test_bit(__CTR_FLAG_JOURNAL_DEV, &rs->ctr_flags) ?
+                             __raid_dev_status(&rs->journal_dev.rdev, 0) : "-");
                break;
 
        case STATUSTYPE_TABLE:
@@ -3257,7 +3378,8 @@ static void raid_status(struct dm_target *ti, status_type_t type,
                raid_param_cnt += rebuild_disks * 2 +
                                  write_mostly_params +
                                  hweight32(rs->ctr_flags & CTR_FLAG_OPTIONS_NO_ARGS) +
-                                 hweight32(rs->ctr_flags & CTR_FLAG_OPTIONS_ONE_ARG) * 2;
+                                 hweight32(rs->ctr_flags & CTR_FLAG_OPTIONS_ONE_ARG) * 2 +
+                                 (test_bit(__CTR_FLAG_JOURNAL_DEV, &rs->ctr_flags) ? 2 : 0);
                /* Emit table line */
                DMEMIT("%s %u %u", rs->raid_type->name, raid_param_cnt, mddev->new_chunk_sectors);
                if (test_bit(__CTR_FLAG_RAID10_FORMAT, &rs->ctr_flags))
@@ -3304,6 +3426,9 @@ static void raid_status(struct dm_target *ti, status_type_t type,
                if (test_bit(__CTR_FLAG_MIN_RECOVERY_RATE, &rs->ctr_flags))
                        DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_MIN_RECOVERY_RATE),
                                         mddev->sync_speed_min);
+               if (test_bit(__CTR_FLAG_JOURNAL_DEV, &rs->ctr_flags))
+                       DMEMIT(" %s %s", dm_raid_arg_name_by_flag(CTR_FLAG_JOURNAL_DEV),
+                                       __get_dev_name(rs->journal_dev.dev));
                DMEMIT(" %d", rs->raid_disks);
                for (i = 0; i < rs->raid_disks; i++)
                        DMEMIT(" %s %s", __get_dev_name(rs->dev[i].meta_dev),
@@ -3339,10 +3464,11 @@ static int raid_message(struct dm_target *ti, unsigned int argc, char **argv)
        else {
                if (!strcasecmp(argv[0], "check"))
                        set_bit(MD_RECOVERY_CHECK, &mddev->recovery);
-               else if (!!strcasecmp(argv[0], "repair"))
+               else if (!strcasecmp(argv[0], "repair")) {
+                       set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
+                       set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
+               } else
                        return -EINVAL;
-               set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
-               set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
        }
        if (mddev->ro == 2) {
                /* A write to sync_action is enough to justify
@@ -3419,11 +3545,14 @@ static void attempt_restore_of_faulty_devices(struct raid_set *rs)
 
        memset(cleared_failed_devices, 0, sizeof(cleared_failed_devices));
 
-       for (i = 0; i < rs->md.raid_disks; i++) {
+       for (i = 0; i < mddev->raid_disks; i++) {
                r = &rs->dev[i].rdev;
-               if (test_bit(Faulty, &r->flags) && r->sb_page &&
-                   sync_page_io(r, 0, r->sb_size, r->sb_page,
-                                REQ_OP_READ, 0, true)) {
+               /* HM FIXME: enhance journal device recovery processing */
+               if (test_bit(Journal, &r->flags))
+                       continue;
+
+               if (test_bit(Faulty, &r->flags) &&
+                   r->meta_bdev && !read_disk_sb(r, r->sb_size, true)) {
                        DMINFO("Faulty %s device #%d has readable super block."
                               "  Attempting to revive it.",
                               rs->raid_type->name, i);
@@ -3437,22 +3566,26 @@ static void attempt_restore_of_faulty_devices(struct raid_set *rs)
                         * '>= 0' - meaning we must call this function
                         * ourselves.
                         */
-                       if ((r->raid_disk >= 0) &&
-                           (mddev->pers->hot_remove_disk(mddev, r) != 0))
-                               /* Failed to revive this device, try next */
-                               continue;
-
-                       r->raid_disk = i;
-                       r->saved_raid_disk = i;
                        flags = r->flags;
+                       clear_bit(In_sync, &r->flags); /* Mandatory for hot remove. */
+                       if (r->raid_disk >= 0) {
+                               if (mddev->pers->hot_remove_disk(mddev, r)) {
+                                       /* Failed to revive this device, try next */
+                                       r->flags = flags;
+                                       continue;
+                               }
+                       } else
+                               r->raid_disk = r->saved_raid_disk = i;
+
                        clear_bit(Faulty, &r->flags);
                        clear_bit(WriteErrorSeen, &r->flags);
-                       clear_bit(In_sync, &r->flags);
+
                        if (mddev->pers->hot_add_disk(mddev, r)) {
-                               r->raid_disk = -1;
-                               r->saved_raid_disk = -1;
+                               /* Failed to revive this device, try next */
+                               r->raid_disk = r->saved_raid_disk = -1;
                                r->flags = flags;
                        } else {
+                               clear_bit(In_sync, &r->flags);
                                r->recovery_offset = 0;
                                set_bit(i, (void *) cleared_failed_devices);
                                cleared = true;
@@ -3465,6 +3598,9 @@ static void attempt_restore_of_faulty_devices(struct raid_set *rs)
                uint64_t failed_devices[DISKS_ARRAY_ELEMS];
 
                rdev_for_each(r, &rs->md) {
+                       if (test_bit(Journal, &r->flags))
+                               continue;
+
                        sb = page_address(r->sb_page);
                        sb_retrieve_failed_devices(sb, failed_devices);
 
@@ -3497,7 +3633,7 @@ static void rs_update_sbs(struct raid_set *rs)
        struct mddev *mddev = &rs->md;
        int ro = mddev->ro;
 
-       set_bit(MD_CHANGE_DEVS, &mddev->flags);
+       set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
        mddev->ro = 0;
        md_update_sb(mddev, 1);
        mddev->ro = ro;
@@ -3580,12 +3716,6 @@ static int raid_preresume(struct dm_target *ti)
        if (test_bit(RT_FLAG_UPDATE_SBS, &rs->runtime_flags))
                rs_update_sbs(rs);
 
-       /*
-        * Disable/enable discard support on raid set after any
-        * conversion, because devices can have been added
-        */
-       configure_discard_support(rs);
-
        /* Load the bitmap from disk unless raid0 */
        r = __load_dirty_region_bitmap(rs);
        if (r)
@@ -3649,7 +3779,7 @@ static void raid_resume(struct dm_target *ti)
 
 static struct target_type raid_target = {
        .name = "raid",
-       .version = {1, 9, 1},
+       .version = {1, 10, 0},
        .module = THIS_MODULE,
        .ctr = raid_ctr,
        .dtr = raid_dtr,