]> git.proxmox.com Git - mirror_ubuntu-bionic-kernel.git/blobdiff - drivers/md/md.c
[PATCH] md: Split reshape handler in check_reshape and start_reshape
[mirror_ubuntu-bionic-kernel.git] / drivers / md / md.c
index 653d4dcbee23e7e78b4da9c44cf648f17806ee0a..1a637676a930d1f85ef5a6ddda2bfe9702f1d5fd 100644 (file)
@@ -158,11 +158,12 @@ static int start_readonly;
  */
 static DECLARE_WAIT_QUEUE_HEAD(md_event_waiters);
 static atomic_t md_event_count;
-static void md_new_event(mddev_t *mddev)
+void md_new_event(mddev_t *mddev)
 {
        atomic_inc(&md_event_count);
        wake_up(&md_event_waiters);
 }
+EXPORT_SYMBOL_GPL(md_new_event);
 
 /*
  * Enables to iterate over all existing md arrays
@@ -213,8 +214,11 @@ static void mddev_put(mddev_t *mddev)
                return;
        if (!mddev->raid_disks && list_empty(&mddev->disks)) {
                list_del(&mddev->all_mddevs);
-               blk_put_queue(mddev->queue);
+               /* that blocks */
+               blk_cleanup_queue(mddev->queue);
+               /* that also blocks */
                kobject_unregister(&mddev->kobj);
+               /* result blows... */
        }
        spin_unlock(&all_mddevs_lock);
 }
@@ -263,6 +267,7 @@ static mddev_t * mddev_find(dev_t unit)
                kfree(new);
                return NULL;
        }
+       set_bit(QUEUE_FLAG_CLUSTER, &new->queue->queue_flags);
 
        blk_queue_make_request(new->queue, md_fail_request);
 
@@ -657,7 +662,8 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version
        }
 
        if (sb->major_version != 0 ||
-           sb->minor_version != 90) {
+           sb->minor_version < 90 ||
+           sb->minor_version > 91) {
                printk(KERN_WARNING "Bad version number %d.%d on %s\n",
                        sb->major_version, sb->minor_version,
                        b);
@@ -742,6 +748,20 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
                mddev->bitmap_offset = 0;
                mddev->default_bitmap_offset = MD_SB_BYTES >> 9;
 
+               if (mddev->minor_version >= 91) {
+                       mddev->reshape_position = sb->reshape_position;
+                       mddev->delta_disks = sb->delta_disks;
+                       mddev->new_level = sb->new_level;
+                       mddev->new_layout = sb->new_layout;
+                       mddev->new_chunk = sb->new_chunk;
+               } else {
+                       mddev->reshape_position = MaxSector;
+                       mddev->delta_disks = 0;
+                       mddev->new_level = mddev->level;
+                       mddev->new_layout = mddev->layout;
+                       mddev->new_chunk = mddev->chunk_size;
+               }
+
                if (sb->state & (1<<MD_SB_CLEAN))
                        mddev->recovery_cp = MaxSector;
                else {
@@ -761,7 +781,8 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
 
                if (sb->state & (1<<MD_SB_BITMAP_PRESENT) &&
                    mddev->bitmap_file == NULL) {
-                       if (mddev->level != 1 && mddev->level != 5 && mddev->level != 6
+                       if (mddev->level != 1 && mddev->level != 4
+                           && mddev->level != 5 && mddev->level != 6
                            && mddev->level != 10) {
                                /* FIXME use a better test */
                                printk(KERN_WARNING "md: bitmaps not supported for this level.\n");
@@ -835,7 +856,6 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev)
 
        sb->md_magic = MD_SB_MAGIC;
        sb->major_version = mddev->major_version;
-       sb->minor_version = mddev->minor_version;
        sb->patch_version = mddev->patch_version;
        sb->gvalid_words  = 0; /* ignored */
        memcpy(&sb->set_uuid0, mddev->uuid+0, 4);
@@ -854,6 +874,17 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev)
        sb->events_hi = (mddev->events>>32);
        sb->events_lo = (u32)mddev->events;
 
+       if (mddev->reshape_position == MaxSector)
+               sb->minor_version = 90;
+       else {
+               sb->minor_version = 91;
+               sb->reshape_position = mddev->reshape_position;
+               sb->new_level = mddev->new_level;
+               sb->delta_disks = mddev->delta_disks;
+               sb->new_layout = mddev->new_layout;
+               sb->new_chunk = mddev->new_chunk;
+       }
+       mddev->minor_version = sb->minor_version;
        if (mddev->in_sync)
        {
                sb->recovery_cp = mddev->recovery_cp;
@@ -890,10 +921,9 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev)
                        d->raid_disk = rdev2->raid_disk;
                else
                        d->raid_disk = rdev2->desc_nr; /* compatibility */
-               if (test_bit(Faulty, &rdev2->flags)) {
+               if (test_bit(Faulty, &rdev2->flags))
                        d->state = (1<<MD_DISK_FAULTY);
-                       failed++;
-               } else if (test_bit(In_sync, &rdev2->flags)) {
+               else if (test_bit(In_sync, &rdev2->flags)) {
                        d->state = (1<<MD_DISK_ACTIVE);
                        d->state |= (1<<MD_DISK_SYNC);
                        active++;
@@ -1024,7 +1054,7 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
                rdev-> sb_size = (rdev->sb_size | bmask)+1;
 
        if (refdev == 0)
-               return 1;
+               ret = 1;
        else {
                __u64 ev1, ev2;
                struct mdp_superblock_1 *refsb = 
@@ -1044,7 +1074,9 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
                ev2 = le64_to_cpu(refsb->events);
 
                if (ev1 > ev2)
-                       return 1;
+                       ret = 1;
+               else
+                       ret = 0;
        }
        if (minor_version) 
                rdev->size = ((rdev->bdev->bd_inode->i_size>>9) - le64_to_cpu(sb->data_offset)) / 2;
@@ -1058,7 +1090,7 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
 
        if (le32_to_cpu(sb->size) > rdev->size*2)
                return -EINVAL;
-       return 0;
+       return ret;
 }
 
 static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
@@ -1081,7 +1113,7 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
                mddev->size = le64_to_cpu(sb->size)/2;
                mddev->events = le64_to_cpu(sb->events);
                mddev->bitmap_offset = 0;
-               mddev->default_bitmap_offset = 1024;
+               mddev->default_bitmap_offset = 1024 >> 9;
                
                mddev->recovery_cp = le64_to_cpu(sb->resync_offset);
                memcpy(mddev->uuid, sb->set_uuid, 16);
@@ -1097,6 +1129,20 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
                        }
                        mddev->bitmap_offset = (__s32)le32_to_cpu(sb->bitmap_offset);
                }
+               if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) {
+                       mddev->reshape_position = le64_to_cpu(sb->reshape_position);
+                       mddev->delta_disks = le32_to_cpu(sb->delta_disks);
+                       mddev->new_level = le32_to_cpu(sb->new_level);
+                       mddev->new_layout = le32_to_cpu(sb->new_layout);
+                       mddev->new_chunk = le32_to_cpu(sb->new_chunk)<<9;
+               } else {
+                       mddev->reshape_position = MaxSector;
+                       mddev->delta_disks = 0;
+                       mddev->new_level = mddev->level;
+                       mddev->new_layout = mddev->layout;
+                       mddev->new_chunk = mddev->chunk_size;
+               }
+
        } else if (mddev->pers == NULL) {
                /* Insist of good event counter while assembling */
                __u64 ev1 = le64_to_cpu(sb->events);
@@ -1162,12 +1208,20 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
        sb->cnt_corrected_read = atomic_read(&rdev->corrected_errors);
 
        sb->raid_disks = cpu_to_le32(mddev->raid_disks);
-       sb->size = cpu_to_le64(mddev->size);
+       sb->size = cpu_to_le64(mddev->size<<1);
 
        if (mddev->bitmap && mddev->bitmap_file == NULL) {
                sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_offset);
                sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET);
        }
+       if (mddev->reshape_position != MaxSector) {
+               sb->feature_map |= cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE);
+               sb->reshape_position = cpu_to_le64(mddev->reshape_position);
+               sb->new_layout = cpu_to_le32(mddev->new_layout);
+               sb->delta_disks = cpu_to_le32(mddev->delta_disks);
+               sb->new_level = cpu_to_le32(mddev->new_level);
+               sb->new_chunk = cpu_to_le32(mddev->new_chunk>>9);
+       }
 
        max_dev = 0;
        ITERATE_RDEV(mddev,rdev2,tmp)
@@ -1296,6 +1350,7 @@ static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev)
        else
                ko = &rdev->bdev->bd_disk->kobj;
        sysfs_create_link(&rdev->kobj, ko, "block");
+       bd_claim_by_disk(rdev->bdev, rdev, mddev->gendisk);
        return 0;
 }
 
@@ -1306,6 +1361,7 @@ static void unbind_rdev_from_array(mdk_rdev_t * rdev)
                MD_BUG();
                return;
        }
+       bd_release_from_disk(rdev->bdev, rdev->mddev->gendisk);
        list_del_init(&rdev->same_set);
        printk(KERN_INFO "md: unbind<%s>\n", bdevname(rdev->bdev,b));
        rdev->mddev = NULL;
@@ -1488,7 +1544,7 @@ static void sync_sbs(mddev_t * mddev)
        }
 }
 
-static void md_update_sb(mddev_t * mddev)
+void md_update_sb(mddev_t * mddev)
 {
        int err;
        struct list_head *tmp;
@@ -1565,6 +1621,7 @@ repeat:
        wake_up(&mddev->sb_wait);
 
 }
+EXPORT_SYMBOL_GPL(md_update_sb);
 
 /* words written to sysfs files may, or my not, be \n terminated.
  * We want to accept with case. For this we use cmd_match.
@@ -2157,7 +2214,9 @@ action_show(mddev_t *mddev, char *page)
        char *type = "idle";
        if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
            test_bit(MD_RECOVERY_NEEDED, &mddev->recovery)) {
-               if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
+               if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
+                       type = "reshape";
+               else if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
                        if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
                                type = "resync";
                        else if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery))
@@ -2534,6 +2593,14 @@ static int do_md_run(mddev_t * mddev)
        mddev->level = pers->level;
        strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel));
 
+       if (mddev->reshape_position != MaxSector &&
+           pers->start_reshape == NULL) {
+               /* This personality cannot handle reshaping... */
+               mddev->pers = NULL;
+               module_put(pers->owner);
+               return -EINVAL;
+       }
+
        mddev->recovery = 0;
        mddev->resync_max_sectors = mddev->size << 1; /* may be over-ridden by personality */
        mddev->barriers_work = 1;
@@ -2767,7 +2834,6 @@ static void autorun_array(mddev_t *mddev)
  */
 static void autorun_devices(int part)
 {
-       struct list_head candidates;
        struct list_head *tmp;
        mdk_rdev_t *rdev0, *rdev;
        mddev_t *mddev;
@@ -2776,6 +2842,7 @@ static void autorun_devices(int part)
        printk(KERN_INFO "md: autorun ...\n");
        while (!list_empty(&pending_raid_disks)) {
                dev_t dev;
+               LIST_HEAD(candidates);
                rdev0 = list_entry(pending_raid_disks.next,
                                         mdk_rdev_t, same_set);
 
@@ -2942,6 +3009,8 @@ static int get_array_info(mddev_t * mddev, void __user * arg)
        info.ctime         = mddev->ctime;
        info.level         = mddev->level;
        info.size          = mddev->size;
+       if (info.size != mddev->size) /* overflow */
+               info.size = -1;
        info.nr_disks      = nr;
        info.raid_disks    = mddev->raid_disks;
        info.md_minor      = mddev->md_minor;
@@ -3420,11 +3489,18 @@ static int set_array_info(mddev_t * mddev, mdu_array_info_t *info)
        mddev->default_bitmap_offset = MD_SB_BYTES >> 9;
        mddev->bitmap_offset = 0;
 
+       mddev->reshape_position = MaxSector;
+
        /*
         * Generate a 128 bit UUID
         */
        get_random_bytes(mddev->uuid, 16);
 
+       mddev->new_level = mddev->level;
+       mddev->new_chunk = mddev->chunk_size;
+       mddev->new_layout = mddev->layout;
+       mddev->delta_disks = 0;
+
        return 0;
 }
 
@@ -3480,14 +3556,16 @@ static int update_raid_disks(mddev_t *mddev, int raid_disks)
 {
        int rv;
        /* change the number of raid disks */
-       if (mddev->pers->reshape == NULL)
+       if (mddev->pers->check_reshape == NULL)
                return -EINVAL;
        if (raid_disks <= 0 ||
            raid_disks >= mddev->max_disks)
                return -EINVAL;
-       if (mddev->sync_thread)
+       if (mddev->sync_thread || mddev->reshape_position != MaxSector)
                return -EBUSY;
-       rv = mddev->pers->reshape(mddev, raid_disks);
+       mddev->delta_disks = raid_disks - mddev->raid_disks;
+
+       rv = mddev->pers->check_reshape(mddev);
        return rv;
 }
 
@@ -3523,7 +3601,7 @@ static int update_array_info(mddev_t *mddev, mdu_array_info_t *info)
                )
                return -EINVAL;
        /* Check there is only one change */
-       if (mddev->size != info->size) cnt++;
+       if (info->size >= 0 && mddev->size != info->size) cnt++;
        if (mddev->raid_disks != info->raid_disks) cnt++;
        if (mddev->layout != info->layout) cnt++;
        if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) cnt++;
@@ -3540,7 +3618,7 @@ static int update_array_info(mddev_t *mddev, mdu_array_info_t *info)
                else
                        return mddev->pers->reconfig(mddev, info->layout, -1);
        }
-       if (mddev->size != info->size)
+       if (info->size >= 0 && mddev->size != info->size)
                rv = update_size(mddev, info->size);
 
        if (mddev->raid_disks    != info->raid_disks)
@@ -4034,7 +4112,10 @@ static void status_unused(struct seq_file *seq)
 
 static void status_resync(struct seq_file *seq, mddev_t * mddev)
 {
-       unsigned long max_blocks, resync, res, dt, db, rt;
+       sector_t max_blocks, resync, res;
+       unsigned long dt, db, rt;
+       int scale;
+       unsigned int per_milli;
 
        resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active))/2;
 
@@ -4050,9 +4131,22 @@ static void status_resync(struct seq_file *seq, mddev_t * mddev)
                MD_BUG();
                return;
        }
-       res = (resync/1024)*1000/(max_blocks/1024 + 1);
+       /* Pick 'scale' such that (resync>>scale)*1000 will fit
+        * in a sector_t, and (max_blocks>>scale) will fit in a
+        * u32, as those are the requirements for sector_div.
+        * Thus 'scale' must be at least 10
+        */
+       scale = 10;
+       if (sizeof(sector_t) > sizeof(unsigned long)) {
+               while ( max_blocks/2 > (1ULL<<(scale+32)))
+                       scale++;
+       }
+       res = (resync>>scale)*1000;
+       sector_div(res, (u32)((max_blocks>>scale)+1));
+
+       per_milli = res;
        {
-               int i, x = res/50, y = 20-x;
+               int i, x = per_milli/50, y = 20-x;
                seq_printf(seq, "[");
                for (i = 0; i < x; i++)
                        seq_printf(seq, "=");
@@ -4061,10 +4155,14 @@ static void status_resync(struct seq_file *seq, mddev_t * mddev)
                        seq_printf(seq, ".");
                seq_printf(seq, "] ");
        }
-       seq_printf(seq, " %s =%3lu.%lu%% (%lu/%lu)",
+       seq_printf(seq, " %s =%3u.%u%% (%llu/%llu)",
+                  (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)?
+                   "reshape" :
                      (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ?
-                      "resync" : "recovery"),
-                     res/10, res % 10, resync, max_blocks);
+                      "resync" : "recovery")),
+                     per_milli/10, per_milli % 10,
+                  (unsigned long long) resync,
+                  (unsigned long long) max_blocks);
 
        /*
         * We do not want to overflow, so the order of operands and
@@ -4078,7 +4176,7 @@ static void status_resync(struct seq_file *seq, mddev_t * mddev)
        dt = ((jiffies - mddev->resync_mark) / HZ);
        if (!dt) dt++;
        db = resync - (mddev->resync_mark_cnt/2);
-       rt = (dt * ((max_blocks-resync) / (db/100+1)))/100;
+       rt = (dt * ((unsigned long)(max_blocks-resync) / (db/100+1)))/100;
 
        seq_printf(seq, " finish=%lu.%lumin", rt / 60, (rt % 60)/6);
 
@@ -4435,7 +4533,7 @@ static DECLARE_WAIT_QUEUE_HEAD(resync_wait);
 
 #define SYNC_MARKS     10
 #define        SYNC_MARK_STEP  (3*HZ)
-static void md_do_sync(mddev_t *mddev)
+void md_do_sync(mddev_t *mddev)
 {
        mddev_t *mddev2;
        unsigned int currspeed = 0,
@@ -4515,7 +4613,9 @@ static void md_do_sync(mddev_t *mddev)
                 */
                max_sectors = mddev->resync_max_sectors;
                mddev->resync_mismatches = 0;
-       } else
+       } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
+               max_sectors = mddev->size << 1;
+       else
                /* recovery follows the physical size of devices */
                max_sectors = mddev->size << 1;
 
@@ -4651,6 +4751,8 @@ static void md_do_sync(mddev_t *mddev)
        mddev->pers->sync_request(mddev, max_sectors, &skipped, 1);
 
        if (!test_bit(MD_RECOVERY_ERR, &mddev->recovery) &&
+           test_bit(MD_RECOVERY_SYNC, &mddev->recovery) &&
+           !test_bit(MD_RECOVERY_CHECK, &mddev->recovery) &&
            mddev->curr_resync > 2 &&
            mddev->curr_resync >= mddev->recovery_cp) {
                if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
@@ -4668,6 +4770,7 @@ static void md_do_sync(mddev_t *mddev)
        set_bit(MD_RECOVERY_DONE, &mddev->recovery);
        md_wakeup_thread(mddev->thread);
 }
+EXPORT_SYMBOL_GPL(md_do_sync);
 
 
 /*