Merge tag 'md/4.10-rc6' of git://git.kernel.org/pub/scm/linux/kernel/git/shli/md
author    Linus Torvalds <torvalds@linux-foundation.org>  Sat, 28 Jan 2017 19:09:04 +0000 (11:09 -0800)
committer Linus Torvalds <torvalds@linux-foundation.org>  Sat, 28 Jan 2017 19:09:04 +0000 (11:09 -0800)
Pull MD fixes from Shaohua Li:
 "This fixes several corner cases for raid5 cache, which is merged into
  this cycle"

* tag 'md/4.10-rc6' of git://git.kernel.org/pub/scm/linux/kernel/git/shli/md:
  md/r5cache: disable write back for degraded array
  md/r5cache: shift complex rmw from read path to write path
  md/r5cache: flush data only stripes in r5l_recovery_log()
  md/raid5: move comment of fetch_block to right location
  md/r5cache: read data into orig_page for prexor of cached data
  md/raid5-cache: delete meaningless code

drivers/md/md.c
drivers/md/raid5-cache.c
drivers/md/raid5.c
drivers/md/raid5.h

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 82821ee0d57fac691d26e0c3f41c2ba31bb611fc..01175dac0db6361f04cd1bcad910a59202cc276d 100644
@@ -5291,6 +5291,11 @@ int md_run(struct mddev *mddev)
        if (start_readonly && mddev->ro == 0)
                mddev->ro = 2; /* read-only, but switch on first write */
 
+       /*
+        * NOTE: some pers->run(), for example r5l_recovery_log(), wakes
+        * up mddev->thread. It is important to initialize critical
+        * resources for mddev->thread BEFORE calling pers->run().
+        */
        err = pers->run(mddev);
        if (err)
                pr_warn("md: pers->run() failed ...\n");
diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index 0e8ed2c327b07fd849c1720d7b272dd860b949b9..302dea3296ba5ccd07740365314f45d74df49ec2 100644
@@ -162,6 +162,8 @@ struct r5l_log {
 
        /* to submit async io_units, to fulfill ordering of flush */
        struct work_struct deferred_io_work;
+       /* to disable write back in degraded mode */
+       struct work_struct disable_writeback_work;
 };
 
 /*
@@ -611,6 +613,21 @@ static void r5l_submit_io_async(struct work_struct *work)
                r5l_do_submit_io(log, io);
 }
 
+static void r5c_disable_writeback_async(struct work_struct *work)
+{
+       struct r5l_log *log = container_of(work, struct r5l_log,
+                                          disable_writeback_work);
+       struct mddev *mddev = log->rdev->mddev;
+
+       if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH)
+               return;
+       pr_info("md/raid:%s: Disabling writeback cache for degraded array.\n",
+               mdname(mddev));
+       mddev_suspend(mddev);
+       log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH;
+       mddev_resume(mddev);
+}
+
 static void r5l_submit_current_io(struct r5l_log *log)
 {
        struct r5l_io_unit *io = log->current_io;
@@ -1393,8 +1410,6 @@ static void r5l_do_reclaim(struct r5l_log *log)
        next_checkpoint = r5c_calculate_new_cp(conf);
        spin_unlock_irq(&log->io_list_lock);
 
-       BUG_ON(reclaimable < 0);
-
        if (reclaimable == 0 || !write_super)
                return;
 
@@ -2062,7 +2077,7 @@ static int
 r5c_recovery_rewrite_data_only_stripes(struct r5l_log *log,
                                       struct r5l_recovery_ctx *ctx)
 {
-       struct stripe_head *sh, *next;
+       struct stripe_head *sh;
        struct mddev *mddev = log->rdev->mddev;
        struct page *page;
        sector_t next_checkpoint = MaxSector;
@@ -2076,7 +2091,7 @@ r5c_recovery_rewrite_data_only_stripes(struct r5l_log *log,
 
        WARN_ON(list_empty(&ctx->cached_list));
 
-       list_for_each_entry_safe(sh, next, &ctx->cached_list, lru) {
+       list_for_each_entry(sh, &ctx->cached_list, lru) {
                struct r5l_meta_block *mb;
                int i;
                int offset;
@@ -2126,14 +2141,39 @@ r5c_recovery_rewrite_data_only_stripes(struct r5l_log *log,
                ctx->pos = write_pos;
                ctx->seq += 1;
                next_checkpoint = sh->log_start;
-               list_del_init(&sh->lru);
-               raid5_release_stripe(sh);
        }
        log->next_checkpoint = next_checkpoint;
        __free_page(page);
        return 0;
 }
 
+static void r5c_recovery_flush_data_only_stripes(struct r5l_log *log,
+                                                struct r5l_recovery_ctx *ctx)
+{
+       struct mddev *mddev = log->rdev->mddev;
+       struct r5conf *conf = mddev->private;
+       struct stripe_head *sh, *next;
+
+       if (ctx->data_only_stripes == 0)
+               return;
+
+       log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_BACK;
+
+       list_for_each_entry_safe(sh, next, &ctx->cached_list, lru) {
+               r5c_make_stripe_write_out(sh);
+               set_bit(STRIPE_HANDLE, &sh->state);
+               list_del_init(&sh->lru);
+               raid5_release_stripe(sh);
+       }
+
+       md_wakeup_thread(conf->mddev->thread);
+       /* reuse conf->wait_for_quiescent in recovery */
+       wait_event(conf->wait_for_quiescent,
+                  atomic_read(&conf->active_stripes) == 0);
+
+       log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH;
+}
+
 static int r5l_recovery_log(struct r5l_log *log)
 {
        struct mddev *mddev = log->rdev->mddev;
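The switch above from list_for_each_entry_safe() to list_for_each_entry() is safe because the loop body no longer removes the current entry; the list_del_init()/raid5_release_stripe() pair moves into r5c_recovery_flush_data_only_stripes() below. The _safe variant exists only to cache the successor before the body runs, which matters solely for destructive walks. A hand-rolled illustration of the two patterns (not the kernel's <linux/list.h>):

#include <stdio.h>
#include <stdlib.h>

struct node { int v; struct node *next; };

int main(void)
{
        struct node *head = NULL;

        for (int i = 3; i >= 1; i--) {          /* build 1 -> 2 -> 3 */
                struct node *n = malloc(sizeof(*n));
                n->v = i;
                n->next = head;
                head = n;
        }

        /* non-destructive walk: plain iteration is enough */
        for (struct node *n = head; n; n = n->next)
                printf("%d ", n->v);
        putchar('\n');

        /* destructive walk: cache next BEFORE freeing -- the _safe pattern */
        for (struct node *n = head, *next; n; n = next) {
                next = n->next;
                free(n);
        }
        return 0;
}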
@@ -2160,32 +2200,31 @@ static int r5l_recovery_log(struct r5l_log *log)
        pos = ctx.pos;
        ctx.seq += 10000;
 
-       if (ctx.data_only_stripes == 0) {
-               log->next_checkpoint = ctx.pos;
-               r5l_log_write_empty_meta_block(log, ctx.pos, ctx.seq++);
-               ctx.pos = r5l_ring_add(log, ctx.pos, BLOCK_SECTORS);
-       }
 
        if ((ctx.data_only_stripes == 0) && (ctx.data_parity_stripes == 0))
                pr_debug("md/raid:%s: starting from clean shutdown\n",
                         mdname(mddev));
-       else {
+       else
                pr_debug("md/raid:%s: recovering %d data-only stripes and %d data-parity stripes\n",
                         mdname(mddev), ctx.data_only_stripes,
                         ctx.data_parity_stripes);
 
-               if (ctx.data_only_stripes > 0)
-                       if (r5c_recovery_rewrite_data_only_stripes(log, &ctx)) {
-                               pr_err("md/raid:%s: failed to rewrite stripes to journal\n",
-                                      mdname(mddev));
-                               return -EIO;
-                       }
+       if (ctx.data_only_stripes == 0) {
+               log->next_checkpoint = ctx.pos;
+               r5l_log_write_empty_meta_block(log, ctx.pos, ctx.seq++);
+               ctx.pos = r5l_ring_add(log, ctx.pos, BLOCK_SECTORS);
+       } else if (r5c_recovery_rewrite_data_only_stripes(log, &ctx)) {
+               pr_err("md/raid:%s: failed to rewrite stripes to journal\n",
+                      mdname(mddev));
+               return -EIO;
        }
 
        log->log_start = ctx.pos;
        log->seq = ctx.seq;
        log->last_checkpoint = pos;
        r5l_write_super(log, pos);
+
+       r5c_recovery_flush_data_only_stripes(log, &ctx);
        return 0;
 }
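r5c_recovery_flush_data_only_stripes() above follows a classic drain pattern: mark every cached stripe for write-out, wake the array's handler thread, then sleep on conf->wait_for_quiescent until active_stripes drops to zero before returning to write-through mode. A userspace sketch of that wait-for-zero handshake, with a counter and condition variable standing in for the kernel's wait_event machinery (all names illustrative):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  quiescent = PTHREAD_COND_INITIALIZER;
static int active_stripes = 8;  /* stand-in for conf->active_stripes */

static void *stripe_handler(void *arg)
{
        for (;;) {
                pthread_mutex_lock(&lock);
                if (active_stripes == 0) {
                        pthread_mutex_unlock(&lock);
                        return NULL;
                }
                active_stripes--;       /* "write out" one stripe */
                if (active_stripes == 0)
                        pthread_cond_signal(&quiescent);
                pthread_mutex_unlock(&lock);
        }
}

int main(void)
{
        pthread_t t;

        pthread_create(&t, NULL, stripe_handler, NULL);

        pthread_mutex_lock(&lock);
        while (active_stripes != 0)     /* wait_event(wait_for_quiescent, ...) */
                pthread_cond_wait(&quiescent, &lock);
        pthread_mutex_unlock(&lock);

        puts("all stripes flushed; safe to return to write-through");
        pthread_join(t, NULL);
        return 0;
}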
 
@@ -2247,6 +2286,10 @@ static ssize_t r5c_journal_mode_store(struct mddev *mddev,
            val > R5C_JOURNAL_MODE_WRITE_BACK)
                return -EINVAL;
 
+       if (raid5_calc_degraded(conf) > 0 &&
+           val == R5C_JOURNAL_MODE_WRITE_BACK)
+               return -EINVAL;
+
        mddev_suspend(mddev);
        conf->log->r5c_journal_mode = val;
        mddev_resume(mddev);
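With the new check in r5c_journal_mode_store(), the sysfs knob now refuses to enable write-back while the array is degraded, matching what the error path enforces. The rule is small enough to model exactly; a self-contained sketch in which the enum values mirror the kernel's but the helper itself is hypothetical:

#include <assert.h>
#include <errno.h>

enum {
        R5C_JOURNAL_MODE_WRITE_THROUGH = 0,
        R5C_JOURNAL_MODE_WRITE_BACK    = 1,
};

static int journal_mode_store(int degraded, int val)
{
        if (val < R5C_JOURNAL_MODE_WRITE_THROUGH ||
            val > R5C_JOURNAL_MODE_WRITE_BACK)
                return -EINVAL;
        if (degraded > 0 && val == R5C_JOURNAL_MODE_WRITE_BACK)
                return -EINVAL;         /* the check added by this series */
        return 0;
}

int main(void)
{
        assert(journal_mode_store(0, R5C_JOURNAL_MODE_WRITE_BACK) == 0);
        assert(journal_mode_store(1, R5C_JOURNAL_MODE_WRITE_BACK) == -EINVAL);
        assert(journal_mode_store(1, R5C_JOURNAL_MODE_WRITE_THROUGH) == 0);
        return 0;
}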
@@ -2301,6 +2344,16 @@ int r5c_try_caching_write(struct r5conf *conf,
                set_bit(STRIPE_R5C_CACHING, &sh->state);
        }
 
+       /*
+        * When run in degraded mode, the array is set to write-through mode.
+        * This check helps drain pending writes safely in the transition to
+        * write-through mode.
+        */
+       if (s->failed) {
+               r5c_make_stripe_write_out(sh);
+               return -EAGAIN;
+       }
+
        for (i = disks; i--; ) {
                dev = &sh->dev[i];
                /* if non-overwrite, use writing-out phase */
@@ -2351,6 +2404,8 @@ void r5c_release_extra_page(struct stripe_head *sh)
                        struct page *p = sh->dev[i].orig_page;
 
                        sh->dev[i].orig_page = sh->dev[i].page;
+                       clear_bit(R5_OrigPageUPTDODATE, &sh->dev[i].flags);
+
                        if (!using_disk_info_extra_page)
                                put_page(p);
                }
@@ -2555,6 +2610,19 @@ ioerr:
        return ret;
 }
 
+void r5c_update_on_rdev_error(struct mddev *mddev)
+{
+       struct r5conf *conf = mddev->private;
+       struct r5l_log *log = conf->log;
+
+       if (!log)
+               return;
+
+       if (raid5_calc_degraded(conf) > 0 &&
+           conf->log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_BACK)
+               schedule_work(&log->disable_writeback_work);
+}
+
 int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
 {
        struct request_queue *q = bdev_get_queue(rdev->bdev);
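Note that r5c_update_on_rdev_error() only schedules disable_writeback_work rather than switching modes inline — presumably because the switch needs mddev_suspend()/mddev_resume(), which can sleep and quiesce the whole array, while the rdev error path should stay cheap and non-blocking. A toy model of that defer-the-heavy-part shape, with a detached thread standing in for the kernel workqueue (every name here is illustrative):

#include <pthread.h>
#include <stdio.h>

static void *disable_writeback_work(void *arg)
{
        /* may sleep here: the kernel analogue is mddev_suspend()/resume() */
        puts("suspend, switch to write-through, resume");
        return NULL;
}

static void schedule_work_toy(void *(*fn)(void *))
{
        pthread_t t;

        pthread_create(&t, NULL, fn, NULL);
        pthread_detach(t);
}

static void on_rdev_error(int degraded, int write_back)
{
        /* cheap checks only; the expensive part is deferred */
        if (degraded > 0 && write_back)
                schedule_work_toy(disable_writeback_work);
}

int main(void)
{
        on_rdev_error(1, 1);
        pthread_exit(NULL);     /* let the detached worker finish first */
}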
@@ -2627,6 +2695,7 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
        spin_lock_init(&log->no_space_stripes_lock);
 
        INIT_WORK(&log->deferred_io_work, r5l_submit_io_async);
+       INIT_WORK(&log->disable_writeback_work, r5c_disable_writeback_async);
 
        log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH;
        INIT_LIST_HEAD(&log->stripe_in_journal_list);
@@ -2659,6 +2728,7 @@ io_kc:
 
 void r5l_exit_log(struct r5l_log *log)
 {
+       flush_work(&log->disable_writeback_work);
        md_unregister_thread(&log->reclaim_thread);
        mempool_destroy(log->meta_pool);
        bioset_free(log->bs);
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 36c13e4be9c9e5d0cedacb59910e7b4482eb6ddd..3c7e106c12a246046abc67c479f589d10966bce3 100644
@@ -556,7 +556,7 @@ static struct stripe_head *__find_stripe(struct r5conf *conf, sector_t sector,
  * of the two sections, and some non-in_sync devices may
  * be insync in the section most affected by failed devices.
  */
-static int calc_degraded(struct r5conf *conf)
+int raid5_calc_degraded(struct r5conf *conf)
 {
        int degraded, degraded2;
        int i;
@@ -619,7 +619,7 @@ static int has_failed(struct r5conf *conf)
        if (conf->mddev->reshape_position == MaxSector)
                return conf->mddev->degraded > conf->max_degraded;
 
-       degraded = calc_degraded(conf);
+       degraded = raid5_calc_degraded(conf);
        if (degraded > conf->max_degraded)
                return 1;
        return 0;
@@ -1015,7 +1015,17 @@ again:
 
                        if (test_bit(R5_SkipCopy, &sh->dev[i].flags))
                                WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags));
-                       sh->dev[i].vec.bv_page = sh->dev[i].page;
+
+                       if (!op_is_write(op) &&
+                           test_bit(R5_InJournal, &sh->dev[i].flags))
+                               /*
+                                * issuing read for a page in journal, this
+                                * must be preparing for prexor in rmw; read
+                                * the data into orig_page
+                                */
+                               sh->dev[i].vec.bv_page = sh->dev[i].orig_page;
+                       else
+                               sh->dev[i].vec.bv_page = sh->dev[i].page;
                        bi->bi_vcnt = 1;
                        bi->bi_io_vec[0].bv_len = STRIPE_SIZE;
                        bi->bi_io_vec[0].bv_offset = 0;
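The branch above picks the bio's target page: a read issued for a device whose data is already in the journal can only be the prexor preparation of an rmw, so it must land in orig_page, leaving page free to receive the prexor result. The selection rule as a standalone truth table, with simplified stand-ins for the r5dev fields:

#include <assert.h>
#include <stdbool.h>

struct toy_dev { char *page, *orig_page; };

static char *read_target(const struct toy_dev *dev, bool is_write,
                         bool in_journal)
{
        if (!is_write && in_journal)
                return dev->orig_page;  /* preparing prexor in rmw */
        return dev->page;
}

int main(void)
{
        char page[8], orig[8];
        struct toy_dev dev = { page, orig };

        assert(read_target(&dev, false, true)  == orig);
        assert(read_target(&dev, false, false) == page);
        assert(read_target(&dev, true,  true)  == page);
        return 0;
}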
@@ -2380,6 +2390,13 @@ static void raid5_end_read_request(struct bio * bi)
                } else if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
                        clear_bit(R5_ReadNoMerge, &sh->dev[i].flags);
 
+               if (test_bit(R5_InJournal, &sh->dev[i].flags))
+                       /*
+                        * end read for a page in journal, this
+                        * must be preparing for prexor in rmw
+                        */
+                       set_bit(R5_OrigPageUPTDODATE, &sh->dev[i].flags);
+
                if (atomic_read(&rdev->read_errors))
                        atomic_set(&rdev->read_errors, 0);
        } else {
@@ -2538,7 +2555,7 @@ static void raid5_error(struct mddev *mddev, struct md_rdev *rdev)
 
        spin_lock_irqsave(&conf->device_lock, flags);
        clear_bit(In_sync, &rdev->flags);
-       mddev->degraded = calc_degraded(conf);
+       mddev->degraded = raid5_calc_degraded(conf);
        spin_unlock_irqrestore(&conf->device_lock, flags);
        set_bit(MD_RECOVERY_INTR, &mddev->recovery);
 
@@ -2552,6 +2569,7 @@ static void raid5_error(struct mddev *mddev, struct md_rdev *rdev)
                bdevname(rdev->bdev, b),
                mdname(mddev),
                conf->raid_disks - mddev->degraded);
+       r5c_update_on_rdev_error(mddev);
 }
 
 /*
@@ -2880,6 +2898,30 @@ sector_t raid5_compute_blocknr(struct stripe_head *sh, int i, int previous)
        return r_sector;
 }
 
+/*
+ * There are cases where we want handle_stripe_dirtying() and
+ * schedule_reconstruction() to delay towrite to some dev of a stripe.
+ *
+ * This function checks whether we want to delay the towrite. Specifically,
+ * we delay the towrite when:
+ *
+ *   1. degraded stripe has a non-overwrite to the missing dev, AND this
+ *      stripe has data in journal (for other devices).
+ *
+ *      In this case, when reading data for the non-overwrite dev, it is
+ *      necessary to handle complex rmw of write back cache (prexor with
+ *      orig_page, and xor with page). To keep read path simple, we would
+ *      like to flush data in journal to RAID disks first, so complex rmw
+ *      is handled in the write path (handle_stripe_dirtying).
+ *
+ */
+static inline bool delay_towrite(struct r5dev *dev,
+                                  struct stripe_head_state *s)
+{
+       return !test_bit(R5_OVERWRITE, &dev->flags) &&
+               !test_bit(R5_Insync, &dev->flags) && s->injournal;
+}
+
 static void
 schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
                         int rcw, int expand)
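delay_towrite() above is a pure predicate, so its behavior can be pinned down as a truth table: a write is deferred only when it is a non-overwrite to a dev that is not in-sync while the stripe still has data in the journal. A standalone check under those simplified flags:

#include <assert.h>
#include <stdbool.h>

static bool delay_towrite_toy(bool overwrite, bool insync, int injournal)
{
        return !overwrite && !insync && injournal;
}

int main(void)
{
        assert( delay_towrite_toy(false, false, 1)); /* the degraded rmw case */
        assert(!delay_towrite_toy(true,  false, 1)); /* full overwrite: no read */
        assert(!delay_towrite_toy(false, true,  1)); /* dev is in-sync */
        assert(!delay_towrite_toy(false, false, 0)); /* nothing in journal */
        return 0;
}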
@@ -2900,7 +2942,7 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
                for (i = disks; i--; ) {
                        struct r5dev *dev = &sh->dev[i];
 
-                       if (dev->towrite) {
+                       if (dev->towrite && !delay_towrite(dev, s)) {
                                set_bit(R5_LOCKED, &dev->flags);
                                set_bit(R5_Wantdrain, &dev->flags);
                                if (!expand)
@@ -3295,13 +3337,6 @@ static int want_replace(struct stripe_head *sh, int disk_idx)
        return rv;
 }
 
-/* fetch_block - checks the given member device to see if its data needs
- * to be read or computed to satisfy a request.
- *
- * Returns 1 when no more member devices need to be checked, otherwise returns
- * 0 to tell the loop in handle_stripe_fill to continue
- */
-
 static int need_this_block(struct stripe_head *sh, struct stripe_head_state *s,
                           int disk_idx, int disks)
 {
@@ -3392,6 +3427,12 @@ static int need_this_block(struct stripe_head *sh, struct stripe_head_state *s,
        return 0;
 }
 
+/* fetch_block - checks the given member device to see if its data needs
+ * to be read or computed to satisfy a request.
+ *
+ * Returns 1 when no more member devices need to be checked, otherwise returns
+ * 0 to tell the loop in handle_stripe_fill to continue
+ */
 static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s,
                       int disk_idx, int disks)
 {
@@ -3478,10 +3519,26 @@ static void handle_stripe_fill(struct stripe_head *sh,
         * midst of changing due to a write
         */
        if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state &&
-           !sh->reconstruct_state)
+           !sh->reconstruct_state) {
+
+               /*
+                * For a degraded stripe with data in the journal, do not
+                * handle read requests yet; instead, flush the stripe to
+                * raid disks first. This avoids handling complex rmw of
+                * write back cache (prexor with orig_page, then xor with
+                * page) in the read path.
+                */
+               if (s->injournal && s->failed) {
+                       if (test_bit(STRIPE_R5C_CACHING, &sh->state))
+                               r5c_make_stripe_write_out(sh);
+                       goto out;
+               }
+
                for (i = disks; i--; )
                        if (fetch_block(sh, s, i, disks))
                                break;
+       }
+out:
        set_bit(STRIPE_HANDLE, &sh->state);
 }
 
@@ -3594,6 +3651,21 @@ unhash:
                break_stripe_batch_list(head_sh, STRIPE_EXPAND_SYNC_FLAGS);
 }
 
+/*
+ * For RMW in write back cache, we need an extra page in prexor to store the
+ * old data. This page is stored in dev->orig_page.
+ *
+ * This function checks whether we have data for prexor. The exact logic
+ * is:
+ *       R5_UPTODATE && (!R5_InJournal || R5_OrigPageUPTDODATE)
+ */
+static inline bool uptodate_for_rmw(struct r5dev *dev)
+{
+       return (test_bit(R5_UPTODATE, &dev->flags)) &&
+               (!test_bit(R5_InJournal, &dev->flags) ||
+                test_bit(R5_OrigPageUPTDODATE, &dev->flags));
+}
+
 static int handle_stripe_dirtying(struct r5conf *conf,
                                  struct stripe_head *sh,
                                  struct stripe_head_state *s,
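uptodate_for_rmw() above replaces the old open-coded test (which compared dev->page against dev->orig_page) with the explicit R5_OrigPageUPTDODATE flag. Its formula — R5_UPTODATE && (!R5_InJournal || R5_OrigPageUPTDODATE) — is again a pure predicate; a standalone truth-table check with simplified booleans:

#include <assert.h>
#include <stdbool.h>

static bool uptodate_for_rmw_toy(bool uptodate, bool in_journal,
                                 bool orig_page_uptodate)
{
        return uptodate && (!in_journal || orig_page_uptodate);
}

int main(void)
{
        assert( uptodate_for_rmw_toy(true,  false, false)); /* plain cached page */
        assert( uptodate_for_rmw_toy(true,  true,  true));  /* journal + fresh orig_page */
        assert(!uptodate_for_rmw_toy(true,  true,  false)); /* must re-read into orig_page */
        assert(!uptodate_for_rmw_toy(false, false, false)); /* not up to date at all */
        return 0;
}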
@@ -3622,12 +3694,11 @@ static int handle_stripe_dirtying(struct r5conf *conf,
        } else for (i = disks; i--; ) {
                /* would I have to read this buffer for read_modify_write */
                struct r5dev *dev = &sh->dev[i];
-               if ((dev->towrite || i == sh->pd_idx || i == sh->qd_idx ||
+               if (((dev->towrite && !delay_towrite(dev, s)) ||
+                    i == sh->pd_idx || i == sh->qd_idx ||
                     test_bit(R5_InJournal, &dev->flags)) &&
                    !test_bit(R5_LOCKED, &dev->flags) &&
-                   !((test_bit(R5_UPTODATE, &dev->flags) &&
-                      (!test_bit(R5_InJournal, &dev->flags) ||
-                       dev->page != dev->orig_page)) ||
+                   !(uptodate_for_rmw(dev) ||
                      test_bit(R5_Wantcompute, &dev->flags))) {
                        if (test_bit(R5_Insync, &dev->flags))
                                rmw++;
@@ -3639,7 +3710,6 @@ static int handle_stripe_dirtying(struct r5conf *conf,
                    i != sh->pd_idx && i != sh->qd_idx &&
                    !test_bit(R5_LOCKED, &dev->flags) &&
                    !(test_bit(R5_UPTODATE, &dev->flags) ||
-                     test_bit(R5_InJournal, &dev->flags) ||
                      test_bit(R5_Wantcompute, &dev->flags))) {
                        if (test_bit(R5_Insync, &dev->flags))
                                rcw++;
@@ -3689,13 +3759,11 @@ static int handle_stripe_dirtying(struct r5conf *conf,
 
                for (i = disks; i--; ) {
                        struct r5dev *dev = &sh->dev[i];
-                       if ((dev->towrite ||
+                       if (((dev->towrite && !delay_towrite(dev, s)) ||
                             i == sh->pd_idx || i == sh->qd_idx ||
                             test_bit(R5_InJournal, &dev->flags)) &&
                            !test_bit(R5_LOCKED, &dev->flags) &&
-                           !((test_bit(R5_UPTODATE, &dev->flags) &&
-                              (!test_bit(R5_InJournal, &dev->flags) ||
-                               dev->page != dev->orig_page)) ||
+                           !(uptodate_for_rmw(dev) ||
                              test_bit(R5_Wantcompute, &dev->flags)) &&
                            test_bit(R5_Insync, &dev->flags)) {
                                if (test_bit(STRIPE_PREREAD_ACTIVE,
@@ -3722,7 +3790,6 @@ static int handle_stripe_dirtying(struct r5conf *conf,
                            i != sh->pd_idx && i != sh->qd_idx &&
                            !test_bit(R5_LOCKED, &dev->flags) &&
                            !(test_bit(R5_UPTODATE, &dev->flags) ||
-                             test_bit(R5_InJournal, &dev->flags) ||
                              test_bit(R5_Wantcompute, &dev->flags))) {
                                rcw++;
                                if (test_bit(R5_Insync, &dev->flags) &&
@@ -7025,7 +7092,7 @@ static int raid5_run(struct mddev *mddev)
        /*
         * 0 for a fully functional array, 1 or 2 for a degraded array.
         */
-       mddev->degraded = calc_degraded(conf);
+       mddev->degraded = raid5_calc_degraded(conf);
 
        if (has_failed(conf)) {
                pr_crit("md/raid:%s: not enough operational devices (%d/%d failed)\n",
@@ -7272,7 +7339,7 @@ static int raid5_spare_active(struct mddev *mddev)
                }
        }
        spin_lock_irqsave(&conf->device_lock, flags);
-       mddev->degraded = calc_degraded(conf);
+       mddev->degraded = raid5_calc_degraded(conf);
        spin_unlock_irqrestore(&conf->device_lock, flags);
        print_raid5_conf(conf);
        return count;
@@ -7632,7 +7699,7 @@ static int raid5_start_reshape(struct mddev *mddev)
                 * pre and post number of devices.
                 */
                spin_lock_irqsave(&conf->device_lock, flags);
-               mddev->degraded = calc_degraded(conf);
+               mddev->degraded = raid5_calc_degraded(conf);
                spin_unlock_irqrestore(&conf->device_lock, flags);
        }
        mddev->raid_disks = conf->raid_disks;
@@ -7720,7 +7787,7 @@ static void raid5_finish_reshape(struct mddev *mddev)
                } else {
                        int d;
                        spin_lock_irq(&conf->device_lock);
-                       mddev->degraded = calc_degraded(conf);
+                       mddev->degraded = raid5_calc_degraded(conf);
                        spin_unlock_irq(&conf->device_lock);
                        for (d = conf->raid_disks ;
                             d < conf->raid_disks - mddev->delta_disks;
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index ed8e1362ab3698e6608aceee90614501bb1a69b2..1440fa26e29629c4f9acc098f0fa9035f5ff1d1a 100644
@@ -322,6 +322,11 @@ enum r5dev_flags {
                         * data and parity being written are in the journal
                         * device
                         */
+       R5_OrigPageUPTDODATE,   /* with write back cache, we read old data into
+                                * dev->orig_page for prexor. When this flag is
+                                * set, orig_page contains latest data in the
+                                * raid disk.
+                                */
 };
 
 /*
@@ -753,6 +758,7 @@ extern sector_t raid5_compute_sector(struct r5conf *conf, sector_t r_sector,
 extern struct stripe_head *
 raid5_get_active_stripe(struct r5conf *conf, sector_t sector,
                        int previous, int noblock, int noquiesce);
+extern int raid5_calc_degraded(struct r5conf *conf);
 extern int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev);
 extern void r5l_exit_log(struct r5l_log *log);
 extern int r5l_write_stripe(struct r5l_log *log, struct stripe_head *head_sh);
@@ -781,4 +787,5 @@ extern void r5c_flush_cache(struct r5conf *conf, int num);
 extern void r5c_check_stripe_cache_usage(struct r5conf *conf);
 extern void r5c_check_cached_full_stripe(struct r5conf *conf);
 extern struct md_sysfs_entry r5c_journal_mode;
+extern void r5c_update_on_rdev_error(struct mddev *mddev);
 #endif