Merge branch 'md-next' into md-linus
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 5f9e28443c8acbe5335f01638444c13f60d3c810..06d7279bdd048e66369961194bc1d21567a08446 100644
@@ -70,19 +70,6 @@ module_param(devices_handle_discard_safely, bool, 0644);
 MODULE_PARM_DESC(devices_handle_discard_safely,
                 "Set to Y if all devices in each array reliably return zeroes on reads from discarded regions");
 static struct workqueue_struct *raid5_wq;
-/*
- * Stripe cache
- */
-
-#define NR_STRIPES             256
-#define STRIPE_SIZE            PAGE_SIZE
-#define STRIPE_SHIFT           (PAGE_SHIFT - 9)
-#define STRIPE_SECTORS         (STRIPE_SIZE>>9)
-#define        IO_THRESHOLD            1
-#define BYPASS_THRESHOLD       1
-#define NR_HASH                        (PAGE_SIZE / sizeof(struct hlist_head))
-#define HASH_MASK              (NR_HASH - 1)
-#define MAX_STRIPE_BATCH       8
 
 static inline struct hlist_head *stripe_hash(struct r5conf *conf, sector_t sect)
 {
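
The constants removed in the hunk above (NR_STRIPES, STRIPE_SIZE, STRIPE_SECTORS, NR_HASH, HASH_MASK, ...) are not going away; the hunk only shows them being lifted out of raid5.c, presumably into a shared header so the raid5-cache code can use them too. The hashing they support is unchanged: stripe_hash() drops the in-stripe sector offset and masks the result into one of NR_HASH buckets. A minimal userspace sketch of that reduction, assuming a 4 KiB PAGE_SIZE and 64-bit pointers:

    #include <stdio.h>

    /* Mirrors the removed macros for PAGE_SIZE == 4096 (assumption). */
    #define STRIPE_SHIFT   (12 - 9)                  /* PAGE_SHIFT - 9 */
    #define NR_HASH        (4096 / sizeof(void *))   /* hlist_head is one pointer */
    #define HASH_MASK      (NR_HASH - 1)

    /* Same reduction stripe_hash() performs on a stripe's start sector. */
    static unsigned long stripe_hash_bucket(unsigned long long sect)
    {
            return (unsigned long)((sect >> STRIPE_SHIFT) & HASH_MASK);
    }

    int main(void)
    {
            /* Consecutive stripes (8 sectors apart with 4 KiB pages) land in
             * consecutive buckets; the table wraps after NR_HASH stripes. */
            printf("sector %5d -> bucket %lu\n", 0,    stripe_hash_bucket(0));
            printf("sector %5d -> bucket %lu\n", 8,    stripe_hash_bucket(8));
            printf("sector %5d -> bucket %lu\n", 4096, stripe_hash_bucket(4096));
            return 0;
    }
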
@@ -126,64 +113,6 @@ static inline void unlock_all_device_hash_locks_irq(struct r5conf *conf)
        local_irq_enable();
 }
 
-/* bio's attached to a stripe+device for I/O are linked together in bi_sector
- * order without overlap.  There may be several bio's per stripe+device, and
- * a bio could span several devices.
- * When walking this list for a particular stripe+device, we must never proceed
- * beyond a bio that extends past this device, as the next bio might no longer
- * be valid.
- * This function is used to determine the 'next' bio in the list, given the sector
- * of the current stripe+device
- */
-static inline struct bio *r5_next_bio(struct bio *bio, sector_t sector)
-{
-       int sectors = bio_sectors(bio);
-       if (bio->bi_iter.bi_sector + sectors < sector + STRIPE_SECTORS)
-               return bio->bi_next;
-       else
-               return NULL;
-}
-
-/*
- * We maintain a biased count of active stripes in the bottom 16 bits of
- * bi_phys_segments, and a count of processed stripes in the upper 16 bits
- */
-static inline int raid5_bi_processed_stripes(struct bio *bio)
-{
-       atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
-       return (atomic_read(segments) >> 16) & 0xffff;
-}
-
-static inline int raid5_dec_bi_active_stripes(struct bio *bio)
-{
-       atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
-       return atomic_sub_return(1, segments) & 0xffff;
-}
-
-static inline void raid5_inc_bi_active_stripes(struct bio *bio)
-{
-       atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
-       atomic_inc(segments);
-}
-
-static inline void raid5_set_bi_processed_stripes(struct bio *bio,
-       unsigned int cnt)
-{
-       atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
-       int old, new;
-
-       do {
-               old = atomic_read(segments);
-               new = (old & 0xffff) | (cnt << 16);
-       } while (atomic_cmpxchg(segments, old, new) != old);
-}
-
-static inline void raid5_set_bi_stripes(struct bio *bio, unsigned int cnt)
-{
-       atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
-       atomic_set(segments, cnt);
-}
-
 /* Find first data disk in a raid6 stripe */
 static inline int raid6_d0(struct stripe_head *sh)
 {
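
The helpers removed above overload bio->bi_phys_segments as two packed 16-bit counters: the low half counts stripes that still hold a reference to the bio, the high half counts stripes already processed. The hunk only shows the removal; the helpers are presumably relocated to a shared header so the journal code can use them as well. A standalone sketch of the packing using C11 atomics (names are illustrative):

    #include <stdatomic.h>
    #include <stdio.h>

    /*
     * Toy model of the removed helpers: one 32-bit word holds the active-stripe
     * count in bits 0..15 and the processed-stripe count in bits 16..31.
     */
    static atomic_uint segments;

    static void inc_active(void)     { atomic_fetch_add(&segments, 1); }
    static unsigned dec_active(void) { return (atomic_fetch_sub(&segments, 1) - 1) & 0xffff; }
    static unsigned processed(void)  { return (atomic_load(&segments) >> 16) & 0xffff; }

    static void set_processed(unsigned cnt)
    {
            unsigned old = atomic_load(&segments), new;

            do {    /* keep the low 16 bits, replace the high 16 bits */
                    new = (old & 0xffff) | (cnt << 16);
            } while (!atomic_compare_exchange_weak(&segments, &old, new));
    }

    int main(void)
    {
            atomic_store(&segments, 1);     /* bio starts with one reference */
            inc_active();
            set_processed(3);
            printf("processed=%u active-after-dec=%u\n", processed(), dec_active());
            return 0;
    }
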
@@ -289,8 +218,27 @@ static void raid5_wakeup_stripe_thread(struct stripe_head *sh)
 static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh,
                              struct list_head *temp_inactive_list)
 {
+       int i;
+       int injournal = 0;      /* number of data pages with R5_InJournal */
+
        BUG_ON(!list_empty(&sh->lru));
        BUG_ON(atomic_read(&conf->active_stripes)==0);
+
+       if (r5c_is_writeback(conf->log))
+               for (i = sh->disks; i--; )
+                       if (test_bit(R5_InJournal, &sh->dev[i].flags))
+                               injournal++;
+       /*
+        * When quiescing in r5c write back mode, set STRIPE_HANDLE for stripes
+        * with data in the journal so they are not released to the cached lists
+        */
+       if (conf->quiesce && r5c_is_writeback(conf->log) &&
+           !test_bit(STRIPE_HANDLE, &sh->state) && injournal != 0) {
+               if (test_bit(STRIPE_R5C_CACHING, &sh->state))
+                       r5c_make_stripe_write_out(sh);
+               set_bit(STRIPE_HANDLE, &sh->state);
+       }
+
        if (test_bit(STRIPE_HANDLE, &sh->state)) {
                if (test_bit(STRIPE_DELAYED, &sh->state) &&
                    !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
@@ -316,8 +264,30 @@ static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh,
                            < IO_THRESHOLD)
                                md_wakeup_thread(conf->mddev->thread);
                atomic_dec(&conf->active_stripes);
-               if (!test_bit(STRIPE_EXPANDING, &sh->state))
-                       list_add_tail(&sh->lru, temp_inactive_list);
+               if (!test_bit(STRIPE_EXPANDING, &sh->state)) {
+                       if (!r5c_is_writeback(conf->log))
+                               list_add_tail(&sh->lru, temp_inactive_list);
+                       else {
+                               WARN_ON(test_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags));
+                               if (injournal == 0)
+                                       list_add_tail(&sh->lru, temp_inactive_list);
+                               else if (injournal == conf->raid_disks - conf->max_degraded) {
+                                       /* full stripe */
+                                       if (!test_and_set_bit(STRIPE_R5C_FULL_STRIPE, &sh->state))
+                                               atomic_inc(&conf->r5c_cached_full_stripes);
+                                       if (test_and_clear_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state))
+                                               atomic_dec(&conf->r5c_cached_partial_stripes);
+                                       list_add_tail(&sh->lru, &conf->r5c_full_stripe_list);
+                                       r5c_check_cached_full_stripe(conf);
+                               } else {
+                                       /* partial stripe */
+                                       if (!test_and_set_bit(STRIPE_R5C_PARTIAL_STRIPE,
+                                                             &sh->state))
+                                               atomic_inc(&conf->r5c_cached_partial_stripes);
+                                       list_add_tail(&sh->lru, &conf->r5c_partial_stripe_list);
+                               }
+                       }
+               }
        }
 }
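
With write-back caching, do_release_stripe() no longer sends every idle stripe back to the inactive list: a stripe whose dirty data pages all still live only in the journal is parked on r5c_full_stripe_list (it can later be written out as a cheap full-stripe write), a stripe with some cached pages goes to r5c_partial_stripe_list, and only a stripe with nothing in the journal is truly released. A small sketch of just that classification (the enum and names are illustrative, not kernel types):

    /*
     * Classification only, mirroring the branch added above. 'injournal' is the
     * count of R5_InJournal data pages; 'data_disks' is raid_disks - max_degraded.
     */
    enum r5c_list { R5C_INACTIVE, R5C_FULL, R5C_PARTIAL };

    static enum r5c_list r5c_classify(int injournal, int data_disks)
    {
            if (injournal == 0)
                    return R5C_INACTIVE;  /* nothing cached: stripe can be reused */
            if (injournal == data_disks)
                    return R5C_FULL;      /* every data block cached: cheap write-out later */
            return R5C_PARTIAL;           /* some blocks cached: may still be filled */
    }
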
 
@@ -541,7 +511,7 @@ retry:
 
                if (dev->toread || dev->read || dev->towrite || dev->written ||
                    test_bit(R5_LOCKED, &dev->flags)) {
-                       printk(KERN_ERR "sector=%llx i=%d %p %p %p %p %d\n",
+                       pr_err("sector=%llx i=%d %p %p %p %p %d\n",
                               (unsigned long long)sh->sector, i, dev->toread,
                               dev->read, dev->towrite, dev->written,
                               test_bit(R5_LOCKED, &dev->flags));
@@ -680,9 +650,12 @@ raid5_get_active_stripe(struct r5conf *conf, sector_t sector,
                        }
                        if (noblock && sh == NULL)
                                break;
+
+                       r5c_check_stripe_cache_usage(conf);
                        if (!sh) {
                                set_bit(R5_INACTIVE_BLOCKED,
                                        &conf->cache_state);
+                               r5l_wake_reclaim(conf->log, 0);
                                wait_event_lock_irq(
                                        conf->wait_for_stripe,
                                        !list_empty(conf->inactive_list + hash) &&
@@ -901,8 +874,19 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
 
        might_sleep();
 
-       if (r5l_write_stripe(conf->log, sh) == 0)
-               return;
+       if (!test_bit(STRIPE_R5C_CACHING, &sh->state)) {
+               /* writing out phase */
+               if (s->waiting_extra_page)
+                       return;
+               if (r5l_write_stripe(conf->log, sh) == 0)
+                       return;
+       } else {  /* caching phase */
+               if (test_bit(STRIPE_LOG_TRAPPED, &sh->state)) {
+                       r5c_cache_data(conf->log, sh, s);
+                       return;
+               }
+       }
+
        for (i = disks; i--; ) {
                int op, op_flags = 0;
                int replace_only = 0;
@@ -977,7 +961,7 @@ again:
                        if (bad < 0) {
                                set_bit(BlockedBadBlocks, &rdev->flags);
                                if (!conf->mddev->external &&
-                                   conf->mddev->flags) {
+                                   conf->mddev->sb_flags) {
                                        /* It is very unlikely, but we might
                                         * still need to write out the
                                         * bad block log - better give it
@@ -1115,7 +1099,7 @@ again:
 static struct dma_async_tx_descriptor *
 async_copy_data(int frombio, struct bio *bio, struct page **page,
        sector_t sector, struct dma_async_tx_descriptor *tx,
-       struct stripe_head *sh)
+       struct stripe_head *sh, int no_skipcopy)
 {
        struct bio_vec bvl;
        struct bvec_iter iter;
@@ -1155,7 +1139,8 @@ async_copy_data(int frombio, struct bio *bio, struct page **page,
                        if (frombio) {
                                if (sh->raid_conf->skip_copy &&
                                    b_offset == 0 && page_offset == 0 &&
-                                   clen == STRIPE_SIZE)
+                                   clen == STRIPE_SIZE &&
+                                   !no_skipcopy)
                                        *page = bio_page;
                                else
                                        tx = async_memcpy(*page, bio_page, page_offset,
@@ -1237,7 +1222,7 @@ static void ops_run_biofill(struct stripe_head *sh)
                        while (rbi && rbi->bi_iter.bi_sector <
                                dev->sector + STRIPE_SECTORS) {
                                tx = async_copy_data(0, rbi, &dev->page,
-                                       dev->sector, tx, sh);
+                                                    dev->sector, tx, sh, 0);
                                rbi = r5_next_bio(rbi, dev->sector);
                        }
                }
@@ -1364,10 +1349,15 @@ static int set_syndrome_sources(struct page **srcs,
                if (i == sh->qd_idx || i == sh->pd_idx ||
                    (srctype == SYNDROME_SRC_ALL) ||
                    (srctype == SYNDROME_SRC_WANT_DRAIN &&
-                    test_bit(R5_Wantdrain, &dev->flags)) ||
+                    (test_bit(R5_Wantdrain, &dev->flags) ||
+                     test_bit(R5_InJournal, &dev->flags))) ||
                    (srctype == SYNDROME_SRC_WRITTEN &&
-                    dev->written))
-                       srcs[slot] = sh->dev[i].page;
+                    dev->written)) {
+                       if (test_bit(R5_InJournal, &dev->flags))
+                               srcs[slot] = sh->dev[i].orig_page;
+                       else
+                               srcs[slot] = sh->dev[i].page;
+               }
                i = raid6_next_disk(i, disks);
        } while (i != d0_idx);
 
@@ -1546,6 +1536,13 @@ static void ops_complete_prexor(void *stripe_head_ref)
 
        pr_debug("%s: stripe %llu\n", __func__,
                (unsigned long long)sh->sector);
+
+       if (r5c_is_writeback(sh->raid_conf->log))
+               /*
+                * raid5-cache write back uses orig_page during prexor.
+                * After prexor, it is time to free orig_page
+                */
+               r5c_release_extra_page(sh);
 }
 
 static struct dma_async_tx_descriptor *
@@ -1567,7 +1564,9 @@ ops_run_prexor5(struct stripe_head *sh, struct raid5_percpu *percpu,
        for (i = disks; i--; ) {
                struct r5dev *dev = &sh->dev[i];
                /* Only process blocks that are known to be uptodate */
-               if (test_bit(R5_Wantdrain, &dev->flags))
+               if (test_bit(R5_InJournal, &dev->flags))
+                       xor_srcs[count++] = dev->orig_page;
+               else if (test_bit(R5_Wantdrain, &dev->flags))
                        xor_srcs[count++] = dev->page;
        }
 
@@ -1601,6 +1600,7 @@ ops_run_prexor6(struct stripe_head *sh, struct raid5_percpu *percpu,
 static struct dma_async_tx_descriptor *
 ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
 {
+       struct r5conf *conf = sh->raid_conf;
        int disks = sh->disks;
        int i;
        struct stripe_head *head_sh = sh;
@@ -1618,6 +1618,11 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
 
 again:
                        dev = &sh->dev[i];
+                       /*
+                        * clear R5_InJournal, so when rewriting a page in
+                        * journal, it is not skipped by r5l_log_stripe()
+                        */
+                       clear_bit(R5_InJournal, &dev->flags);
                        spin_lock_irq(&sh->stripe_lock);
                        chosen = dev->towrite;
                        dev->towrite = NULL;
@@ -1637,8 +1642,10 @@ again:
                                        set_bit(R5_Discard, &dev->flags);
                                else {
                                        tx = async_copy_data(1, wbi, &dev->page,
-                                               dev->sector, tx, sh);
-                                       if (dev->page != dev->orig_page) {
+                                                            dev->sector, tx, sh,
+                                                            r5c_is_writeback(conf->log));
+                                       if (dev->page != dev->orig_page &&
+                                           !r5c_is_writeback(conf->log)) {
                                                set_bit(R5_SkipCopy, &dev->flags);
                                                clear_bit(R5_UPTODATE, &dev->flags);
                                                clear_bit(R5_OVERWRITE, &dev->flags);
@@ -1746,7 +1753,8 @@ again:
                xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
                for (i = disks; i--; ) {
                        struct r5dev *dev = &sh->dev[i];
-                       if (head_sh->dev[i].written)
+                       if (head_sh->dev[i].written ||
+                           test_bit(R5_InJournal, &head_sh->dev[i].flags))
                                xor_srcs[count++] = dev->page;
                }
        } else {
@@ -2000,7 +2008,10 @@ static struct stripe_head *alloc_stripe(struct kmem_cache *sc, gfp_t gfp,
                spin_lock_init(&sh->batch_lock);
                INIT_LIST_HEAD(&sh->batch_list);
                INIT_LIST_HEAD(&sh->lru);
+               INIT_LIST_HEAD(&sh->r5c);
+               INIT_LIST_HEAD(&sh->log_list);
                atomic_set(&sh->count, 1);
+               sh->log_start = MaxSector;
                for (i = 0; i < disks; i++) {
                        struct r5dev *dev = &sh->dev[i];
 
@@ -2240,10 +2251,24 @@ static int resize_stripes(struct r5conf *conf, int newsize)
         */
        ndisks = kzalloc(newsize * sizeof(struct disk_info), GFP_NOIO);
        if (ndisks) {
-               for (i=0; i<conf->raid_disks; i++)
+               for (i = 0; i < conf->pool_size; i++)
                        ndisks[i] = conf->disks[i];
-               kfree(conf->disks);
-               conf->disks = ndisks;
+
+               for (i = conf->pool_size; i < newsize; i++) {
+                       ndisks[i].extra_page = alloc_page(GFP_NOIO);
+                       if (!ndisks[i].extra_page)
+                               err = -ENOMEM;
+               }
+
+               if (err) {
+                       for (i = conf->pool_size; i < newsize; i++)
+                               if (ndisks[i].extra_page)
+                                       put_page(ndisks[i].extra_page);
+                       kfree(ndisks);
+               } else {
+                       kfree(conf->disks);
+                       conf->disks = ndisks;
+               }
        } else
                err = -ENOMEM;
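
resize_stripes() now also gives every newly added disk_info an extra_page, and it only commits the new array if every allocation succeeded; on any failure it rolls back the pages it did get and keeps the old conf->disks. The same all-or-nothing pattern, stripped of the md specifics, looks roughly like this (a sketch, with malloc(4096) standing in for alloc_page(GFP_NOIO)):

    #include <stdlib.h>

    /* Grow an array of page pointers from old_n to new_n entries; either every
     * new slot gets a page or the whole operation is undone (sketch only). */
    static void **grow_with_pages(void **old, int old_n, int new_n)
    {
            void **nd = calloc(new_n, sizeof(*nd));
            int i, err = 0;

            if (!nd)
                    return NULL;
            for (i = 0; i < old_n; i++)     /* carry over the existing entries */
                    nd[i] = old[i];
            for (i = old_n; i < new_n; i++) /* allocate the new per-slot pages */
                    if (!(nd[i] = malloc(4096)))
                            err = 1;
            if (err) {                      /* roll back: free only what we added */
                    for (i = old_n; i < new_n; i++)
                            free(nd[i]);
                    free(nd);
                    return NULL;            /* caller keeps using the old array */
            }
            free(old);
            return nd;
    }
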
 
@@ -2342,10 +2367,8 @@ static void raid5_end_read_request(struct bio * bi)
                         * replacement device.  We just fail those on
                         * any error
                         */
-                       printk_ratelimited(
-                               KERN_INFO
-                               "md/raid:%s: read error corrected"
-                               " (%lu sectors at %llu on %s)\n",
+                       pr_info_ratelimited(
+                               "md/raid:%s: read error corrected (%lu sectors at %llu on %s)\n",
                                mdname(conf->mddev), STRIPE_SECTORS,
                                (unsigned long long)s,
                                bdevname(rdev->bdev, b));
@@ -2365,36 +2388,29 @@ static void raid5_end_read_request(struct bio * bi)
                clear_bit(R5_UPTODATE, &sh->dev[i].flags);
                atomic_inc(&rdev->read_errors);
                if (test_bit(R5_ReadRepl, &sh->dev[i].flags))
-                       printk_ratelimited(
-                               KERN_WARNING
-                               "md/raid:%s: read error on replacement device "
-                               "(sector %llu on %s).\n",
+                       pr_warn_ratelimited(
+                               "md/raid:%s: read error on replacement device (sector %llu on %s).\n",
                                mdname(conf->mddev),
                                (unsigned long long)s,
                                bdn);
                else if (conf->mddev->degraded >= conf->max_degraded) {
                        set_bad = 1;
-                       printk_ratelimited(
-                               KERN_WARNING
-                               "md/raid:%s: read error not correctable "
-                               "(sector %llu on %s).\n",
+                       pr_warn_ratelimited(
+                               "md/raid:%s: read error not correctable (sector %llu on %s).\n",
                                mdname(conf->mddev),
                                (unsigned long long)s,
                                bdn);
                } else if (test_bit(R5_ReWrite, &sh->dev[i].flags)) {
                        /* Oh, no!!! */
                        set_bad = 1;
-                       printk_ratelimited(
-                               KERN_WARNING
-                               "md/raid:%s: read error NOT corrected!! "
-                               "(sector %llu on %s).\n",
+                       pr_warn_ratelimited(
+                               "md/raid:%s: read error NOT corrected!! (sector %llu on %s).\n",
                                mdname(conf->mddev),
                                (unsigned long long)s,
                                bdn);
                } else if (atomic_read(&rdev->read_errors)
                         > conf->max_nr_stripes)
-                       printk(KERN_WARNING
-                              "md/raid:%s: Too many read errors, failing device %s.\n",
+                       pr_warn("md/raid:%s: Too many read errors, failing device %s.\n",
                               mdname(conf->mddev), bdn);
                else
                        retry = 1;
@@ -2526,15 +2542,14 @@ static void raid5_error(struct mddev *mddev, struct md_rdev *rdev)
 
        set_bit(Blocked, &rdev->flags);
        set_bit(Faulty, &rdev->flags);
-       set_mask_bits(&mddev->flags, 0,
-                     BIT(MD_CHANGE_DEVS) | BIT(MD_CHANGE_PENDING));
-       printk(KERN_ALERT
-              "md/raid:%s: Disk failure on %s, disabling device.\n"
-              "md/raid:%s: Operation continuing on %d devices.\n",
-              mdname(mddev),
-              bdevname(rdev->bdev, b),
-              mdname(mddev),
-              conf->raid_disks - mddev->degraded);
+       set_mask_bits(&mddev->sb_flags, 0,
+                     BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_PENDING));
+       pr_crit("md/raid:%s: Disk failure on %s, disabling device.\n"
+               "md/raid:%s: Operation continuing on %d devices.\n",
+               mdname(mddev),
+               bdevname(rdev->bdev, b),
+               mdname(mddev),
+               conf->raid_disks - mddev->degraded);
 }
 
 /*
@@ -2856,8 +2871,8 @@ sector_t raid5_compute_blocknr(struct stripe_head *sh, int i, int previous)
                                     previous, &dummy1, &sh2);
        if (check != sh->sector || dummy1 != dd_idx || sh2.pd_idx != sh->pd_idx
                || sh2.qd_idx != sh->qd_idx) {
-               printk(KERN_ERR "md/raid:%s: compute_blocknr: map not correct\n",
-                      mdname(conf->mddev));
+               pr_warn("md/raid:%s: compute_blocknr: map not correct\n",
+                       mdname(conf->mddev));
                return 0;
        }
        return r_sector;
@@ -2872,6 +2887,13 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
        int level = conf->level;
 
        if (rcw) {
+               /*
+                * In some cases, handle_stripe_dirtying initially decides to
+                * run rmw and allocates an extra page for prexor. However, rcw
+                * later turns out to be cheaper. We need to free the extra page
+                * now, because we won't be able to do that in ops_complete_prexor().
+                */
+               r5c_release_extra_page(sh);
 
                for (i = disks; i--; ) {
                        struct r5dev *dev = &sh->dev[i];
@@ -2882,6 +2904,9 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
                                if (!expand)
                                        clear_bit(R5_UPTODATE, &dev->flags);
                                s->locked++;
+                       } else if (test_bit(R5_InJournal, &dev->flags)) {
+                               set_bit(R5_LOCKED, &dev->flags);
+                               s->locked++;
                        }
                }
                /* if we are not expanding this is a proper write request, and
@@ -2921,6 +2946,9 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
                                set_bit(R5_LOCKED, &dev->flags);
                                clear_bit(R5_UPTODATE, &dev->flags);
                                s->locked++;
+                       } else if (test_bit(R5_InJournal, &dev->flags)) {
+                               set_bit(R5_LOCKED, &dev->flags);
+                               s->locked++;
                        }
                }
                if (!s->locked)
@@ -3564,10 +3592,10 @@ unhash:
                break_stripe_batch_list(head_sh, STRIPE_EXPAND_SYNC_FLAGS);
 }
 
-static void handle_stripe_dirtying(struct r5conf *conf,
-                                  struct stripe_head *sh,
-                                  struct stripe_head_state *s,
-                                  int disks)
+static int handle_stripe_dirtying(struct r5conf *conf,
+                                 struct stripe_head *sh,
+                                 struct stripe_head_state *s,
+                                 int disks)
 {
        int rmw = 0, rcw = 0, i;
        sector_t recovery_cp = conf->mddev->recovery_cp;
@@ -3592,9 +3620,12 @@ static void handle_stripe_dirtying(struct r5conf *conf,
        } else for (i = disks; i--; ) {
                /* would I have to read this buffer for read_modify_write */
                struct r5dev *dev = &sh->dev[i];
-               if ((dev->towrite || i == sh->pd_idx || i == sh->qd_idx) &&
+               if ((dev->towrite || i == sh->pd_idx || i == sh->qd_idx ||
+                    test_bit(R5_InJournal, &dev->flags)) &&
                    !test_bit(R5_LOCKED, &dev->flags) &&
-                   !(test_bit(R5_UPTODATE, &dev->flags) ||
+                   !((test_bit(R5_UPTODATE, &dev->flags) &&
+                      (!test_bit(R5_InJournal, &dev->flags) ||
+                       dev->page != dev->orig_page)) ||
                      test_bit(R5_Wantcompute, &dev->flags))) {
                        if (test_bit(R5_Insync, &dev->flags))
                                rmw++;
@@ -3606,13 +3637,15 @@ static void handle_stripe_dirtying(struct r5conf *conf,
                    i != sh->pd_idx && i != sh->qd_idx &&
                    !test_bit(R5_LOCKED, &dev->flags) &&
                    !(test_bit(R5_UPTODATE, &dev->flags) ||
-                   test_bit(R5_Wantcompute, &dev->flags))) {
+                     test_bit(R5_InJournal, &dev->flags) ||
+                     test_bit(R5_Wantcompute, &dev->flags))) {
                        if (test_bit(R5_Insync, &dev->flags))
                                rcw++;
                        else
                                rcw += 2*disks;
                }
        }
+
        pr_debug("for sector %llu, rmw=%d rcw=%d\n",
                (unsigned long long)sh->sector, rmw, rcw);
        set_bit(STRIPE_HANDLE, &sh->state);
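
The counting above chooses between read-modify-write and reconstruct-write, and the journal flags now feed into it: a block that exists only in the journal (R5_InJournal) still needs a read (or a prexor copy) before rmw, while for rcw a journalled block counts as unavailable like any other missing block. Reduced to its inputs, the cost comparison is roughly the following (a simplified model; it folds R5_LOCKED, R5_Wantcompute and the journal special cases into a single 'available' flag):

    /*
     * rmw must read every block it will overwrite plus parity; rcw must read
     * every block it will NOT overwrite.  A block on a failed (not in-sync)
     * disk is priced at 2*disks to push the decision the other way.
     */
    struct blk { int to_write; int is_parity; int available; int insync; };

    static int prefer_rmw(const struct blk *d, int disks)
    {
            int rmw = 0, rcw = 0, i;

            for (i = 0; i < disks; i++) {
                    if ((d[i].to_write || d[i].is_parity) && !d[i].available)
                            rmw += d[i].insync ? 1 : 2 * disks;
                    if (!d[i].to_write && !d[i].is_parity && !d[i].available)
                            rcw += d[i].insync ? 1 : 2 * disks;
            }
            return rmw <= rcw;  /* fewer reads wins (the driver also weighs rmw_level on ties) */
    }
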
@@ -3624,10 +3657,44 @@ static void handle_stripe_dirtying(struct r5conf *conf,
                                          (unsigned long long)sh->sector, rmw);
                for (i = disks; i--; ) {
                        struct r5dev *dev = &sh->dev[i];
-                       if ((dev->towrite || i == sh->pd_idx || i == sh->qd_idx) &&
+                       if (test_bit(R5_InJournal, &dev->flags) &&
+                           dev->page == dev->orig_page &&
+                           !test_bit(R5_LOCKED, &sh->dev[sh->pd_idx].flags)) {
+                               /* alloc page for prexor */
+                               struct page *p = alloc_page(GFP_NOIO);
+
+                               if (p) {
+                                       dev->orig_page = p;
+                                       continue;
+                               }
+
+                               /*
+                                * alloc_page() failed, try to use
+                                * disk_info->extra_page
+                                */
+                               if (!test_and_set_bit(R5C_EXTRA_PAGE_IN_USE,
+                                                     &conf->cache_state)) {
+                                       r5c_use_extra_page(sh);
+                                       break;
+                               }
+
+                               /* extra_page in use, add to delayed_list */
+                               set_bit(STRIPE_DELAYED, &sh->state);
+                               s->waiting_extra_page = 1;
+                               return -EAGAIN;
+                       }
+               }
+
+               for (i = disks; i--; ) {
+                       struct r5dev *dev = &sh->dev[i];
+                       if ((dev->towrite ||
+                            i == sh->pd_idx || i == sh->qd_idx ||
+                            test_bit(R5_InJournal, &dev->flags)) &&
                            !test_bit(R5_LOCKED, &dev->flags) &&
-                           !(test_bit(R5_UPTODATE, &dev->flags) ||
-                           test_bit(R5_Wantcompute, &dev->flags)) &&
+                           !((test_bit(R5_UPTODATE, &dev->flags) &&
+                              (!test_bit(R5_InJournal, &dev->flags) ||
+                               dev->page != dev->orig_page)) ||
+                             test_bit(R5_Wantcompute, &dev->flags)) &&
                            test_bit(R5_Insync, &dev->flags)) {
                                if (test_bit(STRIPE_PREREAD_ACTIVE,
                                             &sh->state)) {
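
For an rmw on journalled data the driver needs a second copy of each cached page to feed the prexor: it first tries alloc_page(GFP_NOIO), falls back to the per-array extra_page reserve (held by at most one stripe, guarded by R5C_EXTRA_PAGE_IN_USE), and if even that is taken it marks the stripe delayed and returns -EAGAIN so handle_stripe() retries later. The same try / fall back to a single shared reserve / defer pattern in isolation (a sketch with invented names):

    #include <stdatomic.h>
    #include <stdlib.h>

    /* One shared emergency buffer, allocated once at setup time (as
     * setup_conf() does for extra_page) and usable by one caller at a time. */
    static void *reserve_page;
    static atomic_flag reserve_in_use = ATOMIC_FLAG_INIT;

    /* Returns a fresh buffer, the shared reserve, or NULL meaning "retry later". */
    static void *get_prexor_page(void)
    {
            void *p = malloc(4096);         /* stand-in for alloc_page(GFP_NOIO) */

            if (p)
                    return p;
            if (!atomic_flag_test_and_set(&reserve_in_use))
                    return reserve_page;    /* we now own the shared reserve */
            return NULL;                    /* reserve busy: caller defers (-EAGAIN) */
    }
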
@@ -3653,6 +3720,7 @@ static void handle_stripe_dirtying(struct r5conf *conf,
                            i != sh->pd_idx && i != sh->qd_idx &&
                            !test_bit(R5_LOCKED, &dev->flags) &&
                            !(test_bit(R5_UPTODATE, &dev->flags) ||
+                             test_bit(R5_InJournal, &dev->flags) ||
                              test_bit(R5_Wantcompute, &dev->flags))) {
                                rcw++;
                                if (test_bit(R5_Insync, &dev->flags) &&
@@ -3692,8 +3760,9 @@ static void handle_stripe_dirtying(struct r5conf *conf,
         */
        if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) &&
            (s->locked == 0 && (rcw == 0 || rmw == 0) &&
-           !test_bit(STRIPE_BIT_DELAY, &sh->state)))
+            !test_bit(STRIPE_BIT_DELAY, &sh->state)))
                schedule_reconstruction(sh, s, rcw == 0, 0);
+       return 0;
 }
 
 static void handle_parity_checks5(struct r5conf *conf, struct stripe_head *sh,
@@ -3777,7 +3846,7 @@ static void handle_parity_checks5(struct r5conf *conf, struct stripe_head *sh,
        case check_state_compute_run:
                break;
        default:
-               printk(KERN_ERR "%s: unknown check_state: %d sector: %llu\n",
+               pr_err("%s: unknown check_state: %d sector: %llu\n",
                       __func__, sh->check_state,
                       (unsigned long long) sh->sector);
                BUG();
@@ -3941,9 +4010,9 @@ static void handle_parity_checks6(struct r5conf *conf, struct stripe_head *sh,
        case check_state_compute_run:
                break;
        default:
-               printk(KERN_ERR "%s: unknown check_state: %d sector: %llu\n",
-                      __func__, sh->check_state,
-                      (unsigned long long) sh->sector);
+               pr_warn("%s: unknown check_state: %d sector: %llu\n",
+                       __func__, sh->check_state,
+                       (unsigned long long) sh->sector);
                BUG();
        }
 }
@@ -4183,6 +4252,11 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
                        if (rdev && !test_bit(Faulty, &rdev->flags))
                                do_recovery = 1;
                }
+
+               if (test_bit(R5_InJournal, &dev->flags))
+                       s->injournal++;
+               if (test_bit(R5_InJournal, &dev->flags) && dev->written)
+                       s->just_cached++;
        }
        if (test_bit(STRIPE_SYNCING, &sh->state)) {
                /* If there is a failed device being replaced,
@@ -4411,7 +4485,8 @@ static void handle_stripe(struct stripe_head *sh)
                        struct r5dev *dev = &sh->dev[i];
                        if (test_bit(R5_LOCKED, &dev->flags) &&
                                (i == sh->pd_idx || i == sh->qd_idx ||
-                                dev->written)) {
+                                dev->written || test_bit(R5_InJournal,
+                                                         &dev->flags))) {
                                pr_debug("Writing block %d\n", i);
                                set_bit(R5_Wantwrite, &dev->flags);
                                if (prexor)
@@ -4451,6 +4526,10 @@ static void handle_stripe(struct stripe_head *sh)
                                 test_bit(R5_Discard, &qdev->flags))))))
                handle_stripe_clean_event(conf, sh, disks, &s.return_bi);
 
+       if (s.just_cached)
+               r5c_handle_cached_data_endio(conf, sh, disks, &s.return_bi);
+       r5l_stripe_write_finished(sh);
+
        /* Now we might consider reading some blocks, either to check/generate
         * parity, or to satisfy requests
         * or to load a block that is being partially written.
@@ -4462,14 +4541,51 @@ static void handle_stripe(struct stripe_head *sh)
            || s.expanding)
                handle_stripe_fill(sh, &s, disks);
 
-       /* Now to consider new write requests and what else, if anything
-        * should be read.  We do not handle new writes when:
+       /*
+        * When the stripe finishes a full journal write cycle (write to the
+        * journal and then to the raid disks), this is the clean-up procedure
+        * so it is ready for the next operation.
+        */
+       r5c_finish_stripe_write_out(conf, sh, &s);
+
+       /*
+        * Now to consider new write requests, cache write back and what else,
+        * if anything should be read.  We do not handle new writes when:
         * 1/ A 'write' operation (copy+xor) is already in flight.
         * 2/ A 'check' operation is in flight, as it may clobber the parity
         *    block.
+        * 3/ A r5c cache log write is in flight.
         */
-       if (s.to_write && !sh->reconstruct_state && !sh->check_state)
-               handle_stripe_dirtying(conf, sh, &s, disks);
+
+       if (!sh->reconstruct_state && !sh->check_state && !sh->log_io) {
+               if (!r5c_is_writeback(conf->log)) {
+                       if (s.to_write)
+                               handle_stripe_dirtying(conf, sh, &s, disks);
+               } else { /* write back cache */
+                       int ret = 0;
+
+                       /* First, try to handle writes in the caching phase */
+                       if (s.to_write)
+                               ret = r5c_try_caching_write(conf, sh, &s,
+                                                           disks);
+                       /*
+                        * If caching phase failed: ret == -EAGAIN
+                        *    OR
+                        * stripe under reclaim: !caching && injournal
+                        *
+                        * fall back to handle_stripe_dirtying()
+                        */
+                       if (ret == -EAGAIN ||
+                           /* stripe under reclaim: !caching && injournal */
+                           (!test_bit(STRIPE_R5C_CACHING, &sh->state) &&
+                            s.injournal > 0)) {
+                               ret = handle_stripe_dirtying(conf, sh, &s,
+                                                            disks);
+                               if (ret == -EAGAIN)
+                                       goto finish;
+                       }
+               }
+       }
 
        /* maybe we need to check and possibly fix the parity for this stripe
         * Any reads will already have been scheduled, so we just see if enough
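
handle_stripe() now routes writes depending on the journal mode: with no journal or in write-through it calls handle_stripe_dirtying() as before; in write-back it first tries r5c_try_caching_write(), and only falls back to the normal dirtying path when caching returns -EAGAIN or when the stripe is already under reclaim (not in the caching state but still holding journalled data). The decision reduced to its inputs (an illustrative model, not the kernel's control flow):

    /*
     * Returns 1 when the stripe must go through handle_stripe_dirtying(),
     * i.e. be written out to the member disks.
     */
    static int needs_dirtying(int writeback, int to_write, int caching_failed,
                              int stripe_caching, int injournal)
    {
            if (!writeback)
                    return to_write;        /* write-through: always */
            if (to_write && caching_failed)
                    return 1;               /* r5c_try_caching_write() said -EAGAIN */
            if (!stripe_caching && injournal > 0)
                    return 1;               /* stripe under reclaim */
            return 0;                       /* handled by the write-back cache */
    }
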
@@ -4640,9 +4756,7 @@ finish:
        }
 
        if (!bio_list_empty(&s.return_bi)) {
-               if (test_bit(MD_CHANGE_PENDING, &conf->mddev->flags) &&
-                               (s.failed <= conf->max_degraded ||
-                                       conf->mddev->external == 0)) {
+               if (test_bit(MD_SB_CHANGE_PENDING, &conf->mddev->sb_flags)) {
                        spin_lock_irq(&conf->device_lock);
                        bio_list_merge(&conf->return_bi, &s.return_bi);
                        spin_unlock_irq(&conf->device_lock);
@@ -4698,6 +4812,10 @@ static int raid5_congested(struct mddev *mddev, int bits)
 
        if (test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state))
                return 1;
+
+       /* Also checks whether there is pressure on r5cache log space */
+       if (test_bit(R5C_LOG_TIGHT, &conf->cache_state))
+               return 1;
        if (conf->quiesce)
                return 1;
        if (atomic_read(&conf->empty_inactive_list_nr))
@@ -5167,6 +5285,7 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi)
        int remaining;
        DEFINE_WAIT(w);
        bool do_prepare;
+       bool do_flush = false;
 
        if (unlikely(bi->bi_opf & REQ_PREFLUSH)) {
                int ret = r5l_handle_flush_request(conf->log, bi);
@@ -5178,6 +5297,11 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi)
                        return;
                }
                /* ret == -EAGAIN, fallback */
+               /*
+                * if r5l_handle_flush_request() didn't clear REQ_PREFLUSH,
+                * we need to flush the journal device
+                */
+               do_flush = bi->bi_opf & REQ_PREFLUSH;
        }
 
        md_write_start(mddev, bi);
@@ -5188,6 +5312,7 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi)
         * data on failed drives.
         */
        if (rw == READ && mddev->degraded == 0 &&
+           !r5c_is_writeback(conf->log) &&
            mddev->reshape_position == MaxSector) {
                bi = chunk_aligned_read(mddev, bi);
                if (!bi)
@@ -5316,6 +5441,12 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi)
                                do_prepare = true;
                                goto retry;
                        }
+                       if (do_flush) {
+                               set_bit(STRIPE_R5C_PREFLUSH, &sh->state);
+                               /* we only need flush for one stripe */
+                               do_flush = false;
+                       }
+
                        set_bit(STRIPE_HANDLE, &sh->state);
                        clear_bit(STRIPE_DELAYED, &sh->state);
                        if ((!sh->batch_head || sh == sh->batch_head) &&
@@ -5481,9 +5612,9 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
                mddev->reshape_position = conf->reshape_progress;
                mddev->curr_resync_completed = sector_nr;
                conf->reshape_checkpoint = jiffies;
-               set_bit(MD_CHANGE_DEVS, &mddev->flags);
+               set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
                md_wakeup_thread(mddev->thread);
-               wait_event(mddev->sb_wait, mddev->flags == 0 ||
+               wait_event(mddev->sb_wait, mddev->sb_flags == 0 ||
                           test_bit(MD_RECOVERY_INTR, &mddev->recovery));
                if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
                        return 0;
@@ -5579,10 +5710,10 @@ finish:
                mddev->reshape_position = conf->reshape_progress;
                mddev->curr_resync_completed = sector_nr;
                conf->reshape_checkpoint = jiffies;
-               set_bit(MD_CHANGE_DEVS, &mddev->flags);
+               set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
                md_wakeup_thread(mddev->thread);
                wait_event(mddev->sb_wait,
-                          !test_bit(MD_CHANGE_DEVS, &mddev->flags)
+                          !test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags)
                           || test_bit(MD_RECOVERY_INTR, &mddev->recovery));
                if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
                        goto ret;
@@ -5857,10 +5988,10 @@ static void raid5d(struct md_thread *thread)
        md_check_recovery(mddev);
 
        if (!bio_list_empty(&conf->return_bi) &&
-           !test_bit(MD_CHANGE_PENDING, &mddev->flags)) {
+           !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) {
                struct bio_list tmp = BIO_EMPTY_LIST;
                spin_lock_irq(&conf->device_lock);
-               if (!test_bit(MD_CHANGE_PENDING, &mddev->flags)) {
+               if (!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) {
                        bio_list_merge(&tmp, &conf->return_bi);
                        bio_list_init(&conf->return_bi);
                }
@@ -5907,7 +6038,7 @@ static void raid5d(struct md_thread *thread)
                        break;
                handled += batch_size;
 
-               if (mddev->flags & ~(1<<MD_CHANGE_PENDING)) {
+               if (mddev->sb_flags & ~(1 << MD_SB_CHANGE_PENDING)) {
                        spin_unlock_irq(&conf->device_lock);
                        md_check_recovery(mddev);
                        spin_lock_irq(&conf->device_lock);
@@ -6237,6 +6368,7 @@ static struct attribute *raid5_attrs[] =  {
        &raid5_group_thread_cnt.attr,
        &raid5_skip_copy.attr,
        &raid5_rmw_level.attr,
+       &r5c_journal_mode.attr,
        NULL,
 };
 static struct attribute_group raid5_attrs_group = {
@@ -6363,6 +6495,8 @@ static void raid5_free_percpu(struct r5conf *conf)
 
 static void free_conf(struct r5conf *conf)
 {
+       int i;
+
        if (conf->log)
                r5l_exit_log(conf->log);
        if (conf->shrinker.nr_deferred)
@@ -6371,6 +6505,9 @@ static void free_conf(struct r5conf *conf)
        free_thread_groups(conf);
        shrink_stripes(conf);
        raid5_free_percpu(conf);
+       for (i = 0; i < conf->pool_size; i++)
+               if (conf->disks[i].extra_page)
+                       put_page(conf->disks[i].extra_page);
        kfree(conf->disks);
        kfree(conf->stripe_hashtbl);
        kfree(conf);
@@ -6382,8 +6519,8 @@ static int raid456_cpu_up_prepare(unsigned int cpu, struct hlist_node *node)
        struct raid5_percpu *percpu = per_cpu_ptr(conf->percpu, cpu);
 
        if (alloc_scratch_buffer(conf, percpu)) {
-               pr_err("%s: failed memory allocation for cpu%u\n",
-                      __func__, cpu);
+               pr_warn("%s: failed memory allocation for cpu%u\n",
+                       __func__, cpu);
                return -ENOMEM;
        }
        return 0;
@@ -6453,29 +6590,29 @@ static struct r5conf *setup_conf(struct mddev *mddev)
        if (mddev->new_level != 5
            && mddev->new_level != 4
            && mddev->new_level != 6) {
-               printk(KERN_ERR "md/raid:%s: raid level not set to 4/5/6 (%d)\n",
-                      mdname(mddev), mddev->new_level);
+               pr_warn("md/raid:%s: raid level not set to 4/5/6 (%d)\n",
+                       mdname(mddev), mddev->new_level);
                return ERR_PTR(-EIO);
        }
        if ((mddev->new_level == 5
             && !algorithm_valid_raid5(mddev->new_layout)) ||
            (mddev->new_level == 6
             && !algorithm_valid_raid6(mddev->new_layout))) {
-               printk(KERN_ERR "md/raid:%s: layout %d not supported\n",
-                      mdname(mddev), mddev->new_layout);
+               pr_warn("md/raid:%s: layout %d not supported\n",
+                       mdname(mddev), mddev->new_layout);
                return ERR_PTR(-EIO);
        }
        if (mddev->new_level == 6 && mddev->raid_disks < 4) {
-               printk(KERN_ERR "md/raid:%s: not enough configured devices (%d, minimum 4)\n",
-                      mdname(mddev), mddev->raid_disks);
+               pr_warn("md/raid:%s: not enough configured devices (%d, minimum 4)\n",
+                       mdname(mddev), mddev->raid_disks);
                return ERR_PTR(-EINVAL);
        }
 
        if (!mddev->new_chunk_sectors ||
            (mddev->new_chunk_sectors << 9) % PAGE_SIZE ||
            !is_power_of_2(mddev->new_chunk_sectors)) {
-               printk(KERN_ERR "md/raid:%s: invalid chunk size %d\n",
-                      mdname(mddev), mddev->new_chunk_sectors << 9);
+               pr_warn("md/raid:%s: invalid chunk size %d\n",
+                       mdname(mddev), mddev->new_chunk_sectors << 9);
                return ERR_PTR(-EINVAL);
        }
 
@@ -6517,9 +6654,16 @@ static struct r5conf *setup_conf(struct mddev *mddev)
 
        conf->disks = kzalloc(max_disks * sizeof(struct disk_info),
                              GFP_KERNEL);
+
        if (!conf->disks)
                goto abort;
 
+       for (i = 0; i < max_disks; i++) {
+               conf->disks[i].extra_page = alloc_page(GFP_KERNEL);
+               if (!conf->disks[i].extra_page)
+                       goto abort;
+       }
+
        conf->mddev = mddev;
 
        if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL)
@@ -6540,6 +6684,11 @@ static struct r5conf *setup_conf(struct mddev *mddev)
        for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++)
                INIT_LIST_HEAD(conf->temp_inactive_list + i);
 
+       atomic_set(&conf->r5c_cached_full_stripes, 0);
+       INIT_LIST_HEAD(&conf->r5c_full_stripe_list);
+       atomic_set(&conf->r5c_cached_partial_stripes, 0);
+       INIT_LIST_HEAD(&conf->r5c_partial_stripe_list);
+
        conf->level = mddev->new_level;
        conf->chunk_sectors = mddev->new_chunk_sectors;
        if (raid5_alloc_percpu(conf) != 0)
@@ -6566,9 +6715,8 @@ static struct r5conf *setup_conf(struct mddev *mddev)
 
                if (test_bit(In_sync, &rdev->flags)) {
                        char b[BDEVNAME_SIZE];
-                       printk(KERN_INFO "md/raid:%s: device %s operational as raid"
-                              " disk %d\n",
-                              mdname(mddev), bdevname(rdev->bdev, b), raid_disk);
+                       pr_info("md/raid:%s: device %s operational as raid disk %d\n",
+                               mdname(mddev), bdevname(rdev->bdev, b), raid_disk);
                } else if (rdev->saved_raid_disk != raid_disk)
                        /* Cannot rely on bitmap to complete recovery */
                        conf->fullsync = 1;
@@ -6602,21 +6750,18 @@ static struct r5conf *setup_conf(struct mddev *mddev)
                        ((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4);
                conf->min_nr_stripes = max(NR_STRIPES, stripes);
                if (conf->min_nr_stripes != NR_STRIPES)
-                       printk(KERN_INFO
-                               "md/raid:%s: force stripe size %d for reshape\n",
+                       pr_info("md/raid:%s: force stripe size %d for reshape\n",
                                mdname(mddev), conf->min_nr_stripes);
        }
        memory = conf->min_nr_stripes * (sizeof(struct stripe_head) +
                 max_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024;
        atomic_set(&conf->empty_inactive_list_nr, NR_STRIPE_HASH_LOCKS);
        if (grow_stripes(conf, conf->min_nr_stripes)) {
-               printk(KERN_ERR
-                      "md/raid:%s: couldn't allocate %dkB for buffers\n",
-                      mdname(mddev), memory);
+               pr_warn("md/raid:%s: couldn't allocate %dkB for buffers\n",
+                       mdname(mddev), memory);
                goto abort;
        } else
-               printk(KERN_INFO "md/raid:%s: allocated %dkB\n",
-                      mdname(mddev), memory);
+               pr_debug("md/raid:%s: allocated %dkB\n", mdname(mddev), memory);
        /*
         * Losing a stripe head costs more than the time to refill it,
         * it reduces the queue depth and so can hurt throughput.
@@ -6628,18 +6773,16 @@ static struct r5conf *setup_conf(struct mddev *mddev)
        conf->shrinker.batch = 128;
        conf->shrinker.flags = 0;
        if (register_shrinker(&conf->shrinker)) {
-               printk(KERN_ERR
-                      "md/raid:%s: couldn't register shrinker.\n",
-                      mdname(mddev));
+               pr_warn("md/raid:%s: couldn't register shrinker.\n",
+                       mdname(mddev));
                goto abort;
        }
 
        sprintf(pers_name, "raid%d", mddev->new_level);
        conf->thread = md_register_thread(raid5d, mddev, pers_name);
        if (!conf->thread) {
-               printk(KERN_ERR
-                      "md/raid:%s: couldn't allocate thread.\n",
-                      mdname(mddev));
+               pr_warn("md/raid:%s: couldn't allocate thread.\n",
+                       mdname(mddev));
                goto abort;
        }
 
@@ -6692,9 +6835,8 @@ static int raid5_run(struct mddev *mddev)
        int first = 1;
 
        if (mddev->recovery_cp != MaxSector)
-               printk(KERN_NOTICE "md/raid:%s: not clean"
-                      " -- starting background reconstruction\n",
-                      mdname(mddev));
+               pr_notice("md/raid:%s: not clean -- starting background reconstruction\n",
+                         mdname(mddev));
 
        rdev_for_each(rdev, mddev) {
                long long diff;
@@ -6737,15 +6879,14 @@ static int raid5_run(struct mddev *mddev)
                int new_data_disks;
 
                if (journal_dev) {
-                       printk(KERN_ERR "md/raid:%s: don't support reshape with journal - aborting.\n",
-                              mdname(mddev));
+                       pr_warn("md/raid:%s: don't support reshape with journal - aborting.\n",
+                               mdname(mddev));
                        return -EINVAL;
                }
 
                if (mddev->new_level != mddev->level) {
-                       printk(KERN_ERR "md/raid:%s: unsupported reshape "
-                              "required - aborting.\n",
-                              mdname(mddev));
+                       pr_warn("md/raid:%s: unsupported reshape required - aborting.\n",
+                               mdname(mddev));
                        return -EINVAL;
                }
                old_disks = mddev->raid_disks - mddev->delta_disks;
@@ -6760,8 +6901,8 @@ static int raid5_run(struct mddev *mddev)
                chunk_sectors = max(mddev->chunk_sectors, mddev->new_chunk_sectors);
                new_data_disks = mddev->raid_disks - max_degraded;
                if (sector_div(here_new, chunk_sectors * new_data_disks)) {
-                       printk(KERN_ERR "md/raid:%s: reshape_position not "
-                              "on a stripe boundary\n", mdname(mddev));
+                       pr_warn("md/raid:%s: reshape_position not on a stripe boundary\n",
+                               mdname(mddev));
                        return -EINVAL;
                }
                reshape_offset = here_new * chunk_sectors;
@@ -6782,10 +6923,8 @@ static int raid5_run(struct mddev *mddev)
                            abs(min_offset_diff) >= mddev->new_chunk_sectors)
                                /* not really in-place - so OK */;
                        else if (mddev->ro == 0) {
-                               printk(KERN_ERR "md/raid:%s: in-place reshape "
-                                      "must be started in read-only mode "
-                                      "- aborting\n",
-                                      mdname(mddev));
+                               pr_warn("md/raid:%s: in-place reshape must be started in read-only mode - aborting\n",
+                                       mdname(mddev));
                                return -EINVAL;
                        }
                } else if (mddev->reshape_backwards
@@ -6794,13 +6933,11 @@ static int raid5_run(struct mddev *mddev)
                    : (here_new * chunk_sectors >=
                       here_old * chunk_sectors + (-min_offset_diff))) {
                        /* Reading from the same stripe as writing to - bad */
-                       printk(KERN_ERR "md/raid:%s: reshape_position too early for "
-                              "auto-recovery - aborting.\n",
-                              mdname(mddev));
+                       pr_warn("md/raid:%s: reshape_position too early for auto-recovery - aborting.\n",
+                               mdname(mddev));
                        return -EINVAL;
                }
-               printk(KERN_INFO "md/raid:%s: reshape will continue\n",
-                      mdname(mddev));
+               pr_debug("md/raid:%s: reshape will continue\n", mdname(mddev));
                /* OK, we should be able to continue; */
        } else {
                BUG_ON(mddev->level != mddev->new_level);
@@ -6819,8 +6956,8 @@ static int raid5_run(struct mddev *mddev)
 
        if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) {
                if (!journal_dev) {
-                       pr_err("md/raid:%s: journal disk is missing, force array readonly\n",
-                              mdname(mddev));
+                       pr_warn("md/raid:%s: journal disk is missing, force array readonly\n",
+                               mdname(mddev));
                        mddev->ro = 1;
                        set_disk_ro(mddev->gendisk, 1);
                } else if (mddev->recovery_cp == MaxSector)
@@ -6847,8 +6984,7 @@ static int raid5_run(struct mddev *mddev)
                if (conf->disks[i].replacement &&
                    conf->reshape_progress != MaxSector) {
                        /* replacements and reshape simply do not mix. */
-                       printk(KERN_ERR "md: cannot handle concurrent "
-                              "replacement and reshape.\n");
+                       pr_warn("md: cannot handle concurrent replacement and reshape.\n");
                        goto abort;
                }
                if (test_bit(In_sync, &rdev->flags)) {
@@ -6890,8 +7026,7 @@ static int raid5_run(struct mddev *mddev)
        mddev->degraded = calc_degraded(conf);
 
        if (has_failed(conf)) {
-               printk(KERN_ERR "md/raid:%s: not enough operational devices"
-                       " (%d/%d failed)\n",
+               pr_crit("md/raid:%s: not enough operational devices (%d/%d failed)\n",
                        mdname(mddev), mddev->degraded, conf->raid_disks);
                goto abort;
        }
@@ -6903,29 +7038,19 @@ static int raid5_run(struct mddev *mddev)
        if (mddev->degraded > dirty_parity_disks &&
            mddev->recovery_cp != MaxSector) {
                if (mddev->ok_start_degraded)
-                       printk(KERN_WARNING
-                              "md/raid:%s: starting dirty degraded array"
-                              " - data corruption possible.\n",
-                              mdname(mddev));
+                       pr_crit("md/raid:%s: starting dirty degraded array - data corruption possible.\n",
+                               mdname(mddev));
                else {
-                       printk(KERN_ERR
-                              "md/raid:%s: cannot start dirty degraded array.\n",
-                              mdname(mddev));
+                       pr_crit("md/raid:%s: cannot start dirty degraded array.\n",
+                               mdname(mddev));
                        goto abort;
                }
        }
 
-       if (mddev->degraded == 0)
-               printk(KERN_INFO "md/raid:%s: raid level %d active with %d out of %d"
-                      " devices, algorithm %d\n", mdname(mddev), conf->level,
-                      mddev->raid_disks-mddev->degraded, mddev->raid_disks,
-                      mddev->new_layout);
-       else
-               printk(KERN_ALERT "md/raid:%s: raid level %d active with %d"
-                      " out of %d devices, algorithm %d\n",
-                      mdname(mddev), conf->level,
-                      mddev->raid_disks - mddev->degraded,
-                      mddev->raid_disks, mddev->new_layout);
+       pr_info("md/raid:%s: raid level %d active with %d out of %d devices, algorithm %d\n",
+               mdname(mddev), conf->level,
+               mddev->raid_disks-mddev->degraded, mddev->raid_disks,
+               mddev->new_layout);
 
        print_raid5_conf(conf);
 
@@ -6945,9 +7070,8 @@ static int raid5_run(struct mddev *mddev)
                mddev->to_remove = NULL;
        else if (mddev->kobj.sd &&
            sysfs_create_group(&mddev->kobj, &raid5_attrs_group))
-               printk(KERN_WARNING
-                      "raid5: failed to create sysfs attributes for %s\n",
-                      mdname(mddev));
+               pr_warn("raid5: failed to create sysfs attributes for %s\n",
+                       mdname(mddev));
        md_set_array_sectors(mddev, raid5_size(mddev, 0, 0));
 
        if (mddev->queue) {
@@ -6979,6 +7103,15 @@ static int raid5_run(struct mddev *mddev)
                        stripe = (stripe | (stripe-1)) + 1;
                mddev->queue->limits.discard_alignment = stripe;
                mddev->queue->limits.discard_granularity = stripe;
+
+               /*
+                * We use a 16-bit counter of active stripes in bi_phys_segments
+                * (minus one for the over-loaded initialization)
+                */
+               blk_queue_max_hw_sectors(mddev->queue, 0xfffe * STRIPE_SECTORS);
+               blk_queue_max_discard_sectors(mddev->queue,
+                                             0xfffe * STRIPE_SECTORS);
+
                /*
                 * unaligned part of discard request will be ignored, so can't
                 * guarantee discard_zeroes_data
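
Because bi_phys_segments now carries the active-stripe count in only 16 bits (one value lost to the biased initialisation), a single bio must never touch more than 0xfffe stripes, which is why the queue limits above are capped at 0xfffe * STRIPE_SECTORS. With 4 KiB pages that works out as follows (a quick check, not kernel code):

    #include <stdio.h>

    int main(void)
    {
            unsigned long stripe_sectors = 4096 >> 9;            /* 8 with 4 KiB pages */
            unsigned long max_sectors    = 0xfffeUL * stripe_sectors;

            /* 65534 stripes * 8 sectors * 512 bytes ~= 256 MiB per request */
            printf("max_hw_sectors = %lu sectors (%lu MiB)\n",
                   max_sectors, max_sectors * 512 / (1024 * 1024));
            return 0;
    }
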
@@ -7035,9 +7168,10 @@ static int raid5_run(struct mddev *mddev)
        if (journal_dev) {
                char b[BDEVNAME_SIZE];
 
-               printk(KERN_INFO"md/raid:%s: using device %s as journal\n",
-                      mdname(mddev), bdevname(journal_dev->bdev, b));
-               r5l_init_log(conf, journal_dev);
+               pr_debug("md/raid:%s: using device %s as journal\n",
+                        mdname(mddev), bdevname(journal_dev->bdev, b));
+               if (r5l_init_log(conf, journal_dev))
+                       goto abort;
        }
 
        return 0;
@@ -7046,7 +7180,7 @@ abort:
        print_raid5_conf(conf);
        free_conf(conf);
        mddev->private = NULL;
-       printk(KERN_ALERT "md/raid:%s: failed to run raid set.\n", mdname(mddev));
+       pr_warn("md/raid:%s: failed to run raid set.\n", mdname(mddev));
        return -EIO;
 }
 
@@ -7080,12 +7214,12 @@ static void print_raid5_conf (struct r5conf *conf)
        int i;
        struct disk_info *tmp;
 
-       printk(KERN_DEBUG "RAID conf printout:\n");
+       pr_debug("RAID conf printout:\n");
        if (!conf) {
-               printk("(conf==NULL)\n");
+               pr_debug("(conf==NULL)\n");
                return;
        }
-       printk(KERN_DEBUG " --- level:%d rd:%d wd:%d\n", conf->level,
+       pr_debug(" --- level:%d rd:%d wd:%d\n", conf->level,
               conf->raid_disks,
               conf->raid_disks - conf->mddev->degraded);
 
@@ -7093,7 +7227,7 @@ static void print_raid5_conf (struct r5conf *conf)
                char b[BDEVNAME_SIZE];
                tmp = conf->disks + i;
                if (tmp->rdev)
-                       printk(KERN_DEBUG " disk %d, o:%d, dev:%s\n",
+                       pr_debug(" disk %d, o:%d, dev:%s\n",
                               i, !test_bit(Faulty, &tmp->rdev->flags),
                               bdevname(tmp->rdev->bdev, b));
        }
@@ -7241,8 +7375,8 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev)
                 * write requests running. We should be safe
                 */
                r5l_init_log(conf, rdev);
-               printk(KERN_INFO"md/raid:%s: using device %s as journal\n",
-                      mdname(mddev), bdevname(rdev->bdev, b));
+               pr_debug("md/raid:%s: using device %s as journal\n",
+                        mdname(mddev), bdevname(rdev->bdev, b));
                return 0;
        }
        if (mddev->recovery_disabled == conf->recovery_disabled)
@@ -7346,10 +7480,10 @@ static int check_stripe_cache(struct mddev *mddev)
            > conf->min_nr_stripes ||
            ((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4
            > conf->min_nr_stripes) {
-               printk(KERN_WARNING "md/raid:%s: reshape: not enough stripes.  Needed %lu\n",
-                      mdname(mddev),
-                      ((max(mddev->chunk_sectors, mddev->new_chunk_sectors) << 9)
-                       / STRIPE_SIZE)*4);
+               pr_warn("md/raid:%s: reshape: not enough stripes.  Needed %lu\n",
+                       mdname(mddev),
+                       ((max(mddev->chunk_sectors, mddev->new_chunk_sectors) << 9)
+                        / STRIPE_SIZE)*4);
                return 0;
        }
        return 1;
@@ -7430,8 +7564,8 @@ static int raid5_start_reshape(struct mddev *mddev)
         */
        if (raid5_size(mddev, 0, conf->raid_disks + mddev->delta_disks)
            < mddev->array_sectors) {
-               printk(KERN_ERR "md/raid:%s: array size must be reduced "
-                      "before number of disks\n", mdname(mddev));
+               pr_warn("md/raid:%s: array size must be reduced before number of disks\n",
+                       mdname(mddev));
                return -EINVAL;
        }
 
@@ -7501,7 +7635,7 @@ static int raid5_start_reshape(struct mddev *mddev)
        }
        mddev->raid_disks = conf->raid_disks;
        mddev->reshape_position = conf->reshape_progress;
-       set_bit(MD_CHANGE_DEVS, &mddev->flags);
+       set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
 
        clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
        clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
@@ -7619,6 +7753,7 @@ static void raid5_quiesce(struct mddev *mddev, int state)
                /* '2' tells resync/reshape to pause so that all
                 * active stripes can drain
                 */
+               r5c_flush_cache(conf, INT_MAX);
                conf->quiesce = 2;
                wait_event_cmd(conf->wait_for_quiescent,
                                    atomic_read(&conf->active_stripes) == 0 &&
@@ -7649,8 +7784,8 @@ static void *raid45_takeover_raid0(struct mddev *mddev, int level)
 
        /* for raid0 takeover only one zone is supported */
        if (raid0_conf->nr_strip_zones > 1) {
-               printk(KERN_ERR "md/raid:%s: cannot takeover raid0 with more than one zone.\n",
-                      mdname(mddev));
+               pr_warn("md/raid:%s: cannot takeover raid0 with more than one zone.\n",
+                       mdname(mddev));
                return ERR_PTR(-EINVAL);
        }
 
@@ -7671,6 +7806,7 @@ static void *raid45_takeover_raid0(struct mddev *mddev, int level)
 static void *raid5_takeover_raid1(struct mddev *mddev)
 {
        int chunksect;
+       void *ret;
 
        if (mddev->raid_disks != 2 ||
            mddev->degraded > 1)
@@ -7692,7 +7828,10 @@ static void *raid5_takeover_raid1(struct mddev *mddev)
        mddev->new_layout = ALGORITHM_LEFT_SYMMETRIC;
        mddev->new_chunk_sectors = chunksect;
 
-       return setup_conf(mddev);
+       ret = setup_conf(mddev);
+       if (!IS_ERR_VALUE(ret))
+               clear_bit(MD_FAILFAST_SUPPORTED, &mddev->flags);
+       return ret;
 }
 
 static void *raid5_takeover_raid6(struct mddev *mddev)
@@ -7762,7 +7901,7 @@ static int raid5_check_reshape(struct mddev *mddev)
                        conf->chunk_sectors = new_chunk ;
                        mddev->chunk_sectors = new_chunk;
                }
-               set_bit(MD_CHANGE_DEVS, &mddev->flags);
+               set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
                md_wakeup_thread(mddev->thread);
        }
        return check_reshape(mddev);