drivers/md/raid5-cache.c (as of commit "md/r5cache: r5cache recovery: part 2")
1 /*
2 * Copyright (C) 2015 Shaohua Li <shli@fb.com>
3 * Copyright (C) 2016 Song Liu <songliubraving@fb.com>
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms and conditions of the GNU General Public License,
7 * version 2, as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
12 * more details.
13 *
14 */
15 #include <linux/kernel.h>
16 #include <linux/wait.h>
17 #include <linux/blkdev.h>
18 #include <linux/slab.h>
19 #include <linux/raid/md_p.h>
20 #include <linux/crc32c.h>
21 #include <linux/random.h>
22 #include "md.h"
23 #include "raid5.h"
24 #include "bitmap.h"
25
26 /*
27 * metadata/data is stored on disk in 4k units (blocks) regardless of the
28 * underlying hardware sector size. Only works with PAGE_SIZE == 4096.
29 */
30 #define BLOCK_SECTORS (8)
31
32 /*
33 * log->max_free_space is min(1/4 disk size, 10G reclaimable space).
34 *
35 * In write-through mode, reclaim runs whenever reclaimable space exceeds
36 * log->max_free_space. This keeps the log region recovery must scan bounded.
37 */
38 #define RECLAIM_MAX_FREE_SPACE (10 * 1024 * 1024 * 2) /* sector */
39 #define RECLAIM_MAX_FREE_SPACE_SHIFT (2)
40
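/*
 * Worked example (illustrative): RECLAIM_MAX_FREE_SPACE is expressed in
 * 512-byte sectors, so 10 * 1024 * 1024 * 2 sectors == 10GiB. With the
 * shift of 2 (divide by 4), a 16GiB log device ends up with
 * max_free_space = min(16GiB / 4, 10GiB) = 4GiB, while anything larger
 * than 40GiB is capped at 10GiB.
 */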
41 /* wake up reclaim thread periodically */
42 #define R5C_RECLAIM_WAKEUP_INTERVAL (30 * HZ)
43 /* start flush with these full stripes */
44 #define R5C_FULL_STRIPE_FLUSH_BATCH 256
45 /* reclaim stripes in groups */
46 #define R5C_RECLAIM_STRIPE_GROUP (NR_STRIPE_HASH_LOCKS * 2)
47
48 /*
49 * We only need 2 bios per I/O unit to make progress, but ensure we
50 * have a few more available to not get too tight.
51 */
52 #define R5L_POOL_SIZE 4
53
54 /*
55 * r5c journal modes of the array: write-back or write-through.
56 * write-through mode has identical behavior to the existing log-only
57 * implementation.
58 */
59 enum r5c_journal_mode {
60 R5C_JOURNAL_MODE_WRITE_THROUGH = 0,
61 R5C_JOURNAL_MODE_WRITE_BACK = 1,
62 };
63
64 static char *r5c_journal_mode_str[] = {"write-through",
65 "write-back"};
66 /*
67 * raid5 cache state machine
68 *
69 * With the RAID cache, each stripe works in two phases:
70 * - caching phase
71 * - writing-out phase
72 *
73 * These two phases are controlled by bit STRIPE_R5C_CACHING:
74 * if STRIPE_R5C_CACHING == 0, the stripe is in writing-out phase
75 * if STRIPE_R5C_CACHING == 1, the stripe is in caching phase
76 *
77 * When there is no journal, or the journal is in write-through mode,
78 * the stripe is always in writing-out phase.
79 *
80 * For write-back journal, the stripe is sent to caching phase on write
81 * (r5c_try_caching_write). r5c_make_stripe_write_out() kicks off
82 * the write-out phase by clearing STRIPE_R5C_CACHING.
83 *
84 * Stripes in caching phase do not write the raid disks. Instead, all
85 * writes are committed from the log device. Therefore, a stripe in
86 * caching phase handles writes as:
87 * - write to log device
88 * - return IO
89 *
90 * Stripes in writing-out phase handle writes as:
91 * - calculate parity
92 * - write pending data and parity to journal
93 * - write data and parity to raid disks
94 * - return IO for pending writes
95 */
96
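/*
 * Illustrative sketch, not part of this driver: how a caller might branch on
 * the phase described above. The helper name is hypothetical; the flag and
 * the stripe_head field are the real ones used throughout this file.
 */
static inline bool r5c_stripe_is_caching_example(struct stripe_head *sh)
{
	/* set: caching phase, writes only go to the journal and IO returns;
	 * clear: writing-out phase, parity is computed and data/parity are
	 * written to the journal and then to the raid disks.
	 */
	return test_bit(STRIPE_R5C_CACHING, &sh->state);
}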
97 struct r5l_log {
98 struct md_rdev *rdev;
99
100 u32 uuid_checksum;
101
102 sector_t device_size; /* log device size, round to
103 * BLOCK_SECTORS */
104 sector_t max_free_space; /* reclaim run if free space is at
105 * this size */
106
107 sector_t last_checkpoint; /* log tail. where recovery scan
108 * starts from */
109 u64 last_cp_seq; /* log tail sequence */
110
111 sector_t log_start; /* log head. where new data appends */
112 u64 seq; /* log head sequence */
113
114 sector_t next_checkpoint;
115 u64 next_cp_seq;
116
117 struct mutex io_mutex;
118 struct r5l_io_unit *current_io; /* current io_unit accepting new data */
119
120 spinlock_t io_list_lock;
121 struct list_head running_ios; /* io_units which are still running,
122 * and have not yet been completely
123 * written to the log */
124 struct list_head io_end_ios; /* io_units which have been completely
125 * written to the log but not yet written
126 * to the RAID */
127 struct list_head flushing_ios; /* io_units which are waiting for log
128 * cache flush */
129 struct list_head finished_ios; /* io_units which settle down in log disk */
130 struct bio flush_bio;
131
132 struct list_head no_mem_stripes; /* pending stripes, -ENOMEM */
133
134 struct kmem_cache *io_kc;
135 mempool_t *io_pool;
136 struct bio_set *bs;
137 mempool_t *meta_pool;
138
139 struct md_thread *reclaim_thread;
140 unsigned long reclaim_target; /* amount of space that needs to be
141 * reclaimed. if it's 0, reclaim spaces
142 * used by io_units which are in
143 * IO_UNIT_STRIPE_END state (e.g. reclaim
144 * doesn't wait for a specific io_unit
145 * to switch to IO_UNIT_STRIPE_END
146 * state) */
147 wait_queue_head_t iounit_wait;
148
149 struct list_head no_space_stripes; /* pending stripes, log has no space */
150 spinlock_t no_space_stripes_lock;
151
152 bool need_cache_flush;
153
154 /* for r5c_cache */
155 enum r5c_journal_mode r5c_journal_mode;
156
157 /* all stripes in r5cache, in the order of seq at sh->log_start */
158 struct list_head stripe_in_journal_list;
159
160 spinlock_t stripe_in_journal_lock;
161 atomic_t stripe_in_journal_count;
162 };
163
164 /*
165 * An IO range starts at a meta data block and ends at the next meta data
166 * block. The io_unit's meta data block tracks the data/parity that follows
167 * it. The io_unit is written to the log disk with a normal write; since we
168 * always flush the log disk first and only then start moving data to the
169 * raid disks, there is no requirement to write the io_unit with FLUSH/FUA.
170 */
171 struct r5l_io_unit {
172 struct r5l_log *log;
173
174 struct page *meta_page; /* store meta block */
175 int meta_offset; /* current offset in meta_page */
176
177 struct bio *current_bio;/* current_bio accepting new data */
178
179 atomic_t pending_stripe;/* how many stripes not flushed to raid */
180 u64 seq; /* seq number of the metablock */
181 sector_t log_start; /* where the io_unit starts */
182 sector_t log_end; /* where the io_unit ends */
183 struct list_head log_sibling; /* log->running_ios */
184 struct list_head stripe_list; /* stripes added to the io_unit */
185
186 int state;
187 bool need_split_bio;
188 };
189
190 /* r5l_io_unit state */
191 enum r5l_io_unit_state {
192 IO_UNIT_RUNNING = 0, /* accepting new IO */
193 IO_UNIT_IO_START = 1, /* io_unit bio started writing to log,
194 * no longer accepting new bio */
195 IO_UNIT_IO_END = 2, /* io_unit bio finished writing to log */
196 IO_UNIT_STRIPE_END = 3, /* stripe data finished writing to raid */
197 };
198
199 bool r5c_is_writeback(struct r5l_log *log)
200 {
201 return (log != NULL &&
202 log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_BACK);
203 }
204
205 static sector_t r5l_ring_add(struct r5l_log *log, sector_t start, sector_t inc)
206 {
207 start += inc;
208 if (start >= log->device_size)
209 start = start - log->device_size;
210 return start;
211 }
212
213 static sector_t r5l_ring_distance(struct r5l_log *log, sector_t start,
214 sector_t end)
215 {
216 if (end >= start)
217 return end - start;
218 else
219 return end + log->device_size - start;
220 }
221
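/*
 * Worked example (illustrative) for the ring helpers above: with
 * log->device_size == 8192 sectors, r5l_ring_add(log, 8000, 400) wraps to
 * 208, and r5l_ring_distance(log, 8000, 208) == 208 + 8192 - 8000 == 400.
 */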
222 static bool r5l_has_free_space(struct r5l_log *log, sector_t size)
223 {
224 sector_t used_size;
225
226 used_size = r5l_ring_distance(log, log->last_checkpoint,
227 log->log_start);
228
229 return log->device_size > used_size + size;
230 }
231
232 static void __r5l_set_io_unit_state(struct r5l_io_unit *io,
233 enum r5l_io_unit_state state)
234 {
235 if (WARN_ON(io->state >= state))
236 return;
237 io->state = state;
238 }
239
240 static void
241 r5c_return_dev_pending_writes(struct r5conf *conf, struct r5dev *dev,
242 struct bio_list *return_bi)
243 {
244 struct bio *wbi, *wbi2;
245
246 wbi = dev->written;
247 dev->written = NULL;
248 while (wbi && wbi->bi_iter.bi_sector <
249 dev->sector + STRIPE_SECTORS) {
250 wbi2 = r5_next_bio(wbi, dev->sector);
251 if (!raid5_dec_bi_active_stripes(wbi)) {
252 md_write_end(conf->mddev);
253 bio_list_add(return_bi, wbi);
254 }
255 wbi = wbi2;
256 }
257 }
258
259 void r5c_handle_cached_data_endio(struct r5conf *conf,
260 struct stripe_head *sh, int disks, struct bio_list *return_bi)
261 {
262 int i;
263
264 for (i = sh->disks; i--; ) {
265 if (sh->dev[i].written) {
266 set_bit(R5_UPTODATE, &sh->dev[i].flags);
267 r5c_return_dev_pending_writes(conf, &sh->dev[i],
268 return_bi);
269 bitmap_endwrite(conf->mddev->bitmap, sh->sector,
270 STRIPE_SECTORS,
271 !test_bit(STRIPE_DEGRADED, &sh->state),
272 0);
273 }
274 }
275 }
276
277 /* Check whether we should flush some stripes to free up stripe cache */
278 void r5c_check_stripe_cache_usage(struct r5conf *conf)
279 {
280 int total_cached;
281
282 if (!r5c_is_writeback(conf->log))
283 return;
284
285 total_cached = atomic_read(&conf->r5c_cached_partial_stripes) +
286 atomic_read(&conf->r5c_cached_full_stripes);
287
288 /*
289 * The following condition is true for either of the following:
290 * - stripe cache pressure high:
291 * total_cached > 3/4 min_nr_stripes ||
292 * empty_inactive_list_nr > 0
293 * - stripe cache pressure moderate:
294 * total_cached > 1/2 min_nr_stripes
295 */
296 if (total_cached > conf->min_nr_stripes * 1 / 2 ||
297 atomic_read(&conf->empty_inactive_list_nr) > 0)
298 r5l_wake_reclaim(conf->log, 0);
299 }
300
301 /*
302 * flush cache when there are R5C_FULL_STRIPE_FLUSH_BATCH or more full
303 * stripes in the cache
304 */
305 void r5c_check_cached_full_stripe(struct r5conf *conf)
306 {
307 if (!r5c_is_writeback(conf->log))
308 return;
309
310 /*
311 * wake up reclaim for R5C_FULL_STRIPE_FLUSH_BATCH cached stripes
312 * or a full stripe (chunk size / 4k stripes).
313 */
314 if (atomic_read(&conf->r5c_cached_full_stripes) >=
315 min(R5C_FULL_STRIPE_FLUSH_BATCH,
316 conf->chunk_sectors >> STRIPE_SHIFT))
317 r5l_wake_reclaim(conf->log, 0);
318 }
319
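/*
 * Worked example (illustrative), assuming 4KiB pages (STRIPE_SHIFT == 3):
 * with a 512KiB chunk, chunk_sectors == 1024 and chunk_sectors >>
 * STRIPE_SHIFT == 128 stripes per chunk, so reclaim is woken once
 * min(R5C_FULL_STRIPE_FLUSH_BATCH, 128) == 128 full stripes are cached.
 */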
320 /*
321 * Total log space (in sectors) needed to flush all data in cache
322 *
323 * Currently, writing-out phase automatically includes all pending writes
324 * to the same sector. So the reclaim of each stripe takes up to
325 * (conf->raid_disks + 1) pages of log space.
326 *
327 * To totally avoid deadlock due to log space, the code reserves
328 * (conf->raid_disks + 1) pages for each stripe in cache, which is not
329 * necessary in most cases.
330 *
331 * To improve this, we will need writing-out phase to be able to NOT include
332 * pending writes, which will reduce the requirement to
333 * (conf->max_degraded + 1) pages per stripe in cache.
334 */
335 static sector_t r5c_log_required_to_flush_cache(struct r5conf *conf)
336 {
337 struct r5l_log *log = conf->log;
338
339 if (!r5c_is_writeback(log))
340 return 0;
341
342 return BLOCK_SECTORS * (conf->raid_disks + 1) *
343 atomic_read(&log->stripe_in_journal_count);
344 }
345
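/*
 * Worked example (illustrative): a 6-disk array (conf->raid_disks == 6) with
 * 100 stripes in the journal reserves BLOCK_SECTORS * (6 + 1) * 100 ==
 * 5600 sectors (about 2.7MiB) of log space for flushing the cache.
 */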
346 /*
347 * evaluate log space usage and update R5C_LOG_TIGHT and R5C_LOG_CRITICAL
348 *
349 * R5C_LOG_TIGHT is set when free space on the log device is less than 3x of
350 * reclaim_required_space. R5C_LOG_CRITICAL is set when free space on the log
351 * device is less than 2x of reclaim_required_space.
352 */
353 static inline void r5c_update_log_state(struct r5l_log *log)
354 {
355 struct r5conf *conf = log->rdev->mddev->private;
356 sector_t free_space;
357 sector_t reclaim_space;
358
359 if (!r5c_is_writeback(log))
360 return;
361
362 free_space = r5l_ring_distance(log, log->log_start,
363 log->last_checkpoint);
364 reclaim_space = r5c_log_required_to_flush_cache(conf);
365 if (free_space < 2 * reclaim_space)
366 set_bit(R5C_LOG_CRITICAL, &conf->cache_state);
367 else
368 clear_bit(R5C_LOG_CRITICAL, &conf->cache_state);
369 if (free_space < 3 * reclaim_space)
370 set_bit(R5C_LOG_TIGHT, &conf->cache_state);
371 else
372 clear_bit(R5C_LOG_TIGHT, &conf->cache_state);
373 }
374
375 /*
376 * Put the stripe into writing-out phase by clearing STRIPE_R5C_CACHING.
377 * This function should only be called in write-back mode.
378 */
379 void r5c_make_stripe_write_out(struct stripe_head *sh)
380 {
381 struct r5conf *conf = sh->raid_conf;
382 struct r5l_log *log = conf->log;
383
384 BUG_ON(!r5c_is_writeback(log));
385
386 WARN_ON(!test_bit(STRIPE_R5C_CACHING, &sh->state));
387 clear_bit(STRIPE_R5C_CACHING, &sh->state);
388
389 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
390 atomic_inc(&conf->preread_active_stripes);
391
392 if (test_and_clear_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state)) {
393 BUG_ON(atomic_read(&conf->r5c_cached_partial_stripes) == 0);
394 atomic_dec(&conf->r5c_cached_partial_stripes);
395 }
396
397 if (test_and_clear_bit(STRIPE_R5C_FULL_STRIPE, &sh->state)) {
398 BUG_ON(atomic_read(&conf->r5c_cached_full_stripes) == 0);
399 atomic_dec(&conf->r5c_cached_full_stripes);
400 }
401 }
402
403 static void r5c_handle_data_cached(struct stripe_head *sh)
404 {
405 int i;
406
407 for (i = sh->disks; i--; )
408 if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) {
409 set_bit(R5_InJournal, &sh->dev[i].flags);
410 clear_bit(R5_LOCKED, &sh->dev[i].flags);
411 }
412 clear_bit(STRIPE_LOG_TRAPPED, &sh->state);
413 }
414
415 /*
416 * this journal write must contain full parity,
417 * it may also contain some data pages
418 */
419 static void r5c_handle_parity_cached(struct stripe_head *sh)
420 {
421 int i;
422
423 for (i = sh->disks; i--; )
424 if (test_bit(R5_InJournal, &sh->dev[i].flags))
425 set_bit(R5_Wantwrite, &sh->dev[i].flags);
426 }
427
428 /*
429 * Setting proper flags after writing (or flushing) data and/or parity to the
430 * log device. This is called from r5l_log_endio() or r5l_log_flush_endio().
431 */
432 static void r5c_finish_cache_stripe(struct stripe_head *sh)
433 {
434 struct r5l_log *log = sh->raid_conf->log;
435
436 if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) {
437 BUG_ON(test_bit(STRIPE_R5C_CACHING, &sh->state));
438 /*
439 * Set R5_InJournal for parity dev[pd_idx]. This means
440 * all data AND parity are in the journal. For RAID 6, it is
441 * NOT necessary to set the flag for dev[qd_idx], as the
442 * two parities are written out together.
443 */
444 set_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags);
445 } else if (test_bit(STRIPE_R5C_CACHING, &sh->state)) {
446 r5c_handle_data_cached(sh);
447 } else {
448 r5c_handle_parity_cached(sh);
449 set_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags);
450 }
451 }
452
453 static void r5l_io_run_stripes(struct r5l_io_unit *io)
454 {
455 struct stripe_head *sh, *next;
456
457 list_for_each_entry_safe(sh, next, &io->stripe_list, log_list) {
458 list_del_init(&sh->log_list);
459
460 r5c_finish_cache_stripe(sh);
461
462 set_bit(STRIPE_HANDLE, &sh->state);
463 raid5_release_stripe(sh);
464 }
465 }
466
467 static void r5l_log_run_stripes(struct r5l_log *log)
468 {
469 struct r5l_io_unit *io, *next;
470
471 assert_spin_locked(&log->io_list_lock);
472
473 list_for_each_entry_safe(io, next, &log->running_ios, log_sibling) {
474 /* don't change list order */
475 if (io->state < IO_UNIT_IO_END)
476 break;
477
478 list_move_tail(&io->log_sibling, &log->finished_ios);
479 r5l_io_run_stripes(io);
480 }
481 }
482
483 static void r5l_move_to_end_ios(struct r5l_log *log)
484 {
485 struct r5l_io_unit *io, *next;
486
487 assert_spin_locked(&log->io_list_lock);
488
489 list_for_each_entry_safe(io, next, &log->running_ios, log_sibling) {
490 /* don't change list order */
491 if (io->state < IO_UNIT_IO_END)
492 break;
493 list_move_tail(&io->log_sibling, &log->io_end_ios);
494 }
495 }
496
497 static void r5l_log_endio(struct bio *bio)
498 {
499 struct r5l_io_unit *io = bio->bi_private;
500 struct r5l_log *log = io->log;
501 unsigned long flags;
502
503 if (bio->bi_error)
504 md_error(log->rdev->mddev, log->rdev);
505
506 bio_put(bio);
507 mempool_free(io->meta_page, log->meta_pool);
508
509 spin_lock_irqsave(&log->io_list_lock, flags);
510 __r5l_set_io_unit_state(io, IO_UNIT_IO_END);
511 if (log->need_cache_flush)
512 r5l_move_to_end_ios(log);
513 else
514 r5l_log_run_stripes(log);
515 spin_unlock_irqrestore(&log->io_list_lock, flags);
516
517 if (log->need_cache_flush)
518 md_wakeup_thread(log->rdev->mddev->thread);
519 }
520
521 static void r5l_submit_current_io(struct r5l_log *log)
522 {
523 struct r5l_io_unit *io = log->current_io;
524 struct r5l_meta_block *block;
525 unsigned long flags;
526 u32 crc;
527
528 if (!io)
529 return;
530
531 block = page_address(io->meta_page);
532 block->meta_size = cpu_to_le32(io->meta_offset);
533 crc = crc32c_le(log->uuid_checksum, block, PAGE_SIZE);
534 block->checksum = cpu_to_le32(crc);
535
536 log->current_io = NULL;
537 spin_lock_irqsave(&log->io_list_lock, flags);
538 __r5l_set_io_unit_state(io, IO_UNIT_IO_START);
539 spin_unlock_irqrestore(&log->io_list_lock, flags);
540
541 submit_bio(io->current_bio);
542 }
543
544 static struct bio *r5l_bio_alloc(struct r5l_log *log)
545 {
546 struct bio *bio = bio_alloc_bioset(GFP_NOIO, BIO_MAX_PAGES, log->bs);
547
548 bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
549 bio->bi_bdev = log->rdev->bdev;
550 bio->bi_iter.bi_sector = log->rdev->data_offset + log->log_start;
551
552 return bio;
553 }
554
555 static void r5_reserve_log_entry(struct r5l_log *log, struct r5l_io_unit *io)
556 {
557 log->log_start = r5l_ring_add(log, log->log_start, BLOCK_SECTORS);
558
559 r5c_update_log_state(log);
560 /*
561 * If we filled up the log device, start from the beginning again,
562 * which will require a new bio.
563 *
564 * Note: for this to work properly the log size needs to be a multiple
565 * of BLOCK_SECTORS.
566 */
567 if (log->log_start == 0)
568 io->need_split_bio = true;
569
570 io->log_end = log->log_start;
571 }
572
573 static struct r5l_io_unit *r5l_new_meta(struct r5l_log *log)
574 {
575 struct r5l_io_unit *io;
576 struct r5l_meta_block *block;
577
578 io = mempool_alloc(log->io_pool, GFP_ATOMIC);
579 if (!io)
580 return NULL;
581 memset(io, 0, sizeof(*io));
582
583 io->log = log;
584 INIT_LIST_HEAD(&io->log_sibling);
585 INIT_LIST_HEAD(&io->stripe_list);
586 io->state = IO_UNIT_RUNNING;
587
588 io->meta_page = mempool_alloc(log->meta_pool, GFP_NOIO);
589 block = page_address(io->meta_page);
590 clear_page(block);
591 block->magic = cpu_to_le32(R5LOG_MAGIC);
592 block->version = R5LOG_VERSION;
593 block->seq = cpu_to_le64(log->seq);
594 block->position = cpu_to_le64(log->log_start);
595
596 io->log_start = log->log_start;
597 io->meta_offset = sizeof(struct r5l_meta_block);
598 io->seq = log->seq++;
599
600 io->current_bio = r5l_bio_alloc(log);
601 io->current_bio->bi_end_io = r5l_log_endio;
602 io->current_bio->bi_private = io;
603 bio_add_page(io->current_bio, io->meta_page, PAGE_SIZE, 0);
604
605 r5_reserve_log_entry(log, io);
606
607 spin_lock_irq(&log->io_list_lock);
608 list_add_tail(&io->log_sibling, &log->running_ios);
609 spin_unlock_irq(&log->io_list_lock);
610
611 return io;
612 }
613
614 static int r5l_get_meta(struct r5l_log *log, unsigned int payload_size)
615 {
616 if (log->current_io &&
617 log->current_io->meta_offset + payload_size > PAGE_SIZE)
618 r5l_submit_current_io(log);
619
620 if (!log->current_io) {
621 log->current_io = r5l_new_meta(log);
622 if (!log->current_io)
623 return -ENOMEM;
624 }
625
626 return 0;
627 }
628
629 static void r5l_append_payload_meta(struct r5l_log *log, u16 type,
630 sector_t location,
631 u32 checksum1, u32 checksum2,
632 bool checksum2_valid)
633 {
634 struct r5l_io_unit *io = log->current_io;
635 struct r5l_payload_data_parity *payload;
636
637 payload = page_address(io->meta_page) + io->meta_offset;
638 payload->header.type = cpu_to_le16(type);
639 payload->header.flags = cpu_to_le16(0);
640 payload->size = cpu_to_le32((1 + !!checksum2_valid) <<
641 (PAGE_SHIFT - 9));
642 payload->location = cpu_to_le64(location);
643 payload->checksum[0] = cpu_to_le32(checksum1);
644 if (checksum2_valid)
645 payload->checksum[1] = cpu_to_le32(checksum2);
646
647 io->meta_offset += sizeof(struct r5l_payload_data_parity) +
648 sizeof(__le32) * (1 + !!checksum2_valid);
649 }
650
651 static void r5l_append_payload_page(struct r5l_log *log, struct page *page)
652 {
653 struct r5l_io_unit *io = log->current_io;
654
655 if (io->need_split_bio) {
656 struct bio *prev = io->current_bio;
657
658 io->current_bio = r5l_bio_alloc(log);
659 bio_chain(io->current_bio, prev);
660
661 submit_bio(prev);
662 }
663
664 if (!bio_add_page(io->current_bio, page, PAGE_SIZE, 0))
665 BUG();
666
667 r5_reserve_log_entry(log, io);
668 }
669
670 static int r5l_log_stripe(struct r5l_log *log, struct stripe_head *sh,
671 int data_pages, int parity_pages)
672 {
673 int i;
674 int meta_size;
675 int ret;
676 struct r5l_io_unit *io;
677
678 meta_size =
679 ((sizeof(struct r5l_payload_data_parity) + sizeof(__le32))
680 * data_pages) +
681 sizeof(struct r5l_payload_data_parity) +
682 sizeof(__le32) * parity_pages;
683
684 ret = r5l_get_meta(log, meta_size);
685 if (ret)
686 return ret;
687
688 io = log->current_io;
689
690 for (i = 0; i < sh->disks; i++) {
691 if (!test_bit(R5_Wantwrite, &sh->dev[i].flags) ||
692 test_bit(R5_InJournal, &sh->dev[i].flags))
693 continue;
694 if (i == sh->pd_idx || i == sh->qd_idx)
695 continue;
696 r5l_append_payload_meta(log, R5LOG_PAYLOAD_DATA,
697 raid5_compute_blocknr(sh, i, 0),
698 sh->dev[i].log_checksum, 0, false);
699 r5l_append_payload_page(log, sh->dev[i].page);
700 }
701
702 if (parity_pages == 2) {
703 r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY,
704 sh->sector, sh->dev[sh->pd_idx].log_checksum,
705 sh->dev[sh->qd_idx].log_checksum, true);
706 r5l_append_payload_page(log, sh->dev[sh->pd_idx].page);
707 r5l_append_payload_page(log, sh->dev[sh->qd_idx].page);
708 } else if (parity_pages == 1) {
709 r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY,
710 sh->sector, sh->dev[sh->pd_idx].log_checksum,
711 0, false);
712 r5l_append_payload_page(log, sh->dev[sh->pd_idx].page);
713 } else /* Just writing data, not parity, in caching phase */
714 BUG_ON(parity_pages != 0);
715
716 list_add_tail(&sh->log_list, &io->stripe_list);
717 atomic_inc(&io->pending_stripe);
718 sh->log_io = io;
719
720 if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH)
721 return 0;
722
723 if (sh->log_start == MaxSector) {
724 BUG_ON(!list_empty(&sh->r5c));
725 sh->log_start = io->log_start;
726 spin_lock_irq(&log->stripe_in_journal_lock);
727 list_add_tail(&sh->r5c,
728 &log->stripe_in_journal_list);
729 spin_unlock_irq(&log->stripe_in_journal_lock);
730 atomic_inc(&log->stripe_in_journal_count);
731 }
732 return 0;
733 }
734
735 /* add stripe to no_space_stripes, and then wake up reclaim */
736 static inline void r5l_add_no_space_stripe(struct r5l_log *log,
737 struct stripe_head *sh)
738 {
739 spin_lock(&log->no_space_stripes_lock);
740 list_add_tail(&sh->log_list, &log->no_space_stripes);
741 spin_unlock(&log->no_space_stripes_lock);
742 }
743
744 /*
745 * running in raid5d, where reclaim could wait for raid5d too (when it flushes
746 * data from log to raid disks), so we shouldn't wait for reclaim here
747 */
748 int r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh)
749 {
750 struct r5conf *conf = sh->raid_conf;
751 int write_disks = 0;
752 int data_pages, parity_pages;
753 int reserve;
754 int i;
755 int ret = 0;
756 bool wake_reclaim = false;
757
758 if (!log)
759 return -EAGAIN;
760 /* Don't support stripe batch */
761 if (sh->log_io || !test_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags) ||
762 test_bit(STRIPE_SYNCING, &sh->state)) {
763 /* the stripe is written to log, we start writing it to raid */
764 clear_bit(STRIPE_LOG_TRAPPED, &sh->state);
765 return -EAGAIN;
766 }
767
768 WARN_ON(test_bit(STRIPE_R5C_CACHING, &sh->state));
769
770 for (i = 0; i < sh->disks; i++) {
771 void *addr;
772
773 if (!test_bit(R5_Wantwrite, &sh->dev[i].flags) ||
774 test_bit(R5_InJournal, &sh->dev[i].flags))
775 continue;
776
777 write_disks++;
778 /* checksum is already calculated in last run */
779 if (test_bit(STRIPE_LOG_TRAPPED, &sh->state))
780 continue;
781 addr = kmap_atomic(sh->dev[i].page);
782 sh->dev[i].log_checksum = crc32c_le(log->uuid_checksum,
783 addr, PAGE_SIZE);
784 kunmap_atomic(addr);
785 }
786 parity_pages = 1 + !!(sh->qd_idx >= 0);
787 data_pages = write_disks - parity_pages;
788
789 set_bit(STRIPE_LOG_TRAPPED, &sh->state);
790 /*
791 * The stripe must enter state machine again to finish the write, so
792 * don't delay.
793 */
794 clear_bit(STRIPE_DELAYED, &sh->state);
795 atomic_inc(&sh->count);
796
797 mutex_lock(&log->io_mutex);
798 /* meta + data */
799 reserve = (1 + write_disks) << (PAGE_SHIFT - 9);
800
801 if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) {
802 if (!r5l_has_free_space(log, reserve)) {
803 r5l_add_no_space_stripe(log, sh);
804 wake_reclaim = true;
805 } else {
806 ret = r5l_log_stripe(log, sh, data_pages, parity_pages);
807 if (ret) {
808 spin_lock_irq(&log->io_list_lock);
809 list_add_tail(&sh->log_list,
810 &log->no_mem_stripes);
811 spin_unlock_irq(&log->io_list_lock);
812 }
813 }
814 } else { /* R5C_JOURNAL_MODE_WRITE_BACK */
815 /*
816 * log space critical, do not process stripes that are
817 * not in cache yet (sh->log_start == MaxSector).
818 */
819 if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state) &&
820 sh->log_start == MaxSector) {
821 r5l_add_no_space_stripe(log, sh);
822 wake_reclaim = true;
823 reserve = 0;
824 } else if (!r5l_has_free_space(log, reserve)) {
825 if (sh->log_start == log->last_checkpoint)
826 BUG();
827 else
828 r5l_add_no_space_stripe(log, sh);
829 } else {
830 ret = r5l_log_stripe(log, sh, data_pages, parity_pages);
831 if (ret) {
832 spin_lock_irq(&log->io_list_lock);
833 list_add_tail(&sh->log_list,
834 &log->no_mem_stripes);
835 spin_unlock_irq(&log->io_list_lock);
836 }
837 }
838 }
839
840 mutex_unlock(&log->io_mutex);
841 if (wake_reclaim)
842 r5l_wake_reclaim(log, reserve);
843 return 0;
844 }
845
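/*
 * Worked example (illustrative) for the reserve calculation in
 * r5l_write_stripe() above, assuming 4KiB pages: a RAID5 full-stripe write
 * with 4 data pages and 1 parity page has write_disks == 5, so
 * reserve = (1 + 5) << (PAGE_SHIFT - 9) == 48 sectors, i.e. one meta block
 * plus five data/parity blocks.
 */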
846 void r5l_write_stripe_run(struct r5l_log *log)
847 {
848 if (!log)
849 return;
850 mutex_lock(&log->io_mutex);
851 r5l_submit_current_io(log);
852 mutex_unlock(&log->io_mutex);
853 }
854
855 int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio)
856 {
857 if (!log)
858 return -ENODEV;
859 /*
860 * we flush log disk cache first, then write stripe data to raid disks.
861 * So if bio is finished, the log disk cache is flushed already. The
862 * recovery guarantees we can recover the bio from the log disk, so we
863 * don't need to flush again
864 */
865 if (bio->bi_iter.bi_size == 0) {
866 bio_endio(bio);
867 return 0;
868 }
869 bio->bi_opf &= ~REQ_PREFLUSH;
870 return -EAGAIN;
871 }
872
873 /* This will run after log space is reclaimed */
874 static void r5l_run_no_space_stripes(struct r5l_log *log)
875 {
876 struct stripe_head *sh;
877
878 spin_lock(&log->no_space_stripes_lock);
879 while (!list_empty(&log->no_space_stripes)) {
880 sh = list_first_entry(&log->no_space_stripes,
881 struct stripe_head, log_list);
882 list_del_init(&sh->log_list);
883 set_bit(STRIPE_HANDLE, &sh->state);
884 raid5_release_stripe(sh);
885 }
886 spin_unlock(&log->no_space_stripes_lock);
887 }
888
889 /*
890 * calculate new last_checkpoint
891 * for write through mode, returns log->next_checkpoint
892 * for write back, returns log_start of first sh in stripe_in_journal_list
893 */
894 static sector_t r5c_calculate_new_cp(struct r5conf *conf)
895 {
896 struct stripe_head *sh;
897 struct r5l_log *log = conf->log;
898 sector_t new_cp;
899 unsigned long flags;
900
901 if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH)
902 return log->next_checkpoint;
903
904 spin_lock_irqsave(&log->stripe_in_journal_lock, flags);
905 if (list_empty(&conf->log->stripe_in_journal_list)) {
906 /* all stripes flushed */
907 spin_unlock_irqrestore(&log->stripe_in_journal_lock, flags);
908 return log->next_checkpoint;
909 }
910 sh = list_first_entry(&conf->log->stripe_in_journal_list,
911 struct stripe_head, r5c);
912 new_cp = sh->log_start;
913 spin_unlock_irqrestore(&log->stripe_in_journal_lock, flags);
914 return new_cp;
915 }
916
917 static sector_t r5l_reclaimable_space(struct r5l_log *log)
918 {
919 struct r5conf *conf = log->rdev->mddev->private;
920
921 return r5l_ring_distance(log, log->last_checkpoint,
922 r5c_calculate_new_cp(conf));
923 }
924
925 static void r5l_run_no_mem_stripe(struct r5l_log *log)
926 {
927 struct stripe_head *sh;
928
929 assert_spin_locked(&log->io_list_lock);
930
931 if (!list_empty(&log->no_mem_stripes)) {
932 sh = list_first_entry(&log->no_mem_stripes,
933 struct stripe_head, log_list);
934 list_del_init(&sh->log_list);
935 set_bit(STRIPE_HANDLE, &sh->state);
936 raid5_release_stripe(sh);
937 }
938 }
939
940 static bool r5l_complete_finished_ios(struct r5l_log *log)
941 {
942 struct r5l_io_unit *io, *next;
943 bool found = false;
944
945 assert_spin_locked(&log->io_list_lock);
946
947 list_for_each_entry_safe(io, next, &log->finished_ios, log_sibling) {
948 /* don't change list order */
949 if (io->state < IO_UNIT_STRIPE_END)
950 break;
951
952 log->next_checkpoint = io->log_start;
953 log->next_cp_seq = io->seq;
954
955 list_del(&io->log_sibling);
956 mempool_free(io, log->io_pool);
957 r5l_run_no_mem_stripe(log);
958
959 found = true;
960 }
961
962 return found;
963 }
964
965 static void __r5l_stripe_write_finished(struct r5l_io_unit *io)
966 {
967 struct r5l_log *log = io->log;
968 struct r5conf *conf = log->rdev->mddev->private;
969 unsigned long flags;
970
971 spin_lock_irqsave(&log->io_list_lock, flags);
972 __r5l_set_io_unit_state(io, IO_UNIT_STRIPE_END);
973
974 if (!r5l_complete_finished_ios(log)) {
975 spin_unlock_irqrestore(&log->io_list_lock, flags);
976 return;
977 }
978
979 if (r5l_reclaimable_space(log) > log->max_free_space ||
980 test_bit(R5C_LOG_TIGHT, &conf->cache_state))
981 r5l_wake_reclaim(log, 0);
982
983 spin_unlock_irqrestore(&log->io_list_lock, flags);
984 wake_up(&log->iounit_wait);
985 }
986
987 void r5l_stripe_write_finished(struct stripe_head *sh)
988 {
989 struct r5l_io_unit *io;
990
991 io = sh->log_io;
992 sh->log_io = NULL;
993
994 if (io && atomic_dec_and_test(&io->pending_stripe))
995 __r5l_stripe_write_finished(io);
996 }
997
998 static void r5l_log_flush_endio(struct bio *bio)
999 {
1000 struct r5l_log *log = container_of(bio, struct r5l_log,
1001 flush_bio);
1002 unsigned long flags;
1003 struct r5l_io_unit *io;
1004
1005 if (bio->bi_error)
1006 md_error(log->rdev->mddev, log->rdev);
1007
1008 spin_lock_irqsave(&log->io_list_lock, flags);
1009 list_for_each_entry(io, &log->flushing_ios, log_sibling)
1010 r5l_io_run_stripes(io);
1011 list_splice_tail_init(&log->flushing_ios, &log->finished_ios);
1012 spin_unlock_irqrestore(&log->io_list_lock, flags);
1013 }
1014
1015 /*
1016 * Starting to dispatch IO to the raid disks.
1017 * A log consists of a sequence of io_units, each led by a meta block. There
1018 * is one situation we want to avoid: a broken meta block in the middle of the
1019 * log prevents recovery from finding the meta blocks at the head of the log.
1020 * So if an operation requires the meta block at the head to be persistent,
1021 * we must make sure every meta block before it is persistent too. A case is:
1022 *
1023 * stripe data/parity is in the log and we start writing the stripe to the
1024 * raid disks; the data/parity must be persistent in the log before we write
1025 * to the raid disks. The solution is to strictly maintain io_unit list
1026 * order: we only write the stripes of an io_unit to the raid disks once
1027 * every io_unit before it has its data/parity safely in the log.
1028 */
1029 void r5l_flush_stripe_to_raid(struct r5l_log *log)
1030 {
1031 bool do_flush;
1032
1033 if (!log || !log->need_cache_flush)
1034 return;
1035
1036 spin_lock_irq(&log->io_list_lock);
1037 /* flush bio is running */
1038 if (!list_empty(&log->flushing_ios)) {
1039 spin_unlock_irq(&log->io_list_lock);
1040 return;
1041 }
1042 list_splice_tail_init(&log->io_end_ios, &log->flushing_ios);
1043 do_flush = !list_empty(&log->flushing_ios);
1044 spin_unlock_irq(&log->io_list_lock);
1045
1046 if (!do_flush)
1047 return;
1048 bio_reset(&log->flush_bio);
1049 log->flush_bio.bi_bdev = log->rdev->bdev;
1050 log->flush_bio.bi_end_io = r5l_log_flush_endio;
1051 bio_set_op_attrs(&log->flush_bio, REQ_OP_WRITE, WRITE_FLUSH);
1052 submit_bio(&log->flush_bio);
1053 }
1054
1055 static void r5l_write_super(struct r5l_log *log, sector_t cp);
1056 static void r5l_write_super_and_discard_space(struct r5l_log *log,
1057 sector_t end)
1058 {
1059 struct block_device *bdev = log->rdev->bdev;
1060 struct mddev *mddev;
1061
1062 r5l_write_super(log, end);
1063
1064 if (!blk_queue_discard(bdev_get_queue(bdev)))
1065 return;
1066
1067 mddev = log->rdev->mddev;
1068 /*
1069 * Discard could zero data, so before discard we must make sure the
1070 * superblock is updated to the new log tail. Updating the superblock (either
1071 * directly calling md_update_sb() or depending on the md thread) must hold
1072 * the reconfig mutex. On the other hand, raid5_quiesce is called with the
1073 * reconfig_mutex held. The first step of raid5_quiesce() is waiting for all
1074 * IO to finish, hence waiting for the reclaim thread, while the reclaim
1075 * thread is calling this function and waiting for the reconfig mutex. So
1076 * there is a deadlock. We work around this issue with a trylock.
1077 * FIXME: we could miss a discard if we can't take the reconfig mutex
1078 */
1079 set_mask_bits(&mddev->flags, 0,
1080 BIT(MD_CHANGE_DEVS) | BIT(MD_CHANGE_PENDING));
1081 if (!mddev_trylock(mddev))
1082 return;
1083 md_update_sb(mddev, 1);
1084 mddev_unlock(mddev);
1085
1086 /* discard IO error really doesn't matter, ignore it */
1087 if (log->last_checkpoint < end) {
1088 blkdev_issue_discard(bdev,
1089 log->last_checkpoint + log->rdev->data_offset,
1090 end - log->last_checkpoint, GFP_NOIO, 0);
1091 } else {
1092 blkdev_issue_discard(bdev,
1093 log->last_checkpoint + log->rdev->data_offset,
1094 log->device_size - log->last_checkpoint,
1095 GFP_NOIO, 0);
1096 blkdev_issue_discard(bdev, log->rdev->data_offset, end,
1097 GFP_NOIO, 0);
1098 }
1099 }
1100
1101 /*
1102 * r5c_flush_stripe moves stripe from cached list to handle_list. When called,
1103 * the stripe must be on r5c_cached_full_stripes or r5c_cached_partial_stripes.
1104 *
1105 * must hold conf->device_lock
1106 */
1107 static void r5c_flush_stripe(struct r5conf *conf, struct stripe_head *sh)
1108 {
1109 BUG_ON(list_empty(&sh->lru));
1110 BUG_ON(!test_bit(STRIPE_R5C_CACHING, &sh->state));
1111 BUG_ON(test_bit(STRIPE_HANDLE, &sh->state));
1112
1113 /*
1114 * The stripe is not ON_RELEASE_LIST, so it is safe to call
1115 * raid5_release_stripe() while holding conf->device_lock
1116 */
1117 BUG_ON(test_bit(STRIPE_ON_RELEASE_LIST, &sh->state));
1118 assert_spin_locked(&conf->device_lock);
1119
1120 list_del_init(&sh->lru);
1121 atomic_inc(&sh->count);
1122
1123 set_bit(STRIPE_HANDLE, &sh->state);
1124 atomic_inc(&conf->active_stripes);
1125 r5c_make_stripe_write_out(sh);
1126
1127 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
1128 atomic_inc(&conf->preread_active_stripes);
1129 raid5_release_stripe(sh);
1130 }
1131
1132 /*
1133 * if num == 0, flush all full stripes
1134 * if num > 0, flush all full stripes. If fewer than num full stripes are
1135 * flushed, flush some partial stripes until num stripes in total are
1136 * flushed or there are no more cached stripes.
1137 */
1138 void r5c_flush_cache(struct r5conf *conf, int num)
1139 {
1140 int count;
1141 struct stripe_head *sh, *next;
1142
1143 assert_spin_locked(&conf->device_lock);
1144 if (!conf->log)
1145 return;
1146
1147 count = 0;
1148 list_for_each_entry_safe(sh, next, &conf->r5c_full_stripe_list, lru) {
1149 r5c_flush_stripe(conf, sh);
1150 count++;
1151 }
1152
1153 if (count >= num)
1154 return;
1155 list_for_each_entry_safe(sh, next,
1156 &conf->r5c_partial_stripe_list, lru) {
1157 r5c_flush_stripe(conf, sh);
1158 if (++count >= num)
1159 break;
1160 }
1161 }
1162
1163 static void r5c_do_reclaim(struct r5conf *conf)
1164 {
1165 struct r5l_log *log = conf->log;
1166 struct stripe_head *sh;
1167 int count = 0;
1168 unsigned long flags;
1169 int total_cached;
1170 int stripes_to_flush;
1171
1172 if (!r5c_is_writeback(log))
1173 return;
1174
1175 total_cached = atomic_read(&conf->r5c_cached_partial_stripes) +
1176 atomic_read(&conf->r5c_cached_full_stripes);
1177
1178 if (total_cached > conf->min_nr_stripes * 3 / 4 ||
1179 atomic_read(&conf->empty_inactive_list_nr) > 0)
1180 /*
1181 * if stripe cache pressure is high, flush all full stripes and
1182 * some partial stripes
1183 */
1184 stripes_to_flush = R5C_RECLAIM_STRIPE_GROUP;
1185 else if (total_cached > conf->min_nr_stripes * 1 / 2 ||
1186 atomic_read(&conf->r5c_cached_full_stripes) >
1187 R5C_FULL_STRIPE_FLUSH_BATCH)
1188 /*
1189 * if stripe cache pressure is moderate, or if there are many full
1190 * stripes, flush all full stripes
1191 */
1192 stripes_to_flush = 0;
1193 else
1194 /* no need to flush */
1195 stripes_to_flush = -1;
1196
1197 if (stripes_to_flush >= 0) {
1198 spin_lock_irqsave(&conf->device_lock, flags);
1199 r5c_flush_cache(conf, stripes_to_flush);
1200 spin_unlock_irqrestore(&conf->device_lock, flags);
1201 }
1202
1203 /* if log space is tight, flush stripes on stripe_in_journal_list */
1204 if (test_bit(R5C_LOG_TIGHT, &conf->cache_state)) {
1205 spin_lock_irqsave(&log->stripe_in_journal_lock, flags);
1206 spin_lock(&conf->device_lock);
1207 list_for_each_entry(sh, &log->stripe_in_journal_list, r5c) {
1208 /*
1209 * stripes on stripe_in_journal_list could be in any
1210 * state of the stripe_cache state machine. In this
1211 * case, we only want to flush stripe on
1212 * r5c_cached_full/partial_stripes. The following
1213 * condition makes sure the stripe is on one of the
1214 * two lists.
1215 */
1216 if (!list_empty(&sh->lru) &&
1217 !test_bit(STRIPE_HANDLE, &sh->state) &&
1218 atomic_read(&sh->count) == 0) {
1219 r5c_flush_stripe(conf, sh);
1220 }
1221 if (count++ >= R5C_RECLAIM_STRIPE_GROUP)
1222 break;
1223 }
1224 spin_unlock(&conf->device_lock);
1225 spin_unlock_irqrestore(&log->stripe_in_journal_lock, flags);
1226 }
1227 md_wakeup_thread(conf->mddev->thread);
1228 }
1229
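/*
 * Worked example (illustrative) for the thresholds in r5c_do_reclaim()
 * above: with conf->min_nr_stripes == 256, stripe cache pressure counts as
 * high above 192 cached stripes (3/4) and as moderate above 128 (1/2). The
 * high-pressure case flushes all full stripes plus partial stripes up to
 * R5C_RECLAIM_STRIPE_GROUP in total; the moderate case flushes only the
 * full stripes.
 */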
1230 static void r5l_do_reclaim(struct r5l_log *log)
1231 {
1232 struct r5conf *conf = log->rdev->mddev->private;
1233 sector_t reclaim_target = xchg(&log->reclaim_target, 0);
1234 sector_t reclaimable;
1235 sector_t next_checkpoint;
1236 bool write_super;
1237
1238 spin_lock_irq(&log->io_list_lock);
1239 write_super = r5l_reclaimable_space(log) > log->max_free_space ||
1240 reclaim_target != 0 || !list_empty(&log->no_space_stripes);
1241 /*
1242 * move proper io_unit to reclaim list. We should not change the order.
1243 * reclaimable/unreclaimable io_unit can be mixed in the list, we
1244 * shouldn't reuse space of an unreclaimable io_unit
1245 */
1246 while (1) {
1247 reclaimable = r5l_reclaimable_space(log);
1248 if (reclaimable >= reclaim_target ||
1249 (list_empty(&log->running_ios) &&
1250 list_empty(&log->io_end_ios) &&
1251 list_empty(&log->flushing_ios) &&
1252 list_empty(&log->finished_ios)))
1253 break;
1254
1255 md_wakeup_thread(log->rdev->mddev->thread);
1256 wait_event_lock_irq(log->iounit_wait,
1257 r5l_reclaimable_space(log) > reclaimable,
1258 log->io_list_lock);
1259 }
1260
1261 next_checkpoint = r5c_calculate_new_cp(conf);
1262 spin_unlock_irq(&log->io_list_lock);
1263
1264 BUG_ON(reclaimable < 0);
1265
1266 if (reclaimable == 0 || !write_super)
1267 return;
1268
1269 /*
1270 * write_super will flush cache of each raid disk. We must write super
1271 * here, because the log area might be reused soon and we don't want to
1272 * confuse recovery
1273 */
1274 r5l_write_super_and_discard_space(log, next_checkpoint);
1275
1276 mutex_lock(&log->io_mutex);
1277 log->last_checkpoint = next_checkpoint;
1278 r5c_update_log_state(log);
1279 mutex_unlock(&log->io_mutex);
1280
1281 r5l_run_no_space_stripes(log);
1282 }
1283
1284 static void r5l_reclaim_thread(struct md_thread *thread)
1285 {
1286 struct mddev *mddev = thread->mddev;
1287 struct r5conf *conf = mddev->private;
1288 struct r5l_log *log = conf->log;
1289
1290 if (!log)
1291 return;
1292 r5c_do_reclaim(conf);
1293 r5l_do_reclaim(log);
1294 }
1295
1296 void r5l_wake_reclaim(struct r5l_log *log, sector_t space)
1297 {
1298 unsigned long target;
1299 unsigned long new = (unsigned long)space; /* overflow in theory */
1300
1301 if (!log)
1302 return;
1303 do {
1304 target = log->reclaim_target;
1305 if (new < target)
1306 return;
1307 } while (cmpxchg(&log->reclaim_target, target, new) != target);
1308 md_wakeup_thread(log->reclaim_thread);
1309 }
1310
1311 void r5l_quiesce(struct r5l_log *log, int state)
1312 {
1313 struct mddev *mddev;
1314 if (!log || state == 2)
1315 return;
1316 if (state == 0) {
1317 /*
1318 * This is a special case for hotadd. In suspend, the array has
1319 * no journal. In resume, journal is initialized as well as the
1320 * reclaim thread.
1321 */
1322 if (log->reclaim_thread)
1323 return;
1324 log->reclaim_thread = md_register_thread(r5l_reclaim_thread,
1325 log->rdev->mddev, "reclaim");
1326 log->reclaim_thread->timeout = R5C_RECLAIM_WAKEUP_INTERVAL;
1327 } else if (state == 1) {
1328 /* make sure r5l_write_super_and_discard_space exits */
1329 mddev = log->rdev->mddev;
1330 wake_up(&mddev->sb_wait);
1331 r5l_wake_reclaim(log, MaxSector);
1332 md_unregister_thread(&log->reclaim_thread);
1333 r5l_do_reclaim(log);
1334 }
1335 }
1336
1337 bool r5l_log_disk_error(struct r5conf *conf)
1338 {
1339 struct r5l_log *log;
1340 bool ret;
1341 /* don't allow write if journal disk is missing */
1342 rcu_read_lock();
1343 log = rcu_dereference(conf->log);
1344
1345 if (!log)
1346 ret = test_bit(MD_HAS_JOURNAL, &conf->mddev->flags);
1347 else
1348 ret = test_bit(Faulty, &log->rdev->flags);
1349 rcu_read_unlock();
1350 return ret;
1351 }
1352
1353 struct r5l_recovery_ctx {
1354 struct page *meta_page; /* current meta */
1355 sector_t meta_total_blocks; /* total size of current meta and data */
1356 sector_t pos; /* recovery position */
1357 u64 seq; /* recovery position seq */
1358 int data_parity_stripes; /* number of data_parity stripes */
1359 int data_only_stripes; /* number of data_only stripes */
1360 struct list_head cached_list;
1361 };
1362
1363 static int r5l_recovery_read_meta_block(struct r5l_log *log,
1364 struct r5l_recovery_ctx *ctx)
1365 {
1366 struct page *page = ctx->meta_page;
1367 struct r5l_meta_block *mb;
1368 u32 crc, stored_crc;
1369
1370 if (!sync_page_io(log->rdev, ctx->pos, PAGE_SIZE, page, REQ_OP_READ, 0,
1371 false))
1372 return -EIO;
1373
1374 mb = page_address(page);
1375 stored_crc = le32_to_cpu(mb->checksum);
1376 mb->checksum = 0;
1377
1378 if (le32_to_cpu(mb->magic) != R5LOG_MAGIC ||
1379 le64_to_cpu(mb->seq) != ctx->seq ||
1380 mb->version != R5LOG_VERSION ||
1381 le64_to_cpu(mb->position) != ctx->pos)
1382 return -EINVAL;
1383
1384 crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE);
1385 if (stored_crc != crc)
1386 return -EINVAL;
1387
1388 if (le32_to_cpu(mb->meta_size) > PAGE_SIZE)
1389 return -EINVAL;
1390
1391 ctx->meta_total_blocks = BLOCK_SECTORS;
1392
1393 return 0;
1394 }
1395
1396 static void
1397 r5l_recovery_create_empty_meta_block(struct r5l_log *log,
1398 struct page *page,
1399 sector_t pos, u64 seq)
1400 {
1401 struct r5l_meta_block *mb;
1402 u32 crc;
1403
1404 mb = page_address(page);
1405 clear_page(mb);
1406 mb->magic = cpu_to_le32(R5LOG_MAGIC);
1407 mb->version = R5LOG_VERSION;
1408 mb->meta_size = cpu_to_le32(sizeof(struct r5l_meta_block));
1409 mb->seq = cpu_to_le64(seq);
1410 mb->position = cpu_to_le64(pos);
1411 crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE);
1412 mb->checksum = cpu_to_le32(crc);
1413 }
1414
1415 static int r5l_log_write_empty_meta_block(struct r5l_log *log, sector_t pos,
1416 u64 seq)
1417 {
1418 struct page *page;
1419
1420 page = alloc_page(GFP_KERNEL);
1421 if (!page)
1422 return -ENOMEM;
1423 r5l_recovery_create_empty_meta_block(log, page, pos, seq);
1424 if (!sync_page_io(log->rdev, pos, PAGE_SIZE, page, REQ_OP_WRITE,
1425 WRITE_FUA, false)) {
1426 __free_page(page);
1427 return -EIO;
1428 }
1429 __free_page(page);
1430 return 0;
1431 }
1432
1433 /*
1434 * r5l_recovery_load_data and r5l_recovery_load_parity use flag R5_Wantwrite
1435 * to mark valid (potentially not flushed) data in the journal.
1436 *
1437 * We already verified checksum in r5l_recovery_verify_data_checksum_for_mb,
1438 * so there should not be any mismatch here.
1439 */
1440 static void r5l_recovery_load_data(struct r5l_log *log,
1441 struct stripe_head *sh,
1442 struct r5l_recovery_ctx *ctx,
1443 struct r5l_payload_data_parity *payload,
1444 sector_t log_offset)
1445 {
1446 struct mddev *mddev = log->rdev->mddev;
1447 struct r5conf *conf = mddev->private;
1448 int dd_idx;
1449
1450 raid5_compute_sector(conf,
1451 le64_to_cpu(payload->location), 0,
1452 &dd_idx, sh);
1453 sync_page_io(log->rdev, log_offset, PAGE_SIZE,
1454 sh->dev[dd_idx].page, REQ_OP_READ, 0, false);
1455 sh->dev[dd_idx].log_checksum =
1456 le32_to_cpu(payload->checksum[0]);
1457 ctx->meta_total_blocks += BLOCK_SECTORS;
1458
1459 set_bit(R5_Wantwrite, &sh->dev[dd_idx].flags);
1460 set_bit(STRIPE_R5C_CACHING, &sh->state);
1461 }
1462
1463 static void r5l_recovery_load_parity(struct r5l_log *log,
1464 struct stripe_head *sh,
1465 struct r5l_recovery_ctx *ctx,
1466 struct r5l_payload_data_parity *payload,
1467 sector_t log_offset)
1468 {
1469 struct mddev *mddev = log->rdev->mddev;
1470 struct r5conf *conf = mddev->private;
1471
1472 ctx->meta_total_blocks += BLOCK_SECTORS * conf->max_degraded;
1473 sync_page_io(log->rdev, log_offset, PAGE_SIZE,
1474 sh->dev[sh->pd_idx].page, REQ_OP_READ, 0, false);
1475 sh->dev[sh->pd_idx].log_checksum =
1476 le32_to_cpu(payload->checksum[0]);
1477 set_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags);
1478
1479 if (sh->qd_idx >= 0) {
1480 sync_page_io(log->rdev,
1481 r5l_ring_add(log, log_offset, BLOCK_SECTORS),
1482 PAGE_SIZE, sh->dev[sh->qd_idx].page,
1483 REQ_OP_READ, 0, false);
1484 sh->dev[sh->qd_idx].log_checksum =
1485 le32_to_cpu(payload->checksum[1]);
1486 set_bit(R5_Wantwrite, &sh->dev[sh->qd_idx].flags);
1487 }
1488 clear_bit(STRIPE_R5C_CACHING, &sh->state);
1489 }
1490
1491 static void r5l_recovery_reset_stripe(struct stripe_head *sh)
1492 {
1493 int i;
1494
1495 sh->state = 0;
1496 sh->log_start = MaxSector;
1497 for (i = sh->disks; i--; )
1498 sh->dev[i].flags = 0;
1499 }
1500
1501 static void
1502 r5l_recovery_replay_one_stripe(struct r5conf *conf,
1503 struct stripe_head *sh,
1504 struct r5l_recovery_ctx *ctx)
1505 {
1506 struct md_rdev *rdev, *rrdev;
1507 int disk_index;
1508 int data_count = 0;
1509
1510 for (disk_index = 0; disk_index < sh->disks; disk_index++) {
1511 if (!test_bit(R5_Wantwrite, &sh->dev[disk_index].flags))
1512 continue;
1513 if (disk_index == sh->qd_idx || disk_index == sh->pd_idx)
1514 continue;
1515 data_count++;
1516 }
1517
1518 /*
1519 * stripes that only have parity must have been flushed
1520 * before the crash that we are now recovering from, so
1521 * there is nothing more to recover.
1522 */
1523 if (data_count == 0)
1524 goto out;
1525
1526 for (disk_index = 0; disk_index < sh->disks; disk_index++) {
1527 if (!test_bit(R5_Wantwrite, &sh->dev[disk_index].flags))
1528 continue;
1529
1530 /* in case device is broken */
1531 rcu_read_lock();
1532 rdev = rcu_dereference(conf->disks[disk_index].rdev);
1533 if (rdev) {
1534 atomic_inc(&rdev->nr_pending);
1535 rcu_read_unlock();
1536 sync_page_io(rdev, sh->sector, PAGE_SIZE,
1537 sh->dev[disk_index].page, REQ_OP_WRITE, 0,
1538 false);
1539 rdev_dec_pending(rdev, rdev->mddev);
1540 rcu_read_lock();
1541 }
1542 rrdev = rcu_dereference(conf->disks[disk_index].replacement);
1543 if (rrdev) {
1544 atomic_inc(&rrdev->nr_pending);
1545 rcu_read_unlock();
1546 sync_page_io(rrdev, sh->sector, PAGE_SIZE,
1547 sh->dev[disk_index].page, REQ_OP_WRITE, 0,
1548 false);
1549 rdev_dec_pending(rrdev, rrdev->mddev);
1550 rcu_read_lock();
1551 }
1552 rcu_read_unlock();
1553 }
1554 ctx->data_parity_stripes++;
1555 out:
1556 r5l_recovery_reset_stripe(sh);
1557 }
1558
1559 static struct stripe_head *
1560 r5c_recovery_alloc_stripe(struct r5conf *conf,
1561 struct list_head *recovery_list,
1562 sector_t stripe_sect,
1563 sector_t log_start)
1564 {
1565 struct stripe_head *sh;
1566
1567 sh = raid5_get_active_stripe(conf, stripe_sect, 0, 1, 0);
1568 if (!sh)
1569 return NULL; /* no more stripe available */
1570
1571 r5l_recovery_reset_stripe(sh);
1572 sh->log_start = log_start;
1573
1574 return sh;
1575 }
1576
1577 static struct stripe_head *
1578 r5c_recovery_lookup_stripe(struct list_head *list, sector_t sect)
1579 {
1580 struct stripe_head *sh;
1581
1582 list_for_each_entry(sh, list, lru)
1583 if (sh->sector == sect)
1584 return sh;
1585 return NULL;
1586 }
1587
1588 static void
1589 r5c_recovery_drop_stripes(struct list_head *cached_stripe_list,
1590 struct r5l_recovery_ctx *ctx)
1591 {
1592 struct stripe_head *sh, *next;
1593
1594 list_for_each_entry_safe(sh, next, cached_stripe_list, lru) {
1595 r5l_recovery_reset_stripe(sh);
1596 list_del_init(&sh->lru);
1597 raid5_release_stripe(sh);
1598 }
1599 }
1600
1601 static void
1602 r5c_recovery_replay_stripes(struct list_head *cached_stripe_list,
1603 struct r5l_recovery_ctx *ctx)
1604 {
1605 struct stripe_head *sh, *next;
1606
1607 list_for_each_entry_safe(sh, next, cached_stripe_list, lru)
1608 if (!test_bit(STRIPE_R5C_CACHING, &sh->state)) {
1609 r5l_recovery_replay_one_stripe(sh->raid_conf, sh, ctx);
1610 list_del_init(&sh->lru);
1611 raid5_release_stripe(sh);
1612 }
1613 }
1614
1615 /* if matches return 0; otherwise return -EINVAL */
1616 static int
1617 r5l_recovery_verify_data_checksum(struct r5l_log *log, struct page *page,
1618 sector_t log_offset, __le32 log_checksum)
1619 {
1620 void *addr;
1621 u32 checksum;
1622
1623 sync_page_io(log->rdev, log_offset, PAGE_SIZE,
1624 page, REQ_OP_READ, 0, false);
1625 addr = kmap_atomic(page);
1626 checksum = crc32c_le(log->uuid_checksum, addr, PAGE_SIZE);
1627 kunmap_atomic(addr);
1628 return (le32_to_cpu(log_checksum) == checksum) ? 0 : -EINVAL;
1629 }
1630
1631 /*
1632 * Before loading data into the stripe cache, we need to verify the checksums
1633 * of all data; if any data page mismatches, we drop all data in the meta block
1634 */
1635 static int
1636 r5l_recovery_verify_data_checksum_for_mb(struct r5l_log *log,
1637 struct r5l_recovery_ctx *ctx)
1638 {
1639 struct mddev *mddev = log->rdev->mddev;
1640 struct r5conf *conf = mddev->private;
1641 struct r5l_meta_block *mb = page_address(ctx->meta_page);
1642 sector_t mb_offset = sizeof(struct r5l_meta_block);
1643 sector_t log_offset = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS);
1644 struct page *page;
1645 struct r5l_payload_data_parity *payload;
1646
1647 page = alloc_page(GFP_KERNEL);
1648 if (!page)
1649 return -ENOMEM;
1650
1651 while (mb_offset < le32_to_cpu(mb->meta_size)) {
1652 payload = (void *)mb + mb_offset;
1653
1654 if (payload->header.type == R5LOG_PAYLOAD_DATA) {
1655 if (r5l_recovery_verify_data_checksum(
1656 log, page, log_offset,
1657 payload->checksum[0]) < 0)
1658 goto mismatch;
1659 } else if (payload->header.type == R5LOG_PAYLOAD_PARITY) {
1660 if (r5l_recovery_verify_data_checksum(
1661 log, page, log_offset,
1662 payload->checksum[0]) < 0)
1663 goto mismatch;
1664 if (conf->max_degraded == 2 && /* q for RAID 6 */
1665 r5l_recovery_verify_data_checksum(
1666 log, page,
1667 r5l_ring_add(log, log_offset,
1668 BLOCK_SECTORS),
1669 payload->checksum[1]) < 0)
1670 goto mismatch;
1671 } else /* not R5LOG_PAYLOAD_DATA or R5LOG_PAYLOAD_PARITY */
1672 goto mismatch;
1673
1674 log_offset = r5l_ring_add(log, log_offset,
1675 le32_to_cpu(payload->size));
1676
1677 mb_offset += sizeof(struct r5l_payload_data_parity) +
1678 sizeof(__le32) *
1679 (le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9));
1680 }
1681
1682 put_page(page);
1683 return 0;
1684
1685 mismatch:
1686 put_page(page);
1687 return -EINVAL;
1688 }
1689
1690 /*
1691 * Analyze all data/parity pages in one meta block
1692 * Returns:
1693 * 0 for success
1694 * -EINVAL for unknown payload type
1695 * -EAGAIN for checksum mismatch of data page
1696 * -ENOMEM for running out of memory (alloc_page failed or out of stripes)
1697 */
1698 static int
1699 r5c_recovery_analyze_meta_block(struct r5l_log *log,
1700 struct r5l_recovery_ctx *ctx,
1701 struct list_head *cached_stripe_list)
1702 {
1703 struct mddev *mddev = log->rdev->mddev;
1704 struct r5conf *conf = mddev->private;
1705 struct r5l_meta_block *mb;
1706 struct r5l_payload_data_parity *payload;
1707 int mb_offset;
1708 sector_t log_offset;
1709 sector_t stripe_sect;
1710 struct stripe_head *sh;
1711 int ret;
1712
1713 /*
1714 * for mismatch in data blocks, we will drop all data in this mb, but
1715 * we will still read next mb for other data with FLUSH flag, as
1716 * io_unit could finish out of order.
1717 */
1718 ret = r5l_recovery_verify_data_checksum_for_mb(log, ctx);
1719 if (ret == -EINVAL)
1720 return -EAGAIN;
1721 else if (ret)
1722 return ret; /* -ENOMEM due to alloc_page() failure */
1723
1724 mb = page_address(ctx->meta_page);
1725 mb_offset = sizeof(struct r5l_meta_block);
1726 log_offset = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS);
1727
1728 while (mb_offset < le32_to_cpu(mb->meta_size)) {
1729 int dd;
1730
1731 payload = (void *)mb + mb_offset;
1732 stripe_sect = (payload->header.type == R5LOG_PAYLOAD_DATA) ?
1733 raid5_compute_sector(
1734 conf, le64_to_cpu(payload->location), 0, &dd,
1735 NULL)
1736 : le64_to_cpu(payload->location);
1737
1738 sh = r5c_recovery_lookup_stripe(cached_stripe_list,
1739 stripe_sect);
1740
1741 if (!sh) {
1742 sh = r5c_recovery_alloc_stripe(conf, cached_stripe_list,
1743 stripe_sect, ctx->pos);
1744 /*
1745 * cannot get stripe from raid5_get_active_stripe
1746 * try replay some stripes
1747 */
1748 if (!sh) {
1749 r5c_recovery_replay_stripes(
1750 cached_stripe_list, ctx);
1751 sh = r5c_recovery_alloc_stripe(
1752 conf, cached_stripe_list,
1753 stripe_sect, ctx->pos);
1754 }
1755 if (!sh) {
1756 pr_debug("md/raid:%s: Increasing stripe cache size to %d to recover data on journal.\n",
1757 mdname(mddev),
1758 conf->min_nr_stripes * 2);
1759 raid5_set_cache_size(mddev,
1760 conf->min_nr_stripes * 2);
1761 sh = r5c_recovery_alloc_stripe(
1762 conf, cached_stripe_list, stripe_sect,
1763 ctx->pos);
1764 }
1765 if (!sh) {
1766 pr_err("md/raid:%s: Cannot get enough stripes due to memory pressure. Recovery failed.\n",
1767 mdname(mddev));
1768 return -ENOMEM;
1769 }
1770 list_add_tail(&sh->lru, cached_stripe_list);
1771 }
1772
1773 if (payload->header.type == R5LOG_PAYLOAD_DATA) {
1774 if (!test_bit(STRIPE_R5C_CACHING, &sh->state)) {
1775 r5l_recovery_replay_one_stripe(conf, sh, ctx);
1776 r5l_recovery_reset_stripe(sh);
1777 sh->log_start = ctx->pos;
1778 list_move_tail(&sh->lru, cached_stripe_list);
1779 }
1780 r5l_recovery_load_data(log, sh, ctx, payload,
1781 log_offset);
1782 } else if (payload->header.type == R5LOG_PAYLOAD_PARITY)
1783 r5l_recovery_load_parity(log, sh, ctx, payload,
1784 log_offset);
1785 else
1786 return -EINVAL;
1787
1788 log_offset = r5l_ring_add(log, log_offset,
1789 le32_to_cpu(payload->size));
1790
1791 mb_offset += sizeof(struct r5l_payload_data_parity) +
1792 sizeof(__le32) *
1793 (le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9));
1794 }
1795
1796 return 0;
1797 }
1798
1799 /*
1800 * Load the stripe into cache. The stripe will be written out later by
1801 * the stripe cache state machine.
1802 */
1803 static void r5c_recovery_load_one_stripe(struct r5l_log *log,
1804 struct stripe_head *sh)
1805 {
1806 struct r5conf *conf = sh->raid_conf;
1807 struct r5dev *dev;
1808 int i;
1809
1810 for (i = sh->disks; i--; ) {
1811 dev = sh->dev + i;
1812 if (test_and_clear_bit(R5_Wantwrite, &dev->flags)) {
1813 set_bit(R5_InJournal, &dev->flags);
1814 set_bit(R5_UPTODATE, &dev->flags);
1815 }
1816 }
1817 set_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state);
1818 atomic_inc(&conf->r5c_cached_partial_stripes);
1819 list_add_tail(&sh->r5c, &log->stripe_in_journal_list);
1820 }
1821
1822 /*
1823 * Scan through the log for all to-be-flushed data
1824 *
1825 * For stripes with data and parity, namely Data-Parity stripe
1826 * (STRIPE_R5C_CACHING == 0), we simply replay all the writes.
1827 *
1828 * For stripes with only data, namely Data-Only stripe
1829 * (STRIPE_R5C_CACHING == 1), we load them to stripe cache state machine.
1830 *
1831 * For a stripe, if we see data after parity, we should discard all previous
1832 * data and parity for this stripe, as these data are already flushed to
1833 * the array.
1834 *
1835 * At the end of the scan, we return the new journal_tail, which points to
1836 * first data-only stripe on the journal device, or next invalid meta block.
1837 */
1838 static int r5c_recovery_flush_log(struct r5l_log *log,
1839 struct r5l_recovery_ctx *ctx)
1840 {
1841 struct stripe_head *sh, *next;
1842 int ret = 0;
1843
1844 /* scan through the log */
1845 while (1) {
1846 if (r5l_recovery_read_meta_block(log, ctx))
1847 break;
1848
1849 ret = r5c_recovery_analyze_meta_block(log, ctx,
1850 &ctx->cached_list);
1851                 /*
1852                  * -EAGAIN means a mismatch in a data block; in this case
1853                  * we still try to scan the next meta block
1854                  */
1855 if (ret && ret != -EAGAIN)
1856 break; /* ret == -EINVAL or -ENOMEM */
1857 ctx->seq++;
1858 ctx->pos = r5l_ring_add(log, ctx->pos, ctx->meta_total_blocks);
1859 }
1860
1861 if (ret == -ENOMEM) {
1862 r5c_recovery_drop_stripes(&ctx->cached_list, ctx);
1863 return ret;
1864 }
1865
1866 /* replay data-parity stripes */
1867 r5c_recovery_replay_stripes(&ctx->cached_list, ctx);
1868
1869 /* load data-only stripes to stripe cache */
1870 list_for_each_entry_safe(sh, next, &ctx->cached_list, lru) {
1871 WARN_ON(!test_bit(STRIPE_R5C_CACHING, &sh->state));
1872 r5c_recovery_load_one_stripe(log, sh);
1873 list_del_init(&sh->lru);
1874 raid5_release_stripe(sh);
1875 ctx->data_only_stripes++;
1876 }
1877
1878 return 0;
1879 }
1880
1881 /*
1882  * We did a recovery. Now ctx.pos points to an invalid meta block. The new
1883  * log will start here, but we can't let the superblock point to the last
1884  * valid meta block. The log might look like:
1885  * | meta 1| meta 2| meta 3|
1886  * meta 1 is valid, meta 2 is invalid, and meta 3 could be valid. If the
1887  * superblock points to meta 1 and we write a new valid meta 2n, then if a
1888  * crash happens again, the new recovery will start from meta 1. Since meta
1889  * 2n is valid now, recovery will think meta 3 is valid too, which is wrong.
1890  * The solution is to create a new meta block at meta 2 with its seq == meta
1891  * 1's seq + 10 and let the superblock point to meta 2. That recovery will
1892  * not treat meta 3 as a valid meta block, because its seq doesn't match.
1893  */
1894
1895 /*
1896 * Before recovery, the log looks like the following
1897 *
1898 * ---------------------------------------------
1899 * | valid log | invalid log |
1900 * ---------------------------------------------
1901 * ^
1902 * |- log->last_checkpoint
1903 * |- log->last_cp_seq
1904 *
1905 * Now we scan through the log until we see invalid entry
1906 *
1907 * ---------------------------------------------
1908 * | valid log | invalid log |
1909 * ---------------------------------------------
1910 * ^ ^
1911 * |- log->last_checkpoint |- ctx->pos
1912 * |- log->last_cp_seq |- ctx->seq
1913 *
1914 * From this point, we need to increase seq number by 10 to avoid
1915 * confusing next recovery.
1916 *
1917 * ---------------------------------------------
1918 * | valid log | invalid log |
1919 * ---------------------------------------------
1920 * ^ ^
1921 * |- log->last_checkpoint |- ctx->pos+1
1922 * |- log->last_cp_seq |- ctx->seq+11
1923 *
1924  * However, it is not safe to start the state machine yet, because the
1925  * data-only stripes are not yet secured in the RAID array. To save these
1926  * data-only stripes, we rewrite them to the journal starting from seq+11.
1927 *
1928 * -----------------------------------------------------------------
1929 * | valid log | data only stripes | invalid log |
1930 * -----------------------------------------------------------------
1931 * ^ ^
1932 * |- log->last_checkpoint |- ctx->pos+n
1933 * |- log->last_cp_seq |- ctx->seq+10+n
1934 *
1935  * If failure happens again during this process, the recovery can safely
1936  * start again from log->last_checkpoint.
1937  *
1938  * Once the data-only stripes are rewritten to the journal, we move the log tail:
1939 *
1940 * -----------------------------------------------------------------
1941 * | old log | data only stripes | invalid log |
1942 * -----------------------------------------------------------------
1943 * ^ ^
1944 * |- log->last_checkpoint |- ctx->pos+n
1945 * |- log->last_cp_seq |- ctx->seq+10+n
1946 *
1947 * Then we can safely start the state machine. If failure happens from this
1948  * point on, the recovery will start from the new log->last_checkpoint.
1949 */
1950 static int
1951 r5c_recovery_rewrite_data_only_stripes(struct r5l_log *log,
1952 struct r5l_recovery_ctx *ctx)
1953 {
1954 struct stripe_head *sh;
1955 struct mddev *mddev = log->rdev->mddev;
1956 struct page *page;
1957
1958 page = alloc_page(GFP_KERNEL);
1959 if (!page) {
1960 pr_err("md/raid:%s: cannot allocate memory to rewrite data only stripes\n",
1961 mdname(mddev));
1962 return -ENOMEM;
1963 }
1964
1965 ctx->seq += 10;
1966 list_for_each_entry(sh, &ctx->cached_list, lru) {
1967 struct r5l_meta_block *mb;
1968 int i;
1969 int offset;
1970 sector_t write_pos;
1971
1972 WARN_ON(!test_bit(STRIPE_R5C_CACHING, &sh->state));
1973 r5l_recovery_create_empty_meta_block(log, page,
1974 ctx->pos, ctx->seq);
1975 mb = page_address(page);
1976 offset = le32_to_cpu(mb->meta_size);
1977 write_pos = ctx->pos + BLOCK_SECTORS;
1978
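		/* Data pages follow the meta block, one 4KB block each */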
1979 for (i = sh->disks; i--; ) {
1980 struct r5dev *dev = &sh->dev[i];
1981 struct r5l_payload_data_parity *payload;
1982 void *addr;
1983
1984 if (test_bit(R5_InJournal, &dev->flags)) {
1985 payload = (void *)mb + offset;
1986 payload->header.type = cpu_to_le16(
1987 R5LOG_PAYLOAD_DATA);
1988                                 payload->size = cpu_to_le32(BLOCK_SECTORS);
1989 payload->location = cpu_to_le64(
1990 raid5_compute_blocknr(sh, i, 0));
1991 addr = kmap_atomic(dev->page);
1992 payload->checksum[0] = cpu_to_le32(
1993 crc32c_le(log->uuid_checksum, addr,
1994 PAGE_SIZE));
1995 kunmap_atomic(addr);
1996 sync_page_io(log->rdev, write_pos, PAGE_SIZE,
1997 dev->page, REQ_OP_WRITE, 0, false);
1998 write_pos = r5l_ring_add(log, write_pos,
1999 BLOCK_SECTORS);
2000 offset += sizeof(__le32) +
2001 sizeof(struct r5l_payload_data_parity);
2002
2003 }
2004 }
2005 mb->meta_size = cpu_to_le32(offset);
2006                 mb->checksum = cpu_to_le32(crc32c_le(log->uuid_checksum, mb, PAGE_SIZE));
2007 sync_page_io(log->rdev, ctx->pos, PAGE_SIZE, page,
2008 REQ_OP_WRITE, WRITE_FUA, false);
2009 sh->log_start = ctx->pos;
2010 ctx->pos = write_pos;
2011 ctx->seq += 1;
2012 }
2013 __free_page(page);
2014 return 0;
2015 }
2016
2017 static int r5l_recovery_log(struct r5l_log *log)
2018 {
2019 struct mddev *mddev = log->rdev->mddev;
2020 struct r5l_recovery_ctx ctx;
2021 int ret;
2022
2023 ctx.pos = log->last_checkpoint;
2024 ctx.seq = log->last_cp_seq;
2025 ctx.meta_page = alloc_page(GFP_KERNEL);
2026 ctx.data_only_stripes = 0;
2027 ctx.data_parity_stripes = 0;
2028 INIT_LIST_HEAD(&ctx.cached_list);
2029
2030 if (!ctx.meta_page)
2031 return -ENOMEM;
2032
2033 ret = r5c_recovery_flush_log(log, &ctx);
2034 __free_page(ctx.meta_page);
2035
2036 if (ret)
2037 return ret;
2038
2039 if ((ctx.data_only_stripes == 0) && (ctx.data_parity_stripes == 0))
2040 pr_debug("md/raid:%s: starting from clean shutdown\n",
2041 mdname(mddev));
2042 else {
2043 pr_debug("md/raid:%s: recoverying %d data-only stripes and %d data-parity stripes\n",
2044 mdname(mddev), ctx.data_only_stripes,
2045 ctx.data_parity_stripes);
2046
2047 if (ctx.data_only_stripes > 0)
2048 if (r5c_recovery_rewrite_data_only_stripes(log, &ctx)) {
2049 pr_err("md/raid:%s: failed to rewrite stripes to journal\n",
2050 mdname(mddev));
2051 return -EIO;
2052 }
2053 }
2054
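	/*
	 * Start the new log right after the recovered (and rewritten) region:
	 * write an empty meta block at ctx.pos and point the superblock's
	 * journal tail at it.
	 */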
2055 log->log_start = ctx.pos;
2056 log->next_checkpoint = ctx.pos;
2057 log->seq = ctx.seq;
2058 r5l_log_write_empty_meta_block(log, ctx.pos, ctx.seq);
2059 r5l_write_super(log, ctx.pos);
2060 return 0;
2061 }
2062
2063 static void r5l_write_super(struct r5l_log *log, sector_t cp)
2064 {
2065 struct mddev *mddev = log->rdev->mddev;
2066
2067 log->rdev->journal_tail = cp;
2068 set_bit(MD_CHANGE_DEVS, &mddev->flags);
2069 }
2070
2071 static ssize_t r5c_journal_mode_show(struct mddev *mddev, char *page)
2072 {
2073 struct r5conf *conf = mddev->private;
2074 int ret;
2075
2076 if (!conf->log)
2077 return 0;
2078
2079 switch (conf->log->r5c_journal_mode) {
2080 case R5C_JOURNAL_MODE_WRITE_THROUGH:
2081 ret = snprintf(
2082 page, PAGE_SIZE, "[%s] %s\n",
2083 r5c_journal_mode_str[R5C_JOURNAL_MODE_WRITE_THROUGH],
2084 r5c_journal_mode_str[R5C_JOURNAL_MODE_WRITE_BACK]);
2085 break;
2086 case R5C_JOURNAL_MODE_WRITE_BACK:
2087 ret = snprintf(
2088 page, PAGE_SIZE, "%s [%s]\n",
2089 r5c_journal_mode_str[R5C_JOURNAL_MODE_WRITE_THROUGH],
2090 r5c_journal_mode_str[R5C_JOURNAL_MODE_WRITE_BACK]);
2091 break;
2092 default:
2093 ret = 0;
2094 }
2095 return ret;
2096 }
2097
2098 static ssize_t r5c_journal_mode_store(struct mddev *mddev,
2099 const char *page, size_t length)
2100 {
2101 struct r5conf *conf = mddev->private;
2102 struct r5l_log *log = conf->log;
2103 int val = -1, i;
2104 int len = length;
2105
2106 if (!log)
2107 return -ENODEV;
2108
2109 if (len && page[len - 1] == '\n')
2110 len -= 1;
2111 for (i = 0; i < ARRAY_SIZE(r5c_journal_mode_str); i++)
2112 if (strlen(r5c_journal_mode_str[i]) == len &&
2113 strncmp(page, r5c_journal_mode_str[i], len) == 0) {
2114 val = i;
2115 break;
2116 }
2117 if (val < R5C_JOURNAL_MODE_WRITE_THROUGH ||
2118 val > R5C_JOURNAL_MODE_WRITE_BACK)
2119 return -EINVAL;
2120
2121 mddev_suspend(mddev);
2122 conf->log->r5c_journal_mode = val;
2123 mddev_resume(mddev);
2124
2125 pr_debug("md/raid:%s: setting r5c cache mode to %d: %s\n",
2126 mdname(mddev), val, r5c_journal_mode_str[val]);
2127 return length;
2128 }
2129
2130 struct md_sysfs_entry
2131 r5c_journal_mode = __ATTR(journal_mode, 0644,
2132 r5c_journal_mode_show, r5c_journal_mode_store);
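/*
 * Example of switching the journal mode from userspace through the sysfs
 * attribute defined above (the array name md0 and the exact sysfs path are
 * assumptions for illustration):
 *
 *   $ cat /sys/block/md0/md/journal_mode
 *   [write-through] write-back
 *   $ echo write-back > /sys/block/md0/md/journal_mode
 *   $ cat /sys/block/md0/md/journal_mode
 *   write-through [write-back]
 */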
2133
2134 /*
2135  * Try to handle the write operation in the caching phase. This function
2136  * should only be called in write-back mode.
2137  *
2138  * If all outstanding writes can be handled in the caching phase, returns 0.
2139  * If the writes require the write-out phase, calls r5c_make_stripe_write_out()
2140  * and returns -EAGAIN.
2141  */
2142 int r5c_try_caching_write(struct r5conf *conf,
2143 struct stripe_head *sh,
2144 struct stripe_head_state *s,
2145 int disks)
2146 {
2147 struct r5l_log *log = conf->log;
2148 int i;
2149 struct r5dev *dev;
2150 int to_cache = 0;
2151
2152 BUG_ON(!r5c_is_writeback(log));
2153
2154 if (!test_bit(STRIPE_R5C_CACHING, &sh->state)) {
2155 /*
2156 * There are two different scenarios here:
2157 * 1. The stripe has some data cached, and it is sent to
2158 * write-out phase for reclaim
2159 * 2. The stripe is clean, and this is the first write
2160 *
2161 * For 1, return -EAGAIN, so we continue with
2162 * handle_stripe_dirtying().
2163 *
2164 * For 2, set STRIPE_R5C_CACHING and continue with caching
2165 * write.
2166 */
2167
2168                 /* case 1: anything in s->injournal or s->written */
2169 if (s->injournal > 0 || s->written > 0)
2170 return -EAGAIN;
2171 /* case 2 */
2172 set_bit(STRIPE_R5C_CACHING, &sh->state);
2173 }
2174
2175 for (i = disks; i--; ) {
2176 dev = &sh->dev[i];
2177 /* if non-overwrite, use writing-out phase */
2178 if (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags) &&
2179 !test_bit(R5_InJournal, &dev->flags)) {
2180 r5c_make_stripe_write_out(sh);
2181 return -EAGAIN;
2182 }
2183 }
2184
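	/*
	 * Every remaining write either fully overwrites its page or hits a
	 * page already in the journal; mark each one to be drained and
	 * journaled in the caching phase.
	 */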
2185 for (i = disks; i--; ) {
2186 dev = &sh->dev[i];
2187 if (dev->towrite) {
2188 set_bit(R5_Wantwrite, &dev->flags);
2189 set_bit(R5_Wantdrain, &dev->flags);
2190 set_bit(R5_LOCKED, &dev->flags);
2191 to_cache++;
2192 }
2193 }
2194
2195 if (to_cache) {
2196 set_bit(STRIPE_OP_BIODRAIN, &s->ops_request);
2197 /*
2198 * set STRIPE_LOG_TRAPPED, which triggers r5c_cache_data()
2199 * in ops_run_io(). STRIPE_LOG_TRAPPED will be cleared in
2200 * r5c_handle_data_cached()
2201 */
2202 set_bit(STRIPE_LOG_TRAPPED, &sh->state);
2203 }
2204
2205 return 0;
2206 }
2207
2208 /*
2209 * free extra pages (orig_page) we allocated for prexor
2210 */
2211 void r5c_release_extra_page(struct stripe_head *sh)
2212 {
2213 int i;
2214
2215 for (i = sh->disks; i--; )
2216 if (sh->dev[i].page != sh->dev[i].orig_page) {
2217 struct page *p = sh->dev[i].orig_page;
2218
2219 sh->dev[i].orig_page = sh->dev[i].page;
2220 put_page(p);
2221 }
2222 }
2223
2224 /*
2225 * clean up the stripe (clear R5_InJournal for dev[pd_idx] etc.) after the
2226 * stripe is committed to RAID disks.
2227 */
2228 void r5c_finish_stripe_write_out(struct r5conf *conf,
2229 struct stripe_head *sh,
2230 struct stripe_head_state *s)
2231 {
2232 int i;
2233 int do_wakeup = 0;
2234
2235 if (!conf->log ||
2236 !test_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags))
2237 return;
2238
2239 WARN_ON(test_bit(STRIPE_R5C_CACHING, &sh->state));
2240 clear_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags);
2241
2242 if (conf->log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH)
2243 return;
2244
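	/*
	 * Write-back mode: clear R5_InJournal on all devices and wake up any
	 * writers that were waiting on overlapping requests.
	 */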
2245 for (i = sh->disks; i--; ) {
2246 clear_bit(R5_InJournal, &sh->dev[i].flags);
2247 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
2248 do_wakeup = 1;
2249 }
2250
2251         /*
2252          * analyse_stripe() runs before r5c_finish_stripe_write_out();
2253          * since we updated R5_InJournal here, we also update s->injournal.
2254          */
2255 s->injournal = 0;
2256
2257 if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state))
2258 if (atomic_dec_and_test(&conf->pending_full_writes))
2259 md_wakeup_thread(conf->mddev->thread);
2260
2261 if (do_wakeup)
2262 wake_up(&conf->wait_for_overlap);
2263
2264 if (conf->log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH)
2265 return;
2266
2267 spin_lock_irq(&conf->log->stripe_in_journal_lock);
2268 list_del_init(&sh->r5c);
2269 spin_unlock_irq(&conf->log->stripe_in_journal_lock);
2270 sh->log_start = MaxSector;
2271 atomic_dec(&conf->log->stripe_in_journal_count);
2272 }
2273
2274 int
2275 r5c_cache_data(struct r5l_log *log, struct stripe_head *sh,
2276 struct stripe_head_state *s)
2277 {
2278 struct r5conf *conf = sh->raid_conf;
2279 int pages = 0;
2280 int reserve;
2281 int i;
2282 int ret = 0;
2283
2284 BUG_ON(!log);
2285
2286 for (i = 0; i < sh->disks; i++) {
2287 void *addr;
2288
2289 if (!test_bit(R5_Wantwrite, &sh->dev[i].flags))
2290 continue;
2291 addr = kmap_atomic(sh->dev[i].page);
2292 sh->dev[i].log_checksum = crc32c_le(log->uuid_checksum,
2293 addr, PAGE_SIZE);
2294 kunmap_atomic(addr);
2295 pages++;
2296 }
2297 WARN_ON(pages == 0);
2298
2299         /*
2300          * The stripe must enter the state machine again to call endio,
2301          * so don't delay.
2302          */
2303 clear_bit(STRIPE_DELAYED, &sh->state);
2304 atomic_inc(&sh->count);
2305
2306 mutex_lock(&log->io_mutex);
2307         /* 1 meta block page + data pages, in 512-byte sectors */
2308 reserve = (1 + pages) << (PAGE_SHIFT - 9);
2309
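	/*
	 * If the log is critically low on space and this stripe has no journal
	 * space yet, or there is not enough free space for it, queue it on the
	 * no-space list; otherwise log it now (falling back to the no-mem list
	 * if building the I/O unit fails).
	 */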
2310 if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state) &&
2311 sh->log_start == MaxSector)
2312 r5l_add_no_space_stripe(log, sh);
2313 else if (!r5l_has_free_space(log, reserve)) {
2314 if (sh->log_start == log->last_checkpoint)
2315 BUG();
2316 else
2317 r5l_add_no_space_stripe(log, sh);
2318 } else {
2319 ret = r5l_log_stripe(log, sh, pages, 0);
2320 if (ret) {
2321 spin_lock_irq(&log->io_list_lock);
2322 list_add_tail(&sh->log_list, &log->no_mem_stripes);
2323 spin_unlock_irq(&log->io_list_lock);
2324 }
2325 }
2326
2327 mutex_unlock(&log->io_mutex);
2328 return 0;
2329 }
2330
2331 static int r5l_load_log(struct r5l_log *log)
2332 {
2333 struct md_rdev *rdev = log->rdev;
2334 struct page *page;
2335 struct r5l_meta_block *mb;
2336 sector_t cp = log->rdev->journal_tail;
2337 u32 stored_crc, expected_crc;
2338 bool create_super = false;
2339 int ret;
2340
2341         /* Make sure the stored journal_tail is valid; fall back to sector 0 otherwise */
2342 if (cp >= rdev->sectors || round_down(cp, BLOCK_SECTORS) != cp)
2343 cp = 0;
2344 page = alloc_page(GFP_KERNEL);
2345 if (!page)
2346 return -ENOMEM;
2347
2348 if (!sync_page_io(rdev, cp, PAGE_SIZE, page, REQ_OP_READ, 0, false)) {
2349 ret = -EIO;
2350 goto ioerr;
2351 }
2352 mb = page_address(page);
2353
2354 if (le32_to_cpu(mb->magic) != R5LOG_MAGIC ||
2355 mb->version != R5LOG_VERSION) {
2356 create_super = true;
2357 goto create;
2358 }
2359 stored_crc = le32_to_cpu(mb->checksum);
2360 mb->checksum = 0;
2361 expected_crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE);
2362 if (stored_crc != expected_crc) {
2363 create_super = true;
2364 goto create;
2365 }
2366 if (le64_to_cpu(mb->position) != cp) {
2367 create_super = true;
2368 goto create;
2369 }
2370 create:
2371 if (create_super) {
2372 log->last_cp_seq = prandom_u32();
2373 cp = 0;
2374 r5l_log_write_empty_meta_block(log, cp, log->last_cp_seq);
2375                 /*
2376                  * Make sure the superblock points to the correct address. The
2377                  * log might have data very soon. If the superblock doesn't have
2378                  * the correct log tail address, recovery can't find the log.
2379                  */
2380 r5l_write_super(log, cp);
2381 } else
2382 log->last_cp_seq = le64_to_cpu(mb->seq);
2383
2384 log->device_size = round_down(rdev->sectors, BLOCK_SECTORS);
2385 log->max_free_space = log->device_size >> RECLAIM_MAX_FREE_SPACE_SHIFT;
2386 if (log->max_free_space > RECLAIM_MAX_FREE_SPACE)
2387 log->max_free_space = RECLAIM_MAX_FREE_SPACE;
2388 log->last_checkpoint = cp;
2389 log->next_checkpoint = cp;
2390 mutex_lock(&log->io_mutex);
2391 r5c_update_log_state(log);
2392 mutex_unlock(&log->io_mutex);
2393
2394 __free_page(page);
2395
2396 return r5l_recovery_log(log);
2397 ioerr:
2398 __free_page(page);
2399 return ret;
2400 }
2401
2402 int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
2403 {
2404 struct request_queue *q = bdev_get_queue(rdev->bdev);
2405 struct r5l_log *log;
2406
2407 if (PAGE_SIZE != 4096)
2408 return -EINVAL;
2409
2410 /*
2411          * The PAGE_SIZE must be big enough to hold 1 r5l_meta_block plus
2412          * raid_disks r5l_payload_data_parity structures.
2413          *
2414          * The write journal and cache do not work for very big arrays
2415          * (raid_disks > 203).
2416 */
2417 if (sizeof(struct r5l_meta_block) +
2418 ((sizeof(struct r5l_payload_data_parity) + sizeof(__le32)) *
2419 conf->raid_disks) > PAGE_SIZE) {
2420 pr_err("md/raid:%s: write journal/cache doesn't work for array with %d disks\n",
2421 mdname(conf->mddev), conf->raid_disks);
2422 return -EINVAL;
2423 }
2424
2425 log = kzalloc(sizeof(*log), GFP_KERNEL);
2426 if (!log)
2427 return -ENOMEM;
2428 log->rdev = rdev;
2429
2430 log->need_cache_flush = test_bit(QUEUE_FLAG_WC, &q->queue_flags) != 0;
2431
2432 log->uuid_checksum = crc32c_le(~0, rdev->mddev->uuid,
2433 sizeof(rdev->mddev->uuid));
2434
2435 mutex_init(&log->io_mutex);
2436
2437 spin_lock_init(&log->io_list_lock);
2438 INIT_LIST_HEAD(&log->running_ios);
2439 INIT_LIST_HEAD(&log->io_end_ios);
2440 INIT_LIST_HEAD(&log->flushing_ios);
2441 INIT_LIST_HEAD(&log->finished_ios);
2442 bio_init(&log->flush_bio);
2443
2444 log->io_kc = KMEM_CACHE(r5l_io_unit, 0);
2445 if (!log->io_kc)
2446 goto io_kc;
2447
2448 log->io_pool = mempool_create_slab_pool(R5L_POOL_SIZE, log->io_kc);
2449 if (!log->io_pool)
2450 goto io_pool;
2451
2452 log->bs = bioset_create(R5L_POOL_SIZE, 0);
2453 if (!log->bs)
2454 goto io_bs;
2455
2456 log->meta_pool = mempool_create_page_pool(R5L_POOL_SIZE, 0);
2457 if (!log->meta_pool)
2458 goto out_mempool;
2459
2460 log->reclaim_thread = md_register_thread(r5l_reclaim_thread,
2461 log->rdev->mddev, "reclaim");
2462 if (!log->reclaim_thread)
2463 goto reclaim_thread;
2464 log->reclaim_thread->timeout = R5C_RECLAIM_WAKEUP_INTERVAL;
2465
2466 init_waitqueue_head(&log->iounit_wait);
2467
2468 INIT_LIST_HEAD(&log->no_mem_stripes);
2469
2470 INIT_LIST_HEAD(&log->no_space_stripes);
2471 spin_lock_init(&log->no_space_stripes_lock);
2472
2473 log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH;
2474 INIT_LIST_HEAD(&log->stripe_in_journal_list);
2475 spin_lock_init(&log->stripe_in_journal_lock);
2476 atomic_set(&log->stripe_in_journal_count, 0);
2477
2478 if (r5l_load_log(log))
2479 goto error;
2480
2481 rcu_assign_pointer(conf->log, log);
2482 set_bit(MD_HAS_JOURNAL, &conf->mddev->flags);
2483 return 0;
2484
2485 error:
2486 md_unregister_thread(&log->reclaim_thread);
2487 reclaim_thread:
2488 mempool_destroy(log->meta_pool);
2489 out_mempool:
2490 bioset_free(log->bs);
2491 io_bs:
2492 mempool_destroy(log->io_pool);
2493 io_pool:
2494 kmem_cache_destroy(log->io_kc);
2495 io_kc:
2496 kfree(log);
2497 return -EINVAL;
2498 }
2499
2500 void r5l_exit_log(struct r5l_log *log)
2501 {
2502 md_unregister_thread(&log->reclaim_thread);
2503 mempool_destroy(log->meta_pool);
2504 bioset_free(log->bs);
2505 mempool_destroy(log->io_pool);
2506 kmem_cache_destroy(log->io_kc);
2507 kfree(log);
2508 }