drivers/md/raid5-cache.c

   1 /*
   2  * Copyright (C) 2015 Shaohua Li <shli@fb.com>
   3  *
   4  * This program is free software; you can redistribute it and/or modify it
   5  * under the terms and conditions of the GNU General Public License,
   6  * version 2, as published by the Free Software Foundation.
   7  *
   8  * This program is distributed in the hope it will be useful, but WITHOUT
   9  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  10  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
  11  * more details.
  12  *
  13  */
  14 #include <linux/kernel.h>
  15 #include <linux/wait.h>
  16 #include <linux/blkdev.h>
  17 #include <linux/slab.h>
  18 #include <linux/raid/md_p.h>
  19 #include <linux/crc32c.h>
  20 #include <linux/random.h>
  21 #include "md.h"
  22 #include "raid5.h"
  23 #include "bitmap.h"
  24
  25 /*
  26  * metadata/data stored in disk with 4k size unit (a block) regardless
  27  * underneath hardware sector size. only works with PAGE_SIZE == 4096
  28  */
  29 #define BLOCK_SECTORS (8)
  30
  31 /*
  32  * log->max_free_space is min(1/4 disk size, 10G reclaimable space).
  33  *
  34  * In write through mode, the reclaim runs every log->max_free_space.
  35  * This can prevent the recovery scans for too long
  36  */
  37 #define RECLAIM_MAX_FREE_SPACE (10 * 1024 * 1024 * 2) /* sector */
  38 #define RECLAIM_MAX_FREE_SPACE_SHIFT (2)
  39
  40 /* wake up reclaim thread periodically */
  41 #define R5C_RECLAIM_WAKEUP_INTERVAL (30 * HZ)
  42 /* start flush with these full stripes */
  43 #define R5C_FULL_STRIPE_FLUSH_BATCH 256
  44 /* reclaim stripes in groups */
  45 #define R5C_RECLAIM_STRIPE_GROUP (NR_STRIPE_HASH_LOCKS * 2)
  46
  47 /*
  48  * We only need 2 bios per I/O unit to make progress, but ensure we
  49  * have a few more available to not get too tight.
  50  */
  51 #define R5L_POOL_SIZE   4
  52
  53 /*
  54  * r5c journal modes of the array: write-back or write-through.
  55  * write-through mode has identical behavior as existing log only
  56  * implementation.
  57  */
  58 enum r5c_journal_mode {
  59         R5C_JOURNAL_MODE_WRITE_THROUGH = 0,
  60         R5C_JOURNAL_MODE_WRITE_BACK = 1,
  61 };
  62
  63 /*
  64  * raid5 cache state machine
  65  *
  66  * With rhe RAID cache, each stripe works in two phases:
  67  *      - caching phase
  68  *      - writing-out phase
  69  *
  70  * These two phases are controlled by bit STRIPE_R5C_CACHING:
  71  *   if STRIPE_R5C_CACHING == 0, the stripe is in writing-out phase
  72  *   if STRIPE_R5C_CACHING == 1, the stripe is in caching phase
  73  *
  74  * When there is no journal, or the journal is in write-through mode,
  75  * the stripe is always in writing-out phase.
  76  *
  77  * For write-back journal, the stripe is sent to caching phase on write
  78  * (r5c_try_caching_write). r5c_make_stripe_write_out() kicks off
  79  * the write-out phase by clearing STRIPE_R5C_CACHING.
  80  *
  81  * Stripes in caching phase do not write the raid disks. Instead, all
  82  * writes are committed from the log device. Therefore, a stripe in
  83  * caching phase handles writes as:
  84  *      - write to log device
  85  *      - return IO
  86  *
  87  * Stripes in writing-out phase handle writes as:
  88  *      - calculate parity
  89  *      - write pending data and parity to journal
  90  *      - write data and parity to raid disks
  91  *      - return IO for pending writes
  92  */
  93
  94 struct r5l_log {
  95         struct md_rdev *rdev;
  96
  97         u32 uuid_checksum;
  98
  99         sector_t device_size;           /* log device size, round to
 100                                          * BLOCK_SECTORS */
 101         sector_t max_free_space;        /* reclaim run if free space is at
 102                                          * this size */
 103
 104         sector_t last_checkpoint;       /* log tail. where recovery scan
 105                                          * starts from */
 106         u64 last_cp_seq;                /* log tail sequence */
 107
 108         sector_t log_start;             /* log head. where new data appends */
 109         u64 seq;                        /* log head sequence */
 110
 111         sector_t next_checkpoint;
 112         u64 next_cp_seq;
 113
 114         struct mutex io_mutex;
 115         struct r5l_io_unit *current_io; /* current io_unit accepting new data */
 116
 117         spinlock_t io_list_lock;
 118         struct list_head running_ios;   /* io_units which are still running,
 119                                          * and have not yet been completely
 120                                          * written to the log */
 121         struct list_head io_end_ios;    /* io_units which have been completely
 122                                          * written to the log but not yet written
 123                                          * to the RAID */
 124         struct list_head flushing_ios;  /* io_units which are waiting for log
 125                                          * cache flush */
 126         struct list_head finished_ios;  /* io_units which settle down in log disk */
 127         struct bio flush_bio;
 128
 129         struct list_head no_mem_stripes;   /* pending stripes, -ENOMEM */
 130
 131         struct kmem_cache *io_kc;
 132         mempool_t *io_pool;
 133         struct bio_set *bs;
 134         mempool_t *meta_pool;
 135
 136         struct md_thread *reclaim_thread;
 137         unsigned long reclaim_target;   /* number of space that need to be
 138                                          * reclaimed.  if it's 0, reclaim spaces
 139                                          * used by io_units which are in
 140                                          * IO_UNIT_STRIPE_END state (eg, reclaim
 141                                          * dones't wait for specific io_unit
 142                                          * switching to IO_UNIT_STRIPE_END
 143                                          * state) */
 144         wait_queue_head_t iounit_wait;
 145
 146         struct list_head no_space_stripes; /* pending stripes, log has no space */
 147         spinlock_t no_space_stripes_lock;
 148
 149         bool need_cache_flush;
 150
 151         /* for r5c_cache */
 152         enum r5c_journal_mode r5c_journal_mode;
 153
 154         /* all stripes in r5cache, in the order of seq at sh->log_start */
 155         struct list_head stripe_in_journal_list;
 156
 157         spinlock_t stripe_in_journal_lock;
 158         atomic_t stripe_in_journal_count;
 159 };
 160
 161 /*
 162  * an IO range starts from a meta data block and end at the next meta data
 163  * block. The io unit's the meta data block tracks data/parity followed it. io
 164  * unit is written to log disk with normal write, as we always flush log disk
 165  * first and then start move data to raid disks, there is no requirement to
 166  * write io unit with FLUSH/FUA
 167  */
 168 struct r5l_io_unit {
 169         struct r5l_log *log;
 170
 171         struct page *meta_page; /* store meta block */
 172         int meta_offset;        /* current offset in meta_page */
 173
 174         struct bio *current_bio;/* current_bio accepting new data */
 175
 176         atomic_t pending_stripe;/* how many stripes not flushed to raid */
 177         u64 seq;                /* seq number of the metablock */
 178         sector_t log_start;     /* where the io_unit starts */
 179         sector_t log_end;       /* where the io_unit ends */
 180         struct list_head log_sibling; /* log->running_ios */
 181         struct list_head stripe_list; /* stripes added to the io_unit */
 182
 183         int state;
 184         bool need_split_bio;
 185 };
 186
 187 /* r5l_io_unit state */
 188 enum r5l_io_unit_state {
 189         IO_UNIT_RUNNING = 0,    /* accepting new IO */
 190         IO_UNIT_IO_START = 1,   /* io_unit bio start writing to log,
 191                                  * don't accepting new bio */
 192         IO_UNIT_IO_END = 2,     /* io_unit bio finish writing to log */
 193         IO_UNIT_STRIPE_END = 3, /* stripes data finished writing to raid */
 194 };
 195
 196 bool r5c_is_writeback(struct r5l_log *log)
 197 {
 198         return (log != NULL &&
 199                 log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_BACK);
 200 }
 201
 202 static sector_t r5l_ring_add(struct r5l_log *log, sector_t start, sector_t inc)
 203 {
 204         start += inc;
 205         if (start >= log->device_size)
 206                 start = start - log->device_size;
 207         return start;
 208 }
 209
 210 static sector_t r5l_ring_distance(struct r5l_log *log, sector_t start,
 211                                   sector_t end)
 212 {
 213         if (end >= start)
 214                 return end - start;
 215         else
 216                 return end + log->device_size - start;
 217 }
 218
 219 static bool r5l_has_free_space(struct r5l_log *log, sector_t size)
 220 {
 221         sector_t used_size;
 222
 223         used_size = r5l_ring_distance(log, log->last_checkpoint,
 224                                         log->log_start);
 225
 226         return log->device_size > used_size + size;
 227 }
 228
 229 static void __r5l_set_io_unit_state(struct r5l_io_unit *io,
 230                                     enum r5l_io_unit_state state)
 231 {
 232         if (WARN_ON(io->state >= state))
 233                 return;
 234         io->state = state;
 235 }
 236
 237 static void
 238 r5c_return_dev_pending_writes(struct r5conf *conf, struct r5dev *dev,
 239                               struct bio_list *return_bi)
 240 {
 241         struct bio *wbi, *wbi2;
 242
 243         wbi = dev->written;
 244         dev->written = NULL;
 245         while (wbi && wbi->bi_iter.bi_sector <
 246                dev->sector + STRIPE_SECTORS) {
 247                 wbi2 = r5_next_bio(wbi, dev->sector);
 248                 if (!raid5_dec_bi_active_stripes(wbi)) {
 249                         md_write_end(conf->mddev);
 250                         bio_list_add(return_bi, wbi);
 251                 }
 252                 wbi = wbi2;
 253         }
 254 }
 255
 256 void r5c_handle_cached_data_endio(struct r5conf *conf,
 257           struct stripe_head *sh, int disks, struct bio_list *return_bi)
 258 {
 259         int i;
 260
 261         for (i = sh->disks; i--; ) {
 262                 if (sh->dev[i].written) {
 263                         set_bit(R5_UPTODATE, &sh->dev[i].flags);
 264                         r5c_return_dev_pending_writes(conf, &sh->dev[i],
 265                                                       return_bi);
 266                         bitmap_endwrite(conf->mddev->bitmap, sh->sector,
 267                                         STRIPE_SECTORS,
 268                                         !test_bit(STRIPE_DEGRADED, &sh->state),
 269                                         0);
 270                 }
 271         }
 272 }
 273
 274 /* Check whether we should flush some stripes to free up stripe cache */
 275 void r5c_check_stripe_cache_usage(struct r5conf *conf)
 276 {
 277         int total_cached;
 278
 279         if (!r5c_is_writeback(conf->log))
 280                 return;
 281
 282         total_cached = atomic_read(&conf->r5c_cached_partial_stripes) +
 283                 atomic_read(&conf->r5c_cached_full_stripes);
 284
 285         /*
 286          * The following condition is true for either of the following:
 287          *   - stripe cache pressure high:
 288          *          total_cached > 3/4 min_nr_stripes ||
 289          *          empty_inactive_list_nr > 0
 290          *   - stripe cache pressure moderate:
 291          *          total_cached > 1/2 min_nr_stripes
 292          */
 293         if (total_cached > conf->min_nr_stripes * 1 / 2 ||
 294             atomic_read(&conf->empty_inactive_list_nr) > 0)
 295                 r5l_wake_reclaim(conf->log, 0);
 296 }
 297
 298 /*
 299  * flush cache when there are R5C_FULL_STRIPE_FLUSH_BATCH or more full
 300  * stripes in the cache
 301  */
 302 void r5c_check_cached_full_stripe(struct r5conf *conf)
 303 {
 304         if (!r5c_is_writeback(conf->log))
 305                 return;
 306
 307         /*
 308          * wake up reclaim for R5C_FULL_STRIPE_FLUSH_BATCH cached stripes
 309          * or a full stripe (chunk size / 4k stripes).
 310          */
 311         if (atomic_read(&conf->r5c_cached_full_stripes) >=
 312             min(R5C_FULL_STRIPE_FLUSH_BATCH,
 313                 conf->chunk_sectors >> STRIPE_SHIFT))
 314                 r5l_wake_reclaim(conf->log, 0);
 315 }
 316
 317 /*
 318  * Total log space (in sectors) needed to flush all data in cache
 319  *
 320  * Currently, writing-out phase automatically includes all pending writes
 321  * to the same sector. So the reclaim of each stripe takes up to
 322  * (conf->raid_disks + 1) pages of log space.
 323  *
 324  * To totally avoid deadlock due to log space, the code reserves
 325  * (conf->raid_disks + 1) pages for each stripe in cache, which is not
 326  * necessary in most cases.
 327  *
 328  * To improve this, we will need writing-out phase to be able to NOT include
 329  * pending writes, which will reduce the requirement to
 330  * (conf->max_degraded + 1) pages per stripe in cache.
 331  */
 332 static sector_t r5c_log_required_to_flush_cache(struct r5conf *conf)
 333 {
 334         struct r5l_log *log = conf->log;
 335
 336         if (!r5c_is_writeback(log))
 337                 return 0;
 338
 339         return BLOCK_SECTORS * (conf->raid_disks + 1) *
 340                 atomic_read(&log->stripe_in_journal_count);
 341 }
 342
 343 /*
 344  * evaluate log space usage and update R5C_LOG_TIGHT and R5C_LOG_CRITICAL
 345  *
 346  * R5C_LOG_TIGHT is set when free space on the log device is less than 3x of
 347  * reclaim_required_space. R5C_LOG_CRITICAL is set when free space on the log
 348  * device is less than 2x of reclaim_required_space.
 349  */
 350 static inline void r5c_update_log_state(struct r5l_log *log)
 351 {
 352         struct r5conf *conf = log->rdev->mddev->private;
 353         sector_t free_space;
 354         sector_t reclaim_space;
 355
 356         if (!r5c_is_writeback(log))
 357                 return;
 358
 359         free_space = r5l_ring_distance(log, log->log_start,
 360                                        log->last_checkpoint);
 361         reclaim_space = r5c_log_required_to_flush_cache(conf);
 362         if (free_space < 2 * reclaim_space)
 363                 set_bit(R5C_LOG_CRITICAL, &conf->cache_state);
 364         else
 365                 clear_bit(R5C_LOG_CRITICAL, &conf->cache_state);
 366         if (free_space < 3 * reclaim_space)
 367                 set_bit(R5C_LOG_TIGHT, &conf->cache_state);
 368         else
 369                 clear_bit(R5C_LOG_TIGHT, &conf->cache_state);
 370 }
 371
 372 /*
 373  * Put the stripe into writing-out phase by clearing STRIPE_R5C_CACHING.
 374  * This function should only be called in write-back mode.
 375  */
 376 void r5c_make_stripe_write_out(struct stripe_head *sh)
 377 {
 378         struct r5conf *conf = sh->raid_conf;
 379         struct r5l_log *log = conf->log;
 380
 381         BUG_ON(!r5c_is_writeback(log));
 382
 383         WARN_ON(!test_bit(STRIPE_R5C_CACHING, &sh->state));
 384         clear_bit(STRIPE_R5C_CACHING, &sh->state);
 385
 386         if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
 387                 atomic_inc(&conf->preread_active_stripes);
 388
 389         if (test_and_clear_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state)) {
 390                 BUG_ON(atomic_read(&conf->r5c_cached_partial_stripes) == 0);
 391                 atomic_dec(&conf->r5c_cached_partial_stripes);
 392         }
 393
 394         if (test_and_clear_bit(STRIPE_R5C_FULL_STRIPE, &sh->state)) {
 395                 BUG_ON(atomic_read(&conf->r5c_cached_full_stripes) == 0);
 396                 atomic_dec(&conf->r5c_cached_full_stripes);
 397         }
 398 }
 399
 400 static void r5c_handle_data_cached(struct stripe_head *sh)
 401 {
 402         int i;
 403
 404         for (i = sh->disks; i--; )
 405                 if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) {
 406                         set_bit(R5_InJournal, &sh->dev[i].flags);
 407                         clear_bit(R5_LOCKED, &sh->dev[i].flags);
 408                 }
 409         clear_bit(STRIPE_LOG_TRAPPED, &sh->state);
 410 }
 411
 412 /*
 413  * this journal write must contain full parity,
 414  * it may also contain some data pages
 415  */
 416 static void r5c_handle_parity_cached(struct stripe_head *sh)
 417 {
 418         int i;
 419
 420         for (i = sh->disks; i--; )
 421                 if (test_bit(R5_InJournal, &sh->dev[i].flags))
 422                         set_bit(R5_Wantwrite, &sh->dev[i].flags);
 423 }
 424
 425 /*
 426  * Setting proper flags after writing (or flushing) data and/or parity to the
 427  * log device. This is called from r5l_log_endio() or r5l_log_flush_endio().
 428  */
 429 static void r5c_finish_cache_stripe(struct stripe_head *sh)
 430 {
 431         struct r5l_log *log = sh->raid_conf->log;
 432
 433         if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) {
 434                 BUG_ON(test_bit(STRIPE_R5C_CACHING, &sh->state));
 435                 /*
 436                  * Set R5_InJournal for parity dev[pd_idx]. This means
 437                  * all data AND parity in the journal. For RAID 6, it is
 438                  * NOT necessary to set the flag for dev[qd_idx], as the
 439                  * two parities are written out together.
 440                  */
 441                 set_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags);
 442         } else if (test_bit(STRIPE_R5C_CACHING, &sh->state)) {
 443                 r5c_handle_data_cached(sh);
 444         } else {
 445                 r5c_handle_parity_cached(sh);
 446                 set_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags);
 447         }
 448 }
 449
 450 static void r5l_io_run_stripes(struct r5l_io_unit *io)
 451 {
 452         struct stripe_head *sh, *next;
 453
 454         list_for_each_entry_safe(sh, next, &io->stripe_list, log_list) {
 455                 list_del_init(&sh->log_list);
 456
 457                 r5c_finish_cache_stripe(sh);
 458
 459                 set_bit(STRIPE_HANDLE, &sh->state);
 460                 raid5_release_stripe(sh);
 461         }
 462 }
 463
 464 static void r5l_log_run_stripes(struct r5l_log *log)
 465 {
 466         struct r5l_io_unit *io, *next;
 467
 468         assert_spin_locked(&log->io_list_lock);
 469
 470         list_for_each_entry_safe(io, next, &log->running_ios, log_sibling) {
 471                 /* don't change list order */
 472                 if (io->state < IO_UNIT_IO_END)
 473                         break;
 474
 475                 list_move_tail(&io->log_sibling, &log->finished_ios);
 476                 r5l_io_run_stripes(io);
 477         }
 478 }
 479
 480 static void r5l_move_to_end_ios(struct r5l_log *log)
 481 {
 482         struct r5l_io_unit *io, *next;
 483
 484         assert_spin_locked(&log->io_list_lock);
 485
 486         list_for_each_entry_safe(io, next, &log->running_ios, log_sibling) {
 487                 /* don't change list order */
 488                 if (io->state < IO_UNIT_IO_END)
 489                         break;
 490                 list_move_tail(&io->log_sibling, &log->io_end_ios);
 491         }
 492 }
 493
 494 static void r5l_log_endio(struct bio *bio)
 495 {
 496         struct r5l_io_unit *io = bio->bi_private;
 497         struct r5l_log *log = io->log;
 498         unsigned long flags;
 499
 500         if (bio->bi_error)
 501                 md_error(log->rdev->mddev, log->rdev);
 502
 503         bio_put(bio);
 504         mempool_free(io->meta_page, log->meta_pool);
 505
 506         spin_lock_irqsave(&log->io_list_lock, flags);
 507         __r5l_set_io_unit_state(io, IO_UNIT_IO_END);
 508         if (log->need_cache_flush)
 509                 r5l_move_to_end_ios(log);
 510         else
 511                 r5l_log_run_stripes(log);
 512         spin_unlock_irqrestore(&log->io_list_lock, flags);
 513
 514         if (log->need_cache_flush)
 515                 md_wakeup_thread(log->rdev->mddev->thread);
 516 }
 517
 518 static void r5l_submit_current_io(struct r5l_log *log)
 519 {
 520         struct r5l_io_unit *io = log->current_io;
 521         struct r5l_meta_block *block;
 522         unsigned long flags;
 523         u32 crc;
 524
 525         if (!io)
 526                 return;
 527
 528         block = page_address(io->meta_page);
 529         block->meta_size = cpu_to_le32(io->meta_offset);
 530         crc = crc32c_le(log->uuid_checksum, block, PAGE_SIZE);
 531         block->checksum = cpu_to_le32(crc);
 532
 533         log->current_io = NULL;
 534         spin_lock_irqsave(&log->io_list_lock, flags);
 535         __r5l_set_io_unit_state(io, IO_UNIT_IO_START);
 536         spin_unlock_irqrestore(&log->io_list_lock, flags);
 537
 538         submit_bio(io->current_bio);
 539 }
 540
 541 static struct bio *r5l_bio_alloc(struct r5l_log *log)
 542 {
 543         struct bio *bio = bio_alloc_bioset(GFP_NOIO, BIO_MAX_PAGES, log->bs);
 544
 545         bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
 546         bio->bi_bdev = log->rdev->bdev;
 547         bio->bi_iter.bi_sector = log->rdev->data_offset + log->log_start;
 548
 549         return bio;
 550 }
 551
 552 static void r5_reserve_log_entry(struct r5l_log *log, struct r5l_io_unit *io)
 553 {
 554         log->log_start = r5l_ring_add(log, log->log_start, BLOCK_SECTORS);
 555
 556         r5c_update_log_state(log);
 557         /*
 558          * If we filled up the log device start from the beginning again,
 559          * which will require a new bio.
 560          *
 561          * Note: for this to work properly the log size needs to me a multiple
 562          * of BLOCK_SECTORS.
 563          */
 564         if (log->log_start == 0)
 565                 io->need_split_bio = true;
 566
 567         io->log_end = log->log_start;
 568 }
 569
 570 static struct r5l_io_unit *r5l_new_meta(struct r5l_log *log)
 571 {
 572         struct r5l_io_unit *io;
 573         struct r5l_meta_block *block;
 574
 575         io = mempool_alloc(log->io_pool, GFP_ATOMIC);
 576         if (!io)
 577                 return NULL;
 578         memset(io, 0, sizeof(*io));
 579
 580         io->log = log;
 581         INIT_LIST_HEAD(&io->log_sibling);
 582         INIT_LIST_HEAD(&io->stripe_list);
 583         io->state = IO_UNIT_RUNNING;
 584
 585         io->meta_page = mempool_alloc(log->meta_pool, GFP_NOIO);
 586         block = page_address(io->meta_page);
 587         clear_page(block);
 588         block->magic = cpu_to_le32(R5LOG_MAGIC);
 589         block->version = R5LOG_VERSION;
 590         block->seq = cpu_to_le64(log->seq);
 591         block->position = cpu_to_le64(log->log_start);
 592
 593         io->log_start = log->log_start;
 594         io->meta_offset = sizeof(struct r5l_meta_block);
 595         io->seq = log->seq++;
 596
 597         io->current_bio = r5l_bio_alloc(log);
 598         io->current_bio->bi_end_io = r5l_log_endio;
 599         io->current_bio->bi_private = io;
 600         bio_add_page(io->current_bio, io->meta_page, PAGE_SIZE, 0);
 601
 602         r5_reserve_log_entry(log, io);
 603
 604         spin_lock_irq(&log->io_list_lock);
 605         list_add_tail(&io->log_sibling, &log->running_ios);
 606         spin_unlock_irq(&log->io_list_lock);
 607
 608         return io;
 609 }
 610
 611 static int r5l_get_meta(struct r5l_log *log, unsigned int payload_size)
 612 {
 613         if (log->current_io &&
 614             log->current_io->meta_offset + payload_size > PAGE_SIZE)
 615                 r5l_submit_current_io(log);
 616
 617         if (!log->current_io) {
 618                 log->current_io = r5l_new_meta(log);
 619                 if (!log->current_io)
 620                         return -ENOMEM;
 621         }
 622
 623         return 0;
 624 }
 625
 626 static void r5l_append_payload_meta(struct r5l_log *log, u16 type,
 627                                     sector_t location,
 628                                     u32 checksum1, u32 checksum2,
 629                                     bool checksum2_valid)
 630 {
 631         struct r5l_io_unit *io = log->current_io;
 632         struct r5l_payload_data_parity *payload;
 633
 634         payload = page_address(io->meta_page) + io->meta_offset;
 635         payload->header.type = cpu_to_le16(type);
 636         payload->header.flags = cpu_to_le16(0);
 637         payload->size = cpu_to_le32((1 + !!checksum2_valid) <<
 638                                     (PAGE_SHIFT - 9));
 639         payload->location = cpu_to_le64(location);
 640         payload->checksum[0] = cpu_to_le32(checksum1);
 641         if (checksum2_valid)
 642                 payload->checksum[1] = cpu_to_le32(checksum2);
 643
 644         io->meta_offset += sizeof(struct r5l_payload_data_parity) +
 645                 sizeof(__le32) * (1 + !!checksum2_valid);
 646 }
 647
 648 static void r5l_append_payload_page(struct r5l_log *log, struct page *page)
 649 {
 650         struct r5l_io_unit *io = log->current_io;
 651
 652         if (io->need_split_bio) {
 653                 struct bio *prev = io->current_bio;
 654
 655                 io->current_bio = r5l_bio_alloc(log);
 656                 bio_chain(io->current_bio, prev);
 657
 658                 submit_bio(prev);
 659         }
 660
 661         if (!bio_add_page(io->current_bio, page, PAGE_SIZE, 0))
 662                 BUG();
 663
 664         r5_reserve_log_entry(log, io);
 665 }
 666
 667 static int r5l_log_stripe(struct r5l_log *log, struct stripe_head *sh,
 668                            int data_pages, int parity_pages)
 669 {
 670         int i;
 671         int meta_size;
 672         int ret;
 673         struct r5l_io_unit *io;
 674
 675         meta_size =
 676                 ((sizeof(struct r5l_payload_data_parity) + sizeof(__le32))
 677                  * data_pages) +
 678                 sizeof(struct r5l_payload_data_parity) +
 679                 sizeof(__le32) * parity_pages;
 680
 681         ret = r5l_get_meta(log, meta_size);
 682         if (ret)
 683                 return ret;
 684
 685         io = log->current_io;
 686
 687         for (i = 0; i < sh->disks; i++) {
 688                 if (!test_bit(R5_Wantwrite, &sh->dev[i].flags) ||
 689                     test_bit(R5_InJournal, &sh->dev[i].flags))
 690                         continue;
 691                 if (i == sh->pd_idx || i == sh->qd_idx)
 692                         continue;
 693                 r5l_append_payload_meta(log, R5LOG_PAYLOAD_DATA,
 694                                         raid5_compute_blocknr(sh, i, 0),
 695                                         sh->dev[i].log_checksum, 0, false);
 696                 r5l_append_payload_page(log, sh->dev[i].page);
 697         }
 698
 699         if (parity_pages == 2) {
 700                 r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY,
 701                                         sh->sector, sh->dev[sh->pd_idx].log_checksum,
 702                                         sh->dev[sh->qd_idx].log_checksum, true);
 703                 r5l_append_payload_page(log, sh->dev[sh->pd_idx].page);
 704                 r5l_append_payload_page(log, sh->dev[sh->qd_idx].page);
 705         } else if (parity_pages == 1) {
 706                 r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY,
 707                                         sh->sector, sh->dev[sh->pd_idx].log_checksum,
 708                                         0, false);
 709                 r5l_append_payload_page(log, sh->dev[sh->pd_idx].page);
 710         } else  /* Just writing data, not parity, in caching phase */
 711                 BUG_ON(parity_pages != 0);
 712
 713         list_add_tail(&sh->log_list, &io->stripe_list);
 714         atomic_inc(&io->pending_stripe);
 715         sh->log_io = io;
 716
 717         if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH)
 718                 return 0;
 719
 720         if (sh->log_start == MaxSector) {
 721                 BUG_ON(!list_empty(&sh->r5c));
 722                 sh->log_start = io->log_start;
 723                 spin_lock_irq(&log->stripe_in_journal_lock);
 724                 list_add_tail(&sh->r5c,
 725                               &log->stripe_in_journal_list);
 726                 spin_unlock_irq(&log->stripe_in_journal_lock);
 727                 atomic_inc(&log->stripe_in_journal_count);
 728         }
 729         return 0;
 730 }
 731
 732 /* add stripe to no_space_stripes, and then wake up reclaim */
 733 static inline void r5l_add_no_space_stripe(struct r5l_log *log,
 734                                            struct stripe_head *sh)
 735 {
 736         spin_lock(&log->no_space_stripes_lock);
 737         list_add_tail(&sh->log_list, &log->no_space_stripes);
 738         spin_unlock(&log->no_space_stripes_lock);
 739 }
 740
 741 /*
 742  * running in raid5d, where reclaim could wait for raid5d too (when it flushes
 743  * data from log to raid disks), so we shouldn't wait for reclaim here
 744  */
 745 int r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh)
 746 {
 747         struct r5conf *conf = sh->raid_conf;
 748         int write_disks = 0;
 749         int data_pages, parity_pages;
 750         int reserve;
 751         int i;
 752         int ret = 0;
 753         bool wake_reclaim = false;
 754
 755         if (!log)
 756                 return -EAGAIN;
 757         /* Don't support stripe batch */
 758         if (sh->log_io || !test_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags) ||
 759             test_bit(STRIPE_SYNCING, &sh->state)) {
 760                 /* the stripe is written to log, we start writing it to raid */
 761                 clear_bit(STRIPE_LOG_TRAPPED, &sh->state);
 762                 return -EAGAIN;
 763         }
 764
 765         WARN_ON(test_bit(STRIPE_R5C_CACHING, &sh->state));
 766
 767         for (i = 0; i < sh->disks; i++) {
 768                 void *addr;
 769
 770                 if (!test_bit(R5_Wantwrite, &sh->dev[i].flags) ||
 771                     test_bit(R5_InJournal, &sh->dev[i].flags))
 772                         continue;
 773
 774                 write_disks++;
 775                 /* checksum is already calculated in last run */
 776                 if (test_bit(STRIPE_LOG_TRAPPED, &sh->state))
 777                         continue;
 778                 addr = kmap_atomic(sh->dev[i].page);
 779                 sh->dev[i].log_checksum = crc32c_le(log->uuid_checksum,
 780                                                     addr, PAGE_SIZE);
 781                 kunmap_atomic(addr);
 782         }
 783         parity_pages = 1 + !!(sh->qd_idx >= 0);
 784         data_pages = write_disks - parity_pages;
 785
 786         set_bit(STRIPE_LOG_TRAPPED, &sh->state);
 787         /*
 788          * The stripe must enter state machine again to finish the write, so
 789          * don't delay.
 790          */
 791         clear_bit(STRIPE_DELAYED, &sh->state);
 792         atomic_inc(&sh->count);
 793
 794         mutex_lock(&log->io_mutex);
 795         /* meta + data */
 796         reserve = (1 + write_disks) << (PAGE_SHIFT - 9);
 797
 798         if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) {
 799                 if (!r5l_has_free_space(log, reserve)) {
 800                         r5l_add_no_space_stripe(log, sh);
 801                         wake_reclaim = true;
 802                 } else {
 803                         ret = r5l_log_stripe(log, sh, data_pages, parity_pages);
 804                         if (ret) {
 805                                 spin_lock_irq(&log->io_list_lock);
 806                                 list_add_tail(&sh->log_list,
 807                                               &log->no_mem_stripes);
 808                                 spin_unlock_irq(&log->io_list_lock);
 809                         }
 810                 }
 811         } else {  /* R5C_JOURNAL_MODE_WRITE_BACK */
 812                 /*
 813                  * log space critical, do not process stripes that are
 814                  * not in cache yet (sh->log_start == MaxSector).
 815                  */
 816                 if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state) &&
 817                     sh->log_start == MaxSector) {
 818                         r5l_add_no_space_stripe(log, sh);
 819                         wake_reclaim = true;
 820                         reserve = 0;
 821                 } else if (!r5l_has_free_space(log, reserve)) {
 822                         if (sh->log_start == log->last_checkpoint)
 823                                 BUG();
 824                         else
 825                                 r5l_add_no_space_stripe(log, sh);
 826                 } else {
 827                         ret = r5l_log_stripe(log, sh, data_pages, parity_pages);
 828                         if (ret) {
 829                                 spin_lock_irq(&log->io_list_lock);
 830                                 list_add_tail(&sh->log_list,
 831                                               &log->no_mem_stripes);
 832                                 spin_unlock_irq(&log->io_list_lock);
 833                         }
 834                 }
 835         }
 836
 837         mutex_unlock(&log->io_mutex);
 838         if (wake_reclaim)
 839                 r5l_wake_reclaim(log, reserve);
 840         return 0;
 841 }
 842
 843 void r5l_write_stripe_run(struct r5l_log *log)
 844 {
 845         if (!log)
 846                 return;
 847         mutex_lock(&log->io_mutex);
 848         r5l_submit_current_io(log);
 849         mutex_unlock(&log->io_mutex);
 850 }
 851
 852 int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio)
 853 {
 854         if (!log)
 855                 return -ENODEV;
 856         /*
 857          * we flush log disk cache first, then write stripe data to raid disks.
 858          * So if bio is finished, the log disk cache is flushed already. The
 859          * recovery guarantees we can recovery the bio from log disk, so we
 860          * don't need to flush again
 861          */
 862         if (bio->bi_iter.bi_size == 0) {
 863                 bio_endio(bio);
 864                 return 0;
 865         }
 866         bio->bi_opf &= ~REQ_PREFLUSH;
 867         return -EAGAIN;
 868 }
 869
 870 /* This will run after log space is reclaimed */
 871 static void r5l_run_no_space_stripes(struct r5l_log *log)
 872 {
 873         struct stripe_head *sh;
 874
 875         spin_lock(&log->no_space_stripes_lock);
 876         while (!list_empty(&log->no_space_stripes)) {
 877                 sh = list_first_entry(&log->no_space_stripes,
 878                                       struct stripe_head, log_list);
 879                 list_del_init(&sh->log_list);
 880                 set_bit(STRIPE_HANDLE, &sh->state);
 881                 raid5_release_stripe(sh);
 882         }
 883         spin_unlock(&log->no_space_stripes_lock);
 884 }
 885
 886 /*
 887  * calculate new last_checkpoint
 888  * for write through mode, returns log->next_checkpoint
 889  * for write back, returns log_start of first sh in stripe_in_journal_list
 890  */
 891 static sector_t r5c_calculate_new_cp(struct r5conf *conf)
 892 {
 893         struct stripe_head *sh;
 894         struct r5l_log *log = conf->log;
 895         sector_t new_cp;
 896         unsigned long flags;
 897
 898         if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH)
 899                 return log->next_checkpoint;
 900
 901         spin_lock_irqsave(&log->stripe_in_journal_lock, flags);
 902         if (list_empty(&conf->log->stripe_in_journal_list)) {
 903                 /* all stripes flushed */
 904                 spin_unlock(&log->stripe_in_journal_lock);
 905                 return log->next_checkpoint;
 906         }
 907         sh = list_first_entry(&conf->log->stripe_in_journal_list,
 908                               struct stripe_head, r5c);
 909         new_cp = sh->log_start;
 910         spin_unlock_irqrestore(&log->stripe_in_journal_lock, flags);
 911         return new_cp;
 912 }
 913
 914 static sector_t r5l_reclaimable_space(struct r5l_log *log)
 915 {
 916         struct r5conf *conf = log->rdev->mddev->private;
 917
 918         return r5l_ring_distance(log, log->last_checkpoint,
 919                                  r5c_calculate_new_cp(conf));
 920 }
 921
 922 static void r5l_run_no_mem_stripe(struct r5l_log *log)
 923 {
 924         struct stripe_head *sh;
 925
 926         assert_spin_locked(&log->io_list_lock);
 927
 928         if (!list_empty(&log->no_mem_stripes)) {
 929                 sh = list_first_entry(&log->no_mem_stripes,
 930                                       struct stripe_head, log_list);
 931                 list_del_init(&sh->log_list);
 932                 set_bit(STRIPE_HANDLE, &sh->state);
 933                 raid5_release_stripe(sh);
 934         }
 935 }
 936
 937 static bool r5l_complete_finished_ios(struct r5l_log *log)
 938 {
 939         struct r5l_io_unit *io, *next;
 940         bool found = false;
 941
 942         assert_spin_locked(&log->io_list_lock);
 943
 944         list_for_each_entry_safe(io, next, &log->finished_ios, log_sibling) {
 945                 /* don't change list order */
 946                 if (io->state < IO_UNIT_STRIPE_END)
 947                         break;
 948
 949                 log->next_checkpoint = io->log_start;
 950                 log->next_cp_seq = io->seq;
 951
 952                 list_del(&io->log_sibling);
 953                 mempool_free(io, log->io_pool);
 954                 r5l_run_no_mem_stripe(log);
 955
 956                 found = true;
 957         }
 958
 959         return found;
 960 }
 961
 962 static void __r5l_stripe_write_finished(struct r5l_io_unit *io)
 963 {
 964         struct r5l_log *log = io->log;
 965         struct r5conf *conf = log->rdev->mddev->private;
 966         unsigned long flags;
 967
 968         spin_lock_irqsave(&log->io_list_lock, flags);
 969         __r5l_set_io_unit_state(io, IO_UNIT_STRIPE_END);
 970
 971         if (!r5l_complete_finished_ios(log)) {
 972                 spin_unlock_irqrestore(&log->io_list_lock, flags);
 973                 return;
 974         }
 975
 976         if (r5l_reclaimable_space(log) > log->max_free_space ||
 977             test_bit(R5C_LOG_TIGHT, &conf->cache_state))
 978                 r5l_wake_reclaim(log, 0);
 979
 980         spin_unlock_irqrestore(&log->io_list_lock, flags);
 981         wake_up(&log->iounit_wait);
 982 }
 983
 984 void r5l_stripe_write_finished(struct stripe_head *sh)
 985 {
 986         struct r5l_io_unit *io;
 987
 988         io = sh->log_io;
 989         sh->log_io = NULL;
 990
 991         if (io && atomic_dec_and_test(&io->pending_stripe))
 992                 __r5l_stripe_write_finished(io);
 993 }
 994
 995 static void r5l_log_flush_endio(struct bio *bio)
 996 {
 997         struct r5l_log *log = container_of(bio, struct r5l_log,
 998                 flush_bio);
 999         unsigned long flags;
1000         struct r5l_io_unit *io;
1001
1002         if (bio->bi_error)
1003                 md_error(log->rdev->mddev, log->rdev);
1004
1005         spin_lock_irqsave(&log->io_list_lock, flags);
1006         list_for_each_entry(io, &log->flushing_ios, log_sibling)
1007                 r5l_io_run_stripes(io);
1008         list_splice_tail_init(&log->flushing_ios, &log->finished_ios);
1009         spin_unlock_irqrestore(&log->io_list_lock, flags);
1010 }
1011
1012 /*
1013  * Starting dispatch IO to raid.
1014  * io_unit(meta) consists of a log. There is one situation we want to avoid. A
1015  * broken meta in the middle of a log causes recovery can't find meta at the
1016  * head of log. If operations require meta at the head persistent in log, we
1017  * must make sure meta before it persistent in log too. A case is:
1018  *
1019  * stripe data/parity is in log, we start write stripe to raid disks. stripe
1020  * data/parity must be persistent in log before we do the write to raid disks.
1021  *
1022  * The solution is we restrictly maintain io_unit list order. In this case, we
1023  * only write stripes of an io_unit to raid disks till the io_unit is the first
1024  * one whose data/parity is in log.
1025  */
1026 void r5l_flush_stripe_to_raid(struct r5l_log *log)
1027 {
1028         bool do_flush;
1029
1030         if (!log || !log->need_cache_flush)
1031                 return;
1032
1033         spin_lock_irq(&log->io_list_lock);
1034         /* flush bio is running */
1035         if (!list_empty(&log->flushing_ios)) {
1036                 spin_unlock_irq(&log->io_list_lock);
1037                 return;
1038         }
1039         list_splice_tail_init(&log->io_end_ios, &log->flushing_ios);
1040         do_flush = !list_empty(&log->flushing_ios);
1041         spin_unlock_irq(&log->io_list_lock);
1042
1043         if (!do_flush)
1044                 return;
1045         bio_reset(&log->flush_bio);
1046         log->flush_bio.bi_bdev = log->rdev->bdev;
1047         log->flush_bio.bi_end_io = r5l_log_flush_endio;
1048         bio_set_op_attrs(&log->flush_bio, REQ_OP_WRITE, WRITE_FLUSH);
1049         submit_bio(&log->flush_bio);
1050 }
1051
1052 static void r5l_write_super(struct r5l_log *log, sector_t cp);
1053 static void r5l_write_super_and_discard_space(struct r5l_log *log,
1054         sector_t end)
1055 {
1056         struct block_device *bdev = log->rdev->bdev;
1057         struct mddev *mddev;
1058
1059         r5l_write_super(log, end);
1060
1061         if (!blk_queue_discard(bdev_get_queue(bdev)))
1062                 return;
1063
1064         mddev = log->rdev->mddev;
1065         /*
1066          * Discard could zero data, so before discard we must make sure
1067          * superblock is updated to new log tail. Updating superblock (either
1068          * directly call md_update_sb() or depend on md thread) must hold
1069          * reconfig mutex. On the other hand, raid5_quiesce is called with
1070          * reconfig_mutex hold. The first step of raid5_quiesce() is waitting
1071          * for all IO finish, hence waitting for reclaim thread, while reclaim
1072          * thread is calling this function and waitting for reconfig mutex. So
1073          * there is a deadlock. We workaround this issue with a trylock.
1074          * FIXME: we could miss discard if we can't take reconfig mutex
1075          */
1076         set_mask_bits(&mddev->flags, 0,
1077                 BIT(MD_CHANGE_DEVS) | BIT(MD_CHANGE_PENDING));
1078         if (!mddev_trylock(mddev))
1079                 return;
1080         md_update_sb(mddev, 1);
1081         mddev_unlock(mddev);
1082
1083         /* discard IO error really doesn't matter, ignore it */
1084         if (log->last_checkpoint < end) {
1085                 blkdev_issue_discard(bdev,
1086                                 log->last_checkpoint + log->rdev->data_offset,
1087                                 end - log->last_checkpoint, GFP_NOIO, 0);
1088         } else {
1089                 blkdev_issue_discard(bdev,
1090                                 log->last_checkpoint + log->rdev->data_offset,
1091                                 log->device_size - log->last_checkpoint,
1092                                 GFP_NOIO, 0);
1093                 blkdev_issue_discard(bdev, log->rdev->data_offset, end,
1094                                 GFP_NOIO, 0);
1095         }
1096 }
1097
1098 /*
1099  * r5c_flush_stripe moves stripe from cached list to handle_list. When called,
1100  * the stripe must be on r5c_cached_full_stripes or r5c_cached_partial_stripes.
1101  *
1102  * must hold conf->device_lock
1103  */
1104 static void r5c_flush_stripe(struct r5conf *conf, struct stripe_head *sh)
1105 {
1106         BUG_ON(list_empty(&sh->lru));
1107         BUG_ON(!test_bit(STRIPE_R5C_CACHING, &sh->state));
1108         BUG_ON(test_bit(STRIPE_HANDLE, &sh->state));
1109
1110         /*
1111          * The stripe is not ON_RELEASE_LIST, so it is safe to call
1112          * raid5_release_stripe() while holding conf->device_lock
1113          */
1114         BUG_ON(test_bit(STRIPE_ON_RELEASE_LIST, &sh->state));
1115         assert_spin_locked(&conf->device_lock);
1116
1117         list_del_init(&sh->lru);
1118         atomic_inc(&sh->count);
1119
1120         set_bit(STRIPE_HANDLE, &sh->state);
1121         atomic_inc(&conf->active_stripes);
1122         r5c_make_stripe_write_out(sh);
1123
1124         if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
1125                 atomic_inc(&conf->preread_active_stripes);
1126         raid5_release_stripe(sh);
1127 }
1128
1129 /*
1130  * if num == 0, flush all full stripes
1131  * if num > 0, flush all full stripes. If less than num full stripes are
1132  *             flushed, flush some partial stripes until totally num stripes are
1133  *             flushed or there is no more cached stripes.
1134  */
1135 void r5c_flush_cache(struct r5conf *conf, int num)
1136 {
1137         int count;
1138         struct stripe_head *sh, *next;
1139
1140         assert_spin_locked(&conf->device_lock);
1141         if (!conf->log)
1142                 return;
1143
1144         count = 0;
1145         list_for_each_entry_safe(sh, next, &conf->r5c_full_stripe_list, lru) {
1146                 r5c_flush_stripe(conf, sh);
1147                 count++;
1148         }
1149
1150         if (count >= num)
1151                 return;
1152         list_for_each_entry_safe(sh, next,
1153                                  &conf->r5c_partial_stripe_list, lru) {
1154                 r5c_flush_stripe(conf, sh);
1155                 if (++count >= num)
1156                         break;
1157         }
1158 }
1159
1160 static void r5c_do_reclaim(struct r5conf *conf)
1161 {
1162         struct r5l_log *log = conf->log;
1163         struct stripe_head *sh;
1164         int count = 0;
1165         unsigned long flags;
1166         int total_cached;
1167         int stripes_to_flush;
1168
1169         if (!r5c_is_writeback(log))
1170                 return;
1171
1172         total_cached = atomic_read(&conf->r5c_cached_partial_stripes) +
1173                 atomic_read(&conf->r5c_cached_full_stripes);
1174
1175         if (total_cached > conf->min_nr_stripes * 3 / 4 ||
1176             atomic_read(&conf->empty_inactive_list_nr) > 0)
1177                 /*
1178                  * if stripe cache pressure high, flush all full stripes and
1179                  * some partial stripes
1180                  */
1181                 stripes_to_flush = R5C_RECLAIM_STRIPE_GROUP;
1182         else if (total_cached > conf->min_nr_stripes * 1 / 2 ||
1183                  atomic_read(&conf->r5c_cached_full_stripes) >
1184                  R5C_FULL_STRIPE_FLUSH_BATCH)
1185                 /*
1186                  * if stripe cache pressure moderate, or if there is many full
1187                  * stripes,flush all full stripes
1188                  */
1189                 stripes_to_flush = 0;
1190         else
1191                 /* no need to flush */
1192                 stripes_to_flush = -1;
1193
1194         if (stripes_to_flush >= 0) {
1195                 spin_lock_irqsave(&conf->device_lock, flags);
1196                 r5c_flush_cache(conf, stripes_to_flush);
1197                 spin_unlock_irqrestore(&conf->device_lock, flags);
1198         }
1199
1200         /* if log space is tight, flush stripes on stripe_in_journal_list */
1201         if (test_bit(R5C_LOG_TIGHT, &conf->cache_state)) {
1202                 spin_lock_irqsave(&log->stripe_in_journal_lock, flags);
1203                 spin_lock(&conf->device_lock);
1204                 list_for_each_entry(sh, &log->stripe_in_journal_list, r5c) {
1205                         /*
1206                          * stripes on stripe_in_journal_list could be in any
1207                          * state of the stripe_cache state machine. In this
1208                          * case, we only want to flush stripe on
1209                          * r5c_cached_full/partial_stripes. The following
1210                          * condition makes sure the stripe is on one of the
1211                          * two lists.
1212                          */
1213                         if (!list_empty(&sh->lru) &&
1214                             !test_bit(STRIPE_HANDLE, &sh->state) &&
1215                             atomic_read(&sh->count) == 0) {
1216                                 r5c_flush_stripe(conf, sh);
1217                         }
1218                         if (count++ >= R5C_RECLAIM_STRIPE_GROUP)
1219                                 break;
1220                 }
1221                 spin_unlock(&conf->device_lock);
1222                 spin_unlock_irqrestore(&log->stripe_in_journal_lock, flags);
1223         }
1224         md_wakeup_thread(conf->mddev->thread);
1225 }
1226
1227 static void r5l_do_reclaim(struct r5l_log *log)
1228 {
1229         struct r5conf *conf = log->rdev->mddev->private;
1230         sector_t reclaim_target = xchg(&log->reclaim_target, 0);
1231         sector_t reclaimable;
1232         sector_t next_checkpoint;
1233         bool write_super;
1234
1235         spin_lock_irq(&log->io_list_lock);
1236         write_super = r5l_reclaimable_space(log) > log->max_free_space ||
1237                 reclaim_target != 0 || !list_empty(&log->no_space_stripes);
1238         /*
1239          * move proper io_unit to reclaim list. We should not change the order.
1240          * reclaimable/unreclaimable io_unit can be mixed in the list, we
1241          * shouldn't reuse space of an unreclaimable io_unit
1242          */
1243         while (1) {
1244                 reclaimable = r5l_reclaimable_space(log);
1245                 if (reclaimable >= reclaim_target ||
1246                     (list_empty(&log->running_ios) &&
1247                      list_empty(&log->io_end_ios) &&
1248                      list_empty(&log->flushing_ios) &&
1249                      list_empty(&log->finished_ios)))
1250                         break;
1251
1252                 md_wakeup_thread(log->rdev->mddev->thread);
1253                 wait_event_lock_irq(log->iounit_wait,
1254                                     r5l_reclaimable_space(log) > reclaimable,
1255                                     log->io_list_lock);
1256         }
1257
1258         next_checkpoint = r5c_calculate_new_cp(conf);
1259         spin_unlock_irq(&log->io_list_lock);
1260
1261         BUG_ON(reclaimable < 0);
1262
1263         if (reclaimable == 0 || !write_super)
1264                 return;
1265
1266         /*
1267          * write_super will flush cache of each raid disk. We must write super
1268          * here, because the log area might be reused soon and we don't want to
1269          * confuse recovery
1270          */
1271         r5l_write_super_and_discard_space(log, next_checkpoint);
1272
1273         mutex_lock(&log->io_mutex);
1274         log->last_checkpoint = next_checkpoint;
1275         r5c_update_log_state(log);
1276         mutex_unlock(&log->io_mutex);
1277
1278         r5l_run_no_space_stripes(log);
1279 }
1280
1281 static void r5l_reclaim_thread(struct md_thread *thread)
1282 {
1283         struct mddev *mddev = thread->mddev;
1284         struct r5conf *conf = mddev->private;
1285         struct r5l_log *log = conf->log;
1286
1287         if (!log)
1288                 return;
1289         r5c_do_reclaim(conf);
1290         r5l_do_reclaim(log);
1291 }
1292
1293 void r5l_wake_reclaim(struct r5l_log *log, sector_t space)
1294 {
1295         unsigned long target;
1296         unsigned long new = (unsigned long)space; /* overflow in theory */
1297
1298         if (!log)
1299                 return;
1300         do {
1301                 target = log->reclaim_target;
1302                 if (new < target)
1303                         return;
1304         } while (cmpxchg(&log->reclaim_target, target, new) != target);
1305         md_wakeup_thread(log->reclaim_thread);
1306 }
1307
1308 void r5l_quiesce(struct r5l_log *log, int state)
1309 {
1310         struct mddev *mddev;
1311         if (!log || state == 2)
1312                 return;
1313         if (state == 0) {
1314                 /*
1315                  * This is a special case for hotadd. In suspend, the array has
1316                  * no journal. In resume, journal is initialized as well as the
1317                  * reclaim thread.
1318                  */
1319                 if (log->reclaim_thread)
1320                         return;
1321                 log->reclaim_thread = md_register_thread(r5l_reclaim_thread,
1322                                         log->rdev->mddev, "reclaim");
1323                 log->reclaim_thread->timeout = R5C_RECLAIM_WAKEUP_INTERVAL;
1324         } else if (state == 1) {
1325                 /* make sure r5l_write_super_and_discard_space exits */
1326                 mddev = log->rdev->mddev;
1327                 wake_up(&mddev->sb_wait);
1328                 r5l_wake_reclaim(log, MaxSector);
1329                 md_unregister_thread(&log->reclaim_thread);
1330                 r5l_do_reclaim(log);
1331         }
1332 }
1333
1334 bool r5l_log_disk_error(struct r5conf *conf)
1335 {
1336         struct r5l_log *log;
1337         bool ret;
1338         /* don't allow write if journal disk is missing */
1339         rcu_read_lock();
1340         log = rcu_dereference(conf->log);
1341
1342         if (!log)
1343                 ret = test_bit(MD_HAS_JOURNAL, &conf->mddev->flags);
1344         else
1345                 ret = test_bit(Faulty, &log->rdev->flags);
1346         rcu_read_unlock();
1347         return ret;
1348 }
1349
1350 struct r5l_recovery_ctx {
1351         struct page *meta_page;         /* current meta */
1352         sector_t meta_total_blocks;     /* total size of current meta and data */
1353         sector_t pos;                   /* recovery position */
1354         u64 seq;                        /* recovery position seq */
1355 };
1356
1357 static int r5l_read_meta_block(struct r5l_log *log,
1358                                struct r5l_recovery_ctx *ctx)
1359 {
1360         struct page *page = ctx->meta_page;
1361         struct r5l_meta_block *mb;
1362         u32 crc, stored_crc;
1363
1364         if (!sync_page_io(log->rdev, ctx->pos, PAGE_SIZE, page, REQ_OP_READ, 0,
1365                           false))
1366                 return -EIO;
1367
1368         mb = page_address(page);
1369         stored_crc = le32_to_cpu(mb->checksum);
1370         mb->checksum = 0;
1371
1372         if (le32_to_cpu(mb->magic) != R5LOG_MAGIC ||
1373             le64_to_cpu(mb->seq) != ctx->seq ||
1374             mb->version != R5LOG_VERSION ||
1375             le64_to_cpu(mb->position) != ctx->pos)
1376                 return -EINVAL;
1377
1378         crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE);
1379         if (stored_crc != crc)
1380                 return -EINVAL;
1381
1382         if (le32_to_cpu(mb->meta_size) > PAGE_SIZE)
1383                 return -EINVAL;
1384
1385         ctx->meta_total_blocks = BLOCK_SECTORS;
1386
1387         return 0;
1388 }
1389
1390 static int r5l_recovery_flush_one_stripe(struct r5l_log *log,
1391                                          struct r5l_recovery_ctx *ctx,
1392                                          sector_t stripe_sect,
1393                                          int *offset)
1394 {
1395         struct r5conf *conf = log->rdev->mddev->private;
1396         struct stripe_head *sh;
1397         struct r5l_payload_data_parity *payload;
1398         int disk_index;
1399
1400         sh = raid5_get_active_stripe(conf, stripe_sect, 0, 0, 0);
1401         while (1) {
1402                 sector_t log_offset = r5l_ring_add(log, ctx->pos,
1403                                 ctx->meta_total_blocks);
1404                 payload = page_address(ctx->meta_page) + *offset;
1405
1406                 if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_DATA) {
1407                         raid5_compute_sector(conf,
1408                                              le64_to_cpu(payload->location), 0,
1409                                              &disk_index, sh);
1410
1411                         sync_page_io(log->rdev, log_offset, PAGE_SIZE,
1412                                      sh->dev[disk_index].page, REQ_OP_READ, 0,
1413                                      false);
1414                         sh->dev[disk_index].log_checksum =
1415                                 le32_to_cpu(payload->checksum[0]);
1416                         set_bit(R5_Wantwrite, &sh->dev[disk_index].flags);
1417                 } else {
1418                         disk_index = sh->pd_idx;
1419                         sync_page_io(log->rdev, log_offset, PAGE_SIZE,
1420                                      sh->dev[disk_index].page, REQ_OP_READ, 0,
1421                                      false);
1422                         sh->dev[disk_index].log_checksum =
1423                                 le32_to_cpu(payload->checksum[0]);
1424                         set_bit(R5_Wantwrite, &sh->dev[disk_index].flags);
1425
1426                         if (sh->qd_idx >= 0) {
1427                                 disk_index = sh->qd_idx;
1428                                 sync_page_io(log->rdev,
1429                                              r5l_ring_add(log, log_offset, BLOCK_SECTORS),
1430                                              PAGE_SIZE, sh->dev[disk_index].page,
1431                                              REQ_OP_READ, 0, false);
1432                                 sh->dev[disk_index].log_checksum =
1433                                         le32_to_cpu(payload->checksum[1]);
1434                                 set_bit(R5_Wantwrite,
1435                                         &sh->dev[disk_index].flags);
1436                         }
1437                 }
1438
1439                 ctx->meta_total_blocks += le32_to_cpu(payload->size);
1440                 *offset += sizeof(struct r5l_payload_data_parity) +
1441                         sizeof(__le32) *
1442                         (le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9));
1443                 if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_PARITY)
1444                         break;
1445         }
1446
1447         for (disk_index = 0; disk_index < sh->disks; disk_index++) {
1448                 void *addr;
1449                 u32 checksum;
1450
1451                 if (!test_bit(R5_Wantwrite, &sh->dev[disk_index].flags))
1452                         continue;
1453                 addr = kmap_atomic(sh->dev[disk_index].page);
1454                 checksum = crc32c_le(log->uuid_checksum, addr, PAGE_SIZE);
1455                 kunmap_atomic(addr);
1456                 if (checksum != sh->dev[disk_index].log_checksum)
1457                         goto error;
1458         }
1459
1460         for (disk_index = 0; disk_index < sh->disks; disk_index++) {
1461                 struct md_rdev *rdev, *rrdev;
1462
1463                 if (!test_and_clear_bit(R5_Wantwrite,
1464                                         &sh->dev[disk_index].flags))
1465                         continue;
1466
1467                 /* in case device is broken */
1468                 rcu_read_lock();
1469                 rdev = rcu_dereference(conf->disks[disk_index].rdev);
1470                 if (rdev) {
1471                         atomic_inc(&rdev->nr_pending);
1472                         rcu_read_unlock();
1473                         sync_page_io(rdev, stripe_sect, PAGE_SIZE,
1474                                      sh->dev[disk_index].page, REQ_OP_WRITE, 0,
1475                                      false);
1476                         rdev_dec_pending(rdev, rdev->mddev);
1477                         rcu_read_lock();
1478                 }
1479                 rrdev = rcu_dereference(conf->disks[disk_index].replacement);
1480                 if (rrdev) {
1481                         atomic_inc(&rrdev->nr_pending);
1482                         rcu_read_unlock();
1483                         sync_page_io(rrdev, stripe_sect, PAGE_SIZE,
1484                                      sh->dev[disk_index].page, REQ_OP_WRITE, 0,
1485                                      false);
1486                         rdev_dec_pending(rrdev, rrdev->mddev);
1487                         rcu_read_lock();
1488                 }
1489                 rcu_read_unlock();
1490         }
1491         raid5_release_stripe(sh);
1492         return 0;
1493
1494 error:
1495         for (disk_index = 0; disk_index < sh->disks; disk_index++)
1496                 sh->dev[disk_index].flags = 0;
1497         raid5_release_stripe(sh);
1498         return -EINVAL;
1499 }
1500
1501 static int r5l_recovery_flush_one_meta(struct r5l_log *log,
1502                                        struct r5l_recovery_ctx *ctx)
1503 {
1504         struct r5conf *conf = log->rdev->mddev->private;
1505         struct r5l_payload_data_parity *payload;
1506         struct r5l_meta_block *mb;
1507         int offset;
1508         sector_t stripe_sector;
1509
1510         mb = page_address(ctx->meta_page);
1511         offset = sizeof(struct r5l_meta_block);
1512
1513         while (offset < le32_to_cpu(mb->meta_size)) {
1514                 int dd;
1515
1516                 payload = (void *)mb + offset;
1517                 stripe_sector = raid5_compute_sector(conf,
1518                                                      le64_to_cpu(payload->location), 0, &dd, NULL);
1519                 if (r5l_recovery_flush_one_stripe(log, ctx, stripe_sector,
1520                                                   &offset))
1521                         return -EINVAL;
1522         }
1523         return 0;
1524 }
1525
1526 /* copy data/parity from log to raid disks */
1527 static void r5l_recovery_flush_log(struct r5l_log *log,
1528                                    struct r5l_recovery_ctx *ctx)
1529 {
1530         while (1) {
1531                 if (r5l_read_meta_block(log, ctx))
1532                         return;
1533                 if (r5l_recovery_flush_one_meta(log, ctx))
1534                         return;
1535                 ctx->seq++;
1536                 ctx->pos = r5l_ring_add(log, ctx->pos, ctx->meta_total_blocks);
1537         }
1538 }
1539
1540 static int r5l_log_write_empty_meta_block(struct r5l_log *log, sector_t pos,
1541                                           u64 seq)
1542 {
1543         struct page *page;
1544         struct r5l_meta_block *mb;
1545         u32 crc;
1546
1547         page = alloc_page(GFP_KERNEL | __GFP_ZERO);
1548         if (!page)
1549                 return -ENOMEM;
1550         mb = page_address(page);
1551         mb->magic = cpu_to_le32(R5LOG_MAGIC);
1552         mb->version = R5LOG_VERSION;
1553         mb->meta_size = cpu_to_le32(sizeof(struct r5l_meta_block));
1554         mb->seq = cpu_to_le64(seq);
1555         mb->position = cpu_to_le64(pos);
1556         crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE);
1557         mb->checksum = cpu_to_le32(crc);
1558
1559         if (!sync_page_io(log->rdev, pos, PAGE_SIZE, page, REQ_OP_WRITE,
1560                           WRITE_FUA, false)) {
1561                 __free_page(page);
1562                 return -EIO;
1563         }
1564         __free_page(page);
1565         return 0;
1566 }
1567
1568 static int r5l_recovery_log(struct r5l_log *log)
1569 {
1570         struct r5l_recovery_ctx ctx;
1571
1572         ctx.pos = log->last_checkpoint;
1573         ctx.seq = log->last_cp_seq;
1574         ctx.meta_page = alloc_page(GFP_KERNEL);
1575         if (!ctx.meta_page)
1576                 return -ENOMEM;
1577
1578         r5l_recovery_flush_log(log, &ctx);
1579         __free_page(ctx.meta_page);
1580
1581         /*
1582          * we did a recovery. Now ctx.pos points to an invalid meta block. New
1583          * log will start here. but we can't let superblock point to last valid
1584          * meta block. The log might looks like:
1585          * | meta 1| meta 2| meta 3|
1586          * meta 1 is valid, meta 2 is invalid. meta 3 could be valid. If
1587          * superblock points to meta 1, we write a new valid meta 2n.  if crash
1588          * happens again, new recovery will start from meta 1. Since meta 2n is
1589          * valid now, recovery will think meta 3 is valid, which is wrong.
1590          * The solution is we create a new meta in meta2 with its seq == meta
1591          * 1's seq + 10 and let superblock points to meta2. The same recovery will
1592          * not think meta 3 is a valid meta, because its seq doesn't match
1593          */
1594         if (ctx.seq > log->last_cp_seq) {
1595                 int ret;
1596
1597                 ret = r5l_log_write_empty_meta_block(log, ctx.pos, ctx.seq + 10);
1598                 if (ret)
1599                         return ret;
1600                 log->seq = ctx.seq + 11;
1601                 log->log_start = r5l_ring_add(log, ctx.pos, BLOCK_SECTORS);
1602                 r5l_write_super(log, ctx.pos);
1603                 log->last_checkpoint = ctx.pos;
1604                 log->next_checkpoint = ctx.pos;
1605         } else {
1606                 log->log_start = ctx.pos;
1607                 log->seq = ctx.seq;
1608         }
1609         return 0;
1610 }
1611
1612 static void r5l_write_super(struct r5l_log *log, sector_t cp)
1613 {
1614         struct mddev *mddev = log->rdev->mddev;
1615
1616         log->rdev->journal_tail = cp;
1617         set_bit(MD_CHANGE_DEVS, &mddev->flags);
1618 }
1619
1620 /*
1621  * Try handle write operation in caching phase. This function should only
1622  * be called in write-back mode.
1623  *
1624  * If all outstanding writes can be handled in caching phase, returns 0
1625  * If writes requires write-out phase, call r5c_make_stripe_write_out()
1626  * and returns -EAGAIN
1627  */
1628 int r5c_try_caching_write(struct r5conf *conf,
1629                           struct stripe_head *sh,
1630                           struct stripe_head_state *s,
1631                           int disks)
1632 {
1633         struct r5l_log *log = conf->log;
1634         int i;
1635         struct r5dev *dev;
1636         int to_cache = 0;
1637
1638         BUG_ON(!r5c_is_writeback(log));
1639
1640         if (!test_bit(STRIPE_R5C_CACHING, &sh->state)) {
1641                 /*
1642                  * There are two different scenarios here:
1643                  *  1. The stripe has some data cached, and it is sent to
1644                  *     write-out phase for reclaim
1645                  *  2. The stripe is clean, and this is the first write
1646                  *
1647                  * For 1, return -EAGAIN, so we continue with
1648                  * handle_stripe_dirtying().
1649                  *
1650                  * For 2, set STRIPE_R5C_CACHING and continue with caching
1651                  * write.
1652                  */
1653
1654                 /* case 1: anything injournal or anything in written */
1655                 if (s->injournal > 0 || s->written > 0)
1656                         return -EAGAIN;
1657                 /* case 2 */
1658                 set_bit(STRIPE_R5C_CACHING, &sh->state);
1659         }
1660
1661         for (i = disks; i--; ) {
1662                 dev = &sh->dev[i];
1663                 /* if non-overwrite, use writing-out phase */
1664                 if (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags) &&
1665                     !test_bit(R5_InJournal, &dev->flags)) {
1666                         r5c_make_stripe_write_out(sh);
1667                         return -EAGAIN;
1668                 }
1669         }
1670
1671         for (i = disks; i--; ) {
1672                 dev = &sh->dev[i];
1673                 if (dev->towrite) {
1674                         set_bit(R5_Wantwrite, &dev->flags);
1675                         set_bit(R5_Wantdrain, &dev->flags);
1676                         set_bit(R5_LOCKED, &dev->flags);
1677                         to_cache++;
1678                 }
1679         }
1680
1681         if (to_cache) {
1682                 set_bit(STRIPE_OP_BIODRAIN, &s->ops_request);
1683                 /*
1684                  * set STRIPE_LOG_TRAPPED, which triggers r5c_cache_data()
1685                  * in ops_run_io(). STRIPE_LOG_TRAPPED will be cleared in
1686                  * r5c_handle_data_cached()
1687                  */
1688                 set_bit(STRIPE_LOG_TRAPPED, &sh->state);
1689         }
1690
1691         return 0;
1692 }
1693
1694 /*
1695  * free extra pages (orig_page) we allocated for prexor
1696  */
1697 void r5c_release_extra_page(struct stripe_head *sh)
1698 {
1699         int i;
1700
1701         for (i = sh->disks; i--; )
1702                 if (sh->dev[i].page != sh->dev[i].orig_page) {
1703                         struct page *p = sh->dev[i].orig_page;
1704
1705                         sh->dev[i].orig_page = sh->dev[i].page;
1706                         put_page(p);
1707                 }
1708 }
1709
1710 /*
1711  * clean up the stripe (clear R5_InJournal for dev[pd_idx] etc.) after the
1712  * stripe is committed to RAID disks.
1713  */
1714 void r5c_finish_stripe_write_out(struct r5conf *conf,
1715                                  struct stripe_head *sh,
1716                                  struct stripe_head_state *s)
1717 {
1718         int i;
1719         int do_wakeup = 0;
1720
1721         if (!conf->log ||
1722             !test_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags))
1723                 return;
1724
1725         WARN_ON(test_bit(STRIPE_R5C_CACHING, &sh->state));
1726         clear_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags);
1727
1728         if (conf->log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH)
1729                 return;
1730
1731         for (i = sh->disks; i--; ) {
1732                 clear_bit(R5_InJournal, &sh->dev[i].flags);
1733                 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
1734                         do_wakeup = 1;
1735         }
1736
1737         /*
1738          * analyse_stripe() runs before r5c_finish_stripe_write_out(),
1739          * We updated R5_InJournal, so we also update s->injournal.
1740          */
1741         s->injournal = 0;
1742
1743         if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state))
1744                 if (atomic_dec_and_test(&conf->pending_full_writes))
1745                         md_wakeup_thread(conf->mddev->thread);
1746
1747         if (do_wakeup)
1748                 wake_up(&conf->wait_for_overlap);
1749
1750         if (conf->log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH)
1751                 return;
1752
1753         spin_lock_irq(&conf->log->stripe_in_journal_lock);
1754         list_del_init(&sh->r5c);
1755         spin_unlock_irq(&conf->log->stripe_in_journal_lock);
1756         sh->log_start = MaxSector;
1757         atomic_dec(&conf->log->stripe_in_journal_count);
1758 }
1759
1760 int
1761 r5c_cache_data(struct r5l_log *log, struct stripe_head *sh,
1762                struct stripe_head_state *s)
1763 {
1764         struct r5conf *conf = sh->raid_conf;
1765         int pages = 0;
1766         int reserve;
1767         int i;
1768         int ret = 0;
1769
1770         BUG_ON(!log);
1771
1772         for (i = 0; i < sh->disks; i++) {
1773                 void *addr;
1774
1775                 if (!test_bit(R5_Wantwrite, &sh->dev[i].flags))
1776                         continue;
1777                 addr = kmap_atomic(sh->dev[i].page);
1778                 sh->dev[i].log_checksum = crc32c_le(log->uuid_checksum,
1779                                                     addr, PAGE_SIZE);
1780                 kunmap_atomic(addr);
1781                 pages++;
1782         }
1783         WARN_ON(pages == 0);
1784
1785         /*
1786          * The stripe must enter state machine again to call endio, so
1787          * don't delay.
1788          */
1789         clear_bit(STRIPE_DELAYED, &sh->state);
1790         atomic_inc(&sh->count);
1791
1792         mutex_lock(&log->io_mutex);
1793         /* meta + data */
1794         reserve = (1 + pages) << (PAGE_SHIFT - 9);
1795
1796         if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state) &&
1797             sh->log_start == MaxSector)
1798                 r5l_add_no_space_stripe(log, sh);
1799         else if (!r5l_has_free_space(log, reserve)) {
1800                 if (sh->log_start == log->last_checkpoint)
1801                         BUG();
1802                 else
1803                         r5l_add_no_space_stripe(log, sh);
1804         } else {
1805                 ret = r5l_log_stripe(log, sh, pages, 0);
1806                 if (ret) {
1807                         spin_lock_irq(&log->io_list_lock);
1808                         list_add_tail(&sh->log_list, &log->no_mem_stripes);
1809                         spin_unlock_irq(&log->io_list_lock);
1810                 }
1811         }
1812
1813         mutex_unlock(&log->io_mutex);
1814         return 0;
1815 }
1816
1817 static int r5l_load_log(struct r5l_log *log)
1818 {
1819         struct md_rdev *rdev = log->rdev;
1820         struct page *page;
1821         struct r5l_meta_block *mb;
1822         sector_t cp = log->rdev->journal_tail;
1823         u32 stored_crc, expected_crc;
1824         bool create_super = false;
1825         int ret;
1826
1827         /* Make sure it's valid */
1828         if (cp >= rdev->sectors || round_down(cp, BLOCK_SECTORS) != cp)
1829                 cp = 0;
1830         page = alloc_page(GFP_KERNEL);
1831         if (!page)
1832                 return -ENOMEM;
1833
1834         if (!sync_page_io(rdev, cp, PAGE_SIZE, page, REQ_OP_READ, 0, false)) {
1835                 ret = -EIO;
1836                 goto ioerr;
1837         }
1838         mb = page_address(page);
1839
1840         if (le32_to_cpu(mb->magic) != R5LOG_MAGIC ||
1841             mb->version != R5LOG_VERSION) {
1842                 create_super = true;
1843                 goto create;
1844         }
1845         stored_crc = le32_to_cpu(mb->checksum);
1846         mb->checksum = 0;
1847         expected_crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE);
1848         if (stored_crc != expected_crc) {
1849                 create_super = true;
1850                 goto create;
1851         }
1852         if (le64_to_cpu(mb->position) != cp) {
1853                 create_super = true;
1854                 goto create;
1855         }
1856 create:
1857         if (create_super) {
1858                 log->last_cp_seq = prandom_u32();
1859                 cp = 0;
1860                 r5l_log_write_empty_meta_block(log, cp, log->last_cp_seq);
1861                 /*
1862                  * Make sure super points to correct address. Log might have
1863                  * data very soon. If super hasn't correct log tail address,
1864                  * recovery can't find the log
1865                  */
1866                 r5l_write_super(log, cp);
1867         } else
1868                 log->last_cp_seq = le64_to_cpu(mb->seq);
1869
1870         log->device_size = round_down(rdev->sectors, BLOCK_SECTORS);
1871         log->max_free_space = log->device_size >> RECLAIM_MAX_FREE_SPACE_SHIFT;
1872         if (log->max_free_space > RECLAIM_MAX_FREE_SPACE)
1873                 log->max_free_space = RECLAIM_MAX_FREE_SPACE;
1874         log->last_checkpoint = cp;
1875         log->next_checkpoint = cp;
1876         mutex_lock(&log->io_mutex);
1877         r5c_update_log_state(log);
1878         mutex_unlock(&log->io_mutex);
1879
1880         __free_page(page);
1881
1882         return r5l_recovery_log(log);
1883 ioerr:
1884         __free_page(page);
1885         return ret;
1886 }
1887
1888 int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
1889 {
1890         struct request_queue *q = bdev_get_queue(rdev->bdev);
1891         struct r5l_log *log;
1892
1893         if (PAGE_SIZE != 4096)
1894                 return -EINVAL;
1895
1896         /*
1897          * The PAGE_SIZE must be big enough to hold 1 r5l_meta_block and
1898          * raid_disks r5l_payload_data_parity.
1899          *
1900          * Write journal and cache does not work for very big array
1901          * (raid_disks > 203)
1902          */
1903         if (sizeof(struct r5l_meta_block) +
1904             ((sizeof(struct r5l_payload_data_parity) + sizeof(__le32)) *
1905              conf->raid_disks) > PAGE_SIZE) {
1906                 pr_err("md/raid:%s: write journal/cache doesn't work for array with %d disks\n",
1907                        mdname(conf->mddev), conf->raid_disks);
1908                 return -EINVAL;
1909         }
1910
1911         log = kzalloc(sizeof(*log), GFP_KERNEL);
1912         if (!log)
1913                 return -ENOMEM;
1914         log->rdev = rdev;
1915
1916         log->need_cache_flush = test_bit(QUEUE_FLAG_WC, &q->queue_flags) != 0;
1917
1918         log->uuid_checksum = crc32c_le(~0, rdev->mddev->uuid,
1919                                        sizeof(rdev->mddev->uuid));
1920
1921         mutex_init(&log->io_mutex);
1922
1923         spin_lock_init(&log->io_list_lock);
1924         INIT_LIST_HEAD(&log->running_ios);
1925         INIT_LIST_HEAD(&log->io_end_ios);
1926         INIT_LIST_HEAD(&log->flushing_ios);
1927         INIT_LIST_HEAD(&log->finished_ios);
1928         bio_init(&log->flush_bio);
1929
1930         log->io_kc = KMEM_CACHE(r5l_io_unit, 0);
1931         if (!log->io_kc)
1932                 goto io_kc;
1933
1934         log->io_pool = mempool_create_slab_pool(R5L_POOL_SIZE, log->io_kc);
1935         if (!log->io_pool)
1936                 goto io_pool;
1937
1938         log->bs = bioset_create(R5L_POOL_SIZE, 0);
1939         if (!log->bs)
1940                 goto io_bs;
1941
1942         log->meta_pool = mempool_create_page_pool(R5L_POOL_SIZE, 0);
1943         if (!log->meta_pool)
1944                 goto out_mempool;
1945
1946         log->reclaim_thread = md_register_thread(r5l_reclaim_thread,
1947                                                  log->rdev->mddev, "reclaim");
1948         if (!log->reclaim_thread)
1949                 goto reclaim_thread;
1950         log->reclaim_thread->timeout = R5C_RECLAIM_WAKEUP_INTERVAL;
1951
1952         init_waitqueue_head(&log->iounit_wait);
1953
1954         INIT_LIST_HEAD(&log->no_mem_stripes);
1955
1956         INIT_LIST_HEAD(&log->no_space_stripes);
1957         spin_lock_init(&log->no_space_stripes_lock);
1958
1959         log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH;
1960         INIT_LIST_HEAD(&log->stripe_in_journal_list);
1961         spin_lock_init(&log->stripe_in_journal_lock);
1962         atomic_set(&log->stripe_in_journal_count, 0);
1963
1964         if (r5l_load_log(log))
1965                 goto error;
1966
1967         rcu_assign_pointer(conf->log, log);
1968         set_bit(MD_HAS_JOURNAL, &conf->mddev->flags);
1969         return 0;
1970
1971 error:
1972         md_unregister_thread(&log->reclaim_thread);
1973 reclaim_thread:
1974         mempool_destroy(log->meta_pool);
1975 out_mempool:
1976         bioset_free(log->bs);
1977 io_bs:
1978         mempool_destroy(log->io_pool);
1979 io_pool:
1980         kmem_cache_destroy(log->io_kc);
1981 io_kc:
1982         kfree(log);
1983         return -EINVAL;
1984 }
1985
1986 void r5l_exit_log(struct r5l_log *log)
1987 {
1988         md_unregister_thread(&log->reclaim_thread);
1989         mempool_destroy(log->meta_pool);
1990         bioset_free(log->bs);
1991         mempool_destroy(log->io_pool);
1992         kmem_cache_destroy(log->io_kc);
1993         kfree(log);
1994 }