[mirror_ubuntu-bionic-kernel.git] / drivers / md / raid5-cache.c
md/r5cache: write-out phase and reclaim support
1 /*
2 * Copyright (C) 2015 Shaohua Li <shli@fb.com>
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms and conditions of the GNU General Public License,
6 * version 2, as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope it will be useful, but WITHOUT
9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
11 * more details.
12 *
13 */
14 #include <linux/kernel.h>
15 #include <linux/wait.h>
16 #include <linux/blkdev.h>
17 #include <linux/slab.h>
18 #include <linux/raid/md_p.h>
19 #include <linux/crc32c.h>
20 #include <linux/random.h>
21 #include "md.h"
22 #include "raid5.h"
23 #include "bitmap.h"
24
25 /*
26 * metadata/data are stored on disk in 4k units (blocks), regardless of the
27 * underlying hardware sector size. Only works with PAGE_SIZE == 4096.
28 */
29 #define BLOCK_SECTORS (8)
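/* i.e. one block is 8 sectors * 512 bytes = 4096 bytes, matching PAGE_SIZE */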
30
31 /*
32 * log->max_free_space is min(1/4 disk size, 10G reclaimable space).
33 *
34 * In write-through mode, reclaim runs once every log->max_free_space of
35 * log space is used. This keeps recovery from having to scan too much of the log.
36 */
37 #define RECLAIM_MAX_FREE_SPACE (10 * 1024 * 1024 * 2) /* sector */
38 #define RECLAIM_MAX_FREE_SPACE_SHIFT (2)
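/* RECLAIM_MAX_FREE_SPACE is 10 GiB expressed in 512-byte sectors (10 * 1024 * 1024 KiB);
 * the shift of 2 caps max_free_space at 1/4 of the log device size (see r5l_load_log()).
 */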
39
40 /* wake up reclaim thread periodically */
41 #define R5C_RECLAIM_WAKEUP_INTERVAL (30 * HZ)
42 /* start flush with these full stripes */
43 #define R5C_FULL_STRIPE_FLUSH_BATCH 256
44 /* reclaim stripes in groups */
45 #define R5C_RECLAIM_STRIPE_GROUP (NR_STRIPE_HASH_LOCKS * 2)
46
47 /*
48 * We only need 2 bios per I/O unit to make progress, but ensure we
49 * have a few more available to not get too tight.
50 */
51 #define R5L_POOL_SIZE 4
52
53 /*
54 * r5c journal modes of the array: write-back or write-through.
55 * write-through mode has behavior identical to the existing log-only
56 * implementation.
57 */
58 enum r5c_journal_mode {
59 R5C_JOURNAL_MODE_WRITE_THROUGH = 0,
60 R5C_JOURNAL_MODE_WRITE_BACK = 1,
61 };
62
63 /*
64 * raid5 cache state machine
65 *
66 * With the RAID cache, each stripe works in two phases:
67 * - caching phase
68 * - writing-out phase
69 *
70 * These two phases are controlled by bit STRIPE_R5C_CACHING:
71 * if STRIPE_R5C_CACHING == 0, the stripe is in writing-out phase
72 * if STRIPE_R5C_CACHING == 1, the stripe is in caching phase
73 *
74 * When there is no journal, or the journal is in write-through mode,
75 * the stripe is always in writing-out phase.
76 *
77 * For write-back journal, the stripe is sent to caching phase on write
78 * (r5c_try_caching_write). r5c_make_stripe_write_out() kicks off
79 * the write-out phase by clearing STRIPE_R5C_CACHING.
80 *
81 * Stripes in caching phase do not write the raid disks. Instead, all
82 * writes are committed to the log device. Therefore, a stripe in
83 * caching phase handles writes as:
84 * - write to log device
85 * - return IO
86 *
87 * Stripes in writing-out phase handle writes as:
88 * - calculate parity
89 * - write pending data and parity to journal
90 * - write data and parity to raid disks
91 * - return IO for pending writes
92 */
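/*
 * Summary of the write-back flow implemented below: r5c_try_caching_write()
 * puts a clean stripe into the caching phase on its first write;
 * r5c_make_stripe_write_out() (called from reclaim via r5c_flush_stripe(), or
 * directly for non-overwrite writes) moves it to the write-out phase; and
 * r5c_finish_stripe_write_out() clears R5_InJournal and drops the stripe from
 * stripe_in_journal_list once it has been committed to the raid disks.
 */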
93
94 struct r5l_log {
95 struct md_rdev *rdev;
96
97 u32 uuid_checksum;
98
99 sector_t device_size; /* log device size, rounded to
100 * BLOCK_SECTORS */
101 sector_t max_free_space; /* reclaim runs once free space
102 * reaches this size */
103
104 sector_t last_checkpoint; /* log tail. where recovery scan
105 * starts from */
106 u64 last_cp_seq; /* log tail sequence */
107
108 sector_t log_start; /* log head. where new data appends */
109 u64 seq; /* log head sequence */
110
111 sector_t next_checkpoint;
112 u64 next_cp_seq;
113
114 struct mutex io_mutex;
115 struct r5l_io_unit *current_io; /* current io_unit accepting new data */
116
117 spinlock_t io_list_lock;
118 struct list_head running_ios; /* io_units which are still running,
119 * and have not yet been completely
120 * written to the log */
121 struct list_head io_end_ios; /* io_units which have been completely
122 * written to the log but not yet written
123 * to the RAID */
124 struct list_head flushing_ios; /* io_units which are waiting for log
125 * cache flush */
126 struct list_head finished_ios; /* io_units which settle down in log disk */
127 struct bio flush_bio;
128
129 struct list_head no_mem_stripes; /* pending stripes, -ENOMEM */
130
131 struct kmem_cache *io_kc;
132 mempool_t *io_pool;
133 struct bio_set *bs;
134 mempool_t *meta_pool;
135
136 struct md_thread *reclaim_thread;
137 unsigned long reclaim_target; /* amount of space that needs to be
138 * reclaimed. if it's 0, reclaim spaces
139 * used by io_units which are in
140 * IO_UNIT_STRIPE_END state (eg, reclaim
141 * doesn't wait for a specific io_unit
142 * to switch to IO_UNIT_STRIPE_END
143 * state) */
144 wait_queue_head_t iounit_wait;
145
146 struct list_head no_space_stripes; /* pending stripes, log has no space */
147 spinlock_t no_space_stripes_lock;
148
149 bool need_cache_flush;
150
151 /* for r5c_cache */
152 enum r5c_journal_mode r5c_journal_mode;
153
154 /* all stripes in r5cache, in the order of seq at sh->log_start */
155 struct list_head stripe_in_journal_list;
156
157 spinlock_t stripe_in_journal_lock;
158 atomic_t stripe_in_journal_count;
159 };
160
161 /*
162 * An IO range starts at a meta data block and ends at the next meta data
163 * block. The io_unit's meta data block tracks the data/parity that follows
164 * it. The io_unit is written to the log disk with a normal write; as we
165 * always flush the log disk first and only then start moving data to the
166 * raid disks, there is no requirement to write the io_unit with FLUSH/FUA.
167 */
168 struct r5l_io_unit {
169 struct r5l_log *log;
170
171 struct page *meta_page; /* store meta block */
172 int meta_offset; /* current offset in meta_page */
173
174 struct bio *current_bio;/* current_bio accepting new data */
175
176 atomic_t pending_stripe;/* how many stripes not flushed to raid */
177 u64 seq; /* seq number of the metablock */
178 sector_t log_start; /* where the io_unit starts */
179 sector_t log_end; /* where the io_unit ends */
180 struct list_head log_sibling; /* log->running_ios */
181 struct list_head stripe_list; /* stripes added to the io_unit */
182
183 int state;
184 bool need_split_bio;
185 };
186
187 /* r5l_io_unit state */
188 enum r5l_io_unit_state {
189 IO_UNIT_RUNNING = 0, /* accepting new IO */
190 IO_UNIT_IO_START = 1, /* io_unit bio started writing to log,
191 * not accepting new bio */
192 IO_UNIT_IO_END = 2, /* io_unit bio finished writing to log */
193 IO_UNIT_STRIPE_END = 3, /* stripe data finished writing to raid */
194 };
195
196 bool r5c_is_writeback(struct r5l_log *log)
197 {
198 return (log != NULL &&
199 log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_BACK);
200 }
201
202 static sector_t r5l_ring_add(struct r5l_log *log, sector_t start, sector_t inc)
203 {
204 start += inc;
205 if (start >= log->device_size)
206 start = start - log->device_size;
207 return start;
208 }
209
210 static sector_t r5l_ring_distance(struct r5l_log *log, sector_t start,
211 sector_t end)
212 {
213 if (end >= start)
214 return end - start;
215 else
216 return end + log->device_size - start;
217 }
218
219 static bool r5l_has_free_space(struct r5l_log *log, sector_t size)
220 {
221 sector_t used_size;
222
223 used_size = r5l_ring_distance(log, log->last_checkpoint,
224 log->log_start);
225
226 return log->device_size > used_size + size;
227 }
228
229 static void __r5l_set_io_unit_state(struct r5l_io_unit *io,
230 enum r5l_io_unit_state state)
231 {
232 if (WARN_ON(io->state >= state))
233 return;
234 io->state = state;
235 }
236
237 static void
238 r5c_return_dev_pending_writes(struct r5conf *conf, struct r5dev *dev,
239 struct bio_list *return_bi)
240 {
241 struct bio *wbi, *wbi2;
242
243 wbi = dev->written;
244 dev->written = NULL;
245 while (wbi && wbi->bi_iter.bi_sector <
246 dev->sector + STRIPE_SECTORS) {
247 wbi2 = r5_next_bio(wbi, dev->sector);
248 if (!raid5_dec_bi_active_stripes(wbi)) {
249 md_write_end(conf->mddev);
250 bio_list_add(return_bi, wbi);
251 }
252 wbi = wbi2;
253 }
254 }
255
256 void r5c_handle_cached_data_endio(struct r5conf *conf,
257 struct stripe_head *sh, int disks, struct bio_list *return_bi)
258 {
259 int i;
260
261 for (i = sh->disks; i--; ) {
262 if (sh->dev[i].written) {
263 set_bit(R5_UPTODATE, &sh->dev[i].flags);
264 r5c_return_dev_pending_writes(conf, &sh->dev[i],
265 return_bi);
266 bitmap_endwrite(conf->mddev->bitmap, sh->sector,
267 STRIPE_SECTORS,
268 !test_bit(STRIPE_DEGRADED, &sh->state),
269 0);
270 }
271 }
272 }
273
274 /* Check whether we should flush some stripes to free up stripe cache */
275 void r5c_check_stripe_cache_usage(struct r5conf *conf)
276 {
277 int total_cached;
278
279 if (!r5c_is_writeback(conf->log))
280 return;
281
282 total_cached = atomic_read(&conf->r5c_cached_partial_stripes) +
283 atomic_read(&conf->r5c_cached_full_stripes);
284
285 /*
286 * The following condition is true for either of the following:
287 * - stripe cache pressure high:
288 * total_cached > 3/4 min_nr_stripes ||
289 * empty_inactive_list_nr > 0
290 * - stripe cache pressure moderate:
291 * total_cached > 1/2 min_nr_stripes
292 */
293 if (total_cached > conf->min_nr_stripes * 1 / 2 ||
294 atomic_read(&conf->empty_inactive_list_nr) > 0)
295 r5l_wake_reclaim(conf->log, 0);
296 }
297
298 /*
299 * flush cache when there are R5C_FULL_STRIPE_FLUSH_BATCH or more full
300 * stripes in the cache
301 */
302 void r5c_check_cached_full_stripe(struct r5conf *conf)
303 {
304 if (!r5c_is_writeback(conf->log))
305 return;
306
307 /*
308 * wake up reclaim for R5C_FULL_STRIPE_FLUSH_BATCH cached stripes
309 * or a full stripe (chunk size / 4k stripes).
310 */
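/* e.g. with a 512 KiB chunk (1024 sectors) and 4 KiB stripes, this caps the
 * wake-up threshold at min(256, 1024 >> 3) = 128 full stripes.
 */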
311 if (atomic_read(&conf->r5c_cached_full_stripes) >=
312 min(R5C_FULL_STRIPE_FLUSH_BATCH,
313 conf->chunk_sectors >> STRIPE_SHIFT))
314 r5l_wake_reclaim(conf->log, 0);
315 }
316
317 /*
318 * Total log space (in sectors) needed to flush all data in cache
319 *
320 * Currently, writing-out phase automatically includes all pending writes
321 * to the same sector. So the reclaim of each stripe takes up to
322 * (conf->raid_disks + 1) pages of log space.
323 *
324 * To totally avoid deadlock due to log space, the code reserves
325 * (conf->raid_disks + 1) pages for each stripe in cache, which is not
326 * necessary in most cases.
327 *
328 * To improve this, we will need writing-out phase to be able to NOT include
329 * pending writes, which will reduce the requirement to
330 * (conf->max_degraded + 1) pages per stripe in cache.
331 */
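/* For example, with conf->raid_disks == 8, each cached stripe accounts for
 * (8 + 1) * BLOCK_SECTORS = 72 sectors (36 KiB) of required log space below.
 */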
332 static sector_t r5c_log_required_to_flush_cache(struct r5conf *conf)
333 {
334 struct r5l_log *log = conf->log;
335
336 if (!r5c_is_writeback(log))
337 return 0;
338
339 return BLOCK_SECTORS * (conf->raid_disks + 1) *
340 atomic_read(&log->stripe_in_journal_count);
341 }
342
343 /*
344 * evaluate log space usage and update R5C_LOG_TIGHT and R5C_LOG_CRITICAL
345 *
346 * R5C_LOG_TIGHT is set when free space on the log device is less than 3x of
347 * reclaim_required_space. R5C_LOG_CRITICAL is set when free space on the log
348 * device is less than 2x of reclaim_required_space.
349 */
350 static inline void r5c_update_log_state(struct r5l_log *log)
351 {
352 struct r5conf *conf = log->rdev->mddev->private;
353 sector_t free_space;
354 sector_t reclaim_space;
355
356 if (!r5c_is_writeback(log))
357 return;
358
359 free_space = r5l_ring_distance(log, log->log_start,
360 log->last_checkpoint);
361 reclaim_space = r5c_log_required_to_flush_cache(conf);
362 if (free_space < 2 * reclaim_space)
363 set_bit(R5C_LOG_CRITICAL, &conf->cache_state);
364 else
365 clear_bit(R5C_LOG_CRITICAL, &conf->cache_state);
366 if (free_space < 3 * reclaim_space)
367 set_bit(R5C_LOG_TIGHT, &conf->cache_state);
368 else
369 clear_bit(R5C_LOG_TIGHT, &conf->cache_state);
370 }
371
372 /*
373 * Put the stripe into writing-out phase by clearing STRIPE_R5C_CACHING.
374 * This function should only be called in write-back mode.
375 */
376 void r5c_make_stripe_write_out(struct stripe_head *sh)
377 {
378 struct r5conf *conf = sh->raid_conf;
379 struct r5l_log *log = conf->log;
380
381 BUG_ON(!r5c_is_writeback(log));
382
383 WARN_ON(!test_bit(STRIPE_R5C_CACHING, &sh->state));
384 clear_bit(STRIPE_R5C_CACHING, &sh->state);
385
386 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
387 atomic_inc(&conf->preread_active_stripes);
388
389 if (test_and_clear_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state)) {
390 BUG_ON(atomic_read(&conf->r5c_cached_partial_stripes) == 0);
391 atomic_dec(&conf->r5c_cached_partial_stripes);
392 }
393
394 if (test_and_clear_bit(STRIPE_R5C_FULL_STRIPE, &sh->state)) {
395 BUG_ON(atomic_read(&conf->r5c_cached_full_stripes) == 0);
396 atomic_dec(&conf->r5c_cached_full_stripes);
397 }
398 }
399
400 static void r5c_handle_data_cached(struct stripe_head *sh)
401 {
402 int i;
403
404 for (i = sh->disks; i--; )
405 if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) {
406 set_bit(R5_InJournal, &sh->dev[i].flags);
407 clear_bit(R5_LOCKED, &sh->dev[i].flags);
408 }
409 clear_bit(STRIPE_LOG_TRAPPED, &sh->state);
410 }
411
412 /*
413 * this journal write must contain full parity;
414 * it may also contain some data pages
415 */
416 static void r5c_handle_parity_cached(struct stripe_head *sh)
417 {
418 int i;
419
420 for (i = sh->disks; i--; )
421 if (test_bit(R5_InJournal, &sh->dev[i].flags))
422 set_bit(R5_Wantwrite, &sh->dev[i].flags);
423 }
424
425 /*
426 * Setting proper flags after writing (or flushing) data and/or parity to the
427 * log device. This is called from r5l_log_endio() or r5l_log_flush_endio().
428 */
429 static void r5c_finish_cache_stripe(struct stripe_head *sh)
430 {
431 struct r5l_log *log = sh->raid_conf->log;
432
433 if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) {
434 BUG_ON(test_bit(STRIPE_R5C_CACHING, &sh->state));
435 /*
436 * Set R5_InJournal for parity dev[pd_idx]. This means
437 * all data AND parity are in the journal. For RAID 6, it is
438 * NOT necessary to set the flag for dev[qd_idx], as the
439 * two parities are written out together.
440 */
441 set_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags);
442 } else if (test_bit(STRIPE_R5C_CACHING, &sh->state)) {
443 r5c_handle_data_cached(sh);
444 } else {
445 r5c_handle_parity_cached(sh);
446 set_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags);
447 }
448 }
449
450 static void r5l_io_run_stripes(struct r5l_io_unit *io)
451 {
452 struct stripe_head *sh, *next;
453
454 list_for_each_entry_safe(sh, next, &io->stripe_list, log_list) {
455 list_del_init(&sh->log_list);
456
457 r5c_finish_cache_stripe(sh);
458
459 set_bit(STRIPE_HANDLE, &sh->state);
460 raid5_release_stripe(sh);
461 }
462 }
463
464 static void r5l_log_run_stripes(struct r5l_log *log)
465 {
466 struct r5l_io_unit *io, *next;
467
468 assert_spin_locked(&log->io_list_lock);
469
470 list_for_each_entry_safe(io, next, &log->running_ios, log_sibling) {
471 /* don't change list order */
472 if (io->state < IO_UNIT_IO_END)
473 break;
474
475 list_move_tail(&io->log_sibling, &log->finished_ios);
476 r5l_io_run_stripes(io);
477 }
478 }
479
480 static void r5l_move_to_end_ios(struct r5l_log *log)
481 {
482 struct r5l_io_unit *io, *next;
483
484 assert_spin_locked(&log->io_list_lock);
485
486 list_for_each_entry_safe(io, next, &log->running_ios, log_sibling) {
487 /* don't change list order */
488 if (io->state < IO_UNIT_IO_END)
489 break;
490 list_move_tail(&io->log_sibling, &log->io_end_ios);
491 }
492 }
493
494 static void r5l_log_endio(struct bio *bio)
495 {
496 struct r5l_io_unit *io = bio->bi_private;
497 struct r5l_log *log = io->log;
498 unsigned long flags;
499
500 if (bio->bi_error)
501 md_error(log->rdev->mddev, log->rdev);
502
503 bio_put(bio);
504 mempool_free(io->meta_page, log->meta_pool);
505
506 spin_lock_irqsave(&log->io_list_lock, flags);
507 __r5l_set_io_unit_state(io, IO_UNIT_IO_END);
508 if (log->need_cache_flush)
509 r5l_move_to_end_ios(log);
510 else
511 r5l_log_run_stripes(log);
512 spin_unlock_irqrestore(&log->io_list_lock, flags);
513
514 if (log->need_cache_flush)
515 md_wakeup_thread(log->rdev->mddev->thread);
516 }
517
518 static void r5l_submit_current_io(struct r5l_log *log)
519 {
520 struct r5l_io_unit *io = log->current_io;
521 struct r5l_meta_block *block;
522 unsigned long flags;
523 u32 crc;
524
525 if (!io)
526 return;
527
528 block = page_address(io->meta_page);
529 block->meta_size = cpu_to_le32(io->meta_offset);
530 crc = crc32c_le(log->uuid_checksum, block, PAGE_SIZE);
531 block->checksum = cpu_to_le32(crc);
532
533 log->current_io = NULL;
534 spin_lock_irqsave(&log->io_list_lock, flags);
535 __r5l_set_io_unit_state(io, IO_UNIT_IO_START);
536 spin_unlock_irqrestore(&log->io_list_lock, flags);
537
538 submit_bio(io->current_bio);
539 }
540
541 static struct bio *r5l_bio_alloc(struct r5l_log *log)
542 {
543 struct bio *bio = bio_alloc_bioset(GFP_NOIO, BIO_MAX_PAGES, log->bs);
544
545 bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
546 bio->bi_bdev = log->rdev->bdev;
547 bio->bi_iter.bi_sector = log->rdev->data_offset + log->log_start;
548
549 return bio;
550 }
551
552 static void r5_reserve_log_entry(struct r5l_log *log, struct r5l_io_unit *io)
553 {
554 log->log_start = r5l_ring_add(log, log->log_start, BLOCK_SECTORS);
555
556 r5c_update_log_state(log);
557 /*
558 * If we filled up the log device, start from the beginning again,
559 * which will require a new bio.
560 *
561 * Note: for this to work properly the log size needs to be a multiple
562 * of BLOCK_SECTORS.
563 */
564 if (log->log_start == 0)
565 io->need_split_bio = true;
566
567 io->log_end = log->log_start;
568 }
569
570 static struct r5l_io_unit *r5l_new_meta(struct r5l_log *log)
571 {
572 struct r5l_io_unit *io;
573 struct r5l_meta_block *block;
574
575 io = mempool_alloc(log->io_pool, GFP_ATOMIC);
576 if (!io)
577 return NULL;
578 memset(io, 0, sizeof(*io));
579
580 io->log = log;
581 INIT_LIST_HEAD(&io->log_sibling);
582 INIT_LIST_HEAD(&io->stripe_list);
583 io->state = IO_UNIT_RUNNING;
584
585 io->meta_page = mempool_alloc(log->meta_pool, GFP_NOIO);
586 block = page_address(io->meta_page);
587 clear_page(block);
588 block->magic = cpu_to_le32(R5LOG_MAGIC);
589 block->version = R5LOG_VERSION;
590 block->seq = cpu_to_le64(log->seq);
591 block->position = cpu_to_le64(log->log_start);
592
593 io->log_start = log->log_start;
594 io->meta_offset = sizeof(struct r5l_meta_block);
595 io->seq = log->seq++;
596
597 io->current_bio = r5l_bio_alloc(log);
598 io->current_bio->bi_end_io = r5l_log_endio;
599 io->current_bio->bi_private = io;
600 bio_add_page(io->current_bio, io->meta_page, PAGE_SIZE, 0);
601
602 r5_reserve_log_entry(log, io);
603
604 spin_lock_irq(&log->io_list_lock);
605 list_add_tail(&io->log_sibling, &log->running_ios);
606 spin_unlock_irq(&log->io_list_lock);
607
608 return io;
609 }
610
611 static int r5l_get_meta(struct r5l_log *log, unsigned int payload_size)
612 {
613 if (log->current_io &&
614 log->current_io->meta_offset + payload_size > PAGE_SIZE)
615 r5l_submit_current_io(log);
616
617 if (!log->current_io) {
618 log->current_io = r5l_new_meta(log);
619 if (!log->current_io)
620 return -ENOMEM;
621 }
622
623 return 0;
624 }
625
626 static void r5l_append_payload_meta(struct r5l_log *log, u16 type,
627 sector_t location,
628 u32 checksum1, u32 checksum2,
629 bool checksum2_valid)
630 {
631 struct r5l_io_unit *io = log->current_io;
632 struct r5l_payload_data_parity *payload;
633
634 payload = page_address(io->meta_page) + io->meta_offset;
635 payload->header.type = cpu_to_le16(type);
636 payload->header.flags = cpu_to_le16(0);
637 payload->size = cpu_to_le32((1 + !!checksum2_valid) <<
638 (PAGE_SHIFT - 9));
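/* payload->size is in 512-byte sectors: one 4 KiB page (8 sectors) per checksum that follows */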
639 payload->location = cpu_to_le64(location);
640 payload->checksum[0] = cpu_to_le32(checksum1);
641 if (checksum2_valid)
642 payload->checksum[1] = cpu_to_le32(checksum2);
643
644 io->meta_offset += sizeof(struct r5l_payload_data_parity) +
645 sizeof(__le32) * (1 + !!checksum2_valid);
646 }
647
648 static void r5l_append_payload_page(struct r5l_log *log, struct page *page)
649 {
650 struct r5l_io_unit *io = log->current_io;
651
652 if (io->need_split_bio) {
653 struct bio *prev = io->current_bio;
654
655 io->current_bio = r5l_bio_alloc(log);
656 bio_chain(io->current_bio, prev);
657
658 submit_bio(prev);
659 }
660
661 if (!bio_add_page(io->current_bio, page, PAGE_SIZE, 0))
662 BUG();
663
664 r5_reserve_log_entry(log, io);
665 }
666
667 static int r5l_log_stripe(struct r5l_log *log, struct stripe_head *sh,
668 int data_pages, int parity_pages)
669 {
670 int i;
671 int meta_size;
672 int ret;
673 struct r5l_io_unit *io;
674
675 meta_size =
676 ((sizeof(struct r5l_payload_data_parity) + sizeof(__le32))
677 * data_pages) +
678 sizeof(struct r5l_payload_data_parity) +
679 sizeof(__le32) * parity_pages;
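/* each data page gets its own payload header plus one checksum; the parity
 * pages share a single payload header with one checksum per parity page */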
680
681 ret = r5l_get_meta(log, meta_size);
682 if (ret)
683 return ret;
684
685 io = log->current_io;
686
687 for (i = 0; i < sh->disks; i++) {
688 if (!test_bit(R5_Wantwrite, &sh->dev[i].flags) ||
689 test_bit(R5_InJournal, &sh->dev[i].flags))
690 continue;
691 if (i == sh->pd_idx || i == sh->qd_idx)
692 continue;
693 r5l_append_payload_meta(log, R5LOG_PAYLOAD_DATA,
694 raid5_compute_blocknr(sh, i, 0),
695 sh->dev[i].log_checksum, 0, false);
696 r5l_append_payload_page(log, sh->dev[i].page);
697 }
698
699 if (parity_pages == 2) {
700 r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY,
701 sh->sector, sh->dev[sh->pd_idx].log_checksum,
702 sh->dev[sh->qd_idx].log_checksum, true);
703 r5l_append_payload_page(log, sh->dev[sh->pd_idx].page);
704 r5l_append_payload_page(log, sh->dev[sh->qd_idx].page);
705 } else if (parity_pages == 1) {
706 r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY,
707 sh->sector, sh->dev[sh->pd_idx].log_checksum,
708 0, false);
709 r5l_append_payload_page(log, sh->dev[sh->pd_idx].page);
710 } else /* Just writing data, not parity, in caching phase */
711 BUG_ON(parity_pages != 0);
712
713 list_add_tail(&sh->log_list, &io->stripe_list);
714 atomic_inc(&io->pending_stripe);
715 sh->log_io = io;
716
717 if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH)
718 return 0;
719
720 if (sh->log_start == MaxSector) {
721 BUG_ON(!list_empty(&sh->r5c));
722 sh->log_start = io->log_start;
723 spin_lock_irq(&log->stripe_in_journal_lock);
724 list_add_tail(&sh->r5c,
725 &log->stripe_in_journal_list);
726 spin_unlock_irq(&log->stripe_in_journal_lock);
727 atomic_inc(&log->stripe_in_journal_count);
728 }
729 return 0;
730 }
731
732 /* add stripe to no_space_stripes, and then wake up reclaim */
733 static inline void r5l_add_no_space_stripe(struct r5l_log *log,
734 struct stripe_head *sh)
735 {
736 spin_lock(&log->no_space_stripes_lock);
737 list_add_tail(&sh->log_list, &log->no_space_stripes);
738 spin_unlock(&log->no_space_stripes_lock);
739 }
740
741 /*
742 * running in raid5d, where reclaim could wait for raid5d too (when it flushes
743 * data from log to raid disks), so we shouldn't wait for reclaim here
744 */
745 int r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh)
746 {
747 struct r5conf *conf = sh->raid_conf;
748 int write_disks = 0;
749 int data_pages, parity_pages;
750 int reserve;
751 int i;
752 int ret = 0;
753 bool wake_reclaim = false;
754
755 if (!log)
756 return -EAGAIN;
757 /* Don't support stripe batch */
758 if (sh->log_io || !test_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags) ||
759 test_bit(STRIPE_SYNCING, &sh->state)) {
760 /* the stripe is written to log, we start writing it to raid */
761 clear_bit(STRIPE_LOG_TRAPPED, &sh->state);
762 return -EAGAIN;
763 }
764
765 WARN_ON(test_bit(STRIPE_R5C_CACHING, &sh->state));
766
767 for (i = 0; i < sh->disks; i++) {
768 void *addr;
769
770 if (!test_bit(R5_Wantwrite, &sh->dev[i].flags) ||
771 test_bit(R5_InJournal, &sh->dev[i].flags))
772 continue;
773
774 write_disks++;
775 /* checksum is already calculated in last run */
776 if (test_bit(STRIPE_LOG_TRAPPED, &sh->state))
777 continue;
778 addr = kmap_atomic(sh->dev[i].page);
779 sh->dev[i].log_checksum = crc32c_le(log->uuid_checksum,
780 addr, PAGE_SIZE);
781 kunmap_atomic(addr);
782 }
783 parity_pages = 1 + !!(sh->qd_idx >= 0);
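/* RAID 5 journals one parity page; RAID 6 (qd_idx >= 0) journals two */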
784 data_pages = write_disks - parity_pages;
785
786 set_bit(STRIPE_LOG_TRAPPED, &sh->state);
787 /*
788 * The stripe must enter state machine again to finish the write, so
789 * don't delay.
790 */
791 clear_bit(STRIPE_DELAYED, &sh->state);
792 atomic_inc(&sh->count);
793
794 mutex_lock(&log->io_mutex);
795 /* meta + data */
796 reserve = (1 + write_disks) << (PAGE_SHIFT - 9);
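/* one meta block plus one block per data/parity page, in sectors;
 * e.g. 4 data pages + 2 parity pages -> (1 + 6) * 8 = 56 sectors */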
797
798 if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) {
799 if (!r5l_has_free_space(log, reserve)) {
800 r5l_add_no_space_stripe(log, sh);
801 wake_reclaim = true;
802 } else {
803 ret = r5l_log_stripe(log, sh, data_pages, parity_pages);
804 if (ret) {
805 spin_lock_irq(&log->io_list_lock);
806 list_add_tail(&sh->log_list,
807 &log->no_mem_stripes);
808 spin_unlock_irq(&log->io_list_lock);
809 }
810 }
811 } else { /* R5C_JOURNAL_MODE_WRITE_BACK */
812 /*
813 * log space critical, do not process stripes that are
814 * not in cache yet (sh->log_start == MaxSector).
815 */
816 if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state) &&
817 sh->log_start == MaxSector) {
818 r5l_add_no_space_stripe(log, sh);
819 wake_reclaim = true;
820 reserve = 0;
821 } else if (!r5l_has_free_space(log, reserve)) {
822 if (sh->log_start == log->last_checkpoint)
823 BUG();
824 else
825 r5l_add_no_space_stripe(log, sh);
826 } else {
827 ret = r5l_log_stripe(log, sh, data_pages, parity_pages);
828 if (ret) {
829 spin_lock_irq(&log->io_list_lock);
830 list_add_tail(&sh->log_list,
831 &log->no_mem_stripes);
832 spin_unlock_irq(&log->io_list_lock);
833 }
834 }
835 }
836
837 mutex_unlock(&log->io_mutex);
838 if (wake_reclaim)
839 r5l_wake_reclaim(log, reserve);
840 return 0;
841 }
842
843 void r5l_write_stripe_run(struct r5l_log *log)
844 {
845 if (!log)
846 return;
847 mutex_lock(&log->io_mutex);
848 r5l_submit_current_io(log);
849 mutex_unlock(&log->io_mutex);
850 }
851
852 int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio)
853 {
854 if (!log)
855 return -ENODEV;
856 /*
857 * we flush log disk cache first, then write stripe data to raid disks.
858 * So if bio is finished, the log disk cache is flushed already. The
859 * recovery guarantees we can recover the bio from the log disk, so we
860 * don't need to flush again
861 */
862 if (bio->bi_iter.bi_size == 0) {
863 bio_endio(bio);
864 return 0;
865 }
866 bio->bi_opf &= ~REQ_PREFLUSH;
867 return -EAGAIN;
868 }
869
870 /* This will run after log space is reclaimed */
871 static void r5l_run_no_space_stripes(struct r5l_log *log)
872 {
873 struct stripe_head *sh;
874
875 spin_lock(&log->no_space_stripes_lock);
876 while (!list_empty(&log->no_space_stripes)) {
877 sh = list_first_entry(&log->no_space_stripes,
878 struct stripe_head, log_list);
879 list_del_init(&sh->log_list);
880 set_bit(STRIPE_HANDLE, &sh->state);
881 raid5_release_stripe(sh);
882 }
883 spin_unlock(&log->no_space_stripes_lock);
884 }
885
886 /*
887 * calculate new last_checkpoint
888 * for write through mode, returns log->next_checkpoint
889 * for write back, returns log_start of first sh in stripe_in_journal_list
890 */
891 static sector_t r5c_calculate_new_cp(struct r5conf *conf)
892 {
893 struct stripe_head *sh;
894 struct r5l_log *log = conf->log;
895 sector_t new_cp;
896 unsigned long flags;
897
898 if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH)
899 return log->next_checkpoint;
900
901 spin_lock_irqsave(&log->stripe_in_journal_lock, flags);
902 if (list_empty(&conf->log->stripe_in_journal_list)) {
903 /* all stripes flushed */
904 spin_unlock_irqrestore(&log->stripe_in_journal_lock, flags);
905 return log->next_checkpoint;
906 }
907 sh = list_first_entry(&conf->log->stripe_in_journal_list,
908 struct stripe_head, r5c);
909 new_cp = sh->log_start;
910 spin_unlock_irqrestore(&log->stripe_in_journal_lock, flags);
911 return new_cp;
912 }
913
914 static sector_t r5l_reclaimable_space(struct r5l_log *log)
915 {
916 struct r5conf *conf = log->rdev->mddev->private;
917
918 return r5l_ring_distance(log, log->last_checkpoint,
919 r5c_calculate_new_cp(conf));
920 }
921
922 static void r5l_run_no_mem_stripe(struct r5l_log *log)
923 {
924 struct stripe_head *sh;
925
926 assert_spin_locked(&log->io_list_lock);
927
928 if (!list_empty(&log->no_mem_stripes)) {
929 sh = list_first_entry(&log->no_mem_stripes,
930 struct stripe_head, log_list);
931 list_del_init(&sh->log_list);
932 set_bit(STRIPE_HANDLE, &sh->state);
933 raid5_release_stripe(sh);
934 }
935 }
936
937 static bool r5l_complete_finished_ios(struct r5l_log *log)
938 {
939 struct r5l_io_unit *io, *next;
940 bool found = false;
941
942 assert_spin_locked(&log->io_list_lock);
943
944 list_for_each_entry_safe(io, next, &log->finished_ios, log_sibling) {
945 /* don't change list order */
946 if (io->state < IO_UNIT_STRIPE_END)
947 break;
948
949 log->next_checkpoint = io->log_start;
950 log->next_cp_seq = io->seq;
951
952 list_del(&io->log_sibling);
953 mempool_free(io, log->io_pool);
954 r5l_run_no_mem_stripe(log);
955
956 found = true;
957 }
958
959 return found;
960 }
961
962 static void __r5l_stripe_write_finished(struct r5l_io_unit *io)
963 {
964 struct r5l_log *log = io->log;
965 struct r5conf *conf = log->rdev->mddev->private;
966 unsigned long flags;
967
968 spin_lock_irqsave(&log->io_list_lock, flags);
969 __r5l_set_io_unit_state(io, IO_UNIT_STRIPE_END);
970
971 if (!r5l_complete_finished_ios(log)) {
972 spin_unlock_irqrestore(&log->io_list_lock, flags);
973 return;
974 }
975
976 if (r5l_reclaimable_space(log) > log->max_free_space ||
977 test_bit(R5C_LOG_TIGHT, &conf->cache_state))
978 r5l_wake_reclaim(log, 0);
979
980 spin_unlock_irqrestore(&log->io_list_lock, flags);
981 wake_up(&log->iounit_wait);
982 }
983
984 void r5l_stripe_write_finished(struct stripe_head *sh)
985 {
986 struct r5l_io_unit *io;
987
988 io = sh->log_io;
989 sh->log_io = NULL;
990
991 if (io && atomic_dec_and_test(&io->pending_stripe))
992 __r5l_stripe_write_finished(io);
993 }
994
995 static void r5l_log_flush_endio(struct bio *bio)
996 {
997 struct r5l_log *log = container_of(bio, struct r5l_log,
998 flush_bio);
999 unsigned long flags;
1000 struct r5l_io_unit *io;
1001
1002 if (bio->bi_error)
1003 md_error(log->rdev->mddev, log->rdev);
1004
1005 spin_lock_irqsave(&log->io_list_lock, flags);
1006 list_for_each_entry(io, &log->flushing_ios, log_sibling)
1007 r5l_io_run_stripes(io);
1008 list_splice_tail_init(&log->flushing_ios, &log->finished_ios);
1009 spin_unlock_irqrestore(&log->io_list_lock, flags);
1010 }
1011
1012 /*
1013 * Starting dispatch IO to raid.
1014 * A log consists of io_units, each led by a meta block. One situation we
1015 * want to avoid: a broken meta block in the middle of the log means recovery
1016 * can't find the meta blocks at the head of the log. If an operation needs a
1017 * meta block at the head to be persistent in the log, we must make sure the
1018 * meta blocks before it are persistent in the log too. A case is:
1019 *
1020 * stripe data/parity is in the log; we start writing the stripe to the raid
1021 * disks. The stripe data/parity must be persistent in the log before we do
1022 * the write to the raid disks. The solution is to strictly maintain io_unit
1023 * list order: we only write the stripes of an io_unit to the raid disks once
1024 * it and every io_unit before it have their data/parity in the log.
1025 */
1026 void r5l_flush_stripe_to_raid(struct r5l_log *log)
1027 {
1028 bool do_flush;
1029
1030 if (!log || !log->need_cache_flush)
1031 return;
1032
1033 spin_lock_irq(&log->io_list_lock);
1034 /* flush bio is running */
1035 if (!list_empty(&log->flushing_ios)) {
1036 spin_unlock_irq(&log->io_list_lock);
1037 return;
1038 }
1039 list_splice_tail_init(&log->io_end_ios, &log->flushing_ios);
1040 do_flush = !list_empty(&log->flushing_ios);
1041 spin_unlock_irq(&log->io_list_lock);
1042
1043 if (!do_flush)
1044 return;
1045 bio_reset(&log->flush_bio);
1046 log->flush_bio.bi_bdev = log->rdev->bdev;
1047 log->flush_bio.bi_end_io = r5l_log_flush_endio;
1048 bio_set_op_attrs(&log->flush_bio, REQ_OP_WRITE, WRITE_FLUSH);
1049 submit_bio(&log->flush_bio);
1050 }
1051
1052 static void r5l_write_super(struct r5l_log *log, sector_t cp);
1053 static void r5l_write_super_and_discard_space(struct r5l_log *log,
1054 sector_t end)
1055 {
1056 struct block_device *bdev = log->rdev->bdev;
1057 struct mddev *mddev;
1058
1059 r5l_write_super(log, end);
1060
1061 if (!blk_queue_discard(bdev_get_queue(bdev)))
1062 return;
1063
1064 mddev = log->rdev->mddev;
1065 /*
1066 * Discard could zero data, so before discard we must make sure
1067 * superblock is updated to new log tail. Updating superblock (either
1068 * directly call md_update_sb() or depend on md thread) must hold
1069 * reconfig mutex. On the other hand, raid5_quiesce is called with
1070 * reconfig_mutex held. The first step of raid5_quiesce() is waiting
1071 * for all IO to finish, hence waiting for the reclaim thread, while the
1072 * reclaim thread is calling this function and waiting for reconfig mutex.
1073 * So there is a deadlock. We work around this issue with a trylock.
1074 * FIXME: we could miss discard if we can't take reconfig mutex
1075 */
1076 set_mask_bits(&mddev->flags, 0,
1077 BIT(MD_CHANGE_DEVS) | BIT(MD_CHANGE_PENDING));
1078 if (!mddev_trylock(mddev))
1079 return;
1080 md_update_sb(mddev, 1);
1081 mddev_unlock(mddev);
1082
1083 /* discard IO error really doesn't matter, ignore it */
1084 if (log->last_checkpoint < end) {
1085 blkdev_issue_discard(bdev,
1086 log->last_checkpoint + log->rdev->data_offset,
1087 end - log->last_checkpoint, GFP_NOIO, 0);
1088 } else {
1089 blkdev_issue_discard(bdev,
1090 log->last_checkpoint + log->rdev->data_offset,
1091 log->device_size - log->last_checkpoint,
1092 GFP_NOIO, 0);
1093 blkdev_issue_discard(bdev, log->rdev->data_offset, end,
1094 GFP_NOIO, 0);
1095 }
1096 }
1097
1098 /*
1099 * r5c_flush_stripe moves stripe from cached list to handle_list. When called,
1100 * the stripe must be on r5c_cached_full_stripes or r5c_cached_partial_stripes.
1101 *
1102 * must hold conf->device_lock
1103 */
1104 static void r5c_flush_stripe(struct r5conf *conf, struct stripe_head *sh)
1105 {
1106 BUG_ON(list_empty(&sh->lru));
1107 BUG_ON(!test_bit(STRIPE_R5C_CACHING, &sh->state));
1108 BUG_ON(test_bit(STRIPE_HANDLE, &sh->state));
1109
1110 /*
1111 * The stripe is not ON_RELEASE_LIST, so it is safe to call
1112 * raid5_release_stripe() while holding conf->device_lock
1113 */
1114 BUG_ON(test_bit(STRIPE_ON_RELEASE_LIST, &sh->state));
1115 assert_spin_locked(&conf->device_lock);
1116
1117 list_del_init(&sh->lru);
1118 atomic_inc(&sh->count);
1119
1120 set_bit(STRIPE_HANDLE, &sh->state);
1121 atomic_inc(&conf->active_stripes);
1122 r5c_make_stripe_write_out(sh);
1123
1124 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
1125 atomic_inc(&conf->preread_active_stripes);
1126 raid5_release_stripe(sh);
1127 }
1128
1129 /*
1130 * if num == 0, flush all full stripes
1131 * if num > 0, flush all full stripes. If fewer than num full stripes are
1132 * flushed, flush some partial stripes until num stripes in total are
1133 * flushed or there are no more cached stripes.
1134 */
1135 void r5c_flush_cache(struct r5conf *conf, int num)
1136 {
1137 int count;
1138 struct stripe_head *sh, *next;
1139
1140 assert_spin_locked(&conf->device_lock);
1141 if (!conf->log)
1142 return;
1143
1144 count = 0;
1145 list_for_each_entry_safe(sh, next, &conf->r5c_full_stripe_list, lru) {
1146 r5c_flush_stripe(conf, sh);
1147 count++;
1148 }
1149
1150 if (count >= num)
1151 return;
1152 list_for_each_entry_safe(sh, next,
1153 &conf->r5c_partial_stripe_list, lru) {
1154 r5c_flush_stripe(conf, sh);
1155 if (++count >= num)
1156 break;
1157 }
1158 }
1159
1160 static void r5c_do_reclaim(struct r5conf *conf)
1161 {
1162 struct r5l_log *log = conf->log;
1163 struct stripe_head *sh;
1164 int count = 0;
1165 unsigned long flags;
1166 int total_cached;
1167 int stripes_to_flush;
1168
1169 if (!r5c_is_writeback(log))
1170 return;
1171
1172 total_cached = atomic_read(&conf->r5c_cached_partial_stripes) +
1173 atomic_read(&conf->r5c_cached_full_stripes);
1174
1175 if (total_cached > conf->min_nr_stripes * 3 / 4 ||
1176 atomic_read(&conf->empty_inactive_list_nr) > 0)
1177 /*
1178 * if stripe cache pressure high, flush all full stripes and
1179 * some partial stripes
1180 */
1181 stripes_to_flush = R5C_RECLAIM_STRIPE_GROUP;
1182 else if (total_cached > conf->min_nr_stripes * 1 / 2 ||
1183 atomic_read(&conf->r5c_cached_full_stripes) >
1184 R5C_FULL_STRIPE_FLUSH_BATCH)
1185 /*
1186 * if stripe cache pressure is moderate, or if there are many full
1187 * stripes, flush all full stripes
1188 */
1189 stripes_to_flush = 0;
1190 else
1191 /* no need to flush */
1192 stripes_to_flush = -1;
1193
1194 if (stripes_to_flush >= 0) {
1195 spin_lock_irqsave(&conf->device_lock, flags);
1196 r5c_flush_cache(conf, stripes_to_flush);
1197 spin_unlock_irqrestore(&conf->device_lock, flags);
1198 }
1199
1200 /* if log space is tight, flush stripes on stripe_in_journal_list */
1201 if (test_bit(R5C_LOG_TIGHT, &conf->cache_state)) {
1202 spin_lock_irqsave(&log->stripe_in_journal_lock, flags);
1203 spin_lock(&conf->device_lock);
1204 list_for_each_entry(sh, &log->stripe_in_journal_list, r5c) {
1205 /*
1206 * stripes on stripe_in_journal_list could be in any
1207 * state of the stripe_cache state machine. In this
1208 * case, we only want to flush stripe on
1209 * r5c_cached_full/partial_stripes. The following
1210 * condition makes sure the stripe is on one of the
1211 * two lists.
1212 */
1213 if (!list_empty(&sh->lru) &&
1214 !test_bit(STRIPE_HANDLE, &sh->state) &&
1215 atomic_read(&sh->count) == 0) {
1216 r5c_flush_stripe(conf, sh);
1217 }
1218 if (count++ >= R5C_RECLAIM_STRIPE_GROUP)
1219 break;
1220 }
1221 spin_unlock(&conf->device_lock);
1222 spin_unlock_irqrestore(&log->stripe_in_journal_lock, flags);
1223 }
1224 md_wakeup_thread(conf->mddev->thread);
1225 }
1226
1227 static void r5l_do_reclaim(struct r5l_log *log)
1228 {
1229 struct r5conf *conf = log->rdev->mddev->private;
1230 sector_t reclaim_target = xchg(&log->reclaim_target, 0);
1231 sector_t reclaimable;
1232 sector_t next_checkpoint;
1233 bool write_super;
1234
1235 spin_lock_irq(&log->io_list_lock);
1236 write_super = r5l_reclaimable_space(log) > log->max_free_space ||
1237 reclaim_target != 0 || !list_empty(&log->no_space_stripes);
1238 /*
1239 * move proper io_unit to reclaim list. We should not change the order.
1240 * reclaimable/unreclaimable io_units can be mixed in the list, and we
1241 * shouldn't reuse the space of an unreclaimable io_unit
1242 */
1243 while (1) {
1244 reclaimable = r5l_reclaimable_space(log);
1245 if (reclaimable >= reclaim_target ||
1246 (list_empty(&log->running_ios) &&
1247 list_empty(&log->io_end_ios) &&
1248 list_empty(&log->flushing_ios) &&
1249 list_empty(&log->finished_ios)))
1250 break;
1251
1252 md_wakeup_thread(log->rdev->mddev->thread);
1253 wait_event_lock_irq(log->iounit_wait,
1254 r5l_reclaimable_space(log) > reclaimable,
1255 log->io_list_lock);
1256 }
1257
1258 next_checkpoint = r5c_calculate_new_cp(conf);
1259 spin_unlock_irq(&log->io_list_lock);
1260
1261 BUG_ON(reclaimable < 0);
1262
1263 if (reclaimable == 0 || !write_super)
1264 return;
1265
1266 /*
1267 * write_super will flush cache of each raid disk. We must write super
1268 * here, because the log area might be reused soon and we don't want to
1269 * confuse recovery
1270 */
1271 r5l_write_super_and_discard_space(log, next_checkpoint);
1272
1273 mutex_lock(&log->io_mutex);
1274 log->last_checkpoint = next_checkpoint;
1275 r5c_update_log_state(log);
1276 mutex_unlock(&log->io_mutex);
1277
1278 r5l_run_no_space_stripes(log);
1279 }
1280
1281 static void r5l_reclaim_thread(struct md_thread *thread)
1282 {
1283 struct mddev *mddev = thread->mddev;
1284 struct r5conf *conf = mddev->private;
1285 struct r5l_log *log = conf->log;
1286
1287 if (!log)
1288 return;
1289 r5c_do_reclaim(conf);
1290 r5l_do_reclaim(log);
1291 }
1292
1293 void r5l_wake_reclaim(struct r5l_log *log, sector_t space)
1294 {
1295 unsigned long target;
1296 unsigned long new = (unsigned long)space; /* overflow in theory */
1297
1298 if (!log)
1299 return;
1300 do {
1301 target = log->reclaim_target;
1302 if (new < target)
1303 return;
1304 } while (cmpxchg(&log->reclaim_target, target, new) != target);
1305 md_wakeup_thread(log->reclaim_thread);
1306 }
1307
1308 void r5l_quiesce(struct r5l_log *log, int state)
1309 {
1310 struct mddev *mddev;
1311 if (!log || state == 2)
1312 return;
1313 if (state == 0) {
1314 /*
1315 * This is a special case for hotadd. In suspend, the array has
1316 * no journal. In resume, journal is initialized as well as the
1317 * reclaim thread.
1318 */
1319 if (log->reclaim_thread)
1320 return;
1321 log->reclaim_thread = md_register_thread(r5l_reclaim_thread,
1322 log->rdev->mddev, "reclaim");
1323 log->reclaim_thread->timeout = R5C_RECLAIM_WAKEUP_INTERVAL;
1324 } else if (state == 1) {
1325 /* make sure r5l_write_super_and_discard_space exits */
1326 mddev = log->rdev->mddev;
1327 wake_up(&mddev->sb_wait);
1328 r5l_wake_reclaim(log, MaxSector);
1329 md_unregister_thread(&log->reclaim_thread);
1330 r5l_do_reclaim(log);
1331 }
1332 }
1333
1334 bool r5l_log_disk_error(struct r5conf *conf)
1335 {
1336 struct r5l_log *log;
1337 bool ret;
1338 /* don't allow write if journal disk is missing */
1339 rcu_read_lock();
1340 log = rcu_dereference(conf->log);
1341
1342 if (!log)
1343 ret = test_bit(MD_HAS_JOURNAL, &conf->mddev->flags);
1344 else
1345 ret = test_bit(Faulty, &log->rdev->flags);
1346 rcu_read_unlock();
1347 return ret;
1348 }
1349
1350 struct r5l_recovery_ctx {
1351 struct page *meta_page; /* current meta */
1352 sector_t meta_total_blocks; /* total size of current meta and data */
1353 sector_t pos; /* recovery position */
1354 u64 seq; /* recovery position seq */
1355 };
1356
1357 static int r5l_read_meta_block(struct r5l_log *log,
1358 struct r5l_recovery_ctx *ctx)
1359 {
1360 struct page *page = ctx->meta_page;
1361 struct r5l_meta_block *mb;
1362 u32 crc, stored_crc;
1363
1364 if (!sync_page_io(log->rdev, ctx->pos, PAGE_SIZE, page, REQ_OP_READ, 0,
1365 false))
1366 return -EIO;
1367
1368 mb = page_address(page);
1369 stored_crc = le32_to_cpu(mb->checksum);
1370 mb->checksum = 0;
1371
1372 if (le32_to_cpu(mb->magic) != R5LOG_MAGIC ||
1373 le64_to_cpu(mb->seq) != ctx->seq ||
1374 mb->version != R5LOG_VERSION ||
1375 le64_to_cpu(mb->position) != ctx->pos)
1376 return -EINVAL;
1377
1378 crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE);
1379 if (stored_crc != crc)
1380 return -EINVAL;
1381
1382 if (le32_to_cpu(mb->meta_size) > PAGE_SIZE)
1383 return -EINVAL;
1384
1385 ctx->meta_total_blocks = BLOCK_SECTORS;
1386
1387 return 0;
1388 }
1389
1390 static int r5l_recovery_flush_one_stripe(struct r5l_log *log,
1391 struct r5l_recovery_ctx *ctx,
1392 sector_t stripe_sect,
1393 int *offset)
1394 {
1395 struct r5conf *conf = log->rdev->mddev->private;
1396 struct stripe_head *sh;
1397 struct r5l_payload_data_parity *payload;
1398 int disk_index;
1399
1400 sh = raid5_get_active_stripe(conf, stripe_sect, 0, 0, 0);
1401 while (1) {
1402 sector_t log_offset = r5l_ring_add(log, ctx->pos,
1403 ctx->meta_total_blocks);
1404 payload = page_address(ctx->meta_page) + *offset;
1405
1406 if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_DATA) {
1407 raid5_compute_sector(conf,
1408 le64_to_cpu(payload->location), 0,
1409 &disk_index, sh);
1410
1411 sync_page_io(log->rdev, log_offset, PAGE_SIZE,
1412 sh->dev[disk_index].page, REQ_OP_READ, 0,
1413 false);
1414 sh->dev[disk_index].log_checksum =
1415 le32_to_cpu(payload->checksum[0]);
1416 set_bit(R5_Wantwrite, &sh->dev[disk_index].flags);
1417 } else {
1418 disk_index = sh->pd_idx;
1419 sync_page_io(log->rdev, log_offset, PAGE_SIZE,
1420 sh->dev[disk_index].page, REQ_OP_READ, 0,
1421 false);
1422 sh->dev[disk_index].log_checksum =
1423 le32_to_cpu(payload->checksum[0]);
1424 set_bit(R5_Wantwrite, &sh->dev[disk_index].flags);
1425
1426 if (sh->qd_idx >= 0) {
1427 disk_index = sh->qd_idx;
1428 sync_page_io(log->rdev,
1429 r5l_ring_add(log, log_offset, BLOCK_SECTORS),
1430 PAGE_SIZE, sh->dev[disk_index].page,
1431 REQ_OP_READ, 0, false);
1432 sh->dev[disk_index].log_checksum =
1433 le32_to_cpu(payload->checksum[1]);
1434 set_bit(R5_Wantwrite,
1435 &sh->dev[disk_index].flags);
1436 }
1437 }
1438
1439 ctx->meta_total_blocks += le32_to_cpu(payload->size);
1440 *offset += sizeof(struct r5l_payload_data_parity) +
1441 sizeof(__le32) *
1442 (le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9));
1443 if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_PARITY)
1444 break;
1445 }
1446
1447 for (disk_index = 0; disk_index < sh->disks; disk_index++) {
1448 void *addr;
1449 u32 checksum;
1450
1451 if (!test_bit(R5_Wantwrite, &sh->dev[disk_index].flags))
1452 continue;
1453 addr = kmap_atomic(sh->dev[disk_index].page);
1454 checksum = crc32c_le(log->uuid_checksum, addr, PAGE_SIZE);
1455 kunmap_atomic(addr);
1456 if (checksum != sh->dev[disk_index].log_checksum)
1457 goto error;
1458 }
1459
1460 for (disk_index = 0; disk_index < sh->disks; disk_index++) {
1461 struct md_rdev *rdev, *rrdev;
1462
1463 if (!test_and_clear_bit(R5_Wantwrite,
1464 &sh->dev[disk_index].flags))
1465 continue;
1466
1467 /* in case device is broken */
1468 rcu_read_lock();
1469 rdev = rcu_dereference(conf->disks[disk_index].rdev);
1470 if (rdev) {
1471 atomic_inc(&rdev->nr_pending);
1472 rcu_read_unlock();
1473 sync_page_io(rdev, stripe_sect, PAGE_SIZE,
1474 sh->dev[disk_index].page, REQ_OP_WRITE, 0,
1475 false);
1476 rdev_dec_pending(rdev, rdev->mddev);
1477 rcu_read_lock();
1478 }
1479 rrdev = rcu_dereference(conf->disks[disk_index].replacement);
1480 if (rrdev) {
1481 atomic_inc(&rrdev->nr_pending);
1482 rcu_read_unlock();
1483 sync_page_io(rrdev, stripe_sect, PAGE_SIZE,
1484 sh->dev[disk_index].page, REQ_OP_WRITE, 0,
1485 false);
1486 rdev_dec_pending(rrdev, rrdev->mddev);
1487 rcu_read_lock();
1488 }
1489 rcu_read_unlock();
1490 }
1491 raid5_release_stripe(sh);
1492 return 0;
1493
1494 error:
1495 for (disk_index = 0; disk_index < sh->disks; disk_index++)
1496 sh->dev[disk_index].flags = 0;
1497 raid5_release_stripe(sh);
1498 return -EINVAL;
1499 }
1500
1501 static int r5l_recovery_flush_one_meta(struct r5l_log *log,
1502 struct r5l_recovery_ctx *ctx)
1503 {
1504 struct r5conf *conf = log->rdev->mddev->private;
1505 struct r5l_payload_data_parity *payload;
1506 struct r5l_meta_block *mb;
1507 int offset;
1508 sector_t stripe_sector;
1509
1510 mb = page_address(ctx->meta_page);
1511 offset = sizeof(struct r5l_meta_block);
1512
1513 while (offset < le32_to_cpu(mb->meta_size)) {
1514 int dd;
1515
1516 payload = (void *)mb + offset;
1517 stripe_sector = raid5_compute_sector(conf,
1518 le64_to_cpu(payload->location), 0, &dd, NULL);
1519 if (r5l_recovery_flush_one_stripe(log, ctx, stripe_sector,
1520 &offset))
1521 return -EINVAL;
1522 }
1523 return 0;
1524 }
1525
1526 /* copy data/parity from log to raid disks */
1527 static void r5l_recovery_flush_log(struct r5l_log *log,
1528 struct r5l_recovery_ctx *ctx)
1529 {
1530 while (1) {
1531 if (r5l_read_meta_block(log, ctx))
1532 return;
1533 if (r5l_recovery_flush_one_meta(log, ctx))
1534 return;
1535 ctx->seq++;
1536 ctx->pos = r5l_ring_add(log, ctx->pos, ctx->meta_total_blocks);
1537 }
1538 }
1539
1540 static int r5l_log_write_empty_meta_block(struct r5l_log *log, sector_t pos,
1541 u64 seq)
1542 {
1543 struct page *page;
1544 struct r5l_meta_block *mb;
1545 u32 crc;
1546
1547 page = alloc_page(GFP_KERNEL | __GFP_ZERO);
1548 if (!page)
1549 return -ENOMEM;
1550 mb = page_address(page);
1551 mb->magic = cpu_to_le32(R5LOG_MAGIC);
1552 mb->version = R5LOG_VERSION;
1553 mb->meta_size = cpu_to_le32(sizeof(struct r5l_meta_block));
1554 mb->seq = cpu_to_le64(seq);
1555 mb->position = cpu_to_le64(pos);
1556 crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE);
1557 mb->checksum = cpu_to_le32(crc);
1558
1559 if (!sync_page_io(log->rdev, pos, PAGE_SIZE, page, REQ_OP_WRITE,
1560 WRITE_FUA, false)) {
1561 __free_page(page);
1562 return -EIO;
1563 }
1564 __free_page(page);
1565 return 0;
1566 }
1567
1568 static int r5l_recovery_log(struct r5l_log *log)
1569 {
1570 struct r5l_recovery_ctx ctx;
1571
1572 ctx.pos = log->last_checkpoint;
1573 ctx.seq = log->last_cp_seq;
1574 ctx.meta_page = alloc_page(GFP_KERNEL);
1575 if (!ctx.meta_page)
1576 return -ENOMEM;
1577
1578 r5l_recovery_flush_log(log, &ctx);
1579 __free_page(ctx.meta_page);
1580
1581 /*
1582 * we did a recovery. Now ctx.pos points to an invalid meta block. New
1583 * log will start here. But we can't let the superblock point to the last
1584 * valid meta block. The log might look like:
1585 * | meta 1| meta 2| meta 3|
1586 * meta 1 is valid, meta 2 is invalid. meta 3 could be valid. If the
1587 * superblock points to meta 1, we write a new valid meta 2n. If a crash
1588 * happens again, the new recovery will start from meta 1. Since meta 2n is
1589 * valid now, recovery will think meta 3 is valid, which is wrong.
1590 * The solution is to create a new meta block at meta 2's position with its
1591 * seq == meta 1's seq + 10 and let the superblock point to it. The same
1592 * recovery will not treat meta 3 as valid, because its seq doesn't match.
1593 */
1594 if (ctx.seq > log->last_cp_seq) {
1595 int ret;
1596
1597 ret = r5l_log_write_empty_meta_block(log, ctx.pos, ctx.seq + 10);
1598 if (ret)
1599 return ret;
1600 log->seq = ctx.seq + 11;
1601 log->log_start = r5l_ring_add(log, ctx.pos, BLOCK_SECTORS);
1602 r5l_write_super(log, ctx.pos);
1603 log->last_checkpoint = ctx.pos;
1604 log->next_checkpoint = ctx.pos;
1605 } else {
1606 log->log_start = ctx.pos;
1607 log->seq = ctx.seq;
1608 }
1609 return 0;
1610 }
1611
1612 static void r5l_write_super(struct r5l_log *log, sector_t cp)
1613 {
1614 struct mddev *mddev = log->rdev->mddev;
1615
1616 log->rdev->journal_tail = cp;
1617 set_bit(MD_CHANGE_DEVS, &mddev->flags);
1618 }
1619
1620 /*
1621 * Try to handle a write operation in the caching phase. This function should only
1622 * be called in write-back mode.
1623 *
1624 * If all outstanding writes can be handled in caching phase, returns 0
1625 * If the write requires the write-out phase, call r5c_make_stripe_write_out()
1626 * and returns -EAGAIN
1627 */
1628 int r5c_try_caching_write(struct r5conf *conf,
1629 struct stripe_head *sh,
1630 struct stripe_head_state *s,
1631 int disks)
1632 {
1633 struct r5l_log *log = conf->log;
1634 int i;
1635 struct r5dev *dev;
1636 int to_cache = 0;
1637
1638 BUG_ON(!r5c_is_writeback(log));
1639
1640 if (!test_bit(STRIPE_R5C_CACHING, &sh->state)) {
1641 /*
1642 * There are two different scenarios here:
1643 * 1. The stripe has some data cached, and it is sent to
1644 * write-out phase for reclaim
1645 * 2. The stripe is clean, and this is the first write
1646 *
1647 * For 1, return -EAGAIN, so we continue with
1648 * handle_stripe_dirtying().
1649 *
1650 * For 2, set STRIPE_R5C_CACHING and continue with caching
1651 * write.
1652 */
1653
1654 /* case 1: anything injournal or anything in written */
1655 if (s->injournal > 0 || s->written > 0)
1656 return -EAGAIN;
1657 /* case 2 */
1658 set_bit(STRIPE_R5C_CACHING, &sh->state);
1659 }
1660
1661 for (i = disks; i--; ) {
1662 dev = &sh->dev[i];
1663 /* if non-overwrite, use writing-out phase */
1664 if (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags) &&
1665 !test_bit(R5_InJournal, &dev->flags)) {
1666 r5c_make_stripe_write_out(sh);
1667 return -EAGAIN;
1668 }
1669 }
1670
1671 for (i = disks; i--; ) {
1672 dev = &sh->dev[i];
1673 if (dev->towrite) {
1674 set_bit(R5_Wantwrite, &dev->flags);
1675 set_bit(R5_Wantdrain, &dev->flags);
1676 set_bit(R5_LOCKED, &dev->flags);
1677 to_cache++;
1678 }
1679 }
1680
1681 if (to_cache) {
1682 set_bit(STRIPE_OP_BIODRAIN, &s->ops_request);
1683 /*
1684 * set STRIPE_LOG_TRAPPED, which triggers r5c_cache_data()
1685 * in ops_run_io(). STRIPE_LOG_TRAPPED will be cleared in
1686 * r5c_handle_data_cached()
1687 */
1688 set_bit(STRIPE_LOG_TRAPPED, &sh->state);
1689 }
1690
1691 return 0;
1692 }
1693
1694 /*
1695 * free extra pages (orig_page) we allocated for prexor
1696 */
1697 void r5c_release_extra_page(struct stripe_head *sh)
1698 {
1699 int i;
1700
1701 for (i = sh->disks; i--; )
1702 if (sh->dev[i].page != sh->dev[i].orig_page) {
1703 struct page *p = sh->dev[i].orig_page;
1704
1705 sh->dev[i].orig_page = sh->dev[i].page;
1706 put_page(p);
1707 }
1708 }
1709
1710 /*
1711 * clean up the stripe (clear R5_InJournal for dev[pd_idx] etc.) after the
1712 * stripe is committed to RAID disks.
1713 */
1714 void r5c_finish_stripe_write_out(struct r5conf *conf,
1715 struct stripe_head *sh,
1716 struct stripe_head_state *s)
1717 {
1718 int i;
1719 int do_wakeup = 0;
1720
1721 if (!conf->log ||
1722 !test_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags))
1723 return;
1724
1725 WARN_ON(test_bit(STRIPE_R5C_CACHING, &sh->state));
1726 clear_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags);
1727
1728 if (conf->log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH)
1729 return;
1730
1731 for (i = sh->disks; i--; ) {
1732 clear_bit(R5_InJournal, &sh->dev[i].flags);
1733 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
1734 do_wakeup = 1;
1735 }
1736
1737 /*
1738 * analyse_stripe() runs before r5c_finish_stripe_write_out(). We have just
1739 * updated R5_InJournal, so we also update s->injournal to match.
1740 */
1741 s->injournal = 0;
1742
1743 if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state))
1744 if (atomic_dec_and_test(&conf->pending_full_writes))
1745 md_wakeup_thread(conf->mddev->thread);
1746
1747 if (do_wakeup)
1748 wake_up(&conf->wait_for_overlap);
1749
1750 if (conf->log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH)
1751 return;
1752
1753 spin_lock_irq(&conf->log->stripe_in_journal_lock);
1754 list_del_init(&sh->r5c);
1755 spin_unlock_irq(&conf->log->stripe_in_journal_lock);
1756 sh->log_start = MaxSector;
1757 atomic_dec(&conf->log->stripe_in_journal_count);
1758 }
1759
1760 int
1761 r5c_cache_data(struct r5l_log *log, struct stripe_head *sh,
1762 struct stripe_head_state *s)
1763 {
1764 struct r5conf *conf = sh->raid_conf;
1765 int pages = 0;
1766 int reserve;
1767 int i;
1768 int ret = 0;
1769
1770 BUG_ON(!log);
1771
1772 for (i = 0; i < sh->disks; i++) {
1773 void *addr;
1774
1775 if (!test_bit(R5_Wantwrite, &sh->dev[i].flags))
1776 continue;
1777 addr = kmap_atomic(sh->dev[i].page);
1778 sh->dev[i].log_checksum = crc32c_le(log->uuid_checksum,
1779 addr, PAGE_SIZE);
1780 kunmap_atomic(addr);
1781 pages++;
1782 }
1783 WARN_ON(pages == 0);
1784
1785 /*
1786 * The stripe must enter state machine again to call endio, so
1787 * don't delay.
1788 */
1789 clear_bit(STRIPE_DELAYED, &sh->state);
1790 atomic_inc(&sh->count);
1791
1792 mutex_lock(&log->io_mutex);
1793 /* meta + data */
1794 reserve = (1 + pages) << (PAGE_SHIFT - 9);
1795
1796 if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state) &&
1797 sh->log_start == MaxSector)
1798 r5l_add_no_space_stripe(log, sh);
1799 else if (!r5l_has_free_space(log, reserve)) {
1800 if (sh->log_start == log->last_checkpoint)
1801 BUG();
1802 else
1803 r5l_add_no_space_stripe(log, sh);
1804 } else {
1805 ret = r5l_log_stripe(log, sh, pages, 0);
1806 if (ret) {
1807 spin_lock_irq(&log->io_list_lock);
1808 list_add_tail(&sh->log_list, &log->no_mem_stripes);
1809 spin_unlock_irq(&log->io_list_lock);
1810 }
1811 }
1812
1813 mutex_unlock(&log->io_mutex);
1814 return 0;
1815 }
1816
1817 static int r5l_load_log(struct r5l_log *log)
1818 {
1819 struct md_rdev *rdev = log->rdev;
1820 struct page *page;
1821 struct r5l_meta_block *mb;
1822 sector_t cp = log->rdev->journal_tail;
1823 u32 stored_crc, expected_crc;
1824 bool create_super = false;
1825 int ret;
1826
1827 /* Make sure it's valid */
1828 if (cp >= rdev->sectors || round_down(cp, BLOCK_SECTORS) != cp)
1829 cp = 0;
1830 page = alloc_page(GFP_KERNEL);
1831 if (!page)
1832 return -ENOMEM;
1833
1834 if (!sync_page_io(rdev, cp, PAGE_SIZE, page, REQ_OP_READ, 0, false)) {
1835 ret = -EIO;
1836 goto ioerr;
1837 }
1838 mb = page_address(page);
1839
1840 if (le32_to_cpu(mb->magic) != R5LOG_MAGIC ||
1841 mb->version != R5LOG_VERSION) {
1842 create_super = true;
1843 goto create;
1844 }
1845 stored_crc = le32_to_cpu(mb->checksum);
1846 mb->checksum = 0;
1847 expected_crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE);
1848 if (stored_crc != expected_crc) {
1849 create_super = true;
1850 goto create;
1851 }
1852 if (le64_to_cpu(mb->position) != cp) {
1853 create_super = true;
1854 goto create;
1855 }
1856 create:
1857 if (create_super) {
1858 log->last_cp_seq = prandom_u32();
1859 cp = 0;
1860 r5l_log_write_empty_meta_block(log, cp, log->last_cp_seq);
1861 /*
1862 * Make sure super points to correct address. Log might have
1863 * data very soon. If super hasn't correct log tail address,
1864 * recovery can't find the log
1865 */
1866 r5l_write_super(log, cp);
1867 } else
1868 log->last_cp_seq = le64_to_cpu(mb->seq);
1869
1870 log->device_size = round_down(rdev->sectors, BLOCK_SECTORS);
1871 log->max_free_space = log->device_size >> RECLAIM_MAX_FREE_SPACE_SHIFT;
1872 if (log->max_free_space > RECLAIM_MAX_FREE_SPACE)
1873 log->max_free_space = RECLAIM_MAX_FREE_SPACE;
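/* i.e. max_free_space = min(device_size / 4, 10 GiB), as noted at the top of the file */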
1874 log->last_checkpoint = cp;
1875 log->next_checkpoint = cp;
1876 mutex_lock(&log->io_mutex);
1877 r5c_update_log_state(log);
1878 mutex_unlock(&log->io_mutex);
1879
1880 __free_page(page);
1881
1882 return r5l_recovery_log(log);
1883 ioerr:
1884 __free_page(page);
1885 return ret;
1886 }
1887
1888 int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
1889 {
1890 struct request_queue *q = bdev_get_queue(rdev->bdev);
1891 struct r5l_log *log;
1892
1893 if (PAGE_SIZE != 4096)
1894 return -EINVAL;
1895
1896 /*
1897 * The PAGE_SIZE must be big enough to hold 1 r5l_meta_block and
1898 * raid_disks r5l_payload_data_parity.
1899 *
1900 * Write journal and cache do not work for very big arrays
1901 * (raid_disks > 203)
1902 */
1903 if (sizeof(struct r5l_meta_block) +
1904 ((sizeof(struct r5l_payload_data_parity) + sizeof(__le32)) *
1905 conf->raid_disks) > PAGE_SIZE) {
1906 pr_err("md/raid:%s: write journal/cache doesn't work for array with %d disks\n",
1907 mdname(conf->mddev), conf->raid_disks);
1908 return -EINVAL;
1909 }
1910
1911 log = kzalloc(sizeof(*log), GFP_KERNEL);
1912 if (!log)
1913 return -ENOMEM;
1914 log->rdev = rdev;
1915
1916 log->need_cache_flush = test_bit(QUEUE_FLAG_WC, &q->queue_flags) != 0;
1917
1918 log->uuid_checksum = crc32c_le(~0, rdev->mddev->uuid,
1919 sizeof(rdev->mddev->uuid));
1920
1921 mutex_init(&log->io_mutex);
1922
1923 spin_lock_init(&log->io_list_lock);
1924 INIT_LIST_HEAD(&log->running_ios);
1925 INIT_LIST_HEAD(&log->io_end_ios);
1926 INIT_LIST_HEAD(&log->flushing_ios);
1927 INIT_LIST_HEAD(&log->finished_ios);
1928 bio_init(&log->flush_bio);
1929
1930 log->io_kc = KMEM_CACHE(r5l_io_unit, 0);
1931 if (!log->io_kc)
1932 goto io_kc;
1933
1934 log->io_pool = mempool_create_slab_pool(R5L_POOL_SIZE, log->io_kc);
1935 if (!log->io_pool)
1936 goto io_pool;
1937
1938 log->bs = bioset_create(R5L_POOL_SIZE, 0);
1939 if (!log->bs)
1940 goto io_bs;
1941
1942 log->meta_pool = mempool_create_page_pool(R5L_POOL_SIZE, 0);
1943 if (!log->meta_pool)
1944 goto out_mempool;
1945
1946 log->reclaim_thread = md_register_thread(r5l_reclaim_thread,
1947 log->rdev->mddev, "reclaim");
1948 if (!log->reclaim_thread)
1949 goto reclaim_thread;
1950 log->reclaim_thread->timeout = R5C_RECLAIM_WAKEUP_INTERVAL;
1951
1952 init_waitqueue_head(&log->iounit_wait);
1953
1954 INIT_LIST_HEAD(&log->no_mem_stripes);
1955
1956 INIT_LIST_HEAD(&log->no_space_stripes);
1957 spin_lock_init(&log->no_space_stripes_lock);
1958
1959 log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH;
1960 INIT_LIST_HEAD(&log->stripe_in_journal_list);
1961 spin_lock_init(&log->stripe_in_journal_lock);
1962 atomic_set(&log->stripe_in_journal_count, 0);
1963
1964 if (r5l_load_log(log))
1965 goto error;
1966
1967 rcu_assign_pointer(conf->log, log);
1968 set_bit(MD_HAS_JOURNAL, &conf->mddev->flags);
1969 return 0;
1970
1971 error:
1972 md_unregister_thread(&log->reclaim_thread);
1973 reclaim_thread:
1974 mempool_destroy(log->meta_pool);
1975 out_mempool:
1976 bioset_free(log->bs);
1977 io_bs:
1978 mempool_destroy(log->io_pool);
1979 io_pool:
1980 kmem_cache_destroy(log->io_kc);
1981 io_kc:
1982 kfree(log);
1983 return -EINVAL;
1984 }
1985
1986 void r5l_exit_log(struct r5l_log *log)
1987 {
1988 md_unregister_thread(&log->reclaim_thread);
1989 mempool_destroy(log->meta_pool);
1990 bioset_free(log->bs);
1991 mempool_destroy(log->io_pool);
1992 kmem_cache_destroy(log->io_kc);
1993 kfree(log);
1994 }