drivers/md/raid5-ppl.c
1/*
2 * Partial Parity Log for closing the RAID5 write hole
3 * Copyright (c) 2017, Intel Corporation.
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms and conditions of the GNU General Public License,
7 * version 2, as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
12 * more details.
13 */
14
15#include <linux/kernel.h>
16#include <linux/blkdev.h>
17#include <linux/slab.h>
18#include <linux/crc32c.h>
19#include <linux/flex_array.h>
20#include <linux/async_tx.h>
21#include <linux/raid/md_p.h>
22#include "md.h"
23#include "raid5.h"
24
25/*
26 * PPL consists of a 4KB header (struct ppl_header) and at least 128KB for
27 * partial parity data. The header contains an array of entries
28 * (struct ppl_header_entry) which describe the logged write requests.
29 * Partial parity for the entries comes after the header, written in the same
30 * sequence as the entries:
31 *
32 * Header
33 * entry0
34 * ...
35 * entryN
36 * PP data
37 * PP for entry0
38 * ...
39 * PP for entryN
40 *
41 * An entry describes one or more consecutive stripe_heads, up to a full
 42 * stripe. The modified raid data chunks form an m-by-n matrix, where m is the
43 * number of stripe_heads in the entry and n is the number of modified data
44 * disks. Every stripe_head in the entry must write to the same data disks.
45 * An example of a valid case described by a single entry (writes to the first
46 * stripe of a 4 disk array, 16k chunk size):
47 *
48 * sh->sector dd0 dd1 dd2 ppl
49 * +-----+-----+-----+
50 * 0 | --- | --- | --- | +----+
51 * 8 | -W- | -W- | --- | | pp | data_sector = 8
52 * 16 | -W- | -W- | --- | | pp | data_size = 3 * 2 * 4k
53 * 24 | -W- | -W- | --- | | pp | pp_size = 3 * 4k
54 * +-----+-----+-----+ +----+
55 *
56 * data_sector is the first raid sector of the modified data, data_size is the
57 * total size of modified data and pp_size is the size of partial parity for
58 * this entry. Entries for full stripe writes contain no partial parity
 59 * (pp_size = 0); they only mark the stripes for which parity should be
 60 * recalculated after an unclean shutdown. Every entry holds a checksum of its
 61 * partial parity; the header also has a checksum of the header itself.
62 *
63 * A write request is always logged to the PPL instance stored on the parity
64 * disk of the corresponding stripe. For each member disk there is one ppl_log
 65 * used to handle logging for this disk, independently of the others. The logs
 66 * are grouped in the child_logs array in struct ppl_conf, which is assigned to
67 * r5conf->log_private.
68 *
 69 * ppl_io_unit represents a full PPL write; header_page contains the ppl_header.
70 * PPL entries for logged stripes are added in ppl_log_stripe(). A stripe_head
71 * can be appended to the last entry if it meets the conditions for a valid
72 * entry described above, otherwise a new entry is added. Checksums of entries
73 * are calculated incrementally as stripes containing partial parity are being
74 * added. ppl_submit_iounit() calculates the checksum of the header and submits
75 * a bio containing the header page and partial parity pages (sh->ppl_page) for
76 * all stripes of the io_unit. When the PPL write completes, the stripes
77 * associated with the io_unit are released and raid5d starts writing their data
78 * and parity. When all stripes are written, the io_unit is freed and the next
79 * can be submitted.
80 *
81 * An io_unit is used to gather stripes until it is submitted or becomes full
82 * (if the maximum number of entries or size of PPL is reached). Another io_unit
83 * can't be submitted until the previous has completed (PPL and stripe
84 * data+parity is written). The log->io_list tracks all io_units of a log
85 * (for a single member disk). New io_units are added to the end of the list
86 * and the first io_unit is submitted, if it is not submitted already.
87 * The current io_unit accepting new stripes is always at the end of the list.
88 */
89
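/*
 * Illustrative sketch (not compiled, hence the #if 0): the on-disk header
 * layout implied by the fields this file accesses (signature, generation,
 * entries_count, checksum, reserved, and the per-entry data_sector, pp_size,
 * data_size, parity_disk, checksum). The authoritative definitions live in
 * include/uapi/linux/raid/md_p.h; the field order and padding shown here are
 * an assumption for readability only.
 */
#if 0
struct ppl_header_entry_sketch {
	__le64 data_sector;	/* raid sector of the first logged stripe_head */
	__le32 pp_size;		/* length of partial parity for this entry */
	__le32 data_size;	/* length of the data covered by this entry */
	__le32 parity_disk;	/* member disk holding parity for the stripe */
	__le32 checksum;	/* ~crc32c of the partial parity */
};

struct ppl_header_sketch {
	__u8   reserved[PPL_HDR_RESERVED];	/* filled with 0xff */
	__le32 signature;			/* derived from the array uuid */
	__le64 generation;			/* log write sequence number */
	__le32 entries_count;
	__le32 checksum;			/* ~crc32c of the 4k header */
	struct ppl_header_entry_sketch entries[PPL_HDR_MAX_ENTRIES];
};
#endif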
90struct ppl_conf {
91 struct mddev *mddev;
92
93 /* array of child logs, one for each raid disk */
94 struct ppl_log *child_logs;
95 int count;
96
97 int block_size; /* the logical block size used for data_sector
98 * in ppl_header_entry */
99 u32 signature; /* raid array identifier */
100 atomic64_t seq; /* current log write sequence number */
101
102 struct kmem_cache *io_kc;
103 mempool_t *io_pool;
104 struct bio_set *bs;
105 mempool_t *meta_pool;
106
107 /* used only for recovery */
108 int recovered_entries;
109 int mismatch_count;
110
 111 /* stripes to retry if io_unit allocation failed */
112 struct list_head no_mem_stripes;
113 spinlock_t no_mem_stripes_lock;
114};
115
116struct ppl_log {
117 struct ppl_conf *ppl_conf; /* shared between all log instances */
118
119 struct md_rdev *rdev; /* array member disk associated with
120 * this log instance */
121 struct mutex io_mutex;
122 struct ppl_io_unit *current_io; /* current io_unit accepting new data
123 * always at the end of io_list */
124 spinlock_t io_list_lock;
125 struct list_head io_list; /* all io_units of this log */
126};
127
128#define PPL_IO_INLINE_BVECS 32
129
130struct ppl_io_unit {
131 struct ppl_log *log;
132
133 struct page *header_page; /* for ppl_header */
134
135 unsigned int entries_count; /* number of entries in ppl_header */
 136 unsigned int pp_size; /* total size of current partial parity */
137
138 u64 seq; /* sequence number of this log write */
139 struct list_head log_sibling; /* log->io_list */
140
141 struct list_head stripe_list; /* stripes added to the io_unit */
142 atomic_t pending_stripes; /* how many stripes not written to raid */
143
144 bool submitted; /* true if write to log started */
145
146 /* inline bio and its biovec for submitting the iounit */
147 struct bio bio;
148 struct bio_vec biovec[PPL_IO_INLINE_BVECS];
149};
150
151struct dma_async_tx_descriptor *
152ops_run_partial_parity(struct stripe_head *sh, struct raid5_percpu *percpu,
153 struct dma_async_tx_descriptor *tx)
154{
155 int disks = sh->disks;
156 struct page **xor_srcs = flex_array_get(percpu->scribble, 0);
157 int count = 0, pd_idx = sh->pd_idx, i;
158 struct async_submit_ctl submit;
159
160 pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector);
161
162 /*
163 * Partial parity is the XOR of stripe data chunks that are not changed
164 * during the write request. Depending on available data
165 * (read-modify-write vs. reconstruct-write case) we calculate it
166 * differently.
167 */
168 if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) {
169 /* rmw: xor old data and parity from updated disks */
170 for (i = disks; i--;) {
171 struct r5dev *dev = &sh->dev[i];
172 if (test_bit(R5_Wantdrain, &dev->flags) || i == pd_idx)
173 xor_srcs[count++] = dev->page;
174 }
175 } else if (sh->reconstruct_state == reconstruct_state_drain_run) {
176 /* rcw: xor data from all not updated disks */
177 for (i = disks; i--;) {
178 struct r5dev *dev = &sh->dev[i];
179 if (test_bit(R5_UPTODATE, &dev->flags))
180 xor_srcs[count++] = dev->page;
181 }
182 } else {
183 return tx;
184 }
185
186 init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, tx,
187 NULL, sh, flex_array_get(percpu->scribble, 0)
188 + sizeof(struct page *) * (sh->disks + 2));
189
190 if (count == 1)
191 tx = async_memcpy(sh->ppl_page, xor_srcs[0], 0, 0, PAGE_SIZE,
192 &submit);
193 else
194 tx = async_xor(sh->ppl_page, xor_srcs, 0, count, PAGE_SIZE,
195 &submit);
196
197 return tx;
198}
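/*
 * Standalone userspace sketch (not part of the driver, kept under #if 0)
 * demonstrating the invariant ops_run_partial_parity() relies on: the XOR of
 * the chunks that are not being rewritten (the rcw branch) equals the XOR of
 * the old data on the rewritten chunks with the old parity (the rmw branch),
 * and XORing that partial parity with the new data reproduces the new parity,
 * which is exactly what recovery does. Build with e.g. "gcc -std=c99"; all
 * names below are made up for illustration.
 */
#if 0
#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define CHUNK 8

static void xor_into(uint8_t *dst, const uint8_t *src)
{
	for (int i = 0; i < CHUNK; i++)
		dst[i] ^= src[i];
}

int main(void)
{
	uint8_t d0[CHUNK], d1[CHUNK], d2[CHUNK], d0_new[CHUNK], d1_new[CHUNK];
	uint8_t parity[CHUNK] = { 0 }, pp_rmw[CHUNK] = { 0 }, pp_rcw[CHUNK] = { 0 };
	uint8_t recovered[CHUNK] = { 0 }, expected[CHUNK] = { 0 };

	/* 3 data disks; d0 and d1 are about to be rewritten */
	memset(d0, 0x11, CHUNK); memset(d1, 0x22, CHUNK); memset(d2, 0x33, CHUNK);
	memset(d0_new, 0x44, CHUNK); memset(d1_new, 0x55, CHUNK);

	/* old parity = d0 ^ d1 ^ d2 */
	xor_into(parity, d0); xor_into(parity, d1); xor_into(parity, d2);

	/* rmw: xor old data and parity from the updated disks */
	xor_into(pp_rmw, d0); xor_into(pp_rmw, d1); xor_into(pp_rmw, parity);

	/* rcw: xor data from all not updated disks (only d2 here) */
	xor_into(pp_rcw, d2);

	assert(memcmp(pp_rmw, pp_rcw, CHUNK) == 0);

	/* recovery: partial parity ^ new data of the modified disks... */
	xor_into(recovered, pp_rcw);
	xor_into(recovered, d0_new); xor_into(recovered, d1_new);

	/* ...equals the parity a full recalculation would produce */
	xor_into(expected, d0_new); xor_into(expected, d1_new); xor_into(expected, d2);
	assert(memcmp(recovered, expected, CHUNK) == 0);

	printf("partial parity invariant holds\n");
	return 0;
}
#endif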
199
200static struct ppl_io_unit *ppl_new_iounit(struct ppl_log *log,
201 struct stripe_head *sh)
202{
203 struct ppl_conf *ppl_conf = log->ppl_conf;
204 struct ppl_io_unit *io;
205 struct ppl_header *pplhdr;
206
207 io = mempool_alloc(ppl_conf->io_pool, GFP_ATOMIC);
208 if (!io)
209 return NULL;
210
211 memset(io, 0, sizeof(*io));
212 io->log = log;
213 INIT_LIST_HEAD(&io->log_sibling);
214 INIT_LIST_HEAD(&io->stripe_list);
215 atomic_set(&io->pending_stripes, 0);
216 bio_init(&io->bio, io->biovec, PPL_IO_INLINE_BVECS);
217
218 io->header_page = mempool_alloc(ppl_conf->meta_pool, GFP_NOIO);
219 pplhdr = page_address(io->header_page);
220 clear_page(pplhdr);
221 memset(pplhdr->reserved, 0xff, PPL_HDR_RESERVED);
222 pplhdr->signature = cpu_to_le32(ppl_conf->signature);
223
224 io->seq = atomic64_add_return(1, &ppl_conf->seq);
225 pplhdr->generation = cpu_to_le64(io->seq);
226
227 return io;
228}
229
230static int ppl_log_stripe(struct ppl_log *log, struct stripe_head *sh)
231{
232 struct ppl_io_unit *io = log->current_io;
233 struct ppl_header_entry *e = NULL;
234 struct ppl_header *pplhdr;
235 int i;
236 sector_t data_sector = 0;
237 int data_disks = 0;
238 unsigned int entry_space = (log->rdev->ppl.size << 9) - PPL_HEADER_SIZE;
239 struct r5conf *conf = sh->raid_conf;
240
241 pr_debug("%s: stripe: %llu\n", __func__, (unsigned long long)sh->sector);
242
243 /* check if current io_unit is full */
244 if (io && (io->pp_size == entry_space ||
245 io->entries_count == PPL_HDR_MAX_ENTRIES)) {
246 pr_debug("%s: add io_unit blocked by seq: %llu\n",
247 __func__, io->seq);
248 io = NULL;
249 }
250
251 /* add a new unit if there is none or the current is full */
252 if (!io) {
253 io = ppl_new_iounit(log, sh);
254 if (!io)
255 return -ENOMEM;
256 spin_lock_irq(&log->io_list_lock);
257 list_add_tail(&io->log_sibling, &log->io_list);
258 spin_unlock_irq(&log->io_list_lock);
259
260 log->current_io = io;
261 }
262
263 for (i = 0; i < sh->disks; i++) {
264 struct r5dev *dev = &sh->dev[i];
265
266 if (i != sh->pd_idx && test_bit(R5_Wantwrite, &dev->flags)) {
267 if (!data_disks || dev->sector < data_sector)
268 data_sector = dev->sector;
269 data_disks++;
270 }
271 }
272 BUG_ON(!data_disks);
273
274 pr_debug("%s: seq: %llu data_sector: %llu data_disks: %d\n", __func__,
275 io->seq, (unsigned long long)data_sector, data_disks);
276
277 pplhdr = page_address(io->header_page);
278
279 if (io->entries_count > 0) {
280 struct ppl_header_entry *last =
281 &pplhdr->entries[io->entries_count - 1];
282 struct stripe_head *sh_last = list_last_entry(
283 &io->stripe_list, struct stripe_head, log_list);
284 u64 data_sector_last = le64_to_cpu(last->data_sector);
285 u32 data_size_last = le32_to_cpu(last->data_size);
286
287 /*
288 * Check if we can append the stripe to the last entry. It must
289 * be just after the last logged stripe and write to the same
290 * disks. Use bit shift and logarithm to avoid 64-bit division.
291 */
292 if ((sh->sector == sh_last->sector + STRIPE_SECTORS) &&
293 (data_sector >> ilog2(conf->chunk_sectors) ==
294 data_sector_last >> ilog2(conf->chunk_sectors)) &&
295 ((data_sector - data_sector_last) * data_disks ==
296 data_size_last >> 9))
297 e = last;
298 }
299
300 if (!e) {
301 e = &pplhdr->entries[io->entries_count++];
302 e->data_sector = cpu_to_le64(data_sector);
303 e->parity_disk = cpu_to_le32(sh->pd_idx);
304 e->checksum = cpu_to_le32(~0);
305 }
306
307 le32_add_cpu(&e->data_size, data_disks << PAGE_SHIFT);
308
309 /* don't write any PP if full stripe write */
310 if (!test_bit(STRIPE_FULL_WRITE, &sh->state)) {
311 le32_add_cpu(&e->pp_size, PAGE_SIZE);
312 io->pp_size += PAGE_SIZE;
313 e->checksum = cpu_to_le32(crc32c_le(le32_to_cpu(e->checksum),
314 page_address(sh->ppl_page),
315 PAGE_SIZE));
316 }
317
318 list_add_tail(&sh->log_list, &io->stripe_list);
319 atomic_inc(&io->pending_stripes);
320 sh->ppl_io = io;
321
322 return 0;
323}
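/*
 * Standalone sketch (not part of the driver, helper name made up) of the
 * "append to the last entry" predicate used in ppl_log_stripe() above,
 * exercised with the geometry from the header comment: 4-disk RAID5, 16k
 * chunk (chunk_sectors = 32), 4k stripe_heads (STRIPE_SECTORS = 8), two data
 * disks modified per stripe_head.
 */
#if 0
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool can_append(uint64_t sh_sector, uint64_t sh_sector_last,
		       uint64_t data_sector, uint64_t data_sector_last,
		       uint32_t data_size_last, int data_disks,
		       int chunk_sectors, int stripe_sectors)
{
	int chunk_shift = __builtin_ctz(chunk_sectors);	/* ilog2() for powers of two */

	return sh_sector == sh_sector_last + stripe_sectors &&
	       (data_sector >> chunk_shift) == (data_sector_last >> chunk_shift) &&
	       (data_sector - data_sector_last) * data_disks == data_size_last >> 9;
}

int main(void)
{
	/*
	 * The last entry so far covers one stripe_head at sector 8 writing to
	 * two data disks (data_size = 2 * 4k); the next stripe_head at sector
	 * 16 continues it, so it can be appended.
	 */
	printf("append: %d\n", can_append(16, 8, 16, 8, 2 * 4096, 2, 32, 8));
	return 0;
}
#endif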
324
325int ppl_write_stripe(struct r5conf *conf, struct stripe_head *sh)
326{
327 struct ppl_conf *ppl_conf = conf->log_private;
328 struct ppl_io_unit *io = sh->ppl_io;
329 struct ppl_log *log;
330
 331 if (io || test_bit(STRIPE_SYNCING, &sh->state) || !sh->ppl_page ||
332 !test_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags) ||
333 !test_bit(R5_Insync, &sh->dev[sh->pd_idx].flags)) {
334 clear_bit(STRIPE_LOG_TRAPPED, &sh->state);
335 return -EAGAIN;
336 }
337
338 log = &ppl_conf->child_logs[sh->pd_idx];
339
340 mutex_lock(&log->io_mutex);
341
342 if (!log->rdev || test_bit(Faulty, &log->rdev->flags)) {
343 mutex_unlock(&log->io_mutex);
344 return -EAGAIN;
345 }
346
347 set_bit(STRIPE_LOG_TRAPPED, &sh->state);
348 clear_bit(STRIPE_DELAYED, &sh->state);
349 atomic_inc(&sh->count);
350
351 if (ppl_log_stripe(log, sh)) {
352 spin_lock_irq(&ppl_conf->no_mem_stripes_lock);
353 list_add_tail(&sh->log_list, &ppl_conf->no_mem_stripes);
354 spin_unlock_irq(&ppl_conf->no_mem_stripes_lock);
355 }
356
357 mutex_unlock(&log->io_mutex);
358
359 return 0;
360}
361
362static void ppl_log_endio(struct bio *bio)
363{
364 struct ppl_io_unit *io = bio->bi_private;
365 struct ppl_log *log = io->log;
366 struct ppl_conf *ppl_conf = log->ppl_conf;
367 struct stripe_head *sh, *next;
368
369 pr_debug("%s: seq: %llu\n", __func__, io->seq);
370
371 if (bio->bi_error)
372 md_error(ppl_conf->mddev, log->rdev);
373
374 mempool_free(io->header_page, ppl_conf->meta_pool);
375
376 list_for_each_entry_safe(sh, next, &io->stripe_list, log_list) {
377 list_del_init(&sh->log_list);
378
379 set_bit(STRIPE_HANDLE, &sh->state);
380 raid5_release_stripe(sh);
381 }
382}
383
384static void ppl_submit_iounit_bio(struct ppl_io_unit *io, struct bio *bio)
385{
386 char b[BDEVNAME_SIZE];
387
388 pr_debug("%s: seq: %llu size: %u sector: %llu dev: %s\n",
389 __func__, io->seq, bio->bi_iter.bi_size,
390 (unsigned long long)bio->bi_iter.bi_sector,
391 bdevname(bio->bi_bdev, b));
392
393 submit_bio(bio);
394}
395
396static void ppl_submit_iounit(struct ppl_io_unit *io)
397{
398 struct ppl_log *log = io->log;
399 struct ppl_conf *ppl_conf = log->ppl_conf;
400 struct ppl_header *pplhdr = page_address(io->header_page);
401 struct bio *bio = &io->bio;
402 struct stripe_head *sh;
403 int i;
404
405 bio->bi_private = io;
406
407 if (!log->rdev || test_bit(Faulty, &log->rdev->flags)) {
408 ppl_log_endio(bio);
409 return;
410 }
411
412 for (i = 0; i < io->entries_count; i++) {
413 struct ppl_header_entry *e = &pplhdr->entries[i];
414
415 pr_debug("%s: seq: %llu entry: %d data_sector: %llu pp_size: %u data_size: %u\n",
416 __func__, io->seq, i, le64_to_cpu(e->data_sector),
417 le32_to_cpu(e->pp_size), le32_to_cpu(e->data_size));
418
419 e->data_sector = cpu_to_le64(le64_to_cpu(e->data_sector) >>
420 ilog2(ppl_conf->block_size >> 9));
421 e->checksum = cpu_to_le32(~le32_to_cpu(e->checksum));
422 }
423
424 pplhdr->entries_count = cpu_to_le32(io->entries_count);
425 pplhdr->checksum = cpu_to_le32(~crc32c_le(~0, pplhdr, PPL_HEADER_SIZE));
426
427 bio->bi_end_io = ppl_log_endio;
428 bio->bi_opf = REQ_OP_WRITE | REQ_FUA;
429 bio->bi_bdev = log->rdev->bdev;
430 bio->bi_iter.bi_sector = log->rdev->ppl.sector;
431 bio_add_page(bio, io->header_page, PAGE_SIZE, 0);
432
433 list_for_each_entry(sh, &io->stripe_list, log_list) {
434 /* entries for full stripe writes have no partial parity */
435 if (test_bit(STRIPE_FULL_WRITE, &sh->state))
436 continue;
437
438 if (!bio_add_page(bio, sh->ppl_page, PAGE_SIZE, 0)) {
439 struct bio *prev = bio;
440
441 bio = bio_alloc_bioset(GFP_NOIO, BIO_MAX_PAGES,
442 ppl_conf->bs);
443 bio->bi_opf = prev->bi_opf;
444 bio->bi_bdev = prev->bi_bdev;
445 bio->bi_iter.bi_sector = bio_end_sector(prev);
446 bio_add_page(bio, sh->ppl_page, PAGE_SIZE, 0);
447
448 bio_chain(bio, prev);
449 ppl_submit_iounit_bio(io, prev);
450 }
451 }
452
453 ppl_submit_iounit_bio(io, bio);
454}
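/*
 * Standalone sketch (not part of the driver, kept under #if 0) of the
 * checksum convention used above: an entry checksum is accumulated
 * incrementally with crc32c seeded by ~0 as partial parity pages are added
 * in ppl_log_stripe(), and the value stored on disk is the bitwise
 * complement taken here at submit time. The bitwise routine below is a
 * plain software CRC-32C (reflected polynomial 0x82F63B78), i.e. the same
 * checksum crc32c_le() computes.
 */
#if 0
#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static uint32_t crc32c(uint32_t crc, const void *buf, size_t len)
{
	const uint8_t *p = buf;

	while (len--) {
		crc ^= *p++;
		for (int k = 0; k < 8; k++)
			crc = (crc >> 1) ^ (0x82F63B78u & -(crc & 1u));
	}
	return crc;
}

int main(void)
{
	uint8_t pp0[4096], pp1[4096], both[8192];
	uint32_t crc = ~0u;

	memset(pp0, 0xa5, sizeof(pp0));
	memset(pp1, 0x5a, sizeof(pp1));
	memcpy(both, pp0, sizeof(pp0));
	memcpy(both + sizeof(pp0), pp1, sizeof(pp1));

	/* incremental accumulation, one partial parity page per logged stripe */
	crc = crc32c(crc, pp0, sizeof(pp0));
	crc = crc32c(crc, pp1, sizeof(pp1));

	/* equals a one-shot crc over the concatenated partial parity */
	assert(crc == crc32c(~0u, both, sizeof(both)));

	/* the value written to e->checksum is the complement */
	printf("stored entry checksum: 0x%08x\n", ~crc);
	return 0;
}
#endif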
455
456static void ppl_submit_current_io(struct ppl_log *log)
457{
458 struct ppl_io_unit *io;
459
460 spin_lock_irq(&log->io_list_lock);
461
462 io = list_first_entry_or_null(&log->io_list, struct ppl_io_unit,
463 log_sibling);
464 if (io && io->submitted)
465 io = NULL;
466
467 spin_unlock_irq(&log->io_list_lock);
468
469 if (io) {
470 io->submitted = true;
471
472 if (io == log->current_io)
473 log->current_io = NULL;
474
475 ppl_submit_iounit(io);
476 }
477}
478
479void ppl_write_stripe_run(struct r5conf *conf)
480{
481 struct ppl_conf *ppl_conf = conf->log_private;
482 struct ppl_log *log;
483 int i;
484
485 for (i = 0; i < ppl_conf->count; i++) {
486 log = &ppl_conf->child_logs[i];
487
488 mutex_lock(&log->io_mutex);
489 ppl_submit_current_io(log);
490 mutex_unlock(&log->io_mutex);
491 }
492}
493
494static void ppl_io_unit_finished(struct ppl_io_unit *io)
495{
496 struct ppl_log *log = io->log;
 497 struct ppl_conf *ppl_conf = log->ppl_conf;
498 unsigned long flags;
499
500 pr_debug("%s: seq: %llu\n", __func__, io->seq);
501
 502 local_irq_save(flags);
 503
 504 spin_lock(&log->io_list_lock);
 505 list_del(&io->log_sibling);
506 spin_unlock(&log->io_list_lock);
507
508 mempool_free(io, ppl_conf->io_pool);
509
510 spin_lock(&ppl_conf->no_mem_stripes_lock);
511 if (!list_empty(&ppl_conf->no_mem_stripes)) {
512 struct stripe_head *sh;
 513
 514 sh = list_first_entry(&ppl_conf->no_mem_stripes,
 515 struct stripe_head, log_list);
516 list_del_init(&sh->log_list);
517 set_bit(STRIPE_HANDLE, &sh->state);
518 raid5_release_stripe(sh);
519 }
 520 spin_unlock(&ppl_conf->no_mem_stripes_lock);
 521
 522 local_irq_restore(flags);
523}
524
525void ppl_stripe_write_finished(struct stripe_head *sh)
526{
527 struct ppl_io_unit *io;
528
529 io = sh->ppl_io;
530 sh->ppl_io = NULL;
531
532 if (io && atomic_dec_and_test(&io->pending_stripes))
533 ppl_io_unit_finished(io);
534}
535
536static void ppl_xor(int size, struct page *page1, struct page *page2)
537{
538 struct async_submit_ctl submit;
539 struct dma_async_tx_descriptor *tx;
540 struct page *xor_srcs[] = { page1, page2 };
541
542 init_async_submit(&submit, ASYNC_TX_ACK|ASYNC_TX_XOR_DROP_DST,
543 NULL, NULL, NULL, NULL);
544 tx = async_xor(page1, xor_srcs, 0, 2, size, &submit);
545
546 async_tx_quiesce(&tx);
547}
548
549/*
550 * PPL recovery strategy: xor partial parity and data from all modified data
551 * disks within a stripe and write the result as the new stripe parity. If all
552 * stripe data disks are modified (full stripe write), no partial parity is
553 * available, so just xor the data disks.
554 *
555 * Recovery of a PPL entry shall occur only if all modified data disks are
 556 * available and reading from all of them succeeds.
557 *
558 * A PPL entry applies to a stripe, partial parity size for an entry is at most
559 * the size of the chunk. Examples of possible cases for a single entry:
560 *
561 * case 0: single data disk write:
562 * data0 data1 data2 ppl parity
563 * +--------+--------+--------+ +--------------------+
564 * | ------ | ------ | ------ | +----+ | (no change) |
565 * | ------ | -data- | ------ | | pp | -> | data1 ^ pp |
566 * | ------ | -data- | ------ | | pp | -> | data1 ^ pp |
567 * | ------ | ------ | ------ | +----+ | (no change) |
568 * +--------+--------+--------+ +--------------------+
569 * pp_size = data_size
570 *
571 * case 1: more than one data disk write:
572 * data0 data1 data2 ppl parity
573 * +--------+--------+--------+ +--------------------+
574 * | ------ | ------ | ------ | +----+ | (no change) |
575 * | -data- | -data- | ------ | | pp | -> | data0 ^ data1 ^ pp |
576 * | -data- | -data- | ------ | | pp | -> | data0 ^ data1 ^ pp |
577 * | ------ | ------ | ------ | +----+ | (no change) |
578 * +--------+--------+--------+ +--------------------+
579 * pp_size = data_size / modified_data_disks
580 *
581 * case 2: write to all data disks (also full stripe write):
582 * data0 data1 data2 parity
583 * +--------+--------+--------+ +--------------------+
584 * | ------ | ------ | ------ | | (no change) |
585 * | -data- | -data- | -data- | --------> | xor all data |
586 * | ------ | ------ | ------ | --------> | (no change) |
587 * | ------ | ------ | ------ | | (no change) |
588 * +--------+--------+--------+ +--------------------+
589 * pp_size = 0
590 *
591 * The following cases are possible only in other implementations. The recovery
592 * code can handle them, but they are not generated at runtime because they can
593 * be reduced to cases 0, 1 and 2:
594 *
595 * case 3:
596 * data0 data1 data2 ppl parity
597 * +--------+--------+--------+ +----+ +--------------------+
598 * | ------ | -data- | -data- | | pp | | data1 ^ data2 ^ pp |
599 * | ------ | -data- | -data- | | pp | -> | data1 ^ data2 ^ pp |
600 * | -data- | -data- | -data- | | -- | -> | xor all data |
601 * | -data- | -data- | ------ | | pp | | data0 ^ data1 ^ pp |
602 * +--------+--------+--------+ +----+ +--------------------+
603 * pp_size = chunk_size
604 *
605 * case 4:
606 * data0 data1 data2 ppl parity
607 * +--------+--------+--------+ +----+ +--------------------+
608 * | ------ | -data- | ------ | | pp | | data1 ^ pp |
609 * | ------ | ------ | ------ | | -- | -> | (no change) |
610 * | ------ | ------ | ------ | | -- | -> | (no change) |
611 * | -data- | ------ | ------ | | pp | | data0 ^ pp |
612 * +--------+--------+--------+ +----+ +--------------------+
613 * pp_size = chunk_size
614 */
615static int ppl_recover_entry(struct ppl_log *log, struct ppl_header_entry *e,
616 sector_t ppl_sector)
617{
618 struct ppl_conf *ppl_conf = log->ppl_conf;
619 struct mddev *mddev = ppl_conf->mddev;
620 struct r5conf *conf = mddev->private;
621 int block_size = ppl_conf->block_size;
622 struct page *page1;
623 struct page *page2;
624 sector_t r_sector_first;
625 sector_t r_sector_last;
626 int strip_sectors;
627 int data_disks;
628 int i;
629 int ret = 0;
630 char b[BDEVNAME_SIZE];
631 unsigned int pp_size = le32_to_cpu(e->pp_size);
632 unsigned int data_size = le32_to_cpu(e->data_size);
633
634 page1 = alloc_page(GFP_KERNEL);
635 page2 = alloc_page(GFP_KERNEL);
636
637 if (!page1 || !page2) {
638 ret = -ENOMEM;
639 goto out;
640 }
641
642 r_sector_first = le64_to_cpu(e->data_sector) * (block_size >> 9);
643
644 if ((pp_size >> 9) < conf->chunk_sectors) {
645 if (pp_size > 0) {
646 data_disks = data_size / pp_size;
647 strip_sectors = pp_size >> 9;
648 } else {
649 data_disks = conf->raid_disks - conf->max_degraded;
650 strip_sectors = (data_size >> 9) / data_disks;
651 }
652 r_sector_last = r_sector_first +
653 (data_disks - 1) * conf->chunk_sectors +
654 strip_sectors;
655 } else {
656 data_disks = conf->raid_disks - conf->max_degraded;
657 strip_sectors = conf->chunk_sectors;
658 r_sector_last = r_sector_first + (data_size >> 9);
659 }
660
661 pr_debug("%s: array sector first: %llu last: %llu\n", __func__,
662 (unsigned long long)r_sector_first,
663 (unsigned long long)r_sector_last);
664
 665 /* if start and end are 4k aligned, use a 4k block */
666 if (block_size == 512 &&
667 (r_sector_first & (STRIPE_SECTORS - 1)) == 0 &&
668 (r_sector_last & (STRIPE_SECTORS - 1)) == 0)
669 block_size = STRIPE_SIZE;
670
671 /* iterate through blocks in strip */
672 for (i = 0; i < strip_sectors; i += (block_size >> 9)) {
673 bool update_parity = false;
674 sector_t parity_sector;
675 struct md_rdev *parity_rdev;
676 struct stripe_head sh;
677 int disk;
678 int indent = 0;
679
680 pr_debug("%s:%*s iter %d start\n", __func__, indent, "", i);
681 indent += 2;
682
683 memset(page_address(page1), 0, PAGE_SIZE);
684
685 /* iterate through data member disks */
686 for (disk = 0; disk < data_disks; disk++) {
687 int dd_idx;
688 struct md_rdev *rdev;
689 sector_t sector;
690 sector_t r_sector = r_sector_first + i +
691 (disk * conf->chunk_sectors);
692
693 pr_debug("%s:%*s data member disk %d start\n",
694 __func__, indent, "", disk);
695 indent += 2;
696
697 if (r_sector >= r_sector_last) {
698 pr_debug("%s:%*s array sector %llu doesn't need parity update\n",
699 __func__, indent, "",
700 (unsigned long long)r_sector);
701 indent -= 2;
702 continue;
703 }
704
705 update_parity = true;
706
707 /* map raid sector to member disk */
708 sector = raid5_compute_sector(conf, r_sector, 0,
709 &dd_idx, NULL);
710 pr_debug("%s:%*s processing array sector %llu => data member disk %d, sector %llu\n",
711 __func__, indent, "",
712 (unsigned long long)r_sector, dd_idx,
713 (unsigned long long)sector);
714
715 rdev = conf->disks[dd_idx].rdev;
716 if (!rdev) {
717 pr_debug("%s:%*s data member disk %d missing\n",
718 __func__, indent, "", dd_idx);
719 update_parity = false;
720 break;
721 }
722
723 pr_debug("%s:%*s reading data member disk %s sector %llu\n",
724 __func__, indent, "", bdevname(rdev->bdev, b),
725 (unsigned long long)sector);
726 if (!sync_page_io(rdev, sector, block_size, page2,
727 REQ_OP_READ, 0, false)) {
728 md_error(mddev, rdev);
729 pr_debug("%s:%*s read failed!\n", __func__,
730 indent, "");
731 ret = -EIO;
732 goto out;
733 }
734
735 ppl_xor(block_size, page1, page2);
736
737 indent -= 2;
738 }
739
740 if (!update_parity)
741 continue;
742
743 if (pp_size > 0) {
744 pr_debug("%s:%*s reading pp disk sector %llu\n",
745 __func__, indent, "",
746 (unsigned long long)(ppl_sector + i));
747 if (!sync_page_io(log->rdev,
748 ppl_sector - log->rdev->data_offset + i,
749 block_size, page2, REQ_OP_READ, 0,
750 false)) {
751 pr_debug("%s:%*s read failed!\n", __func__,
752 indent, "");
753 md_error(mddev, log->rdev);
754 ret = -EIO;
755 goto out;
756 }
757
758 ppl_xor(block_size, page1, page2);
759 }
760
761 /* map raid sector to parity disk */
762 parity_sector = raid5_compute_sector(conf, r_sector_first + i,
763 0, &disk, &sh);
764 BUG_ON(sh.pd_idx != le32_to_cpu(e->parity_disk));
765 parity_rdev = conf->disks[sh.pd_idx].rdev;
766
767 BUG_ON(parity_rdev->bdev->bd_dev != log->rdev->bdev->bd_dev);
768 pr_debug("%s:%*s write parity at sector %llu, disk %s\n",
769 __func__, indent, "",
770 (unsigned long long)parity_sector,
771 bdevname(parity_rdev->bdev, b));
772 if (!sync_page_io(parity_rdev, parity_sector, block_size,
773 page1, REQ_OP_WRITE, 0, false)) {
774 pr_debug("%s:%*s parity write error!\n", __func__,
775 indent, "");
776 md_error(mddev, parity_rdev);
777 ret = -EIO;
778 goto out;
779 }
780 }
781out:
782 if (page1)
783 __free_page(page1);
784 if (page2)
785 __free_page(page2);
786 return ret;
787}
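/*
 * Standalone sketch (not part of the driver, helper name made up) of the
 * entry geometry derived at the top of ppl_recover_entry(): how many data
 * disks an entry spans and how many sectors of each strip it covers, for the
 * cases drawn in the comment above.
 */
#if 0
#include <stdio.h>

struct entry_geometry {
	int data_disks;		/* data disks the entry touches */
	int strip_sectors;	/* sectors covered on each of them */
};

static struct entry_geometry compute_geometry(unsigned int pp_size,
					      unsigned int data_size,
					      int chunk_sectors,
					      int array_data_disks)
{
	struct entry_geometry g;

	if (pp_size > 0 && (pp_size >> 9) < chunk_sectors) {
		/* cases 0 and 1: pp covers one strip per modified disk */
		g.data_disks = data_size / pp_size;
		g.strip_sectors = pp_size >> 9;
	} else if (pp_size == 0) {
		/* case 2: full stripe write, no partial parity stored */
		g.data_disks = array_data_disks;
		g.strip_sectors = (data_size >> 9) / array_data_disks;
	} else {
		/* cases 3 and 4: a full chunk of partial parity */
		g.data_disks = array_data_disks;
		g.strip_sectors = chunk_sectors;
	}
	return g;
}

int main(void)
{
	/* case 1 with a 64k chunk: two disks modified, 8k written to each */
	struct entry_geometry g = compute_geometry(8192, 16384, 128, 2);

	printf("data_disks=%d strip_sectors=%d\n", g.data_disks, g.strip_sectors);
	return 0;
}
#endif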
788
789static int ppl_recover(struct ppl_log *log, struct ppl_header *pplhdr)
790{
791 struct ppl_conf *ppl_conf = log->ppl_conf;
792 struct md_rdev *rdev = log->rdev;
793 struct mddev *mddev = rdev->mddev;
794 sector_t ppl_sector = rdev->ppl.sector + (PPL_HEADER_SIZE >> 9);
795 struct page *page;
796 int i;
797 int ret = 0;
798
799 page = alloc_page(GFP_KERNEL);
800 if (!page)
801 return -ENOMEM;
802
803 /* iterate through all PPL entries saved */
804 for (i = 0; i < le32_to_cpu(pplhdr->entries_count); i++) {
805 struct ppl_header_entry *e = &pplhdr->entries[i];
806 u32 pp_size = le32_to_cpu(e->pp_size);
807 sector_t sector = ppl_sector;
808 int ppl_entry_sectors = pp_size >> 9;
809 u32 crc, crc_stored;
810
811 pr_debug("%s: disk: %d entry: %d ppl_sector: %llu pp_size: %u\n",
812 __func__, rdev->raid_disk, i,
813 (unsigned long long)ppl_sector, pp_size);
814
815 crc = ~0;
816 crc_stored = le32_to_cpu(e->checksum);
817
 818 /* read partial parity for this entry and calculate its checksum */
819 while (pp_size) {
820 int s = pp_size > PAGE_SIZE ? PAGE_SIZE : pp_size;
821
822 if (!sync_page_io(rdev, sector - rdev->data_offset,
823 s, page, REQ_OP_READ, 0, false)) {
824 md_error(mddev, rdev);
825 ret = -EIO;
826 goto out;
827 }
828
829 crc = crc32c_le(crc, page_address(page), s);
830
831 pp_size -= s;
832 sector += s >> 9;
833 }
834
835 crc = ~crc;
836
837 if (crc != crc_stored) {
838 /*
839 * Don't recover this entry if the checksum does not
840 * match, but keep going and try to recover other
841 * entries.
842 */
843 pr_debug("%s: ppl entry crc does not match: stored: 0x%x calculated: 0x%x\n",
844 __func__, crc_stored, crc);
845 ppl_conf->mismatch_count++;
846 } else {
847 ret = ppl_recover_entry(log, e, ppl_sector);
848 if (ret)
849 goto out;
850 ppl_conf->recovered_entries++;
851 }
852
853 ppl_sector += ppl_entry_sectors;
854 }
855
856 /* flush the disk cache after recovery if necessary */
857 ret = blkdev_issue_flush(rdev->bdev, GFP_KERNEL, NULL);
858out:
859 __free_page(page);
860 return ret;
861}
862
863static int ppl_write_empty_header(struct ppl_log *log)
864{
865 struct page *page;
866 struct ppl_header *pplhdr;
867 struct md_rdev *rdev = log->rdev;
868 int ret = 0;
869
870 pr_debug("%s: disk: %d ppl_sector: %llu\n", __func__,
871 rdev->raid_disk, (unsigned long long)rdev->ppl.sector);
872
873 page = alloc_page(GFP_NOIO | __GFP_ZERO);
874 if (!page)
875 return -ENOMEM;
876
877 pplhdr = page_address(page);
878 memset(pplhdr->reserved, 0xff, PPL_HDR_RESERVED);
879 pplhdr->signature = cpu_to_le32(log->ppl_conf->signature);
880 pplhdr->checksum = cpu_to_le32(~crc32c_le(~0, pplhdr, PAGE_SIZE));
881
882 if (!sync_page_io(rdev, rdev->ppl.sector - rdev->data_offset,
883 PPL_HEADER_SIZE, page, REQ_OP_WRITE | REQ_FUA, 0,
884 false)) {
885 md_error(rdev->mddev, rdev);
886 ret = -EIO;
887 }
888
889 __free_page(page);
890 return ret;
891}
892
893static int ppl_load_distributed(struct ppl_log *log)
894{
895 struct ppl_conf *ppl_conf = log->ppl_conf;
896 struct md_rdev *rdev = log->rdev;
897 struct mddev *mddev = rdev->mddev;
898 struct page *page;
899 struct ppl_header *pplhdr;
900 u32 crc, crc_stored;
901 u32 signature;
902 int ret = 0;
903
904 pr_debug("%s: disk: %d\n", __func__, rdev->raid_disk);
905
906 /* read PPL header */
907 page = alloc_page(GFP_KERNEL);
908 if (!page)
909 return -ENOMEM;
910
911 if (!sync_page_io(rdev, rdev->ppl.sector - rdev->data_offset,
912 PAGE_SIZE, page, REQ_OP_READ, 0, false)) {
913 md_error(mddev, rdev);
914 ret = -EIO;
915 goto out;
916 }
917 pplhdr = page_address(page);
918
919 /* check header validity */
920 crc_stored = le32_to_cpu(pplhdr->checksum);
921 pplhdr->checksum = 0;
922 crc = ~crc32c_le(~0, pplhdr, PAGE_SIZE);
923
924 if (crc_stored != crc) {
925 pr_debug("%s: ppl header crc does not match: stored: 0x%x calculated: 0x%x\n",
926 __func__, crc_stored, crc);
927 ppl_conf->mismatch_count++;
928 goto out;
929 }
930
931 signature = le32_to_cpu(pplhdr->signature);
932
933 if (mddev->external) {
934 /*
935 * For external metadata the header signature is set and
936 * validated in userspace.
937 */
938 ppl_conf->signature = signature;
939 } else if (ppl_conf->signature != signature) {
940 pr_debug("%s: ppl header signature does not match: stored: 0x%x configured: 0x%x\n",
941 __func__, signature, ppl_conf->signature);
942 ppl_conf->mismatch_count++;
943 goto out;
944 }
945
946 /* attempt to recover from log if we are starting a dirty array */
947 if (!mddev->pers && mddev->recovery_cp != MaxSector)
948 ret = ppl_recover(log, pplhdr);
949out:
950 /* write empty header if we are starting the array */
951 if (!ret && !mddev->pers)
952 ret = ppl_write_empty_header(log);
953
954 __free_page(page);
955
956 pr_debug("%s: return: %d mismatch_count: %d recovered_entries: %d\n",
957 __func__, ret, ppl_conf->mismatch_count,
958 ppl_conf->recovered_entries);
959 return ret;
960}
961
962static int ppl_load(struct ppl_conf *ppl_conf)
963{
964 int ret = 0;
965 u32 signature = 0;
966 bool signature_set = false;
967 int i;
968
969 for (i = 0; i < ppl_conf->count; i++) {
970 struct ppl_log *log = &ppl_conf->child_logs[i];
971
972 /* skip missing drive */
973 if (!log->rdev)
974 continue;
975
976 ret = ppl_load_distributed(log);
977 if (ret)
978 break;
979
980 /*
981 * For external metadata we can't check if the signature is
982 * correct on a single drive, but we can check if it is the same
983 * on all drives.
984 */
985 if (ppl_conf->mddev->external) {
986 if (!signature_set) {
987 signature = ppl_conf->signature;
988 signature_set = true;
989 } else if (signature != ppl_conf->signature) {
990 pr_warn("md/raid:%s: PPL header signature does not match on all member drives\n",
991 mdname(ppl_conf->mddev));
992 ret = -EINVAL;
993 break;
994 }
995 }
996 }
997
998 pr_debug("%s: return: %d mismatch_count: %d recovered_entries: %d\n",
999 __func__, ret, ppl_conf->mismatch_count,
1000 ppl_conf->recovered_entries);
1001 return ret;
1002}
1003
1004static void __ppl_exit_log(struct ppl_conf *ppl_conf)
1005{
1006 clear_bit(MD_HAS_PPL, &ppl_conf->mddev->flags);
1007
1008 kfree(ppl_conf->child_logs);
1009
1010 mempool_destroy(ppl_conf->meta_pool);
1011 if (ppl_conf->bs)
1012 bioset_free(ppl_conf->bs);
1013 mempool_destroy(ppl_conf->io_pool);
1014 kmem_cache_destroy(ppl_conf->io_kc);
1015
1016 kfree(ppl_conf);
1017}
1018
1019void ppl_exit_log(struct r5conf *conf)
1020{
1021 struct ppl_conf *ppl_conf = conf->log_private;
1022
1023 if (ppl_conf) {
1024 __ppl_exit_log(ppl_conf);
1025 conf->log_private = NULL;
1026 }
1027}
1028
1029static int ppl_validate_rdev(struct md_rdev *rdev)
1030{
1031 char b[BDEVNAME_SIZE];
1032 int ppl_data_sectors;
1033 int ppl_size_new;
1034
1035 /*
1036 * The configured PPL size must be enough to store
1037 * the header and (at the very least) partial parity
1038 * for one stripe. Round it down to ensure the data
1039 * space is cleanly divisible by stripe size.
1040 */
1041 ppl_data_sectors = rdev->ppl.size - (PPL_HEADER_SIZE >> 9);
1042
1043 if (ppl_data_sectors > 0)
1044 ppl_data_sectors = rounddown(ppl_data_sectors, STRIPE_SECTORS);
1045
1046 if (ppl_data_sectors <= 0) {
1047 pr_warn("md/raid:%s: PPL space too small on %s\n",
1048 mdname(rdev->mddev), bdevname(rdev->bdev, b));
1049 return -ENOSPC;
1050 }
1051
1052 ppl_size_new = ppl_data_sectors + (PPL_HEADER_SIZE >> 9);
1053
1054 if ((rdev->ppl.sector < rdev->data_offset &&
1055 rdev->ppl.sector + ppl_size_new > rdev->data_offset) ||
1056 (rdev->ppl.sector >= rdev->data_offset &&
1057 rdev->data_offset + rdev->sectors > rdev->ppl.sector)) {
1058 pr_warn("md/raid:%s: PPL space overlaps with data on %s\n",
1059 mdname(rdev->mddev), bdevname(rdev->bdev, b));
1060 return -EINVAL;
1061 }
1062
1063 if (!rdev->mddev->external &&
1064 ((rdev->ppl.offset > 0 && rdev->ppl.offset < (rdev->sb_size >> 9)) ||
1065 (rdev->ppl.offset <= 0 && rdev->ppl.offset + ppl_size_new > 0))) {
1066 pr_warn("md/raid:%s: PPL space overlaps with superblock on %s\n",
1067 mdname(rdev->mddev), bdevname(rdev->bdev, b));
1068 return -EINVAL;
1069 }
1070
1071 rdev->ppl.size = ppl_size_new;
1072
1073 return 0;
1074}
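/*
 * Standalone sketch (not part of the driver, helper name made up) of the
 * size check in ppl_validate_rdev() above: the configured PPL region must
 * hold the 4k header plus at least one stripe worth of partial parity, and
 * the data part is rounded down to a multiple of STRIPE_SECTORS.
 */
#if 0
#include <stdio.h>

#define PPL_HEADER_SECTORS	(4096 >> 9)	/* PPL_HEADER_SIZE >> 9 */
#define STRIPE_SECTORS		8		/* 4k stripe_heads, 512B sectors */

/* returns the adjusted rdev->ppl.size, or -1 if the region is too small */
static int usable_ppl_size(int ppl_size)
{
	int data_sectors = ppl_size - PPL_HEADER_SECTORS;

	if (data_sectors <= 0)
		return -1;
	data_sectors -= data_sectors % STRIPE_SECTORS;	/* rounddown() */
	if (data_sectors <= 0)
		return -1;
	return data_sectors + PPL_HEADER_SECTORS;
}

int main(void)
{
	/* a 1 MiB PPL region: 2048 sectors, 8 for the header, 2040 for data */
	printf("ppl.size = %d\n", usable_ppl_size(2048));
	return 0;
}
#endif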
1075
1076int ppl_init_log(struct r5conf *conf)
1077{
1078 struct ppl_conf *ppl_conf;
1079 struct mddev *mddev = conf->mddev;
1080 int ret = 0;
1081 int i;
 1082 bool need_cache_flush = false;
1083
1084 pr_debug("md/raid:%s: enabling distributed Partial Parity Log\n",
1085 mdname(conf->mddev));
1086
1087 if (PAGE_SIZE != 4096)
1088 return -EINVAL;
1089
1090 if (mddev->level != 5) {
1091 pr_warn("md/raid:%s PPL is not compatible with raid level %d\n",
1092 mdname(mddev), mddev->level);
1093 return -EINVAL;
1094 }
1095
1096 if (mddev->bitmap_info.file || mddev->bitmap_info.offset) {
1097 pr_warn("md/raid:%s PPL is not compatible with bitmap\n",
1098 mdname(mddev));
1099 return -EINVAL;
1100 }
1101
1102 if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) {
1103 pr_warn("md/raid:%s PPL is not compatible with journal\n",
1104 mdname(mddev));
1105 return -EINVAL;
1106 }
1107
1108 ppl_conf = kzalloc(sizeof(struct ppl_conf), GFP_KERNEL);
1109 if (!ppl_conf)
1110 return -ENOMEM;
1111
1112 ppl_conf->mddev = mddev;
1113
1114 ppl_conf->io_kc = KMEM_CACHE(ppl_io_unit, 0);
1115 if (!ppl_conf->io_kc) {
1116 ret = -EINVAL;
1117 goto err;
1118 }
1119
1120 ppl_conf->io_pool = mempool_create_slab_pool(conf->raid_disks, ppl_conf->io_kc);
1121 if (!ppl_conf->io_pool) {
1122 ret = -EINVAL;
1123 goto err;
1124 }
1125
1126 ppl_conf->bs = bioset_create(conf->raid_disks, 0);
1127 if (!ppl_conf->bs) {
1128 ret = -EINVAL;
1129 goto err;
1130 }
1131
1132 ppl_conf->meta_pool = mempool_create_page_pool(conf->raid_disks, 0);
1133 if (!ppl_conf->meta_pool) {
1134 ret = -EINVAL;
1135 goto err;
1136 }
1137
1138 ppl_conf->count = conf->raid_disks;
1139 ppl_conf->child_logs = kcalloc(ppl_conf->count, sizeof(struct ppl_log),
1140 GFP_KERNEL);
1141 if (!ppl_conf->child_logs) {
1142 ret = -ENOMEM;
1143 goto err;
1144 }
1145
1146 atomic64_set(&ppl_conf->seq, 0);
1147 INIT_LIST_HEAD(&ppl_conf->no_mem_stripes);
1148 spin_lock_init(&ppl_conf->no_mem_stripes_lock);
1149
1150 if (!mddev->external) {
1151 ppl_conf->signature = ~crc32c_le(~0, mddev->uuid, sizeof(mddev->uuid));
1152 ppl_conf->block_size = 512;
1153 } else {
1154 ppl_conf->block_size = queue_logical_block_size(mddev->queue);
1155 }
1156
1157 for (i = 0; i < ppl_conf->count; i++) {
1158 struct ppl_log *log = &ppl_conf->child_logs[i];
1159 struct md_rdev *rdev = conf->disks[i].rdev;
1160
1161 mutex_init(&log->io_mutex);
1162 spin_lock_init(&log->io_list_lock);
1163 INIT_LIST_HEAD(&log->io_list);
1164
1165 log->ppl_conf = ppl_conf;
1166 log->rdev = rdev;
1167
1168 if (rdev) {
1169 struct request_queue *q;
1170
1171 ret = ppl_validate_rdev(rdev);
1172 if (ret)
1173 goto err;
1174
1175 q = bdev_get_queue(rdev->bdev);
1176 if (test_bit(QUEUE_FLAG_WC, &q->queue_flags))
1177 need_cache_flush = true;
1178 }
1179 }
1180
1181 if (need_cache_flush)
1182 pr_warn("md/raid:%s: Volatile write-back cache should be disabled on all member drives when using PPL!\n",
1183 mdname(mddev));
1184
1185 /* load and possibly recover the logs from the member disks */
1186 ret = ppl_load(ppl_conf);
1187
1188 if (ret) {
1189 goto err;
1190 } else if (!mddev->pers &&
1191 mddev->recovery_cp == 0 && !mddev->degraded &&
1192 ppl_conf->recovered_entries > 0 &&
1193 ppl_conf->mismatch_count == 0) {
1194 /*
1195 * If we are starting a dirty array and the recovery succeeds
1196 * without any issues, set the array as clean.
1197 */
1198 mddev->recovery_cp = MaxSector;
1199 set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
1200 } else if (mddev->pers && ppl_conf->mismatch_count > 0) {
1201 /* no mismatch allowed when enabling PPL for a running array */
1202 ret = -EINVAL;
1203 goto err;
1204 }
1205
 1206 conf->log_private = ppl_conf;
 1207 set_bit(MD_HAS_PPL, &ppl_conf->mddev->flags);
1208
1209 return 0;
1210err:
1211 __ppl_exit_log(ppl_conf);
1212 return ret;
1213}
1214
1215int ppl_modify_log(struct r5conf *conf, struct md_rdev *rdev, bool add)
1216{
1217 struct ppl_conf *ppl_conf = conf->log_private;
1218 struct ppl_log *log;
1219 int ret = 0;
1220 char b[BDEVNAME_SIZE];
1221
1222 if (!rdev)
1223 return -EINVAL;
1224
1225 pr_debug("%s: disk: %d operation: %s dev: %s\n",
1226 __func__, rdev->raid_disk, add ? "add" : "remove",
1227 bdevname(rdev->bdev, b));
1228
1229 if (rdev->raid_disk < 0)
1230 return 0;
1231
1232 if (rdev->raid_disk >= ppl_conf->count)
1233 return -ENODEV;
1234
1235 log = &ppl_conf->child_logs[rdev->raid_disk];
1236
1237 mutex_lock(&log->io_mutex);
1238 if (add) {
1239 ret = ppl_validate_rdev(rdev);
1240 if (!ret) {
1241 log->rdev = rdev;
1242 ret = ppl_write_empty_header(log);
1243 }
1244 } else {
1245 log->rdev = NULL;
1246 }
1247 mutex_unlock(&log->io_mutex);
1248
1249 return ret;
1250}