// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2011, 2012 STRATO.  All rights reserved.
 */

#include <linux/blkdev.h>
#include <linux/ratelimit.h>
#include <linux/sched/mm.h>
#include <crypto/hash.h>
#include "ctree.h"
#include "discard.h"
#include "volumes.h"
#include "disk-io.h"
#include "ordered-data.h"
#include "transaction.h"
#include "backref.h"
#include "extent_io.h"
#include "dev-replace.h"
#include "check-integrity.h"
#include "rcu-string.h"
#include "raid56.h"
#include "block-group.h"
#include "zoned.h"

/*
 * This is only the first step towards a full-featured scrub. It reads all
 * extent and super block and verifies the checksums. In case a bad checksum
 * is found or the extent cannot be read, good data will be written back if
 * any can be found.
 *
 * Future enhancements:
 *  - In case an unrepairable extent is encountered, track which files are
 *    affected and report them
 *  - track and record media errors, throw out bad devices
 *  - add a mode to also read unallocated space
 */

struct scrub_block;
struct scrub_ctx;

/*
 * the following three values only influence the performance.
 * The last one configures the number of parallel and outstanding I/O
 * operations. The first two values configure an upper limit for the number
 * of (dynamically allocated) pages that are added to a bio.
 */
#define SCRUB_PAGES_PER_RD_BIO	32	/* 128k per bio */
#define SCRUB_PAGES_PER_WR_BIO	32	/* 128k per bio */
#define SCRUB_BIOS_PER_SCTX	64	/* 8MB per device in flight */

/*
 * the following value times PAGE_SIZE needs to be large enough to match the
 * largest node/leaf/sector size that shall be supported.
 * Values larger than BTRFS_STRIPE_LEN are not supported.
 */
#define SCRUB_MAX_PAGES_PER_BLOCK	16	/* 64k per node/leaf/sector */

struct scrub_recover {
	refcount_t		refs;
	struct btrfs_bio	*bbio;
	u64			map_length;
};

struct scrub_page {
	struct scrub_block	*sblock;
	struct page		*page;
	struct btrfs_device	*dev;
	struct list_head	list;
	u64			flags;  /* extent flags */
	u64			generation;
	u64			logical;
	u64			physical;
	u64			physical_for_dev_replace;
	atomic_t		refs;
	u8			mirror_num;
	int			have_csum:1;
	int			io_error:1;
	u8			csum[BTRFS_CSUM_SIZE];

	struct scrub_recover	*recover;
};

struct scrub_bio {
	int			index;
	struct scrub_ctx	*sctx;
	struct btrfs_device	*dev;
	struct bio		*bio;
	blk_status_t		status;
	u64			logical;
	u64			physical;
#if SCRUB_PAGES_PER_WR_BIO >= SCRUB_PAGES_PER_RD_BIO
	struct scrub_page	*pagev[SCRUB_PAGES_PER_WR_BIO];
#else
	struct scrub_page	*pagev[SCRUB_PAGES_PER_RD_BIO];
#endif
	int			page_count;
	int			next_free;
	struct btrfs_work	work;
};

struct scrub_block {
	struct scrub_page	*pagev[SCRUB_MAX_PAGES_PER_BLOCK];
	int			page_count;
	atomic_t		outstanding_pages;
	refcount_t		refs; /* free mem on transition to zero */
	struct scrub_ctx	*sctx;
	struct scrub_parity	*sparity;
	struct {
		unsigned int	header_error:1;
		unsigned int	checksum_error:1;
		unsigned int	no_io_error_seen:1;
		unsigned int	generation_error:1; /* also sets header_error */

		/* The following is for the data used to check parity */
		/* It is for the data with checksum */
		unsigned int	data_corrected:1;
	};
	struct btrfs_work	work;
};

/* Used for the chunks with parity stripe such as RAID5/6 */
struct scrub_parity {
	struct scrub_ctx	*sctx;

	struct btrfs_device	*scrub_dev;

	u64			logic_start;

	u64			logic_end;

	int			nsectors;

	u32			stripe_len;

	refcount_t		refs;

	struct list_head	spages;

	/* Work of parity check and repair */
	struct btrfs_work	work;

	/* Mark the parity blocks which have data */
	unsigned long		*dbitmap;

	/*
	 * Mark the parity blocks which have data, but errors happened when
	 * reading or checking that data
	 */
	unsigned long		*ebitmap;

	unsigned long		bitmap[];
};

struct scrub_ctx {
	struct scrub_bio	*bios[SCRUB_BIOS_PER_SCTX];
	struct btrfs_fs_info	*fs_info;
	int			first_free;
	int			curr;
	atomic_t		bios_in_flight;
	atomic_t		workers_pending;
	spinlock_t		list_lock;
	wait_queue_head_t	list_wait;
	struct list_head	csum_list;
	atomic_t		cancel_req;
	int			readonly;
	int			pages_per_rd_bio;

	/* State of IO submission throttling affecting the associated device */
	ktime_t			throttle_deadline;
	u64			throttle_sent;

	int			is_dev_replace;
	u64			write_pointer;

	struct scrub_bio	*wr_curr_bio;
	struct mutex		wr_lock;
	int			pages_per_wr_bio; /* <= SCRUB_PAGES_PER_WR_BIO */
	struct btrfs_device	*wr_tgtdev;
	bool			flush_all_writes;

	/*
	 * statistics
	 */
	struct btrfs_scrub_progress stat;
	spinlock_t		stat_lock;

	/*
	 * Use a ref counter to avoid use-after-free issues. Scrub workers
	 * decrement bios_in_flight and workers_pending and then do a wakeup
	 * on the list_wait wait queue. We must ensure the main scrub task
	 * doesn't free the scrub context before or while the workers are
	 * doing the wakeup() call.
	 */
	refcount_t		refs;
};

struct scrub_warning {
	struct btrfs_path	*path;
	u64			extent_item_size;
	const char		*errstr;
	u64			physical;
	u64			logical;
	struct btrfs_device	*dev;
};

struct full_stripe_lock {
	struct rb_node node;
	u64 logical;
	u64 refs;
	struct mutex mutex;
};

static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
				     struct scrub_block *sblocks_for_recheck);
static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
				struct scrub_block *sblock,
				int retry_failed_mirror);
static void scrub_recheck_block_checksum(struct scrub_block *sblock);
static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
					     struct scrub_block *sblock_good);
static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
					    struct scrub_block *sblock_good,
					    int page_num, int force_write);
static void scrub_write_block_to_dev_replace(struct scrub_block *sblock);
static int scrub_write_page_to_dev_replace(struct scrub_block *sblock,
					   int page_num);
static int scrub_checksum_data(struct scrub_block *sblock);
static int scrub_checksum_tree_block(struct scrub_block *sblock);
static int scrub_checksum_super(struct scrub_block *sblock);
static void scrub_block_put(struct scrub_block *sblock);
static void scrub_page_get(struct scrub_page *spage);
static void scrub_page_put(struct scrub_page *spage);
static void scrub_parity_get(struct scrub_parity *sparity);
static void scrub_parity_put(struct scrub_parity *sparity);
static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u32 len,
		       u64 physical, struct btrfs_device *dev, u64 flags,
		       u64 gen, int mirror_num, u8 *csum,
		       u64 physical_for_dev_replace);
static void scrub_bio_end_io(struct bio *bio);
static void scrub_bio_end_io_worker(struct btrfs_work *work);
static void scrub_block_complete(struct scrub_block *sblock);
static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
			       u64 extent_logical, u32 extent_len,
			       u64 *extent_physical,
			       struct btrfs_device **extent_dev,
			       int *extent_mirror_num);
static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
				    struct scrub_page *spage);
static void scrub_wr_submit(struct scrub_ctx *sctx);
static void scrub_wr_bio_end_io(struct bio *bio);
static void scrub_wr_bio_end_io_worker(struct btrfs_work *work);
static void scrub_put_ctx(struct scrub_ctx *sctx);

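/* True if the page was mapped through a RAID5/6 profile and carries recovery info. */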
static inline int scrub_is_page_on_raid56(struct scrub_page *spage)
{
	return spage->recover &&
	       (spage->recover->bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK);
}

static void scrub_pending_bio_inc(struct scrub_ctx *sctx)
{
	refcount_inc(&sctx->refs);
	atomic_inc(&sctx->bios_in_flight);
}

static void scrub_pending_bio_dec(struct scrub_ctx *sctx)
{
	atomic_dec(&sctx->bios_in_flight);
	wake_up(&sctx->list_wait);
	scrub_put_ctx(sctx);
}

static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
{
	while (atomic_read(&fs_info->scrub_pause_req)) {
		mutex_unlock(&fs_info->scrub_lock);
		wait_event(fs_info->scrub_pause_wait,
			   atomic_read(&fs_info->scrub_pause_req) == 0);
		mutex_lock(&fs_info->scrub_lock);
	}
}

static void scrub_pause_on(struct btrfs_fs_info *fs_info)
{
	atomic_inc(&fs_info->scrubs_paused);
	wake_up(&fs_info->scrub_pause_wait);
}

static void scrub_pause_off(struct btrfs_fs_info *fs_info)
{
	mutex_lock(&fs_info->scrub_lock);
	__scrub_blocked_if_needed(fs_info);
	atomic_dec(&fs_info->scrubs_paused);
	mutex_unlock(&fs_info->scrub_lock);

	wake_up(&fs_info->scrub_pause_wait);
}

static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
{
	scrub_pause_on(fs_info);
	scrub_pause_off(fs_info);
}

/*
 * Insert new full stripe lock into full stripe locks tree
 *
 * Return pointer to existing or newly inserted full_stripe_lock structure if
 * everything works well.
 * Return ERR_PTR(-ENOMEM) if we failed to allocate memory
 *
 * NOTE: caller must hold full_stripe_locks_root->lock before calling this
 * function
 */
static struct full_stripe_lock *insert_full_stripe_lock(
		struct btrfs_full_stripe_locks_tree *locks_root,
		u64 fstripe_logical)
{
	struct rb_node **p;
	struct rb_node *parent = NULL;
	struct full_stripe_lock *entry;
	struct full_stripe_lock *ret;

	lockdep_assert_held(&locks_root->lock);

	p = &locks_root->root.rb_node;
	while (*p) {
		parent = *p;
		entry = rb_entry(parent, struct full_stripe_lock, node);
		if (fstripe_logical < entry->logical) {
			p = &(*p)->rb_left;
		} else if (fstripe_logical > entry->logical) {
			p = &(*p)->rb_right;
		} else {
			entry->refs++;
			return entry;
		}
	}

	/*
	 * Insert new lock.
	 */
	ret = kmalloc(sizeof(*ret), GFP_KERNEL);
	if (!ret)
		return ERR_PTR(-ENOMEM);
	ret->logical = fstripe_logical;
	ret->refs = 1;
	mutex_init(&ret->mutex);

	rb_link_node(&ret->node, parent, p);
	rb_insert_color(&ret->node, &locks_root->root);
	return ret;
}

/*
 * Search for a full stripe lock of a block group
 *
 * Return pointer to existing full stripe lock if found
 * Return NULL if not found
 */
static struct full_stripe_lock *search_full_stripe_lock(
		struct btrfs_full_stripe_locks_tree *locks_root,
		u64 fstripe_logical)
{
	struct rb_node *node;
	struct full_stripe_lock *entry;

	lockdep_assert_held(&locks_root->lock);

	node = locks_root->root.rb_node;
	while (node) {
		entry = rb_entry(node, struct full_stripe_lock, node);
		if (fstripe_logical < entry->logical)
			node = node->rb_left;
		else if (fstripe_logical > entry->logical)
			node = node->rb_right;
		else
			return entry;
	}
	return NULL;
}

/*
 * Helper to get full stripe logical from a normal bytenr.
 *
 * Caller must ensure @cache is a RAID56 block group.
 */
static u64 get_full_stripe_logical(struct btrfs_block_group *cache, u64 bytenr)
{
	u64 ret;

	/*
	 * Due to chunk item size limit, full stripe length should not be
	 * larger than U32_MAX. Just a sanity check here.
	 */
	WARN_ON_ONCE(cache->full_stripe_len >= U32_MAX);

	/*
	 * round_down() can only handle power of 2, while RAID56 full
	 * stripe length can be 64KiB * n, so we need to manually round down.
	 */
	ret = div64_u64(bytenr - cache->start, cache->full_stripe_len) *
			cache->full_stripe_len + cache->start;
	return ret;
}

/*
 * Lock a full stripe to avoid concurrency of recovery and read
 *
 * It's only used for profiles with parities (RAID5/6), for other profiles it
 * does nothing.
 *
 * Return 0 if we locked full stripe covering @bytenr, with a mutex held.
 * So the caller must call unlock_full_stripe() in the same context.
 *
 * Return <0 if an error is encountered.
 */
static int lock_full_stripe(struct btrfs_fs_info *fs_info, u64 bytenr,
			    bool *locked_ret)
{
	struct btrfs_block_group *bg_cache;
	struct btrfs_full_stripe_locks_tree *locks_root;
	struct full_stripe_lock *existing;
	u64 fstripe_start;
	int ret = 0;

	*locked_ret = false;
	bg_cache = btrfs_lookup_block_group(fs_info, bytenr);
	if (!bg_cache) {
		ASSERT(0);
		return -ENOENT;
	}

	/* Profiles not based on parity don't need full stripe lock */
	if (!(bg_cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK))
		goto out;
	locks_root = &bg_cache->full_stripe_locks_root;

	fstripe_start = get_full_stripe_logical(bg_cache, bytenr);

	/* Now insert the full stripe lock */
	mutex_lock(&locks_root->lock);
	existing = insert_full_stripe_lock(locks_root, fstripe_start);
	mutex_unlock(&locks_root->lock);
	if (IS_ERR(existing)) {
		ret = PTR_ERR(existing);
		goto out;
	}
	mutex_lock(&existing->mutex);
	*locked_ret = true;
out:
	btrfs_put_block_group(bg_cache);
	return ret;
}

/*
 * Unlock a full stripe.
 *
 * NOTE: Caller must ensure it's the same context calling corresponding
 * lock_full_stripe().
 *
 * Return 0 if we unlock full stripe without problem.
 * Return <0 for error
 */
static int unlock_full_stripe(struct btrfs_fs_info *fs_info, u64 bytenr,
			      bool locked)
{
	struct btrfs_block_group *bg_cache;
	struct btrfs_full_stripe_locks_tree *locks_root;
	struct full_stripe_lock *fstripe_lock;
	u64 fstripe_start;
	bool freeit = false;
	int ret = 0;

	/* If we didn't acquire full stripe lock, no need to continue */
	if (!locked)
		return 0;

	bg_cache = btrfs_lookup_block_group(fs_info, bytenr);
	if (!bg_cache) {
		ASSERT(0);
		return -ENOENT;
	}
	if (!(bg_cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK))
		goto out;

	locks_root = &bg_cache->full_stripe_locks_root;
	fstripe_start = get_full_stripe_logical(bg_cache, bytenr);

	mutex_lock(&locks_root->lock);
	fstripe_lock = search_full_stripe_lock(locks_root, fstripe_start);
	/* Unpaired unlock_full_stripe() detected */
	if (!fstripe_lock) {
		WARN_ON(1);
		ret = -ENOENT;
		mutex_unlock(&locks_root->lock);
		goto out;
	}

	if (fstripe_lock->refs == 0) {
		WARN_ON(1);
		btrfs_warn(fs_info, "full stripe lock at %llu refcount underflow",
			fstripe_lock->logical);
	} else {
		fstripe_lock->refs--;
	}

	if (fstripe_lock->refs == 0) {
		rb_erase(&fstripe_lock->node, &locks_root->root);
		freeit = true;
	}
	mutex_unlock(&locks_root->lock);

	mutex_unlock(&fstripe_lock->mutex);
	if (freeit)
		kfree(fstripe_lock);
out:
	btrfs_put_block_group(bg_cache);
	return ret;
}

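/* Drop all checksums still queued on the per-context csum list. */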
static void scrub_free_csums(struct scrub_ctx *sctx)
{
	while (!list_empty(&sctx->csum_list)) {
		struct btrfs_ordered_sum *sum;
		sum = list_first_entry(&sctx->csum_list,
				       struct btrfs_ordered_sum, list);
		list_del(&sum->list);
		kfree(sum);
	}
}

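/*
 * Release everything hanging off a scrub context: any partially filled
 * read bio, the preallocated scrub_bio array, queued checksums and the
 * context itself.
 */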
static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx)
{
	int i;

	if (!sctx)
		return;

	/* this can happen when scrub is cancelled */
	if (sctx->curr != -1) {
		struct scrub_bio *sbio = sctx->bios[sctx->curr];

		for (i = 0; i < sbio->page_count; i++) {
			WARN_ON(!sbio->pagev[i]->page);
			scrub_block_put(sbio->pagev[i]->sblock);
		}
		bio_put(sbio->bio);
	}

	for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
		struct scrub_bio *sbio = sctx->bios[i];

		if (!sbio)
			break;
		kfree(sbio);
	}

	kfree(sctx->wr_curr_bio);
	scrub_free_csums(sctx);
	kfree(sctx);
}

static void scrub_put_ctx(struct scrub_ctx *sctx)
{
	if (refcount_dec_and_test(&sctx->refs))
		scrub_free_ctx(sctx);
}

static noinline_for_stack struct scrub_ctx *scrub_setup_ctx(
		struct btrfs_fs_info *fs_info, int is_dev_replace)
{
	struct scrub_ctx *sctx;
	int		i;

	sctx = kzalloc(sizeof(*sctx), GFP_KERNEL);
	if (!sctx)
		goto nomem;
	refcount_set(&sctx->refs, 1);
	sctx->is_dev_replace = is_dev_replace;
	sctx->pages_per_rd_bio = SCRUB_PAGES_PER_RD_BIO;
	sctx->curr = -1;
	sctx->fs_info = fs_info;
	INIT_LIST_HEAD(&sctx->csum_list);
	for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
		struct scrub_bio *sbio;

		sbio = kzalloc(sizeof(*sbio), GFP_KERNEL);
		if (!sbio)
			goto nomem;
		sctx->bios[i] = sbio;

		sbio->index = i;
		sbio->sctx = sctx;
		sbio->page_count = 0;
		btrfs_init_work(&sbio->work, scrub_bio_end_io_worker, NULL,
				NULL);

		if (i != SCRUB_BIOS_PER_SCTX - 1)
			sctx->bios[i]->next_free = i + 1;
		else
			sctx->bios[i]->next_free = -1;
	}
	sctx->first_free = 0;
	atomic_set(&sctx->bios_in_flight, 0);
	atomic_set(&sctx->workers_pending, 0);
	atomic_set(&sctx->cancel_req, 0);

	spin_lock_init(&sctx->list_lock);
	spin_lock_init(&sctx->stat_lock);
	init_waitqueue_head(&sctx->list_wait);
	sctx->throttle_deadline = 0;

	WARN_ON(sctx->wr_curr_bio != NULL);
	mutex_init(&sctx->wr_lock);
	sctx->wr_curr_bio = NULL;
	if (is_dev_replace) {
		WARN_ON(!fs_info->dev_replace.tgtdev);
		sctx->pages_per_wr_bio = SCRUB_PAGES_PER_WR_BIO;
		sctx->wr_tgtdev = fs_info->dev_replace.tgtdev;
		sctx->flush_all_writes = false;
	}

	return sctx;

nomem:
	scrub_free_ctx(sctx);
	return ERR_PTR(-ENOMEM);
}

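/*
 * Callback for iterate_extent_inodes(): resolve the paths of one inode that
 * references the bad extent and print a warning line for each of them.
 */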
static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,
				     void *warn_ctx)
{
	u64 isize;
	u32 nlink;
	int ret;
	int i;
	unsigned nofs_flag;
	struct extent_buffer *eb;
	struct btrfs_inode_item *inode_item;
	struct scrub_warning *swarn = warn_ctx;
	struct btrfs_fs_info *fs_info = swarn->dev->fs_info;
	struct inode_fs_paths *ipath = NULL;
	struct btrfs_root *local_root;
	struct btrfs_key key;

	local_root = btrfs_get_fs_root(fs_info, root, true);
	if (IS_ERR(local_root)) {
		ret = PTR_ERR(local_root);
		goto err;
	}

	/*
	 * this makes the path point to (inum INODE_ITEM ioff)
	 */
	key.objectid = inum;
	key.type = BTRFS_INODE_ITEM_KEY;
	key.offset = 0;

	ret = btrfs_search_slot(NULL, local_root, &key, swarn->path, 0, 0);
	if (ret) {
		btrfs_put_root(local_root);
		btrfs_release_path(swarn->path);
		goto err;
	}

	eb = swarn->path->nodes[0];
	inode_item = btrfs_item_ptr(eb, swarn->path->slots[0],
					struct btrfs_inode_item);
	isize = btrfs_inode_size(eb, inode_item);
	nlink = btrfs_inode_nlink(eb, inode_item);
	btrfs_release_path(swarn->path);

	/*
	 * init_path might indirectly call vmalloc, or use GFP_KERNEL. Scrub
	 * uses GFP_NOFS in this context, so we keep it consistent but it does
	 * not seem to be strictly necessary.
	 */
	nofs_flag = memalloc_nofs_save();
	ipath = init_ipath(4096, local_root, swarn->path);
	memalloc_nofs_restore(nofs_flag);
	if (IS_ERR(ipath)) {
		btrfs_put_root(local_root);
		ret = PTR_ERR(ipath);
		ipath = NULL;
		goto err;
	}
	ret = paths_from_inode(inum, ipath);

	if (ret < 0)
		goto err;

	/*
	 * we deliberately ignore that ipath might have been too small to
	 * hold all of the paths here
	 */
	for (i = 0; i < ipath->fspath->elem_cnt; ++i)
		btrfs_warn_in_rcu(fs_info,
"%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu, length %llu, links %u (path: %s)",
				  swarn->errstr, swarn->logical,
				  rcu_str_deref(swarn->dev->name),
				  swarn->physical,
				  root, inum, offset,
				  min(isize - offset, (u64)PAGE_SIZE), nlink,
				  (char *)(unsigned long)ipath->fspath->val[i]);

	btrfs_put_root(local_root);
	free_ipath(ipath);
	return 0;

err:
	btrfs_warn_in_rcu(fs_info,
			  "%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu: path resolving failed with ret=%d",
			  swarn->errstr, swarn->logical,
			  rcu_str_deref(swarn->dev->name),
			  swarn->physical,
			  root, inum, offset, ret);

	free_ipath(ipath);
	return 0;
}

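/*
 * Print a warning for a corrupted block, resolving the logical address to
 * the owning tree block or to the referencing files where possible.
 */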
static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
{
	struct btrfs_device *dev;
	struct btrfs_fs_info *fs_info;
	struct btrfs_path *path;
	struct btrfs_key found_key;
	struct extent_buffer *eb;
	struct btrfs_extent_item *ei;
	struct scrub_warning swarn;
	unsigned long ptr = 0;
	u64 extent_item_pos;
	u64 flags = 0;
	u64 ref_root;
	u32 item_size;
	u8 ref_level = 0;
	int ret;

	WARN_ON(sblock->page_count < 1);
	dev = sblock->pagev[0]->dev;
	fs_info = sblock->sctx->fs_info;

	path = btrfs_alloc_path();
	if (!path)
		return;

	swarn.physical = sblock->pagev[0]->physical;
	swarn.logical = sblock->pagev[0]->logical;
	swarn.errstr = errstr;
	swarn.dev = NULL;

	ret = extent_from_logical(fs_info, swarn.logical, path, &found_key,
				  &flags);
	if (ret < 0)
		goto out;

	extent_item_pos = swarn.logical - found_key.objectid;
	swarn.extent_item_size = found_key.offset;

	eb = path->nodes[0];
	ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
	item_size = btrfs_item_size_nr(eb, path->slots[0]);

	if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
		do {
			ret = tree_backref_for_extent(&ptr, eb, &found_key, ei,
						      item_size, &ref_root,
						      &ref_level);
			btrfs_warn_in_rcu(fs_info,
"%s at logical %llu on dev %s, physical %llu: metadata %s (level %d) in tree %llu",
				errstr, swarn.logical,
				rcu_str_deref(dev->name),
				swarn.physical,
				ref_level ? "node" : "leaf",
				ret < 0 ? -1 : ref_level,
				ret < 0 ? -1 : ref_root);
		} while (ret != 1);
		btrfs_release_path(path);
	} else {
		btrfs_release_path(path);
		swarn.path = path;
		swarn.dev = dev;
		iterate_extent_inodes(fs_info, found_key.objectid,
				      extent_item_pos, 1,
				      scrub_print_warning_inode, &swarn, false);
	}

out:
	btrfs_free_path(path);
}

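/* Refcounting for scrub_recover; the final put drops the in-flight bio counter and frees the bbio. */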
static inline void scrub_get_recover(struct scrub_recover *recover)
{
	refcount_inc(&recover->refs);
}

static inline void scrub_put_recover(struct btrfs_fs_info *fs_info,
				     struct scrub_recover *recover)
{
	if (refcount_dec_and_test(&recover->refs)) {
		btrfs_bio_counter_dec(fs_info);
		btrfs_put_bbio(recover->bbio);
		kfree(recover);
	}
}

/*
 * scrub_handle_errored_block gets called when either verification of the
 * pages failed or the bio failed to read, e.g. with EIO. In the latter
 * case, this function handles all pages in the bio, even though only one
 * may be bad.
 * The goal of this function is to repair the errored block by using the
 * contents of one of the mirrors.
 */
static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
{
	struct scrub_ctx *sctx = sblock_to_check->sctx;
	struct btrfs_device *dev;
	struct btrfs_fs_info *fs_info;
	u64 logical;
	unsigned int failed_mirror_index;
	unsigned int is_metadata;
	unsigned int have_csum;
	struct scrub_block *sblocks_for_recheck; /* holds one for each mirror */
	struct scrub_block *sblock_bad;
	int ret;
	int mirror_index;
	int page_num;
	int success;
	bool full_stripe_locked;
	unsigned int nofs_flag;
	static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL,
				      DEFAULT_RATELIMIT_BURST);

	BUG_ON(sblock_to_check->page_count < 1);
	fs_info = sctx->fs_info;
	if (sblock_to_check->pagev[0]->flags & BTRFS_EXTENT_FLAG_SUPER) {
		/*
		 * if we find an error in a super block, we just report it.
		 * They will get written with the next transaction commit
		 * anyway
		 */
		spin_lock(&sctx->stat_lock);
		++sctx->stat.super_errors;
		spin_unlock(&sctx->stat_lock);
		return 0;
	}
	logical = sblock_to_check->pagev[0]->logical;
	BUG_ON(sblock_to_check->pagev[0]->mirror_num < 1);
	failed_mirror_index = sblock_to_check->pagev[0]->mirror_num - 1;
	is_metadata = !(sblock_to_check->pagev[0]->flags &
			BTRFS_EXTENT_FLAG_DATA);
	have_csum = sblock_to_check->pagev[0]->have_csum;
	dev = sblock_to_check->pagev[0]->dev;

	if (btrfs_is_zoned(fs_info) && !sctx->is_dev_replace)
		return btrfs_repair_one_zone(fs_info, logical);

	/*
	 * We must use GFP_NOFS because the scrub task might be waiting for a
	 * worker task executing this function and in turn a transaction commit
	 * might be waiting for the scrub task to pause (which needs to wait for
	 * all the worker tasks to complete before pausing).
	 * We do allocations in the workers through insert_full_stripe_lock()
	 * and scrub_add_page_to_wr_bio(), which happens down the call chain of
	 * this function.
	 */
	nofs_flag = memalloc_nofs_save();
	/*
	 * For RAID5/6, a race can happen with a different device scrub thread.
	 * For data corruption, the Parity and Data threads will both try
	 * to recover the data.
	 * Such a race can lead to a doubly added csum error, or even an
	 * unrecoverable error.
	 */
	ret = lock_full_stripe(fs_info, logical, &full_stripe_locked);
	if (ret < 0) {
		memalloc_nofs_restore(nofs_flag);
		spin_lock(&sctx->stat_lock);
		if (ret == -ENOMEM)
			sctx->stat.malloc_errors++;
		sctx->stat.read_errors++;
		sctx->stat.uncorrectable_errors++;
		spin_unlock(&sctx->stat_lock);
		return ret;
	}

	/*
	 * read all mirrors one after the other. This includes re-reading
	 * the extent or metadata block that failed (that was
	 * the cause that this fixup code is called) another time,
	 * page by page this time in order to know which pages
	 * caused I/O errors and which ones are good (for all mirrors).
	 * The goal is to handle the situation when more than one
	 * mirror contains I/O errors, but the errors do not
	 * overlap, i.e. the data can be repaired by selecting the
	 * pages from those mirrors without I/O error on the
	 * particular pages. One example (with blocks >= 2 * PAGE_SIZE)
	 * would be that mirror #1 has an I/O error on the first page,
	 * the second page is good, and mirror #2 has an I/O error on
	 * the second page, but the first page is good.
	 * Then the first page of the first mirror can be repaired by
	 * taking the first page of the second mirror, and the
	 * second page of the second mirror can be repaired by
	 * copying the contents of the 2nd page of the 1st mirror.
	 * One more note: if the pages of one mirror contain I/O
	 * errors, the checksum cannot be verified. In order to get
	 * the best data for repairing, the first attempt is to find
	 * a mirror without I/O errors and with a validated checksum.
	 * Only if this is not possible, the pages are picked from
	 * mirrors with I/O errors without considering the checksum.
	 * If the latter is the case, at the end, the checksum of the
	 * repaired area is verified in order to correctly maintain
	 * the statistics.
	 */

	sblocks_for_recheck = kcalloc(BTRFS_MAX_MIRRORS,
				      sizeof(*sblocks_for_recheck), GFP_KERNEL);
	if (!sblocks_for_recheck) {
		spin_lock(&sctx->stat_lock);
		sctx->stat.malloc_errors++;
		sctx->stat.read_errors++;
		sctx->stat.uncorrectable_errors++;
		spin_unlock(&sctx->stat_lock);
		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
		goto out;
	}

	/* setup the context, map the logical blocks and alloc the pages */
	ret = scrub_setup_recheck_block(sblock_to_check, sblocks_for_recheck);
	if (ret) {
		spin_lock(&sctx->stat_lock);
		sctx->stat.read_errors++;
		sctx->stat.uncorrectable_errors++;
		spin_unlock(&sctx->stat_lock);
		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
		goto out;
	}
	BUG_ON(failed_mirror_index >= BTRFS_MAX_MIRRORS);
	sblock_bad = sblocks_for_recheck + failed_mirror_index;

	/* build and submit the bios for the failed mirror, check checksums */
	scrub_recheck_block(fs_info, sblock_bad, 1);

	if (!sblock_bad->header_error && !sblock_bad->checksum_error &&
	    sblock_bad->no_io_error_seen) {
		/*
		 * the error disappeared after reading page by page, or
		 * the area was part of a huge bio and other parts of the
		 * bio caused I/O errors, or the block layer merged several
		 * read requests into one and the error is caused by a
		 * different bio (usually one of the two latter cases is
		 * the cause)
		 */
		spin_lock(&sctx->stat_lock);
		sctx->stat.unverified_errors++;
		sblock_to_check->data_corrected = 1;
		spin_unlock(&sctx->stat_lock);

		if (sctx->is_dev_replace)
			scrub_write_block_to_dev_replace(sblock_bad);
		goto out;
	}

	if (!sblock_bad->no_io_error_seen) {
		spin_lock(&sctx->stat_lock);
		sctx->stat.read_errors++;
		spin_unlock(&sctx->stat_lock);
		if (__ratelimit(&rs))
			scrub_print_warning("i/o error", sblock_to_check);
		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
	} else if (sblock_bad->checksum_error) {
		spin_lock(&sctx->stat_lock);
		sctx->stat.csum_errors++;
		spin_unlock(&sctx->stat_lock);
		if (__ratelimit(&rs))
			scrub_print_warning("checksum error", sblock_to_check);
		btrfs_dev_stat_inc_and_print(dev,
					     BTRFS_DEV_STAT_CORRUPTION_ERRS);
	} else if (sblock_bad->header_error) {
		spin_lock(&sctx->stat_lock);
		sctx->stat.verify_errors++;
		spin_unlock(&sctx->stat_lock);
		if (__ratelimit(&rs))
			scrub_print_warning("checksum/header error",
					    sblock_to_check);
		if (sblock_bad->generation_error)
			btrfs_dev_stat_inc_and_print(dev,
				BTRFS_DEV_STAT_GENERATION_ERRS);
		else
			btrfs_dev_stat_inc_and_print(dev,
				BTRFS_DEV_STAT_CORRUPTION_ERRS);
	}

	if (sctx->readonly) {
		ASSERT(!sctx->is_dev_replace);
		goto out;
	}

	/*
	 * now build and submit the bios for the other mirrors, check
	 * checksums.
	 * First try to pick the mirror which is completely without I/O
	 * errors and also does not have a checksum error.
	 * If one is found, and if a checksum is present, the full block
	 * that is known to contain an error is rewritten. Afterwards
	 * the block is known to be corrected.
	 * If a mirror is found which is completely correct, and no
	 * checksum is present, only those pages are rewritten that had
	 * an I/O error in the block to be repaired, since it cannot be
	 * determined which copy of the other pages is better (and it
	 * could happen otherwise that a correct page would be
	 * overwritten by a bad one).
	 */
	for (mirror_index = 0; ;mirror_index++) {
		struct scrub_block *sblock_other;

		if (mirror_index == failed_mirror_index)
			continue;

		/* raid56's mirror can be more than BTRFS_MAX_MIRRORS */
		if (!scrub_is_page_on_raid56(sblock_bad->pagev[0])) {
			if (mirror_index >= BTRFS_MAX_MIRRORS)
				break;
			if (!sblocks_for_recheck[mirror_index].page_count)
				break;

			sblock_other = sblocks_for_recheck + mirror_index;
		} else {
			struct scrub_recover *r = sblock_bad->pagev[0]->recover;
			int max_allowed = r->bbio->num_stripes -
						r->bbio->num_tgtdevs;

			if (mirror_index >= max_allowed)
				break;
			if (!sblocks_for_recheck[1].page_count)
				break;

			ASSERT(failed_mirror_index == 0);
			sblock_other = sblocks_for_recheck + 1;
			sblock_other->pagev[0]->mirror_num = 1 + mirror_index;
		}

		/* build and submit the bios, check checksums */
		scrub_recheck_block(fs_info, sblock_other, 0);

		if (!sblock_other->header_error &&
		    !sblock_other->checksum_error &&
		    sblock_other->no_io_error_seen) {
			if (sctx->is_dev_replace) {
				scrub_write_block_to_dev_replace(sblock_other);
				goto corrected_error;
			} else {
				ret = scrub_repair_block_from_good_copy(
						sblock_bad, sblock_other);
				if (!ret)
					goto corrected_error;
			}
		}
	}

1063 if (sblock_bad->no_io_error_seen && !sctx->is_dev_replace)
1064 goto did_not_correct_error;
ff023aac
SB
1065
1066 /*
ff023aac 1067 * In case of I/O errors in the area that is supposed to be
b5d67f64
SB
1068 * repaired, continue by picking good copies of those pages.
1069 * Select the good pages from mirrors to rewrite bad pages from
1070 * the area to fix. Afterwards verify the checksum of the block
1071 * that is supposed to be repaired. This verification step is
1072 * only done for the purpose of statistic counting and for the
1073 * final scrub report, whether errors remain.
1074 * A perfect algorithm could make use of the checksum and try
1075 * all possible combinations of pages from the different mirrors
1076 * until the checksum verification succeeds. For example, when
1077 * the 2nd page of mirror #1 faces I/O errors, and the 2nd page
1078 * of mirror #2 is readable but the final checksum test fails,
1079 * then the 2nd page of mirror #3 could be tried, whether now
01327610 1080 * the final checksum succeeds. But this would be a rare
b5d67f64
SB
1081 * exception and is therefore not implemented. At least it is
1082 * avoided that the good copy is overwritten.
1083 * A more useful improvement would be to pick the sectors
1084 * without I/O error based on sector sizes (512 bytes on legacy
1085 * disks) instead of on PAGE_SIZE. Then maybe 512 byte of one
1086 * mirror could be repaired by taking 512 byte of a different
1087 * mirror, even if other 512 byte sectors in the same PAGE_SIZE
1088 * area are unreadable.
a2de733c 1089 */
b5d67f64 1090 success = 1;
b968fed1
ZL
1091 for (page_num = 0; page_num < sblock_bad->page_count;
1092 page_num++) {
261d2dcb 1093 struct scrub_page *spage_bad = sblock_bad->pagev[page_num];
b968fed1 1094 struct scrub_block *sblock_other = NULL;
b5d67f64 1095
b968fed1 1096 /* skip no-io-error page in scrub */
261d2dcb 1097 if (!spage_bad->io_error && !sctx->is_dev_replace)
a2de733c 1098 continue;
b5d67f64 1099
4759700a
LB
1100 if (scrub_is_page_on_raid56(sblock_bad->pagev[0])) {
1101 /*
1102 * In case of dev replace, if raid56 rebuild process
1103 * didn't work out correct data, then copy the content
1104 * in sblock_bad to make sure target device is identical
1105 * to source device, instead of writing garbage data in
1106 * sblock_for_recheck array to target device.
1107 */
1108 sblock_other = NULL;
261d2dcb 1109 } else if (spage_bad->io_error) {
4759700a 1110 /* try to find no-io-error page in mirrors */
b968fed1
ZL
1111 for (mirror_index = 0;
1112 mirror_index < BTRFS_MAX_MIRRORS &&
1113 sblocks_for_recheck[mirror_index].page_count > 0;
1114 mirror_index++) {
1115 if (!sblocks_for_recheck[mirror_index].
1116 pagev[page_num]->io_error) {
1117 sblock_other = sblocks_for_recheck +
1118 mirror_index;
1119 break;
b5d67f64
SB
1120 }
1121 }
b968fed1
ZL
1122 if (!sblock_other)
1123 success = 0;
96e36920 1124 }
a2de733c 1125
b968fed1
ZL
1126 if (sctx->is_dev_replace) {
1127 /*
1128 * did not find a mirror to fetch the page
1129 * from. scrub_write_page_to_dev_replace()
1130 * handles this case (page->io_error), by
1131 * filling the block with zeros before
1132 * submitting the write request
1133 */
1134 if (!sblock_other)
1135 sblock_other = sblock_bad;
1136
1137 if (scrub_write_page_to_dev_replace(sblock_other,
1138 page_num) != 0) {
e37abe97 1139 atomic64_inc(
0b246afa 1140 &fs_info->dev_replace.num_write_errors);
b968fed1
ZL
1141 success = 0;
1142 }
1143 } else if (sblock_other) {
1144 ret = scrub_repair_page_from_good_copy(sblock_bad,
1145 sblock_other,
1146 page_num, 0);
1147 if (0 == ret)
261d2dcb 1148 spage_bad->io_error = 0;
b968fed1
ZL
1149 else
1150 success = 0;
b5d67f64 1151 }
a2de733c 1152 }
a2de733c 1153
b968fed1 1154 if (success && !sctx->is_dev_replace) {
b5d67f64
SB
1155 if (is_metadata || have_csum) {
1156 /*
1157 * need to verify the checksum now that all
1158 * sectors on disk are repaired (the write
1159 * request for data to be repaired is on its way).
1160 * Just be lazy and use scrub_recheck_block()
1161 * which re-reads the data before the checksum
1162 * is verified, but most likely the data comes out
1163 * of the page cache.
1164 */
affe4a5a 1165 scrub_recheck_block(fs_info, sblock_bad, 1);
34f5c8e9 1166 if (!sblock_bad->header_error &&
b5d67f64
SB
1167 !sblock_bad->checksum_error &&
1168 sblock_bad->no_io_error_seen)
1169 goto corrected_error;
1170 else
1171 goto did_not_correct_error;
1172 } else {
1173corrected_error:
d9d181c1
SB
1174 spin_lock(&sctx->stat_lock);
1175 sctx->stat.corrected_errors++;
5a6ac9ea 1176 sblock_to_check->data_corrected = 1;
d9d181c1 1177 spin_unlock(&sctx->stat_lock);
b14af3b4
DS
1178 btrfs_err_rl_in_rcu(fs_info,
1179 "fixed up error at logical %llu on dev %s",
c1c9ff7c 1180 logical, rcu_str_deref(dev->name));
8628764e 1181 }
b5d67f64
SB
1182 } else {
1183did_not_correct_error:
d9d181c1
SB
1184 spin_lock(&sctx->stat_lock);
1185 sctx->stat.uncorrectable_errors++;
1186 spin_unlock(&sctx->stat_lock);
b14af3b4
DS
1187 btrfs_err_rl_in_rcu(fs_info,
1188 "unable to fixup (regular) error at logical %llu on dev %s",
c1c9ff7c 1189 logical, rcu_str_deref(dev->name));
96e36920 1190 }
a2de733c 1191
b5d67f64
SB
1192out:
1193 if (sblocks_for_recheck) {
1194 for (mirror_index = 0; mirror_index < BTRFS_MAX_MIRRORS;
1195 mirror_index++) {
1196 struct scrub_block *sblock = sblocks_for_recheck +
1197 mirror_index;
af8e2d1d 1198 struct scrub_recover *recover;
b5d67f64
SB
1199 int page_index;
1200
7a9e9987
SB
1201 for (page_index = 0; page_index < sblock->page_count;
1202 page_index++) {
1203 sblock->pagev[page_index]->sblock = NULL;
af8e2d1d
MX
1204 recover = sblock->pagev[page_index]->recover;
1205 if (recover) {
e501bfe3 1206 scrub_put_recover(fs_info, recover);
af8e2d1d
MX
1207 sblock->pagev[page_index]->recover =
1208 NULL;
1209 }
7a9e9987
SB
1210 scrub_page_put(sblock->pagev[page_index]);
1211 }
b5d67f64
SB
1212 }
1213 kfree(sblocks_for_recheck);
1214 }
a2de733c 1215
28d70e23 1216 ret = unlock_full_stripe(fs_info, logical, full_stripe_locked);
7c3c7cb9 1217 memalloc_nofs_restore(nofs_flag);
28d70e23
QW
1218 if (ret < 0)
1219 return ret;
b5d67f64
SB
1220 return 0;
1221}
a2de733c 1222
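/* How many mirrors scrub will try for this mapping: 2 for RAID5, 3 for RAID6, otherwise the stripe count. */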
static inline int scrub_nr_raid_mirrors(struct btrfs_bio *bbio)
{
	if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID5)
		return 2;
	else if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID6)
		return 3;
	else
		return (int)bbio->num_stripes;
}

static inline void scrub_stripe_index_and_offset(u64 logical, u64 map_type,
						 u64 *raid_map,
						 u64 mapped_length,
						 int nstripes, int mirror,
						 int *stripe_index,
						 u64 *stripe_offset)
{
	int i;

	if (map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
		/* RAID5/6 */
		for (i = 0; i < nstripes; i++) {
			if (raid_map[i] == RAID6_Q_STRIPE ||
			    raid_map[i] == RAID5_P_STRIPE)
				continue;

			if (logical >= raid_map[i] &&
			    logical < raid_map[i] + mapped_length)
				break;
		}

		*stripe_index = i;
		*stripe_offset = logical - raid_map[i];
	} else {
		/* The other RAID type */
		*stripe_index = mirror;
		*stripe_offset = 0;
	}
}

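/*
 * Map the original block page by page and build one scrub_block per mirror
 * in sblocks_for_recheck, allocating fresh pages for the re-read.
 */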
static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
				     struct scrub_block *sblocks_for_recheck)
{
	struct scrub_ctx *sctx = original_sblock->sctx;
	struct btrfs_fs_info *fs_info = sctx->fs_info;
	u64 length = original_sblock->page_count * PAGE_SIZE;
	u64 logical = original_sblock->pagev[0]->logical;
	u64 generation = original_sblock->pagev[0]->generation;
	u64 flags = original_sblock->pagev[0]->flags;
	u64 have_csum = original_sblock->pagev[0]->have_csum;
	struct scrub_recover *recover;
	struct btrfs_bio *bbio;
	u64 sublen;
	u64 mapped_length;
	u64 stripe_offset;
	int stripe_index;
	int page_index = 0;
	int mirror_index;
	int nmirrors;
	int ret;

	/*
	 * note: the two members refs and outstanding_pages
	 * are not used (and not set) in the blocks that are used for
	 * the recheck procedure
	 */

	while (length > 0) {
		sublen = min_t(u64, length, PAGE_SIZE);
		mapped_length = sublen;
		bbio = NULL;

		/*
		 * with a length of PAGE_SIZE, each returned stripe
		 * represents one mirror
		 */
		btrfs_bio_counter_inc_blocked(fs_info);
		ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
				logical, &mapped_length, &bbio);
		if (ret || !bbio || mapped_length < sublen) {
			btrfs_put_bbio(bbio);
			btrfs_bio_counter_dec(fs_info);
			return -EIO;
		}

		recover = kzalloc(sizeof(struct scrub_recover), GFP_NOFS);
		if (!recover) {
			btrfs_put_bbio(bbio);
			btrfs_bio_counter_dec(fs_info);
			return -ENOMEM;
		}

		refcount_set(&recover->refs, 1);
		recover->bbio = bbio;
		recover->map_length = mapped_length;

		BUG_ON(page_index >= SCRUB_MAX_PAGES_PER_BLOCK);

		nmirrors = min(scrub_nr_raid_mirrors(bbio), BTRFS_MAX_MIRRORS);

		for (mirror_index = 0; mirror_index < nmirrors;
		     mirror_index++) {
			struct scrub_block *sblock;
			struct scrub_page *spage;

			sblock = sblocks_for_recheck + mirror_index;
			sblock->sctx = sctx;

			spage = kzalloc(sizeof(*spage), GFP_NOFS);
			if (!spage) {
leave_nomem:
				spin_lock(&sctx->stat_lock);
				sctx->stat.malloc_errors++;
				spin_unlock(&sctx->stat_lock);
				scrub_put_recover(fs_info, recover);
				return -ENOMEM;
			}
			scrub_page_get(spage);
			sblock->pagev[page_index] = spage;
			spage->sblock = sblock;
			spage->flags = flags;
			spage->generation = generation;
			spage->logical = logical;
			spage->have_csum = have_csum;
			if (have_csum)
				memcpy(spage->csum,
				       original_sblock->pagev[0]->csum,
				       sctx->fs_info->csum_size);

			scrub_stripe_index_and_offset(logical,
						      bbio->map_type,
						      bbio->raid_map,
						      mapped_length,
						      bbio->num_stripes -
						      bbio->num_tgtdevs,
						      mirror_index,
						      &stripe_index,
						      &stripe_offset);
			spage->physical = bbio->stripes[stripe_index].physical +
					 stripe_offset;
			spage->dev = bbio->stripes[stripe_index].dev;

			BUG_ON(page_index >= original_sblock->page_count);
			spage->physical_for_dev_replace =
				original_sblock->pagev[page_index]->
				physical_for_dev_replace;
			/* for missing devices, dev->bdev is NULL */
			spage->mirror_num = mirror_index + 1;
			sblock->page_count++;
			spage->page = alloc_page(GFP_NOFS);
			if (!spage->page)
				goto leave_nomem;

			scrub_get_recover(recover);
			spage->recover = recover;
		}
		scrub_put_recover(fs_info, recover);
		length -= sublen;
		logical += sublen;
		page_index++;
	}

	return 0;
}

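/* Completion callback and synchronous submit helper for reads that go through the RAID56 recovery path. */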
static void scrub_bio_wait_endio(struct bio *bio)
{
	complete(bio->bi_private);
}

static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info,
					struct bio *bio,
					struct scrub_page *spage)
{
	DECLARE_COMPLETION_ONSTACK(done);
	int ret;
	int mirror_num;

	bio->bi_iter.bi_sector = spage->logical >> 9;
	bio->bi_private = &done;
	bio->bi_end_io = scrub_bio_wait_endio;

	mirror_num = spage->sblock->pagev[0]->mirror_num;
	ret = raid56_parity_recover(fs_info, bio, spage->recover->bbio,
				    spage->recover->map_length,
				    mirror_num, 0);
	if (ret)
		return ret;

	wait_for_completion_io(&done);
	return blk_status_to_errno(bio->bi_status);
}

static void scrub_recheck_block_on_raid56(struct btrfs_fs_info *fs_info,
					  struct scrub_block *sblock)
{
	struct scrub_page *first_page = sblock->pagev[0];
	struct bio *bio;
	int page_num;

	/* All pages in sblock belong to the same stripe on the same device. */
	ASSERT(first_page->dev);
	if (!first_page->dev->bdev)
		goto out;

	bio = btrfs_io_bio_alloc(BIO_MAX_VECS);
	bio_set_dev(bio, first_page->dev->bdev);

	for (page_num = 0; page_num < sblock->page_count; page_num++) {
		struct scrub_page *spage = sblock->pagev[page_num];

		WARN_ON(!spage->page);
		bio_add_page(bio, spage->page, PAGE_SIZE, 0);
	}

	if (scrub_submit_raid56_bio_wait(fs_info, bio, first_page)) {
		bio_put(bio);
		goto out;
	}

	bio_put(bio);

	scrub_recheck_block_checksum(sblock);

	return;
out:
	for (page_num = 0; page_num < sblock->page_count; page_num++)
		sblock->pagev[page_num]->io_error = 1;

	sblock->no_io_error_seen = 0;
}

/*
 * this function will check the on disk data for checksum errors, header
 * errors and read I/O errors. If any I/O errors happen, the exact pages
 * which are errored are marked as being bad. The goal is to enable scrub
 * to take those pages that are not errored from all the mirrors so that
 * the pages that are errored in the just handled mirror can be repaired.
 */
static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
				struct scrub_block *sblock,
				int retry_failed_mirror)
{
	int page_num;

	sblock->no_io_error_seen = 1;

	/* short cut for raid56 */
	if (!retry_failed_mirror && scrub_is_page_on_raid56(sblock->pagev[0]))
		return scrub_recheck_block_on_raid56(fs_info, sblock);

	for (page_num = 0; page_num < sblock->page_count; page_num++) {
		struct bio *bio;
		struct scrub_page *spage = sblock->pagev[page_num];

		if (spage->dev->bdev == NULL) {
			spage->io_error = 1;
			sblock->no_io_error_seen = 0;
			continue;
		}

		WARN_ON(!spage->page);
		bio = btrfs_io_bio_alloc(1);
		bio_set_dev(bio, spage->dev->bdev);

		bio_add_page(bio, spage->page, PAGE_SIZE, 0);
		bio->bi_iter.bi_sector = spage->physical >> 9;
		bio->bi_opf = REQ_OP_READ;

		if (btrfsic_submit_bio_wait(bio)) {
			spage->io_error = 1;
			sblock->no_io_error_seen = 0;
		}

		bio_put(bio);
	}

	if (sblock->no_io_error_seen)
		scrub_recheck_block_checksum(sblock);
}

static inline int scrub_check_fsid(u8 fsid[],
				   struct scrub_page *spage)
{
	struct btrfs_fs_devices *fs_devices = spage->dev->fs_devices;
	int ret;

	ret = memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
	return !ret;
}

static void scrub_recheck_block_checksum(struct scrub_block *sblock)
{
	sblock->header_error = 0;
	sblock->checksum_error = 0;
	sblock->generation_error = 0;

	if (sblock->pagev[0]->flags & BTRFS_EXTENT_FLAG_DATA)
		scrub_checksum_data(sblock);
	else
		scrub_checksum_tree_block(sblock);
}

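/* Rewrite every page of the bad block from the corresponding page of the good copy. */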
static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
					     struct scrub_block *sblock_good)
{
	int page_num;
	int ret = 0;

	for (page_num = 0; page_num < sblock_bad->page_count; page_num++) {
		int ret_sub;

		ret_sub = scrub_repair_page_from_good_copy(sblock_bad,
							   sblock_good,
							   page_num, 1);
		if (ret_sub)
			ret = ret_sub;
	}

	return ret;
}

static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
					    struct scrub_block *sblock_good,
					    int page_num, int force_write)
{
	struct scrub_page *spage_bad = sblock_bad->pagev[page_num];
	struct scrub_page *spage_good = sblock_good->pagev[page_num];
	struct btrfs_fs_info *fs_info = sblock_bad->sctx->fs_info;

	BUG_ON(spage_bad->page == NULL);
	BUG_ON(spage_good->page == NULL);
	if (force_write || sblock_bad->header_error ||
	    sblock_bad->checksum_error || spage_bad->io_error) {
		struct bio *bio;
		int ret;

		if (!spage_bad->dev->bdev) {
			btrfs_warn_rl(fs_info,
				"scrub_repair_page_from_good_copy(bdev == NULL) is unexpected");
			return -EIO;
		}

		bio = btrfs_io_bio_alloc(1);
		bio_set_dev(bio, spage_bad->dev->bdev);
		bio->bi_iter.bi_sector = spage_bad->physical >> 9;
		bio->bi_opf = REQ_OP_WRITE;

		ret = bio_add_page(bio, spage_good->page, PAGE_SIZE, 0);
		if (PAGE_SIZE != ret) {
			bio_put(bio);
			return -EIO;
		}

		if (btrfsic_submit_bio_wait(bio)) {
			btrfs_dev_stat_inc_and_print(spage_bad->dev,
				BTRFS_DEV_STAT_WRITE_ERRS);
			atomic64_inc(&fs_info->dev_replace.num_write_errors);
			bio_put(bio);
			return -EIO;
		}
		bio_put(bio);
	}

	return 0;
}

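/* Queue all pages of the block for writing to the dev-replace target device. */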
static void scrub_write_block_to_dev_replace(struct scrub_block *sblock)
{
	struct btrfs_fs_info *fs_info = sblock->sctx->fs_info;
	int page_num;

	/*
	 * This block is used for the check of the parity on the source device,
	 * so the data needn't be written into the destination device.
	 */
	if (sblock->sparity)
		return;

	for (page_num = 0; page_num < sblock->page_count; page_num++) {
		int ret;

		ret = scrub_write_page_to_dev_replace(sblock, page_num);
		if (ret)
			atomic64_inc(&fs_info->dev_replace.num_write_errors);
	}
}

static int scrub_write_page_to_dev_replace(struct scrub_block *sblock,
					   int page_num)
{
	struct scrub_page *spage = sblock->pagev[page_num];

	BUG_ON(spage->page == NULL);
	if (spage->io_error)
		clear_page(page_address(spage->page));

	return scrub_add_page_to_wr_bio(sblock->sctx, spage);
}

static int fill_writer_pointer_gap(struct scrub_ctx *sctx, u64 physical)
{
	int ret = 0;
	u64 length;

	if (!btrfs_is_zoned(sctx->fs_info))
		return 0;

	if (!btrfs_dev_is_sequential(sctx->wr_tgtdev, physical))
		return 0;

	if (sctx->write_pointer < physical) {
		length = physical - sctx->write_pointer;

		ret = btrfs_zoned_issue_zeroout(sctx->wr_tgtdev,
						sctx->write_pointer, length);
		if (!ret)
			sctx->write_pointer = physical;
	}
	return ret;
}

ff023aac
SB
1645static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
1646 struct scrub_page *spage)
1647{
ff023aac
SB
1648 struct scrub_bio *sbio;
1649 int ret;
1650
3fb99303 1651 mutex_lock(&sctx->wr_lock);
ff023aac 1652again:
3fb99303
DS
1653 if (!sctx->wr_curr_bio) {
1654 sctx->wr_curr_bio = kzalloc(sizeof(*sctx->wr_curr_bio),
58c4e173 1655 GFP_KERNEL);
3fb99303
DS
1656 if (!sctx->wr_curr_bio) {
1657 mutex_unlock(&sctx->wr_lock);
ff023aac
SB
1658 return -ENOMEM;
1659 }
3fb99303
DS
1660 sctx->wr_curr_bio->sctx = sctx;
1661 sctx->wr_curr_bio->page_count = 0;
ff023aac 1662 }
3fb99303 1663 sbio = sctx->wr_curr_bio;
ff023aac
SB
1664 if (sbio->page_count == 0) {
1665 struct bio *bio;
1666
de17addc
NA
1667 ret = fill_writer_pointer_gap(sctx,
1668 spage->physical_for_dev_replace);
1669 if (ret) {
1670 mutex_unlock(&sctx->wr_lock);
1671 return ret;
1672 }
1673
ff023aac
SB
1674 sbio->physical = spage->physical_for_dev_replace;
1675 sbio->logical = spage->logical;
3fb99303 1676 sbio->dev = sctx->wr_tgtdev;
ff023aac
SB
1677 bio = sbio->bio;
1678 if (!bio) {
c5e4c3d7 1679 bio = btrfs_io_bio_alloc(sctx->pages_per_wr_bio);
ff023aac
SB
1680 sbio->bio = bio;
1681 }
1682
1683 bio->bi_private = sbio;
1684 bio->bi_end_io = scrub_wr_bio_end_io;
74d46992 1685 bio_set_dev(bio, sbio->dev->bdev);
4f024f37 1686 bio->bi_iter.bi_sector = sbio->physical >> 9;
ebcc3263 1687 bio->bi_opf = REQ_OP_WRITE;
4e4cbee9 1688 sbio->status = 0;
ff023aac
SB
1689 } else if (sbio->physical + sbio->page_count * PAGE_SIZE !=
1690 spage->physical_for_dev_replace ||
1691 sbio->logical + sbio->page_count * PAGE_SIZE !=
1692 spage->logical) {
1693 scrub_wr_submit(sctx);
1694 goto again;
1695 }
1696
1697 ret = bio_add_page(sbio->bio, spage->page, PAGE_SIZE, 0);
1698 if (ret != PAGE_SIZE) {
1699 if (sbio->page_count < 1) {
1700 bio_put(sbio->bio);
1701 sbio->bio = NULL;
3fb99303 1702 mutex_unlock(&sctx->wr_lock);
ff023aac
SB
1703 return -EIO;
1704 }
1705 scrub_wr_submit(sctx);
1706 goto again;
1707 }
1708
1709 sbio->pagev[sbio->page_count] = spage;
1710 scrub_page_get(spage);
1711 sbio->page_count++;
3fb99303 1712 if (sbio->page_count == sctx->pages_per_wr_bio)
ff023aac 1713 scrub_wr_submit(sctx);
3fb99303 1714 mutex_unlock(&sctx->wr_lock);
ff023aac
SB
1715
1716 return 0;
1717}
1718
1719static void scrub_wr_submit(struct scrub_ctx *sctx)
1720{
ff023aac
SB
1721 struct scrub_bio *sbio;
1722
3fb99303 1723 if (!sctx->wr_curr_bio)
ff023aac
SB
1724 return;
1725
3fb99303
DS
1726 sbio = sctx->wr_curr_bio;
1727 sctx->wr_curr_bio = NULL;
309dca30 1728 WARN_ON(!sbio->bio->bi_bdev);
ff023aac
SB
1729 scrub_pending_bio_inc(sctx);
1730 /* Process all writes in a single worker thread. The block layer then
1731 * orders the requests before sending them to the driver, which
1732 * doubled the write performance on spinning disks when measured
1733 * with Linux 3.5. */
4e49ea4a 1734 btrfsic_submit_bio(sbio->bio);
de17addc
NA
1735
1736 if (btrfs_is_zoned(sctx->fs_info))
1737 sctx->write_pointer = sbio->physical + sbio->page_count * PAGE_SIZE;
ff023aac
SB
1738}
1739
4246a0b6 1740static void scrub_wr_bio_end_io(struct bio *bio)
ff023aac
SB
1741{
1742 struct scrub_bio *sbio = bio->bi_private;
fb456252 1743 struct btrfs_fs_info *fs_info = sbio->dev->fs_info;
ff023aac 1744
4e4cbee9 1745 sbio->status = bio->bi_status;
ff023aac
SB
1746 sbio->bio = bio;
1747
a0cac0ec 1748 btrfs_init_work(&sbio->work, scrub_wr_bio_end_io_worker, NULL, NULL);
0339ef2f 1749 btrfs_queue_work(fs_info->scrub_wr_completion_workers, &sbio->work);
ff023aac
SB
1750}
1751
1752static void scrub_wr_bio_end_io_worker(struct btrfs_work *work)
1753{
1754 struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
1755 struct scrub_ctx *sctx = sbio->sctx;
1756 int i;
1757
1758 WARN_ON(sbio->page_count > SCRUB_PAGES_PER_WR_BIO);
4e4cbee9 1759 if (sbio->status) {
ff023aac 1760 struct btrfs_dev_replace *dev_replace =
fb456252 1761 &sbio->sctx->fs_info->dev_replace;
ff023aac
SB
1762
1763 for (i = 0; i < sbio->page_count; i++) {
1764 struct scrub_page *spage = sbio->pagev[i];
1765
1766 spage->io_error = 1;
e37abe97 1767 atomic64_inc(&dev_replace->num_write_errors);
ff023aac
SB
1768 }
1769 }
1770
1771 for (i = 0; i < sbio->page_count; i++)
1772 scrub_page_put(sbio->pagev[i]);
1773
1774 bio_put(sbio->bio);
1775 kfree(sbio);
1776 scrub_pending_bio_dec(sctx);
1777}
1778
1779static int scrub_checksum(struct scrub_block *sblock)
b5d67f64
SB
1780{
1781 u64 flags;
1782 int ret;
1783
ba7cf988
ZL
1784 /*
1785 * No need to initialize these stats currently,
1786 * because this function only uses the return value
1787 * instead of these stats values.
1788 *
1789 * Todo:
1790 * always use stats
1791 */
1792 sblock->header_error = 0;
1793 sblock->generation_error = 0;
1794 sblock->checksum_error = 0;
1795
7a9e9987
SB
1796 WARN_ON(sblock->page_count < 1);
1797 flags = sblock->pagev[0]->flags;
b5d67f64
SB
1798 ret = 0;
1799 if (flags & BTRFS_EXTENT_FLAG_DATA)
1800 ret = scrub_checksum_data(sblock);
1801 else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
1802 ret = scrub_checksum_tree_block(sblock);
1803 else if (flags & BTRFS_EXTENT_FLAG_SUPER)
1804 (void)scrub_checksum_super(sblock);
1805 else
1806 WARN_ON(1);
1807 if (ret)
1808 scrub_handle_errored_block(sblock);
ff023aac
SB
1809
1810 return ret;
a2de733c
AJ
1811}
1812
b5d67f64 1813static int scrub_checksum_data(struct scrub_block *sblock)
a2de733c 1814{
d9d181c1 1815 struct scrub_ctx *sctx = sblock->sctx;
d5178578
JT
1816 struct btrfs_fs_info *fs_info = sctx->fs_info;
1817 SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
a2de733c 1818 u8 csum[BTRFS_CSUM_SIZE];
d41ebef2 1819 struct scrub_page *spage;
b0485252 1820 char *kaddr;
a2de733c 1821
b5d67f64 1822 BUG_ON(sblock->page_count < 1);
d41ebef2
DS
1823 spage = sblock->pagev[0];
1824 if (!spage->have_csum)
a2de733c
AJ
1825 return 0;
1826
d41ebef2 1827 kaddr = page_address(spage->page);
b5d67f64 1828
771aba0d
DS
1829 shash->tfm = fs_info->csum_shash;
1830 crypto_shash_init(shash);
b5d67f64 1831
b29dca44
QW
1832 /*
1833 * In scrub_pages() and scrub_pages_for_parity() we ensure each spage
1834 * only contains one sector of data.
1835 */
1836 crypto_shash_digest(shash, kaddr, fs_info->sectorsize, csum);
a2de733c 1837
b29dca44
QW
1838 if (memcmp(csum, spage->csum, fs_info->csum_size))
1839 sblock->checksum_error = 1;
ba7cf988 1840 return sblock->checksum_error;
a2de733c
AJ
1841}
1842
b5d67f64 1843static int scrub_checksum_tree_block(struct scrub_block *sblock)
a2de733c 1844{
d9d181c1 1845 struct scrub_ctx *sctx = sblock->sctx;
a2de733c 1846 struct btrfs_header *h;
0b246afa 1847 struct btrfs_fs_info *fs_info = sctx->fs_info;
d5178578 1848 SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
b5d67f64
SB
1849 u8 calculated_csum[BTRFS_CSUM_SIZE];
1850 u8 on_disk_csum[BTRFS_CSUM_SIZE];
53f3251d
QW
1851 /*
1852 * This is done in sectorsize steps even for metadata as there's a
1853 * constraint for nodesize to be aligned to sectorsize. This will need
1854 * to change so we don't misuse data and metadata units like that.
1855 */
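/*
 * For example, with the default 16K nodesize and a 4K sectorsize,
 * num_sectors is 4: the first sector is hashed starting just past the
 * embedded csum, and the remaining three sectors are hashed in full
 * before the digest is compared against the on-disk csum.
 */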
1856 const u32 sectorsize = sctx->fs_info->sectorsize;
1857 const int num_sectors = fs_info->nodesize >> fs_info->sectorsize_bits;
521e1022 1858 int i;
100aa5d9 1859 struct scrub_page *spage;
b0485252 1860 char *kaddr;
d5178578 1861
b5d67f64 1862 BUG_ON(sblock->page_count < 1);
53f3251d
QW
1863
1864 /* Each member in pagev is just one block, not a full page */
1865 ASSERT(sblock->page_count == num_sectors);
1866
100aa5d9
DS
1867 spage = sblock->pagev[0];
1868 kaddr = page_address(spage->page);
b0485252 1869 h = (struct btrfs_header *)kaddr;
2ae0c2d8 1870 memcpy(on_disk_csum, h->csum, sctx->fs_info->csum_size);
a2de733c
AJ
1871
1872 /*
1873 * we don't use the getter functions here, as we
1874 * a) don't have an extent buffer and
1875 * b) the page is already kmapped
1876 */
100aa5d9 1877 if (spage->logical != btrfs_stack_header_bytenr(h))
ba7cf988 1878 sblock->header_error = 1;
a2de733c 1879
100aa5d9 1880 if (spage->generation != btrfs_stack_header_generation(h)) {
ba7cf988
ZL
1881 sblock->header_error = 1;
1882 sblock->generation_error = 1;
1883 }
a2de733c 1884
100aa5d9 1885 if (!scrub_check_fsid(h->fsid, spage))
ba7cf988 1886 sblock->header_error = 1;
a2de733c
AJ
1887
1888 if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
1889 BTRFS_UUID_SIZE))
ba7cf988 1890 sblock->header_error = 1;
a2de733c 1891
521e1022
DS
1892 shash->tfm = fs_info->csum_shash;
1893 crypto_shash_init(shash);
1894 crypto_shash_update(shash, kaddr + BTRFS_CSUM_SIZE,
53f3251d 1895 sectorsize - BTRFS_CSUM_SIZE);
b5d67f64 1896
53f3251d 1897 for (i = 1; i < num_sectors; i++) {
521e1022 1898 kaddr = page_address(sblock->pagev[i]->page);
53f3251d 1899 crypto_shash_update(shash, kaddr, sectorsize);
b5d67f64
SB
1900 }
1901
d5178578 1902 crypto_shash_final(shash, calculated_csum);
2ae0c2d8 1903 if (memcmp(calculated_csum, on_disk_csum, sctx->fs_info->csum_size))
ba7cf988 1904 sblock->checksum_error = 1;
a2de733c 1905
ba7cf988 1906 return sblock->header_error || sblock->checksum_error;
a2de733c
AJ
1907}
1908
b5d67f64 1909static int scrub_checksum_super(struct scrub_block *sblock)
a2de733c
AJ
1910{
1911 struct btrfs_super_block *s;
d9d181c1 1912 struct scrub_ctx *sctx = sblock->sctx;
d5178578
JT
1913 struct btrfs_fs_info *fs_info = sctx->fs_info;
1914 SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
b5d67f64 1915 u8 calculated_csum[BTRFS_CSUM_SIZE];
c7460541 1916 struct scrub_page *spage;
b0485252 1917 char *kaddr;
442a4f63
SB
1918 int fail_gen = 0;
1919 int fail_cor = 0;
d5178578 1920
b5d67f64 1921 BUG_ON(sblock->page_count < 1);
c7460541
DS
1922 spage = sblock->pagev[0];
1923 kaddr = page_address(spage->page);
b0485252 1924 s = (struct btrfs_super_block *)kaddr;
a2de733c 1925
c7460541 1926 if (spage->logical != btrfs_super_bytenr(s))
442a4f63 1927 ++fail_cor;
a2de733c 1928
c7460541 1929 if (spage->generation != btrfs_super_generation(s))
442a4f63 1930 ++fail_gen;
a2de733c 1931
c7460541 1932 if (!scrub_check_fsid(s->fsid, spage))
442a4f63 1933 ++fail_cor;
a2de733c 1934
83cf6d5e
DS
1935 shash->tfm = fs_info->csum_shash;
1936 crypto_shash_init(shash);
1937 crypto_shash_digest(shash, kaddr + BTRFS_CSUM_SIZE,
1938 BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE, calculated_csum);
b5d67f64 1939
2ae0c2d8 1940 if (memcmp(calculated_csum, s->csum, sctx->fs_info->csum_size))
442a4f63 1941 ++fail_cor;
a2de733c 1942
442a4f63 1943 if (fail_cor + fail_gen) {
a2de733c
AJ
1944 /*
1945 * If we find an error in a super block, we just report it.
1946 * It will get rewritten with the next transaction commit
1947 * anyway.
1948 */
d9d181c1
SB
1949 spin_lock(&sctx->stat_lock);
1950 ++sctx->stat.super_errors;
1951 spin_unlock(&sctx->stat_lock);
442a4f63 1952 if (fail_cor)
c7460541 1953 btrfs_dev_stat_inc_and_print(spage->dev,
442a4f63
SB
1954 BTRFS_DEV_STAT_CORRUPTION_ERRS);
1955 else
c7460541 1956 btrfs_dev_stat_inc_and_print(spage->dev,
442a4f63 1957 BTRFS_DEV_STAT_GENERATION_ERRS);
a2de733c
AJ
1958 }
1959
442a4f63 1960 return fail_cor + fail_gen;
a2de733c
AJ
1961}
1962
b5d67f64
SB
1963static void scrub_block_get(struct scrub_block *sblock)
1964{
186debd6 1965 refcount_inc(&sblock->refs);
b5d67f64
SB
1966}
1967
1968static void scrub_block_put(struct scrub_block *sblock)
1969{
186debd6 1970 if (refcount_dec_and_test(&sblock->refs)) {
b5d67f64
SB
1971 int i;
1972
5a6ac9ea
MX
1973 if (sblock->sparity)
1974 scrub_parity_put(sblock->sparity);
1975
b5d67f64 1976 for (i = 0; i < sblock->page_count; i++)
7a9e9987 1977 scrub_page_put(sblock->pagev[i]);
b5d67f64
SB
1978 kfree(sblock);
1979 }
1980}
1981
7a9e9987
SB
1982static void scrub_page_get(struct scrub_page *spage)
1983{
57019345 1984 atomic_inc(&spage->refs);
7a9e9987
SB
1985}
1986
1987static void scrub_page_put(struct scrub_page *spage)
1988{
57019345 1989 if (atomic_dec_and_test(&spage->refs)) {
7a9e9987
SB
1990 if (spage->page)
1991 __free_page(spage->page);
1992 kfree(spage);
1993 }
1994}
1995
eb3b5053
DS
1996/*
1997 * Throttling of IO submission, bandwidth-limit based; the timeslice is 1
1998 * second. The limit can be set via /sys/fs/btrfs/UUID/devinfo/devid/scrub_speed_max.
1999 */
2000static void scrub_throttle(struct scrub_ctx *sctx)
2001{
2002 const int time_slice = 1000;
2003 struct scrub_bio *sbio;
2004 struct btrfs_device *device;
2005 s64 delta;
2006 ktime_t now;
2007 u32 div;
2008 u64 bwlimit;
2009
2010 sbio = sctx->bios[sctx->curr];
2011 device = sbio->dev;
2012 bwlimit = READ_ONCE(device->scrub_speed_max);
2013 if (bwlimit == 0)
2014 return;
2015
2016 /*
2017 * The slice is divided into intervals when the IO is submitted; the number
2018 * of intervals is derived from bwlimit and capped at 64.
2019 */
2020 div = max_t(u32, 1, (u32)(bwlimit / (16 * 1024 * 1024)));
2021 div = min_t(u32, 64, div);
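/*
 * For example, with scrub_speed_max set to 100MiB/s this yields div = 6:
 * each interval lasts 1000/6 ms and may carry up to bwlimit/6 (roughly
 * 17MiB) of bio payload before the submitter sleeps out the remainder of
 * the interval.
 */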
2022
2023 /* Start new epoch, set deadline */
2024 now = ktime_get();
2025 if (sctx->throttle_deadline == 0) {
2026 sctx->throttle_deadline = ktime_add_ms(now, time_slice / div);
2027 sctx->throttle_sent = 0;
2028 }
2029
2030 /* Still in the time to send? */
2031 if (ktime_before(now, sctx->throttle_deadline)) {
2032 /* If current bio is within the limit, send it */
2033 sctx->throttle_sent += sbio->bio->bi_iter.bi_size;
2034 if (sctx->throttle_sent <= div_u64(bwlimit, div))
2035 return;
2036
2037 /* We're over the limit, sleep for the rest of the slice */
2038 delta = ktime_ms_delta(sctx->throttle_deadline, now);
2039 } else {
2040 /* New request after deadline, start new epoch */
2041 delta = 0;
2042 }
2043
2044 if (delta) {
2045 long timeout;
2046
2047 timeout = div_u64(delta * HZ, 1000);
2048 schedule_timeout_interruptible(timeout);
2049 }
2050
2051 /* Next call will start the deadline period */
2052 sctx->throttle_deadline = 0;
2053}
2054
d9d181c1 2055static void scrub_submit(struct scrub_ctx *sctx)
a2de733c
AJ
2056{
2057 struct scrub_bio *sbio;
2058
d9d181c1 2059 if (sctx->curr == -1)
1623edeb 2060 return;
a2de733c 2061
eb3b5053
DS
2062 scrub_throttle(sctx);
2063
d9d181c1
SB
2064 sbio = sctx->bios[sctx->curr];
2065 sctx->curr = -1;
b6bfebc1 2066 scrub_pending_bio_inc(sctx);
4e49ea4a 2067 btrfsic_submit_bio(sbio->bio);
a2de733c
AJ
2068}
2069
ff023aac
SB
2070static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx,
2071 struct scrub_page *spage)
a2de733c 2072{
b5d67f64 2073 struct scrub_block *sblock = spage->sblock;
a2de733c 2074 struct scrub_bio *sbio;
69f4cb52 2075 int ret;
a2de733c
AJ
2076
2077again:
2078 /*
2079 * grab a fresh bio or wait for one to become available
2080 */
d9d181c1
SB
2081 while (sctx->curr == -1) {
2082 spin_lock(&sctx->list_lock);
2083 sctx->curr = sctx->first_free;
2084 if (sctx->curr != -1) {
2085 sctx->first_free = sctx->bios[sctx->curr]->next_free;
2086 sctx->bios[sctx->curr]->next_free = -1;
2087 sctx->bios[sctx->curr]->page_count = 0;
2088 spin_unlock(&sctx->list_lock);
a2de733c 2089 } else {
d9d181c1
SB
2090 spin_unlock(&sctx->list_lock);
2091 wait_event(sctx->list_wait, sctx->first_free != -1);
a2de733c
AJ
2092 }
2093 }
d9d181c1 2094 sbio = sctx->bios[sctx->curr];
b5d67f64 2095 if (sbio->page_count == 0) {
69f4cb52
AJ
2096 struct bio *bio;
2097
b5d67f64
SB
2098 sbio->physical = spage->physical;
2099 sbio->logical = spage->logical;
a36cf8b8 2100 sbio->dev = spage->dev;
b5d67f64
SB
2101 bio = sbio->bio;
2102 if (!bio) {
c5e4c3d7 2103 bio = btrfs_io_bio_alloc(sctx->pages_per_rd_bio);
b5d67f64
SB
2104 sbio->bio = bio;
2105 }
69f4cb52
AJ
2106
2107 bio->bi_private = sbio;
2108 bio->bi_end_io = scrub_bio_end_io;
74d46992 2109 bio_set_dev(bio, sbio->dev->bdev);
4f024f37 2110 bio->bi_iter.bi_sector = sbio->physical >> 9;
ebcc3263 2111 bio->bi_opf = REQ_OP_READ;
4e4cbee9 2112 sbio->status = 0;
b5d67f64
SB
2113 } else if (sbio->physical + sbio->page_count * PAGE_SIZE !=
2114 spage->physical ||
2115 sbio->logical + sbio->page_count * PAGE_SIZE !=
a36cf8b8
SB
2116 spage->logical ||
2117 sbio->dev != spage->dev) {
d9d181c1 2118 scrub_submit(sctx);
a2de733c
AJ
2119 goto again;
2120 }
69f4cb52 2121
b5d67f64
SB
2122 sbio->pagev[sbio->page_count] = spage;
2123 ret = bio_add_page(sbio->bio, spage->page, PAGE_SIZE, 0);
2124 if (ret != PAGE_SIZE) {
2125 if (sbio->page_count < 1) {
2126 bio_put(sbio->bio);
2127 sbio->bio = NULL;
2128 return -EIO;
2129 }
d9d181c1 2130 scrub_submit(sctx);
69f4cb52
AJ
2131 goto again;
2132 }
2133
ff023aac 2134 scrub_block_get(sblock); /* one for the page added to the bio */
b5d67f64
SB
2135 atomic_inc(&sblock->outstanding_pages);
2136 sbio->page_count++;
ff023aac 2137 if (sbio->page_count == sctx->pages_per_rd_bio)
d9d181c1 2138 scrub_submit(sctx);
b5d67f64
SB
2139
2140 return 0;
2141}
2142
22365979 2143static void scrub_missing_raid56_end_io(struct bio *bio)
73ff61db
OS
2144{
2145 struct scrub_block *sblock = bio->bi_private;
fb456252 2146 struct btrfs_fs_info *fs_info = sblock->sctx->fs_info;
73ff61db 2147
4e4cbee9 2148 if (bio->bi_status)
73ff61db
OS
2149 sblock->no_io_error_seen = 0;
2150
4673272f
ST
2151 bio_put(bio);
2152
73ff61db
OS
2153 btrfs_queue_work(fs_info->scrub_workers, &sblock->work);
2154}
2155
2156static void scrub_missing_raid56_worker(struct btrfs_work *work)
2157{
2158 struct scrub_block *sblock = container_of(work, struct scrub_block, work);
2159 struct scrub_ctx *sctx = sblock->sctx;
0b246afa 2160 struct btrfs_fs_info *fs_info = sctx->fs_info;
73ff61db
OS
2161 u64 logical;
2162 struct btrfs_device *dev;
2163
73ff61db
OS
2164 logical = sblock->pagev[0]->logical;
2165 dev = sblock->pagev[0]->dev;
2166
affe4a5a 2167 if (sblock->no_io_error_seen)
ba7cf988 2168 scrub_recheck_block_checksum(sblock);
73ff61db
OS
2169
2170 if (!sblock->no_io_error_seen) {
2171 spin_lock(&sctx->stat_lock);
2172 sctx->stat.read_errors++;
2173 spin_unlock(&sctx->stat_lock);
0b246afa 2174 btrfs_err_rl_in_rcu(fs_info,
b14af3b4 2175 "IO error rebuilding logical %llu for dev %s",
73ff61db
OS
2176 logical, rcu_str_deref(dev->name));
2177 } else if (sblock->header_error || sblock->checksum_error) {
2178 spin_lock(&sctx->stat_lock);
2179 sctx->stat.uncorrectable_errors++;
2180 spin_unlock(&sctx->stat_lock);
0b246afa 2181 btrfs_err_rl_in_rcu(fs_info,
b14af3b4 2182 "failed to rebuild valid logical %llu for dev %s",
73ff61db
OS
2183 logical, rcu_str_deref(dev->name));
2184 } else {
2185 scrub_write_block_to_dev_replace(sblock);
2186 }
2187
2073c4c2 2188 if (sctx->is_dev_replace && sctx->flush_all_writes) {
3fb99303 2189 mutex_lock(&sctx->wr_lock);
73ff61db 2190 scrub_wr_submit(sctx);
3fb99303 2191 mutex_unlock(&sctx->wr_lock);
73ff61db
OS
2192 }
2193
57d4f0b8 2194 scrub_block_put(sblock);
73ff61db
OS
2195 scrub_pending_bio_dec(sctx);
2196}
2197
2198static void scrub_missing_raid56_pages(struct scrub_block *sblock)
2199{
2200 struct scrub_ctx *sctx = sblock->sctx;
fb456252 2201 struct btrfs_fs_info *fs_info = sctx->fs_info;
73ff61db
OS
2202 u64 length = sblock->page_count * PAGE_SIZE;
2203 u64 logical = sblock->pagev[0]->logical;
f1fee653 2204 struct btrfs_bio *bbio = NULL;
73ff61db
OS
2205 struct bio *bio;
2206 struct btrfs_raid_bio *rbio;
2207 int ret;
2208 int i;
2209
ae6529c3 2210 btrfs_bio_counter_inc_blocked(fs_info);
cf8cddd3 2211 ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical,
825ad4c9 2212 &length, &bbio);
73ff61db
OS
2213 if (ret || !bbio || !bbio->raid_map)
2214 goto bbio_out;
2215
2216 if (WARN_ON(!sctx->is_dev_replace ||
2217 !(bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK))) {
2218 /*
2219 * We shouldn't be scrubbing a missing device. Even for dev
2220 * replace, we should only get here for RAID 5/6. We either
2221 * managed to mount something with no mirrors remaining or
2222 * there's a bug in scrub_remap_extent()/btrfs_map_block().
2223 */
2224 goto bbio_out;
2225 }
2226
c5e4c3d7 2227 bio = btrfs_io_bio_alloc(0);
73ff61db
OS
2228 bio->bi_iter.bi_sector = logical >> 9;
2229 bio->bi_private = sblock;
2230 bio->bi_end_io = scrub_missing_raid56_end_io;
2231
2ff7e61e 2232 rbio = raid56_alloc_missing_rbio(fs_info, bio, bbio, length);
73ff61db
OS
2233 if (!rbio)
2234 goto rbio_out;
2235
2236 for (i = 0; i < sblock->page_count; i++) {
2237 struct scrub_page *spage = sblock->pagev[i];
2238
2239 raid56_add_scrub_pages(rbio, spage->page, spage->logical);
2240 }
2241
a0cac0ec 2242 btrfs_init_work(&sblock->work, scrub_missing_raid56_worker, NULL, NULL);
73ff61db
OS
2243 scrub_block_get(sblock);
2244 scrub_pending_bio_inc(sctx);
2245 raid56_submit_missing_rbio(rbio);
2246 return;
2247
2248rbio_out:
2249 bio_put(bio);
2250bbio_out:
ae6529c3 2251 btrfs_bio_counter_dec(fs_info);
73ff61db
OS
2252 btrfs_put_bbio(bbio);
2253 spin_lock(&sctx->stat_lock);
2254 sctx->stat.malloc_errors++;
2255 spin_unlock(&sctx->stat_lock);
2256}
2257
fa485d21 2258static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u32 len,
a36cf8b8 2259 u64 physical, struct btrfs_device *dev, u64 flags,
96e63a45 2260 u64 gen, int mirror_num, u8 *csum,
ff023aac 2261 u64 physical_for_dev_replace)
b5d67f64
SB
2262{
2263 struct scrub_block *sblock;
d0a7a9c0 2264 const u32 sectorsize = sctx->fs_info->sectorsize;
b5d67f64
SB
2265 int index;
2266
58c4e173 2267 sblock = kzalloc(sizeof(*sblock), GFP_KERNEL);
b5d67f64 2268 if (!sblock) {
d9d181c1
SB
2269 spin_lock(&sctx->stat_lock);
2270 sctx->stat.malloc_errors++;
2271 spin_unlock(&sctx->stat_lock);
b5d67f64 2272 return -ENOMEM;
a2de733c 2273 }
b5d67f64 2274
7a9e9987
SB
2275 /* one ref inside this function, plus one for each page added to
2276 * a bio later on */
186debd6 2277 refcount_set(&sblock->refs, 1);
d9d181c1 2278 sblock->sctx = sctx;
b5d67f64
SB
2279 sblock->no_io_error_seen = 1;
2280
2281 for (index = 0; len > 0; index++) {
7a9e9987 2282 struct scrub_page *spage;
d0a7a9c0
QW
2283 /*
2284 * Here we will allocate one page for one sector to scrub.
2285 * This is fine if PAGE_SIZE == sectorsize, but will cost
2286 * more memory in the PAGE_SIZE > sectorsize case.
2287 */
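/*
 * E.g. on a kernel built with 64K pages and a filesystem using 4K
 * sectors, every 4K sector gets its own 64K page here, a 16x memory
 * overhead compared to packing sectors into pages.
 */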
2288 u32 l = min(sectorsize, len);
b5d67f64 2289
58c4e173 2290 spage = kzalloc(sizeof(*spage), GFP_KERNEL);
7a9e9987
SB
2291 if (!spage) {
2292leave_nomem:
d9d181c1
SB
2293 spin_lock(&sctx->stat_lock);
2294 sctx->stat.malloc_errors++;
2295 spin_unlock(&sctx->stat_lock);
7a9e9987 2296 scrub_block_put(sblock);
b5d67f64
SB
2297 return -ENOMEM;
2298 }
7a9e9987
SB
2299 BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK);
2300 scrub_page_get(spage);
2301 sblock->pagev[index] = spage;
b5d67f64 2302 spage->sblock = sblock;
a36cf8b8 2303 spage->dev = dev;
b5d67f64
SB
2304 spage->flags = flags;
2305 spage->generation = gen;
2306 spage->logical = logical;
2307 spage->physical = physical;
ff023aac 2308 spage->physical_for_dev_replace = physical_for_dev_replace;
b5d67f64
SB
2309 spage->mirror_num = mirror_num;
2310 if (csum) {
2311 spage->have_csum = 1;
2ae0c2d8 2312 memcpy(spage->csum, csum, sctx->fs_info->csum_size);
b5d67f64
SB
2313 } else {
2314 spage->have_csum = 0;
2315 }
2316 sblock->page_count++;
58c4e173 2317 spage->page = alloc_page(GFP_KERNEL);
7a9e9987
SB
2318 if (!spage->page)
2319 goto leave_nomem;
b5d67f64
SB
2320 len -= l;
2321 logical += l;
2322 physical += l;
ff023aac 2323 physical_for_dev_replace += l;
b5d67f64
SB
2324 }
2325
7a9e9987 2326 WARN_ON(sblock->page_count == 0);
e6e674bd 2327 if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state)) {
73ff61db
OS
2328 /*
2329 * This case should only be hit for RAID 5/6 device replace. See
2330 * the comment in scrub_missing_raid56_pages() for details.
2331 */
2332 scrub_missing_raid56_pages(sblock);
2333 } else {
2334 for (index = 0; index < sblock->page_count; index++) {
2335 struct scrub_page *spage = sblock->pagev[index];
2336 int ret;
1bc87793 2337
73ff61db
OS
2338 ret = scrub_add_page_to_rd_bio(sctx, spage);
2339 if (ret) {
2340 scrub_block_put(sblock);
2341 return ret;
2342 }
b5d67f64 2343 }
a2de733c 2344
96e63a45 2345 if (flags & BTRFS_EXTENT_FLAG_SUPER)
73ff61db
OS
2346 scrub_submit(sctx);
2347 }
a2de733c 2348
b5d67f64
SB
2349 /* last one frees, either here or in bio completion for last page */
2350 scrub_block_put(sblock);
a2de733c
AJ
2351 return 0;
2352}
2353
4246a0b6 2354static void scrub_bio_end_io(struct bio *bio)
b5d67f64
SB
2355{
2356 struct scrub_bio *sbio = bio->bi_private;
fb456252 2357 struct btrfs_fs_info *fs_info = sbio->dev->fs_info;
b5d67f64 2358
4e4cbee9 2359 sbio->status = bio->bi_status;
b5d67f64
SB
2360 sbio->bio = bio;
2361
0339ef2f 2362 btrfs_queue_work(fs_info->scrub_workers, &sbio->work);
b5d67f64
SB
2363}
2364
2365static void scrub_bio_end_io_worker(struct btrfs_work *work)
2366{
2367 struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
d9d181c1 2368 struct scrub_ctx *sctx = sbio->sctx;
b5d67f64
SB
2369 int i;
2370
ff023aac 2371 BUG_ON(sbio->page_count > SCRUB_PAGES_PER_RD_BIO);
4e4cbee9 2372 if (sbio->status) {
b5d67f64
SB
2373 for (i = 0; i < sbio->page_count; i++) {
2374 struct scrub_page *spage = sbio->pagev[i];
2375
2376 spage->io_error = 1;
2377 spage->sblock->no_io_error_seen = 0;
2378 }
2379 }
2380
2381 /* now complete the scrub_block items that have all pages completed */
2382 for (i = 0; i < sbio->page_count; i++) {
2383 struct scrub_page *spage = sbio->pagev[i];
2384 struct scrub_block *sblock = spage->sblock;
2385
2386 if (atomic_dec_and_test(&sblock->outstanding_pages))
2387 scrub_block_complete(sblock);
2388 scrub_block_put(sblock);
2389 }
2390
b5d67f64
SB
2391 bio_put(sbio->bio);
2392 sbio->bio = NULL;
d9d181c1
SB
2393 spin_lock(&sctx->list_lock);
2394 sbio->next_free = sctx->first_free;
2395 sctx->first_free = sbio->index;
2396 spin_unlock(&sctx->list_lock);
ff023aac 2397
2073c4c2 2398 if (sctx->is_dev_replace && sctx->flush_all_writes) {
3fb99303 2399 mutex_lock(&sctx->wr_lock);
ff023aac 2400 scrub_wr_submit(sctx);
3fb99303 2401 mutex_unlock(&sctx->wr_lock);
ff023aac
SB
2402 }
2403
b6bfebc1 2404 scrub_pending_bio_dec(sctx);
b5d67f64
SB
2405}
2406
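/*
 * The helper below maps a byte range inside one RAID56 stripe onto the
 * per-sector bitmaps, wrapping around at the stripe boundary. For
 * example, with a 64K stripe_len and 4K sectors (16 bits per bitmap),
 * marking 16K starting at byte offset 56K into the stripe sets bits
 * 14-15 and then wraps around to set bits 0-1.
 */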
5a6ac9ea
MX
2407static inline void __scrub_mark_bitmap(struct scrub_parity *sparity,
2408 unsigned long *bitmap,
fa485d21 2409 u64 start, u32 len)
5a6ac9ea 2410{
972d7219 2411 u64 offset;
7736b0a4 2412 u32 nsectors;
ab108d99 2413 u32 sectorsize_bits = sparity->sctx->fs_info->sectorsize_bits;
5a6ac9ea
MX
2414
2415 if (len >= sparity->stripe_len) {
2416 bitmap_set(bitmap, 0, sparity->nsectors);
2417 return;
2418 }
2419
2420 start -= sparity->logic_start;
972d7219 2421 start = div64_u64_rem(start, sparity->stripe_len, &offset);
ab108d99 2422 offset = offset >> sectorsize_bits;
fa485d21 2423 nsectors = len >> sectorsize_bits;
5a6ac9ea
MX
2424
2425 if (offset + nsectors <= sparity->nsectors) {
2426 bitmap_set(bitmap, offset, nsectors);
2427 return;
2428 }
2429
2430 bitmap_set(bitmap, offset, sparity->nsectors - offset);
2431 bitmap_set(bitmap, 0, nsectors - (sparity->nsectors - offset));
2432}
2433
2434static inline void scrub_parity_mark_sectors_error(struct scrub_parity *sparity,
fa485d21 2435 u64 start, u32 len)
5a6ac9ea
MX
2436{
2437 __scrub_mark_bitmap(sparity, sparity->ebitmap, start, len);
2438}
2439
2440static inline void scrub_parity_mark_sectors_data(struct scrub_parity *sparity,
fa485d21 2441 u64 start, u32 len)
5a6ac9ea
MX
2442{
2443 __scrub_mark_bitmap(sparity, sparity->dbitmap, start, len);
2444}
2445
b5d67f64
SB
2446static void scrub_block_complete(struct scrub_block *sblock)
2447{
5a6ac9ea
MX
2448 int corrupted = 0;
2449
ff023aac 2450 if (!sblock->no_io_error_seen) {
5a6ac9ea 2451 corrupted = 1;
b5d67f64 2452 scrub_handle_errored_block(sblock);
ff023aac
SB
2453 } else {
2454 /*
2455 * if has checksum error, write via repair mechanism in
2456 * dev replace case, otherwise write here in dev replace
2457 * case.
2458 */
5a6ac9ea
MX
2459 corrupted = scrub_checksum(sblock);
2460 if (!corrupted && sblock->sctx->is_dev_replace)
ff023aac
SB
2461 scrub_write_block_to_dev_replace(sblock);
2462 }
5a6ac9ea
MX
2463
2464 if (sblock->sparity && corrupted && !sblock->data_corrected) {
2465 u64 start = sblock->pagev[0]->logical;
2466 u64 end = sblock->pagev[sblock->page_count - 1]->logical +
2467 PAGE_SIZE;
2468
fa485d21 2469 ASSERT(end - start <= U32_MAX);
5a6ac9ea
MX
2470 scrub_parity_mark_sectors_error(sblock->sparity,
2471 start, end - start);
2472 }
b5d67f64
SB
2473}
2474
480a8ec8
QW
2475static void drop_csum_range(struct scrub_ctx *sctx, struct btrfs_ordered_sum *sum)
2476{
2477 sctx->stat.csum_discards += sum->len >> sctx->fs_info->sectorsize_bits;
2478 list_del(&sum->list);
2479 kfree(sum);
2480}
2481
2482/*
2483 * Find the desired csum for range [logical, logical + sectorsize), and store
2484 * the csum into @csum.
2485 *
2486 * The search source is sctx->csum_list, which is a pre-populated list
2487 * storing bytenr ordered csum ranges. We're responsible for cleaning up any range
2488 * that is before @logical.
2489 *
2490 * Return 0 if there is no csum for the range.
2491 * Return 1 if there is a csum for the range; it is copied to @csum.
2492 */
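/*
 * For example, with a 4K sectorsize and a cached btrfs_ordered_sum
 * covering a 64K range that starts 8K before @logical, the lookup below
 * lands on index 2 of that sum's csum array, copies one csum_size worth
 * of bytes into @csum, and leaves the range on the list for the
 * following sectors.
 */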
3b5753ec 2493static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u8 *csum)
a2de733c 2494{
480a8ec8 2495 bool found = false;
a2de733c 2496
d9d181c1 2497 while (!list_empty(&sctx->csum_list)) {
480a8ec8
QW
2498 struct btrfs_ordered_sum *sum = NULL;
2499 unsigned long index;
2500 unsigned long num_sectors;
2501
d9d181c1 2502 sum = list_first_entry(&sctx->csum_list,
a2de733c 2503 struct btrfs_ordered_sum, list);
480a8ec8 2504 /* The current csum range is beyond our range, no csum found */
a2de733c 2505 if (sum->bytenr > logical)
a2de733c
AJ
2506 break;
2507
480a8ec8
QW
2508 /*
2509 * The current sum is before our bytenr; since scrub is always
2510 * done in bytenr order, the csum will never be used anymore.
2511 * Clean it up so that later calls won't bother with the range,
2512 * and continue searching the next range.
2513 */
2514 if (sum->bytenr + sum->len <= logical) {
2515 drop_csum_range(sctx, sum);
2516 continue;
2517 }
a2de733c 2518
480a8ec8
QW
2519 /* Now the csum range covers our bytenr, copy the csum */
2520 found = true;
2521 index = (logical - sum->bytenr) >> sctx->fs_info->sectorsize_bits;
2522 num_sectors = sum->len >> sctx->fs_info->sectorsize_bits;
1d1bf92d 2523
480a8ec8
QW
2524 memcpy(csum, sum->sums + index * sctx->fs_info->csum_size,
2525 sctx->fs_info->csum_size);
2526
2527 /* Cleanup the range if we're at the end of the csum range */
2528 if (index == num_sectors - 1)
2529 drop_csum_range(sctx, sum);
2530 break;
a2de733c 2531 }
480a8ec8
QW
2532 if (!found)
2533 return 0;
f51a4a18 2534 return 1;
a2de733c
AJ
2535}
2536
2537/* scrub extent tries to collect up to 64 kB for each bio */
6ca1765b 2538static int scrub_extent(struct scrub_ctx *sctx, struct map_lookup *map,
fa485d21 2539 u64 logical, u32 len,
a36cf8b8 2540 u64 physical, struct btrfs_device *dev, u64 flags,
ff023aac 2541 u64 gen, int mirror_num, u64 physical_for_dev_replace)
a2de733c
AJ
2542{
2543 int ret;
2544 u8 csum[BTRFS_CSUM_SIZE];
b5d67f64
SB
2545 u32 blocksize;
2546
2547 if (flags & BTRFS_EXTENT_FLAG_DATA) {
6ca1765b
LB
2548 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
2549 blocksize = map->stripe_len;
2550 else
2551 blocksize = sctx->fs_info->sectorsize;
d9d181c1
SB
2552 spin_lock(&sctx->stat_lock);
2553 sctx->stat.data_extents_scrubbed++;
2554 sctx->stat.data_bytes_scrubbed += len;
2555 spin_unlock(&sctx->stat_lock);
b5d67f64 2556 } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
6ca1765b
LB
2557 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
2558 blocksize = map->stripe_len;
2559 else
2560 blocksize = sctx->fs_info->nodesize;
d9d181c1
SB
2561 spin_lock(&sctx->stat_lock);
2562 sctx->stat.tree_extents_scrubbed++;
2563 sctx->stat.tree_bytes_scrubbed += len;
2564 spin_unlock(&sctx->stat_lock);
b5d67f64 2565 } else {
25cc1226 2566 blocksize = sctx->fs_info->sectorsize;
ff023aac 2567 WARN_ON(1);
b5d67f64 2568 }
a2de733c
AJ
2569
2570 while (len) {
fa485d21 2571 u32 l = min(len, blocksize);
a2de733c
AJ
2572 int have_csum = 0;
2573
2574 if (flags & BTRFS_EXTENT_FLAG_DATA) {
2575 /* push csums to sbio */
3b5753ec 2576 have_csum = scrub_find_csum(sctx, logical, csum);
a2de733c 2577 if (have_csum == 0)
d9d181c1 2578 ++sctx->stat.no_csum;
a2de733c 2579 }
a36cf8b8 2580 ret = scrub_pages(sctx, logical, l, physical, dev, flags, gen,
96e63a45 2581 mirror_num, have_csum ? csum : NULL,
ff023aac 2582 physical_for_dev_replace);
a2de733c
AJ
2583 if (ret)
2584 return ret;
2585 len -= l;
2586 logical += l;
2587 physical += l;
ff023aac 2588 physical_for_dev_replace += l;
a2de733c
AJ
2589 }
2590 return 0;
2591}
2592
5a6ac9ea 2593static int scrub_pages_for_parity(struct scrub_parity *sparity,
fa485d21 2594 u64 logical, u32 len,
5a6ac9ea
MX
2595 u64 physical, struct btrfs_device *dev,
2596 u64 flags, u64 gen, int mirror_num, u8 *csum)
2597{
2598 struct scrub_ctx *sctx = sparity->sctx;
2599 struct scrub_block *sblock;
d0a7a9c0 2600 const u32 sectorsize = sctx->fs_info->sectorsize;
5a6ac9ea
MX
2601 int index;
2602
d0a7a9c0
QW
2603 ASSERT(IS_ALIGNED(len, sectorsize));
2604
58c4e173 2605 sblock = kzalloc(sizeof(*sblock), GFP_KERNEL);
5a6ac9ea
MX
2606 if (!sblock) {
2607 spin_lock(&sctx->stat_lock);
2608 sctx->stat.malloc_errors++;
2609 spin_unlock(&sctx->stat_lock);
2610 return -ENOMEM;
2611 }
2612
2613 /* one ref inside this function, plus one for each page added to
2614 * a bio later on */
186debd6 2615 refcount_set(&sblock->refs, 1);
5a6ac9ea
MX
2616 sblock->sctx = sctx;
2617 sblock->no_io_error_seen = 1;
2618 sblock->sparity = sparity;
2619 scrub_parity_get(sparity);
2620
2621 for (index = 0; len > 0; index++) {
2622 struct scrub_page *spage;
5a6ac9ea 2623
58c4e173 2624 spage = kzalloc(sizeof(*spage), GFP_KERNEL);
5a6ac9ea
MX
2625 if (!spage) {
2626leave_nomem:
2627 spin_lock(&sctx->stat_lock);
2628 sctx->stat.malloc_errors++;
2629 spin_unlock(&sctx->stat_lock);
2630 scrub_block_put(sblock);
2631 return -ENOMEM;
2632 }
2633 BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK);
2634 /* For scrub block */
2635 scrub_page_get(spage);
2636 sblock->pagev[index] = spage;
2637 /* For scrub parity */
2638 scrub_page_get(spage);
2639 list_add_tail(&spage->list, &sparity->spages);
2640 spage->sblock = sblock;
2641 spage->dev = dev;
2642 spage->flags = flags;
2643 spage->generation = gen;
2644 spage->logical = logical;
2645 spage->physical = physical;
2646 spage->mirror_num = mirror_num;
2647 if (csum) {
2648 spage->have_csum = 1;
2ae0c2d8 2649 memcpy(spage->csum, csum, sctx->fs_info->csum_size);
5a6ac9ea
MX
2650 } else {
2651 spage->have_csum = 0;
2652 }
2653 sblock->page_count++;
58c4e173 2654 spage->page = alloc_page(GFP_KERNEL);
5a6ac9ea
MX
2655 if (!spage->page)
2656 goto leave_nomem;
d0a7a9c0
QW
2657
2658
2659 /* Iterate over the stripe range in sectorsize steps */
2660 len -= sectorsize;
2661 logical += sectorsize;
2662 physical += sectorsize;
5a6ac9ea
MX
2663 }
2664
2665 WARN_ON(sblock->page_count == 0);
2666 for (index = 0; index < sblock->page_count; index++) {
2667 struct scrub_page *spage = sblock->pagev[index];
2668 int ret;
2669
2670 ret = scrub_add_page_to_rd_bio(sctx, spage);
2671 if (ret) {
2672 scrub_block_put(sblock);
2673 return ret;
2674 }
2675 }
2676
2677 /* last one frees, either here or in bio completion for last page */
2678 scrub_block_put(sblock);
2679 return 0;
2680}
2681
2682static int scrub_extent_for_parity(struct scrub_parity *sparity,
fa485d21 2683 u64 logical, u32 len,
5a6ac9ea
MX
2684 u64 physical, struct btrfs_device *dev,
2685 u64 flags, u64 gen, int mirror_num)
2686{
2687 struct scrub_ctx *sctx = sparity->sctx;
2688 int ret;
2689 u8 csum[BTRFS_CSUM_SIZE];
2690 u32 blocksize;
2691
e6e674bd 2692 if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state)) {
4a770891
OS
2693 scrub_parity_mark_sectors_error(sparity, logical, len);
2694 return 0;
2695 }
2696
5a6ac9ea 2697 if (flags & BTRFS_EXTENT_FLAG_DATA) {
6ca1765b 2698 blocksize = sparity->stripe_len;
5a6ac9ea 2699 } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
6ca1765b 2700 blocksize = sparity->stripe_len;
5a6ac9ea 2701 } else {
25cc1226 2702 blocksize = sctx->fs_info->sectorsize;
5a6ac9ea
MX
2703 WARN_ON(1);
2704 }
2705
2706 while (len) {
fa485d21 2707 u32 l = min(len, blocksize);
5a6ac9ea
MX
2708 int have_csum = 0;
2709
2710 if (flags & BTRFS_EXTENT_FLAG_DATA) {
2711 /* push csums to sbio */
3b5753ec 2712 have_csum = scrub_find_csum(sctx, logical, csum);
5a6ac9ea
MX
2713 if (have_csum == 0)
2714 goto skip;
2715 }
2716 ret = scrub_pages_for_parity(sparity, logical, l, physical, dev,
2717 flags, gen, mirror_num,
2718 have_csum ? csum : NULL);
5a6ac9ea
MX
2719 if (ret)
2720 return ret;
6b6d24b3 2721skip:
5a6ac9ea
MX
2722 len -= l;
2723 logical += l;
2724 physical += l;
2725 }
2726 return 0;
2727}
2728
3b080b25
WS
2729/*
2730 * Given a physical address, this will calculate its
2731 * logical offset. If this is a parity stripe, it will return
2732 * the leftmost data stripe's logical offset.
2733 *
2734 * Return 0 if it is a data stripe, 1 if it is a parity stripe.
2735 */
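/*
 * Example: RAID5 over three devices (two data stripes plus parity,
 * 64K stripe_len). For num == 2 and a physical address at the start of
 * that device's chunk stripe, neither rotation (i == 0 nor i == 1) maps
 * a data stripe onto device 2, so the first stripe-set holds parity
 * there: the function returns 1 and advances *offset to 128K, the start
 * of the next full data stripe relative to the chunk's logical start.
 */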
2736static int get_raid56_logic_offset(u64 physical, int num,
5a6ac9ea
MX
2737 struct map_lookup *map, u64 *offset,
2738 u64 *stripe_start)
3b080b25
WS
2739{
2740 int i;
2741 int j = 0;
2742 u64 stripe_nr;
2743 u64 last_offset;
9d644a62
DS
2744 u32 stripe_index;
2745 u32 rot;
cff82672 2746 const int data_stripes = nr_data_stripes(map);
3b080b25 2747
cff82672 2748 last_offset = (physical - map->stripes[num].physical) * data_stripes;
5a6ac9ea
MX
2749 if (stripe_start)
2750 *stripe_start = last_offset;
2751
3b080b25 2752 *offset = last_offset;
cff82672 2753 for (i = 0; i < data_stripes; i++) {
3b080b25
WS
2754 *offset = last_offset + i * map->stripe_len;
2755
42c61ab6 2756 stripe_nr = div64_u64(*offset, map->stripe_len);
cff82672 2757 stripe_nr = div_u64(stripe_nr, data_stripes);
3b080b25
WS
2758
2759 /* Work out the disk rotation on this stripe-set */
47c5713f 2760 stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, &rot);
3b080b25
WS
2761 /* calculate which stripe this data is located on */
2762 rot += i;
e4fbaee2 2763 stripe_index = rot % map->num_stripes;
3b080b25
WS
2764 if (stripe_index == num)
2765 return 0;
2766 if (stripe_index < num)
2767 j++;
2768 }
2769 *offset = last_offset + j * map->stripe_len;
2770 return 1;
2771}
2772
5a6ac9ea
MX
2773static void scrub_free_parity(struct scrub_parity *sparity)
2774{
2775 struct scrub_ctx *sctx = sparity->sctx;
2776 struct scrub_page *curr, *next;
2777 int nbits;
2778
2779 nbits = bitmap_weight(sparity->ebitmap, sparity->nsectors);
2780 if (nbits) {
2781 spin_lock(&sctx->stat_lock);
2782 sctx->stat.read_errors += nbits;
2783 sctx->stat.uncorrectable_errors += nbits;
2784 spin_unlock(&sctx->stat_lock);
2785 }
2786
2787 list_for_each_entry_safe(curr, next, &sparity->spages, list) {
2788 list_del_init(&curr->list);
2789 scrub_page_put(curr);
2790 }
2791
2792 kfree(sparity);
2793}
2794
20b2e302
ZL
2795static void scrub_parity_bio_endio_worker(struct btrfs_work *work)
2796{
2797 struct scrub_parity *sparity = container_of(work, struct scrub_parity,
2798 work);
2799 struct scrub_ctx *sctx = sparity->sctx;
2800
2801 scrub_free_parity(sparity);
2802 scrub_pending_bio_dec(sctx);
2803}
2804
4246a0b6 2805static void scrub_parity_bio_endio(struct bio *bio)
5a6ac9ea
MX
2806{
2807 struct scrub_parity *sparity = (struct scrub_parity *)bio->bi_private;
0b246afa 2808 struct btrfs_fs_info *fs_info = sparity->sctx->fs_info;
5a6ac9ea 2809
4e4cbee9 2810 if (bio->bi_status)
5a6ac9ea
MX
2811 bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap,
2812 sparity->nsectors);
2813
5a6ac9ea 2814 bio_put(bio);
20b2e302 2815
a0cac0ec
OS
2816 btrfs_init_work(&sparity->work, scrub_parity_bio_endio_worker, NULL,
2817 NULL);
0b246afa 2818 btrfs_queue_work(fs_info->scrub_parity_workers, &sparity->work);
5a6ac9ea
MX
2819}
2820
2821static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
2822{
2823 struct scrub_ctx *sctx = sparity->sctx;
0b246afa 2824 struct btrfs_fs_info *fs_info = sctx->fs_info;
5a6ac9ea
MX
2825 struct bio *bio;
2826 struct btrfs_raid_bio *rbio;
5a6ac9ea 2827 struct btrfs_bio *bbio = NULL;
5a6ac9ea
MX
2828 u64 length;
2829 int ret;
2830
2831 if (!bitmap_andnot(sparity->dbitmap, sparity->dbitmap, sparity->ebitmap,
2832 sparity->nsectors))
2833 goto out;
2834
a0dd59de 2835 length = sparity->logic_end - sparity->logic_start;
ae6529c3
QW
2836
2837 btrfs_bio_counter_inc_blocked(fs_info);
0b246afa 2838 ret = btrfs_map_sblock(fs_info, BTRFS_MAP_WRITE, sparity->logic_start,
825ad4c9 2839 &length, &bbio);
8e5cfb55 2840 if (ret || !bbio || !bbio->raid_map)
5a6ac9ea
MX
2841 goto bbio_out;
2842
c5e4c3d7 2843 bio = btrfs_io_bio_alloc(0);
5a6ac9ea
MX
2844 bio->bi_iter.bi_sector = sparity->logic_start >> 9;
2845 bio->bi_private = sparity;
2846 bio->bi_end_io = scrub_parity_bio_endio;
2847
2ff7e61e 2848 rbio = raid56_parity_alloc_scrub_rbio(fs_info, bio, bbio,
8e5cfb55 2849 length, sparity->scrub_dev,
5a6ac9ea
MX
2850 sparity->dbitmap,
2851 sparity->nsectors);
2852 if (!rbio)
2853 goto rbio_out;
2854
5a6ac9ea
MX
2855 scrub_pending_bio_inc(sctx);
2856 raid56_parity_submit_scrub_rbio(rbio);
2857 return;
2858
2859rbio_out:
2860 bio_put(bio);
2861bbio_out:
ae6529c3 2862 btrfs_bio_counter_dec(fs_info);
6e9606d2 2863 btrfs_put_bbio(bbio);
5a6ac9ea
MX
2864 bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap,
2865 sparity->nsectors);
2866 spin_lock(&sctx->stat_lock);
2867 sctx->stat.malloc_errors++;
2868 spin_unlock(&sctx->stat_lock);
2869out:
2870 scrub_free_parity(sparity);
2871}
2872
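/*
 * Each scrub_parity carries two bitmaps (dbitmap and ebitmap) with one
 * bit per sector of a stripe. E.g. a 64K stripe with 4K sectors needs 16
 * bits, which rounds up to a single unsigned long (8 bytes on 64-bit)
 * per bitmap.
 */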
2873static inline int scrub_calc_parity_bitmap_len(int nsectors)
2874{
bfca9a6d 2875 return DIV_ROUND_UP(nsectors, BITS_PER_LONG) * sizeof(long);
5a6ac9ea
MX
2876}
2877
2878static void scrub_parity_get(struct scrub_parity *sparity)
2879{
78a76450 2880 refcount_inc(&sparity->refs);
5a6ac9ea
MX
2881}
2882
2883static void scrub_parity_put(struct scrub_parity *sparity)
2884{
78a76450 2885 if (!refcount_dec_and_test(&sparity->refs))
5a6ac9ea
MX
2886 return;
2887
2888 scrub_parity_check_and_repair(sparity);
2889}
2890
2891static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx,
2892 struct map_lookup *map,
2893 struct btrfs_device *sdev,
2894 struct btrfs_path *path,
2895 u64 logic_start,
2896 u64 logic_end)
2897{
fb456252 2898 struct btrfs_fs_info *fs_info = sctx->fs_info;
5a6ac9ea
MX
2899 struct btrfs_root *root = fs_info->extent_root;
2900 struct btrfs_root *csum_root = fs_info->csum_root;
2901 struct btrfs_extent_item *extent;
4a770891 2902 struct btrfs_bio *bbio = NULL;
5a6ac9ea
MX
2903 u64 flags;
2904 int ret;
2905 int slot;
2906 struct extent_buffer *l;
2907 struct btrfs_key key;
2908 u64 generation;
2909 u64 extent_logical;
2910 u64 extent_physical;
fa485d21
QW
2911 /* Check the comment in scrub_stripe() for why u32 is enough here */
2912 u32 extent_len;
4a770891 2913 u64 mapped_length;
5a6ac9ea
MX
2914 struct btrfs_device *extent_dev;
2915 struct scrub_parity *sparity;
2916 int nsectors;
2917 int bitmap_len;
2918 int extent_mirror_num;
2919 int stop_loop = 0;
2920
fa485d21 2921 ASSERT(map->stripe_len <= U32_MAX);
ab108d99 2922 nsectors = map->stripe_len >> fs_info->sectorsize_bits;
5a6ac9ea
MX
2923 bitmap_len = scrub_calc_parity_bitmap_len(nsectors);
2924 sparity = kzalloc(sizeof(struct scrub_parity) + 2 * bitmap_len,
2925 GFP_NOFS);
2926 if (!sparity) {
2927 spin_lock(&sctx->stat_lock);
2928 sctx->stat.malloc_errors++;
2929 spin_unlock(&sctx->stat_lock);
2930 return -ENOMEM;
2931 }
2932
fa485d21 2933 ASSERT(map->stripe_len <= U32_MAX);
5a6ac9ea
MX
2934 sparity->stripe_len = map->stripe_len;
2935 sparity->nsectors = nsectors;
2936 sparity->sctx = sctx;
2937 sparity->scrub_dev = sdev;
2938 sparity->logic_start = logic_start;
2939 sparity->logic_end = logic_end;
78a76450 2940 refcount_set(&sparity->refs, 1);
5a6ac9ea
MX
2941 INIT_LIST_HEAD(&sparity->spages);
2942 sparity->dbitmap = sparity->bitmap;
2943 sparity->ebitmap = (void *)sparity->bitmap + bitmap_len;
2944
2945 ret = 0;
2946 while (logic_start < logic_end) {
2947 if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
2948 key.type = BTRFS_METADATA_ITEM_KEY;
2949 else
2950 key.type = BTRFS_EXTENT_ITEM_KEY;
2951 key.objectid = logic_start;
2952 key.offset = (u64)-1;
2953
2954 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2955 if (ret < 0)
2956 goto out;
2957
2958 if (ret > 0) {
2959 ret = btrfs_previous_extent_item(root, path, 0);
2960 if (ret < 0)
2961 goto out;
2962 if (ret > 0) {
2963 btrfs_release_path(path);
2964 ret = btrfs_search_slot(NULL, root, &key,
2965 path, 0, 0);
2966 if (ret < 0)
2967 goto out;
2968 }
2969 }
2970
2971 stop_loop = 0;
2972 while (1) {
2973 u64 bytes;
2974
2975 l = path->nodes[0];
2976 slot = path->slots[0];
2977 if (slot >= btrfs_header_nritems(l)) {
2978 ret = btrfs_next_leaf(root, path);
2979 if (ret == 0)
2980 continue;
2981 if (ret < 0)
2982 goto out;
2983
2984 stop_loop = 1;
2985 break;
2986 }
2987 btrfs_item_key_to_cpu(l, &key, slot);
2988
d7cad238
ZL
2989 if (key.type != BTRFS_EXTENT_ITEM_KEY &&
2990 key.type != BTRFS_METADATA_ITEM_KEY)
2991 goto next;
2992
5a6ac9ea 2993 if (key.type == BTRFS_METADATA_ITEM_KEY)
0b246afa 2994 bytes = fs_info->nodesize;
5a6ac9ea
MX
2995 else
2996 bytes = key.offset;
2997
2998 if (key.objectid + bytes <= logic_start)
2999 goto next;
3000
a0dd59de 3001 if (key.objectid >= logic_end) {
5a6ac9ea
MX
3002 stop_loop = 1;
3003 break;
3004 }
3005
3006 while (key.objectid >= logic_start + map->stripe_len)
3007 logic_start += map->stripe_len;
3008
3009 extent = btrfs_item_ptr(l, slot,
3010 struct btrfs_extent_item);
3011 flags = btrfs_extent_flags(l, extent);
3012 generation = btrfs_extent_generation(l, extent);
3013
a323e813
ZL
3014 if ((flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) &&
3015 (key.objectid < logic_start ||
3016 key.objectid + bytes >
3017 logic_start + map->stripe_len)) {
5d163e0e
JM
3018 btrfs_err(fs_info,
3019 "scrub: tree block %llu spanning stripes, ignored. logical=%llu",
a323e813 3020 key.objectid, logic_start);
9799d2c3
ZL
3021 spin_lock(&sctx->stat_lock);
3022 sctx->stat.uncorrectable_errors++;
3023 spin_unlock(&sctx->stat_lock);
5a6ac9ea
MX
3024 goto next;
3025 }
3026again:
3027 extent_logical = key.objectid;
fa485d21 3028 ASSERT(bytes <= U32_MAX);
5a6ac9ea
MX
3029 extent_len = bytes;
3030
3031 if (extent_logical < logic_start) {
3032 extent_len -= logic_start - extent_logical;
3033 extent_logical = logic_start;
3034 }
3035
3036 if (extent_logical + extent_len >
3037 logic_start + map->stripe_len)
3038 extent_len = logic_start + map->stripe_len -
3039 extent_logical;
3040
3041 scrub_parity_mark_sectors_data(sparity, extent_logical,
3042 extent_len);
3043
4a770891 3044 mapped_length = extent_len;
f1fee653 3045 bbio = NULL;
cf8cddd3
CH
3046 ret = btrfs_map_block(fs_info, BTRFS_MAP_READ,
3047 extent_logical, &mapped_length, &bbio,
3048 0);
4a770891
OS
3049 if (!ret) {
3050 if (!bbio || mapped_length < extent_len)
3051 ret = -EIO;
3052 }
3053 if (ret) {
3054 btrfs_put_bbio(bbio);
3055 goto out;
3056 }
3057 extent_physical = bbio->stripes[0].physical;
3058 extent_mirror_num = bbio->mirror_num;
3059 extent_dev = bbio->stripes[0].dev;
3060 btrfs_put_bbio(bbio);
5a6ac9ea
MX
3061
3062 ret = btrfs_lookup_csums_range(csum_root,
3063 extent_logical,
3064 extent_logical + extent_len - 1,
3065 &sctx->csum_list, 1);
3066 if (ret)
3067 goto out;
3068
3069 ret = scrub_extent_for_parity(sparity, extent_logical,
3070 extent_len,
3071 extent_physical,
3072 extent_dev, flags,
3073 generation,
3074 extent_mirror_num);
6fa96d72
ZL
3075
3076 scrub_free_csums(sctx);
3077
5a6ac9ea
MX
3078 if (ret)
3079 goto out;
3080
5a6ac9ea
MX
3081 if (extent_logical + extent_len <
3082 key.objectid + bytes) {
3083 logic_start += map->stripe_len;
3084
3085 if (logic_start >= logic_end) {
3086 stop_loop = 1;
3087 break;
3088 }
3089
3090 if (logic_start < key.objectid + bytes) {
3091 cond_resched();
3092 goto again;
3093 }
3094 }
3095next:
3096 path->slots[0]++;
3097 }
3098
3099 btrfs_release_path(path);
3100
3101 if (stop_loop)
3102 break;
3103
3104 logic_start += map->stripe_len;
3105 }
3106out:
fa485d21
QW
3107 if (ret < 0) {
3108 ASSERT(logic_end - logic_start <= U32_MAX);
5a6ac9ea 3109 scrub_parity_mark_sectors_error(sparity, logic_start,
a0dd59de 3110 logic_end - logic_start);
fa485d21 3111 }
5a6ac9ea
MX
3112 scrub_parity_put(sparity);
3113 scrub_submit(sctx);
3fb99303 3114 mutex_lock(&sctx->wr_lock);
5a6ac9ea 3115 scrub_wr_submit(sctx);
3fb99303 3116 mutex_unlock(&sctx->wr_lock);
5a6ac9ea
MX
3117
3118 btrfs_release_path(path);
3119 return ret < 0 ? ret : 0;
3120}
3121
de17addc
NA
3122static void sync_replace_for_zoned(struct scrub_ctx *sctx)
3123{
3124 if (!btrfs_is_zoned(sctx->fs_info))
3125 return;
3126
3127 sctx->flush_all_writes = true;
3128 scrub_submit(sctx);
3129 mutex_lock(&sctx->wr_lock);
3130 scrub_wr_submit(sctx);
3131 mutex_unlock(&sctx->wr_lock);
3132
3133 wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
3134}
3135
7db1c5d1
NA
3136static int sync_write_pointer_for_zoned(struct scrub_ctx *sctx, u64 logical,
3137 u64 physical, u64 physical_end)
3138{
3139 struct btrfs_fs_info *fs_info = sctx->fs_info;
3140 int ret = 0;
3141
3142 if (!btrfs_is_zoned(fs_info))
3143 return 0;
3144
3145 wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
3146
3147 mutex_lock(&sctx->wr_lock);
3148 if (sctx->write_pointer < physical_end) {
3149 ret = btrfs_sync_zone_write_pointer(sctx->wr_tgtdev, logical,
3150 physical,
3151 sctx->write_pointer);
3152 if (ret)
3153 btrfs_err(fs_info,
3154 "zoned: failed to recover write pointer");
3155 }
3156 mutex_unlock(&sctx->wr_lock);
3157 btrfs_dev_clear_zone_empty(sctx->wr_tgtdev, physical);
3158
3159 return ret;
3160}
3161
d9d181c1 3162static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
a36cf8b8
SB
3163 struct map_lookup *map,
3164 struct btrfs_device *scrub_dev,
2473d24f
FM
3165 int num, u64 base, u64 length,
3166 struct btrfs_block_group *cache)
a2de733c 3167{
5a6ac9ea 3168 struct btrfs_path *path, *ppath;
fb456252 3169 struct btrfs_fs_info *fs_info = sctx->fs_info;
a2de733c
AJ
3170 struct btrfs_root *root = fs_info->extent_root;
3171 struct btrfs_root *csum_root = fs_info->csum_root;
3172 struct btrfs_extent_item *extent;
e7786c3a 3173 struct blk_plug plug;
a2de733c
AJ
3174 u64 flags;
3175 int ret;
3176 int slot;
a2de733c 3177 u64 nstripes;
a2de733c 3178 struct extent_buffer *l;
a2de733c
AJ
3179 u64 physical;
3180 u64 logical;
625f1c8d 3181 u64 logic_end;
3b080b25 3182 u64 physical_end;
a2de733c 3183 u64 generation;
e12fa9cd 3184 int mirror_num;
7a26285e
AJ
3185 struct reada_control *reada1;
3186 struct reada_control *reada2;
e6c11f9a 3187 struct btrfs_key key;
7a26285e 3188 struct btrfs_key key_end;
a2de733c
AJ
3189 u64 increment = map->stripe_len;
3190 u64 offset;
ff023aac
SB
3191 u64 extent_logical;
3192 u64 extent_physical;
fa485d21
QW
3193 /*
3194 * Unlike chunk length, extent length should never go beyond
3195 * BTRFS_MAX_EXTENT_SIZE, thus u32 is enough here.
3196 */
3197 u32 extent_len;
5a6ac9ea
MX
3198 u64 stripe_logical;
3199 u64 stripe_end;
ff023aac
SB
3200 struct btrfs_device *extent_dev;
3201 int extent_mirror_num;
3b080b25 3202 int stop_loop = 0;
53b381b3 3203
3b080b25 3204 physical = map->stripes[num].physical;
a2de733c 3205 offset = 0;
42c61ab6 3206 nstripes = div64_u64(length, map->stripe_len);
a2de733c
AJ
3207 if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
3208 offset = map->stripe_len * num;
3209 increment = map->stripe_len * map->num_stripes;
193ea74b 3210 mirror_num = 1;
a2de733c
AJ
3211 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
3212 int factor = map->num_stripes / map->sub_stripes;
3213 offset = map->stripe_len * (num / map->sub_stripes);
3214 increment = map->stripe_len * factor;
193ea74b 3215 mirror_num = num % map->sub_stripes + 1;
c7369b3f 3216 } else if (map->type & BTRFS_BLOCK_GROUP_RAID1_MASK) {
a2de733c 3217 increment = map->stripe_len;
193ea74b 3218 mirror_num = num % map->num_stripes + 1;
a2de733c
AJ
3219 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
3220 increment = map->stripe_len;
193ea74b 3221 mirror_num = num % map->num_stripes + 1;
ffe2d203 3222 } else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
5a6ac9ea 3223 get_raid56_logic_offset(physical, num, map, &offset, NULL);
3b080b25
WS
3224 increment = map->stripe_len * nr_data_stripes(map);
3225 mirror_num = 1;
a2de733c
AJ
3226 } else {
3227 increment = map->stripe_len;
193ea74b 3228 mirror_num = 1;
a2de733c
AJ
3229 }
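/*
 * For instance, on RAID10 with four devices and sub_stripes == 2, stripe
 * num == 3 starts at offset stripe_len (factor == 2), advances by
 * 2 * stripe_len per iteration and is scrubbed as mirror_num == 2.
 */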
3230
3231 path = btrfs_alloc_path();
3232 if (!path)
3233 return -ENOMEM;
3234
5a6ac9ea
MX
3235 ppath = btrfs_alloc_path();
3236 if (!ppath) {
379d6854 3237 btrfs_free_path(path);
5a6ac9ea
MX
3238 return -ENOMEM;
3239 }
3240
b5d67f64
SB
3241 /*
3242 * Work on the commit root. The related disk blocks are static as
3243 * long as COW is applied. This means it is safe to rewrite
3244 * them to repair disk errors without any race conditions.
3245 */
a2de733c
AJ
3246 path->search_commit_root = 1;
3247 path->skip_locking = 1;
3248
063c54dc
GH
3249 ppath->search_commit_root = 1;
3250 ppath->skip_locking = 1;
a2de733c 3251 /*
7a26285e
AJ
3252 * Trigger the readahead for the extent tree and csum tree and wait for
3253 * completion. During readahead, the scrub is officially paused
3254 * so it does not hold off transaction commits.
a2de733c
AJ
3255 */
3256 logical = base + offset;
3b080b25 3257 physical_end = physical + nstripes * map->stripe_len;
ffe2d203 3258 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
3b080b25 3259 get_raid56_logic_offset(physical_end, num,
5a6ac9ea 3260 map, &logic_end, NULL);
3b080b25
WS
3261 logic_end += base;
3262 } else {
3263 logic_end = logical + increment * nstripes;
3264 }
d9d181c1 3265 wait_event(sctx->list_wait,
b6bfebc1 3266 atomic_read(&sctx->bios_in_flight) == 0);
cb7ab021 3267 scrub_blocked_if_needed(fs_info);
7a26285e
AJ
3268
3269 /* FIXME it might be better to start readahead at commit root */
e6c11f9a
DS
3270 key.objectid = logical;
3271 key.type = BTRFS_EXTENT_ITEM_KEY;
3272 key.offset = (u64)0;
3b080b25 3273 key_end.objectid = logic_end;
3173a18f
JB
3274 key_end.type = BTRFS_METADATA_ITEM_KEY;
3275 key_end.offset = (u64)-1;
e6c11f9a 3276 reada1 = btrfs_reada_add(root, &key, &key_end);
7a26285e 3277
a6889caf
FM
3278 if (cache->flags & BTRFS_BLOCK_GROUP_DATA) {
3279 key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
3280 key.type = BTRFS_EXTENT_CSUM_KEY;
3281 key.offset = logical;
3282 key_end.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
3283 key_end.type = BTRFS_EXTENT_CSUM_KEY;
3284 key_end.offset = logic_end;
3285 reada2 = btrfs_reada_add(csum_root, &key, &key_end);
3286 } else {
3287 reada2 = NULL;
3288 }
7a26285e
AJ
3289
3290 if (!IS_ERR(reada1))
3291 btrfs_reada_wait(reada1);
a6889caf 3292 if (!IS_ERR_OR_NULL(reada2))
7a26285e
AJ
3293 btrfs_reada_wait(reada2);
3294
a2de733c
AJ
3295
3296 /*
3297 * Collect all data csums for the stripe to avoid seeking during
3298 * the scrub. With crc32 this might currently end up being about 1MB.
3299 */
e7786c3a 3300 blk_start_plug(&plug);
a2de733c 3301
de17addc
NA
3302 if (sctx->is_dev_replace &&
3303 btrfs_dev_is_sequential(sctx->wr_tgtdev, physical)) {
3304 mutex_lock(&sctx->wr_lock);
3305 sctx->write_pointer = physical;
3306 mutex_unlock(&sctx->wr_lock);
3307 sctx->flush_all_writes = true;
3308 }
3309
a2de733c
AJ
3310 /*
3311 * now find all extents for each stripe and scrub them
3312 */
a2de733c 3313 ret = 0;
3b080b25 3314 while (physical < physical_end) {
a2de733c
AJ
3315 /*
3316 * canceled?
3317 */
3318 if (atomic_read(&fs_info->scrub_cancel_req) ||
d9d181c1 3319 atomic_read(&sctx->cancel_req)) {
a2de733c
AJ
3320 ret = -ECANCELED;
3321 goto out;
3322 }
3323 /*
3324 * check to see if we have to pause
3325 */
3326 if (atomic_read(&fs_info->scrub_pause_req)) {
3327 /* push queued extents */
2073c4c2 3328 sctx->flush_all_writes = true;
d9d181c1 3329 scrub_submit(sctx);
3fb99303 3330 mutex_lock(&sctx->wr_lock);
ff023aac 3331 scrub_wr_submit(sctx);
3fb99303 3332 mutex_unlock(&sctx->wr_lock);
d9d181c1 3333 wait_event(sctx->list_wait,
b6bfebc1 3334 atomic_read(&sctx->bios_in_flight) == 0);
2073c4c2 3335 sctx->flush_all_writes = false;
3cb0929a 3336 scrub_blocked_if_needed(fs_info);
a2de733c
AJ
3337 }
3338
f2f66a2f
ZL
3339 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
3340 ret = get_raid56_logic_offset(physical, num, map,
3341 &logical,
3342 &stripe_logical);
3343 logical += base;
3344 if (ret) {
7955323b 3345 /* it is a parity stripe */
f2f66a2f 3346 stripe_logical += base;
a0dd59de 3347 stripe_end = stripe_logical + increment;
f2f66a2f
ZL
3348 ret = scrub_raid56_parity(sctx, map, scrub_dev,
3349 ppath, stripe_logical,
3350 stripe_end);
3351 if (ret)
3352 goto out;
3353 goto skip;
3354 }
3355 }
3356
7c76edb7
WS
3357 if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
3358 key.type = BTRFS_METADATA_ITEM_KEY;
3359 else
3360 key.type = BTRFS_EXTENT_ITEM_KEY;
a2de733c 3361 key.objectid = logical;
625f1c8d 3362 key.offset = (u64)-1;
a2de733c
AJ
3363
3364 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3365 if (ret < 0)
3366 goto out;
3173a18f 3367
8c51032f 3368 if (ret > 0) {
ade2e0b3 3369 ret = btrfs_previous_extent_item(root, path, 0);
a2de733c
AJ
3370 if (ret < 0)
3371 goto out;
8c51032f
AJ
3372 if (ret > 0) {
3373 /* there's no smaller item, so stick with the
3374 * larger one */
3375 btrfs_release_path(path);
3376 ret = btrfs_search_slot(NULL, root, &key,
3377 path, 0, 0);
3378 if (ret < 0)
3379 goto out;
3380 }
a2de733c
AJ
3381 }
3382
625f1c8d 3383 stop_loop = 0;
a2de733c 3384 while (1) {
3173a18f
JB
3385 u64 bytes;
3386
a2de733c
AJ
3387 l = path->nodes[0];
3388 slot = path->slots[0];
3389 if (slot >= btrfs_header_nritems(l)) {
3390 ret = btrfs_next_leaf(root, path);
3391 if (ret == 0)
3392 continue;
3393 if (ret < 0)
3394 goto out;
3395
625f1c8d 3396 stop_loop = 1;
a2de733c
AJ
3397 break;
3398 }
3399 btrfs_item_key_to_cpu(l, &key, slot);
3400
d7cad238
ZL
3401 if (key.type != BTRFS_EXTENT_ITEM_KEY &&
3402 key.type != BTRFS_METADATA_ITEM_KEY)
3403 goto next;
3404
3173a18f 3405 if (key.type == BTRFS_METADATA_ITEM_KEY)
0b246afa 3406 bytes = fs_info->nodesize;
3173a18f
JB
3407 else
3408 bytes = key.offset;
3409
3410 if (key.objectid + bytes <= logical)
a2de733c
AJ
3411 goto next;
3412
625f1c8d
LB
3413 if (key.objectid >= logical + map->stripe_len) {
3414 /* out of this device extent */
3415 if (key.objectid >= logic_end)
3416 stop_loop = 1;
3417 break;
3418 }
a2de733c 3419
2473d24f
FM
3420 /*
3421 * If our block group was removed in the meanwhile, just
3422 * stop scrubbing since there is no point in continuing.
3423 * Continuing would prevent reusing its device extents
3424 * for new block groups for a long time.
3425 */
3426 spin_lock(&cache->lock);
3427 if (cache->removed) {
3428 spin_unlock(&cache->lock);
3429 ret = 0;
3430 goto out;
3431 }
3432 spin_unlock(&cache->lock);
3433
a2de733c
AJ
3434 extent = btrfs_item_ptr(l, slot,
3435 struct btrfs_extent_item);
3436 flags = btrfs_extent_flags(l, extent);
3437 generation = btrfs_extent_generation(l, extent);
3438
a323e813
ZL
3439 if ((flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) &&
3440 (key.objectid < logical ||
3441 key.objectid + bytes >
3442 logical + map->stripe_len)) {
efe120a0 3443 btrfs_err(fs_info,
5d163e0e 3444 "scrub: tree block %llu spanning stripes, ignored. logical=%llu",
c1c9ff7c 3445 key.objectid, logical);
9799d2c3
ZL
3446 spin_lock(&sctx->stat_lock);
3447 sctx->stat.uncorrectable_errors++;
3448 spin_unlock(&sctx->stat_lock);
a2de733c
AJ
3449 goto next;
3450 }
3451
625f1c8d
LB
3452again:
3453 extent_logical = key.objectid;
fa485d21 3454 ASSERT(bytes <= U32_MAX);
625f1c8d
LB
3455 extent_len = bytes;
3456
a2de733c
AJ
3457 /*
3458 * trim extent to this stripe
3459 */
625f1c8d
LB
3460 if (extent_logical < logical) {
3461 extent_len -= logical - extent_logical;
3462 extent_logical = logical;
a2de733c 3463 }
625f1c8d 3464 if (extent_logical + extent_len >
a2de733c 3465 logical + map->stripe_len) {
625f1c8d
LB
3466 extent_len = logical + map->stripe_len -
3467 extent_logical;
a2de733c
AJ
3468 }
3469
625f1c8d 3470 extent_physical = extent_logical - logical + physical;
ff023aac
SB
3471 extent_dev = scrub_dev;
3472 extent_mirror_num = mirror_num;
32934280 3473 if (sctx->is_dev_replace)
ff023aac
SB
3474 scrub_remap_extent(fs_info, extent_logical,
3475 extent_len, &extent_physical,
3476 &extent_dev,
3477 &extent_mirror_num);
625f1c8d 3478
89490303
FM
3479 if (flags & BTRFS_EXTENT_FLAG_DATA) {
3480 ret = btrfs_lookup_csums_range(csum_root,
3481 extent_logical,
3482 extent_logical + extent_len - 1,
3483 &sctx->csum_list, 1);
3484 if (ret)
3485 goto out;
3486 }
625f1c8d 3487
6ca1765b 3488 ret = scrub_extent(sctx, map, extent_logical, extent_len,
ff023aac
SB
3489 extent_physical, extent_dev, flags,
3490 generation, extent_mirror_num,
115930cb 3491 extent_logical - logical + physical);
6fa96d72
ZL
3492
3493 scrub_free_csums(sctx);
3494
a2de733c
AJ
3495 if (ret)
3496 goto out;
3497
de17addc
NA
3498 if (sctx->is_dev_replace)
3499 sync_replace_for_zoned(sctx);
3500
625f1c8d
LB
3501 if (extent_logical + extent_len <
3502 key.objectid + bytes) {
ffe2d203 3503 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
3b080b25
WS
3504 /*
3505 * loop until we find the next data stripe
3506 * or we have finished all stripes.
3507 */
5a6ac9ea
MX
3508loop:
3509 physical += map->stripe_len;
3510 ret = get_raid56_logic_offset(physical,
3511 num, map, &logical,
3512 &stripe_logical);
3513 logical += base;
3514
3515 if (ret && physical < physical_end) {
3516 stripe_logical += base;
3517 stripe_end = stripe_logical +
a0dd59de 3518 increment;
5a6ac9ea
MX
3519 ret = scrub_raid56_parity(sctx,
3520 map, scrub_dev, ppath,
3521 stripe_logical,
3522 stripe_end);
3523 if (ret)
3524 goto out;
3525 goto loop;
3526 }
3b080b25
WS
3527 } else {
3528 physical += map->stripe_len;
3529 logical += increment;
3530 }
625f1c8d
LB
3531 if (logical < key.objectid + bytes) {
3532 cond_resched();
3533 goto again;
3534 }
3535
3b080b25 3536 if (physical >= physical_end) {
625f1c8d
LB
3537 stop_loop = 1;
3538 break;
3539 }
3540 }
a2de733c
AJ
3541next:
3542 path->slots[0]++;
3543 }
71267333 3544 btrfs_release_path(path);
3b080b25 3545skip:
a2de733c
AJ
3546 logical += increment;
3547 physical += map->stripe_len;
d9d181c1 3548 spin_lock(&sctx->stat_lock);
625f1c8d
LB
3549 if (stop_loop)
3550 sctx->stat.last_physical = map->stripes[num].physical +
3551 length;
3552 else
3553 sctx->stat.last_physical = physical;
d9d181c1 3554 spin_unlock(&sctx->stat_lock);
625f1c8d
LB
3555 if (stop_loop)
3556 break;
a2de733c 3557 }
ff023aac 3558out:
a2de733c 3559 /* push queued extents */
d9d181c1 3560 scrub_submit(sctx);
3fb99303 3561 mutex_lock(&sctx->wr_lock);
ff023aac 3562 scrub_wr_submit(sctx);
3fb99303 3563 mutex_unlock(&sctx->wr_lock);
a2de733c 3564
e7786c3a 3565 blk_finish_plug(&plug);
a2de733c 3566 btrfs_free_path(path);
5a6ac9ea 3567 btrfs_free_path(ppath);
7db1c5d1
NA
3568
3569 if (sctx->is_dev_replace && ret >= 0) {
3570 int ret2;
3571
3572 ret2 = sync_write_pointer_for_zoned(sctx, base + offset,
3573 map->stripes[num].physical,
3574 physical_end);
3575 if (ret2)
3576 ret = ret2;
3577 }
3578
a2de733c
AJ
3579 return ret < 0 ? ret : 0;
3580}
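
The stripe loop in scrub_stripe() above repeatedly clips each extent item to the stripe currently being scrubbed (the "trim extent to this stripe" step). The following is a minimal, standalone userspace sketch of that clamping arithmetic, not part of scrub.c; the names trim_extent_to_stripe(), ext_start and ext_len are invented for illustration.

#include <stdint.h>
#include <stdio.h>

/* Clamp the extent [*ext_start, *ext_start + *ext_len) to the stripe
 * [stripe_start, stripe_start + stripe_len), mirroring the trimming
 * done in the loop above. */
static void trim_extent_to_stripe(uint64_t *ext_start, uint64_t *ext_len,
				  uint64_t stripe_start, uint64_t stripe_len)
{
	if (*ext_start < stripe_start) {
		*ext_len -= stripe_start - *ext_start;
		*ext_start = stripe_start;
	}
	if (*ext_start + *ext_len > stripe_start + stripe_len)
		*ext_len = stripe_start + stripe_len - *ext_start;
}

int main(void)
{
	/* An extent starting before the stripe and running past its end. */
	uint64_t start = 4096, len = 262144;

	trim_extent_to_stripe(&start, &len, 65536, 65536);
	printf("trimmed extent: start=%llu len=%llu\n",
	       (unsigned long long)start, (unsigned long long)len);
	return 0;
}
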
3581
d9d181c1 3582static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,
a36cf8b8 3583 struct btrfs_device *scrub_dev,
a36cf8b8 3584 u64 chunk_offset, u64 length,
020d5b73 3585 u64 dev_offset,
32da5386 3586 struct btrfs_block_group *cache)
a2de733c 3587{
fb456252 3588 struct btrfs_fs_info *fs_info = sctx->fs_info;
c8bf1b67 3589 struct extent_map_tree *map_tree = &fs_info->mapping_tree;
a2de733c
AJ
3590 struct map_lookup *map;
3591 struct extent_map *em;
3592 int i;
ff023aac 3593 int ret = 0;
a2de733c 3594
c8bf1b67
DS
3595 read_lock(&map_tree->lock);
3596 em = lookup_extent_mapping(map_tree, chunk_offset, 1);
3597 read_unlock(&map_tree->lock);
a2de733c 3598
020d5b73
FM
3599 if (!em) {
3600 /*
3601 * Might have been an unused block group deleted by the cleaner
3602 * kthread or relocation.
3603 */
3604 spin_lock(&cache->lock);
3605 if (!cache->removed)
3606 ret = -EINVAL;
3607 spin_unlock(&cache->lock);
3608
3609 return ret;
3610 }
a2de733c 3611
95617d69 3612 map = em->map_lookup;
a2de733c
AJ
3613 if (em->start != chunk_offset)
3614 goto out;
3615
3616 if (em->len < length)
3617 goto out;
3618
3619 for (i = 0; i < map->num_stripes; ++i) {
a36cf8b8 3620 if (map->stripes[i].dev->bdev == scrub_dev->bdev &&
859acaf1 3621 map->stripes[i].physical == dev_offset) {
a36cf8b8 3622 ret = scrub_stripe(sctx, map, scrub_dev, i,
2473d24f 3623 chunk_offset, length, cache);
a2de733c
AJ
3624 if (ret)
3625 goto out;
3626 }
3627 }
3628out:
3629 free_extent_map(em);
3630
3631 return ret;
3632}
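
scrub_chunk() above resolves a chunk to its stripe layout and then scrubs every stripe of that chunk that sits on the device being scrubbed, matching both the device and the physical start of the device extent. The standalone sketch below (not btrfs code) restates that matching step; the names sketch_map, sketch_stripe and find_stripe_index() are invented, and it simplifies things by returning only the first matching stripe.

#include <stdint.h>
#include <stdio.h>

struct sketch_stripe {
	int devid;		/* stands in for map->stripes[i].dev */
	uint64_t physical;	/* stands in for map->stripes[i].physical */
};

struct sketch_map {
	int num_stripes;
	struct sketch_stripe stripes[4];
};

/* Return the first stripe index matching (devid, dev_offset), or -1 if none. */
static int find_stripe_index(const struct sketch_map *map, int devid,
			     uint64_t dev_offset)
{
	int i;

	for (i = 0; i < map->num_stripes; i++) {
		if (map->stripes[i].devid == devid &&
		    map->stripes[i].physical == dev_offset)
			return i;
	}
	return -1;
}

int main(void)
{
	struct sketch_map map = {
		.num_stripes = 2,
		.stripes = { { 1, 1048576 }, { 2, 1048576 } },
	};

	printf("stripe index: %d\n", find_stripe_index(&map, 2, 1048576));
	return 0;
}
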
3633
de17addc
NA
3634static int finish_extent_writes_for_zoned(struct btrfs_root *root,
3635 struct btrfs_block_group *cache)
3636{
3637 struct btrfs_fs_info *fs_info = cache->fs_info;
3638 struct btrfs_trans_handle *trans;
3639
3640 if (!btrfs_is_zoned(fs_info))
3641 return 0;
3642
3643 btrfs_wait_block_group_reservations(cache);
3644 btrfs_wait_nocow_writers(cache);
3645 btrfs_wait_ordered_roots(fs_info, U64_MAX, cache->start, cache->length);
3646
3647 trans = btrfs_join_transaction(root);
3648 if (IS_ERR(trans))
3649 return PTR_ERR(trans);
3650 return btrfs_commit_transaction(trans);
3651}
3652
a2de733c 3653static noinline_for_stack
a36cf8b8 3654int scrub_enumerate_chunks(struct scrub_ctx *sctx,
32934280 3655 struct btrfs_device *scrub_dev, u64 start, u64 end)
a2de733c
AJ
3656{
3657 struct btrfs_dev_extent *dev_extent = NULL;
3658 struct btrfs_path *path;
0b246afa
JM
3659 struct btrfs_fs_info *fs_info = sctx->fs_info;
3660 struct btrfs_root *root = fs_info->dev_root;
a2de733c 3661 u64 length;
a2de733c 3662 u64 chunk_offset;
55e3a601 3663 int ret = 0;
76a8efa1 3664 int ro_set;
a2de733c
AJ
3665 int slot;
3666 struct extent_buffer *l;
3667 struct btrfs_key key;
3668 struct btrfs_key found_key;
32da5386 3669 struct btrfs_block_group *cache;
ff023aac 3670 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
a2de733c
AJ
3671
3672 path = btrfs_alloc_path();
3673 if (!path)
3674 return -ENOMEM;
3675
e4058b54 3676 path->reada = READA_FORWARD;
a2de733c
AJ
3677 path->search_commit_root = 1;
3678 path->skip_locking = 1;
3679
a36cf8b8 3680 key.objectid = scrub_dev->devid;
a2de733c
AJ
3681 key.offset = 0ull;
3682 key.type = BTRFS_DEV_EXTENT_KEY;
3683
a2de733c
AJ
3684 while (1) {
3685 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3686 if (ret < 0)
8c51032f
AJ
3687 break;
3688 if (ret > 0) {
3689 if (path->slots[0] >=
3690 btrfs_header_nritems(path->nodes[0])) {
3691 ret = btrfs_next_leaf(root, path);
55e3a601
Z
3692 if (ret < 0)
3693 break;
3694 if (ret > 0) {
3695 ret = 0;
8c51032f 3696 break;
55e3a601
Z
3697 }
3698 } else {
3699 ret = 0;
8c51032f
AJ
3700 }
3701 }
a2de733c
AJ
3702
3703 l = path->nodes[0];
3704 slot = path->slots[0];
3705
3706 btrfs_item_key_to_cpu(l, &found_key, slot);
3707
a36cf8b8 3708 if (found_key.objectid != scrub_dev->devid)
a2de733c
AJ
3709 break;
3710
962a298f 3711 if (found_key.type != BTRFS_DEV_EXTENT_KEY)
a2de733c
AJ
3712 break;
3713
3714 if (found_key.offset >= end)
3715 break;
3716
3717 if (found_key.offset < key.offset)
3718 break;
3719
3720 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
3721 length = btrfs_dev_extent_length(l, dev_extent);
3722
ced96edc
QW
3723 if (found_key.offset + length <= start)
3724 goto skip;
a2de733c 3725
a2de733c
AJ
3726 chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
3727
3728 /*
3729 * get a reference on the corresponding block group to prevent
3730 * the chunk from going away while we scrub it
3731 */
3732 cache = btrfs_lookup_block_group(fs_info, chunk_offset);
ced96edc
QW
3733
3734 /* some chunks are removed but not committed to disk yet,
3735 * continue scrubbing */
3736 if (!cache)
3737 goto skip;
3738
78ce9fc2
NA
3739 if (sctx->is_dev_replace && btrfs_is_zoned(fs_info)) {
3740 spin_lock(&cache->lock);
3741 if (!cache->to_copy) {
3742 spin_unlock(&cache->lock);
0dc16ef4
FM
3743 btrfs_put_block_group(cache);
3744 goto skip;
78ce9fc2
NA
3745 }
3746 spin_unlock(&cache->lock);
3747 }
3748
2473d24f
FM
3749 /*
3750 * Make sure that while we are scrubbing the corresponding block
3751 * group doesn't get its logical address and its device extents
3752 * reused for another block group, which can possibly be of a
3753 * different type and different profile. We do this to prevent
3754 * false error detections and crashes due to bogus attempts to
3755 * repair extents.
3756 */
3757 spin_lock(&cache->lock);
3758 if (cache->removed) {
3759 spin_unlock(&cache->lock);
3760 btrfs_put_block_group(cache);
3761 goto skip;
3762 }
6b7304af 3763 btrfs_freeze_block_group(cache);
2473d24f
FM
3764 spin_unlock(&cache->lock);
3765
55e3a601
Z
3766 /*
3767 * we need to call btrfs_inc_block_group_ro() with scrubs_paused,
3768 * to avoid deadlock caused by:
3769 * btrfs_inc_block_group_ro()
3770 * -> btrfs_wait_for_commit()
3771 * -> btrfs_commit_transaction()
3772 * -> btrfs_scrub_pause()
3773 */
3774 scrub_pause_on(fs_info);
b12de528
QW
3775
3776 /*
3777 * Don't do chunk preallocation for scrub.
3778 *
3779 * This is especially important for SYSTEM bgs, or we can hit
3780 * -EFBIG from btrfs_finish_chunk_alloc() like:
3781 * 1. The only SYSTEM bg is marked RO.
3782 * Since SYSTEM bg is small, that's pretty common.
3783 * 2. A new SYSTEM bg will be allocated,
3784 * because the regular (chunk-allocating) path allocates a new chunk.
3785 * 3. New SYSTEM bg is empty and will get cleaned up
3786 * Before cleanup really happens, it's marked RO again.
3787 * 4. The empty SYSTEM bg gets scrubbed
3788 * We go back to 2.
3789 *
3790 * This can easily boost the number of SYSTEM chunks if the cleaner
3791 * thread can't be triggered fast enough, and use up all the space
3792 * in btrfs_super_block::sys_chunk_array.
1bbb97b8
QW
3793 *
3794 * For dev replace, on the other hand, we need to try our best to mark
3795 * the block group RO, to prevent a race between:
3796 * - Write duplication
3797 * Contains latest data
3798 * - Scrub copy
3799 * Contains data from commit tree
3800 *
3801 * If the target block group is not marked RO, nocow writes can
3802 * be overwritten by the scrub copy, causing data corruption.
3803 * So for dev-replace, it's not allowed to continue if a block
3804 * group is not RO.
b12de528 3805 */
1bbb97b8 3806 ret = btrfs_inc_block_group_ro(cache, sctx->is_dev_replace);
de17addc
NA
3807 if (!ret && sctx->is_dev_replace) {
3808 ret = finish_extent_writes_for_zoned(root, cache);
3809 if (ret) {
3810 btrfs_dec_block_group_ro(cache);
3811 scrub_pause_off(fs_info);
3812 btrfs_put_block_group(cache);
3813 break;
3814 }
3815 }
3816
76a8efa1
Z
3817 if (ret == 0) {
3818 ro_set = 1;
1bbb97b8 3819 } else if (ret == -ENOSPC && !sctx->is_dev_replace) {
76a8efa1
Z
3820 /*
3821 * btrfs_inc_block_group_ro() returns -ENOSPC when it
3822 * fails to create a new chunk for metadata.
1bbb97b8 3823 * It is not a problem for scrub, because
76a8efa1
Z
3824 * metadata is always COWed, and our scrub is paused
3825 * across transaction commits.
3826 */
3827 ro_set = 0;
195a49ea
FM
3828 } else if (ret == -ETXTBSY) {
3829 btrfs_warn(fs_info,
3830 "skipping scrub of block group %llu due to active swapfile",
3831 cache->start);
3832 scrub_pause_off(fs_info);
3833 ret = 0;
3834 goto skip_unfreeze;
76a8efa1 3835 } else {
5d163e0e 3836 btrfs_warn(fs_info,
913e1535 3837 "failed setting block group ro: %d", ret);
6b7304af 3838 btrfs_unfreeze_block_group(cache);
55e3a601 3839 btrfs_put_block_group(cache);
1bbb97b8 3840 scrub_pause_off(fs_info);
55e3a601
Z
3841 break;
3842 }
3843
1bbb97b8
QW
3844 /*
3845 * Now the target block group is marked RO, wait for nocow writes to
3846 * finish before dev-replace.
3847 * COW is fine, as COW never overwrites extents in commit tree.
3848 */
3849 if (sctx->is_dev_replace) {
3850 btrfs_wait_nocow_writers(cache);
3851 btrfs_wait_ordered_roots(fs_info, U64_MAX, cache->start,
3852 cache->length);
3853 }
3854
3855 scrub_pause_off(fs_info);
3ec17a67 3856 down_write(&dev_replace->rwsem);
ff023aac
SB
3857 dev_replace->cursor_right = found_key.offset + length;
3858 dev_replace->cursor_left = found_key.offset;
3859 dev_replace->item_needs_writeback = 1;
cb5583dd
DS
3860 up_write(&dev_replace->rwsem);
3861
8c204c96 3862 ret = scrub_chunk(sctx, scrub_dev, chunk_offset, length,
32934280 3863 found_key.offset, cache);
ff023aac
SB
3864
3865 /*
3866 * Flush and submit all pending read and write bios, then
3867 * wait for them.
3868 * Note that in the dev replace case, a read request causes
3869 * write requests that are submitted in the read completion
3870 * worker. Therefore in the current situation, it is required
3871 * that all write requests are flushed, so that all read and
3872 * write requests are really completed when bios_in_flight
3873 * changes to 0.
3874 */
2073c4c2 3875 sctx->flush_all_writes = true;
ff023aac 3876 scrub_submit(sctx);
3fb99303 3877 mutex_lock(&sctx->wr_lock);
ff023aac 3878 scrub_wr_submit(sctx);
3fb99303 3879 mutex_unlock(&sctx->wr_lock);
ff023aac
SB
3880
3881 wait_event(sctx->list_wait,
3882 atomic_read(&sctx->bios_in_flight) == 0);
b708ce96
Z
3883
3884 scrub_pause_on(fs_info);
12cf9372
WS
3885
3886 /*
3887 * Must be called before we decrease @scrub_paused.
3888 * Make sure we don't block transaction commit while
3889 * we are waiting for pending workers to finish.
3890 */
ff023aac
SB
3891 wait_event(sctx->list_wait,
3892 atomic_read(&sctx->workers_pending) == 0);
2073c4c2 3893 sctx->flush_all_writes = false;
12cf9372 3894
b708ce96 3895 scrub_pause_off(fs_info);
ff023aac 3896
78ce9fc2
NA
3897 if (sctx->is_dev_replace &&
3898 !btrfs_finish_block_group_to_copy(dev_replace->srcdev,
3899 cache, found_key.offset))
3900 ro_set = 0;
3901
3ec17a67 3902 down_write(&dev_replace->rwsem);
1a1a8b73
FM
3903 dev_replace->cursor_left = dev_replace->cursor_right;
3904 dev_replace->item_needs_writeback = 1;
3ec17a67 3905 up_write(&dev_replace->rwsem);
1a1a8b73 3906
76a8efa1 3907 if (ro_set)
2ff7e61e 3908 btrfs_dec_block_group_ro(cache);
ff023aac 3909
758f2dfc
FM
3910 /*
3911 * We might have prevented the cleaner kthread from deleting
3912 * this block group if it was already unused because we raced
3913 * and set it to RO mode first. So add it back to the unused
3914 * list, otherwise it might not ever be deleted unless a manual
3915 * balance is triggered or it becomes used and unused again.
3916 */
3917 spin_lock(&cache->lock);
3918 if (!cache->removed && !cache->ro && cache->reserved == 0 &&
bf38be65 3919 cache->used == 0) {
758f2dfc 3920 spin_unlock(&cache->lock);
6e80d4f8
DZ
3921 if (btrfs_test_opt(fs_info, DISCARD_ASYNC))
3922 btrfs_discard_queue_work(&fs_info->discard_ctl,
3923 cache);
3924 else
3925 btrfs_mark_bg_unused(cache);
758f2dfc
FM
3926 } else {
3927 spin_unlock(&cache->lock);
3928 }
195a49ea 3929skip_unfreeze:
6b7304af 3930 btrfs_unfreeze_block_group(cache);
a2de733c
AJ
3931 btrfs_put_block_group(cache);
3932 if (ret)
3933 break;
32934280 3934 if (sctx->is_dev_replace &&
af1be4f8 3935 atomic64_read(&dev_replace->num_write_errors) > 0) {
ff023aac
SB
3936 ret = -EIO;
3937 break;
3938 }
3939 if (sctx->stat.malloc_errors > 0) {
3940 ret = -ENOMEM;
3941 break;
3942 }
ced96edc 3943skip:
a2de733c 3944 key.offset = found_key.offset + length;
71267333 3945 btrfs_release_path(path);
a2de733c
AJ
3946 }
3947
a2de733c 3948 btrfs_free_path(path);
8c51032f 3949
55e3a601 3950 return ret;
a2de733c
AJ
3951}
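
For each block group, scrub_enumerate_chunks() above reacts to the return value of btrfs_inc_block_group_ro() in four distinct ways: success scrubs with the group read-only, -ENOSPC is tolerated for a plain scrub but not for dev-replace, -ETXTBSY (an active swapfile) skips the group, and anything else aborts the enumeration. The small standalone sketch below restates that decision table; the enum and function names are invented for illustration and are not part of btrfs.

#include <errno.h>
#include <stdio.h>

enum bg_ro_action { BG_SCRUB_RO, BG_SCRUB_RW, BG_SKIP, BG_ABORT };

/* Map the result of marking a block group read-only to the action taken above. */
static enum bg_ro_action classify_inc_ro(int ret, int is_dev_replace)
{
	if (ret == 0)
		return BG_SCRUB_RO;	/* scrub with the group read-only */
	if (ret == -ENOSPC && !is_dev_replace)
		return BG_SCRUB_RW;	/* metadata is COWed, safe for plain scrub */
	if (ret == -ETXTBSY)
		return BG_SKIP;		/* active swapfile, skip this group */
	return BG_ABORT;		/* unexpected failure, stop enumerating */
}

int main(void)
{
	printf("%d %d %d %d\n",
	       classify_inc_ro(0, 0),
	       classify_inc_ro(-ENOSPC, 0),
	       classify_inc_ro(-ETXTBSY, 1),
	       classify_inc_ro(-EIO, 0));
	return 0;
}
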
3952
a36cf8b8
SB
3953static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,
3954 struct btrfs_device *scrub_dev)
a2de733c
AJ
3955{
3956 int i;
3957 u64 bytenr;
3958 u64 gen;
3959 int ret;
0b246afa 3960 struct btrfs_fs_info *fs_info = sctx->fs_info;
a2de733c 3961
0b246afa 3962 if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
fbabd4a3 3963 return -EROFS;
79787eaa 3964
5f546063 3965 /* Seed devices of a new filesystem have their own generation. */
0b246afa 3966 if (scrub_dev->fs_devices != fs_info->fs_devices)
5f546063
MX
3967 gen = scrub_dev->generation;
3968 else
0b246afa 3969 gen = fs_info->last_trans_committed;
a2de733c
AJ
3970
3971 for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
3972 bytenr = btrfs_sb_offset(i);
935e5cc9
MX
3973 if (bytenr + BTRFS_SUPER_INFO_SIZE >
3974 scrub_dev->commit_total_bytes)
a2de733c 3975 break;
12659251
NA
3976 if (!btrfs_check_super_location(scrub_dev, bytenr))
3977 continue;
a2de733c 3978
d9d181c1 3979 ret = scrub_pages(sctx, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr,
a36cf8b8 3980 scrub_dev, BTRFS_EXTENT_FLAG_SUPER, gen, i,
96e63a45 3981 NULL, bytenr);
a2de733c
AJ
3982 if (ret)
3983 return ret;
3984 }
b6bfebc1 3985 wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
a2de733c
AJ
3986
3987 return 0;
3988}
3989
e89c4a9c
JB
3990static void scrub_workers_put(struct btrfs_fs_info *fs_info)
3991{
3992 if (refcount_dec_and_mutex_lock(&fs_info->scrub_workers_refcnt,
3993 &fs_info->scrub_lock)) {
3994 struct btrfs_workqueue *scrub_workers = NULL;
3995 struct btrfs_workqueue *scrub_wr_comp = NULL;
3996 struct btrfs_workqueue *scrub_parity = NULL;
3997
3998 scrub_workers = fs_info->scrub_workers;
3999 scrub_wr_comp = fs_info->scrub_wr_completion_workers;
4000 scrub_parity = fs_info->scrub_parity_workers;
4001
4002 fs_info->scrub_workers = NULL;
4003 fs_info->scrub_wr_completion_workers = NULL;
4004 fs_info->scrub_parity_workers = NULL;
4005 mutex_unlock(&fs_info->scrub_lock);
4006
4007 btrfs_destroy_workqueue(scrub_workers);
4008 btrfs_destroy_workqueue(scrub_wr_comp);
4009 btrfs_destroy_workqueue(scrub_parity);
4010 }
4011}
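
scrub_workers_put() above drops a reference on the shared scrub workqueues and, only when the last reference goes away, detaches the pointers while holding scrub_lock and destroys the workqueues after releasing it. The sketch below is a simplified userspace approximation of that shape (a plain counter plus a pthread mutex instead of refcount_dec_and_mutex_lock(), and free() standing in for btrfs_destroy_workqueue()); every name in it is invented for illustration.

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

static pthread_mutex_t workers_lock = PTHREAD_MUTEX_INITIALIZER;
static int workers_refcnt;
static char *workers;	/* stands in for the three shared workqueues */

/* Drop one reference; whoever drops the last one frees the workers. */
static void workers_put(void)
{
	char *to_free = NULL;

	pthread_mutex_lock(&workers_lock);
	if (--workers_refcnt == 0) {
		/* Detach under the lock... */
		to_free = workers;
		workers = NULL;
	}
	pthread_mutex_unlock(&workers_lock);

	/* ...but destroy outside of it, as the code above does. */
	free(to_free);
}

int main(void)
{
	workers = malloc(16);
	workers_refcnt = 2;

	workers_put();
	printf("after first put: %p\n", (void *)workers);
	workers_put();
	printf("after last put:  %p\n", (void *)workers);
	return 0;
}
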
4012
a2de733c
AJ
4013/*
4014 * Get a reference on fs_info->scrub_workers. Start the workers if necessary.
4015 */
ff023aac
SB
4016static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
4017 int is_dev_replace)
a2de733c 4018{
e89c4a9c
JB
4019 struct btrfs_workqueue *scrub_workers = NULL;
4020 struct btrfs_workqueue *scrub_wr_comp = NULL;
4021 struct btrfs_workqueue *scrub_parity = NULL;
6f011058 4022 unsigned int flags = WQ_FREEZABLE | WQ_UNBOUND;
0339ef2f 4023 int max_active = fs_info->thread_pool_size;
e89c4a9c 4024 int ret = -ENOMEM;
a2de733c 4025
e89c4a9c
JB
4026 if (refcount_inc_not_zero(&fs_info->scrub_workers_refcnt))
4027 return 0;
eb4318e5 4028
e89c4a9c
JB
4029 scrub_workers = btrfs_alloc_workqueue(fs_info, "scrub", flags,
4030 is_dev_replace ? 1 : max_active, 4);
4031 if (!scrub_workers)
4032 goto fail_scrub_workers;
e82afc52 4033
e89c4a9c 4034 scrub_wr_comp = btrfs_alloc_workqueue(fs_info, "scrubwrc", flags,
20b2e302 4035 max_active, 2);
e89c4a9c
JB
4036 if (!scrub_wr_comp)
4037 goto fail_scrub_wr_completion_workers;
ff09c4ca 4038
e89c4a9c
JB
4039 scrub_parity = btrfs_alloc_workqueue(fs_info, "scrubparity", flags,
4040 max_active, 2);
4041 if (!scrub_parity)
4042 goto fail_scrub_parity_workers;
4043
4044 mutex_lock(&fs_info->scrub_lock);
4045 if (refcount_read(&fs_info->scrub_workers_refcnt) == 0) {
4046 ASSERT(fs_info->scrub_workers == NULL &&
4047 fs_info->scrub_wr_completion_workers == NULL &&
4048 fs_info->scrub_parity_workers == NULL);
4049 fs_info->scrub_workers = scrub_workers;
4050 fs_info->scrub_wr_completion_workers = scrub_wr_comp;
4051 fs_info->scrub_parity_workers = scrub_parity;
ff09c4ca 4052 refcount_set(&fs_info->scrub_workers_refcnt, 1);
e89c4a9c
JB
4053 mutex_unlock(&fs_info->scrub_lock);
4054 return 0;
632dd772 4055 }
e89c4a9c
JB
4056 /* Another thread raced in and created the workers for us */
4057 refcount_inc(&fs_info->scrub_workers_refcnt);
4058 mutex_unlock(&fs_info->scrub_lock);
e82afc52 4059
e89c4a9c
JB
4060 ret = 0;
4061 btrfs_destroy_workqueue(scrub_parity);
e82afc52 4062fail_scrub_parity_workers:
e89c4a9c 4063 btrfs_destroy_workqueue(scrub_wr_comp);
e82afc52 4064fail_scrub_wr_completion_workers:
e89c4a9c 4065 btrfs_destroy_workqueue(scrub_workers);
e82afc52 4066fail_scrub_workers:
e89c4a9c 4067 return ret;
a2de733c
AJ
4068}
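
scrub_workers_get() above allocates the workqueues without holding scrub_lock, then publishes them under the lock only if it holds the first reference; if another thread won the race, the freshly allocated queues are destroyed again. Below is a simplified userspace approximation of that allocate-then-publish-or-discard pattern (it omits the refcount_inc_not_zero() fast path); all names are invented for illustration.

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

static pthread_mutex_t workers_lock = PTHREAD_MUTEX_INITIALIZER;
static int workers_refcnt;
static char *published_workers;	/* stands in for the shared workqueues */

/* Take a reference, creating the shared workers if we are the first user. */
static int workers_get(void)
{
	char *mine;
	int won = 0;

	/* Allocate optimistically, without holding the lock. */
	mine = malloc(16);
	if (!mine)
		return -1;

	pthread_mutex_lock(&workers_lock);
	if (workers_refcnt == 0) {
		published_workers = mine;	/* we won the race: publish ours */
		won = 1;
	}
	workers_refcnt++;
	pthread_mutex_unlock(&workers_lock);

	if (!won)
		free(mine);	/* somebody else published first, drop our copy */
	return 0;
}

int main(void)
{
	workers_get();
	workers_get();
	printf("refcnt=%d workers=%p\n", workers_refcnt, (void *)published_workers);
	return 0;
}
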
4069
aa1b8cd4
SB
4070int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
4071 u64 end, struct btrfs_scrub_progress *progress,
63a212ab 4072 int readonly, int is_dev_replace)
a2de733c 4073{
d9d181c1 4074 struct scrub_ctx *sctx;
a2de733c
AJ
4075 int ret;
4076 struct btrfs_device *dev;
a5fb1142 4077 unsigned int nofs_flag;
a2de733c 4078
aa1b8cd4 4079 if (btrfs_fs_closing(fs_info))
6c3abeda 4080 return -EAGAIN;
a2de733c 4081
da17066c 4082 if (fs_info->nodesize > BTRFS_STRIPE_LEN) {
b5d67f64
SB
4083 /*
4084 * In this case scrub is unable to calculate the checksum,
4085 * given the way scrub is implemented. Do not handle this
4086 * situation at all because it won't ever happen.
4087 */
efe120a0
FH
4088 btrfs_err(fs_info,
4089 "scrub: size assumption nodesize <= BTRFS_STRIPE_LEN (%d <= %d) fails",
da17066c
JM
4090 fs_info->nodesize,
4091 BTRFS_STRIPE_LEN);
b5d67f64
SB
4092 return -EINVAL;
4093 }
4094
da17066c 4095 if (fs_info->nodesize >
7a9e9987 4096 PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK ||
da17066c 4097 fs_info->sectorsize > PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK) {
7a9e9987
SB
4098 /*
4099 * would exhaust the array bounds of pagev member in
4100 * struct scrub_block
4101 */
5d163e0e
JM
4102 btrfs_err(fs_info,
4103 "scrub: size assumption nodesize and sectorsize <= SCRUB_MAX_PAGES_PER_BLOCK (%d <= %d && %d <= %d) fails",
da17066c 4104 fs_info->nodesize,
7a9e9987 4105 SCRUB_MAX_PAGES_PER_BLOCK,
da17066c 4106 fs_info->sectorsize,
7a9e9987
SB
4107 SCRUB_MAX_PAGES_PER_BLOCK);
4108 return -EINVAL;
4109 }
4110
0e94c4f4
DS
4111 /* Allocate outside of device_list_mutex */
4112 sctx = scrub_setup_ctx(fs_info, is_dev_replace);
4113 if (IS_ERR(sctx))
4114 return PTR_ERR(sctx);
a2de733c 4115
e89c4a9c
JB
4116 ret = scrub_workers_get(fs_info, is_dev_replace);
4117 if (ret)
4118 goto out_free_ctx;
4119
aa1b8cd4 4120 mutex_lock(&fs_info->fs_devices->device_list_mutex);
b2598edf 4121 dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL);
e6e674bd
AJ
4122 if (!dev || (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) &&
4123 !is_dev_replace)) {
aa1b8cd4 4124 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
0e94c4f4 4125 ret = -ENODEV;
e89c4a9c 4126 goto out;
a2de733c 4127 }
a2de733c 4128
ebbede42
AJ
4129 if (!is_dev_replace && !readonly &&
4130 !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) {
5d68da3b 4131 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
a4852cf2
DS
4132 btrfs_err_in_rcu(fs_info,
4133 "scrub on devid %llu: filesystem on %s is not writable",
4134 devid, rcu_str_deref(dev->name));
0e94c4f4 4135 ret = -EROFS;
e89c4a9c 4136 goto out;
5d68da3b
MX
4137 }
4138
3b7a016f 4139 mutex_lock(&fs_info->scrub_lock);
e12c9621 4140 if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) ||
401e29c1 4141 test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &dev->dev_state)) {
a2de733c 4142 mutex_unlock(&fs_info->scrub_lock);
aa1b8cd4 4143 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
0e94c4f4 4144 ret = -EIO;
e89c4a9c 4145 goto out;
a2de733c
AJ
4146 }
4147
cb5583dd 4148 down_read(&fs_info->dev_replace.rwsem);
cadbc0a0 4149 if (dev->scrub_ctx ||
8dabb742
SB
4150 (!is_dev_replace &&
4151 btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))) {
cb5583dd 4152 up_read(&fs_info->dev_replace.rwsem);
a2de733c 4153 mutex_unlock(&fs_info->scrub_lock);
aa1b8cd4 4154 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
0e94c4f4 4155 ret = -EINPROGRESS;
e89c4a9c 4156 goto out;
a2de733c 4157 }
cb5583dd 4158 up_read(&fs_info->dev_replace.rwsem);
3b7a016f 4159
d9d181c1 4160 sctx->readonly = readonly;
cadbc0a0 4161 dev->scrub_ctx = sctx;
3cb0929a 4162 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
a2de733c 4163
3cb0929a
WS
4164 /*
4165 * By checking @scrub_pause_req here, we can avoid a
4166 * race between transaction commit and scrubbing.
4167 */
cb7ab021 4168 __scrub_blocked_if_needed(fs_info);
a2de733c
AJ
4169 atomic_inc(&fs_info->scrubs_running);
4170 mutex_unlock(&fs_info->scrub_lock);
a2de733c 4171
a5fb1142
FM
4172 /*
4173 * In order to avoid deadlock with reclaim when there is a transaction
4174 * trying to pause scrub, make sure we use GFP_NOFS for all the
4175 * allocations done at btrfs_scrub_pages() and scrub_pages_for_parity()
4176 * invoked by our callees. The pausing request is done when the
4177 * transaction commit starts, and it blocks the transaction until scrub
4178 * is paused (done at specific points at scrub_stripe() or right above
4179 * before incrementing fs_info->scrubs_running).
4180 */
4181 nofs_flag = memalloc_nofs_save();
ff023aac 4182 if (!is_dev_replace) {
d1e14420 4183 btrfs_info(fs_info, "scrub: started on devid %llu", devid);
9b011adf
WS
4184 /*
4185 * by holding device list mutex, we can
4186 * kick off writing super in log tree sync.
4187 */
3cb0929a 4188 mutex_lock(&fs_info->fs_devices->device_list_mutex);
ff023aac 4189 ret = scrub_supers(sctx, dev);
3cb0929a 4190 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
ff023aac 4191 }
a2de733c
AJ
4192
4193 if (!ret)
32934280 4194 ret = scrub_enumerate_chunks(sctx, dev, start, end);
a5fb1142 4195 memalloc_nofs_restore(nofs_flag);
a2de733c 4196
b6bfebc1 4197 wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
a2de733c
AJ
4198 atomic_dec(&fs_info->scrubs_running);
4199 wake_up(&fs_info->scrub_pause_wait);
4200
b6bfebc1 4201 wait_event(sctx->list_wait, atomic_read(&sctx->workers_pending) == 0);
0ef8e451 4202
a2de733c 4203 if (progress)
d9d181c1 4204 memcpy(progress, &sctx->stat, sizeof(*progress));
a2de733c 4205
d1e14420
AJ
4206 if (!is_dev_replace)
4207 btrfs_info(fs_info, "scrub: %s on devid %llu with status: %d",
4208 ret ? "not finished" : "finished", devid, ret);
4209
a2de733c 4210 mutex_lock(&fs_info->scrub_lock);
cadbc0a0 4211 dev->scrub_ctx = NULL;
a2de733c
AJ
4212 mutex_unlock(&fs_info->scrub_lock);
4213
e89c4a9c 4214 scrub_workers_put(fs_info);
f55985f4 4215 scrub_put_ctx(sctx);
a2de733c 4216
0e94c4f4 4217 return ret;
e89c4a9c
JB
4218out:
4219 scrub_workers_put(fs_info);
0e94c4f4
DS
4220out_free_ctx:
4221 scrub_free_ctx(sctx);
4222
a2de733c
AJ
4223 return ret;
4224}
4225
2ff7e61e 4226void btrfs_scrub_pause(struct btrfs_fs_info *fs_info)
a2de733c 4227{
a2de733c
AJ
4228 mutex_lock(&fs_info->scrub_lock);
4229 atomic_inc(&fs_info->scrub_pause_req);
4230 while (atomic_read(&fs_info->scrubs_paused) !=
4231 atomic_read(&fs_info->scrubs_running)) {
4232 mutex_unlock(&fs_info->scrub_lock);
4233 wait_event(fs_info->scrub_pause_wait,
4234 atomic_read(&fs_info->scrubs_paused) ==
4235 atomic_read(&fs_info->scrubs_running));
4236 mutex_lock(&fs_info->scrub_lock);
4237 }
4238 mutex_unlock(&fs_info->scrub_lock);
a2de733c
AJ
4239}
4240
2ff7e61e 4241void btrfs_scrub_continue(struct btrfs_fs_info *fs_info)
a2de733c 4242{
a2de733c
AJ
4243 atomic_dec(&fs_info->scrub_pause_req);
4244 wake_up(&fs_info->scrub_pause_wait);
a2de733c
AJ
4245}
4246
aa1b8cd4 4247int btrfs_scrub_cancel(struct btrfs_fs_info *fs_info)
a2de733c 4248{
a2de733c
AJ
4249 mutex_lock(&fs_info->scrub_lock);
4250 if (!atomic_read(&fs_info->scrubs_running)) {
4251 mutex_unlock(&fs_info->scrub_lock);
4252 return -ENOTCONN;
4253 }
4254
4255 atomic_inc(&fs_info->scrub_cancel_req);
4256 while (atomic_read(&fs_info->scrubs_running)) {
4257 mutex_unlock(&fs_info->scrub_lock);
4258 wait_event(fs_info->scrub_pause_wait,
4259 atomic_read(&fs_info->scrubs_running) == 0);
4260 mutex_lock(&fs_info->scrub_lock);
4261 }
4262 atomic_dec(&fs_info->scrub_cancel_req);
4263 mutex_unlock(&fs_info->scrub_lock);
4264
4265 return 0;
4266}
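
btrfs_scrub_cancel() above raises a cancel request and then waits, dropping and retaking scrub_lock around wait_event(), until the count of running scrubs drops to zero. The standalone sketch below models that wait with a pthread mutex and condition variable as a userspace stand-in for wait_event()/wake_up(); the names are invented and the scrub side is reduced to a single worker thread.

#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
static int scrubs_running = 1;
static int cancel_req;

/* The "scrub" side: poll the cancel request and bail out when it is set. */
static void *scrub_worker(void *arg)
{
	(void)arg;
	for (;;) {
		pthread_mutex_lock(&lock);
		if (cancel_req) {
			scrubs_running--;
			pthread_cond_broadcast(&cond);	/* like wake_up() */
			pthread_mutex_unlock(&lock);
			return NULL;
		}
		pthread_mutex_unlock(&lock);
		usleep(1000);	/* pretend to scrub a stripe */
	}
}

/* The cancel side: request cancellation, then wait for running scrubs to stop. */
static void scrub_cancel(void)
{
	pthread_mutex_lock(&lock);
	cancel_req = 1;
	while (scrubs_running > 0)
		pthread_cond_wait(&cond, &lock);	/* stand-in for wait_event() */
	pthread_mutex_unlock(&lock);
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, scrub_worker, NULL);
	scrub_cancel();
	pthread_join(t, NULL);
	printf("scrub canceled, running=%d\n", scrubs_running);
	return 0;
}
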
4267
163e97ee 4268int btrfs_scrub_cancel_dev(struct btrfs_device *dev)
49b25e05 4269{
163e97ee 4270 struct btrfs_fs_info *fs_info = dev->fs_info;
d9d181c1 4271 struct scrub_ctx *sctx;
a2de733c
AJ
4272
4273 mutex_lock(&fs_info->scrub_lock);
cadbc0a0 4274 sctx = dev->scrub_ctx;
d9d181c1 4275 if (!sctx) {
a2de733c
AJ
4276 mutex_unlock(&fs_info->scrub_lock);
4277 return -ENOTCONN;
4278 }
d9d181c1 4279 atomic_inc(&sctx->cancel_req);
cadbc0a0 4280 while (dev->scrub_ctx) {
a2de733c
AJ
4281 mutex_unlock(&fs_info->scrub_lock);
4282 wait_event(fs_info->scrub_pause_wait,
cadbc0a0 4283 dev->scrub_ctx == NULL);
a2de733c
AJ
4284 mutex_lock(&fs_info->scrub_lock);
4285 }
4286 mutex_unlock(&fs_info->scrub_lock);
4287
4288 return 0;
4289}
1623edeb 4290
2ff7e61e 4291int btrfs_scrub_progress(struct btrfs_fs_info *fs_info, u64 devid,
a2de733c
AJ
4292 struct btrfs_scrub_progress *progress)
4293{
4294 struct btrfs_device *dev;
d9d181c1 4295 struct scrub_ctx *sctx = NULL;
a2de733c 4296
0b246afa 4297 mutex_lock(&fs_info->fs_devices->device_list_mutex);
b2598edf 4298 dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL);
a2de733c 4299 if (dev)
cadbc0a0 4300 sctx = dev->scrub_ctx;
d9d181c1
SB
4301 if (sctx)
4302 memcpy(progress, &sctx->stat, sizeof(*progress));
0b246afa 4303 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
a2de733c 4304
d9d181c1 4305 return dev ? (sctx ? 0 : -ENOTCONN) : -ENODEV;
a2de733c 4306}
ff023aac
SB
4307
4308static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
fa485d21 4309 u64 extent_logical, u32 extent_len,
ff023aac
SB
4310 u64 *extent_physical,
4311 struct btrfs_device **extent_dev,
4312 int *extent_mirror_num)
4313{
4314 u64 mapped_length;
4315 struct btrfs_bio *bbio = NULL;
4316 int ret;
4317
4318 mapped_length = extent_len;
cf8cddd3 4319 ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, extent_logical,
ff023aac
SB
4320 &mapped_length, &bbio, 0);
4321 if (ret || !bbio || mapped_length < extent_len ||
4322 !bbio->stripes[0].dev->bdev) {
6e9606d2 4323 btrfs_put_bbio(bbio);
ff023aac
SB
4324 return;
4325 }
4326
4327 *extent_physical = bbio->stripes[0].physical;
4328 *extent_mirror_num = bbio->mirror_num;
4329 *extent_dev = bbio->stripes[0].dev;
6e9606d2 4330 btrfs_put_bbio(bbio);
ff023aac 4331}