1// SPDX-License-Identifier: GPL-2.0
2/*
3 * Copyright (C) 2011, 2012 STRATO. All rights reserved.
4 */
5
6#include <linux/blkdev.h>
7#include <linux/ratelimit.h>
8#include <linux/sched/mm.h>
9#include <crypto/hash.h>
10#include "ctree.h"
11#include "discard.h"
12#include "volumes.h"
13#include "disk-io.h"
14#include "ordered-data.h"
15#include "transaction.h"
16#include "backref.h"
17#include "extent_io.h"
18#include "dev-replace.h"
19#include "check-integrity.h"
20#include "rcu-string.h"
21#include "raid56.h"
22#include "block-group.h"
23#include "zoned.h"
24
25/*
26 * This is only the first step towards a full-featured scrub. It reads all
27 * extents and super blocks and verifies the checksums. In case a bad checksum
28 * is found or the extent cannot be read, good data will be written back if
29 * any can be found.
30 *
31 * Future enhancements:
32 * - In case an unrepairable extent is encountered, track which files are
33 * affected and report them
34 * - track and record media errors, throw out bad devices
35 * - add a mode to also read unallocated space
36 */
37
38struct scrub_block;
39struct scrub_ctx;
40
41/*
42 * The following three values only influence performance.
43 * The last one configures the number of parallel and outstanding I/O
44 * operations. The first two values configure an upper limit for the number
45 * of (dynamically allocated) pages that are added to a bio.
46 */
47#define SCRUB_PAGES_PER_RD_BIO 32 /* 128k per bio */
48#define SCRUB_PAGES_PER_WR_BIO 32 /* 128k per bio */
49#define SCRUB_BIOS_PER_SCTX 64 /* 8MB per device in flight */
50
51/*
52 * the following value times PAGE_SIZE needs to be large enough to match the
53 * largest node/leaf/sector size that shall be supported.
54 * Values larger than BTRFS_STRIPE_LEN are not supported.
55 */
56#define SCRUB_MAX_PAGES_PER_BLOCK 16 /* 64k per node/leaf/sector */
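/*
 * Editorial note (not part of the upstream source): a quick sanity check of
 * the limit above, assuming the common 4 KiB PAGE_SIZE:
 *
 *   SCRUB_MAX_PAGES_PER_BLOCK * PAGE_SIZE = 16 * 4 KiB = 64 KiB
 *
 * which covers the largest supported nodesize (64 KiB) and does not exceed
 * BTRFS_STRIPE_LEN, as required by the comment above.
 */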
57
58struct scrub_recover {
59 refcount_t refs;
60 struct btrfs_bio *bbio;
61 u64 map_length;
62};
63
64struct scrub_page {
65 struct scrub_block *sblock;
66 struct page *page;
67 struct btrfs_device *dev;
68 struct list_head list;
69 u64 flags; /* extent flags */
70 u64 generation;
71 u64 logical;
72 u64 physical;
73 u64 physical_for_dev_replace;
74 atomic_t refs;
75 u8 mirror_num;
76 int have_csum:1;
77 int io_error:1;
78 u8 csum[BTRFS_CSUM_SIZE];
79
80 struct scrub_recover *recover;
81};
82
83struct scrub_bio {
84 int index;
85 struct scrub_ctx *sctx;
86 struct btrfs_device *dev;
87 struct bio *bio;
88 blk_status_t status;
89 u64 logical;
90 u64 physical;
91#if SCRUB_PAGES_PER_WR_BIO >= SCRUB_PAGES_PER_RD_BIO
92 struct scrub_page *pagev[SCRUB_PAGES_PER_WR_BIO];
93#else
94 struct scrub_page *pagev[SCRUB_PAGES_PER_RD_BIO];
95#endif
96 int page_count;
97 int next_free;
98 struct btrfs_work work;
99};
100
101struct scrub_block {
102 struct scrub_page *pagev[SCRUB_MAX_PAGES_PER_BLOCK];
103 int page_count;
104 atomic_t outstanding_pages;
105 refcount_t refs; /* free mem on transition to zero */
106 struct scrub_ctx *sctx;
107 struct scrub_parity *sparity;
108 struct {
109 unsigned int header_error:1;
110 unsigned int checksum_error:1;
111 unsigned int no_io_error_seen:1;
112 unsigned int generation_error:1; /* also sets header_error */
113
114 /* The following is for the data used to check parity */
115 /* It is for data with a checksum */
116 unsigned int data_corrected:1;
117 };
118 struct btrfs_work work;
119};
120
121/* Used for the chunks with parity stripes such as RAID5/6 */
122struct scrub_parity {
123 struct scrub_ctx *sctx;
124
125 struct btrfs_device *scrub_dev;
126
127 u64 logic_start;
128
129 u64 logic_end;
130
131 int nsectors;
132
133 u32 stripe_len;
134
135 refcount_t refs;
136
137 struct list_head spages;
138
139 /* Work of parity check and repair */
140 struct btrfs_work work;
141
142 /* Mark the parity blocks which have data */
143 unsigned long *dbitmap;
144
145 /*
146 * Mark the parity blocks which have data, but an error happened when
147 * reading or checking that data
148 */
149 unsigned long *ebitmap;
150
151 unsigned long bitmap[];
152};
153
154struct scrub_ctx {
155 struct scrub_bio *bios[SCRUB_BIOS_PER_SCTX];
156 struct btrfs_fs_info *fs_info;
157 int first_free;
158 int curr;
159 atomic_t bios_in_flight;
160 atomic_t workers_pending;
161 spinlock_t list_lock;
162 wait_queue_head_t list_wait;
163 struct list_head csum_list;
164 atomic_t cancel_req;
165 int readonly;
166 int pages_per_rd_bio;
167
168 /* State of IO submission throttling affecting the associated device */
169 ktime_t throttle_deadline;
170 u64 throttle_sent;
171
172 int is_dev_replace;
173 u64 write_pointer;
174
175 struct scrub_bio *wr_curr_bio;
176 struct mutex wr_lock;
177 int pages_per_wr_bio; /* <= SCRUB_PAGES_PER_WR_BIO */
178 struct btrfs_device *wr_tgtdev;
179 bool flush_all_writes;
180
181 /*
182 * statistics
183 */
184 struct btrfs_scrub_progress stat;
185 spinlock_t stat_lock;
186
187 /*
188 * Use a ref counter to avoid use-after-free issues. Scrub workers
189 * decrement bios_in_flight and workers_pending and then do a wakeup
190 * on the list_wait wait queue. We must ensure the main scrub task
191 * doesn't free the scrub context before or while the workers are
192 * doing the wakeup() call.
193 */
194 refcount_t refs;
195};
196
197struct scrub_warning {
198 struct btrfs_path *path;
199 u64 extent_item_size;
200 const char *errstr;
201 u64 physical;
202 u64 logical;
203 struct btrfs_device *dev;
204};
205
206struct full_stripe_lock {
207 struct rb_node node;
208 u64 logical;
209 u64 refs;
210 struct mutex mutex;
211};
212
213static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
214 struct scrub_block *sblocks_for_recheck);
215static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
216 struct scrub_block *sblock,
217 int retry_failed_mirror);
218static void scrub_recheck_block_checksum(struct scrub_block *sblock);
219static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
220 struct scrub_block *sblock_good);
221static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
222 struct scrub_block *sblock_good,
223 int page_num, int force_write);
224static void scrub_write_block_to_dev_replace(struct scrub_block *sblock);
225static int scrub_write_page_to_dev_replace(struct scrub_block *sblock,
226 int page_num);
227static int scrub_checksum_data(struct scrub_block *sblock);
228static int scrub_checksum_tree_block(struct scrub_block *sblock);
229static int scrub_checksum_super(struct scrub_block *sblock);
230static void scrub_block_put(struct scrub_block *sblock);
231static void scrub_page_get(struct scrub_page *spage);
232static void scrub_page_put(struct scrub_page *spage);
233static void scrub_parity_get(struct scrub_parity *sparity);
234static void scrub_parity_put(struct scrub_parity *sparity);
235static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u32 len,
236 u64 physical, struct btrfs_device *dev, u64 flags,
237 u64 gen, int mirror_num, u8 *csum,
238 u64 physical_for_dev_replace);
239static void scrub_bio_end_io(struct bio *bio);
240static void scrub_bio_end_io_worker(struct btrfs_work *work);
241static void scrub_block_complete(struct scrub_block *sblock);
242static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
243 u64 extent_logical, u32 extent_len,
244 u64 *extent_physical,
245 struct btrfs_device **extent_dev,
246 int *extent_mirror_num);
247static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
248 struct scrub_page *spage);
249static void scrub_wr_submit(struct scrub_ctx *sctx);
250static void scrub_wr_bio_end_io(struct bio *bio);
251static void scrub_wr_bio_end_io_worker(struct btrfs_work *work);
252static void scrub_put_ctx(struct scrub_ctx *sctx);
253
254static inline int scrub_is_page_on_raid56(struct scrub_page *spage)
255{
256 return spage->recover &&
257 (spage->recover->bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK);
258}
259
260static void scrub_pending_bio_inc(struct scrub_ctx *sctx)
261{
262 refcount_inc(&sctx->refs);
263 atomic_inc(&sctx->bios_in_flight);
264}
265
266static void scrub_pending_bio_dec(struct scrub_ctx *sctx)
267{
268 atomic_dec(&sctx->bios_in_flight);
269 wake_up(&sctx->list_wait);
270 scrub_put_ctx(sctx);
271}
272
273static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
274{
275 while (atomic_read(&fs_info->scrub_pause_req)) {
276 mutex_unlock(&fs_info->scrub_lock);
277 wait_event(fs_info->scrub_pause_wait,
278 atomic_read(&fs_info->scrub_pause_req) == 0);
279 mutex_lock(&fs_info->scrub_lock);
280 }
281}
282
283static void scrub_pause_on(struct btrfs_fs_info *fs_info)
284{
285 atomic_inc(&fs_info->scrubs_paused);
286 wake_up(&fs_info->scrub_pause_wait);
287}
288
289static void scrub_pause_off(struct btrfs_fs_info *fs_info)
290{
291 mutex_lock(&fs_info->scrub_lock);
292 __scrub_blocked_if_needed(fs_info);
293 atomic_dec(&fs_info->scrubs_paused);
294 mutex_unlock(&fs_info->scrub_lock);
295
296 wake_up(&fs_info->scrub_pause_wait);
297}
298
299static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
300{
301 scrub_pause_on(fs_info);
302 scrub_pause_off(fs_info);
303}
304
305/*
306 * Insert new full stripe lock into full stripe locks tree
307 *
308 * Return pointer to existing or newly inserted full_stripe_lock structure if
309 * everything works well.
310 * Return ERR_PTR(-ENOMEM) if we failed to allocate memory
311 *
312 * NOTE: caller must hold full_stripe_locks_root->lock before calling this
313 * function
314 */
315static struct full_stripe_lock *insert_full_stripe_lock(
316 struct btrfs_full_stripe_locks_tree *locks_root,
317 u64 fstripe_logical)
318{
319 struct rb_node **p;
320 struct rb_node *parent = NULL;
321 struct full_stripe_lock *entry;
322 struct full_stripe_lock *ret;
323
324 lockdep_assert_held(&locks_root->lock);
325
326 p = &locks_root->root.rb_node;
327 while (*p) {
328 parent = *p;
329 entry = rb_entry(parent, struct full_stripe_lock, node);
330 if (fstripe_logical < entry->logical) {
331 p = &(*p)->rb_left;
332 } else if (fstripe_logical > entry->logical) {
333 p = &(*p)->rb_right;
334 } else {
335 entry->refs++;
336 return entry;
337 }
338 }
339
340 /*
341 * Insert new lock.
342 */
343 ret = kmalloc(sizeof(*ret), GFP_KERNEL);
344 if (!ret)
345 return ERR_PTR(-ENOMEM);
346 ret->logical = fstripe_logical;
347 ret->refs = 1;
348 mutex_init(&ret->mutex);
349
350 rb_link_node(&ret->node, parent, p);
351 rb_insert_color(&ret->node, &locks_root->root);
352 return ret;
353}
354
355/*
356 * Search for a full stripe lock of a block group
357 *
358 * Return pointer to existing full stripe lock if found
359 * Return NULL if not found
360 */
361static struct full_stripe_lock *search_full_stripe_lock(
362 struct btrfs_full_stripe_locks_tree *locks_root,
363 u64 fstripe_logical)
364{
365 struct rb_node *node;
366 struct full_stripe_lock *entry;
367
368 lockdep_assert_held(&locks_root->lock);
369
370 node = locks_root->root.rb_node;
371 while (node) {
372 entry = rb_entry(node, struct full_stripe_lock, node);
373 if (fstripe_logical < entry->logical)
374 node = node->rb_left;
375 else if (fstripe_logical > entry->logical)
376 node = node->rb_right;
377 else
378 return entry;
379 }
380 return NULL;
381}
382
383/*
384 * Helper to get full stripe logical from a normal bytenr.
385 *
386 * Caller must ensure @cache is a RAID56 block group.
387 */
388static u64 get_full_stripe_logical(struct btrfs_block_group *cache, u64 bytenr)
389{
390 u64 ret;
391
392 /*
393 * Due to chunk item size limit, full stripe length should not be
394 * larger than U32_MAX. Just a sanity check here.
395 */
396 WARN_ON_ONCE(cache->full_stripe_len >= U32_MAX);
397
398 /*
399 * round_down() can only handle a power of 2, while the RAID56 full
400 * stripe length can be 64KiB * n, so we need to manually round down.
401 */
402 ret = div64_u64(bytenr - cache->start, cache->full_stripe_len) *
403 cache->full_stripe_len + cache->start;
404 return ret;
405}
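/*
 * Editorial example (not part of the upstream source), with illustrative
 * numbers: for a RAID5 block group with cache->start = 1 GiB and a full
 * stripe length of 192 KiB (3 data stripes of 64 KiB each), a bytenr of
 * cache->start + 500 KiB yields
 *
 *   div64_u64(500 KiB, 192 KiB) = 2,  2 * 192 KiB = 384 KiB
 *
 * so get_full_stripe_logical() returns cache->start + 384 KiB, the start of
 * the full stripe that contains the given bytenr.
 */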
406
407/*
408 * Lock a full stripe to avoid concurrent recovery and read
409 *
410 * It's only used for profiles with parity (RAID5/6); for other profiles it
411 * does nothing.
412 *
413 * Return 0 if we locked the full stripe covering @bytenr, with a mutex held.
414 * The caller must call unlock_full_stripe() in the same context.
415 *
416 * Return <0 if an error is encountered.
417 */
418static int lock_full_stripe(struct btrfs_fs_info *fs_info, u64 bytenr,
419 bool *locked_ret)
420{
421 struct btrfs_block_group *bg_cache;
422 struct btrfs_full_stripe_locks_tree *locks_root;
423 struct full_stripe_lock *existing;
424 u64 fstripe_start;
425 int ret = 0;
426
427 *locked_ret = false;
428 bg_cache = btrfs_lookup_block_group(fs_info, bytenr);
429 if (!bg_cache) {
430 ASSERT(0);
431 return -ENOENT;
432 }
433
434 /* Profiles not based on parity don't need full stripe lock */
435 if (!(bg_cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK))
436 goto out;
437 locks_root = &bg_cache->full_stripe_locks_root;
438
439 fstripe_start = get_full_stripe_logical(bg_cache, bytenr);
440
441 /* Now insert the full stripe lock */
442 mutex_lock(&locks_root->lock);
443 existing = insert_full_stripe_lock(locks_root, fstripe_start);
444 mutex_unlock(&locks_root->lock);
445 if (IS_ERR(existing)) {
446 ret = PTR_ERR(existing);
447 goto out;
448 }
449 mutex_lock(&existing->mutex);
450 *locked_ret = true;
451out:
452 btrfs_put_block_group(bg_cache);
453 return ret;
454}
455
456/*
457 * Unlock a full stripe.
458 *
459 * NOTE: The caller must ensure this is called from the same context as the
460 * corresponding lock_full_stripe().
461 *
462 * Return 0 if we unlocked the full stripe without problems.
463 * Return <0 for error
464 */
465static int unlock_full_stripe(struct btrfs_fs_info *fs_info, u64 bytenr,
466 bool locked)
467{
468 struct btrfs_block_group *bg_cache;
469 struct btrfs_full_stripe_locks_tree *locks_root;
470 struct full_stripe_lock *fstripe_lock;
471 u64 fstripe_start;
472 bool freeit = false;
473 int ret = 0;
474
475 /* If we didn't acquire full stripe lock, no need to continue */
476 if (!locked)
477 return 0;
478
479 bg_cache = btrfs_lookup_block_group(fs_info, bytenr);
480 if (!bg_cache) {
481 ASSERT(0);
482 return -ENOENT;
483 }
484 if (!(bg_cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK))
485 goto out;
486
487 locks_root = &bg_cache->full_stripe_locks_root;
488 fstripe_start = get_full_stripe_logical(bg_cache, bytenr);
489
490 mutex_lock(&locks_root->lock);
491 fstripe_lock = search_full_stripe_lock(locks_root, fstripe_start);
492 /* Unpaired unlock_full_stripe() detected */
493 if (!fstripe_lock) {
494 WARN_ON(1);
495 ret = -ENOENT;
496 mutex_unlock(&locks_root->lock);
497 goto out;
498 }
499
500 if (fstripe_lock->refs == 0) {
501 WARN_ON(1);
502 btrfs_warn(fs_info, "full stripe lock at %llu refcount underflow",
503 fstripe_lock->logical);
504 } else {
505 fstripe_lock->refs--;
506 }
507
508 if (fstripe_lock->refs == 0) {
509 rb_erase(&fstripe_lock->node, &locks_root->root);
510 freeit = true;
511 }
512 mutex_unlock(&locks_root->lock);
513
514 mutex_unlock(&fstripe_lock->mutex);
515 if (freeit)
516 kfree(fstripe_lock);
517out:
518 btrfs_put_block_group(bg_cache);
519 return ret;
520}
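/*
 * Editorial sketch (not part of the upstream source) of how the two helpers
 * above are meant to be paired, following the pattern used later in
 * scrub_handle_errored_block():
 *
 *	bool locked;
 *	int ret;
 *
 *	ret = lock_full_stripe(fs_info, logical, &locked);
 *	if (ret < 0)
 *		return ret;	/* no lock taken on error */
 *	... recheck/repair the block covering logical ...
 *	ret = unlock_full_stripe(fs_info, logical, locked);
 *
 * Passing @locked back unchanged makes the unlock a no-op for non-RAID56
 * block groups, where lock_full_stripe() never takes the mutex.
 */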
521
522static void scrub_free_csums(struct scrub_ctx *sctx)
523{
524 while (!list_empty(&sctx->csum_list)) {
525 struct btrfs_ordered_sum *sum;
526 sum = list_first_entry(&sctx->csum_list,
527 struct btrfs_ordered_sum, list);
528 list_del(&sum->list);
529 kfree(sum);
530 }
531}
532
533static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx)
534{
535 int i;
536
537 if (!sctx)
538 return;
539
540 /* this can happen when scrub is cancelled */
541 if (sctx->curr != -1) {
542 struct scrub_bio *sbio = sctx->bios[sctx->curr];
543
544 for (i = 0; i < sbio->page_count; i++) {
545 WARN_ON(!sbio->pagev[i]->page);
546 scrub_block_put(sbio->pagev[i]->sblock);
547 }
548 bio_put(sbio->bio);
549 }
550
551 for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
552 struct scrub_bio *sbio = sctx->bios[i];
553
554 if (!sbio)
555 break;
556 kfree(sbio);
557 }
558
559 kfree(sctx->wr_curr_bio);
560 scrub_free_csums(sctx);
561 kfree(sctx);
562}
563
564static void scrub_put_ctx(struct scrub_ctx *sctx)
565{
566 if (refcount_dec_and_test(&sctx->refs))
567 scrub_free_ctx(sctx);
568}
569
570static noinline_for_stack struct scrub_ctx *scrub_setup_ctx(
571 struct btrfs_fs_info *fs_info, int is_dev_replace)
572{
573 struct scrub_ctx *sctx;
574 int i;
575
576 sctx = kzalloc(sizeof(*sctx), GFP_KERNEL);
577 if (!sctx)
578 goto nomem;
579 refcount_set(&sctx->refs, 1);
580 sctx->is_dev_replace = is_dev_replace;
581 sctx->pages_per_rd_bio = SCRUB_PAGES_PER_RD_BIO;
582 sctx->curr = -1;
583 sctx->fs_info = fs_info;
584 INIT_LIST_HEAD(&sctx->csum_list);
585 for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
586 struct scrub_bio *sbio;
587
588 sbio = kzalloc(sizeof(*sbio), GFP_KERNEL);
589 if (!sbio)
590 goto nomem;
591 sctx->bios[i] = sbio;
592
593 sbio->index = i;
594 sbio->sctx = sctx;
595 sbio->page_count = 0;
596 btrfs_init_work(&sbio->work, scrub_bio_end_io_worker, NULL,
597 NULL);
598
599 if (i != SCRUB_BIOS_PER_SCTX - 1)
600 sctx->bios[i]->next_free = i + 1;
601 else
602 sctx->bios[i]->next_free = -1;
603 }
604 sctx->first_free = 0;
605 atomic_set(&sctx->bios_in_flight, 0);
606 atomic_set(&sctx->workers_pending, 0);
607 atomic_set(&sctx->cancel_req, 0);
608
609 spin_lock_init(&sctx->list_lock);
610 spin_lock_init(&sctx->stat_lock);
611 init_waitqueue_head(&sctx->list_wait);
612 sctx->throttle_deadline = 0;
613
614 WARN_ON(sctx->wr_curr_bio != NULL);
615 mutex_init(&sctx->wr_lock);
616 sctx->wr_curr_bio = NULL;
617 if (is_dev_replace) {
618 WARN_ON(!fs_info->dev_replace.tgtdev);
619 sctx->pages_per_wr_bio = SCRUB_PAGES_PER_WR_BIO;
620 sctx->wr_tgtdev = fs_info->dev_replace.tgtdev;
621 sctx->flush_all_writes = false;
622 }
623
624 return sctx;
625
626nomem:
627 scrub_free_ctx(sctx);
628 return ERR_PTR(-ENOMEM);
629}
630
631static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,
632 void *warn_ctx)
633{
634 u64 isize;
635 u32 nlink;
636 int ret;
637 int i;
638 unsigned nofs_flag;
639 struct extent_buffer *eb;
640 struct btrfs_inode_item *inode_item;
641 struct scrub_warning *swarn = warn_ctx;
642 struct btrfs_fs_info *fs_info = swarn->dev->fs_info;
643 struct inode_fs_paths *ipath = NULL;
644 struct btrfs_root *local_root;
645 struct btrfs_key key;
646
647 local_root = btrfs_get_fs_root(fs_info, root, true);
648 if (IS_ERR(local_root)) {
649 ret = PTR_ERR(local_root);
650 goto err;
651 }
652
653 /*
654 * this makes the path point to (inum INODE_ITEM ioff)
655 */
656 key.objectid = inum;
657 key.type = BTRFS_INODE_ITEM_KEY;
658 key.offset = 0;
659
660 ret = btrfs_search_slot(NULL, local_root, &key, swarn->path, 0, 0);
661 if (ret) {
662 btrfs_put_root(local_root);
663 btrfs_release_path(swarn->path);
664 goto err;
665 }
666
667 eb = swarn->path->nodes[0];
668 inode_item = btrfs_item_ptr(eb, swarn->path->slots[0],
669 struct btrfs_inode_item);
670 isize = btrfs_inode_size(eb, inode_item);
671 nlink = btrfs_inode_nlink(eb, inode_item);
672 btrfs_release_path(swarn->path);
673
674 /*
675 * init_path might indirectly call vmalloc, or use GFP_KERNEL. Scrub
676 * uses GFP_NOFS in this context, so we keep it consistent but it does
677 * not seem to be strictly necessary.
678 */
679 nofs_flag = memalloc_nofs_save();
680 ipath = init_ipath(4096, local_root, swarn->path);
681 memalloc_nofs_restore(nofs_flag);
682 if (IS_ERR(ipath)) {
683 btrfs_put_root(local_root);
684 ret = PTR_ERR(ipath);
685 ipath = NULL;
686 goto err;
687 }
688 ret = paths_from_inode(inum, ipath);
689
690 if (ret < 0)
691 goto err;
692
693 /*
694 * we deliberately ignore the fact that ipath might have been too small to
695 * hold all of the paths here
696 */
697 for (i = 0; i < ipath->fspath->elem_cnt; ++i)
698 btrfs_warn_in_rcu(fs_info,
699"%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu, length %llu, links %u (path: %s)",
700 swarn->errstr, swarn->logical,
701 rcu_str_deref(swarn->dev->name),
702 swarn->physical,
703 root, inum, offset,
704 min(isize - offset, (u64)PAGE_SIZE), nlink,
705 (char *)(unsigned long)ipath->fspath->val[i]);
706
707 btrfs_put_root(local_root);
708 free_ipath(ipath);
709 return 0;
710
711err:
712 btrfs_warn_in_rcu(fs_info,
713 "%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu: path resolving failed with ret=%d",
714 swarn->errstr, swarn->logical,
715 rcu_str_deref(swarn->dev->name),
716 swarn->physical,
717 root, inum, offset, ret);
718
719 free_ipath(ipath);
720 return 0;
721}
722
723static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
724{
725 struct btrfs_device *dev;
726 struct btrfs_fs_info *fs_info;
727 struct btrfs_path *path;
728 struct btrfs_key found_key;
729 struct extent_buffer *eb;
730 struct btrfs_extent_item *ei;
731 struct scrub_warning swarn;
732 unsigned long ptr = 0;
733 u64 extent_item_pos;
734 u64 flags = 0;
735 u64 ref_root;
736 u32 item_size;
737 u8 ref_level = 0;
738 int ret;
739
740 WARN_ON(sblock->page_count < 1);
741 dev = sblock->pagev[0]->dev;
742 fs_info = sblock->sctx->fs_info;
743
744 path = btrfs_alloc_path();
745 if (!path)
746 return;
747
748 swarn.physical = sblock->pagev[0]->physical;
749 swarn.logical = sblock->pagev[0]->logical;
750 swarn.errstr = errstr;
751 swarn.dev = NULL;
752
753 ret = extent_from_logical(fs_info, swarn.logical, path, &found_key,
754 &flags);
755 if (ret < 0)
756 goto out;
757
758 extent_item_pos = swarn.logical - found_key.objectid;
759 swarn.extent_item_size = found_key.offset;
760
761 eb = path->nodes[0];
762 ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
763 item_size = btrfs_item_size_nr(eb, path->slots[0]);
764
765 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
766 do {
767 ret = tree_backref_for_extent(&ptr, eb, &found_key, ei,
768 item_size, &ref_root,
769 &ref_level);
770 btrfs_warn_in_rcu(fs_info,
771"%s at logical %llu on dev %s, physical %llu: metadata %s (level %d) in tree %llu",
772 errstr, swarn.logical,
773 rcu_str_deref(dev->name),
774 swarn.physical,
775 ref_level ? "node" : "leaf",
776 ret < 0 ? -1 : ref_level,
777 ret < 0 ? -1 : ref_root);
778 } while (ret != 1);
779 btrfs_release_path(path);
780 } else {
781 btrfs_release_path(path);
782 swarn.path = path;
783 swarn.dev = dev;
784 iterate_extent_inodes(fs_info, found_key.objectid,
785 extent_item_pos, 1,
786 scrub_print_warning_inode, &swarn, false);
787 }
788
789out:
790 btrfs_free_path(path);
791}
792
793static inline void scrub_get_recover(struct scrub_recover *recover)
794{
795 refcount_inc(&recover->refs);
796}
797
798static inline void scrub_put_recover(struct btrfs_fs_info *fs_info,
799 struct scrub_recover *recover)
800{
801 if (refcount_dec_and_test(&recover->refs)) {
802 btrfs_bio_counter_dec(fs_info);
803 btrfs_put_bbio(recover->bbio);
804 kfree(recover);
805 }
806}
807
808/*
809 * scrub_handle_errored_block gets called when either verification of the
810 * pages failed or the bio failed to read, e.g. with EIO. In the latter
811 * case, this function handles all pages in the bio, even though only one
812 * may be bad.
813 * The goal of this function is to repair the errored block by using the
814 * contents of one of the mirrors.
815 */
816static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
817{
818 struct scrub_ctx *sctx = sblock_to_check->sctx;
819 struct btrfs_device *dev;
820 struct btrfs_fs_info *fs_info;
821 u64 logical;
822 unsigned int failed_mirror_index;
823 unsigned int is_metadata;
824 unsigned int have_csum;
825 struct scrub_block *sblocks_for_recheck; /* holds one for each mirror */
826 struct scrub_block *sblock_bad;
827 int ret;
828 int mirror_index;
829 int page_num;
830 int success;
831 bool full_stripe_locked;
832 unsigned int nofs_flag;
833 static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL,
834 DEFAULT_RATELIMIT_BURST);
835
836 BUG_ON(sblock_to_check->page_count < 1);
837 fs_info = sctx->fs_info;
838 if (sblock_to_check->pagev[0]->flags & BTRFS_EXTENT_FLAG_SUPER) {
839 /*
840 * If we find an error in a super block, we just report it; super
841 * blocks get rewritten with the next transaction commit
842 * anyway.
843 */
844 spin_lock(&sctx->stat_lock);
845 ++sctx->stat.super_errors;
846 spin_unlock(&sctx->stat_lock);
847 return 0;
848 }
849 logical = sblock_to_check->pagev[0]->logical;
850 BUG_ON(sblock_to_check->pagev[0]->mirror_num < 1);
851 failed_mirror_index = sblock_to_check->pagev[0]->mirror_num - 1;
852 is_metadata = !(sblock_to_check->pagev[0]->flags &
853 BTRFS_EXTENT_FLAG_DATA);
854 have_csum = sblock_to_check->pagev[0]->have_csum;
855 dev = sblock_to_check->pagev[0]->dev;
856
857 if (btrfs_is_zoned(fs_info) && !sctx->is_dev_replace)
858 return btrfs_repair_one_zone(fs_info, logical);
859
860 /*
861 * We must use GFP_NOFS because the scrub task might be waiting for a
862 * worker task executing this function and in turn a transaction commit
863 * might be waiting for the scrub task to pause (which needs to wait for all
864 * the worker tasks to complete before pausing).
865 * We do allocations in the workers through insert_full_stripe_lock()
866 * and scrub_add_page_to_wr_bio(), which happens down the call chain of
867 * this function.
868 */
869 nofs_flag = memalloc_nofs_save();
870 /*
871 * For RAID5/6, a race can happen between the scrub threads of different
872 * devices. On data corruption, the parity and data threads will both
873 * try to recover the data.
874 * The race can lead to a doubly counted csum error, or even an
875 * unrecoverable error.
876 */
877 ret = lock_full_stripe(fs_info, logical, &full_stripe_locked);
878 if (ret < 0) {
879 memalloc_nofs_restore(nofs_flag);
880 spin_lock(&sctx->stat_lock);
881 if (ret == -ENOMEM)
882 sctx->stat.malloc_errors++;
883 sctx->stat.read_errors++;
884 sctx->stat.uncorrectable_errors++;
885 spin_unlock(&sctx->stat_lock);
886 return ret;
887 }
888
889 /*
890 * Read all mirrors one after the other. This includes re-reading
891 * the extent or metadata block that failed (which is the reason
892 * this fixup code was called), this time page by page, in order
893 * to know which pages caused I/O errors and which ones are good
894 * (for all mirrors).
895 * The goal is to handle the situation when more than one
896 * mirror contains I/O errors, but the errors do not
897 * overlap, i.e. the data can be repaired by selecting the
898 * pages from those mirrors without I/O error on the
899 * particular pages. One example (with blocks >= 2 * PAGE_SIZE)
900 * would be that mirror #1 has an I/O error on the first page,
901 * the second page is good, and mirror #2 has an I/O error on
902 * the second page, but the first page is good.
903 * Then the first page of the first mirror can be repaired by
904 * taking the first page of the second mirror, and the
905 * second page of the second mirror can be repaired by
906 * copying the contents of the 2nd page of the 1st mirror.
907 * One more note: if the pages of one mirror contain I/O
908 * errors, the checksum cannot be verified. In order to get
909 * the best data for repairing, the first attempt is to find
910 * a mirror without I/O errors and with a validated checksum.
911 * Only if this is not possible, the pages are picked from
912 * mirrors with I/O errors without considering the checksum.
913 * If the latter is the case, at the end, the checksum of the
914 * repaired area is verified in order to correctly maintain
915 * the statistics.
916 */
917
918 sblocks_for_recheck = kcalloc(BTRFS_MAX_MIRRORS,
919 sizeof(*sblocks_for_recheck), GFP_KERNEL);
920 if (!sblocks_for_recheck) {
921 spin_lock(&sctx->stat_lock);
922 sctx->stat.malloc_errors++;
923 sctx->stat.read_errors++;
924 sctx->stat.uncorrectable_errors++;
925 spin_unlock(&sctx->stat_lock);
926 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
927 goto out;
928 }
929
930 /* setup the context, map the logical blocks and alloc the pages */
931 ret = scrub_setup_recheck_block(sblock_to_check, sblocks_for_recheck);
932 if (ret) {
933 spin_lock(&sctx->stat_lock);
934 sctx->stat.read_errors++;
935 sctx->stat.uncorrectable_errors++;
936 spin_unlock(&sctx->stat_lock);
937 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
938 goto out;
939 }
940 BUG_ON(failed_mirror_index >= BTRFS_MAX_MIRRORS);
941 sblock_bad = sblocks_for_recheck + failed_mirror_index;
942
943 /* build and submit the bios for the failed mirror, check checksums */
944 scrub_recheck_block(fs_info, sblock_bad, 1);
945
946 if (!sblock_bad->header_error && !sblock_bad->checksum_error &&
947 sblock_bad->no_io_error_seen) {
948 /*
949 * the error disappeared after reading page by page, or
950 * the area was part of a huge bio and other parts of the
951 * bio caused I/O errors, or the block layer merged several
952 * read requests into one and the error is caused by a
953 * different bio (usually one of the two latter cases is
954 * the cause)
955 */
956 spin_lock(&sctx->stat_lock);
957 sctx->stat.unverified_errors++;
958 sblock_to_check->data_corrected = 1;
959 spin_unlock(&sctx->stat_lock);
960
961 if (sctx->is_dev_replace)
962 scrub_write_block_to_dev_replace(sblock_bad);
963 goto out;
964 }
965
966 if (!sblock_bad->no_io_error_seen) {
967 spin_lock(&sctx->stat_lock);
968 sctx->stat.read_errors++;
969 spin_unlock(&sctx->stat_lock);
970 if (__ratelimit(&rs))
971 scrub_print_warning("i/o error", sblock_to_check);
972 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
973 } else if (sblock_bad->checksum_error) {
974 spin_lock(&sctx->stat_lock);
975 sctx->stat.csum_errors++;
976 spin_unlock(&sctx->stat_lock);
977 if (__ratelimit(&rs))
978 scrub_print_warning("checksum error", sblock_to_check);
979 btrfs_dev_stat_inc_and_print(dev,
980 BTRFS_DEV_STAT_CORRUPTION_ERRS);
981 } else if (sblock_bad->header_error) {
982 spin_lock(&sctx->stat_lock);
983 sctx->stat.verify_errors++;
984 spin_unlock(&sctx->stat_lock);
985 if (__ratelimit(&rs))
986 scrub_print_warning("checksum/header error",
987 sblock_to_check);
988 if (sblock_bad->generation_error)
989 btrfs_dev_stat_inc_and_print(dev,
990 BTRFS_DEV_STAT_GENERATION_ERRS);
991 else
992 btrfs_dev_stat_inc_and_print(dev,
993 BTRFS_DEV_STAT_CORRUPTION_ERRS);
994 }
995
996 if (sctx->readonly) {
997 ASSERT(!sctx->is_dev_replace);
998 goto out;
999 }
1000
1001 /*
1002 * now build and submit the bios for the other mirrors, check
1003 * checksums.
1004 * First try to pick the mirror which is completely without I/O
1005 * errors and also does not have a checksum error.
1006 * If one is found, and if a checksum is present, the full block
1007 * that is known to contain an error is rewritten. Afterwards
1008 * the block is known to be corrected.
1009 * If a mirror is found which is completely correct, and no
1010 * checksum is present, only those pages are rewritten that had
1011 * an I/O error in the block to be repaired, since it cannot be
1012 * determined which copy of the other pages is better (and it
1013 * could happen otherwise that a correct page would be
1014 * overwritten by a bad one).
1015 */
1016 for (mirror_index = 0; ;mirror_index++) {
1017 struct scrub_block *sblock_other;
1018
1019 if (mirror_index == failed_mirror_index)
1020 continue;
1021
1022 /* raid56's mirror count can be more than BTRFS_MAX_MIRRORS */
1023 if (!scrub_is_page_on_raid56(sblock_bad->pagev[0])) {
1024 if (mirror_index >= BTRFS_MAX_MIRRORS)
1025 break;
1026 if (!sblocks_for_recheck[mirror_index].page_count)
1027 break;
1028
1029 sblock_other = sblocks_for_recheck + mirror_index;
1030 } else {
1031 struct scrub_recover *r = sblock_bad->pagev[0]->recover;
1032 int max_allowed = r->bbio->num_stripes -
1033 r->bbio->num_tgtdevs;
1034
1035 if (mirror_index >= max_allowed)
1036 break;
1037 if (!sblocks_for_recheck[1].page_count)
1038 break;
1039
1040 ASSERT(failed_mirror_index == 0);
1041 sblock_other = sblocks_for_recheck + 1;
1042 sblock_other->pagev[0]->mirror_num = 1 + mirror_index;
1043 }
1044
1045 /* build and submit the bios, check checksums */
1046 scrub_recheck_block(fs_info, sblock_other, 0);
1047
1048 if (!sblock_other->header_error &&
1049 !sblock_other->checksum_error &&
1050 sblock_other->no_io_error_seen) {
1051 if (sctx->is_dev_replace) {
1052 scrub_write_block_to_dev_replace(sblock_other);
1053 goto corrected_error;
1054 } else {
1055 ret = scrub_repair_block_from_good_copy(
1056 sblock_bad, sblock_other);
1057 if (!ret)
1058 goto corrected_error;
1059 }
1060 }
1061 }
1062
1063 if (sblock_bad->no_io_error_seen && !sctx->is_dev_replace)
1064 goto did_not_correct_error;
1065
1066 /*
1067 * In case of I/O errors in the area that is supposed to be
1068 * repaired, continue by picking good copies of those pages.
1069 * Select the good pages from mirrors to rewrite bad pages from
1070 * the area to fix. Afterwards verify the checksum of the block
1071 * that is supposed to be repaired. This verification step is
1072 * only done for the purpose of statistics counting and for the
1073 * final scrub report on whether errors remain.
1074 * A perfect algorithm could make use of the checksum and try
1075 * all possible combinations of pages from the different mirrors
1076 * until the checksum verification succeeds. For example, when
1077 * the 2nd page of mirror #1 faces I/O errors, and the 2nd page
1078 * of mirror #2 is readable but the final checksum test fails,
1079 * then the 2nd page of mirror #3 could be tried, to see whether
1080 * the final checksum then succeeds. But this would be a rare
1081 * exception and is therefore not implemented. At least it is
1082 * avoided that the good copy is overwritten.
1083 * A more useful improvement would be to pick the sectors
1084 * without I/O error based on sector sizes (512 bytes on legacy
1085 * disks) instead of on PAGE_SIZE. Then maybe 512 bytes of one
1086 * mirror could be repaired by taking 512 bytes of a different
1087 * mirror, even if other 512 byte sectors in the same PAGE_SIZE
1088 * area are unreadable.
1089 */
1090 success = 1;
1091 for (page_num = 0; page_num < sblock_bad->page_count;
1092 page_num++) {
1093 struct scrub_page *spage_bad = sblock_bad->pagev[page_num];
1094 struct scrub_block *sblock_other = NULL;
1095
1096 /* skip no-io-error page in scrub */
1097 if (!spage_bad->io_error && !sctx->is_dev_replace)
1098 continue;
1099
1100 if (scrub_is_page_on_raid56(sblock_bad->pagev[0])) {
1101 /*
1102 * In case of dev replace, if the raid56 rebuild process
1103 * did not produce correct data, copy the content of
1104 * sblock_bad to make sure the target device is identical
1105 * to the source device, instead of writing garbage data
1106 * from the sblock_for_recheck array to the target device.
1107 */
1108 sblock_other = NULL;
1109 } else if (spage_bad->io_error) {
1110 /* try to find no-io-error page in mirrors */
1111 for (mirror_index = 0;
1112 mirror_index < BTRFS_MAX_MIRRORS &&
1113 sblocks_for_recheck[mirror_index].page_count > 0;
1114 mirror_index++) {
1115 if (!sblocks_for_recheck[mirror_index].
1116 pagev[page_num]->io_error) {
1117 sblock_other = sblocks_for_recheck +
1118 mirror_index;
1119 break;
1120 }
1121 }
1122 if (!sblock_other)
1123 success = 0;
1124 }
1125
1126 if (sctx->is_dev_replace) {
1127 /*
1128 * did not find a mirror to fetch the page
1129 * from. scrub_write_page_to_dev_replace()
1130 * handles this case (page->io_error) by
1131 * filling the block with zeros before
1132 * submitting the write request
1133 */
1134 if (!sblock_other)
1135 sblock_other = sblock_bad;
1136
1137 if (scrub_write_page_to_dev_replace(sblock_other,
1138 page_num) != 0) {
1139 atomic64_inc(
1140 &fs_info->dev_replace.num_write_errors);
1141 success = 0;
1142 }
1143 } else if (sblock_other) {
1144 ret = scrub_repair_page_from_good_copy(sblock_bad,
1145 sblock_other,
1146 page_num, 0);
1147 if (0 == ret)
1148 spage_bad->io_error = 0;
1149 else
1150 success = 0;
1151 }
1152 }
1153
1154 if (success && !sctx->is_dev_replace) {
1155 if (is_metadata || have_csum) {
1156 /*
1157 * need to verify the checksum now that all
1158 * sectors on disk are repaired (the write
1159 * request for data to be repaired is on its way).
1160 * Just be lazy and use scrub_recheck_block()
1161 * which re-reads the data before the checksum
1162 * is verified, but most likely the data comes out
1163 * of the page cache.
1164 */
1165 scrub_recheck_block(fs_info, sblock_bad, 1);
1166 if (!sblock_bad->header_error &&
1167 !sblock_bad->checksum_error &&
1168 sblock_bad->no_io_error_seen)
1169 goto corrected_error;
1170 else
1171 goto did_not_correct_error;
1172 } else {
1173corrected_error:
1174 spin_lock(&sctx->stat_lock);
1175 sctx->stat.corrected_errors++;
1176 sblock_to_check->data_corrected = 1;
1177 spin_unlock(&sctx->stat_lock);
1178 btrfs_err_rl_in_rcu(fs_info,
1179 "fixed up error at logical %llu on dev %s",
1180 logical, rcu_str_deref(dev->name));
1181 }
1182 } else {
1183did_not_correct_error:
1184 spin_lock(&sctx->stat_lock);
1185 sctx->stat.uncorrectable_errors++;
1186 spin_unlock(&sctx->stat_lock);
1187 btrfs_err_rl_in_rcu(fs_info,
1188 "unable to fixup (regular) error at logical %llu on dev %s",
1189 logical, rcu_str_deref(dev->name));
1190 }
1191
1192out:
1193 if (sblocks_for_recheck) {
1194 for (mirror_index = 0; mirror_index < BTRFS_MAX_MIRRORS;
1195 mirror_index++) {
1196 struct scrub_block *sblock = sblocks_for_recheck +
1197 mirror_index;
1198 struct scrub_recover *recover;
1199 int page_index;
1200
1201 for (page_index = 0; page_index < sblock->page_count;
1202 page_index++) {
1203 sblock->pagev[page_index]->sblock = NULL;
1204 recover = sblock->pagev[page_index]->recover;
1205 if (recover) {
1206 scrub_put_recover(fs_info, recover);
1207 sblock->pagev[page_index]->recover =
1208 NULL;
1209 }
1210 scrub_page_put(sblock->pagev[page_index]);
1211 }
1212 }
1213 kfree(sblocks_for_recheck);
1214 }
1215
1216 ret = unlock_full_stripe(fs_info, logical, full_stripe_locked);
1217 memalloc_nofs_restore(nofs_flag);
1218 if (ret < 0)
1219 return ret;
1220 return 0;
1221}
1222
1223static inline int scrub_nr_raid_mirrors(struct btrfs_bio *bbio)
1224{
1225 if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID5)
1226 return 2;
1227 else if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID6)
1228 return 3;
1229 else
1230 return (int)bbio->num_stripes;
1231}
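/*
 * Editorial note (not part of the upstream source): for RAID56 the "mirrors"
 * are not physical copies but different ways to obtain the data. Roughly,
 * mirror 1 reads the data stripe directly, mirror 2 rebuilds it from the
 * remaining stripes via the P parity, and mirror 3 (RAID6 only) rebuilds it
 * using the Q parity as well, hence the 2 and 3 returned above.
 */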
1232
1233static inline void scrub_stripe_index_and_offset(u64 logical, u64 map_type,
1234 u64 *raid_map,
1235 u64 mapped_length,
1236 int nstripes, int mirror,
1237 int *stripe_index,
1238 u64 *stripe_offset)
1239{
1240 int i;
1241
1242 if (map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
1243 /* RAID5/6 */
1244 for (i = 0; i < nstripes; i++) {
1245 if (raid_map[i] == RAID6_Q_STRIPE ||
1246 raid_map[i] == RAID5_P_STRIPE)
1247 continue;
1248
1249 if (logical >= raid_map[i] &&
1250 logical < raid_map[i] + mapped_length)
1251 break;
1252 }
1253
1254 *stripe_index = i;
1255 *stripe_offset = logical - raid_map[i];
1256 } else {
1257 /* The other RAID type */
1258 *stripe_index = mirror;
1259 *stripe_offset = 0;
1260 }
1261}
1262
1263static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
1264 struct scrub_block *sblocks_for_recheck)
1265{
1266 struct scrub_ctx *sctx = original_sblock->sctx;
1267 struct btrfs_fs_info *fs_info = sctx->fs_info;
1268 u64 length = original_sblock->page_count * PAGE_SIZE;
1269 u64 logical = original_sblock->pagev[0]->logical;
1270 u64 generation = original_sblock->pagev[0]->generation;
1271 u64 flags = original_sblock->pagev[0]->flags;
1272 u64 have_csum = original_sblock->pagev[0]->have_csum;
1273 struct scrub_recover *recover;
1274 struct btrfs_bio *bbio;
1275 u64 sublen;
1276 u64 mapped_length;
1277 u64 stripe_offset;
1278 int stripe_index;
1279 int page_index = 0;
1280 int mirror_index;
1281 int nmirrors;
1282 int ret;
1283
1284 /*
1285 * note: the two members refs and outstanding_pages
1286 * are not used (and not set) in the blocks that are used for
1287 * the recheck procedure
1288 */
1289
1290 while (length > 0) {
1291 sublen = min_t(u64, length, PAGE_SIZE);
1292 mapped_length = sublen;
1293 bbio = NULL;
1294
1295 /*
1296 * with a length of PAGE_SIZE, each returned stripe
1297 * represents one mirror
1298 */
1299 btrfs_bio_counter_inc_blocked(fs_info);
1300 ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
1301 logical, &mapped_length, &bbio);
1302 if (ret || !bbio || mapped_length < sublen) {
1303 btrfs_put_bbio(bbio);
1304 btrfs_bio_counter_dec(fs_info);
1305 return -EIO;
1306 }
1307
1308 recover = kzalloc(sizeof(struct scrub_recover), GFP_NOFS);
1309 if (!recover) {
1310 btrfs_put_bbio(bbio);
1311 btrfs_bio_counter_dec(fs_info);
1312 return -ENOMEM;
1313 }
1314
1315 refcount_set(&recover->refs, 1);
1316 recover->bbio = bbio;
1317 recover->map_length = mapped_length;
1318
1319 BUG_ON(page_index >= SCRUB_MAX_PAGES_PER_BLOCK);
1320
1321 nmirrors = min(scrub_nr_raid_mirrors(bbio), BTRFS_MAX_MIRRORS);
1322
1323 for (mirror_index = 0; mirror_index < nmirrors;
1324 mirror_index++) {
1325 struct scrub_block *sblock;
1326 struct scrub_page *spage;
1327
1328 sblock = sblocks_for_recheck + mirror_index;
1329 sblock->sctx = sctx;
1330
1331 spage = kzalloc(sizeof(*spage), GFP_NOFS);
1332 if (!spage) {
1333leave_nomem:
1334 spin_lock(&sctx->stat_lock);
1335 sctx->stat.malloc_errors++;
1336 spin_unlock(&sctx->stat_lock);
1337 scrub_put_recover(fs_info, recover);
1338 return -ENOMEM;
1339 }
1340 scrub_page_get(spage);
1341 sblock->pagev[page_index] = spage;
1342 spage->sblock = sblock;
1343 spage->flags = flags;
1344 spage->generation = generation;
1345 spage->logical = logical;
1346 spage->have_csum = have_csum;
1347 if (have_csum)
1348 memcpy(spage->csum,
1349 original_sblock->pagev[0]->csum,
1350 sctx->fs_info->csum_size);
1351
1352 scrub_stripe_index_and_offset(logical,
1353 bbio->map_type,
1354 bbio->raid_map,
1355 mapped_length,
1356 bbio->num_stripes -
1357 bbio->num_tgtdevs,
1358 mirror_index,
1359 &stripe_index,
1360 &stripe_offset);
1361 spage->physical = bbio->stripes[stripe_index].physical +
1362 stripe_offset;
1363 spage->dev = bbio->stripes[stripe_index].dev;
1364
1365 BUG_ON(page_index >= original_sblock->page_count);
1366 spage->physical_for_dev_replace =
1367 original_sblock->pagev[page_index]->
1368 physical_for_dev_replace;
1369 /* for missing devices, dev->bdev is NULL */
1370 spage->mirror_num = mirror_index + 1;
1371 sblock->page_count++;
1372 spage->page = alloc_page(GFP_NOFS);
1373 if (!spage->page)
1374 goto leave_nomem;
1375
1376 scrub_get_recover(recover);
1377 spage->recover = recover;
1378 }
1379 scrub_put_recover(fs_info, recover);
1380 length -= sublen;
1381 logical += sublen;
1382 page_index++;
1383 }
1384
1385 return 0;
1386}
1387
1388static void scrub_bio_wait_endio(struct bio *bio)
1389{
1390 complete(bio->bi_private);
1391}
1392
1393static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info,
1394 struct bio *bio,
1395 struct scrub_page *spage)
1396{
1397 DECLARE_COMPLETION_ONSTACK(done);
1398 int ret;
1399 int mirror_num;
1400
1401 bio->bi_iter.bi_sector = spage->logical >> 9;
1402 bio->bi_private = &done;
1403 bio->bi_end_io = scrub_bio_wait_endio;
1404
1405 mirror_num = spage->sblock->pagev[0]->mirror_num;
1406 ret = raid56_parity_recover(fs_info, bio, spage->recover->bbio,
1407 spage->recover->map_length,
1408 mirror_num, 0);
1409 if (ret)
1410 return ret;
1411
1412 wait_for_completion_io(&done);
1413 return blk_status_to_errno(bio->bi_status);
1414}
1415
1416static void scrub_recheck_block_on_raid56(struct btrfs_fs_info *fs_info,
1417 struct scrub_block *sblock)
1418{
1419 struct scrub_page *first_page = sblock->pagev[0];
1420 struct bio *bio;
1421 int page_num;
1422
1423 /* All pages in sblock belong to the same stripe on the same device. */
1424 ASSERT(first_page->dev);
1425 if (!first_page->dev->bdev)
1426 goto out;
1427
1428 bio = btrfs_io_bio_alloc(BIO_MAX_VECS);
1429 bio_set_dev(bio, first_page->dev->bdev);
1430
1431 for (page_num = 0; page_num < sblock->page_count; page_num++) {
1432 struct scrub_page *spage = sblock->pagev[page_num];
1433
1434 WARN_ON(!spage->page);
1435 bio_add_page(bio, spage->page, PAGE_SIZE, 0);
1436 }
1437
1438 if (scrub_submit_raid56_bio_wait(fs_info, bio, first_page)) {
1439 bio_put(bio);
1440 goto out;
1441 }
1442
1443 bio_put(bio);
1444
1445 scrub_recheck_block_checksum(sblock);
1446
1447 return;
1448out:
1449 for (page_num = 0; page_num < sblock->page_count; page_num++)
1450 sblock->pagev[page_num]->io_error = 1;
1451
1452 sblock->no_io_error_seen = 0;
1453}
1454
1455/*
1456 * this function will check the on disk data for checksum errors, header
1457 * errors and read I/O errors. If any I/O errors happen, the exact pages
1458 * which are errored are marked as being bad. The goal is to enable scrub
1459 * to take those pages that are not errored from all the mirrors so that
1460 * the pages that are errored in the just handled mirror can be repaired.
1461 */
1462static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
1463 struct scrub_block *sblock,
1464 int retry_failed_mirror)
1465{
1466 int page_num;
1467
1468 sblock->no_io_error_seen = 1;
1469
1470 /* shortcut for raid56 */
1471 if (!retry_failed_mirror && scrub_is_page_on_raid56(sblock->pagev[0]))
1472 return scrub_recheck_block_on_raid56(fs_info, sblock);
1473
1474 for (page_num = 0; page_num < sblock->page_count; page_num++) {
1475 struct bio *bio;
1476 struct scrub_page *spage = sblock->pagev[page_num];
1477
1478 if (spage->dev->bdev == NULL) {
1479 spage->io_error = 1;
1480 sblock->no_io_error_seen = 0;
1481 continue;
1482 }
1483
1484 WARN_ON(!spage->page);
1485 bio = btrfs_io_bio_alloc(1);
1486 bio_set_dev(bio, spage->dev->bdev);
1487
1488 bio_add_page(bio, spage->page, PAGE_SIZE, 0);
1489 bio->bi_iter.bi_sector = spage->physical >> 9;
1490 bio->bi_opf = REQ_OP_READ;
1491
1492 if (btrfsic_submit_bio_wait(bio)) {
1493 spage->io_error = 1;
1494 sblock->no_io_error_seen = 0;
1495 }
1496
1497 bio_put(bio);
1498 }
1499
1500 if (sblock->no_io_error_seen)
1501 scrub_recheck_block_checksum(sblock);
1502}
1503
1504static inline int scrub_check_fsid(u8 fsid[],
1505 struct scrub_page *spage)
1506{
1507 struct btrfs_fs_devices *fs_devices = spage->dev->fs_devices;
1508 int ret;
1509
1510 ret = memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
1511 return !ret;
1512}
1513
1514static void scrub_recheck_block_checksum(struct scrub_block *sblock)
1515{
1516 sblock->header_error = 0;
1517 sblock->checksum_error = 0;
1518 sblock->generation_error = 0;
1519
1520 if (sblock->pagev[0]->flags & BTRFS_EXTENT_FLAG_DATA)
1521 scrub_checksum_data(sblock);
1522 else
1523 scrub_checksum_tree_block(sblock);
1524}
1525
1526static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
1527 struct scrub_block *sblock_good)
1528{
1529 int page_num;
1530 int ret = 0;
1531
1532 for (page_num = 0; page_num < sblock_bad->page_count; page_num++) {
1533 int ret_sub;
1534
1535 ret_sub = scrub_repair_page_from_good_copy(sblock_bad,
1536 sblock_good,
1537 page_num, 1);
1538 if (ret_sub)
1539 ret = ret_sub;
1540 }
1541
1542 return ret;
1543}
1544
1545static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
1546 struct scrub_block *sblock_good,
1547 int page_num, int force_write)
1548{
1549 struct scrub_page *spage_bad = sblock_bad->pagev[page_num];
1550 struct scrub_page *spage_good = sblock_good->pagev[page_num];
1551 struct btrfs_fs_info *fs_info = sblock_bad->sctx->fs_info;
1552
1553 BUG_ON(spage_bad->page == NULL);
1554 BUG_ON(spage_good->page == NULL);
1555 if (force_write || sblock_bad->header_error ||
1556 sblock_bad->checksum_error || spage_bad->io_error) {
1557 struct bio *bio;
1558 int ret;
1559
1560 if (!spage_bad->dev->bdev) {
1561 btrfs_warn_rl(fs_info,
1562 "scrub_repair_page_from_good_copy(bdev == NULL) is unexpected");
1563 return -EIO;
1564 }
1565
1566 bio = btrfs_io_bio_alloc(1);
1567 bio_set_dev(bio, spage_bad->dev->bdev);
1568 bio->bi_iter.bi_sector = spage_bad->physical >> 9;
1569 bio->bi_opf = REQ_OP_WRITE;
1570
1571 ret = bio_add_page(bio, spage_good->page, PAGE_SIZE, 0);
1572 if (PAGE_SIZE != ret) {
1573 bio_put(bio);
1574 return -EIO;
1575 }
1576
1577 if (btrfsic_submit_bio_wait(bio)) {
1578 btrfs_dev_stat_inc_and_print(spage_bad->dev,
1579 BTRFS_DEV_STAT_WRITE_ERRS);
1580 atomic64_inc(&fs_info->dev_replace.num_write_errors);
1581 bio_put(bio);
1582 return -EIO;
1583 }
1584 bio_put(bio);
1585 }
1586
1587 return 0;
1588}
1589
1590static void scrub_write_block_to_dev_replace(struct scrub_block *sblock)
1591{
1592 struct btrfs_fs_info *fs_info = sblock->sctx->fs_info;
1593 int page_num;
1594
1595 /*
1596 * This block is used for checking the parity on the source device,
1597 * so the data need not be written to the destination device.
1598 */
1599 if (sblock->sparity)
1600 return;
1601
1602 for (page_num = 0; page_num < sblock->page_count; page_num++) {
1603 int ret;
1604
1605 ret = scrub_write_page_to_dev_replace(sblock, page_num);
1606 if (ret)
1607 atomic64_inc(&fs_info->dev_replace.num_write_errors);
1608 }
1609}
1610
1611static int scrub_write_page_to_dev_replace(struct scrub_block *sblock,
1612 int page_num)
1613{
1614 struct scrub_page *spage = sblock->pagev[page_num];
1615
1616 BUG_ON(spage->page == NULL);
1617 if (spage->io_error)
1618 clear_page(page_address(spage->page));
1619
1620 return scrub_add_page_to_wr_bio(sblock->sctx, spage);
1621}
1622
1623static int fill_writer_pointer_gap(struct scrub_ctx *sctx, u64 physical)
1624{
1625 int ret = 0;
1626 u64 length;
1627
1628 if (!btrfs_is_zoned(sctx->fs_info))
1629 return 0;
1630
1631 if (!btrfs_dev_is_sequential(sctx->wr_tgtdev, physical))
1632 return 0;
1633
1634 if (sctx->write_pointer < physical) {
1635 length = physical - sctx->write_pointer;
1636
1637 ret = btrfs_zoned_issue_zeroout(sctx->wr_tgtdev,
1638 sctx->write_pointer, length);
1639 if (!ret)
1640 sctx->write_pointer = physical;
1641 }
1642 return ret;
1643}
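/*
 * Editorial example (not part of the upstream source), with illustrative
 * numbers: on a zoned target device, if sctx->write_pointer is at 1 MiB and
 * the next page to write sits at physical 1 MiB + 128 KiB, the helper above
 * zeroes out the 128 KiB gap first so the writes stay sequential, then
 * advances sctx->write_pointer to 1 MiB + 128 KiB before the page itself is
 * queued in scrub_add_page_to_wr_bio().
 */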
1644
1645static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
1646 struct scrub_page *spage)
1647{
1648 struct scrub_bio *sbio;
1649 int ret;
1650
1651 mutex_lock(&sctx->wr_lock);
1652again:
1653 if (!sctx->wr_curr_bio) {
1654 sctx->wr_curr_bio = kzalloc(sizeof(*sctx->wr_curr_bio),
1655 GFP_KERNEL);
1656 if (!sctx->wr_curr_bio) {
1657 mutex_unlock(&sctx->wr_lock);
1658 return -ENOMEM;
1659 }
1660 sctx->wr_curr_bio->sctx = sctx;
1661 sctx->wr_curr_bio->page_count = 0;
1662 }
1663 sbio = sctx->wr_curr_bio;
1664 if (sbio->page_count == 0) {
1665 struct bio *bio;
1666
1667 ret = fill_writer_pointer_gap(sctx,
1668 spage->physical_for_dev_replace);
1669 if (ret) {
1670 mutex_unlock(&sctx->wr_lock);
1671 return ret;
1672 }
1673
1674 sbio->physical = spage->physical_for_dev_replace;
1675 sbio->logical = spage->logical;
1676 sbio->dev = sctx->wr_tgtdev;
1677 bio = sbio->bio;
1678 if (!bio) {
1679 bio = btrfs_io_bio_alloc(sctx->pages_per_wr_bio);
1680 sbio->bio = bio;
1681 }
1682
1683 bio->bi_private = sbio;
1684 bio->bi_end_io = scrub_wr_bio_end_io;
1685 bio_set_dev(bio, sbio->dev->bdev);
1686 bio->bi_iter.bi_sector = sbio->physical >> 9;
1687 bio->bi_opf = REQ_OP_WRITE;
1688 sbio->status = 0;
1689 } else if (sbio->physical + sbio->page_count * PAGE_SIZE !=
1690 spage->physical_for_dev_replace ||
1691 sbio->logical + sbio->page_count * PAGE_SIZE !=
1692 spage->logical) {
1693 scrub_wr_submit(sctx);
1694 goto again;
1695 }
1696
1697 ret = bio_add_page(sbio->bio, spage->page, PAGE_SIZE, 0);
1698 if (ret != PAGE_SIZE) {
1699 if (sbio->page_count < 1) {
1700 bio_put(sbio->bio);
1701 sbio->bio = NULL;
1702 mutex_unlock(&sctx->wr_lock);
1703 return -EIO;
1704 }
1705 scrub_wr_submit(sctx);
1706 goto again;
1707 }
1708
1709 sbio->pagev[sbio->page_count] = spage;
1710 scrub_page_get(spage);
1711 sbio->page_count++;
1712 if (sbio->page_count == sctx->pages_per_wr_bio)
1713 scrub_wr_submit(sctx);
1714 mutex_unlock(&sctx->wr_lock);
1715
1716 return 0;
1717}
1718
1719static void scrub_wr_submit(struct scrub_ctx *sctx)
1720{
1721 struct scrub_bio *sbio;
1722
1723 if (!sctx->wr_curr_bio)
1724 return;
1725
1726 sbio = sctx->wr_curr_bio;
1727 sctx->wr_curr_bio = NULL;
1728 WARN_ON(!sbio->bio->bi_bdev);
1729 scrub_pending_bio_inc(sctx);
1730 /* process all writes in a single worker thread. Then the block layer
1731 * orders the requests before sending them to the driver which
1732 * doubled the write performance on spinning disks when measured
1733 * with Linux 3.5 */
1734 btrfsic_submit_bio(sbio->bio);
1735
1736 if (btrfs_is_zoned(sctx->fs_info))
1737 sctx->write_pointer = sbio->physical + sbio->page_count * PAGE_SIZE;
1738}
1739
1740static void scrub_wr_bio_end_io(struct bio *bio)
1741{
1742 struct scrub_bio *sbio = bio->bi_private;
1743 struct btrfs_fs_info *fs_info = sbio->dev->fs_info;
1744
1745 sbio->status = bio->bi_status;
1746 sbio->bio = bio;
1747
1748 btrfs_init_work(&sbio->work, scrub_wr_bio_end_io_worker, NULL, NULL);
1749 btrfs_queue_work(fs_info->scrub_wr_completion_workers, &sbio->work);
1750}
1751
1752static void scrub_wr_bio_end_io_worker(struct btrfs_work *work)
1753{
1754 struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
1755 struct scrub_ctx *sctx = sbio->sctx;
1756 int i;
1757
1758 WARN_ON(sbio->page_count > SCRUB_PAGES_PER_WR_BIO);
1759 if (sbio->status) {
1760 struct btrfs_dev_replace *dev_replace =
1761 &sbio->sctx->fs_info->dev_replace;
1762
1763 for (i = 0; i < sbio->page_count; i++) {
1764 struct scrub_page *spage = sbio->pagev[i];
1765
1766 spage->io_error = 1;
1767 atomic64_inc(&dev_replace->num_write_errors);
1768 }
1769 }
1770
1771 for (i = 0; i < sbio->page_count; i++)
1772 scrub_page_put(sbio->pagev[i]);
1773
1774 bio_put(sbio->bio);
1775 kfree(sbio);
1776 scrub_pending_bio_dec(sctx);
1777}
1778
1779static int scrub_checksum(struct scrub_block *sblock)
1780{
1781 u64 flags;
1782 int ret;
1783
1784 /*
1785 * No need to initialize these stats currently,
1786 * because this function only uses the return value
1787 * instead of these stats values.
1788 *
1789 * Todo:
1790 * always use stats
1791 */
1792 sblock->header_error = 0;
1793 sblock->generation_error = 0;
1794 sblock->checksum_error = 0;
1795
1796 WARN_ON(sblock->page_count < 1);
1797 flags = sblock->pagev[0]->flags;
1798 ret = 0;
1799 if (flags & BTRFS_EXTENT_FLAG_DATA)
1800 ret = scrub_checksum_data(sblock);
1801 else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
1802 ret = scrub_checksum_tree_block(sblock);
1803 else if (flags & BTRFS_EXTENT_FLAG_SUPER)
1804 (void)scrub_checksum_super(sblock);
1805 else
1806 WARN_ON(1);
1807 if (ret)
1808 scrub_handle_errored_block(sblock);
1809
1810 return ret;
1811}
1812
1813static int scrub_checksum_data(struct scrub_block *sblock)
1814{
1815 struct scrub_ctx *sctx = sblock->sctx;
1816 struct btrfs_fs_info *fs_info = sctx->fs_info;
1817 SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
1818 u8 csum[BTRFS_CSUM_SIZE];
1819 struct scrub_page *spage;
1820 char *kaddr;
1821
1822 BUG_ON(sblock->page_count < 1);
1823 spage = sblock->pagev[0];
1824 if (!spage->have_csum)
1825 return 0;
1826
1827 kaddr = page_address(spage->page);
1828
1829 shash->tfm = fs_info->csum_shash;
1830 crypto_shash_init(shash);
1831
1832 /*
1833 * In scrub_pages() and scrub_pages_for_parity() we ensure each spage
1834 * only contains one sector of data.
1835 */
1836 crypto_shash_digest(shash, kaddr, fs_info->sectorsize, csum);
1837
1838 if (memcmp(csum, spage->csum, fs_info->csum_size))
1839 sblock->checksum_error = 1;
1840 return sblock->checksum_error;
1841}
1842
1843static int scrub_checksum_tree_block(struct scrub_block *sblock)
1844{
1845 struct scrub_ctx *sctx = sblock->sctx;
1846 struct btrfs_header *h;
1847 struct btrfs_fs_info *fs_info = sctx->fs_info;
1848 SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
1849 u8 calculated_csum[BTRFS_CSUM_SIZE];
1850 u8 on_disk_csum[BTRFS_CSUM_SIZE];
1851 /*
1852 * This is done in sectorsize steps even for metadata as there's a
1853 * constraint for nodesize to be aligned to sectorsize. This will need
1854 * to change so we don't misuse data and metadata units like that.
1855 */
1856 const u32 sectorsize = sctx->fs_info->sectorsize;
1857 const int num_sectors = fs_info->nodesize >> fs_info->sectorsize_bits;
1858 int i;
1859 struct scrub_page *spage;
1860 char *kaddr;
1861
1862 BUG_ON(sblock->page_count < 1);
1863
1864 /* Each member in pagev is just one block, not a full page */
1865 ASSERT(sblock->page_count == num_sectors);
1866
1867 spage = sblock->pagev[0];
1868 kaddr = page_address(spage->page);
1869 h = (struct btrfs_header *)kaddr;
1870 memcpy(on_disk_csum, h->csum, sctx->fs_info->csum_size);
1871
1872 /*
1873 * we don't use the getter functions here, as we
1874 * a) don't have an extent buffer and
1875 * b) the page is already kmapped
1876 */
1877 if (spage->logical != btrfs_stack_header_bytenr(h))
1878 sblock->header_error = 1;
1879
1880 if (spage->generation != btrfs_stack_header_generation(h)) {
1881 sblock->header_error = 1;
1882 sblock->generation_error = 1;
1883 }
1884
1885 if (!scrub_check_fsid(h->fsid, spage))
1886 sblock->header_error = 1;
1887
1888 if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
1889 BTRFS_UUID_SIZE))
1890 sblock->header_error = 1;
1891
1892 shash->tfm = fs_info->csum_shash;
1893 crypto_shash_init(shash);
1894 crypto_shash_update(shash, kaddr + BTRFS_CSUM_SIZE,
1895 sectorsize - BTRFS_CSUM_SIZE);
1896
1897 for (i = 1; i < num_sectors; i++) {
1898 kaddr = page_address(sblock->pagev[i]->page);
1899 crypto_shash_update(shash, kaddr, sectorsize);
1900 }
1901
1902 crypto_shash_final(shash, calculated_csum);
1903 if (memcmp(calculated_csum, on_disk_csum, sctx->fs_info->csum_size))
1904 sblock->checksum_error = 1;
1905
1906 return sblock->header_error || sblock->checksum_error;
1907}
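
/*
 * Worked example for the metadata checksum loop above (illustrative values,
 * not taken from the code): with nodesize = 16K and sectorsize = 4K,
 * num_sectors is 4. The csum covers the 16K tree block minus its first
 * BTRFS_CSUM_SIZE bytes, hashed as (4K - BTRFS_CSUM_SIZE) from the first
 * sector followed by three full 4K sectors.
 */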
1908
1909static int scrub_checksum_super(struct scrub_block *sblock)
1910{
1911 struct btrfs_super_block *s;
1912 struct scrub_ctx *sctx = sblock->sctx;
1913 struct btrfs_fs_info *fs_info = sctx->fs_info;
1914 SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
1915 u8 calculated_csum[BTRFS_CSUM_SIZE];
1916 struct scrub_page *spage;
1917 char *kaddr;
1918 int fail_gen = 0;
1919 int fail_cor = 0;
1920
1921 BUG_ON(sblock->page_count < 1);
1922 spage = sblock->pagev[0];
1923 kaddr = page_address(spage->page);
1924 s = (struct btrfs_super_block *)kaddr;
1925
1926 if (spage->logical != btrfs_super_bytenr(s))
1927 ++fail_cor;
1928
1929 if (spage->generation != btrfs_super_generation(s))
1930 ++fail_gen;
1931
1932 if (!scrub_check_fsid(s->fsid, spage))
1933 ++fail_cor;
1934
1935 shash->tfm = fs_info->csum_shash;
1936 crypto_shash_init(shash);
1937 crypto_shash_digest(shash, kaddr + BTRFS_CSUM_SIZE,
1938 BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE, calculated_csum);
1939
1940 if (memcmp(calculated_csum, s->csum, sctx->fs_info->csum_size))
1941 ++fail_cor;
1942
1943 if (fail_cor + fail_gen) {
1944 /*
1945 * if we find an error in a super block, we just report it.
1946 * They will get written with the next transaction commit
1947 * anyway
1948 */
1949 spin_lock(&sctx->stat_lock);
1950 ++sctx->stat.super_errors;
1951 spin_unlock(&sctx->stat_lock);
1952 if (fail_cor)
1953 btrfs_dev_stat_inc_and_print(spage->dev,
1954 BTRFS_DEV_STAT_CORRUPTION_ERRS);
1955 else
1956 btrfs_dev_stat_inc_and_print(spage->dev,
1957 BTRFS_DEV_STAT_GENERATION_ERRS);
1958 }
1959
1960 return fail_cor + fail_gen;
1961}
1962
1963static void scrub_block_get(struct scrub_block *sblock)
1964{
1965 refcount_inc(&sblock->refs);
1966}
1967
1968static void scrub_block_put(struct scrub_block *sblock)
1969{
1970 if (refcount_dec_and_test(&sblock->refs)) {
1971 int i;
1972
1973 if (sblock->sparity)
1974 scrub_parity_put(sblock->sparity);
1975
1976 for (i = 0; i < sblock->page_count; i++)
1977 scrub_page_put(sblock->pagev[i]);
1978 kfree(sblock);
1979 }
1980}
1981
1982static void scrub_page_get(struct scrub_page *spage)
1983{
1984 atomic_inc(&spage->refs);
1985}
1986
1987static void scrub_page_put(struct scrub_page *spage)
1988{
1989 if (atomic_dec_and_test(&spage->refs)) {
1990 if (spage->page)
1991 __free_page(spage->page);
1992 kfree(spage);
1993 }
1994}
1995
1996/*
1997 * Throttling of IO submission, bandwidth-limit based; the timeslice is 1
1998 * second. The limit can be set via /sys/fs/btrfs/UUID/devinfo/devid/scrub_speed_max.
1999 */
2000static void scrub_throttle(struct scrub_ctx *sctx)
2001{
2002 const int time_slice = 1000;
2003 struct scrub_bio *sbio;
2004 struct btrfs_device *device;
2005 s64 delta;
2006 ktime_t now;
2007 u32 div;
2008 u64 bwlimit;
2009
2010 sbio = sctx->bios[sctx->curr];
2011 device = sbio->dev;
2012 bwlimit = READ_ONCE(device->scrub_speed_max);
2013 if (bwlimit == 0)
2014 return;
2015
2016 /*
2017 * The slice is divided into intervals when the IO is submitted; the number
2018 * of intervals scales with bwlimit, up to a maximum of 64.
2019 */
2020 div = max_t(u32, 1, (u32)(bwlimit / (16 * 1024 * 1024)));
2021 div = min_t(u32, 64, div);
2022
2023 /* Start new epoch, set deadline */
2024 now = ktime_get();
2025 if (sctx->throttle_deadline == 0) {
2026 sctx->throttle_deadline = ktime_add_ms(now, time_slice / div);
2027 sctx->throttle_sent = 0;
2028 }
2029
2030 /* Still in the time to send? */
2031 if (ktime_before(now, sctx->throttle_deadline)) {
2032 /* If current bio is within the limit, send it */
2033 sctx->throttle_sent += sbio->bio->bi_iter.bi_size;
2034 if (sctx->throttle_sent <= div_u64(bwlimit, div))
2035 return;
2036
2037 /* We're over the limit, sleep for the rest of the slice */
2038 delta = ktime_ms_delta(sctx->throttle_deadline, now);
2039 } else {
2040 /* New request after deadline, start new epoch */
2041 delta = 0;
2042 }
2043
2044 if (delta) {
2045 long timeout;
2046
2047 timeout = div_u64(delta * HZ, 1000);
2048 schedule_timeout_interruptible(timeout);
2049 }
2050
2051 /* Next call will start the deadline period */
2052 sctx->throttle_deadline = 0;
2053}
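
/*
 * Example of the throttle math above, with assumed (not prescribed) values:
 * scrub_speed_max = 64 MiB/s gives div = min(64, max(1, 64M / 16M)) = 4, so
 * each epoch lasts 1000 / 4 = 250 ms and allows bwlimit / div = 16 MiB of
 * I/O before the submitter sleeps until the deadline.
 */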
2054
2055static void scrub_submit(struct scrub_ctx *sctx)
2056{
2057 struct scrub_bio *sbio;
2058
2059 if (sctx->curr == -1)
2060 return;
2061
2062 scrub_throttle(sctx);
2063
2064 sbio = sctx->bios[sctx->curr];
2065 sctx->curr = -1;
2066 scrub_pending_bio_inc(sctx);
2067 btrfsic_submit_bio(sbio->bio);
2068}
2069
2070static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx,
2071 struct scrub_page *spage)
2072{
2073 struct scrub_block *sblock = spage->sblock;
2074 struct scrub_bio *sbio;
2075 int ret;
2076
2077again:
2078 /*
2079 * grab a fresh bio or wait for one to become available
2080 */
2081 while (sctx->curr == -1) {
2082 spin_lock(&sctx->list_lock);
2083 sctx->curr = sctx->first_free;
2084 if (sctx->curr != -1) {
2085 sctx->first_free = sctx->bios[sctx->curr]->next_free;
2086 sctx->bios[sctx->curr]->next_free = -1;
2087 sctx->bios[sctx->curr]->page_count = 0;
2088 spin_unlock(&sctx->list_lock);
2089 } else {
2090 spin_unlock(&sctx->list_lock);
2091 wait_event(sctx->list_wait, sctx->first_free != -1);
2092 }
2093 }
2094 sbio = sctx->bios[sctx->curr];
2095 if (sbio->page_count == 0) {
2096 struct bio *bio;
2097
2098 sbio->physical = spage->physical;
2099 sbio->logical = spage->logical;
2100 sbio->dev = spage->dev;
2101 bio = sbio->bio;
2102 if (!bio) {
2103 bio = btrfs_io_bio_alloc(sctx->pages_per_rd_bio);
2104 sbio->bio = bio;
2105 }
2106
2107 bio->bi_private = sbio;
2108 bio->bi_end_io = scrub_bio_end_io;
2109 bio_set_dev(bio, sbio->dev->bdev);
2110 bio->bi_iter.bi_sector = sbio->physical >> 9;
2111 bio->bi_opf = REQ_OP_READ;
2112 sbio->status = 0;
2113 } else if (sbio->physical + sbio->page_count * PAGE_SIZE !=
2114 spage->physical ||
2115 sbio->logical + sbio->page_count * PAGE_SIZE !=
2116 spage->logical ||
2117 sbio->dev != spage->dev) {
2118 scrub_submit(sctx);
2119 goto again;
2120 }
2121
2122 sbio->pagev[sbio->page_count] = spage;
2123 ret = bio_add_page(sbio->bio, spage->page, PAGE_SIZE, 0);
2124 if (ret != PAGE_SIZE) {
2125 if (sbio->page_count < 1) {
2126 bio_put(sbio->bio);
2127 sbio->bio = NULL;
2128 return -EIO;
2129 }
2130 scrub_submit(sctx);
2131 goto again;
2132 }
2133
2134 scrub_block_get(sblock); /* one for the page added to the bio */
2135 atomic_inc(&sblock->outstanding_pages);
2136 sbio->page_count++;
2137 if (sbio->page_count == sctx->pages_per_rd_bio)
2138 scrub_submit(sctx);
2139
2140 return 0;
2141}
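
/*
 * Note on the merging logic above: a page is only appended to the current
 * read bio when it is physically and logically contiguous with what the bio
 * already holds and targets the same device; otherwise the bio is submitted
 * and a fresh one is started. With common 4 KiB pages this caps a read bio
 * at SCRUB_PAGES_PER_RD_BIO * 4 KiB = 128 KiB (an illustrative figure, it
 * scales with PAGE_SIZE).
 */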
2142
2143static void scrub_missing_raid56_end_io(struct bio *bio)
2144{
2145 struct scrub_block *sblock = bio->bi_private;
2146 struct btrfs_fs_info *fs_info = sblock->sctx->fs_info;
2147
2148 if (bio->bi_status)
2149 sblock->no_io_error_seen = 0;
2150
2151 bio_put(bio);
2152
2153 btrfs_queue_work(fs_info->scrub_workers, &sblock->work);
2154}
2155
2156static void scrub_missing_raid56_worker(struct btrfs_work *work)
2157{
2158 struct scrub_block *sblock = container_of(work, struct scrub_block, work);
2159 struct scrub_ctx *sctx = sblock->sctx;
2160 struct btrfs_fs_info *fs_info = sctx->fs_info;
2161 u64 logical;
2162 struct btrfs_device *dev;
2163
2164 logical = sblock->pagev[0]->logical;
2165 dev = sblock->pagev[0]->dev;
2166
2167 if (sblock->no_io_error_seen)
2168 scrub_recheck_block_checksum(sblock);
2169
2170 if (!sblock->no_io_error_seen) {
2171 spin_lock(&sctx->stat_lock);
2172 sctx->stat.read_errors++;
2173 spin_unlock(&sctx->stat_lock);
2174 btrfs_err_rl_in_rcu(fs_info,
2175 "IO error rebuilding logical %llu for dev %s",
2176 logical, rcu_str_deref(dev->name));
2177 } else if (sblock->header_error || sblock->checksum_error) {
2178 spin_lock(&sctx->stat_lock);
2179 sctx->stat.uncorrectable_errors++;
2180 spin_unlock(&sctx->stat_lock);
2181 btrfs_err_rl_in_rcu(fs_info,
2182 "failed to rebuild valid logical %llu for dev %s",
2183 logical, rcu_str_deref(dev->name));
2184 } else {
2185 scrub_write_block_to_dev_replace(sblock);
2186 }
2187
2188 if (sctx->is_dev_replace && sctx->flush_all_writes) {
2189 mutex_lock(&sctx->wr_lock);
2190 scrub_wr_submit(sctx);
2191 mutex_unlock(&sctx->wr_lock);
2192 }
2193
2194 scrub_block_put(sblock);
2195 scrub_pending_bio_dec(sctx);
2196}
2197
2198static void scrub_missing_raid56_pages(struct scrub_block *sblock)
2199{
2200 struct scrub_ctx *sctx = sblock->sctx;
2201 struct btrfs_fs_info *fs_info = sctx->fs_info;
2202 u64 length = sblock->page_count * PAGE_SIZE;
2203 u64 logical = sblock->pagev[0]->logical;
2204 struct btrfs_bio *bbio = NULL;
2205 struct bio *bio;
2206 struct btrfs_raid_bio *rbio;
2207 int ret;
2208 int i;
2209
2210 btrfs_bio_counter_inc_blocked(fs_info);
2211 ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical,
2212 &length, &bbio);
2213 if (ret || !bbio || !bbio->raid_map)
2214 goto bbio_out;
2215
2216 if (WARN_ON(!sctx->is_dev_replace ||
2217 !(bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK))) {
2218 /*
2219 * We shouldn't be scrubbing a missing device. Even for dev
2220 * replace, we should only get here for RAID 5/6. We either
2221 * managed to mount something with no mirrors remaining or
2222 * there's a bug in scrub_remap_extent()/btrfs_map_block().
2223 */
2224 goto bbio_out;
2225 }
2226
2227 bio = btrfs_io_bio_alloc(0);
2228 bio->bi_iter.bi_sector = logical >> 9;
2229 bio->bi_private = sblock;
2230 bio->bi_end_io = scrub_missing_raid56_end_io;
2231
2232 rbio = raid56_alloc_missing_rbio(fs_info, bio, bbio, length);
2233 if (!rbio)
2234 goto rbio_out;
2235
2236 for (i = 0; i < sblock->page_count; i++) {
2237 struct scrub_page *spage = sblock->pagev[i];
2238
2239 raid56_add_scrub_pages(rbio, spage->page, spage->logical);
2240 }
2241
2242 btrfs_init_work(&sblock->work, scrub_missing_raid56_worker, NULL, NULL);
2243 scrub_block_get(sblock);
2244 scrub_pending_bio_inc(sctx);
2245 raid56_submit_missing_rbio(rbio);
2246 return;
2247
2248rbio_out:
2249 bio_put(bio);
2250bbio_out:
2251 btrfs_bio_counter_dec(fs_info);
2252 btrfs_put_bbio(bbio);
2253 spin_lock(&sctx->stat_lock);
2254 sctx->stat.malloc_errors++;
2255 spin_unlock(&sctx->stat_lock);
2256}
2257
2258static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u32 len,
2259 u64 physical, struct btrfs_device *dev, u64 flags,
2260 u64 gen, int mirror_num, u8 *csum,
2261 u64 physical_for_dev_replace)
2262{
2263 struct scrub_block *sblock;
2264 const u32 sectorsize = sctx->fs_info->sectorsize;
2265 int index;
2266
2267 sblock = kzalloc(sizeof(*sblock), GFP_KERNEL);
2268 if (!sblock) {
2269 spin_lock(&sctx->stat_lock);
2270 sctx->stat.malloc_errors++;
2271 spin_unlock(&sctx->stat_lock);
2272 return -ENOMEM;
2273 }
2274
2275 /* one ref inside this function, plus one for each page added to
2276 * a bio later on */
2277 refcount_set(&sblock->refs, 1);
2278 sblock->sctx = sctx;
2279 sblock->no_io_error_seen = 1;
2280
2281 for (index = 0; len > 0; index++) {
2282 struct scrub_page *spage;
2283 /*
2284 * Here we will allocate one page for one sector to scrub.
2285 * This is fine if PAGE_SIZE == sectorsize, but will cost
2286 * more memory for PAGE_SIZE > sectorsize case.
2287 */
2288 u32 l = min(sectorsize, len);
2289
2290 spage = kzalloc(sizeof(*spage), GFP_KERNEL);
2291 if (!spage) {
2292leave_nomem:
2293 spin_lock(&sctx->stat_lock);
2294 sctx->stat.malloc_errors++;
2295 spin_unlock(&sctx->stat_lock);
2296 scrub_block_put(sblock);
2297 return -ENOMEM;
2298 }
2299 BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK);
2300 scrub_page_get(spage);
2301 sblock->pagev[index] = spage;
2302 spage->sblock = sblock;
2303 spage->dev = dev;
2304 spage->flags = flags;
2305 spage->generation = gen;
2306 spage->logical = logical;
2307 spage->physical = physical;
2308 spage->physical_for_dev_replace = physical_for_dev_replace;
2309 spage->mirror_num = mirror_num;
2310 if (csum) {
2311 spage->have_csum = 1;
2312 memcpy(spage->csum, csum, sctx->fs_info->csum_size);
2313 } else {
2314 spage->have_csum = 0;
2315 }
2316 sblock->page_count++;
2317 spage->page = alloc_page(GFP_KERNEL);
2318 if (!spage->page)
2319 goto leave_nomem;
2320 len -= l;
2321 logical += l;
2322 physical += l;
2323 physical_for_dev_replace += l;
2324 }
2325
2326 WARN_ON(sblock->page_count == 0);
2327 if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state)) {
2328 /*
2329 * This case should only be hit for RAID 5/6 device replace. See
2330 * the comment in scrub_missing_raid56_pages() for details.
2331 */
2332 scrub_missing_raid56_pages(sblock);
2333 } else {
2334 for (index = 0; index < sblock->page_count; index++) {
2335 struct scrub_page *spage = sblock->pagev[index];
2336 int ret;
2337
2338 ret = scrub_add_page_to_rd_bio(sctx, spage);
2339 if (ret) {
2340 scrub_block_put(sblock);
2341 return ret;
2342 }
2343 }
2344
2345 if (flags & BTRFS_EXTENT_FLAG_SUPER)
2346 scrub_submit(sctx);
2347 }
2348
2349 /* last one frees, either here or in bio completion for last page */
2350 scrub_block_put(sblock);
2351 return 0;
2352}
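
/*
 * Memory-cost illustration for the per-sector page allocation above (assumed
 * numbers, not from the code): on a system with 64 KiB PAGE_SIZE and a 4 KiB
 * sectorsize, every 4 KiB sector still gets its own full page, so a block
 * temporarily uses roughly 16x the amount of data it covers.
 */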
2353
2354static void scrub_bio_end_io(struct bio *bio)
2355{
2356 struct scrub_bio *sbio = bio->bi_private;
2357 struct btrfs_fs_info *fs_info = sbio->dev->fs_info;
2358
2359 sbio->status = bio->bi_status;
2360 sbio->bio = bio;
2361
2362 btrfs_queue_work(fs_info->scrub_workers, &sbio->work);
2363}
2364
2365static void scrub_bio_end_io_worker(struct btrfs_work *work)
2366{
2367 struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
2368 struct scrub_ctx *sctx = sbio->sctx;
2369 int i;
2370
2371 BUG_ON(sbio->page_count > SCRUB_PAGES_PER_RD_BIO);
2372 if (sbio->status) {
2373 for (i = 0; i < sbio->page_count; i++) {
2374 struct scrub_page *spage = sbio->pagev[i];
2375
2376 spage->io_error = 1;
2377 spage->sblock->no_io_error_seen = 0;
2378 }
2379 }
2380
2381 /* now complete the scrub_block items that have all pages completed */
2382 for (i = 0; i < sbio->page_count; i++) {
2383 struct scrub_page *spage = sbio->pagev[i];
2384 struct scrub_block *sblock = spage->sblock;
2385
2386 if (atomic_dec_and_test(&sblock->outstanding_pages))
2387 scrub_block_complete(sblock);
2388 scrub_block_put(sblock);
2389 }
2390
2391 bio_put(sbio->bio);
2392 sbio->bio = NULL;
2393 spin_lock(&sctx->list_lock);
2394 sbio->next_free = sctx->first_free;
2395 sctx->first_free = sbio->index;
2396 spin_unlock(&sctx->list_lock);
2397
2398 if (sctx->is_dev_replace && sctx->flush_all_writes) {
2399 mutex_lock(&sctx->wr_lock);
2400 scrub_wr_submit(sctx);
2401 mutex_unlock(&sctx->wr_lock);
2402 }
2403
2404 scrub_pending_bio_dec(sctx);
2405}
2406
2407static inline void __scrub_mark_bitmap(struct scrub_parity *sparity,
2408 unsigned long *bitmap,
2409 u64 start, u32 len)
2410{
2411 u64 offset;
2412 u32 nsectors;
2413 u32 sectorsize_bits = sparity->sctx->fs_info->sectorsize_bits;
2414
2415 if (len >= sparity->stripe_len) {
2416 bitmap_set(bitmap, 0, sparity->nsectors);
2417 return;
2418 }
2419
2420 start -= sparity->logic_start;
2421 start = div64_u64_rem(start, sparity->stripe_len, &offset);
2422 offset = offset >> sectorsize_bits;
2423 nsectors = len >> sectorsize_bits;
2424
2425 if (offset + nsectors <= sparity->nsectors) {
2426 bitmap_set(bitmap, offset, nsectors);
2427 return;
2428 }
2429
2430 bitmap_set(bitmap, offset, sparity->nsectors - offset);
2431 bitmap_set(bitmap, 0, nsectors - (sparity->nsectors - offset));
2432}
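
/*
 * Wrap-around example for the bitmap helper above (hypothetical numbers):
 * with sparity->nsectors = 16, a range starting at in-stripe sector 14 with
 * a length of 4 sectors sets bits 14-15 and then wraps to set bits 0-1.
 */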
2433
2434static inline void scrub_parity_mark_sectors_error(struct scrub_parity *sparity,
2435 u64 start, u32 len)
2436{
2437 __scrub_mark_bitmap(sparity, sparity->ebitmap, start, len);
2438}
2439
2440static inline void scrub_parity_mark_sectors_data(struct scrub_parity *sparity,
2441 u64 start, u32 len)
2442{
2443 __scrub_mark_bitmap(sparity, sparity->dbitmap, start, len);
2444}
2445
2446static void scrub_block_complete(struct scrub_block *sblock)
2447{
2448 int corrupted = 0;
2449
2450 if (!sblock->no_io_error_seen) {
2451 corrupted = 1;
2452 scrub_handle_errored_block(sblock);
2453 } else {
2454 /*
2455 * If the block has a checksum error, it is written via the
2456 * repair mechanism; otherwise, in the dev replace case, it is
2457 * written here.
2458 */
2459 corrupted = scrub_checksum(sblock);
2460 if (!corrupted && sblock->sctx->is_dev_replace)
2461 scrub_write_block_to_dev_replace(sblock);
2462 }
2463
2464 if (sblock->sparity && corrupted && !sblock->data_corrected) {
2465 u64 start = sblock->pagev[0]->logical;
2466 u64 end = sblock->pagev[sblock->page_count - 1]->logical +
2467 PAGE_SIZE;
2468
2469 ASSERT(end - start <= U32_MAX);
2470 scrub_parity_mark_sectors_error(sblock->sparity,
2471 start, end - start);
2472 }
2473}
2474
2475static void drop_csum_range(struct scrub_ctx *sctx, struct btrfs_ordered_sum *sum)
2476{
2477 sctx->stat.csum_discards += sum->len >> sctx->fs_info->sectorsize_bits;
2478 list_del(&sum->list);
2479 kfree(sum);
2480}
2481
2482/*
2483 * Find the desired csum for range [logical, logical + sectorsize), and store
2484 * the csum into @csum.
2485 *
2486 * The search source is sctx->csum_list, which is a pre-populated list
2487 * storing bytenr ordered csum ranges. We're responsible for cleaning up any
2488 * range that is before @logical.
2489 *
2490 * Return 0 if there is no csum for the range.
2491 * Return 1 if there is a csum for the range and it was copied to @csum.
2492 */
2493static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u8 *csum)
2494{
2495 bool found = false;
2496
2497 while (!list_empty(&sctx->csum_list)) {
2498 struct btrfs_ordered_sum *sum = NULL;
2499 unsigned long index;
2500 unsigned long num_sectors;
2501
2502 sum = list_first_entry(&sctx->csum_list,
2503 struct btrfs_ordered_sum, list);
2504 /* The current csum range is beyond our range, no csum found */
2505 if (sum->bytenr > logical)
2506 break;
2507
2508 /*
2509 * The current sum is before our bytenr. Since scrub is always
2510 * done in bytenr order, the csum will never be used again;
2511 * clean it up so that later calls won't bother with the range,
2512 * and continue searching the next range.
2513 */
2514 if (sum->bytenr + sum->len <= logical) {
2515 drop_csum_range(sctx, sum);
2516 continue;
2517 }
2518
2519 /* Now the csum range covers our bytenr, copy the csum */
2520 found = true;
2521 index = (logical - sum->bytenr) >> sctx->fs_info->sectorsize_bits;
2522 num_sectors = sum->len >> sctx->fs_info->sectorsize_bits;
2523
2524 memcpy(csum, sum->sums + index * sctx->fs_info->csum_size,
2525 sctx->fs_info->csum_size);
2526
2527 /* Cleanup the range if we're at the end of the csum range */
2528 if (index == num_sectors - 1)
2529 drop_csum_range(sctx, sum);
2530 break;
2531 }
2532 if (!found)
2533 return 0;
2534 return 1;
2535}
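
/*
 * Index arithmetic example for the lookup above (hypothetical values): with
 * a 4 KiB sectorsize, sum->bytenr = 1 MiB and logical = 1 MiB + 8 KiB give
 * index = (8 KiB >> 12) = 2, so the third csum_size-sized entry in
 * sum->sums is copied; the range is only dropped once its last sector has
 * been consumed.
 */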
2536
2537/* scrub extent tries to collect up to 64 kB for each bio */
2538static int scrub_extent(struct scrub_ctx *sctx, struct map_lookup *map,
2539 u64 logical, u32 len,
2540 u64 physical, struct btrfs_device *dev, u64 flags,
2541 u64 gen, int mirror_num, u64 physical_for_dev_replace)
2542{
2543 int ret;
2544 u8 csum[BTRFS_CSUM_SIZE];
2545 u32 blocksize;
2546
2547 if (flags & BTRFS_EXTENT_FLAG_DATA) {
2548 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
2549 blocksize = map->stripe_len;
2550 else
2551 blocksize = sctx->fs_info->sectorsize;
2552 spin_lock(&sctx->stat_lock);
2553 sctx->stat.data_extents_scrubbed++;
2554 sctx->stat.data_bytes_scrubbed += len;
2555 spin_unlock(&sctx->stat_lock);
2556 } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
2557 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
2558 blocksize = map->stripe_len;
2559 else
2560 blocksize = sctx->fs_info->nodesize;
2561 spin_lock(&sctx->stat_lock);
2562 sctx->stat.tree_extents_scrubbed++;
2563 sctx->stat.tree_bytes_scrubbed += len;
2564 spin_unlock(&sctx->stat_lock);
2565 } else {
2566 blocksize = sctx->fs_info->sectorsize;
2567 WARN_ON(1);
2568 }
2569
2570 while (len) {
2571 u32 l = min(len, blocksize);
2572 int have_csum = 0;
2573
2574 if (flags & BTRFS_EXTENT_FLAG_DATA) {
2575 /* push csums to sbio */
2576 have_csum = scrub_find_csum(sctx, logical, csum);
2577 if (have_csum == 0)
2578 ++sctx->stat.no_csum;
2579 }
2580 ret = scrub_pages(sctx, logical, l, physical, dev, flags, gen,
2581 mirror_num, have_csum ? csum : NULL,
2582 physical_for_dev_replace);
2583 if (ret)
2584 return ret;
2585 len -= l;
2586 logical += l;
2587 physical += l;
2588 physical_for_dev_replace += l;
2589 }
2590 return 0;
2591}
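
/*
 * Split example for scrub_extent() above (assuming a 4 KiB sectorsize): a
 * 192 KiB data extent on a RAID5/6 chunk with a 64 KiB stripe_len is handed
 * to scrub_pages() in three 64 KiB pieces, each of which becomes one
 * scrub_block of 16 sectors.
 */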
2592
2593static int scrub_pages_for_parity(struct scrub_parity *sparity,
2594 u64 logical, u32 len,
2595 u64 physical, struct btrfs_device *dev,
2596 u64 flags, u64 gen, int mirror_num, u8 *csum)
2597{
2598 struct scrub_ctx *sctx = sparity->sctx;
2599 struct scrub_block *sblock;
2600 const u32 sectorsize = sctx->fs_info->sectorsize;
2601 int index;
2602
2603 ASSERT(IS_ALIGNED(len, sectorsize));
2604
2605 sblock = kzalloc(sizeof(*sblock), GFP_KERNEL);
2606 if (!sblock) {
2607 spin_lock(&sctx->stat_lock);
2608 sctx->stat.malloc_errors++;
2609 spin_unlock(&sctx->stat_lock);
2610 return -ENOMEM;
2611 }
2612
2613 /* one ref inside this function, plus one for each page added to
2614 * a bio later on */
2615 refcount_set(&sblock->refs, 1);
2616 sblock->sctx = sctx;
2617 sblock->no_io_error_seen = 1;
2618 sblock->sparity = sparity;
2619 scrub_parity_get(sparity);
2620
2621 for (index = 0; len > 0; index++) {
2622 struct scrub_page *spage;
2623
2624 spage = kzalloc(sizeof(*spage), GFP_KERNEL);
2625 if (!spage) {
2626leave_nomem:
2627 spin_lock(&sctx->stat_lock);
2628 sctx->stat.malloc_errors++;
2629 spin_unlock(&sctx->stat_lock);
2630 scrub_block_put(sblock);
2631 return -ENOMEM;
2632 }
2633 BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK);
2634 /* For scrub block */
2635 scrub_page_get(spage);
2636 sblock->pagev[index] = spage;
2637 /* For scrub parity */
2638 scrub_page_get(spage);
2639 list_add_tail(&spage->list, &sparity->spages);
2640 spage->sblock = sblock;
2641 spage->dev = dev;
2642 spage->flags = flags;
2643 spage->generation = gen;
2644 spage->logical = logical;
2645 spage->physical = physical;
2646 spage->mirror_num = mirror_num;
2647 if (csum) {
2648 spage->have_csum = 1;
2649 memcpy(spage->csum, csum, sctx->fs_info->csum_size);
2650 } else {
2651 spage->have_csum = 0;
2652 }
2653 sblock->page_count++;
2654 spage->page = alloc_page(GFP_KERNEL);
2655 if (!spage->page)
2656 goto leave_nomem;
2657
2659 /* Iterate over the stripe range in sectorsize steps */
2660 len -= sectorsize;
2661 logical += sectorsize;
2662 physical += sectorsize;
2663 }
2664
2665 WARN_ON(sblock->page_count == 0);
2666 for (index = 0; index < sblock->page_count; index++) {
2667 struct scrub_page *spage = sblock->pagev[index];
2668 int ret;
2669
2670 ret = scrub_add_page_to_rd_bio(sctx, spage);
2671 if (ret) {
2672 scrub_block_put(sblock);
2673 return ret;
2674 }
2675 }
2676
2677 /* last one frees, either here or in bio completion for last page */
2678 scrub_block_put(sblock);
2679 return 0;
2680}
2681
2682static int scrub_extent_for_parity(struct scrub_parity *sparity,
2683 u64 logical, u32 len,
2684 u64 physical, struct btrfs_device *dev,
2685 u64 flags, u64 gen, int mirror_num)
2686{
2687 struct scrub_ctx *sctx = sparity->sctx;
2688 int ret;
2689 u8 csum[BTRFS_CSUM_SIZE];
2690 u32 blocksize;
2691
2692 if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state)) {
2693 scrub_parity_mark_sectors_error(sparity, logical, len);
2694 return 0;
2695 }
2696
2697 if (flags & BTRFS_EXTENT_FLAG_DATA) {
2698 blocksize = sparity->stripe_len;
2699 } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
2700 blocksize = sparity->stripe_len;
2701 } else {
2702 blocksize = sctx->fs_info->sectorsize;
2703 WARN_ON(1);
2704 }
2705
2706 while (len) {
2707 u32 l = min(len, blocksize);
2708 int have_csum = 0;
2709
2710 if (flags & BTRFS_EXTENT_FLAG_DATA) {
2711 /* push csums to sbio */
2712 have_csum = scrub_find_csum(sctx, logical, csum);
2713 if (have_csum == 0)
2714 goto skip;
2715 }
2716 ret = scrub_pages_for_parity(sparity, logical, l, physical, dev,
2717 flags, gen, mirror_num,
2718 have_csum ? csum : NULL);
2719 if (ret)
2720 return ret;
2721skip:
2722 len -= l;
2723 logical += l;
2724 physical += l;
2725 }
2726 return 0;
2727}
2728
2729/*
2730 * Given a physical address, this will calculate its
2731 * logical offset. If this is a parity stripe, it will return
2732 * the leftmost data stripe's logical offset.
2733 *
2734 * Return 0 if it is a data stripe, 1 if it is a parity stripe.
2735 */
2736static int get_raid56_logic_offset(u64 physical, int num,
2737 struct map_lookup *map, u64 *offset,
2738 u64 *stripe_start)
2739{
2740 int i;
2741 int j = 0;
2742 u64 stripe_nr;
2743 u64 last_offset;
2744 u32 stripe_index;
2745 u32 rot;
2746 const int data_stripes = nr_data_stripes(map);
2747
2748 last_offset = (physical - map->stripes[num].physical) * data_stripes;
2749 if (stripe_start)
2750 *stripe_start = last_offset;
2751
2752 *offset = last_offset;
2753 for (i = 0; i < data_stripes; i++) {
2754 *offset = last_offset + i * map->stripe_len;
2755
2756 stripe_nr = div64_u64(*offset, map->stripe_len);
2757 stripe_nr = div_u64(stripe_nr, data_stripes);
2758
2759 /* Work out the disk rotation on this stripe-set */
2760 stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, &rot);
2761 /* calculate which stripe this data locates */
2762 rot += i;
2763 stripe_index = rot % map->num_stripes;
2764 if (stripe_index == num)
2765 return 0;
2766 if (stripe_index < num)
2767 j++;
2768 }
2769 *offset = last_offset + j * map->stripe_len;
2770 return 1;
2771}
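
/*
 * Rotation example for the helper above (illustrative layout, not from the
 * code): RAID5 over 3 devices with 2 data stripes and a 64 KiB stripe_len.
 * For device 0 at physical offset 64 KiB, last_offset = 128 KiB; neither
 * data stripe of that full stripe rotates onto device 0 (rot = 1 puts them
 * on devices 1 and 2), so the function returns 1 (parity) with *offset
 * pointing at the leftmost data stripe, 128 KiB into the chunk.
 */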
2772
2773static void scrub_free_parity(struct scrub_parity *sparity)
2774{
2775 struct scrub_ctx *sctx = sparity->sctx;
2776 struct scrub_page *curr, *next;
2777 int nbits;
2778
2779 nbits = bitmap_weight(sparity->ebitmap, sparity->nsectors);
2780 if (nbits) {
2781 spin_lock(&sctx->stat_lock);
2782 sctx->stat.read_errors += nbits;
2783 sctx->stat.uncorrectable_errors += nbits;
2784 spin_unlock(&sctx->stat_lock);
2785 }
2786
2787 list_for_each_entry_safe(curr, next, &sparity->spages, list) {
2788 list_del_init(&curr->list);
2789 scrub_page_put(curr);
2790 }
2791
2792 kfree(sparity);
2793}
2794
2795static void scrub_parity_bio_endio_worker(struct btrfs_work *work)
2796{
2797 struct scrub_parity *sparity = container_of(work, struct scrub_parity,
2798 work);
2799 struct scrub_ctx *sctx = sparity->sctx;
2800
2801 scrub_free_parity(sparity);
2802 scrub_pending_bio_dec(sctx);
2803}
2804
2805static void scrub_parity_bio_endio(struct bio *bio)
2806{
2807 struct scrub_parity *sparity = (struct scrub_parity *)bio->bi_private;
2808 struct btrfs_fs_info *fs_info = sparity->sctx->fs_info;
2809
2810 if (bio->bi_status)
2811 bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap,
2812 sparity->nsectors);
2813
2814 bio_put(bio);
2815
2816 btrfs_init_work(&sparity->work, scrub_parity_bio_endio_worker, NULL,
2817 NULL);
2818 btrfs_queue_work(fs_info->scrub_parity_workers, &sparity->work);
2819}
2820
2821static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
2822{
2823 struct scrub_ctx *sctx = sparity->sctx;
2824 struct btrfs_fs_info *fs_info = sctx->fs_info;
2825 struct bio *bio;
2826 struct btrfs_raid_bio *rbio;
2827 struct btrfs_bio *bbio = NULL;
2828 u64 length;
2829 int ret;
2830
2831 if (!bitmap_andnot(sparity->dbitmap, sparity->dbitmap, sparity->ebitmap,
2832 sparity->nsectors))
2833 goto out;
2834
2835 length = sparity->logic_end - sparity->logic_start;
2836
2837 btrfs_bio_counter_inc_blocked(fs_info);
2838 ret = btrfs_map_sblock(fs_info, BTRFS_MAP_WRITE, sparity->logic_start,
2839 &length, &bbio);
2840 if (ret || !bbio || !bbio->raid_map)
2841 goto bbio_out;
2842
2843 bio = btrfs_io_bio_alloc(0);
2844 bio->bi_iter.bi_sector = sparity->logic_start >> 9;
2845 bio->bi_private = sparity;
2846 bio->bi_end_io = scrub_parity_bio_endio;
2847
2848 rbio = raid56_parity_alloc_scrub_rbio(fs_info, bio, bbio,
2849 length, sparity->scrub_dev,
2850 sparity->dbitmap,
2851 sparity->nsectors);
2852 if (!rbio)
2853 goto rbio_out;
2854
2855 scrub_pending_bio_inc(sctx);
2856 raid56_parity_submit_scrub_rbio(rbio);
2857 return;
2858
2859rbio_out:
2860 bio_put(bio);
2861bbio_out:
2862 btrfs_bio_counter_dec(fs_info);
2863 btrfs_put_bbio(bbio);
2864 bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap,
2865 sparity->nsectors);
2866 spin_lock(&sctx->stat_lock);
2867 sctx->stat.malloc_errors++;
2868 spin_unlock(&sctx->stat_lock);
2869out:
2870 scrub_free_parity(sparity);
2871}
2872
2873static inline int scrub_calc_parity_bitmap_len(int nsectors)
2874{
2875 return DIV_ROUND_UP(nsectors, BITS_PER_LONG) * sizeof(long);
2876}
2877
2878static void scrub_parity_get(struct scrub_parity *sparity)
2879{
2880 refcount_inc(&sparity->refs);
2881}
2882
2883static void scrub_parity_put(struct scrub_parity *sparity)
2884{
2885 if (!refcount_dec_and_test(&sparity->refs))
2886 return;
2887
2888 scrub_parity_check_and_repair(sparity);
2889}
2890
2891static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx,
2892 struct map_lookup *map,
2893 struct btrfs_device *sdev,
2894 struct btrfs_path *path,
2895 u64 logic_start,
2896 u64 logic_end)
2897{
2898 struct btrfs_fs_info *fs_info = sctx->fs_info;
2899 struct btrfs_root *root = fs_info->extent_root;
2900 struct btrfs_root *csum_root = fs_info->csum_root;
2901 struct btrfs_extent_item *extent;
2902 struct btrfs_bio *bbio = NULL;
2903 u64 flags;
2904 int ret;
2905 int slot;
2906 struct extent_buffer *l;
2907 struct btrfs_key key;
2908 u64 generation;
2909 u64 extent_logical;
2910 u64 extent_physical;
2911 /* Check the comment in scrub_stripe() for why u32 is enough here */
2912 u32 extent_len;
2913 u64 mapped_length;
2914 struct btrfs_device *extent_dev;
2915 struct scrub_parity *sparity;
2916 int nsectors;
2917 int bitmap_len;
2918 int extent_mirror_num;
2919 int stop_loop = 0;
2920
2921 ASSERT(map->stripe_len <= U32_MAX);
2922 nsectors = map->stripe_len >> fs_info->sectorsize_bits;
2923 bitmap_len = scrub_calc_parity_bitmap_len(nsectors);
2924 sparity = kzalloc(sizeof(struct scrub_parity) + 2 * bitmap_len,
2925 GFP_NOFS);
2926 if (!sparity) {
2927 spin_lock(&sctx->stat_lock);
2928 sctx->stat.malloc_errors++;
2929 spin_unlock(&sctx->stat_lock);
2930 return -ENOMEM;
2931 }
2932
2933 ASSERT(map->stripe_len <= U32_MAX);
2934 sparity->stripe_len = map->stripe_len;
2935 sparity->nsectors = nsectors;
2936 sparity->sctx = sctx;
2937 sparity->scrub_dev = sdev;
2938 sparity->logic_start = logic_start;
2939 sparity->logic_end = logic_end;
2940 refcount_set(&sparity->refs, 1);
2941 INIT_LIST_HEAD(&sparity->spages);
2942 sparity->dbitmap = sparity->bitmap;
2943 sparity->ebitmap = (void *)sparity->bitmap + bitmap_len;
2944
2945 ret = 0;
2946 while (logic_start < logic_end) {
2947 if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
2948 key.type = BTRFS_METADATA_ITEM_KEY;
2949 else
2950 key.type = BTRFS_EXTENT_ITEM_KEY;
2951 key.objectid = logic_start;
2952 key.offset = (u64)-1;
2953
2954 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2955 if (ret < 0)
2956 goto out;
2957
2958 if (ret > 0) {
2959 ret = btrfs_previous_extent_item(root, path, 0);
2960 if (ret < 0)
2961 goto out;
2962 if (ret > 0) {
2963 btrfs_release_path(path);
2964 ret = btrfs_search_slot(NULL, root, &key,
2965 path, 0, 0);
2966 if (ret < 0)
2967 goto out;
2968 }
2969 }
2970
2971 stop_loop = 0;
2972 while (1) {
2973 u64 bytes;
2974
2975 l = path->nodes[0];
2976 slot = path->slots[0];
2977 if (slot >= btrfs_header_nritems(l)) {
2978 ret = btrfs_next_leaf(root, path);
2979 if (ret == 0)
2980 continue;
2981 if (ret < 0)
2982 goto out;
2983
2984 stop_loop = 1;
2985 break;
2986 }
2987 btrfs_item_key_to_cpu(l, &key, slot);
2988
2989 if (key.type != BTRFS_EXTENT_ITEM_KEY &&
2990 key.type != BTRFS_METADATA_ITEM_KEY)
2991 goto next;
2992
2993 if (key.type == BTRFS_METADATA_ITEM_KEY)
2994 bytes = fs_info->nodesize;
2995 else
2996 bytes = key.offset;
2997
2998 if (key.objectid + bytes <= logic_start)
2999 goto next;
3000
3001 if (key.objectid >= logic_end) {
3002 stop_loop = 1;
3003 break;
3004 }
3005
3006 while (key.objectid >= logic_start + map->stripe_len)
3007 logic_start += map->stripe_len;
3008
3009 extent = btrfs_item_ptr(l, slot,
3010 struct btrfs_extent_item);
3011 flags = btrfs_extent_flags(l, extent);
3012 generation = btrfs_extent_generation(l, extent);
3013
3014 if ((flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) &&
3015 (key.objectid < logic_start ||
3016 key.objectid + bytes >
3017 logic_start + map->stripe_len)) {
3018 btrfs_err(fs_info,
3019 "scrub: tree block %llu spanning stripes, ignored. logical=%llu",
3020 key.objectid, logic_start);
3021 spin_lock(&sctx->stat_lock);
3022 sctx->stat.uncorrectable_errors++;
3023 spin_unlock(&sctx->stat_lock);
3024 goto next;
3025 }
3026again:
3027 extent_logical = key.objectid;
3028 ASSERT(bytes <= U32_MAX);
3029 extent_len = bytes;
3030
3031 if (extent_logical < logic_start) {
3032 extent_len -= logic_start - extent_logical;
3033 extent_logical = logic_start;
3034 }
3035
3036 if (extent_logical + extent_len >
3037 logic_start + map->stripe_len)
3038 extent_len = logic_start + map->stripe_len -
3039 extent_logical;
3040
3041 scrub_parity_mark_sectors_data(sparity, extent_logical,
3042 extent_len);
3043
3044 mapped_length = extent_len;
3045 bbio = NULL;
3046 ret = btrfs_map_block(fs_info, BTRFS_MAP_READ,
3047 extent_logical, &mapped_length, &bbio,
3048 0);
3049 if (!ret) {
3050 if (!bbio || mapped_length < extent_len)
3051 ret = -EIO;
3052 }
3053 if (ret) {
3054 btrfs_put_bbio(bbio);
3055 goto out;
3056 }
3057 extent_physical = bbio->stripes[0].physical;
3058 extent_mirror_num = bbio->mirror_num;
3059 extent_dev = bbio->stripes[0].dev;
3060 btrfs_put_bbio(bbio);
3061
3062 ret = btrfs_lookup_csums_range(csum_root,
3063 extent_logical,
3064 extent_logical + extent_len - 1,
3065 &sctx->csum_list, 1);
3066 if (ret)
3067 goto out;
3068
3069 ret = scrub_extent_for_parity(sparity, extent_logical,
3070 extent_len,
3071 extent_physical,
3072 extent_dev, flags,
3073 generation,
3074 extent_mirror_num);
3075
3076 scrub_free_csums(sctx);
3077
3078 if (ret)
3079 goto out;
3080
3081 if (extent_logical + extent_len <
3082 key.objectid + bytes) {
3083 logic_start += map->stripe_len;
3084
3085 if (logic_start >= logic_end) {
3086 stop_loop = 1;
3087 break;
3088 }
3089
3090 if (logic_start < key.objectid + bytes) {
3091 cond_resched();
3092 goto again;
3093 }
3094 }
3095next:
3096 path->slots[0]++;
3097 }
3098
3099 btrfs_release_path(path);
3100
3101 if (stop_loop)
3102 break;
3103
3104 logic_start += map->stripe_len;
3105 }
3106out:
3107 if (ret < 0) {
3108 ASSERT(logic_end - logic_start <= U32_MAX);
3109 scrub_parity_mark_sectors_error(sparity, logic_start,
3110 logic_end - logic_start);
3111 }
3112 scrub_parity_put(sparity);
3113 scrub_submit(sctx);
3114 mutex_lock(&sctx->wr_lock);
3115 scrub_wr_submit(sctx);
3116 mutex_unlock(&sctx->wr_lock);
3117
3118 btrfs_release_path(path);
3119 return ret < 0 ? ret : 0;
3120}
3121
3122static void sync_replace_for_zoned(struct scrub_ctx *sctx)
3123{
3124 if (!btrfs_is_zoned(sctx->fs_info))
3125 return;
3126
3127 sctx->flush_all_writes = true;
3128 scrub_submit(sctx);
3129 mutex_lock(&sctx->wr_lock);
3130 scrub_wr_submit(sctx);
3131 mutex_unlock(&sctx->wr_lock);
3132
3133 wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
3134}
3135
3136static int sync_write_pointer_for_zoned(struct scrub_ctx *sctx, u64 logical,
3137 u64 physical, u64 physical_end)
3138{
3139 struct btrfs_fs_info *fs_info = sctx->fs_info;
3140 int ret = 0;
3141
3142 if (!btrfs_is_zoned(fs_info))
3143 return 0;
3144
3145 wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
3146
3147 mutex_lock(&sctx->wr_lock);
3148 if (sctx->write_pointer < physical_end) {
3149 ret = btrfs_sync_zone_write_pointer(sctx->wr_tgtdev, logical,
3150 physical,
3151 sctx->write_pointer);
3152 if (ret)
3153 btrfs_err(fs_info,
3154 "zoned: failed to recover write pointer");
3155 }
3156 mutex_unlock(&sctx->wr_lock);
3157 btrfs_dev_clear_zone_empty(sctx->wr_tgtdev, physical);
3158
3159 return ret;
3160}
3161
3162static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
3163 struct map_lookup *map,
3164 struct btrfs_device *scrub_dev,
3165 int num, u64 base, u64 length,
3166 struct btrfs_block_group *cache)
3167{
3168 struct btrfs_path *path, *ppath;
3169 struct btrfs_fs_info *fs_info = sctx->fs_info;
3170 struct btrfs_root *root = fs_info->extent_root;
3171 struct btrfs_root *csum_root = fs_info->csum_root;
3172 struct btrfs_extent_item *extent;
3173 struct blk_plug plug;
3174 u64 flags;
3175 int ret;
3176 int slot;
3177 u64 nstripes;
3178 struct extent_buffer *l;
3179 u64 physical;
3180 u64 logical;
3181 u64 logic_end;
3182 u64 physical_end;
3183 u64 generation;
3184 int mirror_num;
3185 struct reada_control *reada1;
3186 struct reada_control *reada2;
3187 struct btrfs_key key;
3188 struct btrfs_key key_end;
3189 u64 increment = map->stripe_len;
3190 u64 offset;
3191 u64 extent_logical;
3192 u64 extent_physical;
3193 /*
3194 * Unlike chunk length, extent length should never go beyond
3195 * BTRFS_MAX_EXTENT_SIZE, thus u32 is enough here.
3196 */
3197 u32 extent_len;
3198 u64 stripe_logical;
3199 u64 stripe_end;
3200 struct btrfs_device *extent_dev;
3201 int extent_mirror_num;
3202 int stop_loop = 0;
3203
3204 physical = map->stripes[num].physical;
3205 offset = 0;
3206 nstripes = div64_u64(length, map->stripe_len);
3207 if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
3208 offset = map->stripe_len * num;
3209 increment = map->stripe_len * map->num_stripes;
3210 mirror_num = 1;
3211 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
3212 int factor = map->num_stripes / map->sub_stripes;
3213 offset = map->stripe_len * (num / map->sub_stripes);
3214 increment = map->stripe_len * factor;
3215 mirror_num = num % map->sub_stripes + 1;
3216 } else if (map->type & BTRFS_BLOCK_GROUP_RAID1_MASK) {
3217 increment = map->stripe_len;
3218 mirror_num = num % map->num_stripes + 1;
3219 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
3220 increment = map->stripe_len;
3221 mirror_num = num % map->num_stripes + 1;
3222 } else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
3223 get_raid56_logic_offset(physical, num, map, &offset, NULL);
3224 increment = map->stripe_len * nr_data_stripes(map);
3225 mirror_num = 1;
3226 } else {
3227 increment = map->stripe_len;
3228 mirror_num = 1;
3229 }
3230
3231 path = btrfs_alloc_path();
3232 if (!path)
3233 return -ENOMEM;
3234
3235 ppath = btrfs_alloc_path();
3236 if (!ppath) {
3237 btrfs_free_path(path);
3238 return -ENOMEM;
3239 }
3240
3241 /*
3242 * Work on the commit root. The related disk blocks are static as
3243 * long as COW is applied. This means it is safe to rewrite
3244 * them to repair disk errors without any race conditions.
3245 */
3246 path->search_commit_root = 1;
3247 path->skip_locking = 1;
3248
3249 ppath->search_commit_root = 1;
3250 ppath->skip_locking = 1;
3251 /*
3252 * Trigger the readahead for the extent tree and csum tree and wait
3253 * for completion. During readahead, the scrub is officially paused
3254 * to not hold off transaction commits.
3255 */
3256 logical = base + offset;
3257 physical_end = physical + nstripes * map->stripe_len;
3258 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
3259 get_raid56_logic_offset(physical_end, num,
3260 map, &logic_end, NULL);
3261 logic_end += base;
3262 } else {
3263 logic_end = logical + increment * nstripes;
3264 }
3265 wait_event(sctx->list_wait,
3266 atomic_read(&sctx->bios_in_flight) == 0);
3267 scrub_blocked_if_needed(fs_info);
3268
3269 /* FIXME it might be better to start readahead at commit root */
3270 key.objectid = logical;
3271 key.type = BTRFS_EXTENT_ITEM_KEY;
3272 key.offset = (u64)0;
3273 key_end.objectid = logic_end;
3274 key_end.type = BTRFS_METADATA_ITEM_KEY;
3275 key_end.offset = (u64)-1;
3276 reada1 = btrfs_reada_add(root, &key, &key_end);
3277
3278 if (cache->flags & BTRFS_BLOCK_GROUP_DATA) {
3279 key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
3280 key.type = BTRFS_EXTENT_CSUM_KEY;
3281 key.offset = logical;
3282 key_end.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
3283 key_end.type = BTRFS_EXTENT_CSUM_KEY;
3284 key_end.offset = logic_end;
3285 reada2 = btrfs_reada_add(csum_root, &key, &key_end);
3286 } else {
3287 reada2 = NULL;
3288 }
3289
3290 if (!IS_ERR(reada1))
3291 btrfs_reada_wait(reada1);
3292 if (!IS_ERR_OR_NULL(reada2))
3293 btrfs_reada_wait(reada2);
3294
3295
3296 /*
3297 * Collect all data csums for the stripe to avoid seeking during
3298 * the scrub. This might currently (crc32) end up being about 1MB.
3299 */
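/*
 * Rough size estimate (assuming crc32 and a 4 KiB sectorsize): 4 bytes
 * of csum per sector means about 1 MiB of checksums per 1 GiB of data,
 * which is where the "about 1MB" figure above comes from.
 */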
3300 blk_start_plug(&plug);
3301
3302 if (sctx->is_dev_replace &&
3303 btrfs_dev_is_sequential(sctx->wr_tgtdev, physical)) {
3304 mutex_lock(&sctx->wr_lock);
3305 sctx->write_pointer = physical;
3306 mutex_unlock(&sctx->wr_lock);
3307 sctx->flush_all_writes = true;
3308 }
3309
3310 /*
3311 * now find all extents for each stripe and scrub them
3312 */
3313 ret = 0;
3314 while (physical < physical_end) {
3315 /*
3316 * canceled?
3317 */
3318 if (atomic_read(&fs_info->scrub_cancel_req) ||
3319 atomic_read(&sctx->cancel_req)) {
3320 ret = -ECANCELED;
3321 goto out;
3322 }
3323 /*
3324 * check to see if we have to pause
3325 */
3326 if (atomic_read(&fs_info->scrub_pause_req)) {
3327 /* push queued extents */
3328 sctx->flush_all_writes = true;
3329 scrub_submit(sctx);
3330 mutex_lock(&sctx->wr_lock);
3331 scrub_wr_submit(sctx);
3332 mutex_unlock(&sctx->wr_lock);
3333 wait_event(sctx->list_wait,
3334 atomic_read(&sctx->bios_in_flight) == 0);
3335 sctx->flush_all_writes = false;
3336 scrub_blocked_if_needed(fs_info);
3337 }
3338
3339 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
3340 ret = get_raid56_logic_offset(physical, num, map,
3341 &logical,
3342 &stripe_logical);
3343 logical += base;
3344 if (ret) {
3345 /* it is a parity stripe */
3346 stripe_logical += base;
3347 stripe_end = stripe_logical + increment;
3348 ret = scrub_raid56_parity(sctx, map, scrub_dev,
3349 ppath, stripe_logical,
3350 stripe_end);
3351 if (ret)
3352 goto out;
3353 goto skip;
3354 }
3355 }
3356
3357 if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
3358 key.type = BTRFS_METADATA_ITEM_KEY;
3359 else
3360 key.type = BTRFS_EXTENT_ITEM_KEY;
3361 key.objectid = logical;
3362 key.offset = (u64)-1;
3363
3364 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3365 if (ret < 0)
3366 goto out;
3367
3368 if (ret > 0) {
3369 ret = btrfs_previous_extent_item(root, path, 0);
3370 if (ret < 0)
3371 goto out;
3372 if (ret > 0) {
3373 /* there's no smaller item, so stick with the
3374 * larger one */
3375 btrfs_release_path(path);
3376 ret = btrfs_search_slot(NULL, root, &key,
3377 path, 0, 0);
3378 if (ret < 0)
3379 goto out;
3380 }
3381 }
3382
3383 stop_loop = 0;
3384 while (1) {
3385 u64 bytes;
3386
3387 l = path->nodes[0];
3388 slot = path->slots[0];
3389 if (slot >= btrfs_header_nritems(l)) {
3390 ret = btrfs_next_leaf(root, path);
3391 if (ret == 0)
3392 continue;
3393 if (ret < 0)
3394 goto out;
3395
3396 stop_loop = 1;
3397 break;
3398 }
3399 btrfs_item_key_to_cpu(l, &key, slot);
3400
3401 if (key.type != BTRFS_EXTENT_ITEM_KEY &&
3402 key.type != BTRFS_METADATA_ITEM_KEY)
3403 goto next;
3404
3405 if (key.type == BTRFS_METADATA_ITEM_KEY)
3406 bytes = fs_info->nodesize;
3407 else
3408 bytes = key.offset;
3409
3410 if (key.objectid + bytes <= logical)
3411 goto next;
3412
3413 if (key.objectid >= logical + map->stripe_len) {
3414 /* out of this device extent */
3415 if (key.objectid >= logic_end)
3416 stop_loop = 1;
3417 break;
3418 }
3419
3420 /*
3421 * If our block group was removed in the meanwhile, just
3422 * stop scrubbing since there is no point in continuing.
3423 * Continuing would prevent reusing its device extents
3424 * for new block groups for a long time.
3425 */
3426 spin_lock(&cache->lock);
3427 if (cache->removed) {
3428 spin_unlock(&cache->lock);
3429 ret = 0;
3430 goto out;
3431 }
3432 spin_unlock(&cache->lock);
3433
3434 extent = btrfs_item_ptr(l, slot,
3435 struct btrfs_extent_item);
3436 flags = btrfs_extent_flags(l, extent);
3437 generation = btrfs_extent_generation(l, extent);
3438
3439 if ((flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) &&
3440 (key.objectid < logical ||
3441 key.objectid + bytes >
3442 logical + map->stripe_len)) {
3443 btrfs_err(fs_info,
3444 "scrub: tree block %llu spanning stripes, ignored. logical=%llu",
3445 key.objectid, logical);
3446 spin_lock(&sctx->stat_lock);
3447 sctx->stat.uncorrectable_errors++;
3448 spin_unlock(&sctx->stat_lock);
3449 goto next;
3450 }
3451
3452again:
3453 extent_logical = key.objectid;
3454 ASSERT(bytes <= U32_MAX);
3455 extent_len = bytes;
3456
3457 /*
3458 * trim extent to this stripe
3459 */
3460 if (extent_logical < logical) {
3461 extent_len -= logical - extent_logical;
3462 extent_logical = logical;
3463 }
3464 if (extent_logical + extent_len >
3465 logical + map->stripe_len) {
3466 extent_len = logical + map->stripe_len -
3467 extent_logical;
3468 }
3469
3470 extent_physical = extent_logical - logical + physical;
3471 extent_dev = scrub_dev;
3472 extent_mirror_num = mirror_num;
3473 if (sctx->is_dev_replace)
3474 scrub_remap_extent(fs_info, extent_logical,
3475 extent_len, &extent_physical,
3476 &extent_dev,
3477 &extent_mirror_num);
3478
3479 if (flags & BTRFS_EXTENT_FLAG_DATA) {
3480 ret = btrfs_lookup_csums_range(csum_root,
3481 extent_logical,
3482 extent_logical + extent_len - 1,
3483 &sctx->csum_list, 1);
3484 if (ret)
3485 goto out;
3486 }
3487
3488 ret = scrub_extent(sctx, map, extent_logical, extent_len,
3489 extent_physical, extent_dev, flags,
3490 generation, extent_mirror_num,
3491 extent_logical - logical + physical);
3492
3493 scrub_free_csums(sctx);
3494
3495 if (ret)
3496 goto out;
3497
3498 if (sctx->is_dev_replace)
3499 sync_replace_for_zoned(sctx);
3500
3501 if (extent_logical + extent_len <
3502 key.objectid + bytes) {
3503 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
3504 /*
3505 * loop until we find next data stripe
3506 * or we have finished all stripes.
3507 */
3508loop:
3509 physical += map->stripe_len;
3510 ret = get_raid56_logic_offset(physical,
3511 num, map, &logical,
3512 &stripe_logical);
3513 logical += base;
3514
3515 if (ret && physical < physical_end) {
3516 stripe_logical += base;
3517 stripe_end = stripe_logical +
3518 increment;
3519 ret = scrub_raid56_parity(sctx,
3520 map, scrub_dev, ppath,
3521 stripe_logical,
3522 stripe_end);
3523 if (ret)
3524 goto out;
3525 goto loop;
3526 }
3527 } else {
3528 physical += map->stripe_len;
3529 logical += increment;
3530 }
3531 if (logical < key.objectid + bytes) {
3532 cond_resched();
3533 goto again;
3534 }
3535
3536 if (physical >= physical_end) {
3537 stop_loop = 1;
3538 break;
3539 }
3540 }
3541next:
3542 path->slots[0]++;
3543 }
3544 btrfs_release_path(path);
3545skip:
3546 logical += increment;
3547 physical += map->stripe_len;
3548 spin_lock(&sctx->stat_lock);
3549 if (stop_loop)
3550 sctx->stat.last_physical = map->stripes[num].physical +
3551 length;
3552 else
3553 sctx->stat.last_physical = physical;
3554 spin_unlock(&sctx->stat_lock);
3555 if (stop_loop)
3556 break;
3557 }
3558out:
3559 /* push queued extents */
3560 scrub_submit(sctx);
3561 mutex_lock(&sctx->wr_lock);
3562 scrub_wr_submit(sctx);
3563 mutex_unlock(&sctx->wr_lock);
3564
3565 blk_finish_plug(&plug);
3566 btrfs_free_path(path);
3567 btrfs_free_path(ppath);
3568
3569 if (sctx->is_dev_replace && ret >= 0) {
3570 int ret2;
3571
3572 ret2 = sync_write_pointer_for_zoned(sctx, base + offset,
3573 map->stripes[num].physical,
3574 physical_end);
3575 if (ret2)
3576 ret = ret2;
3577 }
3578
3579 return ret < 0 ? ret : 0;
3580}
3581
3582static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,
3583 struct btrfs_device *scrub_dev,
3584 u64 chunk_offset, u64 length,
3585 u64 dev_offset,
3586 struct btrfs_block_group *cache)
3587{
3588 struct btrfs_fs_info *fs_info = sctx->fs_info;
3589 struct extent_map_tree *map_tree = &fs_info->mapping_tree;
3590 struct map_lookup *map;
3591 struct extent_map *em;
3592 int i;
3593 int ret = 0;
3594
3595 read_lock(&map_tree->lock);
3596 em = lookup_extent_mapping(map_tree, chunk_offset, 1);
3597 read_unlock(&map_tree->lock);
3598
3599 if (!em) {
3600 /*
3601 * Might have been an unused block group deleted by the cleaner
3602 * kthread or relocation.
3603 */
3604 spin_lock(&cache->lock);
3605 if (!cache->removed)
3606 ret = -EINVAL;
3607 spin_unlock(&cache->lock);
3608
3609 return ret;
3610 }
3611
3612 map = em->map_lookup;
3613 if (em->start != chunk_offset)
3614 goto out;
3615
3616 if (em->len < length)
3617 goto out;
3618
3619 for (i = 0; i < map->num_stripes; ++i) {
3620 if (map->stripes[i].dev->bdev == scrub_dev->bdev &&
3621 map->stripes[i].physical == dev_offset) {
3622 ret = scrub_stripe(sctx, map, scrub_dev, i,
3623 chunk_offset, length, cache);
3624 if (ret)
3625 goto out;
3626 }
3627 }
3628out:
3629 free_extent_map(em);
3630
3631 return ret;
3632}
3633
3634static int finish_extent_writes_for_zoned(struct btrfs_root *root,
3635 struct btrfs_block_group *cache)
3636{
3637 struct btrfs_fs_info *fs_info = cache->fs_info;
3638 struct btrfs_trans_handle *trans;
3639
3640 if (!btrfs_is_zoned(fs_info))
3641 return 0;
3642
3643 btrfs_wait_block_group_reservations(cache);
3644 btrfs_wait_nocow_writers(cache);
3645 btrfs_wait_ordered_roots(fs_info, U64_MAX, cache->start, cache->length);
3646
3647 trans = btrfs_join_transaction(root);
3648 if (IS_ERR(trans))
3649 return PTR_ERR(trans);
3650 return btrfs_commit_transaction(trans);
3651}
3652
3653static noinline_for_stack
3654int scrub_enumerate_chunks(struct scrub_ctx *sctx,
3655 struct btrfs_device *scrub_dev, u64 start, u64 end)
3656{
3657 struct btrfs_dev_extent *dev_extent = NULL;
3658 struct btrfs_path *path;
3659 struct btrfs_fs_info *fs_info = sctx->fs_info;
3660 struct btrfs_root *root = fs_info->dev_root;
3661 u64 length;
3662 u64 chunk_offset;
3663 int ret = 0;
3664 int ro_set;
3665 int slot;
3666 struct extent_buffer *l;
3667 struct btrfs_key key;
3668 struct btrfs_key found_key;
3669 struct btrfs_block_group *cache;
3670 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
3671
3672 path = btrfs_alloc_path();
3673 if (!path)
3674 return -ENOMEM;
3675
3676 path->reada = READA_FORWARD;
3677 path->search_commit_root = 1;
3678 path->skip_locking = 1;
3679
3680 key.objectid = scrub_dev->devid;
3681 key.offset = 0ull;
3682 key.type = BTRFS_DEV_EXTENT_KEY;
3683
3684 while (1) {
3685 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3686 if (ret < 0)
3687 break;
3688 if (ret > 0) {
3689 if (path->slots[0] >=
3690 btrfs_header_nritems(path->nodes[0])) {
3691 ret = btrfs_next_leaf(root, path);
3692 if (ret < 0)
3693 break;
3694 if (ret > 0) {
3695 ret = 0;
3696 break;
3697 }
3698 } else {
3699 ret = 0;
3700 }
3701 }
3702
3703 l = path->nodes[0];
3704 slot = path->slots[0];
3705
3706 btrfs_item_key_to_cpu(l, &found_key, slot);
3707
3708 if (found_key.objectid != scrub_dev->devid)
3709 break;
3710
3711 if (found_key.type != BTRFS_DEV_EXTENT_KEY)
3712 break;
3713
3714 if (found_key.offset >= end)
3715 break;
3716
3717 if (found_key.offset < key.offset)
3718 break;
3719
3720 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
3721 length = btrfs_dev_extent_length(l, dev_extent);
3722
3723 if (found_key.offset + length <= start)
3724 goto skip;
3725
3726 chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
3727
3728 /*
3729 * get a reference on the corresponding block group to prevent
3730 * the chunk from going away while we scrub it
3731 */
3732 cache = btrfs_lookup_block_group(fs_info, chunk_offset);
3733
3734 /* some chunks are removed but not committed to disk yet,
3735 * continue scrubbing */
3736 if (!cache)
3737 goto skip;
3738
3739 if (sctx->is_dev_replace && btrfs_is_zoned(fs_info)) {
3740 spin_lock(&cache->lock);
3741 if (!cache->to_copy) {
3742 spin_unlock(&cache->lock);
3743 btrfs_put_block_group(cache);
3744 goto skip;
3745 }
3746 spin_unlock(&cache->lock);
3747 }
3748
3749 /*
3750 * Make sure that while we are scrubbing the corresponding block
3751 * group doesn't get its logical address and its device extents
3752 * reused for another block group, which can possibly be of a
3753 * different type and different profile. We do this to prevent
3754 * false error detections and crashes due to bogus attempts to
3755 * repair extents.
3756 */
3757 spin_lock(&cache->lock);
3758 if (cache->removed) {
3759 spin_unlock(&cache->lock);
3760 btrfs_put_block_group(cache);
3761 goto skip;
3762 }
3763 btrfs_freeze_block_group(cache);
3764 spin_unlock(&cache->lock);
3765
3766 /*
3767 * We need to call btrfs_inc_block_group_ro() with scrubs_paused,
3768 * to avoid deadlock caused by:
3769 * btrfs_inc_block_group_ro()
3770 * -> btrfs_wait_for_commit()
3771 * -> btrfs_commit_transaction()
3772 * -> btrfs_scrub_pause()
3773 */
3774 scrub_pause_on(fs_info);
3775
3776 /*
3777 * Don't do chunk preallocation for scrub.
3778 *
3779 * This is especially important for SYSTEM bgs, or we can hit
3780 * -EFBIG from btrfs_finish_chunk_alloc() like:
3781 * 1. The only SYSTEM bg is marked RO.
3782 * Since SYSTEM bg is small, that's pretty common.
3783 * 2. New SYSTEM bg will be allocated
3784 * Because the regular (non-scrub) path will allocate a new chunk.
3785 * 3. New SYSTEM bg is empty and will get cleaned up
3786 * Before cleanup really happens, it's marked RO again.
3787 * 4. Empty SYSTEM bg gets scrubbed
3788 * We go back to 2.
3789 *
3790 * This can easily boost the number of SYSTEM chunks if the cleaner
3791 * thread can't be triggered fast enough, and use up all the space
3792 * of btrfs_super_block::sys_chunk_array.
3793 *
3794 * While for dev replace, we need to try our best to mark block
3795 * group RO, to prevent race between:
3796 * - Write duplication
3797 * Contains latest data
3798 * - Scrub copy
3799 * Contains data from commit tree
3800 *
3801 * If target block group is not marked RO, nocow writes can
3802 * be overwritten by scrub copy, causing data corruption.
3803 * So for dev-replace, it's not allowed to continue if a block
3804 * group is not RO.
3805 */
3806 ret = btrfs_inc_block_group_ro(cache, sctx->is_dev_replace);
3807 if (!ret && sctx->is_dev_replace) {
3808 ret = finish_extent_writes_for_zoned(root, cache);
3809 if (ret) {
3810 btrfs_dec_block_group_ro(cache);
3811 scrub_pause_off(fs_info);
3812 btrfs_put_block_group(cache);
3813 break;
3814 }
3815 }
3816
3817 if (ret == 0) {
3818 ro_set = 1;
3819 } else if (ret == -ENOSPC && !sctx->is_dev_replace) {
3820 /*
3821 * btrfs_inc_block_group_ro() returns -ENOSPC when it
3822 * fails to create a new chunk for metadata.
3823 * It is not a problem for scrub, because
3824 * metadata is always COWed, and our scrub pauses
3825 * transaction commits.
3826 */
3827 ro_set = 0;
3828 } else if (ret == -ETXTBSY) {
3829 btrfs_warn(fs_info,
3830 "skipping scrub of block group %llu due to active swapfile",
3831 cache->start);
3832 scrub_pause_off(fs_info);
3833 ret = 0;
3834 goto skip_unfreeze;
3835 } else {
3836 btrfs_warn(fs_info,
3837 "failed setting block group ro: %d", ret);
3838 btrfs_unfreeze_block_group(cache);
3839 btrfs_put_block_group(cache);
3840 scrub_pause_off(fs_info);
3841 break;
3842 }
3843
3844 /*
3845 * Now the target block group is marked RO, wait for nocow writes
3846 * to finish before dev-replace.
3847 * COW is fine, as COW never overwrites extents in the commit tree.
3848 */
3849 if (sctx->is_dev_replace) {
3850 btrfs_wait_nocow_writers(cache);
3851 btrfs_wait_ordered_roots(fs_info, U64_MAX, cache->start,
3852 cache->length);
3853 }
3854
3855 scrub_pause_off(fs_info);
3856 down_write(&dev_replace->rwsem);
3857 dev_replace->cursor_right = found_key.offset + length;
3858 dev_replace->cursor_left = found_key.offset;
3859 dev_replace->item_needs_writeback = 1;
3860 up_write(&dev_replace->rwsem);
3861
3862 ret = scrub_chunk(sctx, scrub_dev, chunk_offset, length,
3863 found_key.offset, cache);
3864
3865 /*
3866 * Flush and submit all pending read and write bios, then
3867 * wait for them.
3868 * Note that in the dev replace case, a read request causes
3869 * write requests that are submitted from the read completion
3870 * worker. Therefore all write requests must be flushed here,
3871 * so that all read and write requests are really completed
3872 * when bios_in_flight drops to 0. (A reduced sketch of this
3873 * submit-then-drain pattern follows after this function.)
3874 */
3875 sctx->flush_all_writes = true;
3876 scrub_submit(sctx);
3877 mutex_lock(&sctx->wr_lock);
3878 scrub_wr_submit(sctx);
3879 mutex_unlock(&sctx->wr_lock);
3880
3881 wait_event(sctx->list_wait,
3882 atomic_read(&sctx->bios_in_flight) == 0);
3883
3884 scrub_pause_on(fs_info);
3885
3886 /*
3887 * This must happen before we decrease @scrub_paused.
3888 * Make sure we don't block the transaction commit while
3889 * we are waiting for pending workers to finish.
3890 */
3891 wait_event(sctx->list_wait,
3892 atomic_read(&sctx->workers_pending) == 0);
3893 sctx->flush_all_writes = false;
3894
3895 scrub_pause_off(fs_info);
3896
3897 if (sctx->is_dev_replace &&
3898 !btrfs_finish_block_group_to_copy(dev_replace->srcdev,
3899 cache, found_key.offset))
3900 ro_set = 0;
3901
3902 down_write(&dev_replace->rwsem);
3903 dev_replace->cursor_left = dev_replace->cursor_right;
3904 dev_replace->item_needs_writeback = 1;
3905 up_write(&dev_replace->rwsem);
3906
3907 if (ro_set)
3908 btrfs_dec_block_group_ro(cache);
3909
3910 /*
3911 * We might have prevented the cleaner kthread from deleting
3912 * this block group if it was already unused because we raced
3913 * and set it to RO mode first. So add it back to the unused
3914 * list, otherwise it might not ever be deleted unless a manual
3915 * balance is triggered or it becomes used and unused again.
3916 */
3917 spin_lock(&cache->lock);
3918 if (!cache->removed && !cache->ro && cache->reserved == 0 &&
3919 cache->used == 0) {
3920 spin_unlock(&cache->lock);
3921 if (btrfs_test_opt(fs_info, DISCARD_ASYNC))
3922 btrfs_discard_queue_work(&fs_info->discard_ctl,
3923 cache);
3924 else
3925 btrfs_mark_bg_unused(cache);
3926 } else {
3927 spin_unlock(&cache->lock);
3928 }
3929skip_unfreeze:
3930 btrfs_unfreeze_block_group(cache);
3931 btrfs_put_block_group(cache);
3932 if (ret)
3933 break;
3934 if (sctx->is_dev_replace &&
3935 atomic64_read(&dev_replace->num_write_errors) > 0) {
3936 ret = -EIO;
3937 break;
3938 }
3939 if (sctx->stat.malloc_errors > 0) {
3940 ret = -ENOMEM;
3941 break;
3942 }
3943skip:
3944 key.offset = found_key.offset + length;
3945 btrfs_release_path(path);
3946 }
3947
3948 btrfs_free_path(path);
3949
3950 return ret;
3951}
3952
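/*
 * Editor's illustrative sketch (not part of the original source): the
 * submit-then-drain pattern referenced in the comment inside
 * scrub_enumerate_chunks() above, reduced to its core. Requests bump an
 * in-flight counter on submission, completions drop it and wake the
 * waiter, and the flush side simply waits for the counter to hit zero.
 * All drain_example_* names are hypothetical; initialization of the
 * waitqueue and counter is omitted.
 */
struct drain_example {
	atomic_t in_flight;
	wait_queue_head_t wait;
};

static void drain_example_submit(struct drain_example *d)
{
	/* One more request is now in flight. */
	atomic_inc(&d->in_flight);
	/* ... hand the request off for asynchronous completion ... */
}

static void drain_example_complete(struct drain_example *d)
{
	/* Completion path: drop the count and wake a waiting flusher. */
	if (atomic_dec_and_test(&d->in_flight))
		wake_up(&d->wait);
}

static void drain_example_flush(struct drain_example *d)
{
	/* Wait until every submitted request has completed. */
	wait_event(d->wait, atomic_read(&d->in_flight) == 0);
}
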
3953static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,
3954 struct btrfs_device *scrub_dev)
3955{
3956 int i;
3957 u64 bytenr;
3958 u64 gen;
3959 int ret;
3960 struct btrfs_fs_info *fs_info = sctx->fs_info;
3961
3962 if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
3963 return -EROFS;
3964
3965 /* Seed devices of a new filesystem have their own generation. */
3966 if (scrub_dev->fs_devices != fs_info->fs_devices)
3967 gen = scrub_dev->generation;
3968 else
3969 gen = fs_info->last_trans_committed;
3970
3971 for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
3972 bytenr = btrfs_sb_offset(i);
3973 if (bytenr + BTRFS_SUPER_INFO_SIZE >
3974 scrub_dev->commit_total_bytes)
3975 break;
3976 if (!btrfs_check_super_location(scrub_dev, bytenr))
3977 continue;
3978
3979 ret = scrub_pages(sctx, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr,
3980 scrub_dev, BTRFS_EXTENT_FLAG_SUPER, gen, i,
3981 NULL, bytenr);
3982 if (ret)
3983 return ret;
3984 }
3985 wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
3986
3987 return 0;
3988}
3989
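/*
 * Editor's illustrative sketch (not part of the original source): where
 * the super block copies scanned by scrub_supers() above normally live.
 * btrfs_sb_offset() places the primary copy at 64 KiB and the mirrors at
 * 64 MiB and 256 GiB (assuming the standard BTRFS_SUPER_MIRROR_SHIFT of
 * 12), which is why the loop stops as soon as a copy would fall beyond
 * commit_total_bytes. The helper name below is hypothetical.
 */
static void example_print_sb_offsets(struct btrfs_device *dev)
{
	int i;

	for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
		u64 bytenr = btrfs_sb_offset(i);

		if (bytenr + BTRFS_SUPER_INFO_SIZE > dev->commit_total_bytes)
			break;	/* this copy does not fit on the device */
		pr_info("super block copy %d at byte offset %llu\n", i, bytenr);
	}
}
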
3990static void scrub_workers_put(struct btrfs_fs_info *fs_info)
3991{
3992 if (refcount_dec_and_mutex_lock(&fs_info->scrub_workers_refcnt,
3993 &fs_info->scrub_lock)) {
3994 struct btrfs_workqueue *scrub_workers = NULL;
3995 struct btrfs_workqueue *scrub_wr_comp = NULL;
3996 struct btrfs_workqueue *scrub_parity = NULL;
3997
3998 scrub_workers = fs_info->scrub_workers;
3999 scrub_wr_comp = fs_info->scrub_wr_completion_workers;
4000 scrub_parity = fs_info->scrub_parity_workers;
4001
4002 fs_info->scrub_workers = NULL;
4003 fs_info->scrub_wr_completion_workers = NULL;
4004 fs_info->scrub_parity_workers = NULL;
4005 mutex_unlock(&fs_info->scrub_lock);
4006
4007 btrfs_destroy_workqueue(scrub_workers);
4008 btrfs_destroy_workqueue(scrub_wr_comp);
4009 btrfs_destroy_workqueue(scrub_parity);
4010 }
4011}
4012
4013/*
4014 * Get a reference on fs_info->scrub_workers. Start the workers if necessary.
4015 */
4016static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
4017 int is_dev_replace)
4018{
4019 struct btrfs_workqueue *scrub_workers = NULL;
4020 struct btrfs_workqueue *scrub_wr_comp = NULL;
4021 struct btrfs_workqueue *scrub_parity = NULL;
4022 unsigned int flags = WQ_FREEZABLE | WQ_UNBOUND;
4023 int max_active = fs_info->thread_pool_size;
4024 int ret = -ENOMEM;
4025
4026 if (refcount_inc_not_zero(&fs_info->scrub_workers_refcnt))
4027 return 0;
4028
4029 scrub_workers = btrfs_alloc_workqueue(fs_info, "scrub", flags,
4030 is_dev_replace ? 1 : max_active, 4);
4031 if (!scrub_workers)
4032 goto fail_scrub_workers;
4033
4034 scrub_wr_comp = btrfs_alloc_workqueue(fs_info, "scrubwrc", flags,
4035 max_active, 2);
4036 if (!scrub_wr_comp)
4037 goto fail_scrub_wr_completion_workers;
4038
4039 scrub_parity = btrfs_alloc_workqueue(fs_info, "scrubparity", flags,
4040 max_active, 2);
4041 if (!scrub_parity)
4042 goto fail_scrub_parity_workers;
4043
4044 mutex_lock(&fs_info->scrub_lock);
4045 if (refcount_read(&fs_info->scrub_workers_refcnt) == 0) {
4046 ASSERT(fs_info->scrub_workers == NULL &&
4047 fs_info->scrub_wr_completion_workers == NULL &&
4048 fs_info->scrub_parity_workers == NULL);
4049 fs_info->scrub_workers = scrub_workers;
4050 fs_info->scrub_wr_completion_workers = scrub_wr_comp;
4051 fs_info->scrub_parity_workers = scrub_parity;
4052 refcount_set(&fs_info->scrub_workers_refcnt, 1);
4053 mutex_unlock(&fs_info->scrub_lock);
4054 return 0;
4055 }
4056 /* Another thread raced in and created the workers for us */
4057 refcount_inc(&fs_info->scrub_workers_refcnt);
4058 mutex_unlock(&fs_info->scrub_lock);
4059
4060 ret = 0;
4061 btrfs_destroy_workqueue(scrub_parity);
4062fail_scrub_parity_workers:
4063 btrfs_destroy_workqueue(scrub_wr_comp);
4064fail_scrub_wr_completion_workers:
4065 btrfs_destroy_workqueue(scrub_workers);
4066fail_scrub_workers:
4067 return ret;
4068}
4069
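/*
 * Editor's illustrative sketch (not part of the original source): the
 * lookup-or-create pattern used by scrub_workers_get()/scrub_workers_put()
 * above, reduced to a generic refcounted singleton. Allocation happens
 * outside the lock, publication happens under it, and the loser of a race
 * simply frees its private copy. All example_* names are hypothetical and
 * initialization of the mutex/refcount is omitted; the "resource" stands
 * in for the three scrub workqueues.
 */
struct example_singleton {
	struct mutex lock;
	refcount_t refs;
	void *resource;
};

static int example_get(struct example_singleton *s)
{
	void *res;

	/* Fast path: somebody already created it, just take a reference. */
	if (refcount_inc_not_zero(&s->refs))
		return 0;

	res = kzalloc(64, GFP_KERNEL);	/* stand-in for the real resource */
	if (!res)
		return -ENOMEM;

	mutex_lock(&s->lock);
	if (refcount_read(&s->refs) == 0) {
		s->resource = res;
		refcount_set(&s->refs, 1);
		mutex_unlock(&s->lock);
		return 0;
	}
	/* Another thread raced in and published first; drop our copy. */
	refcount_inc(&s->refs);
	mutex_unlock(&s->lock);
	kfree(res);
	return 0;
}

static void example_put(struct example_singleton *s)
{
	/* The last reference tears the resource down, excluding creators. */
	if (refcount_dec_and_mutex_lock(&s->refs, &s->lock)) {
		void *res = s->resource;

		s->resource = NULL;
		mutex_unlock(&s->lock);
		kfree(res);
	}
}
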
4070int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
4071 u64 end, struct btrfs_scrub_progress *progress,
4072 int readonly, int is_dev_replace)
4073{
4074 struct scrub_ctx *sctx;
4075 int ret;
4076 struct btrfs_device *dev;
4077 unsigned int nofs_flag;
4078
4079 if (btrfs_fs_closing(fs_info))
4080 return -EAGAIN;
4081
4082 if (fs_info->nodesize > BTRFS_STRIPE_LEN) {
4083 /*
4084 * In this case scrub is unable to calculate the checksum
4085 * the way it is currently implemented. Do not handle this
4086 * situation at all because it won't ever happen.
4087 */
4088 btrfs_err(fs_info,
4089 "scrub: size assumption nodesize <= BTRFS_STRIPE_LEN (%d <= %d) fails",
4090 fs_info->nodesize,
4091 BTRFS_STRIPE_LEN);
4092 return -EINVAL;
4093 }
4094
4095 if (fs_info->nodesize >
4096 PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK ||
4097 fs_info->sectorsize > PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK) {
4098 /*
4099 * This would exceed the bounds of the pagev array in
4100 * struct scrub_block.
4101 */
4102 btrfs_err(fs_info,
4103 "scrub: size assumption nodesize and sectorsize <= SCRUB_MAX_PAGES_PER_BLOCK (%d <= %d && %d <= %d) fails",
4104 fs_info->nodesize,
4105 SCRUB_MAX_PAGES_PER_BLOCK,
4106 fs_info->sectorsize,
4107 SCRUB_MAX_PAGES_PER_BLOCK);
4108 return -EINVAL;
4109 }
4110
4111 /* Allocate outside of device_list_mutex */
4112 sctx = scrub_setup_ctx(fs_info, is_dev_replace);
4113 if (IS_ERR(sctx))
4114 return PTR_ERR(sctx);
4115
4116 ret = scrub_workers_get(fs_info, is_dev_replace);
4117 if (ret)
4118 goto out_free_ctx;
4119
4120 mutex_lock(&fs_info->fs_devices->device_list_mutex);
4121 dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL);
4122 if (!dev || (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) &&
4123 !is_dev_replace)) {
4124 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4125 ret = -ENODEV;
4126 goto out;
4127 }
4128
4129 if (!is_dev_replace && !readonly &&
4130 !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) {
4131 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4132 btrfs_err_in_rcu(fs_info,
4133 "scrub on devid %llu: filesystem on %s is not writable",
4134 devid, rcu_str_deref(dev->name));
4135 ret = -EROFS;
4136 goto out;
4137 }
4138
4139 mutex_lock(&fs_info->scrub_lock);
4140 if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) ||
4141 test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &dev->dev_state)) {
4142 mutex_unlock(&fs_info->scrub_lock);
4143 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4144 ret = -EIO;
4145 goto out;
4146 }
4147
4148 down_read(&fs_info->dev_replace.rwsem);
4149 if (dev->scrub_ctx ||
4150 (!is_dev_replace &&
4151 btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))) {
4152 up_read(&fs_info->dev_replace.rwsem);
4153 mutex_unlock(&fs_info->scrub_lock);
4154 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4155 ret = -EINPROGRESS;
4156 goto out;
4157 }
4158 up_read(&fs_info->dev_replace.rwsem);
4159
4160 sctx->readonly = readonly;
4161 dev->scrub_ctx = sctx;
4162 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4163
4164 /*
4165 * By checking @scrub_pause_req here, we can avoid a
4166 * race between committing a transaction and scrubbing.
4167 */
4168 __scrub_blocked_if_needed(fs_info);
4169 atomic_inc(&fs_info->scrubs_running);
4170 mutex_unlock(&fs_info->scrub_lock);
4171
4172 /*
4173 * In order to avoid a deadlock with reclaim when there is a transaction
4174 * trying to pause scrub, make sure we use GFP_NOFS for all the
4175 * allocations done at btrfs_scrub_pages() and scrub_pages_for_parity(),
4176 * which are invoked by our callees. The pause request is made when a
4177 * transaction commit starts, and it blocks the transaction until scrub
4178 * is paused (done at specific points in scrub_stripe() or right above,
4179 * before incrementing fs_info->scrubs_running); see the sketch after this function.
4180 */
4181 nofs_flag = memalloc_nofs_save();
4182 if (!is_dev_replace) {
4183 btrfs_info(fs_info, "scrub: started on devid %llu", devid);
4184 /*
4185 * Hold the device list mutex so that a super block write,
4186 * e.g. one kicked off by a log tree sync, cannot race with us.
4187 */
4188 mutex_lock(&fs_info->fs_devices->device_list_mutex);
4189 ret = scrub_supers(sctx, dev);
4190 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4191 }
4192
4193 if (!ret)
4194 ret = scrub_enumerate_chunks(sctx, dev, start, end);
4195 memalloc_nofs_restore(nofs_flag);
4196
4197 wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
4198 atomic_dec(&fs_info->scrubs_running);
4199 wake_up(&fs_info->scrub_pause_wait);
4200
4201 wait_event(sctx->list_wait, atomic_read(&sctx->workers_pending) == 0);
4202
4203 if (progress)
4204 memcpy(progress, &sctx->stat, sizeof(*progress));
4205
4206 if (!is_dev_replace)
4207 btrfs_info(fs_info, "scrub: %s on devid %llu with status: %d",
4208 ret ? "not finished" : "finished", devid, ret);
4209
4210 mutex_lock(&fs_info->scrub_lock);
4211 dev->scrub_ctx = NULL;
4212 mutex_unlock(&fs_info->scrub_lock);
4213
4214 scrub_workers_put(fs_info);
4215 scrub_put_ctx(sctx);
4216
4217 return ret;
4218out:
4219 scrub_workers_put(fs_info);
4220out_free_ctx:
4221 scrub_free_ctx(sctx);
4222
4223 return ret;
4224}
4225
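/*
 * Editor's illustrative sketch (not part of the original source): how the
 * memalloc_nofs_save()/memalloc_nofs_restore() pair used in
 * btrfs_scrub_dev() above scopes allocations. Inside the window, any
 * GFP_KERNEL allocation behaves as if GFP_NOFS had been passed, so callees
 * several levels down cannot accidentally recurse into the filesystem.
 * The function name and buffer are hypothetical.
 */
static void *example_alloc_in_nofs_scope(size_t size)
{
	unsigned int nofs_flag;
	void *buf;

	nofs_flag = memalloc_nofs_save();
	/* Implicitly degraded to GFP_NOFS while the scope is active. */
	buf = kmalloc(size, GFP_KERNEL);
	memalloc_nofs_restore(nofs_flag);

	return buf;
}
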
4226void btrfs_scrub_pause(struct btrfs_fs_info *fs_info)
4227{
4228 mutex_lock(&fs_info->scrub_lock);
4229 atomic_inc(&fs_info->scrub_pause_req);
4230 while (atomic_read(&fs_info->scrubs_paused) !=
4231 atomic_read(&fs_info->scrubs_running)) {
4232 mutex_unlock(&fs_info->scrub_lock);
4233 wait_event(fs_info->scrub_pause_wait,
4234 atomic_read(&fs_info->scrubs_paused) ==
4235 atomic_read(&fs_info->scrubs_running));
4236 mutex_lock(&fs_info->scrub_lock);
4237 }
4238 mutex_unlock(&fs_info->scrub_lock);
4239}
4240
4241void btrfs_scrub_continue(struct btrfs_fs_info *fs_info)
4242{
4243 atomic_dec(&fs_info->scrub_pause_req);
4244 wake_up(&fs_info->scrub_pause_wait);
4245}
4246
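/*
 * Editor's illustrative sketch (not part of the original source): how a
 * transaction-commit style caller is expected to bracket its critical
 * section with the pause/continue pair above. btrfs_scrub_pause() returns
 * only once every running scrub has parked itself at a pause point, and
 * btrfs_scrub_continue() lets the scrubs resume. The function name below
 * is hypothetical; per the comment near the top of scrub_enumerate_chunks(),
 * the real caller of this pair is btrfs_commit_transaction().
 */
static void example_commit_critical_section(struct btrfs_fs_info *fs_info)
{
	btrfs_scrub_pause(fs_info);

	/*
	 * ... work that must not run concurrently with scrub I/O,
	 * e.g. writing out the final transaction state ...
	 */

	btrfs_scrub_continue(fs_info);
}
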
4247int btrfs_scrub_cancel(struct btrfs_fs_info *fs_info)
4248{
4249 mutex_lock(&fs_info->scrub_lock);
4250 if (!atomic_read(&fs_info->scrubs_running)) {
4251 mutex_unlock(&fs_info->scrub_lock);
4252 return -ENOTCONN;
4253 }
4254
4255 atomic_inc(&fs_info->scrub_cancel_req);
4256 while (atomic_read(&fs_info->scrubs_running)) {
4257 mutex_unlock(&fs_info->scrub_lock);
4258 wait_event(fs_info->scrub_pause_wait,
4259 atomic_read(&fs_info->scrubs_running) == 0);
4260 mutex_lock(&fs_info->scrub_lock);
4261 }
4262 atomic_dec(&fs_info->scrub_cancel_req);
4263 mutex_unlock(&fs_info->scrub_lock);
4264
4265 return 0;
4266}
4267
4268int btrfs_scrub_cancel_dev(struct btrfs_device *dev)
4269{
4270 struct btrfs_fs_info *fs_info = dev->fs_info;
4271 struct scrub_ctx *sctx;
4272
4273 mutex_lock(&fs_info->scrub_lock);
4274 sctx = dev->scrub_ctx;
4275 if (!sctx) {
4276 mutex_unlock(&fs_info->scrub_lock);
4277 return -ENOTCONN;
4278 }
4279 atomic_inc(&sctx->cancel_req);
4280 while (dev->scrub_ctx) {
4281 mutex_unlock(&fs_info->scrub_lock);
4282 wait_event(fs_info->scrub_pause_wait,
4283 dev->scrub_ctx == NULL);
4284 mutex_lock(&fs_info->scrub_lock);
4285 }
4286 mutex_unlock(&fs_info->scrub_lock);
4287
4288 return 0;
4289}
4290
4291int btrfs_scrub_progress(struct btrfs_fs_info *fs_info, u64 devid,
4292 struct btrfs_scrub_progress *progress)
4293{
4294 struct btrfs_device *dev;
4295 struct scrub_ctx *sctx = NULL;
4296
4297 mutex_lock(&fs_info->fs_devices->device_list_mutex);
4298 dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL);
4299 if (dev)
4300 sctx = dev->scrub_ctx;
4301 if (sctx)
4302 memcpy(progress, &sctx->stat, sizeof(*progress));
4303 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4304
4305 return dev ? (sctx ? 0 : -ENOTCONN) : -ENODEV;
4306}
4307
4308static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
4309 u64 extent_logical, u32 extent_len,
4310 u64 *extent_physical,
4311 struct btrfs_device **extent_dev,
4312 int *extent_mirror_num)
4313{
4314 u64 mapped_length;
4315 struct btrfs_bio *bbio = NULL;
4316 int ret;
4317
4318 mapped_length = extent_len;
4319 ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, extent_logical,
4320 &mapped_length, &bbio, 0);
4321 if (ret || !bbio || mapped_length < extent_len ||
4322 !bbio->stripes[0].dev->bdev) {
4323 btrfs_put_bbio(bbio);
4324 return;
4325 }
4326
4327 *extent_physical = bbio->stripes[0].physical;
4328 *extent_mirror_num = bbio->mirror_num;
4329 *extent_dev = bbio->stripes[0].dev;
4330 btrfs_put_bbio(bbio);
4331}