/*
 * Copyright (C) 2011, 2012 STRATO.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 021110-1307, USA.
 */

#include <linux/blkdev.h>
#include <linux/ratelimit.h>
#include <linux/sched/mm.h>
#include "ctree.h"
#include "volumes.h"
#include "disk-io.h"
#include "ordered-data.h"
#include "transaction.h"
#include "backref.h"
#include "extent_io.h"
#include "dev-replace.h"
#include "check-integrity.h"
#include "rcu-string.h"
#include "raid56.h"

/*
 * This is only the first step towards a full-featured scrub. It reads all
 * extents and super blocks and verifies the checksums. In case a bad checksum
 * is found or the extent cannot be read, good data will be written back if
 * any can be found.
 *
 * Future enhancements:
 *  - In case an unrepairable extent is encountered, track which files are
 *    affected and report them
 *  - track and record media errors, throw out bad devices
 *  - add a mode to also read unallocated space
 */

struct scrub_block;
struct scrub_ctx;

/*
 * the following three values only influence the performance.
 * The last one configures the number of parallel and outstanding I/O
 * operations. The first two values configure an upper limit for the number
 * of (dynamically allocated) pages that are added to a bio.
 */
#define SCRUB_PAGES_PER_RD_BIO	32	/* 128k per bio */
#define SCRUB_PAGES_PER_WR_BIO	32	/* 128k per bio */
#define SCRUB_BIOS_PER_SCTX	64	/* 8MB per device in flight */

/*
 * the following value times PAGE_SIZE needs to be large enough to match the
 * largest node/leaf/sector size that shall be supported.
 * Values larger than BTRFS_STRIPE_LEN are not supported.
 */
#define SCRUB_MAX_PAGES_PER_BLOCK	16	/* 64k per node/leaf/sector */

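/*
 * Worked example for the sizing above (assuming 4KiB pages; the arithmetic
 * is an illustration added here, not part of the original defines): a single
 * bio carries up to 32 pages * 4KiB = 128KiB, and with 64 bios per scrub
 * context up to 64 * 128KiB = 8MiB of scrub I/O can be in flight per device.
 */
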
struct scrub_recover {
	refcount_t		refs;
	struct btrfs_bio	*bbio;
	u64			map_length;
};

struct scrub_page {
	struct scrub_block	*sblock;
	struct page		*page;
	struct btrfs_device	*dev;
	struct list_head	list;
	u64			flags;  /* extent flags */
	u64			generation;
	u64			logical;
	u64			physical;
	u64			physical_for_dev_replace;
	atomic_t		refs;
	struct {
		unsigned int	mirror_num:8;
		unsigned int	have_csum:1;
		unsigned int	io_error:1;
	};
	u8			csum[BTRFS_CSUM_SIZE];

	struct scrub_recover	*recover;
};

struct scrub_bio {
	int			index;
	struct scrub_ctx	*sctx;
	struct btrfs_device	*dev;
	struct bio		*bio;
	blk_status_t		status;
	u64			logical;
	u64			physical;
#if SCRUB_PAGES_PER_WR_BIO >= SCRUB_PAGES_PER_RD_BIO
	struct scrub_page	*pagev[SCRUB_PAGES_PER_WR_BIO];
#else
	struct scrub_page	*pagev[SCRUB_PAGES_PER_RD_BIO];
#endif
	int			page_count;
	int			next_free;
	struct btrfs_work	work;
};

struct scrub_block {
	struct scrub_page	*pagev[SCRUB_MAX_PAGES_PER_BLOCK];
	int			page_count;
	atomic_t		outstanding_pages;
	refcount_t		refs; /* free mem on transition to zero */
	struct scrub_ctx	*sctx;
	struct scrub_parity	*sparity;
	struct {
		unsigned int	header_error:1;
		unsigned int	checksum_error:1;
		unsigned int	no_io_error_seen:1;
		unsigned int	generation_error:1; /* also sets header_error */

		/* The following is for the data used to check parity */
		/* It is for the data with checksum */
		unsigned int	data_corrected:1;
	};
	struct btrfs_work	work;
};

/* Used for the chunks with parity stripe such as RAID5/6 */
struct scrub_parity {
	struct scrub_ctx	*sctx;

	struct btrfs_device	*scrub_dev;

	u64			logic_start;

	u64			logic_end;

	int			nsectors;

	u64			stripe_len;

	refcount_t		refs;

	struct list_head	spages;

	/* Work of parity check and repair */
	struct btrfs_work	work;

	/* Mark the parity blocks which have data */
	unsigned long		*dbitmap;

	/*
	 * Mark the parity blocks which have data, but for which errors
	 * happened when reading or checking that data
	 */
	unsigned long		*ebitmap;

	unsigned long		bitmap[0];
};

struct scrub_ctx {
	struct scrub_bio	*bios[SCRUB_BIOS_PER_SCTX];
	struct btrfs_fs_info	*fs_info;
	int			first_free;
	int			curr;
	atomic_t		bios_in_flight;
	atomic_t		workers_pending;
	spinlock_t		list_lock;
	wait_queue_head_t	list_wait;
	u16			csum_size;
	struct list_head	csum_list;
	atomic_t		cancel_req;
	int			readonly;
	int			pages_per_rd_bio;

	int			is_dev_replace;

	struct scrub_bio	*wr_curr_bio;
	struct mutex		wr_lock;
	int			pages_per_wr_bio; /* <= SCRUB_PAGES_PER_WR_BIO */
	struct btrfs_device	*wr_tgtdev;
	bool			flush_all_writes;

	/*
	 * statistics
	 */
	struct btrfs_scrub_progress stat;
	spinlock_t		stat_lock;

	/*
	 * Use a ref counter to avoid use-after-free issues. Scrub workers
	 * decrement bios_in_flight and workers_pending and then do a wakeup
	 * on the list_wait wait queue. We must ensure the main scrub task
	 * doesn't free the scrub context before or while the workers are
	 * doing the wakeup() call.
	 */
	refcount_t		refs;
};

struct scrub_fixup_nodatasum {
	struct scrub_ctx	*sctx;
	struct btrfs_device	*dev;
	u64			logical;
	struct btrfs_root	*root;
	struct btrfs_work	work;
	int			mirror_num;
};

struct scrub_nocow_inode {
	u64			inum;
	u64			offset;
	u64			root;
	struct list_head	list;
};

struct scrub_copy_nocow_ctx {
	struct scrub_ctx	*sctx;
	u64			logical;
	u64			len;
	int			mirror_num;
	u64			physical_for_dev_replace;
	struct list_head	inodes;
	struct btrfs_work	work;
};

struct scrub_warning {
	struct btrfs_path	*path;
	u64			extent_item_size;
	const char		*errstr;
	u64			physical;
	u64			logical;
	struct btrfs_device	*dev;
};

struct full_stripe_lock {
	struct rb_node		node;
	u64			logical;
	u64			refs;
	struct mutex		mutex;
};

b6bfebc1
SB
246static void scrub_pending_bio_inc(struct scrub_ctx *sctx);
247static void scrub_pending_bio_dec(struct scrub_ctx *sctx);
248static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx);
249static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx);
b5d67f64 250static int scrub_handle_errored_block(struct scrub_block *sblock_to_check);
be50a8dd 251static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
ff023aac 252 struct scrub_block *sblocks_for_recheck);
34f5c8e9 253static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
affe4a5a
ZL
254 struct scrub_block *sblock,
255 int retry_failed_mirror);
ba7cf988 256static void scrub_recheck_block_checksum(struct scrub_block *sblock);
b5d67f64 257static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
114ab50d 258 struct scrub_block *sblock_good);
b5d67f64
SB
259static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
260 struct scrub_block *sblock_good,
261 int page_num, int force_write);
ff023aac
SB
262static void scrub_write_block_to_dev_replace(struct scrub_block *sblock);
263static int scrub_write_page_to_dev_replace(struct scrub_block *sblock,
264 int page_num);
b5d67f64
SB
265static int scrub_checksum_data(struct scrub_block *sblock);
266static int scrub_checksum_tree_block(struct scrub_block *sblock);
267static int scrub_checksum_super(struct scrub_block *sblock);
268static void scrub_block_get(struct scrub_block *sblock);
269static void scrub_block_put(struct scrub_block *sblock);
7a9e9987
SB
270static void scrub_page_get(struct scrub_page *spage);
271static void scrub_page_put(struct scrub_page *spage);
5a6ac9ea
MX
272static void scrub_parity_get(struct scrub_parity *sparity);
273static void scrub_parity_put(struct scrub_parity *sparity);
ff023aac
SB
274static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx,
275 struct scrub_page *spage);
d9d181c1 276static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
a36cf8b8 277 u64 physical, struct btrfs_device *dev, u64 flags,
ff023aac
SB
278 u64 gen, int mirror_num, u8 *csum, int force,
279 u64 physical_for_dev_replace);
4246a0b6 280static void scrub_bio_end_io(struct bio *bio);
b5d67f64
SB
281static void scrub_bio_end_io_worker(struct btrfs_work *work);
282static void scrub_block_complete(struct scrub_block *sblock);
ff023aac
SB
283static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
284 u64 extent_logical, u64 extent_len,
285 u64 *extent_physical,
286 struct btrfs_device **extent_dev,
287 int *extent_mirror_num);
ff023aac
SB
288static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
289 struct scrub_page *spage);
290static void scrub_wr_submit(struct scrub_ctx *sctx);
4246a0b6 291static void scrub_wr_bio_end_io(struct bio *bio);
ff023aac
SB
292static void scrub_wr_bio_end_io_worker(struct btrfs_work *work);
293static int write_page_nocow(struct scrub_ctx *sctx,
294 u64 physical_for_dev_replace, struct page *page);
295static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root,
652f25a2 296 struct scrub_copy_nocow_ctx *ctx);
ff023aac
SB
297static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
298 int mirror_num, u64 physical_for_dev_replace);
299static void copy_nocow_pages_worker(struct btrfs_work *work);
cb7ab021 300static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info);
3cb0929a 301static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info);
f55985f4 302static void scrub_put_ctx(struct scrub_ctx *sctx);
1623edeb 303
42976286
LB
304static inline int scrub_is_page_on_raid56(struct scrub_page *page)
305{
306 return page->recover &&
307 (page->recover->bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK);
308}
1623edeb 309
b6bfebc1
SB
310static void scrub_pending_bio_inc(struct scrub_ctx *sctx)
311{
99f4cdb1 312 refcount_inc(&sctx->refs);
b6bfebc1
SB
313 atomic_inc(&sctx->bios_in_flight);
314}
315
316static void scrub_pending_bio_dec(struct scrub_ctx *sctx)
317{
318 atomic_dec(&sctx->bios_in_flight);
319 wake_up(&sctx->list_wait);
f55985f4 320 scrub_put_ctx(sctx);
b6bfebc1
SB
321}
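
/*
 * A condensed sketch (added here for illustration, not in the original
 * source) of how the two helpers above pair up: every in-flight bio holds
 * one reference on the scrub context, so the context cannot be freed while
 * completions are still pending.
 *
 *	scrub_pending_bio_inc(sctx);	(before a bio is submitted)
 *	... bio is submitted and completes ...
 *	scrub_pending_bio_dec(sctx);	(in the completion path)
 *
 * The scrub_put_ctx() in scrub_pending_bio_dec() drops the reference taken
 * by scrub_pending_bio_inc().
 */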
322
cb7ab021 323static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
3cb0929a
WS
324{
325 while (atomic_read(&fs_info->scrub_pause_req)) {
326 mutex_unlock(&fs_info->scrub_lock);
327 wait_event(fs_info->scrub_pause_wait,
328 atomic_read(&fs_info->scrub_pause_req) == 0);
329 mutex_lock(&fs_info->scrub_lock);
330 }
331}
332
0e22be89 333static void scrub_pause_on(struct btrfs_fs_info *fs_info)
cb7ab021
WS
334{
335 atomic_inc(&fs_info->scrubs_paused);
336 wake_up(&fs_info->scrub_pause_wait);
0e22be89 337}
cb7ab021 338
0e22be89
Z
339static void scrub_pause_off(struct btrfs_fs_info *fs_info)
340{
cb7ab021
WS
341 mutex_lock(&fs_info->scrub_lock);
342 __scrub_blocked_if_needed(fs_info);
343 atomic_dec(&fs_info->scrubs_paused);
344 mutex_unlock(&fs_info->scrub_lock);
345
346 wake_up(&fs_info->scrub_pause_wait);
347}
348
0e22be89
Z
349static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
350{
351 scrub_pause_on(fs_info);
352 scrub_pause_off(fs_info);
353}
354
0966a7b1
QW
355/*
356 * Insert new full stripe lock into full stripe locks tree
357 *
358 * Return pointer to existing or newly inserted full_stripe_lock structure if
359 * everything works well.
360 * Return ERR_PTR(-ENOMEM) if we failed to allocate memory
361 *
362 * NOTE: caller must hold full_stripe_locks_root->lock before calling this
363 * function
364 */
365static struct full_stripe_lock *insert_full_stripe_lock(
366 struct btrfs_full_stripe_locks_tree *locks_root,
367 u64 fstripe_logical)
368{
369 struct rb_node **p;
370 struct rb_node *parent = NULL;
371 struct full_stripe_lock *entry;
372 struct full_stripe_lock *ret;
225cce41 373 unsigned int nofs_flag;
0966a7b1
QW
374
375 WARN_ON(!mutex_is_locked(&locks_root->lock));
376
377 p = &locks_root->root.rb_node;
378 while (*p) {
379 parent = *p;
380 entry = rb_entry(parent, struct full_stripe_lock, node);
381 if (fstripe_logical < entry->logical) {
382 p = &(*p)->rb_left;
383 } else if (fstripe_logical > entry->logical) {
384 p = &(*p)->rb_right;
385 } else {
386 entry->refs++;
387 return entry;
388 }
389 }
390
	/*
	 * Insert new lock.
	 *
	 * We must use GFP_NOFS because the scrub task might be waiting for a
	 * worker task executing this function and in turn a transaction commit
	 * might be waiting for the scrub task to pause (which needs to wait for
	 * all the worker tasks to complete before pausing).
	 */
399 nofs_flag = memalloc_nofs_save();
0966a7b1 400 ret = kmalloc(sizeof(*ret), GFP_KERNEL);
225cce41 401 memalloc_nofs_restore(nofs_flag);
0966a7b1
QW
402 if (!ret)
403 return ERR_PTR(-ENOMEM);
404 ret->logical = fstripe_logical;
405 ret->refs = 1;
406 mutex_init(&ret->mutex);
407
408 rb_link_node(&ret->node, parent, p);
409 rb_insert_color(&ret->node, &locks_root->root);
410 return ret;
411}
412
413/*
414 * Search for a full stripe lock of a block group
415 *
416 * Return pointer to existing full stripe lock if found
417 * Return NULL if not found
418 */
419static struct full_stripe_lock *search_full_stripe_lock(
420 struct btrfs_full_stripe_locks_tree *locks_root,
421 u64 fstripe_logical)
422{
423 struct rb_node *node;
424 struct full_stripe_lock *entry;
425
426 WARN_ON(!mutex_is_locked(&locks_root->lock));
427
428 node = locks_root->root.rb_node;
429 while (node) {
430 entry = rb_entry(node, struct full_stripe_lock, node);
431 if (fstripe_logical < entry->logical)
432 node = node->rb_left;
433 else if (fstripe_logical > entry->logical)
434 node = node->rb_right;
435 else
436 return entry;
437 }
438 return NULL;
439}
440
441/*
442 * Helper to get full stripe logical from a normal bytenr.
443 *
444 * Caller must ensure @cache is a RAID56 block group.
445 */
446static u64 get_full_stripe_logical(struct btrfs_block_group_cache *cache,
447 u64 bytenr)
448{
449 u64 ret;
450
451 /*
452 * Due to chunk item size limit, full stripe length should not be
453 * larger than U32_MAX. Just a sanity check here.
454 */
455 WARN_ON_ONCE(cache->full_stripe_len >= U32_MAX);
456
457 /*
458 * round_down() can only handle power of 2, while RAID56 full
459 * stripe length can be 64KiB * n, so we need to manually round down.
460 */
461 ret = div64_u64(bytenr - cache->key.objectid, cache->full_stripe_len) *
462 cache->full_stripe_len + cache->key.objectid;
463 return ret;
464}
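
/*
 * Worked example for the round-down above (illustrative numbers only, not
 * from the original source): with cache->key.objectid == 1G,
 * cache->full_stripe_len == 192K (three 64K data stripes) and
 * bytenr == 1G + 500K, the division yields 500K / 192K == 2, so the
 * returned full stripe start is 2 * 192K + 1G == 1G + 384K.
 */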
465
/*
 * Lock a full stripe to avoid concurrency of recovery and read
 *
 * It's only used for profiles with parities (RAID5/6), for other profiles it
 * does nothing.
 *
 * Return 0 if we locked the full stripe covering @bytenr, with a mutex held.
 * The caller must call unlock_full_stripe() in the same context.
 *
 * Return <0 if an error is encountered.
 */
477static int lock_full_stripe(struct btrfs_fs_info *fs_info, u64 bytenr,
478 bool *locked_ret)
479{
480 struct btrfs_block_group_cache *bg_cache;
481 struct btrfs_full_stripe_locks_tree *locks_root;
482 struct full_stripe_lock *existing;
483 u64 fstripe_start;
484 int ret = 0;
485
486 *locked_ret = false;
487 bg_cache = btrfs_lookup_block_group(fs_info, bytenr);
488 if (!bg_cache) {
489 ASSERT(0);
490 return -ENOENT;
491 }
492
493 /* Profiles not based on parity don't need full stripe lock */
494 if (!(bg_cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK))
495 goto out;
496 locks_root = &bg_cache->full_stripe_locks_root;
497
498 fstripe_start = get_full_stripe_logical(bg_cache, bytenr);
499
500 /* Now insert the full stripe lock */
501 mutex_lock(&locks_root->lock);
502 existing = insert_full_stripe_lock(locks_root, fstripe_start);
503 mutex_unlock(&locks_root->lock);
504 if (IS_ERR(existing)) {
505 ret = PTR_ERR(existing);
506 goto out;
507 }
508 mutex_lock(&existing->mutex);
509 *locked_ret = true;
510out:
511 btrfs_put_block_group(bg_cache);
512 return ret;
513}
514
/*
 * Unlock a full stripe.
 *
 * NOTE: Caller must ensure it's the same context that called the
 * corresponding lock_full_stripe().
 *
 * Return 0 if we unlock the full stripe without problem.
 * Return <0 for error.
 */
524static int unlock_full_stripe(struct btrfs_fs_info *fs_info, u64 bytenr,
525 bool locked)
526{
527 struct btrfs_block_group_cache *bg_cache;
528 struct btrfs_full_stripe_locks_tree *locks_root;
529 struct full_stripe_lock *fstripe_lock;
530 u64 fstripe_start;
531 bool freeit = false;
532 int ret = 0;
533
534 /* If we didn't acquire full stripe lock, no need to continue */
535 if (!locked)
536 return 0;
537
538 bg_cache = btrfs_lookup_block_group(fs_info, bytenr);
539 if (!bg_cache) {
540 ASSERT(0);
541 return -ENOENT;
542 }
543 if (!(bg_cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK))
544 goto out;
545
546 locks_root = &bg_cache->full_stripe_locks_root;
547 fstripe_start = get_full_stripe_logical(bg_cache, bytenr);
548
549 mutex_lock(&locks_root->lock);
550 fstripe_lock = search_full_stripe_lock(locks_root, fstripe_start);
551 /* Unpaired unlock_full_stripe() detected */
552 if (!fstripe_lock) {
553 WARN_ON(1);
554 ret = -ENOENT;
555 mutex_unlock(&locks_root->lock);
556 goto out;
557 }
558
559 if (fstripe_lock->refs == 0) {
560 WARN_ON(1);
561 btrfs_warn(fs_info, "full stripe lock at %llu refcount underflow",
562 fstripe_lock->logical);
563 } else {
564 fstripe_lock->refs--;
565 }
566
567 if (fstripe_lock->refs == 0) {
568 rb_erase(&fstripe_lock->node, &locks_root->root);
569 freeit = true;
570 }
571 mutex_unlock(&locks_root->lock);
572
573 mutex_unlock(&fstripe_lock->mutex);
574 if (freeit)
575 kfree(fstripe_lock);
576out:
577 btrfs_put_block_group(bg_cache);
578 return ret;
579}
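
/*
 * Typical usage of the pair above, condensed from scrub_handle_errored_block()
 * (a sketch with error handling trimmed):
 *
 *	bool locked;
 *
 *	ret = lock_full_stripe(fs_info, logical, &locked);
 *	if (ret < 0)
 *		return ret;
 *	... recheck and repair the errored block ...
 *	ret = unlock_full_stripe(fs_info, logical, locked);
 *
 * For block groups without parity, lock_full_stripe() leaves *locked false
 * and unlock_full_stripe() then becomes a no-op.
 */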
580
b6bfebc1
SB
581/*
582 * used for workers that require transaction commits (i.e., for the
583 * NOCOW case)
584 */
585static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx)
586{
fb456252 587 struct btrfs_fs_info *fs_info = sctx->fs_info;
b6bfebc1 588
99f4cdb1 589 refcount_inc(&sctx->refs);
	/*
	 * increment scrubs_running to prevent cancel requests from
	 * completing as long as a worker is running. we must also
	 * increment scrubs_paused to prevent deadlocking on pause
	 * requests used for transaction commits (as the worker uses a
	 * transaction context). it is safe to regard the worker
	 * as paused for all practical matters. effectively, we only
	 * avoid cancellation requests from completing.
	 */
599 mutex_lock(&fs_info->scrub_lock);
600 atomic_inc(&fs_info->scrubs_running);
601 atomic_inc(&fs_info->scrubs_paused);
602 mutex_unlock(&fs_info->scrub_lock);

	/*
	 * checking the @scrubs_running == @scrubs_paused condition inside
	 * wait_event() is not an atomic operation, which means we may
	 * inc/dec @scrubs_running/@scrubs_paused at any time. Let's wake up
	 * @scrub_pause_wait as much as we can so a blocked transaction
	 * commit waits as little as possible.
	 */
611 wake_up(&fs_info->scrub_pause_wait);
612
b6bfebc1
SB
613 atomic_inc(&sctx->workers_pending);
614}
615
616/* used for workers that require transaction commits */
617static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx)
618{
fb456252 619 struct btrfs_fs_info *fs_info = sctx->fs_info;
b6bfebc1
SB
620
621 /*
622 * see scrub_pending_trans_workers_inc() why we're pretending
623 * to be paused in the scrub counters
624 */
625 mutex_lock(&fs_info->scrub_lock);
626 atomic_dec(&fs_info->scrubs_running);
627 atomic_dec(&fs_info->scrubs_paused);
628 mutex_unlock(&fs_info->scrub_lock);
629 atomic_dec(&sctx->workers_pending);
630 wake_up(&fs_info->scrub_pause_wait);
631 wake_up(&sctx->list_wait);
f55985f4 632 scrub_put_ctx(sctx);
b6bfebc1
SB
633}
634
d9d181c1 635static void scrub_free_csums(struct scrub_ctx *sctx)
a2de733c 636{
d9d181c1 637 while (!list_empty(&sctx->csum_list)) {
a2de733c 638 struct btrfs_ordered_sum *sum;
d9d181c1 639 sum = list_first_entry(&sctx->csum_list,
a2de733c
AJ
640 struct btrfs_ordered_sum, list);
641 list_del(&sum->list);
642 kfree(sum);
643 }
644}
645
d9d181c1 646static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx)
a2de733c
AJ
647{
648 int i;
a2de733c 649
d9d181c1 650 if (!sctx)
a2de733c
AJ
651 return;
652
b5d67f64 653 /* this can happen when scrub is cancelled */
d9d181c1
SB
654 if (sctx->curr != -1) {
655 struct scrub_bio *sbio = sctx->bios[sctx->curr];
b5d67f64
SB
656
657 for (i = 0; i < sbio->page_count; i++) {
ff023aac 658 WARN_ON(!sbio->pagev[i]->page);
b5d67f64
SB
659 scrub_block_put(sbio->pagev[i]->sblock);
660 }
661 bio_put(sbio->bio);
662 }
663
ff023aac 664 for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
d9d181c1 665 struct scrub_bio *sbio = sctx->bios[i];
a2de733c
AJ
666
667 if (!sbio)
668 break;
a2de733c
AJ
669 kfree(sbio);
670 }
671
3fb99303 672 kfree(sctx->wr_curr_bio);
d9d181c1
SB
673 scrub_free_csums(sctx);
674 kfree(sctx);
a2de733c
AJ
675}
676
f55985f4
FM
677static void scrub_put_ctx(struct scrub_ctx *sctx)
678{
99f4cdb1 679 if (refcount_dec_and_test(&sctx->refs))
f55985f4
FM
680 scrub_free_ctx(sctx);
681}
682
0af6b5c4
DS
683static noinline_for_stack struct scrub_ctx *scrub_setup_ctx(
684 struct btrfs_fs_info *fs_info, int is_dev_replace)
a2de733c 685{
d9d181c1 686 struct scrub_ctx *sctx;
a2de733c 687 int i;
a2de733c 688
58c4e173 689 sctx = kzalloc(sizeof(*sctx), GFP_KERNEL);
d9d181c1 690 if (!sctx)
a2de733c 691 goto nomem;
99f4cdb1 692 refcount_set(&sctx->refs, 1);
63a212ab 693 sctx->is_dev_replace = is_dev_replace;
b54ffb73 694 sctx->pages_per_rd_bio = SCRUB_PAGES_PER_RD_BIO;
d9d181c1 695 sctx->curr = -1;
0af6b5c4 696 sctx->fs_info = fs_info;
0bb994f6 697 INIT_LIST_HEAD(&sctx->csum_list);
ff023aac 698 for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
a2de733c
AJ
699 struct scrub_bio *sbio;
700
58c4e173 701 sbio = kzalloc(sizeof(*sbio), GFP_KERNEL);
a2de733c
AJ
702 if (!sbio)
703 goto nomem;
d9d181c1 704 sctx->bios[i] = sbio;
a2de733c 705
a2de733c 706 sbio->index = i;
d9d181c1 707 sbio->sctx = sctx;
b5d67f64 708 sbio->page_count = 0;
9e0af237
LB
709 btrfs_init_work(&sbio->work, btrfs_scrub_helper,
710 scrub_bio_end_io_worker, NULL, NULL);
a2de733c 711
ff023aac 712 if (i != SCRUB_BIOS_PER_SCTX - 1)
d9d181c1 713 sctx->bios[i]->next_free = i + 1;
0ef8e451 714 else
d9d181c1
SB
715 sctx->bios[i]->next_free = -1;
716 }
717 sctx->first_free = 0;
b6bfebc1
SB
718 atomic_set(&sctx->bios_in_flight, 0);
719 atomic_set(&sctx->workers_pending, 0);
d9d181c1
SB
720 atomic_set(&sctx->cancel_req, 0);
721 sctx->csum_size = btrfs_super_csum_size(fs_info->super_copy);
d9d181c1
SB
722
723 spin_lock_init(&sctx->list_lock);
724 spin_lock_init(&sctx->stat_lock);
725 init_waitqueue_head(&sctx->list_wait);
ff023aac 726
3fb99303
DS
727 WARN_ON(sctx->wr_curr_bio != NULL);
728 mutex_init(&sctx->wr_lock);
729 sctx->wr_curr_bio = NULL;
8fcdac3f 730 if (is_dev_replace) {
ded56184 731 WARN_ON(!fs_info->dev_replace.tgtdev);
3fb99303 732 sctx->pages_per_wr_bio = SCRUB_PAGES_PER_WR_BIO;
ded56184 733 sctx->wr_tgtdev = fs_info->dev_replace.tgtdev;
2073c4c2 734 sctx->flush_all_writes = false;
ff023aac 735 }
8fcdac3f 736
d9d181c1 737 return sctx;
a2de733c
AJ
738
739nomem:
d9d181c1 740 scrub_free_ctx(sctx);
a2de733c
AJ
741 return ERR_PTR(-ENOMEM);
742}
743
ff023aac
SB
744static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,
745 void *warn_ctx)
558540c1
JS
746{
747 u64 isize;
748 u32 nlink;
749 int ret;
750 int i;
de2491fd 751 unsigned nofs_flag;
558540c1
JS
752 struct extent_buffer *eb;
753 struct btrfs_inode_item *inode_item;
ff023aac 754 struct scrub_warning *swarn = warn_ctx;
fb456252 755 struct btrfs_fs_info *fs_info = swarn->dev->fs_info;
558540c1
JS
756 struct inode_fs_paths *ipath = NULL;
757 struct btrfs_root *local_root;
758 struct btrfs_key root_key;
1d4c08e0 759 struct btrfs_key key;
558540c1
JS
760
761 root_key.objectid = root;
762 root_key.type = BTRFS_ROOT_ITEM_KEY;
763 root_key.offset = (u64)-1;
764 local_root = btrfs_read_fs_root_no_name(fs_info, &root_key);
765 if (IS_ERR(local_root)) {
766 ret = PTR_ERR(local_root);
767 goto err;
768 }
769
14692cc1
DS
770 /*
771 * this makes the path point to (inum INODE_ITEM ioff)
772 */
1d4c08e0
DS
773 key.objectid = inum;
774 key.type = BTRFS_INODE_ITEM_KEY;
775 key.offset = 0;
776
777 ret = btrfs_search_slot(NULL, local_root, &key, swarn->path, 0, 0);
558540c1
JS
778 if (ret) {
779 btrfs_release_path(swarn->path);
780 goto err;
781 }
782
783 eb = swarn->path->nodes[0];
784 inode_item = btrfs_item_ptr(eb, swarn->path->slots[0],
785 struct btrfs_inode_item);
786 isize = btrfs_inode_size(eb, inode_item);
787 nlink = btrfs_inode_nlink(eb, inode_item);
788 btrfs_release_path(swarn->path);
789
de2491fd
DS
790 /*
791 * init_path might indirectly call vmalloc, or use GFP_KERNEL. Scrub
792 * uses GFP_NOFS in this context, so we keep it consistent but it does
793 * not seem to be strictly necessary.
794 */
795 nofs_flag = memalloc_nofs_save();
558540c1 796 ipath = init_ipath(4096, local_root, swarn->path);
de2491fd 797 memalloc_nofs_restore(nofs_flag);
26bdef54
DC
798 if (IS_ERR(ipath)) {
799 ret = PTR_ERR(ipath);
800 ipath = NULL;
801 goto err;
802 }
558540c1
JS
803 ret = paths_from_inode(inum, ipath);
804
805 if (ret < 0)
806 goto err;
807
	/*
	 * we deliberately ignore the fact that ipath might have been too
	 * small to hold all of the paths here
	 */
812 for (i = 0; i < ipath->fspath->elem_cnt; ++i)
5d163e0e 813 btrfs_warn_in_rcu(fs_info,
6aa21263 814"%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu, length %llu, links %u (path: %s)",
5d163e0e
JM
815 swarn->errstr, swarn->logical,
816 rcu_str_deref(swarn->dev->name),
6aa21263 817 swarn->physical,
5d163e0e
JM
818 root, inum, offset,
819 min(isize - offset, (u64)PAGE_SIZE), nlink,
820 (char *)(unsigned long)ipath->fspath->val[i]);
558540c1
JS
821
822 free_ipath(ipath);
823 return 0;
824
825err:
5d163e0e 826 btrfs_warn_in_rcu(fs_info,
6aa21263 827 "%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu: path resolving failed with ret=%d",
5d163e0e
JM
828 swarn->errstr, swarn->logical,
829 rcu_str_deref(swarn->dev->name),
6aa21263 830 swarn->physical,
5d163e0e 831 root, inum, offset, ret);
558540c1
JS
832
833 free_ipath(ipath);
834 return 0;
835}
836
b5d67f64 837static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
558540c1 838{
a36cf8b8
SB
839 struct btrfs_device *dev;
840 struct btrfs_fs_info *fs_info;
558540c1
JS
841 struct btrfs_path *path;
842 struct btrfs_key found_key;
843 struct extent_buffer *eb;
844 struct btrfs_extent_item *ei;
845 struct scrub_warning swarn;
69917e43
LB
846 unsigned long ptr = 0;
847 u64 extent_item_pos;
848 u64 flags = 0;
558540c1 849 u64 ref_root;
69917e43 850 u32 item_size;
07c9a8e0 851 u8 ref_level = 0;
69917e43 852 int ret;
558540c1 853
a36cf8b8 854 WARN_ON(sblock->page_count < 1);
7a9e9987 855 dev = sblock->pagev[0]->dev;
fb456252 856 fs_info = sblock->sctx->fs_info;
a36cf8b8 857
558540c1 858 path = btrfs_alloc_path();
8b9456da
DS
859 if (!path)
860 return;
558540c1 861
6aa21263 862 swarn.physical = sblock->pagev[0]->physical;
7a9e9987 863 swarn.logical = sblock->pagev[0]->logical;
558540c1 864 swarn.errstr = errstr;
a36cf8b8 865 swarn.dev = NULL;
558540c1 866
69917e43
LB
867 ret = extent_from_logical(fs_info, swarn.logical, path, &found_key,
868 &flags);
558540c1
JS
869 if (ret < 0)
870 goto out;
871
4692cf58 872 extent_item_pos = swarn.logical - found_key.objectid;
558540c1
JS
873 swarn.extent_item_size = found_key.offset;
874
875 eb = path->nodes[0];
876 ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
877 item_size = btrfs_item_size_nr(eb, path->slots[0]);
878
69917e43 879 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
558540c1 880 do {
6eda71d0
LB
881 ret = tree_backref_for_extent(&ptr, eb, &found_key, ei,
882 item_size, &ref_root,
883 &ref_level);
ecaeb14b 884 btrfs_warn_in_rcu(fs_info,
6aa21263 885"%s at logical %llu on dev %s, physical %llu: metadata %s (level %d) in tree %llu",
5d163e0e 886 errstr, swarn.logical,
606686ee 887 rcu_str_deref(dev->name),
6aa21263 888 swarn.physical,
558540c1
JS
889 ref_level ? "node" : "leaf",
890 ret < 0 ? -1 : ref_level,
891 ret < 0 ? -1 : ref_root);
892 } while (ret != 1);
d8fe29e9 893 btrfs_release_path(path);
558540c1 894 } else {
d8fe29e9 895 btrfs_release_path(path);
558540c1 896 swarn.path = path;
a36cf8b8 897 swarn.dev = dev;
7a3ae2f8
JS
898 iterate_extent_inodes(fs_info, found_key.objectid,
899 extent_item_pos, 1,
c995ab3c 900 scrub_print_warning_inode, &swarn, false);
558540c1
JS
901 }
902
903out:
904 btrfs_free_path(path);
558540c1
JS
905}
906
ff023aac 907static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *fixup_ctx)
0ef8e451 908{
5da6fcbc 909 struct page *page = NULL;
0ef8e451 910 unsigned long index;
ff023aac 911 struct scrub_fixup_nodatasum *fixup = fixup_ctx;
0ef8e451 912 int ret;
5da6fcbc 913 int corrected = 0;
0ef8e451 914 struct btrfs_key key;
5da6fcbc 915 struct inode *inode = NULL;
6f1c3605 916 struct btrfs_fs_info *fs_info;
0ef8e451
JS
917 u64 end = offset + PAGE_SIZE - 1;
918 struct btrfs_root *local_root;
6f1c3605 919 int srcu_index;
0ef8e451
JS
920
921 key.objectid = root;
922 key.type = BTRFS_ROOT_ITEM_KEY;
923 key.offset = (u64)-1;
6f1c3605
LB
924
925 fs_info = fixup->root->fs_info;
926 srcu_index = srcu_read_lock(&fs_info->subvol_srcu);
927
928 local_root = btrfs_read_fs_root_no_name(fs_info, &key);
929 if (IS_ERR(local_root)) {
930 srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
0ef8e451 931 return PTR_ERR(local_root);
6f1c3605 932 }
0ef8e451
JS
933
934 key.type = BTRFS_INODE_ITEM_KEY;
935 key.objectid = inum;
936 key.offset = 0;
6f1c3605
LB
937 inode = btrfs_iget(fs_info->sb, &key, local_root, NULL);
938 srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
0ef8e451
JS
939 if (IS_ERR(inode))
940 return PTR_ERR(inode);
941
09cbfeaf 942 index = offset >> PAGE_SHIFT;
0ef8e451
JS
943
944 page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
5da6fcbc
JS
945 if (!page) {
946 ret = -ENOMEM;
947 goto out;
948 }
949
950 if (PageUptodate(page)) {
5da6fcbc
JS
951 if (PageDirty(page)) {
			/*
			 * we need to write the data to the defective sector.
			 * the data that was in that sector is not in memory,
			 * because the page was modified. we must not write the
			 * modified page to that sector.
			 *
			 * TODO: what could be done here: wait for the delalloc
			 *       runner to write out that page (might involve
			 *       COW) and see whether the sector is still
			 *       referenced afterwards.
			 *
			 * For the meantime, we'll treat this error as
			 * uncorrectable, although there is a chance that a
			 * later scrub will find the bad sector again and that
			 * there's no dirty page in memory then.
			 */
968 ret = -EIO;
969 goto out;
970 }
6ec656bc 971 ret = repair_io_failure(fs_info, inum, offset, PAGE_SIZE,
5da6fcbc 972 fixup->logical, page,
ffdd2018 973 offset - page_offset(page),
5da6fcbc
JS
974 fixup->mirror_num);
975 unlock_page(page);
976 corrected = !ret;
977 } else {
978 /*
979 * we need to get good data first. the general readpage path
980 * will call repair_io_failure for us, we just have to make
981 * sure we read the bad mirror.
982 */
983 ret = set_extent_bits(&BTRFS_I(inode)->io_tree, offset, end,
ceeb0ae7 984 EXTENT_DAMAGED);
5da6fcbc
JS
985 if (ret) {
986 /* set_extent_bits should give proper error */
987 WARN_ON(ret > 0);
988 if (ret > 0)
989 ret = -EFAULT;
990 goto out;
991 }
992
993 ret = extent_read_full_page(&BTRFS_I(inode)->io_tree, page,
994 btrfs_get_extent,
995 fixup->mirror_num);
996 wait_on_page_locked(page);
997
998 corrected = !test_range_bit(&BTRFS_I(inode)->io_tree, offset,
999 end, EXTENT_DAMAGED, 0, NULL);
1000 if (!corrected)
1001 clear_extent_bits(&BTRFS_I(inode)->io_tree, offset, end,
91166212 1002 EXTENT_DAMAGED);
5da6fcbc
JS
1003 }
1004
1005out:
1006 if (page)
1007 put_page(page);
7fb18a06
TK
1008
1009 iput(inode);
0ef8e451
JS
1010
1011 if (ret < 0)
1012 return ret;
1013
1014 if (ret == 0 && corrected) {
1015 /*
1016 * we only need to call readpage for one of the inodes belonging
1017 * to this extent. so make iterate_extent_inodes stop
1018 */
1019 return 1;
1020 }
1021
1022 return -EIO;
1023}
1024
1025static void scrub_fixup_nodatasum(struct btrfs_work *work)
1026{
0b246afa 1027 struct btrfs_fs_info *fs_info;
0ef8e451
JS
1028 int ret;
1029 struct scrub_fixup_nodatasum *fixup;
d9d181c1 1030 struct scrub_ctx *sctx;
0ef8e451 1031 struct btrfs_trans_handle *trans = NULL;
0ef8e451
JS
1032 struct btrfs_path *path;
1033 int uncorrectable = 0;
1034
1035 fixup = container_of(work, struct scrub_fixup_nodatasum, work);
d9d181c1 1036 sctx = fixup->sctx;
0b246afa 1037 fs_info = fixup->root->fs_info;
0ef8e451
JS
1038
1039 path = btrfs_alloc_path();
1040 if (!path) {
d9d181c1
SB
1041 spin_lock(&sctx->stat_lock);
1042 ++sctx->stat.malloc_errors;
1043 spin_unlock(&sctx->stat_lock);
0ef8e451
JS
1044 uncorrectable = 1;
1045 goto out;
1046 }
1047
1048 trans = btrfs_join_transaction(fixup->root);
1049 if (IS_ERR(trans)) {
1050 uncorrectable = 1;
1051 goto out;
1052 }
1053
1054 /*
1055 * the idea is to trigger a regular read through the standard path. we
1056 * read a page from the (failed) logical address by specifying the
1057 * corresponding copynum of the failed sector. thus, that readpage is
1058 * expected to fail.
1059 * that is the point where on-the-fly error correction will kick in
1060 * (once it's finished) and rewrite the failed sector if a good copy
1061 * can be found.
1062 */
0b246afa 1063 ret = iterate_inodes_from_logical(fixup->logical, fs_info, path,
c995ab3c 1064 scrub_fixup_readpage, fixup, false);
0ef8e451
JS
1065 if (ret < 0) {
1066 uncorrectable = 1;
1067 goto out;
1068 }
1069 WARN_ON(ret != 1);
1070
d9d181c1
SB
1071 spin_lock(&sctx->stat_lock);
1072 ++sctx->stat.corrected_errors;
1073 spin_unlock(&sctx->stat_lock);
0ef8e451
JS
1074
1075out:
1076 if (trans && !IS_ERR(trans))
3a45bb20 1077 btrfs_end_transaction(trans);
0ef8e451 1078 if (uncorrectable) {
d9d181c1
SB
1079 spin_lock(&sctx->stat_lock);
1080 ++sctx->stat.uncorrectable_errors;
1081 spin_unlock(&sctx->stat_lock);
ff023aac 1082 btrfs_dev_replace_stats_inc(
0b246afa
JM
1083 &fs_info->dev_replace.num_uncorrectable_read_errors);
1084 btrfs_err_rl_in_rcu(fs_info,
b14af3b4 1085 "unable to fixup (nodatasum) error at logical %llu on dev %s",
c1c9ff7c 1086 fixup->logical, rcu_str_deref(fixup->dev->name));
0ef8e451
JS
1087 }
1088
1089 btrfs_free_path(path);
1090 kfree(fixup);
1091
b6bfebc1 1092 scrub_pending_trans_workers_dec(sctx);
0ef8e451
JS
1093}
1094
af8e2d1d
MX
1095static inline void scrub_get_recover(struct scrub_recover *recover)
1096{
6f615018 1097 refcount_inc(&recover->refs);
af8e2d1d
MX
1098}
1099
e501bfe3
QW
1100static inline void scrub_put_recover(struct btrfs_fs_info *fs_info,
1101 struct scrub_recover *recover)
af8e2d1d 1102{
6f615018 1103 if (refcount_dec_and_test(&recover->refs)) {
e501bfe3 1104 btrfs_bio_counter_dec(fs_info);
6e9606d2 1105 btrfs_put_bbio(recover->bbio);
af8e2d1d
MX
1106 kfree(recover);
1107 }
1108}
1109
a2de733c 1110/*
b5d67f64
SB
1111 * scrub_handle_errored_block gets called when either verification of the
1112 * pages failed or the bio failed to read, e.g. with EIO. In the latter
1113 * case, this function handles all pages in the bio, even though only one
1114 * may be bad.
1115 * The goal of this function is to repair the errored block by using the
1116 * contents of one of the mirrors.
a2de733c 1117 */
b5d67f64 1118static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
a2de733c 1119{
d9d181c1 1120 struct scrub_ctx *sctx = sblock_to_check->sctx;
a36cf8b8 1121 struct btrfs_device *dev;
b5d67f64
SB
1122 struct btrfs_fs_info *fs_info;
1123 u64 length;
1124 u64 logical;
b5d67f64
SB
1125 unsigned int failed_mirror_index;
1126 unsigned int is_metadata;
1127 unsigned int have_csum;
b5d67f64
SB
1128 struct scrub_block *sblocks_for_recheck; /* holds one for each mirror */
1129 struct scrub_block *sblock_bad;
1130 int ret;
1131 int mirror_index;
1132 int page_num;
1133 int success;
28d70e23 1134 bool full_stripe_locked;
558540c1 1135 static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
b5d67f64
SB
1136 DEFAULT_RATELIMIT_BURST);
1137
1138 BUG_ON(sblock_to_check->page_count < 1);
fb456252 1139 fs_info = sctx->fs_info;
4ded4f63
SB
1140 if (sblock_to_check->pagev[0]->flags & BTRFS_EXTENT_FLAG_SUPER) {
1141 /*
1142 * if we find an error in a super block, we just report it.
1143 * They will get written with the next transaction commit
1144 * anyway
1145 */
1146 spin_lock(&sctx->stat_lock);
1147 ++sctx->stat.super_errors;
1148 spin_unlock(&sctx->stat_lock);
1149 return 0;
1150 }
b5d67f64 1151 length = sblock_to_check->page_count * PAGE_SIZE;
7a9e9987 1152 logical = sblock_to_check->pagev[0]->logical;
7a9e9987
SB
1153 BUG_ON(sblock_to_check->pagev[0]->mirror_num < 1);
1154 failed_mirror_index = sblock_to_check->pagev[0]->mirror_num - 1;
1155 is_metadata = !(sblock_to_check->pagev[0]->flags &
b5d67f64 1156 BTRFS_EXTENT_FLAG_DATA);
7a9e9987 1157 have_csum = sblock_to_check->pagev[0]->have_csum;
7a9e9987 1158 dev = sblock_to_check->pagev[0]->dev;
13db62b7 1159
	/*
	 * For RAID5/6, a race can happen with the scrub thread of a different
	 * device. On data corruption, the parity and data threads will both
	 * try to recover the data.
	 * The race can lead to doubly added csum errors, or even an
	 * unrecoverable error.
	 */
1167 ret = lock_full_stripe(fs_info, logical, &full_stripe_locked);
1168 if (ret < 0) {
1169 spin_lock(&sctx->stat_lock);
1170 if (ret == -ENOMEM)
1171 sctx->stat.malloc_errors++;
1172 sctx->stat.read_errors++;
1173 sctx->stat.uncorrectable_errors++;
1174 spin_unlock(&sctx->stat_lock);
1175 return ret;
1176 }
1177
	/*
	 * read all mirrors one after the other. This includes re-reading
	 * the extent or metadata block that failed (that was
	 * the reason this fixup code was called), this time
	 * page by page, in order to know which pages
	 * caused I/O errors and which ones are good (for all mirrors).
	 * The goal is to handle the situation when more than one
	 * mirror contains I/O errors, but the errors do not
	 * overlap, i.e. the data can be repaired by selecting the
	 * pages from those mirrors without I/O error on the
	 * particular pages. One example (with blocks >= 2 * PAGE_SIZE)
	 * would be that mirror #1 has an I/O error on the first page,
	 * the second page is good, and mirror #2 has an I/O error on
	 * the second page, but the first page is good.
	 * Then the first page of the first mirror can be repaired by
	 * taking the first page of the second mirror, and the
	 * second page of the second mirror can be repaired by
	 * copying the contents of the 2nd page of the 1st mirror.
	 * One more note: if the pages of one mirror contain I/O
	 * errors, the checksum cannot be verified. In order to get
	 * the best data for repairing, the first attempt is to find
	 * a mirror without I/O errors and with a validated checksum.
	 * Only if this is not possible, the pages are picked from
	 * mirrors with I/O errors without considering the checksum.
	 * If the latter is the case, at the end, the checksum of the
	 * repaired area is verified in order to correctly maintain
	 * the statistics.
	 */
1206
31e818fe
DS
1207 sblocks_for_recheck = kcalloc(BTRFS_MAX_MIRRORS,
1208 sizeof(*sblocks_for_recheck), GFP_NOFS);
b5d67f64 1209 if (!sblocks_for_recheck) {
d9d181c1
SB
1210 spin_lock(&sctx->stat_lock);
1211 sctx->stat.malloc_errors++;
1212 sctx->stat.read_errors++;
1213 sctx->stat.uncorrectable_errors++;
1214 spin_unlock(&sctx->stat_lock);
a36cf8b8 1215 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
b5d67f64 1216 goto out;
a2de733c
AJ
1217 }
1218
b5d67f64 1219 /* setup the context, map the logical blocks and alloc the pages */
be50a8dd 1220 ret = scrub_setup_recheck_block(sblock_to_check, sblocks_for_recheck);
b5d67f64 1221 if (ret) {
d9d181c1
SB
1222 spin_lock(&sctx->stat_lock);
1223 sctx->stat.read_errors++;
1224 sctx->stat.uncorrectable_errors++;
1225 spin_unlock(&sctx->stat_lock);
a36cf8b8 1226 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
b5d67f64
SB
1227 goto out;
1228 }
1229 BUG_ON(failed_mirror_index >= BTRFS_MAX_MIRRORS);
1230 sblock_bad = sblocks_for_recheck + failed_mirror_index;
13db62b7 1231
b5d67f64 1232 /* build and submit the bios for the failed mirror, check checksums */
affe4a5a 1233 scrub_recheck_block(fs_info, sblock_bad, 1);
a2de733c 1234
b5d67f64
SB
1235 if (!sblock_bad->header_error && !sblock_bad->checksum_error &&
1236 sblock_bad->no_io_error_seen) {
1237 /*
1238 * the error disappeared after reading page by page, or
1239 * the area was part of a huge bio and other parts of the
1240 * bio caused I/O errors, or the block layer merged several
1241 * read requests into one and the error is caused by a
1242 * different bio (usually one of the two latter cases is
1243 * the cause)
1244 */
d9d181c1
SB
1245 spin_lock(&sctx->stat_lock);
1246 sctx->stat.unverified_errors++;
5a6ac9ea 1247 sblock_to_check->data_corrected = 1;
d9d181c1 1248 spin_unlock(&sctx->stat_lock);
a2de733c 1249
ff023aac
SB
1250 if (sctx->is_dev_replace)
1251 scrub_write_block_to_dev_replace(sblock_bad);
b5d67f64 1252 goto out;
a2de733c 1253 }
a2de733c 1254
b5d67f64 1255 if (!sblock_bad->no_io_error_seen) {
d9d181c1
SB
1256 spin_lock(&sctx->stat_lock);
1257 sctx->stat.read_errors++;
1258 spin_unlock(&sctx->stat_lock);
b5d67f64
SB
1259 if (__ratelimit(&_rs))
1260 scrub_print_warning("i/o error", sblock_to_check);
a36cf8b8 1261 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
b5d67f64 1262 } else if (sblock_bad->checksum_error) {
d9d181c1
SB
1263 spin_lock(&sctx->stat_lock);
1264 sctx->stat.csum_errors++;
1265 spin_unlock(&sctx->stat_lock);
b5d67f64
SB
1266 if (__ratelimit(&_rs))
1267 scrub_print_warning("checksum error", sblock_to_check);
a36cf8b8 1268 btrfs_dev_stat_inc_and_print(dev,
442a4f63 1269 BTRFS_DEV_STAT_CORRUPTION_ERRS);
b5d67f64 1270 } else if (sblock_bad->header_error) {
d9d181c1
SB
1271 spin_lock(&sctx->stat_lock);
1272 sctx->stat.verify_errors++;
1273 spin_unlock(&sctx->stat_lock);
b5d67f64
SB
1274 if (__ratelimit(&_rs))
1275 scrub_print_warning("checksum/header error",
1276 sblock_to_check);
442a4f63 1277 if (sblock_bad->generation_error)
a36cf8b8 1278 btrfs_dev_stat_inc_and_print(dev,
442a4f63
SB
1279 BTRFS_DEV_STAT_GENERATION_ERRS);
1280 else
a36cf8b8 1281 btrfs_dev_stat_inc_and_print(dev,
442a4f63 1282 BTRFS_DEV_STAT_CORRUPTION_ERRS);
b5d67f64 1283 }
a2de733c 1284
33ef30ad
ID
1285 if (sctx->readonly) {
1286 ASSERT(!sctx->is_dev_replace);
1287 goto out;
1288 }
a2de733c 1289
	/*
	 * NOTE: Even for the nodatasum case, it's still possible that it's a
	 * compressed data extent, thus scrub_fixup_nodatasum(), which writes
	 * the inode page cache onto disk, could cause serious data corruption.
	 *
	 * So here we can only read from disk, and hope our recovery reaches
	 * disk before the newer write.
	 */
1298 if (0 && !is_metadata && !have_csum) {
b5d67f64 1299 struct scrub_fixup_nodatasum *fixup_nodatasum;
a2de733c 1300
ff023aac
SB
1301 WARN_ON(sctx->is_dev_replace);
1302
b5d67f64
SB
1303 /*
1304 * !is_metadata and !have_csum, this means that the data
01327610 1305 * might not be COWed, that it might be modified
b5d67f64
SB
1306 * concurrently. The general strategy to work on the
1307 * commit root does not help in the case when COW is not
1308 * used.
1309 */
1310 fixup_nodatasum = kzalloc(sizeof(*fixup_nodatasum), GFP_NOFS);
1311 if (!fixup_nodatasum)
1312 goto did_not_correct_error;
d9d181c1 1313 fixup_nodatasum->sctx = sctx;
a36cf8b8 1314 fixup_nodatasum->dev = dev;
b5d67f64
SB
1315 fixup_nodatasum->logical = logical;
1316 fixup_nodatasum->root = fs_info->extent_root;
1317 fixup_nodatasum->mirror_num = failed_mirror_index + 1;
b6bfebc1 1318 scrub_pending_trans_workers_inc(sctx);
9e0af237
LB
1319 btrfs_init_work(&fixup_nodatasum->work, btrfs_scrub_helper,
1320 scrub_fixup_nodatasum, NULL, NULL);
0339ef2f
QW
1321 btrfs_queue_work(fs_info->scrub_workers,
1322 &fixup_nodatasum->work);
b5d67f64 1323 goto out;
a2de733c
AJ
1324 }
1325
b5d67f64
SB
1326 /*
1327 * now build and submit the bios for the other mirrors, check
cb2ced73
SB
1328 * checksums.
1329 * First try to pick the mirror which is completely without I/O
b5d67f64
SB
1330 * errors and also does not have a checksum error.
1331 * If one is found, and if a checksum is present, the full block
1332 * that is known to contain an error is rewritten. Afterwards
1333 * the block is known to be corrected.
1334 * If a mirror is found which is completely correct, and no
1335 * checksum is present, only those pages are rewritten that had
1336 * an I/O error in the block to be repaired, since it cannot be
1337 * determined, which copy of the other pages is better (and it
1338 * could happen otherwise that a correct page would be
1339 * overwritten by a bad one).
1340 */
42976286 1341 for (mirror_index = 0; ;mirror_index++) {
cb2ced73 1342 struct scrub_block *sblock_other;
b5d67f64 1343
cb2ced73
SB
1344 if (mirror_index == failed_mirror_index)
1345 continue;
42976286
LB
1346
1347 /* raid56's mirror can be more than BTRFS_MAX_MIRRORS */
1348 if (!scrub_is_page_on_raid56(sblock_bad->pagev[0])) {
1349 if (mirror_index >= BTRFS_MAX_MIRRORS)
1350 break;
1351 if (!sblocks_for_recheck[mirror_index].page_count)
1352 break;
1353
1354 sblock_other = sblocks_for_recheck + mirror_index;
1355 } else {
1356 struct scrub_recover *r = sblock_bad->pagev[0]->recover;
1357 int max_allowed = r->bbio->num_stripes -
1358 r->bbio->num_tgtdevs;
1359
1360 if (mirror_index >= max_allowed)
1361 break;
1362 if (!sblocks_for_recheck[1].page_count)
1363 break;
1364
1365 ASSERT(failed_mirror_index == 0);
1366 sblock_other = sblocks_for_recheck + 1;
1367 sblock_other->pagev[0]->mirror_num = 1 + mirror_index;
1368 }
cb2ced73
SB
1369
1370 /* build and submit the bios, check checksums */
affe4a5a 1371 scrub_recheck_block(fs_info, sblock_other, 0);
34f5c8e9
SB
1372
1373 if (!sblock_other->header_error &&
b5d67f64
SB
1374 !sblock_other->checksum_error &&
1375 sblock_other->no_io_error_seen) {
ff023aac
SB
1376 if (sctx->is_dev_replace) {
1377 scrub_write_block_to_dev_replace(sblock_other);
114ab50d 1378 goto corrected_error;
ff023aac 1379 } else {
ff023aac 1380 ret = scrub_repair_block_from_good_copy(
114ab50d
ZL
1381 sblock_bad, sblock_other);
1382 if (!ret)
1383 goto corrected_error;
ff023aac 1384 }
b5d67f64
SB
1385 }
1386 }
a2de733c 1387
b968fed1
ZL
1388 if (sblock_bad->no_io_error_seen && !sctx->is_dev_replace)
1389 goto did_not_correct_error;

	/*
	 * In case of I/O errors in the area that is supposed to be
	 * repaired, continue by picking good copies of those pages.
	 * Select the good pages from mirrors to rewrite bad pages from
	 * the area to fix. Afterwards verify the checksum of the block
	 * that is supposed to be repaired. This verification step is
	 * only done for the purpose of statistic counting and for the
	 * final scrub report on whether errors remain.
	 * A perfect algorithm could make use of the checksum and try
	 * all possible combinations of pages from the different mirrors
	 * until the checksum verification succeeds. For example, when
	 * the 2nd page of mirror #1 faces I/O errors, and the 2nd page
	 * of mirror #2 is readable but the final checksum test fails,
	 * then the 2nd page of mirror #3 could be tried to see whether
	 * the final checksum now succeeds. But this would be a rare
	 * exception and is therefore not implemented. At least it is
	 * avoided that the good copy is overwritten.
	 * A more useful improvement would be to pick the sectors
	 * without I/O error based on sector sizes (512 bytes on legacy
	 * disks) instead of on PAGE_SIZE. Then maybe 512 bytes of one
	 * mirror could be repaired by taking 512 bytes of a different
	 * mirror, even if other 512 byte sectors in the same PAGE_SIZE
	 * area are unreadable.
	 */
b5d67f64 1415 success = 1;
b968fed1
ZL
1416 for (page_num = 0; page_num < sblock_bad->page_count;
1417 page_num++) {
7a9e9987 1418 struct scrub_page *page_bad = sblock_bad->pagev[page_num];
b968fed1 1419 struct scrub_block *sblock_other = NULL;
b5d67f64 1420
b968fed1
ZL
1421 /* skip no-io-error page in scrub */
1422 if (!page_bad->io_error && !sctx->is_dev_replace)
a2de733c 1423 continue;
b5d67f64 1424
b968fed1
ZL
1425 /* try to find no-io-error page in mirrors */
1426 if (page_bad->io_error) {
1427 for (mirror_index = 0;
1428 mirror_index < BTRFS_MAX_MIRRORS &&
1429 sblocks_for_recheck[mirror_index].page_count > 0;
1430 mirror_index++) {
1431 if (!sblocks_for_recheck[mirror_index].
1432 pagev[page_num]->io_error) {
1433 sblock_other = sblocks_for_recheck +
1434 mirror_index;
1435 break;
b5d67f64
SB
1436 }
1437 }
b968fed1
ZL
1438 if (!sblock_other)
1439 success = 0;
96e36920 1440 }
a2de733c 1441
b968fed1
ZL
1442 if (sctx->is_dev_replace) {
1443 /*
1444 * did not find a mirror to fetch the page
1445 * from. scrub_write_page_to_dev_replace()
1446 * handles this case (page->io_error), by
1447 * filling the block with zeros before
1448 * submitting the write request
1449 */
1450 if (!sblock_other)
1451 sblock_other = sblock_bad;
1452
1453 if (scrub_write_page_to_dev_replace(sblock_other,
1454 page_num) != 0) {
1455 btrfs_dev_replace_stats_inc(
0b246afa 1456 &fs_info->dev_replace.num_write_errors);
b968fed1
ZL
1457 success = 0;
1458 }
1459 } else if (sblock_other) {
1460 ret = scrub_repair_page_from_good_copy(sblock_bad,
1461 sblock_other,
1462 page_num, 0);
1463 if (0 == ret)
1464 page_bad->io_error = 0;
1465 else
1466 success = 0;
b5d67f64 1467 }
a2de733c 1468 }
a2de733c 1469
b968fed1 1470 if (success && !sctx->is_dev_replace) {
b5d67f64
SB
1471 if (is_metadata || have_csum) {
1472 /*
1473 * need to verify the checksum now that all
1474 * sectors on disk are repaired (the write
1475 * request for data to be repaired is on its way).
1476 * Just be lazy and use scrub_recheck_block()
1477 * which re-reads the data before the checksum
1478 * is verified, but most likely the data comes out
1479 * of the page cache.
1480 */
affe4a5a 1481 scrub_recheck_block(fs_info, sblock_bad, 1);
34f5c8e9 1482 if (!sblock_bad->header_error &&
b5d67f64
SB
1483 !sblock_bad->checksum_error &&
1484 sblock_bad->no_io_error_seen)
1485 goto corrected_error;
1486 else
1487 goto did_not_correct_error;
1488 } else {
1489corrected_error:
d9d181c1
SB
1490 spin_lock(&sctx->stat_lock);
1491 sctx->stat.corrected_errors++;
5a6ac9ea 1492 sblock_to_check->data_corrected = 1;
d9d181c1 1493 spin_unlock(&sctx->stat_lock);
b14af3b4
DS
1494 btrfs_err_rl_in_rcu(fs_info,
1495 "fixed up error at logical %llu on dev %s",
c1c9ff7c 1496 logical, rcu_str_deref(dev->name));
8628764e 1497 }
b5d67f64
SB
1498 } else {
1499did_not_correct_error:
d9d181c1
SB
1500 spin_lock(&sctx->stat_lock);
1501 sctx->stat.uncorrectable_errors++;
1502 spin_unlock(&sctx->stat_lock);
b14af3b4
DS
1503 btrfs_err_rl_in_rcu(fs_info,
1504 "unable to fixup (regular) error at logical %llu on dev %s",
c1c9ff7c 1505 logical, rcu_str_deref(dev->name));
96e36920 1506 }
a2de733c 1507
b5d67f64
SB
1508out:
1509 if (sblocks_for_recheck) {
1510 for (mirror_index = 0; mirror_index < BTRFS_MAX_MIRRORS;
1511 mirror_index++) {
1512 struct scrub_block *sblock = sblocks_for_recheck +
1513 mirror_index;
af8e2d1d 1514 struct scrub_recover *recover;
b5d67f64
SB
1515 int page_index;
1516
7a9e9987
SB
1517 for (page_index = 0; page_index < sblock->page_count;
1518 page_index++) {
1519 sblock->pagev[page_index]->sblock = NULL;
af8e2d1d
MX
1520 recover = sblock->pagev[page_index]->recover;
1521 if (recover) {
e501bfe3 1522 scrub_put_recover(fs_info, recover);
af8e2d1d
MX
1523 sblock->pagev[page_index]->recover =
1524 NULL;
1525 }
7a9e9987
SB
1526 scrub_page_put(sblock->pagev[page_index]);
1527 }
b5d67f64
SB
1528 }
1529 kfree(sblocks_for_recheck);
1530 }
a2de733c 1531
28d70e23
QW
1532 ret = unlock_full_stripe(fs_info, logical, full_stripe_locked);
1533 if (ret < 0)
1534 return ret;
b5d67f64
SB
1535 return 0;
1536}
a2de733c 1537
8e5cfb55 1538static inline int scrub_nr_raid_mirrors(struct btrfs_bio *bbio)
af8e2d1d 1539{
10f11900
ZL
1540 if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID5)
1541 return 2;
1542 else if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID6)
1543 return 3;
1544 else
af8e2d1d 1545 return (int)bbio->num_stripes;
af8e2d1d
MX
1546}
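
/*
 * Note (added for clarity, not in the original source): roughly, for RAID5
 * the data of a stripe can be obtained in two ways (read directly, or
 * rebuilt from the remaining data stripes plus parity), and RAID6 adds a
 * third way using the Q stripe, hence the constants above.
 */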
1547
10f11900
ZL
1548static inline void scrub_stripe_index_and_offset(u64 logical, u64 map_type,
1549 u64 *raid_map,
af8e2d1d
MX
1550 u64 mapped_length,
1551 int nstripes, int mirror,
1552 int *stripe_index,
1553 u64 *stripe_offset)
1554{
1555 int i;
1556
ffe2d203 1557 if (map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
af8e2d1d
MX
1558 /* RAID5/6 */
1559 for (i = 0; i < nstripes; i++) {
1560 if (raid_map[i] == RAID6_Q_STRIPE ||
1561 raid_map[i] == RAID5_P_STRIPE)
1562 continue;
1563
1564 if (logical >= raid_map[i] &&
1565 logical < raid_map[i] + mapped_length)
1566 break;
1567 }
1568
1569 *stripe_index = i;
1570 *stripe_offset = logical - raid_map[i];
1571 } else {
1572 /* The other RAID type */
1573 *stripe_index = mirror;
1574 *stripe_offset = 0;
1575 }
1576}
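
/*
 * Worked example for the RAID56 branch above (illustrative numbers only):
 * with nstripes == 3, raid_map[] == { 0, 64K, RAID5_P_STRIPE } and
 * mapped_length == 64K, a logical address of 64K + 4K matches raid_map[1],
 * so *stripe_index == 1 and *stripe_offset == 4K.
 */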
1577
be50a8dd 1578static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
b5d67f64
SB
1579 struct scrub_block *sblocks_for_recheck)
1580{
be50a8dd 1581 struct scrub_ctx *sctx = original_sblock->sctx;
fb456252 1582 struct btrfs_fs_info *fs_info = sctx->fs_info;
be50a8dd
ZL
1583 u64 length = original_sblock->page_count * PAGE_SIZE;
1584 u64 logical = original_sblock->pagev[0]->logical;
4734b7ed
ZL
1585 u64 generation = original_sblock->pagev[0]->generation;
1586 u64 flags = original_sblock->pagev[0]->flags;
1587 u64 have_csum = original_sblock->pagev[0]->have_csum;
af8e2d1d
MX
1588 struct scrub_recover *recover;
1589 struct btrfs_bio *bbio;
af8e2d1d
MX
1590 u64 sublen;
1591 u64 mapped_length;
1592 u64 stripe_offset;
1593 int stripe_index;
be50a8dd 1594 int page_index = 0;
b5d67f64 1595 int mirror_index;
af8e2d1d 1596 int nmirrors;
b5d67f64
SB
1597 int ret;
1598
1599 /*
57019345 1600 * note: the two members refs and outstanding_pages
b5d67f64
SB
1601 * are not used (and not set) in the blocks that are used for
1602 * the recheck procedure
1603 */
1604
b5d67f64 1605 while (length > 0) {
af8e2d1d
MX
1606 sublen = min_t(u64, length, PAGE_SIZE);
1607 mapped_length = sublen;
1608 bbio = NULL;
a2de733c 1609
b5d67f64
SB
1610 /*
1611 * with a length of PAGE_SIZE, each returned stripe
1612 * represents one mirror
1613 */
e501bfe3 1614 btrfs_bio_counter_inc_blocked(fs_info);
cf8cddd3 1615 ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
825ad4c9 1616 logical, &mapped_length, &bbio);
b5d67f64 1617 if (ret || !bbio || mapped_length < sublen) {
6e9606d2 1618 btrfs_put_bbio(bbio);
e501bfe3 1619 btrfs_bio_counter_dec(fs_info);
b5d67f64
SB
1620 return -EIO;
1621 }
a2de733c 1622
af8e2d1d
MX
1623 recover = kzalloc(sizeof(struct scrub_recover), GFP_NOFS);
1624 if (!recover) {
6e9606d2 1625 btrfs_put_bbio(bbio);
e501bfe3 1626 btrfs_bio_counter_dec(fs_info);
af8e2d1d
MX
1627 return -ENOMEM;
1628 }
1629
6f615018 1630 refcount_set(&recover->refs, 1);
af8e2d1d 1631 recover->bbio = bbio;
af8e2d1d
MX
1632 recover->map_length = mapped_length;
1633
24731149 1634 BUG_ON(page_index >= SCRUB_MAX_PAGES_PER_BLOCK);
af8e2d1d 1635
be50a8dd 1636 nmirrors = min(scrub_nr_raid_mirrors(bbio), BTRFS_MAX_MIRRORS);
10f11900 1637
af8e2d1d 1638 for (mirror_index = 0; mirror_index < nmirrors;
b5d67f64
SB
1639 mirror_index++) {
1640 struct scrub_block *sblock;
1641 struct scrub_page *page;
1642
b5d67f64 1643 sblock = sblocks_for_recheck + mirror_index;
7a9e9987 1644 sblock->sctx = sctx;
4734b7ed 1645
7a9e9987
SB
1646 page = kzalloc(sizeof(*page), GFP_NOFS);
1647 if (!page) {
1648leave_nomem:
d9d181c1
SB
1649 spin_lock(&sctx->stat_lock);
1650 sctx->stat.malloc_errors++;
1651 spin_unlock(&sctx->stat_lock);
e501bfe3 1652 scrub_put_recover(fs_info, recover);
b5d67f64
SB
1653 return -ENOMEM;
1654 }
7a9e9987
SB
1655 scrub_page_get(page);
1656 sblock->pagev[page_index] = page;
4734b7ed
ZL
1657 page->sblock = sblock;
1658 page->flags = flags;
1659 page->generation = generation;
7a9e9987 1660 page->logical = logical;
4734b7ed
ZL
1661 page->have_csum = have_csum;
1662 if (have_csum)
1663 memcpy(page->csum,
1664 original_sblock->pagev[0]->csum,
1665 sctx->csum_size);
af8e2d1d 1666
10f11900
ZL
1667 scrub_stripe_index_and_offset(logical,
1668 bbio->map_type,
1669 bbio->raid_map,
af8e2d1d 1670 mapped_length,
e34c330d
ZL
1671 bbio->num_stripes -
1672 bbio->num_tgtdevs,
af8e2d1d
MX
1673 mirror_index,
1674 &stripe_index,
1675 &stripe_offset);
1676 page->physical = bbio->stripes[stripe_index].physical +
1677 stripe_offset;
1678 page->dev = bbio->stripes[stripe_index].dev;
1679
ff023aac
SB
1680 BUG_ON(page_index >= original_sblock->page_count);
1681 page->physical_for_dev_replace =
1682 original_sblock->pagev[page_index]->
1683 physical_for_dev_replace;
7a9e9987 1684 /* for missing devices, dev->bdev is NULL */
7a9e9987 1685 page->mirror_num = mirror_index + 1;
b5d67f64 1686 sblock->page_count++;
7a9e9987
SB
1687 page->page = alloc_page(GFP_NOFS);
1688 if (!page->page)
1689 goto leave_nomem;
af8e2d1d
MX
1690
1691 scrub_get_recover(recover);
1692 page->recover = recover;
b5d67f64 1693 }
e501bfe3 1694 scrub_put_recover(fs_info, recover);
b5d67f64
SB
1695 length -= sublen;
1696 logical += sublen;
1697 page_index++;
1698 }
1699
1700 return 0;
96e36920
ID
1701}
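/*
 * What the setup above builds, roughly: the original block is walked one
 * PAGE_SIZE piece at a time, and for each piece btrfs_map_sblock() is
 * asked for all read mirrors, so every returned stripe stands for one
 * mirror. A separate scrub_block in sblocks_for_recheck[] is then filled
 * per mirror, each page pointing at that mirror's physical location, and
 * every page holds a reference on a shared scrub_recover so the bbio and
 * map_length stay available for a later RAID56 rebuild.
 */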
1702
af8e2d1d
MX
1703struct scrub_bio_ret {
1704 struct completion event;
4e4cbee9 1705 blk_status_t status;
af8e2d1d
MX
1706};
1707
4246a0b6 1708static void scrub_bio_wait_endio(struct bio *bio)
af8e2d1d
MX
1709{
1710 struct scrub_bio_ret *ret = bio->bi_private;
1711
4e4cbee9 1712 ret->status = bio->bi_status;
af8e2d1d
MX
1713 complete(&ret->event);
1714}
1715
af8e2d1d
MX
1716static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info,
1717 struct bio *bio,
1718 struct scrub_page *page)
1719{
1720 struct scrub_bio_ret done;
1721 int ret;
42976286 1722 int mirror_num;
af8e2d1d
MX
1723
1724 init_completion(&done.event);
4e4cbee9 1725 done.status = 0;
af8e2d1d
MX
1726 bio->bi_iter.bi_sector = page->logical >> 9;
1727 bio->bi_private = &done;
1728 bio->bi_end_io = scrub_bio_wait_endio;
1729
42976286 1730 mirror_num = page->sblock->pagev[0]->mirror_num;
2ff7e61e 1731 ret = raid56_parity_recover(fs_info, bio, page->recover->bbio,
af8e2d1d 1732 page->recover->map_length,
42976286 1733 mirror_num, 0);
af8e2d1d
MX
1734 if (ret)
1735 return ret;
1736
131ce436 1737 wait_for_completion_io(&done.event);
4e4cbee9 1738 if (done.status)
af8e2d1d
MX
1739 return -EIO;
1740
1741 return 0;
1742}
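/*
 * The helper above turns an asynchronous rebuild into a synchronous one:
 * a completion is paired with the bio's end_io callback, the bio is
 * handed to raid56_parity_recover() so the data can be reconstructed
 * from the surviving stripes of the full stripe described by
 * page->recover->bbio, and the caller then sleeps in
 * wait_for_completion_io() until the end_io fires, mapping any
 * bi_status failure to -EIO.
 */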
1743
b5d67f64
SB
1744/*
1745 * this function will check the on disk data for checksum errors, header
1746 * errors and read I/O errors. If any I/O errors happen, the exact pages
1747 * which are errored are marked as being bad. The goal is to enable scrub
1748 * to take those pages that are not errored from all the mirrors so that
1749 * the pages that are errored in the just handled mirror can be repaired.
1750 */
34f5c8e9 1751static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
affe4a5a
ZL
1752 struct scrub_block *sblock,
1753 int retry_failed_mirror)
96e36920 1754{
b5d67f64 1755 int page_num;
96e36920 1756
b5d67f64 1757 sblock->no_io_error_seen = 1;
96e36920 1758
b5d67f64
SB
1759 for (page_num = 0; page_num < sblock->page_count; page_num++) {
1760 struct bio *bio;
7a9e9987 1761 struct scrub_page *page = sblock->pagev[page_num];
b5d67f64 1762
442a4f63 1763 if (page->dev->bdev == NULL) {
ea9947b4
SB
1764 page->io_error = 1;
1765 sblock->no_io_error_seen = 0;
1766 continue;
1767 }
1768
7a9e9987 1769 WARN_ON(!page->page);
c5e4c3d7 1770 bio = btrfs_io_bio_alloc(1);
74d46992 1771 bio_set_dev(bio, page->dev->bdev);
b5d67f64 1772
34f5c8e9 1773 bio_add_page(bio, page->page, PAGE_SIZE, 0);
af8e2d1d 1774 if (!retry_failed_mirror && scrub_is_page_on_raid56(page)) {
1bcd7aa1
LB
1775 if (scrub_submit_raid56_bio_wait(fs_info, bio, page)) {
1776 page->io_error = 1;
af8e2d1d 1777 sblock->no_io_error_seen = 0;
1bcd7aa1 1778 }
af8e2d1d
MX
1779 } else {
1780 bio->bi_iter.bi_sector = page->physical >> 9;
37226b21 1781 bio_set_op_attrs(bio, REQ_OP_READ, 0);
af8e2d1d 1782
1bcd7aa1
LB
1783 if (btrfsic_submit_bio_wait(bio)) {
1784 page->io_error = 1;
af8e2d1d 1785 sblock->no_io_error_seen = 0;
1bcd7aa1 1786 }
af8e2d1d 1787 }
33879d45 1788
b5d67f64
SB
1789 bio_put(bio);
1790 }
96e36920 1791
b5d67f64 1792 if (sblock->no_io_error_seen)
ba7cf988 1793 scrub_recheck_block_checksum(sblock);
a2de733c
AJ
1794}
1795
17a9be2f
MX
1796static inline int scrub_check_fsid(u8 fsid[],
1797 struct scrub_page *spage)
1798{
1799 struct btrfs_fs_devices *fs_devices = spage->dev->fs_devices;
1800 int ret;
1801
44880fdc 1802 ret = memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
17a9be2f
MX
1803 return !ret;
1804}
1805
ba7cf988 1806static void scrub_recheck_block_checksum(struct scrub_block *sblock)
a2de733c 1807{
ba7cf988
ZL
1808 sblock->header_error = 0;
1809 sblock->checksum_error = 0;
1810 sblock->generation_error = 0;
b5d67f64 1811
ba7cf988
ZL
1812 if (sblock->pagev[0]->flags & BTRFS_EXTENT_FLAG_DATA)
1813 scrub_checksum_data(sblock);
1814 else
1815 scrub_checksum_tree_block(sblock);
a2de733c
AJ
1816}
1817
b5d67f64 1818static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
114ab50d 1819 struct scrub_block *sblock_good)
b5d67f64
SB
1820{
1821 int page_num;
1822 int ret = 0;
96e36920 1823
b5d67f64
SB
1824 for (page_num = 0; page_num < sblock_bad->page_count; page_num++) {
1825 int ret_sub;
96e36920 1826
b5d67f64
SB
1827 ret_sub = scrub_repair_page_from_good_copy(sblock_bad,
1828 sblock_good,
114ab50d 1829 page_num, 1);
b5d67f64
SB
1830 if (ret_sub)
1831 ret = ret_sub;
a2de733c 1832 }
b5d67f64
SB
1833
1834 return ret;
1835}
1836
1837static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
1838 struct scrub_block *sblock_good,
1839 int page_num, int force_write)
1840{
7a9e9987
SB
1841 struct scrub_page *page_bad = sblock_bad->pagev[page_num];
1842 struct scrub_page *page_good = sblock_good->pagev[page_num];
0b246afa 1843 struct btrfs_fs_info *fs_info = sblock_bad->sctx->fs_info;
b5d67f64 1844
7a9e9987
SB
1845 BUG_ON(page_bad->page == NULL);
1846 BUG_ON(page_good->page == NULL);
b5d67f64
SB
1847 if (force_write || sblock_bad->header_error ||
1848 sblock_bad->checksum_error || page_bad->io_error) {
1849 struct bio *bio;
1850 int ret;
b5d67f64 1851
ff023aac 1852 if (!page_bad->dev->bdev) {
0b246afa 1853 btrfs_warn_rl(fs_info,
5d163e0e 1854 "scrub_repair_page_from_good_copy(bdev == NULL) is unexpected");
ff023aac
SB
1855 return -EIO;
1856 }
1857
c5e4c3d7 1858 bio = btrfs_io_bio_alloc(1);
74d46992 1859 bio_set_dev(bio, page_bad->dev->bdev);
4f024f37 1860 bio->bi_iter.bi_sector = page_bad->physical >> 9;
37226b21 1861 bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
b5d67f64
SB
1862
1863 ret = bio_add_page(bio, page_good->page, PAGE_SIZE, 0);
1864 if (PAGE_SIZE != ret) {
1865 bio_put(bio);
1866 return -EIO;
13db62b7 1867 }
b5d67f64 1868
4e49ea4a 1869 if (btrfsic_submit_bio_wait(bio)) {
442a4f63
SB
1870 btrfs_dev_stat_inc_and_print(page_bad->dev,
1871 BTRFS_DEV_STAT_WRITE_ERRS);
ff023aac 1872 btrfs_dev_replace_stats_inc(
0b246afa 1873 &fs_info->dev_replace.num_write_errors);
442a4f63
SB
1874 bio_put(bio);
1875 return -EIO;
1876 }
b5d67f64 1877 bio_put(bio);
a2de733c
AJ
1878 }
1879
b5d67f64
SB
1880 return 0;
1881}
1882
ff023aac
SB
1883static void scrub_write_block_to_dev_replace(struct scrub_block *sblock)
1884{
0b246afa 1885 struct btrfs_fs_info *fs_info = sblock->sctx->fs_info;
ff023aac
SB
1886 int page_num;
1887
5a6ac9ea
MX
1888 /*
1889 * This block is used for the check of the parity on the source device,
1890 * so the data needn't be written into the destination device.
1891 */
1892 if (sblock->sparity)
1893 return;
1894
ff023aac
SB
1895 for (page_num = 0; page_num < sblock->page_count; page_num++) {
1896 int ret;
1897
1898 ret = scrub_write_page_to_dev_replace(sblock, page_num);
1899 if (ret)
1900 btrfs_dev_replace_stats_inc(
0b246afa 1901 &fs_info->dev_replace.num_write_errors);
ff023aac
SB
1902 }
1903}
1904
1905static int scrub_write_page_to_dev_replace(struct scrub_block *sblock,
1906 int page_num)
1907{
1908 struct scrub_page *spage = sblock->pagev[page_num];
1909
1910 BUG_ON(spage->page == NULL);
1911 if (spage->io_error) {
1912 void *mapped_buffer = kmap_atomic(spage->page);
1913
619a9742 1914 clear_page(mapped_buffer);
ff023aac
SB
1915 flush_dcache_page(spage->page);
1916 kunmap_atomic(mapped_buffer);
1917 }
1918 return scrub_add_page_to_wr_bio(sblock->sctx, spage);
1919}
1920
1921static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
1922 struct scrub_page *spage)
1923{
ff023aac
SB
1924 struct scrub_bio *sbio;
1925 int ret;
1926
3fb99303 1927 mutex_lock(&sctx->wr_lock);
ff023aac 1928again:
3fb99303 1929 if (!sctx->wr_curr_bio) {
225cce41
FM
1930 unsigned int nofs_flag;
1931
1932 /*
1933 * We must use GFP_NOFS because the scrub task might be waiting
1934 * for a worker task executing this function and in turn a
 1935 * transaction commit might be waiting for the scrub task to pause
1936 * (which needs to wait for all the worker tasks to complete
1937 * before pausing).
1938 */
1939 nofs_flag = memalloc_nofs_save();
3fb99303 1940 sctx->wr_curr_bio = kzalloc(sizeof(*sctx->wr_curr_bio),
58c4e173 1941 GFP_KERNEL);
225cce41 1942 memalloc_nofs_restore(nofs_flag);
3fb99303
DS
1943 if (!sctx->wr_curr_bio) {
1944 mutex_unlock(&sctx->wr_lock);
ff023aac
SB
1945 return -ENOMEM;
1946 }
3fb99303
DS
1947 sctx->wr_curr_bio->sctx = sctx;
1948 sctx->wr_curr_bio->page_count = 0;
ff023aac 1949 }
3fb99303 1950 sbio = sctx->wr_curr_bio;
ff023aac
SB
1951 if (sbio->page_count == 0) {
1952 struct bio *bio;
1953
1954 sbio->physical = spage->physical_for_dev_replace;
1955 sbio->logical = spage->logical;
3fb99303 1956 sbio->dev = sctx->wr_tgtdev;
ff023aac
SB
1957 bio = sbio->bio;
1958 if (!bio) {
c5e4c3d7 1959 bio = btrfs_io_bio_alloc(sctx->pages_per_wr_bio);
ff023aac
SB
1960 sbio->bio = bio;
1961 }
1962
1963 bio->bi_private = sbio;
1964 bio->bi_end_io = scrub_wr_bio_end_io;
74d46992 1965 bio_set_dev(bio, sbio->dev->bdev);
4f024f37 1966 bio->bi_iter.bi_sector = sbio->physical >> 9;
37226b21 1967 bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
4e4cbee9 1968 sbio->status = 0;
ff023aac
SB
1969 } else if (sbio->physical + sbio->page_count * PAGE_SIZE !=
1970 spage->physical_for_dev_replace ||
1971 sbio->logical + sbio->page_count * PAGE_SIZE !=
1972 spage->logical) {
1973 scrub_wr_submit(sctx);
1974 goto again;
1975 }
1976
1977 ret = bio_add_page(sbio->bio, spage->page, PAGE_SIZE, 0);
1978 if (ret != PAGE_SIZE) {
1979 if (sbio->page_count < 1) {
1980 bio_put(sbio->bio);
1981 sbio->bio = NULL;
3fb99303 1982 mutex_unlock(&sctx->wr_lock);
ff023aac
SB
1983 return -EIO;
1984 }
1985 scrub_wr_submit(sctx);
1986 goto again;
1987 }
1988
1989 sbio->pagev[sbio->page_count] = spage;
1990 scrub_page_get(spage);
1991 sbio->page_count++;
3fb99303 1992 if (sbio->page_count == sctx->pages_per_wr_bio)
ff023aac 1993 scrub_wr_submit(sctx);
3fb99303 1994 mutex_unlock(&sctx->wr_lock);
ff023aac
SB
1995
1996 return 0;
1997}
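/*
 * Write batching above, in short: pages destined for the dev-replace
 * target are appended to wr_curr_bio only while both the physical
 * address on the target and the logical address stay contiguous with
 * what is already queued; any discontinuity (or a full bio of
 * pages_per_wr_bio pages, or a failed bio_add_page) forces a
 * scrub_wr_submit() and a fresh bio. Each queued page also takes an
 * extra reference that is dropped in the write completion worker.
 */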
1998
1999static void scrub_wr_submit(struct scrub_ctx *sctx)
2000{
ff023aac
SB
2001 struct scrub_bio *sbio;
2002
3fb99303 2003 if (!sctx->wr_curr_bio)
ff023aac
SB
2004 return;
2005
3fb99303
DS
2006 sbio = sctx->wr_curr_bio;
2007 sctx->wr_curr_bio = NULL;
74d46992 2008 WARN_ON(!sbio->bio->bi_disk);
ff023aac
SB
2009 scrub_pending_bio_inc(sctx);
 2010 /* Process all writes in a single worker thread so that the block
 2011 * layer can order the requests before sending them to the driver;
 2012 * this doubled the write performance on spinning disks when
 2013 * measured with Linux 3.5. */
4e49ea4a 2014 btrfsic_submit_bio(sbio->bio);
ff023aac
SB
2015}
2016
4246a0b6 2017static void scrub_wr_bio_end_io(struct bio *bio)
ff023aac
SB
2018{
2019 struct scrub_bio *sbio = bio->bi_private;
fb456252 2020 struct btrfs_fs_info *fs_info = sbio->dev->fs_info;
ff023aac 2021
4e4cbee9 2022 sbio->status = bio->bi_status;
ff023aac
SB
2023 sbio->bio = bio;
2024
9e0af237
LB
2025 btrfs_init_work(&sbio->work, btrfs_scrubwrc_helper,
2026 scrub_wr_bio_end_io_worker, NULL, NULL);
0339ef2f 2027 btrfs_queue_work(fs_info->scrub_wr_completion_workers, &sbio->work);
ff023aac
SB
2028}
2029
2030static void scrub_wr_bio_end_io_worker(struct btrfs_work *work)
2031{
2032 struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
2033 struct scrub_ctx *sctx = sbio->sctx;
2034 int i;
2035
2036 WARN_ON(sbio->page_count > SCRUB_PAGES_PER_WR_BIO);
4e4cbee9 2037 if (sbio->status) {
ff023aac 2038 struct btrfs_dev_replace *dev_replace =
fb456252 2039 &sbio->sctx->fs_info->dev_replace;
ff023aac
SB
2040
2041 for (i = 0; i < sbio->page_count; i++) {
2042 struct scrub_page *spage = sbio->pagev[i];
2043
2044 spage->io_error = 1;
2045 btrfs_dev_replace_stats_inc(&dev_replace->
2046 num_write_errors);
2047 }
2048 }
2049
2050 for (i = 0; i < sbio->page_count; i++)
2051 scrub_page_put(sbio->pagev[i]);
2052
2053 bio_put(sbio->bio);
2054 kfree(sbio);
2055 scrub_pending_bio_dec(sctx);
2056}
2057
2058static int scrub_checksum(struct scrub_block *sblock)
b5d67f64
SB
2059{
2060 u64 flags;
2061 int ret;
2062
ba7cf988
ZL
2063 /*
2064 * No need to initialize these stats currently,
 2065 * because this function only uses the return value
 2066 * instead of these stats values.
2067 *
2068 * Todo:
2069 * always use stats
2070 */
2071 sblock->header_error = 0;
2072 sblock->generation_error = 0;
2073 sblock->checksum_error = 0;
2074
7a9e9987
SB
2075 WARN_ON(sblock->page_count < 1);
2076 flags = sblock->pagev[0]->flags;
b5d67f64
SB
2077 ret = 0;
2078 if (flags & BTRFS_EXTENT_FLAG_DATA)
2079 ret = scrub_checksum_data(sblock);
2080 else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
2081 ret = scrub_checksum_tree_block(sblock);
2082 else if (flags & BTRFS_EXTENT_FLAG_SUPER)
2083 (void)scrub_checksum_super(sblock);
2084 else
2085 WARN_ON(1);
2086 if (ret)
2087 scrub_handle_errored_block(sblock);
ff023aac
SB
2088
2089 return ret;
a2de733c
AJ
2090}
2091
b5d67f64 2092static int scrub_checksum_data(struct scrub_block *sblock)
a2de733c 2093{
d9d181c1 2094 struct scrub_ctx *sctx = sblock->sctx;
a2de733c 2095 u8 csum[BTRFS_CSUM_SIZE];
b5d67f64
SB
2096 u8 *on_disk_csum;
2097 struct page *page;
2098 void *buffer;
a2de733c 2099 u32 crc = ~(u32)0;
b5d67f64
SB
2100 u64 len;
2101 int index;
a2de733c 2102
b5d67f64 2103 BUG_ON(sblock->page_count < 1);
7a9e9987 2104 if (!sblock->pagev[0]->have_csum)
a2de733c
AJ
2105 return 0;
2106
7a9e9987
SB
2107 on_disk_csum = sblock->pagev[0]->csum;
2108 page = sblock->pagev[0]->page;
9613bebb 2109 buffer = kmap_atomic(page);
b5d67f64 2110
25cc1226 2111 len = sctx->fs_info->sectorsize;
b5d67f64
SB
2112 index = 0;
2113 for (;;) {
2114 u64 l = min_t(u64, len, PAGE_SIZE);
2115
b0496686 2116 crc = btrfs_csum_data(buffer, crc, l);
9613bebb 2117 kunmap_atomic(buffer);
b5d67f64
SB
2118 len -= l;
2119 if (len == 0)
2120 break;
2121 index++;
2122 BUG_ON(index >= sblock->page_count);
7a9e9987
SB
2123 BUG_ON(!sblock->pagev[index]->page);
2124 page = sblock->pagev[index]->page;
9613bebb 2125 buffer = kmap_atomic(page);
b5d67f64
SB
2126 }
2127
a2de733c 2128 btrfs_csum_final(crc, csum);
d9d181c1 2129 if (memcmp(csum, on_disk_csum, sctx->csum_size))
ba7cf988 2130 sblock->checksum_error = 1;
a2de733c 2131
ba7cf988 2132 return sblock->checksum_error;
a2de733c
AJ
2133}
2134
b5d67f64 2135static int scrub_checksum_tree_block(struct scrub_block *sblock)
a2de733c 2136{
d9d181c1 2137 struct scrub_ctx *sctx = sblock->sctx;
a2de733c 2138 struct btrfs_header *h;
0b246afa 2139 struct btrfs_fs_info *fs_info = sctx->fs_info;
b5d67f64
SB
2140 u8 calculated_csum[BTRFS_CSUM_SIZE];
2141 u8 on_disk_csum[BTRFS_CSUM_SIZE];
2142 struct page *page;
2143 void *mapped_buffer;
2144 u64 mapped_size;
2145 void *p;
a2de733c 2146 u32 crc = ~(u32)0;
b5d67f64
SB
2147 u64 len;
2148 int index;
2149
2150 BUG_ON(sblock->page_count < 1);
7a9e9987 2151 page = sblock->pagev[0]->page;
9613bebb 2152 mapped_buffer = kmap_atomic(page);
b5d67f64 2153 h = (struct btrfs_header *)mapped_buffer;
d9d181c1 2154 memcpy(on_disk_csum, h->csum, sctx->csum_size);
a2de733c
AJ
2155
2156 /*
2157 * we don't use the getter functions here, as we
2158 * a) don't have an extent buffer and
2159 * b) the page is already kmapped
2160 */
3cae210f 2161 if (sblock->pagev[0]->logical != btrfs_stack_header_bytenr(h))
ba7cf988 2162 sblock->header_error = 1;
a2de733c 2163
ba7cf988
ZL
2164 if (sblock->pagev[0]->generation != btrfs_stack_header_generation(h)) {
2165 sblock->header_error = 1;
2166 sblock->generation_error = 1;
2167 }
a2de733c 2168
17a9be2f 2169 if (!scrub_check_fsid(h->fsid, sblock->pagev[0]))
ba7cf988 2170 sblock->header_error = 1;
a2de733c
AJ
2171
2172 if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
2173 BTRFS_UUID_SIZE))
ba7cf988 2174 sblock->header_error = 1;
a2de733c 2175
25cc1226 2176 len = sctx->fs_info->nodesize - BTRFS_CSUM_SIZE;
b5d67f64
SB
2177 mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE;
2178 p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE;
2179 index = 0;
2180 for (;;) {
2181 u64 l = min_t(u64, len, mapped_size);
2182
b0496686 2183 crc = btrfs_csum_data(p, crc, l);
9613bebb 2184 kunmap_atomic(mapped_buffer);
b5d67f64
SB
2185 len -= l;
2186 if (len == 0)
2187 break;
2188 index++;
2189 BUG_ON(index >= sblock->page_count);
7a9e9987
SB
2190 BUG_ON(!sblock->pagev[index]->page);
2191 page = sblock->pagev[index]->page;
9613bebb 2192 mapped_buffer = kmap_atomic(page);
b5d67f64
SB
2193 mapped_size = PAGE_SIZE;
2194 p = mapped_buffer;
2195 }
2196
2197 btrfs_csum_final(crc, calculated_csum);
d9d181c1 2198 if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size))
ba7cf988 2199 sblock->checksum_error = 1;
a2de733c 2200
ba7cf988 2201 return sblock->header_error || sblock->checksum_error;
a2de733c
AJ
2202}
2203
b5d67f64 2204static int scrub_checksum_super(struct scrub_block *sblock)
a2de733c
AJ
2205{
2206 struct btrfs_super_block *s;
d9d181c1 2207 struct scrub_ctx *sctx = sblock->sctx;
b5d67f64
SB
2208 u8 calculated_csum[BTRFS_CSUM_SIZE];
2209 u8 on_disk_csum[BTRFS_CSUM_SIZE];
2210 struct page *page;
2211 void *mapped_buffer;
2212 u64 mapped_size;
2213 void *p;
a2de733c 2214 u32 crc = ~(u32)0;
442a4f63
SB
2215 int fail_gen = 0;
2216 int fail_cor = 0;
b5d67f64
SB
2217 u64 len;
2218 int index;
a2de733c 2219
b5d67f64 2220 BUG_ON(sblock->page_count < 1);
7a9e9987 2221 page = sblock->pagev[0]->page;
9613bebb 2222 mapped_buffer = kmap_atomic(page);
b5d67f64 2223 s = (struct btrfs_super_block *)mapped_buffer;
d9d181c1 2224 memcpy(on_disk_csum, s->csum, sctx->csum_size);
a2de733c 2225
3cae210f 2226 if (sblock->pagev[0]->logical != btrfs_super_bytenr(s))
442a4f63 2227 ++fail_cor;
a2de733c 2228
3cae210f 2229 if (sblock->pagev[0]->generation != btrfs_super_generation(s))
442a4f63 2230 ++fail_gen;
a2de733c 2231
17a9be2f 2232 if (!scrub_check_fsid(s->fsid, sblock->pagev[0]))
442a4f63 2233 ++fail_cor;
a2de733c 2234
b5d67f64
SB
2235 len = BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE;
2236 mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE;
2237 p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE;
2238 index = 0;
2239 for (;;) {
2240 u64 l = min_t(u64, len, mapped_size);
2241
b0496686 2242 crc = btrfs_csum_data(p, crc, l);
9613bebb 2243 kunmap_atomic(mapped_buffer);
b5d67f64
SB
2244 len -= l;
2245 if (len == 0)
2246 break;
2247 index++;
2248 BUG_ON(index >= sblock->page_count);
7a9e9987
SB
2249 BUG_ON(!sblock->pagev[index]->page);
2250 page = sblock->pagev[index]->page;
9613bebb 2251 mapped_buffer = kmap_atomic(page);
b5d67f64
SB
2252 mapped_size = PAGE_SIZE;
2253 p = mapped_buffer;
2254 }
2255
2256 btrfs_csum_final(crc, calculated_csum);
d9d181c1 2257 if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size))
442a4f63 2258 ++fail_cor;
a2de733c 2259
442a4f63 2260 if (fail_cor + fail_gen) {
a2de733c
AJ
2261 /*
 2262 * If we find an error in a super block, we just report it.
 2263 * The super block will get rewritten with the next transaction
 2264 * commit anyway.
2265 */
d9d181c1
SB
2266 spin_lock(&sctx->stat_lock);
2267 ++sctx->stat.super_errors;
2268 spin_unlock(&sctx->stat_lock);
442a4f63 2269 if (fail_cor)
7a9e9987 2270 btrfs_dev_stat_inc_and_print(sblock->pagev[0]->dev,
442a4f63
SB
2271 BTRFS_DEV_STAT_CORRUPTION_ERRS);
2272 else
7a9e9987 2273 btrfs_dev_stat_inc_and_print(sblock->pagev[0]->dev,
442a4f63 2274 BTRFS_DEV_STAT_GENERATION_ERRS);
a2de733c
AJ
2275 }
2276
442a4f63 2277 return fail_cor + fail_gen;
a2de733c
AJ
2278}
2279
b5d67f64
SB
2280static void scrub_block_get(struct scrub_block *sblock)
2281{
186debd6 2282 refcount_inc(&sblock->refs);
b5d67f64
SB
2283}
2284
2285static void scrub_block_put(struct scrub_block *sblock)
2286{
186debd6 2287 if (refcount_dec_and_test(&sblock->refs)) {
b5d67f64
SB
2288 int i;
2289
5a6ac9ea
MX
2290 if (sblock->sparity)
2291 scrub_parity_put(sblock->sparity);
2292
b5d67f64 2293 for (i = 0; i < sblock->page_count; i++)
7a9e9987 2294 scrub_page_put(sblock->pagev[i]);
b5d67f64
SB
2295 kfree(sblock);
2296 }
2297}
2298
7a9e9987
SB
2299static void scrub_page_get(struct scrub_page *spage)
2300{
57019345 2301 atomic_inc(&spage->refs);
7a9e9987
SB
2302}
2303
2304static void scrub_page_put(struct scrub_page *spage)
2305{
57019345 2306 if (atomic_dec_and_test(&spage->refs)) {
7a9e9987
SB
2307 if (spage->page)
2308 __free_page(spage->page);
2309 kfree(spage);
2310 }
2311}
2312
d9d181c1 2313static void scrub_submit(struct scrub_ctx *sctx)
a2de733c
AJ
2314{
2315 struct scrub_bio *sbio;
2316
d9d181c1 2317 if (sctx->curr == -1)
1623edeb 2318 return;
a2de733c 2319
d9d181c1
SB
2320 sbio = sctx->bios[sctx->curr];
2321 sctx->curr = -1;
b6bfebc1 2322 scrub_pending_bio_inc(sctx);
4e49ea4a 2323 btrfsic_submit_bio(sbio->bio);
a2de733c
AJ
2324}
2325
ff023aac
SB
2326static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx,
2327 struct scrub_page *spage)
a2de733c 2328{
b5d67f64 2329 struct scrub_block *sblock = spage->sblock;
a2de733c 2330 struct scrub_bio *sbio;
69f4cb52 2331 int ret;
a2de733c
AJ
2332
2333again:
2334 /*
2335 * grab a fresh bio or wait for one to become available
2336 */
d9d181c1
SB
2337 while (sctx->curr == -1) {
2338 spin_lock(&sctx->list_lock);
2339 sctx->curr = sctx->first_free;
2340 if (sctx->curr != -1) {
2341 sctx->first_free = sctx->bios[sctx->curr]->next_free;
2342 sctx->bios[sctx->curr]->next_free = -1;
2343 sctx->bios[sctx->curr]->page_count = 0;
2344 spin_unlock(&sctx->list_lock);
a2de733c 2345 } else {
d9d181c1
SB
2346 spin_unlock(&sctx->list_lock);
2347 wait_event(sctx->list_wait, sctx->first_free != -1);
a2de733c
AJ
2348 }
2349 }
d9d181c1 2350 sbio = sctx->bios[sctx->curr];
b5d67f64 2351 if (sbio->page_count == 0) {
69f4cb52
AJ
2352 struct bio *bio;
2353
b5d67f64
SB
2354 sbio->physical = spage->physical;
2355 sbio->logical = spage->logical;
a36cf8b8 2356 sbio->dev = spage->dev;
b5d67f64
SB
2357 bio = sbio->bio;
2358 if (!bio) {
c5e4c3d7 2359 bio = btrfs_io_bio_alloc(sctx->pages_per_rd_bio);
b5d67f64
SB
2360 sbio->bio = bio;
2361 }
69f4cb52
AJ
2362
2363 bio->bi_private = sbio;
2364 bio->bi_end_io = scrub_bio_end_io;
74d46992 2365 bio_set_dev(bio, sbio->dev->bdev);
4f024f37 2366 bio->bi_iter.bi_sector = sbio->physical >> 9;
37226b21 2367 bio_set_op_attrs(bio, REQ_OP_READ, 0);
4e4cbee9 2368 sbio->status = 0;
b5d67f64
SB
2369 } else if (sbio->physical + sbio->page_count * PAGE_SIZE !=
2370 spage->physical ||
2371 sbio->logical + sbio->page_count * PAGE_SIZE !=
a36cf8b8
SB
2372 spage->logical ||
2373 sbio->dev != spage->dev) {
d9d181c1 2374 scrub_submit(sctx);
a2de733c
AJ
2375 goto again;
2376 }
69f4cb52 2377
b5d67f64
SB
2378 sbio->pagev[sbio->page_count] = spage;
2379 ret = bio_add_page(sbio->bio, spage->page, PAGE_SIZE, 0);
2380 if (ret != PAGE_SIZE) {
2381 if (sbio->page_count < 1) {
2382 bio_put(sbio->bio);
2383 sbio->bio = NULL;
2384 return -EIO;
2385 }
d9d181c1 2386 scrub_submit(sctx);
69f4cb52
AJ
2387 goto again;
2388 }
2389
ff023aac 2390 scrub_block_get(sblock); /* one for the page added to the bio */
b5d67f64
SB
2391 atomic_inc(&sblock->outstanding_pages);
2392 sbio->page_count++;
ff023aac 2393 if (sbio->page_count == sctx->pages_per_rd_bio)
d9d181c1 2394 scrub_submit(sctx);
b5d67f64
SB
2395
2396 return 0;
2397}
2398
22365979 2399static void scrub_missing_raid56_end_io(struct bio *bio)
73ff61db
OS
2400{
2401 struct scrub_block *sblock = bio->bi_private;
fb456252 2402 struct btrfs_fs_info *fs_info = sblock->sctx->fs_info;
73ff61db 2403
4e4cbee9 2404 if (bio->bi_status)
73ff61db
OS
2405 sblock->no_io_error_seen = 0;
2406
4673272f
ST
2407 bio_put(bio);
2408
73ff61db
OS
2409 btrfs_queue_work(fs_info->scrub_workers, &sblock->work);
2410}
2411
2412static void scrub_missing_raid56_worker(struct btrfs_work *work)
2413{
2414 struct scrub_block *sblock = container_of(work, struct scrub_block, work);
2415 struct scrub_ctx *sctx = sblock->sctx;
0b246afa 2416 struct btrfs_fs_info *fs_info = sctx->fs_info;
73ff61db
OS
2417 u64 logical;
2418 struct btrfs_device *dev;
2419
73ff61db
OS
2420 logical = sblock->pagev[0]->logical;
2421 dev = sblock->pagev[0]->dev;
2422
affe4a5a 2423 if (sblock->no_io_error_seen)
ba7cf988 2424 scrub_recheck_block_checksum(sblock);
73ff61db
OS
2425
2426 if (!sblock->no_io_error_seen) {
2427 spin_lock(&sctx->stat_lock);
2428 sctx->stat.read_errors++;
2429 spin_unlock(&sctx->stat_lock);
0b246afa 2430 btrfs_err_rl_in_rcu(fs_info,
b14af3b4 2431 "IO error rebuilding logical %llu for dev %s",
73ff61db
OS
2432 logical, rcu_str_deref(dev->name));
2433 } else if (sblock->header_error || sblock->checksum_error) {
2434 spin_lock(&sctx->stat_lock);
2435 sctx->stat.uncorrectable_errors++;
2436 spin_unlock(&sctx->stat_lock);
0b246afa 2437 btrfs_err_rl_in_rcu(fs_info,
b14af3b4 2438 "failed to rebuild valid logical %llu for dev %s",
73ff61db
OS
2439 logical, rcu_str_deref(dev->name));
2440 } else {
2441 scrub_write_block_to_dev_replace(sblock);
2442 }
2443
2073c4c2 2444 if (sctx->is_dev_replace && sctx->flush_all_writes) {
3fb99303 2445 mutex_lock(&sctx->wr_lock);
73ff61db 2446 scrub_wr_submit(sctx);
3fb99303 2447 mutex_unlock(&sctx->wr_lock);
73ff61db
OS
2448 }
2449
e75c65c0 2450 scrub_block_put(sblock);
73ff61db
OS
2451 scrub_pending_bio_dec(sctx);
2452}
2453
2454static void scrub_missing_raid56_pages(struct scrub_block *sblock)
2455{
2456 struct scrub_ctx *sctx = sblock->sctx;
fb456252 2457 struct btrfs_fs_info *fs_info = sctx->fs_info;
73ff61db
OS
2458 u64 length = sblock->page_count * PAGE_SIZE;
2459 u64 logical = sblock->pagev[0]->logical;
f1fee653 2460 struct btrfs_bio *bbio = NULL;
73ff61db
OS
2461 struct bio *bio;
2462 struct btrfs_raid_bio *rbio;
2463 int ret;
2464 int i;
2465
ae6529c3 2466 btrfs_bio_counter_inc_blocked(fs_info);
cf8cddd3 2467 ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical,
825ad4c9 2468 &length, &bbio);
73ff61db
OS
2469 if (ret || !bbio || !bbio->raid_map)
2470 goto bbio_out;
2471
2472 if (WARN_ON(!sctx->is_dev_replace ||
2473 !(bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK))) {
2474 /*
2475 * We shouldn't be scrubbing a missing device. Even for dev
2476 * replace, we should only get here for RAID 5/6. We either
2477 * managed to mount something with no mirrors remaining or
2478 * there's a bug in scrub_remap_extent()/btrfs_map_block().
2479 */
2480 goto bbio_out;
2481 }
2482
c5e4c3d7 2483 bio = btrfs_io_bio_alloc(0);
73ff61db
OS
2484 bio->bi_iter.bi_sector = logical >> 9;
2485 bio->bi_private = sblock;
2486 bio->bi_end_io = scrub_missing_raid56_end_io;
2487
2ff7e61e 2488 rbio = raid56_alloc_missing_rbio(fs_info, bio, bbio, length);
73ff61db
OS
2489 if (!rbio)
2490 goto rbio_out;
2491
2492 for (i = 0; i < sblock->page_count; i++) {
2493 struct scrub_page *spage = sblock->pagev[i];
2494
2495 raid56_add_scrub_pages(rbio, spage->page, spage->logical);
2496 }
2497
2498 btrfs_init_work(&sblock->work, btrfs_scrub_helper,
2499 scrub_missing_raid56_worker, NULL, NULL);
2500 scrub_block_get(sblock);
2501 scrub_pending_bio_inc(sctx);
2502 raid56_submit_missing_rbio(rbio);
2503 return;
2504
2505rbio_out:
2506 bio_put(bio);
2507bbio_out:
ae6529c3 2508 btrfs_bio_counter_dec(fs_info);
73ff61db
OS
2509 btrfs_put_bbio(bbio);
2510 spin_lock(&sctx->stat_lock);
2511 sctx->stat.malloc_errors++;
2512 spin_unlock(&sctx->stat_lock);
2513}
2514
d9d181c1 2515static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
a36cf8b8 2516 u64 physical, struct btrfs_device *dev, u64 flags,
ff023aac
SB
2517 u64 gen, int mirror_num, u8 *csum, int force,
2518 u64 physical_for_dev_replace)
b5d67f64
SB
2519{
2520 struct scrub_block *sblock;
2521 int index;
2522
58c4e173 2523 sblock = kzalloc(sizeof(*sblock), GFP_KERNEL);
b5d67f64 2524 if (!sblock) {
d9d181c1
SB
2525 spin_lock(&sctx->stat_lock);
2526 sctx->stat.malloc_errors++;
2527 spin_unlock(&sctx->stat_lock);
b5d67f64 2528 return -ENOMEM;
a2de733c 2529 }
b5d67f64 2530
7a9e9987
SB
2531 /* one ref inside this function, plus one for each page added to
2532 * a bio later on */
186debd6 2533 refcount_set(&sblock->refs, 1);
d9d181c1 2534 sblock->sctx = sctx;
b5d67f64
SB
2535 sblock->no_io_error_seen = 1;
2536
2537 for (index = 0; len > 0; index++) {
7a9e9987 2538 struct scrub_page *spage;
b5d67f64
SB
2539 u64 l = min_t(u64, len, PAGE_SIZE);
2540
58c4e173 2541 spage = kzalloc(sizeof(*spage), GFP_KERNEL);
7a9e9987
SB
2542 if (!spage) {
2543leave_nomem:
d9d181c1
SB
2544 spin_lock(&sctx->stat_lock);
2545 sctx->stat.malloc_errors++;
2546 spin_unlock(&sctx->stat_lock);
7a9e9987 2547 scrub_block_put(sblock);
b5d67f64
SB
2548 return -ENOMEM;
2549 }
7a9e9987
SB
2550 BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK);
2551 scrub_page_get(spage);
2552 sblock->pagev[index] = spage;
b5d67f64 2553 spage->sblock = sblock;
a36cf8b8 2554 spage->dev = dev;
b5d67f64
SB
2555 spage->flags = flags;
2556 spage->generation = gen;
2557 spage->logical = logical;
2558 spage->physical = physical;
ff023aac 2559 spage->physical_for_dev_replace = physical_for_dev_replace;
b5d67f64
SB
2560 spage->mirror_num = mirror_num;
2561 if (csum) {
2562 spage->have_csum = 1;
d9d181c1 2563 memcpy(spage->csum, csum, sctx->csum_size);
b5d67f64
SB
2564 } else {
2565 spage->have_csum = 0;
2566 }
2567 sblock->page_count++;
58c4e173 2568 spage->page = alloc_page(GFP_KERNEL);
7a9e9987
SB
2569 if (!spage->page)
2570 goto leave_nomem;
b5d67f64
SB
2571 len -= l;
2572 logical += l;
2573 physical += l;
ff023aac 2574 physical_for_dev_replace += l;
b5d67f64
SB
2575 }
2576
7a9e9987 2577 WARN_ON(sblock->page_count == 0);
73ff61db
OS
2578 if (dev->missing) {
2579 /*
2580 * This case should only be hit for RAID 5/6 device replace. See
2581 * the comment in scrub_missing_raid56_pages() for details.
2582 */
2583 scrub_missing_raid56_pages(sblock);
2584 } else {
2585 for (index = 0; index < sblock->page_count; index++) {
2586 struct scrub_page *spage = sblock->pagev[index];
2587 int ret;
1bc87793 2588
73ff61db
OS
2589 ret = scrub_add_page_to_rd_bio(sctx, spage);
2590 if (ret) {
2591 scrub_block_put(sblock);
2592 return ret;
2593 }
b5d67f64 2594 }
a2de733c 2595
73ff61db
OS
2596 if (force)
2597 scrub_submit(sctx);
2598 }
a2de733c 2599
b5d67f64
SB
2600 /* last one frees, either here or in bio completion for last page */
2601 scrub_block_put(sblock);
a2de733c
AJ
2602 return 0;
2603}
2604
4246a0b6 2605static void scrub_bio_end_io(struct bio *bio)
b5d67f64
SB
2606{
2607 struct scrub_bio *sbio = bio->bi_private;
fb456252 2608 struct btrfs_fs_info *fs_info = sbio->dev->fs_info;
b5d67f64 2609
4e4cbee9 2610 sbio->status = bio->bi_status;
b5d67f64
SB
2611 sbio->bio = bio;
2612
0339ef2f 2613 btrfs_queue_work(fs_info->scrub_workers, &sbio->work);
b5d67f64
SB
2614}
2615
2616static void scrub_bio_end_io_worker(struct btrfs_work *work)
2617{
2618 struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
d9d181c1 2619 struct scrub_ctx *sctx = sbio->sctx;
b5d67f64
SB
2620 int i;
2621
ff023aac 2622 BUG_ON(sbio->page_count > SCRUB_PAGES_PER_RD_BIO);
4e4cbee9 2623 if (sbio->status) {
b5d67f64
SB
2624 for (i = 0; i < sbio->page_count; i++) {
2625 struct scrub_page *spage = sbio->pagev[i];
2626
2627 spage->io_error = 1;
2628 spage->sblock->no_io_error_seen = 0;
2629 }
2630 }
2631
2632 /* now complete the scrub_block items that have all pages completed */
2633 for (i = 0; i < sbio->page_count; i++) {
2634 struct scrub_page *spage = sbio->pagev[i];
2635 struct scrub_block *sblock = spage->sblock;
2636
2637 if (atomic_dec_and_test(&sblock->outstanding_pages))
2638 scrub_block_complete(sblock);
2639 scrub_block_put(sblock);
2640 }
2641
b5d67f64
SB
2642 bio_put(sbio->bio);
2643 sbio->bio = NULL;
d9d181c1
SB
2644 spin_lock(&sctx->list_lock);
2645 sbio->next_free = sctx->first_free;
2646 sctx->first_free = sbio->index;
2647 spin_unlock(&sctx->list_lock);
ff023aac 2648
2073c4c2 2649 if (sctx->is_dev_replace && sctx->flush_all_writes) {
3fb99303 2650 mutex_lock(&sctx->wr_lock);
ff023aac 2651 scrub_wr_submit(sctx);
3fb99303 2652 mutex_unlock(&sctx->wr_lock);
ff023aac
SB
2653 }
2654
b6bfebc1 2655 scrub_pending_bio_dec(sctx);
b5d67f64
SB
2656}
2657
5a6ac9ea
MX
2658static inline void __scrub_mark_bitmap(struct scrub_parity *sparity,
2659 unsigned long *bitmap,
2660 u64 start, u64 len)
2661{
972d7219 2662 u64 offset;
7736b0a4
DS
2663 u64 nsectors64;
2664 u32 nsectors;
da17066c 2665 int sectorsize = sparity->sctx->fs_info->sectorsize;
5a6ac9ea
MX
2666
2667 if (len >= sparity->stripe_len) {
2668 bitmap_set(bitmap, 0, sparity->nsectors);
2669 return;
2670 }
2671
2672 start -= sparity->logic_start;
972d7219
LB
2673 start = div64_u64_rem(start, sparity->stripe_len, &offset);
2674 offset = div_u64(offset, sectorsize);
7736b0a4
DS
2675 nsectors64 = div_u64(len, sectorsize);
2676
2677 ASSERT(nsectors64 < UINT_MAX);
2678 nsectors = (u32)nsectors64;
5a6ac9ea
MX
2679
2680 if (offset + nsectors <= sparity->nsectors) {
2681 bitmap_set(bitmap, offset, nsectors);
2682 return;
2683 }
2684
2685 bitmap_set(bitmap, offset, sparity->nsectors - offset);
2686 bitmap_set(bitmap, 0, nsectors - (sparity->nsectors - offset));
2687}
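/*
 * A worked example for the bitmap helper above (illustrative numbers,
 * assuming a 64KiB stripe_len and 4KiB sectors, i.e. nsectors = 16):
 * for start - logic_start = 120KiB and len = 16KiB, the remainder of
 * 120KiB modulo 64KiB is 56KiB, so offset = 56KiB / 4KiB = 14 and the
 * range covers 4 sectors. Since 14 + 4 > 16 the range wraps: bits 14
 * and 15 are set first, then bits 0 and 1. Ranges of a full stripe_len
 * or more simply mark the whole bitmap.
 */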
2688
2689static inline void scrub_parity_mark_sectors_error(struct scrub_parity *sparity,
2690 u64 start, u64 len)
2691{
2692 __scrub_mark_bitmap(sparity, sparity->ebitmap, start, len);
2693}
2694
2695static inline void scrub_parity_mark_sectors_data(struct scrub_parity *sparity,
2696 u64 start, u64 len)
2697{
2698 __scrub_mark_bitmap(sparity, sparity->dbitmap, start, len);
2699}
2700
b5d67f64
SB
2701static void scrub_block_complete(struct scrub_block *sblock)
2702{
5a6ac9ea
MX
2703 int corrupted = 0;
2704
ff023aac 2705 if (!sblock->no_io_error_seen) {
5a6ac9ea 2706 corrupted = 1;
b5d67f64 2707 scrub_handle_errored_block(sblock);
ff023aac
SB
2708 } else {
2709 /*
2710 * if has checksum error, write via repair mechanism in
2711 * dev replace case, otherwise write here in dev replace
2712 * case.
2713 */
5a6ac9ea
MX
2714 corrupted = scrub_checksum(sblock);
2715 if (!corrupted && sblock->sctx->is_dev_replace)
ff023aac
SB
2716 scrub_write_block_to_dev_replace(sblock);
2717 }
5a6ac9ea
MX
2718
2719 if (sblock->sparity && corrupted && !sblock->data_corrected) {
2720 u64 start = sblock->pagev[0]->logical;
2721 u64 end = sblock->pagev[sblock->page_count - 1]->logical +
2722 PAGE_SIZE;
2723
2724 scrub_parity_mark_sectors_error(sblock->sparity,
2725 start, end - start);
2726 }
b5d67f64
SB
2727}
2728
3b5753ec 2729static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u8 *csum)
a2de733c
AJ
2730{
2731 struct btrfs_ordered_sum *sum = NULL;
f51a4a18 2732 unsigned long index;
a2de733c 2733 unsigned long num_sectors;
a2de733c 2734
d9d181c1
SB
2735 while (!list_empty(&sctx->csum_list)) {
2736 sum = list_first_entry(&sctx->csum_list,
a2de733c
AJ
2737 struct btrfs_ordered_sum, list);
2738 if (sum->bytenr > logical)
2739 return 0;
2740 if (sum->bytenr + sum->len > logical)
2741 break;
2742
d9d181c1 2743 ++sctx->stat.csum_discards;
a2de733c
AJ
2744 list_del(&sum->list);
2745 kfree(sum);
2746 sum = NULL;
2747 }
2748 if (!sum)
2749 return 0;
2750
1d1bf92d
DS
2751 index = div_u64(logical - sum->bytenr, sctx->fs_info->sectorsize);
2752 ASSERT(index < UINT_MAX);
2753
25cc1226 2754 num_sectors = sum->len / sctx->fs_info->sectorsize;
f51a4a18
MX
2755 memcpy(csum, sum->sums + index, sctx->csum_size);
2756 if (index == num_sectors - 1) {
a2de733c
AJ
2757 list_del(&sum->list);
2758 kfree(sum);
2759 }
f51a4a18 2760 return 1;
a2de733c
AJ
2761}
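/*
 * scrub_find_csum() above consumes sctx->csum_list, which was filled in
 * bytenr order by btrfs_lookup_csums_range(): entries that end at or
 * before 'logical' are dropped (counted as csum_discards), and when an
 * entry covers 'logical' the checksum at index
 * (logical - sum->bytenr) / sectorsize is copied into 'csum'. The
 * return value is 1 if a checksum was found and 0 if the extent has
 * none, in which case the data is scrubbed without csum verification.
 */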
2762
2763/* scrub extent tries to collect up to 64 kB for each bio */
d9d181c1 2764static int scrub_extent(struct scrub_ctx *sctx, u64 logical, u64 len,
a36cf8b8 2765 u64 physical, struct btrfs_device *dev, u64 flags,
ff023aac 2766 u64 gen, int mirror_num, u64 physical_for_dev_replace)
a2de733c
AJ
2767{
2768 int ret;
2769 u8 csum[BTRFS_CSUM_SIZE];
b5d67f64
SB
2770 u32 blocksize;
2771
2772 if (flags & BTRFS_EXTENT_FLAG_DATA) {
25cc1226 2773 blocksize = sctx->fs_info->sectorsize;
d9d181c1
SB
2774 spin_lock(&sctx->stat_lock);
2775 sctx->stat.data_extents_scrubbed++;
2776 sctx->stat.data_bytes_scrubbed += len;
2777 spin_unlock(&sctx->stat_lock);
b5d67f64 2778 } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
25cc1226 2779 blocksize = sctx->fs_info->nodesize;
d9d181c1
SB
2780 spin_lock(&sctx->stat_lock);
2781 sctx->stat.tree_extents_scrubbed++;
2782 sctx->stat.tree_bytes_scrubbed += len;
2783 spin_unlock(&sctx->stat_lock);
b5d67f64 2784 } else {
25cc1226 2785 blocksize = sctx->fs_info->sectorsize;
ff023aac 2786 WARN_ON(1);
b5d67f64 2787 }
a2de733c
AJ
2788
2789 while (len) {
b5d67f64 2790 u64 l = min_t(u64, len, blocksize);
a2de733c
AJ
2791 int have_csum = 0;
2792
2793 if (flags & BTRFS_EXTENT_FLAG_DATA) {
2794 /* push csums to sbio */
3b5753ec 2795 have_csum = scrub_find_csum(sctx, logical, csum);
a2de733c 2796 if (have_csum == 0)
d9d181c1 2797 ++sctx->stat.no_csum;
51fd66db 2798 if (0 && sctx->is_dev_replace && !have_csum) {
ff023aac
SB
2799 ret = copy_nocow_pages(sctx, logical, l,
2800 mirror_num,
2801 physical_for_dev_replace);
2802 goto behind_scrub_pages;
2803 }
a2de733c 2804 }
a36cf8b8 2805 ret = scrub_pages(sctx, logical, l, physical, dev, flags, gen,
ff023aac
SB
2806 mirror_num, have_csum ? csum : NULL, 0,
2807 physical_for_dev_replace);
2808behind_scrub_pages:
a2de733c
AJ
2809 if (ret)
2810 return ret;
2811 len -= l;
2812 logical += l;
2813 physical += l;
ff023aac 2814 physical_for_dev_replace += l;
a2de733c
AJ
2815 }
2816 return 0;
2817}
2818
5a6ac9ea
MX
2819static int scrub_pages_for_parity(struct scrub_parity *sparity,
2820 u64 logical, u64 len,
2821 u64 physical, struct btrfs_device *dev,
2822 u64 flags, u64 gen, int mirror_num, u8 *csum)
2823{
2824 struct scrub_ctx *sctx = sparity->sctx;
2825 struct scrub_block *sblock;
2826 int index;
2827
58c4e173 2828 sblock = kzalloc(sizeof(*sblock), GFP_KERNEL);
5a6ac9ea
MX
2829 if (!sblock) {
2830 spin_lock(&sctx->stat_lock);
2831 sctx->stat.malloc_errors++;
2832 spin_unlock(&sctx->stat_lock);
2833 return -ENOMEM;
2834 }
2835
2836 /* one ref inside this function, plus one for each page added to
2837 * a bio later on */
186debd6 2838 refcount_set(&sblock->refs, 1);
5a6ac9ea
MX
2839 sblock->sctx = sctx;
2840 sblock->no_io_error_seen = 1;
2841 sblock->sparity = sparity;
2842 scrub_parity_get(sparity);
2843
2844 for (index = 0; len > 0; index++) {
2845 struct scrub_page *spage;
2846 u64 l = min_t(u64, len, PAGE_SIZE);
2847
58c4e173 2848 spage = kzalloc(sizeof(*spage), GFP_KERNEL);
5a6ac9ea
MX
2849 if (!spage) {
2850leave_nomem:
2851 spin_lock(&sctx->stat_lock);
2852 sctx->stat.malloc_errors++;
2853 spin_unlock(&sctx->stat_lock);
2854 scrub_block_put(sblock);
2855 return -ENOMEM;
2856 }
2857 BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK);
2858 /* For scrub block */
2859 scrub_page_get(spage);
2860 sblock->pagev[index] = spage;
2861 /* For scrub parity */
2862 scrub_page_get(spage);
2863 list_add_tail(&spage->list, &sparity->spages);
2864 spage->sblock = sblock;
2865 spage->dev = dev;
2866 spage->flags = flags;
2867 spage->generation = gen;
2868 spage->logical = logical;
2869 spage->physical = physical;
2870 spage->mirror_num = mirror_num;
2871 if (csum) {
2872 spage->have_csum = 1;
2873 memcpy(spage->csum, csum, sctx->csum_size);
2874 } else {
2875 spage->have_csum = 0;
2876 }
2877 sblock->page_count++;
58c4e173 2878 spage->page = alloc_page(GFP_KERNEL);
5a6ac9ea
MX
2879 if (!spage->page)
2880 goto leave_nomem;
2881 len -= l;
2882 logical += l;
2883 physical += l;
2884 }
2885
2886 WARN_ON(sblock->page_count == 0);
2887 for (index = 0; index < sblock->page_count; index++) {
2888 struct scrub_page *spage = sblock->pagev[index];
2889 int ret;
2890
2891 ret = scrub_add_page_to_rd_bio(sctx, spage);
2892 if (ret) {
2893 scrub_block_put(sblock);
2894 return ret;
2895 }
2896 }
2897
2898 /* last one frees, either here or in bio completion for last page */
2899 scrub_block_put(sblock);
2900 return 0;
2901}
2902
2903static int scrub_extent_for_parity(struct scrub_parity *sparity,
2904 u64 logical, u64 len,
2905 u64 physical, struct btrfs_device *dev,
2906 u64 flags, u64 gen, int mirror_num)
2907{
2908 struct scrub_ctx *sctx = sparity->sctx;
2909 int ret;
2910 u8 csum[BTRFS_CSUM_SIZE];
2911 u32 blocksize;
2912
4a770891
OS
2913 if (dev->missing) {
2914 scrub_parity_mark_sectors_error(sparity, logical, len);
2915 return 0;
2916 }
2917
5a6ac9ea 2918 if (flags & BTRFS_EXTENT_FLAG_DATA) {
25cc1226 2919 blocksize = sctx->fs_info->sectorsize;
5a6ac9ea 2920 } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
25cc1226 2921 blocksize = sctx->fs_info->nodesize;
5a6ac9ea 2922 } else {
25cc1226 2923 blocksize = sctx->fs_info->sectorsize;
5a6ac9ea
MX
2924 WARN_ON(1);
2925 }
2926
2927 while (len) {
2928 u64 l = min_t(u64, len, blocksize);
2929 int have_csum = 0;
2930
2931 if (flags & BTRFS_EXTENT_FLAG_DATA) {
2932 /* push csums to sbio */
3b5753ec 2933 have_csum = scrub_find_csum(sctx, logical, csum);
5a6ac9ea
MX
2934 if (have_csum == 0)
2935 goto skip;
2936 }
2937 ret = scrub_pages_for_parity(sparity, logical, l, physical, dev,
2938 flags, gen, mirror_num,
2939 have_csum ? csum : NULL);
5a6ac9ea
MX
2940 if (ret)
2941 return ret;
6b6d24b3 2942skip:
5a6ac9ea
MX
2943 len -= l;
2944 logical += l;
2945 physical += l;
2946 }
2947 return 0;
2948}
2949
3b080b25
WS
2950/*
 2951 * Given a physical address, this will calculate its
 2952 * logical offset. If this is a parity stripe, it will return
 2953 * the leftmost data stripe's logical offset.
 2954 *
 2955 * Returns 0 if it is a data stripe, 1 if it is a parity stripe.
2956 */
2957static int get_raid56_logic_offset(u64 physical, int num,
5a6ac9ea
MX
2958 struct map_lookup *map, u64 *offset,
2959 u64 *stripe_start)
3b080b25
WS
2960{
2961 int i;
2962 int j = 0;
2963 u64 stripe_nr;
2964 u64 last_offset;
9d644a62
DS
2965 u32 stripe_index;
2966 u32 rot;
3b080b25
WS
2967
2968 last_offset = (physical - map->stripes[num].physical) *
2969 nr_data_stripes(map);
5a6ac9ea
MX
2970 if (stripe_start)
2971 *stripe_start = last_offset;
2972
3b080b25
WS
2973 *offset = last_offset;
2974 for (i = 0; i < nr_data_stripes(map); i++) {
2975 *offset = last_offset + i * map->stripe_len;
2976
42c61ab6 2977 stripe_nr = div64_u64(*offset, map->stripe_len);
b8b93add 2978 stripe_nr = div_u64(stripe_nr, nr_data_stripes(map));
3b080b25
WS
2979
2980 /* Work out the disk rotation on this stripe-set */
47c5713f 2981 stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, &rot);
3b080b25
WS
 2982 /* calculate which stripe this data is located on */
2983 rot += i;
e4fbaee2 2984 stripe_index = rot % map->num_stripes;
3b080b25
WS
2985 if (stripe_index == num)
2986 return 0;
2987 if (stripe_index < num)
2988 j++;
2989 }
2990 *offset = last_offset + j * map->stripe_len;
2991 return 1;
2992}
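/*
 * A small worked example for get_raid56_logic_offset() (illustrative
 * geometry: RAID5 over 3 devices, so nr_data_stripes = 2, with a 64KiB
 * stripe_len): for num = 1 and a physical address 64KiB into that
 * device's chunk, last_offset = 64KiB * 2 = 128KiB, i.e. the start of
 * full stripe 1. At i = 0 the candidate offset is 128KiB, which gives
 * stripe_nr = 1, rot = 1 and stripe_index = 1 == num, so the function
 * returns 0 with *offset = *stripe_start = 128KiB (relative to the
 * chunk): the block is the first data stripe of that full stripe.
 */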
2993
5a6ac9ea
MX
2994static void scrub_free_parity(struct scrub_parity *sparity)
2995{
2996 struct scrub_ctx *sctx = sparity->sctx;
2997 struct scrub_page *curr, *next;
2998 int nbits;
2999
3000 nbits = bitmap_weight(sparity->ebitmap, sparity->nsectors);
3001 if (nbits) {
3002 spin_lock(&sctx->stat_lock);
3003 sctx->stat.read_errors += nbits;
3004 sctx->stat.uncorrectable_errors += nbits;
3005 spin_unlock(&sctx->stat_lock);
3006 }
3007
3008 list_for_each_entry_safe(curr, next, &sparity->spages, list) {
3009 list_del_init(&curr->list);
3010 scrub_page_put(curr);
3011 }
3012
3013 kfree(sparity);
3014}
3015
20b2e302
ZL
3016static void scrub_parity_bio_endio_worker(struct btrfs_work *work)
3017{
3018 struct scrub_parity *sparity = container_of(work, struct scrub_parity,
3019 work);
3020 struct scrub_ctx *sctx = sparity->sctx;
3021
3022 scrub_free_parity(sparity);
3023 scrub_pending_bio_dec(sctx);
3024}
3025
4246a0b6 3026static void scrub_parity_bio_endio(struct bio *bio)
5a6ac9ea
MX
3027{
3028 struct scrub_parity *sparity = (struct scrub_parity *)bio->bi_private;
0b246afa 3029 struct btrfs_fs_info *fs_info = sparity->sctx->fs_info;
5a6ac9ea 3030
4e4cbee9 3031 if (bio->bi_status)
5a6ac9ea
MX
3032 bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap,
3033 sparity->nsectors);
3034
5a6ac9ea 3035 bio_put(bio);
20b2e302
ZL
3036
3037 btrfs_init_work(&sparity->work, btrfs_scrubparity_helper,
3038 scrub_parity_bio_endio_worker, NULL, NULL);
0b246afa 3039 btrfs_queue_work(fs_info->scrub_parity_workers, &sparity->work);
5a6ac9ea
MX
3040}
3041
3042static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
3043{
3044 struct scrub_ctx *sctx = sparity->sctx;
0b246afa 3045 struct btrfs_fs_info *fs_info = sctx->fs_info;
5a6ac9ea
MX
3046 struct bio *bio;
3047 struct btrfs_raid_bio *rbio;
5a6ac9ea 3048 struct btrfs_bio *bbio = NULL;
5a6ac9ea
MX
3049 u64 length;
3050 int ret;
3051
3052 if (!bitmap_andnot(sparity->dbitmap, sparity->dbitmap, sparity->ebitmap,
3053 sparity->nsectors))
3054 goto out;
3055
a0dd59de 3056 length = sparity->logic_end - sparity->logic_start;
ae6529c3
QW
3057
3058 btrfs_bio_counter_inc_blocked(fs_info);
0b246afa 3059 ret = btrfs_map_sblock(fs_info, BTRFS_MAP_WRITE, sparity->logic_start,
825ad4c9 3060 &length, &bbio);
8e5cfb55 3061 if (ret || !bbio || !bbio->raid_map)
5a6ac9ea
MX
3062 goto bbio_out;
3063
c5e4c3d7 3064 bio = btrfs_io_bio_alloc(0);
5a6ac9ea
MX
3065 bio->bi_iter.bi_sector = sparity->logic_start >> 9;
3066 bio->bi_private = sparity;
3067 bio->bi_end_io = scrub_parity_bio_endio;
3068
2ff7e61e 3069 rbio = raid56_parity_alloc_scrub_rbio(fs_info, bio, bbio,
8e5cfb55 3070 length, sparity->scrub_dev,
5a6ac9ea
MX
3071 sparity->dbitmap,
3072 sparity->nsectors);
3073 if (!rbio)
3074 goto rbio_out;
3075
5a6ac9ea
MX
3076 scrub_pending_bio_inc(sctx);
3077 raid56_parity_submit_scrub_rbio(rbio);
3078 return;
3079
3080rbio_out:
3081 bio_put(bio);
3082bbio_out:
ae6529c3 3083 btrfs_bio_counter_dec(fs_info);
6e9606d2 3084 btrfs_put_bbio(bbio);
5a6ac9ea
MX
3085 bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap,
3086 sparity->nsectors);
3087 spin_lock(&sctx->stat_lock);
3088 sctx->stat.malloc_errors++;
3089 spin_unlock(&sctx->stat_lock);
3090out:
3091 scrub_free_parity(sparity);
3092}
3093
3094static inline int scrub_calc_parity_bitmap_len(int nsectors)
3095{
bfca9a6d 3096 return DIV_ROUND_UP(nsectors, BITS_PER_LONG) * sizeof(long);
5a6ac9ea
MX
3097}
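/*
 * For the typical 64KiB stripe_len with 4KiB sectors this gives
 * nsectors = 16, so DIV_ROUND_UP(16, 64) = 1 long, i.e. 8 bytes per
 * bitmap on a 64-bit kernel; scrub_raid56_parity() below allocates
 * twice that, right behind struct scrub_parity, for dbitmap and
 * ebitmap.
 */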
3098
3099static void scrub_parity_get(struct scrub_parity *sparity)
3100{
78a76450 3101 refcount_inc(&sparity->refs);
5a6ac9ea
MX
3102}
3103
3104static void scrub_parity_put(struct scrub_parity *sparity)
3105{
78a76450 3106 if (!refcount_dec_and_test(&sparity->refs))
5a6ac9ea
MX
3107 return;
3108
3109 scrub_parity_check_and_repair(sparity);
3110}
3111
3112static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx,
3113 struct map_lookup *map,
3114 struct btrfs_device *sdev,
3115 struct btrfs_path *path,
3116 u64 logic_start,
3117 u64 logic_end)
3118{
fb456252 3119 struct btrfs_fs_info *fs_info = sctx->fs_info;
5a6ac9ea
MX
3120 struct btrfs_root *root = fs_info->extent_root;
3121 struct btrfs_root *csum_root = fs_info->csum_root;
3122 struct btrfs_extent_item *extent;
4a770891 3123 struct btrfs_bio *bbio = NULL;
5a6ac9ea
MX
3124 u64 flags;
3125 int ret;
3126 int slot;
3127 struct extent_buffer *l;
3128 struct btrfs_key key;
3129 u64 generation;
3130 u64 extent_logical;
3131 u64 extent_physical;
3132 u64 extent_len;
4a770891 3133 u64 mapped_length;
5a6ac9ea
MX
3134 struct btrfs_device *extent_dev;
3135 struct scrub_parity *sparity;
3136 int nsectors;
3137 int bitmap_len;
3138 int extent_mirror_num;
3139 int stop_loop = 0;
3140
0b246afa 3141 nsectors = div_u64(map->stripe_len, fs_info->sectorsize);
5a6ac9ea
MX
3142 bitmap_len = scrub_calc_parity_bitmap_len(nsectors);
3143 sparity = kzalloc(sizeof(struct scrub_parity) + 2 * bitmap_len,
3144 GFP_NOFS);
3145 if (!sparity) {
3146 spin_lock(&sctx->stat_lock);
3147 sctx->stat.malloc_errors++;
3148 spin_unlock(&sctx->stat_lock);
3149 return -ENOMEM;
3150 }
3151
3152 sparity->stripe_len = map->stripe_len;
3153 sparity->nsectors = nsectors;
3154 sparity->sctx = sctx;
3155 sparity->scrub_dev = sdev;
3156 sparity->logic_start = logic_start;
3157 sparity->logic_end = logic_end;
78a76450 3158 refcount_set(&sparity->refs, 1);
5a6ac9ea
MX
3159 INIT_LIST_HEAD(&sparity->spages);
3160 sparity->dbitmap = sparity->bitmap;
3161 sparity->ebitmap = (void *)sparity->bitmap + bitmap_len;
3162
3163 ret = 0;
3164 while (logic_start < logic_end) {
3165 if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
3166 key.type = BTRFS_METADATA_ITEM_KEY;
3167 else
3168 key.type = BTRFS_EXTENT_ITEM_KEY;
3169 key.objectid = logic_start;
3170 key.offset = (u64)-1;
3171
3172 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3173 if (ret < 0)
3174 goto out;
3175
3176 if (ret > 0) {
3177 ret = btrfs_previous_extent_item(root, path, 0);
3178 if (ret < 0)
3179 goto out;
3180 if (ret > 0) {
3181 btrfs_release_path(path);
3182 ret = btrfs_search_slot(NULL, root, &key,
3183 path, 0, 0);
3184 if (ret < 0)
3185 goto out;
3186 }
3187 }
3188
3189 stop_loop = 0;
3190 while (1) {
3191 u64 bytes;
3192
3193 l = path->nodes[0];
3194 slot = path->slots[0];
3195 if (slot >= btrfs_header_nritems(l)) {
3196 ret = btrfs_next_leaf(root, path);
3197 if (ret == 0)
3198 continue;
3199 if (ret < 0)
3200 goto out;
3201
3202 stop_loop = 1;
3203 break;
3204 }
3205 btrfs_item_key_to_cpu(l, &key, slot);
3206
d7cad238
ZL
3207 if (key.type != BTRFS_EXTENT_ITEM_KEY &&
3208 key.type != BTRFS_METADATA_ITEM_KEY)
3209 goto next;
3210
5a6ac9ea 3211 if (key.type == BTRFS_METADATA_ITEM_KEY)
0b246afa 3212 bytes = fs_info->nodesize;
5a6ac9ea
MX
3213 else
3214 bytes = key.offset;
3215
3216 if (key.objectid + bytes <= logic_start)
3217 goto next;
3218
a0dd59de 3219 if (key.objectid >= logic_end) {
5a6ac9ea
MX
3220 stop_loop = 1;
3221 break;
3222 }
3223
3224 while (key.objectid >= logic_start + map->stripe_len)
3225 logic_start += map->stripe_len;
3226
3227 extent = btrfs_item_ptr(l, slot,
3228 struct btrfs_extent_item);
3229 flags = btrfs_extent_flags(l, extent);
3230 generation = btrfs_extent_generation(l, extent);
3231
a323e813
ZL
3232 if ((flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) &&
3233 (key.objectid < logic_start ||
3234 key.objectid + bytes >
3235 logic_start + map->stripe_len)) {
5d163e0e
JM
3236 btrfs_err(fs_info,
3237 "scrub: tree block %llu spanning stripes, ignored. logical=%llu",
a323e813 3238 key.objectid, logic_start);
9799d2c3
ZL
3239 spin_lock(&sctx->stat_lock);
3240 sctx->stat.uncorrectable_errors++;
3241 spin_unlock(&sctx->stat_lock);
5a6ac9ea
MX
3242 goto next;
3243 }
3244again:
3245 extent_logical = key.objectid;
3246 extent_len = bytes;
3247
3248 if (extent_logical < logic_start) {
3249 extent_len -= logic_start - extent_logical;
3250 extent_logical = logic_start;
3251 }
3252
3253 if (extent_logical + extent_len >
3254 logic_start + map->stripe_len)
3255 extent_len = logic_start + map->stripe_len -
3256 extent_logical;
3257
3258 scrub_parity_mark_sectors_data(sparity, extent_logical,
3259 extent_len);
3260
4a770891 3261 mapped_length = extent_len;
f1fee653 3262 bbio = NULL;
cf8cddd3
CH
3263 ret = btrfs_map_block(fs_info, BTRFS_MAP_READ,
3264 extent_logical, &mapped_length, &bbio,
3265 0);
4a770891
OS
3266 if (!ret) {
3267 if (!bbio || mapped_length < extent_len)
3268 ret = -EIO;
3269 }
3270 if (ret) {
3271 btrfs_put_bbio(bbio);
3272 goto out;
3273 }
3274 extent_physical = bbio->stripes[0].physical;
3275 extent_mirror_num = bbio->mirror_num;
3276 extent_dev = bbio->stripes[0].dev;
3277 btrfs_put_bbio(bbio);
5a6ac9ea
MX
3278
3279 ret = btrfs_lookup_csums_range(csum_root,
3280 extent_logical,
3281 extent_logical + extent_len - 1,
3282 &sctx->csum_list, 1);
3283 if (ret)
3284 goto out;
3285
3286 ret = scrub_extent_for_parity(sparity, extent_logical,
3287 extent_len,
3288 extent_physical,
3289 extent_dev, flags,
3290 generation,
3291 extent_mirror_num);
6fa96d72
ZL
3292
3293 scrub_free_csums(sctx);
3294
5a6ac9ea
MX
3295 if (ret)
3296 goto out;
3297
5a6ac9ea
MX
3298 if (extent_logical + extent_len <
3299 key.objectid + bytes) {
3300 logic_start += map->stripe_len;
3301
3302 if (logic_start >= logic_end) {
3303 stop_loop = 1;
3304 break;
3305 }
3306
3307 if (logic_start < key.objectid + bytes) {
3308 cond_resched();
3309 goto again;
3310 }
3311 }
3312next:
3313 path->slots[0]++;
3314 }
3315
3316 btrfs_release_path(path);
3317
3318 if (stop_loop)
3319 break;
3320
3321 logic_start += map->stripe_len;
3322 }
3323out:
3324 if (ret < 0)
3325 scrub_parity_mark_sectors_error(sparity, logic_start,
a0dd59de 3326 logic_end - logic_start);
5a6ac9ea
MX
3327 scrub_parity_put(sparity);
3328 scrub_submit(sctx);
3fb99303 3329 mutex_lock(&sctx->wr_lock);
5a6ac9ea 3330 scrub_wr_submit(sctx);
3fb99303 3331 mutex_unlock(&sctx->wr_lock);
5a6ac9ea
MX
3332
3333 btrfs_release_path(path);
3334 return ret < 0 ? ret : 0;
3335}
3336
d9d181c1 3337static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
a36cf8b8
SB
3338 struct map_lookup *map,
3339 struct btrfs_device *scrub_dev,
f9257db0 3340 int num, u64 base, u64 length)
a2de733c 3341{
5a6ac9ea 3342 struct btrfs_path *path, *ppath;
fb456252 3343 struct btrfs_fs_info *fs_info = sctx->fs_info;
a2de733c
AJ
3344 struct btrfs_root *root = fs_info->extent_root;
3345 struct btrfs_root *csum_root = fs_info->csum_root;
3346 struct btrfs_extent_item *extent;
e7786c3a 3347 struct blk_plug plug;
a2de733c
AJ
3348 u64 flags;
3349 int ret;
3350 int slot;
a2de733c 3351 u64 nstripes;
a2de733c 3352 struct extent_buffer *l;
a2de733c
AJ
3353 u64 physical;
3354 u64 logical;
625f1c8d 3355 u64 logic_end;
3b080b25 3356 u64 physical_end;
a2de733c 3357 u64 generation;
e12fa9cd 3358 int mirror_num;
7a26285e
AJ
3359 struct reada_control *reada1;
3360 struct reada_control *reada2;
e6c11f9a 3361 struct btrfs_key key;
7a26285e 3362 struct btrfs_key key_end;
a2de733c
AJ
3363 u64 increment = map->stripe_len;
3364 u64 offset;
ff023aac
SB
3365 u64 extent_logical;
3366 u64 extent_physical;
3367 u64 extent_len;
5a6ac9ea
MX
3368 u64 stripe_logical;
3369 u64 stripe_end;
ff023aac
SB
3370 struct btrfs_device *extent_dev;
3371 int extent_mirror_num;
3b080b25 3372 int stop_loop = 0;
53b381b3 3373
3b080b25 3374 physical = map->stripes[num].physical;
a2de733c 3375 offset = 0;
42c61ab6 3376 nstripes = div64_u64(length, map->stripe_len);
a2de733c
AJ
3377 if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
3378 offset = map->stripe_len * num;
3379 increment = map->stripe_len * map->num_stripes;
193ea74b 3380 mirror_num = 1;
a2de733c
AJ
3381 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
3382 int factor = map->num_stripes / map->sub_stripes;
3383 offset = map->stripe_len * (num / map->sub_stripes);
3384 increment = map->stripe_len * factor;
193ea74b 3385 mirror_num = num % map->sub_stripes + 1;
a2de733c
AJ
3386 } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
3387 increment = map->stripe_len;
193ea74b 3388 mirror_num = num % map->num_stripes + 1;
a2de733c
AJ
3389 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
3390 increment = map->stripe_len;
193ea74b 3391 mirror_num = num % map->num_stripes + 1;
ffe2d203 3392 } else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
5a6ac9ea 3393 get_raid56_logic_offset(physical, num, map, &offset, NULL);
3b080b25
WS
3394 increment = map->stripe_len * nr_data_stripes(map);
3395 mirror_num = 1;
a2de733c
AJ
3396 } else {
3397 increment = map->stripe_len;
193ea74b 3398 mirror_num = 1;
a2de733c
AJ
3399 }
3400
3401 path = btrfs_alloc_path();
3402 if (!path)
3403 return -ENOMEM;
3404
5a6ac9ea
MX
3405 ppath = btrfs_alloc_path();
3406 if (!ppath) {
379d6854 3407 btrfs_free_path(path);
5a6ac9ea
MX
3408 return -ENOMEM;
3409 }
3410
b5d67f64
SB
3411 /*
3412 * work on commit root. The related disk blocks are static as
3413 * long as COW is applied. This means it is safe to rewrite
3414 * them to repair disk errors without any race conditions.
3415 */
a2de733c
AJ
3416 path->search_commit_root = 1;
3417 path->skip_locking = 1;
3418
063c54dc
GH
3419 ppath->search_commit_root = 1;
3420 ppath->skip_locking = 1;
a2de733c 3421 /*
7a26285e
AJ
3422 * trigger the readahead for the extent tree and csum tree and wait for
3423 * completion. During readahead, the scrub is officially paused
3424 * to not hold off transaction commits.
a2de733c
AJ
3425 */
3426 logical = base + offset;
3b080b25 3427 physical_end = physical + nstripes * map->stripe_len;
ffe2d203 3428 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
3b080b25 3429 get_raid56_logic_offset(physical_end, num,
5a6ac9ea 3430 map, &logic_end, NULL);
3b080b25
WS
3431 logic_end += base;
3432 } else {
3433 logic_end = logical + increment * nstripes;
3434 }
d9d181c1 3435 wait_event(sctx->list_wait,
b6bfebc1 3436 atomic_read(&sctx->bios_in_flight) == 0);
cb7ab021 3437 scrub_blocked_if_needed(fs_info);
7a26285e
AJ
3438
3439 /* FIXME it might be better to start readahead at commit root */
e6c11f9a
DS
3440 key.objectid = logical;
3441 key.type = BTRFS_EXTENT_ITEM_KEY;
3442 key.offset = (u64)0;
3b080b25 3443 key_end.objectid = logic_end;
3173a18f
JB
3444 key_end.type = BTRFS_METADATA_ITEM_KEY;
3445 key_end.offset = (u64)-1;
e6c11f9a 3446 reada1 = btrfs_reada_add(root, &key, &key_end);
7a26285e 3447
e6c11f9a
DS
3448 key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
3449 key.type = BTRFS_EXTENT_CSUM_KEY;
3450 key.offset = logical;
7a26285e
AJ
3451 key_end.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
3452 key_end.type = BTRFS_EXTENT_CSUM_KEY;
3b080b25 3453 key_end.offset = logic_end;
e6c11f9a 3454 reada2 = btrfs_reada_add(csum_root, &key, &key_end);
7a26285e
AJ
3455
3456 if (!IS_ERR(reada1))
3457 btrfs_reada_wait(reada1);
3458 if (!IS_ERR(reada2))
3459 btrfs_reada_wait(reada2);
3460
a2de733c
AJ
3461
3462 /*
3463 * collect all data csums for the stripe to avoid seeking during
3464 * the scrub. This might currently (crc32) end up being about 1MB.
3465 */
e7786c3a 3466 blk_start_plug(&plug);
a2de733c 3467
a2de733c
AJ
3468 /*
3469 * now find all extents for each stripe and scrub them
3470 */
a2de733c 3471 ret = 0;
3b080b25 3472 while (physical < physical_end) {
a2de733c
AJ
3473 /*
3474 * canceled?
3475 */
3476 if (atomic_read(&fs_info->scrub_cancel_req) ||
d9d181c1 3477 atomic_read(&sctx->cancel_req)) {
a2de733c
AJ
3478 ret = -ECANCELED;
3479 goto out;
3480 }
3481 /*
3482 * check to see if we have to pause
3483 */
3484 if (atomic_read(&fs_info->scrub_pause_req)) {
3485 /* push queued extents */
2073c4c2 3486 sctx->flush_all_writes = true;
d9d181c1 3487 scrub_submit(sctx);
3fb99303 3488 mutex_lock(&sctx->wr_lock);
ff023aac 3489 scrub_wr_submit(sctx);
3fb99303 3490 mutex_unlock(&sctx->wr_lock);
d9d181c1 3491 wait_event(sctx->list_wait,
b6bfebc1 3492 atomic_read(&sctx->bios_in_flight) == 0);
2073c4c2 3493 sctx->flush_all_writes = false;
3cb0929a 3494 scrub_blocked_if_needed(fs_info);
a2de733c
AJ
3495 }
3496
f2f66a2f
ZL
3497 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
3498 ret = get_raid56_logic_offset(physical, num, map,
3499 &logical,
3500 &stripe_logical);
3501 logical += base;
3502 if (ret) {
7955323b 3503 /* it is a parity stripe */
f2f66a2f 3504 stripe_logical += base;
a0dd59de 3505 stripe_end = stripe_logical + increment;
f2f66a2f
ZL
3506 ret = scrub_raid56_parity(sctx, map, scrub_dev,
3507 ppath, stripe_logical,
3508 stripe_end);
3509 if (ret)
3510 goto out;
3511 goto skip;
3512 }
3513 }
3514
7c76edb7
WS
3515 if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
3516 key.type = BTRFS_METADATA_ITEM_KEY;
3517 else
3518 key.type = BTRFS_EXTENT_ITEM_KEY;
a2de733c 3519 key.objectid = logical;
625f1c8d 3520 key.offset = (u64)-1;
a2de733c
AJ
3521
3522 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3523 if (ret < 0)
3524 goto out;
3173a18f 3525
8c51032f 3526 if (ret > 0) {
ade2e0b3 3527 ret = btrfs_previous_extent_item(root, path, 0);
a2de733c
AJ
3528 if (ret < 0)
3529 goto out;
8c51032f
AJ
3530 if (ret > 0) {
3531 /* there's no smaller item, so stick with the
3532 * larger one */
3533 btrfs_release_path(path);
3534 ret = btrfs_search_slot(NULL, root, &key,
3535 path, 0, 0);
3536 if (ret < 0)
3537 goto out;
3538 }
a2de733c
AJ
3539 }
3540
625f1c8d 3541 stop_loop = 0;
a2de733c 3542 while (1) {
3173a18f
JB
3543 u64 bytes;
3544
a2de733c
AJ
3545 l = path->nodes[0];
3546 slot = path->slots[0];
3547 if (slot >= btrfs_header_nritems(l)) {
3548 ret = btrfs_next_leaf(root, path);
3549 if (ret == 0)
3550 continue;
3551 if (ret < 0)
3552 goto out;
3553
625f1c8d 3554 stop_loop = 1;
a2de733c
AJ
3555 break;
3556 }
3557 btrfs_item_key_to_cpu(l, &key, slot);
3558
d7cad238
ZL
3559 if (key.type != BTRFS_EXTENT_ITEM_KEY &&
3560 key.type != BTRFS_METADATA_ITEM_KEY)
3561 goto next;
3562
3173a18f 3563 if (key.type == BTRFS_METADATA_ITEM_KEY)
0b246afa 3564 bytes = fs_info->nodesize;
3173a18f
JB
3565 else
3566 bytes = key.offset;
3567
3568 if (key.objectid + bytes <= logical)
a2de733c
AJ
3569 goto next;
3570
625f1c8d
LB
3571 if (key.objectid >= logical + map->stripe_len) {
3572 /* out of this device extent */
3573 if (key.objectid >= logic_end)
3574 stop_loop = 1;
3575 break;
3576 }
a2de733c
AJ
3577
3578 extent = btrfs_item_ptr(l, slot,
3579 struct btrfs_extent_item);
3580 flags = btrfs_extent_flags(l, extent);
3581 generation = btrfs_extent_generation(l, extent);
3582
a323e813
ZL
3583 if ((flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) &&
3584 (key.objectid < logical ||
3585 key.objectid + bytes >
3586 logical + map->stripe_len)) {
efe120a0 3587 btrfs_err(fs_info,
5d163e0e 3588 "scrub: tree block %llu spanning stripes, ignored. logical=%llu",
c1c9ff7c 3589 key.objectid, logical);
9799d2c3
ZL
3590 spin_lock(&sctx->stat_lock);
3591 sctx->stat.uncorrectable_errors++;
3592 spin_unlock(&sctx->stat_lock);
a2de733c
AJ
3593 goto next;
3594 }
3595
625f1c8d
LB
3596again:
3597 extent_logical = key.objectid;
3598 extent_len = bytes;
3599
a2de733c
AJ
3600 /*
3601 * trim extent to this stripe
3602 */
625f1c8d
LB
3603 if (extent_logical < logical) {
3604 extent_len -= logical - extent_logical;
3605 extent_logical = logical;
a2de733c 3606 }
625f1c8d 3607 if (extent_logical + extent_len >
a2de733c 3608 logical + map->stripe_len) {
625f1c8d
LB
3609 extent_len = logical + map->stripe_len -
3610 extent_logical;
a2de733c
AJ
3611 }
3612
625f1c8d 3613 extent_physical = extent_logical - logical + physical;
ff023aac
SB
3614 extent_dev = scrub_dev;
3615 extent_mirror_num = mirror_num;
f9257db0 3616 if (sctx->is_dev_replace)
ff023aac
SB
3617 scrub_remap_extent(fs_info, extent_logical,
3618 extent_len, &extent_physical,
3619 &extent_dev,
3620 &extent_mirror_num);
625f1c8d 3621
fe8cf654
ZL
3622 ret = btrfs_lookup_csums_range(csum_root,
3623 extent_logical,
3624 extent_logical +
3625 extent_len - 1,
3626 &sctx->csum_list, 1);
625f1c8d
LB
3627 if (ret)
3628 goto out;
3629
ff023aac
SB
3630 ret = scrub_extent(sctx, extent_logical, extent_len,
3631 extent_physical, extent_dev, flags,
3632 generation, extent_mirror_num,
115930cb 3633 extent_logical - logical + physical);
6fa96d72
ZL
3634
3635 scrub_free_csums(sctx);
3636
a2de733c
AJ
3637 if (ret)
3638 goto out;
3639
625f1c8d
LB
3640 if (extent_logical + extent_len <
3641 key.objectid + bytes) {
ffe2d203 3642 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
3b080b25
WS
3643 /*
3644 * loop until we find next data stripe
3645 * or we have finished all stripes.
3646 */
5a6ac9ea
MX
3647loop:
3648 physical += map->stripe_len;
3649 ret = get_raid56_logic_offset(physical,
3650 num, map, &logical,
3651 &stripe_logical);
3652 logical += base;
3653
3654 if (ret && physical < physical_end) {
3655 stripe_logical += base;
3656 stripe_end = stripe_logical +
a0dd59de 3657 increment;
5a6ac9ea
MX
3658 ret = scrub_raid56_parity(sctx,
3659 map, scrub_dev, ppath,
3660 stripe_logical,
3661 stripe_end);
3662 if (ret)
3663 goto out;
3664 goto loop;
3665 }
3b080b25
WS
3666 } else {
3667 physical += map->stripe_len;
3668 logical += increment;
3669 }
625f1c8d
LB
3670 if (logical < key.objectid + bytes) {
3671 cond_resched();
3672 goto again;
3673 }
3674
3b080b25 3675 if (physical >= physical_end) {
625f1c8d
LB
3676 stop_loop = 1;
3677 break;
3678 }
3679 }
a2de733c
AJ
3680next:
3681 path->slots[0]++;
3682 }
71267333 3683 btrfs_release_path(path);
3b080b25 3684skip:
a2de733c
AJ
3685 logical += increment;
3686 physical += map->stripe_len;
d9d181c1 3687 spin_lock(&sctx->stat_lock);
625f1c8d
LB
3688 if (stop_loop)
3689 sctx->stat.last_physical = map->stripes[num].physical +
3690 length;
3691 else
3692 sctx->stat.last_physical = physical;
d9d181c1 3693 spin_unlock(&sctx->stat_lock);
625f1c8d
LB
3694 if (stop_loop)
3695 break;
a2de733c 3696 }
ff023aac 3697out:
a2de733c 3698 /* push queued extents */
d9d181c1 3699 scrub_submit(sctx);
3fb99303 3700 mutex_lock(&sctx->wr_lock);
ff023aac 3701 scrub_wr_submit(sctx);
3fb99303 3702 mutex_unlock(&sctx->wr_lock);
a2de733c 3703
e7786c3a 3704 blk_finish_plug(&plug);
a2de733c 3705 btrfs_free_path(path);
5a6ac9ea 3706 btrfs_free_path(ppath);
a2de733c
AJ
3707 return ret < 0 ? ret : 0;
3708}
3709
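/*
 * A standalone userspace sketch of the per-profile mapping computed at the
 * top of scrub_stripe() above: given the stripe index "num" of the device
 * being scrubbed, derive the offset of its first stripe inside the chunk,
 * the increment between consecutive stripes it owns, and the mirror number.
 * The struct only mirrors the three map_lookup fields that the mapping uses;
 * the names and demo values are illustrative.
 */
#include <stdint.h>
#include <stdio.h>

struct stripe_map {
	uint64_t stripe_len;		/* BTRFS_STRIPE_LEN is 64K */
	int num_stripes;
	int sub_stripes;		/* 2 for RAID10, 1 otherwise */
};

/* RAID0: device "num" owns every num_stripes-th stripe of the chunk */
static void map_raid0(const struct stripe_map *m, int num,
		      uint64_t *offset, uint64_t *increment, int *mirror_num)
{
	*offset = m->stripe_len * num;
	*increment = m->stripe_len * m->num_stripes;
	*mirror_num = 1;
}

/* RAID10: devices form mirror groups of sub_stripes copies each */
static void map_raid10(const struct stripe_map *m, int num,
		       uint64_t *offset, uint64_t *increment, int *mirror_num)
{
	int factor = m->num_stripes / m->sub_stripes;

	*offset = m->stripe_len * (num / m->sub_stripes);
	*increment = m->stripe_len * factor;
	*mirror_num = num % m->sub_stripes + 1;
}

int main(void)
{
	struct stripe_map m = { 65536, 4, 2 };	/* 4-device RAID10 */
	uint64_t offset, increment;
	int mirror_num;

	map_raid10(&m, 3, &offset, &increment, &mirror_num);
	/* prints: offset=65536 increment=131072 mirror=2 */
	printf("offset=%llu increment=%llu mirror=%d\n",
	       (unsigned long long)offset, (unsigned long long)increment,
	       mirror_num);
	return 0;
}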
d9d181c1 3710static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,
a36cf8b8 3711 struct btrfs_device *scrub_dev,
a36cf8b8 3712 u64 chunk_offset, u64 length,
020d5b73 3713 u64 dev_offset,
f9257db0 3714 struct btrfs_block_group_cache *cache)
a2de733c 3715{
fb456252
JM
3716 struct btrfs_fs_info *fs_info = sctx->fs_info;
3717 struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
a2de733c
AJ
3718 struct map_lookup *map;
3719 struct extent_map *em;
3720 int i;
ff023aac 3721 int ret = 0;
a2de733c
AJ
3722
3723 read_lock(&map_tree->map_tree.lock);
3724 em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1);
3725 read_unlock(&map_tree->map_tree.lock);
3726
020d5b73
FM
3727 if (!em) {
3728 /*
3729 * Might have been an unused block group deleted by the cleaner
3730 * kthread or relocation.
3731 */
3732 spin_lock(&cache->lock);
3733 if (!cache->removed)
3734 ret = -EINVAL;
3735 spin_unlock(&cache->lock);
3736
3737 return ret;
3738 }
a2de733c 3739
95617d69 3740 map = em->map_lookup;
a2de733c
AJ
3741 if (em->start != chunk_offset)
3742 goto out;
3743
3744 if (em->len < length)
3745 goto out;
3746
3747 for (i = 0; i < map->num_stripes; ++i) {
a36cf8b8 3748 if (map->stripes[i].dev->bdev == scrub_dev->bdev &&
859acaf1 3749 map->stripes[i].physical == dev_offset) {
a36cf8b8 3750 ret = scrub_stripe(sctx, map, scrub_dev, i,
f9257db0 3751 chunk_offset, length);
a2de733c
AJ
3752 if (ret)
3753 goto out;
3754 }
3755 }
3756out:
3757 free_extent_map(em);
3758
3759 return ret;
3760}
3761
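/*
 * A condensed sketch of the stripe selection scrub_chunk() performs above: a
 * device extent is tied back to its chunk by finding the stripe whose device
 * and physical start match the dev extent, and only that copy is scrubbed.
 * find_stripe_index() is a hypothetical name; the fields are the ones used in
 * the loop above.
 */
static int find_stripe_index(struct map_lookup *map,
			     struct btrfs_device *scrub_dev, u64 dev_offset)
{
	int i;

	for (i = 0; i < map->num_stripes; i++) {
		if (map->stripes[i].dev->bdev == scrub_dev->bdev &&
		    map->stripes[i].physical == dev_offset)
			return i;	/* the copy that lives on scrub_dev */
	}
	return -1;	/* no stripe of this chunk is on the device */
}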
3762static noinline_for_stack
a36cf8b8 3763int scrub_enumerate_chunks(struct scrub_ctx *sctx,
f9257db0 3764 struct btrfs_device *scrub_dev, u64 start, u64 end)
a2de733c
AJ
3765{
3766 struct btrfs_dev_extent *dev_extent = NULL;
3767 struct btrfs_path *path;
0b246afa
JM
3768 struct btrfs_fs_info *fs_info = sctx->fs_info;
3769 struct btrfs_root *root = fs_info->dev_root;
a2de733c 3770 u64 length;
a2de733c 3771 u64 chunk_offset;
55e3a601 3772 int ret = 0;
76a8efa1 3773 int ro_set;
a2de733c
AJ
3774 int slot;
3775 struct extent_buffer *l;
3776 struct btrfs_key key;
3777 struct btrfs_key found_key;
3778 struct btrfs_block_group_cache *cache;
ff023aac 3779 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
a2de733c
AJ
3780
3781 path = btrfs_alloc_path();
3782 if (!path)
3783 return -ENOMEM;
3784
e4058b54 3785 path->reada = READA_FORWARD;
a2de733c
AJ
3786 path->search_commit_root = 1;
3787 path->skip_locking = 1;
3788
a36cf8b8 3789 key.objectid = scrub_dev->devid;
a2de733c
AJ
3790 key.offset = 0ull;
3791 key.type = BTRFS_DEV_EXTENT_KEY;
3792
a2de733c
AJ
3793 while (1) {
3794 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3795 if (ret < 0)
8c51032f
AJ
3796 break;
3797 if (ret > 0) {
3798 if (path->slots[0] >=
3799 btrfs_header_nritems(path->nodes[0])) {
3800 ret = btrfs_next_leaf(root, path);
55e3a601
Z
3801 if (ret < 0)
3802 break;
3803 if (ret > 0) {
3804 ret = 0;
8c51032f 3805 break;
55e3a601
Z
3806 }
3807 } else {
3808 ret = 0;
8c51032f
AJ
3809 }
3810 }
a2de733c
AJ
3811
3812 l = path->nodes[0];
3813 slot = path->slots[0];
3814
3815 btrfs_item_key_to_cpu(l, &found_key, slot);
3816
a36cf8b8 3817 if (found_key.objectid != scrub_dev->devid)
a2de733c
AJ
3818 break;
3819
962a298f 3820 if (found_key.type != BTRFS_DEV_EXTENT_KEY)
a2de733c
AJ
3821 break;
3822
3823 if (found_key.offset >= end)
3824 break;
3825
3826 if (found_key.offset < key.offset)
3827 break;
3828
3829 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
3830 length = btrfs_dev_extent_length(l, dev_extent);
3831
ced96edc
QW
3832 if (found_key.offset + length <= start)
3833 goto skip;
a2de733c 3834
a2de733c
AJ
3835 chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
3836
3837 /*
3838 * get a reference on the corresponding block group to prevent
3839 * the chunk from going away while we scrub it
3840 */
3841 cache = btrfs_lookup_block_group(fs_info, chunk_offset);
ced96edc
QW
3842
3843 /* some chunks are removed but not committed to disk yet,
3844 * continue scrubbing */
3845 if (!cache)
3846 goto skip;
3847
55e3a601
Z
3848 /*
3849 * we need to call btrfs_inc_block_group_ro() with scrubs_paused,
3850 * to avoid deadlock caused by:
3851 * btrfs_inc_block_group_ro()
3852 * -> btrfs_wait_for_commit()
3853 * -> btrfs_commit_transaction()
3854 * -> btrfs_scrub_pause()
3855 */
3856 scrub_pause_on(fs_info);
5e00f193 3857 ret = btrfs_inc_block_group_ro(fs_info, cache);
f9257db0 3858 if (!ret && sctx->is_dev_replace) {
f0e9b7d6
FM
3859 /*
3860 * If we are doing a device replace wait for any tasks
3861 * that started delalloc right before we set the block
3862 * group to RO mode, as they might have just allocated
3863 * an extent from it or decided they could do a nocow
3864 * write. And if any such tasks did that, wait for their
3865 * ordered extents to complete and then commit the
3866 * current transaction, so that we can later see the new
3867 * extent items in the extent tree - the ordered extents
3868 * create delayed data references (for cow writes) when
3869 * they complete, which will be run and insert the
3870 * corresponding extent items into the extent tree when
3871 * we commit the transaction they used when running
3872 * inode.c:btrfs_finish_ordered_io(). We later use
3873 * the commit root of the extent tree to find extents
3874 * to copy from the srcdev into the tgtdev, and we don't
3875 * want to miss any new extents.
3876 */
3877 btrfs_wait_block_group_reservations(cache);
3878 btrfs_wait_nocow_writers(cache);
6374e57a 3879 ret = btrfs_wait_ordered_roots(fs_info, U64_MAX,
f0e9b7d6
FM
3880 cache->key.objectid,
3881 cache->key.offset);
3882 if (ret > 0) {
3883 struct btrfs_trans_handle *trans;
3884
3885 trans = btrfs_join_transaction(root);
3886 if (IS_ERR(trans))
3887 ret = PTR_ERR(trans);
3888 else
3a45bb20 3889 ret = btrfs_commit_transaction(trans);
f0e9b7d6
FM
3890 if (ret) {
3891 scrub_pause_off(fs_info);
3892 btrfs_put_block_group(cache);
3893 break;
3894 }
3895 }
3896 }
55e3a601 3897 scrub_pause_off(fs_info);
76a8efa1
Z
3898
3899 if (ret == 0) {
3900 ro_set = 1;
3901 } else if (ret == -ENOSPC) {
3902 /*
3903 * btrfs_inc_block_group_ro returns -ENOSPC when it
3904 * fails to create a new chunk for metadata.
3905 * It is not a problem for scrub/replace, because
3906 * metadata is always cowed, and our scrub pauses
3907 * transaction commits.
3908 */
3909 ro_set = 0;
3910 } else {
5d163e0e 3911 btrfs_warn(fs_info,
913e1535 3912 "failed setting block group ro: %d", ret);
55e3a601
Z
3913 btrfs_put_block_group(cache);
3914 break;
3915 }
3916
81e87a73 3917 btrfs_dev_replace_lock(&fs_info->dev_replace, 1);
ff023aac
SB
3918 dev_replace->cursor_right = found_key.offset + length;
3919 dev_replace->cursor_left = found_key.offset;
3920 dev_replace->item_needs_writeback = 1;
81e87a73 3921 btrfs_dev_replace_unlock(&fs_info->dev_replace, 1);
8c204c96 3922 ret = scrub_chunk(sctx, scrub_dev, chunk_offset, length,
f9257db0 3923 found_key.offset, cache);
ff023aac
SB
3924
3925 /*
3926 * flush, submit all pending read and write bios, afterwards
3927 * wait for them.
3928 * Note that in the dev replace case, a read request causes
3929 * write requests that are submitted in the read completion
3930 * worker. Therefore in the current situation, it is required
3931 * that all write requests are flushed, so that all read and
3932 * write requests are really completed when bios_in_flight
3933 * changes to 0.
3934 */
2073c4c2 3935 sctx->flush_all_writes = true;
ff023aac 3936 scrub_submit(sctx);
3fb99303 3937 mutex_lock(&sctx->wr_lock);
ff023aac 3938 scrub_wr_submit(sctx);
3fb99303 3939 mutex_unlock(&sctx->wr_lock);
ff023aac
SB
3940
3941 wait_event(sctx->list_wait,
3942 atomic_read(&sctx->bios_in_flight) == 0);
b708ce96
Z
3943
3944 scrub_pause_on(fs_info);
12cf9372
WS
3945
3946 /*
3947 * must be called before we decrease @scrub_paused.
3948 * make sure we don't block transaction commit while
3949 * we are waiting for pending workers to finish.
3950 */
ff023aac
SB
3951 wait_event(sctx->list_wait,
3952 atomic_read(&sctx->workers_pending) == 0);
2073c4c2 3953 sctx->flush_all_writes = false;
12cf9372 3954
b708ce96 3955 scrub_pause_off(fs_info);
ff023aac 3956
1a1a8b73
FM
3957 btrfs_dev_replace_lock(&fs_info->dev_replace, 1);
3958 dev_replace->cursor_left = dev_replace->cursor_right;
3959 dev_replace->item_needs_writeback = 1;
3960 btrfs_dev_replace_unlock(&fs_info->dev_replace, 1);
3961
76a8efa1 3962 if (ro_set)
2ff7e61e 3963 btrfs_dec_block_group_ro(cache);
ff023aac 3964
758f2dfc
FM
3965 /*
3966 * We might have prevented the cleaner kthread from deleting
3967 * this block group if it was already unused because we raced
3968 * and set it to RO mode first. So add it back to the unused
3969 * list, otherwise it might not ever be deleted unless a manual
3970 * balance is triggered or it becomes used and unused again.
3971 */
3972 spin_lock(&cache->lock);
3973 if (!cache->removed && !cache->ro && cache->reserved == 0 &&
3974 btrfs_block_group_used(&cache->item) == 0) {
3975 spin_unlock(&cache->lock);
3976 spin_lock(&fs_info->unused_bgs_lock);
3977 if (list_empty(&cache->bg_list)) {
3978 btrfs_get_block_group(cache);
3979 list_add_tail(&cache->bg_list,
3980 &fs_info->unused_bgs);
3981 }
3982 spin_unlock(&fs_info->unused_bgs_lock);
3983 } else {
3984 spin_unlock(&cache->lock);
3985 }
3986
a2de733c
AJ
3987 btrfs_put_block_group(cache);
3988 if (ret)
3989 break;
f9257db0 3990 if (sctx->is_dev_replace &&
af1be4f8 3991 atomic64_read(&dev_replace->num_write_errors) > 0) {
ff023aac
SB
3992 ret = -EIO;
3993 break;
3994 }
3995 if (sctx->stat.malloc_errors > 0) {
3996 ret = -ENOMEM;
3997 break;
3998 }
ced96edc 3999skip:
a2de733c 4000 key.offset = found_key.offset + length;
71267333 4001 btrfs_release_path(path);
a2de733c
AJ
4002 }
4003
a2de733c 4004 btrfs_free_path(path);
8c51032f 4005
55e3a601 4006 return ret;
a2de733c
AJ
4007}
4008
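/*
 * A condensed sketch (not actual kernel code) of the dev-replace quiesce
 * sequence performed above after a block group is set read-only: wait for
 * reservations and nocow writers, flush ordered extents, and commit if any
 * completed so that their extent items reach the commit root scrub reads.
 * The helper name is hypothetical; the callees are the ones used above.
 */
static int scrub_quiesce_block_group(struct btrfs_fs_info *fs_info,
				     struct btrfs_root *root,
				     struct btrfs_block_group_cache *cache)
{
	struct btrfs_trans_handle *trans;
	int ret;

	/* tasks that reserved space or chose nocow before we set RO */
	btrfs_wait_block_group_reservations(cache);
	btrfs_wait_nocow_writers(cache);

	/* > 0 means some ordered extents completed in this range */
	ret = btrfs_wait_ordered_roots(fs_info, U64_MAX,
				       cache->key.objectid, cache->key.offset);
	if (ret <= 0)
		return ret;

	/* commit so the new extent items land in the commit root */
	trans = btrfs_join_transaction(root);
	if (IS_ERR(trans))
		return PTR_ERR(trans);
	return btrfs_commit_transaction(trans);
}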
a36cf8b8
SB
4009static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,
4010 struct btrfs_device *scrub_dev)
a2de733c
AJ
4011{
4012 int i;
4013 u64 bytenr;
4014 u64 gen;
4015 int ret;
0b246afa 4016 struct btrfs_fs_info *fs_info = sctx->fs_info;
a2de733c 4017
0b246afa 4018 if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
79787eaa
JM
4019 return -EIO;
4020
5f546063 4021 /* Seed devices of a new filesystem have their own generation. */
0b246afa 4022 if (scrub_dev->fs_devices != fs_info->fs_devices)
5f546063
MX
4023 gen = scrub_dev->generation;
4024 else
0b246afa 4025 gen = fs_info->last_trans_committed;
a2de733c
AJ
4026
4027 for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
4028 bytenr = btrfs_sb_offset(i);
935e5cc9
MX
4029 if (bytenr + BTRFS_SUPER_INFO_SIZE >
4030 scrub_dev->commit_total_bytes)
a2de733c
AJ
4031 break;
4032
d9d181c1 4033 ret = scrub_pages(sctx, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr,
a36cf8b8 4034 scrub_dev, BTRFS_EXTENT_FLAG_SUPER, gen, i,
ff023aac 4035 NULL, 1, bytenr);
a2de733c
AJ
4036 if (ret)
4037 return ret;
4038 }
b6bfebc1 4039 wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
a2de733c
AJ
4040
4041 return 0;
4042}
4043
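/*
 * A minimal userspace sketch of the loop in scrub_supers() above: btrfs keeps
 * up to BTRFS_SUPER_MIRROR_MAX (3) superblock copies, at the well-known
 * offsets 64KiB, 64MiB and 256GiB, and each 4KiB copy is only scrubbed if it
 * fits entirely within the committed device size. The macro names and the
 * demo device size below are illustrative.
 */
#include <stdint.h>
#include <stdio.h>

#define SB_COPIES	3
#define SB_SIZE		4096ULL		/* BTRFS_SUPER_INFO_SIZE */

static const uint64_t sb_offset[SB_COPIES] = {
	64ULL << 10,	/* 64 KiB - primary copy */
	64ULL << 20,	/* 64 MiB - first mirror */
	256ULL << 30,	/* 256 GiB - second mirror */
};

int main(void)
{
	uint64_t dev_size = 100ULL << 30;	/* a 100 GiB device */
	int i;

	for (i = 0; i < SB_COPIES; i++) {
		if (sb_offset[i] + SB_SIZE > dev_size)
			break;	/* same cut-off as scrub_supers() */
		printf("would scrub super copy %d at offset %llu\n",
		       i, (unsigned long long)sb_offset[i]);
	}
	return 0;
}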
4044/*
4045 * get a reference count on fs_info->scrub_workers. start worker if necessary
4046 */
ff023aac
SB
4047static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
4048 int is_dev_replace)
a2de733c 4049{
6f011058 4050 unsigned int flags = WQ_FREEZABLE | WQ_UNBOUND;
0339ef2f 4051 int max_active = fs_info->thread_pool_size;
a2de733c 4052
632dd772 4053 if (fs_info->scrub_workers_refcnt == 0) {
af1cbe0a
DS
4054 fs_info->scrub_workers = btrfs_alloc_workqueue(fs_info, "scrub",
4055 flags, is_dev_replace ? 1 : max_active, 4);
e82afc52
ZL
4056 if (!fs_info->scrub_workers)
4057 goto fail_scrub_workers;
4058
0339ef2f 4059 fs_info->scrub_wr_completion_workers =
cb001095 4060 btrfs_alloc_workqueue(fs_info, "scrubwrc", flags,
0339ef2f 4061 max_active, 2);
e82afc52
ZL
4062 if (!fs_info->scrub_wr_completion_workers)
4063 goto fail_scrub_wr_completion_workers;
4064
0339ef2f 4065 fs_info->scrub_nocow_workers =
cb001095 4066 btrfs_alloc_workqueue(fs_info, "scrubnc", flags, 1, 0);
e82afc52
ZL
4067 if (!fs_info->scrub_nocow_workers)
4068 goto fail_scrub_nocow_workers;
20b2e302 4069 fs_info->scrub_parity_workers =
cb001095 4070 btrfs_alloc_workqueue(fs_info, "scrubparity", flags,
20b2e302 4071 max_active, 2);
e82afc52
ZL
4072 if (!fs_info->scrub_parity_workers)
4073 goto fail_scrub_parity_workers;
632dd772 4074 }
a2de733c 4075 ++fs_info->scrub_workers_refcnt;
e82afc52
ZL
4076 return 0;
4077
4078fail_scrub_parity_workers:
4079 btrfs_destroy_workqueue(fs_info->scrub_nocow_workers);
4080fail_scrub_nocow_workers:
4081 btrfs_destroy_workqueue(fs_info->scrub_wr_completion_workers);
4082fail_scrub_wr_completion_workers:
4083 btrfs_destroy_workqueue(fs_info->scrub_workers);
4084fail_scrub_workers:
4085 return -ENOMEM;
a2de733c
AJ
4086}
4087
aa1b8cd4 4088static noinline_for_stack void scrub_workers_put(struct btrfs_fs_info *fs_info)
a2de733c 4089{
ff023aac 4090 if (--fs_info->scrub_workers_refcnt == 0) {
0339ef2f
QW
4091 btrfs_destroy_workqueue(fs_info->scrub_workers);
4092 btrfs_destroy_workqueue(fs_info->scrub_wr_completion_workers);
4093 btrfs_destroy_workqueue(fs_info->scrub_nocow_workers);
20b2e302 4094 btrfs_destroy_workqueue(fs_info->scrub_parity_workers);
ff023aac 4095 }
a2de733c 4096 WARN_ON(fs_info->scrub_workers_refcnt < 0);
a2de733c
AJ
4097}
4098
aa1b8cd4
SB
4099int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
4100 u64 end, struct btrfs_scrub_progress *progress,
63a212ab 4101 int readonly, int is_dev_replace)
a2de733c 4102{
d9d181c1 4103 struct scrub_ctx *sctx;
a2de733c
AJ
4104 int ret;
4105 struct btrfs_device *dev;
5d68da3b 4106 struct rcu_string *name;
225cce41 4107 unsigned int nofs_flag;
a2de733c 4108
aa1b8cd4 4109 if (btrfs_fs_closing(fs_info))
a2de733c
AJ
4110 return -EINVAL;
4111
da17066c 4112 if (fs_info->nodesize > BTRFS_STRIPE_LEN) {
b5d67f64
SB
4113 /*
4114 * in this case scrub is unable to calculate the checksum
4115 * the way scrub is implemented. Do not handle this
4116 * situation at all because it won't ever happen.
4117 */
efe120a0
FH
4118 btrfs_err(fs_info,
4119 "scrub: size assumption nodesize <= BTRFS_STRIPE_LEN (%d <= %d) fails",
da17066c
JM
4120 fs_info->nodesize,
4121 BTRFS_STRIPE_LEN);
b5d67f64
SB
4122 return -EINVAL;
4123 }
4124
da17066c 4125 if (fs_info->sectorsize != PAGE_SIZE) {
b5d67f64 4126 /* not supported for data w/o checksums */
751bebbe 4127 btrfs_err_rl(fs_info,
5d163e0e 4128 "scrub: size assumption sectorsize != PAGE_SIZE (%d != %lu) fails",
da17066c 4129 fs_info->sectorsize, PAGE_SIZE);
a2de733c
AJ
4130 return -EINVAL;
4131 }
4132
da17066c 4133 if (fs_info->nodesize >
7a9e9987 4134 PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK ||
da17066c 4135 fs_info->sectorsize > PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK) {
7a9e9987
SB
4136 /*
4137 * would exhaust the array bounds of pagev member in
4138 * struct scrub_block
4139 */
5d163e0e
JM
4140 btrfs_err(fs_info,
4141 "scrub: size assumption nodesize and sectorsize <= SCRUB_MAX_PAGES_PER_BLOCK (%d <= %d && %d <= %d) fails",
da17066c 4142 fs_info->nodesize,
7a9e9987 4143 SCRUB_MAX_PAGES_PER_BLOCK,
da17066c 4144 fs_info->sectorsize,
7a9e9987
SB
4145 SCRUB_MAX_PAGES_PER_BLOCK);
4146 return -EINVAL;
4147 }
4148
a2de733c 4149
aa1b8cd4 4150 mutex_lock(&fs_info->fs_devices->device_list_mutex);
34857e15 4151 dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL, true);
63a212ab 4152 if (!dev || (dev->missing && !is_dev_replace)) {
aa1b8cd4 4153 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
a2de733c
AJ
4154 return -ENODEV;
4155 }
a2de733c 4156
5d68da3b
MX
4157 if (!is_dev_replace && !readonly && !dev->writeable) {
4158 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4159 rcu_read_lock();
4160 name = rcu_dereference(dev->name);
4161 btrfs_err(fs_info, "scrub: device %s is not writable",
4162 name->str);
4163 rcu_read_unlock();
4164 return -EROFS;
4165 }
4166
3b7a016f 4167 mutex_lock(&fs_info->scrub_lock);
63a212ab 4168 if (!dev->in_fs_metadata || dev->is_tgtdev_for_dev_replace) {
a2de733c 4169 mutex_unlock(&fs_info->scrub_lock);
aa1b8cd4 4170 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
aa1b8cd4 4171 return -EIO;
a2de733c
AJ
4172 }
4173
73beece9 4174 btrfs_dev_replace_lock(&fs_info->dev_replace, 0);
8dabb742
SB
4175 if (dev->scrub_device ||
4176 (!is_dev_replace &&
4177 btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))) {
73beece9 4178 btrfs_dev_replace_unlock(&fs_info->dev_replace, 0);
a2de733c 4179 mutex_unlock(&fs_info->scrub_lock);
aa1b8cd4 4180 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
a2de733c
AJ
4181 return -EINPROGRESS;
4182 }
73beece9 4183 btrfs_dev_replace_unlock(&fs_info->dev_replace, 0);
3b7a016f
WS
4184
4185 ret = scrub_workers_get(fs_info, is_dev_replace);
4186 if (ret) {
4187 mutex_unlock(&fs_info->scrub_lock);
4188 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4189 return ret;
4190 }
4191
0af6b5c4 4192 sctx = scrub_setup_ctx(fs_info, is_dev_replace);
d9d181c1 4193 if (IS_ERR(sctx)) {
a2de733c 4194 mutex_unlock(&fs_info->scrub_lock);
aa1b8cd4
SB
4195 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4196 scrub_workers_put(fs_info);
d9d181c1 4197 return PTR_ERR(sctx);
a2de733c 4198 }
d9d181c1
SB
4199 sctx->readonly = readonly;
4200 dev->scrub_device = sctx;
3cb0929a 4201 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
a2de733c 4202
3cb0929a
WS
4203 /*
4204 * by checking @scrub_pause_req here, we can avoid
4205 * the race between committing a transaction and scrubbing.
4206 */
cb7ab021 4207 __scrub_blocked_if_needed(fs_info);
a2de733c
AJ
4208 atomic_inc(&fs_info->scrubs_running);
4209 mutex_unlock(&fs_info->scrub_lock);
a2de733c 4210
225cce41
FM
4211 /*
4212 * In order to avoid deadlock with reclaim when there is a transaction
4213 * trying to pause scrub, make sure we use GFP_NOFS for all the
4214 * allocations done at btrfs_scrub_pages() and scrub_pages_for_parity()
4215 * invoked by our callees. The pausing request is done when the
4216 * transaction commit starts, and it blocks the transaction until scrub
4217 * is paused (done at specific points at scrub_stripe() or right above
4218 * before incrementing fs_info->scrubs_running).
4219 */
4220 nofs_flag = memalloc_nofs_save();
ff023aac 4221 if (!is_dev_replace) {
9b011adf
WS
4222 /*
4223 * by holding device list mutex, we can
4224 * kick off writing super in log tree sync.
4225 */
3cb0929a 4226 mutex_lock(&fs_info->fs_devices->device_list_mutex);
ff023aac 4227 ret = scrub_supers(sctx, dev);
3cb0929a 4228 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
ff023aac 4229 }
a2de733c
AJ
4230
4231 if (!ret)
f9257db0 4232 ret = scrub_enumerate_chunks(sctx, dev, start, end);
225cce41 4233 memalloc_nofs_restore(nofs_flag);
a2de733c 4234
b6bfebc1 4235 wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
a2de733c
AJ
4236 atomic_dec(&fs_info->scrubs_running);
4237 wake_up(&fs_info->scrub_pause_wait);
4238
b6bfebc1 4239 wait_event(sctx->list_wait, atomic_read(&sctx->workers_pending) == 0);
0ef8e451 4240
a2de733c 4241 if (progress)
d9d181c1 4242 memcpy(progress, &sctx->stat, sizeof(*progress));
a2de733c
AJ
4243
4244 mutex_lock(&fs_info->scrub_lock);
4245 dev->scrub_device = NULL;
3b7a016f 4246 scrub_workers_put(fs_info);
a2de733c
AJ
4247 mutex_unlock(&fs_info->scrub_lock);
4248
f55985f4 4249 scrub_put_ctx(sctx);
a2de733c
AJ
4250
4251 return ret;
4252}
4253
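/*
 * A minimal sketch of the reclaim-deadlock avoidance used in btrfs_scrub_dev()
 * above: the allocation-heavy part of the scrub is bracketed with
 * memalloc_nofs_save()/memalloc_nofs_restore(), so every allocation in between
 * implicitly behaves as GFP_NOFS and cannot recurse into the filesystem while
 * a transaction commit is waiting for scrub to pause. The wrapper name is
 * hypothetical.
 */
static int scrub_run_nofs(struct scrub_ctx *sctx, struct btrfs_device *dev,
			  u64 start, u64 end)
{
	unsigned int nofs_flag;
	int ret;

	nofs_flag = memalloc_nofs_save();
	/* allocations done below will not trigger __GFP_FS reclaim */
	ret = scrub_enumerate_chunks(sctx, dev, start, end);
	memalloc_nofs_restore(nofs_flag);

	return ret;
}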
2ff7e61e 4254void btrfs_scrub_pause(struct btrfs_fs_info *fs_info)
a2de733c 4255{
a2de733c
AJ
4256 mutex_lock(&fs_info->scrub_lock);
4257 atomic_inc(&fs_info->scrub_pause_req);
4258 while (atomic_read(&fs_info->scrubs_paused) !=
4259 atomic_read(&fs_info->scrubs_running)) {
4260 mutex_unlock(&fs_info->scrub_lock);
4261 wait_event(fs_info->scrub_pause_wait,
4262 atomic_read(&fs_info->scrubs_paused) ==
4263 atomic_read(&fs_info->scrubs_running));
4264 mutex_lock(&fs_info->scrub_lock);
4265 }
4266 mutex_unlock(&fs_info->scrub_lock);
a2de733c
AJ
4267}
4268
2ff7e61e 4269void btrfs_scrub_continue(struct btrfs_fs_info *fs_info)
a2de733c 4270{
a2de733c
AJ
4271 atomic_dec(&fs_info->scrub_pause_req);
4272 wake_up(&fs_info->scrub_pause_wait);
a2de733c
AJ
4273}
4274
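/*
 * Usage sketch: btrfs_scrub_pause() and btrfs_scrub_continue() above are meant
 * to be paired around code that must not run concurrently with scrub I/O,
 * which in practice is the transaction commit path (see the deadlock chain
 * documented in scrub_enumerate_chunks()). The helper below is illustrative
 * only.
 */
static void with_scrub_paused(struct btrfs_fs_info *fs_info)
{
	btrfs_scrub_pause(fs_info);	/* waits until all scrubs are parked */

	/* ... work that must not overlap with in-flight scrub I/O ... */

	btrfs_scrub_continue(fs_info);	/* lets the parked scrubs resume */
}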
aa1b8cd4 4275int btrfs_scrub_cancel(struct btrfs_fs_info *fs_info)
a2de733c 4276{
a2de733c
AJ
4277 mutex_lock(&fs_info->scrub_lock);
4278 if (!atomic_read(&fs_info->scrubs_running)) {
4279 mutex_unlock(&fs_info->scrub_lock);
4280 return -ENOTCONN;
4281 }
4282
4283 atomic_inc(&fs_info->scrub_cancel_req);
4284 while (atomic_read(&fs_info->scrubs_running)) {
4285 mutex_unlock(&fs_info->scrub_lock);
4286 wait_event(fs_info->scrub_pause_wait,
4287 atomic_read(&fs_info->scrubs_running) == 0);
4288 mutex_lock(&fs_info->scrub_lock);
4289 }
4290 atomic_dec(&fs_info->scrub_cancel_req);
4291 mutex_unlock(&fs_info->scrub_lock);
4292
4293 return 0;
4294}
4295
aa1b8cd4
SB
4296int btrfs_scrub_cancel_dev(struct btrfs_fs_info *fs_info,
4297 struct btrfs_device *dev)
49b25e05 4298{
d9d181c1 4299 struct scrub_ctx *sctx;
a2de733c
AJ
4300
4301 mutex_lock(&fs_info->scrub_lock);
d9d181c1
SB
4302 sctx = dev->scrub_device;
4303 if (!sctx) {
a2de733c
AJ
4304 mutex_unlock(&fs_info->scrub_lock);
4305 return -ENOTCONN;
4306 }
d9d181c1 4307 atomic_inc(&sctx->cancel_req);
a2de733c
AJ
4308 while (dev->scrub_device) {
4309 mutex_unlock(&fs_info->scrub_lock);
4310 wait_event(fs_info->scrub_pause_wait,
4311 dev->scrub_device == NULL);
4312 mutex_lock(&fs_info->scrub_lock);
4313 }
4314 mutex_unlock(&fs_info->scrub_lock);
4315
4316 return 0;
4317}
1623edeb 4318
2ff7e61e 4319int btrfs_scrub_progress(struct btrfs_fs_info *fs_info, u64 devid,
a2de733c
AJ
4320 struct btrfs_scrub_progress *progress)
4321{
4322 struct btrfs_device *dev;
d9d181c1 4323 struct scrub_ctx *sctx = NULL;
a2de733c 4324
0b246afa 4325 mutex_lock(&fs_info->fs_devices->device_list_mutex);
34857e15 4326 dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL, true);
a2de733c 4327 if (dev)
d9d181c1
SB
4328 sctx = dev->scrub_device;
4329 if (sctx)
4330 memcpy(progress, &sctx->stat, sizeof(*progress));
0b246afa 4331 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
a2de733c 4332
d9d181c1 4333 return dev ? (sctx ? 0 : -ENOTCONN) : -ENODEV;
a2de733c 4334}
ff023aac
SB
4335
4336static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
4337 u64 extent_logical, u64 extent_len,
4338 u64 *extent_physical,
4339 struct btrfs_device **extent_dev,
4340 int *extent_mirror_num)
4341{
4342 u64 mapped_length;
4343 struct btrfs_bio *bbio = NULL;
4344 int ret;
4345
4346 mapped_length = extent_len;
cf8cddd3 4347 ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, extent_logical,
ff023aac
SB
4348 &mapped_length, &bbio, 0);
4349 if (ret || !bbio || mapped_length < extent_len ||
4350 !bbio->stripes[0].dev->bdev) {
6e9606d2 4351 btrfs_put_bbio(bbio);
ff023aac
SB
4352 return;
4353 }
4354
4355 *extent_physical = bbio->stripes[0].physical;
4356 *extent_mirror_num = bbio->mirror_num;
4357 *extent_dev = bbio->stripes[0].dev;
6e9606d2 4358 btrfs_put_bbio(bbio);
ff023aac
SB
4359}
4360
ff023aac
SB
4361static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
4362 int mirror_num, u64 physical_for_dev_replace)
4363{
4364 struct scrub_copy_nocow_ctx *nocow_ctx;
fb456252 4365 struct btrfs_fs_info *fs_info = sctx->fs_info;
ff023aac
SB
4366
4367 nocow_ctx = kzalloc(sizeof(*nocow_ctx), GFP_NOFS);
4368 if (!nocow_ctx) {
4369 spin_lock(&sctx->stat_lock);
4370 sctx->stat.malloc_errors++;
4371 spin_unlock(&sctx->stat_lock);
4372 return -ENOMEM;
4373 }
4374
4375 scrub_pending_trans_workers_inc(sctx);
4376
4377 nocow_ctx->sctx = sctx;
4378 nocow_ctx->logical = logical;
4379 nocow_ctx->len = len;
4380 nocow_ctx->mirror_num = mirror_num;
4381 nocow_ctx->physical_for_dev_replace = physical_for_dev_replace;
9e0af237
LB
4382 btrfs_init_work(&nocow_ctx->work, btrfs_scrubnc_helper,
4383 copy_nocow_pages_worker, NULL, NULL);
652f25a2 4384 INIT_LIST_HEAD(&nocow_ctx->inodes);
0339ef2f
QW
4385 btrfs_queue_work(fs_info->scrub_nocow_workers,
4386 &nocow_ctx->work);
ff023aac
SB
4387
4388 return 0;
4389}
4390
652f25a2
JB
4391static int record_inode_for_nocow(u64 inum, u64 offset, u64 root, void *ctx)
4392{
4393 struct scrub_copy_nocow_ctx *nocow_ctx = ctx;
4394 struct scrub_nocow_inode *nocow_inode;
4395
4396 nocow_inode = kzalloc(sizeof(*nocow_inode), GFP_NOFS);
4397 if (!nocow_inode)
4398 return -ENOMEM;
4399 nocow_inode->inum = inum;
4400 nocow_inode->offset = offset;
4401 nocow_inode->root = root;
4402 list_add_tail(&nocow_inode->list, &nocow_ctx->inodes);
4403 return 0;
4404}
4405
4406#define COPY_COMPLETE 1
4407
ff023aac
SB
4408static void copy_nocow_pages_worker(struct btrfs_work *work)
4409{
4410 struct scrub_copy_nocow_ctx *nocow_ctx =
4411 container_of(work, struct scrub_copy_nocow_ctx, work);
4412 struct scrub_ctx *sctx = nocow_ctx->sctx;
0b246afa
JM
4413 struct btrfs_fs_info *fs_info = sctx->fs_info;
4414 struct btrfs_root *root = fs_info->extent_root;
ff023aac
SB
4415 u64 logical = nocow_ctx->logical;
4416 u64 len = nocow_ctx->len;
4417 int mirror_num = nocow_ctx->mirror_num;
4418 u64 physical_for_dev_replace = nocow_ctx->physical_for_dev_replace;
4419 int ret;
4420 struct btrfs_trans_handle *trans = NULL;
ff023aac 4421 struct btrfs_path *path;
ff023aac
SB
4422 int not_written = 0;
4423
ff023aac
SB
4424 path = btrfs_alloc_path();
4425 if (!path) {
4426 spin_lock(&sctx->stat_lock);
4427 sctx->stat.malloc_errors++;
4428 spin_unlock(&sctx->stat_lock);
4429 not_written = 1;
4430 goto out;
4431 }
4432
4433 trans = btrfs_join_transaction(root);
4434 if (IS_ERR(trans)) {
4435 not_written = 1;
4436 goto out;
4437 }
4438
4439 ret = iterate_inodes_from_logical(logical, fs_info, path,
c995ab3c 4440 record_inode_for_nocow, nocow_ctx, false);
ff023aac 4441 if (ret != 0 && ret != -ENOENT) {
5d163e0e
JM
4442 btrfs_warn(fs_info,
4443 "iterate_inodes_from_logical() failed: log %llu, phys %llu, len %llu, mir %u, ret %d",
4444 logical, physical_for_dev_replace, len, mirror_num,
4445 ret);
ff023aac
SB
4446 not_written = 1;
4447 goto out;
4448 }
4449
3a45bb20 4450 btrfs_end_transaction(trans);
652f25a2
JB
4451 trans = NULL;
4452 while (!list_empty(&nocow_ctx->inodes)) {
4453 struct scrub_nocow_inode *entry;
4454 entry = list_first_entry(&nocow_ctx->inodes,
4455 struct scrub_nocow_inode,
4456 list);
4457 list_del_init(&entry->list);
4458 ret = copy_nocow_pages_for_inode(entry->inum, entry->offset,
4459 entry->root, nocow_ctx);
4460 kfree(entry);
4461 if (ret == COPY_COMPLETE) {
4462 ret = 0;
4463 break;
4464 } else if (ret) {
4465 break;
4466 }
4467 }
ff023aac 4468out:
652f25a2
JB
4469 while (!list_empty(&nocow_ctx->inodes)) {
4470 struct scrub_nocow_inode *entry;
4471 entry = list_first_entry(&nocow_ctx->inodes,
4472 struct scrub_nocow_inode,
4473 list);
4474 list_del_init(&entry->list);
4475 kfree(entry);
4476 }
ff023aac 4477 if (trans && !IS_ERR(trans))
3a45bb20 4478 btrfs_end_transaction(trans);
ff023aac
SB
4479 if (not_written)
4480 btrfs_dev_replace_stats_inc(&fs_info->dev_replace.
4481 num_uncorrectable_read_errors);
4482
4483 btrfs_free_path(path);
4484 kfree(nocow_ctx);
4485
4486 scrub_pending_trans_workers_dec(sctx);
4487}
4488
1c8c9c52 4489static int check_extent_to_block(struct btrfs_inode *inode, u64 start, u64 len,
32159242
GH
4490 u64 logical)
4491{
4492 struct extent_state *cached_state = NULL;
4493 struct btrfs_ordered_extent *ordered;
4494 struct extent_io_tree *io_tree;
4495 struct extent_map *em;
4496 u64 lockstart = start, lockend = start + len - 1;
4497 int ret = 0;
4498
1c8c9c52 4499 io_tree = &inode->io_tree;
32159242 4500
ff13db41 4501 lock_extent_bits(io_tree, lockstart, lockend, &cached_state);
1c8c9c52 4502 ordered = btrfs_lookup_ordered_range(inode, lockstart, len);
32159242
GH
4503 if (ordered) {
4504 btrfs_put_ordered_extent(ordered);
4505 ret = 1;
4506 goto out_unlock;
4507 }
4508
4509 em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
4510 if (IS_ERR(em)) {
4511 ret = PTR_ERR(em);
4512 goto out_unlock;
4513 }
4514
4515 /*
4516 * This extent does not actually cover the logical extent anymore,
4517 * move on to the next inode.
4518 */
4519 if (em->block_start > logical ||
4520 em->block_start + em->block_len < logical + len) {
4521 free_extent_map(em);
4522 ret = 1;
4523 goto out_unlock;
4524 }
4525 free_extent_map(em);
4526
4527out_unlock:
4528 unlock_extent_cached(io_tree, lockstart, lockend, &cached_state,
4529 GFP_NOFS);
4530 return ret;
4531}
4532
652f25a2
JB
4533static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root,
4534 struct scrub_copy_nocow_ctx *nocow_ctx)
ff023aac 4535{
fb456252 4536 struct btrfs_fs_info *fs_info = nocow_ctx->sctx->fs_info;
ff023aac 4537 struct btrfs_key key;
826aa0a8
MX
4538 struct inode *inode;
4539 struct page *page;
ff023aac 4540 struct btrfs_root *local_root;
652f25a2 4541 struct extent_io_tree *io_tree;
ff023aac 4542 u64 physical_for_dev_replace;
32159242 4543 u64 nocow_ctx_logical;
652f25a2 4544 u64 len = nocow_ctx->len;
826aa0a8 4545 unsigned long index;
6f1c3605 4546 int srcu_index;
652f25a2
JB
4547 int ret = 0;
4548 int err = 0;
ff023aac
SB
4549
4550 key.objectid = root;
4551 key.type = BTRFS_ROOT_ITEM_KEY;
4552 key.offset = (u64)-1;
6f1c3605
LB
4553
4554 srcu_index = srcu_read_lock(&fs_info->subvol_srcu);
4555
ff023aac 4556 local_root = btrfs_read_fs_root_no_name(fs_info, &key);
6f1c3605
LB
4557 if (IS_ERR(local_root)) {
4558 srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
ff023aac 4559 return PTR_ERR(local_root);
6f1c3605 4560 }
ff023aac
SB
4561
4562 key.type = BTRFS_INODE_ITEM_KEY;
4563 key.objectid = inum;
4564 key.offset = 0;
4565 inode = btrfs_iget(fs_info->sb, &key, local_root, NULL);
6f1c3605 4566 srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
ff023aac
SB
4567 if (IS_ERR(inode))
4568 return PTR_ERR(inode);
4569
edd1400b 4570 /* Avoid truncate/dio/punch hole.. */
5955102c 4571 inode_lock(inode);
edd1400b
MX
4572 inode_dio_wait(inode);
4573
ff023aac 4574 physical_for_dev_replace = nocow_ctx->physical_for_dev_replace;
652f25a2 4575 io_tree = &BTRFS_I(inode)->io_tree;
32159242 4576 nocow_ctx_logical = nocow_ctx->logical;
652f25a2 4577
1c8c9c52
NB
4578 ret = check_extent_to_block(BTRFS_I(inode), offset, len,
4579 nocow_ctx_logical);
32159242
GH
4580 if (ret) {
4581 ret = ret > 0 ? 0 : ret;
4582 goto out;
652f25a2 4583 }
652f25a2 4584
09cbfeaf
KS
4585 while (len >= PAGE_SIZE) {
4586 index = offset >> PAGE_SHIFT;
edd1400b 4587again:
ff023aac
SB
4588 page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
4589 if (!page) {
efe120a0 4590 btrfs_err(fs_info, "find_or_create_page() failed");
ff023aac 4591 ret = -ENOMEM;
826aa0a8 4592 goto out;
ff023aac
SB
4593 }
4594
4595 if (PageUptodate(page)) {
4596 if (PageDirty(page))
4597 goto next_page;
4598 } else {
4599 ClearPageError(page);
32159242 4600 err = extent_read_full_page(io_tree, page,
652f25a2
JB
4601 btrfs_get_extent,
4602 nocow_ctx->mirror_num);
826aa0a8
MX
4603 if (err) {
4604 ret = err;
ff023aac
SB
4605 goto next_page;
4606 }
edd1400b 4607
26b25891 4608 lock_page(page);
edd1400b
MX
4609 /*
4610 * If the page has been removed from the page cache,
4611 * the data on it is meaningless, because it may be
4612 * an old copy; the new data may have been written into a new
4613 * page in the page cache.
4614 */
4615 if (page->mapping != inode->i_mapping) {
652f25a2 4616 unlock_page(page);
09cbfeaf 4617 put_page(page);
edd1400b
MX
4618 goto again;
4619 }
ff023aac
SB
4620 if (!PageUptodate(page)) {
4621 ret = -EIO;
4622 goto next_page;
4623 }
4624 }
32159242 4625
1c8c9c52 4626 ret = check_extent_to_block(BTRFS_I(inode), offset, len,
32159242
GH
4627 nocow_ctx_logical);
4628 if (ret) {
4629 ret = ret > 0 ? 0 : ret;
4630 goto next_page;
4631 }
4632
826aa0a8
MX
4633 err = write_page_nocow(nocow_ctx->sctx,
4634 physical_for_dev_replace, page);
4635 if (err)
4636 ret = err;
ff023aac 4637next_page:
826aa0a8 4638 unlock_page(page);
09cbfeaf 4639 put_page(page);
826aa0a8
MX
4640
4641 if (ret)
4642 break;
4643
09cbfeaf
KS
4644 offset += PAGE_SIZE;
4645 physical_for_dev_replace += PAGE_SIZE;
4646 nocow_ctx_logical += PAGE_SIZE;
4647 len -= PAGE_SIZE;
ff023aac 4648 }
652f25a2 4649 ret = COPY_COMPLETE;
826aa0a8 4650out:
5955102c 4651 inode_unlock(inode);
826aa0a8 4652 iput(inode);
ff023aac
SB
4653 return ret;
4654}
4655
4656static int write_page_nocow(struct scrub_ctx *sctx,
4657 u64 physical_for_dev_replace, struct page *page)
4658{
4659 struct bio *bio;
4660 struct btrfs_device *dev;
4661 int ret;
ff023aac 4662
3fb99303 4663 dev = sctx->wr_tgtdev;
ff023aac
SB
4664 if (!dev)
4665 return -EIO;
4666 if (!dev->bdev) {
fb456252 4667 btrfs_warn_rl(dev->fs_info,
94647322 4668 "scrub write_page_nocow(bdev == NULL) is unexpected");
ff023aac
SB
4669 return -EIO;
4670 }
c5e4c3d7 4671 bio = btrfs_io_bio_alloc(1);
4f024f37
KO
4672 bio->bi_iter.bi_size = 0;
4673 bio->bi_iter.bi_sector = physical_for_dev_replace >> 9;
74d46992 4674 bio_set_dev(bio, dev->bdev);
70fd7614 4675 bio->bi_opf = REQ_OP_WRITE | REQ_SYNC;
09cbfeaf
KS
4676 ret = bio_add_page(bio, page, PAGE_SIZE, 0);
4677 if (ret != PAGE_SIZE) {
ff023aac
SB
4678leave_with_eio:
4679 bio_put(bio);
4680 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
4681 return -EIO;
4682 }
ff023aac 4683
4e49ea4a 4684 if (btrfsic_submit_bio_wait(bio))
ff023aac
SB
4685 goto leave_with_eio;
4686
4687 bio_put(bio);
4688 return 0;
4689}