[mirror_ubuntu-hirsute-kernel.git] / fs / btrfs / scrub.c
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3 * Copyright (C) 2011, 2012 STRATO. All rights reserved.
4 */
5
6 #include <linux/blkdev.h>
7 #include <linux/ratelimit.h>
8 #include <linux/sched/mm.h>
9 #include <crypto/hash.h>
10 #include "ctree.h"
11 #include "discard.h"
12 #include "volumes.h"
13 #include "disk-io.h"
14 #include "ordered-data.h"
15 #include "transaction.h"
16 #include "backref.h"
17 #include "extent_io.h"
18 #include "dev-replace.h"
19 #include "check-integrity.h"
20 #include "rcu-string.h"
21 #include "raid56.h"
22 #include "block-group.h"
23 #include "zoned.h"
24
25 /*
26 * This is only the first step towards a full-featured scrub. It reads all
27 * extents and super blocks and verifies the checksums. In case a bad checksum
28 * is found or the extent cannot be read, good data will be written back if
29 * any can be found.
30 *
31 * Future enhancements:
32 * - In case an unrepairable extent is encountered, track which files are
33 * affected and report them
34 * - track and record media errors, throw out bad devices
35 * - add a mode to also read unallocated space
36 */
37
38 struct scrub_block;
39 struct scrub_ctx;
40
41 /*
42 * The following three values only influence performance.
43 * The last one configures the number of parallel and outstanding I/O
44 * operations. The first two values configure an upper limit for the number
45 * of (dynamically allocated) pages that are added to a bio.
46 */
47 #define SCRUB_PAGES_PER_RD_BIO 32 /* 128k per bio */
48 #define SCRUB_PAGES_PER_WR_BIO 32 /* 128k per bio */
49 #define SCRUB_BIOS_PER_SCTX 64 /* 8MB per device in flight */
50
51 /*
52 * the following value times PAGE_SIZE needs to be large enough to match the
53 * largest node/leaf/sector size that shall be supported.
54 * Values larger than BTRFS_STRIPE_LEN are not supported.
55 */
56 #define SCRUB_MAX_PAGES_PER_BLOCK 16 /* 64k per node/leaf/sector */
57
58 struct scrub_recover {
59 refcount_t refs;
60 struct btrfs_bio *bbio;
61 u64 map_length;
62 };
63
64 struct scrub_page {
65 struct scrub_block *sblock;
66 struct page *page;
67 struct btrfs_device *dev;
68 struct list_head list;
69 u64 flags; /* extent flags */
70 u64 generation;
71 u64 logical;
72 u64 physical;
73 u64 physical_for_dev_replace;
74 atomic_t refs;
75 u8 mirror_num;
76 int have_csum:1;
77 int io_error:1;
78 u8 csum[BTRFS_CSUM_SIZE];
79
80 struct scrub_recover *recover;
81 };
82
83 struct scrub_bio {
84 int index;
85 struct scrub_ctx *sctx;
86 struct btrfs_device *dev;
87 struct bio *bio;
88 blk_status_t status;
89 u64 logical;
90 u64 physical;
91 #if SCRUB_PAGES_PER_WR_BIO >= SCRUB_PAGES_PER_RD_BIO
92 struct scrub_page *pagev[SCRUB_PAGES_PER_WR_BIO];
93 #else
94 struct scrub_page *pagev[SCRUB_PAGES_PER_RD_BIO];
95 #endif
96 int page_count;
97 int next_free;
98 struct btrfs_work work;
99 };
100
101 struct scrub_block {
102 struct scrub_page *pagev[SCRUB_MAX_PAGES_PER_BLOCK];
103 int page_count;
104 atomic_t outstanding_pages;
105 refcount_t refs; /* free mem on transition to zero */
106 struct scrub_ctx *sctx;
107 struct scrub_parity *sparity;
108 struct {
109 unsigned int header_error:1;
110 unsigned int checksum_error:1;
111 unsigned int no_io_error_seen:1;
112 unsigned int generation_error:1; /* also sets header_error */
113
114 /* The following is for the data used to check parity */
115 /* It is for the data with checksum */
116 unsigned int data_corrected:1;
117 };
118 struct btrfs_work work;
119 };
120
121 /* Used for the chunks with a parity stripe, such as RAID5/6 */
122 struct scrub_parity {
123 struct scrub_ctx *sctx;
124
125 struct btrfs_device *scrub_dev;
126
127 u64 logic_start;
128
129 u64 logic_end;
130
131 int nsectors;
132
133 u32 stripe_len;
134
135 refcount_t refs;
136
137 struct list_head spages;
138
139 /* Work of parity check and repair */
140 struct btrfs_work work;
141
142 /* Mark the parity blocks which have data */
143 unsigned long *dbitmap;
144
145 /*
146 * Mark the parity blocks which have data, but errors happened when
147 * reading or checking the data
148 */
149 unsigned long *ebitmap;
150
151 unsigned long bitmap[];
152 };
153
154 struct scrub_ctx {
155 struct scrub_bio *bios[SCRUB_BIOS_PER_SCTX];
156 struct btrfs_fs_info *fs_info;
157 int first_free;
158 int curr;
159 atomic_t bios_in_flight;
160 atomic_t workers_pending;
161 spinlock_t list_lock;
162 wait_queue_head_t list_wait;
163 struct list_head csum_list;
164 atomic_t cancel_req;
165 int readonly;
166 int pages_per_rd_bio;
167
168 int is_dev_replace;
169
170 struct scrub_bio *wr_curr_bio;
171 struct mutex wr_lock;
172 int pages_per_wr_bio; /* <= SCRUB_PAGES_PER_WR_BIO */
173 struct btrfs_device *wr_tgtdev;
174 bool flush_all_writes;
175
176 /*
177 * statistics
178 */
179 struct btrfs_scrub_progress stat;
180 spinlock_t stat_lock;
181
182 /*
183 * Use a ref counter to avoid use-after-free issues. Scrub workers
184 * decrement bios_in_flight and workers_pending and then do a wakeup
185 * on the list_wait wait queue. We must ensure the main scrub task
186 * doesn't free the scrub context before or while the workers are
187 * doing the wakeup() call.
188 */
189 refcount_t refs;
190 };
191
192 struct scrub_warning {
193 struct btrfs_path *path;
194 u64 extent_item_size;
195 const char *errstr;
196 u64 physical;
197 u64 logical;
198 struct btrfs_device *dev;
199 };
200
201 struct full_stripe_lock {
202 struct rb_node node;
203 u64 logical;
204 u64 refs;
205 struct mutex mutex;
206 };
207
208 static void scrub_pending_bio_inc(struct scrub_ctx *sctx);
209 static void scrub_pending_bio_dec(struct scrub_ctx *sctx);
210 static int scrub_handle_errored_block(struct scrub_block *sblock_to_check);
211 static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
212 struct scrub_block *sblocks_for_recheck);
213 static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
214 struct scrub_block *sblock,
215 int retry_failed_mirror);
216 static void scrub_recheck_block_checksum(struct scrub_block *sblock);
217 static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
218 struct scrub_block *sblock_good);
219 static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
220 struct scrub_block *sblock_good,
221 int page_num, int force_write);
222 static void scrub_write_block_to_dev_replace(struct scrub_block *sblock);
223 static int scrub_write_page_to_dev_replace(struct scrub_block *sblock,
224 int page_num);
225 static int scrub_checksum_data(struct scrub_block *sblock);
226 static int scrub_checksum_tree_block(struct scrub_block *sblock);
227 static int scrub_checksum_super(struct scrub_block *sblock);
228 static void scrub_block_get(struct scrub_block *sblock);
229 static void scrub_block_put(struct scrub_block *sblock);
230 static void scrub_page_get(struct scrub_page *spage);
231 static void scrub_page_put(struct scrub_page *spage);
232 static void scrub_parity_get(struct scrub_parity *sparity);
233 static void scrub_parity_put(struct scrub_parity *sparity);
234 static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx,
235 struct scrub_page *spage);
236 static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u32 len,
237 u64 physical, struct btrfs_device *dev, u64 flags,
238 u64 gen, int mirror_num, u8 *csum,
239 u64 physical_for_dev_replace);
240 static void scrub_bio_end_io(struct bio *bio);
241 static void scrub_bio_end_io_worker(struct btrfs_work *work);
242 static void scrub_block_complete(struct scrub_block *sblock);
243 static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
244 u64 extent_logical, u32 extent_len,
245 u64 *extent_physical,
246 struct btrfs_device **extent_dev,
247 int *extent_mirror_num);
248 static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
249 struct scrub_page *spage);
250 static void scrub_wr_submit(struct scrub_ctx *sctx);
251 static void scrub_wr_bio_end_io(struct bio *bio);
252 static void scrub_wr_bio_end_io_worker(struct btrfs_work *work);
253 static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info);
254 static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info);
255 static void scrub_put_ctx(struct scrub_ctx *sctx);
256
257 static inline int scrub_is_page_on_raid56(struct scrub_page *spage)
258 {
259 return spage->recover &&
260 (spage->recover->bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK);
261 }
262
263 static void scrub_pending_bio_inc(struct scrub_ctx *sctx)
264 {
265 refcount_inc(&sctx->refs);
266 atomic_inc(&sctx->bios_in_flight);
267 }
268
269 static void scrub_pending_bio_dec(struct scrub_ctx *sctx)
270 {
271 atomic_dec(&sctx->bios_in_flight);
272 wake_up(&sctx->list_wait);
273 scrub_put_ctx(sctx);
274 }
275
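/*
 * Wait until any pending scrub pause request has been lifted.  Called with
 * fs_info->scrub_lock held; the lock is dropped while waiting and re-taken
 * before returning.
 */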
276 static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
277 {
278 while (atomic_read(&fs_info->scrub_pause_req)) {
279 mutex_unlock(&fs_info->scrub_lock);
280 wait_event(fs_info->scrub_pause_wait,
281 atomic_read(&fs_info->scrub_pause_req) == 0);
282 mutex_lock(&fs_info->scrub_lock);
283 }
284 }
285
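/*
 * scrub_pause_on()/scrub_pause_off() bracket a section in which this scrub
 * counts itself as paused, so that a pending pause request (e.g. from a
 * transaction commit) can make progress.
 */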
286 static void scrub_pause_on(struct btrfs_fs_info *fs_info)
287 {
288 atomic_inc(&fs_info->scrubs_paused);
289 wake_up(&fs_info->scrub_pause_wait);
290 }
291
292 static void scrub_pause_off(struct btrfs_fs_info *fs_info)
293 {
294 mutex_lock(&fs_info->scrub_lock);
295 __scrub_blocked_if_needed(fs_info);
296 atomic_dec(&fs_info->scrubs_paused);
297 mutex_unlock(&fs_info->scrub_lock);
298
299 wake_up(&fs_info->scrub_pause_wait);
300 }
301
302 static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
303 {
304 scrub_pause_on(fs_info);
305 scrub_pause_off(fs_info);
306 }
307
308 /*
309 * Insert new full stripe lock into full stripe locks tree
310 *
311 * Return pointer to existing or newly inserted full_stripe_lock structure if
312 * everything works well.
313 * Return ERR_PTR(-ENOMEM) if we failed to allocate memory
314 *
315 * NOTE: caller must hold full_stripe_locks_root->lock before calling this
316 * function
317 */
318 static struct full_stripe_lock *insert_full_stripe_lock(
319 struct btrfs_full_stripe_locks_tree *locks_root,
320 u64 fstripe_logical)
321 {
322 struct rb_node **p;
323 struct rb_node *parent = NULL;
324 struct full_stripe_lock *entry;
325 struct full_stripe_lock *ret;
326
327 lockdep_assert_held(&locks_root->lock);
328
329 p = &locks_root->root.rb_node;
330 while (*p) {
331 parent = *p;
332 entry = rb_entry(parent, struct full_stripe_lock, node);
333 if (fstripe_logical < entry->logical) {
334 p = &(*p)->rb_left;
335 } else if (fstripe_logical > entry->logical) {
336 p = &(*p)->rb_right;
337 } else {
338 entry->refs++;
339 return entry;
340 }
341 }
342
343 /*
344 * Insert new lock.
345 */
346 ret = kmalloc(sizeof(*ret), GFP_KERNEL);
347 if (!ret)
348 return ERR_PTR(-ENOMEM);
349 ret->logical = fstripe_logical;
350 ret->refs = 1;
351 mutex_init(&ret->mutex);
352
353 rb_link_node(&ret->node, parent, p);
354 rb_insert_color(&ret->node, &locks_root->root);
355 return ret;
356 }
357
358 /*
359 * Search for a full stripe lock of a block group
360 *
361 * Return pointer to existing full stripe lock if found
362 * Return NULL if not found
363 */
364 static struct full_stripe_lock *search_full_stripe_lock(
365 struct btrfs_full_stripe_locks_tree *locks_root,
366 u64 fstripe_logical)
367 {
368 struct rb_node *node;
369 struct full_stripe_lock *entry;
370
371 lockdep_assert_held(&locks_root->lock);
372
373 node = locks_root->root.rb_node;
374 while (node) {
375 entry = rb_entry(node, struct full_stripe_lock, node);
376 if (fstripe_logical < entry->logical)
377 node = node->rb_left;
378 else if (fstripe_logical > entry->logical)
379 node = node->rb_right;
380 else
381 return entry;
382 }
383 return NULL;
384 }
385
386 /*
387 * Helper to get full stripe logical from a normal bytenr.
388 *
389 * Caller must ensure @cache is a RAID56 block group.
390 */
391 static u64 get_full_stripe_logical(struct btrfs_block_group *cache, u64 bytenr)
392 {
393 u64 ret;
394
395 /*
396 * Due to chunk item size limit, full stripe length should not be
397 * larger than U32_MAX. Just a sanity check here.
398 */
399 WARN_ON_ONCE(cache->full_stripe_len >= U32_MAX);
400
401 /*
402 * round_down() can only handle power of 2, while RAID56 full
403 * stripe length can be 64KiB * n, so we need to manually round down.
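*
* E.g. (hypothetical numbers): with cache->start == 1GiB and a full stripe
* length of 192KiB (three 64KiB data stripes), bytenr == 1GiB + 200KiB maps
* to the full stripe starting at 1GiB + 192KiB.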
404 */
405 ret = div64_u64(bytenr - cache->start, cache->full_stripe_len) *
406 cache->full_stripe_len + cache->start;
407 return ret;
408 }
409
410 /*
411 * Lock a full stripe to avoid concurrency between recovery and read.
412 *
413 * It's only used for profiles with parity (RAID5/6); for other profiles it
414 * does nothing.
415 *
416 * Return 0 if we locked the full stripe covering @bytenr, with a mutex held.
417 * The caller must then call unlock_full_stripe() in the same context.
418 *
419 * Return <0 on error.
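*
* Typical usage (a sketch mirroring scrub_handle_errored_block()):
*
*	ret = lock_full_stripe(fs_info, logical, &locked);
*	if (ret < 0)
*		return ret;
*	... recovery work on the full stripe ...
*	unlock_full_stripe(fs_info, logical, locked);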
420 */
421 static int lock_full_stripe(struct btrfs_fs_info *fs_info, u64 bytenr,
422 bool *locked_ret)
423 {
424 struct btrfs_block_group *bg_cache;
425 struct btrfs_full_stripe_locks_tree *locks_root;
426 struct full_stripe_lock *existing;
427 u64 fstripe_start;
428 int ret = 0;
429
430 *locked_ret = false;
431 bg_cache = btrfs_lookup_block_group(fs_info, bytenr);
432 if (!bg_cache) {
433 ASSERT(0);
434 return -ENOENT;
435 }
436
437 /* Profiles not based on parity don't need full stripe lock */
438 if (!(bg_cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK))
439 goto out;
440 locks_root = &bg_cache->full_stripe_locks_root;
441
442 fstripe_start = get_full_stripe_logical(bg_cache, bytenr);
443
444 /* Now insert the full stripe lock */
445 mutex_lock(&locks_root->lock);
446 existing = insert_full_stripe_lock(locks_root, fstripe_start);
447 mutex_unlock(&locks_root->lock);
448 if (IS_ERR(existing)) {
449 ret = PTR_ERR(existing);
450 goto out;
451 }
452 mutex_lock(&existing->mutex);
453 *locked_ret = true;
454 out:
455 btrfs_put_block_group(bg_cache);
456 return ret;
457 }
458
459 /*
460 * Unlock a full stripe.
461 *
462 * NOTE: Caller must ensure this is called in the same context as the
463 * corresponding lock_full_stripe().
464 *
465 * Return 0 if we unlocked the full stripe without problems.
466 * Return <0 on error.
467 */
468 static int unlock_full_stripe(struct btrfs_fs_info *fs_info, u64 bytenr,
469 bool locked)
470 {
471 struct btrfs_block_group *bg_cache;
472 struct btrfs_full_stripe_locks_tree *locks_root;
473 struct full_stripe_lock *fstripe_lock;
474 u64 fstripe_start;
475 bool freeit = false;
476 int ret = 0;
477
478 /* If we didn't acquire full stripe lock, no need to continue */
479 if (!locked)
480 return 0;
481
482 bg_cache = btrfs_lookup_block_group(fs_info, bytenr);
483 if (!bg_cache) {
484 ASSERT(0);
485 return -ENOENT;
486 }
487 if (!(bg_cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK))
488 goto out;
489
490 locks_root = &bg_cache->full_stripe_locks_root;
491 fstripe_start = get_full_stripe_logical(bg_cache, bytenr);
492
493 mutex_lock(&locks_root->lock);
494 fstripe_lock = search_full_stripe_lock(locks_root, fstripe_start);
495 /* Unpaired unlock_full_stripe() detected */
496 if (!fstripe_lock) {
497 WARN_ON(1);
498 ret = -ENOENT;
499 mutex_unlock(&locks_root->lock);
500 goto out;
501 }
502
503 if (fstripe_lock->refs == 0) {
504 WARN_ON(1);
505 btrfs_warn(fs_info, "full stripe lock at %llu refcount underflow",
506 fstripe_lock->logical);
507 } else {
508 fstripe_lock->refs--;
509 }
510
511 if (fstripe_lock->refs == 0) {
512 rb_erase(&fstripe_lock->node, &locks_root->root);
513 freeit = true;
514 }
515 mutex_unlock(&locks_root->lock);
516
517 mutex_unlock(&fstripe_lock->mutex);
518 if (freeit)
519 kfree(fstripe_lock);
520 out:
521 btrfs_put_block_group(bg_cache);
522 return ret;
523 }
524
525 static void scrub_free_csums(struct scrub_ctx *sctx)
526 {
527 while (!list_empty(&sctx->csum_list)) {
528 struct btrfs_ordered_sum *sum;
529 sum = list_first_entry(&sctx->csum_list,
530 struct btrfs_ordered_sum, list);
531 list_del(&sum->list);
532 kfree(sum);
533 }
534 }
535
536 static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx)
537 {
538 int i;
539
540 if (!sctx)
541 return;
542
543 /* this can happen when scrub is cancelled */
544 if (sctx->curr != -1) {
545 struct scrub_bio *sbio = sctx->bios[sctx->curr];
546
547 for (i = 0; i < sbio->page_count; i++) {
548 WARN_ON(!sbio->pagev[i]->page);
549 scrub_block_put(sbio->pagev[i]->sblock);
550 }
551 bio_put(sbio->bio);
552 }
553
554 for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
555 struct scrub_bio *sbio = sctx->bios[i];
556
557 if (!sbio)
558 break;
559 kfree(sbio);
560 }
561
562 kfree(sctx->wr_curr_bio);
563 scrub_free_csums(sctx);
564 kfree(sctx);
565 }
566
567 static void scrub_put_ctx(struct scrub_ctx *sctx)
568 {
569 if (refcount_dec_and_test(&sctx->refs))
570 scrub_free_ctx(sctx);
571 }
572
573 static noinline_for_stack struct scrub_ctx *scrub_setup_ctx(
574 struct btrfs_fs_info *fs_info, int is_dev_replace)
575 {
576 struct scrub_ctx *sctx;
577 int i;
578
579 sctx = kzalloc(sizeof(*sctx), GFP_KERNEL);
580 if (!sctx)
581 goto nomem;
582 refcount_set(&sctx->refs, 1);
583 sctx->is_dev_replace = is_dev_replace;
584 sctx->pages_per_rd_bio = SCRUB_PAGES_PER_RD_BIO;
585 sctx->curr = -1;
586 sctx->fs_info = fs_info;
587 INIT_LIST_HEAD(&sctx->csum_list);
588 for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
589 struct scrub_bio *sbio;
590
591 sbio = kzalloc(sizeof(*sbio), GFP_KERNEL);
592 if (!sbio)
593 goto nomem;
594 sctx->bios[i] = sbio;
595
596 sbio->index = i;
597 sbio->sctx = sctx;
598 sbio->page_count = 0;
599 btrfs_init_work(&sbio->work, scrub_bio_end_io_worker, NULL,
600 NULL);
601
602 if (i != SCRUB_BIOS_PER_SCTX - 1)
603 sctx->bios[i]->next_free = i + 1;
604 else
605 sctx->bios[i]->next_free = -1;
606 }
607 sctx->first_free = 0;
608 atomic_set(&sctx->bios_in_flight, 0);
609 atomic_set(&sctx->workers_pending, 0);
610 atomic_set(&sctx->cancel_req, 0);
611
612 spin_lock_init(&sctx->list_lock);
613 spin_lock_init(&sctx->stat_lock);
614 init_waitqueue_head(&sctx->list_wait);
615
616 WARN_ON(sctx->wr_curr_bio != NULL);
617 mutex_init(&sctx->wr_lock);
618 sctx->wr_curr_bio = NULL;
619 if (is_dev_replace) {
620 WARN_ON(!fs_info->dev_replace.tgtdev);
621 sctx->pages_per_wr_bio = SCRUB_PAGES_PER_WR_BIO;
622 sctx->wr_tgtdev = fs_info->dev_replace.tgtdev;
623 sctx->flush_all_writes = false;
624 }
625
626 return sctx;
627
628 nomem:
629 scrub_free_ctx(sctx);
630 return ERR_PTR(-ENOMEM);
631 }
632
633 static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,
634 void *warn_ctx)
635 {
636 u64 isize;
637 u32 nlink;
638 int ret;
639 int i;
640 unsigned nofs_flag;
641 struct extent_buffer *eb;
642 struct btrfs_inode_item *inode_item;
643 struct scrub_warning *swarn = warn_ctx;
644 struct btrfs_fs_info *fs_info = swarn->dev->fs_info;
645 struct inode_fs_paths *ipath = NULL;
646 struct btrfs_root *local_root;
647 struct btrfs_key key;
648
649 local_root = btrfs_get_fs_root(fs_info, root, true);
650 if (IS_ERR(local_root)) {
651 ret = PTR_ERR(local_root);
652 goto err;
653 }
654
655 /*
656 * this makes the path point to (inum INODE_ITEM ioff)
657 */
658 key.objectid = inum;
659 key.type = BTRFS_INODE_ITEM_KEY;
660 key.offset = 0;
661
662 ret = btrfs_search_slot(NULL, local_root, &key, swarn->path, 0, 0);
663 if (ret) {
664 btrfs_put_root(local_root);
665 btrfs_release_path(swarn->path);
666 goto err;
667 }
668
669 eb = swarn->path->nodes[0];
670 inode_item = btrfs_item_ptr(eb, swarn->path->slots[0],
671 struct btrfs_inode_item);
672 isize = btrfs_inode_size(eb, inode_item);
673 nlink = btrfs_inode_nlink(eb, inode_item);
674 btrfs_release_path(swarn->path);
675
676 /*
677 * init_ipath() might indirectly call vmalloc, or use GFP_KERNEL. Scrub
678 * uses GFP_NOFS in this context, so we keep it consistent but it does
679 * not seem to be strictly necessary.
680 */
681 nofs_flag = memalloc_nofs_save();
682 ipath = init_ipath(4096, local_root, swarn->path);
683 memalloc_nofs_restore(nofs_flag);
684 if (IS_ERR(ipath)) {
685 btrfs_put_root(local_root);
686 ret = PTR_ERR(ipath);
687 ipath = NULL;
688 goto err;
689 }
690 ret = paths_from_inode(inum, ipath);
691
692 if (ret < 0)
693 goto err;
694
695 /*
696 * We deliberately ignore the fact that ipath might have been too small to
697 * hold all of the paths here.
698 */
699 for (i = 0; i < ipath->fspath->elem_cnt; ++i)
700 btrfs_warn_in_rcu(fs_info,
701 "%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu, length %llu, links %u (path: %s)",
702 swarn->errstr, swarn->logical,
703 rcu_str_deref(swarn->dev->name),
704 swarn->physical,
705 root, inum, offset,
706 min(isize - offset, (u64)PAGE_SIZE), nlink,
707 (char *)(unsigned long)ipath->fspath->val[i]);
708
709 btrfs_put_root(local_root);
710 free_ipath(ipath);
711 return 0;
712
713 err:
714 btrfs_warn_in_rcu(fs_info,
715 "%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu: path resolving failed with ret=%d",
716 swarn->errstr, swarn->logical,
717 rcu_str_deref(swarn->dev->name),
718 swarn->physical,
719 root, inum, offset, ret);
720
721 free_ipath(ipath);
722 return 0;
723 }
724
725 static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
726 {
727 struct btrfs_device *dev;
728 struct btrfs_fs_info *fs_info;
729 struct btrfs_path *path;
730 struct btrfs_key found_key;
731 struct extent_buffer *eb;
732 struct btrfs_extent_item *ei;
733 struct scrub_warning swarn;
734 unsigned long ptr = 0;
735 u64 extent_item_pos;
736 u64 flags = 0;
737 u64 ref_root;
738 u32 item_size;
739 u8 ref_level = 0;
740 int ret;
741
742 WARN_ON(sblock->page_count < 1);
743 dev = sblock->pagev[0]->dev;
744 fs_info = sblock->sctx->fs_info;
745
746 path = btrfs_alloc_path();
747 if (!path)
748 return;
749
750 swarn.physical = sblock->pagev[0]->physical;
751 swarn.logical = sblock->pagev[0]->logical;
752 swarn.errstr = errstr;
753 swarn.dev = NULL;
754
755 ret = extent_from_logical(fs_info, swarn.logical, path, &found_key,
756 &flags);
757 if (ret < 0)
758 goto out;
759
760 extent_item_pos = swarn.logical - found_key.objectid;
761 swarn.extent_item_size = found_key.offset;
762
763 eb = path->nodes[0];
764 ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
765 item_size = btrfs_item_size_nr(eb, path->slots[0]);
766
767 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
768 do {
769 ret = tree_backref_for_extent(&ptr, eb, &found_key, ei,
770 item_size, &ref_root,
771 &ref_level);
772 btrfs_warn_in_rcu(fs_info,
773 "%s at logical %llu on dev %s, physical %llu: metadata %s (level %d) in tree %llu",
774 errstr, swarn.logical,
775 rcu_str_deref(dev->name),
776 swarn.physical,
777 ref_level ? "node" : "leaf",
778 ret < 0 ? -1 : ref_level,
779 ret < 0 ? -1 : ref_root);
780 } while (ret != 1);
781 btrfs_release_path(path);
782 } else {
783 btrfs_release_path(path);
784 swarn.path = path;
785 swarn.dev = dev;
786 iterate_extent_inodes(fs_info, found_key.objectid,
787 extent_item_pos, 1,
788 scrub_print_warning_inode, &swarn, false);
789 }
790
791 out:
792 btrfs_free_path(path);
793 }
794
795 static inline void scrub_get_recover(struct scrub_recover *recover)
796 {
797 refcount_inc(&recover->refs);
798 }
799
800 static inline void scrub_put_recover(struct btrfs_fs_info *fs_info,
801 struct scrub_recover *recover)
802 {
803 if (refcount_dec_and_test(&recover->refs)) {
804 btrfs_bio_counter_dec(fs_info);
805 btrfs_put_bbio(recover->bbio);
806 kfree(recover);
807 }
808 }
809
810 /*
811 * scrub_handle_errored_block gets called when either verification of the
812 * pages failed or the bio failed to read, e.g. with EIO. In the latter
813 * case, this function handles all pages in the bio, even though only one
814 * may be bad.
815 * The goal of this function is to repair the errored block by using the
816 * contents of one of the mirrors.
817 */
818 static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
819 {
820 struct scrub_ctx *sctx = sblock_to_check->sctx;
821 struct btrfs_device *dev;
822 struct btrfs_fs_info *fs_info;
823 u64 logical;
824 unsigned int failed_mirror_index;
825 unsigned int is_metadata;
826 unsigned int have_csum;
827 struct scrub_block *sblocks_for_recheck; /* holds one for each mirror */
828 struct scrub_block *sblock_bad;
829 int ret;
830 int mirror_index;
831 int page_num;
832 int success;
833 bool full_stripe_locked;
834 unsigned int nofs_flag;
835 static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL,
836 DEFAULT_RATELIMIT_BURST);
837
838 BUG_ON(sblock_to_check->page_count < 1);
839 fs_info = sctx->fs_info;
840 if (sblock_to_check->pagev[0]->flags & BTRFS_EXTENT_FLAG_SUPER) {
841 /*
842 * if we find an error in a super block, we just report it.
843 * They will get written with the next transaction commit
844 * anyway
845 */
846 spin_lock(&sctx->stat_lock);
847 ++sctx->stat.super_errors;
848 spin_unlock(&sctx->stat_lock);
849 return 0;
850 }
851 logical = sblock_to_check->pagev[0]->logical;
852 BUG_ON(sblock_to_check->pagev[0]->mirror_num < 1);
853 failed_mirror_index = sblock_to_check->pagev[0]->mirror_num - 1;
854 is_metadata = !(sblock_to_check->pagev[0]->flags &
855 BTRFS_EXTENT_FLAG_DATA);
856 have_csum = sblock_to_check->pagev[0]->have_csum;
857 dev = sblock_to_check->pagev[0]->dev;
858
859 /*
860 * We must use GFP_NOFS because the scrub task might be waiting for a
861 * worker task executing this function and in turn a transaction commit
862 * might be waiting for the scrub task to pause (which needs to wait for all
863 * the worker tasks to complete before pausing).
864 * We do allocations in the workers through insert_full_stripe_lock()
865 * and scrub_add_page_to_wr_bio(), which happen down the call chain of
866 * this function.
867 */
868 nofs_flag = memalloc_nofs_save();
869 /*
870 * For RAID5/6, a race can happen with a scrub thread for a different
871 * device. For data corruption, the parity and data threads will both
872 * try to recover the data.
873 * The race can lead to doubly counted csum errors, or even unrecoverable
874 * errors.
875 */
876 ret = lock_full_stripe(fs_info, logical, &full_stripe_locked);
877 if (ret < 0) {
878 memalloc_nofs_restore(nofs_flag);
879 spin_lock(&sctx->stat_lock);
880 if (ret == -ENOMEM)
881 sctx->stat.malloc_errors++;
882 sctx->stat.read_errors++;
883 sctx->stat.uncorrectable_errors++;
884 spin_unlock(&sctx->stat_lock);
885 return ret;
886 }
887
888 /*
889 * Read all mirrors one after the other. This includes re-reading
890 * the extent or metadata block that failed (which was
891 * the reason this fixup code was called) another time,
892 * page by page this time in order to know which pages
893 * caused I/O errors and which ones are good (for all mirrors).
894 * It is the goal to handle the situation when more than one
895 * mirror contains I/O errors, but the errors do not
896 * overlap, i.e. the data can be repaired by selecting the
897 * pages from those mirrors without I/O error on the
898 * particular pages. One example (with blocks >= 2 * PAGE_SIZE)
899 * would be that mirror #1 has an I/O error on the first page,
900 * the second page is good, and mirror #2 has an I/O error on
901 * the second page, but the first page is good.
902 * Then the first page of the first mirror can be repaired by
903 * taking the first page of the second mirror, and the
904 * second page of the second mirror can be repaired by
905 * copying the contents of the 2nd page of the 1st mirror.
906 * One more note: if the pages of one mirror contain I/O
907 * errors, the checksum cannot be verified. In order to get
908 * the best data for repairing, the first attempt is to find
909 * a mirror without I/O errors and with a validated checksum.
910 * Only if this is not possible, the pages are picked from
911 * mirrors with I/O errors without considering the checksum.
912 * If the latter is the case, at the end, the checksum of the
913 * repaired area is verified in order to correctly maintain
914 * the statistics.
915 */
916
917 sblocks_for_recheck = kcalloc(BTRFS_MAX_MIRRORS,
918 sizeof(*sblocks_for_recheck), GFP_KERNEL);
919 if (!sblocks_for_recheck) {
920 spin_lock(&sctx->stat_lock);
921 sctx->stat.malloc_errors++;
922 sctx->stat.read_errors++;
923 sctx->stat.uncorrectable_errors++;
924 spin_unlock(&sctx->stat_lock);
925 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
926 goto out;
927 }
928
929 /* setup the context, map the logical blocks and alloc the pages */
930 ret = scrub_setup_recheck_block(sblock_to_check, sblocks_for_recheck);
931 if (ret) {
932 spin_lock(&sctx->stat_lock);
933 sctx->stat.read_errors++;
934 sctx->stat.uncorrectable_errors++;
935 spin_unlock(&sctx->stat_lock);
936 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
937 goto out;
938 }
939 BUG_ON(failed_mirror_index >= BTRFS_MAX_MIRRORS);
940 sblock_bad = sblocks_for_recheck + failed_mirror_index;
941
942 /* build and submit the bios for the failed mirror, check checksums */
943 scrub_recheck_block(fs_info, sblock_bad, 1);
944
945 if (!sblock_bad->header_error && !sblock_bad->checksum_error &&
946 sblock_bad->no_io_error_seen) {
947 /*
948 * the error disappeared after reading page by page, or
949 * the area was part of a huge bio and other parts of the
950 * bio caused I/O errors, or the block layer merged several
951 * read requests into one and the error is caused by a
952 * different bio (usually one of the two latter cases is
953 * the cause)
954 */
955 spin_lock(&sctx->stat_lock);
956 sctx->stat.unverified_errors++;
957 sblock_to_check->data_corrected = 1;
958 spin_unlock(&sctx->stat_lock);
959
960 if (sctx->is_dev_replace)
961 scrub_write_block_to_dev_replace(sblock_bad);
962 goto out;
963 }
964
965 if (!sblock_bad->no_io_error_seen) {
966 spin_lock(&sctx->stat_lock);
967 sctx->stat.read_errors++;
968 spin_unlock(&sctx->stat_lock);
969 if (__ratelimit(&rs))
970 scrub_print_warning("i/o error", sblock_to_check);
971 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
972 } else if (sblock_bad->checksum_error) {
973 spin_lock(&sctx->stat_lock);
974 sctx->stat.csum_errors++;
975 spin_unlock(&sctx->stat_lock);
976 if (__ratelimit(&rs))
977 scrub_print_warning("checksum error", sblock_to_check);
978 btrfs_dev_stat_inc_and_print(dev,
979 BTRFS_DEV_STAT_CORRUPTION_ERRS);
980 } else if (sblock_bad->header_error) {
981 spin_lock(&sctx->stat_lock);
982 sctx->stat.verify_errors++;
983 spin_unlock(&sctx->stat_lock);
984 if (__ratelimit(&rs))
985 scrub_print_warning("checksum/header error",
986 sblock_to_check);
987 if (sblock_bad->generation_error)
988 btrfs_dev_stat_inc_and_print(dev,
989 BTRFS_DEV_STAT_GENERATION_ERRS);
990 else
991 btrfs_dev_stat_inc_and_print(dev,
992 BTRFS_DEV_STAT_CORRUPTION_ERRS);
993 }
994
995 if (sctx->readonly) {
996 ASSERT(!sctx->is_dev_replace);
997 goto out;
998 }
999
1000 /*
1001 * now build and submit the bios for the other mirrors, check
1002 * checksums.
1003 * First try to pick the mirror which is completely without I/O
1004 * errors and also does not have a checksum error.
1005 * If one is found, and if a checksum is present, the full block
1006 * that is known to contain an error is rewritten. Afterwards
1007 * the block is known to be corrected.
1008 * If a mirror is found which is completely correct, and no
1009 * checksum is present, only those pages are rewritten that had
1010 * an I/O error in the block to be repaired, since it cannot be
1011 * determined, which copy of the other pages is better (and it
1012 * could happen otherwise that a correct page would be
1013 * overwritten by a bad one).
1014 */
1015 for (mirror_index = 0; ;mirror_index++) {
1016 struct scrub_block *sblock_other;
1017
1018 if (mirror_index == failed_mirror_index)
1019 continue;
1020
1021 /* raid56's mirror can be more than BTRFS_MAX_MIRRORS */
1022 if (!scrub_is_page_on_raid56(sblock_bad->pagev[0])) {
1023 if (mirror_index >= BTRFS_MAX_MIRRORS)
1024 break;
1025 if (!sblocks_for_recheck[mirror_index].page_count)
1026 break;
1027
1028 sblock_other = sblocks_for_recheck + mirror_index;
1029 } else {
1030 struct scrub_recover *r = sblock_bad->pagev[0]->recover;
1031 int max_allowed = r->bbio->num_stripes -
1032 r->bbio->num_tgtdevs;
1033
1034 if (mirror_index >= max_allowed)
1035 break;
1036 if (!sblocks_for_recheck[1].page_count)
1037 break;
1038
1039 ASSERT(failed_mirror_index == 0);
1040 sblock_other = sblocks_for_recheck + 1;
1041 sblock_other->pagev[0]->mirror_num = 1 + mirror_index;
1042 }
1043
1044 /* build and submit the bios, check checksums */
1045 scrub_recheck_block(fs_info, sblock_other, 0);
1046
1047 if (!sblock_other->header_error &&
1048 !sblock_other->checksum_error &&
1049 sblock_other->no_io_error_seen) {
1050 if (sctx->is_dev_replace) {
1051 scrub_write_block_to_dev_replace(sblock_other);
1052 goto corrected_error;
1053 } else {
1054 ret = scrub_repair_block_from_good_copy(
1055 sblock_bad, sblock_other);
1056 if (!ret)
1057 goto corrected_error;
1058 }
1059 }
1060 }
1061
1062 if (sblock_bad->no_io_error_seen && !sctx->is_dev_replace)
1063 goto did_not_correct_error;
1064
1065 /*
1066 * In case of I/O errors in the area that is supposed to be
1067 * repaired, continue by picking good copies of those pages.
1068 * Select the good pages from mirrors to rewrite bad pages from
1069 * the area to fix. Afterwards verify the checksum of the block
1070 * that is supposed to be repaired. This verification step is
1071 * only done for the purpose of statistics counting and for the
1072 * final scrub report on whether errors remain.
1073 * A perfect algorithm could make use of the checksum and try
1074 * all possible combinations of pages from the different mirrors
1075 * until the checksum verification succeeds. For example, when
1076 * the 2nd page of mirror #1 faces I/O errors, and the 2nd page
1077 * of mirror #2 is readable but the final checksum test fails,
1078 * then the 2nd page of mirror #3 could be tried, whether now
1079 * the final checksum succeeds. But this would be a rare
1080 * exception and is therefore not implemented. At least it is
1081 * avoided that the good copy is overwritten.
1082 * A more useful improvement would be to pick the sectors
1083 * without I/O error based on sector sizes (512 bytes on legacy
1084 * disks) instead of on PAGE_SIZE. Then maybe 512 byte of one
1085 * mirror could be repaired by taking 512 byte of a different
1086 * mirror, even if other 512 byte sectors in the same PAGE_SIZE
1087 * area are unreadable.
1088 */
1089 success = 1;
1090 for (page_num = 0; page_num < sblock_bad->page_count;
1091 page_num++) {
1092 struct scrub_page *spage_bad = sblock_bad->pagev[page_num];
1093 struct scrub_block *sblock_other = NULL;
1094
1095 /* skip no-io-error page in scrub */
1096 if (!spage_bad->io_error && !sctx->is_dev_replace)
1097 continue;
1098
1099 if (scrub_is_page_on_raid56(sblock_bad->pagev[0])) {
1100 /*
1101 * In case of dev replace, if the raid56 rebuild process
1102 * didn't produce the correct data, then copy the content
1103 * of sblock_bad to make sure the target device is identical
1104 * to the source device, instead of writing garbage data from the
1105 * sblock_for_recheck array to the target device.
1106 */
1107 sblock_other = NULL;
1108 } else if (spage_bad->io_error) {
1109 /* try to find no-io-error page in mirrors */
1110 for (mirror_index = 0;
1111 mirror_index < BTRFS_MAX_MIRRORS &&
1112 sblocks_for_recheck[mirror_index].page_count > 0;
1113 mirror_index++) {
1114 if (!sblocks_for_recheck[mirror_index].
1115 pagev[page_num]->io_error) {
1116 sblock_other = sblocks_for_recheck +
1117 mirror_index;
1118 break;
1119 }
1120 }
1121 if (!sblock_other)
1122 success = 0;
1123 }
1124
1125 if (sctx->is_dev_replace) {
1126 /*
1127 * did not find a mirror to fetch the page
1128 * from. scrub_write_page_to_dev_replace()
1129 * handles this case (page->io_error), by
1130 * filling the block with zeros before
1131 * submitting the write request
1132 */
1133 if (!sblock_other)
1134 sblock_other = sblock_bad;
1135
1136 if (scrub_write_page_to_dev_replace(sblock_other,
1137 page_num) != 0) {
1138 atomic64_inc(
1139 &fs_info->dev_replace.num_write_errors);
1140 success = 0;
1141 }
1142 } else if (sblock_other) {
1143 ret = scrub_repair_page_from_good_copy(sblock_bad,
1144 sblock_other,
1145 page_num, 0);
1146 if (!ret)
1147 spage_bad->io_error = 0;
1148 else
1149 success = 0;
1150 }
1151 }
1152
1153 if (success && !sctx->is_dev_replace) {
1154 if (is_metadata || have_csum) {
1155 /*
1156 * need to verify the checksum now that all
1157 * sectors on disk are repaired (the write
1158 * request for data to be repaired is on its way).
1159 * Just be lazy and use scrub_recheck_block()
1160 * which re-reads the data before the checksum
1161 * is verified, but most likely the data comes out
1162 * of the page cache.
1163 */
1164 scrub_recheck_block(fs_info, sblock_bad, 1);
1165 if (!sblock_bad->header_error &&
1166 !sblock_bad->checksum_error &&
1167 sblock_bad->no_io_error_seen)
1168 goto corrected_error;
1169 else
1170 goto did_not_correct_error;
1171 } else {
1172 corrected_error:
1173 spin_lock(&sctx->stat_lock);
1174 sctx->stat.corrected_errors++;
1175 sblock_to_check->data_corrected = 1;
1176 spin_unlock(&sctx->stat_lock);
1177 btrfs_err_rl_in_rcu(fs_info,
1178 "fixed up error at logical %llu on dev %s",
1179 logical, rcu_str_deref(dev->name));
1180 }
1181 } else {
1182 did_not_correct_error:
1183 spin_lock(&sctx->stat_lock);
1184 sctx->stat.uncorrectable_errors++;
1185 spin_unlock(&sctx->stat_lock);
1186 btrfs_err_rl_in_rcu(fs_info,
1187 "unable to fixup (regular) error at logical %llu on dev %s",
1188 logical, rcu_str_deref(dev->name));
1189 }
1190
1191 out:
1192 if (sblocks_for_recheck) {
1193 for (mirror_index = 0; mirror_index < BTRFS_MAX_MIRRORS;
1194 mirror_index++) {
1195 struct scrub_block *sblock = sblocks_for_recheck +
1196 mirror_index;
1197 struct scrub_recover *recover;
1198 int page_index;
1199
1200 for (page_index = 0; page_index < sblock->page_count;
1201 page_index++) {
1202 sblock->pagev[page_index]->sblock = NULL;
1203 recover = sblock->pagev[page_index]->recover;
1204 if (recover) {
1205 scrub_put_recover(fs_info, recover);
1206 sblock->pagev[page_index]->recover =
1207 NULL;
1208 }
1209 scrub_page_put(sblock->pagev[page_index]);
1210 }
1211 }
1212 kfree(sblocks_for_recheck);
1213 }
1214
1215 ret = unlock_full_stripe(fs_info, logical, full_stripe_locked);
1216 memalloc_nofs_restore(nofs_flag);
1217 if (ret < 0)
1218 return ret;
1219 return 0;
1220 }
1221
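/*
 * Number of copies that can be tried when rechecking a block: RAID5 has the
 * data plus one rebuild from parity (2), RAID6 adds a second parity (3); for
 * all other profiles every stripe returned by the mapping is a full mirror.
 */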
1222 static inline int scrub_nr_raid_mirrors(struct btrfs_bio *bbio)
1223 {
1224 if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID5)
1225 return 2;
1226 else if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID6)
1227 return 3;
1228 else
1229 return (int)bbio->num_stripes;
1230 }
1231
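/*
 * Map @logical back to the stripe (and the offset into it) that the recheck
 * code should read.  For RAID5/6 the raid_map[] of the mapped full stripe is
 * scanned, skipping the P/Q entries, until the data stripe containing
 * @logical is found; for other profiles @mirror is used directly as the
 * stripe index with offset 0.  E.g., hypothetically, if
 * raid_map[1] <= logical < raid_map[1] + mapped_length, the result is
 * *stripe_index == 1 and *stripe_offset == logical - raid_map[1].
 */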
1232 static inline void scrub_stripe_index_and_offset(u64 logical, u64 map_type,
1233 u64 *raid_map,
1234 u64 mapped_length,
1235 int nstripes, int mirror,
1236 int *stripe_index,
1237 u64 *stripe_offset)
1238 {
1239 int i;
1240
1241 if (map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
1242 /* RAID5/6 */
1243 for (i = 0; i < nstripes; i++) {
1244 if (raid_map[i] == RAID6_Q_STRIPE ||
1245 raid_map[i] == RAID5_P_STRIPE)
1246 continue;
1247
1248 if (logical >= raid_map[i] &&
1249 logical < raid_map[i] + mapped_length)
1250 break;
1251 }
1252
1253 *stripe_index = i;
1254 *stripe_offset = logical - raid_map[i];
1255 } else {
1256 /* The other RAID type */
1257 *stripe_index = mirror;
1258 *stripe_offset = 0;
1259 }
1260 }
1261
1262 static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
1263 struct scrub_block *sblocks_for_recheck)
1264 {
1265 struct scrub_ctx *sctx = original_sblock->sctx;
1266 struct btrfs_fs_info *fs_info = sctx->fs_info;
1267 u64 length = original_sblock->page_count * PAGE_SIZE;
1268 u64 logical = original_sblock->pagev[0]->logical;
1269 u64 generation = original_sblock->pagev[0]->generation;
1270 u64 flags = original_sblock->pagev[0]->flags;
1271 u64 have_csum = original_sblock->pagev[0]->have_csum;
1272 struct scrub_recover *recover;
1273 struct btrfs_bio *bbio;
1274 u64 sublen;
1275 u64 mapped_length;
1276 u64 stripe_offset;
1277 int stripe_index;
1278 int page_index = 0;
1279 int mirror_index;
1280 int nmirrors;
1281 int ret;
1282
1283 /*
1284 * note: the two members refs and outstanding_pages
1285 * are not used (and not set) in the blocks that are used for
1286 * the recheck procedure
1287 */
1288
1289 while (length > 0) {
1290 sublen = min_t(u64, length, PAGE_SIZE);
1291 mapped_length = sublen;
1292 bbio = NULL;
1293
1294 /*
1295 * with a length of PAGE_SIZE, each returned stripe
1296 * represents one mirror
1297 */
1298 btrfs_bio_counter_inc_blocked(fs_info);
1299 ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
1300 logical, &mapped_length, &bbio);
1301 if (ret || !bbio || mapped_length < sublen) {
1302 btrfs_put_bbio(bbio);
1303 btrfs_bio_counter_dec(fs_info);
1304 return -EIO;
1305 }
1306
1307 recover = kzalloc(sizeof(struct scrub_recover), GFP_NOFS);
1308 if (!recover) {
1309 btrfs_put_bbio(bbio);
1310 btrfs_bio_counter_dec(fs_info);
1311 return -ENOMEM;
1312 }
1313
1314 refcount_set(&recover->refs, 1);
1315 recover->bbio = bbio;
1316 recover->map_length = mapped_length;
1317
1318 BUG_ON(page_index >= SCRUB_MAX_PAGES_PER_BLOCK);
1319
1320 nmirrors = min(scrub_nr_raid_mirrors(bbio), BTRFS_MAX_MIRRORS);
1321
1322 for (mirror_index = 0; mirror_index < nmirrors;
1323 mirror_index++) {
1324 struct scrub_block *sblock;
1325 struct scrub_page *spage;
1326
1327 sblock = sblocks_for_recheck + mirror_index;
1328 sblock->sctx = sctx;
1329
1330 spage = kzalloc(sizeof(*spage), GFP_NOFS);
1331 if (!spage) {
1332 leave_nomem:
1333 spin_lock(&sctx->stat_lock);
1334 sctx->stat.malloc_errors++;
1335 spin_unlock(&sctx->stat_lock);
1336 scrub_put_recover(fs_info, recover);
1337 return -ENOMEM;
1338 }
1339 scrub_page_get(spage);
1340 sblock->pagev[page_index] = spage;
1341 spage->sblock = sblock;
1342 spage->flags = flags;
1343 spage->generation = generation;
1344 spage->logical = logical;
1345 spage->have_csum = have_csum;
1346 if (have_csum)
1347 memcpy(spage->csum,
1348 original_sblock->pagev[0]->csum,
1349 sctx->fs_info->csum_size);
1350
1351 scrub_stripe_index_and_offset(logical,
1352 bbio->map_type,
1353 bbio->raid_map,
1354 mapped_length,
1355 bbio->num_stripes -
1356 bbio->num_tgtdevs,
1357 mirror_index,
1358 &stripe_index,
1359 &stripe_offset);
1360 spage->physical = bbio->stripes[stripe_index].physical +
1361 stripe_offset;
1362 spage->dev = bbio->stripes[stripe_index].dev;
1363
1364 BUG_ON(page_index >= original_sblock->page_count);
1365 spage->physical_for_dev_replace =
1366 original_sblock->pagev[page_index]->
1367 physical_for_dev_replace;
1368 /* for missing devices, dev->bdev is NULL */
1369 spage->mirror_num = mirror_index + 1;
1370 sblock->page_count++;
1371 spage->page = alloc_page(GFP_NOFS);
1372 if (!spage->page)
1373 goto leave_nomem;
1374
1375 scrub_get_recover(recover);
1376 spage->recover = recover;
1377 }
1378 scrub_put_recover(fs_info, recover);
1379 length -= sublen;
1380 logical += sublen;
1381 page_index++;
1382 }
1383
1384 return 0;
1385 }
1386
1387 static void scrub_bio_wait_endio(struct bio *bio)
1388 {
1389 complete(bio->bi_private);
1390 }
1391
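/*
 * Submit @bio through the RAID56 recovery path (raid56_parity_recover()) and
 * wait for it to complete.  Returns 0 on success, or an error from the
 * recovery code / the bio status converted to a negative errno.
 */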
1392 static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info,
1393 struct bio *bio,
1394 struct scrub_page *spage)
1395 {
1396 DECLARE_COMPLETION_ONSTACK(done);
1397 int ret;
1398 int mirror_num;
1399
1400 bio->bi_iter.bi_sector = spage->logical >> 9;
1401 bio->bi_private = &done;
1402 bio->bi_end_io = scrub_bio_wait_endio;
1403
1404 mirror_num = spage->sblock->pagev[0]->mirror_num;
1405 ret = raid56_parity_recover(fs_info, bio, spage->recover->bbio,
1406 spage->recover->map_length,
1407 mirror_num, 0);
1408 if (ret)
1409 return ret;
1410
1411 wait_for_completion_io(&done);
1412 return blk_status_to_errno(bio->bi_status);
1413 }
1414
1415 static void scrub_recheck_block_on_raid56(struct btrfs_fs_info *fs_info,
1416 struct scrub_block *sblock)
1417 {
1418 struct scrub_page *first_page = sblock->pagev[0];
1419 struct bio *bio;
1420 int page_num;
1421
1422 /* All pages in sblock belong to the same stripe on the same device. */
1423 ASSERT(first_page->dev);
1424 if (!first_page->dev->bdev)
1425 goto out;
1426
1427 bio = btrfs_io_bio_alloc(BIO_MAX_PAGES);
1428 bio_set_dev(bio, first_page->dev->bdev);
1429
1430 for (page_num = 0; page_num < sblock->page_count; page_num++) {
1431 struct scrub_page *spage = sblock->pagev[page_num];
1432
1433 WARN_ON(!spage->page);
1434 bio_add_page(bio, spage->page, PAGE_SIZE, 0);
1435 }
1436
1437 if (scrub_submit_raid56_bio_wait(fs_info, bio, first_page)) {
1438 bio_put(bio);
1439 goto out;
1440 }
1441
1442 bio_put(bio);
1443
1444 scrub_recheck_block_checksum(sblock);
1445
1446 return;
1447 out:
1448 for (page_num = 0; page_num < sblock->page_count; page_num++)
1449 sblock->pagev[page_num]->io_error = 1;
1450
1451 sblock->no_io_error_seen = 0;
1452 }
1453
1454 /*
1455 * this function will check the on disk data for checksum errors, header
1456 * errors and read I/O errors. If any I/O errors happen, the exact pages
1457 * which are errored are marked as being bad. The goal is to enable scrub
1458 * to take those pages that are not errored from all the mirrors so that
1459 * the pages that are errored in the just handled mirror can be repaired.
1460 */
1461 static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
1462 struct scrub_block *sblock,
1463 int retry_failed_mirror)
1464 {
1465 int page_num;
1466
1467 sblock->no_io_error_seen = 1;
1468
1469 /* short cut for raid56 */
1470 if (!retry_failed_mirror && scrub_is_page_on_raid56(sblock->pagev[0]))
1471 return scrub_recheck_block_on_raid56(fs_info, sblock);
1472
1473 for (page_num = 0; page_num < sblock->page_count; page_num++) {
1474 struct bio *bio;
1475 struct scrub_page *spage = sblock->pagev[page_num];
1476
1477 if (spage->dev->bdev == NULL) {
1478 spage->io_error = 1;
1479 sblock->no_io_error_seen = 0;
1480 continue;
1481 }
1482
1483 WARN_ON(!spage->page);
1484 bio = btrfs_io_bio_alloc(1);
1485 bio_set_dev(bio, spage->dev->bdev);
1486
1487 bio_add_page(bio, spage->page, PAGE_SIZE, 0);
1488 bio->bi_iter.bi_sector = spage->physical >> 9;
1489 bio->bi_opf = REQ_OP_READ;
1490
1491 if (btrfsic_submit_bio_wait(bio)) {
1492 spage->io_error = 1;
1493 sblock->no_io_error_seen = 0;
1494 }
1495
1496 bio_put(bio);
1497 }
1498
1499 if (sblock->no_io_error_seen)
1500 scrub_recheck_block_checksum(sblock);
1501 }
1502
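/* Return 1 if @fsid matches the fsid of the page's fs_devices, 0 otherwise. */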
1503 static inline int scrub_check_fsid(u8 fsid[],
1504 struct scrub_page *spage)
1505 {
1506 struct btrfs_fs_devices *fs_devices = spage->dev->fs_devices;
1507 int ret;
1508
1509 ret = memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
1510 return !ret;
1511 }
1512
1513 static void scrub_recheck_block_checksum(struct scrub_block *sblock)
1514 {
1515 sblock->header_error = 0;
1516 sblock->checksum_error = 0;
1517 sblock->generation_error = 0;
1518
1519 if (sblock->pagev[0]->flags & BTRFS_EXTENT_FLAG_DATA)
1520 scrub_checksum_data(sblock);
1521 else
1522 scrub_checksum_tree_block(sblock);
1523 }
1524
1525 static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
1526 struct scrub_block *sblock_good)
1527 {
1528 int page_num;
1529 int ret = 0;
1530
1531 for (page_num = 0; page_num < sblock_bad->page_count; page_num++) {
1532 int ret_sub;
1533
1534 ret_sub = scrub_repair_page_from_good_copy(sblock_bad,
1535 sblock_good,
1536 page_num, 1);
1537 if (ret_sub)
1538 ret = ret_sub;
1539 }
1540
1541 return ret;
1542 }
1543
1544 static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
1545 struct scrub_block *sblock_good,
1546 int page_num, int force_write)
1547 {
1548 struct scrub_page *spage_bad = sblock_bad->pagev[page_num];
1549 struct scrub_page *spage_good = sblock_good->pagev[page_num];
1550 struct btrfs_fs_info *fs_info = sblock_bad->sctx->fs_info;
1551
1552 BUG_ON(spage_bad->page == NULL);
1553 BUG_ON(spage_good->page == NULL);
1554 if (force_write || sblock_bad->header_error ||
1555 sblock_bad->checksum_error || spage_bad->io_error) {
1556 struct bio *bio;
1557 int ret;
1558
1559 if (!spage_bad->dev->bdev) {
1560 btrfs_warn_rl(fs_info,
1561 "scrub_repair_page_from_good_copy(bdev == NULL) is unexpected");
1562 return -EIO;
1563 }
1564
1565 bio = btrfs_io_bio_alloc(1);
1566 bio_set_dev(bio, spage_bad->dev->bdev);
1567 bio->bi_iter.bi_sector = spage_bad->physical >> 9;
1568 bio->bi_opf = REQ_OP_WRITE;
1569
1570 ret = bio_add_page(bio, spage_good->page, PAGE_SIZE, 0);
1571 if (PAGE_SIZE != ret) {
1572 bio_put(bio);
1573 return -EIO;
1574 }
1575
1576 if (btrfsic_submit_bio_wait(bio)) {
1577 btrfs_dev_stat_inc_and_print(spage_bad->dev,
1578 BTRFS_DEV_STAT_WRITE_ERRS);
1579 atomic64_inc(&fs_info->dev_replace.num_write_errors);
1580 bio_put(bio);
1581 return -EIO;
1582 }
1583 bio_put(bio);
1584 }
1585
1586 return 0;
1587 }
1588
1589 static void scrub_write_block_to_dev_replace(struct scrub_block *sblock)
1590 {
1591 struct btrfs_fs_info *fs_info = sblock->sctx->fs_info;
1592 int page_num;
1593
1594 /*
1595 * This block is used for the check of the parity on the source device,
1596 * so the data needn't be written into the destination device.
1597 */
1598 if (sblock->sparity)
1599 return;
1600
1601 for (page_num = 0; page_num < sblock->page_count; page_num++) {
1602 int ret;
1603
1604 ret = scrub_write_page_to_dev_replace(sblock, page_num);
1605 if (ret)
1606 atomic64_inc(&fs_info->dev_replace.num_write_errors);
1607 }
1608 }
1609
1610 static int scrub_write_page_to_dev_replace(struct scrub_block *sblock,
1611 int page_num)
1612 {
1613 struct scrub_page *spage = sblock->pagev[page_num];
1614
1615 BUG_ON(spage->page == NULL);
1616 if (spage->io_error)
1617 clear_page(page_address(spage->page));
1618
1619 return scrub_add_page_to_wr_bio(sblock->sctx, spage);
1620 }
1621
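/*
 * Queue @spage for writing to the dev-replace target.  Pages are packed into
 * the current write bio as long as they are physically and logically
 * contiguous with it; a non-contiguous page causes the current bio to be
 * submitted first, and a full bio (pages_per_wr_bio pages) is submitted
 * right after the page is added.
 */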
1622 static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
1623 struct scrub_page *spage)
1624 {
1625 struct scrub_bio *sbio;
1626 int ret;
1627
1628 mutex_lock(&sctx->wr_lock);
1629 again:
1630 if (!sctx->wr_curr_bio) {
1631 sctx->wr_curr_bio = kzalloc(sizeof(*sctx->wr_curr_bio),
1632 GFP_KERNEL);
1633 if (!sctx->wr_curr_bio) {
1634 mutex_unlock(&sctx->wr_lock);
1635 return -ENOMEM;
1636 }
1637 sctx->wr_curr_bio->sctx = sctx;
1638 sctx->wr_curr_bio->page_count = 0;
1639 }
1640 sbio = sctx->wr_curr_bio;
1641 if (sbio->page_count == 0) {
1642 struct bio *bio;
1643
1644 sbio->physical = spage->physical_for_dev_replace;
1645 sbio->logical = spage->logical;
1646 sbio->dev = sctx->wr_tgtdev;
1647 bio = sbio->bio;
1648 if (!bio) {
1649 bio = btrfs_io_bio_alloc(sctx->pages_per_wr_bio);
1650 sbio->bio = bio;
1651 }
1652
1653 bio->bi_private = sbio;
1654 bio->bi_end_io = scrub_wr_bio_end_io;
1655 bio_set_dev(bio, sbio->dev->bdev);
1656 bio->bi_iter.bi_sector = sbio->physical >> 9;
1657 bio->bi_opf = REQ_OP_WRITE;
1658 sbio->status = 0;
1659 } else if (sbio->physical + sbio->page_count * PAGE_SIZE !=
1660 spage->physical_for_dev_replace ||
1661 sbio->logical + sbio->page_count * PAGE_SIZE !=
1662 spage->logical) {
1663 scrub_wr_submit(sctx);
1664 goto again;
1665 }
1666
1667 ret = bio_add_page(sbio->bio, spage->page, PAGE_SIZE, 0);
1668 if (ret != PAGE_SIZE) {
1669 if (sbio->page_count < 1) {
1670 bio_put(sbio->bio);
1671 sbio->bio = NULL;
1672 mutex_unlock(&sctx->wr_lock);
1673 return -EIO;
1674 }
1675 scrub_wr_submit(sctx);
1676 goto again;
1677 }
1678
1679 sbio->pagev[sbio->page_count] = spage;
1680 scrub_page_get(spage);
1681 sbio->page_count++;
1682 if (sbio->page_count == sctx->pages_per_wr_bio)
1683 scrub_wr_submit(sctx);
1684 mutex_unlock(&sctx->wr_lock);
1685
1686 return 0;
1687 }
1688
1689 static void scrub_wr_submit(struct scrub_ctx *sctx)
1690 {
1691 struct scrub_bio *sbio;
1692
1693 if (!sctx->wr_curr_bio)
1694 return;
1695
1696 sbio = sctx->wr_curr_bio;
1697 sctx->wr_curr_bio = NULL;
1698 WARN_ON(!sbio->bio->bi_disk);
1699 scrub_pending_bio_inc(sctx);
1700 /* process all writes in a single worker thread. Then the block layer
1701 * orders the requests before sending them to the driver which
1702 * doubled the write performance on spinning disks when measured
1703 * with Linux 3.5 */
1704 btrfsic_submit_bio(sbio->bio);
1705 }
1706
1707 static void scrub_wr_bio_end_io(struct bio *bio)
1708 {
1709 struct scrub_bio *sbio = bio->bi_private;
1710 struct btrfs_fs_info *fs_info = sbio->dev->fs_info;
1711
1712 sbio->status = bio->bi_status;
1713 sbio->bio = bio;
1714
1715 btrfs_init_work(&sbio->work, scrub_wr_bio_end_io_worker, NULL, NULL);
1716 btrfs_queue_work(fs_info->scrub_wr_completion_workers, &sbio->work);
1717 }
1718
1719 static void scrub_wr_bio_end_io_worker(struct btrfs_work *work)
1720 {
1721 struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
1722 struct scrub_ctx *sctx = sbio->sctx;
1723 int i;
1724
1725 WARN_ON(sbio->page_count > SCRUB_PAGES_PER_WR_BIO);
1726 if (sbio->status) {
1727 struct btrfs_dev_replace *dev_replace =
1728 &sbio->sctx->fs_info->dev_replace;
1729
1730 for (i = 0; i < sbio->page_count; i++) {
1731 struct scrub_page *spage = sbio->pagev[i];
1732
1733 spage->io_error = 1;
1734 atomic64_inc(&dev_replace->num_write_errors);
1735 }
1736 }
1737
1738 for (i = 0; i < sbio->page_count; i++)
1739 scrub_page_put(sbio->pagev[i]);
1740
1741 bio_put(sbio->bio);
1742 kfree(sbio);
1743 scrub_pending_bio_dec(sctx);
1744 }
1745
1746 static int scrub_checksum(struct scrub_block *sblock)
1747 {
1748 u64 flags;
1749 int ret;
1750
1751 /*
1752 * No need to initialize these stats currently,
1753 * because this function only uses the return value
1754 * instead of these stats values.
1755 *
1756 * Todo:
1757 * always use stats
1758 */
1759 sblock->header_error = 0;
1760 sblock->generation_error = 0;
1761 sblock->checksum_error = 0;
1762
1763 WARN_ON(sblock->page_count < 1);
1764 flags = sblock->pagev[0]->flags;
1765 ret = 0;
1766 if (flags & BTRFS_EXTENT_FLAG_DATA)
1767 ret = scrub_checksum_data(sblock);
1768 else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
1769 ret = scrub_checksum_tree_block(sblock);
1770 else if (flags & BTRFS_EXTENT_FLAG_SUPER)
1771 (void)scrub_checksum_super(sblock);
1772 else
1773 WARN_ON(1);
1774 if (ret)
1775 scrub_handle_errored_block(sblock);
1776
1777 return ret;
1778 }
1779
1780 static int scrub_checksum_data(struct scrub_block *sblock)
1781 {
1782 struct scrub_ctx *sctx = sblock->sctx;
1783 struct btrfs_fs_info *fs_info = sctx->fs_info;
1784 SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
1785 u8 csum[BTRFS_CSUM_SIZE];
1786 struct scrub_page *spage;
1787 char *kaddr;
1788
1789 BUG_ON(sblock->page_count < 1);
1790 spage = sblock->pagev[0];
1791 if (!spage->have_csum)
1792 return 0;
1793
1794 kaddr = page_address(spage->page);
1795
1796 shash->tfm = fs_info->csum_shash;
1797 crypto_shash_init(shash);
1798
1799 /*
1800 * In scrub_pages() and scrub_pages_for_parity() we ensure each spage
1801 * only contains one sector of data.
1802 */
1803 crypto_shash_digest(shash, kaddr, fs_info->sectorsize, csum);
1804
1805 if (memcmp(csum, spage->csum, fs_info->csum_size))
1806 sblock->checksum_error = 1;
1807 return sblock->checksum_error;
1808 }
1809
1810 static int scrub_checksum_tree_block(struct scrub_block *sblock)
1811 {
1812 struct scrub_ctx *sctx = sblock->sctx;
1813 struct btrfs_header *h;
1814 struct btrfs_fs_info *fs_info = sctx->fs_info;
1815 SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
1816 u8 calculated_csum[BTRFS_CSUM_SIZE];
1817 u8 on_disk_csum[BTRFS_CSUM_SIZE];
1818 /*
1819 * This is done in sectorsize steps even for metadata as there's a
1820 * constraint for nodesize to be aligned to sectorsize. This will need
1821 * to change so we don't misuse data and metadata units like that.
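*
* E.g., hypothetically with a 16KiB nodesize and a 4KiB sectorsize,
* num_sectors below is 4: the checksum covers the first sector minus the
* on-disk csum bytes, followed by the three remaining full sectors.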
1822 */
1823 const u32 sectorsize = sctx->fs_info->sectorsize;
1824 const int num_sectors = fs_info->nodesize >> fs_info->sectorsize_bits;
1825 int i;
1826 struct scrub_page *spage;
1827 char *kaddr;
1828
1829 BUG_ON(sblock->page_count < 1);
1830
1831 /* Each member in pagev is just one block, not a full page */
1832 ASSERT(sblock->page_count == num_sectors);
1833
1834 spage = sblock->pagev[0];
1835 kaddr = page_address(spage->page);
1836 h = (struct btrfs_header *)kaddr;
1837 memcpy(on_disk_csum, h->csum, sctx->fs_info->csum_size);
1838
1839 /*
1840 * we don't use the getter functions here, as we
1841 * a) don't have an extent buffer and
1842 * b) the page is already kmapped
1843 */
1844 if (spage->logical != btrfs_stack_header_bytenr(h))
1845 sblock->header_error = 1;
1846
1847 if (spage->generation != btrfs_stack_header_generation(h)) {
1848 sblock->header_error = 1;
1849 sblock->generation_error = 1;
1850 }
1851
1852 if (!scrub_check_fsid(h->fsid, spage))
1853 sblock->header_error = 1;
1854
1855 if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
1856 BTRFS_UUID_SIZE))
1857 sblock->header_error = 1;
1858
1859 shash->tfm = fs_info->csum_shash;
1860 crypto_shash_init(shash);
1861 crypto_shash_update(shash, kaddr + BTRFS_CSUM_SIZE,
1862 sectorsize - BTRFS_CSUM_SIZE);
1863
1864 for (i = 1; i < num_sectors; i++) {
1865 kaddr = page_address(sblock->pagev[i]->page);
1866 crypto_shash_update(shash, kaddr, sectorsize);
1867 }
1868
1869 crypto_shash_final(shash, calculated_csum);
1870 if (memcmp(calculated_csum, on_disk_csum, sctx->fs_info->csum_size))
1871 sblock->checksum_error = 1;
1872
1873 return sblock->header_error || sblock->checksum_error;
1874 }
1875
1876 static int scrub_checksum_super(struct scrub_block *sblock)
1877 {
1878 struct btrfs_super_block *s;
1879 struct scrub_ctx *sctx = sblock->sctx;
1880 struct btrfs_fs_info *fs_info = sctx->fs_info;
1881 SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
1882 u8 calculated_csum[BTRFS_CSUM_SIZE];
1883 struct scrub_page *spage;
1884 char *kaddr;
1885 int fail_gen = 0;
1886 int fail_cor = 0;
1887
1888 BUG_ON(sblock->page_count < 1);
1889 spage = sblock->pagev[0];
1890 kaddr = page_address(spage->page);
1891 s = (struct btrfs_super_block *)kaddr;
1892
1893 if (spage->logical != btrfs_super_bytenr(s))
1894 ++fail_cor;
1895
1896 if (spage->generation != btrfs_super_generation(s))
1897 ++fail_gen;
1898
1899 if (!scrub_check_fsid(s->fsid, spage))
1900 ++fail_cor;
1901
1902 shash->tfm = fs_info->csum_shash;
1903 crypto_shash_init(shash);
1904 crypto_shash_digest(shash, kaddr + BTRFS_CSUM_SIZE,
1905 BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE, calculated_csum);
1906
1907 if (memcmp(calculated_csum, s->csum, sctx->fs_info->csum_size))
1908 ++fail_cor;
1909
1910 if (fail_cor + fail_gen) {
1911 /*
1912 * If we find an error in a super block, we just report it.
1913 * The super blocks will get written with the next transaction
1914 * commit anyway.
1915 */
1916 spin_lock(&sctx->stat_lock);
1917 ++sctx->stat.super_errors;
1918 spin_unlock(&sctx->stat_lock);
1919 if (fail_cor)
1920 btrfs_dev_stat_inc_and_print(spage->dev,
1921 BTRFS_DEV_STAT_CORRUPTION_ERRS);
1922 else
1923 btrfs_dev_stat_inc_and_print(spage->dev,
1924 BTRFS_DEV_STAT_GENERATION_ERRS);
1925 }
1926
1927 return fail_cor + fail_gen;
1928 }
1929
1930 static void scrub_block_get(struct scrub_block *sblock)
1931 {
1932 refcount_inc(&sblock->refs);
1933 }
1934
1935 static void scrub_block_put(struct scrub_block *sblock)
1936 {
1937 if (refcount_dec_and_test(&sblock->refs)) {
1938 int i;
1939
1940 if (sblock->sparity)
1941 scrub_parity_put(sblock->sparity);
1942
1943 for (i = 0; i < sblock->page_count; i++)
1944 scrub_page_put(sblock->pagev[i]);
1945 kfree(sblock);
1946 }
1947 }
1948
1949 static void scrub_page_get(struct scrub_page *spage)
1950 {
1951 atomic_inc(&spage->refs);
1952 }
1953
1954 static void scrub_page_put(struct scrub_page *spage)
1955 {
1956 if (atomic_dec_and_test(&spage->refs)) {
1957 if (spage->page)
1958 __free_page(spage->page);
1959 kfree(spage);
1960 }
1961 }
1962
1963 static void scrub_submit(struct scrub_ctx *sctx)
1964 {
1965 struct scrub_bio *sbio;
1966
1967 if (sctx->curr == -1)
1968 return;
1969
1970 sbio = sctx->bios[sctx->curr];
1971 sctx->curr = -1;
1972 scrub_pending_bio_inc(sctx);
1973 btrfsic_submit_bio(sbio->bio);
1974 }
1975
1976 static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx,
1977 struct scrub_page *spage)
1978 {
1979 struct scrub_block *sblock = spage->sblock;
1980 struct scrub_bio *sbio;
1981 int ret;
1982
1983 again:
1984 /*
1985 * grab a fresh bio or wait for one to become available
1986 */
1987 while (sctx->curr == -1) {
1988 spin_lock(&sctx->list_lock);
1989 sctx->curr = sctx->first_free;
1990 if (sctx->curr != -1) {
1991 sctx->first_free = sctx->bios[sctx->curr]->next_free;
1992 sctx->bios[sctx->curr]->next_free = -1;
1993 sctx->bios[sctx->curr]->page_count = 0;
1994 spin_unlock(&sctx->list_lock);
1995 } else {
1996 spin_unlock(&sctx->list_lock);
1997 wait_event(sctx->list_wait, sctx->first_free != -1);
1998 }
1999 }
2000 sbio = sctx->bios[sctx->curr];
2001 if (sbio->page_count == 0) {
2002 struct bio *bio;
2003
2004 sbio->physical = spage->physical;
2005 sbio->logical = spage->logical;
2006 sbio->dev = spage->dev;
2007 bio = sbio->bio;
2008 if (!bio) {
2009 bio = btrfs_io_bio_alloc(sctx->pages_per_rd_bio);
2010 sbio->bio = bio;
2011 }
2012
2013 bio->bi_private = sbio;
2014 bio->bi_end_io = scrub_bio_end_io;
2015 bio_set_dev(bio, sbio->dev->bdev);
2016 bio->bi_iter.bi_sector = sbio->physical >> 9;
2017 bio->bi_opf = REQ_OP_READ;
2018 sbio->status = 0;
2019 } else if (sbio->physical + sbio->page_count * PAGE_SIZE !=
2020 spage->physical ||
2021 sbio->logical + sbio->page_count * PAGE_SIZE !=
2022 spage->logical ||
2023 sbio->dev != spage->dev) {
2024 scrub_submit(sctx);
2025 goto again;
2026 }
2027
2028 sbio->pagev[sbio->page_count] = spage;
2029 ret = bio_add_page(sbio->bio, spage->page, PAGE_SIZE, 0);
2030 if (ret != PAGE_SIZE) {
2031 if (sbio->page_count < 1) {
2032 bio_put(sbio->bio);
2033 sbio->bio = NULL;
2034 return -EIO;
2035 }
2036 scrub_submit(sctx);
2037 goto again;
2038 }
2039
2040 scrub_block_get(sblock); /* one for the page added to the bio */
2041 atomic_inc(&sblock->outstanding_pages);
2042 sbio->page_count++;
2043 if (sbio->page_count == sctx->pages_per_rd_bio)
2044 scrub_submit(sctx);
2045
2046 return 0;
2047 }
2048
2049 static void scrub_missing_raid56_end_io(struct bio *bio)
2050 {
2051 struct scrub_block *sblock = bio->bi_private;
2052 struct btrfs_fs_info *fs_info = sblock->sctx->fs_info;
2053
2054 if (bio->bi_status)
2055 sblock->no_io_error_seen = 0;
2056
2057 bio_put(bio);
2058
2059 btrfs_queue_work(fs_info->scrub_workers, &sblock->work);
2060 }
2061
2062 static void scrub_missing_raid56_worker(struct btrfs_work *work)
2063 {
2064 struct scrub_block *sblock = container_of(work, struct scrub_block, work);
2065 struct scrub_ctx *sctx = sblock->sctx;
2066 struct btrfs_fs_info *fs_info = sctx->fs_info;
2067 u64 logical;
2068 struct btrfs_device *dev;
2069
2070 logical = sblock->pagev[0]->logical;
2071 dev = sblock->pagev[0]->dev;
2072
2073 if (sblock->no_io_error_seen)
2074 scrub_recheck_block_checksum(sblock);
2075
2076 if (!sblock->no_io_error_seen) {
2077 spin_lock(&sctx->stat_lock);
2078 sctx->stat.read_errors++;
2079 spin_unlock(&sctx->stat_lock);
2080 btrfs_err_rl_in_rcu(fs_info,
2081 "IO error rebuilding logical %llu for dev %s",
2082 logical, rcu_str_deref(dev->name));
2083 } else if (sblock->header_error || sblock->checksum_error) {
2084 spin_lock(&sctx->stat_lock);
2085 sctx->stat.uncorrectable_errors++;
2086 spin_unlock(&sctx->stat_lock);
2087 btrfs_err_rl_in_rcu(fs_info,
2088 "failed to rebuild valid logical %llu for dev %s",
2089 logical, rcu_str_deref(dev->name));
2090 } else {
2091 scrub_write_block_to_dev_replace(sblock);
2092 }
2093
2094 if (sctx->is_dev_replace && sctx->flush_all_writes) {
2095 mutex_lock(&sctx->wr_lock);
2096 scrub_wr_submit(sctx);
2097 mutex_unlock(&sctx->wr_lock);
2098 }
2099
2100 scrub_block_put(sblock);
2101 scrub_pending_bio_dec(sctx);
2102 }
2103
2104 static void scrub_missing_raid56_pages(struct scrub_block *sblock)
2105 {
2106 struct scrub_ctx *sctx = sblock->sctx;
2107 struct btrfs_fs_info *fs_info = sctx->fs_info;
2108 u64 length = sblock->page_count * PAGE_SIZE;
2109 u64 logical = sblock->pagev[0]->logical;
2110 struct btrfs_bio *bbio = NULL;
2111 struct bio *bio;
2112 struct btrfs_raid_bio *rbio;
2113 int ret;
2114 int i;
2115
2116 btrfs_bio_counter_inc_blocked(fs_info);
2117 ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical,
2118 &length, &bbio);
2119 if (ret || !bbio || !bbio->raid_map)
2120 goto bbio_out;
2121
2122 if (WARN_ON(!sctx->is_dev_replace ||
2123 !(bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK))) {
2124 /*
2125 * We shouldn't be scrubbing a missing device. Even for dev
2126 * replace, we should only get here for RAID 5/6. We either
2127 * managed to mount something with no mirrors remaining or
2128 * there's a bug in scrub_remap_extent()/btrfs_map_block().
2129 */
2130 goto bbio_out;
2131 }
2132
2133 bio = btrfs_io_bio_alloc(0);
2134 bio->bi_iter.bi_sector = logical >> 9;
2135 bio->bi_private = sblock;
2136 bio->bi_end_io = scrub_missing_raid56_end_io;
2137
2138 rbio = raid56_alloc_missing_rbio(fs_info, bio, bbio, length);
2139 if (!rbio)
2140 goto rbio_out;
2141
2142 for (i = 0; i < sblock->page_count; i++) {
2143 struct scrub_page *spage = sblock->pagev[i];
2144
2145 raid56_add_scrub_pages(rbio, spage->page, spage->logical);
2146 }
2147
2148 btrfs_init_work(&sblock->work, scrub_missing_raid56_worker, NULL, NULL);
2149 scrub_block_get(sblock);
2150 scrub_pending_bio_inc(sctx);
2151 raid56_submit_missing_rbio(rbio);
2152 return;
2153
2154 rbio_out:
2155 bio_put(bio);
2156 bbio_out:
2157 btrfs_bio_counter_dec(fs_info);
2158 btrfs_put_bbio(bbio);
2159 spin_lock(&sctx->stat_lock);
2160 sctx->stat.malloc_errors++;
2161 spin_unlock(&sctx->stat_lock);
2162 }
2163
2164 static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u32 len,
2165 u64 physical, struct btrfs_device *dev, u64 flags,
2166 u64 gen, int mirror_num, u8 *csum,
2167 u64 physical_for_dev_replace)
2168 {
2169 struct scrub_block *sblock;
2170 const u32 sectorsize = sctx->fs_info->sectorsize;
2171 int index;
2172
2173 sblock = kzalloc(sizeof(*sblock), GFP_KERNEL);
2174 if (!sblock) {
2175 spin_lock(&sctx->stat_lock);
2176 sctx->stat.malloc_errors++;
2177 spin_unlock(&sctx->stat_lock);
2178 return -ENOMEM;
2179 }
2180
2181 /* one ref inside this function, plus one for each page added to
2182 * a bio later on */
2183 refcount_set(&sblock->refs, 1);
2184 sblock->sctx = sctx;
2185 sblock->no_io_error_seen = 1;
2186
2187 for (index = 0; len > 0; index++) {
2188 struct scrub_page *spage;
2189 /*
2190 * Here we will allocate one page for one sector to scrub.
2191 * This is fine if PAGE_SIZE == sectorsize, but will cost
2192 * more memory for PAGE_SIZE > sectorsize case.
2193 */
2194 u32 l = min(sectorsize, len);
2195
2196 spage = kzalloc(sizeof(*spage), GFP_KERNEL);
2197 if (!spage) {
2198 leave_nomem:
2199 spin_lock(&sctx->stat_lock);
2200 sctx->stat.malloc_errors++;
2201 spin_unlock(&sctx->stat_lock);
2202 scrub_block_put(sblock);
2203 return -ENOMEM;
2204 }
2205 BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK);
2206 scrub_page_get(spage);
2207 sblock->pagev[index] = spage;
2208 spage->sblock = sblock;
2209 spage->dev = dev;
2210 spage->flags = flags;
2211 spage->generation = gen;
2212 spage->logical = logical;
2213 spage->physical = physical;
2214 spage->physical_for_dev_replace = physical_for_dev_replace;
2215 spage->mirror_num = mirror_num;
2216 if (csum) {
2217 spage->have_csum = 1;
2218 memcpy(spage->csum, csum, sctx->fs_info->csum_size);
2219 } else {
2220 spage->have_csum = 0;
2221 }
2222 sblock->page_count++;
2223 spage->page = alloc_page(GFP_KERNEL);
2224 if (!spage->page)
2225 goto leave_nomem;
2226 len -= l;
2227 logical += l;
2228 physical += l;
2229 physical_for_dev_replace += l;
2230 }
2231
2232 WARN_ON(sblock->page_count == 0);
2233 if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state)) {
2234 /*
2235 * This case should only be hit for RAID 5/6 device replace. See
2236 * the comment in scrub_missing_raid56_pages() for details.
2237 */
2238 scrub_missing_raid56_pages(sblock);
2239 } else {
2240 for (index = 0; index < sblock->page_count; index++) {
2241 struct scrub_page *spage = sblock->pagev[index];
2242 int ret;
2243
2244 ret = scrub_add_page_to_rd_bio(sctx, spage);
2245 if (ret) {
2246 scrub_block_put(sblock);
2247 return ret;
2248 }
2249 }
2250
2251 if (flags & BTRFS_EXTENT_FLAG_SUPER)
2252 scrub_submit(sctx);
2253 }
2254
2255 /* last one frees, either here or in bio completion for last page */
2256 scrub_block_put(sblock);
2257 return 0;
2258 }
2259
2260 static void scrub_bio_end_io(struct bio *bio)
2261 {
2262 struct scrub_bio *sbio = bio->bi_private;
2263 struct btrfs_fs_info *fs_info = sbio->dev->fs_info;
2264
2265 sbio->status = bio->bi_status;
2266 sbio->bio = bio;
2267
2268 btrfs_queue_work(fs_info->scrub_workers, &sbio->work);
2269 }
2270
2271 static void scrub_bio_end_io_worker(struct btrfs_work *work)
2272 {
2273 struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
2274 struct scrub_ctx *sctx = sbio->sctx;
2275 int i;
2276
2277 BUG_ON(sbio->page_count > SCRUB_PAGES_PER_RD_BIO);
2278 if (sbio->status) {
2279 for (i = 0; i < sbio->page_count; i++) {
2280 struct scrub_page *spage = sbio->pagev[i];
2281
2282 spage->io_error = 1;
2283 spage->sblock->no_io_error_seen = 0;
2284 }
2285 }
2286
2287 /* now complete the scrub_block items that have all pages completed */
2288 for (i = 0; i < sbio->page_count; i++) {
2289 struct scrub_page *spage = sbio->pagev[i];
2290 struct scrub_block *sblock = spage->sblock;
2291
2292 if (atomic_dec_and_test(&sblock->outstanding_pages))
2293 scrub_block_complete(sblock);
2294 scrub_block_put(sblock);
2295 }
2296
2297 bio_put(sbio->bio);
2298 sbio->bio = NULL;
2299 spin_lock(&sctx->list_lock);
2300 sbio->next_free = sctx->first_free;
2301 sctx->first_free = sbio->index;
2302 spin_unlock(&sctx->list_lock);
2303
2304 if (sctx->is_dev_replace && sctx->flush_all_writes) {
2305 mutex_lock(&sctx->wr_lock);
2306 scrub_wr_submit(sctx);
2307 mutex_unlock(&sctx->wr_lock);
2308 }
2309
2310 scrub_pending_bio_dec(sctx);
2311 }
2312
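/*
 * Mark the sectors covered by [start, start + len) in @bitmap.  @start is
 * a logical address inside this parity stripe set; it is reduced to a
 * sector offset within one stripe_len, and a range running past the end
 * of the stripe wraps around to the front, hence the two bitmap_set()
 * calls at the end.
 */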
2313 static inline void __scrub_mark_bitmap(struct scrub_parity *sparity,
2314 unsigned long *bitmap,
2315 u64 start, u32 len)
2316 {
2317 u64 offset;
2318 u32 nsectors;
2319 u32 sectorsize_bits = sparity->sctx->fs_info->sectorsize_bits;
2320
2321 if (len >= sparity->stripe_len) {
2322 bitmap_set(bitmap, 0, sparity->nsectors);
2323 return;
2324 }
2325
2326 start -= sparity->logic_start;
2327 start = div64_u64_rem(start, sparity->stripe_len, &offset);
2328 offset = offset >> sectorsize_bits;
2329 nsectors = len >> sectorsize_bits;
2330
2331 if (offset + nsectors <= sparity->nsectors) {
2332 bitmap_set(bitmap, offset, nsectors);
2333 return;
2334 }
2335
2336 bitmap_set(bitmap, offset, sparity->nsectors - offset);
2337 bitmap_set(bitmap, 0, nsectors - (sparity->nsectors - offset));
2338 }
2339
2340 static inline void scrub_parity_mark_sectors_error(struct scrub_parity *sparity,
2341 u64 start, u32 len)
2342 {
2343 __scrub_mark_bitmap(sparity, sparity->ebitmap, start, len);
2344 }
2345
2346 static inline void scrub_parity_mark_sectors_data(struct scrub_parity *sparity,
2347 u64 start, u32 len)
2348 {
2349 __scrub_mark_bitmap(sparity, sparity->dbitmap, start, len);
2350 }
2351
2352 static void scrub_block_complete(struct scrub_block *sblock)
2353 {
2354 int corrupted = 0;
2355
2356 if (!sblock->no_io_error_seen) {
2357 corrupted = 1;
2358 scrub_handle_errored_block(sblock);
2359 } else {
2360 /*
2361 * In the dev-replace case, a block with a checksum error is
2362 * written via the repair mechanism; otherwise it is written to
2363 * the replace target right here.
2364 */
2365 corrupted = scrub_checksum(sblock);
2366 if (!corrupted && sblock->sctx->is_dev_replace)
2367 scrub_write_block_to_dev_replace(sblock);
2368 }
2369
2370 if (sblock->sparity && corrupted && !sblock->data_corrected) {
2371 u64 start = sblock->pagev[0]->logical;
2372 u64 end = sblock->pagev[sblock->page_count - 1]->logical +
2373 PAGE_SIZE;
2374
2375 ASSERT(end - start <= U32_MAX);
2376 scrub_parity_mark_sectors_error(sblock->sparity,
2377 start, end - start);
2378 }
2379 }
2380
2381 static void drop_csum_range(struct scrub_ctx *sctx, struct btrfs_ordered_sum *sum)
2382 {
2383 sctx->stat.csum_discards += sum->len >> sctx->fs_info->sectorsize_bits;
2384 list_del(&sum->list);
2385 kfree(sum);
2386 }
2387
2388 /*
2389 * Find the desired csum for range [logical, logical + sectorsize), and store
2390 * the csum into @csum.
2391 *
2392 * The search source is sctx->csum_list, which is a pre-populated list
2393 * storing bytenr ordered csum ranges. We're responsible for cleaning
2394 * up any range that is before @logical.
2395 *
2396 * Return 0 if there is no csum for the range.
2397 * Return 1 if there is csum for the range and copied to @csum.
2398 */
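/*
 * Illustrative example (assumed values, 4K sectorsize): if csum_list holds
 * one ordered_sum covering [16M, 16M + 64K) and we are asked for logical
 * 16M + 8K, the csum of that single sector is copied to @csum and 1 is
 * returned.  A later call for logical 17M first drops the now-useless
 * [16M, 16M + 64K) entry and then returns 0, since nothing covers it.
 */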
2399 static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u8 *csum)
2400 {
2401 bool found = false;
2402
2403 while (!list_empty(&sctx->csum_list)) {
2404 struct btrfs_ordered_sum *sum = NULL;
2405 unsigned long index;
2406 unsigned long num_sectors;
2407
2408 sum = list_first_entry(&sctx->csum_list,
2409 struct btrfs_ordered_sum, list);
2410 /* The current csum range is beyond our range, no csum found */
2411 if (sum->bytenr > logical)
2412 break;
2413
2414 /*
2415 * The current sum is before our bytenr; since scrub is always
2416 * done in bytenr order, the csum will never be used again.
2417 * Clean it up so that later calls won't bother with the range,
2418 * and continue searching the next range.
2419 */
2420 if (sum->bytenr + sum->len <= logical) {
2421 drop_csum_range(sctx, sum);
2422 continue;
2423 }
2424
2425 /* Now the csum range covers our bytenr, copy the csum */
2426 found = true;
2427 index = (logical - sum->bytenr) >> sctx->fs_info->sectorsize_bits;
2428 num_sectors = sum->len >> sctx->fs_info->sectorsize_bits;
2429
2430 memcpy(csum, sum->sums + index * sctx->fs_info->csum_size,
2431 sctx->fs_info->csum_size);
2432
2433 /* Cleanup the range if we're at the end of the csum range */
2434 if (index == num_sectors - 1)
2435 drop_csum_range(sctx, sum);
2436 break;
2437 }
2438 if (!found)
2439 return 0;
2440 return 1;
2441 }
2442
2443 /* scrub extent tries to collect up to 64 kB for each bio */
2444 static int scrub_extent(struct scrub_ctx *sctx, struct map_lookup *map,
2445 u64 logical, u32 len,
2446 u64 physical, struct btrfs_device *dev, u64 flags,
2447 u64 gen, int mirror_num, u64 physical_for_dev_replace)
2448 {
2449 int ret;
2450 u8 csum[BTRFS_CSUM_SIZE];
2451 u32 blocksize;
2452
2453 if (flags & BTRFS_EXTENT_FLAG_DATA) {
2454 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
2455 blocksize = map->stripe_len;
2456 else
2457 blocksize = sctx->fs_info->sectorsize;
2458 spin_lock(&sctx->stat_lock);
2459 sctx->stat.data_extents_scrubbed++;
2460 sctx->stat.data_bytes_scrubbed += len;
2461 spin_unlock(&sctx->stat_lock);
2462 } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
2463 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
2464 blocksize = map->stripe_len;
2465 else
2466 blocksize = sctx->fs_info->nodesize;
2467 spin_lock(&sctx->stat_lock);
2468 sctx->stat.tree_extents_scrubbed++;
2469 sctx->stat.tree_bytes_scrubbed += len;
2470 spin_unlock(&sctx->stat_lock);
2471 } else {
2472 blocksize = sctx->fs_info->sectorsize;
2473 WARN_ON(1);
2474 }
2475
2476 while (len) {
2477 u32 l = min(len, blocksize);
2478 int have_csum = 0;
2479
2480 if (flags & BTRFS_EXTENT_FLAG_DATA) {
2481 /* push csums to sbio */
2482 have_csum = scrub_find_csum(sctx, logical, csum);
2483 if (have_csum == 0)
2484 ++sctx->stat.no_csum;
2485 }
2486 ret = scrub_pages(sctx, logical, l, physical, dev, flags, gen,
2487 mirror_num, have_csum ? csum : NULL,
2488 physical_for_dev_replace);
2489 if (ret)
2490 return ret;
2491 len -= l;
2492 logical += l;
2493 physical += l;
2494 physical_for_dev_replace += l;
2495 }
2496 return 0;
2497 }
2498
2499 static int scrub_pages_for_parity(struct scrub_parity *sparity,
2500 u64 logical, u32 len,
2501 u64 physical, struct btrfs_device *dev,
2502 u64 flags, u64 gen, int mirror_num, u8 *csum)
2503 {
2504 struct scrub_ctx *sctx = sparity->sctx;
2505 struct scrub_block *sblock;
2506 const u32 sectorsize = sctx->fs_info->sectorsize;
2507 int index;
2508
2509 ASSERT(IS_ALIGNED(len, sectorsize));
2510
2511 sblock = kzalloc(sizeof(*sblock), GFP_KERNEL);
2512 if (!sblock) {
2513 spin_lock(&sctx->stat_lock);
2514 sctx->stat.malloc_errors++;
2515 spin_unlock(&sctx->stat_lock);
2516 return -ENOMEM;
2517 }
2518
2519 /* one ref inside this function, plus one for each page added to
2520 * a bio later on */
2521 refcount_set(&sblock->refs, 1);
2522 sblock->sctx = sctx;
2523 sblock->no_io_error_seen = 1;
2524 sblock->sparity = sparity;
2525 scrub_parity_get(sparity);
2526
2527 for (index = 0; len > 0; index++) {
2528 struct scrub_page *spage;
2529
2530 spage = kzalloc(sizeof(*spage), GFP_KERNEL);
2531 if (!spage) {
2532 leave_nomem:
2533 spin_lock(&sctx->stat_lock);
2534 sctx->stat.malloc_errors++;
2535 spin_unlock(&sctx->stat_lock);
2536 scrub_block_put(sblock);
2537 return -ENOMEM;
2538 }
2539 BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK);
2540 /* For scrub block */
2541 scrub_page_get(spage);
2542 sblock->pagev[index] = spage;
2543 /* For scrub parity */
2544 scrub_page_get(spage);
2545 list_add_tail(&spage->list, &sparity->spages);
2546 spage->sblock = sblock;
2547 spage->dev = dev;
2548 spage->flags = flags;
2549 spage->generation = gen;
2550 spage->logical = logical;
2551 spage->physical = physical;
2552 spage->mirror_num = mirror_num;
2553 if (csum) {
2554 spage->have_csum = 1;
2555 memcpy(spage->csum, csum, sctx->fs_info->csum_size);
2556 } else {
2557 spage->have_csum = 0;
2558 }
2559 sblock->page_count++;
2560 spage->page = alloc_page(GFP_KERNEL);
2561 if (!spage->page)
2562 goto leave_nomem;
2563
2564
2565 /* Iterate over the stripe range in sectorsize steps */
2566 len -= sectorsize;
2567 logical += sectorsize;
2568 physical += sectorsize;
2569 }
2570
2571 WARN_ON(sblock->page_count == 0);
2572 for (index = 0; index < sblock->page_count; index++) {
2573 struct scrub_page *spage = sblock->pagev[index];
2574 int ret;
2575
2576 ret = scrub_add_page_to_rd_bio(sctx, spage);
2577 if (ret) {
2578 scrub_block_put(sblock);
2579 return ret;
2580 }
2581 }
2582
2583 /* last one frees, either here or in bio completion for last page */
2584 scrub_block_put(sblock);
2585 return 0;
2586 }
2587
2588 static int scrub_extent_for_parity(struct scrub_parity *sparity,
2589 u64 logical, u32 len,
2590 u64 physical, struct btrfs_device *dev,
2591 u64 flags, u64 gen, int mirror_num)
2592 {
2593 struct scrub_ctx *sctx = sparity->sctx;
2594 int ret;
2595 u8 csum[BTRFS_CSUM_SIZE];
2596 u32 blocksize;
2597
2598 if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state)) {
2599 scrub_parity_mark_sectors_error(sparity, logical, len);
2600 return 0;
2601 }
2602
2603 if (flags & BTRFS_EXTENT_FLAG_DATA) {
2604 blocksize = sparity->stripe_len;
2605 } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
2606 blocksize = sparity->stripe_len;
2607 } else {
2608 blocksize = sctx->fs_info->sectorsize;
2609 WARN_ON(1);
2610 }
2611
2612 while (len) {
2613 u32 l = min(len, blocksize);
2614 int have_csum = 0;
2615
2616 if (flags & BTRFS_EXTENT_FLAG_DATA) {
2617 /* push csums to sbio */
2618 have_csum = scrub_find_csum(sctx, logical, csum);
2619 if (have_csum == 0)
2620 goto skip;
2621 }
2622 ret = scrub_pages_for_parity(sparity, logical, l, physical, dev,
2623 flags, gen, mirror_num,
2624 have_csum ? csum : NULL);
2625 if (ret)
2626 return ret;
2627 skip:
2628 len -= l;
2629 logical += l;
2630 physical += l;
2631 }
2632 return 0;
2633 }
2634
2635 /*
2636 * Given a physical address, this will calculate its
2637 * logical offset. If this is a parity stripe, it will return
2638 * the leftmost data stripe's logical offset.
2639 *
2640 * Return 0 if it is a data stripe, 1 if it is a parity stripe.
2641 */
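/*
 * Worked example with assumed values (3-device RAID5, 64K stripe_len,
 * scrubbing stripe index num == 0): physical offset 0 into this device's
 * stripes belongs to a data stripe of the full stripe at logical offset 0,
 * so the function returns 0 with *offset == 0.  Physical offset 64K falls
 * on the rotated parity of the full stripe whose leftmost data stripe sits
 * at logical offset 128K, so it returns 1 with *offset == 128K.  (Offsets
 * are relative to the chunk start; callers add the chunk base themselves.)
 */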
2642 static int get_raid56_logic_offset(u64 physical, int num,
2643 struct map_lookup *map, u64 *offset,
2644 u64 *stripe_start)
2645 {
2646 int i;
2647 int j = 0;
2648 u64 stripe_nr;
2649 u64 last_offset;
2650 u32 stripe_index;
2651 u32 rot;
2652 const int data_stripes = nr_data_stripes(map);
2653
2654 last_offset = (physical - map->stripes[num].physical) * data_stripes;
2655 if (stripe_start)
2656 *stripe_start = last_offset;
2657
2658 *offset = last_offset;
2659 for (i = 0; i < data_stripes; i++) {
2660 *offset = last_offset + i * map->stripe_len;
2661
2662 stripe_nr = div64_u64(*offset, map->stripe_len);
2663 stripe_nr = div_u64(stripe_nr, data_stripes);
2664
2665 /* Work out the disk rotation on this stripe-set */
2666 stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, &rot);
2667 /* calculate which stripe this data is located on */
2668 rot += i;
2669 stripe_index = rot % map->num_stripes;
2670 if (stripe_index == num)
2671 return 0;
2672 if (stripe_index < num)
2673 j++;
2674 }
2675 *offset = last_offset + j * map->stripe_len;
2676 return 1;
2677 }
2678
2679 static void scrub_free_parity(struct scrub_parity *sparity)
2680 {
2681 struct scrub_ctx *sctx = sparity->sctx;
2682 struct scrub_page *curr, *next;
2683 int nbits;
2684
2685 nbits = bitmap_weight(sparity->ebitmap, sparity->nsectors);
2686 if (nbits) {
2687 spin_lock(&sctx->stat_lock);
2688 sctx->stat.read_errors += nbits;
2689 sctx->stat.uncorrectable_errors += nbits;
2690 spin_unlock(&sctx->stat_lock);
2691 }
2692
2693 list_for_each_entry_safe(curr, next, &sparity->spages, list) {
2694 list_del_init(&curr->list);
2695 scrub_page_put(curr);
2696 }
2697
2698 kfree(sparity);
2699 }
2700
2701 static void scrub_parity_bio_endio_worker(struct btrfs_work *work)
2702 {
2703 struct scrub_parity *sparity = container_of(work, struct scrub_parity,
2704 work);
2705 struct scrub_ctx *sctx = sparity->sctx;
2706
2707 scrub_free_parity(sparity);
2708 scrub_pending_bio_dec(sctx);
2709 }
2710
2711 static void scrub_parity_bio_endio(struct bio *bio)
2712 {
2713 struct scrub_parity *sparity = (struct scrub_parity *)bio->bi_private;
2714 struct btrfs_fs_info *fs_info = sparity->sctx->fs_info;
2715
2716 if (bio->bi_status)
2717 bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap,
2718 sparity->nsectors);
2719
2720 bio_put(bio);
2721
2722 btrfs_init_work(&sparity->work, scrub_parity_bio_endio_worker, NULL,
2723 NULL);
2724 btrfs_queue_work(fs_info->scrub_parity_workers, &sparity->work);
2725 }
2726
2727 static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
2728 {
2729 struct scrub_ctx *sctx = sparity->sctx;
2730 struct btrfs_fs_info *fs_info = sctx->fs_info;
2731 struct bio *bio;
2732 struct btrfs_raid_bio *rbio;
2733 struct btrfs_bio *bbio = NULL;
2734 u64 length;
2735 int ret;
2736
2737 if (!bitmap_andnot(sparity->dbitmap, sparity->dbitmap, sparity->ebitmap,
2738 sparity->nsectors))
2739 goto out;
2740
2741 length = sparity->logic_end - sparity->logic_start;
2742
2743 btrfs_bio_counter_inc_blocked(fs_info);
2744 ret = btrfs_map_sblock(fs_info, BTRFS_MAP_WRITE, sparity->logic_start,
2745 &length, &bbio);
2746 if (ret || !bbio || !bbio->raid_map)
2747 goto bbio_out;
2748
2749 bio = btrfs_io_bio_alloc(0);
2750 bio->bi_iter.bi_sector = sparity->logic_start >> 9;
2751 bio->bi_private = sparity;
2752 bio->bi_end_io = scrub_parity_bio_endio;
2753
2754 rbio = raid56_parity_alloc_scrub_rbio(fs_info, bio, bbio,
2755 length, sparity->scrub_dev,
2756 sparity->dbitmap,
2757 sparity->nsectors);
2758 if (!rbio)
2759 goto rbio_out;
2760
2761 scrub_pending_bio_inc(sctx);
2762 raid56_parity_submit_scrub_rbio(rbio);
2763 return;
2764
2765 rbio_out:
2766 bio_put(bio);
2767 bbio_out:
2768 btrfs_bio_counter_dec(fs_info);
2769 btrfs_put_bbio(bbio);
2770 bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap,
2771 sparity->nsectors);
2772 spin_lock(&sctx->stat_lock);
2773 sctx->stat.malloc_errors++;
2774 spin_unlock(&sctx->stat_lock);
2775 out:
2776 scrub_free_parity(sparity);
2777 }
2778
2779 static inline int scrub_calc_parity_bitmap_len(int nsectors)
2780 {
2781 return DIV_ROUND_UP(nsectors, BITS_PER_LONG) * sizeof(long);
2782 }
2783
2784 static void scrub_parity_get(struct scrub_parity *sparity)
2785 {
2786 refcount_inc(&sparity->refs);
2787 }
2788
2789 static void scrub_parity_put(struct scrub_parity *sparity)
2790 {
2791 if (!refcount_dec_and_test(&sparity->refs))
2792 return;
2793
2794 scrub_parity_check_and_repair(sparity);
2795 }
2796
2797 static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx,
2798 struct map_lookup *map,
2799 struct btrfs_device *sdev,
2800 struct btrfs_path *path,
2801 u64 logic_start,
2802 u64 logic_end)
2803 {
2804 struct btrfs_fs_info *fs_info = sctx->fs_info;
2805 struct btrfs_root *root = fs_info->extent_root;
2806 struct btrfs_root *csum_root = fs_info->csum_root;
2807 struct btrfs_extent_item *extent;
2808 struct btrfs_bio *bbio = NULL;
2809 u64 flags;
2810 int ret;
2811 int slot;
2812 struct extent_buffer *l;
2813 struct btrfs_key key;
2814 u64 generation;
2815 u64 extent_logical;
2816 u64 extent_physical;
2817 /* Check the comment in scrub_stripe() for why u32 is enough here */
2818 u32 extent_len;
2819 u64 mapped_length;
2820 struct btrfs_device *extent_dev;
2821 struct scrub_parity *sparity;
2822 int nsectors;
2823 int bitmap_len;
2824 int extent_mirror_num;
2825 int stop_loop = 0;
2826
2827 ASSERT(map->stripe_len <= U32_MAX);
2828 nsectors = map->stripe_len >> fs_info->sectorsize_bits;
2829 bitmap_len = scrub_calc_parity_bitmap_len(nsectors);
2830 sparity = kzalloc(sizeof(struct scrub_parity) + 2 * bitmap_len,
2831 GFP_NOFS);
2832 if (!sparity) {
2833 spin_lock(&sctx->stat_lock);
2834 sctx->stat.malloc_errors++;
2835 spin_unlock(&sctx->stat_lock);
2836 return -ENOMEM;
2837 }
2838
2839 ASSERT(map->stripe_len <= U32_MAX);
2840 sparity->stripe_len = map->stripe_len;
2841 sparity->nsectors = nsectors;
2842 sparity->sctx = sctx;
2843 sparity->scrub_dev = sdev;
2844 sparity->logic_start = logic_start;
2845 sparity->logic_end = logic_end;
2846 refcount_set(&sparity->refs, 1);
2847 INIT_LIST_HEAD(&sparity->spages);
2848 sparity->dbitmap = sparity->bitmap;
2849 sparity->ebitmap = (void *)sparity->bitmap + bitmap_len;
2850
2851 ret = 0;
2852 while (logic_start < logic_end) {
2853 if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
2854 key.type = BTRFS_METADATA_ITEM_KEY;
2855 else
2856 key.type = BTRFS_EXTENT_ITEM_KEY;
2857 key.objectid = logic_start;
2858 key.offset = (u64)-1;
2859
2860 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2861 if (ret < 0)
2862 goto out;
2863
2864 if (ret > 0) {
2865 ret = btrfs_previous_extent_item(root, path, 0);
2866 if (ret < 0)
2867 goto out;
2868 if (ret > 0) {
2869 btrfs_release_path(path);
2870 ret = btrfs_search_slot(NULL, root, &key,
2871 path, 0, 0);
2872 if (ret < 0)
2873 goto out;
2874 }
2875 }
2876
2877 stop_loop = 0;
2878 while (1) {
2879 u64 bytes;
2880
2881 l = path->nodes[0];
2882 slot = path->slots[0];
2883 if (slot >= btrfs_header_nritems(l)) {
2884 ret = btrfs_next_leaf(root, path);
2885 if (ret == 0)
2886 continue;
2887 if (ret < 0)
2888 goto out;
2889
2890 stop_loop = 1;
2891 break;
2892 }
2893 btrfs_item_key_to_cpu(l, &key, slot);
2894
2895 if (key.type != BTRFS_EXTENT_ITEM_KEY &&
2896 key.type != BTRFS_METADATA_ITEM_KEY)
2897 goto next;
2898
2899 if (key.type == BTRFS_METADATA_ITEM_KEY)
2900 bytes = fs_info->nodesize;
2901 else
2902 bytes = key.offset;
2903
2904 if (key.objectid + bytes <= logic_start)
2905 goto next;
2906
2907 if (key.objectid >= logic_end) {
2908 stop_loop = 1;
2909 break;
2910 }
2911
2912 while (key.objectid >= logic_start + map->stripe_len)
2913 logic_start += map->stripe_len;
2914
2915 extent = btrfs_item_ptr(l, slot,
2916 struct btrfs_extent_item);
2917 flags = btrfs_extent_flags(l, extent);
2918 generation = btrfs_extent_generation(l, extent);
2919
2920 if ((flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) &&
2921 (key.objectid < logic_start ||
2922 key.objectid + bytes >
2923 logic_start + map->stripe_len)) {
2924 btrfs_err(fs_info,
2925 "scrub: tree block %llu spanning stripes, ignored. logical=%llu",
2926 key.objectid, logic_start);
2927 spin_lock(&sctx->stat_lock);
2928 sctx->stat.uncorrectable_errors++;
2929 spin_unlock(&sctx->stat_lock);
2930 goto next;
2931 }
2932 again:
2933 extent_logical = key.objectid;
2934 ASSERT(bytes <= U32_MAX);
2935 extent_len = bytes;
2936
2937 if (extent_logical < logic_start) {
2938 extent_len -= logic_start - extent_logical;
2939 extent_logical = logic_start;
2940 }
2941
2942 if (extent_logical + extent_len >
2943 logic_start + map->stripe_len)
2944 extent_len = logic_start + map->stripe_len -
2945 extent_logical;
2946
2947 scrub_parity_mark_sectors_data(sparity, extent_logical,
2948 extent_len);
2949
2950 mapped_length = extent_len;
2951 bbio = NULL;
2952 ret = btrfs_map_block(fs_info, BTRFS_MAP_READ,
2953 extent_logical, &mapped_length, &bbio,
2954 0);
2955 if (!ret) {
2956 if (!bbio || mapped_length < extent_len)
2957 ret = -EIO;
2958 }
2959 if (ret) {
2960 btrfs_put_bbio(bbio);
2961 goto out;
2962 }
2963 extent_physical = bbio->stripes[0].physical;
2964 extent_mirror_num = bbio->mirror_num;
2965 extent_dev = bbio->stripes[0].dev;
2966 btrfs_put_bbio(bbio);
2967
2968 ret = btrfs_lookup_csums_range(csum_root,
2969 extent_logical,
2970 extent_logical + extent_len - 1,
2971 &sctx->csum_list, 1);
2972 if (ret)
2973 goto out;
2974
2975 ret = scrub_extent_for_parity(sparity, extent_logical,
2976 extent_len,
2977 extent_physical,
2978 extent_dev, flags,
2979 generation,
2980 extent_mirror_num);
2981
2982 scrub_free_csums(sctx);
2983
2984 if (ret)
2985 goto out;
2986
2987 if (extent_logical + extent_len <
2988 key.objectid + bytes) {
2989 logic_start += map->stripe_len;
2990
2991 if (logic_start >= logic_end) {
2992 stop_loop = 1;
2993 break;
2994 }
2995
2996 if (logic_start < key.objectid + bytes) {
2997 cond_resched();
2998 goto again;
2999 }
3000 }
3001 next:
3002 path->slots[0]++;
3003 }
3004
3005 btrfs_release_path(path);
3006
3007 if (stop_loop)
3008 break;
3009
3010 logic_start += map->stripe_len;
3011 }
3012 out:
3013 if (ret < 0) {
3014 ASSERT(logic_end - logic_start <= U32_MAX);
3015 scrub_parity_mark_sectors_error(sparity, logic_start,
3016 logic_end - logic_start);
3017 }
3018 scrub_parity_put(sparity);
3019 scrub_submit(sctx);
3020 mutex_lock(&sctx->wr_lock);
3021 scrub_wr_submit(sctx);
3022 mutex_unlock(&sctx->wr_lock);
3023
3024 btrfs_release_path(path);
3025 return ret < 0 ? ret : 0;
3026 }
3027
3028 static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
3029 struct map_lookup *map,
3030 struct btrfs_device *scrub_dev,
3031 int num, u64 base, u64 length,
3032 struct btrfs_block_group *cache)
3033 {
3034 struct btrfs_path *path, *ppath;
3035 struct btrfs_fs_info *fs_info = sctx->fs_info;
3036 struct btrfs_root *root = fs_info->extent_root;
3037 struct btrfs_root *csum_root = fs_info->csum_root;
3038 struct btrfs_extent_item *extent;
3039 struct blk_plug plug;
3040 u64 flags;
3041 int ret;
3042 int slot;
3043 u64 nstripes;
3044 struct extent_buffer *l;
3045 u64 physical;
3046 u64 logical;
3047 u64 logic_end;
3048 u64 physical_end;
3049 u64 generation;
3050 int mirror_num;
3051 struct reada_control *reada1;
3052 struct reada_control *reada2;
3053 struct btrfs_key key;
3054 struct btrfs_key key_end;
3055 u64 increment = map->stripe_len;
3056 u64 offset;
3057 u64 extent_logical;
3058 u64 extent_physical;
3059 /*
3060 * Unlike chunk length, extent length should never go beyond
3061 * BTRFS_MAX_EXTENT_SIZE, thus u32 is enough here.
3062 */
3063 u32 extent_len;
3064 u64 stripe_logical;
3065 u64 stripe_end;
3066 struct btrfs_device *extent_dev;
3067 int extent_mirror_num;
3068 int stop_loop = 0;
3069
3070 physical = map->stripes[num].physical;
3071 offset = 0;
3072 nstripes = div64_u64(length, map->stripe_len);
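/*
 * Per RAID profile, work out: offset - the logical offset (within the
 * chunk) of the first stripe stored on this device, increment - the
 * logical distance between consecutive stripes held by this device, and
 * mirror_num - which copy of the data this device carries.
 */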
3073 if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
3074 offset = map->stripe_len * num;
3075 increment = map->stripe_len * map->num_stripes;
3076 mirror_num = 1;
3077 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
3078 int factor = map->num_stripes / map->sub_stripes;
3079 offset = map->stripe_len * (num / map->sub_stripes);
3080 increment = map->stripe_len * factor;
3081 mirror_num = num % map->sub_stripes + 1;
3082 } else if (map->type & BTRFS_BLOCK_GROUP_RAID1_MASK) {
3083 increment = map->stripe_len;
3084 mirror_num = num % map->num_stripes + 1;
3085 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
3086 increment = map->stripe_len;
3087 mirror_num = num % map->num_stripes + 1;
3088 } else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
3089 get_raid56_logic_offset(physical, num, map, &offset, NULL);
3090 increment = map->stripe_len * nr_data_stripes(map);
3091 mirror_num = 1;
3092 } else {
3093 increment = map->stripe_len;
3094 mirror_num = 1;
3095 }
3096
3097 path = btrfs_alloc_path();
3098 if (!path)
3099 return -ENOMEM;
3100
3101 ppath = btrfs_alloc_path();
3102 if (!ppath) {
3103 btrfs_free_path(path);
3104 return -ENOMEM;
3105 }
3106
3107 /*
3108 * work on commit root. The related disk blocks are static as
3109 * long as COW is applied. This means it is safe to rewrite
3110 * them to repair disk errors without any race conditions.
3111 */
3112 path->search_commit_root = 1;
3113 path->skip_locking = 1;
3114
3115 ppath->search_commit_root = 1;
3116 ppath->skip_locking = 1;
3117 /*
3118 * trigger the readahead for the extent tree and csum tree and wait for
3119 * completion. During readahead, the scrub is officially paused
3120 * to not hold off transaction commits
3121 */
3122 logical = base + offset;
3123 physical_end = physical + nstripes * map->stripe_len;
3124 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
3125 get_raid56_logic_offset(physical_end, num,
3126 map, &logic_end, NULL);
3127 logic_end += base;
3128 } else {
3129 logic_end = logical + increment * nstripes;
3130 }
3131 wait_event(sctx->list_wait,
3132 atomic_read(&sctx->bios_in_flight) == 0);
3133 scrub_blocked_if_needed(fs_info);
3134
3135 /* FIXME it might be better to start readahead at commit root */
3136 key.objectid = logical;
3137 key.type = BTRFS_EXTENT_ITEM_KEY;
3138 key.offset = (u64)0;
3139 key_end.objectid = logic_end;
3140 key_end.type = BTRFS_METADATA_ITEM_KEY;
3141 key_end.offset = (u64)-1;
3142 reada1 = btrfs_reada_add(root, &key, &key_end);
3143
3144 if (cache->flags & BTRFS_BLOCK_GROUP_DATA) {
3145 key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
3146 key.type = BTRFS_EXTENT_CSUM_KEY;
3147 key.offset = logical;
3148 key_end.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
3149 key_end.type = BTRFS_EXTENT_CSUM_KEY;
3150 key_end.offset = logic_end;
3151 reada2 = btrfs_reada_add(csum_root, &key, &key_end);
3152 } else {
3153 reada2 = NULL;
3154 }
3155
3156 if (!IS_ERR(reada1))
3157 btrfs_reada_wait(reada1);
3158 if (!IS_ERR_OR_NULL(reada2))
3159 btrfs_reada_wait(reada2);
3160
3161
3162 /*
3163 * collect all data csums for the stripe to avoid seeking during
3164 * the scrub. This might currently (crc32) end up being about 1MB
3165 */
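/*
 * For scale (assumed 4-byte crc32 csums and 4K sectors): 1MB of checksums
 * describes roughly 1GB of data.
 */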
3166 blk_start_plug(&plug);
3167
3168 /*
3169 * now find all extents for each stripe and scrub them
3170 */
3171 ret = 0;
3172 while (physical < physical_end) {
3173 /*
3174 * canceled?
3175 */
3176 if (atomic_read(&fs_info->scrub_cancel_req) ||
3177 atomic_read(&sctx->cancel_req)) {
3178 ret = -ECANCELED;
3179 goto out;
3180 }
3181 /*
3182 * check to see if we have to pause
3183 */
3184 if (atomic_read(&fs_info->scrub_pause_req)) {
3185 /* push queued extents */
3186 sctx->flush_all_writes = true;
3187 scrub_submit(sctx);
3188 mutex_lock(&sctx->wr_lock);
3189 scrub_wr_submit(sctx);
3190 mutex_unlock(&sctx->wr_lock);
3191 wait_event(sctx->list_wait,
3192 atomic_read(&sctx->bios_in_flight) == 0);
3193 sctx->flush_all_writes = false;
3194 scrub_blocked_if_needed(fs_info);
3195 }
3196
3197 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
3198 ret = get_raid56_logic_offset(physical, num, map,
3199 &logical,
3200 &stripe_logical);
3201 logical += base;
3202 if (ret) {
3203 /* it is parity strip */
3204 stripe_logical += base;
3205 stripe_end = stripe_logical + increment;
3206 ret = scrub_raid56_parity(sctx, map, scrub_dev,
3207 ppath, stripe_logical,
3208 stripe_end);
3209 if (ret)
3210 goto out;
3211 goto skip;
3212 }
3213 }
3214
3215 if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
3216 key.type = BTRFS_METADATA_ITEM_KEY;
3217 else
3218 key.type = BTRFS_EXTENT_ITEM_KEY;
3219 key.objectid = logical;
3220 key.offset = (u64)-1;
3221
3222 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3223 if (ret < 0)
3224 goto out;
3225
3226 if (ret > 0) {
3227 ret = btrfs_previous_extent_item(root, path, 0);
3228 if (ret < 0)
3229 goto out;
3230 if (ret > 0) {
3231 /* there's no smaller item, so stick with the
3232 * larger one */
3233 btrfs_release_path(path);
3234 ret = btrfs_search_slot(NULL, root, &key,
3235 path, 0, 0);
3236 if (ret < 0)
3237 goto out;
3238 }
3239 }
3240
3241 stop_loop = 0;
3242 while (1) {
3243 u64 bytes;
3244
3245 l = path->nodes[0];
3246 slot = path->slots[0];
3247 if (slot >= btrfs_header_nritems(l)) {
3248 ret = btrfs_next_leaf(root, path);
3249 if (ret == 0)
3250 continue;
3251 if (ret < 0)
3252 goto out;
3253
3254 stop_loop = 1;
3255 break;
3256 }
3257 btrfs_item_key_to_cpu(l, &key, slot);
3258
3259 if (key.type != BTRFS_EXTENT_ITEM_KEY &&
3260 key.type != BTRFS_METADATA_ITEM_KEY)
3261 goto next;
3262
3263 if (key.type == BTRFS_METADATA_ITEM_KEY)
3264 bytes = fs_info->nodesize;
3265 else
3266 bytes = key.offset;
3267
3268 if (key.objectid + bytes <= logical)
3269 goto next;
3270
3271 if (key.objectid >= logical + map->stripe_len) {
3272 /* out of this device extent */
3273 if (key.objectid >= logic_end)
3274 stop_loop = 1;
3275 break;
3276 }
3277
3278 /*
3279 * If our block group was removed in the meantime, just
3280 * stop scrubbing since there is no point in continuing.
3281 * Continuing would prevent reusing its device extents
3282 * for new block groups for a long time.
3283 */
3284 spin_lock(&cache->lock);
3285 if (cache->removed) {
3286 spin_unlock(&cache->lock);
3287 ret = 0;
3288 goto out;
3289 }
3290 spin_unlock(&cache->lock);
3291
3292 extent = btrfs_item_ptr(l, slot,
3293 struct btrfs_extent_item);
3294 flags = btrfs_extent_flags(l, extent);
3295 generation = btrfs_extent_generation(l, extent);
3296
3297 if ((flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) &&
3298 (key.objectid < logical ||
3299 key.objectid + bytes >
3300 logical + map->stripe_len)) {
3301 btrfs_err(fs_info,
3302 "scrub: tree block %llu spanning stripes, ignored. logical=%llu",
3303 key.objectid, logical);
3304 spin_lock(&sctx->stat_lock);
3305 sctx->stat.uncorrectable_errors++;
3306 spin_unlock(&sctx->stat_lock);
3307 goto next;
3308 }
3309
3310 again:
3311 extent_logical = key.objectid;
3312 ASSERT(bytes <= U32_MAX);
3313 extent_len = bytes;
3314
3315 /*
3316 * trim extent to this stripe
3317 */
3318 if (extent_logical < logical) {
3319 extent_len -= logical - extent_logical;
3320 extent_logical = logical;
3321 }
3322 if (extent_logical + extent_len >
3323 logical + map->stripe_len) {
3324 extent_len = logical + map->stripe_len -
3325 extent_logical;
3326 }
3327
3328 extent_physical = extent_logical - logical + physical;
3329 extent_dev = scrub_dev;
3330 extent_mirror_num = mirror_num;
3331 if (sctx->is_dev_replace)
3332 scrub_remap_extent(fs_info, extent_logical,
3333 extent_len, &extent_physical,
3334 &extent_dev,
3335 &extent_mirror_num);
3336
3337 if (flags & BTRFS_EXTENT_FLAG_DATA) {
3338 ret = btrfs_lookup_csums_range(csum_root,
3339 extent_logical,
3340 extent_logical + extent_len - 1,
3341 &sctx->csum_list, 1);
3342 if (ret)
3343 goto out;
3344 }
3345
3346 ret = scrub_extent(sctx, map, extent_logical, extent_len,
3347 extent_physical, extent_dev, flags,
3348 generation, extent_mirror_num,
3349 extent_logical - logical + physical);
3350
3351 scrub_free_csums(sctx);
3352
3353 if (ret)
3354 goto out;
3355
3356 if (extent_logical + extent_len <
3357 key.objectid + bytes) {
3358 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
3359 /*
3360 * loop until we find next data stripe
3361 * or we have finished all stripes.
3362 */
3363 loop:
3364 physical += map->stripe_len;
3365 ret = get_raid56_logic_offset(physical,
3366 num, map, &logical,
3367 &stripe_logical);
3368 logical += base;
3369
3370 if (ret && physical < physical_end) {
3371 stripe_logical += base;
3372 stripe_end = stripe_logical +
3373 increment;
3374 ret = scrub_raid56_parity(sctx,
3375 map, scrub_dev, ppath,
3376 stripe_logical,
3377 stripe_end);
3378 if (ret)
3379 goto out;
3380 goto loop;
3381 }
3382 } else {
3383 physical += map->stripe_len;
3384 logical += increment;
3385 }
3386 if (logical < key.objectid + bytes) {
3387 cond_resched();
3388 goto again;
3389 }
3390
3391 if (physical >= physical_end) {
3392 stop_loop = 1;
3393 break;
3394 }
3395 }
3396 next:
3397 path->slots[0]++;
3398 }
3399 btrfs_release_path(path);
3400 skip:
3401 logical += increment;
3402 physical += map->stripe_len;
3403 spin_lock(&sctx->stat_lock);
3404 if (stop_loop)
3405 sctx->stat.last_physical = map->stripes[num].physical +
3406 length;
3407 else
3408 sctx->stat.last_physical = physical;
3409 spin_unlock(&sctx->stat_lock);
3410 if (stop_loop)
3411 break;
3412 }
3413 out:
3414 /* push queued extents */
3415 scrub_submit(sctx);
3416 mutex_lock(&sctx->wr_lock);
3417 scrub_wr_submit(sctx);
3418 mutex_unlock(&sctx->wr_lock);
3419
3420 blk_finish_plug(&plug);
3421 btrfs_free_path(path);
3422 btrfs_free_path(ppath);
3423 return ret < 0 ? ret : 0;
3424 }
3425
3426 static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,
3427 struct btrfs_device *scrub_dev,
3428 u64 chunk_offset, u64 length,
3429 u64 dev_offset,
3430 struct btrfs_block_group *cache)
3431 {
3432 struct btrfs_fs_info *fs_info = sctx->fs_info;
3433 struct extent_map_tree *map_tree = &fs_info->mapping_tree;
3434 struct map_lookup *map;
3435 struct extent_map *em;
3436 int i;
3437 int ret = 0;
3438
3439 read_lock(&map_tree->lock);
3440 em = lookup_extent_mapping(map_tree, chunk_offset, 1);
3441 read_unlock(&map_tree->lock);
3442
3443 if (!em) {
3444 /*
3445 * Might have been an unused block group deleted by the cleaner
3446 * kthread or relocation.
3447 */
3448 spin_lock(&cache->lock);
3449 if (!cache->removed)
3450 ret = -EINVAL;
3451 spin_unlock(&cache->lock);
3452
3453 return ret;
3454 }
3455
3456 map = em->map_lookup;
3457 if (em->start != chunk_offset)
3458 goto out;
3459
3460 if (em->len < length)
3461 goto out;
3462
3463 for (i = 0; i < map->num_stripes; ++i) {
3464 if (map->stripes[i].dev->bdev == scrub_dev->bdev &&
3465 map->stripes[i].physical == dev_offset) {
3466 ret = scrub_stripe(sctx, map, scrub_dev, i,
3467 chunk_offset, length, cache);
3468 if (ret)
3469 goto out;
3470 }
3471 }
3472 out:
3473 free_extent_map(em);
3474
3475 return ret;
3476 }
3477
3478 static noinline_for_stack
3479 int scrub_enumerate_chunks(struct scrub_ctx *sctx,
3480 struct btrfs_device *scrub_dev, u64 start, u64 end)
3481 {
3482 struct btrfs_dev_extent *dev_extent = NULL;
3483 struct btrfs_path *path;
3484 struct btrfs_fs_info *fs_info = sctx->fs_info;
3485 struct btrfs_root *root = fs_info->dev_root;
3486 u64 length;
3487 u64 chunk_offset;
3488 int ret = 0;
3489 int ro_set;
3490 int slot;
3491 struct extent_buffer *l;
3492 struct btrfs_key key;
3493 struct btrfs_key found_key;
3494 struct btrfs_block_group *cache;
3495 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
3496
3497 path = btrfs_alloc_path();
3498 if (!path)
3499 return -ENOMEM;
3500
3501 path->reada = READA_FORWARD;
3502 path->search_commit_root = 1;
3503 path->skip_locking = 1;
3504
3505 key.objectid = scrub_dev->devid;
3506 key.offset = 0ull;
3507 key.type = BTRFS_DEV_EXTENT_KEY;
3508
3509 while (1) {
3510 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3511 if (ret < 0)
3512 break;
3513 if (ret > 0) {
3514 if (path->slots[0] >=
3515 btrfs_header_nritems(path->nodes[0])) {
3516 ret = btrfs_next_leaf(root, path);
3517 if (ret < 0)
3518 break;
3519 if (ret > 0) {
3520 ret = 0;
3521 break;
3522 }
3523 } else {
3524 ret = 0;
3525 }
3526 }
3527
3528 l = path->nodes[0];
3529 slot = path->slots[0];
3530
3531 btrfs_item_key_to_cpu(l, &found_key, slot);
3532
3533 if (found_key.objectid != scrub_dev->devid)
3534 break;
3535
3536 if (found_key.type != BTRFS_DEV_EXTENT_KEY)
3537 break;
3538
3539 if (found_key.offset >= end)
3540 break;
3541
3542 if (found_key.offset < key.offset)
3543 break;
3544
3545 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
3546 length = btrfs_dev_extent_length(l, dev_extent);
3547
3548 if (found_key.offset + length <= start)
3549 goto skip;
3550
3551 chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
3552
3553 /*
3554 * get a reference on the corresponding block group to prevent
3555 * the chunk from going away while we scrub it
3556 */
3557 cache = btrfs_lookup_block_group(fs_info, chunk_offset);
3558
3559 /* some chunks are removed but not committed to disk yet,
3560 * continue scrubbing */
3561 if (!cache)
3562 goto skip;
3563
3564 /*
3565 * Make sure that while we are scrubbing the corresponding block
3566 * group doesn't get its logical address and its device extents
3567 * reused for another block group, which can possibly be of a
3568 * different type and different profile. We do this to prevent
3569 * false error detections and crashes due to bogus attempts to
3570 * repair extents.
3571 */
3572 spin_lock(&cache->lock);
3573 if (cache->removed) {
3574 spin_unlock(&cache->lock);
3575 btrfs_put_block_group(cache);
3576 goto skip;
3577 }
3578 btrfs_freeze_block_group(cache);
3579 spin_unlock(&cache->lock);
3580
3581 /*
3582 * we need call btrfs_inc_block_group_ro() with scrubs_paused,
3583 * to avoid deadlock caused by:
3584 * btrfs_inc_block_group_ro()
3585 * -> btrfs_wait_for_commit()
3586 * -> btrfs_commit_transaction()
3587 * -> btrfs_scrub_pause()
3588 */
3589 scrub_pause_on(fs_info);
3590
3591 /*
3592 * Don't do chunk preallocation for scrub.
3593 *
3594 * This is especially important for SYSTEM bgs, or we can hit
3595 * -EFBIG from btrfs_finish_chunk_alloc() like:
3596 * 1. The only SYSTEM bg is marked RO.
3597 * Since SYSTEM bg is small, that's pretty common.
3598 * 2. New SYSTEM bg will be allocated
3599 * Because the regular (non-scrub) path will allocate a new chunk.
3600 * 3. New SYSTEM bg is empty and will get cleaned up
3601 * Before cleanup really happens, it's marked RO again.
3602 * 4. Empty SYSTEM bg gets scrubbed
3603 * We go back to 2.
3604 *
3605 * This can easily boost the number of SYSTEM chunks if the cleaner
3606 * thread can't be triggered fast enough, using up all the space in
3607 * btrfs_super_block::sys_chunk_array.
3608 *
3609 * While for dev replace, we need to try our best to mark block
3610 * group RO, to prevent race between:
3611 * - Write duplication
3612 * Contains latest data
3613 * - Scrub copy
3614 * Contains data from commit tree
3615 *
3616 * If target block group is not marked RO, nocow writes can
3617 * be overwritten by scrub copy, causing data corruption.
3618 * So for dev-replace, it's not allowed to continue if a block
3619 * group is not RO.
3620 */
3621 ret = btrfs_inc_block_group_ro(cache, sctx->is_dev_replace);
3622 if (ret == 0) {
3623 ro_set = 1;
3624 } else if (ret == -ENOSPC && !sctx->is_dev_replace) {
3625 /*
3626 * btrfs_inc_block_group_ro() returns -ENOSPC when it
3627 * fails to create a new chunk for metadata.
3628 * This is not a problem for scrub, because
3629 * metadata is always COWed, and our scrub has paused
3630 * transaction commits.
3631 */
3632 ro_set = 0;
3633 } else if (ret == -ETXTBSY) {
3634 btrfs_warn(fs_info,
3635 "skipping scrub of block group %llu due to active swapfile",
3636 cache->start);
3637 scrub_pause_off(fs_info);
3638 ret = 0;
3639 goto skip_unfreeze;
3640 } else {
3641 btrfs_warn(fs_info,
3642 "failed setting block group ro: %d", ret);
3643 btrfs_unfreeze_block_group(cache);
3644 btrfs_put_block_group(cache);
3645 scrub_pause_off(fs_info);
3646 break;
3647 }
3648
3649 /*
3650 * Now the target block is marked RO, wait for nocow writes to
3651 * finish before dev-replace.
3652 * COW is fine, as COW never overwrites extents in commit tree.
3653 */
3654 if (sctx->is_dev_replace) {
3655 btrfs_wait_nocow_writers(cache);
3656 btrfs_wait_ordered_roots(fs_info, U64_MAX, cache->start,
3657 cache->length);
3658 }
3659
3660 scrub_pause_off(fs_info);
3661 down_write(&dev_replace->rwsem);
3662 dev_replace->cursor_right = found_key.offset + length;
3663 dev_replace->cursor_left = found_key.offset;
3664 dev_replace->item_needs_writeback = 1;
3665 up_write(&dev_replace->rwsem);
3666
3667 ret = scrub_chunk(sctx, scrub_dev, chunk_offset, length,
3668 found_key.offset, cache);
3669
3670 /*
3671 * flush, submit all pending read and write bios, afterwards
3672 * wait for them.
3673 * Note that in the dev replace case, a read request causes
3674 * write requests that are submitted in the read completion
3675 * worker. Therefore in the current situation, it is required
3676 * that all write requests are flushed, so that all read and
3677 * write requests are really completed when bios_in_flight
3678 * changes to 0.
3679 */
3680 sctx->flush_all_writes = true;
3681 scrub_submit(sctx);
3682 mutex_lock(&sctx->wr_lock);
3683 scrub_wr_submit(sctx);
3684 mutex_unlock(&sctx->wr_lock);
3685
3686 wait_event(sctx->list_wait,
3687 atomic_read(&sctx->bios_in_flight) == 0);
3688
3689 scrub_pause_on(fs_info);
3690
3691 /*
3692 * Must be called before we decrease @scrub_paused.
3693 * Make sure we don't block transaction commit while
3694 * we are waiting for pending workers to finish.
3695 */
3696 wait_event(sctx->list_wait,
3697 atomic_read(&sctx->workers_pending) == 0);
3698 sctx->flush_all_writes = false;
3699
3700 scrub_pause_off(fs_info);
3701
3702 down_write(&dev_replace->rwsem);
3703 dev_replace->cursor_left = dev_replace->cursor_right;
3704 dev_replace->item_needs_writeback = 1;
3705 up_write(&dev_replace->rwsem);
3706
3707 if (ro_set)
3708 btrfs_dec_block_group_ro(cache);
3709
3710 /*
3711 * We might have prevented the cleaner kthread from deleting
3712 * this block group if it was already unused because we raced
3713 * and set it to RO mode first. So add it back to the unused
3714 * list, otherwise it might not ever be deleted unless a manual
3715 * balance is triggered or it becomes used and unused again.
3716 */
3717 spin_lock(&cache->lock);
3718 if (!cache->removed && !cache->ro && cache->reserved == 0 &&
3719 cache->used == 0) {
3720 spin_unlock(&cache->lock);
3721 if (btrfs_test_opt(fs_info, DISCARD_ASYNC))
3722 btrfs_discard_queue_work(&fs_info->discard_ctl,
3723 cache);
3724 else
3725 btrfs_mark_bg_unused(cache);
3726 } else {
3727 spin_unlock(&cache->lock);
3728 }
3729 skip_unfreeze:
3730 btrfs_unfreeze_block_group(cache);
3731 btrfs_put_block_group(cache);
3732 if (ret)
3733 break;
3734 if (sctx->is_dev_replace &&
3735 atomic64_read(&dev_replace->num_write_errors) > 0) {
3736 ret = -EIO;
3737 break;
3738 }
3739 if (sctx->stat.malloc_errors > 0) {
3740 ret = -ENOMEM;
3741 break;
3742 }
3743 skip:
3744 key.offset = found_key.offset + length;
3745 btrfs_release_path(path);
3746 }
3747
3748 btrfs_free_path(path);
3749
3750 return ret;
3751 }
3752
3753 static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,
3754 struct btrfs_device *scrub_dev)
3755 {
3756 int i;
3757 u64 bytenr;
3758 u64 gen;
3759 int ret;
3760 struct btrfs_fs_info *fs_info = sctx->fs_info;
3761
3762 if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
3763 return -EROFS;
3764
3765 /* Seed devices of a new filesystem have their own generation. */
3766 if (scrub_dev->fs_devices != fs_info->fs_devices)
3767 gen = scrub_dev->generation;
3768 else
3769 gen = fs_info->last_trans_committed;
3770
3771 for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
3772 bytenr = btrfs_sb_offset(i);
3773 if (bytenr + BTRFS_SUPER_INFO_SIZE >
3774 scrub_dev->commit_total_bytes)
3775 break;
3776 if (!btrfs_check_super_location(scrub_dev, bytenr))
3777 continue;
3778
3779 ret = scrub_pages(sctx, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr,
3780 scrub_dev, BTRFS_EXTENT_FLAG_SUPER, gen, i,
3781 NULL, bytenr);
3782 if (ret)
3783 return ret;
3784 }
3785 wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
3786
3787 return 0;
3788 }
3789
3790 static void scrub_workers_put(struct btrfs_fs_info *fs_info)
3791 {
3792 if (refcount_dec_and_mutex_lock(&fs_info->scrub_workers_refcnt,
3793 &fs_info->scrub_lock)) {
3794 struct btrfs_workqueue *scrub_workers = NULL;
3795 struct btrfs_workqueue *scrub_wr_comp = NULL;
3796 struct btrfs_workqueue *scrub_parity = NULL;
3797
3798 scrub_workers = fs_info->scrub_workers;
3799 scrub_wr_comp = fs_info->scrub_wr_completion_workers;
3800 scrub_parity = fs_info->scrub_parity_workers;
3801
3802 fs_info->scrub_workers = NULL;
3803 fs_info->scrub_wr_completion_workers = NULL;
3804 fs_info->scrub_parity_workers = NULL;
3805 mutex_unlock(&fs_info->scrub_lock);
3806
3807 btrfs_destroy_workqueue(scrub_workers);
3808 btrfs_destroy_workqueue(scrub_wr_comp);
3809 btrfs_destroy_workqueue(scrub_parity);
3810 }
3811 }
3812
3813 /*
3814 * Get a reference count on fs_info->scrub_workers. Start the workers if necessary.
3815 */
3816 static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
3817 int is_dev_replace)
3818 {
3819 struct btrfs_workqueue *scrub_workers = NULL;
3820 struct btrfs_workqueue *scrub_wr_comp = NULL;
3821 struct btrfs_workqueue *scrub_parity = NULL;
3822 unsigned int flags = WQ_FREEZABLE | WQ_UNBOUND;
3823 int max_active = fs_info->thread_pool_size;
3824 int ret = -ENOMEM;
3825
3826 if (refcount_inc_not_zero(&fs_info->scrub_workers_refcnt))
3827 return 0;
3828
3829 scrub_workers = btrfs_alloc_workqueue(fs_info, "scrub", flags,
3830 is_dev_replace ? 1 : max_active, 4);
3831 if (!scrub_workers)
3832 goto fail_scrub_workers;
3833
3834 scrub_wr_comp = btrfs_alloc_workqueue(fs_info, "scrubwrc", flags,
3835 max_active, 2);
3836 if (!scrub_wr_comp)
3837 goto fail_scrub_wr_completion_workers;
3838
3839 scrub_parity = btrfs_alloc_workqueue(fs_info, "scrubparity", flags,
3840 max_active, 2);
3841 if (!scrub_parity)
3842 goto fail_scrub_parity_workers;
3843
3844 mutex_lock(&fs_info->scrub_lock);
3845 if (refcount_read(&fs_info->scrub_workers_refcnt) == 0) {
3846 ASSERT(fs_info->scrub_workers == NULL &&
3847 fs_info->scrub_wr_completion_workers == NULL &&
3848 fs_info->scrub_parity_workers == NULL);
3849 fs_info->scrub_workers = scrub_workers;
3850 fs_info->scrub_wr_completion_workers = scrub_wr_comp;
3851 fs_info->scrub_parity_workers = scrub_parity;
3852 refcount_set(&fs_info->scrub_workers_refcnt, 1);
3853 mutex_unlock(&fs_info->scrub_lock);
3854 return 0;
3855 }
3856 /* Another thread raced in and created the workers for us */
3857 refcount_inc(&fs_info->scrub_workers_refcnt);
3858 mutex_unlock(&fs_info->scrub_lock);
3859
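/*
 * The racing thread's workqueues are already installed, so set ret to 0 and
 * fall through the fail_* labels, which on this success path only free the
 * spare workqueues allocated above.
 */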
3860 ret = 0;
3861 btrfs_destroy_workqueue(scrub_parity);
3862 fail_scrub_parity_workers:
3863 btrfs_destroy_workqueue(scrub_wr_comp);
3864 fail_scrub_wr_completion_workers:
3865 btrfs_destroy_workqueue(scrub_workers);
3866 fail_scrub_workers:
3867 return ret;
3868 }
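/*
 * Illustrative sketch (not part of scrub.c): the lazy-init pattern used by
 * scrub_workers_get()/scrub_workers_put() above, rebuilt with POSIX
 * primitives.  The expensive resource is allocated outside the lock; under
 * the lock we either install it (refcount 0 -> 1) or notice another thread
 * won the race, take a reference and free our spare copy.  All names
 * (workers_get, shared_workers, ...) are invented for the sketch.
 */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct workers { int dummy; };

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static struct workers *shared_workers;
static unsigned int workers_refcnt;

static int workers_get(void)
{
	/* Allocate before taking the lock, like btrfs_alloc_workqueue(). */
	struct workers *mine = calloc(1, sizeof(*mine));

	if (!mine)
		return -1;

	pthread_mutex_lock(&lock);
	if (workers_refcnt == 0) {
		shared_workers = mine;          /* we won: install our copy */
		workers_refcnt = 1;
		pthread_mutex_unlock(&lock);
		return 0;
	}
	workers_refcnt++;                       /* another thread beat us to it */
	pthread_mutex_unlock(&lock);
	free(mine);                             /* drop the spare copy */
	return 0;
}

static void workers_put(void)
{
	struct workers *victim = NULL;

	pthread_mutex_lock(&lock);
	if (--workers_refcnt == 0) {
		victim = shared_workers;        /* last user tears it down ... */
		shared_workers = NULL;
	}
	pthread_mutex_unlock(&lock);
	free(victim);                           /* ... outside the lock */
}

int main(void)
{
	workers_get();
	workers_get();
	workers_put();
	workers_put();
	printf("refcount back to %u\n", workers_refcnt);
	return 0;
}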
3869
3870 int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
3871 u64 end, struct btrfs_scrub_progress *progress,
3872 int readonly, int is_dev_replace)
3873 {
3874 struct scrub_ctx *sctx;
3875 int ret;
3876 struct btrfs_device *dev;
3877 unsigned int nofs_flag;
3878
3879 if (btrfs_fs_closing(fs_info))
3880 return -EAGAIN;
3881
3882 if (fs_info->nodesize > BTRFS_STRIPE_LEN) {
3883 /*
3884 * Scrub, as implemented, cannot calculate checksums when the
3885 * nodesize exceeds BTRFS_STRIPE_LEN. Do not handle this
3886 * situation at all because it will never happen.
3887 */
3888 btrfs_err(fs_info,
3889 "scrub: size assumption nodesize <= BTRFS_STRIPE_LEN (%d <= %d) fails",
3890 fs_info->nodesize,
3891 BTRFS_STRIPE_LEN);
3892 return -EINVAL;
3893 }
3894
3895 if (fs_info->nodesize >
3896 PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK ||
3897 fs_info->sectorsize > PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK) {
3898 /*
3899 * This would overflow the bounds of the pagev array in
3900 * struct scrub_block.
3901 */
3902 btrfs_err(fs_info,
3903 "scrub: size assumption nodesize and sectorsize <= SCRUB_MAX_PAGES_PER_BLOCK (%d <= %d && %d <= %d) fails",
3904 fs_info->nodesize,
3905 SCRUB_MAX_PAGES_PER_BLOCK,
3906 fs_info->sectorsize,
3907 SCRUB_MAX_PAGES_PER_BLOCK);
3908 return -EINVAL;
3909 }
3910
3911 /* Allocate outside of device_list_mutex */
3912 sctx = scrub_setup_ctx(fs_info, is_dev_replace);
3913 if (IS_ERR(sctx))
3914 return PTR_ERR(sctx);
3915
3916 ret = scrub_workers_get(fs_info, is_dev_replace);
3917 if (ret)
3918 goto out_free_ctx;
3919
3920 mutex_lock(&fs_info->fs_devices->device_list_mutex);
3921 dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL);
3922 if (!dev || (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) &&
3923 !is_dev_replace)) {
3924 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
3925 ret = -ENODEV;
3926 goto out;
3927 }
3928
3929 if (!is_dev_replace && !readonly &&
3930 !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) {
3931 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
3932 btrfs_err_in_rcu(fs_info,
3933 "scrub on devid %llu: filesystem on %s is not writable",
3934 devid, rcu_str_deref(dev->name));
3935 ret = -EROFS;
3936 goto out;
3937 }
3938
3939 mutex_lock(&fs_info->scrub_lock);
3940 if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) ||
3941 test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &dev->dev_state)) {
3942 mutex_unlock(&fs_info->scrub_lock);
3943 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
3944 ret = -EIO;
3945 goto out;
3946 }
3947
3948 down_read(&fs_info->dev_replace.rwsem);
3949 if (dev->scrub_ctx ||
3950 (!is_dev_replace &&
3951 btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))) {
3952 up_read(&fs_info->dev_replace.rwsem);
3953 mutex_unlock(&fs_info->scrub_lock);
3954 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
3955 ret = -EINPROGRESS;
3956 goto out;
3957 }
3958 up_read(&fs_info->dev_replace.rwsem);
3959
3960 sctx->readonly = readonly;
3961 dev->scrub_ctx = sctx;
3962 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
3963
3964 /*
3965 * By checking @scrub_pause_req here, we can avoid a
3966 * race between transaction commit and scrubbing.
3967 */
3968 __scrub_blocked_if_needed(fs_info);
3969 atomic_inc(&fs_info->scrubs_running);
3970 mutex_unlock(&fs_info->scrub_lock);
3971
3972 /*
3973 * In order to avoid deadlock with reclaim when there is a transaction
3974 * trying to pause scrub, make sure we use GFP_NOFS for all the
3975 * allocations done in scrub_pages() and scrub_pages_for_parity(),
3976 * both invoked by our callees. The pausing request is made when the
3977 * transaction commit starts, and it blocks the transaction until scrub
3978 * is paused (done at specific points in scrub_stripe(), or right above,
3979 * before incrementing fs_info->scrubs_running).
3980 */
3981 nofs_flag = memalloc_nofs_save();
3982 if (!is_dev_replace) {
3983 btrfs_info(fs_info, "scrub: started on devid %llu", devid);
3984 /*
3985 * Holding the device list mutex serializes us against super
3986 * block writes kicked off by log tree sync.
3987 */
3988 mutex_lock(&fs_info->fs_devices->device_list_mutex);
3989 ret = scrub_supers(sctx, dev);
3990 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
3991 }
3992
3993 if (!ret)
3994 ret = scrub_enumerate_chunks(sctx, dev, start, end);
3995 memalloc_nofs_restore(nofs_flag);
3996
3997 wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
3998 atomic_dec(&fs_info->scrubs_running);
3999 wake_up(&fs_info->scrub_pause_wait);
4000
4001 wait_event(sctx->list_wait, atomic_read(&sctx->workers_pending) == 0);
4002
4003 if (progress)
4004 memcpy(progress, &sctx->stat, sizeof(*progress));
4005
4006 if (!is_dev_replace)
4007 btrfs_info(fs_info, "scrub: %s on devid %llu with status: %d",
4008 ret ? "not finished" : "finished", devid, ret);
4009
4010 mutex_lock(&fs_info->scrub_lock);
4011 dev->scrub_ctx = NULL;
4012 mutex_unlock(&fs_info->scrub_lock);
4013
4014 scrub_workers_put(fs_info);
4015 scrub_put_ctx(sctx);
4016
4017 return ret;
4018 out:
4019 scrub_workers_put(fs_info);
4020 out_free_ctx:
4021 scrub_free_ctx(sctx);
4022
4023 return ret;
4024 }
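/*
 * Illustrative sketch (not part of scrub.c): btrfs_scrub_dev() above is
 * normally reached from user space through the BTRFS_IOC_SCRUB ioctl
 * declared in <linux/btrfs.h>.  A minimal caller requesting a read-only
 * scrub of the whole device; the mount point and devid 1 are assumptions,
 * and the ioctl requires CAP_SYS_ADMIN.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/btrfs.h>

int main(int argc, char **argv)
{
	struct btrfs_ioctl_scrub_args args;
	const char *mnt = argc > 1 ? argv[1] : "/mnt";   /* assumed mount point */
	int fd = open(mnt, O_RDONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}

	memset(&args, 0, sizeof(args));
	args.devid = 1;                  /* assumed: first device of the fs */
	args.start = 0;
	args.end   = (__u64)-1;          /* scrub the whole device */
	args.flags = BTRFS_SCRUB_READONLY;

	/* Blocks until the scrub finishes, fails, or is cancelled. */
	if (ioctl(fd, BTRFS_IOC_SCRUB, &args) < 0)
		perror("BTRFS_IOC_SCRUB");
	else
		printf("csum errors: %llu, uncorrectable: %llu\n",
		       (unsigned long long)args.progress.csum_errors,
		       (unsigned long long)args.progress.uncorrectable_errors);

	close(fd);
	return 0;
}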
4025
4026 void btrfs_scrub_pause(struct btrfs_fs_info *fs_info)
4027 {
4028 mutex_lock(&fs_info->scrub_lock);
4029 atomic_inc(&fs_info->scrub_pause_req);
4030 while (atomic_read(&fs_info->scrubs_paused) !=
4031 atomic_read(&fs_info->scrubs_running)) {
4032 mutex_unlock(&fs_info->scrub_lock);
4033 wait_event(fs_info->scrub_pause_wait,
4034 atomic_read(&fs_info->scrubs_paused) ==
4035 atomic_read(&fs_info->scrubs_running));
4036 mutex_lock(&fs_info->scrub_lock);
4037 }
4038 mutex_unlock(&fs_info->scrub_lock);
4039 }
4040
4041 void btrfs_scrub_continue(struct btrfs_fs_info *fs_info)
4042 {
4043 atomic_dec(&fs_info->scrub_pause_req);
4044 wake_up(&fs_info->scrub_pause_wait);
4045 }
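/*
 * Illustrative sketch (not part of scrub.c): the pause handshake implemented
 * by btrfs_scrub_pause()/btrfs_scrub_continue() above, rebuilt with POSIX
 * primitives.  A committer raises pause_req and waits until every running
 * scrubber has parked; scrubbers park at their pause points while pause_req
 * is set.  All names are invented for the sketch.
 */
#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t lk = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cv = PTHREAD_COND_INITIALIZER;
static int pause_req, running, paused;

/* Analogue of btrfs_scrub_pause(): wait until every scrubber has parked. */
static void scrub_pause(void)
{
	pthread_mutex_lock(&lk);
	pause_req++;
	while (paused != running)
		pthread_cond_wait(&cv, &lk);
	pthread_mutex_unlock(&lk);
}

/* Analogue of btrfs_scrub_continue(): let parked scrubbers resume. */
static void scrub_continue(void)
{
	pthread_mutex_lock(&lk);
	pause_req--;
	pthread_cond_broadcast(&cv);
	pthread_mutex_unlock(&lk);
}

/* Called by scrubbers at their pause points (cf. scrub_blocked_if_needed()). */
static void scrub_park_if_needed(void)
{
	pthread_mutex_lock(&lk);
	while (pause_req) {
		paused++;
		pthread_cond_broadcast(&cv);    /* wake the pausing committer */
		while (pause_req)
			pthread_cond_wait(&cv, &lk);
		paused--;
	}
	pthread_mutex_unlock(&lk);
}

static void *scrubber(void *arg)
{
	int i;

	(void)arg;
	pthread_mutex_lock(&lk);
	while (pause_req)                       /* cf. __scrub_blocked_if_needed() */
		pthread_cond_wait(&cv, &lk);
	running++;
	pthread_mutex_unlock(&lk);

	for (i = 0; i < 100; i++) {
		scrub_park_if_needed();         /* pause point between chunks */
		usleep(1000);                   /* pretend to scrub one chunk */
	}

	pthread_mutex_lock(&lk);
	running--;
	pthread_cond_broadcast(&cv);
	pthread_mutex_unlock(&lk);
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, scrubber, NULL);
	usleep(10 * 1000);
	scrub_pause();
	printf("all scrubbers parked; a commit could run here\n");
	scrub_continue();
	pthread_join(t, NULL);
	return 0;
}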
4046
4047 int btrfs_scrub_cancel(struct btrfs_fs_info *fs_info)
4048 {
4049 mutex_lock(&fs_info->scrub_lock);
4050 if (!atomic_read(&fs_info->scrubs_running)) {
4051 mutex_unlock(&fs_info->scrub_lock);
4052 return -ENOTCONN;
4053 }
4054
4055 atomic_inc(&fs_info->scrub_cancel_req);
4056 while (atomic_read(&fs_info->scrubs_running)) {
4057 mutex_unlock(&fs_info->scrub_lock);
4058 wait_event(fs_info->scrub_pause_wait,
4059 atomic_read(&fs_info->scrubs_running) == 0);
4060 mutex_lock(&fs_info->scrub_lock);
4061 }
4062 atomic_dec(&fs_info->scrub_cancel_req);
4063 mutex_unlock(&fs_info->scrub_lock);
4064
4065 return 0;
4066 }
4067
4068 int btrfs_scrub_cancel_dev(struct btrfs_device *dev)
4069 {
4070 struct btrfs_fs_info *fs_info = dev->fs_info;
4071 struct scrub_ctx *sctx;
4072
4073 mutex_lock(&fs_info->scrub_lock);
4074 sctx = dev->scrub_ctx;
4075 if (!sctx) {
4076 mutex_unlock(&fs_info->scrub_lock);
4077 return -ENOTCONN;
4078 }
4079 atomic_inc(&sctx->cancel_req);
4080 while (dev->scrub_ctx) {
4081 mutex_unlock(&fs_info->scrub_lock);
4082 wait_event(fs_info->scrub_pause_wait,
4083 dev->scrub_ctx == NULL);
4084 mutex_lock(&fs_info->scrub_lock);
4085 }
4086 mutex_unlock(&fs_info->scrub_lock);
4087
4088 return 0;
4089 }
4090
4091 int btrfs_scrub_progress(struct btrfs_fs_info *fs_info, u64 devid,
4092 struct btrfs_scrub_progress *progress)
4093 {
4094 struct btrfs_device *dev;
4095 struct scrub_ctx *sctx = NULL;
4096
4097 mutex_lock(&fs_info->fs_devices->device_list_mutex);
4098 dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL);
4099 if (dev)
4100 sctx = dev->scrub_ctx;
4101 if (sctx)
4102 memcpy(progress, &sctx->stat, sizeof(*progress));
4103 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4104
4105 return dev ? (sctx ? 0 : -ENOTCONN) : -ENODEV;
4106 }
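/*
 * Illustrative sketch (not part of scrub.c): btrfs_scrub_progress() above
 * backs the BTRFS_IOC_SCRUB_PROGRESS ioctl from <linux/btrfs.h>, which a
 * monitoring tool can poll while a scrub started elsewhere is running.  The
 * mount point and devid 1 below are assumptions.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/btrfs.h>

int main(int argc, char **argv)
{
	struct btrfs_ioctl_scrub_args args;
	const char *mnt = argc > 1 ? argv[1] : "/mnt";   /* assumed mount point */
	int fd = open(mnt, O_RDONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}

	memset(&args, 0, sizeof(args));
	args.devid = 1;                  /* assumed: query the first device */

	/* Fails with ENOTCONN when no scrub is running on that device. */
	if (ioctl(fd, BTRFS_IOC_SCRUB_PROGRESS, &args) < 0)
		perror("BTRFS_IOC_SCRUB_PROGRESS");
	else
		printf("scrubbed %llu data bytes, %llu csum errors so far\n",
		       (unsigned long long)args.progress.data_bytes_scrubbed,
		       (unsigned long long)args.progress.csum_errors);

	close(fd);
	return 0;
}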
4107
4108 static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
4109 u64 extent_logical, u32 extent_len,
4110 u64 *extent_physical,
4111 struct btrfs_device **extent_dev,
4112 int *extent_mirror_num)
4113 {
4114 u64 mapped_length;
4115 struct btrfs_bio *bbio = NULL;
4116 int ret;
4117
4118 mapped_length = extent_len;
4119 ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, extent_logical,
4120 &mapped_length, &bbio, 0);
4121 if (ret || !bbio || mapped_length < extent_len ||
4122 !bbio->stripes[0].dev->bdev) {
4123 btrfs_put_bbio(bbio);
4124 return;
4125 }
4126
4127 *extent_physical = bbio->stripes[0].physical;
4128 *extent_mirror_num = bbio->mirror_num;
4129 *extent_dev = bbio->stripes[0].dev;
4130 btrfs_put_bbio(bbio);
4131 }