git.proxmox.com Git - mirror_ubuntu-zesty-kernel.git - fs/btrfs/scrub.c
Btrfs, scrub: repair the common data on RAID5/6 if it is corrupted
1 /*
2 * Copyright (C) 2011, 2012 STRATO. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19 #include <linux/blkdev.h>
20 #include <linux/ratelimit.h>
21 #include "ctree.h"
22 #include "volumes.h"
23 #include "disk-io.h"
24 #include "ordered-data.h"
25 #include "transaction.h"
26 #include "backref.h"
27 #include "extent_io.h"
28 #include "dev-replace.h"
29 #include "check-integrity.h"
30 #include "rcu-string.h"
31 #include "raid56.h"
32
33 /*
34  * This is only the first step towards a full-featured scrub. It reads all
35  * extents and super blocks and verifies the checksums. In case a bad checksum
36  * is found or the extent cannot be read, good data will be written back if
37 * any can be found.
38 *
39 * Future enhancements:
40 * - In case an unrepairable extent is encountered, track which files are
41 * affected and report them
42 * - track and record media errors, throw out bad devices
43 * - add a mode to also read unallocated space
44 */
45
46 struct scrub_block;
47 struct scrub_ctx;
48
49 /*
50 * the following three values only influence the performance.
51 * The last one configures the number of parallel and outstanding I/O
52 * operations. The first two values configure an upper limit for the number
53 * of (dynamically allocated) pages that are added to a bio.
54 */
55 #define SCRUB_PAGES_PER_RD_BIO 32 /* 128k per bio */
56 #define SCRUB_PAGES_PER_WR_BIO 32 /* 128k per bio */
57 #define SCRUB_BIOS_PER_SCTX 64 /* 8MB per device in flight */
58
59 /*
60 * the following value times PAGE_SIZE needs to be large enough to match the
61 * largest node/leaf/sector size that shall be supported.
62 * Values larger than BTRFS_STRIPE_LEN are not supported.
63 */
64 #define SCRUB_MAX_PAGES_PER_BLOCK 16 /* 64k per node/leaf/sector */
65
66 struct scrub_recover {
67 atomic_t refs;
68 struct btrfs_bio *bbio;
69 u64 *raid_map;
70 u64 map_length;
71 };
72
73 struct scrub_page {
74 struct scrub_block *sblock;
75 struct page *page;
76 struct btrfs_device *dev;
77 u64 flags; /* extent flags */
78 u64 generation;
79 u64 logical;
80 u64 physical;
81 u64 physical_for_dev_replace;
82 atomic_t ref_count;
83 struct {
84 unsigned int mirror_num:8;
85 unsigned int have_csum:1;
86 unsigned int io_error:1;
87 };
88 u8 csum[BTRFS_CSUM_SIZE];
89
90 struct scrub_recover *recover;
91 };
92
93 struct scrub_bio {
94 int index;
95 struct scrub_ctx *sctx;
96 struct btrfs_device *dev;
97 struct bio *bio;
98 int err;
99 u64 logical;
100 u64 physical;
101 #if SCRUB_PAGES_PER_WR_BIO >= SCRUB_PAGES_PER_RD_BIO
102 struct scrub_page *pagev[SCRUB_PAGES_PER_WR_BIO];
103 #else
104 struct scrub_page *pagev[SCRUB_PAGES_PER_RD_BIO];
105 #endif
106 int page_count;
107 int next_free;
108 struct btrfs_work work;
109 };
110
111 struct scrub_block {
112 struct scrub_page *pagev[SCRUB_MAX_PAGES_PER_BLOCK];
113 int page_count;
114 atomic_t outstanding_pages;
115 atomic_t ref_count; /* free mem on transition to zero */
116 struct scrub_ctx *sctx;
117 struct {
118 unsigned int header_error:1;
119 unsigned int checksum_error:1;
120 unsigned int no_io_error_seen:1;
121 unsigned int generation_error:1; /* also sets header_error */
122 };
123 };
124
125 struct scrub_wr_ctx {
126 struct scrub_bio *wr_curr_bio;
127 struct btrfs_device *tgtdev;
128 int pages_per_wr_bio; /* <= SCRUB_PAGES_PER_WR_BIO */
129 atomic_t flush_all_writes;
130 struct mutex wr_lock;
131 };
132
133 struct scrub_ctx {
134 struct scrub_bio *bios[SCRUB_BIOS_PER_SCTX];
135 struct btrfs_root *dev_root;
136 int first_free;
137 int curr;
138 atomic_t bios_in_flight;
139 atomic_t workers_pending;
140 spinlock_t list_lock;
141 wait_queue_head_t list_wait;
142 u16 csum_size;
143 struct list_head csum_list;
144 atomic_t cancel_req;
145 int readonly;
146 int pages_per_rd_bio;
147 u32 sectorsize;
148 u32 nodesize;
149
150 int is_dev_replace;
151 struct scrub_wr_ctx wr_ctx;
152
153 /*
154 * statistics
155 */
156 struct btrfs_scrub_progress stat;
157 spinlock_t stat_lock;
158 };
159
160 struct scrub_fixup_nodatasum {
161 struct scrub_ctx *sctx;
162 struct btrfs_device *dev;
163 u64 logical;
164 struct btrfs_root *root;
165 struct btrfs_work work;
166 int mirror_num;
167 };
168
169 struct scrub_nocow_inode {
170 u64 inum;
171 u64 offset;
172 u64 root;
173 struct list_head list;
174 };
175
176 struct scrub_copy_nocow_ctx {
177 struct scrub_ctx *sctx;
178 u64 logical;
179 u64 len;
180 int mirror_num;
181 u64 physical_for_dev_replace;
182 struct list_head inodes;
183 struct btrfs_work work;
184 };
185
186 struct scrub_warning {
187 struct btrfs_path *path;
188 u64 extent_item_size;
189 const char *errstr;
190 sector_t sector;
191 u64 logical;
192 struct btrfs_device *dev;
193 };
194
195 static void scrub_pending_bio_inc(struct scrub_ctx *sctx);
196 static void scrub_pending_bio_dec(struct scrub_ctx *sctx);
197 static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx);
198 static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx);
199 static int scrub_handle_errored_block(struct scrub_block *sblock_to_check);
200 static int scrub_setup_recheck_block(struct scrub_ctx *sctx,
201 struct btrfs_fs_info *fs_info,
202 struct scrub_block *original_sblock,
203 u64 length, u64 logical,
204 struct scrub_block *sblocks_for_recheck);
205 static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
206 struct scrub_block *sblock, int is_metadata,
207 int have_csum, u8 *csum, u64 generation,
208 u16 csum_size, int retry_failed_mirror);
209 static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
210 struct scrub_block *sblock,
211 int is_metadata, int have_csum,
212 const u8 *csum, u64 generation,
213 u16 csum_size);
214 static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
215 struct scrub_block *sblock_good,
216 int force_write);
217 static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
218 struct scrub_block *sblock_good,
219 int page_num, int force_write);
220 static void scrub_write_block_to_dev_replace(struct scrub_block *sblock);
221 static int scrub_write_page_to_dev_replace(struct scrub_block *sblock,
222 int page_num);
223 static int scrub_checksum_data(struct scrub_block *sblock);
224 static int scrub_checksum_tree_block(struct scrub_block *sblock);
225 static int scrub_checksum_super(struct scrub_block *sblock);
226 static void scrub_block_get(struct scrub_block *sblock);
227 static void scrub_block_put(struct scrub_block *sblock);
228 static void scrub_page_get(struct scrub_page *spage);
229 static void scrub_page_put(struct scrub_page *spage);
230 static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx,
231 struct scrub_page *spage);
232 static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
233 u64 physical, struct btrfs_device *dev, u64 flags,
234 u64 gen, int mirror_num, u8 *csum, int force,
235 u64 physical_for_dev_replace);
236 static void scrub_bio_end_io(struct bio *bio, int err);
237 static void scrub_bio_end_io_worker(struct btrfs_work *work);
238 static void scrub_block_complete(struct scrub_block *sblock);
239 static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
240 u64 extent_logical, u64 extent_len,
241 u64 *extent_physical,
242 struct btrfs_device **extent_dev,
243 int *extent_mirror_num);
244 static int scrub_setup_wr_ctx(struct scrub_ctx *sctx,
245 struct scrub_wr_ctx *wr_ctx,
246 struct btrfs_fs_info *fs_info,
247 struct btrfs_device *dev,
248 int is_dev_replace);
249 static void scrub_free_wr_ctx(struct scrub_wr_ctx *wr_ctx);
250 static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
251 struct scrub_page *spage);
252 static void scrub_wr_submit(struct scrub_ctx *sctx);
253 static void scrub_wr_bio_end_io(struct bio *bio, int err);
254 static void scrub_wr_bio_end_io_worker(struct btrfs_work *work);
255 static int write_page_nocow(struct scrub_ctx *sctx,
256 u64 physical_for_dev_replace, struct page *page);
257 static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root,
258 struct scrub_copy_nocow_ctx *ctx);
259 static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
260 int mirror_num, u64 physical_for_dev_replace);
261 static void copy_nocow_pages_worker(struct btrfs_work *work);
262 static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info);
263 static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info);
264
265
266 static void scrub_pending_bio_inc(struct scrub_ctx *sctx)
267 {
268 atomic_inc(&sctx->bios_in_flight);
269 }
270
271 static void scrub_pending_bio_dec(struct scrub_ctx *sctx)
272 {
273 atomic_dec(&sctx->bios_in_flight);
274 wake_up(&sctx->list_wait);
275 }
276
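/*
 * Wait until any pending scrub pause request has been lifted. Called with
 * fs_info->scrub_lock held; the lock is dropped while sleeping and re-taken
 * before returning.
 */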
277 static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
278 {
279 while (atomic_read(&fs_info->scrub_pause_req)) {
280 mutex_unlock(&fs_info->scrub_lock);
281 wait_event(fs_info->scrub_pause_wait,
282 atomic_read(&fs_info->scrub_pause_req) == 0);
283 mutex_lock(&fs_info->scrub_lock);
284 }
285 }
286
287 static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
288 {
289 atomic_inc(&fs_info->scrubs_paused);
290 wake_up(&fs_info->scrub_pause_wait);
291
292 mutex_lock(&fs_info->scrub_lock);
293 __scrub_blocked_if_needed(fs_info);
294 atomic_dec(&fs_info->scrubs_paused);
295 mutex_unlock(&fs_info->scrub_lock);
296
297 wake_up(&fs_info->scrub_pause_wait);
298 }
299
300 /*
301 * used for workers that require transaction commits (i.e., for the
302 * NOCOW case)
303 */
304 static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx)
305 {
306 struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
307
308 /*
309          * increment scrubs_running to prevent cancel requests from
310          * completing as long as a worker is running. We must also
311          * increment scrubs_paused to prevent deadlocking on pause
312          * requests used for transaction commits (as the worker uses a
313          * transaction context). It is safe to regard the worker
314          * as paused for all practical matters. Effectively, we only
315          * avoid cancellation requests from completing.
316 */
317 mutex_lock(&fs_info->scrub_lock);
318 atomic_inc(&fs_info->scrubs_running);
319 atomic_inc(&fs_info->scrubs_paused);
320 mutex_unlock(&fs_info->scrub_lock);
321
322 /*
323          * The check of the @scrubs_running == @scrubs_paused condition
324          * inside wait_event() is not an atomic operation, which means
325          * we may inc/dec @scrubs_running/@scrubs_paused at any time.
326          * Wake up @scrub_pause_wait as often as we can so that a
327          * transaction commit is blocked for as short a time as possible.
328 */
329 wake_up(&fs_info->scrub_pause_wait);
330
331 atomic_inc(&sctx->workers_pending);
332 }
333
334 /* used for workers that require transaction commits */
335 static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx)
336 {
337 struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
338
339 /*
340 * see scrub_pending_trans_workers_inc() why we're pretending
341 * to be paused in the scrub counters
342 */
343 mutex_lock(&fs_info->scrub_lock);
344 atomic_dec(&fs_info->scrubs_running);
345 atomic_dec(&fs_info->scrubs_paused);
346 mutex_unlock(&fs_info->scrub_lock);
347 atomic_dec(&sctx->workers_pending);
348 wake_up(&fs_info->scrub_pause_wait);
349 wake_up(&sctx->list_wait);
350 }
351
352 static void scrub_free_csums(struct scrub_ctx *sctx)
353 {
354 while (!list_empty(&sctx->csum_list)) {
355 struct btrfs_ordered_sum *sum;
356 sum = list_first_entry(&sctx->csum_list,
357 struct btrfs_ordered_sum, list);
358 list_del(&sum->list);
359 kfree(sum);
360 }
361 }
362
363 static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx)
364 {
365 int i;
366
367 if (!sctx)
368 return;
369
370 scrub_free_wr_ctx(&sctx->wr_ctx);
371
372 /* this can happen when scrub is cancelled */
373 if (sctx->curr != -1) {
374 struct scrub_bio *sbio = sctx->bios[sctx->curr];
375
376 for (i = 0; i < sbio->page_count; i++) {
377 WARN_ON(!sbio->pagev[i]->page);
378 scrub_block_put(sbio->pagev[i]->sblock);
379 }
380 bio_put(sbio->bio);
381 }
382
383 for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
384 struct scrub_bio *sbio = sctx->bios[i];
385
386 if (!sbio)
387 break;
388 kfree(sbio);
389 }
390
391 scrub_free_csums(sctx);
392 kfree(sctx);
393 }
394
395 static noinline_for_stack
396 struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace)
397 {
398 struct scrub_ctx *sctx;
399 int i;
400 struct btrfs_fs_info *fs_info = dev->dev_root->fs_info;
401 int pages_per_rd_bio;
402 int ret;
403
404 /*
405 * the setting of pages_per_rd_bio is correct for scrub but might
406 * be wrong for the dev_replace code where we might read from
407 * different devices in the initial huge bios. However, that
408 * code is able to correctly handle the case when adding a page
409 * to a bio fails.
410 */
411 if (dev->bdev)
412 pages_per_rd_bio = min_t(int, SCRUB_PAGES_PER_RD_BIO,
413 bio_get_nr_vecs(dev->bdev));
414 else
415 pages_per_rd_bio = SCRUB_PAGES_PER_RD_BIO;
416 sctx = kzalloc(sizeof(*sctx), GFP_NOFS);
417 if (!sctx)
418 goto nomem;
419 sctx->is_dev_replace = is_dev_replace;
420 sctx->pages_per_rd_bio = pages_per_rd_bio;
421 sctx->curr = -1;
422 sctx->dev_root = dev->dev_root;
423 for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
424 struct scrub_bio *sbio;
425
426 sbio = kzalloc(sizeof(*sbio), GFP_NOFS);
427 if (!sbio)
428 goto nomem;
429 sctx->bios[i] = sbio;
430
431 sbio->index = i;
432 sbio->sctx = sctx;
433 sbio->page_count = 0;
434 btrfs_init_work(&sbio->work, btrfs_scrub_helper,
435 scrub_bio_end_io_worker, NULL, NULL);
436
437 if (i != SCRUB_BIOS_PER_SCTX - 1)
438 sctx->bios[i]->next_free = i + 1;
439 else
440 sctx->bios[i]->next_free = -1;
441 }
442 sctx->first_free = 0;
443 sctx->nodesize = dev->dev_root->nodesize;
444 sctx->sectorsize = dev->dev_root->sectorsize;
445 atomic_set(&sctx->bios_in_flight, 0);
446 atomic_set(&sctx->workers_pending, 0);
447 atomic_set(&sctx->cancel_req, 0);
448 sctx->csum_size = btrfs_super_csum_size(fs_info->super_copy);
449 INIT_LIST_HEAD(&sctx->csum_list);
450
451 spin_lock_init(&sctx->list_lock);
452 spin_lock_init(&sctx->stat_lock);
453 init_waitqueue_head(&sctx->list_wait);
454
455 ret = scrub_setup_wr_ctx(sctx, &sctx->wr_ctx, fs_info,
456 fs_info->dev_replace.tgtdev, is_dev_replace);
457 if (ret) {
458 scrub_free_ctx(sctx);
459 return ERR_PTR(ret);
460 }
461 return sctx;
462
463 nomem:
464 scrub_free_ctx(sctx);
465 return ERR_PTR(-ENOMEM);
466 }
467
468 static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,
469 void *warn_ctx)
470 {
471 u64 isize;
472 u32 nlink;
473 int ret;
474 int i;
475 struct extent_buffer *eb;
476 struct btrfs_inode_item *inode_item;
477 struct scrub_warning *swarn = warn_ctx;
478 struct btrfs_fs_info *fs_info = swarn->dev->dev_root->fs_info;
479 struct inode_fs_paths *ipath = NULL;
480 struct btrfs_root *local_root;
481 struct btrfs_key root_key;
482
483 root_key.objectid = root;
484 root_key.type = BTRFS_ROOT_ITEM_KEY;
485 root_key.offset = (u64)-1;
486 local_root = btrfs_read_fs_root_no_name(fs_info, &root_key);
487 if (IS_ERR(local_root)) {
488 ret = PTR_ERR(local_root);
489 goto err;
490 }
491
492 ret = inode_item_info(inum, 0, local_root, swarn->path);
493 if (ret) {
494 btrfs_release_path(swarn->path);
495 goto err;
496 }
497
498 eb = swarn->path->nodes[0];
499 inode_item = btrfs_item_ptr(eb, swarn->path->slots[0],
500 struct btrfs_inode_item);
501 isize = btrfs_inode_size(eb, inode_item);
502 nlink = btrfs_inode_nlink(eb, inode_item);
503 btrfs_release_path(swarn->path);
504
505 ipath = init_ipath(4096, local_root, swarn->path);
506 if (IS_ERR(ipath)) {
507 ret = PTR_ERR(ipath);
508 ipath = NULL;
509 goto err;
510 }
511 ret = paths_from_inode(inum, ipath);
512
513 if (ret < 0)
514 goto err;
515
516 /*
517          * we deliberately ignore the fact that ipath might have been too small
518          * to hold all of the paths here
519 */
520 for (i = 0; i < ipath->fspath->elem_cnt; ++i)
521 printk_in_rcu(KERN_WARNING "BTRFS: %s at logical %llu on dev "
522 "%s, sector %llu, root %llu, inode %llu, offset %llu, "
523 "length %llu, links %u (path: %s)\n", swarn->errstr,
524 swarn->logical, rcu_str_deref(swarn->dev->name),
525 (unsigned long long)swarn->sector, root, inum, offset,
526 min(isize - offset, (u64)PAGE_SIZE), nlink,
527 (char *)(unsigned long)ipath->fspath->val[i]);
528
529 free_ipath(ipath);
530 return 0;
531
532 err:
533 printk_in_rcu(KERN_WARNING "BTRFS: %s at logical %llu on dev "
534 "%s, sector %llu, root %llu, inode %llu, offset %llu: path "
535 "resolving failed with ret=%d\n", swarn->errstr,
536 swarn->logical, rcu_str_deref(swarn->dev->name),
537 (unsigned long long)swarn->sector, root, inum, offset, ret);
538
539 free_ipath(ipath);
540 return 0;
541 }
542
543 static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
544 {
545 struct btrfs_device *dev;
546 struct btrfs_fs_info *fs_info;
547 struct btrfs_path *path;
548 struct btrfs_key found_key;
549 struct extent_buffer *eb;
550 struct btrfs_extent_item *ei;
551 struct scrub_warning swarn;
552 unsigned long ptr = 0;
553 u64 extent_item_pos;
554 u64 flags = 0;
555 u64 ref_root;
556 u32 item_size;
557 u8 ref_level;
558 int ret;
559
560 WARN_ON(sblock->page_count < 1);
561 dev = sblock->pagev[0]->dev;
562 fs_info = sblock->sctx->dev_root->fs_info;
563
564 path = btrfs_alloc_path();
565 if (!path)
566 return;
567
568 swarn.sector = (sblock->pagev[0]->physical) >> 9;
569 swarn.logical = sblock->pagev[0]->logical;
570 swarn.errstr = errstr;
571 swarn.dev = NULL;
572
573 ret = extent_from_logical(fs_info, swarn.logical, path, &found_key,
574 &flags);
575 if (ret < 0)
576 goto out;
577
578 extent_item_pos = swarn.logical - found_key.objectid;
579 swarn.extent_item_size = found_key.offset;
580
581 eb = path->nodes[0];
582 ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
583 item_size = btrfs_item_size_nr(eb, path->slots[0]);
584
585 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
586 do {
587 ret = tree_backref_for_extent(&ptr, eb, &found_key, ei,
588 item_size, &ref_root,
589 &ref_level);
590 printk_in_rcu(KERN_WARNING
591 "BTRFS: %s at logical %llu on dev %s, "
592 "sector %llu: metadata %s (level %d) in tree "
593 "%llu\n", errstr, swarn.logical,
594 rcu_str_deref(dev->name),
595 (unsigned long long)swarn.sector,
596 ref_level ? "node" : "leaf",
597 ret < 0 ? -1 : ref_level,
598 ret < 0 ? -1 : ref_root);
599 } while (ret != 1);
600 btrfs_release_path(path);
601 } else {
602 btrfs_release_path(path);
603 swarn.path = path;
604 swarn.dev = dev;
605 iterate_extent_inodes(fs_info, found_key.objectid,
606 extent_item_pos, 1,
607 scrub_print_warning_inode, &swarn);
608 }
609
610 out:
611 btrfs_free_path(path);
612 }
613
614 static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *fixup_ctx)
615 {
616 struct page *page = NULL;
617 unsigned long index;
618 struct scrub_fixup_nodatasum *fixup = fixup_ctx;
619 int ret;
620 int corrected = 0;
621 struct btrfs_key key;
622 struct inode *inode = NULL;
623 struct btrfs_fs_info *fs_info;
624 u64 end = offset + PAGE_SIZE - 1;
625 struct btrfs_root *local_root;
626 int srcu_index;
627
628 key.objectid = root;
629 key.type = BTRFS_ROOT_ITEM_KEY;
630 key.offset = (u64)-1;
631
632 fs_info = fixup->root->fs_info;
633 srcu_index = srcu_read_lock(&fs_info->subvol_srcu);
634
635 local_root = btrfs_read_fs_root_no_name(fs_info, &key);
636 if (IS_ERR(local_root)) {
637 srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
638 return PTR_ERR(local_root);
639 }
640
641 key.type = BTRFS_INODE_ITEM_KEY;
642 key.objectid = inum;
643 key.offset = 0;
644 inode = btrfs_iget(fs_info->sb, &key, local_root, NULL);
645 srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
646 if (IS_ERR(inode))
647 return PTR_ERR(inode);
648
649 index = offset >> PAGE_CACHE_SHIFT;
650
651 page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
652 if (!page) {
653 ret = -ENOMEM;
654 goto out;
655 }
656
657 if (PageUptodate(page)) {
658 if (PageDirty(page)) {
659 /*
660                          * we need to write the data to the defective sector.
661                          * The data that was in that sector is not in memory,
662                          * because the page was modified. We must not write the
663                          * modified page to that sector.
664                          *
665                          * TODO: what could be done here: wait for the delalloc
666                          *       runner to write out that page (might involve
667                          *       COW) and see whether the sector is still
668                          *       referenced afterwards.
669                          *
670                          * For the time being, we'll treat this error as
671                          * uncorrectable, although there is a chance that a
672                          * later scrub will find the bad sector again and that
673                          * no dirty page is in memory by then.
674 */
675 ret = -EIO;
676 goto out;
677 }
678 ret = repair_io_failure(inode, offset, PAGE_SIZE,
679 fixup->logical, page,
680 offset - page_offset(page),
681 fixup->mirror_num);
682 unlock_page(page);
683 corrected = !ret;
684 } else {
685 /*
686 * we need to get good data first. the general readpage path
687 * will call repair_io_failure for us, we just have to make
688 * sure we read the bad mirror.
689 */
690 ret = set_extent_bits(&BTRFS_I(inode)->io_tree, offset, end,
691 EXTENT_DAMAGED, GFP_NOFS);
692 if (ret) {
693 /* set_extent_bits should give proper error */
694 WARN_ON(ret > 0);
695 if (ret > 0)
696 ret = -EFAULT;
697 goto out;
698 }
699
700 ret = extent_read_full_page(&BTRFS_I(inode)->io_tree, page,
701 btrfs_get_extent,
702 fixup->mirror_num);
703 wait_on_page_locked(page);
704
705 corrected = !test_range_bit(&BTRFS_I(inode)->io_tree, offset,
706 end, EXTENT_DAMAGED, 0, NULL);
707 if (!corrected)
708 clear_extent_bits(&BTRFS_I(inode)->io_tree, offset, end,
709 EXTENT_DAMAGED, GFP_NOFS);
710 }
711
712 out:
713 if (page)
714 put_page(page);
715
716 iput(inode);
717
718 if (ret < 0)
719 return ret;
720
721 if (ret == 0 && corrected) {
722 /*
723 * we only need to call readpage for one of the inodes belonging
724 * to this extent. so make iterate_extent_inodes stop
725 */
726 return 1;
727 }
728
729 return -EIO;
730 }
731
732 static void scrub_fixup_nodatasum(struct btrfs_work *work)
733 {
734 int ret;
735 struct scrub_fixup_nodatasum *fixup;
736 struct scrub_ctx *sctx;
737 struct btrfs_trans_handle *trans = NULL;
738 struct btrfs_path *path;
739 int uncorrectable = 0;
740
741 fixup = container_of(work, struct scrub_fixup_nodatasum, work);
742 sctx = fixup->sctx;
743
744 path = btrfs_alloc_path();
745 if (!path) {
746 spin_lock(&sctx->stat_lock);
747 ++sctx->stat.malloc_errors;
748 spin_unlock(&sctx->stat_lock);
749 uncorrectable = 1;
750 goto out;
751 }
752
753 trans = btrfs_join_transaction(fixup->root);
754 if (IS_ERR(trans)) {
755 uncorrectable = 1;
756 goto out;
757 }
758
759 /*
760          * The idea is to trigger a regular read through the standard path. We
761          * read a page from the (failed) logical address by specifying the
762          * corresponding copy number of the failed sector. Thus, that readpage
763          * is expected to fail.
764          * That is the point where on-the-fly error correction will kick in
765          * (once the read has finished) and rewrite the failed sector if a good
766          * copy can be found.
767 */
768 ret = iterate_inodes_from_logical(fixup->logical, fixup->root->fs_info,
769 path, scrub_fixup_readpage,
770 fixup);
771 if (ret < 0) {
772 uncorrectable = 1;
773 goto out;
774 }
775 WARN_ON(ret != 1);
776
777 spin_lock(&sctx->stat_lock);
778 ++sctx->stat.corrected_errors;
779 spin_unlock(&sctx->stat_lock);
780
781 out:
782 if (trans && !IS_ERR(trans))
783 btrfs_end_transaction(trans, fixup->root);
784 if (uncorrectable) {
785 spin_lock(&sctx->stat_lock);
786 ++sctx->stat.uncorrectable_errors;
787 spin_unlock(&sctx->stat_lock);
788 btrfs_dev_replace_stats_inc(
789 &sctx->dev_root->fs_info->dev_replace.
790 num_uncorrectable_read_errors);
791 printk_ratelimited_in_rcu(KERN_ERR "BTRFS: "
792 "unable to fixup (nodatasum) error at logical %llu on dev %s\n",
793 fixup->logical, rcu_str_deref(fixup->dev->name));
794 }
795
796 btrfs_free_path(path);
797 kfree(fixup);
798
799 scrub_pending_trans_workers_dec(sctx);
800 }
801
802 static inline void scrub_get_recover(struct scrub_recover *recover)
803 {
804 atomic_inc(&recover->refs);
805 }
806
807 static inline void scrub_put_recover(struct scrub_recover *recover)
808 {
809 if (atomic_dec_and_test(&recover->refs)) {
810 kfree(recover->bbio);
811 kfree(recover->raid_map);
812 kfree(recover);
813 }
814 }
815
816 /*
817 * scrub_handle_errored_block gets called when either verification of the
818 * pages failed or the bio failed to read, e.g. with EIO. In the latter
819 * case, this function handles all pages in the bio, even though only one
820 * may be bad.
821 * The goal of this function is to repair the errored block by using the
822 * contents of one of the mirrors.
823 */
824 static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
825 {
826 struct scrub_ctx *sctx = sblock_to_check->sctx;
827 struct btrfs_device *dev;
828 struct btrfs_fs_info *fs_info;
829 u64 length;
830 u64 logical;
831 u64 generation;
832 unsigned int failed_mirror_index;
833 unsigned int is_metadata;
834 unsigned int have_csum;
835 u8 *csum;
836 struct scrub_block *sblocks_for_recheck; /* holds one for each mirror */
837 struct scrub_block *sblock_bad;
838 int ret;
839 int mirror_index;
840 int page_num;
841 int success;
842 static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
843 DEFAULT_RATELIMIT_BURST);
844
845 BUG_ON(sblock_to_check->page_count < 1);
846 fs_info = sctx->dev_root->fs_info;
847 if (sblock_to_check->pagev[0]->flags & BTRFS_EXTENT_FLAG_SUPER) {
848 /*
849                  * if we find an error in a super block, we just report it;
850                  * super blocks get rewritten with the next transaction commit
851                  * anyway
852 */
853 spin_lock(&sctx->stat_lock);
854 ++sctx->stat.super_errors;
855 spin_unlock(&sctx->stat_lock);
856 return 0;
857 }
858 length = sblock_to_check->page_count * PAGE_SIZE;
859 logical = sblock_to_check->pagev[0]->logical;
860 generation = sblock_to_check->pagev[0]->generation;
861 BUG_ON(sblock_to_check->pagev[0]->mirror_num < 1);
862 failed_mirror_index = sblock_to_check->pagev[0]->mirror_num - 1;
863 is_metadata = !(sblock_to_check->pagev[0]->flags &
864 BTRFS_EXTENT_FLAG_DATA);
865 have_csum = sblock_to_check->pagev[0]->have_csum;
866 csum = sblock_to_check->pagev[0]->csum;
867 dev = sblock_to_check->pagev[0]->dev;
868
869 if (sctx->is_dev_replace && !is_metadata && !have_csum) {
870 sblocks_for_recheck = NULL;
871 goto nodatasum_case;
872 }
873
874 /*
875          * read all mirrors one after the other. This includes re-reading
876          * the extent or metadata block that failed (which is the reason
877          * this fixup code is called), this time page by page, in order
878          * to know which pages caused I/O errors and which ones are good
879          * (for all mirrors).
880          * The goal is to handle the situation when more than one
881 * mirror contains I/O errors, but the errors do not
882 * overlap, i.e. the data can be repaired by selecting the
883 * pages from those mirrors without I/O error on the
884 * particular pages. One example (with blocks >= 2 * PAGE_SIZE)
885 * would be that mirror #1 has an I/O error on the first page,
886 * the second page is good, and mirror #2 has an I/O error on
887 * the second page, but the first page is good.
888 * Then the first page of the first mirror can be repaired by
889 * taking the first page of the second mirror, and the
890 * second page of the second mirror can be repaired by
891 * copying the contents of the 2nd page of the 1st mirror.
892 * One more note: if the pages of one mirror contain I/O
893 * errors, the checksum cannot be verified. In order to get
894 * the best data for repairing, the first attempt is to find
895 * a mirror without I/O errors and with a validated checksum.
896 * Only if this is not possible, the pages are picked from
897 * mirrors with I/O errors without considering the checksum.
898 * If the latter is the case, at the end, the checksum of the
899 * repaired area is verified in order to correctly maintain
900 * the statistics.
901 */
902
903 sblocks_for_recheck = kzalloc(BTRFS_MAX_MIRRORS *
904 sizeof(*sblocks_for_recheck),
905 GFP_NOFS);
906 if (!sblocks_for_recheck) {
907 spin_lock(&sctx->stat_lock);
908 sctx->stat.malloc_errors++;
909 sctx->stat.read_errors++;
910 sctx->stat.uncorrectable_errors++;
911 spin_unlock(&sctx->stat_lock);
912 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
913 goto out;
914 }
915
916 /* setup the context, map the logical blocks and alloc the pages */
917 ret = scrub_setup_recheck_block(sctx, fs_info, sblock_to_check, length,
918 logical, sblocks_for_recheck);
919 if (ret) {
920 spin_lock(&sctx->stat_lock);
921 sctx->stat.read_errors++;
922 sctx->stat.uncorrectable_errors++;
923 spin_unlock(&sctx->stat_lock);
924 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
925 goto out;
926 }
927 BUG_ON(failed_mirror_index >= BTRFS_MAX_MIRRORS);
928 sblock_bad = sblocks_for_recheck + failed_mirror_index;
929
930 /* build and submit the bios for the failed mirror, check checksums */
931 scrub_recheck_block(fs_info, sblock_bad, is_metadata, have_csum,
932 csum, generation, sctx->csum_size, 1);
933
934 if (!sblock_bad->header_error && !sblock_bad->checksum_error &&
935 sblock_bad->no_io_error_seen) {
936 /*
937 * the error disappeared after reading page by page, or
938 * the area was part of a huge bio and other parts of the
939 * bio caused I/O errors, or the block layer merged several
940 * read requests into one and the error is caused by a
941 * different bio (usually one of the two latter cases is
942 * the cause)
943 */
944 spin_lock(&sctx->stat_lock);
945 sctx->stat.unverified_errors++;
946 spin_unlock(&sctx->stat_lock);
947
948 if (sctx->is_dev_replace)
949 scrub_write_block_to_dev_replace(sblock_bad);
950 goto out;
951 }
952
953 if (!sblock_bad->no_io_error_seen) {
954 spin_lock(&sctx->stat_lock);
955 sctx->stat.read_errors++;
956 spin_unlock(&sctx->stat_lock);
957 if (__ratelimit(&_rs))
958 scrub_print_warning("i/o error", sblock_to_check);
959 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
960 } else if (sblock_bad->checksum_error) {
961 spin_lock(&sctx->stat_lock);
962 sctx->stat.csum_errors++;
963 spin_unlock(&sctx->stat_lock);
964 if (__ratelimit(&_rs))
965 scrub_print_warning("checksum error", sblock_to_check);
966 btrfs_dev_stat_inc_and_print(dev,
967 BTRFS_DEV_STAT_CORRUPTION_ERRS);
968 } else if (sblock_bad->header_error) {
969 spin_lock(&sctx->stat_lock);
970 sctx->stat.verify_errors++;
971 spin_unlock(&sctx->stat_lock);
972 if (__ratelimit(&_rs))
973 scrub_print_warning("checksum/header error",
974 sblock_to_check);
975 if (sblock_bad->generation_error)
976 btrfs_dev_stat_inc_and_print(dev,
977 BTRFS_DEV_STAT_GENERATION_ERRS);
978 else
979 btrfs_dev_stat_inc_and_print(dev,
980 BTRFS_DEV_STAT_CORRUPTION_ERRS);
981 }
982
983 if (sctx->readonly) {
984 ASSERT(!sctx->is_dev_replace);
985 goto out;
986 }
987
988 if (!is_metadata && !have_csum) {
989 struct scrub_fixup_nodatasum *fixup_nodatasum;
990
991 nodatasum_case:
992 WARN_ON(sctx->is_dev_replace);
993
994 /*
995                  * !is_metadata and !have_csum, this means that the data
996                  * might not be COW'ed and that it might be modified
997                  * concurrently. The general strategy of working on the
998                  * commit root does not help in the case when COW is not
999                  * used.
1000 */
1001 fixup_nodatasum = kzalloc(sizeof(*fixup_nodatasum), GFP_NOFS);
1002 if (!fixup_nodatasum)
1003 goto did_not_correct_error;
1004 fixup_nodatasum->sctx = sctx;
1005 fixup_nodatasum->dev = dev;
1006 fixup_nodatasum->logical = logical;
1007 fixup_nodatasum->root = fs_info->extent_root;
1008 fixup_nodatasum->mirror_num = failed_mirror_index + 1;
1009 scrub_pending_trans_workers_inc(sctx);
1010 btrfs_init_work(&fixup_nodatasum->work, btrfs_scrub_helper,
1011 scrub_fixup_nodatasum, NULL, NULL);
1012 btrfs_queue_work(fs_info->scrub_workers,
1013 &fixup_nodatasum->work);
1014 goto out;
1015 }
1016
1017 /*
1018 * now build and submit the bios for the other mirrors, check
1019 * checksums.
1020 * First try to pick the mirror which is completely without I/O
1021 * errors and also does not have a checksum error.
1022 * If one is found, and if a checksum is present, the full block
1023 * that is known to contain an error is rewritten. Afterwards
1024 * the block is known to be corrected.
1025 * If a mirror is found which is completely correct, and no
1026 * checksum is present, only those pages are rewritten that had
1027          * an I/O error in the block to be repaired, since it cannot be
1028          * determined which copy of the other pages is better (and it
1029 * could happen otherwise that a correct page would be
1030 * overwritten by a bad one).
1031 */
1032 for (mirror_index = 0;
1033 mirror_index < BTRFS_MAX_MIRRORS &&
1034 sblocks_for_recheck[mirror_index].page_count > 0;
1035 mirror_index++) {
1036 struct scrub_block *sblock_other;
1037
1038 if (mirror_index == failed_mirror_index)
1039 continue;
1040 sblock_other = sblocks_for_recheck + mirror_index;
1041
1042 /* build and submit the bios, check checksums */
1043 scrub_recheck_block(fs_info, sblock_other, is_metadata,
1044 have_csum, csum, generation,
1045 sctx->csum_size, 0);
1046
1047 if (!sblock_other->header_error &&
1048 !sblock_other->checksum_error &&
1049 sblock_other->no_io_error_seen) {
1050 if (sctx->is_dev_replace) {
1051 scrub_write_block_to_dev_replace(sblock_other);
1052 } else {
1053 int force_write = is_metadata || have_csum;
1054
1055 ret = scrub_repair_block_from_good_copy(
1056 sblock_bad, sblock_other,
1057 force_write);
1058 }
1059 if (0 == ret)
1060 goto corrected_error;
1061 }
1062 }
1063
1064 /*
1065 * for dev_replace, pick good pages and write to the target device.
1066 */
1067 if (sctx->is_dev_replace) {
1068 success = 1;
1069 for (page_num = 0; page_num < sblock_bad->page_count;
1070 page_num++) {
1071 int sub_success;
1072
1073 sub_success = 0;
1074 for (mirror_index = 0;
1075 mirror_index < BTRFS_MAX_MIRRORS &&
1076 sblocks_for_recheck[mirror_index].page_count > 0;
1077 mirror_index++) {
1078 struct scrub_block *sblock_other =
1079 sblocks_for_recheck + mirror_index;
1080 struct scrub_page *page_other =
1081 sblock_other->pagev[page_num];
1082
1083 if (!page_other->io_error) {
1084 ret = scrub_write_page_to_dev_replace(
1085 sblock_other, page_num);
1086 if (ret == 0) {
1087 /* succeeded for this page */
1088 sub_success = 1;
1089 break;
1090 } else {
1091 btrfs_dev_replace_stats_inc(
1092 &sctx->dev_root->
1093 fs_info->dev_replace.
1094 num_write_errors);
1095 }
1096 }
1097 }
1098
1099 if (!sub_success) {
1100 /*
1101 * did not find a mirror to fetch the page
1102 * from. scrub_write_page_to_dev_replace()
1103 * handles this case (page->io_error), by
1104 * filling the block with zeros before
1105 * submitting the write request
1106 */
1107 success = 0;
1108 ret = scrub_write_page_to_dev_replace(
1109 sblock_bad, page_num);
1110 if (ret)
1111 btrfs_dev_replace_stats_inc(
1112 &sctx->dev_root->fs_info->
1113 dev_replace.num_write_errors);
1114 }
1115 }
1116
1117 goto out;
1118 }
1119
1120 /*
1121 * for regular scrub, repair those pages that are errored.
1122 * In case of I/O errors in the area that is supposed to be
1123 * repaired, continue by picking good copies of those pages.
1124 * Select the good pages from mirrors to rewrite bad pages from
1125 * the area to fix. Afterwards verify the checksum of the block
1126 * that is supposed to be repaired. This verification step is
1127          * only done for the purpose of statistics counting and for the
1128          * final scrub report on whether errors remain.
1129          * A perfect algorithm could make use of the checksum and try
1130          * all possible combinations of pages from the different mirrors
1131          * until the checksum verification succeeds. For example, when
1132          * the 2nd page of mirror #1 faces I/O errors, and the 2nd page
1133          * of mirror #2 is readable but the final checksum test fails,
1134          * then the 2nd page of mirror #3 could be tried to see whether
1135          * the final checksum then succeeds. But this would be a rare
1136          * exception and is therefore not implemented. At least it is
1137          * avoided that the good copy is overwritten.
1138          * A more useful improvement would be to pick the sectors
1139          * without I/O errors based on sector size (512 bytes on legacy
1140          * disks) instead of on PAGE_SIZE. Then maybe 512 bytes of one
1141          * mirror could be repaired by taking 512 bytes of a different
1142          * mirror, even if other 512-byte sectors in the same PAGE_SIZE
1143 * area are unreadable.
1144 */
1145
1146 /* can only fix I/O errors from here on */
1147 if (sblock_bad->no_io_error_seen)
1148 goto did_not_correct_error;
1149
1150 success = 1;
1151 for (page_num = 0; page_num < sblock_bad->page_count; page_num++) {
1152 struct scrub_page *page_bad = sblock_bad->pagev[page_num];
1153
1154 if (!page_bad->io_error)
1155 continue;
1156
1157 for (mirror_index = 0;
1158 mirror_index < BTRFS_MAX_MIRRORS &&
1159 sblocks_for_recheck[mirror_index].page_count > 0;
1160 mirror_index++) {
1161 struct scrub_block *sblock_other = sblocks_for_recheck +
1162 mirror_index;
1163 struct scrub_page *page_other = sblock_other->pagev[
1164 page_num];
1165
1166 if (!page_other->io_error) {
1167 ret = scrub_repair_page_from_good_copy(
1168 sblock_bad, sblock_other, page_num, 0);
1169 if (0 == ret) {
1170 page_bad->io_error = 0;
1171 break; /* succeeded for this page */
1172 }
1173 }
1174 }
1175
1176 if (page_bad->io_error) {
1177 /* did not find a mirror to copy the page from */
1178 success = 0;
1179 }
1180 }
1181
1182 if (success) {
1183 if (is_metadata || have_csum) {
1184 /*
1185 * need to verify the checksum now that all
1186 * sectors on disk are repaired (the write
1187 * request for data to be repaired is on its way).
1188 * Just be lazy and use scrub_recheck_block()
1189 * which re-reads the data before the checksum
1190 * is verified, but most likely the data comes out
1191 * of the page cache.
1192 */
1193 scrub_recheck_block(fs_info, sblock_bad,
1194 is_metadata, have_csum, csum,
1195 generation, sctx->csum_size, 1);
1196 if (!sblock_bad->header_error &&
1197 !sblock_bad->checksum_error &&
1198 sblock_bad->no_io_error_seen)
1199 goto corrected_error;
1200 else
1201 goto did_not_correct_error;
1202 } else {
1203 corrected_error:
1204 spin_lock(&sctx->stat_lock);
1205 sctx->stat.corrected_errors++;
1206 spin_unlock(&sctx->stat_lock);
1207 printk_ratelimited_in_rcu(KERN_ERR
1208 "BTRFS: fixed up error at logical %llu on dev %s\n",
1209 logical, rcu_str_deref(dev->name));
1210 }
1211 } else {
1212 did_not_correct_error:
1213 spin_lock(&sctx->stat_lock);
1214 sctx->stat.uncorrectable_errors++;
1215 spin_unlock(&sctx->stat_lock);
1216 printk_ratelimited_in_rcu(KERN_ERR
1217 "BTRFS: unable to fixup (regular) error at logical %llu on dev %s\n",
1218 logical, rcu_str_deref(dev->name));
1219 }
1220
1221 out:
1222 if (sblocks_for_recheck) {
1223 for (mirror_index = 0; mirror_index < BTRFS_MAX_MIRRORS;
1224 mirror_index++) {
1225 struct scrub_block *sblock = sblocks_for_recheck +
1226 mirror_index;
1227 struct scrub_recover *recover;
1228 int page_index;
1229
1230 for (page_index = 0; page_index < sblock->page_count;
1231 page_index++) {
1232 sblock->pagev[page_index]->sblock = NULL;
1233 recover = sblock->pagev[page_index]->recover;
1234 if (recover) {
1235 scrub_put_recover(recover);
1236 sblock->pagev[page_index]->recover =
1237 NULL;
1238 }
1239 scrub_page_put(sblock->pagev[page_index]);
1240 }
1241 }
1242 kfree(sblocks_for_recheck);
1243 }
1244
1245 return 0;
1246 }
1247
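/*
 * Number of copies that the recheck code can use for recovery. If a
 * raid_map is present (RAID5/6), these are the data itself plus the P
 * stripe, plus the Q stripe for RAID6, i.e. 2 or 3. For the other
 * profiles, every stripe returned by the block mapping is a full mirror.
 */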
1248 static inline int scrub_nr_raid_mirrors(struct btrfs_bio *bbio, u64 *raid_map)
1249 {
1250 if (raid_map) {
1251 if (raid_map[bbio->num_stripes - 1] == RAID6_Q_STRIPE)
1252 return 3;
1253 else
1254 return 2;
1255 } else {
1256 return (int)bbio->num_stripes;
1257 }
1258 }
1259
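/*
 * Map a logical address to the stripe that holds it. For RAID5/6 the
 * raid_map entries give the logical start of each data stripe, so find
 * the data stripe covering @logical and return the offset into it. For
 * the mirrored profiles, the mirror number selects the stripe directly.
 */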
1260 static inline void scrub_stripe_index_and_offset(u64 logical, u64 *raid_map,
1261 u64 mapped_length,
1262 int nstripes, int mirror,
1263 int *stripe_index,
1264 u64 *stripe_offset)
1265 {
1266 int i;
1267
1268 if (raid_map) {
1269 /* RAID5/6 */
1270 for (i = 0; i < nstripes; i++) {
1271 if (raid_map[i] == RAID6_Q_STRIPE ||
1272 raid_map[i] == RAID5_P_STRIPE)
1273 continue;
1274
1275 if (logical >= raid_map[i] &&
1276 logical < raid_map[i] + mapped_length)
1277 break;
1278 }
1279
1280 *stripe_index = i;
1281 *stripe_offset = logical - raid_map[i];
1282 } else {
1283 /* The other RAID type */
1284 *stripe_index = mirror;
1285 *stripe_offset = 0;
1286 }
1287 }
1288
1289 static int scrub_setup_recheck_block(struct scrub_ctx *sctx,
1290 struct btrfs_fs_info *fs_info,
1291 struct scrub_block *original_sblock,
1292 u64 length, u64 logical,
1293 struct scrub_block *sblocks_for_recheck)
1294 {
1295 struct scrub_recover *recover;
1296 struct btrfs_bio *bbio;
1297 u64 *raid_map;
1298 u64 sublen;
1299 u64 mapped_length;
1300 u64 stripe_offset;
1301 int stripe_index;
1302 int page_index;
1303 int mirror_index;
1304 int nmirrors;
1305 int ret;
1306
1307 /*
1308 * note: the two members ref_count and outstanding_pages
1309 * are not used (and not set) in the blocks that are used for
1310 * the recheck procedure
1311 */
1312
1313 page_index = 0;
1314 while (length > 0) {
1315 sublen = min_t(u64, length, PAGE_SIZE);
1316 mapped_length = sublen;
1317 bbio = NULL;
1318 raid_map = NULL;
1319
1320 /*
1321 * with a length of PAGE_SIZE, each returned stripe
1322 * represents one mirror
1323 */
1324 ret = btrfs_map_sblock(fs_info, REQ_GET_READ_MIRRORS, logical,
1325 &mapped_length, &bbio, 0, &raid_map);
1326 if (ret || !bbio || mapped_length < sublen) {
1327 kfree(bbio);
1328 kfree(raid_map);
1329 return -EIO;
1330 }
1331
1332 recover = kzalloc(sizeof(struct scrub_recover), GFP_NOFS);
1333 if (!recover) {
1334 kfree(bbio);
1335 kfree(raid_map);
1336 return -ENOMEM;
1337 }
1338
1339 atomic_set(&recover->refs, 1);
1340 recover->bbio = bbio;
1341 recover->raid_map = raid_map;
1342 recover->map_length = mapped_length;
1343
1344 BUG_ON(page_index >= SCRUB_PAGES_PER_RD_BIO);
1345
1346 nmirrors = scrub_nr_raid_mirrors(bbio, raid_map);
1347 for (mirror_index = 0; mirror_index < nmirrors;
1348 mirror_index++) {
1349 struct scrub_block *sblock;
1350 struct scrub_page *page;
1351
1352 if (mirror_index >= BTRFS_MAX_MIRRORS)
1353 continue;
1354
1355 sblock = sblocks_for_recheck + mirror_index;
1356 sblock->sctx = sctx;
1357 page = kzalloc(sizeof(*page), GFP_NOFS);
1358 if (!page) {
1359 leave_nomem:
1360 spin_lock(&sctx->stat_lock);
1361 sctx->stat.malloc_errors++;
1362 spin_unlock(&sctx->stat_lock);
1363 scrub_put_recover(recover);
1364 return -ENOMEM;
1365 }
1366 scrub_page_get(page);
1367 sblock->pagev[page_index] = page;
1368 page->logical = logical;
1369
1370 scrub_stripe_index_and_offset(logical, raid_map,
1371 mapped_length,
1372 bbio->num_stripes,
1373 mirror_index,
1374 &stripe_index,
1375 &stripe_offset);
1376 page->physical = bbio->stripes[stripe_index].physical +
1377 stripe_offset;
1378 page->dev = bbio->stripes[stripe_index].dev;
1379
1380 BUG_ON(page_index >= original_sblock->page_count);
1381 page->physical_for_dev_replace =
1382 original_sblock->pagev[page_index]->
1383 physical_for_dev_replace;
1384 /* for missing devices, dev->bdev is NULL */
1385 page->mirror_num = mirror_index + 1;
1386 sblock->page_count++;
1387 page->page = alloc_page(GFP_NOFS);
1388 if (!page->page)
1389 goto leave_nomem;
1390
1391 scrub_get_recover(recover);
1392 page->recover = recover;
1393 }
1394 scrub_put_recover(recover);
1395 length -= sublen;
1396 logical += sublen;
1397 page_index++;
1398 }
1399
1400 return 0;
1401 }
1402
1403 struct scrub_bio_ret {
1404 struct completion event;
1405 int error;
1406 };
1407
1408 static void scrub_bio_wait_endio(struct bio *bio, int error)
1409 {
1410 struct scrub_bio_ret *ret = bio->bi_private;
1411
1412 ret->error = error;
1413 complete(&ret->event);
1414 }
1415
1416 static inline int scrub_is_page_on_raid56(struct scrub_page *page)
1417 {
1418 return page->recover && page->recover->raid_map;
1419 }
1420
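/*
 * Read one page back through the RAID5/6 recovery code and wait
 * synchronously for the result; completion is signalled through
 * scrub_bio_wait_endio() and a failed read is reported as -EIO.
 */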
1421 static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info,
1422 struct bio *bio,
1423 struct scrub_page *page)
1424 {
1425 struct scrub_bio_ret done;
1426 int ret;
1427
1428 init_completion(&done.event);
1429 done.error = 0;
1430 bio->bi_iter.bi_sector = page->logical >> 9;
1431 bio->bi_private = &done;
1432 bio->bi_end_io = scrub_bio_wait_endio;
1433
1434 ret = raid56_parity_recover(fs_info->fs_root, bio, page->recover->bbio,
1435 page->recover->raid_map,
1436 page->recover->map_length,
1437 page->mirror_num, 1);
1438 if (ret)
1439 return ret;
1440
1441 wait_for_completion(&done.event);
1442 if (done.error)
1443 return -EIO;
1444
1445 return 0;
1446 }
1447
1448 /*
1449 * this function will check the on disk data for checksum errors, header
1450 * errors and read I/O errors. If any I/O errors happen, the exact pages
1451 * which are errored are marked as being bad. The goal is to enable scrub
1452 * to take those pages that are not errored from all the mirrors so that
1453 * the pages that are errored in the just handled mirror can be repaired.
1454 */
1455 static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
1456 struct scrub_block *sblock, int is_metadata,
1457 int have_csum, u8 *csum, u64 generation,
1458 u16 csum_size, int retry_failed_mirror)
1459 {
1460 int page_num;
1461
1462 sblock->no_io_error_seen = 1;
1463 sblock->header_error = 0;
1464 sblock->checksum_error = 0;
1465
1466 for (page_num = 0; page_num < sblock->page_count; page_num++) {
1467 struct bio *bio;
1468 struct scrub_page *page = sblock->pagev[page_num];
1469
1470 if (page->dev->bdev == NULL) {
1471 page->io_error = 1;
1472 sblock->no_io_error_seen = 0;
1473 continue;
1474 }
1475
1476 WARN_ON(!page->page);
1477 bio = btrfs_io_bio_alloc(GFP_NOFS, 1);
1478 if (!bio) {
1479 page->io_error = 1;
1480 sblock->no_io_error_seen = 0;
1481 continue;
1482 }
1483 bio->bi_bdev = page->dev->bdev;
1484
1485 bio_add_page(bio, page->page, PAGE_SIZE, 0);
1486 if (!retry_failed_mirror && scrub_is_page_on_raid56(page)) {
1487 if (scrub_submit_raid56_bio_wait(fs_info, bio, page))
1488 sblock->no_io_error_seen = 0;
1489 } else {
1490 bio->bi_iter.bi_sector = page->physical >> 9;
1491
1492 if (btrfsic_submit_bio_wait(READ, bio))
1493 sblock->no_io_error_seen = 0;
1494 }
1495
1496 bio_put(bio);
1497 }
1498
1499 if (sblock->no_io_error_seen)
1500 scrub_recheck_block_checksum(fs_info, sblock, is_metadata,
1501 have_csum, csum, generation,
1502 csum_size);
1503
1504 return;
1505 }
1506
1507 static inline int scrub_check_fsid(u8 fsid[],
1508 struct scrub_page *spage)
1509 {
1510 struct btrfs_fs_devices *fs_devices = spage->dev->fs_devices;
1511 int ret;
1512
1513 ret = memcmp(fsid, fs_devices->fsid, BTRFS_UUID_SIZE);
1514 return !ret;
1515 }
1516
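/*
 * Verify the checksum (and, for metadata, the header fields) of a block
 * that was re-read without I/O errors. For tree blocks the expected
 * checksum comes from the on-disk header; for data blocks the checksum
 * passed in by the caller is used, and blocks without a checksum are
 * skipped. The crc is accumulated page by page over the whole block.
 */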
1517 static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
1518 struct scrub_block *sblock,
1519 int is_metadata, int have_csum,
1520 const u8 *csum, u64 generation,
1521 u16 csum_size)
1522 {
1523 int page_num;
1524 u8 calculated_csum[BTRFS_CSUM_SIZE];
1525 u32 crc = ~(u32)0;
1526 void *mapped_buffer;
1527
1528 WARN_ON(!sblock->pagev[0]->page);
1529 if (is_metadata) {
1530 struct btrfs_header *h;
1531
1532 mapped_buffer = kmap_atomic(sblock->pagev[0]->page);
1533 h = (struct btrfs_header *)mapped_buffer;
1534
1535 if (sblock->pagev[0]->logical != btrfs_stack_header_bytenr(h) ||
1536 !scrub_check_fsid(h->fsid, sblock->pagev[0]) ||
1537 memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
1538 BTRFS_UUID_SIZE)) {
1539 sblock->header_error = 1;
1540 } else if (generation != btrfs_stack_header_generation(h)) {
1541 sblock->header_error = 1;
1542 sblock->generation_error = 1;
1543 }
1544 csum = h->csum;
1545 } else {
1546 if (!have_csum)
1547 return;
1548
1549 mapped_buffer = kmap_atomic(sblock->pagev[0]->page);
1550 }
1551
1552 for (page_num = 0;;) {
1553 if (page_num == 0 && is_metadata)
1554 crc = btrfs_csum_data(
1555 ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE,
1556 crc, PAGE_SIZE - BTRFS_CSUM_SIZE);
1557 else
1558 crc = btrfs_csum_data(mapped_buffer, crc, PAGE_SIZE);
1559
1560 kunmap_atomic(mapped_buffer);
1561 page_num++;
1562 if (page_num >= sblock->page_count)
1563 break;
1564 WARN_ON(!sblock->pagev[page_num]->page);
1565
1566 mapped_buffer = kmap_atomic(sblock->pagev[page_num]->page);
1567 }
1568
1569 btrfs_csum_final(crc, calculated_csum);
1570 if (memcmp(calculated_csum, csum, csum_size))
1571 sblock->checksum_error = 1;
1572 }
1573
1574 static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
1575 struct scrub_block *sblock_good,
1576 int force_write)
1577 {
1578 int page_num;
1579 int ret = 0;
1580
1581 for (page_num = 0; page_num < sblock_bad->page_count; page_num++) {
1582 int ret_sub;
1583
1584 ret_sub = scrub_repair_page_from_good_copy(sblock_bad,
1585 sblock_good,
1586 page_num,
1587 force_write);
1588 if (ret_sub)
1589 ret = ret_sub;
1590 }
1591
1592 return ret;
1593 }
1594
1595 static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
1596 struct scrub_block *sblock_good,
1597 int page_num, int force_write)
1598 {
1599 struct scrub_page *page_bad = sblock_bad->pagev[page_num];
1600 struct scrub_page *page_good = sblock_good->pagev[page_num];
1601
1602 BUG_ON(page_bad->page == NULL);
1603 BUG_ON(page_good->page == NULL);
1604 if (force_write || sblock_bad->header_error ||
1605 sblock_bad->checksum_error || page_bad->io_error) {
1606 struct bio *bio;
1607 int ret;
1608
1609 if (!page_bad->dev->bdev) {
1610 printk_ratelimited(KERN_WARNING "BTRFS: "
1611 "scrub_repair_page_from_good_copy(bdev == NULL) "
1612 "is unexpected!\n");
1613 return -EIO;
1614 }
1615
1616 bio = btrfs_io_bio_alloc(GFP_NOFS, 1);
1617 if (!bio)
1618 return -EIO;
1619 bio->bi_bdev = page_bad->dev->bdev;
1620 bio->bi_iter.bi_sector = page_bad->physical >> 9;
1621
1622 ret = bio_add_page(bio, page_good->page, PAGE_SIZE, 0);
1623 if (PAGE_SIZE != ret) {
1624 bio_put(bio);
1625 return -EIO;
1626 }
1627
1628 if (btrfsic_submit_bio_wait(WRITE, bio)) {
1629 btrfs_dev_stat_inc_and_print(page_bad->dev,
1630 BTRFS_DEV_STAT_WRITE_ERRS);
1631 btrfs_dev_replace_stats_inc(
1632 &sblock_bad->sctx->dev_root->fs_info->
1633 dev_replace.num_write_errors);
1634 bio_put(bio);
1635 return -EIO;
1636 }
1637 bio_put(bio);
1638 }
1639
1640 return 0;
1641 }
1642
1643 static void scrub_write_block_to_dev_replace(struct scrub_block *sblock)
1644 {
1645 int page_num;
1646
1647 for (page_num = 0; page_num < sblock->page_count; page_num++) {
1648 int ret;
1649
1650 ret = scrub_write_page_to_dev_replace(sblock, page_num);
1651 if (ret)
1652 btrfs_dev_replace_stats_inc(
1653 &sblock->sctx->dev_root->fs_info->dev_replace.
1654 num_write_errors);
1655 }
1656 }
1657
1658 static int scrub_write_page_to_dev_replace(struct scrub_block *sblock,
1659 int page_num)
1660 {
1661 struct scrub_page *spage = sblock->pagev[page_num];
1662
1663 BUG_ON(spage->page == NULL);
1664 if (spage->io_error) {
1665 void *mapped_buffer = kmap_atomic(spage->page);
1666
1667 memset(mapped_buffer, 0, PAGE_CACHE_SIZE);
1668 flush_dcache_page(spage->page);
1669 kunmap_atomic(mapped_buffer);
1670 }
1671 return scrub_add_page_to_wr_bio(sblock->sctx, spage);
1672 }
1673
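/*
 * Queue one page for writing to the dev-replace target. Pages are
 * collected into the current write bio as long as they are physically
 * and logically contiguous with it; otherwise, or when the bio is full,
 * the bio is submitted via scrub_wr_submit() and a new one is started.
 */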
1674 static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
1675 struct scrub_page *spage)
1676 {
1677 struct scrub_wr_ctx *wr_ctx = &sctx->wr_ctx;
1678 struct scrub_bio *sbio;
1679 int ret;
1680
1681 mutex_lock(&wr_ctx->wr_lock);
1682 again:
1683 if (!wr_ctx->wr_curr_bio) {
1684 wr_ctx->wr_curr_bio = kzalloc(sizeof(*wr_ctx->wr_curr_bio),
1685 GFP_NOFS);
1686 if (!wr_ctx->wr_curr_bio) {
1687 mutex_unlock(&wr_ctx->wr_lock);
1688 return -ENOMEM;
1689 }
1690 wr_ctx->wr_curr_bio->sctx = sctx;
1691 wr_ctx->wr_curr_bio->page_count = 0;
1692 }
1693 sbio = wr_ctx->wr_curr_bio;
1694 if (sbio->page_count == 0) {
1695 struct bio *bio;
1696
1697 sbio->physical = spage->physical_for_dev_replace;
1698 sbio->logical = spage->logical;
1699 sbio->dev = wr_ctx->tgtdev;
1700 bio = sbio->bio;
1701 if (!bio) {
1702 bio = btrfs_io_bio_alloc(GFP_NOFS, wr_ctx->pages_per_wr_bio);
1703 if (!bio) {
1704 mutex_unlock(&wr_ctx->wr_lock);
1705 return -ENOMEM;
1706 }
1707 sbio->bio = bio;
1708 }
1709
1710 bio->bi_private = sbio;
1711 bio->bi_end_io = scrub_wr_bio_end_io;
1712 bio->bi_bdev = sbio->dev->bdev;
1713 bio->bi_iter.bi_sector = sbio->physical >> 9;
1714 sbio->err = 0;
1715 } else if (sbio->physical + sbio->page_count * PAGE_SIZE !=
1716 spage->physical_for_dev_replace ||
1717 sbio->logical + sbio->page_count * PAGE_SIZE !=
1718 spage->logical) {
1719 scrub_wr_submit(sctx);
1720 goto again;
1721 }
1722
1723 ret = bio_add_page(sbio->bio, spage->page, PAGE_SIZE, 0);
1724 if (ret != PAGE_SIZE) {
1725 if (sbio->page_count < 1) {
1726 bio_put(sbio->bio);
1727 sbio->bio = NULL;
1728 mutex_unlock(&wr_ctx->wr_lock);
1729 return -EIO;
1730 }
1731 scrub_wr_submit(sctx);
1732 goto again;
1733 }
1734
1735 sbio->pagev[sbio->page_count] = spage;
1736 scrub_page_get(spage);
1737 sbio->page_count++;
1738 if (sbio->page_count == wr_ctx->pages_per_wr_bio)
1739 scrub_wr_submit(sctx);
1740 mutex_unlock(&wr_ctx->wr_lock);
1741
1742 return 0;
1743 }
1744
1745 static void scrub_wr_submit(struct scrub_ctx *sctx)
1746 {
1747 struct scrub_wr_ctx *wr_ctx = &sctx->wr_ctx;
1748 struct scrub_bio *sbio;
1749
1750 if (!wr_ctx->wr_curr_bio)
1751 return;
1752
1753 sbio = wr_ctx->wr_curr_bio;
1754 wr_ctx->wr_curr_bio = NULL;
1755 WARN_ON(!sbio->bio->bi_bdev);
1756 scrub_pending_bio_inc(sctx);
1757         /* process all writes in a single worker thread, so that the block
1758          * layer can order the requests before sending them to the driver;
1759          * this doubled the write performance on spinning disks when
1760          * measured with Linux 3.5 */
1761 btrfsic_submit_bio(WRITE, sbio->bio);
1762 }
1763
1764 static void scrub_wr_bio_end_io(struct bio *bio, int err)
1765 {
1766 struct scrub_bio *sbio = bio->bi_private;
1767 struct btrfs_fs_info *fs_info = sbio->dev->dev_root->fs_info;
1768
1769 sbio->err = err;
1770 sbio->bio = bio;
1771
1772 btrfs_init_work(&sbio->work, btrfs_scrubwrc_helper,
1773 scrub_wr_bio_end_io_worker, NULL, NULL);
1774 btrfs_queue_work(fs_info->scrub_wr_completion_workers, &sbio->work);
1775 }
1776
1777 static void scrub_wr_bio_end_io_worker(struct btrfs_work *work)
1778 {
1779 struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
1780 struct scrub_ctx *sctx = sbio->sctx;
1781 int i;
1782
1783 WARN_ON(sbio->page_count > SCRUB_PAGES_PER_WR_BIO);
1784 if (sbio->err) {
1785 struct btrfs_dev_replace *dev_replace =
1786 &sbio->sctx->dev_root->fs_info->dev_replace;
1787
1788 for (i = 0; i < sbio->page_count; i++) {
1789 struct scrub_page *spage = sbio->pagev[i];
1790
1791 spage->io_error = 1;
1792 btrfs_dev_replace_stats_inc(&dev_replace->
1793 num_write_errors);
1794 }
1795 }
1796
1797 for (i = 0; i < sbio->page_count; i++)
1798 scrub_page_put(sbio->pagev[i]);
1799
1800 bio_put(sbio->bio);
1801 kfree(sbio);
1802 scrub_pending_bio_dec(sctx);
1803 }
1804
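/*
 * Dispatch checksum verification according to the extent flags of the
 * first page. The result for super blocks is deliberately ignored here,
 * so super block errors never trigger scrub_handle_errored_block() from
 * this path.
 */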
1805 static int scrub_checksum(struct scrub_block *sblock)
1806 {
1807 u64 flags;
1808 int ret;
1809
1810 WARN_ON(sblock->page_count < 1);
1811 flags = sblock->pagev[0]->flags;
1812 ret = 0;
1813 if (flags & BTRFS_EXTENT_FLAG_DATA)
1814 ret = scrub_checksum_data(sblock);
1815 else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
1816 ret = scrub_checksum_tree_block(sblock);
1817 else if (flags & BTRFS_EXTENT_FLAG_SUPER)
1818 (void)scrub_checksum_super(sblock);
1819 else
1820 WARN_ON(1);
1821 if (ret)
1822 scrub_handle_errored_block(sblock);
1823
1824 return ret;
1825 }
1826
1827 static int scrub_checksum_data(struct scrub_block *sblock)
1828 {
1829 struct scrub_ctx *sctx = sblock->sctx;
1830 u8 csum[BTRFS_CSUM_SIZE];
1831 u8 *on_disk_csum;
1832 struct page *page;
1833 void *buffer;
1834 u32 crc = ~(u32)0;
1835 int fail = 0;
1836 u64 len;
1837 int index;
1838
1839 BUG_ON(sblock->page_count < 1);
1840 if (!sblock->pagev[0]->have_csum)
1841 return 0;
1842
1843 on_disk_csum = sblock->pagev[0]->csum;
1844 page = sblock->pagev[0]->page;
1845 buffer = kmap_atomic(page);
1846
1847 len = sctx->sectorsize;
1848 index = 0;
1849 for (;;) {
1850 u64 l = min_t(u64, len, PAGE_SIZE);
1851
1852 crc = btrfs_csum_data(buffer, crc, l);
1853 kunmap_atomic(buffer);
1854 len -= l;
1855 if (len == 0)
1856 break;
1857 index++;
1858 BUG_ON(index >= sblock->page_count);
1859 BUG_ON(!sblock->pagev[index]->page);
1860 page = sblock->pagev[index]->page;
1861 buffer = kmap_atomic(page);
1862 }
1863
1864 btrfs_csum_final(crc, csum);
1865 if (memcmp(csum, on_disk_csum, sctx->csum_size))
1866 fail = 1;
1867
1868 return fail;
1869 }
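
/*
 * Illustration only -- not part of scrub.c. The loop in scrub_checksum_data()
 * above feeds one kmapped page after another into the same CRC accumulator,
 * which works because CRC32C can be computed incrementally. Below is a
 * minimal user-space sketch of that property; the bitwise crc32c() helper,
 * the seed/final inversion and the buffer contents are assumptions standing
 * in for btrfs_csum_data()/btrfs_csum_final(), not kernel code.
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static uint32_t crc32c(uint32_t crc, const void *buf, size_t len)
{
        const uint8_t *p = buf;

        while (len--) {
                crc ^= *p++;
                for (int k = 0; k < 8; k++)
                        crc = (crc >> 1) ^ (0x82F63B78 & -(crc & 1));
        }
        return crc;
}

int main(void)
{
        uint8_t data[8192];                     /* pretend: a 2-page data block */

        memset(data, 0xab, sizeof(data));

        /* one pass over the whole buffer ... */
        uint32_t whole = ~crc32c(~(uint32_t)0, data, sizeof(data));

        /* ... equals two passes, 4096 bytes ("one page") at a time */
        uint32_t crc = ~(uint32_t)0;
        crc = crc32c(crc, data, 4096);
        crc = crc32c(crc, data + 4096, 4096);
        uint32_t chunked = ~crc;

        assert(whole == chunked);
        printf("crc32c: whole=0x%08x chunked=0x%08x\n", whole, chunked);
        return 0;
}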
1870
1871 static int scrub_checksum_tree_block(struct scrub_block *sblock)
1872 {
1873 struct scrub_ctx *sctx = sblock->sctx;
1874 struct btrfs_header *h;
1875 struct btrfs_root *root = sctx->dev_root;
1876 struct btrfs_fs_info *fs_info = root->fs_info;
1877 u8 calculated_csum[BTRFS_CSUM_SIZE];
1878 u8 on_disk_csum[BTRFS_CSUM_SIZE];
1879 struct page *page;
1880 void *mapped_buffer;
1881 u64 mapped_size;
1882 void *p;
1883 u32 crc = ~(u32)0;
1884 int fail = 0;
1885 int crc_fail = 0;
1886 u64 len;
1887 int index;
1888
1889 BUG_ON(sblock->page_count < 1);
1890 page = sblock->pagev[0]->page;
1891 mapped_buffer = kmap_atomic(page);
1892 h = (struct btrfs_header *)mapped_buffer;
1893 memcpy(on_disk_csum, h->csum, sctx->csum_size);
1894
1895 /*
1896 * we don't use the getter functions here, as we
1897 * a) don't have an extent buffer and
1898 * b) the page is already kmapped
1899 */
1900
1901 if (sblock->pagev[0]->logical != btrfs_stack_header_bytenr(h))
1902 ++fail;
1903
1904 if (sblock->pagev[0]->generation != btrfs_stack_header_generation(h))
1905 ++fail;
1906
1907 if (!scrub_check_fsid(h->fsid, sblock->pagev[0]))
1908 ++fail;
1909
1910 if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
1911 BTRFS_UUID_SIZE))
1912 ++fail;
1913
1914 len = sctx->nodesize - BTRFS_CSUM_SIZE;
1915 mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE;
1916 p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE;
1917 index = 0;
1918 for (;;) {
1919 u64 l = min_t(u64, len, mapped_size);
1920
1921 crc = btrfs_csum_data(p, crc, l);
1922 kunmap_atomic(mapped_buffer);
1923 len -= l;
1924 if (len == 0)
1925 break;
1926 index++;
1927 BUG_ON(index >= sblock->page_count);
1928 BUG_ON(!sblock->pagev[index]->page);
1929 page = sblock->pagev[index]->page;
1930 mapped_buffer = kmap_atomic(page);
1931 mapped_size = PAGE_SIZE;
1932 p = mapped_buffer;
1933 }
1934
1935 btrfs_csum_final(crc, calculated_csum);
1936 if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size))
1937 ++crc_fail;
1938
1939 return fail || crc_fail;
1940 }
1941
1942 static int scrub_checksum_super(struct scrub_block *sblock)
1943 {
1944 struct btrfs_super_block *s;
1945 struct scrub_ctx *sctx = sblock->sctx;
1946 u8 calculated_csum[BTRFS_CSUM_SIZE];
1947 u8 on_disk_csum[BTRFS_CSUM_SIZE];
1948 struct page *page;
1949 void *mapped_buffer;
1950 u64 mapped_size;
1951 void *p;
1952 u32 crc = ~(u32)0;
1953 int fail_gen = 0;
1954 int fail_cor = 0;
1955 u64 len;
1956 int index;
1957
1958 BUG_ON(sblock->page_count < 1);
1959 page = sblock->pagev[0]->page;
1960 mapped_buffer = kmap_atomic(page);
1961 s = (struct btrfs_super_block *)mapped_buffer;
1962 memcpy(on_disk_csum, s->csum, sctx->csum_size);
1963
1964 if (sblock->pagev[0]->logical != btrfs_super_bytenr(s))
1965 ++fail_cor;
1966
1967 if (sblock->pagev[0]->generation != btrfs_super_generation(s))
1968 ++fail_gen;
1969
1970 if (!scrub_check_fsid(s->fsid, sblock->pagev[0]))
1971 ++fail_cor;
1972
1973 len = BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE;
1974 mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE;
1975 p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE;
1976 index = 0;
1977 for (;;) {
1978 u64 l = min_t(u64, len, mapped_size);
1979
1980 crc = btrfs_csum_data(p, crc, l);
1981 kunmap_atomic(mapped_buffer);
1982 len -= l;
1983 if (len == 0)
1984 break;
1985 index++;
1986 BUG_ON(index >= sblock->page_count);
1987 BUG_ON(!sblock->pagev[index]->page);
1988 page = sblock->pagev[index]->page;
1989 mapped_buffer = kmap_atomic(page);
1990 mapped_size = PAGE_SIZE;
1991 p = mapped_buffer;
1992 }
1993
1994 btrfs_csum_final(crc, calculated_csum);
1995 if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size))
1996 ++fail_cor;
1997
1998 if (fail_cor + fail_gen) {
1999 /*
2000 * if we find an error in a super block, we just report it.
2001 * The super blocks will get rewritten with the next
2002 * transaction commit anyway
2003 */
2004 spin_lock(&sctx->stat_lock);
2005 ++sctx->stat.super_errors;
2006 spin_unlock(&sctx->stat_lock);
2007 if (fail_cor)
2008 btrfs_dev_stat_inc_and_print(sblock->pagev[0]->dev,
2009 BTRFS_DEV_STAT_CORRUPTION_ERRS);
2010 else
2011 btrfs_dev_stat_inc_and_print(sblock->pagev[0]->dev,
2012 BTRFS_DEV_STAT_GENERATION_ERRS);
2013 }
2014
2015 return fail_cor + fail_gen;
2016 }
2017
2018 static void scrub_block_get(struct scrub_block *sblock)
2019 {
2020 atomic_inc(&sblock->ref_count);
2021 }
2022
2023 static void scrub_block_put(struct scrub_block *sblock)
2024 {
2025 if (atomic_dec_and_test(&sblock->ref_count)) {
2026 int i;
2027
2028 for (i = 0; i < sblock->page_count; i++)
2029 scrub_page_put(sblock->pagev[i]);
2030 kfree(sblock);
2031 }
2032 }
2033
2034 static void scrub_page_get(struct scrub_page *spage)
2035 {
2036 atomic_inc(&spage->ref_count);
2037 }
2038
2039 static void scrub_page_put(struct scrub_page *spage)
2040 {
2041 if (atomic_dec_and_test(&spage->ref_count)) {
2042 if (spage->page)
2043 __free_page(spage->page);
2044 kfree(spage);
2045 }
2046 }
2047
2048 static void scrub_submit(struct scrub_ctx *sctx)
2049 {
2050 struct scrub_bio *sbio;
2051
2052 if (sctx->curr == -1)
2053 return;
2054
2055 sbio = sctx->bios[sctx->curr];
2056 sctx->curr = -1;
2057 scrub_pending_bio_inc(sctx);
2058
2059 if (!sbio->bio->bi_bdev) {
2060 /*
2061 * this case should not happen. If btrfs_map_block() is
2062 * wrong, it could happen for dev-replace operations on
2063 * missing devices when no mirrors are available, but in
2064 * this case it should already fail the mount.
2065 * This case is handled correctly (but _very_ slowly).
2066 */
2067 printk_ratelimited(KERN_WARNING
2068 "BTRFS: scrub_submit(bio bdev == NULL) is unexpected!\n");
2069 bio_endio(sbio->bio, -EIO);
2070 } else {
2071 btrfsic_submit_bio(READ, sbio->bio);
2072 }
2073 }
2074
2075 static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx,
2076 struct scrub_page *spage)
2077 {
2078 struct scrub_block *sblock = spage->sblock;
2079 struct scrub_bio *sbio;
2080 int ret;
2081
2082 again:
2083 /*
2084 * grab a fresh bio or wait for one to become available
2085 */
2086 while (sctx->curr == -1) {
2087 spin_lock(&sctx->list_lock);
2088 sctx->curr = sctx->first_free;
2089 if (sctx->curr != -1) {
2090 sctx->first_free = sctx->bios[sctx->curr]->next_free;
2091 sctx->bios[sctx->curr]->next_free = -1;
2092 sctx->bios[sctx->curr]->page_count = 0;
2093 spin_unlock(&sctx->list_lock);
2094 } else {
2095 spin_unlock(&sctx->list_lock);
2096 wait_event(sctx->list_wait, sctx->first_free != -1);
2097 }
2098 }
2099 sbio = sctx->bios[sctx->curr];
2100 if (sbio->page_count == 0) {
2101 struct bio *bio;
2102
2103 sbio->physical = spage->physical;
2104 sbio->logical = spage->logical;
2105 sbio->dev = spage->dev;
2106 bio = sbio->bio;
2107 if (!bio) {
2108 bio = btrfs_io_bio_alloc(GFP_NOFS, sctx->pages_per_rd_bio);
2109 if (!bio)
2110 return -ENOMEM;
2111 sbio->bio = bio;
2112 }
2113
2114 bio->bi_private = sbio;
2115 bio->bi_end_io = scrub_bio_end_io;
2116 bio->bi_bdev = sbio->dev->bdev;
2117 bio->bi_iter.bi_sector = sbio->physical >> 9;
2118 sbio->err = 0;
2119 } else if (sbio->physical + sbio->page_count * PAGE_SIZE !=
2120 spage->physical ||
2121 sbio->logical + sbio->page_count * PAGE_SIZE !=
2122 spage->logical ||
2123 sbio->dev != spage->dev) {
2124 scrub_submit(sctx);
2125 goto again;
2126 }
2127
2128 sbio->pagev[sbio->page_count] = spage;
2129 ret = bio_add_page(sbio->bio, spage->page, PAGE_SIZE, 0);
2130 if (ret != PAGE_SIZE) {
2131 if (sbio->page_count < 1) {
2132 bio_put(sbio->bio);
2133 sbio->bio = NULL;
2134 return -EIO;
2135 }
2136 scrub_submit(sctx);
2137 goto again;
2138 }
2139
2140 scrub_block_get(sblock); /* one for the page added to the bio */
2141 atomic_inc(&sblock->outstanding_pages);
2142 sbio->page_count++;
2143 if (sbio->page_count == sctx->pages_per_rd_bio)
2144 scrub_submit(sctx);
2145
2146 return 0;
2147 }
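
/*
 * Illustration only -- not part of scrub.c. scrub_add_page_to_rd_bio() above
 * only adds a page to the current bio when it is physically and logically
 * contiguous with the pages already queued (and on the same device);
 * otherwise the bio is submitted and a new batch is started. A minimal
 * user-space sketch of that batching rule, with a made-up page size and a
 * made-up sequence of physical offsets.
 */
#include <stdint.h>
#include <stdio.h>

#define EX_PAGE_SIZE 4096ULL

int main(void)
{
        /* physical offsets of the pages to read, in submission order */
        const uint64_t pages[] = { 0, 4096, 8192, 65536, 69632 };
        uint64_t batch_start = pages[0];
        int batch_count = 0;

        for (int i = 0; i < 5; i++) {
                if (batch_count &&
                    batch_start + batch_count * EX_PAGE_SIZE != pages[i]) {
                        printf("submit bio: start=%llu pages=%d\n",
                               (unsigned long long)batch_start, batch_count);
                        batch_start = pages[i];
                        batch_count = 0;
                }
                batch_count++;          /* page joins the current bio */
        }
        printf("submit bio: start=%llu pages=%d\n",
               (unsigned long long)batch_start, batch_count);
        return 0;
}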
2148
2149 static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
2150 u64 physical, struct btrfs_device *dev, u64 flags,
2151 u64 gen, int mirror_num, u8 *csum, int force,
2152 u64 physical_for_dev_replace)
2153 {
2154 struct scrub_block *sblock;
2155 int index;
2156
2157 sblock = kzalloc(sizeof(*sblock), GFP_NOFS);
2158 if (!sblock) {
2159 spin_lock(&sctx->stat_lock);
2160 sctx->stat.malloc_errors++;
2161 spin_unlock(&sctx->stat_lock);
2162 return -ENOMEM;
2163 }
2164
2165 /* one ref inside this function, plus one for each page added to
2166 * a bio later on */
2167 atomic_set(&sblock->ref_count, 1);
2168 sblock->sctx = sctx;
2169 sblock->no_io_error_seen = 1;
2170
2171 for (index = 0; len > 0; index++) {
2172 struct scrub_page *spage;
2173 u64 l = min_t(u64, len, PAGE_SIZE);
2174
2175 spage = kzalloc(sizeof(*spage), GFP_NOFS);
2176 if (!spage) {
2177 leave_nomem:
2178 spin_lock(&sctx->stat_lock);
2179 sctx->stat.malloc_errors++;
2180 spin_unlock(&sctx->stat_lock);
2181 scrub_block_put(sblock);
2182 return -ENOMEM;
2183 }
2184 BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK);
2185 scrub_page_get(spage);
2186 sblock->pagev[index] = spage;
2187 spage->sblock = sblock;
2188 spage->dev = dev;
2189 spage->flags = flags;
2190 spage->generation = gen;
2191 spage->logical = logical;
2192 spage->physical = physical;
2193 spage->physical_for_dev_replace = physical_for_dev_replace;
2194 spage->mirror_num = mirror_num;
2195 if (csum) {
2196 spage->have_csum = 1;
2197 memcpy(spage->csum, csum, sctx->csum_size);
2198 } else {
2199 spage->have_csum = 0;
2200 }
2201 sblock->page_count++;
2202 spage->page = alloc_page(GFP_NOFS);
2203 if (!spage->page)
2204 goto leave_nomem;
2205 len -= l;
2206 logical += l;
2207 physical += l;
2208 physical_for_dev_replace += l;
2209 }
2210
2211 WARN_ON(sblock->page_count == 0);
2212 for (index = 0; index < sblock->page_count; index++) {
2213 struct scrub_page *spage = sblock->pagev[index];
2214 int ret;
2215
2216 ret = scrub_add_page_to_rd_bio(sctx, spage);
2217 if (ret) {
2218 scrub_block_put(sblock);
2219 return ret;
2220 }
2221 }
2222
2223 if (force)
2224 scrub_submit(sctx);
2225
2226 /* last one frees, either here or in bio completion for last page */
2227 scrub_block_put(sblock);
2228 return 0;
2229 }
2230
2231 static void scrub_bio_end_io(struct bio *bio, int err)
2232 {
2233 struct scrub_bio *sbio = bio->bi_private;
2234 struct btrfs_fs_info *fs_info = sbio->dev->dev_root->fs_info;
2235
2236 sbio->err = err;
2237 sbio->bio = bio;
2238
2239 btrfs_queue_work(fs_info->scrub_workers, &sbio->work);
2240 }
2241
2242 static void scrub_bio_end_io_worker(struct btrfs_work *work)
2243 {
2244 struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
2245 struct scrub_ctx *sctx = sbio->sctx;
2246 int i;
2247
2248 BUG_ON(sbio->page_count > SCRUB_PAGES_PER_RD_BIO);
2249 if (sbio->err) {
2250 for (i = 0; i < sbio->page_count; i++) {
2251 struct scrub_page *spage = sbio->pagev[i];
2252
2253 spage->io_error = 1;
2254 spage->sblock->no_io_error_seen = 0;
2255 }
2256 }
2257
2258 /* now complete the scrub_block items that have all pages completed */
2259 for (i = 0; i < sbio->page_count; i++) {
2260 struct scrub_page *spage = sbio->pagev[i];
2261 struct scrub_block *sblock = spage->sblock;
2262
2263 if (atomic_dec_and_test(&sblock->outstanding_pages))
2264 scrub_block_complete(sblock);
2265 scrub_block_put(sblock);
2266 }
2267
2268 bio_put(sbio->bio);
2269 sbio->bio = NULL;
2270 spin_lock(&sctx->list_lock);
2271 sbio->next_free = sctx->first_free;
2272 sctx->first_free = sbio->index;
2273 spin_unlock(&sctx->list_lock);
2274
2275 if (sctx->is_dev_replace &&
2276 atomic_read(&sctx->wr_ctx.flush_all_writes)) {
2277 mutex_lock(&sctx->wr_ctx.wr_lock);
2278 scrub_wr_submit(sctx);
2279 mutex_unlock(&sctx->wr_ctx.wr_lock);
2280 }
2281
2282 scrub_pending_bio_dec(sctx);
2283 }
2284
2285 static void scrub_block_complete(struct scrub_block *sblock)
2286 {
2287 if (!sblock->no_io_error_seen) {
2288 scrub_handle_errored_block(sblock);
2289 } else {
2290 /*
2291 * In the dev-replace case: if the block has a checksum
2292 * error, it is written via the repair mechanism; otherwise
2293 * write it to the target device here.
2294 */
2295 if (!scrub_checksum(sblock) && sblock->sctx->is_dev_replace)
2296 scrub_write_block_to_dev_replace(sblock);
2297 }
2298 }
2299
2300 static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u64 len,
2301 u8 *csum)
2302 {
2303 struct btrfs_ordered_sum *sum = NULL;
2304 unsigned long index;
2305 unsigned long num_sectors;
2306
2307 while (!list_empty(&sctx->csum_list)) {
2308 sum = list_first_entry(&sctx->csum_list,
2309 struct btrfs_ordered_sum, list);
2310 if (sum->bytenr > logical)
2311 return 0;
2312 if (sum->bytenr + sum->len > logical)
2313 break;
2314
2315 ++sctx->stat.csum_discards;
2316 list_del(&sum->list);
2317 kfree(sum);
2318 sum = NULL;
2319 }
2320 if (!sum)
2321 return 0;
2322
2323 index = ((u32)(logical - sum->bytenr)) / sctx->sectorsize;
2324 num_sectors = sum->len / sctx->sectorsize;
2325 memcpy(csum, sum->sums + index, sctx->csum_size);
2326 if (index == num_sectors - 1) {
2327 list_del(&sum->list);
2328 kfree(sum);
2329 }
2330 return 1;
2331 }
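
/*
 * Illustration only -- not part of scrub.c. A tiny numeric example of the
 * index arithmetic in scrub_find_csum() above; the ordered-sum range, the
 * sector size and the logical address are made-up values chosen for the
 * example, not data from a real filesystem.
 */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
        const uint64_t sum_bytenr = 1048576;    /* csum item covers [1MiB, 1MiB + 64KiB) */
        const uint64_t sum_len = 65536;
        const uint32_t sectorsize = 4096;
        const uint64_t logical = sum_bytenr + 5 * sectorsize;

        /* the same arithmetic as in scrub_find_csum() */
        unsigned long index = (uint32_t)(logical - sum_bytenr) / sectorsize;
        unsigned long num_sectors = sum_len / sectorsize;

        /* index 5 of 16 sectors -> the sixth checksum stored in the item */
        printf("index=%lu num_sectors=%lu last=%d\n",
               index, num_sectors, index == num_sectors - 1);
        return 0;
}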
2332
2333 /* scrub extent tries to collect up to 64 kB for each bio */
2334 static int scrub_extent(struct scrub_ctx *sctx, u64 logical, u64 len,
2335 u64 physical, struct btrfs_device *dev, u64 flags,
2336 u64 gen, int mirror_num, u64 physical_for_dev_replace)
2337 {
2338 int ret;
2339 u8 csum[BTRFS_CSUM_SIZE];
2340 u32 blocksize;
2341
2342 if (flags & BTRFS_EXTENT_FLAG_DATA) {
2343 blocksize = sctx->sectorsize;
2344 spin_lock(&sctx->stat_lock);
2345 sctx->stat.data_extents_scrubbed++;
2346 sctx->stat.data_bytes_scrubbed += len;
2347 spin_unlock(&sctx->stat_lock);
2348 } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
2349 blocksize = sctx->nodesize;
2350 spin_lock(&sctx->stat_lock);
2351 sctx->stat.tree_extents_scrubbed++;
2352 sctx->stat.tree_bytes_scrubbed += len;
2353 spin_unlock(&sctx->stat_lock);
2354 } else {
2355 blocksize = sctx->sectorsize;
2356 WARN_ON(1);
2357 }
2358
2359 while (len) {
2360 u64 l = min_t(u64, len, blocksize);
2361 int have_csum = 0;
2362
2363 if (flags & BTRFS_EXTENT_FLAG_DATA) {
2364 /* push csums to sbio */
2365 have_csum = scrub_find_csum(sctx, logical, l, csum);
2366 if (have_csum == 0)
2367 ++sctx->stat.no_csum;
2368 if (sctx->is_dev_replace && !have_csum) {
2369 ret = copy_nocow_pages(sctx, logical, l,
2370 mirror_num,
2371 physical_for_dev_replace);
2372 goto behind_scrub_pages;
2373 }
2374 }
2375 ret = scrub_pages(sctx, logical, l, physical, dev, flags, gen,
2376 mirror_num, have_csum ? csum : NULL, 0,
2377 physical_for_dev_replace);
2378 behind_scrub_pages:
2379 if (ret)
2380 return ret;
2381 len -= l;
2382 logical += l;
2383 physical += l;
2384 physical_for_dev_replace += l;
2385 }
2386 return 0;
2387 }
2388
2389 /*
2390 * Given a physical address, this will calculate its
2391 * logical offset. If this is a parity stripe, it will return
2392 * the leftmost data stripe's logical offset (a worked example
2393 * of this mapping follows the function below).
2394 * Return 0 if it is a data stripe, 1 if it is a parity stripe.
2395 */
2396 static int get_raid56_logic_offset(u64 physical, int num,
2397 struct map_lookup *map, u64 *offset)
2398 {
2399 int i;
2400 int j = 0;
2401 u64 stripe_nr;
2402 u64 last_offset;
2403 int stripe_index;
2404 int rot;
2405
2406 last_offset = (physical - map->stripes[num].physical) *
2407 nr_data_stripes(map);
2408 *offset = last_offset;
2409 for (i = 0; i < nr_data_stripes(map); i++) {
2410 *offset = last_offset + i * map->stripe_len;
2411
2412 stripe_nr = *offset;
2413 do_div(stripe_nr, map->stripe_len);
2414 do_div(stripe_nr, nr_data_stripes(map));
2415
2416 /* Work out the disk rotation on this stripe-set */
2417 rot = do_div(stripe_nr, map->num_stripes);
2418 /* calculate which stripe this data is located on */
2419 rot += i;
2420 stripe_index = rot % map->num_stripes;
2421 if (stripe_index == num)
2422 return 0;
2423 if (stripe_index < num)
2424 j++;
2425 }
2426 *offset = last_offset + j * map->stripe_len;
2427 return 1;
2428 }
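
/*
 * Illustration only -- not part of scrub.c. A user-space walk-through of the
 * physical->logical mapping that get_raid56_logic_offset() performs above.
 * The geometry (3-disk RAID5, 64K stripes, scrubbing device index 1 at 128K
 * into its device extent) is a made-up example, not data from a real chunk
 * map.
 */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
        const uint64_t stripe_len = 64 * 1024;
        const int num_stripes = 3;              /* RAID5: 2 data + 1 parity */
        const int nr_data_stripes = 2;
        const int num = 1;                      /* device index being scrubbed */
        const uint64_t physical = 2 * stripe_len; /* offset inside the device extent */

        uint64_t last_offset = physical * nr_data_stripes;
        int j = 0;

        for (int i = 0; i < nr_data_stripes; i++) {
                uint64_t offset = last_offset + i * stripe_len;
                uint64_t stripe_nr = offset / stripe_len / nr_data_stripes;
                int rot = (int)(stripe_nr % num_stripes) + i;
                int stripe_index = rot % num_stripes;

                printf("i=%d offset=%llu stripe_index=%d\n",
                       i, (unsigned long long)offset, stripe_index);
                if (stripe_index == num) {
                        printf("data stripe, logical offset %llu\n",
                               (unsigned long long)offset);
                        return 0;
                }
                if (stripe_index < num)
                        j++;
        }
        printf("parity stripe, leftmost data stripe at %llu\n",
               (unsigned long long)(last_offset + j * stripe_len));
        return 0;
}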
2429
2430 static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
2431 struct map_lookup *map,
2432 struct btrfs_device *scrub_dev,
2433 int num, u64 base, u64 length,
2434 int is_dev_replace)
2435 {
2436 struct btrfs_path *path;
2437 struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
2438 struct btrfs_root *root = fs_info->extent_root;
2439 struct btrfs_root *csum_root = fs_info->csum_root;
2440 struct btrfs_extent_item *extent;
2441 struct blk_plug plug;
2442 u64 flags;
2443 int ret;
2444 int slot;
2445 u64 nstripes;
2446 struct extent_buffer *l;
2447 struct btrfs_key key;
2448 u64 physical;
2449 u64 logical;
2450 u64 logic_end;
2451 u64 physical_end;
2452 u64 generation;
2453 int mirror_num;
2454 struct reada_control *reada1;
2455 struct reada_control *reada2;
2456 struct btrfs_key key_start;
2457 struct btrfs_key key_end;
2458 u64 increment = map->stripe_len;
2459 u64 offset;
2460 u64 extent_logical;
2461 u64 extent_physical;
2462 u64 extent_len;
2463 struct btrfs_device *extent_dev;
2464 int extent_mirror_num;
2465 int stop_loop = 0;
2466
2467 nstripes = length;
2468 physical = map->stripes[num].physical;
2469 offset = 0;
2470 do_div(nstripes, map->stripe_len);
2471 if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
2472 offset = map->stripe_len * num;
2473 increment = map->stripe_len * map->num_stripes;
2474 mirror_num = 1;
2475 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
2476 int factor = map->num_stripes / map->sub_stripes;
2477 offset = map->stripe_len * (num / map->sub_stripes);
2478 increment = map->stripe_len * factor;
2479 mirror_num = num % map->sub_stripes + 1;
2480 } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
2481 increment = map->stripe_len;
2482 mirror_num = num % map->num_stripes + 1;
2483 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
2484 increment = map->stripe_len;
2485 mirror_num = num % map->num_stripes + 1;
2486 } else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
2487 BTRFS_BLOCK_GROUP_RAID6)) {
2488 get_raid56_logic_offset(physical, num, map, &offset);
2489 increment = map->stripe_len * nr_data_stripes(map);
2490 mirror_num = 1;
2491 } else {
2492 increment = map->stripe_len;
2493 mirror_num = 1;
2494 }
2495
2496 path = btrfs_alloc_path();
2497 if (!path)
2498 return -ENOMEM;
2499
2500 /*
2501 * work on the commit root. The related disk blocks are static as
2502 * long as COW is applied. This means it is safe to rewrite
2503 * them to repair disk errors without any race conditions
2504 */
2505 path->search_commit_root = 1;
2506 path->skip_locking = 1;
2507
2508 /*
2509 * trigger the readahead for the extent tree and the csum tree and
2510 * wait for completion. During readahead, the scrub is officially
2511 * paused so that it does not hold off transaction commits
2512 */
2513 logical = base + offset;
2514 physical_end = physical + nstripes * map->stripe_len;
2515 if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
2516 BTRFS_BLOCK_GROUP_RAID6)) {
2517 get_raid56_logic_offset(physical_end, num,
2518 map, &logic_end);
2519 logic_end += base;
2520 } else {
2521 logic_end = logical + increment * nstripes;
2522 }
2523 wait_event(sctx->list_wait,
2524 atomic_read(&sctx->bios_in_flight) == 0);
2525 scrub_blocked_if_needed(fs_info);
2526
2527 /* FIXME it might be better to start readahead at commit root */
2528 key_start.objectid = logical;
2529 key_start.type = BTRFS_EXTENT_ITEM_KEY;
2530 key_start.offset = (u64)0;
2531 key_end.objectid = logic_end;
2532 key_end.type = BTRFS_METADATA_ITEM_KEY;
2533 key_end.offset = (u64)-1;
2534 reada1 = btrfs_reada_add(root, &key_start, &key_end);
2535
2536 key_start.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
2537 key_start.type = BTRFS_EXTENT_CSUM_KEY;
2538 key_start.offset = logical;
2539 key_end.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
2540 key_end.type = BTRFS_EXTENT_CSUM_KEY;
2541 key_end.offset = logic_end;
2542 reada2 = btrfs_reada_add(csum_root, &key_start, &key_end);
2543
2544 if (!IS_ERR(reada1))
2545 btrfs_reada_wait(reada1);
2546 if (!IS_ERR(reada2))
2547 btrfs_reada_wait(reada2);
2548
2549
2550 /*
2551 * collect all data csums for the stripe to avoid seeking during
2552 * the scrub. This might currently (crc32) end up being about 1MB
2553 */
2554 blk_start_plug(&plug);
2555
2556 /*
2557 * now find all extents for each stripe and scrub them
2558 */
2559 ret = 0;
2560 while (physical < physical_end) {
2561 /* for raid56, we skip parity stripe */
2562 if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
2563 BTRFS_BLOCK_GROUP_RAID6)) {
2564 ret = get_raid56_logic_offset(physical, num,
2565 map, &logical);
2566 logical += base;
2567 if (ret)
2568 goto skip;
2569 }
2570 /*
2571 * canceled?
2572 */
2573 if (atomic_read(&fs_info->scrub_cancel_req) ||
2574 atomic_read(&sctx->cancel_req)) {
2575 ret = -ECANCELED;
2576 goto out;
2577 }
2578 /*
2579 * check to see if we have to pause
2580 */
2581 if (atomic_read(&fs_info->scrub_pause_req)) {
2582 /* push queued extents */
2583 atomic_set(&sctx->wr_ctx.flush_all_writes, 1);
2584 scrub_submit(sctx);
2585 mutex_lock(&sctx->wr_ctx.wr_lock);
2586 scrub_wr_submit(sctx);
2587 mutex_unlock(&sctx->wr_ctx.wr_lock);
2588 wait_event(sctx->list_wait,
2589 atomic_read(&sctx->bios_in_flight) == 0);
2590 atomic_set(&sctx->wr_ctx.flush_all_writes, 0);
2591 scrub_blocked_if_needed(fs_info);
2592 }
2593
2594 if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
2595 key.type = BTRFS_METADATA_ITEM_KEY;
2596 else
2597 key.type = BTRFS_EXTENT_ITEM_KEY;
2598 key.objectid = logical;
2599 key.offset = (u64)-1;
2600
2601 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2602 if (ret < 0)
2603 goto out;
2604
2605 if (ret > 0) {
2606 ret = btrfs_previous_extent_item(root, path, 0);
2607 if (ret < 0)
2608 goto out;
2609 if (ret > 0) {
2610 /* there's no smaller item, so stick with the
2611 * larger one */
2612 btrfs_release_path(path);
2613 ret = btrfs_search_slot(NULL, root, &key,
2614 path, 0, 0);
2615 if (ret < 0)
2616 goto out;
2617 }
2618 }
2619
2620 stop_loop = 0;
2621 while (1) {
2622 u64 bytes;
2623
2624 l = path->nodes[0];
2625 slot = path->slots[0];
2626 if (slot >= btrfs_header_nritems(l)) {
2627 ret = btrfs_next_leaf(root, path);
2628 if (ret == 0)
2629 continue;
2630 if (ret < 0)
2631 goto out;
2632
2633 stop_loop = 1;
2634 break;
2635 }
2636 btrfs_item_key_to_cpu(l, &key, slot);
2637
2638 if (key.type == BTRFS_METADATA_ITEM_KEY)
2639 bytes = root->nodesize;
2640 else
2641 bytes = key.offset;
2642
2643 if (key.objectid + bytes <= logical)
2644 goto next;
2645
2646 if (key.type != BTRFS_EXTENT_ITEM_KEY &&
2647 key.type != BTRFS_METADATA_ITEM_KEY)
2648 goto next;
2649
2650 if (key.objectid >= logical + map->stripe_len) {
2651 /* out of this device extent */
2652 if (key.objectid >= logic_end)
2653 stop_loop = 1;
2654 break;
2655 }
2656
2657 extent = btrfs_item_ptr(l, slot,
2658 struct btrfs_extent_item);
2659 flags = btrfs_extent_flags(l, extent);
2660 generation = btrfs_extent_generation(l, extent);
2661
2662 if (key.objectid < logical &&
2663 (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)) {
2664 btrfs_err(fs_info,
2665 "scrub: tree block %llu spanning "
2666 "stripes, ignored. logical=%llu",
2667 key.objectid, logical);
2668 goto next;
2669 }
2670
2671 again:
2672 extent_logical = key.objectid;
2673 extent_len = bytes;
2674
2675 /*
2676 * trim extent to this stripe
2677 */
2678 if (extent_logical < logical) {
2679 extent_len -= logical - extent_logical;
2680 extent_logical = logical;
2681 }
2682 if (extent_logical + extent_len >
2683 logical + map->stripe_len) {
2684 extent_len = logical + map->stripe_len -
2685 extent_logical;
2686 }
2687
2688 extent_physical = extent_logical - logical + physical;
2689 extent_dev = scrub_dev;
2690 extent_mirror_num = mirror_num;
2691 if (is_dev_replace)
2692 scrub_remap_extent(fs_info, extent_logical,
2693 extent_len, &extent_physical,
2694 &extent_dev,
2695 &extent_mirror_num);
2696
2697 ret = btrfs_lookup_csums_range(csum_root, logical,
2698 logical + map->stripe_len - 1,
2699 &sctx->csum_list, 1);
2700 if (ret)
2701 goto out;
2702
2703 ret = scrub_extent(sctx, extent_logical, extent_len,
2704 extent_physical, extent_dev, flags,
2705 generation, extent_mirror_num,
2706 extent_logical - logical + physical);
2707 if (ret)
2708 goto out;
2709
2710 scrub_free_csums(sctx);
2711 if (extent_logical + extent_len <
2712 key.objectid + bytes) {
2713 if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
2714 BTRFS_BLOCK_GROUP_RAID6)) {
2715 /*
2716 * loop until we find the next data stripe
2717 * or we have finished all stripes.
2718 */
2719 do {
2720 physical += map->stripe_len;
2721 ret = get_raid56_logic_offset(
2722 physical, num,
2723 map, &logical);
2724 logical += base;
2725 } while (physical < physical_end && ret);
2726 } else {
2727 physical += map->stripe_len;
2728 logical += increment;
2729 }
2730 if (logical < key.objectid + bytes) {
2731 cond_resched();
2732 goto again;
2733 }
2734
2735 if (physical >= physical_end) {
2736 stop_loop = 1;
2737 break;
2738 }
2739 }
2740 next:
2741 path->slots[0]++;
2742 }
2743 btrfs_release_path(path);
2744 skip:
2745 logical += increment;
2746 physical += map->stripe_len;
2747 spin_lock(&sctx->stat_lock);
2748 if (stop_loop)
2749 sctx->stat.last_physical = map->stripes[num].physical +
2750 length;
2751 else
2752 sctx->stat.last_physical = physical;
2753 spin_unlock(&sctx->stat_lock);
2754 if (stop_loop)
2755 break;
2756 }
2757 out:
2758 /* push queued extents */
2759 scrub_submit(sctx);
2760 mutex_lock(&sctx->wr_ctx.wr_lock);
2761 scrub_wr_submit(sctx);
2762 mutex_unlock(&sctx->wr_ctx.wr_lock);
2763
2764 blk_finish_plug(&plug);
2765 btrfs_free_path(path);
2766 return ret < 0 ? ret : 0;
2767 }
2768
2769 static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,
2770 struct btrfs_device *scrub_dev,
2771 u64 chunk_tree, u64 chunk_objectid,
2772 u64 chunk_offset, u64 length,
2773 u64 dev_offset, int is_dev_replace)
2774 {
2775 struct btrfs_mapping_tree *map_tree =
2776 &sctx->dev_root->fs_info->mapping_tree;
2777 struct map_lookup *map;
2778 struct extent_map *em;
2779 int i;
2780 int ret = 0;
2781
2782 read_lock(&map_tree->map_tree.lock);
2783 em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1);
2784 read_unlock(&map_tree->map_tree.lock);
2785
2786 if (!em)
2787 return -EINVAL;
2788
2789 map = (struct map_lookup *)em->bdev;
2790 if (em->start != chunk_offset)
2791 goto out;
2792
2793 if (em->len < length)
2794 goto out;
2795
2796 for (i = 0; i < map->num_stripes; ++i) {
2797 if (map->stripes[i].dev->bdev == scrub_dev->bdev &&
2798 map->stripes[i].physical == dev_offset) {
2799 ret = scrub_stripe(sctx, map, scrub_dev, i,
2800 chunk_offset, length,
2801 is_dev_replace);
2802 if (ret)
2803 goto out;
2804 }
2805 }
2806 out:
2807 free_extent_map(em);
2808
2809 return ret;
2810 }
2811
2812 static noinline_for_stack
2813 int scrub_enumerate_chunks(struct scrub_ctx *sctx,
2814 struct btrfs_device *scrub_dev, u64 start, u64 end,
2815 int is_dev_replace)
2816 {
2817 struct btrfs_dev_extent *dev_extent = NULL;
2818 struct btrfs_path *path;
2819 struct btrfs_root *root = sctx->dev_root;
2820 struct btrfs_fs_info *fs_info = root->fs_info;
2821 u64 length;
2822 u64 chunk_tree;
2823 u64 chunk_objectid;
2824 u64 chunk_offset;
2825 int ret;
2826 int slot;
2827 struct extent_buffer *l;
2828 struct btrfs_key key;
2829 struct btrfs_key found_key;
2830 struct btrfs_block_group_cache *cache;
2831 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
2832
2833 path = btrfs_alloc_path();
2834 if (!path)
2835 return -ENOMEM;
2836
2837 path->reada = 2;
2838 path->search_commit_root = 1;
2839 path->skip_locking = 1;
2840
2841 key.objectid = scrub_dev->devid;
2842 key.offset = 0ull;
2843 key.type = BTRFS_DEV_EXTENT_KEY;
2844
2845 while (1) {
2846 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2847 if (ret < 0)
2848 break;
2849 if (ret > 0) {
2850 if (path->slots[0] >=
2851 btrfs_header_nritems(path->nodes[0])) {
2852 ret = btrfs_next_leaf(root, path);
2853 if (ret)
2854 break;
2855 }
2856 }
2857
2858 l = path->nodes[0];
2859 slot = path->slots[0];
2860
2861 btrfs_item_key_to_cpu(l, &found_key, slot);
2862
2863 if (found_key.objectid != scrub_dev->devid)
2864 break;
2865
2866 if (found_key.type != BTRFS_DEV_EXTENT_KEY)
2867 break;
2868
2869 if (found_key.offset >= end)
2870 break;
2871
2872 if (found_key.offset < key.offset)
2873 break;
2874
2875 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
2876 length = btrfs_dev_extent_length(l, dev_extent);
2877
2878 if (found_key.offset + length <= start)
2879 goto skip;
2880
2881 chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent);
2882 chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent);
2883 chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
2884
2885 /*
2886 * get a reference on the corresponding block group to prevent
2887 * the chunk from going away while we scrub it
2888 */
2889 cache = btrfs_lookup_block_group(fs_info, chunk_offset);
2890
2891 /* some chunks are removed but not yet committed to disk;
2892 * continue scrubbing */
2893 if (!cache)
2894 goto skip;
2895
2896 dev_replace->cursor_right = found_key.offset + length;
2897 dev_replace->cursor_left = found_key.offset;
2898 dev_replace->item_needs_writeback = 1;
2899 ret = scrub_chunk(sctx, scrub_dev, chunk_tree, chunk_objectid,
2900 chunk_offset, length, found_key.offset,
2901 is_dev_replace);
2902
2903 /*
2904 * flush, submit all pending read and write bios, afterwards
2905 * wait for them.
2906 * Note that in the dev replace case, a read request causes
2907 * write requests that are submitted in the read completion
2908 * worker. Therefore in the current situation, it is required
2909 * that all write requests are flushed, so that all read and
2910 * write requests are really completed when bios_in_flight
2911 * changes to 0.
2912 */
2913 atomic_set(&sctx->wr_ctx.flush_all_writes, 1);
2914 scrub_submit(sctx);
2915 mutex_lock(&sctx->wr_ctx.wr_lock);
2916 scrub_wr_submit(sctx);
2917 mutex_unlock(&sctx->wr_ctx.wr_lock);
2918
2919 wait_event(sctx->list_wait,
2920 atomic_read(&sctx->bios_in_flight) == 0);
2921 atomic_inc(&fs_info->scrubs_paused);
2922 wake_up(&fs_info->scrub_pause_wait);
2923
2924 /*
2925 * must be called before we decrease @scrub_paused.
2926 * make sure we don't block transaction commit while
2927 * we are waiting for pending workers to finish.
2928 */
2929 wait_event(sctx->list_wait,
2930 atomic_read(&sctx->workers_pending) == 0);
2931 atomic_set(&sctx->wr_ctx.flush_all_writes, 0);
2932
2933 mutex_lock(&fs_info->scrub_lock);
2934 __scrub_blocked_if_needed(fs_info);
2935 atomic_dec(&fs_info->scrubs_paused);
2936 mutex_unlock(&fs_info->scrub_lock);
2937 wake_up(&fs_info->scrub_pause_wait);
2938
2939 btrfs_put_block_group(cache);
2940 if (ret)
2941 break;
2942 if (is_dev_replace &&
2943 atomic64_read(&dev_replace->num_write_errors) > 0) {
2944 ret = -EIO;
2945 break;
2946 }
2947 if (sctx->stat.malloc_errors > 0) {
2948 ret = -ENOMEM;
2949 break;
2950 }
2951
2952 dev_replace->cursor_left = dev_replace->cursor_right;
2953 dev_replace->item_needs_writeback = 1;
2954 skip:
2955 key.offset = found_key.offset + length;
2956 btrfs_release_path(path);
2957 }
2958
2959 btrfs_free_path(path);
2960
2961 /*
2962 * ret can still be 1 from search_slot or next_leaf,
2963 * that's not an error
2964 */
2965 return ret < 0 ? ret : 0;
2966 }
2967
2968 static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,
2969 struct btrfs_device *scrub_dev)
2970 {
2971 int i;
2972 u64 bytenr;
2973 u64 gen;
2974 int ret;
2975 struct btrfs_root *root = sctx->dev_root;
2976
2977 if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state))
2978 return -EIO;
2979
2980 /* Seed devices of a new filesystem have their own generation. */
2981 if (scrub_dev->fs_devices != root->fs_info->fs_devices)
2982 gen = scrub_dev->generation;
2983 else
2984 gen = root->fs_info->last_trans_committed;
2985
2986 for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
2987 bytenr = btrfs_sb_offset(i);
2988 if (bytenr + BTRFS_SUPER_INFO_SIZE >
2989 scrub_dev->commit_total_bytes)
2990 break;
2991
2992 ret = scrub_pages(sctx, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr,
2993 scrub_dev, BTRFS_EXTENT_FLAG_SUPER, gen, i,
2994 NULL, 1, bytenr);
2995 if (ret)
2996 return ret;
2997 }
2998 wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
2999
3000 return 0;
3001 }
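
/*
 * Illustration only -- not part of scrub.c. scrub_supers() above skips any
 * super block mirror whose full copy would lie beyond commit_total_bytes.
 * A small user-space sketch of that bounds check, assuming the commonly
 * documented btrfs mirror offsets (64KiB, 64MiB, 256GiB), a 4KiB super block
 * and a hypothetical 100GiB device; these values are assumptions for the
 * example, not read from disk-io.h.
 */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
        const uint64_t super_info_size = 4096;
        const uint64_t mirror_offsets[] = {
                64ULL * 1024,                   /* primary copy */
                64ULL * 1024 * 1024,            /* first mirror */
                256ULL * 1024 * 1024 * 1024,    /* second mirror */
        };
        const uint64_t device_size = 100ULL * 1024 * 1024 * 1024;

        for (int i = 0; i < 3; i++) {
                uint64_t bytenr = mirror_offsets[i];

                if (bytenr + super_info_size > device_size) {
                        printf("mirror %d at %llu: beyond device end, skipped\n",
                               i, (unsigned long long)bytenr);
                        break;
                }
                printf("mirror %d at %llu: scrubbed\n",
                       i, (unsigned long long)bytenr);
        }
        return 0;
}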
3002
3003 /*
3004 * get a reference count on fs_info->scrub_workers. start the workers if necessary
3005 */
3006 static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
3007 int is_dev_replace)
3008 {
3009 int ret = 0;
3010 int flags = WQ_FREEZABLE | WQ_UNBOUND;
3011 int max_active = fs_info->thread_pool_size;
3012
3013 if (fs_info->scrub_workers_refcnt == 0) {
3014 if (is_dev_replace)
3015 fs_info->scrub_workers =
3016 btrfs_alloc_workqueue("btrfs-scrub", flags,
3017 1, 4);
3018 else
3019 fs_info->scrub_workers =
3020 btrfs_alloc_workqueue("btrfs-scrub", flags,
3021 max_active, 4);
3022 if (!fs_info->scrub_workers) {
3023 ret = -ENOMEM;
3024 goto out;
3025 }
3026 fs_info->scrub_wr_completion_workers =
3027 btrfs_alloc_workqueue("btrfs-scrubwrc", flags,
3028 max_active, 2);
3029 if (!fs_info->scrub_wr_completion_workers) {
3030 ret = -ENOMEM;
3031 goto out;
3032 }
3033 fs_info->scrub_nocow_workers =
3034 btrfs_alloc_workqueue("btrfs-scrubnc", flags, 1, 0);
3035 if (!fs_info->scrub_nocow_workers) {
3036 ret = -ENOMEM;
3037 goto out;
3038 }
3039 }
3040 ++fs_info->scrub_workers_refcnt;
3041 out:
3042 return ret;
3043 }
3044
3045 static noinline_for_stack void scrub_workers_put(struct btrfs_fs_info *fs_info)
3046 {
3047 if (--fs_info->scrub_workers_refcnt == 0) {
3048 btrfs_destroy_workqueue(fs_info->scrub_workers);
3049 btrfs_destroy_workqueue(fs_info->scrub_wr_completion_workers);
3050 btrfs_destroy_workqueue(fs_info->scrub_nocow_workers);
3051 }
3052 WARN_ON(fs_info->scrub_workers_refcnt < 0);
3053 }
3054
3055 int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
3056 u64 end, struct btrfs_scrub_progress *progress,
3057 int readonly, int is_dev_replace)
3058 {
3059 struct scrub_ctx *sctx;
3060 int ret;
3061 struct btrfs_device *dev;
3062 struct rcu_string *name;
3063
3064 if (btrfs_fs_closing(fs_info))
3065 return -EINVAL;
3066
3067 if (fs_info->chunk_root->nodesize > BTRFS_STRIPE_LEN) {
3068 /*
3069 * the way scrub is implemented, it is unable to calculate
3070 * the checksum in this case. Do not handle this
3071 * situation at all because it won't ever happen.
3072 */
3073 btrfs_err(fs_info,
3074 "scrub: size assumption nodesize <= BTRFS_STRIPE_LEN (%d <= %d) fails",
3075 fs_info->chunk_root->nodesize, BTRFS_STRIPE_LEN);
3076 return -EINVAL;
3077 }
3078
3079 if (fs_info->chunk_root->sectorsize != PAGE_SIZE) {
3080 /* not supported for data w/o checksums */
3081 btrfs_err(fs_info,
3082 "scrub: size assumption sectorsize != PAGE_SIZE "
3083 "(%d != %lu) fails",
3084 fs_info->chunk_root->sectorsize, PAGE_SIZE);
3085 return -EINVAL;
3086 }
3087
3088 if (fs_info->chunk_root->nodesize >
3089 PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK ||
3090 fs_info->chunk_root->sectorsize >
3091 PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK) {
3092 /*
3093 * would exhaust the array bounds of pagev member in
3094 * struct scrub_block
3095 */
3096 btrfs_err(fs_info, "scrub: size assumption nodesize and sectorsize "
3097 "<= SCRUB_MAX_PAGES_PER_BLOCK (%d <= %d && %d <= %d) fails",
3098 fs_info->chunk_root->nodesize,
3099 SCRUB_MAX_PAGES_PER_BLOCK,
3100 fs_info->chunk_root->sectorsize,
3101 SCRUB_MAX_PAGES_PER_BLOCK);
3102 return -EINVAL;
3103 }
3104
3105
3106 mutex_lock(&fs_info->fs_devices->device_list_mutex);
3107 dev = btrfs_find_device(fs_info, devid, NULL, NULL);
3108 if (!dev || (dev->missing && !is_dev_replace)) {
3109 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
3110 return -ENODEV;
3111 }
3112
3113 if (!is_dev_replace && !readonly && !dev->writeable) {
3114 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
3115 rcu_read_lock();
3116 name = rcu_dereference(dev->name);
3117 btrfs_err(fs_info, "scrub: device %s is not writable",
3118 name->str);
3119 rcu_read_unlock();
3120 return -EROFS;
3121 }
3122
3123 mutex_lock(&fs_info->scrub_lock);
3124 if (!dev->in_fs_metadata || dev->is_tgtdev_for_dev_replace) {
3125 mutex_unlock(&fs_info->scrub_lock);
3126 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
3127 return -EIO;
3128 }
3129
3130 btrfs_dev_replace_lock(&fs_info->dev_replace);
3131 if (dev->scrub_device ||
3132 (!is_dev_replace &&
3133 btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))) {
3134 btrfs_dev_replace_unlock(&fs_info->dev_replace);
3135 mutex_unlock(&fs_info->scrub_lock);
3136 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
3137 return -EINPROGRESS;
3138 }
3139 btrfs_dev_replace_unlock(&fs_info->dev_replace);
3140
3141 ret = scrub_workers_get(fs_info, is_dev_replace);
3142 if (ret) {
3143 mutex_unlock(&fs_info->scrub_lock);
3144 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
3145 return ret;
3146 }
3147
3148 sctx = scrub_setup_ctx(dev, is_dev_replace);
3149 if (IS_ERR(sctx)) {
3150 mutex_unlock(&fs_info->scrub_lock);
3151 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
3152 scrub_workers_put(fs_info);
3153 return PTR_ERR(sctx);
3154 }
3155 sctx->readonly = readonly;
3156 dev->scrub_device = sctx;
3157 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
3158
3159 /*
3160 * by checking @scrub_pause_req here, we can avoid a
3161 * race between committing a transaction and scrubbing.
3162 */
3163 __scrub_blocked_if_needed(fs_info);
3164 atomic_inc(&fs_info->scrubs_running);
3165 mutex_unlock(&fs_info->scrub_lock);
3166
3167 if (!is_dev_replace) {
3168 /*
3169 * by holding device list mutex, we can
3170 * kick off writing super in log tree sync.
3171 */
3172 mutex_lock(&fs_info->fs_devices->device_list_mutex);
3173 ret = scrub_supers(sctx, dev);
3174 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
3175 }
3176
3177 if (!ret)
3178 ret = scrub_enumerate_chunks(sctx, dev, start, end,
3179 is_dev_replace);
3180
3181 wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
3182 atomic_dec(&fs_info->scrubs_running);
3183 wake_up(&fs_info->scrub_pause_wait);
3184
3185 wait_event(sctx->list_wait, atomic_read(&sctx->workers_pending) == 0);
3186
3187 if (progress)
3188 memcpy(progress, &sctx->stat, sizeof(*progress));
3189
3190 mutex_lock(&fs_info->scrub_lock);
3191 dev->scrub_device = NULL;
3192 scrub_workers_put(fs_info);
3193 mutex_unlock(&fs_info->scrub_lock);
3194
3195 scrub_free_ctx(sctx);
3196
3197 return ret;
3198 }
3199
3200 void btrfs_scrub_pause(struct btrfs_root *root)
3201 {
3202 struct btrfs_fs_info *fs_info = root->fs_info;
3203
3204 mutex_lock(&fs_info->scrub_lock);
3205 atomic_inc(&fs_info->scrub_pause_req);
3206 while (atomic_read(&fs_info->scrubs_paused) !=
3207 atomic_read(&fs_info->scrubs_running)) {
3208 mutex_unlock(&fs_info->scrub_lock);
3209 wait_event(fs_info->scrub_pause_wait,
3210 atomic_read(&fs_info->scrubs_paused) ==
3211 atomic_read(&fs_info->scrubs_running));
3212 mutex_lock(&fs_info->scrub_lock);
3213 }
3214 mutex_unlock(&fs_info->scrub_lock);
3215 }
3216
3217 void btrfs_scrub_continue(struct btrfs_root *root)
3218 {
3219 struct btrfs_fs_info *fs_info = root->fs_info;
3220
3221 atomic_dec(&fs_info->scrub_pause_req);
3222 wake_up(&fs_info->scrub_pause_wait);
3223 }
3224
3225 int btrfs_scrub_cancel(struct btrfs_fs_info *fs_info)
3226 {
3227 mutex_lock(&fs_info->scrub_lock);
3228 if (!atomic_read(&fs_info->scrubs_running)) {
3229 mutex_unlock(&fs_info->scrub_lock);
3230 return -ENOTCONN;
3231 }
3232
3233 atomic_inc(&fs_info->scrub_cancel_req);
3234 while (atomic_read(&fs_info->scrubs_running)) {
3235 mutex_unlock(&fs_info->scrub_lock);
3236 wait_event(fs_info->scrub_pause_wait,
3237 atomic_read(&fs_info->scrubs_running) == 0);
3238 mutex_lock(&fs_info->scrub_lock);
3239 }
3240 atomic_dec(&fs_info->scrub_cancel_req);
3241 mutex_unlock(&fs_info->scrub_lock);
3242
3243 return 0;
3244 }
3245
3246 int btrfs_scrub_cancel_dev(struct btrfs_fs_info *fs_info,
3247 struct btrfs_device *dev)
3248 {
3249 struct scrub_ctx *sctx;
3250
3251 mutex_lock(&fs_info->scrub_lock);
3252 sctx = dev->scrub_device;
3253 if (!sctx) {
3254 mutex_unlock(&fs_info->scrub_lock);
3255 return -ENOTCONN;
3256 }
3257 atomic_inc(&sctx->cancel_req);
3258 while (dev->scrub_device) {
3259 mutex_unlock(&fs_info->scrub_lock);
3260 wait_event(fs_info->scrub_pause_wait,
3261 dev->scrub_device == NULL);
3262 mutex_lock(&fs_info->scrub_lock);
3263 }
3264 mutex_unlock(&fs_info->scrub_lock);
3265
3266 return 0;
3267 }
3268
3269 int btrfs_scrub_progress(struct btrfs_root *root, u64 devid,
3270 struct btrfs_scrub_progress *progress)
3271 {
3272 struct btrfs_device *dev;
3273 struct scrub_ctx *sctx = NULL;
3274
3275 mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
3276 dev = btrfs_find_device(root->fs_info, devid, NULL, NULL);
3277 if (dev)
3278 sctx = dev->scrub_device;
3279 if (sctx)
3280 memcpy(progress, &sctx->stat, sizeof(*progress));
3281 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
3282
3283 return dev ? (sctx ? 0 : -ENOTCONN) : -ENODEV;
3284 }
3285
3286 static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
3287 u64 extent_logical, u64 extent_len,
3288 u64 *extent_physical,
3289 struct btrfs_device **extent_dev,
3290 int *extent_mirror_num)
3291 {
3292 u64 mapped_length;
3293 struct btrfs_bio *bbio = NULL;
3294 int ret;
3295
3296 mapped_length = extent_len;
3297 ret = btrfs_map_block(fs_info, READ, extent_logical,
3298 &mapped_length, &bbio, 0);
3299 if (ret || !bbio || mapped_length < extent_len ||
3300 !bbio->stripes[0].dev->bdev) {
3301 kfree(bbio);
3302 return;
3303 }
3304
3305 *extent_physical = bbio->stripes[0].physical;
3306 *extent_mirror_num = bbio->mirror_num;
3307 *extent_dev = bbio->stripes[0].dev;
3308 kfree(bbio);
3309 }
3310
3311 static int scrub_setup_wr_ctx(struct scrub_ctx *sctx,
3312 struct scrub_wr_ctx *wr_ctx,
3313 struct btrfs_fs_info *fs_info,
3314 struct btrfs_device *dev,
3315 int is_dev_replace)
3316 {
3317 WARN_ON(wr_ctx->wr_curr_bio != NULL);
3318
3319 mutex_init(&wr_ctx->wr_lock);
3320 wr_ctx->wr_curr_bio = NULL;
3321 if (!is_dev_replace)
3322 return 0;
3323
3324 WARN_ON(!dev->bdev);
3325 wr_ctx->pages_per_wr_bio = min_t(int, SCRUB_PAGES_PER_WR_BIO,
3326 bio_get_nr_vecs(dev->bdev));
3327 wr_ctx->tgtdev = dev;
3328 atomic_set(&wr_ctx->flush_all_writes, 0);
3329 return 0;
3330 }
3331
3332 static void scrub_free_wr_ctx(struct scrub_wr_ctx *wr_ctx)
3333 {
3334 mutex_lock(&wr_ctx->wr_lock);
3335 kfree(wr_ctx->wr_curr_bio);
3336 wr_ctx->wr_curr_bio = NULL;
3337 mutex_unlock(&wr_ctx->wr_lock);
3338 }
3339
3340 static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
3341 int mirror_num, u64 physical_for_dev_replace)
3342 {
3343 struct scrub_copy_nocow_ctx *nocow_ctx;
3344 struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
3345
3346 nocow_ctx = kzalloc(sizeof(*nocow_ctx), GFP_NOFS);
3347 if (!nocow_ctx) {
3348 spin_lock(&sctx->stat_lock);
3349 sctx->stat.malloc_errors++;
3350 spin_unlock(&sctx->stat_lock);
3351 return -ENOMEM;
3352 }
3353
3354 scrub_pending_trans_workers_inc(sctx);
3355
3356 nocow_ctx->sctx = sctx;
3357 nocow_ctx->logical = logical;
3358 nocow_ctx->len = len;
3359 nocow_ctx->mirror_num = mirror_num;
3360 nocow_ctx->physical_for_dev_replace = physical_for_dev_replace;
3361 btrfs_init_work(&nocow_ctx->work, btrfs_scrubnc_helper,
3362 copy_nocow_pages_worker, NULL, NULL);
3363 INIT_LIST_HEAD(&nocow_ctx->inodes);
3364 btrfs_queue_work(fs_info->scrub_nocow_workers,
3365 &nocow_ctx->work);
3366
3367 return 0;
3368 }
3369
3370 static int record_inode_for_nocow(u64 inum, u64 offset, u64 root, void *ctx)
3371 {
3372 struct scrub_copy_nocow_ctx *nocow_ctx = ctx;
3373 struct scrub_nocow_inode *nocow_inode;
3374
3375 nocow_inode = kzalloc(sizeof(*nocow_inode), GFP_NOFS);
3376 if (!nocow_inode)
3377 return -ENOMEM;
3378 nocow_inode->inum = inum;
3379 nocow_inode->offset = offset;
3380 nocow_inode->root = root;
3381 list_add_tail(&nocow_inode->list, &nocow_ctx->inodes);
3382 return 0;
3383 }
3384
3385 #define COPY_COMPLETE 1
3386
3387 static void copy_nocow_pages_worker(struct btrfs_work *work)
3388 {
3389 struct scrub_copy_nocow_ctx *nocow_ctx =
3390 container_of(work, struct scrub_copy_nocow_ctx, work);
3391 struct scrub_ctx *sctx = nocow_ctx->sctx;
3392 u64 logical = nocow_ctx->logical;
3393 u64 len = nocow_ctx->len;
3394 int mirror_num = nocow_ctx->mirror_num;
3395 u64 physical_for_dev_replace = nocow_ctx->physical_for_dev_replace;
3396 int ret;
3397 struct btrfs_trans_handle *trans = NULL;
3398 struct btrfs_fs_info *fs_info;
3399 struct btrfs_path *path;
3400 struct btrfs_root *root;
3401 int not_written = 0;
3402
3403 fs_info = sctx->dev_root->fs_info;
3404 root = fs_info->extent_root;
3405
3406 path = btrfs_alloc_path();
3407 if (!path) {
3408 spin_lock(&sctx->stat_lock);
3409 sctx->stat.malloc_errors++;
3410 spin_unlock(&sctx->stat_lock);
3411 not_written = 1;
3412 goto out;
3413 }
3414
3415 trans = btrfs_join_transaction(root);
3416 if (IS_ERR(trans)) {
3417 not_written = 1;
3418 goto out;
3419 }
3420
3421 ret = iterate_inodes_from_logical(logical, fs_info, path,
3422 record_inode_for_nocow, nocow_ctx);
3423 if (ret != 0 && ret != -ENOENT) {
3424 btrfs_warn(fs_info, "iterate_inodes_from_logical() failed: log %llu, "
3425 "phys %llu, len %llu, mir %u, ret %d",
3426 logical, physical_for_dev_replace, len, mirror_num,
3427 ret);
3428 not_written = 1;
3429 goto out;
3430 }
3431
3432 btrfs_end_transaction(trans, root);
3433 trans = NULL;
3434 while (!list_empty(&nocow_ctx->inodes)) {
3435 struct scrub_nocow_inode *entry;
3436 entry = list_first_entry(&nocow_ctx->inodes,
3437 struct scrub_nocow_inode,
3438 list);
3439 list_del_init(&entry->list);
3440 ret = copy_nocow_pages_for_inode(entry->inum, entry->offset,
3441 entry->root, nocow_ctx);
3442 kfree(entry);
3443 if (ret == COPY_COMPLETE) {
3444 ret = 0;
3445 break;
3446 } else if (ret) {
3447 break;
3448 }
3449 }
3450 out:
3451 while (!list_empty(&nocow_ctx->inodes)) {
3452 struct scrub_nocow_inode *entry;
3453 entry = list_first_entry(&nocow_ctx->inodes,
3454 struct scrub_nocow_inode,
3455 list);
3456 list_del_init(&entry->list);
3457 kfree(entry);
3458 }
3459 if (trans && !IS_ERR(trans))
3460 btrfs_end_transaction(trans, root);
3461 if (not_written)
3462 btrfs_dev_replace_stats_inc(&fs_info->dev_replace.
3463 num_uncorrectable_read_errors);
3464
3465 btrfs_free_path(path);
3466 kfree(nocow_ctx);
3467
3468 scrub_pending_trans_workers_dec(sctx);
3469 }
3470
3471 static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root,
3472 struct scrub_copy_nocow_ctx *nocow_ctx)
3473 {
3474 struct btrfs_fs_info *fs_info = nocow_ctx->sctx->dev_root->fs_info;
3475 struct btrfs_key key;
3476 struct inode *inode;
3477 struct page *page;
3478 struct btrfs_root *local_root;
3479 struct btrfs_ordered_extent *ordered;
3480 struct extent_map *em;
3481 struct extent_state *cached_state = NULL;
3482 struct extent_io_tree *io_tree;
3483 u64 physical_for_dev_replace;
3484 u64 len = nocow_ctx->len;
3485 u64 lockstart = offset, lockend = offset + len - 1;
3486 unsigned long index;
3487 int srcu_index;
3488 int ret = 0;
3489 int err = 0;
3490
3491 key.objectid = root;
3492 key.type = BTRFS_ROOT_ITEM_KEY;
3493 key.offset = (u64)-1;
3494
3495 srcu_index = srcu_read_lock(&fs_info->subvol_srcu);
3496
3497 local_root = btrfs_read_fs_root_no_name(fs_info, &key);
3498 if (IS_ERR(local_root)) {
3499 srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
3500 return PTR_ERR(local_root);
3501 }
3502
3503 key.type = BTRFS_INODE_ITEM_KEY;
3504 key.objectid = inum;
3505 key.offset = 0;
3506 inode = btrfs_iget(fs_info->sb, &key, local_root, NULL);
3507 srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
3508 if (IS_ERR(inode))
3509 return PTR_ERR(inode);
3510
3511 /* Avoid truncate/dio/punch hole.. */
3512 mutex_lock(&inode->i_mutex);
3513 inode_dio_wait(inode);
3514
3515 physical_for_dev_replace = nocow_ctx->physical_for_dev_replace;
3516 io_tree = &BTRFS_I(inode)->io_tree;
3517
3518 lock_extent_bits(io_tree, lockstart, lockend, 0, &cached_state);
3519 ordered = btrfs_lookup_ordered_range(inode, lockstart, len);
3520 if (ordered) {
3521 btrfs_put_ordered_extent(ordered);
3522 goto out_unlock;
3523 }
3524
3525 em = btrfs_get_extent(inode, NULL, 0, lockstart, len, 0);
3526 if (IS_ERR(em)) {
3527 ret = PTR_ERR(em);
3528 goto out_unlock;
3529 }
3530
3531 /*
3532 * This extent does not actually cover the logical extent anymore,
3533 * move on to the next inode.
3534 */
3535 if (em->block_start > nocow_ctx->logical ||
3536 em->block_start + em->block_len < nocow_ctx->logical + len) {
3537 free_extent_map(em);
3538 goto out_unlock;
3539 }
3540 free_extent_map(em);
3541
3542 while (len >= PAGE_CACHE_SIZE) {
3543 index = offset >> PAGE_CACHE_SHIFT;
3544 again:
3545 page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
3546 if (!page) {
3547 btrfs_err(fs_info, "find_or_create_page() failed");
3548 ret = -ENOMEM;
3549 goto out;
3550 }
3551
3552 if (PageUptodate(page)) {
3553 if (PageDirty(page))
3554 goto next_page;
3555 } else {
3556 ClearPageError(page);
3557 err = extent_read_full_page_nolock(io_tree, page,
3558 btrfs_get_extent,
3559 nocow_ctx->mirror_num);
3560 if (err) {
3561 ret = err;
3562 goto next_page;
3563 }
3564
3565 lock_page(page);
3566 /*
3567 * If the page has been removed from the page cache,
3568 * the data on it is meaningless, because it may be
3569 * stale; the new data may have been written into a new
3570 * page in the page cache.
3571 */
3572 if (page->mapping != inode->i_mapping) {
3573 unlock_page(page);
3574 page_cache_release(page);
3575 goto again;
3576 }
3577 if (!PageUptodate(page)) {
3578 ret = -EIO;
3579 goto next_page;
3580 }
3581 }
3582 err = write_page_nocow(nocow_ctx->sctx,
3583 physical_for_dev_replace, page);
3584 if (err)
3585 ret = err;
3586 next_page:
3587 unlock_page(page);
3588 page_cache_release(page);
3589
3590 if (ret)
3591 break;
3592
3593 offset += PAGE_CACHE_SIZE;
3594 physical_for_dev_replace += PAGE_CACHE_SIZE;
3595 len -= PAGE_CACHE_SIZE;
3596 }
3597 ret = COPY_COMPLETE;
3598 out_unlock:
3599 unlock_extent_cached(io_tree, lockstart, lockend, &cached_state,
3600 GFP_NOFS);
3601 out:
3602 mutex_unlock(&inode->i_mutex);
3603 iput(inode);
3604 return ret;
3605 }
3606
3607 static int write_page_nocow(struct scrub_ctx *sctx,
3608 u64 physical_for_dev_replace, struct page *page)
3609 {
3610 struct bio *bio;
3611 struct btrfs_device *dev;
3612 int ret;
3613
3614 dev = sctx->wr_ctx.tgtdev;
3615 if (!dev)
3616 return -EIO;
3617 if (!dev->bdev) {
3618 printk_ratelimited(KERN_WARNING
3619 "BTRFS: scrub write_page_nocow(bdev == NULL) is unexpected!\n");
3620 return -EIO;
3621 }
3622 bio = btrfs_io_bio_alloc(GFP_NOFS, 1);
3623 if (!bio) {
3624 spin_lock(&sctx->stat_lock);
3625 sctx->stat.malloc_errors++;
3626 spin_unlock(&sctx->stat_lock);
3627 return -ENOMEM;
3628 }
3629 bio->bi_iter.bi_size = 0;
3630 bio->bi_iter.bi_sector = physical_for_dev_replace >> 9;
3631 bio->bi_bdev = dev->bdev;
3632 ret = bio_add_page(bio, page, PAGE_CACHE_SIZE, 0);
3633 if (ret != PAGE_CACHE_SIZE) {
3634 leave_with_eio:
3635 bio_put(bio);
3636 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
3637 return -EIO;
3638 }
3639
3640 if (btrfsic_submit_bio_wait(WRITE_SYNC, bio))
3641 goto leave_with_eio;
3642
3643 bio_put(bio);
3644 return 0;
3645 }