fs/btrfs/raid56.c

   1 /*
   2  * Copyright (C) 2012 Fusion-io  All rights reserved.
   3  * Copyright (C) 2012 Intel Corp. All rights reserved.
   4  *
   5  * This program is free software; you can redistribute it and/or
   6  * modify it under the terms of the GNU General Public
   7  * License v2 as published by the Free Software Foundation.
   8  *
   9  * This program is distributed in the hope that it will be useful,
  10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  12  * General Public License for more details.
  13  *
  14  * You should have received a copy of the GNU General Public
  15  * License along with this program; if not, write to the
  16  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
   17  * Boston, MA 02111-1307, USA.
  18  */
  19 #include <linux/sched.h>
  20 #include <linux/wait.h>
  21 #include <linux/bio.h>
  22 #include <linux/slab.h>
  23 #include <linux/buffer_head.h>
  24 #include <linux/blkdev.h>
  25 #include <linux/random.h>
  26 #include <linux/iocontext.h>
  27 #include <linux/capability.h>
  28 #include <linux/ratelimit.h>
  29 #include <linux/kthread.h>
  30 #include <linux/raid/pq.h>
  31 #include <linux/hash.h>
  32 #include <linux/list_sort.h>
  33 #include <linux/raid/xor.h>
  34 #include <linux/vmalloc.h>
  35 #include <asm/div64.h>
  36 #include "ctree.h"
  37 #include "extent_map.h"
  38 #include "disk-io.h"
  39 #include "transaction.h"
  40 #include "print-tree.h"
  41 #include "volumes.h"
  42 #include "raid56.h"
  43 #include "async-thread.h"
  44 #include "check-integrity.h"
  45 #include "rcu-string.h"
  46
  47 /* set when additional merges to this rbio are not allowed */
  48 #define RBIO_RMW_LOCKED_BIT     1
  49
  50 /*
  51  * set when this rbio is sitting in the hash, but it is just a cache
  52  * of past RMW
  53  */
  54 #define RBIO_CACHE_BIT          2
  55
  56 /*
  57  * set when it is safe to trust the stripe_pages for caching
  58  */
  59 #define RBIO_CACHE_READY_BIT    3
  60
  61 /*
  62  * bbio and raid_map is managed by the caller, so we shouldn't free
  63  * them here. And besides that, all rbios with this flag should not
  64  * be cached, because we need raid_map to check the rbios' stripe
  65  * is the same or not, but it is very likely that the caller has
  66  * free raid_map, so don't cache those rbios.
  67  */
  68 #define RBIO_HOLD_BBIO_MAP_BIT  4
  69
  70 #define RBIO_CACHE_SIZE 1024
  71
  72 enum btrfs_rbio_ops {
  73         BTRFS_RBIO_WRITE        = 0,
  74         BTRFS_RBIO_READ_REBUILD = 1,
  75         BTRFS_RBIO_PARITY_SCRUB = 2,
  76 };
  77
  78 struct btrfs_raid_bio {
  79         struct btrfs_fs_info *fs_info;
  80         struct btrfs_bio *bbio;
  81
  82         /* while we're doing rmw on a stripe
  83          * we put it into a hash table so we can
  84          * lock the stripe and merge more rbios
  85          * into it.
  86          */
  87         struct list_head hash_list;
  88
  89         /*
  90          * LRU list for the stripe cache
  91          */
  92         struct list_head stripe_cache;
  93
  94         /*
  95          * for scheduling work in the helper threads
  96          */
  97         struct btrfs_work work;
  98
  99         /*
 100          * bio list and bio_list_lock are used
 101          * to add more bios into the stripe
 102          * in hopes of avoiding the full rmw
 103          */
 104         struct bio_list bio_list;
 105         spinlock_t bio_list_lock;
 106
 107         /* also protected by the bio_list_lock, the
 108          * plug list is used by the plugging code
 109          * to collect partial bios while plugged.  The
 110          * stripe locking code also uses it to hand off
 111          * the stripe lock to the next pending IO
 112          */
 113         struct list_head plug_list;
 114
 115         /*
 116          * flags that tell us if it is safe to
 117          * merge with this bio
 118          */
 119         unsigned long flags;
 120
 121         /* size of each individual stripe on disk */
 122         int stripe_len;
 123
 124         /* number of data stripes (no p/q) */
 125         int nr_data;
 126
 127         int real_stripes;
 128
 129         int stripe_npages;
 130         /*
 131          * set if we're doing a parity rebuild
 132          * for a read from higher up, which is handled
 133          * differently from a parity rebuild as part of
 134          * rmw
 135          */
 136         enum btrfs_rbio_ops operation;
 137
 138         /* first bad stripe */
 139         int faila;
 140
 141         /* second bad stripe (for raid6 use) */
 142         int failb;
 143
 144         int scrubp;
 145         /*
 146          * number of pages needed to represent the full
 147          * stripe
 148          */
 149         int nr_pages;
 150
 151         /*
 152          * size of all the bios in the bio_list.  This
 153          * helps us decide if the rbio maps to a full
 154          * stripe or not
 155          */
 156         int bio_list_bytes;
 157
 158         int generic_bio_cnt;
 159
 160         atomic_t refs;
 161
 162         atomic_t stripes_pending;
 163
 164         atomic_t error;
 165         /*
 166          * these are two arrays of pointers.  We allocate the
 167          * rbio big enough to hold them both and setup their
 168          * locations when the rbio is allocated
 169          */
 170
 171         /* pointers to pages that we allocated for
 172          * reading/writing stripes directly from the disk (including P/Q)
 173          */
 174         struct page **stripe_pages;
 175
 176         /*
 177          * pointers to the pages in the bio_list.  Stored
 178          * here for faster lookup
 179          */
 180         struct page **bio_pages;
 181
 182         /*
 183          * bitmap to record which horizontal stripe has data
 184          */
 185         unsigned long *dbitmap;
 186 };
 187
 188 static int __raid56_parity_recover(struct btrfs_raid_bio *rbio);
 189 static noinline void finish_rmw(struct btrfs_raid_bio *rbio);
 190 static void rmw_work(struct btrfs_work *work);
 191 static void read_rebuild_work(struct btrfs_work *work);
 192 static void async_rmw_stripe(struct btrfs_raid_bio *rbio);
 193 static void async_read_rebuild(struct btrfs_raid_bio *rbio);
 194 static int fail_bio_stripe(struct btrfs_raid_bio *rbio, struct bio *bio);
 195 static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed);
 196 static void __free_raid_bio(struct btrfs_raid_bio *rbio);
 197 static void index_rbio_pages(struct btrfs_raid_bio *rbio);
 198 static int alloc_rbio_pages(struct btrfs_raid_bio *rbio);
 199
 200 static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
 201                                          int need_check);
 202 static void async_scrub_parity(struct btrfs_raid_bio *rbio);
 203
 204 /*
 205  * the stripe hash table is used for locking, and to collect
 206  * bios in hopes of making a full stripe
 207  */
 208 int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info)
 209 {
 210         struct btrfs_stripe_hash_table *table;
 211         struct btrfs_stripe_hash_table *x;
 212         struct btrfs_stripe_hash *cur;
 213         struct btrfs_stripe_hash *h;
 214         int num_entries = 1 << BTRFS_STRIPE_HASH_TABLE_BITS;
 215         int i;
 216         int table_size;
 217
 218         if (info->stripe_hash_table)
 219                 return 0;
 220
 221         /*
 222          * The table is large, starting with order 4 and can go as high as
 223          * order 7 in case lock debugging is turned on.
 224          *
 225          * Try harder to allocate and fallback to vmalloc to lower the chance
 226          * of a failing mount.
 227          */
 228         table_size = sizeof(*table) + sizeof(*h) * num_entries;
 229         table = kzalloc(table_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
 230         if (!table) {
 231                 table = vzalloc(table_size);
 232                 if (!table)
 233                         return -ENOMEM;
 234         }
 235
 236         spin_lock_init(&table->cache_lock);
 237         INIT_LIST_HEAD(&table->stripe_cache);
 238
 239         h = table->table;
 240
 241         for (i = 0; i < num_entries; i++) {
 242                 cur = h + i;
 243                 INIT_LIST_HEAD(&cur->hash_list);
 244                 spin_lock_init(&cur->lock);
 245                 init_waitqueue_head(&cur->wait);
 246         }
 247
 248         x = cmpxchg(&info->stripe_hash_table, NULL, table);
 249         if (x) {
 250                 if (is_vmalloc_addr(x))
 251                         vfree(x);
 252                 else
 253                         kfree(x);
 254         }
 255         return 0;
 256 }
 257
 258 /*
 259  * caching an rbio means to copy anything from the
 260  * bio_pages array into the stripe_pages array.  We
 261  * use the page uptodate bit in the stripe cache array
 262  * to indicate if it has valid data
 263  *
 264  * once the caching is done, we set the cache ready
 265  * bit.
 266  */
 267 static void cache_rbio_pages(struct btrfs_raid_bio *rbio)
 268 {
 269         int i;
 270         char *s;
 271         char *d;
 272         int ret;
 273
 274         ret = alloc_rbio_pages(rbio);
 275         if (ret)
 276                 return;
 277
 278         for (i = 0; i < rbio->nr_pages; i++) {
 279                 if (!rbio->bio_pages[i])
 280                         continue;
 281
 282                 s = kmap(rbio->bio_pages[i]);
 283                 d = kmap(rbio->stripe_pages[i]);
 284
 285                 memcpy(d, s, PAGE_CACHE_SIZE);
 286
 287                 kunmap(rbio->bio_pages[i]);
 288                 kunmap(rbio->stripe_pages[i]);
 289                 SetPageUptodate(rbio->stripe_pages[i]);
 290         }
 291         set_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
 292 }
 293
 294 /*
 295  * we hash on the first logical address of the stripe
 296  */
 297 static int rbio_bucket(struct btrfs_raid_bio *rbio)
 298 {
 299         u64 num = rbio->bbio->raid_map[0];
 300
 301         /*
 302          * we shift down quite a bit.  We're using byte
 303          * addressing, and most of the lower bits are zeros.
 304          * This tends to upset hash_64, and it consistently
 305          * returns just one or two different values.
 306          *
 307          * shifting off the lower bits fixes things.
 308          */
 309         return hash_64(num >> 16, BTRFS_STRIPE_HASH_TABLE_BITS);
 310 }
 311
 312 /*
 313  * stealing an rbio means taking all the uptodate pages from the stripe
 314  * array in the source rbio and putting them into the destination rbio
 315  */
 316 static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest)
 317 {
 318         int i;
 319         struct page *s;
 320         struct page *d;
 321
 322         if (!test_bit(RBIO_CACHE_READY_BIT, &src->flags))
 323                 return;
 324
 325         for (i = 0; i < dest->nr_pages; i++) {
 326                 s = src->stripe_pages[i];
 327                 if (!s || !PageUptodate(s)) {
 328                         continue;
 329                 }
 330
 331                 d = dest->stripe_pages[i];
 332                 if (d)
 333                         __free_page(d);
 334
 335                 dest->stripe_pages[i] = s;
 336                 src->stripe_pages[i] = NULL;
 337         }
 338 }
 339
 340 /*
 341  * merging means we take the bio_list from the victim and
 342  * splice it into the destination.  The victim should
 343  * be discarded afterwards.
 344  *
 345  * must be called with dest->rbio_list_lock held
 346  */
 347 static void merge_rbio(struct btrfs_raid_bio *dest,
 348                        struct btrfs_raid_bio *victim)
 349 {
 350         bio_list_merge(&dest->bio_list, &victim->bio_list);
 351         dest->bio_list_bytes += victim->bio_list_bytes;
 352         dest->generic_bio_cnt += victim->generic_bio_cnt;
 353         bio_list_init(&victim->bio_list);
 354 }
 355
 356 /*
 357  * used to prune items that are in the cache.  The caller
 358  * must hold the hash table lock.
 359  */
 360 static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
 361 {
 362         int bucket = rbio_bucket(rbio);
 363         struct btrfs_stripe_hash_table *table;
 364         struct btrfs_stripe_hash *h;
 365         int freeit = 0;
 366
 367         /*
 368          * check the bit again under the hash table lock.
 369          */
 370         if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
 371                 return;
 372
 373         table = rbio->fs_info->stripe_hash_table;
 374         h = table->table + bucket;
 375
 376         /* hold the lock for the bucket because we may be
 377          * removing it from the hash table
 378          */
 379         spin_lock(&h->lock);
 380
 381         /*
 382          * hold the lock for the bio list because we need
 383          * to make sure the bio list is empty
 384          */
 385         spin_lock(&rbio->bio_list_lock);
 386
 387         if (test_and_clear_bit(RBIO_CACHE_BIT, &rbio->flags)) {
 388                 list_del_init(&rbio->stripe_cache);
 389                 table->cache_size -= 1;
 390                 freeit = 1;
 391
 392                 /* if the bio list isn't empty, this rbio is
 393                  * still involved in an IO.  We take it out
 394                  * of the cache list, and drop the ref that
 395                  * was held for the list.
 396                  *
 397                  * If the bio_list was empty, we also remove
 398                  * the rbio from the hash_table, and drop
 399                  * the corresponding ref
 400                  */
 401                 if (bio_list_empty(&rbio->bio_list)) {
 402                         if (!list_empty(&rbio->hash_list)) {
 403                                 list_del_init(&rbio->hash_list);
 404                                 atomic_dec(&rbio->refs);
 405                                 BUG_ON(!list_empty(&rbio->plug_list));
 406                         }
 407                 }
 408         }
 409
 410         spin_unlock(&rbio->bio_list_lock);
 411         spin_unlock(&h->lock);
 412
 413         if (freeit)
 414                 __free_raid_bio(rbio);
 415 }
 416
 417 /*
 418  * prune a given rbio from the cache
 419  */
 420 static void remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
 421 {
 422         struct btrfs_stripe_hash_table *table;
 423         unsigned long flags;
 424
 425         if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
 426                 return;
 427
 428         table = rbio->fs_info->stripe_hash_table;
 429
 430         spin_lock_irqsave(&table->cache_lock, flags);
 431         __remove_rbio_from_cache(rbio);
 432         spin_unlock_irqrestore(&table->cache_lock, flags);
 433 }
 434
 435 /*
 436  * remove everything in the cache
 437  */
 438 static void btrfs_clear_rbio_cache(struct btrfs_fs_info *info)
 439 {
 440         struct btrfs_stripe_hash_table *table;
 441         unsigned long flags;
 442         struct btrfs_raid_bio *rbio;
 443
 444         table = info->stripe_hash_table;
 445
 446         spin_lock_irqsave(&table->cache_lock, flags);
 447         while (!list_empty(&table->stripe_cache)) {
 448                 rbio = list_entry(table->stripe_cache.next,
 449                                   struct btrfs_raid_bio,
 450                                   stripe_cache);
 451                 __remove_rbio_from_cache(rbio);
 452         }
 453         spin_unlock_irqrestore(&table->cache_lock, flags);
 454 }
 455
 456 /*
 457  * remove all cached entries and free the hash table
 458  * used by unmount
 459  */
 460 void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info)
 461 {
 462         if (!info->stripe_hash_table)
 463                 return;
 464         btrfs_clear_rbio_cache(info);
 465         if (is_vmalloc_addr(info->stripe_hash_table))
 466                 vfree(info->stripe_hash_table);
 467         else
 468                 kfree(info->stripe_hash_table);
 469         info->stripe_hash_table = NULL;
 470 }
 471
 472 /*
 473  * insert an rbio into the stripe cache.  It
 474  * must have already been prepared by calling
 475  * cache_rbio_pages
 476  *
 477  * If this rbio was already cached, it gets
 478  * moved to the front of the lru.
 479  *
 480  * If the size of the rbio cache is too big, we
 481  * prune an item.
 482  */
 483 static void cache_rbio(struct btrfs_raid_bio *rbio)
 484 {
 485         struct btrfs_stripe_hash_table *table;
 486         unsigned long flags;
 487
 488         if (!test_bit(RBIO_CACHE_READY_BIT, &rbio->flags))
 489                 return;
 490
 491         table = rbio->fs_info->stripe_hash_table;
 492
 493         spin_lock_irqsave(&table->cache_lock, flags);
 494         spin_lock(&rbio->bio_list_lock);
 495
 496         /* bump our ref if we were not in the list before */
 497         if (!test_and_set_bit(RBIO_CACHE_BIT, &rbio->flags))
 498                 atomic_inc(&rbio->refs);
 499
 500         if (!list_empty(&rbio->stripe_cache)){
 501                 list_move(&rbio->stripe_cache, &table->stripe_cache);
 502         } else {
 503                 list_add(&rbio->stripe_cache, &table->stripe_cache);
 504                 table->cache_size += 1;
 505         }
 506
 507         spin_unlock(&rbio->bio_list_lock);
 508
 509         if (table->cache_size > RBIO_CACHE_SIZE) {
 510                 struct btrfs_raid_bio *found;
 511
 512                 found = list_entry(table->stripe_cache.prev,
 513                                   struct btrfs_raid_bio,
 514                                   stripe_cache);
 515
 516                 if (found != rbio)
 517                         __remove_rbio_from_cache(found);
 518         }
 519
 520         spin_unlock_irqrestore(&table->cache_lock, flags);
 521         return;
 522 }
 523
 524 /*
 525  * helper function to run the xor_blocks api.  It is only
 526  * able to do MAX_XOR_BLOCKS at a time, so we need to
 527  * loop through.
 528  */
 529 static void run_xor(void **pages, int src_cnt, ssize_t len)
 530 {
 531         int src_off = 0;
 532         int xor_src_cnt = 0;
 533         void *dest = pages[src_cnt];
 534
 535         while(src_cnt > 0) {
 536                 xor_src_cnt = min(src_cnt, MAX_XOR_BLOCKS);
 537                 xor_blocks(xor_src_cnt, len, dest, pages + src_off);
 538
 539                 src_cnt -= xor_src_cnt;
 540                 src_off += xor_src_cnt;
 541         }
 542 }
 543
 544 /*
 545  * returns true if the bio list inside this rbio
 546  * covers an entire stripe (no rmw required).
 547  * Must be called with the bio list lock held, or
 548  * at a time when you know it is impossible to add
 549  * new bios into the list
 550  */
 551 static int __rbio_is_full(struct btrfs_raid_bio *rbio)
 552 {
 553         unsigned long size = rbio->bio_list_bytes;
 554         int ret = 1;
 555
 556         if (size != rbio->nr_data * rbio->stripe_len)
 557                 ret = 0;
 558
 559         BUG_ON(size > rbio->nr_data * rbio->stripe_len);
 560         return ret;
 561 }
 562
 563 static int rbio_is_full(struct btrfs_raid_bio *rbio)
 564 {
 565         unsigned long flags;
 566         int ret;
 567
 568         spin_lock_irqsave(&rbio->bio_list_lock, flags);
 569         ret = __rbio_is_full(rbio);
 570         spin_unlock_irqrestore(&rbio->bio_list_lock, flags);
 571         return ret;
 572 }
 573
 574 /*
 575  * returns 1 if it is safe to merge two rbios together.
 576  * The merging is safe if the two rbios correspond to
 577  * the same stripe and if they are both going in the same
 578  * direction (read vs write), and if neither one is
 579  * locked for final IO
 580  *
 581  * The caller is responsible for locking such that
 582  * rmw_locked is safe to test
 583  */
 584 static int rbio_can_merge(struct btrfs_raid_bio *last,
 585                           struct btrfs_raid_bio *cur)
 586 {
 587         if (test_bit(RBIO_RMW_LOCKED_BIT, &last->flags) ||
 588             test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags))
 589                 return 0;
 590
 591         /*
 592          * we can't merge with cached rbios, since the
 593          * idea is that when we merge the destination
 594          * rbio is going to run our IO for us.  We can
 595          * steal from cached rbio's though, other functions
 596          * handle that.
 597          */
 598         if (test_bit(RBIO_CACHE_BIT, &last->flags) ||
 599             test_bit(RBIO_CACHE_BIT, &cur->flags))
 600                 return 0;
 601
 602         if (last->bbio->raid_map[0] !=
 603             cur->bbio->raid_map[0])
 604                 return 0;
 605
 606         /* we can't merge with different operations */
 607         if (last->operation != cur->operation)
 608                 return 0;
 609         /*
 610          * We've need read the full stripe from the drive.
 611          * check and repair the parity and write the new results.
 612          *
 613          * We're not allowed to add any new bios to the
 614          * bio list here, anyone else that wants to
 615          * change this stripe needs to do their own rmw.
 616          */
 617         if (last->operation == BTRFS_RBIO_PARITY_SCRUB ||
 618             cur->operation == BTRFS_RBIO_PARITY_SCRUB)
 619                 return 0;
 620
 621         return 1;
 622 }
 623
 624 /*
 625  * helper to index into the pstripe
 626  */
 627 static struct page *rbio_pstripe_page(struct btrfs_raid_bio *rbio, int index)
 628 {
 629         index += (rbio->nr_data * rbio->stripe_len) >> PAGE_CACHE_SHIFT;
 630         return rbio->stripe_pages[index];
 631 }
 632
 633 /*
 634  * helper to index into the qstripe, returns null
 635  * if there is no qstripe
 636  */
 637 static struct page *rbio_qstripe_page(struct btrfs_raid_bio *rbio, int index)
 638 {
 639         if (rbio->nr_data + 1 == rbio->real_stripes)
 640                 return NULL;
 641
 642         index += ((rbio->nr_data + 1) * rbio->stripe_len) >>
 643                 PAGE_CACHE_SHIFT;
 644         return rbio->stripe_pages[index];
 645 }
 646
 647 /*
 648  * The first stripe in the table for a logical address
 649  * has the lock.  rbios are added in one of three ways:
 650  *
 651  * 1) Nobody has the stripe locked yet.  The rbio is given
 652  * the lock and 0 is returned.  The caller must start the IO
 653  * themselves.
 654  *
 655  * 2) Someone has the stripe locked, but we're able to merge
 656  * with the lock owner.  The rbio is freed and the IO will
 657  * start automatically along with the existing rbio.  1 is returned.
 658  *
 659  * 3) Someone has the stripe locked, but we're not able to merge.
 660  * The rbio is added to the lock owner's plug list, or merged into
 661  * an rbio already on the plug list.  When the lock owner unlocks,
 662  * the next rbio on the list is run and the IO is started automatically.
 663  * 1 is returned
 664  *
 665  * If we return 0, the caller still owns the rbio and must continue with
 666  * IO submission.  If we return 1, the caller must assume the rbio has
 667  * already been freed.
 668  */
 669 static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio)
 670 {
 671         int bucket = rbio_bucket(rbio);
 672         struct btrfs_stripe_hash *h = rbio->fs_info->stripe_hash_table->table + bucket;
 673         struct btrfs_raid_bio *cur;
 674         struct btrfs_raid_bio *pending;
 675         unsigned long flags;
 676         DEFINE_WAIT(wait);
 677         struct btrfs_raid_bio *freeit = NULL;
 678         struct btrfs_raid_bio *cache_drop = NULL;
 679         int ret = 0;
 680         int walk = 0;
 681
 682         spin_lock_irqsave(&h->lock, flags);
 683         list_for_each_entry(cur, &h->hash_list, hash_list) {
 684                 walk++;
 685                 if (cur->bbio->raid_map[0] == rbio->bbio->raid_map[0]) {
 686                         spin_lock(&cur->bio_list_lock);
 687
 688                         /* can we steal this cached rbio's pages? */
 689                         if (bio_list_empty(&cur->bio_list) &&
 690                             list_empty(&cur->plug_list) &&
 691                             test_bit(RBIO_CACHE_BIT, &cur->flags) &&
 692                             !test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) {
 693                                 list_del_init(&cur->hash_list);
 694                                 atomic_dec(&cur->refs);
 695
 696                                 steal_rbio(cur, rbio);
 697                                 cache_drop = cur;
 698                                 spin_unlock(&cur->bio_list_lock);
 699
 700                                 goto lockit;
 701                         }
 702
 703                         /* can we merge into the lock owner? */
 704                         if (rbio_can_merge(cur, rbio)) {
 705                                 merge_rbio(cur, rbio);
 706                                 spin_unlock(&cur->bio_list_lock);
 707                                 freeit = rbio;
 708                                 ret = 1;
 709                                 goto out;
 710                         }
 711
 712
 713                         /*
 714                          * we couldn't merge with the running
 715                          * rbio, see if we can merge with the
 716                          * pending ones.  We don't have to
 717                          * check for rmw_locked because there
 718                          * is no way they are inside finish_rmw
 719                          * right now
 720                          */
 721                         list_for_each_entry(pending, &cur->plug_list,
 722                                             plug_list) {
 723                                 if (rbio_can_merge(pending, rbio)) {
 724                                         merge_rbio(pending, rbio);
 725                                         spin_unlock(&cur->bio_list_lock);
 726                                         freeit = rbio;
 727                                         ret = 1;
 728                                         goto out;
 729                                 }
 730                         }
 731
 732                         /* no merging, put us on the tail of the plug list,
 733                          * our rbio will be started with the currently
 734                          * running rbio unlocks
 735                          */
 736                         list_add_tail(&rbio->plug_list, &cur->plug_list);
 737                         spin_unlock(&cur->bio_list_lock);
 738                         ret = 1;
 739                         goto out;
 740                 }
 741         }
 742 lockit:
 743         atomic_inc(&rbio->refs);
 744         list_add(&rbio->hash_list, &h->hash_list);
 745 out:
 746         spin_unlock_irqrestore(&h->lock, flags);
 747         if (cache_drop)
 748                 remove_rbio_from_cache(cache_drop);
 749         if (freeit)
 750                 __free_raid_bio(freeit);
 751         return ret;
 752 }
 753
 754 /*
 755  * called as rmw or parity rebuild is completed.  If the plug list has more
 756  * rbios waiting for this stripe, the next one on the list will be started
 757  */
 758 static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
 759 {
 760         int bucket;
 761         struct btrfs_stripe_hash *h;
 762         unsigned long flags;
 763         int keep_cache = 0;
 764
 765         bucket = rbio_bucket(rbio);
 766         h = rbio->fs_info->stripe_hash_table->table + bucket;
 767
 768         if (list_empty(&rbio->plug_list))
 769                 cache_rbio(rbio);
 770
 771         spin_lock_irqsave(&h->lock, flags);
 772         spin_lock(&rbio->bio_list_lock);
 773
 774         if (!list_empty(&rbio->hash_list)) {
 775                 /*
 776                  * if we're still cached and there is no other IO
 777                  * to perform, just leave this rbio here for others
 778                  * to steal from later
 779                  */
 780                 if (list_empty(&rbio->plug_list) &&
 781                     test_bit(RBIO_CACHE_BIT, &rbio->flags)) {
 782                         keep_cache = 1;
 783                         clear_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
 784                         BUG_ON(!bio_list_empty(&rbio->bio_list));
 785                         goto done;
 786                 }
 787
 788                 list_del_init(&rbio->hash_list);
 789                 atomic_dec(&rbio->refs);
 790
 791                 /*
 792                  * we use the plug list to hold all the rbios
 793                  * waiting for the chance to lock this stripe.
 794                  * hand the lock over to one of them.
 795                  */
 796                 if (!list_empty(&rbio->plug_list)) {
 797                         struct btrfs_raid_bio *next;
 798                         struct list_head *head = rbio->plug_list.next;
 799
 800                         next = list_entry(head, struct btrfs_raid_bio,
 801                                           plug_list);
 802
 803                         list_del_init(&rbio->plug_list);
 804
 805                         list_add(&next->hash_list, &h->hash_list);
 806                         atomic_inc(&next->refs);
 807                         spin_unlock(&rbio->bio_list_lock);
 808                         spin_unlock_irqrestore(&h->lock, flags);
 809
 810                         if (next->operation == BTRFS_RBIO_READ_REBUILD)
 811                                 async_read_rebuild(next);
 812                         else if (next->operation == BTRFS_RBIO_WRITE) {
 813                                 steal_rbio(rbio, next);
 814                                 async_rmw_stripe(next);
 815                         } else if (next->operation == BTRFS_RBIO_PARITY_SCRUB) {
 816                                 steal_rbio(rbio, next);
 817                                 async_scrub_parity(next);
 818                         }
 819
 820                         goto done_nolock;
 821                 } else  if (waitqueue_active(&h->wait)) {
 822                         spin_unlock(&rbio->bio_list_lock);
 823                         spin_unlock_irqrestore(&h->lock, flags);
 824                         wake_up(&h->wait);
 825                         goto done_nolock;
 826                 }
 827         }
 828 done:
 829         spin_unlock(&rbio->bio_list_lock);
 830         spin_unlock_irqrestore(&h->lock, flags);
 831
 832 done_nolock:
 833         if (!keep_cache)
 834                 remove_rbio_from_cache(rbio);
 835 }
 836
 837 static inline void
 838 __free_bbio(struct btrfs_bio *bbio, int need)
 839 {
 840         if (need)
 841                 kfree(bbio);
 842 }
 843
 844 static inline void free_bbio(struct btrfs_raid_bio *rbio)
 845 {
 846         __free_bbio(rbio->bbio,
 847                     !test_bit(RBIO_HOLD_BBIO_MAP_BIT, &rbio->flags));
 848 }
 849
 850 static void __free_raid_bio(struct btrfs_raid_bio *rbio)
 851 {
 852         int i;
 853
 854         WARN_ON(atomic_read(&rbio->refs) < 0);
 855         if (!atomic_dec_and_test(&rbio->refs))
 856                 return;
 857
 858         WARN_ON(!list_empty(&rbio->stripe_cache));
 859         WARN_ON(!list_empty(&rbio->hash_list));
 860         WARN_ON(!bio_list_empty(&rbio->bio_list));
 861
 862         for (i = 0; i < rbio->nr_pages; i++) {
 863                 if (rbio->stripe_pages[i]) {
 864                         __free_page(rbio->stripe_pages[i]);
 865                         rbio->stripe_pages[i] = NULL;
 866                 }
 867         }
 868
 869         free_bbio(rbio);
 870
 871         kfree(rbio);
 872 }
 873
/*
 * calls unlock_stripe() on the rbio and then drops one reference on it
 * via __free_raid_bio().  The rbio may be freed by the time this returns.
 */
static void free_raid_bio(struct btrfs_raid_bio *rbio)
{
        unlock_stripe(rbio);
        __free_raid_bio(rbio);
}
 879
 880 /*
 881  * this frees the rbio and runs through all the bios in the
 882  * bio_list and calls end_io on them
 883  */
 884 static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, int err, int uptodate)
 885 {
 886         struct bio *cur = bio_list_get(&rbio->bio_list);
 887         struct bio *next;
 888
 889         if (rbio->generic_bio_cnt)
 890                 btrfs_bio_counter_sub(rbio->fs_info, rbio->generic_bio_cnt);
 891
 892         free_raid_bio(rbio);
 893
 894         while (cur) {
 895                 next = cur->bi_next;
 896                 cur->bi_next = NULL;
 897                 if (uptodate)
 898                         set_bit(BIO_UPTODATE, &cur->bi_flags);
 899                 bio_endio(cur, err);
 900                 cur = next;
 901         }
 902 }
 903
 904 /*
 905  * end io function used by finish_rmw.  When we finally
 906  * get here, we've written a full stripe
 907  */
 908 static void raid_write_end_io(struct bio *bio, int err)
 909 {
 910         struct btrfs_raid_bio *rbio = bio->bi_private;
 911
 912         if (err)
 913                 fail_bio_stripe(rbio, bio);
 914
 915         bio_put(bio);
 916
 917         if (!atomic_dec_and_test(&rbio->stripes_pending))
 918                 return;
 919
 920         err = 0;
 921
 922         /* OK, we have read all the stripes we need to. */
 923         if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
 924                 err = -EIO;
 925
 926         rbio_orig_end_io(rbio, err, 0);
 927         return;
 928 }
 929
 930 /*
 931  * the read/modify/write code wants to use the original bio for
 932  * any pages it included, and then use the rbio for everything
 933  * else.  This function decides if a given index (stripe number)
 934  * and page number in that stripe fall inside the original bio
 935  * or the rbio.
 936  *
 937  * if you set bio_list_only, you'll get a NULL back for any ranges
 938  * that are outside the bio_list
 939  *
 940  * This doesn't take any refs on anything, you get a bare page pointer
 941  * and the caller must bump refs as required.
 942  *
 943  * You must call index_rbio_pages once before you can trust
 944  * the answers from this function.
 945  */
 946 static struct page *page_in_rbio(struct btrfs_raid_bio *rbio,
 947                                  int index, int pagenr, int bio_list_only)
 948 {
 949         int chunk_page;
 950         struct page *p = NULL;
 951
 952         chunk_page = index * (rbio->stripe_len >> PAGE_SHIFT) + pagenr;
 953
 954         spin_lock_irq(&rbio->bio_list_lock);
 955         p = rbio->bio_pages[chunk_page];
 956         spin_unlock_irq(&rbio->bio_list_lock);
 957
 958         if (p || bio_list_only)
 959                 return p;
 960
 961         return rbio->stripe_pages[chunk_page];
 962 }
 963
 964 /*
 965  * number of pages we need for the entire stripe across all the
 966  * drives
 967  */
 968 static unsigned long rbio_nr_pages(unsigned long stripe_len, int nr_stripes)
 969 {
 970         unsigned long nr = stripe_len * nr_stripes;
 971         return DIV_ROUND_UP(nr, PAGE_CACHE_SIZE);
 972 }
 973
 974 /*
 975  * allocation and initial setup for the btrfs_raid_bio.  Not
 976  * this does not allocate any pages for rbio->pages.
 977  */
 978 static struct btrfs_raid_bio *alloc_rbio(struct btrfs_root *root,
 979                           struct btrfs_bio *bbio, u64 stripe_len)
 980 {
 981         struct btrfs_raid_bio *rbio;
 982         int nr_data = 0;
 983         int real_stripes = bbio->num_stripes - bbio->num_tgtdevs;
 984         int num_pages = rbio_nr_pages(stripe_len, real_stripes);
 985         int stripe_npages = DIV_ROUND_UP(stripe_len, PAGE_SIZE);
 986         void *p;
 987
 988         rbio = kzalloc(sizeof(*rbio) + num_pages * sizeof(struct page *) * 2 +
 989                        DIV_ROUND_UP(stripe_npages, BITS_PER_LONG / 8),
 990                         GFP_NOFS);
 991         if (!rbio)
 992                 return ERR_PTR(-ENOMEM);
 993
 994         bio_list_init(&rbio->bio_list);
 995         INIT_LIST_HEAD(&rbio->plug_list);
 996         spin_lock_init(&rbio->bio_list_lock);
 997         INIT_LIST_HEAD(&rbio->stripe_cache);
 998         INIT_LIST_HEAD(&rbio->hash_list);
 999         rbio->bbio = bbio;
1000         rbio->fs_info = root->fs_info;
1001         rbio->stripe_len = stripe_len;
1002         rbio->nr_pages = num_pages;
1003         rbio->real_stripes = real_stripes;
1004         rbio->stripe_npages = stripe_npages;
1005         rbio->faila = -1;
1006         rbio->failb = -1;
1007         atomic_set(&rbio->refs, 1);
1008         atomic_set(&rbio->error, 0);
1009         atomic_set(&rbio->stripes_pending, 0);
1010
1011         /*
1012          * the stripe_pages and bio_pages array point to the extra
1013          * memory we allocated past the end of the rbio
1014          */
1015         p = rbio + 1;
1016         rbio->stripe_pages = p;
1017         rbio->bio_pages = p + sizeof(struct page *) * num_pages;
1018         rbio->dbitmap = p + sizeof(struct page *) * num_pages * 2;
1019
1020         if (bbio->raid_map[real_stripes - 1] == RAID6_Q_STRIPE)
1021                 nr_data = real_stripes - 2;
1022         else
1023                 nr_data = real_stripes - 1;
1024
1025         rbio->nr_data = nr_data;
1026         return rbio;
1027 }
1028
1029 /* allocate pages for all the stripes in the bio, including parity */
1030 static int alloc_rbio_pages(struct btrfs_raid_bio *rbio)
1031 {
1032         int i;
1033         struct page *page;
1034
1035         for (i = 0; i < rbio->nr_pages; i++) {
1036                 if (rbio->stripe_pages[i])
1037                         continue;
1038                 page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
1039                 if (!page)
1040                         return -ENOMEM;
1041                 rbio->stripe_pages[i] = page;
1042                 ClearPageUptodate(page);
1043         }
1044         return 0;
1045 }
1046
1047 /* allocate pages for just the p/q stripes */
1048 static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio)
1049 {
1050         int i;
1051         struct page *page;
1052
1053         i = (rbio->nr_data * rbio->stripe_len) >> PAGE_CACHE_SHIFT;
1054
1055         for (; i < rbio->nr_pages; i++) {
1056                 if (rbio->stripe_pages[i])
1057                         continue;
1058                 page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
1059                 if (!page)
1060                         return -ENOMEM;
1061                 rbio->stripe_pages[i] = page;
1062         }
1063         return 0;
1064 }
1065
1066 /*
1067  * add a single page from a specific stripe into our list of bios for IO
1068  * this will try to merge into existing bios if possible, and returns
1069  * zero if all went well.
1070  */
1071 static int rbio_add_io_page(struct btrfs_raid_bio *rbio,
1072                             struct bio_list *bio_list,
1073                             struct page *page,
1074                             int stripe_nr,
1075                             unsigned long page_index,
1076                             unsigned long bio_max_len)
1077 {
1078         struct bio *last = bio_list->tail;
1079         u64 last_end = 0;
1080         int ret;
1081         struct bio *bio;
1082         struct btrfs_bio_stripe *stripe;
1083         u64 disk_start;
1084
1085         stripe = &rbio->bbio->stripes[stripe_nr];
1086         disk_start = stripe->physical + (page_index << PAGE_CACHE_SHIFT);
1087
1088         /* if the device is missing, just fail this stripe */
1089         if (!stripe->dev->bdev)
1090                 return fail_rbio_index(rbio, stripe_nr);
1091
1092         /* see if we can add this page onto our existing bio */
1093         if (last) {
1094                 last_end = (u64)last->bi_iter.bi_sector << 9;
1095                 last_end += last->bi_iter.bi_size;
1096
1097                 /*
1098                  * we can't merge these if they are from different
1099                  * devices or if they are not contiguous
1100                  */
1101                 if (last_end == disk_start && stripe->dev->bdev &&
1102                     test_bit(BIO_UPTODATE, &last->bi_flags) &&
1103                     last->bi_bdev == stripe->dev->bdev) {
1104                         ret = bio_add_page(last, page, PAGE_CACHE_SIZE, 0);
1105                         if (ret == PAGE_CACHE_SIZE)
1106                                 return 0;
1107                 }
1108         }
1109
1110         /* put a new bio on the list */
1111         bio = btrfs_io_bio_alloc(GFP_NOFS, bio_max_len >> PAGE_SHIFT?:1);
1112         if (!bio)
1113                 return -ENOMEM;
1114
1115         bio->bi_iter.bi_size = 0;
1116         bio->bi_bdev = stripe->dev->bdev;
1117         bio->bi_iter.bi_sector = disk_start >> 9;
1118         set_bit(BIO_UPTODATE, &bio->bi_flags);
1119
1120         bio_add_page(bio, page, PAGE_CACHE_SIZE, 0);
1121         bio_list_add(bio_list, bio);
1122         return 0;
1123 }
1124
1125 /*
1126  * while we're doing the read/modify/write cycle, we could
1127  * have errors in reading pages off the disk.  This checks
1128  * for errors and if we're not able to read the page it'll
1129  * trigger parity reconstruction.  The rmw will be finished
1130  * after we've reconstructed the failed stripes
1131  */
1132 static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio)
1133 {
1134         if (rbio->faila >= 0 || rbio->failb >= 0) {
1135                 BUG_ON(rbio->faila == rbio->real_stripes - 1);
1136                 __raid56_parity_recover(rbio);
1137         } else {
1138                 finish_rmw(rbio);
1139         }
1140 }
1141
1142 /*
1143  * these are just the pages from the rbio array, not from anything
1144  * the FS sent down to us
1145  */
1146 static struct page *rbio_stripe_page(struct btrfs_raid_bio *rbio, int stripe, int page)
1147 {
1148         int index;
1149         index = stripe * (rbio->stripe_len >> PAGE_CACHE_SHIFT);
1150         index += page;
1151         return rbio->stripe_pages[index];
1152 }
1153
1154 /*
1155  * helper function to walk our bio list and populate the bio_pages array with
1156  * the result.  This seems expensive, but it is faster than constantly
1157  * searching through the bio list as we setup the IO in finish_rmw or stripe
1158  * reconstruction.
1159  *
1160  * This must be called before you trust the answers from page_in_rbio
1161  */
1162 static void index_rbio_pages(struct btrfs_raid_bio *rbio)
1163 {
1164         struct bio *bio;
1165         u64 start;
1166         unsigned long stripe_offset;
1167         unsigned long page_index;
1168         struct page *p;
1169         int i;
1170
1171         spin_lock_irq(&rbio->bio_list_lock);
1172         bio_list_for_each(bio, &rbio->bio_list) {
1173                 start = (u64)bio->bi_iter.bi_sector << 9;
1174                 stripe_offset = start - rbio->bbio->raid_map[0];
1175                 page_index = stripe_offset >> PAGE_CACHE_SHIFT;
1176
1177                 for (i = 0; i < bio->bi_vcnt; i++) {
1178                         p = bio->bi_io_vec[i].bv_page;
1179                         rbio->bio_pages[page_index + i] = p;
1180                 }
1181         }
1182         spin_unlock_irq(&rbio->bio_list_lock);
1183 }
1184
1185 /*
1186  * this is called from one of two situations.  We either
1187  * have a full stripe from the higher layers, or we've read all
1188  * the missing bits off disk.
1189  *
1190  * This will calculate the parity and then send down any
1191  * changed blocks.
1192  */
1193 static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
1194 {
1195         struct btrfs_bio *bbio = rbio->bbio;
1196         void *pointers[rbio->real_stripes];
1197         int stripe_len = rbio->stripe_len;
1198         int nr_data = rbio->nr_data;
1199         int stripe;
1200         int pagenr;
1201         int p_stripe = -1;
1202         int q_stripe = -1;
1203         struct bio_list bio_list;
1204         struct bio *bio;
1205         int pages_per_stripe = stripe_len >> PAGE_CACHE_SHIFT;
1206         int ret;
1207
1208         bio_list_init(&bio_list);
1209
1210         if (rbio->real_stripes - rbio->nr_data == 1) {
1211                 p_stripe = rbio->real_stripes - 1;
1212         } else if (rbio->real_stripes - rbio->nr_data == 2) {
1213                 p_stripe = rbio->real_stripes - 2;
1214                 q_stripe = rbio->real_stripes - 1;
1215         } else {
1216                 BUG();
1217         }
1218
1219         /* at this point we either have a full stripe,
1220          * or we've read the full stripe from the drive.
1221          * recalculate the parity and write the new results.
1222          *
1223          * We're not allowed to add any new bios to the
1224          * bio list here, anyone else that wants to
1225          * change this stripe needs to do their own rmw.
1226          */
1227         spin_lock_irq(&rbio->bio_list_lock);
1228         set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
1229         spin_unlock_irq(&rbio->bio_list_lock);
1230
1231         atomic_set(&rbio->error, 0);
1232
1233         /*
1234          * now that we've set rmw_locked, run through the
1235          * bio list one last time and map the page pointers
1236          *
1237          * We don't cache full rbios because we're assuming
1238          * the higher layers are unlikely to use this area of
1239          * the disk again soon.  If they do use it again,
1240          * hopefully they will send another full bio.
1241          */
1242         index_rbio_pages(rbio);
1243         if (!rbio_is_full(rbio))
1244                 cache_rbio_pages(rbio);
1245         else
1246                 clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
1247
1248         for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) {
1249                 struct page *p;
1250                 /* first collect one page from each data stripe */
1251                 for (stripe = 0; stripe < nr_data; stripe++) {
1252                         p = page_in_rbio(rbio, stripe, pagenr, 0);
1253                         pointers[stripe] = kmap(p);
1254                 }
1255
1256                 /* then add the parity stripe */
1257                 p = rbio_pstripe_page(rbio, pagenr);
1258                 SetPageUptodate(p);
1259                 pointers[stripe++] = kmap(p);
1260
1261                 if (q_stripe != -1) {
1262
1263                         /*
1264                          * raid6, add the qstripe and call the
1265                          * library function to fill in our p/q
1266                          */
1267                         p = rbio_qstripe_page(rbio, pagenr);
1268                         SetPageUptodate(p);
1269                         pointers[stripe++] = kmap(p);
1270
1271                         raid6_call.gen_syndrome(rbio->real_stripes, PAGE_SIZE,
1272                                                 pointers);
1273                 } else {
1274                         /* raid5 */
1275                         memcpy(pointers[nr_data], pointers[0], PAGE_SIZE);
1276                         run_xor(pointers + 1, nr_data - 1, PAGE_CACHE_SIZE);
1277                 }
1278
1279
1280                 for (stripe = 0; stripe < rbio->real_stripes; stripe++)
1281                         kunmap(page_in_rbio(rbio, stripe, pagenr, 0));
1282         }
1283
1284         /*
1285          * time to start writing.  Make bios for everything from the
1286          * higher layers (the bio_list in our rbio) and our p/q.  Ignore
1287          * everything else.
1288          */
1289         for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
1290                 for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) {
1291                         struct page *page;
1292                         if (stripe < rbio->nr_data) {
1293                                 page = page_in_rbio(rbio, stripe, pagenr, 1);
1294                                 if (!page)
1295                                         continue;
1296                         } else {
1297                                page = rbio_stripe_page(rbio, stripe, pagenr);
1298                         }
1299
1300                         ret = rbio_add_io_page(rbio, &bio_list,
1301                                        page, stripe, pagenr, rbio->stripe_len);
1302                         if (ret)
1303                                 goto cleanup;
1304                 }
1305         }
1306
1307         if (likely(!bbio->num_tgtdevs))
1308                 goto write_data;
1309
1310         for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
1311                 if (!bbio->tgtdev_map[stripe])
1312                         continue;
1313
1314                 for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) {
1315                         struct page *page;
1316                         if (stripe < rbio->nr_data) {
1317                                 page = page_in_rbio(rbio, stripe, pagenr, 1);
1318                                 if (!page)
1319                                         continue;
1320                         } else {
1321                                page = rbio_stripe_page(rbio, stripe, pagenr);
1322                         }
1323
1324                         ret = rbio_add_io_page(rbio, &bio_list, page,
1325                                                rbio->bbio->tgtdev_map[stripe],
1326                                                pagenr, rbio->stripe_len);
1327                         if (ret)
1328                                 goto cleanup;
1329                 }
1330         }
1331
1332 write_data:
1333         atomic_set(&rbio->stripes_pending, bio_list_size(&bio_list));
1334         BUG_ON(atomic_read(&rbio->stripes_pending) == 0);
1335
1336         while (1) {
1337                 bio = bio_list_pop(&bio_list);
1338                 if (!bio)
1339                         break;
1340
1341                 bio->bi_private = rbio;
1342                 bio->bi_end_io = raid_write_end_io;
1343                 BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
1344                 submit_bio(WRITE, bio);
1345         }
1346         return;
1347
1348 cleanup:
1349         rbio_orig_end_io(rbio, -EIO, 0);
1350 }
1351
1352 /*
1353  * helper to find the stripe number for a given bio.  Used to figure out which
1354  * stripe has failed.  This expects the bio to correspond to a physical disk,
1355  * so it looks up based on physical sector numbers.
1356  */
1357 static int find_bio_stripe(struct btrfs_raid_bio *rbio,
1358                            struct bio *bio)
1359 {
1360         u64 physical = bio->bi_iter.bi_sector;
1361         u64 stripe_start;
1362         int i;
1363         struct btrfs_bio_stripe *stripe;
1364
1365         physical <<= 9;
1366
1367         for (i = 0; i < rbio->bbio->num_stripes; i++) {
1368                 stripe = &rbio->bbio->stripes[i];
1369                 stripe_start = stripe->physical;
1370                 if (physical >= stripe_start &&
1371                     physical < stripe_start + rbio->stripe_len &&
1372                     bio->bi_bdev == stripe->dev->bdev) {
1373                         return i;
1374                 }
1375         }
1376         return -1;
1377 }
1378
1379 /*
1380  * helper to find the stripe number for a given
1381  * bio (before mapping).  Used to figure out which stripe has
1382  * failed.  This looks up based on logical block numbers.
1383  */
1384 static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio,
1385                                    struct bio *bio)
1386 {
1387         u64 logical = bio->bi_iter.bi_sector;
1388         u64 stripe_start;
1389         int i;
1390
1391         logical <<= 9;
1392
1393         for (i = 0; i < rbio->nr_data; i++) {
1394                 stripe_start = rbio->bbio->raid_map[i];
1395                 if (logical >= stripe_start &&
1396                     logical < stripe_start + rbio->stripe_len) {
1397                         return i;
1398                 }
1399         }
1400         return -1;
1401 }
1402
1403 /*
1404  * returns -EIO if we had too many failures
1405  */
1406 static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed)
1407 {
1408         unsigned long flags;
1409         int ret = 0;
1410
1411         spin_lock_irqsave(&rbio->bio_list_lock, flags);
1412
1413         /* we already know this stripe is bad, move on */
1414         if (rbio->faila == failed || rbio->failb == failed)
1415                 goto out;
1416
1417         if (rbio->faila == -1) {
1418                 /* first failure on this rbio */
1419                 rbio->faila = failed;
1420                 atomic_inc(&rbio->error);
1421         } else if (rbio->failb == -1) {
1422                 /* second failure on this rbio */
1423                 rbio->failb = failed;
1424                 atomic_inc(&rbio->error);
1425         } else {
1426                 ret = -EIO;
1427         }
1428 out:
1429         spin_unlock_irqrestore(&rbio->bio_list_lock, flags);
1430
1431         return ret;
1432 }
1433
1434 /*
1435  * helper to fail a stripe based on a physical disk
1436  * bio.
1437  */
1438 static int fail_bio_stripe(struct btrfs_raid_bio *rbio,
1439                            struct bio *bio)
1440 {
1441         int failed = find_bio_stripe(rbio, bio);
1442
1443         if (failed < 0)
1444                 return -EIO;
1445
1446         return fail_rbio_index(rbio, failed);
1447 }
1448
1449 /*
1450  * this sets each page in the bio uptodate.  It should only be used on private
1451  * rbio pages, nothing that comes in from the higher layers
1452  */
1453 static void set_bio_pages_uptodate(struct bio *bio)
1454 {
1455         int i;
1456         struct page *p;
1457
1458         for (i = 0; i < bio->bi_vcnt; i++) {
1459                 p = bio->bi_io_vec[i].bv_page;
1460                 SetPageUptodate(p);
1461         }
1462 }
1463
1464 /*
1465  * end io for the read phase of the rmw cycle.  All the bios here are physical
1466  * stripe bios we've read from the disk so we can recalculate the parity of the
1467  * stripe.
1468  *
1469  * This will usually kick off finish_rmw once all the bios are read in, but it
1470  * may trigger parity reconstruction if we had any errors along the way
1471  */
1472 static void raid_rmw_end_io(struct bio *bio, int err)
1473 {
1474         struct btrfs_raid_bio *rbio = bio->bi_private;
1475
1476         if (err)
1477                 fail_bio_stripe(rbio, bio);
1478         else
1479                 set_bio_pages_uptodate(bio);
1480
1481         bio_put(bio);
1482
1483         if (!atomic_dec_and_test(&rbio->stripes_pending))
1484                 return;
1485
1486         err = 0;
1487         if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
1488                 goto cleanup;
1489
1490         /*
1491          * this will normally call finish_rmw to start our write
1492          * but if there are any failed stripes we'll reconstruct
1493          * from parity first
1494          */
1495         validate_rbio_for_rmw(rbio);
1496         return;
1497
1498 cleanup:
1499
1500         rbio_orig_end_io(rbio, -EIO, 0);
1501 }
1502
1503 static void async_rmw_stripe(struct btrfs_raid_bio *rbio)
1504 {
1505         btrfs_init_work(&rbio->work, btrfs_rmw_helper,
1506                         rmw_work, NULL, NULL);
1507
1508         btrfs_queue_work(rbio->fs_info->rmw_workers,
1509                          &rbio->work);
1510 }
1511
1512 static void async_read_rebuild(struct btrfs_raid_bio *rbio)
1513 {
1514         btrfs_init_work(&rbio->work, btrfs_rmw_helper,
1515                         read_rebuild_work, NULL, NULL);
1516
1517         btrfs_queue_work(rbio->fs_info->rmw_workers,
1518                          &rbio->work);
1519 }
1520
1521 /*
1522  * the stripe must be locked by the caller.  It will
1523  * unlock after all the writes are done
1524  */
1525 static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
1526 {
1527         int bios_to_read = 0;
1528         struct bio_list bio_list;
1529         int ret;
1530         int nr_pages = DIV_ROUND_UP(rbio->stripe_len, PAGE_CACHE_SIZE);
1531         int pagenr;
1532         int stripe;
1533         struct bio *bio;
1534
1535         bio_list_init(&bio_list);
1536
1537         ret = alloc_rbio_pages(rbio);
1538         if (ret)
1539                 goto cleanup;
1540
1541         index_rbio_pages(rbio);
1542
1543         atomic_set(&rbio->error, 0);
1544         /*
1545          * build a list of bios to read all the missing parts of this
1546          * stripe
1547          */
1548         for (stripe = 0; stripe < rbio->nr_data; stripe++) {
1549                 for (pagenr = 0; pagenr < nr_pages; pagenr++) {
1550                         struct page *page;
1551                         /*
1552                          * we want to find all the pages missing from
1553                          * the rbio and read them from the disk.  If
1554                          * page_in_rbio finds a page in the bio list
1555                          * we don't need to read it off the stripe.
1556                          */
1557                         page = page_in_rbio(rbio, stripe, pagenr, 1);
1558                         if (page)
1559                                 continue;
1560
1561                         page = rbio_stripe_page(rbio, stripe, pagenr);
1562                         /*
1563                          * the bio cache may have handed us an uptodate
1564                          * page.  If so, be happy and use it
1565                          */
1566                         if (PageUptodate(page))
1567                                 continue;
1568
1569                         ret = rbio_add_io_page(rbio, &bio_list, page,
1570                                        stripe, pagenr, rbio->stripe_len);
1571                         if (ret)
1572                                 goto cleanup;
1573                 }
1574         }
1575
1576         bios_to_read = bio_list_size(&bio_list);
1577         if (!bios_to_read) {
1578                 /*
1579                  * this can happen if others have merged with
1580                  * us, it means there is nothing left to read.
1581                  * But if there are missing devices it may not be
1582                  * safe to do the full stripe write yet.
1583                  */
1584                 goto finish;
1585         }
1586
1587         /*
1588          * the bbio may be freed once we submit the last bio.  Make sure
1589          * not to touch it after that
1590          */
1591         atomic_set(&rbio->stripes_pending, bios_to_read);
1592         while (1) {
1593                 bio = bio_list_pop(&bio_list);
1594                 if (!bio)
1595                         break;
1596
1597                 bio->bi_private = rbio;
1598                 bio->bi_end_io = raid_rmw_end_io;
1599
1600                 btrfs_bio_wq_end_io(rbio->fs_info, bio,
1601                                     BTRFS_WQ_ENDIO_RAID56);
1602
1603                 BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
1604                 submit_bio(READ, bio);
1605         }
1606         /* the actual write will happen once the reads are done */
1607         return 0;
1608
1609 cleanup:
1610         rbio_orig_end_io(rbio, -EIO, 0);
1611         return -EIO;
1612
1613 finish:
1614         validate_rbio_for_rmw(rbio);
1615         return 0;
1616 }
1617
/*
 * when the upper layers hand us a full stripe, only the parity pages
 * need allocating and everything can be sent down right away.
 *
 * Returns the allocation error (after dropping our rbio reference) if
 * the parity pages cannot be allocated, otherwise 0.  finish_rmw only
 * runs when lock_stripe_add returns 0; any other return from it is
 * not reported as an error here.
 */
static int full_stripe_write(struct btrfs_raid_bio *rbio)
{
        int ret = alloc_rbio_parity_pages(rbio);

        if (ret) {
                __free_raid_bio(rbio);
                return ret;
        }

        if (lock_stripe_add(rbio) == 0)
                finish_rmw(rbio);

        return 0;
}
1637
/*
 * partial stripe writes are pushed to the async helpers, in the hope
 * that a few more writes get merged into this rbio before the new
 * parity is calculated.  The rmw is only queued when lock_stripe_add
 * returns 0.  Always returns 0.
 */
static int partial_stripe_write(struct btrfs_raid_bio *rbio)
{
        if (lock_stripe_add(rbio) == 0)
                async_rmw_stripe(rbio);

        return 0;
}
1652
/*
 * while we were reading from the drive to recalculate parity, enough
 * new bios may have come in to make up a full stripe.  Check for that
 * here so a full rbio can take the full stripe path, which goes
 * directly to finish_rmw.
 */
static int __raid56_parity_write(struct btrfs_raid_bio *rbio)
{
        if (rbio_is_full(rbio))
                return full_stripe_write(rbio);

        /* head off into rmw land if we don't have a full stripe */
        return partial_stripe_write(rbio);
}
1666
/*
 * We use plugging callbacks to collect full stripes.
 * Any time we get a partial stripe write while plugged
 * we collect it into a list.  When the unplug comes down,
 * we sort the list by logical block number and merge
 * everything we can into the same rbios.
 */
struct btrfs_plug_cb {
        /* embedded block layer plug callback; btrfs_raid_unplug uses container_of on it */
        struct blk_plug_cb cb;
        /* fs_info, used to reach the rmw_workers queue on unplug */
        struct btrfs_fs_info *info;
        /* rbios collected while plugged, linked through their plug_list */
        struct list_head rbio_list;
        /* work item for running the plug from a helper thread (see unplug_work) */
        struct btrfs_work work;
};
1680
1681 /*
1682  * rbios on the plug list are sorted for easier merging.
1683  */
1684 static int plug_cmp(void *priv, struct list_head *a, struct list_head *b)
1685 {
1686         struct btrfs_raid_bio *ra = container_of(a, struct btrfs_raid_bio,
1687                                                  plug_list);
1688         struct btrfs_raid_bio *rb = container_of(b, struct btrfs_raid_bio,
1689                                                  plug_list);
1690         u64 a_sector = ra->bio_list.head->bi_iter.bi_sector;
1691         u64 b_sector = rb->bio_list.head->bi_iter.bi_sector;
1692
1693         if (a_sector < b_sector)
1694                 return -1;
1695         if (a_sector > b_sector)
1696                 return 1;
1697         return 0;
1698 }
1699
1700 static void run_plug(struct btrfs_plug_cb *plug)
1701 {
1702         struct btrfs_raid_bio *cur;
1703         struct btrfs_raid_bio *last = NULL;
1704
1705         /*
1706          * sort our plug list then try to merge
1707          * everything we can in hopes of creating full
1708          * stripes.
1709          */
1710         list_sort(NULL, &plug->rbio_list, plug_cmp);
1711         while (!list_empty(&plug->rbio_list)) {
1712                 cur = list_entry(plug->rbio_list.next,
1713                                  struct btrfs_raid_bio, plug_list);
1714                 list_del_init(&cur->plug_list);
1715
1716                 if (rbio_is_full(cur)) {
1717                         /* we have a full stripe, send it down */
1718                         full_stripe_write(cur);
1719                         continue;
1720                 }
1721                 if (last) {
1722                         if (rbio_can_merge(last, cur)) {
1723                                 merge_rbio(last, cur);
1724                                 __free_raid_bio(cur);
1725                                 continue;
1726
1727                         }
1728                         __raid56_parity_write(last);
1729                 }
1730                 last = cur;
1731         }
1732         if (last) {
1733                 __raid56_parity_write(last);
1734         }
1735         kfree(plug);
1736 }
1737
1738 /*
1739  * if the unplug comes from schedule, we have to push the
1740  * work off to a helper thread
1741  */
1742 static void unplug_work(struct btrfs_work *work)
1743 {
1744         struct btrfs_plug_cb *plug;
1745         plug = container_of(work, struct btrfs_plug_cb, work);
1746         run_plug(plug);
1747 }
1748
1749 static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule)
1750 {
1751         struct btrfs_plug_cb *plug;
1752         plug = container_of(cb, struct btrfs_plug_cb, cb);
1753
1754         if (from_schedule) {
1755                 btrfs_init_work(&plug->work, btrfs_rmw_helper,
1756                                 unplug_work, NULL, NULL);
1757                 btrfs_queue_work(plug->info->rmw_workers,
1758                                  &plug->work);
1759                 return;
1760         }
1761         run_plug(plug);
1762 }
1763
1764 /*
1765  * our main entry point for writes from the rest of the FS.
1766  */
1767 int raid56_parity_write(struct btrfs_root *root, struct bio *bio,
1768                         struct btrfs_bio *bbio, u64 stripe_len)
1769 {
1770         struct btrfs_raid_bio *rbio;
1771         struct btrfs_plug_cb *plug = NULL;
1772         struct blk_plug_cb *cb;
1773         int ret;
1774
1775         rbio = alloc_rbio(root, bbio, stripe_len);
1776         if (IS_ERR(rbio)) {
1777                 __free_bbio(bbio, 1);
1778                 return PTR_ERR(rbio);
1779         }
1780         bio_list_add(&rbio->bio_list, bio);
1781         rbio->bio_list_bytes = bio->bi_iter.bi_size;
1782         rbio->operation = BTRFS_RBIO_WRITE;
1783
1784         btrfs_bio_counter_inc_noblocked(root->fs_info);
1785         rbio->generic_bio_cnt = 1;
1786
1787         /*
1788          * don't plug on full rbios, just get them out the door
1789          * as quickly as we can
1790          */
1791         if (rbio_is_full(rbio)) {
1792                 ret = full_stripe_write(rbio);
1793                 if (ret)
1794                         btrfs_bio_counter_dec(root->fs_info);
1795                 return ret;
1796         }
1797
1798         cb = blk_check_plugged(btrfs_raid_unplug, root->fs_info,
1799                                sizeof(*plug));
1800         if (cb) {
1801                 plug = container_of(cb, struct btrfs_plug_cb, cb);
1802                 if (!plug->info) {
1803                         plug->info = root->fs_info;
1804                         INIT_LIST_HEAD(&plug->rbio_list);
1805                 }
1806                 list_add_tail(&rbio->plug_list, &plug->rbio_list);
1807                 ret = 0;
1808         } else {
1809                 ret = __raid56_parity_write(rbio);
1810                 if (ret)
1811                         btrfs_bio_counter_dec(root->fs_info);
1812         }
1813         return ret;
1814 }
1815
1816 /*
1817  * all parity reconstruction happens here.  We've read in everything
1818  * we can find from the drives and this does the heavy lifting of
1819  * sorting the good from the bad.
1820  */
1821 static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
1822 {
1823         int pagenr, stripe;
1824         void **pointers;
1825         int faila = -1, failb = -1;
1826         int nr_pages = DIV_ROUND_UP(rbio->stripe_len, PAGE_CACHE_SIZE);
1827         struct page *page;
1828         int err;
1829         int i;
1830
1831         pointers = kzalloc(rbio->real_stripes * sizeof(void *),
1832                            GFP_NOFS);
1833         if (!pointers) {
1834                 err = -ENOMEM;
1835                 goto cleanup_io;
1836         }
1837
1838         faila = rbio->faila;
1839         failb = rbio->failb;
1840
1841         if (rbio->operation == BTRFS_RBIO_READ_REBUILD) {
1842                 spin_lock_irq(&rbio->bio_list_lock);
1843                 set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
1844                 spin_unlock_irq(&rbio->bio_list_lock);
1845         }
1846
1847         index_rbio_pages(rbio);
1848
1849         for (pagenr = 0; pagenr < nr_pages; pagenr++) {
1850                 /*
1851                  * Now we just use bitmap to mark the horizontal stripes in
1852                  * which we have data when doing parity scrub.
1853                  */
1854                 if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB &&
1855                     !test_bit(pagenr, rbio->dbitmap))
1856                         continue;
1857
1858                 /* setup our array of pointers with pages
1859                  * from each stripe
1860                  */
1861                 for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
1862                         /*
1863                          * if we're rebuilding a read, we have to use
1864                          * pages from the bio list
1865                          */
1866                         if (rbio->operation == BTRFS_RBIO_READ_REBUILD &&
1867                             (stripe == faila || stripe == failb)) {
1868                                 page = page_in_rbio(rbio, stripe, pagenr, 0);
1869                         } else {
1870                                 page = rbio_stripe_page(rbio, stripe, pagenr);
1871                         }
1872                         pointers[stripe] = kmap(page);
1873                 }
1874
1875                 /* all raid6 handling here */
1876                 if (rbio->bbio->raid_map[rbio->real_stripes - 1] ==
1877                     RAID6_Q_STRIPE) {
1878
1879                         /*
1880                          * single failure, rebuild from parity raid5
1881                          * style
1882                          */
1883                         if (failb < 0) {
1884                                 if (faila == rbio->nr_data) {
1885                                         /*
1886                                          * Just the P stripe has failed, without
1887                                          * a bad data or Q stripe.
1888                                          * TODO, we should redo the xor here.
1889                                          */
1890                                         err = -EIO;
1891                                         goto cleanup;
1892                                 }
1893                                 /*
1894                                  * a single failure in raid6 is rebuilt
1895                                  * in the pstripe code below
1896                                  */
1897                                 goto pstripe;
1898                         }
1899
1900                         /* make sure our ps and qs are in order */
1901                         if (faila > failb) {
1902                                 int tmp = failb;
1903                                 failb = faila;
1904                                 faila = tmp;
1905                         }
1906
1907                         /* if the q stripe is failed, do a pstripe reconstruction
1908                          * from the xors.
1909                          * If both the q stripe and the P stripe are failed, we're
1910                          * here due to a crc mismatch and we can't give them the
1911                          * data they want
1912                          */
1913                         if (rbio->bbio->raid_map[failb] == RAID6_Q_STRIPE) {
1914                                 if (rbio->bbio->raid_map[faila] ==
1915                                     RAID5_P_STRIPE) {
1916                                         err = -EIO;
1917                                         goto cleanup;
1918                                 }
1919                                 /*
1920                                  * otherwise we have one bad data stripe and
1921                                  * a good P stripe.  raid5!
1922                                  */
1923                                 goto pstripe;
1924                         }
1925
1926                         if (rbio->bbio->raid_map[failb] == RAID5_P_STRIPE) {
1927                                 raid6_datap_recov(rbio->real_stripes,
1928                                                   PAGE_SIZE, faila, pointers);
1929                         } else {
1930                                 raid6_2data_recov(rbio->real_stripes,
1931                                                   PAGE_SIZE, faila, failb,
1932                                                   pointers);
1933                         }
1934                 } else {
1935                         void *p;
1936
1937                         /* rebuild from P stripe here (raid5 or raid6) */
1938                         BUG_ON(failb != -1);
1939 pstripe:
1940                         /* Copy parity block into failed block to start with */
1941                         memcpy(pointers[faila],
1942                                pointers[rbio->nr_data],
1943                                PAGE_CACHE_SIZE);
1944
1945                         /* rearrange the pointer array */
1946                         p = pointers[faila];
1947                         for (stripe = faila; stripe < rbio->nr_data - 1; stripe++)
1948                                 pointers[stripe] = pointers[stripe + 1];
1949                         pointers[rbio->nr_data - 1] = p;
1950
1951                         /* xor in the rest */
1952                         run_xor(pointers, rbio->nr_data - 1, PAGE_CACHE_SIZE);
1953                 }
1954                 /* if we're doing this rebuild as part of an rmw, go through
1955                  * and set all of our private rbio pages in the
1956                  * failed stripes as uptodate.  This way finish_rmw will
1957                  * know they can be trusted.  If this was a read reconstruction,
1958                  * other endio functions will fiddle the uptodate bits
1959                  */
1960                 if (rbio->operation == BTRFS_RBIO_WRITE) {
1961                         for (i = 0;  i < nr_pages; i++) {
1962                                 if (faila != -1) {
1963                                         page = rbio_stripe_page(rbio, faila, i);
1964                                         SetPageUptodate(page);
1965                                 }
1966                                 if (failb != -1) {
1967                                         page = rbio_stripe_page(rbio, failb, i);
1968                                         SetPageUptodate(page);
1969                                 }
1970                         }
1971                 }
1972                 for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
1973                         /*
1974                          * if we're rebuilding a read, we have to use
1975                          * pages from the bio list
1976                          */
1977                         if (rbio->operation == BTRFS_RBIO_READ_REBUILD &&
1978                             (stripe == faila || stripe == failb)) {
1979                                 page = page_in_rbio(rbio, stripe, pagenr, 0);
1980                         } else {
1981                                 page = rbio_stripe_page(rbio, stripe, pagenr);
1982                         }
1983                         kunmap(page);
1984                 }
1985         }
1986
1987         err = 0;
1988 cleanup:
1989         kfree(pointers);
1990
1991 cleanup_io:
1992         if (rbio->operation == BTRFS_RBIO_READ_REBUILD) {
1993                 if (err == 0 &&
1994                     !test_bit(RBIO_HOLD_BBIO_MAP_BIT, &rbio->flags))
1995                         cache_rbio_pages(rbio);
1996                 else
1997                         clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
1998
1999                 rbio_orig_end_io(rbio, err, err == 0);
2000         } else if (err == 0) {
2001                 rbio->faila = -1;
2002                 rbio->failb = -1;
2003
2004                 if (rbio->operation == BTRFS_RBIO_WRITE)
2005                         finish_rmw(rbio);
2006                 else if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB)
2007                         finish_parity_scrub(rbio, 0);
2008                 else
2009                         BUG();
2010         } else {
2011                 rbio_orig_end_io(rbio, err, 0);
2012         }
2013 }
2014
2015 /*
2016  * This is called only for stripes we've read from disk to
2017  * reconstruct the parity.
2018  */
2019 static void raid_recover_end_io(struct bio *bio, int err)
2020 {
2021         struct btrfs_raid_bio *rbio = bio->bi_private;
2022
2023         /*
2024          * we only read stripe pages off the disk, set them
2025          * up to date if there were no errors
2026          */
2027         if (err)
2028                 fail_bio_stripe(rbio, bio);
2029         else
2030                 set_bio_pages_uptodate(bio);
2031         bio_put(bio);
2032
2033         if (!atomic_dec_and_test(&rbio->stripes_pending))
2034                 return;
2035
2036         if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
2037                 rbio_orig_end_io(rbio, -EIO, 0);
2038         else
2039                 __raid_recover_end_io(rbio);
2040 }
2041
2042 /*
2043  * reads everything we need off the disk to reconstruct
2044  * the parity. endio handlers trigger final reconstruction
2045  * when the IO is done.
2046  *
2047  * This is used both for reads from the higher layers and for
2048  * parity construction required to finish a rmw cycle.
2049  */
2050 static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
2051 {
2052         int bios_to_read = 0;
2053         struct bio_list bio_list;
2054         int ret;
2055         int nr_pages = DIV_ROUND_UP(rbio->stripe_len, PAGE_CACHE_SIZE);
2056         int pagenr;
2057         int stripe;
2058         struct bio *bio;
2059
2060         bio_list_init(&bio_list);
2061
2062         ret = alloc_rbio_pages(rbio);
2063         if (ret)
2064                 goto cleanup;
2065
2066         atomic_set(&rbio->error, 0);
2067
2068         /*
2069          * read everything that hasn't failed.  Thanks to the
2070          * stripe cache, it is possible that some or all of these
2071          * pages are going to be uptodate.
2072          */
2073         for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
2074                 if (rbio->faila == stripe || rbio->failb == stripe) {
2075                         atomic_inc(&rbio->error);
2076                         continue;
2077                 }
2078
2079                 for (pagenr = 0; pagenr < nr_pages; pagenr++) {
2080                         struct page *p;
2081
2082                         /*
2083                          * the rmw code may have already read this
2084                          * page in
2085                          */
2086                         p = rbio_stripe_page(rbio, stripe, pagenr);
2087                         if (PageUptodate(p))
2088                                 continue;
2089
2090                         ret = rbio_add_io_page(rbio, &bio_list,
2091                                        rbio_stripe_page(rbio, stripe, pagenr),
2092                                        stripe, pagenr, rbio->stripe_len);
2093                         if (ret < 0)
2094                                 goto cleanup;
2095                 }
2096         }
2097
2098         bios_to_read = bio_list_size(&bio_list);
2099         if (!bios_to_read) {
2100                 /*
2101                  * we might have no bios to read just because the pages
2102                  * were up to date, or we might have no bios to read because
2103                  * the devices were gone.
2104                  */
2105                 if (atomic_read(&rbio->error) <= rbio->bbio->max_errors) {
2106                         __raid_recover_end_io(rbio);
2107                         goto out;
2108                 } else {
2109                         goto cleanup;
2110                 }
2111         }
2112
2113         /*
2114          * the bbio may be freed once we submit the last bio.  Make sure
2115          * not to touch it after that
2116          */
2117         atomic_set(&rbio->stripes_pending, bios_to_read);
2118         while (1) {
2119                 bio = bio_list_pop(&bio_list);
2120                 if (!bio)
2121                         break;
2122
2123                 bio->bi_private = rbio;
2124                 bio->bi_end_io = raid_recover_end_io;
2125
2126                 btrfs_bio_wq_end_io(rbio->fs_info, bio,
2127                                     BTRFS_WQ_ENDIO_RAID56);
2128
2129                 BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
2130                 submit_bio(READ, bio);
2131         }
2132 out:
2133         return 0;
2134
2135 cleanup:
2136         if (rbio->operation == BTRFS_RBIO_READ_REBUILD)
2137                 rbio_orig_end_io(rbio, -EIO, 0);
2138         return -EIO;
2139 }
2140
2141 /*
2142  * the main entry point for reads from the higher layers.  This
2143  * is really only called when the normal read path had a failure,
2144  * so we assume the bio they send down corresponds to a failed part
2145  * of the drive.
2146  */
2147 int raid56_parity_recover(struct btrfs_root *root, struct bio *bio,
2148                           struct btrfs_bio *bbio, u64 stripe_len,
2149                           int mirror_num, int generic_io)
2150 {
2151         struct btrfs_raid_bio *rbio;
2152         int ret;
2153
2154         rbio = alloc_rbio(root, bbio, stripe_len);
2155         if (IS_ERR(rbio)) {
2156                 __free_bbio(bbio, generic_io);
2157                 return PTR_ERR(rbio);
2158         }
2159
2160         rbio->operation = BTRFS_RBIO_READ_REBUILD;
2161         bio_list_add(&rbio->bio_list, bio);
2162         rbio->bio_list_bytes = bio->bi_iter.bi_size;
2163
2164         rbio->faila = find_logical_bio_stripe(rbio, bio);
2165         if (rbio->faila == -1) {
2166                 BUG();
2167                 __free_bbio(bbio, generic_io);
2168                 kfree(rbio);
2169                 return -EIO;
2170         }
2171
2172         if (generic_io) {
2173                 btrfs_bio_counter_inc_noblocked(root->fs_info);
2174                 rbio->generic_bio_cnt = 1;
2175         } else {
2176                 set_bit(RBIO_HOLD_BBIO_MAP_BIT, &rbio->flags);
2177         }
2178
2179         /*
2180          * reconstruct from the q stripe if they are
2181          * asking for mirror 3
2182          */
2183         if (mirror_num == 3)
2184                 rbio->failb = rbio->real_stripes - 2;
2185
2186         ret = lock_stripe_add(rbio);
2187
2188         /*
2189          * __raid56_parity_recover will end the bio with
2190          * any errors it hits.  We don't want to return
2191          * its error value up the stack because our caller
2192          * will end up calling bio_endio with any nonzero
2193          * return
2194          */
2195         if (ret == 0)
2196                 __raid56_parity_recover(rbio);
2197         /*
2198          * our rbio has been added to the list of
2199          * rbios that will be handled after the
2200          * currently lock owner is done
2201          */
2202         return 0;
2203
2204 }
2205
2206 static void rmw_work(struct btrfs_work *work)
2207 {
2208         struct btrfs_raid_bio *rbio;
2209
2210         rbio = container_of(work, struct btrfs_raid_bio, work);
2211         raid56_rmw_stripe(rbio);
2212 }
2213
2214 static void read_rebuild_work(struct btrfs_work *work)
2215 {
2216         struct btrfs_raid_bio *rbio;
2217
2218         rbio = container_of(work, struct btrfs_raid_bio, work);
2219         __raid56_parity_recover(rbio);
2220 }
2221
/*
 * The following code is used to scrub/replace the parity stripe.
 *
 * Note: we need to make sure that all the pages added to the scrub/replace
 * raid bio are correct and are not changed during the scrub/replace.  That
 * is, those pages only hold metadata or file data with checksums.
 */
2229
2230 struct btrfs_raid_bio *
2231 raid56_parity_alloc_scrub_rbio(struct btrfs_root *root, struct bio *bio,
2232                                struct btrfs_bio *bbio, u64 stripe_len,
2233                                struct btrfs_device *scrub_dev,
2234                                unsigned long *dbitmap, int stripe_nsectors)
2235 {
2236         struct btrfs_raid_bio *rbio;
2237         int i;
2238
2239         rbio = alloc_rbio(root, bbio, stripe_len);
2240         if (IS_ERR(rbio))
2241                 return NULL;
2242         bio_list_add(&rbio->bio_list, bio);
2243         /*
2244          * This is a special bio which is used to hold the completion handler
2245          * and make the scrub rbio is similar to the other types
2246          */
2247         ASSERT(!bio->bi_iter.bi_size);
2248         rbio->operation = BTRFS_RBIO_PARITY_SCRUB;
2249
2250         for (i = 0; i < rbio->real_stripes; i++) {
2251                 if (bbio->stripes[i].dev == scrub_dev) {
2252                         rbio->scrubp = i;
2253                         break;
2254                 }
2255         }
2256
2257         /* Now we just support the sectorsize equals to page size */
2258         ASSERT(root->sectorsize == PAGE_SIZE);
2259         ASSERT(rbio->stripe_npages == stripe_nsectors);
2260         bitmap_copy(rbio->dbitmap, dbitmap, stripe_nsectors);
2261
2262         return rbio;
2263 }
2264
2265 void raid56_parity_add_scrub_pages(struct btrfs_raid_bio *rbio,
2266                                    struct page *page, u64 logical)
2267 {
2268         int stripe_offset;
2269         int index;
2270
2271         ASSERT(logical >= rbio->bbio->raid_map[0]);
2272         ASSERT(logical + PAGE_SIZE <= rbio->bbio->raid_map[0] +
2273                                 rbio->stripe_len * rbio->nr_data);
2274         stripe_offset = (int)(logical - rbio->bbio->raid_map[0]);
2275         index = stripe_offset >> PAGE_CACHE_SHIFT;
2276         rbio->bio_pages[index] = page;
2277 }
2278
2279 /*
2280  * We just scrub the parity that we have correct data on the same horizontal,
2281  * so we needn't allocate all pages for all the stripes.
2282  */
2283 static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio)
2284 {
2285         int i;
2286         int bit;
2287         int index;
2288         struct page *page;
2289
2290         for_each_set_bit(bit, rbio->dbitmap, rbio->stripe_npages) {
2291                 for (i = 0; i < rbio->real_stripes; i++) {
2292                         index = i * rbio->stripe_npages + bit;
2293                         if (rbio->stripe_pages[index])
2294                                 continue;
2295
2296                         page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
2297                         if (!page)
2298                                 return -ENOMEM;
2299                         rbio->stripe_pages[index] = page;
2300                         ClearPageUptodate(page);
2301                 }
2302         }
2303         return 0;
2304 }
2305
2306 /*
2307  * end io function used by finish_rmw.  When we finally
2308  * get here, we've written a full stripe
2309  */
2310 static void raid_write_parity_end_io(struct bio *bio, int err)
2311 {
2312         struct btrfs_raid_bio *rbio = bio->bi_private;
2313
2314         if (err)
2315                 fail_bio_stripe(rbio, bio);
2316
2317         bio_put(bio);
2318
2319         if (!atomic_dec_and_test(&rbio->stripes_pending))
2320                 return;
2321
2322         err = 0;
2323
2324         if (atomic_read(&rbio->error))
2325                 err = -EIO;
2326
2327         rbio_orig_end_io(rbio, err, 0);
2328 }
2329
2330 static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
2331                                          int need_check)
2332 {
2333         struct btrfs_bio *bbio = rbio->bbio;
2334         void *pointers[rbio->real_stripes];
2335         DECLARE_BITMAP(pbitmap, rbio->stripe_npages);
2336         int nr_data = rbio->nr_data;
2337         int stripe;
2338         int pagenr;
2339         int p_stripe = -1;
2340         int q_stripe = -1;
2341         struct page *p_page = NULL;
2342         struct page *q_page = NULL;
2343         struct bio_list bio_list;
2344         struct bio *bio;
2345         int is_replace = 0;
2346         int ret;
2347
2348         bio_list_init(&bio_list);
2349
2350         if (rbio->real_stripes - rbio->nr_data == 1) {
2351                 p_stripe = rbio->real_stripes - 1;
2352         } else if (rbio->real_stripes - rbio->nr_data == 2) {
2353                 p_stripe = rbio->real_stripes - 2;
2354                 q_stripe = rbio->real_stripes - 1;
2355         } else {
2356                 BUG();
2357         }
2358
2359         if (bbio->num_tgtdevs && bbio->tgtdev_map[rbio->scrubp]) {
2360                 is_replace = 1;
2361                 bitmap_copy(pbitmap, rbio->dbitmap, rbio->stripe_npages);
2362         }
2363
2364         /*
2365          * Because the higher layers(scrubber) are unlikely to
2366          * use this area of the disk again soon, so don't cache
2367          * it.
2368          */
2369         clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
2370
2371         if (!need_check)
2372                 goto writeback;
2373
2374         p_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
2375         if (!p_page)
2376                 goto cleanup;
2377         SetPageUptodate(p_page);
2378
2379         if (q_stripe != -1) {
2380                 q_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
2381                 if (!q_page) {
2382                         __free_page(p_page);
2383                         goto cleanup;
2384                 }
2385                 SetPageUptodate(q_page);
2386         }
2387
2388         atomic_set(&rbio->error, 0);
2389
2390         for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) {
2391                 struct page *p;
2392                 void *parity;
2393                 /* first collect one page from each data stripe */
2394                 for (stripe = 0; stripe < nr_data; stripe++) {
2395                         p = page_in_rbio(rbio, stripe, pagenr, 0);
2396                         pointers[stripe] = kmap(p);
2397                 }
2398
2399                 /* then add the parity stripe */
2400                 pointers[stripe++] = kmap(p_page);
2401
2402                 if (q_stripe != -1) {
2403
2404                         /*
2405                          * raid6, add the qstripe and call the
2406                          * library function to fill in our p/q
2407                          */
2408                         pointers[stripe++] = kmap(q_page);
2409
2410                         raid6_call.gen_syndrome(rbio->real_stripes, PAGE_SIZE,
2411                                                 pointers);
2412                 } else {
2413                         /* raid5 */
2414                         memcpy(pointers[nr_data], pointers[0], PAGE_SIZE);
2415                         run_xor(pointers + 1, nr_data - 1, PAGE_CACHE_SIZE);
2416                 }
2417
2418                 /* Check scrubbing pairty and repair it */
2419                 p = rbio_stripe_page(rbio, rbio->scrubp, pagenr);
2420                 parity = kmap(p);
2421                 if (memcmp(parity, pointers[rbio->scrubp], PAGE_CACHE_SIZE))
2422                         memcpy(parity, pointers[rbio->scrubp], PAGE_CACHE_SIZE);
2423                 else
2424                         /* Parity is right, needn't writeback */
2425                         bitmap_clear(rbio->dbitmap, pagenr, 1);
2426                 kunmap(p);
2427
2428                 for (stripe = 0; stripe < rbio->real_stripes; stripe++)
2429                         kunmap(page_in_rbio(rbio, stripe, pagenr, 0));
2430         }
2431
2432         __free_page(p_page);
2433         if (q_page)
2434                 __free_page(q_page);
2435
2436 writeback:
2437         /*
2438          * time to start writing.  Make bios for everything from the
2439          * higher layers (the bio_list in our rbio) and our p/q.  Ignore
2440          * everything else.
2441          */
2442         for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) {
2443                 struct page *page;
2444
2445                 page = rbio_stripe_page(rbio, rbio->scrubp, pagenr);
2446                 ret = rbio_add_io_page(rbio, &bio_list,
2447                                page, rbio->scrubp, pagenr, rbio->stripe_len);
2448                 if (ret)
2449                         goto cleanup;
2450         }
2451
2452         if (!is_replace)
2453                 goto submit_write;
2454
2455         for_each_set_bit(pagenr, pbitmap, rbio->stripe_npages) {
2456                 struct page *page;
2457
2458                 page = rbio_stripe_page(rbio, rbio->scrubp, pagenr);
2459                 ret = rbio_add_io_page(rbio, &bio_list, page,
2460                                        bbio->tgtdev_map[rbio->scrubp],
2461                                        pagenr, rbio->stripe_len);
2462                 if (ret)
2463                         goto cleanup;
2464         }
2465
2466 submit_write:
2467         nr_data = bio_list_size(&bio_list);
2468         if (!nr_data) {
2469                 /* Every parity is right */
2470                 rbio_orig_end_io(rbio, 0, 0);
2471                 return;
2472         }
2473
2474         atomic_set(&rbio->stripes_pending, nr_data);
2475
2476         while (1) {
2477                 bio = bio_list_pop(&bio_list);
2478                 if (!bio)
2479                         break;
2480
2481                 bio->bi_private = rbio;
2482                 bio->bi_end_io = raid_write_parity_end_io;
2483                 BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
2484                 submit_bio(WRITE, bio);
2485         }
2486         return;
2487
2488 cleanup:
2489         rbio_orig_end_io(rbio, -EIO, 0);
2490 }
2491
2492 static inline int is_data_stripe(struct btrfs_raid_bio *rbio, int stripe)
2493 {
2494         if (stripe >= 0 && stripe < rbio->nr_data)
2495                 return 1;
2496         return 0;
2497 }
2498
2499 /*
2500  * While we're doing the parity check and repair, we could have errors
2501  * in reading pages off the disk.  This checks for errors and if we're
2502  * not able to read the page it'll trigger parity reconstruction.  The
2503  * parity scrub will be finished after we've reconstructed the failed
2504  * stripes
2505  */
2506 static void validate_rbio_for_parity_scrub(struct btrfs_raid_bio *rbio)
2507 {
2508         if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
2509                 goto cleanup;
2510
2511         if (rbio->faila >= 0 || rbio->failb >= 0) {
2512                 int dfail = 0, failp = -1;
2513
2514                 if (is_data_stripe(rbio, rbio->faila))
2515                         dfail++;
2516                 else if (is_parity_stripe(rbio->faila))
2517                         failp = rbio->faila;
2518
2519                 if (is_data_stripe(rbio, rbio->failb))
2520                         dfail++;
2521                 else if (is_parity_stripe(rbio->failb))
2522                         failp = rbio->failb;
2523
2524                 /*
2525                  * Because we can not use a scrubbing parity to repair
2526                  * the data, so the capability of the repair is declined.
2527                  * (In the case of RAID5, we can not repair anything)
2528                  */
2529                 if (dfail > rbio->bbio->max_errors - 1)
2530                         goto cleanup;
2531
2532                 /*
2533                  * If all data is good, only parity is correctly, just
2534                  * repair the parity.
2535                  */
2536                 if (dfail == 0) {
2537                         finish_parity_scrub(rbio, 0);
2538                         return;
2539                 }
2540
2541                 /*
2542                  * Here means we got one corrupted data stripe and one
2543                  * corrupted parity on RAID6, if the corrupted parity
2544                  * is scrubbing parity, luckly, use the other one to repair
2545                  * the data, or we can not repair the data stripe.
2546                  */
2547                 if (failp != rbio->scrubp)
2548                         goto cleanup;
2549
2550                 __raid_recover_end_io(rbio);
2551         } else {
2552                 finish_parity_scrub(rbio, 1);
2553         }
2554         return;
2555
2556 cleanup:
2557         rbio_orig_end_io(rbio, -EIO, 0);
2558 }
2559
2560 /*
2561  * end io for the read phase of the rmw cycle.  All the bios here are physical
2562  * stripe bios we've read from the disk so we can recalculate the parity of the
2563  * stripe.
2564  *
2565  * This will usually kick off finish_rmw once all the bios are read in, but it
2566  * may trigger parity reconstruction if we had any errors along the way
2567  */
2568 static void raid56_parity_scrub_end_io(struct bio *bio, int err)
2569 {
2570         struct btrfs_raid_bio *rbio = bio->bi_private;
2571
2572         if (err)
2573                 fail_bio_stripe(rbio, bio);
2574         else
2575                 set_bio_pages_uptodate(bio);
2576
2577         bio_put(bio);
2578
2579         if (!atomic_dec_and_test(&rbio->stripes_pending))
2580                 return;
2581
2582         /*
2583          * this will normally call finish_rmw to start our write
2584          * but if there are any failed stripes we'll reconstruct
2585          * from parity first
2586          */
2587         validate_rbio_for_parity_scrub(rbio);
2588 }
2589
2590 static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio)
2591 {
2592         int bios_to_read = 0;
2593         struct bio_list bio_list;
2594         int ret;
2595         int pagenr;
2596         int stripe;
2597         struct bio *bio;
2598
2599         ret = alloc_rbio_essential_pages(rbio);
2600         if (ret)
2601                 goto cleanup;
2602
2603         bio_list_init(&bio_list);
2604
2605         atomic_set(&rbio->error, 0);
2606         /*
2607          * build a list of bios to read all the missing parts of this
2608          * stripe
2609          */
2610         for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
2611                 for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) {
2612                         struct page *page;
2613                         /*
2614                          * we want to find all the pages missing from
2615                          * the rbio and read them from the disk.  If
2616                          * page_in_rbio finds a page in the bio list
2617                          * we don't need to read it off the stripe.
2618                          */
2619                         page = page_in_rbio(rbio, stripe, pagenr, 1);
2620                         if (page)
2621                                 continue;
2622
2623                         page = rbio_stripe_page(rbio, stripe, pagenr);
2624                         /*
2625                          * the bio cache may have handed us an uptodate
2626                          * page.  If so, be happy and use it
2627                          */
2628                         if (PageUptodate(page))
2629                                 continue;
2630
2631                         ret = rbio_add_io_page(rbio, &bio_list, page,
2632                                        stripe, pagenr, rbio->stripe_len);
2633                         if (ret)
2634                                 goto cleanup;
2635                 }
2636         }
2637
2638         bios_to_read = bio_list_size(&bio_list);
2639         if (!bios_to_read) {
2640                 /*
2641                  * this can happen if others have merged with
2642                  * us, it means there is nothing left to read.
2643                  * But if there are missing devices it may not be
2644                  * safe to do the full stripe write yet.
2645                  */
2646                 goto finish;
2647         }
2648
2649         /*
2650          * the bbio may be freed once we submit the last bio.  Make sure
2651          * not to touch it after that
2652          */
2653         atomic_set(&rbio->stripes_pending, bios_to_read);
2654         while (1) {
2655                 bio = bio_list_pop(&bio_list);
2656                 if (!bio)
2657                         break;
2658
2659                 bio->bi_private = rbio;
2660                 bio->bi_end_io = raid56_parity_scrub_end_io;
2661
2662                 btrfs_bio_wq_end_io(rbio->fs_info, bio,
2663                                     BTRFS_WQ_ENDIO_RAID56);
2664
2665                 BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
2666                 submit_bio(READ, bio);
2667         }
2668         /* the actual write will happen once the reads are done */
2669         return;
2670
2671 cleanup:
2672         rbio_orig_end_io(rbio, -EIO, 0);
2673         return;
2674
2675 finish:
2676         validate_rbio_for_parity_scrub(rbio);
2677 }
2678
2679 static void scrub_parity_work(struct btrfs_work *work)
2680 {
2681         struct btrfs_raid_bio *rbio;
2682
2683         rbio = container_of(work, struct btrfs_raid_bio, work);
2684         raid56_parity_scrub_stripe(rbio);
2685 }
2686
2687 static void async_scrub_parity(struct btrfs_raid_bio *rbio)
2688 {
2689         btrfs_init_work(&rbio->work, btrfs_rmw_helper,
2690                         scrub_parity_work, NULL, NULL);
2691
2692         btrfs_queue_work(rbio->fs_info->rmw_workers,
2693                          &rbio->work);
2694 }
2695
/*
 * Entry point for submitting a parity scrub rbio.
 *
 * The scrub is queued for asynchronous processing only when
 * lock_stripe_add() returns zero.  NOTE(review): a non-zero return
 * presumably means the rbio was parked behind the current holder of
 * the stripe lock and will be started later - confirm against
 * lock_stripe_add().
 */
void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio)
{
        if (lock_stripe_add(rbio))
                return;

        async_scrub_parity(rbio);
}