]> git.proxmox.com Git - mirror_ubuntu-bionic-kernel.git/blob - fs/btrfs/raid56.c
Btrfs: Make raid_map array be inlined in btrfs_bio structure
[mirror_ubuntu-bionic-kernel.git] / fs / btrfs / raid56.c
1 /*
2 * Copyright (C) 2012 Fusion-io All rights reserved.
3 * Copyright (C) 2012 Intel Corp. All rights reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public
7 * License v2 as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public
15 * License along with this program; if not, write to the
16 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
17 * Boston, MA 02111-1307, USA.
18 */
19 #include <linux/sched.h>
20 #include <linux/wait.h>
21 #include <linux/bio.h>
22 #include <linux/slab.h>
23 #include <linux/buffer_head.h>
24 #include <linux/blkdev.h>
25 #include <linux/random.h>
26 #include <linux/iocontext.h>
27 #include <linux/capability.h>
28 #include <linux/ratelimit.h>
29 #include <linux/kthread.h>
30 #include <linux/raid/pq.h>
31 #include <linux/hash.h>
32 #include <linux/list_sort.h>
33 #include <linux/raid/xor.h>
34 #include <linux/vmalloc.h>
35 #include <asm/div64.h>
36 #include "ctree.h"
37 #include "extent_map.h"
38 #include "disk-io.h"
39 #include "transaction.h"
40 #include "print-tree.h"
41 #include "volumes.h"
42 #include "raid56.h"
43 #include "async-thread.h"
44 #include "check-integrity.h"
45 #include "rcu-string.h"
46
/*
 * Bit numbers for btrfs_raid_bio::flags, manipulated with
 * test_bit()/set_bit()/clear_bit().
 */

/* once set, no further rbios may be merged into this one */
#define RBIO_RMW_LOCKED_BIT	1

/*
 * the rbio is sitting in the stripe hash only as a cache of an
 * earlier RMW
 */
#define RBIO_CACHE_BIT		2

/* the contents of stripe_pages can be trusted for caching */
#define RBIO_CACHE_READY_BIT	3

/*
 * The bbio (and the raid_map inside it) is managed by the caller, so
 * it must not be freed here.  rbios carrying this flag should also
 * never be cached: raid_map is needed to check whether two rbios cover
 * the same stripe, and the caller has very likely freed it by then.
 */
#define RBIO_HOLD_BBIO_MAP_BIT	4

/* once the stripe cache grows past this many entries, one is pruned */
#define RBIO_CACHE_SIZE	1024
71
/* the kind of work an rbio performs; stored in btrfs_raid_bio::operation */
enum btrfs_rbio_ops {
	BTRFS_RBIO_WRITE		= 0,
	BTRFS_RBIO_READ_REBUILD		= 1,
	BTRFS_RBIO_PARITY_SCRUB		= 2,
};
77
78 struct btrfs_raid_bio {
79 struct btrfs_fs_info *fs_info;
80 struct btrfs_bio *bbio;
81
82 /* while we're doing rmw on a stripe
83 * we put it into a hash table so we can
84 * lock the stripe and merge more rbios
85 * into it.
86 */
87 struct list_head hash_list;
88
89 /*
90 * LRU list for the stripe cache
91 */
92 struct list_head stripe_cache;
93
94 /*
95 * for scheduling work in the helper threads
96 */
97 struct btrfs_work work;
98
99 /*
100 * bio list and bio_list_lock are used
101 * to add more bios into the stripe
102 * in hopes of avoiding the full rmw
103 */
104 struct bio_list bio_list;
105 spinlock_t bio_list_lock;
106
107 /* also protected by the bio_list_lock, the
108 * plug list is used by the plugging code
109 * to collect partial bios while plugged. The
110 * stripe locking code also uses it to hand off
111 * the stripe lock to the next pending IO
112 */
113 struct list_head plug_list;
114
115 /*
116 * flags that tell us if it is safe to
117 * merge with this bio
118 */
119 unsigned long flags;
120
121 /* size of each individual stripe on disk */
122 int stripe_len;
123
124 /* number of data stripes (no p/q) */
125 int nr_data;
126
127 int real_stripes;
128
129 int stripe_npages;
130 /*
131 * set if we're doing a parity rebuild
132 * for a read from higher up, which is handled
133 * differently from a parity rebuild as part of
134 * rmw
135 */
136 enum btrfs_rbio_ops operation;
137
138 /* first bad stripe */
139 int faila;
140
141 /* second bad stripe (for raid6 use) */
142 int failb;
143
144 int scrubp;
145 /*
146 * number of pages needed to represent the full
147 * stripe
148 */
149 int nr_pages;
150
151 /*
152 * size of all the bios in the bio_list. This
153 * helps us decide if the rbio maps to a full
154 * stripe or not
155 */
156 int bio_list_bytes;
157
158 int generic_bio_cnt;
159
160 atomic_t refs;
161
162 atomic_t stripes_pending;
163
164 atomic_t error;
165 /*
166 * these are two arrays of pointers. We allocate the
167 * rbio big enough to hold them both and setup their
168 * locations when the rbio is allocated
169 */
170
171 /* pointers to pages that we allocated for
172 * reading/writing stripes directly from the disk (including P/Q)
173 */
174 struct page **stripe_pages;
175
176 /*
177 * pointers to the pages in the bio_list. Stored
178 * here for faster lookup
179 */
180 struct page **bio_pages;
181
182 /*
183 * bitmap to record which horizontal stripe has data
184 */
185 unsigned long *dbitmap;
186 };
187
188 static int __raid56_parity_recover(struct btrfs_raid_bio *rbio);
189 static noinline void finish_rmw(struct btrfs_raid_bio *rbio);
190 static void rmw_work(struct btrfs_work *work);
191 static void read_rebuild_work(struct btrfs_work *work);
192 static void async_rmw_stripe(struct btrfs_raid_bio *rbio);
193 static void async_read_rebuild(struct btrfs_raid_bio *rbio);
194 static int fail_bio_stripe(struct btrfs_raid_bio *rbio, struct bio *bio);
195 static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed);
196 static void __free_raid_bio(struct btrfs_raid_bio *rbio);
197 static void index_rbio_pages(struct btrfs_raid_bio *rbio);
198 static int alloc_rbio_pages(struct btrfs_raid_bio *rbio);
199
200 static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
201 int need_check);
202 static void async_scrub_parity(struct btrfs_raid_bio *rbio);
203
204 /*
205 * the stripe hash table is used for locking, and to collect
206 * bios in hopes of making a full stripe
207 */
208 int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info)
209 {
210 struct btrfs_stripe_hash_table *table;
211 struct btrfs_stripe_hash_table *x;
212 struct btrfs_stripe_hash *cur;
213 struct btrfs_stripe_hash *h;
214 int num_entries = 1 << BTRFS_STRIPE_HASH_TABLE_BITS;
215 int i;
216 int table_size;
217
218 if (info->stripe_hash_table)
219 return 0;
220
221 /*
222 * The table is large, starting with order 4 and can go as high as
223 * order 7 in case lock debugging is turned on.
224 *
225 * Try harder to allocate and fallback to vmalloc to lower the chance
226 * of a failing mount.
227 */
228 table_size = sizeof(*table) + sizeof(*h) * num_entries;
229 table = kzalloc(table_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
230 if (!table) {
231 table = vzalloc(table_size);
232 if (!table)
233 return -ENOMEM;
234 }
235
236 spin_lock_init(&table->cache_lock);
237 INIT_LIST_HEAD(&table->stripe_cache);
238
239 h = table->table;
240
241 for (i = 0; i < num_entries; i++) {
242 cur = h + i;
243 INIT_LIST_HEAD(&cur->hash_list);
244 spin_lock_init(&cur->lock);
245 init_waitqueue_head(&cur->wait);
246 }
247
248 x = cmpxchg(&info->stripe_hash_table, NULL, table);
249 if (x) {
250 if (is_vmalloc_addr(x))
251 vfree(x);
252 else
253 kfree(x);
254 }
255 return 0;
256 }
257
258 /*
259 * caching an rbio means to copy anything from the
260 * bio_pages array into the stripe_pages array. We
261 * use the page uptodate bit in the stripe cache array
262 * to indicate if it has valid data
263 *
264 * once the caching is done, we set the cache ready
265 * bit.
266 */
267 static void cache_rbio_pages(struct btrfs_raid_bio *rbio)
268 {
269 int i;
270 char *s;
271 char *d;
272 int ret;
273
274 ret = alloc_rbio_pages(rbio);
275 if (ret)
276 return;
277
278 for (i = 0; i < rbio->nr_pages; i++) {
279 if (!rbio->bio_pages[i])
280 continue;
281
282 s = kmap(rbio->bio_pages[i]);
283 d = kmap(rbio->stripe_pages[i]);
284
285 memcpy(d, s, PAGE_CACHE_SIZE);
286
287 kunmap(rbio->bio_pages[i]);
288 kunmap(rbio->stripe_pages[i]);
289 SetPageUptodate(rbio->stripe_pages[i]);
290 }
291 set_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
292 }
293
294 /*
295 * we hash on the first logical address of the stripe
296 */
297 static int rbio_bucket(struct btrfs_raid_bio *rbio)
298 {
299 u64 num = rbio->bbio->raid_map[0];
300
301 /*
302 * we shift down quite a bit. We're using byte
303 * addressing, and most of the lower bits are zeros.
304 * This tends to upset hash_64, and it consistently
305 * returns just one or two different values.
306 *
307 * shifting off the lower bits fixes things.
308 */
309 return hash_64(num >> 16, BTRFS_STRIPE_HASH_TABLE_BITS);
310 }
311
312 /*
313 * stealing an rbio means taking all the uptodate pages from the stripe
314 * array in the source rbio and putting them into the destination rbio
315 */
316 static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest)
317 {
318 int i;
319 struct page *s;
320 struct page *d;
321
322 if (!test_bit(RBIO_CACHE_READY_BIT, &src->flags))
323 return;
324
325 for (i = 0; i < dest->nr_pages; i++) {
326 s = src->stripe_pages[i];
327 if (!s || !PageUptodate(s)) {
328 continue;
329 }
330
331 d = dest->stripe_pages[i];
332 if (d)
333 __free_page(d);
334
335 dest->stripe_pages[i] = s;
336 src->stripe_pages[i] = NULL;
337 }
338 }
339
340 /*
341 * merging means we take the bio_list from the victim and
342 * splice it into the destination. The victim should
343 * be discarded afterwards.
344 *
345 * must be called with dest->rbio_list_lock held
346 */
347 static void merge_rbio(struct btrfs_raid_bio *dest,
348 struct btrfs_raid_bio *victim)
349 {
350 bio_list_merge(&dest->bio_list, &victim->bio_list);
351 dest->bio_list_bytes += victim->bio_list_bytes;
352 dest->generic_bio_cnt += victim->generic_bio_cnt;
353 bio_list_init(&victim->bio_list);
354 }
355
356 /*
357 * used to prune items that are in the cache. The caller
358 * must hold the hash table lock.
359 */
360 static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
361 {
362 int bucket = rbio_bucket(rbio);
363 struct btrfs_stripe_hash_table *table;
364 struct btrfs_stripe_hash *h;
365 int freeit = 0;
366
367 /*
368 * check the bit again under the hash table lock.
369 */
370 if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
371 return;
372
373 table = rbio->fs_info->stripe_hash_table;
374 h = table->table + bucket;
375
376 /* hold the lock for the bucket because we may be
377 * removing it from the hash table
378 */
379 spin_lock(&h->lock);
380
381 /*
382 * hold the lock for the bio list because we need
383 * to make sure the bio list is empty
384 */
385 spin_lock(&rbio->bio_list_lock);
386
387 if (test_and_clear_bit(RBIO_CACHE_BIT, &rbio->flags)) {
388 list_del_init(&rbio->stripe_cache);
389 table->cache_size -= 1;
390 freeit = 1;
391
392 /* if the bio list isn't empty, this rbio is
393 * still involved in an IO. We take it out
394 * of the cache list, and drop the ref that
395 * was held for the list.
396 *
397 * If the bio_list was empty, we also remove
398 * the rbio from the hash_table, and drop
399 * the corresponding ref
400 */
401 if (bio_list_empty(&rbio->bio_list)) {
402 if (!list_empty(&rbio->hash_list)) {
403 list_del_init(&rbio->hash_list);
404 atomic_dec(&rbio->refs);
405 BUG_ON(!list_empty(&rbio->plug_list));
406 }
407 }
408 }
409
410 spin_unlock(&rbio->bio_list_lock);
411 spin_unlock(&h->lock);
412
413 if (freeit)
414 __free_raid_bio(rbio);
415 }
416
417 /*
418 * prune a given rbio from the cache
419 */
420 static void remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
421 {
422 struct btrfs_stripe_hash_table *table;
423 unsigned long flags;
424
425 if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
426 return;
427
428 table = rbio->fs_info->stripe_hash_table;
429
430 spin_lock_irqsave(&table->cache_lock, flags);
431 __remove_rbio_from_cache(rbio);
432 spin_unlock_irqrestore(&table->cache_lock, flags);
433 }
434
435 /*
436 * remove everything in the cache
437 */
438 static void btrfs_clear_rbio_cache(struct btrfs_fs_info *info)
439 {
440 struct btrfs_stripe_hash_table *table;
441 unsigned long flags;
442 struct btrfs_raid_bio *rbio;
443
444 table = info->stripe_hash_table;
445
446 spin_lock_irqsave(&table->cache_lock, flags);
447 while (!list_empty(&table->stripe_cache)) {
448 rbio = list_entry(table->stripe_cache.next,
449 struct btrfs_raid_bio,
450 stripe_cache);
451 __remove_rbio_from_cache(rbio);
452 }
453 spin_unlock_irqrestore(&table->cache_lock, flags);
454 }
455
456 /*
457 * remove all cached entries and free the hash table
458 * used by unmount
459 */
460 void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info)
461 {
462 if (!info->stripe_hash_table)
463 return;
464 btrfs_clear_rbio_cache(info);
465 if (is_vmalloc_addr(info->stripe_hash_table))
466 vfree(info->stripe_hash_table);
467 else
468 kfree(info->stripe_hash_table);
469 info->stripe_hash_table = NULL;
470 }
471
472 /*
473 * insert an rbio into the stripe cache. It
474 * must have already been prepared by calling
475 * cache_rbio_pages
476 *
477 * If this rbio was already cached, it gets
478 * moved to the front of the lru.
479 *
480 * If the size of the rbio cache is too big, we
481 * prune an item.
482 */
483 static void cache_rbio(struct btrfs_raid_bio *rbio)
484 {
485 struct btrfs_stripe_hash_table *table;
486 unsigned long flags;
487
488 if (!test_bit(RBIO_CACHE_READY_BIT, &rbio->flags))
489 return;
490
491 table = rbio->fs_info->stripe_hash_table;
492
493 spin_lock_irqsave(&table->cache_lock, flags);
494 spin_lock(&rbio->bio_list_lock);
495
496 /* bump our ref if we were not in the list before */
497 if (!test_and_set_bit(RBIO_CACHE_BIT, &rbio->flags))
498 atomic_inc(&rbio->refs);
499
500 if (!list_empty(&rbio->stripe_cache)){
501 list_move(&rbio->stripe_cache, &table->stripe_cache);
502 } else {
503 list_add(&rbio->stripe_cache, &table->stripe_cache);
504 table->cache_size += 1;
505 }
506
507 spin_unlock(&rbio->bio_list_lock);
508
509 if (table->cache_size > RBIO_CACHE_SIZE) {
510 struct btrfs_raid_bio *found;
511
512 found = list_entry(table->stripe_cache.prev,
513 struct btrfs_raid_bio,
514 stripe_cache);
515
516 if (found != rbio)
517 __remove_rbio_from_cache(found);
518 }
519
520 spin_unlock_irqrestore(&table->cache_lock, flags);
521 return;
522 }
523
524 /*
525 * helper function to run the xor_blocks api. It is only
526 * able to do MAX_XOR_BLOCKS at a time, so we need to
527 * loop through.
528 */
529 static void run_xor(void **pages, int src_cnt, ssize_t len)
530 {
531 int src_off = 0;
532 int xor_src_cnt = 0;
533 void *dest = pages[src_cnt];
534
535 while(src_cnt > 0) {
536 xor_src_cnt = min(src_cnt, MAX_XOR_BLOCKS);
537 xor_blocks(xor_src_cnt, len, dest, pages + src_off);
538
539 src_cnt -= xor_src_cnt;
540 src_off += xor_src_cnt;
541 }
542 }
543
544 /*
545 * returns true if the bio list inside this rbio
546 * covers an entire stripe (no rmw required).
547 * Must be called with the bio list lock held, or
548 * at a time when you know it is impossible to add
549 * new bios into the list
550 */
551 static int __rbio_is_full(struct btrfs_raid_bio *rbio)
552 {
553 unsigned long size = rbio->bio_list_bytes;
554 int ret = 1;
555
556 if (size != rbio->nr_data * rbio->stripe_len)
557 ret = 0;
558
559 BUG_ON(size > rbio->nr_data * rbio->stripe_len);
560 return ret;
561 }
562
563 static int rbio_is_full(struct btrfs_raid_bio *rbio)
564 {
565 unsigned long flags;
566 int ret;
567
568 spin_lock_irqsave(&rbio->bio_list_lock, flags);
569 ret = __rbio_is_full(rbio);
570 spin_unlock_irqrestore(&rbio->bio_list_lock, flags);
571 return ret;
572 }
573
574 /*
575 * returns 1 if it is safe to merge two rbios together.
576 * The merging is safe if the two rbios correspond to
577 * the same stripe and if they are both going in the same
578 * direction (read vs write), and if neither one is
579 * locked for final IO
580 *
581 * The caller is responsible for locking such that
582 * rmw_locked is safe to test
583 */
584 static int rbio_can_merge(struct btrfs_raid_bio *last,
585 struct btrfs_raid_bio *cur)
586 {
587 if (test_bit(RBIO_RMW_LOCKED_BIT, &last->flags) ||
588 test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags))
589 return 0;
590
591 /*
592 * we can't merge with cached rbios, since the
593 * idea is that when we merge the destination
594 * rbio is going to run our IO for us. We can
595 * steal from cached rbio's though, other functions
596 * handle that.
597 */
598 if (test_bit(RBIO_CACHE_BIT, &last->flags) ||
599 test_bit(RBIO_CACHE_BIT, &cur->flags))
600 return 0;
601
602 if (last->bbio->raid_map[0] !=
603 cur->bbio->raid_map[0])
604 return 0;
605
606 /* we can't merge with different operations */
607 if (last->operation != cur->operation)
608 return 0;
609 /*
610 * We've need read the full stripe from the drive.
611 * check and repair the parity and write the new results.
612 *
613 * We're not allowed to add any new bios to the
614 * bio list here, anyone else that wants to
615 * change this stripe needs to do their own rmw.
616 */
617 if (last->operation == BTRFS_RBIO_PARITY_SCRUB ||
618 cur->operation == BTRFS_RBIO_PARITY_SCRUB)
619 return 0;
620
621 return 1;
622 }
623
624 /*
625 * helper to index into the pstripe
626 */
627 static struct page *rbio_pstripe_page(struct btrfs_raid_bio *rbio, int index)
628 {
629 index += (rbio->nr_data * rbio->stripe_len) >> PAGE_CACHE_SHIFT;
630 return rbio->stripe_pages[index];
631 }
632
633 /*
634 * helper to index into the qstripe, returns null
635 * if there is no qstripe
636 */
637 static struct page *rbio_qstripe_page(struct btrfs_raid_bio *rbio, int index)
638 {
639 if (rbio->nr_data + 1 == rbio->real_stripes)
640 return NULL;
641
642 index += ((rbio->nr_data + 1) * rbio->stripe_len) >>
643 PAGE_CACHE_SHIFT;
644 return rbio->stripe_pages[index];
645 }
646
647 /*
648 * The first stripe in the table for a logical address
649 * has the lock. rbios are added in one of three ways:
650 *
651 * 1) Nobody has the stripe locked yet. The rbio is given
652 * the lock and 0 is returned. The caller must start the IO
653 * themselves.
654 *
655 * 2) Someone has the stripe locked, but we're able to merge
656 * with the lock owner. The rbio is freed and the IO will
657 * start automatically along with the existing rbio. 1 is returned.
658 *
659 * 3) Someone has the stripe locked, but we're not able to merge.
660 * The rbio is added to the lock owner's plug list, or merged into
661 * an rbio already on the plug list. When the lock owner unlocks,
662 * the next rbio on the list is run and the IO is started automatically.
663 * 1 is returned
664 *
665 * If we return 0, the caller still owns the rbio and must continue with
666 * IO submission. If we return 1, the caller must assume the rbio has
667 * already been freed.
668 */
669 static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio)
670 {
671 int bucket = rbio_bucket(rbio);
672 struct btrfs_stripe_hash *h = rbio->fs_info->stripe_hash_table->table + bucket;
673 struct btrfs_raid_bio *cur;
674 struct btrfs_raid_bio *pending;
675 unsigned long flags;
676 DEFINE_WAIT(wait);
677 struct btrfs_raid_bio *freeit = NULL;
678 struct btrfs_raid_bio *cache_drop = NULL;
679 int ret = 0;
680 int walk = 0;
681
682 spin_lock_irqsave(&h->lock, flags);
683 list_for_each_entry(cur, &h->hash_list, hash_list) {
684 walk++;
685 if (cur->bbio->raid_map[0] == rbio->bbio->raid_map[0]) {
686 spin_lock(&cur->bio_list_lock);
687
688 /* can we steal this cached rbio's pages? */
689 if (bio_list_empty(&cur->bio_list) &&
690 list_empty(&cur->plug_list) &&
691 test_bit(RBIO_CACHE_BIT, &cur->flags) &&
692 !test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) {
693 list_del_init(&cur->hash_list);
694 atomic_dec(&cur->refs);
695
696 steal_rbio(cur, rbio);
697 cache_drop = cur;
698 spin_unlock(&cur->bio_list_lock);
699
700 goto lockit;
701 }
702
703 /* can we merge into the lock owner? */
704 if (rbio_can_merge(cur, rbio)) {
705 merge_rbio(cur, rbio);
706 spin_unlock(&cur->bio_list_lock);
707 freeit = rbio;
708 ret = 1;
709 goto out;
710 }
711
712
713 /*
714 * we couldn't merge with the running
715 * rbio, see if we can merge with the
716 * pending ones. We don't have to
717 * check for rmw_locked because there
718 * is no way they are inside finish_rmw
719 * right now
720 */
721 list_for_each_entry(pending, &cur->plug_list,
722 plug_list) {
723 if (rbio_can_merge(pending, rbio)) {
724 merge_rbio(pending, rbio);
725 spin_unlock(&cur->bio_list_lock);
726 freeit = rbio;
727 ret = 1;
728 goto out;
729 }
730 }
731
732 /* no merging, put us on the tail of the plug list,
733 * our rbio will be started with the currently
734 * running rbio unlocks
735 */
736 list_add_tail(&rbio->plug_list, &cur->plug_list);
737 spin_unlock(&cur->bio_list_lock);
738 ret = 1;
739 goto out;
740 }
741 }
742 lockit:
743 atomic_inc(&rbio->refs);
744 list_add(&rbio->hash_list, &h->hash_list);
745 out:
746 spin_unlock_irqrestore(&h->lock, flags);
747 if (cache_drop)
748 remove_rbio_from_cache(cache_drop);
749 if (freeit)
750 __free_raid_bio(freeit);
751 return ret;
752 }
753
754 /*
755 * called as rmw or parity rebuild is completed. If the plug list has more
756 * rbios waiting for this stripe, the next one on the list will be started
757 */
758 static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
759 {
760 int bucket;
761 struct btrfs_stripe_hash *h;
762 unsigned long flags;
763 int keep_cache = 0;
764
765 bucket = rbio_bucket(rbio);
766 h = rbio->fs_info->stripe_hash_table->table + bucket;
767
768 if (list_empty(&rbio->plug_list))
769 cache_rbio(rbio);
770
771 spin_lock_irqsave(&h->lock, flags);
772 spin_lock(&rbio->bio_list_lock);
773
774 if (!list_empty(&rbio->hash_list)) {
775 /*
776 * if we're still cached and there is no other IO
777 * to perform, just leave this rbio here for others
778 * to steal from later
779 */
780 if (list_empty(&rbio->plug_list) &&
781 test_bit(RBIO_CACHE_BIT, &rbio->flags)) {
782 keep_cache = 1;
783 clear_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
784 BUG_ON(!bio_list_empty(&rbio->bio_list));
785 goto done;
786 }
787
788 list_del_init(&rbio->hash_list);
789 atomic_dec(&rbio->refs);
790
791 /*
792 * we use the plug list to hold all the rbios
793 * waiting for the chance to lock this stripe.
794 * hand the lock over to one of them.
795 */
796 if (!list_empty(&rbio->plug_list)) {
797 struct btrfs_raid_bio *next;
798 struct list_head *head = rbio->plug_list.next;
799
800 next = list_entry(head, struct btrfs_raid_bio,
801 plug_list);
802
803 list_del_init(&rbio->plug_list);
804
805 list_add(&next->hash_list, &h->hash_list);
806 atomic_inc(&next->refs);
807 spin_unlock(&rbio->bio_list_lock);
808 spin_unlock_irqrestore(&h->lock, flags);
809
810 if (next->operation == BTRFS_RBIO_READ_REBUILD)
811 async_read_rebuild(next);
812 else if (next->operation == BTRFS_RBIO_WRITE) {
813 steal_rbio(rbio, next);
814 async_rmw_stripe(next);
815 } else if (next->operation == BTRFS_RBIO_PARITY_SCRUB) {
816 steal_rbio(rbio, next);
817 async_scrub_parity(next);
818 }
819
820 goto done_nolock;
821 } else if (waitqueue_active(&h->wait)) {
822 spin_unlock(&rbio->bio_list_lock);
823 spin_unlock_irqrestore(&h->lock, flags);
824 wake_up(&h->wait);
825 goto done_nolock;
826 }
827 }
828 done:
829 spin_unlock(&rbio->bio_list_lock);
830 spin_unlock_irqrestore(&h->lock, flags);
831
832 done_nolock:
833 if (!keep_cache)
834 remove_rbio_from_cache(rbio);
835 }
836
/* kfree the bbio, but only when @need is non-zero */
static inline void
__free_bbio(struct btrfs_bio *bbio, int need)
{
	if (!need)
		return;

	kfree(bbio);
}
843
844 static inline void free_bbio(struct btrfs_raid_bio *rbio)
845 {
846 __free_bbio(rbio->bbio,
847 !test_bit(RBIO_HOLD_BBIO_MAP_BIT, &rbio->flags));
848 }
849
850 static void __free_raid_bio(struct btrfs_raid_bio *rbio)
851 {
852 int i;
853
854 WARN_ON(atomic_read(&rbio->refs) < 0);
855 if (!atomic_dec_and_test(&rbio->refs))
856 return;
857
858 WARN_ON(!list_empty(&rbio->stripe_cache));
859 WARN_ON(!list_empty(&rbio->hash_list));
860 WARN_ON(!bio_list_empty(&rbio->bio_list));
861
862 for (i = 0; i < rbio->nr_pages; i++) {
863 if (rbio->stripe_pages[i]) {
864 __free_page(rbio->stripe_pages[i]);
865 rbio->stripe_pages[i] = NULL;
866 }
867 }
868
869 free_bbio(rbio);
870
871 kfree(rbio);
872 }
873
/* give up the stripe lock, then drop our reference on the rbio */
static void free_raid_bio(struct btrfs_raid_bio *rbio)
{
	unlock_stripe(rbio);

	__free_raid_bio(rbio);
}
879
880 /*
881 * this frees the rbio and runs through all the bios in the
882 * bio_list and calls end_io on them
883 */
884 static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, int err, int uptodate)
885 {
886 struct bio *cur = bio_list_get(&rbio->bio_list);
887 struct bio *next;
888
889 if (rbio->generic_bio_cnt)
890 btrfs_bio_counter_sub(rbio->fs_info, rbio->generic_bio_cnt);
891
892 free_raid_bio(rbio);
893
894 while (cur) {
895 next = cur->bi_next;
896 cur->bi_next = NULL;
897 if (uptodate)
898 set_bit(BIO_UPTODATE, &cur->bi_flags);
899 bio_endio(cur, err);
900 cur = next;
901 }
902 }
903
904 /*
905 * end io function used by finish_rmw. When we finally
906 * get here, we've written a full stripe
907 */
908 static void raid_write_end_io(struct bio *bio, int err)
909 {
910 struct btrfs_raid_bio *rbio = bio->bi_private;
911
912 if (err)
913 fail_bio_stripe(rbio, bio);
914
915 bio_put(bio);
916
917 if (!atomic_dec_and_test(&rbio->stripes_pending))
918 return;
919
920 err = 0;
921
922 /* OK, we have read all the stripes we need to. */
923 if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
924 err = -EIO;
925
926 rbio_orig_end_io(rbio, err, 0);
927 return;
928 }
929
930 /*
931 * the read/modify/write code wants to use the original bio for
932 * any pages it included, and then use the rbio for everything
933 * else. This function decides if a given index (stripe number)
934 * and page number in that stripe fall inside the original bio
935 * or the rbio.
936 *
937 * if you set bio_list_only, you'll get a NULL back for any ranges
938 * that are outside the bio_list
939 *
940 * This doesn't take any refs on anything, you get a bare page pointer
941 * and the caller must bump refs as required.
942 *
943 * You must call index_rbio_pages once before you can trust
944 * the answers from this function.
945 */
946 static struct page *page_in_rbio(struct btrfs_raid_bio *rbio,
947 int index, int pagenr, int bio_list_only)
948 {
949 int chunk_page;
950 struct page *p = NULL;
951
952 chunk_page = index * (rbio->stripe_len >> PAGE_SHIFT) + pagenr;
953
954 spin_lock_irq(&rbio->bio_list_lock);
955 p = rbio->bio_pages[chunk_page];
956 spin_unlock_irq(&rbio->bio_list_lock);
957
958 if (p || bio_list_only)
959 return p;
960
961 return rbio->stripe_pages[chunk_page];
962 }
963
964 /*
965 * number of pages we need for the entire stripe across all the
966 * drives
967 */
968 static unsigned long rbio_nr_pages(unsigned long stripe_len, int nr_stripes)
969 {
970 unsigned long nr = stripe_len * nr_stripes;
971 return DIV_ROUND_UP(nr, PAGE_CACHE_SIZE);
972 }
973
974 /*
975 * allocation and initial setup for the btrfs_raid_bio. Not
976 * this does not allocate any pages for rbio->pages.
977 */
978 static struct btrfs_raid_bio *alloc_rbio(struct btrfs_root *root,
979 struct btrfs_bio *bbio, u64 stripe_len)
980 {
981 struct btrfs_raid_bio *rbio;
982 int nr_data = 0;
983 int real_stripes = bbio->num_stripes - bbio->num_tgtdevs;
984 int num_pages = rbio_nr_pages(stripe_len, real_stripes);
985 int stripe_npages = DIV_ROUND_UP(stripe_len, PAGE_SIZE);
986 void *p;
987
988 rbio = kzalloc(sizeof(*rbio) + num_pages * sizeof(struct page *) * 2 +
989 DIV_ROUND_UP(stripe_npages, BITS_PER_LONG / 8),
990 GFP_NOFS);
991 if (!rbio)
992 return ERR_PTR(-ENOMEM);
993
994 bio_list_init(&rbio->bio_list);
995 INIT_LIST_HEAD(&rbio->plug_list);
996 spin_lock_init(&rbio->bio_list_lock);
997 INIT_LIST_HEAD(&rbio->stripe_cache);
998 INIT_LIST_HEAD(&rbio->hash_list);
999 rbio->bbio = bbio;
1000 rbio->fs_info = root->fs_info;
1001 rbio->stripe_len = stripe_len;
1002 rbio->nr_pages = num_pages;
1003 rbio->real_stripes = real_stripes;
1004 rbio->stripe_npages = stripe_npages;
1005 rbio->faila = -1;
1006 rbio->failb = -1;
1007 atomic_set(&rbio->refs, 1);
1008 atomic_set(&rbio->error, 0);
1009 atomic_set(&rbio->stripes_pending, 0);
1010
1011 /*
1012 * the stripe_pages and bio_pages array point to the extra
1013 * memory we allocated past the end of the rbio
1014 */
1015 p = rbio + 1;
1016 rbio->stripe_pages = p;
1017 rbio->bio_pages = p + sizeof(struct page *) * num_pages;
1018 rbio->dbitmap = p + sizeof(struct page *) * num_pages * 2;
1019
1020 if (bbio->raid_map[real_stripes - 1] == RAID6_Q_STRIPE)
1021 nr_data = real_stripes - 2;
1022 else
1023 nr_data = real_stripes - 1;
1024
1025 rbio->nr_data = nr_data;
1026 return rbio;
1027 }
1028
1029 /* allocate pages for all the stripes in the bio, including parity */
1030 static int alloc_rbio_pages(struct btrfs_raid_bio *rbio)
1031 {
1032 int i;
1033 struct page *page;
1034
1035 for (i = 0; i < rbio->nr_pages; i++) {
1036 if (rbio->stripe_pages[i])
1037 continue;
1038 page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
1039 if (!page)
1040 return -ENOMEM;
1041 rbio->stripe_pages[i] = page;
1042 ClearPageUptodate(page);
1043 }
1044 return 0;
1045 }
1046
1047 /* allocate pages for just the p/q stripes */
1048 static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio)
1049 {
1050 int i;
1051 struct page *page;
1052
1053 i = (rbio->nr_data * rbio->stripe_len) >> PAGE_CACHE_SHIFT;
1054
1055 for (; i < rbio->nr_pages; i++) {
1056 if (rbio->stripe_pages[i])
1057 continue;
1058 page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
1059 if (!page)
1060 return -ENOMEM;
1061 rbio->stripe_pages[i] = page;
1062 }
1063 return 0;
1064 }
1065
1066 /*
1067 * add a single page from a specific stripe into our list of bios for IO
1068 * this will try to merge into existing bios if possible, and returns
1069 * zero if all went well.
1070 */
1071 static int rbio_add_io_page(struct btrfs_raid_bio *rbio,
1072 struct bio_list *bio_list,
1073 struct page *page,
1074 int stripe_nr,
1075 unsigned long page_index,
1076 unsigned long bio_max_len)
1077 {
1078 struct bio *last = bio_list->tail;
1079 u64 last_end = 0;
1080 int ret;
1081 struct bio *bio;
1082 struct btrfs_bio_stripe *stripe;
1083 u64 disk_start;
1084
1085 stripe = &rbio->bbio->stripes[stripe_nr];
1086 disk_start = stripe->physical + (page_index << PAGE_CACHE_SHIFT);
1087
1088 /* if the device is missing, just fail this stripe */
1089 if (!stripe->dev->bdev)
1090 return fail_rbio_index(rbio, stripe_nr);
1091
1092 /* see if we can add this page onto our existing bio */
1093 if (last) {
1094 last_end = (u64)last->bi_iter.bi_sector << 9;
1095 last_end += last->bi_iter.bi_size;
1096
1097 /*
1098 * we can't merge these if they are from different
1099 * devices or if they are not contiguous
1100 */
1101 if (last_end == disk_start && stripe->dev->bdev &&
1102 test_bit(BIO_UPTODATE, &last->bi_flags) &&
1103 last->bi_bdev == stripe->dev->bdev) {
1104 ret = bio_add_page(last, page, PAGE_CACHE_SIZE, 0);
1105 if (ret == PAGE_CACHE_SIZE)
1106 return 0;
1107 }
1108 }
1109
1110 /* put a new bio on the list */
1111 bio = btrfs_io_bio_alloc(GFP_NOFS, bio_max_len >> PAGE_SHIFT?:1);
1112 if (!bio)
1113 return -ENOMEM;
1114
1115 bio->bi_iter.bi_size = 0;
1116 bio->bi_bdev = stripe->dev->bdev;
1117 bio->bi_iter.bi_sector = disk_start >> 9;
1118 set_bit(BIO_UPTODATE, &bio->bi_flags);
1119
1120 bio_add_page(bio, page, PAGE_CACHE_SIZE, 0);
1121 bio_list_add(bio_list, bio);
1122 return 0;
1123 }
1124
1125 /*
1126 * while we're doing the read/modify/write cycle, we could
1127 * have errors in reading pages off the disk. This checks
1128 * for errors and if we're not able to read the page it'll
1129 * trigger parity reconstruction. The rmw will be finished
1130 * after we've reconstructed the failed stripes
1131 */
1132 static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio)
1133 {
1134 if (rbio->faila >= 0 || rbio->failb >= 0) {
1135 BUG_ON(rbio->faila == rbio->real_stripes - 1);
1136 __raid56_parity_recover(rbio);
1137 } else {
1138 finish_rmw(rbio);
1139 }
1140 }
1141
1142 /*
1143 * these are just the pages from the rbio array, not from anything
1144 * the FS sent down to us
1145 */
1146 static struct page *rbio_stripe_page(struct btrfs_raid_bio *rbio, int stripe, int page)
1147 {
1148 int index;
1149 index = stripe * (rbio->stripe_len >> PAGE_CACHE_SHIFT);
1150 index += page;
1151 return rbio->stripe_pages[index];
1152 }
1153
1154 /*
1155 * helper function to walk our bio list and populate the bio_pages array with
1156 * the result. This seems expensive, but it is faster than constantly
1157 * searching through the bio list as we setup the IO in finish_rmw or stripe
1158 * reconstruction.
1159 *
1160 * This must be called before you trust the answers from page_in_rbio
1161 */
1162 static void index_rbio_pages(struct btrfs_raid_bio *rbio)
1163 {
1164 struct bio *bio;
1165 u64 start;
1166 unsigned long stripe_offset;
1167 unsigned long page_index;
1168 struct page *p;
1169 int i;
1170
1171 spin_lock_irq(&rbio->bio_list_lock);
1172 bio_list_for_each(bio, &rbio->bio_list) {
1173 start = (u64)bio->bi_iter.bi_sector << 9;
1174 stripe_offset = start - rbio->bbio->raid_map[0];
1175 page_index = stripe_offset >> PAGE_CACHE_SHIFT;
1176
1177 for (i = 0; i < bio->bi_vcnt; i++) {
1178 p = bio->bi_io_vec[i].bv_page;
1179 rbio->bio_pages[page_index + i] = p;
1180 }
1181 }
1182 spin_unlock_irq(&rbio->bio_list_lock);
1183 }
1184
1185 /*
1186 * this is called from one of two situations. We either
1187 * have a full stripe from the higher layers, or we've read all
1188 * the missing bits off disk.
1189 *
1190 * This will calculate the parity and then send down any
1191 * changed blocks.
1192 */
1193 static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
1194 {
1195 struct btrfs_bio *bbio = rbio->bbio;
1196 void *pointers[rbio->real_stripes];
1197 int stripe_len = rbio->stripe_len;
1198 int nr_data = rbio->nr_data;
1199 int stripe;
1200 int pagenr;
1201 int p_stripe = -1;
1202 int q_stripe = -1;
1203 struct bio_list bio_list;
1204 struct bio *bio;
1205 int pages_per_stripe = stripe_len >> PAGE_CACHE_SHIFT;
1206 int ret;
1207
1208 bio_list_init(&bio_list);
1209
1210 if (rbio->real_stripes - rbio->nr_data == 1) {
1211 p_stripe = rbio->real_stripes - 1;
1212 } else if (rbio->real_stripes - rbio->nr_data == 2) {
1213 p_stripe = rbio->real_stripes - 2;
1214 q_stripe = rbio->real_stripes - 1;
1215 } else {
1216 BUG();
1217 }
1218
1219 /* at this point we either have a full stripe,
1220 * or we've read the full stripe from the drive.
1221 * recalculate the parity and write the new results.
1222 *
1223 * We're not allowed to add any new bios to the
1224 * bio list here, anyone else that wants to
1225 * change this stripe needs to do their own rmw.
1226 */
1227 spin_lock_irq(&rbio->bio_list_lock);
1228 set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
1229 spin_unlock_irq(&rbio->bio_list_lock);
1230
1231 atomic_set(&rbio->error, 0);
1232
1233 /*
1234 * now that we've set rmw_locked, run through the
1235 * bio list one last time and map the page pointers
1236 *
1237 * We don't cache full rbios because we're assuming
1238 * the higher layers are unlikely to use this area of
1239 * the disk again soon. If they do use it again,
1240 * hopefully they will send another full bio.
1241 */
1242 index_rbio_pages(rbio);
1243 if (!rbio_is_full(rbio))
1244 cache_rbio_pages(rbio);
1245 else
1246 clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
1247
1248 for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) {
1249 struct page *p;
1250 /* first collect one page from each data stripe */
1251 for (stripe = 0; stripe < nr_data; stripe++) {
1252 p = page_in_rbio(rbio, stripe, pagenr, 0);
1253 pointers[stripe] = kmap(p);
1254 }
1255
1256 /* then add the parity stripe */
1257 p = rbio_pstripe_page(rbio, pagenr);
1258 SetPageUptodate(p);
1259 pointers[stripe++] = kmap(p);
1260
1261 if (q_stripe != -1) {
1262
1263 /*
1264 * raid6, add the qstripe and call the
1265 * library function to fill in our p/q
1266 */
1267 p = rbio_qstripe_page(rbio, pagenr);
1268 SetPageUptodate(p);
1269 pointers[stripe++] = kmap(p);
1270
1271 raid6_call.gen_syndrome(rbio->real_stripes, PAGE_SIZE,
1272 pointers);
1273 } else {
1274 /* raid5 */
1275 memcpy(pointers[nr_data], pointers[0], PAGE_SIZE);
1276 run_xor(pointers + 1, nr_data - 1, PAGE_CACHE_SIZE);
1277 }
1278
1279
1280 for (stripe = 0; stripe < rbio->real_stripes; stripe++)
1281 kunmap(page_in_rbio(rbio, stripe, pagenr, 0));
1282 }
1283
1284 /*
1285 * time to start writing. Make bios for everything from the
1286 * higher layers (the bio_list in our rbio) and our p/q. Ignore
1287 * everything else.
1288 */
1289 for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
1290 for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) {
1291 struct page *page;
1292 if (stripe < rbio->nr_data) {
1293 page = page_in_rbio(rbio, stripe, pagenr, 1);
1294 if (!page)
1295 continue;
1296 } else {
1297 page = rbio_stripe_page(rbio, stripe, pagenr);
1298 }
1299
1300 ret = rbio_add_io_page(rbio, &bio_list,
1301 page, stripe, pagenr, rbio->stripe_len);
1302 if (ret)
1303 goto cleanup;
1304 }
1305 }
1306
1307 if (likely(!bbio->num_tgtdevs))
1308 goto write_data;
1309
1310 for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
1311 if (!bbio->tgtdev_map[stripe])
1312 continue;
1313
1314 for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) {
1315 struct page *page;
1316 if (stripe < rbio->nr_data) {
1317 page = page_in_rbio(rbio, stripe, pagenr, 1);
1318 if (!page)
1319 continue;
1320 } else {
1321 page = rbio_stripe_page(rbio, stripe, pagenr);
1322 }
1323
1324 ret = rbio_add_io_page(rbio, &bio_list, page,
1325 rbio->bbio->tgtdev_map[stripe],
1326 pagenr, rbio->stripe_len);
1327 if (ret)
1328 goto cleanup;
1329 }
1330 }
1331
1332 write_data:
1333 atomic_set(&rbio->stripes_pending, bio_list_size(&bio_list));
1334 BUG_ON(atomic_read(&rbio->stripes_pending) == 0);
1335
1336 while (1) {
1337 bio = bio_list_pop(&bio_list);
1338 if (!bio)
1339 break;
1340
1341 bio->bi_private = rbio;
1342 bio->bi_end_io = raid_write_end_io;
1343 BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
1344 submit_bio(WRITE, bio);
1345 }
1346 return;
1347
1348 cleanup:
1349 rbio_orig_end_io(rbio, -EIO, 0);
1350 }
1351
1352 /*
1353 * helper to find the stripe number for a given bio. Used to figure out which
1354 * stripe has failed. This expects the bio to correspond to a physical disk,
1355 * so it looks up based on physical sector numbers.
1356 */
1357 static int find_bio_stripe(struct btrfs_raid_bio *rbio,
1358 struct bio *bio)
1359 {
1360 u64 physical = bio->bi_iter.bi_sector;
1361 u64 stripe_start;
1362 int i;
1363 struct btrfs_bio_stripe *stripe;
1364
1365 physical <<= 9;
1366
1367 for (i = 0; i < rbio->bbio->num_stripes; i++) {
1368 stripe = &rbio->bbio->stripes[i];
1369 stripe_start = stripe->physical;
1370 if (physical >= stripe_start &&
1371 physical < stripe_start + rbio->stripe_len &&
1372 bio->bi_bdev == stripe->dev->bdev) {
1373 return i;
1374 }
1375 }
1376 return -1;
1377 }
1378
1379 /*
1380 * helper to find the stripe number for a given
1381 * bio (before mapping). Used to figure out which stripe has
1382 * failed. This looks up based on logical block numbers.
1383 */
1384 static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio,
1385 struct bio *bio)
1386 {
1387 u64 logical = bio->bi_iter.bi_sector;
1388 u64 stripe_start;
1389 int i;
1390
1391 logical <<= 9;
1392
1393 for (i = 0; i < rbio->nr_data; i++) {
1394 stripe_start = rbio->bbio->raid_map[i];
1395 if (logical >= stripe_start &&
1396 logical < stripe_start + rbio->stripe_len) {
1397 return i;
1398 }
1399 }
1400 return -1;
1401 }
1402
1403 /*
1404 * returns -EIO if we had too many failures
1405 */
1406 static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed)
1407 {
1408 unsigned long flags;
1409 int ret = 0;
1410
1411 spin_lock_irqsave(&rbio->bio_list_lock, flags);
1412
1413 /* we already know this stripe is bad, move on */
1414 if (rbio->faila == failed || rbio->failb == failed)
1415 goto out;
1416
1417 if (rbio->faila == -1) {
1418 /* first failure on this rbio */
1419 rbio->faila = failed;
1420 atomic_inc(&rbio->error);
1421 } else if (rbio->failb == -1) {
1422 /* second failure on this rbio */
1423 rbio->failb = failed;
1424 atomic_inc(&rbio->error);
1425 } else {
1426 ret = -EIO;
1427 }
1428 out:
1429 spin_unlock_irqrestore(&rbio->bio_list_lock, flags);
1430
1431 return ret;
1432 }
1433
1434 /*
1435 * helper to fail a stripe based on a physical disk
1436 * bio.
1437 */
1438 static int fail_bio_stripe(struct btrfs_raid_bio *rbio,
1439 struct bio *bio)
1440 {
1441 int failed = find_bio_stripe(rbio, bio);
1442
1443 if (failed < 0)
1444 return -EIO;
1445
1446 return fail_rbio_index(rbio, failed);
1447 }
1448
1449 /*
1450 * this sets each page in the bio uptodate. It should only be used on private
1451 * rbio pages, nothing that comes in from the higher layers
1452 */
1453 static void set_bio_pages_uptodate(struct bio *bio)
1454 {
1455 int i;
1456 struct page *p;
1457
1458 for (i = 0; i < bio->bi_vcnt; i++) {
1459 p = bio->bi_io_vec[i].bv_page;
1460 SetPageUptodate(p);
1461 }
1462 }
1463
1464 /*
1465 * end io for the read phase of the rmw cycle. All the bios here are physical
1466 * stripe bios we've read from the disk so we can recalculate the parity of the
1467 * stripe.
1468 *
1469 * This will usually kick off finish_rmw once all the bios are read in, but it
1470 * may trigger parity reconstruction if we had any errors along the way
1471 */
1472 static void raid_rmw_end_io(struct bio *bio, int err)
1473 {
1474 struct btrfs_raid_bio *rbio = bio->bi_private;
1475
1476 if (err)
1477 fail_bio_stripe(rbio, bio);
1478 else
1479 set_bio_pages_uptodate(bio);
1480
1481 bio_put(bio);
1482
1483 if (!atomic_dec_and_test(&rbio->stripes_pending))
1484 return;
1485
1486 err = 0;
1487 if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
1488 goto cleanup;
1489
1490 /*
1491 * this will normally call finish_rmw to start our write
1492 * but if there are any failed stripes we'll reconstruct
1493 * from parity first
1494 */
1495 validate_rbio_for_rmw(rbio);
1496 return;
1497
1498 cleanup:
1499
1500 rbio_orig_end_io(rbio, -EIO, 0);
1501 }
1502
1503 static void async_rmw_stripe(struct btrfs_raid_bio *rbio)
1504 {
1505 btrfs_init_work(&rbio->work, btrfs_rmw_helper,
1506 rmw_work, NULL, NULL);
1507
1508 btrfs_queue_work(rbio->fs_info->rmw_workers,
1509 &rbio->work);
1510 }
1511
1512 static void async_read_rebuild(struct btrfs_raid_bio *rbio)
1513 {
1514 btrfs_init_work(&rbio->work, btrfs_rmw_helper,
1515 read_rebuild_work, NULL, NULL);
1516
1517 btrfs_queue_work(rbio->fs_info->rmw_workers,
1518 &rbio->work);
1519 }
1520
1521 /*
1522 * the stripe must be locked by the caller. It will
1523 * unlock after all the writes are done
1524 */
1525 static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
1526 {
1527 int bios_to_read = 0;
1528 struct bio_list bio_list;
1529 int ret;
1530 int nr_pages = DIV_ROUND_UP(rbio->stripe_len, PAGE_CACHE_SIZE);
1531 int pagenr;
1532 int stripe;
1533 struct bio *bio;
1534
1535 bio_list_init(&bio_list);
1536
1537 ret = alloc_rbio_pages(rbio);
1538 if (ret)
1539 goto cleanup;
1540
1541 index_rbio_pages(rbio);
1542
1543 atomic_set(&rbio->error, 0);
1544 /*
1545 * build a list of bios to read all the missing parts of this
1546 * stripe
1547 */
1548 for (stripe = 0; stripe < rbio->nr_data; stripe++) {
1549 for (pagenr = 0; pagenr < nr_pages; pagenr++) {
1550 struct page *page;
1551 /*
1552 * we want to find all the pages missing from
1553 * the rbio and read them from the disk. If
1554 * page_in_rbio finds a page in the bio list
1555 * we don't need to read it off the stripe.
1556 */
1557 page = page_in_rbio(rbio, stripe, pagenr, 1);
1558 if (page)
1559 continue;
1560
1561 page = rbio_stripe_page(rbio, stripe, pagenr);
1562 /*
1563 * the bio cache may have handed us an uptodate
1564 * page. If so, be happy and use it
1565 */
1566 if (PageUptodate(page))
1567 continue;
1568
1569 ret = rbio_add_io_page(rbio, &bio_list, page,
1570 stripe, pagenr, rbio->stripe_len);
1571 if (ret)
1572 goto cleanup;
1573 }
1574 }
1575
1576 bios_to_read = bio_list_size(&bio_list);
1577 if (!bios_to_read) {
1578 /*
1579 * this can happen if others have merged with
1580 * us, it means there is nothing left to read.
1581 * But if there are missing devices it may not be
1582 * safe to do the full stripe write yet.
1583 */
1584 goto finish;
1585 }
1586
1587 /*
1588 * the bbio may be freed once we submit the last bio. Make sure
1589 * not to touch it after that
1590 */
1591 atomic_set(&rbio->stripes_pending, bios_to_read);
1592 while (1) {
1593 bio = bio_list_pop(&bio_list);
1594 if (!bio)
1595 break;
1596
1597 bio->bi_private = rbio;
1598 bio->bi_end_io = raid_rmw_end_io;
1599
1600 btrfs_bio_wq_end_io(rbio->fs_info, bio,
1601 BTRFS_WQ_ENDIO_RAID56);
1602
1603 BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
1604 submit_bio(READ, bio);
1605 }
1606 /* the actual write will happen once the reads are done */
1607 return 0;
1608
1609 cleanup:
1610 rbio_orig_end_io(rbio, -EIO, 0);
1611 return -EIO;
1612
1613 finish:
1614 validate_rbio_for_rmw(rbio);
1615 return 0;
1616 }
1617
/*
 * The upper layers handed us a full stripe: only the parity pages are
 * allocated, and the stripe goes straight to finish_rmw() when
 * lock_stripe_add() returns 0.
 *
 * Returns the allocation error (after freeing the rbio) if the parity
 * pages could not be allocated, 0 in every other case.
 */
static int full_stripe_write(struct btrfs_raid_bio *rbio)
{
	int err = alloc_rbio_parity_pages(rbio);

	if (err) {
		__free_raid_bio(rbio);
		return err;
	}

	if (lock_stripe_add(rbio) == 0)
		finish_rmw(rbio);

	return 0;
}
1637
/*
 * Partial stripe writes are handed over to the async helpers, in the
 * hope that a few more writes get merged into this rbio before the
 * new parity is calculated.  The rmw is only queued when
 * lock_stripe_add() returns 0.
 *
 * Always returns 0.
 */
static int partial_stripe_write(struct btrfs_raid_bio *rbio)
{
	if (lock_stripe_add(rbio) == 0)
		async_rmw_stripe(rbio);

	return 0;
}
1652
/*
 * Dispatch a write rbio.  Enough bios may have been merged in by now
 * to make up a full stripe, so check for that first: a full stripe
 * skips the rmw cycle, anything else takes the partial-stripe path.
 */
static int __raid56_parity_write(struct btrfs_raid_bio *rbio)
{
	if (rbio_is_full(rbio))
		return full_stripe_write(rbio);

	/* head off into rmw land if we don't have a full stripe */
	return partial_stripe_write(rbio);
}
1666
1667 /*
1668 * We use plugging call backs to collect full stripes.
1669 * Any time we get a partial stripe write while plugged
1670 * we collect it into a list. When the unplug comes down,
1671 * we sort the list by logical block number and merge
1672 * everything we can into the same rbios
1673 */
1674 struct btrfs_plug_cb {
1675 struct blk_plug_cb cb;
1676 struct btrfs_fs_info *info;
1677 struct list_head rbio_list;
1678 struct btrfs_work work;
1679 };
1680
1681 /*
1682 * rbios on the plug list are sorted for easier merging.
1683 */
1684 static int plug_cmp(void *priv, struct list_head *a, struct list_head *b)
1685 {
1686 struct btrfs_raid_bio *ra = container_of(a, struct btrfs_raid_bio,
1687 plug_list);
1688 struct btrfs_raid_bio *rb = container_of(b, struct btrfs_raid_bio,
1689 plug_list);
1690 u64 a_sector = ra->bio_list.head->bi_iter.bi_sector;
1691 u64 b_sector = rb->bio_list.head->bi_iter.bi_sector;
1692
1693 if (a_sector < b_sector)
1694 return -1;
1695 if (a_sector > b_sector)
1696 return 1;
1697 return 0;
1698 }
1699
1700 static void run_plug(struct btrfs_plug_cb *plug)
1701 {
1702 struct btrfs_raid_bio *cur;
1703 struct btrfs_raid_bio *last = NULL;
1704
1705 /*
1706 * sort our plug list then try to merge
1707 * everything we can in hopes of creating full
1708 * stripes.
1709 */
1710 list_sort(NULL, &plug->rbio_list, plug_cmp);
1711 while (!list_empty(&plug->rbio_list)) {
1712 cur = list_entry(plug->rbio_list.next,
1713 struct btrfs_raid_bio, plug_list);
1714 list_del_init(&cur->plug_list);
1715
1716 if (rbio_is_full(cur)) {
1717 /* we have a full stripe, send it down */
1718 full_stripe_write(cur);
1719 continue;
1720 }
1721 if (last) {
1722 if (rbio_can_merge(last, cur)) {
1723 merge_rbio(last, cur);
1724 __free_raid_bio(cur);
1725 continue;
1726
1727 }
1728 __raid56_parity_write(last);
1729 }
1730 last = cur;
1731 }
1732 if (last) {
1733 __raid56_parity_write(last);
1734 }
1735 kfree(plug);
1736 }
1737
1738 /*
1739 * if the unplug comes from schedule, we have to push the
1740 * work off to a helper thread
1741 */
1742 static void unplug_work(struct btrfs_work *work)
1743 {
1744 struct btrfs_plug_cb *plug;
1745 plug = container_of(work, struct btrfs_plug_cb, work);
1746 run_plug(plug);
1747 }
1748
1749 static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule)
1750 {
1751 struct btrfs_plug_cb *plug;
1752 plug = container_of(cb, struct btrfs_plug_cb, cb);
1753
1754 if (from_schedule) {
1755 btrfs_init_work(&plug->work, btrfs_rmw_helper,
1756 unplug_work, NULL, NULL);
1757 btrfs_queue_work(plug->info->rmw_workers,
1758 &plug->work);
1759 return;
1760 }
1761 run_plug(plug);
1762 }
1763
1764 /*
1765 * our main entry point for writes from the rest of the FS.
1766 */
1767 int raid56_parity_write(struct btrfs_root *root, struct bio *bio,
1768 struct btrfs_bio *bbio, u64 stripe_len)
1769 {
1770 struct btrfs_raid_bio *rbio;
1771 struct btrfs_plug_cb *plug = NULL;
1772 struct blk_plug_cb *cb;
1773 int ret;
1774
1775 rbio = alloc_rbio(root, bbio, stripe_len);
1776 if (IS_ERR(rbio)) {
1777 __free_bbio(bbio, 1);
1778 return PTR_ERR(rbio);
1779 }
1780 bio_list_add(&rbio->bio_list, bio);
1781 rbio->bio_list_bytes = bio->bi_iter.bi_size;
1782 rbio->operation = BTRFS_RBIO_WRITE;
1783
1784 btrfs_bio_counter_inc_noblocked(root->fs_info);
1785 rbio->generic_bio_cnt = 1;
1786
1787 /*
1788 * don't plug on full rbios, just get them out the door
1789 * as quickly as we can
1790 */
1791 if (rbio_is_full(rbio)) {
1792 ret = full_stripe_write(rbio);
1793 if (ret)
1794 btrfs_bio_counter_dec(root->fs_info);
1795 return ret;
1796 }
1797
1798 cb = blk_check_plugged(btrfs_raid_unplug, root->fs_info,
1799 sizeof(*plug));
1800 if (cb) {
1801 plug = container_of(cb, struct btrfs_plug_cb, cb);
1802 if (!plug->info) {
1803 plug->info = root->fs_info;
1804 INIT_LIST_HEAD(&plug->rbio_list);
1805 }
1806 list_add_tail(&rbio->plug_list, &plug->rbio_list);
1807 ret = 0;
1808 } else {
1809 ret = __raid56_parity_write(rbio);
1810 if (ret)
1811 btrfs_bio_counter_dec(root->fs_info);
1812 }
1813 return ret;
1814 }
1815
1816 /*
1817 * all parity reconstruction happens here. We've read in everything
1818 * we can find from the drives and this does the heavy lifting of
1819 * sorting the good from the bad.
1820 */
1821 static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
1822 {
1823 int pagenr, stripe;
1824 void **pointers;
1825 int faila = -1, failb = -1;
1826 int nr_pages = DIV_ROUND_UP(rbio->stripe_len, PAGE_CACHE_SIZE);
1827 struct page *page;
1828 int err;
1829 int i;
1830
1831 pointers = kzalloc(rbio->real_stripes * sizeof(void *),
1832 GFP_NOFS);
1833 if (!pointers) {
1834 err = -ENOMEM;
1835 goto cleanup_io;
1836 }
1837
1838 faila = rbio->faila;
1839 failb = rbio->failb;
1840
1841 if (rbio->operation == BTRFS_RBIO_READ_REBUILD) {
1842 spin_lock_irq(&rbio->bio_list_lock);
1843 set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
1844 spin_unlock_irq(&rbio->bio_list_lock);
1845 }
1846
1847 index_rbio_pages(rbio);
1848
1849 for (pagenr = 0; pagenr < nr_pages; pagenr++) {
1850 /*
1851 * Now we just use bitmap to mark the horizontal stripes in
1852 * which we have data when doing parity scrub.
1853 */
1854 if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB &&
1855 !test_bit(pagenr, rbio->dbitmap))
1856 continue;
1857
1858 /* setup our array of pointers with pages
1859 * from each stripe
1860 */
1861 for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
1862 /*
1863 * if we're rebuilding a read, we have to use
1864 * pages from the bio list
1865 */
1866 if (rbio->operation == BTRFS_RBIO_READ_REBUILD &&
1867 (stripe == faila || stripe == failb)) {
1868 page = page_in_rbio(rbio, stripe, pagenr, 0);
1869 } else {
1870 page = rbio_stripe_page(rbio, stripe, pagenr);
1871 }
1872 pointers[stripe] = kmap(page);
1873 }
1874
1875 /* all raid6 handling here */
1876 if (rbio->bbio->raid_map[rbio->real_stripes - 1] ==
1877 RAID6_Q_STRIPE) {
1878
1879 /*
1880 * single failure, rebuild from parity raid5
1881 * style
1882 */
1883 if (failb < 0) {
1884 if (faila == rbio->nr_data) {
1885 /*
1886 * Just the P stripe has failed, without
1887 * a bad data or Q stripe.
1888 * TODO, we should redo the xor here.
1889 */
1890 err = -EIO;
1891 goto cleanup;
1892 }
1893 /*
1894 * a single failure in raid6 is rebuilt
1895 * in the pstripe code below
1896 */
1897 goto pstripe;
1898 }
1899
1900 /* make sure our ps and qs are in order */
1901 if (faila > failb) {
1902 int tmp = failb;
1903 failb = faila;
1904 faila = tmp;
1905 }
1906
1907 /* if the q stripe is failed, do a pstripe reconstruction
1908 * from the xors.
1909 * If both the q stripe and the P stripe are failed, we're
1910 * here due to a crc mismatch and we can't give them the
1911 * data they want
1912 */
1913 if (rbio->bbio->raid_map[failb] == RAID6_Q_STRIPE) {
1914 if (rbio->bbio->raid_map[faila] ==
1915 RAID5_P_STRIPE) {
1916 err = -EIO;
1917 goto cleanup;
1918 }
1919 /*
1920 * otherwise we have one bad data stripe and
1921 * a good P stripe. raid5!
1922 */
1923 goto pstripe;
1924 }
1925
1926 if (rbio->bbio->raid_map[failb] == RAID5_P_STRIPE) {
1927 raid6_datap_recov(rbio->real_stripes,
1928 PAGE_SIZE, faila, pointers);
1929 } else {
1930 raid6_2data_recov(rbio->real_stripes,
1931 PAGE_SIZE, faila, failb,
1932 pointers);
1933 }
1934 } else {
1935 void *p;
1936
1937 /* rebuild from P stripe here (raid5 or raid6) */
1938 BUG_ON(failb != -1);
1939 pstripe:
1940 /* Copy parity block into failed block to start with */
1941 memcpy(pointers[faila],
1942 pointers[rbio->nr_data],
1943 PAGE_CACHE_SIZE);
1944
1945 /* rearrange the pointer array */
1946 p = pointers[faila];
1947 for (stripe = faila; stripe < rbio->nr_data - 1; stripe++)
1948 pointers[stripe] = pointers[stripe + 1];
1949 pointers[rbio->nr_data - 1] = p;
1950
1951 /* xor in the rest */
1952 run_xor(pointers, rbio->nr_data - 1, PAGE_CACHE_SIZE);
1953 }
1954 /* if we're doing this rebuild as part of an rmw, go through
1955 * and set all of our private rbio pages in the
1956 * failed stripes as uptodate. This way finish_rmw will
1957 * know they can be trusted. If this was a read reconstruction,
1958 * other endio functions will fiddle the uptodate bits
1959 */
1960 if (rbio->operation == BTRFS_RBIO_WRITE) {
1961 for (i = 0; i < nr_pages; i++) {
1962 if (faila != -1) {
1963 page = rbio_stripe_page(rbio, faila, i);
1964 SetPageUptodate(page);
1965 }
1966 if (failb != -1) {
1967 page = rbio_stripe_page(rbio, failb, i);
1968 SetPageUptodate(page);
1969 }
1970 }
1971 }
1972 for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
1973 /*
1974 * if we're rebuilding a read, we have to use
1975 * pages from the bio list
1976 */
1977 if (rbio->operation == BTRFS_RBIO_READ_REBUILD &&
1978 (stripe == faila || stripe == failb)) {
1979 page = page_in_rbio(rbio, stripe, pagenr, 0);
1980 } else {
1981 page = rbio_stripe_page(rbio, stripe, pagenr);
1982 }
1983 kunmap(page);
1984 }
1985 }
1986
1987 err = 0;
1988 cleanup:
1989 kfree(pointers);
1990
1991 cleanup_io:
1992 if (rbio->operation == BTRFS_RBIO_READ_REBUILD) {
1993 if (err == 0 &&
1994 !test_bit(RBIO_HOLD_BBIO_MAP_BIT, &rbio->flags))
1995 cache_rbio_pages(rbio);
1996 else
1997 clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
1998
1999 rbio_orig_end_io(rbio, err, err == 0);
2000 } else if (err == 0) {
2001 rbio->faila = -1;
2002 rbio->failb = -1;
2003
2004 if (rbio->operation == BTRFS_RBIO_WRITE)
2005 finish_rmw(rbio);
2006 else if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB)
2007 finish_parity_scrub(rbio, 0);
2008 else
2009 BUG();
2010 } else {
2011 rbio_orig_end_io(rbio, err, 0);
2012 }
2013 }
2014
2015 /*
2016 * This is called only for stripes we've read from disk to
2017 * reconstruct the parity.
2018 */
2019 static void raid_recover_end_io(struct bio *bio, int err)
2020 {
2021 struct btrfs_raid_bio *rbio = bio->bi_private;
2022
2023 /*
2024 * we only read stripe pages off the disk, set them
2025 * up to date if there were no errors
2026 */
2027 if (err)
2028 fail_bio_stripe(rbio, bio);
2029 else
2030 set_bio_pages_uptodate(bio);
2031 bio_put(bio);
2032
2033 if (!atomic_dec_and_test(&rbio->stripes_pending))
2034 return;
2035
2036 if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
2037 rbio_orig_end_io(rbio, -EIO, 0);
2038 else
2039 __raid_recover_end_io(rbio);
2040 }
2041
2042 /*
2043 * reads everything we need off the disk to reconstruct
2044 * the parity. endio handlers trigger final reconstruction
2045 * when the IO is done.
2046 *
2047 * This is used both for reads from the higher layers and for
2048 * parity construction required to finish a rmw cycle.
2049 */
2050 static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
2051 {
2052 int bios_to_read = 0;
2053 struct bio_list bio_list;
2054 int ret;
2055 int nr_pages = DIV_ROUND_UP(rbio->stripe_len, PAGE_CACHE_SIZE);
2056 int pagenr;
2057 int stripe;
2058 struct bio *bio;
2059
2060 bio_list_init(&bio_list);
2061
2062 ret = alloc_rbio_pages(rbio);
2063 if (ret)
2064 goto cleanup;
2065
2066 atomic_set(&rbio->error, 0);
2067
2068 /*
2069 * read everything that hasn't failed. Thanks to the
2070 * stripe cache, it is possible that some or all of these
2071 * pages are going to be uptodate.
2072 */
2073 for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
2074 if (rbio->faila == stripe || rbio->failb == stripe) {
2075 atomic_inc(&rbio->error);
2076 continue;
2077 }
2078
2079 for (pagenr = 0; pagenr < nr_pages; pagenr++) {
2080 struct page *p;
2081
2082 /*
2083 * the rmw code may have already read this
2084 * page in
2085 */
2086 p = rbio_stripe_page(rbio, stripe, pagenr);
2087 if (PageUptodate(p))
2088 continue;
2089
2090 ret = rbio_add_io_page(rbio, &bio_list,
2091 rbio_stripe_page(rbio, stripe, pagenr),
2092 stripe, pagenr, rbio->stripe_len);
2093 if (ret < 0)
2094 goto cleanup;
2095 }
2096 }
2097
2098 bios_to_read = bio_list_size(&bio_list);
2099 if (!bios_to_read) {
2100 /*
2101 * we might have no bios to read just because the pages
2102 * were up to date, or we might have no bios to read because
2103 * the devices were gone.
2104 */
2105 if (atomic_read(&rbio->error) <= rbio->bbio->max_errors) {
2106 __raid_recover_end_io(rbio);
2107 goto out;
2108 } else {
2109 goto cleanup;
2110 }
2111 }
2112
2113 /*
2114 * the bbio may be freed once we submit the last bio. Make sure
2115 * not to touch it after that
2116 */
2117 atomic_set(&rbio->stripes_pending, bios_to_read);
2118 while (1) {
2119 bio = bio_list_pop(&bio_list);
2120 if (!bio)
2121 break;
2122
2123 bio->bi_private = rbio;
2124 bio->bi_end_io = raid_recover_end_io;
2125
2126 btrfs_bio_wq_end_io(rbio->fs_info, bio,
2127 BTRFS_WQ_ENDIO_RAID56);
2128
2129 BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
2130 submit_bio(READ, bio);
2131 }
2132 out:
2133 return 0;
2134
2135 cleanup:
2136 if (rbio->operation == BTRFS_RBIO_READ_REBUILD)
2137 rbio_orig_end_io(rbio, -EIO, 0);
2138 return -EIO;
2139 }
2140
2141 /*
2142 * the main entry point for reads from the higher layers. This
2143 * is really only called when the normal read path had a failure,
2144 * so we assume the bio they send down corresponds to a failed part
2145 * of the drive.
2146 */
2147 int raid56_parity_recover(struct btrfs_root *root, struct bio *bio,
2148 struct btrfs_bio *bbio, u64 stripe_len,
2149 int mirror_num, int generic_io)
2150 {
2151 struct btrfs_raid_bio *rbio;
2152 int ret;
2153
2154 rbio = alloc_rbio(root, bbio, stripe_len);
2155 if (IS_ERR(rbio)) {
2156 __free_bbio(bbio, generic_io);
2157 return PTR_ERR(rbio);
2158 }
2159
2160 rbio->operation = BTRFS_RBIO_READ_REBUILD;
2161 bio_list_add(&rbio->bio_list, bio);
2162 rbio->bio_list_bytes = bio->bi_iter.bi_size;
2163
2164 rbio->faila = find_logical_bio_stripe(rbio, bio);
2165 if (rbio->faila == -1) {
2166 BUG();
2167 __free_bbio(bbio, generic_io);
2168 kfree(rbio);
2169 return -EIO;
2170 }
2171
2172 if (generic_io) {
2173 btrfs_bio_counter_inc_noblocked(root->fs_info);
2174 rbio->generic_bio_cnt = 1;
2175 } else {
2176 set_bit(RBIO_HOLD_BBIO_MAP_BIT, &rbio->flags);
2177 }
2178
2179 /*
2180 * reconstruct from the q stripe if they are
2181 * asking for mirror 3
2182 */
2183 if (mirror_num == 3)
2184 rbio->failb = rbio->real_stripes - 2;
2185
2186 ret = lock_stripe_add(rbio);
2187
2188 /*
2189 * __raid56_parity_recover will end the bio with
2190 * any errors it hits. We don't want to return
2191 * its error value up the stack because our caller
2192 * will end up calling bio_endio with any nonzero
2193 * return
2194 */
2195 if (ret == 0)
2196 __raid56_parity_recover(rbio);
2197 /*
2198 * our rbio has been added to the list of
2199 * rbios that will be handled after the
2200 * currently lock owner is done
2201 */
2202 return 0;
2203
2204 }
2205
2206 static void rmw_work(struct btrfs_work *work)
2207 {
2208 struct btrfs_raid_bio *rbio;
2209
2210 rbio = container_of(work, struct btrfs_raid_bio, work);
2211 raid56_rmw_stripe(rbio);
2212 }
2213
2214 static void read_rebuild_work(struct btrfs_work *work)
2215 {
2216 struct btrfs_raid_bio *rbio;
2217
2218 rbio = container_of(work, struct btrfs_raid_bio, work);
2219 __raid56_parity_recover(rbio);
2220 }
2221
/*
 * The following code is used to scrub/replace the parity stripe.
 *
 * Note: we need to make sure that all the pages added to the scrub/replace
 * raid bio are correct and do not change during the scrub/replace.  That
 * is, those pages may only hold metadata or file data with a checksum.
 */
2229
2230 struct btrfs_raid_bio *
2231 raid56_parity_alloc_scrub_rbio(struct btrfs_root *root, struct bio *bio,
2232 struct btrfs_bio *bbio, u64 stripe_len,
2233 struct btrfs_device *scrub_dev,
2234 unsigned long *dbitmap, int stripe_nsectors)
2235 {
2236 struct btrfs_raid_bio *rbio;
2237 int i;
2238
2239 rbio = alloc_rbio(root, bbio, stripe_len);
2240 if (IS_ERR(rbio))
2241 return NULL;
2242 bio_list_add(&rbio->bio_list, bio);
2243 /*
2244 * This is a special bio which is used to hold the completion handler
2245 * and make the scrub rbio is similar to the other types
2246 */
2247 ASSERT(!bio->bi_iter.bi_size);
2248 rbio->operation = BTRFS_RBIO_PARITY_SCRUB;
2249
2250 for (i = 0; i < rbio->real_stripes; i++) {
2251 if (bbio->stripes[i].dev == scrub_dev) {
2252 rbio->scrubp = i;
2253 break;
2254 }
2255 }
2256
2257 /* Now we just support the sectorsize equals to page size */
2258 ASSERT(root->sectorsize == PAGE_SIZE);
2259 ASSERT(rbio->stripe_npages == stripe_nsectors);
2260 bitmap_copy(rbio->dbitmap, dbitmap, stripe_nsectors);
2261
2262 return rbio;
2263 }
2264
2265 void raid56_parity_add_scrub_pages(struct btrfs_raid_bio *rbio,
2266 struct page *page, u64 logical)
2267 {
2268 int stripe_offset;
2269 int index;
2270
2271 ASSERT(logical >= rbio->bbio->raid_map[0]);
2272 ASSERT(logical + PAGE_SIZE <= rbio->bbio->raid_map[0] +
2273 rbio->stripe_len * rbio->nr_data);
2274 stripe_offset = (int)(logical - rbio->bbio->raid_map[0]);
2275 index = stripe_offset >> PAGE_CACHE_SHIFT;
2276 rbio->bio_pages[index] = page;
2277 }
2278
2279 /*
2280 * We just scrub the parity that we have correct data on the same horizontal,
2281 * so we needn't allocate all pages for all the stripes.
2282 */
2283 static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio)
2284 {
2285 int i;
2286 int bit;
2287 int index;
2288 struct page *page;
2289
2290 for_each_set_bit(bit, rbio->dbitmap, rbio->stripe_npages) {
2291 for (i = 0; i < rbio->real_stripes; i++) {
2292 index = i * rbio->stripe_npages + bit;
2293 if (rbio->stripe_pages[index])
2294 continue;
2295
2296 page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
2297 if (!page)
2298 return -ENOMEM;
2299 rbio->stripe_pages[index] = page;
2300 ClearPageUptodate(page);
2301 }
2302 }
2303 return 0;
2304 }
2305
2306 /*
2307 * end io function used by finish_rmw. When we finally
2308 * get here, we've written a full stripe
2309 */
2310 static void raid_write_parity_end_io(struct bio *bio, int err)
2311 {
2312 struct btrfs_raid_bio *rbio = bio->bi_private;
2313
2314 if (err)
2315 fail_bio_stripe(rbio, bio);
2316
2317 bio_put(bio);
2318
2319 if (!atomic_dec_and_test(&rbio->stripes_pending))
2320 return;
2321
2322 err = 0;
2323
2324 if (atomic_read(&rbio->error))
2325 err = -EIO;
2326
2327 rbio_orig_end_io(rbio, err, 0);
2328 }
2329
2330 static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
2331 int need_check)
2332 {
2333 struct btrfs_bio *bbio = rbio->bbio;
2334 void *pointers[rbio->real_stripes];
2335 DECLARE_BITMAP(pbitmap, rbio->stripe_npages);
2336 int nr_data = rbio->nr_data;
2337 int stripe;
2338 int pagenr;
2339 int p_stripe = -1;
2340 int q_stripe = -1;
2341 struct page *p_page = NULL;
2342 struct page *q_page = NULL;
2343 struct bio_list bio_list;
2344 struct bio *bio;
2345 int is_replace = 0;
2346 int ret;
2347
2348 bio_list_init(&bio_list);
2349
2350 if (rbio->real_stripes - rbio->nr_data == 1) {
2351 p_stripe = rbio->real_stripes - 1;
2352 } else if (rbio->real_stripes - rbio->nr_data == 2) {
2353 p_stripe = rbio->real_stripes - 2;
2354 q_stripe = rbio->real_stripes - 1;
2355 } else {
2356 BUG();
2357 }
2358
2359 if (bbio->num_tgtdevs && bbio->tgtdev_map[rbio->scrubp]) {
2360 is_replace = 1;
2361 bitmap_copy(pbitmap, rbio->dbitmap, rbio->stripe_npages);
2362 }
2363
2364 /*
2365 * Because the higher layers(scrubber) are unlikely to
2366 * use this area of the disk again soon, so don't cache
2367 * it.
2368 */
2369 clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
2370
2371 if (!need_check)
2372 goto writeback;
2373
2374 p_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
2375 if (!p_page)
2376 goto cleanup;
2377 SetPageUptodate(p_page);
2378
2379 if (q_stripe != -1) {
2380 q_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
2381 if (!q_page) {
2382 __free_page(p_page);
2383 goto cleanup;
2384 }
2385 SetPageUptodate(q_page);
2386 }
2387
2388 atomic_set(&rbio->error, 0);
2389
2390 for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) {
2391 struct page *p;
2392 void *parity;
2393 /* first collect one page from each data stripe */
2394 for (stripe = 0; stripe < nr_data; stripe++) {
2395 p = page_in_rbio(rbio, stripe, pagenr, 0);
2396 pointers[stripe] = kmap(p);
2397 }
2398
2399 /* then add the parity stripe */
2400 pointers[stripe++] = kmap(p_page);
2401
2402 if (q_stripe != -1) {
2403
2404 /*
2405 * raid6, add the qstripe and call the
2406 * library function to fill in our p/q
2407 */
2408 pointers[stripe++] = kmap(q_page);
2409
2410 raid6_call.gen_syndrome(rbio->real_stripes, PAGE_SIZE,
2411 pointers);
2412 } else {
2413 /* raid5 */
2414 memcpy(pointers[nr_data], pointers[0], PAGE_SIZE);
2415 run_xor(pointers + 1, nr_data - 1, PAGE_CACHE_SIZE);
2416 }
2417
2418 /* Check scrubbing pairty and repair it */
2419 p = rbio_stripe_page(rbio, rbio->scrubp, pagenr);
2420 parity = kmap(p);
2421 if (memcmp(parity, pointers[rbio->scrubp], PAGE_CACHE_SIZE))
2422 memcpy(parity, pointers[rbio->scrubp], PAGE_CACHE_SIZE);
2423 else
2424 /* Parity is right, needn't writeback */
2425 bitmap_clear(rbio->dbitmap, pagenr, 1);
2426 kunmap(p);
2427
2428 for (stripe = 0; stripe < rbio->real_stripes; stripe++)
2429 kunmap(page_in_rbio(rbio, stripe, pagenr, 0));
2430 }
2431
2432 __free_page(p_page);
2433 if (q_page)
2434 __free_page(q_page);
2435
2436 writeback:
2437 /*
2438 * time to start writing. Make bios for everything from the
2439 * higher layers (the bio_list in our rbio) and our p/q. Ignore
2440 * everything else.
2441 */
2442 for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) {
2443 struct page *page;
2444
2445 page = rbio_stripe_page(rbio, rbio->scrubp, pagenr);
2446 ret = rbio_add_io_page(rbio, &bio_list,
2447 page, rbio->scrubp, pagenr, rbio->stripe_len);
2448 if (ret)
2449 goto cleanup;
2450 }
2451
2452 if (!is_replace)
2453 goto submit_write;
2454
2455 for_each_set_bit(pagenr, pbitmap, rbio->stripe_npages) {
2456 struct page *page;
2457
2458 page = rbio_stripe_page(rbio, rbio->scrubp, pagenr);
2459 ret = rbio_add_io_page(rbio, &bio_list, page,
2460 bbio->tgtdev_map[rbio->scrubp],
2461 pagenr, rbio->stripe_len);
2462 if (ret)
2463 goto cleanup;
2464 }
2465
2466 submit_write:
2467 nr_data = bio_list_size(&bio_list);
2468 if (!nr_data) {
2469 /* Every parity is right */
2470 rbio_orig_end_io(rbio, 0, 0);
2471 return;
2472 }
2473
2474 atomic_set(&rbio->stripes_pending, nr_data);
2475
2476 while (1) {
2477 bio = bio_list_pop(&bio_list);
2478 if (!bio)
2479 break;
2480
2481 bio->bi_private = rbio;
2482 bio->bi_end_io = raid_write_parity_end_io;
2483 BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
2484 submit_bio(WRITE, bio);
2485 }
2486 return;
2487
2488 cleanup:
2489 rbio_orig_end_io(rbio, -EIO, 0);
2490 }
2491
2492 static inline int is_data_stripe(struct btrfs_raid_bio *rbio, int stripe)
2493 {
2494 if (stripe >= 0 && stripe < rbio->nr_data)
2495 return 1;
2496 return 0;
2497 }
2498
2499 /*
2500 * While we're doing the parity check and repair, we could have errors
2501 * in reading pages off the disk. This checks for errors and if we're
2502 * not able to read the page it'll trigger parity reconstruction. The
2503 * parity scrub will be finished after we've reconstructed the failed
2504 * stripes
2505 */
2506 static void validate_rbio_for_parity_scrub(struct btrfs_raid_bio *rbio)
2507 {
2508 if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
2509 goto cleanup;
2510
2511 if (rbio->faila >= 0 || rbio->failb >= 0) {
2512 int dfail = 0, failp = -1;
2513
2514 if (is_data_stripe(rbio, rbio->faila))
2515 dfail++;
2516 else if (is_parity_stripe(rbio->faila))
2517 failp = rbio->faila;
2518
2519 if (is_data_stripe(rbio, rbio->failb))
2520 dfail++;
2521 else if (is_parity_stripe(rbio->failb))
2522 failp = rbio->failb;
2523
2524 /*
2525 * Because we can not use a scrubbing parity to repair
2526 * the data, so the capability of the repair is declined.
2527 * (In the case of RAID5, we can not repair anything)
2528 */
2529 if (dfail > rbio->bbio->max_errors - 1)
2530 goto cleanup;
2531
2532 /*
2533 * If all data is good, only parity is correctly, just
2534 * repair the parity.
2535 */
2536 if (dfail == 0) {
2537 finish_parity_scrub(rbio, 0);
2538 return;
2539 }
2540
2541 /*
2542 * Here means we got one corrupted data stripe and one
2543 * corrupted parity on RAID6, if the corrupted parity
2544 * is scrubbing parity, luckly, use the other one to repair
2545 * the data, or we can not repair the data stripe.
2546 */
2547 if (failp != rbio->scrubp)
2548 goto cleanup;
2549
2550 __raid_recover_end_io(rbio);
2551 } else {
2552 finish_parity_scrub(rbio, 1);
2553 }
2554 return;
2555
2556 cleanup:
2557 rbio_orig_end_io(rbio, -EIO, 0);
2558 }
2559
2560 /*
2561 * end io for the read phase of the rmw cycle. All the bios here are physical
2562 * stripe bios we've read from the disk so we can recalculate the parity of the
2563 * stripe.
2564 *
2565 * This will usually kick off finish_rmw once all the bios are read in, but it
2566 * may trigger parity reconstruction if we had any errors along the way
2567 */
2568 static void raid56_parity_scrub_end_io(struct bio *bio, int err)
2569 {
2570 struct btrfs_raid_bio *rbio = bio->bi_private;
2571
2572 if (err)
2573 fail_bio_stripe(rbio, bio);
2574 else
2575 set_bio_pages_uptodate(bio);
2576
2577 bio_put(bio);
2578
2579 if (!atomic_dec_and_test(&rbio->stripes_pending))
2580 return;
2581
2582 /*
2583 * this will normally call finish_rmw to start our write
2584 * but if there are any failed stripes we'll reconstruct
2585 * from parity first
2586 */
2587 validate_rbio_for_parity_scrub(rbio);
2588 }
2589
2590 static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio)
2591 {
2592 int bios_to_read = 0;
2593 struct bio_list bio_list;
2594 int ret;
2595 int pagenr;
2596 int stripe;
2597 struct bio *bio;
2598
2599 ret = alloc_rbio_essential_pages(rbio);
2600 if (ret)
2601 goto cleanup;
2602
2603 bio_list_init(&bio_list);
2604
2605 atomic_set(&rbio->error, 0);
2606 /*
2607 * build a list of bios to read all the missing parts of this
2608 * stripe
2609 */
2610 for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
2611 for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) {
2612 struct page *page;
2613 /*
2614 * we want to find all the pages missing from
2615 * the rbio and read them from the disk. If
2616 * page_in_rbio finds a page in the bio list
2617 * we don't need to read it off the stripe.
2618 */
2619 page = page_in_rbio(rbio, stripe, pagenr, 1);
2620 if (page)
2621 continue;
2622
2623 page = rbio_stripe_page(rbio, stripe, pagenr);
2624 /*
2625 * the bio cache may have handed us an uptodate
2626 * page. If so, be happy and use it
2627 */
2628 if (PageUptodate(page))
2629 continue;
2630
2631 ret = rbio_add_io_page(rbio, &bio_list, page,
2632 stripe, pagenr, rbio->stripe_len);
2633 if (ret)
2634 goto cleanup;
2635 }
2636 }
2637
2638 bios_to_read = bio_list_size(&bio_list);
2639 if (!bios_to_read) {
2640 /*
2641 * this can happen if others have merged with
2642 * us, it means there is nothing left to read.
2643 * But if there are missing devices it may not be
2644 * safe to do the full stripe write yet.
2645 */
2646 goto finish;
2647 }
2648
2649 /*
2650 * the bbio may be freed once we submit the last bio. Make sure
2651 * not to touch it after that
2652 */
2653 atomic_set(&rbio->stripes_pending, bios_to_read);
2654 while (1) {
2655 bio = bio_list_pop(&bio_list);
2656 if (!bio)
2657 break;
2658
2659 bio->bi_private = rbio;
2660 bio->bi_end_io = raid56_parity_scrub_end_io;
2661
2662 btrfs_bio_wq_end_io(rbio->fs_info, bio,
2663 BTRFS_WQ_ENDIO_RAID56);
2664
2665 BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
2666 submit_bio(READ, bio);
2667 }
2668 /* the actual write will happen once the reads are done */
2669 return;
2670
2671 cleanup:
2672 rbio_orig_end_io(rbio, -EIO, 0);
2673 return;
2674
2675 finish:
2676 validate_rbio_for_parity_scrub(rbio);
2677 }
2678
2679 static void scrub_parity_work(struct btrfs_work *work)
2680 {
2681 struct btrfs_raid_bio *rbio;
2682
2683 rbio = container_of(work, struct btrfs_raid_bio, work);
2684 raid56_parity_scrub_stripe(rbio);
2685 }
2686
2687 static void async_scrub_parity(struct btrfs_raid_bio *rbio)
2688 {
2689 btrfs_init_work(&rbio->work, btrfs_rmw_helper,
2690 scrub_parity_work, NULL, NULL);
2691
2692 btrfs_queue_work(rbio->fs_info->rmw_workers,
2693 &rbio->work);
2694 }
2695
/*
 * Submit a scrub rbio.  The scrub is only queued here when
 * lock_stripe_add() returns 0; on a non-zero return nothing more is
 * done by this function.
 */
void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio)
{
	if (lock_stripe_add(rbio))
		return;

	async_scrub_parity(rbio);
}