git.proxmox.com Git - mirror_ubuntu-artful-kernel.git - blame view of fs/btrfs/raid56.c
All lines below are attributed to commit 53b381b3, "Btrfs: RAID5 and RAID6".
1/*
2 * Copyright (C) 2012 Fusion-io All rights reserved.
3 * Copyright (C) 2012 Intel Corp. All rights reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public
7 * License v2 as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public
15 * License along with this program; if not, write to the
16 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
17 * Boston, MA 021110-1307, USA.
18 */
19#include <linux/sched.h>
20#include <linux/wait.h>
21#include <linux/bio.h>
22#include <linux/slab.h>
23#include <linux/buffer_head.h>
24#include <linux/blkdev.h>
25#include <linux/random.h>
26#include <linux/iocontext.h>
27#include <linux/capability.h>
28#include <linux/ratelimit.h>
29#include <linux/kthread.h>
30#include <linux/raid/pq.h>
31#include <linux/hash.h>
32#include <linux/list_sort.h>
33#include <linux/raid/xor.h>
34#include <asm/div64.h>
35#include "compat.h"
36#include "ctree.h"
37#include "extent_map.h"
38#include "disk-io.h"
39#include "transaction.h"
40#include "print-tree.h"
41#include "volumes.h"
42#include "raid56.h"
43#include "async-thread.h"
44#include "check-integrity.h"
45#include "rcu-string.h"
46
47/* set when additional merges to this rbio are not allowed */
48#define RBIO_RMW_LOCKED_BIT 1
49
50struct btrfs_raid_bio {
51 struct btrfs_fs_info *fs_info;
52 struct btrfs_bio *bbio;
53
54 /*
 55 * logical block numbers for the start of each stripe.
56 * The last one or two are p/q. These are sorted,
57 * so raid_map[0] is the start of our full stripe
58 */
59 u64 *raid_map;
60
61 /* while we're doing rmw on a stripe
62 * we put it into a hash table so we can
63 * lock the stripe and merge more rbios
64 * into it.
65 */
66 struct list_head hash_list;
67
68 /*
69 * for scheduling work in the helper threads
70 */
71 struct btrfs_work work;
72
73 /*
74 * bio list and bio_list_lock are used
75 * to add more bios into the stripe
76 * in hopes of avoiding the full rmw
77 */
78 struct bio_list bio_list;
79 spinlock_t bio_list_lock;
80
81 /*
82 * also protected by the bio_list_lock, the
83 * stripe locking code uses plug_list to hand off
84 * the stripe lock to the next pending IO
85 */
86 struct list_head plug_list;
87
88 /*
89 * flags that tell us if it is safe to
90 * merge with this bio
91 */
92 unsigned long flags;
93
94 /* size of each individual stripe on disk */
95 int stripe_len;
96
97 /* number of data stripes (no p/q) */
98 int nr_data;
99
100 /*
101 * set if we're doing a parity rebuild
102 * for a read from higher up, which is handled
103 * differently from a parity rebuild as part of
104 * rmw
105 */
106 int read_rebuild;
107
108 /* first bad stripe */
109 int faila;
110
111 /* second bad stripe (for raid6 use) */
112 int failb;
113
114 /*
115 * number of pages needed to represent the full
116 * stripe
117 */
118 int nr_pages;
119
120 /*
121 * size of all the bios in the bio_list. This
122 * helps us decide if the rbio maps to a full
123 * stripe or not
124 */
125 int bio_list_bytes;
126
127 atomic_t refs;
128
129 /*
130 * these are two arrays of pointers. We allocate the
 131 * rbio big enough to hold them both and set up their
132 * locations when the rbio is allocated
133 */
134
135 /* pointers to pages that we allocated for
136 * reading/writing stripes directly from the disk (including P/Q)
137 */
138 struct page **stripe_pages;
139
140 /*
141 * pointers to the pages in the bio_list. Stored
142 * here for faster lookup
143 */
144 struct page **bio_pages;
145};
146
147static int __raid56_parity_recover(struct btrfs_raid_bio *rbio);
148static noinline void finish_rmw(struct btrfs_raid_bio *rbio);
149static void rmw_work(struct btrfs_work *work);
150static void read_rebuild_work(struct btrfs_work *work);
151static void async_rmw_stripe(struct btrfs_raid_bio *rbio);
152static void async_read_rebuild(struct btrfs_raid_bio *rbio);
153static int fail_bio_stripe(struct btrfs_raid_bio *rbio, struct bio *bio);
154static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed);
155static void __free_raid_bio(struct btrfs_raid_bio *rbio);
156static void index_rbio_pages(struct btrfs_raid_bio *rbio);
157static int alloc_rbio_pages(struct btrfs_raid_bio *rbio);
158
159/*
160 * the stripe hash table is used for locking, and to collect
161 * bios in hopes of making a full stripe
162 */
163int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info)
164{
165 struct btrfs_stripe_hash_table *table;
166 struct btrfs_stripe_hash_table *x;
167 struct btrfs_stripe_hash *cur;
168 struct btrfs_stripe_hash *h;
169 int num_entries = 1 << BTRFS_STRIPE_HASH_TABLE_BITS;
170 int i;
171
172 if (info->stripe_hash_table)
173 return 0;
174
175 table = kzalloc(sizeof(*table) + sizeof(*h) * num_entries, GFP_NOFS);
176 if (!table)
177 return -ENOMEM;
178
179 table->table = (void *)(table + 1);
180 h = table->table;
181
182 for (i = 0; i < num_entries; i++) {
183 cur = h + i;
184 INIT_LIST_HEAD(&cur->hash_list);
185 spin_lock_init(&cur->lock);
186 init_waitqueue_head(&cur->wait);
187 }
188
189 x = cmpxchg(&info->stripe_hash_table, NULL, table);
 190 if (x)
 191 kfree(table); /* we lost the race; free our unused copy */
192 return 0;
193}
194
195/*
196 * we hash on the first logical address of the stripe
197 */
198static int rbio_bucket(struct btrfs_raid_bio *rbio)
199{
200 u64 num = rbio->raid_map[0];
201
202 /*
203 * we shift down quite a bit. We're using byte
204 * addressing, and most of the lower bits are zeros.
205 * This tends to upset hash_64, and it consistently
206 * returns just one or two different values.
207 *
208 * shifting off the lower bits fixes things.
209 */
210 return hash_64(num >> 16, BTRFS_STRIPE_HASH_TABLE_BITS);
211}
212
213/*
214 * merging means we take the bio_list from the victim and
215 * splice it into the destination. The victim should
216 * be discarded afterwards.
217 *
 218 * must be called with dest->bio_list_lock held
219 */
220static void merge_rbio(struct btrfs_raid_bio *dest,
221 struct btrfs_raid_bio *victim)
222{
223 bio_list_merge(&dest->bio_list, &victim->bio_list);
224 dest->bio_list_bytes += victim->bio_list_bytes;
225 bio_list_init(&victim->bio_list);
226}
227
228/*
 229 * free the hash table; called during unmount
230 */
231void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info)
232{
233 if (!info->stripe_hash_table)
234 return;
235 kfree(info->stripe_hash_table);
236 info->stripe_hash_table = NULL;
237}
238
239/*
240 * helper function to run the xor_blocks api. It is only
241 * able to do MAX_XOR_BLOCKS at a time, so we need to
242 * loop through.
243 */
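/*
 * Net effect: pages[src_cnt] ^= pages[0] ^ ... ^ pages[src_cnt - 1],
 * applied bytewise over 'len' bytes; the loop only exists because
 * xor_blocks() limits how many sources it takes per call.
 */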
244static void run_xor(void **pages, int src_cnt, ssize_t len)
245{
246 int src_off = 0;
247 int xor_src_cnt = 0;
248 void *dest = pages[src_cnt];
249
 250 while (src_cnt > 0) {
251 xor_src_cnt = min(src_cnt, MAX_XOR_BLOCKS);
252 xor_blocks(xor_src_cnt, len, dest, pages + src_off);
253
254 src_cnt -= xor_src_cnt;
255 src_off += xor_src_cnt;
256 }
257}
258
259/*
260 * returns true if the bio list inside this rbio
261 * covers an entire stripe (no rmw required).
262 * Must be called with the bio list lock held, or
263 * at a time when you know it is impossible to add
264 * new bios into the list
265 */
266static int __rbio_is_full(struct btrfs_raid_bio *rbio)
267{
268 unsigned long size = rbio->bio_list_bytes;
269 int ret = 1;
270
271 if (size != rbio->nr_data * rbio->stripe_len)
272 ret = 0;
273
274 BUG_ON(size > rbio->nr_data * rbio->stripe_len);
275 return ret;
276}
277
278static int rbio_is_full(struct btrfs_raid_bio *rbio)
279{
280 unsigned long flags;
281 int ret;
282
283 spin_lock_irqsave(&rbio->bio_list_lock, flags);
284 ret = __rbio_is_full(rbio);
285 spin_unlock_irqrestore(&rbio->bio_list_lock, flags);
286 return ret;
287}
288
289/*
290 * returns 1 if it is safe to merge two rbios together.
291 * The merging is safe if the two rbios correspond to
292 * the same stripe and if they are both going in the same
293 * direction (read vs write), and if neither one is
294 * locked for final IO
295 *
296 * The caller is responsible for locking such that
297 * rmw_locked is safe to test
298 */
299static int rbio_can_merge(struct btrfs_raid_bio *last,
300 struct btrfs_raid_bio *cur)
301{
302 if (test_bit(RBIO_RMW_LOCKED_BIT, &last->flags) ||
303 test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags))
304 return 0;
305
306 if (last->raid_map[0] !=
307 cur->raid_map[0])
308 return 0;
309
310 /* reads can't merge with writes */
311 if (last->read_rebuild !=
312 cur->read_rebuild) {
313 return 0;
314 }
315
316 return 1;
317}
318
319/*
320 * helper to index into the pstripe
321 */
322static struct page *rbio_pstripe_page(struct btrfs_raid_bio *rbio, int index)
323{
324 index += (rbio->nr_data * rbio->stripe_len) >> PAGE_CACHE_SHIFT;
325 return rbio->stripe_pages[index];
326}
327
328/*
329 * helper to index into the qstripe, returns null
330 * if there is no qstripe
331 */
332static struct page *rbio_qstripe_page(struct btrfs_raid_bio *rbio, int index)
333{
334 if (rbio->nr_data + 1 == rbio->bbio->num_stripes)
335 return NULL;
336
337 index += ((rbio->nr_data + 1) * rbio->stripe_len) >>
338 PAGE_CACHE_SHIFT;
339 return rbio->stripe_pages[index];
340}
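/*
 * Page layout example (illustrative numbers): with a 64K stripe_len,
 * 4K pages and nr_data == 2, stripe_pages[0..15] hold data stripe 0,
 * [16..31] data stripe 1, [32..47] the P stripe and, on raid6,
 * [48..63] the Q stripe.
 */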
341
342/*
343 * The first stripe in the table for a logical address
344 * has the lock. rbios are added in one of three ways:
345 *
346 * 1) Nobody has the stripe locked yet. The rbio is given
347 * the lock and 0 is returned. The caller must start the IO
348 * themselves.
349 *
350 * 2) Someone has the stripe locked, but we're able to merge
351 * with the lock owner. The rbio is freed and the IO will
352 * start automatically along with the existing rbio. 1 is returned.
353 *
354 * 3) Someone has the stripe locked, but we're not able to merge.
355 * The rbio is added to the lock owner's plug list, or merged into
356 * an rbio already on the plug list. When the lock owner unlocks,
357 * the next rbio on the list is run and the IO is started automatically.
358 * 1 is returned
359 *
360 * If we return 0, the caller still owns the rbio and must continue with
361 * IO submission. If we return 1, the caller must assume the rbio has
362 * already been freed.
363 */
364static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio)
365{
366 int bucket = rbio_bucket(rbio);
367 struct btrfs_stripe_hash *h = rbio->fs_info->stripe_hash_table->table + bucket;
368 struct btrfs_raid_bio *cur;
369 struct btrfs_raid_bio *pending;
370 unsigned long flags;
371 DEFINE_WAIT(wait);
372 struct btrfs_raid_bio *freeit = NULL;
373 int ret = 0;
374 int walk = 0;
375
376 spin_lock_irqsave(&h->lock, flags);
377 list_for_each_entry(cur, &h->hash_list, hash_list) {
378 walk++;
379 if (cur->raid_map[0] == rbio->raid_map[0]) {
380 spin_lock(&cur->bio_list_lock);
381
382 /* can we merge into the lock owner? */
383 if (rbio_can_merge(cur, rbio)) {
384 merge_rbio(cur, rbio);
385 spin_unlock(&cur->bio_list_lock);
386 freeit = rbio;
387 ret = 1;
388 goto out;
389 }
390
391 /*
392 * we couldn't merge with the running
393 * rbio, see if we can merge with the
394 * pending ones. We don't have to
395 * check for rmw_locked because there
396 * is no way they are inside finish_rmw
397 * right now
398 */
399 list_for_each_entry(pending, &cur->plug_list,
400 plug_list) {
401 if (rbio_can_merge(pending, rbio)) {
402 merge_rbio(pending, rbio);
403 spin_unlock(&cur->bio_list_lock);
404 freeit = rbio;
405 ret = 1;
406 goto out;
407 }
408 }
409
410 /* no merging, put us on the tail of the plug list,
 411 * our rbio will be started when the currently
412 * running rbio unlocks
413 */
414 list_add_tail(&rbio->plug_list, &cur->plug_list);
415 spin_unlock(&cur->bio_list_lock);
416 ret = 1;
417 goto out;
418 }
419 }
420
421 atomic_inc(&rbio->refs);
422 list_add(&rbio->hash_list, &h->hash_list);
423out:
424 spin_unlock_irqrestore(&h->lock, flags);
425 if (freeit)
426 __free_raid_bio(freeit);
427 return ret;
428}
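/*
 * Typical caller pattern (see full_stripe_write() below):
 *
 *	if (lock_stripe_add(rbio) == 0)
 *		finish_rmw(rbio);
 *
 * A return of 0 means we still own the rbio and must start the IO
 * ourselves; a return of 1 means it was merged or queued and must not
 * be touched again.
 */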
429
430/*
 431 * called once an rmw or parity rebuild has completed. If the plug list has more
432 * rbios waiting for this stripe, the next one on the list will be started
433 */
434static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
435{
436 int bucket;
437 struct btrfs_stripe_hash *h;
438 unsigned long flags;
439
440 bucket = rbio_bucket(rbio);
441 h = rbio->fs_info->stripe_hash_table->table + bucket;
442
443 spin_lock_irqsave(&h->lock, flags);
444 spin_lock(&rbio->bio_list_lock);
445
446 if (!list_empty(&rbio->hash_list)) {
447
448 list_del_init(&rbio->hash_list);
449 atomic_dec(&rbio->refs);
450
451 /*
452 * we use the plug list to hold all the rbios
453 * waiting for the chance to lock this stripe.
454 * hand the lock over to one of them.
455 */
456 if (!list_empty(&rbio->plug_list)) {
457 struct btrfs_raid_bio *next;
458 struct list_head *head = rbio->plug_list.next;
459
460 next = list_entry(head, struct btrfs_raid_bio,
461 plug_list);
462
463 list_del_init(&rbio->plug_list);
464
465 list_add(&next->hash_list, &h->hash_list);
466 atomic_inc(&next->refs);
467 spin_unlock(&rbio->bio_list_lock);
468 spin_unlock_irqrestore(&h->lock, flags);
469
470 if (next->read_rebuild)
471 async_read_rebuild(next);
472 else
473 async_rmw_stripe(next);
474
475 goto done_nolock;
476
477 } else if (waitqueue_active(&h->wait)) {
478 spin_unlock(&rbio->bio_list_lock);
479 spin_unlock_irqrestore(&h->lock, flags);
480 wake_up(&h->wait);
481 goto done_nolock;
482 }
483 }
484 spin_unlock(&rbio->bio_list_lock);
485 spin_unlock_irqrestore(&h->lock, flags);
486
487done_nolock:
488 return;
489}
490
491static void __free_raid_bio(struct btrfs_raid_bio *rbio)
492{
493 int i;
494
495 WARN_ON(atomic_read(&rbio->refs) < 0);
496 if (!atomic_dec_and_test(&rbio->refs))
497 return;
498
499 WARN_ON(!list_empty(&rbio->hash_list));
500 WARN_ON(!bio_list_empty(&rbio->bio_list));
501
502 for (i = 0; i < rbio->nr_pages; i++) {
503 if (rbio->stripe_pages[i]) {
504 __free_page(rbio->stripe_pages[i]);
505 rbio->stripe_pages[i] = NULL;
506 }
507 }
508 kfree(rbio->raid_map);
509 kfree(rbio->bbio);
510 kfree(rbio);
511}
512
513static void free_raid_bio(struct btrfs_raid_bio *rbio)
514{
515 unlock_stripe(rbio);
516 __free_raid_bio(rbio);
517}
518
519/*
520 * this frees the rbio and runs through all the bios in the
521 * bio_list and calls end_io on them
522 */
523static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, int err, int uptodate)
524{
525 struct bio *cur = bio_list_get(&rbio->bio_list);
526 struct bio *next;
527 free_raid_bio(rbio);
528
529 while (cur) {
530 next = cur->bi_next;
531 cur->bi_next = NULL;
532 if (uptodate)
533 set_bit(BIO_UPTODATE, &cur->bi_flags);
534 bio_endio(cur, err);
535 cur = next;
536 }
537}
538
539/*
540 * end io function used by finish_rmw. When we finally
541 * get here, we've written a full stripe
542 */
543static void raid_write_end_io(struct bio *bio, int err)
544{
545 struct btrfs_raid_bio *rbio = bio->bi_private;
546
547 if (err)
548 fail_bio_stripe(rbio, bio);
549
550 bio_put(bio);
551
552 if (!atomic_dec_and_test(&rbio->bbio->stripes_pending))
553 return;
554
555 err = 0;
556
 557 /* OK, we have written all the stripes we need to. */
558 if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors)
559 err = -EIO;
560
561 rbio_orig_end_io(rbio, err, 0);
562 return;
563}
564
565/*
566 * the read/modify/write code wants to use the original bio for
567 * any pages it included, and then use the rbio for everything
568 * else. This function decides if a given index (stripe number)
569 * and page number in that stripe fall inside the original bio
570 * or the rbio.
571 *
572 * if you set bio_list_only, you'll get a NULL back for any ranges
573 * that are outside the bio_list
574 *
575 * This doesn't take any refs on anything, you get a bare page pointer
576 * and the caller must bump refs as required.
577 *
578 * You must call index_rbio_pages once before you can trust
579 * the answers from this function.
580 */
581static struct page *page_in_rbio(struct btrfs_raid_bio *rbio,
582 int index, int pagenr, int bio_list_only)
583{
584 int chunk_page;
585 struct page *p = NULL;
586
587 chunk_page = index * (rbio->stripe_len >> PAGE_SHIFT) + pagenr;
588
589 spin_lock_irq(&rbio->bio_list_lock);
590 p = rbio->bio_pages[chunk_page];
591 spin_unlock_irq(&rbio->bio_list_lock);
592
593 if (p || bio_list_only)
594 return p;
595
596 return rbio->stripe_pages[chunk_page];
597}
598
599/*
600 * number of pages we need for the entire stripe across all the
601 * drives
602 */
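/*
 * e.g. rbio_nr_pages(64K, 4) == 64 with 4K pages; the round-up only
 * matters when stripe_len is not a multiple of the page size.
 */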
603static unsigned long rbio_nr_pages(unsigned long stripe_len, int nr_stripes)
604{
605 unsigned long nr = stripe_len * nr_stripes;
606 return (nr + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
607}
608
609/*
 610 * allocation and initial setup for the btrfs_raid_bio.  Note that
 611 * this does not allocate any pages for rbio->stripe_pages.
612 */
613static struct btrfs_raid_bio *alloc_rbio(struct btrfs_root *root,
614 struct btrfs_bio *bbio, u64 *raid_map,
615 u64 stripe_len)
616{
617 struct btrfs_raid_bio *rbio;
618 int nr_data = 0;
619 int num_pages = rbio_nr_pages(stripe_len, bbio->num_stripes);
620 void *p;
621
622 rbio = kzalloc(sizeof(*rbio) + num_pages * sizeof(struct page *) * 2,
623 GFP_NOFS);
624 if (!rbio) {
625 kfree(raid_map);
626 kfree(bbio);
627 return ERR_PTR(-ENOMEM);
628 }
629
630 bio_list_init(&rbio->bio_list);
631 INIT_LIST_HEAD(&rbio->plug_list);
632 spin_lock_init(&rbio->bio_list_lock);
633 INIT_LIST_HEAD(&rbio->hash_list);
634 rbio->bbio = bbio;
635 rbio->raid_map = raid_map;
636 rbio->fs_info = root->fs_info;
637 rbio->stripe_len = stripe_len;
638 rbio->nr_pages = num_pages;
639 rbio->faila = -1;
640 rbio->failb = -1;
641 atomic_set(&rbio->refs, 1);
642
643 /*
644 * the stripe_pages and bio_pages array point to the extra
645 * memory we allocated past the end of the rbio
646 */
647 p = rbio + 1;
648 rbio->stripe_pages = p;
649 rbio->bio_pages = p + sizeof(struct page *) * num_pages;
650
651 if (raid_map[bbio->num_stripes - 1] == RAID6_Q_STRIPE)
652 nr_data = bbio->num_stripes - 2;
653 else
654 nr_data = bbio->num_stripes - 1;
655
656 rbio->nr_data = nr_data;
657 return rbio;
658}
659
660/* allocate pages for all the stripes in the bio, including parity */
661static int alloc_rbio_pages(struct btrfs_raid_bio *rbio)
662{
663 int i;
664 struct page *page;
665
666 for (i = 0; i < rbio->nr_pages; i++) {
667 if (rbio->stripe_pages[i])
668 continue;
669 page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
670 if (!page)
671 return -ENOMEM;
672 rbio->stripe_pages[i] = page;
673 ClearPageUptodate(page);
674 }
675 return 0;
676}
677
678/* allocate pages for just the p/q stripes */
679static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio)
680{
681 int i;
682 struct page *page;
683
684 i = (rbio->nr_data * rbio->stripe_len) >> PAGE_CACHE_SHIFT;
685
686 for (; i < rbio->nr_pages; i++) {
687 if (rbio->stripe_pages[i])
688 continue;
689 page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
690 if (!page)
691 return -ENOMEM;
692 rbio->stripe_pages[i] = page;
693 }
694 return 0;
695}
696
697/*
 698 * add a single page from a specific stripe into our list of bios for IO.
 699 * This will try to merge into existing bios if possible, and returns
700 * zero if all went well.
701 */
702int rbio_add_io_page(struct btrfs_raid_bio *rbio,
703 struct bio_list *bio_list,
704 struct page *page,
705 int stripe_nr,
706 unsigned long page_index,
707 unsigned long bio_max_len)
708{
709 struct bio *last = bio_list->tail;
710 u64 last_end = 0;
711 int ret;
712 struct bio *bio;
713 struct btrfs_bio_stripe *stripe;
714 u64 disk_start;
715
716 stripe = &rbio->bbio->stripes[stripe_nr];
717 disk_start = stripe->physical + (page_index << PAGE_CACHE_SHIFT);
718
719 /* if the device is missing, just fail this stripe */
720 if (!stripe->dev->bdev)
721 return fail_rbio_index(rbio, stripe_nr);
722
723 /* see if we can add this page onto our existing bio */
724 if (last) {
725 last_end = (u64)last->bi_sector << 9;
726 last_end += last->bi_size;
727
728 /*
729 * we can't merge these if they are from different
730 * devices or if they are not contiguous
731 */
732 if (last_end == disk_start && stripe->dev->bdev &&
733 test_bit(BIO_UPTODATE, &last->bi_flags) &&
734 last->bi_bdev == stripe->dev->bdev) {
735 ret = bio_add_page(last, page, PAGE_CACHE_SIZE, 0);
736 if (ret == PAGE_CACHE_SIZE)
737 return 0;
738 }
739 }
740
741 /* put a new bio on the list */
 742 bio = bio_alloc(GFP_NOFS, bio_max_len >> PAGE_SHIFT ?: 1);
743 if (!bio)
744 return -ENOMEM;
745
746 bio->bi_size = 0;
747 bio->bi_bdev = stripe->dev->bdev;
748 bio->bi_sector = disk_start >> 9;
749 set_bit(BIO_UPTODATE, &bio->bi_flags);
750
751 bio_add_page(bio, page, PAGE_CACHE_SIZE, 0);
752 bio_list_add(bio_list, bio);
753 return 0;
754}
755
756/*
757 * while we're doing the read/modify/write cycle, we could
758 * have errors in reading pages off the disk. This checks
759 * for errors and if we're not able to read the page it'll
760 * trigger parity reconstruction. The rmw will be finished
761 * after we've reconstructed the failed stripes
762 */
763static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio)
764{
765 if (rbio->faila >= 0 || rbio->failb >= 0) {
766 BUG_ON(rbio->faila == rbio->bbio->num_stripes - 1);
767 __raid56_parity_recover(rbio);
768 } else {
769 finish_rmw(rbio);
770 }
771}
772
773/*
774 * these are just the pages from the rbio array, not from anything
775 * the FS sent down to us
776 */
777static struct page *rbio_stripe_page(struct btrfs_raid_bio *rbio, int stripe, int page)
778{
779 int index;
780 index = stripe * (rbio->stripe_len >> PAGE_CACHE_SHIFT);
781 index += page;
782 return rbio->stripe_pages[index];
783}
784
785/*
786 * helper function to walk our bio list and populate the bio_pages array with
787 * the result. This seems expensive, but it is faster than constantly
 788 * searching through the bio list as we set up the IO in finish_rmw or stripe
789 * reconstruction.
790 *
791 * This must be called before you trust the answers from page_in_rbio
792 */
793static void index_rbio_pages(struct btrfs_raid_bio *rbio)
794{
795 struct bio *bio;
796 u64 start;
797 unsigned long stripe_offset;
798 unsigned long page_index;
799 struct page *p;
800 int i;
801
802 spin_lock_irq(&rbio->bio_list_lock);
803 bio_list_for_each(bio, &rbio->bio_list) {
804 start = (u64)bio->bi_sector << 9;
805 stripe_offset = start - rbio->raid_map[0];
806 page_index = stripe_offset >> PAGE_CACHE_SHIFT;
807
808 for (i = 0; i < bio->bi_vcnt; i++) {
809 p = bio->bi_io_vec[i].bv_page;
810 rbio->bio_pages[page_index + i] = p;
811 }
812 }
813 spin_unlock_irq(&rbio->bio_list_lock);
814}
815
816/*
 817 * this is called in one of two situations. We either
818 * have a full stripe from the higher layers, or we've read all
819 * the missing bits off disk.
820 *
821 * This will calculate the parity and then send down any
822 * changed blocks.
823 */
824static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
825{
826 struct btrfs_bio *bbio = rbio->bbio;
827 void *pointers[bbio->num_stripes];
828 int stripe_len = rbio->stripe_len;
829 int nr_data = rbio->nr_data;
830 int stripe;
831 int pagenr;
832 int p_stripe = -1;
833 int q_stripe = -1;
834 struct bio_list bio_list;
835 struct bio *bio;
836 int pages_per_stripe = stripe_len >> PAGE_CACHE_SHIFT;
837 int ret;
838
839 bio_list_init(&bio_list);
840
841 if (bbio->num_stripes - rbio->nr_data == 1) {
842 p_stripe = bbio->num_stripes - 1;
843 } else if (bbio->num_stripes - rbio->nr_data == 2) {
844 p_stripe = bbio->num_stripes - 2;
845 q_stripe = bbio->num_stripes - 1;
846 } else {
847 BUG();
848 }
849
850 /* at this point we either have a full stripe,
851 * or we've read the full stripe from the drive.
852 * recalculate the parity and write the new results.
853 *
854 * We're not allowed to add any new bios to the
855 * bio list here, anyone else that wants to
856 * change this stripe needs to do their own rmw.
857 */
858 spin_lock_irq(&rbio->bio_list_lock);
859 set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
860 spin_unlock_irq(&rbio->bio_list_lock);
861
862 atomic_set(&rbio->bbio->error, 0);
863
864 /*
865 * now that we've set rmw_locked, run through the
866 * bio list one last time and map the page pointers
867 */
868 index_rbio_pages(rbio);
869
870 for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) {
871 struct page *p;
872 /* first collect one page from each data stripe */
873 for (stripe = 0; stripe < nr_data; stripe++) {
874 p = page_in_rbio(rbio, stripe, pagenr, 0);
875 pointers[stripe] = kmap(p);
876 }
877
878 /* then add the parity stripe */
879 p = rbio_pstripe_page(rbio, pagenr);
880 SetPageUptodate(p);
881 pointers[stripe++] = kmap(p);
882
883 if (q_stripe != -1) {
884
885 /*
886 * raid6, add the qstripe and call the
887 * library function to fill in our p/q
888 */
889 p = rbio_qstripe_page(rbio, pagenr);
890 SetPageUptodate(p);
891 pointers[stripe++] = kmap(p);
892
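	/*
	 * gen_syndrome() expects pointers[] to be laid out as
	 * D0..D(nr_data - 1), P, Q (num_stripes entries) and
	 * recomputes both the P and Q pages from the data.
	 */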
893 raid6_call.gen_syndrome(bbio->num_stripes, PAGE_SIZE,
894 pointers);
895 } else {
896 /* raid5 */
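	/*
	 * P is the xor of all the data pages: seed the parity
	 * page with a copy of D0, then pass pointers + 1 so
	 * run_xor() sees pointers[nr_data] (the parity page) as
	 * its destination and xors in D1..D(nr_data - 1).
	 */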
897 memcpy(pointers[nr_data], pointers[0], PAGE_SIZE);
898 run_xor(pointers + 1, nr_data - 1, PAGE_CACHE_SIZE);
899 }
900
901
902 for (stripe = 0; stripe < bbio->num_stripes; stripe++)
903 kunmap(page_in_rbio(rbio, stripe, pagenr, 0));
904 }
905
906 /*
907 * time to start writing. Make bios for everything from the
908 * higher layers (the bio_list in our rbio) and our p/q. Ignore
909 * everything else.
910 */
911 for (stripe = 0; stripe < bbio->num_stripes; stripe++) {
912 for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) {
913 struct page *page;
914 if (stripe < rbio->nr_data) {
915 page = page_in_rbio(rbio, stripe, pagenr, 1);
916 if (!page)
917 continue;
918 } else {
919 page = rbio_stripe_page(rbio, stripe, pagenr);
920 }
921
922 ret = rbio_add_io_page(rbio, &bio_list,
923 page, stripe, pagenr, rbio->stripe_len);
924 if (ret)
925 goto cleanup;
926 }
927 }
928
929 atomic_set(&bbio->stripes_pending, bio_list_size(&bio_list));
930 BUG_ON(atomic_read(&bbio->stripes_pending) == 0);
931
932 while (1) {
933 bio = bio_list_pop(&bio_list);
934 if (!bio)
935 break;
936
937 bio->bi_private = rbio;
938 bio->bi_end_io = raid_write_end_io;
939 BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
940 submit_bio(WRITE, bio);
941 }
942 return;
943
944cleanup:
945 rbio_orig_end_io(rbio, -EIO, 0);
946}
947
948/*
949 * helper to find the stripe number for a given bio. Used to figure out which
950 * stripe has failed. This expects the bio to correspond to a physical disk,
951 * so it looks up based on physical sector numbers.
952 */
953static int find_bio_stripe(struct btrfs_raid_bio *rbio,
954 struct bio *bio)
955{
956 u64 physical = bio->bi_sector;
957 u64 stripe_start;
958 int i;
959 struct btrfs_bio_stripe *stripe;
960
961 physical <<= 9;
962
963 for (i = 0; i < rbio->bbio->num_stripes; i++) {
964 stripe = &rbio->bbio->stripes[i];
965 stripe_start = stripe->physical;
966 if (physical >= stripe_start &&
967 physical < stripe_start + rbio->stripe_len) {
968 return i;
969 }
970 }
971 return -1;
972}
973
974/*
975 * helper to find the stripe number for a given
976 * bio (before mapping). Used to figure out which stripe has
977 * failed. This looks up based on logical block numbers.
978 */
979static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio,
980 struct bio *bio)
981{
982 u64 logical = bio->bi_sector;
983 u64 stripe_start;
984 int i;
985
986 logical <<= 9;
987
988 for (i = 0; i < rbio->nr_data; i++) {
989 stripe_start = rbio->raid_map[i];
990 if (logical >= stripe_start &&
991 logical < stripe_start + rbio->stripe_len) {
992 return i;
993 }
994 }
995 return -1;
996}
997
998/*
999 * returns -EIO if we had too many failures
1000 */
1001static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed)
1002{
1003 unsigned long flags;
1004 int ret = 0;
1005
1006 spin_lock_irqsave(&rbio->bio_list_lock, flags);
1007
1008 /* we already know this stripe is bad, move on */
1009 if (rbio->faila == failed || rbio->failb == failed)
1010 goto out;
1011
1012 if (rbio->faila == -1) {
1013 /* first failure on this rbio */
1014 rbio->faila = failed;
1015 atomic_inc(&rbio->bbio->error);
1016 } else if (rbio->failb == -1) {
1017 /* second failure on this rbio */
1018 rbio->failb = failed;
1019 atomic_inc(&rbio->bbio->error);
1020 } else {
1021 ret = -EIO;
1022 }
1023out:
1024 spin_unlock_irqrestore(&rbio->bio_list_lock, flags);
1025
1026 return ret;
1027}
1028
1029/*
1030 * helper to fail a stripe based on a physical disk
1031 * bio.
1032 */
1033static int fail_bio_stripe(struct btrfs_raid_bio *rbio,
1034 struct bio *bio)
1035{
1036 int failed = find_bio_stripe(rbio, bio);
1037
1038 if (failed < 0)
1039 return -EIO;
1040
1041 return fail_rbio_index(rbio, failed);
1042}
1043
1044/*
1045 * this sets each page in the bio uptodate. It should only be used on private
1046 * rbio pages, nothing that comes in from the higher layers
1047 */
1048static void set_bio_pages_uptodate(struct bio *bio)
1049{
1050 int i;
1051 struct page *p;
1052
1053 for (i = 0; i < bio->bi_vcnt; i++) {
1054 p = bio->bi_io_vec[i].bv_page;
1055 SetPageUptodate(p);
1056 }
1057}
1058
1059/*
1060 * end io for the read phase of the rmw cycle. All the bios here are physical
1061 * stripe bios we've read from the disk so we can recalculate the parity of the
1062 * stripe.
1063 *
1064 * This will usually kick off finish_rmw once all the bios are read in, but it
1065 * may trigger parity reconstruction if we had any errors along the way
1066 */
1067static void raid_rmw_end_io(struct bio *bio, int err)
1068{
1069 struct btrfs_raid_bio *rbio = bio->bi_private;
1070
1071 if (err)
1072 fail_bio_stripe(rbio, bio);
1073 else
1074 set_bio_pages_uptodate(bio);
1075
1076 bio_put(bio);
1077
1078 if (!atomic_dec_and_test(&rbio->bbio->stripes_pending))
1079 return;
1080
1081 err = 0;
1082 if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors)
1083 goto cleanup;
1084
1085 /*
1086 * this will normally call finish_rmw to start our write
1087 * but if there are any failed stripes we'll reconstruct
1088 * from parity first
1089 */
1090 validate_rbio_for_rmw(rbio);
1091 return;
1092
1093cleanup:
1094
1095 rbio_orig_end_io(rbio, -EIO, 0);
1096}
1097
1098static void async_rmw_stripe(struct btrfs_raid_bio *rbio)
1099{
1100 rbio->work.flags = 0;
1101 rbio->work.func = rmw_work;
1102
1103 btrfs_queue_worker(&rbio->fs_info->rmw_workers,
1104 &rbio->work);
1105}
1106
1107static void async_read_rebuild(struct btrfs_raid_bio *rbio)
1108{
1109 rbio->work.flags = 0;
1110 rbio->work.func = read_rebuild_work;
1111
1112 btrfs_queue_worker(&rbio->fs_info->rmw_workers,
1113 &rbio->work);
1114}
1115
1116/*
1117 * the stripe must be locked by the caller. It will
1118 * unlock after all the writes are done
1119 */
1120static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
1121{
1122 int bios_to_read = 0;
1123 struct btrfs_bio *bbio = rbio->bbio;
1124 struct bio_list bio_list;
1125 int ret;
1126 int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1127 int pagenr;
1128 int stripe;
1129 struct bio *bio;
1130
1131 bio_list_init(&bio_list);
1132
1133 ret = alloc_rbio_pages(rbio);
1134 if (ret)
1135 goto cleanup;
1136
1137 index_rbio_pages(rbio);
1138
1139 atomic_set(&rbio->bbio->error, 0);
1140 /*
1141 * build a list of bios to read all the missing parts of this
1142 * stripe
1143 */
1144 for (stripe = 0; stripe < rbio->nr_data; stripe++) {
1145 for (pagenr = 0; pagenr < nr_pages; pagenr++) {
1146 struct page *page;
1147 /*
1148 * we want to find all the pages missing from
1149 * the rbio and read them from the disk. If
1150 * page_in_rbio finds a page in the bio list
1151 * we don't need to read it off the stripe.
1152 */
1153 page = page_in_rbio(rbio, stripe, pagenr, 1);
1154 if (page)
1155 continue;
1156
1157 page = rbio_stripe_page(rbio, stripe, pagenr);
1158 ret = rbio_add_io_page(rbio, &bio_list, page,
1159 stripe, pagenr, rbio->stripe_len);
1160 if (ret)
1161 goto cleanup;
1162 }
1163 }
1164
1165 bios_to_read = bio_list_size(&bio_list);
1166 if (!bios_to_read) {
1167 /*
1168 * this can happen if others have merged with
 1169 * us; it means there is nothing left to read.
1170 * But if there are missing devices it may not be
1171 * safe to do the full stripe write yet.
1172 */
1173 goto finish;
1174 }
1175
1176 /*
1177 * the bbio may be freed once we submit the last bio. Make sure
1178 * not to touch it after that
1179 */
1180 atomic_set(&bbio->stripes_pending, bios_to_read);
1181 while (1) {
1182 bio = bio_list_pop(&bio_list);
1183 if (!bio)
1184 break;
1185
1186 bio->bi_private = rbio;
1187 bio->bi_end_io = raid_rmw_end_io;
1188
1189 btrfs_bio_wq_end_io(rbio->fs_info, bio,
1190 BTRFS_WQ_ENDIO_RAID56);
1191
1192 BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
1193 submit_bio(READ, bio);
1194 }
1195 /* the actual write will happen once the reads are done */
1196 return 0;
1197
1198cleanup:
1199 rbio_orig_end_io(rbio, -EIO, 0);
1200 return -EIO;
1201
1202finish:
1203 validate_rbio_for_rmw(rbio);
1204 return 0;
1205}
1206
1207/*
1208 * if the upper layers pass in a full stripe, we thank them by only allocating
1209 * enough pages to hold the parity, and sending it all down quickly.
1210 */
1211static int full_stripe_write(struct btrfs_raid_bio *rbio)
1212{
1213 int ret;
1214
1215 ret = alloc_rbio_parity_pages(rbio);
1216 if (ret)
1217 return ret;
1218
1219 ret = lock_stripe_add(rbio);
1220 if (ret == 0)
1221 finish_rmw(rbio);
1222 return 0;
1223}
1224
1225/*
1226 * partial stripe writes get handed over to async helpers.
1227 * We're really hoping to merge a few more writes into this
1228 * rbio before calculating new parity
1229 */
1230static int partial_stripe_write(struct btrfs_raid_bio *rbio)
1231{
1232 int ret;
1233
1234 ret = lock_stripe_add(rbio);
1235 if (ret == 0)
1236 async_rmw_stripe(rbio);
1237 return 0;
1238}
1239
1240/*
1241 * sometimes while we were reading from the drive to
 1242 * recalculate parity, enough new bios come in to create
1243 * a full stripe. So we do a check here to see if we can
1244 * go directly to finish_rmw
1245 */
1246static int __raid56_parity_write(struct btrfs_raid_bio *rbio)
1247{
1248 /* head off into rmw land if we don't have a full stripe */
1249 if (!rbio_is_full(rbio))
1250 return partial_stripe_write(rbio);
1251 return full_stripe_write(rbio);
1252}
1253
1254/*
1255 * our main entry point for writes from the rest of the FS.
1256 */
1257int raid56_parity_write(struct btrfs_root *root, struct bio *bio,
1258 struct btrfs_bio *bbio, u64 *raid_map,
1259 u64 stripe_len)
1260{
1261 struct btrfs_raid_bio *rbio;
1262
1263 rbio = alloc_rbio(root, bbio, raid_map, stripe_len);
1264 if (IS_ERR(rbio)) {
1265 kfree(raid_map);
1266 kfree(bbio);
1267 return PTR_ERR(rbio);
1268 }
1269 bio_list_add(&rbio->bio_list, bio);
1270 rbio->bio_list_bytes = bio->bi_size;
1271 return __raid56_parity_write(rbio);
1272}
1273
1274/*
1275 * all parity reconstruction happens here. We've read in everything
1276 * we can find from the drives and this does the heavy lifting of
1277 * sorting the good from the bad.
1278 */
1279static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
1280{
1281 int pagenr, stripe;
1282 void **pointers;
1283 int faila = -1, failb = -1;
1284 int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1285 struct page *page;
1286 int err;
1287 int i;
1288
1289 pointers = kzalloc(rbio->bbio->num_stripes * sizeof(void *),
1290 GFP_NOFS);
1291 if (!pointers) {
1292 err = -ENOMEM;
1293 goto cleanup_io;
1294 }
1295
1296 faila = rbio->faila;
1297 failb = rbio->failb;
1298
1299 if (rbio->read_rebuild) {
1300 spin_lock_irq(&rbio->bio_list_lock);
1301 set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
1302 spin_unlock_irq(&rbio->bio_list_lock);
1303 }
1304
1305 index_rbio_pages(rbio);
1306
1307 for (pagenr = 0; pagenr < nr_pages; pagenr++) {
 1308 /* set up our array of pointers with pages
1309 * from each stripe
1310 */
1311 for (stripe = 0; stripe < rbio->bbio->num_stripes; stripe++) {
1312 /*
1313 * if we're rebuilding a read, we have to use
1314 * pages from the bio list
1315 */
1316 if (rbio->read_rebuild &&
1317 (stripe == faila || stripe == failb)) {
1318 page = page_in_rbio(rbio, stripe, pagenr, 0);
1319 } else {
1320 page = rbio_stripe_page(rbio, stripe, pagenr);
1321 }
1322 pointers[stripe] = kmap(page);
1323 }
1324
1325 /* all raid6 handling here */
1326 if (rbio->raid_map[rbio->bbio->num_stripes - 1] ==
1327 RAID6_Q_STRIPE) {
1328
1329 /*
1330 * single failure, rebuild from parity raid5
1331 * style
1332 */
1333 if (failb < 0) {
1334 if (faila == rbio->nr_data) {
1335 /*
1336 * Just the P stripe has failed, without
1337 * a bad data or Q stripe.
1338 * TODO, we should redo the xor here.
1339 */
1340 err = -EIO;
1341 goto cleanup;
1342 }
1343 /*
1344 * a single failure in raid6 is rebuilt
1345 * in the pstripe code below
1346 */
1347 goto pstripe;
1348 }
1349
1350 /* make sure our ps and qs are in order */
1351 if (faila > failb) {
1352 int tmp = failb;
1353 failb = faila;
1354 faila = tmp;
1355 }
1356
1357 /* if the q stripe is failed, do a pstripe reconstruction
1358 * from the xors.
1359 * If both the q stripe and the P stripe are failed, we're
1360 * here due to a crc mismatch and we can't give them the
1361 * data they want
1362 */
1363 if (rbio->raid_map[failb] == RAID6_Q_STRIPE) {
1364 if (rbio->raid_map[faila] == RAID5_P_STRIPE) {
1365 err = -EIO;
1366 goto cleanup;
1367 }
1368 /*
1369 * otherwise we have one bad data stripe and
1370 * a good P stripe. raid5!
1371 */
1372 goto pstripe;
1373 }
1374
1375 if (rbio->raid_map[failb] == RAID5_P_STRIPE) {
1376 raid6_datap_recov(rbio->bbio->num_stripes,
1377 PAGE_SIZE, faila, pointers);
1378 } else {
1379 raid6_2data_recov(rbio->bbio->num_stripes,
1380 PAGE_SIZE, faila, failb,
1381 pointers);
1382 }
1383 } else {
1384 void *p;
1385
1386 /* rebuild from P stripe here (raid5 or raid6) */
1387 BUG_ON(failb != -1);
1388pstripe:
1389 /* Copy parity block into failed block to start with */
1390 memcpy(pointers[faila],
1391 pointers[rbio->nr_data],
1392 PAGE_CACHE_SIZE);
1393
1394 /* rearrange the pointer array */
1395 p = pointers[faila];
1396 for (stripe = faila; stripe < rbio->nr_data - 1; stripe++)
1397 pointers[stripe] = pointers[stripe + 1];
1398 pointers[rbio->nr_data - 1] = p;
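	/*
	 * pointers[0..nr_data - 2] now hold the surviving data
	 * pages and pointers[nr_data - 1] is the failed page,
	 * seeded with the parity copy above, so the xor below
	 * recovers the missing data: Dfail = P ^ (all other Ds).
	 */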
1399
1400 /* xor in the rest */
1401 run_xor(pointers, rbio->nr_data - 1, PAGE_CACHE_SIZE);
1402 }
1403 /* if we're doing this rebuild as part of an rmw, go through
1404 * and set all of our private rbio pages in the
1405 * failed stripes as uptodate. This way finish_rmw will
1406 * know they can be trusted. If this was a read reconstruction,
1407 * other endio functions will fiddle the uptodate bits
1408 */
1409 if (!rbio->read_rebuild) {
1410 for (i = 0; i < nr_pages; i++) {
1411 if (faila != -1) {
1412 page = rbio_stripe_page(rbio, faila, i);
1413 SetPageUptodate(page);
1414 }
1415 if (failb != -1) {
1416 page = rbio_stripe_page(rbio, failb, i);
1417 SetPageUptodate(page);
1418 }
1419 }
1420 }
1421 for (stripe = 0; stripe < rbio->bbio->num_stripes; stripe++) {
1422 /*
1423 * if we're rebuilding a read, we have to use
1424 * pages from the bio list
1425 */
1426 if (rbio->read_rebuild &&
1427 (stripe == faila || stripe == failb)) {
1428 page = page_in_rbio(rbio, stripe, pagenr, 0);
1429 } else {
1430 page = rbio_stripe_page(rbio, stripe, pagenr);
1431 }
1432 kunmap(page);
1433 }
1434 }
1435
1436 err = 0;
1437cleanup:
1438 kfree(pointers);
1439
1440cleanup_io:
1441
1442 if (rbio->read_rebuild) {
1443 rbio_orig_end_io(rbio, err, err == 0);
1444 } else if (err == 0) {
1445 rbio->faila = -1;
1446 rbio->failb = -1;
1447 finish_rmw(rbio);
1448 } else {
1449 rbio_orig_end_io(rbio, err, 0);
1450 }
1451}
1452
1453/*
1454 * This is called only for stripes we've read from disk to
1455 * reconstruct the parity.
1456 */
1457static void raid_recover_end_io(struct bio *bio, int err)
1458{
1459 struct btrfs_raid_bio *rbio = bio->bi_private;
1460
1461 /*
1462 * we only read stripe pages off the disk, set them
1463 * up to date if there were no errors
1464 */
1465 if (err)
1466 fail_bio_stripe(rbio, bio);
1467 else
1468 set_bio_pages_uptodate(bio);
1469 bio_put(bio);
1470
1471 if (!atomic_dec_and_test(&rbio->bbio->stripes_pending))
1472 return;
1473
1474 if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors)
1475 rbio_orig_end_io(rbio, -EIO, 0);
1476 else
1477 __raid_recover_end_io(rbio);
1478}
1479
1480/*
1481 * reads everything we need off the disk to reconstruct
1482 * the parity. endio handlers trigger final reconstruction
1483 * when the IO is done.
1484 *
1485 * This is used both for reads from the higher layers and for
 1486 * parity construction required to finish an rmw cycle.
1487 */
1488static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
1489{
1490 int bios_to_read = 0;
1491 struct btrfs_bio *bbio = rbio->bbio;
1492 struct bio_list bio_list;
1493 int ret;
1494 int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1495 int pagenr;
1496 int stripe;
1497 struct bio *bio;
1498
1499 bio_list_init(&bio_list);
1500
1501 ret = alloc_rbio_pages(rbio);
1502 if (ret)
1503 goto cleanup;
1504
1505 atomic_set(&rbio->bbio->error, 0);
1506
1507 /*
1508 * read everything that hasn't failed.
1509 */
1510 for (stripe = 0; stripe < bbio->num_stripes; stripe++) {
1511 if (rbio->faila == stripe ||
1512 rbio->failb == stripe)
1513 continue;
1514
1515 for (pagenr = 0; pagenr < nr_pages; pagenr++) {
1516 struct page *p;
1517
1518 /*
1519 * the rmw code may have already read this
1520 * page in
1521 */
1522 p = rbio_stripe_page(rbio, stripe, pagenr);
1523 if (PageUptodate(p))
1524 continue;
1525
1526 ret = rbio_add_io_page(rbio, &bio_list,
1527 rbio_stripe_page(rbio, stripe, pagenr),
1528 stripe, pagenr, rbio->stripe_len);
1529 if (ret < 0)
1530 goto cleanup;
1531 }
1532 }
1533
1534 bios_to_read = bio_list_size(&bio_list);
1535 if (!bios_to_read) {
1536 /*
1537 * we might have no bios to read just because the pages
1538 * were up to date, or we might have no bios to read because
1539 * the devices were gone.
1540 */
1541 if (atomic_read(&rbio->bbio->error) <= rbio->bbio->max_errors) {
1542 __raid_recover_end_io(rbio);
1543 goto out;
1544 } else {
1545 goto cleanup;
1546 }
1547 }
1548
1549 /*
1550 * the bbio may be freed once we submit the last bio. Make sure
1551 * not to touch it after that
1552 */
1553 atomic_set(&bbio->stripes_pending, bios_to_read);
1554 while (1) {
1555 bio = bio_list_pop(&bio_list);
1556 if (!bio)
1557 break;
1558
1559 bio->bi_private = rbio;
1560 bio->bi_end_io = raid_recover_end_io;
1561
1562 btrfs_bio_wq_end_io(rbio->fs_info, bio,
1563 BTRFS_WQ_ENDIO_RAID56);
1564
1565 BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
1566 submit_bio(READ, bio);
1567 }
1568out:
1569 return 0;
1570
1571cleanup:
1572 if (rbio->read_rebuild)
1573 rbio_orig_end_io(rbio, -EIO, 0);
1574 return -EIO;
1575}
1576
1577/*
1578 * the main entry point for reads from the higher layers. This
1579 * is really only called when the normal read path had a failure,
1580 * so we assume the bio they send down corresponds to a failed part
1581 * of the drive.
1582 */
1583int raid56_parity_recover(struct btrfs_root *root, struct bio *bio,
1584 struct btrfs_bio *bbio, u64 *raid_map,
1585 u64 stripe_len, int mirror_num)
1586{
1587 struct btrfs_raid_bio *rbio;
1588 int ret;
1589
1590 rbio = alloc_rbio(root, bbio, raid_map, stripe_len);
1591 if (IS_ERR(rbio)) {
1592 return PTR_ERR(rbio);
1593 }
1594
1595 rbio->read_rebuild = 1;
1596 bio_list_add(&rbio->bio_list, bio);
1597 rbio->bio_list_bytes = bio->bi_size;
1598
1599 rbio->faila = find_logical_bio_stripe(rbio, bio);
1600 if (rbio->faila == -1) {
1601 BUG();
1602 kfree(rbio);
1603 return -EIO;
1604 }
1605
1606 /*
1607 * reconstruct from the q stripe if they are
1608 * asking for mirror 3
1609 */
1610 if (mirror_num == 3)
1611 rbio->failb = bbio->num_stripes - 2;
1612
1613 ret = lock_stripe_add(rbio);
1614
1615 /*
1616 * __raid56_parity_recover will end the bio with
1617 * any errors it hits. We don't want to return
1618 * its error value up the stack because our caller
1619 * will end up calling bio_endio with any nonzero
1620 * return
1621 */
1622 if (ret == 0)
1623 __raid56_parity_recover(rbio);
1624 /*
1625 * our rbio has been added to the list of
1626 * rbios that will be handled after the
 1627 * current lock owner is done
1628 */
1629 return 0;
1630
1631}
1632
1633static void rmw_work(struct btrfs_work *work)
1634{
1635 struct btrfs_raid_bio *rbio;
1636
1637 rbio = container_of(work, struct btrfs_raid_bio, work);
1638 raid56_rmw_stripe(rbio);
1639}
1640
1641static void read_rebuild_work(struct btrfs_work *work)
1642{
1643 struct btrfs_raid_bio *rbio;
1644
1645 rbio = container_of(work, struct btrfs_raid_bio, work);
1646 __raid56_parity_recover(rbio);
1647}