/*
 * Copyright (C) 2012 Fusion-io  All rights reserved.
 * Copyright (C) 2012 Intel Corp. All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 021110-1307, USA.
 */
#include <linux/sched.h>
#include <linux/wait.h>
#include <linux/bio.h>
#include <linux/slab.h>
#include <linux/buffer_head.h>
#include <linux/blkdev.h>
#include <linux/random.h>
#include <linux/iocontext.h>
#include <linux/capability.h>
#include <linux/ratelimit.h>
#include <linux/kthread.h>
#include <linux/raid/pq.h>
#include <linux/hash.h>
#include <linux/list_sort.h>
#include <linux/raid/xor.h>
#include <asm/div64.h>
#include "compat.h"
#include "ctree.h"
#include "extent_map.h"
#include "disk-io.h"
#include "transaction.h"
#include "print-tree.h"
#include "volumes.h"
#include "raid56.h"
#include "async-thread.h"
#include "check-integrity.h"
#include "rcu-string.h"

/* set when additional merges to this rbio are not allowed */
#define RBIO_RMW_LOCKED_BIT	1

struct btrfs_raid_bio {
	struct btrfs_fs_info *fs_info;
	struct btrfs_bio *bbio;

	/*
	 * logical block numbers for the start of each stripe
	 * The last one or two are p/q.  These are sorted,
	 * so raid_map[0] is the start of our full stripe
	 */
	u64 *raid_map;

	/* while we're doing rmw on a stripe
	 * we put it into a hash table so we can
	 * lock the stripe and merge more rbios
	 * into it.
	 */
	struct list_head hash_list;

	/*
	 * for scheduling work in the helper threads
	 */
	struct btrfs_work work;

	/*
	 * bio list and bio_list_lock are used
	 * to add more bios into the stripe
	 * in hopes of avoiding the full rmw
	 */
	struct bio_list bio_list;
	spinlock_t bio_list_lock;

	/*
	 * also protected by the bio_list_lock, the
	 * stripe locking code uses plug_list to hand off
	 * the stripe lock to the next pending IO
	 */
	struct list_head plug_list;

	/*
	 * flags that tell us if it is safe to
	 * merge with this bio
	 */
	unsigned long flags;

	/* size of each individual stripe on disk */
	int stripe_len;

	/* number of data stripes (no p/q) */
	int nr_data;

	/*
	 * set if we're doing a parity rebuild
	 * for a read from higher up, which is handled
	 * differently from a parity rebuild as part of
	 * rmw
	 */
	int read_rebuild;

	/* first bad stripe */
	int faila;

	/* second bad stripe (for raid6 use) */
	int failb;

	/*
	 * number of pages needed to represent the full
	 * stripe
	 */
	int nr_pages;

	/*
	 * size of all the bios in the bio_list.  This
	 * helps us decide if the rbio maps to a full
	 * stripe or not
	 */
	int bio_list_bytes;

	atomic_t refs;

	/*
	 * these are two arrays of pointers.  We allocate the
	 * rbio big enough to hold them both and setup their
	 * locations when the rbio is allocated
	 */

	/* pointers to pages that we allocated for
	 * reading/writing stripes directly from the disk (including P/Q)
	 */
	struct page **stripe_pages;

	/*
	 * pointers to the pages in the bio_list.  Stored
	 * here for faster lookup
	 */
	struct page **bio_pages;
};

static int __raid56_parity_recover(struct btrfs_raid_bio *rbio);
static noinline void finish_rmw(struct btrfs_raid_bio *rbio);
static void rmw_work(struct btrfs_work *work);
static void read_rebuild_work(struct btrfs_work *work);
static void async_rmw_stripe(struct btrfs_raid_bio *rbio);
static void async_read_rebuild(struct btrfs_raid_bio *rbio);
static int fail_bio_stripe(struct btrfs_raid_bio *rbio, struct bio *bio);
static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed);
static void __free_raid_bio(struct btrfs_raid_bio *rbio);
static void index_rbio_pages(struct btrfs_raid_bio *rbio);
static int alloc_rbio_pages(struct btrfs_raid_bio *rbio);

/*
 * the stripe hash table is used for locking, and to collect
 * bios in hopes of making a full stripe
 */
int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info)
{
	struct btrfs_stripe_hash_table *table;
	struct btrfs_stripe_hash_table *x;
	struct btrfs_stripe_hash *cur;
	struct btrfs_stripe_hash *h;
	int num_entries = 1 << BTRFS_STRIPE_HASH_TABLE_BITS;
	int i;

	if (info->stripe_hash_table)
		return 0;

	table = kzalloc(sizeof(*table) + sizeof(*h) * num_entries, GFP_NOFS);
	if (!table)
		return -ENOMEM;

	table->table = (void *)(table + 1);
	h = table->table;

	for (i = 0; i < num_entries; i++) {
		cur = h + i;
		INIT_LIST_HEAD(&cur->hash_list);
		spin_lock_init(&cur->lock);
		init_waitqueue_head(&cur->wait);
	}

	x = cmpxchg(&info->stripe_hash_table, NULL, table);
	if (x)
		kfree(x);
	return 0;
}

/*
 * we hash on the first logical address of the stripe
 */
static int rbio_bucket(struct btrfs_raid_bio *rbio)
{
	u64 num = rbio->raid_map[0];

	/*
	 * we shift down quite a bit.  We're using byte
	 * addressing, and most of the lower bits are zeros.
	 * This tends to upset hash_64, and it consistently
	 * returns just one or two different values.
	 *
	 * shifting off the lower bits fixes things.
	 */
	return hash_64(num >> 16, BTRFS_STRIPE_HASH_TABLE_BITS);
}
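
/*
 * example: with 64K stripes the start of every full stripe is a multiple
 * of 64K, so the low 16 bits of raid_map[0] are always zero.  Shifting
 * them away before hashing leaves only the bits that actually vary from
 * one full stripe to the next.
 */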

/*
 * merging means we take the bio_list from the victim and
 * splice it into the destination.  The victim should
 * be discarded afterwards.
 *
 * must be called with dest->bio_list_lock held
 */
static void merge_rbio(struct btrfs_raid_bio *dest,
		       struct btrfs_raid_bio *victim)
{
	bio_list_merge(&dest->bio_list, &victim->bio_list);
	dest->bio_list_bytes += victim->bio_list_bytes;
	bio_list_init(&victim->bio_list);
}

/*
 * free the hash table; called during unmount
 */
void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info)
{
	if (!info->stripe_hash_table)
		return;
	kfree(info->stripe_hash_table);
	info->stripe_hash_table = NULL;
}

/*
 * helper function to run the xor_blocks api.  It is only
 * able to do MAX_XOR_BLOCKS at a time, so we need to
 * loop through.
 */
static void run_xor(void **pages, int src_cnt, ssize_t len)
{
	int src_off = 0;
	int xor_src_cnt = 0;
	void *dest = pages[src_cnt];

	while (src_cnt > 0) {
		xor_src_cnt = min(src_cnt, MAX_XOR_BLOCKS);
		xor_blocks(xor_src_cnt, len, dest, pages + src_off);

		src_cnt -= xor_src_cnt;
		src_off += xor_src_cnt;
	}
}
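
/*
 * for example, finish_rmw below calls run_xor(pointers + 1, nr_data - 1,
 * PAGE_CACHE_SIZE) after copying data stripe 0 into the parity page: the
 * remaining data stripes are xor'd into pointers[nr_data] to produce the
 * raid5 parity.
 */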

/*
 * returns true if the bio list inside this rbio
 * covers an entire stripe (no rmw required).
 * Must be called with the bio list lock held, or
 * at a time when you know it is impossible to add
 * new bios into the list
 */
static int __rbio_is_full(struct btrfs_raid_bio *rbio)
{
	unsigned long size = rbio->bio_list_bytes;
	int ret = 1;

	if (size != rbio->nr_data * rbio->stripe_len)
		ret = 0;

	BUG_ON(size > rbio->nr_data * rbio->stripe_len);
	return ret;
}

static int rbio_is_full(struct btrfs_raid_bio *rbio)
{
	unsigned long flags;
	int ret;

	spin_lock_irqsave(&rbio->bio_list_lock, flags);
	ret = __rbio_is_full(rbio);
	spin_unlock_irqrestore(&rbio->bio_list_lock, flags);
	return ret;
}

/*
 * returns 1 if it is safe to merge two rbios together.
 * The merging is safe if the two rbios correspond to
 * the same stripe and if they are both going in the same
 * direction (read vs write), and if neither one is
 * locked for final IO
 *
 * The caller is responsible for locking such that
 * rmw_locked is safe to test
 */
static int rbio_can_merge(struct btrfs_raid_bio *last,
			  struct btrfs_raid_bio *cur)
{
	if (test_bit(RBIO_RMW_LOCKED_BIT, &last->flags) ||
	    test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags))
		return 0;

	if (last->raid_map[0] !=
	    cur->raid_map[0])
		return 0;

	/* reads can't merge with writes */
	if (last->read_rebuild !=
	    cur->read_rebuild) {
		return 0;
	}

	return 1;
}

/*
 * helper to index into the pstripe
 */
static struct page *rbio_pstripe_page(struct btrfs_raid_bio *rbio, int index)
{
	index += (rbio->nr_data * rbio->stripe_len) >> PAGE_CACHE_SHIFT;
	return rbio->stripe_pages[index];
}

/*
 * helper to index into the qstripe, returns null
 * if there is no qstripe
 */
static struct page *rbio_qstripe_page(struct btrfs_raid_bio *rbio, int index)
{
	if (rbio->nr_data + 1 == rbio->bbio->num_stripes)
		return NULL;

	index += ((rbio->nr_data + 1) * rbio->stripe_len) >>
		PAGE_CACHE_SHIFT;
	return rbio->stripe_pages[index];
}
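
/*
 * example: with two 64K data stripes and 4K pages, the parity stripe is
 * stripe_pages[32..47] and the q stripe (raid6 only) is stripe_pages[48..63]
 */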

/*
 * The first stripe in the table for a logical address
 * has the lock.  rbios are added in one of three ways:
 *
 * 1) Nobody has the stripe locked yet.  The rbio is given
 * the lock and 0 is returned.  The caller must start the IO
 * themselves.
 *
 * 2) Someone has the stripe locked, but we're able to merge
 * with the lock owner.  The rbio is freed and the IO will
 * start automatically along with the existing rbio.  1 is returned.
 *
 * 3) Someone has the stripe locked, but we're not able to merge.
 * The rbio is added to the lock owner's plug list, or merged into
 * an rbio already on the plug list.  When the lock owner unlocks,
 * the next rbio on the list is run and the IO is started automatically.
 * 1 is returned
 *
 * If we return 0, the caller still owns the rbio and must continue with
 * IO submission.  If we return 1, the caller must assume the rbio has
 * already been freed.
 */
static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio)
{
	int bucket = rbio_bucket(rbio);
	struct btrfs_stripe_hash *h = rbio->fs_info->stripe_hash_table->table + bucket;
	struct btrfs_raid_bio *cur;
	struct btrfs_raid_bio *pending;
	unsigned long flags;
	DEFINE_WAIT(wait);
	struct btrfs_raid_bio *freeit = NULL;
	int ret = 0;
	int walk = 0;

	spin_lock_irqsave(&h->lock, flags);
	list_for_each_entry(cur, &h->hash_list, hash_list) {
		walk++;
		if (cur->raid_map[0] == rbio->raid_map[0]) {
			spin_lock(&cur->bio_list_lock);

			/* can we merge into the lock owner? */
			if (rbio_can_merge(cur, rbio)) {
				merge_rbio(cur, rbio);
				spin_unlock(&cur->bio_list_lock);
				freeit = rbio;
				ret = 1;
				goto out;
			}

			/*
			 * we couldn't merge with the running
			 * rbio, see if we can merge with the
			 * pending ones.  We don't have to
			 * check for rmw_locked because there
			 * is no way they are inside finish_rmw
			 * right now
			 */
			list_for_each_entry(pending, &cur->plug_list,
					    plug_list) {
				if (rbio_can_merge(pending, rbio)) {
					merge_rbio(pending, rbio);
					spin_unlock(&cur->bio_list_lock);
					freeit = rbio;
					ret = 1;
					goto out;
				}
			}

			/* no merging, put us on the tail of the plug list,
			 * our rbio will be started when the currently
			 * running rbio unlocks
			 */
			list_add_tail(&rbio->plug_list, &cur->plug_list);
			spin_unlock(&cur->bio_list_lock);
			ret = 1;
			goto out;
		}
	}

	atomic_inc(&rbio->refs);
	list_add(&rbio->hash_list, &h->hash_list);
out:
	spin_unlock_irqrestore(&h->lock, flags);
	if (freeit)
		__free_raid_bio(freeit);
	return ret;
}
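
/*
 * callers follow the pattern used by full_stripe_write and
 * partial_stripe_write below: when lock_stripe_add returns 0 they start
 * the IO themselves, otherwise the rbio was merged or queued and must not
 * be touched again.
 */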

/*
 * called as rmw or parity rebuild is completed.  If the plug list has more
 * rbios waiting for this stripe, the next one on the list will be started
 */
static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
{
	int bucket;
	struct btrfs_stripe_hash *h;
	unsigned long flags;

	bucket = rbio_bucket(rbio);
	h = rbio->fs_info->stripe_hash_table->table + bucket;

	spin_lock_irqsave(&h->lock, flags);
	spin_lock(&rbio->bio_list_lock);

	if (!list_empty(&rbio->hash_list)) {

		list_del_init(&rbio->hash_list);
		atomic_dec(&rbio->refs);

		/*
		 * we use the plug list to hold all the rbios
		 * waiting for the chance to lock this stripe.
		 * hand the lock over to one of them.
		 */
		if (!list_empty(&rbio->plug_list)) {
			struct btrfs_raid_bio *next;
			struct list_head *head = rbio->plug_list.next;

			next = list_entry(head, struct btrfs_raid_bio,
					  plug_list);

			list_del_init(&rbio->plug_list);

			list_add(&next->hash_list, &h->hash_list);
			atomic_inc(&next->refs);
			spin_unlock(&rbio->bio_list_lock);
			spin_unlock_irqrestore(&h->lock, flags);

			if (next->read_rebuild)
				async_read_rebuild(next);
			else
				async_rmw_stripe(next);

			goto done_nolock;

		} else if (waitqueue_active(&h->wait)) {
			spin_unlock(&rbio->bio_list_lock);
			spin_unlock_irqrestore(&h->lock, flags);
			wake_up(&h->wait);
			goto done_nolock;
		}
	}
	spin_unlock(&rbio->bio_list_lock);
	spin_unlock_irqrestore(&h->lock, flags);

done_nolock:
	return;
}

static void __free_raid_bio(struct btrfs_raid_bio *rbio)
{
	int i;

	WARN_ON(atomic_read(&rbio->refs) < 0);
	if (!atomic_dec_and_test(&rbio->refs))
		return;

	WARN_ON(!list_empty(&rbio->hash_list));
	WARN_ON(!bio_list_empty(&rbio->bio_list));

	for (i = 0; i < rbio->nr_pages; i++) {
		if (rbio->stripe_pages[i]) {
			__free_page(rbio->stripe_pages[i]);
			rbio->stripe_pages[i] = NULL;
		}
	}
	kfree(rbio->raid_map);
	kfree(rbio->bbio);
	kfree(rbio);
}

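/*
 * drop the stripe lock (starting the next pending rbio, if any) and then
 * drop our reference on the rbio
 */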
static void free_raid_bio(struct btrfs_raid_bio *rbio)
{
	unlock_stripe(rbio);
	__free_raid_bio(rbio);
}

/*
 * this frees the rbio and runs through all the bios in the
 * bio_list and calls end_io on them
 */
static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, int err, int uptodate)
{
	struct bio *cur = bio_list_get(&rbio->bio_list);
	struct bio *next;
	free_raid_bio(rbio);

	while (cur) {
		next = cur->bi_next;
		cur->bi_next = NULL;
		if (uptodate)
			set_bit(BIO_UPTODATE, &cur->bi_flags);
		bio_endio(cur, err);
		cur = next;
	}
}

/*
 * end io function used by finish_rmw.  When we finally
 * get here, we've written a full stripe
 */
static void raid_write_end_io(struct bio *bio, int err)
{
	struct btrfs_raid_bio *rbio = bio->bi_private;

	if (err)
		fail_bio_stripe(rbio, bio);

	bio_put(bio);

	if (!atomic_dec_and_test(&rbio->bbio->stripes_pending))
		return;

	err = 0;

	/* OK, we have written all the stripes we need to. */
	if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors)
		err = -EIO;

	rbio_orig_end_io(rbio, err, 0);
	return;
}

/*
 * the read/modify/write code wants to use the original bio for
 * any pages it included, and then use the rbio for everything
 * else.  This function decides if a given index (stripe number)
 * and page number in that stripe fall inside the original bio
 * or the rbio.
 *
 * if you set bio_list_only, you'll get a NULL back for any ranges
 * that are outside the bio_list
 *
 * This doesn't take any refs on anything, you get a bare page pointer
 * and the caller must bump refs as required.
 *
 * You must call index_rbio_pages once before you can trust
 * the answers from this function.
 */
static struct page *page_in_rbio(struct btrfs_raid_bio *rbio,
				 int index, int pagenr, int bio_list_only)
{
	int chunk_page;
	struct page *p = NULL;

	chunk_page = index * (rbio->stripe_len >> PAGE_SHIFT) + pagenr;

	spin_lock_irq(&rbio->bio_list_lock);
	p = rbio->bio_pages[chunk_page];
	spin_unlock_irq(&rbio->bio_list_lock);

	if (p || bio_list_only)
		return p;

	return rbio->stripe_pages[chunk_page];
}

/*
 * number of pages we need for the entire stripe across all the
 * drives
 */
static unsigned long rbio_nr_pages(unsigned long stripe_len, int nr_stripes)
{
	unsigned long nr = stripe_len * nr_stripes;
	return (nr + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
}
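
/* example: three 64K stripes (two data + parity) with 4K pages need 48 pages */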

/*
 * allocation and initial setup for the btrfs_raid_bio.  Note that this
 * does not allocate any pages for rbio->stripe_pages.
 */
static struct btrfs_raid_bio *alloc_rbio(struct btrfs_root *root,
					 struct btrfs_bio *bbio, u64 *raid_map,
					 u64 stripe_len)
{
	struct btrfs_raid_bio *rbio;
	int nr_data = 0;
	int num_pages = rbio_nr_pages(stripe_len, bbio->num_stripes);
	void *p;

	rbio = kzalloc(sizeof(*rbio) + num_pages * sizeof(struct page *) * 2,
		       GFP_NOFS);
	if (!rbio) {
		kfree(raid_map);
		kfree(bbio);
		return ERR_PTR(-ENOMEM);
	}

	bio_list_init(&rbio->bio_list);
	INIT_LIST_HEAD(&rbio->plug_list);
	spin_lock_init(&rbio->bio_list_lock);
	INIT_LIST_HEAD(&rbio->hash_list);
	rbio->bbio = bbio;
	rbio->raid_map = raid_map;
	rbio->fs_info = root->fs_info;
	rbio->stripe_len = stripe_len;
	rbio->nr_pages = num_pages;
	rbio->faila = -1;
	rbio->failb = -1;
	atomic_set(&rbio->refs, 1);

	/*
	 * the stripe_pages and bio_pages array point to the extra
	 * memory we allocated past the end of the rbio
	 */
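	/*
	 * the single allocation above is laid out as:
	 * [struct btrfs_raid_bio][num_pages stripe_pages ptrs][num_pages bio_pages ptrs]
	 */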
	p = rbio + 1;
	rbio->stripe_pages = p;
	rbio->bio_pages = p + sizeof(struct page *) * num_pages;

	if (raid_map[bbio->num_stripes - 1] == RAID6_Q_STRIPE)
		nr_data = bbio->num_stripes - 2;
	else
		nr_data = bbio->num_stripes - 1;

	rbio->nr_data = nr_data;
	return rbio;
}

/* allocate pages for all the stripes in the bio, including parity */
static int alloc_rbio_pages(struct btrfs_raid_bio *rbio)
{
	int i;
	struct page *page;

	for (i = 0; i < rbio->nr_pages; i++) {
		if (rbio->stripe_pages[i])
			continue;
		page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
		if (!page)
			return -ENOMEM;
		rbio->stripe_pages[i] = page;
		ClearPageUptodate(page);
	}
	return 0;
}

/* allocate pages for just the p/q stripes */
static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio)
{
	int i;
	struct page *page;

	i = (rbio->nr_data * rbio->stripe_len) >> PAGE_CACHE_SHIFT;

	for (; i < rbio->nr_pages; i++) {
		if (rbio->stripe_pages[i])
			continue;
		page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
		if (!page)
			return -ENOMEM;
		rbio->stripe_pages[i] = page;
	}
	return 0;
}

/*
 * adds a single page from a specific stripe into our list of bios for IO.
 * This will try to merge into existing bios if possible, and returns zero
 * if all went well.
 */
static int rbio_add_io_page(struct btrfs_raid_bio *rbio,
			    struct bio_list *bio_list,
			    struct page *page,
			    int stripe_nr,
			    unsigned long page_index,
			    unsigned long bio_max_len)
{
	struct bio *last = bio_list->tail;
	u64 last_end = 0;
	int ret;
	struct bio *bio;
	struct btrfs_bio_stripe *stripe;
	u64 disk_start;

	stripe = &rbio->bbio->stripes[stripe_nr];
	disk_start = stripe->physical + (page_index << PAGE_CACHE_SHIFT);

	/* if the device is missing, just fail this stripe */
	if (!stripe->dev->bdev)
		return fail_rbio_index(rbio, stripe_nr);

	/* see if we can add this page onto our existing bio */
	if (last) {
		last_end = (u64)last->bi_sector << 9;
		last_end += last->bi_size;

		/*
		 * we can't merge these if they are from different
		 * devices or if they are not contiguous
		 */
		if (last_end == disk_start && stripe->dev->bdev &&
		    test_bit(BIO_UPTODATE, &last->bi_flags) &&
		    last->bi_bdev == stripe->dev->bdev) {
			ret = bio_add_page(last, page, PAGE_CACHE_SIZE, 0);
			if (ret == PAGE_CACHE_SIZE)
				return 0;
		}
	}

	/* put a new bio on the list */
	bio = bio_alloc(GFP_NOFS, bio_max_len >> PAGE_SHIFT ?: 1);
	if (!bio)
		return -ENOMEM;

	bio->bi_size = 0;
	bio->bi_bdev = stripe->dev->bdev;
	bio->bi_sector = disk_start >> 9;
	set_bit(BIO_UPTODATE, &bio->bi_flags);

	bio_add_page(bio, page, PAGE_CACHE_SIZE, 0);
	bio_list_add(bio_list, bio);
	return 0;
}

/*
 * while we're doing the read/modify/write cycle, we could
 * have errors in reading pages off the disk.  This checks
 * for errors and if we're not able to read the page it'll
 * trigger parity reconstruction.  The rmw will be finished
 * after we've reconstructed the failed stripes
 */
static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio)
{
	if (rbio->faila >= 0 || rbio->failb >= 0) {
		BUG_ON(rbio->faila == rbio->bbio->num_stripes - 1);
		__raid56_parity_recover(rbio);
	} else {
		finish_rmw(rbio);
	}
}

/*
 * these are just the pages from the rbio array, not from anything
 * the FS sent down to us
 */
static struct page *rbio_stripe_page(struct btrfs_raid_bio *rbio, int stripe, int page)
{
	int index;
	index = stripe * (rbio->stripe_len >> PAGE_CACHE_SHIFT);
	index += page;
	return rbio->stripe_pages[index];
}

/*
 * helper function to walk our bio list and populate the bio_pages array with
 * the result.  This seems expensive, but it is faster than constantly
 * searching through the bio list as we setup the IO in finish_rmw or stripe
 * reconstruction.
 *
 * This must be called before you trust the answers from page_in_rbio
 */
static void index_rbio_pages(struct btrfs_raid_bio *rbio)
{
	struct bio *bio;
	u64 start;
	unsigned long stripe_offset;
	unsigned long page_index;
	struct page *p;
	int i;

	spin_lock_irq(&rbio->bio_list_lock);
	bio_list_for_each(bio, &rbio->bio_list) {
		start = (u64)bio->bi_sector << 9;
		stripe_offset = start - rbio->raid_map[0];
		page_index = stripe_offset >> PAGE_CACHE_SHIFT;

		for (i = 0; i < bio->bi_vcnt; i++) {
			p = bio->bi_io_vec[i].bv_page;
			rbio->bio_pages[page_index + i] = p;
		}
	}
	spin_unlock_irq(&rbio->bio_list_lock);
}

/*
 * this is called from one of two situations.  We either
 * have a full stripe from the higher layers, or we've read all
 * the missing bits off disk.
 *
 * This will calculate the parity and then send down any
 * changed blocks.
 */
static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
{
	struct btrfs_bio *bbio = rbio->bbio;
	void *pointers[bbio->num_stripes];
	int stripe_len = rbio->stripe_len;
	int nr_data = rbio->nr_data;
	int stripe;
	int pagenr;
	int p_stripe = -1;
	int q_stripe = -1;
	struct bio_list bio_list;
	struct bio *bio;
	int pages_per_stripe = stripe_len >> PAGE_CACHE_SHIFT;
	int ret;

	bio_list_init(&bio_list);

	if (bbio->num_stripes - rbio->nr_data == 1) {
		p_stripe = bbio->num_stripes - 1;
	} else if (bbio->num_stripes - rbio->nr_data == 2) {
		p_stripe = bbio->num_stripes - 2;
		q_stripe = bbio->num_stripes - 1;
	} else {
		BUG();
	}

	/* at this point we either have a full stripe,
	 * or we've read the full stripe from the drive.
	 * recalculate the parity and write the new results.
	 *
	 * We're not allowed to add any new bios to the
	 * bio list here, anyone else that wants to
	 * change this stripe needs to do their own rmw.
	 */
	spin_lock_irq(&rbio->bio_list_lock);
	set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
	spin_unlock_irq(&rbio->bio_list_lock);

	atomic_set(&rbio->bbio->error, 0);

	/*
	 * now that we've set rmw_locked, run through the
	 * bio list one last time and map the page pointers
	 */
	index_rbio_pages(rbio);

	for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) {
		struct page *p;
		/* first collect one page from each data stripe */
		for (stripe = 0; stripe < nr_data; stripe++) {
			p = page_in_rbio(rbio, stripe, pagenr, 0);
			pointers[stripe] = kmap(p);
		}

		/* then add the parity stripe */
		p = rbio_pstripe_page(rbio, pagenr);
		SetPageUptodate(p);
		pointers[stripe++] = kmap(p);

		if (q_stripe != -1) {

			/*
			 * raid6, add the qstripe and call the
			 * library function to fill in our p/q
			 */
			p = rbio_qstripe_page(rbio, pagenr);
			SetPageUptodate(p);
			pointers[stripe++] = kmap(p);

			raid6_call.gen_syndrome(bbio->num_stripes, PAGE_SIZE,
						pointers);
		} else {
			/* raid5 */
			memcpy(pointers[nr_data], pointers[0], PAGE_SIZE);
			run_xor(pointers + 1, nr_data - 1, PAGE_CACHE_SIZE);
		}


		for (stripe = 0; stripe < bbio->num_stripes; stripe++)
			kunmap(page_in_rbio(rbio, stripe, pagenr, 0));
	}

	/*
	 * time to start writing.  Make bios for everything from the
	 * higher layers (the bio_list in our rbio) and our p/q.  Ignore
	 * everything else.
	 */
	for (stripe = 0; stripe < bbio->num_stripes; stripe++) {
		for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) {
			struct page *page;
			if (stripe < rbio->nr_data) {
				page = page_in_rbio(rbio, stripe, pagenr, 1);
				if (!page)
					continue;
			} else {
				page = rbio_stripe_page(rbio, stripe, pagenr);
			}

			ret = rbio_add_io_page(rbio, &bio_list,
				       page, stripe, pagenr, rbio->stripe_len);
			if (ret)
				goto cleanup;
		}
	}

	atomic_set(&bbio->stripes_pending, bio_list_size(&bio_list));
	BUG_ON(atomic_read(&bbio->stripes_pending) == 0);

	while (1) {
		bio = bio_list_pop(&bio_list);
		if (!bio)
			break;

		bio->bi_private = rbio;
		bio->bi_end_io = raid_write_end_io;
		BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
		submit_bio(WRITE, bio);
	}
	return;

cleanup:
	rbio_orig_end_io(rbio, -EIO, 0);
}

/*
 * helper to find the stripe number for a given bio.  Used to figure out which
 * stripe has failed.  This expects the bio to correspond to a physical disk,
 * so it looks up based on physical sector numbers.
 */
static int find_bio_stripe(struct btrfs_raid_bio *rbio,
			   struct bio *bio)
{
	u64 physical = bio->bi_sector;
	u64 stripe_start;
	int i;
	struct btrfs_bio_stripe *stripe;

	physical <<= 9;

	for (i = 0; i < rbio->bbio->num_stripes; i++) {
		stripe = &rbio->bbio->stripes[i];
		stripe_start = stripe->physical;
		if (physical >= stripe_start &&
		    physical < stripe_start + rbio->stripe_len) {
			return i;
		}
	}
	return -1;
}

/*
 * helper to find the stripe number for a given
 * bio (before mapping).  Used to figure out which stripe has
 * failed.  This looks up based on logical block numbers.
 */
static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio,
				   struct bio *bio)
{
	u64 logical = bio->bi_sector;
	u64 stripe_start;
	int i;

	logical <<= 9;

	for (i = 0; i < rbio->nr_data; i++) {
		stripe_start = rbio->raid_map[i];
		if (logical >= stripe_start &&
		    logical < stripe_start + rbio->stripe_len) {
			return i;
		}
	}
	return -1;
}

/*
 * returns -EIO if we had too many failures
 */
static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed)
{
	unsigned long flags;
	int ret = 0;

	spin_lock_irqsave(&rbio->bio_list_lock, flags);

	/* we already know this stripe is bad, move on */
	if (rbio->faila == failed || rbio->failb == failed)
		goto out;

	if (rbio->faila == -1) {
		/* first failure on this rbio */
		rbio->faila = failed;
		atomic_inc(&rbio->bbio->error);
	} else if (rbio->failb == -1) {
		/* second failure on this rbio */
		rbio->failb = failed;
		atomic_inc(&rbio->bbio->error);
	} else {
		ret = -EIO;
	}
out:
	spin_unlock_irqrestore(&rbio->bio_list_lock, flags);

	return ret;
}

/*
 * helper to fail a stripe based on a physical disk
 * bio.
 */
static int fail_bio_stripe(struct btrfs_raid_bio *rbio,
			   struct bio *bio)
{
	int failed = find_bio_stripe(rbio, bio);

	if (failed < 0)
		return -EIO;

	return fail_rbio_index(rbio, failed);
}

/*
 * this sets each page in the bio uptodate.  It should only be used on private
 * rbio pages, nothing that comes in from the higher layers
 */
static void set_bio_pages_uptodate(struct bio *bio)
{
	int i;
	struct page *p;

	for (i = 0; i < bio->bi_vcnt; i++) {
		p = bio->bi_io_vec[i].bv_page;
		SetPageUptodate(p);
	}
}

/*
 * end io for the read phase of the rmw cycle.  All the bios here are physical
 * stripe bios we've read from the disk so we can recalculate the parity of the
 * stripe.
 *
 * This will usually kick off finish_rmw once all the bios are read in, but it
 * may trigger parity reconstruction if we had any errors along the way
 */
static void raid_rmw_end_io(struct bio *bio, int err)
{
	struct btrfs_raid_bio *rbio = bio->bi_private;

	if (err)
		fail_bio_stripe(rbio, bio);
	else
		set_bio_pages_uptodate(bio);

	bio_put(bio);

	if (!atomic_dec_and_test(&rbio->bbio->stripes_pending))
		return;

	err = 0;
	if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors)
		goto cleanup;

	/*
	 * this will normally call finish_rmw to start our write
	 * but if there are any failed stripes we'll reconstruct
	 * from parity first
	 */
	validate_rbio_for_rmw(rbio);
	return;

cleanup:

	rbio_orig_end_io(rbio, -EIO, 0);
}

static void async_rmw_stripe(struct btrfs_raid_bio *rbio)
{
	rbio->work.flags = 0;
	rbio->work.func = rmw_work;

	btrfs_queue_worker(&rbio->fs_info->rmw_workers,
			   &rbio->work);
}

static void async_read_rebuild(struct btrfs_raid_bio *rbio)
{
	rbio->work.flags = 0;
	rbio->work.func = read_rebuild_work;

	btrfs_queue_worker(&rbio->fs_info->rmw_workers,
			   &rbio->work);
}

/*
 * the stripe must be locked by the caller.  It will
 * unlock after all the writes are done
 */
static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
{
	int bios_to_read = 0;
	struct btrfs_bio *bbio = rbio->bbio;
	struct bio_list bio_list;
	int ret;
	int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
	int pagenr;
	int stripe;
	struct bio *bio;

	bio_list_init(&bio_list);

	ret = alloc_rbio_pages(rbio);
	if (ret)
		goto cleanup;

	index_rbio_pages(rbio);

	atomic_set(&rbio->bbio->error, 0);
	/*
	 * build a list of bios to read all the missing parts of this
	 * stripe
	 */
	for (stripe = 0; stripe < rbio->nr_data; stripe++) {
		for (pagenr = 0; pagenr < nr_pages; pagenr++) {
			struct page *page;
			/*
			 * we want to find all the pages missing from
			 * the rbio and read them from the disk.  If
			 * page_in_rbio finds a page in the bio list
			 * we don't need to read it off the stripe.
			 */
			page = page_in_rbio(rbio, stripe, pagenr, 1);
			if (page)
				continue;

			page = rbio_stripe_page(rbio, stripe, pagenr);
			ret = rbio_add_io_page(rbio, &bio_list, page,
				       stripe, pagenr, rbio->stripe_len);
			if (ret)
				goto cleanup;
		}
	}

	bios_to_read = bio_list_size(&bio_list);
	if (!bios_to_read) {
		/*
		 * this can happen if others have merged with
		 * us, it means there is nothing left to read.
		 * But if there are missing devices it may not be
		 * safe to do the full stripe write yet.
		 */
		goto finish;
	}

	/*
	 * the bbio may be freed once we submit the last bio.  Make sure
	 * not to touch it after that
	 */
	atomic_set(&bbio->stripes_pending, bios_to_read);
	while (1) {
		bio = bio_list_pop(&bio_list);
		if (!bio)
			break;

		bio->bi_private = rbio;
		bio->bi_end_io = raid_rmw_end_io;

		btrfs_bio_wq_end_io(rbio->fs_info, bio,
				    BTRFS_WQ_ENDIO_RAID56);

		BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
		submit_bio(READ, bio);
	}
	/* the actual write will happen once the reads are done */
	return 0;

cleanup:
	rbio_orig_end_io(rbio, -EIO, 0);
	return -EIO;

finish:
	validate_rbio_for_rmw(rbio);
	return 0;
}
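
/*
 * once the reads queued above complete, raid_rmw_end_io calls
 * validate_rbio_for_rmw, which either kicks off reconstruction of any
 * failed stripes or goes straight to finish_rmw to compute parity and
 * write the full stripe out.
 */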

/*
 * if the upper layers pass in a full stripe, we thank them by only allocating
 * enough pages to hold the parity, and sending it all down quickly.
 */
static int full_stripe_write(struct btrfs_raid_bio *rbio)
{
	int ret;

	ret = alloc_rbio_parity_pages(rbio);
	if (ret)
		return ret;

	ret = lock_stripe_add(rbio);
	if (ret == 0)
		finish_rmw(rbio);
	return 0;
}

/*
 * partial stripe writes get handed over to async helpers.
 * We're really hoping to merge a few more writes into this
 * rbio before calculating new parity
 */
static int partial_stripe_write(struct btrfs_raid_bio *rbio)
{
	int ret;

	ret = lock_stripe_add(rbio);
	if (ret == 0)
		async_rmw_stripe(rbio);
	return 0;
}

/*
 * sometimes while we were reading from the drive to
 * recalculate parity, enough new bios come in to create
 * a full stripe.  So we do a check here to see if we can
 * go directly to finish_rmw
 */
static int __raid56_parity_write(struct btrfs_raid_bio *rbio)
{
	/* head off into rmw land if we don't have a full stripe */
	if (!rbio_is_full(rbio))
		return partial_stripe_write(rbio);
	return full_stripe_write(rbio);
}

/*
 * our main entry point for writes from the rest of the FS.
 */
int raid56_parity_write(struct btrfs_root *root, struct bio *bio,
			struct btrfs_bio *bbio, u64 *raid_map,
			u64 stripe_len)
{
	struct btrfs_raid_bio *rbio;

	rbio = alloc_rbio(root, bbio, raid_map, stripe_len);
	if (IS_ERR(rbio)) {
		kfree(raid_map);
		kfree(bbio);
		return PTR_ERR(rbio);
	}
	bio_list_add(&rbio->bio_list, bio);
	rbio->bio_list_bytes = bio->bi_size;
	return __raid56_parity_write(rbio);
}

/*
 * all parity reconstruction happens here.  We've read in everything
 * we can find from the drives and this does the heavy lifting of
 * sorting the good from the bad.
 */
static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
{
	int pagenr, stripe;
	void **pointers;
	int faila = -1, failb = -1;
	int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
	struct page *page;
	int err;
	int i;

	pointers = kzalloc(rbio->bbio->num_stripes * sizeof(void *),
			   GFP_NOFS);
	if (!pointers) {
		err = -ENOMEM;
		goto cleanup_io;
	}

	faila = rbio->faila;
	failb = rbio->failb;

	if (rbio->read_rebuild) {
		spin_lock_irq(&rbio->bio_list_lock);
		set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
		spin_unlock_irq(&rbio->bio_list_lock);
	}

	index_rbio_pages(rbio);

	for (pagenr = 0; pagenr < nr_pages; pagenr++) {
		/* setup our array of pointers with pages
		 * from each stripe
		 */
		for (stripe = 0; stripe < rbio->bbio->num_stripes; stripe++) {
			/*
			 * if we're rebuilding a read, we have to use
			 * pages from the bio list
			 */
			if (rbio->read_rebuild &&
			    (stripe == faila || stripe == failb)) {
				page = page_in_rbio(rbio, stripe, pagenr, 0);
			} else {
				page = rbio_stripe_page(rbio, stripe, pagenr);
			}
			pointers[stripe] = kmap(page);
		}

		/* all raid6 handling here */
		if (rbio->raid_map[rbio->bbio->num_stripes - 1] ==
		    RAID6_Q_STRIPE) {

			/*
			 * single failure, rebuild from parity raid5
			 * style
			 */
			if (failb < 0) {
				if (faila == rbio->nr_data) {
					/*
					 * Just the P stripe has failed, without
					 * a bad data or Q stripe.
					 * TODO, we should redo the xor here.
					 */
					err = -EIO;
					goto cleanup;
				}
				/*
				 * a single failure in raid6 is rebuilt
				 * in the pstripe code below
				 */
				goto pstripe;
			}

			/* make sure our ps and qs are in order */
			if (faila > failb) {
				int tmp = failb;
				failb = faila;
				faila = tmp;
			}

			/* if the q stripe is failed, do a pstripe reconstruction
			 * from the xors.
			 * If both the q stripe and the P stripe are failed, we're
			 * here due to a crc mismatch and we can't give them the
			 * data they want
			 */
			if (rbio->raid_map[failb] == RAID6_Q_STRIPE) {
				if (rbio->raid_map[faila] == RAID5_P_STRIPE) {
					err = -EIO;
					goto cleanup;
				}
				/*
				 * otherwise we have one bad data stripe and
				 * a good P stripe.  raid5!
				 */
				goto pstripe;
			}

			if (rbio->raid_map[failb] == RAID5_P_STRIPE) {
				raid6_datap_recov(rbio->bbio->num_stripes,
						  PAGE_SIZE, faila, pointers);
			} else {
				raid6_2data_recov(rbio->bbio->num_stripes,
						  PAGE_SIZE, faila, failb,
						  pointers);
			}
		} else {
			void *p;

			/* rebuild from P stripe here (raid5 or raid6) */
			BUG_ON(failb != -1);
pstripe:
			/* Copy parity block into failed block to start with */
			memcpy(pointers[faila],
			       pointers[rbio->nr_data],
			       PAGE_CACHE_SIZE);

			/* rearrange the pointer array */
			p = pointers[faila];
			for (stripe = faila; stripe < rbio->nr_data - 1; stripe++)
				pointers[stripe] = pointers[stripe + 1];
			pointers[rbio->nr_data - 1] = p;

			/* xor in the rest */
			run_xor(pointers, rbio->nr_data - 1, PAGE_CACHE_SIZE);
		}
		/* if we're doing this rebuild as part of an rmw, go through
		 * and set all of our private rbio pages in the
		 * failed stripes as uptodate.  This way finish_rmw will
		 * know they can be trusted.  If this was a read reconstruction,
		 * other endio functions will fiddle the uptodate bits
		 */
		if (!rbio->read_rebuild) {
			for (i = 0; i < nr_pages; i++) {
				if (faila != -1) {
					page = rbio_stripe_page(rbio, faila, i);
					SetPageUptodate(page);
				}
				if (failb != -1) {
					page = rbio_stripe_page(rbio, failb, i);
					SetPageUptodate(page);
				}
			}
		}
		for (stripe = 0; stripe < rbio->bbio->num_stripes; stripe++) {
			/*
			 * if we're rebuilding a read, we have to use
			 * pages from the bio list
			 */
			if (rbio->read_rebuild &&
			    (stripe == faila || stripe == failb)) {
				page = page_in_rbio(rbio, stripe, pagenr, 0);
			} else {
				page = rbio_stripe_page(rbio, stripe, pagenr);
			}
			kunmap(page);
		}
	}

	err = 0;
cleanup:
	kfree(pointers);

cleanup_io:

	if (rbio->read_rebuild) {
		rbio_orig_end_io(rbio, err, err == 0);
	} else if (err == 0) {
		rbio->faila = -1;
		rbio->failb = -1;
		finish_rmw(rbio);
	} else {
		rbio_orig_end_io(rbio, err, 0);
	}
}

/*
 * This is called only for stripes we've read from disk to
 * reconstruct the parity.
 */
static void raid_recover_end_io(struct bio *bio, int err)
{
	struct btrfs_raid_bio *rbio = bio->bi_private;

	/*
	 * we only read stripe pages off the disk, set them
	 * up to date if there were no errors
	 */
	if (err)
		fail_bio_stripe(rbio, bio);
	else
		set_bio_pages_uptodate(bio);
	bio_put(bio);

	if (!atomic_dec_and_test(&rbio->bbio->stripes_pending))
		return;

	if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors)
		rbio_orig_end_io(rbio, -EIO, 0);
	else
		__raid_recover_end_io(rbio);
}

/*
 * reads everything we need off the disk to reconstruct
 * the parity.  endio handlers trigger final reconstruction
 * when the IO is done.
 *
 * This is used both for reads from the higher layers and for
 * parity construction required to finish a rmw cycle.
 */
static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
{
	int bios_to_read = 0;
	struct btrfs_bio *bbio = rbio->bbio;
	struct bio_list bio_list;
	int ret;
	int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
	int pagenr;
	int stripe;
	struct bio *bio;

	bio_list_init(&bio_list);

	ret = alloc_rbio_pages(rbio);
	if (ret)
		goto cleanup;

	atomic_set(&rbio->bbio->error, 0);

	/*
	 * read everything that hasn't failed.
	 */
	for (stripe = 0; stripe < bbio->num_stripes; stripe++) {
		if (rbio->faila == stripe ||
		    rbio->failb == stripe)
			continue;

		for (pagenr = 0; pagenr < nr_pages; pagenr++) {
			struct page *p;

			/*
			 * the rmw code may have already read this
			 * page in
			 */
			p = rbio_stripe_page(rbio, stripe, pagenr);
			if (PageUptodate(p))
				continue;

			ret = rbio_add_io_page(rbio, &bio_list,
				       rbio_stripe_page(rbio, stripe, pagenr),
				       stripe, pagenr, rbio->stripe_len);
			if (ret < 0)
				goto cleanup;
		}
	}

	bios_to_read = bio_list_size(&bio_list);
	if (!bios_to_read) {
		/*
		 * we might have no bios to read just because the pages
		 * were up to date, or we might have no bios to read because
		 * the devices were gone.
		 */
		if (atomic_read(&rbio->bbio->error) <= rbio->bbio->max_errors) {
			__raid_recover_end_io(rbio);
			goto out;
		} else {
			goto cleanup;
		}
	}

	/*
	 * the bbio may be freed once we submit the last bio.  Make sure
	 * not to touch it after that
	 */
	atomic_set(&bbio->stripes_pending, bios_to_read);
	while (1) {
		bio = bio_list_pop(&bio_list);
		if (!bio)
			break;

		bio->bi_private = rbio;
		bio->bi_end_io = raid_recover_end_io;

		btrfs_bio_wq_end_io(rbio->fs_info, bio,
				    BTRFS_WQ_ENDIO_RAID56);

		BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
		submit_bio(READ, bio);
	}
out:
	return 0;

cleanup:
	if (rbio->read_rebuild)
		rbio_orig_end_io(rbio, -EIO, 0);
	return -EIO;
}

/*
 * the main entry point for reads from the higher layers.  This
 * is really only called when the normal read path had a failure,
 * so we assume the bio they send down corresponds to a failed part
 * of the drive.
 */
int raid56_parity_recover(struct btrfs_root *root, struct bio *bio,
			  struct btrfs_bio *bbio, u64 *raid_map,
			  u64 stripe_len, int mirror_num)
{
	struct btrfs_raid_bio *rbio;
	int ret;

	rbio = alloc_rbio(root, bbio, raid_map, stripe_len);
	if (IS_ERR(rbio)) {
		return PTR_ERR(rbio);
	}

	rbio->read_rebuild = 1;
	bio_list_add(&rbio->bio_list, bio);
	rbio->bio_list_bytes = bio->bi_size;

	rbio->faila = find_logical_bio_stripe(rbio, bio);
	if (rbio->faila == -1) {
		BUG();
		kfree(rbio);
		return -EIO;
	}

	/*
	 * reconstruct from the q stripe if they are
	 * asking for mirror 3
	 */
	if (mirror_num == 3)
		rbio->failb = bbio->num_stripes - 2;

	ret = lock_stripe_add(rbio);

	/*
	 * __raid56_parity_recover will end the bio with
	 * any errors it hits.  We don't want to return
	 * its error value up the stack because our caller
	 * will end up calling bio_endio with any nonzero
	 * return
	 */
	if (ret == 0)
		__raid56_parity_recover(rbio);
	/*
	 * our rbio has been added to the list of
	 * rbios that will be handled after the
	 * current lock owner is done
	 */
	return 0;

}

static void rmw_work(struct btrfs_work *work)
{
	struct btrfs_raid_bio *rbio;

	rbio = container_of(work, struct btrfs_raid_bio, work);
	raid56_rmw_stripe(rbio);
}

static void read_rebuild_work(struct btrfs_work *work)
{
	struct btrfs_raid_bio *rbio;

	rbio = container_of(work, struct btrfs_raid_bio, work);
	__raid56_parity_recover(rbio);
}