1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 2014 by Chunwei Chen. All rights reserved.
23 * Copyright (c) 2016 by Delphix. All rights reserved.
24 */
25
26 /*
27 * ARC buffer data (ABD).
28 *
29 * ABDs are an abstract data structure for the ARC which can use two
30 * different ways of storing the underlying data:
31 *
32 * (a) Linear buffer. In this case, all the data in the ABD is stored in one
33 * contiguous buffer in memory (from a zio_[data_]buf_* kmem cache).
34 *
35 * +-------------------+
36 * | ABD (linear) |
37 * | abd_flags = ... |
38 * | abd_size = ... | +--------------------------------+
39 * | abd_buf ------------->| raw buffer of size abd_size |
40 * +-------------------+ +--------------------------------+
41 * no abd_chunks
42 *
43 * (b) Scattered buffer. In this case, the data in the ABD is split into
44 * equal-sized chunks (from the abd_chunk_cache kmem_cache), with pointers
45 * to the chunks recorded in an array at the end of the ABD structure.
46 *
47 * +-------------------+
48 * | ABD (scattered) |
49 * | abd_flags = ... |
50 * | abd_size = ... |
51 * | abd_offset = 0 | +-----------+
52 * | abd_chunks[0] ----------------------------->| chunk 0 |
53 * | abd_chunks[1] ---------------------+ +-----------+
54 * | ... | | +-----------+
55 * | abd_chunks[N-1] ---------+ +------->| chunk 1 |
56 * +-------------------+ | +-----------+
57 * | ...
58 * | +-----------+
59 * +----------------->| chunk N-1 |
60 * +-----------+
61 *
62 * Linear buffers act exactly like normal buffers and are always mapped into the
63 * kernel's virtual memory space, while scattered ABD data chunks are allocated
64 * as physical pages and then mapped in only while they are actually being
65 * accessed through one of the abd_* library functions. Using scattered ABDs
66 * provides several benefits:
67 *
68 * (1) They avoid use of kmem_*, preventing performance problems where running
69 * kmem_reap on very large memory systems never finishes and causes
70 * constant TLB shootdowns.
71 *
72 * (2) Fragmentation is less of an issue since when we are at the limit of
73 * allocatable space, we won't have to search around for a long free
74 * hole in the VA space for large ARC allocations. Each chunk is mapped in
75 * individually, so even if we weren't using segkpm (see next point) we
76 * wouldn't need to worry about finding a contiguous address range.
77 *
78 * (3) Use of segkpm will avoid the need for map / unmap / TLB shootdown costs
79 * on each ABD access. (If segkpm isn't available then we use all linear
80 * ABDs to avoid this penalty.) See seg_kpm.c for more details.
81 *
82 * It is possible to make all ABDs linear by setting zfs_abd_scatter_enabled to
83 * B_FALSE. However, it is not possible to use scattered ABDs if segkpm is not
84 * available, which is the case on all 32-bit systems and any 64-bit systems
85 * where kpm_enable is turned off.
86 *
87 * In addition to directly allocating a linear or scattered ABD, it is also
88 * possible to create an ABD by requesting the "sub-ABD" starting at an offset
89 * within an existing ABD. In linear buffers this is simple (set abd_buf of
90 * the new ABD to the starting point within the original raw buffer), but
91 * scattered ABDs are a little more complex. The new ABD makes a copy of the
92 * relevant abd_chunks pointers (but not the underlying data). However, to
93 * provide arbitrary rather than only chunk-aligned starting offsets, it also
94 * tracks an abd_offset field which represents the starting point of the data
95 * within the first chunk in abd_chunks. For both linear and scattered ABDs,
96 * creating an offset ABD marks the original ABD as the offset's parent, and the
97 * original ABD's abd_children refcount is incremented. This data allows us to
98 * ensure the root ABD isn't deleted before its children.
99 *
100 * Most consumers should never need to know what type of ABD they're using --
101 * the ABD public API ensures that it's possible to transparently switch from
102 * using a linear ABD to a scattered one when doing so would be beneficial.
103 *
104 * If you need to use the data within an ABD directly, if you know it's linear
105 * (because you allocated it) you can use abd_to_buf() to access the underlying
106 * raw buffer. Otherwise, you should use one of the abd_borrow_buf* functions
107 * which will allocate a raw buffer if necessary. Use the abd_return_buf*
108 * functions to return any raw buffers that are no longer necessary when you're
109 * done using them.
110 *
111 * There are a variety of ABD APIs that implement basic buffer operations:
112 * compare, copy, read, write, and fill with zeroes. If you need a custom
113 * function which progressively accesses the whole ABD, use the abd_iterate_*
114 * functions.
115 */
116
117 #include <sys/abd.h>
118 #include <sys/param.h>
119 #include <sys/zio.h>
120 #include <sys/zfs_context.h>
121 #include <sys/zfs_znode.h>
122 #ifdef _KERNEL
123 #include <linux/scatterlist.h>
124 #include <linux/kmap_compat.h>
125 #else
126 #define MAX_ORDER 1
127 #endif
128
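/*
 * Illustrative usage sketch for the consumer API described in the block
 * comment above.  This helper is hypothetical and is not called anywhere in
 * this file; it only shows the common allocate / fill / read back / free
 * pattern, which behaves the same whether the ABD ends up linear or
 * scattered.
 */
static void
abd_example_usage(const void *src, void *dst, size_t size)
{
	abd_t *abd = abd_alloc(size, B_FALSE);	/* data (non-metadata) ABD */

	abd_copy_from_buf_off(abd, src, 0, size);	/* fill from src */
	abd_copy_to_buf_off(dst, abd, 0, size);		/* read back into dst */

	abd_free(abd);	/* this ABD owns its buffers, so abd_free() applies */
}
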
129 typedef struct abd_stats {
130 kstat_named_t abdstat_struct_size;
131 kstat_named_t abdstat_linear_cnt;
132 kstat_named_t abdstat_linear_data_size;
133 kstat_named_t abdstat_scatter_cnt;
134 kstat_named_t abdstat_scatter_data_size;
135 kstat_named_t abdstat_scatter_chunk_waste;
136 kstat_named_t abdstat_scatter_orders[MAX_ORDER];
137 kstat_named_t abdstat_scatter_page_multi_chunk;
138 kstat_named_t abdstat_scatter_page_multi_zone;
139 kstat_named_t abdstat_scatter_page_alloc_retry;
140 kstat_named_t abdstat_scatter_sg_table_retry;
141 } abd_stats_t;
142
143 static abd_stats_t abd_stats = {
144 /* Amount of memory occupied by all of the abd_t struct allocations */
145 { "struct_size", KSTAT_DATA_UINT64 },
146 /*
147 * The number of linear ABDs which are currently allocated, excluding
148 * ABDs which don't own their data (for instance the ones which were
149 * allocated through abd_get_offset() and abd_get_from_buf()). If an
150 * ABD takes ownership of its buf then it will become tracked.
151 */
152 { "linear_cnt", KSTAT_DATA_UINT64 },
153 /* Amount of data stored in all linear ABDs tracked by linear_cnt */
154 { "linear_data_size", KSTAT_DATA_UINT64 },
155 /*
156 * The number of scatter ABDs which are currently allocated, excluding
157 * ABDs which don't own their data (for instance the ones which were
158 * allocated through abd_get_offset()).
159 */
160 { "scatter_cnt", KSTAT_DATA_UINT64 },
161 /* Amount of data stored in all scatter ABDs tracked by scatter_cnt */
162 { "scatter_data_size", KSTAT_DATA_UINT64 },
163 /*
164 * The amount of space wasted at the end of the last chunk across all
165 * scatter ABDs tracked by scatter_cnt.
166 */
167 { "scatter_chunk_waste", KSTAT_DATA_UINT64 },
168 /*
169 * The number of compound allocations of a given order. These
170 * allocations are spread over all currently allocated ABDs, and
171 * act as a measure of memory fragmentation.
172 */
173 { { "scatter_order_N", KSTAT_DATA_UINT64 } },
174 /*
175 * The number of scatter ABDs which contain multiple chunks.
176 * ABDs are preferentially allocated from the minimum number of
177 	 * contiguous multi-page chunks; a single chunk is optimal.
178 */
179 { "scatter_page_multi_chunk", KSTAT_DATA_UINT64 },
180 /*
181 * The number of scatter ABDs which are split across memory zones.
182 * ABDs are preferentially allocated using pages from a single zone.
183 */
184 { "scatter_page_multi_zone", KSTAT_DATA_UINT64 },
185 /*
186 * The total number of retries encountered when attempting to
187 * allocate the pages to populate the scatter ABD.
188 */
189 { "scatter_page_alloc_retry", KSTAT_DATA_UINT64 },
190 /*
191 * The total number of retries encountered when attempting to
192 * allocate the sg table for an ABD.
193 */
194 { "scatter_sg_table_retry", KSTAT_DATA_UINT64 },
195 };
196
197 #define ABDSTAT(stat) (abd_stats.stat.value.ui64)
198 #define ABDSTAT_INCR(stat, val) \
199 atomic_add_64(&abd_stats.stat.value.ui64, (val))
200 #define ABDSTAT_BUMP(stat) ABDSTAT_INCR(stat, 1)
201 #define ABDSTAT_BUMPDOWN(stat) ABDSTAT_INCR(stat, -1)
202
203 #define ABD_SCATTER(abd) (abd->abd_u.abd_scatter)
204 #define ABD_BUF(abd) (abd->abd_u.abd_linear.abd_buf)
205 #define abd_for_each_sg(abd, sg, n, i) \
206 for_each_sg(ABD_SCATTER(abd).abd_sgl, sg, n, i)
207
208 /* see block comment above for description */
209 int zfs_abd_scatter_enabled = B_TRUE;
210 unsigned zfs_abd_scatter_max_order = MAX_ORDER - 1;
211
212 static kmem_cache_t *abd_cache = NULL;
213 static kstat_t *abd_ksp;
214
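/*
 * Number of PAGESIZE chunks needed to hold size bytes, rounded up to whole
 * pages.  For example, with 4 KiB pages a 5 KiB request maps to two chunks.
 */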
215 static inline size_t
216 abd_chunkcnt_for_bytes(size_t size)
217 {
218 return (P2ROUNDUP(size, PAGESIZE) / PAGESIZE);
219 }
220
221 #ifdef _KERNEL
222 #ifndef CONFIG_HIGHMEM
223
224 #ifndef __GFP_RECLAIM
225 #define __GFP_RECLAIM __GFP_WAIT
226 #endif
227
228 static unsigned long
229 abd_alloc_chunk(int nid, gfp_t gfp, unsigned int order)
230 {
231 struct page *page;
232
233 page = alloc_pages_node(nid, gfp, order);
234 if (!page)
235 return (0);
236
237 return ((unsigned long) page_address(page));
238 }
239
240 /*
241 * The goal is to minimize fragmentation by preferentially populating ABDs
242 * with higher order compound pages from a single zone. Allocation size is
243 * progressively decreased until it can be satisfied without performing
244 * reclaim or compaction. When necessary this function will degenerate to
245 * allocating individual pages and allowing reclaim to satisfy allocations.
246 */
247 static void
248 abd_alloc_pages(abd_t *abd, size_t size)
249 {
250 struct list_head pages;
251 struct sg_table table;
252 struct scatterlist *sg;
253 struct page *page, *tmp_page = NULL;
254 gfp_t gfp = __GFP_NOWARN | GFP_NOIO;
255 gfp_t gfp_comp = (gfp | __GFP_NORETRY | __GFP_COMP) & ~__GFP_RECLAIM;
256 int max_order = MIN(zfs_abd_scatter_max_order, MAX_ORDER - 1);
257 int nr_pages = abd_chunkcnt_for_bytes(size);
258 int chunks = 0, zones = 0;
259 size_t remaining_size;
260 int nid = NUMA_NO_NODE;
261 int alloc_pages = 0;
262 int order;
263
264 INIT_LIST_HEAD(&pages);
265
266 while (alloc_pages < nr_pages) {
267 unsigned long paddr;
268 unsigned chunk_pages;
269
270 order = MIN(highbit64(nr_pages - alloc_pages) - 1, max_order);
271 chunk_pages = (1U << order);
272
273 paddr = abd_alloc_chunk(nid, order ? gfp_comp : gfp, order);
274 if (paddr == 0) {
275 if (order == 0) {
276 ABDSTAT_BUMP(abdstat_scatter_page_alloc_retry);
277 schedule_timeout_interruptible(1);
278 } else {
279 max_order = MAX(0, order - 1);
280 }
281 continue;
282 }
283
284 page = virt_to_page(paddr);
285 list_add_tail(&page->lru, &pages);
286
287 if ((nid != NUMA_NO_NODE) && (page_to_nid(page) != nid))
288 zones++;
289
290 nid = page_to_nid(page);
291 ABDSTAT_BUMP(abdstat_scatter_orders[order]);
292 chunks++;
293 alloc_pages += chunk_pages;
294 }
295
296 ASSERT3S(alloc_pages, ==, nr_pages);
297
298 while (sg_alloc_table(&table, chunks, gfp)) {
299 ABDSTAT_BUMP(abdstat_scatter_sg_table_retry);
300 schedule_timeout_interruptible(1);
301 }
302
303 sg = table.sgl;
304 remaining_size = size;
305 list_for_each_entry_safe(page, tmp_page, &pages, lru) {
306 size_t sg_size = MIN(PAGESIZE << compound_order(page),
307 remaining_size);
308 sg_set_page(sg, page, sg_size, 0);
309 remaining_size -= sg_size;
310
311 sg = sg_next(sg);
312 list_del(&page->lru);
313 }
314
315 if (chunks > 1) {
316 ABDSTAT_BUMP(abdstat_scatter_page_multi_chunk);
317 abd->abd_flags |= ABD_FLAG_MULTI_CHUNK;
318
319 if (zones) {
320 ABDSTAT_BUMP(abdstat_scatter_page_multi_zone);
321 abd->abd_flags |= ABD_FLAG_MULTI_ZONE;
322 }
323 }
324
325 ABD_SCATTER(abd).abd_sgl = table.sgl;
326 ABD_SCATTER(abd).abd_nents = table.nents;
327 }
328 #else
329 /*
330 * Allocate N individual pages to construct a scatter ABD. This function
331 * makes no attempt to request contiguous pages and requires the minimal
332 * number of kernel interfaces. It's designed for maximum compatibility.
333 */
334 static void
335 abd_alloc_pages(abd_t *abd, size_t size)
336 {
337 struct scatterlist *sg = NULL;
338 struct sg_table table;
339 struct page *page;
340 gfp_t gfp = __GFP_NOWARN | GFP_NOIO;
341 int nr_pages = abd_chunkcnt_for_bytes(size);
342 int i = 0;
343
344 while (sg_alloc_table(&table, nr_pages, gfp)) {
345 ABDSTAT_BUMP(abdstat_scatter_sg_table_retry);
346 schedule_timeout_interruptible(1);
347 }
348
349 ASSERT3U(table.nents, ==, nr_pages);
350 ABD_SCATTER(abd).abd_sgl = table.sgl;
351 ABD_SCATTER(abd).abd_nents = nr_pages;
352
353 abd_for_each_sg(abd, sg, nr_pages, i) {
354 while ((page = __page_cache_alloc(gfp)) == NULL) {
355 ABDSTAT_BUMP(abdstat_scatter_page_alloc_retry);
356 schedule_timeout_interruptible(1);
357 }
358
359 ABDSTAT_BUMP(abdstat_scatter_orders[0]);
360 sg_set_page(sg, page, PAGESIZE, 0);
361 }
362
363 if (nr_pages > 1) {
364 ABDSTAT_BUMP(abdstat_scatter_page_multi_chunk);
365 abd->abd_flags |= ABD_FLAG_MULTI_CHUNK;
366 }
367 }
368 #endif /* !CONFIG_HIGHMEM */
369
370 static void
371 abd_free_pages(abd_t *abd)
372 {
373 struct scatterlist *sg = NULL;
374 struct sg_table table;
375 struct page *page;
376 int nr_pages = ABD_SCATTER(abd).abd_nents;
377 int order, i = 0;
378
379 if (abd->abd_flags & ABD_FLAG_MULTI_ZONE)
380 ABDSTAT_BUMPDOWN(abdstat_scatter_page_multi_zone);
381
382 if (abd->abd_flags & ABD_FLAG_MULTI_CHUNK)
383 ABDSTAT_BUMPDOWN(abdstat_scatter_page_multi_chunk);
384
385 abd_for_each_sg(abd, sg, nr_pages, i) {
386 page = sg_page(sg);
387 order = compound_order(page);
388 __free_pages(page, order);
389 ASSERT3U(sg->length, <=, PAGE_SIZE << order);
390 ABDSTAT_BUMPDOWN(abdstat_scatter_orders[order]);
391 }
392
393 table.sgl = ABD_SCATTER(abd).abd_sgl;
394 table.nents = table.orig_nents = nr_pages;
395 sg_free_table(&table);
396 }
397
398 #else /* _KERNEL */
399
400 #ifndef PAGE_SHIFT
401 #define PAGE_SHIFT (highbit64(PAGESIZE)-1)
402 #endif
403
404 struct page;
405
406 #define kpm_enable 1
407 #define abd_alloc_chunk(o) \
408 ((struct page *)umem_alloc_aligned(PAGESIZE << (o), 64, KM_SLEEP))
409 #define abd_free_chunk(chunk, o) umem_free(chunk, PAGESIZE << (o))
410 #define zfs_kmap_atomic(chunk, km) ((void *)chunk)
411 #define zfs_kunmap_atomic(addr, km) do { (void)(addr); } while (0)
412 #define local_irq_save(flags) do { (void)(flags); } while (0)
413 #define local_irq_restore(flags) do { (void)(flags); } while (0)
414 #define nth_page(pg, i) \
415 ((struct page *)((void *)(pg) + (i) * PAGESIZE))
416
417 struct scatterlist {
418 struct page *page;
419 int length;
420 int end;
421 };
422
423 static void
424 sg_init_table(struct scatterlist *sg, int nr)
425 {
426 memset(sg, 0, nr * sizeof (struct scatterlist));
427 sg[nr - 1].end = 1;
428 }
429
430 #define for_each_sg(sgl, sg, nr, i) \
431 for ((i) = 0, (sg) = (sgl); (i) < (nr); (i)++, (sg) = sg_next(sg))
432
433 static inline void
434 sg_set_page(struct scatterlist *sg, struct page *page, unsigned int len,
435 unsigned int offset)
436 {
437 /* currently we don't use offset */
438 ASSERT(offset == 0);
439 sg->page = page;
440 sg->length = len;
441 }
442
443 static inline struct page *
444 sg_page(struct scatterlist *sg)
445 {
446 return (sg->page);
447 }
448
449 static inline struct scatterlist *
450 sg_next(struct scatterlist *sg)
451 {
452 if (sg->end)
453 return (NULL);
454
455 return (sg + 1);
456 }
457
458 static void
459 abd_alloc_pages(abd_t *abd, size_t size)
460 {
461 unsigned nr_pages = abd_chunkcnt_for_bytes(size);
462 struct scatterlist *sg;
463 int i;
464
465 ABD_SCATTER(abd).abd_sgl = vmem_alloc(nr_pages *
466 sizeof (struct scatterlist), KM_SLEEP);
467 sg_init_table(ABD_SCATTER(abd).abd_sgl, nr_pages);
468
469 abd_for_each_sg(abd, sg, nr_pages, i) {
470 struct page *p = abd_alloc_chunk(0);
471 sg_set_page(sg, p, PAGESIZE, 0);
472 }
473 ABD_SCATTER(abd).abd_nents = nr_pages;
474 }
475
476 static void
477 abd_free_pages(abd_t *abd)
478 {
479 int i, n = ABD_SCATTER(abd).abd_nents;
480 struct scatterlist *sg;
481 int j;
482
483 abd_for_each_sg(abd, sg, n, i) {
484 for (j = 0; j < sg->length; j += PAGESIZE) {
485 struct page *p = nth_page(sg_page(sg), j>>PAGE_SHIFT);
486 abd_free_chunk(p, 0);
487 }
488 }
489
490 vmem_free(ABD_SCATTER(abd).abd_sgl, n * sizeof (struct scatterlist));
491 }
492
493 #endif /* _KERNEL */
494
495 void
496 abd_init(void)
497 {
498 int i;
499
500 abd_cache = kmem_cache_create("abd_t", sizeof (abd_t),
501 0, NULL, NULL, NULL, NULL, NULL, 0);
502
503 abd_ksp = kstat_create("zfs", 0, "abdstats", "misc", KSTAT_TYPE_NAMED,
504 sizeof (abd_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
505 if (abd_ksp != NULL) {
506 abd_ksp->ks_data = &abd_stats;
507 kstat_install(abd_ksp);
508
509 for (i = 0; i < MAX_ORDER; i++) {
510 snprintf(abd_stats.abdstat_scatter_orders[i].name,
511 KSTAT_STRLEN, "scatter_order_%d", i);
512 abd_stats.abdstat_scatter_orders[i].data_type =
513 KSTAT_DATA_UINT64;
514 }
515 }
516 }
517
518 void
519 abd_fini(void)
520 {
521 if (abd_ksp != NULL) {
522 kstat_delete(abd_ksp);
523 abd_ksp = NULL;
524 }
525
526 if (abd_cache) {
527 kmem_cache_destroy(abd_cache);
528 abd_cache = NULL;
529 }
530 }
531
532 static inline void
533 abd_verify(abd_t *abd)
534 {
535 ASSERT3U(abd->abd_size, >, 0);
536 ASSERT3U(abd->abd_size, <=, SPA_MAXBLOCKSIZE);
537 ASSERT3U(abd->abd_flags, ==, abd->abd_flags & (ABD_FLAG_LINEAR |
538 ABD_FLAG_OWNER | ABD_FLAG_META | ABD_FLAG_MULTI_ZONE |
539 ABD_FLAG_MULTI_CHUNK));
540 IMPLY(abd->abd_parent != NULL, !(abd->abd_flags & ABD_FLAG_OWNER));
541 IMPLY(abd->abd_flags & ABD_FLAG_META, abd->abd_flags & ABD_FLAG_OWNER);
542 if (abd_is_linear(abd)) {
543 ASSERT3P(abd->abd_u.abd_linear.abd_buf, !=, NULL);
544 } else {
545 size_t n;
546 int i = 0;
547 struct scatterlist *sg = NULL;
548
549 ASSERT3U(ABD_SCATTER(abd).abd_nents, >, 0);
550 ASSERT3U(ABD_SCATTER(abd).abd_offset, <,
551 ABD_SCATTER(abd).abd_sgl->length);
552 n = ABD_SCATTER(abd).abd_nents;
553 abd_for_each_sg(abd, sg, n, i) {
554 ASSERT3P(sg_page(sg), !=, NULL);
555 }
556 }
557 }
558
559 static inline abd_t *
560 abd_alloc_struct(void)
561 {
562 abd_t *abd = kmem_cache_alloc(abd_cache, KM_PUSHPAGE);
563
564 ASSERT3P(abd, !=, NULL);
565 ABDSTAT_INCR(abdstat_struct_size, sizeof (abd_t));
566
567 return (abd);
568 }
569
570 static inline void
571 abd_free_struct(abd_t *abd)
572 {
573 kmem_cache_free(abd_cache, abd);
574 ABDSTAT_INCR(abdstat_struct_size, -(int)sizeof (abd_t));
575 }
576
577 /*
578 * Allocate an ABD, along with its own underlying data buffers. Use this if you
579 * don't care whether the ABD is linear or not.
580 */
581 abd_t *
582 abd_alloc(size_t size, boolean_t is_metadata)
583 {
584 if (!zfs_abd_scatter_enabled || size <= PAGESIZE)
585 return (abd_alloc_linear(size, is_metadata));
586
587 VERIFY3U(size, <=, SPA_MAXBLOCKSIZE);
588
589 abd_t *abd = abd_alloc_struct();
590 abd->abd_flags = ABD_FLAG_OWNER;
591 abd_alloc_pages(abd, size);
592
593 if (is_metadata) {
594 abd->abd_flags |= ABD_FLAG_META;
595 }
596 abd->abd_size = size;
597 abd->abd_parent = NULL;
598 zfs_refcount_create(&abd->abd_children);
599
600 abd->abd_u.abd_scatter.abd_offset = 0;
601
602 ABDSTAT_BUMP(abdstat_scatter_cnt);
603 ABDSTAT_INCR(abdstat_scatter_data_size, size);
604 ABDSTAT_INCR(abdstat_scatter_chunk_waste,
605 P2ROUNDUP(size, PAGESIZE) - size);
606
607 return (abd);
608 }
609
610 static void
611 abd_free_scatter(abd_t *abd)
612 {
613 abd_free_pages(abd);
614
615 zfs_refcount_destroy(&abd->abd_children);
616 ABDSTAT_BUMPDOWN(abdstat_scatter_cnt);
617 ABDSTAT_INCR(abdstat_scatter_data_size, -(int)abd->abd_size);
618 ABDSTAT_INCR(abdstat_scatter_chunk_waste,
619 (int)abd->abd_size - (int)P2ROUNDUP(abd->abd_size, PAGESIZE));
620
621 abd_free_struct(abd);
622 }
623
624 /*
625 * Allocate an ABD that must be linear, along with its own underlying data
626 * buffer. Only use this when it would be very annoying to write your ABD
627 * consumer with a scattered ABD.
628 */
629 abd_t *
630 abd_alloc_linear(size_t size, boolean_t is_metadata)
631 {
632 abd_t *abd = abd_alloc_struct();
633
634 VERIFY3U(size, <=, SPA_MAXBLOCKSIZE);
635
636 abd->abd_flags = ABD_FLAG_LINEAR | ABD_FLAG_OWNER;
637 if (is_metadata) {
638 abd->abd_flags |= ABD_FLAG_META;
639 }
640 abd->abd_size = size;
641 abd->abd_parent = NULL;
642 zfs_refcount_create(&abd->abd_children);
643
644 if (is_metadata) {
645 abd->abd_u.abd_linear.abd_buf = zio_buf_alloc(size);
646 } else {
647 abd->abd_u.abd_linear.abd_buf = zio_data_buf_alloc(size);
648 }
649
650 ABDSTAT_BUMP(abdstat_linear_cnt);
651 ABDSTAT_INCR(abdstat_linear_data_size, size);
652
653 return (abd);
654 }
655
656 static void
657 abd_free_linear(abd_t *abd)
658 {
659 if (abd->abd_flags & ABD_FLAG_META) {
660 zio_buf_free(abd->abd_u.abd_linear.abd_buf, abd->abd_size);
661 } else {
662 zio_data_buf_free(abd->abd_u.abd_linear.abd_buf, abd->abd_size);
663 }
664
665 zfs_refcount_destroy(&abd->abd_children);
666 ABDSTAT_BUMPDOWN(abdstat_linear_cnt);
667 ABDSTAT_INCR(abdstat_linear_data_size, -(int)abd->abd_size);
668
669 abd_free_struct(abd);
670 }
671
672 /*
673 * Free an ABD. Only use this on ABDs allocated with abd_alloc() or
674 * abd_alloc_linear().
675 */
676 void
677 abd_free(abd_t *abd)
678 {
679 abd_verify(abd);
680 ASSERT3P(abd->abd_parent, ==, NULL);
681 ASSERT(abd->abd_flags & ABD_FLAG_OWNER);
682 if (abd_is_linear(abd))
683 abd_free_linear(abd);
684 else
685 abd_free_scatter(abd);
686 }
687
688 /*
689 * Allocate an ABD of the same format (same metadata flag, same scatterize
690 * setting) as another ABD.
691 */
692 abd_t *
693 abd_alloc_sametype(abd_t *sabd, size_t size)
694 {
695 boolean_t is_metadata = (sabd->abd_flags & ABD_FLAG_META) != 0;
696 if (abd_is_linear(sabd)) {
697 return (abd_alloc_linear(size, is_metadata));
698 } else {
699 return (abd_alloc(size, is_metadata));
700 }
701 }
702
703 /*
704 * If we're going to use this ABD for doing I/O using the block layer, the
705 * consumer of the ABD data doesn't care if it's scattered or not, and we don't
706 * plan to store this ABD in memory for a long period of time, we should
707 * allocate the ABD type that requires the least data copying to do the I/O.
708 *
709  * On Illumos this is linear ABDs; however, if ldi_strategy() can ever issue
710  * I/Os using a scatter/gather list, we should switch to that and replace this
711  * call with vanilla abd_alloc().
712 *
713 * On Linux the optimal thing to do would be to use abd_get_offset() and
714 * construct a new ABD which shares the original pages thereby eliminating
715 * the copy. But for the moment a new linear ABD is allocated until this
716 * performance optimization can be implemented.
717 */
718 abd_t *
719 abd_alloc_for_io(size_t size, boolean_t is_metadata)
720 {
721 return (abd_alloc(size, is_metadata));
722 }
723
724 /*
725 * Allocate a new ABD to point to offset off of sabd. It shares the underlying
726 * buffer data with sabd. Use abd_put() to free. sabd must not be freed while
727 * any derived ABDs exist.
728 */
729 static inline abd_t *
730 abd_get_offset_impl(abd_t *sabd, size_t off, size_t size)
731 {
732 abd_t *abd;
733
734 abd_verify(sabd);
735 ASSERT3U(off, <=, sabd->abd_size);
736
737 if (abd_is_linear(sabd)) {
738 abd = abd_alloc_struct();
739
740 /*
741 * Even if this buf is filesystem metadata, we only track that
742 * if we own the underlying data buffer, which is not true in
743 * this case. Therefore, we don't ever use ABD_FLAG_META here.
744 */
745 abd->abd_flags = ABD_FLAG_LINEAR;
746
747 abd->abd_u.abd_linear.abd_buf =
748 (char *)sabd->abd_u.abd_linear.abd_buf + off;
749 } else {
750 int i = 0;
751 struct scatterlist *sg = NULL;
752 size_t new_offset = sabd->abd_u.abd_scatter.abd_offset + off;
753
754 abd = abd_alloc_struct();
755
756 /*
757 * Even if this buf is filesystem metadata, we only track that
758 * if we own the underlying data buffer, which is not true in
759 * this case. Therefore, we don't ever use ABD_FLAG_META here.
760 */
761 abd->abd_flags = 0;
762
763 abd_for_each_sg(sabd, sg, ABD_SCATTER(sabd).abd_nents, i) {
764 if (new_offset < sg->length)
765 break;
766 new_offset -= sg->length;
767 }
768
769 ABD_SCATTER(abd).abd_sgl = sg;
770 ABD_SCATTER(abd).abd_offset = new_offset;
771 ABD_SCATTER(abd).abd_nents = ABD_SCATTER(sabd).abd_nents - i;
772 }
773
774 abd->abd_size = size;
775 abd->abd_parent = sabd;
776 zfs_refcount_create(&abd->abd_children);
777 (void) zfs_refcount_add_many(&sabd->abd_children, abd->abd_size, abd);
778
779 return (abd);
780 }
781
782 abd_t *
783 abd_get_offset(abd_t *sabd, size_t off)
784 {
785 size_t size = sabd->abd_size > off ? sabd->abd_size - off : 0;
786
787 VERIFY3U(size, >, 0);
788
789 return (abd_get_offset_impl(sabd, off, size));
790 }
791
792 abd_t *
793 abd_get_offset_size(abd_t *sabd, size_t off, size_t size)
794 {
795 ASSERT3U(off + size, <=, sabd->abd_size);
796
797 return (abd_get_offset_impl(sabd, off, size));
798 }
799
800 /*
801 * Allocate a linear ABD structure for buf. You must free this with abd_put()
802 * since the resulting ABD doesn't own its own buffer.
803 */
804 abd_t *
805 abd_get_from_buf(void *buf, size_t size)
806 {
807 abd_t *abd = abd_alloc_struct();
808
809 VERIFY3U(size, <=, SPA_MAXBLOCKSIZE);
810
811 /*
812 * Even if this buf is filesystem metadata, we only track that if we
813 * own the underlying data buffer, which is not true in this case.
814 * Therefore, we don't ever use ABD_FLAG_META here.
815 */
816 abd->abd_flags = ABD_FLAG_LINEAR;
817 abd->abd_size = size;
818 abd->abd_parent = NULL;
819 zfs_refcount_create(&abd->abd_children);
820
821 abd->abd_u.abd_linear.abd_buf = buf;
822
823 return (abd);
824 }
825
826 /*
827 * Free an ABD allocated from abd_get_offset() or abd_get_from_buf(). Will not
828 * free the underlying scatterlist or buffer.
829 */
830 void
831 abd_put(abd_t *abd)
832 {
833 abd_verify(abd);
834 ASSERT(!(abd->abd_flags & ABD_FLAG_OWNER));
835
836 if (abd->abd_parent != NULL) {
837 (void) zfs_refcount_remove_many(&abd->abd_parent->abd_children,
838 abd->abd_size, abd);
839 }
840
841 zfs_refcount_destroy(&abd->abd_children);
842 abd_free_struct(abd);
843 }
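
/*
 * Illustrative sketch (hypothetical helper, not used by this file): take a
 * window into an existing ABD with abd_get_offset_size(), copy it out, and
 * release the view with abd_put().  The parent ABD must outlive the view.
 */
static void
abd_example_subview(abd_t *sabd, size_t off, size_t len, void *out)
{
	abd_t *part = abd_get_offset_size(sabd, off, len);

	/* Offsets passed to the view are relative to the view itself. */
	abd_copy_to_buf_off(out, part, 0, len);

	abd_put(part);	/* frees only the view; sabd keeps its data */
}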
844
845 /*
846 * Get the raw buffer associated with a linear ABD.
847 */
848 void *
849 abd_to_buf(abd_t *abd)
850 {
851 ASSERT(abd_is_linear(abd));
852 abd_verify(abd);
853 return (abd->abd_u.abd_linear.abd_buf);
854 }
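
/*
 * Illustrative sketch (hypothetical helper): direct access through
 * abd_to_buf() is only legal for linear ABDs, e.g. ones created with
 * abd_alloc_linear().  Consumers that may be handed scatter ABDs should use
 * the abd_borrow_buf*() functions below instead.
 */
static void
abd_example_linear_access(size_t size)
{
	abd_t *abd = abd_alloc_linear(size, B_FALSE);
	char *buf = abd_to_buf(abd);	/* valid: abd is known to be linear */

	(void) memset(buf, 0, size);	/* write through the raw buffer */
	abd_free(abd);
}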
855
856 /*
857 * Borrow a raw buffer from an ABD without copying the contents of the ABD
858 * into the buffer. If the ABD is scattered, this will allocate a raw buffer
859 * whose contents are undefined. To copy over the existing data in the ABD, use
860 * abd_borrow_buf_copy() instead.
861 */
862 void *
863 abd_borrow_buf(abd_t *abd, size_t n)
864 {
865 void *buf;
866 abd_verify(abd);
867 ASSERT3U(abd->abd_size, >=, n);
868 if (abd_is_linear(abd)) {
869 buf = abd_to_buf(abd);
870 } else {
871 buf = zio_buf_alloc(n);
872 }
873 (void) zfs_refcount_add_many(&abd->abd_children, n, buf);
874
875 return (buf);
876 }
877
878 void *
879 abd_borrow_buf_copy(abd_t *abd, size_t n)
880 {
881 void *buf = abd_borrow_buf(abd, n);
882 if (!abd_is_linear(abd)) {
883 abd_copy_to_buf(buf, abd, n);
884 }
885 return (buf);
886 }
887
888 /*
889 * Return a borrowed raw buffer to an ABD. If the ABD is scattered, this will
890 * not change the contents of the ABD and will ASSERT that you didn't modify
891 * the buffer since it was borrowed. If you want any changes you made to buf to
892 * be copied back to abd, use abd_return_buf_copy() instead.
893 */
894 void
895 abd_return_buf(abd_t *abd, void *buf, size_t n)
896 {
897 abd_verify(abd);
898 ASSERT3U(abd->abd_size, >=, n);
899 if (abd_is_linear(abd)) {
900 ASSERT3P(buf, ==, abd_to_buf(abd));
901 } else {
902 ASSERT0(abd_cmp_buf(abd, buf, n));
903 zio_buf_free(buf, n);
904 }
905 (void) zfs_refcount_remove_many(&abd->abd_children, n, buf);
906 }
907
908 void
909 abd_return_buf_copy(abd_t *abd, void *buf, size_t n)
910 {
911 if (!abd_is_linear(abd)) {
912 abd_copy_from_buf(abd, buf, n);
913 }
914 abd_return_buf(abd, buf, n);
915 }
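
/*
 * Illustrative sketch (hypothetical helper): modify an ABD through a
 * borrowed raw buffer.  abd_borrow_buf_copy() provides a linear view of the
 * data, allocating and filling a temporary buffer when the ABD is scattered,
 * and abd_return_buf_copy() copies any modifications back.
 */
static void
abd_example_borrow_modify(abd_t *abd, size_t size)
{
	uint8_t *buf = abd_borrow_buf_copy(abd, size);

	buf[0] ^= 0xff;	/* mutate the data through the linear view */

	abd_return_buf_copy(abd, buf, size);
}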
916
917 /*
918 * Give this ABD ownership of the buffer that it's storing. Can only be used on
919 * linear ABDs which were allocated via abd_get_from_buf(), or ones allocated
920 * with abd_alloc_linear() which subsequently released ownership of their buf
921 * with abd_release_ownership_of_buf().
922 */
923 void
924 abd_take_ownership_of_buf(abd_t *abd, boolean_t is_metadata)
925 {
926 ASSERT(abd_is_linear(abd));
927 ASSERT(!(abd->abd_flags & ABD_FLAG_OWNER));
928 abd_verify(abd);
929
930 abd->abd_flags |= ABD_FLAG_OWNER;
931 if (is_metadata) {
932 abd->abd_flags |= ABD_FLAG_META;
933 }
934
935 ABDSTAT_BUMP(abdstat_linear_cnt);
936 ABDSTAT_INCR(abdstat_linear_data_size, abd->abd_size);
937 }
938
939 void
940 abd_release_ownership_of_buf(abd_t *abd)
941 {
942 ASSERT(abd_is_linear(abd));
943 ASSERT(abd->abd_flags & ABD_FLAG_OWNER);
944 abd_verify(abd);
945
946 abd->abd_flags &= ~ABD_FLAG_OWNER;
947 /* Disable this flag since we no longer own the data buffer */
948 abd->abd_flags &= ~ABD_FLAG_META;
949
950 ABDSTAT_BUMPDOWN(abdstat_linear_cnt);
951 ABDSTAT_INCR(abdstat_linear_data_size, -(int)abd->abd_size);
952 }
953
954 #ifndef HAVE_1ARG_KMAP_ATOMIC
955 #define NR_KM_TYPE (6)
956 #ifdef _KERNEL
957 int km_table[NR_KM_TYPE] = {
958 KM_USER0,
959 KM_USER1,
960 KM_BIO_SRC_IRQ,
961 KM_BIO_DST_IRQ,
962 KM_PTE0,
963 KM_PTE1,
964 };
965 #endif
966 #endif
967
968 struct abd_iter {
969 /* public interface */
970 void *iter_mapaddr; /* addr corresponding to iter_pos */
971 size_t iter_mapsize; /* length of data valid at mapaddr */
972
973 /* private */
974 abd_t *iter_abd; /* ABD being iterated through */
975 size_t iter_pos;
976 size_t iter_offset; /* offset in current sg/abd_buf, */
977 /* abd_offset included */
978 struct scatterlist *iter_sg; /* current sg */
979 #ifndef HAVE_1ARG_KMAP_ATOMIC
980 int iter_km; /* KM_* for kmap_atomic */
981 #endif
982 };
983
984 /*
985 * Initialize the abd_iter.
986 */
987 static void
988 abd_iter_init(struct abd_iter *aiter, abd_t *abd, int km_type)
989 {
990 abd_verify(abd);
991 aiter->iter_abd = abd;
992 aiter->iter_mapaddr = NULL;
993 aiter->iter_mapsize = 0;
994 aiter->iter_pos = 0;
995 if (abd_is_linear(abd)) {
996 aiter->iter_offset = 0;
997 aiter->iter_sg = NULL;
998 } else {
999 aiter->iter_offset = ABD_SCATTER(abd).abd_offset;
1000 aiter->iter_sg = ABD_SCATTER(abd).abd_sgl;
1001 }
1002 #ifndef HAVE_1ARG_KMAP_ATOMIC
1003 ASSERT3U(km_type, <, NR_KM_TYPE);
1004 aiter->iter_km = km_type;
1005 #endif
1006 }
1007
1008 /*
1009 * Advance the iterator by a certain amount. Cannot be called when a chunk is
1010  * in use. This can be safely called when the aiter is already exhausted, in
1011 * which case this does nothing.
1012 */
1013 static void
1014 abd_iter_advance(struct abd_iter *aiter, size_t amount)
1015 {
1016 ASSERT3P(aiter->iter_mapaddr, ==, NULL);
1017 ASSERT0(aiter->iter_mapsize);
1018
1019 /* There's nothing left to advance to, so do nothing */
1020 if (aiter->iter_pos == aiter->iter_abd->abd_size)
1021 return;
1022
1023 aiter->iter_pos += amount;
1024 aiter->iter_offset += amount;
1025 if (!abd_is_linear(aiter->iter_abd)) {
1026 while (aiter->iter_offset >= aiter->iter_sg->length) {
1027 aiter->iter_offset -= aiter->iter_sg->length;
1028 aiter->iter_sg = sg_next(aiter->iter_sg);
1029 if (aiter->iter_sg == NULL) {
1030 ASSERT0(aiter->iter_offset);
1031 break;
1032 }
1033 }
1034 }
1035 }
1036
1037 /*
1038 * Map the current chunk into aiter. This can be safely called when the aiter
1039  * is already exhausted, in which case this does nothing.
1040 */
1041 static void
1042 abd_iter_map(struct abd_iter *aiter)
1043 {
1044 void *paddr;
1045 size_t offset = 0;
1046
1047 ASSERT3P(aiter->iter_mapaddr, ==, NULL);
1048 ASSERT0(aiter->iter_mapsize);
1049
1050 /* There's nothing left to iterate over, so do nothing */
1051 if (aiter->iter_pos == aiter->iter_abd->abd_size)
1052 return;
1053
1054 if (abd_is_linear(aiter->iter_abd)) {
1055 ASSERT3U(aiter->iter_pos, ==, aiter->iter_offset);
1056 offset = aiter->iter_offset;
1057 aiter->iter_mapsize = aiter->iter_abd->abd_size - offset;
1058 paddr = aiter->iter_abd->abd_u.abd_linear.abd_buf;
1059 } else {
1060 offset = aiter->iter_offset;
1061 aiter->iter_mapsize = MIN(aiter->iter_sg->length - offset,
1062 aiter->iter_abd->abd_size - aiter->iter_pos);
1063
1064 paddr = zfs_kmap_atomic(sg_page(aiter->iter_sg),
1065 km_table[aiter->iter_km]);
1066 }
1067
1068 aiter->iter_mapaddr = (char *)paddr + offset;
1069 }
1070
1071 /*
1072 * Unmap the current chunk from aiter. This can be safely called when the aiter
1073  * is already exhausted, in which case this does nothing.
1074 */
1075 static void
1076 abd_iter_unmap(struct abd_iter *aiter)
1077 {
1078 /* There's nothing left to unmap, so do nothing */
1079 if (aiter->iter_pos == aiter->iter_abd->abd_size)
1080 return;
1081
1082 if (!abd_is_linear(aiter->iter_abd)) {
1083 /* LINTED E_FUNC_SET_NOT_USED */
1084 zfs_kunmap_atomic(aiter->iter_mapaddr - aiter->iter_offset,
1085 km_table[aiter->iter_km]);
1086 }
1087
1088 ASSERT3P(aiter->iter_mapaddr, !=, NULL);
1089 ASSERT3U(aiter->iter_mapsize, >, 0);
1090
1091 aiter->iter_mapaddr = NULL;
1092 aiter->iter_mapsize = 0;
1093 }
1094
1095 int
1096 abd_iterate_func(abd_t *abd, size_t off, size_t size,
1097 abd_iter_func_t *func, void *private)
1098 {
1099 int ret = 0;
1100 struct abd_iter aiter;
1101
1102 abd_verify(abd);
1103 ASSERT3U(off + size, <=, abd->abd_size);
1104
1105 abd_iter_init(&aiter, abd, 0);
1106 abd_iter_advance(&aiter, off);
1107
1108 while (size > 0) {
1109 abd_iter_map(&aiter);
1110
1111 size_t len = MIN(aiter.iter_mapsize, size);
1112 ASSERT3U(len, >, 0);
1113
1114 ret = func(aiter.iter_mapaddr, len, private);
1115
1116 abd_iter_unmap(&aiter);
1117
1118 if (ret != 0)
1119 break;
1120
1121 size -= len;
1122 abd_iter_advance(&aiter, len);
1123 }
1124
1125 return (ret);
1126 }
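
/*
 * Illustrative sketch (hypothetical callback and helper, not used by the
 * implementation): abd_iterate_func() invokes the callback once per mapped
 * segment with a raw pointer and length.  Here the callback accumulates a
 * simple byte sum into *private; returning non-zero would stop the
 * iteration early.
 */
static int
abd_example_sum_cb(void *buf, size_t size, void *private)
{
	uint64_t *sum = private;
	uint8_t *p = buf;
	size_t i;

	for (i = 0; i < size; i++)
		*sum += p[i];

	return (0);
}

static uint64_t
abd_example_sum(abd_t *abd, size_t off, size_t size)
{
	uint64_t sum = 0;

	(void) abd_iterate_func(abd, off, size, abd_example_sum_cb, &sum);

	return (sum);
}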
1127
1128 struct buf_arg {
1129 void *arg_buf;
1130 };
1131
1132 static int
1133 abd_copy_to_buf_off_cb(void *buf, size_t size, void *private)
1134 {
1135 struct buf_arg *ba_ptr = private;
1136
1137 (void) memcpy(ba_ptr->arg_buf, buf, size);
1138 ba_ptr->arg_buf = (char *)ba_ptr->arg_buf + size;
1139
1140 return (0);
1141 }
1142
1143 /*
1144 * Copy abd to buf. (off is the offset in abd.)
1145 */
1146 void
1147 abd_copy_to_buf_off(void *buf, abd_t *abd, size_t off, size_t size)
1148 {
1149 struct buf_arg ba_ptr = { buf };
1150
1151 (void) abd_iterate_func(abd, off, size, abd_copy_to_buf_off_cb,
1152 &ba_ptr);
1153 }
1154
1155 static int
1156 abd_cmp_buf_off_cb(void *buf, size_t size, void *private)
1157 {
1158 int ret;
1159 struct buf_arg *ba_ptr = private;
1160
1161 ret = memcmp(buf, ba_ptr->arg_buf, size);
1162 ba_ptr->arg_buf = (char *)ba_ptr->arg_buf + size;
1163
1164 return (ret);
1165 }
1166
1167 /*
1168 * Compare the contents of abd to buf. (off is the offset in abd.)
1169 */
1170 int
1171 abd_cmp_buf_off(abd_t *abd, const void *buf, size_t off, size_t size)
1172 {
1173 struct buf_arg ba_ptr = { (void *) buf };
1174
1175 return (abd_iterate_func(abd, off, size, abd_cmp_buf_off_cb, &ba_ptr));
1176 }
1177
1178 static int
1179 abd_copy_from_buf_off_cb(void *buf, size_t size, void *private)
1180 {
1181 struct buf_arg *ba_ptr = private;
1182
1183 (void) memcpy(buf, ba_ptr->arg_buf, size);
1184 ba_ptr->arg_buf = (char *)ba_ptr->arg_buf + size;
1185
1186 return (0);
1187 }
1188
1189 /*
1190 * Copy from buf to abd. (off is the offset in abd.)
1191 */
1192 void
1193 abd_copy_from_buf_off(abd_t *abd, const void *buf, size_t off, size_t size)
1194 {
1195 struct buf_arg ba_ptr = { (void *) buf };
1196
1197 (void) abd_iterate_func(abd, off, size, abd_copy_from_buf_off_cb,
1198 &ba_ptr);
1199 }
1200
1201 /*ARGSUSED*/
1202 static int
1203 abd_zero_off_cb(void *buf, size_t size, void *private)
1204 {
1205 (void) memset(buf, 0, size);
1206 return (0);
1207 }
1208
1209 /*
1210 * Zero out the abd from a particular offset to the end.
1211 */
1212 void
1213 abd_zero_off(abd_t *abd, size_t off, size_t size)
1214 {
1215 (void) abd_iterate_func(abd, off, size, abd_zero_off_cb, NULL);
1216 }
1217
1218 /*
1219 * Iterate over two ABDs and call func incrementally on the two ABDs' data in
1220 * equal-sized chunks (passed to func as raw buffers). func could be called many
1221 * times during this iteration.
1222 */
1223 int
1224 abd_iterate_func2(abd_t *dabd, abd_t *sabd, size_t doff, size_t soff,
1225 size_t size, abd_iter_func2_t *func, void *private)
1226 {
1227 int ret = 0;
1228 struct abd_iter daiter, saiter;
1229
1230 abd_verify(dabd);
1231 abd_verify(sabd);
1232
1233 ASSERT3U(doff + size, <=, dabd->abd_size);
1234 ASSERT3U(soff + size, <=, sabd->abd_size);
1235
1236 abd_iter_init(&daiter, dabd, 0);
1237 abd_iter_init(&saiter, sabd, 1);
1238 abd_iter_advance(&daiter, doff);
1239 abd_iter_advance(&saiter, soff);
1240
1241 while (size > 0) {
1242 abd_iter_map(&daiter);
1243 abd_iter_map(&saiter);
1244
1245 size_t dlen = MIN(daiter.iter_mapsize, size);
1246 size_t slen = MIN(saiter.iter_mapsize, size);
1247 size_t len = MIN(dlen, slen);
1248 ASSERT(dlen > 0 || slen > 0);
1249
1250 ret = func(daiter.iter_mapaddr, saiter.iter_mapaddr, len,
1251 private);
1252
1253 abd_iter_unmap(&saiter);
1254 abd_iter_unmap(&daiter);
1255
1256 if (ret != 0)
1257 break;
1258
1259 size -= len;
1260 abd_iter_advance(&daiter, len);
1261 abd_iter_advance(&saiter, len);
1262 }
1263
1264 return (ret);
1265 }
1266
1267 /*ARGSUSED*/
1268 static int
1269 abd_copy_off_cb(void *dbuf, void *sbuf, size_t size, void *private)
1270 {
1271 (void) memcpy(dbuf, sbuf, size);
1272 return (0);
1273 }
1274
1275 /*
1276 * Copy from sabd to dabd starting from soff and doff.
1277 */
1278 void
1279 abd_copy_off(abd_t *dabd, abd_t *sabd, size_t doff, size_t soff, size_t size)
1280 {
1281 (void) abd_iterate_func2(dabd, sabd, doff, soff, size,
1282 abd_copy_off_cb, NULL);
1283 }
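
/*
 * Illustrative sketch (hypothetical helper): duplicate an ABD by allocating
 * a new one of the same type with abd_alloc_sametype() and copying the
 * contents across with abd_copy_off().  The caller owns the returned ABD
 * and must release it with abd_free().
 */
static abd_t *
abd_example_dup(abd_t *sabd)
{
	abd_t *dabd = abd_alloc_sametype(sabd, sabd->abd_size);

	abd_copy_off(dabd, sabd, 0, 0, sabd->abd_size);

	return (dabd);
}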
1284
1285 /*ARGSUSED*/
1286 static int
1287 abd_cmp_cb(void *bufa, void *bufb, size_t size, void *private)
1288 {
1289 return (memcmp(bufa, bufb, size));
1290 }
1291
1292 /*
1293 * Compares the contents of two ABDs.
1294 */
1295 int
1296 abd_cmp(abd_t *dabd, abd_t *sabd)
1297 {
1298 ASSERT3U(dabd->abd_size, ==, sabd->abd_size);
1299 return (abd_iterate_func2(dabd, sabd, 0, 0, dabd->abd_size,
1300 abd_cmp_cb, NULL));
1301 }
1302
1303 /*
1304 * Iterate over code ABDs and a data ABD and call @func_raidz_gen.
1305 *
1306 * @cabds parity ABDs, must have equal size
1307 * @dabd data ABD. Can be NULL (in this case @dsize = 0)
1308  * @func_raidz_gen	should be implemented so that its behaviour is the
1309  *			same whether it is given linear or scatter buffers
1310 */
1311 void
1312 abd_raidz_gen_iterate(abd_t **cabds, abd_t *dabd,
1313 ssize_t csize, ssize_t dsize, const unsigned parity,
1314 void (*func_raidz_gen)(void **, const void *, size_t, size_t))
1315 {
1316 int i;
1317 ssize_t len, dlen;
1318 struct abd_iter caiters[3];
1319 struct abd_iter daiter = {0};
1320 void *caddrs[3];
1321 unsigned long flags;
1322
1323 ASSERT3U(parity, <=, 3);
1324
1325 for (i = 0; i < parity; i++)
1326 abd_iter_init(&caiters[i], cabds[i], i);
1327
1328 if (dabd)
1329 abd_iter_init(&daiter, dabd, i);
1330
1331 ASSERT3S(dsize, >=, 0);
1332
1333 local_irq_save(flags);
1334 while (csize > 0) {
1335 len = csize;
1336
1337 if (dabd && dsize > 0)
1338 abd_iter_map(&daiter);
1339
1340 for (i = 0; i < parity; i++) {
1341 abd_iter_map(&caiters[i]);
1342 caddrs[i] = caiters[i].iter_mapaddr;
1343 }
1344
1345 switch (parity) {
1346 case 3:
1347 len = MIN(caiters[2].iter_mapsize, len);
1348 case 2:
1349 len = MIN(caiters[1].iter_mapsize, len);
1350 case 1:
1351 len = MIN(caiters[0].iter_mapsize, len);
1352 }
1353
1354 /* must be progressive */
1355 ASSERT3S(len, >, 0);
1356
1357 if (dabd && dsize > 0) {
1358 /* this needs precise iter.length */
1359 len = MIN(daiter.iter_mapsize, len);
1360 dlen = len;
1361 } else
1362 dlen = 0;
1363
1364 /* must be progressive */
1365 ASSERT3S(len, >, 0);
1366 /*
1367		 * The iterated function will likely not do well unless every
1368		 * segment, except possibly the last, is a multiple of 512 (raidz).
1369 */
1370 ASSERT3U(((uint64_t)len & 511ULL), ==, 0);
1371
1372 func_raidz_gen(caddrs, daiter.iter_mapaddr, len, dlen);
1373
1374 for (i = parity-1; i >= 0; i--) {
1375 abd_iter_unmap(&caiters[i]);
1376 abd_iter_advance(&caiters[i], len);
1377 }
1378
1379 if (dabd && dsize > 0) {
1380 abd_iter_unmap(&daiter);
1381 abd_iter_advance(&daiter, dlen);
1382 dsize -= dlen;
1383 }
1384
1385 csize -= len;
1386
1387 ASSERT3S(dsize, >=, 0);
1388 ASSERT3S(csize, >=, 0);
1389 }
1390 local_irq_restore(flags);
1391 }
1392
1393 /*
1394 * Iterate over code ABDs and data reconstruction target ABDs and call
1395 * @func_raidz_rec. Function maps at most 6 pages atomically.
1396 *
1397 * @cabds parity ABDs, must have equal size
1398 * @tabds rec target ABDs, at most 3
1399 * @tsize size of data target columns
1400 * @func_raidz_rec expects syndrome data in target columns. Function
1401 * reconstructs data and overwrites target columns.
1402 */
1403 void
1404 abd_raidz_rec_iterate(abd_t **cabds, abd_t **tabds,
1405 ssize_t tsize, const unsigned parity,
1406 void (*func_raidz_rec)(void **t, const size_t tsize, void **c,
1407 const unsigned *mul),
1408 const unsigned *mul)
1409 {
1410 int i;
1411 ssize_t len;
1412 struct abd_iter citers[3];
1413 struct abd_iter xiters[3];
1414 void *caddrs[3], *xaddrs[3];
1415 unsigned long flags;
1416
1417 ASSERT3U(parity, <=, 3);
1418
1419 for (i = 0; i < parity; i++) {
1420 abd_iter_init(&citers[i], cabds[i], 2*i);
1421 abd_iter_init(&xiters[i], tabds[i], 2*i+1);
1422 }
1423
1424 local_irq_save(flags);
1425 while (tsize > 0) {
1426
1427 for (i = 0; i < parity; i++) {
1428 abd_iter_map(&citers[i]);
1429 abd_iter_map(&xiters[i]);
1430 caddrs[i] = citers[i].iter_mapaddr;
1431 xaddrs[i] = xiters[i].iter_mapaddr;
1432 }
1433
1434 len = tsize;
1435 switch (parity) {
1436 case 3:
1437 len = MIN(xiters[2].iter_mapsize, len);
1438 len = MIN(citers[2].iter_mapsize, len);
1439 case 2:
1440 len = MIN(xiters[1].iter_mapsize, len);
1441 len = MIN(citers[1].iter_mapsize, len);
1442 case 1:
1443 len = MIN(xiters[0].iter_mapsize, len);
1444 len = MIN(citers[0].iter_mapsize, len);
1445 }
1446 /* must be progressive */
1447 ASSERT3S(len, >, 0);
1448 /*
1449		 * The iterated function will likely not do well unless every
1450		 * segment, except possibly the last, is a multiple of 512 (raidz).
1451 */
1452 ASSERT3U(((uint64_t)len & 511ULL), ==, 0);
1453
1454 func_raidz_rec(xaddrs, len, caddrs, mul);
1455
1456 for (i = parity-1; i >= 0; i--) {
1457 abd_iter_unmap(&xiters[i]);
1458 abd_iter_unmap(&citers[i]);
1459 abd_iter_advance(&xiters[i], len);
1460 abd_iter_advance(&citers[i], len);
1461 }
1462
1463 tsize -= len;
1464 ASSERT3S(tsize, >=, 0);
1465 }
1466 local_irq_restore(flags);
1467 }
1468
1469 #if defined(_KERNEL)
1470 /*
1471 * bio_nr_pages for ABD.
1472 * @off is the offset in @abd
1473 */
1474 unsigned long
1475 abd_nr_pages_off(abd_t *abd, unsigned int size, size_t off)
1476 {
1477 unsigned long pos;
1478
1479 if (abd_is_linear(abd))
1480 pos = (unsigned long)abd_to_buf(abd) + off;
1481 else
1482 pos = abd->abd_u.abd_scatter.abd_offset + off;
1483
1484 return ((pos + size + PAGESIZE - 1) >> PAGE_SHIFT) -
1485 (pos >> PAGE_SHIFT);
1486 }
1487
1488 /*
1489 * bio_map for scatter ABD.
1490 * @off is the offset in @abd
1491 * Remaining IO size is returned
1492 */
1493 unsigned int
1494 abd_scatter_bio_map_off(struct bio *bio, abd_t *abd,
1495 unsigned int io_size, size_t off)
1496 {
1497 int i;
1498 struct abd_iter aiter;
1499
1500 ASSERT(!abd_is_linear(abd));
1501 ASSERT3U(io_size, <=, abd->abd_size - off);
1502
1503 abd_iter_init(&aiter, abd, 0);
1504 abd_iter_advance(&aiter, off);
1505
1506 for (i = 0; i < bio->bi_max_vecs; i++) {
1507 struct page *pg;
1508 size_t len, sgoff, pgoff;
1509 struct scatterlist *sg;
1510
1511 if (io_size <= 0)
1512 break;
1513
1514 sg = aiter.iter_sg;
1515 sgoff = aiter.iter_offset;
1516 pgoff = sgoff & (PAGESIZE - 1);
1517 len = MIN(io_size, PAGESIZE - pgoff);
1518 ASSERT(len > 0);
1519
1520 pg = nth_page(sg_page(sg), sgoff >> PAGE_SHIFT);
1521 if (bio_add_page(bio, pg, len, pgoff) != len)
1522 break;
1523
1524 io_size -= len;
1525 abd_iter_advance(&aiter, len);
1526 }
1527
1528 return (io_size);
1529 }
1530
1531 /* Tunable Parameters */
1532 module_param(zfs_abd_scatter_enabled, int, 0644);
1533 MODULE_PARM_DESC(zfs_abd_scatter_enabled,
1534 "Toggle whether ABD allocations must be linear.");
1535 /* CSTYLED */
1536 module_param(zfs_abd_scatter_max_order, uint, 0644);
1537 MODULE_PARM_DESC(zfs_abd_scatter_max_order,
1538 "Maximum order allocation used for a scatter ABD.");
1539 #endif