4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
22 * Copyright (c) 2014 by Chunwei Chen. All rights reserved.
23 * Copyright (c) 2016 by Delphix. All rights reserved.
27 * ARC buffer data (ABD).
29 * ABDs are an abstract data structure for the ARC which can use two
30 * different ways of storing the underlying data:
32 * (a) Linear buffer. In this case, all the data in the ABD is stored in one
33 * contiguous buffer in memory (from a zio_[data_]buf_* kmem cache).
35 * +-------------------+
38 * | abd_size = ... | +--------------------------------+
39 * | abd_buf ------------->| raw buffer of size abd_size |
40 * +-------------------+ +--------------------------------+
43 * (b) Scattered buffer. In this case, the data in the ABD is split into
44 * equal-sized chunks (from the abd_chunk_cache kmem_cache), with pointers
45 * to the chunks recorded in an array at the end of the ABD structure.
47 * +-------------------+
51 * | abd_offset = 0 | +-----------+
52 * | abd_chunks[0] ----------------------------->| chunk 0 |
53 * | abd_chunks[1] ---------------------+ +-----------+
54 * | ... | | +-----------+
55 * | abd_chunks[N-1] ---------+ +------->| chunk 1 |
56 * +-------------------+ | +-----------+
59 * +----------------->| chunk N-1 |
62 * Linear buffers act exactly like normal buffers and are always mapped into the
63 * kernel's virtual memory space, while scattered ABD data chunks are allocated
64 * as physical pages and then mapped in only while they are actually being
65 * accessed through one of the abd_* library functions. Using scattered ABDs
66 * provides several benefits:
68 * (1) They avoid use of kmem_*, preventing performance problems where running
69 * kmem_reap on very large memory systems never finishes and causes
70 * constant TLB shootdowns.
72 * (2) Fragmentation is less of an issue since when we are at the limit of
73 * allocatable space, we won't have to search around for a long free
74 * hole in the VA space for large ARC allocations. Each chunk is mapped in
75 * individually, so even if we weren't using segkpm (see next point) we
76 * wouldn't need to worry about finding a contiguous address range.
78 * (3) Use of segkpm will avoid the need for map / unmap / TLB shootdown costs
79 * on each ABD access. (If segkpm isn't available then we use all linear
80 * ABDs to avoid this penalty.) See seg_kpm.c for more details.
82 * It is possible to make all ABDs linear by setting zfs_abd_scatter_enabled to
83 * B_FALSE. However, it is not possible to use scattered ABDs if segkpm is not
84 * available, which is the case on all 32-bit systems and any 64-bit systems
85 * where kpm_enable is turned off.
87 * In addition to directly allocating a linear or scattered ABD, it is also
88 * possible to create an ABD by requesting the "sub-ABD" starting at an offset
89 * within an existing ABD. In linear buffers this is simple (set abd_buf of
90 * the new ABD to the starting point within the original raw buffer), but
91 * scattered ABDs are a little more complex. The new ABD makes a copy of the
92 * relevant abd_chunks pointers (but not the underlying data). However, to
93 * provide arbitrary rather than only chunk-aligned starting offsets, it also
94 * tracks an abd_offset field which represents the starting point of the data
95 * within the first chunk in abd_chunks. For both linear and scattered ABDs,
96 * creating an offset ABD marks the original ABD as the offset's parent, and the
97 * original ABD's abd_children refcount is incremented. This data allows us to
98 * ensure the root ABD isn't deleted before its children.
100 * Most consumers should never need to know what type of ABD they're using --
101 * the ABD public API ensures that it's possible to transparently switch from
102 * using a linear ABD to a scattered one when doing so would be beneficial.
104 * If you need to use the data within an ABD directly, if you know it's linear
105 * (because you allocated it) you can use abd_to_buf() to access the underlying
106 * raw buffer. Otherwise, you should use one of the abd_borrow_buf* functions
107 * which will allocate a raw buffer if necessary. Use the abd_return_buf*
108 * functions to return any raw buffers that are no longer necessary when you're
111 * There are a variety of ABD APIs that implement basic buffer operations:
112 * compare, copy, read, write, and fill with zeroes. If you need a custom
113 * function which progressively accesses the whole ABD, use the abd_iterate_*
118 #include <sys/param.h>
120 #include <sys/zfs_context.h>
121 #include <sys/zfs_znode.h>
124 #define KMC_NOTOUCH 0
127 typedef struct abd_stats
{
128 kstat_named_t abdstat_struct_size
;
129 kstat_named_t abdstat_scatter_cnt
;
130 kstat_named_t abdstat_scatter_data_size
;
131 kstat_named_t abdstat_scatter_chunk_waste
;
132 kstat_named_t abdstat_linear_cnt
;
133 kstat_named_t abdstat_linear_data_size
;
136 static abd_stats_t abd_stats
= {
137 /* Amount of memory occupied by all of the abd_t struct allocations */
138 { "struct_size", KSTAT_DATA_UINT64
},
140 * The number of scatter ABDs which are currently allocated, excluding
141 * ABDs which don't own their data (for instance the ones which were
142 * allocated through abd_get_offset()).
144 { "scatter_cnt", KSTAT_DATA_UINT64
},
145 /* Amount of data stored in all scatter ABDs tracked by scatter_cnt */
146 { "scatter_data_size", KSTAT_DATA_UINT64
},
148 * The amount of space wasted at the end of the last chunk across all
149 * scatter ABDs tracked by scatter_cnt.
151 { "scatter_chunk_waste", KSTAT_DATA_UINT64
},
153 * The number of linear ABDs which are currently allocated, excluding
154 * ABDs which don't own their data (for instance the ones which were
155 * allocated through abd_get_offset() and abd_get_from_buf()). If an
156 * ABD takes ownership of its buf then it will become tracked.
158 { "linear_cnt", KSTAT_DATA_UINT64
},
159 /* Amount of data stored in all linear ABDs tracked by linear_cnt */
160 { "linear_data_size", KSTAT_DATA_UINT64
},
/* Accessor and atomic mutator helpers for the abd_stats counters above */
#define	ABDSTAT(stat)		(abd_stats.stat.value.ui64)
#define	ABDSTAT_INCR(stat, val) \
	atomic_add_64(&abd_stats.stat.value.ui64, (val))
#define	ABDSTAT_BUMP(stat)	ABDSTAT_INCR(stat, 1)
#define	ABDSTAT_BUMPDOWN(stat)	ABDSTAT_INCR(stat, -1)
169 /* see block comment above for description */
170 int zfs_abd_scatter_enabled
= B_TRUE
;
174 static kstat_t
*abd_ksp
;
177 abd_alloc_chunk(void)
179 struct page
*c
= alloc_page(kmem_flags_convert(KM_SLEEP
));
180 ASSERT3P(c
, !=, NULL
);
/* Release a chunk page previously obtained from abd_alloc_chunk(). */
static void
abd_free_chunk(struct page *c)
{
	__free_pages(c, 0);
}
/* Map a chunk page into the kernel virtual address space. */
static void *
abd_map_chunk(struct page *c)
{
	/*
	 * Use of segkpm means we don't care if this is mapped S_READ or S_WRITE
	 * but S_WRITE is conceptually more accurate.
	 */
	return (kmap(c));
}
/* Undo the mapping established by abd_map_chunk(). */
static void
abd_unmap_chunk(struct page *c)
{
	kunmap(c);
}
209 abd_ksp
= kstat_create("zfs", 0, "abdstats", "misc", KSTAT_TYPE_NAMED
,
210 sizeof (abd_stats
) / sizeof (kstat_named_t
), KSTAT_FLAG_VIRTUAL
);
211 if (abd_ksp
!= NULL
) {
212 abd_ksp
->ks_data
= &abd_stats
;
213 kstat_install(abd_ksp
);
220 if (abd_ksp
!= NULL
) {
221 kstat_delete(abd_ksp
);
/*
 * Userland variants: chunks are page-sized, page-aligned umem buffers and
 * "mapping" is the identity, since user memory is always addressable.
 */
#define	abd_alloc_chunk() \
	((struct page *)umem_alloc_aligned(PAGESIZE, 64, KM_SLEEP))
#define	abd_free_chunk(chunk)	umem_free(chunk, PAGESIZE)
#define	abd_map_chunk(chunk)	((void *)chunk)

static void
abd_unmap_chunk(struct page *c)
{
}
252 abd_chunkcnt_for_bytes(size_t size
)
254 return (P2ROUNDUP(size
, PAGESIZE
) / PAGESIZE
);
258 abd_scatter_chunkcnt(abd_t
*abd
)
260 ASSERT(!abd_is_linear(abd
));
261 return (abd_chunkcnt_for_bytes(
262 abd
->abd_u
.abd_scatter
.abd_offset
+ abd
->abd_size
));
266 abd_verify(abd_t
*abd
)
268 ASSERT3U(abd
->abd_size
, >, 0);
269 ASSERT3U(abd
->abd_size
, <=, SPA_MAXBLOCKSIZE
);
270 ASSERT3U(abd
->abd_flags
, ==, abd
->abd_flags
& (ABD_FLAG_LINEAR
|
271 ABD_FLAG_OWNER
| ABD_FLAG_META
));
272 IMPLY(abd
->abd_parent
!= NULL
, !(abd
->abd_flags
& ABD_FLAG_OWNER
));
273 IMPLY(abd
->abd_flags
& ABD_FLAG_META
, abd
->abd_flags
& ABD_FLAG_OWNER
);
274 if (abd_is_linear(abd
)) {
275 ASSERT3P(abd
->abd_u
.abd_linear
.abd_buf
, !=, NULL
);
280 ASSERT3U(abd
->abd_u
.abd_scatter
.abd_offset
, <, PAGESIZE
);
281 n
= abd_scatter_chunkcnt(abd
);
282 for (i
= 0; i
< n
; i
++) {
284 abd
->abd_u
.abd_scatter
.abd_chunks
[i
], !=, NULL
);
289 static inline abd_t
*
290 abd_alloc_struct(size_t chunkcnt
)
292 size_t size
= offsetof(abd_t
, abd_u
.abd_scatter
.abd_chunks
[chunkcnt
]);
293 abd_t
*abd
= kmem_alloc(size
, KM_PUSHPAGE
);
294 ASSERT3P(abd
, !=, NULL
);
295 ABDSTAT_INCR(abdstat_struct_size
, size
);
301 abd_free_struct(abd_t
*abd
)
303 size_t chunkcnt
= abd_is_linear(abd
) ? 0 : abd_scatter_chunkcnt(abd
);
304 int size
= offsetof(abd_t
, abd_u
.abd_scatter
.abd_chunks
[chunkcnt
]);
305 kmem_free(abd
, size
);
306 ABDSTAT_INCR(abdstat_struct_size
, -size
);
310 * Allocate an ABD, along with its own underlying data buffers. Use this if you
311 * don't care whether the ABD is linear or not.
314 abd_alloc(size_t size
, boolean_t is_metadata
)
320 if (!zfs_abd_scatter_enabled
)
321 return (abd_alloc_linear(size
, is_metadata
));
323 VERIFY3U(size
, <=, SPA_MAXBLOCKSIZE
);
325 n
= abd_chunkcnt_for_bytes(size
);
326 abd
= abd_alloc_struct(n
);
328 abd
->abd_flags
= ABD_FLAG_OWNER
;
330 abd
->abd_flags
|= ABD_FLAG_META
;
332 abd
->abd_size
= size
;
333 abd
->abd_parent
= NULL
;
334 refcount_create(&abd
->abd_children
);
336 abd
->abd_u
.abd_scatter
.abd_offset
= 0;
337 abd
->abd_u
.abd_scatter
.abd_chunk_size
= PAGESIZE
;
339 for (i
= 0; i
< n
; i
++) {
340 void *c
= abd_alloc_chunk();
341 ASSERT3P(c
, !=, NULL
);
342 abd
->abd_u
.abd_scatter
.abd_chunks
[i
] = c
;
345 ABDSTAT_BUMP(abdstat_scatter_cnt
);
346 ABDSTAT_INCR(abdstat_scatter_data_size
, size
);
347 ABDSTAT_INCR(abdstat_scatter_chunk_waste
,
348 n
* PAGESIZE
- size
);
354 abd_free_scatter(abd_t
*abd
)
356 size_t n
= abd_scatter_chunkcnt(abd
);
359 for (i
= 0; i
< n
; i
++) {
360 abd_free_chunk(abd
->abd_u
.abd_scatter
.abd_chunks
[i
]);
363 refcount_destroy(&abd
->abd_children
);
364 ABDSTAT_BUMPDOWN(abdstat_scatter_cnt
);
365 ABDSTAT_INCR(abdstat_scatter_data_size
, -(int)abd
->abd_size
);
366 ABDSTAT_INCR(abdstat_scatter_chunk_waste
,
367 abd
->abd_size
- n
* PAGESIZE
);
369 abd_free_struct(abd
);
373 * Allocate an ABD that must be linear, along with its own underlying data
374 * buffer. Only use this when it would be very annoying to write your ABD
375 * consumer with a scattered ABD.
378 abd_alloc_linear(size_t size
, boolean_t is_metadata
)
380 abd_t
*abd
= abd_alloc_struct(0);
382 VERIFY3U(size
, <=, SPA_MAXBLOCKSIZE
);
384 abd
->abd_flags
= ABD_FLAG_LINEAR
| ABD_FLAG_OWNER
;
386 abd
->abd_flags
|= ABD_FLAG_META
;
388 abd
->abd_size
= size
;
389 abd
->abd_parent
= NULL
;
390 refcount_create(&abd
->abd_children
);
393 abd
->abd_u
.abd_linear
.abd_buf
= zio_buf_alloc(size
);
395 abd
->abd_u
.abd_linear
.abd_buf
= zio_data_buf_alloc(size
);
398 ABDSTAT_BUMP(abdstat_linear_cnt
);
399 ABDSTAT_INCR(abdstat_linear_data_size
, size
);
405 abd_free_linear(abd_t
*abd
)
407 if (abd
->abd_flags
& ABD_FLAG_META
) {
408 zio_buf_free(abd
->abd_u
.abd_linear
.abd_buf
, abd
->abd_size
);
410 zio_data_buf_free(abd
->abd_u
.abd_linear
.abd_buf
, abd
->abd_size
);
413 refcount_destroy(&abd
->abd_children
);
414 ABDSTAT_BUMPDOWN(abdstat_linear_cnt
);
415 ABDSTAT_INCR(abdstat_linear_data_size
, -(int)abd
->abd_size
);
417 abd_free_struct(abd
);
421 * Free an ABD. Only use this on ABDs allocated with abd_alloc() or
422 * abd_alloc_linear().
428 ASSERT3P(abd
->abd_parent
, ==, NULL
);
429 ASSERT(abd
->abd_flags
& ABD_FLAG_OWNER
);
430 if (abd_is_linear(abd
))
431 abd_free_linear(abd
);
433 abd_free_scatter(abd
);
437 * Allocate an ABD of the same format (same metadata flag, same scatterize
438 * setting) as another ABD.
441 abd_alloc_sametype(abd_t
*sabd
, size_t size
)
443 boolean_t is_metadata
= (sabd
->abd_flags
| ABD_FLAG_META
) != 0;
444 if (abd_is_linear(sabd
)) {
445 return (abd_alloc_linear(size
, is_metadata
));
447 return (abd_alloc(size
, is_metadata
));
452 * If we're going to use this ABD for doing I/O using the block layer, the
453 * consumer of the ABD data doesn't care if it's scattered or not, and we don't
454 * plan to store this ABD in memory for a long period of time, we should
455 * allocate the ABD type that requires the least data copying to do the I/O.
457 * On Illumos this is linear ABDs, however if ldi_strategy() can ever issue I/Os
458 * using a scatter/gather list we should switch to that and replace this call
459 * with vanilla abd_alloc().
461 * On Linux the optimal thing to do would be to use abd_get_offset() and
462 * construct a new ABD which shares the original pages thereby eliminating
463 * the copy. But for the moment a new linear ABD is allocated until this
464 * performance optimization can be implemented.
467 abd_alloc_for_io(size_t size
, boolean_t is_metadata
)
469 return (abd_alloc_linear(size
, is_metadata
));
473 * Allocate a new ABD to point to offset off of sabd. It shares the underlying
474 * buffer data with sabd. Use abd_put() to free. sabd must not be freed while
475 * any derived ABDs exist.
477 static inline abd_t
*
478 abd_get_offset_impl(abd_t
*sabd
, size_t off
, size_t size
)
483 ASSERT3U(off
, <=, sabd
->abd_size
);
485 if (abd_is_linear(sabd
)) {
486 abd
= abd_alloc_struct(0);
489 * Even if this buf is filesystem metadata, we only track that
490 * if we own the underlying data buffer, which is not true in
491 * this case. Therefore, we don't ever use ABD_FLAG_META here.
493 abd
->abd_flags
= ABD_FLAG_LINEAR
;
495 abd
->abd_u
.abd_linear
.abd_buf
=
496 (char *)sabd
->abd_u
.abd_linear
.abd_buf
+ off
;
498 size_t new_offset
= sabd
->abd_u
.abd_scatter
.abd_offset
+ off
;
499 size_t chunkcnt
= abd_chunkcnt_for_bytes(size
+
500 new_offset
% PAGESIZE
);
502 abd
= abd_alloc_struct(chunkcnt
);
505 * Even if this buf is filesystem metadata, we only track that
506 * if we own the underlying data buffer, which is not true in
507 * this case. Therefore, we don't ever use ABD_FLAG_META here.
511 abd
->abd_u
.abd_scatter
.abd_offset
= new_offset
% PAGESIZE
;
512 abd
->abd_u
.abd_scatter
.abd_chunk_size
= PAGESIZE
;
514 /* Copy the scatterlist starting at the correct offset */
515 (void) memcpy(&abd
->abd_u
.abd_scatter
.abd_chunks
,
516 &sabd
->abd_u
.abd_scatter
.abd_chunks
[new_offset
/ PAGESIZE
],
517 chunkcnt
* sizeof (void *));
520 abd
->abd_size
= size
;
521 abd
->abd_parent
= sabd
;
522 refcount_create(&abd
->abd_children
);
523 (void) refcount_add_many(&sabd
->abd_children
, abd
->abd_size
, abd
);
529 abd_get_offset(abd_t
*sabd
, size_t off
)
531 size_t size
= sabd
->abd_size
> off
? sabd
->abd_size
- off
: 0;
533 VERIFY3U(size
, >, 0);
535 return (abd_get_offset_impl(sabd
, off
, size
));
539 abd_get_offset_size(abd_t
*sabd
, size_t off
, size_t size
)
541 ASSERT3U(off
+ size
, <=, sabd
->abd_size
);
543 return (abd_get_offset_impl(sabd
, off
, size
));
547 * Allocate a linear ABD structure for buf. You must free this with abd_put()
548 * since the resulting ABD doesn't own its own buffer.
551 abd_get_from_buf(void *buf
, size_t size
)
553 abd_t
*abd
= abd_alloc_struct(0);
555 VERIFY3U(size
, <=, SPA_MAXBLOCKSIZE
);
558 * Even if this buf is filesystem metadata, we only track that if we
559 * own the underlying data buffer, which is not true in this case.
560 * Therefore, we don't ever use ABD_FLAG_META here.
562 abd
->abd_flags
= ABD_FLAG_LINEAR
;
563 abd
->abd_size
= size
;
564 abd
->abd_parent
= NULL
;
565 refcount_create(&abd
->abd_children
);
567 abd
->abd_u
.abd_linear
.abd_buf
= buf
;
573 * Free an ABD allocated from abd_get_offset() or abd_get_from_buf(). Will not
574 * free the underlying scatterlist or buffer.
580 ASSERT(!(abd
->abd_flags
& ABD_FLAG_OWNER
));
582 if (abd
->abd_parent
!= NULL
) {
583 (void) refcount_remove_many(&abd
->abd_parent
->abd_children
,
587 refcount_destroy(&abd
->abd_children
);
588 abd_free_struct(abd
);
592 * Get the raw buffer associated with a linear ABD.
595 abd_to_buf(abd_t
*abd
)
597 ASSERT(abd_is_linear(abd
));
599 return (abd
->abd_u
.abd_linear
.abd_buf
);
603 * Borrow a raw buffer from an ABD without copying the contents of the ABD
604 * into the buffer. If the ABD is scattered, this will allocate a raw buffer
605 * whose contents are undefined. To copy over the existing data in the ABD, use
606 * abd_borrow_buf_copy() instead.
609 abd_borrow_buf(abd_t
*abd
, size_t n
)
613 ASSERT3U(abd
->abd_size
, >=, n
);
614 if (abd_is_linear(abd
)) {
615 buf
= abd_to_buf(abd
);
617 buf
= zio_buf_alloc(n
);
619 (void) refcount_add_many(&abd
->abd_children
, n
, buf
);
625 abd_borrow_buf_copy(abd_t
*abd
, size_t n
)
627 void *buf
= abd_borrow_buf(abd
, n
);
628 if (!abd_is_linear(abd
)) {
629 abd_copy_to_buf(buf
, abd
, n
);
635 * Return a borrowed raw buffer to an ABD. If the ABD is scattered, this will
636 * not change the contents of the ABD and will ASSERT that you didn't modify
637 * the buffer since it was borrowed. If you want any changes you made to buf to
638 * be copied back to abd, use abd_return_buf_copy() instead.
641 abd_return_buf(abd_t
*abd
, void *buf
, size_t n
)
644 ASSERT3U(abd
->abd_size
, >=, n
);
645 if (abd_is_linear(abd
)) {
646 ASSERT3P(buf
, ==, abd_to_buf(abd
));
648 ASSERT0(abd_cmp_buf(abd
, buf
, n
));
649 zio_buf_free(buf
, n
);
651 (void) refcount_remove_many(&abd
->abd_children
, n
, buf
);
655 abd_return_buf_copy(abd_t
*abd
, void *buf
, size_t n
)
657 if (!abd_is_linear(abd
)) {
658 abd_copy_from_buf(abd
, buf
, n
);
660 abd_return_buf(abd
, buf
, n
);
664 * Give this ABD ownership of the buffer that it's storing. Can only be used on
665 * linear ABDs which were allocated via abd_get_from_buf(), or ones allocated
666 * with abd_alloc_linear() which subsequently released ownership of their buf
667 * with abd_release_ownership_of_buf().
670 abd_take_ownership_of_buf(abd_t
*abd
, boolean_t is_metadata
)
672 ASSERT(abd_is_linear(abd
));
673 ASSERT(!(abd
->abd_flags
& ABD_FLAG_OWNER
));
676 abd
->abd_flags
|= ABD_FLAG_OWNER
;
678 abd
->abd_flags
|= ABD_FLAG_META
;
681 ABDSTAT_BUMP(abdstat_linear_cnt
);
682 ABDSTAT_INCR(abdstat_linear_data_size
, abd
->abd_size
);
686 abd_release_ownership_of_buf(abd_t
*abd
)
688 ASSERT(abd_is_linear(abd
));
689 ASSERT(abd
->abd_flags
& ABD_FLAG_OWNER
);
692 abd
->abd_flags
&= ~ABD_FLAG_OWNER
;
693 /* Disable this flag since we no longer own the data buffer */
694 abd
->abd_flags
&= ~ABD_FLAG_META
;
696 ABDSTAT_BUMPDOWN(abdstat_linear_cnt
);
697 ABDSTAT_INCR(abdstat_linear_data_size
, -(int)abd
->abd_size
);
701 abd_t
*iter_abd
; /* ABD being iterated through */
702 size_t iter_pos
; /* position (relative to abd_offset) */
703 void *iter_mapaddr
; /* addr corresponding to iter_pos */
704 size_t iter_mapsize
; /* length of data valid at mapaddr */
708 abd_iter_scatter_chunk_offset(struct abd_iter
*aiter
)
710 ASSERT(!abd_is_linear(aiter
->iter_abd
));
711 return ((aiter
->iter_abd
->abd_u
.abd_scatter
.abd_offset
+
712 aiter
->iter_pos
) % PAGESIZE
);
716 abd_iter_scatter_chunk_index(struct abd_iter
*aiter
)
718 ASSERT(!abd_is_linear(aiter
->iter_abd
));
719 return ((aiter
->iter_abd
->abd_u
.abd_scatter
.abd_offset
+
720 aiter
->iter_pos
) / PAGESIZE
);
724 * Initialize the abd_iter.
727 abd_iter_init(struct abd_iter
*aiter
, abd_t
*abd
)
730 aiter
->iter_abd
= abd
;
732 aiter
->iter_mapaddr
= NULL
;
733 aiter
->iter_mapsize
= 0;
737 * Advance the iterator by a certain amount. Cannot be called when a chunk is
738 * in use. This can be safely called when the aiter has already exhausted, in
739 * which case this does nothing.
742 abd_iter_advance(struct abd_iter
*aiter
, size_t amount
)
744 ASSERT3P(aiter
->iter_mapaddr
, ==, NULL
);
745 ASSERT0(aiter
->iter_mapsize
);
747 /* There's nothing left to advance to, so do nothing */
748 if (aiter
->iter_pos
== aiter
->iter_abd
->abd_size
)
751 aiter
->iter_pos
+= amount
;
755 * Map the current chunk into aiter. This can be safely called when the aiter
756 * has already exhausted, in which case this does nothing.
759 abd_iter_map(struct abd_iter
*aiter
)
764 ASSERT3P(aiter
->iter_mapaddr
, ==, NULL
);
765 ASSERT0(aiter
->iter_mapsize
);
767 /* There's nothing left to iterate over, so do nothing */
768 if (aiter
->iter_pos
== aiter
->iter_abd
->abd_size
)
771 if (abd_is_linear(aiter
->iter_abd
)) {
772 offset
= aiter
->iter_pos
;
773 aiter
->iter_mapsize
= aiter
->iter_abd
->abd_size
- offset
;
774 paddr
= aiter
->iter_abd
->abd_u
.abd_linear
.abd_buf
;
776 size_t index
= abd_iter_scatter_chunk_index(aiter
);
777 offset
= abd_iter_scatter_chunk_offset(aiter
);
779 aiter
->iter_mapsize
= MIN(PAGESIZE
- offset
,
780 aiter
->iter_abd
->abd_size
- aiter
->iter_pos
);
782 paddr
= abd_map_chunk(
783 aiter
->iter_abd
->abd_u
.abd_scatter
.abd_chunks
[index
]);
786 aiter
->iter_mapaddr
= (char *)paddr
+ offset
;
790 * Unmap the current chunk from aiter. This can be safely called when the aiter
791 * has already exhausted, in which case this does nothing.
794 abd_iter_unmap(struct abd_iter
*aiter
)
796 /* There's nothing left to unmap, so do nothing */
797 if (aiter
->iter_pos
== aiter
->iter_abd
->abd_size
)
800 if (!abd_is_linear(aiter
->iter_abd
)) {
801 /* LINTED E_FUNC_SET_NOT_USED */
802 size_t index
= abd_iter_scatter_chunk_index(aiter
);
804 aiter
->iter_abd
->abd_u
.abd_scatter
.abd_chunks
[index
]);
807 ASSERT3P(aiter
->iter_mapaddr
, !=, NULL
);
808 ASSERT3U(aiter
->iter_mapsize
, >, 0);
810 aiter
->iter_mapaddr
= NULL
;
811 aiter
->iter_mapsize
= 0;
815 abd_iterate_func(abd_t
*abd
, size_t off
, size_t size
,
816 abd_iter_func_t
*func
, void *private)
819 struct abd_iter aiter
;
822 ASSERT3U(off
+ size
, <=, abd
->abd_size
);
824 abd_iter_init(&aiter
, abd
);
825 abd_iter_advance(&aiter
, off
);
829 abd_iter_map(&aiter
);
831 len
= MIN(aiter
.iter_mapsize
, size
);
834 ret
= func(aiter
.iter_mapaddr
, len
, private);
836 abd_iter_unmap(&aiter
);
842 abd_iter_advance(&aiter
, len
);
853 abd_copy_to_buf_off_cb(void *buf
, size_t size
, void *private)
855 struct buf_arg
*ba_ptr
= private;
857 (void) memcpy(ba_ptr
->arg_buf
, buf
, size
);
858 ba_ptr
->arg_buf
= (char *)ba_ptr
->arg_buf
+ size
;
864 * Copy abd to buf. (off is the offset in abd.)
867 abd_copy_to_buf_off(void *buf
, abd_t
*abd
, size_t off
, size_t size
)
869 struct buf_arg ba_ptr
= { buf
};
871 (void) abd_iterate_func(abd
, off
, size
, abd_copy_to_buf_off_cb
,
876 abd_cmp_buf_off_cb(void *buf
, size_t size
, void *private)
879 struct buf_arg
*ba_ptr
= private;
881 ret
= memcmp(buf
, ba_ptr
->arg_buf
, size
);
882 ba_ptr
->arg_buf
= (char *)ba_ptr
->arg_buf
+ size
;
888 * Compare the contents of abd to buf. (off is the offset in abd.)
891 abd_cmp_buf_off(abd_t
*abd
, const void *buf
, size_t off
, size_t size
)
893 struct buf_arg ba_ptr
= { (void *) buf
};
895 return (abd_iterate_func(abd
, off
, size
, abd_cmp_buf_off_cb
, &ba_ptr
));
899 abd_copy_from_buf_off_cb(void *buf
, size_t size
, void *private)
901 struct buf_arg
*ba_ptr
= private;
903 (void) memcpy(buf
, ba_ptr
->arg_buf
, size
);
904 ba_ptr
->arg_buf
= (char *)ba_ptr
->arg_buf
+ size
;
910 * Copy from buf to abd. (off is the offset in abd.)
913 abd_copy_from_buf_off(abd_t
*abd
, const void *buf
, size_t off
, size_t size
)
915 struct buf_arg ba_ptr
= { (void *) buf
};
917 (void) abd_iterate_func(abd
, off
, size
, abd_copy_from_buf_off_cb
,
/* Iterator callback: zero-fill a mapped ABD segment. */
static int
abd_zero_off_cb(void *buf, size_t size, void *private)
{
	(void) memset(buf, 0, size);
	return (0);
}
930 * Zero out the abd from a particular offset to the end.
933 abd_zero_off(abd_t
*abd
, size_t off
, size_t size
)
935 (void) abd_iterate_func(abd
, off
, size
, abd_zero_off_cb
, NULL
);
939 * Iterate over two ABDs and call func incrementally on the two ABDs' data in
940 * equal-sized chunks (passed to func as raw buffers). func could be called many
941 * times during this iteration.
944 abd_iterate_func2(abd_t
*dabd
, abd_t
*sabd
, size_t doff
, size_t soff
,
945 size_t size
, abd_iter_func2_t
*func
, void *private)
948 struct abd_iter daiter
, saiter
;
953 ASSERT3U(doff
+ size
, <=, dabd
->abd_size
);
954 ASSERT3U(soff
+ size
, <=, sabd
->abd_size
);
956 abd_iter_init(&daiter
, dabd
);
957 abd_iter_init(&saiter
, sabd
);
958 abd_iter_advance(&daiter
, doff
);
959 abd_iter_advance(&saiter
, soff
);
962 size_t dlen
, slen
, len
;
963 abd_iter_map(&daiter
);
964 abd_iter_map(&saiter
);
966 dlen
= MIN(daiter
.iter_mapsize
, size
);
967 slen
= MIN(saiter
.iter_mapsize
, size
);
968 len
= MIN(dlen
, slen
);
969 ASSERT(dlen
> 0 || slen
> 0);
971 ret
= func(daiter
.iter_mapaddr
, saiter
.iter_mapaddr
, len
,
974 abd_iter_unmap(&saiter
);
975 abd_iter_unmap(&daiter
);
981 abd_iter_advance(&daiter
, len
);
982 abd_iter_advance(&saiter
, len
);
/* Two-buffer iterator callback: copy src segment into dst segment. */
static int
abd_copy_off_cb(void *dbuf, void *sbuf, size_t size, void *private)
{
	(void) memcpy(dbuf, sbuf, size);
	return (0);
}
997 * Copy from sabd to dabd starting from soff and doff.
1000 abd_copy_off(abd_t
*dabd
, abd_t
*sabd
, size_t doff
, size_t soff
, size_t size
)
1002 (void) abd_iterate_func2(dabd
, sabd
, doff
, soff
, size
,
1003 abd_copy_off_cb
, NULL
);
/* Two-buffer iterator callback: memcmp the two segments. */
static int
abd_cmp_cb(void *bufa, void *bufb, size_t size, void *private)
{
	return (memcmp(bufa, bufb, size));
}
1014 * Compares the contents of two ABDs.
1017 abd_cmp(abd_t
*dabd
, abd_t
*sabd
)
1019 ASSERT3U(dabd
->abd_size
, ==, sabd
->abd_size
);
1020 return (abd_iterate_func2(dabd
, sabd
, 0, 0, dabd
->abd_size
,
1025 * Iterate over code ABDs and a data ABD and call @func_raidz_gen.
1027 * @cabds parity ABDs, must have equal size
1028 * @dabd data ABD. Can be NULL (in this case @dsize = 0)
1029 * @func_raidz_gen should be implemented so that its behaviour
1030 * is the same when taking linear and when taking scatter
1033 abd_raidz_gen_iterate(abd_t
**cabds
, abd_t
*dabd
,
1034 ssize_t csize
, ssize_t dsize
, const unsigned parity
,
1035 void (*func_raidz_gen
)(void **, const void *, size_t, size_t))
1039 struct abd_iter caiters
[3];
1040 struct abd_iter daiter
;
1043 ASSERT3U(parity
, <=, 3);
1045 for (i
= 0; i
< parity
; i
++)
1046 abd_iter_init(&caiters
[i
], cabds
[i
]);
1049 abd_iter_init(&daiter
, dabd
);
1051 ASSERT3S(dsize
, >=, 0);
1056 if (dabd
&& dsize
> 0)
1057 abd_iter_map(&daiter
);
1059 for (i
= 0; i
< parity
; i
++) {
1060 abd_iter_map(&caiters
[i
]);
1061 caddrs
[i
] = caiters
[i
].iter_mapaddr
;
1066 len
= MIN(caiters
[2].iter_mapsize
, len
);
1068 len
= MIN(caiters
[1].iter_mapsize
, len
);
1070 len
= MIN(caiters
[0].iter_mapsize
, len
);
1073 /* must be progressive */
1074 ASSERT3S(len
, >, 0);
1076 if (dabd
&& dsize
> 0) {
1077 /* this needs precise iter.length */
1078 len
= MIN(daiter
.iter_mapsize
, len
);
1083 /* must be progressive */
1084 ASSERT3S(len
, >, 0);
1086 * The iterated function likely will not do well if each
1087 * segment except the last one is not multiple of 512 (raidz).
1089 ASSERT3U(((uint64_t)len
& 511ULL), ==, 0);
1091 func_raidz_gen(caddrs
, daiter
.iter_mapaddr
, len
, dlen
);
1093 for (i
= parity
-1; i
>= 0; i
--) {
1094 abd_iter_unmap(&caiters
[i
]);
1095 abd_iter_advance(&caiters
[i
], len
);
1098 if (dabd
&& dsize
> 0) {
1099 abd_iter_unmap(&daiter
);
1100 abd_iter_advance(&daiter
, dlen
);
1106 ASSERT3S(dsize
, >=, 0);
1107 ASSERT3S(csize
, >=, 0);
1112 * Iterate over code ABDs and data reconstruction target ABDs and call
1113 * @func_raidz_rec. Function maps at most 6 pages atomically.
1115 * @cabds parity ABDs, must have equal size
1116 * @tabds rec target ABDs, at most 3
1117 * @tsize size of data target columns
1118 * @func_raidz_rec expects syndrome data in target columns. Function
1119 * reconstructs data and overwrites target columns.
1122 abd_raidz_rec_iterate(abd_t
**cabds
, abd_t
**tabds
,
1123 ssize_t tsize
, const unsigned parity
,
1124 void (*func_raidz_rec
)(void **t
, const size_t tsize
, void **c
,
1125 const unsigned *mul
),
1126 const unsigned *mul
)
1130 struct abd_iter citers
[3];
1131 struct abd_iter xiters
[3];
1132 void *caddrs
[3], *xaddrs
[3];
1134 ASSERT3U(parity
, <=, 3);
1136 for (i
= 0; i
< parity
; i
++) {
1137 abd_iter_init(&citers
[i
], cabds
[i
]);
1138 abd_iter_init(&xiters
[i
], tabds
[i
]);
1143 for (i
= 0; i
< parity
; i
++) {
1144 abd_iter_map(&citers
[i
]);
1145 abd_iter_map(&xiters
[i
]);
1146 caddrs
[i
] = citers
[i
].iter_mapaddr
;
1147 xaddrs
[i
] = xiters
[i
].iter_mapaddr
;
1153 len
= MIN(xiters
[2].iter_mapsize
, len
);
1154 len
= MIN(citers
[2].iter_mapsize
, len
);
1156 len
= MIN(xiters
[1].iter_mapsize
, len
);
1157 len
= MIN(citers
[1].iter_mapsize
, len
);
1159 len
= MIN(xiters
[0].iter_mapsize
, len
);
1160 len
= MIN(citers
[0].iter_mapsize
, len
);
1162 /* must be progressive */
1163 ASSERT3S(len
, >, 0);
1165 * The iterated function likely will not do well if each
1166 * segment except the last one is not multiple of 512 (raidz).
1168 ASSERT3U(((uint64_t)len
& 511ULL), ==, 0);
1170 func_raidz_rec(xaddrs
, len
, caddrs
, mul
);
1172 for (i
= parity
-1; i
>= 0; i
--) {
1173 abd_iter_unmap(&xiters
[i
]);
1174 abd_iter_unmap(&citers
[i
]);
1175 abd_iter_advance(&xiters
[i
], len
);
1176 abd_iter_advance(&citers
[i
], len
);
1180 ASSERT3S(tsize
, >=, 0);
1184 #if defined(_KERNEL) && defined(HAVE_SPL)
1186 * bio_nr_pages for ABD.
1187 * @off is the offset in @abd
1190 abd_nr_pages_off(abd_t
*abd
, unsigned int size
, size_t off
)
1194 if (abd_is_linear(abd
))
1195 pos
= (unsigned long)abd_to_buf(abd
) + off
;
1197 pos
= abd
->abd_u
.abd_scatter
.abd_offset
+ off
;
1199 return ((pos
+ size
+ PAGESIZE
- 1) >> PAGE_SHIFT
)
1200 - (pos
>> PAGE_SHIFT
);
1204 * bio_map for scatter ABD.
1205 * @off is the offset in @abd
1206 * Remaining IO size is returned
1209 abd_scatter_bio_map_off(struct bio
*bio
, abd_t
*abd
,
1210 unsigned int io_size
, size_t off
)
1213 struct abd_iter aiter
;
1215 ASSERT(!abd_is_linear(abd
));
1216 ASSERT3U(io_size
, <=, abd
->abd_size
- off
);
1218 abd_iter_init(&aiter
, abd
);
1219 abd_iter_advance(&aiter
, off
);
1221 for (i
= 0; i
< bio
->bi_max_vecs
; i
++) {
1223 size_t len
, pgoff
, index
;
1228 pgoff
= abd_iter_scatter_chunk_offset(&aiter
);
1229 len
= MIN(io_size
, PAGESIZE
- pgoff
);
1232 index
= abd_iter_scatter_chunk_index(&aiter
);
1233 pg
= abd
->abd_u
.abd_scatter
.abd_chunks
[index
];
1234 if (bio_add_page(bio
, pg
, len
, pgoff
) != len
)
1238 abd_iter_advance(&aiter
, len
);
1244 /* Tunable Parameters */
1245 module_param(zfs_abd_scatter_enabled
, int, 0644);
1246 MODULE_PARM_DESC(zfs_abd_scatter_enabled
,
1247 "Toggle whether ABD allocations must be linear.");