module/zfs/abd.c (ABD changes for vectorized RAIDZ)
1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 2014 by Chunwei Chen. All rights reserved.
23 * Copyright (c) 2016 by Delphix. All rights reserved.
24 */
25
26 /*
27 * ARC buffer data (ABD).
28 *
29 * ABDs are an abstract data structure for the ARC which can use two
30 * different ways of storing the underlying data:
31 *
32 * (a) Linear buffer. In this case, all the data in the ABD is stored in one
33 * contiguous buffer in memory (from a zio_[data_]buf_* kmem cache).
34 *
35 * +-------------------+
36 * | ABD (linear) |
37 * | abd_flags = ... |
38 * | abd_size = ... | +--------------------------------+
39 * | abd_buf ------------->| raw buffer of size abd_size |
40 * +-------------------+ +--------------------------------+
41 * no abd_chunks
42 *
43 * (b) Scattered buffer. In this case, the data in the ABD is split into
44 * equal-sized chunks (each of PAGESIZE bytes), with pointers
45 * to the chunks recorded in an array at the end of the ABD structure.
46 *
47 * +-------------------+
48 * | ABD (scattered) |
49 * | abd_flags = ... |
50 * | abd_size = ... |
51 * | abd_offset = 0 | +-----------+
52 * | abd_chunks[0] ----------------------------->| chunk 0 |
53 * | abd_chunks[1] ---------------------+ +-----------+
54 * | ... | | +-----------+
55 * | abd_chunks[N-1] ---------+ +------->| chunk 1 |
56 * +-------------------+ | +-----------+
57 * | ...
58 * | +-----------+
59 * +----------------->| chunk N-1 |
60 * +-----------+
61 *
62 * Linear buffers act exactly like normal buffers and are always mapped into the
63 * kernel's virtual memory space, while scattered ABD data chunks are allocated
64 * as physical pages and then mapped in only while they are actually being
65 * accessed through one of the abd_* library functions. Using scattered ABDs
66 * provides several benefits:
67 *
68 * (1) They avoid use of kmem_*, preventing performance problems where running
69 * kmem_reap on very large memory systems never finishes and causes
70 * constant TLB shootdowns.
71 *
72 * (2) Fragmentation is less of an issue since when we are at the limit of
73 * allocatable space, we won't have to search around for a long free
74 * hole in the VA space for large ARC allocations. Each chunk is mapped in
75 * individually, so even if we weren't using segkpm (see next point) we
76 * wouldn't need to worry about finding a contiguous address range.
77 *
78 * (3) Use of segkpm will avoid the need for map / unmap / TLB shootdown costs
79 * on each ABD access. (If segkpm isn't available then we use all linear
80 * ABDs to avoid this penalty.) See seg_kpm.c for more details.
81 *
82 * It is possible to make all ABDs linear by setting zfs_abd_scatter_enabled to
83 * B_FALSE. However, it is not possible to use scattered ABDs if segkpm is not
84 * available, which is the case on all 32-bit systems and any 64-bit systems
85 * where kpm_enable is turned off.
86 *
87 * In addition to directly allocating a linear or scattered ABD, it is also
88 * possible to create an ABD by requesting the "sub-ABD" starting at an offset
89 * within an existing ABD. In linear buffers this is simple (set abd_buf of
90 * the new ABD to the starting point within the original raw buffer), but
91 * scattered ABDs are a little more complex. The new ABD makes a copy of the
92 * relevant abd_chunks pointers (but not the underlying data). However, to
93 * provide arbitrary rather than only chunk-aligned starting offsets, it also
94 * tracks an abd_offset field which represents the starting point of the data
95 * within the first chunk in abd_chunks. For both linear and scattered ABDs,
96 * creating an offset ABD marks the original ABD as the offset's parent, and the
97 * original ABD's abd_children refcount is incremented. This data allows us to
98 * ensure the root ABD isn't deleted before its children.
99 *
100 * Most consumers should never need to know what type of ABD they're using --
101 * the ABD public API ensures that it's possible to transparently switch from
102 * using a linear ABD to a scattered one when doing so would be beneficial.
103 *
104 * If you need to use the data within an ABD directly and you know it's linear
105 * (because you allocated it), you can use abd_to_buf() to access the underlying
106 * raw buffer. Otherwise, you should use one of the abd_borrow_buf* functions
107 * which will allocate a raw buffer if necessary. Use the abd_return_buf*
108 * functions to return any raw buffers that are no longer necessary when you're
109 * done using them.
110 *
111 * There are a variety of ABD APIs that implement basic buffer operations:
112 * compare, copy, read, write, and fill with zeroes. If you need a custom
113 * function which progressively accesses the whole ABD, use the abd_iterate_*
114 * functions.
115 */
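/*
 * Illustrative usage sketch of the public API described above (hypothetical
 * consumer code; the buffer name "src" and its size are assumptions, only
 * the abd_* calls are real and are defined below or declared in sys/abd.h):
 *
 *	abd_t *abd = abd_alloc(size, B_FALSE);		// scatter if enabled
 *	abd_copy_from_buf(abd, src, size);		// fill from a raw buffer
 *	...
 *	void *tmp = abd_borrow_buf_copy(abd, size);	// flat view of the data
 *	// ... operate on tmp like on any linear buffer ...
 *	abd_return_buf_copy(abd, tmp, size);		// write changes back
 *	...
 *	abd_free(abd);			// only for ABDs from abd_alloc*()
 */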
116
117 #include <sys/abd.h>
118 #include <sys/param.h>
119 #include <sys/zio.h>
120 #include <sys/zfs_context.h>
121 #include <sys/zfs_znode.h>
122
123 #ifndef KMC_NOTOUCH
124 #define KMC_NOTOUCH 0
125 #endif
126
127 typedef struct abd_stats {
128 kstat_named_t abdstat_struct_size;
129 kstat_named_t abdstat_scatter_cnt;
130 kstat_named_t abdstat_scatter_data_size;
131 kstat_named_t abdstat_scatter_chunk_waste;
132 kstat_named_t abdstat_linear_cnt;
133 kstat_named_t abdstat_linear_data_size;
134 } abd_stats_t;
135
136 static abd_stats_t abd_stats = {
137 /* Amount of memory occupied by all of the abd_t struct allocations */
138 { "struct_size", KSTAT_DATA_UINT64 },
139 /*
140 * The number of scatter ABDs which are currently allocated, excluding
141 * ABDs which don't own their data (for instance the ones which were
142 * allocated through abd_get_offset()).
143 */
144 { "scatter_cnt", KSTAT_DATA_UINT64 },
145 /* Amount of data stored in all scatter ABDs tracked by scatter_cnt */
146 { "scatter_data_size", KSTAT_DATA_UINT64 },
147 /*
148 * The amount of space wasted at the end of the last chunk across all
149 * scatter ABDs tracked by scatter_cnt.
150 */
151 { "scatter_chunk_waste", KSTAT_DATA_UINT64 },
152 /*
153 * The number of linear ABDs which are currently allocated, excluding
154 * ABDs which don't own their data (for instance the ones which were
155 * allocated through abd_get_offset() and abd_get_from_buf()). If an
156 * ABD takes ownership of its buf then it will become tracked.
157 */
158 { "linear_cnt", KSTAT_DATA_UINT64 },
159 /* Amount of data stored in all linear ABDs tracked by linear_cnt */
160 { "linear_data_size", KSTAT_DATA_UINT64 },
161 };
162
163 #define ABDSTAT(stat) (abd_stats.stat.value.ui64)
164 #define ABDSTAT_INCR(stat, val) \
165 atomic_add_64(&abd_stats.stat.value.ui64, (val))
166 #define ABDSTAT_BUMP(stat) ABDSTAT_INCR(stat, 1)
167 #define ABDSTAT_BUMPDOWN(stat) ABDSTAT_INCR(stat, -1)
168
169 /* see block comment above for description */
170 int zfs_abd_scatter_enabled = B_TRUE;
171
172
173 #ifdef _KERNEL
174 static kstat_t *abd_ksp;
175
176 static struct page *
177 abd_alloc_chunk(void)
178 {
179 struct page *c = alloc_page(kmem_flags_convert(KM_SLEEP));
180 ASSERT3P(c, !=, NULL);
181 return (c);
182 }
183
184 static void
185 abd_free_chunk(struct page *c)
186 {
187 __free_pages(c, 0);
188 }
189
190 static void *
191 abd_map_chunk(struct page *c)
192 {
193 /*
194 * Use of segkpm means we don't care if this is mapped S_READ or S_WRITE
195 * but S_WRITE is conceptually more accurate.
196 */
197 return (kmap(c));
198 }
199
200 static void
201 abd_unmap_chunk(struct page *c)
202 {
203 kunmap(c);
204 }
205
206 void
207 abd_init(void)
208 {
209 abd_ksp = kstat_create("zfs", 0, "abdstats", "misc", KSTAT_TYPE_NAMED,
210 sizeof (abd_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
211 if (abd_ksp != NULL) {
212 abd_ksp->ks_data = &abd_stats;
213 kstat_install(abd_ksp);
214 }
215 }
216
217 void
218 abd_fini(void)
219 {
220 if (abd_ksp != NULL) {
221 kstat_delete(abd_ksp);
222 abd_ksp = NULL;
223 }
224 }
225
226 #else
227
228 struct page;
229 #define kpm_enable 1
230 #define abd_alloc_chunk() \
231 ((struct page *) umem_alloc_aligned(PAGESIZE, 64, KM_SLEEP))
232 #define abd_free_chunk(chunk) umem_free(chunk, PAGESIZE)
233 #define abd_map_chunk(chunk) ((void *)chunk)
234 static void
235 abd_unmap_chunk(struct page *c)
236 {
237 }
238
239 void
240 abd_init(void)
241 {
242 }
243
244 void
245 abd_fini(void)
246 {
247 }
248
249 #endif /* _KERNEL */
250
251 static inline size_t
252 abd_chunkcnt_for_bytes(size_t size)
253 {
254 return (P2ROUNDUP(size, PAGESIZE) / PAGESIZE);
255 }
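/*
 * Worked example (illustrative, assuming PAGESIZE == 4096): a 6000-byte
 * request rounds up to 8192, so abd_chunkcnt_for_bytes(6000) returns 2.
 * The 2192 bytes left unused in the last chunk are what abd_alloc()
 * accounts for in the scatter_chunk_waste kstat.
 */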
256
257 static inline size_t
258 abd_scatter_chunkcnt(abd_t *abd)
259 {
260 ASSERT(!abd_is_linear(abd));
261 return (abd_chunkcnt_for_bytes(
262 abd->abd_u.abd_scatter.abd_offset + abd->abd_size));
263 }
264
265 static inline void
266 abd_verify(abd_t *abd)
267 {
268 ASSERT3U(abd->abd_size, >, 0);
269 ASSERT3U(abd->abd_size, <=, SPA_MAXBLOCKSIZE);
270 ASSERT3U(abd->abd_flags, ==, abd->abd_flags & (ABD_FLAG_LINEAR |
271 ABD_FLAG_OWNER | ABD_FLAG_META));
272 IMPLY(abd->abd_parent != NULL, !(abd->abd_flags & ABD_FLAG_OWNER));
273 IMPLY(abd->abd_flags & ABD_FLAG_META, abd->abd_flags & ABD_FLAG_OWNER);
274 if (abd_is_linear(abd)) {
275 ASSERT3P(abd->abd_u.abd_linear.abd_buf, !=, NULL);
276 } else {
277 size_t n;
278 int i;
279
280 ASSERT3U(abd->abd_u.abd_scatter.abd_offset, <, PAGESIZE);
281 n = abd_scatter_chunkcnt(abd);
282 for (i = 0; i < n; i++) {
283 ASSERT3P(
284 abd->abd_u.abd_scatter.abd_chunks[i], !=, NULL);
285 }
286 }
287 }
288
289 static inline abd_t *
290 abd_alloc_struct(size_t chunkcnt)
291 {
292 size_t size = offsetof(abd_t, abd_u.abd_scatter.abd_chunks[chunkcnt]);
293 abd_t *abd = kmem_alloc(size, KM_PUSHPAGE);
294 ASSERT3P(abd, !=, NULL);
295 ABDSTAT_INCR(abdstat_struct_size, size);
296
297 return (abd);
298 }
299
300 static inline void
301 abd_free_struct(abd_t *abd)
302 {
303 size_t chunkcnt = abd_is_linear(abd) ? 0 : abd_scatter_chunkcnt(abd);
304 int size = offsetof(abd_t, abd_u.abd_scatter.abd_chunks[chunkcnt]);
305 kmem_free(abd, size);
306 ABDSTAT_INCR(abdstat_struct_size, -size);
307 }
308
309 /*
310 * Allocate an ABD, along with its own underlying data buffers. Use this if you
311 * don't care whether the ABD is linear or not.
312 */
313 abd_t *
314 abd_alloc(size_t size, boolean_t is_metadata)
315 {
316 int i;
317 size_t n;
318 abd_t *abd;
319
320 if (!zfs_abd_scatter_enabled)
321 return (abd_alloc_linear(size, is_metadata));
322
323 VERIFY3U(size, <=, SPA_MAXBLOCKSIZE);
324
325 n = abd_chunkcnt_for_bytes(size);
326 abd = abd_alloc_struct(n);
327
328 abd->abd_flags = ABD_FLAG_OWNER;
329 if (is_metadata) {
330 abd->abd_flags |= ABD_FLAG_META;
331 }
332 abd->abd_size = size;
333 abd->abd_parent = NULL;
334 refcount_create(&abd->abd_children);
335
336 abd->abd_u.abd_scatter.abd_offset = 0;
337 abd->abd_u.abd_scatter.abd_chunk_size = PAGESIZE;
338
339 for (i = 0; i < n; i++) {
340 void *c = abd_alloc_chunk();
341 ASSERT3P(c, !=, NULL);
342 abd->abd_u.abd_scatter.abd_chunks[i] = c;
343 }
344
345 ABDSTAT_BUMP(abdstat_scatter_cnt);
346 ABDSTAT_INCR(abdstat_scatter_data_size, size);
347 ABDSTAT_INCR(abdstat_scatter_chunk_waste,
348 n * PAGESIZE - size);
349
350 return (abd);
351 }
352
353 static void
354 abd_free_scatter(abd_t *abd)
355 {
356 size_t n = abd_scatter_chunkcnt(abd);
357 int i;
358
359 for (i = 0; i < n; i++) {
360 abd_free_chunk(abd->abd_u.abd_scatter.abd_chunks[i]);
361 }
362
363 refcount_destroy(&abd->abd_children);
364 ABDSTAT_BUMPDOWN(abdstat_scatter_cnt);
365 ABDSTAT_INCR(abdstat_scatter_data_size, -(int)abd->abd_size);
366 ABDSTAT_INCR(abdstat_scatter_chunk_waste,
367 abd->abd_size - n * PAGESIZE);
368
369 abd_free_struct(abd);
370 }
371
372 /*
373 * Allocate an ABD that must be linear, along with its own underlying data
374 * buffer. Only use this when it would be very annoying to write your ABD
375 * consumer with a scattered ABD.
376 */
377 abd_t *
378 abd_alloc_linear(size_t size, boolean_t is_metadata)
379 {
380 abd_t *abd = abd_alloc_struct(0);
381
382 VERIFY3U(size, <=, SPA_MAXBLOCKSIZE);
383
384 abd->abd_flags = ABD_FLAG_LINEAR | ABD_FLAG_OWNER;
385 if (is_metadata) {
386 abd->abd_flags |= ABD_FLAG_META;
387 }
388 abd->abd_size = size;
389 abd->abd_parent = NULL;
390 refcount_create(&abd->abd_children);
391
392 if (is_metadata) {
393 abd->abd_u.abd_linear.abd_buf = zio_buf_alloc(size);
394 } else {
395 abd->abd_u.abd_linear.abd_buf = zio_data_buf_alloc(size);
396 }
397
398 ABDSTAT_BUMP(abdstat_linear_cnt);
399 ABDSTAT_INCR(abdstat_linear_data_size, size);
400
401 return (abd);
402 }
403
404 static void
405 abd_free_linear(abd_t *abd)
406 {
407 if (abd->abd_flags & ABD_FLAG_META) {
408 zio_buf_free(abd->abd_u.abd_linear.abd_buf, abd->abd_size);
409 } else {
410 zio_data_buf_free(abd->abd_u.abd_linear.abd_buf, abd->abd_size);
411 }
412
413 refcount_destroy(&abd->abd_children);
414 ABDSTAT_BUMPDOWN(abdstat_linear_cnt);
415 ABDSTAT_INCR(abdstat_linear_data_size, -(int)abd->abd_size);
416
417 abd_free_struct(abd);
418 }
419
420 /*
421 * Free an ABD. Only use this on ABDs allocated with abd_alloc() or
422 * abd_alloc_linear().
423 */
424 void
425 abd_free(abd_t *abd)
426 {
427 abd_verify(abd);
428 ASSERT3P(abd->abd_parent, ==, NULL);
429 ASSERT(abd->abd_flags & ABD_FLAG_OWNER);
430 if (abd_is_linear(abd))
431 abd_free_linear(abd);
432 else
433 abd_free_scatter(abd);
434 }
435
436 /*
437 * Allocate an ABD of the same format (same metadata flag, same scatterize
438 * setting) as another ABD.
439 */
440 abd_t *
441 abd_alloc_sametype(abd_t *sabd, size_t size)
442 {
443 boolean_t is_metadata = (sabd->abd_flags & ABD_FLAG_META) != 0;
444 if (abd_is_linear(sabd)) {
445 return (abd_alloc_linear(size, is_metadata));
446 } else {
447 return (abd_alloc(size, is_metadata));
448 }
449 }
450
451 /*
452 * If we're going to use this ABD for doing I/O using the block layer, the
453 * consumer of the ABD data doesn't care if it's scattered or not, and we don't
454 * plan to store this ABD in memory for a long period of time, we should
455 * allocate the ABD type that requires the least data copying to do the I/O.
456 *
457 * On Illumos this is linear ABDs; however, if ldi_strategy() can ever issue I/Os
458 * using a scatter/gather list we should switch to that and replace this call
459 * with vanilla abd_alloc().
460 *
461 * On Linux the optimal thing to do would be to use abd_get_offset() and
462 * construct a new ABD which shares the original pages thereby eliminating
463 * the copy. But for the moment a new linear ABD is allocated until this
464 * performance optimization can be implemented.
465 */
466 abd_t *
467 abd_alloc_for_io(size_t size, boolean_t is_metadata)
468 {
469 return (abd_alloc_linear(size, is_metadata));
470 }
471
472 /*
473 * Allocate a new ABD to point to offset off of sabd. It shares the underlying
474 * buffer data with sabd. Use abd_put() to free. sabd must not be freed while
475 * any derived ABDs exist.
476 */
477 static inline abd_t *
478 abd_get_offset_impl(abd_t *sabd, size_t off, size_t size)
479 {
480 abd_t *abd;
481
482 abd_verify(sabd);
483 ASSERT3U(off, <=, sabd->abd_size);
484
485 if (abd_is_linear(sabd)) {
486 abd = abd_alloc_struct(0);
487
488 /*
489 * Even if this buf is filesystem metadata, we only track that
490 * if we own the underlying data buffer, which is not true in
491 * this case. Therefore, we don't ever use ABD_FLAG_META here.
492 */
493 abd->abd_flags = ABD_FLAG_LINEAR;
494
495 abd->abd_u.abd_linear.abd_buf =
496 (char *)sabd->abd_u.abd_linear.abd_buf + off;
497 } else {
498 size_t new_offset = sabd->abd_u.abd_scatter.abd_offset + off;
499 size_t chunkcnt = abd_chunkcnt_for_bytes(size +
500 new_offset % PAGESIZE);
501
502 abd = abd_alloc_struct(chunkcnt);
503
504 /*
505 * Even if this buf is filesystem metadata, we only track that
506 * if we own the underlying data buffer, which is not true in
507 * this case. Therefore, we don't ever use ABD_FLAG_META here.
508 */
509 abd->abd_flags = 0;
510
511 abd->abd_u.abd_scatter.abd_offset = new_offset % PAGESIZE;
512 abd->abd_u.abd_scatter.abd_chunk_size = PAGESIZE;
513
514 /* Copy the scatterlist starting at the correct offset */
515 (void) memcpy(&abd->abd_u.abd_scatter.abd_chunks,
516 &sabd->abd_u.abd_scatter.abd_chunks[new_offset / PAGESIZE],
517 chunkcnt * sizeof (void *));
518 }
519
520 abd->abd_size = size;
521 abd->abd_parent = sabd;
522 refcount_create(&abd->abd_children);
523 (void) refcount_add_many(&sabd->abd_children, abd->abd_size, abd);
524
525 return (abd);
526 }
527
528 abd_t *
529 abd_get_offset(abd_t *sabd, size_t off)
530 {
531 size_t size = sabd->abd_size > off ? sabd->abd_size - off : 0;
532
533 VERIFY3U(size, >, 0);
534
535 return (abd_get_offset_impl(sabd, off, size));
536 }
537
538 abd_t *
539 abd_get_offset_size(abd_t *sabd, size_t off, size_t size)
540 {
541 ASSERT3U(off + size, <=, sabd->abd_size);
542
543 return (abd_get_offset_impl(sabd, off, size));
544 }
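/*
 * Sketch of the offset-ABD lifetime rules described in the comment above
 * (hypothetical caller; "big_abd", "col_off" and "col_size" are made-up
 * names):
 *
 *	abd_t *col = abd_get_offset_size(big_abd, col_off, col_size);
 *	// ... read or write through col; big_abd must stay allocated ...
 *	abd_put(col);		// releases the child, does not free the data
 *	abd_free(big_abd);	// only now may the parent be freed
 */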
545
546 /*
547 * Allocate a linear ABD structure for buf. You must free this with abd_put()
548 * since the resulting ABD doesn't own its own buffer.
549 */
550 abd_t *
551 abd_get_from_buf(void *buf, size_t size)
552 {
553 abd_t *abd = abd_alloc_struct(0);
554
555 VERIFY3U(size, <=, SPA_MAXBLOCKSIZE);
556
557 /*
558 * Even if this buf is filesystem metadata, we only track that if we
559 * own the underlying data buffer, which is not true in this case.
560 * Therefore, we don't ever use ABD_FLAG_META here.
561 */
562 abd->abd_flags = ABD_FLAG_LINEAR;
563 abd->abd_size = size;
564 abd->abd_parent = NULL;
565 refcount_create(&abd->abd_children);
566
567 abd->abd_u.abd_linear.abd_buf = buf;
568
569 return (abd);
570 }
571
572 /*
573 * Free an ABD allocated from abd_get_offset() or abd_get_from_buf(). Will not
574 * free the underlying scatterlist or buffer.
575 */
576 void
577 abd_put(abd_t *abd)
578 {
579 abd_verify(abd);
580 ASSERT(!(abd->abd_flags & ABD_FLAG_OWNER));
581
582 if (abd->abd_parent != NULL) {
583 (void) refcount_remove_many(&abd->abd_parent->abd_children,
584 abd->abd_size, abd);
585 }
586
587 refcount_destroy(&abd->abd_children);
588 abd_free_struct(abd);
589 }
590
591 /*
592 * Get the raw buffer associated with a linear ABD.
593 */
594 void *
595 abd_to_buf(abd_t *abd)
596 {
597 ASSERT(abd_is_linear(abd));
598 abd_verify(abd);
599 return (abd->abd_u.abd_linear.abd_buf);
600 }
601
602 /*
603 * Borrow a raw buffer from an ABD without copying the contents of the ABD
604 * into the buffer. If the ABD is scattered, this will allocate a raw buffer
605 * whose contents are undefined. To copy over the existing data in the ABD, use
606 * abd_borrow_buf_copy() instead.
607 */
608 void *
609 abd_borrow_buf(abd_t *abd, size_t n)
610 {
611 void *buf;
612 abd_verify(abd);
613 ASSERT3U(abd->abd_size, >=, n);
614 if (abd_is_linear(abd)) {
615 buf = abd_to_buf(abd);
616 } else {
617 buf = zio_buf_alloc(n);
618 }
619 (void) refcount_add_many(&abd->abd_children, n, buf);
620
621 return (buf);
622 }
623
624 void *
625 abd_borrow_buf_copy(abd_t *abd, size_t n)
626 {
627 void *buf = abd_borrow_buf(abd, n);
628 if (!abd_is_linear(abd)) {
629 abd_copy_to_buf(buf, abd, n);
630 }
631 return (buf);
632 }
633
634 /*
635 * Return a borrowed raw buffer to an ABD. If the ABD is scattered, this will
636 * not change the contents of the ABD and will ASSERT that you didn't modify
637 * the buffer since it was borrowed. If you want any changes you made to buf to
638 * be copied back to abd, use abd_return_buf_copy() instead.
639 */
640 void
641 abd_return_buf(abd_t *abd, void *buf, size_t n)
642 {
643 abd_verify(abd);
644 ASSERT3U(abd->abd_size, >=, n);
645 if (abd_is_linear(abd)) {
646 ASSERT3P(buf, ==, abd_to_buf(abd));
647 } else {
648 ASSERT0(abd_cmp_buf(abd, buf, n));
649 zio_buf_free(buf, n);
650 }
651 (void) refcount_remove_many(&abd->abd_children, n, buf);
652 }
653
654 void
655 abd_return_buf_copy(abd_t *abd, void *buf, size_t n)
656 {
657 if (!abd_is_linear(abd)) {
658 abd_copy_from_buf(abd, buf, n);
659 }
660 abd_return_buf(abd, buf, n);
661 }
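/*
 * Usage sketch for the borrow/return pair (hypothetical caller; the
 * producer function "read_block_into" is made up). abd_borrow_buf()
 * hands out a buffer with undefined contents, so a caller that writes
 * into it must return it with abd_return_buf_copy(); the _copy borrow
 * is only needed when the ABD's existing contents matter:
 *
 *	void *tmp = abd_borrow_buf(abd, size);	// contents undefined
 *	read_block_into(tmp, size);		// caller fills the buffer
 *	abd_return_buf_copy(abd, tmp, size);	// copy the result back
 */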
662
663 /*
664 * Give this ABD ownership of the buffer that it's storing. Can only be used on
665 * linear ABDs which were allocated via abd_get_from_buf(), or ones allocated
666 * with abd_alloc_linear() which subsequently released ownership of their buf
667 * with abd_release_ownership_of_buf().
668 */
669 void
670 abd_take_ownership_of_buf(abd_t *abd, boolean_t is_metadata)
671 {
672 ASSERT(abd_is_linear(abd));
673 ASSERT(!(abd->abd_flags & ABD_FLAG_OWNER));
674 abd_verify(abd);
675
676 abd->abd_flags |= ABD_FLAG_OWNER;
677 if (is_metadata) {
678 abd->abd_flags |= ABD_FLAG_META;
679 }
680
681 ABDSTAT_BUMP(abdstat_linear_cnt);
682 ABDSTAT_INCR(abdstat_linear_data_size, abd->abd_size);
683 }
684
685 void
686 abd_release_ownership_of_buf(abd_t *abd)
687 {
688 ASSERT(abd_is_linear(abd));
689 ASSERT(abd->abd_flags & ABD_FLAG_OWNER);
690 abd_verify(abd);
691
692 abd->abd_flags &= ~ABD_FLAG_OWNER;
693 /* Disable this flag since we no longer own the data buffer */
694 abd->abd_flags &= ~ABD_FLAG_META;
695
696 ABDSTAT_BUMPDOWN(abdstat_linear_cnt);
697 ABDSTAT_INCR(abdstat_linear_data_size, -(int)abd->abd_size);
698 }
699
700 struct abd_iter {
701 abd_t *iter_abd; /* ABD being iterated through */
702 size_t iter_pos; /* position (relative to abd_offset) */
703 void *iter_mapaddr; /* addr corresponding to iter_pos */
704 size_t iter_mapsize; /* length of data valid at mapaddr */
705 };
706
707 static inline size_t
708 abd_iter_scatter_chunk_offset(struct abd_iter *aiter)
709 {
710 ASSERT(!abd_is_linear(aiter->iter_abd));
711 return ((aiter->iter_abd->abd_u.abd_scatter.abd_offset +
712 aiter->iter_pos) % PAGESIZE);
713 }
714
715 static inline size_t
716 abd_iter_scatter_chunk_index(struct abd_iter *aiter)
717 {
718 ASSERT(!abd_is_linear(aiter->iter_abd));
719 return ((aiter->iter_abd->abd_u.abd_scatter.abd_offset +
720 aiter->iter_pos) / PAGESIZE);
721 }
722
723 /*
724 * Initialize the abd_iter.
725 */
726 static void
727 abd_iter_init(struct abd_iter *aiter, abd_t *abd)
728 {
729 abd_verify(abd);
730 aiter->iter_abd = abd;
731 aiter->iter_pos = 0;
732 aiter->iter_mapaddr = NULL;
733 aiter->iter_mapsize = 0;
734 }
735
736 /*
737 * Advance the iterator by a certain amount. Cannot be called when a chunk is
738 * in use. This can be safely called when the aiter has already been exhausted, in
739 * which case this does nothing.
740 */
741 static void
742 abd_iter_advance(struct abd_iter *aiter, size_t amount)
743 {
744 ASSERT3P(aiter->iter_mapaddr, ==, NULL);
745 ASSERT0(aiter->iter_mapsize);
746
747 /* There's nothing left to advance to, so do nothing */
748 if (aiter->iter_pos == aiter->iter_abd->abd_size)
749 return;
750
751 aiter->iter_pos += amount;
752 }
753
754 /*
755 * Map the current chunk into aiter. This can be safely called when the aiter
756 * has already been exhausted, in which case this does nothing.
757 */
758 static void
759 abd_iter_map(struct abd_iter *aiter)
760 {
761 void *paddr;
762 size_t offset = 0;
763
764 ASSERT3P(aiter->iter_mapaddr, ==, NULL);
765 ASSERT0(aiter->iter_mapsize);
766
767 /* There's nothing left to iterate over, so do nothing */
768 if (aiter->iter_pos == aiter->iter_abd->abd_size)
769 return;
770
771 if (abd_is_linear(aiter->iter_abd)) {
772 offset = aiter->iter_pos;
773 aiter->iter_mapsize = aiter->iter_abd->abd_size - offset;
774 paddr = aiter->iter_abd->abd_u.abd_linear.abd_buf;
775 } else {
776 size_t index = abd_iter_scatter_chunk_index(aiter);
777 offset = abd_iter_scatter_chunk_offset(aiter);
778
779 aiter->iter_mapsize = MIN(PAGESIZE - offset,
780 aiter->iter_abd->abd_size - aiter->iter_pos);
781
782 paddr = abd_map_chunk(
783 aiter->iter_abd->abd_u.abd_scatter.abd_chunks[index]);
784 }
785
786 aiter->iter_mapaddr = (char *)paddr + offset;
787 }
788
789 /*
790 * Unmap the current chunk from aiter. This can be safely called when the aiter
791 * has already been exhausted, in which case this does nothing.
792 */
793 static void
794 abd_iter_unmap(struct abd_iter *aiter)
795 {
796 /* There's nothing left to unmap, so do nothing */
797 if (aiter->iter_pos == aiter->iter_abd->abd_size)
798 return;
799
800 if (!abd_is_linear(aiter->iter_abd)) {
801 /* LINTED E_FUNC_SET_NOT_USED */
802 size_t index = abd_iter_scatter_chunk_index(aiter);
803 abd_unmap_chunk(
804 aiter->iter_abd->abd_u.abd_scatter.abd_chunks[index]);
805 }
806
807 ASSERT3P(aiter->iter_mapaddr, !=, NULL);
808 ASSERT3U(aiter->iter_mapsize, >, 0);
809
810 aiter->iter_mapaddr = NULL;
811 aiter->iter_mapsize = 0;
812 }
813
814 int
815 abd_iterate_func(abd_t *abd, size_t off, size_t size,
816 abd_iter_func_t *func, void *private)
817 {
818 int ret = 0;
819 struct abd_iter aiter;
820
821 abd_verify(abd);
822 ASSERT3U(off + size, <=, abd->abd_size);
823
824 abd_iter_init(&aiter, abd);
825 abd_iter_advance(&aiter, off);
826
827 while (size > 0) {
828 size_t len;
829 abd_iter_map(&aiter);
830
831 len = MIN(aiter.iter_mapsize, size);
832 ASSERT3U(len, >, 0);
833
834 ret = func(aiter.iter_mapaddr, len, private);
835
836 abd_iter_unmap(&aiter);
837
838 if (ret != 0)
839 break;
840
841 size -= len;
842 abd_iter_advance(&aiter, len);
843 }
844
845 return (ret);
846 }
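/*
 * Sketch of a caller-defined abd_iterate_func() callback (illustrative
 * only; the *_cb functions below in this file follow the same pattern).
 * The callback sees each mapped segment as a plain buffer and may return
 * nonzero to stop the iteration early:
 *
 *	static int
 *	count_nonzero_cb(void *buf, size_t size, void *private)
 *	{
 *		uint64_t *cnt = private;
 *		size_t i;
 *
 *		for (i = 0; i < size; i++)
 *			*cnt += ((uint8_t *)buf)[i] != 0;
 *		return (0);
 *	}
 *
 *	uint64_t cnt = 0;
 *	(void) abd_iterate_func(abd, 0, abd->abd_size, count_nonzero_cb, &cnt);
 */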
847
848 struct buf_arg {
849 void *arg_buf;
850 };
851
852 static int
853 abd_copy_to_buf_off_cb(void *buf, size_t size, void *private)
854 {
855 struct buf_arg *ba_ptr = private;
856
857 (void) memcpy(ba_ptr->arg_buf, buf, size);
858 ba_ptr->arg_buf = (char *)ba_ptr->arg_buf + size;
859
860 return (0);
861 }
862
863 /*
864 * Copy abd to buf. (off is the offset in abd.)
865 */
866 void
867 abd_copy_to_buf_off(void *buf, abd_t *abd, size_t off, size_t size)
868 {
869 struct buf_arg ba_ptr = { buf };
870
871 (void) abd_iterate_func(abd, off, size, abd_copy_to_buf_off_cb,
872 &ba_ptr);
873 }
874
875 static int
876 abd_cmp_buf_off_cb(void *buf, size_t size, void *private)
877 {
878 int ret;
879 struct buf_arg *ba_ptr = private;
880
881 ret = memcmp(buf, ba_ptr->arg_buf, size);
882 ba_ptr->arg_buf = (char *)ba_ptr->arg_buf + size;
883
884 return (ret);
885 }
886
887 /*
888 * Compare the contents of abd to buf. (off is the offset in abd.)
889 */
890 int
891 abd_cmp_buf_off(abd_t *abd, const void *buf, size_t off, size_t size)
892 {
893 struct buf_arg ba_ptr = { (void *) buf };
894
895 return (abd_iterate_func(abd, off, size, abd_cmp_buf_off_cb, &ba_ptr));
896 }
897
898 static int
899 abd_copy_from_buf_off_cb(void *buf, size_t size, void *private)
900 {
901 struct buf_arg *ba_ptr = private;
902
903 (void) memcpy(buf, ba_ptr->arg_buf, size);
904 ba_ptr->arg_buf = (char *)ba_ptr->arg_buf + size;
905
906 return (0);
907 }
908
909 /*
910 * Copy from buf to abd. (off is the offset in abd.)
911 */
912 void
913 abd_copy_from_buf_off(abd_t *abd, const void *buf, size_t off, size_t size)
914 {
915 struct buf_arg ba_ptr = { (void *) buf };
916
917 (void) abd_iterate_func(abd, off, size, abd_copy_from_buf_off_cb,
918 &ba_ptr);
919 }
920
921 /*ARGSUSED*/
922 static int
923 abd_zero_off_cb(void *buf, size_t size, void *private)
924 {
925 (void) memset(buf, 0, size);
926 return (0);
927 }
928
929 /*
930 * Zero out the abd from a particular offset to the end.
931 */
932 void
933 abd_zero_off(abd_t *abd, size_t off, size_t size)
934 {
935 (void) abd_iterate_func(abd, off, size, abd_zero_off_cb, NULL);
936 }
937
938 /*
939 * Iterate over two ABDs and call func incrementally on the two ABDs' data in
940 * equal-sized chunks (passed to func as raw buffers). func could be called many
941 * times during this iteration.
942 */
943 int
944 abd_iterate_func2(abd_t *dabd, abd_t *sabd, size_t doff, size_t soff,
945 size_t size, abd_iter_func2_t *func, void *private)
946 {
947 int ret = 0;
948 struct abd_iter daiter, saiter;
949
950 abd_verify(dabd);
951 abd_verify(sabd);
952
953 ASSERT3U(doff + size, <=, dabd->abd_size);
954 ASSERT3U(soff + size, <=, sabd->abd_size);
955
956 abd_iter_init(&daiter, dabd);
957 abd_iter_init(&saiter, sabd);
958 abd_iter_advance(&daiter, doff);
959 abd_iter_advance(&saiter, soff);
960
961 while (size > 0) {
962 size_t dlen, slen, len;
963 abd_iter_map(&daiter);
964 abd_iter_map(&saiter);
965
966 dlen = MIN(daiter.iter_mapsize, size);
967 slen = MIN(saiter.iter_mapsize, size);
968 len = MIN(dlen, slen);
969 ASSERT(dlen > 0 || slen > 0);
970
971 ret = func(daiter.iter_mapaddr, saiter.iter_mapaddr, len,
972 private);
973
974 abd_iter_unmap(&saiter);
975 abd_iter_unmap(&daiter);
976
977 if (ret != 0)
978 break;
979
980 size -= len;
981 abd_iter_advance(&daiter, len);
982 abd_iter_advance(&saiter, len);
983 }
984
985 return (ret);
986 }
987
988 /*ARGSUSED*/
989 static int
990 abd_copy_off_cb(void *dbuf, void *sbuf, size_t size, void *private)
991 {
992 (void) memcpy(dbuf, sbuf, size);
993 return (0);
994 }
995
996 /*
997 * Copy from sabd to dabd starting from soff and doff.
998 */
999 void
1000 abd_copy_off(abd_t *dabd, abd_t *sabd, size_t doff, size_t soff, size_t size)
1001 {
1002 (void) abd_iterate_func2(dabd, sabd, doff, soff, size,
1003 abd_copy_off_cb, NULL);
1004 }
1005
1006 /*ARGSUSED*/
1007 static int
1008 abd_cmp_cb(void *bufa, void *bufb, size_t size, void *private)
1009 {
1010 return (memcmp(bufa, bufb, size));
1011 }
1012
1013 /*
1014 * Compares the contents of two ABDs.
1015 */
1016 int
1017 abd_cmp(abd_t *dabd, abd_t *sabd)
1018 {
1019 ASSERT3U(dabd->abd_size, ==, sabd->abd_size);
1020 return (abd_iterate_func2(dabd, sabd, 0, 0, dabd->abd_size,
1021 abd_cmp_cb, NULL));
1022 }
1023
1024 /*
1025 * Iterate over code ABDs and a data ABD and call @func_raidz_gen.
1026 *
1027 * @cabds parity ABDs, must have equal size
1028 * @dabd data ABD. Can be NULL (in this case @dsize = 0)
1029 * @func_raidz_gen should be implemented so that it behaves the same
1030 * whether it is handed linear buffers or scattered chunks
1031 */
1032 void
1033 abd_raidz_gen_iterate(abd_t **cabds, abd_t *dabd,
1034 ssize_t csize, ssize_t dsize, const unsigned parity,
1035 void (*func_raidz_gen)(void **, const void *, size_t, size_t))
1036 {
1037 int i;
1038 ssize_t len, dlen;
1039 struct abd_iter caiters[3];
1040 struct abd_iter daiter;
1041 void *caddrs[3];
1042
1043 ASSERT3U(parity, <=, 3);
1044
1045 for (i = 0; i < parity; i++)
1046 abd_iter_init(&caiters[i], cabds[i]);
1047
1048 if (dabd)
1049 abd_iter_init(&daiter, dabd);
1050
1051 ASSERT3S(dsize, >=, 0);
1052
1053 while (csize > 0) {
1054 len = csize;
1055
1056 if (dabd && dsize > 0)
1057 abd_iter_map(&daiter);
1058
1059 for (i = 0; i < parity; i++) {
1060 abd_iter_map(&caiters[i]);
1061 caddrs[i] = caiters[i].iter_mapaddr;
1062 }
1063
1064 switch (parity) {
1065 case 3:
1066 len = MIN(caiters[2].iter_mapsize, len);
1067 case 2:
1068 len = MIN(caiters[1].iter_mapsize, len);
1069 case 1:
1070 len = MIN(caiters[0].iter_mapsize, len);
1071 }
1072
1073 /* must be progressive */
1074 ASSERT3S(len, >, 0);
1075
1076 if (dabd && dsize > 0) {
1077 /* this needs precise iter.length */
1078 len = MIN(daiter.iter_mapsize, len);
1079 dlen = len;
1080 } else
1081 dlen = 0;
1082
1083 /* must be progressive */
1084 ASSERT3S(len, >, 0);
1085 /*
1086 * The iterated function likely will not do well if each
1087 * segment except the last one is not a multiple of 512 (raidz).
1088 */
1089 ASSERT3U(((uint64_t)len & 511ULL), ==, 0);
1090
1091 func_raidz_gen(caddrs, daiter.iter_mapaddr, len, dlen);
1092
1093 for (i = parity-1; i >= 0; i--) {
1094 abd_iter_unmap(&caiters[i]);
1095 abd_iter_advance(&caiters[i], len);
1096 }
1097
1098 if (dabd && dsize > 0) {
1099 abd_iter_unmap(&daiter);
1100 abd_iter_advance(&daiter, dlen);
1101 dsize -= dlen;
1102 }
1103
1104 csize -= len;
1105
1106 ASSERT3S(dsize, >=, 0);
1107 ASSERT3S(csize, >=, 0);
1108 }
1109 }
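/*
 * Minimal sketch of a func_raidz_gen callback matching the call above
 * (a toy single-parity XOR for illustration only; the real, vectorized
 * implementations live in the raidz math code, not in this file):
 *
 *	static void
 *	toy_gen_p(void **c, const void *d, size_t csize, size_t dsize)
 *	{
 *		uint8_t *p = c[0];
 *		size_t i;
 *
 *		// dsize bytes of data are folded into the parity column;
 *		// the remaining csize - dsize parity bytes are unchanged,
 *		// which for XOR is the same as absorbing implicit zeros.
 *		for (i = 0; i < dsize; i++)
 *			p[i] ^= ((const uint8_t *)d)[i];
 *	}
 */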
1110
1111 /*
1112 * Iterate over code ABDs and data reconstruction target ABDs and call
1113 * @func_raidz_rec. Function maps at most 6 pages atomically.
1114 *
1115 * @cabds parity ABDs, must have equal size
1116 * @tabds rec target ABDs, at most 3
1117 * @tsize size of data target columns
1118 * @func_raidz_rec expects syndrome data in target columns. Function
1119 * reconstructs data and overwrites target columns.
1120 */
1121 void
1122 abd_raidz_rec_iterate(abd_t **cabds, abd_t **tabds,
1123 ssize_t tsize, const unsigned parity,
1124 void (*func_raidz_rec)(void **t, const size_t tsize, void **c,
1125 const unsigned *mul),
1126 const unsigned *mul)
1127 {
1128 int i;
1129 ssize_t len;
1130 struct abd_iter citers[3];
1131 struct abd_iter xiters[3];
1132 void *caddrs[3], *xaddrs[3];
1133
1134 ASSERT3U(parity, <=, 3);
1135
1136 for (i = 0; i < parity; i++) {
1137 abd_iter_init(&citers[i], cabds[i]);
1138 abd_iter_init(&xiters[i], tabds[i]);
1139 }
1140
1141 while (tsize > 0) {
1142
1143 for (i = 0; i < parity; i++) {
1144 abd_iter_map(&citers[i]);
1145 abd_iter_map(&xiters[i]);
1146 caddrs[i] = citers[i].iter_mapaddr;
1147 xaddrs[i] = xiters[i].iter_mapaddr;
1148 }
1149
1150 len = tsize;
1151 switch (parity) {
1152 case 3:
1153 len = MIN(xiters[2].iter_mapsize, len);
1154 len = MIN(citers[2].iter_mapsize, len);
1155 case 2:
1156 len = MIN(xiters[1].iter_mapsize, len);
1157 len = MIN(citers[1].iter_mapsize, len);
1158 case 1:
1159 len = MIN(xiters[0].iter_mapsize, len);
1160 len = MIN(citers[0].iter_mapsize, len);
1161 }
1162 /* must be progressive */
1163 ASSERT3S(len, >, 0);
1164 /*
1165 * The iterated function likely will not do well if each
1166 * segment except the last one is not multiple of 512 (raidz).
1167 */
1168 ASSERT3U(((uint64_t)len & 511ULL), ==, 0);
1169
1170 func_raidz_rec(xaddrs, len, caddrs, mul);
1171
1172 for (i = parity-1; i >= 0; i--) {
1173 abd_iter_unmap(&xiters[i]);
1174 abd_iter_unmap(&citers[i]);
1175 abd_iter_advance(&xiters[i], len);
1176 abd_iter_advance(&citers[i], len);
1177 }
1178
1179 tsize -= len;
1180 ASSERT3S(tsize, >=, 0);
1181 }
1182 }
1183
1184 #if defined(_KERNEL) && defined(HAVE_SPL)
1185 /*
1186 * bio_nr_pages for ABD.
1187 * @off is the offset in @abd
1188 */
1189 unsigned long
1190 abd_nr_pages_off(abd_t *abd, unsigned int size, size_t off)
1191 {
1192 unsigned long pos;
1193
1194 if (abd_is_linear(abd))
1195 pos = (unsigned long)abd_to_buf(abd) + off;
1196 else
1197 pos = abd->abd_u.abd_scatter.abd_offset + off;
1198
1199 return ((pos + size + PAGESIZE - 1) >> PAGE_SHIFT)
1200 - (pos >> PAGE_SHIFT);
1201 }
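/*
 * Worked example (illustrative, assuming PAGESIZE == 4096 and PAGE_SHIFT
 * == 12): for a scatter ABD with abd_offset == 100, abd_nr_pages_off(abd,
 * 5000, 0) computes pos = 100, so the result is
 * ((100 + 5000 + 4095) >> 12) - (100 >> 12) == 2 - 0 == 2 pages.
 */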
1202
1203 /*
1204 * bio_map for scatter ABD.
1205 * @off is the offset in @abd
1206 * Remaining IO size is returned
1207 */
1208 unsigned int
1209 abd_scatter_bio_map_off(struct bio *bio, abd_t *abd,
1210 unsigned int io_size, size_t off)
1211 {
1212 int i;
1213 struct abd_iter aiter;
1214
1215 ASSERT(!abd_is_linear(abd));
1216 ASSERT3U(io_size, <=, abd->abd_size - off);
1217
1218 abd_iter_init(&aiter, abd);
1219 abd_iter_advance(&aiter, off);
1220
1221 for (i = 0; i < bio->bi_max_vecs; i++) {
1222 struct page *pg;
1223 size_t len, pgoff, index;
1224
1225 if (io_size <= 0)
1226 break;
1227
1228 pgoff = abd_iter_scatter_chunk_offset(&aiter);
1229 len = MIN(io_size, PAGESIZE - pgoff);
1230 ASSERT(len > 0);
1231
1232 index = abd_iter_scatter_chunk_index(&aiter);
1233 pg = abd->abd_u.abd_scatter.abd_chunks[index];
1234 if (bio_add_page(bio, pg, len, pgoff) != len)
1235 break;
1236
1237 io_size -= len;
1238 abd_iter_advance(&aiter, len);
1239 }
1240
1241 return (io_size);
1242 }
1243
1244 /* Tunable Parameters */
1245 module_param(zfs_abd_scatter_enabled, int, 0644);
1246 MODULE_PARM_DESC(zfs_abd_scatter_enabled,
1247 "Toggle whether ABD allocations must be linear.");
1248 #endif