/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2014 by Chunwei Chen. All rights reserved.
 * Copyright (c) 2019 by Delphix. All rights reserved.
 * Copyright (c) 2023, 2024, Klara Inc.
 */

/*
 * See abd.c for a general overview of the arc buffered data (ABD).
 *
 * Linear buffers act exactly like normal buffers and are always mapped into
 * the kernel's virtual memory space, while scattered ABD data chunks are
 * allocated as physical pages and then mapped in only while they are actually
 * being accessed through one of the abd_* library functions. Using scattered
 * ABDs provides several benefits:
 *
 * (1) They avoid use of kmem_*, preventing performance problems where running
 *     kmem_reap on very large memory systems never finishes and causes
 *     constant TLB shootdowns.
 *
 * (2) Fragmentation is less of an issue since when we are at the limit of
 *     allocatable space, we won't have to search around for a long free
 *     hole in the VA space for large ARC allocations. Each chunk is mapped in
 *     individually, so even if we are using HIGHMEM (see next point) we
 *     wouldn't need to worry about finding a contiguous address range.
 *
 * (3) If we are not using HIGHMEM, then all physical memory is always
 *     mapped into the kernel's address space, so we also avoid the map /
 *     unmap costs on each ABD access.
 *
 * If we are not using HIGHMEM, scattered buffers which have only one chunk
 * can be treated as linear buffers, because they are contiguous in the
 * kernel's virtual address space. See abd_alloc_chunks() for details.
 */

#include <sys/abd_impl.h>
#include <sys/param.h>
#include <sys/zio.h>
#include <sys/arc.h>
#include <sys/zfs_context.h>
#include <sys/zfs_znode.h>
#ifdef _KERNEL
#include <linux/kmap_compat.h>
#include <linux/mm_compat.h>
#include <linux/scatterlist.h>
#endif

#ifdef _KERNEL
#if defined(MAX_ORDER)
#define	ABD_MAX_ORDER	(MAX_ORDER)
#elif defined(MAX_PAGE_ORDER)
#define	ABD_MAX_ORDER	(MAX_PAGE_ORDER)
#endif
#else
#define	ABD_MAX_ORDER	(1)
#endif

typedef struct abd_stats {
	kstat_named_t abdstat_struct_size;
	kstat_named_t abdstat_linear_cnt;
	kstat_named_t abdstat_linear_data_size;
	kstat_named_t abdstat_scatter_cnt;
	kstat_named_t abdstat_scatter_data_size;
	kstat_named_t abdstat_scatter_chunk_waste;
	kstat_named_t abdstat_scatter_orders[ABD_MAX_ORDER];
	kstat_named_t abdstat_scatter_page_multi_chunk;
	kstat_named_t abdstat_scatter_page_multi_zone;
	kstat_named_t abdstat_scatter_page_alloc_retry;
	kstat_named_t abdstat_scatter_sg_table_retry;
} abd_stats_t;

static abd_stats_t abd_stats = {
	/* Amount of memory occupied by all of the abd_t struct allocations */
	{ "struct_size",		KSTAT_DATA_UINT64 },
	/*
	 * The number of linear ABDs which are currently allocated, excluding
	 * ABDs which don't own their data (for instance the ones which were
	 * allocated through abd_get_offset() and abd_get_from_buf()). If an
	 * ABD takes ownership of its buf then it will become tracked.
	 */
	{ "linear_cnt",			KSTAT_DATA_UINT64 },
	/* Amount of data stored in all linear ABDs tracked by linear_cnt */
	{ "linear_data_size",		KSTAT_DATA_UINT64 },
	/*
	 * The number of scatter ABDs which are currently allocated, excluding
	 * ABDs which don't own their data (for instance the ones which were
	 * allocated through abd_get_offset()).
	 */
	{ "scatter_cnt",		KSTAT_DATA_UINT64 },
	/* Amount of data stored in all scatter ABDs tracked by scatter_cnt */
	{ "scatter_data_size",		KSTAT_DATA_UINT64 },
	/*
	 * The amount of space wasted at the end of the last chunk across all
	 * scatter ABDs tracked by scatter_cnt.
	 */
	{ "scatter_chunk_waste",	KSTAT_DATA_UINT64 },
	/*
	 * The number of compound allocations of a given order. These
	 * allocations are spread over all currently allocated ABDs, and
	 * act as a measure of memory fragmentation.
	 */
	{ { "scatter_order_N",		KSTAT_DATA_UINT64 } },
	/*
	 * The number of scatter ABDs which contain multiple chunks.
	 * ABDs are preferentially allocated from the minimum number of
	 * contiguous multi-page chunks; a single chunk is optimal.
	 */
	{ "scatter_page_multi_chunk",	KSTAT_DATA_UINT64 },
	/*
	 * The number of scatter ABDs which are split across memory zones.
	 * ABDs are preferentially allocated using pages from a single zone.
	 */
	{ "scatter_page_multi_zone",	KSTAT_DATA_UINT64 },
	/*
	 * The total number of retries encountered when attempting to
	 * allocate the pages to populate the scatter ABD.
	 */
	{ "scatter_page_alloc_retry",	KSTAT_DATA_UINT64 },
	/*
	 * The total number of retries encountered when attempting to
	 * allocate the sg table for an ABD.
	 */
	{ "scatter_sg_table_retry",	KSTAT_DATA_UINT64 },
};
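
/*
 * These counters are aggregated into the kstat named "abdstats" in
 * abd_init() below; on Linux the values are typically readable from
 * /proc/spl/kstat/zfs/abdstats.
 */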

static struct {
	wmsum_t abdstat_struct_size;
	wmsum_t abdstat_linear_cnt;
	wmsum_t abdstat_linear_data_size;
	wmsum_t abdstat_scatter_cnt;
	wmsum_t abdstat_scatter_data_size;
	wmsum_t abdstat_scatter_chunk_waste;
	wmsum_t abdstat_scatter_orders[ABD_MAX_ORDER];
	wmsum_t abdstat_scatter_page_multi_chunk;
	wmsum_t abdstat_scatter_page_multi_zone;
	wmsum_t abdstat_scatter_page_alloc_retry;
	wmsum_t abdstat_scatter_sg_table_retry;
} abd_sums;

#define	abd_for_each_sg(abd, sg, n, i)	\
	for_each_sg(ABD_SCATTER(abd).abd_sgl, sg, n, i)

/*
 * zfs_abd_scatter_min_size is the minimum allocation size to use scatter
 * ABD's. Smaller allocations will use linear ABD's, which use
 * zio_[data_]buf_alloc().
 *
 * Scatter ABD's use at least one page each, so sub-page allocations waste
 * some space when allocated as scatter (e.g. 2KB scatter allocation wastes
 * half of each page). Using linear ABD's for small allocations means that
 * they will be put on slabs which contain many allocations. This can
 * improve memory efficiency, but it also makes it much harder for ARC
 * evictions to actually free pages, because all the buffers on one slab need
 * to be freed in order for the slab (and underlying pages) to be freed.
 * Typically, 512B and 1KB kmem caches have 16 buffers per slab, so it's
 * possible for them to actually waste more memory than scatter (one page per
 * buf = wasting 3/4 or 7/8th; one buf per slab = wasting 15/16th).
 *
 * Spill blocks are typically 512B and are heavily used on systems running
 * selinux with the default dnode size and the `xattr=sa` property set.
 *
 * By default we use linear allocations for 512B and 1KB, and scatter
 * allocations for larger (1.5KB and up).
 */
static int zfs_abd_scatter_min_size = 512 * 3;

/*
 * We use a scattered SPA_MAXBLOCKSIZE sized ABD whose pages are
 * just a single zero'd page. This allows us to conserve memory by
 * only using a single zero page for the scatterlist.
 */
abd_t *abd_zero_scatter = NULL;

struct page;
/*
 * _KERNEL   - Will point to ZERO_PAGE if it is available or it will be
 *             an allocated zero'd PAGESIZE buffer.
 * Userspace - Will be an allocated zero'd PAGESIZE buffer.
 *
 * abd_zero_page is assigned to each of the pages of abd_zero_scatter.
 */
static struct page *abd_zero_page = NULL;

static kmem_cache_t *abd_cache = NULL;
static kstat_t *abd_ksp;

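/*
 * Number of PAGESIZE-sized chunks needed to hold "size" bytes, rounded
 * up (e.g. with 4K pages, a 6K request needs two chunks).
 */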
static uint_t
abd_chunkcnt_for_bytes(size_t size)
{
	return (P2ROUNDUP(size, PAGESIZE) / PAGESIZE);
}

abd_t *
abd_alloc_struct_impl(size_t size)
{
	/*
	 * In Linux we do not use the size passed in during ABD
	 * allocation, so we just ignore it.
	 */
	(void) size;
	abd_t *abd = kmem_cache_alloc(abd_cache, KM_PUSHPAGE);
	ASSERT3P(abd, !=, NULL);
	ABDSTAT_INCR(abdstat_struct_size, sizeof (abd_t));

	return (abd);
}

void
abd_free_struct_impl(abd_t *abd)
{
	kmem_cache_free(abd_cache, abd);
	ABDSTAT_INCR(abdstat_struct_size, -(int)sizeof (abd_t));
}

#ifdef _KERNEL
static unsigned zfs_abd_scatter_max_order = ABD_MAX_ORDER - 1;

/*
 * Mark zfs data pages so they can be excluded from kernel crash dumps
 */
#ifdef _LP64
#define	ABD_FILE_CACHE_PAGE	0x2F5ABDF11ECAC4E

static inline void
abd_mark_zfs_page(struct page *page)
{
	get_page(page);
	SetPagePrivate(page);
	set_page_private(page, ABD_FILE_CACHE_PAGE);
}

static inline void
abd_unmark_zfs_page(struct page *page)
{
	set_page_private(page, 0UL);
	ClearPagePrivate(page);
	put_page(page);
}
#else
#define	abd_mark_zfs_page(page)
#define	abd_unmark_zfs_page(page)
#endif /* _LP64 */

#ifndef CONFIG_HIGHMEM

#ifndef __GFP_RECLAIM
#define	__GFP_RECLAIM	__GFP_WAIT
#endif

/*
 * The goal is to minimize fragmentation by preferentially populating ABDs
 * with higher order compound pages from a single zone. Allocation size is
 * progressively decreased until it can be satisfied without performing
 * reclaim or compaction. When necessary this function will degenerate to
 * allocating individual pages and allowing reclaim to satisfy allocations.
 */
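/*
 * Illustrative example: with 4K pages, a 32-page (128K) request and
 * max_order >= 5 is first attempted as one order-5 compound allocation.
 * Each higher-order failure steps max_order down, while order-0
 * failures are simply retried after a short sleep, since order-0
 * allocations are allowed to perform reclaim.
 */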
void
abd_alloc_chunks(abd_t *abd, size_t size)
{
	struct list_head pages;
	struct sg_table table;
	struct scatterlist *sg;
	struct page *page, *tmp_page = NULL;
	gfp_t gfp = __GFP_NOWARN | GFP_NOIO;
	gfp_t gfp_comp = (gfp | __GFP_NORETRY | __GFP_COMP) & ~__GFP_RECLAIM;
	unsigned int max_order = MIN(zfs_abd_scatter_max_order,
	    ABD_MAX_ORDER - 1);
	unsigned int nr_pages = abd_chunkcnt_for_bytes(size);
	unsigned int chunks = 0, zones = 0;
	size_t remaining_size;
	int nid = NUMA_NO_NODE;
	unsigned int alloc_pages = 0;

	INIT_LIST_HEAD(&pages);

	ASSERT3U(alloc_pages, <, nr_pages);

	while (alloc_pages < nr_pages) {
		unsigned int chunk_pages;
		unsigned int order;

		order = MIN(highbit64(nr_pages - alloc_pages) - 1, max_order);
		chunk_pages = (1U << order);

		page = alloc_pages_node(nid, order ? gfp_comp : gfp, order);
		if (page == NULL) {
			if (order == 0) {
				ABDSTAT_BUMP(abdstat_scatter_page_alloc_retry);
				schedule_timeout_interruptible(1);
			} else {
				max_order = MAX(0, order - 1);
			}
			continue;
		}

		list_add_tail(&page->lru, &pages);

		if ((nid != NUMA_NO_NODE) && (page_to_nid(page) != nid))
			zones++;

		nid = page_to_nid(page);
		ABDSTAT_BUMP(abdstat_scatter_orders[order]);
		chunks++;
		alloc_pages += chunk_pages;
	}

	ASSERT3S(alloc_pages, ==, nr_pages);

	while (sg_alloc_table(&table, chunks, gfp)) {
		ABDSTAT_BUMP(abdstat_scatter_sg_table_retry);
		schedule_timeout_interruptible(1);
	}

	sg = table.sgl;
	remaining_size = size;
	list_for_each_entry_safe(page, tmp_page, &pages, lru) {
		size_t sg_size = MIN(PAGESIZE << compound_order(page),
		    remaining_size);
		sg_set_page(sg, page, sg_size, 0);
		abd_mark_zfs_page(page);
		remaining_size -= sg_size;

		sg = sg_next(sg);
		list_del(&page->lru);
	}

	/*
	 * These conditions ensure that a possible transformation to a linear
	 * ABD would be valid.
	 */
	ASSERT(!PageHighMem(sg_page(table.sgl)));
	ASSERT0(ABD_SCATTER(abd).abd_offset);

	if (table.nents == 1) {
		/*
		 * Since there is only one entry, this ABD can be represented
		 * as a linear buffer. All single-page (4K) ABD's can be
		 * represented this way. Some multi-page ABD's can also be
		 * represented this way, if we were able to allocate a single
		 * "chunk" (higher-order "page" which represents a power-of-2
		 * series of physically-contiguous pages). This is often the
		 * case for 2-page (8K) ABD's.
		 *
		 * Representing a single-entry scatter ABD as a linear ABD
		 * has the performance advantage of avoiding the copy (and
		 * allocation) in abd_borrow_buf_copy / abd_return_buf_copy.
		 * A performance increase of around 5% has been observed for
		 * ARC-cached reads (of small blocks which can take advantage
		 * of this).
		 *
		 * Note that this optimization is only possible because the
		 * pages are always mapped into the kernel's address space.
		 * This is not the case for highmem pages, so the
		 * optimization cannot be made there.
		 */
		abd->abd_flags |= ABD_FLAG_LINEAR;
		abd->abd_flags |= ABD_FLAG_LINEAR_PAGE;
		abd->abd_u.abd_linear.abd_sgl = table.sgl;
		ABD_LINEAR_BUF(abd) = page_address(sg_page(table.sgl));
	} else if (table.nents > 1) {
		ABDSTAT_BUMP(abdstat_scatter_page_multi_chunk);
		abd->abd_flags |= ABD_FLAG_MULTI_CHUNK;

		if (zones) {
			ABDSTAT_BUMP(abdstat_scatter_page_multi_zone);
			abd->abd_flags |= ABD_FLAG_MULTI_ZONE;
		}

		ABD_SCATTER(abd).abd_sgl = table.sgl;
		ABD_SCATTER(abd).abd_nents = table.nents;
	}
}
#else

/*
 * Allocate N individual pages to construct a scatter ABD. This function
 * makes no attempt to request contiguous pages and requires the minimal
 * number of kernel interfaces. It's designed for maximum compatibility.
 */
void
abd_alloc_chunks(abd_t *abd, size_t size)
{
	struct scatterlist *sg = NULL;
	struct sg_table table;
	struct page *page;
	gfp_t gfp = __GFP_NOWARN | GFP_NOIO;
	int nr_pages = abd_chunkcnt_for_bytes(size);
	int i = 0;

	while (sg_alloc_table(&table, nr_pages, gfp)) {
		ABDSTAT_BUMP(abdstat_scatter_sg_table_retry);
		schedule_timeout_interruptible(1);
	}

	ASSERT3U(table.nents, ==, nr_pages);
	ABD_SCATTER(abd).abd_sgl = table.sgl;
	ABD_SCATTER(abd).abd_nents = nr_pages;

	abd_for_each_sg(abd, sg, nr_pages, i) {
		while ((page = __page_cache_alloc(gfp)) == NULL) {
			ABDSTAT_BUMP(abdstat_scatter_page_alloc_retry);
			schedule_timeout_interruptible(1);
		}

		ABDSTAT_BUMP(abdstat_scatter_orders[0]);
		sg_set_page(sg, page, PAGESIZE, 0);
		abd_mark_zfs_page(page);
	}

	if (nr_pages > 1) {
		ABDSTAT_BUMP(abdstat_scatter_page_multi_chunk);
		abd->abd_flags |= ABD_FLAG_MULTI_CHUNK;
	}
}
#endif /* !CONFIG_HIGHMEM */

/*
 * This must be called if any of the sg_table allocation functions
 * are called.
 */
static void
abd_free_sg_table(abd_t *abd)
{
	struct sg_table table;

	table.sgl = ABD_SCATTER(abd).abd_sgl;
	table.nents = table.orig_nents = ABD_SCATTER(abd).abd_nents;
	sg_free_table(&table);
}

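/*
 * Free the pages referenced by the scatterlist, at the compound order
 * they were allocated with, then release the sg table itself.
 */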
void
abd_free_chunks(abd_t *abd)
{
	struct scatterlist *sg = NULL;
	struct page *page;
	int nr_pages = ABD_SCATTER(abd).abd_nents;
	int order, i = 0;

	if (abd->abd_flags & ABD_FLAG_MULTI_ZONE)
		ABDSTAT_BUMPDOWN(abdstat_scatter_page_multi_zone);

	if (abd->abd_flags & ABD_FLAG_MULTI_CHUNK)
		ABDSTAT_BUMPDOWN(abdstat_scatter_page_multi_chunk);

	abd_for_each_sg(abd, sg, nr_pages, i) {
		page = sg_page(sg);
		abd_unmark_zfs_page(page);
		order = compound_order(page);
		__free_pages(page, order);
		ASSERT3U(sg->length, <=, PAGE_SIZE << order);
		ABDSTAT_BUMPDOWN(abdstat_scatter_orders[order]);
	}
	abd_free_sg_table(abd);
}

/*
 * Allocate scatter ABD of size SPA_MAXBLOCKSIZE, where each page in
 * the scatterlist will be set to the zero'd out buffer abd_zero_page.
 */
static void
abd_alloc_zero_scatter(void)
{
	struct scatterlist *sg = NULL;
	struct sg_table table;
	gfp_t gfp = __GFP_NOWARN | GFP_NOIO;
	int nr_pages = abd_chunkcnt_for_bytes(SPA_MAXBLOCKSIZE);
	int i = 0;

#if defined(HAVE_ZERO_PAGE_GPL_ONLY)
	gfp_t gfp_zero_page = gfp | __GFP_ZERO;
	while ((abd_zero_page = __page_cache_alloc(gfp_zero_page)) == NULL) {
		ABDSTAT_BUMP(abdstat_scatter_page_alloc_retry);
		schedule_timeout_interruptible(1);
	}
	abd_mark_zfs_page(abd_zero_page);
#else
	abd_zero_page = ZERO_PAGE(0);
#endif /* HAVE_ZERO_PAGE_GPL_ONLY */

	while (sg_alloc_table(&table, nr_pages, gfp)) {
		ABDSTAT_BUMP(abdstat_scatter_sg_table_retry);
		schedule_timeout_interruptible(1);
	}
	ASSERT3U(table.nents, ==, nr_pages);

	abd_zero_scatter = abd_alloc_struct(SPA_MAXBLOCKSIZE);
	abd_zero_scatter->abd_flags |= ABD_FLAG_OWNER;
	ABD_SCATTER(abd_zero_scatter).abd_offset = 0;
	ABD_SCATTER(abd_zero_scatter).abd_sgl = table.sgl;
	ABD_SCATTER(abd_zero_scatter).abd_nents = nr_pages;
	abd_zero_scatter->abd_size = SPA_MAXBLOCKSIZE;
	abd_zero_scatter->abd_flags |= ABD_FLAG_MULTI_CHUNK | ABD_FLAG_ZEROS;

	abd_for_each_sg(abd_zero_scatter, sg, nr_pages, i) {
		sg_set_page(sg, abd_zero_page, PAGESIZE, 0);
	}

	ABDSTAT_BUMP(abdstat_scatter_cnt);
	ABDSTAT_INCR(abdstat_scatter_data_size, PAGESIZE);
	ABDSTAT_BUMP(abdstat_scatter_page_multi_chunk);
}

#else /* _KERNEL */

#ifndef PAGE_SHIFT
#define	PAGE_SHIFT	(highbit64(PAGESIZE)-1)
#endif

#define	zfs_kmap_atomic(chunk)		((void *)chunk)
#define	zfs_kunmap_atomic(addr)		do { (void)(addr); } while (0)
#define	local_irq_save(flags)		do { (void)(flags); } while (0)
#define	local_irq_restore(flags)	do { (void)(flags); } while (0)
#define	nth_page(pg, i) \
	((struct page *)((void *)(pg) + (i) * PAGESIZE))

struct scatterlist {
	struct page *page;
	int length;
	int end;
};

static void
sg_init_table(struct scatterlist *sg, int nr)
{
	memset(sg, 0, nr * sizeof (struct scatterlist));
	sg[nr - 1].end = 1;
}

/*
 * This must be called if any of the sg_table allocation functions
 * are called.
 */
static void
abd_free_sg_table(abd_t *abd)
{
	int nents = ABD_SCATTER(abd).abd_nents;
	vmem_free(ABD_SCATTER(abd).abd_sgl,
	    nents * sizeof (struct scatterlist));
}

#define	for_each_sg(sgl, sg, nr, i)	\
	for ((i) = 0, (sg) = (sgl); (i) < (nr); (i)++, (sg) = sg_next(sg))

static inline void
sg_set_page(struct scatterlist *sg, struct page *page, unsigned int len,
    unsigned int offset)
{
	/* currently we don't use offset */
	ASSERT(offset == 0);
	sg->page = page;
	sg->length = len;
}

static inline struct page *
sg_page(struct scatterlist *sg)
{
	return (sg->page);
}

static inline struct scatterlist *
sg_next(struct scatterlist *sg)
{
	if (sg->end)
		return (NULL);

	return (sg + 1);
}

void
abd_alloc_chunks(abd_t *abd, size_t size)
{
	unsigned nr_pages = abd_chunkcnt_for_bytes(size);
	struct scatterlist *sg;
	int i;

	ABD_SCATTER(abd).abd_sgl = vmem_alloc(nr_pages *
	    sizeof (struct scatterlist), KM_SLEEP);
	sg_init_table(ABD_SCATTER(abd).abd_sgl, nr_pages);

	abd_for_each_sg(abd, sg, nr_pages, i) {
		struct page *p = umem_alloc_aligned(PAGESIZE, 64, KM_SLEEP);
		sg_set_page(sg, p, PAGESIZE, 0);
	}
	ABD_SCATTER(abd).abd_nents = nr_pages;
}

void
abd_free_chunks(abd_t *abd)
{
	int i, n = ABD_SCATTER(abd).abd_nents;
	struct scatterlist *sg;

	abd_for_each_sg(abd, sg, n, i) {
		struct page *p = nth_page(sg_page(sg), 0);
		umem_free_aligned(p, PAGESIZE);
	}
	abd_free_sg_table(abd);
}

static void
abd_alloc_zero_scatter(void)
{
	unsigned nr_pages = abd_chunkcnt_for_bytes(SPA_MAXBLOCKSIZE);
	struct scatterlist *sg;
	int i;

	abd_zero_page = umem_alloc_aligned(PAGESIZE, 64, KM_SLEEP);
	memset(abd_zero_page, 0, PAGESIZE);
	abd_zero_scatter = abd_alloc_struct(SPA_MAXBLOCKSIZE);
	abd_zero_scatter->abd_flags |= ABD_FLAG_OWNER;
	abd_zero_scatter->abd_flags |= ABD_FLAG_MULTI_CHUNK | ABD_FLAG_ZEROS;
	ABD_SCATTER(abd_zero_scatter).abd_offset = 0;
	ABD_SCATTER(abd_zero_scatter).abd_nents = nr_pages;
	abd_zero_scatter->abd_size = SPA_MAXBLOCKSIZE;
	ABD_SCATTER(abd_zero_scatter).abd_sgl = vmem_alloc(nr_pages *
	    sizeof (struct scatterlist), KM_SLEEP);

	sg_init_table(ABD_SCATTER(abd_zero_scatter).abd_sgl, nr_pages);

	abd_for_each_sg(abd_zero_scatter, sg, nr_pages, i) {
		sg_set_page(sg, abd_zero_page, PAGESIZE, 0);
	}

	ABDSTAT_BUMP(abdstat_scatter_cnt);
	ABDSTAT_INCR(abdstat_scatter_data_size, PAGESIZE);
	ABDSTAT_BUMP(abdstat_scatter_page_multi_chunk);
}

#endif /* _KERNEL */

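/*
 * Returns B_TRUE if an allocation of this size should be made linear,
 * based on zfs_abd_scatter_enabled and zfs_abd_scatter_min_size above.
 */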
boolean_t
abd_size_alloc_linear(size_t size)
{
	return (!zfs_abd_scatter_enabled || size < zfs_abd_scatter_min_size);
}

void
abd_update_scatter_stats(abd_t *abd, abd_stats_op_t op)
{
	ASSERT(op == ABDSTAT_INCR || op == ABDSTAT_DECR);
	int waste = P2ROUNDUP(abd->abd_size, PAGESIZE) - abd->abd_size;
	if (op == ABDSTAT_INCR) {
		ABDSTAT_BUMP(abdstat_scatter_cnt);
		ABDSTAT_INCR(abdstat_scatter_data_size, abd->abd_size);
		ABDSTAT_INCR(abdstat_scatter_chunk_waste, waste);
		arc_space_consume(waste, ARC_SPACE_ABD_CHUNK_WASTE);
	} else {
		ABDSTAT_BUMPDOWN(abdstat_scatter_cnt);
		ABDSTAT_INCR(abdstat_scatter_data_size, -(int)abd->abd_size);
		ABDSTAT_INCR(abdstat_scatter_chunk_waste, -waste);
		arc_space_return(waste, ARC_SPACE_ABD_CHUNK_WASTE);
	}
}

void
abd_update_linear_stats(abd_t *abd, abd_stats_op_t op)
{
	ASSERT(op == ABDSTAT_INCR || op == ABDSTAT_DECR);
	if (op == ABDSTAT_INCR) {
		ABDSTAT_BUMP(abdstat_linear_cnt);
		ABDSTAT_INCR(abdstat_linear_data_size, abd->abd_size);
	} else {
		ABDSTAT_BUMPDOWN(abdstat_linear_cnt);
		ABDSTAT_INCR(abdstat_linear_data_size, -(int)abd->abd_size);
	}
}

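/*
 * Scatter-specific half of abd_verify(): the offset must lie within the
 * first scatterlist entry, and every entry must have a page.
 */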
void
abd_verify_scatter(abd_t *abd)
{
	size_t n;
	int i = 0;
	struct scatterlist *sg = NULL;

	ASSERT3U(ABD_SCATTER(abd).abd_nents, >, 0);
	ASSERT3U(ABD_SCATTER(abd).abd_offset, <,
	    ABD_SCATTER(abd).abd_sgl->length);
	n = ABD_SCATTER(abd).abd_nents;
	abd_for_each_sg(abd, sg, n, i) {
		ASSERT3P(sg_page(sg), !=, NULL);
	}
}

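/*
 * Tear down abd_zero_scatter. abd_zero_page itself is only freed when
 * it was privately allocated (HAVE_ZERO_PAGE_GPL_ONLY in the kernel, or
 * always in userspace); a borrowed ZERO_PAGE is left alone.
 */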
static void
abd_free_zero_scatter(void)
{
	ABDSTAT_BUMPDOWN(abdstat_scatter_cnt);
	ABDSTAT_INCR(abdstat_scatter_data_size, -(int)PAGESIZE);
	ABDSTAT_BUMPDOWN(abdstat_scatter_page_multi_chunk);

	abd_free_sg_table(abd_zero_scatter);
	abd_free_struct(abd_zero_scatter);
	abd_zero_scatter = NULL;
	ASSERT3P(abd_zero_page, !=, NULL);
#if defined(_KERNEL)
#if defined(HAVE_ZERO_PAGE_GPL_ONLY)
	abd_unmark_zfs_page(abd_zero_page);
	__free_page(abd_zero_page);
#endif /* HAVE_ZERO_PAGE_GPL_ONLY */
#else
	umem_free_aligned(abd_zero_page, PAGESIZE);
#endif /* _KERNEL */
}

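/*
 * kstat update callback: fold the wmsum counters into the named kstat
 * values on read. Writes are rejected.
 */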
static int
abd_kstats_update(kstat_t *ksp, int rw)
{
	abd_stats_t *as = ksp->ks_data;

	if (rw == KSTAT_WRITE)
		return (EACCES);
	as->abdstat_struct_size.value.ui64 =
	    wmsum_value(&abd_sums.abdstat_struct_size);
	as->abdstat_linear_cnt.value.ui64 =
	    wmsum_value(&abd_sums.abdstat_linear_cnt);
	as->abdstat_linear_data_size.value.ui64 =
	    wmsum_value(&abd_sums.abdstat_linear_data_size);
	as->abdstat_scatter_cnt.value.ui64 =
	    wmsum_value(&abd_sums.abdstat_scatter_cnt);
	as->abdstat_scatter_data_size.value.ui64 =
	    wmsum_value(&abd_sums.abdstat_scatter_data_size);
	as->abdstat_scatter_chunk_waste.value.ui64 =
	    wmsum_value(&abd_sums.abdstat_scatter_chunk_waste);
	for (int i = 0; i < ABD_MAX_ORDER; i++) {
		as->abdstat_scatter_orders[i].value.ui64 =
		    wmsum_value(&abd_sums.abdstat_scatter_orders[i]);
	}
	as->abdstat_scatter_page_multi_chunk.value.ui64 =
	    wmsum_value(&abd_sums.abdstat_scatter_page_multi_chunk);
	as->abdstat_scatter_page_multi_zone.value.ui64 =
	    wmsum_value(&abd_sums.abdstat_scatter_page_multi_zone);
	as->abdstat_scatter_page_alloc_retry.value.ui64 =
	    wmsum_value(&abd_sums.abdstat_scatter_page_alloc_retry);
	as->abdstat_scatter_sg_table_retry.value.ui64 =
	    wmsum_value(&abd_sums.abdstat_scatter_sg_table_retry);
	return (0);
}

void
abd_init(void)
{
	int i;

	abd_cache = kmem_cache_create("abd_t", sizeof (abd_t),
	    0, NULL, NULL, NULL, NULL, NULL, 0);

	wmsum_init(&abd_sums.abdstat_struct_size, 0);
	wmsum_init(&abd_sums.abdstat_linear_cnt, 0);
	wmsum_init(&abd_sums.abdstat_linear_data_size, 0);
	wmsum_init(&abd_sums.abdstat_scatter_cnt, 0);
	wmsum_init(&abd_sums.abdstat_scatter_data_size, 0);
	wmsum_init(&abd_sums.abdstat_scatter_chunk_waste, 0);
	for (i = 0; i < ABD_MAX_ORDER; i++)
		wmsum_init(&abd_sums.abdstat_scatter_orders[i], 0);
	wmsum_init(&abd_sums.abdstat_scatter_page_multi_chunk, 0);
	wmsum_init(&abd_sums.abdstat_scatter_page_multi_zone, 0);
	wmsum_init(&abd_sums.abdstat_scatter_page_alloc_retry, 0);
	wmsum_init(&abd_sums.abdstat_scatter_sg_table_retry, 0);

	abd_ksp = kstat_create("zfs", 0, "abdstats", "misc", KSTAT_TYPE_NAMED,
	    sizeof (abd_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
	if (abd_ksp != NULL) {
		for (i = 0; i < ABD_MAX_ORDER; i++) {
			snprintf(abd_stats.abdstat_scatter_orders[i].name,
			    KSTAT_STRLEN, "scatter_order_%d", i);
			abd_stats.abdstat_scatter_orders[i].data_type =
			    KSTAT_DATA_UINT64;
		}
		abd_ksp->ks_data = &abd_stats;
		abd_ksp->ks_update = abd_kstats_update;
		kstat_install(abd_ksp);
	}

	abd_alloc_zero_scatter();
}

void
abd_fini(void)
{
	abd_free_zero_scatter();

	if (abd_ksp != NULL) {
		kstat_delete(abd_ksp);
		abd_ksp = NULL;
	}

	wmsum_fini(&abd_sums.abdstat_struct_size);
	wmsum_fini(&abd_sums.abdstat_linear_cnt);
	wmsum_fini(&abd_sums.abdstat_linear_data_size);
	wmsum_fini(&abd_sums.abdstat_scatter_cnt);
	wmsum_fini(&abd_sums.abdstat_scatter_data_size);
	wmsum_fini(&abd_sums.abdstat_scatter_chunk_waste);
	for (int i = 0; i < ABD_MAX_ORDER; i++)
		wmsum_fini(&abd_sums.abdstat_scatter_orders[i]);
	wmsum_fini(&abd_sums.abdstat_scatter_page_multi_chunk);
	wmsum_fini(&abd_sums.abdstat_scatter_page_multi_zone);
	wmsum_fini(&abd_sums.abdstat_scatter_page_alloc_retry);
	wmsum_fini(&abd_sums.abdstat_scatter_sg_table_retry);

	if (abd_cache) {
		kmem_cache_destroy(abd_cache);
		abd_cache = NULL;
	}
}

void
abd_free_linear_page(abd_t *abd)
{
	/* Transform it back into a scatter ABD for freeing */
	struct scatterlist *sg = abd->abd_u.abd_linear.abd_sgl;
	abd->abd_flags &= ~ABD_FLAG_LINEAR;
	abd->abd_flags &= ~ABD_FLAG_LINEAR_PAGE;
	ABD_SCATTER(abd).abd_nents = 1;
	ABD_SCATTER(abd).abd_offset = 0;
	ABD_SCATTER(abd).abd_sgl = sg;
	abd_free_chunks(abd);

	abd_update_scatter_stats(abd, ABDSTAT_DECR);
}

/*
 * If we're going to use this ABD for doing I/O using the block layer, the
 * consumer of the ABD data doesn't care if it's scattered or not, and we don't
 * plan to store this ABD in memory for a long period of time, we should
 * allocate the ABD type that requires the least data copying to do the I/O.
 *
 * On Linux the optimal thing to do would be to use abd_get_offset() and
 * construct a new ABD which shares the original pages, thereby eliminating
 * the copy. But for the moment a new linear ABD is allocated until this
 * performance optimization can be implemented.
 */
abd_t *
abd_alloc_for_io(size_t size, boolean_t is_metadata)
{
	return (abd_alloc(size, is_metadata));
}

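/*
 * Populate (or allocate, if abd is NULL) an ABD that shares sabd's
 * pages starting at off, without copying any data.
 */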
abd_t *
abd_get_offset_scatter(abd_t *abd, abd_t *sabd, size_t off,
    size_t size)
{
	(void) size;
	int i = 0;
	struct scatterlist *sg = NULL;

	abd_verify(sabd);
	ASSERT3U(off, <=, sabd->abd_size);

	size_t new_offset = ABD_SCATTER(sabd).abd_offset + off;

	if (abd == NULL)
		abd = abd_alloc_struct(0);

	/*
	 * Even if this buf is filesystem metadata, we only track that
	 * if we own the underlying data buffer, which is not true in
	 * this case. Therefore, we don't ever use ABD_FLAG_META here.
	 */

	abd_for_each_sg(sabd, sg, ABD_SCATTER(sabd).abd_nents, i) {
		if (new_offset < sg->length)
			break;
		new_offset -= sg->length;
	}

	ABD_SCATTER(abd).abd_sgl = sg;
	ABD_SCATTER(abd).abd_offset = new_offset;
	ABD_SCATTER(abd).abd_nents = ABD_SCATTER(sabd).abd_nents - i;

	return (abd);
}

/*
 * Initialize the abd_iter.
 */
void
abd_iter_init(struct abd_iter *aiter, abd_t *abd)
{
	ASSERT(!abd_is_gang(abd));
	abd_verify(abd);
	memset(aiter, 0, sizeof (struct abd_iter));
	aiter->iter_abd = abd;
	if (!abd_is_linear(abd)) {
		aiter->iter_offset = ABD_SCATTER(abd).abd_offset;
		aiter->iter_sg = ABD_SCATTER(abd).abd_sgl;
	}
}

/*
 * This is just a helper function to see if we have exhausted the
 * abd_iter and reached the end.
 */
boolean_t
abd_iter_at_end(struct abd_iter *aiter)
{
	ASSERT3U(aiter->iter_pos, <=, aiter->iter_abd->abd_size);
	return (aiter->iter_pos == aiter->iter_abd->abd_size);
}

/*
 * Advance the iterator by a certain amount. Cannot be called when a chunk is
 * in use. This can be safely called when the iterator has already been
 * exhausted, in which case this does nothing.
 */
void
abd_iter_advance(struct abd_iter *aiter, size_t amount)
{
	/*
	 * Ensure that the last chunk is not in use. abd_iterate_*() must
	 * clear this state (directly or via abd_iter_unmap()) before
	 * advancing.
	 */
	ASSERT3P(aiter->iter_mapaddr, ==, NULL);
	ASSERT0(aiter->iter_mapsize);
	ASSERT3P(aiter->iter_page, ==, NULL);
	ASSERT0(aiter->iter_page_doff);
	ASSERT0(aiter->iter_page_dsize);

	/* There's nothing left to advance to, so do nothing */
	if (abd_iter_at_end(aiter))
		return;

	aiter->iter_pos += amount;
	aiter->iter_offset += amount;
	if (!abd_is_linear(aiter->iter_abd)) {
		while (aiter->iter_offset >= aiter->iter_sg->length) {
			aiter->iter_offset -= aiter->iter_sg->length;
			aiter->iter_sg = sg_next(aiter->iter_sg);
			if (aiter->iter_sg == NULL) {
				ASSERT0(aiter->iter_offset);
				break;
			}
		}
	}
}

/*
 * Map the current chunk into aiter. This can be safely called when the
 * iterator has already been exhausted, in which case this does nothing.
 */
void
abd_iter_map(struct abd_iter *aiter)
{
	void *paddr;
	size_t offset = 0;

	ASSERT3P(aiter->iter_mapaddr, ==, NULL);
	ASSERT0(aiter->iter_mapsize);

	/* There's nothing left to iterate over, so do nothing */
	if (abd_iter_at_end(aiter))
		return;

	if (abd_is_linear(aiter->iter_abd)) {
		ASSERT3U(aiter->iter_pos, ==, aiter->iter_offset);
		offset = aiter->iter_offset;
		aiter->iter_mapsize = aiter->iter_abd->abd_size - offset;
		paddr = ABD_LINEAR_BUF(aiter->iter_abd);
	} else {
		offset = aiter->iter_offset;
		aiter->iter_mapsize = MIN(aiter->iter_sg->length - offset,
		    aiter->iter_abd->abd_size - aiter->iter_pos);

		paddr = zfs_kmap_atomic(sg_page(aiter->iter_sg));
	}

	aiter->iter_mapaddr = (char *)paddr + offset;
}

/*
 * Unmap the current chunk from aiter. This can be safely called when the
 * iterator has already been exhausted, in which case this does nothing.
 */
void
abd_iter_unmap(struct abd_iter *aiter)
{
	/* There's nothing left to unmap, so do nothing */
	if (abd_iter_at_end(aiter))
		return;

	if (!abd_is_linear(aiter->iter_abd)) {
		/* LINTED E_FUNC_SET_NOT_USED */
		zfs_kunmap_atomic(aiter->iter_mapaddr - aiter->iter_offset);
	}

	ASSERT3P(aiter->iter_mapaddr, !=, NULL);
	ASSERT3U(aiter->iter_mapsize, >, 0);

	aiter->iter_mapaddr = NULL;
	aiter->iter_mapsize = 0;
}
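
/*
 * A minimal sketch (for illustration only; the real consumers are the
 * generic iterators in abd.c, e.g. abd_iterate_func()) of how these
 * primitives compose:
 *
 *	struct abd_iter aiter;
 *	abd_iter_init(&aiter, abd);
 *	while (!abd_iter_at_end(&aiter)) {
 *		abd_iter_map(&aiter);
 *		size_t len = aiter.iter_mapsize;
 *		... operate on aiter.iter_mapaddr for len bytes ...
 *		abd_iter_unmap(&aiter);
 *		abd_iter_advance(&aiter, len);
 *	}
 */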

void
abd_cache_reap_now(void)
{
}

#if defined(_KERNEL)
/*
 * Yield the next page struct and data offset and size within it, without
 * mapping it into the address space.
 */
void
abd_iter_page(struct abd_iter *aiter)
{
	if (abd_iter_at_end(aiter)) {
		aiter->iter_page = NULL;
		aiter->iter_page_doff = 0;
		aiter->iter_page_dsize = 0;
		return;
	}

	struct page *page;
	size_t doff, dsize;

	if (abd_is_linear(aiter->iter_abd)) {
		ASSERT3U(aiter->iter_pos, ==, aiter->iter_offset);

		/* memory address at iter_pos */
		void *paddr = ABD_LINEAR_BUF(aiter->iter_abd) +
		    aiter->iter_pos;

		/* struct page for address */
		page = is_vmalloc_addr(paddr) ?
		    vmalloc_to_page(paddr) : virt_to_page(paddr);

		/* offset of address within the page */
		doff = offset_in_page(paddr);

		/* total data remaining in abd from this position */
		dsize = aiter->iter_abd->abd_size - aiter->iter_offset;
	} else {
		ASSERT(!abd_is_gang(aiter->iter_abd));

		/* current scatter page */
		page = sg_page(aiter->iter_sg);

		/* position within page */
		doff = aiter->iter_offset;

		/* remaining data in scatterlist */
		dsize = MIN(aiter->iter_sg->length - aiter->iter_offset,
		    aiter->iter_abd->abd_size - aiter->iter_pos);
	}
	ASSERT(page);

	if (PageTail(page)) {
		/*
		 * This page is part of a "compound page", which is a group of
		 * pages that can be referenced from a single struct page *.
		 * It's organised as a "head" page, followed by a series of
		 * "tail" pages.
		 *
		 * In OpenZFS, compound pages are allocated using the
		 * __GFP_COMP flag, which we get from scatter ABDs and SPL
		 * vmalloc slabs (i.e. >16K allocations). So a great many of
		 * the IO buffers we get are going to be of this type.
		 *
		 * The tail pages are just regular PAGE_SIZE pages, and can be
		 * safely used as-is. However, the head page has length
		 * covering itself and all the tail pages. If this ABD chunk
		 * spans multiple pages, then we can use the head page and a
		 * >PAGE_SIZE length, which is far more efficient.
		 *
		 * To do this, we need to adjust the offset to be counted from
		 * the head page. struct page for compound pages are stored
		 * contiguously, so we can just adjust by a simple offset.
		 */
		struct page *head = compound_head(page);
		doff += ((page - head) * PAGESIZE);
		page = head;
	}

	/* final page and position within it */
	aiter->iter_page = page;
	aiter->iter_page_doff = doff;

	/* amount of data in the chunk, up to the end of the page */
	aiter->iter_page_dsize = MIN(dsize, page_size(page) - doff);
}

/*
 * Note: ABD BIO functions only needed to support vdev_classic. See comments
 * in vdev_disk.c.
 */

/*
 * bio_nr_pages for ABD.
 * @off is the offset in @abd
 */
unsigned long
abd_nr_pages_off(abd_t *abd, unsigned int size, size_t off)
{
	unsigned long pos;

	if (abd_is_gang(abd)) {
		unsigned long count = 0;

		for (abd_t *cabd = abd_gang_get_offset(abd, &off);
		    cabd != NULL && size != 0;
		    cabd = list_next(&ABD_GANG(abd).abd_gang_chain, cabd)) {
			ASSERT3U(off, <, cabd->abd_size);
			int mysize = MIN(size, cabd->abd_size - off);
			count += abd_nr_pages_off(cabd, mysize, off);
			size -= mysize;
			off = 0;
		}
		return (count);
	}

	if (abd_is_linear(abd))
		pos = (unsigned long)abd_to_buf(abd) + off;
	else
		pos = ABD_SCATTER(abd).abd_offset + off;

	return (((pos + size + PAGESIZE - 1) >> PAGE_SHIFT) -
	    (pos >> PAGE_SHIFT));
}

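/*
 * Fill @bio with pages backing the (virtually contiguous) buffer
 * @buf_ptr. Returns the number of bytes that could not be added, i.e.
 * 0 on complete success.
 */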
static unsigned int
bio_map(struct bio *bio, void *buf_ptr, unsigned int bio_size)
{
	unsigned int offset, size, i;
	struct page *page;

	offset = offset_in_page(buf_ptr);
	for (i = 0; i < bio->bi_max_vecs; i++) {
		size = PAGE_SIZE - offset;

		if (bio_size <= 0)
			break;

		if (size > bio_size)
			size = bio_size;

		if (is_vmalloc_addr(buf_ptr))
			page = vmalloc_to_page(buf_ptr);
		else
			page = virt_to_page(buf_ptr);

		/*
		 * Some network-related block devices use tcp_sendpage, which
		 * doesn't behave well when given a 0-count page; this is a
		 * safety net to catch them.
		 */
		ASSERT3S(page_count(page), >, 0);

		if (bio_add_page(bio, page, size, offset) != size)
			break;

		buf_ptr += size;
		bio_size -= size;
		offset = 0;
	}

	return (bio_size);
}

/*
 * bio_map for gang ABD.
 */
static unsigned int
abd_gang_bio_map_off(struct bio *bio, abd_t *abd,
    unsigned int io_size, size_t off)
{
	ASSERT(abd_is_gang(abd));

	for (abd_t *cabd = abd_gang_get_offset(abd, &off);
	    cabd != NULL;
	    cabd = list_next(&ABD_GANG(abd).abd_gang_chain, cabd)) {
		ASSERT3U(off, <, cabd->abd_size);
		int size = MIN(io_size, cabd->abd_size - off);
		int remainder = abd_bio_map_off(bio, cabd, size, off);
		io_size -= (size - remainder);
		if (io_size == 0 || remainder > 0)
			return (io_size);
		off = 0;
	}
	ASSERT0(io_size);
	return (io_size);
}

/*
 * bio_map for ABD.
 * @off is the offset in @abd
 * Remaining IO size is returned
 */
unsigned int
abd_bio_map_off(struct bio *bio, abd_t *abd,
    unsigned int io_size, size_t off)
{
	struct abd_iter aiter;

	ASSERT3U(io_size, <=, abd->abd_size - off);
	if (abd_is_linear(abd))
		return (bio_map(bio, ((char *)abd_to_buf(abd)) + off,
		    io_size));

	ASSERT(!abd_is_linear(abd));
	if (abd_is_gang(abd))
		return (abd_gang_bio_map_off(bio, abd, io_size, off));

	abd_iter_init(&aiter, abd);
	abd_iter_advance(&aiter, off);

	for (int i = 0; i < bio->bi_max_vecs; i++) {
		struct page *pg;
		size_t len, sgoff, pgoff;
		struct scatterlist *sg;

		if (io_size <= 0)
			break;

		sg = aiter.iter_sg;
		sgoff = aiter.iter_offset;
		pgoff = sgoff & (PAGESIZE - 1);
		len = MIN(io_size, PAGESIZE - pgoff);
		ASSERT(len > 0);

		pg = nth_page(sg_page(sg), sgoff >> PAGE_SHIFT);
		if (bio_add_page(bio, pg, len, pgoff) != len)
			break;

		io_size -= len;
		abd_iter_advance(&aiter, len);
	}

	return (io_size);
}

/* Tunable Parameters */
module_param(zfs_abd_scatter_enabled, int, 0644);
MODULE_PARM_DESC(zfs_abd_scatter_enabled,
	"Toggle whether ABD allocations must be linear.");
module_param(zfs_abd_scatter_min_size, int, 0644);
MODULE_PARM_DESC(zfs_abd_scatter_min_size,
	"Minimum size of scatter allocations.");
/* CSTYLED */
module_param(zfs_abd_scatter_max_order, uint, 0644);
MODULE_PARM_DESC(zfs_abd_scatter_max_order,
	"Maximum order allocation used for a scatter ABD.");

#endif /* _KERNEL */