/*
 * BSD 3-Clause New License (https://spdx.org/licenses/BSD-3-Clause.html)
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Copyright (c) 2016-2018, Klara Inc.
 * Copyright (c) 2016-2018, Allan Jude
 * Copyright (c) 2018-2020, Sebastian Gottschall
 * Copyright (c) 2019-2020, Michael Niewöhner
 * Copyright (c) 2020, The FreeBSD Foundation [1]
 *
 * [1] Portions of this software were developed by Allan Jude
 * under sponsorship from the FreeBSD Foundation.
 */

#include <sys/param.h>
#include <sys/sysmacros.h>
#include <sys/zfs_context.h>
#include <sys/zio_compress.h>
#include <sys/spa.h>
#include <sys/zstd/zstd.h>

#define ZSTD_STATIC_LINKING_ONLY
#include "lib/zstd.h"
#include "lib/common/zstd_errors.h"

static uint_t zstd_earlyabort_pass = 1;
static int zstd_cutoff_level = ZIO_ZSTD_LEVEL_3;
static unsigned int zstd_abort_size = (128 * 1024);

static kstat_t *zstd_ksp = NULL;

typedef struct zstd_stats {
	kstat_named_t zstd_stat_alloc_fail;
	kstat_named_t zstd_stat_alloc_fallback;
	kstat_named_t zstd_stat_com_alloc_fail;
	kstat_named_t zstd_stat_dec_alloc_fail;
	kstat_named_t zstd_stat_com_inval;
	kstat_named_t zstd_stat_dec_inval;
	kstat_named_t zstd_stat_dec_header_inval;
	kstat_named_t zstd_stat_com_fail;
	kstat_named_t zstd_stat_dec_fail;
	/*
	 * LZ4 first-pass early abort verdict
	 */
	kstat_named_t zstd_stat_lz4pass_allowed;
	kstat_named_t zstd_stat_lz4pass_rejected;
	/*
	 * zstd-1 second-pass early abort verdict
	 */
	kstat_named_t zstd_stat_zstdpass_allowed;
	kstat_named_t zstd_stat_zstdpass_rejected;
	/*
	 * The block was excluded from the early abort passes (feature
	 * disabled, level below the cutoff, or block smaller than
	 * zstd_abort_size)
	 */
	kstat_named_t zstd_stat_passignored;
	kstat_named_t zstd_stat_passignored_size;
	kstat_named_t zstd_stat_buffers;
	kstat_named_t zstd_stat_size;
} zstd_stats_t;

static zstd_stats_t zstd_stats = {
	{ "alloc_fail", KSTAT_DATA_UINT64 },
	{ "alloc_fallback", KSTAT_DATA_UINT64 },
	{ "compress_alloc_fail", KSTAT_DATA_UINT64 },
	{ "decompress_alloc_fail", KSTAT_DATA_UINT64 },
	{ "compress_level_invalid", KSTAT_DATA_UINT64 },
	{ "decompress_level_invalid", KSTAT_DATA_UINT64 },
	{ "decompress_header_invalid", KSTAT_DATA_UINT64 },
	{ "compress_failed", KSTAT_DATA_UINT64 },
	{ "decompress_failed", KSTAT_DATA_UINT64 },
	{ "lz4pass_allowed", KSTAT_DATA_UINT64 },
	{ "lz4pass_rejected", KSTAT_DATA_UINT64 },
	{ "zstdpass_allowed", KSTAT_DATA_UINT64 },
	{ "zstdpass_rejected", KSTAT_DATA_UINT64 },
	{ "passignored", KSTAT_DATA_UINT64 },
	{ "passignored_size", KSTAT_DATA_UINT64 },
	{ "buffers", KSTAT_DATA_UINT64 },
	{ "size", KSTAT_DATA_UINT64 },
};

#ifdef _KERNEL
static int
kstat_zstd_update(kstat_t *ksp, int rw)
{
	ASSERT(ksp != NULL);

	if (rw == KSTAT_WRITE && ksp == zstd_ksp) {
		ZSTDSTAT_ZERO(zstd_stat_alloc_fail);
		ZSTDSTAT_ZERO(zstd_stat_alloc_fallback);
		ZSTDSTAT_ZERO(zstd_stat_com_alloc_fail);
		ZSTDSTAT_ZERO(zstd_stat_dec_alloc_fail);
		ZSTDSTAT_ZERO(zstd_stat_com_inval);
		ZSTDSTAT_ZERO(zstd_stat_dec_inval);
		ZSTDSTAT_ZERO(zstd_stat_dec_header_inval);
		ZSTDSTAT_ZERO(zstd_stat_com_fail);
		ZSTDSTAT_ZERO(zstd_stat_dec_fail);
		ZSTDSTAT_ZERO(zstd_stat_lz4pass_allowed);
		ZSTDSTAT_ZERO(zstd_stat_lz4pass_rejected);
		ZSTDSTAT_ZERO(zstd_stat_zstdpass_allowed);
		ZSTDSTAT_ZERO(zstd_stat_zstdpass_rejected);
		ZSTDSTAT_ZERO(zstd_stat_passignored);
		ZSTDSTAT_ZERO(zstd_stat_passignored_size);
	}

	return (0);
}
#endif

/* Enum describing the allocator type specified by kmem_type in zstd_kmem */
enum zstd_kmem_type {
	ZSTD_KMEM_UNKNOWN = 0,
	/* Allocation type using vmem_alloc */
	ZSTD_KMEM_DEFAULT,
	/* Pool based allocation using zstd_mempool_alloc */
	ZSTD_KMEM_POOL,
	/* Reserved fallback memory for decompression only */
	ZSTD_KMEM_DCTX,
	ZSTD_KMEM_COUNT,
};

/* Structure for pooled memory objects */
struct zstd_pool {
	void *mem;
	size_t size;
	kmutex_t barrier;
	hrtime_t timeout;
};

/* Global structure for handling memory allocations */
struct zstd_kmem {
	enum zstd_kmem_type kmem_type;
	size_t kmem_size;
	struct zstd_pool *pool;
};

/* Fallback memory structure used for decompression only if memory runs out */
struct zstd_fallback_mem {
	size_t mem_size;
	void *mem;
	kmutex_t barrier;
};

struct zstd_levelmap {
	int16_t zstd_level;
	enum zio_zstd_levels level;
};

/*
 * ZSTD memory handlers
 *
 * For decompression we use a different handler which also provides fallback
 * memory allocation in case memory runs out.
 *
 * The handlers were split up to keep the implementation as simple as
 * possible.
 */
static void *zstd_alloc(void *opaque, size_t size);
static void *zstd_dctx_alloc(void *opaque, size_t size);
static void zstd_free(void *opaque, void *ptr);

/* Compression memory handler */
static const ZSTD_customMem zstd_malloc = {
	zstd_alloc,
	zstd_free,
	NULL,
};

/* Decompression memory handler */
static const ZSTD_customMem zstd_dctx_malloc = {
	zstd_dctx_alloc,
	zstd_free,
	NULL,
};
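
/*
 * The trailing NULL in both handlers is ZSTD_customMem's opaque state
 * pointer, which zstd passes back to the alloc/free callbacks as their
 * first argument; these handlers do not use it.
 */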

/* Level map for converting ZFS internal levels to ZSTD levels and vice versa */
static struct zstd_levelmap zstd_levels[] = {
	{ZIO_ZSTD_LEVEL_1, ZIO_ZSTD_LEVEL_1},
	{ZIO_ZSTD_LEVEL_2, ZIO_ZSTD_LEVEL_2},
	{ZIO_ZSTD_LEVEL_3, ZIO_ZSTD_LEVEL_3},
	{ZIO_ZSTD_LEVEL_4, ZIO_ZSTD_LEVEL_4},
	{ZIO_ZSTD_LEVEL_5, ZIO_ZSTD_LEVEL_5},
	{ZIO_ZSTD_LEVEL_6, ZIO_ZSTD_LEVEL_6},
	{ZIO_ZSTD_LEVEL_7, ZIO_ZSTD_LEVEL_7},
	{ZIO_ZSTD_LEVEL_8, ZIO_ZSTD_LEVEL_8},
	{ZIO_ZSTD_LEVEL_9, ZIO_ZSTD_LEVEL_9},
	{ZIO_ZSTD_LEVEL_10, ZIO_ZSTD_LEVEL_10},
	{ZIO_ZSTD_LEVEL_11, ZIO_ZSTD_LEVEL_11},
	{ZIO_ZSTD_LEVEL_12, ZIO_ZSTD_LEVEL_12},
	{ZIO_ZSTD_LEVEL_13, ZIO_ZSTD_LEVEL_13},
	{ZIO_ZSTD_LEVEL_14, ZIO_ZSTD_LEVEL_14},
	{ZIO_ZSTD_LEVEL_15, ZIO_ZSTD_LEVEL_15},
	{ZIO_ZSTD_LEVEL_16, ZIO_ZSTD_LEVEL_16},
	{ZIO_ZSTD_LEVEL_17, ZIO_ZSTD_LEVEL_17},
	{ZIO_ZSTD_LEVEL_18, ZIO_ZSTD_LEVEL_18},
	{ZIO_ZSTD_LEVEL_19, ZIO_ZSTD_LEVEL_19},
	{-1, ZIO_ZSTD_LEVEL_FAST_1},
	{-2, ZIO_ZSTD_LEVEL_FAST_2},
	{-3, ZIO_ZSTD_LEVEL_FAST_3},
	{-4, ZIO_ZSTD_LEVEL_FAST_4},
	{-5, ZIO_ZSTD_LEVEL_FAST_5},
	{-6, ZIO_ZSTD_LEVEL_FAST_6},
	{-7, ZIO_ZSTD_LEVEL_FAST_7},
	{-8, ZIO_ZSTD_LEVEL_FAST_8},
	{-9, ZIO_ZSTD_LEVEL_FAST_9},
	{-10, ZIO_ZSTD_LEVEL_FAST_10},
	{-20, ZIO_ZSTD_LEVEL_FAST_20},
	{-30, ZIO_ZSTD_LEVEL_FAST_30},
	{-40, ZIO_ZSTD_LEVEL_FAST_40},
	{-50, ZIO_ZSTD_LEVEL_FAST_50},
	{-60, ZIO_ZSTD_LEVEL_FAST_60},
	{-70, ZIO_ZSTD_LEVEL_FAST_70},
	{-80, ZIO_ZSTD_LEVEL_FAST_80},
	{-90, ZIO_ZSTD_LEVEL_FAST_90},
	{-100, ZIO_ZSTD_LEVEL_FAST_100},
	{-500, ZIO_ZSTD_LEVEL_FAST_500},
	{-1000, ZIO_ZSTD_LEVEL_FAST_1000},
};

/*
 * This variable represents the maximum count of the pool based on the number
 * of CPUs plus some buffer. We default to cpu count * 4, see zstd_init.
 */
static int pool_count = 16;

#define ZSTD_POOL_MAX pool_count
#define ZSTD_POOL_TIMEOUT 60 * 2
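
/*
 * ZSTD_POOL_TIMEOUT is expressed in seconds: pool objects left unused for
 * two minutes become eligible for release by zstd_mempool_reap().
 */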

static struct zstd_fallback_mem zstd_dctx_fallback;
static struct zstd_pool *zstd_mempool_cctx;
static struct zstd_pool *zstd_mempool_dctx;

/*
 * The zstd library code expects these functions when ADDRESS_SANITIZER is
 * defined. Userspace ASAN provides them, but KASAN defines ADDRESS_SANITIZER
 * without providing the functions. To avoid changing the external code, we
 * supply no-op stubs here.
 */
#if defined(ZFS_ASAN_ENABLED)
#define ADDRESS_SANITIZER 1
#endif
#if defined(_KERNEL) && defined(ADDRESS_SANITIZER)
void __asan_unpoison_memory_region(void const volatile *addr, size_t size);
void __asan_poison_memory_region(void const volatile *addr, size_t size);
void __asan_unpoison_memory_region(void const volatile *addr, size_t size) {}
void __asan_poison_memory_region(void const volatile *addr, size_t size) {}
#endif

static void
zstd_mempool_reap(struct zstd_pool *zstd_mempool)
{
	struct zstd_pool *pool;

	if (!zstd_mempool || !ZSTDSTAT(zstd_stat_buffers)) {
		return;
	}

	/* Free obsolete slots */
	for (int i = 0; i < ZSTD_POOL_MAX; i++) {
		pool = &zstd_mempool[i];
		if (pool->mem && mutex_tryenter(&pool->barrier)) {
			/*
			 * Free memory if the unused object is older than
			 * 2 minutes.
			 */
			if (pool->mem && gethrestime_sec() > pool->timeout) {
				vmem_free(pool->mem, pool->size);
				ZSTDSTAT_SUB(zstd_stat_buffers, 1);
				ZSTDSTAT_SUB(zstd_stat_size, pool->size);
				pool->mem = NULL;
				pool->size = 0;
				pool->timeout = 0;
			}
			mutex_exit(&pool->barrier);
		}
	}
}

/*
 * Try to get a cached allocated buffer from memory pool or allocate a new one
 * if necessary. If an object is older than 2 minutes and does not fit the
 * requested size, it will be released and a new cached entry will be
 * allocated. If other pooled objects are detected without being used for
 * 2 minutes, they will be released, too.
 *
 * The concept is that high frequency memory allocations of bigger objects are
 * expensive. So if a lot of work is going on, allocations will be kept for a
 * while and can be reused in that time frame.
 *
 * The scheduled release will be updated every time an object is reused.
 */

static void *
zstd_mempool_alloc(struct zstd_pool *zstd_mempool, size_t size)
{
	struct zstd_pool *pool;
	struct zstd_kmem *mem = NULL;

	if (!zstd_mempool) {
		return (NULL);
	}

	/* Scan for a preallocated memory slot and free obsolete slots */
	for (int i = 0; i < ZSTD_POOL_MAX; i++) {
		pool = &zstd_mempool[i];
		/*
		 * This lock is simply a marker for a pool object being in use.
		 * If it's already held, it will be skipped.
		 *
		 * We need to take the lock before checking the object to
		 * avoid race conditions caused by running in a threaded
		 * context.
		 *
		 * The lock is later released by zstd_mempool_free.
		 */
		if (mutex_tryenter(&pool->barrier)) {
			/*
			 * Check if the object fits the requested size; if so,
			 * take it and update the timestamp.
			 */
			if (pool->mem && size <= pool->size) {
				pool->timeout = gethrestime_sec() +
				    ZSTD_POOL_TIMEOUT;
				mem = pool->mem;
				return (mem);
			}
			mutex_exit(&pool->barrier);
		}
	}

	/*
	 * If no preallocated slot was found, try to fill in a new one.
	 *
	 * We run a similar algorithm twice here to avoid pool fragmentation.
	 * The first one may generate holes in the list if objects get
	 * released. We always make sure that these holes get filled instead
	 * of adding new allocations constantly at the end.
	 */
	for (int i = 0; i < ZSTD_POOL_MAX; i++) {
		pool = &zstd_mempool[i];
		if (mutex_tryenter(&pool->barrier)) {
			/* Object is free, try to allocate new one */
			if (!pool->mem) {
				mem = vmem_alloc(size, KM_SLEEP);
				if (mem) {
					ZSTDSTAT_ADD(zstd_stat_buffers, 1);
					ZSTDSTAT_ADD(zstd_stat_size, size);
					pool->mem = mem;
					pool->size = size;
					/* Keep track for later release */
					mem->pool = pool;
					mem->kmem_type = ZSTD_KMEM_POOL;
					mem->kmem_size = size;
				}
			}

			if (size <= pool->size) {
				/* Update timestamp */
				pool->timeout = gethrestime_sec() +
				    ZSTD_POOL_TIMEOUT;

				return (pool->mem);
			}

			mutex_exit(&pool->barrier);
		}
	}

	/*
	 * If the pool is full or the allocation failed, try lazy allocation
	 * instead.
	 */
	if (!mem) {
		mem = vmem_alloc(size, KM_NOSLEEP);
		if (mem) {
			mem->pool = NULL;
			mem->kmem_type = ZSTD_KMEM_DEFAULT;
			mem->kmem_size = size;
		}
	}

	return (mem);
}

/* Mark object as released by releasing the barrier mutex */
static void
zstd_mempool_free(struct zstd_kmem *z)
{
	mutex_exit(&z->pool->barrier);
}
412 | ||
413 | /* Convert ZFS internal enum to ZSTD level */ | |
414 | static int | |
415 | zstd_enum_to_level(enum zio_zstd_levels level, int16_t *zstd_level) | |
416 | { | |
417 | if (level > 0 && level <= ZIO_ZSTD_LEVEL_19) { | |
418 | *zstd_level = zstd_levels[level - 1].zstd_level; | |
419 | return (0); | |
420 | } | |
421 | if (level >= ZIO_ZSTD_LEVEL_FAST_1 && | |
422 | level <= ZIO_ZSTD_LEVEL_FAST_1000) { | |
423 | *zstd_level = zstd_levels[level - ZIO_ZSTD_LEVEL_FAST_1 | |
424 | + ZIO_ZSTD_LEVEL_19].zstd_level; | |
425 | return (0); | |
426 | } | |
427 | ||
428 | /* Invalid/unknown zfs compression enum - this should never happen. */ | |
429 | return (1); | |
430 | } | |
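
/*
 * Example (illustrative): the enum values for levels 1-19 coincide with the
 * zstd numeric levels (as the table lookup above relies on), so
 * ZIO_ZSTD_LEVEL_9 resolves to zstd level 9 via zstd_levels[8]; the fast
 * levels follow from index 19 onward, so ZIO_ZSTD_LEVEL_FAST_10 resolves to
 * zstd level -10 via zstd_levels[19 + 9].
 */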

size_t
zfs_zstd_compress_wrap(void *s_start, void *d_start, size_t s_len, size_t d_len,
    int level)
{
	int16_t zstd_level;
	if (zstd_enum_to_level(level, &zstd_level)) {
		ZSTDSTAT_BUMP(zstd_stat_com_inval);
		return (s_len);
	}
	/*
	 * A zstd early abort heuristic.
	 *
	 * - Zeroth, if this is below the cutoff level (zstd-3 by default), or
	 *   smaller than zstd_abort_size (currently 128k), don't try any of
	 *   this, just go. (Experimentally, that was a reasonable cutoff for
	 *   a perf win with a tiny ratio change.)
	 * - First, we try LZ4 compression, and if it doesn't early abort, we
	 *   jump directly to whatever compression level we intended to try.
	 * - Second, we try zstd-1 - if that errors out (usually, but not
	 *   exclusively, if it would overflow), we give up early.
	 *
	 *   If it works, instead we go on and compress anyway.
	 *
	 * Why two passes? LZ4 alone gets you a lot of the way, but on highly
	 * compressible data, it was losing up to 8.5% of the compressed
	 * savings versus no early abort, and all the zstd-fast levels are
	 * worse indications on their own than LZ4, and don't improve the LZ4
	 * pass noticeably if stacked like this.
	 */
	size_t actual_abort_size = zstd_abort_size;
	if (zstd_earlyabort_pass > 0 && zstd_level >= zstd_cutoff_level &&
	    s_len >= actual_abort_size) {
		int pass_len = 1;
		pass_len = lz4_compress_zfs(s_start, d_start, s_len, d_len, 0);
		if (pass_len < d_len) {
			ZSTDSTAT_BUMP(zstd_stat_lz4pass_allowed);
			goto keep_trying;
		}
		ZSTDSTAT_BUMP(zstd_stat_lz4pass_rejected);

		pass_len = zfs_zstd_compress(s_start, d_start, s_len, d_len,
		    ZIO_ZSTD_LEVEL_1);
		if (pass_len == s_len || pass_len <= 0 || pass_len > d_len) {
			ZSTDSTAT_BUMP(zstd_stat_zstdpass_rejected);
			return (s_len);
		}
		ZSTDSTAT_BUMP(zstd_stat_zstdpass_allowed);
	} else {
		ZSTDSTAT_BUMP(zstd_stat_passignored);
		if (s_len < actual_abort_size) {
			ZSTDSTAT_BUMP(zstd_stat_passignored_size);
		}
	}
keep_trying:
	return (zfs_zstd_compress(s_start, d_start, s_len, d_len, level));
}
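
/*
 * Illustrative behavior under the defaults above (a sketch, not normative):
 * a 1M record written at zstd-9 first goes through the LZ4 pass and, if LZ4
 * cannot shrink it, through a zstd-1 pass before the real zstd-9 attempt is
 * made; a 64k record (below zstd_abort_size) or a zstd-1/zstd-2 record
 * (below the cutoff level) skips the heuristic entirely.
 */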

/* Compress block using zstd */
size_t
zfs_zstd_compress(void *s_start, void *d_start, size_t s_len, size_t d_len,
    int level)
{
	size_t c_len;
	int16_t zstd_level;
	zfs_zstdhdr_t *hdr;
	ZSTD_CCtx *cctx;

	hdr = (zfs_zstdhdr_t *)d_start;

	/* Skip compression if the specified level is invalid */
	if (zstd_enum_to_level(level, &zstd_level)) {
		ZSTDSTAT_BUMP(zstd_stat_com_inval);
		return (s_len);
	}

	ASSERT3U(d_len, >=, sizeof (*hdr));
	ASSERT3U(d_len, <=, s_len);
	ASSERT3U(zstd_level, !=, 0);

	cctx = ZSTD_createCCtx_advanced(zstd_malloc);

	/*
	 * Out of kernel memory, gently fall through - this will disable
	 * compression in zio_compress_data
	 */
	if (!cctx) {
		ZSTDSTAT_BUMP(zstd_stat_com_alloc_fail);
		return (s_len);
	}

	/* Set the compression level */
	ZSTD_CCtx_setParameter(cctx, ZSTD_c_compressionLevel, zstd_level);

	/* Use the "magicless" zstd header which saves us 4 header bytes */
	ZSTD_CCtx_setParameter(cctx, ZSTD_c_format, ZSTD_f_zstd1_magicless);

	/*
	 * Disable redundant checksum calculation and content size storage
	 * since this is already done by ZFS itself.
	 */
	ZSTD_CCtx_setParameter(cctx, ZSTD_c_checksumFlag, 0);
	ZSTD_CCtx_setParameter(cctx, ZSTD_c_contentSizeFlag, 0);

	c_len = ZSTD_compress2(cctx,
	    hdr->data,
	    d_len - sizeof (*hdr),
	    s_start, s_len);

	ZSTD_freeCCtx(cctx);

	/* Error in the compression routine, disable compression. */
	if (ZSTD_isError(c_len)) {
		/*
		 * If we are aborting the compression because the savings are
		 * too small, that is not a failure. Everything else is a
		 * failure, so increment the compression failure counter.
		 */
		int err = ZSTD_getErrorCode(c_len);
		if (err != ZSTD_error_dstSize_tooSmall) {
			ZSTDSTAT_BUMP(zstd_stat_com_fail);
			dprintf("Error: %s", ZSTD_getErrorString(err));
		}
		return (s_len);
	}

	/*
	 * Encode the compressed buffer size at the start. We'll need this in
	 * decompression to counter the effects of padding which might be added
	 * to the compressed buffer and which, if unhandled, would confuse the
	 * hell out of our decompression function.
	 */
	hdr->c_len = BE_32(c_len);

	/*
	 * Check version for overflow.
	 * The limit of 24 bits must not be exceeded. This allows a maximum
	 * version 1677.72.15 which we never expect to be reached.
	 */
	ASSERT3U(ZSTD_VERSION_NUMBER, <=, 0xFFFFFF);

	/*
	 * Encode the compression level as well. We may need to know the
	 * original compression level if compressed_arc is disabled, to match
	 * the compression settings to write this block to the L2ARC.
	 *
	 * Encode the actual level, so if the enum changes in the future, we
	 * will be compatible.
	 *
	 * The upper 24 bits store the ZSTD version to be able to provide
	 * future compatibility, since new versions might enhance the
	 * compression algorithm in a way where the compressed data will
	 * change.
	 *
	 * As soon as such incompatibility occurs, handling code needs to be
	 * added, differentiating between the versions.
	 */
	zfs_set_hdrversion(hdr, ZSTD_VERSION_NUMBER);
	zfs_set_hdrlevel(hdr, level);
	hdr->raw_version_level = BE_32(hdr->raw_version_level);

	return (c_len + sizeof (*hdr));
}
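
/*
 * Resulting on-disk block layout (a sketch based on the comments above;
 * see zfs_zstdhdr_t for the authoritative definition):
 *
 *	c_len             raw_version_level          data
 *	+----------------+--------------------------+--------------------+
 *	| 32-bit big-    | 32-bit big-endian word:  | zstd frame in the  |
 *	| endian length  | 24-bit ZSTD version plus | "magicless" format |
 *	| of zstd frame  | 8-bit compression level  |                    |
 *	+----------------+--------------------------+--------------------+
 */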

/* Decompress block using zstd and return its stored level */
int
zfs_zstd_decompress_level(void *s_start, void *d_start, size_t s_len,
    size_t d_len, uint8_t *level)
{
	ZSTD_DCtx *dctx;
	size_t result;
	int16_t zstd_level;
	uint32_t c_len;
	const zfs_zstdhdr_t *hdr;
	zfs_zstdhdr_t hdr_copy;

	hdr = (const zfs_zstdhdr_t *)s_start;
	c_len = BE_32(hdr->c_len);

	/*
	 * Make a copy instead of directly converting the header, since we must
	 * not modify the original data that may be used again later.
	 */
	hdr_copy.raw_version_level = BE_32(hdr->raw_version_level);
	uint8_t curlevel = zfs_get_hdrlevel(&hdr_copy);

	/*
	 * NOTE: We ignore the ZSTD version for now. As soon as any
	 * incompatibility occurs, it has to be handled accordingly.
	 * The version can be accessed via `hdr_copy.version`.
	 */

	/*
	 * Convert and check the level.
	 * An invalid level is a strong indicator for data corruption! In such
	 * a case return an error so the upper layers can try to fix it.
	 */
	if (zstd_enum_to_level(curlevel, &zstd_level)) {
		ZSTDSTAT_BUMP(zstd_stat_dec_inval);
		return (1);
	}

	ASSERT3U(d_len, >=, s_len);
	ASSERT3U(curlevel, !=, ZIO_COMPLEVEL_INHERIT);

	/* Invalid compressed buffer size encoded at start */
	if (c_len + sizeof (*hdr) > s_len) {
		ZSTDSTAT_BUMP(zstd_stat_dec_header_inval);
		return (1);
	}

	dctx = ZSTD_createDCtx_advanced(zstd_dctx_malloc);
	if (!dctx) {
		ZSTDSTAT_BUMP(zstd_stat_dec_alloc_fail);
		return (1);
	}

	/* Set header type to "magicless" */
	ZSTD_DCtx_setParameter(dctx, ZSTD_d_format, ZSTD_f_zstd1_magicless);

	/* Decompress the data and release the context */
	result = ZSTD_decompressDCtx(dctx, d_start, d_len, hdr->data, c_len);
	ZSTD_freeDCtx(dctx);

	/*
	 * Returns 0 on success (decompression function returned non-negative)
	 * and non-zero on failure (decompression function returned negative).
	 */
	if (ZSTD_isError(result)) {
		ZSTDSTAT_BUMP(zstd_stat_dec_fail);
		return (1);
	}

	if (level) {
		*level = curlevel;
	}

	return (0);
}

/* Decompress datablock using zstd */
int
zfs_zstd_decompress(void *s_start, void *d_start, size_t s_len, size_t d_len,
    int level __maybe_unused)
{
	return (zfs_zstd_decompress_level(s_start, d_start, s_len, d_len,
	    NULL));
}

/* Allocator for zstd compression context using mempool_allocator */
static void *
zstd_alloc(void *opaque __maybe_unused, size_t size)
{
	size_t nbytes = sizeof (struct zstd_kmem) + size;
	struct zstd_kmem *z = NULL;

	z = (struct zstd_kmem *)zstd_mempool_alloc(zstd_mempool_cctx, nbytes);

	if (!z) {
		ZSTDSTAT_BUMP(zstd_stat_alloc_fail);
		return (NULL);
	}

	return ((void*)z + (sizeof (struct zstd_kmem)));
}
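
/*
 * Memory layout of an allocation (a sketch of what the code above builds):
 * a struct zstd_kmem bookkeeping header is placed in front of the buffer
 * that zstd sees, so zstd_free() can step back by sizeof (struct zstd_kmem)
 * to recover the allocation type, size and owning pool.
 *
 *	+------------------+--------------------------------+
 *	| struct zstd_kmem | buffer handed to the zstd code |
 *	+------------------+--------------------------------+
 *	                   ^ pointer returned by zstd_alloc
 */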

/*
 * Allocator for zstd decompression context using mempool_allocator with
 * fallback to reserved memory if allocation fails
 */
static void *
zstd_dctx_alloc(void *opaque __maybe_unused, size_t size)
{
	size_t nbytes = sizeof (struct zstd_kmem) + size;
	struct zstd_kmem *z = NULL;
	enum zstd_kmem_type type = ZSTD_KMEM_DEFAULT;

	z = (struct zstd_kmem *)zstd_mempool_alloc(zstd_mempool_dctx, nbytes);
	if (!z) {
		/* Try harder, decompression shall not fail */
		z = vmem_alloc(nbytes, KM_SLEEP);
		if (z) {
			z->pool = NULL;
		}
		ZSTDSTAT_BUMP(zstd_stat_alloc_fail);
	} else {
		return ((void*)z + (sizeof (struct zstd_kmem)));
	}

	/* Fallback if everything fails */
	if (!z) {
		/*
		 * Barrier since we can only handle it in a single thread. All
		 * other following threads need to wait here until
		 * decompression is completed. zstd_free will release this
		 * barrier later.
		 */
		mutex_enter(&zstd_dctx_fallback.barrier);

		z = zstd_dctx_fallback.mem;
		type = ZSTD_KMEM_DCTX;
		ZSTDSTAT_BUMP(zstd_stat_alloc_fallback);
	}

	/* Allocation should always be successful */
	if (!z) {
		return (NULL);
	}

	z->kmem_type = type;
	z->kmem_size = nbytes;

	return ((void*)z + (sizeof (struct zstd_kmem)));
}

/* Free allocated memory by its specific type */
static void
zstd_free(void *opaque __maybe_unused, void *ptr)
{
	struct zstd_kmem *z = (ptr - sizeof (struct zstd_kmem));
	enum zstd_kmem_type type;

	ASSERT3U(z->kmem_type, <, ZSTD_KMEM_COUNT);
	ASSERT3U(z->kmem_type, >, ZSTD_KMEM_UNKNOWN);

	type = z->kmem_type;
	switch (type) {
	case ZSTD_KMEM_DEFAULT:
		vmem_free(z, z->kmem_size);
		break;
	case ZSTD_KMEM_POOL:
		zstd_mempool_free(z);
		break;
	case ZSTD_KMEM_DCTX:
		mutex_exit(&zstd_dctx_fallback.barrier);
		break;
	default:
		break;
	}
}

/* Allocate fallback memory to ensure safe decompression */
static void __init
create_fallback_mem(struct zstd_fallback_mem *mem, size_t size)
{
	mem->mem_size = size;
	mem->mem = vmem_zalloc(mem->mem_size, KM_SLEEP);
	mutex_init(&mem->barrier, NULL, MUTEX_DEFAULT, NULL);
}

/* Initialize memory pool barrier mutexes */
static void __init
zstd_mempool_init(void)
{
	zstd_mempool_cctx =
	    kmem_zalloc(ZSTD_POOL_MAX * sizeof (struct zstd_pool), KM_SLEEP);
	zstd_mempool_dctx =
	    kmem_zalloc(ZSTD_POOL_MAX * sizeof (struct zstd_pool), KM_SLEEP);

	for (int i = 0; i < ZSTD_POOL_MAX; i++) {
		mutex_init(&zstd_mempool_cctx[i].barrier, NULL,
		    MUTEX_DEFAULT, NULL);
		mutex_init(&zstd_mempool_dctx[i].barrier, NULL,
		    MUTEX_DEFAULT, NULL);
	}
}

/* Initialize zstd-related memory handling */
static int __init
zstd_meminit(void)
{
	zstd_mempool_init();

	/*
	 * Estimate the size of the fallback decompression context.
	 * The expected size on x64 with current ZSTD should be about 160 KB.
	 */
	create_fallback_mem(&zstd_dctx_fallback,
	    P2ROUNDUP(ZSTD_estimateDCtxSize() + sizeof (struct zstd_kmem),
	    PAGESIZE));

	return (0);
}

/* Release object from pool and free memory */
static void
release_pool(struct zstd_pool *pool)
{
	mutex_destroy(&pool->barrier);
	vmem_free(pool->mem, pool->size);
	pool->mem = NULL;
	pool->size = 0;
}

/* Release memory pool objects */
static void
zstd_mempool_deinit(void)
{
	for (int i = 0; i < ZSTD_POOL_MAX; i++) {
		release_pool(&zstd_mempool_cctx[i]);
		release_pool(&zstd_mempool_dctx[i]);
	}

	kmem_free(zstd_mempool_dctx, ZSTD_POOL_MAX * sizeof (struct zstd_pool));
	kmem_free(zstd_mempool_cctx, ZSTD_POOL_MAX * sizeof (struct zstd_pool));
	zstd_mempool_dctx = NULL;
	zstd_mempool_cctx = NULL;
}

/* Release unused memory from pool */
void
zfs_zstd_cache_reap_now(void)
{
	/*
	 * Walk the pools and release objects that have not been used for
	 * longer than ZSTD_POOL_TIMEOUT.
	 */
	zstd_mempool_reap(zstd_mempool_cctx);
	zstd_mempool_reap(zstd_mempool_dctx);
}

extern int __init
zstd_init(void)
{
	/* Set pool size by using maximum sane thread count * 4 */
	pool_count = (boot_ncpus * 4);
	zstd_meminit();

	/* Initialize kstat */
	zstd_ksp = kstat_create("zfs", 0, "zstd", "misc",
	    KSTAT_TYPE_NAMED, sizeof (zstd_stats) / sizeof (kstat_named_t),
	    KSTAT_FLAG_VIRTUAL);
	if (zstd_ksp != NULL) {
		zstd_ksp->ks_data = &zstd_stats;
		kstat_install(zstd_ksp);
#ifdef _KERNEL
		zstd_ksp->ks_update = kstat_zstd_update;
#endif
	}

	return (0);
}

extern void
zstd_fini(void)
{
	/* Deinitialize kstat */
	if (zstd_ksp != NULL) {
		kstat_delete(zstd_ksp);
		zstd_ksp = NULL;
	}

	/* Release fallback memory */
	vmem_free(zstd_dctx_fallback.mem, zstd_dctx_fallback.mem_size);
	mutex_destroy(&zstd_dctx_fallback.barrier);

	/* Deinit memory pool */
	zstd_mempool_deinit();
}

#if defined(_KERNEL)
#ifdef __FreeBSD__
module_init(zstd_init);
module_exit(zstd_fini);
#endif
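
/*
 * Tunables (usage sketch, assuming the standard OpenZFS module-parameter
 * plumbing): on Linux these appear as zstd_earlyabort_pass and
 * zstd_abort_size under /sys/module/zfs/parameters/. For example,
 *
 *	echo 0 > /sys/module/zfs/parameters/zstd_earlyabort_pass
 *
 * disables the early abort heuristic at runtime.
 */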
ZFS_MODULE_PARAM(zfs, zstd_, earlyabort_pass, UINT, ZMOD_RW,
	"Enable early abort attempts when using zstd");
ZFS_MODULE_PARAM(zfs, zstd_, abort_size, UINT, ZMOD_RW,
	"Minimum size of block to attempt early abort");
#endif