]> git.proxmox.com Git - mirror_zfs.git/blob - module/zfs/brt.c
759bc8d2e2b85a2428e7d32fb86fa9deefdbc90f
[mirror_zfs.git] / module / zfs / brt.c
1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or https://opensource.org/licenses/CDDL-1.0.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright (c) 2020, 2021, 2022 by Pawel Jakub Dawidek
24 */
25
26 #include <sys/zfs_context.h>
27 #include <sys/spa.h>
28 #include <sys/spa_impl.h>
29 #include <sys/zio.h>
30 #include <sys/brt.h>
31 #include <sys/brt_impl.h>
32 #include <sys/ddt.h>
33 #include <sys/bitmap.h>
34 #include <sys/zap.h>
35 #include <sys/dmu_tx.h>
36 #include <sys/arc.h>
37 #include <sys/dsl_pool.h>
38 #include <sys/dsl_scan.h>
39 #include <sys/vdev_impl.h>
40 #include <sys/kstat.h>
41 #include <sys/wmsum.h>
42
43 /*
44 * Block Cloning design.
45 *
46 * Block Cloning allows to manually clone a file (or a subset of its blocks)
47 * into another (or the same) file by just creating additional references to
48 * the data blocks without copying the data itself. Those references are kept
49 * in the Block Reference Tables (BRTs).
50 *
51 * In many ways this is similar to the existing deduplication, but there are
52 * some important differences:
53 *
54 * - Deduplication is automatic and Block Cloning is not - one has to use a
55 * dedicated system call(s) to clone the given file/blocks.
56 * - Deduplication keeps all data blocks in its table, even those referenced
57 * just once. Block Cloning creates an entry in its tables only when there
58 * are at least two references to the given data block. If the block was
59 * never explicitly cloned or the second to last reference was dropped,
60 * there will be neither space nor performance overhead.
61 * - Deduplication needs data to work - one needs to pass real data to the
62 * write(2) syscall, so hash can be calculated. Block Cloning doesn't require
63 * data, just block pointers to the data, so it is extremely fast, as we pay
64 * neither the cost of reading the data, nor the cost of writing the data -
65 * we operate exclusively on metadata.
66 * - If the D (dedup) bit is not set in the block pointer, it means that
67 * the block is not in the dedup table (DDT) and we won't consult the DDT
68 * when we need to free the block. Block Cloning must be consulted on every
69 * free, because we cannot modify the source BP (eg. by setting something
70 * similar to the D bit), thus we have no hint if the block is in the
71 * Block Reference Table (BRT), so we need to look into the BRT. There is
72 * an optimization in place that allows us to eliminate the majority of BRT
73 * lookups which is described below in the "Minimizing free penalty" section.
74 * - The BRT entry is much smaller than the DDT entry - for BRT we only store
75 * 64bit offset and 64bit reference counter.
76 * - Dedup keys are cryptographic hashes, so two blocks that are close to each
77 * other on disk are most likely in totally different parts of the DDT.
78 * The BRT entry keys are offsets into a single top-level VDEV, so data blocks
79 * from one file should have BRT entries close to each other.
80 * - Scrub will only do a single pass over a block that is referenced multiple
81 * times in the DDT. Unfortunately it is not currently (if at all) possible
82 * with Block Cloning and block referenced multiple times will be scrubbed
83 * multiple times. The new, sorted scrub should be able to eliminate
84 * duplicated reads given enough memory.
85 * - Deduplication requires cryptographically strong hash as a checksum or
86 * additional data verification. Block Cloning works with any checksum
87 * algorithm or even with checksumming disabled.
88 *
89 * As mentioned above, the BRT entries are much smaller than the DDT entries.
90 * To uniquely identify a block we just need its vdev id and offset. We also
91 * need to maintain a reference counter. The vdev id will often repeat, as there
92 * is a small number of top-level VDEVs and a large number of blocks stored in
93 * each VDEV. We take advantage of that to reduce the BRT entry size further by
94 * maintaining one BRT for each top-level VDEV, so we can then have only offset
95 * and counter as the BRT entry.
96 *
97 * Minimizing free penalty.
98 *
99 * Block Cloning allows creating additional references to any existing block.
100 * When we free a block there is no hint in the block pointer whether the block
101 * was cloned or not, so on each free we have to check if there is a
102 * corresponding entry in the BRT or not. If there is, we need to decrease
103 * the reference counter. Doing BRT lookup on every free can potentially be
104 * expensive by requiring additional I/Os if the BRT doesn't fit into memory.
105 * This is the main problem with deduplication, so we've learned our lesson and
106 * try not to repeat the same mistake here. How do we do that? We divide each
107 * top-level VDEV into 16MB regions. For each region we maintain a counter that
108 * is a sum of all the BRT entries that have offsets within the region. This
109 * creates the entries count array of 16bit numbers for each top-level VDEV.
110 * The entries count array is always kept in memory and updated on disk in the
111 * same transaction group as the BRT updates to keep everything in-sync. We can
112 * keep the array in memory, because it is very small. With 16MB regions and
113 * 1TB VDEV the array requires only 128kB of memory (we may decide to decrease
114 * the region size even further in the future). Now, when we want to free
115 * a block, we first consult the array. If the counter for the whole region is
116 * zero, there is no need to look for the BRT entry, as there isn't one for
117 * sure. If the counter for the region is greater than zero, only then we will
118 * do a BRT lookup and if an entry is found we will decrease the reference
119 * counter in the BRT entry and in the entry counters array.
120 *
121 * The entry counters array is small, but can potentially be larger for very
122 * large VDEVs or smaller regions. In this case we don't want to rewrite entire
123 * array on every change. We then divide the array into 32kB block and keep
124 * a bitmap of dirty blocks within a transaction group. When we sync the
125 * transaction group we can only update the parts of the entry counters array
126 * that were modified. Note: Keeping track of the dirty parts of the entry
127 * counters array is implemented, but updating only parts of the array on disk
128 * is not yet implemented - for now we will update entire array if there was
129 * any change.
130 *
131 * The implementation tries to be economic: if BRT is not used, or no longer
132 * used, there will be no entries in the MOS and no additional memory used (eg.
133 * the entry counters array is only allocated if needed).
134 *
135 * Interaction between Deduplication and Block Cloning.
136 *
137 * If both functionalities are in use, we could end up with a block that is
138 * referenced multiple times in both DDT and BRT. When we free one of the
139 * references we couldn't tell where it belongs, so we would have to decide
140 * what table takes the precedence: do we first clear DDT references or BRT
141 * references? To avoid this dilemma BRT cooperates with DDT - if a given block
142 * is being cloned using BRT and the BP has the D (dedup) bit set, BRT will
143 * lookup DDT entry instead and increase the counter there. No BRT entry
144 * will be created for a block which has the D (dedup) bit set.
145 * BRT may be more efficient for manual deduplication, but if the block is
146 * already in the DDT, then creating additional BRT entry would be less
147 * efficient. This clever idea was proposed by Allan Jude.
148 *
149 * Block Cloning across datasets.
150 *
151 * Block Cloning is not limited to cloning blocks within the same dataset.
152 * It is possible (and very useful) to clone blocks between different datasets.
153 * One use case is recovering files from snapshots. By cloning the files into
154 * dataset we need no additional storage. Without Block Cloning we would need
155 * additional space for those files.
156 * Another interesting use case is moving the files between datasets
157 * (copying the file content to the new dataset and removing the source file).
158 * In that case Block Cloning will only be used briefly, because the BRT entries
159 * will be removed when the source is removed.
160 * Note: currently it is not possible to clone blocks between encrypted
161 * datasets, even if those datasets use the same encryption key (this includes
162 * snapshots of encrypted datasets). Cloning blocks between datasets that use
163 * the same keys should be possible and should be implemented in the future.
164 *
165 * Block Cloning flow through ZFS layers.
166 *
167 * Note: Block Cloning can be used both for cloning file system blocks and ZVOL
168 * blocks. As of this writing no interface is implemented that allows for block
169 * cloning within a ZVOL.
170 * FreeBSD and Linux provides copy_file_range(2) system call and we will use it
171 * for blocking cloning.
172 *
173 * ssize_t
174 * copy_file_range(int infd, off_t *inoffp, int outfd, off_t *outoffp,
175 * size_t len, unsigned int flags);
176 *
177 * Even though offsets and length represent bytes, they have to be
178 * block-aligned or we will return an error so the upper layer can
179 * fallback to the generic mechanism that will just copy the data.
180 * Using copy_file_range(2) will call OS-independent zfs_clone_range() function.
181 * This function was implemented based on zfs_write(), but instead of writing
182 * the given data we first read block pointers using the new dmu_read_l0_bps()
183 * function from the source file. Once we have BPs from the source file we call
184 * the dmu_brt_clone() function on the destination file. This function
185 * allocates BPs for us. We iterate over all source BPs. If the given BP is
186 * a hole or an embedded block, we just copy BP as-is. If it points to a real
187 * data we place this BP on a BRT pending list using the brt_pending_add()
188 * function.
189 *
190 * We use this pending list to keep track of all BPs that got new references
191 * within this transaction group.
192 *
193 * Some special cases to consider and how we address them:
194 * - The block we want to clone may have been created within the same
195 * transaction group that we are trying to clone. Such block has no BP
196 * allocated yet, so cannot be immediately cloned. We return EAGAIN.
197 * - The block we want to clone may have been modified within the same
198 * transaction group. We return EAGAIN.
199 * - A block may be cloned multiple times during one transaction group (that's
200 * why pending list is actually a tree and not an append-only list - this
201 * way we can figure out faster if this block is cloned for the first time
202 * in this txg or consecutive time).
203 * - A block may be cloned and freed within the same transaction group
204 * (see dbuf_undirty()).
205 * - A block may be cloned and within the same transaction group the clone
206 * can be cloned again (see dmu_read_l0_bps()).
207 * - A file might have been deleted, but the caller still has a file descriptor
208 * open to this file and clones it.
209 *
210 * When we free a block we have an additional step in the ZIO pipeline where we
211 * call the zio_brt_free() function. We then call the brt_entry_decref()
212 * that loads the corresponding BRT entry (if one exists) and decreases
213 * reference counter. If this is not the last reference we will stop ZIO
214 * pipeline here. If this is the last reference or the block is not in the
215 * BRT, we continue the pipeline and free the block as usual.
216 *
217 * At the beginning of spa_sync() where there can be no more block cloning,
218 * but before issuing frees we call brt_pending_apply(). This function applies
219 * all the new clones to the BRT table - we load BRT entries and update
220 * reference counters. To sync new BRT entries to disk, we use brt_sync()
221 * function. This function will sync all dirty per-top-level-vdev BRTs,
222 * the entry counters arrays, etc.
223 *
224 * Block Cloning and ZIL.
225 *
226 * Every clone operation is divided into chunks (similar to write) and each
227 * chunk is cloned in a separate transaction. The chunk size is determined by
228 * how many BPs we can fit into a single ZIL entry.
229 * Replaying clone operation is different from the regular clone operation,
230 * as when we log clone operations we cannot use the source object - it may
231 * reside on a different dataset, so we log BPs we want to clone.
232 * The ZIL is replayed when we mount the given dataset, not when the pool is
233 * imported. Taking this into account it is possible that the pool is imported
234 * without mounting datasets and the source dataset is destroyed before the
235 * destination dataset is mounted and its ZIL replayed.
236 * To address this situation we leverage zil_claim() mechanism where ZFS will
237 * parse all the ZILs on pool import. When we come across TX_CLONE_RANGE
238 * entries, we will bump reference counters for their BPs in the BRT. Then
239 * on mount and ZIL replay we bump the reference counters once more, while the
240 * first references are dropped during ZIL destroy by zil_free_clone_range().
241 * It is possible that after zil_claim() we never mount the destination, so
242 * we never replay its ZIL and just destroy it. In this case the only taken
243 * references will be dropped by zil_free_clone_range(), since the cloning is
244 * not going to ever take place.
245 */
246
247 static kmem_cache_t *brt_entry_cache;
248 static kmem_cache_t *brt_pending_entry_cache;
249
250 /*
251 * Enable/disable prefetching of BRT entries that we are going to modify.
252 */
253 int zfs_brt_prefetch = 1;
254
255 #ifdef ZFS_DEBUG
256 #define BRT_DEBUG(...) do { \
257 if ((zfs_flags & ZFS_DEBUG_BRT) != 0) { \
258 __dprintf(B_TRUE, __FILE__, __func__, __LINE__, __VA_ARGS__); \
259 } \
260 } while (0)
261 #else
262 #define BRT_DEBUG(...) do { } while (0)
263 #endif
264
265 int brt_zap_leaf_blockshift = 12;
266 int brt_zap_indirect_blockshift = 12;
267
268 static kstat_t *brt_ksp;
269
270 typedef struct brt_stats {
271 kstat_named_t brt_addref_entry_in_memory;
272 kstat_named_t brt_addref_entry_not_on_disk;
273 kstat_named_t brt_addref_entry_on_disk;
274 kstat_named_t brt_addref_entry_read_lost_race;
275 kstat_named_t brt_decref_entry_in_memory;
276 kstat_named_t brt_decref_entry_loaded_from_disk;
277 kstat_named_t brt_decref_entry_not_in_memory;
278 kstat_named_t brt_decref_entry_not_on_disk;
279 kstat_named_t brt_decref_entry_read_lost_race;
280 kstat_named_t brt_decref_entry_still_referenced;
281 kstat_named_t brt_decref_free_data_later;
282 kstat_named_t brt_decref_free_data_now;
283 kstat_named_t brt_decref_no_entry;
284 } brt_stats_t;
285
286 static brt_stats_t brt_stats = {
287 { "addref_entry_in_memory", KSTAT_DATA_UINT64 },
288 { "addref_entry_not_on_disk", KSTAT_DATA_UINT64 },
289 { "addref_entry_on_disk", KSTAT_DATA_UINT64 },
290 { "addref_entry_read_lost_race", KSTAT_DATA_UINT64 },
291 { "decref_entry_in_memory", KSTAT_DATA_UINT64 },
292 { "decref_entry_loaded_from_disk", KSTAT_DATA_UINT64 },
293 { "decref_entry_not_in_memory", KSTAT_DATA_UINT64 },
294 { "decref_entry_not_on_disk", KSTAT_DATA_UINT64 },
295 { "decref_entry_read_lost_race", KSTAT_DATA_UINT64 },
296 { "decref_entry_still_referenced", KSTAT_DATA_UINT64 },
297 { "decref_free_data_later", KSTAT_DATA_UINT64 },
298 { "decref_free_data_now", KSTAT_DATA_UINT64 },
299 { "decref_no_entry", KSTAT_DATA_UINT64 }
300 };
301
302 struct {
303 wmsum_t brt_addref_entry_in_memory;
304 wmsum_t brt_addref_entry_not_on_disk;
305 wmsum_t brt_addref_entry_on_disk;
306 wmsum_t brt_addref_entry_read_lost_race;
307 wmsum_t brt_decref_entry_in_memory;
308 wmsum_t brt_decref_entry_loaded_from_disk;
309 wmsum_t brt_decref_entry_not_in_memory;
310 wmsum_t brt_decref_entry_not_on_disk;
311 wmsum_t brt_decref_entry_read_lost_race;
312 wmsum_t brt_decref_entry_still_referenced;
313 wmsum_t brt_decref_free_data_later;
314 wmsum_t brt_decref_free_data_now;
315 wmsum_t brt_decref_no_entry;
316 } brt_sums;
317
318 #define BRTSTAT_BUMP(stat) wmsum_add(&brt_sums.stat, 1)
319
320 static int brt_entry_compare(const void *x1, const void *x2);
321 static int brt_pending_entry_compare(const void *x1, const void *x2);
322
323 static void
324 brt_rlock(brt_t *brt)
325 {
326 rw_enter(&brt->brt_lock, RW_READER);
327 }
328
329 static void
330 brt_wlock(brt_t *brt)
331 {
332 rw_enter(&brt->brt_lock, RW_WRITER);
333 }
334
335 static void
336 brt_unlock(brt_t *brt)
337 {
338 rw_exit(&brt->brt_lock);
339 }
340
341 static uint16_t
342 brt_vdev_entcount_get(const brt_vdev_t *brtvd, uint64_t idx)
343 {
344
345 ASSERT3U(idx, <, brtvd->bv_size);
346
347 if (brtvd->bv_need_byteswap) {
348 return (BSWAP_16(brtvd->bv_entcount[idx]));
349 } else {
350 return (brtvd->bv_entcount[idx]);
351 }
352 }
353
354 static void
355 brt_vdev_entcount_set(brt_vdev_t *brtvd, uint64_t idx, uint16_t entcnt)
356 {
357
358 ASSERT3U(idx, <, brtvd->bv_size);
359
360 if (brtvd->bv_need_byteswap) {
361 brtvd->bv_entcount[idx] = BSWAP_16(entcnt);
362 } else {
363 brtvd->bv_entcount[idx] = entcnt;
364 }
365 }
366
367 static void
368 brt_vdev_entcount_inc(brt_vdev_t *brtvd, uint64_t idx)
369 {
370 uint16_t entcnt;
371
372 ASSERT3U(idx, <, brtvd->bv_size);
373
374 entcnt = brt_vdev_entcount_get(brtvd, idx);
375 ASSERT(entcnt < UINT16_MAX);
376
377 brt_vdev_entcount_set(brtvd, idx, entcnt + 1);
378 }
379
380 static void
381 brt_vdev_entcount_dec(brt_vdev_t *brtvd, uint64_t idx)
382 {
383 uint16_t entcnt;
384
385 ASSERT3U(idx, <, brtvd->bv_size);
386
387 entcnt = brt_vdev_entcount_get(brtvd, idx);
388 ASSERT(entcnt > 0);
389
390 brt_vdev_entcount_set(brtvd, idx, entcnt - 1);
391 }
392
393 #ifdef ZFS_DEBUG
394 static void
395 brt_vdev_dump(brt_t *brt)
396 {
397 brt_vdev_t *brtvd;
398 uint64_t vdevid;
399
400 if ((zfs_flags & ZFS_DEBUG_BRT) == 0) {
401 return;
402 }
403
404 if (brt->brt_nvdevs == 0) {
405 zfs_dbgmsg("BRT empty");
406 return;
407 }
408
409 zfs_dbgmsg("BRT vdev dump:");
410 for (vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) {
411 uint64_t idx;
412
413 brtvd = &brt->brt_vdevs[vdevid];
414 zfs_dbgmsg(" vdevid=%llu/%llu meta_dirty=%d entcount_dirty=%d "
415 "size=%llu totalcount=%llu nblocks=%llu bitmapsize=%zu\n",
416 (u_longlong_t)vdevid, (u_longlong_t)brtvd->bv_vdevid,
417 brtvd->bv_meta_dirty, brtvd->bv_entcount_dirty,
418 (u_longlong_t)brtvd->bv_size,
419 (u_longlong_t)brtvd->bv_totalcount,
420 (u_longlong_t)brtvd->bv_nblocks,
421 (size_t)BT_SIZEOFMAP(brtvd->bv_nblocks));
422 if (brtvd->bv_totalcount > 0) {
423 zfs_dbgmsg(" entcounts:");
424 for (idx = 0; idx < brtvd->bv_size; idx++) {
425 if (brt_vdev_entcount_get(brtvd, idx) > 0) {
426 zfs_dbgmsg(" [%04llu] %hu",
427 (u_longlong_t)idx,
428 brt_vdev_entcount_get(brtvd, idx));
429 }
430 }
431 }
432 if (brtvd->bv_entcount_dirty) {
433 char *bitmap;
434
435 bitmap = kmem_alloc(brtvd->bv_nblocks + 1, KM_SLEEP);
436 for (idx = 0; idx < brtvd->bv_nblocks; idx++) {
437 bitmap[idx] =
438 BT_TEST(brtvd->bv_bitmap, idx) ? 'x' : '.';
439 }
440 bitmap[idx] = '\0';
441 zfs_dbgmsg(" bitmap: %s", bitmap);
442 kmem_free(bitmap, brtvd->bv_nblocks + 1);
443 }
444 }
445 }
446 #endif
447
448 static brt_vdev_t *
449 brt_vdev(brt_t *brt, uint64_t vdevid)
450 {
451 brt_vdev_t *brtvd;
452
453 ASSERT(RW_LOCK_HELD(&brt->brt_lock));
454
455 if (vdevid < brt->brt_nvdevs) {
456 brtvd = &brt->brt_vdevs[vdevid];
457 } else {
458 brtvd = NULL;
459 }
460
461 return (brtvd);
462 }
463
464 static void
465 brt_vdev_create(brt_t *brt, brt_vdev_t *brtvd, dmu_tx_t *tx)
466 {
467 char name[64];
468
469 ASSERT(RW_WRITE_HELD(&brt->brt_lock));
470 ASSERT0(brtvd->bv_mos_brtvdev);
471 ASSERT0(brtvd->bv_mos_entries);
472 ASSERT(brtvd->bv_entcount != NULL);
473 ASSERT(brtvd->bv_size > 0);
474 ASSERT(brtvd->bv_bitmap != NULL);
475 ASSERT(brtvd->bv_nblocks > 0);
476
477 brtvd->bv_mos_entries = zap_create_flags(brt->brt_mos, 0,
478 ZAP_FLAG_HASH64 | ZAP_FLAG_UINT64_KEY, DMU_OTN_ZAP_METADATA,
479 brt_zap_leaf_blockshift, brt_zap_indirect_blockshift, DMU_OT_NONE,
480 0, tx);
481 VERIFY(brtvd->bv_mos_entries != 0);
482 BRT_DEBUG("MOS entries created, object=%llu",
483 (u_longlong_t)brtvd->bv_mos_entries);
484
485 /*
486 * We allocate DMU buffer to store the bv_entcount[] array.
487 * We will keep array size (bv_size) and cummulative count for all
488 * bv_entcount[]s (bv_totalcount) in the bonus buffer.
489 */
490 brtvd->bv_mos_brtvdev = dmu_object_alloc(brt->brt_mos,
491 DMU_OTN_UINT64_METADATA, BRT_BLOCKSIZE,
492 DMU_OTN_UINT64_METADATA, sizeof (brt_vdev_phys_t), tx);
493 VERIFY(brtvd->bv_mos_brtvdev != 0);
494 BRT_DEBUG("MOS BRT VDEV created, object=%llu",
495 (u_longlong_t)brtvd->bv_mos_brtvdev);
496
497 snprintf(name, sizeof (name), "%s%llu", BRT_OBJECT_VDEV_PREFIX,
498 (u_longlong_t)brtvd->bv_vdevid);
499 VERIFY0(zap_add(brt->brt_mos, DMU_POOL_DIRECTORY_OBJECT, name,
500 sizeof (uint64_t), 1, &brtvd->bv_mos_brtvdev, tx));
501 BRT_DEBUG("Pool directory object created, object=%s", name);
502
503 spa_feature_incr(brt->brt_spa, SPA_FEATURE_BLOCK_CLONING, tx);
504 }
505
506 static void
507 brt_vdev_realloc(brt_t *brt, brt_vdev_t *brtvd)
508 {
509 vdev_t *vd;
510 uint16_t *entcount;
511 ulong_t *bitmap;
512 uint64_t nblocks, size;
513
514 ASSERT(RW_WRITE_HELD(&brt->brt_lock));
515
516 spa_config_enter(brt->brt_spa, SCL_VDEV, FTAG, RW_READER);
517 vd = vdev_lookup_top(brt->brt_spa, brtvd->bv_vdevid);
518 size = (vdev_get_min_asize(vd) - 1) / brt->brt_rangesize + 1;
519 spa_config_exit(brt->brt_spa, SCL_VDEV, FTAG);
520
521 entcount = vmem_zalloc(sizeof (entcount[0]) * size, KM_SLEEP);
522 nblocks = BRT_RANGESIZE_TO_NBLOCKS(size);
523 bitmap = kmem_zalloc(BT_SIZEOFMAP(nblocks), KM_SLEEP);
524
525 if (!brtvd->bv_initiated) {
526 ASSERT0(brtvd->bv_size);
527 ASSERT(brtvd->bv_entcount == NULL);
528 ASSERT(brtvd->bv_bitmap == NULL);
529 ASSERT0(brtvd->bv_nblocks);
530
531 avl_create(&brtvd->bv_tree, brt_entry_compare,
532 sizeof (brt_entry_t), offsetof(brt_entry_t, bre_node));
533 } else {
534 ASSERT(brtvd->bv_size > 0);
535 ASSERT(brtvd->bv_entcount != NULL);
536 ASSERT(brtvd->bv_bitmap != NULL);
537 ASSERT(brtvd->bv_nblocks > 0);
538 /*
539 * TODO: Allow vdev shrinking. We only need to implement
540 * shrinking the on-disk BRT VDEV object.
541 * dmu_free_range(brt->brt_mos, brtvd->bv_mos_brtvdev, offset,
542 * size, tx);
543 */
544 ASSERT3U(brtvd->bv_size, <=, size);
545
546 memcpy(entcount, brtvd->bv_entcount,
547 sizeof (entcount[0]) * MIN(size, brtvd->bv_size));
548 memcpy(bitmap, brtvd->bv_bitmap, MIN(BT_SIZEOFMAP(nblocks),
549 BT_SIZEOFMAP(brtvd->bv_nblocks)));
550 vmem_free(brtvd->bv_entcount,
551 sizeof (entcount[0]) * brtvd->bv_size);
552 kmem_free(brtvd->bv_bitmap, BT_SIZEOFMAP(brtvd->bv_nblocks));
553 }
554
555 brtvd->bv_size = size;
556 brtvd->bv_entcount = entcount;
557 brtvd->bv_bitmap = bitmap;
558 brtvd->bv_nblocks = nblocks;
559 if (!brtvd->bv_initiated) {
560 brtvd->bv_need_byteswap = FALSE;
561 brtvd->bv_initiated = TRUE;
562 BRT_DEBUG("BRT VDEV %llu initiated.",
563 (u_longlong_t)brtvd->bv_vdevid);
564 }
565 }
566
567 static void
568 brt_vdev_load(brt_t *brt, brt_vdev_t *brtvd)
569 {
570 char name[64];
571 dmu_buf_t *db;
572 brt_vdev_phys_t *bvphys;
573 int error;
574
575 snprintf(name, sizeof (name), "%s%llu", BRT_OBJECT_VDEV_PREFIX,
576 (u_longlong_t)brtvd->bv_vdevid);
577 error = zap_lookup(brt->brt_mos, DMU_POOL_DIRECTORY_OBJECT, name,
578 sizeof (uint64_t), 1, &brtvd->bv_mos_brtvdev);
579 if (error != 0)
580 return;
581 ASSERT(brtvd->bv_mos_brtvdev != 0);
582
583 error = dmu_bonus_hold(brt->brt_mos, brtvd->bv_mos_brtvdev, FTAG, &db);
584 ASSERT0(error);
585 if (error != 0)
586 return;
587
588 bvphys = db->db_data;
589 if (brt->brt_rangesize == 0) {
590 brt->brt_rangesize = bvphys->bvp_rangesize;
591 } else {
592 ASSERT3U(brt->brt_rangesize, ==, bvphys->bvp_rangesize);
593 }
594
595 ASSERT(!brtvd->bv_initiated);
596 brt_vdev_realloc(brt, brtvd);
597
598 /* TODO: We don't support VDEV shrinking. */
599 ASSERT3U(bvphys->bvp_size, <=, brtvd->bv_size);
600
601 /*
602 * If VDEV grew, we will leave new bv_entcount[] entries zeroed out.
603 */
604 error = dmu_read(brt->brt_mos, brtvd->bv_mos_brtvdev, 0,
605 MIN(brtvd->bv_size, bvphys->bvp_size) * sizeof (uint16_t),
606 brtvd->bv_entcount, DMU_READ_NO_PREFETCH);
607 ASSERT0(error);
608
609 brtvd->bv_mos_entries = bvphys->bvp_mos_entries;
610 ASSERT(brtvd->bv_mos_entries != 0);
611 brtvd->bv_need_byteswap =
612 (bvphys->bvp_byteorder != BRT_NATIVE_BYTEORDER);
613 brtvd->bv_totalcount = bvphys->bvp_totalcount;
614 brtvd->bv_usedspace = bvphys->bvp_usedspace;
615 brtvd->bv_savedspace = bvphys->bvp_savedspace;
616 brt->brt_usedspace += brtvd->bv_usedspace;
617 brt->brt_savedspace += brtvd->bv_savedspace;
618
619 dmu_buf_rele(db, FTAG);
620
621 BRT_DEBUG("MOS BRT VDEV %s loaded: mos_brtvdev=%llu, mos_entries=%llu",
622 name, (u_longlong_t)brtvd->bv_mos_brtvdev,
623 (u_longlong_t)brtvd->bv_mos_entries);
624 }
625
626 static void
627 brt_vdev_dealloc(brt_t *brt, brt_vdev_t *brtvd)
628 {
629
630 ASSERT(RW_WRITE_HELD(&brt->brt_lock));
631 ASSERT(brtvd->bv_initiated);
632
633 vmem_free(brtvd->bv_entcount, sizeof (uint16_t) * brtvd->bv_size);
634 brtvd->bv_entcount = NULL;
635 kmem_free(brtvd->bv_bitmap, BT_SIZEOFMAP(brtvd->bv_nblocks));
636 brtvd->bv_bitmap = NULL;
637 ASSERT0(avl_numnodes(&brtvd->bv_tree));
638 avl_destroy(&brtvd->bv_tree);
639
640 brtvd->bv_size = 0;
641 brtvd->bv_nblocks = 0;
642
643 brtvd->bv_initiated = FALSE;
644 BRT_DEBUG("BRT VDEV %llu deallocated.", (u_longlong_t)brtvd->bv_vdevid);
645 }
646
647 static void
648 brt_vdev_destroy(brt_t *brt, brt_vdev_t *brtvd, dmu_tx_t *tx)
649 {
650 char name[64];
651 uint64_t count;
652 dmu_buf_t *db;
653 brt_vdev_phys_t *bvphys;
654
655 ASSERT(RW_WRITE_HELD(&brt->brt_lock));
656 ASSERT(brtvd->bv_mos_brtvdev != 0);
657 ASSERT(brtvd->bv_mos_entries != 0);
658
659 VERIFY0(zap_count(brt->brt_mos, brtvd->bv_mos_entries, &count));
660 VERIFY0(count);
661 VERIFY0(zap_destroy(brt->brt_mos, brtvd->bv_mos_entries, tx));
662 BRT_DEBUG("MOS entries destroyed, object=%llu",
663 (u_longlong_t)brtvd->bv_mos_entries);
664 brtvd->bv_mos_entries = 0;
665
666 VERIFY0(dmu_bonus_hold(brt->brt_mos, brtvd->bv_mos_brtvdev, FTAG, &db));
667 bvphys = db->db_data;
668 ASSERT0(bvphys->bvp_totalcount);
669 ASSERT0(bvphys->bvp_usedspace);
670 ASSERT0(bvphys->bvp_savedspace);
671 dmu_buf_rele(db, FTAG);
672
673 VERIFY0(dmu_object_free(brt->brt_mos, brtvd->bv_mos_brtvdev, tx));
674 BRT_DEBUG("MOS BRT VDEV destroyed, object=%llu",
675 (u_longlong_t)brtvd->bv_mos_brtvdev);
676 brtvd->bv_mos_brtvdev = 0;
677
678 snprintf(name, sizeof (name), "%s%llu", BRT_OBJECT_VDEV_PREFIX,
679 (u_longlong_t)brtvd->bv_vdevid);
680 VERIFY0(zap_remove(brt->brt_mos, DMU_POOL_DIRECTORY_OBJECT, name, tx));
681 BRT_DEBUG("Pool directory object removed, object=%s", name);
682
683 brt_vdev_dealloc(brt, brtvd);
684
685 spa_feature_decr(brt->brt_spa, SPA_FEATURE_BLOCK_CLONING, tx);
686 }
687
688 static void
689 brt_vdevs_expand(brt_t *brt, uint64_t nvdevs)
690 {
691 brt_vdev_t *brtvd, *vdevs;
692 uint64_t vdevid;
693
694 ASSERT(RW_WRITE_HELD(&brt->brt_lock));
695 ASSERT3U(nvdevs, >, brt->brt_nvdevs);
696
697 vdevs = kmem_zalloc(sizeof (vdevs[0]) * nvdevs, KM_SLEEP);
698 if (brt->brt_nvdevs > 0) {
699 ASSERT(brt->brt_vdevs != NULL);
700
701 memcpy(vdevs, brt->brt_vdevs,
702 sizeof (brt_vdev_t) * brt->brt_nvdevs);
703 kmem_free(brt->brt_vdevs,
704 sizeof (brt_vdev_t) * brt->brt_nvdevs);
705 }
706 for (vdevid = brt->brt_nvdevs; vdevid < nvdevs; vdevid++) {
707 brtvd = &vdevs[vdevid];
708
709 brtvd->bv_vdevid = vdevid;
710 brtvd->bv_initiated = FALSE;
711 }
712
713 BRT_DEBUG("BRT VDEVs expanded from %llu to %llu.",
714 (u_longlong_t)brt->brt_nvdevs, (u_longlong_t)nvdevs);
715
716 brt->brt_vdevs = vdevs;
717 brt->brt_nvdevs = nvdevs;
718 }
719
720 static boolean_t
721 brt_vdev_lookup(brt_t *brt, brt_vdev_t *brtvd, const brt_entry_t *bre)
722 {
723 uint64_t idx;
724
725 ASSERT(RW_LOCK_HELD(&brt->brt_lock));
726
727 idx = bre->bre_offset / brt->brt_rangesize;
728 if (brtvd->bv_entcount != NULL && idx < brtvd->bv_size) {
729 /* VDEV wasn't expanded. */
730 return (brt_vdev_entcount_get(brtvd, idx) > 0);
731 }
732
733 return (FALSE);
734 }
735
736 static void
737 brt_vdev_addref(brt_t *brt, brt_vdev_t *brtvd, const brt_entry_t *bre,
738 uint64_t dsize)
739 {
740 uint64_t idx;
741
742 ASSERT(RW_LOCK_HELD(&brt->brt_lock));
743 ASSERT(brtvd != NULL);
744 ASSERT(brtvd->bv_entcount != NULL);
745
746 brt->brt_savedspace += dsize;
747 brtvd->bv_savedspace += dsize;
748 brtvd->bv_meta_dirty = TRUE;
749
750 if (bre->bre_refcount > 1) {
751 return;
752 }
753
754 brt->brt_usedspace += dsize;
755 brtvd->bv_usedspace += dsize;
756
757 idx = bre->bre_offset / brt->brt_rangesize;
758 if (idx >= brtvd->bv_size) {
759 /* VDEV has been expanded. */
760 brt_vdev_realloc(brt, brtvd);
761 }
762
763 ASSERT3U(idx, <, brtvd->bv_size);
764
765 brtvd->bv_totalcount++;
766 brt_vdev_entcount_inc(brtvd, idx);
767 brtvd->bv_entcount_dirty = TRUE;
768 idx = idx / BRT_BLOCKSIZE / 8;
769 BT_SET(brtvd->bv_bitmap, idx);
770
771 #ifdef ZFS_DEBUG
772 brt_vdev_dump(brt);
773 #endif
774 }
775
776 static void
777 brt_vdev_decref(brt_t *brt, brt_vdev_t *brtvd, const brt_entry_t *bre,
778 uint64_t dsize)
779 {
780 uint64_t idx;
781
782 ASSERT(RW_WRITE_HELD(&brt->brt_lock));
783 ASSERT(brtvd != NULL);
784 ASSERT(brtvd->bv_entcount != NULL);
785
786 brt->brt_savedspace -= dsize;
787 brtvd->bv_savedspace -= dsize;
788 brtvd->bv_meta_dirty = TRUE;
789
790 if (bre->bre_refcount > 0) {
791 return;
792 }
793
794 brt->brt_usedspace -= dsize;
795 brtvd->bv_usedspace -= dsize;
796
797 idx = bre->bre_offset / brt->brt_rangesize;
798 ASSERT3U(idx, <, brtvd->bv_size);
799
800 ASSERT(brtvd->bv_totalcount > 0);
801 brtvd->bv_totalcount--;
802 brt_vdev_entcount_dec(brtvd, idx);
803 brtvd->bv_entcount_dirty = TRUE;
804 idx = idx / BRT_BLOCKSIZE / 8;
805 BT_SET(brtvd->bv_bitmap, idx);
806
807 #ifdef ZFS_DEBUG
808 brt_vdev_dump(brt);
809 #endif
810 }
811
812 static void
813 brt_vdev_sync(brt_t *brt, brt_vdev_t *brtvd, dmu_tx_t *tx)
814 {
815 dmu_buf_t *db;
816 brt_vdev_phys_t *bvphys;
817
818 ASSERT(brtvd->bv_meta_dirty);
819 ASSERT(brtvd->bv_mos_brtvdev != 0);
820 ASSERT(dmu_tx_is_syncing(tx));
821
822 VERIFY0(dmu_bonus_hold(brt->brt_mos, brtvd->bv_mos_brtvdev, FTAG, &db));
823
824 if (brtvd->bv_entcount_dirty) {
825 /*
826 * TODO: Walk brtvd->bv_bitmap and write only the dirty blocks.
827 */
828 dmu_write(brt->brt_mos, brtvd->bv_mos_brtvdev, 0,
829 brtvd->bv_size * sizeof (brtvd->bv_entcount[0]),
830 brtvd->bv_entcount, tx);
831 memset(brtvd->bv_bitmap, 0, BT_SIZEOFMAP(brtvd->bv_nblocks));
832 brtvd->bv_entcount_dirty = FALSE;
833 }
834
835 dmu_buf_will_dirty(db, tx);
836 bvphys = db->db_data;
837 bvphys->bvp_mos_entries = brtvd->bv_mos_entries;
838 bvphys->bvp_size = brtvd->bv_size;
839 if (brtvd->bv_need_byteswap) {
840 bvphys->bvp_byteorder = BRT_NON_NATIVE_BYTEORDER;
841 } else {
842 bvphys->bvp_byteorder = BRT_NATIVE_BYTEORDER;
843 }
844 bvphys->bvp_totalcount = brtvd->bv_totalcount;
845 bvphys->bvp_rangesize = brt->brt_rangesize;
846 bvphys->bvp_usedspace = brtvd->bv_usedspace;
847 bvphys->bvp_savedspace = brtvd->bv_savedspace;
848 dmu_buf_rele(db, FTAG);
849
850 brtvd->bv_meta_dirty = FALSE;
851 }
852
853 static void
854 brt_vdevs_alloc(brt_t *brt, boolean_t load)
855 {
856 brt_vdev_t *brtvd;
857 uint64_t vdevid;
858
859 brt_wlock(brt);
860
861 brt_vdevs_expand(brt, brt->brt_spa->spa_root_vdev->vdev_children);
862
863 if (load) {
864 for (vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) {
865 brtvd = &brt->brt_vdevs[vdevid];
866 ASSERT(brtvd->bv_entcount == NULL);
867
868 brt_vdev_load(brt, brtvd);
869 }
870 }
871
872 if (brt->brt_rangesize == 0) {
873 brt->brt_rangesize = BRT_RANGESIZE;
874 }
875
876 brt_unlock(brt);
877 }
878
879 static void
880 brt_vdevs_free(brt_t *brt)
881 {
882 brt_vdev_t *brtvd;
883 uint64_t vdevid;
884
885 brt_wlock(brt);
886
887 for (vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) {
888 brtvd = &brt->brt_vdevs[vdevid];
889 if (brtvd->bv_initiated)
890 brt_vdev_dealloc(brt, brtvd);
891 }
892 kmem_free(brt->brt_vdevs, sizeof (brt_vdev_t) * brt->brt_nvdevs);
893
894 brt_unlock(brt);
895 }
896
897 static void
898 brt_entry_fill(const blkptr_t *bp, brt_entry_t *bre, uint64_t *vdevidp)
899 {
900
901 bre->bre_offset = DVA_GET_OFFSET(&bp->blk_dva[0]);
902 bre->bre_refcount = 0;
903
904 *vdevidp = DVA_GET_VDEV(&bp->blk_dva[0]);
905 }
906
907 static int
908 brt_entry_compare(const void *x1, const void *x2)
909 {
910 const brt_entry_t *bre1 = x1;
911 const brt_entry_t *bre2 = x2;
912
913 return (TREE_CMP(bre1->bre_offset, bre2->bre_offset));
914 }
915
916 static int
917 brt_entry_lookup(brt_t *brt, brt_vdev_t *brtvd, brt_entry_t *bre)
918 {
919 uint64_t mos_entries;
920 uint64_t one, physsize;
921 int error;
922
923 ASSERT(RW_LOCK_HELD(&brt->brt_lock));
924
925 if (!brt_vdev_lookup(brt, brtvd, bre))
926 return (SET_ERROR(ENOENT));
927
928 /*
929 * Remember mos_entries object number. After we reacquire the BRT lock,
930 * the brtvd pointer may be invalid.
931 */
932 mos_entries = brtvd->bv_mos_entries;
933 if (mos_entries == 0)
934 return (SET_ERROR(ENOENT));
935
936 brt_unlock(brt);
937
938 error = zap_length_uint64(brt->brt_mos, mos_entries, &bre->bre_offset,
939 BRT_KEY_WORDS, &one, &physsize);
940 if (error == 0) {
941 ASSERT3U(one, ==, 1);
942 ASSERT3U(physsize, ==, sizeof (bre->bre_refcount));
943
944 error = zap_lookup_uint64(brt->brt_mos, mos_entries,
945 &bre->bre_offset, BRT_KEY_WORDS, 1,
946 sizeof (bre->bre_refcount), &bre->bre_refcount);
947 BRT_DEBUG("ZAP lookup: object=%llu vdev=%llu offset=%llu "
948 "count=%llu error=%d", (u_longlong_t)mos_entries,
949 (u_longlong_t)brtvd->bv_vdevid,
950 (u_longlong_t)bre->bre_offset,
951 error == 0 ? (u_longlong_t)bre->bre_refcount : 0, error);
952 }
953
954 brt_wlock(brt);
955
956 return (error);
957 }
958
959 static void
960 brt_entry_prefetch(brt_t *brt, uint64_t vdevid, brt_entry_t *bre)
961 {
962 brt_vdev_t *brtvd;
963 uint64_t mos_entries = 0;
964
965 brt_rlock(brt);
966 brtvd = brt_vdev(brt, vdevid);
967 if (brtvd != NULL)
968 mos_entries = brtvd->bv_mos_entries;
969 brt_unlock(brt);
970
971 if (mos_entries == 0)
972 return;
973
974 BRT_DEBUG("ZAP prefetch: object=%llu vdev=%llu offset=%llu",
975 (u_longlong_t)mos_entries, (u_longlong_t)vdevid,
976 (u_longlong_t)bre->bre_offset);
977 (void) zap_prefetch_uint64(brt->brt_mos, mos_entries,
978 (uint64_t *)&bre->bre_offset, BRT_KEY_WORDS);
979 }
980
981 static int
982 brt_entry_update(brt_t *brt, brt_vdev_t *brtvd, brt_entry_t *bre, dmu_tx_t *tx)
983 {
984 int error;
985
986 ASSERT(RW_LOCK_HELD(&brt->brt_lock));
987 ASSERT(brtvd->bv_mos_entries != 0);
988 ASSERT(bre->bre_refcount > 0);
989
990 error = zap_update_uint64(brt->brt_mos, brtvd->bv_mos_entries,
991 (uint64_t *)&bre->bre_offset, BRT_KEY_WORDS, 1,
992 sizeof (bre->bre_refcount), &bre->bre_refcount, tx);
993 BRT_DEBUG("ZAP update: object=%llu vdev=%llu offset=%llu count=%llu "
994 "error=%d", (u_longlong_t)brtvd->bv_mos_entries,
995 (u_longlong_t)brtvd->bv_vdevid, (u_longlong_t)bre->bre_offset,
996 (u_longlong_t)bre->bre_refcount, error);
997
998 return (error);
999 }
1000
1001 static int
1002 brt_entry_remove(brt_t *brt, brt_vdev_t *brtvd, brt_entry_t *bre, dmu_tx_t *tx)
1003 {
1004 int error;
1005
1006 ASSERT(RW_LOCK_HELD(&brt->brt_lock));
1007 ASSERT(brtvd->bv_mos_entries != 0);
1008 ASSERT0(bre->bre_refcount);
1009
1010 error = zap_remove_uint64(brt->brt_mos, brtvd->bv_mos_entries,
1011 (uint64_t *)&bre->bre_offset, BRT_KEY_WORDS, tx);
1012 BRT_DEBUG("ZAP remove: object=%llu vdev=%llu offset=%llu count=%llu "
1013 "error=%d", (u_longlong_t)brtvd->bv_mos_entries,
1014 (u_longlong_t)brtvd->bv_vdevid, (u_longlong_t)bre->bre_offset,
1015 (u_longlong_t)bre->bre_refcount, error);
1016
1017 return (error);
1018 }
1019
1020 /*
1021 * Return TRUE if we _can_ have BRT entry for this bp. It might be false
1022 * positive, but gives us quick answer if we should look into BRT, which
1023 * may require reads and thus will be more expensive.
1024 */
1025 boolean_t
1026 brt_maybe_exists(spa_t *spa, const blkptr_t *bp)
1027 {
1028 brt_t *brt = spa->spa_brt;
1029 brt_vdev_t *brtvd;
1030 brt_entry_t bre_search;
1031 boolean_t mayexists = FALSE;
1032 uint64_t vdevid;
1033
1034 brt_entry_fill(bp, &bre_search, &vdevid);
1035
1036 brt_rlock(brt);
1037
1038 brtvd = brt_vdev(brt, vdevid);
1039 if (brtvd != NULL && brtvd->bv_initiated) {
1040 if (!avl_is_empty(&brtvd->bv_tree) ||
1041 brt_vdev_lookup(brt, brtvd, &bre_search)) {
1042 mayexists = TRUE;
1043 }
1044 }
1045
1046 brt_unlock(brt);
1047
1048 return (mayexists);
1049 }
1050
1051 uint64_t
1052 brt_get_dspace(spa_t *spa)
1053 {
1054 brt_t *brt = spa->spa_brt;
1055
1056 if (brt == NULL)
1057 return (0);
1058
1059 return (brt->brt_savedspace);
1060 }
1061
1062 uint64_t
1063 brt_get_used(spa_t *spa)
1064 {
1065 brt_t *brt = spa->spa_brt;
1066
1067 if (brt == NULL)
1068 return (0);
1069
1070 return (brt->brt_usedspace);
1071 }
1072
1073 uint64_t
1074 brt_get_saved(spa_t *spa)
1075 {
1076 brt_t *brt = spa->spa_brt;
1077
1078 if (brt == NULL)
1079 return (0);
1080
1081 return (brt->brt_savedspace);
1082 }
1083
1084 uint64_t
1085 brt_get_ratio(spa_t *spa)
1086 {
1087 brt_t *brt = spa->spa_brt;
1088
1089 if (brt->brt_usedspace == 0)
1090 return (100);
1091
1092 return ((brt->brt_usedspace + brt->brt_savedspace) * 100 /
1093 brt->brt_usedspace);
1094 }
1095
1096 static int
1097 brt_kstats_update(kstat_t *ksp, int rw)
1098 {
1099 brt_stats_t *bs = ksp->ks_data;
1100
1101 if (rw == KSTAT_WRITE)
1102 return (EACCES);
1103
1104 bs->brt_addref_entry_in_memory.value.ui64 =
1105 wmsum_value(&brt_sums.brt_addref_entry_in_memory);
1106 bs->brt_addref_entry_not_on_disk.value.ui64 =
1107 wmsum_value(&brt_sums.brt_addref_entry_not_on_disk);
1108 bs->brt_addref_entry_on_disk.value.ui64 =
1109 wmsum_value(&brt_sums.brt_addref_entry_on_disk);
1110 bs->brt_addref_entry_read_lost_race.value.ui64 =
1111 wmsum_value(&brt_sums.brt_addref_entry_read_lost_race);
1112 bs->brt_decref_entry_in_memory.value.ui64 =
1113 wmsum_value(&brt_sums.brt_decref_entry_in_memory);
1114 bs->brt_decref_entry_loaded_from_disk.value.ui64 =
1115 wmsum_value(&brt_sums.brt_decref_entry_loaded_from_disk);
1116 bs->brt_decref_entry_not_in_memory.value.ui64 =
1117 wmsum_value(&brt_sums.brt_decref_entry_not_in_memory);
1118 bs->brt_decref_entry_not_on_disk.value.ui64 =
1119 wmsum_value(&brt_sums.brt_decref_entry_not_on_disk);
1120 bs->brt_decref_entry_read_lost_race.value.ui64 =
1121 wmsum_value(&brt_sums.brt_decref_entry_read_lost_race);
1122 bs->brt_decref_entry_still_referenced.value.ui64 =
1123 wmsum_value(&brt_sums.brt_decref_entry_still_referenced);
1124 bs->brt_decref_free_data_later.value.ui64 =
1125 wmsum_value(&brt_sums.brt_decref_free_data_later);
1126 bs->brt_decref_free_data_now.value.ui64 =
1127 wmsum_value(&brt_sums.brt_decref_free_data_now);
1128 bs->brt_decref_no_entry.value.ui64 =
1129 wmsum_value(&brt_sums.brt_decref_no_entry);
1130
1131 return (0);
1132 }
1133
1134 static void
1135 brt_stat_init(void)
1136 {
1137
1138 wmsum_init(&brt_sums.brt_addref_entry_in_memory, 0);
1139 wmsum_init(&brt_sums.brt_addref_entry_not_on_disk, 0);
1140 wmsum_init(&brt_sums.brt_addref_entry_on_disk, 0);
1141 wmsum_init(&brt_sums.brt_addref_entry_read_lost_race, 0);
1142 wmsum_init(&brt_sums.brt_decref_entry_in_memory, 0);
1143 wmsum_init(&brt_sums.brt_decref_entry_loaded_from_disk, 0);
1144 wmsum_init(&brt_sums.brt_decref_entry_not_in_memory, 0);
1145 wmsum_init(&brt_sums.brt_decref_entry_not_on_disk, 0);
1146 wmsum_init(&brt_sums.brt_decref_entry_read_lost_race, 0);
1147 wmsum_init(&brt_sums.brt_decref_entry_still_referenced, 0);
1148 wmsum_init(&brt_sums.brt_decref_free_data_later, 0);
1149 wmsum_init(&brt_sums.brt_decref_free_data_now, 0);
1150 wmsum_init(&brt_sums.brt_decref_no_entry, 0);
1151
1152 brt_ksp = kstat_create("zfs", 0, "brtstats", "misc", KSTAT_TYPE_NAMED,
1153 sizeof (brt_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
1154 if (brt_ksp != NULL) {
1155 brt_ksp->ks_data = &brt_stats;
1156 brt_ksp->ks_update = brt_kstats_update;
1157 kstat_install(brt_ksp);
1158 }
1159 }
1160
1161 static void
1162 brt_stat_fini(void)
1163 {
1164 if (brt_ksp != NULL) {
1165 kstat_delete(brt_ksp);
1166 brt_ksp = NULL;
1167 }
1168
1169 wmsum_fini(&brt_sums.brt_addref_entry_in_memory);
1170 wmsum_fini(&brt_sums.brt_addref_entry_not_on_disk);
1171 wmsum_fini(&brt_sums.brt_addref_entry_on_disk);
1172 wmsum_fini(&brt_sums.brt_addref_entry_read_lost_race);
1173 wmsum_fini(&brt_sums.brt_decref_entry_in_memory);
1174 wmsum_fini(&brt_sums.brt_decref_entry_loaded_from_disk);
1175 wmsum_fini(&brt_sums.brt_decref_entry_not_in_memory);
1176 wmsum_fini(&brt_sums.brt_decref_entry_not_on_disk);
1177 wmsum_fini(&brt_sums.brt_decref_entry_read_lost_race);
1178 wmsum_fini(&brt_sums.brt_decref_entry_still_referenced);
1179 wmsum_fini(&brt_sums.brt_decref_free_data_later);
1180 wmsum_fini(&brt_sums.brt_decref_free_data_now);
1181 wmsum_fini(&brt_sums.brt_decref_no_entry);
1182 }
1183
1184 void
1185 brt_init(void)
1186 {
1187 brt_entry_cache = kmem_cache_create("brt_entry_cache",
1188 sizeof (brt_entry_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
1189 brt_pending_entry_cache = kmem_cache_create("brt_pending_entry_cache",
1190 sizeof (brt_pending_entry_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
1191
1192 brt_stat_init();
1193 }
1194
1195 void
1196 brt_fini(void)
1197 {
1198 brt_stat_fini();
1199
1200 kmem_cache_destroy(brt_entry_cache);
1201 kmem_cache_destroy(brt_pending_entry_cache);
1202 }
1203
1204 static brt_entry_t *
1205 brt_entry_alloc(const brt_entry_t *bre_init)
1206 {
1207 brt_entry_t *bre;
1208
1209 bre = kmem_cache_alloc(brt_entry_cache, KM_SLEEP);
1210 bre->bre_offset = bre_init->bre_offset;
1211 bre->bre_refcount = bre_init->bre_refcount;
1212
1213 return (bre);
1214 }
1215
1216 static void
1217 brt_entry_free(brt_entry_t *bre)
1218 {
1219
1220 kmem_cache_free(brt_entry_cache, bre);
1221 }
1222
1223 static void
1224 brt_entry_addref(brt_t *brt, const blkptr_t *bp)
1225 {
1226 brt_vdev_t *brtvd;
1227 brt_entry_t *bre, *racebre;
1228 brt_entry_t bre_search;
1229 avl_index_t where;
1230 uint64_t vdevid;
1231 int error;
1232
1233 ASSERT(!RW_WRITE_HELD(&brt->brt_lock));
1234
1235 brt_entry_fill(bp, &bre_search, &vdevid);
1236
1237 brt_wlock(brt);
1238
1239 brtvd = brt_vdev(brt, vdevid);
1240 if (brtvd == NULL) {
1241 ASSERT3U(vdevid, >=, brt->brt_nvdevs);
1242
1243 /* New VDEV was added. */
1244 brt_vdevs_expand(brt, vdevid + 1);
1245 brtvd = brt_vdev(brt, vdevid);
1246 }
1247 ASSERT(brtvd != NULL);
1248 if (!brtvd->bv_initiated)
1249 brt_vdev_realloc(brt, brtvd);
1250
1251 bre = avl_find(&brtvd->bv_tree, &bre_search, NULL);
1252 if (bre != NULL) {
1253 BRTSTAT_BUMP(brt_addref_entry_in_memory);
1254 } else {
1255 /*
1256 * brt_entry_lookup() may drop the BRT (read) lock and
1257 * reacquire it (write).
1258 */
1259 error = brt_entry_lookup(brt, brtvd, &bre_search);
1260 /* bre_search now contains correct bre_refcount */
1261 ASSERT(error == 0 || error == ENOENT);
1262 if (error == 0)
1263 BRTSTAT_BUMP(brt_addref_entry_on_disk);
1264 else
1265 BRTSTAT_BUMP(brt_addref_entry_not_on_disk);
1266 /*
1267 * When the BRT lock was dropped, brt_vdevs[] may have been
1268 * expanded and reallocated, we need to update brtvd's pointer.
1269 */
1270 brtvd = brt_vdev(brt, vdevid);
1271 ASSERT(brtvd != NULL);
1272
1273 racebre = avl_find(&brtvd->bv_tree, &bre_search, &where);
1274 if (racebre == NULL) {
1275 bre = brt_entry_alloc(&bre_search);
1276 ASSERT(RW_WRITE_HELD(&brt->brt_lock));
1277 avl_insert(&brtvd->bv_tree, bre, where);
1278 brt->brt_nentries++;
1279 } else {
1280 /*
1281 * The entry was added when the BRT lock was dropped in
1282 * brt_entry_lookup().
1283 */
1284 BRTSTAT_BUMP(brt_addref_entry_read_lost_race);
1285 bre = racebre;
1286 }
1287 }
1288 bre->bre_refcount++;
1289 brt_vdev_addref(brt, brtvd, bre, bp_get_dsize(brt->brt_spa, bp));
1290
1291 brt_unlock(brt);
1292 }
1293
1294 /* Return TRUE if block should be freed immediately. */
1295 boolean_t
1296 brt_entry_decref(spa_t *spa, const blkptr_t *bp)
1297 {
1298 brt_t *brt = spa->spa_brt;
1299 brt_vdev_t *brtvd;
1300 brt_entry_t *bre, *racebre;
1301 brt_entry_t bre_search;
1302 avl_index_t where;
1303 uint64_t vdevid;
1304 int error;
1305
1306 brt_entry_fill(bp, &bre_search, &vdevid);
1307
1308 brt_wlock(brt);
1309
1310 brtvd = brt_vdev(brt, vdevid);
1311 ASSERT(brtvd != NULL);
1312
1313 bre = avl_find(&brtvd->bv_tree, &bre_search, NULL);
1314 if (bre != NULL) {
1315 BRTSTAT_BUMP(brt_decref_entry_in_memory);
1316 goto out;
1317 } else {
1318 BRTSTAT_BUMP(brt_decref_entry_not_in_memory);
1319 }
1320
1321 /*
1322 * brt_entry_lookup() may drop the BRT lock and reacquire it.
1323 */
1324 error = brt_entry_lookup(brt, brtvd, &bre_search);
1325 /* bre_search now contains correct bre_refcount */
1326 ASSERT(error == 0 || error == ENOENT);
1327 /*
1328 * When the BRT lock was dropped, brt_vdevs[] may have been expanded
1329 * and reallocated, we need to update brtvd's pointer.
1330 */
1331 brtvd = brt_vdev(brt, vdevid);
1332 ASSERT(brtvd != NULL);
1333
1334 if (error == ENOENT) {
1335 BRTSTAT_BUMP(brt_decref_entry_not_on_disk);
1336 bre = NULL;
1337 goto out;
1338 }
1339
1340 racebre = avl_find(&brtvd->bv_tree, &bre_search, &where);
1341 if (racebre != NULL) {
1342 /*
1343 * The entry was added when the BRT lock was dropped in
1344 * brt_entry_lookup().
1345 */
1346 BRTSTAT_BUMP(brt_decref_entry_read_lost_race);
1347 bre = racebre;
1348 goto out;
1349 }
1350
1351 BRTSTAT_BUMP(brt_decref_entry_loaded_from_disk);
1352 bre = brt_entry_alloc(&bre_search);
1353 ASSERT(RW_WRITE_HELD(&brt->brt_lock));
1354 avl_insert(&brtvd->bv_tree, bre, where);
1355 brt->brt_nentries++;
1356
1357 out:
1358 if (bre == NULL) {
1359 /*
1360 * This is a free of a regular (not cloned) block.
1361 */
1362 brt_unlock(brt);
1363 BRTSTAT_BUMP(brt_decref_no_entry);
1364 return (B_TRUE);
1365 }
1366 if (bre->bre_refcount == 0) {
1367 brt_unlock(brt);
1368 BRTSTAT_BUMP(brt_decref_free_data_now);
1369 return (B_TRUE);
1370 }
1371
1372 ASSERT(bre->bre_refcount > 0);
1373 bre->bre_refcount--;
1374 if (bre->bre_refcount == 0)
1375 BRTSTAT_BUMP(brt_decref_free_data_later);
1376 else
1377 BRTSTAT_BUMP(brt_decref_entry_still_referenced);
1378 brt_vdev_decref(brt, brtvd, bre, bp_get_dsize(brt->brt_spa, bp));
1379
1380 brt_unlock(brt);
1381
1382 return (B_FALSE);
1383 }
1384
1385 uint64_t
1386 brt_entry_get_refcount(spa_t *spa, const blkptr_t *bp)
1387 {
1388 brt_t *brt = spa->spa_brt;
1389 brt_vdev_t *brtvd;
1390 brt_entry_t bre_search, *bre;
1391 uint64_t vdevid, refcnt;
1392 int error;
1393
1394 brt_entry_fill(bp, &bre_search, &vdevid);
1395
1396 brt_rlock(brt);
1397
1398 brtvd = brt_vdev(brt, vdevid);
1399 ASSERT(brtvd != NULL);
1400
1401 bre = avl_find(&brtvd->bv_tree, &bre_search, NULL);
1402 if (bre == NULL) {
1403 error = brt_entry_lookup(brt, brtvd, &bre_search);
1404 ASSERT(error == 0 || error == ENOENT);
1405 if (error == ENOENT)
1406 refcnt = 0;
1407 else
1408 refcnt = bre_search.bre_refcount;
1409 } else
1410 refcnt = bre->bre_refcount;
1411
1412 brt_unlock(brt);
1413 return (refcnt);
1414 }
1415
1416 static void
1417 brt_prefetch(brt_t *brt, const blkptr_t *bp)
1418 {
1419 brt_entry_t bre;
1420 uint64_t vdevid;
1421
1422 ASSERT(bp != NULL);
1423
1424 if (!zfs_brt_prefetch)
1425 return;
1426
1427 brt_entry_fill(bp, &bre, &vdevid);
1428
1429 brt_entry_prefetch(brt, vdevid, &bre);
1430 }
1431
1432 static int
1433 brt_pending_entry_compare(const void *x1, const void *x2)
1434 {
1435 const brt_pending_entry_t *bpe1 = x1, *bpe2 = x2;
1436 const blkptr_t *bp1 = &bpe1->bpe_bp, *bp2 = &bpe2->bpe_bp;
1437 int cmp;
1438
1439 cmp = TREE_CMP(BP_PHYSICAL_BIRTH(bp1), BP_PHYSICAL_BIRTH(bp2));
1440 if (cmp == 0) {
1441 cmp = TREE_CMP(DVA_GET_VDEV(&bp1->blk_dva[0]),
1442 DVA_GET_VDEV(&bp2->blk_dva[0]));
1443 if (cmp == 0) {
1444 cmp = TREE_CMP(DVA_GET_OFFSET(&bp1->blk_dva[0]),
1445 DVA_GET_OFFSET(&bp2->blk_dva[0]));
1446 }
1447 }
1448
1449 return (cmp);
1450 }
1451
1452 void
1453 brt_pending_add(spa_t *spa, const blkptr_t *bp, dmu_tx_t *tx)
1454 {
1455 brt_t *brt;
1456 avl_tree_t *pending_tree;
1457 kmutex_t *pending_lock;
1458 brt_pending_entry_t *bpe, *newbpe;
1459 avl_index_t where;
1460 uint64_t txg;
1461
1462 brt = spa->spa_brt;
1463 txg = dmu_tx_get_txg(tx);
1464 ASSERT3U(txg, !=, 0);
1465 pending_tree = &brt->brt_pending_tree[txg & TXG_MASK];
1466 pending_lock = &brt->brt_pending_lock[txg & TXG_MASK];
1467
1468 newbpe = kmem_cache_alloc(brt_pending_entry_cache, KM_SLEEP);
1469 newbpe->bpe_bp = *bp;
1470 newbpe->bpe_count = 1;
1471
1472 mutex_enter(pending_lock);
1473
1474 bpe = avl_find(pending_tree, newbpe, &where);
1475 if (bpe == NULL) {
1476 avl_insert(pending_tree, newbpe, where);
1477 newbpe = NULL;
1478 } else {
1479 bpe->bpe_count++;
1480 }
1481
1482 mutex_exit(pending_lock);
1483
1484 if (newbpe != NULL) {
1485 ASSERT(bpe != NULL);
1486 ASSERT(bpe != newbpe);
1487 kmem_cache_free(brt_pending_entry_cache, newbpe);
1488 } else {
1489 ASSERT(bpe == NULL);
1490 }
1491
1492 /* Prefetch BRT entry, as we will need it in the syncing context. */
1493 brt_prefetch(brt, bp);
1494 }
1495
1496 void
1497 brt_pending_remove(spa_t *spa, const blkptr_t *bp, dmu_tx_t *tx)
1498 {
1499 brt_t *brt;
1500 avl_tree_t *pending_tree;
1501 kmutex_t *pending_lock;
1502 brt_pending_entry_t *bpe, bpe_search;
1503 uint64_t txg;
1504
1505 brt = spa->spa_brt;
1506 txg = dmu_tx_get_txg(tx);
1507 ASSERT3U(txg, !=, 0);
1508 pending_tree = &brt->brt_pending_tree[txg & TXG_MASK];
1509 pending_lock = &brt->brt_pending_lock[txg & TXG_MASK];
1510
1511 bpe_search.bpe_bp = *bp;
1512
1513 mutex_enter(pending_lock);
1514
1515 bpe = avl_find(pending_tree, &bpe_search, NULL);
1516 /* I believe we should always find bpe when this function is called. */
1517 if (bpe != NULL) {
1518 ASSERT(bpe->bpe_count > 0);
1519
1520 bpe->bpe_count--;
1521 if (bpe->bpe_count == 0) {
1522 avl_remove(pending_tree, bpe);
1523 kmem_cache_free(brt_pending_entry_cache, bpe);
1524 }
1525 }
1526
1527 mutex_exit(pending_lock);
1528 }
1529
1530 void
1531 brt_pending_apply(spa_t *spa, uint64_t txg)
1532 {
1533 brt_t *brt;
1534 brt_pending_entry_t *bpe;
1535 avl_tree_t *pending_tree;
1536 kmutex_t *pending_lock;
1537 void *c;
1538
1539 ASSERT3U(txg, !=, 0);
1540
1541 brt = spa->spa_brt;
1542 pending_tree = &brt->brt_pending_tree[txg & TXG_MASK];
1543 pending_lock = &brt->brt_pending_lock[txg & TXG_MASK];
1544
1545 mutex_enter(pending_lock);
1546
1547 c = NULL;
1548 while ((bpe = avl_destroy_nodes(pending_tree, &c)) != NULL) {
1549 boolean_t added_to_ddt;
1550
1551 mutex_exit(pending_lock);
1552
1553 for (int i = 0; i < bpe->bpe_count; i++) {
1554 /*
1555 * If the block has DEDUP bit set, it means that it
1556 * already exists in the DEDUP table, so we can just
1557 * use that instead of creating new entry in
1558 * the BRT table.
1559 */
1560 if (BP_GET_DEDUP(&bpe->bpe_bp)) {
1561 added_to_ddt = ddt_addref(spa, &bpe->bpe_bp);
1562 } else {
1563 added_to_ddt = B_FALSE;
1564 }
1565 if (!added_to_ddt)
1566 brt_entry_addref(brt, &bpe->bpe_bp);
1567 }
1568
1569 kmem_cache_free(brt_pending_entry_cache, bpe);
1570 mutex_enter(pending_lock);
1571 }
1572
1573 mutex_exit(pending_lock);
1574 }
1575
1576 static void
1577 brt_sync_entry(brt_t *brt, brt_vdev_t *brtvd, brt_entry_t *bre, dmu_tx_t *tx)
1578 {
1579
1580 ASSERT(RW_WRITE_HELD(&brt->brt_lock));
1581 ASSERT(brtvd->bv_mos_entries != 0);
1582
1583 if (bre->bre_refcount == 0) {
1584 int error;
1585
1586 error = brt_entry_remove(brt, brtvd, bre, tx);
1587 ASSERT(error == 0 || error == ENOENT);
1588 /*
1589 * If error == ENOENT then zfs_clone_range() was done from a
1590 * removed (but opened) file (open(), unlink()).
1591 */
1592 ASSERT(brt_entry_lookup(brt, brtvd, bre) == ENOENT);
1593 } else {
1594 VERIFY0(brt_entry_update(brt, brtvd, bre, tx));
1595 }
1596 }
1597
1598 static void
1599 brt_sync_table(brt_t *brt, dmu_tx_t *tx)
1600 {
1601 brt_vdev_t *brtvd;
1602 brt_entry_t *bre;
1603 uint64_t vdevid;
1604 void *c;
1605
1606 brt_wlock(brt);
1607
1608 for (vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) {
1609 brtvd = &brt->brt_vdevs[vdevid];
1610
1611 if (!brtvd->bv_initiated)
1612 continue;
1613
1614 if (!brtvd->bv_meta_dirty) {
1615 ASSERT(!brtvd->bv_entcount_dirty);
1616 ASSERT0(avl_numnodes(&brtvd->bv_tree));
1617 continue;
1618 }
1619
1620 ASSERT(!brtvd->bv_entcount_dirty ||
1621 avl_numnodes(&brtvd->bv_tree) != 0);
1622
1623 if (brtvd->bv_mos_brtvdev == 0)
1624 brt_vdev_create(brt, brtvd, tx);
1625
1626 c = NULL;
1627 while ((bre = avl_destroy_nodes(&brtvd->bv_tree, &c)) != NULL) {
1628 brt_sync_entry(brt, brtvd, bre, tx);
1629 brt_entry_free(bre);
1630 ASSERT(brt->brt_nentries > 0);
1631 brt->brt_nentries--;
1632 }
1633
1634 brt_vdev_sync(brt, brtvd, tx);
1635
1636 if (brtvd->bv_totalcount == 0)
1637 brt_vdev_destroy(brt, brtvd, tx);
1638 }
1639
1640 ASSERT0(brt->brt_nentries);
1641
1642 brt_unlock(brt);
1643 }
1644
1645 void
1646 brt_sync(spa_t *spa, uint64_t txg)
1647 {
1648 dmu_tx_t *tx;
1649 brt_t *brt;
1650
1651 ASSERT(spa_syncing_txg(spa) == txg);
1652
1653 brt = spa->spa_brt;
1654 brt_rlock(brt);
1655 if (brt->brt_nentries == 0) {
1656 /* No changes. */
1657 brt_unlock(brt);
1658 return;
1659 }
1660 brt_unlock(brt);
1661
1662 tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
1663
1664 brt_sync_table(brt, tx);
1665
1666 dmu_tx_commit(tx);
1667 }
1668
1669 static void
1670 brt_table_alloc(brt_t *brt)
1671 {
1672
1673 for (int i = 0; i < TXG_SIZE; i++) {
1674 avl_create(&brt->brt_pending_tree[i],
1675 brt_pending_entry_compare,
1676 sizeof (brt_pending_entry_t),
1677 offsetof(brt_pending_entry_t, bpe_node));
1678 mutex_init(&brt->brt_pending_lock[i], NULL, MUTEX_DEFAULT,
1679 NULL);
1680 }
1681 }
1682
1683 static void
1684 brt_table_free(brt_t *brt)
1685 {
1686
1687 for (int i = 0; i < TXG_SIZE; i++) {
1688 ASSERT(avl_is_empty(&brt->brt_pending_tree[i]));
1689
1690 avl_destroy(&brt->brt_pending_tree[i]);
1691 mutex_destroy(&brt->brt_pending_lock[i]);
1692 }
1693 }
1694
1695 static void
1696 brt_alloc(spa_t *spa)
1697 {
1698 brt_t *brt;
1699
1700 ASSERT(spa->spa_brt == NULL);
1701
1702 brt = kmem_zalloc(sizeof (*brt), KM_SLEEP);
1703 rw_init(&brt->brt_lock, NULL, RW_DEFAULT, NULL);
1704 brt->brt_spa = spa;
1705 brt->brt_rangesize = 0;
1706 brt->brt_nentries = 0;
1707 brt->brt_vdevs = NULL;
1708 brt->brt_nvdevs = 0;
1709 brt_table_alloc(brt);
1710
1711 spa->spa_brt = brt;
1712 }
1713
1714 void
1715 brt_create(spa_t *spa)
1716 {
1717
1718 brt_alloc(spa);
1719 brt_vdevs_alloc(spa->spa_brt, B_FALSE);
1720 }
1721
1722 int
1723 brt_load(spa_t *spa)
1724 {
1725
1726 brt_alloc(spa);
1727 brt_vdevs_alloc(spa->spa_brt, B_TRUE);
1728
1729 return (0);
1730 }
1731
1732 void
1733 brt_unload(spa_t *spa)
1734 {
1735 brt_t *brt = spa->spa_brt;
1736
1737 if (brt == NULL)
1738 return;
1739
1740 brt_vdevs_free(brt);
1741 brt_table_free(brt);
1742 rw_destroy(&brt->brt_lock);
1743 kmem_free(brt, sizeof (*brt));
1744 spa->spa_brt = NULL;
1745 }
1746
1747 /* BEGIN CSTYLED */
1748 ZFS_MODULE_PARAM(zfs_brt, zfs_brt_, prefetch, INT, ZMOD_RW,
1749 "Enable prefetching of BRT entries");
1750 #ifdef ZFS_BRT_DEBUG
1751 ZFS_MODULE_PARAM(zfs_brt, zfs_brt_, debug, INT, ZMOD_RW, "BRT debug");
1752 #endif
1753 /* END CSTYLED */