]> git.proxmox.com Git - mirror_zfs.git/blob - module/zfs/brt.c
Allow block cloning across encrypted datasets
[mirror_zfs.git] / module / zfs / brt.c
1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or https://opensource.org/licenses/CDDL-1.0.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright (c) 2020, 2021, 2022 by Pawel Jakub Dawidek
24 */
25
26 #include <sys/zfs_context.h>
27 #include <sys/spa.h>
28 #include <sys/spa_impl.h>
29 #include <sys/zio.h>
30 #include <sys/brt.h>
31 #include <sys/brt_impl.h>
32 #include <sys/ddt.h>
33 #include <sys/bitmap.h>
34 #include <sys/zap.h>
35 #include <sys/dmu_tx.h>
36 #include <sys/arc.h>
37 #include <sys/dsl_pool.h>
38 #include <sys/dsl_scan.h>
39 #include <sys/vdev_impl.h>
40 #include <sys/kstat.h>
41 #include <sys/wmsum.h>
42
43 /*
44 * Block Cloning design.
45 *
46 * Block Cloning allows to manually clone a file (or a subset of its blocks)
47 * into another (or the same) file by just creating additional references to
48 * the data blocks without copying the data itself. Those references are kept
49 * in the Block Reference Tables (BRTs).
50 *
51 * In many ways this is similar to the existing deduplication, but there are
52 * some important differences:
53 *
54 * - Deduplication is automatic and Block Cloning is not - one has to use a
55 * dedicated system call(s) to clone the given file/blocks.
56 * - Deduplication keeps all data blocks in its table, even those referenced
57 * just once. Block Cloning creates an entry in its tables only when there
58 * are at least two references to the given data block. If the block was
59 * never explicitly cloned or the second to last reference was dropped,
60 * there will be neither space nor performance overhead.
61 * - Deduplication needs data to work - one needs to pass real data to the
62 * write(2) syscall, so hash can be calculated. Block Cloning doesn't require
63 * data, just block pointers to the data, so it is extremely fast, as we pay
64 * neither the cost of reading the data, nor the cost of writing the data -
65 * we operate exclusively on metadata.
66 * - If the D (dedup) bit is not set in the block pointer, it means that
67 * the block is not in the dedup table (DDT) and we won't consult the DDT
68 * when we need to free the block. Block Cloning must be consulted on every
69 * free, because we cannot modify the source BP (eg. by setting something
70 * similar to the D bit), thus we have no hint if the block is in the
71 * Block Reference Table (BRT), so we need to look into the BRT. There is
72 * an optimization in place that allows us to eliminate the majority of BRT
73 * lookups which is described below in the "Minimizing free penalty" section.
74 * - The BRT entry is much smaller than the DDT entry - for BRT we only store
75 * 64bit offset and 64bit reference counter.
76 * - Dedup keys are cryptographic hashes, so two blocks that are close to each
77 * other on disk are most likely in totally different parts of the DDT.
78 * The BRT entry keys are offsets into a single top-level VDEV, so data blocks
79 * from one file should have BRT entries close to each other.
80 * - Scrub will only do a single pass over a block that is referenced multiple
81 * times in the DDT. Unfortunately it is not currently (if at all) possible
82 * with Block Cloning and block referenced multiple times will be scrubbed
83 * multiple times. The new, sorted scrub should be able to eliminate
84 * duplicated reads given enough memory.
85 * - Deduplication requires cryptographically strong hash as a checksum or
86 * additional data verification. Block Cloning works with any checksum
87 * algorithm or even with checksumming disabled.
88 *
89 * As mentioned above, the BRT entries are much smaller than the DDT entries.
90 * To uniquely identify a block we just need its vdev id and offset. We also
91 * need to maintain a reference counter. The vdev id will often repeat, as there
92 * is a small number of top-level VDEVs and a large number of blocks stored in
93 * each VDEV. We take advantage of that to reduce the BRT entry size further by
94 * maintaining one BRT for each top-level VDEV, so we can then have only offset
95 * and counter as the BRT entry.
96 *
97 * Minimizing free penalty.
98 *
99 * Block Cloning allows creating additional references to any existing block.
100 * When we free a block there is no hint in the block pointer whether the block
101 * was cloned or not, so on each free we have to check if there is a
102 * corresponding entry in the BRT or not. If there is, we need to decrease
103 * the reference counter. Doing BRT lookup on every free can potentially be
104 * expensive by requiring additional I/Os if the BRT doesn't fit into memory.
105 * This is the main problem with deduplication, so we've learned our lesson and
106 * try not to repeat the same mistake here. How do we do that? We divide each
107 * top-level VDEV into 16MB regions. For each region we maintain a counter that
108 * is a sum of all the BRT entries that have offsets within the region. This
109 * creates the entries count array of 16bit numbers for each top-level VDEV.
110 * The entries count array is always kept in memory and updated on disk in the
111 * same transaction group as the BRT updates to keep everything in-sync. We can
112 * keep the array in memory, because it is very small. With 16MB regions and
113 * 1TB VDEV the array requires only 128kB of memory (we may decide to decrease
114 * the region size even further in the future). Now, when we want to free
115 * a block, we first consult the array. If the counter for the whole region is
116 * zero, there is no need to look for the BRT entry, as there isn't one for
117 * sure. If the counter for the region is greater than zero, only then we will
118 * do a BRT lookup and if an entry is found we will decrease the reference
119 * counter in the BRT entry and in the entry counters array.
120 *
121 * The entry counters array is small, but can potentially be larger for very
122 * large VDEVs or smaller regions. In this case we don't want to rewrite entire
123 * array on every change. We then divide the array into 32kB block and keep
124 * a bitmap of dirty blocks within a transaction group. When we sync the
125 * transaction group we can only update the parts of the entry counters array
126 * that were modified. Note: Keeping track of the dirty parts of the entry
127 * counters array is implemented, but updating only parts of the array on disk
128 * is not yet implemented - for now we will update entire array if there was
129 * any change.
130 *
131 * The implementation tries to be economic: if BRT is not used, or no longer
132 * used, there will be no entries in the MOS and no additional memory used (eg.
133 * the entry counters array is only allocated if needed).
134 *
135 * Interaction between Deduplication and Block Cloning.
136 *
137 * If both functionalities are in use, we could end up with a block that is
138 * referenced multiple times in both DDT and BRT. When we free one of the
139 * references we couldn't tell where it belongs, so we would have to decide
140 * what table takes the precedence: do we first clear DDT references or BRT
141 * references? To avoid this dilemma BRT cooperates with DDT - if a given block
142 * is being cloned using BRT and the BP has the D (dedup) bit set, BRT will
143 * lookup DDT entry instead and increase the counter there. No BRT entry
144 * will be created for a block which has the D (dedup) bit set.
145 * BRT may be more efficient for manual deduplication, but if the block is
146 * already in the DDT, then creating additional BRT entry would be less
147 * efficient. This clever idea was proposed by Allan Jude.
148 *
149 * Block Cloning across datasets.
150 *
151 * Block Cloning is not limited to cloning blocks within the same dataset.
152 * It is possible (and very useful) to clone blocks between different datasets.
153 * One use case is recovering files from snapshots. By cloning the files into
154 * dataset we need no additional storage. Without Block Cloning we would need
155 * additional space for those files.
156 * Another interesting use case is moving the files between datasets
157 * (copying the file content to the new dataset and removing the source file).
158 * In that case Block Cloning will only be used briefly, because the BRT entries
159 * will be removed when the source is removed.
160 * Block Cloning across encrypted datasets is supported as long as both
161 * datasets share the same master key (e.g. snapshots and clones)
162 *
163 * Block Cloning flow through ZFS layers.
164 *
165 * Note: Block Cloning can be used both for cloning file system blocks and ZVOL
166 * blocks. As of this writing no interface is implemented that allows for block
167 * cloning within a ZVOL.
168 * FreeBSD and Linux provides copy_file_range(2) system call and we will use it
169 * for blocking cloning.
170 *
171 * ssize_t
172 * copy_file_range(int infd, off_t *inoffp, int outfd, off_t *outoffp,
173 * size_t len, unsigned int flags);
174 *
175 * Even though offsets and length represent bytes, they have to be
176 * block-aligned or we will return an error so the upper layer can
177 * fallback to the generic mechanism that will just copy the data.
178 * Using copy_file_range(2) will call OS-independent zfs_clone_range() function.
179 * This function was implemented based on zfs_write(), but instead of writing
180 * the given data we first read block pointers using the new dmu_read_l0_bps()
181 * function from the source file. Once we have BPs from the source file we call
182 * the dmu_brt_clone() function on the destination file. This function
183 * allocates BPs for us. We iterate over all source BPs. If the given BP is
184 * a hole or an embedded block, we just copy BP as-is. If it points to a real
185 * data we place this BP on a BRT pending list using the brt_pending_add()
186 * function.
187 *
188 * We use this pending list to keep track of all BPs that got new references
189 * within this transaction group.
190 *
191 * Some special cases to consider and how we address them:
192 * - The block we want to clone may have been created within the same
193 * transaction group that we are trying to clone. Such block has no BP
194 * allocated yet, so cannot be immediately cloned. We return EAGAIN.
195 * - The block we want to clone may have been modified within the same
196 * transaction group. We return EAGAIN.
197 * - A block may be cloned multiple times during one transaction group (that's
198 * why pending list is actually a tree and not an append-only list - this
199 * way we can figure out faster if this block is cloned for the first time
200 * in this txg or consecutive time).
201 * - A block may be cloned and freed within the same transaction group
202 * (see dbuf_undirty()).
203 * - A block may be cloned and within the same transaction group the clone
204 * can be cloned again (see dmu_read_l0_bps()).
205 * - A file might have been deleted, but the caller still has a file descriptor
206 * open to this file and clones it.
207 *
208 * When we free a block we have an additional step in the ZIO pipeline where we
209 * call the zio_brt_free() function. We then call the brt_entry_decref()
210 * that loads the corresponding BRT entry (if one exists) and decreases
211 * reference counter. If this is not the last reference we will stop ZIO
212 * pipeline here. If this is the last reference or the block is not in the
213 * BRT, we continue the pipeline and free the block as usual.
214 *
215 * At the beginning of spa_sync() where there can be no more block cloning,
216 * but before issuing frees we call brt_pending_apply(). This function applies
217 * all the new clones to the BRT table - we load BRT entries and update
218 * reference counters. To sync new BRT entries to disk, we use brt_sync()
219 * function. This function will sync all dirty per-top-level-vdev BRTs,
220 * the entry counters arrays, etc.
221 *
222 * Block Cloning and ZIL.
223 *
224 * Every clone operation is divided into chunks (similar to write) and each
225 * chunk is cloned in a separate transaction. The chunk size is determined by
226 * how many BPs we can fit into a single ZIL entry.
227 * Replaying clone operation is different from the regular clone operation,
228 * as when we log clone operations we cannot use the source object - it may
229 * reside on a different dataset, so we log BPs we want to clone.
230 * The ZIL is replayed when we mount the given dataset, not when the pool is
231 * imported. Taking this into account it is possible that the pool is imported
232 * without mounting datasets and the source dataset is destroyed before the
233 * destination dataset is mounted and its ZIL replayed.
234 * To address this situation we leverage zil_claim() mechanism where ZFS will
235 * parse all the ZILs on pool import. When we come across TX_CLONE_RANGE
236 * entries, we will bump reference counters for their BPs in the BRT. Then
237 * on mount and ZIL replay we bump the reference counters once more, while the
238 * first references are dropped during ZIL destroy by zil_free_clone_range().
239 * It is possible that after zil_claim() we never mount the destination, so
240 * we never replay its ZIL and just destroy it. In this case the only taken
241 * references will be dropped by zil_free_clone_range(), since the cloning is
242 * not going to ever take place.
243 */
244
245 static kmem_cache_t *brt_entry_cache;
246 static kmem_cache_t *brt_pending_entry_cache;
247
248 /*
249 * Enable/disable prefetching of BRT entries that we are going to modify.
250 */
251 int zfs_brt_prefetch = 1;
252
253 #ifdef ZFS_DEBUG
254 #define BRT_DEBUG(...) do { \
255 if ((zfs_flags & ZFS_DEBUG_BRT) != 0) { \
256 __dprintf(B_TRUE, __FILE__, __func__, __LINE__, __VA_ARGS__); \
257 } \
258 } while (0)
259 #else
260 #define BRT_DEBUG(...) do { } while (0)
261 #endif
262
263 int brt_zap_leaf_blockshift = 12;
264 int brt_zap_indirect_blockshift = 12;
265
266 static kstat_t *brt_ksp;
267
268 typedef struct brt_stats {
269 kstat_named_t brt_addref_entry_in_memory;
270 kstat_named_t brt_addref_entry_not_on_disk;
271 kstat_named_t brt_addref_entry_on_disk;
272 kstat_named_t brt_addref_entry_read_lost_race;
273 kstat_named_t brt_decref_entry_in_memory;
274 kstat_named_t brt_decref_entry_loaded_from_disk;
275 kstat_named_t brt_decref_entry_not_in_memory;
276 kstat_named_t brt_decref_entry_not_on_disk;
277 kstat_named_t brt_decref_entry_read_lost_race;
278 kstat_named_t brt_decref_entry_still_referenced;
279 kstat_named_t brt_decref_free_data_later;
280 kstat_named_t brt_decref_free_data_now;
281 kstat_named_t brt_decref_no_entry;
282 } brt_stats_t;
283
284 static brt_stats_t brt_stats = {
285 { "addref_entry_in_memory", KSTAT_DATA_UINT64 },
286 { "addref_entry_not_on_disk", KSTAT_DATA_UINT64 },
287 { "addref_entry_on_disk", KSTAT_DATA_UINT64 },
288 { "addref_entry_read_lost_race", KSTAT_DATA_UINT64 },
289 { "decref_entry_in_memory", KSTAT_DATA_UINT64 },
290 { "decref_entry_loaded_from_disk", KSTAT_DATA_UINT64 },
291 { "decref_entry_not_in_memory", KSTAT_DATA_UINT64 },
292 { "decref_entry_not_on_disk", KSTAT_DATA_UINT64 },
293 { "decref_entry_read_lost_race", KSTAT_DATA_UINT64 },
294 { "decref_entry_still_referenced", KSTAT_DATA_UINT64 },
295 { "decref_free_data_later", KSTAT_DATA_UINT64 },
296 { "decref_free_data_now", KSTAT_DATA_UINT64 },
297 { "decref_no_entry", KSTAT_DATA_UINT64 }
298 };
299
300 struct {
301 wmsum_t brt_addref_entry_in_memory;
302 wmsum_t brt_addref_entry_not_on_disk;
303 wmsum_t brt_addref_entry_on_disk;
304 wmsum_t brt_addref_entry_read_lost_race;
305 wmsum_t brt_decref_entry_in_memory;
306 wmsum_t brt_decref_entry_loaded_from_disk;
307 wmsum_t brt_decref_entry_not_in_memory;
308 wmsum_t brt_decref_entry_not_on_disk;
309 wmsum_t brt_decref_entry_read_lost_race;
310 wmsum_t brt_decref_entry_still_referenced;
311 wmsum_t brt_decref_free_data_later;
312 wmsum_t brt_decref_free_data_now;
313 wmsum_t brt_decref_no_entry;
314 } brt_sums;
315
316 #define BRTSTAT_BUMP(stat) wmsum_add(&brt_sums.stat, 1)
317
318 static int brt_entry_compare(const void *x1, const void *x2);
319 static int brt_pending_entry_compare(const void *x1, const void *x2);
320
321 static void
322 brt_rlock(brt_t *brt)
323 {
324 rw_enter(&brt->brt_lock, RW_READER);
325 }
326
327 static void
328 brt_wlock(brt_t *brt)
329 {
330 rw_enter(&brt->brt_lock, RW_WRITER);
331 }
332
333 static void
334 brt_unlock(brt_t *brt)
335 {
336 rw_exit(&brt->brt_lock);
337 }
338
339 static uint16_t
340 brt_vdev_entcount_get(const brt_vdev_t *brtvd, uint64_t idx)
341 {
342
343 ASSERT3U(idx, <, brtvd->bv_size);
344
345 if (brtvd->bv_need_byteswap) {
346 return (BSWAP_16(brtvd->bv_entcount[idx]));
347 } else {
348 return (brtvd->bv_entcount[idx]);
349 }
350 }
351
352 static void
353 brt_vdev_entcount_set(brt_vdev_t *brtvd, uint64_t idx, uint16_t entcnt)
354 {
355
356 ASSERT3U(idx, <, brtvd->bv_size);
357
358 if (brtvd->bv_need_byteswap) {
359 brtvd->bv_entcount[idx] = BSWAP_16(entcnt);
360 } else {
361 brtvd->bv_entcount[idx] = entcnt;
362 }
363 }
364
365 static void
366 brt_vdev_entcount_inc(brt_vdev_t *brtvd, uint64_t idx)
367 {
368 uint16_t entcnt;
369
370 ASSERT3U(idx, <, brtvd->bv_size);
371
372 entcnt = brt_vdev_entcount_get(brtvd, idx);
373 ASSERT(entcnt < UINT16_MAX);
374
375 brt_vdev_entcount_set(brtvd, idx, entcnt + 1);
376 }
377
378 static void
379 brt_vdev_entcount_dec(brt_vdev_t *brtvd, uint64_t idx)
380 {
381 uint16_t entcnt;
382
383 ASSERT3U(idx, <, brtvd->bv_size);
384
385 entcnt = brt_vdev_entcount_get(brtvd, idx);
386 ASSERT(entcnt > 0);
387
388 brt_vdev_entcount_set(brtvd, idx, entcnt - 1);
389 }
390
391 #ifdef ZFS_DEBUG
392 static void
393 brt_vdev_dump(brt_t *brt)
394 {
395 brt_vdev_t *brtvd;
396 uint64_t vdevid;
397
398 if ((zfs_flags & ZFS_DEBUG_BRT) == 0) {
399 return;
400 }
401
402 if (brt->brt_nvdevs == 0) {
403 zfs_dbgmsg("BRT empty");
404 return;
405 }
406
407 zfs_dbgmsg("BRT vdev dump:");
408 for (vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) {
409 uint64_t idx;
410
411 brtvd = &brt->brt_vdevs[vdevid];
412 zfs_dbgmsg(" vdevid=%llu/%llu meta_dirty=%d entcount_dirty=%d "
413 "size=%llu totalcount=%llu nblocks=%llu bitmapsize=%zu\n",
414 (u_longlong_t)vdevid, (u_longlong_t)brtvd->bv_vdevid,
415 brtvd->bv_meta_dirty, brtvd->bv_entcount_dirty,
416 (u_longlong_t)brtvd->bv_size,
417 (u_longlong_t)brtvd->bv_totalcount,
418 (u_longlong_t)brtvd->bv_nblocks,
419 (size_t)BT_SIZEOFMAP(brtvd->bv_nblocks));
420 if (brtvd->bv_totalcount > 0) {
421 zfs_dbgmsg(" entcounts:");
422 for (idx = 0; idx < brtvd->bv_size; idx++) {
423 if (brt_vdev_entcount_get(brtvd, idx) > 0) {
424 zfs_dbgmsg(" [%04llu] %hu",
425 (u_longlong_t)idx,
426 brt_vdev_entcount_get(brtvd, idx));
427 }
428 }
429 }
430 if (brtvd->bv_entcount_dirty) {
431 char *bitmap;
432
433 bitmap = kmem_alloc(brtvd->bv_nblocks + 1, KM_SLEEP);
434 for (idx = 0; idx < brtvd->bv_nblocks; idx++) {
435 bitmap[idx] =
436 BT_TEST(brtvd->bv_bitmap, idx) ? 'x' : '.';
437 }
438 bitmap[idx] = '\0';
439 zfs_dbgmsg(" bitmap: %s", bitmap);
440 kmem_free(bitmap, brtvd->bv_nblocks + 1);
441 }
442 }
443 }
444 #endif
445
446 static brt_vdev_t *
447 brt_vdev(brt_t *brt, uint64_t vdevid)
448 {
449 brt_vdev_t *brtvd;
450
451 ASSERT(RW_LOCK_HELD(&brt->brt_lock));
452
453 if (vdevid < brt->brt_nvdevs) {
454 brtvd = &brt->brt_vdevs[vdevid];
455 } else {
456 brtvd = NULL;
457 }
458
459 return (brtvd);
460 }
461
462 static void
463 brt_vdev_create(brt_t *brt, brt_vdev_t *brtvd, dmu_tx_t *tx)
464 {
465 char name[64];
466
467 ASSERT(RW_WRITE_HELD(&brt->brt_lock));
468 ASSERT0(brtvd->bv_mos_brtvdev);
469 ASSERT0(brtvd->bv_mos_entries);
470 ASSERT(brtvd->bv_entcount != NULL);
471 ASSERT(brtvd->bv_size > 0);
472 ASSERT(brtvd->bv_bitmap != NULL);
473 ASSERT(brtvd->bv_nblocks > 0);
474
475 brtvd->bv_mos_entries = zap_create_flags(brt->brt_mos, 0,
476 ZAP_FLAG_HASH64 | ZAP_FLAG_UINT64_KEY, DMU_OTN_ZAP_METADATA,
477 brt_zap_leaf_blockshift, brt_zap_indirect_blockshift, DMU_OT_NONE,
478 0, tx);
479 VERIFY(brtvd->bv_mos_entries != 0);
480 BRT_DEBUG("MOS entries created, object=%llu",
481 (u_longlong_t)brtvd->bv_mos_entries);
482
483 /*
484 * We allocate DMU buffer to store the bv_entcount[] array.
485 * We will keep array size (bv_size) and cummulative count for all
486 * bv_entcount[]s (bv_totalcount) in the bonus buffer.
487 */
488 brtvd->bv_mos_brtvdev = dmu_object_alloc(brt->brt_mos,
489 DMU_OTN_UINT64_METADATA, BRT_BLOCKSIZE,
490 DMU_OTN_UINT64_METADATA, sizeof (brt_vdev_phys_t), tx);
491 VERIFY(brtvd->bv_mos_brtvdev != 0);
492 BRT_DEBUG("MOS BRT VDEV created, object=%llu",
493 (u_longlong_t)brtvd->bv_mos_brtvdev);
494
495 snprintf(name, sizeof (name), "%s%llu", BRT_OBJECT_VDEV_PREFIX,
496 (u_longlong_t)brtvd->bv_vdevid);
497 VERIFY0(zap_add(brt->brt_mos, DMU_POOL_DIRECTORY_OBJECT, name,
498 sizeof (uint64_t), 1, &brtvd->bv_mos_brtvdev, tx));
499 BRT_DEBUG("Pool directory object created, object=%s", name);
500
501 spa_feature_incr(brt->brt_spa, SPA_FEATURE_BLOCK_CLONING, tx);
502 }
503
504 static void
505 brt_vdev_realloc(brt_t *brt, brt_vdev_t *brtvd)
506 {
507 vdev_t *vd;
508 uint16_t *entcount;
509 ulong_t *bitmap;
510 uint64_t nblocks, size;
511
512 ASSERT(RW_WRITE_HELD(&brt->brt_lock));
513
514 spa_config_enter(brt->brt_spa, SCL_VDEV, FTAG, RW_READER);
515 vd = vdev_lookup_top(brt->brt_spa, brtvd->bv_vdevid);
516 size = (vdev_get_min_asize(vd) - 1) / brt->brt_rangesize + 1;
517 spa_config_exit(brt->brt_spa, SCL_VDEV, FTAG);
518
519 entcount = vmem_zalloc(sizeof (entcount[0]) * size, KM_SLEEP);
520 nblocks = BRT_RANGESIZE_TO_NBLOCKS(size);
521 bitmap = kmem_zalloc(BT_SIZEOFMAP(nblocks), KM_SLEEP);
522
523 if (!brtvd->bv_initiated) {
524 ASSERT0(brtvd->bv_size);
525 ASSERT(brtvd->bv_entcount == NULL);
526 ASSERT(brtvd->bv_bitmap == NULL);
527 ASSERT0(brtvd->bv_nblocks);
528
529 avl_create(&brtvd->bv_tree, brt_entry_compare,
530 sizeof (brt_entry_t), offsetof(brt_entry_t, bre_node));
531 } else {
532 ASSERT(brtvd->bv_size > 0);
533 ASSERT(brtvd->bv_entcount != NULL);
534 ASSERT(brtvd->bv_bitmap != NULL);
535 ASSERT(brtvd->bv_nblocks > 0);
536 /*
537 * TODO: Allow vdev shrinking. We only need to implement
538 * shrinking the on-disk BRT VDEV object.
539 * dmu_free_range(brt->brt_mos, brtvd->bv_mos_brtvdev, offset,
540 * size, tx);
541 */
542 ASSERT3U(brtvd->bv_size, <=, size);
543
544 memcpy(entcount, brtvd->bv_entcount,
545 sizeof (entcount[0]) * MIN(size, brtvd->bv_size));
546 memcpy(bitmap, brtvd->bv_bitmap, MIN(BT_SIZEOFMAP(nblocks),
547 BT_SIZEOFMAP(brtvd->bv_nblocks)));
548 vmem_free(brtvd->bv_entcount,
549 sizeof (entcount[0]) * brtvd->bv_size);
550 kmem_free(brtvd->bv_bitmap, BT_SIZEOFMAP(brtvd->bv_nblocks));
551 }
552
553 brtvd->bv_size = size;
554 brtvd->bv_entcount = entcount;
555 brtvd->bv_bitmap = bitmap;
556 brtvd->bv_nblocks = nblocks;
557 if (!brtvd->bv_initiated) {
558 brtvd->bv_need_byteswap = FALSE;
559 brtvd->bv_initiated = TRUE;
560 BRT_DEBUG("BRT VDEV %llu initiated.",
561 (u_longlong_t)brtvd->bv_vdevid);
562 }
563 }
564
565 static void
566 brt_vdev_load(brt_t *brt, brt_vdev_t *brtvd)
567 {
568 char name[64];
569 dmu_buf_t *db;
570 brt_vdev_phys_t *bvphys;
571 int error;
572
573 snprintf(name, sizeof (name), "%s%llu", BRT_OBJECT_VDEV_PREFIX,
574 (u_longlong_t)brtvd->bv_vdevid);
575 error = zap_lookup(brt->brt_mos, DMU_POOL_DIRECTORY_OBJECT, name,
576 sizeof (uint64_t), 1, &brtvd->bv_mos_brtvdev);
577 if (error != 0)
578 return;
579 ASSERT(brtvd->bv_mos_brtvdev != 0);
580
581 error = dmu_bonus_hold(brt->brt_mos, brtvd->bv_mos_brtvdev, FTAG, &db);
582 ASSERT0(error);
583 if (error != 0)
584 return;
585
586 bvphys = db->db_data;
587 if (brt->brt_rangesize == 0) {
588 brt->brt_rangesize = bvphys->bvp_rangesize;
589 } else {
590 ASSERT3U(brt->brt_rangesize, ==, bvphys->bvp_rangesize);
591 }
592
593 ASSERT(!brtvd->bv_initiated);
594 brt_vdev_realloc(brt, brtvd);
595
596 /* TODO: We don't support VDEV shrinking. */
597 ASSERT3U(bvphys->bvp_size, <=, brtvd->bv_size);
598
599 /*
600 * If VDEV grew, we will leave new bv_entcount[] entries zeroed out.
601 */
602 error = dmu_read(brt->brt_mos, brtvd->bv_mos_brtvdev, 0,
603 MIN(brtvd->bv_size, bvphys->bvp_size) * sizeof (uint16_t),
604 brtvd->bv_entcount, DMU_READ_NO_PREFETCH);
605 ASSERT0(error);
606
607 brtvd->bv_mos_entries = bvphys->bvp_mos_entries;
608 ASSERT(brtvd->bv_mos_entries != 0);
609 brtvd->bv_need_byteswap =
610 (bvphys->bvp_byteorder != BRT_NATIVE_BYTEORDER);
611 brtvd->bv_totalcount = bvphys->bvp_totalcount;
612 brtvd->bv_usedspace = bvphys->bvp_usedspace;
613 brtvd->bv_savedspace = bvphys->bvp_savedspace;
614 brt->brt_usedspace += brtvd->bv_usedspace;
615 brt->brt_savedspace += brtvd->bv_savedspace;
616
617 dmu_buf_rele(db, FTAG);
618
619 BRT_DEBUG("MOS BRT VDEV %s loaded: mos_brtvdev=%llu, mos_entries=%llu",
620 name, (u_longlong_t)brtvd->bv_mos_brtvdev,
621 (u_longlong_t)brtvd->bv_mos_entries);
622 }
623
624 static void
625 brt_vdev_dealloc(brt_t *brt, brt_vdev_t *brtvd)
626 {
627
628 ASSERT(RW_WRITE_HELD(&brt->brt_lock));
629 ASSERT(brtvd->bv_initiated);
630
631 vmem_free(brtvd->bv_entcount, sizeof (uint16_t) * brtvd->bv_size);
632 brtvd->bv_entcount = NULL;
633 kmem_free(brtvd->bv_bitmap, BT_SIZEOFMAP(brtvd->bv_nblocks));
634 brtvd->bv_bitmap = NULL;
635 ASSERT0(avl_numnodes(&brtvd->bv_tree));
636 avl_destroy(&brtvd->bv_tree);
637
638 brtvd->bv_size = 0;
639 brtvd->bv_nblocks = 0;
640
641 brtvd->bv_initiated = FALSE;
642 BRT_DEBUG("BRT VDEV %llu deallocated.", (u_longlong_t)brtvd->bv_vdevid);
643 }
644
645 static void
646 brt_vdev_destroy(brt_t *brt, brt_vdev_t *brtvd, dmu_tx_t *tx)
647 {
648 char name[64];
649 uint64_t count;
650 dmu_buf_t *db;
651 brt_vdev_phys_t *bvphys;
652
653 ASSERT(RW_WRITE_HELD(&brt->brt_lock));
654 ASSERT(brtvd->bv_mos_brtvdev != 0);
655 ASSERT(brtvd->bv_mos_entries != 0);
656
657 VERIFY0(zap_count(brt->brt_mos, brtvd->bv_mos_entries, &count));
658 VERIFY0(count);
659 VERIFY0(zap_destroy(brt->brt_mos, brtvd->bv_mos_entries, tx));
660 BRT_DEBUG("MOS entries destroyed, object=%llu",
661 (u_longlong_t)brtvd->bv_mos_entries);
662 brtvd->bv_mos_entries = 0;
663
664 VERIFY0(dmu_bonus_hold(brt->brt_mos, brtvd->bv_mos_brtvdev, FTAG, &db));
665 bvphys = db->db_data;
666 ASSERT0(bvphys->bvp_totalcount);
667 ASSERT0(bvphys->bvp_usedspace);
668 ASSERT0(bvphys->bvp_savedspace);
669 dmu_buf_rele(db, FTAG);
670
671 VERIFY0(dmu_object_free(brt->brt_mos, brtvd->bv_mos_brtvdev, tx));
672 BRT_DEBUG("MOS BRT VDEV destroyed, object=%llu",
673 (u_longlong_t)brtvd->bv_mos_brtvdev);
674 brtvd->bv_mos_brtvdev = 0;
675
676 snprintf(name, sizeof (name), "%s%llu", BRT_OBJECT_VDEV_PREFIX,
677 (u_longlong_t)brtvd->bv_vdevid);
678 VERIFY0(zap_remove(brt->brt_mos, DMU_POOL_DIRECTORY_OBJECT, name, tx));
679 BRT_DEBUG("Pool directory object removed, object=%s", name);
680
681 brt_vdev_dealloc(brt, brtvd);
682
683 spa_feature_decr(brt->brt_spa, SPA_FEATURE_BLOCK_CLONING, tx);
684 }
685
686 static void
687 brt_vdevs_expand(brt_t *brt, uint64_t nvdevs)
688 {
689 brt_vdev_t *brtvd, *vdevs;
690 uint64_t vdevid;
691
692 ASSERT(RW_WRITE_HELD(&brt->brt_lock));
693 ASSERT3U(nvdevs, >, brt->brt_nvdevs);
694
695 vdevs = kmem_zalloc(sizeof (vdevs[0]) * nvdevs, KM_SLEEP);
696 if (brt->brt_nvdevs > 0) {
697 ASSERT(brt->brt_vdevs != NULL);
698
699 memcpy(vdevs, brt->brt_vdevs,
700 sizeof (brt_vdev_t) * brt->brt_nvdevs);
701 kmem_free(brt->brt_vdevs,
702 sizeof (brt_vdev_t) * brt->brt_nvdevs);
703 }
704 for (vdevid = brt->brt_nvdevs; vdevid < nvdevs; vdevid++) {
705 brtvd = &vdevs[vdevid];
706
707 brtvd->bv_vdevid = vdevid;
708 brtvd->bv_initiated = FALSE;
709 }
710
711 BRT_DEBUG("BRT VDEVs expanded from %llu to %llu.",
712 (u_longlong_t)brt->brt_nvdevs, (u_longlong_t)nvdevs);
713
714 brt->brt_vdevs = vdevs;
715 brt->brt_nvdevs = nvdevs;
716 }
717
718 static boolean_t
719 brt_vdev_lookup(brt_t *brt, brt_vdev_t *brtvd, const brt_entry_t *bre)
720 {
721 uint64_t idx;
722
723 ASSERT(RW_LOCK_HELD(&brt->brt_lock));
724
725 idx = bre->bre_offset / brt->brt_rangesize;
726 if (brtvd->bv_entcount != NULL && idx < brtvd->bv_size) {
727 /* VDEV wasn't expanded. */
728 return (brt_vdev_entcount_get(brtvd, idx) > 0);
729 }
730
731 return (FALSE);
732 }
733
734 static void
735 brt_vdev_addref(brt_t *brt, brt_vdev_t *brtvd, const brt_entry_t *bre,
736 uint64_t dsize)
737 {
738 uint64_t idx;
739
740 ASSERT(RW_LOCK_HELD(&brt->brt_lock));
741 ASSERT(brtvd != NULL);
742 ASSERT(brtvd->bv_entcount != NULL);
743
744 brt->brt_savedspace += dsize;
745 brtvd->bv_savedspace += dsize;
746 brtvd->bv_meta_dirty = TRUE;
747
748 if (bre->bre_refcount > 1) {
749 return;
750 }
751
752 brt->brt_usedspace += dsize;
753 brtvd->bv_usedspace += dsize;
754
755 idx = bre->bre_offset / brt->brt_rangesize;
756 if (idx >= brtvd->bv_size) {
757 /* VDEV has been expanded. */
758 brt_vdev_realloc(brt, brtvd);
759 }
760
761 ASSERT3U(idx, <, brtvd->bv_size);
762
763 brtvd->bv_totalcount++;
764 brt_vdev_entcount_inc(brtvd, idx);
765 brtvd->bv_entcount_dirty = TRUE;
766 idx = idx / BRT_BLOCKSIZE / 8;
767 BT_SET(brtvd->bv_bitmap, idx);
768
769 #ifdef ZFS_DEBUG
770 brt_vdev_dump(brt);
771 #endif
772 }
773
774 static void
775 brt_vdev_decref(brt_t *brt, brt_vdev_t *brtvd, const brt_entry_t *bre,
776 uint64_t dsize)
777 {
778 uint64_t idx;
779
780 ASSERT(RW_WRITE_HELD(&brt->brt_lock));
781 ASSERT(brtvd != NULL);
782 ASSERT(brtvd->bv_entcount != NULL);
783
784 brt->brt_savedspace -= dsize;
785 brtvd->bv_savedspace -= dsize;
786 brtvd->bv_meta_dirty = TRUE;
787
788 if (bre->bre_refcount > 0) {
789 return;
790 }
791
792 brt->brt_usedspace -= dsize;
793 brtvd->bv_usedspace -= dsize;
794
795 idx = bre->bre_offset / brt->brt_rangesize;
796 ASSERT3U(idx, <, brtvd->bv_size);
797
798 ASSERT(brtvd->bv_totalcount > 0);
799 brtvd->bv_totalcount--;
800 brt_vdev_entcount_dec(brtvd, idx);
801 brtvd->bv_entcount_dirty = TRUE;
802 idx = idx / BRT_BLOCKSIZE / 8;
803 BT_SET(brtvd->bv_bitmap, idx);
804
805 #ifdef ZFS_DEBUG
806 brt_vdev_dump(brt);
807 #endif
808 }
809
810 static void
811 brt_vdev_sync(brt_t *brt, brt_vdev_t *brtvd, dmu_tx_t *tx)
812 {
813 dmu_buf_t *db;
814 brt_vdev_phys_t *bvphys;
815
816 ASSERT(brtvd->bv_meta_dirty);
817 ASSERT(brtvd->bv_mos_brtvdev != 0);
818 ASSERT(dmu_tx_is_syncing(tx));
819
820 VERIFY0(dmu_bonus_hold(brt->brt_mos, brtvd->bv_mos_brtvdev, FTAG, &db));
821
822 if (brtvd->bv_entcount_dirty) {
823 /*
824 * TODO: Walk brtvd->bv_bitmap and write only the dirty blocks.
825 */
826 dmu_write(brt->brt_mos, brtvd->bv_mos_brtvdev, 0,
827 brtvd->bv_size * sizeof (brtvd->bv_entcount[0]),
828 brtvd->bv_entcount, tx);
829 memset(brtvd->bv_bitmap, 0, BT_SIZEOFMAP(brtvd->bv_nblocks));
830 brtvd->bv_entcount_dirty = FALSE;
831 }
832
833 dmu_buf_will_dirty(db, tx);
834 bvphys = db->db_data;
835 bvphys->bvp_mos_entries = brtvd->bv_mos_entries;
836 bvphys->bvp_size = brtvd->bv_size;
837 if (brtvd->bv_need_byteswap) {
838 bvphys->bvp_byteorder = BRT_NON_NATIVE_BYTEORDER;
839 } else {
840 bvphys->bvp_byteorder = BRT_NATIVE_BYTEORDER;
841 }
842 bvphys->bvp_totalcount = brtvd->bv_totalcount;
843 bvphys->bvp_rangesize = brt->brt_rangesize;
844 bvphys->bvp_usedspace = brtvd->bv_usedspace;
845 bvphys->bvp_savedspace = brtvd->bv_savedspace;
846 dmu_buf_rele(db, FTAG);
847
848 brtvd->bv_meta_dirty = FALSE;
849 }
850
851 static void
852 brt_vdevs_alloc(brt_t *brt, boolean_t load)
853 {
854 brt_vdev_t *brtvd;
855 uint64_t vdevid;
856
857 brt_wlock(brt);
858
859 brt_vdevs_expand(brt, brt->brt_spa->spa_root_vdev->vdev_children);
860
861 if (load) {
862 for (vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) {
863 brtvd = &brt->brt_vdevs[vdevid];
864 ASSERT(brtvd->bv_entcount == NULL);
865
866 brt_vdev_load(brt, brtvd);
867 }
868 }
869
870 if (brt->brt_rangesize == 0) {
871 brt->brt_rangesize = BRT_RANGESIZE;
872 }
873
874 brt_unlock(brt);
875 }
876
877 static void
878 brt_vdevs_free(brt_t *brt)
879 {
880 brt_vdev_t *brtvd;
881 uint64_t vdevid;
882
883 brt_wlock(brt);
884
885 for (vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) {
886 brtvd = &brt->brt_vdevs[vdevid];
887 if (brtvd->bv_initiated)
888 brt_vdev_dealloc(brt, brtvd);
889 }
890 kmem_free(brt->brt_vdevs, sizeof (brt_vdev_t) * brt->brt_nvdevs);
891
892 brt_unlock(brt);
893 }
894
895 static void
896 brt_entry_fill(const blkptr_t *bp, brt_entry_t *bre, uint64_t *vdevidp)
897 {
898
899 bre->bre_offset = DVA_GET_OFFSET(&bp->blk_dva[0]);
900 bre->bre_refcount = 0;
901
902 *vdevidp = DVA_GET_VDEV(&bp->blk_dva[0]);
903 }
904
905 static int
906 brt_entry_compare(const void *x1, const void *x2)
907 {
908 const brt_entry_t *bre1 = x1;
909 const brt_entry_t *bre2 = x2;
910
911 return (TREE_CMP(bre1->bre_offset, bre2->bre_offset));
912 }
913
914 static int
915 brt_entry_lookup(brt_t *brt, brt_vdev_t *brtvd, brt_entry_t *bre)
916 {
917 uint64_t mos_entries;
918 uint64_t one, physsize;
919 int error;
920
921 ASSERT(RW_LOCK_HELD(&brt->brt_lock));
922
923 if (!brt_vdev_lookup(brt, brtvd, bre))
924 return (SET_ERROR(ENOENT));
925
926 /*
927 * Remember mos_entries object number. After we reacquire the BRT lock,
928 * the brtvd pointer may be invalid.
929 */
930 mos_entries = brtvd->bv_mos_entries;
931 if (mos_entries == 0)
932 return (SET_ERROR(ENOENT));
933
934 brt_unlock(brt);
935
936 error = zap_length_uint64(brt->brt_mos, mos_entries, &bre->bre_offset,
937 BRT_KEY_WORDS, &one, &physsize);
938 if (error == 0) {
939 ASSERT3U(one, ==, 1);
940 ASSERT3U(physsize, ==, sizeof (bre->bre_refcount));
941
942 error = zap_lookup_uint64(brt->brt_mos, mos_entries,
943 &bre->bre_offset, BRT_KEY_WORDS, 1,
944 sizeof (bre->bre_refcount), &bre->bre_refcount);
945 BRT_DEBUG("ZAP lookup: object=%llu vdev=%llu offset=%llu "
946 "count=%llu error=%d", (u_longlong_t)mos_entries,
947 (u_longlong_t)brtvd->bv_vdevid,
948 (u_longlong_t)bre->bre_offset,
949 error == 0 ? (u_longlong_t)bre->bre_refcount : 0, error);
950 }
951
952 brt_wlock(brt);
953
954 return (error);
955 }
956
957 static void
958 brt_entry_prefetch(brt_t *brt, uint64_t vdevid, brt_entry_t *bre)
959 {
960 brt_vdev_t *brtvd;
961 uint64_t mos_entries = 0;
962
963 brt_rlock(brt);
964 brtvd = brt_vdev(brt, vdevid);
965 if (brtvd != NULL)
966 mos_entries = brtvd->bv_mos_entries;
967 brt_unlock(brt);
968
969 if (mos_entries == 0)
970 return;
971
972 BRT_DEBUG("ZAP prefetch: object=%llu vdev=%llu offset=%llu",
973 (u_longlong_t)mos_entries, (u_longlong_t)vdevid,
974 (u_longlong_t)bre->bre_offset);
975 (void) zap_prefetch_uint64(brt->brt_mos, mos_entries,
976 (uint64_t *)&bre->bre_offset, BRT_KEY_WORDS);
977 }
978
979 static int
980 brt_entry_update(brt_t *brt, brt_vdev_t *brtvd, brt_entry_t *bre, dmu_tx_t *tx)
981 {
982 int error;
983
984 ASSERT(RW_LOCK_HELD(&brt->brt_lock));
985 ASSERT(brtvd->bv_mos_entries != 0);
986 ASSERT(bre->bre_refcount > 0);
987
988 error = zap_update_uint64(brt->brt_mos, brtvd->bv_mos_entries,
989 (uint64_t *)&bre->bre_offset, BRT_KEY_WORDS, 1,
990 sizeof (bre->bre_refcount), &bre->bre_refcount, tx);
991 BRT_DEBUG("ZAP update: object=%llu vdev=%llu offset=%llu count=%llu "
992 "error=%d", (u_longlong_t)brtvd->bv_mos_entries,
993 (u_longlong_t)brtvd->bv_vdevid, (u_longlong_t)bre->bre_offset,
994 (u_longlong_t)bre->bre_refcount, error);
995
996 return (error);
997 }
998
999 static int
1000 brt_entry_remove(brt_t *brt, brt_vdev_t *brtvd, brt_entry_t *bre, dmu_tx_t *tx)
1001 {
1002 int error;
1003
1004 ASSERT(RW_LOCK_HELD(&brt->brt_lock));
1005 ASSERT(brtvd->bv_mos_entries != 0);
1006 ASSERT0(bre->bre_refcount);
1007
1008 error = zap_remove_uint64(brt->brt_mos, brtvd->bv_mos_entries,
1009 (uint64_t *)&bre->bre_offset, BRT_KEY_WORDS, tx);
1010 BRT_DEBUG("ZAP remove: object=%llu vdev=%llu offset=%llu count=%llu "
1011 "error=%d", (u_longlong_t)brtvd->bv_mos_entries,
1012 (u_longlong_t)brtvd->bv_vdevid, (u_longlong_t)bre->bre_offset,
1013 (u_longlong_t)bre->bre_refcount, error);
1014
1015 return (error);
1016 }
1017
1018 /*
1019 * Return TRUE if we _can_ have BRT entry for this bp. It might be false
1020 * positive, but gives us quick answer if we should look into BRT, which
1021 * may require reads and thus will be more expensive.
1022 */
1023 boolean_t
1024 brt_maybe_exists(spa_t *spa, const blkptr_t *bp)
1025 {
1026 brt_t *brt = spa->spa_brt;
1027 brt_vdev_t *brtvd;
1028 brt_entry_t bre_search;
1029 boolean_t mayexists = FALSE;
1030 uint64_t vdevid;
1031
1032 brt_entry_fill(bp, &bre_search, &vdevid);
1033
1034 brt_rlock(brt);
1035
1036 brtvd = brt_vdev(brt, vdevid);
1037 if (brtvd != NULL && brtvd->bv_initiated) {
1038 if (!avl_is_empty(&brtvd->bv_tree) ||
1039 brt_vdev_lookup(brt, brtvd, &bre_search)) {
1040 mayexists = TRUE;
1041 }
1042 }
1043
1044 brt_unlock(brt);
1045
1046 return (mayexists);
1047 }
1048
1049 uint64_t
1050 brt_get_dspace(spa_t *spa)
1051 {
1052 brt_t *brt = spa->spa_brt;
1053
1054 if (brt == NULL)
1055 return (0);
1056
1057 return (brt->brt_savedspace);
1058 }
1059
1060 uint64_t
1061 brt_get_used(spa_t *spa)
1062 {
1063 brt_t *brt = spa->spa_brt;
1064
1065 if (brt == NULL)
1066 return (0);
1067
1068 return (brt->brt_usedspace);
1069 }
1070
1071 uint64_t
1072 brt_get_saved(spa_t *spa)
1073 {
1074 brt_t *brt = spa->spa_brt;
1075
1076 if (brt == NULL)
1077 return (0);
1078
1079 return (brt->brt_savedspace);
1080 }
1081
1082 uint64_t
1083 brt_get_ratio(spa_t *spa)
1084 {
1085 brt_t *brt = spa->spa_brt;
1086
1087 if (brt->brt_usedspace == 0)
1088 return (100);
1089
1090 return ((brt->brt_usedspace + brt->brt_savedspace) * 100 /
1091 brt->brt_usedspace);
1092 }
1093
1094 static int
1095 brt_kstats_update(kstat_t *ksp, int rw)
1096 {
1097 brt_stats_t *bs = ksp->ks_data;
1098
1099 if (rw == KSTAT_WRITE)
1100 return (EACCES);
1101
1102 bs->brt_addref_entry_in_memory.value.ui64 =
1103 wmsum_value(&brt_sums.brt_addref_entry_in_memory);
1104 bs->brt_addref_entry_not_on_disk.value.ui64 =
1105 wmsum_value(&brt_sums.brt_addref_entry_not_on_disk);
1106 bs->brt_addref_entry_on_disk.value.ui64 =
1107 wmsum_value(&brt_sums.brt_addref_entry_on_disk);
1108 bs->brt_addref_entry_read_lost_race.value.ui64 =
1109 wmsum_value(&brt_sums.brt_addref_entry_read_lost_race);
1110 bs->brt_decref_entry_in_memory.value.ui64 =
1111 wmsum_value(&brt_sums.brt_decref_entry_in_memory);
1112 bs->brt_decref_entry_loaded_from_disk.value.ui64 =
1113 wmsum_value(&brt_sums.brt_decref_entry_loaded_from_disk);
1114 bs->brt_decref_entry_not_in_memory.value.ui64 =
1115 wmsum_value(&brt_sums.brt_decref_entry_not_in_memory);
1116 bs->brt_decref_entry_not_on_disk.value.ui64 =
1117 wmsum_value(&brt_sums.brt_decref_entry_not_on_disk);
1118 bs->brt_decref_entry_read_lost_race.value.ui64 =
1119 wmsum_value(&brt_sums.brt_decref_entry_read_lost_race);
1120 bs->brt_decref_entry_still_referenced.value.ui64 =
1121 wmsum_value(&brt_sums.brt_decref_entry_still_referenced);
1122 bs->brt_decref_free_data_later.value.ui64 =
1123 wmsum_value(&brt_sums.brt_decref_free_data_later);
1124 bs->brt_decref_free_data_now.value.ui64 =
1125 wmsum_value(&brt_sums.brt_decref_free_data_now);
1126 bs->brt_decref_no_entry.value.ui64 =
1127 wmsum_value(&brt_sums.brt_decref_no_entry);
1128
1129 return (0);
1130 }
1131
1132 static void
1133 brt_stat_init(void)
1134 {
1135
1136 wmsum_init(&brt_sums.brt_addref_entry_in_memory, 0);
1137 wmsum_init(&brt_sums.brt_addref_entry_not_on_disk, 0);
1138 wmsum_init(&brt_sums.brt_addref_entry_on_disk, 0);
1139 wmsum_init(&brt_sums.brt_addref_entry_read_lost_race, 0);
1140 wmsum_init(&brt_sums.brt_decref_entry_in_memory, 0);
1141 wmsum_init(&brt_sums.brt_decref_entry_loaded_from_disk, 0);
1142 wmsum_init(&brt_sums.brt_decref_entry_not_in_memory, 0);
1143 wmsum_init(&brt_sums.brt_decref_entry_not_on_disk, 0);
1144 wmsum_init(&brt_sums.brt_decref_entry_read_lost_race, 0);
1145 wmsum_init(&brt_sums.brt_decref_entry_still_referenced, 0);
1146 wmsum_init(&brt_sums.brt_decref_free_data_later, 0);
1147 wmsum_init(&brt_sums.brt_decref_free_data_now, 0);
1148 wmsum_init(&brt_sums.brt_decref_no_entry, 0);
1149
1150 brt_ksp = kstat_create("zfs", 0, "brtstats", "misc", KSTAT_TYPE_NAMED,
1151 sizeof (brt_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
1152 if (brt_ksp != NULL) {
1153 brt_ksp->ks_data = &brt_stats;
1154 brt_ksp->ks_update = brt_kstats_update;
1155 kstat_install(brt_ksp);
1156 }
1157 }
1158
1159 static void
1160 brt_stat_fini(void)
1161 {
1162 if (brt_ksp != NULL) {
1163 kstat_delete(brt_ksp);
1164 brt_ksp = NULL;
1165 }
1166
1167 wmsum_fini(&brt_sums.brt_addref_entry_in_memory);
1168 wmsum_fini(&brt_sums.brt_addref_entry_not_on_disk);
1169 wmsum_fini(&brt_sums.brt_addref_entry_on_disk);
1170 wmsum_fini(&brt_sums.brt_addref_entry_read_lost_race);
1171 wmsum_fini(&brt_sums.brt_decref_entry_in_memory);
1172 wmsum_fini(&brt_sums.brt_decref_entry_loaded_from_disk);
1173 wmsum_fini(&brt_sums.brt_decref_entry_not_in_memory);
1174 wmsum_fini(&brt_sums.brt_decref_entry_not_on_disk);
1175 wmsum_fini(&brt_sums.brt_decref_entry_read_lost_race);
1176 wmsum_fini(&brt_sums.brt_decref_entry_still_referenced);
1177 wmsum_fini(&brt_sums.brt_decref_free_data_later);
1178 wmsum_fini(&brt_sums.brt_decref_free_data_now);
1179 wmsum_fini(&brt_sums.brt_decref_no_entry);
1180 }
1181
1182 void
1183 brt_init(void)
1184 {
1185 brt_entry_cache = kmem_cache_create("brt_entry_cache",
1186 sizeof (brt_entry_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
1187 brt_pending_entry_cache = kmem_cache_create("brt_pending_entry_cache",
1188 sizeof (brt_pending_entry_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
1189
1190 brt_stat_init();
1191 }
1192
1193 void
1194 brt_fini(void)
1195 {
1196 brt_stat_fini();
1197
1198 kmem_cache_destroy(brt_entry_cache);
1199 kmem_cache_destroy(brt_pending_entry_cache);
1200 }
1201
1202 static brt_entry_t *
1203 brt_entry_alloc(const brt_entry_t *bre_init)
1204 {
1205 brt_entry_t *bre;
1206
1207 bre = kmem_cache_alloc(brt_entry_cache, KM_SLEEP);
1208 bre->bre_offset = bre_init->bre_offset;
1209 bre->bre_refcount = bre_init->bre_refcount;
1210
1211 return (bre);
1212 }
1213
1214 static void
1215 brt_entry_free(brt_entry_t *bre)
1216 {
1217
1218 kmem_cache_free(brt_entry_cache, bre);
1219 }
1220
1221 static void
1222 brt_entry_addref(brt_t *brt, const blkptr_t *bp)
1223 {
1224 brt_vdev_t *brtvd;
1225 brt_entry_t *bre, *racebre;
1226 brt_entry_t bre_search;
1227 avl_index_t where;
1228 uint64_t vdevid;
1229 int error;
1230
1231 ASSERT(!RW_WRITE_HELD(&brt->brt_lock));
1232
1233 brt_entry_fill(bp, &bre_search, &vdevid);
1234
1235 brt_wlock(brt);
1236
1237 brtvd = brt_vdev(brt, vdevid);
1238 if (brtvd == NULL) {
1239 ASSERT3U(vdevid, >=, brt->brt_nvdevs);
1240
1241 /* New VDEV was added. */
1242 brt_vdevs_expand(brt, vdevid + 1);
1243 brtvd = brt_vdev(brt, vdevid);
1244 }
1245 ASSERT(brtvd != NULL);
1246 if (!brtvd->bv_initiated)
1247 brt_vdev_realloc(brt, brtvd);
1248
1249 bre = avl_find(&brtvd->bv_tree, &bre_search, NULL);
1250 if (bre != NULL) {
1251 BRTSTAT_BUMP(brt_addref_entry_in_memory);
1252 } else {
1253 /*
1254 * brt_entry_lookup() may drop the BRT (read) lock and
1255 * reacquire it (write).
1256 */
1257 error = brt_entry_lookup(brt, brtvd, &bre_search);
1258 /* bre_search now contains correct bre_refcount */
1259 ASSERT(error == 0 || error == ENOENT);
1260 if (error == 0)
1261 BRTSTAT_BUMP(brt_addref_entry_on_disk);
1262 else
1263 BRTSTAT_BUMP(brt_addref_entry_not_on_disk);
1264 /*
1265 * When the BRT lock was dropped, brt_vdevs[] may have been
1266 * expanded and reallocated, we need to update brtvd's pointer.
1267 */
1268 brtvd = brt_vdev(brt, vdevid);
1269 ASSERT(brtvd != NULL);
1270
1271 racebre = avl_find(&brtvd->bv_tree, &bre_search, &where);
1272 if (racebre == NULL) {
1273 bre = brt_entry_alloc(&bre_search);
1274 ASSERT(RW_WRITE_HELD(&brt->brt_lock));
1275 avl_insert(&brtvd->bv_tree, bre, where);
1276 brt->brt_nentries++;
1277 } else {
1278 /*
1279 * The entry was added when the BRT lock was dropped in
1280 * brt_entry_lookup().
1281 */
1282 BRTSTAT_BUMP(brt_addref_entry_read_lost_race);
1283 bre = racebre;
1284 }
1285 }
1286 bre->bre_refcount++;
1287 brt_vdev_addref(brt, brtvd, bre, bp_get_dsize(brt->brt_spa, bp));
1288
1289 brt_unlock(brt);
1290 }
1291
1292 /* Return TRUE if block should be freed immediately. */
1293 boolean_t
1294 brt_entry_decref(spa_t *spa, const blkptr_t *bp)
1295 {
1296 brt_t *brt = spa->spa_brt;
1297 brt_vdev_t *brtvd;
1298 brt_entry_t *bre, *racebre;
1299 brt_entry_t bre_search;
1300 avl_index_t where;
1301 uint64_t vdevid;
1302 int error;
1303
1304 brt_entry_fill(bp, &bre_search, &vdevid);
1305
1306 brt_wlock(brt);
1307
1308 brtvd = brt_vdev(brt, vdevid);
1309 ASSERT(brtvd != NULL);
1310
1311 bre = avl_find(&brtvd->bv_tree, &bre_search, NULL);
1312 if (bre != NULL) {
1313 BRTSTAT_BUMP(brt_decref_entry_in_memory);
1314 goto out;
1315 } else {
1316 BRTSTAT_BUMP(brt_decref_entry_not_in_memory);
1317 }
1318
1319 /*
1320 * brt_entry_lookup() may drop the BRT lock and reacquire it.
1321 */
1322 error = brt_entry_lookup(brt, brtvd, &bre_search);
1323 /* bre_search now contains correct bre_refcount */
1324 ASSERT(error == 0 || error == ENOENT);
1325 /*
1326 * When the BRT lock was dropped, brt_vdevs[] may have been expanded
1327 * and reallocated, we need to update brtvd's pointer.
1328 */
1329 brtvd = brt_vdev(brt, vdevid);
1330 ASSERT(brtvd != NULL);
1331
1332 if (error == ENOENT) {
1333 BRTSTAT_BUMP(brt_decref_entry_not_on_disk);
1334 bre = NULL;
1335 goto out;
1336 }
1337
1338 racebre = avl_find(&brtvd->bv_tree, &bre_search, &where);
1339 if (racebre != NULL) {
1340 /*
1341 * The entry was added when the BRT lock was dropped in
1342 * brt_entry_lookup().
1343 */
1344 BRTSTAT_BUMP(brt_decref_entry_read_lost_race);
1345 bre = racebre;
1346 goto out;
1347 }
1348
1349 BRTSTAT_BUMP(brt_decref_entry_loaded_from_disk);
1350 bre = brt_entry_alloc(&bre_search);
1351 ASSERT(RW_WRITE_HELD(&brt->brt_lock));
1352 avl_insert(&brtvd->bv_tree, bre, where);
1353 brt->brt_nentries++;
1354
1355 out:
1356 if (bre == NULL) {
1357 /*
1358 * This is a free of a regular (not cloned) block.
1359 */
1360 brt_unlock(brt);
1361 BRTSTAT_BUMP(brt_decref_no_entry);
1362 return (B_TRUE);
1363 }
1364 if (bre->bre_refcount == 0) {
1365 brt_unlock(brt);
1366 BRTSTAT_BUMP(brt_decref_free_data_now);
1367 return (B_TRUE);
1368 }
1369
1370 ASSERT(bre->bre_refcount > 0);
1371 bre->bre_refcount--;
1372 if (bre->bre_refcount == 0)
1373 BRTSTAT_BUMP(brt_decref_free_data_later);
1374 else
1375 BRTSTAT_BUMP(brt_decref_entry_still_referenced);
1376 brt_vdev_decref(brt, brtvd, bre, bp_get_dsize(brt->brt_spa, bp));
1377
1378 brt_unlock(brt);
1379
1380 return (B_FALSE);
1381 }
1382
1383 uint64_t
1384 brt_entry_get_refcount(spa_t *spa, const blkptr_t *bp)
1385 {
1386 brt_t *brt = spa->spa_brt;
1387 brt_vdev_t *brtvd;
1388 brt_entry_t bre_search, *bre;
1389 uint64_t vdevid, refcnt;
1390 int error;
1391
1392 brt_entry_fill(bp, &bre_search, &vdevid);
1393
1394 brt_rlock(brt);
1395
1396 brtvd = brt_vdev(brt, vdevid);
1397 ASSERT(brtvd != NULL);
1398
1399 bre = avl_find(&brtvd->bv_tree, &bre_search, NULL);
1400 if (bre == NULL) {
1401 error = brt_entry_lookup(brt, brtvd, &bre_search);
1402 ASSERT(error == 0 || error == ENOENT);
1403 if (error == ENOENT)
1404 refcnt = 0;
1405 else
1406 refcnt = bre_search.bre_refcount;
1407 } else
1408 refcnt = bre->bre_refcount;
1409
1410 brt_unlock(brt);
1411 return (refcnt);
1412 }
1413
1414 static void
1415 brt_prefetch(brt_t *brt, const blkptr_t *bp)
1416 {
1417 brt_entry_t bre;
1418 uint64_t vdevid;
1419
1420 ASSERT(bp != NULL);
1421
1422 if (!zfs_brt_prefetch)
1423 return;
1424
1425 brt_entry_fill(bp, &bre, &vdevid);
1426
1427 brt_entry_prefetch(brt, vdevid, &bre);
1428 }
1429
1430 static int
1431 brt_pending_entry_compare(const void *x1, const void *x2)
1432 {
1433 const brt_pending_entry_t *bpe1 = x1, *bpe2 = x2;
1434 const blkptr_t *bp1 = &bpe1->bpe_bp, *bp2 = &bpe2->bpe_bp;
1435 int cmp;
1436
1437 cmp = TREE_CMP(BP_PHYSICAL_BIRTH(bp1), BP_PHYSICAL_BIRTH(bp2));
1438 if (cmp == 0) {
1439 cmp = TREE_CMP(DVA_GET_VDEV(&bp1->blk_dva[0]),
1440 DVA_GET_VDEV(&bp2->blk_dva[0]));
1441 if (cmp == 0) {
1442 cmp = TREE_CMP(DVA_GET_OFFSET(&bp1->blk_dva[0]),
1443 DVA_GET_OFFSET(&bp2->blk_dva[0]));
1444 }
1445 }
1446
1447 return (cmp);
1448 }
1449
1450 void
1451 brt_pending_add(spa_t *spa, const blkptr_t *bp, dmu_tx_t *tx)
1452 {
1453 brt_t *brt;
1454 avl_tree_t *pending_tree;
1455 kmutex_t *pending_lock;
1456 brt_pending_entry_t *bpe, *newbpe;
1457 avl_index_t where;
1458 uint64_t txg;
1459
1460 brt = spa->spa_brt;
1461 txg = dmu_tx_get_txg(tx);
1462 ASSERT3U(txg, !=, 0);
1463 pending_tree = &brt->brt_pending_tree[txg & TXG_MASK];
1464 pending_lock = &brt->brt_pending_lock[txg & TXG_MASK];
1465
1466 newbpe = kmem_cache_alloc(brt_pending_entry_cache, KM_SLEEP);
1467 newbpe->bpe_bp = *bp;
1468 newbpe->bpe_count = 1;
1469
1470 mutex_enter(pending_lock);
1471
1472 bpe = avl_find(pending_tree, newbpe, &where);
1473 if (bpe == NULL) {
1474 avl_insert(pending_tree, newbpe, where);
1475 newbpe = NULL;
1476 } else {
1477 bpe->bpe_count++;
1478 }
1479
1480 mutex_exit(pending_lock);
1481
1482 if (newbpe != NULL) {
1483 ASSERT(bpe != NULL);
1484 ASSERT(bpe != newbpe);
1485 kmem_cache_free(brt_pending_entry_cache, newbpe);
1486 } else {
1487 ASSERT(bpe == NULL);
1488 }
1489
1490 /* Prefetch BRT entry, as we will need it in the syncing context. */
1491 brt_prefetch(brt, bp);
1492 }
1493
1494 void
1495 brt_pending_remove(spa_t *spa, const blkptr_t *bp, dmu_tx_t *tx)
1496 {
1497 brt_t *brt;
1498 avl_tree_t *pending_tree;
1499 kmutex_t *pending_lock;
1500 brt_pending_entry_t *bpe, bpe_search;
1501 uint64_t txg;
1502
1503 brt = spa->spa_brt;
1504 txg = dmu_tx_get_txg(tx);
1505 ASSERT3U(txg, !=, 0);
1506 pending_tree = &brt->brt_pending_tree[txg & TXG_MASK];
1507 pending_lock = &brt->brt_pending_lock[txg & TXG_MASK];
1508
1509 bpe_search.bpe_bp = *bp;
1510
1511 mutex_enter(pending_lock);
1512
1513 bpe = avl_find(pending_tree, &bpe_search, NULL);
1514 /* I believe we should always find bpe when this function is called. */
1515 if (bpe != NULL) {
1516 ASSERT(bpe->bpe_count > 0);
1517
1518 bpe->bpe_count--;
1519 if (bpe->bpe_count == 0) {
1520 avl_remove(pending_tree, bpe);
1521 kmem_cache_free(brt_pending_entry_cache, bpe);
1522 }
1523 }
1524
1525 mutex_exit(pending_lock);
1526 }
1527
1528 void
1529 brt_pending_apply(spa_t *spa, uint64_t txg)
1530 {
1531 brt_t *brt;
1532 brt_pending_entry_t *bpe;
1533 avl_tree_t *pending_tree;
1534 kmutex_t *pending_lock;
1535 void *c;
1536
1537 ASSERT3U(txg, !=, 0);
1538
1539 brt = spa->spa_brt;
1540 pending_tree = &brt->brt_pending_tree[txg & TXG_MASK];
1541 pending_lock = &brt->brt_pending_lock[txg & TXG_MASK];
1542
1543 mutex_enter(pending_lock);
1544
1545 c = NULL;
1546 while ((bpe = avl_destroy_nodes(pending_tree, &c)) != NULL) {
1547 boolean_t added_to_ddt;
1548
1549 mutex_exit(pending_lock);
1550
1551 for (int i = 0; i < bpe->bpe_count; i++) {
1552 /*
1553 * If the block has DEDUP bit set, it means that it
1554 * already exists in the DEDUP table, so we can just
1555 * use that instead of creating new entry in
1556 * the BRT table.
1557 */
1558 if (BP_GET_DEDUP(&bpe->bpe_bp)) {
1559 added_to_ddt = ddt_addref(spa, &bpe->bpe_bp);
1560 } else {
1561 added_to_ddt = B_FALSE;
1562 }
1563 if (!added_to_ddt)
1564 brt_entry_addref(brt, &bpe->bpe_bp);
1565 }
1566
1567 kmem_cache_free(brt_pending_entry_cache, bpe);
1568 mutex_enter(pending_lock);
1569 }
1570
1571 mutex_exit(pending_lock);
1572 }
1573
1574 static void
1575 brt_sync_entry(brt_t *brt, brt_vdev_t *brtvd, brt_entry_t *bre, dmu_tx_t *tx)
1576 {
1577
1578 ASSERT(RW_WRITE_HELD(&brt->brt_lock));
1579 ASSERT(brtvd->bv_mos_entries != 0);
1580
1581 if (bre->bre_refcount == 0) {
1582 int error;
1583
1584 error = brt_entry_remove(brt, brtvd, bre, tx);
1585 ASSERT(error == 0 || error == ENOENT);
1586 /*
1587 * If error == ENOENT then zfs_clone_range() was done from a
1588 * removed (but opened) file (open(), unlink()).
1589 */
1590 ASSERT(brt_entry_lookup(brt, brtvd, bre) == ENOENT);
1591 } else {
1592 VERIFY0(brt_entry_update(brt, brtvd, bre, tx));
1593 }
1594 }
1595
1596 static void
1597 brt_sync_table(brt_t *brt, dmu_tx_t *tx)
1598 {
1599 brt_vdev_t *brtvd;
1600 brt_entry_t *bre;
1601 uint64_t vdevid;
1602 void *c;
1603
1604 brt_wlock(brt);
1605
1606 for (vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) {
1607 brtvd = &brt->brt_vdevs[vdevid];
1608
1609 if (!brtvd->bv_initiated)
1610 continue;
1611
1612 if (!brtvd->bv_meta_dirty) {
1613 ASSERT(!brtvd->bv_entcount_dirty);
1614 ASSERT0(avl_numnodes(&brtvd->bv_tree));
1615 continue;
1616 }
1617
1618 ASSERT(!brtvd->bv_entcount_dirty ||
1619 avl_numnodes(&brtvd->bv_tree) != 0);
1620
1621 if (brtvd->bv_mos_brtvdev == 0)
1622 brt_vdev_create(brt, brtvd, tx);
1623
1624 c = NULL;
1625 while ((bre = avl_destroy_nodes(&brtvd->bv_tree, &c)) != NULL) {
1626 brt_sync_entry(brt, brtvd, bre, tx);
1627 brt_entry_free(bre);
1628 ASSERT(brt->brt_nentries > 0);
1629 brt->brt_nentries--;
1630 }
1631
1632 brt_vdev_sync(brt, brtvd, tx);
1633
1634 if (brtvd->bv_totalcount == 0)
1635 brt_vdev_destroy(brt, brtvd, tx);
1636 }
1637
1638 ASSERT0(brt->brt_nentries);
1639
1640 brt_unlock(brt);
1641 }
1642
1643 void
1644 brt_sync(spa_t *spa, uint64_t txg)
1645 {
1646 dmu_tx_t *tx;
1647 brt_t *brt;
1648
1649 ASSERT(spa_syncing_txg(spa) == txg);
1650
1651 brt = spa->spa_brt;
1652 brt_rlock(brt);
1653 if (brt->brt_nentries == 0) {
1654 /* No changes. */
1655 brt_unlock(brt);
1656 return;
1657 }
1658 brt_unlock(brt);
1659
1660 tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
1661
1662 brt_sync_table(brt, tx);
1663
1664 dmu_tx_commit(tx);
1665 }
1666
1667 static void
1668 brt_table_alloc(brt_t *brt)
1669 {
1670
1671 for (int i = 0; i < TXG_SIZE; i++) {
1672 avl_create(&brt->brt_pending_tree[i],
1673 brt_pending_entry_compare,
1674 sizeof (brt_pending_entry_t),
1675 offsetof(brt_pending_entry_t, bpe_node));
1676 mutex_init(&brt->brt_pending_lock[i], NULL, MUTEX_DEFAULT,
1677 NULL);
1678 }
1679 }
1680
1681 static void
1682 brt_table_free(brt_t *brt)
1683 {
1684
1685 for (int i = 0; i < TXG_SIZE; i++) {
1686 ASSERT(avl_is_empty(&brt->brt_pending_tree[i]));
1687
1688 avl_destroy(&brt->brt_pending_tree[i]);
1689 mutex_destroy(&brt->brt_pending_lock[i]);
1690 }
1691 }
1692
1693 static void
1694 brt_alloc(spa_t *spa)
1695 {
1696 brt_t *brt;
1697
1698 ASSERT(spa->spa_brt == NULL);
1699
1700 brt = kmem_zalloc(sizeof (*brt), KM_SLEEP);
1701 rw_init(&brt->brt_lock, NULL, RW_DEFAULT, NULL);
1702 brt->brt_spa = spa;
1703 brt->brt_rangesize = 0;
1704 brt->brt_nentries = 0;
1705 brt->brt_vdevs = NULL;
1706 brt->brt_nvdevs = 0;
1707 brt_table_alloc(brt);
1708
1709 spa->spa_brt = brt;
1710 }
1711
1712 void
1713 brt_create(spa_t *spa)
1714 {
1715
1716 brt_alloc(spa);
1717 brt_vdevs_alloc(spa->spa_brt, B_FALSE);
1718 }
1719
1720 int
1721 brt_load(spa_t *spa)
1722 {
1723
1724 brt_alloc(spa);
1725 brt_vdevs_alloc(spa->spa_brt, B_TRUE);
1726
1727 return (0);
1728 }
1729
1730 void
1731 brt_unload(spa_t *spa)
1732 {
1733 brt_t *brt = spa->spa_brt;
1734
1735 if (brt == NULL)
1736 return;
1737
1738 brt_vdevs_free(brt);
1739 brt_table_free(brt);
1740 rw_destroy(&brt->brt_lock);
1741 kmem_free(brt, sizeof (*brt));
1742 spa->spa_brt = NULL;
1743 }
1744
1745 /* BEGIN CSTYLED */
1746 ZFS_MODULE_PARAM(zfs_brt, zfs_brt_, prefetch, INT, ZMOD_RW,
1747 "Enable prefetching of BRT entries");
1748 #ifdef ZFS_BRT_DEBUG
1749 ZFS_MODULE_PARAM(zfs_brt, zfs_brt_, debug, INT, ZMOD_RW, "BRT debug");
1750 #endif
1751 /* END CSTYLED */