4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or https://opensource.org/licenses/CDDL-1.0.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
23 * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright (c) 2012, 2016 by Delphix. All rights reserved.
25 * Copyright (c) 2022 by Pawel Jakub Dawidek
26 * Copyright (c) 2023, Klara Inc.
29 #include <sys/zfs_context.h>
31 #include <sys/spa_impl.h>
34 #include <sys/ddt_impl.h>
36 #include <sys/dmu_tx.h>
38 #include <sys/dsl_pool.h>
39 #include <sys/zio_checksum.h>
40 #include <sys/dsl_scan.h>
44 * # DDT: Deduplication tables
46 * The dedup subsystem provides block-level deduplication. When enabled, blocks
47 * to be written will have the dedup (D) bit set, which causes them to be
48 * tracked in a "dedup table", or DDT. If a block has been seen before (exists
49 * in the DDT), instead of being written, it will instead be made to reference
50 * the existing on-disk data, and a refcount bumped in the DDT instead.
52 * ## Dedup tables and entries
54 * Conceptually, a DDT is a dictionary or map. Each entry has a "key"
 * (ddt_key_t) made up of a block's checksum and certain properties, and a "value"
56 * (one or more ddt_phys_t) containing valid DVAs for the block's data, birth
57 * time and refcount. Together these are enough to track references to a
58 * specific block, to build a valid block pointer to reference that block (for
59 * freeing, scrubbing, etc), and to fill a new block pointer with the missing
60 * pieces to make it seem like it was written.
62 * There's a single DDT (ddt_t) for each checksum type, held in spa_ddt[].
63 * Within each DDT, there can be multiple storage "types" (ddt_type_t, on-disk
64 * object data formats, each with their own implementations) and "classes"
65 * (ddt_class_t, instance of a storage type object, for entries with a specific
66 * characteristic). An entry (key) will only ever exist on one of these objects
67 * at any given time, but may be moved from one to another if their type or
70 * The DDT is driven by the write IO pipeline (zio_ddt_write()). When a block
71 * is to be written, before DVAs have been allocated, ddt_lookup() is called to
 * see if the block has been seen before. If it's not found, the write proceeds
73 * as normal, and after it succeeds, a new entry is created. If it is found, we
74 * fill the BP with the DVAs from the entry, increment the refcount and cause
75 * the write IO to return immediately.
77 * Each ddt_phys_t slot in the entry represents a separate dedup block for the
78 * same content/checksum. The slot is selected based on the zp_copies parameter
79 * the block is written with, that is, the number of DVAs in the block. The
80 * "ditto" slot (DDT_PHYS_DITTO) used to be used for now-removed "dedupditto"
81 * feature. These are no longer written, and will be freed if encountered on
84 * ## Lifetime of an entry
86 * A DDT can be enormous, and typically is not held in memory all at once.
87 * Instead, the changes to an entry are tracked in memory, and written down to
88 * disk at the end of each txg.
90 * A "live" in-memory entry (ddt_entry_t) is a node on the live tree
91 * (ddt_tree). At the start of a txg, ddt_tree is empty. When an entry is
92 * required for IO, ddt_lookup() is called. If an entry already exists on
93 * ddt_tree, it is returned. Otherwise, a new one is created, and the
 * type/class objects for the DDT are searched for that key. If it's found, its
95 * value is copied into the live entry. If not, an empty entry is created.
97 * The live entry will be modified during the txg, usually by modifying the
98 * refcount, but sometimes by adding or updating DVAs. At the end of the txg
99 * (during spa_sync()), type and class are recalculated for entry (see
100 * ddt_sync_entry()), and the entry is written to the appropriate storage
101 * object and (if necessary), removed from an old one. ddt_tree is cleared and
102 * the next txg can start.
106 * If a read on a dedup block fails, but there are other copies of the block in
107 * the other ddt_phys_t slots, reads will be issued for those instead
108 * (zio_ddt_read_start()). If one of those succeeds, the read is returned to
109 * the caller, and a copy is stashed on the entry's dde_repair_abd.
111 * During the end-of-txg sync, any entries with a dde_repair_abd get a
112 * "rewrite" write issued for the original block pointer, with the data read
113 * from the alternate block. If the block is actually damaged, this will invoke
114 * the pool's "self-healing" mechanism, and repair the block.
116 * ## Scanning (scrub/resilver)
118 * If dedup is active, the scrub machinery will walk the dedup table first, and
119 * scrub all blocks with refcnt > 1 first. After that it will move on to the
120 * regular top-down scrub, and exclude the refcnt > 1 blocks when it sees them.
121 * In this way, heavily deduplicated blocks are only scrubbed once. See the
122 * commentary on dsl_scan_ddt() for more details.
124 * Walking the DDT is done via ddt_walk(). The current position is stored in a
125 * ddt_bookmark_t, which represents a stable position in the storage object.
126 * This bookmark is stored by the scan machinery, and must reference the same
127 * position on the object even if the object changes, the pool is exported, or
128 * OpenZFS is upgraded.
130 * ## Interaction with block cloning
132 * If block cloning and dedup are both enabled on a pool, BRT will look for the
133 * dedup bit on an incoming block pointer. If set, it will call into the DDT
134 * (ddt_addref()) to add a reference to the block, instead of adding a
135 * reference to the BRT. See brt_pending_apply().
139 * These are the only checksums valid for dedup. They must match the list
140 * from dedup_table in zfs_prop.c
142 #define DDT_CHECKSUM_VALID(c) \
143 (c == ZIO_CHECKSUM_SHA256 || c == ZIO_CHECKSUM_SHA512 || \
144 c == ZIO_CHECKSUM_SKEIN || c == ZIO_CHECKSUM_EDONR || \
145 c == ZIO_CHECKSUM_BLAKE3)
147 static kmem_cache_t
*ddt_cache
;
148 static kmem_cache_t
*ddt_entry_cache
;
151 * Enable/disable prefetching of dedup-ed blocks which are going to be freed.
153 int zfs_dedup_prefetch
= 0;
155 static const ddt_ops_t
*const ddt_ops
[DDT_TYPES
] = {
159 static const char *const ddt_class_name
[DDT_CLASSES
] = {
166 ddt_object_create(ddt_t
*ddt
, ddt_type_t type
, ddt_class_t
class,
169 spa_t
*spa
= ddt
->ddt_spa
;
170 objset_t
*os
= ddt
->ddt_os
;
171 uint64_t *objectp
= &ddt
->ddt_object
[type
][class];
172 boolean_t prehash
= zio_checksum_table
[ddt
->ddt_checksum
].ci_flags
&
173 ZCHECKSUM_FLAG_DEDUP
;
174 char name
[DDT_NAMELEN
];
176 ddt_object_name(ddt
, type
, class, name
);
178 ASSERT3U(*objectp
, ==, 0);
179 VERIFY0(ddt_ops
[type
]->ddt_op_create(os
, objectp
, tx
, prehash
));
180 ASSERT3U(*objectp
, !=, 0);
182 VERIFY0(zap_add(os
, DMU_POOL_DIRECTORY_OBJECT
, name
,
183 sizeof (uint64_t), 1, objectp
, tx
));
185 VERIFY0(zap_add(os
, spa
->spa_ddt_stat_object
, name
,
186 sizeof (uint64_t), sizeof (ddt_histogram_t
) / sizeof (uint64_t),
187 &ddt
->ddt_histogram
[type
][class], tx
));
191 ddt_object_destroy(ddt_t
*ddt
, ddt_type_t type
, ddt_class_t
class,
194 spa_t
*spa
= ddt
->ddt_spa
;
195 objset_t
*os
= ddt
->ddt_os
;
196 uint64_t *objectp
= &ddt
->ddt_object
[type
][class];
198 char name
[DDT_NAMELEN
];
200 ddt_object_name(ddt
, type
, class, name
);
202 ASSERT3U(*objectp
, !=, 0);
203 ASSERT(ddt_histogram_empty(&ddt
->ddt_histogram
[type
][class]));
204 VERIFY0(ddt_object_count(ddt
, type
, class, &count
));
206 VERIFY0(zap_remove(os
, DMU_POOL_DIRECTORY_OBJECT
, name
, tx
));
207 VERIFY0(zap_remove(os
, spa
->spa_ddt_stat_object
, name
, tx
));
208 VERIFY0(ddt_ops
[type
]->ddt_op_destroy(os
, *objectp
, tx
));
209 memset(&ddt
->ddt_object_stats
[type
][class], 0, sizeof (ddt_object_t
));
215 ddt_object_load(ddt_t
*ddt
, ddt_type_t type
, ddt_class_t
class)
217 ddt_object_t
*ddo
= &ddt
->ddt_object_stats
[type
][class];
218 dmu_object_info_t doi
;
220 char name
[DDT_NAMELEN
];
223 ddt_object_name(ddt
, type
, class, name
);
225 error
= zap_lookup(ddt
->ddt_os
, DMU_POOL_DIRECTORY_OBJECT
, name
,
226 sizeof (uint64_t), 1, &ddt
->ddt_object
[type
][class]);
230 error
= zap_lookup(ddt
->ddt_os
, ddt
->ddt_spa
->spa_ddt_stat_object
, name
,
231 sizeof (uint64_t), sizeof (ddt_histogram_t
) / sizeof (uint64_t),
232 &ddt
->ddt_histogram
[type
][class]);
237 * Seed the cached statistics.
239 error
= ddt_object_info(ddt
, type
, class, &doi
);
243 error
= ddt_object_count(ddt
, type
, class, &count
);
247 ddo
->ddo_count
= count
;
248 ddo
->ddo_dspace
= doi
.doi_physical_blocks_512
<< 9;
249 ddo
->ddo_mspace
= doi
.doi_fill_count
* doi
.doi_data_block_size
;
255 ddt_object_sync(ddt_t
*ddt
, ddt_type_t type
, ddt_class_t
class,
258 ddt_object_t
*ddo
= &ddt
->ddt_object_stats
[type
][class];
259 dmu_object_info_t doi
;
261 char name
[DDT_NAMELEN
];
263 ddt_object_name(ddt
, type
, class, name
);
265 VERIFY0(zap_update(ddt
->ddt_os
, ddt
->ddt_spa
->spa_ddt_stat_object
, name
,
266 sizeof (uint64_t), sizeof (ddt_histogram_t
) / sizeof (uint64_t),
267 &ddt
->ddt_histogram
[type
][class], tx
));
270 * Cache DDT statistics; this is the only time they'll change.
272 VERIFY0(ddt_object_info(ddt
, type
, class, &doi
));
273 VERIFY0(ddt_object_count(ddt
, type
, class, &count
));
275 ddo
->ddo_count
= count
;
276 ddo
->ddo_dspace
= doi
.doi_physical_blocks_512
<< 9;
277 ddo
->ddo_mspace
= doi
.doi_fill_count
* doi
.doi_data_block_size
;
281 ddt_object_exists(ddt_t
*ddt
, ddt_type_t type
, ddt_class_t
class)
283 return (!!ddt
->ddt_object
[type
][class]);
287 ddt_object_lookup(ddt_t
*ddt
, ddt_type_t type
, ddt_class_t
class,
290 if (!ddt_object_exists(ddt
, type
, class))
291 return (SET_ERROR(ENOENT
));
293 return (ddt_ops
[type
]->ddt_op_lookup(ddt
->ddt_os
,
294 ddt
->ddt_object
[type
][class], &dde
->dde_key
,
295 dde
->dde_phys
, sizeof (dde
->dde_phys
)));
299 ddt_object_contains(ddt_t
*ddt
, ddt_type_t type
, ddt_class_t
class,
300 const ddt_key_t
*ddk
)
302 if (!ddt_object_exists(ddt
, type
, class))
303 return (SET_ERROR(ENOENT
));
305 return (ddt_ops
[type
]->ddt_op_contains(ddt
->ddt_os
,
306 ddt
->ddt_object
[type
][class], ddk
));
310 ddt_object_prefetch(ddt_t
*ddt
, ddt_type_t type
, ddt_class_t
class,
311 const ddt_key_t
*ddk
)
313 if (!ddt_object_exists(ddt
, type
, class))
316 ddt_ops
[type
]->ddt_op_prefetch(ddt
->ddt_os
,
317 ddt
->ddt_object
[type
][class], ddk
);
321 ddt_object_update(ddt_t
*ddt
, ddt_type_t type
, ddt_class_t
class,
322 ddt_entry_t
*dde
, dmu_tx_t
*tx
)
324 ASSERT(ddt_object_exists(ddt
, type
, class));
326 return (ddt_ops
[type
]->ddt_op_update(ddt
->ddt_os
,
327 ddt
->ddt_object
[type
][class], &dde
->dde_key
, dde
->dde_phys
,
328 sizeof (dde
->dde_phys
), tx
));
332 ddt_object_remove(ddt_t
*ddt
, ddt_type_t type
, ddt_class_t
class,
333 const ddt_key_t
*ddk
, dmu_tx_t
*tx
)
335 ASSERT(ddt_object_exists(ddt
, type
, class));
337 return (ddt_ops
[type
]->ddt_op_remove(ddt
->ddt_os
,
338 ddt
->ddt_object
[type
][class], ddk
, tx
));
342 ddt_object_walk(ddt_t
*ddt
, ddt_type_t type
, ddt_class_t
class,
343 uint64_t *walk
, ddt_entry_t
*dde
)
345 ASSERT(ddt_object_exists(ddt
, type
, class));
347 return (ddt_ops
[type
]->ddt_op_walk(ddt
->ddt_os
,
348 ddt
->ddt_object
[type
][class], walk
, &dde
->dde_key
,
349 dde
->dde_phys
, sizeof (dde
->dde_phys
)));
353 ddt_object_count(ddt_t
*ddt
, ddt_type_t type
, ddt_class_t
class,
356 ASSERT(ddt_object_exists(ddt
, type
, class));
358 return (ddt_ops
[type
]->ddt_op_count(ddt
->ddt_os
,
359 ddt
->ddt_object
[type
][class], count
));
363 ddt_object_info(ddt_t
*ddt
, ddt_type_t type
, ddt_class_t
class,
364 dmu_object_info_t
*doi
)
366 if (!ddt_object_exists(ddt
, type
, class))
367 return (SET_ERROR(ENOENT
));
369 return (dmu_object_info(ddt
->ddt_os
, ddt
->ddt_object
[type
][class],
374 ddt_object_name(ddt_t
*ddt
, ddt_type_t type
, ddt_class_t
class,
377 (void) snprintf(name
, DDT_NAMELEN
, DMU_POOL_DDT
,
378 zio_checksum_table
[ddt
->ddt_checksum
].ci_name
,
379 ddt_ops
[type
]->ddt_op_name
, ddt_class_name
[class]);
383 ddt_bp_fill(const ddt_phys_t
*ddp
, blkptr_t
*bp
, uint64_t txg
)
385 ASSERT3U(txg
, !=, 0);
387 for (int d
= 0; d
< SPA_DVAS_PER_BP
; d
++)
388 bp
->blk_dva
[d
] = ddp
->ddp_dva
[d
];
389 BP_SET_BIRTH(bp
, txg
, ddp
->ddp_phys_birth
);
393 * The bp created via this function may be used for repairs and scrub, but it
394 * will be missing the salt / IV required to do a full decrypting read.
397 ddt_bp_create(enum zio_checksum checksum
,
398 const ddt_key_t
*ddk
, const ddt_phys_t
*ddp
, blkptr_t
*bp
)
403 ddt_bp_fill(ddp
, bp
, ddp
->ddp_phys_birth
);
405 bp
->blk_cksum
= ddk
->ddk_cksum
;
407 BP_SET_LSIZE(bp
, DDK_GET_LSIZE(ddk
));
408 BP_SET_PSIZE(bp
, DDK_GET_PSIZE(ddk
));
409 BP_SET_COMPRESS(bp
, DDK_GET_COMPRESS(ddk
));
410 BP_SET_CRYPT(bp
, DDK_GET_CRYPT(ddk
));
412 BP_SET_CHECKSUM(bp
, checksum
);
413 BP_SET_TYPE(bp
, DMU_OT_DEDUP
);
416 BP_SET_BYTEORDER(bp
, ZFS_HOST_BYTEORDER
);
420 ddt_key_fill(ddt_key_t
*ddk
, const blkptr_t
*bp
)
422 ddk
->ddk_cksum
= bp
->blk_cksum
;
425 ASSERT(BP_IS_ENCRYPTED(bp
) || !BP_USES_CRYPT(bp
));
427 DDK_SET_LSIZE(ddk
, BP_GET_LSIZE(bp
));
428 DDK_SET_PSIZE(ddk
, BP_GET_PSIZE(bp
));
429 DDK_SET_COMPRESS(ddk
, BP_GET_COMPRESS(bp
));
430 DDK_SET_CRYPT(ddk
, BP_USES_CRYPT(bp
));
434 ddt_phys_fill(ddt_phys_t
*ddp
, const blkptr_t
*bp
)
436 ASSERT0(ddp
->ddp_phys_birth
);
438 for (int d
= 0; d
< SPA_DVAS_PER_BP
; d
++)
439 ddp
->ddp_dva
[d
] = bp
->blk_dva
[d
];
440 ddp
->ddp_phys_birth
= BP_PHYSICAL_BIRTH(bp
);
444 ddt_phys_clear(ddt_phys_t
*ddp
)
446 memset(ddp
, 0, sizeof (*ddp
));
450 ddt_phys_addref(ddt_phys_t
*ddp
)
456 ddt_phys_decref(ddt_phys_t
*ddp
)
459 ASSERT3U(ddp
->ddp_refcnt
, >, 0);
465 ddt_phys_free(ddt_t
*ddt
, ddt_key_t
*ddk
, ddt_phys_t
*ddp
, uint64_t txg
)
469 ddt_bp_create(ddt
->ddt_checksum
, ddk
, ddp
, &blk
);
472 * We clear the dedup bit so that zio_free() will actually free the
473 * space, rather than just decrementing the refcount in the DDT.
475 BP_SET_DEDUP(&blk
, 0);
478 zio_free(ddt
->ddt_spa
, txg
, &blk
);
482 ddt_phys_select(const ddt_entry_t
*dde
, const blkptr_t
*bp
)
484 ddt_phys_t
*ddp
= (ddt_phys_t
*)dde
->dde_phys
;
486 for (int p
= 0; p
< DDT_PHYS_TYPES
; p
++, ddp
++) {
487 if (DVA_EQUAL(BP_IDENTITY(bp
), &ddp
->ddp_dva
[0]) &&
488 BP_PHYSICAL_BIRTH(bp
) == ddp
->ddp_phys_birth
)
495 ddt_phys_total_refcnt(const ddt_entry_t
*dde
)
499 for (int p
= DDT_PHYS_SINGLE
; p
<= DDT_PHYS_TRIPLE
; p
++)
500 refcnt
+= dde
->dde_phys
[p
].ddp_refcnt
;
506 ddt_select(spa_t
*spa
, const blkptr_t
*bp
)
508 ASSERT(DDT_CHECKSUM_VALID(BP_GET_CHECKSUM(bp
)));
509 return (spa
->spa_ddt
[BP_GET_CHECKSUM(bp
)]);
513 ddt_enter(ddt_t
*ddt
)
515 mutex_enter(&ddt
->ddt_lock
);
521 mutex_exit(&ddt
->ddt_lock
);
527 ddt_cache
= kmem_cache_create("ddt_cache",
528 sizeof (ddt_t
), 0, NULL
, NULL
, NULL
, NULL
, NULL
, 0);
529 ddt_entry_cache
= kmem_cache_create("ddt_entry_cache",
530 sizeof (ddt_entry_t
), 0, NULL
, NULL
, NULL
, NULL
, NULL
, 0);
536 kmem_cache_destroy(ddt_entry_cache
);
537 kmem_cache_destroy(ddt_cache
);
541 ddt_alloc(const ddt_key_t
*ddk
)
545 dde
= kmem_cache_alloc(ddt_entry_cache
, KM_SLEEP
);
546 memset(dde
, 0, sizeof (ddt_entry_t
));
547 cv_init(&dde
->dde_cv
, NULL
, CV_DEFAULT
, NULL
);
555 ddt_free(ddt_entry_t
*dde
)
557 ASSERT(dde
->dde_flags
& DDE_FLAG_LOADED
);
559 for (int p
= 0; p
< DDT_PHYS_TYPES
; p
++)
560 ASSERT3P(dde
->dde_lead_zio
[p
], ==, NULL
);
562 if (dde
->dde_repair_abd
!= NULL
)
563 abd_free(dde
->dde_repair_abd
);
565 cv_destroy(&dde
->dde_cv
);
566 kmem_cache_free(ddt_entry_cache
, dde
);
570 ddt_remove(ddt_t
*ddt
, ddt_entry_t
*dde
)
572 ASSERT(MUTEX_HELD(&ddt
->ddt_lock
));
574 avl_remove(&ddt
->ddt_tree
, dde
);
579 ddt_lookup(ddt_t
*ddt
, const blkptr_t
*bp
, boolean_t add
)
588 ASSERT(MUTEX_HELD(&ddt
->ddt_lock
));
590 ddt_key_fill(&search
, bp
);
592 /* Find an existing live entry */
593 dde
= avl_find(&ddt
->ddt_tree
, &search
, &where
);
595 /* Found it. If it's already loaded, we can just return it. */
596 if (dde
->dde_flags
& DDE_FLAG_LOADED
)
599 /* Someone else is loading it, wait for it. */
600 while (!(dde
->dde_flags
& DDE_FLAG_LOADED
))
601 cv_wait(&dde
->dde_cv
, &ddt
->ddt_lock
);
610 /* Time to make a new entry. */
611 dde
= ddt_alloc(&search
);
612 avl_insert(&ddt
->ddt_tree
, dde
, where
);
615 * ddt_tree is now stable, so unlock and let everyone else keep moving.
616 * Anyone landing on this entry will find it without DDE_FLAG_LOADED,
617 * and go to sleep waiting for it above.
621 /* Search all store objects for the entry. */
623 for (type
= 0; type
< DDT_TYPES
; type
++) {
624 for (class = 0; class < DDT_CLASSES
; class++) {
625 error
= ddt_object_lookup(ddt
, type
, class, dde
);
626 if (error
!= ENOENT
) {
637 ASSERT(!(dde
->dde_flags
& DDE_FLAG_LOADED
));
639 dde
->dde_type
= type
; /* will be DDT_TYPES if no entry found */
640 dde
->dde_class
= class; /* will be DDT_CLASSES if no entry found */
643 ddt_stat_update(ddt
, dde
, -1ULL);
645 /* Entry loaded, everyone can proceed now */
646 dde
->dde_flags
|= DDE_FLAG_LOADED
;
647 cv_broadcast(&dde
->dde_cv
);
653 ddt_prefetch(spa_t
*spa
, const blkptr_t
*bp
)
658 if (!zfs_dedup_prefetch
|| bp
== NULL
|| !BP_GET_DEDUP(bp
))
662 * We only remove the DDT once all tables are empty and only
663 * prefetch dedup blocks when there are entries in the DDT.
664 * Thus no locking is required as the DDT can't disappear on us.
666 ddt
= ddt_select(spa
, bp
);
667 ddt_key_fill(&ddk
, bp
);
669 for (ddt_type_t type
= 0; type
< DDT_TYPES
; type
++) {
670 for (ddt_class_t
class = 0; class < DDT_CLASSES
; class++) {
671 ddt_object_prefetch(ddt
, type
, class, &ddk
);
677 * Key comparison. Any struct wanting to make use of this function must have
678 * the key as the first element.
680 #define DDT_KEY_CMP_LEN (sizeof (ddt_key_t) / sizeof (uint16_t))
682 typedef struct ddt_key_cmp
{
683 uint16_t u16
[DDT_KEY_CMP_LEN
];
687 ddt_key_compare(const void *x1
, const void *x2
)
689 const ddt_key_cmp_t
*k1
= (const ddt_key_cmp_t
*)x1
;
690 const ddt_key_cmp_t
*k2
= (const ddt_key_cmp_t
*)x2
;
693 for (int i
= 0; i
< DDT_KEY_CMP_LEN
; i
++) {
694 cmp
= (int32_t)k1
->u16
[i
] - (int32_t)k2
->u16
[i
];
699 return (TREE_ISIGN(cmp
));
703 ddt_table_alloc(spa_t
*spa
, enum zio_checksum c
)
707 ddt
= kmem_cache_alloc(ddt_cache
, KM_SLEEP
);
708 memset(ddt
, 0, sizeof (ddt_t
));
710 mutex_init(&ddt
->ddt_lock
, NULL
, MUTEX_DEFAULT
, NULL
);
711 avl_create(&ddt
->ddt_tree
, ddt_key_compare
,
712 sizeof (ddt_entry_t
), offsetof(ddt_entry_t
, dde_node
));
713 avl_create(&ddt
->ddt_repair_tree
, ddt_key_compare
,
714 sizeof (ddt_entry_t
), offsetof(ddt_entry_t
, dde_node
));
715 ddt
->ddt_checksum
= c
;
717 ddt
->ddt_os
= spa
->spa_meta_objset
;
723 ddt_table_free(ddt_t
*ddt
)
725 ASSERT0(avl_numnodes(&ddt
->ddt_tree
));
726 ASSERT0(avl_numnodes(&ddt
->ddt_repair_tree
));
727 avl_destroy(&ddt
->ddt_tree
);
728 avl_destroy(&ddt
->ddt_repair_tree
);
729 mutex_destroy(&ddt
->ddt_lock
);
730 kmem_cache_free(ddt_cache
, ddt
);
734 ddt_create(spa_t
*spa
)
736 spa
->spa_dedup_checksum
= ZIO_DEDUPCHECKSUM
;
738 for (enum zio_checksum c
= 0; c
< ZIO_CHECKSUM_FUNCTIONS
; c
++) {
739 if (DDT_CHECKSUM_VALID(c
))
740 spa
->spa_ddt
[c
] = ddt_table_alloc(spa
, c
);
751 error
= zap_lookup(spa
->spa_meta_objset
, DMU_POOL_DIRECTORY_OBJECT
,
752 DMU_POOL_DDT_STATS
, sizeof (uint64_t), 1,
753 &spa
->spa_ddt_stat_object
);
756 return (error
== ENOENT
? 0 : error
);
758 for (enum zio_checksum c
= 0; c
< ZIO_CHECKSUM_FUNCTIONS
; c
++) {
759 if (!DDT_CHECKSUM_VALID(c
))
762 ddt_t
*ddt
= spa
->spa_ddt
[c
];
763 for (ddt_type_t type
= 0; type
< DDT_TYPES
; type
++) {
764 for (ddt_class_t
class = 0; class < DDT_CLASSES
;
766 error
= ddt_object_load(ddt
, type
, class);
767 if (error
!= 0 && error
!= ENOENT
)
773 * Seed the cached histograms.
775 memcpy(&ddt
->ddt_histogram_cache
, ddt
->ddt_histogram
,
776 sizeof (ddt
->ddt_histogram
));
777 spa
->spa_dedup_dspace
= ~0ULL;
784 ddt_unload(spa_t
*spa
)
786 for (enum zio_checksum c
= 0; c
< ZIO_CHECKSUM_FUNCTIONS
; c
++) {
787 if (spa
->spa_ddt
[c
]) {
788 ddt_table_free(spa
->spa_ddt
[c
]);
789 spa
->spa_ddt
[c
] = NULL
;
795 ddt_class_contains(spa_t
*spa
, ddt_class_t max_class
, const blkptr_t
*bp
)
800 if (!BP_GET_DEDUP(bp
))
803 if (max_class
== DDT_CLASS_UNIQUE
)
806 ddt
= spa
->spa_ddt
[BP_GET_CHECKSUM(bp
)];
808 ddt_key_fill(&ddk
, bp
);
810 for (ddt_type_t type
= 0; type
< DDT_TYPES
; type
++) {
811 for (ddt_class_t
class = 0; class <= max_class
; class++) {
812 if (ddt_object_contains(ddt
, type
, class, &ddk
) == 0)
821 ddt_repair_start(ddt_t
*ddt
, const blkptr_t
*bp
)
826 ddt_key_fill(&ddk
, bp
);
828 dde
= ddt_alloc(&ddk
);
830 for (ddt_type_t type
= 0; type
< DDT_TYPES
; type
++) {
831 for (ddt_class_t
class = 0; class < DDT_CLASSES
; class++) {
833 * We can only do repair if there are multiple copies
834 * of the block. For anything in the UNIQUE class,
835 * there's definitely only one copy, so don't even try.
837 if (class != DDT_CLASS_UNIQUE
&&
838 ddt_object_lookup(ddt
, type
, class, dde
) == 0)
843 memset(dde
->dde_phys
, 0, sizeof (dde
->dde_phys
));
849 ddt_repair_done(ddt_t
*ddt
, ddt_entry_t
*dde
)
855 if (dde
->dde_repair_abd
!= NULL
&& spa_writeable(ddt
->ddt_spa
) &&
856 avl_find(&ddt
->ddt_repair_tree
, dde
, &where
) == NULL
)
857 avl_insert(&ddt
->ddt_repair_tree
, dde
, where
);
865 ddt_repair_entry_done(zio_t
*zio
)
867 ddt_entry_t
*rdde
= zio
->io_private
;
873 ddt_repair_entry(ddt_t
*ddt
, ddt_entry_t
*dde
, ddt_entry_t
*rdde
, zio_t
*rio
)
875 ddt_phys_t
*ddp
= dde
->dde_phys
;
876 ddt_phys_t
*rddp
= rdde
->dde_phys
;
877 ddt_key_t
*ddk
= &dde
->dde_key
;
878 ddt_key_t
*rddk
= &rdde
->dde_key
;
882 zio
= zio_null(rio
, rio
->io_spa
, NULL
,
883 ddt_repair_entry_done
, rdde
, rio
->io_flags
);
885 for (int p
= 0; p
< DDT_PHYS_TYPES
; p
++, ddp
++, rddp
++) {
886 if (ddp
->ddp_phys_birth
== 0 ||
887 ddp
->ddp_phys_birth
!= rddp
->ddp_phys_birth
||
888 memcmp(ddp
->ddp_dva
, rddp
->ddp_dva
, sizeof (ddp
->ddp_dva
)))
890 ddt_bp_create(ddt
->ddt_checksum
, ddk
, ddp
, &blk
);
891 zio_nowait(zio_rewrite(zio
, zio
->io_spa
, 0, &blk
,
892 rdde
->dde_repair_abd
, DDK_GET_PSIZE(rddk
), NULL
, NULL
,
893 ZIO_PRIORITY_SYNC_WRITE
, ZIO_DDT_CHILD_FLAGS(zio
), NULL
));
900 ddt_repair_table(ddt_t
*ddt
, zio_t
*rio
)
902 spa_t
*spa
= ddt
->ddt_spa
;
903 ddt_entry_t
*dde
, *rdde_next
, *rdde
;
904 avl_tree_t
*t
= &ddt
->ddt_repair_tree
;
907 if (spa_sync_pass(spa
) > 1)
911 for (rdde
= avl_first(t
); rdde
!= NULL
; rdde
= rdde_next
) {
912 rdde_next
= AVL_NEXT(t
, rdde
);
913 avl_remove(&ddt
->ddt_repair_tree
, rdde
);
915 ddt_bp_create(ddt
->ddt_checksum
, &rdde
->dde_key
, NULL
, &blk
);
916 dde
= ddt_repair_start(ddt
, &blk
);
917 ddt_repair_entry(ddt
, dde
, rdde
, rio
);
918 ddt_repair_done(ddt
, dde
);
925 ddt_sync_entry(ddt_t
*ddt
, ddt_entry_t
*dde
, dmu_tx_t
*tx
, uint64_t txg
)
927 dsl_pool_t
*dp
= ddt
->ddt_spa
->spa_dsl_pool
;
928 ddt_phys_t
*ddp
= dde
->dde_phys
;
929 ddt_key_t
*ddk
= &dde
->dde_key
;
930 ddt_type_t otype
= dde
->dde_type
;
931 ddt_type_t ntype
= DDT_TYPE_DEFAULT
;
932 ddt_class_t oclass
= dde
->dde_class
;
934 uint64_t total_refcnt
= 0;
936 ASSERT(dde
->dde_flags
& DDE_FLAG_LOADED
);
938 for (int p
= 0; p
< DDT_PHYS_TYPES
; p
++, ddp
++) {
939 ASSERT3P(dde
->dde_lead_zio
[p
], ==, NULL
);
940 if (ddp
->ddp_phys_birth
== 0) {
941 ASSERT0(ddp
->ddp_refcnt
);
944 if (p
== DDT_PHYS_DITTO
) {
946 * Note, we no longer create DDT-DITTO blocks, but we
947 * don't want to leak any written by older software.
949 ddt_phys_free(ddt
, ddk
, ddp
, txg
);
952 if (ddp
->ddp_refcnt
== 0)
953 ddt_phys_free(ddt
, ddk
, ddp
, txg
);
954 total_refcnt
+= ddp
->ddp_refcnt
;
957 /* We do not create new DDT-DITTO blocks. */
958 ASSERT0(dde
->dde_phys
[DDT_PHYS_DITTO
].ddp_phys_birth
);
959 if (total_refcnt
> 1)
960 nclass
= DDT_CLASS_DUPLICATE
;
962 nclass
= DDT_CLASS_UNIQUE
;
964 if (otype
!= DDT_TYPES
&&
965 (otype
!= ntype
|| oclass
!= nclass
|| total_refcnt
== 0)) {
966 VERIFY0(ddt_object_remove(ddt
, otype
, oclass
, ddk
, tx
));
968 ddt_object_contains(ddt
, otype
, oclass
, ddk
), ==, ENOENT
);
971 if (total_refcnt
!= 0) {
972 dde
->dde_type
= ntype
;
973 dde
->dde_class
= nclass
;
974 ddt_stat_update(ddt
, dde
, 0);
975 if (!ddt_object_exists(ddt
, ntype
, nclass
))
976 ddt_object_create(ddt
, ntype
, nclass
, tx
);
977 VERIFY0(ddt_object_update(ddt
, ntype
, nclass
, dde
, tx
));
980 * If the class changes, the order that we scan this bp
981 * changes. If it decreases, we could miss it, so
982 * scan it right now. (This covers both class changing
983 * while we are doing ddt_walk(), and when we are
986 if (nclass
< oclass
) {
987 dsl_scan_ddt_entry(dp
->dp_scan
,
988 ddt
->ddt_checksum
, dde
, tx
);
994 ddt_sync_table(ddt_t
*ddt
, dmu_tx_t
*tx
, uint64_t txg
)
996 spa_t
*spa
= ddt
->ddt_spa
;
1000 if (avl_numnodes(&ddt
->ddt_tree
) == 0)
1003 ASSERT3U(spa
->spa_uberblock
.ub_version
, >=, SPA_VERSION_DEDUP
);
1005 if (spa
->spa_ddt_stat_object
== 0) {
1006 spa
->spa_ddt_stat_object
= zap_create_link(ddt
->ddt_os
,
1007 DMU_OT_DDT_STATS
, DMU_POOL_DIRECTORY_OBJECT
,
1008 DMU_POOL_DDT_STATS
, tx
);
1011 while ((dde
= avl_destroy_nodes(&ddt
->ddt_tree
, &cookie
)) != NULL
) {
1012 ddt_sync_entry(ddt
, dde
, tx
, txg
);
1016 for (ddt_type_t type
= 0; type
< DDT_TYPES
; type
++) {
1017 uint64_t add
, count
= 0;
1018 for (ddt_class_t
class = 0; class < DDT_CLASSES
; class++) {
1019 if (ddt_object_exists(ddt
, type
, class)) {
1020 ddt_object_sync(ddt
, type
, class, tx
);
1021 VERIFY0(ddt_object_count(ddt
, type
, class,
1026 for (ddt_class_t
class = 0; class < DDT_CLASSES
; class++) {
1027 if (count
== 0 && ddt_object_exists(ddt
, type
, class))
1028 ddt_object_destroy(ddt
, type
, class, tx
);
1032 memcpy(&ddt
->ddt_histogram_cache
, ddt
->ddt_histogram
,
1033 sizeof (ddt
->ddt_histogram
));
1034 spa
->spa_dedup_dspace
= ~0ULL;
1038 ddt_sync(spa_t
*spa
, uint64_t txg
)
1040 dsl_scan_t
*scn
= spa
->spa_dsl_pool
->dp_scan
;
1044 ASSERT3U(spa_syncing_txg(spa
), ==, txg
);
1046 tx
= dmu_tx_create_assigned(spa
->spa_dsl_pool
, txg
);
1048 rio
= zio_root(spa
, NULL
, NULL
,
1049 ZIO_FLAG_CANFAIL
| ZIO_FLAG_SPECULATIVE
| ZIO_FLAG_SELF_HEAL
);
1052 * This function may cause an immediate scan of ddt blocks (see
1053 * the comment above dsl_scan_ddt() for details). We set the
1054 * scan's root zio here so that we can wait for any scan IOs in
1055 * addition to the regular ddt IOs.
1057 ASSERT3P(scn
->scn_zio_root
, ==, NULL
);
1058 scn
->scn_zio_root
= rio
;
1060 for (enum zio_checksum c
= 0; c
< ZIO_CHECKSUM_FUNCTIONS
; c
++) {
1061 ddt_t
*ddt
= spa
->spa_ddt
[c
];
1064 ddt_sync_table(ddt
, tx
, txg
);
1065 ddt_repair_table(ddt
, rio
);
1068 (void) zio_wait(rio
);
1069 scn
->scn_zio_root
= NULL
;
1075 ddt_walk(spa_t
*spa
, ddt_bookmark_t
*ddb
, ddt_entry_t
*dde
)
1080 ddt_t
*ddt
= spa
->spa_ddt
[ddb
->ddb_checksum
];
1084 if (ddt_object_exists(ddt
, ddb
->ddb_type
,
1086 error
= ddt_object_walk(ddt
,
1087 ddb
->ddb_type
, ddb
->ddb_class
,
1088 &ddb
->ddb_cursor
, dde
);
1090 dde
->dde_type
= ddb
->ddb_type
;
1091 dde
->dde_class
= ddb
->ddb_class
;
1094 if (error
!= ENOENT
)
1096 ddb
->ddb_cursor
= 0;
1097 } while (++ddb
->ddb_checksum
< ZIO_CHECKSUM_FUNCTIONS
);
1098 ddb
->ddb_checksum
= 0;
1099 } while (++ddb
->ddb_type
< DDT_TYPES
);
1101 } while (++ddb
->ddb_class
< DDT_CLASSES
);
1103 return (SET_ERROR(ENOENT
));
1107 * This function is used by Block Cloning (brt.c) to increase reference
1108 * counter for the DDT entry if the block is already in DDT.
1110 * Return false if the block, despite having the D bit set, is not present
1111 * in the DDT. Currently this is not possible but might be in the future.
1112 * See the comment below.
1115 ddt_addref(spa_t
*spa
, const blkptr_t
*bp
)
1121 spa_config_enter(spa
, SCL_ZIO
, FTAG
, RW_READER
);
1122 ddt
= ddt_select(spa
, bp
);
1125 dde
= ddt_lookup(ddt
, bp
, B_TRUE
);
1126 ASSERT3P(dde
, !=, NULL
);
1128 if (dde
->dde_type
< DDT_TYPES
) {
1131 ASSERT3S(dde
->dde_class
, <, DDT_CLASSES
);
1133 ddp
= &dde
->dde_phys
[BP_GET_NDVAS(bp
)];
1136 * This entry already existed (dde_type is real), so it must
1137 * have refcnt >0 at the start of this txg. We are called from
1138 * brt_pending_apply(), before frees are issued, so the refcnt
1139 * can't be lowered yet. Therefore, it must be >0. We assert
1140 * this because if the order of BRT and DDT interactions were
1141 * ever to change and the refcnt was ever zero here, then
1142 * likely further action is required to fill out the DDT entry,
1143 * and this is a place that is likely to be missed in testing.
1145 ASSERT3U(ddp
->ddp_refcnt
, >, 0);
1147 ddt_phys_addref(ddp
);
1151 * At the time of implementating this if the block has the
1152 * DEDUP flag set it must exist in the DEDUP table, but
1153 * there are many advocates that want ability to remove
1154 * entries from DDT with refcnt=1. If this will happen,
1155 * we may have a block with the DEDUP set, but which doesn't
1156 * have a corresponding entry in the DDT. Be ready.
1158 ASSERT3S(dde
->dde_class
, ==, DDT_CLASSES
);
1159 ddt_remove(ddt
, dde
);
1164 spa_config_exit(spa
, SCL_ZIO
, FTAG
);
1169 ZFS_MODULE_PARAM(zfs_dedup
, zfs_dedup_
, prefetch
, INT
, ZMOD_RW
,
1170 "Enable prefetching dedup-ed blks");