From: Rob Norris
Date: Mon, 27 Nov 2023 23:43:36 +0000 (+1100)
Subject: ddt: document the theory and the key data structures
X-Git-Url: https://git.proxmox.com/?p=mirror_zfs.git;a=commitdiff_plain;h=5720b00632c1086fe4a57df42aef2f0972c83c74

ddt: document the theory and the key data structures

Reviewed-by: Brian Behlendorf
Signed-off-by: Rob Norris
Sponsored-by: Klara, Inc.
Sponsored-by: iXsystems, Inc.
Closes #15887
---
diff --git a/include/sys/ddt.h b/include/sys/ddt.h
index b29667e5a..726f1a390 100644
--- a/include/sys/ddt.h
+++ b/include/sys/ddt.h
@@ -21,6 +21,7 @@
 /*
 * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2016 by Delphix. All rights reserved.
+ * Copyright (c) 2023, Klara Inc.
 */
 #ifndef _SYS_DDT_H
@@ -39,10 +40,16 @@ extern "C" {
 struct abd;
 /*
- * On-disk DDT formats, in the desired search order (newest version first).
+ * DDT on-disk storage object types. Each one corresponds to a specific
+ * implementation, see ddt_ops_t. The value itself is not stored on disk.
+ *
+ * When searching for an entry, object types will be searched in this order.
+ *
+ * Note that DDT_TYPES is used as the "no type" for new entries that have not
+ * yet been written to a storage object.
 */
 typedef enum {
- DDT_TYPE_ZAP = 0,
+ DDT_TYPE_ZAP = 0, /* ZAP storage object, ddt_zap */
 DDT_TYPES
 } ddt_type_t;
@@ -53,12 +60,18 @@ _Static_assert(DDT_TYPES <= UINT8_MAX,
 #define DDT_TYPE_DEFAULT (DDT_TYPE_ZAP)
 /*
- * DDT classes, in the desired search order (highest replication level first).
+ * DDT storage classes. Each class has a separate storage object for each type.
+ * The value itself is not stored on disk.
+ *
+ * When searching for an entry, object classes will be searched in this order.
+ *
+ * Note that DDT_CLASSES is used as the "no class" for new entries that have not
+ * yet been written to a storage object.
 */
 typedef enum {
- DDT_CLASS_DITTO = 0,
- DDT_CLASS_DUPLICATE,
- DDT_CLASS_UNIQUE,
+ DDT_CLASS_DITTO = 0, /* entry has ditto blocks (obsolete) */
+ DDT_CLASS_DUPLICATE, /* entry has multiple references */
+ DDT_CLASS_UNIQUE, /* entry has a single reference */
 DDT_CLASSES
 } ddt_class_t;
@@ -66,7 +79,9 @@ _Static_assert(DDT_CLASSES < UINT8_MAX,
 "ddt_class_t must fit in a uint8_t");
 /*
- * On-disk ddt entry: key (name) and physical storage (value).
+ * The "key" part of an on-disk entry. This is the unique "name" for a block,
+ * that is, the parts of the block pointer that will always be the same for
+ * the same data.
 */
 typedef struct {
 zio_cksum_t ddk_cksum; /* 256-bit block checksum */
@@ -80,6 +95,10 @@ typedef struct {
 uint64_t ddk_prop;
 } ddt_key_t;
+/*
+ * Macros for accessing parts of a ddt_key_t. These are similar to their BP_*
+ * counterparts.
+ */
 #define DDK_GET_LSIZE(ddk) \
 BF64_GET_SB((ddk)->ddk_prop, 0, 16, SPA_MINBLOCKSHIFT, 1)
 #define DDK_SET_LSIZE(ddk, x) \
@@ -96,6 +115,16 @@ typedef struct {
 #define DDK_GET_CRYPT(ddk) BF64_GET((ddk)->ddk_prop, 39, 1)
 #define DDK_SET_CRYPT(ddk, x) BF64_SET((ddk)->ddk_prop, 39, 1, x)
+/*
+ * The "value" part of an on-disk entry. These are the "physical"
+ * characteristics of the stored block, such as its location on disk (DVAs),
+ * birth txg and ref count.
+ *
+ * Note that an entry has an array of four ddt_phys_t, one for each number of
+ * DVAs (copies= property) and another for additional "ditto" copies. Most
+ * users of ddt_phys_t will handle indexing into or counting the phys they
+ * want.
+ */
 typedef struct {
 dva_t ddp_dva[SPA_DVAS_PER_BP];
 uint64_t ddp_refcnt;
@@ -103,6 +132,8 @@
 } ddt_phys_t;
 /*
+ * Named indexes into the ddt_phys_t array in each entry.
+ *
 * Note, we no longer generate new DDT_PHYS_DITTO-type blocks. However,
 * we maintain the ability to free existing dedup-ditto blocks.
 */
@@ -115,7 +146,8 @@ enum ddt_phys_type {
 };
 /*
- * In-core ddt entry
+ * A "live" entry, holding changes to an entry made this txg, and other data to
+ * support loading, updating and repairing the entry.
 */
 /* State flags for dde_flags */
@@ -123,36 +155,56 @@ enum ddt_phys_type {
 typedef struct {
 /* key must be first for ddt_key_compare */
- ddt_key_t dde_key;
- ddt_phys_t dde_phys[DDT_PHYS_TYPES];
+ ddt_key_t dde_key; /* ddt_tree key */
+ ddt_phys_t dde_phys[DDT_PHYS_TYPES]; /* on-disk data */
+
+ /* in-flight update IOs */
 zio_t *dde_lead_zio[DDT_PHYS_TYPES];
+
+ /* copy of data after a repair read, to be rewritten */
 struct abd *dde_repair_abd;
+
+ /* storage type and class the entry was loaded from */
 ddt_type_t dde_type;
 ddt_class_t dde_class;
- uint8_t dde_flags;
- kcondvar_t dde_cv;
- avl_node_t dde_node;
+
+ uint8_t dde_flags; /* load state flags */
+ kcondvar_t dde_cv; /* signaled when load completes */
+
+ avl_node_t dde_node; /* ddt_tree node */
 } ddt_entry_t;
 /*
- * In-core ddt
+ * In-core DDT object. This covers all entries and stats for the whole pool
+ * for a given checksum type.
 */
 typedef struct {
- kmutex_t ddt_lock;
- avl_tree_t ddt_tree;
- avl_tree_t ddt_repair_tree;
- enum zio_checksum ddt_checksum;
- spa_t *ddt_spa;
- objset_t *ddt_os;
- uint64_t ddt_stat_object;
+ kmutex_t ddt_lock; /* protects changes to all fields */
+
+ avl_tree_t ddt_tree; /* "live" (changed) entries this txg */
+
+ avl_tree_t ddt_repair_tree; /* entries being repaired */
+
+ enum zio_checksum ddt_checksum; /* checksum algorithm in use */
+ spa_t *ddt_spa; /* pool this ddt is on */
+ objset_t *ddt_os; /* ddt objset (always MOS) */
+
+ /* per-type/per-class entry store objects */
 uint64_t ddt_object[DDT_TYPES][DDT_CLASSES];
+
+ /* object ids for whole-ddt and per-type/per-class stats */
+ uint64_t ddt_stat_object;
+ ddt_object_t ddt_object_stats[DDT_TYPES][DDT_CLASSES];
+
+ /* type/class stats by power-of-2-sized referenced blocks */
 ddt_histogram_t ddt_histogram[DDT_TYPES][DDT_CLASSES];
 ddt_histogram_t ddt_histogram_cache[DDT_TYPES][DDT_CLASSES];
- ddt_object_t ddt_object_stats[DDT_TYPES][DDT_CLASSES];
 } ddt_t;
 /*
- * In-core and on-disk bookmark for DDT walks
+ * In-core and on-disk bookmark for DDT walks. This is a cursor for ddt_walk(),
+ * and is stable across calls, even if the DDT is updated, the pool is
+ * restarted or loaded on another system, or OpenZFS is upgraded.
 */
 typedef struct {
 uint64_t ddb_class;
diff --git a/module/zfs/ddt.c b/module/zfs/ddt.c
index f22c79c50..de8640e58 100644
--- a/module/zfs/ddt.c
+++ b/module/zfs/ddt.c
@@ -23,6 +23,7 @@
 * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, 2016 by Delphix. All rights reserved.
 * Copyright (c) 2022 by Pawel Jakub Dawidek
+ * Copyright (c) 2023, Klara Inc.
 */
 #include
@@ -39,6 +40,101 @@
 #include
 #include
+/*
+ * # DDT: Deduplication tables
+ *
+ * The dedup subsystem provides block-level deduplication. When enabled, blocks
+ * to be written will have the dedup (D) bit set, which causes them to be
+ * tracked in a "dedup table", or DDT.
+ * If a block has been seen before (exists in the DDT), it will not be written
+ * again; instead it will be made to reference the existing on-disk data, and
+ * its refcount in the DDT will be bumped.
+ *
+ * ## Dedup tables and entries
+ *
+ * Conceptually, a DDT is a dictionary or map. Each entry has a "key"
+ * (ddt_key_t) made up of a block's checksum and certain properties, and a
+ * "value" (one or more ddt_phys_t) containing valid DVAs for the block's data,
+ * birth time and refcount. Together these are enough to track references to a
+ * specific block, to build a valid block pointer to reference that block (for
+ * freeing, scrubbing, etc), and to fill a new block pointer with the missing
+ * pieces to make it seem like it was written.
+ *
+ * There's a single DDT (ddt_t) for each checksum type, held in spa_ddt[].
+ * Within each DDT, there can be multiple storage "types" (ddt_type_t, on-disk
+ * object data formats, each with their own implementations) and "classes"
+ * (ddt_class_t, an instance of a storage type object, for entries with a
+ * specific characteristic). An entry (key) will only ever exist on one of
+ * these objects at any given time, but may be moved from one to another if
+ * its type or class changes.
+ *
+ * The DDT is driven by the write IO pipeline (zio_ddt_write()). When a block
+ * is to be written, before DVAs have been allocated, ddt_lookup() is called to
+ * see if the block has been seen before. If it's not found, the write proceeds
+ * as normal, and after it succeeds, a new entry is created. If it is found, we
+ * fill the BP with the DVAs from the entry, increment the refcount and cause
+ * the write IO to return immediately.
+ *
+ * Each ddt_phys_t slot in the entry represents a separate dedup block for the
+ * same content/checksum. The slot is selected based on the zp_copies parameter
+ * the block is written with, that is, the number of DVAs in the block. The
+ * "ditto" slot (DDT_PHYS_DITTO) was used for the now-removed "dedupditto"
+ * feature. These are no longer written, and will be freed if encountered on
+ * old pools.
+ *
+ * ## Lifetime of an entry
+ *
+ * A DDT can be enormous, and typically is not held in memory all at once.
+ * Instead, the changes to an entry are tracked in memory, and written down to
+ * disk at the end of each txg.
+ *
+ * A "live" in-memory entry (ddt_entry_t) is a node on the live tree
+ * (ddt_tree). At the start of a txg, ddt_tree is empty. When an entry is
+ * required for IO, ddt_lookup() is called. If an entry already exists on
+ * ddt_tree, it is returned. Otherwise, a new one is created, and the
+ * type/class objects for the DDT are searched for that key. If it's found, its
+ * value is copied into the live entry. If not, an empty entry is created.
+ *
+ * The live entry will be modified during the txg, usually by modifying the
+ * refcount, but sometimes by adding or updating DVAs. At the end of the txg
+ * (during spa_sync()), type and class are recalculated for the entry (see
+ * ddt_sync_entry()), and the entry is written to the appropriate storage
+ * object and, if necessary, removed from an old one. ddt_tree is cleared and
+ * the next txg can start.
+ *
+ * ## Repair IO
+ *
+ * If a read on a dedup block fails, but there are other copies of the block in
+ * the other ddt_phys_t slots, reads will be issued for those instead
+ * (zio_ddt_read_start()). If one of those succeeds, the read is returned to
+ * the caller, and a copy is stashed on the entry's dde_repair_abd.
+ *
+ * During the end-of-txg sync, any entries with a dde_repair_abd get a
+ * "rewrite" write issued for the original block pointer, with the data read
+ * from the alternate block. If the block is actually damaged, this will invoke
+ * the pool's "self-healing" mechanism and repair the block.
+ *
+ * ## Scanning (scrub/resilver)
+ *
+ * If dedup is active, the scrub machinery will walk the dedup table first, and
+ * scrub all blocks with refcnt > 1. After that it will move on to the regular
+ * top-down scrub, and exclude the refcnt > 1 blocks when it sees them. In this
+ * way, heavily deduplicated blocks are only scrubbed once. See the commentary
+ * on dsl_scan_ddt() for more details.
+ *
+ * Walking the DDT is done via ddt_walk(). The current position is stored in a
+ * ddt_bookmark_t, which represents a stable position in the storage object.
+ * This bookmark is stored by the scan machinery, and must reference the same
+ * position on the object even if the object changes, the pool is exported, or
+ * OpenZFS is upgraded.
+ *
+ * ## Interaction with block cloning
+ *
+ * If block cloning and dedup are both enabled on a pool, BRT will look for the
+ * dedup bit on an incoming block pointer. If set, it will call into the DDT
+ * (ddt_addref()) to add a reference to the block, instead of adding a
+ * reference to the BRT. See brt_pending_apply().
+ */
+
 /*
 * These are the only checksums valid for dedup. They must match the list
 * from dedup_table in zfs_prop.c
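
The write-path flow described in the theory comment above (look up the block's key; on a hit, reuse the existing DVAs and bump the refcount; on a miss, write the block and insert a new entry) can be sketched in a few lines of standalone C. This is an illustration only, not the OpenZFS implementation: the toy_* names below are hypothetical stand-ins for ddt_key_t, ddt_phys_t and ddt_lookup(), the table is a fixed array rather than an AVL tree backed by per-type/per-class ZAP objects, and there is no txg sync, class movement, or repair handling.

#include <stdint.h>
#include <stdio.h>
#include <stddef.h>

#define TOY_DDT_SLOTS 64			/* tiny fixed-size "table" */

struct toy_key {				/* stands in for ddt_key_t */
	uint64_t cksum;				/* block checksum */
};

struct toy_phys {				/* stands in for ddt_phys_t */
	uint64_t disk_offset;			/* stands in for the DVAs */
	uint64_t refcnt;			/* references to this block */
};

struct toy_entry {				/* stands in for ddt_entry_t */
	struct toy_key key;
	struct toy_phys phys;
	int used;
};

static struct toy_entry toy_ddt[TOY_DDT_SLOTS];
static uint64_t toy_next_offset;

/* Return the entry for this key, or NULL if the block has never been seen. */
static struct toy_entry *
toy_lookup(const struct toy_key *key)
{
	for (int i = 0; i < TOY_DDT_SLOTS; i++) {
		if (toy_ddt[i].used && toy_ddt[i].key.cksum == key->cksum)
			return (&toy_ddt[i]);
	}
	return (NULL);
}

/* "Write" a block: a dedup hit bumps the refcount, a miss allocates and inserts. */
static uint64_t
toy_write(const struct toy_key *key, size_t size)
{
	struct toy_entry *de = toy_lookup(key);

	if (de != NULL) {
		de->phys.refcnt++;		/* reference the existing data */
		return (de->phys.disk_offset);
	}

	for (int i = 0; i < TOY_DDT_SLOTS; i++) {
		if (!toy_ddt[i].used) {
			toy_ddt[i].used = 1;
			toy_ddt[i].key = *key;
			toy_ddt[i].phys.disk_offset = toy_next_offset;
			toy_ddt[i].phys.refcnt = 1;
			toy_next_offset += size;
			return (toy_ddt[i].phys.disk_offset);
		}
	}
	return (UINT64_MAX);			/* table full; toy only */
}

int
main(void)
{
	struct toy_key k = { .cksum = 0xdeadbeefULL };

	/* Two writes of the same content land at the same offset. */
	printf("first write at %llu\n",
	    (unsigned long long)toy_write(&k, 4096));
	printf("second write at %llu\n",
	    (unsigned long long)toy_write(&k, 4096));
	printf("refcnt is now %llu\n",
	    (unsigned long long)toy_lookup(&k)->phys.refcnt);
	return (0);
}

In the real DDT the hit/miss decision happens in zio_ddt_write() before DVAs are allocated, and the new or updated entry is only written back to its type/class storage object during spa_sync(), as described under "Lifetime of an entry" above.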