git.proxmox.com Git - mirror_zfs-debian.git/commitdiff
Illumos #3104: eliminate empty bpobjs
author Matthew Ahrens <mahrens@delphix.com>
Sun, 23 Dec 2012 23:57:14 +0000 (15:57 -0800)
committer Brian Behlendorf <behlendorf1@llnl.gov>
Tue, 8 Jan 2013 18:35:43 +0000 (10:35 -0800)
3104 eliminate empty bpobjs
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Christopher Siden <chris.siden@delphix.com>
Reviewed by: Garrett D'Amore <garrett@damore.org>
Approved by: Eric Schrock <eric.schrock@delphix.com>

References:
  illumos/illumos-gate@f17457368189aa911f774c38c1f21875a568bdca
  illumos changeset: 13782:8f78aae28a63
  https://www.illumos.org/issues/3104

Ported-by: Brian Behlendorf <behlendorf1@llnl.gov>
12 files changed:
include/sys/bpobj.h
include/sys/dmu.h
include/sys/dsl_pool.h
include/sys/zap.h
include/zfeature_common.h
man/man5/zpool-features.5
module/zfs/bpobj.c
module/zfs/dsl_deadlist.c
module/zfs/dsl_pool.c
module/zfs/zap.c
module/zfs/zfeature.c
module/zfs/zfeature_common.c

index 3771a9541aa7e518cb748eaac62a4cf73bc2bb17..af975c734560d122977048026d84ad58354f7795 100644 (file)
@@ -20,6 +20,7 @@
  */
 /*
  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012 by Delphix. All rights reserved.
  */
 
 #ifndef        _SYS_BPOBJ_H
@@ -67,7 +68,9 @@ typedef struct bpobj {
 typedef int bpobj_itor_t(void *arg, const blkptr_t *bp, dmu_tx_t *tx);
 
 uint64_t bpobj_alloc(objset_t *mos, int blocksize, dmu_tx_t *tx);
+uint64_t bpobj_alloc_empty(objset_t *os, int blocksize, dmu_tx_t *tx);
 void bpobj_free(objset_t *os, uint64_t obj, dmu_tx_t *tx);
+void bpobj_decr_empty(objset_t *os, dmu_tx_t *tx);
 
 int bpobj_open(bpobj_t *bpo, objset_t *mos, uint64_t object);
 void bpobj_close(bpobj_t *bpo);
index ce316973130ff19a0cc3d6a52936d77b15875cca..7fc876be7692bb2fcb7c6011fa0dfc1d0bff7574 100644 (file)
@@ -309,6 +309,7 @@ typedef void dmu_buf_evict_func_t(struct dmu_buf *db, void *user_ptr);
 #define        DMU_POOL_SCAN                   "scan"
 #define        DMU_POOL_FREE_BPOBJ             "free_bpobj"
 #define        DMU_POOL_BPTREE_OBJ             "bptree_obj"
+#define        DMU_POOL_EMPTY_BPOBJ            "empty_bpobj"
 
 /*
  * Allocate an object from this objset.  The range of object numbers
index 61c91f06c96912f0379ac1672586ddd2a3367635..ff5df1414a5eab11528f37e5bcfbf9f552ad62e2 100644 (file)
@@ -96,6 +96,7 @@ typedef struct dsl_pool {
        uint64_t dp_tmp_userrefs_obj;
        bpobj_t dp_free_bpobj;
        uint64_t dp_bptree_obj;
+       uint64_t dp_empty_bpobj;
 
        struct dsl_scan *dp_scan;
 
index 4d7b315597c52d2a1848101475b46900a1fc71ab..092669c8b3ab310b15525e9b5a9001643d3e3067 100644 (file)
@@ -300,6 +300,8 @@ int zap_increment_int(objset_t *os, uint64_t obj, uint64_t key, int64_t delta,
 /* Here the key is an int and the value is a different int. */
 int zap_add_int_key(objset_t *os, uint64_t obj,
     uint64_t key, uint64_t value, dmu_tx_t *tx);
+int zap_update_int_key(objset_t *os, uint64_t obj,
+    uint64_t key, uint64_t value, dmu_tx_t *tx);
 int zap_lookup_int_key(objset_t *os, uint64_t obj,
     uint64_t key, uint64_t *valuep);
 
index 27f8f00a04f08545564170d2ca86e4035a14304d..cb1d02fd7cd2fc77a25f3e4d7ce2366dda3859aa 100644 (file)
@@ -51,6 +51,7 @@ typedef int (zfeature_func_t)(zfeature_info_t *fi, void *arg);
 
 typedef enum spa_feature {
        SPA_FEATURE_ASYNC_DESTROY,
+       SPA_FEATURE_EMPTY_BPOBJ,        /* registered as "com.delphix:empty_bpobj" */
        SPA_FEATURES
 } spa_feature_t;
 
index 3c1930c35acc31533d5059e905c7e88d5aa34b56..0ab179ef7c9002d8056789c663b88168bfef0f66 100644 (file)
@@ -169,5 +169,33 @@ through the \fBfreeing\fR property.
 
 This feature is only \fBactive\fR while \fBfreeing\fR is non\-zero.
 .RE
+
+.sp
+.ne 2
+.na
+\fB\fBempty_bpobj\fR\fR
+.ad
+.RS 4n
+.TS
+l l .
+GUID   com.delphix:empty_bpobj
+READ\-ONLY COMPATIBLE  yes
+DEPENDENCIES   none
+.TE
+
+This feature increases the performance of creating and using a large
+number of snapshots of a single filesystem or volume, and also reduces
+the disk space required.
+
+When there are many snapshots, each snapshot uses many Block Pointer
+Objects (bpobjs) to track blocks associated with that snapshot.
+However, in common use cases, most of these bpobjs are empty.  This
+feature allows us to create each bpobj on demand, thus eliminating the
+empty bpobjs.
+
+This feature is \fBactive\fR while there are any filesystems, volumes,
+or snapshots which were created after enabling this feature.
+.RE
+
 .SH "SEE ALSO"
 \fBzpool\fR(1M)
index 022921c666b80c0c0b604f2e6ff4456ea456200f..d5f8d4072d110a57fa5a1f0637c2dfa834664019 100644 (file)
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011 by Delphix. All rights reserved.
+ * Copyright (c) 2012 by Delphix. All rights reserved.
  */
 
 #include <sys/bpobj.h>
 #include <sys/zfs_context.h>
 #include <sys/refcount.h>
 #include <sys/dsl_pool.h>
+#include <sys/zfeature.h>
+#include <sys/zap.h>
+
+/*
+ * Return an empty bpobj, preferably the empty dummy one (dp_empty_bpobj).
+ *
+ * When the empty_bpobj feature is enabled, the pool-wide dummy object is
+ * returned and spa_feature_incr() is called once per call.  The dummy is
+ * allocated with SPA_MAXBLOCKSIZE and recorded in the pool directory under
+ * DMU_POOL_EMPTY_BPOBJ the first time through, i.e. while the feature is
+ * enabled but not yet active.
+ *
+ * The caller's blocksize is used only on the fallback path, where the
+ * feature is not enabled and a private bpobj is allocated instead.
+ */
+uint64_t
+bpobj_alloc_empty(objset_t *os, int blocksize, dmu_tx_t *tx)
+{
+       zfeature_info_t *empty_bpobj_feat =
+           &spa_feature_table[SPA_FEATURE_EMPTY_BPOBJ];
+       spa_t *spa = dmu_objset_spa(os);
+       dsl_pool_t *dp = dmu_objset_pool(os);
+
+       if (spa_feature_is_enabled(spa, empty_bpobj_feat)) {
+               if (!spa_feature_is_active(spa, empty_bpobj_feat)) {
+                       ASSERT3U(dp->dp_empty_bpobj, ==, 0);
+                       dp->dp_empty_bpobj =
+                           bpobj_alloc(os, SPA_MAXBLOCKSIZE, tx);
+                       VERIFY(zap_add(os,
+                           DMU_POOL_DIRECTORY_OBJECT,
+                           DMU_POOL_EMPTY_BPOBJ, sizeof (uint64_t), 1,
+                           &dp->dp_empty_bpobj, tx) == 0);
+               }
+               spa_feature_incr(spa, empty_bpobj_feat, tx);
+               ASSERT(dp->dp_empty_bpobj != 0);
+               return (dp->dp_empty_bpobj);
+       } else {        /* feature not enabled: hand out a private bpobj */
+               return (bpobj_alloc(os, blocksize, tx));
+       }
+}
+
+void
+bpobj_decr_empty(objset_t *os, dmu_tx_t *tx)
+{      /* undo one spa_feature_incr() taken for dp_empty_bpobj */
+       zfeature_info_t *empty_bpobj_feat =
+           &spa_feature_table[SPA_FEATURE_EMPTY_BPOBJ];
+       dsl_pool_t *dp = dmu_objset_pool(os);
+
+       spa_feature_decr(dmu_objset_spa(os), empty_bpobj_feat, tx);
+       if (!spa_feature_is_active(dmu_objset_spa(os), empty_bpobj_feat)) {
+               VERIFY3U(0, ==, zap_remove(dp->dp_meta_objset,
+                   DMU_POOL_DIRECTORY_OBJECT,
+                   DMU_POOL_EMPTY_BPOBJ, tx)); /* feature inactive: drop entry */
+               VERIFY3U(0, ==, dmu_object_free(os, dp->dp_empty_bpobj, tx));
+               dp->dp_empty_bpobj = 0; /* bpobj_alloc_empty() asserts 0 here */
+       }       /* NOTE(review): zap_remove uses dp_meta_objset, the free uses os; presumably the same objset - confirm */
+}
 
 uint64_t
 bpobj_alloc(objset_t *os, int blocksize, dmu_tx_t *tx)
@@ -53,6 +101,7 @@ bpobj_free(objset_t *os, uint64_t obj, dmu_tx_t *tx)
        int epb;
        dmu_buf_t *dbuf = NULL;
 
+       ASSERT(obj != dmu_objset_pool(os)->dp_empty_bpobj);
        VERIFY3U(0, ==, bpobj_open(&bpo, os, obj));
 
        mutex_enter(&bpo.bpo_lock);
@@ -320,6 +369,12 @@ bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx)
 
        ASSERT(bpo->bpo_havesubobj);
        ASSERT(bpo->bpo_havecomp);
+       ASSERT(bpo->bpo_object != dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj);
+
+       if (subobj == dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj) {
+               bpobj_decr_empty(bpo->bpo_os, tx);
+               return;
+       }
 
        VERIFY3U(0, ==, bpobj_open(&subbpo, bpo->bpo_os, subobj));
        VERIFY3U(0, ==, bpobj_space(&subbpo, &used, &comp, &uncomp));
@@ -388,6 +443,7 @@ bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, dmu_tx_t *tx)
        blkptr_t *bparray;
 
        ASSERT(!BP_IS_HOLE(bp));
+       ASSERT(bpo->bpo_object != dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj);
 
        /* We never need the fill count. */
        stored_bp.blk_fill = 0;
index 1e89a68d77083ac4706a331a7a647bac216c3afe..909b5f8fc83fdf99b5500006f9e0dfb2150d6591 100644 (file)
@@ -20,7 +20,7 @@
  */
 /*
  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011 by Delphix. All rights reserved.
+ * Copyright (c) 2012 by Delphix. All rights reserved.
  */
 
 #include <sys/dsl_dataset.h>
@@ -165,12 +165,49 @@ dsl_deadlist_free(objset_t *os, uint64_t dlobj, dmu_tx_t *tx)
 
        for (zap_cursor_init(&zc, os, dlobj);
            zap_cursor_retrieve(&zc, &za) == 0;
-           zap_cursor_advance(&zc))
-               bpobj_free(os, za.za_first_integer, tx);
+           zap_cursor_advance(&zc)) {
+               uint64_t obj = za.za_first_integer;
+               if (obj == dmu_objset_pool(os)->dp_empty_bpobj)
+                       bpobj_decr_empty(os, tx);
+               else
+                       bpobj_free(os, obj, tx);
+       }
        zap_cursor_fini(&zc);
        VERIFY3U(0, ==, dmu_object_free(os, dlobj, tx));
 }
 
+static void
+dle_enqueue(dsl_deadlist_t *dl, dsl_deadlist_entry_t *dle,
+    const blkptr_t *bp, dmu_tx_t *tx)
+{      /* enqueue bp on dle, giving dle its own bpobj first if needed */
+       if (dle->dle_bpobj.bpo_object ==
+           dmu_objset_pool(dl->dl_os)->dp_empty_bpobj) {
+               uint64_t obj = bpobj_alloc(dl->dl_os, SPA_MAXBLOCKSIZE, tx);
+               bpobj_close(&dle->dle_bpobj);   /* let go of the shared empty bpobj */
+               bpobj_decr_empty(dl->dl_os, tx);
+               VERIFY3U(0, ==, bpobj_open(&dle->dle_bpobj, dl->dl_os, obj));
+               VERIFY3U(0, ==, zap_update_int_key(dl->dl_os, dl->dl_object,
+                   dle->dle_mintxg, obj, tx)); /* repoint the mintxg key at obj */
+       }
+       bpobj_enqueue(&dle->dle_bpobj, bp, tx); /* target is not dp_empty_bpobj here */
+}
+
+static void
+dle_enqueue_subobj(dsl_deadlist_t *dl, dsl_deadlist_entry_t *dle,
+    uint64_t obj, dmu_tx_t *tx)
+{      /* nest obj under dle's bpobj, or adopt obj if dle is still empty */
+       if (dle->dle_bpobj.bpo_object !=
+           dmu_objset_pool(dl->dl_os)->dp_empty_bpobj) {
+               bpobj_enqueue_subobj(&dle->dle_bpobj, obj, tx);
+       } else {        /* dle holds the shared empty bpobj: replace it with obj */
+               bpobj_close(&dle->dle_bpobj);
+               bpobj_decr_empty(dl->dl_os, tx);
+               VERIFY3U(0, ==, bpobj_open(&dle->dle_bpobj, dl->dl_os, obj));
+               VERIFY3U(0, ==, zap_update_int_key(dl->dl_os, dl->dl_object,
+                   dle->dle_mintxg, obj, tx)); /* repoint the mintxg key at obj */
+       }
+}
+
 void
 dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp, dmu_tx_t *tx)
 {
@@ -199,7 +236,7 @@ dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp, dmu_tx_t *tx)
                dle = avl_nearest(&dl->dl_tree, where, AVL_BEFORE);
        else
                dle = AVL_PREV(&dl->dl_tree, dle);
-       bpobj_enqueue(&dle->dle_bpobj, bp, tx);
+       dle_enqueue(dl, dle, bp, tx);
 }
 
 /*
@@ -219,7 +256,7 @@ dsl_deadlist_add_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx)
 
        dle = kmem_alloc(sizeof (*dle), KM_PUSHPAGE);
        dle->dle_mintxg = mintxg;
-       obj = bpobj_alloc(dl->dl_os, SPA_MAXBLOCKSIZE, tx);
+       obj = bpobj_alloc_empty(dl->dl_os, SPA_MAXBLOCKSIZE, tx);
        VERIFY3U(0, ==, bpobj_open(&dle->dle_bpobj, dl->dl_os, obj));
        avl_add(&dl->dl_tree, dle);
 
@@ -245,8 +282,7 @@ dsl_deadlist_remove_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx)
        dle = avl_find(&dl->dl_tree, &dle_tofind, NULL);
        dle_prev = AVL_PREV(&dl->dl_tree, dle);
 
-       bpobj_enqueue_subobj(&dle_prev->dle_bpobj,
-           dle->dle_bpobj.bpo_object, tx);
+       dle_enqueue_subobj(dl, dle_prev, dle->dle_bpobj.bpo_object, tx);
 
        avl_remove(&dl->dl_tree, dle);
        bpobj_close(&dle->dle_bpobj);
@@ -304,7 +340,7 @@ dsl_deadlist_clone(dsl_deadlist_t *dl, uint64_t maxtxg,
                if (dle->dle_mintxg >= maxtxg)
                        break;
 
-               obj = bpobj_alloc(dl->dl_os, SPA_MAXBLOCKSIZE, tx);
+               obj = bpobj_alloc_empty(dl->dl_os, SPA_MAXBLOCKSIZE, tx);
                VERIFY3U(0, ==, zap_add_int_key(dl->dl_os, newobj,
                    dle->dle_mintxg, obj, tx));
        }
@@ -402,7 +438,7 @@ dsl_deadlist_insert_bpobj(dsl_deadlist_t *dl, uint64_t obj, uint64_t birth,
        dle = avl_find(&dl->dl_tree, &dle_tofind, &where);
        if (dle == NULL)
                dle = avl_nearest(&dl->dl_tree, where, AVL_BEFORE);
-       bpobj_enqueue_subobj(&dle->dle_bpobj, obj, tx);
+       dle_enqueue_subobj(dl, dle, obj, tx);
 }
 
 static int
index 089a7f092f163fae4e9008f51b43918e9e1f3726..704f034e9ee0842193f73744f7b7decb4fbf3551 100644 (file)
@@ -322,6 +322,15 @@ dsl_pool_open(dsl_pool_t *dp)
                        goto out;
        }
 
+       if (spa_feature_is_active(dp->dp_spa,
+           &spa_feature_table[SPA_FEATURE_EMPTY_BPOBJ])) {
+               err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+                   DMU_POOL_EMPTY_BPOBJ, sizeof (uint64_t), 1,
+                   &dp->dp_empty_bpobj);
+               if (err != 0)
+                       goto out;
+       }
+
        err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
            DMU_POOL_TMP_USERREFS, sizeof (uint64_t), 1,
            &dp->dp_tmp_userrefs_obj);
index 59f62fa3afa6597e612b7a463bfd0def640b0848..fd3021be6df5d4649fd63d87fdd48ce8fb58da0b 100644 (file)
@@ -1093,6 +1093,16 @@ zap_add_int_key(objset_t *os, uint64_t obj,
        return (zap_add(os, obj, name, 8, 1, &value, tx));
 }
 
+int
+zap_update_int_key(objset_t *os, uint64_t obj,
+    uint64_t key, uint64_t value, dmu_tx_t *tx)
+{      /* zap_update() counterpart of zap_add_int_key() */
+       char name[20];  /* a 64-bit key is at most 16 hex digits plus NUL */
+
+       (void) snprintf(name, sizeof (name), "%llx", (longlong_t)key);
+       return (zap_update(os, obj, name, 8, 1, &value, tx));   /* one 8-byte integer */
+}
+
 int
 zap_lookup_int_key(objset_t *os, uint64_t obj, uint64_t key, uint64_t *valuep)
 {
index 24ff18fc3691d1aa5208ba60f1a1850a4dff8805..c09b32d177308f952f4bcb5919e98d05d6243c2c 100644 (file)
@@ -229,7 +229,12 @@ feature_get_refcount(objset_t *os, uint64_t read_obj, uint64_t write_obj,
        uint64_t refcount;
        uint64_t zapobj = feature->fi_can_readonly ? write_obj : read_obj;
 
-       ASSERT(0 != zapobj);
+       /*
+        * If the pool is currently being created, the feature objects may not
+        * have been allocated yet.  Act as though all features are disabled.
+        */
+       if (zapobj == 0)
+               return (ENOTSUP);
 
        err = zap_lookup(os, zapobj, feature->fi_guid, sizeof (uint64_t), 1,
            &refcount);
index 33d15133e20e989a5a90e0fd0c759d701f940ca5..40066991b948f90bb0415c35180df6b2118f1a7f 100644 (file)
@@ -157,4 +157,7 @@ zpool_feature_init(void)
        zfeature_register(SPA_FEATURE_ASYNC_DESTROY,
            "com.delphix:async_destroy", "async_destroy",
            "Destroy filesystems asynchronously.", B_TRUE, B_FALSE, NULL);
+       zfeature_register(SPA_FEATURE_EMPTY_BPOBJ,
+           "com.delphix:empty_bpobj", "empty_bpobj",
+           "Snapshots use less space.", B_TRUE, B_FALSE, NULL);
 }