]> git.proxmox.com Git - mirror_zfs.git/commitdiff
Provide more flexible object allocation interface
author Brian Behlendorf <behlendorf1@llnl.gov>
Thu, 10 Jan 2019 22:37:43 +0000 (14:37 -0800)
committer GitHub <noreply@github.com>
Thu, 10 Jan 2019 22:37:43 +0000 (14:37 -0800)
Object allocation performance can be improved for complex operations
by providing an interface which returns the newly allocated dnode.
This allows the caller to immediately use the dnode without incurring
the expense of looking up the dnode by object number.

The functions dmu_object_alloc_hold(), zap_create_hold(), and
dmu_bonus_hold_by_dnode() were added for this purpose.

The zap_create_* functions have been updated to take advantage of
this new functionality.  The dmu_bonus_hold_impl() function should
really have never been included in sys/dmu.h and was removed.
Its sole caller was converted to use dmu_bonus_hold_by_dnode().

The new symbols have been exported for use by Lustre.

Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Reviewed-by: Matt Ahrens <mahrens@delphix.com>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #8015

include/sys/dmu.h
include/sys/zap.h
module/zfs/dmu.c
module/zfs/dmu_object.c
module/zfs/dmu_recv.c
module/zfs/zap_micro.c

index f8b5f096a1db278c7aa7dafa48ded0ca39419bd1..542eff95f02f0b52cdaea7bb0fef91f3d0114cfb 100644 (file)
@@ -407,6 +407,10 @@ uint64_t dmu_object_alloc_ibs(objset_t *os, dmu_object_type_t ot, int blocksize,
 uint64_t dmu_object_alloc_dnsize(objset_t *os, dmu_object_type_t ot,
     int blocksize, dmu_object_type_t bonus_type, int bonus_len,
     int dnodesize, dmu_tx_t *tx);
+uint64_t dmu_object_alloc_hold(objset_t *os, dmu_object_type_t ot,
+    int blocksize, int indirect_blockshift, dmu_object_type_t bonustype,
+    int bonuslen, int dnodesize, dnode_t **allocated_dnode, void *tag,
+    dmu_tx_t *tx);
 int dmu_object_claim(objset_t *os, uint64_t object, dmu_object_type_t ot,
     int blocksize, dmu_object_type_t bonus_type, int bonus_len, dmu_tx_t *tx);
 int dmu_object_claim_dnsize(objset_t *os, uint64_t object, dmu_object_type_t ot,
@@ -521,9 +525,9 @@ void dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp,
  *
  * Returns ENOENT, EIO, or 0.
  */
-int dmu_bonus_hold_impl(objset_t *os, uint64_t object, void *tag,
-    uint32_t flags, dmu_buf_t **dbp);
-int dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **);
+int dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **dbp);
+int dmu_bonus_hold_by_dnode(dnode_t *dn, void *tag, dmu_buf_t **dbp,
+    uint32_t flags);
 int dmu_bonus_max(void);
 int dmu_set_bonus(dmu_buf_t *, int, dmu_tx_t *);
 int dmu_set_bonustype(dmu_buf_t *, dmu_object_type_t, dmu_tx_t *);
index 7acc3becb5a19704e9a3b8675f9a293d934e949d..ab13652d8c07ac2900c9c9244a26e497967cbe69 100644 (file)
@@ -131,6 +131,11 @@ uint64_t zap_create_flags_dnsize(objset_t *os, int normflags,
     zap_flags_t flags, dmu_object_type_t ot, int leaf_blockshift,
     int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen,
     int dnodesize, dmu_tx_t *tx);
+uint64_t zap_create_hold(objset_t *os, int normflags, zap_flags_t flags,
+    dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift,
+    dmu_object_type_t bonustype, int bonuslen, int dnodesize,
+    dnode_t **allocated_dnode, void *tag, dmu_tx_t *tx);
+
 uint64_t zap_create_link(objset_t *os, dmu_object_type_t ot,
     uint64_t parent_obj, const char *name, dmu_tx_t *tx);
 uint64_t zap_create_link_dnsize(objset_t *os, dmu_object_type_t ot,
@@ -139,8 +144,8 @@ uint64_t zap_create_link_dnsize(objset_t *os, dmu_object_type_t ot,
 /*
  * Initialize an already-allocated object.
  */
-void mzap_create_impl(objset_t *os, uint64_t obj, int normflags,
-    zap_flags_t flags, dmu_tx_t *tx);
+void mzap_create_impl(dnode_t *dn, int normflags, zap_flags_t flags,
+    dmu_tx_t *tx);
 
 /*
  * Create a new zapobj with no attributes from the given (unallocated)
index e8d0ce3be715f98a7faae71ea176627293712504..5b79eb90724d7ba4f00ed4bcf7a6b752c63aa6b1 100644 (file)
@@ -330,13 +330,13 @@ dmu_rm_spill(objset_t *os, uint64_t object, dmu_tx_t *tx)
 }
 
 /*
- * returns ENOENT, EIO, or 0.
+ * Lookup and hold the bonus buffer for the provided dnode.  If the dnode
+ * has not yet been allocated a new bonus dbuf will be allocated.
+ * Returns ENOENT, EIO, or 0.
  */
-int
-dmu_bonus_hold_impl(objset_t *os, uint64_t object, void *tag, uint32_t flags,
-    dmu_buf_t **dbp)
+int dmu_bonus_hold_by_dnode(dnode_t *dn, void *tag, dmu_buf_t **dbp,
+    uint32_t flags)
 {
-       dnode_t *dn;
        dmu_buf_impl_t *db;
        int error;
        uint32_t db_flags = DB_RF_MUST_SUCCEED;
@@ -346,10 +346,6 @@ dmu_bonus_hold_impl(objset_t *os, uint64_t object, void *tag, uint32_t flags,
        if (flags & DMU_READ_NO_DECRYPT)
                db_flags |= DB_RF_NO_DECRYPT;
 
-       error = dnode_hold(os, object, FTAG, &dn);
-       if (error)
-               return (error);
-
        rw_enter(&dn->dn_struct_rwlock, RW_READER);
        if (dn->dn_bonus == NULL) {
                rw_exit(&dn->dn_struct_rwlock);
@@ -372,8 +368,6 @@ dmu_bonus_hold_impl(objset_t *os, uint64_t object, void *tag, uint32_t flags,
         */
        rw_exit(&dn->dn_struct_rwlock);
 
-       dnode_rele(dn, FTAG);
-
        error = dbuf_read(db, NULL, db_flags);
        if (error) {
                dnode_evict_bonus(dn);
@@ -387,9 +381,19 @@ dmu_bonus_hold_impl(objset_t *os, uint64_t object, void *tag, uint32_t flags,
 }
 
 int
-dmu_bonus_hold(objset_t *os, uint64_t obj, void *tag, dmu_buf_t **dbp)
+dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **dbp)
 {
-       return (dmu_bonus_hold_impl(os, obj, tag, DMU_READ_NO_PREFETCH, dbp));
+       dnode_t *dn;
+       int error;
+
+       error = dnode_hold(os, object, FTAG, &dn);
+       if (error)
+               return (error);
+
+       error = dmu_bonus_hold_by_dnode(dn, tag, dbp, DMU_READ_NO_PREFETCH);
+       dnode_rele(dn, FTAG);
+
+       return (error);
 }
 
 /*
@@ -2547,6 +2551,7 @@ dmu_fini(void)
 
 #if defined(_KERNEL)
 EXPORT_SYMBOL(dmu_bonus_hold);
+EXPORT_SYMBOL(dmu_bonus_hold_by_dnode);
 EXPORT_SYMBOL(dmu_buf_hold_array_by_bonus);
 EXPORT_SYMBOL(dmu_buf_rele_array);
 EXPORT_SYMBOL(dmu_prefetch);
index 9b5cf125f397b724f87b925b2424ec6fbaa2c76a..e77ebeca54f20cb27fe7238e1184b4e771fe1723 100644 (file)
@@ -44,7 +44,7 @@ int dmu_object_alloc_chunk_shift = 7;
 static uint64_t
 dmu_object_alloc_impl(objset_t *os, dmu_object_type_t ot, int blocksize,
     int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen,
-    int dnodesize, dmu_tx_t *tx)
+    int dnodesize, dnode_t **allocated_dnode, void *tag, dmu_tx_t *tx)
 {
        uint64_t object;
        uint64_t L1_dnode_count = DNODES_PER_BLOCK <<
@@ -80,6 +80,19 @@ dmu_object_alloc_impl(objset_t *os, dmu_object_type_t ot, int blocksize,
        if (dnodes_per_chunk > L1_dnode_count)
                dnodes_per_chunk = L1_dnode_count;
 
+       /*
+        * The caller requested the dnode be returned as a performance
+        * optimization in order to avoid releasing the hold only to
+        * immediately reacquire it.  Since the caller is responsible
+        * for releasing the hold they must provide the tag.
+        */
+       if (allocated_dnode != NULL) {
+               ASSERT3P(tag, !=, NULL);
+       } else {
+               ASSERT3P(tag, ==, NULL);
+               tag = FTAG;
+       }
+
        object = *cpuobj;
        for (;;) {
                /*
@@ -167,7 +180,7 @@ dmu_object_alloc_impl(objset_t *os, dmu_object_type_t ot, int blocksize,
                 * to do so.
                 */
                error = dnode_hold_impl(os, object, DNODE_MUST_BE_FREE,
-                   dn_slots, FTAG, &dn);
+                   dn_slots, tag, &dn);
                if (error == 0) {
                        rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
                        /*
@@ -180,11 +193,20 @@ dmu_object_alloc_impl(objset_t *os, dmu_object_type_t ot, int blocksize,
                                    bonuslen, dn_slots, tx);
                                rw_exit(&dn->dn_struct_rwlock);
                                dmu_tx_add_new_object(tx, dn);
-                               dnode_rele(dn, FTAG);
+
+                               /*
+                                * Caller requested the allocated dnode be
+                                * returned and is responsible for the hold.
+                                */
+                               if (allocated_dnode != NULL)
+                                       *allocated_dnode = dn;
+                               else
+                                       dnode_rele(dn, tag);
+
                                return (object);
                        }
                        rw_exit(&dn->dn_struct_rwlock);
-                       dnode_rele(dn, FTAG);
+                       dnode_rele(dn, tag);
                        DNODE_STAT_BUMP(dnode_alloc_race);
                }
 
@@ -205,7 +227,7 @@ dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int blocksize,
     dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
 {
        return dmu_object_alloc_impl(os, ot, blocksize, 0, bonustype,
-           bonuslen, 0, tx);
+           bonuslen, 0, NULL, NULL, tx);
 }
 
 uint64_t
@@ -214,7 +236,7 @@ dmu_object_alloc_ibs(objset_t *os, dmu_object_type_t ot, int blocksize,
     dmu_tx_t *tx)
 {
        return dmu_object_alloc_impl(os, ot, blocksize, indirect_blockshift,
-           bonustype, bonuslen, 0, tx);
+           bonustype, bonuslen, 0, NULL, NULL, tx);
 }
 
 uint64_t
@@ -222,7 +244,21 @@ dmu_object_alloc_dnsize(objset_t *os, dmu_object_type_t ot, int blocksize,
     dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx)
 {
        return (dmu_object_alloc_impl(os, ot, blocksize, 0, bonustype,
-           bonuslen, dnodesize, tx));
+           bonuslen, dnodesize, NULL, NULL, tx));
+}
+
+/*
+ * Allocate a new object and return a pointer to the newly allocated dnode
+ * via the allocated_dnode argument.  The returned dnode will be held and
+ * the caller is responsible for releasing the hold by calling dnode_rele().
+ */
+uint64_t
+dmu_object_alloc_hold(objset_t *os, dmu_object_type_t ot, int blocksize,
+    int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen,
+    int dnodesize, dnode_t **allocated_dnode, void *tag, dmu_tx_t *tx)
+{
+       return (dmu_object_alloc_impl(os, ot, blocksize, indirect_blockshift,
+           bonustype, bonuslen, dnodesize, allocated_dnode, tag, tx));
 }
 
 int
@@ -414,14 +450,13 @@ dmu_object_zapify(objset_t *mos, uint64_t object, dmu_object_type_t old_type,
         * so that concurrent calls to *_is_zapified() can determine if
         * the object has been completely zapified by checking the type.
         */
-       mzap_create_impl(mos, object, 0, 0, tx);
+       mzap_create_impl(dn, 0, 0, tx);
 
        dn->dn_next_type[tx->tx_txg & TXG_MASK] = dn->dn_type =
            DMU_OTN_ZAP_METADATA;
        dnode_setdirty(dn, tx);
        dnode_rele(dn, FTAG);
 
-
        spa_feature_incr(dmu_objset_spa(mos),
            SPA_FEATURE_EXTENSIBLE_DATASET, tx);
 }
@@ -449,6 +484,7 @@ dmu_object_free_zapified(objset_t *mos, uint64_t object, dmu_tx_t *tx)
 EXPORT_SYMBOL(dmu_object_alloc);
 EXPORT_SYMBOL(dmu_object_alloc_ibs);
 EXPORT_SYMBOL(dmu_object_alloc_dnsize);
+EXPORT_SYMBOL(dmu_object_alloc_hold);
 EXPORT_SYMBOL(dmu_object_claim);
 EXPORT_SYMBOL(dmu_object_claim_dnsize);
 EXPORT_SYMBOL(dmu_object_reclaim);
index 990f790256be4dcb5333e8b674fa988e5981a80e..a448bc1480ce206c2ed37b10435cd81c81696e74 100644 (file)
@@ -1323,13 +1323,15 @@ receive_object(struct receive_writer_arg *rwa, struct drr_object *drro,
 
        if (data != NULL) {
                dmu_buf_t *db;
+               dnode_t *dn;
                uint32_t flags = DMU_READ_NO_PREFETCH;
 
                if (rwa->raw)
                        flags |= DMU_READ_NO_DECRYPT;
 
-               VERIFY0(dmu_bonus_hold_impl(rwa->os, drro->drr_object,
-                   FTAG, flags, &db));
+               VERIFY0(dnode_hold(rwa->os, drro->drr_object, FTAG, &dn));
+               VERIFY0(dmu_bonus_hold_by_dnode(dn, FTAG, &db, flags));
+
                dmu_buf_will_dirty(db, tx);
 
                ASSERT3U(db->db_size, >=, drro->drr_bonuslen);
@@ -1346,6 +1348,7 @@ receive_object(struct receive_writer_arg *rwa, struct drr_object *drro,
                            DRR_OBJECT_PAYLOAD_SIZE(drro));
                }
                dmu_buf_rele(db, FTAG);
+               dnode_rele(dn, FTAG);
        }
        dmu_tx_commit(tx);
 
index 8b4fd0652a1930628f2ef8d9e401bcbc31cefad4..fa369f7975484a13372c6fc776663efa5f4dd873 100644 (file)
@@ -699,17 +699,17 @@ mzap_upgrade(zap_t **zapp, void *tag, dmu_tx_t *tx, zap_flags_t flags)
  * of them may be supplied.
  */
 void
-mzap_create_impl(objset_t *os, uint64_t obj, int normflags, zap_flags_t flags,
-    dmu_tx_t *tx)
+mzap_create_impl(dnode_t *dn, int normflags, zap_flags_t flags, dmu_tx_t *tx)
 {
        dmu_buf_t *db;
 
-       VERIFY0(dmu_buf_hold(os, obj, 0, FTAG, &db, DMU_READ_NO_PREFETCH));
+       VERIFY0(dmu_buf_hold_by_dnode(dn, 0, FTAG, &db, DMU_READ_NO_PREFETCH));
 
        dmu_buf_will_dirty(db, tx);
        mzap_phys_t *zp = db->db_data;
        zp->mz_block_type = ZBT_MICRO;
-       zp->mz_salt = ((uintptr_t)db ^ (uintptr_t)tx ^ (obj << 1)) | 1ULL;
+       zp->mz_salt =
+           ((uintptr_t)db ^ (uintptr_t)tx ^ (dn->dn_object << 1)) | 1ULL;
        zp->mz_normflags = normflags;
 
        if (flags != 0) {
@@ -724,6 +724,33 @@ mzap_create_impl(objset_t *os, uint64_t obj, int normflags, zap_flags_t flags,
        }
 }
 
+static uint64_t
+zap_create_impl(objset_t *os, int normflags, zap_flags_t flags,
+    dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift,
+    dmu_object_type_t bonustype, int bonuslen, int dnodesize,
+    dnode_t **allocated_dnode, void *tag, dmu_tx_t *tx)
+{
+       uint64_t obj;
+
+       ASSERT3U(DMU_OT_BYTESWAP(ot), ==, DMU_BSWAP_ZAP);
+
+       if (allocated_dnode == NULL) {
+               dnode_t *dn;
+               obj = dmu_object_alloc_hold(os, ot, 1ULL << leaf_blockshift,
+                   indirect_blockshift, bonustype, bonuslen, dnodesize,
+                   &dn, FTAG, tx);
+               mzap_create_impl(dn, normflags, flags, tx);
+               dnode_rele(dn, FTAG);
+       } else {
+               obj = dmu_object_alloc_hold(os, ot, 1ULL << leaf_blockshift,
+                   indirect_blockshift, bonustype, bonuslen, dnodesize,
+                   allocated_dnode, tag, tx);
+               mzap_create_impl(*allocated_dnode, normflags, flags, tx);
+       }
+
+       return (obj);
+}
+
 int
 zap_create_claim(objset_t *os, uint64_t obj, dmu_object_type_t ot,
     dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
@@ -754,12 +781,23 @@ zap_create_claim_norm_dnsize(objset_t *os, uint64_t obj, int normflags,
     dmu_object_type_t ot, dmu_object_type_t bonustype, int bonuslen,
     int dnodesize, dmu_tx_t *tx)
 {
+       dnode_t *dn;
+       int error;
+
        ASSERT3U(DMU_OT_BYTESWAP(ot), ==, DMU_BSWAP_ZAP);
-       int err = dmu_object_claim_dnsize(os, obj, ot, 0, bonustype, bonuslen,
+       error = dmu_object_claim_dnsize(os, obj, ot, 0, bonustype, bonuslen,
            dnodesize, tx);
-       if (err != 0)
-               return (err);
-       mzap_create_impl(os, obj, normflags, 0, tx);
+       if (error != 0)
+               return (error);
+
+       error = dnode_hold(os, obj, FTAG, &dn);
+       if (error != 0)
+               return (error);
+
+       mzap_create_impl(dn, normflags, 0, tx);
+
+       dnode_rele(dn, FTAG);
+
        return (0);
 }
 
@@ -790,12 +828,8 @@ uint64_t
 zap_create_norm_dnsize(objset_t *os, int normflags, dmu_object_type_t ot,
     dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx)
 {
-       ASSERT3U(DMU_OT_BYTESWAP(ot), ==, DMU_BSWAP_ZAP);
-       uint64_t obj = dmu_object_alloc_dnsize(os, ot, 0, bonustype, bonuslen,
-           dnodesize, tx);
-
-       mzap_create_impl(os, obj, normflags, 0, tx);
-       return (obj);
+       return (zap_create_impl(os, normflags, 0, ot, 0, 0,
+           bonustype, bonuslen, dnodesize, NULL, NULL, tx));
 }
 
 uint64_t
@@ -812,20 +846,25 @@ zap_create_flags_dnsize(objset_t *os, int normflags, zap_flags_t flags,
     dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift,
     dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx)
 {
-       ASSERT3U(DMU_OT_BYTESWAP(ot), ==, DMU_BSWAP_ZAP);
-       uint64_t obj = dmu_object_alloc_dnsize(os, ot, 0, bonustype, bonuslen,
-           dnodesize, tx);
-
-       ASSERT(leaf_blockshift >= SPA_MINBLOCKSHIFT &&
-           leaf_blockshift <= SPA_OLD_MAXBLOCKSHIFT &&
-           indirect_blockshift >= SPA_MINBLOCKSHIFT &&
-           indirect_blockshift <= SPA_OLD_MAXBLOCKSHIFT);
-
-       VERIFY(dmu_object_set_blocksize(os, obj,
-           1ULL << leaf_blockshift, indirect_blockshift, tx) == 0);
+       return (zap_create_impl(os, normflags, flags, ot, leaf_blockshift,
+           indirect_blockshift, bonustype, bonuslen, dnodesize, NULL, NULL,
+           tx));
+}
 
-       mzap_create_impl(os, obj, normflags, flags, tx);
-       return (obj);
+/*
+ * Create a zap object and return a pointer to the newly allocated dnode via
+ * the allocated_dnode argument.  The returned dnode will be held and the
+ * caller is responsible for releasing the hold by calling dnode_rele().
+ */
+uint64_t
+zap_create_hold(objset_t *os, int normflags, zap_flags_t flags,
+    dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift,
+    dmu_object_type_t bonustype, int bonuslen, int dnodesize,
+    dnode_t **allocated_dnode, void *tag, dmu_tx_t *tx)
+{
+       return (zap_create_impl(os, normflags, flags, ot, leaf_blockshift,
+           indirect_blockshift, bonustype, bonuslen, dnodesize,
+           allocated_dnode, tag, tx));
 }
 
 int
@@ -1596,6 +1635,7 @@ EXPORT_SYMBOL(zap_create_flags_dnsize);
 EXPORT_SYMBOL(zap_create_claim);
 EXPORT_SYMBOL(zap_create_claim_norm);
 EXPORT_SYMBOL(zap_create_claim_norm_dnsize);
+EXPORT_SYMBOL(zap_create_hold);
 EXPORT_SYMBOL(zap_destroy);
 EXPORT_SYMBOL(zap_lookup);
 EXPORT_SYMBOL(zap_lookup_by_dnode);