git.proxmox.com Git - mirror_zfs.git/commitdiff
OpenZFS 6988 spa_sync() spends half its time in dmu_objset_do_userquota_updates
authorJinshan Xiong <jinshan.xiong@intel.com>
Wed, 21 Sep 2016 20:49:47 +0000 (13:49 -0700)
committerBrian Behlendorf <behlendorf1@llnl.gov>
Fri, 7 Oct 2016 16:45:13 +0000 (09:45 -0700)
Using a benchmark which creates 2 million files in one TXG, I observe
that the thread running spa_sync() is on CPU almost the entire time we
are syncing, and therefore can be a performance bottleneck. About 50% of
the time in spa_sync() is in dmu_objset_do_userquota_updates().

The problem is that dmu_objset_do_userquota_updates() calls
zap_increment_int(DMU_USERUSED_OBJECT) once for every file that was
modified (or created). In this benchmark, all the files are owned by the
same user/group, so all 2 million calls to zap_increment_int() are
modifying the same entry in the zap. The same issue exists for the
DMU_GROUPUSED_OBJECT.

We should keep an in-memory map from user to space delta while we are
syncing, and when we finish, iterate over the in-memory map and modify
the ZAP once per entry. This reduces the number of calls to
zap_increment_int() from "number of objects modified" to "number of
owners/groups of modified files".

This reduced the time spent in spa_sync() in the file create benchmark
by ~33%, from 11 seconds to 7 seconds.

Upstream bugs: DLPX-44799
Ported by: Ned Bass <bass6@llnl.gov>

OpenZFS-issue: https://www.illumos.org/issues/6988
ZFSonLinux-issue: https://github.com/zfsonlinux/zfs/issues/4642
OpenZFS-commit: unmerged

Porting notes:
- Added curly braces around declaration of userquota_cache_t cache to
  quiet compiler warning;
- Handled the userobj accounting the same way as proposed in this patch.

Signed-off-by: Jinshan Xiong <jinshan.xiong@intel.com>
module/zfs/dmu_objset.c
module/zfs/dnode.c

index c6b6eb745c0d27eda208bb1d0fe620e3043769c3..b1f05cb6108ba88fa41af50b005d2e45192cca22 100644 (file)
@@ -1338,37 +1338,113 @@ dmu_objset_userobjused_enabled(objset_t *os)
            spa_feature_is_enabled(os->os_spa, SPA_FEATURE_USEROBJ_ACCOUNTING));
 }
 
+typedef struct userquota_node {
+       /* must be the first field, see userquota_update_cache() */
+       char            uqn_id[20 + DMU_OBJACCT_PREFIX_LEN];
+       int64_t         uqn_delta;
+       avl_node_t      uqn_node;
+} userquota_node_t;
+
+typedef struct userquota_cache {
+       avl_tree_t uqc_user_deltas;
+       avl_tree_t uqc_group_deltas;
+} userquota_cache_t;
+
+static int
+userquota_compare(const void *l, const void *r)
+{
+       const userquota_node_t *luqn = l;
+       const userquota_node_t *ruqn = r;
+
+       /*
+        * NB: can only access uqn_id because userquota_update_cache() doesn't
+        * pass in an entire userquota_node_t.
+        */
+       return (strcmp(luqn->uqn_id, ruqn->uqn_id));
+}
+
+static void
+do_userquota_cacheflush(objset_t *os, userquota_cache_t *cache, dmu_tx_t *tx)
+{
+       void *cookie;
+       userquota_node_t *uqn;
+
+       ASSERT(dmu_tx_is_syncing(tx));
+
+       cookie = NULL;
+       while ((uqn = avl_destroy_nodes(&cache->uqc_user_deltas,
+           &cookie)) != NULL) {
+               VERIFY0(zap_increment(os, DMU_USERUSED_OBJECT,
+                   uqn->uqn_id, uqn->uqn_delta, tx));
+               kmem_free(uqn, sizeof (*uqn));
+       }
+       avl_destroy(&cache->uqc_user_deltas);
+
+       cookie = NULL;
+       while ((uqn = avl_destroy_nodes(&cache->uqc_group_deltas,
+           &cookie)) != NULL) {
+               VERIFY0(zap_increment(os, DMU_GROUPUSED_OBJECT,
+                   uqn->uqn_id, uqn->uqn_delta, tx));
+               kmem_free(uqn, sizeof (*uqn));
+       }
+       avl_destroy(&cache->uqc_group_deltas);
+}
+
+static void
+userquota_update_cache(avl_tree_t *avl, const char *id, int64_t delta)
+{
+       userquota_node_t *uqn;
+       avl_index_t idx;
+
+       ASSERT(strlen(id) < sizeof (uqn->uqn_id));
+       /*
+        * Use id directly for searching because uqn_id is the first field of
+        * userquota_node_t and fields after uqn_id won't be accessed in
+        * avl_find().
+        */
+       uqn = avl_find(avl, (const void *)id, &idx);
+       if (uqn == NULL) {
+               uqn = kmem_zalloc(sizeof (*uqn), KM_SLEEP);
+               strcpy(uqn->uqn_id, id);
+               avl_insert(avl, uqn, idx);
+       }
+       uqn->uqn_delta += delta;
+}
+
 static void
-do_userquota_update(objset_t *os, uint64_t used, uint64_t flags,
-    uint64_t user, uint64_t group, boolean_t subtract, dmu_tx_t *tx)
+do_userquota_update(userquota_cache_t *cache, uint64_t used, uint64_t flags,
+    uint64_t user, uint64_t group, boolean_t subtract)
 {
        if ((flags & DNODE_FLAG_USERUSED_ACCOUNTED)) {
                int64_t delta = DNODE_MIN_SIZE + used;
+               char name[20];
+
                if (subtract)
                        delta = -delta;
-               VERIFY3U(0, ==, zap_increment_int(os, DMU_USERUSED_OBJECT,
-                   user, delta, tx));
-               VERIFY3U(0, ==, zap_increment_int(os, DMU_GROUPUSED_OBJECT,
-                   group, delta, tx));
+
+               (void) sprintf(name, "%llx", (longlong_t)user);
+               userquota_update_cache(&cache->uqc_user_deltas, name, delta);
+
+               (void) sprintf(name, "%llx", (longlong_t)group);
+               userquota_update_cache(&cache->uqc_group_deltas, name, delta);
        }
 }
 
 static void
-do_userobjquota_update(objset_t *os, uint64_t flags, uint64_t user,
-    uint64_t group, boolean_t subtract, dmu_tx_t *tx)
+do_userobjquota_update(userquota_cache_t *cache, uint64_t flags,
+    uint64_t user, uint64_t group, boolean_t subtract)
 {
        if (flags & DNODE_FLAG_USEROBJUSED_ACCOUNTED) {
                char name[20 + DMU_OBJACCT_PREFIX_LEN];
+               int delta = subtract ? -1 : 1;
 
                (void) snprintf(name, sizeof (name), DMU_OBJACCT_PREFIX "%llx",
                    (longlong_t)user);
-               VERIFY0(zap_increment(os, DMU_USERUSED_OBJECT, name,
-                   subtract ? -1 : 1, tx));
+               userquota_update_cache(&cache->uqc_user_deltas, name, delta);
 
                (void) snprintf(name, sizeof (name), DMU_OBJACCT_PREFIX "%llx",
                    (longlong_t)group);
-               VERIFY0(zap_increment(os, DMU_GROUPUSED_OBJECT, name,
-                   subtract ? -1 : 1, tx));
+               userquota_update_cache(&cache->uqc_group_deltas, name, delta);
        }
 }
 
@@ -1377,9 +1453,15 @@ dmu_objset_do_userquota_updates(objset_t *os, dmu_tx_t *tx)
 {
        dnode_t *dn;
        list_t *list = &os->os_synced_dnodes;
+       userquota_cache_t cache = { { 0 } };
 
        ASSERT(list_head(list) == NULL || dmu_objset_userused_enabled(os));
 
+       avl_create(&cache.uqc_user_deltas, userquota_compare,
+           sizeof (userquota_node_t), offsetof(userquota_node_t, uqn_node));
+       avl_create(&cache.uqc_group_deltas, userquota_compare,
+           sizeof (userquota_node_t), offsetof(userquota_node_t, uqn_node));
+
        while ((dn = list_head(list))) {
                int flags;
                ASSERT(!DMU_OBJECT_IS_SPECIAL(dn->dn_object));
@@ -1389,36 +1471,27 @@ dmu_objset_do_userquota_updates(objset_t *os, dmu_tx_t *tx)
 
                /* Allocate the user/groupused objects if necessary. */
                if (DMU_USERUSED_DNODE(os)->dn_type == DMU_OT_NONE) {
-                       VERIFY(0 == zap_create_claim(os,
-                           DMU_USERUSED_OBJECT,
+                       VERIFY0(zap_create_claim(os, DMU_USERUSED_OBJECT,
                            DMU_OT_USERGROUP_USED, DMU_OT_NONE, 0, tx));
-                       VERIFY(0 == zap_create_claim(os,
-                           DMU_GROUPUSED_OBJECT,
+                       VERIFY0(zap_create_claim(os, DMU_GROUPUSED_OBJECT,
                            DMU_OT_USERGROUP_USED, DMU_OT_NONE, 0, tx));
                }
 
-               /*
-                * We intentionally modify the zap object even if the
-                * net delta is zero.  Otherwise
-                * the block of the zap obj could be shared between
-                * datasets but need to be different between them after
-                * a bprewrite.
-                */
-
                flags = dn->dn_id_flags;
                ASSERT(flags);
                if (flags & DN_ID_OLD_EXIST)  {
-                       do_userquota_update(os, dn->dn_oldused, dn->dn_oldflags,
-                           dn->dn_olduid, dn->dn_oldgid, B_TRUE, tx);
-                       do_userobjquota_update(os, dn->dn_oldflags,
-                           dn->dn_olduid, dn->dn_oldgid, B_TRUE, tx);
+                       do_userquota_update(&cache,
+                           dn->dn_oldused, dn->dn_oldflags,
+                           dn->dn_olduid, dn->dn_oldgid, B_TRUE);
+                       do_userobjquota_update(&cache, dn->dn_oldflags,
+                           dn->dn_olduid, dn->dn_oldgid, B_TRUE);
                }
                if (flags & DN_ID_NEW_EXIST) {
-                       do_userquota_update(os, DN_USED_BYTES(dn->dn_phys),
-                           dn->dn_phys->dn_flags,  dn->dn_newuid,
-                           dn->dn_newgid, B_FALSE, tx);
-                       do_userobjquota_update(os, dn->dn_phys->dn_flags,
-                           dn->dn_newuid, dn->dn_newgid, B_FALSE, tx);
+                       do_userquota_update(&cache,
+                           DN_USED_BYTES(dn->dn_phys), dn->dn_phys->dn_flags,
+                           dn->dn_newuid, dn->dn_newgid, B_FALSE);
+                       do_userobjquota_update(&cache, dn->dn_phys->dn_flags,
+                           dn->dn_newuid, dn->dn_newgid, B_FALSE);
                }
 
                mutex_enter(&dn->dn_mtx);
@@ -1439,6 +1512,7 @@ dmu_objset_do_userquota_updates(objset_t *os, dmu_tx_t *tx)
                list_remove(list, dn);
                dnode_rele(dn, list);
        }
+       do_userquota_cacheflush(os, &cache, tx);
 }
 
 /*
index f0b03bbba798c067c0c4d2323e071b937361fe46..6ba8207e2bcf8be5f7bcbb6741d25026d93946c5 100644 (file)
@@ -2001,9 +2001,6 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset,
        boolean_t hole;
        int i, inc, error, span;
 
-       dprintf("probing object %llu offset %llx level %d of %u\n",
-           dn->dn_object, *offset, lvl, dn->dn_phys->dn_nlevels);
-
        hole = ((flags & DNODE_FIND_HOLE) != 0);
        inc = (flags & DNODE_FIND_BACKWARDS) ? -1 : 1;
        ASSERT(txg == 0 || !hole);