]> git.proxmox.com Git - mirror_zfs.git/blobdiff - module/zfs/dmu_objset.c
Illumos 5056 - ZFS deadlock on db_mtx and dn_holds
[mirror_zfs.git] / module / zfs / dmu_objset.c
index a02591b989e233520b12536ff55fe18553a1b8d3..6399a0a75254425330876240547587a7703d7418 100644 (file)
@@ -23,6 +23,7 @@
  * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
  * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
  * Copyright (c) 2013, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
  */
 
 /* Portions Copyright 2010 Robert Milkowski */
@@ -347,7 +348,7 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
                            zfs_prop_to_name(ZFS_PROP_SECONDARYCACHE),
                            secondary_cache_changed_cb, os);
                }
-               if (!dsl_dataset_is_snapshot(ds)) {
+               if (!ds->ds_is_snapshot) {
                        if (err == 0) {
                                err = dsl_prop_register(ds,
                                    zfs_prop_to_name(ZFS_PROP_CHECKSUM),
@@ -404,7 +405,7 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
                os->os_secondary_cache = ZFS_CACHE_ALL;
        }
 
-       if (ds == NULL || !dsl_dataset_is_snapshot(ds))
+       if (ds == NULL || !ds->ds_is_snapshot)
                os->os_zil_header = os->os_phys->os_zil_header;
        os->os_zil = zil_alloc(os, &os->os_zil_header);
 
@@ -419,20 +420,19 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
        list_create(&os->os_downgraded_dbufs, sizeof (dmu_buf_impl_t),
            offsetof(dmu_buf_impl_t, db_link));
 
+       list_link_init(&os->os_evicting_node);
+
        mutex_init(&os->os_lock, NULL, MUTEX_DEFAULT, NULL);
        mutex_init(&os->os_obj_lock, NULL, MUTEX_DEFAULT, NULL);
        mutex_init(&os->os_user_ptr_lock, NULL, MUTEX_DEFAULT, NULL);
 
-       DMU_META_DNODE(os) = dnode_special_open(os,
-           &os->os_phys->os_meta_dnode, DMU_META_DNODE_OBJECT,
-           &os->os_meta_dnode);
+       dnode_special_open(os, &os->os_phys->os_meta_dnode,
+           DMU_META_DNODE_OBJECT, &os->os_meta_dnode);
        if (arc_buf_size(os->os_phys_buf) >= sizeof (objset_phys_t)) {
-               DMU_USERUSED_DNODE(os) = dnode_special_open(os,
-                   &os->os_phys->os_userused_dnode, DMU_USERUSED_OBJECT,
-                   &os->os_userused_dnode);
-               DMU_GROUPUSED_DNODE(os) = dnode_special_open(os,
-                   &os->os_phys->os_groupused_dnode, DMU_GROUPUSED_OBJECT,
-                   &os->os_groupused_dnode);
+               dnode_special_open(os, &os->os_phys->os_userused_dnode,
+                   DMU_USERUSED_OBJECT, &os->os_userused_dnode);
+               dnode_special_open(os, &os->os_phys->os_groupused_dnode,
+                   DMU_GROUPUSED_OBJECT, &os->os_groupused_dnode);
        }
 
        *osp = os;
@@ -520,7 +520,7 @@ dmu_objset_own(const char *name, dmu_objset_type_t type,
        } else if (type != DMU_OST_ANY && type != (*osp)->os_phys->os_type) {
                dsl_dataset_disown(ds, tag);
                return (SET_ERROR(EINVAL));
-       } else if (!readonly && dsl_dataset_is_snapshot(ds)) {
+       } else if (!readonly && ds->ds_is_snapshot) {
                dsl_dataset_disown(ds, tag);
                return (SET_ERROR(EROFS));
        }
@@ -576,41 +576,57 @@ dmu_objset_disown(objset_t *os, void *tag)
 void
 dmu_objset_evict_dbufs(objset_t *os)
 {
+       dnode_t *dn_marker;
        dnode_t *dn;
 
-       mutex_enter(&os->os_lock);
+       dn_marker = kmem_alloc(sizeof (dnode_t), KM_SLEEP);
 
-       /* process the mdn last, since the other dnodes have holds on it */
-       list_remove(&os->os_dnodes, DMU_META_DNODE(os));
-       list_insert_tail(&os->os_dnodes, DMU_META_DNODE(os));
+       mutex_enter(&os->os_lock);
+       dn = list_head(&os->os_dnodes);
+       while (dn != NULL) {
+               /*
+                * Skip dnodes without holds.  We have to do this dance
+                * because dnode_add_ref() only works if there is already a
+                * hold.  If the dnode has no holds, then it has no dbufs.
+                */
+               if (dnode_add_ref(dn, FTAG)) {
+                       list_insert_after(&os->os_dnodes, dn, dn_marker);
+                       mutex_exit(&os->os_lock);
 
-       /*
-        * Find the first dnode with holds.  We have to do this dance
-        * because dnode_add_ref() only works if you already have a
-        * hold.  If there are no holds then it has no dbufs so OK to
-        * skip.
-        */
-       for (dn = list_head(&os->os_dnodes);
-           dn && !dnode_add_ref(dn, FTAG);
-           dn = list_next(&os->os_dnodes, dn))
-               continue;
+                       dnode_evict_dbufs(dn);
+                       dnode_rele(dn, FTAG);
 
-       while (dn) {
-               dnode_t *next_dn = dn;
+                       mutex_enter(&os->os_lock);
+                       dn = list_next(&os->os_dnodes, dn_marker);
+                       list_remove(&os->os_dnodes, dn_marker);
+               } else {
+                       dn = list_next(&os->os_dnodes, dn);
+               }
+       }
+       mutex_exit(&os->os_lock);
 
-               do {
-                       next_dn = list_next(&os->os_dnodes, next_dn);
-               } while (next_dn && !dnode_add_ref(next_dn, FTAG));
+       kmem_free(dn_marker, sizeof (dnode_t));
 
-               mutex_exit(&os->os_lock);
-               dnode_evict_dbufs(dn);
-               dnode_rele(dn, FTAG);
-               mutex_enter(&os->os_lock);
-               dn = next_dn;
+       if (DMU_USERUSED_DNODE(os) != NULL) {
+               dnode_evict_dbufs(DMU_GROUPUSED_DNODE(os));
+               dnode_evict_dbufs(DMU_USERUSED_DNODE(os));
        }
-       mutex_exit(&os->os_lock);
+       dnode_evict_dbufs(DMU_META_DNODE(os));
 }
 
+/*
+ * Objset eviction processing is split into into two pieces.
+ * The first marks the objset as evicting, evicts any dbufs that
+ * have a refcount of zero, and then queues up the objset for the
+ * second phase of eviction.  Once os->os_dnodes has been cleared by
+ * dnode_buf_pageout()->dnode_destroy(), the second phase is executed.
+ * The second phase closes the special dnodes, dequeues the objset from
+ * the list of those undergoing eviction, and finally frees the objset.
+ *
+ * NOTE: Due to asynchronous eviction processing (invocation of
+ *       dnode_buf_pageout()), it is possible for the meta dnode for the
+ *       objset to have no holds even though os->os_dnodes is not empty.
+ */
 void
 dmu_objset_evict(objset_t *os)
 {
@@ -622,7 +638,7 @@ dmu_objset_evict(objset_t *os)
                ASSERT(!dmu_objset_is_dirty(os, t));
 
        if (ds) {
-               if (!dsl_dataset_is_snapshot(ds)) {
+               if (!ds->ds_is_snapshot) {
                        VERIFY0(dsl_prop_unregister(ds,
                            zfs_prop_to_name(ZFS_PROP_CHECKSUM),
                            checksum_changed_cb, os));
@@ -656,8 +672,24 @@ dmu_objset_evict(objset_t *os)
        if (os->os_sa)
                sa_tear_down(os);
 
+       os->os_evicting = B_TRUE;
        dmu_objset_evict_dbufs(os);
 
+       mutex_enter(&os->os_lock);
+       spa_evicting_os_register(os->os_spa, os);
+       if (list_is_empty(&os->os_dnodes)) {
+               mutex_exit(&os->os_lock);
+               dmu_objset_evict_done(os);
+       } else {
+               mutex_exit(&os->os_lock);
+       }
+}
+
+void
+dmu_objset_evict_done(objset_t *os)
+{
+       ASSERT3P(list_head(&os->os_dnodes), ==, NULL);
+
        dnode_special_close(&os->os_meta_dnode);
        if (DMU_USERUSED_DNODE(os)) {
                dnode_special_close(&os->os_userused_dnode);
@@ -665,8 +697,6 @@ dmu_objset_evict(objset_t *os)
        }
        zil_free(os->os_zil);
 
-       ASSERT3P(list_head(&os->os_dnodes), ==, NULL);
-
        VERIFY(arc_buf_remove_ref(os->os_phys_buf, &os->os_phys_buf));
 
        /*
@@ -681,6 +711,7 @@ dmu_objset_evict(objset_t *os)
        mutex_destroy(&os->os_lock);
        mutex_destroy(&os->os_obj_lock);
        mutex_destroy(&os->os_user_ptr_lock);
+       spa_evicting_os_deregister(os->os_spa, os);
        kmem_free(os, sizeof (objset_t));
 }
 
@@ -888,7 +919,7 @@ dmu_objset_clone_check(void *arg, dmu_tx_t *tx)
        }
 
        /* You can only clone snapshots, not the head datasets. */
-       if (!dsl_dataset_is_snapshot(origin)) {
+       if (!origin->ds_is_snapshot) {
                dsl_dataset_rele(origin, FTAG);
                return (SET_ERROR(EINVAL));
        }
@@ -1453,7 +1484,7 @@ int
 dmu_objset_is_snapshot(objset_t *os)
 {
        if (os->os_dsl_dataset != NULL)
-               return (dsl_dataset_is_snapshot(os->os_dsl_dataset));
+               return (os->os_dsl_dataset->ds_is_snapshot);
        else
                return (B_FALSE);
 }