return (AVL_CMP(m1->ms_start, m2->ms_start));
}
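+
+/*
+ * Return the metaslab's allocated space as tracked in-core; it is
+ * initialized from the space map in metaslab_init() and kept current
+ * by metaslab_sync().
+ */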
+uint64_t
+metaslab_allocated_space(metaslab_t *msp)
+{
+ return (msp->ms_allocated_space);
+}
+
/*
* Verify that the space accounting on disk matches the in-core range_trees.
*/
-void
+static void
metaslab_verify_space(metaslab_t *msp, uint64_t txg)
{
spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
- uint64_t allocated = 0;
+ uint64_t allocating = 0;
uint64_t sm_free_space, msp_free_space;
ASSERT(MUTEX_HELD(&msp->ms_lock));
+ ASSERT(!msp->ms_condensing);
if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0)
return;
/*
* We can only verify the metaslab space when we're called
- * from syncing context with a loaded metaslab that has an allocated
- * space map. Calling this in non-syncing context does not
- * provide a consistent view of the metaslab since we're performing
- * allocations in the future.
+ * from syncing context with a loaded metaslab that has an
+ * allocated space map. Calling this in non-syncing context
+ * does not provide a consistent view of the metaslab since
+ * we're performing allocations in the future.
*/
if (txg != spa_syncing_txg(spa) || msp->ms_sm == NULL ||
!msp->ms_loaded)
return;
- sm_free_space = msp->ms_size - space_map_allocated(msp->ms_sm) -
- space_map_alloc_delta(msp->ms_sm);
+ /*
+ * Even though the smp_alloc field can get negative (e.g.
+ * see vdev_checkpoint_sm), that should never be the case
+ * when it comes to a metaslab's space map.
+ */
+ ASSERT3S(space_map_allocated(msp->ms_sm), >=, 0);
+
+ sm_free_space = msp->ms_size - metaslab_allocated_space(msp);
/*
- * Account for future allocations since we would have already
- * deducted that space from the ms_freetree.
+ * Account for future allocations since we would have
+ * already deducted that space from the ms_allocatable.
*/
for (int t = 0; t < TXG_CONCURRENT_STATES; t++) {
- allocated +=
+ allocating +=
range_tree_space(msp->ms_allocating[(txg + t) & TXG_MASK]);
}
- msp_free_space = range_tree_space(msp->ms_allocatable) + allocated +
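+ /*
+ * The ms_deferspace accounting should always match the sum of
+ * the two ms_defer trees (TXG_DEFER_SIZE is 2).
+ */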
+ ASSERT3U(msp->ms_deferspace, ==,
+ range_tree_space(msp->ms_defer[0]) +
+ range_tree_space(msp->ms_defer[1]));
+
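+ /*
+ * The in-core view of free space is everything still allocatable,
+ * plus allocations in open txgs, plus deferred frees, plus frees
+ * synced this txg; it must match the space map's view above.
+ */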
+ msp_free_space = range_tree_space(msp->ms_allocatable) + allocating +
msp->ms_deferspace + range_tree_space(msp->ms_freed);
VERIFY3U(sm_free_space, ==, msp_free_space);
ASSERT(MUTEX_HELD(&msp->ms_lock));
ASSERT(msp->ms_loading);
+ ASSERT(!msp->ms_condensing);
/*
- * Nobody else can manipulate a loading metaslab, so it's now safe
- * to drop the lock. This way we don't have to hold the lock while
- * reading the spacemap from disk.
+ * We temporarily drop the lock to unblock other operations while we
+ * are reading the space map. Therefore, metaslab_sync() and
+ * metaslab_sync_done() can run at the same time as we do.
+ *
+ * metaslab_sync() can append to the space map while we are loading.
+ * Therefore we load only entries that existed when we started the
+ * load. Additionally, metaslab_sync_done() has to wait for the load
+ * to complete because there are potential races like metaslab_load()
+ * loading parts of the space map that are currently being appended
+ * by metaslab_sync(). If it didn't wait, the ms_allocatable would have
+ * entries that metaslab_sync_done() would try to re-add later.
+ *
+ * That's why before dropping the lock we remember the synced length
+ * of the metaslab and read up to that point of the space map,
+ * ignoring entries appended by metaslab_sync() that happen after we
+ * drop the lock.
*/
+ uint64_t length = msp->ms_synced_length;
mutex_exit(&msp->ms_lock);
- /*
- * If the space map has not been allocated yet, then treat
- * all the space in the metaslab as free and add it to ms_allocatable.
- */
if (msp->ms_sm != NULL) {
- error = space_map_load(msp->ms_sm, msp->ms_allocatable,
- SM_FREE);
+ error = space_map_load_length(msp->ms_sm, msp->ms_allocatable,
+ SM_FREE, length);
} else {
+ /*
+ * The space map has not been allocated yet, so treat
+ * all the space in the metaslab as free and add it to the
+ * ms_allocatable tree.
+ */
range_tree_add(msp->ms_allocatable,
msp->ms_start, msp->ms_size);
}
+ /*
+ * We need to grab the ms_sync_lock to prevent metaslab_sync() from
+ * changing the ms_sm and the metaslab's range trees while we are
+ * about to use them and populate the ms_allocatable. The ms_lock
+ * is insufficient for this because metaslab_sync() doesn't hold
+ * the ms_lock while writing the ms_checkpointing tree to disk.
+ */
+ mutex_enter(&msp->ms_sync_lock);
mutex_enter(&msp->ms_lock);
+ ASSERT(!msp->ms_condensing);
- if (error != 0)
- return (error);
+ if (error != 0) {
+ mutex_exit(&msp->ms_sync_lock);
+ return (error);
+ }
msp->ms_loaded = B_TRUE;
/*
- * If the metaslab already has a spacemap, then we need to
- * remove all segments from the defer tree; otherwise, the
- * metaslab is completely empty and we can skip this.
+ * The ms_allocatable contains the segments that exist in the
+ * ms_defer trees [see ms_synced_length]. Thus we need to remove
+ * them from ms_allocatable as they will be added again in
+ * metaslab_sync_done().
*/
- if (msp->ms_sm != NULL) {
- for (int t = 0; t < TXG_DEFER_SIZE; t++) {
- range_tree_walk(msp->ms_defer[t],
- range_tree_remove, msp->ms_allocatable);
- }
+ for (int t = 0; t < TXG_DEFER_SIZE; t++) {
+ range_tree_walk(msp->ms_defer[t],
+ range_tree_remove, msp->ms_allocatable);
}
+
msp->ms_max_size = metaslab_block_maxsize(msp);
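+
+ /*
+ * Sanity-check the in-core accounting against the space map we
+ * just loaded, while we still hold the ms_sync_lock.
+ */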
+ spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
+ metaslab_verify_space(msp, spa_syncing_txg(spa));
+ mutex_exit(&msp->ms_sync_lock);
+
return (0);
}
if (msp->ms_loaded)
return (0);
VERIFY(!msp->ms_loading);
+ ASSERT(!msp->ms_condensing);
msp->ms_loading = B_TRUE;
int error = metaslab_load_impl(msp);
/*
* We only open space map objects that already exist. All others
* will be opened when we finally allocate an object for it.
+ *
+ * Note:
+ * When called from vdev_expand(), we can't call into the DMU as
+ * we are holding the spa_config_lock as a writer and we would
+ * deadlock [see relevant comment in vdev_metaslab_init()]. In
+ * that case, however, the object parameter is zero, so we won't
+ * call into the DMU.
*/
if (object != 0) {
error = space_map_open(&ms->ms_sm, mos, object, ms->ms_start,
ms->ms_size, vd->vdev_ashift);
}
ASSERT(ms->ms_sm != NULL);
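+
+ /*
+ * Seed the in-core allocated-space counter from the space map
+ * we just opened.
+ */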
+ ms->ms_allocated_space = space_map_allocated(ms->ms_sm);
}
/*
- * We create the main range tree here, but we don't create the
+ * We create the ms_allocatable here, but we don't create the
* other range trees until metaslab_sync_done(). This serves
* two purposes: it allows metaslab_sync_done() to detect the
- * addition of new space; and for debugging, it ensures that we'd
- * data fault on any attempt to use this metaslab before it's ready.
+ * addition of new space; and for debugging, it ensures that
+ * we'd data fault on any attempt to use this metaslab before
+ * it's ready.
*/
ms->ms_allocatable = range_tree_create_impl(&rt_avl_ops,
&ms->ms_allocatable_by_size, metaslab_rangesize_compare, 0);
* out this txg. This ensures that we don't attempt to allocate
* from it before we have initialized it completely.
*/
- if (txg <= TXG_INITIAL)
+ if (txg <= TXG_INITIAL) {
metaslab_sync_done(ms, 0);
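+
+ /*
+ * Update the class-level space accounting with whatever space
+ * is already allocated in this metaslab.
+ */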
+ metaslab_space_update(vd, mg->mg_class,
+ metaslab_allocated_space(ms), 0, 0);
+ }
/*
* If metaslab_debug_load is set and we're initializing a metaslab
mutex_enter(&msp->ms_lock);
VERIFY(msp->ms_group == NULL);
metaslab_space_update(vd, mg->mg_class,
- -space_map_allocated(msp->ms_sm), 0, -msp->ms_size);
+ -metaslab_allocated_space(msp), 0, -msp->ms_size);
space_map_close(msp->ms_sm);
};
/*
- * Calclate the metaslab's fragmentation metric. A return value
- * of ZFS_FRAG_INVALID means that the metaslab has not been upgraded and does
- * not support this metric. Otherwise, the return value should be in the
- * range [0, 100].
+ * Calculate the metaslab's fragmentation metric and set ms_fragmentation.
+ * Setting this value to ZFS_FRAG_INVALID means that the metaslab has not
+ * been upgraded and does not support this metric. Otherwise, the
+ * value should be in the range [0, 100].
*/
static void
metaslab_set_fragmentation(metaslab_t *msp)
/*
* The baseline weight is the metaslab's free space.
*/
- space = msp->ms_size - space_map_allocated(msp->ms_sm);
+ space = msp->ms_size - metaslab_allocated_space(msp);
if (metaslab_fragmentation_factor_enabled &&
msp->ms_fragmentation != ZFS_FRAG_INVALID) {
/*
* The metaslab is completely free.
*/
- if (space_map_allocated(msp->ms_sm) == 0) {
+ if (metaslab_allocated_space(msp) == 0) {
int idx = highbit64(msp->ms_size) - 1;
int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1;
/*
* If the metaslab is fully allocated then just make the weight 0.
*/
- if (space_map_allocated(msp->ms_sm) == msp->ms_size)
+ if (metaslab_allocated_space(msp) == msp->ms_size)
return (0);
/*
* If the metaslab is already loaded, then use the range tree to
*/
if (msp->ms_loaded)
msp->ms_max_size = metaslab_block_maxsize(msp);
+ else
+ ASSERT0(msp->ms_max_size);
/*
* Segment-based weighting requires space map histogram support.
VERIFY(txg <= spa_final_dirty_txg(spa));
/*
- * The only state that can actually be changing concurrently with
- * metaslab_sync() is the metaslab's ms_allocatable. No other
- * thread can be modifying this txg's alloc, freeing,
+ * The only state that can actually be changing concurrently
+ * with metaslab_sync() is the metaslab's ms_allocatable. No
+ * other thread can be modifying this txg's alloc, freeing,
* freed, or space_map_phys_t. We drop ms_lock whenever we
- * could call into the DMU, because the DMU can call down to us
- * (e.g. via zio_free()) at any time.
+ * could call into the DMU, because the DMU can call down to
+ * us (e.g. via zio_free()) at any time.
*
* The spa_vdev_remove_thread() can be reading metaslab state
- * concurrently, and it is locked out by the ms_sync_lock. Note
- * that the ms_lock is insufficient for this, because it is dropped
- * by space_map_write().
+ * concurrently, and it is locked out by the ms_sync_lock.
+ * Note that the ms_lock is insufficient for this, because it
+ * is dropped by space_map_write().
*/
tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
VERIFY0(space_map_open(&msp->ms_sm, mos, new_object,
msp->ms_start, msp->ms_size, vd->vdev_ashift));
+
ASSERT(msp->ms_sm != NULL);
+ ASSERT0(metaslab_allocated_space(msp));
}
if (!range_tree_is_empty(msp->ms_checkpointing) &&
mutex_enter(&msp->ms_lock);
}
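+
+ /*
+ * Keep the in-core allocated-space counter current: this txg's
+ * allocations increase it and this txg's frees decrease it.
+ */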
+ msp->ms_allocated_space += range_tree_space(alloctree);
+ ASSERT3U(msp->ms_allocated_space, >=,
+ range_tree_space(msp->ms_freeing));
+ msp->ms_allocated_space -= range_tree_space(msp->ms_freeing);
+
if (!range_tree_is_empty(msp->ms_checkpointing)) {
ASSERT(spa_has_checkpoint(spa));
ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL);
space_map_write(vd->vdev_checkpoint_sm,
msp->ms_checkpointing, SM_FREE, SM_NO_VDEVID, tx);
mutex_enter(&msp->ms_lock);
- space_map_update(vd->vdev_checkpoint_sm);
spa->spa_checkpoint_info.sci_dspace +=
range_tree_space(msp->ms_checkpointing);
vd->vdev_stat.vs_checkpoint_space +=
range_tree_space(msp->ms_checkpointing);
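+ /*
+ * Since the checkpoint space map only ever records frees, its
+ * allocated space is negative and mirrors vs_checkpoint_space.
+ */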
ASSERT3U(vd->vdev_stat.vs_checkpoint_space, ==,
- -vd->vdev_checkpoint_sm->sm_alloc);
+ -space_map_allocated(vd->vdev_checkpoint_sm));
range_tree_vacate(msp->ms_checkpointing, NULL, NULL);
}
/*
* For sync pass 1, we avoid traversing this txg's free range tree
- * and instead will just swap the pointers for freeing and
- * freed. We can safely do this since the freed_tree is
- * guaranteed to be empty on the initial pass.
+ * and instead will just swap the pointers for freeing and freed.
+ * We can safely do this since the ms_freed tree is guaranteed to be
+ * empty on the initial pass.
*/
if (spa_sync_pass(spa) == 1) {
range_tree_swap(&msp->ms_freeing, &msp->ms_freed);
+ ASSERT0(msp->ms_allocated_this_txg);
} else {
range_tree_vacate(msp->ms_freeing,
range_tree_add, msp->ms_freed);
}
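+
+ /*
+ * Track the space allocated this txg so that metaslab_sync_done()
+ * can compute alloc_delta without space_map_alloc_delta().
+ */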
+ msp->ms_allocated_this_txg += range_tree_space(alloctree);
range_tree_vacate(alloctree, NULL, NULL);
ASSERT0(range_tree_space(msp->ms_allocating[txg & TXG_MASK]));
}
defer_delta = 0;
- alloc_delta = space_map_alloc_delta(msp->ms_sm);
+ alloc_delta = msp->ms_allocated_this_txg -
+ range_tree_space(msp->ms_freed);
if (defer_allowed) {
defer_delta = range_tree_space(msp->ms_freed) -
range_tree_space(*defer_tree);
msp->ms_loaded ? range_tree_add : NULL,
msp->ms_allocatable);
}
- space_map_update(msp->ms_sm);
+
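+ /*
+ * Remember how much of the space map has been synced out so that
+ * metaslab_load() only reads up to this point [see
+ * metaslab_load_impl()].
+ */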
+ msp->ms_synced_length = space_map_length(msp->ms_sm);
msp->ms_deferspace += defer_delta;
ASSERT3S(msp->ms_deferspace, >=, 0);
ASSERT0(range_tree_space(msp->ms_freed));
ASSERT0(range_tree_space(msp->ms_checkpointing));
+ msp->ms_allocated_this_txg = 0;
mutex_exit(&msp->ms_lock);
}