/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2011, 2016 by Delphix. All rights reserved.
 * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
 */
#include <sys/zfs_context.h>
#include <sys/dmu.h>
#include <sys/dmu_tx.h>
#include <sys/space_map.h>
#include <sys/metaslab_impl.h>
#include <sys/vdev_impl.h>
#include <sys/zio.h>
#include <sys/spa_impl.h>
#include <sys/zfeature.h>
#include <sys/vdev_indirect_mapping.h>
#define	WITH_DF_BLOCK_ALLOCATOR

#define	GANG_ALLOCATION(flags) \
	((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER))
/*
 * Metaslab granularity, in bytes. This is roughly similar to what would be
 * referred to as the "stripe size" in traditional RAID arrays. In normal
 * operation, we will try to write this amount of data to a top-level vdev
 * before moving on to the next one.
 */
unsigned long metaslab_aliquot = 512 << 10;
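/*
 * Illustration (assuming the default above): with metaslab_aliquot at 512K,
 * the allocator writes roughly 512K to a top-level vdev before the rotor
 * advances to the next one; metaslab_group_activate() scales this by the
 * number of children, so a wider top-level vdev receives a proportionally
 * larger aliquot.
 */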
/* force gang blocks */
unsigned long metaslab_gang_bang = SPA_MAXBLOCKSIZE + 1;
/*
 * The in-core space map representation is more compact than its on-disk form.
 * The zfs_condense_pct determines how much more compact the in-core
 * space map representation must be before we compact it on-disk.
 * Values should be greater than or equal to 100.
 */
int zfs_condense_pct = 200;
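/*
 * Rough worked example: with zfs_condense_pct = 200, an in-core tree whose
 * minimal on-disk form would occupy 1MB is only condensed once the existing
 * on-disk space map grows past 200/100 * 1MB = 2MB.
 */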
/*
 * Condensing a metaslab is not guaranteed to actually reduce the amount of
 * space used on disk. In particular, a space map uses data in increments of
 * MAX(1 << ashift, space_map_blksz), so a metaslab might use the
 * same number of blocks after condensing. Since the goal of condensing is to
 * reduce the number of IOPs required to read the space map, we only want to
 * condense when we can be sure we will reduce the number of blocks used by the
 * space map. Unfortunately, we cannot precisely compute whether or not this is
 * the case in metaslab_should_condense since we are holding ms_lock. Instead,
 * we apply the following heuristic: do not condense a spacemap unless the
 * uncondensed size consumes greater than zfs_metaslab_condense_block_threshold
 * space map blocks.
 */
int zfs_metaslab_condense_block_threshold = 4;
/*
 * The zfs_mg_noalloc_threshold defines which metaslab groups should
 * be eligible for allocation. The value is defined as a percentage of
 * free space. Metaslab groups that have more free space than
 * zfs_mg_noalloc_threshold are always eligible for allocations. Once
 * a metaslab group's free space is less than or equal to the
 * zfs_mg_noalloc_threshold the allocator will avoid allocating to that
 * group unless all groups in the pool have reached zfs_mg_noalloc_threshold.
 * Once all groups in the pool reach zfs_mg_noalloc_threshold then all
 * groups are allowed to accept allocations. Gang blocks are always
 * eligible to allocate on any metaslab group. The default value of 0 means
 * no metaslab group will be excluded based on this criterion.
 */
int zfs_mg_noalloc_threshold = 0;
/*
 * Metaslab groups are considered eligible for allocations if their
 * fragmentation metric (measured as a percentage) is less than or equal to
 * zfs_mg_fragmentation_threshold. If a metaslab group exceeds this threshold
 * then it will be skipped unless all metaslab groups within the metaslab
 * class have also crossed this threshold.
 */
int zfs_mg_fragmentation_threshold = 85;
/*
 * Allow metaslabs to keep their active state as long as their fragmentation
 * percentage is less than or equal to zfs_metaslab_fragmentation_threshold. An
 * active metaslab that exceeds this threshold will no longer keep its active
 * status allowing better metaslabs to be selected.
 */
int zfs_metaslab_fragmentation_threshold = 70;
/*
 * When set will load all metaslabs when pool is first opened.
 */
int metaslab_debug_load = 0;

/*
 * When set will prevent metaslabs from being unloaded.
 */
int metaslab_debug_unload = 0;
/*
 * Minimum size which forces the dynamic allocator to change
 * its allocation strategy. Once the space map cannot satisfy
 * an allocation of this size then it switches to using more
 * aggressive strategy (i.e. search by size rather than offset).
 */
uint64_t metaslab_df_alloc_threshold = SPA_OLD_MAXBLOCKSIZE;
/*
 * The minimum free space, in percent, which must be available
 * in a space map to continue allocations in a first-fit fashion.
 * Once the space map's free space drops below this level we dynamically
 * switch to using best-fit allocations.
 */
int metaslab_df_free_pct = 4;
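/*
 * A minimal sketch of how the two tunables above combine in
 * metaslab_df_alloc() below, assuming the defaults: first-fit is abandoned
 * in favor of the size-sorted (best-fit) tree as soon as the largest free
 * segment drops below SPA_OLD_MAXBLOCKSIZE (128K) or less than 4% of the
 * metaslab remains free.
 */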
/*
 * Percentage of all cpus that can be used by the metaslab taskq.
 */
int metaslab_load_pct = 50;
/*
 * Determines how many txgs a metaslab may remain loaded without having any
 * allocations from it. As long as a metaslab continues to be used we will
 * keep it loaded.
 */
int metaslab_unload_delay = TXG_SIZE * 2;
/*
 * Max number of metaslabs per group to preload.
 */
int metaslab_preload_limit = SPA_DVAS_PER_BP;
/*
 * Enable/disable preloading of metaslabs.
 */
int metaslab_preload_enabled = B_TRUE;
/*
 * Enable/disable fragmentation weighting on metaslabs.
 */
int metaslab_fragmentation_factor_enabled = B_TRUE;

/*
 * Enable/disable lba weighting (i.e. outer tracks are given preference).
 */
int metaslab_lba_weighting_enabled = B_TRUE;

/*
 * Enable/disable metaslab group biasing.
 */
int metaslab_bias_enabled = B_TRUE;
/*
 * Enable/disable remapping of indirect DVAs to their concrete vdevs.
 */
boolean_t zfs_remap_blkptr_enable = B_TRUE;
/*
 * Enable/disable segment-based metaslab selection.
 */
int zfs_metaslab_segment_weight_enabled = B_TRUE;
/*
 * When using segment-based metaslab selection, we will continue
 * allocating from the active metaslab until we have exhausted
 * zfs_metaslab_switch_threshold of its buckets.
 */
int zfs_metaslab_switch_threshold = 2;
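/*
 * Example: a metaslab activated with its weight index at 2^17 (128K
 * segments) and the default threshold of 2 is proactively passivated by
 * metaslab_segment_may_passivate() once its recomputed index falls to
 * 2^15 (32K) or below.
 */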
/*
 * Internal switch to enable/disable the metaslab allocation tracing
 * facility.
 */
#ifdef _METASLAB_TRACING
boolean_t metaslab_trace_enabled = B_TRUE;
#endif
/*
 * Maximum entries that the metaslab allocation tracing facility will keep
 * in a given list when running in non-debug mode. We limit the number
 * of entries in non-debug mode to prevent us from using up too much memory.
 * The limit should be sufficiently large that we don't expect any allocation
 * to ever exceed this value. In debug mode, the system will panic if this
 * limit is ever reached allowing for further investigation.
 */
#ifdef _METASLAB_TRACING
uint64_t metaslab_trace_max_entries = 5000;
#endif
static uint64_t metaslab_weight(metaslab_t *);
static void metaslab_set_fragmentation(metaslab_t *);
static void metaslab_free_impl(vdev_t *, uint64_t, uint64_t, uint64_t);
static void metaslab_check_free_impl(vdev_t *, uint64_t, uint64_t);
#ifdef _METASLAB_TRACING
kmem_cache_t *metaslab_alloc_trace_cache;
#endif
/*
 * ==========================================================================
 * Metaslab classes
 * ==========================================================================
 */
metaslab_class_t *
metaslab_class_create(spa_t *spa, metaslab_ops_t *ops)
{
	metaslab_class_t *mc;

	mc = kmem_zalloc(sizeof (metaslab_class_t), KM_SLEEP);

	mc->mc_spa = spa;
	mc->mc_rotor = NULL;
	mc->mc_ops = ops;
	mutex_init(&mc->mc_lock, NULL, MUTEX_DEFAULT, NULL);
	refcount_create_tracked(&mc->mc_alloc_slots);

	return (mc);
}
void
metaslab_class_destroy(metaslab_class_t *mc)
{
	ASSERT(mc->mc_rotor == NULL);
	ASSERT(mc->mc_alloc == 0);
	ASSERT(mc->mc_deferred == 0);
	ASSERT(mc->mc_space == 0);
	ASSERT(mc->mc_dspace == 0);

	refcount_destroy(&mc->mc_alloc_slots);
	mutex_destroy(&mc->mc_lock);
	kmem_free(mc, sizeof (metaslab_class_t));
}
int
metaslab_class_validate(metaslab_class_t *mc)
{
	metaslab_group_t *mg;
	vdev_t *vd;

	/*
	 * Must hold one of the spa_config locks.
	 */
	ASSERT(spa_config_held(mc->mc_spa, SCL_ALL, RW_READER) ||
	    spa_config_held(mc->mc_spa, SCL_ALL, RW_WRITER));

	if ((mg = mc->mc_rotor) == NULL)
		return (0);

	do {
		vd = mg->mg_vd;
		ASSERT(vd->vdev_mg != NULL);
		ASSERT3P(vd->vdev_top, ==, vd);
		ASSERT3P(mg->mg_class, ==, mc);
		ASSERT3P(vd->vdev_ops, !=, &vdev_hole_ops);
	} while ((mg = mg->mg_next) != mc->mc_rotor);

	return (0);
}
void
metaslab_class_space_update(metaslab_class_t *mc, int64_t alloc_delta,
    int64_t defer_delta, int64_t space_delta, int64_t dspace_delta)
{
	atomic_add_64(&mc->mc_alloc, alloc_delta);
	atomic_add_64(&mc->mc_deferred, defer_delta);
	atomic_add_64(&mc->mc_space, space_delta);
	atomic_add_64(&mc->mc_dspace, dspace_delta);
}
uint64_t
metaslab_class_get_alloc(metaslab_class_t *mc)
{
	return (mc->mc_alloc);
}

uint64_t
metaslab_class_get_deferred(metaslab_class_t *mc)
{
	return (mc->mc_deferred);
}

uint64_t
metaslab_class_get_space(metaslab_class_t *mc)
{
	return (mc->mc_space);
}

uint64_t
metaslab_class_get_dspace(metaslab_class_t *mc)
{
	return (spa_deflate(mc->mc_spa) ? mc->mc_dspace : mc->mc_space);
}
void
metaslab_class_histogram_verify(metaslab_class_t *mc)
{
	vdev_t *rvd = mc->mc_spa->spa_root_vdev;
	uint64_t *mc_hist;
	int i;

	if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0)
		return;

	mc_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE,
	    KM_SLEEP);

	for (int c = 0; c < rvd->vdev_children; c++) {
		vdev_t *tvd = rvd->vdev_child[c];
		metaslab_group_t *mg = tvd->vdev_mg;

		/*
		 * Skip any holes, uninitialized top-levels, or
		 * vdevs that are not in this metaslab class.
		 */
		if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 ||
		    mg->mg_class != mc) {
			continue;
		}

		for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
			mc_hist[i] += mg->mg_histogram[i];
	}

	for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
		VERIFY3U(mc_hist[i], ==, mc->mc_histogram[i]);

	kmem_free(mc_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE);
}
/*
 * Calculate the metaslab class's fragmentation metric. The metric
 * is weighted based on the space contribution of each metaslab group.
 * The return value will be a number between 0 and 100 (inclusive), or
 * ZFS_FRAG_INVALID if the metric has not been set. See comment above the
 * zfs_frag_table for more information about the metric.
 */
uint64_t
metaslab_class_fragmentation(metaslab_class_t *mc)
{
	vdev_t *rvd = mc->mc_spa->spa_root_vdev;
	uint64_t fragmentation = 0;

	spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER);

	for (int c = 0; c < rvd->vdev_children; c++) {
		vdev_t *tvd = rvd->vdev_child[c];
		metaslab_group_t *mg = tvd->vdev_mg;

		/*
		 * Skip any holes, uninitialized top-levels,
		 * or vdevs that are not in this metaslab class.
		 */
		if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 ||
		    mg->mg_class != mc) {
			continue;
		}

		/*
		 * If a metaslab group does not contain a fragmentation
		 * metric then just bail out.
		 */
		if (mg->mg_fragmentation == ZFS_FRAG_INVALID) {
			spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
			return (ZFS_FRAG_INVALID);
		}

		/*
		 * Determine how much this metaslab_group is contributing
		 * to the overall pool fragmentation metric.
		 */
		fragmentation += mg->mg_fragmentation *
		    metaslab_group_get_space(mg);
	}
	fragmentation /= metaslab_class_get_space(mc);

	ASSERT3U(fragmentation, <=, 100);
	spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
	return (fragmentation);
}
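/*
 * Worked example of the weighting above: a 10TB group at 30% fragmentation
 * and a 5TB group at 60% fragmentation yield
 * (30 * 10T + 60 * 5T) / 15T = 40% for the class as a whole.
 */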
/*
 * Calculate the amount of expandable space that is available in
 * this metaslab class. If a device is expanded then its expandable
 * space will be the amount of allocatable space that is currently not
 * part of this metaslab class.
 */
uint64_t
metaslab_class_expandable_space(metaslab_class_t *mc)
{
	vdev_t *rvd = mc->mc_spa->spa_root_vdev;
	uint64_t space = 0;

	spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER);
	for (int c = 0; c < rvd->vdev_children; c++) {
		vdev_t *tvd = rvd->vdev_child[c];
		metaslab_group_t *mg = tvd->vdev_mg;

		if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 ||
		    mg->mg_class != mc) {
			continue;
		}

		/*
		 * Calculate if we have enough space to add additional
		 * metaslabs. We report the expandable space in terms
		 * of the metaslab size since that's the unit of expansion.
		 */
		space += P2ALIGN(tvd->vdev_max_asize - tvd->vdev_asize,
		    1ULL << tvd->vdev_ms_shift);
	}
	spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
	return (space);
}
static int
metaslab_compare(const void *x1, const void *x2)
{
	const metaslab_t *m1 = (const metaslab_t *)x1;
	const metaslab_t *m2 = (const metaslab_t *)x2;

	int cmp = AVL_CMP(m2->ms_weight, m1->ms_weight);
	if (likely(cmp))
		return (cmp);

	IMPLY(AVL_CMP(m1->ms_start, m2->ms_start) == 0, m1 == m2);

	return (AVL_CMP(m1->ms_start, m2->ms_start));
}
/*
 * Verify that the space accounting on disk matches the in-core range_trees.
 */
void
metaslab_verify_space(metaslab_t *msp, uint64_t txg)
{
	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
	uint64_t allocated = 0;
	uint64_t sm_free_space, msp_free_space;

	ASSERT(MUTEX_HELD(&msp->ms_lock));

	if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0)
		return;

	/*
	 * We can only verify the metaslab space when we're called
	 * from syncing context with a loaded metaslab that has an allocated
	 * space map. Calling this in non-syncing context does not
	 * provide a consistent view of the metaslab since we're performing
	 * allocations in the future.
	 */
	if (txg != spa_syncing_txg(spa) || msp->ms_sm == NULL ||
	    !msp->ms_loaded)
		return;

	sm_free_space = msp->ms_size - space_map_allocated(msp->ms_sm) -
	    space_map_alloc_delta(msp->ms_sm);

	/*
	 * Account for future allocations since we would have already
	 * deducted that space from the ms_freetree.
	 */
	for (int t = 0; t < TXG_CONCURRENT_STATES; t++) {
		allocated +=
		    range_tree_space(msp->ms_alloctree[(txg + t) & TXG_MASK]);
	}

	msp_free_space = range_tree_space(msp->ms_tree) + allocated +
	    msp->ms_deferspace + range_tree_space(msp->ms_freedtree);

	VERIFY3U(sm_free_space, ==, msp_free_space);
}
/*
 * ==========================================================================
 * Metaslab groups
 * ==========================================================================
 */
/*
 * Update the allocatable flag and the metaslab group's capacity.
 * The allocatable flag is set to true if the capacity is above
 * the zfs_mg_noalloc_threshold and the fragmentation value is not
 * greater than zfs_mg_fragmentation_threshold. If a metaslab group
 * transitions from allocatable to non-allocatable or vice versa then the
 * metaslab group's class is updated to reflect the transition.
 */
static void
metaslab_group_alloc_update(metaslab_group_t *mg)
{
	vdev_t *vd = mg->mg_vd;
	metaslab_class_t *mc = mg->mg_class;
	vdev_stat_t *vs = &vd->vdev_stat;
	boolean_t was_allocatable;
	boolean_t was_initialized;

	ASSERT(vd == vd->vdev_top);
	ASSERT3U(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_READER), ==,
	    SCL_ALLOC);

	mutex_enter(&mg->mg_lock);
	was_allocatable = mg->mg_allocatable;
	was_initialized = mg->mg_initialized;

	mg->mg_free_capacity = ((vs->vs_space - vs->vs_alloc) * 100) /
	    (vs->vs_space + 1);

	mutex_enter(&mc->mc_lock);

	/*
	 * If the metaslab group was just added then it won't
	 * have any space until we finish syncing out this txg.
	 * At that point we will consider it initialized and available
	 * for allocations. We also don't consider non-activated
	 * metaslab groups (e.g. vdevs that are in the middle of being removed)
	 * to be initialized, because they can't be used for allocation.
	 */
	mg->mg_initialized = metaslab_group_initialized(mg);
	if (!was_initialized && mg->mg_initialized) {
		mc->mc_groups++;
	} else if (was_initialized && !mg->mg_initialized) {
		ASSERT3U(mc->mc_groups, >, 0);
		mc->mc_groups--;
	}
	if (mg->mg_initialized)
		mg->mg_no_free_space = B_FALSE;

	/*
	 * A metaslab group is considered allocatable if it has plenty
	 * of free space or is not heavily fragmented. We only take
	 * fragmentation into account if the metaslab group has a valid
	 * fragmentation metric (i.e. a value between 0 and 100).
	 */
	mg->mg_allocatable = (mg->mg_activation_count > 0 &&
	    mg->mg_free_capacity > zfs_mg_noalloc_threshold &&
	    (mg->mg_fragmentation == ZFS_FRAG_INVALID ||
	    mg->mg_fragmentation <= zfs_mg_fragmentation_threshold));

	/*
	 * The mc_alloc_groups maintains a count of the number of
	 * groups in this metaslab class that are still above the
	 * zfs_mg_noalloc_threshold. This is used by the allocating
	 * threads to determine if they should avoid allocations to
	 * a given group. The allocator will avoid allocations to a group
	 * if that group has reached or is below the zfs_mg_noalloc_threshold
	 * and there are still other groups that are above the threshold.
	 * When a group transitions from allocatable to non-allocatable or
	 * vice versa we update the metaslab class to reflect that change.
	 * When the mc_alloc_groups value drops to 0 that means that all
	 * groups have reached the zfs_mg_noalloc_threshold making all groups
	 * eligible for allocations. This effectively means that all devices
	 * are balanced again.
	 */
	if (was_allocatable && !mg->mg_allocatable)
		mc->mc_alloc_groups--;
	else if (!was_allocatable && mg->mg_allocatable)
		mc->mc_alloc_groups++;
	mutex_exit(&mc->mc_lock);

	mutex_exit(&mg->mg_lock);
}
metaslab_group_t *
metaslab_group_create(metaslab_class_t *mc, vdev_t *vd)
{
	metaslab_group_t *mg;

	mg = kmem_zalloc(sizeof (metaslab_group_t), KM_SLEEP);
	mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL);
	avl_create(&mg->mg_metaslab_tree, metaslab_compare,
	    sizeof (metaslab_t), offsetof(struct metaslab, ms_group_node));
	mg->mg_vd = vd;
	mg->mg_class = mc;
	mg->mg_activation_count = 0;
	mg->mg_initialized = B_FALSE;
	mg->mg_no_free_space = B_TRUE;
	refcount_create_tracked(&mg->mg_alloc_queue_depth);

	mg->mg_taskq = taskq_create("metaslab_group_taskq", metaslab_load_pct,
	    maxclsyspri, 10, INT_MAX, TASKQ_THREADS_CPU_PCT | TASKQ_DYNAMIC);

	return (mg);
}
void
metaslab_group_destroy(metaslab_group_t *mg)
{
	ASSERT(mg->mg_prev == NULL);
	ASSERT(mg->mg_next == NULL);
	/*
	 * We may have gone below zero with the activation count
	 * either because we never activated in the first place or
	 * because we're done, and possibly removing the vdev.
	 */
	ASSERT(mg->mg_activation_count <= 0);

	taskq_destroy(mg->mg_taskq);
	avl_destroy(&mg->mg_metaslab_tree);
	mutex_destroy(&mg->mg_lock);
	refcount_destroy(&mg->mg_alloc_queue_depth);
	kmem_free(mg, sizeof (metaslab_group_t));
}
void
metaslab_group_activate(metaslab_group_t *mg)
{
	metaslab_class_t *mc = mg->mg_class;
	metaslab_group_t *mgprev, *mgnext;

	ASSERT3U(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER), !=, 0);

	ASSERT(mc->mc_rotor != mg);
	ASSERT(mg->mg_prev == NULL);
	ASSERT(mg->mg_next == NULL);
	ASSERT(mg->mg_activation_count <= 0);

	if (++mg->mg_activation_count <= 0)
		return;

	mg->mg_aliquot = metaslab_aliquot * MAX(1, mg->mg_vd->vdev_children);
	metaslab_group_alloc_update(mg);

	if ((mgprev = mc->mc_rotor) == NULL) {
		mg->mg_prev = mg;
		mg->mg_next = mg;
	} else {
		mgnext = mgprev->mg_next;
		mg->mg_prev = mgprev;
		mg->mg_next = mgnext;
		mgprev->mg_next = mg;
		mgnext->mg_prev = mg;
	}
	mc->mc_rotor = mg;
}
/*
 * Passivate a metaslab group and remove it from the allocation rotor.
 * Callers must hold both the SCL_ALLOC and SCL_ZIO lock prior to passivating
 * a metaslab group. This function will momentarily drop spa_config_locks
 * that are lower than the SCL_ALLOC lock (see comment below).
 */
void
metaslab_group_passivate(metaslab_group_t *mg)
{
	metaslab_class_t *mc = mg->mg_class;
	spa_t *spa = mc->mc_spa;
	metaslab_group_t *mgprev, *mgnext;
	int locks = spa_config_held(spa, SCL_ALL, RW_WRITER);

	ASSERT3U(spa_config_held(spa, SCL_ALLOC | SCL_ZIO, RW_WRITER), ==,
	    (SCL_ALLOC | SCL_ZIO));

	if (--mg->mg_activation_count != 0) {
		ASSERT(mc->mc_rotor != mg);
		ASSERT(mg->mg_prev == NULL);
		ASSERT(mg->mg_next == NULL);
		ASSERT(mg->mg_activation_count < 0);
		return;
	}

	/*
	 * The spa_config_lock is an array of rwlocks, ordered as
	 * follows (from highest to lowest):
	 *	SCL_CONFIG > SCL_STATE > SCL_L2ARC > SCL_ALLOC >
	 *	SCL_ZIO > SCL_FREE > SCL_VDEV
	 * (For more information about the spa_config_lock see spa_misc.c)
	 * The higher the lock, the broader its coverage. When we passivate
	 * a metaslab group, we must hold both the SCL_ALLOC and the SCL_ZIO
	 * config locks. However, the metaslab group's taskq might be trying
	 * to preload metaslabs so we must drop the SCL_ZIO lock and any
	 * lower locks to allow the I/O to complete. At a minimum,
	 * we continue to hold the SCL_ALLOC lock, which prevents any future
	 * allocations from taking place and any changes to the vdev tree.
	 */
	spa_config_exit(spa, locks & ~(SCL_ZIO - 1), spa);
	taskq_wait_outstanding(mg->mg_taskq, 0);
	spa_config_enter(spa, locks & ~(SCL_ZIO - 1), spa, RW_WRITER);
	metaslab_group_alloc_update(mg);

	mgprev = mg->mg_prev;
	mgnext = mg->mg_next;

	if (mg == mgnext) {
		mc->mc_rotor = NULL;
	} else {
		mc->mc_rotor = mgnext;
		mgprev->mg_next = mgnext;
		mgnext->mg_prev = mgprev;
	}

	mg->mg_prev = NULL;
	mg->mg_next = NULL;
}
boolean_t
metaslab_group_initialized(metaslab_group_t *mg)
{
	vdev_t *vd = mg->mg_vd;
	vdev_stat_t *vs = &vd->vdev_stat;

	return (vs->vs_space != 0 && mg->mg_activation_count > 0);
}
uint64_t
metaslab_group_get_space(metaslab_group_t *mg)
{
	return ((1ULL << mg->mg_vd->vdev_ms_shift) * mg->mg_vd->vdev_ms_count);
}
void
metaslab_group_histogram_verify(metaslab_group_t *mg)
{
	uint64_t *mg_hist;
	vdev_t *vd = mg->mg_vd;
	uint64_t ashift = vd->vdev_ashift;
	int i;

	if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0)
		return;

	mg_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE,
	    KM_SLEEP);

	ASSERT3U(RANGE_TREE_HISTOGRAM_SIZE, >=,
	    SPACE_MAP_HISTOGRAM_SIZE + ashift);

	for (int m = 0; m < vd->vdev_ms_count; m++) {
		metaslab_t *msp = vd->vdev_ms[m];

		if (msp->ms_sm == NULL)
			continue;

		for (i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++)
			mg_hist[i + ashift] +=
			    msp->ms_sm->sm_phys->smp_histogram[i];
	}

	for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
		VERIFY3U(mg_hist[i], ==, mg->mg_histogram[i]);

	kmem_free(mg_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE);
}
static void
metaslab_group_histogram_add(metaslab_group_t *mg, metaslab_t *msp)
{
	metaslab_class_t *mc = mg->mg_class;
	uint64_t ashift = mg->mg_vd->vdev_ashift;

	ASSERT(MUTEX_HELD(&msp->ms_lock));
	if (msp->ms_sm == NULL)
		return;

	mutex_enter(&mg->mg_lock);
	for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
		mg->mg_histogram[i + ashift] +=
		    msp->ms_sm->sm_phys->smp_histogram[i];
		mc->mc_histogram[i + ashift] +=
		    msp->ms_sm->sm_phys->smp_histogram[i];
	}
	mutex_exit(&mg->mg_lock);
}
static void
metaslab_group_histogram_remove(metaslab_group_t *mg, metaslab_t *msp)
{
	metaslab_class_t *mc = mg->mg_class;
	uint64_t ashift = mg->mg_vd->vdev_ashift;

	ASSERT(MUTEX_HELD(&msp->ms_lock));
	if (msp->ms_sm == NULL)
		return;

	mutex_enter(&mg->mg_lock);
	for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
		ASSERT3U(mg->mg_histogram[i + ashift], >=,
		    msp->ms_sm->sm_phys->smp_histogram[i]);
		ASSERT3U(mc->mc_histogram[i + ashift], >=,
		    msp->ms_sm->sm_phys->smp_histogram[i]);

		mg->mg_histogram[i + ashift] -=
		    msp->ms_sm->sm_phys->smp_histogram[i];
		mc->mc_histogram[i + ashift] -=
		    msp->ms_sm->sm_phys->smp_histogram[i];
	}
	mutex_exit(&mg->mg_lock);
}
static void
metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp)
{
	ASSERT(msp->ms_group == NULL);
	mutex_enter(&mg->mg_lock);
	msp->ms_group = mg;
	msp->ms_weight = 0;
	avl_add(&mg->mg_metaslab_tree, msp);
	mutex_exit(&mg->mg_lock);

	mutex_enter(&msp->ms_lock);
	metaslab_group_histogram_add(mg, msp);
	mutex_exit(&msp->ms_lock);
}
static void
metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp)
{
	mutex_enter(&msp->ms_lock);
	metaslab_group_histogram_remove(mg, msp);
	mutex_exit(&msp->ms_lock);

	mutex_enter(&mg->mg_lock);
	ASSERT(msp->ms_group == mg);
	avl_remove(&mg->mg_metaslab_tree, msp);
	msp->ms_group = NULL;
	mutex_exit(&mg->mg_lock);
}
static void
metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
{
	/*
	 * Although in principle the weight can be any value, in
	 * practice we do not use values in the range [1, 511].
	 */
	ASSERT(weight >= SPA_MINBLOCKSIZE || weight == 0);
	ASSERT(MUTEX_HELD(&msp->ms_lock));

	mutex_enter(&mg->mg_lock);
	ASSERT(msp->ms_group == mg);
	avl_remove(&mg->mg_metaslab_tree, msp);
	msp->ms_weight = weight;
	avl_add(&mg->mg_metaslab_tree, msp);
	mutex_exit(&mg->mg_lock);
}
/*
 * Calculate the fragmentation for a given metaslab group. We can use
 * a simple average here since all metaslabs within the group must have
 * the same size. The return value will be a value between 0 and 100
 * (inclusive), or ZFS_FRAG_INVALID if less than half of the metaslabs in this
 * group have a fragmentation metric.
 */
static uint64_t
metaslab_group_fragmentation(metaslab_group_t *mg)
{
	vdev_t *vd = mg->mg_vd;
	uint64_t fragmentation = 0;
	uint64_t valid_ms = 0;

	for (int m = 0; m < vd->vdev_ms_count; m++) {
		metaslab_t *msp = vd->vdev_ms[m];

		if (msp->ms_fragmentation == ZFS_FRAG_INVALID)
			continue;

		valid_ms++;
		fragmentation += msp->ms_fragmentation;
	}

	if (valid_ms <= vd->vdev_ms_count / 2)
		return (ZFS_FRAG_INVALID);

	fragmentation /= valid_ms;
	ASSERT3U(fragmentation, <=, 100);
	return (fragmentation);
}
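/*
 * Example: in a group of four metaslabs reporting 10%, 20%, 30% and
 * ZFS_FRAG_INVALID, valid_ms is 3 (more than half of vdev_ms_count), so the
 * group reports (10 + 20 + 30) / 3 = 20%.
 */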
/*
 * Determine if a given metaslab group should skip allocations. A metaslab
 * group should avoid allocations if its free capacity is less than the
 * zfs_mg_noalloc_threshold or its fragmentation metric is greater than
 * zfs_mg_fragmentation_threshold and there is at least one metaslab group
 * that can still handle allocations. If the allocation throttle is enabled
 * then we skip allocations to devices that have reached their maximum
 * allocation queue depth unless the selected metaslab group is the only
 * eligible group remaining.
 */
static boolean_t
metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor,
    uint64_t psize)
{
	spa_t *spa = mg->mg_vd->vdev_spa;
	metaslab_class_t *mc = mg->mg_class;

	/*
	 * We can only consider skipping this metaslab group if it's
	 * in the normal metaslab class and there are other metaslab
	 * groups to select from. Otherwise, we always consider it eligible
	 * for allocations.
	 */
	if (mc != spa_normal_class(spa) || mc->mc_groups <= 1)
		return (B_TRUE);

	/*
	 * If the metaslab group's mg_allocatable flag is set (see comments
	 * in metaslab_group_alloc_update() for more information) and
	 * the allocation throttle is disabled then allow allocations to this
	 * device. However, if the allocation throttle is enabled then
	 * check if we have reached our allocation limit (mg_alloc_queue_depth)
	 * to determine if we should allow allocations to this metaslab group.
	 * If all metaslab groups are no longer considered allocatable
	 * (mc_alloc_groups == 0) or we're trying to allocate the smallest
	 * gang block size then we allow allocations on this metaslab group
	 * regardless of the mg_allocatable or throttle settings.
	 */
	if (mg->mg_allocatable) {
		metaslab_group_t *mgp;
		int64_t qdepth;
		uint64_t qmax = mg->mg_max_alloc_queue_depth;

		if (!mc->mc_alloc_throttle_enabled)
			return (B_TRUE);

		/*
		 * If this metaslab group does not have any free space, then
		 * there is no point in looking further.
		 */
		if (mg->mg_no_free_space)
			return (B_FALSE);

		qdepth = refcount_count(&mg->mg_alloc_queue_depth);

		/*
		 * If this metaslab group is below its qmax or it's
		 * the only allocatable metaslab group, then attempt
		 * to allocate from it.
		 */
		if (qdepth < qmax || mc->mc_alloc_groups == 1)
			return (B_TRUE);
		ASSERT3U(mc->mc_alloc_groups, >, 1);

		/*
		 * Since this metaslab group is at or over its qmax, we
		 * need to determine if there are metaslab groups after this
		 * one that might be able to handle this allocation. This is
		 * racy since we can't hold the locks for all metaslab
		 * groups at the same time when we make this check.
		 */
		for (mgp = mg->mg_next; mgp != rotor; mgp = mgp->mg_next) {
			qmax = mgp->mg_max_alloc_queue_depth;

			qdepth = refcount_count(&mgp->mg_alloc_queue_depth);

			/*
			 * If there is another metaslab group that
			 * might be able to handle the allocation, then
			 * we return false so that we skip this group.
			 */
			if (qdepth < qmax && !mgp->mg_no_free_space)
				return (B_FALSE);
		}

		/*
		 * We didn't find another group to handle the allocation
		 * so we can't skip this metaslab group even though
		 * we are at or over our qmax.
		 */
		return (B_TRUE);

	} else if (mc->mc_alloc_groups == 0 || psize == SPA_MINBLOCKSIZE) {
		return (B_TRUE);
	}
	return (B_FALSE);
}
/*
 * ==========================================================================
 * Range tree callbacks
 * ==========================================================================
 */

/*
 * Comparison function for the private size-ordered tree. Tree is sorted
 * by size, larger sizes at the end of the tree.
 */
static int
metaslab_rangesize_compare(const void *x1, const void *x2)
{
	const range_seg_t *r1 = x1;
	const range_seg_t *r2 = x2;
	uint64_t rs_size1 = r1->rs_end - r1->rs_start;
	uint64_t rs_size2 = r2->rs_end - r2->rs_start;

	int cmp = AVL_CMP(rs_size1, rs_size2);
	if (likely(cmp))
		return (cmp);

	return (AVL_CMP(r1->rs_start, r2->rs_start));
}
/*
 * ==========================================================================
 * Common allocator routines
 * ==========================================================================
 */

/*
 * Return the maximum contiguous segment within the metaslab.
 */
uint64_t
metaslab_block_maxsize(metaslab_t *msp)
{
	avl_tree_t *t = &msp->ms_size_tree;
	range_seg_t *rs;

	if (t == NULL || (rs = avl_last(t)) == NULL)
		return (0ULL);

	return (rs->rs_end - rs->rs_start);
}
static range_seg_t *
metaslab_block_find(avl_tree_t *t, uint64_t start, uint64_t size)
{
	range_seg_t *rs, rsearch;
	avl_index_t where;

	rsearch.rs_start = start;
	rsearch.rs_end = start + size;

	rs = avl_find(t, &rsearch, &where);
	if (rs == NULL) {
		rs = avl_nearest(t, where, AVL_AFTER);
	}

	return (rs);
}
#if defined(WITH_FF_BLOCK_ALLOCATOR) || \
    defined(WITH_DF_BLOCK_ALLOCATOR) || \
    defined(WITH_CF_BLOCK_ALLOCATOR)
/*
 * This is a helper function that can be used by the allocator to find
 * a suitable block to allocate. This will search the specified AVL
 * tree looking for a block that matches the specified criteria.
 */
static uint64_t
metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size,
    uint64_t align)
{
	range_seg_t *rs = metaslab_block_find(t, *cursor, size);

	while (rs != NULL) {
		uint64_t offset = P2ROUNDUP(rs->rs_start, align);

		if (offset + size <= rs->rs_end) {
			*cursor = offset + size;
			return (offset);
		}
		rs = AVL_NEXT(t, rs);
	}

	/*
	 * If we know we've searched the whole map (*cursor == 0), give up.
	 * Otherwise, reset the cursor to the beginning and try again.
	 */
	if (*cursor == 0)
		return (-1ULL);

	*cursor = 0;
	return (metaslab_block_picker(t, cursor, size, align));
}
#endif /* WITH_FF/DF/CF_BLOCK_ALLOCATOR */
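/*
 * Sketch of the picker's cursor behavior with illustrative numbers: for
 * size = 0x2000 and align = 0x2000, the walk starts at the first free
 * segment at or after *cursor, rounds each rs_start up to a 0x2000
 * boundary, and returns the first aligned offset that still fits; the
 * cursor is then advanced past the allocation so the next call resumes
 * the scan from there rather than from the start of the metaslab.
 */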
#if defined(WITH_FF_BLOCK_ALLOCATOR)
/*
 * ==========================================================================
 * The first-fit block allocator
 * ==========================================================================
 */
static uint64_t
metaslab_ff_alloc(metaslab_t *msp, uint64_t size)
{
	/*
	 * Find the largest power of 2 block size that evenly divides the
	 * requested size. This is used to try to allocate blocks with similar
	 * alignment from the same area of the metaslab (i.e. same cursor
	 * bucket), though it does not prevent other allocation sizes from
	 * being used in the same region.
	 */
	uint64_t align = size & -size;
	uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1];
	avl_tree_t *t = &msp->ms_tree->rt_root;

	return (metaslab_block_picker(t, cursor, size, align));
}

static metaslab_ops_t metaslab_ff_ops = {
	metaslab_ff_alloc
};

metaslab_ops_t *zfs_metaslab_ops = &metaslab_ff_ops;
#endif /* WITH_FF_BLOCK_ALLOCATOR */
#if defined(WITH_DF_BLOCK_ALLOCATOR)
/*
 * ==========================================================================
 * Dynamic block allocator -
 * Uses the first fit allocation scheme until space gets low and then
 * adjusts to a best fit allocation method. Uses metaslab_df_alloc_threshold
 * and metaslab_df_free_pct to determine when to switch the allocation scheme.
 * ==========================================================================
 */
static uint64_t
metaslab_df_alloc(metaslab_t *msp, uint64_t size)
{
	/*
	 * Find the largest power of 2 block size that evenly divides the
	 * requested size. This is used to try to allocate blocks with similar
	 * alignment from the same area of the metaslab (i.e. same cursor
	 * bucket), though it does not prevent other allocation sizes from
	 * being used in the same region.
	 */
	uint64_t align = size & -size;
	uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1];
	range_tree_t *rt = msp->ms_tree;
	avl_tree_t *t = &rt->rt_root;
	uint64_t max_size = metaslab_block_maxsize(msp);
	int free_pct = range_tree_space(rt) * 100 / msp->ms_size;

	ASSERT(MUTEX_HELD(&msp->ms_lock));
	ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&msp->ms_size_tree));

	if (max_size < size)
		return (-1ULL);

	/*
	 * If we're running low on space switch to using the size
	 * sorted AVL tree (best-fit).
	 */
	if (max_size < metaslab_df_alloc_threshold ||
	    free_pct < metaslab_df_free_pct) {
		t = &msp->ms_size_tree;
		*cursor = 0;
	}

	return (metaslab_block_picker(t, cursor, size, 1ULL));
}

static metaslab_ops_t metaslab_df_ops = {
	metaslab_df_alloc
};

metaslab_ops_t *zfs_metaslab_ops = &metaslab_df_ops;
#endif /* WITH_DF_BLOCK_ALLOCATOR */
#if defined(WITH_CF_BLOCK_ALLOCATOR)
/*
 * ==========================================================================
 * Cursor fit block allocator -
 * Select the largest region in the metaslab, set the cursor to the beginning
 * of the range and the cursor_end to the end of the range. As allocations
 * are made advance the cursor. Continue allocating from the cursor until
 * the range is exhausted and then find a new range.
 * ==========================================================================
 */
static uint64_t
metaslab_cf_alloc(metaslab_t *msp, uint64_t size)
{
	range_tree_t *rt = msp->ms_tree;
	avl_tree_t *t = &msp->ms_size_tree;
	uint64_t *cursor = &msp->ms_lbas[0];
	uint64_t *cursor_end = &msp->ms_lbas[1];
	uint64_t offset = 0;

	ASSERT(MUTEX_HELD(&msp->ms_lock));
	ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&rt->rt_root));

	ASSERT3U(*cursor_end, >=, *cursor);

	if ((*cursor + size) > *cursor_end) {
		range_seg_t *rs;

		rs = avl_last(&msp->ms_size_tree);
		if (rs == NULL || (rs->rs_end - rs->rs_start) < size)
			return (-1ULL);

		*cursor = rs->rs_start;
		*cursor_end = rs->rs_end;
	}

	offset = *cursor;
	*cursor += size;

	return (offset);
}

static metaslab_ops_t metaslab_cf_ops = {
	metaslab_cf_alloc
};

metaslab_ops_t *zfs_metaslab_ops = &metaslab_cf_ops;
#endif /* WITH_CF_BLOCK_ALLOCATOR */
#if defined(WITH_NDF_BLOCK_ALLOCATOR)
/*
 * ==========================================================================
 * New dynamic fit allocator -
 * Select a region that is large enough to allocate 2^metaslab_ndf_clump_shift
 * contiguous blocks. If no region is found then just use the largest segment
 * that remains.
 * ==========================================================================
 */

/*
 * Determines desired number of contiguous blocks (2^metaslab_ndf_clump_shift)
 * to request from the allocator.
 */
uint64_t metaslab_ndf_clump_shift = 4;

static uint64_t
metaslab_ndf_alloc(metaslab_t *msp, uint64_t size)
{
	avl_tree_t *t = &msp->ms_tree->rt_root;
	avl_index_t where;
	range_seg_t *rs, rsearch;
	uint64_t hbit = highbit64(size);
	uint64_t *cursor = &msp->ms_lbas[hbit - 1];
	uint64_t max_size = metaslab_block_maxsize(msp);

	ASSERT(MUTEX_HELD(&msp->ms_lock));
	ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&msp->ms_size_tree));

	if (max_size < size)
		return (-1ULL);

	rsearch.rs_start = *cursor;
	rsearch.rs_end = *cursor + size;

	rs = avl_find(t, &rsearch, &where);
	if (rs == NULL || (rs->rs_end - rs->rs_start) < size) {
		t = &msp->ms_size_tree;

		rsearch.rs_start = 0;
		rsearch.rs_end = MIN(max_size,
		    1ULL << (hbit + metaslab_ndf_clump_shift));
		rs = avl_find(t, &rsearch, &where);
		if (rs == NULL)
			rs = avl_nearest(t, where, AVL_AFTER);
		ASSERT(rs != NULL);
	}

	if ((rs->rs_end - rs->rs_start) >= size) {
		*cursor = rs->rs_start + size;
		return (rs->rs_start);
	}
	return (-1ULL);
}

static metaslab_ops_t metaslab_ndf_ops = {
	metaslab_ndf_alloc
};

metaslab_ops_t *zfs_metaslab_ops = &metaslab_ndf_ops;
#endif /* WITH_NDF_BLOCK_ALLOCATOR */
/*
 * ==========================================================================
 * Metaslabs
 * ==========================================================================
 */

/*
 * Wait for any in-progress metaslab loads to complete.
 */
static void
metaslab_load_wait(metaslab_t *msp)
{
	ASSERT(MUTEX_HELD(&msp->ms_lock));

	while (msp->ms_loading) {
		ASSERT(!msp->ms_loaded);
		cv_wait(&msp->ms_load_cv, &msp->ms_lock);
	}
}
int
metaslab_load(metaslab_t *msp)
{
	int error = 0;
	boolean_t success = B_FALSE;

	ASSERT(MUTEX_HELD(&msp->ms_lock));
	ASSERT(!msp->ms_loaded);
	ASSERT(!msp->ms_loading);

	msp->ms_loading = B_TRUE;
	/*
	 * Nobody else can manipulate a loading metaslab, so it's now safe
	 * to drop the lock. This way we don't have to hold the lock while
	 * reading the spacemap from disk.
	 */
	mutex_exit(&msp->ms_lock);

	/*
	 * If the space map has not been allocated yet, then treat
	 * all the space in the metaslab as free and add it to the
	 * ms_tree.
	 */
	if (msp->ms_sm != NULL)
		error = space_map_load(msp->ms_sm, msp->ms_tree, SM_FREE);
	else
		range_tree_add(msp->ms_tree, msp->ms_start, msp->ms_size);

	success = (error == 0);

	mutex_enter(&msp->ms_lock);
	msp->ms_loading = B_FALSE;

	if (success) {
		ASSERT3P(msp->ms_group, !=, NULL);
		msp->ms_loaded = B_TRUE;

		for (int t = 0; t < TXG_DEFER_SIZE; t++) {
			range_tree_walk(msp->ms_defertree[t],
			    range_tree_remove, msp->ms_tree);
		}
		msp->ms_max_size = metaslab_block_maxsize(msp);
	}
	cv_broadcast(&msp->ms_load_cv);
	return (error);
}
void
metaslab_unload(metaslab_t *msp)
{
	ASSERT(MUTEX_HELD(&msp->ms_lock));
	range_tree_vacate(msp->ms_tree, NULL, NULL);
	msp->ms_loaded = B_FALSE;

	msp->ms_weight &= ~METASLAB_ACTIVE_MASK;
	msp->ms_max_size = 0;
}
int
metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg,
    metaslab_t **msp)
{
	vdev_t *vd = mg->mg_vd;
	objset_t *mos = vd->vdev_spa->spa_meta_objset;
	metaslab_t *ms;
	int error;

	ms = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP);
	mutex_init(&ms->ms_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&ms->ms_sync_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&ms->ms_load_cv, NULL, CV_DEFAULT, NULL);
	ms->ms_id = id;
	ms->ms_start = id << vd->vdev_ms_shift;
	ms->ms_size = 1ULL << vd->vdev_ms_shift;

	/*
	 * We only open space map objects that already exist. All others
	 * will be opened when we finally allocate an object for it.
	 */
	if (object != 0) {
		error = space_map_open(&ms->ms_sm, mos, object, ms->ms_start,
		    ms->ms_size, vd->vdev_ashift);

		if (error != 0) {
			kmem_free(ms, sizeof (metaslab_t));
			return (error);
		}

		ASSERT(ms->ms_sm != NULL);
	}

	/*
	 * We create the main range tree here, but we don't create the
	 * other range trees until metaslab_sync_done(). This serves
	 * two purposes: it allows metaslab_sync_done() to detect the
	 * addition of new space; and for debugging, it ensures that we'd
	 * data fault on any attempt to use this metaslab before it's ready.
	 */
	ms->ms_tree = range_tree_create_impl(&rt_avl_ops, &ms->ms_size_tree,
	    metaslab_rangesize_compare, 0);
	metaslab_group_add(mg, ms);

	metaslab_set_fragmentation(ms);

	/*
	 * If we're opening an existing pool (txg == 0) or creating
	 * a new one (txg == TXG_INITIAL), all space is available now.
	 * If we're adding space to an existing pool, the new space
	 * does not become available until after this txg has synced.
	 * The metaslab's weight will also be initialized when we sync
	 * out this txg. This ensures that we don't attempt to allocate
	 * from it before we have initialized it completely.
	 */
	if (txg <= TXG_INITIAL)
		metaslab_sync_done(ms, 0);

	/*
	 * If metaslab_debug_load is set and we're initializing a metaslab
	 * that has an allocated space map object then load its space map
	 * so that we can verify frees.
	 */
	if (metaslab_debug_load && ms->ms_sm != NULL) {
		mutex_enter(&ms->ms_lock);
		VERIFY0(metaslab_load(ms));
		mutex_exit(&ms->ms_lock);
	}

	if (txg != 0) {
		vdev_dirty(vd, 0, NULL, txg);
		vdev_dirty(vd, VDD_METASLAB, ms, txg);
	}

	*msp = ms;

	return (0);
}
void
metaslab_fini(metaslab_t *msp)
{
	metaslab_group_t *mg = msp->ms_group;

	metaslab_group_remove(mg, msp);

	mutex_enter(&msp->ms_lock);
	VERIFY(msp->ms_group == NULL);
	vdev_space_update(mg->mg_vd, -space_map_allocated(msp->ms_sm),
	    0, -msp->ms_size);
	space_map_close(msp->ms_sm);

	metaslab_unload(msp);
	range_tree_destroy(msp->ms_tree);
	range_tree_destroy(msp->ms_freeingtree);
	range_tree_destroy(msp->ms_freedtree);

	for (int t = 0; t < TXG_SIZE; t++) {
		range_tree_destroy(msp->ms_alloctree[t]);
	}

	for (int t = 0; t < TXG_DEFER_SIZE; t++) {
		range_tree_destroy(msp->ms_defertree[t]);
	}

	ASSERT0(msp->ms_deferspace);

	mutex_exit(&msp->ms_lock);
	cv_destroy(&msp->ms_load_cv);
	mutex_destroy(&msp->ms_lock);
	mutex_destroy(&msp->ms_sync_lock);

	kmem_free(msp, sizeof (metaslab_t));
}
#define	FRAGMENTATION_TABLE_SIZE	17

/*
 * This table defines a segment size based fragmentation metric that will
 * allow each metaslab to derive its own fragmentation value. This is done
 * by calculating the space in each bucket of the spacemap histogram and
 * multiplying that by the fragmentation metric in this table. Doing
 * this for all buckets and dividing it by the total amount of free
 * space in this metaslab (i.e. the total free space in all buckets) gives
 * us the fragmentation metric. This means that a high fragmentation metric
 * equates to most of the free space being comprised of small segments.
 * Conversely, if the metric is low, then most of the free space is in
 * large segments. A 10% change in fragmentation equates to approximately
 * double the number of segments.
 *
 * This table defines 0% fragmented space using 16MB segments. Testing has
 * shown that segments that are greater than or equal to 16MB do not suffer
 * from drastic performance problems. Using this value, we derive the rest
 * of the table. Since the fragmentation value is never stored on disk, it
 * is possible to change these calculations in the future.
 */
int zfs_frag_table[FRAGMENTATION_TABLE_SIZE] = {
	100,	/* 512B	*/
	100,	/* 1K	*/
	98,	/* 2K	*/
	95,	/* 4K	*/
	90,	/* 8K	*/
	80,	/* 16K	*/
	70,	/* 32K	*/
	60,	/* 64K	*/
	50,	/* 128K	*/
	40,	/* 256K	*/
	30,	/* 512K	*/
	20,	/* 1M	*/
	15,	/* 2M	*/
	10,	/* 4M	*/
	5,	/* 8M	*/
	0,	/* 16M	*/
	0	/* 32M	*/
};
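/*
 * Worked example of the metric, using the factors in the table above: if
 * half of a metaslab's free space sits in 8K segments (factor 90) and half
 * in 1M segments (factor 20), its fragmentation is
 * (0.5 * 90) + (0.5 * 20) = 55%.
 */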
/*
 * Calculate the metaslab's fragmentation metric. A return value
 * of ZFS_FRAG_INVALID means that the metaslab has not been upgraded and does
 * not support this metric. Otherwise, the return value should be in the
 * range [0, 100].
 */
static void
metaslab_set_fragmentation(metaslab_t *msp)
{
	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
	uint64_t fragmentation = 0;
	uint64_t total = 0;
	boolean_t feature_enabled = spa_feature_is_enabled(spa,
	    SPA_FEATURE_SPACEMAP_HISTOGRAM);

	if (!feature_enabled) {
		msp->ms_fragmentation = ZFS_FRAG_INVALID;
		return;
	}

	/*
	 * A null space map means that the entire metaslab is free
	 * and thus is not fragmented.
	 */
	if (msp->ms_sm == NULL) {
		msp->ms_fragmentation = 0;
		return;
	}

	/*
	 * If this metaslab's space map has not been upgraded, flag it
	 * so that we upgrade next time we encounter it.
	 */
	if (msp->ms_sm->sm_dbuf->db_size != sizeof (space_map_phys_t)) {
		uint64_t txg = spa_syncing_txg(spa);
		vdev_t *vd = msp->ms_group->mg_vd;

		/*
		 * If we've reached the final dirty txg, then we must
		 * be shutting down the pool. We don't want to dirty
		 * any data past this point so skip setting the condense
		 * flag. We can retry this action the next time the pool
		 * is imported.
		 */
		if (spa_writeable(spa) && txg < spa_final_dirty_txg(spa)) {
			msp->ms_condense_wanted = B_TRUE;
			vdev_dirty(vd, VDD_METASLAB, msp, txg + 1);
			spa_dbgmsg(spa, "txg %llu, requesting force condense: "
			    "ms_id %llu, vdev_id %llu", txg, msp->ms_id,
			    vd->vdev_id);
		}
		msp->ms_fragmentation = ZFS_FRAG_INVALID;
		return;
	}

	for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
		uint64_t space = 0;
		uint8_t shift = msp->ms_sm->sm_shift;

		int idx = MIN(shift - SPA_MINBLOCKSHIFT + i,
		    FRAGMENTATION_TABLE_SIZE - 1);

		if (msp->ms_sm->sm_phys->smp_histogram[i] == 0)
			continue;

		space = msp->ms_sm->sm_phys->smp_histogram[i] << (i + shift);
		total += space;

		ASSERT3U(idx, <, FRAGMENTATION_TABLE_SIZE);
		fragmentation += space * zfs_frag_table[idx];
	}

	if (total > 0)
		fragmentation /= total;
	ASSERT3U(fragmentation, <=, 100);

	msp->ms_fragmentation = fragmentation;
}
/*
 * Compute a weight -- a selection preference value -- for the given metaslab.
 * This is based on the amount of free space, the level of fragmentation,
 * the LBA range, and whether the metaslab is loaded.
 */
static uint64_t
metaslab_space_weight(metaslab_t *msp)
{
	metaslab_group_t *mg = msp->ms_group;
	vdev_t *vd = mg->mg_vd;
	uint64_t weight, space;

	ASSERT(MUTEX_HELD(&msp->ms_lock));
	ASSERT(!vd->vdev_removing);

	/*
	 * The baseline weight is the metaslab's free space.
	 */
	space = msp->ms_size - space_map_allocated(msp->ms_sm);

	if (metaslab_fragmentation_factor_enabled &&
	    msp->ms_fragmentation != ZFS_FRAG_INVALID) {
		/*
		 * Use the fragmentation information to inversely scale
		 * down the baseline weight. We need to ensure that we
		 * don't exclude this metaslab completely when it's 100%
		 * fragmented. To avoid this we reduce the fragmented value
		 * by 1.
		 */
		space = (space * (100 - (msp->ms_fragmentation - 1))) / 100;

		/*
		 * If space < SPA_MINBLOCKSIZE, then we will not allocate from
		 * this metaslab again. The fragmentation metric may have
		 * decreased the space to something smaller than
		 * SPA_MINBLOCKSIZE, so reset the space to SPA_MINBLOCKSIZE
		 * so that we can consume any remaining space.
		 */
		if (space > 0 && space < SPA_MINBLOCKSIZE)
			space = SPA_MINBLOCKSIZE;
	}
	weight = space;

	/*
	 * Modern disks have uniform bit density and constant angular velocity.
	 * Therefore, the outer recording zones are faster (higher bandwidth)
	 * than the inner zones by the ratio of outer to inner track diameter,
	 * which is typically around 2:1. We account for this by assigning
	 * higher weight to lower metaslabs (multiplier ranging from 2x to 1x).
	 * In effect, this means that we'll select the metaslab with the most
	 * free bandwidth rather than simply the one with the most free space.
	 */
	if (!vd->vdev_nonrot && metaslab_lba_weighting_enabled) {
		weight = 2 * weight - (msp->ms_id * weight) / vd->vdev_ms_count;
		ASSERT(weight >= space && weight <= 2 * space);
	}

	/*
	 * If this metaslab is one we're actively using, adjust its
	 * weight to make it preferable to any inactive metaslab so
	 * we'll polish it off. If the fragmentation on this metaslab
	 * has exceeded our threshold, then don't mark it active.
	 */
	if (msp->ms_loaded && msp->ms_fragmentation != ZFS_FRAG_INVALID &&
	    msp->ms_fragmentation <= zfs_metaslab_fragmentation_threshold) {
		weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK);
	}

	WEIGHT_SET_SPACEBASED(weight);
	return (weight);
}
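/*
 * To illustrate the LBA adjustment above on a 100-metaslab rotational vdev:
 * metaslab 0 (outermost) is weighted at 2x its free space, metaslab 50 at
 * roughly 1.5x, and the innermost approaches 1x.
 */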
/*
 * Return the weight of the specified metaslab, according to the segment-based
 * weighting algorithm. The metaslab must be loaded. This function can
 * be called within a sync pass since it relies only on the metaslab's
 * range tree which is always accurate when the metaslab is loaded.
 */
static uint64_t
metaslab_weight_from_range_tree(metaslab_t *msp)
{
	uint64_t weight = 0;
	uint32_t segments = 0;

	ASSERT(msp->ms_loaded);

	for (int i = RANGE_TREE_HISTOGRAM_SIZE - 1; i >= SPA_MINBLOCKSHIFT;
	    i--) {
		uint8_t shift = msp->ms_group->mg_vd->vdev_ashift;
		int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1;

		segments <<= 1;
		segments += msp->ms_tree->rt_histogram[i];

		/*
		 * The range tree provides more precision than the space map
		 * and must be downgraded so that all values fit within the
		 * space map's histogram. This allows us to compare loaded
		 * vs. unloaded metaslabs to determine which metaslab is
		 * considered "best".
		 */
		if (i > max_idx)
			continue;

		if (segments != 0) {
			WEIGHT_SET_COUNT(weight, segments);
			WEIGHT_SET_INDEX(weight, i);
			WEIGHT_SET_ACTIVE(weight, 0);
			break;
		}
	}
	return (weight);
}
/*
 * Calculate the weight based on the on-disk histogram. This should only
 * be called after a sync pass has completely finished since the on-disk
 * information is updated in metaslab_sync().
 */
static uint64_t
metaslab_weight_from_spacemap(metaslab_t *msp)
{
	uint64_t weight = 0;

	for (int i = SPACE_MAP_HISTOGRAM_SIZE - 1; i >= 0; i--) {
		if (msp->ms_sm->sm_phys->smp_histogram[i] != 0) {
			WEIGHT_SET_COUNT(weight,
			    msp->ms_sm->sm_phys->smp_histogram[i]);
			WEIGHT_SET_INDEX(weight, i +
			    msp->ms_sm->sm_shift);
			WEIGHT_SET_ACTIVE(weight, 0);
			break;
		}
	}
	return (weight);
}
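/*
 * For instance, if the highest non-empty on-disk histogram bucket is i = 7
 * with 12 segments and sm_shift = 9, the weight encodes a count of 12 at
 * index 16, i.e. free segments in the range [2^16, 2^17).
 */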
/*
 * Compute a segment-based weight for the specified metaslab. The weight
 * is determined by highest bucket in the histogram. The information
 * for the highest bucket is encoded into the weight value.
 */
static uint64_t
metaslab_segment_weight(metaslab_t *msp)
{
	metaslab_group_t *mg = msp->ms_group;
	uint64_t weight = 0;
	uint8_t shift = mg->mg_vd->vdev_ashift;

	ASSERT(MUTEX_HELD(&msp->ms_lock));

	/*
	 * The metaslab is completely free.
	 */
	if (space_map_allocated(msp->ms_sm) == 0) {
		int idx = highbit64(msp->ms_size) - 1;
		int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1;

		if (idx < max_idx) {
			WEIGHT_SET_COUNT(weight, 1ULL);
			WEIGHT_SET_INDEX(weight, idx);
		} else {
			WEIGHT_SET_COUNT(weight, 1ULL << (idx - max_idx));
			WEIGHT_SET_INDEX(weight, max_idx);
		}
		WEIGHT_SET_ACTIVE(weight, 0);
		ASSERT(!WEIGHT_IS_SPACEBASED(weight));

		return (weight);
	}

	ASSERT3U(msp->ms_sm->sm_dbuf->db_size, ==, sizeof (space_map_phys_t));

	/*
	 * If the metaslab is fully allocated then just make the weight 0.
	 */
	if (space_map_allocated(msp->ms_sm) == msp->ms_size)
		return (0);
	/*
	 * If the metaslab is already loaded, then use the range tree to
	 * determine the weight. Otherwise, we rely on the space map information
	 * to generate the weight.
	 */
	if (msp->ms_loaded) {
		weight = metaslab_weight_from_range_tree(msp);
	} else {
		weight = metaslab_weight_from_spacemap(msp);
	}

	/*
	 * If the metaslab was active the last time we calculated its weight
	 * then keep it active. We want to consume the entire region that
	 * is associated with this weight.
	 */
	if (msp->ms_activation_weight != 0 && weight != 0)
		WEIGHT_SET_ACTIVE(weight, WEIGHT_GET_ACTIVE(msp->ms_weight));
	return (weight);
}
/*
 * Determine if we should attempt to allocate from this metaslab. If the
 * metaslab has a maximum size then we can quickly determine if the desired
 * allocation size can be satisfied. Otherwise, if we're using segment-based
 * weighting then we can determine the maximum allocation that this metaslab
 * can accommodate based on the index encoded in the weight. If we're using
 * space-based weights then rely on the entire weight (excluding the weight
 * type bit).
 */
static boolean_t
metaslab_should_allocate(metaslab_t *msp, uint64_t asize)
{
	boolean_t should_allocate;

	if (msp->ms_max_size != 0)
		return (msp->ms_max_size >= asize);

	if (!WEIGHT_IS_SPACEBASED(msp->ms_weight)) {
		/*
		 * The metaslab segment weight indicates segments in the
		 * range [2^i, 2^(i+1)), where i is the index in the weight.
		 * Since the asize might be in the middle of the range, we
		 * should attempt the allocation if asize < 2^(i+1).
		 */
		should_allocate = (asize <
		    1ULL << (WEIGHT_GET_INDEX(msp->ms_weight) + 1));
	} else {
		should_allocate = (asize <=
		    (msp->ms_weight & ~METASLAB_WEIGHT_TYPE));
	}
	return (should_allocate);
}
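/*
 * Example: a segment-based weight with index 16 advertises segments in
 * [2^16, 2^17), so an asize of 96K (< 128K) is worth attempting while an
 * asize of 128K is not.
 */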
static uint64_t
metaslab_weight(metaslab_t *msp)
{
	vdev_t *vd = msp->ms_group->mg_vd;
	spa_t *spa = vd->vdev_spa;
	uint64_t weight;

	ASSERT(MUTEX_HELD(&msp->ms_lock));

	/*
	 * If this vdev is in the process of being removed, there is nothing
	 * for us to do here.
	 */
	if (vd->vdev_removing)
		return (0);

	metaslab_set_fragmentation(msp);

	/*
	 * Update the maximum size if the metaslab is loaded. This will
	 * ensure that we get an accurate maximum size if newly freed space
	 * has been added back into the free tree.
	 */
	if (msp->ms_loaded)
		msp->ms_max_size = metaslab_block_maxsize(msp);

	/*
	 * Segment-based weighting requires space map histogram support.
	 */
	if (zfs_metaslab_segment_weight_enabled &&
	    spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM) &&
	    (msp->ms_sm == NULL || msp->ms_sm->sm_dbuf->db_size ==
	    sizeof (space_map_phys_t))) {
		weight = metaslab_segment_weight(msp);
	} else {
		weight = metaslab_space_weight(msp);
	}
	return (weight);
}
static int
metaslab_activate(metaslab_t *msp, uint64_t activation_weight)
{
	ASSERT(MUTEX_HELD(&msp->ms_lock));

	if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) {
		metaslab_load_wait(msp);
		if (!msp->ms_loaded) {
			int error = metaslab_load(msp);
			if (error) {
				metaslab_group_sort(msp->ms_group, msp, 0);
				return (error);
			}
		}

		msp->ms_activation_weight = msp->ms_weight;
		metaslab_group_sort(msp->ms_group, msp,
		    msp->ms_weight | activation_weight);
	}
	ASSERT(msp->ms_loaded);
	ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);

	return (0);
}
static void
metaslab_passivate(metaslab_t *msp, uint64_t weight)
{
	ASSERTV(uint64_t size = weight & ~METASLAB_WEIGHT_TYPE);

	/*
	 * If size < SPA_MINBLOCKSIZE, then we will not allocate from
	 * this metaslab again. In that case, it had better be empty,
	 * or we would be leaving space on the table.
	 */
	ASSERT(!WEIGHT_IS_SPACEBASED(msp->ms_weight) ||
	    size >= SPA_MINBLOCKSIZE ||
	    range_tree_space(msp->ms_tree) == 0);
	ASSERT0(weight & METASLAB_ACTIVE_MASK);

	msp->ms_activation_weight = 0;
	metaslab_group_sort(msp->ms_group, msp, weight);
	ASSERT((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0);
}
/*
 * Segment-based metaslabs are activated once and remain active until
 * we either fail an allocation attempt (similar to space-based metaslabs)
 * or have exhausted the free space in zfs_metaslab_switch_threshold
 * buckets since the metaslab was activated. This function checks to see
 * if we've exhausted the zfs_metaslab_switch_threshold buckets in the
 * metaslab and passivates it proactively. This will allow us to select a
 * metaslab with a larger contiguous region, if any, remaining within this
 * metaslab group. If we're in sync pass > 1, then we continue using this
 * metaslab so that we don't dirty more blocks and cause more sync passes.
 */
static void
metaslab_segment_may_passivate(metaslab_t *msp)
{
	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;

	if (WEIGHT_IS_SPACEBASED(msp->ms_weight) || spa_sync_pass(spa) > 1)
		return;

	/*
	 * Since we are in the middle of a sync pass, the most accurate
	 * information that is accessible to us is the in-core range tree
	 * histogram; calculate the new weight based on that information.
	 */
	uint64_t weight = metaslab_weight_from_range_tree(msp);
	int activation_idx = WEIGHT_GET_INDEX(msp->ms_activation_weight);
	int current_idx = WEIGHT_GET_INDEX(weight);

	if (current_idx <= activation_idx - zfs_metaslab_switch_threshold)
		metaslab_passivate(msp, weight);
}
static void
metaslab_preload(void *arg)
{
	metaslab_t *msp = arg;
	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
	fstrans_cookie_t cookie = spl_fstrans_mark();

	ASSERT(!MUTEX_HELD(&msp->ms_group->mg_lock));

	mutex_enter(&msp->ms_lock);
	metaslab_load_wait(msp);
	if (!msp->ms_loaded)
		(void) metaslab_load(msp);
	msp->ms_selected_txg = spa_syncing_txg(spa);
	mutex_exit(&msp->ms_lock);
	spl_fstrans_unmark(cookie);
}
static void
metaslab_group_preload(metaslab_group_t *mg)
{
	spa_t *spa = mg->mg_vd->vdev_spa;
	metaslab_t *msp;
	avl_tree_t *t = &mg->mg_metaslab_tree;
	int m = 0;

	if (spa_shutting_down(spa) || !metaslab_preload_enabled) {
		taskq_wait_outstanding(mg->mg_taskq, 0);
		return;
	}

	mutex_enter(&mg->mg_lock);

	/*
	 * Load the next potential metaslabs
	 */
	for (msp = avl_first(t); msp != NULL; msp = AVL_NEXT(t, msp)) {
		ASSERT3P(msp->ms_group, ==, mg);

		/*
		 * We preload only the maximum number of metaslabs specified
		 * by metaslab_preload_limit. If a metaslab is being forced
		 * to condense then we preload it too. This will ensure
		 * that force condensing happens in the next txg.
		 */
		if (++m > metaslab_preload_limit && !msp->ms_condense_wanted) {
			continue;
		}

		VERIFY(taskq_dispatch(mg->mg_taskq, metaslab_preload,
		    msp, TQ_SLEEP) != TASKQID_INVALID);
	}
	mutex_exit(&mg->mg_lock);
}

/*
 * Determine if the space map's on-disk footprint is past our tolerance
 * for inefficiency. We would like to use the following criteria to make
 * our decision:
 *
 * 1. The size of the space map object should not dramatically increase as a
 *    result of writing out the free space range tree.
 *
 * 2. The minimal on-disk space map representation is no more than
 *    zfs_condense_pct/100 times the size of the free space range tree
 *    representation (i.e. zfs_condense_pct = 110 and in-core = 1MB,
 *    minimal = 1.1MB).
 *
 * 3. The on-disk size of the space map should actually decrease.
 *
 * Checking the first condition is tricky since we don't want to walk
 * the entire AVL tree calculating the estimated on-disk size. Instead we
 * use the size-ordered range tree in the metaslab and calculate the
 * size required to write out the largest segment in our free tree. If the
 * size required to represent that segment on disk is larger than the space
 * map object then we avoid condensing this map.
 *
 * To determine the second criterion we use a best-case estimate and assume
 * each segment can be represented on-disk as a single 64-bit entry. We refer
 * to this best-case estimate as the space map's minimal form.
 *
 * Unfortunately, we cannot compute the on-disk size of the space map in this
 * context because we cannot accurately compute the effects of compression, etc.
 * Instead, we apply the heuristic described in the block comment for
 * zfs_metaslab_condense_block_threshold - we only condense if the space used
 * is greater than a threshold number of blocks.
 */
static boolean_t
metaslab_should_condense(metaslab_t *msp)
{
	space_map_t *sm = msp->ms_sm;
	range_seg_t *rs;
	uint64_t size, entries, segsz, object_size, optimal_size, record_size;
	dmu_object_info_t doi;
	uint64_t vdev_blocksize = 1ULL << msp->ms_group->mg_vd->vdev_ashift;

	ASSERT(MUTEX_HELD(&msp->ms_lock));
	ASSERT(msp->ms_loaded);

	/*
	 * Use the ms_size_tree range tree, which is ordered by size, to
	 * obtain the largest segment in the free tree. We always condense
	 * metaslabs that are empty and metaslabs for which a condense
	 * request has been made.
	 */
	rs = avl_last(&msp->ms_size_tree);
	if (rs == NULL || msp->ms_condense_wanted)
		return (B_TRUE);

	/*
	 * Calculate the number of 64-bit entries this segment would
	 * require when written to disk. If this single segment would be
	 * larger on-disk than the entire current on-disk structure, then
	 * clearly condensing will increase the on-disk structure size.
	 */
	size = (rs->rs_end - rs->rs_start) >> sm->sm_shift;
	entries = size / (MIN(size, SM_RUN_MAX));
	segsz = entries * sizeof (uint64_t);

	optimal_size = sizeof (uint64_t) * avl_numnodes(&msp->ms_tree->rt_root);
	object_size = space_map_length(msp->ms_sm);

	dmu_object_info_from_db(sm->sm_dbuf, &doi);
	record_size = MAX(doi.doi_data_block_size, vdev_blocksize);

	return (segsz <= object_size &&
	    object_size >= (optimal_size * zfs_condense_pct / 100) &&
	    object_size > zfs_metaslab_condense_block_threshold *
	    record_size);
}

/*
 * Condense the on-disk space map representation to its minimized form.
 * The minimized form consists of a small number of allocations followed by
 * the entries of the free range tree.
 */
static void
metaslab_condense(metaslab_t *msp, uint64_t txg, dmu_tx_t *tx)
{
	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
	range_tree_t *condense_tree;
	space_map_t *sm = msp->ms_sm;

	ASSERT(MUTEX_HELD(&msp->ms_lock));
	ASSERT3U(spa_sync_pass(spa), ==, 1);
	ASSERT(msp->ms_loaded);

	spa_dbgmsg(spa, "condensing: txg %llu, msp[%llu] %p, vdev id %llu, "
	    "spa %s, smp size %llu, segments %lu, forcing condense=%s", txg,
	    msp->ms_id, msp, msp->ms_group->mg_vd->vdev_id,
	    msp->ms_group->mg_vd->vdev_spa->spa_name,
	    space_map_length(msp->ms_sm), avl_numnodes(&msp->ms_tree->rt_root),
	    msp->ms_condense_wanted ? "TRUE" : "FALSE");

	msp->ms_condense_wanted = B_FALSE;

	/*
	 * Create a range tree that is 100% allocated. We remove segments
	 * that have been freed in this txg, any deferred frees that exist,
	 * and any allocation in the future. Removing segments should be
	 * a relatively inexpensive operation since we expect these trees to
	 * have a small number of nodes.
	 */
	condense_tree = range_tree_create(NULL, NULL);
	range_tree_add(condense_tree, msp->ms_start, msp->ms_size);

	/*
	 * Remove what's been freed in this txg from the condense_tree.
	 * Since we're in sync_pass 1, we know that all the frees from
	 * this txg are in the freeingtree.
	 */
	range_tree_walk(msp->ms_freeingtree, range_tree_remove, condense_tree);

	for (int t = 0; t < TXG_DEFER_SIZE; t++) {
		range_tree_walk(msp->ms_defertree[t],
		    range_tree_remove, condense_tree);
	}

	for (int t = 1; t < TXG_CONCURRENT_STATES; t++) {
		range_tree_walk(msp->ms_alloctree[(txg + t) & TXG_MASK],
		    range_tree_remove, condense_tree);
	}

	/*
	 * We're about to drop the metaslab's lock thus allowing
	 * other consumers to change its content. Set the
	 * metaslab's ms_condensing flag to ensure that
	 * allocations on this metaslab do not occur while we're
	 * in the middle of committing it to disk. This is only critical
	 * for the ms_tree as all other range trees use per txg
	 * views of their content.
	 */
	msp->ms_condensing = B_TRUE;

	mutex_exit(&msp->ms_lock);
	space_map_truncate(sm, tx);

	/*
	 * While we would ideally like to create a space map representation
	 * that consists only of allocation records, doing so can be
	 * prohibitively expensive because the in-core free tree can be
	 * large, and therefore computationally expensive to subtract
	 * from the condense_tree. Instead we sync out two trees, a cheap
	 * allocation only tree followed by the in-core free tree. While not
	 * optimal, this is typically close to optimal, and much cheaper to
	 * compute.
	 */
	space_map_write(sm, condense_tree, SM_ALLOC, tx);
	range_tree_vacate(condense_tree, NULL, NULL);
	range_tree_destroy(condense_tree);

	space_map_write(sm, msp->ms_tree, SM_FREE, tx);
	mutex_enter(&msp->ms_lock);
	msp->ms_condensing = B_FALSE;
}
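
/*
 * After a condense, the on-disk object reads as a short run of ALLOC
 * records (the condense_tree) followed by one FREE record per segment of
 * the in-core free tree; replaying those two sets of entries reconstructs
 * the current allocated space without the long alloc/free history that
 * accumulated before the condense.
 */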

/*
 * Write a metaslab to disk in the context of the specified transaction group.
 */
void
metaslab_sync(metaslab_t *msp, uint64_t txg)
{
	metaslab_group_t *mg = msp->ms_group;
	vdev_t *vd = mg->mg_vd;
	spa_t *spa = vd->vdev_spa;
	objset_t *mos = spa_meta_objset(spa);
	range_tree_t *alloctree = msp->ms_alloctree[txg & TXG_MASK];
	dmu_tx_t *tx;
	uint64_t object = space_map_object(msp->ms_sm);

	ASSERT(!vd->vdev_ishole);

	/*
	 * This metaslab has just been added so there's no work to do now.
	 */
	if (msp->ms_freeingtree == NULL) {
		ASSERT3P(alloctree, ==, NULL);
		return;
	}

	ASSERT3P(alloctree, !=, NULL);
	ASSERT3P(msp->ms_freeingtree, !=, NULL);
	ASSERT3P(msp->ms_freedtree, !=, NULL);

	/*
	 * Normally, we don't want to process a metaslab if there
	 * are no allocations or frees to perform. However, if the metaslab
	 * is being forced to condense and it's loaded, we need to let it
	 * through.
	 */
	if (range_tree_space(alloctree) == 0 &&
	    range_tree_space(msp->ms_freeingtree) == 0 &&
	    !(msp->ms_loaded && msp->ms_condense_wanted))
		return;


	VERIFY(txg <= spa_final_dirty_txg(spa));

	/*
	 * The only state that can actually be changing concurrently with
	 * metaslab_sync() is the metaslab's ms_tree.  No other thread can
	 * be modifying this txg's alloctree, freeingtree, freedtree, or
	 * space_map_phys_t.  We drop ms_lock whenever we could call
	 * into the DMU, because the DMU can call down to us
	 * (e.g. via zio_free()) at any time.
	 *
	 * The spa_vdev_remove_thread() can be reading metaslab state
	 * concurrently, and it is locked out by the ms_sync_lock.  Note
	 * that the ms_lock is insufficient for this, because it is dropped
	 * by space_map_write().
	 */
	tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);

	if (msp->ms_sm == NULL) {
		uint64_t new_object;

		new_object = space_map_alloc(mos, tx);
		VERIFY3U(new_object, !=, 0);

		VERIFY0(space_map_open(&msp->ms_sm, mos, new_object,
		    msp->ms_start, msp->ms_size, vd->vdev_ashift));
		ASSERT(msp->ms_sm != NULL);
	}

	mutex_enter(&msp->ms_sync_lock);
	mutex_enter(&msp->ms_lock);

	/*
	 * Note: metaslab_condense() clears the space map's histogram.
	 * Therefore we must verify and remove this histogram before
	 * condensing.
	 */
	metaslab_group_histogram_verify(mg);
	metaslab_class_histogram_verify(mg->mg_class);
	metaslab_group_histogram_remove(mg, msp);

	if (msp->ms_loaded && spa_sync_pass(spa) == 1 &&
	    metaslab_should_condense(msp)) {
		metaslab_condense(msp, txg, tx);
	} else {
		mutex_exit(&msp->ms_lock);
		space_map_write(msp->ms_sm, alloctree, SM_ALLOC, tx);
		space_map_write(msp->ms_sm, msp->ms_freeingtree, SM_FREE, tx);
		mutex_enter(&msp->ms_lock);
	}

	if (msp->ms_loaded) {
		/*
		 * When the space map is loaded, we have an accurate
		 * histogram in the range tree. This gives us an opportunity
		 * to bring the space map's histogram up-to-date so we clear
		 * it first before updating it.
		 */
		space_map_histogram_clear(msp->ms_sm);
		space_map_histogram_add(msp->ms_sm, msp->ms_tree, tx);

		/*
		 * Since we've cleared the histogram we need to add back
		 * any free space that has already been processed, plus
		 * any deferred space. This allows the on-disk histogram
		 * to accurately reflect all free space even if some space
		 * is not yet available for allocation (i.e. deferred).
		 */
		space_map_histogram_add(msp->ms_sm, msp->ms_freedtree, tx);

		/*
		 * Add back any deferred free space that has not been
		 * added back into the in-core free tree yet. This will
		 * ensure that we don't end up with a space map histogram
		 * that is completely empty unless the metaslab is fully
		 * allocated.
		 */
		for (int t = 0; t < TXG_DEFER_SIZE; t++) {
			space_map_histogram_add(msp->ms_sm,
			    msp->ms_defertree[t], tx);
		}
	}

	/*
	 * Always add the free space from this sync pass to the space
	 * map histogram. We want to make sure that the on-disk histogram
	 * accounts for all free space. If the space map is not loaded,
	 * then we will lose some accuracy but will correct it the next
	 * time we load the space map.
	 */
	space_map_histogram_add(msp->ms_sm, msp->ms_freeingtree, tx);

	metaslab_group_histogram_add(mg, msp);
	metaslab_group_histogram_verify(mg);
	metaslab_class_histogram_verify(mg->mg_class);

	/*
	 * For sync pass 1, we avoid traversing this txg's free range tree
	 * and instead will just swap the pointers for freeingtree and
	 * freedtree. We can safely do this since the freed_tree is
	 * guaranteed to be empty on the initial pass.
	 */
	if (spa_sync_pass(spa) == 1) {
		range_tree_swap(&msp->ms_freeingtree, &msp->ms_freedtree);
	} else {
		range_tree_vacate(msp->ms_freeingtree,
		    range_tree_add, msp->ms_freedtree);
	}
	range_tree_vacate(alloctree, NULL, NULL);

	ASSERT0(range_tree_space(msp->ms_alloctree[txg & TXG_MASK]));
	ASSERT0(range_tree_space(msp->ms_alloctree[TXG_CLEAN(txg) & TXG_MASK]));
	ASSERT0(range_tree_space(msp->ms_freeingtree));

	mutex_exit(&msp->ms_lock);

	if (object != space_map_object(msp->ms_sm)) {
		object = space_map_object(msp->ms_sm);
		dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) *
		    msp->ms_id, sizeof (uint64_t), &object, tx);
	}
	mutex_exit(&msp->ms_sync_lock);
	dmu_tx_commit(tx);
}
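
/*
 * Note: after metaslab_sync() the on-disk histogram accounts for all free
 * space the metaslab knows about: the in-core free tree (when loaded), the
 * already-processed freed tree, every deferred-free tree, and the frees
 * from this sync pass. Deferred space is counted even though it is not yet
 * allocatable, which keeps the histogram of a mostly-free metaslab from
 * appearing empty.
 */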

/*
 * Called after a transaction group has completely synced to mark
 * all of the metaslab's free space as usable.
 */
void
metaslab_sync_done(metaslab_t *msp, uint64_t txg)
{
	metaslab_group_t *mg = msp->ms_group;
	vdev_t *vd = mg->mg_vd;
	spa_t *spa = vd->vdev_spa;
	range_tree_t **defer_tree;
	int64_t alloc_delta, defer_delta;
	boolean_t defer_allowed = B_TRUE;

	ASSERT(!vd->vdev_ishole);

	mutex_enter(&msp->ms_lock);

	/*
	 * If this metaslab is just becoming available, initialize its
	 * range trees and add its capacity to the vdev.
	 */
	if (msp->ms_freedtree == NULL) {
		for (int t = 0; t < TXG_SIZE; t++) {
			ASSERT(msp->ms_alloctree[t] == NULL);

			msp->ms_alloctree[t] = range_tree_create(NULL, NULL);
		}

		ASSERT3P(msp->ms_freeingtree, ==, NULL);
		msp->ms_freeingtree = range_tree_create(NULL, NULL);

		ASSERT3P(msp->ms_freedtree, ==, NULL);
		msp->ms_freedtree = range_tree_create(NULL, NULL);

		for (int t = 0; t < TXG_DEFER_SIZE; t++) {
			ASSERT(msp->ms_defertree[t] == NULL);

			msp->ms_defertree[t] = range_tree_create(NULL, NULL);
		}

		vdev_space_update(vd, 0, 0, msp->ms_size);
	}

	defer_tree = &msp->ms_defertree[txg % TXG_DEFER_SIZE];

	uint64_t free_space = metaslab_class_get_space(spa_normal_class(spa)) -
	    metaslab_class_get_alloc(spa_normal_class(spa));
	if (free_space <= spa_get_slop_space(spa) || vd->vdev_removing) {
		defer_allowed = B_FALSE;
	}

	defer_delta = 0;
	alloc_delta = space_map_alloc_delta(msp->ms_sm);
	if (defer_allowed) {
		defer_delta = range_tree_space(msp->ms_freedtree) -
		    range_tree_space(*defer_tree);
	} else {
		defer_delta -= range_tree_space(*defer_tree);
	}

	vdev_space_update(vd, alloc_delta + defer_delta, defer_delta, 0);

	/*
	 * If there's a metaslab_load() in progress, wait for it to complete
	 * so that we have a consistent view of the in-core space map.
	 */
	metaslab_load_wait(msp);

	/*
	 * Move the frees from the defer_tree back to the free
	 * range tree (if it's loaded). Swap the freed_tree and the
	 * defer_tree -- this is safe to do because we've just emptied out
	 * the defer_tree.
	 */
	range_tree_vacate(*defer_tree,
	    msp->ms_loaded ? range_tree_add : NULL, msp->ms_tree);
	if (defer_allowed) {
		range_tree_swap(&msp->ms_freedtree, defer_tree);
	} else {
		range_tree_vacate(msp->ms_freedtree,
		    msp->ms_loaded ? range_tree_add : NULL, msp->ms_tree);
	}

	space_map_update(msp->ms_sm);

	msp->ms_deferspace += defer_delta;
	ASSERT3S(msp->ms_deferspace, >=, 0);
	ASSERT3S(msp->ms_deferspace, <=, msp->ms_size);
	if (msp->ms_deferspace != 0) {
		/*
		 * Keep syncing this metaslab until all deferred frees
		 * are back in circulation.
		 */
		vdev_dirty(vd, VDD_METASLAB, msp, txg + 1);
	}

	/*
	 * Calculate the new weights before unloading any metaslabs.
	 * This will give us the most accurate weighting.
	 */
	metaslab_group_sort(mg, msp, metaslab_weight(msp));

	/*
	 * If the metaslab is loaded and we've not tried to load or allocate
	 * from it in 'metaslab_unload_delay' txgs, then unload it.
	 */
	if (msp->ms_loaded &&
	    msp->ms_selected_txg + metaslab_unload_delay < txg) {

		for (int t = 1; t < TXG_CONCURRENT_STATES; t++) {
			VERIFY0(range_tree_space(
			    msp->ms_alloctree[(txg + t) & TXG_MASK]));
		}

		if (!metaslab_debug_unload)
			metaslab_unload(msp);
	}

	ASSERT0(range_tree_space(msp->ms_alloctree[txg & TXG_MASK]));
	ASSERT0(range_tree_space(msp->ms_freeingtree));
	ASSERT0(range_tree_space(msp->ms_freedtree));

	mutex_exit(&msp->ms_lock);
}

void
metaslab_sync_reassess(metaslab_group_t *mg)
{
	spa_t *spa = mg->mg_class->mc_spa;

	spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
	metaslab_group_alloc_update(mg);
	mg->mg_fragmentation = metaslab_group_fragmentation(mg);

	/*
	 * Preload the next potential metaslabs but only on active
	 * metaslab groups. We can get into a state where the metaslab
	 * is no longer active since we dirty metaslabs as we remove a
	 * device, thus potentially making the metaslab group eligible
	 * for preloading.
	 */
	if (mg->mg_activation_count > 0) {
		metaslab_group_preload(mg);
	}
	spa_config_exit(spa, SCL_ALLOC, FTAG);
}

static uint64_t
metaslab_distance(metaslab_t *msp, dva_t *dva)
{
	uint64_t ms_shift = msp->ms_group->mg_vd->vdev_ms_shift;
	uint64_t offset = DVA_GET_OFFSET(dva) >> ms_shift;
	uint64_t start = msp->ms_id;

	if (msp->ms_group->mg_vd->vdev_id != DVA_GET_VDEV(dva))
		return (1ULL << 63);

	if (offset < start)
		return ((start - offset) << ms_shift);
	if (offset > start)
		return ((offset - start) << ms_shift);
	return (0);
}
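
/*
 * For illustration: if the DVA already lives on this vdev in metaslab 7
 * and msp is metaslab 10, the distance is (10 - 7) << ms_shift, i.e. three
 * metaslab widths; a DVA on a different vdev reports 2^63, effectively
 * infinity, so it never looks "too close".
 */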

/*
 * ==========================================================================
 * Metaslab allocation tracing facility
 * ==========================================================================
 */
#ifdef _METASLAB_TRACING
kstat_t *metaslab_trace_ksp;
kstat_named_t metaslab_trace_over_limit;

void
metaslab_alloc_trace_init(void)
{
	ASSERT(metaslab_alloc_trace_cache == NULL);
	metaslab_alloc_trace_cache = kmem_cache_create(
	    "metaslab_alloc_trace_cache", sizeof (metaslab_alloc_trace_t),
	    0, NULL, NULL, NULL, NULL, NULL, 0);
	metaslab_trace_ksp = kstat_create("zfs", 0, "metaslab_trace_stats",
	    "misc", KSTAT_TYPE_NAMED, 1, KSTAT_FLAG_VIRTUAL);
	if (metaslab_trace_ksp != NULL) {
		metaslab_trace_ksp->ks_data = &metaslab_trace_over_limit;
		kstat_named_init(&metaslab_trace_over_limit,
		    "metaslab_trace_over_limit", KSTAT_DATA_UINT64);
		kstat_install(metaslab_trace_ksp);
	}
}

void
metaslab_alloc_trace_fini(void)
{
	if (metaslab_trace_ksp != NULL) {
		kstat_delete(metaslab_trace_ksp);
		metaslab_trace_ksp = NULL;
	}
	kmem_cache_destroy(metaslab_alloc_trace_cache);
	metaslab_alloc_trace_cache = NULL;
}

/*
 * Add an allocation trace element to the allocation tracing list.
 */
static void
metaslab_trace_add(zio_alloc_list_t *zal, metaslab_group_t *mg,
    metaslab_t *msp, uint64_t psize, uint32_t dva_id, uint64_t offset)
{
	metaslab_alloc_trace_t *mat;

	if (!metaslab_trace_enabled)
		return;

	/*
	 * When the tracing list reaches its maximum we remove
	 * the second element in the list before adding a new one.
	 * By removing the second element we preserve the original
	 * entry as a clue to what allocations steps have already been
	 * performed.
	 */
	if (zal->zal_size == metaslab_trace_max_entries) {
		metaslab_alloc_trace_t *mat_next;
#ifdef DEBUG
		panic("too many entries in allocation list");
#endif
		atomic_inc_64(&metaslab_trace_over_limit.value.ui64);
		zal->zal_size--;
		mat_next = list_next(&zal->zal_list, list_head(&zal->zal_list));
		list_remove(&zal->zal_list, mat_next);
		kmem_cache_free(metaslab_alloc_trace_cache, mat_next);
	}

	mat = kmem_cache_alloc(metaslab_alloc_trace_cache, KM_SLEEP);
	list_link_init(&mat->mat_list_node);
	mat->mat_mg = mg;
	mat->mat_msp = msp;
	mat->mat_size = psize;
	mat->mat_dva_id = dva_id;
	mat->mat_offset = offset;
	mat->mat_weight = 0;

	if (msp != NULL)
		mat->mat_weight = msp->ms_weight;

	/*
	 * The list is part of the zio so locking is not required. Only
	 * a single thread will perform allocations for a given zio.
	 */
	list_insert_tail(&zal->zal_list, mat);
	zal->zal_size++;

	ASSERT3U(zal->zal_size, <=, metaslab_trace_max_entries);
}

void
metaslab_trace_init(zio_alloc_list_t *zal)
{
	list_create(&zal->zal_list, sizeof (metaslab_alloc_trace_t),
	    offsetof(metaslab_alloc_trace_t, mat_list_node));
	zal->zal_size = 0;
}

void
metaslab_trace_fini(zio_alloc_list_t *zal)
{
	metaslab_alloc_trace_t *mat;

	while ((mat = list_remove_head(&zal->zal_list)) != NULL)
		kmem_cache_free(metaslab_alloc_trace_cache, mat);
	list_destroy(&zal->zal_list);
}

#else

#define	metaslab_trace_add(zal, mg, msp, psize, id, off)

void
metaslab_alloc_trace_init(void)
{
}

void
metaslab_alloc_trace_fini(void)
{
}

void
metaslab_trace_init(zio_alloc_list_t *zal)
{
}

void
metaslab_trace_fini(zio_alloc_list_t *zal)
{
}

#endif /* _METASLAB_TRACING */

/*
 * ==========================================================================
 * Metaslab block operations
 * ==========================================================================
 */
static void
metaslab_group_alloc_increment(spa_t *spa, uint64_t vdev, void *tag, int flags)
{
	if (!(flags & METASLAB_ASYNC_ALLOC) ||
	    flags & METASLAB_DONT_THROTTLE)
		return;

	metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
	if (!mg->mg_class->mc_alloc_throttle_enabled)
		return;

	(void) refcount_add(&mg->mg_alloc_queue_depth, tag);
}

void
metaslab_group_alloc_decrement(spa_t *spa, uint64_t vdev, void *tag, int flags)
{
	if (!(flags & METASLAB_ASYNC_ALLOC) ||
	    flags & METASLAB_DONT_THROTTLE)
		return;

	metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
	if (!mg->mg_class->mc_alloc_throttle_enabled)
		return;

	(void) refcount_remove(&mg->mg_alloc_queue_depth, tag);
}

void
metaslab_group_alloc_verify(spa_t *spa, const blkptr_t *bp, void *tag)
{
#ifdef ZFS_DEBUG
	const dva_t *dva = bp->blk_dva;
	int ndvas = BP_GET_NDVAS(bp);

	for (int d = 0; d < ndvas; d++) {
		uint64_t vdev = DVA_GET_VDEV(&dva[d]);
		metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
		VERIFY(refcount_not_held(&mg->mg_alloc_queue_depth, tag));
	}
#endif
}

static uint64_t
metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg)
{
	uint64_t start;
	range_tree_t *rt = msp->ms_tree;
	metaslab_class_t *mc = msp->ms_group->mg_class;

	VERIFY(!msp->ms_condensing);

	start = mc->mc_ops->msop_alloc(msp, size);
	if (start != -1ULL) {
		metaslab_group_t *mg = msp->ms_group;
		vdev_t *vd = mg->mg_vd;

		VERIFY0(P2PHASE(start, 1ULL << vd->vdev_ashift));
		VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
		VERIFY3U(range_tree_space(rt) - size, <=, msp->ms_size);
		range_tree_remove(rt, start, size);

		if (range_tree_space(msp->ms_alloctree[txg & TXG_MASK]) == 0)
			vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg);

		range_tree_add(msp->ms_alloctree[txg & TXG_MASK], start, size);

		/* Track the last successful allocation */
		msp->ms_alloc_txg = txg;
		metaslab_verify_space(msp, txg);
	}

	/*
	 * Now that we've attempted the allocation we need to update the
	 * metaslab's maximum block size since it may have changed.
	 */
	msp->ms_max_size = metaslab_block_maxsize(msp);
	return (start);
}

static uint64_t
metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal,
    uint64_t asize, uint64_t txg, uint64_t min_distance, dva_t *dva, int d)
{
	metaslab_t *msp = NULL;
	uint64_t offset = -1ULL;
	uint64_t activation_weight;
	uint64_t target_distance;
	int i;

	activation_weight = METASLAB_WEIGHT_PRIMARY;
	for (i = 0; i < d; i++) {
		if (DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) {
			activation_weight = METASLAB_WEIGHT_SECONDARY;
			break;
		}
	}

	metaslab_t *search = kmem_alloc(sizeof (*search), KM_SLEEP);
	search->ms_weight = UINT64_MAX;
	search->ms_start = 0;
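
	/*
	 * Note on the pattern below: 'search' is a dummy metaslab used
	 * purely as an AVL lookup key. The group's tree is sorted by
	 * weight (with ms_start as tie-breaker), so (UINT64_MAX, 0)
	 * places the cursor ahead of the heaviest metaslab, and after
	 * each failed attempt the key is reset to (ms_weight,
	 * ms_start + 1) so the scan resumes strictly after the metaslab
	 * just tried.
	 */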
	for (;;) {
		boolean_t was_active;
		avl_tree_t *t = &mg->mg_metaslab_tree;
		avl_index_t idx;

		mutex_enter(&mg->mg_lock);

		/*
		 * Find the metaslab with the highest weight that is less
		 * than what we've already tried. In the common case, this
		 * means that we will examine each metaslab at most once.
		 * Note that concurrent callers could reorder metaslabs
		 * by activation/passivation once we have dropped the mg_lock.
		 * If a metaslab is activated by another thread, and we fail
		 * to allocate from the metaslab we have selected, we may
		 * not try the newly-activated metaslab, and instead activate
		 * another metaslab. This is not optimal, but generally
		 * does not cause any problems (a possible exception being
		 * if every metaslab is completely full except for the
		 * newly-activated metaslab which we fail to examine).
		 */
		msp = avl_find(t, search, &idx);
		if (msp == NULL)
			msp = avl_nearest(t, idx, AVL_AFTER);
		for (; msp != NULL; msp = AVL_NEXT(t, msp)) {

			if (!metaslab_should_allocate(msp, asize)) {
				metaslab_trace_add(zal, mg, msp, asize, d,
				    TRACE_TOO_SMALL);
				continue;
			}

			/*
			 * If the selected metaslab is condensing, skip it.
			 */
			if (msp->ms_condensing)
				continue;

			was_active = msp->ms_weight & METASLAB_ACTIVE_MASK;
			if (activation_weight == METASLAB_WEIGHT_PRIMARY)
				break;

			target_distance = min_distance +
			    (space_map_allocated(msp->ms_sm) != 0 ? 0 :
			    min_distance >> 1);

			for (i = 0; i < d; i++) {
				if (metaslab_distance(msp, &dva[i]) <
				    target_distance)
					break;
			}
			if (i == d)
				break;
		}
		mutex_exit(&mg->mg_lock);
		if (msp == NULL) {
			kmem_free(search, sizeof (*search));
			return (-1ULL);
		}
		search->ms_weight = msp->ms_weight;
		search->ms_start = msp->ms_start + 1;

		mutex_enter(&msp->ms_lock);

		/*
		 * Ensure that the metaslab we have selected is still
		 * capable of handling our request. It's possible that
		 * another thread may have changed the weight while we
		 * were blocked on the metaslab lock. We check the
		 * active status first to see if we need to reselect
		 * a new metaslab.
		 */
		if (was_active && !(msp->ms_weight & METASLAB_ACTIVE_MASK)) {
			mutex_exit(&msp->ms_lock);
			continue;
		}

		if ((msp->ms_weight & METASLAB_WEIGHT_SECONDARY) &&
		    activation_weight == METASLAB_WEIGHT_PRIMARY) {
			metaslab_passivate(msp,
			    msp->ms_weight & ~METASLAB_ACTIVE_MASK);
			mutex_exit(&msp->ms_lock);
			continue;
		}

		if (metaslab_activate(msp, activation_weight) != 0) {
			mutex_exit(&msp->ms_lock);
			continue;
		}
		msp->ms_selected_txg = txg;

		/*
		 * Now that we have the lock, recheck to see if we should
		 * continue to use this metaslab for this allocation. The
		 * metaslab is now loaded so metaslab_should_allocate() can
		 * accurately determine if the allocation attempt should
		 * proceed.
		 */
		if (!metaslab_should_allocate(msp, asize)) {
			/* Passivate this metaslab and select a new one. */
			metaslab_trace_add(zal, mg, msp, asize, d,
			    TRACE_TOO_SMALL);
			goto next;
		}

		/*
		 * If this metaslab is currently condensing then pick again as
		 * we can't manipulate this metaslab until it's committed
		 * to disk.
		 */
		if (msp->ms_condensing) {
			metaslab_trace_add(zal, mg, msp, asize, d,
			    TRACE_CONDENSING);
			mutex_exit(&msp->ms_lock);
			continue;
		}

		offset = metaslab_block_alloc(msp, asize, txg);
		metaslab_trace_add(zal, mg, msp, asize, d, offset);

		if (offset != -1ULL) {
			/* Proactively passivate the metaslab, if needed */
			metaslab_segment_may_passivate(msp);
			break;
		}
next:
		ASSERT(msp->ms_loaded);

		/*
		 * We were unable to allocate from this metaslab so determine
		 * a new weight for this metaslab. Now that we have loaded
		 * the metaslab we can provide a better hint to the metaslab
		 * selector.
		 *
		 * For space-based metaslabs, we use the maximum block size.
		 * This information is only available when the metaslab
		 * is loaded and is more accurate than the generic free
		 * space weight that was calculated by metaslab_weight().
		 * This information allows us to quickly compare the maximum
		 * available allocation in the metaslab to the allocation
		 * size being requested.
		 *
		 * For segment-based metaslabs, determine the new weight
		 * based on the highest bucket in the range tree. We
		 * explicitly use the loaded segment weight (i.e. the range
		 * tree histogram) since it contains the space that is
		 * currently available for allocation and is accurate
		 * even within a sync pass.
		 */
		if (WEIGHT_IS_SPACEBASED(msp->ms_weight)) {
			uint64_t weight = metaslab_block_maxsize(msp);
			WEIGHT_SET_SPACEBASED(weight);
			metaslab_passivate(msp, weight);
		} else {
			metaslab_passivate(msp,
			    metaslab_weight_from_range_tree(msp));
		}

		/*
		 * We have just failed an allocation attempt, check
		 * that metaslab_should_allocate() agrees. Otherwise,
		 * we may end up in an infinite loop retrying the same
		 * metaslab.
		 */
		ASSERT(!metaslab_should_allocate(msp, asize));
		mutex_exit(&msp->ms_lock);
	}
	mutex_exit(&msp->ms_lock);
	kmem_free(search, sizeof (*search));

	return (offset);
}

static uint64_t
metaslab_group_alloc(metaslab_group_t *mg, zio_alloc_list_t *zal,
    uint64_t asize, uint64_t txg, uint64_t min_distance, dva_t *dva, int d)
{
	uint64_t offset;
	ASSERT(mg->mg_initialized);

	offset = metaslab_group_alloc_normal(mg, zal, asize, txg,
	    min_distance, dva, d);

	mutex_enter(&mg->mg_lock);
	if (offset == -1ULL) {
		mg->mg_failed_allocations++;
		metaslab_trace_add(zal, mg, NULL, asize, d,
		    TRACE_GROUP_FAILURE);
		if (asize == SPA_GANGBLOCKSIZE) {
			/*
			 * This metaslab group was unable to allocate
			 * the minimum gang block size so it must be out of
			 * space. We must notify the allocation throttle
			 * to start skipping allocation attempts to this
			 * metaslab group until more space becomes available.
			 * Note: this failure cannot be caused by the
			 * allocation throttle since the allocation throttle
			 * is only responsible for skipping devices and
			 * not failing block allocations.
			 */
			mg->mg_no_free_space = B_TRUE;
		}
	}
	mg->mg_allocations++;
	mutex_exit(&mg->mg_lock);
	return (offset);
}

/*
 * If we have to write a ditto block (i.e. more than one DVA for a given BP)
 * on the same vdev as an existing DVA of this BP, then try to allocate it
 * at least (vdev_asize / (2 ^ ditto_same_vdev_distance_shift)) away from the
 * existing DVAs.
 */
int ditto_same_vdev_distance_shift = 3;
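
/*
 * For illustration: with the default shift of 3, a ditto copy forced onto
 * the same 1 TB top-level vdev is kept at least 128 GB away from the
 * existing DVA, so a localized media failure is unlikely to hit both
 * copies.
 */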

/*
 * Allocate a block for the specified i/o.
 */
static int
metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
    dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags,
    zio_alloc_list_t *zal)
{
	metaslab_group_t *mg, *fast_mg, *rotor;
	vdev_t *vd;
	boolean_t try_hard = B_FALSE;

	ASSERT(!DVA_IS_VALID(&dva[d]));

	/*
	 * For testing, make some blocks above a certain size be gang blocks.
	 */
	if (psize >= metaslab_gang_bang && (ddi_get_lbolt() & 3) == 0) {
		metaslab_trace_add(zal, NULL, NULL, psize, d, TRACE_FORCE_GANG);
		return (SET_ERROR(ENOSPC));
	}

	/*
	 * Start at the rotor and loop through all mgs until we find something.
	 * Note that there's no locking on mc_rotor or mc_aliquot because
	 * nothing actually breaks if we miss a few updates -- we just won't
	 * allocate quite as evenly.  It all balances out over time.
	 *
	 * If we are doing ditto or log blocks, try to spread them across
	 * consecutive vdevs.  If we're forced to reuse a vdev before we've
	 * allocated all of our ditto blocks, then try and spread them out on
	 * that vdev as much as possible.  If it turns out to not be possible,
	 * gradually lower our standards until anything becomes acceptable.
	 * Also, allocating on consecutive vdevs (as opposed to random vdevs)
	 * gives us hope of containing our fault domains to something we're
	 * able to reason about.  Otherwise, any two top-level vdev failures
	 * will guarantee the loss of data.  With consecutive allocation,
	 * only two adjacent top-level vdev failures will result in data loss.
	 *
	 * If we are doing gang blocks (hintdva is non-NULL), try to keep
	 * ourselves on the same vdev as our gang block header.  That
	 * way, we can hope for locality in vdev_cache, plus it makes our
	 * fault domains something tractable.
	 */
	if (hintdva) {
		vd = vdev_lookup_top(spa, DVA_GET_VDEV(&hintdva[d]));

		/*
		 * It's possible the vdev we're using as the hint no
		 * longer exists or its mg has been closed (e.g. by
		 * device removal).  Consult the rotor when
		 * all else fails.
		 */
		if (vd != NULL && vd->vdev_mg != NULL) {
			mg = vd->vdev_mg;

			if (flags & METASLAB_HINTBP_AVOID &&
			    mg->mg_next != NULL)
				mg = mg->mg_next;
		} else {
			mg = mc->mc_rotor;
		}
	} else if (d != 0) {
		vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1]));
		mg = vd->vdev_mg->mg_next;
	} else if (flags & METASLAB_FASTWRITE) {
		mg = fast_mg = mc->mc_rotor;

		do {
			if (fast_mg->mg_vd->vdev_pending_fastwrite <
			    mg->mg_vd->vdev_pending_fastwrite)
				mg = fast_mg;
		} while ((fast_mg = fast_mg->mg_next) != mc->mc_rotor);

	} else {
		mg = mc->mc_rotor;
	}

	/*
	 * If the hint put us into the wrong metaslab class, or into a
	 * metaslab group that has been passivated, just follow the rotor.
	 */
	if (mg->mg_class != mc || mg->mg_activation_count <= 0)
		mg = mc->mc_rotor;

	rotor = mg;
top:
	do {
		boolean_t allocatable;

		ASSERT(mg->mg_activation_count == 1);
		vd = mg->mg_vd;

		/*
		 * Don't allocate from faulted devices.
		 */
		if (try_hard) {
			spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER);
			allocatable = vdev_allocatable(vd);
			spa_config_exit(spa, SCL_ZIO, FTAG);
		} else {
			allocatable = vdev_allocatable(vd);
		}

		/*
		 * Determine if the selected metaslab group is eligible
		 * for allocations. If we're ganging then don't allow
		 * this metaslab group to skip allocations since that would
		 * inadvertently return ENOSPC and suspend the pool
		 * even though space is still available.
		 */
		if (allocatable && !GANG_ALLOCATION(flags) && !try_hard) {
			allocatable = metaslab_group_allocatable(mg, rotor,
			    psize);
		}

		if (!allocatable) {
			metaslab_trace_add(zal, mg, NULL, psize, d,
			    TRACE_NOT_ALLOCATABLE);
			goto next;
		}

		ASSERT(mg->mg_initialized);

		/*
		 * Avoid writing single-copy data to a failing,
		 * non-redundant vdev, unless we've already tried all
		 * other vdevs.
		 */
		if ((vd->vdev_stat.vs_write_errors > 0 ||
		    vd->vdev_state < VDEV_STATE_HEALTHY) &&
		    d == 0 && !try_hard && vd->vdev_children == 0) {
			metaslab_trace_add(zal, mg, NULL, psize, d,
			    TRACE_VDEV_ERROR);
			goto next;
		}

		ASSERT(mg->mg_class == mc);

		/*
		 * If we don't need to try hard, then require that the
		 * block be 1/8th of the device away from any other DVAs
		 * in this BP.  If we are trying hard, allow any offset
		 * to be used (distance=0).
		 */
		uint64_t distance = 0;
		if (!try_hard) {
			distance = vd->vdev_asize >>
			    ditto_same_vdev_distance_shift;
			if (distance <= (1ULL << vd->vdev_ms_shift))
				distance = 0;
		}

		uint64_t asize = vdev_psize_to_asize(vd, psize);
		ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0);

		uint64_t offset = metaslab_group_alloc(mg, zal, asize, txg,
		    distance, dva, d);

		if (offset != -1ULL) {
			/*
			 * If we've just selected this metaslab group,
			 * figure out whether the corresponding vdev is
			 * over- or under-used relative to the pool,
			 * and set an allocation bias to even it out.
			 *
			 * Bias is also used to compensate for unequally
			 * sized vdevs so that space is allocated fairly.
			 */
			if (mc->mc_aliquot == 0 && metaslab_bias_enabled) {
				vdev_stat_t *vs = &vd->vdev_stat;
				int64_t vs_free = vs->vs_space - vs->vs_alloc;
				int64_t mc_free = mc->mc_space - mc->mc_alloc;
				int64_t ratio;

				/*
				 * Calculate how much more or less we should
				 * try to allocate from this device during
				 * this iteration around the rotor.
				 *
				 * This basically introduces a zero-centered
				 * bias towards the devices with the most
				 * free space, while compensating for vdev
				 * size differences.
				 *
				 * Examples:
				 *  vdev V1 = 16M/128M
				 *  vdev V2 = 16M/128M
				 *  ratio(V1) = 100% ratio(V2) = 100%
				 *
				 *  vdev V1 = 16M/128M
				 *  vdev V2 = 64M/128M
				 *  ratio(V1) = 127% ratio(V2) = 72%
				 *
				 *  vdev V1 = 16M/128M
				 *  vdev V2 = 64M/512M
				 *  ratio(V1) = 40% ratio(V2) = 160%
				 */
				ratio = (vs_free * mc->mc_alloc_groups * 100) /
				    (mc_free + 1);
				mg->mg_bias = ((ratio - 100) *
				    (int64_t)mg->mg_aliquot) / 100;
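
				/*
				 * Equivalently, ratio compares this vdev's
				 * free space to the pool-wide average free
				 * space per metaslab group:
				 *   ratio = vs_free / (mc_free /
				 *       mc_alloc_groups) * 100
				 * and the bias shifts this group's share of
				 * the rotor by that excess or deficit.
				 */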
			} else if (!metaslab_bias_enabled) {
				mg->mg_bias = 0;
			}

			if ((flags & METASLAB_FASTWRITE) ||
			    atomic_add_64_nv(&mc->mc_aliquot, asize) >=
			    mg->mg_aliquot + mg->mg_bias) {
				mc->mc_rotor = mg->mg_next;
				mc->mc_aliquot = 0;
			}

			DVA_SET_VDEV(&dva[d], vd->vdev_id);
			DVA_SET_OFFSET(&dva[d], offset);
			DVA_SET_GANG(&dva[d],
			    ((flags & METASLAB_GANG_HEADER) ? 1 : 0));
			DVA_SET_ASIZE(&dva[d], asize);

			if (flags & METASLAB_FASTWRITE) {
				atomic_add_64(&vd->vdev_pending_fastwrite,
				    psize);
			}

			return (0);
		}
next:
		mc->mc_rotor = mg->mg_next;
		mc->mc_aliquot = 0;
	} while ((mg = mg->mg_next) != rotor);

	/*
	 * If we haven't tried hard, do so now.
	 */
	if (!try_hard) {
		try_hard = B_TRUE;
		goto top;
	}

	bzero(&dva[d], sizeof (dva_t));

	metaslab_trace_add(zal, rotor, NULL, psize, d, TRACE_ENOSPC);
	return (SET_ERROR(ENOSPC));
}

void
metaslab_free_concrete(vdev_t *vd, uint64_t offset, uint64_t asize,
    uint64_t txg)
{
	metaslab_t *msp;
	ASSERTV(spa_t *spa = vd->vdev_spa);

	ASSERT3U(txg, ==, spa->spa_syncing_txg);
	ASSERT(vdev_is_concrete(vd));
	ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);
	ASSERT3U(offset >> vd->vdev_ms_shift, <, vd->vdev_ms_count);

	msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];

	VERIFY(!msp->ms_condensing);
	VERIFY3U(offset, >=, msp->ms_start);
	VERIFY3U(offset + asize, <=, msp->ms_start + msp->ms_size);
	VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
	VERIFY0(P2PHASE(asize, 1ULL << vd->vdev_ashift));

	metaslab_check_free_impl(vd, offset, asize);
	mutex_enter(&msp->ms_lock);
	if (range_tree_space(msp->ms_freeingtree) == 0) {
		vdev_dirty(vd, VDD_METASLAB, msp, txg);
	}
	range_tree_add(msp->ms_freeingtree, offset, asize);
	mutex_exit(&msp->ms_lock);
}

/* ARGSUSED */
static void
metaslab_free_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset,
    uint64_t size, void *arg)
{
	uint64_t *txgp = arg;

	if (vd->vdev_ops->vdev_op_remap != NULL)
		vdev_indirect_mark_obsolete(vd, offset, size, *txgp);
	else
		metaslab_free_impl(vd, offset, size, *txgp);
}

static void
metaslab_free_impl(vdev_t *vd, uint64_t offset, uint64_t size,
    uint64_t txg)
{
	spa_t *spa = vd->vdev_spa;

	ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);

	if (txg > spa_freeze_txg(spa))
		return;

	if (spa->spa_vdev_removal != NULL &&
	    spa->spa_vdev_removal->svr_vdev_id == vd->vdev_id &&
	    vdev_is_concrete(vd)) {
		/*
		 * Note: we check if the vdev is concrete because when
		 * we complete the removal, we first change the vdev to be
		 * an indirect vdev (in open context), and then (in syncing
		 * context) clear spa_vdev_removal.
		 */
		free_from_removing_vdev(vd, offset, size, txg);
	} else if (vd->vdev_ops->vdev_op_remap != NULL) {
		vdev_indirect_mark_obsolete(vd, offset, size, txg);
		vd->vdev_ops->vdev_op_remap(vd, offset, size,
		    metaslab_free_impl_cb, &txg);
	} else {
		metaslab_free_concrete(vd, offset, size, txg);
	}
}

typedef struct remap_blkptr_cb_arg {
	blkptr_t *rbca_bp;
	spa_remap_cb_t rbca_cb;
	vdev_t *rbca_remap_vd;
	uint64_t rbca_remap_offset;
	void *rbca_cb_arg;
} remap_blkptr_cb_arg_t;

static void
remap_blkptr_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset,
    uint64_t size, void *arg)
{
	remap_blkptr_cb_arg_t *rbca = arg;
	blkptr_t *bp = rbca->rbca_bp;

	/* We can not remap split blocks. */
	if (size != DVA_GET_ASIZE(&bp->blk_dva[0]))
		return;
	ASSERT0(inner_offset);

	if (rbca->rbca_cb != NULL) {
		/*
		 * At this point we know that we are not handling split
		 * blocks and we invoke the callback on the previous
		 * vdev which must be indirect.
		 */
		ASSERT3P(rbca->rbca_remap_vd->vdev_ops, ==, &vdev_indirect_ops);

		rbca->rbca_cb(rbca->rbca_remap_vd->vdev_id,
		    rbca->rbca_remap_offset, size, rbca->rbca_cb_arg);

		/* set up remap_blkptr_cb_arg for the next call */
		rbca->rbca_remap_vd = vd;
		rbca->rbca_remap_offset = offset;
	}

	/*
	 * The phys birth time is that of dva[0].  This ensures that we know
	 * when each dva was written, so that resilver can determine which
	 * blocks need to be scrubbed (i.e. those written during the time
	 * the vdev was offline).  It also ensures that the key used in
	 * the ARC hash table is unique (i.e. dva[0] + phys_birth).  If
	 * we didn't change the phys_birth, a lookup in the ARC for a
	 * remapped BP could find the data that was previously stored at
	 * this vdev + offset.
	 */
	vdev_t *oldvd = vdev_lookup_top(vd->vdev_spa,
	    DVA_GET_VDEV(&bp->blk_dva[0]));
	vdev_indirect_births_t *vib = oldvd->vdev_indirect_births;
	bp->blk_phys_birth = vdev_indirect_births_physbirth(vib,
	    DVA_GET_OFFSET(&bp->blk_dva[0]), DVA_GET_ASIZE(&bp->blk_dva[0]));

	DVA_SET_VDEV(&bp->blk_dva[0], vd->vdev_id);
	DVA_SET_OFFSET(&bp->blk_dva[0], offset);
}

/*
 * If the block pointer contains any indirect DVAs, modify them to refer to
 * concrete DVAs.  Note that this will sometimes not be possible, leaving
 * the indirect DVA in place.  This happens if the indirect DVA spans multiple
 * segments in the mapping (i.e. it is a "split block").
 *
 * If the BP was remapped, calls the callback on the original dva (note the
 * callback can be called multiple times if the original indirect DVA refers
 * to another indirect DVA, etc).
 *
 * Returns TRUE if the BP was remapped.
 */
boolean_t
spa_remap_blkptr(spa_t *spa, blkptr_t *bp, spa_remap_cb_t callback, void *arg)
{
	remap_blkptr_cb_arg_t rbca;

	if (!zfs_remap_blkptr_enable)
		return (B_FALSE);

	if (!spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS))
		return (B_FALSE);

	/*
	 * Dedup BP's can not be remapped, because ddt_phys_select() depends
	 * on DVA[0] being the same in the BP as in the DDT (dedup table).
	 */
	if (BP_GET_DEDUP(bp))
		return (B_FALSE);

	/*
	 * Gang blocks can not be remapped, because
	 * zio_checksum_gang_verifier() depends on the DVA[0] that's in
	 * the BP used to read the gang block header (GBH) being the same
	 * as the DVA[0] that we allocated for the GBH.
	 */
	if (BP_IS_GANG(bp))
		return (B_FALSE);

	/*
	 * Embedded BP's have no DVA to remap.
	 */
	if (BP_GET_NDVAS(bp) < 1)
		return (B_FALSE);

	/*
	 * Note: we only remap dva[0].  If we remapped other dvas, we
	 * would no longer know what their phys birth txg is.
	 */
	dva_t *dva = &bp->blk_dva[0];

	uint64_t offset = DVA_GET_OFFSET(dva);
	uint64_t size = DVA_GET_ASIZE(dva);
	vdev_t *vd = vdev_lookup_top(spa, DVA_GET_VDEV(dva));

	if (vd->vdev_ops->vdev_op_remap == NULL)
		return (B_FALSE);

	rbca.rbca_bp = bp;
	rbca.rbca_cb = callback;
	rbca.rbca_remap_vd = vd;
	rbca.rbca_remap_offset = offset;
	rbca.rbca_cb_arg = arg;

	/*
	 * remap_blkptr_cb() will be called in order for each level of
	 * indirection, until a concrete vdev is reached or a split block is
	 * encountered. old_vd and old_offset are updated within the callback
	 * as we go from the one indirect vdev to the next one (either concrete
	 * or indirect again) in that order.
	 */
	vd->vdev_ops->vdev_op_remap(vd, offset, size, remap_blkptr_cb, &rbca);

	/* Check if the DVA wasn't remapped because it is a split block */
	if (DVA_GET_VDEV(&rbca.rbca_bp->blk_dva[0]) == vd->vdev_id)
		return (B_FALSE);

	return (B_TRUE);
}

/*
 * Undo the allocation of a DVA which happened in the given transaction group.
 */
void
metaslab_unalloc_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
{
	metaslab_t *msp;
	vdev_t *vd;
	uint64_t vdev = DVA_GET_VDEV(dva);
	uint64_t offset = DVA_GET_OFFSET(dva);
	uint64_t size = DVA_GET_ASIZE(dva);

	ASSERT(DVA_IS_VALID(dva));
	ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);

	if (txg > spa_freeze_txg(spa))
		return;

	if ((vd = vdev_lookup_top(spa, vdev)) == NULL || !DVA_IS_VALID(dva) ||
	    (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) {
		zfs_panic_recover("metaslab_free_dva(): bad DVA %llu:%llu:%llu",
		    (u_longlong_t)vdev, (u_longlong_t)offset,
		    (u_longlong_t)size);
		return;
	}

	ASSERT(!vd->vdev_removing);
	ASSERT(vdev_is_concrete(vd));
	ASSERT0(vd->vdev_indirect_config.vic_mapping_object);
	ASSERT3P(vd->vdev_indirect_mapping, ==, NULL);

	if (DVA_GET_GANG(dva))
		size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);

	msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];

	mutex_enter(&msp->ms_lock);
	range_tree_remove(msp->ms_alloctree[txg & TXG_MASK],
	    offset, size);

	VERIFY(!msp->ms_condensing);
	VERIFY3U(offset, >=, msp->ms_start);
	VERIFY3U(offset + size, <=, msp->ms_start + msp->ms_size);
	VERIFY3U(range_tree_space(msp->ms_tree) + size, <=,
	    msp->ms_size);
	VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
	VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
	range_tree_add(msp->ms_tree, offset, size);
	mutex_exit(&msp->ms_lock);
}

/*
 * Free the block represented by the given DVA, in the context of the
 * specified transaction group.
 */
void
metaslab_free_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
{
	uint64_t vdev = DVA_GET_VDEV(dva);
	uint64_t offset = DVA_GET_OFFSET(dva);
	uint64_t size = DVA_GET_ASIZE(dva);
	vdev_t *vd = vdev_lookup_top(spa, vdev);

	ASSERT(DVA_IS_VALID(dva));
	ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);

	if (DVA_GET_GANG(dva)) {
		size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
	}

	metaslab_free_impl(vd, offset, size, txg);
}

/*
 * Reserve some allocation slots. The reservation system must be called
 * before we call into the allocator. If there aren't any available slots
 * then the I/O will be throttled until an I/O completes and its slots are
 * freed up. The function returns true if it was successful in placing
 * the reservation.
 */
boolean_t
metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, zio_t *zio,
    int flags)
{
	uint64_t available_slots = 0;
	boolean_t slot_reserved = B_FALSE;

	ASSERT(mc->mc_alloc_throttle_enabled);
	mutex_enter(&mc->mc_lock);

	uint64_t reserved_slots = refcount_count(&mc->mc_alloc_slots);
	if (reserved_slots < mc->mc_alloc_max_slots)
		available_slots = mc->mc_alloc_max_slots - reserved_slots;

	if (slots <= available_slots || GANG_ALLOCATION(flags)) {
		/*
		 * We reserve the slots individually so that we can unreserve
		 * them individually when an I/O completes.
		 */
		for (int d = 0; d < slots; d++) {
			reserved_slots = refcount_add(&mc->mc_alloc_slots, zio);
		}
		zio->io_flags |= ZIO_FLAG_IO_ALLOCATING;
		slot_reserved = B_TRUE;
	}

	mutex_exit(&mc->mc_lock);
	return (slot_reserved);
}

void
metaslab_class_throttle_unreserve(metaslab_class_t *mc, int slots, zio_t *zio)
{
	ASSERT(mc->mc_alloc_throttle_enabled);
	mutex_enter(&mc->mc_lock);
	for (int d = 0; d < slots; d++) {
		(void) refcount_remove(&mc->mc_alloc_slots, zio);
	}
	mutex_exit(&mc->mc_lock);
}
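
/*
 * Typical usage (a sketch; the real caller is the zio pipeline): before
 * dispatching a throttled allocating write, reserve one slot per DVA copy,
 * e.g.
 *
 *	if (metaslab_class_throttle_reserve(mc, zp->zp_copies, zio, 0)) {
 *		// proceed with metaslab_alloc()
 *	}
 *
 * and release the same number of slots via
 * metaslab_class_throttle_unreserve() when the I/O completes; a failed
 * reservation delays the I/O until in-flight allocations return slots.
 */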

static int
metaslab_claim_concrete(vdev_t *vd, uint64_t offset, uint64_t size,
    uint64_t txg)
{
	metaslab_t *msp;
	int error = 0;
	spa_t *spa = vd->vdev_spa;

	if (offset >> vd->vdev_ms_shift >= vd->vdev_ms_count)
		return (SET_ERROR(ENXIO));

	ASSERT3P(vd->vdev_ms, !=, NULL);
	msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];

	mutex_enter(&msp->ms_lock);

	if ((txg != 0 && spa_writeable(spa)) || !msp->ms_loaded)
		error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY);

	if (error == 0 && !range_tree_contains(msp->ms_tree, offset, size))
		error = SET_ERROR(ENOENT);

	if (error || txg == 0) {	/* txg == 0 indicates dry run */
		mutex_exit(&msp->ms_lock);
		return (error);
	}

	VERIFY(!msp->ms_condensing);
	VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
	VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
	VERIFY3U(range_tree_space(msp->ms_tree) - size, <=, msp->ms_size);
	range_tree_remove(msp->ms_tree, offset, size);

	if (spa_writeable(spa)) {	/* don't dirty if we're zdb(1M) */
		if (range_tree_space(msp->ms_alloctree[txg & TXG_MASK]) == 0)
			vdev_dirty(vd, VDD_METASLAB, msp, txg);
		range_tree_add(msp->ms_alloctree[txg & TXG_MASK], offset, size);
	}

	mutex_exit(&msp->ms_lock);

	return (0);
}

typedef struct metaslab_claim_cb_arg_t {
	uint64_t	mcca_txg;
	int		mcca_error;
} metaslab_claim_cb_arg_t;

/* ARGSUSED */
static void
metaslab_claim_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset,
    uint64_t size, void *arg)
{
	metaslab_claim_cb_arg_t *mcca_arg = arg;

	if (mcca_arg->mcca_error == 0) {
		mcca_arg->mcca_error = metaslab_claim_concrete(vd, offset,
		    size, mcca_arg->mcca_txg);
	}
}

static int
metaslab_claim_impl(vdev_t *vd, uint64_t offset, uint64_t size, uint64_t txg)
{
	if (vd->vdev_ops->vdev_op_remap != NULL) {
		metaslab_claim_cb_arg_t arg;

		/*
		 * Only zdb(1M) can claim on indirect vdevs.  This is used
		 * to detect leaks of mapped space (that are not accounted
		 * for in the obsolete counts, spacemap, or bpobj).
		 */
		ASSERT(!spa_writeable(vd->vdev_spa));
		arg.mcca_error = 0;
		arg.mcca_txg = txg;

		vd->vdev_ops->vdev_op_remap(vd, offset, size,
		    metaslab_claim_impl_cb, &arg);

		if (arg.mcca_error == 0) {
			arg.mcca_error = metaslab_claim_concrete(vd,
			    offset, size, txg);
		}
		return (arg.mcca_error);
	} else {
		return (metaslab_claim_concrete(vd, offset, size, txg));
	}
}

/*
 * Intent log support: upon opening the pool after a crash, notify the SPA
 * of blocks that the intent log has allocated for immediate write, but
 * which are still considered free by the SPA because the last transaction
 * group didn't commit yet.
 */
static int
metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
{
	uint64_t vdev = DVA_GET_VDEV(dva);
	uint64_t offset = DVA_GET_OFFSET(dva);
	uint64_t size = DVA_GET_ASIZE(dva);
	vdev_t *vd;

	if ((vd = vdev_lookup_top(spa, vdev)) == NULL) {
		return (SET_ERROR(ENXIO));
	}

	ASSERT(DVA_IS_VALID(dva));

	if (DVA_GET_GANG(dva))
		size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);

	return (metaslab_claim_impl(vd, offset, size, txg));
}

int
metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp,
    int ndvas, uint64_t txg, blkptr_t *hintbp, int flags,
    zio_alloc_list_t *zal, zio_t *zio)
{
	dva_t *dva = bp->blk_dva;
	dva_t *hintdva = hintbp->blk_dva;
	int error = 0;

	ASSERT(bp->blk_birth == 0);
	ASSERT(BP_PHYSICAL_BIRTH(bp) == 0);

	spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);

	if (mc->mc_rotor == NULL) {	/* no vdevs in this class */
		spa_config_exit(spa, SCL_ALLOC, FTAG);
		return (SET_ERROR(ENOSPC));
	}

	ASSERT(ndvas > 0 && ndvas <= spa_max_replication(spa));
	ASSERT(BP_GET_NDVAS(bp) == 0);
	ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp));
	ASSERT3P(zal, !=, NULL);

	for (int d = 0; d < ndvas; d++) {
		error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva,
		    txg, flags, zal);
		if (error != 0) {
			for (d--; d >= 0; d--) {
				metaslab_unalloc_dva(spa, &dva[d], txg);
				metaslab_group_alloc_decrement(spa,
				    DVA_GET_VDEV(&dva[d]), zio, flags);
				bzero(&dva[d], sizeof (dva_t));
			}
			spa_config_exit(spa, SCL_ALLOC, FTAG);
			return (error);
		} else {
			/*
			 * Update the metaslab group's queue depth
			 * based on the newly allocated dva.
			 */
			metaslab_group_alloc_increment(spa,
			    DVA_GET_VDEV(&dva[d]), zio, flags);
		}

	}
	ASSERT(error == 0);
	ASSERT(BP_GET_NDVAS(bp) == ndvas);

	spa_config_exit(spa, SCL_ALLOC, FTAG);

	BP_SET_BIRTH(bp, txg, 0);

	return (0);
}

void
metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now)
{
	const dva_t *dva = bp->blk_dva;
	int ndvas = BP_GET_NDVAS(bp);

	ASSERT(!BP_IS_HOLE(bp));
	ASSERT(!now || bp->blk_birth >= spa_syncing_txg(spa));

	spa_config_enter(spa, SCL_FREE, FTAG, RW_READER);

	for (int d = 0; d < ndvas; d++) {
		if (now) {
			metaslab_unalloc_dva(spa, &dva[d], txg);
		} else {
			metaslab_free_dva(spa, &dva[d], txg);
		}
	}

	spa_config_exit(spa, SCL_FREE, FTAG);
}

int
metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg)
{
	const dva_t *dva = bp->blk_dva;
	int ndvas = BP_GET_NDVAS(bp);
	int error = 0;

	ASSERT(!BP_IS_HOLE(bp));

	if (txg != 0) {
		/*
		 * First do a dry run to make sure all DVAs are claimable,
		 * so we don't have to unwind from partial failures below.
		 */
		if ((error = metaslab_claim(spa, bp, 0)) != 0)
			return (error);
	}

	spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);

	for (int d = 0; d < ndvas; d++)
		if ((error = metaslab_claim_dva(spa, &dva[d], txg)) != 0)
			break;

	spa_config_exit(spa, SCL_ALLOC, FTAG);

	ASSERT(error == 0 || txg == 0);

	return (error);
}
*spa
, const blkptr_t
*bp
)
3740 const dva_t
*dva
= bp
->blk_dva
;
3741 int ndvas
= BP_GET_NDVAS(bp
);
3742 uint64_t psize
= BP_GET_PSIZE(bp
);
3746 ASSERT(!BP_IS_HOLE(bp
));
3747 ASSERT(!BP_IS_EMBEDDED(bp
));
3750 spa_config_enter(spa
, SCL_VDEV
, FTAG
, RW_READER
);
3752 for (d
= 0; d
< ndvas
; d
++) {
3753 if ((vd
= vdev_lookup_top(spa
, DVA_GET_VDEV(&dva
[d
]))) == NULL
)
3755 atomic_add_64(&vd
->vdev_pending_fastwrite
, psize
);
3758 spa_config_exit(spa
, SCL_VDEV
, FTAG
);
3762 metaslab_fastwrite_unmark(spa_t
*spa
, const blkptr_t
*bp
)
3764 const dva_t
*dva
= bp
->blk_dva
;
3765 int ndvas
= BP_GET_NDVAS(bp
);
3766 uint64_t psize
= BP_GET_PSIZE(bp
);
3770 ASSERT(!BP_IS_HOLE(bp
));
3771 ASSERT(!BP_IS_EMBEDDED(bp
));
3774 spa_config_enter(spa
, SCL_VDEV
, FTAG
, RW_READER
);
3776 for (d
= 0; d
< ndvas
; d
++) {
3777 if ((vd
= vdev_lookup_top(spa
, DVA_GET_VDEV(&dva
[d
]))) == NULL
)
3779 ASSERT3U(vd
->vdev_pending_fastwrite
, >=, psize
);
3780 atomic_sub_64(&vd
->vdev_pending_fastwrite
, psize
);
3783 spa_config_exit(spa
, SCL_VDEV
, FTAG
);

/* ARGSUSED */
static void
metaslab_check_free_impl_cb(uint64_t inner, vdev_t *vd, uint64_t offset,
    uint64_t size, void *arg)
{
	if (vd->vdev_ops == &vdev_indirect_ops)
		return;

	metaslab_check_free_impl(vd, offset, size);
}

static void
metaslab_check_free_impl(vdev_t *vd, uint64_t offset, uint64_t size)
{
	metaslab_t *msp;
	ASSERTV(spa_t *spa = vd->vdev_spa);

	if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0)
		return;

	if (vd->vdev_ops->vdev_op_remap != NULL) {
		vd->vdev_ops->vdev_op_remap(vd, offset, size,
		    metaslab_check_free_impl_cb, NULL);
		return;
	}

	ASSERT(vdev_is_concrete(vd));
	ASSERT3U(offset >> vd->vdev_ms_shift, <, vd->vdev_ms_count);
	ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);

	msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];

	mutex_enter(&msp->ms_lock);
	if (msp->ms_loaded)
		range_tree_verify(msp->ms_tree, offset, size);

	range_tree_verify(msp->ms_freeingtree, offset, size);
	range_tree_verify(msp->ms_freedtree, offset, size);
	for (int j = 0; j < TXG_DEFER_SIZE; j++)
		range_tree_verify(msp->ms_defertree[j], offset, size);
	mutex_exit(&msp->ms_lock);
}

void
metaslab_check_free(spa_t *spa, const blkptr_t *bp)
{
	if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0)
		return;

	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
	for (int i = 0; i < BP_GET_NDVAS(bp); i++) {
		uint64_t vdev = DVA_GET_VDEV(&bp->blk_dva[i]);
		vdev_t *vd = vdev_lookup_top(spa, vdev);
		uint64_t offset = DVA_GET_OFFSET(&bp->blk_dva[i]);
		uint64_t size = DVA_GET_ASIZE(&bp->blk_dva[i]);

		if (DVA_GET_GANG(&bp->blk_dva[i]))
			size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);

		ASSERT3P(vd, !=, NULL);

		metaslab_check_free_impl(vd, offset, size);
	}
	spa_config_exit(spa, SCL_VDEV, FTAG);
}

#if defined(_KERNEL) && defined(HAVE_SPL)
module_param(metaslab_aliquot, ulong, 0644);
MODULE_PARM_DESC(metaslab_aliquot,
	"allocation granularity (a.k.a. stripe size)");

module_param(metaslab_debug_load, int, 0644);
MODULE_PARM_DESC(metaslab_debug_load,
	"load all metaslabs when pool is first opened");

module_param(metaslab_debug_unload, int, 0644);
MODULE_PARM_DESC(metaslab_debug_unload,
	"prevent metaslabs from being unloaded");

module_param(metaslab_preload_enabled, int, 0644);
MODULE_PARM_DESC(metaslab_preload_enabled,
	"preload potential metaslabs during reassessment");

module_param(zfs_mg_noalloc_threshold, int, 0644);
MODULE_PARM_DESC(zfs_mg_noalloc_threshold,
	"percentage of free space for metaslab group to allow allocation");

module_param(zfs_mg_fragmentation_threshold, int, 0644);
MODULE_PARM_DESC(zfs_mg_fragmentation_threshold,
	"fragmentation for metaslab group to allow allocation");

module_param(zfs_metaslab_fragmentation_threshold, int, 0644);
MODULE_PARM_DESC(zfs_metaslab_fragmentation_threshold,
	"fragmentation for metaslab to allow allocation");

module_param(metaslab_fragmentation_factor_enabled, int, 0644);
MODULE_PARM_DESC(metaslab_fragmentation_factor_enabled,
	"use the fragmentation metric to prefer less fragmented metaslabs");

module_param(metaslab_lba_weighting_enabled, int, 0644);
MODULE_PARM_DESC(metaslab_lba_weighting_enabled,
	"prefer metaslabs with lower LBAs");

module_param(metaslab_bias_enabled, int, 0644);
MODULE_PARM_DESC(metaslab_bias_enabled,
	"enable metaslab group biasing");

module_param(zfs_metaslab_segment_weight_enabled, int, 0644);
MODULE_PARM_DESC(zfs_metaslab_segment_weight_enabled,
	"enable segment-based metaslab selection");

module_param(zfs_metaslab_switch_threshold, int, 0644);
MODULE_PARM_DESC(zfs_metaslab_switch_threshold,
	"segment-based metaslab selection maximum buckets before switching");

module_param(metaslab_gang_bang, ulong, 0644);
MODULE_PARM_DESC(metaslab_gang_bang,
	"blocks larger than this size are forced to be gang blocks");
#endif /* _KERNEL && HAVE_SPL */