1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright (c) 2011, 2016 by Delphix. All rights reserved.
24 * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
25 */
26
27 #include <sys/zfs_context.h>
28 #include <sys/dmu.h>
29 #include <sys/dmu_tx.h>
30 #include <sys/space_map.h>
31 #include <sys/metaslab_impl.h>
32 #include <sys/vdev_impl.h>
33 #include <sys/zio.h>
34 #include <sys/spa_impl.h>
35 #include <sys/zfeature.h>
36
37 #define WITH_DF_BLOCK_ALLOCATOR
38
39 #define GANG_ALLOCATION(flags) \
40 ((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER))
41
42 /*
43 * Metaslab granularity, in bytes. This is roughly similar to what would be
44 * referred to as the "stripe size" in traditional RAID arrays. In normal
45 * operation, we will try to write this amount of data to a top-level vdev
46 * before moving on to the next one.
47 */
48 unsigned long metaslab_aliquot = 512 << 10;
49
50 uint64_t metaslab_gang_bang = SPA_MAXBLOCKSIZE + 1; /* force gang blocks */
51
52 /*
53 * The in-core space map representation is more compact than its on-disk form.
54 * The zfs_condense_pct determines how much more compact the in-core
55 * space map representation must be before we compact it on-disk.
56 * Values should be greater than or equal to 100.
57 */
58 int zfs_condense_pct = 200;
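/*
 * For example (illustrative reading of the tunable above): with the default
 * of 200, the in-core representation must be at least twice as compact as,
 * i.e. no more than half the size of, the current on-disk space map before
 * we bother rewriting it.
 */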
59
60 /*
61 * Condensing a metaslab is not guaranteed to actually reduce the amount of
62 * space used on disk. In particular, a space map uses data in increments of
63 * MAX(1 << ashift, space_map_blksz), so a metaslab might use the
64 * same number of blocks after condensing. Since the goal of condensing is to
65 * reduce the number of IOPs required to read the space map, we only want to
66 * condense when we can be sure we will reduce the number of blocks used by the
67 * space map. Unfortunately, we cannot precisely compute whether or not this is
68 * the case in metaslab_should_condense since we are holding ms_lock. Instead,
69 * we apply the following heuristic: do not condense a spacemap unless the
70 * uncondensed size consumes greater than zfs_metaslab_condense_block_threshold
71 * blocks.
72 */
73 int zfs_metaslab_condense_block_threshold = 4;
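/*
 * For example (hypothetical block size): with 4 KiB space map blocks, a
 * space map whose uncondensed form occupies 16 KiB (4 blocks) or less on
 * disk is not considered for condensing under this heuristic.
 */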
74
75 /*
76 * The zfs_mg_noalloc_threshold defines which metaslab groups should
77 * be eligible for allocation. The value is defined as a percentage of
78 * free space. Metaslab groups that have more free space than
79 * zfs_mg_noalloc_threshold are always eligible for allocations. Once
80 * a metaslab group's free space is less than or equal to the
81 * zfs_mg_noalloc_threshold the allocator will avoid allocating to that
82 * group unless all groups in the pool have reached zfs_mg_noalloc_threshold.
83 * Once all groups in the pool reach zfs_mg_noalloc_threshold then all
84 * groups are allowed to accept allocations. Gang blocks are always
85 * eligible to allocate on any metaslab group. The default value of 0 means
86 * no metaslab group will be excluded based on this criterion.
87 */
88 int zfs_mg_noalloc_threshold = 0;
89
90 /*
91 * Metaslab groups are considered eligible for allocations if their
92  * fragmentation metric (measured as a percentage) is less than or equal to
93 * zfs_mg_fragmentation_threshold. If a metaslab group exceeds this threshold
94 * then it will be skipped unless all metaslab groups within the metaslab
95 * class have also crossed this threshold.
96 */
97 int zfs_mg_fragmentation_threshold = 85;
98
99 /*
100 * Allow metaslabs to keep their active state as long as their fragmentation
101 * percentage is less than or equal to zfs_metaslab_fragmentation_threshold. An
102 * active metaslab that exceeds this threshold will no longer keep its active
103 * status allowing better metaslabs to be selected.
104 */
105 int zfs_metaslab_fragmentation_threshold = 70;
106
107 /*
108 * When set will load all metaslabs when pool is first opened.
109 */
110 int metaslab_debug_load = 0;
111
112 /*
113 * When set will prevent metaslabs from being unloaded.
114 */
115 int metaslab_debug_unload = 0;
116
117 /*
118 * Minimum size which forces the dynamic allocator to change
119  * its allocation strategy. Once the space map cannot satisfy
120  * an allocation of this size then it switches to using a more
121  * aggressive strategy (i.e. search by size rather than offset).
122 */
123 uint64_t metaslab_df_alloc_threshold = SPA_OLD_MAXBLOCKSIZE;
124
125 /*
126 * The minimum free space, in percent, which must be available
127 * in a space map to continue allocations in a first-fit fashion.
128 * Once the space map's free space drops below this level we dynamically
129 * switch to using best-fit allocations.
130 */
131 int metaslab_df_free_pct = 4;
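/*
 * Taken together with metaslab_df_alloc_threshold (illustrative numbers):
 * a 10 GiB metaslab keeps allocating first-fit while at least ~410 MiB (4%)
 * of it remains free and its largest free segment is at least
 * metaslab_df_alloc_threshold; once either condition fails,
 * metaslab_df_alloc() below switches to the size-sorted (best-fit) tree.
 */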
132
133 /*
134 * Percentage of all cpus that can be used by the metaslab taskq.
135 */
136 int metaslab_load_pct = 50;
137
138 /*
139 * Determines how many txgs a metaslab may remain loaded without having any
140 * allocations from it. As long as a metaslab continues to be used we will
141 * keep it loaded.
142 */
143 int metaslab_unload_delay = TXG_SIZE * 2;
144
145 /*
146 * Max number of metaslabs per group to preload.
147 */
148 int metaslab_preload_limit = SPA_DVAS_PER_BP;
149
150 /*
151  * Enable/disable preloading of metaslabs.
152 */
153 int metaslab_preload_enabled = B_TRUE;
154
155 /*
156 * Enable/disable fragmentation weighting on metaslabs.
157 */
158 int metaslab_fragmentation_factor_enabled = B_TRUE;
159
160 /*
161 * Enable/disable lba weighting (i.e. outer tracks are given preference).
162 */
163 int metaslab_lba_weighting_enabled = B_TRUE;
164
165 /*
166 * Enable/disable metaslab group biasing.
167 */
168 int metaslab_bias_enabled = B_TRUE;
169
170
171 /*
172 * Enable/disable segment-based metaslab selection.
173 */
174 int zfs_metaslab_segment_weight_enabled = B_TRUE;
175
176 /*
177 * When using segment-based metaslab selection, we will continue
178 * allocating from the active metaslab until we have exhausted
179 * zfs_metaslab_switch_threshold of its buckets.
180 */
181 int zfs_metaslab_switch_threshold = 2;
182
183 /*
184 * Internal switch to enable/disable the metaslab allocation tracing
185 * facility.
186 */
187 #ifdef _METASLAB_TRACING
188 boolean_t metaslab_trace_enabled = B_TRUE;
189 #endif
190
191 /*
192 * Maximum entries that the metaslab allocation tracing facility will keep
193 * in a given list when running in non-debug mode. We limit the number
194 * of entries in non-debug mode to prevent us from using up too much memory.
195 * The limit should be sufficiently large that we don't expect any allocation
196  * to ever exceed this value. In debug mode, the system will panic if this
197 * limit is ever reached allowing for further investigation.
198 */
199 #ifdef _METASLAB_TRACING
200 uint64_t metaslab_trace_max_entries = 5000;
201 #endif
202
203 static uint64_t metaslab_weight(metaslab_t *);
204 static void metaslab_set_fragmentation(metaslab_t *);
205
206 #ifdef _METASLAB_TRACING
207 kmem_cache_t *metaslab_alloc_trace_cache;
208 #endif
209
210 /*
211 * ==========================================================================
212 * Metaslab classes
213 * ==========================================================================
214 */
215 metaslab_class_t *
216 metaslab_class_create(spa_t *spa, metaslab_ops_t *ops)
217 {
218 metaslab_class_t *mc;
219
220 mc = kmem_zalloc(sizeof (metaslab_class_t), KM_SLEEP);
221
222 mc->mc_spa = spa;
223 mc->mc_rotor = NULL;
224 mc->mc_ops = ops;
225 mutex_init(&mc->mc_lock, NULL, MUTEX_DEFAULT, NULL);
226 refcount_create_tracked(&mc->mc_alloc_slots);
227
228 return (mc);
229 }
230
231 void
232 metaslab_class_destroy(metaslab_class_t *mc)
233 {
234 ASSERT(mc->mc_rotor == NULL);
235 ASSERT(mc->mc_alloc == 0);
236 ASSERT(mc->mc_deferred == 0);
237 ASSERT(mc->mc_space == 0);
238 ASSERT(mc->mc_dspace == 0);
239
240 refcount_destroy(&mc->mc_alloc_slots);
241 mutex_destroy(&mc->mc_lock);
242 kmem_free(mc, sizeof (metaslab_class_t));
243 }
244
245 int
246 metaslab_class_validate(metaslab_class_t *mc)
247 {
248 metaslab_group_t *mg;
249 vdev_t *vd;
250
251 /*
252 * Must hold one of the spa_config locks.
253 */
254 ASSERT(spa_config_held(mc->mc_spa, SCL_ALL, RW_READER) ||
255 spa_config_held(mc->mc_spa, SCL_ALL, RW_WRITER));
256
257 if ((mg = mc->mc_rotor) == NULL)
258 return (0);
259
260 do {
261 vd = mg->mg_vd;
262 ASSERT(vd->vdev_mg != NULL);
263 ASSERT3P(vd->vdev_top, ==, vd);
264 ASSERT3P(mg->mg_class, ==, mc);
265 ASSERT3P(vd->vdev_ops, !=, &vdev_hole_ops);
266 } while ((mg = mg->mg_next) != mc->mc_rotor);
267
268 return (0);
269 }
270
271 void
272 metaslab_class_space_update(metaslab_class_t *mc, int64_t alloc_delta,
273 int64_t defer_delta, int64_t space_delta, int64_t dspace_delta)
274 {
275 atomic_add_64(&mc->mc_alloc, alloc_delta);
276 atomic_add_64(&mc->mc_deferred, defer_delta);
277 atomic_add_64(&mc->mc_space, space_delta);
278 atomic_add_64(&mc->mc_dspace, dspace_delta);
279 }
280
281 uint64_t
282 metaslab_class_get_alloc(metaslab_class_t *mc)
283 {
284 return (mc->mc_alloc);
285 }
286
287 uint64_t
288 metaslab_class_get_deferred(metaslab_class_t *mc)
289 {
290 return (mc->mc_deferred);
291 }
292
293 uint64_t
294 metaslab_class_get_space(metaslab_class_t *mc)
295 {
296 return (mc->mc_space);
297 }
298
299 uint64_t
300 metaslab_class_get_dspace(metaslab_class_t *mc)
301 {
302 return (spa_deflate(mc->mc_spa) ? mc->mc_dspace : mc->mc_space);
303 }
304
305 void
306 metaslab_class_histogram_verify(metaslab_class_t *mc)
307 {
308 vdev_t *rvd = mc->mc_spa->spa_root_vdev;
309 uint64_t *mc_hist;
310 int i;
311
312 if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0)
313 return;
314
315 mc_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE,
316 KM_SLEEP);
317
318 for (int c = 0; c < rvd->vdev_children; c++) {
319 vdev_t *tvd = rvd->vdev_child[c];
320 metaslab_group_t *mg = tvd->vdev_mg;
321
322 /*
323 * Skip any holes, uninitialized top-levels, or
324  * vdevs that are not in this metaslab class.
325 */
326 if (tvd->vdev_ishole || tvd->vdev_ms_shift == 0 ||
327 mg->mg_class != mc) {
328 continue;
329 }
330
331 for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
332 mc_hist[i] += mg->mg_histogram[i];
333 }
334
335 for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
336 VERIFY3U(mc_hist[i], ==, mc->mc_histogram[i]);
337
338 kmem_free(mc_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE);
339 }
340
341 /*
342 * Calculate the metaslab class's fragmentation metric. The metric
343 * is weighted based on the space contribution of each metaslab group.
344 * The return value will be a number between 0 and 100 (inclusive), or
345 * ZFS_FRAG_INVALID if the metric has not been set. See comment above the
346 * zfs_frag_table for more information about the metric.
347 */
348 uint64_t
349 metaslab_class_fragmentation(metaslab_class_t *mc)
350 {
351 vdev_t *rvd = mc->mc_spa->spa_root_vdev;
352 uint64_t fragmentation = 0;
353
354 spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER);
355
356 for (int c = 0; c < rvd->vdev_children; c++) {
357 vdev_t *tvd = rvd->vdev_child[c];
358 metaslab_group_t *mg = tvd->vdev_mg;
359
360 /*
361 * Skip any holes, uninitialized top-levels, or
362  * vdevs that are not in this metaslab class.
363 */
364 if (tvd->vdev_ishole || tvd->vdev_ms_shift == 0 ||
365 mg->mg_class != mc) {
366 continue;
367 }
368
369 /*
370 * If a metaslab group does not contain a fragmentation
371 * metric then just bail out.
372 */
373 if (mg->mg_fragmentation == ZFS_FRAG_INVALID) {
374 spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
375 return (ZFS_FRAG_INVALID);
376 }
377
378 /*
379 * Determine how much this metaslab_group is contributing
380 * to the overall pool fragmentation metric.
381 */
382 fragmentation += mg->mg_fragmentation *
383 metaslab_group_get_space(mg);
384 }
385 fragmentation /= metaslab_class_get_space(mc);
386
387 ASSERT3U(fragmentation, <=, 100);
388 spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
389 return (fragmentation);
390 }
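/*
 * Worked example (hypothetical pool): if group A contributes 75% of the
 * class's space at 20% fragmentation and group B the remaining 25% at 60%,
 * the class fragmentation computed above is 0.75 * 20 + 0.25 * 60 = 30.
 */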
391
392 /*
393 * Calculate the amount of expandable space that is available in
394 * this metaslab class. If a device is expanded then its expandable
395 * space will be the amount of allocatable space that is currently not
396 * part of this metaslab class.
397 */
398 uint64_t
399 metaslab_class_expandable_space(metaslab_class_t *mc)
400 {
401 vdev_t *rvd = mc->mc_spa->spa_root_vdev;
402 uint64_t space = 0;
403
404 spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER);
405 for (int c = 0; c < rvd->vdev_children; c++) {
406 vdev_t *tvd = rvd->vdev_child[c];
407 metaslab_group_t *mg = tvd->vdev_mg;
408
409 if (tvd->vdev_ishole || tvd->vdev_ms_shift == 0 ||
410 mg->mg_class != mc) {
411 continue;
412 }
413
414 /*
415 * Calculate if we have enough space to add additional
416 * metaslabs. We report the expandable space in terms
417 * of the metaslab size since that's the unit of expansion.
418 */
419 space += P2ALIGN(tvd->vdev_max_asize - tvd->vdev_asize,
420 1ULL << tvd->vdev_ms_shift);
421 }
422 spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
423 return (space);
424 }
425
426 static int
427 metaslab_compare(const void *x1, const void *x2)
428 {
429 const metaslab_t *m1 = (const metaslab_t *)x1;
430 const metaslab_t *m2 = (const metaslab_t *)x2;
431
432 int cmp = AVL_CMP(m2->ms_weight, m1->ms_weight);
433 if (likely(cmp))
434 return (cmp);
435
436 IMPLY(AVL_CMP(m1->ms_start, m2->ms_start) == 0, m1 == m2);
437
438 return (AVL_CMP(m1->ms_start, m2->ms_start));
439 }
440
441 /*
442 * Verify that the space accounting on disk matches the in-core range_trees.
443 */
444 void
445 metaslab_verify_space(metaslab_t *msp, uint64_t txg)
446 {
447 spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
448 uint64_t allocated = 0;
449 uint64_t sm_free_space, msp_free_space;
450
451 ASSERT(MUTEX_HELD(&msp->ms_lock));
452
453 if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0)
454 return;
455
456 /*
457 * We can only verify the metaslab space when we're called
458 * from syncing context with a loaded metaslab that has an allocated
459 * space map. Calling this in non-syncing context does not
460 * provide a consistent view of the metaslab since we're performing
461 * allocations in the future.
462 */
463 if (txg != spa_syncing_txg(spa) || msp->ms_sm == NULL ||
464 !msp->ms_loaded)
465 return;
466
467 sm_free_space = msp->ms_size - space_map_allocated(msp->ms_sm) -
468 space_map_alloc_delta(msp->ms_sm);
469
470 /*
471 * Account for future allocations since we would have already
472 * deducted that space from the ms_freetree.
473 */
474 for (int t = 0; t < TXG_CONCURRENT_STATES; t++) {
475 allocated +=
476 range_tree_space(msp->ms_alloctree[(txg + t) & TXG_MASK]);
477 }
478
479 msp_free_space = range_tree_space(msp->ms_tree) + allocated +
480 msp->ms_deferspace + range_tree_space(msp->ms_freedtree);
481
482 VERIFY3U(sm_free_space, ==, msp_free_space);
483 }
484
485 /*
486 * ==========================================================================
487 * Metaslab groups
488 * ==========================================================================
489 */
490 /*
491 * Update the allocatable flag and the metaslab group's capacity.
492 * The allocatable flag is set to true if the capacity is below
493 * the zfs_mg_noalloc_threshold or has a fragmentation value that is
494 * greater than zfs_mg_fragmentation_threshold. If a metaslab group
495 * transitions from allocatable to non-allocatable or vice versa then the
496 * metaslab group's class is updated to reflect the transition.
497 */
498 static void
499 metaslab_group_alloc_update(metaslab_group_t *mg)
500 {
501 vdev_t *vd = mg->mg_vd;
502 metaslab_class_t *mc = mg->mg_class;
503 vdev_stat_t *vs = &vd->vdev_stat;
504 boolean_t was_allocatable;
505 boolean_t was_initialized;
506
507 ASSERT(vd == vd->vdev_top);
508
509 mutex_enter(&mg->mg_lock);
510 was_allocatable = mg->mg_allocatable;
511 was_initialized = mg->mg_initialized;
512
513 mg->mg_free_capacity = ((vs->vs_space - vs->vs_alloc) * 100) /
514 (vs->vs_space + 1);
515
516 mutex_enter(&mc->mc_lock);
517
518 /*
519 * If the metaslab group was just added then it won't
520 * have any space until we finish syncing out this txg.
521 * At that point we will consider it initialized and available
522 * for allocations. We also don't consider non-activated
523 * metaslab groups (e.g. vdevs that are in the middle of being removed)
524 * to be initialized, because they can't be used for allocation.
525 */
526 mg->mg_initialized = metaslab_group_initialized(mg);
527 if (!was_initialized && mg->mg_initialized) {
528 mc->mc_groups++;
529 } else if (was_initialized && !mg->mg_initialized) {
530 ASSERT3U(mc->mc_groups, >, 0);
531 mc->mc_groups--;
532 }
533 if (mg->mg_initialized)
534 mg->mg_no_free_space = B_FALSE;
535
536 /*
537 * A metaslab group is considered allocatable if it has plenty
538 * of free space or is not heavily fragmented. We only take
539 * fragmentation into account if the metaslab group has a valid
540 * fragmentation metric (i.e. a value between 0 and 100).
541 */
542 mg->mg_allocatable = (mg->mg_activation_count > 0 &&
543 mg->mg_free_capacity > zfs_mg_noalloc_threshold &&
544 (mg->mg_fragmentation == ZFS_FRAG_INVALID ||
545 mg->mg_fragmentation <= zfs_mg_fragmentation_threshold));
546
547 /*
548 * The mc_alloc_groups maintains a count of the number of
549 * groups in this metaslab class that are still above the
550 * zfs_mg_noalloc_threshold. This is used by the allocating
551 * threads to determine if they should avoid allocations to
552 * a given group. The allocator will avoid allocations to a group
553 * if that group has reached or is below the zfs_mg_noalloc_threshold
554 * and there are still other groups that are above the threshold.
555 * When a group transitions from allocatable to non-allocatable or
556 * vice versa we update the metaslab class to reflect that change.
557 * When the mc_alloc_groups value drops to 0 that means that all
558 * groups have reached the zfs_mg_noalloc_threshold making all groups
559 * eligible for allocations. This effectively means that all devices
560 * are balanced again.
561 */
562 if (was_allocatable && !mg->mg_allocatable)
563 mc->mc_alloc_groups--;
564 else if (!was_allocatable && mg->mg_allocatable)
565 mc->mc_alloc_groups++;
566 mutex_exit(&mc->mc_lock);
567
568 mutex_exit(&mg->mg_lock);
569 }
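/*
 * Example of the free-capacity calculation above (hypothetical numbers):
 * a top-level vdev with vs_space = 1000 GiB and vs_alloc = 900 GiB yields
 * mg_free_capacity of roughly 9. With zfs_mg_noalloc_threshold set to 10,
 * this group would be marked non-allocatable as long as other groups in
 * the class remain above the threshold.
 */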
570
571 metaslab_group_t *
572 metaslab_group_create(metaslab_class_t *mc, vdev_t *vd)
573 {
574 metaslab_group_t *mg;
575
576 mg = kmem_zalloc(sizeof (metaslab_group_t), KM_SLEEP);
577 mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL);
578 avl_create(&mg->mg_metaslab_tree, metaslab_compare,
579 sizeof (metaslab_t), offsetof(struct metaslab, ms_group_node));
580 mg->mg_vd = vd;
581 mg->mg_class = mc;
582 mg->mg_activation_count = 0;
583 mg->mg_initialized = B_FALSE;
584 mg->mg_no_free_space = B_TRUE;
585 refcount_create_tracked(&mg->mg_alloc_queue_depth);
586
587 mg->mg_taskq = taskq_create("metaslab_group_taskq", metaslab_load_pct,
588 maxclsyspri, 10, INT_MAX, TASKQ_THREADS_CPU_PCT | TASKQ_DYNAMIC);
589
590 return (mg);
591 }
592
593 void
594 metaslab_group_destroy(metaslab_group_t *mg)
595 {
596 ASSERT(mg->mg_prev == NULL);
597 ASSERT(mg->mg_next == NULL);
598 /*
599 * We may have gone below zero with the activation count
600 * either because we never activated in the first place or
601 * because we're done, and possibly removing the vdev.
602 */
603 ASSERT(mg->mg_activation_count <= 0);
604
605 taskq_destroy(mg->mg_taskq);
606 avl_destroy(&mg->mg_metaslab_tree);
607 mutex_destroy(&mg->mg_lock);
608 refcount_destroy(&mg->mg_alloc_queue_depth);
609 kmem_free(mg, sizeof (metaslab_group_t));
610 }
611
612 void
613 metaslab_group_activate(metaslab_group_t *mg)
614 {
615 metaslab_class_t *mc = mg->mg_class;
616 metaslab_group_t *mgprev, *mgnext;
617
618 ASSERT(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER));
619
620 ASSERT(mc->mc_rotor != mg);
621 ASSERT(mg->mg_prev == NULL);
622 ASSERT(mg->mg_next == NULL);
623 ASSERT(mg->mg_activation_count <= 0);
624
625 if (++mg->mg_activation_count <= 0)
626 return;
627
628 mg->mg_aliquot = metaslab_aliquot * MAX(1, mg->mg_vd->vdev_children);
629 metaslab_group_alloc_update(mg);
630
631 if ((mgprev = mc->mc_rotor) == NULL) {
632 mg->mg_prev = mg;
633 mg->mg_next = mg;
634 } else {
635 mgnext = mgprev->mg_next;
636 mg->mg_prev = mgprev;
637 mg->mg_next = mgnext;
638 mgprev->mg_next = mg;
639 mgnext->mg_prev = mg;
640 }
641 mc->mc_rotor = mg;
642 }
643
644 void
645 metaslab_group_passivate(metaslab_group_t *mg)
646 {
647 metaslab_class_t *mc = mg->mg_class;
648 metaslab_group_t *mgprev, *mgnext;
649
650 ASSERT(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER));
651
652 if (--mg->mg_activation_count != 0) {
653 ASSERT(mc->mc_rotor != mg);
654 ASSERT(mg->mg_prev == NULL);
655 ASSERT(mg->mg_next == NULL);
656 ASSERT(mg->mg_activation_count < 0);
657 return;
658 }
659
660 taskq_wait_outstanding(mg->mg_taskq, 0);
661 metaslab_group_alloc_update(mg);
662
663 mgprev = mg->mg_prev;
664 mgnext = mg->mg_next;
665
666 if (mg == mgnext) {
667 mc->mc_rotor = NULL;
668 } else {
669 mc->mc_rotor = mgnext;
670 mgprev->mg_next = mgnext;
671 mgnext->mg_prev = mgprev;
672 }
673
674 mg->mg_prev = NULL;
675 mg->mg_next = NULL;
676 }
677
678 boolean_t
679 metaslab_group_initialized(metaslab_group_t *mg)
680 {
681 vdev_t *vd = mg->mg_vd;
682 vdev_stat_t *vs = &vd->vdev_stat;
683
684 return (vs->vs_space != 0 && mg->mg_activation_count > 0);
685 }
686
687 uint64_t
688 metaslab_group_get_space(metaslab_group_t *mg)
689 {
690 return ((1ULL << mg->mg_vd->vdev_ms_shift) * mg->mg_vd->vdev_ms_count);
691 }
692
693 void
694 metaslab_group_histogram_verify(metaslab_group_t *mg)
695 {
696 uint64_t *mg_hist;
697 vdev_t *vd = mg->mg_vd;
698 uint64_t ashift = vd->vdev_ashift;
699 int i;
700
701 if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0)
702 return;
703
704 mg_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE,
705 KM_SLEEP);
706
707 ASSERT3U(RANGE_TREE_HISTOGRAM_SIZE, >=,
708 SPACE_MAP_HISTOGRAM_SIZE + ashift);
709
710 for (int m = 0; m < vd->vdev_ms_count; m++) {
711 metaslab_t *msp = vd->vdev_ms[m];
712
713 if (msp->ms_sm == NULL)
714 continue;
715
716 for (i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++)
717 mg_hist[i + ashift] +=
718 msp->ms_sm->sm_phys->smp_histogram[i];
719 }
720
721 	for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
722 VERIFY3U(mg_hist[i], ==, mg->mg_histogram[i]);
723
724 kmem_free(mg_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE);
725 }
726
727 static void
728 metaslab_group_histogram_add(metaslab_group_t *mg, metaslab_t *msp)
729 {
730 metaslab_class_t *mc = mg->mg_class;
731 uint64_t ashift = mg->mg_vd->vdev_ashift;
732
733 ASSERT(MUTEX_HELD(&msp->ms_lock));
734 if (msp->ms_sm == NULL)
735 return;
736
737 mutex_enter(&mg->mg_lock);
738 for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
739 mg->mg_histogram[i + ashift] +=
740 msp->ms_sm->sm_phys->smp_histogram[i];
741 mc->mc_histogram[i + ashift] +=
742 msp->ms_sm->sm_phys->smp_histogram[i];
743 }
744 mutex_exit(&mg->mg_lock);
745 }
746
747 void
748 metaslab_group_histogram_remove(metaslab_group_t *mg, metaslab_t *msp)
749 {
750 metaslab_class_t *mc = mg->mg_class;
751 uint64_t ashift = mg->mg_vd->vdev_ashift;
752
753 ASSERT(MUTEX_HELD(&msp->ms_lock));
754 if (msp->ms_sm == NULL)
755 return;
756
757 mutex_enter(&mg->mg_lock);
758 for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
759 ASSERT3U(mg->mg_histogram[i + ashift], >=,
760 msp->ms_sm->sm_phys->smp_histogram[i]);
761 ASSERT3U(mc->mc_histogram[i + ashift], >=,
762 msp->ms_sm->sm_phys->smp_histogram[i]);
763
764 mg->mg_histogram[i + ashift] -=
765 msp->ms_sm->sm_phys->smp_histogram[i];
766 mc->mc_histogram[i + ashift] -=
767 msp->ms_sm->sm_phys->smp_histogram[i];
768 }
769 mutex_exit(&mg->mg_lock);
770 }
771
772 static void
773 metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp)
774 {
775 ASSERT(msp->ms_group == NULL);
776 mutex_enter(&mg->mg_lock);
777 msp->ms_group = mg;
778 msp->ms_weight = 0;
779 avl_add(&mg->mg_metaslab_tree, msp);
780 mutex_exit(&mg->mg_lock);
781
782 mutex_enter(&msp->ms_lock);
783 metaslab_group_histogram_add(mg, msp);
784 mutex_exit(&msp->ms_lock);
785 }
786
787 static void
788 metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp)
789 {
790 mutex_enter(&msp->ms_lock);
791 metaslab_group_histogram_remove(mg, msp);
792 mutex_exit(&msp->ms_lock);
793
794 mutex_enter(&mg->mg_lock);
795 ASSERT(msp->ms_group == mg);
796 avl_remove(&mg->mg_metaslab_tree, msp);
797 msp->ms_group = NULL;
798 mutex_exit(&mg->mg_lock);
799 }
800
801 static void
802 metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
803 {
804 /*
805 * Although in principle the weight can be any value, in
806 * practice we do not use values in the range [1, 511].
807 */
808 ASSERT(weight >= SPA_MINBLOCKSIZE || weight == 0);
809 ASSERT(MUTEX_HELD(&msp->ms_lock));
810
811 mutex_enter(&mg->mg_lock);
812 ASSERT(msp->ms_group == mg);
813 avl_remove(&mg->mg_metaslab_tree, msp);
814 msp->ms_weight = weight;
815 avl_add(&mg->mg_metaslab_tree, msp);
816 mutex_exit(&mg->mg_lock);
817 }
818
819 /*
820 * Calculate the fragmentation for a given metaslab group. We can use
821 * a simple average here since all metaslabs within the group must have
822 * the same size. The return value will be a value between 0 and 100
823  * (inclusive), or ZFS_FRAG_INVALID if less than half of the metaslabs in this
824 * group have a fragmentation metric.
825 */
826 uint64_t
827 metaslab_group_fragmentation(metaslab_group_t *mg)
828 {
829 vdev_t *vd = mg->mg_vd;
830 uint64_t fragmentation = 0;
831 uint64_t valid_ms = 0;
832
833 for (int m = 0; m < vd->vdev_ms_count; m++) {
834 metaslab_t *msp = vd->vdev_ms[m];
835
836 if (msp->ms_fragmentation == ZFS_FRAG_INVALID)
837 continue;
838
839 valid_ms++;
840 fragmentation += msp->ms_fragmentation;
841 }
842
843 if (valid_ms <= vd->vdev_ms_count / 2)
844 return (ZFS_FRAG_INVALID);
845
846 fragmentation /= valid_ms;
847 ASSERT3U(fragmentation, <=, 100);
848 return (fragmentation);
849 }
850
851 /*
852 * Determine if a given metaslab group should skip allocations. A metaslab
853 * group should avoid allocations if its free capacity is less than the
854 * zfs_mg_noalloc_threshold or its fragmentation metric is greater than
855 * zfs_mg_fragmentation_threshold and there is at least one metaslab group
856 * that can still handle allocations. If the allocation throttle is enabled
857 * then we skip allocations to devices that have reached their maximum
858 * allocation queue depth unless the selected metaslab group is the only
859 * eligible group remaining.
860 */
861 static boolean_t
862 metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor,
863 uint64_t psize)
864 {
865 spa_t *spa = mg->mg_vd->vdev_spa;
866 metaslab_class_t *mc = mg->mg_class;
867
868 /*
869 * We can only consider skipping this metaslab group if it's
870 * in the normal metaslab class and there are other metaslab
871 * groups to select from. Otherwise, we always consider it eligible
872 * for allocations.
873 */
874 if (mc != spa_normal_class(spa) || mc->mc_groups <= 1)
875 return (B_TRUE);
876
877 /*
878 * If the metaslab group's mg_allocatable flag is set (see comments
879 * in metaslab_group_alloc_update() for more information) and
880 * the allocation throttle is disabled then allow allocations to this
881 * device. However, if the allocation throttle is enabled then
882 * check if we have reached our allocation limit (mg_alloc_queue_depth)
883 * to determine if we should allow allocations to this metaslab group.
884 * If all metaslab groups are no longer considered allocatable
885 * (mc_alloc_groups == 0) or we're trying to allocate the smallest
886 * gang block size then we allow allocations on this metaslab group
887 * regardless of the mg_allocatable or throttle settings.
888 */
889 if (mg->mg_allocatable) {
890 metaslab_group_t *mgp;
891 int64_t qdepth;
892 uint64_t qmax = mg->mg_max_alloc_queue_depth;
893
894 if (!mc->mc_alloc_throttle_enabled)
895 return (B_TRUE);
896
897 /*
898 * If this metaslab group does not have any free space, then
899 * there is no point in looking further.
900 */
901 if (mg->mg_no_free_space)
902 return (B_FALSE);
903
904 qdepth = refcount_count(&mg->mg_alloc_queue_depth);
905
906 /*
907 * If this metaslab group is below its qmax or it's
908  * the only allocatable metaslab group, then attempt
909 * to allocate from it.
910 */
911 if (qdepth < qmax || mc->mc_alloc_groups == 1)
912 return (B_TRUE);
913 ASSERT3U(mc->mc_alloc_groups, >, 1);
914
915 /*
916 * Since this metaslab group is at or over its qmax, we
917 * need to determine if there are metaslab groups after this
918 * one that might be able to handle this allocation. This is
919 * racy since we can't hold the locks for all metaslab
920 * groups at the same time when we make this check.
921 */
922 for (mgp = mg->mg_next; mgp != rotor; mgp = mgp->mg_next) {
923 qmax = mgp->mg_max_alloc_queue_depth;
924
925 qdepth = refcount_count(&mgp->mg_alloc_queue_depth);
926
927 /*
928 * If there is another metaslab group that
929 * might be able to handle the allocation, then
930 * we return false so that we skip this group.
931 */
932 if (qdepth < qmax && !mgp->mg_no_free_space)
933 return (B_FALSE);
934 }
935
936 /*
937 * We didn't find another group to handle the allocation
938 * so we can't skip this metaslab group even though
939 * we are at or over our qmax.
940 */
941 return (B_TRUE);
942
943 } else if (mc->mc_alloc_groups == 0 || psize == SPA_MINBLOCKSIZE) {
944 return (B_TRUE);
945 }
946 return (B_FALSE);
947 }
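/*
 * Illustrative throttle scenario for the function above (hypothetical
 * depths): if this group's queue already holds qmax outstanding
 * allocations but a later group in the rotor is below its own qmax and
 * has free space, we return B_FALSE so the caller moves on to that
 * group instead.
 */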
948
949 /*
950 * ==========================================================================
951 * Range tree callbacks
952 * ==========================================================================
953 */
954
955 /*
956 * Comparison function for the private size-ordered tree. Tree is sorted
957 * by size, larger sizes at the end of the tree.
958 */
959 static int
960 metaslab_rangesize_compare(const void *x1, const void *x2)
961 {
962 const range_seg_t *r1 = x1;
963 const range_seg_t *r2 = x2;
964 uint64_t rs_size1 = r1->rs_end - r1->rs_start;
965 uint64_t rs_size2 = r2->rs_end - r2->rs_start;
966
967 int cmp = AVL_CMP(rs_size1, rs_size2);
968 if (likely(cmp))
969 return (cmp);
970
971 return (AVL_CMP(r1->rs_start, r2->rs_start));
972 }
973
974 /*
975 * Create any block allocator specific components. The current allocators
976 * rely on using both a size-ordered range_tree_t and an array of uint64_t's.
977 */
978 static void
979 metaslab_rt_create(range_tree_t *rt, void *arg)
980 {
981 metaslab_t *msp = arg;
982
983 ASSERT3P(rt->rt_arg, ==, msp);
984 ASSERT(msp->ms_tree == NULL);
985
986 avl_create(&msp->ms_size_tree, metaslab_rangesize_compare,
987 sizeof (range_seg_t), offsetof(range_seg_t, rs_pp_node));
988 }
989
990 /*
991 * Destroy the block allocator specific components.
992 */
993 static void
994 metaslab_rt_destroy(range_tree_t *rt, void *arg)
995 {
996 metaslab_t *msp = arg;
997
998 ASSERT3P(rt->rt_arg, ==, msp);
999 ASSERT3P(msp->ms_tree, ==, rt);
1000 ASSERT0(avl_numnodes(&msp->ms_size_tree));
1001
1002 avl_destroy(&msp->ms_size_tree);
1003 }
1004
1005 static void
1006 metaslab_rt_add(range_tree_t *rt, range_seg_t *rs, void *arg)
1007 {
1008 metaslab_t *msp = arg;
1009
1010 ASSERT3P(rt->rt_arg, ==, msp);
1011 ASSERT3P(msp->ms_tree, ==, rt);
1012 VERIFY(!msp->ms_condensing);
1013 avl_add(&msp->ms_size_tree, rs);
1014 }
1015
1016 static void
1017 metaslab_rt_remove(range_tree_t *rt, range_seg_t *rs, void *arg)
1018 {
1019 metaslab_t *msp = arg;
1020
1021 ASSERT3P(rt->rt_arg, ==, msp);
1022 ASSERT3P(msp->ms_tree, ==, rt);
1023 VERIFY(!msp->ms_condensing);
1024 avl_remove(&msp->ms_size_tree, rs);
1025 }
1026
1027 static void
1028 metaslab_rt_vacate(range_tree_t *rt, void *arg)
1029 {
1030 metaslab_t *msp = arg;
1031
1032 ASSERT3P(rt->rt_arg, ==, msp);
1033 ASSERT3P(msp->ms_tree, ==, rt);
1034
1035 /*
1036 * Normally one would walk the tree freeing nodes along the way.
1037 * Since the nodes are shared with the range trees we can avoid
1038 * walking all nodes and just reinitialize the avl tree. The nodes
1039 * will be freed by the range tree, so we don't want to free them here.
1040 */
1041 avl_create(&msp->ms_size_tree, metaslab_rangesize_compare,
1042 sizeof (range_seg_t), offsetof(range_seg_t, rs_pp_node));
1043 }
1044
1045 static range_tree_ops_t metaslab_rt_ops = {
1046 metaslab_rt_create,
1047 metaslab_rt_destroy,
1048 metaslab_rt_add,
1049 metaslab_rt_remove,
1050 metaslab_rt_vacate
1051 };
1052
1053 /*
1054 * ==========================================================================
1055 * Common allocator routines
1056 * ==========================================================================
1057 */
1058
1059 /*
1060 * Return the maximum contiguous segment within the metaslab.
1061 */
1062 uint64_t
1063 metaslab_block_maxsize(metaslab_t *msp)
1064 {
1065 avl_tree_t *t = &msp->ms_size_tree;
1066 range_seg_t *rs;
1067
1068 if (t == NULL || (rs = avl_last(t)) == NULL)
1069 return (0ULL);
1070
1071 return (rs->rs_end - rs->rs_start);
1072 }
1073
1074 static range_seg_t *
1075 metaslab_block_find(avl_tree_t *t, uint64_t start, uint64_t size)
1076 {
1077 range_seg_t *rs, rsearch;
1078 avl_index_t where;
1079
1080 rsearch.rs_start = start;
1081 rsearch.rs_end = start + size;
1082
1083 rs = avl_find(t, &rsearch, &where);
1084 if (rs == NULL) {
1085 rs = avl_nearest(t, where, AVL_AFTER);
1086 }
1087
1088 return (rs);
1089 }
1090
1091 #if defined(WITH_FF_BLOCK_ALLOCATOR) || \
1092 defined(WITH_DF_BLOCK_ALLOCATOR) || \
1093 defined(WITH_CF_BLOCK_ALLOCATOR)
1094 /*
1095 * This is a helper function that can be used by the allocator to find
1096 * a suitable block to allocate. This will search the specified AVL
1097 * tree looking for a block that matches the specified criteria.
1098 */
1099 static uint64_t
1100 metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size,
1101 uint64_t align)
1102 {
1103 range_seg_t *rs = metaslab_block_find(t, *cursor, size);
1104
1105 while (rs != NULL) {
1106 uint64_t offset = P2ROUNDUP(rs->rs_start, align);
1107
1108 if (offset + size <= rs->rs_end) {
1109 *cursor = offset + size;
1110 return (offset);
1111 }
1112 rs = AVL_NEXT(t, rs);
1113 }
1114
1115 /*
1116 * If we know we've searched the whole map (*cursor == 0), give up.
1117 * Otherwise, reset the cursor to the beginning and try again.
1118 */
1119 if (*cursor == 0)
1120 return (-1ULL);
1121
1122 *cursor = 0;
1123 return (metaslab_block_picker(t, cursor, size, align));
1124 }
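/*
 * Cursor walk-through (hypothetical offsets): with *cursor == 0x20000,
 * size == 0x2000 and align == 0x2000, the search starts at the first
 * segment containing or following the cursor. If that segment spans
 * [0x21000, 0x30000), the aligned offset 0x22000 fits, so 0x22000 is
 * returned and the cursor advances to 0x24000. When no segment past the
 * cursor fits, the cursor is reset to 0 and the map is searched once more.
 */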
1125 #endif /* WITH_FF/DF/CF_BLOCK_ALLOCATOR */
1126
1127 #if defined(WITH_FF_BLOCK_ALLOCATOR)
1128 /*
1129 * ==========================================================================
1130 * The first-fit block allocator
1131 * ==========================================================================
1132 */
1133 static uint64_t
1134 metaslab_ff_alloc(metaslab_t *msp, uint64_t size)
1135 {
1136 /*
1137 * Find the largest power of 2 block size that evenly divides the
1138 * requested size. This is used to try to allocate blocks with similar
1139 * alignment from the same area of the metaslab (i.e. same cursor
1140 	 * bucket), but it does not guarantee that allocations of other sizes
1141 	 * will not be present in the same region.
1142 */
1143 uint64_t align = size & -size;
1144 uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1];
1145 avl_tree_t *t = &msp->ms_tree->rt_root;
1146
1147 return (metaslab_block_picker(t, cursor, size, align));
1148 }
1149
1150 static metaslab_ops_t metaslab_ff_ops = {
1151 metaslab_ff_alloc
1152 };
1153
1154 metaslab_ops_t *zfs_metaslab_ops = &metaslab_ff_ops;
1155 #endif /* WITH_FF_BLOCK_ALLOCATOR */
1156
1157 #if defined(WITH_DF_BLOCK_ALLOCATOR)
1158 /*
1159 * ==========================================================================
1160 * Dynamic block allocator -
1161  * Uses the first fit allocation scheme until space gets low and then
1162 * adjusts to a best fit allocation method. Uses metaslab_df_alloc_threshold
1163 * and metaslab_df_free_pct to determine when to switch the allocation scheme.
1164 * ==========================================================================
1165 */
1166 static uint64_t
1167 metaslab_df_alloc(metaslab_t *msp, uint64_t size)
1168 {
1169 /*
1170 * Find the largest power of 2 block size that evenly divides the
1171 * requested size. This is used to try to allocate blocks with similar
1172 * alignment from the same area of the metaslab (i.e. same cursor
1173 	 * bucket), but it does not guarantee that allocations of other sizes
1174 	 * will not be present in the same region.
1175 */
1176 uint64_t align = size & -size;
1177 uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1];
1178 range_tree_t *rt = msp->ms_tree;
1179 avl_tree_t *t = &rt->rt_root;
1180 uint64_t max_size = metaslab_block_maxsize(msp);
1181 int free_pct = range_tree_space(rt) * 100 / msp->ms_size;
1182
1183 ASSERT(MUTEX_HELD(&msp->ms_lock));
1184 ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&msp->ms_size_tree));
1185
1186 if (max_size < size)
1187 return (-1ULL);
1188
1189 /*
1190 * If we're running low on space switch to using the size
1191 * sorted AVL tree (best-fit).
1192 */
1193 if (max_size < metaslab_df_alloc_threshold ||
1194 free_pct < metaslab_df_free_pct) {
1195 t = &msp->ms_size_tree;
1196 *cursor = 0;
1197 }
1198
1199 return (metaslab_block_picker(t, cursor, size, 1ULL));
1200 }
1201
1202 static metaslab_ops_t metaslab_df_ops = {
1203 metaslab_df_alloc
1204 };
1205
1206 metaslab_ops_t *zfs_metaslab_ops = &metaslab_df_ops;
1207 #endif /* WITH_DF_BLOCK_ALLOCATOR */
1208
1209 #if defined(WITH_CF_BLOCK_ALLOCATOR)
1210 /*
1211 * ==========================================================================
1212 * Cursor fit block allocator -
1213 * Select the largest region in the metaslab, set the cursor to the beginning
1214 * of the range and the cursor_end to the end of the range. As allocations
1215 * are made advance the cursor. Continue allocating from the cursor until
1216 * the range is exhausted and then find a new range.
1217 * ==========================================================================
1218 */
1219 static uint64_t
1220 metaslab_cf_alloc(metaslab_t *msp, uint64_t size)
1221 {
1222 range_tree_t *rt = msp->ms_tree;
1223 avl_tree_t *t = &msp->ms_size_tree;
1224 uint64_t *cursor = &msp->ms_lbas[0];
1225 uint64_t *cursor_end = &msp->ms_lbas[1];
1226 uint64_t offset = 0;
1227
1228 ASSERT(MUTEX_HELD(&msp->ms_lock));
1229 ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&rt->rt_root));
1230
1231 ASSERT3U(*cursor_end, >=, *cursor);
1232
1233 if ((*cursor + size) > *cursor_end) {
1234 range_seg_t *rs;
1235
1236 rs = avl_last(&msp->ms_size_tree);
1237 if (rs == NULL || (rs->rs_end - rs->rs_start) < size)
1238 return (-1ULL);
1239
1240 *cursor = rs->rs_start;
1241 *cursor_end = rs->rs_end;
1242 }
1243
1244 offset = *cursor;
1245 *cursor += size;
1246
1247 return (offset);
1248 }
1249
1250 static metaslab_ops_t metaslab_cf_ops = {
1251 metaslab_cf_alloc
1252 };
1253
1254 metaslab_ops_t *zfs_metaslab_ops = &metaslab_cf_ops;
1255 #endif /* WITH_CF_BLOCK_ALLOCATOR */
1256
1257 #if defined(WITH_NDF_BLOCK_ALLOCATOR)
1258 /*
1259 * ==========================================================================
1260 * New dynamic fit allocator -
1261 * Select a region that is large enough to allocate 2^metaslab_ndf_clump_shift
1262 * contiguous blocks. If no region is found then just use the largest segment
1263 * that remains.
1264 * ==========================================================================
1265 */
1266
1267 /*
1268 * Determines desired number of contiguous blocks (2^metaslab_ndf_clump_shift)
1269 * to request from the allocator.
1270 */
1271 uint64_t metaslab_ndf_clump_shift = 4;
1272
1273 static uint64_t
1274 metaslab_ndf_alloc(metaslab_t *msp, uint64_t size)
1275 {
1276 avl_tree_t *t = &msp->ms_tree->rt_root;
1277 avl_index_t where;
1278 range_seg_t *rs, rsearch;
1279 uint64_t hbit = highbit64(size);
1280 uint64_t *cursor = &msp->ms_lbas[hbit - 1];
1281 uint64_t max_size = metaslab_block_maxsize(msp);
1282
1283 ASSERT(MUTEX_HELD(&msp->ms_lock));
1284 ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&msp->ms_size_tree));
1285
1286 if (max_size < size)
1287 return (-1ULL);
1288
1289 rsearch.rs_start = *cursor;
1290 rsearch.rs_end = *cursor + size;
1291
1292 rs = avl_find(t, &rsearch, &where);
1293 if (rs == NULL || (rs->rs_end - rs->rs_start) < size) {
1294 t = &msp->ms_size_tree;
1295
1296 rsearch.rs_start = 0;
1297 rsearch.rs_end = MIN(max_size,
1298 1ULL << (hbit + metaslab_ndf_clump_shift));
1299 rs = avl_find(t, &rsearch, &where);
1300 if (rs == NULL)
1301 rs = avl_nearest(t, where, AVL_AFTER);
1302 ASSERT(rs != NULL);
1303 }
1304
1305 if ((rs->rs_end - rs->rs_start) >= size) {
1306 *cursor = rs->rs_start + size;
1307 return (rs->rs_start);
1308 }
1309 return (-1ULL);
1310 }
1311
1312 static metaslab_ops_t metaslab_ndf_ops = {
1313 metaslab_ndf_alloc
1314 };
1315
1316 metaslab_ops_t *zfs_metaslab_ops = &metaslab_ndf_ops;
1317 #endif /* WITH_NDF_BLOCK_ALLOCATOR */
1318
1319
1320 /*
1321 * ==========================================================================
1322 * Metaslabs
1323 * ==========================================================================
1324 */
1325
1326 /*
1327 * Wait for any in-progress metaslab loads to complete.
1328 */
1329 void
1330 metaslab_load_wait(metaslab_t *msp)
1331 {
1332 ASSERT(MUTEX_HELD(&msp->ms_lock));
1333
1334 while (msp->ms_loading) {
1335 ASSERT(!msp->ms_loaded);
1336 cv_wait(&msp->ms_load_cv, &msp->ms_lock);
1337 }
1338 }
1339
1340 int
1341 metaslab_load(metaslab_t *msp)
1342 {
1343 int error = 0;
1344 boolean_t success = B_FALSE;
1345
1346 ASSERT(MUTEX_HELD(&msp->ms_lock));
1347 ASSERT(!msp->ms_loaded);
1348 ASSERT(!msp->ms_loading);
1349
1350 msp->ms_loading = B_TRUE;
1351
1352 /*
1353 * If the space map has not been allocated yet, then treat
1354 * all the space in the metaslab as free and add it to the
1355 * ms_tree.
1356 */
1357 if (msp->ms_sm != NULL)
1358 error = space_map_load(msp->ms_sm, msp->ms_tree, SM_FREE);
1359 else
1360 range_tree_add(msp->ms_tree, msp->ms_start, msp->ms_size);
1361
1362 success = (error == 0);
1363 msp->ms_loading = B_FALSE;
1364
1365 if (success) {
1366 ASSERT3P(msp->ms_group, !=, NULL);
1367 msp->ms_loaded = B_TRUE;
1368
1369 for (int t = 0; t < TXG_DEFER_SIZE; t++) {
1370 range_tree_walk(msp->ms_defertree[t],
1371 range_tree_remove, msp->ms_tree);
1372 }
1373 msp->ms_max_size = metaslab_block_maxsize(msp);
1374 }
1375 cv_broadcast(&msp->ms_load_cv);
1376 return (error);
1377 }
1378
1379 void
1380 metaslab_unload(metaslab_t *msp)
1381 {
1382 ASSERT(MUTEX_HELD(&msp->ms_lock));
1383 range_tree_vacate(msp->ms_tree, NULL, NULL);
1384 msp->ms_loaded = B_FALSE;
1385 msp->ms_weight &= ~METASLAB_ACTIVE_MASK;
1386 msp->ms_max_size = 0;
1387 }
1388
1389 int
1390 metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg,
1391 metaslab_t **msp)
1392 {
1393 vdev_t *vd = mg->mg_vd;
1394 objset_t *mos = vd->vdev_spa->spa_meta_objset;
1395 metaslab_t *ms;
1396 int error;
1397
1398 ms = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP);
1399 mutex_init(&ms->ms_lock, NULL, MUTEX_DEFAULT, NULL);
1400 cv_init(&ms->ms_load_cv, NULL, CV_DEFAULT, NULL);
1401 ms->ms_id = id;
1402 ms->ms_start = id << vd->vdev_ms_shift;
1403 ms->ms_size = 1ULL << vd->vdev_ms_shift;
1404
1405 /*
1406 * We only open space map objects that already exist. All others
1407 * will be opened when we finally allocate an object for it.
1408 */
1409 if (object != 0) {
1410 error = space_map_open(&ms->ms_sm, mos, object, ms->ms_start,
1411 ms->ms_size, vd->vdev_ashift, &ms->ms_lock);
1412
1413 if (error != 0) {
1414 kmem_free(ms, sizeof (metaslab_t));
1415 return (error);
1416 }
1417
1418 ASSERT(ms->ms_sm != NULL);
1419 }
1420
1421 /*
1422 * We create the main range tree here, but we don't create the
1423 * other range trees until metaslab_sync_done(). This serves
1424 * two purposes: it allows metaslab_sync_done() to detect the
1425 * addition of new space; and for debugging, it ensures that we'd
1426 * data fault on any attempt to use this metaslab before it's ready.
1427 */
1428 ms->ms_tree = range_tree_create(&metaslab_rt_ops, ms, &ms->ms_lock);
1429 metaslab_group_add(mg, ms);
1430
1431 metaslab_set_fragmentation(ms);
1432
1433 /*
1434 * If we're opening an existing pool (txg == 0) or creating
1435 * a new one (txg == TXG_INITIAL), all space is available now.
1436 * If we're adding space to an existing pool, the new space
1437 * does not become available until after this txg has synced.
1438 * The metaslab's weight will also be initialized when we sync
1439 * out this txg. This ensures that we don't attempt to allocate
1440 * from it before we have initialized it completely.
1441 */
1442 if (txg <= TXG_INITIAL)
1443 metaslab_sync_done(ms, 0);
1444
1445 /*
1446 * If metaslab_debug_load is set and we're initializing a metaslab
1447  * that has an allocated space map object then load its space
1448  * map so that we can verify frees.
1449 */
1450 if (metaslab_debug_load && ms->ms_sm != NULL) {
1451 mutex_enter(&ms->ms_lock);
1452 VERIFY0(metaslab_load(ms));
1453 mutex_exit(&ms->ms_lock);
1454 }
1455
1456 if (txg != 0) {
1457 vdev_dirty(vd, 0, NULL, txg);
1458 vdev_dirty(vd, VDD_METASLAB, ms, txg);
1459 }
1460
1461 *msp = ms;
1462
1463 return (0);
1464 }
1465
1466 void
1467 metaslab_fini(metaslab_t *msp)
1468 {
1469 metaslab_group_t *mg = msp->ms_group;
1470
1471 metaslab_group_remove(mg, msp);
1472
1473 mutex_enter(&msp->ms_lock);
1474 VERIFY(msp->ms_group == NULL);
1475 vdev_space_update(mg->mg_vd, -space_map_allocated(msp->ms_sm),
1476 0, -msp->ms_size);
1477 space_map_close(msp->ms_sm);
1478
1479 metaslab_unload(msp);
1480 range_tree_destroy(msp->ms_tree);
1481 range_tree_destroy(msp->ms_freeingtree);
1482 range_tree_destroy(msp->ms_freedtree);
1483
1484 for (int t = 0; t < TXG_SIZE; t++) {
1485 range_tree_destroy(msp->ms_alloctree[t]);
1486 }
1487
1488 for (int t = 0; t < TXG_DEFER_SIZE; t++) {
1489 range_tree_destroy(msp->ms_defertree[t]);
1490 }
1491
1492 ASSERT0(msp->ms_deferspace);
1493
1494 mutex_exit(&msp->ms_lock);
1495 cv_destroy(&msp->ms_load_cv);
1496 mutex_destroy(&msp->ms_lock);
1497
1498 kmem_free(msp, sizeof (metaslab_t));
1499 }
1500
1501 #define FRAGMENTATION_TABLE_SIZE 17
1502
1503 /*
1504 * This table defines a segment size based fragmentation metric that will
1505 * allow each metaslab to derive its own fragmentation value. This is done
1506 * by calculating the space in each bucket of the spacemap histogram and
1507  * multiplying that by the fragmentation metric in this table. Doing
1508 * this for all buckets and dividing it by the total amount of free
1509 * space in this metaslab (i.e. the total free space in all buckets) gives
1510 * us the fragmentation metric. This means that a high fragmentation metric
1511 * equates to most of the free space being comprised of small segments.
1512 * Conversely, if the metric is low, then most of the free space is in
1513 * large segments. A 10% change in fragmentation equates to approximately
1514 * double the number of segments.
1515 *
1516 * This table defines 0% fragmented space using 16MB segments. Testing has
1517 * shown that segments that are greater than or equal to 16MB do not suffer
1518 * from drastic performance problems. Using this value, we derive the rest
1519 * of the table. Since the fragmentation value is never stored on disk, it
1520 * is possible to change these calculations in the future.
1521 */
1522 int zfs_frag_table[FRAGMENTATION_TABLE_SIZE] = {
1523 100, /* 512B */
1524 100, /* 1K */
1525 98, /* 2K */
1526 95, /* 4K */
1527 90, /* 8K */
1528 80, /* 16K */
1529 70, /* 32K */
1530 60, /* 64K */
1531 50, /* 128K */
1532 40, /* 256K */
1533 30, /* 512K */
1534 20, /* 1M */
1535 15, /* 2M */
1536 10, /* 4M */
1537 5, /* 8M */
1538 0 /* 16M */
1539 };
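/*
 * Worked example (hypothetical metaslab): if 60% of the free space sits in
 * 512K segments (factor 30) and the remaining 40% in 4M segments (factor
 * 10), metaslab_set_fragmentation() below computes
 * 0.6 * 30 + 0.4 * 10 = 22% fragmentation.
 */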
1540
1541 /*
1542  * Calculate the metaslab's fragmentation metric and store it in
1543  * ms_fragmentation. A value of ZFS_FRAG_INVALID means that the metaslab has
1544  * not been upgraded and does not support this metric. Otherwise, the value
1545  * should be in the range [0, 100].
1546 */
1547 static void
1548 metaslab_set_fragmentation(metaslab_t *msp)
1549 {
1550 spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
1551 uint64_t fragmentation = 0;
1552 uint64_t total = 0;
1553 boolean_t feature_enabled = spa_feature_is_enabled(spa,
1554 SPA_FEATURE_SPACEMAP_HISTOGRAM);
1555
1556 if (!feature_enabled) {
1557 msp->ms_fragmentation = ZFS_FRAG_INVALID;
1558 return;
1559 }
1560
1561 /*
1562 * A null space map means that the entire metaslab is free
1563 * and thus is not fragmented.
1564 */
1565 if (msp->ms_sm == NULL) {
1566 msp->ms_fragmentation = 0;
1567 return;
1568 }
1569
1570 /*
1571 * If this metaslab's space map has not been upgraded, flag it
1572 * so that we upgrade next time we encounter it.
1573 */
1574 if (msp->ms_sm->sm_dbuf->db_size != sizeof (space_map_phys_t)) {
1575 uint64_t txg = spa_syncing_txg(spa);
1576 vdev_t *vd = msp->ms_group->mg_vd;
1577
1578 /*
1579 * If we've reached the final dirty txg, then we must
1580 * be shutting down the pool. We don't want to dirty
1581 * any data past this point so skip setting the condense
1582 * flag. We can retry this action the next time the pool
1583 * is imported.
1584 */
1585 if (spa_writeable(spa) && txg < spa_final_dirty_txg(spa)) {
1586 msp->ms_condense_wanted = B_TRUE;
1587 vdev_dirty(vd, VDD_METASLAB, msp, txg + 1);
1588 spa_dbgmsg(spa, "txg %llu, requesting force condense: "
1589 "ms_id %llu, vdev_id %llu", txg, msp->ms_id,
1590 vd->vdev_id);
1591 }
1592 msp->ms_fragmentation = ZFS_FRAG_INVALID;
1593 return;
1594 }
1595
1596 for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
1597 uint64_t space = 0;
1598 uint8_t shift = msp->ms_sm->sm_shift;
1599
1600 int idx = MIN(shift - SPA_MINBLOCKSHIFT + i,
1601 FRAGMENTATION_TABLE_SIZE - 1);
1602
1603 if (msp->ms_sm->sm_phys->smp_histogram[i] == 0)
1604 continue;
1605
1606 space = msp->ms_sm->sm_phys->smp_histogram[i] << (i + shift);
1607 total += space;
1608
1609 ASSERT3U(idx, <, FRAGMENTATION_TABLE_SIZE);
1610 fragmentation += space * zfs_frag_table[idx];
1611 }
1612
1613 if (total > 0)
1614 fragmentation /= total;
1615 ASSERT3U(fragmentation, <=, 100);
1616
1617 msp->ms_fragmentation = fragmentation;
1618 }
1619
1620 /*
1621 * Compute a weight -- a selection preference value -- for the given metaslab.
1622 * This is based on the amount of free space, the level of fragmentation,
1623 * the LBA range, and whether the metaslab is loaded.
1624 */
1625 static uint64_t
1626 metaslab_space_weight(metaslab_t *msp)
1627 {
1628 metaslab_group_t *mg = msp->ms_group;
1629 vdev_t *vd = mg->mg_vd;
1630 uint64_t weight, space;
1631
1632 ASSERT(MUTEX_HELD(&msp->ms_lock));
1633 ASSERT(!vd->vdev_removing);
1634
1635 /*
1636 * The baseline weight is the metaslab's free space.
1637 */
1638 space = msp->ms_size - space_map_allocated(msp->ms_sm);
1639
1640 if (metaslab_fragmentation_factor_enabled &&
1641 msp->ms_fragmentation != ZFS_FRAG_INVALID) {
1642 /*
1643 * Use the fragmentation information to inversely scale
1644 * down the baseline weight. We need to ensure that we
1645 * don't exclude this metaslab completely when it's 100%
1646 * fragmented. To avoid this we reduce the fragmented value
1647 * by 1.
1648 */
1649 space = (space * (100 - (msp->ms_fragmentation - 1))) / 100;
1650
1651 /*
1652 * If space < SPA_MINBLOCKSIZE, then we will not allocate from
1653 * this metaslab again. The fragmentation metric may have
1654 * decreased the space to something smaller than
1655 * SPA_MINBLOCKSIZE, so reset the space to SPA_MINBLOCKSIZE
1656 * so that we can consume any remaining space.
1657 */
1658 if (space > 0 && space < SPA_MINBLOCKSIZE)
1659 space = SPA_MINBLOCKSIZE;
1660 }
1661 weight = space;
1662
1663 /*
1664 * Modern disks have uniform bit density and constant angular velocity.
1665 * Therefore, the outer recording zones are faster (higher bandwidth)
1666 * than the inner zones by the ratio of outer to inner track diameter,
1667 * which is typically around 2:1. We account for this by assigning
1668 * higher weight to lower metaslabs (multiplier ranging from 2x to 1x).
1669 * In effect, this means that we'll select the metaslab with the most
1670 * free bandwidth rather than simply the one with the most free space.
1671 */
1672 if (!vd->vdev_nonrot && metaslab_lba_weighting_enabled) {
1673 weight = 2 * weight - (msp->ms_id * weight) / vd->vdev_ms_count;
1674 ASSERT(weight >= space && weight <= 2 * space);
1675 }
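	/*
	 * For example (hypothetical vdev with 200 metaslabs): the LBA
	 * weighting above gives metaslab 0 a weight of 2x its free space,
	 * metaslab 100 a weight of 1.5x, and metaslab 199 roughly 1.005x.
	 */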
1676
1677 /*
1678 * If this metaslab is one we're actively using, adjust its
1679 * weight to make it preferable to any inactive metaslab so
1680 * we'll polish it off. If the fragmentation on this metaslab
1681  * has exceeded our threshold, then don't mark it active.
1682 */
1683 if (msp->ms_loaded && msp->ms_fragmentation != ZFS_FRAG_INVALID &&
1684 msp->ms_fragmentation <= zfs_metaslab_fragmentation_threshold) {
1685 weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK);
1686 }
1687
1688 WEIGHT_SET_SPACEBASED(weight);
1689 return (weight);
1690 }
1691
1692 /*
1693 * Return the weight of the specified metaslab, according to the segment-based
1694 * weighting algorithm. The metaslab must be loaded. This function can
1695 * be called within a sync pass since it relies only on the metaslab's
1696 * range tree which is always accurate when the metaslab is loaded.
1697 */
1698 static uint64_t
1699 metaslab_weight_from_range_tree(metaslab_t *msp)
1700 {
1701 uint64_t weight = 0;
1702 uint32_t segments = 0;
1703
1704 ASSERT(msp->ms_loaded);
1705
1706 for (int i = RANGE_TREE_HISTOGRAM_SIZE - 1; i >= SPA_MINBLOCKSHIFT;
1707 i--) {
1708 uint8_t shift = msp->ms_group->mg_vd->vdev_ashift;
1709 int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1;
1710
1711 segments <<= 1;
1712 segments += msp->ms_tree->rt_histogram[i];
1713
1714 /*
1715 * The range tree provides more precision than the space map
1716 * and must be downgraded so that all values fit within the
1717 * space map's histogram. This allows us to compare loaded
1718 * vs. unloaded metaslabs to determine which metaslab is
1719 * considered "best".
1720 */
1721 if (i > max_idx)
1722 continue;
1723
1724 if (segments != 0) {
1725 WEIGHT_SET_COUNT(weight, segments);
1726 WEIGHT_SET_INDEX(weight, i);
1727 WEIGHT_SET_ACTIVE(weight, 0);
1728 break;
1729 }
1730 }
1731 return (weight);
1732 }
1733
1734 /*
1735 * Calculate the weight based on the on-disk histogram. This should only
1736 * be called after a sync pass has completely finished since the on-disk
1737 * information is updated in metaslab_sync().
1738 */
1739 static uint64_t
1740 metaslab_weight_from_spacemap(metaslab_t *msp)
1741 {
1742 uint64_t weight = 0;
1743
1744 for (int i = SPACE_MAP_HISTOGRAM_SIZE - 1; i >= 0; i--) {
1745 if (msp->ms_sm->sm_phys->smp_histogram[i] != 0) {
1746 WEIGHT_SET_COUNT(weight,
1747 msp->ms_sm->sm_phys->smp_histogram[i]);
1748 WEIGHT_SET_INDEX(weight, i +
1749 msp->ms_sm->sm_shift);
1750 WEIGHT_SET_ACTIVE(weight, 0);
1751 break;
1752 }
1753 }
1754 return (weight);
1755 }
1756
1757 /*
1758 * Compute a segment-based weight for the specified metaslab. The weight
1759 * is determined by highest bucket in the histogram. The information
1760 * for the highest bucket is encoded into the weight value.
1761 */
1762 static uint64_t
1763 metaslab_segment_weight(metaslab_t *msp)
1764 {
1765 metaslab_group_t *mg = msp->ms_group;
1766 uint64_t weight = 0;
1767 uint8_t shift = mg->mg_vd->vdev_ashift;
1768
1769 ASSERT(MUTEX_HELD(&msp->ms_lock));
1770
1771 /*
1772 * The metaslab is completely free.
1773 */
1774 if (space_map_allocated(msp->ms_sm) == 0) {
1775 int idx = highbit64(msp->ms_size) - 1;
1776 int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1;
1777
1778 if (idx < max_idx) {
1779 WEIGHT_SET_COUNT(weight, 1ULL);
1780 WEIGHT_SET_INDEX(weight, idx);
1781 } else {
1782 WEIGHT_SET_COUNT(weight, 1ULL << (idx - max_idx));
1783 WEIGHT_SET_INDEX(weight, max_idx);
1784 }
1785 WEIGHT_SET_ACTIVE(weight, 0);
1786 ASSERT(!WEIGHT_IS_SPACEBASED(weight));
1787
1788 return (weight);
1789 }
1790
1791 ASSERT3U(msp->ms_sm->sm_dbuf->db_size, ==, sizeof (space_map_phys_t));
1792
1793 /*
1794 * If the metaslab is fully allocated then just make the weight 0.
1795 */
1796 if (space_map_allocated(msp->ms_sm) == msp->ms_size)
1797 return (0);
1798 /*
1799 * If the metaslab is already loaded, then use the range tree to
1800 * determine the weight. Otherwise, we rely on the space map information
1801 * to generate the weight.
1802 */
1803 if (msp->ms_loaded) {
1804 weight = metaslab_weight_from_range_tree(msp);
1805 } else {
1806 weight = metaslab_weight_from_spacemap(msp);
1807 }
1808
1809 /*
1810 * If the metaslab was active the last time we calculated its weight
1811 * then keep it active. We want to consume the entire region that
1812 * is associated with this weight.
1813 */
1814 if (msp->ms_activation_weight != 0 && weight != 0)
1815 WEIGHT_SET_ACTIVE(weight, WEIGHT_GET_ACTIVE(msp->ms_weight));
1816 return (weight);
1817 }
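/*
 * Worked example for the completely-free case above (hypothetical
 * sizes): with ms_size = 8G, highbit64() gives 34 so idx = 33.  If
 * max_idx is, say, 40, then idx < max_idx and the weight simply
 * records one segment at index 33 (COUNT = 1, INDEX = 33).  Only when
 * idx would exceed max_idx is the excess folded into the count, e.g.
 * idx = 43 with max_idx = 40 yields COUNT = 1 << 3 = 8, INDEX = 40.
 */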
1818
1819 /*
1820 * Determine if we should attempt to allocate from this metaslab. If the
1821 * metaslab has a maximum size then we can quickly determine if the desired
1822 * allocation size can be satisfied. Otherwise, if we're using segment-based
1823 * weighting then we can determine the maximum allocation that this metaslab
1824 * can accommodate based on the index encoded in the weight. If we're using
1825 * space-based weights then rely on the entire weight (excluding the weight
1826 * type bit).
1827 */
1828 boolean_t
1829 metaslab_should_allocate(metaslab_t *msp, uint64_t asize)
1830 {
1831 boolean_t should_allocate;
1832
1833 if (msp->ms_max_size != 0)
1834 return (msp->ms_max_size >= asize);
1835
1836 if (!WEIGHT_IS_SPACEBASED(msp->ms_weight)) {
1837 /*
1838 * The metaslab segment weight indicates segments in the
1839 * range [2^i, 2^(i+1)), where i is the index in the weight.
1840 * Since the asize might be in the middle of the range, we
1841 * should attempt the allocation if asize < 2^(i+1).
1842 */
1843 should_allocate = (asize <
1844 1ULL << (WEIGHT_GET_INDEX(msp->ms_weight) + 1));
1845 } else {
1846 should_allocate = (asize <=
1847 (msp->ms_weight & ~METASLAB_WEIGHT_TYPE));
1848 }
1849 return (should_allocate);
1850 }
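/*
 * Example of the decision above (hypothetical weights): a segment-based
 * weight with INDEX = 13 advertises free segments in [8K, 16K), so an
 * asize of 12K is worth attempting (12K < 1 << 14) while 20K is not.
 * A space-based weight, once the type bit is masked off, is simply the
 * amount of free space, so an asize is attempted iff it is <= that
 * amount.  When ms_max_size is nonzero it short-cuts both checks.
 */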
1851 static uint64_t
1852 metaslab_weight(metaslab_t *msp)
1853 {
1854 vdev_t *vd = msp->ms_group->mg_vd;
1855 spa_t *spa = vd->vdev_spa;
1856 uint64_t weight;
1857
1858 ASSERT(MUTEX_HELD(&msp->ms_lock));
1859
1860 /*
1861 * This vdev is in the process of being removed so there is nothing
1862 * for us to do here.
1863 */
1864 if (vd->vdev_removing) {
1865 ASSERT0(space_map_allocated(msp->ms_sm));
1866 ASSERT0(vd->vdev_ms_shift);
1867 return (0);
1868 }
1869
1870 metaslab_set_fragmentation(msp);
1871
1872 /*
1873 * Update the maximum size if the metaslab is loaded. This will
1874 * ensure that we get an accurate maximum size if newly freed space
1875 * has been added back into the free tree.
1876 */
1877 if (msp->ms_loaded)
1878 msp->ms_max_size = metaslab_block_maxsize(msp);
1879
1880 /*
1881 * Segment-based weighting requires space map histogram support.
1882 */
1883 if (zfs_metaslab_segment_weight_enabled &&
1884 spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM) &&
1885 (msp->ms_sm == NULL || msp->ms_sm->sm_dbuf->db_size ==
1886 sizeof (space_map_phys_t))) {
1887 weight = metaslab_segment_weight(msp);
1888 } else {
1889 weight = metaslab_space_weight(msp);
1890 }
1891 return (weight);
1892 }
1893
1894 static int
1895 metaslab_activate(metaslab_t *msp, uint64_t activation_weight)
1896 {
1897 ASSERT(MUTEX_HELD(&msp->ms_lock));
1898
1899 if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) {
1900 metaslab_load_wait(msp);
1901 if (!msp->ms_loaded) {
1902 int error = metaslab_load(msp);
1903 if (error) {
1904 metaslab_group_sort(msp->ms_group, msp, 0);
1905 return (error);
1906 }
1907 }
1908
1909 msp->ms_activation_weight = msp->ms_weight;
1910 metaslab_group_sort(msp->ms_group, msp,
1911 msp->ms_weight | activation_weight);
1912 }
1913 ASSERT(msp->ms_loaded);
1914 ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);
1915
1916 return (0);
1917 }
1918
1919 static void
1920 metaslab_passivate(metaslab_t *msp, uint64_t weight)
1921 {
1922 ASSERTV(uint64_t size = weight & ~METASLAB_WEIGHT_TYPE);
1923
1924 /*
1925 * If size < SPA_MINBLOCKSIZE, then we will not allocate from
1926 * this metaslab again. In that case, it had better be empty,
1927 * or we would be leaving space on the table.
1928 */
1929 ASSERT(!WEIGHT_IS_SPACEBASED(msp->ms_weight) ||
1930 size >= SPA_MINBLOCKSIZE ||
1931 range_tree_space(msp->ms_tree) == 0);
1932 ASSERT0(weight & METASLAB_ACTIVE_MASK);
1933
1934 msp->ms_activation_weight = 0;
1935 metaslab_group_sort(msp->ms_group, msp, weight);
1936 ASSERT((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0);
1937 }
1938
1939 /*
1940 * Segment-based metaslabs are activated once and remain active until
1941 * we either fail an allocation attempt (similar to space-based metaslabs)
1942 * or have exhausted the free space in zfs_metaslab_switch_threshold
1943 * buckets since the metaslab was activated. This function checks to see
1944 * if we've exhausted the zfs_metaslab_switch_threshold buckets in the
1945 * metaslab and passivates it proactively. This will allow us to select a
1946 * metaslab with a larger contiguous region, if any, remaining within this
1947 * metaslab group. If we're in sync pass > 1, then we continue using this
1948 * metaslab so that we don't dirty more blocks and cause more sync passes.
1949 */
1950 void
1951 metaslab_segment_may_passivate(metaslab_t *msp)
1952 {
1953 spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
1954
1955 if (WEIGHT_IS_SPACEBASED(msp->ms_weight) || spa_sync_pass(spa) > 1)
1956 return;
1957
1958 /*
1959 * Since we are in the middle of a sync pass, the most accurate
1960 * information that is accessible to us is the in-core range tree
1961 * histogram; calculate the new weight based on that information.
1962 */
1963 uint64_t weight = metaslab_weight_from_range_tree(msp);
1964 int activation_idx = WEIGHT_GET_INDEX(msp->ms_activation_weight);
1965 int current_idx = WEIGHT_GET_INDEX(weight);
1966
1967 if (current_idx <= activation_idx - zfs_metaslab_switch_threshold)
1968 metaslab_passivate(msp, weight);
1969 }
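/*
 * Example of the switch check above (threshold value is hypothetical):
 * with zfs_metaslab_switch_threshold = 2 and an activation weight whose
 * index was 20 (1M segments), a freshly computed range-tree weight with
 * index 18 satisfies 18 <= 20 - 2 and the metaslab is passivated; an
 * index of 19 would keep it active and allocating.
 */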
1970
1971 static void
1972 metaslab_preload(void *arg)
1973 {
1974 metaslab_t *msp = arg;
1975 spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
1976 fstrans_cookie_t cookie = spl_fstrans_mark();
1977
1978 ASSERT(!MUTEX_HELD(&msp->ms_group->mg_lock));
1979
1980 mutex_enter(&msp->ms_lock);
1981 metaslab_load_wait(msp);
1982 if (!msp->ms_loaded)
1983 (void) metaslab_load(msp);
1984 msp->ms_selected_txg = spa_syncing_txg(spa);
1985 mutex_exit(&msp->ms_lock);
1986 spl_fstrans_unmark(cookie);
1987 }
1988
1989 static void
1990 metaslab_group_preload(metaslab_group_t *mg)
1991 {
1992 spa_t *spa = mg->mg_vd->vdev_spa;
1993 metaslab_t *msp;
1994 avl_tree_t *t = &mg->mg_metaslab_tree;
1995 int m = 0;
1996
1997 if (spa_shutting_down(spa) || !metaslab_preload_enabled) {
1998 taskq_wait_outstanding(mg->mg_taskq, 0);
1999 return;
2000 }
2001
2002 mutex_enter(&mg->mg_lock);
2003 /*
2004 * Load the next potential metaslabs
2005 */
2006 for (msp = avl_first(t); msp != NULL; msp = AVL_NEXT(t, msp)) {
2007 /*
2008 * We preload only the maximum number of metaslabs specified
2009 * by metaslab_preload_limit. If a metaslab is being forced
2010 * to condense then we preload it too. This will ensure
2011 * that force condensing happens in the next txg.
2012 */
2013 if (++m > metaslab_preload_limit && !msp->ms_condense_wanted) {
2014 continue;
2015 }
2016
2017 VERIFY(taskq_dispatch(mg->mg_taskq, metaslab_preload,
2018 msp, TQ_SLEEP) != TASKQID_INVALID);
2019 }
2020 mutex_exit(&mg->mg_lock);
2021 }
2022
2023 /*
2024 * Determine if the space map's on-disk footprint is past our tolerance
2025 * for inefficiency. We would like to use the following criteria to make
2026 * our decision:
2027 *
2028 * 1. The size of the space map object should not dramatically increase as a
2029 * result of writing out the free space range tree.
2030 *
2031 * 2. The minimal on-disk space map representation is zfs_condense_pct/100
2032 * times the size of the free space range tree representation
2033 * (i.e. zfs_condense_pct = 110 and in-core = 1MB, minimal = 1.1MB).
2034 *
2035 * 3. The on-disk size of the space map should actually decrease.
2036 *
2037 * Checking the first condition is tricky since we don't want to walk
2038 * the entire AVL tree calculating the estimated on-disk size. Instead we
2039 * use the size-ordered range tree in the metaslab and calculate the
2040 * size required to write out the largest segment in our free tree. If the
2041 * size required to represent that segment on disk is larger than the space
2042 * map object then we avoid condensing this map.
2043 *
2044 * To determine the second criterion we use a best-case estimate and assume
2045 * each segment can be represented on-disk as a single 64-bit entry. We refer
2046 * to this best-case estimate as the space map's minimal form.
2047 *
2048 * Unfortunately, we cannot compute the on-disk size of the space map in this
2049 * context because we cannot accurately compute the effects of compression, etc.
2050 * Instead, we apply the heuristic described in the block comment for
2051 * zfs_metaslab_condense_block_threshold - we only condense if the space used
2052 * is greater than a threshold number of blocks.
2053 */
2054 static boolean_t
2055 metaslab_should_condense(metaslab_t *msp)
2056 {
2057 space_map_t *sm = msp->ms_sm;
2058 range_seg_t *rs;
2059 uint64_t size, entries, segsz, object_size, optimal_size, record_size;
2060 dmu_object_info_t doi;
2061 uint64_t vdev_blocksize = 1ULL << msp->ms_group->mg_vd->vdev_ashift;
2062
2063 ASSERT(MUTEX_HELD(&msp->ms_lock));
2064 ASSERT(msp->ms_loaded);
2065
2066 /*
2067 * Use the ms_size_tree range tree, which is ordered by size, to
2068 * obtain the largest segment in the free tree. We always condense
2069 * metaslabs that are empty and metaslabs for which a condense
2070 * request has been made.
2071 */
2072 rs = avl_last(&msp->ms_size_tree);
2073 if (rs == NULL || msp->ms_condense_wanted)
2074 return (B_TRUE);
2075
2076 /*
2077 * Calculate the number of 64-bit entries this segment would
2078 * require when written to disk. If this single segment would be
2079 * larger on-disk than the entire current on-disk structure, then
2080 * clearly condensing will increase the on-disk structure size.
2081 */
2082 size = (rs->rs_end - rs->rs_start) >> sm->sm_shift;
2083 entries = size / (MIN(size, SM_RUN_MAX));
2084 segsz = entries * sizeof (uint64_t);
2085
2086 optimal_size = sizeof (uint64_t) * avl_numnodes(&msp->ms_tree->rt_root);
2087 object_size = space_map_length(msp->ms_sm);
2088
2089 dmu_object_info_from_db(sm->sm_dbuf, &doi);
2090 record_size = MAX(doi.doi_data_block_size, vdev_blocksize);
2091
2092 return (segsz <= object_size &&
2093 object_size >= (optimal_size * zfs_condense_pct / 100) &&
2094 object_size > zfs_metaslab_condense_block_threshold * record_size);
2095 }
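/*
 * Rough numeric sketch of the test above (all values hypothetical):
 * suppose the largest free segment spans 2048 sm_shift-sized units, so
 * it fits in a handful of 8-byte entries and segsz is tiny compared to
 * the on-disk object.  If the free tree holds 10,000 segments, the
 * minimal form is optimal_size = 80,000 bytes; with zfs_condense_pct at
 * its usual 200 the current object must already be >= 160,000 bytes,
 * and with zfs_metaslab_condense_block_threshold = 4 and a 4K record
 * size it must also exceed 16K, before we bother condensing.
 */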
2096
2097 /*
2098 * Condense the on-disk space map representation to its minimized form.
2099 * The minimized form consists of a small number of allocations followed by
2100 * the entries of the free range tree.
2101 */
2102 static void
2103 metaslab_condense(metaslab_t *msp, uint64_t txg, dmu_tx_t *tx)
2104 {
2105 spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
2106 range_tree_t *condense_tree;
2107 space_map_t *sm = msp->ms_sm;
2108
2109 ASSERT(MUTEX_HELD(&msp->ms_lock));
2110 ASSERT3U(spa_sync_pass(spa), ==, 1);
2111 ASSERT(msp->ms_loaded);
2112
2113
2114 spa_dbgmsg(spa, "condensing: txg %llu, msp[%llu] %p, vdev id %llu, "
2115 "spa %s, smp size %llu, segments %lu, forcing condense=%s", txg,
2116 msp->ms_id, msp, msp->ms_group->mg_vd->vdev_id,
2117 msp->ms_group->mg_vd->vdev_spa->spa_name,
2118 space_map_length(msp->ms_sm), avl_numnodes(&msp->ms_tree->rt_root),
2119 msp->ms_condense_wanted ? "TRUE" : "FALSE");
2120
2121 msp->ms_condense_wanted = B_FALSE;
2122
2123 /*
2124 * Create a range tree that is 100% allocated. We remove segments
2125 * that have been freed in this txg, any deferred frees that exist,
2126 * and any allocation in the future. Removing segments should be
2127 * a relatively inexpensive operation since we expect these trees to
2128 * have a small number of nodes.
2129 */
2130 condense_tree = range_tree_create(NULL, NULL, &msp->ms_lock);
2131 range_tree_add(condense_tree, msp->ms_start, msp->ms_size);
2132
2133 /*
2134 * Remove what's been freed in this txg from the condense_tree.
2135 * Since we're in sync_pass 1, we know that all the frees from
2136 * this txg are in the freeingtree.
2137 */
2138 range_tree_walk(msp->ms_freeingtree, range_tree_remove, condense_tree);
2139
2140 for (int t = 0; t < TXG_DEFER_SIZE; t++) {
2141 range_tree_walk(msp->ms_defertree[t],
2142 range_tree_remove, condense_tree);
2143 }
2144
2145 for (int t = 1; t < TXG_CONCURRENT_STATES; t++) {
2146 range_tree_walk(msp->ms_alloctree[(txg + t) & TXG_MASK],
2147 range_tree_remove, condense_tree);
2148 }
2149
2150 /*
2151 * We're about to drop the metaslab's lock thus allowing
2152 * other consumers to change its content. Set the
2153 * metaslab's ms_condensing flag to ensure that
2154 * allocations on this metaslab do not occur while we're
2155 * in the middle of committing it to disk. This is only critical
2156 * for the ms_tree as all other range trees use per txg
2157 * views of their content.
2158 */
2159 msp->ms_condensing = B_TRUE;
2160
2161 mutex_exit(&msp->ms_lock);
2162 space_map_truncate(sm, tx);
2163 mutex_enter(&msp->ms_lock);
2164
2165 /*
2166 * While we would ideally like to create a space map representation
2167 * that consists only of allocation records, doing so can be
2168 * prohibitively expensive because the in-core free tree can be
2169 * large, and therefore computationally expensive to subtract
2170 * from the condense_tree. Instead we sync out two trees, a cheap
2171 * allocation only tree followed by the in-core free tree. While not
2172 * optimal, this is typically close to optimal, and much cheaper to
2173 * compute.
2174 */
2175 space_map_write(sm, condense_tree, SM_ALLOC, tx);
2176 range_tree_vacate(condense_tree, NULL, NULL);
2177 range_tree_destroy(condense_tree);
2178
2179 space_map_write(sm, msp->ms_tree, SM_FREE, tx);
2180 msp->ms_condensing = B_FALSE;
2181 }
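/*
 * Conceptual sketch of what the condensed object ends up containing
 * (not a literal dump of the record stream): a run of ALLOC entries
 * covering condense_tree -- the whole metaslab minus this txg's frees,
 * the defer trees and future txgs' alloc trees -- followed by FREE
 * entries for every segment in the in-core ms_tree.  Replaying ALLOC
 * then FREE describes the same allocated space as the uncondensed map
 * did, in far fewer records.
 */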
2182
2183 /*
2184 * Write a metaslab to disk in the context of the specified transaction group.
2185 */
2186 void
2187 metaslab_sync(metaslab_t *msp, uint64_t txg)
2188 {
2189 metaslab_group_t *mg = msp->ms_group;
2190 vdev_t *vd = mg->mg_vd;
2191 spa_t *spa = vd->vdev_spa;
2192 objset_t *mos = spa_meta_objset(spa);
2193 range_tree_t *alloctree = msp->ms_alloctree[txg & TXG_MASK];
2194 dmu_tx_t *tx;
2195 uint64_t object = space_map_object(msp->ms_sm);
2196
2197 ASSERT(!vd->vdev_ishole);
2198
2199 /*
2200 * This metaslab has just been added so there's no work to do now.
2201 */
2202 if (msp->ms_freeingtree == NULL) {
2203 ASSERT3P(alloctree, ==, NULL);
2204 return;
2205 }
2206
2207 ASSERT3P(alloctree, !=, NULL);
2208 ASSERT3P(msp->ms_freeingtree, !=, NULL);
2209 ASSERT3P(msp->ms_freedtree, !=, NULL);
2210
2211 /*
2212 * Normally, we don't want to process a metaslab if there
2213 * are no allocations or frees to perform. However, if the metaslab
2214 * is being forced to condense and it's loaded, we need to let it
2215 * through.
2216 */
2217 if (range_tree_space(alloctree) == 0 &&
2218 range_tree_space(msp->ms_freeingtree) == 0 &&
2219 !(msp->ms_loaded && msp->ms_condense_wanted))
2220 return;
2221
2222
2223 VERIFY(txg <= spa_final_dirty_txg(spa));
2224
2225 /*
2226 * The only state that can actually be changing concurrently with
2227 * metaslab_sync() is the metaslab's ms_tree. No other thread can
2228 * be modifying this txg's alloctree, freeingtree, freedtree, or
2229 * space_map_phys_t. Therefore, we only hold ms_lock to satisfy
2230 * space map ASSERTs. We drop it whenever we call into the DMU,
2231 * because the DMU can call down to us (e.g. via zio_free()) at
2232 * any time.
2233 */
2234
2235 tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
2236
2237 if (msp->ms_sm == NULL) {
2238 uint64_t new_object;
2239
2240 new_object = space_map_alloc(mos, tx);
2241 VERIFY3U(new_object, !=, 0);
2242
2243 VERIFY0(space_map_open(&msp->ms_sm, mos, new_object,
2244 msp->ms_start, msp->ms_size, vd->vdev_ashift,
2245 &msp->ms_lock));
2246 ASSERT(msp->ms_sm != NULL);
2247 }
2248
2249 mutex_enter(&msp->ms_lock);
2250
2251 /*
2252 * Note: metaslab_condense() clears the space map's histogram.
2253 * Therefore we must verify and remove this histogram before
2254 * condensing.
2255 */
2256 metaslab_group_histogram_verify(mg);
2257 metaslab_class_histogram_verify(mg->mg_class);
2258 metaslab_group_histogram_remove(mg, msp);
2259
2260 if (msp->ms_loaded && spa_sync_pass(spa) == 1 &&
2261 metaslab_should_condense(msp)) {
2262 metaslab_condense(msp, txg, tx);
2263 } else {
2264 space_map_write(msp->ms_sm, alloctree, SM_ALLOC, tx);
2265 space_map_write(msp->ms_sm, msp->ms_freeingtree, SM_FREE, tx);
2266 }
2267
2268 if (msp->ms_loaded) {
2269 /*
2270 * When the metaslab is loaded, we have an accurate
2271 * histogram in the range tree. This gives us an opportunity
2272 * to bring the space map's histogram up-to-date so we clear
2273 * it first before updating it.
2274 */
2275 space_map_histogram_clear(msp->ms_sm);
2276 space_map_histogram_add(msp->ms_sm, msp->ms_tree, tx);
2277
2278 /*
2279 * Since we've cleared the histogram we need to add back
2280 * any free space that has already been processed, plus
2281 * any deferred space. This allows the on-disk histogram
2282 * to accurately reflect all free space even if some space
2283 * is not yet available for allocation (i.e. deferred).
2284 */
2285 space_map_histogram_add(msp->ms_sm, msp->ms_freedtree, tx);
2286
2287 /*
2288 * Add back any deferred free space that has not been
2289 * added back into the in-core free tree yet. This will
2290 * ensure that we don't end up with a space map histogram
2291 * that is completely empty unless the metaslab is fully
2292 * allocated.
2293 */
2294 for (int t = 0; t < TXG_DEFER_SIZE; t++) {
2295 space_map_histogram_add(msp->ms_sm,
2296 msp->ms_defertree[t], tx);
2297 }
2298 }
2299
2300 /*
2301 * Always add the free space from this sync pass to the space
2302 * map histogram. We want to make sure that the on-disk histogram
2303 * accounts for all free space. If the space map is not loaded,
2304 * then we will lose some accuracy but will correct it the next
2305 * time we load the space map.
2306 */
2307 space_map_histogram_add(msp->ms_sm, msp->ms_freeingtree, tx);
2308
2309 metaslab_group_histogram_add(mg, msp);
2310 metaslab_group_histogram_verify(mg);
2311 metaslab_class_histogram_verify(mg->mg_class);
2312
2313 /*
2314 * For sync pass 1, we avoid traversing this txg's free range tree
2315 * and instead will just swap the pointers for freeingtree and
2316 * freedtree. We can safely do this since the freedtree is
2317 * guaranteed to be empty on the initial pass.
2318 */
2319 if (spa_sync_pass(spa) == 1) {
2320 range_tree_swap(&msp->ms_freeingtree, &msp->ms_freedtree);
2321 } else {
2322 range_tree_vacate(msp->ms_freeingtree,
2323 range_tree_add, msp->ms_freedtree);
2324 }
2325 range_tree_vacate(alloctree, NULL, NULL);
2326
2327 ASSERT0(range_tree_space(msp->ms_alloctree[txg & TXG_MASK]));
2328 ASSERT0(range_tree_space(msp->ms_alloctree[TXG_CLEAN(txg) & TXG_MASK]));
2329 ASSERT0(range_tree_space(msp->ms_freeingtree));
2330
2331 mutex_exit(&msp->ms_lock);
2332
2333 if (object != space_map_object(msp->ms_sm)) {
2334 object = space_map_object(msp->ms_sm);
2335 dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) *
2336 msp->ms_id, sizeof (uint64_t), &object, tx);
2337 }
2338 dmu_tx_commit(tx);
2339 }
2340
2341 /*
2342 * Called after a transaction group has completely synced to mark
2343 * all of the metaslab's free space as usable.
2344 */
2345 void
2346 metaslab_sync_done(metaslab_t *msp, uint64_t txg)
2347 {
2348 metaslab_group_t *mg = msp->ms_group;
2349 vdev_t *vd = mg->mg_vd;
2350 spa_t *spa = vd->vdev_spa;
2351 range_tree_t **defer_tree;
2352 int64_t alloc_delta, defer_delta;
2353 boolean_t defer_allowed = B_TRUE;
2354
2355 ASSERT(!vd->vdev_ishole);
2356
2357 mutex_enter(&msp->ms_lock);
2358
2359 /*
2360 * If this metaslab is just becoming available, initialize its
2361 * range trees and add its capacity to the vdev.
2362 */
2363 if (msp->ms_freedtree == NULL) {
2364 for (int t = 0; t < TXG_SIZE; t++) {
2365 ASSERT(msp->ms_alloctree[t] == NULL);
2366
2367 msp->ms_alloctree[t] = range_tree_create(NULL, msp,
2368 &msp->ms_lock);
2369 }
2370
2371 ASSERT3P(msp->ms_freeingtree, ==, NULL);
2372 msp->ms_freeingtree = range_tree_create(NULL, msp,
2373 &msp->ms_lock);
2374
2375 ASSERT3P(msp->ms_freedtree, ==, NULL);
2376 msp->ms_freedtree = range_tree_create(NULL, msp,
2377 &msp->ms_lock);
2378
2379 for (int t = 0; t < TXG_DEFER_SIZE; t++) {
2380 ASSERT(msp->ms_defertree[t] == NULL);
2381
2382 msp->ms_defertree[t] = range_tree_create(NULL, msp,
2383 &msp->ms_lock);
2384 }
2385
2386 vdev_space_update(vd, 0, 0, msp->ms_size);
2387 }
2388
2389 defer_tree = &msp->ms_defertree[txg % TXG_DEFER_SIZE];
2390
2391 uint64_t free_space = metaslab_class_get_space(spa_normal_class(spa)) -
2392 metaslab_class_get_alloc(spa_normal_class(spa));
2393 if (free_space <= spa_get_slop_space(spa)) {
2394 defer_allowed = B_FALSE;
2395 }
2396
2397 defer_delta = 0;
2398 alloc_delta = space_map_alloc_delta(msp->ms_sm);
2399 if (defer_allowed) {
2400 defer_delta = range_tree_space(msp->ms_freedtree) -
2401 range_tree_space(*defer_tree);
2402 } else {
2403 defer_delta -= range_tree_space(*defer_tree);
2404 }
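/*
 * Example of the bookkeeping above (hypothetical sizes): if this txg
 * freed 6M (ms_freedtree) and the outgoing defer tree holds 4M, then
 * with deferral allowed defer_delta = +2M, i.e. the vdev's deferred
 * space grows by the difference.  When the pool is into its slop space
 * (defer_allowed == B_FALSE) the freed space bypasses deferral below,
 * so defer_delta = -4M and only the old defer tree drains.
 */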
2405
2406 vdev_space_update(vd, alloc_delta + defer_delta, defer_delta, 0);
2407
2408 /*
2409 * If there's a metaslab_load() in progress, wait for it to complete
2410 * so that we have a consistent view of the in-core space map.
2411 */
2412 metaslab_load_wait(msp);
2413
2414 /*
2415 * Move the frees from the defer_tree back to the free
2416 * range tree (if it's loaded). Swap the freed_tree and the
2417 * defer_tree -- this is safe to do because we've just emptied out
2418 * the defer_tree.
2419 */
2420 range_tree_vacate(*defer_tree,
2421 msp->ms_loaded ? range_tree_add : NULL, msp->ms_tree);
2422 if (defer_allowed) {
2423 range_tree_swap(&msp->ms_freedtree, defer_tree);
2424 } else {
2425 range_tree_vacate(msp->ms_freedtree,
2426 msp->ms_loaded ? range_tree_add : NULL, msp->ms_tree);
2427 }
2428
2429 space_map_update(msp->ms_sm);
2430
2431 msp->ms_deferspace += defer_delta;
2432 ASSERT3S(msp->ms_deferspace, >=, 0);
2433 ASSERT3S(msp->ms_deferspace, <=, msp->ms_size);
2434 if (msp->ms_deferspace != 0) {
2435 /*
2436 * Keep syncing this metaslab until all deferred frees
2437 * are back in circulation.
2438 */
2439 vdev_dirty(vd, VDD_METASLAB, msp, txg + 1);
2440 }
2441
2442 /*
2443 * Calculate the new weights before unloading any metaslabs.
2444 * This will give us the most accurate weighting.
2445 */
2446 metaslab_group_sort(mg, msp, metaslab_weight(msp));
2447
2448 /*
2449 * If the metaslab is loaded and we've not tried to load or allocate
2450 * from it in 'metaslab_unload_delay' txgs, then unload it.
2451 */
2452 if (msp->ms_loaded &&
2453 msp->ms_selected_txg + metaslab_unload_delay < txg) {
2454
2455 for (int t = 1; t < TXG_CONCURRENT_STATES; t++) {
2456 VERIFY0(range_tree_space(
2457 msp->ms_alloctree[(txg + t) & TXG_MASK]));
2458 }
2459
2460 if (!metaslab_debug_unload)
2461 metaslab_unload(msp);
2462 }
2463
2464 mutex_exit(&msp->ms_lock);
2465 }
2466
2467 void
2468 metaslab_sync_reassess(metaslab_group_t *mg)
2469 {
2470 metaslab_group_alloc_update(mg);
2471 mg->mg_fragmentation = metaslab_group_fragmentation(mg);
2472
2473 /*
2474 * Preload the next potential metaslabs
2475 */
2476 metaslab_group_preload(mg);
2477 }
2478
2479 static uint64_t
2480 metaslab_distance(metaslab_t *msp, dva_t *dva)
2481 {
2482 uint64_t ms_shift = msp->ms_group->mg_vd->vdev_ms_shift;
2483 uint64_t offset = DVA_GET_OFFSET(dva) >> ms_shift;
2484 uint64_t start = msp->ms_id;
2485
2486 if (msp->ms_group->mg_vd->vdev_id != DVA_GET_VDEV(dva))
2487 return (1ULL << 63);
2488
2489 if (offset < start)
2490 return ((start - offset) << ms_shift);
2491 if (offset > start)
2492 return ((offset - start) << ms_shift);
2493 return (0);
2494 }
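/*
 * Example of the distance metric above (hypothetical geometry): with
 * 1G metaslabs (ms_shift = 30), an existing DVA that lands in metaslab
 * 12 is 8G away from a candidate metaslab with ms_id = 20, and any
 * metaslab on a different top-level vdev reports 1 << 63, i.e.
 * effectively infinite distance.
 */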
2495
2496 /*
2497 * ==========================================================================
2498 * Metaslab allocation tracing facility
2499 * ==========================================================================
2500 */
2501 #ifdef _METASLAB_TRACING
2502 kstat_t *metaslab_trace_ksp;
2503 kstat_named_t metaslab_trace_over_limit;
2504
2505 void
2506 metaslab_alloc_trace_init(void)
2507 {
2508 ASSERT(metaslab_alloc_trace_cache == NULL);
2509 metaslab_alloc_trace_cache = kmem_cache_create(
2510 "metaslab_alloc_trace_cache", sizeof (metaslab_alloc_trace_t),
2511 0, NULL, NULL, NULL, NULL, NULL, 0);
2512 metaslab_trace_ksp = kstat_create("zfs", 0, "metaslab_trace_stats",
2513 "misc", KSTAT_TYPE_NAMED, 1, KSTAT_FLAG_VIRTUAL);
2514 if (metaslab_trace_ksp != NULL) {
2515 metaslab_trace_ksp->ks_data = &metaslab_trace_over_limit;
2516 kstat_named_init(&metaslab_trace_over_limit,
2517 "metaslab_trace_over_limit", KSTAT_DATA_UINT64);
2518 kstat_install(metaslab_trace_ksp);
2519 }
2520 }
2521
2522 void
2523 metaslab_alloc_trace_fini(void)
2524 {
2525 if (metaslab_trace_ksp != NULL) {
2526 kstat_delete(metaslab_trace_ksp);
2527 metaslab_trace_ksp = NULL;
2528 }
2529 kmem_cache_destroy(metaslab_alloc_trace_cache);
2530 metaslab_alloc_trace_cache = NULL;
2531 }
2532
2533 /*
2534 * Add an allocation trace element to the allocation tracing list.
2535 */
2536 static void
2537 metaslab_trace_add(zio_alloc_list_t *zal, metaslab_group_t *mg,
2538 metaslab_t *msp, uint64_t psize, uint32_t dva_id, uint64_t offset)
2539 {
2540 metaslab_alloc_trace_t *mat;
2541
2542 if (!metaslab_trace_enabled)
2543 return;
2544
2545 /*
2546 * When the tracing list reaches its maximum we remove
2547 * the second element in the list before adding a new one.
2548 * By removing the second element we preserve the original
2549 * entry as a clue to what allocation steps have already been
2550 * performed.
2551 */
2552 if (zal->zal_size == metaslab_trace_max_entries) {
2553 metaslab_alloc_trace_t *mat_next;
2554 #ifdef DEBUG
2555 panic("too many entries in allocation list");
2556 #endif
2557 atomic_inc_64(&metaslab_trace_over_limit.value.ui64);
2558 zal->zal_size--;
2559 mat_next = list_next(&zal->zal_list, list_head(&zal->zal_list));
2560 list_remove(&zal->zal_list, mat_next);
2561 kmem_cache_free(metaslab_alloc_trace_cache, mat_next);
2562 }
2563
2564 mat = kmem_cache_alloc(metaslab_alloc_trace_cache, KM_SLEEP);
2565 list_link_init(&mat->mat_list_node);
2566 mat->mat_mg = mg;
2567 mat->mat_msp = msp;
2568 mat->mat_size = psize;
2569 mat->mat_dva_id = dva_id;
2570 mat->mat_offset = offset;
2571 mat->mat_weight = 0;
2572
2573 if (msp != NULL)
2574 mat->mat_weight = msp->ms_weight;
2575
2576 /*
2577 * The list is part of the zio so locking is not required. Only
2578 * a single thread will perform allocations for a given zio.
2579 */
2580 list_insert_tail(&zal->zal_list, mat);
2581 zal->zal_size++;
2582
2583 ASSERT3U(zal->zal_size, <=, metaslab_trace_max_entries);
2584 }
2585
2586 void
2587 metaslab_trace_init(zio_alloc_list_t *zal)
2588 {
2589 list_create(&zal->zal_list, sizeof (metaslab_alloc_trace_t),
2590 offsetof(metaslab_alloc_trace_t, mat_list_node));
2591 zal->zal_size = 0;
2592 }
2593
2594 void
2595 metaslab_trace_fini(zio_alloc_list_t *zal)
2596 {
2597 metaslab_alloc_trace_t *mat;
2598
2599 while ((mat = list_remove_head(&zal->zal_list)) != NULL)
2600 kmem_cache_free(metaslab_alloc_trace_cache, mat);
2601 list_destroy(&zal->zal_list);
2602 zal->zal_size = 0;
2603 }
2604 #else
2605
2606 #define metaslab_trace_add(zal, mg, msp, psize, id, off)
2607
2608 void
2609 metaslab_alloc_trace_init(void)
2610 {
2611 }
2612
2613 void
2614 metaslab_alloc_trace_fini(void)
2615 {
2616 }
2617
2618 void
2619 metaslab_trace_init(zio_alloc_list_t *zal)
2620 {
2621 }
2622
2623 void
2624 metaslab_trace_fini(zio_alloc_list_t *zal)
2625 {
2626 }
2627
2628 #endif /* _METASLAB_TRACING */
2629
2630 /*
2631 * ==========================================================================
2632 * Metaslab block operations
2633 * ==========================================================================
2634 */
2635
2636 static void
2637 metaslab_group_alloc_increment(spa_t *spa, uint64_t vdev, void *tag, int flags)
2638 {
2639 if (!(flags & METASLAB_ASYNC_ALLOC) ||
2640 flags & METASLAB_DONT_THROTTLE)
2641 return;
2642
2643 metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
2644 if (!mg->mg_class->mc_alloc_throttle_enabled)
2645 return;
2646
2647 (void) refcount_add(&mg->mg_alloc_queue_depth, tag);
2648 }
2649
2650 void
2651 metaslab_group_alloc_decrement(spa_t *spa, uint64_t vdev, void *tag, int flags)
2652 {
2653 if (!(flags & METASLAB_ASYNC_ALLOC) ||
2654 flags & METASLAB_DONT_THROTTLE)
2655 return;
2656
2657 metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
2658 if (!mg->mg_class->mc_alloc_throttle_enabled)
2659 return;
2660
2661 (void) refcount_remove(&mg->mg_alloc_queue_depth, tag);
2662 }
2663
2664 void
2665 metaslab_group_alloc_verify(spa_t *spa, const blkptr_t *bp, void *tag)
2666 {
2667 #ifdef ZFS_DEBUG
2668 const dva_t *dva = bp->blk_dva;
2669 int ndvas = BP_GET_NDVAS(bp);
2670
2671 for (int d = 0; d < ndvas; d++) {
2672 uint64_t vdev = DVA_GET_VDEV(&dva[d]);
2673 metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
2674 VERIFY(refcount_not_held(&mg->mg_alloc_queue_depth, tag));
2675 }
2676 #endif
2677 }
2678
2679 static uint64_t
2680 metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg)
2681 {
2682 uint64_t start;
2683 range_tree_t *rt = msp->ms_tree;
2684 metaslab_class_t *mc = msp->ms_group->mg_class;
2685
2686 VERIFY(!msp->ms_condensing);
2687
2688 start = mc->mc_ops->msop_alloc(msp, size);
2689 if (start != -1ULL) {
2690 metaslab_group_t *mg = msp->ms_group;
2691 vdev_t *vd = mg->mg_vd;
2692
2693 VERIFY0(P2PHASE(start, 1ULL << vd->vdev_ashift));
2694 VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
2695 VERIFY3U(range_tree_space(rt) - size, <=, msp->ms_size);
2696 range_tree_remove(rt, start, size);
2697
2698 if (range_tree_space(msp->ms_alloctree[txg & TXG_MASK]) == 0)
2699 vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg);
2700
2701 range_tree_add(msp->ms_alloctree[txg & TXG_MASK], start, size);
2702
2703 /* Track the last successful allocation */
2704 msp->ms_alloc_txg = txg;
2705 metaslab_verify_space(msp, txg);
2706 }
2707
2708 /*
2709 * Now that we've attempted the allocation we need to update the
2710 * metaslab's maximum block size since it may have changed.
2711 */
2712 msp->ms_max_size = metaslab_block_maxsize(msp);
2713 return (start);
2714 }
2715
2716 static uint64_t
2717 metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal,
2718 uint64_t asize, uint64_t txg, uint64_t min_distance, dva_t *dva, int d)
2719 {
2720 metaslab_t *msp = NULL;
2721 uint64_t offset = -1ULL;
2722 uint64_t activation_weight;
2723 uint64_t target_distance;
2724 int i;
2725
2726 activation_weight = METASLAB_WEIGHT_PRIMARY;
2727 for (i = 0; i < d; i++) {
2728 if (DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) {
2729 activation_weight = METASLAB_WEIGHT_SECONDARY;
2730 break;
2731 }
2732 }
2733
2734 metaslab_t *search = kmem_alloc(sizeof (*search), KM_SLEEP);
2735 search->ms_weight = UINT64_MAX;
2736 search->ms_start = 0;
2737 for (;;) {
2738 boolean_t was_active;
2739 avl_tree_t *t = &mg->mg_metaslab_tree;
2740 avl_index_t idx;
2741
2742 mutex_enter(&mg->mg_lock);
2743
2744 /*
2745 * Find the metaslab with the highest weight that is less
2746 * than what we've already tried. In the common case, this
2747 * means that we will examine each metaslab at most once.
2748 * Note that concurrent callers could reorder metaslabs
2749 * by activation/passivation once we have dropped the mg_lock.
2750 * If a metaslab is activated by another thread, and we fail
2751 * to allocate from the metaslab we have selected, we may
2752 * not try the newly-activated metaslab, and instead activate
2753 * another metaslab. This is not optimal, but generally
2754 * does not cause any problems (a possible exception being
2755 * if every metaslab is completely full except for the
2756 * newly-activated metaslab which we fail to examine).
2757 */
2758 msp = avl_find(t, search, &idx);
2759 if (msp == NULL)
2760 msp = avl_nearest(t, idx, AVL_AFTER);
2761 for (; msp != NULL; msp = AVL_NEXT(t, msp)) {
2762
2763 if (!metaslab_should_allocate(msp, asize)) {
2764 metaslab_trace_add(zal, mg, msp, asize, d,
2765 TRACE_TOO_SMALL);
2766 continue;
2767 }
2768
2769 /*
2770 * If the selected metaslab is condensing, skip it.
2771 */
2772 if (msp->ms_condensing)
2773 continue;
2774
2775 was_active = msp->ms_weight & METASLAB_ACTIVE_MASK;
2776 if (activation_weight == METASLAB_WEIGHT_PRIMARY)
2777 break;
2778
2779 target_distance = min_distance +
2780 (space_map_allocated(msp->ms_sm) != 0 ? 0 :
2781 min_distance >> 1);
2782
2783 for (i = 0; i < d; i++) {
2784 if (metaslab_distance(msp, &dva[i]) <
2785 target_distance)
2786 break;
2787 }
2788 if (i == d)
2789 break;
2790 }
2791 mutex_exit(&mg->mg_lock);
2792 if (msp == NULL) {
2793 kmem_free(search, sizeof (*search));
2794 return (-1ULL);
2795 }
2796 search->ms_weight = msp->ms_weight;
2797 search->ms_start = msp->ms_start + 1;
2798
2799 mutex_enter(&msp->ms_lock);
2800
2801 /*
2802 * Ensure that the metaslab we have selected is still
2803 * capable of handling our request. It's possible that
2804 * another thread may have changed the weight while we
2805 * were blocked on the metaslab lock. We check the
2806 * active status first to see if we need to reselect
2807 * a new metaslab.
2808 */
2809 if (was_active && !(msp->ms_weight & METASLAB_ACTIVE_MASK)) {
2810 mutex_exit(&msp->ms_lock);
2811 continue;
2812 }
2813
2814 if ((msp->ms_weight & METASLAB_WEIGHT_SECONDARY) &&
2815 activation_weight == METASLAB_WEIGHT_PRIMARY) {
2816 metaslab_passivate(msp,
2817 msp->ms_weight & ~METASLAB_ACTIVE_MASK);
2818 mutex_exit(&msp->ms_lock);
2819 continue;
2820 }
2821
2822 if (metaslab_activate(msp, activation_weight) != 0) {
2823 mutex_exit(&msp->ms_lock);
2824 continue;
2825 }
2826 msp->ms_selected_txg = txg;
2827
2828 /*
2829 * Now that we have the lock, recheck to see if we should
2830 * continue to use this metaslab for this allocation. The
2831 * metaslab is now loaded so metaslab_should_allocate() can
2832 * accurately determine if the allocation attempt should
2833 * proceed.
2834 */
2835 if (!metaslab_should_allocate(msp, asize)) {
2836 /* Passivate this metaslab and select a new one. */
2837 metaslab_trace_add(zal, mg, msp, asize, d,
2838 TRACE_TOO_SMALL);
2839 goto next;
2840 }
2841
2842
2843 /*
2844 * If this metaslab is currently condensing then pick again as
2845 * we can't manipulate this metaslab until it's committed
2846 * to disk.
2847 */
2848 if (msp->ms_condensing) {
2849 metaslab_trace_add(zal, mg, msp, asize, d,
2850 TRACE_CONDENSING);
2851 mutex_exit(&msp->ms_lock);
2852 continue;
2853 }
2854
2855 offset = metaslab_block_alloc(msp, asize, txg);
2856 metaslab_trace_add(zal, mg, msp, asize, d, offset);
2857
2858 if (offset != -1ULL) {
2859 /* Proactively passivate the metaslab, if needed */
2860 metaslab_segment_may_passivate(msp);
2861 break;
2862 }
2863 next:
2864 ASSERT(msp->ms_loaded);
2865
2866 /*
2867 * We were unable to allocate from this metaslab so determine
2868 * a new weight for this metaslab. Now that we have loaded
2869 * the metaslab we can provide a better hint to the metaslab
2870 * selector.
2871 *
2872 * For space-based metaslabs, we use the maximum block size.
2873 * This information is only available when the metaslab
2874 * is loaded and is more accurate than the generic free
2875 * space weight that was calculated by metaslab_weight().
2876 * This information allows us to quickly compare the maximum
2877 * available allocation in the metaslab to the allocation
2878 * size being requested.
2879 *
2880 * For segment-based metaslabs, determine the new weight
2881 * based on the highest bucket in the range tree. We
2882 * explicitly use the loaded segment weight (i.e. the range
2883 * tree histogram) since it contains the space that is
2884 * currently available for allocation and is accurate
2885 * even within a sync pass.
2886 */
2887 if (WEIGHT_IS_SPACEBASED(msp->ms_weight)) {
2888 uint64_t weight = metaslab_block_maxsize(msp);
2889 WEIGHT_SET_SPACEBASED(weight);
2890 metaslab_passivate(msp, weight);
2891 } else {
2892 metaslab_passivate(msp,
2893 metaslab_weight_from_range_tree(msp));
2894 }
2895
2896 /*
2897 * We have just failed an allocation attempt, check
2898 * that metaslab_should_allocate() agrees. Otherwise,
2899 * we may end up in an infinite loop retrying the same
2900 * metaslab.
2901 */
2902 ASSERT(!metaslab_should_allocate(msp, asize));
2903 mutex_exit(&msp->ms_lock);
2904 }
2905 mutex_exit(&msp->ms_lock);
2906 kmem_free(search, sizeof (*search));
2907 return (offset);
2908 }
2909
2910 static uint64_t
2911 metaslab_group_alloc(metaslab_group_t *mg, zio_alloc_list_t *zal,
2912 uint64_t asize, uint64_t txg, uint64_t min_distance, dva_t *dva, int d)
2913 {
2914 uint64_t offset;
2915 ASSERT(mg->mg_initialized);
2916
2917 offset = metaslab_group_alloc_normal(mg, zal, asize, txg,
2918 min_distance, dva, d);
2919
2920 mutex_enter(&mg->mg_lock);
2921 if (offset == -1ULL) {
2922 mg->mg_failed_allocations++;
2923 metaslab_trace_add(zal, mg, NULL, asize, d,
2924 TRACE_GROUP_FAILURE);
2925 if (asize == SPA_GANGBLOCKSIZE) {
2926 /*
2927 * This metaslab group was unable to allocate
2928 * the minimum gang block size so it must be out of
2929 * space. We must notify the allocation throttle
2930 * to start skipping allocation attempts to this
2931 * metaslab group until more space becomes available.
2932 * Note: this failure cannot be caused by the
2933 * allocation throttle since the allocation throttle
2934 * is only responsible for skipping devices and
2935 * not failing block allocations.
2936 */
2937 mg->mg_no_free_space = B_TRUE;
2938 }
2939 }
2940 mg->mg_allocations++;
2941 mutex_exit(&mg->mg_lock);
2942 return (offset);
2943 }
2944
2945 /*
2946 * If we have to write a ditto block (i.e. more than one DVA for a given BP)
2947 * on the same vdev as an existing DVA of this BP, then try to allocate it
2948 * at least (vdev_asize / (2 ^ ditto_same_vdev_distance_shift)) away from the
2949 * existing DVAs.
2950 */
2951 int ditto_same_vdev_distance_shift = 3;
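/*
 * Example (hypothetical vdev size): with ditto_same_vdev_distance_shift
 * at its default of 3, a 1T top-level vdev asks for at least
 * 1T >> 3 = 128G between DVAs of the same BP -- unless that distance
 * would be no larger than a single metaslab, or we are already in
 * try_hard mode, in which case metaslab_alloc_dva() drops the
 * requirement to 0.  metaslab_group_alloc_normal() additionally pads
 * the target by 50% when the candidate metaslab is completely free.
 */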
2952
2953 /*
2954 * Allocate a block for the specified i/o.
2955 */
2956 static int
2957 metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
2958 dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags,
2959 zio_alloc_list_t *zal)
2960 {
2961 metaslab_group_t *mg, *fast_mg, *rotor;
2962 vdev_t *vd;
2963 boolean_t try_hard = B_FALSE;
2964
2965 ASSERT(!DVA_IS_VALID(&dva[d]));
2966
2967 /*
2968 * For testing, make some blocks above a certain size be gang blocks.
2969 */
2970 if (psize >= metaslab_gang_bang && (ddi_get_lbolt() & 3) == 0) {
2971 metaslab_trace_add(zal, NULL, NULL, psize, d, TRACE_FORCE_GANG);
2972 return (SET_ERROR(ENOSPC));
2973 }
2974
2975 /*
2976 * Start at the rotor and loop through all mgs until we find something.
2977 * Note that there's no locking on mc_rotor or mc_aliquot because
2978 * nothing actually breaks if we miss a few updates -- we just won't
2979 * allocate quite as evenly. It all balances out over time.
2980 *
2981 * If we are doing ditto or log blocks, try to spread them across
2982 * consecutive vdevs. If we're forced to reuse a vdev before we've
2983 * allocated all of our ditto blocks, then try and spread them out on
2984 * that vdev as much as possible. If it turns out to not be possible,
2985 * gradually lower our standards until anything becomes acceptable.
2986 * Also, allocating on consecutive vdevs (as opposed to random vdevs)
2987 * gives us hope of containing our fault domains to something we're
2988 * able to reason about. Otherwise, any two top-level vdev failures
2989 * will guarantee the loss of data. With consecutive allocation,
2990 * only two adjacent top-level vdev failures will result in data loss.
2991 *
2992 * If we are doing gang blocks (hintdva is non-NULL), try to keep
2993 * ourselves on the same vdev as our gang block header. That
2994 * way, we can hope for locality in vdev_cache, plus it makes our
2995 * fault domains something tractable.
2996 */
2997 if (hintdva) {
2998 vd = vdev_lookup_top(spa, DVA_GET_VDEV(&hintdva[d]));
2999
3000 /*
3001 * It's possible the vdev we're using as the hint no
3002 * longer exists (i.e. removed). Consult the rotor when
3003 * all else fails.
3004 */
3005 if (vd != NULL) {
3006 mg = vd->vdev_mg;
3007
3008 if (flags & METASLAB_HINTBP_AVOID &&
3009 mg->mg_next != NULL)
3010 mg = mg->mg_next;
3011 } else {
3012 mg = mc->mc_rotor;
3013 }
3014 } else if (d != 0) {
3015 vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1]));
3016 mg = vd->vdev_mg->mg_next;
3017 } else if (flags & METASLAB_FASTWRITE) {
3018 mg = fast_mg = mc->mc_rotor;
3019
3020 do {
3021 if (fast_mg->mg_vd->vdev_pending_fastwrite <
3022 mg->mg_vd->vdev_pending_fastwrite)
3023 mg = fast_mg;
3024 } while ((fast_mg = fast_mg->mg_next) != mc->mc_rotor);
3025
3026 } else {
3027 mg = mc->mc_rotor;
3028 }
3029
3030 /*
3031 * If the hint put us into the wrong metaslab class, or into a
3032 * metaslab group that has been passivated, just follow the rotor.
3033 */
3034 if (mg->mg_class != mc || mg->mg_activation_count <= 0)
3035 mg = mc->mc_rotor;
3036
3037 rotor = mg;
3038 top:
3039 do {
3040 boolean_t allocatable;
3041
3042 ASSERT(mg->mg_activation_count == 1);
3043 vd = mg->mg_vd;
3044
3045 /*
3046 * Don't allocate from faulted devices.
3047 */
3048 if (try_hard) {
3049 spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER);
3050 allocatable = vdev_allocatable(vd);
3051 spa_config_exit(spa, SCL_ZIO, FTAG);
3052 } else {
3053 allocatable = vdev_allocatable(vd);
3054 }
3055
3056 /*
3057 * Determine if the selected metaslab group is eligible
3058 * for allocations. If we're ganging then don't allow
3059 * this metaslab group to skip allocations since that would
3060 * inadvertently return ENOSPC and suspend the pool
3061 * even though space is still available.
3062 */
3063 if (allocatable && !GANG_ALLOCATION(flags) && !try_hard) {
3064 allocatable = metaslab_group_allocatable(mg, rotor,
3065 psize);
3066 }
3067
3068 if (!allocatable) {
3069 metaslab_trace_add(zal, mg, NULL, psize, d,
3070 TRACE_NOT_ALLOCATABLE);
3071 goto next;
3072 }
3073
3074 ASSERT(mg->mg_initialized);
3075
3076 /*
3077 * Avoid writing single-copy data to a failing,
3078 * non-redundant vdev, unless we've already tried all
3079 * other vdevs.
3080 */
3081 if ((vd->vdev_stat.vs_write_errors > 0 ||
3082 vd->vdev_state < VDEV_STATE_HEALTHY) &&
3083 d == 0 && !try_hard && vd->vdev_children == 0) {
3084 metaslab_trace_add(zal, mg, NULL, psize, d,
3085 TRACE_VDEV_ERROR);
3086 goto next;
3087 }
3088
3089 ASSERT(mg->mg_class == mc);
3090
3091 /*
3092 * If we don't need to try hard, then require that the
3093 * block be 1/8th of the device away from any other DVAs
3094 * in this BP. If we are trying hard, allow any offset
3095 * to be used (distance=0).
3096 */
3097 uint64_t distance = 0;
3098 if (!try_hard) {
3099 distance = vd->vdev_asize >>
3100 ditto_same_vdev_distance_shift;
3101 if (distance <= (1ULL << vd->vdev_ms_shift))
3102 distance = 0;
3103 }
3104
3105 uint64_t asize = vdev_psize_to_asize(vd, psize);
3106 ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0);
3107
3108 uint64_t offset = metaslab_group_alloc(mg, zal, asize, txg,
3109 distance, dva, d);
3110
3111 if (offset != -1ULL) {
3112 /*
3113 * If we've just selected this metaslab group,
3114 * figure out whether the corresponding vdev is
3115 * over- or under-used relative to the pool,
3116 * and set an allocation bias to even it out.
3117 *
3118 * Bias is also used to compensate for unequally
3119 * sized vdevs so that space is allocated fairly.
3120 */
3121 if (mc->mc_aliquot == 0 && metaslab_bias_enabled) {
3122 vdev_stat_t *vs = &vd->vdev_stat;
3123 int64_t vs_free = vs->vs_space - vs->vs_alloc;
3124 int64_t mc_free = mc->mc_space - mc->mc_alloc;
3125 int64_t ratio;
3126
3127 /*
3128 * Calculate how much more or less we should
3129 * try to allocate from this device during
3130 * this iteration around the rotor.
3131 *
3132 * This basically introduces a zero-centered
3133 * bias towards the devices with the most
3134 * free space, while compensating for vdev
3135 * size differences.
3136 *
3137 * Examples:
3138 * vdev V1 = 16M/128M
3139 * vdev V2 = 16M/128M
3140 * ratio(V1) = 100% ratio(V2) = 100%
3141 *
3142 * vdev V1 = 16M/128M
3143 * vdev V2 = 64M/128M
3144 * ratio(V1) = 127% ratio(V2) = 72%
3145 *
3146 * vdev V1 = 16M/128M
3147 * vdev V2 = 64M/512M
3148 * ratio(V1) = 40% ratio(V2) = 160%
3149 */
3150 ratio = (vs_free * mc->mc_alloc_groups * 100) /
3151 (mc_free + 1);
3152 mg->mg_bias = ((ratio - 100) *
3153 (int64_t)mg->mg_aliquot) / 100;
3154 } else if (!metaslab_bias_enabled) {
3155 mg->mg_bias = 0;
3156 }
3157
3158 if ((flags & METASLAB_FASTWRITE) ||
3159 atomic_add_64_nv(&mc->mc_aliquot, asize) >=
3160 mg->mg_aliquot + mg->mg_bias) {
3161 mc->mc_rotor = mg->mg_next;
3162 mc->mc_aliquot = 0;
3163 }
3164
3165 DVA_SET_VDEV(&dva[d], vd->vdev_id);
3166 DVA_SET_OFFSET(&dva[d], offset);
3167 DVA_SET_GANG(&dva[d],
3168 ((flags & METASLAB_GANG_HEADER) ? 1 : 0));
3169 DVA_SET_ASIZE(&dva[d], asize);
3170
3171 if (flags & METASLAB_FASTWRITE) {
3172 atomic_add_64(&vd->vdev_pending_fastwrite,
3173 psize);
3174 }
3175
3176 return (0);
3177 }
3178 next:
3179 mc->mc_rotor = mg->mg_next;
3180 mc->mc_aliquot = 0;
3181 } while ((mg = mg->mg_next) != rotor);
3182
3183 /*
3184 * If we haven't tried hard, do so now.
3185 */
3186 if (!try_hard) {
3187 try_hard = B_TRUE;
3188 goto top;
3189 }
3190
3191 bzero(&dva[d], sizeof (dva_t));
3192
3193 metaslab_trace_add(zal, rotor, NULL, psize, d, TRACE_ENOSPC);
3194 return (SET_ERROR(ENOSPC));
3195 }
3196
3197 /*
3198 * Free the block represented by DVA in the context of the specified
3199 * transaction group.
3200 */
3201 static void
3202 metaslab_free_dva(spa_t *spa, const dva_t *dva, uint64_t txg, boolean_t now)
3203 {
3204 uint64_t vdev = DVA_GET_VDEV(dva);
3205 uint64_t offset = DVA_GET_OFFSET(dva);
3206 uint64_t size = DVA_GET_ASIZE(dva);
3207 vdev_t *vd;
3208 metaslab_t *msp;
3209
3210 if (txg > spa_freeze_txg(spa))
3211 return;
3212
3213 if ((vd = vdev_lookup_top(spa, vdev)) == NULL || !DVA_IS_VALID(dva) ||
3214 (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) {
3215 zfs_panic_recover("metaslab_free_dva(): bad DVA %llu:%llu:%llu",
3216 (u_longlong_t)vdev, (u_longlong_t)offset,
3217 (u_longlong_t)size);
3218 return;
3219 }
3220
3221 msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
3222
3223 if (DVA_GET_GANG(dva))
3224 size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
3225
3226 mutex_enter(&msp->ms_lock);
3227
3228 if (now) {
3229 range_tree_remove(msp->ms_alloctree[txg & TXG_MASK],
3230 offset, size);
3231
3232 VERIFY(!msp->ms_condensing);
3233 VERIFY3U(offset, >=, msp->ms_start);
3234 VERIFY3U(offset + size, <=, msp->ms_start + msp->ms_size);
3235 VERIFY3U(range_tree_space(msp->ms_tree) + size, <=,
3236 msp->ms_size);
3237 VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
3238 VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
3239 range_tree_add(msp->ms_tree, offset, size);
3240 msp->ms_max_size = metaslab_block_maxsize(msp);
3241 } else {
3242 VERIFY3U(txg, ==, spa->spa_syncing_txg);
3243 if (range_tree_space(msp->ms_freeingtree) == 0)
3244 vdev_dirty(vd, VDD_METASLAB, msp, txg);
3245 range_tree_add(msp->ms_freeingtree, offset, size);
3246 }
3247
3248 mutex_exit(&msp->ms_lock);
3249 }
3250
3251 /*
3252 * Intent log support: upon opening the pool after a crash, notify the SPA
3253 * of blocks that the intent log has allocated for immediate write, but
3254 * which are still considered free by the SPA because the last transaction
3255 * group didn't commit yet.
3256 */
3257 static int
3258 metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
3259 {
3260 uint64_t vdev = DVA_GET_VDEV(dva);
3261 uint64_t offset = DVA_GET_OFFSET(dva);
3262 uint64_t size = DVA_GET_ASIZE(dva);
3263 vdev_t *vd;
3264 metaslab_t *msp;
3265 int error = 0;
3266
3267 ASSERT(DVA_IS_VALID(dva));
3268
3269 if ((vd = vdev_lookup_top(spa, vdev)) == NULL ||
3270 (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count)
3271 return (SET_ERROR(ENXIO));
3272
3273 msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
3274
3275 if (DVA_GET_GANG(dva))
3276 size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
3277
3278 mutex_enter(&msp->ms_lock);
3279
3280 if ((txg != 0 && spa_writeable(spa)) || !msp->ms_loaded)
3281 error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY);
3282
3283 if (error == 0 && !range_tree_contains(msp->ms_tree, offset, size))
3284 error = SET_ERROR(ENOENT);
3285
3286 if (error || txg == 0) { /* txg == 0 indicates dry run */
3287 mutex_exit(&msp->ms_lock);
3288 return (error);
3289 }
3290
3291 VERIFY(!msp->ms_condensing);
3292 VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
3293 VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
3294 VERIFY3U(range_tree_space(msp->ms_tree) - size, <=, msp->ms_size);
3295 range_tree_remove(msp->ms_tree, offset, size);
3296
3297 if (spa_writeable(spa)) { /* don't dirty if we're zdb(1M) */
3298 if (range_tree_space(msp->ms_alloctree[txg & TXG_MASK]) == 0)
3299 vdev_dirty(vd, VDD_METASLAB, msp, txg);
3300 range_tree_add(msp->ms_alloctree[txg & TXG_MASK], offset, size);
3301 }
3302
3303 mutex_exit(&msp->ms_lock);
3304
3305 return (0);
3306 }
3307
3308 /*
3309 * Reserve some allocation slots. The reservation system must be called
3310 * before we call into the allocator. If there aren't any available slots
3311 * then the I/O will be throttled until an I/O completes and its slots are
3312 * freed up. The function returns true if it was successful in placing
3313 * the reservation.
3314 */
3315 boolean_t
3316 metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, zio_t *zio,
3317 int flags)
3318 {
3319 uint64_t available_slots = 0;
3320 boolean_t slot_reserved = B_FALSE;
3321
3322 ASSERT(mc->mc_alloc_throttle_enabled);
3323 mutex_enter(&mc->mc_lock);
3324
3325 uint64_t reserved_slots = refcount_count(&mc->mc_alloc_slots);
3326 if (reserved_slots < mc->mc_alloc_max_slots)
3327 available_slots = mc->mc_alloc_max_slots - reserved_slots;
3328
3329 if (slots <= available_slots || GANG_ALLOCATION(flags)) {
3330 /*
3331 * We reserve the slots individually so that we can unreserve
3332 * them individually when an I/O completes.
3333 */
3334 for (int d = 0; d < slots; d++) {
3335 reserved_slots = refcount_add(&mc->mc_alloc_slots, zio);
3336 }
3337 zio->io_flags |= ZIO_FLAG_IO_ALLOCATING;
3338 slot_reserved = B_TRUE;
3339 }
3340
3341 mutex_exit(&mc->mc_lock);
3342 return (slot_reserved);
3343 }
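/*
 * Example of the slot accounting above (hypothetical limits): with
 * mc_alloc_max_slots = 64 and 60 slots currently held, available_slots
 * is 4, so a request for 3 slots succeeds (each slot is added to the
 * refcount individually, so they can be released one at a time as the
 * I/Os complete) while a request for 6 is refused -- unless it is a
 * gang allocation, which the check lets through regardless.
 */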
3344
3345 void
3346 metaslab_class_throttle_unreserve(metaslab_class_t *mc, int slots, zio_t *zio)
3347 {
3348 ASSERT(mc->mc_alloc_throttle_enabled);
3349 mutex_enter(&mc->mc_lock);
3350 for (int d = 0; d < slots; d++) {
3351 (void) refcount_remove(&mc->mc_alloc_slots, zio);
3352 }
3353 mutex_exit(&mc->mc_lock);
3354 }
3355
3356 int
3357 metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp,
3358 int ndvas, uint64_t txg, blkptr_t *hintbp, int flags,
3359 zio_alloc_list_t *zal, zio_t *zio)
3360 {
3361 dva_t *dva = bp->blk_dva;
3362 dva_t *hintdva = hintbp->blk_dva;
3363 int error = 0;
3364
3365 ASSERT(bp->blk_birth == 0);
3366 ASSERT(BP_PHYSICAL_BIRTH(bp) == 0);
3367
3368 spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
3369
3370 if (mc->mc_rotor == NULL) { /* no vdevs in this class */
3371 spa_config_exit(spa, SCL_ALLOC, FTAG);
3372 return (SET_ERROR(ENOSPC));
3373 }
3374
3375 ASSERT(ndvas > 0 && ndvas <= spa_max_replication(spa));
3376 ASSERT(BP_GET_NDVAS(bp) == 0);
3377 ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp));
3378 ASSERT3P(zal, !=, NULL);
3379
3380 for (int d = 0; d < ndvas; d++) {
3381 error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva,
3382 txg, flags, zal);
3383 if (error != 0) {
3384 for (d--; d >= 0; d--) {
3385 metaslab_free_dva(spa, &dva[d], txg, B_TRUE);
3386 metaslab_group_alloc_decrement(spa,
3387 DVA_GET_VDEV(&dva[d]), zio, flags);
3388 bzero(&dva[d], sizeof (dva_t));
3389 }
3390 spa_config_exit(spa, SCL_ALLOC, FTAG);
3391 return (error);
3392 } else {
3393 /*
3394 * Update the metaslab group's queue depth
3395 * based on the newly allocated dva.
3396 */
3397 metaslab_group_alloc_increment(spa,
3398 DVA_GET_VDEV(&dva[d]), zio, flags);
3399 }
3400
3401 }
3402 ASSERT(error == 0);
3403 ASSERT(BP_GET_NDVAS(bp) == ndvas);
3404
3405 spa_config_exit(spa, SCL_ALLOC, FTAG);
3406
3407 BP_SET_BIRTH(bp, txg, 0);
3408
3409 return (0);
3410 }
3411
3412 void
3413 metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now)
3414 {
3415 const dva_t *dva = bp->blk_dva;
3416 int ndvas = BP_GET_NDVAS(bp);
3417
3418 ASSERT(!BP_IS_HOLE(bp));
3419 ASSERT(!now || bp->blk_birth >= spa_syncing_txg(spa));
3420
3421 spa_config_enter(spa, SCL_FREE, FTAG, RW_READER);
3422
3423 for (int d = 0; d < ndvas; d++)
3424 metaslab_free_dva(spa, &dva[d], txg, now);
3425
3426 spa_config_exit(spa, SCL_FREE, FTAG);
3427 }
3428
3429 int
3430 metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg)
3431 {
3432 const dva_t *dva = bp->blk_dva;
3433 int ndvas = BP_GET_NDVAS(bp);
3434 int error = 0;
3435
3436 ASSERT(!BP_IS_HOLE(bp));
3437
3438 if (txg != 0) {
3439 /*
3440 * First do a dry run to make sure all DVAs are claimable,
3441 * so we don't have to unwind from partial failures below.
3442 */
3443 if ((error = metaslab_claim(spa, bp, 0)) != 0)
3444 return (error);
3445 }
3446
3447 spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
3448
3449 for (int d = 0; d < ndvas; d++)
3450 if ((error = metaslab_claim_dva(spa, &dva[d], txg)) != 0)
3451 break;
3452
3453 spa_config_exit(spa, SCL_ALLOC, FTAG);
3454
3455 ASSERT(error == 0 || txg == 0);
3456
3457 return (error);
3458 }
3459
3460 void
3461 metaslab_fastwrite_mark(spa_t *spa, const blkptr_t *bp)
3462 {
3463 const dva_t *dva = bp->blk_dva;
3464 int ndvas = BP_GET_NDVAS(bp);
3465 uint64_t psize = BP_GET_PSIZE(bp);
3466 int d;
3467 vdev_t *vd;
3468
3469 ASSERT(!BP_IS_HOLE(bp));
3470 ASSERT(!BP_IS_EMBEDDED(bp));
3471 ASSERT(psize > 0);
3472
3473 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
3474
3475 for (d = 0; d < ndvas; d++) {
3476 if ((vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d]))) == NULL)
3477 continue;
3478 atomic_add_64(&vd->vdev_pending_fastwrite, psize);
3479 }
3480
3481 spa_config_exit(spa, SCL_VDEV, FTAG);
3482 }
3483
3484 void
3485 metaslab_fastwrite_unmark(spa_t *spa, const blkptr_t *bp)
3486 {
3487 const dva_t *dva = bp->blk_dva;
3488 int ndvas = BP_GET_NDVAS(bp);
3489 uint64_t psize = BP_GET_PSIZE(bp);
3490 int d;
3491 vdev_t *vd;
3492
3493 ASSERT(!BP_IS_HOLE(bp));
3494 ASSERT(!BP_IS_EMBEDDED(bp));
3495 ASSERT(psize > 0);
3496
3497 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
3498
3499 for (d = 0; d < ndvas; d++) {
3500 if ((vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d]))) == NULL)
3501 continue;
3502 ASSERT3U(vd->vdev_pending_fastwrite, >=, psize);
3503 atomic_sub_64(&vd->vdev_pending_fastwrite, psize);
3504 }
3505
3506 spa_config_exit(spa, SCL_VDEV, FTAG);
3507 }
3508
3509 void
3510 metaslab_check_free(spa_t *spa, const blkptr_t *bp)
3511 {
3512 if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0)
3513 return;
3514
3515 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
3516 for (int i = 0; i < BP_GET_NDVAS(bp); i++) {
3517 uint64_t vdev = DVA_GET_VDEV(&bp->blk_dva[i]);
3518 vdev_t *vd = vdev_lookup_top(spa, vdev);
3519 uint64_t offset = DVA_GET_OFFSET(&bp->blk_dva[i]);
3520 uint64_t size = DVA_GET_ASIZE(&bp->blk_dva[i]);
3521 metaslab_t *msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
3522
3523 if (msp->ms_loaded)
3524 range_tree_verify(msp->ms_tree, offset, size);
3525
3526 range_tree_verify(msp->ms_freeingtree, offset, size);
3527 range_tree_verify(msp->ms_freedtree, offset, size);
3528 for (int j = 0; j < TXG_DEFER_SIZE; j++)
3529 range_tree_verify(msp->ms_defertree[j], offset, size);
3530 }
3531 spa_config_exit(spa, SCL_VDEV, FTAG);
3532 }
3533
3534 #if defined(_KERNEL) && defined(HAVE_SPL)
3535 /* CSTYLED */
3536 module_param(metaslab_aliquot, ulong, 0644);
3537 MODULE_PARM_DESC(metaslab_aliquot,
3538 "allocation granularity (a.k.a. stripe size)");
3539
3540 module_param(metaslab_debug_load, int, 0644);
3541 MODULE_PARM_DESC(metaslab_debug_load,
3542 "load all metaslabs when pool is first opened");
3543
3544 module_param(metaslab_debug_unload, int, 0644);
3545 MODULE_PARM_DESC(metaslab_debug_unload,
3546 "prevent metaslabs from being unloaded");
3547
3548 module_param(metaslab_preload_enabled, int, 0644);
3549 MODULE_PARM_DESC(metaslab_preload_enabled,
3550 "preload potential metaslabs during reassessment");
3551
3552 module_param(zfs_mg_noalloc_threshold, int, 0644);
3553 MODULE_PARM_DESC(zfs_mg_noalloc_threshold,
3554 "percentage of free space for metaslab group to allow allocation");
3555
3556 module_param(zfs_mg_fragmentation_threshold, int, 0644);
3557 MODULE_PARM_DESC(zfs_mg_fragmentation_threshold,
3558 "fragmentation for metaslab group to allow allocation");
3559
3560 module_param(zfs_metaslab_fragmentation_threshold, int, 0644);
3561 MODULE_PARM_DESC(zfs_metaslab_fragmentation_threshold,
3562 "fragmentation for metaslab to allow allocation");
3563
3564 module_param(metaslab_fragmentation_factor_enabled, int, 0644);
3565 MODULE_PARM_DESC(metaslab_fragmentation_factor_enabled,
3566 "use the fragmentation metric to prefer less fragmented metaslabs");
3567
3568 module_param(metaslab_lba_weighting_enabled, int, 0644);
3569 MODULE_PARM_DESC(metaslab_lba_weighting_enabled,
3570 "prefer metaslabs with lower LBAs");
3571
3572 module_param(metaslab_bias_enabled, int, 0644);
3573 MODULE_PARM_DESC(metaslab_bias_enabled,
3574 "enable metaslab group biasing");
3575
3576 module_param(zfs_metaslab_segment_weight_enabled, int, 0644);
3577 MODULE_PARM_DESC(zfs_metaslab_segment_weight_enabled,
3578 "enable segment-based metaslab selection");
3579
3580 module_param(zfs_metaslab_switch_threshold, int, 0644);
3581 MODULE_PARM_DESC(zfs_metaslab_switch_threshold,
3582 "segment-based metaslab selection maximum buckets before switching");
3583 #endif /* _KERNEL && HAVE_SPL */