module/zfs/metaslab.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright (c) 2011, 2016 by Delphix. All rights reserved.
  24  * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
  25  */
  26
  27 #include <sys/zfs_context.h>
  28 #include <sys/dmu.h>
  29 #include <sys/dmu_tx.h>
  30 #include <sys/space_map.h>
  31 #include <sys/metaslab_impl.h>
  32 #include <sys/vdev_impl.h>
  33 #include <sys/zio.h>
  34 #include <sys/spa_impl.h>
  35 #include <sys/zfeature.h>
  36 #include <sys/vdev_indirect_mapping.h>
  37
  38 #define WITH_DF_BLOCK_ALLOCATOR
  39
  40 #define GANG_ALLOCATION(flags) \
  41         ((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER))
  42
  43 /*
  44  * Metaslab granularity, in bytes. This is roughly similar to what would be
  45  * referred to as the "stripe size" in traditional RAID arrays. In normal
  46  * operation, we will try to write this amount of data to a top-level vdev
  47  * before moving on to the next one.
  48  */
  49 unsigned long metaslab_aliquot = 512 << 10;
  50
  51 /*
  52  * For testing, make some blocks above a certain size be gang blocks.
  53  */
  54 unsigned long metaslab_force_ganging = SPA_MAXBLOCKSIZE + 1;
  55
  56 /*
  57  * The in-core space map representation is more compact than its on-disk form.
  58  * The zfs_condense_pct determines how much more compact the in-core
  59  * space map representation must be before we compact it on-disk.
  60  * Values should be greater than or equal to 100.
  61  */
  62 int zfs_condense_pct = 200;
  63
  64 /*
  65  * Condensing a metaslab is not guaranteed to actually reduce the amount of
  66  * space used on disk. In particular, a space map uses data in increments of
  67  * MAX(1 << ashift, space_map_blksz), so a metaslab might use the
  68  * same number of blocks after condensing. Since the goal of condensing is to
  69  * reduce the number of IOPs required to read the space map, we only want to
  70  * condense when we can be sure we will reduce the number of blocks used by the
  71  * space map. Unfortunately, we cannot precisely compute whether or not this is
  72  * the case in metaslab_should_condense since we are holding ms_lock. Instead,
  73  * we apply the following heuristic: do not condense a spacemap unless the
  74  * uncondensed size consumes greater than zfs_metaslab_condense_block_threshold
  75  * blocks.
  76  */
  77 int zfs_metaslab_condense_block_threshold = 4;
  78
  79 /*
  80  * The zfs_mg_noalloc_threshold defines which metaslab groups should
  81  * be eligible for allocation. The value is defined as a percentage of
  82  * free space. Metaslab groups that have more free space than
  83  * zfs_mg_noalloc_threshold are always eligible for allocations. Once
  84  * a metaslab group's free space is less than or equal to the
  85  * zfs_mg_noalloc_threshold the allocator will avoid allocating to that
  86  * group unless all groups in the pool have reached zfs_mg_noalloc_threshold.
  87  * Once all groups in the pool reach zfs_mg_noalloc_threshold then all
  88  * groups are allowed to accept allocations. Gang blocks are always
  89  * eligible to allocate on any metaslab group. The default value of 0 means
  90  * no metaslab group will be excluded based on this criterion.
  91  */
  92 int zfs_mg_noalloc_threshold = 0;
  93
  94 /*
  95  * Metaslab groups are considered eligible for allocations if their
  96  * fragmentation metric (measured as a percentage) is less than or equal to
  97  * zfs_mg_fragmentation_threshold. If a metaslab group exceeds this threshold
  98  * then it will be skipped unless all metaslab groups within the metaslab
  99  * class have also crossed this threshold.
 100  */
 101 int zfs_mg_fragmentation_threshold = 85;
 102
 103 /*
 104  * Allow metaslabs to keep their active state as long as their fragmentation
 105  * percentage is less than or equal to zfs_metaslab_fragmentation_threshold. An
 106  * active metaslab that exceeds this threshold will no longer keep its active
 107  * status allowing better metaslabs to be selected.
 108  */
 109 int zfs_metaslab_fragmentation_threshold = 70;
 110
 111 /*
 112  * When set will load all metaslabs when pool is first opened.
 113  */
 114 int metaslab_debug_load = 0;
 115
 116 /*
 117  * When set will prevent metaslabs from being unloaded.
 118  */
 119 int metaslab_debug_unload = 0;
 120
 121 /*
 122  * Minimum size which forces the dynamic allocator to change
 123  * its allocation strategy.  Once the space map cannot satisfy
 124  * an allocation of this size then it switches to using a more
 125  * aggressive strategy (i.e. search by size rather than offset).
 126  */
 127 uint64_t metaslab_df_alloc_threshold = SPA_OLD_MAXBLOCKSIZE;
 128
 129 /*
 130  * The minimum free space, in percent, which must be available
 131  * in a space map to continue allocations in a first-fit fashion.
 132  * Once the space map's free space drops below this level we dynamically
 133  * switch to using best-fit allocations.
 134  */
 135 int metaslab_df_free_pct = 4;
 136
 137 /*
 138  * Percentage of all cpus that can be used by the metaslab taskq.
 139  */
 140 int metaslab_load_pct = 50;
 141
 142 /*
 143  * Determines how many txgs a metaslab may remain loaded without having any
 144  * allocations from it. As long as a metaslab continues to be used we will
 145  * keep it loaded.
 146  */
 147 int metaslab_unload_delay = TXG_SIZE * 2;
 148
 149 /*
 150  * Max number of metaslabs per group to preload.
 151  */
 152 int metaslab_preload_limit = SPA_DVAS_PER_BP;
 153
 154 /*
 155  * Enable/disable preloading of metaslab.
 156  */
 157 int metaslab_preload_enabled = B_TRUE;
 158
 159 /*
 160  * Enable/disable fragmentation weighting on metaslabs.
 161  */
 162 int metaslab_fragmentation_factor_enabled = B_TRUE;
 163
 164 /*
 165  * Enable/disable lba weighting (i.e. outer tracks are given preference).
 166  */
 167 int metaslab_lba_weighting_enabled = B_TRUE;
 168
 169 /*
 170  * Enable/disable metaslab group biasing.
 171  */
 172 int metaslab_bias_enabled = B_TRUE;
 173
 174
 175 /*
 176  * Enable/disable remapping of indirect DVAs to their concrete vdevs.
 177  */
 178 boolean_t zfs_remap_blkptr_enable = B_TRUE;
 179
 180 /*
 181  * Enable/disable segment-based metaslab selection.
 182  */
 183 int zfs_metaslab_segment_weight_enabled = B_TRUE;
 184
 185 /*
 186  * When using segment-based metaslab selection, we will continue
 187  * allocating from the active metaslab until we have exhausted
 188  * zfs_metaslab_switch_threshold of its buckets.
 189  */
 190 int zfs_metaslab_switch_threshold = 2;
 191
 192 /*
 193  * Internal switch to enable/disable the metaslab allocation tracing
 194  * facility.
 195  */
 196 #ifdef _METASLAB_TRACING
 197 boolean_t metaslab_trace_enabled = B_TRUE;
 198 #endif
 199
 200 /*
 201  * Maximum entries that the metaslab allocation tracing facility will keep
 202  * in a given list when running in non-debug mode. We limit the number
 203  * of entries in non-debug mode to prevent us from using up too much memory.
 204  * The limit should be sufficiently large that we don't expect any allocation
 205  * to ever exceed this value. In debug mode, the system will panic if this
 206  * limit is ever reached allowing for further investigation.
 207  */
 208 #ifdef _METASLAB_TRACING
 209 uint64_t metaslab_trace_max_entries = 5000;
 210 #endif
 211
 212 static uint64_t metaslab_weight(metaslab_t *);
 213 static void metaslab_set_fragmentation(metaslab_t *);
 214 static void metaslab_free_impl(vdev_t *, uint64_t, uint64_t, uint64_t);
 215 static void metaslab_check_free_impl(vdev_t *, uint64_t, uint64_t);
 216
 217 #ifdef _METASLAB_TRACING
 218 kmem_cache_t *metaslab_alloc_trace_cache;
 219 #endif
 220
 221 /*
 222  * ==========================================================================
 223  * Metaslab classes
 224  * ==========================================================================
 225  */
 226 metaslab_class_t *
 227 metaslab_class_create(spa_t *spa, metaslab_ops_t *ops)
 228 {
 229         metaslab_class_t *mc;
 230
 231         mc = kmem_zalloc(sizeof (metaslab_class_t), KM_SLEEP);
 232
 233         mc->mc_spa = spa;
 234         mc->mc_rotor = NULL;
 235         mc->mc_ops = ops;
 236         mutex_init(&mc->mc_lock, NULL, MUTEX_DEFAULT, NULL);
 237         refcount_create_tracked(&mc->mc_alloc_slots);
 238
 239         return (mc);
 240 }
 241
 242 void
 243 metaslab_class_destroy(metaslab_class_t *mc)
 244 {
 245         ASSERT(mc->mc_rotor == NULL);
 246         ASSERT(mc->mc_alloc == 0);
 247         ASSERT(mc->mc_deferred == 0);
 248         ASSERT(mc->mc_space == 0);
 249         ASSERT(mc->mc_dspace == 0);
 250
 251         refcount_destroy(&mc->mc_alloc_slots);
 252         mutex_destroy(&mc->mc_lock);
 253         kmem_free(mc, sizeof (metaslab_class_t));
 254 }
 255
 256 int
 257 metaslab_class_validate(metaslab_class_t *mc)
 258 {
 259         metaslab_group_t *mg;
 260         vdev_t *vd;
 261
 262         /*
 263          * Must hold one of the spa_config locks.
 264          */
 265         ASSERT(spa_config_held(mc->mc_spa, SCL_ALL, RW_READER) ||
 266             spa_config_held(mc->mc_spa, SCL_ALL, RW_WRITER));
 267
 268         if ((mg = mc->mc_rotor) == NULL)
 269                 return (0);
 270
 271         do {
 272                 vd = mg->mg_vd;
 273                 ASSERT(vd->vdev_mg != NULL);
 274                 ASSERT3P(vd->vdev_top, ==, vd);
 275                 ASSERT3P(mg->mg_class, ==, mc);
 276                 ASSERT3P(vd->vdev_ops, !=, &vdev_hole_ops);
 277         } while ((mg = mg->mg_next) != mc->mc_rotor);
 278
 279         return (0);
 280 }
 281
 282 void
 283 metaslab_class_space_update(metaslab_class_t *mc, int64_t alloc_delta,
 284     int64_t defer_delta, int64_t space_delta, int64_t dspace_delta)
 285 {
 286         atomic_add_64(&mc->mc_alloc, alloc_delta);
 287         atomic_add_64(&mc->mc_deferred, defer_delta);
 288         atomic_add_64(&mc->mc_space, space_delta);
 289         atomic_add_64(&mc->mc_dspace, dspace_delta);
 290 }
 291
 292 uint64_t
 293 metaslab_class_get_alloc(metaslab_class_t *mc)
 294 {
 295         return (mc->mc_alloc);
 296 }
 297
 298 uint64_t
 299 metaslab_class_get_deferred(metaslab_class_t *mc)
 300 {
 301         return (mc->mc_deferred);
 302 }
 303
 304 uint64_t
 305 metaslab_class_get_space(metaslab_class_t *mc)
 306 {
 307         return (mc->mc_space);
 308 }
 309
 310 uint64_t
 311 metaslab_class_get_dspace(metaslab_class_t *mc)
 312 {
 313         return (spa_deflate(mc->mc_spa) ? mc->mc_dspace : mc->mc_space);
 314 }
 315
 316 void
 317 metaslab_class_histogram_verify(metaslab_class_t *mc)
 318 {
 319         vdev_t *rvd = mc->mc_spa->spa_root_vdev;
 320         uint64_t *mc_hist;
 321         int i;
 322
 323         if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0)
 324                 return;
 325
 326         mc_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE,
 327             KM_SLEEP);
 328
 329         for (int c = 0; c < rvd->vdev_children; c++) {
 330                 vdev_t *tvd = rvd->vdev_child[c];
 331                 metaslab_group_t *mg = tvd->vdev_mg;
 332
 333                 /*
 334                  * Skip any holes, uninitialized top-levels, or
 335                  * vdevs that are not in this metalab class.
 336                  */
 337                 if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 ||
 338                     mg->mg_class != mc) {
 339                         continue;
 340                 }
 341
 342                 for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
 343                         mc_hist[i] += mg->mg_histogram[i];
 344         }
 345
 346         for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
 347                 VERIFY3U(mc_hist[i], ==, mc->mc_histogram[i]);
 348
 349         kmem_free(mc_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE);
 350 }
 351
 352 /*
 353  * Calculate the metaslab class's fragmentation metric. The metric
 354  * is weighted based on the space contribution of each metaslab group.
 355  * The return value will be a number between 0 and 100 (inclusive), or
 356  * ZFS_FRAG_INVALID if the metric has not been set. See comment above the
 357  * zfs_frag_table for more information about the metric.
 358  */
 359 uint64_t
 360 metaslab_class_fragmentation(metaslab_class_t *mc)
 361 {
 362         vdev_t *rvd = mc->mc_spa->spa_root_vdev;
 363         uint64_t fragmentation = 0;
 364
 365         spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER);
 366
 367         for (int c = 0; c < rvd->vdev_children; c++) {
 368                 vdev_t *tvd = rvd->vdev_child[c];
 369                 metaslab_group_t *mg = tvd->vdev_mg;
 370
 371                 /*
 372                  * Skip any holes, uninitialized top-levels,
 373                  * or vdevs that are not in this metalab class.
 374                  */
 375                 if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 ||
 376                     mg->mg_class != mc) {
 377                         continue;
 378                 }
 379
 380                 /*
 381                  * If a metaslab group does not contain a fragmentation
 382                  * metric then just bail out.
 383                  */
 384                 if (mg->mg_fragmentation == ZFS_FRAG_INVALID) {
 385                         spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
 386                         return (ZFS_FRAG_INVALID);
 387                 }
 388
 389                 /*
 390                  * Determine how much this metaslab_group is contributing
 391                  * to the overall pool fragmentation metric.
 392                  */
 393                 fragmentation += mg->mg_fragmentation *
 394                     metaslab_group_get_space(mg);
 395         }
 396         fragmentation /= metaslab_class_get_space(mc);
 397
 398         ASSERT3U(fragmentation, <=, 100);
 399         spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
 400         return (fragmentation);
 401 }
 402
 403 /*
 404  * Calculate the amount of expandable space that is available in
 405  * this metaslab class. If a device is expanded then its expandable
 406  * space will be the amount of allocatable space that is currently not
 407  * part of this metaslab class.
 408  */
 409 uint64_t
 410 metaslab_class_expandable_space(metaslab_class_t *mc)
 411 {
 412         vdev_t *rvd = mc->mc_spa->spa_root_vdev;
 413         uint64_t space = 0;
 414
 415         spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER);
 416         for (int c = 0; c < rvd->vdev_children; c++) {
 417                 vdev_t *tvd = rvd->vdev_child[c];
 418                 metaslab_group_t *mg = tvd->vdev_mg;
 419
 420                 if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 ||
 421                     mg->mg_class != mc) {
 422                         continue;
 423                 }
 424
 425                 /*
 426                  * Calculate if we have enough space to add additional
 427                  * metaslabs. We report the expandable space in terms
 428                  * of the metaslab size since that's the unit of expansion.
 429                  */
 430                 space += P2ALIGN(tvd->vdev_max_asize - tvd->vdev_asize,
 431                     1ULL << tvd->vdev_ms_shift);
 432         }
 433         spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
 434         return (space);
 435 }
 436
 437 static int
 438 metaslab_compare(const void *x1, const void *x2)
 439 {
 440         const metaslab_t *m1 = (const metaslab_t *)x1;
 441         const metaslab_t *m2 = (const metaslab_t *)x2;
 442
 443         int cmp = AVL_CMP(m2->ms_weight, m1->ms_weight);
 444         if (likely(cmp))
 445                 return (cmp);
 446
 447         IMPLY(AVL_CMP(m1->ms_start, m2->ms_start) == 0, m1 == m2);
 448
 449         return (AVL_CMP(m1->ms_start, m2->ms_start));
 450 }
 451
 452 /*
 453  * Verify that the space accounting on disk matches the in-core range_trees.
 454  */
 455 void
 456 metaslab_verify_space(metaslab_t *msp, uint64_t txg)
 457 {
 458         spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
 459         uint64_t allocated = 0;
 460         uint64_t sm_free_space, msp_free_space;
 461
 462         ASSERT(MUTEX_HELD(&msp->ms_lock));
 463
 464         if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0)
 465                 return;
 466
 467         /*
 468          * We can only verify the metaslab space when we're called
 469          * from syncing context with a loaded metaslab that has an allocated
 470          * space map. Calling this in non-syncing context does not
 471          * provide a consistent view of the metaslab since we're performing
 472          * allocations in the future.
 473          */
 474         if (txg != spa_syncing_txg(spa) || msp->ms_sm == NULL ||
 475             !msp->ms_loaded)
 476                 return;
 477
 478         sm_free_space = msp->ms_size - space_map_allocated(msp->ms_sm) -
 479             space_map_alloc_delta(msp->ms_sm);
 480
 481         /*
 482          * Account for future allocations since we would have already
 483          * deducted that space from the ms_freetree.
 484          */
 485         for (int t = 0; t < TXG_CONCURRENT_STATES; t++) {
 486                 allocated +=
 487                     range_tree_space(msp->ms_alloctree[(txg + t) & TXG_MASK]);
 488         }
 489
 490         msp_free_space = range_tree_space(msp->ms_tree) + allocated +
 491             msp->ms_deferspace + range_tree_space(msp->ms_freedtree);
 492
 493         VERIFY3U(sm_free_space, ==, msp_free_space);
 494 }
 495
 496 /*
 497  * ==========================================================================
 498  * Metaslab groups
 499  * ==========================================================================
 500  */
 501 /*
 502  * Update the allocatable flag and the metaslab group's capacity.
 503  * The allocatable flag is set to true if the capacity is below
 504  * the zfs_mg_noalloc_threshold or has a fragmentation value that is
 505  * greater than zfs_mg_fragmentation_threshold. If a metaslab group
 506  * transitions from allocatable to non-allocatable or vice versa then the
 507  * metaslab group's class is updated to reflect the transition.
 508  */
 509 static void
 510 metaslab_group_alloc_update(metaslab_group_t *mg)
 511 {
 512         vdev_t *vd = mg->mg_vd;
 513         metaslab_class_t *mc = mg->mg_class;
 514         vdev_stat_t *vs = &vd->vdev_stat;
 515         boolean_t was_allocatable;
 516         boolean_t was_initialized;
 517
 518         ASSERT(vd == vd->vdev_top);
 519         ASSERT3U(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_READER), ==,
 520             SCL_ALLOC);
 521
 522         mutex_enter(&mg->mg_lock);
 523         was_allocatable = mg->mg_allocatable;
 524         was_initialized = mg->mg_initialized;
 525
 526         mg->mg_free_capacity = ((vs->vs_space - vs->vs_alloc) * 100) /
 527             (vs->vs_space + 1);
 528
 529         mutex_enter(&mc->mc_lock);
 530
 531         /*
 532          * If the metaslab group was just added then it won't
 533          * have any space until we finish syncing out this txg.
 534          * At that point we will consider it initialized and available
 535          * for allocations.  We also don't consider non-activated
 536          * metaslab groups (e.g. vdevs that are in the middle of being removed)
 537          * to be initialized, because they can't be used for allocation.
 538          */
 539         mg->mg_initialized = metaslab_group_initialized(mg);
 540         if (!was_initialized && mg->mg_initialized) {
 541                 mc->mc_groups++;
 542         } else if (was_initialized && !mg->mg_initialized) {
 543                 ASSERT3U(mc->mc_groups, >, 0);
 544                 mc->mc_groups--;
 545         }
 546         if (mg->mg_initialized)
 547                 mg->mg_no_free_space = B_FALSE;
 548
 549         /*
 550          * A metaslab group is considered allocatable if it has plenty
 551          * of free space or is not heavily fragmented. We only take
 552          * fragmentation into account if the metaslab group has a valid
 553          * fragmentation metric (i.e. a value between 0 and 100).
 554          */
 555         mg->mg_allocatable = (mg->mg_activation_count > 0 &&
 556             mg->mg_free_capacity > zfs_mg_noalloc_threshold &&
 557             (mg->mg_fragmentation == ZFS_FRAG_INVALID ||
 558             mg->mg_fragmentation <= zfs_mg_fragmentation_threshold));
 559
 560         /*
 561          * The mc_alloc_groups maintains a count of the number of
 562          * groups in this metaslab class that are still above the
 563          * zfs_mg_noalloc_threshold. This is used by the allocating
 564          * threads to determine if they should avoid allocations to
 565          * a given group. The allocator will avoid allocations to a group
 566          * if that group has reached or is below the zfs_mg_noalloc_threshold
 567          * and there are still other groups that are above the threshold.
 568          * When a group transitions from allocatable to non-allocatable or
 569          * vice versa we update the metaslab class to reflect that change.
 570          * When the mc_alloc_groups value drops to 0 that means that all
 571          * groups have reached the zfs_mg_noalloc_threshold making all groups
 572          * eligible for allocations. This effectively means that all devices
 573          * are balanced again.
 574          */
 575         if (was_allocatable && !mg->mg_allocatable)
 576                 mc->mc_alloc_groups--;
 577         else if (!was_allocatable && mg->mg_allocatable)
 578                 mc->mc_alloc_groups++;
 579         mutex_exit(&mc->mc_lock);
 580
 581         mutex_exit(&mg->mg_lock);
 582 }
 583
 584 metaslab_group_t *
 585 metaslab_group_create(metaslab_class_t *mc, vdev_t *vd)
 586 {
 587         metaslab_group_t *mg;
 588
 589         mg = kmem_zalloc(sizeof (metaslab_group_t), KM_SLEEP);
 590         mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL);
 591         avl_create(&mg->mg_metaslab_tree, metaslab_compare,
 592             sizeof (metaslab_t), offsetof(struct metaslab, ms_group_node));
 593         mg->mg_vd = vd;
 594         mg->mg_class = mc;
 595         mg->mg_activation_count = 0;
 596         mg->mg_initialized = B_FALSE;
 597         mg->mg_no_free_space = B_TRUE;
 598         refcount_create_tracked(&mg->mg_alloc_queue_depth);
 599
 600         mg->mg_taskq = taskq_create("metaslab_group_taskq", metaslab_load_pct,
 601             maxclsyspri, 10, INT_MAX, TASKQ_THREADS_CPU_PCT | TASKQ_DYNAMIC);
 602
 603         return (mg);
 604 }
 605
 606 void
 607 metaslab_group_destroy(metaslab_group_t *mg)
 608 {
 609         ASSERT(mg->mg_prev == NULL);
 610         ASSERT(mg->mg_next == NULL);
 611         /*
 612          * We may have gone below zero with the activation count
 613          * either because we never activated in the first place or
 614          * because we're done, and possibly removing the vdev.
 615          */
 616         ASSERT(mg->mg_activation_count <= 0);
 617
 618         taskq_destroy(mg->mg_taskq);
 619         avl_destroy(&mg->mg_metaslab_tree);
 620         mutex_destroy(&mg->mg_lock);
 621         refcount_destroy(&mg->mg_alloc_queue_depth);
 622         kmem_free(mg, sizeof (metaslab_group_t));
 623 }
 624
 625 void
 626 metaslab_group_activate(metaslab_group_t *mg)
 627 {
 628         metaslab_class_t *mc = mg->mg_class;
 629         metaslab_group_t *mgprev, *mgnext;
 630
 631         ASSERT3U(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER), !=, 0);
 632
 633         ASSERT(mc->mc_rotor != mg);
 634         ASSERT(mg->mg_prev == NULL);
 635         ASSERT(mg->mg_next == NULL);
 636         ASSERT(mg->mg_activation_count <= 0);
 637
 638         if (++mg->mg_activation_count <= 0)
 639                 return;
 640
 641         mg->mg_aliquot = metaslab_aliquot * MAX(1, mg->mg_vd->vdev_children);
 642         metaslab_group_alloc_update(mg);
 643
 644         if ((mgprev = mc->mc_rotor) == NULL) {
 645                 mg->mg_prev = mg;
 646                 mg->mg_next = mg;
 647         } else {
 648                 mgnext = mgprev->mg_next;
 649                 mg->mg_prev = mgprev;
 650                 mg->mg_next = mgnext;
 651                 mgprev->mg_next = mg;
 652                 mgnext->mg_prev = mg;
 653         }
 654         mc->mc_rotor = mg;
 655 }
 656
 657 /*
 658  * Passivate a metaslab group and remove it from the allocation rotor.
 659  * Callers must hold both the SCL_ALLOC and SCL_ZIO lock prior to passivating
 660  * a metaslab group. This function will momentarily drop spa_config_locks
 661  * that are lower than the SCL_ALLOC lock (see comment below).
 662  */
 663 void
 664 metaslab_group_passivate(metaslab_group_t *mg)
 665 {
 666         metaslab_class_t *mc = mg->mg_class;
 667         spa_t *spa = mc->mc_spa;
 668         metaslab_group_t *mgprev, *mgnext;
 669         int locks = spa_config_held(spa, SCL_ALL, RW_WRITER);
 670
 671         ASSERT3U(spa_config_held(spa, SCL_ALLOC | SCL_ZIO, RW_WRITER), ==,
 672             (SCL_ALLOC | SCL_ZIO));
 673
 674         if (--mg->mg_activation_count != 0) {
 675                 ASSERT(mc->mc_rotor != mg);
 676                 ASSERT(mg->mg_prev == NULL);
 677                 ASSERT(mg->mg_next == NULL);
 678                 ASSERT(mg->mg_activation_count < 0);
 679                 return;
 680         }
 681
 682         /*
 683          * The spa_config_lock is an array of rwlocks, ordered as
 684          * follows (from highest to lowest):
 685          *      SCL_CONFIG > SCL_STATE > SCL_L2ARC > SCL_ALLOC >
 686          *      SCL_ZIO > SCL_FREE > SCL_VDEV
 687          * (For more information about the spa_config_lock see spa_misc.c)
 688          * The higher the lock, the broader its coverage. When we passivate
 689          * a metaslab group, we must hold both the SCL_ALLOC and the SCL_ZIO
 690          * config locks. However, the metaslab group's taskq might be trying
 691          * to preload metaslabs so we must drop the SCL_ZIO lock and any
 692          * lower locks to allow the I/O to complete. At a minimum,
 693          * we continue to hold the SCL_ALLOC lock, which prevents any future
 694          * allocations from taking place and any changes to the vdev tree.
 695          */
 696         spa_config_exit(spa, locks & ~(SCL_ZIO - 1), spa);
 697         taskq_wait_outstanding(mg->mg_taskq, 0);
 698         spa_config_enter(spa, locks & ~(SCL_ZIO - 1), spa, RW_WRITER);
 699         metaslab_group_alloc_update(mg);
 700
 701         mgprev = mg->mg_prev;
 702         mgnext = mg->mg_next;
 703
 704         if (mg == mgnext) {
 705                 mc->mc_rotor = NULL;
 706         } else {
 707                 mc->mc_rotor = mgnext;
 708                 mgprev->mg_next = mgnext;
 709                 mgnext->mg_prev = mgprev;
 710         }
 711
 712         mg->mg_prev = NULL;
 713         mg->mg_next = NULL;
 714 }
 715
 716 boolean_t
 717 metaslab_group_initialized(metaslab_group_t *mg)
 718 {
 719         vdev_t *vd = mg->mg_vd;
 720         vdev_stat_t *vs = &vd->vdev_stat;
 721
 722         return (vs->vs_space != 0 && mg->mg_activation_count > 0);
 723 }
 724
 725 uint64_t
 726 metaslab_group_get_space(metaslab_group_t *mg)
 727 {
 728         return ((1ULL << mg->mg_vd->vdev_ms_shift) * mg->mg_vd->vdev_ms_count);
 729 }
 730
 731 void
 732 metaslab_group_histogram_verify(metaslab_group_t *mg)
 733 {
 734         uint64_t *mg_hist;
 735         vdev_t *vd = mg->mg_vd;
 736         uint64_t ashift = vd->vdev_ashift;
 737         int i;
 738
 739         if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0)
 740                 return;
 741
 742         mg_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE,
 743             KM_SLEEP);
 744
 745         ASSERT3U(RANGE_TREE_HISTOGRAM_SIZE, >=,
 746             SPACE_MAP_HISTOGRAM_SIZE + ashift);
 747
 748         for (int m = 0; m < vd->vdev_ms_count; m++) {
 749                 metaslab_t *msp = vd->vdev_ms[m];
 750
 751                 if (msp->ms_sm == NULL)
 752                         continue;
 753
 754                 for (i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++)
 755                         mg_hist[i + ashift] +=
 756                             msp->ms_sm->sm_phys->smp_histogram[i];
 757         }
 758
 759         for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i ++)
 760                 VERIFY3U(mg_hist[i], ==, mg->mg_histogram[i]);
 761
 762         kmem_free(mg_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE);
 763 }
 764
 765 static void
 766 metaslab_group_histogram_add(metaslab_group_t *mg, metaslab_t *msp)
 767 {
 768         metaslab_class_t *mc = mg->mg_class;
 769         uint64_t ashift = mg->mg_vd->vdev_ashift;
 770
 771         ASSERT(MUTEX_HELD(&msp->ms_lock));
 772         if (msp->ms_sm == NULL)
 773                 return;
 774
 775         mutex_enter(&mg->mg_lock);
 776         for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
 777                 mg->mg_histogram[i + ashift] +=
 778                     msp->ms_sm->sm_phys->smp_histogram[i];
 779                 mc->mc_histogram[i + ashift] +=
 780                     msp->ms_sm->sm_phys->smp_histogram[i];
 781         }
 782         mutex_exit(&mg->mg_lock);
 783 }
 784
 785 void
 786 metaslab_group_histogram_remove(metaslab_group_t *mg, metaslab_t *msp)
 787 {
 788         metaslab_class_t *mc = mg->mg_class;
 789         uint64_t ashift = mg->mg_vd->vdev_ashift;
 790
 791         ASSERT(MUTEX_HELD(&msp->ms_lock));
 792         if (msp->ms_sm == NULL)
 793                 return;
 794
 795         mutex_enter(&mg->mg_lock);
 796         for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
 797                 ASSERT3U(mg->mg_histogram[i + ashift], >=,
 798                     msp->ms_sm->sm_phys->smp_histogram[i]);
 799                 ASSERT3U(mc->mc_histogram[i + ashift], >=,
 800                     msp->ms_sm->sm_phys->smp_histogram[i]);
 801
 802                 mg->mg_histogram[i + ashift] -=
 803                     msp->ms_sm->sm_phys->smp_histogram[i];
 804                 mc->mc_histogram[i + ashift] -=
 805                     msp->ms_sm->sm_phys->smp_histogram[i];
 806         }
 807         mutex_exit(&mg->mg_lock);
 808 }
 809
 810 static void
 811 metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp)
 812 {
 813         ASSERT(msp->ms_group == NULL);
 814         mutex_enter(&mg->mg_lock);
 815         msp->ms_group = mg;
 816         msp->ms_weight = 0;
 817         avl_add(&mg->mg_metaslab_tree, msp);
 818         mutex_exit(&mg->mg_lock);
 819
 820         mutex_enter(&msp->ms_lock);
 821         metaslab_group_histogram_add(mg, msp);
 822         mutex_exit(&msp->ms_lock);
 823 }
 824
 825 static void
 826 metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp)
 827 {
 828         mutex_enter(&msp->ms_lock);
 829         metaslab_group_histogram_remove(mg, msp);
 830         mutex_exit(&msp->ms_lock);
 831
 832         mutex_enter(&mg->mg_lock);
 833         ASSERT(msp->ms_group == mg);
 834         avl_remove(&mg->mg_metaslab_tree, msp);
 835         msp->ms_group = NULL;
 836         mutex_exit(&mg->mg_lock);
 837 }
 838
 839 static void
 840 metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
 841 {
 842         /*
 843          * Although in principle the weight can be any value, in
 844          * practice we do not use values in the range [1, 511].
 845          */
 846         ASSERT(weight >= SPA_MINBLOCKSIZE || weight == 0);
 847         ASSERT(MUTEX_HELD(&msp->ms_lock));
 848
 849         mutex_enter(&mg->mg_lock);
 850         ASSERT(msp->ms_group == mg);
 851         avl_remove(&mg->mg_metaslab_tree, msp);
 852         msp->ms_weight = weight;
 853         avl_add(&mg->mg_metaslab_tree, msp);
 854         mutex_exit(&mg->mg_lock);
 855 }
 856
 857 /*
 858  * Calculate the fragmentation for a given metaslab group. We can use
 859  * a simple average here since all metaslabs within the group must have
 860  * the same size. The return value will be a value between 0 and 100
 861  * (inclusive), or ZFS_FRAG_INVALID if less than half of the metaslab in this
 862  * group have a fragmentation metric.
 863  */
 864 uint64_t
 865 metaslab_group_fragmentation(metaslab_group_t *mg)
 866 {
 867         vdev_t *vd = mg->mg_vd;
 868         uint64_t fragmentation = 0;
 869         uint64_t valid_ms = 0;
 870
 871         for (int m = 0; m < vd->vdev_ms_count; m++) {
 872                 metaslab_t *msp = vd->vdev_ms[m];
 873
 874                 if (msp->ms_fragmentation == ZFS_FRAG_INVALID)
 875                         continue;
 876
 877                 valid_ms++;
 878                 fragmentation += msp->ms_fragmentation;
 879         }
 880
 881         if (valid_ms <= vd->vdev_ms_count / 2)
 882                 return (ZFS_FRAG_INVALID);
 883
 884         fragmentation /= valid_ms;
 885         ASSERT3U(fragmentation, <=, 100);
 886         return (fragmentation);
 887 }
 888
 889 /*
 890  * Determine if a given metaslab group should skip allocations. A metaslab
 891  * group should avoid allocations if its free capacity is less than the
 892  * zfs_mg_noalloc_threshold or its fragmentation metric is greater than
 893  * zfs_mg_fragmentation_threshold and there is at least one metaslab group
 894  * that can still handle allocations. If the allocation throttle is enabled
 895  * then we skip allocations to devices that have reached their maximum
 896  * allocation queue depth unless the selected metaslab group is the only
 897  * eligible group remaining.
 898  */
 899 static boolean_t
 900 metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor,
 901     uint64_t psize)
 902 {
 903         spa_t *spa = mg->mg_vd->vdev_spa;
 904         metaslab_class_t *mc = mg->mg_class;
 905
 906         /*
 907          * We can only consider skipping this metaslab group if it's
 908          * in the normal metaslab class and there are other metaslab
 909          * groups to select from. Otherwise, we always consider it eligible
 910          * for allocations.
 911          */
 912         if (mc != spa_normal_class(spa) || mc->mc_groups <= 1)
 913                 return (B_TRUE);
 914
 915         /*
 916          * If the metaslab group's mg_allocatable flag is set (see comments
 917          * in metaslab_group_alloc_update() for more information) and
 918          * the allocation throttle is disabled then allow allocations to this
 919          * device. However, if the allocation throttle is enabled then
 920          * check if we have reached our allocation limit (mg_alloc_queue_depth)
 921          * to determine if we should allow allocations to this metaslab group.
 922          * If all metaslab groups are no longer considered allocatable
 923          * (mc_alloc_groups == 0) or we're trying to allocate the smallest
 924          * gang block size then we allow allocations on this metaslab group
 925          * regardless of the mg_allocatable or throttle settings.
 926          */
 927         if (mg->mg_allocatable) {
 928                 metaslab_group_t *mgp;
 929                 int64_t qdepth;
 930                 uint64_t qmax = mg->mg_max_alloc_queue_depth;
 931
 932                 if (!mc->mc_alloc_throttle_enabled)
 933                         return (B_TRUE);
 934
 935                 /*
 936                  * If this metaslab group does not have any free space, then
 937                  * there is no point in looking further.
 938                  */
 939                 if (mg->mg_no_free_space)
 940                         return (B_FALSE);
 941
 942                 qdepth = refcount_count(&mg->mg_alloc_queue_depth);
 943
 944                 /*
 945                  * If this metaslab group is below its qmax or it's
 946                  * the only allocatable metasable group, then attempt
 947                  * to allocate from it.
 948                  */
 949                 if (qdepth < qmax || mc->mc_alloc_groups == 1)
 950                         return (B_TRUE);
 951                 ASSERT3U(mc->mc_alloc_groups, >, 1);
 952
 953                 /*
 954                  * Since this metaslab group is at or over its qmax, we
 955                  * need to determine if there are metaslab groups after this
 956                  * one that might be able to handle this allocation. This is
 957                  * racy since we can't hold the locks for all metaslab
 958                  * groups at the same time when we make this check.
 959                  */
 960                 for (mgp = mg->mg_next; mgp != rotor; mgp = mgp->mg_next) {
 961                         qmax = mgp->mg_max_alloc_queue_depth;
 962
 963                         qdepth = refcount_count(&mgp->mg_alloc_queue_depth);
 964
 965                         /*
 966                          * If there is another metaslab group that
 967                          * might be able to handle the allocation, then
 968                          * we return false so that we skip this group.
 969                          */
 970                         if (qdepth < qmax && !mgp->mg_no_free_space)
 971                                 return (B_FALSE);
 972                 }
 973
 974                 /*
 975                  * We didn't find another group to handle the allocation
 976                  * so we can't skip this metaslab group even though
 977                  * we are at or over our qmax.
 978                  */
 979                 return (B_TRUE);
 980
 981         } else if (mc->mc_alloc_groups == 0 || psize == SPA_MINBLOCKSIZE) {
 982                 return (B_TRUE);
 983         }
 984         return (B_FALSE);
 985 }
 986
 987 /*
 988  * ==========================================================================
 989  * Range tree callbacks
 990  * ==========================================================================
 991  */
 992
 993 /*
 994  * Comparison function for the private size-ordered tree. Tree is sorted
 995  * by size, larger sizes at the end of the tree.
 996  */
 997 static int
 998 metaslab_rangesize_compare(const void *x1, const void *x2)
 999 {
1000         const range_seg_t *r1 = x1;
1001         const range_seg_t *r2 = x2;
1002         uint64_t rs_size1 = r1->rs_end - r1->rs_start;
1003         uint64_t rs_size2 = r2->rs_end - r2->rs_start;
1004
1005         int cmp = AVL_CMP(rs_size1, rs_size2);
1006         if (likely(cmp))
1007                 return (cmp);
1008
1009         return (AVL_CMP(r1->rs_start, r2->rs_start));
1010 }
1011
1012 /*
1013  * ==========================================================================
1014  * Common allocator routines
1015  * ==========================================================================
1016  */
1017
1018 /*
1019  * Return the maximum contiguous segment within the metaslab.
1020  */
1021 uint64_t
1022 metaslab_block_maxsize(metaslab_t *msp)
1023 {
1024         avl_tree_t *t = &msp->ms_size_tree;
1025         range_seg_t *rs;
1026
1027         if (t == NULL || (rs = avl_last(t)) == NULL)
1028                 return (0ULL);
1029
1030         return (rs->rs_end - rs->rs_start);
1031 }
1032
1033 static range_seg_t *
1034 metaslab_block_find(avl_tree_t *t, uint64_t start, uint64_t size)
1035 {
1036         range_seg_t *rs, rsearch;
1037         avl_index_t where;
1038
1039         rsearch.rs_start = start;
1040         rsearch.rs_end = start + size;
1041
1042         rs = avl_find(t, &rsearch, &where);
1043         if (rs == NULL) {
1044                 rs = avl_nearest(t, where, AVL_AFTER);
1045         }
1046
1047         return (rs);
1048 }
1049
#if defined(WITH_FF_BLOCK_ALLOCATOR) || \
    defined(WITH_DF_BLOCK_ALLOCATOR) || \
    defined(WITH_CF_BLOCK_ALLOCATOR)
/*
 * This is a helper function that can be used by the allocator to find
 * a suitable block to allocate. Starting at the segment located for
 * *cursor, it walks the AVL tree 't' looking for the first segment that
 * can hold 'size' bytes at an offset rounded up to 'align'.
 *
 * On success the chosen offset is returned and *cursor is advanced to
 * the end of the allocation. If the walk runs off the end of the tree
 * the cursor is reset to 0 and the search is repeated once from the
 * beginning; -1ULL is returned when that also fails.
 */
static uint64_t
metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size,
    uint64_t align)
{
	for (;;) {
		range_seg_t *rs;

		for (rs = metaslab_block_find(t, *cursor, size); rs != NULL;
		    rs = AVL_NEXT(t, rs)) {
			uint64_t offset = P2ROUNDUP(rs->rs_start, align);

			if (offset + size <= rs->rs_end) {
				*cursor = offset + size;
				return (offset);
			}
		}

		/*
		 * If we know we've searched the whole map (*cursor == 0),
		 * give up. Otherwise, reset the cursor to the beginning
		 * and try again.
		 */
		if (*cursor == 0)
			return (-1ULL);

		*cursor = 0;
	}
}
#endif /* WITH_FF/DF/CF_BLOCK_ALLOCATOR */
1085
#if defined(WITH_FF_BLOCK_ALLOCATOR)
/*
 * ==========================================================================
 * The first-fit block allocator
 * ==========================================================================
 */

/*
 * Allocate 'size' bytes from the metaslab's offset-ordered tree using
 * the cursor that belongs to the allocation's alignment. Returns the
 * offset picked by metaslab_block_picker(), or -1ULL on failure.
 */
static uint64_t
metaslab_ff_alloc(metaslab_t *msp, uint64_t size)
{
	/*
	 * Find the largest power of 2 block size that evenly divides the
	 * requested size. This is used to try to allocate blocks with similar
	 * alignment from the same area of the metaslab (i.e. same cursor
	 * bucket) but it does not guarantee that other allocations sizes
	 * may exist in the same region.
	 */
	uint64_t align = size & -size;
	int bucket = highbit64(align) - 1;

	return (metaslab_block_picker(&msp->ms_tree->rt_root,
	    &msp->ms_lbas[bucket], size, align));
}

static metaslab_ops_t metaslab_ff_ops = {
	metaslab_ff_alloc
};

metaslab_ops_t *zfs_metaslab_ops = &metaslab_ff_ops;
#endif /* WITH_FF_BLOCK_ALLOCATOR */
1115
#if defined(WITH_DF_BLOCK_ALLOCATOR)
/*
 * ==========================================================================
 * Dynamic block allocator -
 * Uses the first fit allocation scheme until space get low and then
 * adjusts to a best fit allocation method. Uses metaslab_df_alloc_threshold
 * and metaslab_df_free_pct to determine when to switch the allocation scheme.
 * ==========================================================================
 */

/*
 * Allocate 'size' bytes from 'msp'. Returns the chosen offset, or -1ULL
 * when the largest free segment is smaller than the request or the
 * block picker finds nothing. The caller must hold msp->ms_lock.
 */
static uint64_t
metaslab_df_alloc(metaslab_t *msp, uint64_t size)
{
	/*
	 * Find the largest power of 2 block size that evenly divides the
	 * requested size. This is used to try to allocate blocks with similar
	 * alignment from the same area of the metaslab (i.e. same cursor
	 * bucket) but it does not guarantee that other allocations sizes
	 * may exist in the same region.
	 */
	uint64_t align = size & -size;
	uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1];
	range_tree_t *rt = msp->ms_tree;
	avl_tree_t *offset_tree = &rt->rt_root;
	avl_tree_t *search_tree = offset_tree;
	uint64_t largest = metaslab_block_maxsize(msp);
	int free_pct = range_tree_space(rt) * 100 / msp->ms_size;

	ASSERT(MUTEX_HELD(&msp->ms_lock));
	ASSERT3U(avl_numnodes(offset_tree), ==,
	    avl_numnodes(&msp->ms_size_tree));

	/* Not even the largest free segment can hold the request. */
	if (largest < size)
		return (-1ULL);

	/*
	 * If we're running low on space switch to using the size
	 * sorted AVL tree (best-fit), searching it from the beginning.
	 */
	if (largest < metaslab_df_alloc_threshold ||
	    free_pct < metaslab_df_free_pct) {
		search_tree = &msp->ms_size_tree;
		*cursor = 0;
	}

	return (metaslab_block_picker(search_tree, cursor, size, 1ULL));
}

static metaslab_ops_t metaslab_df_ops = {
	metaslab_df_alloc
};

metaslab_ops_t *zfs_metaslab_ops = &metaslab_df_ops;
#endif /* WITH_DF_BLOCK_ALLOCATOR */
1167
#if defined(WITH_CF_BLOCK_ALLOCATOR)
/*
 * ==========================================================================
 * Cursor fit block allocator -
 * Select the largest region in the metaslab, set the cursor to the beginning
 * of the range and the cursor_end to the end of the range. As allocations
 * are made advance the cursor. Continue allocating from the cursor until
 * the range is exhausted and then find a new range.
 * ==========================================================================
 */

/*
 * Allocate 'size' bytes at the cursor kept in ms_lbas[0], bounded by the
 * end offset kept in ms_lbas[1]. Returns the allocated offset, or -1ULL
 * when the current range is exhausted and the largest free segment
 * cannot hold the request. The caller must hold msp->ms_lock.
 */
static uint64_t
metaslab_cf_alloc(metaslab_t *msp, uint64_t size)
{
	avl_tree_t *size_tree = &msp->ms_size_tree;
	uint64_t *cursor = &msp->ms_lbas[0];
	uint64_t *cursor_end = &msp->ms_lbas[1];
	uint64_t offset;

	ASSERT(MUTEX_HELD(&msp->ms_lock));
	ASSERT3U(avl_numnodes(size_tree), ==,
	    avl_numnodes(&msp->ms_tree->rt_root));

	ASSERT3U(*cursor_end, >=, *cursor);

	/* The current range cannot hold the request: pick a new one. */
	if ((*cursor + size) > *cursor_end) {
		range_seg_t *largest = avl_last(size_tree);

		if (largest == NULL ||
		    (largest->rs_end - largest->rs_start) < size)
			return (-1ULL);

		*cursor = largest->rs_start;
		*cursor_end = largest->rs_end;
	}

	offset = *cursor;
	*cursor = offset + size;

	return (offset);
}

static metaslab_ops_t metaslab_cf_ops = {
	metaslab_cf_alloc
};

metaslab_ops_t *zfs_metaslab_ops = &metaslab_cf_ops;
#endif /* WITH_CF_BLOCK_ALLOCATOR */
1215
#if defined(WITH_NDF_BLOCK_ALLOCATOR)
/*
 * ==========================================================================
 * New dynamic fit allocator -
 * Select a region that is large enough to allocate 2^metaslab_ndf_clump_shift
 * contiguous blocks. If no region is found then just use the largest segment
 * that remains.
 * ==========================================================================
 */

/*
 * Determines desired number of contiguous blocks (2^metaslab_ndf_clump_shift)
 * to request from the allocator.
 */
uint64_t metaslab_ndf_clump_shift = 4;

/*
 * Allocate 'size' bytes from 'msp'. The offset-ordered tree is probed at
 * the cursor for this size class first; if that yields no segment that
 * is large enough, the size-ordered tree is searched for a segment of
 * the clump size (capped at the largest free segment). Returns the
 * start of the chosen segment, or -1ULL if nothing fits. The caller must
 * hold msp->ms_lock.
 */
static uint64_t
metaslab_ndf_alloc(metaslab_t *msp, uint64_t size)
{
	avl_tree_t *tree = &msp->ms_tree->rt_root;
	avl_index_t where;
	range_seg_t *rs, key;
	uint64_t hbit = highbit64(size);
	uint64_t *cursor = &msp->ms_lbas[hbit - 1];
	uint64_t largest = metaslab_block_maxsize(msp);

	ASSERT(MUTEX_HELD(&msp->ms_lock));
	ASSERT3U(avl_numnodes(tree), ==, avl_numnodes(&msp->ms_size_tree));

	if (largest < size)
		return (-1ULL);

	/* First try the segment at the cursor, in offset order. */
	key.rs_start = *cursor;
	key.rs_end = *cursor + size;
	rs = avl_find(tree, &key, &where);

	if (rs == NULL || (rs->rs_end - rs->rs_start) < size) {
		/* Fall back to the size-ordered tree. */
		tree = &msp->ms_size_tree;

		key.rs_start = 0;
		key.rs_end = MIN(largest,
		    1ULL << (hbit + metaslab_ndf_clump_shift));

		rs = avl_find(tree, &key, &where);
		if (rs == NULL)
			rs = avl_nearest(tree, where, AVL_AFTER);
		ASSERT(rs != NULL);
	}

	if ((rs->rs_end - rs->rs_start) < size)
		return (-1ULL);

	*cursor = rs->rs_start + size;
	return (rs->rs_start);
}

static metaslab_ops_t metaslab_ndf_ops = {
	metaslab_ndf_alloc
};

metaslab_ops_t *zfs_metaslab_ops = &metaslab_ndf_ops;
#endif /* WITH_NDF_BLOCK_ALLOCATOR */
1277
1278
1279 /*
1280  * ==========================================================================
1281  * Metaslabs
1282  * ==========================================================================
1283  */
1284
1285 /*
1286  * Wait for any in-progress metaslab loads to complete.
1287  */
1288 void
1289 metaslab_load_wait(metaslab_t *msp)
1290 {
1291         ASSERT(MUTEX_HELD(&msp->ms_lock));
1292
1293         while (msp->ms_loading) {
1294                 ASSERT(!msp->ms_loaded);
1295                 cv_wait(&msp->ms_load_cv, &msp->ms_lock);
1296         }
1297 }
1298
1299 int
1300 metaslab_load(metaslab_t *msp)
1301 {
1302         int error = 0;
1303         boolean_t success = B_FALSE;
1304
1305         ASSERT(MUTEX_HELD(&msp->ms_lock));
1306         ASSERT(!msp->ms_loaded);
1307         ASSERT(!msp->ms_loading);
1308
1309         msp->ms_loading = B_TRUE;
1310         /*
1311          * Nobody else can manipulate a loading metaslab, so it's now safe
1312          * to drop the lock.  This way we don't have to hold the lock while
1313          * reading the spacemap from disk.
1314          */
1315         mutex_exit(&msp->ms_lock);
1316
1317         /*
1318          * If the space map has not been allocated yet, then treat
1319          * all the space in the metaslab as free and add it to the
1320          * ms_tree.
1321          */
1322         if (msp->ms_sm != NULL)
1323                 error = space_map_load(msp->ms_sm, msp->ms_tree, SM_FREE);
1324         else
1325                 range_tree_add(msp->ms_tree, msp->ms_start, msp->ms_size);
1326
1327         success = (error == 0);
1328
1329         mutex_enter(&msp->ms_lock);
1330         msp->ms_loading = B_FALSE;
1331
1332         if (success) {
1333                 ASSERT3P(msp->ms_group, !=, NULL);
1334                 msp->ms_loaded = B_TRUE;
1335
1336                 for (int t = 0; t < TXG_DEFER_SIZE; t++) {
1337                         range_tree_walk(msp->ms_defertree[t],
1338                             range_tree_remove, msp->ms_tree);
1339                 }
1340                 msp->ms_max_size = metaslab_block_maxsize(msp);
1341         }
1342         cv_broadcast(&msp->ms_load_cv);
1343         return (error);
1344 }
1345
1346 void
1347 metaslab_unload(metaslab_t *msp)
1348 {
1349         ASSERT(MUTEX_HELD(&msp->ms_lock));
1350         range_tree_vacate(msp->ms_tree, NULL, NULL);
1351         msp->ms_loaded = B_FALSE;
1352         msp->ms_weight &= ~METASLAB_ACTIVE_MASK;
1353         msp->ms_max_size = 0;
1354 }
1355
1356 int
1357 metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg,
1358     metaslab_t **msp)
1359 {
1360         vdev_t *vd = mg->mg_vd;
1361         objset_t *mos = vd->vdev_spa->spa_meta_objset;
1362         metaslab_t *ms;
1363         int error;
1364
1365         ms = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP);
1366         mutex_init(&ms->ms_lock, NULL, MUTEX_DEFAULT, NULL);
1367         mutex_init(&ms->ms_sync_lock, NULL, MUTEX_DEFAULT, NULL);
1368         cv_init(&ms->ms_load_cv, NULL, CV_DEFAULT, NULL);
1369         ms->ms_id = id;
1370         ms->ms_start = id << vd->vdev_ms_shift;
1371         ms->ms_size = 1ULL << vd->vdev_ms_shift;
1372
1373         /*
1374          * We only open space map objects that already exist. All others
1375          * will be opened when we finally allocate an object for it.
1376          */
1377         if (object != 0) {
1378                 error = space_map_open(&ms->ms_sm, mos, object, ms->ms_start,
1379                     ms->ms_size, vd->vdev_ashift);
1380
1381                 if (error != 0) {
1382                         kmem_free(ms, sizeof (metaslab_t));
1383                         return (error);
1384                 }
1385
1386                 ASSERT(ms->ms_sm != NULL);
1387         }
1388
1389         /*
1390          * We create the main range tree here, but we don't create the
1391          * other range trees until metaslab_sync_done().  This serves
1392          * two purposes: it allows metaslab_sync_done() to detect the
1393          * addition of new space; and for debugging, it ensures that we'd
1394          * data fault on any attempt to use this metaslab before it's ready.
1395          */
1396         ms->ms_tree = range_tree_create_impl(&rt_avl_ops, &ms->ms_size_tree,
1397             metaslab_rangesize_compare, 0);
1398         metaslab_group_add(mg, ms);
1399
1400         metaslab_set_fragmentation(ms);
1401
1402         /*
1403          * If we're opening an existing pool (txg == 0) or creating
1404          * a new one (txg == TXG_INITIAL), all space is available now.
1405          * If we're adding space to an existing pool, the new space
1406          * does not become available until after this txg has synced.
1407          * The metaslab's weight will also be initialized when we sync
1408          * out this txg. This ensures that we don't attempt to allocate
1409          * from it before we have initialized it completely.
1410          */
1411         if (txg <= TXG_INITIAL)
1412                 metaslab_sync_done(ms, 0);
1413
1414         /*
1415          * If metaslab_debug_load is set and we're initializing a metaslab
1416          * that has an allocated space map object then load the its space
1417          * map so that can verify frees.
1418          */
1419         if (metaslab_debug_load && ms->ms_sm != NULL) {
1420                 mutex_enter(&ms->ms_lock);
1421                 VERIFY0(metaslab_load(ms));
1422                 mutex_exit(&ms->ms_lock);
1423         }
1424
1425         if (txg != 0) {
1426                 vdev_dirty(vd, 0, NULL, txg);
1427                 vdev_dirty(vd, VDD_METASLAB, ms, txg);
1428         }
1429
1430         *msp = ms;
1431
1432         return (0);
1433 }
1434
1435 void
1436 metaslab_fini(metaslab_t *msp)
1437 {
1438         metaslab_group_t *mg = msp->ms_group;
1439
1440         metaslab_group_remove(mg, msp);
1441
1442         mutex_enter(&msp->ms_lock);
1443         VERIFY(msp->ms_group == NULL);
1444         vdev_space_update(mg->mg_vd, -space_map_allocated(msp->ms_sm),
1445             0, -msp->ms_size);
1446         space_map_close(msp->ms_sm);
1447
1448         metaslab_unload(msp);
1449         range_tree_destroy(msp->ms_tree);
1450         range_tree_destroy(msp->ms_freeingtree);
1451         range_tree_destroy(msp->ms_freedtree);
1452
1453         for (int t = 0; t < TXG_SIZE; t++) {
1454                 range_tree_destroy(msp->ms_alloctree[t]);
1455         }
1456
1457         for (int t = 0; t < TXG_DEFER_SIZE; t++) {
1458                 range_tree_destroy(msp->ms_defertree[t]);
1459         }
1460
1461         ASSERT0(msp->ms_deferspace);
1462
1463         mutex_exit(&msp->ms_lock);
1464         cv_destroy(&msp->ms_load_cv);
1465         mutex_destroy(&msp->ms_lock);
1466         mutex_destroy(&msp->ms_sync_lock);
1467
1468         kmem_free(msp, sizeof (metaslab_t));
1469 }
1470
#define	FRAGMENTATION_TABLE_SIZE	17

/*
 * This table defines a segment size based fragmentation metric that will
 * allow each metaslab to derive its own fragmentation value. This is done
 * by calculating the space in each bucket of the spacemap histogram and
 * multiplying that by the fragmentation metric in this table. Doing
 * this for all buckets and dividing it by the total amount of free
 * space in this metaslab (i.e. the total free space in all buckets) gives
 * us the fragmentation metric. This means that a high fragmentation metric
 * equates to most of the free space being comprised of small segments.
 * Conversely, if the metric is low, then most of the free space is in
 * large segments. A 10% change in fragmentation equates to approximately
 * double the number of segments.
 *
 * This table defines 0% fragmented space using 16MB segments. Testing has
 * shown that segments that are greater than or equal to 16MB do not suffer
 * from drastic performance problems. Using this value, we derive the rest
 * of the table. Since the fragmentation value is never stored on disk, it
 * is possible to change these calculations in the future.
 *
 * The last slot is the one that larger bucket indexes are clamped to, so
 * it is spelled out here rather than left to implicit zero
 * initialization.
 */
int zfs_frag_table[FRAGMENTATION_TABLE_SIZE] = {
	100,	/* 512B */
	100,	/* 1K   */
	98,	/* 2K   */
	95,	/* 4K   */
	90,	/* 8K   */
	80,	/* 16K  */
	70,	/* 32K  */
	60,	/* 64K  */
	50,	/* 128K */
	40,	/* 256K */
	30,	/* 512K */
	20,	/* 1M   */
	15,	/* 2M   */
	10,	/* 4M   */
	5,	/* 8M   */
	0,	/* 16M  */
	0	/* 32M and larger */
};
1510
1511 /*
1512  * Calclate the metaslab's fragmentation metric. A return value
1513  * of ZFS_FRAG_INVALID means that the metaslab has not been upgraded and does
1514  * not support this metric. Otherwise, the return value should be in the
1515  * range [0, 100].
1516  */
1517 static void
1518 metaslab_set_fragmentation(metaslab_t *msp)
1519 {
1520         spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
1521         uint64_t fragmentation = 0;
1522         uint64_t total = 0;
1523         boolean_t feature_enabled = spa_feature_is_enabled(spa,
1524             SPA_FEATURE_SPACEMAP_HISTOGRAM);
1525
1526         if (!feature_enabled) {
1527                 msp->ms_fragmentation = ZFS_FRAG_INVALID;
1528                 return;
1529         }
1530
1531         /*
1532          * A null space map means that the entire metaslab is free
1533          * and thus is not fragmented.
1534          */
1535         if (msp->ms_sm == NULL) {
1536                 msp->ms_fragmentation = 0;
1537                 return;
1538         }
1539
1540         /*
1541          * If this metaslab's space map has not been upgraded, flag it
1542          * so that we upgrade next time we encounter it.
1543          */
1544         if (msp->ms_sm->sm_dbuf->db_size != sizeof (space_map_phys_t)) {
1545                 uint64_t txg = spa_syncing_txg(spa);
1546                 vdev_t *vd = msp->ms_group->mg_vd;
1547
1548                 /*
1549                  * If we've reached the final dirty txg, then we must
1550                  * be shutting down the pool. We don't want to dirty
1551                  * any data past this point so skip setting the condense
1552                  * flag. We can retry this action the next time the pool
1553                  * is imported.
1554                  */
1555                 if (spa_writeable(spa) && txg < spa_final_dirty_txg(spa)) {
1556                         msp->ms_condense_wanted = B_TRUE;
1557                         vdev_dirty(vd, VDD_METASLAB, msp, txg + 1);
1558                         zfs_dbgmsg("txg %llu, requesting force condense: "
1559                             "ms_id %llu, vdev_id %llu", txg, msp->ms_id,
1560                             vd->vdev_id);
1561                 }
1562                 msp->ms_fragmentation = ZFS_FRAG_INVALID;
1563                 return;
1564         }
1565
1566         for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
1567                 uint64_t space = 0;
1568                 uint8_t shift = msp->ms_sm->sm_shift;
1569
1570                 int idx = MIN(shift - SPA_MINBLOCKSHIFT + i,
1571                     FRAGMENTATION_TABLE_SIZE - 1);
1572
1573                 if (msp->ms_sm->sm_phys->smp_histogram[i] == 0)
1574                         continue;
1575
1576                 space = msp->ms_sm->sm_phys->smp_histogram[i] << (i + shift);
1577                 total += space;
1578
1579                 ASSERT3U(idx, <, FRAGMENTATION_TABLE_SIZE);
1580                 fragmentation += space * zfs_frag_table[idx];
1581         }
1582
1583         if (total > 0)
1584                 fragmentation /= total;
1585         ASSERT3U(fragmentation, <=, 100);
1586
1587         msp->ms_fragmentation = fragmentation;
1588 }
1589
1590 /*
1591  * Compute a weight -- a selection preference value -- for the given metaslab.
1592  * This is based on the amount of free space, the level of fragmentation,
1593  * the LBA range, and whether the metaslab is loaded.
1594  */
1595 static uint64_t
1596 metaslab_space_weight(metaslab_t *msp)
1597 {
1598         metaslab_group_t *mg = msp->ms_group;
1599         vdev_t *vd = mg->mg_vd;
1600         uint64_t weight, space;
1601
1602         ASSERT(MUTEX_HELD(&msp->ms_lock));
1603         ASSERT(!vd->vdev_removing);
1604
1605         /*
1606          * The baseline weight is the metaslab's free space.
1607          */
1608         space = msp->ms_size - space_map_allocated(msp->ms_sm);
1609
1610         if (metaslab_fragmentation_factor_enabled &&
1611             msp->ms_fragmentation != ZFS_FRAG_INVALID) {
1612                 /*
1613                  * Use the fragmentation information to inversely scale
1614                  * down the baseline weight. We need to ensure that we
1615                  * don't exclude this metaslab completely when it's 100%
1616                  * fragmented. To avoid this we reduce the fragmented value
1617                  * by 1.
1618                  */
1619                 space = (space * (100 - (msp->ms_fragmentation - 1))) / 100;
1620
1621                 /*
1622                  * If space < SPA_MINBLOCKSIZE, then we will not allocate from
1623                  * this metaslab again. The fragmentation metric may have
1624                  * decreased the space to something smaller than
1625                  * SPA_MINBLOCKSIZE, so reset the space to SPA_MINBLOCKSIZE
1626                  * so that we can consume any remaining space.
1627                  */
1628                 if (space > 0 && space < SPA_MINBLOCKSIZE)
1629                         space = SPA_MINBLOCKSIZE;
1630         }
1631         weight = space;
1632
1633         /*
1634          * Modern disks have uniform bit density and constant angular velocity.
1635          * Therefore, the outer recording zones are faster (higher bandwidth)
1636          * than the inner zones by the ratio of outer to inner track diameter,
1637          * which is typically around 2:1.  We account for this by assigning
1638          * higher weight to lower metaslabs (multiplier ranging from 2x to 1x).
1639          * In effect, this means that we'll select the metaslab with the most
1640          * free bandwidth rather than simply the one with the most free space.
1641          */
1642         if (!vd->vdev_nonrot && metaslab_lba_weighting_enabled) {
1643                 weight = 2 * weight - (msp->ms_id * weight) / vd->vdev_ms_count;
1644                 ASSERT(weight >= space && weight <= 2 * space);
1645         }
1646
1647         /*
1648          * If this metaslab is one we're actively using, adjust its
1649          * weight to make it preferable to any inactive metaslab so
1650          * we'll polish it off. If the fragmentation on this metaslab
1651          * has exceeded our threshold, then don't mark it active.
1652          */
1653         if (msp->ms_loaded && msp->ms_fragmentation != ZFS_FRAG_INVALID &&
1654             msp->ms_fragmentation <= zfs_metaslab_fragmentation_threshold) {
1655                 weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK);
1656         }
1657
1658         WEIGHT_SET_SPACEBASED(weight);
1659         return (weight);
1660 }
1661
1662 /*
1663  * Return the weight of the specified metaslab, according to the segment-based
1664  * weighting algorithm. The metaslab must be loaded. This function can
1665  * be called within a sync pass since it relies only on the metaslab's
1666  * range tree which is always accurate when the metaslab is loaded.
1667  */
1668 static uint64_t
1669 metaslab_weight_from_range_tree(metaslab_t *msp)
1670 {
1671         uint64_t weight = 0;
1672         uint32_t segments = 0;
1673
1674         ASSERT(msp->ms_loaded);
1675
1676         for (int i = RANGE_TREE_HISTOGRAM_SIZE - 1; i >= SPA_MINBLOCKSHIFT;
1677             i--) {
1678                 uint8_t shift = msp->ms_group->mg_vd->vdev_ashift;
1679                 int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1;
1680
1681                 segments <<= 1; /* carried count doubles per bucket */
1682                 segments += msp->ms_tree->rt_histogram[i];
1683
1684                 /*
1685                  * The range tree provides more precision than the space map
1686                  * and must be downgraded so that all values fit within the
1687                  * space map's histogram. This allows us to compare loaded
1688                  * vs. unloaded metaslabs to determine which metaslab is
1689                  * considered "best".
1690                  */
1691                 if (i > max_idx)
1692                         continue;
1693
1694                 if (segments != 0) {
1695                         WEIGHT_SET_COUNT(weight, segments);
1696                         WEIGHT_SET_INDEX(weight, i);
1697                         WEIGHT_SET_ACTIVE(weight, 0);
1698                         break;
1699                 }
1700         }
1701         return (weight);
1702 }
1703
1704 /*
1705  * Calculate the weight from the highest non-empty bucket of the on-disk
1706  * histogram. Only call this after a sync pass has completely finished,
1707  * since the on-disk information is updated in metaslab_sync().
1708  */
1709 static uint64_t
1710 metaslab_weight_from_spacemap(metaslab_t *msp)
1711 {
1712         uint64_t weight = 0;
1713
1714         for (int i = SPACE_MAP_HISTOGRAM_SIZE - 1; i >= 0; i--) {
1715                 if (msp->ms_sm->sm_phys->smp_histogram[i] != 0) {
1716                         WEIGHT_SET_COUNT(weight,
1717                             msp->ms_sm->sm_phys->smp_histogram[i]);
1718                         WEIGHT_SET_INDEX(weight, i +
1719                             msp->ms_sm->sm_shift);
1720                         WEIGHT_SET_ACTIVE(weight, 0);
1721                         break;
1722                 }
1723         }
1724         return (weight);
1725 }
1726
1727 /*
1728  * Compute a segment-based weight for the specified metaslab. The weight
1729  * is determined by the highest bucket in the histogram. The information
1730  * for the highest bucket is encoded into the weight value.
1731  */
1732 static uint64_t
1733 metaslab_segment_weight(metaslab_t *msp)
1734 {
1735         metaslab_group_t *mg = msp->ms_group;
1736         uint64_t weight = 0;
1737         uint8_t shift = mg->mg_vd->vdev_ashift;
1738
1739         ASSERT(MUTEX_HELD(&msp->ms_lock));
1740
1741         /*
1742          * The metaslab is completely free; derive the weight from ms_size.
1743          */
1744         if (space_map_allocated(msp->ms_sm) == 0) {
1745                 int idx = highbit64(msp->ms_size) - 1;
1746                 int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1;
1747
1748                 if (idx < max_idx) {
1749                         WEIGHT_SET_COUNT(weight, 1ULL);
1750                         WEIGHT_SET_INDEX(weight, idx);
1751                 } else {
1752                         WEIGHT_SET_COUNT(weight, 1ULL << (idx - max_idx));
1753                         WEIGHT_SET_INDEX(weight, max_idx);
1754                 }
1755                 WEIGHT_SET_ACTIVE(weight, 0);
1756                 ASSERT(!WEIGHT_IS_SPACEBASED(weight));
1757
1758                 return (weight);
1759         }
1760
1761         ASSERT3U(msp->ms_sm->sm_dbuf->db_size, ==, sizeof (space_map_phys_t));
1762
1763         /*
1764          * If the metaslab is fully allocated then just make the weight 0.
1765          */
1766         if (space_map_allocated(msp->ms_sm) == msp->ms_size)
1767                 return (0);
1768         /*
1769          * If the metaslab is already loaded, then use the range tree to
1770          * determine the weight. Otherwise, we rely on the space map information
1771          * to generate the weight.
1772          */
1773         if (msp->ms_loaded) {
1774                 weight = metaslab_weight_from_range_tree(msp);
1775         } else {
1776                 weight = metaslab_weight_from_spacemap(msp);
1777         }
1778
1779         /*
1780          * If the metaslab was active the last time we calculated its weight
1781          * then keep it active. We want to consume the entire region that
1782          * is associated with this weight.
1783          */
1784         if (msp->ms_activation_weight != 0 && weight != 0)
1785                 WEIGHT_SET_ACTIVE(weight, WEIGHT_GET_ACTIVE(msp->ms_weight));
1786         return (weight);
1787 }
1788
1789 /*
1790  * Determine if we should attempt to allocate from this metaslab. If the
1791  * metaslab's maximum size is known (ms_max_size != 0) then we can quickly
1792  * determine if the desired allocation size can be satisfied. Otherwise, if
1793  * we're using segment-based weighting then we can determine the maximum
1794  * allocation that this metaslab can accommodate based on the index encoded
1795  * in the weight. If we're using space-based weights then rely on the entire
1796  * weight (excluding the weight type bit).
1797  */
1798 boolean_t
1799 metaslab_should_allocate(metaslab_t *msp, uint64_t asize)
1800 {
1801         boolean_t should_allocate;
1802
1803         if (msp->ms_max_size != 0)
1804                 return (msp->ms_max_size >= asize);
1805
1806         if (!WEIGHT_IS_SPACEBASED(msp->ms_weight)) {
1807                 /*
1808                  * The metaslab segment weight indicates segments in the
1809                  * range [2^i, 2^(i+1)), where i is the index in the weight.
1810                  * Since the asize might be in the middle of the range, we
1811                  * should attempt the allocation if asize < 2^(i+1).
1812                  */
1813                 should_allocate = (asize <
1814                     1ULL << (WEIGHT_GET_INDEX(msp->ms_weight) + 1));
1815         } else {
1816                 should_allocate = (asize <=
1817                     (msp->ms_weight & ~METASLAB_WEIGHT_TYPE));
1818         }
1819         return (should_allocate);
1820 }
1821 static uint64_t
1822 metaslab_weight(metaslab_t *msp)
1823 {
1824         vdev_t *vd = msp->ms_group->mg_vd;
1825         spa_t *spa = vd->vdev_spa;
1826         uint64_t weight;
1827
1828         ASSERT(MUTEX_HELD(&msp->ms_lock));
1829
1830         /*
1831          * If this vdev is in the process of being removed, there is nothing
1832          * for us to do here; report a weight of 0.
1833          */
1834         if (vd->vdev_removing)
1835                 return (0);
1836
1837         metaslab_set_fragmentation(msp);
1838
1839         /*
1840          * Update the maximum size if the metaslab is loaded. This will
1841          * ensure that we get an accurate maximum size if newly freed space
1842          * has been added back into the free tree.
1843          */
1844         if (msp->ms_loaded)
1845                 msp->ms_max_size = metaslab_block_maxsize(msp);
1846
1847         /*
1848          * Segment-based weighting requires space map histogram support.
1849          */
1850         if (zfs_metaslab_segment_weight_enabled &&
1851             spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM) &&
1852             (msp->ms_sm == NULL || msp->ms_sm->sm_dbuf->db_size ==
1853             sizeof (space_map_phys_t))) {
1854                 weight = metaslab_segment_weight(msp);
1855         } else {
1856                 weight = metaslab_space_weight(msp);
1857         }
1858         return (weight);
1859 }
1860
1861 static int
1862 metaslab_activate(metaslab_t *msp, uint64_t activation_weight)
1863 {
1864         ASSERT(MUTEX_HELD(&msp->ms_lock));
1865
1866         if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) { /* not active */
1867                 metaslab_load_wait(msp);
1868                 if (!msp->ms_loaded) {
1869                         int error = metaslab_load(msp);
1870                         if (error) {
1871                                 metaslab_group_sort(msp->ms_group, msp, 0);
1872                                 return (error); /* load failed */
1873                         }
1874                 }
1875
1876                 msp->ms_activation_weight = msp->ms_weight;
1877                 metaslab_group_sort(msp->ms_group, msp,
1878                     msp->ms_weight | activation_weight);
1879         }
1880         ASSERT(msp->ms_loaded);
1881         ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);
1882
1883         return (0);
1884 }
1885
1886 static void
1887 metaslab_passivate(metaslab_t *msp, uint64_t weight)
1888 {
1889         ASSERTV(uint64_t size = weight & ~METASLAB_WEIGHT_TYPE);
1890
1891         /*
1892          * For space-based weights, if size < SPA_MINBLOCKSIZE then we
1893          * will not allocate from this metaslab again.  In that case, it
1894          * had better be empty, or we would be leaving space on the table.
1895          */
1896         ASSERT(!WEIGHT_IS_SPACEBASED(msp->ms_weight) ||
1897             size >= SPA_MINBLOCKSIZE ||
1898             range_tree_space(msp->ms_tree) == 0);
1899         ASSERT0(weight & METASLAB_ACTIVE_MASK);
1900
1901         msp->ms_activation_weight = 0;
1902         metaslab_group_sort(msp->ms_group, msp, weight);
1903         ASSERT((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0);
1904 }
1905
1906 /*
1907  * Segment-based metaslabs are activated once and remain active until
1908  * we either fail an allocation attempt (similar to space-based metaslabs)
1909  * or have exhausted the free space in zfs_metaslab_switch_threshold
1910  * buckets since the metaslab was activated. This function checks to see
1911  * if we've exhausted the zfs_metaslab_switch_threshold buckets in the
1912  * metaslab and passivates it proactively. This will allow us to select a
1913  * metaslab with a larger contiguous region, if any, remaining within this
1914  * metaslab group. If we're in sync pass > 1, then we continue using this
1915  * metaslab so that we don't dirty more blocks and cause more sync passes.
1916  */
1917 void
1918 metaslab_segment_may_passivate(metaslab_t *msp)
1919 {
1920         spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
1921
1922         if (WEIGHT_IS_SPACEBASED(msp->ms_weight) || spa_sync_pass(spa) > 1)
1923                 return;
1924
1925         /*
1926          * Since we are in the middle of a sync pass, the most accurate
1927          * information that is accessible to us is the in-core range tree
1928          * histogram; calculate the new weight based on that information.
1929          */
1930         uint64_t weight = metaslab_weight_from_range_tree(msp);
1931         int activation_idx = WEIGHT_GET_INDEX(msp->ms_activation_weight);
1932         int current_idx = WEIGHT_GET_INDEX(weight);
1933
1934         if (current_idx <= activation_idx - zfs_metaslab_switch_threshold)
1935                 metaslab_passivate(msp, weight);
1936 }
1937
1938 static void
1939 metaslab_preload(void *arg)
1940 {
1941         metaslab_t *msp = arg;
1942         spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
1943         fstrans_cookie_t cookie = spl_fstrans_mark();
1944
1945         ASSERT(!MUTEX_HELD(&msp->ms_group->mg_lock));
1946
1947         mutex_enter(&msp->ms_lock);
1948         metaslab_load_wait(msp);
1949         if (!msp->ms_loaded)
1950                 (void) metaslab_load(msp); /* load result is ignored */
1951         msp->ms_selected_txg = spa_syncing_txg(spa);
1952         mutex_exit(&msp->ms_lock);
1953         spl_fstrans_unmark(cookie);
1954 }
1955
1956 static void
1957 metaslab_group_preload(metaslab_group_t *mg)
1958 {
1959         spa_t *spa = mg->mg_vd->vdev_spa;
1960         metaslab_t *msp;
1961         avl_tree_t *t = &mg->mg_metaslab_tree;
1962         int m = 0;
1963
1964         if (spa_shutting_down(spa) || !metaslab_preload_enabled) {
1965                 taskq_wait_outstanding(mg->mg_taskq, 0);
1966                 return;
1967         }
1968
1969         mutex_enter(&mg->mg_lock);
1970
1971         /*
1972          * Dispatch preloads for the next potential metaslabs
1973          */
1974         for (msp = avl_first(t); msp != NULL; msp = AVL_NEXT(t, msp)) {
1975                 ASSERT3P(msp->ms_group, ==, mg);
1976
1977                 /*
1978                  * We preload only the maximum number of metaslabs specified
1979                  * by metaslab_preload_limit. If a metaslab is being forced
1980                  * to condense then we preload it too. This will ensure
1981                  * that force condensing happens in the next txg.
1982                  */
1983                 if (++m > metaslab_preload_limit && !msp->ms_condense_wanted) {
1984                         continue;
1985                 }
1986
1987                 VERIFY(taskq_dispatch(mg->mg_taskq, metaslab_preload,
1988                     msp, TQ_SLEEP) != TASKQID_INVALID);
1989         }
1990         mutex_exit(&mg->mg_lock);
1991 }
1992
1993 /*
1994  * Determine if the space map's on-disk footprint is past our tolerance
1995  * for inefficiency. We would like to use the following criteria to make
1996  * our decision:
1997  *
1998  * 1. The size of the space map object should not dramatically increase as a
1999  * result of writing out the free space range tree.
2000  *
2001  * 2. The minimal on-disk space map representation is zfs_condense_pct/100
2002  * times the size of the free space range tree representation
2003  * (i.e. zfs_condense_pct = 110 and in-core = 1MB, minimal = 1.1MB).
2004  *
2005  * 3. The on-disk size of the space map should actually decrease.
2006  *
2007  * Checking the first condition is tricky since we don't want to walk
2008  * the entire AVL tree calculating the estimated on-disk size. Instead we
2009  * use the size-ordered range tree in the metaslab and calculate the
2010  * size required to write out the largest segment in our free tree. If the
2011  * size required to represent that segment on disk is larger than the space
2012  * map object then we avoid condensing this map.
2013  *
2014  * To determine the second criterion we use a best-case estimate and assume
2015  * each segment can be represented on-disk as a single 64-bit entry. We refer
2016  * to this best-case estimate as the space map's minimal form.
2017  *
2018  * Unfortunately, we cannot compute the on-disk size of the space map in this
2019  * context because we cannot accurately compute the effects of compression, etc.
2020  * Instead, we apply the heuristic described in the block comment for
2021  * zfs_metaslab_condense_block_threshold - we only condense if the space used
2022  * is greater than a threshold number of blocks.
2023  */
2024 static boolean_t
2025 metaslab_should_condense(metaslab_t *msp)
2026 {
2027         space_map_t *sm = msp->ms_sm;
2028         range_seg_t *rs;
2029         uint64_t size, entries, segsz, object_size, optimal_size, record_size;
2030         dmu_object_info_t doi;
2031         uint64_t vdev_blocksize = 1ULL << msp->ms_group->mg_vd->vdev_ashift;
2032
2033         ASSERT(MUTEX_HELD(&msp->ms_lock));
2034         ASSERT(msp->ms_loaded);
2035
2036         /*
2037          * Use the ms_size_tree range tree, which is ordered by size, to
2038          * obtain the largest segment in the free tree. We always condense
2039          * metaslabs that are empty and metaslabs for which a condense
2040          * request has been made.
2041          */
2042         rs = avl_last(&msp->ms_size_tree);
2043         if (rs == NULL || msp->ms_condense_wanted)
2044                 return (B_TRUE);
2045
2046         /*
2047          * Calculate the number of 64-bit entries this segment would
2048          * require when written to disk. If this single segment would be
2049          * larger on-disk than the entire current on-disk structure, then
2050          * clearly condensing will increase the on-disk structure size.
2051          */
2052         size = (rs->rs_end - rs->rs_start) >> sm->sm_shift;
2053         entries = size / (MIN(size, SM_RUN_MAX));
2054         segsz = entries * sizeof (uint64_t);
2055
2056         optimal_size = sizeof (uint64_t) * avl_numnodes(&msp->ms_tree->rt_root);
2057         object_size = space_map_length(msp->ms_sm);
2058
2059         dmu_object_info_from_db(sm->sm_dbuf, &doi);
2060         record_size = MAX(doi.doi_data_block_size, vdev_blocksize);
2061
2062         return (segsz <= object_size &&
2063             object_size >= (optimal_size * zfs_condense_pct / 100) &&
2064             object_size > zfs_metaslab_condense_block_threshold * record_size);
2065 }
2066
2067 /*
2068  * Condense the on-disk space map representation to its minimized form.
2069  * The minimized form consists of a small number of allocations followed by
2070  * the entries of the free range tree.
2071  */
2072 static void
2073 metaslab_condense(metaslab_t *msp, uint64_t txg, dmu_tx_t *tx)
2074 {
2075         range_tree_t *condense_tree;
2076         space_map_t *sm = msp->ms_sm;
2077
2078         ASSERT(MUTEX_HELD(&msp->ms_lock));
2079         ASSERT3U(spa_sync_pass(msp->ms_group->mg_vd->vdev_spa), ==, 1);
2080         ASSERT(msp->ms_loaded);
2081
2082
2083         zfs_dbgmsg("condensing: txg %llu, msp[%llu] %p, vdev id %llu, "
2084             "spa %s, smp size %llu, segments %lu, forcing condense=%s", txg,
2085             msp->ms_id, msp, msp->ms_group->mg_vd->vdev_id,
2086             msp->ms_group->mg_vd->vdev_spa->spa_name,
2087             space_map_length(msp->ms_sm), avl_numnodes(&msp->ms_tree->rt_root),
2088             msp->ms_condense_wanted ? "TRUE" : "FALSE");
2089
2090         msp->ms_condense_wanted = B_FALSE;
2091
2092         /*
2093          * Create a range tree that is 100% allocated. We remove segments
2094          * that have been freed in this txg, any deferred frees that exist,
2095          * and any allocation in the future. Removing segments should be
2096          * a relatively inexpensive operation since we expect these trees to
2097          * have a small number of nodes.
2098          */
2099         condense_tree = range_tree_create(NULL, NULL);
2100         range_tree_add(condense_tree, msp->ms_start, msp->ms_size);
2101
2102         /*
2103          * Remove what's been freed in this txg from the condense_tree.
2104          * Since we're in sync_pass 1, we know that all the frees from
2105          * this txg are in the freeingtree.
2106          */
2107         range_tree_walk(msp->ms_freeingtree, range_tree_remove, condense_tree);
2108
2109         for (int t = 0; t < TXG_DEFER_SIZE; t++) {
2110                 range_tree_walk(msp->ms_defertree[t],
2111                     range_tree_remove, condense_tree);
2112         }
2113
2114         for (int t = 1; t < TXG_CONCURRENT_STATES; t++) {
2115                 range_tree_walk(msp->ms_alloctree[(txg + t) & TXG_MASK],
2116                     range_tree_remove, condense_tree);
2117         }
2118
2119         /*
2120          * We're about to drop the metaslab's lock thus allowing
2121          * other consumers to change its content. Set the
2122          * metaslab's ms_condensing flag to ensure that
2123          * allocations on this metaslab do not occur while we're
2124          * in the middle of committing it to disk. This is only critical
2125          * for the ms_tree as all other range trees use per txg
2126          * views of their content.
2127          */
2128         msp->ms_condensing = B_TRUE;
2129
2130         mutex_exit(&msp->ms_lock);
2131         space_map_truncate(sm, tx);
2132
2133         /*
2134          * While we would ideally like to create a space map representation
2135          * that consists only of allocation records, doing so can be
2136          * prohibitively expensive because the in-core free tree can be
2137          * large, and therefore computationally expensive to subtract
2138          * from the condense_tree. Instead we sync out two trees, a cheap
2139          * allocation only tree followed by the in-core free tree. While not
2140          * optimal, this is typically close to optimal, and much cheaper to
2141          * compute.
2142          */
2143         space_map_write(sm, condense_tree, SM_ALLOC, tx);
2144         range_tree_vacate(condense_tree, NULL, NULL);
2145         range_tree_destroy(condense_tree);
2146
2147         space_map_write(sm, msp->ms_tree, SM_FREE, tx);
2148         mutex_enter(&msp->ms_lock);
2149         msp->ms_condensing = B_FALSE;
2150 }
2151
2152 /*
2153  * Write a metaslab to disk in the context of the specified transaction group.
2154  */
2155 void
2156 metaslab_sync(metaslab_t *msp, uint64_t txg)
2157 {
2158         metaslab_group_t *mg = msp->ms_group;
2159         vdev_t *vd = mg->mg_vd;
2160         spa_t *spa = vd->vdev_spa;
2161         objset_t *mos = spa_meta_objset(spa);
2162         range_tree_t *alloctree = msp->ms_alloctree[txg & TXG_MASK];
2163         dmu_tx_t *tx;
2164         uint64_t object = space_map_object(msp->ms_sm);
2165
2166         ASSERT(!vd->vdev_ishole);
2167
2168         /*
2169          * This metaslab has just been added so there's no work to do now.
2170          */
2171         if (msp->ms_freeingtree == NULL) {
2172                 ASSERT3P(alloctree, ==, NULL);
2173                 return;
2174         }
2175
2176         ASSERT3P(alloctree, !=, NULL);
2177         ASSERT3P(msp->ms_freeingtree, !=, NULL);
2178         ASSERT3P(msp->ms_freedtree, !=, NULL);
2179
2180         /*
2181          * Normally, we don't want to process a metaslab if there
2182          * are no allocations or frees to perform. However, if the metaslab
2183          * is being forced to condense and it's loaded, we need to let it
2184          * through.
2185          */
2186         if (range_tree_space(alloctree) == 0 &&
2187             range_tree_space(msp->ms_freeingtree) == 0 &&
2188             !(msp->ms_loaded && msp->ms_condense_wanted))
2189                 return;
2190
2191
2192         VERIFY(txg <= spa_final_dirty_txg(spa));
2193
2194         /*
2195          * The only state that can actually be changing concurrently with
2196          * metaslab_sync() is the metaslab's ms_tree.  No other thread can
2197          * be modifying this txg's alloctree, freeingtree, freedtree, or
2198          * space_map_phys_t.  We drop ms_lock whenever we could call
2199          * into the DMU, because the DMU can call down to us
2200          * (e.g. via zio_free()) at any time.
2201          *
2202          * The spa_vdev_remove_thread() can be reading metaslab state
2203          * concurrently, and it is locked out by the ms_sync_lock.  Note
2204          * that the ms_lock is insufficient for this, because it is dropped
2205          * by space_map_write().
2206          */
2207
2208         tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
2209
2210         if (msp->ms_sm == NULL) {
2211                 uint64_t new_object;
2212
2213                 new_object = space_map_alloc(mos, tx);
2214                 VERIFY3U(new_object, !=, 0);
2215
2216                 VERIFY0(space_map_open(&msp->ms_sm, mos, new_object,
2217                     msp->ms_start, msp->ms_size, vd->vdev_ashift));
2218                 ASSERT(msp->ms_sm != NULL);
2219         }
2220
2221         mutex_enter(&msp->ms_sync_lock);
2222         mutex_enter(&msp->ms_lock);
2223
2224         /*
2225          * Note: metaslab_condense() clears the space map's histogram.
2226          * Therefore we must verify and remove this histogram before
2227          * condensing.
2228          */
2229         metaslab_group_histogram_verify(mg);
2230         metaslab_class_histogram_verify(mg->mg_class);
2231         metaslab_group_histogram_remove(mg, msp);
2232
2233         if (msp->ms_loaded && spa_sync_pass(spa) == 1 &&
2234             metaslab_should_condense(msp)) {
2235                 metaslab_condense(msp, txg, tx);
2236         } else {
2237                 mutex_exit(&msp->ms_lock);
2238                 space_map_write(msp->ms_sm, alloctree, SM_ALLOC, tx);
2239                 space_map_write(msp->ms_sm, msp->ms_freeingtree, SM_FREE, tx);
2240                 mutex_enter(&msp->ms_lock);
2241         }
2242
2243         if (msp->ms_loaded) {
2244                 /*
2245                  * When the space map is loaded, we have an accurate
2246                  * histogram in the range tree. This gives us an opportunity
2247                  * to bring the space map's histogram up-to-date so we clear
2248                  * it first before updating it.
2249                  */
2250                 space_map_histogram_clear(msp->ms_sm);
2251                 space_map_histogram_add(msp->ms_sm, msp->ms_tree, tx);
2252
2253                 /*
2254                  * Since we've cleared the histogram we need to add back
2255                  * any free space that has already been processed, plus
2256                  * any deferred space. This allows the on-disk histogram
2257                  * to accurately reflect all free space even if some space
2258                  * is not yet available for allocation (i.e. deferred).
2259                  */
2260                 space_map_histogram_add(msp->ms_sm, msp->ms_freedtree, tx);
2261
2262                 /*
2263                  * Add back any deferred free space that has not been
2264                  * added back into the in-core free tree yet. This will
2265                  * ensure that we don't end up with a space map histogram
2266                  * that is completely empty unless the metaslab is fully
2267                  * allocated.
2268                  */
2269                 for (int t = 0; t < TXG_DEFER_SIZE; t++) {
2270                         space_map_histogram_add(msp->ms_sm,
2271                             msp->ms_defertree[t], tx);
2272                 }
2273         }
2274
2275         /*
2276          * Always add the free space from this sync pass to the space
2277          * map histogram. We want to make sure that the on-disk histogram
2278          * accounts for all free space. If the space map is not loaded,
2279          * then we will lose some accuracy but will correct it the next
2280          * time we load the space map.
2281          */
2282         space_map_histogram_add(msp->ms_sm, msp->ms_freeingtree, tx);
2283
2284         metaslab_group_histogram_add(mg, msp);
2285         metaslab_group_histogram_verify(mg);
2286         metaslab_class_histogram_verify(mg->mg_class);
2287
2288         /*
2289          * For sync pass 1, we avoid traversing this txg's free range tree
2290          * and instead will just swap the pointers for freeingtree and
2291          * freedtree. We can safely do this since the freedtree is
2292          * guaranteed to be empty on the initial pass.
2293          */
2294         if (spa_sync_pass(spa) == 1) {
2295                 range_tree_swap(&msp->ms_freeingtree, &msp->ms_freedtree);
2296         } else {
2297                 range_tree_vacate(msp->ms_freeingtree,
2298                     range_tree_add, msp->ms_freedtree);
2299         }
2300         range_tree_vacate(alloctree, NULL, NULL);
2301
2302         ASSERT0(range_tree_space(msp->ms_alloctree[txg & TXG_MASK]));
2303         ASSERT0(range_tree_space(msp->ms_alloctree[TXG_CLEAN(txg) & TXG_MASK]));
2304         ASSERT0(range_tree_space(msp->ms_freeingtree));
2305
2306         mutex_exit(&msp->ms_lock);
2307
2308         if (object != space_map_object(msp->ms_sm)) {
2309                 object = space_map_object(msp->ms_sm);
2310                 dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) *
2311                     msp->ms_id, sizeof (uint64_t), &object, tx);
2312         }
2313         mutex_exit(&msp->ms_sync_lock);
2314         dmu_tx_commit(tx);
2315 }
2316
2317 /*
2318  * Called after a transaction group has completely synced to mark
2319  * all of the metaslab's free space as usable.
2320  */
2321 void
2322 metaslab_sync_done(metaslab_t *msp, uint64_t txg)
2323 {
2324         metaslab_group_t *mg = msp->ms_group;
2325         vdev_t *vd = mg->mg_vd;
2326         spa_t *spa = vd->vdev_spa;
2327         range_tree_t **defer_tree;
2328         int64_t alloc_delta, defer_delta;
2329         boolean_t defer_allowed = B_TRUE;
2330
2331         ASSERT(!vd->vdev_ishole);
2332
2333         mutex_enter(&msp->ms_lock);
2334
2335         /*
2336          * If this metaslab is just becoming available, initialize its
2337          * range trees and add its capacity to the vdev.
2338          */
2339         if (msp->ms_freedtree == NULL) {
2340                 for (int t = 0; t < TXG_SIZE; t++) {
2341                         ASSERT(msp->ms_alloctree[t] == NULL);
2342
2343                         msp->ms_alloctree[t] = range_tree_create(NULL, NULL);
2344                 }
2345
2346                 ASSERT3P(msp->ms_freeingtree, ==, NULL);
2347                 msp->ms_freeingtree = range_tree_create(NULL, NULL);
2348
2349                 ASSERT3P(msp->ms_freedtree, ==, NULL);
2350                 msp->ms_freedtree = range_tree_create(NULL, NULL);
2351
2352                 for (int t = 0; t < TXG_DEFER_SIZE; t++) {
2353                         ASSERT(msp->ms_defertree[t] == NULL);
2354
2355                         msp->ms_defertree[t] = range_tree_create(NULL, NULL);
2356                 }
2357
2358                 vdev_space_update(vd, 0, 0, msp->ms_size);
2359         }
2360
2361         defer_tree = &msp->ms_defertree[txg % TXG_DEFER_SIZE];
2362
2363         uint64_t free_space = metaslab_class_get_space(spa_normal_class(spa)) -
2364             metaslab_class_get_alloc(spa_normal_class(spa));
2365         if (free_space <= spa_get_slop_space(spa) || vd->vdev_removing) {
2366                 defer_allowed = B_FALSE;
2367         }
2368
2369         defer_delta = 0;
2370         alloc_delta = space_map_alloc_delta(msp->ms_sm);
2371         if (defer_allowed) {
2372                 defer_delta = range_tree_space(msp->ms_freedtree) -
2373                     range_tree_space(*defer_tree);
2374         } else {
2375                 defer_delta -= range_tree_space(*defer_tree);
2376         }
2377
2378         vdev_space_update(vd, alloc_delta + defer_delta, defer_delta, 0);
2379
2380         /*
2381          * If there's a metaslab_load() in progress, wait for it to complete
2382          * so that we have a consistent view of the in-core space map.
2383          */
2384         metaslab_load_wait(msp);
2385
2386         /*
2387          * Move the frees from the defer_tree back to the free
2388          * range tree (if it's loaded). Swap the freed_tree and the
2389          * defer_tree -- this is safe to do because we've just emptied out
2390          * the defer_tree.
2391          */
2392         range_tree_vacate(*defer_tree,
2393             msp->ms_loaded ? range_tree_add : NULL, msp->ms_tree);
2394         if (defer_allowed) {
2395                 range_tree_swap(&msp->ms_freedtree, defer_tree);
2396         } else {
2397                 range_tree_vacate(msp->ms_freedtree,
2398                     msp->ms_loaded ? range_tree_add : NULL, msp->ms_tree);
2399         }
2400
2401         space_map_update(msp->ms_sm);
2402
2403         msp->ms_deferspace += defer_delta;
2404         ASSERT3S(msp->ms_deferspace, >=, 0);
2405         ASSERT3S(msp->ms_deferspace, <=, msp->ms_size);
2406         if (msp->ms_deferspace != 0) {
2407                 /*
2408                  * Keep syncing this metaslab until all deferred frees
2409                  * are back in circulation.
2410                  */
2411                 vdev_dirty(vd, VDD_METASLAB, msp, txg + 1);
2412         }
2413
2414         /*
2415          * Calculate the new weights before unloading any metaslabs.
2416          * This will give us the most accurate weighting.
2417          */
2418         metaslab_group_sort(mg, msp, metaslab_weight(msp));
2419
2420         /*
2421          * If the metaslab is loaded and we've not tried to load or allocate
2422          * from it in 'metaslab_unload_delay' txgs, then unload it.
2423          */
2424         if (msp->ms_loaded &&
2425             msp->ms_selected_txg + metaslab_unload_delay < txg) {
2426
2427                 for (int t = 1; t < TXG_CONCURRENT_STATES; t++) {
2428                         VERIFY0(range_tree_space(
2429                             msp->ms_alloctree[(txg + t) & TXG_MASK]));
2430                 }
2431
2432                 if (!metaslab_debug_unload)
2433                         metaslab_unload(msp);
2434         }
2435
2436         ASSERT0(range_tree_space(msp->ms_alloctree[txg & TXG_MASK]));
2437         ASSERT0(range_tree_space(msp->ms_freeingtree));
2438         ASSERT0(range_tree_space(msp->ms_freedtree));
2439
2440         mutex_exit(&msp->ms_lock);
2441 }
2442
2443 void
2444 metaslab_sync_reassess(metaslab_group_t *mg)
2445 {
2446         spa_t *spa = mg->mg_class->mc_spa;
2447
2448         spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
2449         metaslab_group_alloc_update(mg);
2450         mg->mg_fragmentation = metaslab_group_fragmentation(mg);
2451
2452         /*
2453          * Preload the next potential metaslabs but only on active
2454          * metaslab groups. We can get into a state where the metaslab
2455          * is no longer active since we dirty metaslabs as we remove a
2456          * a device, thus potentially making the metaslab group eligible
2457          * for preloading.
2458          */
2459         if (mg->mg_activation_count > 0) {
2460                 metaslab_group_preload(mg);
2461         }
2462         spa_config_exit(spa, SCL_ALLOC, FTAG);
2463 }
2464
2465 static uint64_t
2466 metaslab_distance(metaslab_t *msp, dva_t *dva)
2467 {
2468         uint64_t ms_shift = msp->ms_group->mg_vd->vdev_ms_shift;
2469         uint64_t offset = DVA_GET_OFFSET(dva) >> ms_shift;
2470         uint64_t start = msp->ms_id;
2471
2472         if (msp->ms_group->mg_vd->vdev_id != DVA_GET_VDEV(dva))
2473                 return (1ULL << 63);
2474
2475         if (offset < start)
2476                 return ((start - offset) << ms_shift);
2477         if (offset > start)
2478                 return ((offset - start) << ms_shift);
2479         return (0);
2480 }
2481
2482 /*
2483  * ==========================================================================
2484  * Metaslab allocation tracing facility
2485  * ==========================================================================
2486  */
2487 #ifdef _METASLAB_TRACING
2488 kstat_t *metaslab_trace_ksp;
2489 kstat_named_t metaslab_trace_over_limit;
2490
2491 void
2492 metaslab_alloc_trace_init(void)
2493 {
2494         ASSERT(metaslab_alloc_trace_cache == NULL);
2495         metaslab_alloc_trace_cache = kmem_cache_create(
2496             "metaslab_alloc_trace_cache", sizeof (metaslab_alloc_trace_t),
2497             0, NULL, NULL, NULL, NULL, NULL, 0);
2498         metaslab_trace_ksp = kstat_create("zfs", 0, "metaslab_trace_stats",
2499             "misc", KSTAT_TYPE_NAMED, 1, KSTAT_FLAG_VIRTUAL);
2500         if (metaslab_trace_ksp != NULL) {
2501                 metaslab_trace_ksp->ks_data = &metaslab_trace_over_limit;
2502                 kstat_named_init(&metaslab_trace_over_limit,
2503                     "metaslab_trace_over_limit", KSTAT_DATA_UINT64);
2504                 kstat_install(metaslab_trace_ksp);
2505         }
2506 }
2507
2508 void
2509 metaslab_alloc_trace_fini(void)
2510 {
2511         if (metaslab_trace_ksp != NULL) {
2512                 kstat_delete(metaslab_trace_ksp);
2513                 metaslab_trace_ksp = NULL;
2514         }
2515         kmem_cache_destroy(metaslab_alloc_trace_cache);
2516         metaslab_alloc_trace_cache = NULL;
2517 }
2518
2519 /*
2520  * Add an allocation trace element to the allocation tracing list.
2521  */
2522 static void
2523 metaslab_trace_add(zio_alloc_list_t *zal, metaslab_group_t *mg,
2524     metaslab_t *msp, uint64_t psize, uint32_t dva_id, uint64_t offset)
2525 {
2526         metaslab_alloc_trace_t *mat;
2527
2528         if (!metaslab_trace_enabled)
2529                 return;
2530
2531         /*
2532          * When the tracing list reaches its maximum we remove
2533          * the second element in the list before adding a new one.
2534          * By removing the second element we preserve the original
2535          * entry as a clue to what allocations steps have already been
2536          * performed.
2537          */
2538         if (zal->zal_size == metaslab_trace_max_entries) {
2539                 metaslab_alloc_trace_t *mat_next;
2540 #ifdef DEBUG
2541                 panic("too many entries in allocation list");
2542 #endif
2543                 atomic_inc_64(&metaslab_trace_over_limit.value.ui64);
2544                 zal->zal_size--;
2545                 mat_next = list_next(&zal->zal_list, list_head(&zal->zal_list));
2546                 list_remove(&zal->zal_list, mat_next);
2547                 kmem_cache_free(metaslab_alloc_trace_cache, mat_next);
2548         }
2549
2550         mat = kmem_cache_alloc(metaslab_alloc_trace_cache, KM_SLEEP);
2551         list_link_init(&mat->mat_list_node);
2552         mat->mat_mg = mg;
2553         mat->mat_msp = msp;
2554         mat->mat_size = psize;
2555         mat->mat_dva_id = dva_id;
2556         mat->mat_offset = offset;
2557         mat->mat_weight = 0;
2558
2559         if (msp != NULL)
2560                 mat->mat_weight = msp->ms_weight;
2561
2562         /*
2563          * The list is part of the zio so locking is not required. Only
2564          * a single thread will perform allocations for a given zio.
2565          */
2566         list_insert_tail(&zal->zal_list, mat);
2567         zal->zal_size++;
2568
2569         ASSERT3U(zal->zal_size, <=, metaslab_trace_max_entries);
2570 }
2571
2572 void
2573 metaslab_trace_init(zio_alloc_list_t *zal)
2574 {
2575         list_create(&zal->zal_list, sizeof (metaslab_alloc_trace_t),
2576             offsetof(metaslab_alloc_trace_t, mat_list_node));
2577         zal->zal_size = 0;
2578 }
2579
2580 void
2581 metaslab_trace_fini(zio_alloc_list_t *zal)
2582 {
2583         metaslab_alloc_trace_t *mat;
2584
2585         while ((mat = list_remove_head(&zal->zal_list)) != NULL)
2586                 kmem_cache_free(metaslab_alloc_trace_cache, mat);
2587         list_destroy(&zal->zal_list);
2588         zal->zal_size = 0;
2589 }
2590 #else
2591
2592 #define metaslab_trace_add(zal, mg, msp, psize, id, off)
2593
2594 void
2595 metaslab_alloc_trace_init(void)
2596 {
2597 }
2598
2599 void
2600 metaslab_alloc_trace_fini(void)
2601 {
2602 }
2603
2604 void
2605 metaslab_trace_init(zio_alloc_list_t *zal)
2606 {
2607 }
2608
2609 void
2610 metaslab_trace_fini(zio_alloc_list_t *zal)
2611 {
2612 }
2613
2614 #endif /* _METASLAB_TRACING */
2615
2616 /*
2617  * ==========================================================================
2618  * Metaslab block operations
2619  * ==========================================================================
2620  */
2621
2622 static void
2623 metaslab_group_alloc_increment(spa_t *spa, uint64_t vdev, void *tag, int flags)
2624 {
2625         if (!(flags & METASLAB_ASYNC_ALLOC) ||
2626             flags & METASLAB_DONT_THROTTLE)
2627                 return;
2628
2629         metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
2630         if (!mg->mg_class->mc_alloc_throttle_enabled)
2631                 return;
2632
2633         (void) refcount_add(&mg->mg_alloc_queue_depth, tag);
2634 }
2635
2636 void
2637 metaslab_group_alloc_decrement(spa_t *spa, uint64_t vdev, void *tag, int flags)
2638 {
2639         if (!(flags & METASLAB_ASYNC_ALLOC) ||
2640             flags & METASLAB_DONT_THROTTLE)
2641                 return;
2642
2643         metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
2644         if (!mg->mg_class->mc_alloc_throttle_enabled)
2645                 return;
2646
2647         (void) refcount_remove(&mg->mg_alloc_queue_depth, tag);
2648 }
2649
2650 void
2651 metaslab_group_alloc_verify(spa_t *spa, const blkptr_t *bp, void *tag)
2652 {
2653 #ifdef ZFS_DEBUG
2654         const dva_t *dva = bp->blk_dva;
2655         int ndvas = BP_GET_NDVAS(bp);
2656
2657         for (int d = 0; d < ndvas; d++) {
2658                 uint64_t vdev = DVA_GET_VDEV(&dva[d]);
2659                 metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
2660                 VERIFY(refcount_not_held(&mg->mg_alloc_queue_depth, tag));
2661         }
2662 #endif
2663 }
2664
2665 static uint64_t
2666 metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg)
2667 {
2668         uint64_t start;
2669         range_tree_t *rt = msp->ms_tree;
2670         metaslab_class_t *mc = msp->ms_group->mg_class;
2671
2672         VERIFY(!msp->ms_condensing);
2673
2674         start = mc->mc_ops->msop_alloc(msp, size);
2675         if (start != -1ULL) {
2676                 metaslab_group_t *mg = msp->ms_group;
2677                 vdev_t *vd = mg->mg_vd;
2678
2679                 VERIFY0(P2PHASE(start, 1ULL << vd->vdev_ashift));
2680                 VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
2681                 VERIFY3U(range_tree_space(rt) - size, <=, msp->ms_size);
2682                 range_tree_remove(rt, start, size);
2683
2684                 if (range_tree_space(msp->ms_alloctree[txg & TXG_MASK]) == 0)
2685                         vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg);
2686
2687                 range_tree_add(msp->ms_alloctree[txg & TXG_MASK], start, size);
2688
2689                 /* Track the last successful allocation */
2690                 msp->ms_alloc_txg = txg;
2691                 metaslab_verify_space(msp, txg);
2692         }
2693
2694         /*
2695          * Now that we've attempted the allocation we need to update the
2696          * metaslab's maximum block size since it may have changed.
2697          */
2698         msp->ms_max_size = metaslab_block_maxsize(msp);
2699         return (start);
2700 }
2701
2702 static uint64_t
2703 metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal,
2704     uint64_t asize, uint64_t txg, uint64_t min_distance, dva_t *dva, int d)
2705 {
2706         metaslab_t *msp = NULL;
2707         uint64_t offset = -1ULL;
2708         uint64_t activation_weight;
2709         uint64_t target_distance;
2710         int i;
2711
2712         activation_weight = METASLAB_WEIGHT_PRIMARY;
2713         for (i = 0; i < d; i++) {
2714                 if (DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) {
2715                         activation_weight = METASLAB_WEIGHT_SECONDARY;
2716                         break;
2717                 }
2718         }
2719
2720         metaslab_t *search = kmem_alloc(sizeof (*search), KM_SLEEP);
2721         search->ms_weight = UINT64_MAX;
2722         search->ms_start = 0;
2723         for (;;) {
2724                 boolean_t was_active;
2725                 avl_tree_t *t = &mg->mg_metaslab_tree;
2726                 avl_index_t idx;
2727
2728                 mutex_enter(&mg->mg_lock);
2729
2730                 /*
2731                  * Find the metaslab with the highest weight that is less
2732                  * than what we've already tried.  In the common case, this
2733                  * means that we will examine each metaslab at most once.
2734                  * Note that concurrent callers could reorder metaslabs
2735                  * by activation/passivation once we have dropped the mg_lock.
2736                  * If a metaslab is activated by another thread, and we fail
2737                  * to allocate from the metaslab we have selected, we may
2738                  * not try the newly-activated metaslab, and instead activate
2739                  * another metaslab.  This is not optimal, but generally
2740                  * does not cause any problems (a possible exception being
2741                  * if every metaslab is completely full except for the
2742                  * the newly-activated metaslab which we fail to examine).
2743                  */
2744                 msp = avl_find(t, search, &idx);
2745                 if (msp == NULL)
2746                         msp = avl_nearest(t, idx, AVL_AFTER);
2747                 for (; msp != NULL; msp = AVL_NEXT(t, msp)) {
2748
2749                         if (!metaslab_should_allocate(msp, asize)) {
2750                                 metaslab_trace_add(zal, mg, msp, asize, d,
2751                                     TRACE_TOO_SMALL);
2752                                 continue;
2753                         }
2754
2755                         /*
2756                          * If the selected metaslab is condensing, skip it.
2757                          */
2758                         if (msp->ms_condensing)
2759                                 continue;
2760
2761                         was_active = msp->ms_weight & METASLAB_ACTIVE_MASK;
2762                         if (activation_weight == METASLAB_WEIGHT_PRIMARY)
2763                                 break;
2764
2765                         target_distance = min_distance +
2766                             (space_map_allocated(msp->ms_sm) != 0 ? 0 :
2767                             min_distance >> 1);
2768
2769                         for (i = 0; i < d; i++) {
2770                                 if (metaslab_distance(msp, &dva[i]) <
2771                                     target_distance)
2772                                         break;
2773                         }
2774                         if (i == d)
2775                                 break;
2776                 }
2777                 mutex_exit(&mg->mg_lock);
2778                 if (msp == NULL) {
2779                         kmem_free(search, sizeof (*search));
2780                         return (-1ULL);
2781                 }
2782                 search->ms_weight = msp->ms_weight;
2783                 search->ms_start = msp->ms_start + 1;
2784
2785                 mutex_enter(&msp->ms_lock);
2786
2787                 /*
2788                  * Ensure that the metaslab we have selected is still
2789                  * capable of handling our request. It's possible that
2790                  * another thread may have changed the weight while we
2791                  * were blocked on the metaslab lock. We check the
2792                  * active status first to see if we need to reselect
2793                  * a new metaslab.
2794                  */
2795                 if (was_active && !(msp->ms_weight & METASLAB_ACTIVE_MASK)) {
2796                         mutex_exit(&msp->ms_lock);
2797                         continue;
2798                 }
2799
2800                 if ((msp->ms_weight & METASLAB_WEIGHT_SECONDARY) &&
2801                     activation_weight == METASLAB_WEIGHT_PRIMARY) {
2802                         metaslab_passivate(msp,
2803                             msp->ms_weight & ~METASLAB_ACTIVE_MASK);
2804                         mutex_exit(&msp->ms_lock);
2805                         continue;
2806                 }
2807
2808                 if (metaslab_activate(msp, activation_weight) != 0) {
2809                         mutex_exit(&msp->ms_lock);
2810                         continue;
2811                 }
2812                 msp->ms_selected_txg = txg;
2813
2814                 /*
2815                  * Now that we have the lock, recheck to see if we should
2816                  * continue to use this metaslab for this allocation. The
2817                  * the metaslab is now loaded so metaslab_should_allocate() can
2818                  * accurately determine if the allocation attempt should
2819                  * proceed.
2820                  */
2821                 if (!metaslab_should_allocate(msp, asize)) {
2822                         /* Passivate this metaslab and select a new one. */
2823                         metaslab_trace_add(zal, mg, msp, asize, d,
2824                             TRACE_TOO_SMALL);
2825                         goto next;
2826                 }
2827
2828
2829                 /*
2830                  * If this metaslab is currently condensing then pick again as
2831                  * we can't manipulate this metaslab until it's committed
2832                  * to disk.
2833                  */
2834                 if (msp->ms_condensing) {
2835                         metaslab_trace_add(zal, mg, msp, asize, d,
2836                             TRACE_CONDENSING);
2837                         mutex_exit(&msp->ms_lock);
2838                         continue;
2839                 }
2840
2841                 offset = metaslab_block_alloc(msp, asize, txg);
2842                 metaslab_trace_add(zal, mg, msp, asize, d, offset);
2843
2844                 if (offset != -1ULL) {
2845                         /* Proactively passivate the metaslab, if needed */
2846                         metaslab_segment_may_passivate(msp);
2847                         break;
2848                 }
2849 next:
2850                 ASSERT(msp->ms_loaded);
2851
2852                 /*
2853                  * We were unable to allocate from this metaslab so determine
2854                  * a new weight for this metaslab. Now that we have loaded
2855                  * the metaslab we can provide a better hint to the metaslab
2856                  * selector.
2857                  *
2858                  * For space-based metaslabs, we use the maximum block size.
2859                  * This information is only available when the metaslab
2860                  * is loaded and is more accurate than the generic free
2861                  * space weight that was calculated by metaslab_weight().
2862                  * This information allows us to quickly compare the maximum
2863                  * available allocation in the metaslab to the allocation
2864                  * size being requested.
2865                  *
2866                  * For segment-based metaslabs, determine the new weight
2867                  * based on the highest bucket in the range tree. We
2868                  * explicitly use the loaded segment weight (i.e. the range
2869                  * tree histogram) since it contains the space that is
2870                  * currently available for allocation and is accurate
2871                  * even within a sync pass.
2872                  */
2873                 if (WEIGHT_IS_SPACEBASED(msp->ms_weight)) {
2874                         uint64_t weight = metaslab_block_maxsize(msp);
2875                         WEIGHT_SET_SPACEBASED(weight);
2876                         metaslab_passivate(msp, weight);
2877                 } else {
2878                         metaslab_passivate(msp,
2879                             metaslab_weight_from_range_tree(msp));
2880                 }
2881
2882                 /*
2883                  * We have just failed an allocation attempt, check
2884                  * that metaslab_should_allocate() agrees. Otherwise,
2885                  * we may end up in an infinite loop retrying the same
2886                  * metaslab.
2887                  */
2888                 ASSERT(!metaslab_should_allocate(msp, asize));
2889                 mutex_exit(&msp->ms_lock);
2890         }
2891         mutex_exit(&msp->ms_lock);
2892         kmem_free(search, sizeof (*search));
2893         return (offset);
2894 }
2895
2896 static uint64_t
2897 metaslab_group_alloc(metaslab_group_t *mg, zio_alloc_list_t *zal,
2898     uint64_t asize, uint64_t txg, uint64_t min_distance, dva_t *dva, int d)
2899 {
2900         uint64_t offset;
2901         ASSERT(mg->mg_initialized);
2902
2903         offset = metaslab_group_alloc_normal(mg, zal, asize, txg,
2904             min_distance, dva, d);
2905
2906         mutex_enter(&mg->mg_lock);
2907         if (offset == -1ULL) {
2908                 mg->mg_failed_allocations++;
2909                 metaslab_trace_add(zal, mg, NULL, asize, d,
2910                     TRACE_GROUP_FAILURE);
2911                 if (asize == SPA_GANGBLOCKSIZE) {
2912                         /*
2913                          * This metaslab group was unable to allocate
2914                          * the minimum gang block size so it must be out of
2915                          * space. We must notify the allocation throttle
2916                          * to start skipping allocation attempts to this
2917                          * metaslab group until more space becomes available.
2918                          * Note: this failure cannot be caused by the
2919                          * allocation throttle since the allocation throttle
2920                          * is only responsible for skipping devices and
2921                          * not failing block allocations.
2922                          */
2923                         mg->mg_no_free_space = B_TRUE;
2924                 }
2925         }
2926         mg->mg_allocations++;
2927         mutex_exit(&mg->mg_lock);
2928         return (offset);
2929 }
2930
/*
 * If we have to write a ditto block (i.e. more than one DVA for a given BP)
 * on the same vdev as an existing DVA of this BP, then try to allocate it
 * at least (vdev_asize / (2 ^ ditto_same_vdev_distance_shift)) away from the
 * existing DVAs.  With the default shift of 3 this is 1/8th of the vdev.
 */
int ditto_same_vdev_distance_shift = 3;
2938
2939 /*
2940  * Allocate a block for the specified i/o.
2941  */
2942 int
2943 metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
2944     dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags,
2945     zio_alloc_list_t *zal)
2946 {
2947         metaslab_group_t *mg, *fast_mg, *rotor;
2948         vdev_t *vd;
2949         boolean_t try_hard = B_FALSE;
2950
2951         ASSERT(!DVA_IS_VALID(&dva[d]));
2952
2953         /*
2954          * For testing, make some blocks above a certain size be gang blocks.
2955          */
2956         if (psize >= metaslab_force_ganging && (ddi_get_lbolt() & 3) == 0) {
2957                 metaslab_trace_add(zal, NULL, NULL, psize, d, TRACE_FORCE_GANG);
2958                 return (SET_ERROR(ENOSPC));
2959         }
2960
2961         /*
2962          * Start at the rotor and loop through all mgs until we find something.
2963          * Note that there's no locking on mc_rotor or mc_aliquot because
2964          * nothing actually breaks if we miss a few updates -- we just won't
2965          * allocate quite as evenly.  It all balances out over time.
2966          *
2967          * If we are doing ditto or log blocks, try to spread them across
2968          * consecutive vdevs.  If we're forced to reuse a vdev before we've
2969          * allocated all of our ditto blocks, then try and spread them out on
2970          * that vdev as much as possible.  If it turns out to not be possible,
2971          * gradually lower our standards until anything becomes acceptable.
2972          * Also, allocating on consecutive vdevs (as opposed to random vdevs)
2973          * gives us hope of containing our fault domains to something we're
2974          * able to reason about.  Otherwise, any two top-level vdev failures
2975          * will guarantee the loss of data.  With consecutive allocation,
2976          * only two adjacent top-level vdev failures will result in data loss.
2977          *
2978          * If we are doing gang blocks (hintdva is non-NULL), try to keep
2979          * ourselves on the same vdev as our gang block header.  That
2980          * way, we can hope for locality in vdev_cache, plus it makes our
2981          * fault domains something tractable.
2982          */
2983         if (hintdva) {
2984                 vd = vdev_lookup_top(spa, DVA_GET_VDEV(&hintdva[d]));
2985
2986                 /*
2987                  * It's possible the vdev we're using as the hint no
2988                  * longer exists or its mg has been closed (e.g. by
2989                  * device removal).  Consult the rotor when
2990                  * all else fails.
2991                  */
2992                 if (vd != NULL && vd->vdev_mg != NULL) {
2993                         mg = vd->vdev_mg;
2994
2995                         if (flags & METASLAB_HINTBP_AVOID &&
2996                             mg->mg_next != NULL)
2997                                 mg = mg->mg_next;
2998                 } else {
2999                         mg = mc->mc_rotor;
3000                 }
3001         } else if (d != 0) {
3002                 vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1]));
3003                 mg = vd->vdev_mg->mg_next;
3004         } else if (flags & METASLAB_FASTWRITE) {
3005                 mg = fast_mg = mc->mc_rotor;
3006
3007                 do {
3008                         if (fast_mg->mg_vd->vdev_pending_fastwrite <
3009                             mg->mg_vd->vdev_pending_fastwrite)
3010                                 mg = fast_mg;
3011                 } while ((fast_mg = fast_mg->mg_next) != mc->mc_rotor);
3012
3013         } else {
3014                 mg = mc->mc_rotor;
3015         }
3016
3017         /*
3018          * If the hint put us into the wrong metaslab class, or into a
3019          * metaslab group that has been passivated, just follow the rotor.
3020          */
3021         if (mg->mg_class != mc || mg->mg_activation_count <= 0)
3022                 mg = mc->mc_rotor;
3023
3024         rotor = mg;
3025 top:
3026         do {
3027                 boolean_t allocatable;
3028
3029                 ASSERT(mg->mg_activation_count == 1);
3030                 vd = mg->mg_vd;
3031
3032                 /*
3033                  * Don't allocate from faulted devices.
3034                  */
3035                 if (try_hard) {
3036                         spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER);
3037                         allocatable = vdev_allocatable(vd);
3038                         spa_config_exit(spa, SCL_ZIO, FTAG);
3039                 } else {
3040                         allocatable = vdev_allocatable(vd);
3041                 }
3042
3043                 /*
3044                  * Determine if the selected metaslab group is eligible
3045                  * for allocations. If we're ganging then don't allow
3046                  * this metaslab group to skip allocations since that would
3047                  * inadvertently return ENOSPC and suspend the pool
3048                  * even though space is still available.
3049                  */
3050                 if (allocatable && !GANG_ALLOCATION(flags) && !try_hard) {
3051                         allocatable = metaslab_group_allocatable(mg, rotor,
3052                             psize);
3053                 }
3054
3055                 if (!allocatable) {
3056                         metaslab_trace_add(zal, mg, NULL, psize, d,
3057                             TRACE_NOT_ALLOCATABLE);
3058                         goto next;
3059                 }
3060
3061                 ASSERT(mg->mg_initialized);
3062
3063                 /*
3064                  * Avoid writing single-copy data to a failing,
3065                  * non-redundant vdev, unless we've already tried all
3066                  * other vdevs.
3067                  */
3068                 if ((vd->vdev_stat.vs_write_errors > 0 ||
3069                     vd->vdev_state < VDEV_STATE_HEALTHY) &&
3070                     d == 0 && !try_hard && vd->vdev_children == 0) {
3071                         metaslab_trace_add(zal, mg, NULL, psize, d,
3072                             TRACE_VDEV_ERROR);
3073                         goto next;
3074                 }
3075
3076                 ASSERT(mg->mg_class == mc);
3077
3078                 /*
3079                  * If we don't need to try hard, then require that the
3080                  * block be 1/8th of the device away from any other DVAs
3081                  * in this BP.  If we are trying hard, allow any offset
3082                  * to be used (distance=0).
3083                  */
3084                 uint64_t distance = 0;
3085                 if (!try_hard) {
3086                         distance = vd->vdev_asize >>
3087                             ditto_same_vdev_distance_shift;
3088                         if (distance <= (1ULL << vd->vdev_ms_shift))
3089                                 distance = 0;
3090                 }
3091
3092                 uint64_t asize = vdev_psize_to_asize(vd, psize);
3093                 ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0);
3094
3095                 uint64_t offset = metaslab_group_alloc(mg, zal, asize, txg,
3096                     distance, dva, d);
3097
3098                 if (offset != -1ULL) {
3099                         /*
3100                          * If we've just selected this metaslab group,
3101                          * figure out whether the corresponding vdev is
3102                          * over- or under-used relative to the pool,
3103                          * and set an allocation bias to even it out.
3104                          *
3105                          * Bias is also used to compensate for unequally
3106                          * sized vdevs so that space is allocated fairly.
3107                          */
3108                         if (mc->mc_aliquot == 0 && metaslab_bias_enabled) {
3109                                 vdev_stat_t *vs = &vd->vdev_stat;
3110                                 int64_t vs_free = vs->vs_space - vs->vs_alloc;
3111                                 int64_t mc_free = mc->mc_space - mc->mc_alloc;
3112                                 int64_t ratio;
3113
3114                                 /*
3115                                  * Calculate how much more or less we should
3116                                  * try to allocate from this device during
3117                                  * this iteration around the rotor.
3118                                  *
3119                                  * This basically introduces a zero-centered
3120                                  * bias towards the devices with the most
3121                                  * free space, while compensating for vdev
3122                                  * size differences.
3123                                  *
3124                                  * Examples:
3125                                  *  vdev V1 = 16M/128M
3126                                  *  vdev V2 = 16M/128M
3127                                  *  ratio(V1) = 100% ratio(V2) = 100%
3128                                  *
3129                                  *  vdev V1 = 16M/128M
3130                                  *  vdev V2 = 64M/128M
3131                                  *  ratio(V1) = 127% ratio(V2) =  72%
3132                                  *
3133                                  *  vdev V1 = 16M/128M
3134                                  *  vdev V2 = 64M/512M
3135                                  *  ratio(V1) =  40% ratio(V2) = 160%
3136                                  */
3137                                 ratio = (vs_free * mc->mc_alloc_groups * 100) /
3138                                     (mc_free + 1);
3139                                 mg->mg_bias = ((ratio - 100) *
3140                                     (int64_t)mg->mg_aliquot) / 100;
3141                         } else if (!metaslab_bias_enabled) {
3142                                 mg->mg_bias = 0;
3143                         }
3144
3145                         if ((flags & METASLAB_FASTWRITE) ||
3146                             atomic_add_64_nv(&mc->mc_aliquot, asize) >=
3147                             mg->mg_aliquot + mg->mg_bias) {
3148                                 mc->mc_rotor = mg->mg_next;
3149                                 mc->mc_aliquot = 0;
3150                         }
3151
3152                         DVA_SET_VDEV(&dva[d], vd->vdev_id);
3153                         DVA_SET_OFFSET(&dva[d], offset);
3154                         DVA_SET_GANG(&dva[d],
3155                             ((flags & METASLAB_GANG_HEADER) ? 1 : 0));
3156                         DVA_SET_ASIZE(&dva[d], asize);
3157
3158                         if (flags & METASLAB_FASTWRITE) {
3159                                 atomic_add_64(&vd->vdev_pending_fastwrite,
3160                                     psize);
3161                         }
3162
3163                         return (0);
3164                 }
3165 next:
3166                 mc->mc_rotor = mg->mg_next;
3167                 mc->mc_aliquot = 0;
3168         } while ((mg = mg->mg_next) != rotor);
3169
3170         /*
3171          * If we haven't tried hard, do so now.
3172          */
3173         if (!try_hard) {
3174                 try_hard = B_TRUE;
3175                 goto top;
3176         }
3177
3178         bzero(&dva[d], sizeof (dva_t));
3179
3180         metaslab_trace_add(zal, rotor, NULL, psize, d, TRACE_ENOSPC);
3181         return (SET_ERROR(ENOSPC));
3182 }
3183
3184 void
3185 metaslab_free_concrete(vdev_t *vd, uint64_t offset, uint64_t asize,
3186     uint64_t txg)
3187 {
3188         metaslab_t *msp;
3189         ASSERTV(spa_t *spa = vd->vdev_spa);
3190
3191         ASSERT3U(txg, ==, spa->spa_syncing_txg);
3192         ASSERT(vdev_is_concrete(vd));
3193         ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);
3194         ASSERT3U(offset >> vd->vdev_ms_shift, <, vd->vdev_ms_count);
3195
3196         msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
3197
3198         VERIFY(!msp->ms_condensing);
3199         VERIFY3U(offset, >=, msp->ms_start);
3200         VERIFY3U(offset + asize, <=, msp->ms_start + msp->ms_size);
3201         VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
3202         VERIFY0(P2PHASE(asize, 1ULL << vd->vdev_ashift));
3203
3204         metaslab_check_free_impl(vd, offset, asize);
3205         mutex_enter(&msp->ms_lock);
3206         if (range_tree_space(msp->ms_freeingtree) == 0) {
3207                 vdev_dirty(vd, VDD_METASLAB, msp, txg);
3208         }
3209         range_tree_add(msp->ms_freeingtree, offset, asize);
3210         mutex_exit(&msp->ms_lock);
3211 }
3212
3213 /* ARGSUSED */
3214 void
3215 metaslab_free_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset,
3216     uint64_t size, void *arg)
3217 {
3218         uint64_t *txgp = arg;
3219
3220         if (vd->vdev_ops->vdev_op_remap != NULL)
3221                 vdev_indirect_mark_obsolete(vd, offset, size, *txgp);
3222         else
3223                 metaslab_free_impl(vd, offset, size, *txgp);
3224 }
3225
3226 static void
3227 metaslab_free_impl(vdev_t *vd, uint64_t offset, uint64_t size,
3228     uint64_t txg)
3229 {
3230         spa_t *spa = vd->vdev_spa;
3231
3232         ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);
3233
3234         if (txg > spa_freeze_txg(spa))
3235                 return;
3236
3237         if (spa->spa_vdev_removal != NULL &&
3238             spa->spa_vdev_removal->svr_vdev_id == vd->vdev_id &&
3239             vdev_is_concrete(vd)) {
3240                 /*
3241                  * Note: we check if the vdev is concrete because when
3242                  * we complete the removal, we first change the vdev to be
3243                  * an indirect vdev (in open context), and then (in syncing
3244                  * context) clear spa_vdev_removal.
3245                  */
3246                 free_from_removing_vdev(vd, offset, size, txg);
3247         } else if (vd->vdev_ops->vdev_op_remap != NULL) {
3248                 vdev_indirect_mark_obsolete(vd, offset, size, txg);
3249                 vd->vdev_ops->vdev_op_remap(vd, offset, size,
3250                     metaslab_free_impl_cb, &txg);
3251         } else {
3252                 metaslab_free_concrete(vd, offset, size, txg);
3253         }
3254 }
3255
/*
 * State shared between spa_remap_blkptr() and remap_blkptr_cb() while
 * DVA[0] of a block pointer is being remapped.
 */
typedef struct remap_blkptr_cb_arg {
        /* block pointer whose DVA[0] and phys birth are rewritten */
        blkptr_t *rbca_bp;
        /* caller's callback; remap_blkptr_cb() skips it when NULL */
        spa_remap_cb_t rbca_cb;
        /*
         * vdev and offset reported by the next rbca_cb invocation.  They
         * start as the BP's original vdev and offset and are advanced to
         * the remap target after each invocation.
         */
        vdev_t *rbca_remap_vd;
        uint64_t rbca_remap_offset;
        /* opaque argument handed back to rbca_cb */
        void *rbca_cb_arg;
} remap_blkptr_cb_arg_t;
3263
3264 void
3265 remap_blkptr_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset,
3266     uint64_t size, void *arg)
3267 {
3268         remap_blkptr_cb_arg_t *rbca = arg;
3269         blkptr_t *bp = rbca->rbca_bp;
3270
3271         /* We can not remap split blocks. */
3272         if (size != DVA_GET_ASIZE(&bp->blk_dva[0]))
3273                 return;
3274         ASSERT0(inner_offset);
3275
3276         if (rbca->rbca_cb != NULL) {
3277                 /*
3278                  * At this point we know that we are not handling split
3279                  * blocks and we invoke the callback on the previous
3280                  * vdev which must be indirect.
3281                  */
3282                 ASSERT3P(rbca->rbca_remap_vd->vdev_ops, ==, &vdev_indirect_ops);
3283
3284                 rbca->rbca_cb(rbca->rbca_remap_vd->vdev_id,
3285                     rbca->rbca_remap_offset, size, rbca->rbca_cb_arg);
3286
3287                 /* set up remap_blkptr_cb_arg for the next call */
3288                 rbca->rbca_remap_vd = vd;
3289                 rbca->rbca_remap_offset = offset;
3290         }
3291
3292         /*
3293          * The phys birth time is that of dva[0].  This ensures that we know
3294          * when each dva was written, so that resilver can determine which
3295          * blocks need to be scrubbed (i.e. those written during the time
3296          * the vdev was offline).  It also ensures that the key used in
3297          * the ARC hash table is unique (i.e. dva[0] + phys_birth).  If
3298          * we didn't change the phys_birth, a lookup in the ARC for a
3299          * remapped BP could find the data that was previously stored at
3300          * this vdev + offset.
3301          */
3302         vdev_t *oldvd = vdev_lookup_top(vd->vdev_spa,
3303             DVA_GET_VDEV(&bp->blk_dva[0]));
3304         vdev_indirect_births_t *vib = oldvd->vdev_indirect_births;
3305         bp->blk_phys_birth = vdev_indirect_births_physbirth(vib,
3306             DVA_GET_OFFSET(&bp->blk_dva[0]), DVA_GET_ASIZE(&bp->blk_dva[0]));
3307
3308         DVA_SET_VDEV(&bp->blk_dva[0], vd->vdev_id);
3309         DVA_SET_OFFSET(&bp->blk_dva[0], offset);
3310 }
3311
3312 /*
3313  * If the block pointer contains any indirect DVAs, modify them to refer to
3314  * concrete DVAs.  Note that this will sometimes not be possible, leaving
3315  * the indirect DVA in place.  This happens if the indirect DVA spans multiple
3316  * segments in the mapping (i.e. it is a "split block").
3317  *
3318  * If the BP was remapped, calls the callback on the original dva (note the
3319  * callback can be called multiple times if the original indirect DVA refers
3320  * to another indirect DVA, etc).
3321  *
3322  * Returns TRUE if the BP was remapped.
3323  */
3324 boolean_t
3325 spa_remap_blkptr(spa_t *spa, blkptr_t *bp, spa_remap_cb_t callback, void *arg)
3326 {
3327         remap_blkptr_cb_arg_t rbca;
3328
3329         if (!zfs_remap_blkptr_enable)
3330                 return (B_FALSE);
3331
3332         if (!spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS))
3333                 return (B_FALSE);
3334
3335         /*
3336          * Dedup BP's can not be remapped, because ddt_phys_select() depends
3337          * on DVA[0] being the same in the BP as in the DDT (dedup table).
3338          */
3339         if (BP_GET_DEDUP(bp))
3340                 return (B_FALSE);
3341
3342         /*
3343          * Gang blocks can not be remapped, because
3344          * zio_checksum_gang_verifier() depends on the DVA[0] that's in
3345          * the BP used to read the gang block header (GBH) being the same
3346          * as the DVA[0] that we allocated for the GBH.
3347          */
3348         if (BP_IS_GANG(bp))
3349                 return (B_FALSE);
3350
3351         /*
3352          * Embedded BP's have no DVA to remap.
3353          */
3354         if (BP_GET_NDVAS(bp) < 1)
3355                 return (B_FALSE);
3356
3357         /*
3358          * Note: we only remap dva[0].  If we remapped other dvas, we
3359          * would no longer know what their phys birth txg is.
3360          */
3361         dva_t *dva = &bp->blk_dva[0];
3362
3363         uint64_t offset = DVA_GET_OFFSET(dva);
3364         uint64_t size = DVA_GET_ASIZE(dva);
3365         vdev_t *vd = vdev_lookup_top(spa, DVA_GET_VDEV(dva));
3366
3367         if (vd->vdev_ops->vdev_op_remap == NULL)
3368                 return (B_FALSE);
3369
3370         rbca.rbca_bp = bp;
3371         rbca.rbca_cb = callback;
3372         rbca.rbca_remap_vd = vd;
3373         rbca.rbca_remap_offset = offset;
3374         rbca.rbca_cb_arg = arg;
3375
3376         /*
3377          * remap_blkptr_cb() will be called in order for each level of
3378          * indirection, until a concrete vdev is reached or a split block is
3379          * encountered. old_vd and old_offset are updated within the callback
3380          * as we go from the one indirect vdev to the next one (either concrete
3381          * or indirect again) in that order.
3382          */
3383         vd->vdev_ops->vdev_op_remap(vd, offset, size, remap_blkptr_cb, &rbca);
3384
3385         /* Check if the DVA wasn't remapped because it is a split block */
3386         if (DVA_GET_VDEV(&rbca.rbca_bp->blk_dva[0]) == vd->vdev_id)
3387                 return (B_FALSE);
3388
3389         return (B_TRUE);
3390 }
3391
3392 /*
3393  * Undo the allocation of a DVA which happened in the given transaction group.
3394  */
3395 void
3396 metaslab_unalloc_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
3397 {
3398         metaslab_t *msp;
3399         vdev_t *vd;
3400         uint64_t vdev = DVA_GET_VDEV(dva);
3401         uint64_t offset = DVA_GET_OFFSET(dva);
3402         uint64_t size = DVA_GET_ASIZE(dva);
3403
3404         ASSERT(DVA_IS_VALID(dva));
3405         ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);
3406
3407         if (txg > spa_freeze_txg(spa))
3408                 return;
3409
3410         if ((vd = vdev_lookup_top(spa, vdev)) == NULL || !DVA_IS_VALID(dva) ||
3411             (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) {
3412                 zfs_panic_recover("metaslab_free_dva(): bad DVA %llu:%llu:%llu",
3413                     (u_longlong_t)vdev, (u_longlong_t)offset,
3414                     (u_longlong_t)size);
3415                 return;
3416         }
3417
3418         ASSERT(!vd->vdev_removing);
3419         ASSERT(vdev_is_concrete(vd));
3420         ASSERT0(vd->vdev_indirect_config.vic_mapping_object);
3421         ASSERT3P(vd->vdev_indirect_mapping, ==, NULL);
3422
3423         if (DVA_GET_GANG(dva))
3424                 size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
3425
3426         msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
3427
3428         mutex_enter(&msp->ms_lock);
3429         range_tree_remove(msp->ms_alloctree[txg & TXG_MASK],
3430             offset, size);
3431
3432         VERIFY(!msp->ms_condensing);
3433         VERIFY3U(offset, >=, msp->ms_start);
3434         VERIFY3U(offset + size, <=, msp->ms_start + msp->ms_size);
3435         VERIFY3U(range_tree_space(msp->ms_tree) + size, <=,
3436             msp->ms_size);
3437         VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
3438         VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
3439         range_tree_add(msp->ms_tree, offset, size);
3440         mutex_exit(&msp->ms_lock);
3441 }
3442
3443 /*
3444  * Free the block represented by DVA in the context of the specified
3445  * transaction group.
3446  */
3447 void
3448 metaslab_free_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
3449 {
3450         uint64_t vdev = DVA_GET_VDEV(dva);
3451         uint64_t offset = DVA_GET_OFFSET(dva);
3452         uint64_t size = DVA_GET_ASIZE(dva);
3453         vdev_t *vd = vdev_lookup_top(spa, vdev);
3454
3455         ASSERT(DVA_IS_VALID(dva));
3456         ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);
3457
3458         if (DVA_GET_GANG(dva)) {
3459                 size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
3460         }
3461
3462         metaslab_free_impl(vd, offset, size, txg);
3463 }
3464
3465 /*
3466  * Reserve some allocation slots. The reservation system must be called
3467  * before we call into the allocator. If there aren't any available slots
3468  * then the I/O will be throttled until an I/O completes and its slots are
3469  * freed up. The function returns true if it was successful in placing
3470  * the reservation.
3471  */
3472 boolean_t
3473 metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, zio_t *zio,
3474     int flags)
3475 {
3476         uint64_t available_slots = 0;
3477         boolean_t slot_reserved = B_FALSE;
3478
3479         ASSERT(mc->mc_alloc_throttle_enabled);
3480         mutex_enter(&mc->mc_lock);
3481
3482         uint64_t reserved_slots = refcount_count(&mc->mc_alloc_slots);
3483         if (reserved_slots < mc->mc_alloc_max_slots)
3484                 available_slots = mc->mc_alloc_max_slots - reserved_slots;
3485
3486         if (slots <= available_slots || GANG_ALLOCATION(flags)) {
3487                 /*
3488                  * We reserve the slots individually so that we can unreserve
3489                  * them individually when an I/O completes.
3490                  */
3491                 for (int d = 0; d < slots; d++) {
3492                         reserved_slots = refcount_add(&mc->mc_alloc_slots, zio);
3493                 }
3494                 zio->io_flags |= ZIO_FLAG_IO_ALLOCATING;
3495                 slot_reserved = B_TRUE;
3496         }
3497
3498         mutex_exit(&mc->mc_lock);
3499         return (slot_reserved);
3500 }
3501
3502 void
3503 metaslab_class_throttle_unreserve(metaslab_class_t *mc, int slots, zio_t *zio)
3504 {
3505         ASSERT(mc->mc_alloc_throttle_enabled);
3506         mutex_enter(&mc->mc_lock);
3507         for (int d = 0; d < slots; d++) {
3508                 (void) refcount_remove(&mc->mc_alloc_slots, zio);
3509         }
3510         mutex_exit(&mc->mc_lock);
3511 }
3512
3513 static int
3514 metaslab_claim_concrete(vdev_t *vd, uint64_t offset, uint64_t size,
3515     uint64_t txg)
3516 {
3517         metaslab_t *msp;
3518         spa_t *spa = vd->vdev_spa;
3519         int error = 0;
3520
3521         if (offset >> vd->vdev_ms_shift >= vd->vdev_ms_count)
3522                 return (ENXIO);
3523
3524         ASSERT3P(vd->vdev_ms, !=, NULL);
3525         msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
3526
3527         mutex_enter(&msp->ms_lock);
3528
3529         if ((txg != 0 && spa_writeable(spa)) || !msp->ms_loaded)
3530                 error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY);
3531
3532         if (error == 0 && !range_tree_contains(msp->ms_tree, offset, size))
3533                 error = SET_ERROR(ENOENT);
3534
3535         if (error || txg == 0) {        /* txg == 0 indicates dry run */
3536                 mutex_exit(&msp->ms_lock);
3537                 return (error);
3538         }
3539
3540         VERIFY(!msp->ms_condensing);
3541         VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
3542         VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
3543         VERIFY3U(range_tree_space(msp->ms_tree) - size, <=, msp->ms_size);
3544         range_tree_remove(msp->ms_tree, offset, size);
3545
3546         if (spa_writeable(spa)) {       /* don't dirty if we're zdb(1M) */
3547                 if (range_tree_space(msp->ms_alloctree[txg & TXG_MASK]) == 0)
3548                         vdev_dirty(vd, VDD_METASLAB, msp, txg);
3549                 range_tree_add(msp->ms_alloctree[txg & TXG_MASK], offset, size);
3550         }
3551
3552         mutex_exit(&msp->ms_lock);
3553
3554         return (0);
3555 }
3556
/*
 * Argument handed to metaslab_claim_impl_cb() by metaslab_claim_impl()
 * while the segments of a remapped range are being claimed.
 */
typedef struct metaslab_claim_cb_arg_t {
        /* txg passed through to metaslab_claim_concrete() */
        uint64_t        mcca_txg;
        /* first error seen; once nonzero, later segments are skipped */
        int             mcca_error;
} metaslab_claim_cb_arg_t;
3561
3562 /* ARGSUSED */
3563 static void
3564 metaslab_claim_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset,
3565     uint64_t size, void *arg)
3566 {
3567         metaslab_claim_cb_arg_t *mcca_arg = arg;
3568
3569         if (mcca_arg->mcca_error == 0) {
3570                 mcca_arg->mcca_error = metaslab_claim_concrete(vd, offset,
3571                     size, mcca_arg->mcca_txg);
3572         }
3573 }
3574
3575 int
3576 metaslab_claim_impl(vdev_t *vd, uint64_t offset, uint64_t size, uint64_t txg)
3577 {
3578         if (vd->vdev_ops->vdev_op_remap != NULL) {
3579                 metaslab_claim_cb_arg_t arg;
3580
3581                 /*
3582                  * Only zdb(1M) can claim on indirect vdevs.  This is used
3583                  * to detect leaks of mapped space (that are not accounted
3584                  * for in the obsolete counts, spacemap, or bpobj).
3585                  */
3586                 ASSERT(!spa_writeable(vd->vdev_spa));
3587                 arg.mcca_error = 0;
3588                 arg.mcca_txg = txg;
3589
3590                 vd->vdev_ops->vdev_op_remap(vd, offset, size,
3591                     metaslab_claim_impl_cb, &arg);
3592
3593                 if (arg.mcca_error == 0) {
3594                         arg.mcca_error = metaslab_claim_concrete(vd,
3595                             offset, size, txg);
3596                 }
3597                 return (arg.mcca_error);
3598         } else {
3599                 return (metaslab_claim_concrete(vd, offset, size, txg));
3600         }
3601 }
3602
3603 /*
3604  * Intent log support: upon opening the pool after a crash, notify the SPA
3605  * of blocks that the intent log has allocated for immediate write, but
3606  * which are still considered free by the SPA because the last transaction
3607  * group didn't commit yet.
3608  */
3609 static int
3610 metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
3611 {
3612         uint64_t vdev = DVA_GET_VDEV(dva);
3613         uint64_t offset = DVA_GET_OFFSET(dva);
3614         uint64_t size = DVA_GET_ASIZE(dva);
3615         vdev_t *vd;
3616
3617         if ((vd = vdev_lookup_top(spa, vdev)) == NULL) {
3618                 return (SET_ERROR(ENXIO));
3619         }
3620
3621         ASSERT(DVA_IS_VALID(dva));
3622
3623         if (DVA_GET_GANG(dva))
3624                 size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
3625
3626         return (metaslab_claim_impl(vd, offset, size, txg));
3627 }
3628
3629 int
3630 metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp,
3631     int ndvas, uint64_t txg, blkptr_t *hintbp, int flags,
3632     zio_alloc_list_t *zal, zio_t *zio)
3633 {
3634         dva_t *dva = bp->blk_dva;
3635         dva_t *hintdva = hintbp->blk_dva;
3636         int error = 0;
3637
3638         ASSERT(bp->blk_birth == 0);
3639         ASSERT(BP_PHYSICAL_BIRTH(bp) == 0);
3640
3641         spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
3642
3643         if (mc->mc_rotor == NULL) {     /* no vdevs in this class */
3644                 spa_config_exit(spa, SCL_ALLOC, FTAG);
3645                 return (SET_ERROR(ENOSPC));
3646         }
3647
3648         ASSERT(ndvas > 0 && ndvas <= spa_max_replication(spa));
3649         ASSERT(BP_GET_NDVAS(bp) == 0);
3650         ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp));
3651         ASSERT3P(zal, !=, NULL);
3652
3653         for (int d = 0; d < ndvas; d++) {
3654                 error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva,
3655                     txg, flags, zal);
3656                 if (error != 0) {
3657                         for (d--; d >= 0; d--) {
3658                                 metaslab_unalloc_dva(spa, &dva[d], txg);
3659                                 metaslab_group_alloc_decrement(spa,
3660                                     DVA_GET_VDEV(&dva[d]), zio, flags);
3661                                 bzero(&dva[d], sizeof (dva_t));
3662                         }
3663                         spa_config_exit(spa, SCL_ALLOC, FTAG);
3664                         return (error);
3665                 } else {
3666                         /*
3667                          * Update the metaslab group's queue depth
3668                          * based on the newly allocated dva.
3669                          */
3670                         metaslab_group_alloc_increment(spa,
3671                             DVA_GET_VDEV(&dva[d]), zio, flags);
3672                 }
3673
3674         }
3675         ASSERT(error == 0);
3676         ASSERT(BP_GET_NDVAS(bp) == ndvas);
3677
3678         spa_config_exit(spa, SCL_ALLOC, FTAG);
3679
3680         BP_SET_BIRTH(bp, txg, 0);
3681
3682         return (0);
3683 }
3684
3685 void
3686 metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now)
3687 {
3688         const dva_t *dva = bp->blk_dva;
3689         int ndvas = BP_GET_NDVAS(bp);
3690
3691         ASSERT(!BP_IS_HOLE(bp));
3692         ASSERT(!now || bp->blk_birth >= spa_syncing_txg(spa));
3693
3694         spa_config_enter(spa, SCL_FREE, FTAG, RW_READER);
3695
3696         for (int d = 0; d < ndvas; d++) {
3697                 if (now) {
3698                         metaslab_unalloc_dva(spa, &dva[d], txg);
3699                 } else {
3700                         metaslab_free_dva(spa, &dva[d], txg);
3701                 }
3702         }
3703
3704         spa_config_exit(spa, SCL_FREE, FTAG);
3705 }
3706
3707 int
3708 metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg)
3709 {
3710         const dva_t *dva = bp->blk_dva;
3711         int ndvas = BP_GET_NDVAS(bp);
3712         int error = 0;
3713
3714         ASSERT(!BP_IS_HOLE(bp));
3715
3716         if (txg != 0) {
3717                 /*
3718                  * First do a dry run to make sure all DVAs are claimable,
3719                  * so we don't have to unwind from partial failures below.
3720                  */
3721                 if ((error = metaslab_claim(spa, bp, 0)) != 0)
3722                         return (error);
3723         }
3724
3725         spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
3726
3727         for (int d = 0; d < ndvas; d++)
3728                 if ((error = metaslab_claim_dva(spa, &dva[d], txg)) != 0)
3729                         break;
3730
3731         spa_config_exit(spa, SCL_ALLOC, FTAG);
3732
3733         ASSERT(error == 0 || txg == 0);
3734
3735         return (error);
3736 }
3737
3738 void
3739 metaslab_fastwrite_mark(spa_t *spa, const blkptr_t *bp)
3740 {
3741         const dva_t *dva = bp->blk_dva;
3742         int ndvas = BP_GET_NDVAS(bp);
3743         uint64_t psize = BP_GET_PSIZE(bp);
3744         int d;
3745         vdev_t *vd;
3746
3747         ASSERT(!BP_IS_HOLE(bp));
3748         ASSERT(!BP_IS_EMBEDDED(bp));
3749         ASSERT(psize > 0);
3750
3751         spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
3752
3753         for (d = 0; d < ndvas; d++) {
3754                 if ((vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d]))) == NULL)
3755                         continue;
3756                 atomic_add_64(&vd->vdev_pending_fastwrite, psize);
3757         }
3758
3759         spa_config_exit(spa, SCL_VDEV, FTAG);
3760 }
3761
3762 void
3763 metaslab_fastwrite_unmark(spa_t *spa, const blkptr_t *bp)
3764 {
3765         const dva_t *dva = bp->blk_dva;
3766         int ndvas = BP_GET_NDVAS(bp);
3767         uint64_t psize = BP_GET_PSIZE(bp);
3768         int d;
3769         vdev_t *vd;
3770
3771         ASSERT(!BP_IS_HOLE(bp));
3772         ASSERT(!BP_IS_EMBEDDED(bp));
3773         ASSERT(psize > 0);
3774
3775         spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
3776
3777         for (d = 0; d < ndvas; d++) {
3778                 if ((vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d]))) == NULL)
3779                         continue;
3780                 ASSERT3U(vd->vdev_pending_fastwrite, >=, psize);
3781                 atomic_sub_64(&vd->vdev_pending_fastwrite, psize);
3782         }
3783
3784         spa_config_exit(spa, SCL_VDEV, FTAG);
3785 }
3786
3787 /* ARGSUSED */
3788 static void
3789 metaslab_check_free_impl_cb(uint64_t inner, vdev_t *vd, uint64_t offset,
3790     uint64_t size, void *arg)
3791 {
3792         if (vd->vdev_ops == &vdev_indirect_ops)
3793                 return;
3794
3795         metaslab_check_free_impl(vd, offset, size);
3796 }
3797
3798 static void
3799 metaslab_check_free_impl(vdev_t *vd, uint64_t offset, uint64_t size)
3800 {
3801         metaslab_t *msp;
3802         ASSERTV(spa_t *spa = vd->vdev_spa);
3803
3804         if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0)
3805                 return;
3806
3807         if (vd->vdev_ops->vdev_op_remap != NULL) {
3808                 vd->vdev_ops->vdev_op_remap(vd, offset, size,
3809                     metaslab_check_free_impl_cb, NULL);
3810                 return;
3811         }
3812
3813         ASSERT(vdev_is_concrete(vd));
3814         ASSERT3U(offset >> vd->vdev_ms_shift, <, vd->vdev_ms_count);
3815         ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);
3816
3817         msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
3818
3819         mutex_enter(&msp->ms_lock);
3820         if (msp->ms_loaded)
3821                 range_tree_verify(msp->ms_tree, offset, size);
3822
3823         range_tree_verify(msp->ms_freeingtree, offset, size);
3824         range_tree_verify(msp->ms_freedtree, offset, size);
3825         for (int j = 0; j < TXG_DEFER_SIZE; j++)
3826                 range_tree_verify(msp->ms_defertree[j], offset, size);
3827         mutex_exit(&msp->ms_lock);
3828 }
3829
3830 void
3831 metaslab_check_free(spa_t *spa, const blkptr_t *bp)
3832 {
3833         if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0)
3834                 return;
3835
3836         spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
3837         for (int i = 0; i < BP_GET_NDVAS(bp); i++) {
3838                 uint64_t vdev = DVA_GET_VDEV(&bp->blk_dva[i]);
3839                 vdev_t *vd = vdev_lookup_top(spa, vdev);
3840                 uint64_t offset = DVA_GET_OFFSET(&bp->blk_dva[i]);
3841                 uint64_t size = DVA_GET_ASIZE(&bp->blk_dva[i]);
3842
3843                 if (DVA_GET_GANG(&bp->blk_dva[i]))
3844                         size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
3845
3846                 ASSERT3P(vd, !=, NULL);
3847
3848                 metaslab_check_free_impl(vd, offset, size);
3849         }
3850         spa_config_exit(spa, SCL_VDEV, FTAG);
3851 }
3852
#if defined(_KERNEL)
/*
 * Module parameters, compiled only when _KERNEL is defined.  Each tunable
 * below is registered with module_param() using mode 0644 and paired with
 * a one-line description via MODULE_PARM_DESC().
 */
/* CSTYLED */
module_param(metaslab_aliquot, ulong, 0644);
MODULE_PARM_DESC(metaslab_aliquot,
        "allocation granularity (a.k.a. stripe size)");

module_param(metaslab_debug_load, int, 0644);
MODULE_PARM_DESC(metaslab_debug_load,
        "load all metaslabs when pool is first opened");

module_param(metaslab_debug_unload, int, 0644);
MODULE_PARM_DESC(metaslab_debug_unload,
        "prevent metaslabs from being unloaded");

module_param(metaslab_preload_enabled, int, 0644);
MODULE_PARM_DESC(metaslab_preload_enabled,
        "preload potential metaslabs during reassessment");

/* Thresholds that gate allocation from a metaslab group or metaslab. */
module_param(zfs_mg_noalloc_threshold, int, 0644);
MODULE_PARM_DESC(zfs_mg_noalloc_threshold,
        "percentage of free space for metaslab group to allow allocation");

module_param(zfs_mg_fragmentation_threshold, int, 0644);
MODULE_PARM_DESC(zfs_mg_fragmentation_threshold,
        "fragmentation for metaslab group to allow allocation");

module_param(zfs_metaslab_fragmentation_threshold, int, 0644);
MODULE_PARM_DESC(zfs_metaslab_fragmentation_threshold,
        "fragmentation for metaslab to allow allocation");

/* Switches for the metaslab weighting and selection heuristics. */
module_param(metaslab_fragmentation_factor_enabled, int, 0644);
MODULE_PARM_DESC(metaslab_fragmentation_factor_enabled,
        "use the fragmentation metric to prefer less fragmented metaslabs");

module_param(metaslab_lba_weighting_enabled, int, 0644);
MODULE_PARM_DESC(metaslab_lba_weighting_enabled,
        "prefer metaslabs with lower LBAs");

module_param(metaslab_bias_enabled, int, 0644);
MODULE_PARM_DESC(metaslab_bias_enabled,
        "enable metaslab group biasing");

module_param(zfs_metaslab_segment_weight_enabled, int, 0644);
MODULE_PARM_DESC(zfs_metaslab_segment_weight_enabled,
        "enable segment-based metaslab selection");

module_param(zfs_metaslab_switch_threshold, int, 0644);
MODULE_PARM_DESC(zfs_metaslab_switch_threshold,
        "segment-based metaslab selection maximum buckets before switching");

/* CSTYLED */
module_param(metaslab_force_ganging, ulong, 0644);
MODULE_PARM_DESC(metaslab_force_ganging,
        "blocks larger than this size are forced to be gang blocks");
#endif